1
0
Fork 0
mirror of synced 2024-05-21 12:42:44 +12:00

Merge pull request #746 from Serene-Arc/bug_fix_663

Closes https://github.com/aliparlakci/bulk-downloader-for-reddit/issues/663
This commit is contained in:
Serene 2023-01-05 17:46:20 +10:00 committed by GitHub
commit 2589e29304
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
7 changed files with 108 additions and 20 deletions

View file

@ -143,6 +143,10 @@ The following options are common between both the `archive` and `download` comma
- Can be specified multiple times
- Disables certain modules from being used
- See [Disabling Modules](#disabling-modules) for more information and a list of module names
- `--filename-restriction-scheme`
- Can be: `windows`, `linux`
- Turns off the OS detection and specifies which system to use when making filenames
- See [Filesystem Restrictions](#filesystem-restrictions)
- `--ignore-user`
- This will add a user to ignore
- Can be specified multiple times
@ -372,6 +376,7 @@ The following keys are optional, and defaults will be used if they cannot be fou
- `max_wait_time`
- `time_format`
- `disabled_modules`
- `filename-restriction-scheme`
All of these should not be modified unless you know what you're doing, as the default values will enable the BDFR to function just fine. A configuration is included in the BDFR when it is installed, and this will be placed in the configuration directory as the default.
@ -421,6 +426,14 @@ Running scenarios concurrently (at the same time) however, is more complicated.
The way to fix this is to use the `--log` option to manually specify where the logfile is to be stored. If the given location is unique to each instance of the BDFR, then it will run fine.
## Filesystem Restrictions
Different filesystems have different restrictions for what files and directories can be named. Thesse are separated into two broad categories: Linux-based filesystems, which have very few restrictions; and Windows-based filesystems, which are much more restrictive in terms if forbidden characters and length of paths.
During the normal course of operation, the BDFR detects what filesystem it is running on and formats any filenames and directories to conform to the rules that are expected of it. However, there are cases where this will fail. When running on a Linux-based machine, or another system where the home filesystem is permissive, and accessing a share or drive with a less permissive system, the BDFR will assume that the *home* filesystem's rules apply. For example, when downloading to a SAMBA share from Ubuntu, there will be errors as SAMBA is more restrictive than Ubuntu.
The best option would be to always download to a filesystem that is as permission as possible, such as an NFS share or ext4 drive. However, when this is not possible, the BDFR allows for the restriction scheme to be manually specified at either the command-line or in the configuration file. At the command-line, this is done with `--filename-restriction-scheme windows`, or else an option by the same name in the configuration file.
## Manipulating Logfiles
The logfiles that the BDFR outputs are consistent and quite detailed and in a format that is amenable to regex. To this end, a number of bash scripts have been [included here](./scripts). They show examples for how to extract successfully downloaded IDs, failed IDs, and more besides.

View file

@ -20,15 +20,16 @@ _common_options = [
click.argument("directory", type=str),
click.option("--authenticate", is_flag=True, default=None),
click.option("--config", type=str, default=None),
click.option("--opts", type=str, default=None),
click.option("--disable-module", multiple=True, default=None, type=str),
click.option("--exclude-id", default=None, multiple=True),
click.option("--exclude-id-file", default=None, multiple=True),
click.option("--file-scheme", default=None, type=str),
click.option("--filename-restriction-scheme", type=click.Choice(("linux", "windows")), default=None),
click.option("--folder-scheme", default=None, type=str),
click.option("--ignore-user", type=str, multiple=True, default=None),
click.option("--include-id-file", multiple=True, default=None),
click.option("--log", type=str, default=None),
click.option("--opts", type=str, default=None),
click.option("--saved", is_flag=True, default=None),
click.option("--search", default=None, type=str),
click.option("--submitted", is_flag=True, default=None),

View file

@ -23,6 +23,7 @@ class Configuration(Namespace):
self.exclude_id = []
self.exclude_id_file = []
self.file_scheme: str = "{REDDITOR}_{TITLE}_{POSTID}"
self.filename_restriction_scheme = None
self.folder_scheme: str = "{SUBREDDIT}"
self.ignore_user = []
self.include_id_file = []

View file

@ -107,6 +107,11 @@ class RedditConnector(metaclass=ABCMeta):
self.args.time_format = option
if not self.args.disable_module:
self.args.disable_module = [self.cfg_parser.get("DEFAULT", "disabled_modules", fallback="")]
if not self.args.filename_restriction_scheme:
self.args.filename_restriction_scheme = self.cfg_parser.get(
"DEFAULT", "filename_restriction_scheme", fallback=None
)
logger.debug(f"Setting filename restriction scheme to '{self.args.filename_restriction_scheme}'")
# Update config on disk
with open(self.config_location, "w") as file:
self.cfg_parser.write(file)
@ -395,7 +400,9 @@ class RedditConnector(metaclass=ABCMeta):
raise errors.BulkDownloaderException(f"User {name} is banned")
def create_file_name_formatter(self) -> FileNameFormatter:
return FileNameFormatter(self.args.file_scheme, self.args.folder_scheme, self.args.time_format)
return FileNameFormatter(
self.args.file_scheme, self.args.folder_scheme, self.args.time_format, self.args.filename_restriction_scheme
)
def create_time_filter(self) -> RedditTypes.TimeType:
try:

View file

@ -27,13 +27,26 @@ class FileNameFormatter:
"title",
"upvotes",
)
WINDOWS_MAX_PATH_LENGTH = 260
LINUX_MAX_PATH_LENGTH = 4096
def __init__(self, file_format_string: str, directory_format_string: str, time_format_string: str):
def __init__(
self,
file_format_string: str,
directory_format_string: str,
time_format_string: str,
restriction_scheme: Optional[str] = None,
):
if not self.validate_string(file_format_string):
raise BulkDownloaderException(f'"{file_format_string}" is not a valid format string')
self.file_format_string = file_format_string
self.directory_format_string: list[str] = directory_format_string.split("/")
self.time_format_string = time_format_string
self.restiction_scheme = restriction_scheme.lower().strip() if restriction_scheme else None
if self.restiction_scheme == "windows":
self.max_path = self.WINDOWS_MAX_PATH_LENGTH
else:
self.max_path = self.find_max_path_length()
def _format_name(self, submission: Union[Comment, Submission], format_string: str) -> str:
if isinstance(submission, Submission):
@ -52,9 +65,12 @@ class FileNameFormatter:
result = result.replace("/", "")
if platform.system() == "Windows":
if self.restiction_scheme is None:
if platform.system() == "Windows":
result = FileNameFormatter._format_for_windows(result)
elif self.restiction_scheme == "windows":
logger.debug("Forcing Windows-compatible filenames")
result = FileNameFormatter._format_for_windows(result)
return result
@staticmethod
@ -126,14 +142,13 @@ class FileNameFormatter:
raise BulkDownloaderException(f"Could not determine path name: {subfolder}, {index}, {resource.extension}")
return file_path
@staticmethod
def limit_file_name_length(filename: str, ending: str, root: Path) -> Path:
def limit_file_name_length(self, filename: str, ending: str, root: Path) -> Path:
root = root.resolve().expanduser()
possible_id = re.search(r"((?:_\w{6})?$)", filename)
if possible_id:
ending = possible_id.group(1) + ending
filename = filename[: possible_id.start()]
max_path = FileNameFormatter.find_max_path_length()
max_path = self.max_path
max_file_part_length_chars = 255 - len(ending)
max_file_part_length_bytes = 255 - len(ending.encode("utf-8"))
max_path_length = max_path - len(ending) - len(str(root)) - 1
@ -157,9 +172,9 @@ class FileNameFormatter:
return int(subprocess.check_output(["getconf", "PATH_MAX", "/"]))
except (ValueError, subprocess.CalledProcessError, OSError):
if platform.system() == "Windows":
return 260
return FileNameFormatter.WINDOWS_MAX_PATH_LENGTH
else:
return 4096
return FileNameFormatter.LINUX_MAX_PATH_LENGTH
def format_resource_paths(
self,

View file

@ -400,6 +400,8 @@ def test_cli_download_score_filter(test_args: list[str], was_filtered: bool, tmp
assert ("filtered due to score" in result.output) == was_filtered
@pytest.mark.online
@pytest.mark.reddit
@pytest.mark.skipif(not does_test_config_exist, reason="A test config file is required for integration tests")
@pytest.mark.parametrize(
("test_args", "response"),
@ -408,7 +410,7 @@ def test_cli_download_score_filter(test_args: list[str], was_filtered: bool, tmp
(["--user", "nasa", "--submitted"], 504),
),
)
def test_user_serv_fail(test_args: list[str], response: int, tmp_path: Path):
def test_cli_download_user_reddit_server_error(test_args: list[str], response: int, tmp_path: Path):
runner = CliRunner()
test_args = create_basic_args_for_download_runner(test_args, tmp_path)
with patch("bdfr.connector.sleep", return_value=None):
@ -419,3 +421,22 @@ def test_user_serv_fail(test_args: list[str], response: int, tmp_path: Path):
result = runner.invoke(cli, test_args)
assert result.exit_code == 0
assert f"received {response} HTTP response" in result.output
@pytest.mark.online
@pytest.mark.reddit
@pytest.mark.skipif(not does_test_config_exist, reason="A test config file is required for integration tests")
@pytest.mark.parametrize(
"test_args",
(
["-l", "102vd5i", "--filename-restriction-scheme", "windows"],
["-l", "m3hxzd", "--filename-restriction-scheme", "windows"],
),
)
def test_cli_download_explicit_filename_restriction_scheme(test_args: list[str], tmp_path: Path):
runner = CliRunner()
test_args = create_basic_args_for_download_runner(test_args, tmp_path)
result = runner.invoke(cli, test_args)
assert result.exit_code == 0
assert "Downloaded submission" in result.output
assert "Forcing Windows-compatible filenames" in result.output

View file

@ -33,6 +33,16 @@ def submission() -> MagicMock:
return test
@pytest.fixture()
def test_formatter() -> FileNameFormatter:
out = FileNameFormatter("{TITLE}", "", "ISO")
return out
def check_valid_windows_path(test_string: str):
return test_string == FileNameFormatter._format_for_windows(test_string)
def do_test_string_equality(result: Union[Path, str], expected: str) -> bool:
if platform.system() == "Windows":
expected = FileNameFormatter._format_for_windows(expected)
@ -91,6 +101,15 @@ def test_check_format_string_validity(test_string: str, expected: bool):
@pytest.mark.online
@pytest.mark.reddit
@pytest.mark.parametrize(
"restriction_scheme",
(
"windows",
"linux",
"bla",
None,
),
)
@pytest.mark.parametrize(
("test_format_string", "expected"),
(
@ -102,10 +121,17 @@ def test_check_format_string_validity(test_string: str, expected: bool):
("{REDDITOR}_{TITLE}_{POSTID}", "Kirsty-Blue_George Russel acknowledges the Twitter trend about him_w22m5l"),
),
)
def test_format_name_real(test_format_string: str, expected: str, reddit_submission: praw.models.Submission):
test_formatter = FileNameFormatter(test_format_string, "", "")
def test_format_name_real(
test_format_string: str,
expected: str,
reddit_submission: praw.models.Submission,
restriction_scheme: Optional[str],
):
test_formatter = FileNameFormatter(test_format_string, "", "", restriction_scheme)
result = test_formatter._format_name(reddit_submission, test_format_string)
assert do_test_string_equality(result, expected)
if restriction_scheme == "windows":
assert check_valid_windows_path(result)
@pytest.mark.online
@ -211,8 +237,8 @@ def test_format_multiple_resources():
("😍💕✨" * 100, "_1.png"),
),
)
def test_limit_filename_length(test_filename: str, test_ending: str):
result = FileNameFormatter.limit_file_name_length(test_filename, test_ending, Path("."))
def test_limit_filename_length(test_filename: str, test_ending: str, test_formatter: FileNameFormatter):
result = test_formatter.limit_file_name_length(test_filename, test_ending, Path("."))
assert len(result.name) <= 255
assert len(result.name.encode("utf-8")) <= 255
assert len(str(result)) <= FileNameFormatter.find_max_path_length()
@ -233,8 +259,10 @@ def test_limit_filename_length(test_filename: str, test_ending: str):
("😍💕✨" * 100 + "_aaa1aa", "_1.png", "_aaa1aa_1.png"),
),
)
def test_preserve_id_append_when_shortening(test_filename: str, test_ending: str, expected_end: str):
result = FileNameFormatter.limit_file_name_length(test_filename, test_ending, Path("."))
def test_preserve_id_append_when_shortening(
test_filename: str, test_ending: str, expected_end: str, test_formatter: FileNameFormatter
):
result = test_formatter.limit_file_name_length(test_filename, test_ending, Path("."))
assert len(result.name) <= 255
assert len(result.name.encode("utf-8")) <= 255
assert result.name.endswith(expected_end)
@ -264,8 +292,8 @@ def test_shorten_filename_real(submission: MagicMock, tmp_path: Path):
("a" * 500, "_bbbbbb.jpg"),
),
)
def test_shorten_path(test_name: str, test_ending: str, tmp_path: Path):
result = FileNameFormatter.limit_file_name_length(test_name, test_ending, tmp_path)
def test_shorten_path(test_name: str, test_ending: str, tmp_path: Path, test_formatter: FileNameFormatter):
result = test_formatter.limit_file_name_length(test_name, test_ending, tmp_path)
assert len(str(result.name)) <= 255
assert len(str(result.name).encode("UTF-8")) <= 255
assert len(str(result.name).encode("cp1252")) <= 255
@ -462,7 +490,9 @@ def test_get_max_path_length():
def test_windows_max_path(tmp_path: Path):
with unittest.mock.patch("platform.system", return_value="Windows"):
with unittest.mock.patch("bdfr.file_name_formatter.FileNameFormatter.find_max_path_length", return_value=260):
result = FileNameFormatter.limit_file_name_length("test" * 100, "_1.png", tmp_path)
mock = MagicMock()
mock.max_path = 260
result = FileNameFormatter.limit_file_name_length(mock, "test" * 100, "_1.png", tmp_path)
assert len(str(result)) <= 260
assert len(result.name) <= (260 - len(str(tmp_path)))