From f40ac35f4af9f26223cc12eb78a1f10c8ff83a1f Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Tue, 3 Jan 2023 19:16:23 +1000 Subject: [PATCH 1/7] Add option to determine restriction scheme --- bdfr/file_name_formatter.py | 15 ++++++++++++--- tests/test_file_name_formatter.py | 24 ++++++++++++++++++++++-- 2 files changed, 34 insertions(+), 5 deletions(-) diff --git a/bdfr/file_name_formatter.py b/bdfr/file_name_formatter.py index 9ee481d..0330336 100644 --- a/bdfr/file_name_formatter.py +++ b/bdfr/file_name_formatter.py @@ -28,12 +28,19 @@ class FileNameFormatter: "upvotes", ) - def __init__(self, file_format_string: str, directory_format_string: str, time_format_string: str): + def __init__( + self, + file_format_string: str, + directory_format_string: str, + time_format_string: str, + restriction_scheme: Optional[str] = None, + ): if not self.validate_string(file_format_string): raise BulkDownloaderException(f'"{file_format_string}" is not a valid format string') self.file_format_string = file_format_string self.directory_format_string: list[str] = directory_format_string.split("/") self.time_format_string = time_format_string + self.restiction_scheme = restriction_scheme.lower().strip() if restriction_scheme else None def _format_name(self, submission: Union[Comment, Submission], format_string: str) -> str: if isinstance(submission, Submission): @@ -52,9 +59,11 @@ class FileNameFormatter: result = result.replace("/", "") - if platform.system() == "Windows": + if self.restiction_scheme is None: + if platform.system() == "Windows": + result = FileNameFormatter._format_for_windows(result) + elif self.restiction_scheme == "windows": result = FileNameFormatter._format_for_windows(result) - return result @staticmethod diff --git a/tests/test_file_name_formatter.py b/tests/test_file_name_formatter.py index 4964b3b..5f94e5f 100644 --- a/tests/test_file_name_formatter.py +++ b/tests/test_file_name_formatter.py @@ -33,6 +33,10 @@ def submission() -> MagicMock: return test +def check_valid_windows_path(test_string: str): + return test_string == FileNameFormatter._format_for_windows(test_string) + + def do_test_string_equality(result: Union[Path, str], expected: str) -> bool: if platform.system() == "Windows": expected = FileNameFormatter._format_for_windows(expected) @@ -91,6 +95,15 @@ def test_check_format_string_validity(test_string: str, expected: bool): @pytest.mark.online @pytest.mark.reddit +@pytest.mark.parametrize( + "restriction_scheme", + ( + "windows", + "linux", + "bla", + None, + ), +) @pytest.mark.parametrize( ("test_format_string", "expected"), ( @@ -102,10 +115,17 @@ def test_check_format_string_validity(test_string: str, expected: bool): ("{REDDITOR}_{TITLE}_{POSTID}", "Kirsty-Blue_George Russel acknowledges the Twitter trend about him_w22m5l"), ), ) -def test_format_name_real(test_format_string: str, expected: str, reddit_submission: praw.models.Submission): - test_formatter = FileNameFormatter(test_format_string, "", "") +def test_format_name_real( + test_format_string: str, + expected: str, + reddit_submission: praw.models.Submission, + restriction_scheme: Optional[str], +): + test_formatter = FileNameFormatter(test_format_string, "", "", restriction_scheme) result = test_formatter._format_name(reddit_submission, test_format_string) assert do_test_string_equality(result, expected) + if restriction_scheme == "windows": + assert check_valid_windows_path(result) @pytest.mark.online From 4f07e92c5ee47d38941815d2fe48d90de884e287 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Wed, 4 Jan 2023 19:04:31 +1000 Subject: [PATCH 2/7] Add option to classes --- bdfr/__main__.py | 3 ++- bdfr/configuration.py | 1 + bdfr/connector.py | 4 ++++ 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/bdfr/__main__.py b/bdfr/__main__.py index e35ba0a..2823ce1 100644 --- a/bdfr/__main__.py +++ b/bdfr/__main__.py @@ -20,15 +20,16 @@ _common_options = [ click.argument("directory", type=str), click.option("--authenticate", is_flag=True, default=None), click.option("--config", type=str, default=None), - click.option("--opts", type=str, default=None), click.option("--disable-module", multiple=True, default=None, type=str), click.option("--exclude-id", default=None, multiple=True), click.option("--exclude-id-file", default=None, multiple=True), click.option("--file-scheme", default=None, type=str), + click.option("--filename-restriction-scheme", type=click.Choice(("linux", "windows")), default=None), click.option("--folder-scheme", default=None, type=str), click.option("--ignore-user", type=str, multiple=True, default=None), click.option("--include-id-file", multiple=True, default=None), click.option("--log", type=str, default=None), + click.option("--opts", type=str, default=None), click.option("--saved", is_flag=True, default=None), click.option("--search", default=None, type=str), click.option("--submitted", is_flag=True, default=None), diff --git a/bdfr/configuration.py b/bdfr/configuration.py index 0d00192..78ae12e 100644 --- a/bdfr/configuration.py +++ b/bdfr/configuration.py @@ -23,6 +23,7 @@ class Configuration(Namespace): self.exclude_id = [] self.exclude_id_file = [] self.file_scheme: str = "{REDDITOR}_{TITLE}_{POSTID}" + self.filename_restriction_scheme = None self.folder_scheme: str = "{SUBREDDIT}" self.ignore_user = [] self.include_id_file = [] diff --git a/bdfr/connector.py b/bdfr/connector.py index 6d7bc64..8fe149a 100644 --- a/bdfr/connector.py +++ b/bdfr/connector.py @@ -107,6 +107,10 @@ class RedditConnector(metaclass=ABCMeta): self.args.time_format = option if not self.args.disable_module: self.args.disable_module = [self.cfg_parser.get("DEFAULT", "disabled_modules", fallback="")] + if not self.args.filename_restriction_scheme: + self.args.filename_restriction_scheme = self.cfg_parser.get( + "DEFAULT", "filename_restriction_scheme", fallback=None + ) # Update config on disk with open(self.config_location, "w") as file: self.cfg_parser.write(file) From 8c57dc228370d101722ec9d9e23786305c7e8638 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Wed, 4 Jan 2023 19:07:31 +1000 Subject: [PATCH 3/7] Add missing pytest flags to test --- tests/integration_tests/test_download_integration.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/integration_tests/test_download_integration.py b/tests/integration_tests/test_download_integration.py index 138ea61..545bd31 100644 --- a/tests/integration_tests/test_download_integration.py +++ b/tests/integration_tests/test_download_integration.py @@ -400,6 +400,8 @@ def test_cli_download_score_filter(test_args: list[str], was_filtered: bool, tmp assert ("filtered due to score" in result.output) == was_filtered +@pytest.mark.online +@pytest.mark.reddit @pytest.mark.skipif(not does_test_config_exist, reason="A test config file is required for integration tests") @pytest.mark.parametrize( ("test_args", "response"), From b64f5080257711465e2c752cda8c56d3bacc8e51 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Wed, 4 Jan 2023 19:37:02 +1000 Subject: [PATCH 4/7] Conform path length to filename scheme restriction --- bdfr/file_name_formatter.py | 16 +++++++++++----- tests/test_file_name_formatter.py | 24 +++++++++++++++++------- 2 files changed, 28 insertions(+), 12 deletions(-) diff --git a/bdfr/file_name_formatter.py b/bdfr/file_name_formatter.py index 0330336..dd04fad 100644 --- a/bdfr/file_name_formatter.py +++ b/bdfr/file_name_formatter.py @@ -27,6 +27,8 @@ class FileNameFormatter: "title", "upvotes", ) + WINDOWS_MAX_PATH_LENGTH = 260 + LINUX_MAX_PATH_LENGTH = 4096 def __init__( self, @@ -41,6 +43,10 @@ class FileNameFormatter: self.directory_format_string: list[str] = directory_format_string.split("/") self.time_format_string = time_format_string self.restiction_scheme = restriction_scheme.lower().strip() if restriction_scheme else None + if self.restiction_scheme == "windows": + self.max_path = self.WINDOWS_MAX_PATH_LENGTH + else: + self.max_path = self.find_max_path_length() def _format_name(self, submission: Union[Comment, Submission], format_string: str) -> str: if isinstance(submission, Submission): @@ -63,6 +69,7 @@ class FileNameFormatter: if platform.system() == "Windows": result = FileNameFormatter._format_for_windows(result) elif self.restiction_scheme == "windows": + logger.debug("Forcing Windows-compatible filenames") result = FileNameFormatter._format_for_windows(result) return result @@ -135,14 +142,13 @@ class FileNameFormatter: raise BulkDownloaderException(f"Could not determine path name: {subfolder}, {index}, {resource.extension}") return file_path - @staticmethod - def limit_file_name_length(filename: str, ending: str, root: Path) -> Path: + def limit_file_name_length(self, filename: str, ending: str, root: Path) -> Path: root = root.resolve().expanduser() possible_id = re.search(r"((?:_\w{6})?$)", filename) if possible_id: ending = possible_id.group(1) + ending filename = filename[: possible_id.start()] - max_path = FileNameFormatter.find_max_path_length() + max_path = self.max_path max_file_part_length_chars = 255 - len(ending) max_file_part_length_bytes = 255 - len(ending.encode("utf-8")) max_path_length = max_path - len(ending) - len(str(root)) - 1 @@ -166,9 +172,9 @@ class FileNameFormatter: return int(subprocess.check_output(["getconf", "PATH_MAX", "/"])) except (ValueError, subprocess.CalledProcessError, OSError): if platform.system() == "Windows": - return 260 + return FileNameFormatter.WINDOWS_MAX_PATH_LENGTH else: - return 4096 + return FileNameFormatter.LINUX_MAX_PATH_LENGTH def format_resource_paths( self, diff --git a/tests/test_file_name_formatter.py b/tests/test_file_name_formatter.py index 5f94e5f..fb34a53 100644 --- a/tests/test_file_name_formatter.py +++ b/tests/test_file_name_formatter.py @@ -33,6 +33,12 @@ def submission() -> MagicMock: return test +@pytest.fixture() +def test_formatter() -> FileNameFormatter: + out = FileNameFormatter("{TITLE}", "", "ISO") + return out + + def check_valid_windows_path(test_string: str): return test_string == FileNameFormatter._format_for_windows(test_string) @@ -231,8 +237,8 @@ def test_format_multiple_resources(): ("😍💕✨" * 100, "_1.png"), ), ) -def test_limit_filename_length(test_filename: str, test_ending: str): - result = FileNameFormatter.limit_file_name_length(test_filename, test_ending, Path(".")) +def test_limit_filename_length(test_filename: str, test_ending: str, test_formatter: FileNameFormatter): + result = test_formatter.limit_file_name_length(test_filename, test_ending, Path(".")) assert len(result.name) <= 255 assert len(result.name.encode("utf-8")) <= 255 assert len(str(result)) <= FileNameFormatter.find_max_path_length() @@ -253,8 +259,10 @@ def test_limit_filename_length(test_filename: str, test_ending: str): ("😍💕✨" * 100 + "_aaa1aa", "_1.png", "_aaa1aa_1.png"), ), ) -def test_preserve_id_append_when_shortening(test_filename: str, test_ending: str, expected_end: str): - result = FileNameFormatter.limit_file_name_length(test_filename, test_ending, Path(".")) +def test_preserve_id_append_when_shortening( + test_filename: str, test_ending: str, expected_end: str, test_formatter: FileNameFormatter +): + result = test_formatter.limit_file_name_length(test_filename, test_ending, Path(".")) assert len(result.name) <= 255 assert len(result.name.encode("utf-8")) <= 255 assert result.name.endswith(expected_end) @@ -284,8 +292,8 @@ def test_shorten_filename_real(submission: MagicMock, tmp_path: Path): ("a" * 500, "_bbbbbb.jpg"), ), ) -def test_shorten_path(test_name: str, test_ending: str, tmp_path: Path): - result = FileNameFormatter.limit_file_name_length(test_name, test_ending, tmp_path) +def test_shorten_path(test_name: str, test_ending: str, tmp_path: Path, test_formatter: FileNameFormatter): + result = test_formatter.limit_file_name_length(test_name, test_ending, tmp_path) assert len(str(result.name)) <= 255 assert len(str(result.name).encode("UTF-8")) <= 255 assert len(str(result.name).encode("cp1252")) <= 255 @@ -482,7 +490,9 @@ def test_get_max_path_length(): def test_windows_max_path(tmp_path: Path): with unittest.mock.patch("platform.system", return_value="Windows"): with unittest.mock.patch("bdfr.file_name_formatter.FileNameFormatter.find_max_path_length", return_value=260): - result = FileNameFormatter.limit_file_name_length("test" * 100, "_1.png", tmp_path) + mock = MagicMock() + mock.max_path = 260 + result = FileNameFormatter.limit_file_name_length(mock, "test" * 100, "_1.png", tmp_path) assert len(str(result)) <= 260 assert len(result.name) <= (260 - len(str(tmp_path))) From 77a01e1627e8c47a6cf27f76894c08f661894d14 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Wed, 4 Jan 2023 19:49:09 +1000 Subject: [PATCH 5/7] Add tests for new option --- bdfr/connector.py | 5 ++++- .../test_download_integration.py | 19 +++++++++++++++++++ 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/bdfr/connector.py b/bdfr/connector.py index 8fe149a..860750d 100644 --- a/bdfr/connector.py +++ b/bdfr/connector.py @@ -111,6 +111,7 @@ class RedditConnector(metaclass=ABCMeta): self.args.filename_restriction_scheme = self.cfg_parser.get( "DEFAULT", "filename_restriction_scheme", fallback=None ) + logger.debug(f"Setting filename restriction scheme to '{self.args.filename_restriction_scheme}'") # Update config on disk with open(self.config_location, "w") as file: self.cfg_parser.write(file) @@ -399,7 +400,9 @@ class RedditConnector(metaclass=ABCMeta): raise errors.BulkDownloaderException(f"User {name} is banned") def create_file_name_formatter(self) -> FileNameFormatter: - return FileNameFormatter(self.args.file_scheme, self.args.folder_scheme, self.args.time_format) + return FileNameFormatter( + self.args.file_scheme, self.args.folder_scheme, self.args.time_format, self.args.filename_restriction_scheme + ) def create_time_filter(self) -> RedditTypes.TimeType: try: diff --git a/tests/integration_tests/test_download_integration.py b/tests/integration_tests/test_download_integration.py index 545bd31..35ca0c0 100644 --- a/tests/integration_tests/test_download_integration.py +++ b/tests/integration_tests/test_download_integration.py @@ -421,3 +421,22 @@ def test_user_serv_fail(test_args: list[str], response: int, tmp_path: Path): result = runner.invoke(cli, test_args) assert result.exit_code == 0 assert f"received {response} HTTP response" in result.output + + +@pytest.mark.online +@pytest.mark.reddit +@pytest.mark.skipif(not does_test_config_exist, reason="A test config file is required for integration tests") +@pytest.mark.parametrize( + "test_args", + ( + ["-l", "102vd5i", "--filename-restriction-scheme", "windows"], + ["-l", "m3hxzd", "--filename-restriction-scheme", "windows"], + ), +) +def test_cli_download_explicit_filename_restriction_scheme(test_args: list[str], tmp_path: Path): + runner = CliRunner() + test_args = create_basic_args_for_download_runner(test_args, tmp_path) + result = runner.invoke(cli, test_args) + assert result.exit_code == 0 + assert "Downloaded submission" in result.output + assert "Forcing Windows-compatible filenames" in result.output From 12311029e4d8f4354c14fbc2fd368e8fe3c34505 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Wed, 4 Jan 2023 19:49:50 +1000 Subject: [PATCH 6/7] Conform test name --- tests/integration_tests/test_download_integration.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration_tests/test_download_integration.py b/tests/integration_tests/test_download_integration.py index 35ca0c0..4dae353 100644 --- a/tests/integration_tests/test_download_integration.py +++ b/tests/integration_tests/test_download_integration.py @@ -410,7 +410,7 @@ def test_cli_download_score_filter(test_args: list[str], was_filtered: bool, tmp (["--user", "nasa", "--submitted"], 504), ), ) -def test_user_serv_fail(test_args: list[str], response: int, tmp_path: Path): +def test_cli_download_user_reddit_server_error(test_args: list[str], response: int, tmp_path: Path): runner = CliRunner() test_args = create_basic_args_for_download_runner(test_args, tmp_path) with patch("bdfr.connector.sleep", return_value=None): From 241021fa391a2e287d1d04ab86e7b4aa5b548741 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Thu, 5 Jan 2023 17:38:37 +1000 Subject: [PATCH 7/7] Add new option details --- README.md | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/README.md b/README.md index a174d80..b0bce0a 100644 --- a/README.md +++ b/README.md @@ -143,6 +143,10 @@ The following options are common between both the `archive` and `download` comma - Can be specified multiple times - Disables certain modules from being used - See [Disabling Modules](#disabling-modules) for more information and a list of module names +- `--filename-restriction-scheme` + - Can be: `windows`, `linux` + - Turns off the OS detection and specifies which system to use when making filenames + - See [Filesystem Restrictions](#filesystem-restrictions) - `--ignore-user` - This will add a user to ignore - Can be specified multiple times @@ -372,6 +376,7 @@ The following keys are optional, and defaults will be used if they cannot be fou - `max_wait_time` - `time_format` - `disabled_modules` +- `filename-restriction-scheme` All of these should not be modified unless you know what you're doing, as the default values will enable the BDFR to function just fine. A configuration is included in the BDFR when it is installed, and this will be placed in the configuration directory as the default. @@ -421,6 +426,14 @@ Running scenarios concurrently (at the same time) however, is more complicated. The way to fix this is to use the `--log` option to manually specify where the logfile is to be stored. If the given location is unique to each instance of the BDFR, then it will run fine. +## Filesystem Restrictions + +Different filesystems have different restrictions for what files and directories can be named. Thesse are separated into two broad categories: Linux-based filesystems, which have very few restrictions; and Windows-based filesystems, which are much more restrictive in terms if forbidden characters and length of paths. + +During the normal course of operation, the BDFR detects what filesystem it is running on and formats any filenames and directories to conform to the rules that are expected of it. However, there are cases where this will fail. When running on a Linux-based machine, or another system where the home filesystem is permissive, and accessing a share or drive with a less permissive system, the BDFR will assume that the *home* filesystem's rules apply. For example, when downloading to a SAMBA share from Ubuntu, there will be errors as SAMBA is more restrictive than Ubuntu. + +The best option would be to always download to a filesystem that is as permission as possible, such as an NFS share or ext4 drive. However, when this is not possible, the BDFR allows for the restriction scheme to be manually specified at either the command-line or in the configuration file. At the command-line, this is done with `--filename-restriction-scheme windows`, or else an option by the same name in the configuration file. + ## Manipulating Logfiles The logfiles that the BDFR outputs are consistent and quite detailed and in a format that is amenable to regex. To this end, a number of bash scripts have been [included here](./scripts). They show examples for how to extract successfully downloaded IDs, failed IDs, and more besides.