From 2b885451e79c7d09c29729601ecbe51c0e61f862 Mon Sep 17 00:00:00 2001
From: Serene-Arc
Date: Sat, 20 Mar 2021 12:03:53 +1000
Subject: [PATCH] Add ability to make hard links for duplicates

---
 bulkredditdownloader/__main__.py              |  1 +
 bulkredditdownloader/configuration.py         |  1 +
 bulkredditdownloader/downloader.py            | 39 ++++++++-------
 bulkredditdownloader/tests/test_downloader.py | 47 ++++++++++++++-----
 4 files changed, 60 insertions(+), 28 deletions(-)

diff --git a/bulkredditdownloader/__main__.py b/bulkredditdownloader/__main__.py
index 6d24303..1e40bb0 100644
--- a/bulkredditdownloader/__main__.py
+++ b/bulkredditdownloader/__main__.py
@@ -43,6 +43,7 @@ def cli():
 
 
 @cli.command('download')
+@click.option('--make-hard-links', is_flag=True, default=None)
 @click.option('--no-dupes', is_flag=True, default=None)
 @click.option('--search-existing', is_flag=True, default=None)
 @click.option('--set-file-scheme', default=None, type=str)
diff --git a/bulkredditdownloader/configuration.py b/bulkredditdownloader/configuration.py
index 09d1b8a..7c298b4 100644
--- a/bulkredditdownloader/configuration.py
+++ b/bulkredditdownloader/configuration.py
@@ -31,6 +31,7 @@ class Configuration(Namespace):
         self.upvoted: bool = False
         self.user: Optional[str] = None
         self.verbose: int = 0
+        self.make_hard_links = False
 
         # Archiver-specific options
         self.format = 'json'
diff --git a/bulkredditdownloader/downloader.py b/bulkredditdownloader/downloader.py
index e810086..dc9c301 100644
--- a/bulkredditdownloader/downloader.py
+++ b/bulkredditdownloader/downloader.py
@@ -74,9 +74,10 @@ class RedditDownloader:
         self._create_reddit_instance()
         self._resolve_user_name()
 
-        self.master_hash_list = []
         if self.args.search_existing:
-            self.master_hash_list.extend(self.scan_existing_files(self.download_directory))
+            self.master_hash_list = self.scan_existing_files(self.download_directory)
+        else:
+            self.master_hash_list = {}
 
         self.authenticator = self._create_authenticator()
         logger.log(9, 'Created site authenticator')
@@ -341,27 +342,33 @@ class RedditDownloader:
                     logger.error(
                         f'Failed to download resource from {res.url} with downloader {downloader_class.__name__}')
                     return
-                if res.hash.hexdigest() in self.master_hash_list and self.args.no_dupes:
-                    logger.warning(f'Resource from "{res.url}" and hash "{res.hash.hexdigest()}" downloaded elsewhere')
-                else:
-                    # TODO: consider making a hard link/symlink here
-                    destination.parent.mkdir(parents=True, exist_ok=True)
-                    with open(destination, 'wb') as file:
-                        file.write(res.content)
-                    logger.debug(f'Written file to {destination}')
-                    self.master_hash_list.append(res.hash.hexdigest())
-                    logger.debug(f'Hash added to master list: {res.hash.hexdigest()}')
-                    logger.info(f'Downloaded submission {submission.id} from {submission.subreddit.display_name}')
+                resource_hash = res.hash.hexdigest()
+                if resource_hash in self.master_hash_list:
+                    if self.args.no_dupes:
+                        logger.warning(f'Resource from "{res.url}" and hash "{resource_hash}" downloaded elsewhere')
+                        return
+                    elif self.args.make_hard_links:
+                        self.master_hash_list[resource_hash].link_to(destination)
+                        logger.debug(
+                            f'Hard link made linking {destination} to {self.master_hash_list[resource_hash]}')
+                        return
+                destination.parent.mkdir(parents=True, exist_ok=True)
+                with open(destination, 'wb') as file:
+                    file.write(res.content)
+                logger.debug(f'Written file to {destination}')
+                self.master_hash_list[resource_hash] = destination
+                logger.debug(f'Hash added to master list: {resource_hash}')
+                logger.info(f'Downloaded submission {submission.id} from {submission.subreddit.display_name}')
 
     @staticmethod
-    def scan_existing_files(directory: Path) -> list[str]:
+    def scan_existing_files(directory: Path) -> dict[str, Path]:
         files = []
         for (dirpath, dirnames, filenames) in os.walk(directory):
             files.extend([Path(dirpath, file) for file in filenames])
         logger.info(f'Calculating hashes for {len(files)} files')
-        hash_list = []
+        hash_list = {}
         for existing_file in files:
             with open(existing_file, 'rb') as file:
-                hash_list.append(hashlib.md5(file.read()).hexdigest())
+                hash_list[hashlib.md5(file.read()).hexdigest()] = existing_file
             logger.log(9, f'Hash calculated for file at {existing_file}')
         return hash_list
diff --git a/bulkredditdownloader/tests/test_downloader.py b/bulkredditdownloader/tests/test_downloader.py
index cf5c4fa..e263a5d 100644
--- a/bulkredditdownloader/tests/test_downloader.py
+++ b/bulkredditdownloader/tests/test_downloader.py
@@ -27,12 +27,13 @@ def args() -> Configuration:
 
 
 @pytest.fixture()
-def downloader_mock(args: argparse.Namespace):
-    mock_downloader = MagicMock()
-    mock_downloader.args = args
-    mock_downloader._sanitise_subreddit_name = RedditDownloader._sanitise_subreddit_name
-    mock_downloader._split_args_input = RedditDownloader._split_args_input
-    return mock_downloader
+def downloader_mock(args: Configuration):
+    downloader_mock = MagicMock()
+    downloader_mock.args = args
+    downloader_mock._sanitise_subreddit_name = RedditDownloader._sanitise_subreddit_name
+    downloader_mock._split_args_input = RedditDownloader._split_args_input
+    downloader_mock.master_hash_list = {}
+    return downloader_mock
 
 
 def assert_all_results_are_submissions(result_limit: int, results: list[Iterator]):
@@ -285,7 +286,6 @@ def test_download_submission(
     downloader_mock.args.set_folder_scheme = ''
     downloader_mock.file_name_formatter = RedditDownloader._create_file_name_formatter(downloader_mock)
     downloader_mock.download_directory = tmp_path
-    downloader_mock.master_hash_list = []
     submission = downloader_mock.reddit_instance.submission(id=test_submission_id)
     RedditDownloader._download_submission(downloader_mock, submission)
     folder_contents = list(tmp_path.iterdir())
@@ -305,9 +305,8 @@ def test_download_submission_file_exists(
     downloader_mock.args.set_folder_scheme = ''
     downloader_mock.file_name_formatter = RedditDownloader._create_file_name_formatter(downloader_mock)
     downloader_mock.download_directory = tmp_path
-    downloader_mock.master_hash_list = []
     submission = downloader_mock.reddit_instance.submission(id='m1hqw6')
-    Path(tmp_path, 'Arneeman_Metagaming isn\'t always a bad thing_m1hqw6_1.png').touch()
+    Path(tmp_path, 'Arneeman_Metagaming isn\'t always a bad thing_m1hqw6.png').touch()
     RedditDownloader._download_submission(downloader_mock, submission)
     folder_contents = list(tmp_path.iterdir())
     output = capsys.readouterr()
@@ -329,7 +328,7 @@ def test_download_submission_hash_exists(
     downloader_mock.args.no_dupes = True
     downloader_mock.file_name_formatter = RedditDownloader._create_file_name_formatter(downloader_mock)
     downloader_mock.download_directory = tmp_path
-    downloader_mock.master_hash_list = ['a912af8905ae468e0121e9940f797ad7']
+    downloader_mock.master_hash_list = {'a912af8905ae468e0121e9940f797ad7': None}
     submission = downloader_mock.reddit_instance.submission(id='m1hqw6')
     RedditDownloader._download_submission(downloader_mock, submission)
     folder_contents = list(tmp_path.iterdir())
@@ -356,8 +355,7 @@ def test_sanitise_subreddit_name(test_name: str, expected: str):
 
 def test_search_existing_files():
     results = RedditDownloader.scan_existing_files(Path('.'))
-    assert all([isinstance(result, str) for result in results])
-    assert len(results) >= 40
+    assert len(results.keys()) >= 40
 
 
 @pytest.mark.parametrize(('test_subreddit_entries', 'expected'), (
@@ -370,3 +368,28 @@ def test_search_existing_files():
 def test_split_subreddit_entries(test_subreddit_entries: list[str], expected: set[str]):
     results = RedditDownloader._split_args_input(test_subreddit_entries)
     assert results == expected
+
+
+@pytest.mark.online
+@pytest.mark.reddit
+def test_mark_hard_link(downloader_mock: MagicMock, tmp_path: Path, reddit_instance: praw.Reddit):
+    downloader_mock.reddit_instance = reddit_instance
+    downloader_mock.args.make_hard_links = True
+    downloader_mock.download_directory = tmp_path
+    downloader_mock.args.set_folder_scheme = ''
+    downloader_mock.args.set_file_scheme = '{POSTID}'
+    downloader_mock.file_name_formatter = RedditDownloader._create_file_name_formatter(downloader_mock)
+    submission = downloader_mock.reddit_instance.submission(id='m1hqw6')
+    original = Path(tmp_path, 'm1hqw6.png')
+
+    RedditDownloader._download_submission(downloader_mock, submission)
+    assert original.exists()
+
+    downloader_mock.args.set_file_scheme = 'test2_{POSTID}'
+    downloader_mock.file_name_formatter = RedditDownloader._create_file_name_formatter(downloader_mock)
+    RedditDownloader._download_submission(downloader_mock, submission)
+    test_file_1_stats = original.stat()
+    test_file_2_inode = Path(tmp_path, 'test2_m1hqw6.png').stat().st_ino
+
+    assert test_file_1_stats.st_nlink == 2
+    assert test_file_1_stats.st_ino == test_file_2_inode
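
Note (not part of the patch): a minimal standalone sketch of the dedup strategy the diff implements, for reference. It keeps a dict mapping a content hash to the first path written with those bytes and hard-links later duplicates to that path instead of writing the data again. The function name save_or_link and the file paths are hypothetical; Path.link_to, which the patch uses, was deprecated in Python 3.10 and removed in 3.12 in favour of Path.hardlink_to.

import hashlib
from pathlib import Path

# hash of file contents -> first path written with those contents
master_hash_list: dict[str, Path] = {}


def save_or_link(content: bytes, destination: Path, make_hard_links: bool = True) -> None:
    resource_hash = hashlib.md5(content).hexdigest()
    if resource_hash in master_hash_list and make_hard_links:
        # Duplicate bytes: add a second directory entry pointing at the
        # existing inode instead of writing the data to disk again.
        master_hash_list[resource_hash].link_to(destination)
        return
    destination.parent.mkdir(parents=True, exist_ok=True)
    destination.write_bytes(content)
    master_hash_list[resource_hash] = destination


save_or_link(b'same bytes', Path('downloads/first.png'))
save_or_link(b'same bytes', Path('downloads/second.png'))
# Both names now refer to the same inode, as the new test asserts via st_nlink.
assert Path('downloads/first.png').stat().st_nlink == 2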