
Add ability to make hard links for duplicates

Serene-Arc 2021-03-20 12:03:53 +10:00 committed by Ali Parlakci
parent 902f796178
commit 2b885451e7
4 changed files with 60 additions and 28 deletions
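In short: the downloader already computed an MD5 hash for every file it saved; this commit turns the flat hash list into a hash-to-path map, so a re-encountered hash can either be skipped (`--no-dupes`) or materialised as a hard link to the first copy (`--make-hard-links`). A minimal sketch of the underlying filesystem operation, assuming the two paths live on the same filesystem (hard links cannot cross filesystems); `os.link` is the portable stdlib call, while the diff itself uses `Path.link_to`:

```python
# Minimal sketch of hard-linking a duplicate instead of writing it twice.
# Assumption: 'existing' and 'destination' are on the same filesystem.
import os
from pathlib import Path
from typing import Optional

def write_or_link(content: bytes, destination: Path, existing: Optional[Path]) -> None:
    destination.parent.mkdir(parents=True, exist_ok=True)
    if existing is not None:
        os.link(existing, destination)  # new directory entry, same inode: no extra disk space
    else:
        destination.write_bytes(content)
```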

View file

@@ -43,6 +43,7 @@ def cli():
 @cli.command('download')
+@click.option('--make-hard-links', is_flag=True, default=None)
 @click.option('--no-dupes', is_flag=True, default=None)
 @click.option('--search-existing', is_flag=True, default=None)
 @click.option('--set-file-scheme', default=None, type=str)
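The new flag follows the existing `is_flag=True, default=None` pattern, which lets the configuration layer distinguish "flag not given" (None) from an explicit False. A small, self-contained sketch of that click behaviour (the command body is illustrative, not the project's):

```python
# A click flag declared with is_flag=True, default=None evaluates to True
# when passed and None when omitted, so config merging can tell "unset"
# apart from an explicit False.
import click

@click.command('download')
@click.option('--make-hard-links', is_flag=True, default=None)
def download(make_hard_links):
    click.echo(f'make_hard_links={make_hard_links}')

if __name__ == '__main__':
    download()  # `python demo.py --make-hard-links` prints make_hard_links=True
```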

View file

@@ -31,6 +31,7 @@ class Configuration(Namespace):
         self.upvoted: bool = False
         self.user: Optional[str] = None
         self.verbose: int = 0
+        self.make_hard_links = False
         # Archiver-specific options
         self.format = 'json'

View file

@@ -74,9 +74,10 @@ class RedditDownloader:
         self._create_reddit_instance()
         self._resolve_user_name()
-        self.master_hash_list = []
         if self.args.search_existing:
-            self.master_hash_list.extend(self.scan_existing_files(self.download_directory))
+            self.master_hash_list = self.scan_existing_files(self.download_directory)
+        else:
+            self.master_hash_list = {}
         self.authenticator = self._create_authenticator()
         logger.log(9, 'Created site authenticator')
@@ -341,27 +342,33 @@ class RedditDownloader:
             logger.error(
                 f'Failed to download resource from {res.url} with downloader {downloader_class.__name__}')
             return
-        if res.hash.hexdigest() in self.master_hash_list and self.args.no_dupes:
-            logger.warning(f'Resource from "{res.url}" and hash "{res.hash.hexdigest()}" downloaded elsewhere')
-        else:
-            # TODO: consider making a hard link/symlink here
-            destination.parent.mkdir(parents=True, exist_ok=True)
-            with open(destination, 'wb') as file:
-                file.write(res.content)
-            logger.debug(f'Written file to {destination}')
-            self.master_hash_list.append(res.hash.hexdigest())
-            logger.debug(f'Hash added to master list: {res.hash.hexdigest()}')
-        logger.info(f'Downloaded submission {submission.id} from {submission.subreddit.display_name}')
+        resource_hash = res.hash.hexdigest()
+        if resource_hash in self.master_hash_list:
+            if self.args.no_dupes:
+                logger.warning(f'Resource from "{res.url}" and hash "{resource_hash}" downloaded elsewhere')
+                return
+            elif self.args.make_hard_links:
+                self.master_hash_list[resource_hash].link_to(destination)
+                logger.debug(
+                    f'Hard link made linking {destination} to {self.master_hash_list[resource_hash]}')
+                return
+        destination.parent.mkdir(parents=True, exist_ok=True)
+        with open(destination, 'wb') as file:
+            file.write(res.content)
+        logger.debug(f'Written file to {destination}')
+        self.master_hash_list[resource_hash] = destination
+        logger.debug(f'Hash added to master list: {resource_hash}')
+        logger.info(f'Downloaded submission {submission.id} from {submission.subreddit.display_name}')
 
     @staticmethod
-    def scan_existing_files(directory: Path) -> list[str]:
+    def scan_existing_files(directory: Path) -> dict[str, Path]:
         files = []
         for (dirpath, dirnames, filenames) in os.walk(directory):
             files.extend([Path(dirpath, file) for file in filenames])
         logger.info(f'Calculating hashes for {len(files)} files')
-        hash_list = []
+        hash_list = {}
         for existing_file in files:
             with open(existing_file, 'rb') as file:
-                hash_list.append(hashlib.md5(file.read()).hexdigest())
+                hash_list[hashlib.md5(file.read()).hexdigest()] = existing_file
             logger.log(9, f'Hash calculated for file at {existing_file}')
         return hash_list
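Pulled out of the class, the new control flow looks roughly like this (a condensed sketch with illustrative names, not the project's API). Note that `Path.link_to(target)` hard-links *target* to the existing file, which is why the stored path is the receiver; since Python 3.10, `Path.hardlink_to` is the preferred spelling:

```python
# Condensed sketch of the deduplication branch (names are illustrative).
import hashlib
from pathlib import Path

def store(content: bytes, destination: Path, hash_list: dict[str, Path],
          no_dupes: bool = False, make_hard_links: bool = False) -> None:
    digest = hashlib.md5(content).hexdigest()
    destination.parent.mkdir(parents=True, exist_ok=True)  # hoisted so the link branch also has a parent dir
    if digest in hash_list:
        if no_dupes:
            return                                  # duplicate: skip entirely
        if make_hard_links:
            hash_list[digest].link_to(destination)  # duplicate: link to the first copy
            return
    destination.write_bytes(content)
    hash_list[digest] = destination                 # first sighting: remember its path
```

Switching `master_hash_list` from a list to a dict keyed by digest is what makes the feature possible: membership tests become O(1), and the value carries the path of the first copy, which the linking branch needs and a bare list of hashes could never supply.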

View file

@@ -27,12 +27,13 @@ def args() -> Configuration:
 @pytest.fixture()
-def downloader_mock(args: argparse.Namespace):
-    mock_downloader = MagicMock()
-    mock_downloader.args = args
-    mock_downloader._sanitise_subreddit_name = RedditDownloader._sanitise_subreddit_name
-    mock_downloader._split_args_input = RedditDownloader._split_args_input
-    return mock_downloader
+def downloader_mock(args: Configuration):
+    downloader_mock = MagicMock()
+    downloader_mock.args = args
+    downloader_mock._sanitise_subreddit_name = RedditDownloader._sanitise_subreddit_name
+    downloader_mock._split_args_input = RedditDownloader._split_args_input
+    downloader_mock.master_hash_list = {}
+    return downloader_mock
 
 
 def assert_all_results_are_submissions(result_limit: int, results: list[Iterator]):
@@ -285,7 +286,6 @@ def test_download_submission(
     downloader_mock.args.set_folder_scheme = ''
     downloader_mock.file_name_formatter = RedditDownloader._create_file_name_formatter(downloader_mock)
     downloader_mock.download_directory = tmp_path
-    downloader_mock.master_hash_list = []
     submission = downloader_mock.reddit_instance.submission(id=test_submission_id)
     RedditDownloader._download_submission(downloader_mock, submission)
     folder_contents = list(tmp_path.iterdir())
@@ -305,9 +305,8 @@ def test_download_submission_file_exists(
     downloader_mock.args.set_folder_scheme = ''
     downloader_mock.file_name_formatter = RedditDownloader._create_file_name_formatter(downloader_mock)
     downloader_mock.download_directory = tmp_path
-    downloader_mock.master_hash_list = []
     submission = downloader_mock.reddit_instance.submission(id='m1hqw6')
-    Path(tmp_path, 'Arneeman_Metagaming isn\'t always a bad thing_m1hqw6_1.png').touch()
+    Path(tmp_path, 'Arneeman_Metagaming isn\'t always a bad thing_m1hqw6.png').touch()
     RedditDownloader._download_submission(downloader_mock, submission)
     folder_contents = list(tmp_path.iterdir())
     output = capsys.readouterr()
@@ -329,7 +328,7 @@ def test_download_submission_hash_exists(
     downloader_mock.args.no_dupes = True
     downloader_mock.file_name_formatter = RedditDownloader._create_file_name_formatter(downloader_mock)
     downloader_mock.download_directory = tmp_path
-    downloader_mock.master_hash_list = ['a912af8905ae468e0121e9940f797ad7']
+    downloader_mock.master_hash_list = {'a912af8905ae468e0121e9940f797ad7': None}
     submission = downloader_mock.reddit_instance.submission(id='m1hqw6')
     RedditDownloader._download_submission(downloader_mock, submission)
     folder_contents = list(tmp_path.iterdir())
@@ -356,8 +355,7 @@ def test_sanitise_subreddit_name(test_name: str, expected: str):
 def test_search_existing_files():
     results = RedditDownloader.scan_existing_files(Path('.'))
-    assert all([isinstance(result, str) for result in results])
-    assert len(results) >= 40
+    assert len(results.keys()) >= 40
 
 
 @pytest.mark.parametrize(('test_subreddit_entries', 'expected'), (
@@ -370,3 +368,28 @@ def test_search_existing_files():
 def test_split_subreddit_entries(test_subreddit_entries: list[str], expected: set[str]):
     results = RedditDownloader._split_args_input(test_subreddit_entries)
     assert results == expected
+
+
+@pytest.mark.online
+@pytest.mark.reddit
+def test_mark_hard_link(downloader_mock: MagicMock, tmp_path: Path, reddit_instance: praw.Reddit):
+    downloader_mock.reddit_instance = reddit_instance
+    downloader_mock.args.make_hard_links = True
+    downloader_mock.download_directory = tmp_path
+    downloader_mock.args.set_folder_scheme = ''
+    downloader_mock.args.set_file_scheme = '{POSTID}'
+    downloader_mock.file_name_formatter = RedditDownloader._create_file_name_formatter(downloader_mock)
+    submission = downloader_mock.reddit_instance.submission(id='m1hqw6')
+    original = Path(tmp_path, 'm1hqw6.png')
+    RedditDownloader._download_submission(downloader_mock, submission)
+    assert original.exists()
+
+    downloader_mock.args.set_file_scheme = 'test2_{POSTID}'
+    downloader_mock.file_name_formatter = RedditDownloader._create_file_name_formatter(downloader_mock)
+    RedditDownloader._download_submission(downloader_mock, submission)
+    test_file_1_stats = original.stat()
+    test_file_2_inode = Path(tmp_path, 'test2_m1hqw6.png').stat().st_ino
+    assert test_file_1_stats.st_nlink == 2
+    assert test_file_1_stats.st_ino == test_file_2_inode
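The final assertions are the standard way to prove a hard link on POSIX: both directory entries resolve to the same inode, and the link count on that inode is 2. The same check as a reusable sketch (a hypothetical helper, not part of the test suite):

```python
from pathlib import Path

def is_hard_link_pair(a: Path, b: Path) -> bool:
    sa, sb = a.stat(), b.stat()
    # Hard links share one inode on one device; st_nlink counts the directory entries.
    return (sa.st_dev, sa.st_ino) == (sb.st_dev, sb.st_ino) and sa.st_nlink >= 2
```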