Add ability to make hard links for duplicates
This commit is contained in:
parent
902f796178
commit
2b885451e7
|
@ -43,6 +43,7 @@ def cli():
|
|||
|
||||
|
||||
@cli.command('download')
|
||||
@click.option('--make-hard-links', is_flag=True, default=None)
|
||||
@click.option('--no-dupes', is_flag=True, default=None)
|
||||
@click.option('--search-existing', is_flag=True, default=None)
|
||||
@click.option('--set-file-scheme', default=None, type=str)
|
||||
|
|
|
@ -31,6 +31,7 @@ class Configuration(Namespace):
|
|||
self.upvoted: bool = False
|
||||
self.user: Optional[str] = None
|
||||
self.verbose: int = 0
|
||||
self.make_hard_links = False
|
||||
|
||||
# Archiver-specific options
|
||||
self.format = 'json'
|
||||
|
|
|
@ -74,9 +74,10 @@ class RedditDownloader:
|
|||
self._create_reddit_instance()
|
||||
self._resolve_user_name()
|
||||
|
||||
self.master_hash_list = []
|
||||
if self.args.search_existing:
|
||||
self.master_hash_list.extend(self.scan_existing_files(self.download_directory))
|
||||
self.master_hash_list = self.scan_existing_files(self.download_directory)
|
||||
else:
|
||||
self.master_hash_list = {}
|
||||
self.authenticator = self._create_authenticator()
|
||||
logger.log(9, 'Created site authenticator')
|
||||
|
||||
|
@ -341,27 +342,33 @@ class RedditDownloader:
|
|||
logger.error(
|
||||
f'Failed to download resource from {res.url} with downloader {downloader_class.__name__}')
|
||||
return
|
||||
if res.hash.hexdigest() in self.master_hash_list and self.args.no_dupes:
|
||||
logger.warning(f'Resource from "{res.url}" and hash "{res.hash.hexdigest()}" downloaded elsewhere')
|
||||
else:
|
||||
# TODO: consider making a hard link/symlink here
|
||||
destination.parent.mkdir(parents=True, exist_ok=True)
|
||||
with open(destination, 'wb') as file:
|
||||
file.write(res.content)
|
||||
logger.debug(f'Written file to {destination}')
|
||||
self.master_hash_list.append(res.hash.hexdigest())
|
||||
logger.debug(f'Hash added to master list: {res.hash.hexdigest()}')
|
||||
logger.info(f'Downloaded submission {submission.id} from {submission.subreddit.display_name}')
|
||||
resource_hash = res.hash.hexdigest()
|
||||
if resource_hash in self.master_hash_list:
|
||||
if self.args.no_dupes:
|
||||
logger.warning(f'Resource from "{res.url}" and hash "{resource_hash}" downloaded elsewhere')
|
||||
return
|
||||
elif self.args.make_hard_links:
|
||||
self.master_hash_list[resource_hash].link_to(destination)
|
||||
logger.debug(
|
||||
f'Hard link made linking {destination} to {self.master_hash_list[resource_hash]}')
|
||||
return
|
||||
destination.parent.mkdir(parents=True, exist_ok=True)
|
||||
with open(destination, 'wb') as file:
|
||||
file.write(res.content)
|
||||
logger.debug(f'Written file to {destination}')
|
||||
self.master_hash_list[resource_hash] = destination
|
||||
logger.debug(f'Hash added to master list: {resource_hash}')
|
||||
logger.info(f'Downloaded submission {submission.id} from {submission.subreddit.display_name}')
|
||||
|
||||
@staticmethod
def scan_existing_files(directory: Path) -> dict[str, Path]:
    """Hash every file under ``directory`` and index the files by digest.

    The tree is walked recursively with ``os.walk``. The MD5 hex digest of each
    file's content becomes a key in the returned mapping and the file's path is
    the value. When several files share a digest, the one hashed last wins.

    Args:
        directory: Root of the directory tree to scan.

    Returns:
        Mapping of MD5 hex digest to the path of a file with that content.
    """
    files = []
    for dirpath, _dirnames, filenames in os.walk(directory):
        files.extend(Path(dirpath, file) for file in filenames)
    logger.info(f'Calculating hashes for {len(files)} files')

    hash_list = {}
    for existing_file in files:
        file_hash = hashlib.md5()
        with open(existing_file, 'rb') as file:
            # Feed the hash in fixed-size chunks so a large file is never held in memory whole;
            # the resulting digest is identical to hashing the full content at once.
            while chunk := file.read(1024 * 1024):
                file_hash.update(chunk)
        hash_list[file_hash.hexdigest()] = existing_file
        logger.log(9, f'Hash calculated for file at {existing_file}')
    return hash_list
|
||||
|
|
|
@ -27,12 +27,13 @@ def args() -> Configuration:
|
|||
|
||||
|
||||
@pytest.fixture()
def downloader_mock(args: Configuration):
    """Build a ``MagicMock`` that stands in for a ``RedditDownloader`` in tests.

    The mock carries the supplied ``args``, the real ``_sanitise_subreddit_name`` and
    ``_split_args_input`` callables taken from ``RedditDownloader``, and an empty
    ``master_hash_list`` dict.
    """
    mock_downloader = MagicMock()
    mock_downloader.args = args
    mock_downloader.master_hash_list = {}
    # Borrow the real helpers so code under test does not receive auto-generated mocks for them
    for helper_name in ('_sanitise_subreddit_name', '_split_args_input'):
        setattr(mock_downloader, helper_name, getattr(RedditDownloader, helper_name))
    return mock_downloader
|
||||
|
||||
|
||||
def assert_all_results_are_submissions(result_limit: int, results: list[Iterator]):
|
||||
|
@ -285,7 +286,6 @@ def test_download_submission(
|
|||
downloader_mock.args.set_folder_scheme = ''
|
||||
downloader_mock.file_name_formatter = RedditDownloader._create_file_name_formatter(downloader_mock)
|
||||
downloader_mock.download_directory = tmp_path
|
||||
downloader_mock.master_hash_list = []
|
||||
submission = downloader_mock.reddit_instance.submission(id=test_submission_id)
|
||||
RedditDownloader._download_submission(downloader_mock, submission)
|
||||
folder_contents = list(tmp_path.iterdir())
|
||||
|
@ -305,9 +305,8 @@ def test_download_submission_file_exists(
|
|||
downloader_mock.args.set_folder_scheme = ''
|
||||
downloader_mock.file_name_formatter = RedditDownloader._create_file_name_formatter(downloader_mock)
|
||||
downloader_mock.download_directory = tmp_path
|
||||
downloader_mock.master_hash_list = []
|
||||
submission = downloader_mock.reddit_instance.submission(id='m1hqw6')
|
||||
Path(tmp_path, 'Arneeman_Metagaming isn\'t always a bad thing_m1hqw6_1.png').touch()
|
||||
Path(tmp_path, 'Arneeman_Metagaming isn\'t always a bad thing_m1hqw6.png').touch()
|
||||
RedditDownloader._download_submission(downloader_mock, submission)
|
||||
folder_contents = list(tmp_path.iterdir())
|
||||
output = capsys.readouterr()
|
||||
|
@ -329,7 +328,7 @@ def test_download_submission_hash_exists(
|
|||
downloader_mock.args.no_dupes = True
|
||||
downloader_mock.file_name_formatter = RedditDownloader._create_file_name_formatter(downloader_mock)
|
||||
downloader_mock.download_directory = tmp_path
|
||||
downloader_mock.master_hash_list = ['a912af8905ae468e0121e9940f797ad7']
|
||||
downloader_mock.master_hash_list = {'a912af8905ae468e0121e9940f797ad7': None}
|
||||
submission = downloader_mock.reddit_instance.submission(id='m1hqw6')
|
||||
RedditDownloader._download_submission(downloader_mock, submission)
|
||||
folder_contents = list(tmp_path.iterdir())
|
||||
|
@ -356,8 +355,7 @@ def test_sanitise_subreddit_name(test_name: str, expected: str):
|
|||
|
||||
def test_search_existing_files():
    """Scanning the current working directory should yield at least 40 hash entries."""
    found_hashes = RedditDownloader.scan_existing_files(Path('.'))
    entry_count = len(found_hashes.keys())
    assert entry_count >= 40
|
||||
|
||||
|
||||
@pytest.mark.parametrize(('test_subreddit_entries', 'expected'), (
|
||||
|
@ -370,3 +368,28 @@ def test_search_existing_files():
|
|||
def test_split_subreddit_entries(test_subreddit_entries: list[str], expected: set[str]):
|
||||
results = RedditDownloader._split_args_input(test_subreddit_entries)
|
||||
assert results == expected
|
||||
|
||||
|
||||
@pytest.mark.online
@pytest.mark.reddit
def test_mark_hard_link(downloader_mock: MagicMock, tmp_path: Path, reddit_instance: praw.Reddit):
    """Download one submission twice with ``make_hard_links`` set and expect a hard link.

    The submission is first saved under the ``{POSTID}`` file scheme and then downloaded
    again under ``test2_{POSTID}``. The second path must share an inode with the first,
    and the first file must report a link count of two.
    """
    downloader_mock.reddit_instance = reddit_instance
    downloader_mock.download_directory = tmp_path
    downloader_mock.args.make_hard_links = True
    downloader_mock.args.set_folder_scheme = ''
    downloader_mock.args.set_file_scheme = '{POSTID}'
    downloader_mock.file_name_formatter = RedditDownloader._create_file_name_formatter(downloader_mock)
    submission = downloader_mock.reddit_instance.submission(id='m1hqw6')
    first_copy = tmp_path / 'm1hqw6.png'
    second_copy = tmp_path / 'test2_m1hqw6.png'

    # First pass: the file is expected to land at the plain post-id name
    RedditDownloader._download_submission(downloader_mock, submission)
    assert first_copy.exists()

    # Second pass: same submission under a different file scheme, so a second path is produced
    downloader_mock.args.set_file_scheme = 'test2_{POSTID}'
    downloader_mock.file_name_formatter = RedditDownloader._create_file_name_formatter(downloader_mock)
    RedditDownloader._download_submission(downloader_mock, submission)

    first_stats = first_copy.stat()
    second_inode = second_copy.stat().st_ino

    assert first_stats.st_nlink == 2
    assert first_stats.st_ino == second_inode
|
||||
|
|
Loading…
Reference in a new issue