Add ability to make hard links for duplicates
commit 2b885451e7
parent 902f796178
@@ -43,6 +43,7 @@ def cli():


 @cli.command('download')
+@click.option('--make-hard-links', is_flag=True, default=None)
 @click.option('--no-dupes', is_flag=True, default=None)
 @click.option('--search-existing', is_flag=True, default=None)
 @click.option('--set-file-scheme', default=None, type=str)
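A note on the `is_flag=True, default=None` pattern used for the new option (and the existing ones): click resolves such an option to None when the flag is omitted and True when it is passed, which lets a later merge step distinguish "unset" from "explicitly set". A minimal standalone sketch of that behaviour; the command name is illustrative, not from the repo:

    import click

    @click.command()
    @click.option('--make-hard-links', is_flag=True, default=None)
    def demo(make_hard_links):
        # Prints 'None' when the flag is omitted and 'True' when it is passed,
        # so downstream config merging can tell "unset" apart from "disabled".
        click.echo(repr(make_hard_links))

    if __name__ == '__main__':
        demo()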
@@ -31,6 +31,7 @@ class Configuration(Namespace):
         self.upvoted: bool = False
         self.user: Optional[str] = None
         self.verbose: int = 0
+        self.make_hard_links = False

         # Archiver-specific options
         self.format = 'json'
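The False set here is only the fallback; the CLI flag above arrives as None unless the user passes it, so a merge step (hypothetical here, not part of this diff) can overwrite the attribute only when the flag was actually supplied. A sketch of that idea:

    # Hypothetical merge helper, assuming click parsing has already run:
    # only options the user actually supplied (value is not None) override
    # the defaults assigned in Configuration.__init__.
    def process_click_arguments(config: 'Configuration', context: 'click.Context'):
        for name, value in context.params.items():
            if value is not None and hasattr(config, name):
                setattr(config, name, value)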
@@ -74,9 +74,10 @@ class RedditDownloader:
         self._create_reddit_instance()
         self._resolve_user_name()

-        self.master_hash_list = []
         if self.args.search_existing:
-            self.master_hash_list.extend(self.scan_existing_files(self.download_directory))
+            self.master_hash_list = self.scan_existing_files(self.download_directory)
+        else:
+            self.master_hash_list = {}
         self.authenticator = self._create_authenticator()
         logger.log(9, 'Created site authenticator')
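The shape change is the core of the commit: as a list, master_hash_list could only answer "has this hash been seen?", while as a dict keyed on the hash it also remembers where the first copy landed, which the hard-link branch below needs. Roughly:

    from pathlib import Path

    # Before: membership only.
    seen_hashes = ['a912af8905ae468e0121e9940f797ad7']

    # After: membership plus the location of the first copy,
    # so a duplicate can be hard-linked to it instead of re-written.
    seen_files = {'a912af8905ae468e0121e9940f797ad7': Path('downloads/first_copy.png')}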
@@ -341,27 +342,33 @@ class RedditDownloader:
             logger.error(
                 f'Failed to download resource from {res.url} with downloader {downloader_class.__name__}')
             return
-        if res.hash.hexdigest() in self.master_hash_list and self.args.no_dupes:
-            logger.warning(f'Resource from "{res.url}" and hash "{res.hash.hexdigest()}" downloaded elsewhere')
-        else:
-            # TODO: consider making a hard link/symlink here
-            destination.parent.mkdir(parents=True, exist_ok=True)
-            with open(destination, 'wb') as file:
-                file.write(res.content)
-            logger.debug(f'Written file to {destination}')
-            self.master_hash_list.append(res.hash.hexdigest())
-            logger.debug(f'Hash added to master list: {res.hash.hexdigest()}')
-        logger.info(f'Downloaded submission {submission.id} from {submission.subreddit.display_name}')
+        resource_hash = res.hash.hexdigest()
+        if resource_hash in self.master_hash_list:
+            if self.args.no_dupes:
+                logger.warning(f'Resource from "{res.url}" and hash "{resource_hash}" downloaded elsewhere')
+                return
+            elif self.args.make_hard_links:
+                self.master_hash_list[resource_hash].link_to(destination)
+                logger.debug(
+                    f'Hard link made linking {destination} to {self.master_hash_list[resource_hash]}')
+                return
+        destination.parent.mkdir(parents=True, exist_ok=True)
+        with open(destination, 'wb') as file:
+            file.write(res.content)
+        logger.debug(f'Written file to {destination}')
+        self.master_hash_list[resource_hash] = destination
+        logger.debug(f'Hash added to master list: {resource_hash}')
+        logger.info(f'Downloaded submission {submission.id} from {submission.subreddit.display_name}')

     @staticmethod
-    def scan_existing_files(directory: Path) -> list[str]:
+    def scan_existing_files(directory: Path) -> dict[str, Path]:
         files = []
         for (dirpath, dirnames, filenames) in os.walk(directory):
             files.extend([Path(dirpath, file) for file in filenames])
         logger.info(f'Calculating hashes for {len(files)} files')
-        hash_list = []
+        hash_list = {}
         for existing_file in files:
             with open(existing_file, 'rb') as file:
-                hash_list.append(hashlib.md5(file.read()).hexdigest())
+                hash_list[hashlib.md5(file.read()).hexdigest()] = existing_file
             logger.log(9, f'Hash calculated for file at {existing_file}')
         return hash_list
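One subtlety in the hard-link branch: `Path.link_to` makes its argument a hard link to the file the path object already names, so the first download stays where it is and destination becomes a second name for it. The method also has a history worth flagging: it was deprecated in Python 3.10 in favour of `Path.hardlink_to` (which reverses the direction) and removed in 3.12. An equivalent sketch with hypothetical paths:

    import os
    from pathlib import Path

    existing = Path('downloads/first_copy.png')      # the path stored in master_hash_list
    destination = Path('downloads/second_name.png')  # where the duplicate would have gone

    existing.link_to(destination)         # as in this diff; deprecated 3.10, removed 3.12
    # destination.hardlink_to(existing)   # Python 3.10+ spelling, arguments reversed
    # os.link(existing, destination)      # the portable primitive both wrap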
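`scan_existing_files` now returns the same dict shape the constructor expects: the MD5 hex digest of each file's full contents mapped to the file's path. A standalone equivalent, useful for checking a directory outside the downloader (the function name is illustrative):

    import hashlib
    import os
    from pathlib import Path

    def hash_directory(directory: Path) -> dict[str, Path]:
        # Walk the tree and map each file's MD5 digest to its location,
        # mirroring what scan_existing_files returns after this commit.
        hashes = {}
        for dirpath, _dirnames, filenames in os.walk(directory):
            for name in filenames:
                file_path = Path(dirpath, name)
                hashes[hashlib.md5(file_path.read_bytes()).hexdigest()] = file_path
        return hashes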
@@ -27,12 +27,13 @@ def args() -> Configuration:


 @pytest.fixture()
-def downloader_mock(args: argparse.Namespace):
-    mock_downloader = MagicMock()
-    mock_downloader.args = args
-    mock_downloader._sanitise_subreddit_name = RedditDownloader._sanitise_subreddit_name
-    mock_downloader._split_args_input = RedditDownloader._split_args_input
-    return mock_downloader
+def downloader_mock(args: Configuration):
+    downloader_mock = MagicMock()
+    downloader_mock.args = args
+    downloader_mock._sanitise_subreddit_name = RedditDownloader._sanitise_subreddit_name
+    downloader_mock._split_args_input = RedditDownloader._split_args_input
+    downloader_mock.master_hash_list = {}
+    return downloader_mock


 def assert_all_results_are_submissions(result_limit: int, results: list[Iterator]):
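Seeding the fixture with a real empty dict matters because `_download_submission` both tests membership on `master_hash_list` and assigns into it; a bare `MagicMock` attribute records those operations but does not behave like a dict. A quick standalone illustration:

    from unittest.mock import MagicMock

    mock = MagicMock()
    mock.master_hash_list['abc'] = 'x'       # recorded as a call, nothing is stored
    print('abc' in mock.master_hash_list)    # False: MagicMock's __contains__ default

    mock.master_hash_list = {}
    mock.master_hash_list['abc'] = 'x'       # real dict semantics from here on
    print('abc' in mock.master_hash_list)    # True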
@@ -285,7 +286,6 @@ def test_download_submission(
     downloader_mock.args.set_folder_scheme = ''
     downloader_mock.file_name_formatter = RedditDownloader._create_file_name_formatter(downloader_mock)
     downloader_mock.download_directory = tmp_path
-    downloader_mock.master_hash_list = []
     submission = downloader_mock.reddit_instance.submission(id=test_submission_id)
     RedditDownloader._download_submission(downloader_mock, submission)
     folder_contents = list(tmp_path.iterdir())
@@ -305,9 +305,8 @@ def test_download_submission_file_exists(
     downloader_mock.args.set_folder_scheme = ''
     downloader_mock.file_name_formatter = RedditDownloader._create_file_name_formatter(downloader_mock)
     downloader_mock.download_directory = tmp_path
-    downloader_mock.master_hash_list = []
     submission = downloader_mock.reddit_instance.submission(id='m1hqw6')
-    Path(tmp_path, 'Arneeman_Metagaming isn\'t always a bad thing_m1hqw6_1.png').touch()
+    Path(tmp_path, 'Arneeman_Metagaming isn\'t always a bad thing_m1hqw6.png').touch()
     RedditDownloader._download_submission(downloader_mock, submission)
     folder_contents = list(tmp_path.iterdir())
     output = capsys.readouterr()
@@ -329,7 +328,7 @@ def test_download_submission_hash_exists(
     downloader_mock.args.no_dupes = True
     downloader_mock.file_name_formatter = RedditDownloader._create_file_name_formatter(downloader_mock)
     downloader_mock.download_directory = tmp_path
-    downloader_mock.master_hash_list = ['a912af8905ae468e0121e9940f797ad7']
+    downloader_mock.master_hash_list = {'a912af8905ae468e0121e9940f797ad7': None}
     submission = downloader_mock.reddit_instance.submission(id='m1hqw6')
     RedditDownloader._download_submission(downloader_mock, submission)
     folder_contents = list(tmp_path.iterdir())
@@ -356,8 +355,7 @@ def test_sanitise_subreddit_name(test_name: str, expected: str):

 def test_search_existing_files():
     results = RedditDownloader.scan_existing_files(Path('.'))
-    assert all([isinstance(result, str) for result in results])
-    assert len(results) >= 40
+    assert len(results.keys()) >= 40


 @pytest.mark.parametrize(('test_subreddit_entries', 'expected'), (
@@ -370,3 +368,28 @@ def test_search_existing_files():
 def test_split_subreddit_entries(test_subreddit_entries: list[str], expected: set[str]):
     results = RedditDownloader._split_args_input(test_subreddit_entries)
     assert results == expected
+
+
+@pytest.mark.online
+@pytest.mark.reddit
+def test_mark_hard_link(downloader_mock: MagicMock, tmp_path: Path, reddit_instance: praw.Reddit):
+    downloader_mock.reddit_instance = reddit_instance
+    downloader_mock.args.make_hard_links = True
+    downloader_mock.download_directory = tmp_path
+    downloader_mock.args.set_folder_scheme = ''
+    downloader_mock.args.set_file_scheme = '{POSTID}'
+    downloader_mock.file_name_formatter = RedditDownloader._create_file_name_formatter(downloader_mock)
+    submission = downloader_mock.reddit_instance.submission(id='m1hqw6')
+    original = Path(tmp_path, 'm1hqw6.png')
+
+    RedditDownloader._download_submission(downloader_mock, submission)
+    assert original.exists()
+
+    downloader_mock.args.set_file_scheme = 'test2_{POSTID}'
+    downloader_mock.file_name_formatter = RedditDownloader._create_file_name_formatter(downloader_mock)
+    RedditDownloader._download_submission(downloader_mock, submission)
+    test_file_1_stats = original.stat()
+    test_file_2_inode = Path(tmp_path, 'test2_m1hqw6.png').stat().st_ino
+
+    assert test_file_1_stats.st_nlink == 2
+    assert test_file_1_stats.st_ino == test_file_2_inode
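The final assertions lean on two POSIX stat fields: `st_nlink` counts how many directory entries point at an inode, and `st_ino` identifies the inode itself, so two hard-linked names agree on both. A standalone demonstration of the same checks, assuming a filesystem that supports hard links:

    import os
    import tempfile
    from pathlib import Path

    with tempfile.TemporaryDirectory() as tmp:
        first = Path(tmp, 'first.bin')
        second = Path(tmp, 'second.bin')
        first.write_bytes(b'payload')
        os.link(first, second)                   # second becomes another name for first

        assert first.stat().st_nlink == 2        # two names, one inode
        assert first.stat().st_ino == second.stat().st_ino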