diff --git a/bulkredditdownloader/__main__.py b/bulkredditdownloader/__main__.py
index d8d1f08..8a48958 100644
--- a/bulkredditdownloader/__main__.py
+++ b/bulkredditdownloader/__main__.py
@@ -39,6 +39,7 @@ def cli():
 @click.option('--set-folder-scheme', default=None, type=str)
 @click.option('--no-dupes', is_flag=True, default=None)
 @click.option('--config', type=str, default=None)
+@click.option('--search-existing', is_flag=True, default=None)
 @click.pass_context
 def cli_download(context: click.Context, **_):
     config = Configuration()
diff --git a/bulkredditdownloader/configuration.py b/bulkredditdownloader/configuration.py
index a467cd1..6633ec2 100644
--- a/bulkredditdownloader/configuration.py
+++ b/bulkredditdownloader/configuration.py
@@ -19,6 +19,7 @@ class Configuration(Namespace):
         self.no_dupes: bool = False
         self.saved: bool = False
         self.search: Optional[str] = None
+        self.search_existing: bool = False
         self.set_file_scheme: str = '{REDDITOR}_{TITLE}_{POSTID}'
         self.set_folder_scheme: str = '{SUBREDDIT}'
         self.skip: list[str] = []
diff --git a/bulkredditdownloader/downloader.py b/bulkredditdownloader/downloader.py
index 5b939b0..d8eb54d 100644
--- a/bulkredditdownloader/downloader.py
+++ b/bulkredditdownloader/downloader.py
@@ -2,7 +2,9 @@
 # coding=utf-8
 
 import configparser
+import hashlib
 import logging
+import os
 import re
 import socket
 from datetime import datetime
@@ -116,7 +118,7 @@ class RedditDownloader:
         return master_list
 
     def _determine_directories(self):
-        self.download_directory = Path(self.args.directory)
+        self.download_directory = Path(self.args.directory).resolve().expanduser()
         self.logfile_directory = self.download_directory / 'LOG_FILES'
         self.config_directory = self.config_directories.user_config_dir
 
@@ -313,3 +315,15 @@ class RedditDownloader:
             self.master_hash_list.append(res.hash.hexdigest())
             logger.debug(f'Hash added to master list: {res.hash.hexdigest()}')
         logger.info(f'Downloaded submission {submission.name}')
+
+    def scan_existing_files(self) -> list[str]:
+        files = []
+        for (dirpath, dirnames, filenames) in os.walk(self.download_directory):
+            files.extend([Path(dirpath, file) for file in filenames])
+        logger.info(f'Calculating hashes for {len(files)} files')
+        hash_list = []
+        for existing_file in files:
+            with open(existing_file, 'rb') as file:
+                hash_list.append(hashlib.md5(file.read()).hexdigest())
+            logger.log(9, f'Hash calculated for file at {existing_file}')
+        return hash_list
diff --git a/bulkredditdownloader/tests/test_downloader.py b/bulkredditdownloader/tests/test_downloader.py
index 2c4208f..dc7c427 100644
--- a/bulkredditdownloader/tests/test_downloader.py
+++ b/bulkredditdownloader/tests/test_downloader.py
@@ -407,3 +407,10 @@ def test_download_submission_hash_exists(
 def test_sanitise_subreddit_name(test_name: str, expected: str):
     result = RedditDownloader._sanitise_subreddit_name(test_name)
     assert result == expected
+
+
+def test_search_existing_files(downloader_mock: MagicMock):
+    downloader_mock.download_directory = Path('.').resolve().expanduser()
+    results = RedditDownloader.scan_existing_files(downloader_mock)
+    assert all([isinstance(result, str) for result in results])
+    assert len(results) >= 40
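
Note on wiring: the hunks above add the `--search-existing` flag, the `search_existing` configuration field, and the `scan_existing_files()` scanner, but the call that feeds the resulting hashes into `master_hash_list` falls outside the context shown. A minimal sketch of how that wiring might look, assuming a `download()` entry point and a `reddit_lists` attribute (both hypothetical here, not taken from this diff):

```python
# Hypothetical call site inside RedditDownloader. Only scan_existing_files,
# master_hash_list and args.search_existing come from the diff above; the
# download() method and the submission loop are assumed for illustration.
def download(self):
    if self.args.search_existing:
        # Seed the duplicate-detection list with hashes of files already on
        # disk so they are treated the same as previously downloaded resources.
        self.master_hash_list.extend(self.scan_existing_files())
    for generator in self.reddit_lists:
        for submission in generator:
            self._download_submission(submission)
```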
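
Design note: `scan_existing_files()` reads each file fully into memory before hashing, which is reasonable for typical image and video downloads but unbounded for very large files. A chunked MD5 helper, sketched below as an optional refinement rather than part of this change set, keeps memory use constant:

```python
import hashlib
from pathlib import Path


def md5_of_file(path: Path, chunk_size: int = 1024 * 1024) -> str:
    # Stream the file through the hash in fixed-size chunks so memory use
    # stays flat regardless of file size.
    md5 = hashlib.md5()
    with open(path, 'rb') as file:
        for chunk in iter(lambda: file.read(chunk_size), b''):
            md5.update(chunk)
    return md5.hexdigest()
```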
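
The new test hashes whatever files happen to sit under the working directory and asserts at least 40 results, so it depends on where pytest is invoked. A self-contained variant using pytest's `tmp_path` fixture, shown only as an alternative sketch (test name and file contents are illustrative), makes the expected count explicit:

```python
from pathlib import Path
from unittest.mock import MagicMock

from bulkredditdownloader.downloader import RedditDownloader


def test_scan_existing_files_deterministic(tmp_path: Path, downloader_mock: MagicMock):
    # Write a known number of files so the expected hash count is fixed.
    for i in range(3):
        (tmp_path / f'file_{i}.txt').write_bytes(str(i).encode())
    downloader_mock.download_directory = tmp_path
    results = RedditDownloader.scan_existing_files(downloader_mock)
    assert len(results) == 3
    assert all(isinstance(result, str) for result in results)
```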