From 99fe3312a4f343fb45b6d55cf21e54bb10f9dd33 Mon Sep 17 00:00:00 2001 From: BlipRanger <1860540+BlipRanger@users.noreply.github.com> Date: Wed, 12 May 2021 10:18:02 -0400 Subject: [PATCH 001/110] Bind socket to '0.0.0.0' rather than 'localhost' to allow for more flexible OAuth connection. --- bdfr/oauth2.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bdfr/oauth2.py b/bdfr/oauth2.py index 6b27599..bd60c9b 100644 --- a/bdfr/oauth2.py +++ b/bdfr/oauth2.py @@ -70,8 +70,8 @@ class OAuth2Authenticator: def receive_connection() -> socket.socket: server = socket.socket(socket.AF_INET, socket.SOCK_STREAM) server.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) - server.bind(('localhost', 7634)) - logger.log(9, 'Server listening on localhost:7634') + server.bind(('0.0.0.0', 7634)) + logger.log(9, 'Server listening on 0.0.0.0:7634') server.listen(1) client = server.accept()[0] From 850faffc29706efc36703363f9baba88b649e331 Mon Sep 17 00:00:00 2001 From: Thayol Date: Wed, 5 Jan 2022 01:17:59 +0100 Subject: [PATCH 002/110] Add PowerShell scripts --- .gitattributes | 2 ++ scripts/extract_failed_ids.ps1 | 21 +++++++++++++++++++++ scripts/extract_successful_ids.ps1 | 21 +++++++++++++++++++++ scripts/print_summary.ps1 | 30 ++++++++++++++++++++++++++++++ 4 files changed, 74 insertions(+) create mode 100644 .gitattributes create mode 100644 scripts/extract_failed_ids.ps1 create mode 100644 scripts/extract_successful_ids.ps1 create mode 100644 scripts/print_summary.ps1 diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..c16e947 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,2 @@ +# Declare files that will always have CRLF line endings on checkout. 
+*.ps1 text eol=crlf \ No newline at end of file diff --git a/scripts/extract_failed_ids.ps1 b/scripts/extract_failed_ids.ps1 new file mode 100644 index 0000000..17d96f6 --- /dev/null +++ b/scripts/extract_failed_ids.ps1 @@ -0,0 +1,21 @@ +if (Test-Path -Path $args[0] -PathType Leaf) { + $file=$args[0] +} +else { + Write-Host "CANNOT FIND LOG FILE" + Exit 1 +} + +if ($args[1] -ne $null) { + $output=$args[1] + Write-Host "Outputting IDs to $output" +} +else { + $output="./failed.txt" +} + +Select-String -Path $file -Pattern "Could not download submission" | ForEach-Object { -split $_.Line | Select-Object -Skip 11 | Select-Object -First 1 } | foreach { $_.substring(0,$_.Length-1) } >> $output +Select-String -Path $file -Pattern "Failed to download resource" | ForEach-Object { -split $_.Line | Select-Object -Skip 14 | Select-Object -First 1 } >> $output +Select-String -Path $file -Pattern "failed to download submission" | ForEach-Object { -split $_.Line | Select-Object -Skip 13 | Select-Object -First 1 } | foreach { $_.substring(0,$_.Length-1) } >> $output +Select-String -Path $file -Pattern "Failed to write file" | ForEach-Object { -split $_.Line | Select-Object -Skip 12 | Select-Object -First 1 } | foreach { $_.substring(0,$_.Length-1) } >> $output +Select-String -Path $file -Pattern "skipped due to disabled module" | ForEach-Object { -split $_.Line | Select-Object -Skip 8 | Select-Object -First 1 } >> $output diff --git a/scripts/extract_successful_ids.ps1 b/scripts/extract_successful_ids.ps1 new file mode 100644 index 0000000..3dbb315 --- /dev/null +++ b/scripts/extract_successful_ids.ps1 @@ -0,0 +1,21 @@ +if (Test-Path -Path $args[0] -PathType Leaf) { + $file=$args[0] +} +else { + Write-Host "CANNOT FIND LOG FILE" + Exit 1 +} + +if ($args[1] -ne $null) { + $output=$args[1] + Write-Host "Outputting IDs to $output" +} +else { + $output="./successful.txt" +} + +Select-String -Path $file -Pattern "Downloaded submission" | ForEach-Object { -split $_.Line | 
Select-Object -Last 3 | Select-Object -SkipLast 2 } >> $output +Select-String -Path $file -Pattern "Resource hash" | ForEach-Object { -split $_.Line | Select-Object -Last 3 | Select-Object -SkipLast 2 } >> $output +Select-String -Path $file -Pattern "Download filter" | ForEach-Object { -split $_.Line | Select-Object -Last 3 | Select-Object -SkipLast 2 } >> $output +Select-String -Path $file -Pattern "already exists, continuing" | ForEach-Object { -split $_.Line | Select-Object -Last 4 | Select-Object -SkipLast 3 } >> $output +Select-String -Path $file -Pattern "Hard link made" | ForEach-Object { -split $_.Line | Select-Object -Last 1 } >> $output diff --git a/scripts/print_summary.ps1 b/scripts/print_summary.ps1 new file mode 100644 index 0000000..5d85b09 --- /dev/null +++ b/scripts/print_summary.ps1 @@ -0,0 +1,30 @@ +if (Test-Path -Path $args[0] -PathType Leaf) { + $file=$args[0] +} +else { + Write-Host "CANNOT FIND LOG FILE" + Exit 1 +} + +if ($args[1] -ne $null) { + $output=$args[1] + Write-Host "Outputting IDs to $output" +} +else { + $output="./successful.txt" +} + +Write-Host -NoNewline "Downloaded submissions: " +Write-Host (Select-String -Path $file -Pattern "Downloaded submission" -AllMatches).Matches.Count +Write-Host -NoNewline "Failed downloads: " +Write-Host (Select-String -Path $file -Pattern "failed to download submission" -AllMatches).Matches.Count +Write-Host -NoNewline "Files already downloaded: " +Write-Host (Select-String -Path $file -Pattern "already exists, continuing" -AllMatches).Matches.Count +Write-Host -NoNewline "Hard linked submissions: " +Write-Host (Select-String -Path $file -Pattern "Hard link made" -AllMatches).Matches.Count +Write-Host -NoNewline "Excluded submissions: " +Write-Host (Select-String -Path $file -Pattern "in exclusion list" -AllMatches).Matches.Count +Write-Host -NoNewline "Files with existing hash skipped: " +Write-Host (Select-String -Path $file -Pattern "downloaded elsewhere" -AllMatches).Matches.Count +Write-Host 
-NoNewline "Submissions from excluded subreddits: " +Write-Host (Select-String -Path $file -Pattern "in skip list" -AllMatches).Matches.Count From ac3a8e913df84019b0d6dcd7403d5f9a4e946832 Mon Sep 17 00:00:00 2001 From: Thayol Date: Wed, 5 Jan 2022 13:13:45 +0100 Subject: [PATCH 003/110] Fix wrong offset --- scripts/extract_successful_ids.ps1 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/extract_successful_ids.ps1 b/scripts/extract_successful_ids.ps1 index 3dbb315..00722f1 100644 --- a/scripts/extract_successful_ids.ps1 +++ b/scripts/extract_successful_ids.ps1 @@ -16,6 +16,6 @@ else { Select-String -Path $file -Pattern "Downloaded submission" | ForEach-Object { -split $_.Line | Select-Object -Last 3 | Select-Object -SkipLast 2 } >> $output Select-String -Path $file -Pattern "Resource hash" | ForEach-Object { -split $_.Line | Select-Object -Last 3 | Select-Object -SkipLast 2 } >> $output -Select-String -Path $file -Pattern "Download filter" | ForEach-Object { -split $_.Line | Select-Object -Last 3 | Select-Object -SkipLast 2 } >> $output +Select-String -Path $file -Pattern "Download filter" | ForEach-Object { -split $_.Line | Select-Object -Last 4 | Select-Object -SkipLast 3 } >> $output Select-String -Path $file -Pattern "already exists, continuing" | ForEach-Object { -split $_.Line | Select-Object -Last 4 | Select-Object -SkipLast 3 } >> $output Select-String -Path $file -Pattern "Hard link made" | ForEach-Object { -split $_.Line | Select-Object -Last 1 } >> $output From 8ec45a9302dbf420dbfaed382e3d0758be3fd71c Mon Sep 17 00:00:00 2001 From: Thayol Date: Thu, 6 Jan 2022 04:06:46 +0100 Subject: [PATCH 004/110] Fix Bash script: Failed to write --- scripts/extract_failed_ids.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/extract_failed_ids.sh b/scripts/extract_failed_ids.sh index f96bd9a..8addf7e 100755 --- a/scripts/extract_failed_ids.sh +++ b/scripts/extract_failed_ids.sh @@ -18,6 +18,6 @@ fi grep 'Could not 
download submission' "$file" | awk '{ print $12 }' | rev | cut -c 2- | rev ; grep 'Failed to download resource' "$file" | awk '{ print $15 }' ; grep 'failed to download submission' "$file" | awk '{ print $14 }' | rev | cut -c 2- | rev ; - grep 'Failed to write file' "$file" | awk '{ print $13 }' | rev | cut -c 2- | rev ; + grep 'Failed to write file' "$file" | awk '{ print $14 }' ; grep 'skipped due to disabled module' "$file" | awk '{ print $9 }' ; } >>"$output" From 3811ec37fb121675a3d5c3007ab96c9c44794144 Mon Sep 17 00:00:00 2001 From: Thayol Date: Thu, 6 Jan 2022 12:16:44 +0100 Subject: [PATCH 005/110] Fix offset and remove substring --- scripts/extract_failed_ids.ps1 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/extract_failed_ids.ps1 b/scripts/extract_failed_ids.ps1 index 17d96f6..be2d2cb 100644 --- a/scripts/extract_failed_ids.ps1 +++ b/scripts/extract_failed_ids.ps1 @@ -17,5 +17,5 @@ else { Select-String -Path $file -Pattern "Could not download submission" | ForEach-Object { -split $_.Line | Select-Object -Skip 11 | Select-Object -First 1 } | foreach { $_.substring(0,$_.Length-1) } >> $output Select-String -Path $file -Pattern "Failed to download resource" | ForEach-Object { -split $_.Line | Select-Object -Skip 14 | Select-Object -First 1 } >> $output Select-String -Path $file -Pattern "failed to download submission" | ForEach-Object { -split $_.Line | Select-Object -Skip 13 | Select-Object -First 1 } | foreach { $_.substring(0,$_.Length-1) } >> $output -Select-String -Path $file -Pattern "Failed to write file" | ForEach-Object { -split $_.Line | Select-Object -Skip 12 | Select-Object -First 1 } | foreach { $_.substring(0,$_.Length-1) } >> $output +Select-String -Path $file -Pattern "Failed to write file" | ForEach-Object { -split $_.Line | Select-Object -Skip 13 | Select-Object -First 1 } >> $output Select-String -Path $file -Pattern "skipped due to disabled module" | ForEach-Object { -split $_.Line | Select-Object -Skip 8 | 
Select-Object -First 1 } >> $output From 0177b434c2817909037b3f1c392a1e943057edf6 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Fri, 18 Feb 2022 10:21:52 +1000 Subject: [PATCH 006/110] Add --subscribed option --- bdfr/__main__.py | 1 + bdfr/configuration.py | 1 + bdfr/connector.py | 20 +++++++---- bdfr/default_config.cfg | 2 +- .../test_download_integration.py | 35 ++++++++++--------- tests/test_connector.py | 18 +++++++++- 6 files changed, 52 insertions(+), 25 deletions(-) diff --git a/bdfr/__main__.py b/bdfr/__main__.py index de658de..56ffb0f 100644 --- a/bdfr/__main__.py +++ b/bdfr/__main__.py @@ -23,6 +23,7 @@ _common_options = [ click.option('--saved', is_flag=True, default=None), click.option('--search', default=None, type=str), click.option('--submitted', is_flag=True, default=None), + click.option('--subscribed', is_flag=True, default=None), click.option('--time-format', type=str, default=None), click.option('--upvoted', is_flag=True, default=None), click.option('-L', '--limit', default=None, type=int), diff --git a/bdfr/configuration.py b/bdfr/configuration.py index 81fa3e4..ef24e36 100644 --- a/bdfr/configuration.py +++ b/bdfr/configuration.py @@ -35,6 +35,7 @@ class Configuration(Namespace): self.skip_subreddit: list[str] = [] self.sort: str = 'hot' self.submitted: bool = False + self.subscribed: bool = True self.subreddit: list[str] = [] self.time: str = 'all' self.time_format = None diff --git a/bdfr/connector.py b/bdfr/connector.py index 506e23f..e04d9ef 100644 --- a/bdfr/connector.py +++ b/bdfr/connector.py @@ -243,9 +243,19 @@ class RedditConnector(metaclass=ABCMeta): return set(all_entries) def get_subreddits(self) -> list[praw.models.ListingGenerator]: - if self.args.subreddit: - out = [] - for reddit in self.split_args_input(self.args.subreddit): + out = [] + subscribed_subreddits = set() + if self.args.subscribed: + if self.args.authenticate: + try: + subscribed_subreddits = list(self.reddit_instance.user.subreddits(limit=None)) + 
subscribed_subreddits = set([s.display_name for s in subscribed_subreddits]) + except prawcore.InsufficientScope: + logger.error('BDFR has insufficient scope to access subreddit lists') + else: + logger.error('Cannot find subscribed subreddits without an authenticated instance') + if self.args.subreddit or subscribed_subreddits: + for reddit in self.split_args_input(self.args.subreddit) | subscribed_subreddits: if reddit == 'friends' and self.authenticated is False: logger.error('Cannot read friends subreddit without an authenticated instance') continue @@ -270,9 +280,7 @@ class RedditConnector(metaclass=ABCMeta): logger.debug(f'Added submissions from subreddit {reddit}') except (errors.BulkDownloaderException, praw.exceptions.PRAWException) as e: logger.error(f'Failed to get submissions for subreddit {reddit}: {e}') - return out - else: - return [] + return out def resolve_user_name(self, in_name: str) -> str: if in_name == 'me': diff --git a/bdfr/default_config.cfg b/bdfr/default_config.cfg index b8039a9..c601152 100644 --- a/bdfr/default_config.cfg +++ b/bdfr/default_config.cfg @@ -1,7 +1,7 @@ [DEFAULT] client_id = U-6gk4ZCh3IeNQ client_secret = 7CZHY6AmKweZME5s50SfDGylaPg -scopes = identity, history, read, save +scopes = identity, history, read, save, mysubreddits backup_log_count = 3 max_wait_time = 120 time_format = ISO \ No newline at end of file diff --git a/tests/integration_tests/test_download_integration.py b/tests/integration_tests/test_download_integration.py index bd53382..75216dd 100644 --- a/tests/integration_tests/test_download_integration.py +++ b/tests/integration_tests/test_download_integration.py @@ -31,23 +31,23 @@ def create_basic_args_for_download_runner(test_args: list[str], run_path: Path): @pytest.mark.reddit @pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests') @pytest.mark.parametrize('test_args', ( - ['-s', 'Mindustry', '-L', 1], - ['-s', 'r/Mindustry', '-L', 1], - ['-s', 
'r/mindustry', '-L', 1], - ['-s', 'mindustry', '-L', 1], - ['-s', 'https://www.reddit.com/r/TrollXChromosomes/', '-L', 1], - ['-s', 'r/TrollXChromosomes/', '-L', 1], - ['-s', 'TrollXChromosomes/', '-L', 1], - ['-s', 'trollxchromosomes', '-L', 1], - ['-s', 'trollxchromosomes,mindustry,python', '-L', 1], - ['-s', 'trollxchromosomes, mindustry, python', '-L', 1], - ['-s', 'trollxchromosomes', '-L', 1, '--time', 'day'], - ['-s', 'trollxchromosomes', '-L', 1, '--sort', 'new'], - ['-s', 'trollxchromosomes', '-L', 1, '--time', 'day', '--sort', 'new'], - ['-s', 'trollxchromosomes', '-L', 1, '--search', 'women'], - ['-s', 'trollxchromosomes', '-L', 1, '--time', 'day', '--search', 'women'], - ['-s', 'trollxchromosomes', '-L', 1, '--sort', 'new', '--search', 'women'], - ['-s', 'trollxchromosomes', '-L', 1, '--time', 'day', '--sort', 'new', '--search', 'women'], + ['-s', 'Mindustry', '-L', 3], + ['-s', 'r/Mindustry', '-L', 3], + ['-s', 'r/mindustry', '-L', 3], + ['-s', 'mindustry', '-L', 3], + ['-s', 'https://www.reddit.com/r/TrollXChromosomes/', '-L', 3], + ['-s', 'r/TrollXChromosomes/', '-L', 3], + ['-s', 'TrollXChromosomes/', '-L', 3], + ['-s', 'trollxchromosomes', '-L', 3], + ['-s', 'trollxchromosomes,mindustry,python', '-L', 3], + ['-s', 'trollxchromosomes, mindustry, python', '-L', 3], + ['-s', 'trollxchromosomes', '-L', 3, '--time', 'day'], + ['-s', 'trollxchromosomes', '-L', 3, '--sort', 'new'], + ['-s', 'trollxchromosomes', '-L', 3, '--time', 'day', '--sort', 'new'], + ['-s', 'trollxchromosomes', '-L', 3, '--search', 'women'], + ['-s', 'trollxchromosomes', '-L', 3, '--time', 'day', '--search', 'women'], + ['-s', 'trollxchromosomes', '-L', 3, '--sort', 'new', '--search', 'women'], + ['-s', 'trollxchromosomes', '-L', 3, '--time', 'day', '--sort', 'new', '--search', 'women'], )) def test_cli_download_subreddits(test_args: list[str], tmp_path: Path): runner = CliRunner() @@ -64,6 +64,7 @@ def test_cli_download_subreddits(test_args: list[str], tmp_path: Path): 
@pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests') @pytest.mark.parametrize('test_args', ( ['-s', 'hentai', '-L', 10, '--search', 'red', '--authenticate'], + ['--authenticate', '--subscribed', '-L', 10], )) def test_cli_download_search_subreddits_authenticated(test_args: list[str], tmp_path: Path): runner = CliRunner() diff --git a/tests/test_connector.py b/tests/test_connector.py index 9fe58f2..3a10757 100644 --- a/tests/test_connector.py +++ b/tests/test_connector.py @@ -336,13 +336,29 @@ def test_get_user_authenticated_lists( downloader_mock.args.__dict__[test_flag] = True downloader_mock.reddit_instance = authenticated_reddit_instance downloader_mock.args.limit = 10 - downloader_mock._determine_sort_function.return_value = praw.models.Subreddit.hot + downloader_mock.determine_sort_function.return_value = praw.models.Subreddit.hot downloader_mock.sort_filter = RedditTypes.SortType.HOT downloader_mock.args.user = [RedditConnector.resolve_user_name(downloader_mock, 'me')] results = RedditConnector.get_user_data(downloader_mock) assert_all_results_are_submissions_or_comments(10, results) +@pytest.mark.online +@pytest.mark.reddit +@pytest.mark.authenticated +def test_get_subscribed_subreddits(downloader_mock: MagicMock, authenticated_reddit_instance: praw.Reddit): + downloader_mock.reddit_instance = authenticated_reddit_instance + downloader_mock.args.limit = 10 + downloader_mock.args.authenticate = True + downloader_mock.args.subscribed = True + downloader_mock.determine_sort_function.return_value = praw.models.Subreddit.hot + downloader_mock.determine_sort_function.return_value = praw.models.Subreddit.hot + downloader_mock.sort_filter = RedditTypes.SortType.HOT + results = RedditConnector.get_subreddits(downloader_mock) + assert all([isinstance(s, praw.models.ListingGenerator) for s in results]) + assert len(results) > 0 + + @pytest.mark.parametrize(('test_name', 'expected'), ( ('Mindustry', 'Mindustry'), 
('Futurology', 'Futurology'), From 9deef63fdda3bb2623fd035c93aad6ec8c8ef8c2 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Fri, 18 Feb 2022 12:04:37 +1000 Subject: [PATCH 007/110] Add support for Redgifs images and galleries --- bdfr/site_downloaders/redgifs.py | 31 +++++++++++++---- tests/site_downloaders/test_redgifs.py | 48 ++++++++++++++++++-------- 2 files changed, 57 insertions(+), 22 deletions(-) diff --git a/bdfr/site_downloaders/redgifs.py b/bdfr/site_downloaders/redgifs.py index a62fedb..12fb24d 100644 --- a/bdfr/site_downloaders/redgifs.py +++ b/bdfr/site_downloaders/redgifs.py @@ -17,11 +17,11 @@ class Redgifs(BaseDownloader): super().__init__(post) def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]: - media_url = self._get_link(self.post.url) - return [Resource(self.post, media_url, Resource.retry_download(media_url), '.mp4')] + media_urls = self._get_link(self.post.url) + return [Resource(self.post, m, Resource.retry_download(m), '.mp4') for m in media_urls] @staticmethod - def _get_link(url: str) -> str: + def _get_link(url: str) -> set[str]: try: redgif_id = re.match(r'.*/(.*?)/?$', url).group(1) except AttributeError: @@ -32,16 +32,33 @@ class Redgifs(BaseDownloader): 'Chrome/90.0.4430.93 Safari/537.36', } - content = Redgifs.retrieve_url(f'https://api.redgifs.com/v1/gfycats/{redgif_id}', headers=headers) + content = Redgifs.retrieve_url(f'https://api.redgifs.com/v2/gifs/{redgif_id}', headers=headers) if content is None: raise SiteDownloaderError('Could not read the page source') try: - out = json.loads(content.text)['gfyItem']['mp4Url'] - except (KeyError, AttributeError): - raise SiteDownloaderError('Failed to find JSON data in page') + response_json = json.loads(content.text) except json.JSONDecodeError as e: raise SiteDownloaderError(f'Received data was not valid JSON: {e}') + out = set() + try: + if response_json['gif']['type'] == 1: # type 1 is a video + out.add(response_json['gif']['urls']['hd']) + 
elif response_json['gif']['type'] == 2: # type 2 is an image + if response_json['gif']['gallery']: + content = Redgifs.retrieve_url( + f'https://api.redgifs.com/v2/gallery/{response_json["gif"]["gallery"]}', + headers=headers, + ) + response_json = json.loads(content.text) + out = {p['urls']['hd'] for p in response_json['gifs']} + else: + out.add(response_json['gif']['urls']['hd']) + else: + raise KeyError + except (KeyError, AttributeError): + raise SiteDownloaderError('Failed to find JSON data in page') + return out diff --git a/tests/site_downloaders/test_redgifs.py b/tests/site_downloaders/test_redgifs.py index 571f044..a1f571e 100644 --- a/tests/site_downloaders/test_redgifs.py +++ b/tests/site_downloaders/test_redgifs.py @@ -12,30 +12,48 @@ from bdfr.site_downloaders.redgifs import Redgifs @pytest.mark.online @pytest.mark.parametrize(('test_url', 'expected'), ( ('https://redgifs.com/watch/frighteningvictorioussalamander', - 'https://thumbs2.redgifs.com/FrighteningVictoriousSalamander.mp4'), + {'https://thumbs2.redgifs.com/FrighteningVictoriousSalamander.mp4'}), ('https://redgifs.com/watch/springgreendecisivetaruca', - 'https://thumbs2.redgifs.com/SpringgreenDecisiveTaruca.mp4'), + {'https://thumbs2.redgifs.com/SpringgreenDecisiveTaruca.mp4'}), ('https://www.redgifs.com/watch/palegoldenrodrawhalibut', - 'https://thumbs2.redgifs.com/PalegoldenrodRawHalibut.mp4'), + {'https://thumbs2.redgifs.com/PalegoldenrodRawHalibut.mp4'}), + ('https://redgifs.com/watch/hollowintentsnowyowl', + {'https://thumbs2.redgifs.com/HollowIntentSnowyowl-large.jpg'}), + ('https://www.redgifs.com/watch/lustrousstickywaxwing', + {'https://thumbs2.redgifs.com/EntireEnchantingHypsilophodon-large.jpg', + 'https://thumbs2.redgifs.com/FancyMagnificentAdamsstaghornedbeetle-large.jpg', + 'https://thumbs2.redgifs.com/LustrousStickyWaxwing-large.jpg', + 'https://thumbs2.redgifs.com/ParchedWindyArmyworm-large.jpg', + 'https://thumbs2.redgifs.com/ThunderousColorlessErmine-large.jpg', + 
'https://thumbs2.redgifs.com/UnripeUnkemptWoodpecker-large.jpg'}), )) -def test_get_link(test_url: str, expected: str): +def test_get_link(test_url: str, expected: set[str]): result = Redgifs._get_link(test_url) assert result == expected @pytest.mark.online -@pytest.mark.parametrize(('test_url', 'expected_hash'), ( - ('https://redgifs.com/watch/frighteningvictorioussalamander', '4007c35d9e1f4b67091b5f12cffda00a'), - ('https://redgifs.com/watch/springgreendecisivetaruca', '8dac487ac49a1f18cc1b4dabe23f0869'), - ('https://redgifs.com/watch/leafysaltydungbeetle', '076792c660b9c024c0471ef4759af8bd'), - ('https://www.redgifs.com/watch/palegoldenrodrawhalibut', '46d5aa77fe80c6407de1ecc92801c10e'), +@pytest.mark.parametrize(('test_url', 'expected_hashes'), ( + ('https://redgifs.com/watch/frighteningvictorioussalamander', {'4007c35d9e1f4b67091b5f12cffda00a'}), + ('https://redgifs.com/watch/springgreendecisivetaruca', {'8dac487ac49a1f18cc1b4dabe23f0869'}), + ('https://redgifs.com/watch/leafysaltydungbeetle', {'076792c660b9c024c0471ef4759af8bd'}), + ('https://www.redgifs.com/watch/palegoldenrodrawhalibut', {'46d5aa77fe80c6407de1ecc92801c10e'}), + ('https://redgifs.com/watch/hollowintentsnowyowl', + {'5ee51fa15e0a58e98f11dea6a6cca771'}), + ('https://www.redgifs.com/watch/lustrousstickywaxwing', + {'b461e55664f07bed8d2f41d8586728fa', + '30ba079a8ed7d7adf17929dc3064c10f', + '0d4f149d170d29fc2f015c1121bab18b', + '53987d99cfd77fd65b5fdade3718f9f1', + 'fb2e7d972846b83bf4016447d3060d60', + '44fb28f72ec9a5cca63fa4369ab4f672'}), )) -def test_download_resource(test_url: str, expected_hash: str): +def test_download_resource(test_url: str, expected_hashes: set[str]): mock_submission = Mock() mock_submission.url = test_url test_site = Redgifs(mock_submission) - resources = test_site.find_resources() - assert len(resources) == 1 - assert isinstance(resources[0], Resource) - resources[0].download() - assert resources[0].hash.hexdigest() == expected_hash + results = 
test_site.find_resources() + assert all([isinstance(res, Resource) for res in results]) + [res.download() for res in results] + hashes = set([res.hash.hexdigest() for res in results]) + assert hashes == set(expected_hashes) From 6e0c64265254754d25302e06416e03c951c848e7 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Fri, 18 Feb 2022 12:30:38 +1000 Subject: [PATCH 008/110] Add file scheme naming for archiver --- bdfr/__main__.py | 4 ++-- .../integration_tests/test_archive_integration.py | 15 +++++++++++++++ 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/bdfr/__main__.py b/bdfr/__main__.py index 56ffb0f..451a7f0 100644 --- a/bdfr/__main__.py +++ b/bdfr/__main__.py @@ -17,6 +17,8 @@ _common_options = [ click.option('--authenticate', is_flag=True, default=None), click.option('--config', type=str, default=None), click.option('--disable-module', multiple=True, default=None, type=str), + click.option('--file-scheme', default=None, type=str), + click.option('--folder-scheme', default=None, type=str), click.option('--ignore-user', type=str, multiple=True, default=None), click.option('--include-id-file', multiple=True, default=None), click.option('--log', type=str, default=None), @@ -38,8 +40,6 @@ _common_options = [ ] _downloader_options = [ - click.option('--file-scheme', default=None, type=str), - click.option('--folder-scheme', default=None, type=str), click.option('--make-hard-links', is_flag=True, default=None), click.option('--max-wait-time', type=int, default=None), click.option('--no-dupes', is_flag=True, default=None), diff --git a/tests/integration_tests/test_archive_integration.py b/tests/integration_tests/test_archive_integration.py index 5ef04a6..2234c5a 100644 --- a/tests/integration_tests/test_archive_integration.py +++ b/tests/integration_tests/test_archive_integration.py @@ -121,3 +121,18 @@ def test_cli_archive_ignore_user(test_args: list[str], tmp_path: Path): assert result.exit_code == 0 assert 'being an ignored user' in result.output 
assert 'Attempting to archive submission' not in result.output + + +@pytest.mark.online +@pytest.mark.reddit +@pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests') +@pytest.mark.parametrize('test_args', ( + ['--file-scheme', '{TITLE}', '-l', 'suy011'], +)) +def test_cli_archive_file_format(test_args: list[str], tmp_path: Path): + runner = CliRunner() + test_args = create_basic_args_for_archive_runner(test_args, tmp_path) + result = runner.invoke(cli, test_args) + assert result.exit_code == 0 + assert 'Attempting to archive submission' in result.output + assert re.search('format at /.+?/Judge says Trump and two adult', result.output) From 71f84420cbe3abc9650ab07c68065985f033a6d5 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Fri, 18 Feb 2022 12:42:42 +1000 Subject: [PATCH 009/110] Increase version number --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index 198ebe7..67a1deb 100644 --- a/setup.cfg +++ b/setup.cfg @@ -4,7 +4,7 @@ description_file = README.md description_content_type = text/markdown home_page = https://github.com/aliparlakci/bulk-downloader-for-reddit keywords = reddit, download, archive -version = 2.5.2 +version = 2.6.0 author = Ali Parlakci author_email = parlakciali@gmail.com maintainer = Serene Arc From 764531951076283d2d7c37e4a1ad59b43df3516f Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Fri, 18 Feb 2022 12:49:46 +1000 Subject: [PATCH 010/110] Fix gfycat after redgifs changes --- bdfr/site_downloaders/gfycat.py | 4 ++-- tests/site_downloaders/test_gfycat.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/bdfr/site_downloaders/gfycat.py b/bdfr/site_downloaders/gfycat.py index 6accaab..c8da9df 100644 --- a/bdfr/site_downloaders/gfycat.py +++ b/bdfr/site_downloaders/gfycat.py @@ -21,7 +21,7 @@ class Gfycat(Redgifs): return super().find_resources(authenticator) @staticmethod - def _get_link(url: str) -> str: + def 
_get_link(url: str) -> set[str]: gfycat_id = re.match(r'.*/(.*?)/?$', url).group(1) url = 'https://gfycat.com/' + gfycat_id @@ -39,4 +39,4 @@ class Gfycat(Redgifs): raise SiteDownloaderError(f'Failed to download Gfycat link {url}: {e}') except json.JSONDecodeError as e: raise SiteDownloaderError(f'Did not receive valid JSON data: {e}') - return out + return {out,} diff --git a/tests/site_downloaders/test_gfycat.py b/tests/site_downloaders/test_gfycat.py index 981d01d..3b40840 100644 --- a/tests/site_downloaders/test_gfycat.py +++ b/tests/site_downloaders/test_gfycat.py @@ -16,7 +16,7 @@ from bdfr.site_downloaders.gfycat import Gfycat )) def test_get_link(test_url: str, expected_url: str): result = Gfycat._get_link(test_url) - assert result == expected_url + assert result.pop() == expected_url @pytest.mark.online From 160ee372b9145752191b9c8e234b7f80aba593a9 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Fri, 18 Feb 2022 12:51:51 +1000 Subject: [PATCH 011/110] Update test hashes --- .../fallback_downloaders/test_ytdlp_fallback.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/site_downloaders/fallback_downloaders/test_ytdlp_fallback.py b/tests/site_downloaders/fallback_downloaders/test_ytdlp_fallback.py index 2c4a4f6..503f14e 100644 --- a/tests/site_downloaders/fallback_downloaders/test_ytdlp_fallback.py +++ b/tests/site_downloaders/fallback_downloaders/test_ytdlp_fallback.py @@ -35,8 +35,8 @@ def test_info_extraction_bad(test_url: str): @pytest.mark.parametrize(('test_url', 'expected_hash'), ( ('https://streamable.com/dt46y', 'b7e465adaade5f2b6d8c2b4b7d0a2878'), ('https://streamable.com/t8sem', '49b2d1220c485455548f1edbc05d4ecf'), - ('https://www.reddit.com/r/specializedtools/comments/n2nw5m/bamboo_splitter/', '21968d3d92161ea5e0abdcaf6311b06c'), - ('https://v.redd.it/9z1dnk3xr5k61', '351a2b57e888df5ccbc508056511f38d'), + ('https://www.reddit.com/r/specializedtools/comments/n2nw5m/bamboo_splitter/', '0a406f3d2e09b3d3ba43bf97185b83e3'), 
+ ('https://v.redd.it/9z1dnk3xr5k61', 'c7765c33972549465c87dcbd59eb3d5d'), )) def test_find_resources(test_url: str, expected_hash: str): test_submission = MagicMock() From 06988c40b33e572b0c9944e580e47feedeb4439f Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sun, 20 Feb 2022 15:48:02 +1000 Subject: [PATCH 012/110] Switch redgifs to dynamic file extensions --- bdfr/site_downloaders/redgifs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bdfr/site_downloaders/redgifs.py b/bdfr/site_downloaders/redgifs.py index 12fb24d..f7ea56a 100644 --- a/bdfr/site_downloaders/redgifs.py +++ b/bdfr/site_downloaders/redgifs.py @@ -18,7 +18,7 @@ class Redgifs(BaseDownloader): def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]: media_urls = self._get_link(self.post.url) - return [Resource(self.post, m, Resource.retry_download(m), '.mp4') for m in media_urls] + return [Resource(self.post, m, Resource.retry_download(m), None) for m in media_urls] @staticmethod def _get_link(url: str) -> set[str]: From 81b7fe853b1ac2761f9ddddfe8072b14e69a72ba Mon Sep 17 00:00:00 2001 From: sinclairkosh <102016413+sinclairkosh@users.noreply.github.com> Date: Tue, 22 Mar 2022 05:53:43 +1100 Subject: [PATCH 013/110] Update Readme with some command clarifications Clarify that fact that downloading by user doesn't work the same way as downloading by subreddit. Feel free to user a better example username. :) --- README.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/README.md b/README.md index b84aa3d..47c0dde 100644 --- a/README.md +++ b/README.md @@ -53,6 +53,12 @@ However, these commands are not enough. 
You should chain parameters in [Options] python3 -m bdfr download ./path/to/output --subreddit Python -L 10 ``` ```bash +python3 -m bdfr download ./path/to/output --user reddituser --submitted -L 100 +``` +```bash +python3 -m bdfr download ./path/to/output --user reddituser --submitted --all-comments --comment-context +``` +```bash python3 -m bdfr download ./path/to/output --user me --saved --authenticate -L 25 --file-scheme '{POSTID}' ``` ```bash From 806bd76f877bf26edca66e44204bc721cd124dd3 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Fri, 25 Mar 2022 10:50:52 +1000 Subject: [PATCH 014/110] Strip any newline characters from names --- bdfr/file_name_formatter.py | 3 +++ tests/test_file_name_formatter.py | 4 +++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/bdfr/file_name_formatter.py b/bdfr/file_name_formatter.py index 3e8832b..1dabd34 100644 --- a/bdfr/file_name_formatter.py +++ b/bdfr/file_name_formatter.py @@ -111,6 +111,9 @@ class FileNameFormatter: if not resource.extension: raise BulkDownloaderException(f'Resource from {resource.url} has no extension') file_name = str(self._format_name(resource.source_submission, self.file_format_string)) + + file_name = re.sub(r'\n', ' ', file_name) + if not re.match(r'.*\.$', file_name) and not re.match(r'^\..*', resource.extension): ending = index + '.' 
+ resource.extension else: diff --git a/tests/test_file_name_formatter.py b/tests/test_file_name_formatter.py index 30fac77..bd9d058 100644 --- a/tests/test_file_name_formatter.py +++ b/tests/test_file_name_formatter.py @@ -16,6 +16,7 @@ from bdfr.file_name_formatter import FileNameFormatter from bdfr.resource import Resource from bdfr.site_downloaders.base_downloader import BaseDownloader from bdfr.site_downloaders.fallback_downloaders.ytdlp_fallback import YtdlpFallback +from bdfr.site_downloaders.self_post import SelfPost @pytest.fixture() @@ -406,6 +407,7 @@ def test_windows_max_path(tmp_path: Path): @pytest.mark.parametrize(('test_reddit_id', 'test_downloader', 'expected_names'), ( ('gphmnr', YtdlpFallback, {'He has a lot to say today.mp4'}), ('d0oir2', YtdlpFallback, {"Crunk's finest moment. Welcome to the new subreddit!.mp4"}), + ('jiecu', SelfPost, {'Reston, VA Some info regarding shelters in the area..txt'}) )) def test_name_submission( test_reddit_id: str, @@ -418,4 +420,4 @@ def test_name_submission( test_formatter = FileNameFormatter('{TITLE}', '', '') results = test_formatter.format_resource_paths(test_resources, Path('.')) results = set([r[0].name for r in results]) - assert expected_names == results + assert results == expected_names From 5a3ff887c41b90978ea840e8d1ddae7a18013e7b Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Fri, 25 Mar 2022 10:52:49 +1000 Subject: [PATCH 015/110] Add second test case --- tests/test_file_name_formatter.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/test_file_name_formatter.py b/tests/test_file_name_formatter.py index bd9d058..c9e049e 100644 --- a/tests/test_file_name_formatter.py +++ b/tests/test_file_name_formatter.py @@ -407,7 +407,8 @@ def test_windows_max_path(tmp_path: Path): @pytest.mark.parametrize(('test_reddit_id', 'test_downloader', 'expected_names'), ( ('gphmnr', YtdlpFallback, {'He has a lot to say today.mp4'}), ('d0oir2', YtdlpFallback, {"Crunk's finest moment. 
Welcome to the new subreddit!.mp4"}), - ('jiecu', SelfPost, {'Reston, VA Some info regarding shelters in the area..txt'}) + ('jiecu', SelfPost, {'Reston, VA Some info regarding shelters in the area..txt'}), + ('gui1i', SelfPost, {'The "Beer and Ear offer for those who need help in the \'burbs of North Dallas....txt'}), )) def test_name_submission( test_reddit_id: str, From b921d03705b3f47a995ddbff47f08d60e34de362 Mon Sep 17 00:00:00 2001 From: Jacob Chapman <7908073+chapmanjacobd@users.noreply.github.com> Date: Mon, 18 Apr 2022 00:30:17 -0500 Subject: [PATCH 016/110] Use stdout --- scripts/extract_failed_ids.sh | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/scripts/extract_failed_ids.sh b/scripts/extract_failed_ids.sh index 8addf7e..64d1e72 100755 --- a/scripts/extract_failed_ids.sh +++ b/scripts/extract_failed_ids.sh @@ -7,17 +7,10 @@ else exit 1 fi -if [ -n "$2" ]; then - output="$2" - echo "Outputting IDs to $output" -else - output="./failed.txt" -fi - { grep 'Could not download submission' "$file" | awk '{ print $12 }' | rev | cut -c 2- | rev ; grep 'Failed to download resource' "$file" | awk '{ print $15 }' ; grep 'failed to download submission' "$file" | awk '{ print $14 }' | rev | cut -c 2- | rev ; grep 'Failed to write file' "$file" | awk '{ print $14 }' ; grep 'skipped due to disabled module' "$file" | awk '{ print $9 }' ; -} >>"$output" +} From 68e367453b4d73413cd2f4f4b5b23c5376682768 Mon Sep 17 00:00:00 2001 From: Jacob Chapman <7908073+chapmanjacobd@users.noreply.github.com> Date: Mon, 18 Apr 2022 16:03:24 -0500 Subject: [PATCH 017/110] readme: make --search info a bit more clear --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index b84aa3d..d95b4fd 100644 --- a/README.md +++ b/README.md @@ -92,8 +92,8 @@ The following options are common between both the `archive` and `download` comma - This option will make the BDFR use the supplied user's saved posts list as a 
download source - This requires an authenticated Reddit instance, using the `--authenticate` flag, as well as `--user` set to `me` - `--search` - - This will apply the specified search term to specific lists when scraping submissions - - A search term can only be applied to subreddits and multireddits, supplied with the `- s` and `-m` flags respectively + - This will apply the input search term to specific lists when scraping submissions + - A search term can only be applied when using the `--subreddit` and `--multireddit` flags - `--submitted` - This will use a user's submissions as a source - A user must be specified with `--user` From 4e050c50d6503370af61c1e67886a87f9d3635ce Mon Sep 17 00:00:00 2001 From: Jacob Chapman <7908073+chapmanjacobd@users.noreply.github.com> Date: Mon, 18 Apr 2022 20:42:50 -0500 Subject: [PATCH 018/110] okay --- scripts/tests/test_extract_failed_ids.bats | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/scripts/tests/test_extract_failed_ids.bats b/scripts/tests/test_extract_failed_ids.bats index 75b9bff..a716cba 100644 --- a/scripts/tests/test_extract_failed_ids.bats +++ b/scripts/tests/test_extract_failed_ids.bats @@ -13,31 +13,31 @@ teardown() { } @test "fail no downloader module" { - run ../extract_failed_ids.sh ./example_logfiles/failed_no_downloader.txt + run ../extract_failed_ids.sh ./example_logfiles/failed_no_downloader.txt >> failed.txt assert [ "$( wc -l 'failed.txt' | awk '{ print $1 }' )" -eq "3" ]; assert [ "$( grep -Ecv '\w{6,7}' 'failed.txt' )" -eq "0" ]; } @test "fail resource error" { - run ../extract_failed_ids.sh ./example_logfiles/failed_resource_error.txt + run ../extract_failed_ids.sh ./example_logfiles/failed_resource_error.txt >> failed.txt assert [ "$( wc -l 'failed.txt' | awk '{ print $1 }' )" -eq "1" ]; assert [ "$( grep -Ecv '\w{6,7}' 'failed.txt' )" -eq "0" ]; } @test "fail site downloader error" { - run ../extract_failed_ids.sh ./example_logfiles/failed_sitedownloader_error.txt + 
run ../extract_failed_ids.sh ./example_logfiles/failed_sitedownloader_error.txt >> failed.txt assert [ "$( wc -l 'failed.txt' | awk '{ print $1 }' )" -eq "2" ]; assert [ "$( grep -Ecv '\w{6,7}' 'failed.txt' )" -eq "0" ]; } @test "fail failed file write" { - run ../extract_failed_ids.sh ./example_logfiles/failed_write_error.txt + run ../extract_failed_ids.sh ./example_logfiles/failed_write_error.txt >> failed.txt assert [ "$( wc -l 'failed.txt' | awk '{ print $1 }' )" -eq "1" ]; assert [ "$( grep -Ecv '\w{6,7}' 'failed.txt' )" -eq "0" ]; } @test "fail disabled module" { - run ../extract_failed_ids.sh ./example_logfiles/failed_disabled_module.txt + run ../extract_failed_ids.sh ./example_logfiles/failed_disabled_module.txt >> failed.txt assert [ "$( wc -l 'failed.txt' | awk '{ print $1 }' )" -eq "1" ]; assert [ "$( grep -Ecv '\w{6,7}' 'failed.txt' )" -eq "0" ]; } From 484bde9b136f18a02d9629b655bb3d9405848719 Mon Sep 17 00:00:00 2001 From: Jacob Chapman <7908073+chapmanjacobd@users.noreply.github.com> Date: Mon, 18 Apr 2022 20:47:35 -0500 Subject: [PATCH 019/110] oh there's another one --- scripts/extract_successful_ids.sh | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/scripts/extract_successful_ids.sh b/scripts/extract_successful_ids.sh index 011ba6c..e8f482e 100755 --- a/scripts/extract_successful_ids.sh +++ b/scripts/extract_successful_ids.sh @@ -7,17 +7,10 @@ else exit 1 fi -if [ -n "$2" ]; then - output="$2" - echo "Outputting IDs to $output" -else - output="./successful.txt" -fi - { grep 'Downloaded submission' "$file" | awk '{ print $(NF-2) }' ; grep 'Resource hash' "$file" | awk '{ print $(NF-2) }' ; grep 'Download filter' "$file" | awk '{ print $(NF-3) }' ; grep 'already exists, continuing' "$file" | awk '{ print $(NF-3) }' ; grep 'Hard link made' "$file" | awk '{ print $(NF) }' ; -} >> "$output" +} From 5775c0ab9f19a05cebaf0cf687a7eca1eeefd87c Mon Sep 17 00:00:00 2001 From: Jacob Chapman 
<7908073+chapmanjacobd@users.noreply.github.com> Date: Mon, 18 Apr 2022 20:48:02 -0500 Subject: [PATCH 020/110] and this one too --- scripts/tests/test_extract_successful_ids.bats | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/scripts/tests/test_extract_successful_ids.bats b/scripts/tests/test_extract_successful_ids.bats index 364bedb..caa8dd1 100644 --- a/scripts/tests/test_extract_successful_ids.bats +++ b/scripts/tests/test_extract_successful_ids.bats @@ -8,31 +8,31 @@ teardown() { } @test "success downloaded submission" { - run ../extract_successful_ids.sh ./example_logfiles/succeed_downloaded_submission.txt + run ../extract_successful_ids.sh ./example_logfiles/succeed_downloaded_submission.txt >> ./successful.txt assert [ "$( wc -l 'successful.txt' | awk '{ print $1 }' )" -eq "7" ]; assert [ "$( grep -Ecv '\w{6,7}' 'successful.txt' )" -eq "0" ]; } @test "success resource hash" { - run ../extract_successful_ids.sh ./example_logfiles/succeed_resource_hash.txt + run ../extract_successful_ids.sh ./example_logfiles/succeed_resource_hash.txt >> ./successful.txt assert [ "$( wc -l 'successful.txt' | awk '{ print $1 }' )" -eq "1" ]; assert [ "$( grep -Ecv '\w{6,7}' 'successful.txt' )" -eq "0" ]; } @test "success download filter" { - run ../extract_successful_ids.sh ./example_logfiles/succeed_download_filter.txt + run ../extract_successful_ids.sh ./example_logfiles/succeed_download_filter.txt >> ./successful.txt assert [ "$( wc -l 'successful.txt' | awk '{ print $1 }' )" -eq "3" ]; assert [ "$( grep -Ecv '\w{6,7}' 'successful.txt' )" -eq "0" ]; } @test "success already exists" { - run ../extract_successful_ids.sh ./example_logfiles/succeed_already_exists.txt + run ../extract_successful_ids.sh ./example_logfiles/succeed_already_exists.txt >> ./successful.txt assert [ "$( wc -l 'successful.txt' | awk '{ print $1 }' )" -eq "3" ]; assert [ "$( grep -Ecv '\w{6,7}' 'successful.txt' )" -eq "0" ]; } @test "success hard link" { - run 
../extract_successful_ids.sh ./example_logfiles/succeed_hard_link.txt + run ../extract_successful_ids.sh ./example_logfiles/succeed_hard_link.txt >> ./successful.txt assert [ "$( wc -l 'successful.txt' | awk '{ print $1 }' )" -eq "1" ]; assert [ "$( grep -Ecv '\w{6,7}' 'successful.txt' )" -eq "0" ]; } From dbd0c6cd424650d76973c283a82793aa4a72d7f6 Mon Sep 17 00:00:00 2001 From: BlipRanger <1860540+BlipRanger@users.noreply.github.com> Date: Mon, 25 Apr 2022 12:09:09 -0400 Subject: [PATCH 021/110] Add support for v.reddit links. --- bdfr/site_downloaders/download_factory.py | 3 + bdfr/site_downloaders/vreddit.py | 77 +++++++++++++++++++++++ 2 files changed, 80 insertions(+) create mode 100644 bdfr/site_downloaders/vreddit.py diff --git a/bdfr/site_downloaders/download_factory.py b/bdfr/site_downloaders/download_factory.py index 49dba5f..2c7ef21 100644 --- a/bdfr/site_downloaders/download_factory.py +++ b/bdfr/site_downloaders/download_factory.py @@ -18,6 +18,7 @@ from bdfr.site_downloaders.redgifs import Redgifs from bdfr.site_downloaders.self_post import SelfPost from bdfr.site_downloaders.vidble import Vidble from bdfr.site_downloaders.youtube import Youtube +from bdfr.site_downloaders.vreddit import vreddit class DownloadFactory: @@ -47,6 +48,8 @@ class DownloadFactory: return Youtube elif re.match(r'i\.redd\.it.*', sanitised_url): return Direct + elif re.match(r'v\.redd\.it.*', sanitised_url): + return vreddit elif re.match(r'pornhub\.com.*', sanitised_url): return PornHub elif re.match(r'vidble\.com', sanitised_url): diff --git a/bdfr/site_downloaders/vreddit.py b/bdfr/site_downloaders/vreddit.py new file mode 100644 index 0000000..945ee93 --- /dev/null +++ b/bdfr/site_downloaders/vreddit.py @@ -0,0 +1,77 @@ +#!/usr/bin/env python3 + +import logging +import tempfile +from pathlib import Path +from typing import Callable, Optional + +import yt_dlp +from praw.models import Submission + +from bdfr.exceptions import NotADownloadableLinkError, SiteDownloaderError 
+from bdfr.resource import Resource +from bdfr.site_authenticator import SiteAuthenticator +from bdfr.site_downloaders.base_downloader import BaseDownloader + +logger = logging.getLogger(__name__) + + +class vreddit(BaseDownloader): + def __init__(self, post: Submission): + super().__init__(post) + + def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]: + ytdl_options = { + 'playlistend': 1, + 'nooverwrites': True, + } + download_function = self._download_video(ytdl_options) + extension = self.get_video_attributes(self.post.url)['ext'] + res = Resource(self.post, self.post.url, download_function, extension) + return [res] + + def _download_video(self, ytdl_options: dict) -> Callable: + yt_logger = logging.getLogger('youtube-dl') + yt_logger.setLevel(logging.CRITICAL) + ytdl_options['quiet'] = True + ytdl_options['logger'] = yt_logger + + def download(_: dict) -> bytes: + with tempfile.TemporaryDirectory() as temp_dir: + download_path = Path(temp_dir).resolve() + ytdl_options['outtmpl'] = str(download_path) + '/' + 'test.%(ext)s' + try: + with yt_dlp.YoutubeDL(ytdl_options) as ydl: + ydl.download([self.post.url]) + except yt_dlp.DownloadError as e: + raise SiteDownloaderError(f'Vreddit download failed: {e}') + + downloaded_files = list(download_path.iterdir()) + if len(downloaded_files) > 0: + downloaded_file = downloaded_files[0] + else: + raise NotADownloadableLinkError(f"No media exists in the URL {self.post.url}") + with open(downloaded_file, 'rb') as file: + content = file.read() + return content + return download + + @staticmethod + def get_video_attributes(url: str) -> dict: + yt_logger = logging.getLogger('youtube-dl') + yt_logger.setLevel(logging.CRITICAL) + with yt_dlp.YoutubeDL({'logger': yt_logger, }) as ydl: + try: + result = ydl.extract_info(url, download=False) + except Exception as e: + logger.exception(e) + raise NotADownloadableLinkError(f'Video info extraction failed for {url}') + if 'ext' in result: + 
return result + else: + try: + result = (result["entries"][0]) + return result + except Exception as e: + logger.exception(e) + raise NotADownloadableLinkError(f'Video info extraction failed for {url}') From d64acc25f52b85eec74a35fbc229fa080f869d0c Mon Sep 17 00:00:00 2001 From: BlipRanger <1860540+BlipRanger@users.noreply.github.com> Date: Mon, 25 Apr 2022 12:53:59 -0400 Subject: [PATCH 022/110] Add tests, fix style. --- bdfr/site_downloaders/download_factory.py | 4 +- bdfr/site_downloaders/vreddit.py | 2 +- .../site_downloaders/test_download_factory.py | 2 + tests/site_downloaders/test_vreddit.py | 39 +++++++++++++++++++ 4 files changed, 44 insertions(+), 3 deletions(-) create mode 100644 tests/site_downloaders/test_vreddit.py diff --git a/bdfr/site_downloaders/download_factory.py b/bdfr/site_downloaders/download_factory.py index 2c7ef21..f6f4895 100644 --- a/bdfr/site_downloaders/download_factory.py +++ b/bdfr/site_downloaders/download_factory.py @@ -18,7 +18,7 @@ from bdfr.site_downloaders.redgifs import Redgifs from bdfr.site_downloaders.self_post import SelfPost from bdfr.site_downloaders.vidble import Vidble from bdfr.site_downloaders.youtube import Youtube -from bdfr.site_downloaders.vreddit import vreddit +from bdfr.site_downloaders.vreddit import Vreddit class DownloadFactory: @@ -49,7 +49,7 @@ class DownloadFactory: elif re.match(r'i\.redd\.it.*', sanitised_url): return Direct elif re.match(r'v\.redd\.it.*', sanitised_url): - return vreddit + return Vreddit elif re.match(r'pornhub\.com.*', sanitised_url): return PornHub elif re.match(r'vidble\.com', sanitised_url): diff --git a/bdfr/site_downloaders/vreddit.py b/bdfr/site_downloaders/vreddit.py index 945ee93..b1117a0 100644 --- a/bdfr/site_downloaders/vreddit.py +++ b/bdfr/site_downloaders/vreddit.py @@ -16,7 +16,7 @@ from bdfr.site_downloaders.base_downloader import BaseDownloader logger = logging.getLogger(__name__) -class vreddit(BaseDownloader): +class Vreddit(BaseDownloader): def __init__(self, 
post: Submission): super().__init__(post) diff --git a/tests/site_downloaders/test_download_factory.py b/tests/site_downloaders/test_download_factory.py index 134396c..b66d17b 100644 --- a/tests/site_downloaders/test_download_factory.py +++ b/tests/site_downloaders/test_download_factory.py @@ -17,6 +17,7 @@ from bdfr.site_downloaders.pornhub import PornHub from bdfr.site_downloaders.redgifs import Redgifs from bdfr.site_downloaders.self_post import SelfPost from bdfr.site_downloaders.youtube import Youtube +from bdfr.site_downloaders.vreddit import Vreddit @pytest.mark.online @@ -48,6 +49,7 @@ from bdfr.site_downloaders.youtube import Youtube ('http://video.pbs.org/viralplayer/2365173446/', YtdlpFallback), ('https://www.pornhub.com/view_video.php?viewkey=ph5a2ee0461a8d0', PornHub), ('https://www.patreon.com/posts/minecart-track-59346560', Gallery), + ('https://v.redd.it/9z1dnk3xr5k61', Vreddit) )) def test_factory_lever_good(test_submission_url: str, expected_class: BaseDownloader, reddit_instance: praw.Reddit): result = DownloadFactory.pull_lever(test_submission_url) diff --git a/tests/site_downloaders/test_vreddit.py b/tests/site_downloaders/test_vreddit.py new file mode 100644 index 0000000..65428b5 --- /dev/null +++ b/tests/site_downloaders/test_vreddit.py @@ -0,0 +1,39 @@ +#!/usr/bin/env python3 +# coding=utf-8 + +from unittest.mock import MagicMock + +import pytest + +from bdfr.exceptions import NotADownloadableLinkError +from bdfr.resource import Resource +from bdfr.site_downloaders.vreddit import Vreddit + + +@pytest.mark.online +@pytest.mark.slow +@pytest.mark.parametrize(('test_url', 'expected_hash'), ( + ('https://www.reddit.com/user/Xomb_Forever/comments/u5p2kj/hold_up/', '690cffe27a7884196437926c22897216'), +)) +def test_find_resources_good(test_url: str, expected_hash: str): + test_submission = MagicMock() + test_submission.url = test_url + downloader = Vreddit(test_submission) + resources = downloader.find_resources() + assert len(resources) == 1 + 
assert isinstance(resources[0], Resource) + resources[0].download() + assert resources[0].hash.hexdigest() == expected_hash + + +@pytest.mark.online +@pytest.mark.parametrize('test_url', ( + 'https://www.polygon.com/disney-plus/2020/5/14/21249881/gargoyles-animated-series-disney-plus-greg-weisman' + '-interview-oj-simpson-goliath-chronicles', +)) +def test_find_resources_bad(test_url: str): + test_submission = MagicMock() + test_submission.url = test_url + downloader = Vreddit(test_submission) + with pytest.raises(NotADownloadableLinkError): + downloader.find_resources() From 274407537ef3b2426262ceb8a14a340eb0cc4a0f Mon Sep 17 00:00:00 2001 From: BlipRanger <1860540+BlipRanger@users.noreply.github.com> Date: Mon, 25 Apr 2022 13:02:42 -0400 Subject: [PATCH 023/110] Fix one test --- tests/site_downloaders/test_download_factory.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/site_downloaders/test_download_factory.py b/tests/site_downloaders/test_download_factory.py index b66d17b..30e19f1 100644 --- a/tests/site_downloaders/test_download_factory.py +++ b/tests/site_downloaders/test_download_factory.py @@ -43,7 +43,7 @@ from bdfr.site_downloaders.vreddit import Vreddit ('https://i.imgur.com/3SKrQfK.jpg?1', Direct), ('https://dynasty-scans.com/system/images_images/000/017/819/original/80215103_p0.png?1612232781', Direct), ('https://m.imgur.com/a/py3RW0j', Imgur), - ('https://v.redd.it/9z1dnk3xr5k61', YtdlpFallback), + ('https://v.redd.it/9z1dnk3xr5k61', Vreddit), ('https://streamable.com/dt46y', YtdlpFallback), ('https://vimeo.com/channels/31259/53576664', YtdlpFallback), ('http://video.pbs.org/viralplayer/2365173446/', YtdlpFallback), From 1ad2b68e03c9ae7f5619e81f93caa81335bd8506 Mon Sep 17 00:00:00 2001 From: Jacob Chapman <7908073+chapmanjacobd@users.noreply.github.com> Date: Thu, 28 Apr 2022 19:44:17 -0500 Subject: [PATCH 024/110] fix: Redirect to /subreddits/search ``` File "/home/xk/github/o/bulk-downloader-for-reddit/bdfr/connector.py", 
line 413, in check_subreddit_status assert subreddit.id File "/home/xk/.local/share/virtualenvs/bulk-downloader-for-reddit-dCAFmVJi/lib/python3.10/site-packages/praw/models/reddit/base.py", line 34, in __getattr__ self._fetch() File "/home/xk/.local/share/virtualenvs/bulk-downloader-for-reddit-dCAFmVJi/lib/python3.10/site-packages/praw/models/reddit/subreddit.py", line 584, in _fetch data = self._fetch_data() File "/home/xk/.local/share/virtualenvs/bulk-downloader-for-reddit-dCAFmVJi/lib/python3.10/site-packages/praw/models/reddit/subreddit.py", line 581, in _fetch_data return self._reddit.request("GET", path, params) File "/home/xk/.local/share/virtualenvs/bulk-downloader-for-reddit-dCAFmVJi/lib/python3.10/site-packages/praw/reddit.py", line 885, in request return self._core.request( File "/home/xk/.local/share/virtualenvs/bulk-downloader-for-reddit-dCAFmVJi/lib/python3.10/site-packages/prawcore/sessions.py", line 330, in request return self._request_with_retries( File "/home/xk/.local/share/virtualenvs/bulk-downloader-for-reddit-dCAFmVJi/lib/python3.10/site-packages/prawcore/sessions.py", line 266, in _request_with_retries raise self.STATUS_EXCEPTIONS[response.status_code](response) prawcore.exceptions.Redirect: Redirect to /subreddits/search ``` --- bdfr/connector.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/bdfr/connector.py b/bdfr/connector.py index e04d9ef..61ed8f4 100644 --- a/bdfr/connector.py +++ b/bdfr/connector.py @@ -414,7 +414,9 @@ class RedditConnector(metaclass=ABCMeta): try: assert subreddit.id except prawcore.NotFound: - raise errors.BulkDownloaderException(f'Source {subreddit.display_name} does not exist or cannot be found') + raise errors.BulkDownloaderException(f"Source {subreddit.display_name} cannot be found") + except prawcore.Redirect: + raise errors.BulkDownloaderException(f"Source {subreddit.display_name} does not exist") except prawcore.Forbidden: raise errors.BulkDownloaderException(f'Source 
{subreddit.display_name} is private and cannot be scraped') From 81c49de9114ab160ddd00ae4dd3fc3bee35081df Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sun, 8 May 2022 12:18:31 +1000 Subject: [PATCH 025/110] Replace old Vidble test cases --- tests/site_downloaders/test_vidble.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/site_downloaders/test_vidble.py b/tests/site_downloaders/test_vidble.py index 0c5ebb2..50ca808 100644 --- a/tests/site_downloaders/test_vidble.py +++ b/tests/site_downloaders/test_vidble.py @@ -30,8 +30,8 @@ def test_change_med_url(test_url: str, expected: str): 'https://www.vidble.com/VWuNsnLJMD.jpg', 'https://www.vidble.com/sMmM8O650W.jpg', }), - ('https://vidble.com/watch?v=0q4nWakqM6kzQWxlePD8N62Dsflev0N9', { - 'https://www.vidble.com/0q4nWakqM6kzQWxlePD8N62Dsflev0N9.mp4', + ('https://www.vidble.com/watch?v=joC6b7cgs2Tnucx7dhDoyqKPbr7TQUA5', { + 'https://www.vidble.com/joC6b7cgs2Tnucx7dhDoyqKPbr7TQUA5.mp4', }), ('https://www.vidble.com/pHuwWkOcEb', { 'https://www.vidble.com/pHuwWkOcEb.jpg', @@ -55,8 +55,8 @@ def test_get_links(test_url: str, expected: set[str]): 'b31a942cd8cdda218ed547bbc04c3a27', '6f77c570b451eef4222804bd52267481', }), - ('https://vidble.com/watch?v=0q4nWakqM6kzQWxlePD8N62Dsflev0N9', { - 'cebe9d5f24dba3b0443e5097f160ca83', + ('https://www.vidble.com/watch?v=joC6b7cgs2Tnucx7dhDoyqKPbr7TQUA5', { + 'ec5f7a7f74a4dd55c740cbfd4d3bf9ab', }), ('https://www.vidble.com/pHuwWkOcEb', { '585f486dd0b2f23a57bddbd5bf185bc7', From bfd481739b28c132e6bcc35595e3853a30d51a53 Mon Sep 17 00:00:00 2001 From: Jacob Chapman <7908073+chapmanjacobd@users.noreply.github.com> Date: Sun, 8 May 2022 08:45:34 -0500 Subject: [PATCH 026/110] Update test_connector.py --- tests/test_connector.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_connector.py b/tests/test_connector.py index 3a10757..4a2d461 100644 --- a/tests/test_connector.py +++ b/tests/test_connector.py @@ -448,6 +448,7 @@ def 
test_check_user_existence_banned( @pytest.mark.parametrize(('test_subreddit_name', 'expected_message'), ( ('donaldtrump', 'cannot be found'), - ('submitters', 'private and cannot be scraped') + ('submitters', 'private and cannot be scraped'), + ('lhnhfkuhwreolo', 'does not exist') )) def test_check_subreddit_status_bad(test_subreddit_name: str, expected_message: str, reddit_instance: praw.Reddit): test_subreddit = reddit_instance.subreddit(test_subreddit_name) From ac8855bc14690756ae1f5ca4fe912be390fa590a Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Wed, 6 Jul 2022 15:04:05 +1000 Subject: [PATCH 027/110] Add test case --- .../site_downloaders/fallback_downloaders/test_ytdlp_fallback.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/site_downloaders/fallback_downloaders/test_ytdlp_fallback.py b/tests/site_downloaders/fallback_downloaders/test_ytdlp_fallback.py index 503f14e..c600f6f 100644 --- a/tests/site_downloaders/fallback_downloaders/test_ytdlp_fallback.py +++ b/tests/site_downloaders/fallback_downloaders/test_ytdlp_fallback.py @@ -15,6 +15,7 @@ from bdfr.site_downloaders.fallback_downloaders.ytdlp_fallback import YtdlpFallb ('https://www.youtube.com/watch?v=P19nvJOmqCc', True), ('https://www.example.com/test', False), ('https://milesmatrix.bandcamp.com/album/la-boum/', False), + ('https://v.redd.it/54i8fvzev3u81', True), )) def test_can_handle_link(test_url: str, expected: bool): result = YtdlpFallback.can_handle_link(test_url) From 2e68850d0fa17060cb28078d662955fe9057c1b4 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Wed, 6 Jul 2022 16:50:02 +1000 Subject: [PATCH 028/110] Fix some test cases --- tests/integration_tests/test_download_integration.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tests/integration_tests/test_download_integration.py b/tests/integration_tests/test_download_integration.py index 75216dd..ffae0d4 100644 --- a/tests/integration_tests/test_download_integration.py +++ b/tests/integration_tests/test_download_integration.py @@ -97,7 +97,6 @@ def 
test_cli_download_user_specific_subreddits(test_args: list[str], tmp_path: P ['-l', 'm2601g'], ['-l', 'https://www.reddit.com/r/TrollXChromosomes/comments/m2601g/its_a_step_in_the_right_direction/'], ['-l', 'm3hxzd'], # Really long title used to overflow filename limit - ['-l', 'm3kua3'], # Has a deleted user ['-l', 'm5bqkf'], # Resource leading to a 404 )) def test_cli_download_links(test_args: list[str], tmp_path: Path): @@ -313,9 +312,8 @@ def test_cli_download_file_scheme_warning(test_args: list[str], tmp_path: Path): @pytest.mark.reddit @pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests') @pytest.mark.parametrize('test_args', ( - ['-l', 'm2601g', '--disable-module', 'Direct'], - ['-l', 'nnb9vs', '--disable-module', 'YoutubeDlFallback'], - ['-l', 'nnb9vs', '--disable-module', 'youtubedlfallback'], + ['-l', 'm2601g', '--disable-module', 'SelfPost'], + ['-l', 'nnb9vs', '--disable-module', 'YtdlpFallback'], )) def test_cli_download_disable_modules(test_args: list[str], tmp_path: Path): runner = CliRunner() From f57590cfa0ad72d3af65aec39f1b1462100e1f31 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Wed, 6 Jul 2022 16:52:01 +1000 Subject: [PATCH 029/110] Add exclusion options to archiver --- bdfr/__main__.py | 4 ++-- bdfr/archiver.py | 3 +++ bdfr/configuration.py | 2 +- .../integration_tests/test_archive_integration.py | 15 +++++++++++++++ 4 files changed, 21 insertions(+), 3 deletions(-) diff --git a/bdfr/__main__.py b/bdfr/__main__.py index 451a7f0..45450ed 100644 --- a/bdfr/__main__.py +++ b/bdfr/__main__.py @@ -17,6 +17,8 @@ _common_options = [ click.option('--authenticate', is_flag=True, default=None), click.option('--config', type=str, default=None), click.option('--disable-module', multiple=True, default=None, type=str), + click.option('--exclude-id', default=None, multiple=True), + click.option('--exclude-id-file', default=None, multiple=True), click.option('--file-scheme', default=None, type=str), 
click.option('--folder-scheme', default=None, type=str), click.option('--ignore-user', type=str, multiple=True, default=None), @@ -44,8 +46,6 @@ _downloader_options = [ click.option('--max-wait-time', type=int, default=None), click.option('--no-dupes', is_flag=True, default=None), click.option('--search-existing', is_flag=True, default=None), - click.option('--exclude-id', default=None, multiple=True), - click.option('--exclude-id-file', default=None, multiple=True), click.option('--skip', default=None, multiple=True), click.option('--skip-domain', default=None, multiple=True), click.option('--skip-subreddit', default=None, multiple=True), diff --git a/bdfr/archiver.py b/bdfr/archiver.py index 559dcc1..214111f 100644 --- a/bdfr/archiver.py +++ b/bdfr/archiver.py @@ -34,6 +34,9 @@ class Archiver(RedditConnector): f'Submission {submission.id} in {submission.subreddit.display_name} skipped' f' due to {submission.author.name if submission.author else "DELETED"} being an ignored user') continue + if submission.id in self.excluded_submission_ids: + logger.debug(f'Object {submission.id} in exclusion list, skipping') + continue logger.debug(f'Attempting to archive submission {submission.id}') self.write_entry(submission) diff --git a/bdfr/configuration.py b/bdfr/configuration.py index ef24e36..c8fb323 100644 --- a/bdfr/configuration.py +++ b/bdfr/configuration.py @@ -35,7 +35,7 @@ class Configuration(Namespace): self.skip_subreddit: list[str] = [] self.sort: str = 'hot' self.submitted: bool = False - self.subscribed: bool = True + self.subscribed: bool = False self.subreddit: list[str] = [] self.time: str = 'all' self.time_format = None diff --git a/tests/integration_tests/test_archive_integration.py b/tests/integration_tests/test_archive_integration.py index 2234c5a..744e343 100644 --- a/tests/integration_tests/test_archive_integration.py +++ b/tests/integration_tests/test_archive_integration.py @@ -136,3 +136,18 @@ def test_cli_archive_file_format(test_args: list[str], 
tmp_path: Path): assert result.exit_code == 0 assert 'Attempting to archive submission' in result.output assert re.search('format at /.+?/Judge says Trump and two adult', result.output) + + +@pytest.mark.online +@pytest.mark.reddit +@pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests') +@pytest.mark.parametrize('test_args', ( + ['-l', 'm2601g', '--exclude-id', 'm2601g'], +)) +def test_cli_archive_links_exclusion(test_args: list[str], tmp_path: Path): + runner = CliRunner() + test_args = create_basic_args_for_archive_runner(test_args, tmp_path) + result = runner.invoke(cli, test_args) + assert result.exit_code == 0 + assert 'in exclusion list' in result.output + assert 'Attempting to archive' not in result.output From 12104d54f1b91b67c0eefcf2fa74a498a9347f7e Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Thu, 7 Jul 2022 11:39:42 +1000 Subject: [PATCH 030/110] Update some test hashes --- .../fallback_downloaders/test_ytdlp_fallback.py | 4 ++-- tests/site_downloaders/test_youtube.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/site_downloaders/fallback_downloaders/test_ytdlp_fallback.py b/tests/site_downloaders/fallback_downloaders/test_ytdlp_fallback.py index c600f6f..23c95f9 100644 --- a/tests/site_downloaders/fallback_downloaders/test_ytdlp_fallback.py +++ b/tests/site_downloaders/fallback_downloaders/test_ytdlp_fallback.py @@ -36,8 +36,8 @@ def test_info_extraction_bad(test_url: str): @pytest.mark.parametrize(('test_url', 'expected_hash'), ( ('https://streamable.com/dt46y', 'b7e465adaade5f2b6d8c2b4b7d0a2878'), ('https://streamable.com/t8sem', '49b2d1220c485455548f1edbc05d4ecf'), - ('https://www.reddit.com/r/specializedtools/comments/n2nw5m/bamboo_splitter/', '0a406f3d2e09b3d3ba43bf97185b83e3'), - ('https://v.redd.it/9z1dnk3xr5k61', 'c7765c33972549465c87dcbd59eb3d5d'), + ('https://www.reddit.com/r/specializedtools/comments/n2nw5m/bamboo_splitter/', 
'49316899440ea1c3b74d5640d9d527c1'), + ('https://v.redd.it/9z1dnk3xr5k61', '76d5e6d7f4f9e1910c6c22b54dfa804f'), )) def test_find_resources(test_url: str, expected_hash: str): test_submission = MagicMock() diff --git a/tests/site_downloaders/test_youtube.py b/tests/site_downloaders/test_youtube.py index 684eb20..ce1abb8 100644 --- a/tests/site_downloaders/test_youtube.py +++ b/tests/site_downloaders/test_youtube.py @@ -15,7 +15,7 @@ from bdfr.site_downloaders.youtube import Youtube @pytest.mark.parametrize(('test_url', 'expected_hash'), ( ('https://www.youtube.com/watch?v=uSm2VDgRIUs', '2d60b54582df5b95ec72bb00b580d2ff'), ('https://www.youtube.com/watch?v=GcI7nxQj7HA', '5db0fc92a0a7fb9ac91e63505eea9cf0'), - ('https://youtu.be/TMqPOlp4tNo', 'f68c00b018162857f3df4844c45302e7'), # Age restricted + ('https://youtu.be/TMqPOlp4tNo', 'ceb4c2cb1a9bf79617623b2aa57e18fd'), # Age restricted )) def test_find_resources_good(test_url: str, expected_hash: str): test_submission = MagicMock() From a6940987f487c0ecf8dcaa0d067783653aa8525c Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Thu, 7 Jul 2022 12:07:53 +1000 Subject: [PATCH 031/110] Update test parameter --- tests/test_file_name_formatter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_file_name_formatter.py b/tests/test_file_name_formatter.py index c9e049e..9c0a8bb 100644 --- a/tests/test_file_name_formatter.py +++ b/tests/test_file_name_formatter.py @@ -407,7 +407,7 @@ def test_windows_max_path(tmp_path: Path): @pytest.mark.parametrize(('test_reddit_id', 'test_downloader', 'expected_names'), ( ('gphmnr', YtdlpFallback, {'He has a lot to say today.mp4'}), ('d0oir2', YtdlpFallback, {"Crunk's finest moment. 
Welcome to the new subreddit!.mp4"}), - ('jiecu', SelfPost, {'Reston, VA Some info regarding shelters in the area..txt'}), + ('jiecu', SelfPost, {'[deleted by user].txt'}), ('gui1i', SelfPost, {'The "Beer and Ear offer for those who need help in the \'burbs of North Dallas....txt'}), )) def test_name_submission( From 919abb09efabb839456f6ec1e684d2bed4d583c9 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Fri, 15 Jul 2022 14:22:55 +1000 Subject: [PATCH 032/110] Remove bugged test case --- tests/archive_entry/test_submission_archive_entry.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/archive_entry/test_submission_archive_entry.py b/tests/archive_entry/test_submission_archive_entry.py index 60f47b5..045eabd 100644 --- a/tests/archive_entry/test_submission_archive_entry.py +++ b/tests/archive_entry/test_submission_archive_entry.py @@ -34,7 +34,7 @@ def test_get_comments(test_submission_id: str, min_comments: int, reddit_instanc 'created_utc': 1615583837, 'permalink': '/r/australia/comments/m3reby/this_little_guy_fell_out_of_a_tree_and_in_front/' }), - ('m3kua3', {'author': 'DELETED'}), + # TODO: add deleted user test case )) def test_get_post_details(test_submission_id: str, expected_dict: dict, reddit_instance: praw.Reddit): test_submission = reddit_instance.submission(id=test_submission_id) From a620ae91a18013d1e172d628d66b65980e16c0e0 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Fri, 18 Feb 2022 10:21:52 +1000 Subject: [PATCH 033/110] Add --subscribed option --- bdfr/__main__.py | 1 + bdfr/configuration.py | 1 + bdfr/connector.py | 20 +++++++---- bdfr/default_config.cfg | 2 +- .../test_download_integration.py | 35 ++++++++++--------- tests/test_connector.py | 18 +++++++++- 6 files changed, 52 insertions(+), 25 deletions(-) diff --git a/bdfr/__main__.py b/bdfr/__main__.py index de658de..56ffb0f 100644 --- a/bdfr/__main__.py +++ b/bdfr/__main__.py @@ -23,6 +23,7 @@ _common_options = [ click.option('--saved', is_flag=True, default=None), 
click.option('--search', default=None, type=str), click.option('--submitted', is_flag=True, default=None), + click.option('--subscribed', is_flag=True, default=None), click.option('--time-format', type=str, default=None), click.option('--upvoted', is_flag=True, default=None), click.option('-L', '--limit', default=None, type=int), diff --git a/bdfr/configuration.py b/bdfr/configuration.py index 81fa3e4..ef24e36 100644 --- a/bdfr/configuration.py +++ b/bdfr/configuration.py @@ -35,6 +35,7 @@ class Configuration(Namespace): self.skip_subreddit: list[str] = [] self.sort: str = 'hot' self.submitted: bool = False + self.subscribed: bool = True self.subreddit: list[str] = [] self.time: str = 'all' self.time_format = None diff --git a/bdfr/connector.py b/bdfr/connector.py index 506e23f..e04d9ef 100644 --- a/bdfr/connector.py +++ b/bdfr/connector.py @@ -243,9 +243,19 @@ class RedditConnector(metaclass=ABCMeta): return set(all_entries) def get_subreddits(self) -> list[praw.models.ListingGenerator]: - if self.args.subreddit: - out = [] - for reddit in self.split_args_input(self.args.subreddit): + out = [] + subscribed_subreddits = set() + if self.args.subscribed: + if self.args.authenticate: + try: + subscribed_subreddits = list(self.reddit_instance.user.subreddits(limit=None)) + subscribed_subreddits = set([s.display_name for s in subscribed_subreddits]) + except prawcore.InsufficientScope: + logger.error('BDFR has insufficient scope to access subreddit lists') + else: + logger.error('Cannot find subscribed subreddits without an authenticated instance') + if self.args.subreddit or subscribed_subreddits: + for reddit in self.split_args_input(self.args.subreddit) | subscribed_subreddits: if reddit == 'friends' and self.authenticated is False: logger.error('Cannot read friends subreddit without an authenticated instance') continue @@ -270,9 +280,7 @@ class RedditConnector(metaclass=ABCMeta): logger.debug(f'Added submissions from subreddit {reddit}') except 
(errors.BulkDownloaderException, praw.exceptions.PRAWException) as e: logger.error(f'Failed to get submissions for subreddit {reddit}: {e}') - return out - else: - return [] + return out def resolve_user_name(self, in_name: str) -> str: if in_name == 'me': diff --git a/bdfr/default_config.cfg b/bdfr/default_config.cfg index b8039a9..c601152 100644 --- a/bdfr/default_config.cfg +++ b/bdfr/default_config.cfg @@ -1,7 +1,7 @@ [DEFAULT] client_id = U-6gk4ZCh3IeNQ client_secret = 7CZHY6AmKweZME5s50SfDGylaPg -scopes = identity, history, read, save +scopes = identity, history, read, save, mysubreddits backup_log_count = 3 max_wait_time = 120 time_format = ISO \ No newline at end of file diff --git a/tests/integration_tests/test_download_integration.py b/tests/integration_tests/test_download_integration.py index bd53382..75216dd 100644 --- a/tests/integration_tests/test_download_integration.py +++ b/tests/integration_tests/test_download_integration.py @@ -31,23 +31,23 @@ def create_basic_args_for_download_runner(test_args: list[str], run_path: Path): @pytest.mark.reddit @pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests') @pytest.mark.parametrize('test_args', ( - ['-s', 'Mindustry', '-L', 1], - ['-s', 'r/Mindustry', '-L', 1], - ['-s', 'r/mindustry', '-L', 1], - ['-s', 'mindustry', '-L', 1], - ['-s', 'https://www.reddit.com/r/TrollXChromosomes/', '-L', 1], - ['-s', 'r/TrollXChromosomes/', '-L', 1], - ['-s', 'TrollXChromosomes/', '-L', 1], - ['-s', 'trollxchromosomes', '-L', 1], - ['-s', 'trollxchromosomes,mindustry,python', '-L', 1], - ['-s', 'trollxchromosomes, mindustry, python', '-L', 1], - ['-s', 'trollxchromosomes', '-L', 1, '--time', 'day'], - ['-s', 'trollxchromosomes', '-L', 1, '--sort', 'new'], - ['-s', 'trollxchromosomes', '-L', 1, '--time', 'day', '--sort', 'new'], - ['-s', 'trollxchromosomes', '-L', 1, '--search', 'women'], - ['-s', 'trollxchromosomes', '-L', 1, '--time', 'day', '--search', 'women'], - 
['-s', 'trollxchromosomes', '-L', 1, '--sort', 'new', '--search', 'women'], - ['-s', 'trollxchromosomes', '-L', 1, '--time', 'day', '--sort', 'new', '--search', 'women'], + ['-s', 'Mindustry', '-L', 3], + ['-s', 'r/Mindustry', '-L', 3], + ['-s', 'r/mindustry', '-L', 3], + ['-s', 'mindustry', '-L', 3], + ['-s', 'https://www.reddit.com/r/TrollXChromosomes/', '-L', 3], + ['-s', 'r/TrollXChromosomes/', '-L', 3], + ['-s', 'TrollXChromosomes/', '-L', 3], + ['-s', 'trollxchromosomes', '-L', 3], + ['-s', 'trollxchromosomes,mindustry,python', '-L', 3], + ['-s', 'trollxchromosomes, mindustry, python', '-L', 3], + ['-s', 'trollxchromosomes', '-L', 3, '--time', 'day'], + ['-s', 'trollxchromosomes', '-L', 3, '--sort', 'new'], + ['-s', 'trollxchromosomes', '-L', 3, '--time', 'day', '--sort', 'new'], + ['-s', 'trollxchromosomes', '-L', 3, '--search', 'women'], + ['-s', 'trollxchromosomes', '-L', 3, '--time', 'day', '--search', 'women'], + ['-s', 'trollxchromosomes', '-L', 3, '--sort', 'new', '--search', 'women'], + ['-s', 'trollxchromosomes', '-L', 3, '--time', 'day', '--sort', 'new', '--search', 'women'], )) def test_cli_download_subreddits(test_args: list[str], tmp_path: Path): runner = CliRunner() @@ -64,6 +64,7 @@ def test_cli_download_subreddits(test_args: list[str], tmp_path: Path): @pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests') @pytest.mark.parametrize('test_args', ( ['-s', 'hentai', '-L', 10, '--search', 'red', '--authenticate'], + ['--authenticate', '--subscribed', '-L', 10], )) def test_cli_download_search_subreddits_authenticated(test_args: list[str], tmp_path: Path): runner = CliRunner() diff --git a/tests/test_connector.py b/tests/test_connector.py index 9fe58f2..3a10757 100644 --- a/tests/test_connector.py +++ b/tests/test_connector.py @@ -336,13 +336,29 @@ def test_get_user_authenticated_lists( downloader_mock.args.__dict__[test_flag] = True downloader_mock.reddit_instance = 
authenticated_reddit_instance downloader_mock.args.limit = 10 - downloader_mock._determine_sort_function.return_value = praw.models.Subreddit.hot + downloader_mock.determine_sort_function.return_value = praw.models.Subreddit.hot downloader_mock.sort_filter = RedditTypes.SortType.HOT downloader_mock.args.user = [RedditConnector.resolve_user_name(downloader_mock, 'me')] results = RedditConnector.get_user_data(downloader_mock) assert_all_results_are_submissions_or_comments(10, results) +@pytest.mark.online +@pytest.mark.reddit +@pytest.mark.authenticated +def test_get_subscribed_subreddits(downloader_mock: MagicMock, authenticated_reddit_instance: praw.Reddit): + downloader_mock.reddit_instance = authenticated_reddit_instance + downloader_mock.args.limit = 10 + downloader_mock.args.authenticate = True + downloader_mock.args.subscribed = True + downloader_mock.determine_sort_function.return_value = praw.models.Subreddit.hot + downloader_mock.determine_sort_function.return_value = praw.models.Subreddit.hot + downloader_mock.sort_filter = RedditTypes.SortType.HOT + results = RedditConnector.get_subreddits(downloader_mock) + assert all([isinstance(s, praw.models.ListingGenerator) for s in results]) + assert len(results) > 0 + + @pytest.mark.parametrize(('test_name', 'expected'), ( ('Mindustry', 'Mindustry'), ('Futurology', 'Futurology'), From 90a2eac90dc35f4067789077cb769f2c653c9233 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Fri, 18 Feb 2022 12:04:37 +1000 Subject: [PATCH 034/110] Add support for Redgifs images and galleries --- bdfr/site_downloaders/redgifs.py | 31 +++++++++++++---- tests/site_downloaders/test_redgifs.py | 48 ++++++++++++++++++-------- 2 files changed, 57 insertions(+), 22 deletions(-) diff --git a/bdfr/site_downloaders/redgifs.py b/bdfr/site_downloaders/redgifs.py index a62fedb..12fb24d 100644 --- a/bdfr/site_downloaders/redgifs.py +++ b/bdfr/site_downloaders/redgifs.py @@ -17,11 +17,11 @@ class Redgifs(BaseDownloader): super().__init__(post) def 
find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]: - media_url = self._get_link(self.post.url) - return [Resource(self.post, media_url, Resource.retry_download(media_url), '.mp4')] + media_urls = self._get_link(self.post.url) + return [Resource(self.post, m, Resource.retry_download(m), '.mp4') for m in media_urls] @staticmethod - def _get_link(url: str) -> str: + def _get_link(url: str) -> set[str]: try: redgif_id = re.match(r'.*/(.*?)/?$', url).group(1) except AttributeError: @@ -32,16 +32,33 @@ class Redgifs(BaseDownloader): 'Chrome/90.0.4430.93 Safari/537.36', } - content = Redgifs.retrieve_url(f'https://api.redgifs.com/v1/gfycats/{redgif_id}', headers=headers) + content = Redgifs.retrieve_url(f'https://api.redgifs.com/v2/gifs/{redgif_id}', headers=headers) if content is None: raise SiteDownloaderError('Could not read the page source') try: - out = json.loads(content.text)['gfyItem']['mp4Url'] - except (KeyError, AttributeError): - raise SiteDownloaderError('Failed to find JSON data in page') + response_json = json.loads(content.text) except json.JSONDecodeError as e: raise SiteDownloaderError(f'Received data was not valid JSON: {e}') + out = set() + try: + if response_json['gif']['type'] == 1: # type 1 is a video + out.add(response_json['gif']['urls']['hd']) + elif response_json['gif']['type'] == 2: # type 2 is an image + if response_json['gif']['gallery']: + content = Redgifs.retrieve_url( + f'https://api.redgifs.com/v2/gallery/{response_json["gif"]["gallery"]}', + headers=headers, + ) + response_json = json.loads(content.text) + out = {p['urls']['hd'] for p in response_json['gifs']} + else: + out.add(response_json['gif']['urls']['hd']) + else: + raise KeyError + except (KeyError, AttributeError): + raise SiteDownloaderError('Failed to find JSON data in page') + return out diff --git a/tests/site_downloaders/test_redgifs.py b/tests/site_downloaders/test_redgifs.py index 571f044..a1f571e 100644 --- 
a/tests/site_downloaders/test_redgifs.py +++ b/tests/site_downloaders/test_redgifs.py @@ -12,30 +12,48 @@ from bdfr.site_downloaders.redgifs import Redgifs @pytest.mark.online @pytest.mark.parametrize(('test_url', 'expected'), ( ('https://redgifs.com/watch/frighteningvictorioussalamander', - 'https://thumbs2.redgifs.com/FrighteningVictoriousSalamander.mp4'), + {'https://thumbs2.redgifs.com/FrighteningVictoriousSalamander.mp4'}), ('https://redgifs.com/watch/springgreendecisivetaruca', - 'https://thumbs2.redgifs.com/SpringgreenDecisiveTaruca.mp4'), + {'https://thumbs2.redgifs.com/SpringgreenDecisiveTaruca.mp4'}), ('https://www.redgifs.com/watch/palegoldenrodrawhalibut', - 'https://thumbs2.redgifs.com/PalegoldenrodRawHalibut.mp4'), + {'https://thumbs2.redgifs.com/PalegoldenrodRawHalibut.mp4'}), + ('https://redgifs.com/watch/hollowintentsnowyowl', + {'https://thumbs2.redgifs.com/HollowIntentSnowyowl-large.jpg'}), + ('https://www.redgifs.com/watch/lustrousstickywaxwing', + {'https://thumbs2.redgifs.com/EntireEnchantingHypsilophodon-large.jpg', + 'https://thumbs2.redgifs.com/FancyMagnificentAdamsstaghornedbeetle-large.jpg', + 'https://thumbs2.redgifs.com/LustrousStickyWaxwing-large.jpg', + 'https://thumbs2.redgifs.com/ParchedWindyArmyworm-large.jpg', + 'https://thumbs2.redgifs.com/ThunderousColorlessErmine-large.jpg', + 'https://thumbs2.redgifs.com/UnripeUnkemptWoodpecker-large.jpg'}), )) -def test_get_link(test_url: str, expected: str): +def test_get_link(test_url: str, expected: set[str]): result = Redgifs._get_link(test_url) assert result == expected @pytest.mark.online -@pytest.mark.parametrize(('test_url', 'expected_hash'), ( - ('https://redgifs.com/watch/frighteningvictorioussalamander', '4007c35d9e1f4b67091b5f12cffda00a'), - ('https://redgifs.com/watch/springgreendecisivetaruca', '8dac487ac49a1f18cc1b4dabe23f0869'), - ('https://redgifs.com/watch/leafysaltydungbeetle', '076792c660b9c024c0471ef4759af8bd'), - ('https://www.redgifs.com/watch/palegoldenrodrawhalibut', 
'46d5aa77fe80c6407de1ecc92801c10e'), +@pytest.mark.parametrize(('test_url', 'expected_hashes'), ( + ('https://redgifs.com/watch/frighteningvictorioussalamander', {'4007c35d9e1f4b67091b5f12cffda00a'}), + ('https://redgifs.com/watch/springgreendecisivetaruca', {'8dac487ac49a1f18cc1b4dabe23f0869'}), + ('https://redgifs.com/watch/leafysaltydungbeetle', {'076792c660b9c024c0471ef4759af8bd'}), + ('https://www.redgifs.com/watch/palegoldenrodrawhalibut', {'46d5aa77fe80c6407de1ecc92801c10e'}), + ('https://redgifs.com/watch/hollowintentsnowyowl', + {'5ee51fa15e0a58e98f11dea6a6cca771'}), + ('https://www.redgifs.com/watch/lustrousstickywaxwing', + {'b461e55664f07bed8d2f41d8586728fa', + '30ba079a8ed7d7adf17929dc3064c10f', + '0d4f149d170d29fc2f015c1121bab18b', + '53987d99cfd77fd65b5fdade3718f9f1', + 'fb2e7d972846b83bf4016447d3060d60', + '44fb28f72ec9a5cca63fa4369ab4f672'}), )) -def test_download_resource(test_url: str, expected_hash: str): +def test_download_resource(test_url: str, expected_hashes: set[str]): mock_submission = Mock() mock_submission.url = test_url test_site = Redgifs(mock_submission) - resources = test_site.find_resources() - assert len(resources) == 1 - assert isinstance(resources[0], Resource) - resources[0].download() - assert resources[0].hash.hexdigest() == expected_hash + results = test_site.find_resources() + assert all([isinstance(res, Resource) for res in results]) + [res.download() for res in results] + hashes = set([res.hash.hexdigest() for res in results]) + assert hashes == set(expected_hashes) From e8d767050f1be80311b293bc541b3600f52f0e61 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Fri, 18 Feb 2022 12:30:38 +1000 Subject: [PATCH 035/110] Add file scheme naming for archiver --- bdfr/__main__.py | 4 ++-- .../integration_tests/test_archive_integration.py | 15 +++++++++++++++ 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/bdfr/__main__.py b/bdfr/__main__.py index 56ffb0f..451a7f0 100644 --- a/bdfr/__main__.py +++ b/bdfr/__main__.py 
@@ -17,6 +17,8 @@ _common_options = [ click.option('--authenticate', is_flag=True, default=None), click.option('--config', type=str, default=None), click.option('--disable-module', multiple=True, default=None, type=str), + click.option('--file-scheme', default=None, type=str), + click.option('--folder-scheme', default=None, type=str), click.option('--ignore-user', type=str, multiple=True, default=None), click.option('--include-id-file', multiple=True, default=None), click.option('--log', type=str, default=None), @@ -38,8 +40,6 @@ _common_options = [ ] _downloader_options = [ - click.option('--file-scheme', default=None, type=str), - click.option('--folder-scheme', default=None, type=str), click.option('--make-hard-links', is_flag=True, default=None), click.option('--max-wait-time', type=int, default=None), click.option('--no-dupes', is_flag=True, default=None), diff --git a/tests/integration_tests/test_archive_integration.py b/tests/integration_tests/test_archive_integration.py index 5ef04a6..2234c5a 100644 --- a/tests/integration_tests/test_archive_integration.py +++ b/tests/integration_tests/test_archive_integration.py @@ -121,3 +121,18 @@ def test_cli_archive_ignore_user(test_args: list[str], tmp_path: Path): assert result.exit_code == 0 assert 'being an ignored user' in result.output assert 'Attempting to archive submission' not in result.output + + +@pytest.mark.online +@pytest.mark.reddit +@pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests') +@pytest.mark.parametrize('test_args', ( + ['--file-scheme', '{TITLE}', '-l', 'suy011'], +)) +def test_cli_archive_file_format(test_args: list[str], tmp_path: Path): + runner = CliRunner() + test_args = create_basic_args_for_archive_runner(test_args, tmp_path) + result = runner.invoke(cli, test_args) + assert result.exit_code == 0 + assert 'Attempting to archive submission' in result.output + assert re.search('format at /.+?/Judge says Trump and two adult', 
result.output) From a59916939979f7e540e80190a3d4b2ae0a87267e Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Fri, 18 Feb 2022 12:42:42 +1000 Subject: [PATCH 036/110] Increase version number --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index 198ebe7..67a1deb 100644 --- a/setup.cfg +++ b/setup.cfg @@ -4,7 +4,7 @@ description_file = README.md description_content_type = text/markdown home_page = https://github.com/aliparlakci/bulk-downloader-for-reddit keywords = reddit, download, archive -version = 2.5.2 +version = 2.6.0 author = Ali Parlakci author_email = parlakciali@gmail.com maintainer = Serene Arc From f49a1d7a2d585f4743c526a745dacc231e439038 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Fri, 18 Feb 2022 12:49:46 +1000 Subject: [PATCH 037/110] Fix gfycat after redgifs changes --- bdfr/site_downloaders/gfycat.py | 4 ++-- tests/site_downloaders/test_gfycat.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/bdfr/site_downloaders/gfycat.py b/bdfr/site_downloaders/gfycat.py index 6accaab..c8da9df 100644 --- a/bdfr/site_downloaders/gfycat.py +++ b/bdfr/site_downloaders/gfycat.py @@ -21,7 +21,7 @@ class Gfycat(Redgifs): return super().find_resources(authenticator) @staticmethod - def _get_link(url: str) -> str: + def _get_link(url: str) -> set[str]: gfycat_id = re.match(r'.*/(.*?)/?$', url).group(1) url = 'https://gfycat.com/' + gfycat_id @@ -39,4 +39,4 @@ class Gfycat(Redgifs): raise SiteDownloaderError(f'Failed to download Gfycat link {url}: {e}') except json.JSONDecodeError as e: raise SiteDownloaderError(f'Did not receive valid JSON data: {e}') - return out + return {out,} diff --git a/tests/site_downloaders/test_gfycat.py b/tests/site_downloaders/test_gfycat.py index 981d01d..3b40840 100644 --- a/tests/site_downloaders/test_gfycat.py +++ b/tests/site_downloaders/test_gfycat.py @@ -16,7 +16,7 @@ from bdfr.site_downloaders.gfycat import Gfycat )) def test_get_link(test_url: str, 
expected_url: str): result = Gfycat._get_link(test_url) - assert result == expected_url + assert result.pop() == expected_url @pytest.mark.online From 1abb7768c34a4ff3f8f09ccb25927d8349091f54 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Fri, 18 Feb 2022 12:51:51 +1000 Subject: [PATCH 038/110] Update test hashes --- .../fallback_downloaders/test_ytdlp_fallback.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/site_downloaders/fallback_downloaders/test_ytdlp_fallback.py b/tests/site_downloaders/fallback_downloaders/test_ytdlp_fallback.py index 2c4a4f6..503f14e 100644 --- a/tests/site_downloaders/fallback_downloaders/test_ytdlp_fallback.py +++ b/tests/site_downloaders/fallback_downloaders/test_ytdlp_fallback.py @@ -35,8 +35,8 @@ def test_info_extraction_bad(test_url: str): @pytest.mark.parametrize(('test_url', 'expected_hash'), ( ('https://streamable.com/dt46y', 'b7e465adaade5f2b6d8c2b4b7d0a2878'), ('https://streamable.com/t8sem', '49b2d1220c485455548f1edbc05d4ecf'), - ('https://www.reddit.com/r/specializedtools/comments/n2nw5m/bamboo_splitter/', '21968d3d92161ea5e0abdcaf6311b06c'), - ('https://v.redd.it/9z1dnk3xr5k61', '351a2b57e888df5ccbc508056511f38d'), + ('https://www.reddit.com/r/specializedtools/comments/n2nw5m/bamboo_splitter/', '0a406f3d2e09b3d3ba43bf97185b83e3'), + ('https://v.redd.it/9z1dnk3xr5k61', 'c7765c33972549465c87dcbd59eb3d5d'), )) def test_find_resources(test_url: str, expected_hash: str): test_submission = MagicMock() From 12982c00cdd7bdbd69987cb686118194d0b26f40 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sun, 20 Feb 2022 15:48:02 +1000 Subject: [PATCH 039/110] Switch redgifs to dynamic file extensions --- bdfr/site_downloaders/redgifs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bdfr/site_downloaders/redgifs.py b/bdfr/site_downloaders/redgifs.py index 12fb24d..f7ea56a 100644 --- a/bdfr/site_downloaders/redgifs.py +++ b/bdfr/site_downloaders/redgifs.py @@ -18,7 +18,7 @@ class 
Redgifs(BaseDownloader): def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]: media_urls = self._get_link(self.post.url) - return [Resource(self.post, m, Resource.retry_download(m), '.mp4') for m in media_urls] + return [Resource(self.post, m, Resource.retry_download(m), None) for m in media_urls] @staticmethod def _get_link(url: str) -> set[str]: From 2bdeaf26604b7c2e937176cf1139aa348d0fa6f7 Mon Sep 17 00:00:00 2001 From: sinclairkosh <102016413+sinclairkosh@users.noreply.github.com> Date: Tue, 22 Mar 2022 05:53:43 +1100 Subject: [PATCH 040/110] Update Readme with some command clarifications Clarify the fact that downloading by user doesn't work the same way as downloading by subreddit. Feel free to use a better example username. :) --- README.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/README.md b/README.md index b84aa3d..47c0dde 100644 --- a/README.md +++ b/README.md @@ -53,6 +53,12 @@ However, these commands are not enough. You should chain parameters in [Options] python3 -m bdfr download ./path/to/output --subreddit Python -L 10 ``` ```bash +python3 -m bdfr download ./path/to/output --user reddituser --submitted -L 100 +``` +```bash +python3 -m bdfr download ./path/to/output --user reddituser --submitted --all-comments --comment-context +``` +```bash python3 -m bdfr download ./path/to/output --user me --saved --authenticate -L 25 --file-scheme '{POSTID}' ``` ```bash From 9f3dcece4d2af458b819bd68e26d3adbea9ffb28 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Fri, 25 Mar 2022 10:50:52 +1000 Subject: [PATCH 041/110] Strip any newline characters from names --- bdfr/file_name_formatter.py | 3 +++ tests/test_file_name_formatter.py | 4 +++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/bdfr/file_name_formatter.py b/bdfr/file_name_formatter.py index 3e8832b..1dabd34 100644 --- a/bdfr/file_name_formatter.py +++ b/bdfr/file_name_formatter.py @@ -111,6 +111,9 @@ class FileNameFormatter: if not
resource.extension: raise BulkDownloaderException(f'Resource from {resource.url} has no extension') file_name = str(self._format_name(resource.source_submission, self.file_format_string)) + + file_name = re.sub(r'\n', ' ', file_name) + if not re.match(r'.*\.$', file_name) and not re.match(r'^\..*', resource.extension): ending = index + '.' + resource.extension else: diff --git a/tests/test_file_name_formatter.py b/tests/test_file_name_formatter.py index 30fac77..bd9d058 100644 --- a/tests/test_file_name_formatter.py +++ b/tests/test_file_name_formatter.py @@ -16,6 +16,7 @@ from bdfr.file_name_formatter import FileNameFormatter from bdfr.resource import Resource from bdfr.site_downloaders.base_downloader import BaseDownloader from bdfr.site_downloaders.fallback_downloaders.ytdlp_fallback import YtdlpFallback +from bdfr.site_downloaders.self_post import SelfPost @pytest.fixture() @@ -406,6 +407,7 @@ def test_windows_max_path(tmp_path: Path): @pytest.mark.parametrize(('test_reddit_id', 'test_downloader', 'expected_names'), ( ('gphmnr', YtdlpFallback, {'He has a lot to say today.mp4'}), ('d0oir2', YtdlpFallback, {"Crunk's finest moment. 
Welcome to the new subreddit!.mp4"}), + ('jiecu', SelfPost, {'Reston, VA Some info regarding shelters in the area..txt'}) )) def test_name_submission( test_reddit_id: str, @@ -418,4 +420,4 @@ def test_name_submission( test_formatter = FileNameFormatter('{TITLE}', '', '') results = test_formatter.format_resource_paths(test_resources, Path('.')) results = set([r[0].name for r in results]) - assert expected_names == results + assert results == expected_names From 53d7ce2e5d9ec313fde62e184b6b356340c0a0d9 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Fri, 25 Mar 2022 10:52:49 +1000 Subject: [PATCH 042/110] Add second test case --- tests/test_file_name_formatter.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/test_file_name_formatter.py b/tests/test_file_name_formatter.py index bd9d058..c9e049e 100644 --- a/tests/test_file_name_formatter.py +++ b/tests/test_file_name_formatter.py @@ -407,7 +407,8 @@ def test_windows_max_path(tmp_path: Path): @pytest.mark.parametrize(('test_reddit_id', 'test_downloader', 'expected_names'), ( ('gphmnr', YtdlpFallback, {'He has a lot to say today.mp4'}), ('d0oir2', YtdlpFallback, {"Crunk's finest moment. 
Welcome to the new subreddit!.mp4"}), - ('jiecu', SelfPost, {'Reston, VA Some info regarding shelters in the area..txt'}) + ('jiecu', SelfPost, {'Reston, VA Some info regarding shelters in the area..txt'}), + ('gui1i', SelfPost, {'The "Beer and Ear offer for those who need help in the \'burbs of North Dallas....txt'}), )) def test_name_submission( test_reddit_id: str, From e068c9ce563de95c5ae8a6d4f8b758288be72dce Mon Sep 17 00:00:00 2001 From: Jacob Chapman <7908073+chapmanjacobd@users.noreply.github.com> Date: Mon, 18 Apr 2022 16:03:24 -0500 Subject: [PATCH 043/110] readme: make --search info a bit more clear --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 47c0dde..28c0b8a 100644 --- a/README.md +++ b/README.md @@ -98,8 +98,8 @@ The following options are common between both the `archive` and `download` comma - This option will make the BDFR use the supplied user's saved posts list as a download source - This requires an authenticated Reddit instance, using the `--authenticate` flag, as well as `--user` set to `me` - `--search` - - This will apply the specified search term to specific lists when scraping submissions - - A search term can only be applied to subreddits and multireddits, supplied with the `- s` and `-m` flags respectively + - This will apply the input search term to specific lists when scraping submissions + - A search term can only be applied when using the `--subreddit` and `--multireddit` flags - `--submitted` - This will use a user's submissions as a source - A user must be specified with `--user` From ad172841e2f52fe5c8b98af57ecdb1f607ac7fe2 Mon Sep 17 00:00:00 2001 From: Jacob Chapman <7908073+chapmanjacobd@users.noreply.github.com> Date: Mon, 18 Apr 2022 00:30:17 -0500 Subject: [PATCH 044/110] Use stdout --- scripts/extract_failed_ids.sh | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/scripts/extract_failed_ids.sh b/scripts/extract_failed_ids.sh index 
8addf7e..64d1e72 100755 --- a/scripts/extract_failed_ids.sh +++ b/scripts/extract_failed_ids.sh @@ -7,17 +7,10 @@ else exit 1 fi -if [ -n "$2" ]; then - output="$2" - echo "Outputting IDs to $output" -else - output="./failed.txt" -fi - { grep 'Could not download submission' "$file" | awk '{ print $12 }' | rev | cut -c 2- | rev ; grep 'Failed to download resource' "$file" | awk '{ print $15 }' ; grep 'failed to download submission' "$file" | awk '{ print $14 }' | rev | cut -c 2- | rev ; grep 'Failed to write file' "$file" | awk '{ print $14 }' ; grep 'skipped due to disabled module' "$file" | awk '{ print $9 }' ; -} >>"$output" +} From e4a44f1e252f684455a97447f17322d1f34cae72 Mon Sep 17 00:00:00 2001 From: Jacob Chapman <7908073+chapmanjacobd@users.noreply.github.com> Date: Mon, 18 Apr 2022 20:42:50 -0500 Subject: [PATCH 045/110] okay --- scripts/tests/test_extract_failed_ids.bats | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/scripts/tests/test_extract_failed_ids.bats b/scripts/tests/test_extract_failed_ids.bats index 75b9bff..a716cba 100644 --- a/scripts/tests/test_extract_failed_ids.bats +++ b/scripts/tests/test_extract_failed_ids.bats @@ -13,31 +13,31 @@ teardown() { } @test "fail no downloader module" { - run ../extract_failed_ids.sh ./example_logfiles/failed_no_downloader.txt + run ../extract_failed_ids.sh ./example_logfiles/failed_no_downloader.txt >> failed.txt assert [ "$( wc -l 'failed.txt' | awk '{ print $1 }' )" -eq "3" ]; assert [ "$( grep -Ecv '\w{6,7}' 'failed.txt' )" -eq "0" ]; } @test "fail resource error" { - run ../extract_failed_ids.sh ./example_logfiles/failed_resource_error.txt + run ../extract_failed_ids.sh ./example_logfiles/failed_resource_error.txt >> failed.txt assert [ "$( wc -l 'failed.txt' | awk '{ print $1 }' )" -eq "1" ]; assert [ "$( grep -Ecv '\w{6,7}' 'failed.txt' )" -eq "0" ]; } @test "fail site downloader error" { - run ../extract_failed_ids.sh ./example_logfiles/failed_sitedownloader_error.txt + run 
../extract_failed_ids.sh ./example_logfiles/failed_sitedownloader_error.txt >> failed.txt assert [ "$( wc -l 'failed.txt' | awk '{ print $1 }' )" -eq "2" ]; assert [ "$( grep -Ecv '\w{6,7}' 'failed.txt' )" -eq "0" ]; } @test "fail failed file write" { - run ../extract_failed_ids.sh ./example_logfiles/failed_write_error.txt + run ../extract_failed_ids.sh ./example_logfiles/failed_write_error.txt >> failed.txt assert [ "$( wc -l 'failed.txt' | awk '{ print $1 }' )" -eq "1" ]; assert [ "$( grep -Ecv '\w{6,7}' 'failed.txt' )" -eq "0" ]; } @test "fail disabled module" { - run ../extract_failed_ids.sh ./example_logfiles/failed_disabled_module.txt + run ../extract_failed_ids.sh ./example_logfiles/failed_disabled_module.txt >> failed.txt assert [ "$( wc -l 'failed.txt' | awk '{ print $1 }' )" -eq "1" ]; assert [ "$( grep -Ecv '\w{6,7}' 'failed.txt' )" -eq "0" ]; } From eb8f9d5876437ce28be8be184895874ebef14e3c Mon Sep 17 00:00:00 2001 From: Jacob Chapman <7908073+chapmanjacobd@users.noreply.github.com> Date: Mon, 18 Apr 2022 20:47:35 -0500 Subject: [PATCH 046/110] oh there's another one --- scripts/extract_successful_ids.sh | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/scripts/extract_successful_ids.sh b/scripts/extract_successful_ids.sh index 011ba6c..e8f482e 100755 --- a/scripts/extract_successful_ids.sh +++ b/scripts/extract_successful_ids.sh @@ -7,17 +7,10 @@ else exit 1 fi -if [ -n "$2" ]; then - output="$2" - echo "Outputting IDs to $output" -else - output="./successful.txt" -fi - { grep 'Downloaded submission' "$file" | awk '{ print $(NF-2) }' ; grep 'Resource hash' "$file" | awk '{ print $(NF-2) }' ; grep 'Download filter' "$file" | awk '{ print $(NF-3) }' ; grep 'already exists, continuing' "$file" | awk '{ print $(NF-3) }' ; grep 'Hard link made' "$file" | awk '{ print $(NF) }' ; -} >> "$output" +} From efea01e56f112e1c46645813296371c3c6e020a0 Mon Sep 17 00:00:00 2001 From: Jacob Chapman <7908073+chapmanjacobd@users.noreply.github.com> 
Date: Mon, 18 Apr 2022 20:48:02 -0500 Subject: [PATCH 047/110] and this one too --- scripts/tests/test_extract_successful_ids.bats | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/scripts/tests/test_extract_successful_ids.bats b/scripts/tests/test_extract_successful_ids.bats index 364bedb..caa8dd1 100644 --- a/scripts/tests/test_extract_successful_ids.bats +++ b/scripts/tests/test_extract_successful_ids.bats @@ -8,31 +8,31 @@ teardown() { } @test "success downloaded submission" { - run ../extract_successful_ids.sh ./example_logfiles/succeed_downloaded_submission.txt + run ../extract_successful_ids.sh ./example_logfiles/succeed_downloaded_submission.txt >> ./successful.txt assert [ "$( wc -l 'successful.txt' | awk '{ print $1 }' )" -eq "7" ]; assert [ "$( grep -Ecv '\w{6,7}' 'successful.txt' )" -eq "0" ]; } @test "success resource hash" { - run ../extract_successful_ids.sh ./example_logfiles/succeed_resource_hash.txt + run ../extract_successful_ids.sh ./example_logfiles/succeed_resource_hash.txt >> ./successful.txt assert [ "$( wc -l 'successful.txt' | awk '{ print $1 }' )" -eq "1" ]; assert [ "$( grep -Ecv '\w{6,7}' 'successful.txt' )" -eq "0" ]; } @test "success download filter" { - run ../extract_successful_ids.sh ./example_logfiles/succeed_download_filter.txt + run ../extract_successful_ids.sh ./example_logfiles/succeed_download_filter.txt >> ./successful.txt assert [ "$( wc -l 'successful.txt' | awk '{ print $1 }' )" -eq "3" ]; assert [ "$( grep -Ecv '\w{6,7}' 'successful.txt' )" -eq "0" ]; } @test "success already exists" { - run ../extract_successful_ids.sh ./example_logfiles/succeed_already_exists.txt + run ../extract_successful_ids.sh ./example_logfiles/succeed_already_exists.txt >> ./successful.txt assert [ "$( wc -l 'successful.txt' | awk '{ print $1 }' )" -eq "3" ]; assert [ "$( grep -Ecv '\w{6,7}' 'successful.txt' )" -eq "0" ]; } @test "success hard link" { - run ../extract_successful_ids.sh 
./example_logfiles/succeed_hard_link.txt + run ../extract_successful_ids.sh ./example_logfiles/succeed_hard_link.txt >> ./successful.txt assert [ "$( wc -l 'successful.txt' | awk '{ print $1 }' )" -eq "1" ]; assert [ "$( grep -Ecv '\w{6,7}' 'successful.txt' )" -eq "0" ]; } From decb13b5db45624ff7aaf24e004a56445a7cfcc8 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sun, 8 May 2022 12:18:31 +1000 Subject: [PATCH 048/110] Replace old Vidble test cases --- tests/site_downloaders/test_vidble.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/site_downloaders/test_vidble.py b/tests/site_downloaders/test_vidble.py index 0c5ebb2..50ca808 100644 --- a/tests/site_downloaders/test_vidble.py +++ b/tests/site_downloaders/test_vidble.py @@ -30,8 +30,8 @@ def test_change_med_url(test_url: str, expected: str): 'https://www.vidble.com/VWuNsnLJMD.jpg', 'https://www.vidble.com/sMmM8O650W.jpg', }), - ('https://vidble.com/watch?v=0q4nWakqM6kzQWxlePD8N62Dsflev0N9', { - 'https://www.vidble.com/0q4nWakqM6kzQWxlePD8N62Dsflev0N9.mp4', + ('https://www.vidble.com/watch?v=joC6b7cgs2Tnucx7dhDoyqKPbr7TQUA5', { + 'https://www.vidble.com/joC6b7cgs2Tnucx7dhDoyqKPbr7TQUA5.mp4', }), ('https://www.vidble.com/pHuwWkOcEb', { 'https://www.vidble.com/pHuwWkOcEb.jpg', @@ -55,8 +55,8 @@ def test_get_links(test_url: str, expected: set[str]): 'b31a942cd8cdda218ed547bbc04c3a27', '6f77c570b451eef4222804bd52267481', }), - ('https://vidble.com/watch?v=0q4nWakqM6kzQWxlePD8N62Dsflev0N9', { - 'cebe9d5f24dba3b0443e5097f160ca83', + ('https://www.vidble.com/watch?v=joC6b7cgs2Tnucx7dhDoyqKPbr7TQUA5', { + 'ec5f7a7f74a4dd55c740cbfd4d3bf9ab', }), ('https://www.vidble.com/pHuwWkOcEb', { '585f486dd0b2f23a57bddbd5bf185bc7', From 7d4916919d15aef08ca9b545ac65689f0f1b08fb Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Wed, 6 Jul 2022 15:04:05 +1000 Subject: [PATCH 049/110] Add test case --- .../site_downloaders/fallback_downloaders/test_ytdlp_fallback.py | 1 + 1 file changed, 1 insertion(+) diff 
--git a/tests/site_downloaders/fallback_downloaders/test_ytdlp_fallback.py b/tests/site_downloaders/fallback_downloaders/test_ytdlp_fallback.py index 503f14e..c600f6f 100644 --- a/tests/site_downloaders/fallback_downloaders/test_ytdlp_fallback.py +++ b/tests/site_downloaders/fallback_downloaders/test_ytdlp_fallback.py @@ -15,6 +15,7 @@ from bdfr.site_downloaders.fallback_downloaders.ytdlp_fallback import YtdlpFallb ('https://www.youtube.com/watch?v=P19nvJOmqCc', True), ('https://www.example.com/test', False), ('https://milesmatrix.bandcamp.com/album/la-boum/', False), + ('https://v.redd.it/54i8fvzev3u81', True), )) def test_can_handle_link(test_url: str, expected: bool): result = YtdlpFallback.can_handle_link(test_url) From 2d365b612b28c1efb637dda95a73266d10633170 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Wed, 6 Jul 2022 16:50:02 +1000 Subject: [PATCH 050/110] Fix some test cases --- tests/integration_tests/test_download_integration.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tests/integration_tests/test_download_integration.py b/tests/integration_tests/test_download_integration.py index 75216dd..ffae0d4 100644 --- a/tests/integration_tests/test_download_integration.py +++ b/tests/integration_tests/test_download_integration.py @@ -97,7 +97,6 @@ def test_cli_download_user_specific_subreddits(test_args: list[str], tmp_path: P ['-l', 'm2601g'], ['-l', 'https://www.reddit.com/r/TrollXChromosomes/comments/m2601g/its_a_step_in_the_right_direction/'], ['-l', 'm3hxzd'], # Really long title used to overflow filename limit - ['-l', 'm3kua3'], # Has a deleted user ['-l', 'm5bqkf'], # Resource leading to a 404 )) def test_cli_download_links(test_args: list[str], tmp_path: Path): @@ -313,9 +312,8 @@ def test_cli_download_file_scheme_warning(test_args: list[str], tmp_path: Path): @pytest.mark.reddit @pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests') 
@pytest.mark.parametrize('test_args', ( - ['-l', 'm2601g', '--disable-module', 'Direct'], - ['-l', 'nnb9vs', '--disable-module', 'YoutubeDlFallback'], - ['-l', 'nnb9vs', '--disable-module', 'youtubedlfallback'], + ['-l', 'm2601g', '--disable-module', 'SelfPost'], + ['-l', 'nnb9vs', '--disable-module', 'YtdlpFallback'], )) def test_cli_download_disable_modules(test_args: list[str], tmp_path: Path): runner = CliRunner() From 8c59329ffab9ecf977fc978650350158a054ea2f Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Wed, 6 Jul 2022 16:52:01 +1000 Subject: [PATCH 051/110] Add exclusion options to archiver --- bdfr/__main__.py | 4 ++-- bdfr/archiver.py | 3 +++ bdfr/configuration.py | 2 +- .../integration_tests/test_archive_integration.py | 15 +++++++++++++++ 4 files changed, 21 insertions(+), 3 deletions(-) diff --git a/bdfr/__main__.py b/bdfr/__main__.py index 451a7f0..45450ed 100644 --- a/bdfr/__main__.py +++ b/bdfr/__main__.py @@ -17,6 +17,8 @@ _common_options = [ click.option('--authenticate', is_flag=True, default=None), click.option('--config', type=str, default=None), click.option('--disable-module', multiple=True, default=None, type=str), + click.option('--exclude-id', default=None, multiple=True), + click.option('--exclude-id-file', default=None, multiple=True), click.option('--file-scheme', default=None, type=str), click.option('--folder-scheme', default=None, type=str), click.option('--ignore-user', type=str, multiple=True, default=None), @@ -44,8 +46,6 @@ _downloader_options = [ click.option('--max-wait-time', type=int, default=None), click.option('--no-dupes', is_flag=True, default=None), click.option('--search-existing', is_flag=True, default=None), - click.option('--exclude-id', default=None, multiple=True), - click.option('--exclude-id-file', default=None, multiple=True), click.option('--skip', default=None, multiple=True), click.option('--skip-domain', default=None, multiple=True), click.option('--skip-subreddit', default=None, multiple=True), diff --git 
a/bdfr/archiver.py b/bdfr/archiver.py index 559dcc1..214111f 100644 --- a/bdfr/archiver.py +++ b/bdfr/archiver.py @@ -34,6 +34,9 @@ class Archiver(RedditConnector): f'Submission {submission.id} in {submission.subreddit.display_name} skipped' f' due to {submission.author.name if submission.author else "DELETED"} being an ignored user') continue + if submission.id in self.excluded_submission_ids: + logger.debug(f'Object {submission.id} in exclusion list, skipping') + continue logger.debug(f'Attempting to archive submission {submission.id}') self.write_entry(submission) diff --git a/bdfr/configuration.py b/bdfr/configuration.py index ef24e36..c8fb323 100644 --- a/bdfr/configuration.py +++ b/bdfr/configuration.py @@ -35,7 +35,7 @@ class Configuration(Namespace): self.skip_subreddit: list[str] = [] self.sort: str = 'hot' self.submitted: bool = False - self.subscribed: bool = True + self.subscribed: bool = False self.subreddit: list[str] = [] self.time: str = 'all' self.time_format = None diff --git a/tests/integration_tests/test_archive_integration.py b/tests/integration_tests/test_archive_integration.py index 2234c5a..744e343 100644 --- a/tests/integration_tests/test_archive_integration.py +++ b/tests/integration_tests/test_archive_integration.py @@ -136,3 +136,18 @@ def test_cli_archive_file_format(test_args: list[str], tmp_path: Path): assert result.exit_code == 0 assert 'Attempting to archive submission' in result.output assert re.search('format at /.+?/Judge says Trump and two adult', result.output) + + +@pytest.mark.online +@pytest.mark.reddit +@pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests') +@pytest.mark.parametrize('test_args', ( + ['-l', 'm2601g', '--exclude-id', 'm2601g'], +)) +def test_cli_archive_links_exclusion(test_args: list[str], tmp_path: Path): + runner = CliRunner() + test_args = create_basic_args_for_archive_runner(test_args, tmp_path) + result = runner.invoke(cli, test_args) + assert 
result.exit_code == 0 + assert 'in exclusion list' in result.output + assert 'Attempting to archive' not in result.output From 3fd5bad4070c35ce60c35ab73229108e06f5afdb Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Thu, 7 Jul 2022 11:39:42 +1000 Subject: [PATCH 052/110] Update some test hashes --- .../fallback_downloaders/test_ytdlp_fallback.py | 4 ++-- tests/site_downloaders/test_youtube.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/site_downloaders/fallback_downloaders/test_ytdlp_fallback.py b/tests/site_downloaders/fallback_downloaders/test_ytdlp_fallback.py index c600f6f..23c95f9 100644 --- a/tests/site_downloaders/fallback_downloaders/test_ytdlp_fallback.py +++ b/tests/site_downloaders/fallback_downloaders/test_ytdlp_fallback.py @@ -36,8 +36,8 @@ def test_info_extraction_bad(test_url: str): @pytest.mark.parametrize(('test_url', 'expected_hash'), ( ('https://streamable.com/dt46y', 'b7e465adaade5f2b6d8c2b4b7d0a2878'), ('https://streamable.com/t8sem', '49b2d1220c485455548f1edbc05d4ecf'), - ('https://www.reddit.com/r/specializedtools/comments/n2nw5m/bamboo_splitter/', '0a406f3d2e09b3d3ba43bf97185b83e3'), - ('https://v.redd.it/9z1dnk3xr5k61', 'c7765c33972549465c87dcbd59eb3d5d'), + ('https://www.reddit.com/r/specializedtools/comments/n2nw5m/bamboo_splitter/', '49316899440ea1c3b74d5640d9d527c1'), + ('https://v.redd.it/9z1dnk3xr5k61', '76d5e6d7f4f9e1910c6c22b54dfa804f'), )) def test_find_resources(test_url: str, expected_hash: str): test_submission = MagicMock() diff --git a/tests/site_downloaders/test_youtube.py b/tests/site_downloaders/test_youtube.py index 684eb20..ce1abb8 100644 --- a/tests/site_downloaders/test_youtube.py +++ b/tests/site_downloaders/test_youtube.py @@ -15,7 +15,7 @@ from bdfr.site_downloaders.youtube import Youtube @pytest.mark.parametrize(('test_url', 'expected_hash'), ( ('https://www.youtube.com/watch?v=uSm2VDgRIUs', '2d60b54582df5b95ec72bb00b580d2ff'), ('https://www.youtube.com/watch?v=GcI7nxQj7HA', 
'5db0fc92a0a7fb9ac91e63505eea9cf0'), - ('https://youtu.be/TMqPOlp4tNo', 'f68c00b018162857f3df4844c45302e7'), # Age restricted + ('https://youtu.be/TMqPOlp4tNo', 'ceb4c2cb1a9bf79617623b2aa57e18fd'), # Age restricted )) def test_find_resources_good(test_url: str, expected_hash: str): test_submission = MagicMock() From 7315afeafd93f91934fa381dba8dcc1653f6fe1b Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Thu, 7 Jul 2022 12:07:53 +1000 Subject: [PATCH 053/110] Update test parameter --- tests/test_file_name_formatter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_file_name_formatter.py b/tests/test_file_name_formatter.py index c9e049e..9c0a8bb 100644 --- a/tests/test_file_name_formatter.py +++ b/tests/test_file_name_formatter.py @@ -407,7 +407,7 @@ def test_windows_max_path(tmp_path: Path): @pytest.mark.parametrize(('test_reddit_id', 'test_downloader', 'expected_names'), ( ('gphmnr', YtdlpFallback, {'He has a lot to say today.mp4'}), ('d0oir2', YtdlpFallback, {"Crunk's finest moment. 
Welcome to the new subreddit!.mp4"}), - ('jiecu', SelfPost, {'Reston, VA Some info regarding shelters in the area..txt'}), + ('jiecu', SelfPost, {'[deleted by user].txt'}), ('gui1i', SelfPost, {'The "Beer and Ear offer for those who need help in the \'burbs of North Dallas....txt'}), )) def test_name_submission( From 4f876eecbc23360d6d6555787efc528a02dc9ad8 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Fri, 15 Jul 2022 14:22:55 +1000 Subject: [PATCH 054/110] Remove bugged test case --- tests/archive_entry/test_submission_archive_entry.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/archive_entry/test_submission_archive_entry.py b/tests/archive_entry/test_submission_archive_entry.py index 60f47b5..045eabd 100644 --- a/tests/archive_entry/test_submission_archive_entry.py +++ b/tests/archive_entry/test_submission_archive_entry.py @@ -34,7 +34,7 @@ def test_get_comments(test_submission_id: str, min_comments: int, reddit_instanc 'created_utc': 1615583837, 'permalink': '/r/australia/comments/m3reby/this_little_guy_fell_out_of_a_tree_and_in_front/' }), - ('m3kua3', {'author': 'DELETED'}), + # TODO: add deleted user test case )) def test_get_post_details(test_submission_id: str, expected_dict: dict, reddit_instance: praw.Reddit): test_submission = reddit_instance.submission(id=test_submission_id) From 7d4eb47643f5f357e6361e05cd76464f992410aa Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Fri, 15 Jul 2022 15:05:07 +1000 Subject: [PATCH 055/110] Rename class --- bdfr/site_downloaders/download_factory.py | 4 ++-- bdfr/site_downloaders/vreddit.py | 2 +- tests/site_downloaders/test_download_factory.py | 6 +++--- tests/site_downloaders/test_vreddit.py | 6 +++--- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/bdfr/site_downloaders/download_factory.py b/bdfr/site_downloaders/download_factory.py index f6f4895..96e9a42 100644 --- a/bdfr/site_downloaders/download_factory.py +++ b/bdfr/site_downloaders/download_factory.py @@ -17,8 +17,8 @@ 
from bdfr.site_downloaders.pornhub import PornHub from bdfr.site_downloaders.redgifs import Redgifs from bdfr.site_downloaders.self_post import SelfPost from bdfr.site_downloaders.vidble import Vidble +from bdfr.site_downloaders.vreddit import VReddit from bdfr.site_downloaders.youtube import Youtube -from bdfr.site_downloaders.vreddit import Vreddit class DownloadFactory: @@ -49,7 +49,7 @@ class DownloadFactory: elif re.match(r'i\.redd\.it.*', sanitised_url): return Direct elif re.match(r'v\.redd\.it.*', sanitised_url): - return Vreddit + return VReddit elif re.match(r'pornhub\.com.*', sanitised_url): return PornHub elif re.match(r'vidble\.com', sanitised_url): diff --git a/bdfr/site_downloaders/vreddit.py b/bdfr/site_downloaders/vreddit.py index b1117a0..fda6dac 100644 --- a/bdfr/site_downloaders/vreddit.py +++ b/bdfr/site_downloaders/vreddit.py @@ -16,7 +16,7 @@ from bdfr.site_downloaders.base_downloader import BaseDownloader logger = logging.getLogger(__name__) -class Vreddit(BaseDownloader): +class VReddit(BaseDownloader): def __init__(self, post: Submission): super().__init__(post) diff --git a/tests/site_downloaders/test_download_factory.py b/tests/site_downloaders/test_download_factory.py index 30e19f1..dcb5303 100644 --- a/tests/site_downloaders/test_download_factory.py +++ b/tests/site_downloaders/test_download_factory.py @@ -16,8 +16,8 @@ from bdfr.site_downloaders.imgur import Imgur from bdfr.site_downloaders.pornhub import PornHub from bdfr.site_downloaders.redgifs import Redgifs from bdfr.site_downloaders.self_post import SelfPost +from bdfr.site_downloaders.vreddit import VReddit from bdfr.site_downloaders.youtube import Youtube -from bdfr.site_downloaders.vreddit import Vreddit @pytest.mark.online @@ -43,13 +43,13 @@ from bdfr.site_downloaders.vreddit import Vreddit ('https://i.imgur.com/3SKrQfK.jpg?1', Direct), ('https://dynasty-scans.com/system/images_images/000/017/819/original/80215103_p0.png?1612232781', Direct), 
('https://m.imgur.com/a/py3RW0j', Imgur), - ('https://v.redd.it/9z1dnk3xr5k61', Vreddit), + ('https://v.redd.it/9z1dnk3xr5k61', VReddit), ('https://streamable.com/dt46y', YtdlpFallback), ('https://vimeo.com/channels/31259/53576664', YtdlpFallback), ('http://video.pbs.org/viralplayer/2365173446/', YtdlpFallback), ('https://www.pornhub.com/view_video.php?viewkey=ph5a2ee0461a8d0', PornHub), ('https://www.patreon.com/posts/minecart-track-59346560', Gallery), - ('https://v.redd.it/9z1dnk3xr5k61', Vreddit) + ('https://v.redd.it/9z1dnk3xr5k61', VReddit) )) def test_factory_lever_good(test_submission_url: str, expected_class: BaseDownloader, reddit_instance: praw.Reddit): result = DownloadFactory.pull_lever(test_submission_url) diff --git a/tests/site_downloaders/test_vreddit.py b/tests/site_downloaders/test_vreddit.py index 65428b5..f1f8219 100644 --- a/tests/site_downloaders/test_vreddit.py +++ b/tests/site_downloaders/test_vreddit.py @@ -7,7 +7,7 @@ import pytest from bdfr.exceptions import NotADownloadableLinkError from bdfr.resource import Resource -from bdfr.site_downloaders.vreddit import Vreddit +from bdfr.site_downloaders.vreddit import VReddit @pytest.mark.online @@ -18,7 +18,7 @@ from bdfr.site_downloaders.vreddit import Vreddit def test_find_resources_good(test_url: str, expected_hash: str): test_submission = MagicMock() test_submission.url = test_url - downloader = Vreddit(test_submission) + downloader = VReddit(test_submission) resources = downloader.find_resources() assert len(resources) == 1 assert isinstance(resources[0], Resource) @@ -34,6 +34,6 @@ def test_find_resources_good(test_url: str, expected_hash: str): def test_find_resources_bad(test_url: str): test_submission = MagicMock() test_submission.url = test_url - downloader = Vreddit(test_submission) + downloader = VReddit(test_submission) with pytest.raises(NotADownloadableLinkError): downloader.find_resources() From 92779033087e2e0f6674829a78a97d10e0cdaaaf Mon Sep 17 00:00:00 2001 From: Serene-Arc 
Date: Fri, 15 Jul 2022 15:12:39 +1000 Subject: [PATCH 056/110] Base VReddit class off of Youtube class --- bdfr/site_downloaders/vreddit.py | 41 ++++---------------------------- bdfr/site_downloaders/youtube.py | 7 +++++- 2 files changed, 10 insertions(+), 38 deletions(-) diff --git a/bdfr/site_downloaders/vreddit.py b/bdfr/site_downloaders/vreddit.py index fda6dac..ad526b4 100644 --- a/bdfr/site_downloaders/vreddit.py +++ b/bdfr/site_downloaders/vreddit.py @@ -11,12 +11,12 @@ from praw.models import Submission from bdfr.exceptions import NotADownloadableLinkError, SiteDownloaderError from bdfr.resource import Resource from bdfr.site_authenticator import SiteAuthenticator -from bdfr.site_downloaders.base_downloader import BaseDownloader +from bdfr.site_downloaders.youtube import Youtube logger = logging.getLogger(__name__) -class VReddit(BaseDownloader): +class VReddit(Youtube): def __init__(self, post: Submission): super().__init__(post) @@ -30,47 +30,14 @@ class VReddit(BaseDownloader): res = Resource(self.post, self.post.url, download_function, extension) return [res] - def _download_video(self, ytdl_options: dict) -> Callable: - yt_logger = logging.getLogger('youtube-dl') - yt_logger.setLevel(logging.CRITICAL) - ytdl_options['quiet'] = True - ytdl_options['logger'] = yt_logger - - def download(_: dict) -> bytes: - with tempfile.TemporaryDirectory() as temp_dir: - download_path = Path(temp_dir).resolve() - ytdl_options['outtmpl'] = str(download_path) + '/' + 'test.%(ext)s' - try: - with yt_dlp.YoutubeDL(ytdl_options) as ydl: - ydl.download([self.post.url]) - except yt_dlp.DownloadError as e: - raise SiteDownloaderError(f'Vreddit download failed: {e}') - - downloaded_files = list(download_path.iterdir()) - if len(downloaded_files) > 0: - downloaded_file = downloaded_files[0] - else: - raise NotADownloadableLinkError(f"No media exists in the URL {self.post.url}") - with open(downloaded_file, 'rb') as file: - content = file.read() - return content - return download 
- @staticmethod def get_video_attributes(url: str) -> dict: - yt_logger = logging.getLogger('youtube-dl') - yt_logger.setLevel(logging.CRITICAL) - with yt_dlp.YoutubeDL({'logger': yt_logger, }) as ydl: - try: - result = ydl.extract_info(url, download=False) - except Exception as e: - logger.exception(e) - raise NotADownloadableLinkError(f'Video info extraction failed for {url}') + result = VReddit.get_video_data(url) if 'ext' in result: return result else: try: - result = (result["entries"][0]) + result = result["entries"][0] return result except Exception as e: logger.exception(e) diff --git a/bdfr/site_downloaders/youtube.py b/bdfr/site_downloaders/youtube.py index f18f405..70c35ae 100644 --- a/bdfr/site_downloaders/youtube.py +++ b/bdfr/site_downloaders/youtube.py @@ -58,7 +58,7 @@ class Youtube(BaseDownloader): return download @staticmethod - def get_video_attributes(url: str) -> dict: + def get_video_data(url: str) -> dict: yt_logger = logging.getLogger('youtube-dl') yt_logger.setLevel(logging.CRITICAL) with yt_dlp.YoutubeDL({'logger': yt_logger, }) as ydl: @@ -67,6 +67,11 @@ class Youtube(BaseDownloader): except Exception as e: logger.exception(e) raise NotADownloadableLinkError(f'Video info extraction failed for {url}') + return result + + @staticmethod + def get_video_attributes(url: str) -> dict: + result = Youtube.get_video_data(url) if 'ext' in result: return result else: From 86e451d49e7e48a89a30b6209174a0a3c435124e Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Fri, 15 Jul 2022 15:18:14 +1000 Subject: [PATCH 057/110] Fix test case --- .../fallback_downloaders/test_ytdlp_fallback.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/site_downloaders/fallback_downloaders/test_ytdlp_fallback.py b/tests/site_downloaders/fallback_downloaders/test_ytdlp_fallback.py index 23c95f9..9aeca98 100644 --- a/tests/site_downloaders/fallback_downloaders/test_ytdlp_fallback.py +++ 
b/tests/site_downloaders/fallback_downloaders/test_ytdlp_fallback.py @@ -15,7 +15,7 @@ from bdfr.site_downloaders.fallback_downloaders.ytdlp_fallback import YtdlpFallb ('https://www.youtube.com/watch?v=P19nvJOmqCc', True), ('https://www.example.com/test', False), ('https://milesmatrix.bandcamp.com/album/la-boum/', False), - ('https://v.redd.it/54i8fvzev3u81', True), + ('https://v.redd.it/54i8fvzev3u81', False), )) def test_can_handle_link(test_url: str, expected: bool): result = YtdlpFallback.can_handle_link(test_url) From 1157c31be1d15bf1883ec8e5700dc6a04eab89dc Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Fri, 15 Jul 2022 15:47:49 +1000 Subject: [PATCH 058/110] Remove bad test case --- tests/test_file_name_formatter.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_file_name_formatter.py b/tests/test_file_name_formatter.py index 9c0a8bb..21cb8a6 100644 --- a/tests/test_file_name_formatter.py +++ b/tests/test_file_name_formatter.py @@ -408,7 +408,6 @@ def test_windows_max_path(tmp_path: Path): ('gphmnr', YtdlpFallback, {'He has a lot to say today.mp4'}), ('d0oir2', YtdlpFallback, {"Crunk's finest moment. 
Welcome to the new subreddit!.mp4"}), ('jiecu', SelfPost, {'[deleted by user].txt'}), - ('gui1i', SelfPost, {'The "Beer and Ear offer for those who need help in the \'burbs of North Dallas....txt'}), )) def test_name_submission( test_reddit_id: str, From febad9c06c3b8b40e0cbb765a7559593ba5e2632 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Fri, 15 Jul 2022 15:47:49 +1000 Subject: [PATCH 059/110] Remove bad test case --- tests/test_file_name_formatter.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_file_name_formatter.py b/tests/test_file_name_formatter.py index 9c0a8bb..21cb8a6 100644 --- a/tests/test_file_name_formatter.py +++ b/tests/test_file_name_formatter.py @@ -408,7 +408,6 @@ def test_windows_max_path(tmp_path: Path): ('gphmnr', YtdlpFallback, {'He has a lot to say today.mp4'}), ('d0oir2', YtdlpFallback, {"Crunk's finest moment. Welcome to the new subreddit!.mp4"}), ('jiecu', SelfPost, {'[deleted by user].txt'}), - ('gui1i', SelfPost, {'The "Beer and Ear offer for those who need help in the \'burbs of North Dallas....txt'}), )) def test_name_submission( test_reddit_id: str, From 59e57cee840886e8af43e50ab74fa48fdc137a41 Mon Sep 17 00:00:00 2001 From: Serene <33189705+Serene-Arc@users.noreply.github.com> Date: Sat, 16 Jul 2022 13:13:23 +1000 Subject: [PATCH 060/110] Create protect_master.yml --- .github/workflows/protect_master.yml | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 .github/workflows/protect_master.yml diff --git a/.github/workflows/protect_master.yml b/.github/workflows/protect_master.yml new file mode 100644 index 0000000..6267b77 --- /dev/null +++ b/.github/workflows/protect_master.yml @@ -0,0 +1,13 @@ +name: Protect master branch + +on: + pull_request: + branches: + - master +jobs: + merge_check: + runs-on: ubuntu-latest + steps: + - name: Check if the pull request is mergeable to master + run: | + if [[ "$GITHUB_HEAD_REF" == 'development' && "$GITHUB_REPOSITORY" == 
'aliparlakci/bulk-downloader-for-reddit' ]]; then exit 0; else exit 1; fi; From 7100291ed90d617ceb7bf8d627fce65f4c032e97 Mon Sep 17 00:00:00 2001 From: Jacob Chapman <7908073+chapmanjacobd@users.noreply.github.com> Date: Sat, 16 Jul 2022 10:38:34 -0500 Subject: [PATCH 061/110] forgot comma --- tests/test_connector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_connector.py b/tests/test_connector.py index 4a2d461..e928000 100644 --- a/tests/test_connector.py +++ b/tests/test_connector.py @@ -447,7 +447,7 @@ def test_check_user_existence_banned( @pytest.mark.reddit @pytest.mark.parametrize(('test_subreddit_name', 'expected_message'), ( ('donaldtrump', 'cannot be found'), - ('submitters', 'private and cannot be scraped') + ('submitters', 'private and cannot be scraped'), ('lhnhfkuhwreolo', 'does not exist') )) def test_check_subreddit_status_bad(test_subreddit_name: str, expected_message: str, reddit_instance: praw.Reddit): From 798ed728f540421973f7d8cb1f572e4d8d57cf9a Mon Sep 17 00:00:00 2001 From: Piotr Migdal Date: Sun, 27 Mar 2022 20:29:05 +0200 Subject: [PATCH 062/110] yaml for options --- bdfr/__main__.py | 1 + bdfr/configuration.py | 14 ++++++++++++-- opts_example.yaml | 9 +++++++++ 3 files changed, 22 insertions(+), 2 deletions(-) create mode 100644 opts_example.yaml diff --git a/bdfr/__main__.py b/bdfr/__main__.py index 45450ed..3b2472a 100644 --- a/bdfr/__main__.py +++ b/bdfr/__main__.py @@ -16,6 +16,7 @@ _common_options = [ click.argument('directory', type=str), click.option('--authenticate', is_flag=True, default=None), click.option('--config', type=str, default=None), + click.option('--opts', type=str, default=None), click.option('--disable-module', multiple=True, default=None, type=str), click.option('--exclude-id', default=None, multiple=True), click.option('--exclude-id-file', default=None, multiple=True), diff --git a/bdfr/configuration.py b/bdfr/configuration.py index c8fb323..8b04722 100644 --- 
a/bdfr/configuration.py +++ b/bdfr/configuration.py @@ -5,6 +5,7 @@ from argparse import Namespace from typing import Optional import click +import yaml class Configuration(Namespace): @@ -12,6 +13,7 @@ class Configuration(Namespace): super(Configuration, self).__init__() self.authenticate = False self.config = None + self.opts: Optional[str] = None self.directory: str = '.' self.disable_module: list[str] = [] self.exclude_id = [] @@ -49,6 +51,14 @@ class Configuration(Namespace): self.comment_context: bool = False def process_click_arguments(self, context: click.Context): + if context.params['opts'] is not None: + with open(context.params['opts']) as f: + opts = yaml.load(f, Loader=yaml.FullLoader) + for arg_key, v in opts.items(): + vars(self)[arg_key] = v for arg_key in context.params.keys(): - if arg_key in vars(self) and context.params[arg_key] is not None: - vars(self)[arg_key] = context.params[arg_key] + if arg_key not in vars(self): + continue + if context.params[arg_key] is None or context.params[arg_key] == (): + continue + vars(self)[arg_key] = context.params[arg_key] diff --git a/opts_example.yaml b/opts_example.yaml new file mode 100644 index 0000000..91952e4 --- /dev/null +++ b/opts_example.yaml @@ -0,0 +1,9 @@ +skip: [mp4, avi, mov] +file-scheme: "{UPVOTES}_{REDDITOR}_{POSTID}_{DATE}" +limit: 10 +sort: top +time: all +no-dupes: true +subreddit: + - EarthPorn + - CityPorn From ef82387f84456751ea5f49e2c9f8039921ce1190 Mon Sep 17 00:00:00 2001 From: Piotr Migdal Date: Sun, 27 Mar 2022 20:49:28 +0200 Subject: [PATCH 063/110] underscores in YAML --- opts_example.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/opts_example.yaml b/opts_example.yaml index 91952e4..22fca7d 100644 --- a/opts_example.yaml +++ b/opts_example.yaml @@ -1,9 +1,9 @@ skip: [mp4, avi, mov] -file-scheme: "{UPVOTES}_{REDDITOR}_{POSTID}_{DATE}" +file_scheme: "{UPVOTES}_{REDDITOR}_{POSTID}_{DATE}" limit: 10 sort: top time: all -no-dupes: true +no_dupes: true 
subreddit: - EarthPorn - CityPorn From 395bf9180aabb22152a299acb8df07d5bb1252f2 Mon Sep 17 00:00:00 2001 From: Piotr Migdal Date: Sun, 27 Mar 2022 20:50:34 +0200 Subject: [PATCH 064/110] explicit warnings for non-exisitng args --- bdfr/configuration.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/bdfr/configuration.py b/bdfr/configuration.py index 8b04722..856c90b 100644 --- a/bdfr/configuration.py +++ b/bdfr/configuration.py @@ -3,10 +3,12 @@ from argparse import Namespace from typing import Optional +import logging import click import yaml +logger = logging.getLogger(__name__) class Configuration(Namespace): def __init__(self): @@ -54,11 +56,17 @@ class Configuration(Namespace): if context.params['opts'] is not None: with open(context.params['opts']) as f: opts = yaml.load(f, Loader=yaml.FullLoader) - for arg_key, v in opts.items(): - vars(self)[arg_key] = v + for arg_key, val in opts.items(): + if not hasattr(self, arg_key): + logger.error(f'Ignoring an unknown YAML argument: {arg_key}') + continue + setattr(self, arg_key, val) for arg_key in context.params.keys(): - if arg_key not in vars(self): + if not hasattr(self, arg_key): + logger.warning(f'Ignoring an unknown CLI argument: {arg_key}') continue - if context.params[arg_key] is None or context.params[arg_key] == (): + val = context.params[arg_key] + if val is None or val == (): + # don't overwrite with an empty value continue - vars(self)[arg_key] = context.params[arg_key] + setattr(self, arg_key, val) From 0731de788d6334f5b206084f63b907e7e19dab8d Mon Sep 17 00:00:00 2001 From: Piotr Migdal Date: Sun, 27 Mar 2022 21:09:02 +0200 Subject: [PATCH 065/110] instructions for YAML options --- README.md | 34 +++++++++++++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 28c0b8a..82d8812 100644 --- a/README.md +++ b/README.md @@ -68,6 +68,31 @@ python3 -m bdfr download ./path/to/output --subreddit 'Python, all, 
mindustry' - python3 -m bdfr archive ./path/to/output --subreddit all --format yaml -L 500 --folder-scheme '' ``` +Alternatively, you can pass options through a YAML file. + +```bash +python3 -m bdfr download ./path/to/output --opts my_opts.yaml +``` + +For example, running it with the following file + +```yaml +skip: [mp4, avi] +file_scheme: "{UPVOTES}_{REDDITOR}_{POSTID}_{DATE}" +limit: 10 +sort: top +subreddit: + - EarthPorn + - CityPorn +``` + +would be equilavent to (take note that in YAML there is `file_scheme` instead of `file-scheme`): +```bash +python3 -m bdfr download ./path/to/output --skip mp4 --skip avi --file-scheme "{UPVOTES}_{REDDITOR}_{POSTID}_{DATE}" -L 10 -S top --subreddit EarthPorn --subreddit CityPorn +``` + +In case when the same option is specified both in the YAML file and in as a command line argument, the command line argument takes prs + ## Options The following options are common between both the `archive` and `download` commands of the BDFR. @@ -80,6 +105,10 @@ The following options are common between both the `archive` and `download` comma - `--config` - If the path to a configuration file is supplied with this option, the BDFR will use the specified config - See [Configuration Files](#configuration) for more details +- `--opts` + - Load options from a YAML file. + - Has higher prority than the global config file but lower than command-line arguments. + - See [opts_example.yaml](./opts_example.yaml) for an example file. - `--disable-module` - Can be specified multiple times - Disables certain modules from being used @@ -221,7 +250,10 @@ The `clone` command can take all the options listed above for both the `archive` ## Common Command Tricks -A common use case is for subreddits/users to be loaded from a file. The BDFR doesn't support this directly but it is simple enough to do through the command-line. 
Consider a list of usernames to download; they can be passed through to the BDFR with the following command, assuming that the usernames are in a text file: +A common use case is for subreddits/users to be loaded from a file. The BDFR supports this via YAML file options (`--opts my_opts.yaml`). + +Alternatively, you can use the command-line [xargs](https://en.wikipedia.org/wiki/Xargs) function. +For a list of users `users.txt` (one user per line), type: ```bash cat users.txt | xargs -L 1 echo --user | xargs -L 50 python3 -m bdfr download From 5f443fddff952b0f21fe920b5ffbb4721023252a Mon Sep 17 00:00:00 2001 From: Piotr Migdal Date: Sun, 27 Mar 2022 21:13:33 +0200 Subject: [PATCH 066/110] a better check for opts --- bdfr/configuration.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bdfr/configuration.py b/bdfr/configuration.py index 856c90b..2468ba9 100644 --- a/bdfr/configuration.py +++ b/bdfr/configuration.py @@ -53,7 +53,7 @@ class Configuration(Namespace): self.comment_context: bool = False def process_click_arguments(self, context: click.Context): - if context.params['opts'] is not None: + if context.params.get('opts') is not None: with open(context.params['opts']) as f: opts = yaml.load(f, Loader=yaml.FullLoader) for arg_key, val in opts.items(): From cb3415c62ffc33fda76f612924439b0d41ec1a12 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Fri, 22 Jul 2022 15:44:19 +1000 Subject: [PATCH 067/110] Extract YAML function --- bdfr/configuration.py | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/bdfr/configuration.py b/bdfr/configuration.py index 2468ba9..79a208b 100644 --- a/bdfr/configuration.py +++ b/bdfr/configuration.py @@ -2,6 +2,7 @@ # coding=utf-8 from argparse import Namespace +from pathlib import Path from typing import Optional import logging @@ -54,13 +55,7 @@ class Configuration(Namespace): def process_click_arguments(self, context: click.Context): if context.params.get('opts') is not 
None: - with open(context.params['opts']) as f: - opts = yaml.load(f, Loader=yaml.FullLoader) - for arg_key, val in opts.items(): - if not hasattr(self, arg_key): - logger.error(f'Ignoring an unknown YAML argument: {arg_key}') - continue - setattr(self, arg_key, val) + self.parse_yaml_options(context.params['opts']) for arg_key in context.params.keys(): if not hasattr(self, arg_key): logger.warning(f'Ignoring an unknown CLI argument: {arg_key}') @@ -70,3 +65,20 @@ class Configuration(Namespace): # don't overwrite with an empty value continue setattr(self, arg_key, val) + + def parse_yaml_options(self, file_path: str): + yaml_file_loc = Path(file_path) + if not yaml_file_loc.exists(): + logger.error(f'No YAML file found at {yaml_file_loc}') + return + with open(yaml_file_loc) as f: + try: + opts = yaml.load(f, Loader=yaml.FullLoader) + except yaml.YAMLError as e: + logger.error(f'Could not parse YAML options file: {e}') + return + for arg_key, val in opts.items(): + if not hasattr(self, arg_key): + logger.error(f'Ignoring an unknown YAML argument: {arg_key}') + continue + setattr(self, arg_key, val) From 23e20e6ddc606b5af987a9294364425a25ec67a9 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Fri, 22 Jul 2022 15:45:09 +1000 Subject: [PATCH 068/110] Rename variable --- bdfr/configuration.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bdfr/configuration.py b/bdfr/configuration.py index 79a208b..e3f1758 100644 --- a/bdfr/configuration.py +++ b/bdfr/configuration.py @@ -71,9 +71,9 @@ class Configuration(Namespace): if not yaml_file_loc.exists(): logger.error(f'No YAML file found at {yaml_file_loc}') return - with open(yaml_file_loc) as f: + with open(yaml_file_loc) as file: try: - opts = yaml.load(f, Loader=yaml.FullLoader) + opts = yaml.load(file, Loader=yaml.FullLoader) except yaml.YAMLError as e: logger.error(f'Could not parse YAML options file: {e}') return From af3f98f59ceae0e0e262bfa274081339b3acfc1c Mon Sep 17 00:00:00 2001 From: 
Serene-Arc Date: Fri, 22 Jul 2022 15:45:38 +1000 Subject: [PATCH 069/110] Change logger message level --- bdfr/configuration.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bdfr/configuration.py b/bdfr/configuration.py index e3f1758..ddc1401 100644 --- a/bdfr/configuration.py +++ b/bdfr/configuration.py @@ -79,6 +79,6 @@ class Configuration(Namespace): return for arg_key, val in opts.items(): if not hasattr(self, arg_key): - logger.error(f'Ignoring an unknown YAML argument: {arg_key}') + logger.warning(f'Ignoring an unknown YAML argument: {arg_key}') continue setattr(self, arg_key, val) From 27ca92ef157e8b174aa68b7548fa80d1893ac2c4 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Fri, 22 Jul 2022 17:31:08 +1000 Subject: [PATCH 070/110] Add simple test --- tests/test_configuration.py | 9 +++++++++ tests/yaml_test_configuration.yaml | 6 ++++++ 2 files changed, 15 insertions(+) create mode 100644 tests/yaml_test_configuration.yaml diff --git a/tests/test_configuration.py b/tests/test_configuration.py index 8ad1663..6b6cd86 100644 --- a/tests/test_configuration.py +++ b/tests/test_configuration.py @@ -22,3 +22,12 @@ def test_process_click_context(arg_dict: dict): test_config.process_click_arguments(test_context) test_config = vars(test_config) assert all([test_config[arg] == arg_dict[arg] for arg in arg_dict.keys()]) + + +def test_yaml_file_read(): + file = './yaml_test_configuration.yaml' + test_config = Configuration() + test_config.parse_yaml_options(file) + assert test_config.subreddit == ['EarthPorn', 'TwoXChromosomes', 'Mindustry'] + assert test_config.sort == 'new' + assert test_config.limit == 10 diff --git a/tests/yaml_test_configuration.yaml b/tests/yaml_test_configuration.yaml new file mode 100644 index 0000000..5621721 --- /dev/null +++ b/tests/yaml_test_configuration.yaml @@ -0,0 +1,6 @@ +limit: 10 +sort: new +subreddit: + - EarthPorn + - TwoXChromosomes + - Mindustry \ No newline at end of file From 
1f1e7dc63d4b0e60bed219d45ceeea224bc0d327 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sat, 23 Jul 2022 17:02:01 +1000 Subject: [PATCH 071/110] Fix file path for test --- tests/test_configuration.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_configuration.py b/tests/test_configuration.py index 6b6cd86..060f145 100644 --- a/tests/test_configuration.py +++ b/tests/test_configuration.py @@ -25,7 +25,7 @@ def test_process_click_context(arg_dict: dict): def test_yaml_file_read(): - file = './yaml_test_configuration.yaml' + file = './tests/yaml_test_configuration.yaml' test_config = Configuration() test_config.parse_yaml_options(file) assert test_config.subreddit == ['EarthPorn', 'TwoXChromosomes', 'Mindustry'] From 607d9634508f3b71674d4538c09f42d5471c6298 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sat, 23 Jul 2022 17:14:36 +1000 Subject: [PATCH 072/110] Change file paths for test resource --- tests/integration_tests/test_archive_integration.py | 6 +++--- tests/integration_tests/test_clone_integration.py | 4 ++-- tests/integration_tests/test_download_integration.py | 6 +++--- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/integration_tests/test_archive_integration.py b/tests/integration_tests/test_archive_integration.py index 744e343..7b9a48d 100644 --- a/tests/integration_tests/test_archive_integration.py +++ b/tests/integration_tests/test_archive_integration.py @@ -10,11 +10,11 @@ from click.testing import CliRunner from bdfr.__main__ import cli -does_test_config_exist = Path('../test_config.cfg').exists() +does_test_config_exist = Path('./tests/test_config.cfg').exists() def copy_test_config(run_path: Path): - shutil.copy(Path('../test_config.cfg'), Path(run_path, '../test_config.cfg')) + shutil.copy(Path('./tests/test_config.cfg'), Path(run_path, 'test_config.cfg')) def create_basic_args_for_archive_runner(test_args: list[str], run_path: Path): @@ -23,7 +23,7 @@ def 
create_basic_args_for_archive_runner(test_args: list[str], run_path: Path): 'archive', str(run_path), '-v', - '--config', str(Path(run_path, '../test_config.cfg')), + '--config', str(Path(run_path, 'test_config.cfg')), '--log', str(Path(run_path, 'test_log.txt')), ] + test_args return out diff --git a/tests/integration_tests/test_clone_integration.py b/tests/integration_tests/test_clone_integration.py index 343b2d3..22c1988 100644 --- a/tests/integration_tests/test_clone_integration.py +++ b/tests/integration_tests/test_clone_integration.py @@ -9,11 +9,11 @@ from click.testing import CliRunner from bdfr.__main__ import cli -does_test_config_exist = Path('../test_config.cfg').exists() +does_test_config_exist = Path('./tests/test_config.cfg').exists() def copy_test_config(run_path: Path): - shutil.copy(Path('../test_config.cfg'), Path(run_path, '../test_config.cfg')) + shutil.copy(Path('./tests/test_config.cfg'), Path(run_path, 'test_config.cfg')) def create_basic_args_for_cloner_runner(test_args: list[str], tmp_path: Path): diff --git a/tests/integration_tests/test_download_integration.py b/tests/integration_tests/test_download_integration.py index ffae0d4..93d9392 100644 --- a/tests/integration_tests/test_download_integration.py +++ b/tests/integration_tests/test_download_integration.py @@ -9,11 +9,11 @@ from click.testing import CliRunner from bdfr.__main__ import cli -does_test_config_exist = Path('../test_config.cfg').exists() +does_test_config_exist = Path('./tests/test_config.cfg').exists() def copy_test_config(run_path: Path): - shutil.copy(Path('../test_config.cfg'), Path(run_path, '../test_config.cfg')) + shutil.copy(Path('./tests/test_config.cfg'), Path(run_path, './test_config.cfg')) def create_basic_args_for_download_runner(test_args: list[str], run_path: Path): @@ -21,7 +21,7 @@ def create_basic_args_for_download_runner(test_args: list[str], run_path: Path): out = [ 'download', str(run_path), '-v', - '--config', str(Path(run_path, 
'../test_config.cfg')), + '--config', str(Path(run_path, './test_config.cfg')), '--log', str(Path(run_path, 'test_log.txt')), ] + test_args return out From 4fc0d5dc1dd72c9444c00f73f8aaee30c6446adc Mon Sep 17 00:00:00 2001 From: Jacob Chapman <7908073+chapmanjacobd@users.noreply.github.com> Date: Wed, 4 May 2022 22:42:40 -0500 Subject: [PATCH 073/110] Add score filtering --- bdfr/downloader.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/bdfr/downloader.py b/bdfr/downloader.py index 02f5c68..001a079 100644 --- a/bdfr/downloader.py +++ b/bdfr/downloader.py @@ -57,6 +57,12 @@ class RedditDownloader(RedditConnector): f'Submission {submission.id} in {submission.subreddit.display_name} skipped' f' due to {submission.author.name if submission.author else "DELETED"} being an ignored user') return + elif submission.score < self.args.min_score or self.args.max_score < submission.score: + logger.debug(f"Submission {submission.id} filtered due to score {submission.score} < {self.args.min_score}") + return + elif submission.upvote_ratio < self.args.min_score_ratio or self.args.max_score_ratio < submission.upvote_ratio: + logger.debug(f"Submission {submission.id} filtered due to score ratio ({submission.upvote_ratio})") + return elif not isinstance(submission, praw.models.Submission): logger.warning(f'{submission.id} is not a submission') return From 89653c4bad5559dda22317def8cd320d64f80e20 Mon Sep 17 00:00:00 2001 From: Jacob Chapman <7908073+chapmanjacobd@users.noreply.github.com> Date: Wed, 4 May 2022 22:48:52 -0500 Subject: [PATCH 074/110] Update README.md --- README.md | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/README.md b/README.md index 82d8812..a539331 100644 --- a/README.md +++ b/README.md @@ -227,6 +227,15 @@ The following options apply only to the `download` command. 
This command downloa - This skips all submissions from the specified subreddit - Can be specified multiple times - Also accepts CSV subreddit names +- `--min-score` + - This skips all submissions which have fewer than specified upvotes +- `--max-score` + - This skips all submissions which have more than specified upvotes +- `--min-score-ratio` + - This skips all submissions which have lower than specified upvote ratio +- `--max-score-ratio` + - This skips all submissions which have higher than specified upvote ratio + ### Archiver Options From 95454078966e626ac3fb3c25e769b51012cd1d0e Mon Sep 17 00:00:00 2001 From: Jacob Chapman <7908073+chapmanjacobd@users.noreply.github.com> Date: Wed, 4 May 2022 22:52:12 -0500 Subject: [PATCH 075/110] Update __main__.py --- bdfr/__main__.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/bdfr/__main__.py b/bdfr/__main__.py index 3b2472a..1117a70 100644 --- a/bdfr/__main__.py +++ b/bdfr/__main__.py @@ -50,6 +50,10 @@ _downloader_options = [ click.option('--skip', default=None, multiple=True), click.option('--skip-domain', default=None, multiple=True), click.option('--skip-subreddit', default=None, multiple=True), + click.option('--min-score', type=int, default=None), + click.option('--max-score', type=int, default=None), + click.option('--min-score-ratio', type=float, default=None), + click.option('--max-score-ratio', type=float, default=None), ] _archiver_options = [ From 7eb2ab6d7d70360fc4f6082fb7a69d49c1446a8b Mon Sep 17 00:00:00 2001 From: Jacob Chapman <7908073+chapmanjacobd@users.noreply.github.com> Date: Wed, 4 May 2022 22:53:58 -0500 Subject: [PATCH 076/110] Update configuration.py --- bdfr/configuration.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/bdfr/configuration.py b/bdfr/configuration.py index ddc1401..46c4cf0 100644 --- a/bdfr/configuration.py +++ b/bdfr/configuration.py @@ -38,6 +38,10 @@ class Configuration(Namespace): self.skip: list[str] = [] self.skip_domain: list[str] = [] 
self.skip_subreddit: list[str] = [] + self.min_score = None + self.max_score = None + self.min_score_ratio = None + self.max_score_ratio = None self.sort: str = 'hot' self.submitted: bool = False self.subscribed: bool = False From 5d76fcd5aa0141104cd4860800989bdc0281b635 Mon Sep 17 00:00:00 2001 From: Jacob Chapman <7908073+chapmanjacobd@users.noreply.github.com> Date: Wed, 4 May 2022 23:35:44 -0500 Subject: [PATCH 077/110] Update downloader.py --- bdfr/downloader.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/bdfr/downloader.py b/bdfr/downloader.py index 001a079..adfadcb 100644 --- a/bdfr/downloader.py +++ b/bdfr/downloader.py @@ -57,10 +57,15 @@ class RedditDownloader(RedditConnector): f'Submission {submission.id} in {submission.subreddit.display_name} skipped' f' due to {submission.author.name if submission.author else "DELETED"} being an ignored user') return - elif submission.score < self.args.min_score or self.args.max_score < submission.score: - logger.debug(f"Submission {submission.id} filtered due to score {submission.score} < {self.args.min_score}") + elif self.args.min_score and submission.score < self.args.min_score: + logger.debug(f"Submission {submission.id} filtered due to score {submission.score} < [{self.args.min_score}]") return - elif submission.upvote_ratio < self.args.min_score_ratio or self.args.max_score_ratio < submission.upvote_ratio: + elif self.args.max_score and self.args.max_score < submission.score: + logger.debug(f"Submission {submission.id} filtered due to score [{self.args.max_score}] < {submission.score}") + return + elif (self.args.min_score_ratio and submission.upvote_ratio < self.args.min_score_ratio) or ( + self.args.max_score_ratio and self.args.max_score_ratio < submission.upvote_ratio + ): logger.debug(f"Submission {submission.id} filtered due to score ratio ({submission.upvote_ratio})") return elif not isinstance(submission, praw.models.Submission): From 
f22a8aec4d589e77c868d99e30992a6502ce112d Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sat, 23 Jul 2022 12:55:53 +1000 Subject: [PATCH 078/110] Fix line length --- bdfr/downloader.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/bdfr/downloader.py b/bdfr/downloader.py index adfadcb..83b5ebf 100644 --- a/bdfr/downloader.py +++ b/bdfr/downloader.py @@ -58,10 +58,12 @@ class RedditDownloader(RedditConnector): f' due to {submission.author.name if submission.author else "DELETED"} being an ignored user') return elif self.args.min_score and submission.score < self.args.min_score: - logger.debug(f"Submission {submission.id} filtered due to score {submission.score} < [{self.args.min_score}]") + logger.debug( + f"Submission {submission.id} filtered due to score {submission.score} < [{self.args.min_score}]") return elif self.args.max_score and self.args.max_score < submission.score: - logger.debug(f"Submission {submission.id} filtered due to score [{self.args.max_score}] < {submission.score}") + logger.debug( + f"Submission {submission.id} filtered due to score [{self.args.max_score}] < {submission.score}") return elif (self.args.min_score_ratio and submission.upvote_ratio < self.args.min_score_ratio) or ( self.args.max_score_ratio and self.args.max_score_ratio < submission.upvote_ratio From 2bbf1b644e4446502edce864ed300e9fbcb2b91b Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sat, 23 Jul 2022 13:41:31 +1000 Subject: [PATCH 079/110] Change logging message --- bdfr/downloader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bdfr/downloader.py b/bdfr/downloader.py index 83b5ebf..3b5a7e1 100644 --- a/bdfr/downloader.py +++ b/bdfr/downloader.py @@ -63,7 +63,7 @@ class RedditDownloader(RedditConnector): return elif self.args.max_score and self.args.max_score < submission.score: logger.debug( - f"Submission {submission.id} filtered due to score [{self.args.max_score}] < {submission.score}") + f"Submission {submission.id} filtered 
due to score {submission.score} > [{self.args.max_score}]") return elif (self.args.min_score_ratio and submission.upvote_ratio < self.args.min_score_ratio) or ( self.args.max_score_ratio and self.args.max_score_ratio < submission.upvote_ratio From 9d631257243927d0a00073d92bd80493e0bfde9c Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sat, 23 Jul 2022 13:47:43 +1000 Subject: [PATCH 080/110] Add tests for downloader --- tests/test_downloader.py | 104 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 104 insertions(+) diff --git a/tests/test_downloader.py b/tests/test_downloader.py index e5f0a31..e2e9e82 100644 --- a/tests/test_downloader.py +++ b/tests/test_downloader.py @@ -200,3 +200,107 @@ def test_download_submission( RedditDownloader._download_submission(downloader_mock, submission) folder_contents = list(tmp_path.iterdir()) assert len(folder_contents) == expected_files_len + + +@pytest.mark.online +@pytest.mark.reddit +@pytest.mark.parametrize(('test_submission_id', 'min_score'), ( + ('ljyy27', 1), +)) +def test_download_submission_min_score_above( + test_submission_id: str, + min_score: int, + downloader_mock: MagicMock, + reddit_instance: praw.Reddit, + tmp_path: Path, + capsys: pytest.CaptureFixture, +): + setup_logging(3) + downloader_mock.reddit_instance = reddit_instance + downloader_mock.download_filter.check_url.return_value = True + downloader_mock.args.folder_scheme = '' + downloader_mock.args.min_score = min_score + downloader_mock.file_name_formatter = RedditConnector.create_file_name_formatter(downloader_mock) + downloader_mock.download_directory = tmp_path + submission = downloader_mock.reddit_instance.submission(id=test_submission_id) + RedditDownloader._download_submission(downloader_mock, submission) + output = capsys.readouterr() + assert 'filtered due to score' not in output.out + + +@pytest.mark.online +@pytest.mark.reddit +@pytest.mark.parametrize(('test_submission_id', 'min_score'), ( + ('ljyy27', 25), +)) +def 
test_download_submission_min_score_below( + test_submission_id: str, + min_score: int, + downloader_mock: MagicMock, + reddit_instance: praw.Reddit, + tmp_path: Path, + capsys: pytest.CaptureFixture, +): + setup_logging(3) + downloader_mock.reddit_instance = reddit_instance + downloader_mock.download_filter.check_url.return_value = True + downloader_mock.args.folder_scheme = '' + downloader_mock.args.min_score = min_score + downloader_mock.file_name_formatter = RedditConnector.create_file_name_formatter(downloader_mock) + downloader_mock.download_directory = tmp_path + submission = downloader_mock.reddit_instance.submission(id=test_submission_id) + RedditDownloader._download_submission(downloader_mock, submission) + output = capsys.readouterr() + assert 'filtered due to score' in output.out + + +@pytest.mark.online +@pytest.mark.reddit +@pytest.mark.parametrize(('test_submission_id', 'max_score'), ( + ('ljyy27', 25), +)) +def test_download_submission_max_score_below( + test_submission_id: str, + max_score: int, + downloader_mock: MagicMock, + reddit_instance: praw.Reddit, + tmp_path: Path, + capsys: pytest.CaptureFixture, +): + setup_logging(3) + downloader_mock.reddit_instance = reddit_instance + downloader_mock.download_filter.check_url.return_value = True + downloader_mock.args.folder_scheme = '' + downloader_mock.args.max_score = max_score + downloader_mock.file_name_formatter = RedditConnector.create_file_name_formatter(downloader_mock) + downloader_mock.download_directory = tmp_path + submission = downloader_mock.reddit_instance.submission(id=test_submission_id) + RedditDownloader._download_submission(downloader_mock, submission) + output = capsys.readouterr() + assert 'filtered due to score' not in output.out + + +@pytest.mark.online +@pytest.mark.reddit +@pytest.mark.parametrize(('test_submission_id', 'max_score'), ( + ('ljyy27', 1), +)) +def test_download_submission_max_score_above( + test_submission_id: str, + max_score: int, + downloader_mock: MagicMock, 
+ reddit_instance: praw.Reddit, + tmp_path: Path, + capsys: pytest.CaptureFixture, +): + setup_logging(3) + downloader_mock.reddit_instance = reddit_instance + downloader_mock.download_filter.check_url.return_value = True + downloader_mock.args.folder_scheme = '' + downloader_mock.args.max_score = max_score + downloader_mock.file_name_formatter = RedditConnector.create_file_name_formatter(downloader_mock) + downloader_mock.download_directory = tmp_path + submission = downloader_mock.reddit_instance.submission(id=test_submission_id) + RedditDownloader._download_submission(downloader_mock, submission) + output = capsys.readouterr() + assert 'filtered due to score' in output.out From b47b90f2332d07ed1b06b74343dddcd04b1e9ab6 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sat, 23 Jul 2022 13:59:35 +1000 Subject: [PATCH 081/110] Add integration tests --- .../test_download_integration.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/tests/integration_tests/test_download_integration.py b/tests/integration_tests/test_download_integration.py index 93d9392..15173b6 100644 --- a/tests/integration_tests/test_download_integration.py +++ b/tests/integration_tests/test_download_integration.py @@ -351,3 +351,19 @@ def test_cli_download_ignore_user(test_args: list[str], tmp_path: Path): assert result.exit_code == 0 assert 'Downloaded submission' not in result.output assert 'being an ignored user' in result.output + + +@pytest.mark.online +@pytest.mark.reddit +@pytest.mark.parametrize(('test_args', 'was_filtered'), ( + (['-l', 'ljyy27', '--min-score', '50'], True), + (['-l', 'ljyy27', '--min-score', '1'], False), + (['-l', 'ljyy27', '--max-score', '1'], True), + (['-l', 'ljyy27', '--max-score', '100'], False), +)) +def test_cli_download_score_filter(test_args: list[str], was_filtered: bool, tmp_path: Path): + runner = CliRunner() + test_args = create_basic_args_for_download_runner(test_args, tmp_path) + result = runner.invoke(cli, test_args) + assert 
result.exit_code == 0 + assert ('filtered due to score' in result.output) == was_filtered From 55c95495b238bf13e699df42a06f84eae28735fa Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sat, 23 Jul 2022 14:49:45 +1000 Subject: [PATCH 082/110] Fix test structure --- scripts/tests/test_extract_failed_ids.bats | 15 ++++++++++----- scripts/tests/test_extract_successful_ids.bats | 15 ++++++++++----- 2 files changed, 20 insertions(+), 10 deletions(-) diff --git a/scripts/tests/test_extract_failed_ids.bats b/scripts/tests/test_extract_failed_ids.bats index a716cba..04eada6 100644 --- a/scripts/tests/test_extract_failed_ids.bats +++ b/scripts/tests/test_extract_failed_ids.bats @@ -13,31 +13,36 @@ teardown() { } @test "fail no downloader module" { - run ../extract_failed_ids.sh ./example_logfiles/failed_no_downloader.txt >> failed.txt + run ../extract_failed_ids.sh ./example_logfiles/failed_no_downloader.txt + echo "$output" > failed.txt assert [ "$( wc -l 'failed.txt' | awk '{ print $1 }' )" -eq "3" ]; assert [ "$( grep -Ecv '\w{6,7}' 'failed.txt' )" -eq "0" ]; } @test "fail resource error" { - run ../extract_failed_ids.sh ./example_logfiles/failed_resource_error.txt >> failed.txt + run ../extract_failed_ids.sh ./example_logfiles/failed_resource_error.txt + echo "$output" > failed.txt assert [ "$( wc -l 'failed.txt' | awk '{ print $1 }' )" -eq "1" ]; assert [ "$( grep -Ecv '\w{6,7}' 'failed.txt' )" -eq "0" ]; } @test "fail site downloader error" { - run ../extract_failed_ids.sh ./example_logfiles/failed_sitedownloader_error.txt >> failed.txt + run ../extract_failed_ids.sh ./example_logfiles/failed_sitedownloader_error.txt + echo "$output" > failed.txt assert [ "$( wc -l 'failed.txt' | awk '{ print $1 }' )" -eq "2" ]; assert [ "$( grep -Ecv '\w{6,7}' 'failed.txt' )" -eq "0" ]; } @test "fail failed file write" { - run ../extract_failed_ids.sh ./example_logfiles/failed_write_error.txt >> failed.txt + run ../extract_failed_ids.sh ./example_logfiles/failed_write_error.txt + echo 
"$output" > failed.txt assert [ "$( wc -l 'failed.txt' | awk '{ print $1 }' )" -eq "1" ]; assert [ "$( grep -Ecv '\w{6,7}' 'failed.txt' )" -eq "0" ]; } @test "fail disabled module" { - run ../extract_failed_ids.sh ./example_logfiles/failed_disabled_module.txt >> failed.txt + run ../extract_failed_ids.sh ./example_logfiles/failed_disabled_module.txt + echo "$output" > failed.txt assert [ "$( wc -l 'failed.txt' | awk '{ print $1 }' )" -eq "1" ]; assert [ "$( grep -Ecv '\w{6,7}' 'failed.txt' )" -eq "0" ]; } diff --git a/scripts/tests/test_extract_successful_ids.bats b/scripts/tests/test_extract_successful_ids.bats index caa8dd1..ddbd2ef 100644 --- a/scripts/tests/test_extract_successful_ids.bats +++ b/scripts/tests/test_extract_successful_ids.bats @@ -8,31 +8,36 @@ teardown() { } @test "success downloaded submission" { - run ../extract_successful_ids.sh ./example_logfiles/succeed_downloaded_submission.txt >> ./successful.txt + run ../extract_successful_ids.sh ./example_logfiles/succeed_downloaded_submission.txt + echo "$output" > successful.txt assert [ "$( wc -l 'successful.txt' | awk '{ print $1 }' )" -eq "7" ]; assert [ "$( grep -Ecv '\w{6,7}' 'successful.txt' )" -eq "0" ]; } @test "success resource hash" { - run ../extract_successful_ids.sh ./example_logfiles/succeed_resource_hash.txt >> ./successful.txt + run ../extract_successful_ids.sh ./example_logfiles/succeed_resource_hash.txt + echo "$output" > successful.txt assert [ "$( wc -l 'successful.txt' | awk '{ print $1 }' )" -eq "1" ]; assert [ "$( grep -Ecv '\w{6,7}' 'successful.txt' )" -eq "0" ]; } @test "success download filter" { - run ../extract_successful_ids.sh ./example_logfiles/succeed_download_filter.txt >> ./successful.txt + run ../extract_successful_ids.sh ./example_logfiles/succeed_download_filter.txt + echo "$output" > successful.txt assert [ "$( wc -l 'successful.txt' | awk '{ print $1 }' )" -eq "3" ]; assert [ "$( grep -Ecv '\w{6,7}' 'successful.txt' )" -eq "0" ]; } @test "success already exists" { 
- run ../extract_successful_ids.sh ./example_logfiles/succeed_already_exists.txt >> ./successful.txt + run ../extract_successful_ids.sh ./example_logfiles/succeed_already_exists.txt + echo "$output" > successful.txt assert [ "$( wc -l 'successful.txt' | awk '{ print $1 }' )" -eq "3" ]; assert [ "$( grep -Ecv '\w{6,7}' 'successful.txt' )" -eq "0" ]; } @test "success hard link" { - run ../extract_successful_ids.sh ./example_logfiles/succeed_hard_link.txt >> ./successful.txt + run ../extract_successful_ids.sh ./example_logfiles/succeed_hard_link.txt + echo "$output" > successful.txt assert [ "$( wc -l 'successful.txt' | awk '{ print $1 }' )" -eq "1" ]; assert [ "$( grep -Ecv '\w{6,7}' 'successful.txt' )" -eq "0" ]; } From 44e4c16b76d5fa0397eacf3d956a9909b22e5464 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sat, 23 Jul 2022 14:50:57 +1000 Subject: [PATCH 083/110] Update bash script --- scripts/extract_successful_ids.sh | 1 + scripts/tests/example_logfiles/succeed_score_filter.txt | 2 ++ scripts/tests/test_extract_successful_ids.bats | 7 +++++++ 3 files changed, 10 insertions(+) create mode 100644 scripts/tests/example_logfiles/succeed_score_filter.txt diff --git a/scripts/extract_successful_ids.sh b/scripts/extract_successful_ids.sh index e8f482e..f2128e5 100755 --- a/scripts/extract_successful_ids.sh +++ b/scripts/extract_successful_ids.sh @@ -13,4 +13,5 @@ fi grep 'Download filter' "$file" | awk '{ print $(NF-3) }' ; grep 'already exists, continuing' "$file" | awk '{ print $(NF-3) }' ; grep 'Hard link made' "$file" | awk '{ print $(NF) }' ; + grep 'filtered due to score' "$file" | awk '{ print $9 }' } diff --git a/scripts/tests/example_logfiles/succeed_score_filter.txt b/scripts/tests/example_logfiles/succeed_score_filter.txt new file mode 100644 index 0000000..8f31ef7 --- /dev/null +++ b/scripts/tests/example_logfiles/succeed_score_filter.txt @@ -0,0 +1,2 @@ +[2022-07-23 14:04:14,095 - bdfr.downloader - DEBUG] - Submission ljyy27 filtered due to score 15 < [50] 
+[2022-07-23 14:04:14,104 - bdfr.downloader - DEBUG] - Submission ljyy27 filtered due to score 16 > [1] \ No newline at end of file diff --git a/scripts/tests/test_extract_successful_ids.bats b/scripts/tests/test_extract_successful_ids.bats index ddbd2ef..6ff54bc 100644 --- a/scripts/tests/test_extract_successful_ids.bats +++ b/scripts/tests/test_extract_successful_ids.bats @@ -41,3 +41,10 @@ teardown() { assert [ "$( wc -l 'successful.txt' | awk '{ print $1 }' )" -eq "1" ]; assert [ "$( grep -Ecv '\w{6,7}' 'successful.txt' )" -eq "0" ]; } + +@test "success score filter" { + run ../extract_successful_ids.sh ./example_logfiles/succeed_score_filter.txt + echo "$output" > successful.txt + assert [ "$( wc -l 'successful.txt' | awk '{ print $1 }' )" -eq "2" ]; + assert [ "$( grep -Ecv '\w{6,7}' 'successful.txt' )" -eq "0" ]; +} From 4b160c26118a7aff465a236bed8c8008a971dc2e Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sat, 23 Jul 2022 15:06:49 +1000 Subject: [PATCH 084/110] Add missing flag --- tests/integration_tests/test_download_integration.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/integration_tests/test_download_integration.py b/tests/integration_tests/test_download_integration.py index 15173b6..a474172 100644 --- a/tests/integration_tests/test_download_integration.py +++ b/tests/integration_tests/test_download_integration.py @@ -355,6 +355,7 @@ def test_cli_download_ignore_user(test_args: list[str], tmp_path: Path): @pytest.mark.online @pytest.mark.reddit +@pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests') @pytest.mark.parametrize(('test_args', 'was_filtered'), ( (['-l', 'ljyy27', '--min-score', '50'], True), (['-l', 'ljyy27', '--min-score', '1'], False), From d60b4e7fddb741fdb728bc0cf07627c4d11e45a0 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Thu, 1 Sep 2022 11:19:07 +1000 Subject: [PATCH 085/110] Fix Redgifs module --- bdfr/site_downloaders/redgifs.py | 11 +++++++++++ 
tests/site_downloaders/test_redgifs.py | 20 ++++++++++---------- 2 files changed, 21 insertions(+), 10 deletions(-) diff --git a/bdfr/site_downloaders/redgifs.py b/bdfr/site_downloaders/redgifs.py index f7ea56a..26b9dfc 100644 --- a/bdfr/site_downloaders/redgifs.py +++ b/bdfr/site_downloaders/redgifs.py @@ -2,6 +2,7 @@ import json import re +import urllib.parse from typing import Optional from praw.models import Submission @@ -61,4 +62,14 @@ class Redgifs(BaseDownloader): except (KeyError, AttributeError): raise SiteDownloaderError('Failed to find JSON data in page') + # returned domain seems to be being phased out + out = {re.sub('thumbs2', 'thumbs3', link) for link in out} + out = {Redgifs._clean_thumbs4_link(link) for link in out} + return out + + @staticmethod + def _clean_thumbs4_link(url: str) -> str: + split_url = urllib.parse.urlsplit(url) + out = split_url.scheme + '://' + split_url.netloc + split_url.path + out = re.sub('thumbs4', 'thumbs3', out) return out diff --git a/tests/site_downloaders/test_redgifs.py b/tests/site_downloaders/test_redgifs.py index a1f571e..b7ae3b3 100644 --- a/tests/site_downloaders/test_redgifs.py +++ b/tests/site_downloaders/test_redgifs.py @@ -12,20 +12,20 @@ from bdfr.site_downloaders.redgifs import Redgifs @pytest.mark.online @pytest.mark.parametrize(('test_url', 'expected'), ( ('https://redgifs.com/watch/frighteningvictorioussalamander', - {'https://thumbs2.redgifs.com/FrighteningVictoriousSalamander.mp4'}), + {'https://thumbs3.redgifs.com/FrighteningVictoriousSalamander.mp4'}), ('https://redgifs.com/watch/springgreendecisivetaruca', - {'https://thumbs2.redgifs.com/SpringgreenDecisiveTaruca.mp4'}), + {'https://thumbs3.redgifs.com/SpringgreenDecisiveTaruca.mp4'}), ('https://www.redgifs.com/watch/palegoldenrodrawhalibut', - {'https://thumbs2.redgifs.com/PalegoldenrodRawHalibut.mp4'}), + {'https://thumbs3.redgifs.com/PalegoldenrodRawHalibut.mp4'}), ('https://redgifs.com/watch/hollowintentsnowyowl', - 
{'https://thumbs2.redgifs.com/HollowIntentSnowyowl-large.jpg'}), + {'https://thumbs3.redgifs.com/HollowIntentSnowyowl-large.jpg'}), ('https://www.redgifs.com/watch/lustrousstickywaxwing', - {'https://thumbs2.redgifs.com/EntireEnchantingHypsilophodon-large.jpg', - 'https://thumbs2.redgifs.com/FancyMagnificentAdamsstaghornedbeetle-large.jpg', - 'https://thumbs2.redgifs.com/LustrousStickyWaxwing-large.jpg', - 'https://thumbs2.redgifs.com/ParchedWindyArmyworm-large.jpg', - 'https://thumbs2.redgifs.com/ThunderousColorlessErmine-large.jpg', - 'https://thumbs2.redgifs.com/UnripeUnkemptWoodpecker-large.jpg'}), + {'https://thumbs3.redgifs.com/EntireEnchantingHypsilophodon-large.jpg', + 'https://thumbs3.redgifs.com/FancyMagnificentAdamsstaghornedbeetle-large.jpg', + 'https://thumbs3.redgifs.com/LustrousStickyWaxwing-large.jpg', + 'https://thumbs3.redgifs.com/ParchedWindyArmyworm-large.jpg', + 'https://thumbs3.redgifs.com/ThunderousColorlessErmine-large.jpg', + 'https://thumbs3.redgifs.com/UnripeUnkemptWoodpecker-large.jpg'}), )) def test_get_link(test_url: str, expected: set[str]): result = Redgifs._get_link(test_url) From 0767da14c25899f5320f1203a36db8bf6de1e9cf Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sat, 3 Sep 2022 10:47:58 +1000 Subject: [PATCH 086/110] Fix clone integration test setup --- tests/integration_tests/test_clone_integration.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/integration_tests/test_clone_integration.py b/tests/integration_tests/test_clone_integration.py index 22c1988..29ec416 100644 --- a/tests/integration_tests/test_clone_integration.py +++ b/tests/integration_tests/test_clone_integration.py @@ -17,11 +17,12 @@ def copy_test_config(run_path: Path): def create_basic_args_for_cloner_runner(test_args: list[str], tmp_path: Path): + copy_test_config(tmp_path) out = [ 'clone', str(tmp_path), '-v', - '--config', 'test_config.cfg', + '--config', str(Path(tmp_path, 'test_config.cfg')), '--log', str(Path(tmp_path, 
'test_log.txt')), ] + test_args return out @@ -33,6 +34,8 @@ def create_basic_args_for_cloner_runner(test_args: list[str], tmp_path: Path): @pytest.mark.parametrize('test_args', ( ['-l', 'm2601g'], ['-s', 'TrollXChromosomes/', '-L', 1], + ['-l', 'tr79b'], + ['-l', 'tr6ky'], )) def test_cli_scrape_general(test_args: list[str], tmp_path: Path): runner = CliRunner() From 5dbb4d00d4b12c875f37a478bae33c25de4a156e Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sat, 3 Sep 2022 11:29:31 +1000 Subject: [PATCH 087/110] Remove dead link tests --- tests/site_downloaders/test_vidble.py | 3 --- tests/site_downloaders/test_youtube.py | 1 - 2 files changed, 4 deletions(-) diff --git a/tests/site_downloaders/test_vidble.py b/tests/site_downloaders/test_vidble.py index 50ca808..faed440 100644 --- a/tests/site_downloaders/test_vidble.py +++ b/tests/site_downloaders/test_vidble.py @@ -30,9 +30,6 @@ def test_change_med_url(test_url: str, expected: str): 'https://www.vidble.com/VWuNsnLJMD.jpg', 'https://www.vidble.com/sMmM8O650W.jpg', }), - ('https://www.vidble.com/watch?v=joC6b7cgs2Tnucx7dhDoyqKPbr7TQUA5', { - 'https://www.vidble.com/joC6b7cgs2Tnucx7dhDoyqKPbr7TQUA5.mp4', - }), ('https://www.vidble.com/pHuwWkOcEb', { 'https://www.vidble.com/pHuwWkOcEb.jpg', }), diff --git a/tests/site_downloaders/test_youtube.py b/tests/site_downloaders/test_youtube.py index ce1abb8..14c6648 100644 --- a/tests/site_downloaders/test_youtube.py +++ b/tests/site_downloaders/test_youtube.py @@ -15,7 +15,6 @@ from bdfr.site_downloaders.youtube import Youtube @pytest.mark.parametrize(('test_url', 'expected_hash'), ( ('https://www.youtube.com/watch?v=uSm2VDgRIUs', '2d60b54582df5b95ec72bb00b580d2ff'), ('https://www.youtube.com/watch?v=GcI7nxQj7HA', '5db0fc92a0a7fb9ac91e63505eea9cf0'), - ('https://youtu.be/TMqPOlp4tNo', 'ceb4c2cb1a9bf79617623b2aa57e18fd'), # Age restricted )) def test_find_resources_good(test_url: str, expected_hash: str): test_submission = MagicMock() From 
35645da241f46b70b27c52264cc1329f647f787f Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sat, 3 Sep 2022 11:31:57 +1000 Subject: [PATCH 088/110] Add missing mark --- tests/site_downloaders/test_vidble.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/site_downloaders/test_vidble.py b/tests/site_downloaders/test_vidble.py index faed440..4d71022 100644 --- a/tests/site_downloaders/test_vidble.py +++ b/tests/site_downloaders/test_vidble.py @@ -39,6 +39,7 @@ def test_get_links(test_url: str, expected: set[str]): assert results == expected +@pytest.mark.online @pytest.mark.parametrize(('test_url', 'expected_hashes'), ( ('https://www.vidble.com/show/UxsvAssYe5', { '0ef2f8e0e0b45936d2fb3e6fbdf67e28', From e0a36f4eab2a4ba23b98f19bf4bdef2b7287301d Mon Sep 17 00:00:00 2001 From: SoulSuck24 <79275800+Soulsuck24@users.noreply.github.com> Date: Mon, 12 Sep 2022 22:26:02 -0400 Subject: [PATCH 089/110] Re-fix Redgifs API seems to return incorrect signature value when sending header. Other fixes seems to have worked temporarily but have stopped working so they're removed. 
--- bdfr/site_downloaders/redgifs.py | 17 +---------------- 1 file changed, 1 insertion(+), 16 deletions(-) diff --git a/bdfr/site_downloaders/redgifs.py b/bdfr/site_downloaders/redgifs.py index 26b9dfc..e2d27e7 100644 --- a/bdfr/site_downloaders/redgifs.py +++ b/bdfr/site_downloaders/redgifs.py @@ -28,12 +28,7 @@ class Redgifs(BaseDownloader): except AttributeError: raise SiteDownloaderError(f'Could not extract Redgifs ID from {url}') - headers = { - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) ' - 'Chrome/90.0.4430.93 Safari/537.36', - } - - content = Redgifs.retrieve_url(f'https://api.redgifs.com/v2/gifs/{redgif_id}', headers=headers) + content = Redgifs.retrieve_url(f'https://api.redgifs.com/v2/gifs/{redgif_id}') if content is None: raise SiteDownloaderError('Could not read the page source') @@ -62,14 +57,4 @@ class Redgifs(BaseDownloader): except (KeyError, AttributeError): raise SiteDownloaderError('Failed to find JSON data in page') - # returned domain seems to be being phased out - out = {re.sub('thumbs2', 'thumbs3', link) for link in out} - out = {Redgifs._clean_thumbs4_link(link) for link in out} - return out - - @staticmethod - def _clean_thumbs4_link(url: str) -> str: - split_url = urllib.parse.urlsplit(url) - out = split_url.scheme + '://' + split_url.netloc + split_url.path - out = re.sub('thumbs4', 'thumbs3', out) return out From 0a9ecac41093e1029317a964c3265373e030cb03 Mon Sep 17 00:00:00 2001 From: SoulSuck24 <79275800+Soulsuck24@users.noreply.github.com> Date: Fri, 16 Sep 2022 14:47:55 -0400 Subject: [PATCH 090/110] Redgif image fixes --- bdfr/site_downloaders/redgifs.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/bdfr/site_downloaders/redgifs.py b/bdfr/site_downloaders/redgifs.py index e2d27e7..9929493 100644 --- a/bdfr/site_downloaders/redgifs.py +++ b/bdfr/site_downloaders/redgifs.py @@ -45,9 +45,7 @@ class Redgifs(BaseDownloader): elif response_json['gif']['type'] == 
2: # type 2 is an image if response_json['gif']['gallery']: content = Redgifs.retrieve_url( - f'https://api.redgifs.com/v2/gallery/{response_json["gif"]["gallery"]}', - headers=headers, - ) + f'https://api.redgifs.com/v2/gallery/{response_json["gif"]["gallery"]}') response_json = json.loads(content.text) out = {p['urls']['hd'] for p in response_json['gifs']} else: From 95749584ecad58e0db6d31220aaeae70e246a992 Mon Sep 17 00:00:00 2001 From: SoulSuck24 <79275800+Soulsuck24@users.noreply.github.com> Date: Fri, 16 Sep 2022 20:41:17 -0400 Subject: [PATCH 091/110] Redgifs fixed? If this doesn't work then I give up... --- bdfr/site_downloaders/download_factory.py | 4 ++-- bdfr/site_downloaders/redgifs.py | 5 ++++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/bdfr/site_downloaders/download_factory.py b/bdfr/site_downloaders/download_factory.py index 96e9a42..b0bf96a 100644 --- a/bdfr/site_downloaders/download_factory.py +++ b/bdfr/site_downloaders/download_factory.py @@ -27,6 +27,8 @@ class DownloadFactory: sanitised_url = DownloadFactory.sanitise_url(url) if re.match(r'(i\.)?imgur.*\.gif.+$', sanitised_url): return Imgur + elif re.match(r'(i\.)?(redgifs|gifdeliverynetwork)', sanitised_url): + return Redgifs elif re.match(r'.*/.*\.\w{3,4}(\?[\w;&=]*)?$', sanitised_url) and \ not DownloadFactory.is_web_resource(sanitised_url): return Direct @@ -40,8 +42,6 @@ class DownloadFactory: return Gfycat elif re.match(r'(m\.)?imgur.*', sanitised_url): return Imgur - elif re.match(r'(redgifs|gifdeliverynetwork)', sanitised_url): - return Redgifs elif re.match(r'reddit\.com/r/', sanitised_url): return SelfPost elif re.match(r'(m\.)?youtu\.?be', sanitised_url): diff --git a/bdfr/site_downloaders/redgifs.py b/bdfr/site_downloaders/redgifs.py index 9929493..8d6ab21 100644 --- a/bdfr/site_downloaders/redgifs.py +++ b/bdfr/site_downloaders/redgifs.py @@ -24,7 +24,7 @@ class Redgifs(BaseDownloader): @staticmethod def _get_link(url: str) -> set[str]: try: - redgif_id = 
re.match(r'.*/(.*?)/?$', url).group(1) + redgif_id = re.match(r'.*/(.*?)(\..{3,})?$', url).group(1) except AttributeError: raise SiteDownloaderError(f'Could not extract Redgifs ID from {url}') @@ -55,4 +55,7 @@ class Redgifs(BaseDownloader): except (KeyError, AttributeError): raise SiteDownloaderError('Failed to find JSON data in page') + # Update subdomain if old one is returned + out = {re.sub('thumbs2', 'thumbs3', link) for link in out} + out = {re.sub('thumbs3', 'thumbs4', link) for link in out} return out From 2f2b5b749c7348be09babc31e26fac6c2c243716 Mon Sep 17 00:00:00 2001 From: SoulSuck24 <79275800+Soulsuck24@users.noreply.github.com> Date: Sun, 18 Sep 2022 13:24:42 -0400 Subject: [PATCH 092/110] Edge case coverage Cover edge cases that shouldn't ever happen but probably will sometime. Also included Imgur changes to cover similar situations of malformed/redirected links. --- bdfr/site_downloaders/download_factory.py | 4 +--- bdfr/site_downloaders/imgur.py | 9 +++++---- bdfr/site_downloaders/redgifs.py | 2 +- 3 files changed, 7 insertions(+), 8 deletions(-) diff --git a/bdfr/site_downloaders/download_factory.py b/bdfr/site_downloaders/download_factory.py index b0bf96a..5f1d9b1 100644 --- a/bdfr/site_downloaders/download_factory.py +++ b/bdfr/site_downloaders/download_factory.py @@ -25,7 +25,7 @@ class DownloadFactory: @staticmethod def pull_lever(url: str) -> Type[BaseDownloader]: sanitised_url = DownloadFactory.sanitise_url(url) - if re.match(r'(i\.)?imgur.*\.gif.+$', sanitised_url): + if re.match(r'imgur\.com', sanitised_url): return Imgur elif re.match(r'(i\.)?(redgifs|gifdeliverynetwork)', sanitised_url): return Redgifs @@ -40,8 +40,6 @@ class DownloadFactory: return Gallery elif re.match(r'gfycat\.', sanitised_url): return Gfycat - elif re.match(r'(m\.)?imgur.*', sanitised_url): - return Imgur elif re.match(r'reddit\.com/r/', sanitised_url): return SelfPost elif re.match(r'(m\.)?youtu\.?be', sanitised_url): diff --git a/bdfr/site_downloaders/imgur.py 
b/bdfr/site_downloaders/imgur.py index 1f669d0..2c0ac04 100644 --- a/bdfr/site_downloaders/imgur.py +++ b/bdfr/site_downloaders/imgur.py @@ -41,10 +41,11 @@ class Imgur(BaseDownloader): @staticmethod def _get_data(link: str) -> dict: - link = link.rstrip('?') - if re.match(r'(?i).*\.gif.+$', link): - link = link.replace('i.imgur', 'imgur') - link = re.sub('(?i)\\.gif.+$', '', link) + try: + imgur_id = re.match(r'.*/(.*?)(\..{0,})?$', link).group(1) + link = f'https://imgur.com/a/{imgur_id}' + except AttributeError: + raise SiteDownloaderError(f'Could not extract Imgur ID from {link}') res = Imgur.retrieve_url(link, cookies={'over18': '1', 'postpagebeta': '0'}) diff --git a/bdfr/site_downloaders/redgifs.py b/bdfr/site_downloaders/redgifs.py index 8d6ab21..2134aa3 100644 --- a/bdfr/site_downloaders/redgifs.py +++ b/bdfr/site_downloaders/redgifs.py @@ -24,7 +24,7 @@ class Redgifs(BaseDownloader): @staticmethod def _get_link(url: str) -> set[str]: try: - redgif_id = re.match(r'.*/(.*?)(\..{3,})?$', url).group(1) + redgif_id = re.match(r'.*/(.*?)(\..{0,})?$', url).group(1) except AttributeError: raise SiteDownloaderError(f'Could not extract Redgifs ID from {url}') From d4f7deaa6872ac35f37ccb5f17c350ab12424e9a Mon Sep 17 00:00:00 2001 From: SoulSuck24 <79275800+Soulsuck24@users.noreply.github.com> Date: Sun, 18 Sep 2022 14:30:43 -0400 Subject: [PATCH 093/110] Revert "Edge case coverage" This reverts commit 2f2b5b749c7348be09babc31e26fac6c2c243716. 
--- bdfr/site_downloaders/download_factory.py | 4 +++- bdfr/site_downloaders/imgur.py | 9 ++++----- bdfr/site_downloaders/redgifs.py | 2 +- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/bdfr/site_downloaders/download_factory.py b/bdfr/site_downloaders/download_factory.py index 5f1d9b1..b0bf96a 100644 --- a/bdfr/site_downloaders/download_factory.py +++ b/bdfr/site_downloaders/download_factory.py @@ -25,7 +25,7 @@ class DownloadFactory: @staticmethod def pull_lever(url: str) -> Type[BaseDownloader]: sanitised_url = DownloadFactory.sanitise_url(url) - if re.match(r'imgur\.com', sanitised_url): + if re.match(r'(i\.)?imgur.*\.gif.+$', sanitised_url): return Imgur elif re.match(r'(i\.)?(redgifs|gifdeliverynetwork)', sanitised_url): return Redgifs @@ -40,6 +40,8 @@ class DownloadFactory: return Gallery elif re.match(r'gfycat\.', sanitised_url): return Gfycat + elif re.match(r'(m\.)?imgur.*', sanitised_url): + return Imgur elif re.match(r'reddit\.com/r/', sanitised_url): return SelfPost elif re.match(r'(m\.)?youtu\.?be', sanitised_url): diff --git a/bdfr/site_downloaders/imgur.py b/bdfr/site_downloaders/imgur.py index 2c0ac04..1f669d0 100644 --- a/bdfr/site_downloaders/imgur.py +++ b/bdfr/site_downloaders/imgur.py @@ -41,11 +41,10 @@ class Imgur(BaseDownloader): @staticmethod def _get_data(link: str) -> dict: - try: - imgur_id = re.match(r'.*/(.*?)(\..{0,})?$', link).group(1) - link = f'https://imgur.com/a/{imgur_id}' - except AttributeError: - raise SiteDownloaderError(f'Could not extract Imgur ID from {link}') + link = link.rstrip('?') + if re.match(r'(?i).*\.gif.+$', link): + link = link.replace('i.imgur', 'imgur') + link = re.sub('(?i)\\.gif.+$', '', link) res = Imgur.retrieve_url(link, cookies={'over18': '1', 'postpagebeta': '0'}) diff --git a/bdfr/site_downloaders/redgifs.py b/bdfr/site_downloaders/redgifs.py index 2134aa3..8d6ab21 100644 --- a/bdfr/site_downloaders/redgifs.py +++ b/bdfr/site_downloaders/redgifs.py @@ -24,7 +24,7 @@ class 
Redgifs(BaseDownloader): @staticmethod def _get_link(url: str) -> set[str]: try: - redgif_id = re.match(r'.*/(.*?)(\..{0,})?$', url).group(1) + redgif_id = re.match(r'.*/(.*?)(\..{3,})?$', url).group(1) except AttributeError: raise SiteDownloaderError(f'Could not extract Redgifs ID from {url}') From 7bd957aafa9b50538951afc87cb516a76b855fb0 Mon Sep 17 00:00:00 2001 From: SoulSuck24 <79275800+Soulsuck24@users.noreply.github.com> Date: Sun, 18 Sep 2022 14:32:12 -0400 Subject: [PATCH 094/110] Redo edge case coverage for Redgifs Cover edge cases that shouldn't ever happen but probably will sometime. --- bdfr/site_downloaders/redgifs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bdfr/site_downloaders/redgifs.py b/bdfr/site_downloaders/redgifs.py index 8d6ab21..2134aa3 100644 --- a/bdfr/site_downloaders/redgifs.py +++ b/bdfr/site_downloaders/redgifs.py @@ -24,7 +24,7 @@ class Redgifs(BaseDownloader): @staticmethod def _get_link(url: str) -> set[str]: try: - redgif_id = re.match(r'.*/(.*?)(\..{3,})?$', url).group(1) + redgif_id = re.match(r'.*/(.*?)(\..{0,})?$', url).group(1) except AttributeError: raise SiteDownloaderError(f'Could not extract Redgifs ID from {url}') From 106d7596b105f021e931fa064ca73d6591843a7b Mon Sep 17 00:00:00 2001 From: SoulSuck24 <79275800+Soulsuck24@users.noreply.github.com> Date: Sun, 18 Sep 2022 23:27:17 -0400 Subject: [PATCH 095/110] Imgur updates Update Imgur logic to cover malformed links that cause a redirect leading to the html of the page being saved as an image. 
--- bdfr/site_downloaders/download_factory.py | 4 +--- bdfr/site_downloaders/imgur.py | 10 ++++++---- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/bdfr/site_downloaders/download_factory.py b/bdfr/site_downloaders/download_factory.py index 96e9a42..e618b68 100644 --- a/bdfr/site_downloaders/download_factory.py +++ b/bdfr/site_downloaders/download_factory.py @@ -25,7 +25,7 @@ class DownloadFactory: @staticmethod def pull_lever(url: str) -> Type[BaseDownloader]: sanitised_url = DownloadFactory.sanitise_url(url) - if re.match(r'(i\.)?imgur.*\.gif.+$', sanitised_url): + if re.match(r'(i\.|m\.)?imgur', sanitised_url): return Imgur elif re.match(r'.*/.*\.\w{3,4}(\?[\w;&=]*)?$', sanitised_url) and \ not DownloadFactory.is_web_resource(sanitised_url): @@ -38,8 +38,6 @@ class DownloadFactory: return Gallery elif re.match(r'gfycat\.', sanitised_url): return Gfycat - elif re.match(r'(m\.)?imgur.*', sanitised_url): - return Imgur elif re.match(r'(redgifs|gifdeliverynetwork)', sanitised_url): return Redgifs elif re.match(r'reddit\.com/r/', sanitised_url): diff --git a/bdfr/site_downloaders/imgur.py b/bdfr/site_downloaders/imgur.py index 1f669d0..f895785 100644 --- a/bdfr/site_downloaders/imgur.py +++ b/bdfr/site_downloaders/imgur.py @@ -41,10 +41,12 @@ class Imgur(BaseDownloader): @staticmethod def _get_data(link: str) -> dict: - link = link.rstrip('?') - if re.match(r'(?i).*\.gif.+$', link): - link = link.replace('i.imgur', 'imgur') - link = re.sub('(?i)\\.gif.+$', '', link) + try: + imgur_id = re.match(r'.*/(.*?)(\..{0,})?$', link).group(1) + gallery = 'a/' if re.search(r'.*/(.*?)(gallery/|a/)', link) else '' + link = f'https://imgur.com/{gallery}{imgur_id}' + except AttributeError: + raise SiteDownloaderError(f'Could not extract Imgur ID from {link}') res = Imgur.retrieve_url(link, cookies={'over18': '1', 'postpagebeta': '0'}) From 5c343ef79067abfb9479b11a7287a8078ceb31aa Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Tue, 20 Sep 2022 11:09:39 +1000 Subject: 
[PATCH 096/110] Fix Redgifs tests --- tests/site_downloaders/test_redgifs.py | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/tests/site_downloaders/test_redgifs.py b/tests/site_downloaders/test_redgifs.py index b7ae3b3..b73ee95 100644 --- a/tests/site_downloaders/test_redgifs.py +++ b/tests/site_downloaders/test_redgifs.py @@ -2,6 +2,7 @@ # coding=utf-8 from unittest.mock import Mock +import re import pytest @@ -12,24 +13,26 @@ from bdfr.site_downloaders.redgifs import Redgifs @pytest.mark.online @pytest.mark.parametrize(('test_url', 'expected'), ( ('https://redgifs.com/watch/frighteningvictorioussalamander', - {'https://thumbs3.redgifs.com/FrighteningVictoriousSalamander.mp4'}), + {'FrighteningVictoriousSalamander.mp4'}), ('https://redgifs.com/watch/springgreendecisivetaruca', - {'https://thumbs3.redgifs.com/SpringgreenDecisiveTaruca.mp4'}), + {'SpringgreenDecisiveTaruca.mp4'}), ('https://www.redgifs.com/watch/palegoldenrodrawhalibut', - {'https://thumbs3.redgifs.com/PalegoldenrodRawHalibut.mp4'}), + {'PalegoldenrodRawHalibut.mp4'}), ('https://redgifs.com/watch/hollowintentsnowyowl', - {'https://thumbs3.redgifs.com/HollowIntentSnowyowl-large.jpg'}), + {'HollowIntentSnowyowl-large.jpg'}), ('https://www.redgifs.com/watch/lustrousstickywaxwing', - {'https://thumbs3.redgifs.com/EntireEnchantingHypsilophodon-large.jpg', - 'https://thumbs3.redgifs.com/FancyMagnificentAdamsstaghornedbeetle-large.jpg', - 'https://thumbs3.redgifs.com/LustrousStickyWaxwing-large.jpg', - 'https://thumbs3.redgifs.com/ParchedWindyArmyworm-large.jpg', - 'https://thumbs3.redgifs.com/ThunderousColorlessErmine-large.jpg', - 'https://thumbs3.redgifs.com/UnripeUnkemptWoodpecker-large.jpg'}), + {'EntireEnchantingHypsilophodon-large.jpg', + 'FancyMagnificentAdamsstaghornedbeetle-large.jpg', + 'LustrousStickyWaxwing-large.jpg', + 'ParchedWindyArmyworm-large.jpg', + 'ThunderousColorlessErmine-large.jpg', + 'UnripeUnkemptWoodpecker-large.jpg'}), )) def 
test_get_link(test_url: str, expected: set[str]): result = Redgifs._get_link(test_url) - assert result == expected + result = list(result) + patterns = [r'https://thumbs\d\.redgifs\.com/' + e + r'.*' for e in expected] + assert all([re.match(p, r) for p in patterns] for r in result) @pytest.mark.online From c4a9da06f6ee3b2a4fd51aa7ed40b66cdfa79d50 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Tue, 20 Sep 2022 17:20:43 +1000 Subject: [PATCH 097/110] Add dev requirements file --- dev_requirements.txt | 1 + 1 file changed, 1 insertion(+) create mode 100644 dev_requirements.txt diff --git a/dev_requirements.txt b/dev_requirements.txt new file mode 100644 index 0000000..e079f8a --- /dev/null +++ b/dev_requirements.txt @@ -0,0 +1 @@ +pytest From 1dff7500e7b2a08b3ba7c210bec5567506e27fe9 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Tue, 20 Sep 2022 17:33:44 +1000 Subject: [PATCH 098/110] Remove duplicate entries --- bdfr/site_downloaders/download_factory.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/bdfr/site_downloaders/download_factory.py b/bdfr/site_downloaders/download_factory.py index 0ebbf2c..f5e8d99 100644 --- a/bdfr/site_downloaders/download_factory.py +++ b/bdfr/site_downloaders/download_factory.py @@ -40,10 +40,6 @@ class DownloadFactory: return Gallery elif re.match(r'gfycat\.', sanitised_url): return Gfycat - elif re.match(r'(redgifs|gifdeliverynetwork)', sanitised_url): - return Redgifs - elif re.match(r'(m\.)?imgur.*', sanitised_url): - return Imgur elif re.match(r'reddit\.com/r/', sanitised_url): return SelfPost elif re.match(r'(m\.)?youtu\.?be', sanitised_url): From cd05bc388ec3d71fcf7f20f9e3e2708b0d0e26b9 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Tue, 20 Sep 2022 17:33:51 +1000 Subject: [PATCH 099/110] Fix tests --- tests/site_downloaders/test_download_factory.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/site_downloaders/test_download_factory.py b/tests/site_downloaders/test_download_factory.py 
index dcb5303..d3fec6f 100644 --- a/tests/site_downloaders/test_download_factory.py +++ b/tests/site_downloaders/test_download_factory.py @@ -24,12 +24,12 @@ from bdfr.site_downloaders.youtube import Youtube @pytest.mark.parametrize(('test_submission_url', 'expected_class'), ( ('https://www.reddit.com/r/TwoXChromosomes/comments/lu29zn/i_refuse_to_live_my_life' '_in_anything_but_comfort/', SelfPost), - ('https://i.imgur.com/bZx1SJQ.jpg', Direct), + ('https://i.imgur.com/bZx1SJQ.jpg', Imgur), ('https://i.redd.it/affyv0axd5k61.png', Direct), - ('https://imgur.com/3ls94yv.jpeg', Direct), + ('https://imgur.com/3ls94yv.jpeg', Imgur), ('https://i.imgur.com/BuzvZwb.gifv', Imgur), ('https://imgur.com/BuzvZwb.gifv', Imgur), - ('https://i.imgur.com/6fNdLst.gif', Direct), + ('https://i.imgur.com/6fNdLst.gif', Imgur), ('https://imgur.com/a/MkxAzeg', Imgur), ('https://i.imgur.com/OGeVuAe.giff', Imgur), ('https://www.reddit.com/gallery/lu93m7', Gallery), @@ -40,7 +40,7 @@ from bdfr.site_downloaders.youtube import Youtube ('https://www.gifdeliverynetwork.com/repulsivefinishedandalusianhorse', Redgifs), ('https://youtu.be/DevfjHOhuFc', Youtube), ('https://m.youtube.com/watch?v=kr-FeojxzUM', Youtube), - ('https://i.imgur.com/3SKrQfK.jpg?1', Direct), + ('https://i.imgur.com/3SKrQfK.jpg?1', Imgur), ('https://dynasty-scans.com/system/images_images/000/017/819/original/80215103_p0.png?1612232781', Direct), ('https://m.imgur.com/a/py3RW0j', Imgur), ('https://v.redd.it/9z1dnk3xr5k61', VReddit), From 7fef6c4023c67f9ce2fba5cb5b085e579099cdbd Mon Sep 17 00:00:00 2001 From: OMEGARAZER <869111+OMEGARAZER@users.noreply.github.com> Date: Fri, 23 Sep 2022 02:51:53 -0400 Subject: [PATCH 100/110] Update test_clone_integration.py Update broken ID's in clone integration test --- tests/integration_tests/test_clone_integration.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/integration_tests/test_clone_integration.py b/tests/integration_tests/test_clone_integration.py 
index 29ec416..f9bf91a 100644 --- a/tests/integration_tests/test_clone_integration.py +++ b/tests/integration_tests/test_clone_integration.py @@ -32,10 +32,10 @@ def create_basic_args_for_cloner_runner(test_args: list[str], tmp_path: Path): @pytest.mark.reddit @pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests') @pytest.mark.parametrize('test_args', ( - ['-l', 'm2601g'], + ['-l', '6l7778'], ['-s', 'TrollXChromosomes/', '-L', 1], - ['-l', 'tr79b'], - ['-l', 'tr6ky'], + ['-l', 'eiajjw'], + ['-l', 'xl0lhi'], )) def test_cli_scrape_general(test_args: list[str], tmp_path: Path): runner = CliRunner() From 39063868381cb7ee3ad60985887d5e0c495aac2f Mon Sep 17 00:00:00 2001 From: OMEGARAZER <869111+OMEGARAZER@users.noreply.github.com> Date: Fri, 23 Sep 2022 02:57:43 -0400 Subject: [PATCH 101/110] Update test_download_integration.py Update broken ID's in download integration test --- .../integration_tests/test_download_integration.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/tests/integration_tests/test_download_integration.py b/tests/integration_tests/test_download_integration.py index a474172..196dfc3 100644 --- a/tests/integration_tests/test_download_integration.py +++ b/tests/integration_tests/test_download_integration.py @@ -60,6 +60,7 @@ def test_cli_download_subreddits(test_args: list[str], tmp_path: Path): @pytest.mark.online @pytest.mark.reddit +@pytest.mark.slow @pytest.mark.authenticated @pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests') @pytest.mark.parametrize('test_args', ( @@ -94,8 +95,8 @@ def test_cli_download_user_specific_subreddits(test_args: list[str], tmp_path: P @pytest.mark.reddit @pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests') @pytest.mark.parametrize('test_args', ( - ['-l', 'm2601g'], - ['-l', 
'https://www.reddit.com/r/TrollXChromosomes/comments/m2601g/its_a_step_in_the_right_direction/'], + ['-l', '6l7778'], + ['-l', 'https://reddit.com/r/EmpireDidNothingWrong/comments/6l7778/technically_true/'], ['-l', 'm3hxzd'], # Really long title used to overflow filename limit ['-l', 'm5bqkf'], # Resource leading to a 404 )) @@ -266,7 +267,7 @@ def test_cli_download_use_default_config(tmp_path: Path): @pytest.mark.reddit @pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests') @pytest.mark.parametrize('test_args', ( - ['-l', 'm2601g', '--exclude-id', 'm2601g'], + ['-l', '6l7778', '--exclude-id', '6l7778'], )) def test_cli_download_links_exclusion(test_args: list[str], tmp_path: Path): runner = CliRunner() @@ -281,7 +282,7 @@ def test_cli_download_links_exclusion(test_args: list[str], tmp_path: Path): @pytest.mark.reddit @pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests') @pytest.mark.parametrize('test_args', ( - ['-l', 'm2601g', '--skip-subreddit', 'trollxchromosomes'], + ['-l', '6l7778', '--skip-subreddit', 'EmpireDidNothingWrong'], ['-s', 'trollxchromosomes', '--skip-subreddit', 'trollxchromosomes', '-L', '3'], )) def test_cli_download_subreddit_exclusion(test_args: list[str], tmp_path: Path): @@ -312,8 +313,8 @@ def test_cli_download_file_scheme_warning(test_args: list[str], tmp_path: Path): @pytest.mark.reddit @pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests') @pytest.mark.parametrize('test_args', ( - ['-l', 'm2601g', '--disable-module', 'SelfPost'], - ['-l', 'nnb9vs', '--disable-module', 'YtdlpFallback'], + ['-l', 'n9w9fo', '--disable-module', 'Direct'], + ['-l', 'nnb9vs', '--disable-module', 'VReddit'], )) def test_cli_download_disable_modules(test_args: list[str], tmp_path: Path): runner = CliRunner() From 9c067ad74feb5a393cc223f4654b8265ee0b5be5 Mon Sep 17 00:00:00 2001 From: OMEGARAZER 
<869111+OMEGARAZER@users.noreply.github.com> Date: Fri, 23 Sep 2022 02:59:16 -0400 Subject: [PATCH 102/110] Update test_connector.py Update user that was banned/suspended with one that should not end up that way. --- tests/test_connector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_connector.py b/tests/test_connector.py index e928000..142baa6 100644 --- a/tests/test_connector.py +++ b/tests/test_connector.py @@ -399,7 +399,7 @@ def test_read_submission_ids_from_file(downloader_mock: MagicMock, tmp_path: Pat @pytest.mark.online @pytest.mark.reddit @pytest.mark.parametrize('test_redditor_name', ( - 'Paracortex', + 'nasa', 'crowdstrike', 'HannibalGoddamnit', )) From ca33dee265951340f0f4684bdfd64a3e1ac85e3b Mon Sep 17 00:00:00 2001 From: OMEGARAZER <869111+OMEGARAZER@users.noreply.github.com> Date: Fri, 23 Sep 2022 10:00:41 -0400 Subject: [PATCH 103/110] Update test_download_integration.py The old change twice forget once. Forgot I changed it back to a SelfPost rather than Direct. 
--- tests/integration_tests/test_download_integration.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration_tests/test_download_integration.py b/tests/integration_tests/test_download_integration.py index 196dfc3..a9f0e0e 100644 --- a/tests/integration_tests/test_download_integration.py +++ b/tests/integration_tests/test_download_integration.py @@ -313,7 +313,7 @@ def test_cli_download_file_scheme_warning(test_args: list[str], tmp_path: Path): @pytest.mark.reddit @pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests') @pytest.mark.parametrize('test_args', ( - ['-l', 'n9w9fo', '--disable-module', 'Direct'], + ['-l', 'n9w9fo', '--disable-module', 'SelfPost'], ['-l', 'nnb9vs', '--disable-module', 'VReddit'], )) def test_cli_download_disable_modules(test_args: list[str], tmp_path: Path): From 57e59db458ede312cb140d44b1d54eb5cae5f19f Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Thu, 22 Sep 2022 16:31:42 +1000 Subject: [PATCH 104/110] Update Erome link regex --- tests/site_downloaders/test_erome.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/site_downloaders/test_erome.py b/tests/site_downloaders/test_erome.py index e06fab5..2f3701d 100644 --- a/tests/site_downloaders/test_erome.py +++ b/tests/site_downloaders/test_erome.py @@ -11,16 +11,16 @@ from bdfr.site_downloaders.erome import Erome @pytest.mark.online @pytest.mark.parametrize(('test_url', 'expected_urls'), ( ('https://www.erome.com/a/vqtPuLXh', ( - r'https://s\d+.erome.com/365/vqtPuLXh/KH2qBT99_480p.mp4', + r'https://[a-z]\d+.erome.com/\d{3}/vqtPuLXh/KH2qBT99_480p.mp4', )), ('https://www.erome.com/a/ORhX0FZz', ( - r'https://s\d+.erome.com/355/ORhX0FZz/9IYQocM9_480p.mp4', - r'https://s\d+.erome.com/355/ORhX0FZz/9eEDc8xm_480p.mp4', - r'https://s\d+.erome.com/355/ORhX0FZz/EvApC7Rp_480p.mp4', - r'https://s\d+.erome.com/355/ORhX0FZz/LruobtMs_480p.mp4', - 
r'https://s\d+.erome.com/355/ORhX0FZz/TJNmSUU5_480p.mp4', - r'https://s\d+.erome.com/355/ORhX0FZz/X11Skh6Z_480p.mp4', - r'https://s\d+.erome.com/355/ORhX0FZz/bjlTkpn7_480p.mp4' + r'https://[a-z]\d+.erome.com/\d{3}/ORhX0FZz/9IYQocM9_480p.mp4', + r'https://[a-z]\d+.erome.com/\d{3}/ORhX0FZz/9eEDc8xm_480p.mp4', + r'https://[a-z]\d+.erome.com/\d{3}/ORhX0FZz/EvApC7Rp_480p.mp4', + r'https://[a-z]\d+.erome.com/\d{3}/ORhX0FZz/LruobtMs_480p.mp4', + r'https://[a-z]\d+.erome.com/\d{3}/ORhX0FZz/TJNmSUU5_480p.mp4', + r'https://[a-z]\d+.erome.com/\d{3}/ORhX0FZz/X11Skh6Z_480p.mp4', + r'https://[a-z]\d+.erome.com/\d{3}/ORhX0FZz/bjlTkpn7_480p.mp4' )), )) def test_get_link(test_url: str, expected_urls: tuple[str]): From 7bb2a9adbbf6047a2291028c3f9eb5659a7d1b1f Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Thu, 22 Sep 2022 16:37:26 +1000 Subject: [PATCH 105/110] Remove obsolete test --- tests/site_downloaders/test_vidble.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/tests/site_downloaders/test_vidble.py b/tests/site_downloaders/test_vidble.py index 4d71022..f6ddd56 100644 --- a/tests/site_downloaders/test_vidble.py +++ b/tests/site_downloaders/test_vidble.py @@ -53,9 +53,6 @@ def test_get_links(test_url: str, expected: set[str]): 'b31a942cd8cdda218ed547bbc04c3a27', '6f77c570b451eef4222804bd52267481', }), - ('https://www.vidble.com/watch?v=joC6b7cgs2Tnucx7dhDoyqKPbr7TQUA5', { - 'ec5f7a7f74a4dd55c740cbfd4d3bf9ab', - }), ('https://www.vidble.com/pHuwWkOcEb', { '585f486dd0b2f23a57bddbd5bf185bc7', }), From c834314086d73a29b97193b9436340d3001bc2c1 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Thu, 22 Sep 2022 16:56:31 +1000 Subject: [PATCH 106/110] Update hash --- tests/site_downloaders/test_vreddit.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/site_downloaders/test_vreddit.py b/tests/site_downloaders/test_vreddit.py index f1f8219..da05c1b 100644 --- a/tests/site_downloaders/test_vreddit.py +++ b/tests/site_downloaders/test_vreddit.py @@ -13,7 +13,7 
@@ from bdfr.site_downloaders.vreddit import VReddit @pytest.mark.online @pytest.mark.slow @pytest.mark.parametrize(('test_url', 'expected_hash'), ( - ('https://www.reddit.com/user/Xomb_Forever/comments/u5p2kj/hold_up/', '690cffe27a7884196437926c22897216'), + ('https://www.reddit.com/user/Xomb_Forever/comments/u5p2kj/hold_up/', '379ef5cd87203544d51caee31e72d210'), )) def test_find_resources_good(test_url: str, expected_hash: str): test_submission = MagicMock() From 3b5f8bca676a76013e1a2e8c0ff6db2e06df2a6f Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Thu, 22 Sep 2022 16:57:30 +1000 Subject: [PATCH 107/110] Update hashes --- .../fallback_downloaders/test_ytdlp_fallback.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/site_downloaders/fallback_downloaders/test_ytdlp_fallback.py b/tests/site_downloaders/fallback_downloaders/test_ytdlp_fallback.py index 9aeca98..67359ec 100644 --- a/tests/site_downloaders/fallback_downloaders/test_ytdlp_fallback.py +++ b/tests/site_downloaders/fallback_downloaders/test_ytdlp_fallback.py @@ -36,8 +36,8 @@ def test_info_extraction_bad(test_url: str): @pytest.mark.parametrize(('test_url', 'expected_hash'), ( ('https://streamable.com/dt46y', 'b7e465adaade5f2b6d8c2b4b7d0a2878'), ('https://streamable.com/t8sem', '49b2d1220c485455548f1edbc05d4ecf'), - ('https://www.reddit.com/r/specializedtools/comments/n2nw5m/bamboo_splitter/', '49316899440ea1c3b74d5640d9d527c1'), - ('https://v.redd.it/9z1dnk3xr5k61', '76d5e6d7f4f9e1910c6c22b54dfa804f'), + ('https://www.reddit.com/r/specializedtools/comments/n2nw5m/bamboo_splitter/', '03087ce64f88f438bad6849858c9b7f0'), + ('https://v.redd.it/9z1dnk3xr5k61', '9ce39c8e46b6534a0b3f164a792d51c8'), )) def test_find_resources(test_url: str, expected_hash: str): test_submission = MagicMock() From d4664d784f5929a7f66b0c8468f3c9203e36d331 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Tue, 27 Sep 2022 10:52:04 +1000 Subject: [PATCH 108/110] Update yt-dlp requirement version --- 
requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 8ceffdb..83378f0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,4 +6,4 @@ ffmpeg-python>=0.2.0 praw>=7.2.0 pyyaml>=5.4.1 requests>=2.25.1 -yt-dlp>=2021.9.25 \ No newline at end of file +yt-dlp>=2022.9.1 \ No newline at end of file From b7d21161fb4d3b20c05dd1d42f527c8780e80015 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Tue, 27 Sep 2022 10:53:12 +1000 Subject: [PATCH 109/110] Update test --- .../fallback_downloaders/test_ytdlp_fallback.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/site_downloaders/fallback_downloaders/test_ytdlp_fallback.py b/tests/site_downloaders/fallback_downloaders/test_ytdlp_fallback.py index 67359ec..92ba27d 100644 --- a/tests/site_downloaders/fallback_downloaders/test_ytdlp_fallback.py +++ b/tests/site_downloaders/fallback_downloaders/test_ytdlp_fallback.py @@ -15,7 +15,7 @@ from bdfr.site_downloaders.fallback_downloaders.ytdlp_fallback import YtdlpFallb ('https://www.youtube.com/watch?v=P19nvJOmqCc', True), ('https://www.example.com/test', False), ('https://milesmatrix.bandcamp.com/album/la-boum/', False), - ('https://v.redd.it/54i8fvzev3u81', False), + ('https://v.redd.it/54i8fvzev3u81', True), )) def test_can_handle_link(test_url: str, expected: bool): result = YtdlpFallback.can_handle_link(test_url) From 0ce2585f7f27fc8cacc22d10b1fc2885ea2c2a68 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Tue, 27 Sep 2022 10:59:39 +1000 Subject: [PATCH 110/110] Update path so tests do not skip --- tests/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/conftest.py b/tests/conftest.py index da02948..a61d8d5 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -23,7 +23,7 @@ def reddit_instance(): @pytest.fixture(scope='session') def authenticated_reddit_instance(): - test_config_path = Path('test_config.cfg') + test_config_path = 
Path('./tests/test_config.cfg') if not test_config_path.exists(): pytest.skip('Refresh token must be provided to authenticate with OAuth2') cfg_parser = configparser.ConfigParser()