From 77aaee96f3dbfbbd042948d285ebca7ff1055424 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Mon, 19 Jul 2021 18:44:54 +1000 Subject: [PATCH 01/15] Fix bug with deleted galleries --- bdfr/site_downloaders/gallery.py | 2 +- tests/site_downloaders/test_gallery.py | 11 +++++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/bdfr/site_downloaders/gallery.py b/bdfr/site_downloaders/gallery.py index 62fec60..cd34416 100644 --- a/bdfr/site_downloaders/gallery.py +++ b/bdfr/site_downloaders/gallery.py @@ -21,7 +21,7 @@ class Gallery(BaseDownloader): def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]: try: image_urls = self._get_links(self.post.gallery_data['items']) - except AttributeError: + except (AttributeError, TypeError): try: image_urls = self._get_links(self.post.crosspost_parent_list[0]['gallery_data']['items']) except (AttributeError, IndexError, TypeError): diff --git a/tests/site_downloaders/test_gallery.py b/tests/site_downloaders/test_gallery.py index 51045f8..f84650d 100644 --- a/tests/site_downloaders/test_gallery.py +++ b/tests/site_downloaders/test_gallery.py @@ -4,6 +4,7 @@ import praw import pytest +from bdfr.exceptions import SiteDownloaderError from bdfr.site_downloaders.gallery import Gallery @@ -68,3 +69,13 @@ def test_gallery_download(test_submission_id: str, expected_hashes: set[str], re [res.download(120) for res in results] hashes = [res.hash.hexdigest() for res in results] assert set(hashes) == expected_hashes + + +@pytest.mark.parametrize('test_id', ( + 'n0pyzp', +)) +def test_gallery_download_raises_right_error(test_id: str, reddit_instance: praw.Reddit): + test_submission = reddit_instance.submission(id=test_id) + gallery = Gallery(test_submission) + with pytest.raises(SiteDownloaderError): + gallery.find_resources() From 1a4ff07f78f51dfbbe70c089b440e1c0f169be08 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Mon, 5 Jul 2021 16:58:33 +1000 Subject: [PATCH 02/15] Add ability to read IDs from files --- bdfr/__main__.py | 11 ++++++----- bdfr/configuration.py | 1 + bdfr/connector.py | 16 +++++++++++----- .../test_download_integration.py | 14 ++++++++++++++ tests/test_connector.py | 5 ++--- 5 files changed, 34 insertions(+), 13 deletions(-) diff --git a/bdfr/__main__.py b/bdfr/__main__.py index 67e4f99..367f8c6 100644 --- a/bdfr/__main__.py +++ b/bdfr/__main__.py @@ -6,9 +6,9 @@ import sys import click from bdfr.archiver import Archiver +from bdfr.cloner import RedditCloner from bdfr.configuration import Configuration from bdfr.downloader import RedditDownloader -from bdfr.cloner import RedditCloner logger = logging.getLogger() @@ -17,6 +17,7 @@ _common_options = [ click.option('--authenticate', is_flag=True, default=None), click.option('--config', type=str, default=None), click.option('--disable-module', multiple=True, default=None, type=str), + click.option('--include-id-file', multiple=True, default=None), click.option('--log', type=str, default=None), click.option('--saved', is_flag=True, default=None), click.option('--search', default=None, type=str), @@ -26,12 +27,12 @@ _common_options = [ click.option('-L', '--limit', default=None, type=int), click.option('-l', '--link', multiple=True, default=None, type=str), click.option('-m', '--multireddit', multiple=True, default=None, type=str), + click.option('-S', '--sort', type=click.Choice(('hot', 'top', 'new', 'controversial', 'rising', 'relevance')), + default=None), click.option('-s', '--subreddit', multiple=True, default=None, type=str), - click.option('-v', '--verbose', default=None, count=True), - click.option('-u', '--user', type=str, multiple=True, default=None), click.option('-t', '--time', type=click.Choice(('all', 'hour', 'day', 'week', 'month', 'year')), default=None), - click.option('-S', '--sort', type=click.Choice(('hot', 'top', 'new', - 'controversial', 'rising', 'relevance')), default=None), + click.option('-u', '--user', type=str, multiple=True, default=None), + click.option('-v', '--verbose', default=None, count=True), ] _downloader_options = [ diff --git a/bdfr/configuration.py b/bdfr/configuration.py index 36a1860..bc4c541 100644 --- a/bdfr/configuration.py +++ b/bdfr/configuration.py @@ -18,6 +18,7 @@ class Configuration(Namespace): self.exclude_id_file = [] self.file_scheme: str = '{REDDITOR}_{TITLE}_{POSTID}' self.folder_scheme: str = '{SUBREDDIT}' + self.include_id_file = [] self.limit: Optional[int] = None self.link: list[str] = [] self.log: Optional[str] = None diff --git a/bdfr/connector.py b/bdfr/connector.py index 0e78c8c..a379847 100644 --- a/bdfr/connector.py +++ b/bdfr/connector.py @@ -3,6 +3,7 @@ import configparser import importlib.resources +import itertools import logging import logging.handlers import re @@ -78,7 +79,12 @@ class RedditConnector(metaclass=ABCMeta): self.create_reddit_instance() self.args.user = list(filter(None, [self.resolve_user_name(user) for user in self.args.user])) - self.excluded_submission_ids = self.read_excluded_ids() + self.excluded_submission_ids = set.union( + self.read_id_files(self.args.exclude_id_file), + set(self.args.exclude_id), + ) + + self.args.link = list(itertools.chain(self.args.link, self.read_id_files(self.args.include_id_file))) self.master_hash_list = {} self.authenticator = self.create_authenticator() @@ -403,13 +409,13 @@ class RedditConnector(metaclass=ABCMeta): except prawcore.Forbidden: raise errors.BulkDownloaderException(f'Source {subreddit.display_name} is private and cannot be scraped') - def read_excluded_ids(self) -> set[str]: + @staticmethod + def read_id_files(file_locations: list[str]) -> set[str]: out = [] - out.extend(self.args.exclude_id) - for id_file in self.args.exclude_id_file: + for id_file in file_locations: id_file = Path(id_file).resolve().expanduser() if not id_file.exists(): - logger.warning(f'ID exclusion file at {id_file} does not exist') + logger.warning(f'ID file at {id_file} does not exist') continue with open(id_file, 'r') as file: for line in file: diff --git a/tests/integration_tests/test_download_integration.py b/tests/integration_tests/test_download_integration.py index 305fe99..cb4a273 100644 --- a/tests/integration_tests/test_download_integration.py +++ b/tests/integration_tests/test_download_integration.py @@ -306,3 +306,17 @@ def test_cli_download_disable_modules(test_args: list[str], tmp_path: Path): assert result.exit_code == 0 assert 'skipped due to disabled module' in result.output assert 'Downloaded submission' not in result.output + + +@pytest.mark.online +@pytest.mark.reddit +@pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests') +def test_cli_download_include_id_file(tmp_path: Path): + test_file = Path(tmp_path, 'include.txt') + test_args = ['--include-id-file', str(test_file)] + test_file.write_text('odr9wg\nody576') + runner = CliRunner() + test_args = create_basic_args_for_download_runner(test_args, tmp_path) + result = runner.invoke(cli, test_args) + assert result.exit_code == 0 + assert 'Downloaded submission' in result.output diff --git a/tests/test_connector.py b/tests/test_connector.py index 15eede1..2dd76f9 100644 --- a/tests/test_connector.py +++ b/tests/test_connector.py @@ -339,11 +339,10 @@ def test_split_subreddit_entries(test_subreddit_entries: list[str], expected: se assert results == expected -def test_read_excluded_submission_ids_from_file(downloader_mock: MagicMock, tmp_path: Path): +def test_read_submission_ids_from_file(downloader_mock: MagicMock, tmp_path: Path): test_file = tmp_path / 'test.txt' test_file.write_text('aaaaaa\nbbbbbb') - downloader_mock.args.exclude_id_file = [test_file] - results = RedditConnector.read_excluded_ids(downloader_mock) + results = RedditConnector.read_id_files([str(test_file)]) assert results == {'aaaaaa', 'bbbbbb'} From 7a1663db51895e849aa112ec51b6b2c8a4301da7 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Mon, 5 Jul 2021 17:02:19 +1000 Subject: [PATCH 03/15] Update README --- README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/README.md b/README.md index be4f455..89a4e90 100644 --- a/README.md +++ b/README.md @@ -76,6 +76,10 @@ The following options are common between both the `archive` and `download` comma - Can be specified multiple times - Disables certain modules from being used - See [Disabling Modules](#disabling-modules) for more information and a list of module names +- `--include-id-file` + - This will add any submission with the IDs in the files provided + - Can be specified multiple times + - Format is one ID per line - `--log` - This allows one to specify the location of the logfile - This must be done when running multiple instances of the BDFR, see [Multiple Instances](#multiple-instances) below From 44453b1707abc6559b3d9bc05d4ad53c8ffc7fbe Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Tue, 27 Jul 2021 13:12:50 +1000 Subject: [PATCH 04/15] Update tests --- tests/site_downloaders/test_gallery.py | 5 +---- tests/site_downloaders/test_gfycat.py | 2 -- tests/site_downloaders/test_redgifs.py | 9 +++------ 3 files changed, 4 insertions(+), 12 deletions(-) diff --git a/tests/site_downloaders/test_gallery.py b/tests/site_downloaders/test_gallery.py index f84650d..4e5d9f1 100644 --- a/tests/site_downloaders/test_gallery.py +++ b/tests/site_downloaders/test_gallery.py @@ -53,10 +53,6 @@ def test_gallery_get_links(test_ids: list[dict], expected: set[str]): '808c35267f44acb523ce03bfa5687404', 'ec8b65bdb7f1279c4b3af0ea2bbb30c3', }), - ('nxyahw', { - 'b89a3f41feb73ec1136ec4ffa7353eb1', - 'cabb76fd6fd11ae6e115a2039eb09f04', - }), ('obkflw', { '65163f685fb28c5b776e0e77122718be', '2a337eb5b13c34d3ca3f51b5db7c13e9', @@ -73,6 +69,7 @@ def test_gallery_download(test_submission_id: str, expected_hashes: set[str], re @pytest.mark.parametrize('test_id', ( 'n0pyzp', + 'nxyahw', )) def test_gallery_download_raises_right_error(test_id: str, reddit_instance: praw.Reddit): test_submission = reddit_instance.submission(id=test_id) diff --git a/tests/site_downloaders/test_gfycat.py b/tests/site_downloaders/test_gfycat.py index 56aa2d0..3a405f8 100644 --- a/tests/site_downloaders/test_gfycat.py +++ b/tests/site_downloaders/test_gfycat.py @@ -13,8 +13,6 @@ from bdfr.site_downloaders.gfycat import Gfycat @pytest.mark.parametrize(('test_url', 'expected_url'), ( ('https://gfycat.com/definitivecaninecrayfish', 'https://giant.gfycat.com/DefinitiveCanineCrayfish.mp4'), ('https://gfycat.com/dazzlingsilkyiguana', 'https://giant.gfycat.com/DazzlingSilkyIguana.mp4'), - ('https://gfycat.com/webbedimpurebutterfly', 'https://thumbs2.redgifs.com/WebbedImpureButterfly.mp4'), - ('https://gfycat.com/CornyLoathsomeHarrierhawk', 'https://thumbs2.redgifs.com/CornyLoathsomeHarrierhawk.mp4') )) def test_get_link(test_url: str, expected_url: str): result = Gfycat._get_link(test_url) diff --git a/tests/site_downloaders/test_redgifs.py b/tests/site_downloaders/test_redgifs.py index 476149f..097fbf4 100644 --- a/tests/site_downloaders/test_redgifs.py +++ b/tests/site_downloaders/test_redgifs.py @@ -15,10 +15,8 @@ from bdfr.site_downloaders.redgifs import Redgifs 'https://thumbs2.redgifs.com/FrighteningVictoriousSalamander.mp4'), ('https://redgifs.com/watch/springgreendecisivetaruca', 'https://thumbs2.redgifs.com/SpringgreenDecisiveTaruca.mp4'), - ('https://www.gifdeliverynetwork.com/regalshoddyhorsechestnutleafminer', - 'https://thumbs2.redgifs.com/RegalShoddyHorsechestnutleafminer.mp4'), - ('https://www.gifdeliverynetwork.com/maturenexthippopotamus', - 'https://thumbs2.redgifs.com/MatureNextHippopotamus.mp4'), + ('https://www.redgifs.com/watch/palegoldenrodrawhalibut', + 'https://thumbs2.redgifs.com/PalegoldenrodRawHalibut.mp4'), )) def test_get_link(test_url: str, expected: str): result = Redgifs._get_link(test_url) @@ -29,9 +27,8 @@ def test_get_link(test_url: str, expected: str): @pytest.mark.parametrize(('test_url', 'expected_hash'), ( ('https://redgifs.com/watch/frighteningvictorioussalamander', '4007c35d9e1f4b67091b5f12cffda00a'), ('https://redgifs.com/watch/springgreendecisivetaruca', '8dac487ac49a1f18cc1b4dabe23f0869'), - ('https://www.gifdeliverynetwork.com/maturenexthippopotamus', '9bec0a9e4163a43781368ed5d70471df'), - ('https://www.gifdeliverynetwork.com/regalshoddyhorsechestnutleafminer', '8afb4e2c090a87140230f2352bf8beba'), ('https://redgifs.com/watch/leafysaltydungbeetle', '076792c660b9c024c0471ef4759af8bd'), + ('https://www.redgifs.com/watch/palegoldenrodrawhalibut', '46d5aa77fe80c6407de1ecc92801c10e'), )) def test_download_resource(test_url: str, expected_hash: str): mock_submission = Mock() From 3cdae99490e54bc6eb0da452cce2b3048da10786 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Tue, 27 Jul 2021 13:39:49 +1000 Subject: [PATCH 05/15] Implement callbacks for downloading --- bdfr/archiver.py | 6 +- bdfr/downloader.py | 2 +- bdfr/resource.py | 50 +++++++------- bdfr/site_downloaders/direct.py | 2 +- bdfr/site_downloaders/erome.py | 2 +- .../youtubedl_fallback.py | 26 ++++---- bdfr/site_downloaders/gallery.py | 2 +- bdfr/site_downloaders/imgur.py | 2 +- bdfr/site_downloaders/pornhub.py | 7 +- bdfr/site_downloaders/redgifs.py | 2 +- bdfr/site_downloaders/self_post.py | 2 +- bdfr/site_downloaders/youtube.py | 65 ++++++++++++------- tests/site_downloaders/test_direct.py | 2 +- tests/site_downloaders/test_erome.py | 2 +- tests/site_downloaders/test_gallery.py | 2 +- tests/site_downloaders/test_gfycat.py | 2 +- tests/site_downloaders/test_imgur.py | 2 +- tests/site_downloaders/test_pornhub.py | 2 +- tests/site_downloaders/test_redgifs.py | 2 +- tests/site_downloaders/test_youtube.py | 2 +- tests/test_download_filter.py | 4 +- tests/test_file_name_formatter.py | 10 +-- tests/test_resource.py | 6 +- 23 files changed, 112 insertions(+), 92 deletions(-) diff --git a/bdfr/archiver.py b/bdfr/archiver.py index 74b92e8..d445e8d 100644 --- a/bdfr/archiver.py +++ b/bdfr/archiver.py @@ -76,17 +76,17 @@ class Archiver(RedditConnector): logger.info(f'Record for entry item {praw_item.id} written to disk') def _write_entry_json(self, entry: BaseArchiveEntry): - resource = Resource(entry.source, '', '.json') + resource = Resource(entry.source, '', lambda: None, '.json') content = json.dumps(entry.compile()) self._write_content_to_disk(resource, content) def _write_entry_xml(self, entry: BaseArchiveEntry): - resource = Resource(entry.source, '', '.xml') + resource = Resource(entry.source, '', lambda: None, '.xml') content = dict2xml.dict2xml(entry.compile(), wrap='root') self._write_content_to_disk(resource, content) def _write_entry_yaml(self, entry: BaseArchiveEntry): - resource = Resource(entry.source, '', '.yaml') + resource = Resource(entry.source, '', lambda: None, '.yaml') content = yaml.dump(entry.compile()) self._write_content_to_disk(resource, content) diff --git a/bdfr/downloader.py b/bdfr/downloader.py index f4220db..69aa818 100644 --- a/bdfr/downloader.py +++ b/bdfr/downloader.py @@ -82,7 +82,7 @@ class RedditDownloader(RedditConnector): logger.debug(f'Download filter removed {submission.id} file with URL {submission.url}') continue try: - res.download(self.args.max_wait_time) + res.download() except errors.BulkDownloaderException as e: logger.error(f'Failed to download resource {res.url} in submission {submission.id} ' f'with downloader {downloader_class.__name__}: {e}') diff --git a/bdfr/resource.py b/bdfr/resource.py index e8f9fd1..8f874ef 100644 --- a/bdfr/resource.py +++ b/bdfr/resource.py @@ -6,7 +6,7 @@ import logging import re import time import urllib.parse -from typing import Optional +from typing import Callable, Optional import _hashlib import requests @@ -18,40 +18,44 @@ logger = logging.getLogger(__name__) class Resource: - def __init__(self, source_submission: Submission, url: str, extension: str = None): + def __init__(self, source_submission: Submission, url: str, download_function: Callable, extension: str = None): self.source_submission = source_submission self.content: Optional[bytes] = None self.url = url self.hash: Optional[_hashlib.HASH] = None self.extension = extension + self.download_function = download_function if not self.extension: self.extension = self._determine_extension() @staticmethod - def retry_download(url: str, max_wait_time: int, current_wait_time: int = 60) -> Optional[bytes]: - try: - response = requests.get(url) - if re.match(r'^2\d{2}', str(response.status_code)) and response.content: - return response.content - elif response.status_code in (408, 429): - raise requests.exceptions.ConnectionError(f'Response code {response.status_code}') - else: - raise BulkDownloaderException( - f'Unrecoverable error requesting resource: HTTP Code {response.status_code}') - except (requests.exceptions.ConnectionError, requests.exceptions.ChunkedEncodingError) as e: - logger.warning(f'Error occured downloading from {url}, waiting {current_wait_time} seconds: {e}') - time.sleep(current_wait_time) - if current_wait_time < max_wait_time: - current_wait_time += 60 - return Resource.retry_download(url, max_wait_time, current_wait_time) - else: - logger.error(f'Max wait time exceeded for resource at url {url}') - raise + def retry_download(url: str, max_wait_time: int) -> Callable: + def http_download() -> Optional[bytes]: + current_wait_time = 60 + while True: + try: + response = requests.get(url) + if re.match(r'^2\d{2}', str(response.status_code)) and response.content: + return response.content + elif response.status_code in (408, 429): + raise requests.exceptions.ConnectionError(f'Response code {response.status_code}') + else: + raise BulkDownloaderException( + f'Unrecoverable error requesting resource: HTTP Code {response.status_code}') + except (requests.exceptions.ConnectionError, requests.exceptions.ChunkedEncodingError) as e: + logger.warning(f'Error occured downloading from {url}, waiting {current_wait_time} seconds: {e}') + time.sleep(current_wait_time) + if current_wait_time < max_wait_time: + current_wait_time += 60 + else: + logger.error(f'Max wait time exceeded for resource at url {url}') + raise + return http_download - def download(self, max_wait_time: int): + def download(self): if not self.content: try: - content = self.retry_download(self.url, max_wait_time) + content = self.download_function() except requests.exceptions.ConnectionError as e: raise BulkDownloaderException(f'Could not download resource: {e}') except BulkDownloaderException: diff --git a/bdfr/site_downloaders/direct.py b/bdfr/site_downloaders/direct.py index 106f251..df1a469 100644 --- a/bdfr/site_downloaders/direct.py +++ b/bdfr/site_downloaders/direct.py @@ -14,4 +14,4 @@ class Direct(BaseDownloader): super().__init__(post) def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]: - return [Resource(self.post, self.post.url)] + return [Resource(self.post, self.post.url, Resource.retry_download(self.post.url, 300))] diff --git a/bdfr/site_downloaders/erome.py b/bdfr/site_downloaders/erome.py index bd29ea4..69b9ae3 100644 --- a/bdfr/site_downloaders/erome.py +++ b/bdfr/site_downloaders/erome.py @@ -29,7 +29,7 @@ class Erome(BaseDownloader): for link in links: if not re.match(r'https?://.*', link): link = 'https://' + link - out.append(Resource(self.post, link)) + out.append(Resource(self.post, link, Resource.retry_download(link, 300))) return out @staticmethod diff --git a/bdfr/site_downloaders/fallback_downloaders/youtubedl_fallback.py b/bdfr/site_downloaders/fallback_downloaders/youtubedl_fallback.py index 281182a..6ede405 100644 --- a/bdfr/site_downloaders/fallback_downloaders/youtubedl_fallback.py +++ b/bdfr/site_downloaders/fallback_downloaders/youtubedl_fallback.py @@ -4,7 +4,6 @@ import logging from typing import Optional -import youtube_dl from praw.models import Submission from bdfr.resource import Resource @@ -20,21 +19,18 @@ class YoutubeDlFallback(BaseFallbackDownloader, Youtube): super(YoutubeDlFallback, self).__init__(post) def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]: - out = super()._download_video({}) + out = Resource( + self.post, + self.post.url, + super()._download_video({}), + super().get_video_attributes(self.post.url)['ext'], + ) return [out] @staticmethod def can_handle_link(url: str) -> bool: - yt_logger = logging.getLogger('youtube-dl') - yt_logger.setLevel(logging.CRITICAL) - with youtube_dl.YoutubeDL({ - 'logger': yt_logger, - }) as ydl: - try: - result = ydl.extract_info(url, download=False) - if result: - return True - except Exception as e: - logger.exception(e) - return False - return False + attributes = YoutubeDlFallback.get_video_attributes(url) + if attributes: + return True + else: + return False diff --git a/bdfr/site_downloaders/gallery.py b/bdfr/site_downloaders/gallery.py index cd34416..c016d28 100644 --- a/bdfr/site_downloaders/gallery.py +++ b/bdfr/site_downloaders/gallery.py @@ -31,7 +31,7 @@ class Gallery(BaseDownloader): if not image_urls: raise SiteDownloaderError('No images found in Reddit gallery') - return [Resource(self.post, url) for url in image_urls] + return [Resource(self.post, url, Resource.retry_download(url, 300)) for url in image_urls] @ staticmethod def _get_links(id_dict: list[dict]) -> list[str]: diff --git a/bdfr/site_downloaders/imgur.py b/bdfr/site_downloaders/imgur.py index 44a62f1..79a1115 100644 --- a/bdfr/site_downloaders/imgur.py +++ b/bdfr/site_downloaders/imgur.py @@ -33,7 +33,7 @@ class Imgur(BaseDownloader): def _compute_image_url(self, image: dict) -> Resource: image_url = 'https://i.imgur.com/' + image['hash'] + self._validate_extension(image['ext']) - return Resource(self.post, image_url) + return Resource(self.post, image_url, Resource.retry_download(image_url, 300)) @staticmethod def _get_data(link: str) -> dict: diff --git a/bdfr/site_downloaders/pornhub.py b/bdfr/site_downloaders/pornhub.py index 6658d7e..c2bc0ad 100644 --- a/bdfr/site_downloaders/pornhub.py +++ b/bdfr/site_downloaders/pornhub.py @@ -22,5 +22,10 @@ class PornHub(Youtube): 'format': 'best', 'nooverwrites': True, } - out = self._download_video(ytdl_options) + out = Resource( + self.post, + self.post.url, + super()._download_video(ytdl_options), + super().get_video_attributes(self.post.url)['ext'], + ) return [out] diff --git a/bdfr/site_downloaders/redgifs.py b/bdfr/site_downloaders/redgifs.py index 9cfec02..d4989e7 100644 --- a/bdfr/site_downloaders/redgifs.py +++ b/bdfr/site_downloaders/redgifs.py @@ -18,7 +18,7 @@ class Redgifs(BaseDownloader): def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]: media_url = self._get_link(self.post.url) - return [Resource(self.post, media_url, '.mp4')] + return [Resource(self.post, media_url, Resource.retry_download(media_url, 300), '.mp4')] @staticmethod def _get_link(url: str) -> str: diff --git a/bdfr/site_downloaders/self_post.py b/bdfr/site_downloaders/self_post.py index cb922ee..6e4ce0e 100644 --- a/bdfr/site_downloaders/self_post.py +++ b/bdfr/site_downloaders/self_post.py @@ -17,7 +17,7 @@ class SelfPost(BaseDownloader): super().__init__(post) def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]: - out = Resource(self.post, self.post.url, '.txt') + out = Resource(self.post, self.post.url, lambda: None, '.txt') out.content = self.export_to_string().encode('utf-8') out.create_hash() return [out] diff --git a/bdfr/site_downloaders/youtube.py b/bdfr/site_downloaders/youtube.py index 8b93b23..126cb6a 100644 --- a/bdfr/site_downloaders/youtube.py +++ b/bdfr/site_downloaders/youtube.py @@ -3,12 +3,12 @@ import logging import tempfile from pathlib import Path -from typing import Optional +from typing import Callable, Optional import youtube_dl from praw.models import Submission -from bdfr.exceptions import (NotADownloadableLinkError, SiteDownloaderError) +from bdfr.exceptions import NotADownloadableLinkError, SiteDownloaderError from bdfr.resource import Resource from bdfr.site_authenticator import SiteAuthenticator from bdfr.site_downloaders.base_downloader import BaseDownloader @@ -26,32 +26,47 @@ class Youtube(BaseDownloader): 'playlistend': 1, 'nooverwrites': True, } - out = self._download_video(ytdl_options) - return [out] + download_function = self._download_video(ytdl_options) + try: + extension = self.get_video_attributes(self.post.url)['ext'] + except KeyError: + raise NotADownloadableLinkError(f'Youtube-DL cannot download URL {self.post.url}') + res = Resource(self.post, self.post.url, download_function, extension) + return [res] - def _download_video(self, ytdl_options: dict) -> Resource: + def _download_video(self, ytdl_options: dict) -> Callable: yt_logger = logging.getLogger('youtube-dl') yt_logger.setLevel(logging.CRITICAL) ytdl_options['quiet'] = True ytdl_options['logger'] = yt_logger - with tempfile.TemporaryDirectory() as temp_dir: - download_path = Path(temp_dir).resolve() - ytdl_options['outtmpl'] = str(download_path) + '/' + 'test.%(ext)s' - try: - with youtube_dl.YoutubeDL(ytdl_options) as ydl: - ydl.download([self.post.url]) - except youtube_dl.DownloadError as e: - raise SiteDownloaderError(f'Youtube download failed: {e}') - downloaded_files = list(download_path.iterdir()) - if len(downloaded_files) > 0: - downloaded_file = downloaded_files[0] - else: - raise NotADownloadableLinkError(f"No media exists in the URL {self.post.url}") - extension = downloaded_file.suffix - with open(downloaded_file, 'rb') as file: - content = file.read() - out = Resource(self.post, self.post.url, extension) - out.content = content - out.create_hash() - return out + def download() -> bytes: + with tempfile.TemporaryDirectory() as temp_dir: + download_path = Path(temp_dir).resolve() + ytdl_options['outtmpl'] = str(download_path) + '/' + 'test.%(ext)s' + try: + with youtube_dl.YoutubeDL(ytdl_options) as ydl: + ydl.download([self.post.url]) + except youtube_dl.DownloadError as e: + raise SiteDownloaderError(f'Youtube download failed: {e}') + + downloaded_files = list(download_path.iterdir()) + if len(downloaded_files) > 0: + downloaded_file = downloaded_files[0] + else: + raise NotADownloadableLinkError(f"No media exists in the URL {self.post.url}") + with open(downloaded_file, 'rb') as file: + content = file.read() + return content + return download + + @staticmethod + def get_video_attributes(url: str) -> dict: + yt_logger = logging.getLogger('youtube-dl') + yt_logger.setLevel(logging.CRITICAL) + with youtube_dl.YoutubeDL({'logger': yt_logger, }) as ydl: + try: + result = ydl.extract_info(url, download=False) + return result + except Exception as e: + logger.exception(e) diff --git a/tests/site_downloaders/test_direct.py b/tests/site_downloaders/test_direct.py index 790f4c3..56f90fc 100644 --- a/tests/site_downloaders/test_direct.py +++ b/tests/site_downloaders/test_direct.py @@ -21,5 +21,5 @@ def test_download_resource(test_url: str, expected_hash: str): resources = test_site.find_resources() assert len(resources) == 1 assert isinstance(resources[0], Resource) - resources[0].download(120) + resources[0].download() assert resources[0].hash.hexdigest() == expected_hash diff --git a/tests/site_downloaders/test_erome.py b/tests/site_downloaders/test_erome.py index 84546c4..2918bef 100644 --- a/tests/site_downloaders/test_erome.py +++ b/tests/site_downloaders/test_erome.py @@ -49,6 +49,6 @@ def test_download_resource(test_url: str, expected_hashes: tuple[str]): mock_submission.url = test_url test_site = Erome(mock_submission) resources = test_site.find_resources() - [res.download(120) for res in resources] + [res.download() for res in resources] resource_hashes = [res.hash.hexdigest() for res in resources] assert len(resource_hashes) == len(expected_hashes) diff --git a/tests/site_downloaders/test_gallery.py b/tests/site_downloaders/test_gallery.py index 4e5d9f1..08eea91 100644 --- a/tests/site_downloaders/test_gallery.py +++ b/tests/site_downloaders/test_gallery.py @@ -62,7 +62,7 @@ def test_gallery_download(test_submission_id: str, expected_hashes: set[str], re test_submission = reddit_instance.submission(id=test_submission_id) gallery = Gallery(test_submission) results = gallery.find_resources() - [res.download(120) for res in results] + [res.download() for res in results] hashes = [res.hash.hexdigest() for res in results] assert set(hashes) == expected_hashes diff --git a/tests/site_downloaders/test_gfycat.py b/tests/site_downloaders/test_gfycat.py index 3a405f8..981d01d 100644 --- a/tests/site_downloaders/test_gfycat.py +++ b/tests/site_downloaders/test_gfycat.py @@ -31,5 +31,5 @@ def test_download_resource(test_url: str, expected_hash: str): resources = test_site.find_resources() assert len(resources) == 1 assert isinstance(resources[0], Resource) - resources[0].download(120) + resources[0].download() assert resources[0].hash.hexdigest() == expected_hash diff --git a/tests/site_downloaders/test_imgur.py b/tests/site_downloaders/test_imgur.py index 94bd240..bfb7405 100644 --- a/tests/site_downloaders/test_imgur.py +++ b/tests/site_downloaders/test_imgur.py @@ -149,6 +149,6 @@ def test_find_resources(test_url: str, expected_hashes: list[str]): downloader = Imgur(mock_download) results = downloader.find_resources() assert all([isinstance(res, Resource) for res in results]) - [res.download(120) for res in results] + [res.download() for res in results] hashes = set([res.hash.hexdigest() for res in results]) assert hashes == set(expected_hashes) diff --git a/tests/site_downloaders/test_pornhub.py b/tests/site_downloaders/test_pornhub.py index 12144dd..e07da45 100644 --- a/tests/site_downloaders/test_pornhub.py +++ b/tests/site_downloaders/test_pornhub.py @@ -21,5 +21,5 @@ def test_find_resources_good(test_url: str, expected_hash: str): resources = downloader.find_resources() assert len(resources) == 1 assert isinstance(resources[0], Resource) - resources[0].download(120) + resources[0].download() assert resources[0].hash.hexdigest() == expected_hash diff --git a/tests/site_downloaders/test_redgifs.py b/tests/site_downloaders/test_redgifs.py index 097fbf4..571f044 100644 --- a/tests/site_downloaders/test_redgifs.py +++ b/tests/site_downloaders/test_redgifs.py @@ -37,5 +37,5 @@ def test_download_resource(test_url: str, expected_hash: str): resources = test_site.find_resources() assert len(resources) == 1 assert isinstance(resources[0], Resource) - resources[0].download(120) + resources[0].download() assert resources[0].hash.hexdigest() == expected_hash diff --git a/tests/site_downloaders/test_youtube.py b/tests/site_downloaders/test_youtube.py index f3a97e1..1f6b81a 100644 --- a/tests/site_downloaders/test_youtube.py +++ b/tests/site_downloaders/test_youtube.py @@ -23,7 +23,7 @@ def test_find_resources_good(test_url: str, expected_hash: str): resources = downloader.find_resources() assert len(resources) == 1 assert isinstance(resources[0], Resource) - resources[0].download(120) + resources[0].download() assert resources[0].hash.hexdigest() == expected_hash diff --git a/tests/test_download_filter.py b/tests/test_download_filter.py index ead2b2f..5def10c 100644 --- a/tests/test_download_filter.py +++ b/tests/test_download_filter.py @@ -46,7 +46,7 @@ def test_filter_domain(test_url: str, expected: bool, download_filter: DownloadF ('http://reddit.com/test.gif', False), )) def test_filter_all(test_url: str, expected: bool, download_filter: DownloadFilter): - test_resource = Resource(MagicMock(), test_url) + test_resource = Resource(MagicMock(), test_url, lambda: None) result = download_filter.check_resource(test_resource) assert result == expected @@ -59,6 +59,6 @@ def test_filter_all(test_url: str, expected: bool, download_filter: DownloadFilt )) def test_filter_empty_filter(test_url: str): download_filter = DownloadFilter() - test_resource = Resource(MagicMock(), test_url) + test_resource = Resource(MagicMock(), test_url, lambda: None) result = download_filter.check_resource(test_resource) assert result is True diff --git a/tests/test_file_name_formatter.py b/tests/test_file_name_formatter.py index e4c82ac..f596d89 100644 --- a/tests/test_file_name_formatter.py +++ b/tests/test_file_name_formatter.py @@ -119,7 +119,7 @@ def test_format_full( format_string_file: str, expected: str, reddit_submission: praw.models.Submission): - test_resource = Resource(reddit_submission, 'i.reddit.com/blabla.png') + test_resource = Resource(reddit_submission, 'i.reddit.com/blabla.png', lambda: None) test_formatter = FileNameFormatter(format_string_file, format_string_directory, 'ISO') result = test_formatter.format_path(test_resource, Path('test')) assert do_test_path_equality(result, expected) @@ -136,7 +136,7 @@ def test_format_full_conform( format_string_directory: str, format_string_file: str, reddit_submission: praw.models.Submission): - test_resource = Resource(reddit_submission, 'i.reddit.com/blabla.png') + test_resource = Resource(reddit_submission, 'i.reddit.com/blabla.png', lambda: None) test_formatter = FileNameFormatter(format_string_file, format_string_directory, 'ISO') test_formatter.format_path(test_resource, Path('test')) @@ -156,7 +156,7 @@ def test_format_full_with_index_suffix( expected: str, reddit_submission: praw.models.Submission, ): - test_resource = Resource(reddit_submission, 'i.reddit.com/blabla.png') + test_resource = Resource(reddit_submission, 'i.reddit.com/blabla.png', lambda: None) test_formatter = FileNameFormatter(format_string_file, format_string_directory, 'ISO') result = test_formatter.format_path(test_resource, Path('test'), index) assert do_test_path_equality(result, expected) @@ -216,7 +216,7 @@ def test_shorten_filenames(submission: MagicMock, tmp_path: Path): submission.author.name = 'test' submission.subreddit.display_name = 'test' submission.id = 'BBBBBB' - test_resource = Resource(submission, 'www.example.com/empty', '.jpeg') + test_resource = Resource(submission, 'www.example.com/empty', lambda: None, '.jpeg') test_formatter = FileNameFormatter('{REDDITOR}_{TITLE}_{POSTID}', '{SUBREDDIT}', 'ISO') result = test_formatter.format_path(test_resource, tmp_path) result.parent.mkdir(parents=True) @@ -296,7 +296,7 @@ def test_format_archive_entry_comment( ): test_comment = reddit_instance.comment(id=test_comment_id) test_formatter = FileNameFormatter(test_file_scheme, test_folder_scheme, 'ISO') - test_entry = Resource(test_comment, '', '.json') + test_entry = Resource(test_comment, '', lambda: None, '.json') result = test_formatter.format_path(test_entry, tmp_path) assert do_test_string_equality(result, expected_name) diff --git a/tests/test_resource.py b/tests/test_resource.py index 272c457..db9a6cc 100644 --- a/tests/test_resource.py +++ b/tests/test_resource.py @@ -21,7 +21,7 @@ from bdfr.resource import Resource ('https://www.test.com/test/test2/example.png?random=test#thing', '.png'), )) def test_resource_get_extension(test_url: str, expected: str): - test_resource = Resource(MagicMock(), test_url) + test_resource = Resource(MagicMock(), test_url, lambda: None) result = test_resource._determine_extension() assert result == expected @@ -31,6 +31,6 @@ def test_resource_get_extension(test_url: str, expected: str): ('https://www.iana.org/_img/2013.1/iana-logo-header.svg', '426b3ac01d3584c820f3b7f5985d6623'), )) def test_download_online_resource(test_url: str, expected_hash: str): - test_resource = Resource(MagicMock(), test_url) - test_resource.download(120) + test_resource = Resource(MagicMock(), test_url, Resource.retry_download(test_url, 60)) + test_resource.download() assert test_resource.hash.hexdigest() == expected_hash From dbe8733fd44cb1b3055faa072c801e73e18d7865 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Tue, 27 Jul 2021 14:02:30 +1000 Subject: [PATCH 06/15] Refactor method to remove max wait time --- bdfr/resource.py | 5 ++++- bdfr/site_downloaders/direct.py | 2 +- bdfr/site_downloaders/erome.py | 2 +- bdfr/site_downloaders/gallery.py | 2 +- bdfr/site_downloaders/imgur.py | 2 +- bdfr/site_downloaders/redgifs.py | 2 +- tests/test_resource.py | 2 +- 7 files changed, 10 insertions(+), 7 deletions(-) diff --git a/bdfr/resource.py b/bdfr/resource.py index 8f874ef..a1c90de 100644 --- a/bdfr/resource.py +++ b/bdfr/resource.py @@ -6,6 +6,7 @@ import logging import re import time import urllib.parse +from collections import namedtuple from typing import Callable, Optional import _hashlib @@ -29,7 +30,9 @@ class Resource: self.extension = self._determine_extension() @staticmethod - def retry_download(url: str, max_wait_time: int) -> Callable: + def retry_download(url: str) -> Callable: + max_wait_time = 300 + def http_download() -> Optional[bytes]: current_wait_time = 60 while True: diff --git a/bdfr/site_downloaders/direct.py b/bdfr/site_downloaders/direct.py index df1a469..833acae 100644 --- a/bdfr/site_downloaders/direct.py +++ b/bdfr/site_downloaders/direct.py @@ -14,4 +14,4 @@ class Direct(BaseDownloader): super().__init__(post) def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]: - return [Resource(self.post, self.post.url, Resource.retry_download(self.post.url, 300))] + return [Resource(self.post, self.post.url, Resource.retry_download(self.post.url))] diff --git a/bdfr/site_downloaders/erome.py b/bdfr/site_downloaders/erome.py index 69b9ae3..6130560 100644 --- a/bdfr/site_downloaders/erome.py +++ b/bdfr/site_downloaders/erome.py @@ -29,7 +29,7 @@ class Erome(BaseDownloader): for link in links: if not re.match(r'https?://.*', link): link = 'https://' + link - out.append(Resource(self.post, link, Resource.retry_download(link, 300))) + out.append(Resource(self.post, link, Resource.retry_download(link))) return out @staticmethod diff --git a/bdfr/site_downloaders/gallery.py b/bdfr/site_downloaders/gallery.py index c016d28..158e338 100644 --- a/bdfr/site_downloaders/gallery.py +++ b/bdfr/site_downloaders/gallery.py @@ -31,7 +31,7 @@ class Gallery(BaseDownloader): if not image_urls: raise SiteDownloaderError('No images found in Reddit gallery') - return [Resource(self.post, url, Resource.retry_download(url, 300)) for url in image_urls] + return [Resource(self.post, url, Resource.retry_download(url)) for url in image_urls] @ staticmethod def _get_links(id_dict: list[dict]) -> list[str]: diff --git a/bdfr/site_downloaders/imgur.py b/bdfr/site_downloaders/imgur.py index 79a1115..f0b7012 100644 --- a/bdfr/site_downloaders/imgur.py +++ b/bdfr/site_downloaders/imgur.py @@ -33,7 +33,7 @@ class Imgur(BaseDownloader): def _compute_image_url(self, image: dict) -> Resource: image_url = 'https://i.imgur.com/' + image['hash'] + self._validate_extension(image['ext']) - return Resource(self.post, image_url, Resource.retry_download(image_url, 300)) + return Resource(self.post, image_url, Resource.retry_download(image_url)) @staticmethod def _get_data(link: str) -> dict: diff --git a/bdfr/site_downloaders/redgifs.py b/bdfr/site_downloaders/redgifs.py index d4989e7..a62fedb 100644 --- a/bdfr/site_downloaders/redgifs.py +++ b/bdfr/site_downloaders/redgifs.py @@ -18,7 +18,7 @@ class Redgifs(BaseDownloader): def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]: media_url = self._get_link(self.post.url) - return [Resource(self.post, media_url, Resource.retry_download(media_url, 300), '.mp4')] + return [Resource(self.post, media_url, Resource.retry_download(media_url), '.mp4')] @staticmethod def _get_link(url: str) -> str: diff --git a/tests/test_resource.py b/tests/test_resource.py index db9a6cc..f3bbc9a 100644 --- a/tests/test_resource.py +++ b/tests/test_resource.py @@ -31,6 +31,6 @@ def test_resource_get_extension(test_url: str, expected: str): ('https://www.iana.org/_img/2013.1/iana-logo-header.svg', '426b3ac01d3584c820f3b7f5985d6623'), )) def test_download_online_resource(test_url: str, expected_hash: str): - test_resource = Resource(MagicMock(), test_url, Resource.retry_download(test_url, 60)) + test_resource = Resource(MagicMock(), test_url, Resource.retry_download(test_url)) test_resource.download() assert test_resource.hash.hexdigest() == expected_hash From 7bca303b1b663848c5081fd9fa0543291a05396a Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Thu, 29 Jul 2021 19:10:10 +1000 Subject: [PATCH 07/15] Add in downloader parameters --- bdfr/downloader.py | 2 +- bdfr/resource.py | 13 +++++++++---- bdfr/site_downloaders/youtube.py | 2 +- 3 files changed, 11 insertions(+), 6 deletions(-) diff --git a/bdfr/downloader.py b/bdfr/downloader.py index 69aa818..70052b2 100644 --- a/bdfr/downloader.py +++ b/bdfr/downloader.py @@ -82,7 +82,7 @@ class RedditDownloader(RedditConnector): logger.debug(f'Download filter removed {submission.id} file with URL {submission.url}') continue try: - res.download() + res.download({'max_wait_time': self.args.max_wait_time}) except errors.BulkDownloaderException as e: logger.error(f'Failed to download resource {res.url} in submission {submission.id} ' f'with downloader {downloader_class.__name__}: {e}') diff --git a/bdfr/resource.py b/bdfr/resource.py index a1c90de..27ba84b 100644 --- a/bdfr/resource.py +++ b/bdfr/resource.py @@ -6,7 +6,6 @@ import logging import re import time import urllib.parse -from collections import namedtuple from typing import Callable, Optional import _hashlib @@ -33,8 +32,12 @@ class Resource: def retry_download(url: str) -> Callable: max_wait_time = 300 - def http_download() -> Optional[bytes]: + def http_download(download_parameters: dict) -> Optional[bytes]: current_wait_time = 60 + if 'max_wait_time' in download_parameters: + max_wait_time = download_parameters['max_wait_time'] + else: + max_wait_time = 300 while True: try: response = requests.get(url) @@ -55,10 +58,12 @@ class Resource: raise return http_download - def download(self): + def download(self, download_parameters: Optional[dict] = None): + if download_parameters is None: + download_parameters = {} if not self.content: try: - content = self.download_function() + content = self.download_function(download_parameters) except requests.exceptions.ConnectionError as e: raise BulkDownloaderException(f'Could not download resource: {e}') except BulkDownloaderException: diff --git a/bdfr/site_downloaders/youtube.py b/bdfr/site_downloaders/youtube.py index 126cb6a..a870c2e 100644 --- a/bdfr/site_downloaders/youtube.py +++ b/bdfr/site_downloaders/youtube.py @@ -40,7 +40,7 @@ class Youtube(BaseDownloader): ytdl_options['quiet'] = True ytdl_options['logger'] = yt_logger - def download() -> bytes: + def download(_: dict) -> bytes: with tempfile.TemporaryDirectory() as temp_dir: download_path = Path(temp_dir).resolve() ytdl_options['outtmpl'] = str(download_path) + '/' + 'test.%(ext)s' From 87f283cc98ccb7743cfefd54b063d23142040431 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Fri, 3 Sep 2021 19:24:28 +1000 Subject: [PATCH 08/15] Fix backup config location --- bdfr/connector.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/bdfr/connector.py b/bdfr/connector.py index 0e78c8c..78ddc4f 100644 --- a/bdfr/connector.py +++ b/bdfr/connector.py @@ -184,8 +184,9 @@ class RedditConnector(metaclass=ABCMeta): logger.debug(f'Loading configuration from {path}') break if not self.config_location: - self.config_location = list(importlib.resources.path('bdfr', 'default_config.cfg').gen)[0] - shutil.copy(self.config_location, Path(self.config_directory, 'default_config.cfg')) + with importlib.resources.path('bdfr', 'default_config.cfg') as path: + self.config_location = path + shutil.copy(self.config_location, Path(self.config_directory, 'default_config.cfg')) if not self.config_location: raise errors.BulkDownloaderException('Could not find a configuration file to load') self.cfg_parser.read(self.config_location) From afc2a6416bc08b6009e7f4d27af132cf65705259 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Fri, 3 Sep 2021 16:39:00 +1000 Subject: [PATCH 09/15] Add integration test --- tests/integration_tests/test_download_integration.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/integration_tests/test_download_integration.py b/tests/integration_tests/test_download_integration.py index 305fe99..6fecd73 100644 --- a/tests/integration_tests/test_download_integration.py +++ b/tests/integration_tests/test_download_integration.py @@ -45,6 +45,7 @@ def create_basic_args_for_download_runner(test_args: list[str], run_path: Path): ['-s', 'trollxchromosomes', '-L', 1, '--sort', 'new'], ['-s', 'trollxchromosomes', '-L', 1, '--time', 'day', '--sort', 'new'], ['-s', 'trollxchromosomes', '-L', 1, '--search', 'women'], + ['-s', 'hentai', '-L', 10, '--search', 'red'], ['-s', 'trollxchromosomes', '-L', 1, '--time', 'day', '--search', 'women'], ['-s', 'trollxchromosomes', '-L', 1, '--sort', 'new', '--search', 'women'], ['-s', 'trollxchromosomes', '-L', 1, '--time', 'day', '--sort', 'new', '--search', 'women'], @@ -55,6 +56,7 @@ def test_cli_download_subreddits(test_args: list[str], tmp_path: Path): result = runner.invoke(cli, test_args) assert result.exit_code == 0 assert 'Added submissions from subreddit ' in result.output + assert 'Downloaded submission' in result.output @pytest.mark.online From defd6bca77ff2b56e91b289307d12fe422cda524 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Thu, 9 Sep 2021 13:42:18 +1000 Subject: [PATCH 10/15] Tweak test conditions --- tests/test_connector.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/test_connector.py b/tests/test_connector.py index 15eede1..a275d9f 100644 --- a/tests/test_connector.py +++ b/tests/test_connector.py @@ -199,10 +199,9 @@ def test_get_subreddit_normal( @pytest.mark.reddit @pytest.mark.parametrize(('test_subreddits', 'search_term', 'limit', 'time_filter', 'max_expected_len'), ( (('Python',), 'scraper', 10, 'all', 10), - (('Python',), '', 10, 'all', 10), + (('Python',), '', 10, 'all', 0), (('Python',), 'djsdsgewef', 10, 'all', 0), (('Python',), 'scraper', 10, 'year', 10), - (('Python',), 'scraper', 10, 'hour', 1), )) def test_get_subreddit_search( test_subreddits: list[str], @@ -226,6 +225,8 @@ def test_get_subreddit_search( assert all([isinstance(res, praw.models.Submission) for res in results]) assert all([res.subreddit.display_name in test_subreddits for res in results]) assert len(results) <= max_expected_len + if max_expected_len != 0: + assert len(results) > 0 assert not any([isinstance(m, MagicMock) for m in results]) From 56575dc390fbefcbcbadb390e950fdda38561030 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Thu, 9 Sep 2021 13:43:11 +1000 Subject: [PATCH 11/15] Add NSFW search test --- .../test_download_integration.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/tests/integration_tests/test_download_integration.py b/tests/integration_tests/test_download_integration.py index 6fecd73..57f39bf 100644 --- a/tests/integration_tests/test_download_integration.py +++ b/tests/integration_tests/test_download_integration.py @@ -45,7 +45,6 @@ def create_basic_args_for_download_runner(test_args: list[str], run_path: Path): ['-s', 'trollxchromosomes', '-L', 1, '--sort', 'new'], ['-s', 'trollxchromosomes', '-L', 1, '--time', 'day', '--sort', 'new'], ['-s', 'trollxchromosomes', '-L', 1, '--search', 'women'], - ['-s', 'hentai', '-L', 10, '--search', 'red'], ['-s', 'trollxchromosomes', '-L', 1, '--time', 'day', '--search', 'women'], ['-s', 'trollxchromosomes', '-L', 1, '--sort', 'new', '--search', 'women'], ['-s', 'trollxchromosomes', '-L', 1, '--time', 'day', '--sort', 'new', '--search', 'women'], @@ -59,6 +58,22 @@ def test_cli_download_subreddits(test_args: list[str], tmp_path: Path): assert 'Downloaded submission' in result.output +@pytest.mark.online +@pytest.mark.reddit +@pytest.mark.authenticated +@pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests') +@pytest.mark.parametrize('test_args', ( + ['-s', 'hentai', '-L', 10, '--search', 'red', '--authenticate'], +)) +def test_cli_download_search_subreddits_authenticated(test_args: list[str], tmp_path: Path): + runner = CliRunner() + test_args = create_basic_args_for_download_runner(test_args, tmp_path) + result = runner.invoke(cli, test_args) + assert result.exit_code == 0 + assert 'Added submissions from subreddit ' in result.output + assert 'Downloaded submission' in result.output + + @pytest.mark.online @pytest.mark.reddit @pytest.mark.authenticated From edc2db0ded1222b4b050f99421d939fc369ff104 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Thu, 9 Sep 2021 13:50:03 +1000 Subject: [PATCH 12/15] Update test --- tests/site_downloaders/test_erome.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/site_downloaders/test_erome.py b/tests/site_downloaders/test_erome.py index 2918bef..bab34bb 100644 --- a/tests/site_downloaders/test_erome.py +++ b/tests/site_downloaders/test_erome.py @@ -14,13 +14,13 @@ from bdfr.site_downloaders.erome import Erome 'https://s11.erome.com/365/vqtPuLXh/KH2qBT99_480p.mp4', )), ('https://www.erome.com/a/ORhX0FZz', ( - 'https://s4.erome.com/355/ORhX0FZz/9IYQocM9_480p.mp4', - 'https://s4.erome.com/355/ORhX0FZz/9eEDc8xm_480p.mp4', - 'https://s4.erome.com/355/ORhX0FZz/EvApC7Rp_480p.mp4', - 'https://s4.erome.com/355/ORhX0FZz/LruobtMs_480p.mp4', - 'https://s4.erome.com/355/ORhX0FZz/TJNmSUU5_480p.mp4', - 'https://s4.erome.com/355/ORhX0FZz/X11Skh6Z_480p.mp4', - 'https://s4.erome.com/355/ORhX0FZz/bjlTkpn7_480p.mp4' + 'https://s15.erome.com/355/ORhX0FZz/9IYQocM9_480p.mp4', + 'https://s15.erome.com/355/ORhX0FZz/9eEDc8xm_480p.mp4', + 'https://s15.erome.com/355/ORhX0FZz/EvApC7Rp_480p.mp4', + 'https://s15.erome.com/355/ORhX0FZz/LruobtMs_480p.mp4', + 'https://s15.erome.com/355/ORhX0FZz/TJNmSUU5_480p.mp4', + 'https://s15.erome.com/355/ORhX0FZz/X11Skh6Z_480p.mp4', + 'https://s15.erome.com/355/ORhX0FZz/bjlTkpn7_480p.mp4' )), )) def test_get_link(test_url: str, expected_urls: tuple[str]): From 940d646d30299747b6d0a0c3b25ea3fbafed0875 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sat, 11 Sep 2021 12:13:21 +1000 Subject: [PATCH 13/15] Add Vidble module --- bdfr/site_downloaders/vidble.py | 48 +++++++++++++++++++ tests/site_downloaders/test_vidble.py | 67 +++++++++++++++++++++++++++ 2 files changed, 115 insertions(+) create mode 100644 bdfr/site_downloaders/vidble.py create mode 100644 tests/site_downloaders/test_vidble.py diff --git a/bdfr/site_downloaders/vidble.py b/bdfr/site_downloaders/vidble.py new file mode 100644 index 0000000..2f8f4f4 --- /dev/null +++ b/bdfr/site_downloaders/vidble.py @@ -0,0 +1,48 @@ +#!/usr/bin/env python3 +# coding=utf-8 +import itertools +import logging +import re +from typing import Optional + +import bs4 +import requests +from praw.models import Submission + +from bdfr.exceptions import SiteDownloaderError +from bdfr.resource import Resource +from bdfr.site_authenticator import SiteAuthenticator +from bdfr.site_downloaders.base_downloader import BaseDownloader + +logger = logging.getLogger(__name__) + + +class Vidble(BaseDownloader): + def __init__(self, post: Submission): + super().__init__(post) + + def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]: + res = self.get_links(self.post.url) + if not res: + raise SiteDownloaderError(rf'No resources found at {self.post.url}') + res = [Resource(self.post, r, Resource.retry_download(r)) for r in res] + return res + + @staticmethod + def get_links(url: str) -> set[str]: + page = requests.get(url) + soup = bs4.BeautifulSoup(page.text, 'html.parser') + content_div = soup.find('div', attrs={'id': 'ContentPlaceHolder1_divContent'}) + images = content_div.find_all('img') + images = [i.get('src') for i in images] + videos = content_div.find_all('source', attrs={'type': 'video/mp4'}) + videos = [v.get('src') for v in videos] + resources = filter(None, itertools.chain(images, videos)) + resources = ['https://www.vidble.com' + r for r in resources] + resources = [Vidble.change_med_url(r) for r in resources] + return set(resources) + + @staticmethod + def change_med_url(url: str) -> str: + out = re.sub(r'_med(\..{3,4})$', r'\1', url) + return out diff --git a/tests/site_downloaders/test_vidble.py b/tests/site_downloaders/test_vidble.py new file mode 100644 index 0000000..1617bf1 --- /dev/null +++ b/tests/site_downloaders/test_vidble.py @@ -0,0 +1,67 @@ +#!/usr/bin/env python3 +# coding=utf-8 +from unittest.mock import Mock + +import pytest + +from bdfr.resource import Resource +from bdfr.site_downloaders.vidble import Vidble + + +@pytest.mark.parametrize(('test_url', 'expected'), ( + ('/RDFbznUvcN_med.jpg', '/RDFbznUvcN.jpg'), +)) +def test_change_med_url(test_url: str, expected: str): + result = Vidble.change_med_url(test_url) + assert result == expected + + +@pytest.mark.online +@pytest.mark.parametrize(('test_url', 'expected'), ( + ('https://www.vidble.com/show/UxsvAssYe5', { + 'https://www.vidble.com/UxsvAssYe5.gif', + }), + ('https://vidble.com/show/RDFbznUvcN', { + 'https://www.vidble.com/RDFbznUvcN.jpg', + }), + ('https://vidble.com/album/h0jTLs6B', { + 'https://www.vidble.com/XG4eAoJ5JZ.jpg', + 'https://www.vidble.com/IqF5UdH6Uq.jpg', + 'https://www.vidble.com/VWuNsnLJMD.jpg', + 'https://www.vidble.com/sMmM8O650W.jpg', + }), + ('https://vidble.com/watch?v=0q4nWakqM6kzQWxlePD8N62Dsflev0N9', { + 'https://www.vidble.com/0q4nWakqM6kzQWxlePD8N62Dsflev0N9.mp4', + }), +)) +def test_get_links(test_url: str, expected: set[str]): + results = Vidble.get_links(test_url) + assert results == expected + + +@pytest.mark.parametrize(('test_url', 'expected_hashes'), ( + ('https://www.vidble.com/show/UxsvAssYe5', { + '0ef2f8e0e0b45936d2fb3e6fbdf67e28', + }), + ('https://vidble.com/show/RDFbznUvcN', { + 'c2dd30a71e32369c50eed86f86efff58', + }), + ('https://vidble.com/album/h0jTLs6B', { + '3b3cba02e01c91f9858a95240b942c71', + 'dd6ecf5fc9e936f9fb614eb6a0537f99', + 'b31a942cd8cdda218ed547bbc04c3a27', + '6f77c570b451eef4222804bd52267481', + }), + ('https://vidble.com/watch?v=0q4nWakqM6kzQWxlePD8N62Dsflev0N9', { + 'cebe9d5f24dba3b0443e5097f160ca83', + }), +)) +def test_find_resources(test_url: str, expected_hashes: set[str]): + mock_download = Mock() + mock_download.url = test_url + downloader = Vidble(mock_download) + results = downloader.find_resources() + assert all([isinstance(res, Resource) for res in results]) + [res.download() for res in results] + hashes = set([res.hash.hexdigest() for res in results]) + assert hashes == set(expected_hashes) From aee6f4add9a0e89686c194ff8be3723bb3ce24e6 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sat, 11 Sep 2021 12:15:35 +1000 Subject: [PATCH 14/15] Add Vidble to download factory --- bdfr/site_downloaders/download_factory.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/bdfr/site_downloaders/download_factory.py b/bdfr/site_downloaders/download_factory.py index 911e8fb..a4e9a6a 100644 --- a/bdfr/site_downloaders/download_factory.py +++ b/bdfr/site_downloaders/download_factory.py @@ -16,6 +16,7 @@ from bdfr.site_downloaders.imgur import Imgur from bdfr.site_downloaders.pornhub import PornHub from bdfr.site_downloaders.redgifs import Redgifs from bdfr.site_downloaders.self_post import SelfPost +from bdfr.site_downloaders.vidble import Vidble from bdfr.site_downloaders.youtube import Youtube @@ -46,11 +47,12 @@ class DownloadFactory: return Direct elif re.match(r'pornhub\.com.*', sanitised_url): return PornHub + elif re.match(r'vidble\.com', sanitised_url): + return Vidble elif YoutubeDlFallback.can_handle_link(sanitised_url): return YoutubeDlFallback else: - raise NotADownloadableLinkError( - f'No downloader module exists for url {url}') + raise NotADownloadableLinkError(f'No downloader module exists for url {url}') @staticmethod def sanitise_url(url: str) -> str: From 89e24eca62bd7cf5fd6e9e8854f87a03d76f1309 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ali=20Parlak=C3=A7=C4=B1?= Date: Sun, 12 Sep 2021 20:06:51 +0300 Subject: [PATCH 15/15] Bump version to v2.4 --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index 2969fe0..196bd9e 100644 --- a/setup.cfg +++ b/setup.cfg @@ -4,7 +4,7 @@ description_file = README.md description_content_type = text/markdown home_page = https://github.com/aliparlakci/bulk-downloader-for-reddit keywords = reddit, download, archive -version = 2.2.0 +version = 2.4.0 author = Ali Parlakci author_email = parlakciali@gmail.com maintainer = Serene Arc