diff --git a/bdfr/resource.py b/bdfr/resource.py index 27ba84b..68a42e1 100644 --- a/bdfr/resource.py +++ b/bdfr/resource.py @@ -30,33 +30,7 @@ class Resource: @staticmethod def retry_download(url: str) -> Callable: - max_wait_time = 300 - - def http_download(download_parameters: dict) -> Optional[bytes]: - current_wait_time = 60 - if 'max_wait_time' in download_parameters: - max_wait_time = download_parameters['max_wait_time'] - else: - max_wait_time = 300 - while True: - try: - response = requests.get(url) - if re.match(r'^2\d{2}', str(response.status_code)) and response.content: - return response.content - elif response.status_code in (408, 429): - raise requests.exceptions.ConnectionError(f'Response code {response.status_code}') - else: - raise BulkDownloaderException( - f'Unrecoverable error requesting resource: HTTP Code {response.status_code}') - except (requests.exceptions.ConnectionError, requests.exceptions.ChunkedEncodingError) as e: - logger.warning(f'Error occured downloading from {url}, waiting {current_wait_time} seconds: {e}') - time.sleep(current_wait_time) - if current_wait_time < max_wait_time: - current_wait_time += 60 - else: - logger.error(f'Max wait time exceeded for resource at url {url}') - raise - return http_download + return lambda global_params: Resource.http_download(url, global_params) def download(self, download_parameters: Optional[dict] = None): if download_parameters is None: @@ -82,3 +56,30 @@ class Resource: match = re.search(extension_pattern, stripped_url) if match: return match.group(1) + + @staticmethod + def http_download(url: str, download_parameters: dict) -> Optional[bytes]: + headers = download_parameters.get('headers') + current_wait_time = 60 + if 'max_wait_time' in download_parameters: + max_wait_time = download_parameters['max_wait_time'] + else: + max_wait_time = 300 + while True: + try: + response = requests.get(url, headers=headers) + if re.match(r'^2\d{2}', str(response.status_code)) and response.content: + return response.content + elif response.status_code in (408, 429): + raise requests.exceptions.ConnectionError(f'Response code {response.status_code}') + else: + raise BulkDownloaderException( + f'Unrecoverable error requesting resource: HTTP Code {response.status_code}') + except (requests.exceptions.ConnectionError, requests.exceptions.ChunkedEncodingError) as e: + logger.warning(f'Error occured downloading from {url}, waiting {current_wait_time} seconds: {e}') + time.sleep(current_wait_time) + if current_wait_time < max_wait_time: + current_wait_time += 60 + else: + logger.error(f'Max wait time exceeded for resource at url {url}') + raise diff --git a/bdfr/site_downloaders/erome.py b/bdfr/site_downloaders/erome.py index 6130560..6250415 100644 --- a/bdfr/site_downloaders/erome.py +++ b/bdfr/site_downloaders/erome.py @@ -2,7 +2,7 @@ import logging import re -from typing import Optional +from typing import Callable, Optional import bs4 from praw.models import Submission @@ -29,7 +29,7 @@ class Erome(BaseDownloader): for link in links: if not re.match(r'https?://.*', link): link = 'https://' + link - out.append(Resource(self.post, link, Resource.retry_download(link))) + out.append(Resource(self.post, link, self.erome_download(link))) return out @staticmethod @@ -43,3 +43,14 @@ class Erome(BaseDownloader): out.extend([vid.get('src') for vid in videos]) return set(out) + + @staticmethod + def erome_download(url: str) -> Callable: + download_parameters = { + 'headers': { + 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)' + ' Chrome/88.0.4324.104 Safari/537.36', + 'Referer': 'https://www.erome.com/', + }, + } + return lambda global_params: Resource.http_download(url, global_params | download_parameters) diff --git a/bdfr/site_downloaders/youtube.py b/bdfr/site_downloaders/youtube.py index a870c2e..ba82007 100644 --- a/bdfr/site_downloaders/youtube.py +++ b/bdfr/site_downloaders/youtube.py @@ -5,7 +5,7 @@ import tempfile from pathlib import Path from typing import Callable, Optional -import youtube_dl +import yt_dlp from praw.models import Submission from bdfr.exceptions import NotADownloadableLinkError, SiteDownloaderError @@ -45,9 +45,9 @@ class Youtube(BaseDownloader): download_path = Path(temp_dir).resolve() ytdl_options['outtmpl'] = str(download_path) + '/' + 'test.%(ext)s' try: - with youtube_dl.YoutubeDL(ytdl_options) as ydl: + with yt_dlp.YoutubeDL(ytdl_options) as ydl: ydl.download([self.post.url]) - except youtube_dl.DownloadError as e: + except yt_dlp.DownloadError as e: raise SiteDownloaderError(f'Youtube download failed: {e}') downloaded_files = list(download_path.iterdir()) @@ -64,7 +64,7 @@ class Youtube(BaseDownloader): def get_video_attributes(url: str) -> dict: yt_logger = logging.getLogger('youtube-dl') yt_logger.setLevel(logging.CRITICAL) - with youtube_dl.YoutubeDL({'logger': yt_logger, }) as ydl: + with yt_dlp.YoutubeDL({'logger': yt_logger, }) as ydl: try: result = ydl.extract_info(url, download=False) return result diff --git a/requirements.txt b/requirements.txt index e7b5ff1..8ceffdb 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,4 +6,4 @@ ffmpeg-python>=0.2.0 praw>=7.2.0 pyyaml>=5.4.1 requests>=2.25.1 -youtube-dl>=2021.3.14 +yt-dlp>=2021.9.25 \ No newline at end of file diff --git a/setup.cfg b/setup.cfg index 5792355..94ae1de 100644 --- a/setup.cfg +++ b/setup.cfg @@ -4,7 +4,7 @@ description_file = README.md description_content_type = text/markdown home_page = https://github.com/aliparlakci/bulk-downloader-for-reddit keywords = reddit, download, archive -version = 2.4.1 +version = 2.4.2 author = Ali Parlakci author_email = parlakciali@gmail.com maintainer = Serene Arc diff --git a/tests/site_downloaders/fallback_downloaders/test_youtubedl_fallback.py b/tests/site_downloaders/fallback_downloaders/test_youtubedl_fallback.py index f70a91c..f268c0a 100644 --- a/tests/site_downloaders/fallback_downloaders/test_youtubedl_fallback.py +++ b/tests/site_downloaders/fallback_downloaders/test_youtubedl_fallback.py @@ -22,7 +22,7 @@ def test_can_handle_link(test_url: str, expected: bool): @pytest.mark.online @pytest.mark.slow @pytest.mark.parametrize(('test_url', 'expected_hash'), ( - ('https://streamable.com/dt46y', '1e7f4928e55de6e3ca23d85cc9246bbb'), + ('https://streamable.com/dt46y', 'b7e465adaade5f2b6d8c2b4b7d0a2878'), ('https://streamable.com/t8sem', '49b2d1220c485455548f1edbc05d4ecf'), ('https://www.reddit.com/r/specializedtools/comments/n2nw5m/bamboo_splitter/', '21968d3d92161ea5e0abdcaf6311b06c'), ('https://v.redd.it/9z1dnk3xr5k61', '351a2b57e888df5ccbc508056511f38d'), @@ -34,4 +34,6 @@ def test_find_resources(test_url: str, expected_hash: str): resources = downloader.find_resources() assert len(resources) == 1 assert isinstance(resources[0], Resource) + for res in resources: + res.download() assert resources[0].hash.hexdigest() == expected_hash diff --git a/tests/site_downloaders/test_erome.py b/tests/site_downloaders/test_erome.py index bab34bb..e06fab5 100644 --- a/tests/site_downloaders/test_erome.py +++ b/tests/site_downloaders/test_erome.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # coding=utf-8 - +import re from unittest.mock import MagicMock import pytest @@ -11,44 +11,37 @@ from bdfr.site_downloaders.erome import Erome @pytest.mark.online @pytest.mark.parametrize(('test_url', 'expected_urls'), ( ('https://www.erome.com/a/vqtPuLXh', ( - 'https://s11.erome.com/365/vqtPuLXh/KH2qBT99_480p.mp4', + r'https://s\d+.erome.com/365/vqtPuLXh/KH2qBT99_480p.mp4', )), ('https://www.erome.com/a/ORhX0FZz', ( - 'https://s15.erome.com/355/ORhX0FZz/9IYQocM9_480p.mp4', - 'https://s15.erome.com/355/ORhX0FZz/9eEDc8xm_480p.mp4', - 'https://s15.erome.com/355/ORhX0FZz/EvApC7Rp_480p.mp4', - 'https://s15.erome.com/355/ORhX0FZz/LruobtMs_480p.mp4', - 'https://s15.erome.com/355/ORhX0FZz/TJNmSUU5_480p.mp4', - 'https://s15.erome.com/355/ORhX0FZz/X11Skh6Z_480p.mp4', - 'https://s15.erome.com/355/ORhX0FZz/bjlTkpn7_480p.mp4' + r'https://s\d+.erome.com/355/ORhX0FZz/9IYQocM9_480p.mp4', + r'https://s\d+.erome.com/355/ORhX0FZz/9eEDc8xm_480p.mp4', + r'https://s\d+.erome.com/355/ORhX0FZz/EvApC7Rp_480p.mp4', + r'https://s\d+.erome.com/355/ORhX0FZz/LruobtMs_480p.mp4', + r'https://s\d+.erome.com/355/ORhX0FZz/TJNmSUU5_480p.mp4', + r'https://s\d+.erome.com/355/ORhX0FZz/X11Skh6Z_480p.mp4', + r'https://s\d+.erome.com/355/ORhX0FZz/bjlTkpn7_480p.mp4' )), )) def test_get_link(test_url: str, expected_urls: tuple[str]): result = Erome. _get_links(test_url) - assert set(result) == set(expected_urls) + assert all([any([re.match(p, r) for r in result]) for p in expected_urls]) @pytest.mark.online @pytest.mark.slow -@pytest.mark.parametrize(('test_url', 'expected_hashes'), ( - ('https://www.erome.com/a/vqtPuLXh', { - '5da2a8d60d87bed279431fdec8e7d72f' - }), - ('https://www.erome.com/a/lGrcFxmb', { - '0e98f9f527a911dcedde4f846bb5b69f', - '25696ae364750a5303fc7d7dc78b35c1', - '63775689f438bd393cde7db6d46187de', - 'a1abf398cfd4ef9cfaf093ceb10c746a', - 'bd9e1a4ea5ef0d6ba47fb90e337c2d14' - }), +@pytest.mark.parametrize(('test_url', 'expected_hashes_len'), ( + ('https://www.erome.com/a/vqtPuLXh', 1), + ('https://www.erome.com/a/4tP3KI6F', 1), )) -def test_download_resource(test_url: str, expected_hashes: tuple[str]): +def test_download_resource(test_url: str, expected_hashes_len: int): # Can't compare hashes for this test, Erome doesn't return the exact same file from request to request so the hash # will change back and forth randomly mock_submission = MagicMock() mock_submission.url = test_url test_site = Erome(mock_submission) resources = test_site.find_resources() - [res.download() for res in resources] + for res in resources: + res.download() resource_hashes = [res.hash.hexdigest() for res in resources] - assert len(resource_hashes) == len(expected_hashes) + assert len(resource_hashes) == expected_hashes_len diff --git a/tests/site_downloaders/test_pornhub.py b/tests/site_downloaders/test_pornhub.py index e07da45..5c220cc 100644 --- a/tests/site_downloaders/test_pornhub.py +++ b/tests/site_downloaders/test_pornhub.py @@ -12,7 +12,7 @@ from bdfr.site_downloaders.pornhub import PornHub @pytest.mark.online @pytest.mark.slow @pytest.mark.parametrize(('test_url', 'expected_hash'), ( - ('https://www.pornhub.com/view_video.php?viewkey=ph5a2ee0461a8d0', '5f5294b9b97dbb7cb9cf8df278515621'), + ('https://www.pornhub.com/view_video.php?viewkey=ph6074c59798497', 'd9b99e4ebecf2d8d67efe5e70d2acf8a'), )) def test_find_resources_good(test_url: str, expected_hash: str): test_submission = MagicMock() diff --git a/tests/site_downloaders/test_youtube.py b/tests/site_downloaders/test_youtube.py index 1f6b81a..684eb20 100644 --- a/tests/site_downloaders/test_youtube.py +++ b/tests/site_downloaders/test_youtube.py @@ -13,8 +13,9 @@ from bdfr.site_downloaders.youtube import Youtube @pytest.mark.online @pytest.mark.slow @pytest.mark.parametrize(('test_url', 'expected_hash'), ( - ('https://www.youtube.com/watch?v=uSm2VDgRIUs', 'f70b704b4b78b9bb5cd032bfc26e4971'), - ('https://www.youtube.com/watch?v=GcI7nxQj7HA', '2bfdbf434ed284623e46f3bf52c36166'), + ('https://www.youtube.com/watch?v=uSm2VDgRIUs', '2d60b54582df5b95ec72bb00b580d2ff'), + ('https://www.youtube.com/watch?v=GcI7nxQj7HA', '5db0fc92a0a7fb9ac91e63505eea9cf0'), + ('https://youtu.be/TMqPOlp4tNo', 'f68c00b018162857f3df4844c45302e7'), # Age restricted )) def test_find_resources_good(test_url: str, expected_hash: str): test_submission = MagicMock()