diff --git a/bulkredditdownloader/site_downloaders/base_downloader.py b/bulkredditdownloader/site_downloaders/base_downloader.py index 4a0d871..a872953 100644 --- a/bulkredditdownloader/site_downloaders/base_downloader.py +++ b/bulkredditdownloader/site_downloaders/base_downloader.py @@ -3,42 +3,22 @@ import logging from abc import ABC, abstractmethod +from typing import Optional -import requests from praw.models import Submission -from bulkredditdownloader.errors import SiteDownloaderError +from bulkredditdownloader.authenticator import Authenticator from bulkredditdownloader.resource import Resource logger = logging.getLogger(__name__) class BaseDownloader(ABC): - def __init__(self, post: Submission): + def __init__(self, post: Submission, typical_extension: Optional[str] = None): self.post = post - self.hashes = [] + self.typical_extension = typical_extension @abstractmethod - def download(self) -> list[Resource]: + def find_resources(self, authenticator: Optional[Authenticator] = None) -> list[Resource]: + """Return list of all un-downloaded Resources from submission""" raise NotImplementedError - - def _download_resource(self, resource_url: str): - headers = { - "User-Agent": - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 " - "Safari/537.36 OPR/54.0.2952.64", - "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8", - "Accept-Charset": "ISO-8859-1,utf-8;q=0.7,*;q=0.3", - "Accept-Encoding": "none", - "Accept-Language": "en-US,en;q=0.8", - "Connection": "keep-alive", - } - # Loop to attempt download 3 times - for i in range(3): - try: - download_content = requests.get(resource_url, headers=headers).content - except ConnectionResetError: - raise SiteDownloaderError - return Resource(self.post, resource_url, download_content) - - raise SiteDownloaderError diff --git a/bulkredditdownloader/site_downloaders/direct.py b/bulkredditdownloader/site_downloaders/direct.py index 713eacf..450d409 100644 --- a/bulkredditdownloader/site_downloaders/direct.py +++ b/bulkredditdownloader/site_downloaders/direct.py @@ -1,7 +1,11 @@ #!/usr/bin/env python3 +from typing import Optional + from praw.models import Submission +from bulkredditdownloader.authenticator import Authenticator +from bulkredditdownloader.resource import Resource from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader @@ -9,5 +13,5 @@ class Direct(BaseDownloader): def __init__(self, post: Submission): super().__init__(post) - def download(self): - return [self._download_resource(self.post.url)] + def find_resources(self, authenticator: Optional[Authenticator] = None) -> list[Resource]: + return [Resource(self.post, self.post.url)] diff --git a/bulkredditdownloader/site_downloaders/erome.py b/bulkredditdownloader/site_downloaders/erome.py index 39094f6..8675cee 100644 --- a/bulkredditdownloader/site_downloaders/erome.py +++ b/bulkredditdownloader/site_downloaders/erome.py @@ -5,10 +5,13 @@ import re import urllib.error import urllib.request from html.parser import HTMLParser +from typing import Optional from praw.models import Submission +from bulkredditdownloader.authenticator import Authenticator from bulkredditdownloader.errors import NotADownloadableLinkError +from bulkredditdownloader.resource import Resource from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader logger = logging.getLogger(__name__) @@ -18,7 +21,7 @@ class Erome(BaseDownloader): def __init__(self, post: Submission): super().__init__(post) - def download(self): + def find_resources(self, authenticator: Optional[Authenticator] = None) -> list[Resource]: try: images = self._get_links(self.post.url) except urllib.error.HTTPError: @@ -29,15 +32,14 @@ class Erome(BaseDownloader): image = images[0] if not re.match(r'https?://.*', image): image = "https://" + image - return [self._download_resource(image)] + return [Resource(self.post, image)] else: out = [] for i, image in enumerate(images): if not re.match(r'https?://.*', image): image = "https://" + image - - out.append(self._download_resource(image)) + out.append(Resource(self.post, image)) return out @staticmethod diff --git a/bulkredditdownloader/site_downloaders/gallery.py b/bulkredditdownloader/site_downloaders/gallery.py index 7125674..8d53056 100644 --- a/bulkredditdownloader/site_downloaders/gallery.py +++ b/bulkredditdownloader/site_downloaders/gallery.py @@ -2,11 +2,14 @@ import json import logging +from typing import Optional import requests from praw.models import Submission +from bulkredditdownloader.authenticator import Authenticator from bulkredditdownloader.errors import NotADownloadableLinkError, ResourceNotFound +from bulkredditdownloader.resource import Resource from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader logger = logging.getLogger(__name__) @@ -18,7 +21,7 @@ class Gallery(BaseDownloader): link = self.post.url self.raw_data = self._get_data(link) - def download(self): + def find_resources(self, authenticator: Optional[Authenticator] = None) -> list[Resource]: images = {} count = 0 for model in self.raw_data['posts']['models']: @@ -61,7 +64,5 @@ class Gallery(BaseDownloader): return data def _download_album(self, images: dict): - out = [] - for image_key in images.keys(): - out.append(self._download_resource(images[image_key]['url'])) + out = [Resource(self.post, images[image_key]['url']) for image_key in images.keys()] return out diff --git a/bulkredditdownloader/site_downloaders/gfycat.py b/bulkredditdownloader/site_downloaders/gfycat.py index af94596..cd33f46 100644 --- a/bulkredditdownloader/site_downloaders/gfycat.py +++ b/bulkredditdownloader/site_downloaders/gfycat.py @@ -3,10 +3,13 @@ import json import re import urllib.request +from typing import Optional from bs4 import BeautifulSoup from praw.models import Submission +from bulkredditdownloader.authenticator import Authenticator +from bulkredditdownloader.resource import Resource from bulkredditdownloader.site_downloaders.gif_delivery_network import GifDeliveryNetwork @@ -14,14 +17,12 @@ class Gfycat(GifDeliveryNetwork): def __init__(self, post: Submission): super().__init__(post) - def download(self): - super().download() + def find_resources(self, authenticator: Optional[Authenticator] = None) -> list[Resource]: + return super().find_resources(authenticator) @staticmethod def _get_link(url: str) -> str: - """Extract direct link to the video from page's source - and return it - """ + """Extract direct link to the video from page's source and return it """ if re.match(r'\.(webm|mp4|gif)$', url): return url diff --git a/bulkredditdownloader/site_downloaders/gif_delivery_network.py b/bulkredditdownloader/site_downloaders/gif_delivery_network.py index b335ed8..072048e 100644 --- a/bulkredditdownloader/site_downloaders/gif_delivery_network.py +++ b/bulkredditdownloader/site_downloaders/gif_delivery_network.py @@ -1,11 +1,14 @@ #!/usr/bin/env python3 import urllib.request +from typing import Optional from bs4 import BeautifulSoup from praw.models import Submission +from bulkredditdownloader.authenticator import Authenticator from bulkredditdownloader.errors import NotADownloadableLinkError +from bulkredditdownloader.resource import Resource from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader @@ -13,19 +16,17 @@ class GifDeliveryNetwork(BaseDownloader): def __init__(self, post: Submission): super().__init__(post) - def download(self): + def find_resources(self, authenticator: Optional[Authenticator] = None) -> list[Resource]: try: media_url = self._get_link(self.post.url) except IndexError: raise NotADownloadableLinkError("Could not read the page source") - return [self._download_resource(media_url)] + return [Resource(self.post, media_url)] @staticmethod def _get_link(url: str) -> str: - """Extract direct link to the video from page's source - and return it - """ + """Extract direct link to the video from page's source and return it""" if '.webm' in url.split('/')[-1] or '.mp4' in url.split('/')[-1] or '.gif' in url.split('/')[-1]: return url diff --git a/bulkredditdownloader/site_downloaders/imgur.py b/bulkredditdownloader/site_downloaders/imgur.py index d555800..2111b44 100644 --- a/bulkredditdownloader/site_downloaders/imgur.py +++ b/bulkredditdownloader/site_downloaders/imgur.py @@ -2,11 +2,14 @@ import json import logging +from typing import Optional import requests from praw.models import Submission +from bulkredditdownloader.authenticator import Authenticator from bulkredditdownloader.errors import NotADownloadableLinkError, ResourceNotFound, SiteDownloaderError +from bulkredditdownloader.resource import Resource from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader from bulkredditdownloader.site_downloaders.direct import Direct @@ -14,19 +17,18 @@ logger = logging.getLogger(__name__) class Imgur(BaseDownloader): - imgur_image_domain = "https://i.imgur.com/" def __init__(self, post: Submission): super().__init__(post) self.raw_data = {} - def download(self): + def find_resources(self, authenticator: Optional[Authenticator] = None) -> list[Resource]: link = self.post.url if link.endswith(".gifv"): direct_thing = Direct(self.post) - return direct_thing.download() + return direct_thing.find_resources(authenticator) self.raw_data = self._get_data(link) @@ -47,13 +49,13 @@ class Imgur(BaseDownloader): for i in range(images_length): extension = self._validate_extension(images["images"][i]["ext"]) image_url = self.imgur_image_domain + images["images"][i]["hash"] + extension - out.append(self._download_resource(image_url)) + out.append(Resource(self.post, image_url)) return out def _download_image(self, image: dict): extension = self._validate_extension(image["ext"]) image_url = self.imgur_image_domain + image["hash"] + extension - return [self._download_resource(image_url)] + return [Resource(self.post, image_url)] def _is_album(self) -> bool: return "album_images" in self.raw_data diff --git a/bulkredditdownloader/site_downloaders/redgifs.py b/bulkredditdownloader/site_downloaders/redgifs.py index 7cb54fc..3e8ad8e 100644 --- a/bulkredditdownloader/site_downloaders/redgifs.py +++ b/bulkredditdownloader/site_downloaders/redgifs.py @@ -2,11 +2,14 @@ import json import urllib.request +from typing import Optional from bs4 import BeautifulSoup from praw.models import Submission +from bulkredditdownloader.authenticator import Authenticator from bulkredditdownloader.errors import NotADownloadableLinkError +from bulkredditdownloader.resource import Resource from bulkredditdownloader.site_downloaders.gif_delivery_network import GifDeliveryNetwork @@ -14,8 +17,8 @@ class Redgifs(GifDeliveryNetwork): def __init__(self, post: Submission): super().__init__(post) - def download(self): - super().download() + def find_resources(self, authenticator: Optional[Authenticator] = None) -> list[Resource]: + return super().find_resources(authenticator) @staticmethod def _get_link(url: str) -> str: @@ -31,7 +34,8 @@ class Redgifs(GifDeliveryNetwork): url.add_header( 'User-Agent', - 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36 OPR/54.0.2952.64') + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)' + ' Chrome/67.0.3396.87 Safari/537.36 OPR/54.0.2952.64') page_source = (urllib.request.urlopen(url).read().decode()) diff --git a/bulkredditdownloader/site_downloaders/self_post.py b/bulkredditdownloader/site_downloaders/self_post.py index cda5c78..f01b6f1 100644 --- a/bulkredditdownloader/site_downloaders/self_post.py +++ b/bulkredditdownloader/site_downloaders/self_post.py @@ -1,9 +1,11 @@ #!/usr/bin/env python3 import logging +from typing import Optional from praw.models import Submission +from bulkredditdownloader.authenticator import Authenticator from bulkredditdownloader.resource import Resource from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader @@ -14,8 +16,10 @@ class SelfPost(BaseDownloader): def __init__(self, post: Submission): super().__init__(post) - def download(self): - return Resource(self.post, self.post.url, bytes(self.export_to_string())) + def find_resources(self, authenticator: Optional[Authenticator] = None) -> list[Resource]: + out = Resource(self.post, self.post.url) + out.content = self.export_to_string() + return out def export_to_string(self) -> str: """Self posts are formatted here""" diff --git a/bulkredditdownloader/site_downloaders/vreddit.py b/bulkredditdownloader/site_downloaders/vreddit.py index 40df4b3..c92bf8a 100644 --- a/bulkredditdownloader/site_downloaders/vreddit.py +++ b/bulkredditdownloader/site_downloaders/vreddit.py @@ -5,10 +5,12 @@ import os import pathlib import subprocess import tempfile +from typing import Optional import requests from praw.models import Submission +from bulkredditdownloader.authenticator import Authenticator from bulkredditdownloader.resource import Resource from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader @@ -19,12 +21,12 @@ class VReddit(BaseDownloader): def __init__(self, post: Submission): super().__init__(post) - def download(self): + def find_resources(self, authenticator: Optional[Authenticator] = None) -> list[Resource]: try: fnull = open(os.devnull, 'w') subprocess.call("ffmpeg", stdout=fnull, stderr=subprocess.STDOUT) except subprocess.SubprocessError: - return self._download_resource(self.post.url) + return [Resource(self.post, self.post.url)] else: video_url = self.post.url audio_url = video_url[:video_url.rfind('/')] + '/DASH_audio.mp4' @@ -39,7 +41,9 @@ class VReddit(BaseDownloader): self._merge_audio(temp_dir) with open(temp_dir / 'output.mp4', 'rb') as file: content = file.read() - return Resource(self.post, self.post.url, content) + out = Resource(self.post, self.post.url) + out.content = content + return out @staticmethod def _merge_audio(working_directory: pathlib.Path): diff --git a/bulkredditdownloader/site_downloaders/youtube.py b/bulkredditdownloader/site_downloaders/youtube.py index 6184d26..d9da907 100644 --- a/bulkredditdownloader/site_downloaders/youtube.py +++ b/bulkredditdownloader/site_downloaders/youtube.py @@ -2,10 +2,12 @@ import logging import tempfile +from typing import Optional import youtube_dl from praw.models import Submission +from bulkredditdownloader.authenticator import Authenticator from bulkredditdownloader.resource import Resource from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader @@ -16,8 +18,8 @@ class Youtube(BaseDownloader): def __init__(self, post: Submission): super().__init__(post) - def download(self): - return self._download_video() + def find_resources(self, authenticator: Optional[Authenticator] = None) -> list[Resource]: + return [self._download_video()] def _download_video(self) -> Resource: with tempfile.TemporaryDirectory() as temp_dir: @@ -33,4 +35,6 @@ class Youtube(BaseDownloader): with open(temp_dir / 'test.mp4', 'rb') as file: content = file.read() - return Resource(self.post, self.post.url, content) + out = Resource(self.post, self.post.url) + out.content = content + return out diff --git a/bulkredditdownloader/tests/downloaders/test_base_downloader.py b/bulkredditdownloader/tests/downloaders/test_base_downloader.py deleted file mode 100644 index 3644abf..0000000 --- a/bulkredditdownloader/tests/downloaders/test_base_downloader.py +++ /dev/null @@ -1,30 +0,0 @@ -#!/usr/bin/env python3 -# coding=utf-8 - -from pathlib import Path -from unittest.mock import Mock - -import pytest - -from bulkredditdownloader.resource import Resource -from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader - - -class BlankDownloader(BaseDownloader): - def __init__(self, post): - super().__init__(post) - - def download(self) -> list[Resource]: - return [self._download_resource(self.post.url)] - - -@pytest.mark.parametrize(('test_url', 'expected_hash'), ( - ('https://docs.python.org/3/_static/py.png', 'a721fc7ec672275e257bbbfde49a4d4e'), -)) -def test_get_resource(test_url: str, expected_hash: str): - mock_submission = Mock - mock_submission.url = test_url - downloader = BlankDownloader(mock_submission) - result = downloader.download() - assert isinstance(result[0], Resource) - assert result[0].hash.hexdigest() == expected_hash diff --git a/bulkredditdownloader/tests/downloaders/test_gallery.py b/bulkredditdownloader/tests/downloaders/test_gallery.py index 1fd41e9..11a0651 100644 --- a/bulkredditdownloader/tests/downloaders/test_gallery.py +++ b/bulkredditdownloader/tests/downloaders/test_gallery.py @@ -15,6 +15,6 @@ def reddit_submission(reddit_instance) -> praw.models.Submission: def test_gallery(reddit_submission: praw.models.Submission): gallery = Gallery(reddit_submission) - results = gallery.download() + results = gallery.find_resources() assert len(results) == 4 assert all([isinstance(result, Resource) for result in results])