1
0
Fork 0
mirror of synced 2024-06-28 19:10:41 +12:00

Integrate new base_downloader class

This commit is contained in:
Serene-Arc 2021-02-25 20:40:08 +10:00 committed by Ali Parlakci
parent a75e94e43e
commit e0d321c785
13 changed files with 70 additions and 93 deletions

View file

@ -3,42 +3,22 @@
import logging import logging
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from typing import Optional
import requests
from praw.models import Submission from praw.models import Submission
from bulkredditdownloader.errors import SiteDownloaderError from bulkredditdownloader.authenticator import Authenticator
from bulkredditdownloader.resource import Resource from bulkredditdownloader.resource import Resource
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
class BaseDownloader(ABC): class BaseDownloader(ABC):
def __init__(self, post: Submission): def __init__(self, post: Submission, typical_extension: Optional[str] = None):
self.post = post self.post = post
self.hashes = [] self.typical_extension = typical_extension
@abstractmethod @abstractmethod
def download(self) -> list[Resource]: def find_resources(self, authenticator: Optional[Authenticator] = None) -> list[Resource]:
"""Return list of all un-downloaded Resources from submission"""
raise NotImplementedError raise NotImplementedError
def _download_resource(self, resource_url: str):
    """Fetch ``resource_url`` and wrap the response body in a Resource.

    The request is attempted up to three times. A connection reset triggers
    another attempt instead of failing immediately.

    Args:
        resource_url: URL of the content to fetch.

    Returns:
        A Resource built from this downloader's post, the URL and the raw
        response body.

    Raises:
        SiteDownloaderError: if every attempt ends in a connection reset.
    """
    # Browser-like request headers sent with every attempt.
    headers = {
        "User-Agent":
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 "
            "Safari/537.36 OPR/54.0.2952.64",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Accept-Charset": "ISO-8859-1,utf-8;q=0.7,*;q=0.3",
        "Accept-Encoding": "none",
        "Accept-Language": "en-US,en;q=0.8",
        "Connection": "keep-alive",
    }
    max_attempts = 3
    last_error = None
    for _ in range(max_attempts):
        try:
            download_content = requests.get(resource_url, headers=headers).content
        except ConnectionResetError as error:
            # The earlier version raised here, so the loop never reached a
            # second attempt; remember the failure and try again instead.
            # NOTE(review): requests may surface resets as its own
            # ConnectionError type - confirm which exception is seen in practice.
            last_error = error
            continue
        return Resource(self.post, resource_url, download_content)
    # All attempts failed; keep the last low-level error as the cause.
    raise SiteDownloaderError from last_error

View file

@ -1,7 +1,11 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
from typing import Optional
from praw.models import Submission from praw.models import Submission
from bulkredditdownloader.authenticator import Authenticator
from bulkredditdownloader.resource import Resource
from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader
@ -9,5 +13,5 @@ class Direct(BaseDownloader):
def __init__(self, post: Submission): def __init__(self, post: Submission):
super().__init__(post) super().__init__(post)
def download(self): def find_resources(self, authenticator: Optional[Authenticator] = None) -> list[Resource]:
return [self._download_resource(self.post.url)] return [Resource(self.post, self.post.url)]

View file

@ -5,10 +5,13 @@ import re
import urllib.error import urllib.error
import urllib.request import urllib.request
from html.parser import HTMLParser from html.parser import HTMLParser
from typing import Optional
from praw.models import Submission from praw.models import Submission
from bulkredditdownloader.authenticator import Authenticator
from bulkredditdownloader.errors import NotADownloadableLinkError from bulkredditdownloader.errors import NotADownloadableLinkError
from bulkredditdownloader.resource import Resource
from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@ -18,7 +21,7 @@ class Erome(BaseDownloader):
def __init__(self, post: Submission): def __init__(self, post: Submission):
super().__init__(post) super().__init__(post)
def download(self): def find_resources(self, authenticator: Optional[Authenticator] = None) -> list[Resource]:
try: try:
images = self._get_links(self.post.url) images = self._get_links(self.post.url)
except urllib.error.HTTPError: except urllib.error.HTTPError:
@ -29,15 +32,14 @@ class Erome(BaseDownloader):
image = images[0] image = images[0]
if not re.match(r'https?://.*', image): if not re.match(r'https?://.*', image):
image = "https://" + image image = "https://" + image
return [self._download_resource(image)] return [Resource(self.post, image)]
else: else:
out = [] out = []
for i, image in enumerate(images): for i, image in enumerate(images):
if not re.match(r'https?://.*', image): if not re.match(r'https?://.*', image):
image = "https://" + image image = "https://" + image
out.append(Resource(self.post, image))
out.append(self._download_resource(image))
return out return out
@staticmethod @staticmethod

View file

@ -2,11 +2,14 @@
import json import json
import logging import logging
from typing import Optional
import requests import requests
from praw.models import Submission from praw.models import Submission
from bulkredditdownloader.authenticator import Authenticator
from bulkredditdownloader.errors import NotADownloadableLinkError, ResourceNotFound from bulkredditdownloader.errors import NotADownloadableLinkError, ResourceNotFound
from bulkredditdownloader.resource import Resource
from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@ -18,7 +21,7 @@ class Gallery(BaseDownloader):
link = self.post.url link = self.post.url
self.raw_data = self._get_data(link) self.raw_data = self._get_data(link)
def download(self): def find_resources(self, authenticator: Optional[Authenticator] = None) -> list[Resource]:
images = {} images = {}
count = 0 count = 0
for model in self.raw_data['posts']['models']: for model in self.raw_data['posts']['models']:
@ -61,7 +64,5 @@ class Gallery(BaseDownloader):
return data return data
def _download_album(self, images: dict): def _download_album(self, images: dict):
out = [] out = [Resource(self.post, images[image_key]['url']) for image_key in images.keys()]
for image_key in images.keys():
out.append(self._download_resource(images[image_key]['url']))
return out return out

View file

@ -3,10 +3,13 @@
import json import json
import re import re
import urllib.request import urllib.request
from typing import Optional
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from praw.models import Submission from praw.models import Submission
from bulkredditdownloader.authenticator import Authenticator
from bulkredditdownloader.resource import Resource
from bulkredditdownloader.site_downloaders.gif_delivery_network import GifDeliveryNetwork from bulkredditdownloader.site_downloaders.gif_delivery_network import GifDeliveryNetwork
@ -14,14 +17,12 @@ class Gfycat(GifDeliveryNetwork):
def __init__(self, post: Submission): def __init__(self, post: Submission):
super().__init__(post) super().__init__(post)
def download(self): def find_resources(self, authenticator: Optional[Authenticator] = None) -> list[Resource]:
super().download() return super().find_resources(authenticator)
@staticmethod @staticmethod
def _get_link(url: str) -> str: def _get_link(url: str) -> str:
"""Extract direct link to the video from page's source """Extract direct link to the video from page's source and return it """
and return it
"""
if re.match(r'\.(webm|mp4|gif)$', url): if re.match(r'\.(webm|mp4|gif)$', url):
return url return url

View file

@ -1,11 +1,14 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
import urllib.request import urllib.request
from typing import Optional
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from praw.models import Submission from praw.models import Submission
from bulkredditdownloader.authenticator import Authenticator
from bulkredditdownloader.errors import NotADownloadableLinkError from bulkredditdownloader.errors import NotADownloadableLinkError
from bulkredditdownloader.resource import Resource
from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader
@ -13,19 +16,17 @@ class GifDeliveryNetwork(BaseDownloader):
def __init__(self, post: Submission): def __init__(self, post: Submission):
super().__init__(post) super().__init__(post)
def download(self): def find_resources(self, authenticator: Optional[Authenticator] = None) -> list[Resource]:
try: try:
media_url = self._get_link(self.post.url) media_url = self._get_link(self.post.url)
except IndexError: except IndexError:
raise NotADownloadableLinkError("Could not read the page source") raise NotADownloadableLinkError("Could not read the page source")
return [self._download_resource(media_url)] return [Resource(self.post, media_url)]
@staticmethod @staticmethod
def _get_link(url: str) -> str: def _get_link(url: str) -> str:
"""Extract direct link to the video from page's source """Extract direct link to the video from page's source and return it"""
and return it
"""
if '.webm' in url.split('/')[-1] or '.mp4' in url.split('/')[-1] or '.gif' in url.split('/')[-1]: if '.webm' in url.split('/')[-1] or '.mp4' in url.split('/')[-1] or '.gif' in url.split('/')[-1]:
return url return url

View file

@ -2,11 +2,14 @@
import json import json
import logging import logging
from typing import Optional
import requests import requests
from praw.models import Submission from praw.models import Submission
from bulkredditdownloader.authenticator import Authenticator
from bulkredditdownloader.errors import NotADownloadableLinkError, ResourceNotFound, SiteDownloaderError from bulkredditdownloader.errors import NotADownloadableLinkError, ResourceNotFound, SiteDownloaderError
from bulkredditdownloader.resource import Resource
from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader
from bulkredditdownloader.site_downloaders.direct import Direct from bulkredditdownloader.site_downloaders.direct import Direct
@ -14,19 +17,18 @@ logger = logging.getLogger(__name__)
class Imgur(BaseDownloader): class Imgur(BaseDownloader):
imgur_image_domain = "https://i.imgur.com/" imgur_image_domain = "https://i.imgur.com/"
def __init__(self, post: Submission): def __init__(self, post: Submission):
super().__init__(post) super().__init__(post)
self.raw_data = {} self.raw_data = {}
def download(self): def find_resources(self, authenticator: Optional[Authenticator] = None) -> list[Resource]:
link = self.post.url link = self.post.url
if link.endswith(".gifv"): if link.endswith(".gifv"):
direct_thing = Direct(self.post) direct_thing = Direct(self.post)
return direct_thing.download() return direct_thing.find_resources(authenticator)
self.raw_data = self._get_data(link) self.raw_data = self._get_data(link)
@ -47,13 +49,13 @@ class Imgur(BaseDownloader):
for i in range(images_length): for i in range(images_length):
extension = self._validate_extension(images["images"][i]["ext"]) extension = self._validate_extension(images["images"][i]["ext"])
image_url = self.imgur_image_domain + images["images"][i]["hash"] + extension image_url = self.imgur_image_domain + images["images"][i]["hash"] + extension
out.append(self._download_resource(image_url)) out.append(Resource(self.post, image_url))
return out return out
def _download_image(self, image: dict): def _download_image(self, image: dict):
extension = self._validate_extension(image["ext"]) extension = self._validate_extension(image["ext"])
image_url = self.imgur_image_domain + image["hash"] + extension image_url = self.imgur_image_domain + image["hash"] + extension
return [self._download_resource(image_url)] return [Resource(self.post, image_url)]
def _is_album(self) -> bool: def _is_album(self) -> bool:
return "album_images" in self.raw_data return "album_images" in self.raw_data

View file

@ -2,11 +2,14 @@
import json import json
import urllib.request import urllib.request
from typing import Optional
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from praw.models import Submission from praw.models import Submission
from bulkredditdownloader.authenticator import Authenticator
from bulkredditdownloader.errors import NotADownloadableLinkError from bulkredditdownloader.errors import NotADownloadableLinkError
from bulkredditdownloader.resource import Resource
from bulkredditdownloader.site_downloaders.gif_delivery_network import GifDeliveryNetwork from bulkredditdownloader.site_downloaders.gif_delivery_network import GifDeliveryNetwork
@ -14,8 +17,8 @@ class Redgifs(GifDeliveryNetwork):
def __init__(self, post: Submission): def __init__(self, post: Submission):
super().__init__(post) super().__init__(post)
def download(self): def find_resources(self, authenticator: Optional[Authenticator] = None) -> list[Resource]:
super().download() return super().find_resources(authenticator)
@staticmethod @staticmethod
def _get_link(url: str) -> str: def _get_link(url: str) -> str:
@ -31,7 +34,8 @@ class Redgifs(GifDeliveryNetwork):
url.add_header( url.add_header(
'User-Agent', 'User-Agent',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36 OPR/54.0.2952.64') 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
' Chrome/67.0.3396.87 Safari/537.36 OPR/54.0.2952.64')
page_source = (urllib.request.urlopen(url).read().decode()) page_source = (urllib.request.urlopen(url).read().decode())

View file

@ -1,9 +1,11 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
import logging import logging
from typing import Optional
from praw.models import Submission from praw.models import Submission
from bulkredditdownloader.authenticator import Authenticator
from bulkredditdownloader.resource import Resource from bulkredditdownloader.resource import Resource
from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader
@ -14,8 +16,10 @@ class SelfPost(BaseDownloader):
def __init__(self, post: Submission): def __init__(self, post: Submission):
super().__init__(post) super().__init__(post)
def download(self): def find_resources(self, authenticator: Optional[Authenticator] = None) -> list[Resource]:
return Resource(self.post, self.post.url, bytes(self.export_to_string())) out = Resource(self.post, self.post.url)
out.content = self.export_to_string()
return out
def export_to_string(self) -> str: def export_to_string(self) -> str:
"""Self posts are formatted here""" """Self posts are formatted here"""

View file

@ -5,10 +5,12 @@ import os
import pathlib import pathlib
import subprocess import subprocess
import tempfile import tempfile
from typing import Optional
import requests import requests
from praw.models import Submission from praw.models import Submission
from bulkredditdownloader.authenticator import Authenticator
from bulkredditdownloader.resource import Resource from bulkredditdownloader.resource import Resource
from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader
@ -19,12 +21,12 @@ class VReddit(BaseDownloader):
def __init__(self, post: Submission): def __init__(self, post: Submission):
super().__init__(post) super().__init__(post)
def download(self): def find_resources(self, authenticator: Optional[Authenticator] = None) -> list[Resource]:
try: try:
fnull = open(os.devnull, 'w') fnull = open(os.devnull, 'w')
subprocess.call("ffmpeg", stdout=fnull, stderr=subprocess.STDOUT) subprocess.call("ffmpeg", stdout=fnull, stderr=subprocess.STDOUT)
except subprocess.SubprocessError: except subprocess.SubprocessError:
return self._download_resource(self.post.url) return [Resource(self.post, self.post.url)]
else: else:
video_url = self.post.url video_url = self.post.url
audio_url = video_url[:video_url.rfind('/')] + '/DASH_audio.mp4' audio_url = video_url[:video_url.rfind('/')] + '/DASH_audio.mp4'
@ -39,7 +41,9 @@ class VReddit(BaseDownloader):
self._merge_audio(temp_dir) self._merge_audio(temp_dir)
with open(temp_dir / 'output.mp4', 'rb') as file: with open(temp_dir / 'output.mp4', 'rb') as file:
content = file.read() content = file.read()
return Resource(self.post, self.post.url, content) out = Resource(self.post, self.post.url)
out.content = content
return out
@staticmethod @staticmethod
def _merge_audio(working_directory: pathlib.Path): def _merge_audio(working_directory: pathlib.Path):

View file

@ -2,10 +2,12 @@
import logging import logging
import tempfile import tempfile
from typing import Optional
import youtube_dl import youtube_dl
from praw.models import Submission from praw.models import Submission
from bulkredditdownloader.authenticator import Authenticator
from bulkredditdownloader.resource import Resource from bulkredditdownloader.resource import Resource
from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader
@ -16,8 +18,8 @@ class Youtube(BaseDownloader):
def __init__(self, post: Submission): def __init__(self, post: Submission):
super().__init__(post) super().__init__(post)
def download(self): def find_resources(self, authenticator: Optional[Authenticator] = None) -> list[Resource]:
return self._download_video() return [self._download_video()]
def _download_video(self) -> Resource: def _download_video(self) -> Resource:
with tempfile.TemporaryDirectory() as temp_dir: with tempfile.TemporaryDirectory() as temp_dir:
@ -33,4 +35,6 @@ class Youtube(BaseDownloader):
with open(temp_dir / 'test.mp4', 'rb') as file: with open(temp_dir / 'test.mp4', 'rb') as file:
content = file.read() content = file.read()
return Resource(self.post, self.post.url, content) out = Resource(self.post, self.post.url)
out.content = content
return out

View file

@ -1,30 +0,0 @@
#!/usr/bin/env python3
# coding=utf-8
from pathlib import Path
from unittest.mock import Mock
import pytest
from bulkredditdownloader.resource import Resource
from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader
class BlankDownloader(BaseDownloader):
    """Smallest concrete BaseDownloader, used to exercise _download_resource."""

    def __init__(self, post):
        super().__init__(post)

    def download(self) -> list[Resource]:
        """Fetch the post's own URL and return the result as a one-item list."""
        fetched = self._download_resource(self.post.url)
        return [fetched]
@pytest.mark.parametrize(('test_url', 'expected_hash'), (
    ('https://docs.python.org/3/_static/py.png', 'a721fc7ec672275e257bbbfde49a4d4e'),
))
def test_get_resource(test_url: str, expected_hash: str):
    """Download a known file through BlankDownloader and compare its hash digest."""
    # Use a Mock *instance*: assigning `url` on the Mock class itself would
    # leak that attribute into every other Mock created in the test session.
    mock_submission = Mock()
    mock_submission.url = test_url
    downloader = BlankDownloader(mock_submission)
    result = downloader.download()
    assert isinstance(result[0], Resource)
    assert result[0].hash.hexdigest() == expected_hash

View file

@ -15,6 +15,6 @@ def reddit_submission(reddit_instance) -> praw.models.Submission:
def test_gallery(reddit_submission: praw.models.Submission): def test_gallery(reddit_submission: praw.models.Submission):
gallery = Gallery(reddit_submission) gallery = Gallery(reddit_submission)
results = gallery.download() results = gallery.find_resources()
assert len(results) == 4 assert len(results) == 4
assert all([isinstance(result, Resource) for result in results]) assert all([isinstance(result, Resource) for result in results])