
Integrate new base_downloader class

Serene-Arc 2021-02-25 20:40:08 +10:00 committed by Ali Parlakci
parent a75e94e43e
commit e0d321c785
13 changed files with 70 additions and 93 deletions
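This commit replaces each site downloader's download() method with a lazy find_resources(authenticator) method that returns Resource objects instead of fetching content immediately, and drops the _download_resource retry helper from the base class. Below is a minimal sketch of the resulting BaseDownloader interface, reconstructed from the first diff in this commit; it is not a verbatim copy of the file, and instance attributes beyond those shown may differ.

# Sketch only: signature and imports are taken from the diff that follows.
import logging
from abc import ABC, abstractmethod
from typing import Optional

from praw.models import Submission

from bulkredditdownloader.authenticator import Authenticator
from bulkredditdownloader.resource import Resource

logger = logging.getLogger(__name__)


class BaseDownloader(ABC):
    def __init__(self, post: Submission, typical_extension: Optional[str] = None):
        self.post = post
        self.typical_extension = typical_extension

    @abstractmethod
    def find_resources(self, authenticator: Optional[Authenticator] = None) -> list[Resource]:
        """Return list of all un-downloaded Resources from submission"""
        raise NotImplementedError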

View file

@@ -3,42 +3,22 @@
import logging
from abc import ABC, abstractmethod
from typing import Optional
import requests
from praw.models import Submission
from bulkredditdownloader.errors import SiteDownloaderError
from bulkredditdownloader.authenticator import Authenticator
from bulkredditdownloader.resource import Resource
logger = logging.getLogger(__name__)
class BaseDownloader(ABC):
def __init__(self, post: Submission):
def __init__(self, post: Submission, typical_extension: Optional[str] = None):
self.post = post
self.hashes = []
self.typical_extension = typical_extension
@abstractmethod
def download(self) -> list[Resource]:
def find_resources(self, authenticator: Optional[Authenticator] = None) -> list[Resource]:
"""Return list of all un-downloaded Resources from submission"""
raise NotImplementedError
def _download_resource(self, resource_url: str):
headers = {
"User-Agent":
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 "
"Safari/537.36 OPR/54.0.2952.64",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
"Accept-Charset": "ISO-8859-1,utf-8;q=0.7,*;q=0.3",
"Accept-Encoding": "none",
"Accept-Language": "en-US,en;q=0.8",
"Connection": "keep-alive",
}
# Loop to attempt download 3 times
for i in range(3):
try:
download_content = requests.get(resource_url, headers=headers).content
except ConnectionResetError:
raise SiteDownloaderError
return Resource(self.post, resource_url, download_content)
raise SiteDownloaderError

View file

@@ -1,7 +1,11 @@
#!/usr/bin/env python3
from typing import Optional
from praw.models import Submission
from bulkredditdownloader.authenticator import Authenticator
from bulkredditdownloader.resource import Resource
from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader
@@ -9,5 +13,5 @@ class Direct(BaseDownloader):
def __init__(self, post: Submission):
super().__init__(post)
def download(self):
return [self._download_resource(self.post.url)]
def find_resources(self, authenticator: Optional[Authenticator] = None) -> list[Resource]:
return [Resource(self.post, self.post.url)]
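With this change a concrete downloader such as Direct only wraps URLs in Resource placeholders; no network request happens inside find_resources(). A minimal caller-side sketch, using a Mock submission in the style of the deleted test_base_downloader.py further down (the example URL is the one used by that test):

from unittest.mock import Mock

from bulkredditdownloader.site_downloaders.direct import Direct

# Stand-in Submission; only the url attribute is read by Direct.
mock_submission = Mock()
mock_submission.url = 'https://docs.python.org/3/_static/py.png'

# Returns [Resource(mock_submission, mock_submission.url)] without downloading
# anything; fetching the content is left to whatever consumes the Resources.
resources = Direct(mock_submission).find_resources()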

View file

@@ -5,10 +5,13 @@ import re
import urllib.error
import urllib.request
from html.parser import HTMLParser
from typing import Optional
from praw.models import Submission
from bulkredditdownloader.authenticator import Authenticator
from bulkredditdownloader.errors import NotADownloadableLinkError
from bulkredditdownloader.resource import Resource
from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader
logger = logging.getLogger(__name__)
@@ -18,7 +21,7 @@ class Erome(BaseDownloader):
def __init__(self, post: Submission):
super().__init__(post)
def download(self):
def find_resources(self, authenticator: Optional[Authenticator] = None) -> list[Resource]:
try:
images = self._get_links(self.post.url)
except urllib.error.HTTPError:
@@ -29,15 +32,14 @@ class Erome(BaseDownloader):
image = images[0]
if not re.match(r'https?://.*', image):
image = "https://" + image
return [self._download_resource(image)]
return [Resource(self.post, image)]
else:
out = []
for i, image in enumerate(images):
if not re.match(r'https?://.*', image):
image = "https://" + image
out.append(self._download_resource(image))
out.append(Resource(self.post, image))
return out
@staticmethod

View file

@@ -2,11 +2,14 @@
import json
import logging
from typing import Optional
import requests
from praw.models import Submission
from bulkredditdownloader.authenticator import Authenticator
from bulkredditdownloader.errors import NotADownloadableLinkError, ResourceNotFound
from bulkredditdownloader.resource import Resource
from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader
logger = logging.getLogger(__name__)
@@ -18,7 +21,7 @@ class Gallery(BaseDownloader):
link = self.post.url
self.raw_data = self._get_data(link)
def download(self):
def find_resources(self, authenticator: Optional[Authenticator] = None) -> list[Resource]:
images = {}
count = 0
for model in self.raw_data['posts']['models']:
@@ -61,7 +64,5 @@ class Gallery(BaseDownloader):
return data
def _download_album(self, images: dict):
out = []
for image_key in images.keys():
out.append(self._download_resource(images[image_key]['url']))
out = [Resource(self.post, images[image_key]['url']) for image_key in images.keys()]
return out

View file

@@ -3,10 +3,13 @@
import json
import re
import urllib.request
from typing import Optional
from bs4 import BeautifulSoup
from praw.models import Submission
from bulkredditdownloader.authenticator import Authenticator
from bulkredditdownloader.resource import Resource
from bulkredditdownloader.site_downloaders.gif_delivery_network import GifDeliveryNetwork
@@ -14,14 +17,12 @@ class Gfycat(GifDeliveryNetwork):
def __init__(self, post: Submission):
super().__init__(post)
def download(self):
super().download()
def find_resources(self, authenticator: Optional[Authenticator] = None) -> list[Resource]:
return super().find_resources(authenticator)
@staticmethod
def _get_link(url: str) -> str:
"""Extract direct link to the video from page's source
and return it
"""
"""Extract direct link to the video from page's source and return it """
if re.match(r'\.(webm|mp4|gif)$', url):
return url

View file

@@ -1,11 +1,14 @@
#!/usr/bin/env python3
import urllib.request
from typing import Optional
from bs4 import BeautifulSoup
from praw.models import Submission
from bulkredditdownloader.authenticator import Authenticator
from bulkredditdownloader.errors import NotADownloadableLinkError
from bulkredditdownloader.resource import Resource
from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader
@@ -13,19 +16,17 @@ class GifDeliveryNetwork(BaseDownloader):
def __init__(self, post: Submission):
super().__init__(post)
def download(self):
def find_resources(self, authenticator: Optional[Authenticator] = None) -> list[Resource]:
try:
media_url = self._get_link(self.post.url)
except IndexError:
raise NotADownloadableLinkError("Could not read the page source")
return [self._download_resource(media_url)]
return [Resource(self.post, media_url)]
@staticmethod
def _get_link(url: str) -> str:
"""Extract direct link to the video from page's source
and return it
"""
"""Extract direct link to the video from page's source and return it"""
if '.webm' in url.split('/')[-1] or '.mp4' in url.split('/')[-1] or '.gif' in url.split('/')[-1]:
return url

View file

@@ -2,11 +2,14 @@
import json
import logging
from typing import Optional
import requests
from praw.models import Submission
from bulkredditdownloader.authenticator import Authenticator
from bulkredditdownloader.errors import NotADownloadableLinkError, ResourceNotFound, SiteDownloaderError
from bulkredditdownloader.resource import Resource
from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader
from bulkredditdownloader.site_downloaders.direct import Direct
@@ -14,19 +17,18 @@ logger = logging.getLogger(__name__)
class Imgur(BaseDownloader):
imgur_image_domain = "https://i.imgur.com/"
def __init__(self, post: Submission):
super().__init__(post)
self.raw_data = {}
def download(self):
def find_resources(self, authenticator: Optional[Authenticator] = None) -> list[Resource]:
link = self.post.url
if link.endswith(".gifv"):
direct_thing = Direct(self.post)
return direct_thing.download()
return direct_thing.find_resources(authenticator)
self.raw_data = self._get_data(link)
@@ -47,13 +49,13 @@ class Imgur(BaseDownloader):
for i in range(images_length):
extension = self._validate_extension(images["images"][i]["ext"])
image_url = self.imgur_image_domain + images["images"][i]["hash"] + extension
out.append(self._download_resource(image_url))
out.append(Resource(self.post, image_url))
return out
def _download_image(self, image: dict):
extension = self._validate_extension(image["ext"])
image_url = self.imgur_image_domain + image["hash"] + extension
return [self._download_resource(image_url)]
return [Resource(self.post, image_url)]
def _is_album(self) -> bool:
return "album_images" in self.raw_data

View file

@@ -2,11 +2,14 @@
import json
import urllib.request
from typing import Optional
from bs4 import BeautifulSoup
from praw.models import Submission
from bulkredditdownloader.authenticator import Authenticator
from bulkredditdownloader.errors import NotADownloadableLinkError
from bulkredditdownloader.resource import Resource
from bulkredditdownloader.site_downloaders.gif_delivery_network import GifDeliveryNetwork
@@ -14,8 +17,8 @@ class Redgifs(GifDeliveryNetwork):
def __init__(self, post: Submission):
super().__init__(post)
def download(self):
super().download()
def find_resources(self, authenticator: Optional[Authenticator] = None) -> list[Resource]:
return super().find_resources(authenticator)
@staticmethod
def _get_link(url: str) -> str:
@@ -31,7 +34,8 @@ class Redgifs(GifDeliveryNetwork):
url.add_header(
'User-Agent',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36 OPR/54.0.2952.64')
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
' Chrome/67.0.3396.87 Safari/537.36 OPR/54.0.2952.64')
page_source = (urllib.request.urlopen(url).read().decode())

View file

@@ -1,9 +1,11 @@
#!/usr/bin/env python3
import logging
from typing import Optional
from praw.models import Submission
from bulkredditdownloader.authenticator import Authenticator
from bulkredditdownloader.resource import Resource
from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader
@@ -14,8 +16,10 @@ class SelfPost(BaseDownloader):
def __init__(self, post: Submission):
super().__init__(post)
def download(self):
return Resource(self.post, self.post.url, bytes(self.export_to_string()))
def find_resources(self, authenticator: Optional[Authenticator] = None) -> list[Resource]:
out = Resource(self.post, self.post.url)
out.content = self.export_to_string()
return out
def export_to_string(self) -> str:
"""Self posts are formatted here"""

View file

@@ -5,10 +5,12 @@ import os
import pathlib
import subprocess
import tempfile
from typing import Optional
import requests
from praw.models import Submission
from bulkredditdownloader.authenticator import Authenticator
from bulkredditdownloader.resource import Resource
from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader
@@ -19,12 +21,12 @@ class VReddit(BaseDownloader):
def __init__(self, post: Submission):
super().__init__(post)
def download(self):
def find_resources(self, authenticator: Optional[Authenticator] = None) -> list[Resource]:
try:
fnull = open(os.devnull, 'w')
subprocess.call("ffmpeg", stdout=fnull, stderr=subprocess.STDOUT)
except subprocess.SubprocessError:
return self._download_resource(self.post.url)
return [Resource(self.post, self.post.url)]
else:
video_url = self.post.url
audio_url = video_url[:video_url.rfind('/')] + '/DASH_audio.mp4'
@@ -39,7 +41,9 @@ class VReddit(BaseDownloader):
self._merge_audio(temp_dir)
with open(temp_dir / 'output.mp4', 'rb') as file:
content = file.read()
return Resource(self.post, self.post.url, content)
out = Resource(self.post, self.post.url)
out.content = content
return out
@staticmethod
def _merge_audio(working_directory: pathlib.Path):

View file

@@ -2,10 +2,12 @@
import logging
import tempfile
from typing import Optional
import youtube_dl
from praw.models import Submission
from bulkredditdownloader.authenticator import Authenticator
from bulkredditdownloader.resource import Resource
from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader
@@ -16,8 +18,8 @@ class Youtube(BaseDownloader):
def __init__(self, post: Submission):
super().__init__(post)
def download(self):
return self._download_video()
def find_resources(self, authenticator: Optional[Authenticator] = None) -> list[Resource]:
return [self._download_video()]
def _download_video(self) -> Resource:
with tempfile.TemporaryDirectory() as temp_dir:
@@ -33,4 +35,6 @@ class Youtube(BaseDownloader):
with open(temp_dir / 'test.mp4', 'rb') as file:
content = file.read()
return Resource(self.post, self.post.url, content)
out = Resource(self.post, self.post.url)
out.content = content
return out

View file

@@ -1,30 +0,0 @@
#!/usr/bin/env python3
# coding=utf-8
from pathlib import Path
from unittest.mock import Mock
import pytest
from bulkredditdownloader.resource import Resource
from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader
class BlankDownloader(BaseDownloader):
def __init__(self, post):
super().__init__(post)
def download(self) -> list[Resource]:
return [self._download_resource(self.post.url)]
@pytest.mark.parametrize(('test_url', 'expected_hash'), (
('https://docs.python.org/3/_static/py.png', 'a721fc7ec672275e257bbbfde49a4d4e'),
))
def test_get_resource(test_url: str, expected_hash: str):
mock_submission = Mock
mock_submission.url = test_url
downloader = BlankDownloader(mock_submission)
result = downloader.download()
assert isinstance(result[0], Resource)
assert result[0].hash.hexdigest() == expected_hash

View file

@@ -15,6 +15,6 @@ def reddit_submission(reddit_instance) -> praw.models.Submission:
def test_gallery(reddit_submission: praw.models.Submission):
gallery = Gallery(reddit_submission)
results = gallery.download()
results = gallery.find_resources()
assert len(results) == 4
assert all([isinstance(result, Resource) for result in results])