Integrate new base_downloader class
This commit is contained in:
parent
a75e94e43e
commit
e0d321c785
|
@ -3,42 +3,22 @@
|
|||
|
||||
import logging
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Optional
|
||||
|
||||
import requests
|
||||
from praw.models import Submission
|
||||
|
||||
from bulkredditdownloader.errors import SiteDownloaderError
|
||||
from bulkredditdownloader.authenticator import Authenticator
|
||||
from bulkredditdownloader.resource import Resource
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class BaseDownloader(ABC):
|
||||
def __init__(self, post: Submission):
|
||||
def __init__(self, post: Submission, typical_extension: Optional[str] = None):
|
||||
self.post = post
|
||||
self.hashes = []
|
||||
self.typical_extension = typical_extension
|
||||
|
||||
@abstractmethod
|
||||
def download(self) -> list[Resource]:
|
||||
def find_resources(self, authenticator: Optional[Authenticator] = None) -> list[Resource]:
|
||||
"""Return list of all un-downloaded Resources from submission"""
|
||||
raise NotImplementedError
|
||||
|
||||
def _download_resource(self, resource_url: str):
    """Fetch ``resource_url`` and wrap the response body in a Resource.

    The request is attempted up to three times. A dropped or reset
    connection triggers another attempt instead of aborting immediately.

    Args:
        resource_url: URL of the file to fetch.

    Returns:
        A Resource built from ``self.post``, the URL and the raw response bytes.

    Raises:
        SiteDownloaderError: if every attempt fails with a connection error.
    """
    headers = {
        "User-Agent":
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 "
            "Safari/537.36 OPR/54.0.2952.64",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Accept-Charset": "ISO-8859-1,utf-8;q=0.7,*;q=0.3",
        "Accept-Encoding": "none",
        "Accept-Language": "en-US,en;q=0.8",
        "Connection": "keep-alive",
    }
    max_attempts = 3
    last_error = None
    for _ in range(max_attempts):
        try:
            download_content = requests.get(resource_url, headers=headers).content
        except (ConnectionResetError, requests.exceptions.ConnectionError) as error:
            # requests reports dropped connections as its own ConnectionError;
            # remember the failure and try again rather than giving up at once.
            last_error = error
            continue
        return Resource(self.post, resource_url, download_content)

    # All attempts failed; surface the project-level error with the cause attached.
    raise SiteDownloaderError from last_error
|
||||
|
|
|
@ -1,7 +1,11 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
from typing import Optional
|
||||
|
||||
from praw.models import Submission
|
||||
|
||||
from bulkredditdownloader.authenticator import Authenticator
|
||||
from bulkredditdownloader.resource import Resource
|
||||
from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader
|
||||
|
||||
|
||||
|
@ -9,5 +13,5 @@ class Direct(BaseDownloader):
|
|||
def __init__(self, post: Submission):
|
||||
super().__init__(post)
|
||||
|
||||
def download(self):
|
||||
return [self._download_resource(self.post.url)]
|
||||
def find_resources(self, authenticator: Optional[Authenticator] = None) -> list[Resource]:
    """Return a one-element list holding a Resource for the submission's own URL.

    ``authenticator`` is accepted to match the base-class signature and is not used.
    """
    direct_link = self.post.url
    return [Resource(self.post, direct_link)]
|
||||
|
|
|
@ -5,10 +5,13 @@ import re
|
|||
import urllib.error
|
||||
import urllib.request
|
||||
from html.parser import HTMLParser
|
||||
from typing import Optional
|
||||
|
||||
from praw.models import Submission
|
||||
|
||||
from bulkredditdownloader.authenticator import Authenticator
|
||||
from bulkredditdownloader.errors import NotADownloadableLinkError
|
||||
from bulkredditdownloader.resource import Resource
|
||||
from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
@ -18,7 +21,7 @@ class Erome(BaseDownloader):
|
|||
def __init__(self, post: Submission):
|
||||
super().__init__(post)
|
||||
|
||||
def download(self):
|
||||
def find_resources(self, authenticator: Optional[Authenticator] = None) -> list[Resource]:
|
||||
try:
|
||||
images = self._get_links(self.post.url)
|
||||
except urllib.error.HTTPError:
|
||||
|
@ -29,15 +32,14 @@ class Erome(BaseDownloader):
|
|||
image = images[0]
|
||||
if not re.match(r'https?://.*', image):
|
||||
image = "https://" + image
|
||||
return [self._download_resource(image)]
|
||||
return [Resource(self.post, image)]
|
||||
|
||||
else:
|
||||
out = []
|
||||
for i, image in enumerate(images):
|
||||
if not re.match(r'https?://.*', image):
|
||||
image = "https://" + image
|
||||
|
||||
out.append(self._download_resource(image))
|
||||
out.append(Resource(self.post, image))
|
||||
return out
|
||||
|
||||
@staticmethod
|
||||
|
|
|
@ -2,11 +2,14 @@
|
|||
|
||||
import json
|
||||
import logging
|
||||
from typing import Optional
|
||||
|
||||
import requests
|
||||
from praw.models import Submission
|
||||
|
||||
from bulkredditdownloader.authenticator import Authenticator
|
||||
from bulkredditdownloader.errors import NotADownloadableLinkError, ResourceNotFound
|
||||
from bulkredditdownloader.resource import Resource
|
||||
from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
@ -18,7 +21,7 @@ class Gallery(BaseDownloader):
|
|||
link = self.post.url
|
||||
self.raw_data = self._get_data(link)
|
||||
|
||||
def download(self):
|
||||
def find_resources(self, authenticator: Optional[Authenticator] = None) -> list[Resource]:
|
||||
images = {}
|
||||
count = 0
|
||||
for model in self.raw_data['posts']['models']:
|
||||
|
@ -61,7 +64,5 @@ class Gallery(BaseDownloader):
|
|||
return data
|
||||
|
||||
def _download_album(self, images: dict):
|
||||
out = []
|
||||
for image_key in images.keys():
|
||||
out.append(self._download_resource(images[image_key]['url']))
|
||||
out = [Resource(self.post, images[image_key]['url']) for image_key in images.keys()]
|
||||
return out
|
||||
|
|
|
@ -3,10 +3,13 @@
|
|||
import json
|
||||
import re
|
||||
import urllib.request
|
||||
from typing import Optional
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
from praw.models import Submission
|
||||
|
||||
from bulkredditdownloader.authenticator import Authenticator
|
||||
from bulkredditdownloader.resource import Resource
|
||||
from bulkredditdownloader.site_downloaders.gif_delivery_network import GifDeliveryNetwork
|
||||
|
||||
|
||||
|
@ -14,14 +17,12 @@ class Gfycat(GifDeliveryNetwork):
|
|||
def __init__(self, post: Submission):
|
||||
super().__init__(post)
|
||||
|
||||
def download(self):
|
||||
super().download()
|
||||
def find_resources(self, authenticator: Optional[Authenticator] = None) -> list[Resource]:
    """Delegate resource discovery to the parent class and return its result unchanged."""
    resources = super().find_resources(authenticator)
    return resources
|
||||
|
||||
@staticmethod
|
||||
def _get_link(url: str) -> str:
|
||||
"""Extract direct link to the video from page's source
|
||||
and return it
|
||||
"""
|
||||
"""Extract direct link to the video from page's source and return it """
|
||||
if re.match(r'\.(webm|mp4|gif)$', url):
|
||||
return url
|
||||
|
||||
|
|
|
@ -1,11 +1,14 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
import urllib.request
|
||||
from typing import Optional
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
from praw.models import Submission
|
||||
|
||||
from bulkredditdownloader.authenticator import Authenticator
|
||||
from bulkredditdownloader.errors import NotADownloadableLinkError
|
||||
from bulkredditdownloader.resource import Resource
|
||||
from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader
|
||||
|
||||
|
||||
|
@ -13,19 +16,17 @@ class GifDeliveryNetwork(BaseDownloader):
|
|||
def __init__(self, post: Submission):
|
||||
super().__init__(post)
|
||||
|
||||
def download(self):
|
||||
def find_resources(self, authenticator: Optional[Authenticator] = None) -> list[Resource]:
|
||||
try:
|
||||
media_url = self._get_link(self.post.url)
|
||||
except IndexError:
|
||||
raise NotADownloadableLinkError("Could not read the page source")
|
||||
|
||||
return [self._download_resource(media_url)]
|
||||
return [Resource(self.post, media_url)]
|
||||
|
||||
@staticmethod
|
||||
def _get_link(url: str) -> str:
|
||||
"""Extract direct link to the video from page's source
|
||||
and return it
|
||||
"""
|
||||
"""Extract direct link to the video from page's source and return it"""
|
||||
if '.webm' in url.split('/')[-1] or '.mp4' in url.split('/')[-1] or '.gif' in url.split('/')[-1]:
|
||||
return url
|
||||
|
||||
|
|
|
@ -2,11 +2,14 @@
|
|||
|
||||
import json
|
||||
import logging
|
||||
from typing import Optional
|
||||
|
||||
import requests
|
||||
from praw.models import Submission
|
||||
|
||||
from bulkredditdownloader.authenticator import Authenticator
|
||||
from bulkredditdownloader.errors import NotADownloadableLinkError, ResourceNotFound, SiteDownloaderError
|
||||
from bulkredditdownloader.resource import Resource
|
||||
from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader
|
||||
from bulkredditdownloader.site_downloaders.direct import Direct
|
||||
|
||||
|
@ -14,19 +17,18 @@ logger = logging.getLogger(__name__)
|
|||
|
||||
|
||||
class Imgur(BaseDownloader):
|
||||
|
||||
imgur_image_domain = "https://i.imgur.com/"
|
||||
|
||||
def __init__(self, post: Submission):
|
||||
super().__init__(post)
|
||||
self.raw_data = {}
|
||||
|
||||
def download(self):
|
||||
def find_resources(self, authenticator: Optional[Authenticator] = None) -> list[Resource]:
|
||||
link = self.post.url
|
||||
|
||||
if link.endswith(".gifv"):
|
||||
direct_thing = Direct(self.post)
|
||||
return direct_thing.download()
|
||||
return direct_thing.find_resources(authenticator)
|
||||
|
||||
self.raw_data = self._get_data(link)
|
||||
|
||||
|
@ -47,13 +49,13 @@ class Imgur(BaseDownloader):
|
|||
for i in range(images_length):
|
||||
extension = self._validate_extension(images["images"][i]["ext"])
|
||||
image_url = self.imgur_image_domain + images["images"][i]["hash"] + extension
|
||||
out.append(self._download_resource(image_url))
|
||||
out.append(Resource(self.post, image_url))
|
||||
return out
|
||||
|
||||
def _download_image(self, image: dict):
|
||||
extension = self._validate_extension(image["ext"])
|
||||
image_url = self.imgur_image_domain + image["hash"] + extension
|
||||
return [self._download_resource(image_url)]
|
||||
return [Resource(self.post, image_url)]
|
||||
|
||||
def _is_album(self) -> bool:
|
||||
return "album_images" in self.raw_data
|
||||
|
|
|
@ -2,11 +2,14 @@
|
|||
|
||||
import json
|
||||
import urllib.request
|
||||
from typing import Optional
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
from praw.models import Submission
|
||||
|
||||
from bulkredditdownloader.authenticator import Authenticator
|
||||
from bulkredditdownloader.errors import NotADownloadableLinkError
|
||||
from bulkredditdownloader.resource import Resource
|
||||
from bulkredditdownloader.site_downloaders.gif_delivery_network import GifDeliveryNetwork
|
||||
|
||||
|
||||
|
@ -14,8 +17,8 @@ class Redgifs(GifDeliveryNetwork):
|
|||
def __init__(self, post: Submission):
|
||||
super().__init__(post)
|
||||
|
||||
def download(self):
|
||||
super().download()
|
||||
def find_resources(self, authenticator: Optional[Authenticator] = None) -> list[Resource]:
    """Delegate resource discovery to the parent class and return its result unchanged."""
    resources = super().find_resources(authenticator)
    return resources
|
||||
|
||||
@staticmethod
|
||||
def _get_link(url: str) -> str:
|
||||
|
@ -31,7 +34,8 @@ class Redgifs(GifDeliveryNetwork):
|
|||
|
||||
url.add_header(
|
||||
'User-Agent',
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36 OPR/54.0.2952.64')
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
|
||||
' Chrome/67.0.3396.87 Safari/537.36 OPR/54.0.2952.64')
|
||||
|
||||
page_source = (urllib.request.urlopen(url).read().decode())
|
||||
|
||||
|
|
|
@ -1,9 +1,11 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
import logging
|
||||
from typing import Optional
|
||||
|
||||
from praw.models import Submission
|
||||
|
||||
from bulkredditdownloader.authenticator import Authenticator
|
||||
from bulkredditdownloader.resource import Resource
|
||||
from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader
|
||||
|
||||
|
@ -14,8 +16,10 @@ class SelfPost(BaseDownloader):
|
|||
def __init__(self, post: Submission):
|
||||
super().__init__(post)
|
||||
|
||||
def download(self):
|
||||
return Resource(self.post, self.post.url, bytes(self.export_to_string()))
|
||||
def find_resources(self, authenticator: Optional[Authenticator] = None) -> list[Resource]:
    """Build the Resource for a self post and return it inside a list.

    The Resource is created for the submission's URL and its ``content`` is
    set to the text produced by ``export_to_string()``.
    ``authenticator`` is accepted to match the base-class signature and is not used.

    Returns:
        A one-element list, matching the declared ``list[Resource]`` return
        type and the other downloaders' ``find_resources`` implementations.
    """
    out = Resource(self.post, self.post.url)
    out.content = self.export_to_string()
    # Wrap in a list: a bare Resource contradicts the annotated return type.
    return [out]
|
||||
|
||||
def export_to_string(self) -> str:
|
||||
"""Self posts are formatted here"""
|
||||
|
|
|
@ -5,10 +5,12 @@ import os
|
|||
import pathlib
|
||||
import subprocess
|
||||
import tempfile
|
||||
from typing import Optional
|
||||
|
||||
import requests
|
||||
from praw.models import Submission
|
||||
|
||||
from bulkredditdownloader.authenticator import Authenticator
|
||||
from bulkredditdownloader.resource import Resource
|
||||
from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader
|
||||
|
||||
|
@ -19,12 +21,12 @@ class VReddit(BaseDownloader):
|
|||
def __init__(self, post: Submission):
|
||||
super().__init__(post)
|
||||
|
||||
def download(self):
|
||||
def find_resources(self, authenticator: Optional[Authenticator] = None) -> list[Resource]:
|
||||
try:
|
||||
fnull = open(os.devnull, 'w')
|
||||
subprocess.call("ffmpeg", stdout=fnull, stderr=subprocess.STDOUT)
|
||||
except subprocess.SubprocessError:
|
||||
return self._download_resource(self.post.url)
|
||||
return [Resource(self.post, self.post.url)]
|
||||
else:
|
||||
video_url = self.post.url
|
||||
audio_url = video_url[:video_url.rfind('/')] + '/DASH_audio.mp4'
|
||||
|
@ -39,7 +41,9 @@ class VReddit(BaseDownloader):
|
|||
self._merge_audio(temp_dir)
|
||||
with open(temp_dir / 'output.mp4', 'rb') as file:
|
||||
content = file.read()
|
||||
return Resource(self.post, self.post.url, content)
|
||||
out = Resource(self.post, self.post.url)
|
||||
out.content = content
|
||||
return out
|
||||
|
||||
@staticmethod
|
||||
def _merge_audio(working_directory: pathlib.Path):
|
||||
|
|
|
@ -2,10 +2,12 @@
|
|||
|
||||
import logging
|
||||
import tempfile
|
||||
from typing import Optional
|
||||
|
||||
import youtube_dl
|
||||
from praw.models import Submission
|
||||
|
||||
from bulkredditdownloader.authenticator import Authenticator
|
||||
from bulkredditdownloader.resource import Resource
|
||||
from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader
|
||||
|
||||
|
@ -16,8 +18,8 @@ class Youtube(BaseDownloader):
|
|||
def __init__(self, post: Submission):
|
||||
super().__init__(post)
|
||||
|
||||
def download(self):
|
||||
return self._download_video()
|
||||
def find_resources(self, authenticator: Optional[Authenticator] = None) -> list[Resource]:
    """Return the Resource produced by ``_download_video()`` wrapped in a list.

    ``authenticator`` is accepted to match the base-class signature and is not used.
    """
    video = self._download_video()
    return [video]
|
||||
|
||||
def _download_video(self) -> Resource:
|
||||
with tempfile.TemporaryDirectory() as temp_dir:
|
||||
|
@ -33,4 +35,6 @@ class Youtube(BaseDownloader):
|
|||
|
||||
with open(temp_dir / 'test.mp4', 'rb') as file:
|
||||
content = file.read()
|
||||
return Resource(self.post, self.post.url, content)
|
||||
out = Resource(self.post, self.post.url)
|
||||
out.content = content
|
||||
return out
|
||||
|
|
|
@ -1,30 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
# coding=utf-8
|
||||
|
||||
from pathlib import Path
|
||||
from unittest.mock import Mock
|
||||
|
||||
import pytest
|
||||
|
||||
from bulkredditdownloader.resource import Resource
|
||||
from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader
|
||||
|
||||
|
||||
class BlankDownloader(BaseDownloader):
    """Minimal concrete BaseDownloader used to exercise ``_download_resource``."""

    def __init__(self, post):
        super().__init__(post)

    def download(self) -> list[Resource]:
        """Fetch the post's URL and return the resulting Resource in a list."""
        fetched = self._download_resource(self.post.url)
        return [fetched]
|
||||
|
||||
|
||||
@pytest.mark.parametrize(('test_url', 'expected_hash'), (
    ('https://docs.python.org/3/_static/py.png', 'a721fc7ec672275e257bbbfde49a4d4e'),
))
def test_get_resource(test_url: str, expected_hash: str):
    """Check that the downloaded resource is a Resource whose hash matches the expected digest."""
    # Instantiate the mock: assigning ``url`` on the Mock class itself would
    # leak the attribute into every other Mock created in the test session.
    mock_submission = Mock()
    mock_submission.url = test_url
    downloader = BlankDownloader(mock_submission)
    result = downloader.download()
    assert isinstance(result[0], Resource)
    assert result[0].hash.hexdigest() == expected_hash
|
|
@ -15,6 +15,6 @@ def reddit_submission(reddit_instance) -> praw.models.Submission:
|
|||
|
||||
def test_gallery(reddit_submission: praw.models.Submission):
|
||||
gallery = Gallery(reddit_submission)
|
||||
results = gallery.download()
|
||||
results = gallery.find_resources()
|
||||
assert len(results) == 4
|
||||
assert all([isinstance(result, Resource) for result in results])
|
||||
|
|
Loading…
Reference in a new issue