
Merge pull request #517 from aliparlakci/development

v2.4
Ali Parlakçı 2021-09-12 20:20:24 +03:00 committed by GitHub
commit afe3b71f59
33 changed files with 324 additions and 132 deletions


@@ -76,6 +76,10 @@ The following options are common between both the `archive` and `download` commands
   - Can be specified multiple times
   - Disables certain modules from being used
   - See [Disabling Modules](#disabling-modules) for more information and a list of module names
+- `--include-id-file`
+  - This will add any submission with the IDs in the files provided
+  - Can be specified multiple times
+  - Format is one ID per line
 - `--log`
   - This allows one to specify the location of the logfile
   - This must be done when running multiple instances of the BDFR, see [Multiple Instances](#multiple-instances) below


@@ -6,9 +6,9 @@ import sys
 import click

 from bdfr.archiver import Archiver
+from bdfr.cloner import RedditCloner
 from bdfr.configuration import Configuration
 from bdfr.downloader import RedditDownloader
-from bdfr.cloner import RedditCloner

 logger = logging.getLogger()
@@ -17,6 +17,7 @@ _common_options = [
     click.option('--authenticate', is_flag=True, default=None),
     click.option('--config', type=str, default=None),
     click.option('--disable-module', multiple=True, default=None, type=str),
+    click.option('--include-id-file', multiple=True, default=None),
     click.option('--log', type=str, default=None),
     click.option('--saved', is_flag=True, default=None),
     click.option('--search', default=None, type=str),
@@ -26,12 +27,12 @@ _common_options = [
     click.option('-L', '--limit', default=None, type=int),
     click.option('-l', '--link', multiple=True, default=None, type=str),
     click.option('-m', '--multireddit', multiple=True, default=None, type=str),
+    click.option('-S', '--sort', type=click.Choice(('hot', 'top', 'new', 'controversial', 'rising', 'relevance')),
+                 default=None),
     click.option('-s', '--subreddit', multiple=True, default=None, type=str),
-    click.option('-v', '--verbose', default=None, count=True),
-    click.option('-u', '--user', type=str, multiple=True, default=None),
     click.option('-t', '--time', type=click.Choice(('all', 'hour', 'day', 'week', 'month', 'year')), default=None),
-    click.option('-S', '--sort', type=click.Choice(('hot', 'top', 'new',
-                 'controversial', 'rising', 'relevance')), default=None),
+    click.option('-u', '--user', type=str, multiple=True, default=None),
+    click.option('-v', '--verbose', default=None, count=True),
 ]
 _downloader_options = [


@@ -76,17 +76,17 @@ class Archiver(RedditConnector):
         logger.info(f'Record for entry item {praw_item.id} written to disk')

     def _write_entry_json(self, entry: BaseArchiveEntry):
-        resource = Resource(entry.source, '', '.json')
+        resource = Resource(entry.source, '', lambda: None, '.json')
         content = json.dumps(entry.compile())
         self._write_content_to_disk(resource, content)

     def _write_entry_xml(self, entry: BaseArchiveEntry):
-        resource = Resource(entry.source, '', '.xml')
+        resource = Resource(entry.source, '', lambda: None, '.xml')
         content = dict2xml.dict2xml(entry.compile(), wrap='root')
         self._write_content_to_disk(resource, content)

     def _write_entry_yaml(self, entry: BaseArchiveEntry):
-        resource = Resource(entry.source, '', '.yaml')
+        resource = Resource(entry.source, '', lambda: None, '.yaml')
         content = yaml.dump(entry.compile())
         self._write_content_to_disk(resource, content)
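Wherever the BDFR already has the bytes in hand (archive entries, self posts), the new download_function parameter is satisfied with a no-op lambda and the content is attached directly, so the callable is never invoked. A minimal sketch of that pattern, using a Mock in place of a real praw submission (the text and extension here are made up, not part of the commit):

    from unittest.mock import Mock

    from bdfr.resource import Resource

    res = Resource(Mock(), '', lambda: None, '.txt')  # no URL, no real download function needed
    res.content = 'locally produced text'.encode('utf-8')  # bytes set directly, as SelfPost does
    res.create_hash()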


@@ -18,6 +18,7 @@ class Configuration(Namespace):
         self.exclude_id_file = []
         self.file_scheme: str = '{REDDITOR}_{TITLE}_{POSTID}'
         self.folder_scheme: str = '{SUBREDDIT}'
+        self.include_id_file = []
         self.limit: Optional[int] = None
         self.link: list[str] = []
         self.log: Optional[str] = None


@@ -3,6 +3,7 @@
 import configparser
 import importlib.resources
+import itertools
 import logging
 import logging.handlers
 import re
@@ -78,7 +79,12 @@ class RedditConnector(metaclass=ABCMeta):
         self.create_reddit_instance()
         self.args.user = list(filter(None, [self.resolve_user_name(user) for user in self.args.user]))

-        self.excluded_submission_ids = self.read_excluded_ids()
+        self.excluded_submission_ids = set.union(
+            self.read_id_files(self.args.exclude_id_file),
+            set(self.args.exclude_id),
+        )
+        self.args.link = list(itertools.chain(self.args.link, self.read_id_files(self.args.include_id_file)))

         self.master_hash_list = {}
         self.authenticator = self.create_authenticator()
@@ -184,8 +190,9 @@ class RedditConnector(metaclass=ABCMeta):
                 logger.debug(f'Loading configuration from {path}')
                 break
         if not self.config_location:
-            self.config_location = list(importlib.resources.path('bdfr', 'default_config.cfg').gen)[0]
-            shutil.copy(self.config_location, Path(self.config_directory, 'default_config.cfg'))
+            with importlib.resources.path('bdfr', 'default_config.cfg') as path:
+                self.config_location = path
+                shutil.copy(self.config_location, Path(self.config_directory, 'default_config.cfg'))
         if not self.config_location:
             raise errors.BulkDownloaderException('Could not find a configuration file to load')
         self.cfg_parser.read(self.config_location)
@@ -403,13 +410,13 @@ class RedditConnector(metaclass=ABCMeta):
         except prawcore.Forbidden:
             raise errors.BulkDownloaderException(f'Source {subreddit.display_name} is private and cannot be scraped')

-    def read_excluded_ids(self) -> set[str]:
+    @staticmethod
+    def read_id_files(file_locations: list[str]) -> set[str]:
         out = []
-        out.extend(self.args.exclude_id)
-        for id_file in self.args.exclude_id_file:
+        for id_file in file_locations:
             id_file = Path(id_file).resolve().expanduser()
             if not id_file.exists():
-                logger.warning(f'ID exclusion file at {id_file} does not exist')
+                logger.warning(f'ID file at {id_file} does not exist')
                 continue
             with open(id_file, 'r') as file:
                 for line in file:
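The net effect of the connector changes above: IDs from --exclude-id and --exclude-id-file are merged into one exclusion set, while IDs from --include-id-file are appended to the list of direct links. A file passed to either file option is expected to hold bare submission IDs, one per line. A small illustration of the shared helper (the module path of RedditConnector is assumed here; the IDs are borrowed from the integration test further down):

    from pathlib import Path

    from bdfr.connector import RedditConnector  # import path assumed for this sketch

    Path('ids.txt').write_text('odr9wg\nody576')  # one submission ID per line
    ids = RedditConnector.read_id_files(['ids.txt'])
    # ids == {'odr9wg', 'ody576'}; missing files are skipped with a warning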


@@ -82,7 +82,7 @@ class RedditDownloader(RedditConnector):
                     logger.debug(f'Download filter removed {submission.id} file with URL {submission.url}')
                     continue
                 try:
-                    res.download(self.args.max_wait_time)
+                    res.download({'max_wait_time': self.args.max_wait_time})
                 except errors.BulkDownloaderException as e:
                     logger.error(f'Failed to download resource {res.url} in submission {submission.id} '
                                  f'with downloader {downloader_class.__name__}: {e}')


@@ -6,7 +6,7 @@ import logging
 import re
 import time
 import urllib.parse
-from typing import Optional
+from typing import Callable, Optional

 import _hashlib
 import requests
@@ -18,40 +18,52 @@ logger = logging.getLogger(__name__)

 class Resource:
-    def __init__(self, source_submission: Submission, url: str, extension: str = None):
+    def __init__(self, source_submission: Submission, url: str, download_function: Callable, extension: str = None):
         self.source_submission = source_submission
         self.content: Optional[bytes] = None
         self.url = url
         self.hash: Optional[_hashlib.HASH] = None
         self.extension = extension
+        self.download_function = download_function
         if not self.extension:
             self.extension = self._determine_extension()

     @staticmethod
-    def retry_download(url: str, max_wait_time: int, current_wait_time: int = 60) -> Optional[bytes]:
-        try:
-            response = requests.get(url)
-            if re.match(r'^2\d{2}', str(response.status_code)) and response.content:
-                return response.content
-            elif response.status_code in (408, 429):
-                raise requests.exceptions.ConnectionError(f'Response code {response.status_code}')
-            else:
-                raise BulkDownloaderException(
-                    f'Unrecoverable error requesting resource: HTTP Code {response.status_code}')
-        except (requests.exceptions.ConnectionError, requests.exceptions.ChunkedEncodingError) as e:
-            logger.warning(f'Error occured downloading from {url}, waiting {current_wait_time} seconds: {e}')
-            time.sleep(current_wait_time)
-            if current_wait_time < max_wait_time:
-                current_wait_time += 60
-                return Resource.retry_download(url, max_wait_time, current_wait_time)
-            else:
-                logger.error(f'Max wait time exceeded for resource at url {url}')
-                raise
+    def retry_download(url: str) -> Callable:
+        max_wait_time = 300
+
+        def http_download(download_parameters: dict) -> Optional[bytes]:
+            current_wait_time = 60
+            if 'max_wait_time' in download_parameters:
+                max_wait_time = download_parameters['max_wait_time']
+            else:
+                max_wait_time = 300
+            while True:
+                try:
+                    response = requests.get(url)
+                    if re.match(r'^2\d{2}', str(response.status_code)) and response.content:
+                        return response.content
+                    elif response.status_code in (408, 429):
+                        raise requests.exceptions.ConnectionError(f'Response code {response.status_code}')
+                    else:
+                        raise BulkDownloaderException(
+                            f'Unrecoverable error requesting resource: HTTP Code {response.status_code}')
+                except (requests.exceptions.ConnectionError, requests.exceptions.ChunkedEncodingError) as e:
+                    logger.warning(f'Error occured downloading from {url}, waiting {current_wait_time} seconds: {e}')
+                    time.sleep(current_wait_time)
+                    if current_wait_time < max_wait_time:
+                        current_wait_time += 60
+                    else:
+                        logger.error(f'Max wait time exceeded for resource at url {url}')
+                        raise
+        return http_download

-    def download(self, max_wait_time: int):
+    def download(self, download_parameters: Optional[dict] = None):
+        if download_parameters is None:
+            download_parameters = {}
         if not self.content:
             try:
-                content = self.retry_download(self.url, max_wait_time)
+                content = self.download_function(download_parameters)
             except requests.exceptions.ConnectionError as e:
                 raise BulkDownloaderException(f'Could not download resource: {e}')
             except BulkDownloaderException:
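After this refactor, retry_download no longer fetches anything itself: it returns a closure that Resource.download() calls later with an optional parameter dict. A minimal usage sketch, assuming the URL is reachable (it is reused from the resource tests further down) and using a Mock in place of a praw submission:

    from unittest.mock import Mock

    from bdfr.resource import Resource

    url = 'https://www.iana.org/_img/2013.1/iana-logo-header.svg'  # sample URL from the tests below
    res = Resource(Mock(), url, Resource.retry_download(url))
    res.download({'max_wait_time': 300})  # the dict is optional; res.download() alone uses the defaults
    data = res.content                    # downloaded bytes; res.hash is populated as well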


@@ -14,4 +14,4 @@ class Direct(BaseDownloader):
         super().__init__(post)

     def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]:
-        return [Resource(self.post, self.post.url)]
+        return [Resource(self.post, self.post.url, Resource.retry_download(self.post.url))]


@@ -16,6 +16,7 @@ from bdfr.site_downloaders.imgur import Imgur
 from bdfr.site_downloaders.pornhub import PornHub
 from bdfr.site_downloaders.redgifs import Redgifs
 from bdfr.site_downloaders.self_post import SelfPost
+from bdfr.site_downloaders.vidble import Vidble
 from bdfr.site_downloaders.youtube import Youtube
@@ -46,11 +47,12 @@ class DownloadFactory:
             return Direct
         elif re.match(r'pornhub\.com.*', sanitised_url):
             return PornHub
+        elif re.match(r'vidble\.com', sanitised_url):
+            return Vidble
         elif YoutubeDlFallback.can_handle_link(sanitised_url):
             return YoutubeDlFallback
         else:
-            raise NotADownloadableLinkError(
-                f'No downloader module exists for url {url}')
+            raise NotADownloadableLinkError(f'No downloader module exists for url {url}')

     @staticmethod
     def sanitise_url(url: str) -> str:
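The factory matches each pattern against a sanitised form of the URL, so the new branch only needs the bare domain and is checked before the generic youtube-dl fallback. A rough illustration of that dispatch for a Vidble link (the sanitised string below is an assumption about what sanitise_url produces; the path comes from the Vidble tests later in this diff):

    import re

    from bdfr.site_downloaders.vidble import Vidble

    sanitised_url = 'vidble.com/show/UxsvAssYe5'  # assumed output of DownloadFactory.sanitise_url
    if re.match(r'vidble\.com', sanitised_url):
        downloader_class = Vidble  # matched before YoutubeDlFallback is consulted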


@@ -29,7 +29,7 @@ class Erome(BaseDownloader):
         for link in links:
             if not re.match(r'https?://.*', link):
                 link = 'https://' + link
-            out.append(Resource(self.post, link))
+            out.append(Resource(self.post, link, Resource.retry_download(link)))
         return out

     @staticmethod


@@ -4,7 +4,6 @@
 import logging
 from typing import Optional

-import youtube_dl
 from praw.models import Submission

 from bdfr.resource import Resource
@@ -20,21 +19,18 @@ class YoutubeDlFallback(BaseFallbackDownloader, Youtube):
         super(YoutubeDlFallback, self).__init__(post)

     def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]:
-        out = super()._download_video({})
+        out = Resource(
+            self.post,
+            self.post.url,
+            super()._download_video({}),
+            super().get_video_attributes(self.post.url)['ext'],
+        )
         return [out]

     @staticmethod
     def can_handle_link(url: str) -> bool:
-        yt_logger = logging.getLogger('youtube-dl')
-        yt_logger.setLevel(logging.CRITICAL)
-        with youtube_dl.YoutubeDL({
-            'logger': yt_logger,
-        }) as ydl:
-            try:
-                result = ydl.extract_info(url, download=False)
-                if result:
-                    return True
-            except Exception as e:
-                logger.exception(e)
-                return False
-        return False
+        attributes = YoutubeDlFallback.get_video_attributes(url)
+        if attributes:
+            return True
+        else:
+            return False


@@ -21,7 +21,7 @@ class Gallery(BaseDownloader):
     def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]:
         try:
             image_urls = self._get_links(self.post.gallery_data['items'])
-        except AttributeError:
+        except (AttributeError, TypeError):
             try:
                 image_urls = self._get_links(self.post.crosspost_parent_list[0]['gallery_data']['items'])
             except (AttributeError, IndexError, TypeError):
@@ -31,7 +31,7 @@ class Gallery(BaseDownloader):
         if not image_urls:
             raise SiteDownloaderError('No images found in Reddit gallery')

-        return [Resource(self.post, url) for url in image_urls]
+        return [Resource(self.post, url, Resource.retry_download(url)) for url in image_urls]

     @staticmethod
     def _get_links(id_dict: list[dict]) -> list[str]:


@@ -33,7 +33,7 @@ class Imgur(BaseDownloader):

     def _compute_image_url(self, image: dict) -> Resource:
         image_url = 'https://i.imgur.com/' + image['hash'] + self._validate_extension(image['ext'])
-        return Resource(self.post, image_url)
+        return Resource(self.post, image_url, Resource.retry_download(image_url))

     @staticmethod
     def _get_data(link: str) -> dict:


@@ -22,5 +22,10 @@ class PornHub(Youtube):
             'format': 'best',
             'nooverwrites': True,
         }
-        out = self._download_video(ytdl_options)
+        out = Resource(
+            self.post,
+            self.post.url,
+            super()._download_video(ytdl_options),
+            super().get_video_attributes(self.post.url)['ext'],
+        )
         return [out]


@@ -18,7 +18,7 @@ class Redgifs(BaseDownloader):

     def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]:
         media_url = self._get_link(self.post.url)
-        return [Resource(self.post, media_url, '.mp4')]
+        return [Resource(self.post, media_url, Resource.retry_download(media_url), '.mp4')]

     @staticmethod
     def _get_link(url: str) -> str:


@@ -17,7 +17,7 @@ class SelfPost(BaseDownloader):
         super().__init__(post)

     def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]:
-        out = Resource(self.post, self.post.url, '.txt')
+        out = Resource(self.post, self.post.url, lambda: None, '.txt')
         out.content = self.export_to_string().encode('utf-8')
         out.create_hash()
         return [out]


@@ -0,0 +1,48 @@
+#!/usr/bin/env python3
+# coding=utf-8
+
+import itertools
+import logging
+import re
+from typing import Optional
+
+import bs4
+import requests
+from praw.models import Submission
+
+from bdfr.exceptions import SiteDownloaderError
+from bdfr.resource import Resource
+from bdfr.site_authenticator import SiteAuthenticator
+from bdfr.site_downloaders.base_downloader import BaseDownloader
+
+logger = logging.getLogger(__name__)
+
+
+class Vidble(BaseDownloader):
+    def __init__(self, post: Submission):
+        super().__init__(post)
+
+    def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]:
+        res = self.get_links(self.post.url)
+        if not res:
+            raise SiteDownloaderError(rf'No resources found at {self.post.url}')
+        res = [Resource(self.post, r, Resource.retry_download(r)) for r in res]
+        return res
+
+    @staticmethod
+    def get_links(url: str) -> set[str]:
+        page = requests.get(url)
+        soup = bs4.BeautifulSoup(page.text, 'html.parser')
+        content_div = soup.find('div', attrs={'id': 'ContentPlaceHolder1_divContent'})
+        images = content_div.find_all('img')
+        images = [i.get('src') for i in images]
+        videos = content_div.find_all('source', attrs={'type': 'video/mp4'})
+        videos = [v.get('src') for v in videos]
+        resources = filter(None, itertools.chain(images, videos))
+        resources = ['https://www.vidble.com' + r for r in resources]
+        resources = [Vidble.change_med_url(r) for r in resources]
+        return set(resources)
+
+    @staticmethod
+    def change_med_url(url: str) -> str:
+        out = re.sub(r'_med(\..{3,4})$', r'\1', url)
+        return out
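Vidble serves gallery thumbnails with a "_med" suffix in the file name; change_med_url strips that suffix so the full-size file is downloaded instead. A quick illustration, mirroring the test case added later in this diff:

    from bdfr.site_downloaders.vidble import Vidble

    full_size = Vidble.change_med_url('https://www.vidble.com/RDFbznUvcN_med.jpg')
    # full_size == 'https://www.vidble.com/RDFbznUvcN.jpg'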


@@ -3,12 +3,12 @@
 import logging
 import tempfile
 from pathlib import Path
-from typing import Optional
+from typing import Callable, Optional

 import youtube_dl
 from praw.models import Submission

-from bdfr.exceptions import (NotADownloadableLinkError, SiteDownloaderError)
+from bdfr.exceptions import NotADownloadableLinkError, SiteDownloaderError
 from bdfr.resource import Resource
 from bdfr.site_authenticator import SiteAuthenticator
 from bdfr.site_downloaders.base_downloader import BaseDownloader
@@ -26,32 +26,47 @@ class Youtube(BaseDownloader):
             'playlistend': 1,
             'nooverwrites': True,
         }
-        out = self._download_video(ytdl_options)
-        return [out]
+        download_function = self._download_video(ytdl_options)
+        try:
+            extension = self.get_video_attributes(self.post.url)['ext']
+        except KeyError:
+            raise NotADownloadableLinkError(f'Youtube-DL cannot download URL {self.post.url}')
+        res = Resource(self.post, self.post.url, download_function, extension)
+        return [res]

-    def _download_video(self, ytdl_options: dict) -> Resource:
+    def _download_video(self, ytdl_options: dict) -> Callable:
         yt_logger = logging.getLogger('youtube-dl')
         yt_logger.setLevel(logging.CRITICAL)
         ytdl_options['quiet'] = True
         ytdl_options['logger'] = yt_logger
-        with tempfile.TemporaryDirectory() as temp_dir:
-            download_path = Path(temp_dir).resolve()
-            ytdl_options['outtmpl'] = str(download_path) + '/' + 'test.%(ext)s'
-            try:
-                with youtube_dl.YoutubeDL(ytdl_options) as ydl:
-                    ydl.download([self.post.url])
-            except youtube_dl.DownloadError as e:
-                raise SiteDownloaderError(f'Youtube download failed: {e}')

-            downloaded_files = list(download_path.iterdir())
-            if len(downloaded_files) > 0:
-                downloaded_file = downloaded_files[0]
-            else:
-                raise NotADownloadableLinkError(f"No media exists in the URL {self.post.url}")
-            extension = downloaded_file.suffix
-            with open(downloaded_file, 'rb') as file:
-                content = file.read()
-            out = Resource(self.post, self.post.url, extension)
-            out.content = content
-            out.create_hash()
-            return out
+        def download(_: dict) -> bytes:
+            with tempfile.TemporaryDirectory() as temp_dir:
+                download_path = Path(temp_dir).resolve()
+                ytdl_options['outtmpl'] = str(download_path) + '/' + 'test.%(ext)s'
+                try:
+                    with youtube_dl.YoutubeDL(ytdl_options) as ydl:
+                        ydl.download([self.post.url])
+                except youtube_dl.DownloadError as e:
+                    raise SiteDownloaderError(f'Youtube download failed: {e}')
+
+                downloaded_files = list(download_path.iterdir())
+                if len(downloaded_files) > 0:
+                    downloaded_file = downloaded_files[0]
+                else:
+                    raise NotADownloadableLinkError(f"No media exists in the URL {self.post.url}")
+                with open(downloaded_file, 'rb') as file:
+                    content = file.read()
+            return content
+        return download
+
+    @staticmethod
+    def get_video_attributes(url: str) -> dict:
+        yt_logger = logging.getLogger('youtube-dl')
+        yt_logger.setLevel(logging.CRITICAL)
+        with youtube_dl.YoutubeDL({'logger': yt_logger, }) as ydl:
+            try:
+                result = ydl.extract_info(url, download=False)
+                return result
+            except Exception as e:
+                logger.exception(e)
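With this change nothing is fetched at find_resources() time: _download_video only builds a closure, and the actual youtube-dl invocation happens when Resource.download() later calls it. A sketch of that deferred flow, using a Mock submission and a placeholder URL and extension (none of these values come from the commit):

    from unittest.mock import Mock

    from bdfr.resource import Resource
    from bdfr.site_downloaders.youtube import Youtube

    post = Mock()
    post.url = 'https://www.youtube.com/watch?v=dQw4w9WgXcQ'  # placeholder URL
    downloader = Youtube(post)
    download_function = downloader._download_video({'format': 'best'})  # builds the closure only
    res = Resource(post, post.url, download_function, '.mp4')  # extension assumed for the sketch
    res.download()  # youtube-dl runs here; the file bytes end up in res.content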


@@ -4,7 +4,7 @@ description_file = README.md
 description_content_type = text/markdown
 home_page = https://github.com/aliparlakci/bulk-downloader-for-reddit
 keywords = reddit, download, archive
-version = 2.3.0
+version = 2.4.0
 author = Ali Parlakci
 author_email = parlakciali@gmail.com
 maintainer = Serene Arc


@@ -55,6 +55,23 @@ def test_cli_download_subreddits(test_args: list[str], tmp_path: Path):
     result = runner.invoke(cli, test_args)
     assert result.exit_code == 0
     assert 'Added submissions from subreddit ' in result.output
+    assert 'Downloaded submission' in result.output
+
+
+@pytest.mark.online
+@pytest.mark.reddit
+@pytest.mark.authenticated
+@pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests')
+@pytest.mark.parametrize('test_args', (
+    ['-s', 'hentai', '-L', 10, '--search', 'red', '--authenticate'],
+))
+def test_cli_download_search_subreddits_authenticated(test_args: list[str], tmp_path: Path):
+    runner = CliRunner()
+    test_args = create_basic_args_for_download_runner(test_args, tmp_path)
+    result = runner.invoke(cli, test_args)
+    assert result.exit_code == 0
+    assert 'Added submissions from subreddit ' in result.output
+    assert 'Downloaded submission' in result.output

 @pytest.mark.online
@@ -306,3 +323,17 @@ def test_cli_download_disable_modules(test_args: list[str], tmp_path: Path):
     assert result.exit_code == 0
     assert 'skipped due to disabled module' in result.output
     assert 'Downloaded submission' not in result.output
+
+
+@pytest.mark.online
+@pytest.mark.reddit
+@pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests')
+def test_cli_download_include_id_file(tmp_path: Path):
+    test_file = Path(tmp_path, 'include.txt')
+    test_args = ['--include-id-file', str(test_file)]
+    test_file.write_text('odr9wg\nody576')
+    runner = CliRunner()
+    test_args = create_basic_args_for_download_runner(test_args, tmp_path)
+    result = runner.invoke(cli, test_args)
+    assert result.exit_code == 0
+    assert 'Downloaded submission' in result.output


@@ -21,5 +21,5 @@ def test_download_resource(test_url: str, expected_hash: str):
     resources = test_site.find_resources()
     assert len(resources) == 1
     assert isinstance(resources[0], Resource)
-    resources[0].download(120)
+    resources[0].download()
     assert resources[0].hash.hexdigest() == expected_hash


@@ -14,13 +14,13 @@ from bdfr.site_downloaders.erome import Erome
         'https://s11.erome.com/365/vqtPuLXh/KH2qBT99_480p.mp4',
     )),
     ('https://www.erome.com/a/ORhX0FZz', (
-        'https://s4.erome.com/355/ORhX0FZz/9IYQocM9_480p.mp4',
-        'https://s4.erome.com/355/ORhX0FZz/9eEDc8xm_480p.mp4',
-        'https://s4.erome.com/355/ORhX0FZz/EvApC7Rp_480p.mp4',
-        'https://s4.erome.com/355/ORhX0FZz/LruobtMs_480p.mp4',
-        'https://s4.erome.com/355/ORhX0FZz/TJNmSUU5_480p.mp4',
-        'https://s4.erome.com/355/ORhX0FZz/X11Skh6Z_480p.mp4',
-        'https://s4.erome.com/355/ORhX0FZz/bjlTkpn7_480p.mp4'
+        'https://s15.erome.com/355/ORhX0FZz/9IYQocM9_480p.mp4',
+        'https://s15.erome.com/355/ORhX0FZz/9eEDc8xm_480p.mp4',
+        'https://s15.erome.com/355/ORhX0FZz/EvApC7Rp_480p.mp4',
+        'https://s15.erome.com/355/ORhX0FZz/LruobtMs_480p.mp4',
+        'https://s15.erome.com/355/ORhX0FZz/TJNmSUU5_480p.mp4',
+        'https://s15.erome.com/355/ORhX0FZz/X11Skh6Z_480p.mp4',
+        'https://s15.erome.com/355/ORhX0FZz/bjlTkpn7_480p.mp4'
     )),
 ))
 def test_get_link(test_url: str, expected_urls: tuple[str]):
@@ -49,6 +49,6 @@ def test_download_resource(test_url: str, expected_hashes: tuple[str]):
     mock_submission.url = test_url
     test_site = Erome(mock_submission)
     resources = test_site.find_resources()
-    [res.download(120) for res in resources]
+    [res.download() for res in resources]
     resource_hashes = [res.hash.hexdigest() for res in resources]
     assert len(resource_hashes) == len(expected_hashes)


@@ -4,6 +4,7 @@
 import praw
 import pytest

+from bdfr.exceptions import SiteDownloaderError
 from bdfr.site_downloaders.gallery import Gallery
@@ -52,10 +53,6 @@ def test_gallery_get_links(test_ids: list[dict], expected: set[str]):
         '808c35267f44acb523ce03bfa5687404',
         'ec8b65bdb7f1279c4b3af0ea2bbb30c3',
     }),
-    ('nxyahw', {
-        'b89a3f41feb73ec1136ec4ffa7353eb1',
-        'cabb76fd6fd11ae6e115a2039eb09f04',
-    }),
     ('obkflw', {
         '65163f685fb28c5b776e0e77122718be',
         '2a337eb5b13c34d3ca3f51b5db7c13e9',
@@ -65,6 +62,17 @@ def test_gallery_download(test_submission_id: str, expected_hashes: set[str], reddit_instance: praw.Reddit):
     test_submission = reddit_instance.submission(id=test_submission_id)
     gallery = Gallery(test_submission)
     results = gallery.find_resources()
-    [res.download(120) for res in results]
+    [res.download() for res in results]
     hashes = [res.hash.hexdigest() for res in results]
     assert set(hashes) == expected_hashes
+
+
+@pytest.mark.parametrize('test_id', (
+    'n0pyzp',
+    'nxyahw',
+))
+def test_gallery_download_raises_right_error(test_id: str, reddit_instance: praw.Reddit):
+    test_submission = reddit_instance.submission(id=test_id)
+    gallery = Gallery(test_submission)
+    with pytest.raises(SiteDownloaderError):
+        gallery.find_resources()


@@ -13,8 +13,6 @@ from bdfr.site_downloaders.gfycat import Gfycat
 @pytest.mark.parametrize(('test_url', 'expected_url'), (
     ('https://gfycat.com/definitivecaninecrayfish', 'https://giant.gfycat.com/DefinitiveCanineCrayfish.mp4'),
     ('https://gfycat.com/dazzlingsilkyiguana', 'https://giant.gfycat.com/DazzlingSilkyIguana.mp4'),
-    ('https://gfycat.com/webbedimpurebutterfly', 'https://thumbs2.redgifs.com/WebbedImpureButterfly.mp4'),
-    ('https://gfycat.com/CornyLoathsomeHarrierhawk', 'https://thumbs2.redgifs.com/CornyLoathsomeHarrierhawk.mp4')
 ))
 def test_get_link(test_url: str, expected_url: str):
     result = Gfycat._get_link(test_url)
@@ -33,5 +31,5 @@ def test_download_resource(test_url: str, expected_hash: str):
     resources = test_site.find_resources()
     assert len(resources) == 1
     assert isinstance(resources[0], Resource)
-    resources[0].download(120)
+    resources[0].download()
     assert resources[0].hash.hexdigest() == expected_hash


@@ -149,6 +149,6 @@ def test_find_resources(test_url: str, expected_hashes: list[str]):
     downloader = Imgur(mock_download)
     results = downloader.find_resources()
     assert all([isinstance(res, Resource) for res in results])
-    [res.download(120) for res in results]
+    [res.download() for res in results]
     hashes = set([res.hash.hexdigest() for res in results])
     assert hashes == set(expected_hashes)


@@ -21,5 +21,5 @@ def test_find_resources_good(test_url: str, expected_hash: str):
     resources = downloader.find_resources()
     assert len(resources) == 1
     assert isinstance(resources[0], Resource)
-    resources[0].download(120)
+    resources[0].download()
     assert resources[0].hash.hexdigest() == expected_hash


@@ -15,10 +15,8 @@ from bdfr.site_downloaders.redgifs import Redgifs
      'https://thumbs2.redgifs.com/FrighteningVictoriousSalamander.mp4'),
     ('https://redgifs.com/watch/springgreendecisivetaruca',
      'https://thumbs2.redgifs.com/SpringgreenDecisiveTaruca.mp4'),
-    ('https://www.gifdeliverynetwork.com/regalshoddyhorsechestnutleafminer',
-     'https://thumbs2.redgifs.com/RegalShoddyHorsechestnutleafminer.mp4'),
-    ('https://www.gifdeliverynetwork.com/maturenexthippopotamus',
-     'https://thumbs2.redgifs.com/MatureNextHippopotamus.mp4'),
+    ('https://www.redgifs.com/watch/palegoldenrodrawhalibut',
+     'https://thumbs2.redgifs.com/PalegoldenrodRawHalibut.mp4'),
 ))
 def test_get_link(test_url: str, expected: str):
     result = Redgifs._get_link(test_url)
@@ -29,9 +27,8 @@ def test_get_link(test_url: str, expected: str):
 @pytest.mark.parametrize(('test_url', 'expected_hash'), (
     ('https://redgifs.com/watch/frighteningvictorioussalamander', '4007c35d9e1f4b67091b5f12cffda00a'),
     ('https://redgifs.com/watch/springgreendecisivetaruca', '8dac487ac49a1f18cc1b4dabe23f0869'),
-    ('https://www.gifdeliverynetwork.com/maturenexthippopotamus', '9bec0a9e4163a43781368ed5d70471df'),
-    ('https://www.gifdeliverynetwork.com/regalshoddyhorsechestnutleafminer', '8afb4e2c090a87140230f2352bf8beba'),
     ('https://redgifs.com/watch/leafysaltydungbeetle', '076792c660b9c024c0471ef4759af8bd'),
+    ('https://www.redgifs.com/watch/palegoldenrodrawhalibut', '46d5aa77fe80c6407de1ecc92801c10e'),
 ))
 def test_download_resource(test_url: str, expected_hash: str):
     mock_submission = Mock()
@@ -40,5 +37,5 @@ def test_download_resource(test_url: str, expected_hash: str):
     resources = test_site.find_resources()
     assert len(resources) == 1
     assert isinstance(resources[0], Resource)
-    resources[0].download(120)
+    resources[0].download()
     assert resources[0].hash.hexdigest() == expected_hash


@@ -0,0 +1,67 @@
+#!/usr/bin/env python3
+# coding=utf-8
+
+from unittest.mock import Mock
+
+import pytest
+
+from bdfr.resource import Resource
+from bdfr.site_downloaders.vidble import Vidble
+
+
+@pytest.mark.parametrize(('test_url', 'expected'), (
+    ('/RDFbznUvcN_med.jpg', '/RDFbznUvcN.jpg'),
+))
+def test_change_med_url(test_url: str, expected: str):
+    result = Vidble.change_med_url(test_url)
+    assert result == expected
+
+
+@pytest.mark.online
+@pytest.mark.parametrize(('test_url', 'expected'), (
+    ('https://www.vidble.com/show/UxsvAssYe5', {
+        'https://www.vidble.com/UxsvAssYe5.gif',
+    }),
+    ('https://vidble.com/show/RDFbznUvcN', {
+        'https://www.vidble.com/RDFbznUvcN.jpg',
+    }),
+    ('https://vidble.com/album/h0jTLs6B', {
+        'https://www.vidble.com/XG4eAoJ5JZ.jpg',
+        'https://www.vidble.com/IqF5UdH6Uq.jpg',
+        'https://www.vidble.com/VWuNsnLJMD.jpg',
+        'https://www.vidble.com/sMmM8O650W.jpg',
+    }),
+    ('https://vidble.com/watch?v=0q4nWakqM6kzQWxlePD8N62Dsflev0N9', {
+        'https://www.vidble.com/0q4nWakqM6kzQWxlePD8N62Dsflev0N9.mp4',
+    }),
+))
+def test_get_links(test_url: str, expected: set[str]):
+    results = Vidble.get_links(test_url)
+    assert results == expected
+
+
+@pytest.mark.parametrize(('test_url', 'expected_hashes'), (
+    ('https://www.vidble.com/show/UxsvAssYe5', {
+        '0ef2f8e0e0b45936d2fb3e6fbdf67e28',
+    }),
+    ('https://vidble.com/show/RDFbznUvcN', {
+        'c2dd30a71e32369c50eed86f86efff58',
+    }),
+    ('https://vidble.com/album/h0jTLs6B', {
+        '3b3cba02e01c91f9858a95240b942c71',
+        'dd6ecf5fc9e936f9fb614eb6a0537f99',
+        'b31a942cd8cdda218ed547bbc04c3a27',
+        '6f77c570b451eef4222804bd52267481',
+    }),
+    ('https://vidble.com/watch?v=0q4nWakqM6kzQWxlePD8N62Dsflev0N9', {
+        'cebe9d5f24dba3b0443e5097f160ca83',
+    }),
+))
+def test_find_resources(test_url: str, expected_hashes: set[str]):
+    mock_download = Mock()
+    mock_download.url = test_url
+    downloader = Vidble(mock_download)
+    results = downloader.find_resources()
+    assert all([isinstance(res, Resource) for res in results])
+    [res.download() for res in results]
+    hashes = set([res.hash.hexdigest() for res in results])
+    assert hashes == set(expected_hashes)


@@ -23,7 +23,7 @@ def test_find_resources_good(test_url: str, expected_hash: str):
     resources = downloader.find_resources()
     assert len(resources) == 1
     assert isinstance(resources[0], Resource)
-    resources[0].download(120)
+    resources[0].download()
     assert resources[0].hash.hexdigest() == expected_hash


@@ -199,10 +199,9 @@ def test_get_subreddit_normal(
 @pytest.mark.reddit
 @pytest.mark.parametrize(('test_subreddits', 'search_term', 'limit', 'time_filter', 'max_expected_len'), (
     (('Python',), 'scraper', 10, 'all', 10),
-    (('Python',), '', 10, 'all', 10),
+    (('Python',), '', 10, 'all', 0),
     (('Python',), 'djsdsgewef', 10, 'all', 0),
     (('Python',), 'scraper', 10, 'year', 10),
-    (('Python',), 'scraper', 10, 'hour', 1),
 ))
 def test_get_subreddit_search(
         test_subreddits: list[str],
@@ -226,6 +225,8 @@ def test_get_subreddit_search(
     assert all([isinstance(res, praw.models.Submission) for res in results])
     assert all([res.subreddit.display_name in test_subreddits for res in results])
     assert len(results) <= max_expected_len
+    if max_expected_len != 0:
+        assert len(results) > 0
     assert not any([isinstance(m, MagicMock) for m in results])
@@ -339,11 +340,10 @@ def test_split_subreddit_entries(test_subreddit_entries: list[str], expected: set[str]):
     assert results == expected


-def test_read_excluded_submission_ids_from_file(downloader_mock: MagicMock, tmp_path: Path):
+def test_read_submission_ids_from_file(downloader_mock: MagicMock, tmp_path: Path):
     test_file = tmp_path / 'test.txt'
     test_file.write_text('aaaaaa\nbbbbbb')
-    downloader_mock.args.exclude_id_file = [test_file]
-    results = RedditConnector.read_excluded_ids(downloader_mock)
+    results = RedditConnector.read_id_files([str(test_file)])
     assert results == {'aaaaaa', 'bbbbbb'}


@@ -46,7 +46,7 @@ def test_filter_domain(test_url: str, expected: bool, download_filter: DownloadFilter):
     ('http://reddit.com/test.gif', False),
 ))
 def test_filter_all(test_url: str, expected: bool, download_filter: DownloadFilter):
-    test_resource = Resource(MagicMock(), test_url)
+    test_resource = Resource(MagicMock(), test_url, lambda: None)
     result = download_filter.check_resource(test_resource)
     assert result == expected
@@ -59,6 +59,6 @@ def test_filter_all(test_url: str, expected: bool, download_filter: DownloadFilter):
 ))
 def test_filter_empty_filter(test_url: str):
     download_filter = DownloadFilter()
-    test_resource = Resource(MagicMock(), test_url)
+    test_resource = Resource(MagicMock(), test_url, lambda: None)
     result = download_filter.check_resource(test_resource)
     assert result is True


@@ -119,7 +119,7 @@ def test_format_full(
         format_string_file: str,
         expected: str,
         reddit_submission: praw.models.Submission):
-    test_resource = Resource(reddit_submission, 'i.reddit.com/blabla.png')
+    test_resource = Resource(reddit_submission, 'i.reddit.com/blabla.png', lambda: None)
     test_formatter = FileNameFormatter(format_string_file, format_string_directory, 'ISO')
     result = test_formatter.format_path(test_resource, Path('test'))
     assert do_test_path_equality(result, expected)
@@ -136,7 +136,7 @@ def test_format_full_conform(
         format_string_directory: str,
         format_string_file: str,
         reddit_submission: praw.models.Submission):
-    test_resource = Resource(reddit_submission, 'i.reddit.com/blabla.png')
+    test_resource = Resource(reddit_submission, 'i.reddit.com/blabla.png', lambda: None)
     test_formatter = FileNameFormatter(format_string_file, format_string_directory, 'ISO')
     test_formatter.format_path(test_resource, Path('test'))
@@ -156,7 +156,7 @@ def test_format_full_with_index_suffix(
         expected: str,
         reddit_submission: praw.models.Submission,
 ):
-    test_resource = Resource(reddit_submission, 'i.reddit.com/blabla.png')
+    test_resource = Resource(reddit_submission, 'i.reddit.com/blabla.png', lambda: None)
     test_formatter = FileNameFormatter(format_string_file, format_string_directory, 'ISO')
     result = test_formatter.format_path(test_resource, Path('test'), index)
     assert do_test_path_equality(result, expected)
@@ -216,7 +216,7 @@ def test_shorten_filenames(submission: MagicMock, tmp_path: Path):
     submission.author.name = 'test'
     submission.subreddit.display_name = 'test'
     submission.id = 'BBBBBB'
-    test_resource = Resource(submission, 'www.example.com/empty', '.jpeg')
+    test_resource = Resource(submission, 'www.example.com/empty', lambda: None, '.jpeg')
     test_formatter = FileNameFormatter('{REDDITOR}_{TITLE}_{POSTID}', '{SUBREDDIT}', 'ISO')
     result = test_formatter.format_path(test_resource, tmp_path)
     result.parent.mkdir(parents=True)
@@ -296,7 +296,7 @@ def test_format_archive_entry_comment(
 ):
     test_comment = reddit_instance.comment(id=test_comment_id)
     test_formatter = FileNameFormatter(test_file_scheme, test_folder_scheme, 'ISO')
-    test_entry = Resource(test_comment, '', '.json')
+    test_entry = Resource(test_comment, '', lambda: None, '.json')
     result = test_formatter.format_path(test_entry, tmp_path)
     assert do_test_string_equality(result, expected_name)


@@ -21,7 +21,7 @@ from bdfr.resource import Resource
     ('https://www.test.com/test/test2/example.png?random=test#thing', '.png'),
 ))
 def test_resource_get_extension(test_url: str, expected: str):
-    test_resource = Resource(MagicMock(), test_url)
+    test_resource = Resource(MagicMock(), test_url, lambda: None)
     result = test_resource._determine_extension()
     assert result == expected
@@ -31,6 +31,6 @@ def test_resource_get_extension(test_url: str, expected: str):
     ('https://www.iana.org/_img/2013.1/iana-logo-header.svg', '426b3ac01d3584c820f3b7f5985d6623'),
 ))
 def test_download_online_resource(test_url: str, expected_hash: str):
-    test_resource = Resource(MagicMock(), test_url)
-    test_resource.download(120)
+    test_resource = Resource(MagicMock(), test_url, Resource.retry_download(test_url))
+    test_resource.download()
    assert test_resource.hash.hexdigest() == expected_hash