commit
afe3b71f59
|
@ -76,6 +76,10 @@ The following options are common between both the `archive` and `download` comma
|
|||
- Can be specified multiple times
|
||||
- Disables certain modules from being used
|
||||
- See [Disabling Modules](#disabling-modules) for more information and a list of module names
|
||||
- `--include-id-file`
|
||||
- This will add every submission whose ID is listed in the provided files to the submissions being processed
|
||||
- Can be specified multiple times
|
||||
- Format is one ID per line
|
||||
- `--log`
|
||||
- This allows one to specify the location of the logfile
|
||||
- This must be done when running multiple instances of the BDFR, see [Multiple Instances](#multiple-instances) below
|
||||
|
|
|
@ -6,9 +6,9 @@ import sys
|
|||
import click
|
||||
|
||||
from bdfr.archiver import Archiver
|
||||
from bdfr.cloner import RedditCloner
|
||||
from bdfr.configuration import Configuration
|
||||
from bdfr.downloader import RedditDownloader
|
||||
from bdfr.cloner import RedditCloner
|
||||
|
||||
logger = logging.getLogger()
|
||||
|
||||
|
@ -17,6 +17,7 @@ _common_options = [
|
|||
click.option('--authenticate', is_flag=True, default=None),
|
||||
click.option('--config', type=str, default=None),
|
||||
click.option('--disable-module', multiple=True, default=None, type=str),
|
||||
click.option('--include-id-file', multiple=True, default=None),
|
||||
click.option('--log', type=str, default=None),
|
||||
click.option('--saved', is_flag=True, default=None),
|
||||
click.option('--search', default=None, type=str),
|
||||
|
@ -26,12 +27,12 @@ _common_options = [
|
|||
click.option('-L', '--limit', default=None, type=int),
|
||||
click.option('-l', '--link', multiple=True, default=None, type=str),
|
||||
click.option('-m', '--multireddit', multiple=True, default=None, type=str),
|
||||
click.option('-S', '--sort', type=click.Choice(('hot', 'top', 'new', 'controversial', 'rising', 'relevance')),
|
||||
default=None),
|
||||
click.option('-s', '--subreddit', multiple=True, default=None, type=str),
|
||||
click.option('-v', '--verbose', default=None, count=True),
|
||||
click.option('-u', '--user', type=str, multiple=True, default=None),
|
||||
click.option('-t', '--time', type=click.Choice(('all', 'hour', 'day', 'week', 'month', 'year')), default=None),
|
||||
click.option('-S', '--sort', type=click.Choice(('hot', 'top', 'new',
|
||||
'controversial', 'rising', 'relevance')), default=None),
|
||||
click.option('-u', '--user', type=str, multiple=True, default=None),
|
||||
click.option('-v', '--verbose', default=None, count=True),
|
||||
]
|
||||
|
||||
_downloader_options = [
|
||||
|
|
|
@ -76,17 +76,17 @@ class Archiver(RedditConnector):
|
|||
logger.info(f'Record for entry item {praw_item.id} written to disk')
|
||||
|
||||
def _write_entry_json(self, entry: BaseArchiveEntry):
|
||||
resource = Resource(entry.source, '', '.json')
|
||||
resource = Resource(entry.source, '', lambda: None, '.json')
|
||||
content = json.dumps(entry.compile())
|
||||
self._write_content_to_disk(resource, content)
|
||||
|
||||
def _write_entry_xml(self, entry: BaseArchiveEntry):
|
||||
resource = Resource(entry.source, '', '.xml')
|
||||
resource = Resource(entry.source, '', lambda: None, '.xml')
|
||||
content = dict2xml.dict2xml(entry.compile(), wrap='root')
|
||||
self._write_content_to_disk(resource, content)
|
||||
|
||||
def _write_entry_yaml(self, entry: BaseArchiveEntry):
|
||||
resource = Resource(entry.source, '', '.yaml')
|
||||
resource = Resource(entry.source, '', lambda: None, '.yaml')
|
||||
content = yaml.dump(entry.compile())
|
||||
self._write_content_to_disk(resource, content)
|
||||
|
||||
|
|
|
@ -18,6 +18,7 @@ class Configuration(Namespace):
|
|||
self.exclude_id_file = []
|
||||
self.file_scheme: str = '{REDDITOR}_{TITLE}_{POSTID}'
|
||||
self.folder_scheme: str = '{SUBREDDIT}'
|
||||
self.include_id_file = []
|
||||
self.limit: Optional[int] = None
|
||||
self.link: list[str] = []
|
||||
self.log: Optional[str] = None
|
||||
|
|
|
@ -3,6 +3,7 @@
|
|||
|
||||
import configparser
|
||||
import importlib.resources
|
||||
import itertools
|
||||
import logging
|
||||
import logging.handlers
|
||||
import re
|
||||
|
@ -78,7 +79,12 @@ class RedditConnector(metaclass=ABCMeta):
|
|||
self.create_reddit_instance()
|
||||
self.args.user = list(filter(None, [self.resolve_user_name(user) for user in self.args.user]))
|
||||
|
||||
self.excluded_submission_ids = self.read_excluded_ids()
|
||||
self.excluded_submission_ids = set.union(
|
||||
self.read_id_files(self.args.exclude_id_file),
|
||||
set(self.args.exclude_id),
|
||||
)
|
||||
|
||||
self.args.link = list(itertools.chain(self.args.link, self.read_id_files(self.args.include_id_file)))
|
||||
|
||||
self.master_hash_list = {}
|
||||
self.authenticator = self.create_authenticator()
|
||||
|
@ -184,8 +190,9 @@ class RedditConnector(metaclass=ABCMeta):
|
|||
logger.debug(f'Loading configuration from {path}')
|
||||
break
|
||||
if not self.config_location:
|
||||
self.config_location = list(importlib.resources.path('bdfr', 'default_config.cfg').gen)[0]
|
||||
shutil.copy(self.config_location, Path(self.config_directory, 'default_config.cfg'))
|
||||
with importlib.resources.path('bdfr', 'default_config.cfg') as path:
|
||||
self.config_location = path
|
||||
shutil.copy(self.config_location, Path(self.config_directory, 'default_config.cfg'))
|
||||
if not self.config_location:
|
||||
raise errors.BulkDownloaderException('Could not find a configuration file to load')
|
||||
self.cfg_parser.read(self.config_location)
|
||||
|
@ -403,13 +410,13 @@ class RedditConnector(metaclass=ABCMeta):
|
|||
except prawcore.Forbidden:
|
||||
raise errors.BulkDownloaderException(f'Source {subreddit.display_name} is private and cannot be scraped')
|
||||
|
||||
def read_excluded_ids(self) -> set[str]:
|
||||
@staticmethod
|
||||
def read_id_files(file_locations: list[str]) -> set[str]:
|
||||
out = []
|
||||
out.extend(self.args.exclude_id)
|
||||
for id_file in self.args.exclude_id_file:
|
||||
for id_file in file_locations:
|
||||
id_file = Path(id_file).resolve().expanduser()
|
||||
if not id_file.exists():
|
||||
logger.warning(f'ID exclusion file at {id_file} does not exist')
|
||||
logger.warning(f'ID file at {id_file} does not exist')
|
||||
continue
|
||||
with open(id_file, 'r') as file:
|
||||
for line in file:
|
||||
|
|
|
@ -82,7 +82,7 @@ class RedditDownloader(RedditConnector):
|
|||
logger.debug(f'Download filter removed {submission.id} file with URL {submission.url}')
|
||||
continue
|
||||
try:
|
||||
res.download(self.args.max_wait_time)
|
||||
res.download({'max_wait_time': self.args.max_wait_time})
|
||||
except errors.BulkDownloaderException as e:
|
||||
logger.error(f'Failed to download resource {res.url} in submission {submission.id} '
|
||||
f'with downloader {downloader_class.__name__}: {e}')
|
||||
|
|
|
@ -6,7 +6,7 @@ import logging
|
|||
import re
|
||||
import time
|
||||
import urllib.parse
|
||||
from typing import Optional
|
||||
from typing import Callable, Optional
|
||||
|
||||
import _hashlib
|
||||
import requests
|
||||
|
@ -18,40 +18,52 @@ logger = logging.getLogger(__name__)
|
|||
|
||||
|
||||
class Resource:
|
||||
def __init__(self, source_submission: Submission, url: str, extension: str = None):
|
||||
def __init__(self, source_submission: Submission, url: str, download_function: Callable, extension: str = None):
|
||||
self.source_submission = source_submission
|
||||
self.content: Optional[bytes] = None
|
||||
self.url = url
|
||||
self.hash: Optional[_hashlib.HASH] = None
|
||||
self.extension = extension
|
||||
self.download_function = download_function
|
||||
if not self.extension:
|
||||
self.extension = self._determine_extension()
|
||||
|
||||
@staticmethod
|
||||
def retry_download(url: str, max_wait_time: int, current_wait_time: int = 60) -> Optional[bytes]:
|
||||
try:
|
||||
response = requests.get(url)
|
||||
if re.match(r'^2\d{2}', str(response.status_code)) and response.content:
|
||||
return response.content
|
||||
elif response.status_code in (408, 429):
|
||||
raise requests.exceptions.ConnectionError(f'Response code {response.status_code}')
|
||||
else:
|
||||
raise BulkDownloaderException(
|
||||
f'Unrecoverable error requesting resource: HTTP Code {response.status_code}')
|
||||
except (requests.exceptions.ConnectionError, requests.exceptions.ChunkedEncodingError) as e:
|
||||
logger.warning(f'Error occured downloading from {url}, waiting {current_wait_time} seconds: {e}')
|
||||
time.sleep(current_wait_time)
|
||||
if current_wait_time < max_wait_time:
|
||||
current_wait_time += 60
|
||||
return Resource.retry_download(url, max_wait_time, current_wait_time)
|
||||
else:
|
||||
logger.error(f'Max wait time exceeded for resource at url {url}')
|
||||
raise
|
||||
def retry_download(url: str) -> Callable:
|
||||
max_wait_time = 300
|
||||
|
||||
def download(self, max_wait_time: int):
|
||||
def http_download(download_parameters: dict) -> Optional[bytes]:
|
||||
current_wait_time = 60
|
||||
if 'max_wait_time' in download_parameters:
|
||||
max_wait_time = download_parameters['max_wait_time']
|
||||
else:
|
||||
max_wait_time = 300
|
||||
while True:
|
||||
try:
|
||||
response = requests.get(url)
|
||||
if re.match(r'^2\d{2}', str(response.status_code)) and response.content:
|
||||
return response.content
|
||||
elif response.status_code in (408, 429):
|
||||
raise requests.exceptions.ConnectionError(f'Response code {response.status_code}')
|
||||
else:
|
||||
raise BulkDownloaderException(
|
||||
f'Unrecoverable error requesting resource: HTTP Code {response.status_code}')
|
||||
except (requests.exceptions.ConnectionError, requests.exceptions.ChunkedEncodingError) as e:
|
||||
logger.warning(f'Error occured downloading from {url}, waiting {current_wait_time} seconds: {e}')
|
||||
time.sleep(current_wait_time)
|
||||
if current_wait_time < max_wait_time:
|
||||
current_wait_time += 60
|
||||
else:
|
||||
logger.error(f'Max wait time exceeded for resource at url {url}')
|
||||
raise
|
||||
return http_download
|
||||
|
||||
def download(self, download_parameters: Optional[dict] = None):
|
||||
if download_parameters is None:
|
||||
download_parameters = {}
|
||||
if not self.content:
|
||||
try:
|
||||
content = self.retry_download(self.url, max_wait_time)
|
||||
content = self.download_function(download_parameters)
|
||||
except requests.exceptions.ConnectionError as e:
|
||||
raise BulkDownloaderException(f'Could not download resource: {e}')
|
||||
except BulkDownloaderException:
|
||||
|
|
|
@ -14,4 +14,4 @@ class Direct(BaseDownloader):
|
|||
super().__init__(post)
|
||||
|
||||
def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]:
|
||||
return [Resource(self.post, self.post.url)]
|
||||
return [Resource(self.post, self.post.url, Resource.retry_download(self.post.url))]
|
||||
|
|
|
@ -16,6 +16,7 @@ from bdfr.site_downloaders.imgur import Imgur
|
|||
from bdfr.site_downloaders.pornhub import PornHub
|
||||
from bdfr.site_downloaders.redgifs import Redgifs
|
||||
from bdfr.site_downloaders.self_post import SelfPost
|
||||
from bdfr.site_downloaders.vidble import Vidble
|
||||
from bdfr.site_downloaders.youtube import Youtube
|
||||
|
||||
|
||||
|
@ -46,11 +47,12 @@ class DownloadFactory:
|
|||
return Direct
|
||||
elif re.match(r'pornhub\.com.*', sanitised_url):
|
||||
return PornHub
|
||||
elif re.match(r'vidble\.com', sanitised_url):
|
||||
return Vidble
|
||||
elif YoutubeDlFallback.can_handle_link(sanitised_url):
|
||||
return YoutubeDlFallback
|
||||
else:
|
||||
raise NotADownloadableLinkError(
|
||||
f'No downloader module exists for url {url}')
|
||||
raise NotADownloadableLinkError(f'No downloader module exists for url {url}')
|
||||
|
||||
@staticmethod
|
||||
def sanitise_url(url: str) -> str:
|
||||
|
|
|
@ -29,7 +29,7 @@ class Erome(BaseDownloader):
|
|||
for link in links:
|
||||
if not re.match(r'https?://.*', link):
|
||||
link = 'https://' + link
|
||||
out.append(Resource(self.post, link))
|
||||
out.append(Resource(self.post, link, Resource.retry_download(link)))
|
||||
return out
|
||||
|
||||
@staticmethod
|
||||
|
|
|
@ -4,7 +4,6 @@
|
|||
import logging
|
||||
from typing import Optional
|
||||
|
||||
import youtube_dl
|
||||
from praw.models import Submission
|
||||
|
||||
from bdfr.resource import Resource
|
||||
|
@ -20,21 +19,18 @@ class YoutubeDlFallback(BaseFallbackDownloader, Youtube):
|
|||
super(YoutubeDlFallback, self).__init__(post)
|
||||
|
||||
def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]:
|
||||
out = super()._download_video({})
|
||||
out = Resource(
|
||||
self.post,
|
||||
self.post.url,
|
||||
super()._download_video({}),
|
||||
super().get_video_attributes(self.post.url)['ext'],
|
||||
)
|
||||
return [out]
|
||||
|
||||
@staticmethod
|
||||
def can_handle_link(url: str) -> bool:
|
||||
yt_logger = logging.getLogger('youtube-dl')
|
||||
yt_logger.setLevel(logging.CRITICAL)
|
||||
with youtube_dl.YoutubeDL({
|
||||
'logger': yt_logger,
|
||||
}) as ydl:
|
||||
try:
|
||||
result = ydl.extract_info(url, download=False)
|
||||
if result:
|
||||
return True
|
||||
except Exception as e:
|
||||
logger.exception(e)
|
||||
return False
|
||||
return False
|
||||
attributes = YoutubeDlFallback.get_video_attributes(url)
|
||||
if attributes:
|
||||
return True
|
||||
else:
|
||||
return False
|
||||
|
|
|
@ -21,7 +21,7 @@ class Gallery(BaseDownloader):
|
|||
def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]:
|
||||
try:
|
||||
image_urls = self._get_links(self.post.gallery_data['items'])
|
||||
except AttributeError:
|
||||
except (AttributeError, TypeError):
|
||||
try:
|
||||
image_urls = self._get_links(self.post.crosspost_parent_list[0]['gallery_data']['items'])
|
||||
except (AttributeError, IndexError, TypeError):
|
||||
|
@ -31,7 +31,7 @@ class Gallery(BaseDownloader):
|
|||
|
||||
if not image_urls:
|
||||
raise SiteDownloaderError('No images found in Reddit gallery')
|
||||
return [Resource(self.post, url) for url in image_urls]
|
||||
return [Resource(self.post, url, Resource.retry_download(url)) for url in image_urls]
|
||||
|
||||
@ staticmethod
|
||||
def _get_links(id_dict: list[dict]) -> list[str]:
|
||||
|
|
|
@ -33,7 +33,7 @@ class Imgur(BaseDownloader):
|
|||
|
||||
def _compute_image_url(self, image: dict) -> Resource:
|
||||
image_url = 'https://i.imgur.com/' + image['hash'] + self._validate_extension(image['ext'])
|
||||
return Resource(self.post, image_url)
|
||||
return Resource(self.post, image_url, Resource.retry_download(image_url))
|
||||
|
||||
@staticmethod
|
||||
def _get_data(link: str) -> dict:
|
||||
|
|
|
@ -22,5 +22,10 @@ class PornHub(Youtube):
|
|||
'format': 'best',
|
||||
'nooverwrites': True,
|
||||
}
|
||||
out = self._download_video(ytdl_options)
|
||||
out = Resource(
|
||||
self.post,
|
||||
self.post.url,
|
||||
super()._download_video(ytdl_options),
|
||||
super().get_video_attributes(self.post.url)['ext'],
|
||||
)
|
||||
return [out]
|
||||
|
|
|
@ -18,7 +18,7 @@ class Redgifs(BaseDownloader):
|
|||
|
||||
def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]:
|
||||
media_url = self._get_link(self.post.url)
|
||||
return [Resource(self.post, media_url, '.mp4')]
|
||||
return [Resource(self.post, media_url, Resource.retry_download(media_url), '.mp4')]
|
||||
|
||||
@staticmethod
|
||||
def _get_link(url: str) -> str:
|
||||
|
|
|
@ -17,7 +17,7 @@ class SelfPost(BaseDownloader):
|
|||
super().__init__(post)
|
||||
|
||||
def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]:
|
||||
out = Resource(self.post, self.post.url, '.txt')
|
||||
out = Resource(self.post, self.post.url, lambda: None, '.txt')
|
||||
out.content = self.export_to_string().encode('utf-8')
|
||||
out.create_hash()
|
||||
return [out]
|
||||
|
|
48
bdfr/site_downloaders/vidble.py
Normal file
48
bdfr/site_downloaders/vidble.py
Normal file
|
@ -0,0 +1,48 @@
|
|||
#!/usr/bin/env python3
|
||||
# coding=utf-8
|
||||
import itertools
|
||||
import logging
|
||||
import re
|
||||
from typing import Optional
|
||||
|
||||
import bs4
|
||||
import requests
|
||||
from praw.models import Submission
|
||||
|
||||
from bdfr.exceptions import SiteDownloaderError
|
||||
from bdfr.resource import Resource
|
||||
from bdfr.site_authenticator import SiteAuthenticator
|
||||
from bdfr.site_downloaders.base_downloader import BaseDownloader
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class Vidble(BaseDownloader):
    """Site downloader that collects image and video links from a vidble.com page."""

    def __init__(self, post: Submission):
        super().__init__(post)

    def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]:
        """Create one Resource for every media link found on the submission's page.

        Args:
            authenticator: Unused by this downloader; accepted for interface compatibility.

        Returns:
            A list of Resource objects, each using the HTTP retry download function.

        Raises:
            SiteDownloaderError: If no media links could be extracted from the page.
        """
        links = self.get_links(self.post.url)
        if not links:
            raise SiteDownloaderError(f'No resources found at {self.post.url}')
        return [Resource(self.post, link, Resource.retry_download(link)) for link in links]

    @staticmethod
    def get_links(url: str) -> set[str]:
        """Scrape the page at *url* and return the absolute URLs of its media.

        Image ``src`` attributes and ``video/mp4`` source ``src`` attributes are read
        from the content container, prefixed with the site address, and passed
        through ``change_med_url``.

        Returns:
            A set of URLs; empty when the page has no content container or no media.
        """
        page = requests.get(url)
        soup = bs4.BeautifulSoup(page.text, 'html.parser')
        content_div = soup.find('div', attrs={'id': 'ContentPlaceHolder1_divContent'})
        if content_div is None:
            # Without the container there is nothing to scrape; returning an empty set lets
            # find_resources raise SiteDownloaderError instead of crashing on AttributeError.
            return set()
        image_sources = [image.get('src') for image in content_div.find_all('img')]
        video_sources = [
            video.get('src') for video in content_div.find_all('source', attrs={'type': 'video/mp4'})
        ]
        # Tags without a src attribute yield None and are dropped here.
        relative_links = filter(None, itertools.chain(image_sources, video_sources))
        return {Vidble.change_med_url('https://www.vidble.com' + link) for link in relative_links}

    @staticmethod
    def change_med_url(url: str) -> str:
        """Strip a trailing ``_med`` marker that directly precedes a 3-4 character file extension.

        NOTE(review): presumably ``_med`` denotes a reduced-size variant of the file - confirm.
        """
        return re.sub(r'_med(\..{3,4})$', r'\1', url)
|
|
@ -3,12 +3,12 @@
|
|||
import logging
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
from typing import Callable, Optional
|
||||
|
||||
import youtube_dl
|
||||
from praw.models import Submission
|
||||
|
||||
from bdfr.exceptions import (NotADownloadableLinkError, SiteDownloaderError)
|
||||
from bdfr.exceptions import NotADownloadableLinkError, SiteDownloaderError
|
||||
from bdfr.resource import Resource
|
||||
from bdfr.site_authenticator import SiteAuthenticator
|
||||
from bdfr.site_downloaders.base_downloader import BaseDownloader
|
||||
|
@ -26,32 +26,47 @@ class Youtube(BaseDownloader):
|
|||
'playlistend': 1,
|
||||
'nooverwrites': True,
|
||||
}
|
||||
out = self._download_video(ytdl_options)
|
||||
return [out]
|
||||
download_function = self._download_video(ytdl_options)
|
||||
try:
|
||||
extension = self.get_video_attributes(self.post.url)['ext']
|
||||
except KeyError:
|
||||
raise NotADownloadableLinkError(f'Youtube-DL cannot download URL {self.post.url}')
|
||||
res = Resource(self.post, self.post.url, download_function, extension)
|
||||
return [res]
|
||||
|
||||
def _download_video(self, ytdl_options: dict) -> Resource:
|
||||
def _download_video(self, ytdl_options: dict) -> Callable:
|
||||
yt_logger = logging.getLogger('youtube-dl')
|
||||
yt_logger.setLevel(logging.CRITICAL)
|
||||
ytdl_options['quiet'] = True
|
||||
ytdl_options['logger'] = yt_logger
|
||||
with tempfile.TemporaryDirectory() as temp_dir:
|
||||
download_path = Path(temp_dir).resolve()
|
||||
ytdl_options['outtmpl'] = str(download_path) + '/' + 'test.%(ext)s'
|
||||
try:
|
||||
with youtube_dl.YoutubeDL(ytdl_options) as ydl:
|
||||
ydl.download([self.post.url])
|
||||
except youtube_dl.DownloadError as e:
|
||||
raise SiteDownloaderError(f'Youtube download failed: {e}')
|
||||
|
||||
downloaded_files = list(download_path.iterdir())
|
||||
if len(downloaded_files) > 0:
|
||||
downloaded_file = downloaded_files[0]
|
||||
else:
|
||||
raise NotADownloadableLinkError(f"No media exists in the URL {self.post.url}")
|
||||
extension = downloaded_file.suffix
|
||||
with open(downloaded_file, 'rb') as file:
|
||||
content = file.read()
|
||||
out = Resource(self.post, self.post.url, extension)
|
||||
out.content = content
|
||||
out.create_hash()
|
||||
return out
|
||||
def download(_: dict) -> bytes:
|
||||
with tempfile.TemporaryDirectory() as temp_dir:
|
||||
download_path = Path(temp_dir).resolve()
|
||||
ytdl_options['outtmpl'] = str(download_path) + '/' + 'test.%(ext)s'
|
||||
try:
|
||||
with youtube_dl.YoutubeDL(ytdl_options) as ydl:
|
||||
ydl.download([self.post.url])
|
||||
except youtube_dl.DownloadError as e:
|
||||
raise SiteDownloaderError(f'Youtube download failed: {e}')
|
||||
|
||||
downloaded_files = list(download_path.iterdir())
|
||||
if len(downloaded_files) > 0:
|
||||
downloaded_file = downloaded_files[0]
|
||||
else:
|
||||
raise NotADownloadableLinkError(f"No media exists in the URL {self.post.url}")
|
||||
with open(downloaded_file, 'rb') as file:
|
||||
content = file.read()
|
||||
return content
|
||||
return download
|
||||
|
||||
@staticmethod
def get_video_attributes(url: str) -> dict:
    """Ask youtube-dl for the metadata of the media at *url* without downloading it.

    Args:
        url: Address of the page or media to inspect.

    Returns:
        The info dictionary produced by ``extract_info``, or an empty dictionary when
        extraction fails. An empty dictionary keeps the annotated return type, is falsy
        for truthiness checks, and makes key lookups fail with ``KeyError`` rather than
        a ``TypeError`` on ``None``.
    """
    yt_logger = logging.getLogger('youtube-dl')
    # Silence youtube-dl's own logging below CRITICAL; failures are reported through this module's logger.
    yt_logger.setLevel(logging.CRITICAL)
    with youtube_dl.YoutubeDL({'logger': yt_logger, }) as ydl:
        try:
            result = ydl.extract_info(url, download=False)
        except Exception as e:
            # Deliberately broad best-effort: any extraction failure means "no attributes".
            logger.exception(e)
            return {}
    return result or {}
|
||||
|
|
|
@ -4,7 +4,7 @@ description_file = README.md
|
|||
description_content_type = text/markdown
|
||||
home_page = https://github.com/aliparlakci/bulk-downloader-for-reddit
|
||||
keywords = reddit, download, archive
|
||||
version = 2.3.0
|
||||
version = 2.4.0
|
||||
author = Ali Parlakci
|
||||
author_email = parlakciali@gmail.com
|
||||
maintainer = Serene Arc
|
||||
|
|
|
@ -55,6 +55,23 @@ def test_cli_download_subreddits(test_args: list[str], tmp_path: Path):
|
|||
result = runner.invoke(cli, test_args)
|
||||
assert result.exit_code == 0
|
||||
assert 'Added submissions from subreddit ' in result.output
|
||||
assert 'Downloaded submission' in result.output
|
||||
|
||||
|
||||
@pytest.mark.online
@pytest.mark.reddit
@pytest.mark.authenticated
@pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests')
@pytest.mark.parametrize('test_args', (
    ['-s', 'hentai', '-L', 10, '--search', 'red', '--authenticate'],
))
def test_cli_download_search_subreddits_authenticated(test_args: list[str], tmp_path: Path):
    """Run an authenticated subreddit search through the CLI and expect submissions to be downloaded."""
    cli_args = create_basic_args_for_download_runner(test_args, tmp_path)
    result = CliRunner().invoke(cli, cli_args)
    assert result.exit_code == 0
    for expected_message in ('Added submissions from subreddit ', 'Downloaded submission'):
        assert expected_message in result.output
|
||||
|
||||
|
||||
@pytest.mark.online
|
||||
|
@ -306,3 +323,17 @@ def test_cli_download_disable_modules(test_args: list[str], tmp_path: Path):
|
|||
assert result.exit_code == 0
|
||||
assert 'skipped due to disabled module' in result.output
|
||||
assert 'Downloaded submission' not in result.output
|
||||
|
||||
|
||||
@pytest.mark.online
@pytest.mark.reddit
@pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests')
def test_cli_download_include_id_file(tmp_path: Path):
    """Submissions whose IDs are listed in an --include-id-file are downloaded."""
    id_file = Path(tmp_path, 'include.txt')
    # One submission ID per line.
    id_file.write_text('odr9wg\nody576')
    cli_args = create_basic_args_for_download_runner(['--include-id-file', str(id_file)], tmp_path)
    result = CliRunner().invoke(cli, cli_args)
    assert result.exit_code == 0
    assert 'Downloaded submission' in result.output
|
||||
|
|
|
@ -21,5 +21,5 @@ def test_download_resource(test_url: str, expected_hash: str):
|
|||
resources = test_site.find_resources()
|
||||
assert len(resources) == 1
|
||||
assert isinstance(resources[0], Resource)
|
||||
resources[0].download(120)
|
||||
resources[0].download()
|
||||
assert resources[0].hash.hexdigest() == expected_hash
|
||||
|
|
|
@ -14,13 +14,13 @@ from bdfr.site_downloaders.erome import Erome
|
|||
'https://s11.erome.com/365/vqtPuLXh/KH2qBT99_480p.mp4',
|
||||
)),
|
||||
('https://www.erome.com/a/ORhX0FZz', (
|
||||
'https://s4.erome.com/355/ORhX0FZz/9IYQocM9_480p.mp4',
|
||||
'https://s4.erome.com/355/ORhX0FZz/9eEDc8xm_480p.mp4',
|
||||
'https://s4.erome.com/355/ORhX0FZz/EvApC7Rp_480p.mp4',
|
||||
'https://s4.erome.com/355/ORhX0FZz/LruobtMs_480p.mp4',
|
||||
'https://s4.erome.com/355/ORhX0FZz/TJNmSUU5_480p.mp4',
|
||||
'https://s4.erome.com/355/ORhX0FZz/X11Skh6Z_480p.mp4',
|
||||
'https://s4.erome.com/355/ORhX0FZz/bjlTkpn7_480p.mp4'
|
||||
'https://s15.erome.com/355/ORhX0FZz/9IYQocM9_480p.mp4',
|
||||
'https://s15.erome.com/355/ORhX0FZz/9eEDc8xm_480p.mp4',
|
||||
'https://s15.erome.com/355/ORhX0FZz/EvApC7Rp_480p.mp4',
|
||||
'https://s15.erome.com/355/ORhX0FZz/LruobtMs_480p.mp4',
|
||||
'https://s15.erome.com/355/ORhX0FZz/TJNmSUU5_480p.mp4',
|
||||
'https://s15.erome.com/355/ORhX0FZz/X11Skh6Z_480p.mp4',
|
||||
'https://s15.erome.com/355/ORhX0FZz/bjlTkpn7_480p.mp4'
|
||||
)),
|
||||
))
|
||||
def test_get_link(test_url: str, expected_urls: tuple[str]):
|
||||
|
@ -49,6 +49,6 @@ def test_download_resource(test_url: str, expected_hashes: tuple[str]):
|
|||
mock_submission.url = test_url
|
||||
test_site = Erome(mock_submission)
|
||||
resources = test_site.find_resources()
|
||||
[res.download(120) for res in resources]
|
||||
[res.download() for res in resources]
|
||||
resource_hashes = [res.hash.hexdigest() for res in resources]
|
||||
assert len(resource_hashes) == len(expected_hashes)
|
||||
|
|
|
@ -4,6 +4,7 @@
|
|||
import praw
|
||||
import pytest
|
||||
|
||||
from bdfr.exceptions import SiteDownloaderError
|
||||
from bdfr.site_downloaders.gallery import Gallery
|
||||
|
||||
|
||||
|
@ -52,10 +53,6 @@ def test_gallery_get_links(test_ids: list[dict], expected: set[str]):
|
|||
'808c35267f44acb523ce03bfa5687404',
|
||||
'ec8b65bdb7f1279c4b3af0ea2bbb30c3',
|
||||
}),
|
||||
('nxyahw', {
|
||||
'b89a3f41feb73ec1136ec4ffa7353eb1',
|
||||
'cabb76fd6fd11ae6e115a2039eb09f04',
|
||||
}),
|
||||
('obkflw', {
|
||||
'65163f685fb28c5b776e0e77122718be',
|
||||
'2a337eb5b13c34d3ca3f51b5db7c13e9',
|
||||
|
@ -65,6 +62,17 @@ def test_gallery_download(test_submission_id: str, expected_hashes: set[str], re
|
|||
test_submission = reddit_instance.submission(id=test_submission_id)
|
||||
gallery = Gallery(test_submission)
|
||||
results = gallery.find_resources()
|
||||
[res.download(120) for res in results]
|
||||
[res.download() for res in results]
|
||||
hashes = [res.hash.hexdigest() for res in results]
|
||||
assert set(hashes) == expected_hashes
|
||||
|
||||
|
||||
# Marked online/reddit because the submission is fetched through the live Reddit instance,
# so the test can be deselected when running offline.
@pytest.mark.online
@pytest.mark.reddit
@pytest.mark.parametrize('test_id', (
    'n0pyzp',
    'nxyahw',
))
def test_gallery_download_raises_right_error(test_id: str, reddit_instance: praw.Reddit):
    """Gallery.find_resources raises SiteDownloaderError for these submissions instead of another exception type."""
    test_submission = reddit_instance.submission(id=test_id)
    gallery = Gallery(test_submission)
    with pytest.raises(SiteDownloaderError):
        gallery.find_resources()
|
||||
|
|
|
@ -13,8 +13,6 @@ from bdfr.site_downloaders.gfycat import Gfycat
|
|||
@pytest.mark.parametrize(('test_url', 'expected_url'), (
|
||||
('https://gfycat.com/definitivecaninecrayfish', 'https://giant.gfycat.com/DefinitiveCanineCrayfish.mp4'),
|
||||
('https://gfycat.com/dazzlingsilkyiguana', 'https://giant.gfycat.com/DazzlingSilkyIguana.mp4'),
|
||||
('https://gfycat.com/webbedimpurebutterfly', 'https://thumbs2.redgifs.com/WebbedImpureButterfly.mp4'),
|
||||
('https://gfycat.com/CornyLoathsomeHarrierhawk', 'https://thumbs2.redgifs.com/CornyLoathsomeHarrierhawk.mp4')
|
||||
))
|
||||
def test_get_link(test_url: str, expected_url: str):
|
||||
result = Gfycat._get_link(test_url)
|
||||
|
@ -33,5 +31,5 @@ def test_download_resource(test_url: str, expected_hash: str):
|
|||
resources = test_site.find_resources()
|
||||
assert len(resources) == 1
|
||||
assert isinstance(resources[0], Resource)
|
||||
resources[0].download(120)
|
||||
resources[0].download()
|
||||
assert resources[0].hash.hexdigest() == expected_hash
|
||||
|
|
|
@ -149,6 +149,6 @@ def test_find_resources(test_url: str, expected_hashes: list[str]):
|
|||
downloader = Imgur(mock_download)
|
||||
results = downloader.find_resources()
|
||||
assert all([isinstance(res, Resource) for res in results])
|
||||
[res.download(120) for res in results]
|
||||
[res.download() for res in results]
|
||||
hashes = set([res.hash.hexdigest() for res in results])
|
||||
assert hashes == set(expected_hashes)
|
||||
|
|
|
@ -21,5 +21,5 @@ def test_find_resources_good(test_url: str, expected_hash: str):
|
|||
resources = downloader.find_resources()
|
||||
assert len(resources) == 1
|
||||
assert isinstance(resources[0], Resource)
|
||||
resources[0].download(120)
|
||||
resources[0].download()
|
||||
assert resources[0].hash.hexdigest() == expected_hash
|
||||
|
|
|
@ -15,10 +15,8 @@ from bdfr.site_downloaders.redgifs import Redgifs
|
|||
'https://thumbs2.redgifs.com/FrighteningVictoriousSalamander.mp4'),
|
||||
('https://redgifs.com/watch/springgreendecisivetaruca',
|
||||
'https://thumbs2.redgifs.com/SpringgreenDecisiveTaruca.mp4'),
|
||||
('https://www.gifdeliverynetwork.com/regalshoddyhorsechestnutleafminer',
|
||||
'https://thumbs2.redgifs.com/RegalShoddyHorsechestnutleafminer.mp4'),
|
||||
('https://www.gifdeliverynetwork.com/maturenexthippopotamus',
|
||||
'https://thumbs2.redgifs.com/MatureNextHippopotamus.mp4'),
|
||||
('https://www.redgifs.com/watch/palegoldenrodrawhalibut',
|
||||
'https://thumbs2.redgifs.com/PalegoldenrodRawHalibut.mp4'),
|
||||
))
|
||||
def test_get_link(test_url: str, expected: str):
|
||||
result = Redgifs._get_link(test_url)
|
||||
|
@ -29,9 +27,8 @@ def test_get_link(test_url: str, expected: str):
|
|||
@pytest.mark.parametrize(('test_url', 'expected_hash'), (
|
||||
('https://redgifs.com/watch/frighteningvictorioussalamander', '4007c35d9e1f4b67091b5f12cffda00a'),
|
||||
('https://redgifs.com/watch/springgreendecisivetaruca', '8dac487ac49a1f18cc1b4dabe23f0869'),
|
||||
('https://www.gifdeliverynetwork.com/maturenexthippopotamus', '9bec0a9e4163a43781368ed5d70471df'),
|
||||
('https://www.gifdeliverynetwork.com/regalshoddyhorsechestnutleafminer', '8afb4e2c090a87140230f2352bf8beba'),
|
||||
('https://redgifs.com/watch/leafysaltydungbeetle', '076792c660b9c024c0471ef4759af8bd'),
|
||||
('https://www.redgifs.com/watch/palegoldenrodrawhalibut', '46d5aa77fe80c6407de1ecc92801c10e'),
|
||||
))
|
||||
def test_download_resource(test_url: str, expected_hash: str):
|
||||
mock_submission = Mock()
|
||||
|
@ -40,5 +37,5 @@ def test_download_resource(test_url: str, expected_hash: str):
|
|||
resources = test_site.find_resources()
|
||||
assert len(resources) == 1
|
||||
assert isinstance(resources[0], Resource)
|
||||
resources[0].download(120)
|
||||
resources[0].download()
|
||||
assert resources[0].hash.hexdigest() == expected_hash
|
||||
|
|
67
tests/site_downloaders/test_vidble.py
Normal file
67
tests/site_downloaders/test_vidble.py
Normal file
|
@ -0,0 +1,67 @@
|
|||
#!/usr/bin/env python3
|
||||
# coding=utf-8
|
||||
from unittest.mock import Mock
|
||||
|
||||
import pytest
|
||||
|
||||
from bdfr.resource import Resource
|
||||
from bdfr.site_downloaders.vidble import Vidble
|
||||
|
||||
|
||||
@pytest.mark.parametrize(('test_url', 'expected'), (
    ('/RDFbznUvcN_med.jpg', '/RDFbznUvcN.jpg'),
))
def test_change_med_url(test_url: str, expected: str):
    """The ``_med`` marker is removed from the file name while the extension is kept."""
    assert Vidble.change_med_url(test_url) == expected
|
||||
|
||||
|
||||
@pytest.mark.online
@pytest.mark.parametrize(('test_url', 'expected'), (
    ('https://www.vidble.com/show/UxsvAssYe5', {
        'https://www.vidble.com/UxsvAssYe5.gif',
    }),
    ('https://vidble.com/show/RDFbznUvcN', {
        'https://www.vidble.com/RDFbznUvcN.jpg',
    }),
    ('https://vidble.com/album/h0jTLs6B', {
        'https://www.vidble.com/XG4eAoJ5JZ.jpg',
        'https://www.vidble.com/IqF5UdH6Uq.jpg',
        'https://www.vidble.com/VWuNsnLJMD.jpg',
        'https://www.vidble.com/sMmM8O650W.jpg',
    }),
    ('https://vidble.com/watch?v=0q4nWakqM6kzQWxlePD8N62Dsflev0N9', {
        'https://www.vidble.com/0q4nWakqM6kzQWxlePD8N62Dsflev0N9.mp4',
    }),
))
def test_get_links(test_url: str, expected: set[str]):
    """Vidble.get_links returns exactly the expected media URLs for single images, albums and videos."""
    assert Vidble.get_links(test_url) == expected
|
||||
|
||||
|
||||
@pytest.mark.online
@pytest.mark.parametrize(('test_url', 'expected_hashes'), (
    ('https://www.vidble.com/show/UxsvAssYe5', {
        '0ef2f8e0e0b45936d2fb3e6fbdf67e28',
    }),
    ('https://vidble.com/show/RDFbznUvcN', {
        'c2dd30a71e32369c50eed86f86efff58',
    }),
    ('https://vidble.com/album/h0jTLs6B', {
        '3b3cba02e01c91f9858a95240b942c71',
        'dd6ecf5fc9e936f9fb614eb6a0537f99',
        'b31a942cd8cdda218ed547bbc04c3a27',
        '6f77c570b451eef4222804bd52267481',
    }),
    ('https://vidble.com/watch?v=0q4nWakqM6kzQWxlePD8N62Dsflev0N9', {
        'cebe9d5f24dba3b0443e5097f160ca83',
    }),
))
def test_find_resources(test_url: str, expected_hashes: set[str]):
    """Download every resource ``Vidble`` finds for a URL and compare the content hashes.

    A ``Mock`` stands in for the submission; only its ``url`` attribute is
    set here. The test is marked ``online`` like ``test_get_links`` because
    it calls ``download()`` on each resource, so it can be deselected
    together with the other network-dependent tests.

    :param test_url: Vidble page URL assigned to the mocked submission.
    :param expected_hashes: Hex digests expected across all downloaded resources.
    """
    mock_download = Mock()
    mock_download.url = test_url
    downloader = Vidble(mock_download)
    results = downloader.find_resources()
    assert all(isinstance(res, Resource) for res in results)
    # Plain loop: download() is called only for its effect and its return value is unused.
    for res in results:
        res.download()
    hashes = {res.hash.hexdigest() for res in results}
    assert hashes == expected_hashes
|
|
@ -23,7 +23,7 @@ def test_find_resources_good(test_url: str, expected_hash: str):
|
|||
resources = downloader.find_resources()
|
||||
assert len(resources) == 1
|
||||
assert isinstance(resources[0], Resource)
|
||||
resources[0].download(120)
|
||||
resources[0].download()
|
||||
assert resources[0].hash.hexdigest() == expected_hash
|
||||
|
||||
|
||||
|
|
|
@ -199,10 +199,9 @@ def test_get_subreddit_normal(
|
|||
@pytest.mark.reddit
|
||||
@pytest.mark.parametrize(('test_subreddits', 'search_term', 'limit', 'time_filter', 'max_expected_len'), (
|
||||
(('Python',), 'scraper', 10, 'all', 10),
|
||||
(('Python',), '', 10, 'all', 10),
|
||||
(('Python',), '', 10, 'all', 0),
|
||||
(('Python',), 'djsdsgewef', 10, 'all', 0),
|
||||
(('Python',), 'scraper', 10, 'year', 10),
|
||||
(('Python',), 'scraper', 10, 'hour', 1),
|
||||
))
|
||||
def test_get_subreddit_search(
|
||||
test_subreddits: list[str],
|
||||
|
@ -226,6 +225,8 @@ def test_get_subreddit_search(
|
|||
assert all([isinstance(res, praw.models.Submission) for res in results])
|
||||
assert all([res.subreddit.display_name in test_subreddits for res in results])
|
||||
assert len(results) <= max_expected_len
|
||||
if max_expected_len != 0:
|
||||
assert len(results) > 0
|
||||
assert not any([isinstance(m, MagicMock) for m in results])
|
||||
|
||||
|
||||
|
@ -339,11 +340,10 @@ def test_split_subreddit_entries(test_subreddit_entries: list[str], expected: se
|
|||
assert results == expected
|
||||
|
||||
|
||||
def test_read_excluded_submission_ids_from_file(downloader_mock: MagicMock, tmp_path: Path):
|
||||
def test_read_submission_ids_from_file(downloader_mock: MagicMock, tmp_path: Path):
|
||||
test_file = tmp_path / 'test.txt'
|
||||
test_file.write_text('aaaaaa\nbbbbbb')
|
||||
downloader_mock.args.exclude_id_file = [test_file]
|
||||
results = RedditConnector.read_excluded_ids(downloader_mock)
|
||||
results = RedditConnector.read_id_files([str(test_file)])
|
||||
assert results == {'aaaaaa', 'bbbbbb'}
|
||||
|
||||
|
||||
|
|
|
@ -46,7 +46,7 @@ def test_filter_domain(test_url: str, expected: bool, download_filter: DownloadF
|
|||
('http://reddit.com/test.gif', False),
|
||||
))
|
||||
def test_filter_all(test_url: str, expected: bool, download_filter: DownloadFilter):
|
||||
test_resource = Resource(MagicMock(), test_url)
|
||||
test_resource = Resource(MagicMock(), test_url, lambda: None)
|
||||
result = download_filter.check_resource(test_resource)
|
||||
assert result == expected
|
||||
|
||||
|
@ -59,6 +59,6 @@ def test_filter_all(test_url: str, expected: bool, download_filter: DownloadFilt
|
|||
))
|
||||
def test_filter_empty_filter(test_url: str):
|
||||
download_filter = DownloadFilter()
|
||||
test_resource = Resource(MagicMock(), test_url)
|
||||
test_resource = Resource(MagicMock(), test_url, lambda: None)
|
||||
result = download_filter.check_resource(test_resource)
|
||||
assert result is True
|
||||
|
|
|
@ -119,7 +119,7 @@ def test_format_full(
|
|||
format_string_file: str,
|
||||
expected: str,
|
||||
reddit_submission: praw.models.Submission):
|
||||
test_resource = Resource(reddit_submission, 'i.reddit.com/blabla.png')
|
||||
test_resource = Resource(reddit_submission, 'i.reddit.com/blabla.png', lambda: None)
|
||||
test_formatter = FileNameFormatter(format_string_file, format_string_directory, 'ISO')
|
||||
result = test_formatter.format_path(test_resource, Path('test'))
|
||||
assert do_test_path_equality(result, expected)
|
||||
|
@ -136,7 +136,7 @@ def test_format_full_conform(
|
|||
format_string_directory: str,
|
||||
format_string_file: str,
|
||||
reddit_submission: praw.models.Submission):
|
||||
test_resource = Resource(reddit_submission, 'i.reddit.com/blabla.png')
|
||||
test_resource = Resource(reddit_submission, 'i.reddit.com/blabla.png', lambda: None)
|
||||
test_formatter = FileNameFormatter(format_string_file, format_string_directory, 'ISO')
|
||||
test_formatter.format_path(test_resource, Path('test'))
|
||||
|
||||
|
@ -156,7 +156,7 @@ def test_format_full_with_index_suffix(
|
|||
expected: str,
|
||||
reddit_submission: praw.models.Submission,
|
||||
):
|
||||
test_resource = Resource(reddit_submission, 'i.reddit.com/blabla.png')
|
||||
test_resource = Resource(reddit_submission, 'i.reddit.com/blabla.png', lambda: None)
|
||||
test_formatter = FileNameFormatter(format_string_file, format_string_directory, 'ISO')
|
||||
result = test_formatter.format_path(test_resource, Path('test'), index)
|
||||
assert do_test_path_equality(result, expected)
|
||||
|
@ -216,7 +216,7 @@ def test_shorten_filenames(submission: MagicMock, tmp_path: Path):
|
|||
submission.author.name = 'test'
|
||||
submission.subreddit.display_name = 'test'
|
||||
submission.id = 'BBBBBB'
|
||||
test_resource = Resource(submission, 'www.example.com/empty', '.jpeg')
|
||||
test_resource = Resource(submission, 'www.example.com/empty', lambda: None, '.jpeg')
|
||||
test_formatter = FileNameFormatter('{REDDITOR}_{TITLE}_{POSTID}', '{SUBREDDIT}', 'ISO')
|
||||
result = test_formatter.format_path(test_resource, tmp_path)
|
||||
result.parent.mkdir(parents=True)
|
||||
|
@ -296,7 +296,7 @@ def test_format_archive_entry_comment(
|
|||
):
|
||||
test_comment = reddit_instance.comment(id=test_comment_id)
|
||||
test_formatter = FileNameFormatter(test_file_scheme, test_folder_scheme, 'ISO')
|
||||
test_entry = Resource(test_comment, '', '.json')
|
||||
test_entry = Resource(test_comment, '', lambda: None, '.json')
|
||||
result = test_formatter.format_path(test_entry, tmp_path)
|
||||
assert do_test_string_equality(result, expected_name)
|
||||
|
||||
|
|
|
@ -21,7 +21,7 @@ from bdfr.resource import Resource
|
|||
('https://www.test.com/test/test2/example.png?random=test#thing', '.png'),
|
||||
))
|
||||
def test_resource_get_extension(test_url: str, expected: str):
|
||||
test_resource = Resource(MagicMock(), test_url)
|
||||
test_resource = Resource(MagicMock(), test_url, lambda: None)
|
||||
result = test_resource._determine_extension()
|
||||
assert result == expected
|
||||
|
||||
|
@ -31,6 +31,6 @@ def test_resource_get_extension(test_url: str, expected: str):
|
|||
('https://www.iana.org/_img/2013.1/iana-logo-header.svg', '426b3ac01d3584c820f3b7f5985d6623'),
|
||||
))
|
||||
def test_download_online_resource(test_url: str, expected_hash: str):
|
||||
test_resource = Resource(MagicMock(), test_url)
|
||||
test_resource.download(120)
|
||||
test_resource = Resource(MagicMock(), test_url, Resource.retry_download(test_url))
|
||||
test_resource.download()
|
||||
assert test_resource.hash.hexdigest() == expected_hash
|
||||
|
|
Loading…
Reference in a new issue