#!/usr/bin/env python3
|
|
|
|
# coding=utf-8
|
|
|
|
|
|
|
|
import argparse
|
|
|
|
import configparser
|
|
|
|
import logging
|
|
|
|
import socket
|
|
|
|
from datetime import datetime
|
|
|
|
from enum import Enum, auto
|
|
|
|
from pathlib import Path
|
|
|
|
|
|
|
|
import appdirs
|
|
|
|
import praw
|
|
|
|
import praw.models
|
|
|
|
|
|
|
|
from bulkredditdownloader.download_filter import DownloadFilter
|
|
|
|
from bulkredditdownloader.errors import NotADownloadableLinkError, RedditAuthenticationError
|
|
|
|
from bulkredditdownloader.file_name_formatter import FileNameFormatter
|
2021-03-03 15:53:53 +13:00
|
|
|
from bulkredditdownloader.site_authenticator import SiteAuthenticator
|
2021-02-11 12:10:40 +13:00
|
|
|
from bulkredditdownloader.site_downloaders.download_factory import DownloadFactory
|
|
|
|
|
|
|
|
# Module-level logger, named after this module per the standard logging convention.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
|
|
|
|
|
|
class RedditTypes:
    """Namespace for the enumerations used to parameterise Reddit listing requests."""

    class SortType(Enum):
        """Submission sort orders accepted by the downloader.

        NOTE(review): RELEVENCE is a misspelling of 'relevance', but renaming
        it would change the accepted ``--sort`` values; left as-is.
        """

        HOT = 1
        RISING = 2
        CONTROVERSIAL = 3
        NEW = 4
        RELEVENCE = 5

    class TimeType(Enum):
        """Time windows usable with time-filtered listings."""

        HOUR = 1
        DAY = 2
        WEEK = 3
        MONTH = 4
        YEAR = 5
        ALL = 6
|
|
|
|
|
|
|
|
|
|
|
|
class RedditDownloader:
    """Downloads submissions and their linked resources from Reddit.

    Sources (subreddits, multireddits, user listings, direct submission links)
    are taken from the parsed CLI arguments; each matching submission's
    resources are written under the configured download directory, with
    optional hash-based de-duplication across the run.
    """

    def __init__(self, args: argparse.Namespace):
        """Build all helper objects and collect the submission generators.

        :param args: parsed command-line arguments controlling sources,
            filters, authentication, and output locations
        """
        self.args = args
        self.config_directories = appdirs.AppDirs('bulk_reddit_downloader')
        # One timestamp for the whole run, available to naming/logging.
        self.run_time = datetime.now().isoformat()
        self._setup_internal_objects()

        self.reddit_lists = self._retrieve_reddit_lists()

    def _setup_internal_objects(self):
        """Create filters, the name formatter, directories, file logging, and the praw client."""
        self.download_filter = self._create_download_filter()
        self.time_filter = self._create_time_filter()
        self.sort_filter = self._create_sort_filter()
        self.file_name_formatter = self._create_file_name_formatter()
        # NOTE(review): _create_authenticator currently raises
        # NotImplementedError, so construction cannot complete until OAuth2
        # support is implemented; call kept to preserve existing behaviour.
        self.authenticator = self._create_authenticator()
        self._determine_directories()
        self._create_file_logger()
        # Hex digests of every resource written this run, for de-duplication.
        self.master_hash_list = []
        self._load_config()
        # TODO: implement OAuth2 authentication using the stored reddit_token
        self.authenticated = self.cfg_parser.has_option('DEFAULT', 'reddit_token')
        # Both branches of the old token check built an identical praw.Reddit;
        # collapsed into a single construction. The hostname doubles as a
        # minimal per-machine user agent.
        self.reddit_instance = praw.Reddit(
            client_id=self.cfg_parser.get('DEFAULT', 'client_id'),
            client_secret=self.cfg_parser.get('DEFAULT', 'client_secret'),
            user_agent=socket.gethostname(),
        )

    def _retrieve_reddit_lists(self) -> list[praw.models.ListingGenerator]:
        """Collect submission generators from every configured source."""
        master_list = []
        master_list.extend(self._get_subreddits())
        master_list.extend(self._get_multireddits())
        master_list.extend(self._get_user_data())
        master_list.extend(self._get_submissions_from_link())
        return master_list

    def _determine_directories(self):
        """Resolve the download, log-file, and config directories, creating the first two."""
        self.download_directory = Path(self.args.directory)
        self.logfile_directory = self.download_directory / 'LOG_FILES'
        self.config_directory = self.config_directories.user_config_dir

        self.download_directory.mkdir(exist_ok=True, parents=True)
        self.logfile_directory.mkdir(exist_ok=True, parents=True)

    def _load_config(self):
        """Read configuration, preferring a local config.cfg when requested.

        NOTE(review): the fallback './default_config.cfg' is resolved against
        the current working directory, not the package — confirm intended.
        """
        self.cfg_parser = configparser.ConfigParser()
        if self.args.use_local_config and Path('./config.cfg').exists():
            self.cfg_parser.read(Path('./config.cfg'))
        else:
            self.cfg_parser.read(Path('./default_config.cfg').resolve())

    def _create_file_logger(self):
        """Attach a file handler to the root logger, writing into the log directory."""
        main_logger = logging.getLogger()
        file_handler = logging.FileHandler(self.logfile_directory / 'log_output.txt')
        formatter = logging.Formatter('[%(asctime)s - %(name)s - %(levelname)s] - %(message)s')
        file_handler.setFormatter(formatter)
        # Level 0 (NOTSET) defers filtering to each logger's own level.
        file_handler.setLevel(0)

        main_logger.addHandler(file_handler)

    def _get_subreddits(self) -> list[praw.models.ListingGenerator]:
        """Return listing generators for the requested subreddits.

        Uses search results when --search was given, otherwise the configured
        sort order. Returns an empty list when no subreddits were requested.
        """
        if not self.args.subreddit:
            return []
        subreddits = [self.reddit_instance.subreddit(chosen_subreddit) for chosen_subreddit in self.args.subreddit]
        if self.args.search:
            return [
                reddit.search(
                    self.args.search,
                    sort=self.sort_filter.name.lower(),
                    limit=self.args.limit) for reddit in subreddits]
        sort_function = self._determine_sort_function()
        return [sort_function(reddit, limit=self.args.limit) for reddit in subreddits]

    def _get_submissions_from_link(self) -> list[list[praw.models.Submission]]:
        """Wrap directly-specified submission IDs in a single submission list."""
        supplied_submissions = []
        # Guard against args.link being None, matching the sibling getters
        # which all tolerate their source argument being unset.
        for sub_id in self.args.link or []:
            supplied_submissions.append(self.reddit_instance.submission(id=sub_id))
        return [supplied_submissions]

    def _determine_sort_function(self):
        """Map the configured SortType onto the matching praw Subreddit listing method."""
        if self.sort_filter is RedditTypes.SortType.NEW:
            return praw.models.Subreddit.new
        elif self.sort_filter is RedditTypes.SortType.RISING:
            return praw.models.Subreddit.rising
        elif self.sort_filter is RedditTypes.SortType.CONTROVERSIAL:
            return praw.models.Subreddit.controversial
        else:
            # HOT is the default for any other (or unmatched) sort type.
            return praw.models.Subreddit.hot

    def _get_multireddits(self) -> list[praw.models.ListingGenerator]:
        """Return listing generators for the requested multireddits.

        :raises RedditAuthenticationError: when multireddits are requested
            without an authenticated session
        """
        if not self.args.multireddit:
            return []
        if not self.authenticated:
            raise RedditAuthenticationError('Accessing multireddits requires authentication')
        return [self.reddit_instance.multireddit(m_reddit_choice) for m_reddit_choice in self.args.multireddit]

    def _get_user_data(self) -> list[praw.models.ListingGenerator]:
        """Return generators for the user's upvoted/submitted/saved listings.

        :raises RedditAuthenticationError: when user listings are requested
            without an authenticated session
        """
        if not any((self.args.upvoted, self.args.submitted, self.args.saved)):
            return []
        if not self.authenticated:
            raise RedditAuthenticationError('Accessing user lists requires authentication')
        generators = []
        sort_function = self._determine_sort_function()
        if self.args.upvoted:
            generators.append(self.reddit_instance.redditor(self.args.user).upvoted)
        if self.args.submitted:
            generators.append(
                sort_function(
                    self.reddit_instance.redditor(self.args.user).submissions,
                    limit=self.args.limit))
        if self.args.saved:
            generators.append(self.reddit_instance.redditor(self.args.user).saved)

        return generators

    def _create_file_name_formatter(self) -> FileNameFormatter:
        """Build the file/folder name formatter from the CLI scheme arguments."""
        return FileNameFormatter(self.args.set_file_scheme, self.args.set_folder_scheme)

    def _create_time_filter(self) -> RedditTypes.TimeType:
        """Parse --time into a TimeType, defaulting to ALL for missing/unknown values."""
        try:
            return RedditTypes.TimeType[self.args.time.upper()]
        except (KeyError, AttributeError):
            # AttributeError covers args.time being None.
            return RedditTypes.TimeType.ALL

    def _create_sort_filter(self) -> RedditTypes.SortType:
        """Parse --sort into a SortType, defaulting to HOT for missing/unknown values."""
        try:
            return RedditTypes.SortType[self.args.sort.upper()]
        except (KeyError, AttributeError):
            # AttributeError covers args.sort being None.
            return RedditTypes.SortType.HOT

    def _create_download_filter(self) -> DownloadFilter:
        """Build the URL/extension filter from the CLI skip arguments."""
        return DownloadFilter(self.args.skip, self.args.skip_domain)

    def _create_authenticator(self) -> SiteAuthenticator:
        """Create the site authenticator (not yet implemented).

        :raises NotImplementedError: always, until OAuth2 support is added
        """
        raise NotImplementedError

    def download(self):
        """Iterate every collected generator and download each submission."""
        for generator in self.reddit_lists:
            for submission in generator:
                self._download_submission(submission)

    def _download_submission(self, submission: praw.models.Submission):
        """Download every resource of one submission, honouring filters and de-duplication.

        :param submission: the praw submission whose linked resources to fetch
        """
        if not self.download_filter.check_url(submission.url):
            return
        logger.debug('Attempting to download submission {}'.format(submission.id))
        try:
            downloader_class = DownloadFactory.pull_lever(submission.url)
            downloader = downloader_class(submission)
            if self.args.no_download:
                logger.info('Skipping download for submission {}'.format(submission.id))
                return
            content = downloader.find_resources(self.authenticator)
            for res in content:
                destination = self.file_name_formatter.format_path(res, self.download_directory)
                if destination.exists():
                    logger.debug('File already exists: {}'.format(destination))
                    continue
                # BUGFIX: the old condition (`not in list and not no_dupes`)
                # was inverted — with --no-dupes set nothing was ever written,
                # and without it duplicates were silently skipped. Now a
                # resource is skipped only when --no-dupes is set AND its hash
                # was already seen this run.
                if self.args.no_dupes and res.hash.hexdigest() in self.master_hash_list:
                    logger.debug(f'Resource from {res.url} downloaded elsewhere')
                    continue
                # TODO: consider making a hard link/symlink here
                destination.parent.mkdir(parents=True, exist_ok=True)
                with open(destination, 'wb') as file:
                    file.write(res.content)
                logger.debug('Written file to {}'.format(destination))
                self.master_hash_list.append(res.hash.hexdigest())
                logger.debug('Hash added to master list: {}'.format(res.hash.hexdigest()))

            logger.info('Downloaded submission {}'.format(submission.name))
        except NotADownloadableLinkError as e:
            logger.error('Could not download submission {}: {}'.format(submission.name, e))