1
0
Fork 0
mirror of synced 2024-06-12 07:14:37 +12:00

Add much more logging

This commit is contained in:
Serene-Arc 2021-03-11 13:20:39 +10:00 committed by Ali Parlakci
parent 312769cb66
commit f941161014
6 changed files with 47 additions and 21 deletions

View file

@@ -57,10 +57,12 @@ def _setup_logging(verbosity: int):
formatter = logging.Formatter('[%(asctime)s - %(name)s - %(levelname)s] - %(message)s')
stream.setFormatter(formatter)
logger.addHandler(stream)
if verbosity < 0:
if verbosity <= 0:
stream.setLevel(logging.INFO)
else:
elif verbosity == 1:
stream.setLevel(logging.DEBUG)
else:
stream.setLevel(9)
logging.getLogger('praw').setLevel(logging.CRITICAL)
logging.getLogger('prawcore').setLevel(logging.CRITICAL)
logging.getLogger('urllib3').setLevel(logging.CRITICAL)

View file

@@ -1,8 +1,11 @@
#!/usr/bin/env python3
# coding=utf-8
import logging
import re
logger = logging.getLogger(__name__)
class DownloadFilter:
def __init__(self, excluded_extensions: list[str] = None, excluded_domains: list[str] = None):
@@ -24,6 +27,7 @@ class DownloadFilter:
combined_extensions = '|'.join(self.excluded_extensions)
pattern = re.compile(r'.*({})$'.format(combined_extensions))
if re.match(pattern, url):
logger.log(9, f'Url "{url}" matched with "{str(pattern)}"')
return False
else:
return True
@@ -34,6 +38,7 @@ class DownloadFilter:
combined_domains = '|'.join(self.excluded_domains)
pattern = re.compile(r'https?://.*({}).*'.format(combined_domains))
if re.match(pattern, url):
logger.log(9, f'Url "{url}" matched with "{str(pattern)}"')
return False
else:
return True

View file

@@ -57,13 +57,13 @@ class RedditDownloader:
self._create_file_logger()
self.download_filter = self._create_download_filter()
logger.debug('Created download filter')
logger.log(9, 'Created download filter')
self.time_filter = self._create_time_filter()
logger.debug('Created time filter')
logger.log(9, 'Created time filter')
self.sort_filter = self._create_sort_filter()
logger.debug('Created sort filter')
logger.log(9, 'Created sort filter')
self.file_name_formatter = self._create_file_name_formatter()
logger.debug('Create file name formatter')
logger.log(9, 'Create file name formatter')
self._resolve_user_name()
self._load_config()
@@ -71,14 +71,14 @@ class RedditDownloader:
self.master_hash_list = []
self.authenticator = self._create_authenticator()
logger.debug('Created site authenticator')
logger.log(9, 'Created site authenticator')
self._create_reddit_instance()
def _create_reddit_instance(self):
if self.args.authenticate:
logger.debug('Using authenticated Reddit instance')
if not self.cfg_parser.has_option('DEFAULT', 'user_token'):
logger.debug('Commencing OAuth2 authentication')
logger.log(9, 'Commencing OAuth2 authentication')
scopes = self.cfg_parser.get('DEFAULT', 'scopes')
scopes = OAuth2Authenticator.split_scopes(scopes)
oauth2_authenticator = OAuth2Authenticator(
@@ -106,13 +106,13 @@ class RedditDownloader:
def _retrieve_reddit_lists(self) -> list[praw.models.ListingGenerator]:
master_list = []
master_list.extend(self._get_subreddits())
logger.debug('Retrieved subreddits')
logger.log(9, 'Retrieved subreddits')
master_list.extend(self._get_multireddits())
logger.debug('Retrieved multireddits')
logger.log(9, 'Retrieved multireddits')
master_list.extend(self._get_user_data())
logger.debug('Retrieved user data')
logger.log(9, 'Retrieved user data')
master_list.extend(self._get_submissions_from_link())
logger.debug('Retrieved submissions for given links')
logger.log(9, 'Retrieved submissions for given links')
return master_list
def _determine_directories(self):
@@ -140,6 +140,7 @@ class RedditDownloader:
for path in possible_paths:
if path.resolve().expanduser().exists():
self.config_location = path
logger.debug(f'Loading configuration from {path}')
break
if not self.config_location:
raise errors.BulkDownloaderException('Could not find a configuration file to load')
@@ -181,6 +182,7 @@ class RedditDownloader:
def _resolve_user_name(self):
if self.args.user == 'me':
self.args.user = self.reddit_instance.user.me().name
logger.log(9, f'Resolved user to {self.args.user}')
def _get_submissions_from_link(self) -> list[list[praw.models.Submission]]:
supplied_submissions = []
@@ -227,6 +229,7 @@ class RedditDownloader:
generators = []
sort_function = self._determine_sort_function()
if self.args.submitted:
logger.debug(f'Retrieving submitted posts of user {self.args.user}')
generators.append(
sort_function(
self.reddit_instance.redditor(self.args.user).submissions,
@@ -235,8 +238,10 @@ class RedditDownloader:
raise errors.RedditAuthenticationError('Accessing user lists requires authentication')
else:
if self.args.upvoted:
logger.debug(f'Retrieving upvoted posts of user {self.args.user}')
generators.append(self.reddit_instance.redditor(self.args.user).upvoted(limit=self.args.limit))
if self.args.saved:
logger.debug(f'Retrieving saved posts of user {self.args.user}')
generators.append(self.reddit_instance.redditor(self.args.user).saved(limit=self.args.limit))
return generators
else:
@@ -277,11 +282,11 @@ class RedditDownloader:
def download(self):
for generator in self.reddit_lists:
for submission in generator:
logger.debug(f'Attempting to download submission {submission.id}')
self._download_submission(submission)
def _download_submission(self, submission: praw.models.Submission):
if self.download_filter.check_url(submission.url):
logger.debug(f'Attempting to download submission {submission.id}')
try:
downloader_class = DownloadFactory.pull_lever(submission.url)
@@ -293,11 +298,12 @@ class RedditDownloader:
content = downloader.find_resources(self.authenticator)
for destination, res in self.file_name_formatter.format_resource_paths(content, self.download_directory):
if destination.exists():
logger.debug(f'File already exists: {destination}')
logger.warning(f'File already exists: {destination}')
else:
res.download()
if res.hash.hexdigest() in self.master_hash_list and self.args.no_dupes:
logger.debug(f'Resource from {res.url} downloaded elsewhere')
logger.warning(
f'Resource from "{res.url}" and hash "{res.hash.hexdigest()}" downloaded elsewhere')
else:
# TODO: consider making a hard link/symlink here
destination.parent.mkdir(parents=True, exist_ok=True)

View file

@@ -1,6 +1,7 @@
#!/usr/bin/env python3
# coding=utf-8
import logging
import re
from pathlib import Path
from typing import Optional
@@ -10,6 +11,8 @@ import praw.models
from bulkredditdownloader.exceptions import BulkDownloaderException
from bulkredditdownloader.resource import Resource
logger = logging.getLogger(__name__)
class FileNameFormatter:
key_terms = ('title', 'subreddit', 'redditor', 'postid', 'upvotes', 'flair', 'date')
@@ -35,6 +38,7 @@ class FileNameFormatter:
for key in submission_attributes.keys():
if re.search(r'(?i).*{{{}}}.*'.format(key), result):
result = re.sub(r'(?i){{{}}}'.format(key), str(submission_attributes.get(key, 'unknown')), result)
logger.log(9, f'Found key string {key} in name')
result = result.replace('/', '')
return result
@@ -42,14 +46,18 @@
def _format_path(self, resource: Resource, destination_directory: Path, index: Optional[int] = None) -> Path:
subfolder = destination_directory / self._format_name(resource.source_submission, self.directory_format_string)
index = f'_{str(index)}' if index else ''
file_path = subfolder / (str(self._format_name(resource.source_submission,
self.file_format_string)) + index + resource.extension)
try:
file_path = subfolder / (str(self._format_name(resource.source_submission,
self.file_format_string)) + index + resource.extension)
except TypeError:
raise BulkDownloaderException(f'Could not determine path name: {subfolder}, {index}, {resource.extension}')
return file_path
def format_resource_paths(self, resources: list[Resource],
destination_directory: Path) -> list[tuple[Path, Resource]]:
out = []
for i, res in enumerate(resources, start=1):
logger.log(9, f'Formatting filename with index {i}')
out.append((self._format_path(res, destination_directory, i), res))
return out

View file

@@ -70,12 +70,12 @@ class OAuth2Authenticator:
server = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
server.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
server.bind(('localhost', 7634))
logger.debug('Server listening on localhost:7634')
logger.log(9, 'Server listening on localhost:7634')
server.listen(1)
client = server.accept()[0]
server.close()
logger.debug('Server closed')
logger.log(9, 'Server closed')
return client
@@ -95,7 +95,7 @@ class OAuth2TokenManager(praw.reddit.BaseTokenManager):
if authorizer.refresh_token is None:
if self.config.has_option('DEFAULT', 'user_token'):
authorizer.refresh_token = self.config.get('DEFAULT', 'user_token')
logger.debug('Loaded OAuth2 token for authoriser')
logger.log(9, 'Loaded OAuth2 token for authoriser')
else:
raise RedditAuthenticationError('No auth token loaded in configuration')
@@ -103,4 +103,4 @@ class OAuth2TokenManager(praw.reddit.BaseTokenManager):
self.config.set('DEFAULT', 'user_token', authorizer.refresh_token)
with open(self.config_location, 'w') as file:
self.config.write(file, True)
logger.debug(f'Written OAuth2 token from authoriser to {self.config_location}')
logger.log(9, f'Written OAuth2 token from authoriser to {self.config_location}')

View file

@@ -2,6 +2,7 @@
# coding=utf-8
import hashlib
import logging
import re
import time
from typing import Optional
@@ -12,6 +13,8 @@ from praw.models import Submission
from bulkredditdownloader.exceptions import BulkDownloaderException
logger = logging.getLogger(__name__)
class Resource:
def __init__(self, source_submission: Submission, url: str, extension: str = None):
@@ -32,10 +35,12 @@ class Resource:
else:
raise requests.exceptions.ConnectionError
except requests.exceptions.ConnectionError:
logger.log(9, f'Error occured downloading resource, waiting {wait_time} seconds')
time.sleep(wait_time)
if wait_time < 300:
return Resource.retry_download(url, wait_time + 60)
else:
logger.error(f'Max wait time exceeded for resource at url {url}')
return None
def download(self):