Add much more logging
This commit is contained in:
parent
312769cb66
commit
f941161014
|
@ -57,10 +57,12 @@ def _setup_logging(verbosity: int):
|
||||||
formatter = logging.Formatter('[%(asctime)s - %(name)s - %(levelname)s] - %(message)s')
|
formatter = logging.Formatter('[%(asctime)s - %(name)s - %(levelname)s] - %(message)s')
|
||||||
stream.setFormatter(formatter)
|
stream.setFormatter(formatter)
|
||||||
logger.addHandler(stream)
|
logger.addHandler(stream)
|
||||||
if verbosity < 0:
|
if verbosity <= 0:
|
||||||
stream.setLevel(logging.INFO)
|
stream.setLevel(logging.INFO)
|
||||||
else:
|
elif verbosity == 1:
|
||||||
stream.setLevel(logging.DEBUG)
|
stream.setLevel(logging.DEBUG)
|
||||||
|
else:
|
||||||
|
stream.setLevel(9)
|
||||||
logging.getLogger('praw').setLevel(logging.CRITICAL)
|
logging.getLogger('praw').setLevel(logging.CRITICAL)
|
||||||
logging.getLogger('prawcore').setLevel(logging.CRITICAL)
|
logging.getLogger('prawcore').setLevel(logging.CRITICAL)
|
||||||
logging.getLogger('urllib3').setLevel(logging.CRITICAL)
|
logging.getLogger('urllib3').setLevel(logging.CRITICAL)
|
||||||
|
|
|
@ -1,8 +1,11 @@
|
||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
# coding=utf-8
|
# coding=utf-8
|
||||||
|
|
||||||
|
import logging
|
||||||
import re
|
import re
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
class DownloadFilter:
|
class DownloadFilter:
|
||||||
def __init__(self, excluded_extensions: list[str] = None, excluded_domains: list[str] = None):
|
def __init__(self, excluded_extensions: list[str] = None, excluded_domains: list[str] = None):
|
||||||
|
@ -24,6 +27,7 @@ class DownloadFilter:
|
||||||
combined_extensions = '|'.join(self.excluded_extensions)
|
combined_extensions = '|'.join(self.excluded_extensions)
|
||||||
pattern = re.compile(r'.*({})$'.format(combined_extensions))
|
pattern = re.compile(r'.*({})$'.format(combined_extensions))
|
||||||
if re.match(pattern, url):
|
if re.match(pattern, url):
|
||||||
|
logger.log(9, f'Url "{url}" matched with "{str(pattern)}"')
|
||||||
return False
|
return False
|
||||||
else:
|
else:
|
||||||
return True
|
return True
|
||||||
|
@ -34,6 +38,7 @@ class DownloadFilter:
|
||||||
combined_domains = '|'.join(self.excluded_domains)
|
combined_domains = '|'.join(self.excluded_domains)
|
||||||
pattern = re.compile(r'https?://.*({}).*'.format(combined_domains))
|
pattern = re.compile(r'https?://.*({}).*'.format(combined_domains))
|
||||||
if re.match(pattern, url):
|
if re.match(pattern, url):
|
||||||
|
logger.log(9, f'Url "{url}" matched with "{str(pattern)}"')
|
||||||
return False
|
return False
|
||||||
else:
|
else:
|
||||||
return True
|
return True
|
||||||
|
|
|
@ -57,13 +57,13 @@ class RedditDownloader:
|
||||||
self._create_file_logger()
|
self._create_file_logger()
|
||||||
|
|
||||||
self.download_filter = self._create_download_filter()
|
self.download_filter = self._create_download_filter()
|
||||||
logger.debug('Created download filter')
|
logger.log(9, 'Created download filter')
|
||||||
self.time_filter = self._create_time_filter()
|
self.time_filter = self._create_time_filter()
|
||||||
logger.debug('Created time filter')
|
logger.log(9, 'Created time filter')
|
||||||
self.sort_filter = self._create_sort_filter()
|
self.sort_filter = self._create_sort_filter()
|
||||||
logger.debug('Created sort filter')
|
logger.log(9, 'Created sort filter')
|
||||||
self.file_name_formatter = self._create_file_name_formatter()
|
self.file_name_formatter = self._create_file_name_formatter()
|
||||||
logger.debug('Create file name formatter')
|
logger.log(9, 'Create file name formatter')
|
||||||
|
|
||||||
self._resolve_user_name()
|
self._resolve_user_name()
|
||||||
self._load_config()
|
self._load_config()
|
||||||
|
@ -71,14 +71,14 @@ class RedditDownloader:
|
||||||
|
|
||||||
self.master_hash_list = []
|
self.master_hash_list = []
|
||||||
self.authenticator = self._create_authenticator()
|
self.authenticator = self._create_authenticator()
|
||||||
logger.debug('Created site authenticator')
|
logger.log(9, 'Created site authenticator')
|
||||||
self._create_reddit_instance()
|
self._create_reddit_instance()
|
||||||
|
|
||||||
def _create_reddit_instance(self):
|
def _create_reddit_instance(self):
|
||||||
if self.args.authenticate:
|
if self.args.authenticate:
|
||||||
logger.debug('Using authenticated Reddit instance')
|
logger.debug('Using authenticated Reddit instance')
|
||||||
if not self.cfg_parser.has_option('DEFAULT', 'user_token'):
|
if not self.cfg_parser.has_option('DEFAULT', 'user_token'):
|
||||||
logger.debug('Commencing OAuth2 authentication')
|
logger.log(9, 'Commencing OAuth2 authentication')
|
||||||
scopes = self.cfg_parser.get('DEFAULT', 'scopes')
|
scopes = self.cfg_parser.get('DEFAULT', 'scopes')
|
||||||
scopes = OAuth2Authenticator.split_scopes(scopes)
|
scopes = OAuth2Authenticator.split_scopes(scopes)
|
||||||
oauth2_authenticator = OAuth2Authenticator(
|
oauth2_authenticator = OAuth2Authenticator(
|
||||||
|
@ -106,13 +106,13 @@ class RedditDownloader:
|
||||||
def _retrieve_reddit_lists(self) -> list[praw.models.ListingGenerator]:
|
def _retrieve_reddit_lists(self) -> list[praw.models.ListingGenerator]:
|
||||||
master_list = []
|
master_list = []
|
||||||
master_list.extend(self._get_subreddits())
|
master_list.extend(self._get_subreddits())
|
||||||
logger.debug('Retrieved subreddits')
|
logger.log(9, 'Retrieved subreddits')
|
||||||
master_list.extend(self._get_multireddits())
|
master_list.extend(self._get_multireddits())
|
||||||
logger.debug('Retrieved multireddits')
|
logger.log(9, 'Retrieved multireddits')
|
||||||
master_list.extend(self._get_user_data())
|
master_list.extend(self._get_user_data())
|
||||||
logger.debug('Retrieved user data')
|
logger.log(9, 'Retrieved user data')
|
||||||
master_list.extend(self._get_submissions_from_link())
|
master_list.extend(self._get_submissions_from_link())
|
||||||
logger.debug('Retrieved submissions for given links')
|
logger.log(9, 'Retrieved submissions for given links')
|
||||||
return master_list
|
return master_list
|
||||||
|
|
||||||
def _determine_directories(self):
|
def _determine_directories(self):
|
||||||
|
@ -140,6 +140,7 @@ class RedditDownloader:
|
||||||
for path in possible_paths:
|
for path in possible_paths:
|
||||||
if path.resolve().expanduser().exists():
|
if path.resolve().expanduser().exists():
|
||||||
self.config_location = path
|
self.config_location = path
|
||||||
|
logger.debug(f'Loading configuration from {path}')
|
||||||
break
|
break
|
||||||
if not self.config_location:
|
if not self.config_location:
|
||||||
raise errors.BulkDownloaderException('Could not find a configuration file to load')
|
raise errors.BulkDownloaderException('Could not find a configuration file to load')
|
||||||
|
@ -181,6 +182,7 @@ class RedditDownloader:
|
||||||
def _resolve_user_name(self):
|
def _resolve_user_name(self):
|
||||||
if self.args.user == 'me':
|
if self.args.user == 'me':
|
||||||
self.args.user = self.reddit_instance.user.me().name
|
self.args.user = self.reddit_instance.user.me().name
|
||||||
|
logger.log(9, f'Resolved user to {self.args.user}')
|
||||||
|
|
||||||
def _get_submissions_from_link(self) -> list[list[praw.models.Submission]]:
|
def _get_submissions_from_link(self) -> list[list[praw.models.Submission]]:
|
||||||
supplied_submissions = []
|
supplied_submissions = []
|
||||||
|
@ -227,6 +229,7 @@ class RedditDownloader:
|
||||||
generators = []
|
generators = []
|
||||||
sort_function = self._determine_sort_function()
|
sort_function = self._determine_sort_function()
|
||||||
if self.args.submitted:
|
if self.args.submitted:
|
||||||
|
logger.debug(f'Retrieving submitted posts of user {self.args.user}')
|
||||||
generators.append(
|
generators.append(
|
||||||
sort_function(
|
sort_function(
|
||||||
self.reddit_instance.redditor(self.args.user).submissions,
|
self.reddit_instance.redditor(self.args.user).submissions,
|
||||||
|
@ -235,8 +238,10 @@ class RedditDownloader:
|
||||||
raise errors.RedditAuthenticationError('Accessing user lists requires authentication')
|
raise errors.RedditAuthenticationError('Accessing user lists requires authentication')
|
||||||
else:
|
else:
|
||||||
if self.args.upvoted:
|
if self.args.upvoted:
|
||||||
|
logger.debug(f'Retrieving upvoted posts of user {self.args.user}')
|
||||||
generators.append(self.reddit_instance.redditor(self.args.user).upvoted(limit=self.args.limit))
|
generators.append(self.reddit_instance.redditor(self.args.user).upvoted(limit=self.args.limit))
|
||||||
if self.args.saved:
|
if self.args.saved:
|
||||||
|
logger.debug(f'Retrieving saved posts of user {self.args.user}')
|
||||||
generators.append(self.reddit_instance.redditor(self.args.user).saved(limit=self.args.limit))
|
generators.append(self.reddit_instance.redditor(self.args.user).saved(limit=self.args.limit))
|
||||||
return generators
|
return generators
|
||||||
else:
|
else:
|
||||||
|
@ -277,11 +282,11 @@ class RedditDownloader:
|
||||||
def download(self):
|
def download(self):
|
||||||
for generator in self.reddit_lists:
|
for generator in self.reddit_lists:
|
||||||
for submission in generator:
|
for submission in generator:
|
||||||
|
logger.debug(f'Attempting to download submission {submission.id}')
|
||||||
self._download_submission(submission)
|
self._download_submission(submission)
|
||||||
|
|
||||||
def _download_submission(self, submission: praw.models.Submission):
|
def _download_submission(self, submission: praw.models.Submission):
|
||||||
if self.download_filter.check_url(submission.url):
|
if self.download_filter.check_url(submission.url):
|
||||||
logger.debug(f'Attempting to download submission {submission.id}')
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
downloader_class = DownloadFactory.pull_lever(submission.url)
|
downloader_class = DownloadFactory.pull_lever(submission.url)
|
||||||
|
@ -293,11 +298,12 @@ class RedditDownloader:
|
||||||
content = downloader.find_resources(self.authenticator)
|
content = downloader.find_resources(self.authenticator)
|
||||||
for destination, res in self.file_name_formatter.format_resource_paths(content, self.download_directory):
|
for destination, res in self.file_name_formatter.format_resource_paths(content, self.download_directory):
|
||||||
if destination.exists():
|
if destination.exists():
|
||||||
logger.debug(f'File already exists: {destination}')
|
logger.warning(f'File already exists: {destination}')
|
||||||
else:
|
else:
|
||||||
res.download()
|
res.download()
|
||||||
if res.hash.hexdigest() in self.master_hash_list and self.args.no_dupes:
|
if res.hash.hexdigest() in self.master_hash_list and self.args.no_dupes:
|
||||||
logger.debug(f'Resource from {res.url} downloaded elsewhere')
|
logger.warning(
|
||||||
|
f'Resource from "{res.url}" and hash "{res.hash.hexdigest()}" downloaded elsewhere')
|
||||||
else:
|
else:
|
||||||
# TODO: consider making a hard link/symlink here
|
# TODO: consider making a hard link/symlink here
|
||||||
destination.parent.mkdir(parents=True, exist_ok=True)
|
destination.parent.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
|
@ -1,6 +1,7 @@
|
||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
# coding=utf-8
|
# coding=utf-8
|
||||||
|
|
||||||
|
import logging
|
||||||
import re
|
import re
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
@ -10,6 +11,8 @@ import praw.models
|
||||||
from bulkredditdownloader.exceptions import BulkDownloaderException
|
from bulkredditdownloader.exceptions import BulkDownloaderException
|
||||||
from bulkredditdownloader.resource import Resource
|
from bulkredditdownloader.resource import Resource
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
class FileNameFormatter:
|
class FileNameFormatter:
|
||||||
key_terms = ('title', 'subreddit', 'redditor', 'postid', 'upvotes', 'flair', 'date')
|
key_terms = ('title', 'subreddit', 'redditor', 'postid', 'upvotes', 'flair', 'date')
|
||||||
|
@ -35,6 +38,7 @@ class FileNameFormatter:
|
||||||
for key in submission_attributes.keys():
|
for key in submission_attributes.keys():
|
||||||
if re.search(r'(?i).*{{{}}}.*'.format(key), result):
|
if re.search(r'(?i).*{{{}}}.*'.format(key), result):
|
||||||
result = re.sub(r'(?i){{{}}}'.format(key), str(submission_attributes.get(key, 'unknown')), result)
|
result = re.sub(r'(?i){{{}}}'.format(key), str(submission_attributes.get(key, 'unknown')), result)
|
||||||
|
logger.log(9, f'Found key string {key} in name')
|
||||||
|
|
||||||
result = result.replace('/', '')
|
result = result.replace('/', '')
|
||||||
return result
|
return result
|
||||||
|
@ -42,14 +46,18 @@ class FileNameFormatter:
|
||||||
def _format_path(self, resource: Resource, destination_directory: Path, index: Optional[int] = None) -> Path:
|
def _format_path(self, resource: Resource, destination_directory: Path, index: Optional[int] = None) -> Path:
|
||||||
subfolder = destination_directory / self._format_name(resource.source_submission, self.directory_format_string)
|
subfolder = destination_directory / self._format_name(resource.source_submission, self.directory_format_string)
|
||||||
index = f'_{str(index)}' if index else ''
|
index = f'_{str(index)}' if index else ''
|
||||||
file_path = subfolder / (str(self._format_name(resource.source_submission,
|
try:
|
||||||
self.file_format_string)) + index + resource.extension)
|
file_path = subfolder / (str(self._format_name(resource.source_submission,
|
||||||
|
self.file_format_string)) + index + resource.extension)
|
||||||
|
except TypeError:
|
||||||
|
raise BulkDownloaderException(f'Could not determine path name: {subfolder}, {index}, {resource.extension}')
|
||||||
return file_path
|
return file_path
|
||||||
|
|
||||||
def format_resource_paths(self, resources: list[Resource],
|
def format_resource_paths(self, resources: list[Resource],
|
||||||
destination_directory: Path) -> list[tuple[Path, Resource]]:
|
destination_directory: Path) -> list[tuple[Path, Resource]]:
|
||||||
out = []
|
out = []
|
||||||
for i, res in enumerate(resources, start=1):
|
for i, res in enumerate(resources, start=1):
|
||||||
|
logger.log(9, f'Formatting filename with index {i}')
|
||||||
out.append((self._format_path(res, destination_directory, i), res))
|
out.append((self._format_path(res, destination_directory, i), res))
|
||||||
return out
|
return out
|
||||||
|
|
||||||
|
|
|
@ -70,12 +70,12 @@ class OAuth2Authenticator:
|
||||||
server = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
|
server = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
|
||||||
server.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
|
server.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
|
||||||
server.bind(('localhost', 7634))
|
server.bind(('localhost', 7634))
|
||||||
logger.debug('Server listening on localhost:7634')
|
logger.log(9, 'Server listening on localhost:7634')
|
||||||
|
|
||||||
server.listen(1)
|
server.listen(1)
|
||||||
client = server.accept()[0]
|
client = server.accept()[0]
|
||||||
server.close()
|
server.close()
|
||||||
logger.debug('Server closed')
|
logger.log(9, 'Server closed')
|
||||||
|
|
||||||
return client
|
return client
|
||||||
|
|
||||||
|
@ -95,7 +95,7 @@ class OAuth2TokenManager(praw.reddit.BaseTokenManager):
|
||||||
if authorizer.refresh_token is None:
|
if authorizer.refresh_token is None:
|
||||||
if self.config.has_option('DEFAULT', 'user_token'):
|
if self.config.has_option('DEFAULT', 'user_token'):
|
||||||
authorizer.refresh_token = self.config.get('DEFAULT', 'user_token')
|
authorizer.refresh_token = self.config.get('DEFAULT', 'user_token')
|
||||||
logger.debug('Loaded OAuth2 token for authoriser')
|
logger.log(9, 'Loaded OAuth2 token for authoriser')
|
||||||
else:
|
else:
|
||||||
raise RedditAuthenticationError('No auth token loaded in configuration')
|
raise RedditAuthenticationError('No auth token loaded in configuration')
|
||||||
|
|
||||||
|
@ -103,4 +103,4 @@ class OAuth2TokenManager(praw.reddit.BaseTokenManager):
|
||||||
self.config.set('DEFAULT', 'user_token', authorizer.refresh_token)
|
self.config.set('DEFAULT', 'user_token', authorizer.refresh_token)
|
||||||
with open(self.config_location, 'w') as file:
|
with open(self.config_location, 'w') as file:
|
||||||
self.config.write(file, True)
|
self.config.write(file, True)
|
||||||
logger.debug(f'Written OAuth2 token from authoriser to {self.config_location}')
|
logger.log(9, f'Written OAuth2 token from authoriser to {self.config_location}')
|
||||||
|
|
|
@ -2,6 +2,7 @@
|
||||||
# coding=utf-8
|
# coding=utf-8
|
||||||
|
|
||||||
import hashlib
|
import hashlib
|
||||||
|
import logging
|
||||||
import re
|
import re
|
||||||
import time
|
import time
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
@ -12,6 +13,8 @@ from praw.models import Submission
|
||||||
|
|
||||||
from bulkredditdownloader.exceptions import BulkDownloaderException
|
from bulkredditdownloader.exceptions import BulkDownloaderException
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
class Resource:
|
class Resource:
|
||||||
def __init__(self, source_submission: Submission, url: str, extension: str = None):
|
def __init__(self, source_submission: Submission, url: str, extension: str = None):
|
||||||
|
@ -32,10 +35,12 @@ class Resource:
|
||||||
else:
|
else:
|
||||||
raise requests.exceptions.ConnectionError
|
raise requests.exceptions.ConnectionError
|
||||||
except requests.exceptions.ConnectionError:
|
except requests.exceptions.ConnectionError:
|
||||||
|
logger.log(9, f'Error occured downloading resource, waiting {wait_time} seconds')
|
||||||
time.sleep(wait_time)
|
time.sleep(wait_time)
|
||||||
if wait_time < 300:
|
if wait_time < 300:
|
||||||
return Resource.retry_download(url, wait_time + 60)
|
return Resource.retry_download(url, wait_time + 60)
|
||||||
else:
|
else:
|
||||||
|
logger.error(f'Max wait time exceeded for resource at url {url}')
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def download(self):
|
def download(self):
|
||||||
|
|
Loading…
Reference in a new issue