1
0
Fork 0
mirror of synced 2024-05-02 19:32:45 +12:00
bulk-downloader-for-reddit/bdfr/file_name_formatter.py
2022-03-25 10:50:52 +10:00

207 lines
8 KiB
Python

#!/usr/bin/env python3
# coding=utf-8
import datetime
import logging
import platform
import re
import subprocess
from pathlib import Path
from typing import Optional
from praw.models import Comment, Submission
from bdfr.exceptions import BulkDownloaderException
from bdfr.resource import Resource
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
class FileNameFormatter:
    """Build file and directory names for resources from ``{key}`` format strings.

    A format string may contain any of the placeholders listed in ``key_terms``
    (matched case-insensitively); each one is replaced by the matching
    attribute of the submission or comment being named.
    """

    # Placeholder names that may appear, wrapped in braces, in a format string.
    key_terms = (
        'date',
        'flair',
        'postid',
        'redditor',
        'subreddit',
        'title',
        'upvotes',
    )

    def __init__(self, file_format_string: str, directory_format_string: str, time_format_string: str):
        """Store the format strings after validating the file format string.

        Raises:
            BulkDownloaderException: if ``file_format_string`` is empty or
                contains none of the known placeholders.
        """
        if not self.validate_string(file_format_string):
            raise BulkDownloaderException(f'"{file_format_string}" is not a valid format string')
        self.file_format_string = file_format_string
        # Each '/'-separated part becomes one directory level in format_path.
        self.directory_format_string: list[str] = directory_format_string.split('/')
        self.time_format_string = time_format_string

    def _format_name(self, submission: 'Comment | Submission', format_string: str) -> str:
        """Fill the placeholders of ``format_string`` from ``submission``.

        Placeholders are matched case-insensitively. Forward slashes are
        removed from the result, and on Windows the result is additionally
        passed through ``_format_for_windows``.

        Raises:
            BulkDownloaderException: if ``submission`` is neither a Submission
                nor a Comment.
        """
        if isinstance(submission, Submission):
            attributes = self._generate_name_dict_from_submission(submission)
        elif isinstance(submission, Comment):
            attributes = self._generate_name_dict_from_comment(submission)
        else:
            raise BulkDownloaderException(f'Cannot name object {type(submission).__name__}')

        # One capturing group per key, so the matched key is identified by the
        # group index rather than by re-normalising the matched text.
        keys = list(attributes)
        placeholder = re.compile(
            r'(?i)\{(?:' + '|'.join(f'({re.escape(key)})' for key in keys) + r')\}',
        )

        def substitute(match: 're.Match') -> str:
            value = str(attributes[keys[match.lastindex - 1]])
            return FileNameFormatter._convert_unicode_escapes(value)

        # A single pass over the format string: text inserted for one key is
        # never re-scanned, so a title containing e.g. "{flair}" stays literal.
        result = placeholder.sub(substitute, format_string)

        result = result.replace('/', '')
        if platform.system() == 'Windows':
            result = FileNameFormatter._format_for_windows(result)
        return result

    @staticmethod
    def _convert_unicode_escapes(in_string: str) -> str:
        r"""Replace every literal ``\uNNNN`` escape in ``in_string`` by its character.

        Only escapes made of four ASCII decimal digits are recognised; the
        digits are read as a hexadecimal code point. Escapes containing the
        hex letters a-f are left untouched.
        """
        return re.sub(
            r'\\u([0-9]{4})',
            lambda match: chr(int(match.group(1), 16)),
            in_string,
        )

    def _generate_name_dict_from_submission(self, submission: 'Submission') -> dict:
        """Collect the placeholder values for a submission."""
        submission_attributes = {
            'title': submission.title,
            'subreddit': submission.subreddit.display_name,
            # A submission without an author is named 'DELETED'.
            'redditor': submission.author.name if submission.author else 'DELETED',
            'postid': submission.id,
            'upvotes': submission.score,
            'flair': submission.link_flair_text,
            'date': self._convert_timestamp(submission.created_utc),
        }
        return submission_attributes

    def _convert_timestamp(self, timestamp: float) -> str:
        """Format a POSIX timestamp according to ``self.time_format_string``.

        The value 'ISO' (any case, surrounding whitespace ignored) selects
        ``isoformat()``; anything else is used as a ``strftime`` pattern.
        ``datetime.fromtimestamp`` is called without a timezone, so the result
        is a naive datetime in the machine's local time.
        """
        input_time = datetime.datetime.fromtimestamp(timestamp)
        if self.time_format_string.upper().strip() == 'ISO':
            return input_time.isoformat()
        return input_time.strftime(self.time_format_string)

    def _generate_name_dict_from_comment(self, comment: 'Comment') -> dict:
        """Collect the placeholder values for a comment.

        The title comes from the comment's parent submission and the flair is
        always the empty string.
        """
        comment_attributes = {
            'title': comment.submission.title,
            'subreddit': comment.subreddit.display_name,
            # A comment without an author is named 'DELETED'.
            'redditor': comment.author.name if comment.author else 'DELETED',
            'postid': comment.id,
            'upvotes': comment.score,
            'flair': '',
            'date': self._convert_timestamp(comment.created_utc),
        }
        return comment_attributes

    def format_path(
            self,
            resource: 'Resource',
            destination_directory: Path,
            index: Optional[int] = None,
    ) -> Path:
        """Return the full, length-limited path under which ``resource`` is stored.

        Args:
            resource: the resource to name; its ``source_submission``,
                ``extension`` and ``url`` attributes are read.
            destination_directory: base directory the formatted directory
                parts are appended to.
            index: when truthy, ``_<index>`` is inserted before the extension.
                A falsy value (``None`` or ``0``) adds no suffix.

        Raises:
            BulkDownloaderException: if the resource has no extension or the
                path could not be assembled.
        """
        subfolder = Path(
            destination_directory,
            *[self._format_name(resource.source_submission, part) for part in self.directory_format_string],
        )
        index_suffix = f'_{index}' if index else ''
        if not resource.extension:
            raise BulkDownloaderException(f'Resource from {resource.url} has no extension')

        file_name = str(self._format_name(resource.source_submission, self.file_format_string))
        file_name = file_name.replace('\n', ' ')

        # Insert a dot only when neither the name nor the extension supplies one.
        if not file_name.endswith('.') and not resource.extension.startswith('.'):
            ending = index_suffix + '.' + resource.extension
        else:
            ending = index_suffix + resource.extension

        try:
            file_path = self.limit_file_name_length(file_name, ending, subfolder)
        except TypeError as err:
            raise BulkDownloaderException(
                f'Could not determine path name: {subfolder}, {index_suffix}, {resource.extension}') from err
        return file_path

    @staticmethod
    def limit_file_name_length(filename: str, ending: str, root: Path) -> Path:
        """Shorten ``filename`` until ``root/filename+ending`` fits the length limits.

        A trailing ``_`` followed by six word characters (presumably the post
        id - confirm against callers) is moved from ``filename`` into
        ``ending`` so that truncation never removes it. Characters are then
        dropped from the end of ``filename`` while the file name exceeds 255
        characters or 255 UTF-8 bytes, or the path exceeds the computed path
        budget.

        If the limits still cannot be met once ``filename`` is empty (for
        example because ``root`` alone is too long), truncation stops and the
        shortest possible path is returned instead of looping forever.
        """
        root = root.resolve().expanduser()
        possible_id = re.search(r'((?:_\w{6})?$)', filename)
        if possible_id:
            ending = possible_id.group(1) + ending
            filename = filename[:possible_id.start()]

        max_path = FileNameFormatter.find_max_path_length()
        max_file_part_length_chars = 255 - len(ending)
        max_file_part_length_bytes = 255 - len(ending.encode('utf-8'))
        # NOTE(review): root and ending are subtracted here and are also part of
        # len(str(out)) below, so the budget is stricter than max_path. Kept
        # unchanged so that generated names stay the same.
        max_path_length = max_path - len(ending) - len(str(root)) - 1

        out = Path(root, filename + ending)
        # `filename and` guarantees termination: slicing an empty string yields
        # an empty string, so without it an unsatisfiable limit never exits.
        while filename and (
                len(filename) > max_file_part_length_chars
                or len(filename.encode('utf-8')) > max_file_part_length_bytes
                or len(str(out)) > max_path_length
        ):
            filename = filename[:-1]
            out = Path(root, filename + ending)
        return out

    @staticmethod
    def find_max_path_length() -> int:
        """Return the maximum path length reported by ``getconf PATH_MAX /``.

        When ``getconf`` is unavailable, fails, or prints something that is
        not an integer, fall back to 260 on Windows and 4096 elsewhere.
        """
        try:
            return int(subprocess.check_output(['getconf', 'PATH_MAX', '/']))
        except (ValueError, subprocess.CalledProcessError, OSError):
            if platform.system() == 'Windows':
                return 260
            return 4096

    def format_resource_paths(
            self,
            resources: 'list[Resource]',
            destination_directory: Path,
    ) -> 'list[tuple[Path, Resource]]':
        """Pair every resource with its formatted path.

        A single resource is named without an index; several resources are
        numbered starting at 1. Resources whose path cannot be generated are
        logged and left out of the result.
        """
        out = []
        use_index = len(resources) != 1
        for i, res in enumerate(resources, start=1):
            if use_index:
                logger.log(9, f'Formatting filename with index {i}')
            try:
                out.append((self.format_path(res, destination_directory, i if use_index else None), res))
            except BulkDownloaderException as e:
                logger.error(f'Could not generate file path for resource {res.url}: {e}')
                logger.exception('Could not generate file path')
        return out

    @staticmethod
    def validate_string(test_string: str) -> bool:
        """Return True if ``test_string`` contains at least one known placeholder.

        Placeholders are matched case-insensitively. A warning is logged when
        the string is valid but lacks the ``{POSTID}`` placeholder, because
        the resulting names may then collide.
        """
        if not test_string:
            return False
        lowered = test_string.lower()
        if not any(f'{{{key}}}' in lowered for key in FileNameFormatter.key_terms):
            return False
        # Same case-insensitive matching as the key detection above.
        if '{postid}' not in lowered:
            logger.warning('Some files might not be downloaded due to name conflicts as filenames are'
                           ' not guaranteed to be unique without {POSTID}')
        return True

    @staticmethod
    def _format_for_windows(input_string: str) -> str:
        """Remove the characters ``<>:"\\/|?*`` and all non-ASCII characters."""
        invalid_characters = r'<>:"\/|?*'
        input_string = input_string.translate(str.maketrans('', '', invalid_characters))
        return FileNameFormatter._strip_emojis(input_string)

    @staticmethod
    def _strip_emojis(input_string: str) -> str:
        """Drop every character that cannot be encoded as ASCII (not only emojis)."""
        return input_string.encode('ascii', errors='ignore').decode('utf-8')