2021-02-11 12:08:47 +13:00
|
|
|
#!/usr/bin/env python3
|
|
|
|
# coding=utf-8
|
|
|
|
|
2021-03-11 16:20:39 +13:00
|
|
|
import logging
|
2021-03-22 17:21:56 +13:00
|
|
|
import platform
|
2021-02-11 12:08:47 +13:00
|
|
|
import re
|
|
|
|
from pathlib import Path
|
2021-03-10 17:39:01 +13:00
|
|
|
from typing import Optional
|
2021-02-11 12:08:47 +13:00
|
|
|
|
2021-04-01 21:37:20 +13:00
|
|
|
from praw.models import Comment, Submission
|
2021-02-11 12:08:47 +13:00
|
|
|
|
2021-03-05 16:32:24 +13:00
|
|
|
from bulkredditdownloader.exceptions import BulkDownloaderException
|
2021-02-11 12:08:47 +13:00
|
|
|
from bulkredditdownloader.resource import Resource
|
|
|
|
|
2021-03-11 16:20:39 +13:00
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
2021-02-11 12:08:47 +13:00
|
|
|
|
|
|
|
class FileNameFormatter:
    """Build destination paths for downloaded resources from format strings.

    A format string is free text containing ``{KEY}`` placeholders, matched
    case-insensitively, where ``KEY`` is one of the entries in ``key_terms``.
    Each placeholder is replaced with the matching attribute of the
    submission or comment the resource came from.
    """

    # Placeholder names that may appear (inside braces) in a format string.
    key_terms = (
        'date',
        'flair',
        'postid',
        'redditor',
        'subreddit',
        'title',
        'upvotes',
    )

    def __init__(self, file_format_string: str, directory_format_string: str):
        """Store the file and directory naming schemes.

        Args:
            file_format_string: Scheme for the file name; must pass
                ``validate_string``.
            directory_format_string: Scheme for the sub-directories; it is
                split on ``/`` and every part becomes one directory level.
                It is not validated here.

        Raises:
            BulkDownloaderException: If ``file_format_string`` is not valid.
        """
        if not self.validate_string(file_format_string):
            raise BulkDownloaderException(f'"{file_format_string}" is not a valid format string')
        self.file_format_string = file_format_string
        self.directory_format_string: list[str] = directory_format_string.split('/')

    @staticmethod
    def _format_name(submission: (Comment, Submission), format_string: str) -> str:
        """Fill the placeholders of ``format_string`` from ``submission``.

        Args:
            submission: The ``Submission`` or ``Comment`` supplying the values.
            format_string: Text containing ``{KEY}`` placeholders.

        Returns:
            The formatted name with every ``/`` removed; on Windows the
            result is additionally passed through ``_format_for_windows``.

        Raises:
            BulkDownloaderException: If ``submission`` is neither a
                ``Submission`` nor a ``Comment``.
        """
        if isinstance(submission, Submission):
            attributes = FileNameFormatter._generate_name_dict_from_submission(submission)
        elif isinstance(submission, Comment):
            attributes = FileNameFormatter._generate_name_dict_from_comment(submission)
        else:
            raise BulkDownloaderException(f'Cannot name object {type(submission).__name__}')

        result = format_string
        for key, value in attributes.items():
            placeholder = re.compile(re.escape(f'{{{key}}}'), re.IGNORECASE)
            if not placeholder.search(result):
                continue
            # Attributes such as the score and the creation time are numbers,
            # so they have to be converted to text before substitution.
            key_value = FileNameFormatter._convert_unicode_escapes(str(value))
            # A callable replacement inserts the value verbatim; a plain string
            # would be parsed as a template and choke on backslashes in titles.
            result = placeholder.sub(lambda _match: key_value, result)
            logger.log(9, f'Found key string {key} in name')

        # A slash inside a value must not introduce an extra directory level.
        result = result.replace('/', '')

        if platform.system() == 'Windows':
            result = FileNameFormatter._format_for_windows(result)

        return result

    @staticmethod
    def _convert_unicode_escapes(in_string: str) -> str:
        r"""Replace literal ``\uXXXX`` sequences with the character they name.

        Only well-formed four-hex-digit escapes are converted. All other text,
        including real non-ASCII characters and stray backslashes, is returned
        unchanged. Escapes naming a lone surrogate are left as they are because
        such a character could not be encoded in a file name.
        """
        def _decode(match: re.Match) -> str:
            code_point = int(match.group(1), 16)
            if 0xD800 <= code_point <= 0xDFFF:
                return match.group(0)
            return chr(code_point)

        return re.sub(r'\\u([0-9a-fA-F]{4})', _decode, in_string)

    @staticmethod
    def _generate_name_dict_from_submission(submission: Submission) -> dict:
        """Map every placeholder name to the matching value of ``submission``."""
        submission_attributes = {
            'title': submission.title,
            'subreddit': submission.subreddit.display_name,
            'redditor': submission.author.name if submission.author else 'DELETED',
            'postid': submission.id,
            'upvotes': submission.score,
            # Use an empty string when there is no flair, as comments do.
            'flair': submission.link_flair_text or '',
            'date': submission.created_utc,
        }
        return submission_attributes

    @staticmethod
    def _generate_name_dict_from_comment(comment: Comment) -> dict:
        """Map every placeholder name to the matching value of ``comment``.

        The title comes from the comment's parent submission and the flair is
        always empty.
        """
        comment_attributes = {
            'title': comment.submission.title,
            'subreddit': comment.subreddit.display_name,
            'redditor': comment.author.name if comment.author else 'DELETED',
            'postid': comment.id,
            'upvotes': comment.score,
            'flair': '',
            'date': comment.created_utc,
        }
        return comment_attributes

    def format_path(
            self,
            resource: Resource,
            destination_directory: Path,
            index: Optional[int] = None,
    ) -> Path:
        """Return the full path ``resource`` should be written to.

        Args:
            resource: The resource to name; its ``source_submission``,
                ``extension`` and ``url`` attributes are read.
            destination_directory: Root directory of the download.
            index: Optional number appended to the file name as ``_<index>``;
                a falsy value (``None`` or ``0``) adds nothing.

        Raises:
            BulkDownloaderException: If the resource has no extension or the
                path cannot be built.
        """
        subfolder = Path(
            destination_directory,
            *[self._format_name(resource.source_submission, part) for part in self.directory_format_string],
        )
        index_suffix = f'_{index}' if index else ''
        if not resource.extension:
            raise BulkDownloaderException(f'Resource from {resource.url} has no extension')
        ending = index_suffix + resource.extension
        file_name = str(self._format_name(resource.source_submission, self.file_format_string))
        file_name = self._limit_file_name_length(file_name, ending)

        try:
            file_path = Path(subfolder, file_name)
        except TypeError as err:
            raise BulkDownloaderException(
                f'Could not determine path name: {subfolder}, {index_suffix}, {resource.extension}') from err
        return file_path

    @staticmethod
    def _limit_file_name_length(filename: str, ending: str) -> str:
        """Append ``ending`` to ``filename``, trimming the name to fit 255.

        The result is kept within 255 characters and 255 UTF-8 bytes
        (presumably the usual file system limit for a single name). A trailing
        ``_`` followed by six word characters is treated as an ID and moved
        into the ending, so it survives the trimming.
        """
        # The ID group is optional, so this pattern always matches at the end.
        possible_id = re.search(r'((?:_\w{6})?$)', filename)
        if possible_id:
            ending = possible_id.group(1) + ending
            filename = filename[:possible_id.start()]
        max_length_chars = 255 - len(ending)
        max_length_bytes = 255 - len(ending.encode('utf-8'))
        # Stop once the name is empty: if the ending alone exceeds the limit
        # nothing more can be removed and the loop would never terminate.
        while filename and (
                len(filename) > max_length_chars or len(filename.encode('utf-8')) > max_length_bytes):
            filename = filename[:-1]
        return filename + ending

    def format_resource_paths(
            self,
            resources: list[Resource],
            destination_directory: Path,
    ) -> list[tuple[Path, Resource]]:
        """Pair every resource with its destination path.

        A single resource gets no index; several resources are numbered
        starting from 1 so that their names differ.
        """
        out = []
        if len(resources) == 1:
            out.append((self.format_path(resources[0], destination_directory, None), resources[0]))
        else:
            for i, res in enumerate(resources, start=1):
                logger.log(9, f'Formatting filename with index {i}')
                out.append((self.format_path(res, destination_directory, i), res))
        return out

    @staticmethod
    def validate_string(test_string: str) -> bool:
        """Return True if ``test_string`` contains at least one placeholder.

        Placeholders are recognised case-insensitively. A warning is logged
        when the string is valid but lacks ``{POSTID}``, since the resulting
        names are then not guaranteed to be unique.
        """
        if not test_string:
            return False
        lowered = test_string.lower()
        if not any(f'{{{key}}}' in lowered for key in FileNameFormatter.key_terms):
            return False
        # Compare in lower case so that "{postid}" is recognised as well.
        if '{postid}' not in lowered:
            logger.warning(
                'Some files might not be downloaded due to name conflicts as filenames are'
                ' not guaranteed to be unique without {POSTID}')
        return True

    @staticmethod
    def _format_for_windows(input_string: str) -> str:
        """Remove characters from the invalid list and all non-ASCII text."""
        invalid_characters = r'<>:"\/|?*'
        input_string = input_string.translate(str.maketrans('', '', invalid_characters))
        input_string = FileNameFormatter._strip_emojis(input_string)
        return input_string

    @staticmethod
    def _strip_emojis(input_string: str) -> str:
        """Drop every character that cannot be encoded as ASCII."""
        result = input_string.encode('ascii', errors='ignore').decode('utf-8')
        return result
|