bulk-downloader-for-reddit/bdfr/file_name_formatter.py

#!/usr/bin/env python3
# coding=utf-8
import datetime
import logging
import platform
import re
import subprocess
from pathlib import Path
from typing import Optional

from praw.models import Comment, Submission

from bdfr.exceptions import BulkDownloaderException
from bdfr.resource import Resource

logger = logging.getLogger(__name__)


class FileNameFormatter:
    """Turn user-supplied format strings into file paths for downloaded resources.

    Format strings may contain the placeholders listed in ``key_terms`` wrapped
    in braces (e.g. ``{TITLE}_{POSTID}``); placeholders are matched
    case-insensitively and replaced with values taken from the submission or
    comment being named. The resulting names are stripped of path separators,
    scrubbed of characters Windows rejects (when running on Windows) and
    shortened so that they respect file-name and path length limits.
    """

    # Placeholder names recognised inside format strings.
    key_terms = (
        'date',
        'flair',
        'postid',
        'redditor',
        'subreddit',
        'title',
        'upvotes',
    )

    def __init__(self, file_format_string: str, directory_format_string: str, time_format_string: str):
        """Store the format strings used to name files and folders.

        :param file_format_string: format for the file name; must pass ``validate_string``
        :param directory_format_string: format for the folder scheme; each ``/``-separated
            part becomes one directory level
        :param time_format_string: ``strftime`` format for the ``date`` placeholder, or
            ``ISO`` (any case) for ISO 8601 output
        :raises BulkDownloaderException: if ``file_format_string`` is not a valid format string
        """
        if not self.validate_string(file_format_string):
            raise BulkDownloaderException(f'"{file_format_string}" is not a valid format string')
        self.file_format_string = file_format_string
        self.directory_format_string: list[str] = directory_format_string.split('/')
        self.time_format_string = time_format_string

    def _format_name(self, submission: 'Comment | Submission', format_string: str) -> str:
        """Fill the placeholders of ``format_string`` with values from ``submission``.

        :param submission: the submission or comment supplying the values
        :param format_string: a single path component containing ``{key}`` placeholders
        :return: the formatted name with every ``/`` removed
        :raises BulkDownloaderException: if ``submission`` is neither a Submission nor a Comment
        """
        if isinstance(submission, Submission):
            attributes = self._generate_name_dict_from_submission(submission)
        elif isinstance(submission, Comment):
            attributes = self._generate_name_dict_from_comment(submission)
        else:
            raise BulkDownloaderException(f'Cannot name object {type(submission).__name__}')

        def substitute(match: 're.Match') -> str:
            value = str(attributes[match.group(1).lower()])
            return FileNameFormatter._convert_unicode_escapes(value)

        # All placeholders are replaced in a single pass so that text coming from Reddit
        # (e.g. a title that literally contains "{subreddit}") is never expanded again.
        # A callable replacement also means backslashes in the values are inserted
        # verbatim instead of being interpreted as regex backreferences.
        alternatives = '|'.join(re.escape(key) for key in attributes)
        placeholder = re.compile(fr'\{{({alternatives})\}}', re.IGNORECASE | re.ASCII)
        result = placeholder.sub(substitute, format_string)

        # A formatted name is a single path component, so separators must not survive.
        result = result.replace('/', '')

        if platform.system() == 'Windows':
            result = FileNameFormatter._format_for_windows(result)

        return result

    @staticmethod
    def _convert_unicode_escapes(in_string: str) -> str:
        """Replace every literal ``\\uNNNN`` sequence (four decimal digits) with its character.

        :param in_string: text that may contain literal unicode escape sequences
        :return: the text with each matching escape sequence decoded
        """
        return re.sub(
            r'\\u\d{4}',
            lambda match: match.group(0).encode('ascii').decode('unicode-escape'),
            in_string,
        )

    def _generate_name_dict_from_submission(self, submission: 'Submission') -> dict:
        """Collect the placeholder values for a submission.

        :param submission: the submission to describe
        :return: mapping of placeholder name to value; the author is ``DELETED`` when missing
        """
        submission_attributes = {
            'title': submission.title,
            'subreddit': submission.subreddit.display_name,
            'redditor': submission.author.name if submission.author else 'DELETED',
            'postid': submission.id,
            'upvotes': submission.score,
            'flair': submission.link_flair_text,
            'date': self._convert_timestamp(submission.created_utc),
        }
        return submission_attributes

    def _convert_timestamp(self, timestamp: float) -> str:
        """Render a POSIX timestamp using the configured time format.

        The timestamp is converted with ``datetime.fromtimestamp``, i.e. to a naive
        datetime in the local time zone of the machine running the code.

        :param timestamp: seconds since the epoch
        :return: ISO 8601 text when the time format is ``ISO`` (any case, surrounding
            whitespace ignored), otherwise the ``strftime`` rendering
        """
        input_time = datetime.datetime.fromtimestamp(timestamp)
        if self.time_format_string.upper().strip() == 'ISO':
            return input_time.isoformat()
        return input_time.strftime(self.time_format_string)

    def _generate_name_dict_from_comment(self, comment: 'Comment') -> dict:
        """Collect the placeholder values for a comment.

        The title comes from the comment's parent submission, ``postid`` is the
        comment's own id and the flair is always empty.

        :param comment: the comment to describe
        :return: mapping of placeholder name to value; the author is ``DELETED`` when missing
        """
        comment_attributes = {
            'title': comment.submission.title,
            'subreddit': comment.subreddit.display_name,
            'redditor': comment.author.name if comment.author else 'DELETED',
            'postid': comment.id,
            'upvotes': comment.score,
            'flair': '',
            'date': self._convert_timestamp(comment.created_utc),
        }
        return comment_attributes

    def format_path(
            self,
            resource: 'Resource',
            destination_directory: Path,
            index: Optional[int] = None,
    ) -> Path:
        """Build the full destination path for one resource.

        :param resource: the resource to name; its ``source_submission`` supplies the values
        :param destination_directory: base directory under which the folder scheme is created
        :param index: optional number appended as ``_<index>`` before the extension;
            ``None`` (or any falsy value) adds nothing
        :return: the length-limited path of the file
        :raises BulkDownloaderException: if the resource has no extension or the path
            cannot be determined
        """
        subfolder = Path(
            destination_directory,
            *[self._format_name(resource.source_submission, part) for part in self.directory_format_string],
        )
        index_suffix = f'_{index}' if index else ''
        if not resource.extension:
            raise BulkDownloaderException(f'Resource from {resource.url} has no extension')
        file_name = str(self._format_name(resource.source_submission, self.file_format_string))

        # Newlines are not wanted in file names; keep the words apart with a space instead.
        file_name = file_name.replace('\n', ' ')

        # Insert the separating period only when neither side already provides one.
        if file_name.endswith('.') or resource.extension.startswith('.'):
            ending = index_suffix + resource.extension
        else:
            ending = index_suffix + '.' + resource.extension

        try:
            file_path = self.limit_file_name_length(file_name, ending, subfolder)
        except TypeError as e:
            raise BulkDownloaderException(
                f'Could not determine path name: {subfolder}, {index_suffix}, {resource.extension}') from e
        return file_path

    @staticmethod
    def limit_file_name_length(filename: str, ending: str, root: Path) -> Path:
        """Shorten ``filename`` until the complete path fits the file-system limits.

        A trailing ``_`` followed by six word characters (as produced by a format
        string ending in the post id) is moved into the ending so that it is
        never truncated. The name is then cut one character at a time until
        the file name is at most 255 characters and 255 UTF-8 bytes long and the
        whole path is shorter than the value of ``find_max_path_length``.

        :param filename: the file name without index or extension
        :param ending: the part that must be preserved (index suffix and extension)
        :param root: the directory the file will be placed in
        :return: the absolute path of the (possibly shortened) file
        :raises BulkDownloaderException: if the limits cannot be met even with an empty name
        """
        # "~" has to be expanded before resolving, otherwise resolve() turns it into
        # "<cwd>/~" and the later expansion never happens.
        root = root.expanduser().resolve()
        possible_id = re.search(r'((?:_\w{6})?$)', filename)
        if possible_id:
            ending = possible_id.group(1) + ending
            filename = filename[:possible_id.start()]
        max_file_part_length_chars = 255 - len(ending)
        max_file_part_length_bytes = 255 - len(ending.encode('utf-8'))
        # ``out`` already contains root, separator and ending, so it is compared with the
        # path limit itself; one unit is reserved for the terminator included in the limit.
        max_path_length = FileNameFormatter.find_max_path_length() - 1

        out = Path(root, filename + ending)
        while (
            len(filename) > max_file_part_length_chars
            or len(filename.encode('utf-8')) > max_file_part_length_bytes
            or len(str(out)) > max_path_length
        ):
            if not filename:
                # Nothing is left to cut; failing is better than looping forever.
                raise BulkDownloaderException(f'Could not shorten path to a valid length: {out}')
            filename = filename[:-1]
            out = Path(root, filename + ending)

        return out

    @staticmethod
    def find_max_path_length() -> int:
        """Return the maximum path length to respect.

        The value reported by ``getconf PATH_MAX /`` is used when that command is
        available and prints an integer; otherwise 260 is assumed on Windows and
        4096 elsewhere.
        """
        try:
            return int(subprocess.check_output(['getconf', 'PATH_MAX', '/']))
        except (ValueError, subprocess.CalledProcessError, OSError):
            return 260 if platform.system() == 'Windows' else 4096

    def format_resource_paths(
            self,
            resources: 'list[Resource]',
            destination_directory: Path,
    ) -> 'list[tuple[Path, Resource]]':
        """Build the destination path for each resource of one submission.

        A single resource is named without an index; several resources are
        numbered from 1. Resources whose path cannot be generated are logged and
        left out of the result.

        :param resources: the resources to name
        :param destination_directory: base directory for the generated paths
        :return: ``(path, resource)`` pairs for every resource that could be named
        """
        out = []
        numbered = len(resources) != 1
        for i, res in enumerate(resources, start=1):
            if numbered:
                logger.log(9, f'Formatting filename with index {i}')
            try:
                out.append((self.format_path(res, destination_directory, i if numbered else None), res))
            except BulkDownloaderException as e:
                logger.error(f'Could not generate file path for resource {res.url}: {e}')
                logger.exception('Could not generate file path')
        return out

    @staticmethod
    def validate_string(test_string: str) -> bool:
        """Check that a format string contains at least one known placeholder.

        Placeholders are recognised case-insensitively. A warning is logged when
        the string is valid but lacks the ``{POSTID}`` placeholder (in any case),
        because the generated names may then collide.

        :param test_string: the format string to check
        :return: ``True`` if the string is non-empty and contains a placeholder from ``key_terms``
        """
        if not test_string:
            return False
        lowered = test_string.lower()
        if not any(f'{{{key}}}' in lowered for key in FileNameFormatter.key_terms):
            return False
        if '{postid}' not in lowered:
            logger.warning('Some files might not be downloaded due to name conflicts as filenames are'
                           ' not guaranteed to be unique without {POSTID}')
        return True

    @staticmethod
    def _format_for_windows(input_string: str) -> str:
        """Remove characters that Windows does not allow in names, then all non-ASCII text.

        :param input_string: the name to clean
        :return: the name without ``<>:"\\/|?*`` and without non-ASCII characters
        """
        invalid_characters = r'<>:"\/|?*'
        cleaned = input_string.translate(str.maketrans('', '', invalid_characters))
        return FileNameFormatter._strip_emojis(cleaned)

    @staticmethod
    def _strip_emojis(input_string: str) -> str:
        """Drop every character that cannot be encoded as ASCII (emojis included).

        :param input_string: the text to clean
        :return: the ASCII-only remainder of the text
        """
        return input_string.encode('ascii', errors='ignore').decode('ascii')
Add file name formatter class 2021-02-11 12:08:47 +13:00			`#!/usr/bin/env python3`
			`# coding=utf-8`
Use ISO format for timestamps in names 2021-04-22 12:38:32 +12:00			`import datetime`
Add much more logging 2021-03-11 16:20:39 +13:00			`import logging`
Scrub windows paths for invalid characters 2021-03-22 17:21:56 +13:00			`import platform`
Add file name formatter class 2021-02-11 12:08:47 +13:00			`import re`
Add path limit fix 2021-05-18 14:39:08 +12:00			`import subprocess`
Add file name formatter class 2021-02-11 12:08:47 +13:00			`from pathlib import Path`
Add indexing for multiple resources from one submission 2021-03-10 17:39:01 +13:00			`from typing import Optional`
Add file name formatter class 2021-02-11 12:08:47 +13:00
Archiver is smarter for comments (#242) * Add comment name generation to file name formatter * Refactor to reduce duplication * Refactor archive entry classes * Refactor archiver class a bit * Refactor method * Fix comment retrieval * Add comment-downloading to archiver * Update test * Update test 2021-04-01 21:37:20 +13:00			`from praw.models import Comment, Submission`
Add file name formatter class 2021-02-11 12:08:47 +13:00
Rename module 2021-04-12 19:58:32 +12:00			`from bdfr.exceptions import BulkDownloaderException`
			`from bdfr.resource import Resource`
Add file name formatter class 2021-02-11 12:08:47 +13:00
Add much more logging 2021-03-11 16:20:39 +13:00			`logger = logging.getLogger(__name__)`

Add file name formatter class 2021-02-11 12:08:47 +13:00
			`class FileNameFormatter:`
Allow multilevel folder schemes (#251) * Improve some formatting * Allow multilevel folder names 2021-04-04 18:16:06 +12:00			`key_terms = (`
			`'date',`
			`'flair',`
			`'postid',`
			`'redditor',`
			`'subreddit',`
			`'title',`
			`'upvotes',`
			`)`
Add function to validate formatting strings 2021-03-02 17:06:21 +13:00
Add customisable time formatting 2021-05-02 15:56:39 +12:00			`def __init__(self, file_format_string: str, directory_format_string: str, time_format_string: str):`
Add function to validate formatting strings 2021-03-02 17:06:21 +13:00			`if not self.validate_string(file_format_string):`
			`raise BulkDownloaderException(f'"{file_format_string}" is not a valid format string')`
Add file name formatter class 2021-02-11 12:08:47 +13:00			`self.file_format_string = file_format_string`
Allow multilevel folder schemes (#251) * Improve some formatting * Allow multilevel folder names 2021-04-04 18:16:06 +12:00			`self.directory_format_string: list[str] = directory_format_string.split('/')`
Add customisable time formatting 2021-05-02 15:56:39 +12:00			`self.time_format_string = time_format_string`
Add file name formatter class 2021-02-11 12:08:47 +13:00
Add customisable time formatting 2021-05-02 15:56:39 +12:00			`def _format_name(self, submission: (Comment, Submission), format_string: str) -> str:`
Archiver is smarter for comments (#242) * Add comment name generation to file name formatter * Refactor to reduce duplication * Refactor archive entry classes * Refactor archiver class a bit * Refactor method * Fix comment retrieval * Add comment-downloading to archiver * Update test * Update test 2021-04-01 21:37:20 +13:00			`if isinstance(submission, Submission):`
Add customisable time formatting 2021-05-02 15:56:39 +12:00			`attributes = self._generate_name_dict_from_submission(submission)`
Archiver is smarter for comments (#242) * Add comment name generation to file name formatter * Refactor to reduce duplication * Refactor archive entry classes * Refactor archiver class a bit * Refactor method * Fix comment retrieval * Add comment-downloading to archiver * Update test * Update test 2021-04-01 21:37:20 +13:00			`elif isinstance(submission, Comment):`
Add customisable time formatting 2021-05-02 15:56:39 +12:00			`attributes = self._generate_name_dict_from_comment(submission)`
Archiver is smarter for comments (#242) * Add comment name generation to file name formatter * Refactor to reduce duplication * Refactor archive entry classes * Refactor archiver class a bit * Refactor method * Fix comment retrieval * Add comment-downloading to archiver * Update test * Update test 2021-04-01 21:37:20 +13:00			`else:`
			`raise BulkDownloaderException(f'Cannot name object {type(submission).__name__}')`
Revert "Use .format() instead of regular expression" This reverts commit 8e8225283214927b461b71c247d2bcf8adcf4b34. 2021-03-30 20:43:32 +13:00			`result = format_string`
Archiver is smarter for comments (#242) * Add comment name generation to file name formatter * Refactor to reduce duplication * Refactor archive entry classes * Refactor archiver class a bit * Refactor method * Fix comment retrieval * Add comment-downloading to archiver * Update test * Update test 2021-04-01 21:37:20 +13:00			`for key in attributes.keys():`
Allow multilevel folder schemes (#251) * Improve some formatting * Allow multilevel folder names 2021-04-04 18:16:06 +12:00			`if re.search(fr'(?i).{{{key}}}.', result):`
Fix bug in file name formatter 2021-04-12 20:07:40 +12:00			`key_value = str(attributes.get(key, 'unknown'))`
Fix bug with emojis in the filename (#263) 2021-04-12 23:11:55 +12:00			`key_value = FileNameFormatter._convert_unicode_escapes(key_value)`
Fix mistaken backreference in some titles This should resolve #267 2021-04-13 18:40:22 +12:00			`key_value = key_value.replace('\\', '\\\\')`
			`result = re.sub(fr'(?i){{{key}}}', key_value, result)`
Revert "Use .format() instead of regular expression" This reverts commit 8e8225283214927b461b71c247d2bcf8adcf4b34. 2021-03-30 20:43:32 +13:00
Add file name formatter class 2021-02-11 12:08:47 +13:00			`result = result.replace('/', '')`
Scrub windows paths for invalid characters 2021-03-22 17:21:56 +13:00
			`if platform.system() == 'Windows':`
			`result = FileNameFormatter._format_for_windows(result)`

Add file name formatter class 2021-02-11 12:08:47 +13:00			`return result`

Fix bug with emojis in the filename (#263) 2021-04-12 23:11:55 +12:00			`@staticmethod`
			`def _convert_unicode_escapes(in_string: str) -> str:`
			`pattern = re.compile(r'(\\u\d{4})')`
			`matches = re.search(pattern, in_string)`
			`if matches:`
			`for match in matches.groups():`
			`converted_match = bytes(match, 'utf-8').decode('unicode-escape')`
			`in_string = in_string.replace(match, converted_match)`
			`return in_string`

Add customisable time formatting 2021-05-02 15:56:39 +12:00			`def _generate_name_dict_from_submission(self, submission: Submission) -> dict:`
Archiver is smarter for comments (#242) * Add comment name generation to file name formatter * Refactor to reduce duplication * Refactor archive entry classes * Refactor archiver class a bit * Refactor method * Fix comment retrieval * Add comment-downloading to archiver * Update test * Update test 2021-04-01 21:37:20 +13:00			`submission_attributes = {`
			`'title': submission.title,`
			`'subreddit': submission.subreddit.display_name,`
			`'redditor': submission.author.name if submission.author else 'DELETED',`
			`'postid': submission.id,`
			`'upvotes': submission.score,`
			`'flair': submission.link_flair_text,`
Add customisable time formatting 2021-05-02 15:56:39 +12:00			`'date': self._convert_timestamp(submission.created_utc),`
Archiver is smarter for comments (#242) * Add comment name generation to file name formatter * Refactor to reduce duplication * Refactor archive entry classes * Refactor archiver class a bit * Refactor method * Fix comment retrieval * Add comment-downloading to archiver * Update test * Update test 2021-04-01 21:37:20 +13:00			`}`
			`return submission_attributes`

Add customisable time formatting 2021-05-02 15:56:39 +12:00			`def _convert_timestamp(self, timestamp: float) -> str:`
Use ISO format for timestamps in names 2021-04-22 12:38:32 +12:00			`input_time = datetime.datetime.fromtimestamp(timestamp)`
Add customisable time formatting 2021-05-02 15:56:39 +12:00			`if self.time_format_string.upper().strip() == 'ISO':`
			`return input_time.isoformat()`
			`else:`
			`return input_time.strftime(self.time_format_string)`
Use ISO format for timestamps in names 2021-04-22 12:38:32 +12:00
Add customisable time formatting 2021-05-02 15:56:39 +12:00			`def _generate_name_dict_from_comment(self, comment: Comment) -> dict:`
Archiver is smarter for comments (#242) * Add comment name generation to file name formatter * Refactor to reduce duplication * Refactor archive entry classes * Refactor archiver class a bit * Refactor method * Fix comment retrieval * Add comment-downloading to archiver * Update test * Update test 2021-04-01 21:37:20 +13:00			`comment_attributes = {`
			`'title': comment.submission.title,`
			`'subreddit': comment.subreddit.display_name,`
			`'redditor': comment.author.name if comment.author else 'DELETED',`
			`'postid': comment.id,`
			`'upvotes': comment.score,`
			`'flair': '',`
Add customisable time formatting 2021-05-02 15:56:39 +12:00			`'date': self._convert_timestamp(comment.created_utc),`
Archiver is smarter for comments (#242) * Add comment name generation to file name formatter * Refactor to reduce duplication * Refactor archive entry classes * Refactor archiver class a bit * Refactor method * Fix comment retrieval * Add comment-downloading to archiver * Update test * Update test 2021-04-01 21:37:20 +13:00			`}`
			`return comment_attributes`

			`def format_path(`
			`self,`
			`resource: Resource,`
			`destination_directory: Path,`
Allow multilevel folder schemes (#251) * Improve some formatting * Allow multilevel folder names 2021-04-04 18:16:06 +12:00			`index: Optional[int] = None,`
			`) -> Path:`
			`subfolder = Path(`
			`destination_directory,`
Add path limit fix 2021-05-18 14:39:08 +12:00			`*[self._format_name(resource.source_submission, part) for part in self.directory_format_string],`
Allow multilevel folder schemes (#251) * Improve some formatting * Allow multilevel folder names 2021-04-04 18:16:06 +12:00			`)`
Add indexing for multiple resources from one submission 2021-03-10 17:39:01 +13:00			`index = f'_{str(index)}' if index else ''`
Shorten filenames that are too long 2021-03-13 14:13:36 +13:00			`if not resource.extension:`
			`raise BulkDownloaderException(f'Resource from {resource.url} has no extension')`
			`file_name = str(self._format_name(resource.source_submission, self.file_format_string))`
Strip any newline characters from names 2022-03-25 13:50:52 +13:00
			`file_name = re.sub(r'\n', ' ', file_name)`

Add more tests for file length checking 2021-11-15 14:57:54 +13:00			`if not re.match(r'.\.$', file_name) and not re.match(r'^\..', resource.extension):`
Fix bug with period not separating file extension 2021-11-05 15:47:46 +13:00			`ending = index + '.' + resource.extension`
			`else:`
			`ending = index + resource.extension`
Scrub windows paths for invalid characters 2021-03-22 17:21:56 +13:00
Add much more logging 2021-03-11 16:20:39 +13:00			`try:`
Add more tests for file length checking 2021-11-15 14:57:54 +13:00			`file_path = self.limit_file_name_length(file_name, ending, subfolder)`
Add much more logging 2021-03-11 16:20:39 +13:00			`except TypeError:`
			`raise BulkDownloaderException(f'Could not determine path name: {subfolder}, {index}, {resource.extension}')`
Add file name formatter class 2021-02-11 12:08:47 +13:00			`return file_path`
Add function to validate formatting strings 2021-03-02 17:06:21 +13:00
Shorten filenames that are too long 2021-03-13 14:13:36 +13:00			`@staticmethod`
Add more tests for file length checking 2021-11-15 14:57:54 +13:00			`def limit_file_name_length(filename: str, ending: str, root: Path) -> Path:`
Add path limit fix 2021-05-18 14:39:08 +12:00			`root = root.resolve().expanduser()`
Use slice to shorten name 2021-03-30 21:22:11 +13:00			`possible_id = re.search(r'((?:_\w{6})?$)', filename)`
			`if possible_id:`
			`ending = possible_id.group(1) + ending`
			`filename = filename[:possible_id.start()]`
Add path limit fix 2021-05-18 14:39:08 +12:00			`max_path = FileNameFormatter.find_max_path_length()`
Fix max path length calculations 2021-11-21 16:14:28 +13:00			`max_file_part_length_chars = 255 - len(ending)`
			`max_file_part_length_bytes = 255 - len(ending.encode('utf-8'))`
Add path limit fix 2021-05-18 14:39:08 +12:00			`max_path_length = max_path - len(ending) - len(str(root)) - 1`
Fix max path length calculations 2021-11-21 16:14:28 +13:00
			`out = Path(root, filename + ending)`
			`while any([len(filename) > max_file_part_length_chars,`
			`len(filename.encode('utf-8')) > max_file_part_length_bytes,`
			`len(str(out)) > max_path_length,`
			`]):`
Limit name byte length 2021-03-13 15:39:54 +13:00			`filename = filename[:-1]`
Fix max path length calculations 2021-11-21 16:14:28 +13:00			`out = Path(root, filename + ending)`

			`return out`
Add path limit fix 2021-05-18 14:39:08 +12:00
			`@staticmethod`
			`def find_max_path_length() -> int:`
			`try:`
			`return int(subprocess.check_output(['getconf', 'PATH_MAX', '/']))`
			`except (ValueError, subprocess.CalledProcessError, OSError):`
			`if platform.system() == 'Windows':`
			`return 260`
			`else:`
			`return 4096`
Shorten filenames that are too long 2021-03-13 14:13:36 +13:00
Allow multilevel folder schemes (#251) * Improve some formatting * Allow multilevel folder names 2021-04-04 18:16:06 +12:00			`def format_resource_paths(`
			`self,`
			`resources: list[Resource],`
			`destination_directory: Path,`
			`) -> list[tuple[Path, Resource]]:`
Add indexing for multiple resources from one submission 2021-03-10 17:39:01 +13:00			`out = []`
Fix index being added to single resources 2021-03-17 18:49:07 +13:00			`if len(resources) == 1:`
Catch errors when resources have no extension This is related to #266 and will prevent the BDFR from completely crashing when a file extension is unknown 2021-04-13 15:22:13 +12:00			`try:`
			`out.append((self.format_path(resources[0], destination_directory, None), resources[0]))`
			`except BulkDownloaderException as e:`
			`logger.error(f'Could not generate file path for resource {resources[0].url}: {e}')`
			`logger.exception('Could not generate file path')`
Fix index being added to single resources 2021-03-17 18:49:07 +13:00			`else:`
			`for i, res in enumerate(resources, start=1):`
			`logger.log(9, f'Formatting filename with index {i}')`
Catch errors when resources have no extension This is related to #266 and will prevent the BDFR from completely crashing when a file extension is unknown 2021-04-13 15:22:13 +12:00			`try:`
			`out.append((self.format_path(res, destination_directory, i), res))`
			`except BulkDownloaderException as e:`
			`logger.error(f'Could not generate file path for resource {res.url}: {e}')`
			`logger.exception('Could not generate file path')`
Add indexing for multiple resources from one submission 2021-03-10 17:39:01 +13:00			`return out`

Scrub windows paths for invalid characters 2021-03-22 17:21:56 +13:00			`@staticmethod`
Add function to validate formatting strings 2021-03-02 17:06:21 +13:00			`def validate_string(test_string: str) -> bool:`
			`if not test_string:`
			`return False`
Add warning for non-unique file name schemes (#233) * Add warning for non-unique file name schemes * Update README with warning 2021-03-30 20:20:05 +13:00			`result = any([f'{{{key}}}' in test_string.lower() for key in FileNameFormatter.key_terms])`
			`if result:`
			`if 'POSTID' not in test_string:`
Add customisable time formatting 2021-05-02 15:56:39 +12:00			`logger.warning('Some files might not be downloaded due to name conflicts as filenames are'`
			`' not guaranteed to be be unique without {POSTID}')`
Add warning for non-unique file name schemes (#233) * Add warning for non-unique file name schemes * Update README with warning 2021-03-30 20:20:05 +13:00			`return True`
			`else:`
			`return False`
Scrub windows paths for invalid characters 2021-03-22 17:21:56 +13:00
			`@staticmethod`
			`def _format_for_windows(input_string: str) -> str:`
			`invalid_characters = r'<>:"\/\|?*'`
			`for char in invalid_characters:`
			`input_string = input_string.replace(char, '')`
Strip emojis from filenames on Windows (#222) 2021-03-28 00:14:08 +13:00			`input_string = FileNameFormatter._strip_emojis(input_string)`
Scrub windows paths for invalid characters 2021-03-22 17:21:56 +13:00			`return input_string`
Strip emojis from filenames on Windows (#222) 2021-03-28 00:14:08 +13:00
			`@staticmethod`
			`def _strip_emojis(input_string: str) -> str:`
			`result = input_string.encode('ascii', errors='ignore').decode('utf-8')`
			`return result`