bulk-downloader-for-reddit/bulkredditdownloader/file_name_formatter.py

#!/usr/bin/env python3
# coding=utf-8

import logging
import platform
import re
from pathlib import Path
from typing import Optional

from praw.models import Comment, Submission

from bulkredditdownloader.exceptions import BulkDownloaderException
from bulkredditdownloader.resource import Resource

logger = logging.getLogger(__name__)


class FileNameFormatter:
    """Build file and directory names for resources from user format strings.

    Format strings contain placeholders such as ``{title}`` or ``{POSTID}``.
    Placeholders are matched case-insensitively and replaced with attributes of
    the submission or comment that a resource was taken from.
    """

    # Placeholder names that may appear (in any letter case) in a format string.
    key_terms = (
        'date',
        'flair',
        'postid',
        'redditor',
        'subreddit',
        'title',
        'upvotes',
    )

    def __init__(self, file_format_string: str, directory_format_string: str):
        """Store the format strings after validating the file format string.

        :param file_format_string: scheme used for file names
        :param directory_format_string: scheme used for folders; every
            '/'-separated part becomes one directory level
        :raises BulkDownloaderException: if ``file_format_string`` is empty or
            contains none of the placeholders in ``key_terms``
        """
        if not self.validate_string(file_format_string):
            raise BulkDownloaderException(f'"{file_format_string}" is not a valid format string')
        self.file_format_string = file_format_string
        self.directory_format_string: list[str] = directory_format_string.split('/')

    @staticmethod
    def _format_name(submission: (Comment, Submission), format_string: str) -> str:
        """Fill the placeholders of ``format_string`` with attributes of ``submission``.

        :param submission: the submission or comment supplying the values
        :param format_string: one file name scheme or one directory level scheme
        :return: the formatted name with every '/' removed; on Windows the
            characters invalid in Windows paths and all non-ASCII characters
            are removed as well
        :raises BulkDownloaderException: if ``submission`` is neither a
            ``Submission`` nor a ``Comment``
        """
        if isinstance(submission, Submission):
            attributes = FileNameFormatter._generate_name_dict_from_submission(submission)
        elif isinstance(submission, Comment):
            attributes = FileNameFormatter._generate_name_dict_from_comment(submission)
        else:
            raise BulkDownloaderException(f'Cannot name object {type(submission).__name__}')

        result = format_string
        for key, raw_value in attributes.items():
            if FileNameFormatter._key_pattern(key).search(result):
                # Attribute values may be ints, floats or None, so they are
                # converted to text before being substituted.
                key_value = FileNameFormatter._stringify_attribute(raw_value)
                result = FileNameFormatter._replace_key(result, key, key_value)
                logger.log(9, f'Found key string {key} in name')

        # A formatted name is a single path component, so it may not contain separators.
        result = result.replace('/', '')

        if platform.system() == 'Windows':
            result = FileNameFormatter._format_for_windows(result)

        return result

    @staticmethod
    def _key_pattern(key: str) -> 're.Pattern':
        """Return a case-insensitive pattern matching the literal placeholder ``{key}``."""
        return re.compile(re.escape(f'{{{key}}}'), re.IGNORECASE)

    @staticmethod
    def _replace_key(text: str, key: str, value: str) -> str:
        """Replace every ``{key}`` placeholder in ``text`` with ``value`` verbatim.

        A function is used as the replacement so that backslashes in ``value``
        are inserted literally instead of being interpreted by ``re.sub`` as
        escapes or group references.
        """
        return FileNameFormatter._key_pattern(key).sub(lambda _match: value, text)

    @staticmethod
    def _stringify_attribute(value) -> str:
        """Convert a raw attribute value to the text inserted into a name.

        ``None`` becomes an empty string; any other value is converted with
        ``str`` and has its literal ``\\uXXXX`` sequences decoded.
        """
        if value is None:
            return ''
        return FileNameFormatter._convert_unicode_escapes(str(value))

    @staticmethod
    def _convert_unicode_escapes(in_string: str) -> str:
        """Decode literal ``\\uXXXX`` sequences, leaving all other text untouched.

        Only the six-character escape sequences are rewritten, so genuine
        non-ASCII characters and stray backslashes survive unchanged. Escapes
        naming a surrogate code point are kept as text because a lone surrogate
        cannot be encoded as UTF-8.
        """
        def decode(match: 're.Match') -> str:
            code_point = int(match.group(1), 16)
            if 0xD800 <= code_point <= 0xDFFF:
                return match.group(0)
            return chr(code_point)

        return re.sub(r'\\u([0-9a-fA-F]{4})', decode, in_string)

    @staticmethod
    def _generate_name_dict_from_submission(submission: Submission) -> dict:
        """Map every placeholder name to the matching attribute of ``submission``."""
        submission_attributes = {
            'title': submission.title,
            'subreddit': submission.subreddit.display_name,
            'redditor': submission.author.name if submission.author else 'DELETED',
            'postid': submission.id,
            'upvotes': submission.score,
            'flair': submission.link_flair_text,
            'date': submission.created_utc
        }
        return submission_attributes

    @staticmethod
    def _generate_name_dict_from_comment(comment: Comment) -> dict:
        """Map every placeholder name to the matching attribute of ``comment``.

        The title is taken from the comment's submission and the flair is
        always an empty string.
        """
        comment_attributes = {
            'title': comment.submission.title,
            'subreddit': comment.subreddit.display_name,
            'redditor': comment.author.name if comment.author else 'DELETED',
            'postid': comment.id,
            'upvotes': comment.score,
            'flair': '',
            'date': comment.created_utc,
        }
        return comment_attributes

    def format_path(
            self,
            resource: Resource,
            destination_directory: Path,
            index: Optional[int] = None,
    ) -> Path:
        """Return the full path a resource should be written to.

        :param resource: the resource to name; its ``source_submission``,
            ``extension`` and ``url`` attributes are read
        :param destination_directory: root directory for downloads
        :param index: optional number appended to the name as ``_<index>``; a
            falsy value (``None`` or ``0``) adds no suffix
        :return: ``destination_directory`` joined with the formatted folder
            levels and the length-limited file name
        :raises BulkDownloaderException: if the resource has no extension or
            the path cannot be assembled
        """
        subfolder = Path(
            destination_directory,
            *[self._format_name(resource.source_submission, part) for part in self.directory_format_string]
        )
        index = f'_{str(index)}' if index else ''
        if not resource.extension:
            raise BulkDownloaderException(f'Resource from {resource.url} has no extension')
        ending = index + resource.extension
        file_name = str(self._format_name(resource.source_submission, self.file_format_string))
        file_name = self._limit_file_name_length(file_name, ending)

        try:
            file_path = Path(subfolder, file_name)
        except TypeError as e:
            raise BulkDownloaderException(
                f'Could not determine path name: {subfolder}, {index}, {resource.extension}') from e
        return file_path

    @staticmethod
    def _limit_file_name_length(filename: str, ending: str) -> str:
        """Append ``ending`` to ``filename``, trimming the name to fit 255 characters and bytes.

        A trailing ``_`` followed by six word characters (for example a post
        ID) is treated as part of the ending so that it is never trimmed away.

        :param filename: formatted name without index or extension
        :param ending: index suffix plus extension
        :return: the shortened name followed by the protected ending
        """
        # The pattern always matches; the group is empty when there is no ID-like suffix.
        possible_id = re.search(r'((?:_\w{6})?$)', filename)
        if possible_id:
            ending = possible_id.group(1) + ending
            filename = filename[:possible_id.start()]
        max_length_chars = 255 - len(ending)
        max_length_bytes = 255 - len(ending.encode('utf-8'))
        # Stop once nothing is left to trim: an ending that is too long on its
        # own would otherwise keep this loop running forever.
        while filename and (
                len(filename) > max_length_chars or len(filename.encode('utf-8')) > max_length_bytes):
            filename = filename[:-1]
        return filename + ending

    def format_resource_paths(
            self,
            resources: list[Resource],
            destination_directory: Path,
    ) -> list[tuple[Path, Resource]]:
        """Pair every resource with its destination path.

        A single resource gets no index; several resources are numbered
        starting from 1 so that their file names differ.
        """
        out = []
        if len(resources) == 1:
            out.append((self.format_path(resources[0], destination_directory, None), resources[0]))
        else:
            for i, res in enumerate(resources, start=1):
                logger.log(9, f'Formatting filename with index {i}')
                out.append((self.format_path(res, destination_directory, i), res))
        return out

    @staticmethod
    def validate_string(test_string: str) -> bool:
        """Check that a format string contains at least one known placeholder.

        Placeholders are recognised case-insensitively. A warning is logged
        when the string is valid but lacks the post ID placeholder.

        :param test_string: the format string to check
        :return: ``True`` if any placeholder from ``key_terms`` is present,
            ``False`` for an empty string or a string without placeholders
        """
        if not test_string:
            return False
        lowered = test_string.lower()
        if not any(f'{{{key}}}' in lowered for key in FileNameFormatter.key_terms):
            return False
        # Checked on the lower-cased string so that '{postid}' and '{PostID}'
        # count as the post ID placeholder just like '{POSTID}'.
        if '{postid}' not in lowered:
            logger.warning(
                'Some files might not be downloaded due to name conflicts as filenames are'
                ' not guaranteed to be unique without {POSTID}')
        return True

    @staticmethod
    def _format_for_windows(input_string: str) -> str:
        """Remove characters that are invalid in Windows paths and all non-ASCII characters."""
        invalid_characters = r'<>:"\/|?*'
        for char in invalid_characters:
            input_string = input_string.replace(char, '')
        input_string = FileNameFormatter._strip_emojis(input_string)
        return input_string

    @staticmethod
    def _strip_emojis(input_string: str) -> str:
        """Drop every character that cannot be encoded as ASCII, emojis included."""
        result = input_string.encode('ascii', errors='ignore').decode('utf-8')
        return result
Add file name formatter class 2021-02-11 12:08:47 +13:00			`#!/usr/bin/env python3`
			`# coding=utf-8`

Add much more logging 2021-03-11 16:20:39 +13:00			`import logging`
Scrub windows paths for invalid characters 2021-03-22 17:21:56 +13:00			`import platform`
Add file name formatter class 2021-02-11 12:08:47 +13:00			`import re`
			`from pathlib import Path`
Add indexing for multiple resources from one submission 2021-03-10 17:39:01 +13:00			`from typing import Optional`
Add file name formatter class 2021-02-11 12:08:47 +13:00
Archiver is smarter for comments (#242) * Add comment name generation to file name formatter * Refactor to reduce duplication * Refactor archive entry classes * Refactor archiver class a bit * Refactor method * Fix comment retrieval * Add comment-downloading to archiver * Update test * Update test 2021-04-01 21:37:20 +13:00			`from praw.models import Comment, Submission`
Add file name formatter class 2021-02-11 12:08:47 +13:00
Rename file with custom exceptions 2021-03-05 16:32:24 +13:00			`from bulkredditdownloader.exceptions import BulkDownloaderException`
Add file name formatter class 2021-02-11 12:08:47 +13:00			`from bulkredditdownloader.resource import Resource`

Add much more logging 2021-03-11 16:20:39 +13:00			`logger = logging.getLogger(__name__)`

Add file name formatter class 2021-02-11 12:08:47 +13:00
			`class FileNameFormatter:`
Allow multilevel folder schemes (#251) * Improve some formatting * Allow multilevel folder names 2021-04-04 18:16:06 +12:00			`key_terms = (`
			`'date',`
			`'flair',`
			`'postid',`
			`'redditor',`
			`'subreddit',`
			`'title',`
			`'upvotes',`
			`)`
Add function to validate formatting strings 2021-03-02 17:06:21 +13:00
Add file name formatter class 2021-02-11 12:08:47 +13:00			`def __init__(self, file_format_string: str, directory_format_string: str):`
Add function to validate formatting strings 2021-03-02 17:06:21 +13:00			`if not self.validate_string(file_format_string):`
			`raise BulkDownloaderException(f'"{file_format_string}" is not a valid format string')`
Add file name formatter class 2021-02-11 12:08:47 +13:00			`self.file_format_string = file_format_string`
Allow multilevel folder schemes (#251) * Improve some formatting * Allow multilevel folder names 2021-04-04 18:16:06 +12:00			`self.directory_format_string: list[str] = directory_format_string.split('/')`
Add file name formatter class 2021-02-11 12:08:47 +13:00
			`@staticmethod`
Archiver is smarter for comments (#242) * Add comment name generation to file name formatter * Refactor to reduce duplication * Refactor archive entry classes * Refactor archiver class a bit * Refactor method * Fix comment retrieval * Add comment-downloading to archiver * Update test * Update test 2021-04-01 21:37:20 +13:00			`def _format_name(submission: (Comment, Submission), format_string: str) -> str:`
			`if isinstance(submission, Submission):`
			`attributes = FileNameFormatter._generate_name_dict_from_submission(submission)`
			`elif isinstance(submission, Comment):`
			`attributes = FileNameFormatter._generate_name_dict_from_comment(submission)`
			`else:`
			`raise BulkDownloaderException(f'Cannot name object {type(submission).__name__}')`
Revert "Use .format() instead of regular expression" This reverts commit 8e8225283214927b461b71c247d2bcf8adcf4b34. 2021-03-30 20:43:32 +13:00			`result = format_string`
Archiver is smarter for comments (#242) * Add comment name generation to file name formatter * Refactor to reduce duplication * Refactor archive entry classes * Refactor archiver class a bit * Refactor method * Fix comment retrieval * Add comment-downloading to archiver * Update test * Update test 2021-04-01 21:37:20 +13:00			`for key in attributes.keys():`
Allow multilevel folder schemes (#251) * Improve some formatting * Allow multilevel folder names 2021-04-04 18:16:06 +12:00			`if re.search(fr'(?i).{{{key}}}.', result):`
Parse unicode escapes in file name fields (#254) 2021-04-07 01:43:03 +12:00			`key_value = attributes.get(key, 'unknown')`
			`key_value = bytes(key_value, 'utf-8').decode('unicode-escape')`
			`result = re.sub(fr'(?i){{{key}}}', key_value, result,)`
Revert "Use .format() instead of regular expression" This reverts commit 8e8225283214927b461b71c247d2bcf8adcf4b34. 2021-03-30 20:43:32 +13:00			`logger.log(9, f'Found key string {key} in name')`

Add file name formatter class 2021-02-11 12:08:47 +13:00			`result = result.replace('/', '')`
Scrub windows paths for invalid characters 2021-03-22 17:21:56 +13:00
			`if platform.system() == 'Windows':`
			`result = FileNameFormatter._format_for_windows(result)`

Add file name formatter class 2021-02-11 12:08:47 +13:00			`return result`

Archiver is smarter for comments (#242) * Add comment name generation to file name formatter * Refactor to reduce duplication * Refactor archive entry classes * Refactor archiver class a bit * Refactor method * Fix comment retrieval * Add comment-downloading to archiver * Update test * Update test 2021-04-01 21:37:20 +13:00			`@staticmethod`
			`def _generate_name_dict_from_submission(submission: Submission) -> dict:`
			`submission_attributes = {`
			`'title': submission.title,`
			`'subreddit': submission.subreddit.display_name,`
			`'redditor': submission.author.name if submission.author else 'DELETED',`
			`'postid': submission.id,`
			`'upvotes': submission.score,`
			`'flair': submission.link_flair_text,`
			`'date': submission.created_utc`
			`}`
			`return submission_attributes`

			`@staticmethod`
			`def _generate_name_dict_from_comment(comment: Comment) -> dict:`
			`comment_attributes = {`
			`'title': comment.submission.title,`
			`'subreddit': comment.subreddit.display_name,`
			`'redditor': comment.author.name if comment.author else 'DELETED',`
			`'postid': comment.id,`
			`'upvotes': comment.score,`
			`'flair': '',`
Allow multilevel folder schemes (#251) * Improve some formatting * Allow multilevel folder names 2021-04-04 18:16:06 +12:00			`'date': comment.created_utc,`
Archiver is smarter for comments (#242) * Add comment name generation to file name formatter * Refactor to reduce duplication * Refactor archive entry classes * Refactor archiver class a bit * Refactor method * Fix comment retrieval * Add comment-downloading to archiver * Update test * Update test 2021-04-01 21:37:20 +13:00			`}`
			`return comment_attributes`

			`def format_path(`
			`self,`
			`resource: Resource,`
			`destination_directory: Path,`
Allow multilevel folder schemes (#251) * Improve some formatting * Allow multilevel folder names 2021-04-04 18:16:06 +12:00			`index: Optional[int] = None,`
			`) -> Path:`
			`subfolder = Path(`
			`destination_directory,`
			`*[self._format_name(resource.source_submission, part) for part in self.directory_format_string]`
			`)`
Add indexing for multiple resources from one submission 2021-03-10 17:39:01 +13:00			`index = f'_{str(index)}' if index else ''`
Shorten filenames that are too long 2021-03-13 14:13:36 +13:00			`if not resource.extension:`
			`raise BulkDownloaderException(f'Resource from {resource.url} has no extension')`
			`ending = index + resource.extension`
			`file_name = str(self._format_name(resource.source_submission, self.file_format_string))`
			`file_name = self._limit_file_name_length(file_name, ending)`
Scrub windows paths for invalid characters 2021-03-22 17:21:56 +13:00
Add much more logging 2021-03-11 16:20:39 +13:00			`try:`
Shorten filenames that are too long 2021-03-13 14:13:36 +13:00			`file_path = Path(subfolder, file_name)`
Add much more logging 2021-03-11 16:20:39 +13:00			`except TypeError:`
			`raise BulkDownloaderException(f'Could not determine path name: {subfolder}, {index}, {resource.extension}')`
Add file name formatter class 2021-02-11 12:08:47 +13:00			`return file_path`
Add function to validate formatting strings 2021-03-02 17:06:21 +13:00
Shorten filenames that are too long 2021-03-13 14:13:36 +13:00			`@staticmethod`
			`def _limit_file_name_length(filename: str, ending: str) -> str:`
Use slice to shorten name 2021-03-30 21:22:11 +13:00			`possible_id = re.search(r'((?:_\w{6})?$)', filename)`
			`if possible_id:`
			`ending = possible_id.group(1) + ending`
			`filename = filename[:possible_id.start()]`
Limit name byte length 2021-03-13 15:39:54 +13:00			`max_length_chars = 255 - len(ending)`
			`max_length_bytes = 255 - len(ending.encode('utf-8'))`
			`while len(filename) > max_length_chars or len(filename.encode('utf-8')) > max_length_bytes:`
			`filename = filename[:-1]`
Shorten filenames that are too long 2021-03-13 14:13:36 +13:00			`return filename + ending`

Allow multilevel folder schemes (#251) * Improve some formatting * Allow multilevel folder names 2021-04-04 18:16:06 +12:00			`def format_resource_paths(`
			`self,`
			`resources: list[Resource],`
			`destination_directory: Path,`
			`) -> list[tuple[Path, Resource]]:`
Add indexing for multiple resources from one submission 2021-03-10 17:39:01 +13:00			`out = []`
Fix index being added to single resources 2021-03-17 18:49:07 +13:00			`if len(resources) == 1:`
			`out.append((self.format_path(resources[0], destination_directory, None), resources[0]))`
			`else:`
			`for i, res in enumerate(resources, start=1):`
			`logger.log(9, f'Formatting filename with index {i}')`
			`out.append((self.format_path(res, destination_directory, i), res))`
Add indexing for multiple resources from one submission 2021-03-10 17:39:01 +13:00			`return out`

Scrub windows paths for invalid characters 2021-03-22 17:21:56 +13:00			`@staticmethod`
Add function to validate formatting strings 2021-03-02 17:06:21 +13:00			`def validate_string(test_string: str) -> bool:`
			`if not test_string:`
			`return False`
Add warning for non-unique file name schemes (#233) * Add warning for non-unique file name schemes * Update README with warning 2021-03-30 20:20:05 +13:00			`result = any([f'{{{key}}}' in test_string.lower() for key in FileNameFormatter.key_terms])`
			`if result:`
			`if 'POSTID' not in test_string:`
			`logger.warning(`
Allow multilevel folder schemes (#251) * Improve some formatting * Allow multilevel folder names 2021-04-04 18:16:06 +12:00			`'Some files might not be downloaded due to name conflicts as filenames are'`
			`' not guaranteed to be be unique without {POSTID}')`
Add warning for non-unique file name schemes (#233) * Add warning for non-unique file name schemes * Update README with warning 2021-03-30 20:20:05 +13:00			`return True`
			`else:`
			`return False`
Scrub windows paths for invalid characters 2021-03-22 17:21:56 +13:00
			`@staticmethod`
			`def _format_for_windows(input_string: str) -> str:`
			`invalid_characters = r'<>:"\/\|?*'`
			`for char in invalid_characters:`
			`input_string = input_string.replace(char, '')`
Strip emojis from filenames on Windows (#222) 2021-03-28 00:14:08 +13:00			`input_string = FileNameFormatter._strip_emojis(input_string)`
Scrub windows paths for invalid characters 2021-03-22 17:21:56 +13:00			`return input_string`
Strip emojis from filenames on Windows (#222) 2021-03-28 00:14:08 +13:00
			`@staticmethod`
			`def _strip_emojis(input_string: str) -> str:`
			`result = input_string.encode('ascii', errors='ignore').decode('utf-8')`
			`return result`