1
0
Fork 0
mirror of synced 2024-05-19 11:42:40 +12:00
bulk-downloader-for-reddit/bdfr/archiver.py

125 lines
5.8 KiB
Python
Raw Permalink Normal View History

2021-03-13 23:18:30 +13:00
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
2021-03-13 23:18:30 +13:00
import json
import logging
import re
2023-01-26 16:23:59 +13:00
from collections.abc import Iterable, Iterator
from pathlib import Path
2022-12-11 15:19:29 +13:00
from time import sleep
2023-01-26 16:23:59 +13:00
from typing import Union
2021-03-13 23:18:30 +13:00
2021-03-14 12:00:00 +13:00
import dict2xml
2021-03-13 23:18:30 +13:00
import praw.models
import prawcore
2021-03-14 12:00:00 +13:00
import yaml
2021-03-13 23:18:30 +13:00
2021-04-12 19:58:32 +12:00
from bdfr.archive_entry.base_archive_entry import BaseArchiveEntry
from bdfr.archive_entry.comment_archive_entry import CommentArchiveEntry
from bdfr.archive_entry.submission_archive_entry import SubmissionArchiveEntry
from bdfr.configuration import Configuration
from bdfr.connector import RedditConnector
2021-04-12 19:58:32 +12:00
from bdfr.exceptions import ArchiverError
from bdfr.resource import Resource
2021-03-13 23:18:30 +13:00
logger = logging.getLogger(__name__)
class Archiver(RedditConnector):
    """Connector that writes Reddit submissions and comments to disk as JSON, XML or YAML records."""

    def __init__(self, args: Configuration, logging_handlers: Iterable[logging.Handler] = ()):
        """Forward the configuration and logging handlers to the parent connector."""
        super().__init__(args, logging_handlers)

    def download(self):
        """Archive every item yielded by each generator in ``self.reddit_lists``.

        Items are skipped when their author is listed in ``self.args.ignore_user``
        (items without an author are matched against the literal name ``DELETED``)
        or when their id is in ``self.excluded_submission_ids``.

        A PRAW exception raised while handling a single item is logged and the loop
        moves on to the next item. A PRAW exception raised by the generator itself
        is logged, the rest of that generator is abandoned, and the method sleeps
        for 60 seconds before moving on to the next generator.
        """
        for generator in self.reddit_lists:
            # Tracked per generator so the failure message below never refers to an
            # unbound name (generator failed before yielding anything) or to an item
            # that belonged to a previous generator.
            last_seen_id = None
            try:
                for submission in generator:
                    try:
                        last_seen_id = submission.id
                        author = submission.author
                        author_name = author.name if author else "DELETED"
                        if author_name in self.args.ignore_user:
                            logger.debug(
                                f"Submission {submission.id} in {submission.subreddit.display_name} skipped due to"
                                f" {author_name} being an ignored user"
                            )
                            continue
                        if submission.id in self.excluded_submission_ids:
                            logger.debug(f"Object {submission.id} in exclusion list, skipping")
                            continue
                        logger.debug(f"Attempting to archive submission {submission.id}")
                        self.write_entry(submission)
                    except prawcore.PrawcoreException as e:
                        logger.error(f"Submission {submission.id} failed to be archived due to a PRAW exception: {e}")
            except prawcore.PrawcoreException as e:
                if last_seen_id is None:
                    logger.error(f"The first submission of a list failed to download due to a PRAW exception: {e}")
                else:
                    logger.error(
                        f"The submission after {last_seen_id} failed to download due to a PRAW exception: {e}"
                    )
                logger.debug("Waiting 60 seconds to continue")
                sleep(60)

    def get_submissions_from_link(self) -> list[list[praw.models.Submission]]:
        """Build lazy PRAW objects for every entry in ``self.args.link``.

        Entries are classified by shape only: a 6-character string is treated as a
        submission id, a string of exactly 7 word characters as a comment id, and
        anything else as a submission URL.

        Returns:
            A single-element list wrapping the list of created objects.
        """
        supplied_submissions = []
        for sub_id in self.args.link:
            if len(sub_id) == 6:
                supplied_submissions.append(self.reddit_instance.submission(id=sub_id))
            elif re.match(r"^\w{7}$", sub_id):
                supplied_submissions.append(self.reddit_instance.comment(id=sub_id))
            else:
                supplied_submissions.append(self.reddit_instance.submission(url=sub_id))
        return [supplied_submissions]

    def get_user_data(self) -> list[Iterator]:
        """Return the parent's user listings, plus each user's comments when ``all_comments`` is set."""
        results = super().get_user_data()
        if self.args.user and self.args.all_comments:
            sort = self.determine_sort_function()
            for user in self.args.user:
                logger.debug(f"Retrieving comments of user {user}")
                results.append(sort(self.reddit_instance.redditor(user).comments, limit=self.args.limit))
        return results

    @staticmethod
    def _pull_lever_entry_factory(praw_item: Union[praw.models.Submission, praw.models.Comment]) -> BaseArchiveEntry:
        """Wrap a PRAW submission or comment in the matching archive entry class.

        Raises:
            ArchiverError: If ``praw_item`` is neither a submission nor a comment.
        """
        if isinstance(praw_item, praw.models.Submission):
            return SubmissionArchiveEntry(praw_item)
        if isinstance(praw_item, praw.models.Comment):
            return CommentArchiveEntry(praw_item)
        raise ArchiverError(f"Factory failed to classify item of type {type(praw_item).__name__}")

    def write_entry(self, praw_item: Union[praw.models.Submission, praw.models.Comment]):
        """Write one record for ``praw_item`` in the format named by ``self.args.format``.

        When ``self.args.comment_context`` is set, a comment is replaced by its
        parent submission before being archived.

        Raises:
            ArchiverError: If the item cannot be classified or the format is not
                one of ``json``, ``xml`` or ``yaml``.
        """
        if self.args.comment_context and isinstance(praw_item, praw.models.Comment):
            logger.debug(f"Converting comment {praw_item.id} to submission {praw_item.submission.id}")
            praw_item = praw_item.submission
        archive_entry = self._pull_lever_entry_factory(praw_item)
        writers = {
            "json": self._write_entry_json,
            "xml": self._write_entry_xml,
            "yaml": self._write_entry_yaml,
        }
        writer = writers.get(self.args.format)
        if writer is None:
            raise ArchiverError(f"Unknown format {self.args.format} given")
        writer(archive_entry)
        logger.info(f"Record for entry item {praw_item.id} written to disk")

    def _write_entry_json(self, entry: BaseArchiveEntry):
        """Serialise the compiled entry with ``json.dumps`` and write it to a ``.json`` file."""
        resource = Resource(entry.source, "", lambda: None, ".json")
        content = json.dumps(entry.compile())
        self._write_content_to_disk(resource, content)

    def _write_entry_xml(self, entry: BaseArchiveEntry):
        """Serialise the compiled entry with ``dict2xml`` under a ``root`` element and write it to a ``.xml`` file."""
        resource = Resource(entry.source, "", lambda: None, ".xml")
        content = dict2xml.dict2xml(entry.compile(), wrap="root")
        self._write_content_to_disk(resource, content)

    def _write_entry_yaml(self, entry: BaseArchiveEntry):
        """Serialise the compiled entry with ``yaml.safe_dump`` and write it to a ``.yaml`` file."""
        resource = Resource(entry.source, "", lambda: None, ".yaml")
        content = yaml.safe_dump(entry.compile())
        self._write_content_to_disk(resource, content)

    def _write_content_to_disk(self, resource: Resource, content: str):
        """Write ``content`` as UTF-8 text to the path the file name formatter derives for ``resource``.

        Missing parent directories are created first.
        """
        file_path = self.file_name_formatter.format_path(resource, self.download_directory)
        file_path.parent.mkdir(exist_ok=True, parents=True)
        with Path(file_path).open(mode="w", encoding="utf-8") as file:
            logger.debug(
                f"Writing entry {resource.source_submission.id} to file in {resource.extension[1:].upper()}"
                f" format at {file_path}"
            )
            file.write(content)