2021-02-11 12:10:40 +13:00
|
|
|
#!/usr/bin/env python3
|
|
|
|
|
2021-03-11 17:18:48 +13:00
|
|
|
import hashlib
|
2021-04-05 16:47:39 +12:00
|
|
|
import logging.handlers
|
2021-03-11 17:18:48 +13:00
|
|
|
import os
|
2021-05-17 22:49:35 +12:00
|
|
|
import time
|
2023-01-26 16:23:59 +13:00
|
|
|
from collections.abc import Iterable
|
2021-02-11 12:10:40 +13:00
|
|
|
from datetime import datetime
|
2021-03-26 13:42:51 +13:00
|
|
|
from multiprocessing import Pool
|
2021-02-11 12:10:40 +13:00
|
|
|
from pathlib import Path
|
2022-12-11 06:36:54 +13:00
|
|
|
from time import sleep
|
2021-02-11 12:10:40 +13:00
|
|
|
|
|
|
|
import praw
|
2021-03-11 20:18:21 +13:00
|
|
|
import praw.exceptions
|
2021-02-11 12:10:40 +13:00
|
|
|
import praw.models
|
2022-11-20 21:54:56 +13:00
|
|
|
import prawcore
|
2021-02-11 12:10:40 +13:00
|
|
|
|
2021-05-17 12:56:44 +12:00
|
|
|
from bdfr import exceptions as errors
|
2021-04-12 19:58:32 +12:00
|
|
|
from bdfr.configuration import Configuration
|
2021-05-17 12:56:44 +12:00
|
|
|
from bdfr.connector import RedditConnector
|
2021-04-12 19:58:32 +12:00
|
|
|
from bdfr.site_downloaders.download_factory import DownloadFactory
|
2021-02-11 12:10:40 +13:00
|
|
|
|
|
|
|
# Module-level logger shared by every function and method in this file.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
|
|
|
2023-02-19 14:32:48 +13:00
|
|
|
def _calc_hash(existing_file: Path) -> tuple[Path, str]:
|
2021-05-23 14:13:44 +12:00
|
|
|
chunk_size = 1024 * 1024
|
2023-02-05 07:24:38 +13:00
|
|
|
md5_hash = hashlib.md5(usedforsecurity=False)
|
2022-12-03 18:11:17 +13:00
|
|
|
with existing_file.open("rb") as file:
|
2021-05-23 14:13:44 +12:00
|
|
|
chunk = file.read(chunk_size)
|
2021-05-22 06:41:57 +12:00
|
|
|
while chunk:
|
|
|
|
md5_hash.update(chunk)
|
2021-05-23 14:13:44 +12:00
|
|
|
chunk = file.read(chunk_size)
|
2021-05-22 06:41:57 +12:00
|
|
|
file_hash = md5_hash.hexdigest()
|
|
|
|
return existing_file, file_hash
|
2021-03-26 13:42:51 +13:00
|
|
|
|
|
|
|
|
2021-05-17 12:56:44 +12:00
|
|
|
class RedditDownloader(RedditConnector):
    """Connector subclass that actually downloads submission media to disk."""

    def __init__(self, args: Configuration, logging_handlers: Iterable[logging.Handler] = ()) -> None:
        super().__init__(args, logging_handlers)
        # When the search_existing option is set, pre-hash every file already
        # in the download directory so previously-downloaded resources can be
        # recognised (dupe skipping / hard-linking) before any network work.
        if self.args.search_existing:
            self.master_hash_list = self.scan_existing_files(self.download_directory)
|
2021-02-26 21:56:21 +13:00
|
|
|
|
2023-02-19 14:16:28 +13:00
|
|
|
def download(self) -> None:
|
2021-02-11 12:10:40 +13:00
|
|
|
for generator in self.reddit_lists:
|
2022-12-11 06:36:54 +13:00
|
|
|
try:
|
|
|
|
for submission in generator:
|
|
|
|
try:
|
|
|
|
self._download_submission(submission)
|
|
|
|
except prawcore.PrawcoreException as e:
|
|
|
|
logger.error(f"Submission {submission.id} failed to download due to a PRAW exception: {e}")
|
|
|
|
except prawcore.PrawcoreException as e:
|
|
|
|
logger.error(f"The submission after {submission.id} failed to download due to a PRAW exception: {e}")
|
|
|
|
logger.debug("Waiting 60 seconds to continue")
|
|
|
|
sleep(60)
|
2021-02-11 12:10:40 +13:00
|
|
|
|
2023-02-19 14:32:48 +13:00
|
|
|
    def _download_submission(self, submission: praw.models.Submission) -> None:
        """Download all resources belonging to a single submission.

        Runs the submission through the configured filters (exclusion list,
        skipped subreddits, ignored users, score/ratio bounds, URL filter),
        resolves a site-specific downloader, fetches each resource, and
        writes it to its formatted destination path.  Every rejection path
        logs the reason and returns without raising.
        """
        # --- submission-level filters; order matters, first match wins ---
        if submission.id in self.excluded_submission_ids:
            logger.debug(f"Object {submission.id} in exclusion list, skipping")
            return
        elif submission.subreddit.display_name.lower() in self.args.skip_subreddit:
            logger.debug(f"Submission {submission.id} in {submission.subreddit.display_name} in skip list")
            return
        # A deleted author is represented by submission.author being None and
        # matched against the literal "DELETED" entry in the ignore list.
        elif (submission.author and submission.author.name in self.args.ignore_user) or (
            submission.author is None and "DELETED" in self.args.ignore_user
        ):
            logger.debug(
                f"Submission {submission.id} in {submission.subreddit.display_name} skipped"
                f" due to {submission.author.name if submission.author else 'DELETED'} being an ignored user"
            )
            return
        # Score bounds: a falsy (0/None) min/max disables that bound.
        elif self.args.min_score and submission.score < self.args.min_score:
            logger.debug(
                f"Submission {submission.id} filtered due to score {submission.score} < [{self.args.min_score}]"
            )
            return
        elif self.args.max_score and self.args.max_score < submission.score:
            logger.debug(
                f"Submission {submission.id} filtered due to score {submission.score} > [{self.args.max_score}]"
            )
            return
        elif (self.args.min_score_ratio and submission.upvote_ratio < self.args.min_score_ratio) or (
            self.args.max_score_ratio and self.args.max_score_ratio < submission.upvote_ratio
        ):
            logger.debug(f"Submission {submission.id} filtered due to score ratio ({submission.upvote_ratio})")
            return
        # NOTE(review): this isinstance guard sits after several attribute
        # accesses on `submission`, so a non-Submission object would likely
        # have raised already — kept as-is; consider hoisting it first.
        elif not isinstance(submission, praw.models.Submission):
            logger.warning(f"{submission.id} is not a submission")
            return
        elif not self.download_filter.check_url(submission.url):
            logger.debug(f"Submission {submission.id} filtered due to URL {submission.url}")
            return

        logger.debug(f"Attempting to download submission {submission.id}")
        # Resolve the site-specific downloader class from the URL.
        try:
            downloader_class = DownloadFactory.pull_lever(submission.url)
            downloader = downloader_class(submission)
            logger.debug(f"Using {downloader_class.__name__} with url {submission.url}")
        except errors.NotADownloadableLinkError as e:
            logger.error(f"Could not download submission {submission.id}: {e}")
            return
        # Respect per-module disabling (matched case-insensitively by class name).
        if downloader_class.__name__.lower() in self.args.disable_module:
            logger.debug(f"Submission {submission.id} skipped due to disabled module {downloader_class.__name__}")
            return
        try:
            content = downloader.find_resources(self.authenticator)
        except errors.SiteDownloaderError as e:
            logger.error(f"Site {downloader_class.__name__} failed to download submission {submission.id}: {e}")
            return
        # --- per-resource download loop ---
        for destination, res in self.file_name_formatter.format_resource_paths(content, self.download_directory):
            if destination.exists():
                logger.debug(f"File {destination} from submission {submission.id} already exists, continuing")
                continue
            elif not self.download_filter.check_resource(res):
                logger.debug(f"Download filter removed {submission.id} file with URL {submission.url}")
                continue
            try:
                res.download({"max_wait_time": self.args.max_wait_time})
            except errors.BulkDownloaderException as e:
                logger.error(
                    f"Failed to download resource {res.url} in submission {submission.id} "
                    f"with downloader {downloader_class.__name__}: {e}"
                )
                return
            resource_hash = res.hash.hexdigest()
            # Duplicate handling: skip entirely (--no-dupes) or hard-link to
            # the previously downloaded copy (--make-hard-links).
            if resource_hash in self.master_hash_list:
                if self.args.no_dupes:
                    logger.info(f"Resource hash {resource_hash} from submission {submission.id} downloaded elsewhere")
                    return
                elif self.args.make_hard_links:
                    destination.parent.mkdir(parents=True, exist_ok=True)
                    try:
                        # Path.hardlink_to is Python 3.10+; fall back to the
                        # older (reversed-direction) Path.link_to on AttributeError.
                        destination.hardlink_to(self.master_hash_list[resource_hash])
                    except AttributeError:
                        self.master_hash_list[resource_hash].link_to(destination)
                    logger.info(
                        f"Hard link made linking {destination} to {self.master_hash_list[resource_hash]}"
                        f" in submission {submission.id}"
                    )
                    return
            destination.parent.mkdir(parents=True, exist_ok=True)
            try:
                with destination.open("wb") as file:
                    file.write(res.content)
                logger.debug(f"Written file to {destination}")
            except OSError as e:
                logger.exception(e)
                logger.error(f"Failed to write file in submission {submission.id} to {destination}: {e}")
                return
            # Stamp the file's atime/mtime with the submission's creation time.
            creation_time = time.mktime(datetime.fromtimestamp(submission.created_utc).timetuple())
            os.utime(destination, (creation_time, creation_time))
            # Record the hash so later submissions can detect this file as a dupe.
            self.master_hash_list[resource_hash] = destination
            logger.debug(f"Hash added to master list: {resource_hash}")
        logger.info(f"Downloaded submission {submission.id} from {submission.subreddit.display_name}")
|
2021-03-12 16:24:25 +13:00
|
|
|
|
|
|
|
@staticmethod
|
2021-03-20 15:03:53 +13:00
|
|
|
def scan_existing_files(directory: Path) -> dict[str, Path]:
|
2021-03-11 17:18:48 +13:00
|
|
|
files = []
|
2023-02-03 05:50:47 +13:00
|
|
|
for dirpath, _dirnames, filenames in os.walk(directory):
|
2021-03-11 17:18:48 +13:00
|
|
|
files.extend([Path(dirpath, file) for file in filenames])
|
2022-12-03 18:11:17 +13:00
|
|
|
logger.info(f"Calculating hashes for {len(files)} files")
|
2021-03-26 13:42:51 +13:00
|
|
|
|
|
|
|
pool = Pool(15)
|
|
|
|
results = pool.map(_calc_hash, files)
|
|
|
|
pool.close()
|
|
|
|
|
|
|
|
hash_list = {res[1]: res[0] for res in results}
|
2021-03-11 17:18:48 +13:00
|
|
|
return hash_list
|