From 3aa740e979ffd976508009a8e4c80934eeb3eaeb Mon Sep 17 00:00:00 2001 From: OMEGARAZER <869111+OMEGARAZER@users.noreply.github.com> Date: Sat, 10 Dec 2022 12:36:54 -0500 Subject: [PATCH 1/3] Add soft fail on 5xx Prawcore errors. --- bdfr/connector.py | 44 +++++++++++++++++++++++++------------------- bdfr/downloader.py | 16 +++++++++++----- 2 files changed, 36 insertions(+), 24 deletions(-) diff --git a/bdfr/connector.py b/bdfr/connector.py index ea970db..e5d74a2 100644 --- a/bdfr/connector.py +++ b/bdfr/connector.py @@ -13,6 +13,7 @@ from abc import ABCMeta, abstractmethod from datetime import datetime from enum import Enum, auto from pathlib import Path +from time import sleep from typing import Callable, Iterator import appdirs @@ -353,26 +354,31 @@ class RedditConnector(metaclass=ABCMeta): generators = [] for user in self.args.user: try: - self.check_user_existence(user) - except errors.BulkDownloaderException as e: - logger.error(e) - continue - if self.args.submitted: - logger.debug(f"Retrieving submitted posts of user {self.args.user}") - generators.append( - self.create_filtered_listing_generator( - self.reddit_instance.redditor(user).submissions, + try: + self.check_user_existence(user) + except errors.BulkDownloaderException as e: + logger.error(e) + continue + if self.args.submitted: + logger.debug(f"Retrieving submitted posts of user {user}") + generators.append( + self.create_filtered_listing_generator( + self.reddit_instance.redditor(user).submissions, + ) ) - ) - if not self.authenticated and any((self.args.upvoted, self.args.saved)): - logger.warning("Accessing user lists requires authentication") - else: - if self.args.upvoted: - logger.debug(f"Retrieving upvoted posts of user {self.args.user}") - generators.append(self.reddit_instance.redditor(user).upvoted(limit=self.args.limit)) - if self.args.saved: - logger.debug(f"Retrieving saved posts of user {self.args.user}") - generators.append(self.reddit_instance.redditor(user).saved(limit=self.args.limit)) + if not self.authenticated and any((self.args.upvoted, self.args.saved)): + logger.warning("Accessing user lists requires authentication") + else: + if self.args.upvoted: + logger.debug(f"Retrieving upvoted posts of user {user}") + generators.append(self.reddit_instance.redditor(user).upvoted(limit=self.args.limit)) + if self.args.saved: + logger.debug(f"Retrieving saved posts of user {user}") + generators.append(self.reddit_instance.redditor(user).saved(limit=self.args.limit)) + except prawcore.PrawcoreException as e: + logger.error(f"User {user} failed to be retrieved due to a PRAW exception: {e}") + logger.debug("Waiting 60 seconds to continue") + sleep(60) return generators else: return [] diff --git a/bdfr/downloader.py b/bdfr/downloader.py index fa5d10c..1cb6d46 100644 --- a/bdfr/downloader.py +++ b/bdfr/downloader.py @@ -8,6 +8,7 @@ import time from datetime import datetime from multiprocessing import Pool from pathlib import Path +from time import sleep import praw import praw.exceptions @@ -42,11 +43,16 @@ class RedditDownloader(RedditConnector): def download(self): for generator in self.reddit_lists: - for submission in generator: - try: - self._download_submission(submission) - except prawcore.PrawcoreException as e: - logger.error(f"Submission {submission.id} failed to download due to a PRAW exception: {e}") + try: + for submission in generator: + try: + self._download_submission(submission) + except prawcore.PrawcoreException as e: + logger.error(f"Submission {submission.id} failed to download due to a PRAW exception: {e}") + except prawcore.PrawcoreException as e: + logger.error(f"The submission after {submission.id} failed to download due to a PRAW exception: {e}") + logger.debug("Waiting 60 seconds to continue") + sleep(60) def _download_submission(self, submission: praw.models.Submission): if submission.id in self.excluded_submission_ids: From ac91c9089c652e77a23ea44a556632e4d8e17636 Mon Sep 17 00:00:00 2001 From: OMEGARAZER <869111+OMEGARAZER@users.noreply.github.com> Date: Sat, 10 Dec 2022 21:19:29 -0500 Subject: [PATCH 2/3] Add 5xx soft fail for clone/archive --- bdfr/archiver.py | 40 +++++++++++++++++++++++----------------- bdfr/cloner.py | 18 ++++++++++++------ 2 files changed, 35 insertions(+), 23 deletions(-) diff --git a/bdfr/archiver.py b/bdfr/archiver.py index 28a270b..e2ed33d 100644 --- a/bdfr/archiver.py +++ b/bdfr/archiver.py @@ -4,6 +4,7 @@ import json import logging import re +from time import sleep from typing import Iterator, Union import dict2xml @@ -28,23 +29,28 @@ class Archiver(RedditConnector): def download(self): for generator in self.reddit_lists: - for submission in generator: - try: - if (submission.author and submission.author.name in self.args.ignore_user) or ( - submission.author is None and "DELETED" in self.args.ignore_user - ): - logger.debug( - f"Submission {submission.id} in {submission.subreddit.display_name} skipped" - f" due to {submission.author.name if submission.author else 'DELETED'} being an ignored user" - ) - continue - if submission.id in self.excluded_submission_ids: - logger.debug(f"Object {submission.id} in exclusion list, skipping") - continue - logger.debug(f"Attempting to archive submission {submission.id}") - self.write_entry(submission) - except prawcore.PrawcoreException as e: - logger.error(f"Submission {submission.id} failed to be archived due to a PRAW exception: {e}") + try: + for submission in generator: + try: + if (submission.author and submission.author.name in self.args.ignore_user) or ( + submission.author is None and "DELETED" in self.args.ignore_user + ): + logger.debug( + f"Submission {submission.id} in {submission.subreddit.display_name} skipped due to" + f" {submission.author.name if submission.author else 'DELETED'} being an ignored user" + ) + continue + if submission.id in self.excluded_submission_ids: + logger.debug(f"Object {submission.id} in exclusion list, skipping") + continue + logger.debug(f"Attempting to archive submission {submission.id}") + self.write_entry(submission) + except prawcore.PrawcoreException as e: + logger.error(f"Submission {submission.id} failed to be archived due to a PRAW exception: {e}") + except prawcore.PrawcoreException as e: + logger.error(f"The submission after {submission.id} failed to download due to a PRAW exception: {e}") + logger.debug("Waiting 60 seconds to continue") + sleep(60) def get_submissions_from_link(self) -> list[list[praw.models.Submission]]: supplied_submissions = [] diff --git a/bdfr/cloner.py b/bdfr/cloner.py index c26d17b..e82cfaa 100644 --- a/bdfr/cloner.py +++ b/bdfr/cloner.py @@ -2,6 +2,7 @@ # coding=utf-8 import logging +from time import sleep import prawcore @@ -18,9 +19,14 @@ class RedditCloner(RedditDownloader, Archiver): def download(self): for generator in self.reddit_lists: - for submission in generator: - try: - self._download_submission(submission) - self.write_entry(submission) - except prawcore.PrawcoreException as e: - logger.error(f"Submission {submission.id} failed to be cloned due to a PRAW exception: {e}") + try: + for submission in generator: + try: + self._download_submission(submission) + self.write_entry(submission) + except prawcore.PrawcoreException as e: + logger.error(f"Submission {submission.id} failed to be cloned due to a PRAW exception: {e}") + except prawcore.PrawcoreException as e: + logger.error(f"The submission after {submission.id} failed to download due to a PRAW exception: {e}") + logger.debug("Waiting 60 seconds to continue") + sleep(60) From 4ba5df6b3728c08980ab1fe3f99bbf051a8168c3 Mon Sep 17 00:00:00 2001 From: OMEGARAZER <869111+OMEGARAZER@users.noreply.github.com> Date: Wed, 14 Dec 2022 23:04:33 -0500 Subject: [PATCH 3/3] 5xx error tests --- .../test_archive_integration.py | 29 +++++++++++++++++++ .../test_clone_integration.py | 29 +++++++++++++++++++ .../test_download_integration.py | 29 +++++++++++++++++++ 3 files changed, 87 insertions(+) diff --git a/tests/integration_tests/test_archive_integration.py b/tests/integration_tests/test_archive_integration.py index f10f37c..1c0d30a 100644 --- a/tests/integration_tests/test_archive_integration.py +++ b/tests/integration_tests/test_archive_integration.py @@ -4,7 +4,9 @@ import re import shutil from pathlib import Path +from unittest.mock import MagicMock, patch +import prawcore import pytest from click.testing import CliRunner @@ -176,3 +178,30 @@ def test_cli_archive_soft_fail(test_args: list[str], tmp_path: Path): assert result.exit_code == 0 assert "failed to be archived due to a PRAW exception" in result.output assert "Attempting to archive" not in result.output + + +@pytest.mark.skipif(not does_test_config_exist, reason="A test config file is required for integration tests") +@pytest.mark.parametrize( + ("test_args", "response"), + ( + ( + ["--user", "nasa", "--submitted"], + 502, + ), + ( + ["--user", "nasa", "--submitted"], + 504, + ), + ), +) +def test_user_serv_fail(test_args: list[str], response: int, tmp_path: Path): + runner = CliRunner() + test_args = create_basic_args_for_archive_runner(test_args, tmp_path) + with patch("bdfr.connector.sleep", return_value=None): + with patch( + "bdfr.connector.RedditConnector.check_user_existence", + side_effect=prawcore.exceptions.ResponseException(MagicMock(status_code=response)), + ): + result = runner.invoke(cli, test_args) + assert result.exit_code == 0 + assert f"received {response} HTTP response" in result.output diff --git a/tests/integration_tests/test_clone_integration.py b/tests/integration_tests/test_clone_integration.py index e8dc008..eb64364 100644 --- a/tests/integration_tests/test_clone_integration.py +++ b/tests/integration_tests/test_clone_integration.py @@ -3,7 +3,9 @@ import shutil from pathlib import Path +from unittest.mock import MagicMock, patch +import prawcore import pytest from click.testing import CliRunner @@ -68,3 +70,30 @@ def test_cli_scrape_soft_fail(test_args: list[str], tmp_path: Path): assert result.exit_code == 0 assert "Downloaded submission" not in result.output assert "Record for entry item" not in result.output + + +@pytest.mark.skipif(not does_test_config_exist, reason="A test config file is required for integration tests") +@pytest.mark.parametrize( + ("test_args", "response"), + ( + ( + ["--user", "nasa", "--submitted"], + 502, + ), + ( + ["--user", "nasa", "--submitted"], + 504, + ), + ), +) +def test_user_serv_fail(test_args: list[str], response: int, tmp_path: Path): + runner = CliRunner() + test_args = create_basic_args_for_cloner_runner(test_args, tmp_path) + with patch("bdfr.connector.sleep", return_value=None): + with patch( + "bdfr.connector.RedditConnector.check_user_existence", + side_effect=prawcore.exceptions.ResponseException(MagicMock(status_code=response)), + ): + result = runner.invoke(cli, test_args) + assert result.exit_code == 0 + assert f"received {response} HTTP response" in result.output diff --git a/tests/integration_tests/test_download_integration.py b/tests/integration_tests/test_download_integration.py index 2ab38a0..e44c95e 100644 --- a/tests/integration_tests/test_download_integration.py +++ b/tests/integration_tests/test_download_integration.py @@ -3,7 +3,9 @@ import shutil from pathlib import Path +from unittest.mock import MagicMock, patch +import prawcore import pytest from click.testing import CliRunner @@ -396,3 +398,30 @@ def test_cli_download_score_filter(test_args: list[str], was_filtered: bool, tmp result = runner.invoke(cli, test_args) assert result.exit_code == 0 assert ("filtered due to score" in result.output) == was_filtered + + +@pytest.mark.skipif(not does_test_config_exist, reason="A test config file is required for integration tests") +@pytest.mark.parametrize( + ("test_args", "response"), + ( + ( + ["--user", "nasa", "--submitted"], + 502, + ), + ( + ["--user", "nasa", "--submitted"], + 504, + ), + ), +) +def test_user_serv_fail(test_args: list[str], response: int, tmp_path: Path): + runner = CliRunner() + test_args = create_basic_args_for_download_runner(test_args, tmp_path) + with patch("bdfr.connector.sleep", return_value=None): + with patch( + "bdfr.connector.RedditConnector.check_user_existence", + side_effect=prawcore.exceptions.ResponseException(MagicMock(status_code=response)), + ): + result = runner.invoke(cli, test_args) + assert result.exit_code == 0 + assert f"received {response} HTTP response" in result.output