diff --git a/README.md b/README.md
index 97dcc97..fd5a3b7 100644
--- a/README.md
+++ b/README.md
@@ -26,16 +26,24 @@ If you want to use the source code or make contributions, refer to [CONTRIBUTING
 The BDFR works by taking submissions from a variety of "sources" from Reddit and then parsing them to download. These sources might be a subreddit, multireddit, a user list, or individual links. These sources are combined and downloaded to disk, according to a naming and organisational scheme defined by the user.
 
-There are two modes to the BDFR: download, and archive. Each one has a command that performs similar but distinct functions. The `download` command will download the resource linked in the Reddit submission, such as the images, video, etc. The `archive` command will download the submission data itself and store it, such as the submission details, upvotes, text, statistics, as and all the comments on that submission. These can then be saved in a data markup language form, such as JSON, XML, or YAML.
+There are three modes to the BDFR: download, archive, and clone. Each one has a command that performs similar but distinct functions. The `download` command will download the resource linked in the Reddit submission, such as images, videos, etc. The `archive` command will download the submission data itself and store it, such as the submission details, upvotes, text, and statistics, as well as all the comments on that submission. These can then be saved in a data markup language such as JSON, XML, or YAML. Lastly, the `clone` command will perform both functions of the previous commands at once and is more efficient than running those commands sequentially.
+
+Note that the `clone` command is not a true, faithful clone of Reddit. It simply retrieves much of the raw data that Reddit provides. To get a true clone of Reddit, another tool such as HTTrack should be used.
 
 After installation, run the program from any directory as shown below:
+
 ```bash
 python3 -m bdfr download
 ```
+
 ```bash
 python3 -m bdfr archive
 ```
+
+```bash
+python3 -m bdfr clone
+```
+
 However, these commands are not enough. You should chain parameters in [Options](#options) according to your use case. Don't forget that some parameters can be provided multiple times. Some quick reference commands are:
 
 ```bash
@@ -184,6 +192,10 @@ The following options are for the `archive` command specifically.
 - `xml`
 - `yaml`
 
+### Cloner Options
+
+The `clone` command can take all the options listed above for both the `archive` and `download` commands since it performs the functions of both.
+
 ## Authentication and Security
 
 The BDFR uses OAuth2 authentication to connect to Reddit if authentication is required. This means that it is a secure, token-based system for making requests. This also means that the BDFR only has access to specific parts of the account authenticated, by default only saved posts, upvoted posts, and the identity of the authenticated account. Note that authentication is not required unless accessing private things like upvoted posts, saved posts, and private multireddits.
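As a quick illustration of the new mode (a usage sketch, not part of the diff above), `clone` chains with the shared options in the same way as `download`; the flags shown are the `--subreddit` and `-L` limit options the README's quick-reference section already uses:

```bash
# Hypothetical example: clone up to 10 submissions from r/Python,
# saving both the linked media and the submission/comment records.
python3 -m bdfr clone ./path/to/output --subreddit Python -L 10
```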
diff --git a/bdfr/__main__.py b/bdfr/__main__.py
index cf039a5..0d299c9 100644
--- a/bdfr/__main__.py
+++ b/bdfr/__main__.py
@@ -8,6 +8,7 @@ import click
 from bdfr.archiver import Archiver
 from bdfr.configuration import Configuration
 from bdfr.downloader import RedditDownloader
+from bdfr.cloner import RedditCloner
 
 logger = logging.getLogger()
 
@@ -32,11 +33,32 @@ _common_options = [
         'controversial', 'rising', 'relevance')), default=None),
 ]
 
+_downloader_options = [
+    click.option('--file-scheme', default=None, type=str),
+    click.option('--folder-scheme', default=None, type=str),
+    click.option('--make-hard-links', is_flag=True, default=None),
+    click.option('--max-wait-time', type=int, default=None),
+    click.option('--no-dupes', is_flag=True, default=None),
+    click.option('--search-existing', is_flag=True, default=None),
+    click.option('--exclude-id', default=None, multiple=True),
+    click.option('--exclude-id-file', default=None, multiple=True),
+    click.option('--skip', default=None, multiple=True),
+    click.option('--skip-domain', default=None, multiple=True),
+    click.option('--skip-subreddit', default=None, multiple=True),
+]
 
-def _add_common_options(func):
-    for opt in _common_options:
-        func = opt(func)
-    return func
+_archiver_options = [
+    click.option('--all-comments', is_flag=True, default=None),
+    click.option('-f', '--format', type=click.Choice(('xml', 'json', 'yaml')), default=None),
+]
+
+
+def _add_options(opts: list):
+    def wrap(func):
+        for opt in opts:
+            func = opt(func)
+        return func
+    return wrap
 
 
 @click.group()
@@ -45,18 +67,8 @@ def cli():
 
 
 @cli.command('download')
-@click.option('--file-scheme', default=None, type=str)
-@click.option('--folder-scheme', default=None, type=str)
-@click.option('--make-hard-links', is_flag=True, default=None)
-@click.option('--max-wait-time', type=int, default=None)
-@click.option('--no-dupes', is_flag=True, default=None)
-@click.option('--search-existing', is_flag=True, default=None)
-@click.option('--exclude-id', default=None, multiple=True)
-@click.option('--exclude-id-file', default=None, multiple=True)
-@click.option('--skip', default=None, multiple=True)
-@click.option('--skip-domain', default=None, multiple=True)
-@click.option('--skip-subreddit', default=None, multiple=True)
-@_add_common_options
+@_add_options(_common_options)
+@_add_options(_downloader_options)
 @click.pass_context
 def cli_download(context: click.Context, **_):
     config = Configuration()
@@ -73,9 +85,8 @@ def cli_download(context: click.Context, **_):
 
 
 @cli.command('archive')
-@_add_common_options
-@click.option('--all-comments', is_flag=True, default=None)
-@click.option('-f', '--format', type=click.Choice(('xml', 'json', 'yaml')), default=None)
+@_add_options(_common_options)
+@_add_options(_archiver_options)
 @click.pass_context
 def cli_archive(context: click.Context, **_):
     config = Configuration()
@@ -85,7 +96,26 @@ def cli_archive(context: click.Context, **_):
         reddit_archiver = Archiver(config)
         reddit_archiver.download()
     except Exception:
-        logger.exception('Downloader exited unexpectedly')
+        logger.exception('Archiver exited unexpectedly')
         raise
     else:
         logger.info('Program complete')
+
+
+@cli.command('clone')
+@_add_options(_common_options)
+@_add_options(_archiver_options)
+@_add_options(_downloader_options)
+@click.pass_context
+def cli_clone(context: click.Context, **_):
+    config = Configuration()
+    config.process_click_arguments(context)
+    setup_logging(config.verbose)
+    try:
+        reddit_scraper = RedditCloner(config)
+        reddit_scraper.download()
+    except Exception:
+        logger.exception('Scraper exited unexpectedly')
+        raise
+    else:
+        logger.info('Program complete')
diff --git a/bdfr/cloner.py b/bdfr/cloner.py
new file mode 100644
index 0000000..979f50f
--- /dev/null
+++ b/bdfr/cloner.py
@@ -0,0 +1,21 @@
+#!/usr/bin/env python3
+# coding=utf-8
+
+import logging
+
+from bdfr.archiver import Archiver
+from bdfr.configuration import Configuration
+from bdfr.downloader import RedditDownloader
+
+logger = logging.getLogger(__name__)
+
+
+class RedditCloner(RedditDownloader, Archiver):
+    def __init__(self, args: Configuration):
+        super(RedditCloner, self).__init__(args)
+
+    def download(self):
+        for generator in self.reddit_lists:
+            for submission in generator:
+                self._download_submission(submission)
+                self.write_entry(submission)
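For context on the design: `RedditCloner` combines the downloader and the archiver through multiple inheritance, so one instance can both fetch the linked resources and write the submission records in a single pass over the submission generators. A minimal, self-contained sketch of that pattern, using hypothetical stand-in classes rather than the real BDFR types:

```python
# Illustrative stand-ins for the BDFR classes; the names are hypothetical.
class Connector:
    def __init__(self, args):
        self.args = args  # shared state, initialised exactly once


class Downloader(Connector):
    def fetch(self, item):
        print(f'downloading {item}')


class Archiver(Connector):
    def record(self, item):
        print(f'archiving {item}')


class Cloner(Downloader, Archiver):
    def run(self, items):
        for item in items:
            self.fetch(item)   # downloader behaviour
            self.record(item)  # archiver behaviour


# MRO is Cloner -> Downloader -> Archiver -> Connector -> object, so the
# single super().__init__ chain reaches Connector.__init__ exactly once.
Cloner(args=None).run(['abc123'])
print(Cloner.__mro__)
```

Because both parents funnel into one shared connector base, a clone run sets up one session and iterates the submission lists once, which is the efficiency gain over running `download` and `archive` sequentially that the README describes.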
diff --git a/bdfr/downloader.py b/bdfr/downloader.py
index 6733691..3b2c581 100644
--- a/bdfr/downloader.py
+++ b/bdfr/downloader.py
@@ -42,19 +42,20 @@ class RedditDownloader(RedditConnector):
     def download(self):
         for generator in self.reddit_lists:
             for submission in generator:
-                if submission.id in self.excluded_submission_ids:
-                    logger.debug(f'Object {submission.id} in exclusion list, skipping')
-                    continue
-                elif submission.subreddit.display_name.lower() in self.args.skip_subreddit:
-                    logger.debug(f'Submission {submission.id} in {submission.subreddit.display_name} in skip list')
-                else:
-                    logger.debug(f'Attempting to download submission {submission.id}')
-                    self._download_submission(submission)
+                self._download_submission(submission)
 
     def _download_submission(self, submission: praw.models.Submission):
-        if not isinstance(submission, praw.models.Submission):
+        if submission.id in self.excluded_submission_ids:
+            logger.debug(f'Object {submission.id} in exclusion list, skipping')
+            return
+        elif submission.subreddit.display_name.lower() in self.args.skip_subreddit:
+            logger.debug(f'Submission {submission.id} in {submission.subreddit.display_name} in skip list')
+            return
+        elif not isinstance(submission, praw.models.Submission):
             logger.warning(f'{submission.id} is not a submission')
             return
+
+        logger.debug(f'Attempting to download submission {submission.id}')
         try:
             downloader_class = DownloadFactory.pull_lever(submission.url)
             downloader = downloader_class(submission)
diff --git a/tests/test_downloader.py b/tests/test_downloader.py
index b4f175d..d67aee6 100644
--- a/tests/test_downloader.py
+++ b/tests/test_downloader.py
@@ -4,11 +4,12 @@ import os
 import re
 from pathlib import Path
-from unittest.mock import MagicMock
+from unittest.mock import MagicMock, patch
 
 import praw.models
 import pytest
 
+import bdfr.site_downloaders.download_factory
 from bdfr.__main__ import setup_logging
 from bdfr.configuration import Configuration
 from bdfr.connector import RedditConnector
@@ -37,17 +38,30 @@ def downloader_mock(args: Configuration):
     (('aaaaaa',), ('aaaaaa',), 0),
     ((), ('aaaaaa',), 0),
     (('aaaaaa', 'bbbbbb'), ('aaaaaa',), 1),
+    (('aaaaaa', 'bbbbbb', 'cccccc'), ('aaaaaa',), 2),
 ))
-def test_excluded_ids(test_ids: tuple[str], test_excluded: tuple[str], expected_len: int, downloader_mock: MagicMock):
+@patch('bdfr.site_downloaders.download_factory.DownloadFactory.pull_lever')
+def test_excluded_ids(
+        mock_function: MagicMock,
+        test_ids: tuple[str],
+        test_excluded: tuple[str],
+        expected_len: int,
+        downloader_mock: MagicMock,
+):
     downloader_mock.excluded_submission_ids = test_excluded
+    mock_function.return_value = MagicMock()
+    mock_function.return_value.__name__ = 'test'
     test_submissions = []
     for test_id in test_ids:
         m = MagicMock()
         m.id = test_id
+        m.subreddit.display_name.return_value = 'https://www.example.com/'
+        m.__class__ = praw.models.Submission
         test_submissions.append(m)
     downloader_mock.reddit_lists = [test_submissions]
-    RedditDownloader.download(downloader_mock)
-    assert downloader_mock._download_submission.call_count == expected_len
+    for submission in test_submissions:
+        RedditDownloader._download_submission(downloader_mock, submission)
+    assert mock_function.call_count == expected_len
 
 
 @pytest.mark.online
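The reworked test follows the filtering logic into `_download_submission`: instead of mocking `_download_submission` itself, it calls the real method and patches `DownloadFactory.pull_lever` so nothing is actually downloaded (the mock is given a `__name__` because the downloader logs the resolved class name). A self-contained sketch of that patching technique, using a hypothetical toy factory rather than the BDFR test fixtures:

```python
# Sketch of the unittest.mock.patch approach used in test_excluded_ids;
# 'Factory' here is a toy stand-in, not the BDFR DownloadFactory.
from unittest.mock import MagicMock, patch


class Factory:
    @staticmethod
    def pull_lever(url: str):
        raise RuntimeError('would construct a real downloader')


@patch(f'{__name__}.Factory.pull_lever')
def run_test(mock_pull: MagicMock):
    # Give the mocked return value a __name__, since callers may log it.
    mock_pull.return_value = MagicMock()
    mock_pull.return_value.__name__ = 'test'
    Factory.pull_lever('https://www.example.com/')  # intercepted by the mock
    assert mock_pull.call_count == 1


run_test()
```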
diff --git a/tests/test_integration.py b/tests/test_integration.py
index 2ff1909..ed67f03 100644
--- a/tests/test_integration.py
+++ b/tests/test_integration.py
@@ -33,6 +33,17 @@ def create_basic_args_for_archive_runner(test_args: list[str], tmp_path: Path):
     return out
 
 
+def create_basic_args_for_cloner_runner(test_args: list[str], tmp_path: Path):
+    out = [
+        'clone',
+        str(tmp_path),
+        '-v',
+        '--config', 'test_config.cfg',
+        '--log', str(Path(tmp_path, 'test_log.txt')),
+    ] + test_args
+    return out
+
+
 @pytest.mark.online
 @pytest.mark.reddit
 @pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests')
@@ -343,3 +354,19 @@ def test_cli_file_scheme_warning(test_args: list[str], tmp_path: Path):
     result = runner.invoke(cli, test_args)
     assert result.exit_code == 0
     assert 'Some files might not be downloaded due to name conflicts' in result.output
+
+
+@pytest.mark.online
+@pytest.mark.reddit
+@pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests')
+@pytest.mark.parametrize('test_args', (
+    ['-l', 'm2601g'],
+    ['-s', 'TrollXChromosomes/', '-L', 1],
+))
+def test_cli_scrape_general(test_args: list[str], tmp_path: Path):
+    runner = CliRunner()
+    test_args = create_basic_args_for_cloner_runner(test_args, tmp_path)
+    result = runner.invoke(cli, test_args)
+    assert result.exit_code == 0
+    assert 'Downloaded submission' in result.output
+    assert 'Record for entry item' in result.output
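The new integration test drives the whole `clone` pipeline through click's `CliRunner` and asserts on log lines emitted by both halves: the downloader ('Downloaded submission') and the archiver ('Record for entry item'). A minimal, self-contained sketch of that testing pattern, with a toy command standing in for the real `cli` group and its config handling:

```python
# Toy command demonstrating the CliRunner pattern; the real test invokes
# bdfr's cli group with a test config instead of this stand-in.
import click
from click.testing import CliRunner


@click.group()
def cli():
    pass


@cli.command('clone')
@click.argument('directory')
@click.option('-l', '--link', multiple=True)
def clone(directory: str, link: tuple):
    for link_id in link:
        click.echo(f'Record for entry item {link_id}')


runner = CliRunner()
result = runner.invoke(cli, ['clone', './out', '-l', 'm2601g'])
assert result.exit_code == 0
assert 'Record for entry item' in result.output
```

`CliRunner.invoke` captures stdout and the exit code without spawning a subprocess, which is why the test can assert directly on `result.output`.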