
Add a combined command for the archiver and downloader: clone (#433)

* Simplify downloader function

* Add basic scraper class

* Add "scrape" command

* Rename "scrape" command to "clone"

* Add integration tests for clone command

* Update README

* Fix failing test
Authored by Serene on 2021-06-06 20:29:09 +10:00; committed by GitHub
parent a2f010c40d
commit 434aeb8feb
6 changed files with 139 additions and 34 deletions


@@ -26,16 +26,24 @@ If you want to use the source code or make contributions, refer to [CONTRIBUTING
The BDFR works by taking submissions from a variety of "sources" from Reddit and then parsing them to download. These sources might be a subreddit, multireddit, a user list, or individual links. These sources are combined and downloaded to disk, according to a naming and organisational scheme defined by the user.
There are two modes to the BDFR: download and archive. Each one has a command that performs similar but distinct functions. The `download` command will download the resource linked in the Reddit submission, such as the images, video, etc. The `archive` command will download the submission data itself and store it, such as the submission details, upvotes, text, and statistics, as well as all the comments on that submission. These can then be saved in a data markup language form, such as JSON, XML, or YAML.
There are three modes to the BDFR: download, archive, and clone. Each one has a command that performs similar but distinct functions. The `download` command will download the resource linked in the Reddit submission, such as the images, video, etc. The `archive` command will download the submission data itself and store it, such as the submission details, upvotes, text, and statistics, as well as all the comments on that submission. These can then be saved in a data markup language form, such as JSON, XML, or YAML. Lastly, the `clone` command will perform both functions of the previous commands at once and is more efficient than running those commands sequentially.
Note that the `clone` command is not a true, faithful clone of Reddit. It simply retrieves much of the raw data that Reddit provides. To get a true clone of Reddit, another tool such as HTTrack should be used.
After installation, run the program from any directory as shown below:
```bash
python3 -m bdfr download
```
```bash
python3 -m bdfr archive
```
```bash
python3 -m bdfr clone
```
However, these commands are not enough. You should chain parameters in [Options](#options) according to your use case. Don't forget that some parameters can be provided multiple times. Some quick reference commands are:
```bash
@@ -184,6 +192,10 @@ The following options are for the `archive` command specifically.
- `xml`
- `yaml`
### Cloner Options
The `clone` command can take all the options listed above for both the `archive` and `download` commands since it performs the functions of both.
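For example, a single `clone` run can combine downloader options with an archiver option; the subreddit name, limit, and skip value below are purely illustrative:
```bash
python3 -m bdfr clone ./path/to/output -s EarthPorn -L 10 -f json --skip mp4
```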
## Authentication and Security
The BDFR uses OAuth2 authentication to connect to Reddit if authentication is required. This means that it is a secure, token-based system for making requests. This also means that the BDFR only has access to specific parts of the authenticated account: by default, only saved posts, upvoted posts, and the identity of the account. Note that authentication is not required unless accessing private resources such as upvoted posts, saved posts, and private multireddits.


@@ -8,6 +8,7 @@ import click
from bdfr.archiver import Archiver
from bdfr.configuration import Configuration
from bdfr.downloader import RedditDownloader
from bdfr.cloner import RedditCloner
logger = logging.getLogger()
@@ -32,11 +33,32 @@ _common_options = [
                         'controversial', 'rising', 'relevance')), default=None),
]

_downloader_options = [
    click.option('--file-scheme', default=None, type=str),
    click.option('--folder-scheme', default=None, type=str),
    click.option('--make-hard-links', is_flag=True, default=None),
    click.option('--max-wait-time', type=int, default=None),
    click.option('--no-dupes', is_flag=True, default=None),
    click.option('--search-existing', is_flag=True, default=None),
    click.option('--exclude-id', default=None, multiple=True),
    click.option('--exclude-id-file', default=None, multiple=True),
    click.option('--skip', default=None, multiple=True),
    click.option('--skip-domain', default=None, multiple=True),
    click.option('--skip-subreddit', default=None, multiple=True),
]

def _add_common_options(func):
    for opt in _common_options:
        func = opt(func)
    return func

_archiver_options = [
    click.option('--all-comments', is_flag=True, default=None),
    click.option('-f', '--format', type=click.Choice(('xml', 'json', 'yaml')), default=None),
]

def _add_options(opts: list):
    def wrap(func):
        for opt in opts:
            func = opt(func)
        return func
    return wrap
@click.group()
@@ -45,18 +67,8 @@ def cli():
@cli.command('download')
@click.option('--file-scheme', default=None, type=str)
@click.option('--folder-scheme', default=None, type=str)
@click.option('--make-hard-links', is_flag=True, default=None)
@click.option('--max-wait-time', type=int, default=None)
@click.option('--no-dupes', is_flag=True, default=None)
@click.option('--search-existing', is_flag=True, default=None)
@click.option('--exclude-id', default=None, multiple=True)
@click.option('--exclude-id-file', default=None, multiple=True)
@click.option('--skip', default=None, multiple=True)
@click.option('--skip-domain', default=None, multiple=True)
@click.option('--skip-subreddit', default=None, multiple=True)
@_add_common_options
@_add_options(_common_options)
@_add_options(_downloader_options)
@click.pass_context
def cli_download(context: click.Context, **_):
    config = Configuration()
@@ -73,9 +85,8 @@ def cli_download(context: click.Context, **_):
@cli.command('archive')
@_add_common_options
@click.option('--all-comments', is_flag=True, default=None)
@click.option('-f', '--format', type=click.Choice(('xml', 'json', 'yaml')), default=None)
@_add_options(_common_options)
@_add_options(_archiver_options)
@click.pass_context
def cli_archive(context: click.Context, **_):
    config = Configuration()
@@ -85,7 +96,26 @@ def cli_archive(context: click.Context, **_):
        reddit_archiver = Archiver(config)
        reddit_archiver.download()
    except Exception:
        logger.exception('Downloader exited unexpectedly')
        logger.exception('Archiver exited unexpectedly')
        raise
    else:
        logger.info('Program complete')
@cli.command('clone')
@_add_options(_common_options)
@_add_options(_archiver_options)
@_add_options(_downloader_options)
@click.pass_context
def cli_clone(context: click.Context, **_):
    config = Configuration()
    config.process_click_arguments(context)
    setup_logging(config.verbose)
    try:
        reddit_scraper = RedditCloner(config)
        reddit_scraper.download()
    except Exception:
        logger.exception('Scraper exited unexpectedly')
        raise
    else:
        logger.info('Program complete')
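The `clone` command above reuses the shared option lists through the small `_add_options` decorator factory instead of repeating every `@click.option` on each command. A minimal standalone sketch of that pattern, runnable with only `click` installed (the command and option names here are illustrative, not part of the BDFR code):

```python
import click

# Shared options defined once and reused across commands (illustrative names).
_shared_options = [
    click.option('--limit', type=int, default=None),
    click.option('-v', '--verbose', is_flag=True, default=False),
]


def _add_options(opts: list):
    # Returns a decorator that applies every option in `opts` to a command.
    def wrap(func):
        for opt in opts:
            func = opt(func)
        return func
    return wrap


@click.command()
@_add_options(_shared_options)
def demo(limit, verbose):
    click.echo(f'limit={limit}, verbose={verbose}')


if __name__ == '__main__':
    demo()
```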

bdfr/cloner.py (new file, 21 lines)

@@ -0,0 +1,21 @@
#!/usr/bin/env python3
# coding=utf-8
import logging
from bdfr.archiver import Archiver
from bdfr.configuration import Configuration
from bdfr.downloader import RedditDownloader
logger = logging.getLogger(__name__)
class RedditCloner(RedditDownloader, Archiver):
    def __init__(self, args: Configuration):
        super(RedditCloner, self).__init__(args)

    def download(self):
        # For each matched submission, download its linked resource
        # (via RedditDownloader) and write its archive entry (via Archiver).
        for generator in self.reddit_lists:
            for submission in generator:
                self._download_submission(submission)
                self.write_entry(submission)
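As a rough usage sketch of driving the cloner outside the CLI (the `directory` and `link` attribute names on `Configuration` are assumptions here; they are defined elsewhere in the codebase and are normally filled in from the command-line arguments):

```python
from bdfr.cloner import RedditCloner
from bdfr.configuration import Configuration

config = Configuration()         # start from the default configuration
config.directory = './output'    # assumed attribute: destination folder
config.link = ['m2601g']         # assumed attribute: specific submission IDs to clone

cloner = RedditCloner(config)    # connects to Reddit and builds the submission generators
cloner.download()                # downloads each file and writes its archive entry
```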


@@ -42,19 +42,20 @@ class RedditDownloader(RedditConnector):
    def download(self):
        for generator in self.reddit_lists:
            for submission in generator:
                if submission.id in self.excluded_submission_ids:
                    logger.debug(f'Object {submission.id} in exclusion list, skipping')
                    continue
                elif submission.subreddit.display_name.lower() in self.args.skip_subreddit:
                    logger.debug(f'Submission {submission.id} in {submission.subreddit.display_name} in skip list')
                else:
                    logger.debug(f'Attempting to download submission {submission.id}')
                    self._download_submission(submission)
                self._download_submission(submission)

    def _download_submission(self, submission: praw.models.Submission):
        if not isinstance(submission, praw.models.Submission):
        if submission.id in self.excluded_submission_ids:
            logger.debug(f'Object {submission.id} in exclusion list, skipping')
            return
        elif submission.subreddit.display_name.lower() in self.args.skip_subreddit:
            logger.debug(f'Submission {submission.id} in {submission.subreddit.display_name} in skip list')
            return
        elif not isinstance(submission, praw.models.Submission):
            logger.warning(f'{submission.id} is not a submission')
            return
        logger.debug(f'Attempting to download submission {submission.id}')
        try:
            downloader_class = DownloadFactory.pull_lever(submission.url)
            downloader = downloader_class(submission)


@@ -4,11 +4,12 @@
import os
import re
from pathlib import Path
from unittest.mock import MagicMock
from unittest.mock import MagicMock, patch
import praw.models
import pytest
import bdfr.site_downloaders.download_factory
from bdfr.__main__ import setup_logging
from bdfr.configuration import Configuration
from bdfr.connector import RedditConnector
@@ -37,17 +38,30 @@ def downloader_mock(args: Configuration):
    (('aaaaaa',), ('aaaaaa',), 0),
    ((), ('aaaaaa',), 0),
    (('aaaaaa', 'bbbbbb'), ('aaaaaa',), 1),
    (('aaaaaa', 'bbbbbb', 'cccccc'), ('aaaaaa',), 2),
))
def test_excluded_ids(test_ids: tuple[str], test_excluded: tuple[str], expected_len: int, downloader_mock: MagicMock):
@patch('bdfr.site_downloaders.download_factory.DownloadFactory.pull_lever')
def test_excluded_ids(
        mock_function: MagicMock,
        test_ids: tuple[str],
        test_excluded: tuple[str],
        expected_len: int,
        downloader_mock: MagicMock,
):
    downloader_mock.excluded_submission_ids = test_excluded
    mock_function.return_value = MagicMock()
    mock_function.return_value.__name__ = 'test'
    test_submissions = []
    for test_id in test_ids:
        m = MagicMock()
        m.id = test_id
        m.subreddit.display_name.return_value = 'https://www.example.com/'
        m.__class__ = praw.models.Submission
        test_submissions.append(m)
    downloader_mock.reddit_lists = [test_submissions]
    RedditDownloader.download(downloader_mock)
    assert downloader_mock._download_submission.call_count == expected_len
    for submission in test_submissions:
        RedditDownloader._download_submission(downloader_mock, submission)
    assert mock_function.call_count == expected_len
@pytest.mark.online


@@ -33,6 +33,17 @@ def create_basic_args_for_archive_runner(test_args: list[str], tmp_path: Path):
    return out


def create_basic_args_for_cloner_runner(test_args: list[str], tmp_path: Path):
    out = [
        'clone',
        str(tmp_path),
        '-v',
        '--config', 'test_config.cfg',
        '--log', str(Path(tmp_path, 'test_log.txt')),
    ] + test_args
    return out
@pytest.mark.online
@pytest.mark.reddit
@pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests')
@@ -343,3 +354,19 @@ def test_cli_file_scheme_warning(test_args: list[str], tmp_path: Path):
    result = runner.invoke(cli, test_args)
    assert result.exit_code == 0
    assert 'Some files might not be downloaded due to name conflicts' in result.output
@pytest.mark.online
@pytest.mark.reddit
@pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests')
@pytest.mark.parametrize('test_args', (
    ['-l', 'm2601g'],
    ['-s', 'TrollXChromosomes/', '-L', 1],
))
def test_cli_scrape_general(test_args: list[str], tmp_path: Path):
    runner = CliRunner()
    test_args = create_basic_args_for_cloner_runner(test_args, tmp_path)
    result = runner.invoke(cli, test_args)
    assert result.exit_code == 0
    assert 'Downloaded submission' in result.output
    assert 'Record for entry item' in result.output