Add ability to disable modules (#434)
* Fix test name to match standard * Rename file * Add ability to disable modules * Update README * Fix missing comma * Fix more missing commas. sigh... Co-authored-by: Ali Parlakçı <parlakciali@gmail.com>
This commit is contained in:
parent
434aeb8feb
commit
6dcef83666
21
README.md
21
README.md
|
@ -71,6 +71,10 @@ The following options are common between both the `archive` and `download` comma
|
||||||
- `--config`
|
- `--config`
|
||||||
- If the path to a configuration file is supplied with this option, the BDFR will use the specified config
|
- If the path to a configuration file is supplied with this option, the BDFR will use the specified config
|
||||||
- See [Configuration Files](#configuration) for more details
|
- See [Configuration Files](#configuration) for more details
|
||||||
|
- `--disable-module`
|
||||||
|
- Can be specified multiple times
|
||||||
|
- Disables certain modules from being used
|
||||||
|
- See [Disabling Modules](#disabling-modules) for more information and a list of module names
|
||||||
- `--log`
|
- `--log`
|
||||||
- This allows one to specify the location of the logfile
|
- This allows one to specify the location of the logfile
|
||||||
- This must be done when running multiple instances of the BDFR, see [Multiple Instances](#multiple-instances) below
|
- This must be done when running multiple instances of the BDFR, see [Multiple Instances](#multiple-instances) below
|
||||||
|
@ -266,6 +270,7 @@ The following keys are optional, and defaults will be used if they cannot be fou
|
||||||
- `backup_log_count`
|
- `backup_log_count`
|
||||||
- `max_wait_time`
|
- `max_wait_time`
|
||||||
- `time_format`
|
- `time_format`
|
||||||
|
- `disabled_modules`
|
||||||
|
|
||||||
All of these should not be modified unless you know what you're doing, as the default values will enable the BDFR to function just fine. A configuration is included in the BDFR when it is installed, and this will be placed in the configuration directory as the default.
|
All of these should not be modified unless you know what you're doing, as the default values will enable the BDFR to function just fine. A configuration is included in the BDFR when it is installed, and this will be placed in the configuration directory as the default.
|
||||||
|
|
||||||
|
@ -277,6 +282,22 @@ The option `time_format` will specify the format of the timestamp that replaces
|
||||||
|
|
||||||
The format can be specified through the [format codes](https://docs.python.org/3/library/datetime.html#strftime-strptime-behavior) that are standard in the Python `datetime` library.
|
The format can be specified through the [format codes](https://docs.python.org/3/library/datetime.html#strftime-strptime-behavior) that are standard in the Python `datetime` library.
|
||||||
|
|
||||||
|
#### Disabling Modules
|
||||||
|
|
||||||
|
The individual modules of the BDFR, used to download submissions from websites, can be disabled. This is helpful especially in the case of the fallback downloaders, since the `--skip-domain` option cannot be effectively used in these cases. For example, the Youtube-DL downloader can retrieve data from hundreds of websites and domains; thus the only way to fully disable it is via the `--disable-module` option.
|
||||||
|
|
||||||
|
Modules can be disabled through the command line interface for the BDFR or more permanently in the configuration file via the `disabled_modules` option. The list of downloaders that can be disabled are the following. Note that they are case-insensitive.
|
||||||
|
|
||||||
|
- `Direct`
|
||||||
|
- `Erome`
|
||||||
|
- `Gallery` (Reddit Image Galleries)
|
||||||
|
- `Gfycat`
|
||||||
|
- `Imgur`
|
||||||
|
- `Redgifs`
|
||||||
|
- `SelfPost` (Reddit Text Post)
|
||||||
|
- `Youtube`
|
||||||
|
- `YoutubeDlFallback`
|
||||||
|
|
||||||
### Rate Limiting
|
### Rate Limiting
|
||||||
|
|
||||||
The option `max_wait_time` has to do with retrying downloads. There are certain HTTP errors that mean that no amount of requests will return the wanted data, but some errors are from rate-limiting. This is when a single client is making so many requests that the remote website cuts the client off to preserve the function of the site. This is a common situation when downloading many resources from the same site. It is polite and best practice to obey the website's wishes in these cases.
|
The option `max_wait_time` has to do with retrying downloads. There are certain HTTP errors that mean that no amount of requests will return the wanted data, but some errors are from rate-limiting. This is when a single client is making so many requests that the remote website cuts the client off to preserve the function of the site. This is a common situation when downloading many resources from the same site. It is polite and best practice to obey the website's wishes in these cases.
|
||||||
|
|
|
@ -14,19 +14,20 @@ logger = logging.getLogger()
|
||||||
|
|
||||||
_common_options = [
|
_common_options = [
|
||||||
click.argument('directory', type=str),
|
click.argument('directory', type=str),
|
||||||
click.option('--config', type=str, default=None),
|
|
||||||
click.option('-v', '--verbose', default=None, count=True),
|
|
||||||
click.option('-l', '--link', multiple=True, default=None, type=str),
|
|
||||||
click.option('-s', '--subreddit', multiple=True, default=None, type=str),
|
|
||||||
click.option('-m', '--multireddit', multiple=True, default=None, type=str),
|
|
||||||
click.option('-L', '--limit', default=None, type=int),
|
|
||||||
click.option('--authenticate', is_flag=True, default=None),
|
click.option('--authenticate', is_flag=True, default=None),
|
||||||
|
click.option('--config', type=str, default=None),
|
||||||
|
click.option('--disable-module', multiple=True, default=None, type=str),
|
||||||
click.option('--log', type=str, default=None),
|
click.option('--log', type=str, default=None),
|
||||||
click.option('--submitted', is_flag=True, default=None),
|
|
||||||
click.option('--upvoted', is_flag=True, default=None),
|
|
||||||
click.option('--saved', is_flag=True, default=None),
|
click.option('--saved', is_flag=True, default=None),
|
||||||
click.option('--search', default=None, type=str),
|
click.option('--search', default=None, type=str),
|
||||||
|
click.option('--submitted', is_flag=True, default=None),
|
||||||
click.option('--time-format', type=str, default=None),
|
click.option('--time-format', type=str, default=None),
|
||||||
|
click.option('--upvoted', is_flag=True, default=None),
|
||||||
|
click.option('-L', '--limit', default=None, type=int),
|
||||||
|
click.option('-l', '--link', multiple=True, default=None, type=str),
|
||||||
|
click.option('-m', '--multireddit', multiple=True, default=None, type=str),
|
||||||
|
click.option('-s', '--subreddit', multiple=True, default=None, type=str),
|
||||||
|
click.option('-v', '--verbose', default=None, count=True),
|
||||||
click.option('-u', '--user', type=str, multiple=True, default=None),
|
click.option('-u', '--user', type=str, multiple=True, default=None),
|
||||||
click.option('-t', '--time', type=click.Choice(('all', 'hour', 'day', 'week', 'month', 'year')), default=None),
|
click.option('-t', '--time', type=click.Choice(('all', 'hour', 'day', 'week', 'month', 'year')), default=None),
|
||||||
click.option('-S', '--sort', type=click.Choice(('hot', 'top', 'new',
|
click.option('-S', '--sort', type=click.Choice(('hot', 'top', 'new',
|
||||||
|
|
|
@ -13,19 +13,21 @@ class Configuration(Namespace):
|
||||||
self.authenticate = False
|
self.authenticate = False
|
||||||
self.config = None
|
self.config = None
|
||||||
self.directory: str = '.'
|
self.directory: str = '.'
|
||||||
|
self.disable_module: list[str] = []
|
||||||
self.exclude_id = []
|
self.exclude_id = []
|
||||||
self.exclude_id_file = []
|
self.exclude_id_file = []
|
||||||
|
self.file_scheme: str = '{REDDITOR}_{TITLE}_{POSTID}'
|
||||||
|
self.folder_scheme: str = '{SUBREDDIT}'
|
||||||
self.limit: Optional[int] = None
|
self.limit: Optional[int] = None
|
||||||
self.link: list[str] = []
|
self.link: list[str] = []
|
||||||
self.log: Optional[str] = None
|
self.log: Optional[str] = None
|
||||||
|
self.make_hard_links = False
|
||||||
self.max_wait_time = None
|
self.max_wait_time = None
|
||||||
self.multireddit: list[str] = []
|
self.multireddit: list[str] = []
|
||||||
self.no_dupes: bool = False
|
self.no_dupes: bool = False
|
||||||
self.saved: bool = False
|
self.saved: bool = False
|
||||||
self.search: Optional[str] = None
|
self.search: Optional[str] = None
|
||||||
self.search_existing: bool = False
|
self.search_existing: bool = False
|
||||||
self.file_scheme: str = '{REDDITOR}_{TITLE}_{POSTID}'
|
|
||||||
self.folder_scheme: str = '{SUBREDDIT}'
|
|
||||||
self.skip: list[str] = []
|
self.skip: list[str] = []
|
||||||
self.skip_domain: list[str] = []
|
self.skip_domain: list[str] = []
|
||||||
self.skip_subreddit: list[str] = []
|
self.skip_subreddit: list[str] = []
|
||||||
|
@ -37,7 +39,6 @@ class Configuration(Namespace):
|
||||||
self.upvoted: bool = False
|
self.upvoted: bool = False
|
||||||
self.user: list[str] = []
|
self.user: list[str] = []
|
||||||
self.verbose: int = 0
|
self.verbose: int = 0
|
||||||
self.make_hard_links = False
|
|
||||||
|
|
||||||
# Archiver-specific options
|
# Archiver-specific options
|
||||||
self.format = 'json'
|
self.format = 'json'
|
||||||
|
|
|
@ -64,6 +64,8 @@ class RedditConnector(metaclass=ABCMeta):
|
||||||
|
|
||||||
self.read_config()
|
self.read_config()
|
||||||
|
|
||||||
|
self.parse_disabled_modules()
|
||||||
|
|
||||||
self.download_filter = self.create_download_filter()
|
self.download_filter = self.create_download_filter()
|
||||||
logger.log(9, 'Created download filter')
|
logger.log(9, 'Created download filter')
|
||||||
self.time_filter = self.create_time_filter()
|
self.time_filter = self.create_time_filter()
|
||||||
|
@ -99,10 +101,19 @@ class RedditConnector(metaclass=ABCMeta):
|
||||||
option = 'ISO'
|
option = 'ISO'
|
||||||
logger.debug(f'Setting datetime format string to {option}')
|
logger.debug(f'Setting datetime format string to {option}')
|
||||||
self.args.time_format = option
|
self.args.time_format = option
|
||||||
|
if not self.args.disable_module:
|
||||||
|
self.args.disable_module = [self.cfg_parser.get('DEFAULT', 'disabled_modules', fallback='')]
|
||||||
# Update config on disk
|
# Update config on disk
|
||||||
with open(self.config_location, 'w') as file:
|
with open(self.config_location, 'w') as file:
|
||||||
self.cfg_parser.write(file)
|
self.cfg_parser.write(file)
|
||||||
|
|
||||||
|
def parse_disabled_modules(self):
|
||||||
|
disabled_modules = self.args.disable_module
|
||||||
|
disabled_modules = self.split_args_input(disabled_modules)
|
||||||
|
disabled_modules = set([name.strip().lower() for name in disabled_modules])
|
||||||
|
self.args.disable_module = disabled_modules
|
||||||
|
logger.debug(f'Disabling the following modules: {", ".join(self.args.disable_module)}')
|
||||||
|
|
||||||
def create_reddit_instance(self):
|
def create_reddit_instance(self):
|
||||||
if self.args.authenticate:
|
if self.args.authenticate:
|
||||||
logger.debug('Using authenticated Reddit instance')
|
logger.debug('Using authenticated Reddit instance')
|
||||||
|
|
|
@ -63,7 +63,9 @@ class RedditDownloader(RedditConnector):
|
||||||
except errors.NotADownloadableLinkError as e:
|
except errors.NotADownloadableLinkError as e:
|
||||||
logger.error(f'Could not download submission {submission.id}: {e}')
|
logger.error(f'Could not download submission {submission.id}: {e}')
|
||||||
return
|
return
|
||||||
|
if downloader_class.__name__.lower() in self.args.disable_module:
|
||||||
|
logger.debug(f'Submission {submission.id} skipped due to disabled module {downloader_class.__name__}')
|
||||||
|
return
|
||||||
try:
|
try:
|
||||||
content = downloader.find_resources(self.authenticator)
|
content = downloader.find_resources(self.authenticator)
|
||||||
except errors.SiteDownloaderError as e:
|
except errors.SiteDownloaderError as e:
|
||||||
|
|
|
@ -313,7 +313,9 @@ def test_sanitise_subreddit_name(test_name: str, expected: str):
|
||||||
(['test1,test2', 'test3'], {'test1', 'test2', 'test3'}),
|
(['test1,test2', 'test3'], {'test1', 'test2', 'test3'}),
|
||||||
(['test1, test2', 'test3'], {'test1', 'test2', 'test3'}),
|
(['test1, test2', 'test3'], {'test1', 'test2', 'test3'}),
|
||||||
(['test1; test2', 'test3'], {'test1', 'test2', 'test3'}),
|
(['test1; test2', 'test3'], {'test1', 'test2', 'test3'}),
|
||||||
(['test1, test2', 'test1,test2,test3', 'test4'], {'test1', 'test2', 'test3', 'test4'})
|
(['test1, test2', 'test1,test2,test3', 'test4'], {'test1', 'test2', 'test3', 'test4'}),
|
||||||
|
([''], {''}),
|
||||||
|
(['test'], {'test'}),
|
||||||
))
|
))
|
||||||
def test_split_subreddit_entries(test_subreddit_entries: list[str], expected: set[str]):
|
def test_split_subreddit_entries(test_subreddit_entries: list[str], expected: set[str]):
|
||||||
results = RedditConnector.split_args_input(test_subreddit_entries)
|
results = RedditConnector.split_args_input(test_subreddit_entries)
|
||||||
|
|
|
@ -348,7 +348,7 @@ def test_cli_download_subreddit_exclusion(test_args: list[str], tmp_path: Path):
|
||||||
['--file-scheme', '{TITLE}'],
|
['--file-scheme', '{TITLE}'],
|
||||||
['--file-scheme', '{TITLE}_test_{SUBREDDIT}'],
|
['--file-scheme', '{TITLE}_test_{SUBREDDIT}'],
|
||||||
))
|
))
|
||||||
def test_cli_file_scheme_warning(test_args: list[str], tmp_path: Path):
|
def test_cli_download_file_scheme_warning(test_args: list[str], tmp_path: Path):
|
||||||
runner = CliRunner()
|
runner = CliRunner()
|
||||||
test_args = create_basic_args_for_download_runner(test_args, tmp_path)
|
test_args = create_basic_args_for_download_runner(test_args, tmp_path)
|
||||||
result = runner.invoke(cli, test_args)
|
result = runner.invoke(cli, test_args)
|
||||||
|
@ -356,6 +356,23 @@ def test_cli_file_scheme_warning(test_args: list[str], tmp_path: Path):
|
||||||
assert 'Some files might not be downloaded due to name conflicts' in result.output
|
assert 'Some files might not be downloaded due to name conflicts' in result.output
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.online
|
||||||
|
@pytest.mark.reddit
|
||||||
|
@pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests')
|
||||||
|
@pytest.mark.parametrize('test_args', (
|
||||||
|
['-l', 'm2601g', '--disable-module', 'Direct'],
|
||||||
|
['-l', 'nnb9vs', '--disable-module', 'YoutubeDlFallback'],
|
||||||
|
['-l', 'nnb9vs', '--disable-module', 'youtubedlfallback'],
|
||||||
|
))
|
||||||
|
def test_cli_download_disable_modules(test_args: list[str], tmp_path: Path):
|
||||||
|
runner = CliRunner()
|
||||||
|
test_args = create_basic_args_for_download_runner(test_args, tmp_path)
|
||||||
|
result = runner.invoke(cli, test_args)
|
||||||
|
assert result.exit_code == 0
|
||||||
|
assert 'skipped due to disabled module' in result.output
|
||||||
|
assert 'Downloaded submission' not in result.output
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.online
|
@pytest.mark.online
|
||||||
@pytest.mark.reddit
|
@pytest.mark.reddit
|
||||||
@pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests')
|
@pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests')
|
||||||
|
|
Loading…
Reference in a new issue