diff --git a/archivebox/config.py b/archivebox/config.py index 412be192..4d1546af 100644 --- a/archivebox/config.py +++ b/archivebox/config.py @@ -183,7 +183,8 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = { '--compressed' ]}, 'GIT_ARGS': {'type': list, 'default': ['--recursive']}, - 'SINGLEFILE_ARGS': {'type': list, 'default' : None} + 'SINGLEFILE_ARGS': {'type': list, 'default' : None}, + 'FAVICON_PROVIDER': {'type': str, 'default': 'https://www.google.com/s2/favicons?domain={}'}, }, 'SEARCH_BACKEND_CONFIG' : { diff --git a/archivebox/extractors/favicon.py b/archivebox/extractors/favicon.py index b8831d0c..5baafc17 100644 --- a/archivebox/extractors/favicon.py +++ b/archivebox/extractors/favicon.py @@ -10,6 +10,7 @@ from ..util import enforce_types, domain from ..config import ( TIMEOUT, SAVE_FAVICON, + FAVICON_PROVIDER, CURL_BINARY, CURL_ARGS, CURL_VERSION, @@ -40,7 +41,7 @@ def save_favicon(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) '--output', str(output), *(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []), *([] if CHECK_SSL_VALIDITY else ['--insecure']), - 'https://www.google.com/s2/favicons?domain={}'.format(domain(link.url)), + FAVICON_PROVIDER.format(domain(link.url)), ] status = 'failed' timer = TimedProgress(timeout, prefix=' ') diff --git a/archivebox/index/schema.py b/archivebox/index/schema.py index 480e9c7f..c44165a9 100644 --- a/archivebox/index/schema.py +++ b/archivebox/index/schema.py @@ -20,7 +20,7 @@ from django.utils.functional import cached_property from ..system import get_dir_size from ..util import ts_to_date_str, parse_date -from ..config import OUTPUT_DIR, ARCHIVE_DIR_NAME +from ..config import OUTPUT_DIR, ARCHIVE_DIR_NAME, FAVICON_PROVIDER class ArchiveError(Exception): def __init__(self, message, hints=None): @@ -423,7 +423,7 @@ class Link: canonical = { 'index_path': 'index.html', 'favicon_path': 'favicon.ico', - 'google_favicon_path': 'https://www.google.com/s2/favicons?domain={}'.format(self.domain), + 'google_favicon_path': FAVICON_PROVIDER.format(self.domain), 'wget_path': wget_output_path(self), 'warc_path': 'warc/', 'singlefile_path': 'singlefile.html',