From 46e80dd50933e563712e9ce90fc536f02b3c983c Mon Sep 17 00:00:00 2001 From: Ross Williams Date: Sun, 30 Jul 2023 23:43:04 -0400 Subject: [PATCH 1/3] Rename URL_(WHITE|BLACK)LIST to URL_(ALLOW|DENY)LIST Retain aliases for old configuration files --- archivebox/config.py | 8 ++++---- archivebox/config_stubs.py | 2 +- archivebox/core/forms.py | 2 +- archivebox/index/__init__.py | 8 ++++---- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/archivebox/config.py b/archivebox/config.py index 739d7f12..f5eef758 100644 --- a/archivebox/config.py +++ b/archivebox/config.py @@ -82,8 +82,8 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = { 'MEDIA_TIMEOUT': {'type': int, 'default': 3600}, 'OUTPUT_PERMISSIONS': {'type': str, 'default': '644'}, 'RESTRICT_FILE_NAMES': {'type': str, 'default': 'windows'}, - 'URL_BLACKLIST': {'type': str, 'default': r'\.(css|js|otf|ttf|woff|woff2|gstatic\.com|googleapis\.com/css)(\?.*)?$'}, # to avoid downloading code assets as their own pages - 'URL_WHITELIST': {'type': str, 'default': None}, + 'URL_DENYLIST': {'type': str, 'default': r'\.(css|js|otf|ttf|woff|woff2|gstatic\.com|googleapis\.com/css)(\?.*)?$', 'aliases': ('URL_BLACKLIST',)}, # to avoid downloading code assets as their own pages + 'URL_ALLOWLIST': {'type': str, 'default': None, 'aliases': ('URL_WHITELIST',)}, 'ENFORCE_ATOMIC_WRITES': {'type': bool, 'default': True}, 'TAG_SEPARATOR_PATTERN': {'type': str, 'default': r'[,]'}, }, @@ -371,8 +371,8 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = { 'CONFIG_FILE': {'default': lambda c: Path(c['CONFIG_FILE']).resolve() if c['CONFIG_FILE'] else c['OUTPUT_DIR'] / CONFIG_FILENAME}, 'COOKIES_FILE': {'default': lambda c: c['COOKIES_FILE'] and Path(c['COOKIES_FILE']).resolve()}, 'CHROME_USER_DATA_DIR': {'default': lambda c: find_chrome_data_dir() if c['CHROME_USER_DATA_DIR'] is None else (Path(c['CHROME_USER_DATA_DIR']).resolve() if c['CHROME_USER_DATA_DIR'] else None)}, # None means unset, so we autodetect it with find_chrome_Data_dir(), but emptystring '' means user manually set it to '', and we should store it as None - 'URL_BLACKLIST_PTN': {'default': lambda c: c['URL_BLACKLIST'] and re.compile(c['URL_BLACKLIST'] or '', re.IGNORECASE | re.UNICODE | re.MULTILINE)}, - 'URL_WHITELIST_PTN': {'default': lambda c: c['URL_WHITELIST'] and re.compile(c['URL_WHITELIST'] or '', re.IGNORECASE | re.UNICODE | re.MULTILINE)}, + 'URL_DENYLIST_PTN': {'default': lambda c: c['URL_DENYLIST'] and re.compile(c['URL_DENYLIST'] or '', re.IGNORECASE | re.UNICODE | re.MULTILINE)}, + 'URL_ALLOWLIST_PTN': {'default': lambda c: c['URL_ALLOWLIST'] and re.compile(c['URL_ALLOWLIST'] or '', re.IGNORECASE | re.UNICODE | re.MULTILINE)}, 'DIR_OUTPUT_PERMISSIONS': {'default': lambda c: c['OUTPUT_PERMISSIONS'].replace('6', '7').replace('4', '5')}, 'ARCHIVEBOX_BINARY': {'default': lambda c: sys.argv[0] or bin_path('archivebox')}, diff --git a/archivebox/config_stubs.py b/archivebox/config_stubs.py index 2c42e808..c8cc9ecb 100644 --- a/archivebox/config_stubs.py +++ b/archivebox/config_stubs.py @@ -41,7 +41,7 @@ class ConfigDict(BaseConfig, total=False): MEDIA_TIMEOUT: int OUTPUT_PERMISSIONS: str RESTRICT_FILE_NAMES: str - URL_BLACKLIST: str + URL_DENYLIST: str SECRET_KEY: Optional[str] BIND_ADDR: str diff --git a/archivebox/core/forms.py b/archivebox/core/forms.py index 99f4d02e..193c0d05 100644 --- a/archivebox/core/forms.py +++ b/archivebox/core/forms.py @@ -41,7 +41,7 @@ class AddLinkForm(forms.Form): # label="Exclude patterns", # min_length='1', # required=False, - # initial=URL_BLACKLIST, 
+ # initial=URL_DENYLIST, # ) # timeout = forms.IntegerField( # initial=TIMEOUT, diff --git a/archivebox/index/__init__.py b/archivebox/index/__init__.py index f631430c..b9d57aeb 100644 --- a/archivebox/index/__init__.py +++ b/archivebox/index/__init__.py @@ -22,8 +22,8 @@ from ..config import ( JSON_INDEX_FILENAME, OUTPUT_DIR, TIMEOUT, - URL_BLACKLIST_PTN, - URL_WHITELIST_PTN, + URL_DENYLIST_PTN, + URL_ALLOWLIST_PTN, stderr, OUTPUT_PERMISSIONS ) @@ -142,9 +142,9 @@ def archivable_links(links: Iterable[Link]) -> Iterable[Link]: continue if scheme(link.url) not in ('http', 'https', 'ftp'): continue - if URL_BLACKLIST_PTN and URL_BLACKLIST_PTN.search(link.url): + if URL_DENYLIST_PTN and URL_DENYLIST_PTN.search(link.url): continue - if URL_WHITELIST_PTN and (not URL_WHITELIST_PTN.search(link.url)): + if URL_ALLOWLIST_PTN and (not URL_ALLOWLIST_PTN.search(link.url)): continue yield link From b44f7e68b180276aff61fcd918b0ef96d9b9fa28 Mon Sep 17 00:00:00 2001 From: Ross Williams Date: Mon, 31 Jul 2023 11:34:03 -0400 Subject: [PATCH 2/3] Add URL-specific method allow/deny lists Allows enabling only allow-listed extractors or disabling specific deny-listed extractors for a regular expression matched against an added site's URL. --- archivebox/config.py | 11 ++++++-- archivebox/extractors/__init__.py | 47 +++++++++++++++++++++++-------- tests/test_extractors.py | 43 ++++++++++++++++++++++++++-- 3 files changed, 85 insertions(+), 16 deletions(-) diff --git a/archivebox/config.py b/archivebox/config.py index f5eef758..7334b169 100644 --- a/archivebox/config.py +++ b/archivebox/config.py @@ -124,6 +124,8 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = { 'SAVE_GIT': {'type': bool, 'default': True, 'aliases': ('FETCH_GIT',)}, 'SAVE_MEDIA': {'type': bool, 'default': True, 'aliases': ('FETCH_MEDIA',)}, 'SAVE_ARCHIVE_DOT_ORG': {'type': bool, 'default': True, 'aliases': ('SUBMIT_ARCHIVE_DOT_ORG',)}, + 'SAVE_ALLOWLIST': {'type': dict, 'default': {},}, + 'SAVE_DENYLIST': {'type': dict, 'default': {},}, }, 'ARCHIVE_METHOD_OPTIONS': { @@ -355,6 +357,8 @@ def get_commit_hash(config): ############################## Derived Config ################################## +ALLOWDENYLIST_REGEX_FLAGS: int = re.IGNORECASE | re.UNICODE | re.MULTILINE + DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = { 'TERM_WIDTH': {'default': lambda c: lambda: shutil.get_terminal_size((100, 10)).columns}, 'USER': {'default': lambda c: SYSTEM_USER}, @@ -371,8 +375,8 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = { 'CONFIG_FILE': {'default': lambda c: Path(c['CONFIG_FILE']).resolve() if c['CONFIG_FILE'] else c['OUTPUT_DIR'] / CONFIG_FILENAME}, 'COOKIES_FILE': {'default': lambda c: c['COOKIES_FILE'] and Path(c['COOKIES_FILE']).resolve()}, 'CHROME_USER_DATA_DIR': {'default': lambda c: find_chrome_data_dir() if c['CHROME_USER_DATA_DIR'] is None else (Path(c['CHROME_USER_DATA_DIR']).resolve() if c['CHROME_USER_DATA_DIR'] else None)}, # None means unset, so we autodetect it with find_chrome_Data_dir(), but emptystring '' means user manually set it to '', and we should store it as None - 'URL_DENYLIST_PTN': {'default': lambda c: c['URL_DENYLIST'] and re.compile(c['URL_DENYLIST'] or '', re.IGNORECASE | re.UNICODE | re.MULTILINE)}, - 'URL_ALLOWLIST_PTN': {'default': lambda c: c['URL_ALLOWLIST'] and re.compile(c['URL_ALLOWLIST'] or '', re.IGNORECASE | re.UNICODE | re.MULTILINE)}, + 'URL_DENYLIST_PTN': {'default': lambda c: c['URL_DENYLIST'] and re.compile(c['URL_DENYLIST'] or '', ALLOWDENYLIST_REGEX_FLAGS)}, + 'URL_ALLOWLIST_PTN': {'default': lambda c: 
c['URL_ALLOWLIST'] and re.compile(c['URL_ALLOWLIST'] or '', ALLOWDENYLIST_REGEX_FLAGS)}, 'DIR_OUTPUT_PERMISSIONS': {'default': lambda c: c['OUTPUT_PERMISSIONS'].replace('6', '7').replace('4', '5')}, 'ARCHIVEBOX_BINARY': {'default': lambda c: sys.argv[0] or bin_path('archivebox')}, @@ -446,10 +450,11 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = { 'EXTERNAL_LOCATIONS': {'default': lambda c: get_external_locations(c)}, 'DATA_LOCATIONS': {'default': lambda c: get_data_locations(c)}, 'CHROME_OPTIONS': {'default': lambda c: get_chrome_info(c)}, + 'SAVE_ALLOWLIST_PTN': {'default': lambda c: c['SAVE_ALLOWLIST'] and {re.compile(k, ALLOWDENYLIST_REGEX_FLAGS): v for k, v in c['SAVE_ALLOWLIST'].items()}}, + 'SAVE_DENYLIST_PTN': {'default': lambda c: c['SAVE_DENYLIST'] and {re.compile(k, ALLOWDENYLIST_REGEX_FLAGS): v for k, v in c['SAVE_DENYLIST'].items()}}, } - ################################### Helpers #################################### diff --git a/archivebox/extractors/__init__.py b/archivebox/extractors/__init__.py index 3ca9cfa7..5f09931e 100644 --- a/archivebox/extractors/__init__.py +++ b/archivebox/extractors/__init__.py @@ -4,12 +4,16 @@ import os import sys from pathlib import Path -from typing import Optional, List, Iterable, Union +from typing import Callable, Optional, List, Iterable, TypeAlias, Union from datetime import datetime, timezone from django.db.models import QuerySet +from ..config import ( + SAVE_ALLOWLIST_PTN, + SAVE_DENYLIST_PTN, +) from ..core.settings import ERROR_LOG -from ..index.schema import Link +from ..index.schema import ArchiveResult, Link from ..index.sql import write_link_to_sql_index from ..index import ( load_link_details, @@ -42,7 +46,11 @@ from .archive_org import should_save_archive_dot_org, save_archive_dot_org from .headers import should_save_headers, save_headers -def get_default_archive_methods(): +ShouldSaveFunction: TypeAlias = Callable[[Link, Optional[Path], Optional[bool]], bool] +SaveFunction: TypeAlias = Callable[[Link, Optional[Path], int], ArchiveResult] +ArchiveMethodEntry: TypeAlias = tuple[str, ShouldSaveFunction, SaveFunction] + +def get_default_archive_methods() -> List[ArchiveMethodEntry]: return [ ('favicon', should_save_favicon, save_favicon), ('headers', should_save_headers, save_headers), @@ -59,14 +67,31 @@ def get_default_archive_methods(): ('archive_org', should_save_archive_dot_org, save_archive_dot_org), ] +@enforce_types +def get_archive_methods_for_link(link: Link) -> Iterable[ArchiveMethodEntry]: + DEFAULT_METHODS = get_default_archive_methods() + allowed_methods = { + m for pat, methods in + SAVE_ALLOWLIST_PTN.items() + if pat.search(link.url) + for m in methods + } or { m[0] for m in DEFAULT_METHODS } + denied_methods = { + m for pat, methods in + SAVE_DENYLIST_PTN.items() + if pat.search(link.url) + for m in methods + } + allowed_methods -= denied_methods + + return (m for m in DEFAULT_METHODS if m[0] in allowed_methods) + ARCHIVE_METHODS_INDEXING_PRECEDENCE = [('readability', 1), ('singlefile', 2), ('dom', 3), ('wget', 4)] @enforce_types -def ignore_methods(to_ignore: List[str]): +def ignore_methods(to_ignore: List[str]) -> Iterable[str]: ARCHIVE_METHODS = get_default_archive_methods() - methods = filter(lambda x: x[0] not in to_ignore, ARCHIVE_METHODS) - methods = map(lambda x: x[0], methods) - return list(methods) + return [x[0] for x in ARCHIVE_METHODS if x[0] not in to_ignore] @enforce_types def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[str]]=None, out_dir: Optional[Path]=None) -> 
Link: @@ -79,11 +104,11 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s except Snapshot.DoesNotExist: snapshot = write_link_to_sql_index(link) - ARCHIVE_METHODS = get_default_archive_methods() + active_methods = get_archive_methods_for_link(link) if methods: - ARCHIVE_METHODS = [ - method for method in ARCHIVE_METHODS + active_methods = [ + method for method in active_methods if method[0] in methods ] @@ -100,7 +125,7 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s stats = {'skipped': 0, 'succeeded': 0, 'failed': 0} start_ts = datetime.now(timezone.utc) - for method_name, should_run, method_function in ARCHIVE_METHODS: + for method_name, should_run, method_function in active_methods: try: if method_name not in link.history: link.history[method_name] = [] diff --git a/tests/test_extractors.py b/tests/test_extractors.py index 86b50d51..bd6d2775 100644 --- a/tests/test_extractors.py +++ b/tests/test_extractors.py @@ -13,12 +13,51 @@ def test_ignore_methods(): Takes the passed method out of the default methods list and returns that value """ ignored = ignore_methods(['title']) - assert should_save_title not in ignored + assert "title" not in ignored + +def test_save_allowdenylist_works(tmp_path, process, disable_extractors_dict): + allow_list = { + r'/static': ["headers", "singlefile"], + r'example\.com\.html$': ["headers"], + } + deny_list = { + "/static": ["singlefile"], + } + disable_extractors_dict.update({ + "SAVE_HEADERS": "true", + "USE_SINGLEFILE": "true", + "SAVE_ALLOWLIST": pyjson.dumps(allow_list), + "SAVE_DENYLIST": pyjson.dumps(deny_list), + }) + add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], + capture_output=True, env=disable_extractors_dict) + archived_item_path = list(tmp_path.glob('archive/**/*'))[0] + singlefile_file = archived_item_path / "singlefile.html" + assert not singlefile_file.exists() + headers_file = archived_item_path / "headers.json" + assert headers_file.exists() + +def test_save_denylist_works(tmp_path, process, disable_extractors_dict): + deny_list = { + "/static": ["singlefile"], + } + disable_extractors_dict.update({ + "SAVE_HEADERS": "true", + "USE_SINGLEFILE": "true", + "SAVE_DENYLIST": pyjson.dumps(deny_list), + }) + add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], + capture_output=True, env=disable_extractors_dict) + archived_item_path = list(tmp_path.glob('archive/**/*'))[0] + singlefile_file = archived_item_path / "singlefile.html" + assert not singlefile_file.exists() + headers_file = archived_item_path / "headers.json" + assert headers_file.exists() def test_singlefile_works(tmp_path, process, disable_extractors_dict): disable_extractors_dict.update({"USE_SINGLEFILE": "true"}) add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], - capture_output=True, env=disable_extractors_dict) + capture_output=True, env=disable_extractors_dict) archived_item_path = list(tmp_path.glob('archive/**/*'))[0] output_file = archived_item_path / "singlefile.html" assert output_file.exists() From 207647425292f703ae5cd21e41a980c1cb0d939e Mon Sep 17 00:00:00 2001 From: Ross Williams Date: Wed, 2 Aug 2023 10:56:48 -0400 Subject: [PATCH 3/3] Drop use of TypeAlias to maintain Python 3.9 compat TypeAlias annotation was introduced in Python 3.10, and is not strictly necessary. Drop use of it to maintain Python 3.9 compatibility. 
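For context, a minimal sketch of the two spellings side by side (signatures
abbreviated here); a bare assignment is recognized as an implicit type alias
by type checkers, so no runtime import is needed:

    from typing import Callable

    # Python 3.10+ only:
    #   from typing import TypeAlias
    #   ShouldSaveFunction: TypeAlias = Callable[..., bool]

    # Python 3.9-compatible equivalent:
    ShouldSaveFunction = Callable[..., bool]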
--- archivebox/extractors/__init__.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/archivebox/extractors/__init__.py b/archivebox/extractors/__init__.py index 5f09931e..38710182 100644 --- a/archivebox/extractors/__init__.py +++ b/archivebox/extractors/__init__.py @@ -4,7 +4,7 @@ import os import sys from pathlib import Path -from typing import Callable, Optional, List, Iterable, TypeAlias, Union +from typing import Callable, Optional, List, Iterable, Union from datetime import datetime, timezone from django.db.models import QuerySet @@ -46,9 +46,9 @@ from .archive_org import should_save_archive_dot_org, save_archive_dot_org from .headers import should_save_headers, save_headers -ShouldSaveFunction: TypeAlias = Callable[[Link, Optional[Path], Optional[bool]], bool] -SaveFunction: TypeAlias = Callable[[Link, Optional[Path], int], ArchiveResult] -ArchiveMethodEntry: TypeAlias = tuple[str, ShouldSaveFunction, SaveFunction] +ShouldSaveFunction = Callable[[Link, Optional[Path], Optional[bool]], bool] +SaveFunction = Callable[[Link, Optional[Path], int], ArchiveResult] +ArchiveMethodEntry = tuple[str, ShouldSaveFunction, SaveFunction] def get_default_archive_methods() -> List[ArchiveMethodEntry]: return [
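
Notes on PATCH 1/3: a standalone sketch of the archivable_links() filtering
rules that this series renames. The regex flags match the ones compiled in
archivebox/config.py; the patterns and URLs below are illustrative only, not
defaults.

    import re

    FLAGS = re.IGNORECASE | re.UNICODE | re.MULTILINE

    # Hypothetical user-supplied patterns:
    URL_DENYLIST_PTN = re.compile(r'\.css(\?.*)?$', FLAGS)
    URL_ALLOWLIST_PTN = re.compile(r'^https://example\.com/', FLAGS)

    def is_archivable(url: str) -> bool:
        if URL_DENYLIST_PTN and URL_DENYLIST_PTN.search(url):
            return False  # deny-listed URLs are always skipped
        if URL_ALLOWLIST_PTN and not URL_ALLOWLIST_PTN.search(url):
            return False  # once an allowlist is set, everything else is skipped
        return True

    assert is_archivable('https://example.com/post')
    assert not is_archivable('https://example.com/style.css')  # deny wins over allow
    assert not is_archivable('https://other.org/page')         # not allow-listed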
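Notes on PATCH 2/3: a sketch of how get_archive_methods_for_link() resolves
the per-URL method lists, using the same shape of configuration the tests
pass as JSON through the environment. The method list is abridged and the
patterns are examples only.

    import re

    FLAGS = re.IGNORECASE | re.UNICODE | re.MULTILINE

    # Example configuration, mirroring test_save_allowdenylist_works:
    SAVE_ALLOWLIST = {r'/static': ['headers', 'singlefile']}
    SAVE_DENYLIST = {r'/static': ['singlefile']}

    allow_ptn = {re.compile(k, FLAGS): v for k, v in SAVE_ALLOWLIST.items()}
    deny_ptn = {re.compile(k, FLAGS): v for k, v in SAVE_DENYLIST.items()}

    url = 'http://127.0.0.1:8080/static/example.com.html'
    default_methods = ['favicon', 'headers', 'singlefile', 'wget']  # abridged

    # An empty allow set falls back to every default method:
    allowed = ({m for pat, ms in allow_ptn.items() if pat.search(url) for m in ms}
               or set(default_methods))
    denied = {m for pat, ms in deny_ptn.items() if pat.search(url) for m in ms}

    print([m for m in default_methods if m in allowed - denied])  # -> ['headers']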