add proper support for URL_WHITELIST instead of using negation regexes

2024-06-01 10:09:49 +12:00 · 2021-07-06 23:42:00 -04:00 · 2021-07-06 23:42:00 -04:00 · 5a2c78e14b
parent e4974d3536
commit 5a2c78e14b
2 changed files with 6 additions and 4 deletions
--- a/archivebox/config.py
+++ b/archivebox/config.py
@@ -77,6 +77,7 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
        'OUTPUT_PERMISSIONS':       {'type': str,   'default': '644'},
        'RESTRICT_FILE_NAMES':      {'type': str,   'default': 'windows'},
        'URL_BLACKLIST':            {'type': str,   'default': r'\.(css|js|otf|ttf|woff|woff2|gstatic\.com|googleapis\.com/css)(\?.*)?$'},  # to avoid downloading code assets as their own pages
+        'URL_WHITELIST':            {'type': str,   'default': None},
        'ENFORCE_ATOMIC_WRITES':    {'type': bool,  'default': True},
    },

@@ -337,6 +338,7 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
    'COOKIES_FILE':             {'default': lambda c: c['COOKIES_FILE'] and Path(c['COOKIES_FILE']).resolve()},
    'CHROME_USER_DATA_DIR':     {'default': lambda c: find_chrome_data_dir() if c['CHROME_USER_DATA_DIR'] is None else (Path(c['CHROME_USER_DATA_DIR']).resolve() if c['CHROME_USER_DATA_DIR'] else None)},   # None means unset, so we autodetect it with find_chrome_Data_dir(), but emptystring '' means user manually set it to '', and we should store it as None
    'URL_BLACKLIST_PTN':        {'default': lambda c: c['URL_BLACKLIST'] and re.compile(c['URL_BLACKLIST'] or '', re.IGNORECASE | re.UNICODE | re.MULTILINE)},
+    'URL_WHITELIST_PTN':        {'default': lambda c: c['URL_WHITELIST'] and re.compile(c['URL_WHITELIST'] or '', re.IGNORECASE | re.UNICODE | re.MULTILINE)},
    'DIR_OUTPUT_PERMISSIONS':   {'default': lambda c: c['OUTPUT_PERMISSIONS'].replace('6', '7').replace('4', '5')},

    'ARCHIVEBOX_BINARY':        {'default': lambda c: sys.argv[0] or bin_path('archivebox')},
--- a/archivebox/index/__init__.py
+++ b/archivebox/index/__init__.py
@@ -23,6 +23,7 @@ from ..config import (
    OUTPUT_DIR,
    TIMEOUT,
    URL_BLACKLIST_PTN,
+    URL_WHITELIST_PTN,
    stderr,
    OUTPUT_PERMISSIONS
 )
@@ -141,10 +142,9 @@ def archivable_links(links: Iterable[Link]) -> Iterable[Link]:
            continue
        if scheme(link.url) not in ('http', 'https', 'ftp'):
            continue
-        if URL_BLACKLIST_PTN and (URL_BLACKLIST_PTN.match(link.url) or URL_BLACKLIST_PTN.search(link.url)):
-            # https://stackoverflow.com/questions/180986/what-is-the-difference-between-re-search-and-re-match
-            # we want both behaviors in order to support multiple patterns in the regex,
-            # and negation regexes like (?!someptnhere) to allow for whitelisting
+        if URL_BLACKLIST_PTN and URL_BLACKLIST_PTN.search(link.url):
+            continue
+        if URL_WHITELIST_PTN and (not URL_WHITELIST_PTN.search(link.url)):
            continue

        yield link