From d6de04a83ad0963c1b36209e124a66358d09aab6 Mon Sep 17 00:00:00 2001
From: Nick Sweeting <git@sweeting.me>
Date: Sat, 30 Jan 2021 06:07:35 -0500
Subject: [PATCH 01/19] fix lgtm errors

---
 archivebox/core/settings.py         | 2 ++
 archivebox/extractors/favicon.py    | 3 +--
 archivebox/index/__init__.py        | 2 +-
 archivebox/parsers/generic_txt.py   | 4 ++--
 archivebox/parsers/wallabag_atom.py | 2 +-
 5 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/archivebox/core/settings.py b/archivebox/core/settings.py
index bcf9c073..918e15e9 100644
--- a/archivebox/core/settings.py
+++ b/archivebox/core/settings.py
@@ -33,6 +33,8 @@ LOGOUT_REDIRECT_URL = '/'
 PASSWORD_RESET_URL = '/accounts/password_reset/'
 APPEND_SLASH = True
 
+DEBUG = DEBUG or sys.environ.get('DEBUG', 'false').lower() != 'false' or '--debug' in sys.argv
+
 INSTALLED_APPS = [
     'django.contrib.auth',
     'django.contrib.contenttypes',
diff --git a/archivebox/extractors/favicon.py b/archivebox/extractors/favicon.py
index 3a4aeea7..b8831d0c 100644
--- a/archivebox/extractors/favicon.py
+++ b/archivebox/extractors/favicon.py
@@ -42,14 +42,13 @@ def save_favicon(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT)
         *([] if CHECK_SSL_VALIDITY else ['--insecure']),
         'https://www.google.com/s2/favicons?domain={}'.format(domain(link.url)),
     ]
-    status = 'pending'
+    status = 'failed'
     timer = TimedProgress(timeout, prefix='      ')
     try:
         run(cmd, cwd=str(out_dir), timeout=timeout)
         chmod_file(output, cwd=str(out_dir))
         status = 'succeeded'
     except Exception as err:
-        status = 'failed'
         output = err
     finally:
         timer.end()
diff --git a/archivebox/index/__init__.py b/archivebox/index/__init__.py
index 8eab1d38..04ab0a8d 100644
--- a/archivebox/index/__init__.py
+++ b/archivebox/index/__init__.py
@@ -2,7 +2,6 @@ __package__ = 'archivebox.index'
 
 import os
 import shutil
-import json as pyjson
 from pathlib import Path
 
 from itertools import chain
@@ -42,6 +41,7 @@ from .html import (
     write_html_link_details,
 )
 from .json import (
+    pyjson,
     parse_json_link_details, 
     write_json_link_details,
 )
diff --git a/archivebox/parsers/generic_txt.py b/archivebox/parsers/generic_txt.py
index e296ec7e..94dd523c 100644
--- a/archivebox/parsers/generic_txt.py
+++ b/archivebox/parsers/generic_txt.py
@@ -51,9 +51,9 @@ def parse_generic_txt_export(text_file: IO[str], **_kwargs) -> Iterable[Link]:
             # look inside the URL for any sub-urls, e.g. for archive.org links
             # https://web.archive.org/web/20200531203453/https://www.reddit.com/r/socialism/comments/gu24ke/nypd_officers_claim_they_are_protecting_the_rule/fsfq0sw/
             # -> https://www.reddit.com/r/socialism/comments/gu24ke/nypd_officers_claim_they_are_protecting_the_rule/fsfq0sw/
-            for url in re.findall(URL_REGEX, line[1:]):
+            for sub_url in re.findall(URL_REGEX, line[1:]):
                 yield Link(
-                    url=htmldecode(url),
+                    url=htmldecode(sub_url),
                     timestamp=str(datetime.now().timestamp()),
                     title=None,
                     tags=None,
diff --git a/archivebox/parsers/wallabag_atom.py b/archivebox/parsers/wallabag_atom.py
index 0d77869f..7acfc2fc 100644
--- a/archivebox/parsers/wallabag_atom.py
+++ b/archivebox/parsers/wallabag_atom.py
@@ -45,7 +45,7 @@ def parse_wallabag_atom_export(rss_file: IO[str], **_kwargs) -> Iterable[Link]:
         time = datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%S%z")
         try:
             tags = str_between(get_row('category'), 'label="', '" />')
-        except:
+        except Exception:
             tags = None
 
         yield Link(

From 326ce78496176f753e48d7142c199b750b3780d9 Mon Sep 17 00:00:00 2001
From: Nick Sweeting <git@sweeting.me>
Date: Sat, 30 Jan 2021 06:09:26 -0500
Subject: [PATCH 02/19] simplify debug

---
 archivebox/core/settings.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/archivebox/core/settings.py b/archivebox/core/settings.py
index 918e15e9..e73c93d9 100644
--- a/archivebox/core/settings.py
+++ b/archivebox/core/settings.py
@@ -33,7 +33,7 @@ LOGOUT_REDIRECT_URL = '/'
 PASSWORD_RESET_URL = '/accounts/password_reset/'
 APPEND_SLASH = True
 
-DEBUG = DEBUG or sys.environ.get('DEBUG', 'false').lower() != 'false' or '--debug' in sys.argv
+DEBUG = DEBUG or ('--debug' in sys.argv)
 
 INSTALLED_APPS = [
     'django.contrib.auth',

From 8e493bf556c75d6560ab78e7f04556b290416178 Mon Sep 17 00:00:00 2001
From: Nick Sweeting <git@sweeting.me>
Date: Sat, 30 Jan 2021 06:16:24 -0500
Subject: [PATCH 03/19] heading fix

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 19196b4f..c1464eb8 100644
--- a/README.md
+++ b/README.md
@@ -639,7 +639,7 @@ archivebox config --set DEBUG=True
 archivebox server --debug ...
 ```
 
-### Build and run a Github branch
+#### Build and run a Github branch
 
 ```bash
 docker build -t archivebox:dev https://github.com/ArchiveBox/ArchiveBox.git#dev

From c25853969d6996ca5200f411b0e96dee6ec6908c Mon Sep 17 00:00:00 2001
From: Nick Sweeting <git@sweeting.me>
Date: Sat, 30 Jan 2021 08:25:34 -0500
Subject: [PATCH 04/19] add dbshell command examples for executing SQL

---
 README.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index c1464eb8..a83922a3 100644
--- a/README.md
+++ b/README.md
@@ -73,7 +73,7 @@ archivebox help
 - `archivebox add/remove/update/list` to manage Snapshots in the archive
 - `archivebox schedule` to pull in fresh URLs in regularly from [boorkmarks/history/Pocket/Pinboard/RSS/etc.](#input-formats)
 - `archivebox oneshot` archive single URLs without starting a whole collection
-- `archivebox shell` open a REPL to use the [Python API](https://docs.archivebox.io/en/latest/modules.html) (alpha)
+- `archivebox shell/manage dbshell` open a REPL to use the [Python API](https://docs.archivebox.io/en/latest/modules.html) (alpha), or SQL API
 
 <div align="center">
 <br/>
@@ -669,6 +669,7 @@ cd archivebox/
 
 cd path/to/test/data/
 archivebox shell
+archivebox manage dbshell
 ```
 (uses `pytest -s`)
 

From 9d24bfd0dcef782a64d4b52117aa5ab5a67e9163 Mon Sep 17 00:00:00 2001
From: Nick Sweeting <git@sweeting.me>
Date: Sat, 30 Jan 2021 20:38:59 -0500
Subject: [PATCH 05/19] disable progress bars on mac again

---
 archivebox/config.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/archivebox/config.py b/archivebox/config.py
index 7fd4b2fc..23ec17d2 100644
--- a/archivebox/config.py
+++ b/archivebox/config.py
@@ -27,6 +27,7 @@ import re
 import sys
 import json
 import getpass
+import platform
 import shutil
 import django
 
@@ -51,7 +52,7 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
     'SHELL_CONFIG': {
         'IS_TTY':                   {'type': bool,  'default': lambda _: sys.stdout.isatty()},
         'USE_COLOR':                {'type': bool,  'default': lambda c: c['IS_TTY']},
-        'SHOW_PROGRESS':            {'type': bool,  'default': lambda c: c['IS_TTY']},
+        'SHOW_PROGRESS':            {'type': bool,  'default': lambda c: (c['IS_TTY'] and platform.system() != 'Darwin')},  # progress bars are buggy on mac, disable for now
         'IN_DOCKER':                {'type': bool,  'default': False},
         # TODO: 'SHOW_HINTS':       {'type:  bool,  'default': True},
     },

From d072f1d4136cb3cb0f07e413395f0e62dcb6f118 Mon Sep 17 00:00:00 2001
From: Nick Sweeting <git@sweeting.me>
Date: Sat, 30 Jan 2021 20:39:11 -0500
Subject: [PATCH 06/19] hide ssl warnings when checking SSL is disabled

---
 archivebox/config.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/archivebox/config.py b/archivebox/config.py
index 23ec17d2..f984d027 100644
--- a/archivebox/config.py
+++ b/archivebox/config.py
@@ -915,7 +915,11 @@ os.umask(0o777 - int(OUTPUT_PERMISSIONS, base=8))  # noqa: F821
 NODE_BIN_PATH = str((Path(CONFIG["OUTPUT_DIR"]).absolute() / 'node_modules' / '.bin'))
 sys.path.append(NODE_BIN_PATH)
 
-
+if not CHECK_SSL_VALIDITY:
+    import urllib3
+    import requests
+    requests.packages.urllib3.disable_warnings(requests.packages.urllib3.exceptions.InsecureRequestWarning)
+    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
 
 
 ########################### Config Validity Checkers ###########################

From b9b1c3d9e8990ab3d603a78116be958a622b2a16 Mon Sep 17 00:00:00 2001
From: Nick Sweeting <git@sweeting.me>
Date: Sat, 30 Jan 2021 20:40:10 -0500
Subject: [PATCH 07/19] fix singlefile output path not relative

---
 archivebox/core/admin.py            | 2 +-
 archivebox/extractors/singlefile.py | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/archivebox/core/admin.py b/archivebox/core/admin.py
index 8c3c3599..ea51f668 100644
--- a/archivebox/core/admin.py
+++ b/archivebox/core/admin.py
@@ -99,7 +99,7 @@ class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin):
     list_display = ('added', 'title_str', 'url_str', 'files', 'size')
     sort_fields = ('title_str', 'url_str', 'added')
     readonly_fields = ('id', 'url', 'timestamp', 'num_outputs', 'is_archived', 'url_hash', 'added', 'updated')
-    search_fields = ['url', 'timestamp', 'title', 'tags__name']
+    search_fields = ['url__icontains', 'timestamp', 'title', 'tags__name']
     fields = (*readonly_fields, 'title', 'tags')
     list_filter = ('added', 'updated', 'tags')
     ordering = ['-added']
diff --git a/archivebox/extractors/singlefile.py b/archivebox/extractors/singlefile.py
index 8d9b36be..3279960e 100644
--- a/archivebox/extractors/singlefile.py
+++ b/archivebox/extractors/singlefile.py
@@ -39,7 +39,7 @@ def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO
     """download full site using single-file"""
 
     out_dir = out_dir or Path(link.link_dir)
-    output = str(out_dir.absolute() / "singlefile.html")
+    output = "singlefile.html"
 
     browser_args = chrome_args(TIMEOUT=0)
 
@@ -50,7 +50,7 @@ def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO
         '--browser-executable-path={}'.format(CHROME_BINARY),
         browser_args,
         link.url,
-        output
+        output,
     ]
 
     status = 'succeeded'
@@ -71,9 +71,9 @@ def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO
         )
 
         # Check for common failure cases
-        if (result.returncode > 0):
+        if (result.returncode > 0) or not (out_dir / output).is_file():
             raise ArchiveError('SingleFile was not able to archive the page', hints)
-        chmod_file(output)
+        chmod_file(output, cwd=str(out_dir))
     except (Exception, OSError) as err:
         status = 'failed'
         # TODO: Make this prettier. This is necessary to run the command (escape JSON internal quotes).

From c089501073983b6d96d9ec08fcb66f49745e21db Mon Sep 17 00:00:00 2001
From: Nick Sweeting <git@sweeting.me>
Date: Sat, 30 Jan 2021 20:41:39 -0500
Subject: [PATCH 08/19] add response status code to headers.json

---
 archivebox/util.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/archivebox/util.py b/archivebox/util.py
index 5530ab45..a96950bb 100644
--- a/archivebox/util.py
+++ b/archivebox/util.py
@@ -200,7 +200,13 @@ def get_headers(url: str, timeout: int=None) -> str:
             stream=True
         )
     
-    return pyjson.dumps(dict(response.headers), indent=4)
+    return pyjson.dumps(
+        {
+            'Status-Code': response.status_code,
+            **dict(response.headers),
+        },
+        indent=4,
+    )
 
 
 @enforce_types

From 24e24934f761ca488b0b51c21da1935df96ab244 Mon Sep 17 00:00:00 2001
From: Nick Sweeting <git@sweeting.me>
Date: Sat, 30 Jan 2021 21:58:38 -0500
Subject: [PATCH 09/19] add headers.json and fix relative singlefile path
 resolving for sonic

---
 archivebox/index/schema.py | 1 +
 archivebox/search/utils.py | 3 ++-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/archivebox/index/schema.py b/archivebox/index/schema.py
index 5c5eb0f0..7e2c784d 100644
--- a/archivebox/index/schema.py
+++ b/archivebox/index/schema.py
@@ -427,6 +427,7 @@ class Link:
             'archive_org_path': 'https://web.archive.org/web/{}'.format(self.base_url),
             'git_path': 'git/',
             'media_path': 'media/',
+            'headers_path': 'headers.json',
         }
         if self.is_static:
             # static binary files like PDF and images are handled slightly differently.
diff --git a/archivebox/search/utils.py b/archivebox/search/utils.py
index 55c97e75..e6d15455 100644
--- a/archivebox/search/utils.py
+++ b/archivebox/search/utils.py
@@ -34,10 +34,11 @@ def get_indexable_content(results: QuerySet):
         return []
     # This should come from a plugin interface
 
+    # TODO: banish this duplication and get these from the extractor file
     if method == 'readability':
         return get_file_result_content(res, 'content.txt')
     elif method == 'singlefile':
-        return get_file_result_content(res, '')
+        return get_file_result_content(res,'',use_pwd=True)
     elif method == 'dom':
         return get_file_result_content(res,'',use_pwd=True)
     elif method == 'wget':

From 385daf9af8ad203ff03f50b5d9cb7d44c953522e Mon Sep 17 00:00:00 2001
From: Nick Sweeting <git@sweeting.me>
Date: Sat, 30 Jan 2021 22:01:49 -0500
Subject: [PATCH 10/19] save the url as title for staticfiles or non html files

---
 archivebox/extractors/title.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/archivebox/extractors/title.py b/archivebox/extractors/title.py
index 816c0484..194c57ad 100644
--- a/archivebox/extractors/title.py
+++ b/archivebox/extractors/title.py
@@ -62,9 +62,6 @@ class TitleParser(HTMLParser):
 
 @enforce_types
 def should_save_title(link: Link, out_dir: Optional[str]=None, overwrite: Optional[bool]=False) -> bool:
-    if is_static_file(link.url):
-        return False
-
     # if link already has valid title, skip it
     if not overwrite and link.title and not link.title.lower().startswith('http'):
         return False
@@ -113,7 +110,11 @@ def save_title(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -
                                         timestamp=link.timestamp)\
                                 .update(title=output)
         else:
-            raise ArchiveError('Unable to detect page title')
+            # if no content was returned, dont save a title (because it might be a temporary error)
+            if not html:
+                raise ArchiveError('Unable to detect page title')
+            # output = html[:128]       # use first bit of content as the title
+            output = link.base_url      # use the filename as the title (better UX)
     except Exception as err:
         status = 'failed'
         output = err

From e6fa16e13a24e0d6146398f3556133d97ce20156 Mon Sep 17 00:00:00 2001
From: Nick Sweeting <git@sweeting.me>
Date: Sat, 30 Jan 2021 22:02:11 -0500
Subject: [PATCH 11/19] only chmod wget output if it exists

---
 archivebox/extractors/wget.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/archivebox/extractors/wget.py b/archivebox/extractors/wget.py
index 33529e4c..54b631f9 100644
--- a/archivebox/extractors/wget.py
+++ b/archivebox/extractors/wget.py
@@ -105,7 +105,12 @@ def save_wget(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) ->
             if b'ERROR 500: Internal Server Error' in result.stderr:
                 raise ArchiveError('500 Internal Server Error', hints)
             raise ArchiveError('Wget failed or got an error from the server', hints)
-        chmod_file(output, cwd=str(out_dir))
+        
+        if (out_dir / output).exists():
+            chmod_file(output, cwd=str(out_dir))
+        else:
+            print(f'          {out_dir}/{output}')
+            raise ArchiveError('Failed to find wget output after running', hints)
     except Exception as err:
         status = 'failed'
         output = err

From 846c966c4d75929a5450e546d27e1e417a5e13de Mon Sep 17 00:00:00 2001
From: Nick Sweeting <git@sweeting.me>
Date: Sat, 30 Jan 2021 22:02:39 -0500
Subject: [PATCH 12/19] use globbing to find wget output path

---
 archivebox/extractors/wget.py | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/archivebox/extractors/wget.py b/archivebox/extractors/wget.py
index 54b631f9..fac212c2 100644
--- a/archivebox/extractors/wget.py
+++ b/archivebox/extractors/wget.py
@@ -134,9 +134,7 @@ def wget_output_path(link: Link) -> Optional[str]:
 
     See docs on wget --adjust-extension (-E)
     """
-    if is_static_file(link.url):
-        return without_scheme(without_fragment(link.url))
-
+    
     # Wget downloads can save in a number of different ways depending on the url:
     #    https://example.com
     #       > example.com/index.html
@@ -187,7 +185,7 @@ def wget_output_path(link: Link) -> Optional[str]:
                 last_part_of_url = urldecode(full_path.rsplit('/', 1)[-1])
                 for file_present in search_dir.iterdir():
                     if file_present == last_part_of_url:
-                        return str(search_dir / file_present)
+                        return str((search_dir / file_present).relative_to(link.link_dir))
 
         # Move up one directory level
         search_dir = search_dir.parent
@@ -195,10 +193,16 @@ def wget_output_path(link: Link) -> Optional[str]:
         if str(search_dir) == link.link_dir:
             break
 
-
+    # check for staticfiles
+    base_url = without_scheme(without_fragment(link.url))
+    domain_dir = Path(domain(link.url).replace(":", "+"))
+    files_within = list((Path(link.link_dir) / domain_dir).glob('**/*.*'))
+    if files_within:
+        return str((domain_dir / files_within[-1]).relative_to(link.link_dir))
     
-    search_dir = Path(link.link_dir) / domain(link.url).replace(":", "+") / urldecode(full_path)
-    if not search_dir.is_dir():
-        return str(search_dir.relative_to(link.link_dir))
+    # fallback to just the domain dir
+    search_dir = Path(link.link_dir) / domain(link.url).replace(":", "+")
+    if search_dir.is_dir():
+        return domain(link.url).replace(":", "+")
 
     return None

From 15e87353bd83fcc12e1086fbcce308a249a7b351 Mon Sep 17 00:00:00 2001
From: Nick Sweeting <git@sweeting.me>
Date: Sat, 30 Jan 2021 22:03:59 -0500
Subject: [PATCH 13/19] only show archive.org if enabled

---
 archivebox/index/html.py   | 2 ++
 archivebox/index/schema.py | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/archivebox/index/html.py b/archivebox/index/html.py
index cff50085..c8b9d07e 100644
--- a/archivebox/index/html.py
+++ b/archivebox/index/html.py
@@ -23,6 +23,7 @@ from ..config import (
     GIT_SHA,
     FOOTER_INFO,
     HTML_INDEX_FILENAME,
+    SAVE_ARCHIVE_DOT_ORG,
 )
 
 MAIN_INDEX_TEMPLATE = 'static_index.html'
@@ -103,6 +104,7 @@ def link_details_template(link: Link) -> str:
         'status': 'archived' if link.is_archived else 'not yet archived',
         'status_color': 'success' if link.is_archived else 'danger',
         'oldest_archive_date': ts_to_date(link.oldest_archive_date),
+        'SAVE_ARCHIVE_DOT_ORG': SAVE_ARCHIVE_DOT_ORG,
     })
 
 @enforce_types
diff --git a/archivebox/index/schema.py b/archivebox/index/schema.py
index 7e2c784d..7501da3a 100644
--- a/archivebox/index/schema.py
+++ b/archivebox/index/schema.py
@@ -412,6 +412,8 @@ class Link:
         """predict the expected output paths that should be present after archiving"""
 
         from ..extractors.wget import wget_output_path
+        # TODO: banish this awful duplication from the codebase and import these
+        # from their respective extractor files
         canonical = {
             'index_path': 'index.html',
             'favicon_path': 'favicon.ico',

From 54c53316939cfe6a1e6dbece64eff16f6061b5a5 Mon Sep 17 00:00:00 2001
From: Nick Sweeting <git@sweeting.me>
Date: Sat, 30 Jan 2021 22:04:14 -0500
Subject: [PATCH 14/19] check for output existence when rendering files icons

---
 archivebox/index/html.py | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/archivebox/index/html.py b/archivebox/index/html.py
index c8b9d07e..5eba0959 100644
--- a/archivebox/index/html.py
+++ b/archivebox/index/html.py
@@ -140,22 +140,22 @@ def snapshot_icons(snapshot) -> str:
     exclude = ["favicon", "title", "headers", "archive_org"]
     # Missing specific entry for WARC
 
-    extractor_items = defaultdict(lambda: None)
+    extractor_outputs = defaultdict(lambda: None)
     for extractor, _ in EXTRACTORS:
         for result in archive_results:
-            if result.extractor == extractor:
-                extractor_items[extractor] = result
+            if result.extractor == extractor and result:
+                extractor_outputs[extractor] = result
 
     for extractor, _ in EXTRACTORS:
         if extractor not in exclude:
-            exists = False
-            if extractor_items[extractor] is not None:
-                outpath = (Path(path) / canon[f"{extractor}_path"])
-                if outpath.is_dir():
+            outpath = extractor_outputs[extractor] and extractor_outputs[extractor].output
+            if outpath:
+                outpath = (Path(path) / outpath)
+                if outpath.is_file():
+                    exists = True
+                elif outpath.is_dir():
                     exists = any(outpath.glob('*.*'))
-                elif outpath.is_file():
-                    exists = outpath.stat().st_size > 100
-            output += format_html(output_template, path, canon[f"{extractor}_path"], str(exists),
+            output += format_html(output_template, path, canon[f"{extractor}_path"], str(bool(outpath)),
                                          extractor, icons.get(extractor, "?"))
         if extractor == "wget":
             # warc isn't technically it's own extractor, so we have to add it after wget

From 560d3103a89b418dadced6e4f68eb37a3e674c4d Mon Sep 17 00:00:00 2001
From: Nick Sweeting <git@sweeting.me>
Date: Sat, 30 Jan 2021 22:04:24 -0500
Subject: [PATCH 15/19] cleanup snapshot detail page UI

---
 archivebox/templates/core/snapshot.html | 211 +++++++++++++++---------
 1 file changed, 133 insertions(+), 78 deletions(-)

diff --git a/archivebox/templates/core/snapshot.html b/archivebox/templates/core/snapshot.html
index b1edcfe0..ebf2385a 100644
--- a/archivebox/templates/core/snapshot.html
+++ b/archivebox/templates/core/snapshot.html
@@ -33,7 +33,7 @@
             }
             .nav > div {
                 min-height: 30px;
-                margin: 8px 0px;
+                line-height: 1.3;
             }
             .header-top a {
                 text-decoration: none;
@@ -68,6 +68,11 @@
                 vertical-align: -2px;
                 margin-right: 4px;
             }
+            .header-toggle {
+                line-height: 14px;
+                font-size: 70px;
+                vertical-align: -8px;
+            }
             
             .info-row {
                 margin-top: 2px;
@@ -76,24 +81,30 @@
             .info-row .alert {
                 margin-bottom: 0px;
             }
-            .card {
+            .header-bottom-frames .card {
                 overflow: hidden;
                 box-shadow: 2px 3px 14px 0px rgba(0,0,0,0.02);
                 margin-top: 10px;
+                border: 1px solid rgba(0,0,0,3);
+                border-radius: 14px;
+                background-color: black;
             }
             .card h4 {
                 font-size: 1.4vw;
             }
             .card-body {
-                font-size: 1vw;
-                padding-top: 1.2vw;
-                padding-left: 1vw;
-                padding-right: 1vw;
-                padding-bottom: 1vw;
+                font-size: 15px;
+                padding: 13px 10px;
+                padding-bottom: 6px;
+                /* padding-left: 3px; */
+                /* padding-right: 3px; */
+                /* padding-bottom: 3px; */
                 line-height: 1.1;
                 word-wrap: break-word;
                 max-height: 102px;
                 overflow: hidden;
+                background-color: #1a1a1a;
+                color: #d3d3d3;
             }
             .card-title {
                 margin-bottom: 4px;
@@ -126,7 +137,7 @@
                 border-top: 3px solid #aa1e55;
             }
             .card.selected-card {
-                border: 2px solid orange;
+                border: 1px solid orange;
                 box-shadow: 0px -6px 13px 1px rgba(0,0,0,0.05);
             }
             .iframe-large {
@@ -174,12 +185,13 @@
                 width: 98%;
                 border: 1px solid rgba(0,0,0,0.2);
                 box-shadow: 4px 4px 4px rgba(0,0,0,0.2);
-                margin-top: 5px;
+                margin-top: 0px;
             }
             .header-bottom-info {
                 color: #6f6f6f;
-                padding-top: 8px;
-                padding-bottom: 13px;
+                padding-top: 0px;
+                padding-bottom: 0px;
+                margin: 0px -15px;
             }
 
             .header-bottom-info > div {
@@ -203,12 +215,30 @@
                 margin-top: 5px;
             }
             .header-bottom-frames .card-title {
-                padding-bottom: 0px;
-                font-size: 1.2vw;
+                width: 100%;
+                text-align: center;
+                font-size: 18px;
                 margin-bottom: 5px;
+                display: inline-block;
+                color: #d3d3d3;
+                font-weight: 200;
+                vertical-align: 0px;
+                margin-top: -6px;
             }
             .header-bottom-frames .card-text {
+                width: 100%;
+                text-align: center;
                 font-size: 0.9em;
+                display: inline-block;
+                position: relative;
+                top: -11px;
+            }
+            .card-text code {
+                padding: .2rem .4rem;
+                font-size: 90%;
+                color: #bd4147;
+                background-color: #101010;
+                border-radius: .25rem;
             }
 
             @media(max-width: 1092px) {
@@ -247,7 +277,7 @@
                         </a>
                     </div>
                     <div class="col-lg-8">
-                        <img src="favicon.ico" alt="Favicon">
+                        <img src="favicon.ico" onerror="this.style.opacity=0" alt="Favicon">
                         &nbsp;&nbsp;
                         {{title}}
                         &nbsp;&nbsp;
@@ -316,120 +346,145 @@
                     </div>
                 </div>
                 <div class="row header-bottom-frames">
-                    <div class="col-lg-3">
+                    <div class="col-lg-2">
                         <div class="card selected-card">
-                          <iframe class="card-img-top" src="{{archive_url}}" sandbox="allow-same-origin allow-scripts allow-forms" scrolling="no"></iframe>
-                          <div class="card-body">
-                                <a href="{{archive_url}}" style="float:right" title="Open in new tab..." target="_blank" rel="noopener">
-                                    <img src="../../static/external.png" class="external"/>
-                                </a>
-                                <a href="{{archive_url}}" target="preview"><h4 class="card-title">Wget &gt; WARC</h4></a>
-                                <p class="card-text">archive/{{domain}}</p>
-                            </div>
-                        </div>
-                    </div>
-                    <div class="col-lg-3">
-                        <div class="card">
                             <iframe class="card-img-top" src="{{singlefile_path}}" sandbox="allow-same-origin allow-scripts allow-forms" scrolling="no"></iframe>
                             <div class="card-body">
-                                <a href="{{singlefile_path}}" style="float:right" title="Open in new tab..." target="_blank" rel="noopener">
-                                    <img src="../../static/external.png" class="external"/>
+                                <a href="{{singlefile_path}}" title="Open in new tab..." target="_blank" rel="noopener">
+                                    <p class="card-text"><code>./singlefile.html</code></p>
                                 </a>
                                 <a href="{{singlefile_path}}" target="preview"><h4 class="card-title">Chrome &gt; SingleFile</h4></a>
-                                <p class="card-text">archive/singlefile.html</p>
                           </div>
                         </div>
                     </div>
-                    <div class="col-lg-3">
-                        <div class="card">
-                            <iframe class="card-img-top" src="{{archive_org_path}}" sandbox="allow-same-origin allow-scripts allow-forms" scrolling="no"></iframe>
-                            <div class="card-body">
-                                <a href="{{archive_org_path}}" style="float:right" title="Open in new tab..." target="_blank" rel="noopener">
-                                    <img src="../../static/external.png" class="external"/>
-                                </a>
-                                <a href="{{archive_org_path}}" target="preview"><h4 class="card-title">Archive.Org</h4></a>
-                                <p class="card-text">web.archive.org/web/...</p>
-                          </div>
-                        </div>
-                    </div>
-                    <div class="col-lg-3">
-                        <div class="card">
-                            <iframe class="card-img-top" src="{{url}}" sandbox="allow-same-origin allow-scripts allow-forms" scrolling="no"></iframe>
-                            <div class="card-body">
-                                <a href="{{url}}" style="float:right" title="Open in new tab..." target="_blank" rel="noopener">
-                                    <img src="../../static/external.png" class="external"/>
-                                </a>
-                                <a href="{{url}}" target="preview"><h4 class="card-title">Original</h4></a>
-                                <p class="card-text">{{domain}}</p>
-                          </div>
-                        </div>
-                    </div>
-                    <br/>
-                    <div class="col-lg-3">
+                    <div class="col-lg-2">
                         <div class="card">
                             <iframe class="card-img-top pdf-frame" src="{{pdf_path}}" scrolling="no"></iframe>
                             <div class="card-body">
-                                <a href="{{pdf_path}}" style="float:right" title="Open in new tab..." target="_blank" rel="noopener">
-                                    <img src="../../static/external.png" class="external"/>
+                                <a href="{{pdf_path}}" title="Open in new tab..." target="_blank" rel="noopener">
+                                    <p class="card-text"><code>./output.pdf</code></p>
                                 </a>
                                 <a href="{{pdf_path}}" target="preview" id="pdf-btn"><h4 class="card-title">Chrome &gt; PDF</h4></a>
-                                <p class="card-text">archive/output.pdf</p>
                           </div>
                         </div>
                     </div>
-                    <div class="col-lg-3">
+                    <div class="col-lg-2">
                         <div class="card">
-                            <img class="card-img-top screenshot" src="{{screenshot_path}}"></iframe>
+                            <img class="card-img-top" src="{{screenshot_path}}" onerror="this.style.opacity=0.2"/>
                             <div class="card-body">
-                                <a href="{{screenshot_path}}" style="float:right" title="Open in new tab..." target="_blank" rel="noopener">
-                                    <img src="../../static/external.png" class="external"/>
+                                <a href="{{screenshot_path}}" title="Open in new tab..." target="_blank" rel="noopener">
+                                    <p class="card-text"><code>./screenshot.png</code></p>
                                 </a>
                                 <a href="{{screenshot_path}}" target="preview"><h4 class="card-title">Chrome &gt; Screenshot</h4></a>
-                                <p class="card-text">archive/screenshot.png</p>
                           </div>
                         </div>
                     </div>
-                    <div class="col-lg-3">
+                    <div class="col-lg-2">
+                        <div class="card">
+                          <iframe class="card-img-top" src="{{archive_url}}" sandbox="allow-same-origin allow-scripts allow-forms" scrolling="no"></iframe>
+                          <div class="card-body">
+                                <a href="{{archive_url}}" title="Open in new tab..." target="_blank" rel="noopener">
+                                    <p class="card-text"><code>./{{domain}}</code></p>
+                                </a>
+                                <a href="{{archive_url}}" target="preview"><h4 class="card-title">Wget &gt; HTML</h4></a>
+                            </div>
+                        </div>
+                    </div>
+                    {% if SAVE_ARCHIVE_DOT_ORG %}
+                    <div class="col-lg-2">
+                        <div class="card">
+                            <iframe class="card-img-top" src="{{archive_org_path}}" sandbox="allow-same-origin allow-scripts allow-forms" scrolling="no"></iframe>
+                            <div class="card-body">
+                                <a href="{{archive_org_path}}" title="Open in new tab..." target="_blank" rel="noopener">
+                                    <p class="card-text"><code>🌐 web.archive.org/web/...</code></p>
+                                </a>
+                                <a href="{{archive_org_path}}" target="preview"><h4 class="card-title">Archive.Org</h4></a>
+                          </div>
+                        </div>
+                    </div>
+                    {% endif %}
+                    <div class="col-lg-2">
+                        <div class="card">
+                            <iframe class="card-img-top" src="{{url}}" sandbox="allow-same-origin allow-scripts allow-forms" scrolling="no"></iframe>
+                            <div class="card-body">
+                                <a href="{{url}}" title="Open in new tab..." target="_blank" rel="noopener">
+                                    <p class="card-text"><code>🌐 {{domain}}</code></p>
+                                </a>
+                                <a href="{{url}}" target="preview"><h4 class="card-title">Original</h4></a>
+                          </div>
+                        </div>
+                    </div>
+                    <div class="col-lg-2">
+                        <div class="card">
+                            <iframe class="card-img-top" src="{{headers_path}}" sandbox="allow-same-origin allow-scripts allow-forms" scrolling="no"></iframe>
+                            <div class="card-body">
+                                <a href="{{headers_path}}" title="Open in new tab..." target="_blank" rel="noopener">
+                                    <p class="card-text"><code>./headers.json</code></p>
+                                </a>
+                                <a href="{{headers_path}}" target="preview"><h4 class="card-title">Headers</h4></a>
+                          </div>
+                        </div>
+                    </div>
+                    <div class="col-lg-2">
                         <div class="card">
                             <iframe class="card-img-top" src="{{dom_path}}" sandbox="allow-same-origin allow-scripts allow-forms" scrolling="no"></iframe>
                             <div class="card-body">
-                                <a href="{{dom_path}}" style="float:right" title="Open in new tab..." target="_blank" rel="noopener">
-                                    <img src="../../static/external.png" class="external"/>
+                                <a href="{{dom_path}}" title="Open in new tab..." target="_blank" rel="noopener">
+                                    <p class="card-text"><code>./output.html</code></p>
                                 </a>
                                 <a href="{{dom_path}}" target="preview"><h4 class="card-title">Chrome &gt; HTML</h4></a>
-                                <p class="card-text">archive/output.html</p>
                           </div>
                         </div>
                     </div>
-                    <div class="col-lg-3">
+                    <div class="col-lg-2">
                         <div class="card">
                             <iframe class="card-img-top" src="{{readability_path}}" sandbox="allow-same-origin allow-scripts allow-forms" scrolling="no"></iframe>
                             <div class="card-body">
-                                <a href="{{readability_path}}" style="float:right" title="Open in new tab..." target="_blank" rel="noopener">
-                                    <img src="../../static/external.png" class="external"/>
+                                <a href="{{readability_path}}" title="Open in new tab..." target="_blank" rel="noopener">
+                                    <p class="card-text"><code>./readability/content.html</code></p>
                                 </a>
                                 <a href="{{readability_path}}" target="preview"><h4 class="card-title">Readability</h4></a>
-                                <p class="card-text">archive/readability/...</p>
                           </div>
                         </div>
                     </div>
                     <br/>
-                    <div class="col-lg-3">
+                    <div class="col-lg-2">
                         <div class="card">
                             <iframe class="card-img-top" src="{{mercury_path}}" sandbox="allow-same-origin allow-scripts allow-forms" scrolling="no"></iframe>
                             <div class="card-body">
-                                <a href="{{mercury_path}}" style="float:right" title="Open in new tab..." target="_blank" rel="noopener">
-                                    <img src="../../static/external.png" class="external"/>
+                                <a href="{{mercury_path}}" title="Open in new tab..." target="_blank" rel="noopener">
+                                    <p class="card-text"><code>./mercury/content.html</code></p>
                                 </a>
-                                <a href="{{mercury_path}}" target="preview"><h4 class="card-title">mercury</h4></a>
-                                <p class="card-text">archive/mercury/...</p>
+                                <a href="{{mercury_path}}" target="preview"><h4 class="card-title">Mercury</h4></a>
+                          </div>
+                        </div>
+                    </div>
+                    <div class="col-lg-2">
+                        <div class="card">
+                            <iframe class="card-img-top" src="{{media_path}}" sandbox="allow-same-origin allow-scripts allow-forms" scrolling="no"></iframe>
+                            <div class="card-body">
+                                <a href="{{media_path}}" title="Open in new tab..." target="_blank" rel="noopener">
+                                    <p class="card-text"><code>./media/*.mp4</code></p>
+                                </a>
+                                <a href="{{media_path}}" target="preview"><h4 class="card-title">Media</h4></a>
+                          </div>
+                        </div>
+                    </div>
+                    <div class="col-lg-2">
+                        <div class="card">
+                            <iframe class="card-img-top" src="{{git_path}}" sandbox="allow-same-origin allow-scripts allow-forms" scrolling="no"></iframe>
+                            <div class="card-body">
+                                <a href="{{git_path}}" title="Open in new tab..." target="_blank" rel="noopener">
+                                    <p class="card-text"><code>./git/*.git</code></p>
+                                </a>
+                                <a href="{{git_path}}" target="preview"><h4 class="card-title">Git</h4></a>
                           </div>
                         </div>
                     </div>
                 </div>
             </div>
         </header>
-        <iframe sandbox="allow-same-origin allow-scripts allow-forms" class="full-page-iframe" src="{{archive_url}}" name="preview"></iframe>
+        <iframe sandbox="allow-same-origin allow-scripts allow-forms" class="full-page-iframe" src="{{singlefile_url}}" name="preview"></iframe>
     
         <script
               src="https://code.jquery.com/jquery-3.2.1.slim.min.js"

From 923f517a8f22ecef87b1695fda418383cf0ab2c0 Mon Sep 17 00:00:00 2001
From: Nick Sweeting <git@sweeting.me>
Date: Mon, 1 Feb 2021 02:17:54 -0500
Subject: [PATCH 16/19] minor fixes

---
 archivebox/core/admin.py                | 1 +
 archivebox/index/html.py                | 7 ++++++-
 archivebox/templates/core/snapshot.html | 2 +-
 3 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/archivebox/core/admin.py b/archivebox/core/admin.py
index ea51f668..bacc53c0 100644
--- a/archivebox/core/admin.py
+++ b/archivebox/core/admin.py
@@ -106,6 +106,7 @@ class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin):
     actions = [delete_snapshots, overwrite_snapshots, update_snapshots, update_titles, verify_snapshots]
     actions_template = 'admin/actions_as_select.html'
     form = SnapshotAdminForm
+    list_per_page = 40
 
     def get_urls(self):
         urls = super().get_urls()
diff --git a/archivebox/index/html.py b/archivebox/index/html.py
index 5eba0959..d97c6595 100644
--- a/archivebox/index/html.py
+++ b/archivebox/index/html.py
@@ -118,6 +118,8 @@ def render_django_template(template: str, context: Mapping[str, str]) -> str:
 def snapshot_icons(snapshot) -> str:
     from core.models import EXTRACTORS
 
+    # start = datetime.now()
+
     archive_results = snapshot.archiveresult_set.filter(status="succeeded")
     link = snapshot.as_link()
     path = link.archive_path
@@ -169,4 +171,7 @@ def snapshot_icons(snapshot) -> str:
             output += '<a href="{}" class="exists-{}" title="{}">{}</a> '.format(canon["archive_org_path"], str(exists),
                                                                                         "archive_org", icons.get("archive_org", "?"))
 
-    return format_html('<span class="files-icons" style="font-size: 1.1em; opacity: 0.8; min-width: 240px; display: inline-block">{}<span>', mark_safe(output))
+    result = format_html('<span class="files-icons" style="font-size: 1.1em; opacity: 0.8; min-width: 240px; display: inline-block">{}<span>', mark_safe(output))
+    # end = datetime.now()
+    # print(((end - start).total_seconds()*1000) // 1, 'ms')
+    return result
diff --git a/archivebox/templates/core/snapshot.html b/archivebox/templates/core/snapshot.html
index ebf2385a..839df05c 100644
--- a/archivebox/templates/core/snapshot.html
+++ b/archivebox/templates/core/snapshot.html
@@ -484,7 +484,7 @@
                 </div>
             </div>
         </header>
-        <iframe sandbox="allow-same-origin allow-scripts allow-forms" class="full-page-iframe" src="{{singlefile_url}}" name="preview"></iframe>
+        <iframe sandbox="allow-same-origin allow-scripts allow-forms" class="full-page-iframe" src="{{singlefile_path}}" name="preview"></iframe>
     
         <script
               src="https://code.jquery.com/jquery-3.2.1.slim.min.js"

From 534ead2440a3ecbe5ea44a81bcde850a50e9822f Mon Sep 17 00:00:00 2001
From: Nick Sweeting <git@sweeting.me>
Date: Mon, 1 Feb 2021 02:18:13 -0500
Subject: [PATCH 17/19] use the db exclusively for icons instead of hammering
 filesystem

---
 archivebox/index/html.py | 33 +++++++++++++++++++++------------
 1 file changed, 21 insertions(+), 12 deletions(-)

diff --git a/archivebox/index/html.py b/archivebox/index/html.py
index d97c6595..ebfe7d78 100644
--- a/archivebox/index/html.py
+++ b/archivebox/index/html.py
@@ -150,24 +150,33 @@ def snapshot_icons(snapshot) -> str:
 
     for extractor, _ in EXTRACTORS:
         if extractor not in exclude:
-            outpath = extractor_outputs[extractor] and extractor_outputs[extractor].output
-            if outpath:
-                outpath = (Path(path) / outpath)
-                if outpath.is_file():
-                    exists = True
-                elif outpath.is_dir():
-                    exists = any(outpath.glob('*.*'))
-            output += format_html(output_template, path, canon[f"{extractor}_path"], str(bool(outpath)),
+            existing = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output
+            # Check filesystem to see if anything is actually present (too slow, needs optimization/caching)
+            # if existing:
+            #     existing = (Path(path) / existing)
+            #     if existing.is_file():
+            #         existing = True
+            #     elif existing.is_dir():
+            #         existing = any(existing.glob('*.*'))
+            output += format_html(output_template, path, canon[f"{extractor}_path"], str(bool(existing)),
                                          extractor, icons.get(extractor, "?"))
         if extractor == "wget":
             # warc isn't technically it's own extractor, so we have to add it after wget
-            exists = list((Path(path) / canon["warc_path"]).glob("*.warc.gz"))
-            output += format_html(output_template, exists[0] if exists else '#', canon["warc_path"], str(bool(exists)), "warc", icons.get("warc", "?"))
+            
+            # get from db (faster but less truthful)
+            exists = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output
+            # get from filesystem (slower but more accurate)
+            # exists = list((Path(path) / canon["warc_path"]).glob("*.warc.gz"))
+            output += format_html(output_template, 'warc/', canon["warc_path"], str(bool(exists)), "warc", icons.get("warc", "?"))
 
         if extractor == "archive_org":
             # The check for archive_org is different, so it has to be handled separately
-            target_path = Path(path) / "archive.org.txt"
-            exists = target_path.exists()
+
+            # get from db (faster)
+            exists = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output
+            # get from filesystem (slower)
+            # target_path = Path(path) / "archive.org.txt"
+            # exists = target_path.exists()
             output += '<a href="{}" class="exists-{}" title="{}">{}</a> '.format(canon["archive_org_path"], str(exists),
                                                                                         "archive_org", icons.get("archive_org", "?"))
 

From 04c951cdd50e9ab8f4257e0f702ee909506af01d Mon Sep 17 00:00:00 2001
From: Nick Sweeting <git@sweeting.me>
Date: Mon, 1 Feb 2021 02:22:02 -0500
Subject: [PATCH 18/19] fix alerts

---
 archivebox/extractors/title.py | 1 -
 archivebox/extractors/wget.py  | 4 +---
 2 files changed, 1 insertion(+), 4 deletions(-)

diff --git a/archivebox/extractors/title.py b/archivebox/extractors/title.py
index 194c57ad..272eebc8 100644
--- a/archivebox/extractors/title.py
+++ b/archivebox/extractors/title.py
@@ -8,7 +8,6 @@ from typing import Optional
 from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
 from ..util import (
     enforce_types,
-    is_static_file,
     download_url,
     htmldecode,
 )
diff --git a/archivebox/extractors/wget.py b/archivebox/extractors/wget.py
index fac212c2..e0617bde 100644
--- a/archivebox/extractors/wget.py
+++ b/archivebox/extractors/wget.py
@@ -10,7 +10,6 @@ from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
 from ..system import run, chmod_file
 from ..util import (
     enforce_types,
-    is_static_file,
     without_scheme,
     without_fragment,
     without_query,
@@ -193,8 +192,7 @@ def wget_output_path(link: Link) -> Optional[str]:
         if str(search_dir) == link.link_dir:
             break
 
-    # check for staticfiles
-    base_url = without_scheme(without_fragment(link.url))
+    # check for literally any file present that isn't an empty folder
     domain_dir = Path(domain(link.url).replace(":", "+"))
     files_within = list((Path(link.link_dir) / domain_dir).glob('**/*.*'))
     if files_within:

From 7d0f5653c3f6699b2444ddf2682e3718827a886a Mon Sep 17 00:00:00 2001
From: Nick Sweeting <git@sweeting.me>
Date: Mon, 1 Feb 2021 02:27:24 -0500
Subject: [PATCH 19/19] fix lgtm alerts

---
 archivebox/config.py          | 3 ++-
 archivebox/extractors/wget.py | 1 -
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/archivebox/config.py b/archivebox/config.py
index f984d027..349817ec 100644
--- a/archivebox/config.py
+++ b/archivebox/config.py
@@ -915,7 +915,8 @@ os.umask(0o777 - int(OUTPUT_PERMISSIONS, base=8))  # noqa: F821
 NODE_BIN_PATH = str((Path(CONFIG["OUTPUT_DIR"]).absolute() / 'node_modules' / '.bin'))
 sys.path.append(NODE_BIN_PATH)
 
-if not CHECK_SSL_VALIDITY:
+# disable stderr "you really shouldn't disable ssl" warnings with library config
+if not CONFIG['CHECK_SSL_VALIDITY']:
     import urllib3
     import requests
     requests.packages.urllib3.disable_warnings(requests.packages.urllib3.exceptions.InsecureRequestWarning)
diff --git a/archivebox/extractors/wget.py b/archivebox/extractors/wget.py
index e0617bde..4d04f673 100644
--- a/archivebox/extractors/wget.py
+++ b/archivebox/extractors/wget.py
@@ -10,7 +10,6 @@ from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
 from ..system import run, chmod_file
 from ..util import (
     enforce_types,
-    without_scheme,
     without_fragment,
     without_query,
     path,