From 225b63b73207531349969588980f65c924e121a9 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 17 Aug 2020 03:12:17 -0400 Subject: [PATCH] skip invalid urls at all stages --- archivebox/index/__init__.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/archivebox/index/__init__.py b/archivebox/index/__init__.py index fc55beea..8afe0e80 100644 --- a/archivebox/index/__init__.py +++ b/archivebox/index/__init__.py @@ -9,6 +9,7 @@ from itertools import chain from typing import List, Tuple, Dict, Optional, Iterable from collections import OrderedDict from contextlib import contextmanager +from urllib.parse import urlparse from ..system import atomic_write from ..util import ( @@ -139,6 +140,10 @@ def validate_links(links: Iterable[Link]) -> List[Link]: def archivable_links(links: Iterable[Link]) -> Iterable[Link]: """remove chrome://, about:// or other schemed links that cant be archived""" for link in links: + try: + urlparse(link.url) + except ValueError: + continue scheme_is_valid = scheme(link.url) in ('http', 'https', 'ftp') not_blacklisted = (not URL_BLACKLIST_PTN.match(link.url)) if URL_BLACKLIST_PTN else True if scheme_is_valid and not_blacklisted: