1
0
Fork 0
mirror of synced 2024-06-28 19:10:33 +12:00

skip invalid urls at all stages

This commit is contained in:
Nick Sweeting 2020-08-17 03:12:17 -04:00
parent 26022fc9fb
commit 225b63b732

View file

@ -9,6 +9,7 @@ from itertools import chain
from typing import List, Tuple, Dict, Optional, Iterable
from collections import OrderedDict
from contextlib import contextmanager
from urllib.parse import urlparse
from ..system import atomic_write
from ..util import (
@ -139,6 +140,10 @@ def validate_links(links: Iterable[Link]) -> List[Link]:
def archivable_links(links: Iterable[Link]) -> Iterable[Link]:
    """remove chrome://, about:// or other schemed links that cant be archived"""
    for link in links:
        try:
            urlparse(link.url)
        except ValueError:
            continue
        scheme_is_valid = scheme(link.url) in ('http', 'https', 'ftp')
        not_blacklisted = (not URL_BLACKLIST_PTN.match(link.url)) if URL_BLACKLIST_PTN else True
        if scheme_is_valid and not_blacklisted: