1
0
Fork 0
Mirror synced 2024-06-28 19:10:33 +12:00

Merge pull request #448 from pirate/skip-invalid-urls

Skip invalid URLs when archiving
This commit is contained in:
Nick Sweeting 2020-08-18 00:53:31 -04:00 committed by GitHub
commit 09ad3a5303
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23

View file

@ -9,6 +9,7 @@ from itertools import chain
from typing import List, Tuple, Dict, Optional, Iterable from typing import List, Tuple, Dict, Optional, Iterable
from collections import OrderedDict from collections import OrderedDict
from contextlib import contextmanager from contextlib import contextmanager
from urllib.parse import urlparse
from ..system import atomic_write from ..system import atomic_write
from ..util import ( from ..util import (
@ -139,6 +140,10 @@ def validate_links(links: Iterable[Link]) -> List[Link]:
def archivable_links(links: Iterable[Link]) -> Iterable[Link]: def archivable_links(links: Iterable[Link]) -> Iterable[Link]:
"""remove chrome://, about:// or other schemed links that cant be archived""" """remove chrome://, about:// or other schemed links that cant be archived"""
for link in links: for link in links:
try:
urlparse(link.url)
except ValueError:
continue
scheme_is_valid = scheme(link.url) in ('http', 'https', 'ftp') scheme_is_valid = scheme(link.url) in ('http', 'https', 'ftp')
not_blacklisted = (not URL_BLACKLIST_PTN.match(link.url)) if URL_BLACKLIST_PTN else True not_blacklisted = (not URL_BLACKLIST_PTN.match(link.url)) if URL_BLACKLIST_PTN else True
if scheme_is_valid and not_blacklisted: if scheme_is_valid and not_blacklisted: