1
0
Fork 0
mirror of synced 2024-06-02 18:44:59 +12:00

Only add url-list lines that are real URLs

This commit is contained in:
Nick Sweeting 2021-04-01 03:31:55 -04:00
parent d73f7d7d96
commit f59b6d4189

View file

@ -1,12 +1,15 @@
__package__ = 'archivebox.parsers'
__description__ = 'URL list'
import re
from typing import IO, Iterable
from datetime import datetime
from ..index.schema import Link
from ..util import (
enforce_types
enforce_types,
URL_REGEX,
)
@ -17,7 +20,7 @@ def parse_url_list(text_file: IO[str], **_kwargs) -> Iterable[Link]:
text_file.seek(0)
for line in text_file.readlines():
url = line.strip()
if not url:
if (not url) or not re.findall(URL_REGEX, url):
continue
yield Link(