From 3fe7a9b70cca8186c0a89c1ff69b1354518fba1a Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 27 Jul 2020 18:52:02 -0400 Subject: [PATCH] also parse and archive sub-urls in generic_txt input --- archivebox/parsers/generic_txt.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/archivebox/parsers/generic_txt.py b/archivebox/parsers/generic_txt.py index 61d1973f..22c805dd 100644 --- a/archivebox/parsers/generic_txt.py +++ b/archivebox/parsers/generic_txt.py @@ -43,3 +43,15 @@ def parse_generic_txt_export(text_file: IO[str]) -> Iterable[Link]: tags=None, sources=[text_file.name], ) + + # look inside the URL for any sub-urls, e.g. for archive.org links + # https://web.archive.org/web/20200531203453/https://www.reddit.com/r/socialism/comments/gu24ke/nypd_officers_claim_they_are_protecting_the_rule/fsfq0sw/ + # -> https://www.reddit.com/r/socialism/comments/gu24ke/nypd_officers_claim_they_are_protecting_the_rule/fsfq0sw/ + for url in re.findall(URL_REGEX, line[1:]): + yield Link( + url=htmldecode(url), + timestamp=str(datetime.now().timestamp()), + title=None, + tags=None, + sources=[text_file.name], + )