diff --git a/archivebox/parsers/generic_txt.py b/archivebox/parsers/generic_txt.py index 61d1973f..22c805dd 100644 --- a/archivebox/parsers/generic_txt.py +++ b/archivebox/parsers/generic_txt.py @@ -43,3 +43,15 @@ def parse_generic_txt_export(text_file: IO[str]) -> Iterable[Link]: tags=None, sources=[text_file.name], ) + + # look inside the URL for any sub-urls, e.g. for archive.org links + # https://web.archive.org/web/20200531203453/https://www.reddit.com/r/socialism/comments/gu24ke/nypd_officers_claim_they_are_protecting_the_rule/fsfq0sw/ + # -> https://www.reddit.com/r/socialism/comments/gu24ke/nypd_officers_claim_they_are_protecting_the_rule/fsfq0sw/ + for url in re.findall(URL_REGEX, line[1:]): + yield Link( + url=htmldecode(url), + timestamp=str(datetime.now().timestamp()), + title=None, + tags=None, + sources=[text_file.name], + )