ArchiveBox/archivebox/parsers/generic_txt.py

# Declares the package this module belongs to; Python uses __package__ to
# resolve the relative imports (`from ..index.schema import ...`) further down.
__package__ = 'archivebox.parsers'
# Short human-readable label for the input format this parser handles.
# NOTE(review): presumably surfaced in CLI/help output - confirm where it is read.
__description__ = 'Plain Text'

from typing import IO, Iterable
from datetime import datetime, timezone
from pathlib import Path

from ..index.schema import Link
from ..util import (
    htmldecode,
    enforce_types,
    find_all_urls,
)


@enforce_types
def parse_generic_txt_export(text_file: IO[str], **_kwargs) -> Iterable[Link]:
    """Parse links from a text file, ignoring other text.

    Each non-blank line is examined in two independent ways:

    1. If the whole line (with surrounding whitespace removed) is a path that
       exists on the local filesystem, a Link is yielded for that path.
    2. Every URL that ``find_all_urls`` detects in the line is yielded as a
       Link, after passing it through ``htmldecode``.

    Both checks always run, so a single line may yield a path Link followed
    by one or more URL Links.

    Args:
        text_file: Readable text stream. It is rewound to the start before
            parsing, and its ``name`` attribute is recorded as the source of
            every Link produced.
        **_kwargs: Accepted and ignored, so that all parsers can be called
            with the same arbitrary metadata keyword arguments.

    Yields:
        Link objects whose timestamp is the current UTC epoch time as a
        string, with no title and no tags.
    """

    text_file.seek(0)
    for line in text_file.readlines():
        # readlines() keeps the trailing newline; testing the raw line as a
        # path would look for a file whose name ends in "\n" and so would
        # practically never match. Use the stripped text for the path check.
        candidate_path = line.strip()
        if not candidate_path:
            continue

        # if the line is a local file path that resolves, then we can archive it
        try:
            is_local_path = Path(candidate_path).exists()
        except (OSError, ValueError):
            # nvm, not a valid path... (e.g. a URL too long to be a file name
            # raises OSError; an embedded NUL byte raises ValueError on older
            # Python versions). PermissionError is covered by OSError.
            is_local_path = False

        if is_local_path:
            yield Link(
                url=candidate_path,
                timestamp=str(datetime.now(timezone.utc).timestamp()),
                title=None,
                tags=None,
                sources=[text_file.name],
            )

        # additionally look for anything that looks like a URL in the line
        # (this runs whether or not the line matched a local path)
        for url in find_all_urls(line):
            yield Link(
                url=htmldecode(url),
                timestamp=str(datetime.now(timezone.utc).timestamp()),
                title=None,
                tags=None,
                sources=[text_file.name],
            )


# Module-level metadata describing this parser, defined here rather than
# hardcoded elsewhere.
# NOTE(review): presumably collected by the parsers package __init__ to build
# its parser registry - confirm against the caller.
KEY = 'txt'                        # short identifier for this parser
NAME = 'Generic TXT'               # human-readable parser name
PARSER = parse_generic_txt_export  # callable that performs the parsing
move everything out of legacy folder 2019-04-28 09:26:24 +12:00			`__package__ = 'archivebox.parsers'`
			`__description__ = 'Plain Text'`

			`from typing import IO, Iterable`
add timezone support, tons of CSS and layout improvements, more detailed snapshot admin form info, ability to sort by recently updated, better grid view styling, better table layouts, better dark mode support 2021-04-10 20:19:30 +12:00			`from datetime import datetime, timezone`
accept local paths as valid link URLs when parsing 2020-07-14 03:22:58 +12:00			`from pathlib import Path`
move everything out of legacy folder 2019-04-28 09:26:24 +12:00
			`from ..index.schema import Link`
			`from ..util import (`
			`htmldecode,`
			`enforce_types,`
replace uses of URL_REGEX with find_all_urls to handle markdown better 2024-04-25 12:45:45 +12:00			`find_all_urls,`
move everything out of legacy folder 2019-04-28 09:26:24 +12:00			`)`

accept local paths as valid link URLs when parsing 2020-07-14 03:22:58 +12:00
move everything out of legacy folder 2019-04-28 09:26:24 +12:00			`@enforce_types`
make all parsers accept arbitrary meta kwargs 2020-08-19 00:27:47 +12:00			`def parse_generic_txt_export(text_file: IO[str], **_kwargs) -> Iterable[Link]:`
add command: --parser option 2021-03-21 05:38:00 +13:00			`"""Parse links from a text file, ignoring other text"""`
move everything out of legacy folder 2019-04-28 09:26:24 +12:00
			`text_file.seek(0)`
			`for line in text_file.readlines():`
accept local paths as valid link URLs when parsing 2020-07-14 03:22:58 +12:00			`if not line.strip():`
			`continue`

			`# if the line is a local file path that resolves, then we can archive it`
fix url is too long to be a path error 2020-08-19 00:23:57 +12:00			`try:`
			`if Path(line).exists():`
			`yield Link(`
			`url=line,`
add timezone support, tons of CSS and layout improvements, more detailed snapshot admin form info, ability to sort by recently updated, better grid view styling, better table layouts, better dark mode support 2021-04-10 20:19:30 +12:00			`timestamp=str(datetime.now(timezone.utc).timestamp()),`
fix url is too long to be a path error 2020-08-19 00:23:57 +12:00			`title=None,`
			`tags=None,`
			`sources=[text_file.name],`
			`)`
			`except (OSError, PermissionError):`
			`# nvm, not a valid path...`
			`pass`
accept local paths as valid link URLs when parsing 2020-07-14 03:22:58 +12:00
			`# otherwise look for anything that looks like a URL in the line`
replace uses of URL_REGEX with find_all_urls to handle markdown better 2024-04-25 12:45:45 +12:00			`for url in find_all_urls(line):`
move everything out of legacy folder 2019-04-28 09:26:24 +12:00			`yield Link(`
			`url=htmldecode(url),`
add timezone support, tons of CSS and layout improvements, more detailed snapshot admin form info, ability to sort by recently updated, better grid view styling, better table layouts, better dark mode support 2021-04-10 20:19:30 +12:00			`timestamp=str(datetime.now(timezone.utc).timestamp()),`
move everything out of legacy folder 2019-04-28 09:26:24 +12:00			`title=None,`
			`tags=None,`
			`sources=[text_file.name],`
			`)`
also parse and archive sub-urls in generic_txt input 2020-07-28 10:52:02 +12:00
use KEY, NAME, and PARSER to define parsers instead of hardcoding in init 2021-03-31 18:05:49 +13:00
			`KEY = 'txt'`
			`NAME = 'Generic TXT'`
			`PARSER = parse_generic_txt_export`