diff --git a/archivebox/cli/archivebox_add.py b/archivebox/cli/archivebox_add.py
index a96888b0..2c3d7ce3 100644
--- a/archivebox/cli/archivebox_add.py
+++ b/archivebox/cli/archivebox_add.py
@@ -10,6 +10,7 @@
 from typing import List, Optional, IO
 
 from ..main import add
 from ..util import docstring
+from ..parsers import PARSERS
 from ..config import OUTPUT_DIR, ONLY_NEW
 from ..logging_util import SmartFormatter, accept_stdin, stderr
@@ -79,6 +80,13 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
               This does not take precedence over the configuration",
         default=""
     )
+    parser.add_argument(
+        "--parser",
+        type=str,
+        help="Parser used to read inputted URLs.",
+        default="auto",
+        choices=["auto", *PARSERS.keys()],
+    )
     command = parser.parse_args(args or ())
     urls = command.urls
 
@@ -101,6 +109,7 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
         overwrite=command.overwrite,
         init=command.init,
         extractors=command.extract,
+        parser=command.parser,
         out_dir=pwd or OUTPUT_DIR,
     )
 
diff --git a/archivebox/index/__init__.py b/archivebox/index/__init__.py
index d3d1bedc..252244f1 100644
--- a/archivebox/index/__init__.py
+++ b/archivebox/index/__init__.py
@@ -265,14 +265,14 @@ def load_main_index_meta(out_dir: Path=OUTPUT_DIR) -> Optional[dict]:
 
 
 @enforce_types
-def parse_links_from_source(source_path: str, root_url: Optional[str]=None) -> Tuple[List[Link], List[Link]]:
+def parse_links_from_source(source_path: str, root_url: Optional[str]=None, parser: str="auto") -> Tuple[List[Link], List[Link]]:
 
     from ..parsers import parse_links
 
     new_links: List[Link] = []
 
     # parse and validate the import file
-    raw_links, parser_name = parse_links(source_path, root_url=root_url)
+    raw_links, parser_name = parse_links(source_path, root_url=root_url, parser=parser)
     new_links = validate_links(raw_links)
 
     if parser_name:
diff --git a/archivebox/main.py b/archivebox/main.py
index 5c697c55..dbe8d475 100644
--- a/archivebox/main.py
+++ b/archivebox/main.py
@@ -568,6 +568,7 @@ def add(urls: Union[str, List[str]],
         overwrite: bool=False,
         init: bool=False,
         extractors: str="",
+        parser: str="auto",
         out_dir: Path=OUTPUT_DIR) -> List[Link]:
     """Add a new URL or list of URLs to your archive"""
 
@@ -594,7 +595,7 @@ def add(urls: Union[str, List[str]],
 
     # save verbatim args to sources
     write_ahead_log = save_text_as_source('\n'.join(urls), filename='{ts}-import.txt', out_dir=out_dir)
-    new_links += parse_links_from_source(write_ahead_log, root_url=None)
+    new_links += parse_links_from_source(write_ahead_log, root_url=None, parser=parser)
 
     # If we're going one level deeper, download each link and look for more links
     new_links_depth = []
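The three files above thread the new `parser` value from the `--parser` flag through `add()` and `parse_links_from_source()` down to the parser layer, defaulting to `"auto"` everywhere so existing callers are unaffected. A minimal sketch of the resulting Python API, assuming an already-initialized collection in the working directory (the URLs are illustrative, not from this diff); the CLI equivalent of the second call is `archivebox add --parser url_list ...`:

```python
from archivebox.main import add

# parser="auto" (the default) keeps the old behavior: every registered
# parser is tried against the input and the result with the most links wins.
add('https://example.com/bookmarks.html', out_dir='.')

# An explicit parser key skips format detection and hands the input
# straight to that one parser.
add('https://example.com/a\nhttps://example.com/b',
    parser='url_list',
    out_dir='.')
```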
diff --git a/archivebox/parsers/__init__.py b/archivebox/parsers/__init__.py
index 4af2c5ac..88b705ae 100644
--- a/archivebox/parsers/__init__.py
+++ b/archivebox/parsers/__init__.py
@@ -31,36 +31,42 @@ from ..util import (
 from ..index.schema import Link
 from ..logging_util import TimedProgress, log_source_saved
 
-from .pocket_html import parse_pocket_html_export
-from .pocket_api import parse_pocket_api_export
-from .pinboard_rss import parse_pinboard_rss_export
-from .wallabag_atom import parse_wallabag_atom_export
-from .shaarli_rss import parse_shaarli_rss_export
-from .medium_rss import parse_medium_rss_export
-from .netscape_html import parse_netscape_html_export
-from .generic_rss import parse_generic_rss_export
-from .generic_json import parse_generic_json_export
-from .generic_html import parse_generic_html_export
-from .generic_txt import parse_generic_txt_export
+from . import pocket_api
+from . import wallabag_atom
+from . import pocket_html
+from . import pinboard_rss
+from . import shaarli_rss
+from . import medium_rss
 
-PARSERS = (
+from . import netscape_html
+from . import generic_rss
+from . import generic_json
+from . import generic_html
+from . import generic_txt
+from . import url_list
+
+
+PARSERS = {
     # Specialized parsers
-    ('Pocket API', parse_pocket_api_export),
-    ('Wallabag ATOM', parse_wallabag_atom_export),
-    ('Pocket HTML', parse_pocket_html_export),
-    ('Pinboard RSS', parse_pinboard_rss_export),
-    ('Shaarli RSS', parse_shaarli_rss_export),
-    ('Medium RSS', parse_medium_rss_export),
-
-    # General parsers
-    ('Netscape HTML', parse_netscape_html_export),
-    ('Generic RSS', parse_generic_rss_export),
-    ('Generic JSON', parse_generic_json_export),
-    ('Generic HTML', parse_generic_html_export),
+    pocket_api.KEY: (pocket_api.NAME, pocket_api.PARSER),
+    wallabag_atom.KEY: (wallabag_atom.NAME, wallabag_atom.PARSER),
+    pocket_html.KEY: (pocket_html.NAME, pocket_html.PARSER),
+    pinboard_rss.KEY: (pinboard_rss.NAME, pinboard_rss.PARSER),
+    shaarli_rss.KEY: (shaarli_rss.NAME, shaarli_rss.PARSER),
+    medium_rss.KEY: (medium_rss.NAME, medium_rss.PARSER),
 
-    # Fallback parser
-    ('Plain Text', parse_generic_txt_export),
-)
+    # General parsers
+    netscape_html.KEY: (netscape_html.NAME, netscape_html.PARSER),
+    generic_rss.KEY: (generic_rss.NAME, generic_rss.PARSER),
+    generic_json.KEY: (generic_json.NAME, generic_json.PARSER),
+    generic_html.KEY: (generic_html.NAME, generic_html.PARSER),
+
+    # Catchall fallback parser
+    generic_txt.KEY: (generic_txt.NAME, generic_txt.PARSER),
+
+    # Explicitly specified parsers
+    url_list.KEY: (url_list.NAME, url_list.PARSER),
+}
 
 
 @enforce_types
@@ -83,14 +89,14 @@ def parse_links_memory(urls: List[str], root_url: Optional[str]=None):
 
 
 @enforce_types
-def parse_links(source_file: str, root_url: Optional[str]=None) -> Tuple[List[Link], str]:
+def parse_links(source_file: str, root_url: Optional[str]=None, parser: str="auto") -> Tuple[List[Link], str]:
     """parse a list of URLs with their metadata from an
        RSS feed, bookmarks export, or text file
     """
 
     timer = TimedProgress(TIMEOUT * 4)
     with open(source_file, 'r', encoding='utf-8') as file:
-        links, parser = run_parser_functions(file, timer, root_url=root_url)
+        links, parser = run_parser_functions(file, timer, root_url=root_url, parser=parser)
         timer.end()
 
     if parser is None:
@@ -98,11 +104,20 @@ def parse_links(source_file: str, root_url: Optional[str]=None) -> Tuple[List[Li
 
     return links, parser
 
-def run_parser_functions(to_parse: IO[str], timer, root_url: Optional[str]=None) -> Tuple[List[Link], Optional[str]]:
+def run_parser_functions(to_parse: IO[str], timer, root_url: Optional[str]=None, parser: str="auto") -> Tuple[List[Link], Optional[str]]:
     most_links: List[Link] = []
     best_parser_name = None
 
-    for parser_name, parser_func in PARSERS:
+    if parser != "auto":
+        parser_name, parser_func = PARSERS[parser]
+        parsed_links = list(parser_func(to_parse, root_url=root_url))
+        if not parsed_links:
+            raise Exception('no links found')
+        timer.end()
+        return parsed_links, parser_name
+
+    for parser_id in PARSERS:
+        parser_name, parser_func = PARSERS[parser_id]
         try:
             parsed_links = list(parser_func(to_parse, root_url=root_url))
             if not parsed_links:
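`PARSERS` changes shape here: an ordered tuple of `(name, function)` pairs becomes a dict keyed by a stable, CLI-facing id, which is what lets `--parser` values map directly onto registry entries. A quick illustration of how the new structure is consumed, assuming the `archivebox` package is importable:

```python
from archivebox.parsers import PARSERS

# Explicit lookup, as run_parser_functions() does when parser != "auto":
name, parser_func = PARSERS['rss']   # ('Generic RSS', parse_generic_rss_export)

# "auto" mode iterates the dict instead; dicts preserve insertion order
# (Python 3.7+), so specialized parsers are still tried before generic ones.
for key, (name, parser_func) in PARSERS.items():
    print(f'--parser {key:15} {name}')
```

Note that `url_list` is registered after the catchall `generic_txt` entry; per the comment above it, it is intended to be selected explicitly via `--parser url_list` rather than relied on in `auto` mode.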
diff --git a/archivebox/parsers/generic_html.py b/archivebox/parsers/generic_html.py
index 74b3d1fc..6950dc1d 100644
--- a/archivebox/parsers/generic_html.py
+++ b/archivebox/parsers/generic_html.py
@@ -51,3 +51,8 @@ def parse_generic_html_export(html_file: IO[str], root_url: Optional[str]=None,
                     tags=None,
                     sources=[html_file.name],
                 )
+
+
+KEY = 'html'
+NAME = 'Generic HTML'
+PARSER = parse_generic_html_export
diff --git a/archivebox/parsers/generic_json.py b/archivebox/parsers/generic_json.py
index e6ed6772..fff4d712 100644
--- a/archivebox/parsers/generic_json.py
+++ b/archivebox/parsers/generic_json.py
@@ -63,3 +63,8 @@ def parse_generic_json_export(json_file: IO[str], **_kwargs) -> Iterable[Link]:
             tags=htmldecode(link.get('tags')) or '',
             sources=[json_file.name],
         )
+
+
+KEY = 'json'
+NAME = 'Generic JSON'
+PARSER = parse_generic_json_export
diff --git a/archivebox/parsers/generic_rss.py b/archivebox/parsers/generic_rss.py
index 28318444..4bd04967 100644
--- a/archivebox/parsers/generic_rss.py
+++ b/archivebox/parsers/generic_rss.py
@@ -47,3 +47,8 @@ def parse_generic_rss_export(rss_file: IO[str], **_kwargs) -> Iterable[Link]:
             tags=None,
             sources=[rss_file.name],
         )
+
+
+KEY = 'rss'
+NAME = 'Generic RSS'
+PARSER = parse_generic_rss_export
diff --git a/archivebox/parsers/generic_txt.py b/archivebox/parsers/generic_txt.py
index 94dd523c..a7ed8d54 100644
--- a/archivebox/parsers/generic_txt.py
+++ b/archivebox/parsers/generic_txt.py
@@ -17,7 +17,7 @@ from ..util import (
 
 @enforce_types
 def parse_generic_txt_export(text_file: IO[str], **_kwargs) -> Iterable[Link]:
-    """Parse raw links from each line in a text file"""
+    """Parse links from a text file, ignoring other text"""
 
     text_file.seek(0)
     for line in text_file.readlines():
@@ -59,3 +59,7 @@ def parse_generic_txt_export(text_file: IO[str], **_kwargs) -> Iterable[Link]:
                 tags=None,
                 sources=[text_file.name],
             )
+
+KEY = 'txt'
+NAME = 'Generic TXT'
+PARSER = parse_generic_txt_export
diff --git a/archivebox/parsers/medium_rss.py b/archivebox/parsers/medium_rss.py
index 8f14f773..a4159f28 100644
--- a/archivebox/parsers/medium_rss.py
+++ b/archivebox/parsers/medium_rss.py
@@ -33,3 +33,8 @@ def parse_medium_rss_export(rss_file: IO[str], **_kwargs) -> Iterable[Link]:
             tags=None,
             sources=[rss_file.name],
         )
+
+
+KEY = 'medium_rss'
+NAME = 'Medium RSS'
+PARSER = parse_medium_rss_export
diff --git a/archivebox/parsers/netscape_html.py b/archivebox/parsers/netscape_html.py
index a063023c..7523f100 100644
--- a/archivebox/parsers/netscape_html.py
+++ b/archivebox/parsers/netscape_html.py
@@ -37,3 +37,7 @@ def parse_netscape_html_export(html_file: IO[str], **_kwargs) -> Iterable[Link]:
                 sources=[html_file.name],
             )
 
+
+KEY = 'netscape_html'
+NAME = 'Netscape HTML'
+PARSER = parse_netscape_html_export
diff --git a/archivebox/parsers/pinboard_rss.py b/archivebox/parsers/pinboard_rss.py
index 98ff14a3..17d1025e 100644
--- a/archivebox/parsers/pinboard_rss.py
+++ b/archivebox/parsers/pinboard_rss.py
@@ -45,3 +45,8 @@ def parse_pinboard_rss_export(rss_file: IO[str], **_kwargs) -> Iterable[Link]:
             tags=htmldecode(tags) or None,
             sources=[rss_file.name],
         )
+
+
+KEY = 'pinboard_rss'
+NAME = 'Pinboard RSS'
+PARSER = parse_pinboard_rss_export
diff --git a/archivebox/parsers/pocket_api.py b/archivebox/parsers/pocket_api.py
index bf3a292b..afad70ed 100644
--- a/archivebox/parsers/pocket_api.py
+++ b/archivebox/parsers/pocket_api.py
@@ -111,3 +111,8 @@ def parse_pocket_api_export(input_buffer: IO[str], **_kwargs) -> Iterable[Link]:
             yield link_from_article(article, sources=[line])
 
     write_since(username, api.last_since)
+
+
+KEY = 'pocket_api'
+NAME = 'Pocket API'
+PARSER = parse_pocket_api_export
diff --git a/archivebox/parsers/pocket_html.py b/archivebox/parsers/pocket_html.py
index 653f21b8..d34c8bad 100644
--- a/archivebox/parsers/pocket_html.py
+++ b/archivebox/parsers/pocket_html.py
@@ -36,3 +36,8 @@ def parse_pocket_html_export(html_file: IO[str], **_kwargs) -> Iterable[Link]:
                 tags=tags or '',
                 sources=[html_file.name],
             )
+
+
+KEY = 'pocket_html'
+NAME = 'Pocket HTML'
+PARSER = parse_pocket_html_export
diff --git a/archivebox/parsers/shaarli_rss.py b/archivebox/parsers/shaarli_rss.py
index 4a925f46..67934899 100644
--- a/archivebox/parsers/shaarli_rss.py
+++ b/archivebox/parsers/shaarli_rss.py
@@ -48,3 +48,8 @@ def parse_shaarli_rss_export(rss_file: IO[str], **_kwargs) -> Iterable[Link]:
             tags=None,
             sources=[rss_file.name],
         )
+
+
+KEY = 'shaarli_rss'
+NAME = 'Shaarli RSS'
+PARSER = parse_shaarli_rss_export
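Each parser module gains the same three module-level constants, so the registry in `parsers/__init__.py` can be built uniformly without renaming any parser function. A small sanity check of the convention (the module chosen here is arbitrary, assuming `archivebox` is importable):

```python
from archivebox.parsers import generic_rss

assert generic_rss.KEY == 'rss'            # the id accepted by --parser
assert generic_rss.NAME == 'Generic RSS'   # the label shown in logs
assert generic_rss.PARSER is generic_rss.parse_generic_rss_export
```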
diff --git a/archivebox/parsers/url_list.py b/archivebox/parsers/url_list.py
new file mode 100644
index 00000000..a45e5225
--- /dev/null
+++ b/archivebox/parsers/url_list.py
@@ -0,0 +1,34 @@
+__package__ = 'archivebox.parsers'
+__description__ = 'URL list'
+
+from typing import IO, Iterable
+from datetime import datetime
+
+from ..index.schema import Link
+from ..util import (
+    enforce_types
+)
+
+
+@enforce_types
+def parse_url_list(text_file: IO[str], **_kwargs) -> Iterable[Link]:
+    """Parse raw URLs from each line in a text file"""
+
+    text_file.seek(0)
+    for line in text_file.readlines():
+        url = line.strip()
+        if not url:
+            continue
+
+        yield Link(
+            url=url,
+            timestamp=str(datetime.now().timestamp()),
+            title=None,
+            tags=None,
+            sources=[text_file.name],
+        )
+
+
+KEY = 'url_list'
+NAME = 'URL List'
+PARSER = parse_url_list
diff --git a/archivebox/parsers/wallabag_atom.py b/archivebox/parsers/wallabag_atom.py
index 7acfc2fc..32740097 100644
--- a/archivebox/parsers/wallabag_atom.py
+++ b/archivebox/parsers/wallabag_atom.py
@@ -55,3 +55,8 @@ def parse_wallabag_atom_export(rss_file: IO[str], **_kwargs) -> Iterable[Link]:
             tags=tags or '',
             sources=[rss_file.name],
         )
+
+
+KEY = 'wallabag_atom'
+NAME = 'Wallabag Atom'
+PARSER = parse_wallabag_atom_export
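The new `url_list` parser above is a stricter sibling of `generic_txt`: it takes each non-blank line verbatim as a single URL instead of regex-scanning lines for embedded URLs. A minimal sketch of calling it directly (the file contents are illustrative):

```python
import tempfile

from archivebox.parsers import url_list

# parse_url_list seeks to 0 itself and records text_file.name as the
# source, so any named file handle opened for reading works here.
with tempfile.NamedTemporaryFile('w+', suffix='.txt') as f:
    f.write('https://example.com/a\nhttps://example.com/b\n')
    f.flush()
    links = list(url_list.PARSER(f))

print([link.url for link in links])
# ['https://example.com/a', 'https://example.com/b']
```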