From 7e2b249388e4a530dcbcd374de469033eeb36c18 Mon Sep 17 00:00:00 2001
From: Cristian
Date: Fri, 7 Aug 2020 08:05:17 -0500
Subject: [PATCH] feat: Initial version of readability extractor

---
 archivebox/config/__init__.py        | 14 +++++
 archivebox/extractors/__init__.py    |  2 +
 archivebox/extractors/readability.py | 83 ++++++++++++++++++++++++++++
 3 files changed, 99 insertions(+)
 create mode 100644 archivebox/extractors/readability.py

diff --git a/archivebox/config/__init__.py b/archivebox/config/__init__.py
index c53c5eec..b51c7034 100644
--- a/archivebox/config/__init__.py
+++ b/archivebox/config/__init__.py
@@ -76,6 +76,7 @@ CONFIG_DEFAULTS: Dict[str, ConfigDefaultDict] = {
         'SAVE_WGET': {'type': bool, 'default': True, 'aliases': ('FETCH_WGET',)},
         'SAVE_WGET_REQUISITES': {'type': bool, 'default': True, 'aliases': ('FETCH_WGET_REQUISITES',)},
         'SAVE_SINGLEFILE': {'type': bool, 'default': True, 'aliases': ('FETCH_SINGLEFILE',)},
+        'SAVE_READABILITY': {'type': bool, 'default': True, 'aliases': ('FETCH_READABILITY',)},
         'SAVE_PDF': {'type': bool, 'default': True, 'aliases': ('FETCH_PDF',)},
         'SAVE_SCREENSHOT': {'type': bool, 'default': True, 'aliases': ('FETCH_SCREENSHOT',)},
         'SAVE_DOM': {'type': bool, 'default': True, 'aliases': ('FETCH_DOM',)},
@@ -107,6 +108,7 @@ CONFIG_DEFAULTS: Dict[str, ConfigDefaultDict] = {
         'USE_CURL': {'type': bool, 'default': True},
         'USE_WGET': {'type': bool, 'default': True},
         'USE_SINGLEFILE': {'type': bool, 'default': True},
+        'USE_READABILITY': {'type': bool, 'default': True},
         'USE_GIT': {'type': bool, 'default': True},
         'USE_CHROME': {'type': bool, 'default': True},
         'USE_YOUTUBEDL': {'type': bool, 'default': True},
@@ -115,6 +117,7 @@ CONFIG_DEFAULTS: Dict[str, ConfigDefaultDict] = {
         'GIT_BINARY': {'type': str, 'default': 'git'},
         'WGET_BINARY': {'type': str, 'default': 'wget'},
         'SINGLEFILE_BINARY': {'type': str, 'default': 'single-file'},
+        'READABILITY_BINARY': {'type': str, 'default': 'readability-extractor'},
         'YOUTUBEDL_BINARY': {'type': str, 'default': 'youtube-dl'},
         'CHROME_BINARY': {'type': str, 'default': None},
     },
@@ -256,6 +259,9 @@ DERIVED_CONFIG_DEFAULTS: ConfigDefaultDict = {
     'USE_SINGLEFILE': {'default': lambda c: c['USE_SINGLEFILE'] and c['SAVE_SINGLEFILE']},
     'SINGLEFILE_VERSION': {'default': lambda c: bin_version(c['SINGLEFILE_BINARY']) if c['USE_SINGLEFILE'] else None},
 
+    'USE_READABILITY': {'default': lambda c: c['USE_READABILITY'] and c['SAVE_READABILITY']},
+    'READABILITY_VERSION': {'default': lambda c: bin_version(c['READABILITY_BINARY']) if c['USE_READABILITY'] else None},
+
     'USE_GIT': {'default': lambda c: c['USE_GIT'] and c['SAVE_GIT']},
     'GIT_VERSION': {'default': lambda c: bin_version(c['GIT_BINARY']) if c['USE_GIT'] else None},
     'SAVE_GIT': {'default': lambda c: c['USE_GIT'] and c['SAVE_GIT']},
@@ -272,6 +278,7 @@ DERIVED_CONFIG_DEFAULTS: ConfigDefaultDict = {
     'SAVE_SCREENSHOT': {'default': lambda c: c['USE_CHROME'] and c['SAVE_SCREENSHOT']},
     'SAVE_DOM': {'default': lambda c: c['USE_CHROME'] and c['SAVE_DOM']},
     'SAVE_SINGLEFILE': {'default': lambda c: c['USE_CHROME'] and c['USE_SINGLEFILE']},
+    'SAVE_READABILITY': {'default': lambda c: c['USE_READABILITY']},
 
     'DEPENDENCIES': {'default': lambda c: get_dependency_info(c)},
     'CODE_LOCATIONS': {'default': lambda c: get_code_locations(c)},
@@ -689,6 +696,13 @@ def get_dependency_info(config: ConfigDict) -> ConfigValue:
             'enabled': config['USE_SINGLEFILE'],
             'is_valid': bool(config['SINGLEFILE_VERSION']),
         },
+        'READABILITY_BINARY': {
+            'path': bin_path(config['READABILITY_BINARY']),
+            'version': config['READABILITY_VERSION'],
+            'hash': bin_hash(config['READABILITY_BINARY']),
+            'enabled': config['USE_READABILITY'],
+            'is_valid': bool(config['READABILITY_VERSION']),
+        },
         'GIT_BINARY': {
             'path': bin_path(config['GIT_BINARY']),
             'version': config['GIT_VERSION'],
diff --git a/archivebox/extractors/__init__.py b/archivebox/extractors/__init__.py
index bdeae3d7..6cd3c551 100644
--- a/archivebox/extractors/__init__.py
+++ b/archivebox/extractors/__init__.py
@@ -26,6 +26,7 @@ from .title import should_save_title, save_title
 from .favicon import should_save_favicon, save_favicon
 from .wget import should_save_wget, save_wget
 from .singlefile import should_save_singlefile, save_singlefile
+from .readability import should_save_readability, save_readability
 from .pdf import should_save_pdf, save_pdf
 from .screenshot import should_save_screenshot, save_screenshot
 from .dom import should_save_dom, save_dom
@@ -39,6 +40,7 @@ def get_default_archive_methods():
         ('favicon', should_save_favicon, save_favicon),
         ('wget', should_save_wget, save_wget),
         ('singlefile', should_save_singlefile, save_singlefile),
+        ('readability', should_save_readability, save_readability),
         ('pdf', should_save_pdf, save_pdf),
         ('screenshot', should_save_screenshot, save_screenshot),
         ('dom', should_save_dom, save_dom),
diff --git a/archivebox/extractors/readability.py b/archivebox/extractors/readability.py
new file mode 100644
index 00000000..2659c18e
--- /dev/null
+++ b/archivebox/extractors/readability.py
@@ -0,0 +1,83 @@
+__package__ = 'archivebox.extractors'
+
+from pathlib import Path
+
+from typing import Optional
+import json
+
+from ..index.schema import Link, ArchiveResult, ArchiveError
+from ..system import run, atomic_write
+from ..util import (
+    enforce_types,
+    download_url,
+
+)
+from ..config import (
+    TIMEOUT,
+    SAVE_READABILITY,
+    READABILITY_BINARY,
+    READABILITY_VERSION,
+    CHROME_BINARY,
+)
+from ..logging_util import TimedProgress
+
+
+@enforce_types
+def should_save_readability(link: Link, out_dir: Optional[str]=None) -> bool:
+    """check that readability is enabled and readability.json is not saved yet"""
+    output = Path(out_dir or link.link_dir) / 'readability.json'
+    return SAVE_READABILITY and (not output.exists())
+
+
+@enforce_types
+def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
+    """download reader friendly version using @mozilla/readability"""
+
+    out_dir = out_dir or link.link_dir
+    output = str(Path(out_dir).absolute() / "readability.json")
+
+    # The downloaded HTML is appended below; cmd is created up front so the
+    # ArchiveResult can still report it when the download itself fails.
+    # NOTE(review): the whole document travels as one argv entry, so very
+    # large pages may hit the OS argument size limit -- confirm.
+    cmd = [READABILITY_BINARY]
+
+    status = 'succeeded'
+    timer = TimedProgress(timeout, prefix=' ')
+    try:
+        document = download_url(link.url)
+        cmd.append(document)
+        result = run(cmd, cwd=out_dir, timeout=timeout)
+
+        # keep the last lines of stdout/stderr as hints for a failure report
+        output_tail = [
+            line.strip()
+            for line in (result.stdout + result.stderr).decode().rsplit('\n', 3)[-3:]
+            if line.strip()
+        ]
+        hints = (
+            'Got readability response code: {}.'.format(result.returncode),
+            *output_tail,
+        )
+
+        # Check the exit code before parsing or writing anything, so a failed
+        # run reports these hints and leaves no readability.json behind.
+        if (result.returncode > 0):
+            raise ArchiveError('Readability was not able to archive the page', hints)
+
+        result_json = json.loads(result.stdout)
+        atomic_write(output, result_json)
+    except Exception as err:
+        status = 'failed'
+        output = err
+    finally:
+        timer.end()
+
+    return ArchiveResult(
+        cmd=cmd,
+        pwd=out_dir,
+        cmd_version=READABILITY_VERSION,
+        output=output,
+        status=status,
+        **timer.stats,
+    )