diff --git a/Dockerfile b/Dockerfile index 062359ce..9e6eba63 100644 --- a/Dockerfile +++ b/Dockerfile @@ -90,7 +90,9 @@ WORKDIR "$DATA_DIR" ENV IN_DOCKER=True \ CHROME_SANDBOX=False \ CHROME_BINARY="chromium" \ + USE_SINGLEFILE=True \ SINGLEFILE_BINARY="$NODE_DIR/node_modules/.bin/single-file" \ + USE_READABILITY=True \ READABILITY_BINARY="$NODE_DIR/node_modules/.bin/readability-extractor" # Print version for nice docker finish summary diff --git a/archivebox/config/__init__.py b/archivebox/config/__init__.py index 952929a1..efc2cb18 100644 --- a/archivebox/config/__init__.py +++ b/archivebox/config/__init__.py @@ -77,6 +77,7 @@ CONFIG_DEFAULTS: Dict[str, ConfigDefaultDict] = { 'SAVE_WGET': {'type': bool, 'default': True, 'aliases': ('FETCH_WGET',)}, 'SAVE_WGET_REQUISITES': {'type': bool, 'default': True, 'aliases': ('FETCH_WGET_REQUISITES',)}, 'SAVE_SINGLEFILE': {'type': bool, 'default': True, 'aliases': ('FETCH_SINGLEFILE',)}, + 'SAVE_READABILITY': {'type': bool, 'default': True, 'aliases': ('FETCH_READABILITY',)}, 'SAVE_PDF': {'type': bool, 'default': True, 'aliases': ('FETCH_PDF',)}, 'SAVE_SCREENSHOT': {'type': bool, 'default': True, 'aliases': ('FETCH_SCREENSHOT',)}, 'SAVE_DOM': {'type': bool, 'default': True, 'aliases': ('FETCH_DOM',)}, @@ -106,7 +107,8 @@ CONFIG_DEFAULTS: Dict[str, ConfigDefaultDict] = { 'DEPENDENCY_CONFIG': { 'USE_CURL': {'type': bool, 'default': True}, 'USE_WGET': {'type': bool, 'default': True}, - 'USE_SINGLEFILE': {'type': bool, 'default': True}, + 'USE_SINGLEFILE': {'type': bool, 'default': False}, + 'USE_READABILITY': {'type': bool, 'default': False}, 'USE_GIT': {'type': bool, 'default': True}, 'USE_CHROME': {'type': bool, 'default': True}, 'USE_YOUTUBEDL': {'type': bool, 'default': True}, @@ -115,6 +117,7 @@ CONFIG_DEFAULTS: Dict[str, ConfigDefaultDict] = { 'GIT_BINARY': {'type': str, 'default': 'git'}, 'WGET_BINARY': {'type': str, 'default': 'wget'}, 'SINGLEFILE_BINARY': {'type': str, 'default': 'single-file'}, + 
'READABILITY_BINARY': {'type': str, 'default': 'readability-extractor'}, 'YOUTUBEDL_BINARY': {'type': str, 'default': 'youtube-dl'}, 'CHROME_BINARY': {'type': str, 'default': None}, }, @@ -256,6 +259,9 @@ DERIVED_CONFIG_DEFAULTS: ConfigDefaultDict = { 'USE_SINGLEFILE': {'default': lambda c: c['USE_SINGLEFILE'] and c['SAVE_SINGLEFILE']}, 'SINGLEFILE_VERSION': {'default': lambda c: bin_version(c['SINGLEFILE_BINARY']) if c['USE_SINGLEFILE'] else None}, + 'USE_READABILITY': {'default': lambda c: c['USE_READABILITY'] and c['SAVE_READABILITY']}, + 'READABILITY_VERSION': {'default': lambda c: bin_version(c['READABILITY_BINARY']) if c['USE_READABILITY'] else None}, + 'USE_GIT': {'default': lambda c: c['USE_GIT'] and c['SAVE_GIT']}, 'GIT_VERSION': {'default': lambda c: bin_version(c['GIT_BINARY']) if c['USE_GIT'] else None}, 'SAVE_GIT': {'default': lambda c: c['USE_GIT'] and c['SAVE_GIT']}, @@ -272,6 +278,7 @@ DERIVED_CONFIG_DEFAULTS: ConfigDefaultDict = { 'SAVE_SCREENSHOT': {'default': lambda c: c['USE_CHROME'] and c['SAVE_SCREENSHOT']}, 'SAVE_DOM': {'default': lambda c: c['USE_CHROME'] and c['SAVE_DOM']}, 'SAVE_SINGLEFILE': {'default': lambda c: c['USE_CHROME'] and c['USE_SINGLEFILE']}, + 'SAVE_READABILITY': {'default': lambda c: c['USE_READABILITY']}, 'DEPENDENCIES': {'default': lambda c: get_dependency_info(c)}, 'CODE_LOCATIONS': {'default': lambda c: get_code_locations(c)}, @@ -689,6 +696,13 @@ def get_dependency_info(config: ConfigDict) -> ConfigValue: 'enabled': config['USE_SINGLEFILE'], 'is_valid': bool(config['SINGLEFILE_VERSION']), }, + 'READABILITY_BINARY': { + 'path': bin_path(config['READABILITY_BINARY']), + 'version': config['READABILITY_VERSION'], + 'hash': bin_hash(config['READABILITY_BINARY']), + 'enabled': config['USE_READABILITY'], + 'is_valid': bool(config['READABILITY_VERSION']), + }, 'GIT_BINARY': { 'path': bin_path(config['GIT_BINARY']), 'version': config['GIT_VERSION'], diff --git a/archivebox/extractors/__init__.py 
b/archivebox/extractors/__init__.py index bdeae3d7..dd388446 100644 --- a/archivebox/extractors/__init__.py +++ b/archivebox/extractors/__init__.py @@ -26,6 +26,7 @@ from .title import should_save_title, save_title from .favicon import should_save_favicon, save_favicon from .wget import should_save_wget, save_wget from .singlefile import should_save_singlefile, save_singlefile +from .readability import should_save_readability, save_readability from .pdf import should_save_pdf, save_pdf from .screenshot import should_save_screenshot, save_screenshot from .dom import should_save_dom, save_dom @@ -42,6 +43,7 @@ def get_default_archive_methods(): ('pdf', should_save_pdf, save_pdf), ('screenshot', should_save_screenshot, save_screenshot), ('dom', should_save_dom, save_dom), + ('readability', should_save_readability, save_readability), #keep readability below wget and singlefile, as it depends on them ('git', should_save_git, save_git), ('media', should_save_media, save_media), ('archive_org', should_save_archive_dot_org, save_archive_dot_org), @@ -93,6 +95,7 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s stats[result.status] += 1 log_archive_method_finished(result) else: + print(' > Skipping extractor: {}'.format(method_name)) stats['skipped'] += 1 except Exception as e: raise Exception('Exception in archive_methods.save_{}(Link(url={}))'.format(
diff --git a/archivebox/extractors/readability.py b/archivebox/extractors/readability.py
new file mode 100644
index 00000000..c6335a5a
--- /dev/null
+++ b/archivebox/extractors/readability.py
@@ -0,0 +1,125 @@
+__package__ = 'archivebox.extractors'
+
+from pathlib import Path
+from tempfile import NamedTemporaryFile
+
+from typing import Optional
+import json
+
+from ..index.schema import Link, ArchiveResult, ArchiveError
+from ..system import run, atomic_write
+from ..util import (
+    enforce_types,
+    download_url,
+    is_static_file,
+
+)
+from ..config import (
+    TIMEOUT,
+    SAVE_READABILITY,
+    READABILITY_BINARY,
+    READABILITY_VERSION,
+)
+from ..logging_util import TimedProgress
+
+@enforce_types
+def get_html(link: Link, path: Path) -> str:
+    """
+    Return the HTML of an already-archived copy of the page, if any.
+
+    Tries the singlefile, wget and dom outputs (in that order) inside `path`
+    and returns the contents of the first one that can be read as UTF-8.
+    If none is usable, downloads `link.url` again.
+    """
+    canonical = link.canonical_outputs()
+    abs_path = path.absolute()
+    sources = [canonical["singlefile_path"], canonical["wget_path"], canonical["dom_path"]]
+    for source in sources:
+        try:
+            # read as UTF-8 explicitly: save_readability re-encodes the text as UTF-8
+            with open(abs_path / source, "r", encoding="utf-8") as f:
+                return f.read()
+        except (FileNotFoundError, TypeError, UnicodeDecodeError):
+            # source missing on disk, not a usable path, or not valid UTF-8
+            continue
+    return download_url(link.url)
+
+@enforce_types
+def should_save_readability(link: Link, out_dir: Optional[str]=None) -> bool:
+    """Return True when readability is enabled, the url is not a static file and no output exists yet."""
+    if is_static_file(link.url):
+        return False
+
+    output = Path(out_dir or link.link_dir) / 'readability'
+    return SAVE_READABILITY and (not output.exists())
+
+
+@enforce_types
+def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
+    """
+    Download a reader friendly version using @mozilla/readability.
+
+    Writes content.html, content.txt and article.json into <out_dir>/readability
+    and returns an ArchiveResult whose status is 'succeeded' or 'failed'.
+    """
+
+    out_dir = Path(out_dir or link.link_dir)
+    output_folder = out_dir.absolute() / "readability"
+    output = str(output_folder)
+
+    document = get_html(link, out_dir)
+    # delete=False: the file is closed before the binary runs and is passed to it by name;
+    # it is removed explicitly in the `finally` block below
+    with NamedTemporaryFile(delete=False) as temp_doc:
+        temp_doc.write(document.encode("utf-8"))
+    # Readability Docs: https://github.com/mozilla/readability
+    cmd = [
+        READABILITY_BINARY,
+        temp_doc.name
+    ]
+
+    status = 'succeeded'
+    timer = TimedProgress(timeout, prefix=' ')
+    try:
+        result = run(cmd, cwd=out_dir, timeout=timeout)
+
+        # keep the last few non-empty lines of output as hints for the error message
+        output_tail = [
+            line.strip()
+            for line in (result.stdout + result.stderr).decode().rsplit('\n', 3)[-3:]
+            if line.strip()
+        ]
+        hints = (
+            'Got readability response code: {}.'.format(result.returncode),
+            *output_tail,
+        )
+
+        # Check the exit code before parsing, so a failed run is reported as an
+        # ArchiveError with hints and leaves no partial output folder behind
+        if (result.returncode > 0):
+            raise ArchiveError('Readability was not able to archive the page', hints)
+
+        result_json = json.loads(result.stdout)
+        output_folder.mkdir(exist_ok=True)
+        atomic_write(str(output_folder / "content.html"), result_json.pop("content"))
+        atomic_write(str(output_folder / "content.txt"), result_json.pop("textContent"))
+        atomic_write(str(output_folder / "article.json"), result_json)
+    except Exception as err:
+        status = 'failed'
+        output = err
+    finally:
+        timer.end()
+        # best-effort cleanup of the temporary input document
+        try:
+            Path(temp_doc.name).unlink()
+        except OSError:
+            pass
+
+    return ArchiveResult(
+        cmd=cmd,
+        pwd=str(out_dir),
+        cmd_version=READABILITY_VERSION,
+        output=output,
+        status=status,
+        **timer.stats,
+    )
diff --git a/archivebox/index/schema.py b/archivebox/index/schema.py index 2129f5d3..c3b6ce8c 100644 --- a/archivebox/index/schema.py +++ b/archivebox/index/schema.py @@ -407,6 +407,7 @@ class Link: 'wget_path': wget_output_path(self), 'warc_path': 'warc', 'singlefile_path': 'singlefile.html', + 'readability_path': 'readability/content.html', 'pdf_path': 'output.pdf', 'screenshot_path': 'screenshot.png', 'dom_path': 'output.html', @@ -427,6 +428,7 @@ class Link: 'screenshot_path': static_path, 'dom_path': static_path, 'singlefile_path': static_path, + 'readability_path': static_path, }) return canonical diff --git a/archivebox/themes/legacy/link_details.html b/archivebox/themes/legacy/link_details.html index 447552ad..1dabae2d 100644 --- a/archivebox/themes/legacy/link_details.html +++ b/archivebox/themes/legacy/link_details.html @@ -348,6 +348,18 @@ +
+
+ +
+ + + +

Readability

+

archive/readability/...

+
+
+
diff --git a/tests/fixtures.py b/tests/fixtures.py index 3d8dabfe..458929d3 100644 --- a/tests/fixtures.py +++ b/tests/fixtures.py @@ -15,6 +15,7 @@ def disable_extractors_dict(): env.update({ "USE_WGET": "false", "USE_SINGLEFILE": "false", + "USE_READABILITY": "false", "SAVE_PDF": "false", "SAVE_SCREENSHOT": "false", "SAVE_DOM": "false",
diff --git a/tests/test_extractors.py b/tests/test_extractors.py
index ffb933c1..e085d10e 100644
--- a/tests/test_extractors.py
+++ b/tests/test_extractors.py
@@ -21,3 +21,30 @@ def test_singlefile_works(tmp_path, process, disable_extractors_dict):
     archived_item_path = list(tmp_path.glob('archive/**/*'))[0]
     output_file = archived_item_path / "singlefile.html"
     assert output_file.exists()
+
+def _readability_content_after_add(tmp_path, env):
+    """Run `archivebox add` on the example page with `env` and return the expected readability/content.html path."""
+    subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'],
+                   capture_output=True, env=env)
+    snapshot_dir = list(tmp_path.glob("archive/**/*"))[0]
+    return snapshot_dir / "readability" / "content.html"
+
+def test_readability_works(tmp_path, process, disable_extractors_dict):
+    """Readability output exists when USE_READABILITY is enabled on top of the fixture's settings."""
+    disable_extractors_dict.update({"USE_READABILITY": "true"})
+    assert _readability_content_after_add(tmp_path, disable_extractors_dict).exists()
+
+def test_readability_works_with_wget(tmp_path, process, disable_extractors_dict):
+    """Readability output exists when USE_WGET is enabled as well."""
+    disable_extractors_dict.update({"USE_READABILITY": "true", "USE_WGET": "true"})
+    assert _readability_content_after_add(tmp_path, disable_extractors_dict).exists()
+
+def test_readability_works_with_singlefile(tmp_path, process, disable_extractors_dict):
+    """Readability output exists when USE_SINGLEFILE is enabled as well."""
+    disable_extractors_dict.update({"USE_READABILITY": "true", "USE_SINGLEFILE": "true"})
+    assert _readability_content_after_add(tmp_path, disable_extractors_dict).exists()
+
+def test_readability_works_with_dom(tmp_path, process, disable_extractors_dict):
+    """Readability output exists when SAVE_DOM is enabled as well."""
+    disable_extractors_dict.update({"USE_READABILITY": "true", "SAVE_DOM": "true"})
+    assert _readability_content_after_add(tmp_path, disable_extractors_dict).exists()
diff --git a/tests/test_init.py b/tests/test_init.py index bd1ad516..f5a34538 100644 --- a/tests/test_init.py +++ b/tests/test_init.py @@ -19,6 +19,7 @@ def test_update(tmp_path, process): assert "Updating existing ArchiveBox collection in this folder" in update_process.stdout.decode("utf-8") def test_add_link(tmp_path, process, disable_extractors_dict): + disable_extractors_dict.update({"USE_WGET": "true"}) os.chdir(tmp_path) add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], capture_output=True, env=disable_extractors_dict) @@ -35,6 +36,7 @@ def test_add_link(tmp_path, process, disable_extractors_dict): assert "Example Domain" in output_html def test_add_link_support_stdin(tmp_path, process, disable_extractors_dict): + disable_extractors_dict.update({"USE_WGET": "true"}) os.chdir(tmp_path) stdin_process = subprocess.Popen(["archivebox", "add"], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, env=disable_extractors_dict)