From acb932ba125d6d2d2908e56aecf3ce7abdd8e846 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Tue, 16 Feb 2021 15:53:11 -0500 Subject: [PATCH] improve readability and mercury error handling and fix output path to be relative --- archivebox/extractors/mercury.py | 8 +++++++- archivebox/extractors/readability.py | 15 +++++++++++---- 2 files changed, 18 insertions(+), 5 deletions(-) diff --git a/archivebox/extractors/mercury.py b/archivebox/extractors/mercury.py index 59cfc6f6..e7d20362 100644 --- a/archivebox/extractors/mercury.py +++ b/archivebox/extractors/mercury.py @@ -54,7 +54,7 @@ def save_mercury(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) out_dir = Path(out_dir or link.link_dir) output_folder = out_dir.absolute() / "mercury" - output = str(output_folder) + output = "mercury" status = 'succeeded' timer = TimedProgress(timeout, prefix=' ') @@ -73,6 +73,9 @@ def save_mercury(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) except json.JSONDecodeError: raise ShellError(cmd, result) + if article_text.get('failed'): + raise ArchiveError('Mercury was not able to get article text from the URL') + atomic_write(str(output_folder / "content.txt"), article_text["content"]) # Get HTML version of article @@ -86,6 +89,9 @@ def save_mercury(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) except json.JSONDecodeError: raise ShellError(cmd, result) + if article_json.get('failed'): + raise ArchiveError('Mercury was not able to get article HTML from the URL') + atomic_write(str(output_folder / "content.html"), article_json.pop("content")) atomic_write(str(output_folder / "article.json"), article_json) diff --git a/archivebox/extractors/readability.py b/archivebox/extractors/readability.py index 3e7f2069..b2e88712 100644 --- a/archivebox/extractors/readability.py +++ b/archivebox/extractors/readability.py @@ -63,7 +63,7 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO out_dir = Path(out_dir or 
link.link_dir) output_folder = out_dir.absolute() / "readability" - output = str(output_folder) + output = "readability" # Readability Docs: https://github.com/mozilla/readability @@ -81,13 +81,20 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO temp_doc.write(document.encode("utf-8")) temp_doc.close() + if not document or len(document) < 10: + raise ArchiveError('Readability could not find HTML to parse for article text') + cmd = [ DEPENDENCIES['READABILITY_BINARY']['path'], - temp_doc.name + temp_doc.name, ] result = run(cmd, cwd=out_dir, timeout=timeout) - result_json = json.loads(result.stdout) + try: + result_json = json.loads(result.stdout) + except json.JSONDecodeError: + raise ArchiveError('Readability was not able to archive the page', result.stdout + result.stderr) + output_folder.mkdir(exist_ok=True) readability_content = result_json.pop("textContent") atomic_write(str(output_folder / "content.html"), result_json.pop("content")) @@ -122,6 +129,6 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO cmd_version=READABILITY_VERSION, output=output, status=status, - index_texts= [readability_content] if readability_content else [], + index_texts=[readability_content] if readability_content else [], **timer.stats, )