From 05c71fc30275e9d555a744efd6745c803a32a8fd Mon Sep 17 00:00:00 2001 From: Cristian Date: Mon, 17 Aug 2020 08:34:40 -0500 Subject: [PATCH] fix: Organize readability extractor so a timeout does not break the whole process --- archivebox/extractors/readability.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/archivebox/extractors/readability.py b/archivebox/extractors/readability.py index c6335a5a..78ca863f 100644 --- a/archivebox/extractors/readability.py +++ b/archivebox/extractors/readability.py @@ -62,19 +62,21 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO output_folder = out_dir.absolute() / "readability" output = str(output_folder) - document = get_html(link, out_dir) - temp_doc = NamedTemporaryFile(delete=False) - temp_doc.write(document.encode("utf-8")) - temp_doc.close() # Readability Docs: https://github.com/mozilla/readability - cmd = [ - READABILITY_BINARY, - temp_doc.name - ] status = 'succeeded' timer = TimedProgress(timeout, prefix=' ') try: + document = get_html(link, out_dir) + temp_doc = NamedTemporaryFile(delete=False) + temp_doc.write(document.encode("utf-8")) + temp_doc.close() + + cmd = [ + READABILITY_BINARY, + temp_doc.name + ] + result = run(cmd, cwd=out_dir, timeout=timeout) result_json = json.loads(result.stdout) output_folder.mkdir(exist_ok=True)