1
0
Fork 0
mirror of synced 2024-06-24 17:10:21 +12:00

fix: Organize readability extractor so a timeout does not break the whole process

This commit is contained in:
Cristian 2020-08-17 08:34:40 -05:00
parent 26022fc9fb
commit 05c71fc302

View file

@@ -62,19 +62,21 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO
output_folder = out_dir.absolute() / "readability"
output = str(output_folder)
document = get_html(link, out_dir)
temp_doc = NamedTemporaryFile(delete=False)
temp_doc.write(document.encode("utf-8"))
temp_doc.close()
# Readability Docs: https://github.com/mozilla/readability
cmd = [
READABILITY_BINARY,
temp_doc.name
]
status = 'succeeded'
timer = TimedProgress(timeout, prefix=' ')
try:
document = get_html(link, out_dir)
temp_doc = NamedTemporaryFile(delete=False)
temp_doc.write(document.encode("utf-8"))
temp_doc.close()
cmd = [
READABILITY_BINARY,
temp_doc.name
]
result = run(cmd, cwd=out_dir, timeout=timeout)
result_json = json.loads(result.stdout)
output_folder.mkdir(exist_ok=True)