1
0
Fork 0
mirror of synced 2024-07-01 04:20:55 +12:00

fix: Organize readability extractor so a timeout does not break the whole process

This commit is contained in:
Cristian 2020-08-17 08:34:40 -05:00
parent 26022fc9fb
commit 05c71fc302

View file

@@ -62,19 +62,21 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO
output_folder = out_dir.absolute() / "readability" output_folder = out_dir.absolute() / "readability"
output = str(output_folder) output = str(output_folder)
# Readability Docs: https://github.com/mozilla/readability
status = 'succeeded'
timer = TimedProgress(timeout, prefix=' ')
try:
document = get_html(link, out_dir) document = get_html(link, out_dir)
temp_doc = NamedTemporaryFile(delete=False) temp_doc = NamedTemporaryFile(delete=False)
temp_doc.write(document.encode("utf-8")) temp_doc.write(document.encode("utf-8"))
temp_doc.close() temp_doc.close()
# Readability Docs: https://github.com/mozilla/readability
cmd = [ cmd = [
READABILITY_BINARY, READABILITY_BINARY,
temp_doc.name temp_doc.name
] ]
status = 'succeeded'
timer = TimedProgress(timeout, prefix=' ')
try:
result = run(cmd, cwd=out_dir, timeout=timeout) result = run(cmd, cwd=out_dir, timeout=timeout)
result_json = json.loads(result.stdout) result_json = json.loads(result.stdout)
output_folder.mkdir(exist_ok=True) output_folder.mkdir(exist_ok=True)