From 5b07a1126cfed92dae1b6e8297c5897b6e113ae5 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Wed, 3 Jan 2024 19:00:19 -0800 Subject: [PATCH] add comment about why DOM is preferred over singlefile for readability parsing --- archivebox/extractors/readability.py | 2 ++ archivebox/templates/admin/base.html | 1 + 2 files changed, 3 insertions(+) diff --git a/archivebox/extractors/readability.py b/archivebox/extractors/readability.py index f849d909..574dc09c 100644 --- a/archivebox/extractors/readability.py +++ b/archivebox/extractors/readability.py @@ -99,6 +99,8 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO except (Exception, OSError) as err: status = 'failed' output = err + + # prefer Chrome dom output to singlefile because singlefile often contains huge url(data:image/...base64) strings that make the html too long to parse with readability cmd = [cmd[0], './{dom,singlefile}.html'] finally: timer.end() diff --git a/archivebox/templates/admin/base.html b/archivebox/templates/admin/base.html index a29df4f5..5d4d4cc5 100644 --- a/archivebox/templates/admin/base.html +++ b/archivebox/templates/admin/base.html @@ -177,6 +177,7 @@ } {% endif %} +