From 603ce7ec1048321835ca6ec9647192e5249546ae Mon Sep 17 00:00:00 2001 From: spresse1 Date: Mon, 28 Aug 2023 17:27:03 +0200 Subject: [PATCH] After a timeout, chrome will leave behind a SingletonLock, which prevents future instances of chrome from starting. When an extractor fails due to a timeout, remove this file. --- archivebox/extractors/dom.py | 2 ++ archivebox/extractors/pdf.py | 2 ++ archivebox/extractors/screenshot.py | 2 ++ archivebox/util.py | 12 ++++++++++++ 4 files changed, 18 insertions(+) diff --git a/archivebox/extractors/dom.py b/archivebox/extractors/dom.py index 162ae38b..8a86026f 100644 --- a/archivebox/extractors/dom.py +++ b/archivebox/extractors/dom.py @@ -9,6 +9,7 @@ from ..util import ( enforce_types, is_static_file, chrome_args, + chrome_cleanup, ) from ..config import ( TIMEOUT, @@ -57,6 +58,7 @@ def save_dom(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> except Exception as err: status = 'failed' output = err + chrome_cleanup() finally: timer.end() diff --git a/archivebox/extractors/pdf.py b/archivebox/extractors/pdf.py index 9b256015..a6b51948 100644 --- a/archivebox/extractors/pdf.py +++ b/archivebox/extractors/pdf.py @@ -9,6 +9,7 @@ from ..util import ( enforce_types, is_static_file, chrome_args, + chrome_cleanup, ) from ..config import ( TIMEOUT, @@ -54,6 +55,7 @@ def save_pdf(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> except Exception as err: status = 'failed' output = err + chrome_cleanup() finally: timer.end() diff --git a/archivebox/extractors/screenshot.py b/archivebox/extractors/screenshot.py index a50f5896..7ed8dd9d 100644 --- a/archivebox/extractors/screenshot.py +++ b/archivebox/extractors/screenshot.py @@ -9,6 +9,7 @@ from ..util import ( enforce_types, is_static_file, chrome_args, + chrome_cleanup, ) from ..config import ( TIMEOUT, @@ -54,6 +55,7 @@ def save_screenshot(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO except Exception as err: status = 'failed' output = 
def chrome_cleanup():
    """
    Clean up state or runtime files that chrome leaves behind when it is
    killed by a timeout or other error.

    At present this only deletes the chromium profile's ``SingletonLock``
    file, and only when ``IN_DOCKER`` is truthy (the path below is hard-coded
    to the archivebox user's home directory). A leftover lock prevents future
    chrome instances from starting.

    A lock file that does not exist (or that disappears while we are cleaning
    up) is not treated as an error. Any other ``OSError`` raised by the
    removal (e.g. a permissions problem) is still propagated to the caller.
    """

    # Imported lazily, as in the original.
    # NOTE(review): presumably this avoids a circular import with .config - confirm.
    from .config import IN_DOCKER

    if not IN_DOCKER:
        return

    singleton_lock = "/home/archivebox/.config/chromium/SingletonLock"

    try:
        # Attempt the removal directly instead of checking lexists() first:
        # a separate existence check can go stale if another process removes
        # the lock in between, and this function runs inside the extractors'
        # `except` handlers, where a stray FileNotFoundError would escape.
        # remove() unlinks the path itself, so a dangling symlink is removed
        # too, matching what the previous lexists() check allowed for.
        remove_file(singleton_lock)
    except FileNotFoundError:
        # Nothing to clean up.
        pass