diff --git a/archivebox/extractors/dom.py b/archivebox/extractors/dom.py index 162ae38b..8a86026f 100644 --- a/archivebox/extractors/dom.py +++ b/archivebox/extractors/dom.py @@ -9,6 +9,7 @@ from ..util import ( enforce_types, is_static_file, chrome_args, + chrome_cleanup, ) from ..config import ( TIMEOUT, @@ -57,6 +58,7 @@ def save_dom(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> except Exception as err: status = 'failed' output = err + chrome_cleanup() finally: timer.end() diff --git a/archivebox/extractors/pdf.py b/archivebox/extractors/pdf.py index 9b256015..a6b51948 100644 --- a/archivebox/extractors/pdf.py +++ b/archivebox/extractors/pdf.py @@ -9,6 +9,7 @@ from ..util import ( enforce_types, is_static_file, chrome_args, + chrome_cleanup, ) from ..config import ( TIMEOUT, @@ -54,6 +55,7 @@ def save_pdf(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> except Exception as err: status = 'failed' output = err + chrome_cleanup() finally: timer.end() diff --git a/archivebox/extractors/screenshot.py b/archivebox/extractors/screenshot.py index a50f5896..7ed8dd9d 100644 --- a/archivebox/extractors/screenshot.py +++ b/archivebox/extractors/screenshot.py @@ -9,6 +9,7 @@ from ..util import ( enforce_types, is_static_file, chrome_args, + chrome_cleanup, ) from ..config import ( TIMEOUT, @@ -54,6 +55,7 @@ def save_screenshot(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO except Exception as err: status = 'failed' output = err + chrome_cleanup() finally: timer.end() diff --git a/archivebox/util.py b/archivebox/util.py index cfa7d931..2eecbaeb 100644 --- a/archivebox/util.py +++ b/archivebox/util.py @@ -17,6 +17,8 @@ from requests.exceptions import RequestException, ReadTimeout from .vendor.base32_crockford import encode as base32_encode # type: ignore from w3lib.encoding import html_body_declared_encoding, http_content_type_encoding +from os.path import lexists +from os import remove as remove_file try: import chardet @@ 
def chrome_cleanup() -> None:
    """
    Cleans up any state or runtime files that chrome leaves behind when killed by
    a timeout or other error.

    Only acts when config.IN_DOCKER is truthy, in which case it deletes the
    chromium profile's SingletonLock entry if one is present. A lock that is
    already gone is not treated as an error; any other OSError raised by the
    removal (e.g. a permissions problem) still propagates to the caller.
    """

    # Imported at call time rather than module import time, as the original did.
    from .config import IN_DOCKER

    if not IN_DOCKER:
        return

    singleton_lock = "/home/archivebox/.config/chromium/SingletonLock"
    try:
        # Attempt the removal directly instead of checking lexists() first:
        # a separate existence check can go stale before the remove runs, and
        # this function is called from inside extractor `except` handlers where
        # a spurious FileNotFoundError would replace the failure being recorded.
        remove_file(singleton_lock)
    except FileNotFoundError:
        # Nothing to clean up (never created, or removed by someone else).
        pass