
Merge branch 'dev' into plugins-browsertrix

Nick Sweeting 2024-02-22 04:51:31 -08:00
commit 1ea7ac168a
13 changed files with 88 additions and 36 deletions

View file

@@ -28,4 +28,5 @@ assets/
docker/
data/
data*/
output/

View file

@@ -30,5 +30,4 @@ formats:
# See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html
python:
install:
- requirements: requirements.txt
- requirements: docs/requirements.txt
- requirements: docs/requirements.txt

View file

@@ -294,9 +294,8 @@ WORKDIR "$DATA_DIR"
VOLUME "$DATA_DIR"
EXPOSE 8000
# Optional:
# HEALTHCHECK --interval=30s --timeout=20s --retries=15 \
# CMD curl --silent 'http://localhost:8000/admin/login/' || exit 1
HEALTHCHECK --interval=30s --timeout=20s --retries=15 \
CMD curl --silent 'http://localhost:8000/health/' | grep -q 'OK'
ENTRYPOINT ["dumb-init", "--", "/app/bin/docker_entrypoint.sh"]
CMD ["archivebox", "server", "--quick-init", "0.0.0.0:8000"]

View file

@@ -408,7 +408,7 @@ See <a href="#%EF%B8%8F-cli-usage">below</a> for usage examples using the CLI, W
> *Warning: These are contributed by external volunteers and may lag behind the official `pip` channel.*
<ul>
<li><a href="https://dev.to/finloop/setting-up-archivebox-on-truenas-scale-1788">TrueNAS</a></li>
<li>TrueNAS: <a href="https://truecharts.org/charts/incubator/archivebox/">Official ArchiveBox TrueChart</a> / <a href="https://dev.to/finloop/setting-up-archivebox-on-truenas-scale-1788">Custom App Guide</a></li>
<li><a href="https://unraid.net/community/apps?q=archivebox#r">UnRaid</a></li>
<li><a href="https://github.com/YunoHost-Apps/archivebox_ynh">Yunohost</a></li>
<li><a href="https://www.cloudron.io/store/io.archivebox.cloudronapp.html">Cloudron</a></li>
@@ -1441,23 +1441,62 @@ archivebox init --setup
</details>
#### Make migrations or enter a django shell
#### Make DB migrations, enter a Django shell, and run other dev helper commands
<details><summary><i>Click to expand...</i></summary>
Make sure to run this whenever you change things in `models.py`.
```bash
# generate the database migrations after changes to models.py
cd archivebox/
./manage.py makemigrations
# enter a python shell or a SQL shell
cd path/to/test/data/
archivebox shell
archivebox manage dbshell
# generate a graph of the ORM models
brew install graphviz
pip install pydot graphviz
archivebox manage graph_models -a -o orm.png
open orm.png
# list all models with field db info and methods
archivebox manage list_model_info --all --signature --db-type --field-class
# print all django settings
archivebox manage print_settings
archivebox manage print_settings --format=yaml # pip install pyyaml
# autogenerate an admin.py from given app models
archivebox manage admin_generator core > core/admin.py
# dump db data to a script that re-populates it
archivebox manage dumpscript core > scripts/testdata.py
archivebox manage reset core
archivebox manage runscript testdata
# reset the DB and clear all data!
archivebox manage reset_db
# use django-tui to interactively explore commands
pip install django-tui
# ensure django-tui is in INSTALLED_APPS in core/settings.py (see the sketch below this block)
archivebox manage tui
# show python and JS package dependency trees
pdm list --tree
npm ls --all
```
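A minimal sketch of the `INSTALLED_APPS` change referenced in the block above, assuming a standard Django settings layout in `core/settings.py` (the `django_tui` app label comes from the django-tui docs):

```python
# core/settings.py (sketch) -- enables `archivebox manage tui` after `pip install django-tui`
INSTALLED_APPS = [
    # ... existing Django / ArchiveBox apps ...
    'django_tui',
]
```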
(uses `pytest -s`)
https://stackoverflow.com/questions/1074212/how-can-i-see-the-raw-sql-queries-django-is-running
<img src="https://github.com/ArchiveBox/ArchiveBox/assets/511499/dc3e9f8c-9544-46e0-a7f0-30f571b72022" width="600px" alt="ArchiveBox ORM models relationship graph"/>
- https://django-extensions.readthedocs.io/en/latest/command_extensions.html
- https://stackoverflow.com/questions/1074212/how-can-i-see-the-raw-sql-queries-django-is-running
- https://github.com/anze3db/django-tui (explore `manage.py` commands as TUI)
- https://github.com/bloomberg/memray (advanced python profiler)
- https://github.com/laixintao/flameshow (display flamegraphs in terminal)
- https://github.com/taliraj/django-migrations-tui (explore migrations as TUI)
</details>

View file

@@ -131,7 +131,7 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s
link = load_link_details(link, out_dir=out_dir)
write_link_details(link, out_dir=out_dir, skip_sql_index=False)
log_link_archiving_started(link, out_dir, is_new)
log_link_archiving_started(link, str(out_dir), is_new)
link = link.overwrite(updated=datetime.now(timezone.utc))
stats = {'skipped': 0, 'succeeded': 0, 'failed': 0}
start_ts = datetime.now(timezone.utc)
@@ -165,16 +165,6 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s
# print('{black} X {}{reset}'.format(method_name, **ANSI))
stats['skipped'] += 1
except Exception as e:
# Disabled until https://github.com/ArchiveBox/ArchiveBox/issues/984
# and https://github.com/ArchiveBox/ArchiveBox/issues/1014
# are fixed.
"""
raise Exception('Exception in archive_methods.save_{}(Link(url={}))'.format(
method_name,
link.url,
)) from e
"""
# Instead, use the kludgy workaround from
# https://github.com/ArchiveBox/ArchiveBox/issues/984#issuecomment-1150541627
with open(ERROR_LOG, "a", encoding='utf-8') as f:
command = ' '.join(sys.argv)
@@ -186,6 +176,13 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s
ts
) + "\n" + str(e) + "\n"))
#f.write(f"\n> {command}; ts={ts} version={config['VERSION']} docker={config['IN_DOCKER']} is_tty={config['IS_TTY']}\n")
# print(f' ERROR: {method_name} {e.__class__.__name__}: {e} {getattr(e, "hints", "")}', ts, link.url, command)
raise Exception('Exception in archive_methods.save_{}(Link(url={}))'.format(
method_name,
link.url,
)) from e
# print(' ', stats)
@@ -218,7 +215,7 @@ def archive_links(all_links: Union[Iterable[Link], QuerySet], overwrite: bool=Fa
if type(all_links) is QuerySet:
num_links: int = all_links.count()
get_link = lambda x: x.as_link()
get_link = lambda x: x.as_link_with_details()
all_links = all_links.iterator()
else:
num_links: int = len(all_links)

View file

@@ -121,9 +121,11 @@ def save_htmltotext(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO
out_dir = Path(out_dir or link.link_dir)
output = "htmltotext.txt"
cmd = ['(internal) archivebox.extractors.htmltotext', './{singlefile,dom}.html']
timer = TimedProgress(timeout, prefix=' ')
extracted_text = None
status = 'failed'
try:
extractor = HTMLTextExtractor()
document = get_html(link, out_dir)
@@ -136,10 +138,9 @@ def save_htmltotext(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO
extracted_text = str(extractor)
atomic_write(str(out_dir / output), extracted_text)
status = 'succeeded'
except (Exception, OSError) as err:
status = 'failed'
output = err
cmd = ['(internal) archivebox.extractors.htmltotext', './{singlefile,dom}.html']
finally:
timer.end()

View file

@@ -77,6 +77,7 @@ def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO
status = 'succeeded'
timer = TimedProgress(timeout, prefix=' ')
result = None
try:
result = run(cmd, cwd=str(out_dir), timeout=timeout)
@@ -84,7 +85,7 @@ def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO
# "Downloaded: 76 files, 4.0M in 1.6s (2.52 MB/s)"
output_tail = [
line.strip()
for line in (result.stdout + result.stderr).decode().rsplit('\n', 3)[-3:]
for line in (result.stdout + result.stderr).decode().rsplit('\n', 5)[-5:]
if line.strip()
]
hints = (
@@ -94,12 +95,13 @@ def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO
# Check for common failure cases
if (result.returncode > 0) or not (out_dir / output).is_file():
raise ArchiveError('SingleFile was not able to archive the page', hints)
raise ArchiveError(f'SingleFile was not able to archive the page (status={result.returncode})', hints)
chmod_file(output, cwd=str(out_dir))
except (Exception, OSError) as err:
status = 'failed'
# TODO: Make this prettier. This is necessary to run the command (escape JSON internal quotes).
cmd[2] = browser_args.replace('"', "\\\"")
err.hints = (result.stdout + result.stderr).decode().split('\n')
output = err
finally:
timer.end()

View file

@@ -75,7 +75,7 @@ def get_html(link: Link, path: Path, timeout: int=TIMEOUT) -> str:
with open(abs_path / source, "r", encoding="utf-8") as f:
document = f.read()
break
except (FileNotFoundError, TypeError):
except (FileNotFoundError, TypeError, UnicodeDecodeError):
continue
if document is None:
return download_url(link.url, timeout=timeout)

View file

@@ -407,7 +407,7 @@ def snapshot_filter(snapshots: QuerySet, filter_patterns: List[str], filter_type
def get_indexed_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
"""indexed links without checking archive status or data directory validity"""
links = [snapshot.as_link_with_details() for snapshot in snapshots.iterator()]
links = (snapshot.as_link() for snapshot in snapshots.iterator())
return {
link.link_dir: link
for link in links
@@ -415,7 +415,7 @@ def get_archived_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Option
def get_archived_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
"""indexed links that are archived with a valid data directory"""
links = [snapshot.as_link_with_details() for snapshot in snapshots.iterator()]
links = (snapshot.as_link() for snapshot in snapshots.iterator())
return {
link.link_dir: link
for link in filter(is_archived, links)
@@ -423,7 +423,7 @@ def get_unarchived_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optio
def get_unarchived_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
"""indexed links that are unarchived with no data directory or an empty data directory"""
links = [snapshot.as_link_with_details() for snapshot in snapshots.iterator()]
links = (snapshot.as_link() for snapshot in snapshots.iterator())
return {
link.link_dir: link
for link in filter(is_unarchived, links)
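The switch from list comprehensions to generator expressions in the three helpers above makes them lazy: each snapshot is converted only as the dict comprehension consumes it, instead of materializing every link up front. A generic sketch of the difference (not ArchiveBox code):

```python
# generic sketch: list comprehension (eager) vs generator expression (lazy)
nums = range(5)

eager = [n * n for n in nums]   # all values computed and stored immediately
lazy = (n * n for n in nums)    # nothing computed yet

assert eager == [0, 1, 4, 9, 16]
assert list(lazy) == [0, 1, 4, 9, 16]   # values produced only when consumed
```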

View file

@@ -432,12 +432,14 @@ def log_archive_method_finished(result: "ArchiveResult"):
**ANSI,
),
]
# import pudb; pudb.set_trace()
# Prettify error output hints string and limit to five lines
hints = getattr(result.output, 'hints', None) or ()
if hints:
if isinstance(hints, (list, tuple, type(_ for _ in ()))):
hints = [hint.decode() for hint in hints if isinstance(hint, bytes)]
hints = [hint.decode() if isinstance(hint, bytes) else str(hint) for hint in hints]
else:
if isinstance(hints, bytes):
hints = hints.decode()
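One note on the `isinstance` check above: `type(_ for _ in ())` is just an inline way to reference Python's generator type, so the branch treats hints passed as a list, tuple, or generator the same way. A small sketch of the equivalence:

```python
import types

# `type(_ for _ in ())` evaluates to the built-in generator type,
# so the check is equivalent to isinstance(hints, (list, tuple, types.GeneratorType))
assert type(_ for _ in ()) is types.GeneratorType
```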

View file

@@ -791,6 +791,8 @@ def update(resume: Optional[float]=None,
out_dir: Path=OUTPUT_DIR) -> List[Link]:
"""Import any new links from subscriptions and retry any previously failed/skipped links"""
from core.models import ArchiveResult
check_data_folder(out_dir=out_dir)
check_dependencies()
new_links: List[Link] = [] # TODO: Remove input argument: only_new
@@ -798,19 +800,23 @@ def update(resume: Optional[float]=None,
extractors = extractors.split(",") if extractors else []
# Step 1: Filter for selected_links
print('[*] Finding matching Snapshots to update...')
print(f' - Filtering by {" ".join(filter_patterns)} ({filter_type}) {before=} {after=} {status=}...')
matching_snapshots = list_links(
filter_patterns=filter_patterns,
filter_type=filter_type,
before=before,
after=after,
)
print(f' - Checking {matching_snapshots.count()} snapshot folders for existing data with {status=}...')
matching_folders = list_folders(
links=matching_snapshots,
status=status,
out_dir=out_dir,
)
all_links = [link for link in matching_folders.values() if link]
all_links = (link for link in matching_folders.values() if link)
print(' - Sorting by most unfinished -> least unfinished + date archived...')
all_links = sorted(all_links, key=lambda link: (ArchiveResult.objects.filter(snapshot__url=link.url).count(), link.timestamp))
if index_only:
for link in all_links:
@@ -836,6 +842,7 @@ def update(resume: Optional[float]=None,
if extractors:
archive_kwargs["methods"] = extractors
archive_links(to_archive, overwrite=overwrite, **archive_kwargs)
# Step 4: Re-write links index with updated titles, icons, and resources

View file

@@ -179,7 +179,11 @@ def download_url(url: str, timeout: int=None) -> str:
if encoding is not None:
response.encoding = encoding
return response.text
try:
return response.text
except UnicodeDecodeError:
# if the response is non-text (e.g. an image or other binary file), just return the filename instead
return url.rsplit('/', 1)[-1]
@enforce_types
def get_headers(url: str, timeout: int=None) -> str:
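For clarity, the new fallback in `download_url` simply returns the last path segment of the URL whenever the response body cannot be decoded as text, as this rough sketch shows:

```python
# sketch: the binary-response fallback returns the final URL path segment as a stand-in filename
url = 'https://example.com/static/photo.jpg'
assert url.rsplit('/', 1)[-1] == 'photo.jpg'
```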

View file

@@ -6,7 +6,8 @@
[server]
log_level = "debug"
# log_level = "debug"
log_level = "warn"
[channel]