
config fixes

Nick Sweeting 2020-10-31 07:55:27 -04:00
parent aa71a231f6
commit ac9e0e356d
8 changed files with 57 additions and 42 deletions

archivebox/config.py (View file)

@@ -885,32 +885,31 @@ def check_dependencies(config: ConfigDict=CONFIG, show_help: bool=True) -> None:
        stderr('')

    if config['TIMEOUT'] < 5:
        stderr()
        stderr(f'[!] Warning: TIMEOUT is set too low! (currently set to TIMEOUT={config["TIMEOUT"]} seconds)', color='red')
        stderr(' You must allow *at least* 5 seconds for indexing and archive methods to run successfully.')
        stderr(' (Setting it to somewhere between 30 and 3000 seconds is recommended)')
        stderr()
        stderr(' If you want to make ArchiveBox run faster, disable specific archive methods instead:')
        stderr(' https://github.com/pirate/ArchiveBox/wiki/Configuration#archive-method-toggles')
        stderr()
    elif config['USE_CHROME'] and config['TIMEOUT'] < 15:
        stderr()
        stderr(f'[!] Warning: TIMEOUT is set too low! (currently set to TIMEOUT={config["TIMEOUT"]} seconds)', color='red')
        stderr(' Chrome will fail to archive all sites if set to less than ~15 seconds.')
        stderr(' (Setting it to somewhere between 30 and 300 seconds is recommended)')
        stderr()
        stderr(' If you want to make ArchiveBox run faster, disable specific archive methods instead:')
        stderr(' https://github.com/pirate/ArchiveBox/wiki/Configuration#archive-method-toggles')
        stderr()

    if config['USE_YOUTUBEDL'] and config['MEDIA_TIMEOUT'] < 20:
        stderr()
        stderr(f'[!] Warning: MEDIA_TIMEOUT is set too low! (currently set to MEDIA_TIMEOUT={config["MEDIA_TIMEOUT"]} seconds)', color='red')
        stderr(' Youtube-dl will fail to archive all media if set to less than ~20 seconds.')
        stderr(' (Setting it somewhere over 60 seconds is recommended)')
        stderr()
        stderr(' If you want to disable media archiving entirely, set SAVE_MEDIA=False instead:')
        stderr(' https://github.com/pirate/ArchiveBox/wiki/Configuration#save_media')
        stderr()

def check_data_folder(out_dir: Optional[str]=None, config: ConfigDict=CONFIG) -> None:
    output_dir = out_dir or config['OUTPUT_DIR']

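For context, the three warnings above reduce to independent threshold checks: a hard 5-second floor for indexing, a ~15-second floor when Chrome-based extractors are enabled, and a ~20-second floor for youtube-dl media downloads. A minimal standalone sketch of the same logic; validate_timeouts and its return style are illustrative, not ArchiveBox API:

def validate_timeouts(timeout: int, media_timeout: int,
                      use_chrome: bool, use_youtubedl: bool) -> list:
    # Mirrors the thresholds checked by check_dependencies() above
    warnings = []
    if timeout < 5:
        warnings.append(f'TIMEOUT={timeout} is below the 5s minimum needed for indexing')
    elif use_chrome and timeout < 15:
        warnings.append(f'TIMEOUT={timeout} is below the ~15s Chrome needs per site')
    if use_youtubedl and media_timeout < 20:
        warnings.append(f'MEDIA_TIMEOUT={media_timeout} is below the ~20s youtube-dl needs')
    return warnings

assert validate_timeouts(60, 3600, use_chrome=True, use_youtubedl=True) == []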
archivebox/core/admin.py (View file)

@@ -11,7 +11,7 @@ from django.shortcuts import render, redirect
from django.contrib.auth import get_user_model
from django import forms
from core.models import Snapshot
from core.models import Snapshot, Tag
from core.forms import AddLinkForm, TagField
from core.utils import get_icons
@@ -109,8 +109,9 @@ class SnapshotAdmin(admin.ModelAdmin):
    def title_str(self, obj):
        canon = obj.as_link().canonical_outputs()
        tags = ''.join(
            format_html(' <a href="/admin/core/snapshot/?tags__id__exact={}"><span class="tag">{}</span></a> ', tag.id, tag)
            format_html('<a href="/admin/core/snapshot/?tags__id__exact={}"><span class="tag">{}</span></a> ', tag.id, tag)
            for tag in obj.tags.all()
            if str(tag).strip()
        )
        return format_html(
            '<a href="/{}">'
@@ -124,7 +125,7 @@ class SnapshotAdmin(admin.ModelAdmin):
            obj.archive_path,
            'fetched' if obj.latest_title or obj.title else 'pending',
            urldecode(htmldecode(obj.latest_title or obj.title or ''))[:128] or 'Pending...'
        ) + mark_safe(f'<span class="tags">{tags}</span>')
        ) + mark_safe(f' <span class="tags">{tags}</span>')

    def files(self, obj):
        return get_icons(obj)
@@ -151,6 +152,12 @@ class SnapshotAdmin(admin.ModelAdmin):
    title_str.admin_order_field = 'title'
    url_str.admin_order_field = 'url'

class TagAdmin(admin.ModelAdmin):
    list_display = ('slug', 'name', 'id')
    sort_fields = ('id', 'name', 'slug')
    readonly_fields = ('id',)
    search_fields = ('id', 'name', 'slug')
    fields = (*readonly_fields, 'name', 'slug')

class ArchiveBoxAdmin(admin.AdminSite):
@@ -206,4 +213,5 @@ class ArchiveBoxAdmin(admin.AdminSite):
admin.site = ArchiveBoxAdmin()
admin.site.register(get_user_model())
admin.site.register(Snapshot, SnapshotAdmin)
admin.site.register(Tag, TagAdmin)
admin.site.disable_action('delete_selected')

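A note on the escaping in title_str() above: each tag link fragment is built with format_html(), which HTML-escapes tag.id and str(tag), so joining the pre-escaped fragments and then wrapping them in mark_safe() does not reintroduce unescaped user input. A minimal sketch of the same pattern; tag_links() is a hypothetical standalone helper:

from django.utils.html import format_html, mark_safe

def tag_links(tags):
    # Each fragment is escaped individually by format_html(),
    # so joining them and marking the result safe is OK.
    fragments = ''.join(
        format_html('<a href="/admin/core/snapshot/?tags__id__exact={}"><span class="tag">{}</span></a> ', tag.id, tag)
        for tag in tags
        if str(tag).strip()
    )
    return mark_safe(f' <span class="tags">{fragments}</span>')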
archivebox/core/models.py (View file)

@@ -82,7 +82,7 @@ class Snapshot(models.Model):
        args = args or self.keys
        return {
            key: getattr(self, key)
                if key != 'tags' else self.get_tags_str()
                if key != 'tags' else self.tags_str()
            for key in args
        }
@@ -93,12 +93,8 @@
        from ..index import load_link_details
        return load_link_details(self.as_link())

    def get_tags_str(self) -> str:
        tags = ','.join(
            tag.name
            for tag in self.tags.all()
        ) if self.tags.all() else ''
        return tags

    def tags_str(self) -> str:
        return ','.join(self.tags.order_by('name').values_list('name', flat=True))

    @cached_property
    def bookmarked(self):

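The models.py change replaces get_tags_str(), which evaluated self.tags.all() twice and joined Tag objects in Python, with tags_str(), a single query that sorts in SQL and fetches only the name column. (The old `if self.tags.all() else ''` guard was redundant anyway, since joining an empty queryset already yields ''.) Rough usage, for a snapshot tagged 'python' and 'news':

snapshot.tags.order_by('name').values_list('name', flat=True)  # <QuerySet ['news', 'python']>
snapshot.tags_str()                                            # 'news,python'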
archivebox/core/settings.py (View file)

@@ -25,6 +25,7 @@ IS_SHELL = 'shell' in sys.argv[:3] or 'shell_plus' in sys.argv[:3]
### Django Core Settings
################################################################################
DEBUG = True
WSGI_APPLICATION = 'core.wsgi.application'
ROOT_URLCONF = 'core.urls'

archivebox/core/utils.py (View file)

@@ -13,26 +13,26 @@ def get_icons(snapshot: Snapshot) -> str:
    # slow version: highlights icons based on whether files exist or not for that output
    # link_tuple = lambda link, method: (link.archive_path, canon[method] or '', canon[method] and (out_dir / (canon[method] or 'notdone')).exists())
    # fast version: all icons are highlighted without checking for outputs in filesystem
    link_tuple = lambda link, method: (link.archive_path, canon[method] or '', canon[method])
    link_tuple = lambda link, method: (link.archive_path, canon[method] or '', canon[method] and (out_dir / (canon[method] or 'notdone')).exists())

    return format_html(
        '<span class="files-icons" style="font-size: 1.2em; opacity: 0.8">'
        '<a href="/{}/{}" class="exists-{}" title="Wget clone">🌐 </a> '
        '<a href="/{}/{}" class="exists-{}" title="SingleFile">&#128476; </a>'
        '<a href="/{}/{}" class="exists-{}" title="PDF">📄</a> '
        '<a href="/{}/{}" class="exists-{}" title="Screenshot">🖥 </a> '
        '<a href="/{}/{}" class="exists-{}" title="HTML dump">🅷 </a> '
        '<a href="/{}/{}" class="exists-{}" title="WARC">🆆 </a> '
        '<a href="/{}/{}" class="exists-{}" title="SingleFile">&#128476; </a>'
        '<a href="/{}/{}/" class="exists-{}" title="Media files">📼 </a> '
        '<a href="/{}/{}/" class="exists-{}" title="Git repos">📦 </a> '
        '<a href="{}" class="exists-{}" title="Archive.org snapshot">🏛 </a> '
        '</span>',
        *link_tuple(link, 'wget_path'),
        *link_tuple(link, 'singlefile_path'),
        *link_tuple(link, 'pdf_path'),
        *link_tuple(link, 'screenshot_path'),
        *link_tuple(link, 'dom_path'),
        *link_tuple(link, 'warc_path')[:2], any((out_dir / canon['warc_path']).glob('*.warc.gz')),
        *link_tuple(link, 'singlefile_path'),
        *link_tuple(link, 'media_path')[:2], any((out_dir / canon['media_path']).glob('*')),
        *link_tuple(link, 'git_path')[:2], any((out_dir / canon['git_path']).glob('*')),
        canon['archive_org_path'], (out_dir / 'archive.org.txt').exists(),

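The utils.py change swaps the active link_tuple lambda from the "fast" variant, which lights every icon unconditionally, back to the "slow" variant that stats the filesystem per output. The per-method check reduces to the following sketch, where output_exists() is illustrative and out_dir/canon are the same variables used in get_icons():

from pathlib import Path

def output_exists(out_dir: Path, canon: dict, method: str) -> bool:
    # canonical output path for this extractor, e.g. 'output.pdf', or None
    rel = canon.get(method)
    return bool(rel) and (out_dir / rel).exists()

The WARC, media, and git entries instead use any((out_dir / path).glob(...)) because those extractors write a directory of files rather than a single canonical filename.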
archivebox/extractors/archive_org.py (View file)

@@ -59,7 +59,7 @@ def save_archive_dot_org(link: Link, out_dir: Optional[Path]=None, timeout: int=
        result = run(cmd, cwd=str(out_dir), timeout=timeout)
        content_location, errors = parse_archive_dot_org_response(result.stdout)
        if content_location:
            archive_org_url = 'https://web.archive.org{}'.format(content_location[0])
            archive_org_url = content_location[0]
        elif len(errors) == 1 and 'RobotAccessControlException' in errors[0]:
            archive_org_url = None
            # raise ArchiveError('Archive.org denied by {}/robots.txt'.format(domain(link.url)))

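The archive_org.py fix stops prefixing 'https://web.archive.org' onto the parsed Content-Location value, treating it as a full URL rather than a path. For reference, a hedged sketch of what parsing the submission response headers can look like; parse_content_location() is illustrative and the real logic lives in parse_archive_dot_org_response():

def parse_content_location(response_headers: bytes):
    locations, errors = [], []
    for line in response_headers.decode(errors='replace').splitlines():
        name, _, value = line.partition(':')
        if name.strip().lower() == 'content-location':
            locations.append(value.strip())
        elif name.strip().lower() == 'x-archive-wayback-runtime-error':
            errors.append(value.strip())
    return locations, errors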
archivebox/extractors/headers.py (View file)

@@ -50,11 +50,9 @@ def save_headers(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT)
        link.url,
    ]
    try:
        json_headers = get_headers(link.url)
        json_headers = get_headers(link.url, timeout=timeout)
        output_folder.mkdir(exist_ok=True)
        atomic_write(str(output_folder / "headers.json"), json_headers)
    except (Exception, OSError) as err:
        status = 'failed'
        output = err

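The headers.py fix threads the extractor's timeout through to get_headers() instead of letting the helper fall back to its own default. Assuming get_headers() is roughly a thin wrapper over requests (an assumption; the real helper lives in archivebox's util module), the shape of the change is:

import json
import requests

def get_headers(url: str, timeout: int = 60) -> str:
    # HEAD keeps the request cheap; the timeout now comes from the caller
    response = requests.head(url, allow_redirects=True, timeout=timeout)
    return json.dumps(dict(response.headers), indent=4)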
archivebox/extractors/mercury.py (View file)

@@ -2,7 +2,8 @@ __package__ = 'archivebox.extractors'
from pathlib import Path
from typing import Optional
from subprocess import CompletedProcess
from typing import Optional, Tuple, List
import json
from ..index.schema import Link, ArchiveResult, ArchiveError
@@ -20,6 +21,21 @@ from ..config import (
)
from ..logging_util import TimedProgress

@enforce_types
def ShellError(cmd: List[str], result: CompletedProcess, lines: int=20) -> ArchiveError:
    # parse out the last `lines` lines of stdout+stderr as hints
    return ArchiveError(
        f'Got {cmd[0]} response code: {result.returncode}.',
        *(
            line.strip()
            for line in (result.stdout + result.stderr).decode().rsplit('\n', lines)[-lines:]
            if line.strip()
        ),
    )

@enforce_types
def should_save_mercury(link: Link, out_dir: Optional[str]=None) -> bool:
    out_dir = out_dir or link.link_dir
@@ -31,7 +47,7 @@ def should_save_mercury(link: Link, out_dir: Optional[str]=None) -> bool:
@enforce_types
def save_mercury(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
def save_mercury(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
    """download reader friendly version using @postlight/mercury-parser"""

    out_dir = Path(out_dir or link.link_dir)
@@ -41,41 +57,38 @@ def save_mercury(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT)
    status = 'succeeded'
    timer = TimedProgress(timeout, prefix=' ')
    try:
        # Get plain text version of article
        cmd = [
            DEPENDENCIES['MERCURY_BINARY']['path'],
            link.url,
            "--format=text"
        ]
        result = run(cmd, cwd=out_dir, timeout=timeout)
        txtresult_json = json.loads(result.stdout)
        try:
            article_text = json.loads(result.stdout)
        except json.JSONDecodeError:
            raise ShellError(cmd, result)

        # Get HTML version of article
        cmd = [
            DEPENDENCIES['MERCURY_BINARY']['path'],
            link.url
        ]
        result = run(cmd, cwd=out_dir, timeout=timeout)
        result_json = json.loads(result.stdout)
        try:
            article_json = json.loads(result.stdout)
        except json.JSONDecodeError:
            raise ShellError(cmd, result)

        output_folder.mkdir(exist_ok=True)
        atomic_write(str(output_folder / "content.html"), result_json.pop("content"))
        atomic_write(str(output_folder / "content.txt"), txtresult_json["content"])
        atomic_write(str(output_folder / "article.json"), result_json)

        # parse out last line of stderr
        output_tail = [
            line.strip()
            for line in (result.stdout + result.stderr).decode().rsplit('\n', 20)[-20:]
            if line.strip()
        ]
        hints = (
            'Got mercury response code: {}.'.format(result.returncode),
            *output_tail,
        )
        atomic_write(str(output_folder / "content.html"), article_json.pop("content"))
        atomic_write(str(output_folder / "content.txt"), article_text["content"])
        atomic_write(str(output_folder / "article.json"), article_json)

        # Check for common failure cases
        if (result.returncode > 0):
            raise ArchiveError('Mercury parser was not able to archive the page', hints)
            raise ShellError(cmd, result)
    except (Exception, OSError) as err:
    except (ArchiveError, Exception, OSError) as err:
        status = 'failed'
        output = err
    finally:
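The net effect of the mercury.py refactor is that the inline "last 20 lines of output" hint-building is factored into the reusable ShellError() helper, which builds (and the call sites raise) an ArchiveError carrying the tail of stdout+stderr as hints. Rough usage sketch, assuming mercury-parser is on PATH (in the real extractor the binary path comes from DEPENDENCIES['MERCURY_BINARY']['path']):

from subprocess import PIPE, run

cmd = ['mercury-parser', 'https://example.com', '--format=text']
result = run(cmd, stdout=PIPE, stderr=PIPE, timeout=60)
if result.returncode > 0:
    # ArchiveError whose hints are the last 20 non-empty lines of output
    raise ShellError(cmd, result)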