
config fixes

Nick Sweeting 2020-10-31 07:55:27 -04:00
parent aa71a231f6
commit ac9e0e356d
8 changed files with 57 additions and 42 deletions

archivebox/config.py (View file)

@@ -885,32 +885,31 @@ def check_dependencies(config: ConfigDict=CONFIG, show_help: bool=True) -> None:
        stderr('')

    if config['TIMEOUT'] < 5:
        stderr()
        stderr(f'[!] Warning: TIMEOUT is set too low! (currently set to TIMEOUT={config["TIMEOUT"]} seconds)', color='red')
        stderr(' You must allow *at least* 5 seconds for indexing and archive methods to run successfully.')
        stderr(' (Setting it to somewhere between 30 and 3000 seconds is recommended)')
        stderr()
        stderr(' If you want to make ArchiveBox run faster, disable specific archive methods instead:')
        stderr(' https://github.com/pirate/ArchiveBox/wiki/Configuration#archive-method-toggles')
        stderr()
    elif config['USE_CHROME'] and config['TIMEOUT'] < 15:
        stderr()
        stderr(f'[!] Warning: TIMEOUT is set too low! (currently set to TIMEOUT={config["TIMEOUT"]} seconds)', color='red')
        stderr(' Chrome will fail to archive all sites if set to less than ~15 seconds.')
        stderr(' (Setting it to somewhere between 30 and 300 seconds is recommended)')
        stderr()
        stderr(' If you want to make ArchiveBox run faster, disable specific archive methods instead:')
        stderr(' https://github.com/pirate/ArchiveBox/wiki/Configuration#archive-method-toggles')
        stderr()

    if config['USE_YOUTUBEDL'] and config['MEDIA_TIMEOUT'] < 20:
        stderr()
        stderr(f'[!] Warning: MEDIA_TIMEOUT is set too low! (currently set to MEDIA_TIMEOUT={config["MEDIA_TIMEOUT"]} seconds)', color='red')
        stderr(' Youtube-dl will fail to archive all media if set to less than ~20 seconds.')
        stderr(' (Setting it somewhere over 60 seconds is recommended)')
        stderr()
        stderr(' If you want to disable media archiving entirely, set SAVE_MEDIA=False instead:')
        stderr(' https://github.com/pirate/ArchiveBox/wiki/Configuration#save_media')
        stderr()

def check_data_folder(out_dir: Optional[str]=None, config: ConfigDict=CONFIG) -> None:
    output_dir = out_dir or config['OUTPUT_DIR']

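For context, the three warnings above reduce to independent threshold checks: a hard 5-second floor for indexing, a ~15-second floor when Chrome-based extractors are enabled, and a ~20-second floor for youtube-dl media downloads. A minimal standalone sketch of the same logic; validate_timeouts and its return style are illustrative, not ArchiveBox API:

def validate_timeouts(timeout: int, media_timeout: int,
                      use_chrome: bool, use_youtubedl: bool) -> list:
    # Mirrors the thresholds checked by check_dependencies() above
    warnings = []
    if timeout < 5:
        warnings.append(f'TIMEOUT={timeout} is below the 5s minimum needed for indexing')
    elif use_chrome and timeout < 15:
        warnings.append(f'TIMEOUT={timeout} is below the ~15s Chrome needs per site')
    if use_youtubedl and media_timeout < 20:
        warnings.append(f'MEDIA_TIMEOUT={media_timeout} is below the ~20s youtube-dl needs')
    return warnings

assert validate_timeouts(60, 3600, use_chrome=True, use_youtubedl=True) == []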
archivebox/core/admin.py (View file)

@@ -11,7 +11,7 @@ from django.shortcuts import render, redirect
from django.contrib.auth import get_user_model
from django import forms
from core.models import Snapshot
from core.models import Snapshot, Tag
from core.forms import AddLinkForm, TagField
from core.utils import get_icons
@@ -109,8 +109,9 @@ class SnapshotAdmin(admin.ModelAdmin):
    def title_str(self, obj):
        canon = obj.as_link().canonical_outputs()
        tags = ''.join(
            format_html(' <a href="/admin/core/snapshot/?tags__id__exact={}"><span class="tag">{}</span></a> ', tag.id, tag)
            format_html('<a href="/admin/core/snapshot/?tags__id__exact={}"><span class="tag">{}</span></a> ', tag.id, tag)
            for tag in obj.tags.all()
            if str(tag).strip()
        )
        return format_html(
            '<a href="/{}">'
@@ -124,7 +125,7 @@ class SnapshotAdmin(admin.ModelAdmin):
            obj.archive_path,
            'fetched' if obj.latest_title or obj.title else 'pending',
            urldecode(htmldecode(obj.latest_title or obj.title or ''))[:128] or 'Pending...'
        ) + mark_safe(f'<span class="tags">{tags}</span>')
        ) + mark_safe(f' <span class="tags">{tags}</span>')

    def files(self, obj):
        return get_icons(obj)
@@ -151,6 +152,12 @@ class SnapshotAdmin(admin.ModelAdmin):
    title_str.admin_order_field = 'title'
    url_str.admin_order_field = 'url'

class TagAdmin(admin.ModelAdmin):
    list_display = ('slug', 'name', 'id')
    sort_fields = ('id', 'name', 'slug')
    readonly_fields = ('id',)
    search_fields = ('id', 'name', 'slug')
    fields = (*readonly_fields, 'name', 'slug')

class ArchiveBoxAdmin(admin.AdminSite):
@@ -206,4 +213,5 @@ class ArchiveBoxAdmin(admin.AdminSite):
admin.site = ArchiveBoxAdmin()
admin.site.register(get_user_model())
admin.site.register(Snapshot, SnapshotAdmin)
admin.site.register(Tag, TagAdmin)
admin.site.disable_action('delete_selected')

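A note on the escaping in title_str() above: each tag link fragment is built with format_html(), which HTML-escapes tag.id and str(tag), so joining the pre-escaped fragments and then wrapping them in mark_safe() does not reintroduce unescaped user input. A minimal sketch of the same pattern; tag_links() is a hypothetical standalone helper:

from django.utils.html import format_html, mark_safe

def tag_links(tags):
    # Each fragment is escaped individually by format_html(),
    # so joining them and marking the result safe is OK.
    fragments = ''.join(
        format_html('<a href="/admin/core/snapshot/?tags__id__exact={}"><span class="tag">{}</span></a> ', tag.id, tag)
        for tag in tags
        if str(tag).strip()
    )
    return mark_safe(f' <span class="tags">{fragments}</span>')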
archivebox/core/models.py (View file)

@@ -82,7 +82,7 @@ class Snapshot(models.Model):
        args = args or self.keys
        return {
            key: getattr(self, key)
                if key != 'tags' else self.get_tags_str()
                if key != 'tags' else self.tags_str()
            for key in args
        }
@@ -93,12 +93,8 @@
        from ..index import load_link_details
        return load_link_details(self.as_link())

    def get_tags_str(self) -> str:
        tags = ','.join(
            tag.name
            for tag in self.tags.all()
        ) if self.tags.all() else ''
        return tags

    def tags_str(self) -> str:
        return ','.join(self.tags.order_by('name').values_list('name', flat=True))

    @cached_property
    def bookmarked(self):

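The models.py change replaces get_tags_str(), which evaluated self.tags.all() twice and joined Tag objects in Python, with tags_str(), a single query that sorts in SQL and fetches only the name column. (The old `if self.tags.all() else ''` guard was redundant anyway, since joining an empty queryset already yields ''.) Rough usage, for a snapshot tagged 'python' and 'news':

snapshot.tags.order_by('name').values_list('name', flat=True)  # <QuerySet ['news', 'python']>
snapshot.tags_str()                                            # 'news,python'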
archivebox/core/settings.py (View file)

@@ -25,6 +25,7 @@ IS_SHELL = 'shell' in sys.argv[:3] or 'shell_plus' in sys.argv[:3]
### Django Core Settings
################################################################################
DEBUG = True
WSGI_APPLICATION = 'core.wsgi.application'
ROOT_URLCONF = 'core.urls'

archivebox/core/utils.py (View file)

@@ -13,26 +13,26 @@ def get_icons(snapshot: Snapshot) -> str:
    # slow version: highlights icons based on whether files exist or not for that output
    # link_tuple = lambda link, method: (link.archive_path, canon[method] or '', canon[method] and (out_dir / (canon[method] or 'notdone')).exists())
    # fast version: all icons are highlighted without checking for outputs in filesystem
    link_tuple = lambda link, method: (link.archive_path, canon[method] or '', canon[method])
    link_tuple = lambda link, method: (link.archive_path, canon[method] or '', canon[method] and (out_dir / (canon[method] or 'notdone')).exists())

    return format_html(
        '<span class="files-icons" style="font-size: 1.2em; opacity: 0.8">'
        '<a href="/{}/{}" class="exists-{}" title="Wget clone">🌐 </a> '
        '<a href="/{}/{}" class="exists-{}" title="SingleFile">&#128476; </a>'
        '<a href="/{}/{}" class="exists-{}" title="PDF">📄</a> '
        '<a href="/{}/{}" class="exists-{}" title="Screenshot">🖥 </a> '
        '<a href="/{}/{}" class="exists-{}" title="HTML dump">🅷 </a> '
        '<a href="/{}/{}" class="exists-{}" title="WARC">🆆 </a> '
        '<a href="/{}/{}" class="exists-{}" title="SingleFile">&#128476; </a>'
        '<a href="/{}/{}/" class="exists-{}" title="Media files">📼 </a> '
        '<a href="/{}/{}/" class="exists-{}" title="Git repos">📦 </a> '
        '<a href="{}" class="exists-{}" title="Archive.org snapshot">🏛 </a> '
        '</span>',
        *link_tuple(link, 'wget_path'),
        *link_tuple(link, 'singlefile_path'),
        *link_tuple(link, 'pdf_path'),
        *link_tuple(link, 'screenshot_path'),
        *link_tuple(link, 'dom_path'),
        *link_tuple(link, 'warc_path')[:2], any((out_dir / canon['warc_path']).glob('*.warc.gz')),
        *link_tuple(link, 'singlefile_path'),
        *link_tuple(link, 'media_path')[:2], any((out_dir / canon['media_path']).glob('*')),
        *link_tuple(link, 'git_path')[:2], any((out_dir / canon['git_path']).glob('*')),
        canon['archive_org_path'], (out_dir / 'archive.org.txt').exists(),

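The utils.py change swaps the active link_tuple lambda from the "fast" variant, which lights every icon unconditionally, back to the "slow" variant that stats the filesystem per output. The per-method check reduces to the following sketch, where output_exists() is illustrative and out_dir/canon are the same variables used in get_icons():

from pathlib import Path

def output_exists(out_dir: Path, canon: dict, method: str) -> bool:
    # canonical output path for this extractor, e.g. 'output.pdf', or None
    rel = canon.get(method)
    return bool(rel) and (out_dir / rel).exists()

The WARC, media, and git entries instead use any((out_dir / path).glob(...)) because those extractors write a directory of files rather than a single canonical filename.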
archivebox/extractors/archive_org.py (View file)

@@ -59,7 +59,7 @@ def save_archive_dot_org(link: Link, out_dir: Optional[Path]=None, timeout: int=
        result = run(cmd, cwd=str(out_dir), timeout=timeout)
        content_location, errors = parse_archive_dot_org_response(result.stdout)
        if content_location:
            archive_org_url = 'https://web.archive.org{}'.format(content_location[0])
            archive_org_url = content_location[0]
        elif len(errors) == 1 and 'RobotAccessControlException' in errors[0]:
            archive_org_url = None
            # raise ArchiveError('Archive.org denied by {}/robots.txt'.format(domain(link.url)))

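The archive_org.py fix stops prefixing 'https://web.archive.org' onto the parsed Content-Location value, treating it as a full URL rather than a path. For reference, a hedged sketch of what parsing the submission response headers can look like; parse_content_location() is illustrative and the real logic lives in parse_archive_dot_org_response():

def parse_content_location(response_headers: bytes):
    locations, errors = [], []
    for line in response_headers.decode(errors='replace').splitlines():
        name, _, value = line.partition(':')
        if name.strip().lower() == 'content-location':
            locations.append(value.strip())
        elif name.strip().lower() == 'x-archive-wayback-runtime-error':
            errors.append(value.strip())
    return locations, errors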
archivebox/extractors/headers.py (View file)

@@ -50,11 +50,9 @@ def save_headers(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT)
        link.url,
    ]
    try:
        json_headers = get_headers(link.url)
        json_headers = get_headers(link.url, timeout=timeout)
        output_folder.mkdir(exist_ok=True)
        atomic_write(str(output_folder / "headers.json"), json_headers)
    except (Exception, OSError) as err:
        status = 'failed'
        output = err

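The headers.py fix threads the extractor's timeout through to get_headers() instead of letting the helper fall back to its own default. Assuming get_headers() is roughly a thin wrapper over requests (an assumption; the real helper lives in archivebox's util module), the shape of the change is:

import json
import requests

def get_headers(url: str, timeout: int = 60) -> str:
    # HEAD keeps the request cheap; the timeout now comes from the caller
    response = requests.head(url, allow_redirects=True, timeout=timeout)
    return json.dumps(dict(response.headers), indent=4)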
archivebox/extractors/mercury.py (View file)

@@ -2,7 +2,8 @@ __package__ = 'archivebox.extractors'
from pathlib import Path
from typing import Optional
from subprocess import CompletedProcess
from typing import Optional, Tuple, List
import json
from ..index.schema import Link, ArchiveResult, ArchiveError
@@ -20,6 +21,21 @@ from ..config import (
)
from ..logging_util import TimedProgress

@enforce_types
def ShellError(cmd: List[str], result: CompletedProcess, lines: int=20) -> ArchiveError:
    # parse out the last `lines` lines of stdout+stderr as hints
    return ArchiveError(
        f'Got {cmd[0]} response code: {result.returncode}.',
        *(
            line.strip()
            for line in (result.stdout + result.stderr).decode().rsplit('\n', lines)[-lines:]
            if line.strip()
        ),
    )

@enforce_types
def should_save_mercury(link: Link, out_dir: Optional[str]=None) -> bool:
    out_dir = out_dir or link.link_dir
@@ -31,7 +47,7 @@ def should_save_mercury(link: Link, out_dir: Optional[str]=None) -> bool:
@enforce_types
def save_mercury(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
def save_mercury(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
    """download reader friendly version using @postlight/mercury-parser"""

    out_dir = Path(out_dir or link.link_dir)
@@ -41,41 +57,38 @@ def save_mercury(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT)
    status = 'succeeded'
    timer = TimedProgress(timeout, prefix=' ')
    try:
        # Get plain text version of article
        cmd = [
            DEPENDENCIES['MERCURY_BINARY']['path'],
            link.url,
            "--format=text"
        ]
        result = run(cmd, cwd=out_dir, timeout=timeout)
        txtresult_json = json.loads(result.stdout)
        try:
            article_text = json.loads(result.stdout)
        except json.JSONDecodeError:
            raise ShellError(cmd, result)

        # Get HTML version of article
        cmd = [
            DEPENDENCIES['MERCURY_BINARY']['path'],
            link.url
        ]
        result = run(cmd, cwd=out_dir, timeout=timeout)
        result_json = json.loads(result.stdout)
        try:
            article_json = json.loads(result.stdout)
        except json.JSONDecodeError:
            raise ShellError(cmd, result)

        output_folder.mkdir(exist_ok=True)
        atomic_write(str(output_folder / "content.html"), result_json.pop("content"))
        atomic_write(str(output_folder / "content.txt"), txtresult_json["content"])
        atomic_write(str(output_folder / "article.json"), result_json)

        # parse out last line of stderr
        output_tail = [
            line.strip()
            for line in (result.stdout + result.stderr).decode().rsplit('\n', 20)[-20:]
            if line.strip()
        ]
        hints = (
            'Got mercury response code: {}.'.format(result.returncode),
            *output_tail,
        )
        atomic_write(str(output_folder / "content.html"), article_json.pop("content"))
        atomic_write(str(output_folder / "content.txt"), article_text["content"])
        atomic_write(str(output_folder / "article.json"), article_json)

        # Check for common failure cases
        if (result.returncode > 0):
            raise ArchiveError('Mercury parser was not able to archive the page', hints)
            raise ShellError(cmd, result)
    except (Exception, OSError) as err:
    except (ArchiveError, Exception, OSError) as err:
        status = 'failed'
        output = err
    finally:
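The net effect of the mercury.py refactor is that the inline "last 20 lines of output" hint-building is factored into the reusable ShellError() helper, which builds (and the call sites raise) an ArchiveError carrying the tail of stdout+stderr as hints. Rough usage sketch, assuming mercury-parser is on PATH (in the real extractor the binary path comes from DEPENDENCIES['MERCURY_BINARY']['path']):

from subprocess import PIPE, run

cmd = ['mercury-parser', 'https://example.com', '--format=text']
result = run(cmd, stdout=PIPE, stderr=PIPE, timeout=60)
if result.returncode > 0:
    # ArchiveError whose hints are the last 20 non-empty lines of output
    raise ShellError(cmd, result)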