1
0
Fork 0
mirror of synced 2024-06-23 08:30:29 +12:00

config fixes

This commit is contained in:
Nick Sweeting 2020-10-31 07:55:27 -04:00
parent aa71a231f6
commit ac9e0e356d
8 changed files with 57 additions and 42 deletions

View file

@ -885,32 +885,31 @@ def check_dependencies(config: ConfigDict=CONFIG, show_help: bool=True) -> None:
stderr('') stderr('')
if config['TIMEOUT'] < 5: if config['TIMEOUT'] < 5:
stderr()
stderr(f'[!] Warning: TIMEOUT is set too low! (currently set to TIMEOUT={config["TIMEOUT"]} seconds)', color='red') stderr(f'[!] Warning: TIMEOUT is set too low! (currently set to TIMEOUT={config["TIMEOUT"]} seconds)', color='red')
stderr(' You must allow *at least* 5 seconds for indexing and archive methods to run succesfully.') stderr(' You must allow *at least* 5 seconds for indexing and archive methods to run succesfully.')
stderr(' (Setting it to somewhere between 30 and 3000 seconds is recommended)') stderr(' (Setting it to somewhere between 30 and 3000 seconds is recommended)')
stderr() stderr()
stderr(' If you want to make ArchiveBox run faster, disable specific archive methods instead:') stderr(' If you want to make ArchiveBox run faster, disable specific archive methods instead:')
stderr(' https://github.com/pirate/ArchiveBox/wiki/Configuration#archive-method-toggles') stderr(' https://github.com/pirate/ArchiveBox/wiki/Configuration#archive-method-toggles')
stderr()
elif config['USE_CHROME'] and config['TIMEOUT'] < 15: elif config['USE_CHROME'] and config['TIMEOUT'] < 15:
stderr()
stderr(f'[!] Warning: TIMEOUT is set too low! (currently set to TIMEOUT={config["TIMEOUT"]} seconds)', color='red') stderr(f'[!] Warning: TIMEOUT is set too low! (currently set to TIMEOUT={config["TIMEOUT"]} seconds)', color='red')
stderr(' Chrome will fail to archive all sites if set to less than ~15 seconds.') stderr(' Chrome will fail to archive all sites if set to less than ~15 seconds.')
stderr(' (Setting it to somewhere between 30 and 300 seconds is recommended)') stderr(' (Setting it to somewhere between 30 and 300 seconds is recommended)')
stderr() stderr()
stderr(' If you want to make ArchiveBox run faster, disable specific archive methods instead:') stderr(' If you want to make ArchiveBox run faster, disable specific archive methods instead:')
stderr(' https://github.com/pirate/ArchiveBox/wiki/Configuration#archive-method-toggles') stderr(' https://github.com/pirate/ArchiveBox/wiki/Configuration#archive-method-toggles')
stderr()
if config['USE_YOUTUBEDL'] and config['MEDIA_TIMEOUT'] < 20: if config['USE_YOUTUBEDL'] and config['MEDIA_TIMEOUT'] < 20:
stderr()
stderr(f'[!] Warning: MEDIA_TIMEOUT is set too low! (currently set to MEDIA_TIMEOUT={config["MEDIA_TIMEOUT"]} seconds)', color='red') stderr(f'[!] Warning: MEDIA_TIMEOUT is set too low! (currently set to MEDIA_TIMEOUT={config["MEDIA_TIMEOUT"]} seconds)', color='red')
stderr(' Youtube-dl will fail to archive all media if set to less than ~20 seconds.') stderr(' Youtube-dl will fail to archive all media if set to less than ~20 seconds.')
stderr(' (Setting it somewhere over 60 seconds is recommended)') stderr(' (Setting it somewhere over 60 seconds is recommended)')
stderr() stderr()
stderr(' If you want to disable media archiving entirely, set SAVE_MEDIA=False instead:') stderr(' If you want to disable media archiving entirely, set SAVE_MEDIA=False instead:')
stderr(' https://github.com/pirate/ArchiveBox/wiki/Configuration#save_media') stderr(' https://github.com/pirate/ArchiveBox/wiki/Configuration#save_media')
stderr()
def check_data_folder(out_dir: Optional[str]=None, config: ConfigDict=CONFIG) -> None: def check_data_folder(out_dir: Optional[str]=None, config: ConfigDict=CONFIG) -> None:
output_dir = out_dir or config['OUTPUT_DIR'] output_dir = out_dir or config['OUTPUT_DIR']

View file

@ -11,7 +11,7 @@ from django.shortcuts import render, redirect
from django.contrib.auth import get_user_model from django.contrib.auth import get_user_model
from django import forms from django import forms
from core.models import Snapshot from core.models import Snapshot, Tag
from core.forms import AddLinkForm, TagField from core.forms import AddLinkForm, TagField
from core.utils import get_icons from core.utils import get_icons
@ -109,8 +109,9 @@ class SnapshotAdmin(admin.ModelAdmin):
def title_str(self, obj): def title_str(self, obj):
canon = obj.as_link().canonical_outputs() canon = obj.as_link().canonical_outputs()
tags = ''.join( tags = ''.join(
format_html(' <a href="/admin/core/snapshot/?tags__id__exact={}"><span class="tag">{}</span></a> ', tag.id, tag) format_html('<a href="/admin/core/snapshot/?tags__id__exact={}"><span class="tag">{}</span></a> ', tag.id, tag)
for tag in obj.tags.all() for tag in obj.tags.all()
if str(tag).strip()
) )
return format_html( return format_html(
'<a href="/{}">' '<a href="/{}">'
@ -124,7 +125,7 @@ class SnapshotAdmin(admin.ModelAdmin):
obj.archive_path, obj.archive_path,
'fetched' if obj.latest_title or obj.title else 'pending', 'fetched' if obj.latest_title or obj.title else 'pending',
urldecode(htmldecode(obj.latest_title or obj.title or ''))[:128] or 'Pending...' urldecode(htmldecode(obj.latest_title or obj.title or ''))[:128] or 'Pending...'
) + mark_safe(f'<span class="tags">{tags}</span>') ) + mark_safe(f' <span class="tags">{tags}</span>')
def files(self, obj): def files(self, obj):
return get_icons(obj) return get_icons(obj)
@ -151,6 +152,12 @@ class SnapshotAdmin(admin.ModelAdmin):
title_str.admin_order_field = 'title' title_str.admin_order_field = 'title'
url_str.admin_order_field = 'url' url_str.admin_order_field = 'url'
class TagAdmin(admin.ModelAdmin):
    """Admin options for the Tag model (registered on the admin site elsewhere)."""

    # Columns listed for each tag.
    list_display = ('slug', 'name', 'id')
    # NOTE(review): `sort_fields` is not one of Django's documented ModelAdmin
    # options (`ordering` / `sortable_by` are) -- confirm something reads it.
    sort_fields = ('id', 'name', 'slug')
    search_fields = ('id', 'name', 'slug')
    # `id` is displayed but cannot be edited.
    readonly_fields = ('id',)
    # Form layout: the read-only fields first, then the editable ones.
    fields = readonly_fields + ('name', 'slug')
class ArchiveBoxAdmin(admin.AdminSite): class ArchiveBoxAdmin(admin.AdminSite):
@ -206,4 +213,5 @@ class ArchiveBoxAdmin(admin.AdminSite):
admin.site = ArchiveBoxAdmin() admin.site = ArchiveBoxAdmin()
admin.site.register(get_user_model()) admin.site.register(get_user_model())
admin.site.register(Snapshot, SnapshotAdmin) admin.site.register(Snapshot, SnapshotAdmin)
admin.site.register(Tag, TagAdmin)
admin.site.disable_action('delete_selected') admin.site.disable_action('delete_selected')

View file

@ -82,7 +82,7 @@ class Snapshot(models.Model):
args = args or self.keys args = args or self.keys
return { return {
key: getattr(self, key) key: getattr(self, key)
if key != 'tags' else self.get_tags_str() if key != 'tags' else self.tags_str()
for key in args for key in args
} }
@ -93,12 +93,8 @@ class Snapshot(models.Model):
from ..index import load_link_details from ..index import load_link_details
return load_link_details(self.as_link()) return load_link_details(self.as_link())
def get_tags_str(self) -> str: def tags_str(self) -> str:
tags = ','.join( return ','.join(self.tags.order_by('name').values_list('name', flat=True))
tag.name
for tag in self.tags.all()
) if self.tags.all() else ''
return tags
@cached_property @cached_property
def bookmarked(self): def bookmarked(self):

View file

@ -25,6 +25,7 @@ IS_SHELL = 'shell' in sys.argv[:3] or 'shell_plus' in sys.argv[:3]
### Django Core Settings ### Django Core Settings
################################################################################ ################################################################################
DEBUG = True
WSGI_APPLICATION = 'core.wsgi.application' WSGI_APPLICATION = 'core.wsgi.application'
ROOT_URLCONF = 'core.urls' ROOT_URLCONF = 'core.urls'

View file

@ -13,26 +13,26 @@ def get_icons(snapshot: Snapshot) -> str:
# slow version: highlights icons based on whether files exist or not for that output # slow version: highlights icons based on whether files exist or not for that output
# link_tuple = lambda link, method: (link.archive_path, canon[method] or '', canon[method] and (out_dir / (canon[method] or 'notdone')).exists()) # link_tuple = lambda link, method: (link.archive_path, canon[method] or '', canon[method] and (out_dir / (canon[method] or 'notdone')).exists())
# fast version: all icons are highlighted without checking for outputs in filesystem # fast version: all icons are highlighted without checking for outputs in filesystem
link_tuple = lambda link, method: (link.archive_path, canon[method] or '', canon[method]) link_tuple = lambda link, method: (link.archive_path, canon[method] or '', canon[method] and (out_dir / (canon[method] or 'notdone')).exists())
return format_html( return format_html(
'<span class="files-icons" style="font-size: 1.2em; opacity: 0.8">' '<span class="files-icons" style="font-size: 1.2em; opacity: 0.8">'
'<a href="/{}/{}" class="exists-{}" title="Wget clone">🌐 </a> ' '<a href="/{}/{}" class="exists-{}" title="Wget clone">🌐 </a> '
'<a href="/{}/{}" class="exists-{}" title="SingleFile">&#128476; </a>'
'<a href="/{}/{}" class="exists-{}" title="PDF">📄</a> ' '<a href="/{}/{}" class="exists-{}" title="PDF">📄</a> '
'<a href="/{}/{}" class="exists-{}" title="Screenshot">🖥 </a> ' '<a href="/{}/{}" class="exists-{}" title="Screenshot">🖥 </a> '
'<a href="/{}/{}" class="exists-{}" title="HTML dump">🅷 </a> ' '<a href="/{}/{}" class="exists-{}" title="HTML dump">🅷 </a> '
'<a href="/{}/{}" class="exists-{}" title="WARC">🆆 </a> ' '<a href="/{}/{}" class="exists-{}" title="WARC">🆆 </a> '
'<a href="/{}/{}" class="exists-{}" title="SingleFile">&#128476; </a>'
'<a href="/{}/{}/" class="exists-{}" title="Media files">📼 </a> ' '<a href="/{}/{}/" class="exists-{}" title="Media files">📼 </a> '
'<a href="/{}/{}/" class="exists-{}" title="Git repos">📦 </a> ' '<a href="/{}/{}/" class="exists-{}" title="Git repos">📦 </a> '
'<a href="{}" class="exists-{}" title="Archive.org snapshot">🏛 </a> ' '<a href="{}" class="exists-{}" title="Archive.org snapshot">🏛 </a> '
'</span>', '</span>',
*link_tuple(link, 'wget_path'), *link_tuple(link, 'wget_path'),
*link_tuple(link, 'singlefile_path'),
*link_tuple(link, 'pdf_path'), *link_tuple(link, 'pdf_path'),
*link_tuple(link, 'screenshot_path'), *link_tuple(link, 'screenshot_path'),
*link_tuple(link, 'dom_path'), *link_tuple(link, 'dom_path'),
*link_tuple(link, 'warc_path')[:2], any((out_dir / canon['warc_path']).glob('*.warc.gz')), *link_tuple(link, 'warc_path')[:2], any((out_dir / canon['warc_path']).glob('*.warc.gz')),
*link_tuple(link, 'singlefile_path'),
*link_tuple(link, 'media_path')[:2], any((out_dir / canon['media_path']).glob('*')), *link_tuple(link, 'media_path')[:2], any((out_dir / canon['media_path']).glob('*')),
*link_tuple(link, 'git_path')[:2], any((out_dir / canon['git_path']).glob('*')), *link_tuple(link, 'git_path')[:2], any((out_dir / canon['git_path']).glob('*')),
canon['archive_org_path'], (out_dir / 'archive.org.txt').exists(), canon['archive_org_path'], (out_dir / 'archive.org.txt').exists(),

View file

@ -59,7 +59,7 @@ def save_archive_dot_org(link: Link, out_dir: Optional[Path]=None, timeout: int=
result = run(cmd, cwd=str(out_dir), timeout=timeout) result = run(cmd, cwd=str(out_dir), timeout=timeout)
content_location, errors = parse_archive_dot_org_response(result.stdout) content_location, errors = parse_archive_dot_org_response(result.stdout)
if content_location: if content_location:
archive_org_url = 'https://web.archive.org{}'.format(content_location[0]) archive_org_url = content_location[0]
elif len(errors) == 1 and 'RobotAccessControlException' in errors[0]: elif len(errors) == 1 and 'RobotAccessControlException' in errors[0]:
archive_org_url = None archive_org_url = None
# raise ArchiveError('Archive.org denied by {}/robots.txt'.format(domain(link.url))) # raise ArchiveError('Archive.org denied by {}/robots.txt'.format(domain(link.url)))

View file

@ -50,11 +50,9 @@ def save_headers(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT)
link.url, link.url,
] ]
try: try:
json_headers = get_headers(link.url) json_headers = get_headers(link.url, timeout=timeout)
output_folder.mkdir(exist_ok=True) output_folder.mkdir(exist_ok=True)
atomic_write(str(output_folder / "headers.json"), json_headers) atomic_write(str(output_folder / "headers.json"), json_headers)
except (Exception, OSError) as err: except (Exception, OSError) as err:
status = 'failed' status = 'failed'
output = err output = err

View file

@ -2,7 +2,8 @@ __package__ = 'archivebox.extractors'
from pathlib import Path from pathlib import Path
from typing import Optional from subprocess import CompletedProcess
from typing import Optional, Tuple, List
import json import json
from ..index.schema import Link, ArchiveResult, ArchiveError from ..index.schema import Link, ArchiveResult, ArchiveError
@ -20,6 +21,21 @@ from ..config import (
) )
from ..logging_util import TimedProgress from ..logging_util import TimedProgress
@enforce_types
def ShellError(cmd: List[str], result: CompletedProcess, lines: int=20) -> ArchiveError:
    """Build (not raise) an ArchiveError describing a failed shell command.

    The first message names the executable (``cmd[0]``) and its return code;
    the remaining messages are the last ``lines`` non-blank lines of the
    command's combined stdout + stderr, stripped of surrounding whitespace.

    Args:
        cmd: the argv list that was executed; only ``cmd[0]`` is reported.
        result: the completed process whose ``returncode``, ``stdout`` and
            ``stderr`` are summarized.
        lines: maximum number of trailing output lines to include;
            a value <= 0 includes no output lines.

    Returns:
        An ArchiveError instance for the caller to ``raise``.
    """
    def _as_text(stream) -> str:
        # Streams may be None (not captured), bytes, or already-decoded str
        # depending on how the process was run; never let decoding problems
        # mask the error that is being reported.
        if stream is None:
            return ''
        if isinstance(stream, bytes):
            return stream.decode(errors='replace')
        return str(stream)

    combined_output = _as_text(result.stdout) + _as_text(result.stderr)
    tail = combined_output.splitlines()[-lines:] if lines > 0 else []

    return ArchiveError(
        f'Got {cmd[0]} response code: {result.returncode}.',
        *(
            line.strip()
            for line in tail
            if line.strip()
        ),
    )
@enforce_types @enforce_types
def should_save_mercury(link: Link, out_dir: Optional[str]=None) -> bool: def should_save_mercury(link: Link, out_dir: Optional[str]=None) -> bool:
out_dir = out_dir or link.link_dir out_dir = out_dir or link.link_dir
@ -31,7 +47,7 @@ def should_save_mercury(link: Link, out_dir: Optional[str]=None) -> bool:
@enforce_types @enforce_types
def save_mercury(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult: def save_mercury(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
"""download reader friendly version using @postlight/mercury-parser""" """download reader friendly version using @postlight/mercury-parser"""
out_dir = Path(out_dir or link.link_dir) out_dir = Path(out_dir or link.link_dir)
@ -41,41 +57,38 @@ def save_mercury(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT)
status = 'succeeded' status = 'succeeded'
timer = TimedProgress(timeout, prefix=' ') timer = TimedProgress(timeout, prefix=' ')
try: try:
# Get plain text version of article
cmd = [ cmd = [
DEPENDENCIES['MERCURY_BINARY']['path'], DEPENDENCIES['MERCURY_BINARY']['path'],
link.url, link.url,
"--format=text" "--format=text"
] ]
result = run(cmd, cwd=out_dir, timeout=timeout) result = run(cmd, cwd=out_dir, timeout=timeout)
txtresult_json = json.loads(result.stdout) try:
article_text = json.loads(result.stdout)
except json.JSONDecodeError:
raise ShellError(cmd, result)
# Get HTML version of article
cmd = [ cmd = [
DEPENDENCIES['MERCURY_BINARY']['path'], DEPENDENCIES['MERCURY_BINARY']['path'],
link.url link.url
] ]
result = run(cmd, cwd=out_dir, timeout=timeout) result = run(cmd, cwd=out_dir, timeout=timeout)
result_json = json.loads(result.stdout) try:
article_json = json.loads(result.stdout)
except json.JSONDecodeError:
raise ShellError(cmd, result)
output_folder.mkdir(exist_ok=True) output_folder.mkdir(exist_ok=True)
atomic_write(str(output_folder / "content.html"), result_json.pop("content")) atomic_write(str(output_folder / "content.html"), article_json.pop("content"))
atomic_write(str(output_folder / "content.txt"), txtresult_json["content"]) atomic_write(str(output_folder / "content.txt"), article_text["content"])
atomic_write(str(output_folder / "article.json"), result_json) atomic_write(str(output_folder / "article.json"), article_json)
# parse out last line of stderr
output_tail = [
line.strip()
for line in (result.stdout + result.stderr).decode().rsplit('\n', 20)[-20:]
if line.strip()
]
hints = (
'Got mercury response code: {}.'.format(result.returncode),
*output_tail,
)
# Check for common failure cases # Check for common failure cases
if (result.returncode > 0): if (result.returncode > 0):
raise ArchiveError('Mercury parser was not able to archive the page', hints) raise ShellError(cmd, result)
except (Exception, OSError) as err: except (ArchiveError, Exception, OSError) as err:
status = 'failed' status = 'failed'
output = err output = err
finally: finally: