Merge e59406541d
into e5aba0dc2e
This commit is contained in:
commit
0c0ea7e4f5
4
.gitignore
vendored
4
.gitignore
vendored
|
@ -26,11 +26,9 @@ dist/
|
|||
|
||||
# Data folders
|
||||
data/
|
||||
data1/
|
||||
data2/
|
||||
data3/
|
||||
data*/
|
||||
output/
|
||||
|
||||
# vim
|
||||
*.sw?
|
||||
.vscode/
|
||||
|
|
|
@ -15,8 +15,8 @@
|
|||
# Read more about [developing Archivebox](https://github.com/ArchiveBox/ArchiveBox#archivebox-development).
|
||||
|
||||
|
||||
# Use Debian 12 w/ faster package updates: https://packages.debian.org/bookworm-backports/
|
||||
FROM python:3.11-slim-bookworm
|
||||
# Uses Debian 12 w/ faster-updating apt-lists added below: https://packages.debian.org/bookworm-backports/
|
||||
|
||||
LABEL name="archivebox" \
|
||||
maintainer="Nick Sweeting <dockerfile@archivebox.io>" \
|
||||
|
@ -127,9 +127,9 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$T
|
|||
# 1. packaging dependencies
|
||||
apt-transport-https ca-certificates apt-utils gnupg2 curl wget \
|
||||
# 2. docker and init system dependencies
|
||||
zlib1g-dev dumb-init gosu cron unzip grep \
|
||||
zlib1g-dev dumb-init gosu cron unzip grep ncat \
|
||||
# 3. frivolous CLI helpers to make debugging failed archiving easier
|
||||
# nano iputils-ping dnsutils htop procps jq yq
|
||||
# nano iputils-ping dnsutils htop procps jq yq \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
######### Language Environments ####################################
|
||||
|
|
|
@ -38,7 +38,7 @@ from hashlib import md5
|
|||
from pathlib import Path
|
||||
from datetime import datetime, timezone
|
||||
from typing import Optional, Type, Tuple, Dict, Union, List
|
||||
from subprocess import run, PIPE, DEVNULL
|
||||
from subprocess import run, PIPE, STDOUT, DEVNULL
|
||||
from configparser import ConfigParser
|
||||
from collections import defaultdict
|
||||
import importlib.metadata
|
||||
|
@ -854,7 +854,7 @@ def hint(text: Union[Tuple[str, ...], List[str], str], prefix=' ', config: Op
|
|||
|
||||
|
||||
# Dependency Metadata Helpers
|
||||
def bin_version(binary: Optional[str]) -> Optional[str]:
|
||||
def bin_version(binary: Optional[str], cmd=None) -> Optional[str]:
|
||||
"""check the presence and return valid version line of a specified binary"""
|
||||
|
||||
abspath = bin_path(binary)
|
||||
|
@ -863,11 +863,21 @@ def bin_version(binary: Optional[str]) -> Optional[str]:
|
|||
|
||||
try:
|
||||
bin_env = os.environ | {'LANG': 'C'}
|
||||
version_str = run([abspath, "--version"], stdout=PIPE, env=bin_env).stdout.strip().decode()
|
||||
is_cmd_str = cmd and isinstance(cmd, str)
|
||||
version_str = run(cmd or [abspath, "--version"], shell=is_cmd_str, stdout=PIPE, stderr=STDOUT, env=bin_env).stdout.strip().decode()
|
||||
if not version_str:
|
||||
version_str = run([abspath, "--version"], stdout=PIPE).stdout.strip().decode()
|
||||
# take first 3 columns of first line of version info
|
||||
return ' '.join(version_str.split('\n')[0].strip().split()[:3])
|
||||
version_str = run(cmd or [abspath, "--version"], shell=is_cmd_str, stdout=PIPE, stderr=STDOUT).stdout.strip().decode()
|
||||
|
||||
version_ptn = re.compile(r"\d+?\.\d+?\.?\d*", re.MULTILINE)
|
||||
try:
|
||||
version_nums = version_ptn.findall(version_str.split('\n')[0])[0]
|
||||
if version_nums:
|
||||
return version_nums
|
||||
else:
|
||||
raise IndexError
|
||||
except IndexError:
|
||||
# take first 3 columns of first line of version info
|
||||
return ' '.join(version_str.split('\n')[0].strip().split()[:3])
|
||||
except OSError:
|
||||
pass
|
||||
# stderr(f'[X] Unable to find working version of dependency: {binary}', color='red')
|
||||
|
|
|
@ -9,6 +9,8 @@ SimpleConfigValueDict = Dict[str, SimpleConfigValue]
|
|||
SimpleConfigValueGetter = Callable[[], SimpleConfigValue]
|
||||
ConfigValue = Union[SimpleConfigValue, SimpleConfigValueDict, SimpleConfigValueGetter]
|
||||
|
||||
SHArgs = List[str] # shell command args list e.g. ["--something=1", "--someotherarg"]
|
||||
|
||||
|
||||
class BaseConfig(TypedDict):
|
||||
pass
|
||||
|
@ -16,10 +18,10 @@ class BaseConfig(TypedDict):
|
|||
class ConfigDict(BaseConfig, total=False):
|
||||
"""
|
||||
# Regenerate by pasting this quine into `archivebox shell` 🥚
|
||||
from archivebox.config import ConfigDict, CONFIG_DEFAULTS
|
||||
from archivebox.config import ConfigDict, CONFIG_SCHEMA
|
||||
print('class ConfigDict(BaseConfig, total=False):')
|
||||
print(' ' + '"'*3 + ConfigDict.__doc__ + '"'*3)
|
||||
for section, configs in CONFIG_DEFAULTS.items():
|
||||
for section, configs in CONFIG_SCHEMA.items():
|
||||
for key, attrs in configs.items():
|
||||
Type, default = attrs['type'], attrs['default']
|
||||
if default is None:
|
||||
|
@ -32,16 +34,23 @@ class ConfigDict(BaseConfig, total=False):
|
|||
USE_COLOR: bool
|
||||
SHOW_PROGRESS: bool
|
||||
IN_DOCKER: bool
|
||||
IN_QEMU: bool
|
||||
PUID: int
|
||||
PGID: int
|
||||
|
||||
PACKAGE_DIR: Path
|
||||
OUTPUT_DIR: Path
|
||||
CONFIG_FILE: Path
|
||||
OUTPUT_DIR: Optional[str]
|
||||
CONFIG_FILE: Optional[str]
|
||||
ONLY_NEW: bool
|
||||
TIMEOUT: int
|
||||
MEDIA_TIMEOUT: int
|
||||
OUTPUT_PERMISSIONS: str
|
||||
RESTRICT_FILE_NAMES: str
|
||||
URL_DENYLIST: str
|
||||
URL_ALLOWLIST: Optional[str]
|
||||
ADMIN_USERNAME: Optional[str]
|
||||
ADMIN_PASSWORD: Optional[str]
|
||||
ENFORCE_ATOMIC_WRITES: bool
|
||||
TAG_SEPARATOR_PATTERN: str
|
||||
|
||||
SECRET_KEY: Optional[str]
|
||||
BIND_ADDR: str
|
||||
|
@ -49,7 +58,27 @@ class ConfigDict(BaseConfig, total=False):
|
|||
DEBUG: bool
|
||||
PUBLIC_INDEX: bool
|
||||
PUBLIC_SNAPSHOTS: bool
|
||||
PUBLIC_ADD_VIEW: bool
|
||||
FOOTER_INFO: str
|
||||
SNAPSHOTS_PER_PAGE: int
|
||||
CUSTOM_TEMPLATES_DIR: Optional[str]
|
||||
TIME_ZONE: str
|
||||
TIMEZONE: str
|
||||
REVERSE_PROXY_USER_HEADER: str
|
||||
REVERSE_PROXY_WHITELIST: str
|
||||
LOGOUT_REDIRECT_URL: str
|
||||
PREVIEW_ORIGINALS: bool
|
||||
LDAP: bool
|
||||
LDAP_SERVER_URI: Optional[str]
|
||||
LDAP_BIND_DN: Optional[str]
|
||||
LDAP_BIND_PASSWORD: Optional[str]
|
||||
LDAP_USER_BASE: Optional[str]
|
||||
LDAP_USER_FILTER: Optional[str]
|
||||
LDAP_USERNAME_ATTR: Optional[str]
|
||||
LDAP_FIRSTNAME_ATTR: Optional[str]
|
||||
LDAP_LASTNAME_ATTR: Optional[str]
|
||||
LDAP_EMAIL_ATTR: Optional[str]
|
||||
LDAP_CREATE_SUPERUSER: bool
|
||||
|
||||
SAVE_TITLE: bool
|
||||
SAVE_FAVICON: bool
|
||||
|
@ -58,25 +87,50 @@ class ConfigDict(BaseConfig, total=False):
|
|||
SAVE_SINGLEFILE: bool
|
||||
SAVE_READABILITY: bool
|
||||
SAVE_MERCURY: bool
|
||||
SAVE_HTMLTOTEXT: bool
|
||||
SAVE_PDF: bool
|
||||
SAVE_SCREENSHOT: bool
|
||||
SAVE_DOM: bool
|
||||
SAVE_HEADERS: bool
|
||||
SAVE_WARC: bool
|
||||
SAVE_GIT: bool
|
||||
SAVE_MEDIA: bool
|
||||
SAVE_ARCHIVE_DOT_ORG: bool
|
||||
SAVE_ALLOWLIST: dict
|
||||
SAVE_DENYLIST: dict
|
||||
|
||||
RESOLUTION: str
|
||||
GIT_DOMAINS: str
|
||||
CHECK_SSL_VALIDITY: bool
|
||||
MEDIA_MAX_SIZE: str
|
||||
CURL_USER_AGENT: str
|
||||
WGET_USER_AGENT: str
|
||||
CHROME_USER_AGENT: str
|
||||
COOKIES_FILE: Union[str, Path, None]
|
||||
CHROME_USER_DATA_DIR: Union[str, Path, None]
|
||||
COOKIES_FILE: Optional[str]
|
||||
CHROME_USER_DATA_DIR: Optional[str]
|
||||
CHROME_TIMEOUT: int
|
||||
CHROME_HEADLESS: bool
|
||||
CHROME_SANDBOX: bool
|
||||
YOUTUBEDL_ARGS: list
|
||||
WGET_ARGS: list
|
||||
CURL_ARGS: list
|
||||
GIT_ARGS: list
|
||||
SINGLEFILE_ARGS: Optional[list]
|
||||
FAVICON_PROVIDER: str
|
||||
|
||||
USE_INDEXING_BACKEND: bool
|
||||
USE_SEARCHING_BACKEND: bool
|
||||
SEARCH_BACKEND_ENGINE: str
|
||||
SEARCH_BACKEND_HOST_NAME: str
|
||||
SEARCH_BACKEND_PORT: int
|
||||
SEARCH_BACKEND_PASSWORD: str
|
||||
SEARCH_PROCESS_HTML: bool
|
||||
SONIC_COLLECTION: str
|
||||
SONIC_BUCKET: str
|
||||
SEARCH_BACKEND_TIMEOUT: int
|
||||
FTS_SEPARATE_DATABASE: bool
|
||||
FTS_TOKENIZERS: str
|
||||
FTS_SQLITE_MAX_LENGTH: int
|
||||
|
||||
USE_CURL: bool
|
||||
USE_WGET: bool
|
||||
|
@ -85,7 +139,9 @@ class ConfigDict(BaseConfig, total=False):
|
|||
USE_MERCURY: bool
|
||||
USE_GIT: bool
|
||||
USE_CHROME: bool
|
||||
USE_NODE: bool
|
||||
USE_YOUTUBEDL: bool
|
||||
USE_RIPGREP: bool
|
||||
CURL_BINARY: str
|
||||
GIT_BINARY: str
|
||||
WGET_BINARY: str
|
||||
|
@ -93,13 +149,12 @@ class ConfigDict(BaseConfig, total=False):
|
|||
READABILITY_BINARY: str
|
||||
MERCURY_BINARY: str
|
||||
YOUTUBEDL_BINARY: str
|
||||
NODE_BINARY: str
|
||||
RIPGREP_BINARY: str
|
||||
CHROME_BINARY: Optional[str]
|
||||
|
||||
YOUTUBEDL_ARGS: List[str]
|
||||
WGET_ARGS: List[str]
|
||||
CURL_ARGS: List[str]
|
||||
GIT_ARGS: List[str]
|
||||
TAG_SEPARATOR_PATTERN: str
|
||||
POCKET_CONSUMER_KEY: Optional[str]
|
||||
POCKET_ACCESS_TOKENS: dict
|
||||
READWISE_READER_TOKENS: dict
|
||||
|
||||
|
||||
ConfigDefaultValueGetter = Callable[[ConfigDict], ConfigValue]
|
||||
|
|
|
@ -1,2 +1,3 @@
|
|||
__package__ = 'archivebox.core'
|
||||
|
||||
# default_app_config = 'core.apps.CoreAppConfig'
|
||||
|
|
|
@ -12,6 +12,7 @@ from django.utils.html import format_html
|
|||
from django.utils.safestring import mark_safe
|
||||
from django.shortcuts import render, redirect
|
||||
from django.contrib.auth import get_user_model
|
||||
from django.contrib.auth.models import Group, Permission
|
||||
from django import forms
|
||||
|
||||
from ..util import htmldecode, urldecode, ansi_to_html
|
||||
|
@ -159,6 +160,7 @@ class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin):
|
|||
|
||||
action_form = SnapshotActionForm
|
||||
|
||||
|
||||
def changelist_view(self, request, extra_context=None):
|
||||
extra_context = extra_context or {}
|
||||
return super().changelist_view(request, extra_context | GLOBAL_CONTEXT)
|
||||
|
|
|
@ -1,9 +1,16 @@
|
|||
from django.apps import AppConfig
|
||||
|
||||
|
||||
class CoreConfig(AppConfig):
|
||||
class CoreAppConfig(AppConfig):
|
||||
name = 'core'
|
||||
|
||||
# label = 'Archive Data'
|
||||
verbose_name = "Archive Data"
|
||||
|
||||
# WIP: broken by Django 3.1.2 -> 4.0 migration
|
||||
# default_auto_field = 'django.db.models.UUIDField'
|
||||
|
||||
|
||||
def ready(self):
|
||||
from .auth import register_signals
|
||||
|
||||
|
|
|
@ -1,5 +1,7 @@
|
|||
import os
|
||||
from django.conf import settings
|
||||
__package__ = 'archivebox.core'
|
||||
|
||||
|
||||
|
||||
from ..config import (
|
||||
LDAP
|
||||
)
|
||||
|
|
|
@ -50,7 +50,7 @@ class Tag(models.Model):
|
|||
|
||||
class Meta:
|
||||
verbose_name = "Tag"
|
||||
verbose_name_plural = "Tags"
|
||||
verbose_name_plural = "🏷️ Tags"
|
||||
|
||||
def __str__(self):
|
||||
return self.name
|
||||
|
@ -98,6 +98,10 @@ class Snapshot(models.Model):
|
|||
|
||||
keys = ('url', 'timestamp', 'title', 'tags', 'updated')
|
||||
|
||||
class Meta:
|
||||
verbose_name = "Snapshot"
|
||||
verbose_name_plural = "⭐️ Archived Webpages (Snapshots)"
|
||||
|
||||
def __repr__(self) -> str:
|
||||
title = self.title or '-'
|
||||
return f'[{self.timestamp}] {self.url[:64]} ({title[:64]})'
|
||||
|
@ -282,5 +286,9 @@ class ArchiveResult(models.Model):
|
|||
|
||||
objects = ArchiveResultManager()
|
||||
|
||||
class Meta:
|
||||
verbose_name = "ArchiveResult"
|
||||
verbose_name_plural = "📑 Logs (ArchiveResults)"
|
||||
|
||||
def __str__(self):
|
||||
return self.extractor
|
||||
|
|
|
@ -1,5 +1,10 @@
|
|||
__package__ = 'archivebox.core'
|
||||
|
||||
# TODO: add this after we upgrade to Django >=3.2
|
||||
# https://github.com/typeddjango/django-stubs
|
||||
# import django_stubs_ext
|
||||
# django_stubs_ext.monkeypatch()
|
||||
|
||||
import os
|
||||
import sys
|
||||
import re
|
||||
|
@ -59,13 +64,88 @@ INSTALLED_APPS = [
|
|||
'django.contrib.messages',
|
||||
'django.contrib.staticfiles',
|
||||
'django.contrib.admin',
|
||||
'solo',
|
||||
|
||||
|
||||
'core',
|
||||
'api',
|
||||
|
||||
# Plugins
|
||||
|
||||
'plugins.defaults',
|
||||
'plugins.system',
|
||||
# 'plugins.replaywebpage', # provides UI to view WARC files
|
||||
# 'plugins.gallerydl', # provides gallerydl dependency + extractor
|
||||
# 'plugins.browsertrix', # provides browsertrix dependency + extractor
|
||||
# 'plugins.playwright', # provides playwright dependency
|
||||
# ...
|
||||
# someday we may have enough plugins to justify dynamic loading:
|
||||
# *(path.parent.name for path in (Path(PACKAGE_DIR) / 'plugins').glob('*/apps.py')),
|
||||
|
||||
'django_extensions',
|
||||
]
|
||||
|
||||
################################################################################
|
||||
### Staticfile and Template Settings
|
||||
################################################################################
|
||||
|
||||
STATIC_URL = '/static/'
|
||||
|
||||
STATIC_ROOT = Path(PACKAGE_DIR) / 'collected_static'
|
||||
|
||||
STATICFILES_DIRS = [
|
||||
*([str(CUSTOM_TEMPLATES_DIR / 'static')] if CUSTOM_TEMPLATES_DIR else []),
|
||||
str(Path(PACKAGE_DIR) / TEMPLATES_DIR_NAME / 'static'),
|
||||
|
||||
# Plugins
|
||||
# str(Path(PACKAGE_DIR) / 'plugins/defaults/static'),
|
||||
# str(Path(PACKAGE_DIR) / 'plugins/replaywebpage/static'),
|
||||
# str(Path(PACKAGE_DIR) / 'plugins/gallerydl/static'),
|
||||
# str(Path(PACKAGE_DIR) / 'plugins/browsertrix/static'),
|
||||
# str(Path(PACKAGE_DIR) / 'plugins/playwright/static'),
|
||||
# ...
|
||||
# someday if there are many more plugins / user-addable plugins:
|
||||
# *(str(path) for path in (Path(PACKAGE_DIR) / 'plugins').glob('*/static')),
|
||||
]
|
||||
|
||||
MEDIA_URL = '/archive/'
|
||||
MEDIA_ROOT = OUTPUT_DIR / 'archive'
|
||||
|
||||
|
||||
TEMPLATE_DIRS = [
|
||||
*([str(CUSTOM_TEMPLATES_DIR)] if CUSTOM_TEMPLATES_DIR else []),
|
||||
str(Path(PACKAGE_DIR) / TEMPLATES_DIR_NAME / 'core'),
|
||||
str(Path(PACKAGE_DIR) / TEMPLATES_DIR_NAME / 'admin'),
|
||||
str(Path(PACKAGE_DIR) / TEMPLATES_DIR_NAME),
|
||||
|
||||
# Plugins
|
||||
# added by plugins.<PluginName>.apps.<AppName>.ready -> .settings.register_plugin_settings
|
||||
# str(Path(PACKAGE_DIR) / 'plugins/defaults/templates'),
|
||||
# str(Path(PACKAGE_DIR) / 'plugins/replaywebpage/templates'),
|
||||
# str(Path(PACKAGE_DIR) / 'plugins/gallerydl/templates'),
|
||||
# str(Path(PACKAGE_DIR) / 'plugins/browsertrix/templates'),
|
||||
# str(Path(PACKAGE_DIR) / 'plugins/playwright/templates'),
|
||||
# ...
|
||||
#
|
||||
# someday if there are many more plugins / user-addable plugins:
|
||||
# *(str(path) for path in (Path(PACKAGE_DIR) / 'plugins').glob('*/templates')),
|
||||
]
|
||||
|
||||
TEMPLATES = [
|
||||
{
|
||||
'BACKEND': 'django.template.backends.django.DjangoTemplates',
|
||||
'DIRS': TEMPLATE_DIRS,
|
||||
'APP_DIRS': True,
|
||||
'OPTIONS': {
|
||||
'context_processors': [
|
||||
'django.template.context_processors.debug',
|
||||
'django.template.context_processors.request',
|
||||
'django.contrib.auth.context_processors.auth',
|
||||
'django.contrib.messages.context_processors.messages',
|
||||
],
|
||||
},
|
||||
},
|
||||
]
|
||||
|
||||
# For usage with https://www.jetadmin.io/integrations/django
|
||||
# INSTALLED_APPS += ['jet_django']
|
||||
|
@ -163,7 +243,7 @@ if DEBUG_TOOLBAR:
|
|||
'debug_toolbar.panels.request.RequestPanel',
|
||||
'debug_toolbar.panels.sql.SQLPanel',
|
||||
'debug_toolbar.panels.staticfiles.StaticFilesPanel',
|
||||
# 'debug_toolbar.panels.templates.TemplatesPanel',
|
||||
# 'debug_toolbar.panels.templates.TemplatesPanel', # buggy/slow
|
||||
'debug_toolbar.panels.cache.CachePanel',
|
||||
'debug_toolbar.panels.signals.SignalsPanel',
|
||||
'debug_toolbar.panels.logging.LoggingPanel',
|
||||
|
@ -173,39 +253,6 @@ if DEBUG_TOOLBAR:
|
|||
]
|
||||
MIDDLEWARE = [*MIDDLEWARE, 'debug_toolbar.middleware.DebugToolbarMiddleware']
|
||||
|
||||
################################################################################
|
||||
### Staticfile and Template Settings
|
||||
################################################################################
|
||||
|
||||
STATIC_URL = '/static/'
|
||||
|
||||
STATICFILES_DIRS = [
|
||||
*([str(CUSTOM_TEMPLATES_DIR / 'static')] if CUSTOM_TEMPLATES_DIR else []),
|
||||
str(Path(PACKAGE_DIR) / TEMPLATES_DIR_NAME / 'static'),
|
||||
]
|
||||
|
||||
TEMPLATE_DIRS = [
|
||||
*([str(CUSTOM_TEMPLATES_DIR)] if CUSTOM_TEMPLATES_DIR else []),
|
||||
str(Path(PACKAGE_DIR) / TEMPLATES_DIR_NAME / 'core'),
|
||||
str(Path(PACKAGE_DIR) / TEMPLATES_DIR_NAME / 'admin'),
|
||||
str(Path(PACKAGE_DIR) / TEMPLATES_DIR_NAME),
|
||||
]
|
||||
|
||||
TEMPLATES = [
|
||||
{
|
||||
'BACKEND': 'django.template.backends.django.DjangoTemplates',
|
||||
'DIRS': TEMPLATE_DIRS,
|
||||
'APP_DIRS': True,
|
||||
'OPTIONS': {
|
||||
'context_processors': [
|
||||
'django.template.context_processors.debug',
|
||||
'django.template.context_processors.request',
|
||||
'django.contrib.auth.context_processors.auth',
|
||||
'django.contrib.messages.context_processors.messages',
|
||||
],
|
||||
},
|
||||
},
|
||||
]
|
||||
|
||||
|
||||
################################################################################
|
||||
|
@ -312,21 +359,21 @@ IGNORABLE_404_URLS = [
|
|||
]
|
||||
|
||||
class NoisyRequestsFilter(logging.Filter):
|
||||
def filter(self, record):
|
||||
def filter(self, record) -> bool:
|
||||
logline = record.getMessage()
|
||||
|
||||
# ignore harmless 404s for the patterns in IGNORABLE_404_URLS
|
||||
for ignorable_url_pattern in IGNORABLE_404_URLS:
|
||||
ignorable_log_pattern = re.compile(f'^"GET /.*/?{ignorable_url_pattern.pattern[:-1]} HTTP/.*" (200|30.|404) .+$', re.I | re.M)
|
||||
if ignorable_log_pattern.match(logline):
|
||||
return 0
|
||||
return False
|
||||
|
||||
# ignore staticfile requests that 200 or 30*
|
||||
ignoreable_200_log_pattern = re.compile(r'"GET /static/.* HTTP/.*" (200|30.) .+', re.I | re.M)
|
||||
if ignoreable_200_log_pattern.match(logline):
|
||||
return 0
|
||||
return False
|
||||
|
||||
return 1
|
||||
return True
|
||||
|
||||
if LOGS_DIR.exists():
|
||||
ERROR_LOG = (LOGS_DIR / 'errors.log')
|
||||
|
|
|
@ -32,6 +32,10 @@ urlpatterns = [
|
|||
|
||||
path('archive/', RedirectView.as_view(url='/')),
|
||||
path('archive/<path:path>', SnapshotView.as_view(), name='Snapshot'),
|
||||
path('web/<path:path>', SnapshotView.as_view()), # support archive.org-style URLs
|
||||
|
||||
path('plugins/replaywebpage/', include('plugins.replaywebpage.urls')),
|
||||
# ... dynamic load these someday if there are more of them
|
||||
|
||||
path('admin/core/snapshot/add/', RedirectView.as_view(url='/add/')),
|
||||
path('add/', AddView.as_view(), name='add'),
|
||||
|
|
|
@ -56,12 +56,18 @@ class SnapshotView(View):
|
|||
slug, archivefile = path.split('/', 1)[0], 'index.html'
|
||||
|
||||
# slug is a timestamp
|
||||
if slug.replace('.','').isdigit():
|
||||
if slug.replace('.', '').isdigit():
|
||||
|
||||
# missing trailing slash -> redirect to index
|
||||
if '/' not in path:
|
||||
return redirect(f'{path}/index.html')
|
||||
|
||||
# TODO: add support for archive.org-style URLs where timestamp may be a human-readable date
|
||||
# https://web.archivebox.io / web / 2022-01 / https://example.com
|
||||
# https://web.archivebox.io / web / 20220505103616 / https://example.com
|
||||
# https://web.archivebox.io / web / 2022-05-05__0:36:16 / https://example.com
|
||||
# use archivebox.util.parse_date (supports unix timestamps, ISO date strings, and more)
|
||||
|
||||
try:
|
||||
try:
|
||||
snapshot = Snapshot.objects.get(Q(timestamp=slug) | Q(id__startswith=slug))
|
||||
|
|
|
@ -7,7 +7,7 @@ if __name__ == '__main__':
|
|||
# versions of ./manage.py commands whenever possible. When that's not possible
|
||||
# (e.g. makemigrations), you can comment out this check temporarily
|
||||
|
||||
if not ('makemigrations' in sys.argv or 'migrate' in sys.argv):
|
||||
if not ('makemigrations' in sys.argv or 'migrate' in sys.argv or 'collectstatic' in sys.argv):
|
||||
print("[X] Don't run ./manage.py directly (unless you are a developer running makemigrations):")
|
||||
print()
|
||||
print(' Hint: Use these archivebox CLI commands instead of the ./manage.py equivalents:')
|
||||
|
|
|
@ -1,3 +0,0 @@
|
|||
[mypy]
|
||||
plugins =
|
||||
mypy_django_plugin.main
|
3
archivebox/plugins/__init__.py
Normal file
3
archivebox/plugins/__init__.py
Normal file
|
@ -0,0 +1,3 @@
|
|||
__package__ = 'archivebox.plugins'
|
||||
|
||||
|
3
archivebox/plugins/defaults/__init__.py
Normal file
3
archivebox/plugins/defaults/__init__.py
Normal file
|
@ -0,0 +1,3 @@
|
|||
__package__ = 'archivebox.plugins.defaults'
|
||||
|
||||
default_app_config = 'plugins.defaults.apps.DefaultsPluginAppConfig'
|
20
archivebox/plugins/defaults/admin.py
Normal file
20
archivebox/plugins/defaults/admin.py
Normal file
|
@ -0,0 +1,20 @@
|
|||
from django.contrib import admin
|
||||
from solo.admin import SingletonModelAdmin
|
||||
|
||||
from .models import (
|
||||
ArchiveBoxDefaultDependency,
|
||||
ArchiveBoxDefaultExtractor,
|
||||
)
|
||||
|
||||
|
||||
class DependencyAdmin(SingletonModelAdmin):
    """Admin page for a singleton dependency configuration record."""

    # Descriptive attributes and computed status values are shown read-only.
    readonly_fields = (
        'id',
        'NAME',
        'LABEL',
        'REQUIRED',
        'bin_path',
        'bin_version',
        'is_valid',
        'is_enabled',
    )
|
||||
|
||||
class ExtractorAdmin(SingletonModelAdmin):
    """Admin page for a singleton extractor configuration record."""

    # Every field listed here is rendered read-only in the admin form.
    readonly_fields = (
        'id',
        'NAME',
        'LABEL',
        'DEFAULT_ENABLED',
        'DEFAULT_CMD',
        'CMD',
        'ARGS',
        'TIMEOUT',
        'dependency',
        'is_valid',
        'is_enabled',
    )
|
||||
|
||||
print('DefaultsPluginConfig.admin')
|
||||
|
||||
|
||||
admin.site.register(ArchiveBoxDefaultDependency, DependencyAdmin)
|
||||
admin.site.register(ArchiveBoxDefaultExtractor, ExtractorAdmin)
|
24
archivebox/plugins/defaults/apps.py
Normal file
24
archivebox/plugins/defaults/apps.py
Normal file
|
@ -0,0 +1,24 @@
|
|||
# __package__ = 'archivebox.plugins.defaults'
|
||||
|
||||
|
||||
|
||||
from django.apps import AppConfig
|
||||
|
||||
|
||||
class DefaultsPluginAppConfig(AppConfig):
    """Django app config for the ``plugins.defaults`` plugin."""

    name = "plugins.defaults"
    # label = "ArchiveBox Defaults"
    verbose_name = "Plugin Configuration Defaults"
    default_auto_field = "django.db.models.AutoField"

    def ready(self):
        """Register this plugin's settings with the Django settings object."""
        print('plugins.defaults.apps.DefaultsPluginConfig.ready')

        # Imported inside ready() rather than at module import time.
        from django.conf import settings as django_settings
        from .settings import register_plugin_settings

        register_plugin_settings(django_settings, name=self.name)
|
||||
|
41
archivebox/plugins/defaults/migrations/0001_initial.py
Normal file
41
archivebox/plugins/defaults/migrations/0001_initial.py
Normal file
|
@ -0,0 +1,41 @@
|
|||
# Generated by Django 3.1.14 on 2024-01-24 08:56
|
||||
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
initial = True
|
||||
|
||||
dependencies = [
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.CreateModel(
|
||||
name='ArchiveBoxDefaultDependency',
|
||||
fields=[
|
||||
('ENABLED', models.BooleanField(default=True, editable=False)),
|
||||
('BINARY', models.CharField(default='/bin/false', max_length=255)),
|
||||
('ARGS', models.CharField(default='', max_length=255)),
|
||||
('id', models.AutoField(default=1, primary_key=True, serialize=False)),
|
||||
],
|
||||
options={
|
||||
'verbose_name': 'Dependency Configuration Defaults',
|
||||
'abstract': False,
|
||||
},
|
||||
),
|
||||
migrations.CreateModel(
|
||||
name='ArchiveBoxDefaultExtractor',
|
||||
fields=[
|
||||
('ENABLED', models.BooleanField(default=True)),
|
||||
('CMD', models.CharField(default=['{DEPENDENCY.BINARY}', '{ARGS}', '{url}'], max_length=255)),
|
||||
('ARGS', models.CharField(default=['--timeout={TIMEOUT}'], max_length=255)),
|
||||
('TIMEOUT', models.CharField(default='{TIMEOUT}', max_length=255)),
|
||||
('id', models.AutoField(default=1, primary_key=True, serialize=False)),
|
||||
],
|
||||
options={
|
||||
'verbose_name': 'Extractor Configuration Defaults',
|
||||
'abstract': False,
|
||||
},
|
||||
),
|
||||
]
|
|
@ -0,0 +1,31 @@
|
|||
# Generated by Django 3.1.14 on 2024-01-24 09:43
|
||||
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('defaults', '0001_initial'),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AlterModelOptions(
|
||||
name='archiveboxdefaultdependency',
|
||||
options={'verbose_name': 'Default Configuration: Dependencies'},
|
||||
),
|
||||
migrations.AlterModelOptions(
|
||||
name='archiveboxdefaultextractor',
|
||||
options={'verbose_name': 'Default Configuration: Extractors'},
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='archiveboxdefaultdependency',
|
||||
name='BINARY',
|
||||
field=models.CharField(default='/bin/bash', max_length=255),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='archiveboxdefaultdependency',
|
||||
name='ENABLED',
|
||||
field=models.BooleanField(default=True),
|
||||
),
|
||||
]
|
0
archivebox/plugins/defaults/migrations/__init__.py
Normal file
0
archivebox/plugins/defaults/migrations/__init__.py
Normal file
385
archivebox/plugins/defaults/models.py
Normal file
385
archivebox/plugins/defaults/models.py
Normal file
|
@ -0,0 +1,385 @@
|
|||
__package__ = 'archivebox.plugins.defaults'
|
||||
|
||||
# import shutil
|
||||
|
||||
import re
|
||||
|
||||
from typing import List, Dict, Any
|
||||
from pathlib import Path
|
||||
|
||||
from django.db import models, transaction
|
||||
from django.utils.functional import cached_property
|
||||
|
||||
from solo.models import SingletonModel # type: ignore[import-untyped]
|
||||
|
||||
|
||||
from config import bin_path, bin_version
|
||||
|
||||
ConfigDict = Dict[str, Any]
|
||||
|
||||
|
||||
# def bin_path(binary: str) -> str | None:
|
||||
# return shutil.which(str(Path(binary).expanduser())) or shutil.which(str(binary)) or binary
|
||||
|
||||
# def bin_version(bin_path: str, cmd: str | None=None) -> str | None:
|
||||
# return '0.0.0'
|
||||
|
||||
# def pretty_path(path: Path) -> str:
|
||||
# """take a Path object and return the path as a string relative to the current directory"""
|
||||
|
||||
# if not path:
|
||||
# return ''
|
||||
|
||||
# return str(path.expanduser().resolve().relative_to(Path.cwd().resolve()))
|
||||
|
||||
|
||||
class ArchiveBoxBaseDependency(models.Model):
    """Abstract base model describing one external binary dependency.

    Class-level constants (``NAME``, ``LABEL``, ``*_DEPENDENCIES``,
    ``DEFAULT_*``, ``VERSION_CMD``) describe the dependency, while the
    ``ENABLED``, ``BINARY`` and ``ARGS`` model fields hold its stored
    configuration. The model is abstract (see ``Meta``), so concrete
    subclasses provide the actual table.
    """

    singleton_instance_id = 1

    # The primary key defaults to the fixed singleton id declared above.
    id = models.AutoField(default=singleton_instance_id, primary_key=True)

    NAME = 'DEFAULT'
    LABEL = "Default"
    REQUIRED = False

    PARENT_DEPENDENCIES: List[str] = []

    BIN_DEPENDENCIES: List[str] = []
    APT_DEPENDENCIES: List[str] = []
    BREW_DEPENDENCIES: List[str] = []
    PIP_DEPENDENCIES: List[str] = []
    NPM_DEPENDENCIES: List[str] = []

    DEFAULT_BINARY: str | None = '/bin/bash'
    DEFAULT_START_CMD: str | None = '/bin/bash -c "while true; do sleep 1; done"'
    DEFAULT_PID_FILE: str | None = 'logs/{NAME}_WORKER.pid'
    DEFAULT_STOP_CMD: str | None = 'kill "$(<{PID_FILE})"'
    DEFAULT_VERSION_COMMAND: str | None = '{BINARY} --version'
    DEFAULT_ARGS: str | None = ''

    # Template formatted with BINARY=<self.BINARY> by the bin_version property.
    VERSION_CMD = '{BINARY} --version'

    ENABLED = models.BooleanField(default=True, editable=False)
    BINARY = models.CharField(max_length=255, default=DEFAULT_BINARY)
    ARGS = models.CharField(max_length=255, default=DEFAULT_ARGS)

    # START_CMD = models.CharField(max_length=255, default=DEFAULT_START_CMD)
    # WORKERS = models.IntegerField(default=1)

    class Meta:
        abstract = True
        app_label = 'defaults'

    def __str__(self):
        return f"{self.LABEL} Dependency Configuration"

    def __json__(self):
        """Return a plain-dict snapshot of this dependency's identity and config."""
        return {
            'type': 'ArchiveBoxDependency',
            '__class__': self.__class__.__name__,
            'NAME': self.NAME,
            'LABEL': self.LABEL,
            'ENABLED': self.ENABLED,
            'BINARY': self.BINARY,
            'ARGS': self.ARGS,
            # 'START_CMD': self.START_CMD,
            # 'WORKERS': self.WORKERS,
        }

    @cached_property
    def bin_path(self) -> str:
        """Path for the configured binary, resolved by the module-level ``bin_path`` helper."""
        # Inside a method, the bare name refers to the imported helper function,
        # not to this property.
        return bin_path(self.BINARY or self.DEFAULT_BINARY)

    @cached_property
    def bin_version(self) -> str | None:
        """Version reported by the module-level ``bin_version`` helper for this binary."""
        # Format the version command once and reuse it for the log line and the call.
        version_cmd = self.VERSION_CMD.format(BINARY=self.BINARY)
        print(f'ArchiveBoxBaseDependency.bin_version({self.bin_path}, cmd={version_cmd})')
        return bin_version(self.bin_path, cmd=version_cmd)
        # return bin_version(self.bin_path, cmd=self.VERSION_CMD)

    @cached_property
    def is_valid(self) -> bool:
        """True when both a binary path and a version string were found."""
        return bool(self.bin_path and self.bin_version)

    @cached_property
    def is_enabled(self) -> bool:
        """True when the dependency is switched on and is also valid."""
        return bool(self.ENABLED and self.is_valid)

    @cached_property
    def pretty_version(self) -> str:
        """Return a one-line, colorized status summary for this dependency.

        The line contains a status symbol, the dependency name, its version,
        a status note (valid / invalid / disabled) and the binary path.
        """
        # Branch on the ENABLED flag rather than is_enabled: is_enabled already
        # implies is_valid, which made the "invalid" state unreachable and caused
        # invalid dependencies to be reported as "disabled".
        if not self.ENABLED:
            color, symbol, note, version = 'lightyellow', '-', 'disabled', '-'
        elif self.is_valid:
            color, symbol, note, version = 'green', '√', 'valid', ''

            parsed_version_num = re.search(r'[\d\.]+', self.bin_version)
            if parsed_version_num:
                version = f'v{parsed_version_num[0]}'
        else:
            color, symbol, note, version = 'red', 'X', 'invalid', '?'

        # NOTE(review): `pretty_path` and `ANSI` are not imported in the visible
        # part of this module (only a commented-out pretty_path stub exists);
        # confirm they are provided at module scope before relying on this property.
        path = pretty_path(self.bin_path)

        return ' '.join((
            ANSI[color],
            symbol,
            ANSI['reset'],
            self.NAME.ljust(21),  # was the undefined bare name `name`
            version.ljust(14),
            ANSI[color],
            note.ljust(8),
            ANSI['reset'],
            path.ljust(76),
        ))

    # @helper
    def install_parents(self, config):
        """Return a mapping of each parent dependency name to itself.

        ``config`` is accepted but not used by the current implementation.
        """
        return {
            # parent_dependency.NAME: parent_dependency.get_solo().install_self()
            parent_dependency: parent_dependency
            for parent_dependency in self.PARENT_DEPENDENCIES
        }

    # @helper
    def install_self(self, config):
        """Install this dependency's packages and return its detected version.

        Asserts that the parent step produced only truthy values, asks each
        package-manager dependency to install the matching package list, then
        asserts the dependency is valid.
        """
        assert all(self.install_parents(config=config).values())

        # NOTE(review): the *EnvironmentDependency classes below are not imported
        # in the visible part of this module; confirm where they are defined.
        BashEnvironmentDependency.get_solo().install_pkgs(self.BIN_DEPENDENCIES)
        AptEnvironmentDependency.get_solo().install_pkgs(self.APT_DEPENDENCIES)
        BrewEnvironmentDependency.get_solo().install_pkgs(self.BREW_DEPENDENCIES)
        PipEnvironmentDependency.get_solo().install_pkgs(self.PIP_DEPENDENCIES)
        NPMEnvironmentDependency.get_solo().install_pkgs(self.NPM_DEPENDENCIES)

        assert self.is_valid
        return self.bin_version

    # @task
    def run(self, args, pwd, timeout):
        """Run this dependency's binary with ``args`` and report the outcome.

        Returns a ``(proc, timer, errors)`` tuple. ``proc`` is ``None`` when the
        call raised, and ``errors`` is the caught exception or ``None``.
        """
        # `self` was missing from the signature even though the body reads
        # self.bin_path.
        errors = None
        proc = None  # stays None if the call below raises, so the return cannot fail
        # NOTE(review): `TimedProgress` and the module-level `run` callable are not
        # imported in the visible part of this module; confirm they exist and that
        # `run` accepts the cmd= / pwd= / timeout= keywords.
        timer = TimedProgress(timeout, prefix=' ')
        try:
            proc = run(cmd=[self.bin_path, *args], pwd=pwd, timeout=timeout)
        except Exception as err:
            # Best-effort: the error is handed back to the caller, not raised.
            errors = err
        finally:
            timer.end()

        return proc, timer, errors
|
||||
|
||||
class ArchiveBoxDefaultDependency(ArchiveBoxBaseDependency, SingletonModel):
    """Concrete singleton model holding the default dependency configuration."""

    singleton_instance_id = 1

    # Primary key defaults to the fixed singleton id declared above.
    id = models.AutoField(primary_key=True, default=singleton_instance_id)

    # Unlike the base class field, ENABLED is editable here.
    ENABLED = models.BooleanField(editable=True, default=True)

    class Meta:  # pyright: ignore [reportIncompatibleVariableOverride]
        app_label = 'defaults'
        abstract = False
        verbose_name = 'Default Configuration: Dependencies'
|
||||
|
||||
|
||||
class ArchiveBoxBaseExtractor(models.Model):
    """Abstract base model describing how one extractor is configured and run.

    Subclasses set ``NAME``, ``LABEL`` and ``DEPENDENCY`` (a dependency model
    *class*, resolved lazily through ``dependency``) and may override the
    ``DEFAULT_*`` templates below.
    """

    singleton_instance_id = 1

    id = models.AutoField(default=singleton_instance_id, primary_key=True)

    NAME = 'DEFAULT'
    LABEL = 'Default'

    DEFAULT_DEPENDENCY = ArchiveBoxDefaultDependency
    DEPENDENCY = DEFAULT_DEPENDENCY

    DEFAULT_ENABLED = True
    DEFAULT_CMD = ['{DEPENDENCY.BINARY}', '{ARGS}', '{url}']
    DEFAULT_ARGS = ['--timeout={TIMEOUT}']
    DEFAULT_TIMEOUT = '{TIMEOUT}'
    # DEFAULT_USER_AGENT = '{USER_AGENT}'
    # DEFAULT_COOKIES_TXT = '{COOKIES_TXT}'

    ENABLED = models.BooleanField(default=DEFAULT_ENABLED, editable=True)

    # NOTE(review): CMD and ARGS are CharFields with list defaults; a freshly
    # created instance holds the list, but the value read back from the
    # database is presumably a string - confirm the intended storage format.
    CMD = models.CharField(max_length=255, default=DEFAULT_CMD)
    ARGS = models.CharField(max_length=255, default=DEFAULT_ARGS)
    TIMEOUT = models.CharField(max_length=255, default=DEFAULT_TIMEOUT)

    ALIASES = {
        'ENABLED': (f'SAVE_{NAME}', f'USE_{NAME}', f'FETCH_{NAME}'),
    }

    def __str__(self):
        return f"{self.LABEL} Extractor Configuration"

    class Meta:  # pyright: ignore [reportIncompatibleVariableOverride]
        abstract = True
        verbose_name = "Default Extractor Configuration"
        app_label = 'defaults'

    @cached_property
    def dependency(self):
        """Return the singleton row of this extractor's dependency model."""
        return self.DEPENDENCY.get_solo()

    def __json__(self):
        """Return a plain-dict snapshot of this extractor's configuration."""
        return {
            'type': 'ArchiveBoxExtractor',
            '__class__': self.__class__.__name__,
            'NAME': self.NAME,
            'LABEL': self.LABEL,
            'ENABLED': self.ENABLED,
            'DEPENDENCY': self.dependency.__json__(),
            'ARGS': self.ARGS,
            'CMD': self.CMD,
            'TIMEOUT': self.TIMEOUT,
            'is_valid': self.is_valid,
            'is_enabled': self.is_enabled,
        }

    def format_args(self, csv: List[str], **config):
        """Format every template in ``csv`` against the merged configuration.

        Later sources win: the passed ``config``, then this extractor's own
        values, then the same values prefixed with ``<NAME>_``.
        """
        un_prefixed_config = {**self.__json__()}     # e.g. ENABLED=True
        prefixed_config = {                          # e.g. GALLERYDL_ENABLED=True
            f'{self.NAME}_{key}': value
            for key, value in un_prefixed_config.items()
        }

        merged_config = {
            **config,                # e.g. TIMEOUT=60
            **un_prefixed_config,    # e.g. ENABLED=True
            **prefixed_config,       # e.g. GALLERYDL_ENABLED=True
        }
        return [arg.format(**merged_config) for arg in csv]

    @cached_property
    def is_valid(self):
        """Return True when the underlying dependency is valid."""
        if not self.dependency.is_valid:
            return False

        # TIMEOUT must be at least 5 seconds
        # if self.TIMEOUT < 5:
        #     return False

        # assert Path(self.COOKIES_TXT).exists()
        # TODO: validate user agent with uaparser
        # TODO: validate args, cookies.txt?
        return True

    @cached_property
    def is_enabled(self):
        """Return True when the extractor and its dependency are both usable."""
        return self.ENABLED and self.is_valid and self.dependency.is_enabled

    def save(self, *args, **kwargs):
        """Save the row inside a transaction and print a change event."""
        # assert self.is_valid

        with transaction.atomic():
            result = super().save(*args, **kwargs)
            # post to message bus:
            print({
                'type': f'{self.__class__.__name__}.save',
                'diff': self.__json__(),
                'kwargs': kwargs,
            })
            # potential consumers of this event:
            #   - event logger: write to events.log
            #   - config file updater: writes to ArchiveBox.conf
            #   - supervisor: restarts relevant dependencies/extractors
            #   - etc...

        return result

    def out_dir(self, url: str, snapshot_dir: Path, config: ConfigDict):
        """Return the directory this extractor writes to for a snapshot."""
        return (snapshot_dir / self.NAME)

    def create_out_dir(self, url: str, snapshot_dir: Path, config: ConfigDict):
        """Create the extractor's output directory if needed and return it."""
        out_dir = self.out_dir(url=url, snapshot_dir=snapshot_dir, config=config)
        out_dir.mkdir(exist_ok=True)
        # mkdir() returns None; hand back the path so callers can use it.
        return out_dir

    def should_extract(self, url: str, snapshot_dir: Path, config: ConfigDict):
        """Return True when the extractor is enabled, has no output yet and can write."""
        # return False if extractor is disabled
        if not self.is_enabled:
            return False

        out_dir = self.out_dir(url=url, snapshot_dir=snapshot_dir, config=config)

        # Path.glob() returns a generator, which is always truthy; any() checks
        # whether the directory really contains an entry.
        if any(out_dir.glob('*')):
            return False

        if not os.access(out_dir, os.W_OK | os.X_OK):
            return False

        return True

    def get_dependency_cmd(self, url: str, extractor_dir: Path, config: ConfigDict):
        """Return the flat argument list built from the CMD and ARGS templates."""
        return [
            # unpacked so the result is a flat argv list, not a nested one
            *self.format_args(self.CMD, **config),
            url,
            *self.format_args(self.ARGS, **config),    # TODO: split and requote this properly
        ]

    # @requires_config('HOSTNAME', 'TIMEOUT', 'USER_AGENT', 'CHECK_SSL_VALIDITY')
    def extract(self, url: str, snapshot_dir: Path, config: ConfigDict):
        """Run the dependency for ``url`` and return an ``ArchiveResult``.

        Returns ``None`` without doing anything when the extractor is disabled.
        Failures are recorded in the result (``status='failed'`` with the
        error text appended to ``stderr``) instead of being raised.
        """
        if not self.ENABLED:
            return

        extractor_dir = self.create_out_dir(url=url, snapshot_dir=snapshot_dir, config=config)

        cmd = self.get_dependency_cmd(url=url, extractor_dir=extractor_dir, config=config)

        status, stdout, stderr, output_path = 'failed', '', '', None
        timer = None  # stays None when run() raises before returning a timer
        try:
            # the dependency's run() names its working-directory parameter `pwd`
            proc, timer, errors = self.dependency.run(cmd, pwd=extractor_dir, timeout=self.TIMEOUT)
            if errors is not None:
                raise errors
            stdout, stderr = proc.stdout or '', proc.stderr or ''

            if 'ERROR: Unsupported URL' in stderr:
                hints = ('gallery-dl doesnt support this type of url yet',)
                raise ArchiveError('Failed to save gallerydl', hints)

            if proc.returncode == 0 and 'finished' in stdout:
                output_path = extractor_dir / 'index.html'
                status = 'succeeded'
        except Exception as err:
            # an exception cannot be concatenated onto a str directly
            stderr += str(err)

        num_bytes, num_dirs, num_files = get_dir_size(extractor_dir)

        return ArchiveResult(
            cmd=cmd,
            pwd=str(extractor_dir),
            cmd_version=self.dependency.bin_version,
            cmd_path=self.dependency.bin_path,
            cmd_hostname=config.HOSTNAME,

            output_path=output_path,
            stdout=stdout,
            stderr=stderr,
            status=status,

            num_bytes=num_bytes,
            num_files=num_files,
            num_dirs=num_dirs,
            **(timer.stats if timer is not None else {}),
        )
|
||||
|
||||
|
||||
class ArchiveBoxDefaultExtractor(ArchiveBoxBaseExtractor, SingletonModel):
    """Concrete singleton model for the default extractor configuration."""

    # Fixed primary key, also used as the default of the `id` field below.
    singleton_instance_id = 1

    id = models.AutoField(default=singleton_instance_id, primary_key=True)

    DEPENDENCY = ArchiveBoxDefaultDependency

    ENABLED = models.BooleanField(default=True, editable=True)

    class Meta:
        abstract = False
        app_label = 'defaults'
        verbose_name = 'Default Configuration: Extractors'
|
15
archivebox/plugins/defaults/settings.py
Normal file
15
archivebox/plugins/defaults/settings.py
Normal file
|
@ -0,0 +1,15 @@
|
|||
from pathlib import Path
|
||||
|
||||
from django.conf import settings
|
||||
|
||||
def register_plugin_settings(settings=settings, name='defaults'):
    """Append a plugin's ``static`` and ``templates`` directories to the settings.

    Args:
        settings: settings object exposing ``PACKAGE_DIR``, ``STATICFILES_DIRS``
            and ``TEMPLATE_DIRS``.
        name: plugin directory name. A dotted app name such as
            ``'plugins.system'`` is also accepted; only its last component is
            used as the directory name.
    """
    # Callers may pass an AppConfig.name like 'plugins.system', which would
    # otherwise yield the non-existent path 'plugins/plugins.system/...'.
    plugin_dir = Path(settings.PACKAGE_DIR) / 'plugins' / name.rpartition('.')[2]

    settings.STATICFILES_DIRS += [str(plugin_dir / 'static')]
    settings.TEMPLATE_DIRS += [str(plugin_dir / 'templates')]

    print('REGISTERED PLUGIN SETTINGS', name)
|
1
archivebox/plugins/gallerydl/__init__.py
Normal file
1
archivebox/plugins/gallerydl/__init__.py
Normal file
|
@ -0,0 +1 @@
|
|||
# This is the gallerydl plugin's __init__.py; the value was copied from the replaywebpage plugin.
__package__ = 'archivebox.plugins.gallerydl'
|
8
archivebox/plugins/gallerydl/admin.py
Normal file
8
archivebox/plugins/gallerydl/admin.py
Normal file
|
@ -0,0 +1,8 @@
|
|||
from django.contrib import admin
|
||||
from solo.admin import SingletonModelAdmin
|
||||
|
||||
from .models import GalleryDLDependency, GalleryDLExtractor
|
||||
|
||||
|
||||
admin.site.register(GalleryDLDependency, SingletonModelAdmin)
|
||||
admin.site.register(GalleryDLExtractor, SingletonModelAdmin)
|
13
archivebox/plugins/gallerydl/apps.py
Normal file
13
archivebox/plugins/gallerydl/apps.py
Normal file
|
@ -0,0 +1,13 @@
|
|||
from django.apps import AppConfig
|
||||
|
||||
|
||||
class GalleryDLAppConfig(AppConfig):
    """Django application configuration for the gallery-dl plugin."""

    # The label is used in model references and table names, so it has to be a
    # valid Python identifier; the human-readable name goes in verbose_name.
    label = "gallerydl"
    verbose_name = "Gallery-DL"
    # NOTE(review): `name` should be the importable dotted path of the plugin
    # package (the system plugin uses "plugins.system") - confirm this value.
    name = "plugin_gallerydl"

    default_auto_field = "django.db.models.BigAutoField"

    def ready(self):
        """Announce that the plugin has been loaded."""
        # querying models is ok, but don't fetch rows from DB or perform stateful actions here

        print('√ Loaded GalleryDL Plugin')
|
50
archivebox/plugins/gallerydl/extractors.py
Normal file
50
archivebox/plugins/gallerydl/extractors.py
Normal file
|
@ -0,0 +1,50 @@
|
|||
# browsertrix extractor
|
||||
|
||||
def save_browsertrix(link, out_dir, timeout, config):
    """Crawl ``link.url`` with a remote browsertrix-crawler and copy its WACZ locally.

    The crawl script is piped to the ``browsertrix`` host over ``nc``; the
    resulting ``<crawl_id>.wacz`` is then copied into ``out_dir/browsertrix``.

    Args:
        link: object providing ``url`` and ``timestamp`` (used as crawl id).
        out_dir: snapshot directory (a ``Path``).
        timeout: time limit for the remote call.
        config: currently unused.

    Returns:
        ``(status, output)``: ``('succeeded', <copied wacz path>)`` or
        ``('failed', <exception>)``.
    """
    import shlex  # local import: this module has no import block of its own

    browsertrix_dir = out_dir / 'browsertrix'
    browsertrix_dir.mkdir(exist_ok=True)

    crawl_id = link.timestamp

    browsertrix_crawler_cmd = [
        'crawl',
        '--url', link.url,
        f'--collection={crawl_id}',
        '--scopeType=page',
        '--generateWACZ',
        '--text=final-to-warc',
        '--timeLimit=60',
    ]

    # The crawl command was built but never sent, and {crawl_id} was never
    # interpolated (plain string); quote everything since it runs in a shell.
    remote_cmd = (
        'rm /tmp/dump.rdb;\n'
        'rm -rf /crawls/collections;\n'
        'mkdir /crawls/collections;\n'
        f'env CRAWL_ID={shlex.quote(str(crawl_id))} {shlex.join(browsertrix_crawler_cmd)}\n'
    )

    local_cmd = ['nc', 'browsertrix', '2222']

    status = 'succeeded'
    output = None
    timer = TimedProgress(timeout, prefix=' ')
    try:
        # stdout is decoded below, so the pipes are in bytes mode and the
        # input has to be bytes as well
        result = run(local_cmd, cwd=str(out_dir), input=remote_cmd.encode(), timeout=timeout)

        cmd_output = result.stdout.decode()

        # the extension belongs inside the f-string; `'…'.wacz` is an
        # attribute access on a str
        wacz_output_file = Path('/browsertrix/crawls') / crawl_id / f'{crawl_id}.wacz'

        output = browsertrix_dir / wacz_output_file.name
        copy_and_overwrite(wacz_output_file, output)
    except Exception as err:
        status = 'failed'
        output = err
    finally:
        timer.end()

    return status, output
|
||||
|
||||
|
||||
|
||||
TEMPLATE = """
|
||||
|
||||
"""
|
||||
|
||||
# rm /tmp/dump.rdb;
|
||||
# rm -rf /crawls/collections;
|
||||
# mkdir /crawls/collections;
|
||||
# env CRAWL_ID=tec2342 crawl --url 'https://example.com' --scopeType page --generateWACZ --collection tec2342 --text final-to-warc --timeLimit 60
|
121
archivebox/plugins/gallerydl/models.py
Normal file
121
archivebox/plugins/gallerydl/models.py
Normal file
|
@ -0,0 +1,121 @@
|
|||
from django.db import models
|
||||
from django.utils.functional import cached_property
|
||||
|
||||
from solo.models import SingletonModel
|
||||
|
||||
from archivebox.plugins.defaults.models import (
|
||||
ArchiveBoxDefaultDependency,
|
||||
ArchiveBoxDefaultExtractor,
|
||||
BashEnvironmentDependency,
|
||||
PipEnvironmentDependency,
|
||||
)
|
||||
|
||||
|
||||
class GalleryDLDependency(ArchiveBoxDefaultDependency, SingletonModel):
    """Singleton configuration for the ``gallery-dl`` binary dependency."""

    NAME = 'GALLERYDL'
    LABEL = "GalleryDL"
    REQUIRED = False

    PARENT_DEPENDENCIES = [
        BashEnvironmentDependency,
        PipEnvironmentDependency,
    ]

    BIN_DEPENDENCIES = ['gallery-dl']
    APT_DEPENDENCIES = []
    BREW_DEPENDENCIES = []
    # Named *_DEPENDENCIES like the other package lists here and like the
    # plugin.yaml manifest (PIP_DEPENDENCIES).
    PIP_DEPENDENCIES = ['gallery-dl']
    NPM_DEPENDENCIES = []
    # Previous attribute names, kept as aliases for backward compatibility.
    PIP_PACKAGES = PIP_DEPENDENCIES
    NPM_PACKAGES = NPM_DEPENDENCIES

    DEFAULT_BINARY = 'gallery-dl'
    DEFAULT_START_CMD = None
    DEFAULT_ARGS = []
    VERSION_CMD = '{BINARY} --version'

    ENABLED = models.BooleanField(default=True)
    BINARY = models.CharField(max_length=255, default='gallery-dl')

    # integer default for an IntegerField (was the string '1')
    WORKERS = models.IntegerField(default=1)
|
||||
|
||||
|
||||
class GalleryDLExtractor(ArchiveBoxDefaultExtractor, SingletonModel):
    """Singleton configuration and runner for the gallery-dl extractor."""

    NAME = 'GALLERYDL'
    LABEL = 'gallery-dl'

    # The dependency *class*, not an instance: calling get_solo() here would
    # hit the database while the module is being imported, and the inherited
    # `dependency` property calls DEPENDENCY.get_solo() itself.
    DEPENDENCY = GalleryDLDependency

    # https://github.com/mikf/gallery-dl
    DEFAULT_CMD = [
        '{DEPENDENCY.BINARY}',
        '{ARGS}',    # a missing comma used to fuse this with '{url}'
        '{url}',
    ]
    # Templates only: `self` and `config` do not exist at class-definition
    # time. Each placeholder is resolved from the matching field in extract().
    # NOTE(review): verify these flag names against gallery-dl's CLI options.
    DEFAULT_ARGS = [
        '--timeout', '{TIMEOUT}',
        '--cookies', '{COOKIES_TXT}',
        '--user-agent', '{USER_AGENT}',
        '--verify', '{CHECK_SSL_VALIDITY}',
    ]

    ENABLED = models.BooleanField(default=True)

    CMD = models.CharField(max_length=255, default=DEFAULT_CMD)
    # django.db.models has no CSVField; use CharField like the base class does
    ARGS = models.CharField(max_length=255, default=DEFAULT_ARGS)

    TIMEOUT = models.CharField(max_length=255, default='{TIMEOUT}')
    USER_AGENT = models.CharField(max_length=255, default='{USER_AGENT}')
    COOKIES_TXT = models.CharField(max_length=255, default='{COOKIES_TXT}')
    CHECK_SSL_VALIDITY = models.CharField(max_length=255, default='{CHECK_SSL_VALIDITY}')

    # @task
    # @requires_config('HOSTNAME', 'TIMEOUT', 'USER_AGENT', 'CHECK_SSL_VALIDITY')
    def extract(self, url: str, out_dir: Path, config: ConfigDict):
        """Run gallery-dl for ``url`` and return an ``ArchiveResult``.

        Args:
            url: the URL to archive.
            out_dir: the snapshot directory; output goes into the
                sub-directory returned by ``self.out_dir``.
            config: mapping-like global config; also read as ``config.HOSTNAME``.

        Returns:
            ``None`` when the extractor is disabled, otherwise an
            ``ArchiveResult``. Failures are recorded in the result
            (``status='failed'``) instead of being raised.
        """
        import shlex

        if not self.ENABLED:
            return

        extractor_dir = self.out_dir(url=url, snapshot_dir=out_dir, config=config)
        extractor_dir.mkdir(exist_ok=True)

        # Resolve this extractor's own option templates against the global
        # config, then use them to fill in the ARGS templates.
        options = {
            **config,
            'TIMEOUT': self.TIMEOUT.format(**config),
            'COOKIES_TXT': self.COOKIES_TXT.format(**config),
            'USER_AGENT': self.USER_AGENT.format(**config),
            'CHECK_SSL_VALIDITY': self.CHECK_SSL_VALIDITY.format(**config),
        }
        # ARGS is a list on a fresh instance; split it if it arrives as a string
        arg_templates = shlex.split(self.ARGS) if isinstance(self.ARGS, str) else self.ARGS

        # NOTE(review): check whether the dependency's run() prepends the
        # binary path itself; if so the first element must be dropped there.
        cmd = [
            self.dependency.BINARY,
            url,
            *(template.format(**options) for template in arg_templates),
        ]

        status, stdout, stderr, output_path = 'failed', '', '', None
        timer = None  # stays None when run() raises before returning a timer
        try:
            # the dependency's run() names its working-directory parameter `pwd`
            proc, timer, errors = self.dependency.run(cmd, pwd=extractor_dir, timeout=options['TIMEOUT'])
            if errors is not None:
                raise errors
            stdout, stderr = proc.stdout or '', proc.stderr or ''

            if 'ERROR: Unsupported URL' in stderr:
                hints = ('gallery-dl doesnt support this type of url yet',)
                raise ArchiveError('Failed to save gallerydl', hints)

            if proc.returncode == 0 and 'finished' in stdout:
                output_path = extractor_dir / 'index.html'
                status = 'succeeded'
        except Exception as err:
            # an exception cannot be concatenated onto a str directly
            stderr += str(err)

        num_bytes, num_dirs, num_files = get_dir_size(extractor_dir)

        return ArchiveResult(
            cmd=cmd,
            pwd=str(out_dir),
            cmd_version=self.dependency.bin_version,
            cmd_path=self.dependency.bin_path,
            cmd_hostname=config.HOSTNAME,

            output_path=output_path,
            stdout=stdout,
            stderr=stderr,
            status=status,

            num_bytes=num_bytes,
            num_files=num_files,
            num_dirs=num_dirs,
            **(timer.stats if timer is not None else {}),
        )
|
59
archivebox/plugins/gallerydl/plugin.yaml
Normal file
59
archivebox/plugins/gallerydl/plugin.yaml
Normal file
|
@ -0,0 +1,59 @@
|
|||
dependencies:
|
||||
GalleryDLDependency:
|
||||
ID: gallerydl
|
||||
LABEL: GalleryDL
|
||||
REQUIRED: false
|
||||
|
||||
PARENT_DEPENDENCIES:
|
||||
- BashEnvironmentDependency
|
||||
- PipEnvironmentDependency
|
||||
|
||||
PIP_DEPENDENCIES:
|
||||
- gallery-dl
|
||||
|
||||
USER_CONFIG:
|
||||
ENABLED: models.BooleanField(max_length=255, default={DEFAULT_CONFIG.ENABLED})
|
||||
BINARY: models.CharField(max_length=255, default={DEFAULT_CONFIG.BINARY})
|
||||
|
||||
DEFAULT_CONFIG:
|
||||
ENABLED: true
|
||||
BINARY: 'gallery-dl'
|
||||
|
||||
CONFIG_ALIASES:
|
||||
- SAVE_GALLERYDL: ENABLED
|
||||
- USE_GALLERYDL: ENABLED
|
||||
- GALLERYDL_ENABLED: ENABLED
|
||||
- GALLERYDL_BINARY: BINARY
|
||||
|
||||
TASKS:
|
||||
# plugins.GalleryDLDependency
|
||||
run_dependency: plugins.gallerydl.models.GalleryDLDependency.run_dependency
|
||||
|
||||
|
||||
extractors:
|
||||
GalleryDLExtractor:
|
||||
ID: GALLERYDL
|
||||
LABEL: GalleryDL
|
||||
ENABLED: true
|
||||
|
||||
DEPENDENCY: GalleryDLDependency
|
||||
|
||||
CONFIG:
|
||||
ENABLED: models.BooleanField(default={DEFAULT_CONFIG.ENABLED})
|
||||
CMD: models.CharField(max_length=255, default={DEFAULT_CONFIG.CMD})
|
||||
ARGS: models.CharField(max_length=255, default={DEFAULT_CONFIG.ARGS})
|
||||
USER_AGENT: models.CharField(max_length=255, default={DEFAULT_CONFIG.USER_AGENT})
|
||||
CHECK_SSL_VALIDITY: models.CharField(max_length=255, default={DEFAULT_CONFIG.CHECK_SSL_VALIDITY})
|
||||
|
||||
DEFAULT_CONFIG:
|
||||
ENABLED: true
|
||||
CMD: gallery-dl {args} {url}
|
||||
ARGS: --user-agent={USER_AGENT} --check-ssl={CHECK_SSL_VALIDITY}
|
||||
# Quoted: an unquoted value starting with "{" is parsed as a YAML flow mapping, not a template string.
CHECK_SSL_VALIDITY: '{CHECK_SSL_VALIDITY}'
USER_AGENT: '{USER_AGENT}'
|
||||
|
||||
|
||||
TASKS:
|
||||
CREATE_OUT_DIR: plugins.gallerydl.tasks.create_out_dir
|
||||
SHOULD_EXTRACT: plugins.gallerydl.tasks.should_extract
|
||||
EXTRACT: plugins.gallerydl.tasks.extract
|
124
archivebox/plugins/gallerydl/static/sw.js
Normal file
124
archivebox/plugins/gallerydl/static/sw.js
Normal file
File diff suppressed because one or more lines are too long
1
archivebox/plugins/gallerydl/static/test.txt
Normal file
1
archivebox/plugins/gallerydl/static/test.txt
Normal file
|
@ -0,0 +1 @@
|
|||
test content this should be visible
|
BIN
archivebox/plugins/gallerydl/static/test.wacz
Normal file
BIN
archivebox/plugins/gallerydl/static/test.wacz
Normal file
Binary file not shown.
3392
archivebox/plugins/gallerydl/static/ui.js
Normal file
3392
archivebox/plugins/gallerydl/static/ui.js
Normal file
File diff suppressed because one or more lines are too long
|
@ -0,0 +1,40 @@
|
|||
{% load tz core_tags static %}
|
||||
|
||||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<title>{{title}}</title>
|
||||
<meta charset="utf-8" name="viewport" content="width=device-width, initial-scale=1">
|
||||
|
||||
<style>
|
||||
html, body {
|
||||
width: 100%;
|
||||
height: 100%;
|
||||
background-color: #ddd;
|
||||
}
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
ReplayWeb.page for: {{snapshot.url}} ({{timestamp}}) /{{warc_filename}}
|
||||
|
||||
{{snapshot}}
|
||||
|
||||
<script>
|
||||
// https://cdn.jsdelivr.net/npm/replaywebpage@1.8.14/sw.min.js
|
||||
// https://cdn.jsdelivr.net/npm/replaywebpage@1.8.14/ui.min.js
|
||||
</script>
|
||||
|
||||
<style>
|
||||
</style>
|
||||
<script src="/static/ui.js"></script>
|
||||
|
||||
<replay-web-page
|
||||
style="height: 600px"
|
||||
embed="replay"
|
||||
replayBase="/static/"
|
||||
source="/static/test.wacz"
|
||||
url="https://example.com/">
|
||||
</replay-web-page>
|
||||
</body>
|
||||
</html>
|
12
archivebox/plugins/gallerydl/urls.py
Normal file
12
archivebox/plugins/gallerydl/urls.py
Normal file
|
@ -0,0 +1,12 @@
|
|||
from django.urls import path
|
||||
|
||||
from .views import GalleryDLIconView, GalleryDLEmbedView, GalleryDLOutputView, GalleryDLDependencyView, GalleryDLExtractorView
|
||||
|
||||
# Routes for the gallery-dl plugin views.
# Routes carry no leading "/": Django matches patterns against the path with
# its leading slash already stripped, so "/plugins/..." would never match a
# normal request.
# NOTE(review): GalleryDLDependencyView and GalleryDLExtractorView are imported
# from .views but are not defined in the views module shown in this change.
urlpatterns = [
    # was `GalleryDLIconView(.as_view)`, which is a syntax error
    path('plugins/gallerydl/icon/<path:path>', GalleryDLIconView.as_view(), name='gallerydl_icon'),
    path('plugins/gallerydl/embed/<path:path>', GalleryDLEmbedView.as_view(), name='gallerydl_embed'),
    path('plugins/gallerydl/output/<path:path>', GalleryDLOutputView.as_view(), name='gallerydl_output'),

    path('plugins/gallerydl/dependency/', GalleryDLDependencyView.as_view(), name='gallerydl_dependency'),
    path('plugins/gallerydl/extractor/', GalleryDLExtractorView.as_view(), name='gallerydl_extractor'),
]
|
78
archivebox/plugins/gallerydl/views.py
Normal file
78
archivebox/plugins/gallerydl/views.py
Normal file
|
@ -0,0 +1,78 @@
|
|||
import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
from django.views import View
|
||||
from django.shortcuts import render
|
||||
from django.db.models import Q
|
||||
|
||||
from core.models import Snapshot
|
||||
|
||||
# from archivebox.config import PUBLIC_SNAPSHOTS
|
||||
PUBLIC_SNAPSHOTS = True
|
||||
|
||||
|
||||
class GalleryDLIconView(View):
    """Render the gallery-dl icon template for a snapshot path."""

    template_name = 'plugin_gallerydl__icon.html'

    # render static html index from filesystem archive/<timestamp>/index.html

    def get_context_data(self, **kwargs):
        """Return the base template context (currently empty)."""
        return {
            # **super().get_context_data(**kwargs),
            # 'VERSION': VERSION,
            # 'COMMIT_HASH': COMMIT_HASH,
            # 'FOOTER_INFO': FOOTER_INFO,
        }

    def get(self, request, path):
        """Redirect anonymous users to the login page unless snapshots are public, else render."""
        if not request.user.is_authenticated and not PUBLIC_SNAPSHOTS:
            # imported here because the module only imports `render`
            from django.shortcuts import redirect
            return redirect(f'/admin/login/?next={request.path}')

        # ...
        # `context` was never defined before being passed to render()
        context = self.get_context_data()
        return render(template_name=self.template_name, request=self.request, context=context)
|
||||
|
||||
|
||||
class GalleryDLEmbedView(View):
    """Render the gallery-dl embed template for a snapshot path."""

    template_name = 'plugin_gallerydl__embed.html'

    # render static html index from filesystem archive/<timestamp>/index.html

    def get_context_data(self, **kwargs):
        """Return the base template context (currently empty)."""
        return {
            # **super().get_context_data(**kwargs),
            # 'VERSION': VERSION,
            # 'COMMIT_HASH': COMMIT_HASH,
            # 'FOOTER_INFO': FOOTER_INFO,
        }

    def get(self, request, path):
        """Redirect anonymous users to the login page unless snapshots are public, else render."""
        if not request.user.is_authenticated and not PUBLIC_SNAPSHOTS:
            # imported here because the module only imports `render`
            from django.shortcuts import redirect
            return redirect(f'/admin/login/?next={request.path}')

        # ...
        # `context` was never defined before being passed to render()
        context = self.get_context_data()
        return render(template_name=self.template_name, request=self.request, context=context)
|
||||
|
||||
|
||||
class GalleryDLOutputView(View):
    """Render the gallery-dl output template for a snapshot path."""

    template_name = 'plugin_gallerydl__output.html'

    # render static html index from filesystem archive/<timestamp>/index.html

    def get_context_data(self, **kwargs):
        """Return the base template context (currently empty)."""
        return {
            # **super().get_context_data(**kwargs),
            # 'VERSION': VERSION,
            # 'COMMIT_HASH': COMMIT_HASH,
            # 'FOOTER_INFO': FOOTER_INFO,
        }

    def get(self, request, path):
        """Redirect anonymous users to the login page unless snapshots are public, else render."""
        if not request.user.is_authenticated and not PUBLIC_SNAPSHOTS:
            # imported here because the module only imports `render`
            from django.shortcuts import redirect
            return redirect(f'/admin/login/?next={request.path}')

        # ...
        # `context` was never defined before being passed to render()
        context = self.get_context_data()
        return render(template_name=self.template_name, request=self.request, context=context)
|
1
archivebox/plugins/replaywebpage/__init__.py
Normal file
1
archivebox/plugins/replaywebpage/__init__.py
Normal file
|
@ -0,0 +1 @@
|
|||
__package__ = 'archivebox.plugins.replaywebpage'
|
8
archivebox/plugins/replaywebpage/apps.py
Normal file
8
archivebox/plugins/replaywebpage/apps.py
Normal file
|
@ -0,0 +1,8 @@
|
|||
from django.apps import AppConfig
|
||||
|
||||
|
||||
class ReplayWebPageConfig(AppConfig):
    """Django application configuration for the ReplayWeb.page plugin."""

    # The label is used in "app_label.ModelName" references and table names,
    # so it has to be a valid Python identifier (no dot); the human-readable
    # name goes in verbose_name.
    label = "replaywebpage"
    verbose_name = "ReplayWeb.Page"
    # NOTE(review): `name` should be the importable dotted path of the plugin
    # package (the system plugin uses "plugins.system") - confirm this value.
    name = "plugin_replaywebpage"

    default_auto_field = "django.db.models.BigAutoField"
|
50
archivebox/plugins/replaywebpage/extractors.py
Normal file
50
archivebox/plugins/replaywebpage/extractors.py
Normal file
|
@ -0,0 +1,50 @@
|
|||
# browsertrix extractor
|
||||
|
||||
def save_browsertrix(link, out_dir, timeout, config):
    """Crawl ``link.url`` with a remote browsertrix-crawler and copy its WACZ locally.

    The crawl script is piped to the ``browsertrix`` host over ``nc``; the
    resulting ``<crawl_id>.wacz`` is then copied into ``out_dir/browsertrix``.

    Args:
        link: object providing ``url`` and ``timestamp`` (used as crawl id).
        out_dir: snapshot directory (a ``Path``).
        timeout: time limit for the remote call.
        config: currently unused.

    Returns:
        ``(status, output)``: ``('succeeded', <copied wacz path>)`` or
        ``('failed', <exception>)``.
    """
    import shlex  # local import: this module has no import block of its own

    browsertrix_dir = out_dir / 'browsertrix'
    browsertrix_dir.mkdir(exist_ok=True)

    crawl_id = link.timestamp

    browsertrix_crawler_cmd = [
        'crawl',
        '--url', link.url,
        f'--collection={crawl_id}',
        '--scopeType=page',
        '--generateWACZ',
        '--text=final-to-warc',
        '--timeLimit=60',
    ]

    # The crawl command was built but never sent, and {crawl_id} was never
    # interpolated (plain string); quote everything since it runs in a shell.
    remote_cmd = (
        'rm /tmp/dump.rdb;\n'
        'rm -rf /crawls/collections;\n'
        'mkdir /crawls/collections;\n'
        f'env CRAWL_ID={shlex.quote(str(crawl_id))} {shlex.join(browsertrix_crawler_cmd)}\n'
    )

    local_cmd = ['nc', 'browsertrix', '2222']

    status = 'succeeded'
    output = None
    timer = TimedProgress(timeout, prefix=' ')
    try:
        # stdout is decoded below, so the pipes are in bytes mode and the
        # input has to be bytes as well
        result = run(local_cmd, cwd=str(out_dir), input=remote_cmd.encode(), timeout=timeout)

        cmd_output = result.stdout.decode()

        # the extension belongs inside the f-string; `'…'.wacz` is an
        # attribute access on a str
        wacz_output_file = Path('/browsertrix/crawls') / crawl_id / f'{crawl_id}.wacz'

        output = browsertrix_dir / wacz_output_file.name
        copy_and_overwrite(wacz_output_file, output)
    except Exception as err:
        status = 'failed'
        output = err
    finally:
        timer.end()

    return status, output
|
||||
|
||||
|
||||
|
||||
TEMPLATE = """
|
||||
|
||||
"""
|
||||
|
||||
# rm /tmp/dump.rdb;
|
||||
# rm -rf /crawls/collections;
|
||||
# mkdir /crawls/collections;
|
||||
# env CRAWL_ID=tec2342 crawl --url 'https://example.com' --scopeType page --generateWACZ --collection tec2342 --text final-to-warc --timeLimit 60
|
12
archivebox/plugins/replaywebpage/models.py
Normal file
12
archivebox/plugins/replaywebpage/models.py
Normal file
|
@ -0,0 +1,12 @@
|
|||
# from solo.models import SingletonModel
|
||||
|
||||
|
||||
# class ReplayWebPageConfiguration(SingletonModel):
|
||||
# site_name = models.CharField(max_length=255, default='Site Name')
|
||||
# maintenance_mode = models.BooleanField(default=False)
|
||||
|
||||
# def __str__(self):
|
||||
# return "Site Configuration"
|
||||
|
||||
# class Meta:
|
||||
# verbose_name = "Site Configuration"
|
124
archivebox/plugins/replaywebpage/static/sw.js
Normal file
124
archivebox/plugins/replaywebpage/static/sw.js
Normal file
File diff suppressed because one or more lines are too long
1
archivebox/plugins/replaywebpage/static/test.txt
Normal file
1
archivebox/plugins/replaywebpage/static/test.txt
Normal file
|
@ -0,0 +1 @@
|
|||
test content this should be visible
|
BIN
archivebox/plugins/replaywebpage/static/test.wacz
Normal file
BIN
archivebox/plugins/replaywebpage/static/test.wacz
Normal file
Binary file not shown.
3392
archivebox/plugins/replaywebpage/static/ui.js
Normal file
3392
archivebox/plugins/replaywebpage/static/ui.js
Normal file
File diff suppressed because one or more lines are too long
|
@ -0,0 +1,40 @@
|
|||
{% load tz core_tags static %}
|
||||
|
||||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<title>{{title}}</title>
|
||||
<meta charset="utf-8" name="viewport" content="width=device-width, initial-scale=1">
|
||||
|
||||
<style>
|
||||
html, body {
|
||||
width: 100%;
|
||||
height: 100%;
|
||||
background-color: #ddd;
|
||||
}
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
ReplayWeb.page for: {{snapshot.url}} ({{timestamp}}) /{{warc_filename}}
|
||||
|
||||
{{snapshot}}
|
||||
|
||||
<script>
|
||||
// https://cdn.jsdelivr.net/npm/replaywebpage@1.8.14/sw.min.js
|
||||
// https://cdn.jsdelivr.net/npm/replaywebpage@1.8.14/ui.min.js
|
||||
</script>
|
||||
|
||||
<style>
|
||||
</style>
|
||||
<script src="/static/ui.js"></script>
|
||||
|
||||
<replay-web-page
|
||||
style="height: 600px"
|
||||
embed="replay"
|
||||
replayBase="/static/"
|
||||
source="/static/test.wacz"
|
||||
url="https://example.com/">
|
||||
</replay-web-page>
|
||||
</body>
|
||||
</html>
|
7
archivebox/plugins/replaywebpage/urls.py
Normal file
7
archivebox/plugins/replaywebpage/urls.py
Normal file
|
@ -0,0 +1,7 @@
|
|||
from django.urls import path
|
||||
|
||||
from .views import ReplayWebPageViewer
|
||||
|
||||
urlpatterns = [
|
||||
path('<path:path>', ReplayWebPageViewer.as_view(), name='plugin_replaywebpage__viewer'),
|
||||
]
|
47
archivebox/plugins/replaywebpage/views.py
Normal file
47
archivebox/plugins/replaywebpage/views.py
Normal file
|
@ -0,0 +1,47 @@
|
|||
import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
from django.views import View
|
||||
from django.shortcuts import render
|
||||
from django.db.models import Q
|
||||
|
||||
from core.models import Snapshot
|
||||
|
||||
# from archivebox.config import PUBLIC_SNAPSHOTS
|
||||
PUBLIC_SNAPSHOTS = True
|
||||
|
||||
|
||||
class ReplayWebPageViewer(View):
    """Render the ReplayWeb.page viewer for a snapshot addressed by URL path."""

    template_name = 'plugin_replaywebpage__viewer.html'

    # render static html index from filesystem archive/<timestamp>/index.html

    def get_context_data(self, **kwargs):
        """Return the base template context (currently empty)."""
        return {
            # **super().get_context_data(**kwargs),
            # 'VERSION': VERSION,
            # 'COMMIT_HASH': COMMIT_HASH,
            # 'FOOTER_INFO': FOOTER_INFO,
        }

    def get(self, request, path):
        """Look up the snapshot named by ``path`` and render the viewer.

        ``path`` is ``<timestamp-or-id-prefix>[/<warc_filename>]``. Anonymous
        users are redirected to the login page unless snapshots are public.

        Raises:
            Http404: when no snapshot matches the first path component.
        """
        # imported here because the module only imports `render`
        from django.http import Http404
        from django.shortcuts import redirect

        if not request.user.is_authenticated and not PUBLIC_SNAPSHOTS:
            return redirect(f'/admin/login/?next={request.path}')

        # partition() yields an empty filename when the path has no "/"
        timestamp, _, warc_filename = path.partition('/')

        # .get() raised DoesNotExist / MultipleObjectsReturned (a 500) because
        # an id prefix can match zero or several rows; take the first match
        # and answer 404 when there is none.
        snapshot = Snapshot.objects.filter(
            Q(timestamp=timestamp) | Q(id__startswith=timestamp)
        ).first()
        if snapshot is None:
            raise Http404(f'No snapshot found for {timestamp}')

        context = self.get_context_data()
        context.update({
            "snapshot": snapshot,
            "timestamp": timestamp,
            "warc_filename": warc_filename,
        })
        return render(template_name=self.template_name, request=self.request, context=context)
|
||||
|
3
archivebox/plugins/system/__init__.py
Normal file
3
archivebox/plugins/system/__init__.py
Normal file
|
@ -0,0 +1,3 @@
|
|||
__package__ = 'archivebox.plugins.system'
|
||||
|
||||
default_app_config = 'plugins.system.apps.SystemPluginAppConfig'
|
49
archivebox/plugins/system/admin.py
Normal file
49
archivebox/plugins/system/admin.py
Normal file
|
@ -0,0 +1,49 @@
|
|||
from django.contrib import admin
|
||||
from solo.admin import SingletonModelAdmin
|
||||
|
||||
from plugins.defaults.admin import DependencyAdmin, ExtractorAdmin
|
||||
|
||||
from .models import (
|
||||
BashEnvironmentDependency,
|
||||
PythonEnvironmentDependency,
|
||||
NodeJSEnvironmentDependency,
|
||||
|
||||
AptEnvironmentDependency,
|
||||
BrewEnvironmentDependency,
|
||||
PipEnvironmentDependency,
|
||||
NPMEnvironmentDependency,
|
||||
|
||||
SQLiteDependency,
|
||||
DjangoDependency,
|
||||
ArchiveBoxDependency,
|
||||
|
||||
# ArchiveBoxDefaultExtractor,
|
||||
)
|
||||
|
||||
|
||||
print('DefaultsPluginConfig.admin')
|
||||
|
||||
class MultiDependencyAdmin(admin.ModelAdmin):
    """Admin for dependency models, with a list view of their main attributes."""

    list_display = (
        'id',
        'NAME',
        'ENABLED',
        'BINARY',
        'ARGS',
        'bin_path',
        'bin_version',
        'is_valid',
        'is_enabled',
    )
    # same read-only fields as the singleton DependencyAdmin
    readonly_fields = DependencyAdmin.readonly_fields
|
||||
|
||||
class MultiExtractorAdmin(admin.ModelAdmin):
    """Admin for extractor models, with a list view of their main attributes."""

    list_display = (
        'id',
        'NAME',
        'CMD',
        'ARGS',
        'is_valid',
        'is_enabled',
    )
    # NOTE(review): this reuses DependencyAdmin's read-only fields; confirm
    # whether ExtractorAdmin.readonly_fields was intended for an extractor admin.
    readonly_fields = DependencyAdmin.readonly_fields
|
||||
|
||||
|
||||
# admin.site.register(BashEnvironmentDependency, DependencyAdmin)
|
||||
admin.site.register(BashEnvironmentDependency, MultiDependencyAdmin)
|
||||
admin.site.register(PythonEnvironmentDependency, DependencyAdmin)
|
||||
admin.site.register(NodeJSEnvironmentDependency, DependencyAdmin)
|
||||
|
||||
admin.site.register(AptEnvironmentDependency, DependencyAdmin)
|
||||
admin.site.register(BrewEnvironmentDependency, DependencyAdmin)
|
||||
admin.site.register(PipEnvironmentDependency, DependencyAdmin)
|
||||
admin.site.register(NPMEnvironmentDependency, DependencyAdmin)
|
||||
|
||||
admin.site.register(SQLiteDependency, DependencyAdmin)
|
||||
admin.site.register(DjangoDependency, DependencyAdmin)
|
||||
admin.site.register(ArchiveBoxDependency, DependencyAdmin)
|
||||
|
||||
# admin.site.register(ArchiveBoxDefaultExtractor, ExtractorAdmin)
|
21
archivebox/plugins/system/apps.py
Normal file
21
archivebox/plugins/system/apps.py
Normal file
|
@ -0,0 +1,21 @@
|
|||
__package__ = 'archivebox.plugins.system'
|
||||
|
||||
|
||||
from django.apps import AppConfig
|
||||
|
||||
|
||||
class SystemPluginAppConfig(AppConfig):
    """Django application configuration for the host-system plugin."""

    name = "plugins.system"
    verbose_name = "Host System Configuration"

    default_auto_field = "django.db.models.AutoField"

    def ready(self):
        """Register this plugin's static and template directories in the settings."""
        print('plugins.system.apps.SystemPluginConfig.ready')

        # imported lazily, once the app registry is ready
        from django.conf import settings as django_settings
        from plugins.defaults.settings import register_plugin_settings

        register_plugin_settings(django_settings, name=self.name)
|
||||
|
144
archivebox/plugins/system/migrations/0001_initial.py
Normal file
144
archivebox/plugins/system/migrations/0001_initial.py
Normal file
|
@ -0,0 +1,144 @@
|
|||
# Generated by Django 3.1.14 on 2024-01-24 08:56
|
||||
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
    """Initial schema for the system plugin: one table per dependency model.

    Every model gets the same four columns (id, ENABLED, BINARY, ARGS);
    only the defaults, the editable flags and the verbose_name differ.
    """

    initial = True

    dependencies = []

    operations = [
        migrations.CreateModel(
            name='AptEnvironmentDependency',
            fields=[
                ('id', models.AutoField(primary_key=True, serialize=False)),
                ('ENABLED', models.BooleanField(default=True)),
                ('BINARY', models.CharField(default='apt-get', max_length=255)),
                ('ARGS', models.CharField(default='-qq', max_length=255)),
            ],
            options={
                'verbose_name': 'Package Manager: apt',
                'abstract': False,
            },
        ),
        migrations.CreateModel(
            name='ArchiveBoxDependency',
            fields=[
                ('id', models.AutoField(primary_key=True, serialize=False)),
                ('ENABLED', models.BooleanField(default=True, editable=False)),
                ('BINARY', models.CharField(default='archivebox', editable=False, max_length=255)),
                ('ARGS', models.CharField(default=[], editable=False, max_length=255)),
            ],
            options={
                'verbose_name': 'Internal Dependency: ArchiveBox Package',
                'abstract': False,
            },
        ),
        migrations.CreateModel(
            name='BashEnvironmentDependency',
            fields=[
                ('id', models.AutoField(primary_key=True, serialize=False)),
                ('ENABLED', models.BooleanField(default=True, editable=False)),
                ('BINARY', models.CharField(default='bash', max_length=255)),
                ('ARGS', models.CharField(default='-c', max_length=255)),
            ],
            options={
                'verbose_name': 'Shell Environment: bash',
                'abstract': False,
            },
        ),
        migrations.CreateModel(
            name='BrewEnvironmentDependency',
            fields=[
                ('id', models.AutoField(primary_key=True, serialize=False)),
                ('ENABLED', models.BooleanField(default=True)),
                ('BINARY', models.CharField(default='brew', max_length=255)),
                ('ARGS', models.CharField(default='', max_length=255)),
            ],
            options={
                'verbose_name': 'Package Manager: brew',
                'abstract': False,
            },
        ),
        migrations.CreateModel(
            name='DjangoDependency',
            fields=[
                ('id', models.AutoField(primary_key=True, serialize=False)),
                ('ENABLED', models.BooleanField(default=True, editable=False)),
                ('BINARY', models.CharField(default='django-admin.py', editable=False, max_length=255)),
                ('ARGS', models.CharField(default=[], editable=False, max_length=255)),
            ],
            options={
                'verbose_name': 'Internal Dependency: Django Package',
                'abstract': False,
            },
        ),
        migrations.CreateModel(
            name='NodeJSEnvironmentDependency',
            fields=[
                ('id', models.AutoField(primary_key=True, serialize=False)),
                ('ENABLED', models.BooleanField(default=True)),
                ('BINARY', models.CharField(default='node', max_length=255)),
                ('ARGS', models.CharField(default='-c', max_length=255)),
            ],
            options={
                'verbose_name': 'Shell Environment: NodeJS',
                'abstract': False,
            },
        ),
        migrations.CreateModel(
            name='NPMEnvironmentDependency',
            fields=[
                ('id', models.AutoField(primary_key=True, serialize=False)),
                ('ENABLED', models.BooleanField(default=True)),
                ('BINARY', models.CharField(default='node', max_length=255)),
                ('ARGS', models.CharField(default='', max_length=255)),
            ],
            options={
                'verbose_name': 'Package Manager: npm',
                'abstract': False,
            },
        ),
        migrations.CreateModel(
            name='PipEnvironmentDependency',
            fields=[
                ('id', models.AutoField(primary_key=True, serialize=False)),
                ('ENABLED', models.BooleanField(default=True)),
                ('BINARY', models.CharField(default='pip3', max_length=255)),
                ('ARGS', models.CharField(default='', max_length=255)),
            ],
            options={
                'verbose_name': 'Package Manager: pip',
                'abstract': False,
            },
        ),
        migrations.CreateModel(
            name='PythonEnvironmentDependency',
            fields=[
                ('id', models.AutoField(primary_key=True, serialize=False)),
                ('ENABLED', models.BooleanField(default=True, editable=False)),
                ('BINARY', models.CharField(default='python3', max_length=255)),
                ('ARGS', models.CharField(default='-c', max_length=255)),
            ],
            options={
                'verbose_name': 'Shell Environment: Python3',
                'abstract': False,
            },
        ),
        migrations.CreateModel(
            name='SQLiteDependency',
            fields=[
                ('id', models.AutoField(primary_key=True, serialize=False)),
                ('ENABLED', models.BooleanField(default=True, editable=False)),
                ('BINARY', models.CharField(default='sqlite3', editable=False, max_length=255)),
                ('ARGS', models.CharField(default=[], editable=False, max_length=255)),
            ],
            options={
                'verbose_name': 'Internal Dependency: SQLite3 Package',
                'abstract': False,
            },
        ),
    ]
|
|
@ -0,0 +1,33 @@
|
|||
# Generated by Django 3.1.14 on 2024-01-24 09:43
|
||||
|
||||
from django.db import migrations
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
    """Replace the verbose_name of five dependency models with shorter labels."""

    dependencies = [
        ('system', '0001_initial'),
    ]

    operations = [
        migrations.AlterModelOptions(
            name='archiveboxdependency',
            options={'verbose_name': 'Internal Dependency: archivebox'},
        ),
        migrations.AlterModelOptions(
            name='djangodependency',
            options={'verbose_name': 'Internal Dependency: django'},
        ),
        migrations.AlterModelOptions(
            name='nodejsenvironmentdependency',
            options={'verbose_name': 'Shell Environment: node'},
        ),
        migrations.AlterModelOptions(
            name='pythonenvironmentdependency',
            options={'verbose_name': 'Shell Environment: python3'},
        ),
        migrations.AlterModelOptions(
            name='sqlitedependency',
            options={'verbose_name': 'Internal Dependency: sqlite3'},
        ),
    ]
|
|
@ -0,0 +1,22 @@
|
|||
# Generated by Django 3.1.14 on 2024-01-24 09:56
|
||||
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
    """Give the bash dependency a plural verbose name and a VERSION_CMD column."""

    dependencies = [
        ('system', '0002_auto_20240124_0943'),
    ]

    operations = [
        migrations.AlterModelOptions(
            name='bashenvironmentdependency',
            options={
                'verbose_name': 'Shell Environment: bash',
                'verbose_name_plural': 'Shell Environments: bash',
            },
        ),
        migrations.AddField(
            model_name='bashenvironmentdependency',
            name='VERSION_CMD',
            field=models.CharField(default='{BINARY} --version', max_length=255),
        ),
    ]
|
0
archivebox/plugins/system/migrations/__init__.py
Normal file
0
archivebox/plugins/system/migrations/__init__.py
Normal file
448
archivebox/plugins/system/models.py
Normal file
448
archivebox/plugins/system/models.py
Normal file
|
@ -0,0 +1,448 @@
|
|||
# __package__ = 'archivebox.plugins.system'
|
||||
|
||||
|
||||
import os
|
||||
import shutil
|
||||
import sys
|
||||
import inspect
|
||||
import django
|
||||
from sqlite3 import dbapi2 as sqlite3
|
||||
|
||||
from pathlib import Path
|
||||
from typing import List, Dict, Any
|
||||
|
||||
from django.db import models
|
||||
from django.utils.functional import cached_property
|
||||
|
||||
from solo.models import SingletonModel
|
||||
|
||||
from config import bin_path, bin_version, VERSION
|
||||
|
||||
from plugins.defaults.models import ArchiveBoxBaseDependency
|
||||
|
||||
ConfigDict = Dict[str, Any]
|
||||
|
||||
|
||||
class BashEnvironmentDependency(ArchiveBoxBaseDependency):
    """Configuration model for the bash shell environment (a required dependency)."""

    singleton_instance_id = 1

    id = models.AutoField(primary_key=True)

    NAME = "BASH"
    LABEL = "Bash"
    REQUIRED = True

    PARENT_DEPENDENCIES = []

    # what must be present for this dependency, grouped by where it comes from
    BIN_DEPENDENCIES: List[str] = ["bash"]
    APT_DEPENDENCIES: List[str] = []
    BREW_DEPENDENCIES: List[str] = []
    PIP_DEPENDENCIES: List[str] = []
    NPM_DEPENDENCIES: List[str] = []

    DEFAULT_BINARY = "bash"
    DEFAULT_START_CMD = None
    DEFAULT_STOP_CMD = None
    DEFAULT_PID_FILE = None
    DEFAULT_ARGS = "-c"

    # ENABLED is not editable because this dependency is REQUIRED
    ENABLED = models.BooleanField(default=True, editable=not REQUIRED)
    BINARY = models.CharField(max_length=255, default=DEFAULT_BINARY)
    ARGS = models.CharField(max_length=255, default=DEFAULT_ARGS)

    VERSION_CMD = models.CharField(max_length=255, default="{BINARY} --version")

    # START_CMD = models.CharField(max_length=255, default=DEFAULT_START_CMD)
    # WORKERS = models.IntegerField(default=1)

    class Meta:
        abstract = False
        app_label = "system"
        verbose_name = "Shell Environment: bash"
        verbose_name_plural = "Shell Environments: bash"

    # @task
    def install_pkgs(self, os_pkgs=()):
        """Assert that every binary named in `os_pkgs` resolves via bin_path().

        Nothing is installed here; the method only checks and returns True.
        Raises AssertionError when bash is not valid or a binary is missing.
        """
        assert self.is_valid, 'Bash environment is not available on this host'

        # all() stops at the first unresolved binary, like the per-item asserts it replaces
        assert all(bin_path(binary_name) for binary_name in os_pkgs)

        return True
|
||||
|
||||
class PythonEnvironmentDependency(ArchiveBoxBaseDependency):
    """Configuration model for the python3 interpreter (a required dependency)."""

    singleton_instance_id = 1

    id = models.AutoField(primary_key=True)

    NAME = "PYTHON"
    LABEL = "Python"
    REQUIRED = True

    PARENT_DEPENDENCIES = []

    BIN_DEPENDENCIES = ["python3"]
    APT_DEPENDENCIES = []
    BREW_DEPENDENCIES = []
    PIP_DEPENDENCIES = []
    NPM_DEPENDENCIES = []

    DEFAULT_BINARY = "python3"
    DEFAULT_START_CMD = None
    DEFAULT_STOP_CMD = None
    DEFAULT_PID_FILE = None
    DEFAULT_ARGS = "-c"
    # plain class attribute here (not a database column)
    VERSION_CMD = "{BINARY} --version"

    # ENABLED is not editable because this dependency is REQUIRED
    ENABLED = models.BooleanField(default=True, editable=not REQUIRED)
    BINARY = models.CharField(max_length=255, default=DEFAULT_BINARY)
    ARGS = models.CharField(max_length=255, default=DEFAULT_ARGS)

    # START_CMD = models.CharField(max_length=255, default=DEFAULT_START_CMD)
    # WORKERS = models.IntegerField(default=1)

    class Meta:
        abstract = False
        app_label = "system"
        verbose_name = "Shell Environment: python3"
|
||||
|
||||
class NodeJSEnvironmentDependency(ArchiveBoxBaseDependency, SingletonModel):
    """Singleton configuration model for the node runtime."""

    singleton_instance_id = 1

    id = models.AutoField(primary_key=True)

    NAME = "NODEJS"
    LABEL = "NodeJS"
    REQUIRED = True

    PARENT_DEPENDENCIES = []

    BIN_DEPENDENCIES = ["node"]
    APT_DEPENDENCIES = []
    BREW_DEPENDENCIES = []
    PIP_DEPENDENCIES = []
    NPM_DEPENDENCIES = []

    DEFAULT_BINARY = "node"
    DEFAULT_START_CMD = None
    DEFAULT_STOP_CMD = None
    DEFAULT_PID_FILE = None
    DEFAULT_ARGS = "-c"
    # plain class attribute here (not a database column)
    VERSION_CMD = "{BINARY} --version"

    # NOTE(review): ENABLED stays editable even though REQUIRED is True,
    # unlike the bash/python models -- confirm this is intentional.
    ENABLED = models.BooleanField(default=True, editable=True)
    BINARY = models.CharField(max_length=255, default=DEFAULT_BINARY)
    ARGS = models.CharField(max_length=255, default=DEFAULT_ARGS)

    # START_CMD = models.CharField(max_length=255, default=DEFAULT_START_CMD)
    # WORKERS = models.IntegerField(default=1)

    class Meta:
        abstract = False
        app_label = "system"
        verbose_name = "Shell Environment: node"
|
||||
|
||||
|
||||
class AptEnvironmentDependency(ArchiveBoxBaseDependency, SingletonModel):
    """Singleton configuration model for the apt package manager (optional)."""

    singleton_instance_id = 1

    id = models.AutoField(primary_key=True)

    NAME = 'APT'
    LABEL = "apt"
    REQUIRED = False

    PARENT_DEPENDENCIES = ['BashEnvironmentDependency']

    BIN_DEPENDENCIES = ['apt-get']
    APT_DEPENDENCIES = []
    BREW_DEPENDENCIES = []
    PIP_PACKAGES = []
    NPM_PACKAGES = []

    DEFAULT_BINARY = 'apt-get'
    DEFAULT_START_CMD = None
    DEFAULT_STOP_CMD = None
    DEFAULT_PID_FILE = None
    DEFAULT_ARGS = '-qq'

    ENABLED = models.BooleanField(default=True, editable=not REQUIRED)
    BINARY = models.CharField(max_length=255, default=DEFAULT_BINARY)
    ARGS = models.CharField(max_length=255, default=DEFAULT_ARGS)

    class Meta:
        abstract = False
        app_label = 'system'
        verbose_name = "Package Manager: apt"

    # @task
    def install_pkgs(self, apt_pkgs=()):
        """Refresh the apt package index, then install each package in `apt_pkgs`.

        Returns:
            True once every apt-get invocation has exited successfully.

        Raises:
            AssertionError: if `self.is_valid` is falsy.
            subprocess.CalledProcessError: if any apt-get command exits non-zero.
        """
        # imported locally so the method does not rely on a module-level `run`
        # helper (none is imported by this module)
        import subprocess

        assert self.is_valid, 'Apt environment is not available on this host'

        # with huey.lock_task('apt-install'):

        # NOTE(review): commands use DEFAULT_BINARY, not the configurable BINARY field -- confirm
        # check=True makes a failed command raise instead of silently returning True
        subprocess.run([self.DEFAULT_BINARY, '-qq', 'update'], check=True)
        for apt_package in apt_pkgs:
            subprocess.run([self.DEFAULT_BINARY, 'install', '-y', apt_package], check=True)

        return True
|
||||
|
||||
class BrewEnvironmentDependency(ArchiveBoxBaseDependency, SingletonModel):
    """Singleton configuration model for the homebrew package manager (optional)."""

    singleton_instance_id = 1

    id = models.AutoField(primary_key=True)

    NAME = 'BREW'
    LABEL = "homebrew"
    REQUIRED = False

    PARENT_DEPENDENCIES = ['BashEnvironmentDependency']

    BIN_DEPENDENCIES = ['brew']
    APT_DEPENDENCIES = []
    BREW_DEPENDENCIES = []
    PIP_PACKAGES = []
    NPM_PACKAGES = []

    DEFAULT_BINARY = 'brew'
    DEFAULT_START_CMD = None
    DEFAULT_STOP_CMD = None
    DEFAULT_PID_FILE = None
    DEFAULT_ARGS = ''

    ENABLED = models.BooleanField(default=True, editable=True)
    BINARY = models.CharField(max_length=255, default=DEFAULT_BINARY)
    ARGS = models.CharField(max_length=255, default=DEFAULT_ARGS)

    class Meta:
        abstract = False
        app_label = 'system'
        verbose_name = "Package Manager: brew"

    # @task
    def install_pkgs(self, brew_pkgs=()):
        """Run `brew update`, then `brew install` for each package in `brew_pkgs`.

        Returns:
            True once every brew invocation has exited successfully.

        Raises:
            AssertionError: if `self.is_valid` is falsy.
            subprocess.CalledProcessError: if any brew command exits non-zero.
        """
        # imported locally so the method does not rely on a module-level `run`
        # helper (none is imported by this module)
        import subprocess

        assert self.is_valid, 'Brew environment is not available on this host'

        # NOTE(review): commands use DEFAULT_BINARY, not the configurable BINARY field -- confirm
        # check=True makes a failed command raise instead of silently returning True
        subprocess.run([self.DEFAULT_BINARY, 'update'], check=True)

        for brew_pkg in brew_pkgs:
            subprocess.run([self.DEFAULT_BINARY, 'install', brew_pkg], check=True)

        return True
|
||||
|
||||
|
||||
|
||||
|
||||
class PipEnvironmentDependency(ArchiveBoxBaseDependency, SingletonModel):
    """Singleton configuration model for the pip package manager (optional)."""

    singleton_instance_id = 1

    id = models.AutoField(primary_key=True)

    NAME = 'PIP'
    LABEL = "pip"
    REQUIRED = False

    PARENT_DEPENDENCIES = ['BashEnvironmentDependency']

    BIN_DEPENDENCIES = ['python3', 'pip3']
    APT_DEPENDENCIES = ['python3.11', 'pip3', 'pipx']
    BREW_DEPENDENCIES = ['python@3.11', 'pipx']
    PIP_PACKAGES = ['setuptools', 'pipx']
    NPM_PACKAGES = []

    DEFAULT_BINARY = 'pip3'
    DEFAULT_START_CMD = None
    DEFAULT_STOP_CMD = None
    DEFAULT_PID_FILE = None
    DEFAULT_ARGS = ''
    VERSION_CMD = '{BINARY} --version'

    ENABLED = models.BooleanField(default=True, editable=True)
    BINARY = models.CharField(max_length=255, default=DEFAULT_BINARY)
    ARGS = models.CharField(max_length=255, default=DEFAULT_ARGS)

    class Meta:
        abstract = False
        app_label = 'system'
        verbose_name = "Package Manager: pip"

    # @task
    def install_pkgs(self, pip_pkgs=()):
        """Install (or upgrade) each package in `pip_pkgs` with pip.

        Returns:
            True once every pip invocation has exited successfully.

        Raises:
            AssertionError: if `self.is_valid` is falsy.
            subprocess.CalledProcessError: if any pip command exits non-zero.
        """
        # imported locally so the method does not rely on a module-level `run`
        # helper (none is imported by this module)
        import subprocess

        assert self.is_valid, 'Pip environment is not available on this host'

        for pip_pkg in pip_pkgs:
            # pip's flag is --upgrade; `--update` is rejected as an unknown option
            subprocess.run(
                [self.DEFAULT_BINARY, 'install', '--upgrade', '--ignore-installed', pip_pkg],
                check=True,
            )

        return True
|
||||
|
||||
|
||||
class NPMEnvironmentDependency(ArchiveBoxBaseDependency, SingletonModel):
    """Singleton configuration model for the npm package manager (optional)."""

    singleton_instance_id = 1

    id = models.AutoField(primary_key=True)

    # own identifiers, so this model no longer shares 'NODEJS' / "NodeJS"
    # with NodeJSEnvironmentDependency
    NAME = 'NPM'
    LABEL = "npm"
    REQUIRED = False

    PARENT_DEPENDENCIES = ['BashEnvironmentDependency']

    BIN_DEPENDENCIES = ['node', 'npm']
    APT_DEPENDENCIES = ['node', 'npm']
    BREW_DEPENDENCIES = ['node', 'npm']
    PIP_PACKAGES = []
    NPM_PACKAGES = ['npm']

    # NOTE(review): left as 'node' because migration 0001 records that default for
    # BINARY; presumably it should become 'npm' with a follow-up migration -- confirm.
    DEFAULT_BINARY = 'node'
    DEFAULT_START_CMD = None
    DEFAULT_STOP_CMD = None
    DEFAULT_PID_FILE = None
    DEFAULT_ARGS = ''
    VERSION_CMD = '{BINARY} --version'

    ENABLED = models.BooleanField(default=True, editable=True)
    BINARY = models.CharField(max_length=255, default=DEFAULT_BINARY)
    ARGS = models.CharField(max_length=255, default=DEFAULT_ARGS)

    class Meta:
        abstract = False
        app_label = 'system'
        verbose_name = "Package Manager: npm"

    # @task
    def install_pkgs(self, npm_pkgs=()):
        """Run `npm install <pkg>` for each package in `npm_pkgs`.

        Returns:
            True once every npm invocation has exited successfully.

        Raises:
            AssertionError: if `self.is_valid` is falsy.
            subprocess.CalledProcessError: if any npm command exits non-zero.
        """
        # imported locally so the method does not rely on a module-level `run`
        # helper (none is imported by this module)
        import subprocess

        assert self.is_valid, 'NPM environment is not available on this host'

        for npm_pkg in npm_pkgs:
            # packages are installed by npm; `node install <pkg>` would try to
            # execute a script called "install" instead
            subprocess.run(['npm', 'install', npm_pkg], check=True)

        return True
|
||||
|
||||
|
||||
class DjangoDependency(ArchiveBoxBaseDependency, SingletonModel):
    """Singleton, read-only model describing the Django package in use."""

    singleton_instance_id = 1

    id = models.AutoField(primary_key=True)

    NAME = "DJANGO"
    LABEL = "Django"
    REQUIRED = True

    PARENT_DEPENDENCIES = []

    BIN_DEPENDENCIES = ["django-admin.py"]
    APT_DEPENDENCIES = []
    BREW_DEPENDENCIES = []
    PIP_PACKAGES = ["django==3.1.14"]
    NPM_PACKAGES = []

    DEFAULT_BINARY = "django-admin.py"
    DEFAULT_START_CMD = "archivebox server 0.0.0.0:8000"
    DEFAULT_PID_FILE = "logs/{NAME}_WORKER.pid"
    DEFAULT_STOP_CMD = 'kill "$(<{PID_FILE})"'
    # NOTE(review): a list is used as the default of the ARGS CharField below,
    # while sibling models use a str -- confirm this is intended.
    DEFAULT_ARGS = []
    VERSION_CMD = "{BINARY} --version"

    # none of the columns can be edited for this internal dependency
    ENABLED = models.BooleanField(default=True, editable=False)
    BINARY = models.CharField(max_length=255, default=DEFAULT_BINARY, editable=False)
    ARGS = models.CharField(max_length=255, default=DEFAULT_ARGS, editable=False)

    class Meta:
        abstract = False
        app_label = "system"
        verbose_name = "Internal Dependency: django"

    @cached_property
    def bin_path(self):
        """Filesystem path of the imported `django` package module."""
        return inspect.getfile(django)

    @cached_property
    def bin_version(self):
        """First three components of `django.VERSION`, joined with dots."""
        version_parts = django.VERSION[:3]
        return ".".join(map(str, version_parts))
|
||||
|
||||
|
||||
class SQLiteDependency(ArchiveBoxBaseDependency, SingletonModel):
    """Singleton, read-only model describing the SQLite library used via Python's sqlite3."""

    singleton_instance_id = 1

    id = models.AutoField(primary_key=True)

    NAME = 'SQLITE'
    LABEL = "SQLite"
    REQUIRED = True

    PARENT_DEPENDENCIES = []

    BIN_DEPENDENCIES = []
    APT_DEPENDENCIES = []
    BREW_DEPENDENCIES = []
    PIP_PACKAGES = []
    NPM_PACKAGES = []

    DEFAULT_BINARY = 'sqlite3'
    DEFAULT_START_CMD = None
    DEFAULT_STOP_CMD = None
    DEFAULT_PID_FILE = None
    # NOTE(review): a list is used as the default of the ARGS CharField below,
    # while sibling models use a str -- confirm this is intended.
    DEFAULT_ARGS = []
    VERSION_CMD = 'python3 -c ""'

    # none of the columns can be edited for this internal dependency
    ENABLED = models.BooleanField(default=True, editable=False)
    BINARY = models.CharField(max_length=255, default=DEFAULT_BINARY, editable=False)
    ARGS = models.CharField(max_length=255, default=DEFAULT_ARGS, editable=False)

    class Meta:
        abstract = False
        app_label = 'system'
        verbose_name = "Internal Dependency: sqlite3"

    @cached_property
    def bin_path(self):
        """Filesystem path of the imported `sqlite3.dbapi2` module."""
        return inspect.getfile(sqlite3)

    @cached_property
    def bin_version(self):
        """Version string of the SQLite library the interpreter is linked against.

        `sqlite3.version` is only the version of the Python DB-API module
        (deprecated in Python 3.12, removed in 3.14); `sqlite_version` reports
        the actual SQLite library.
        """
        return sqlite3.sqlite_version
|
||||
|
||||
class ArchiveBoxDependency(ArchiveBoxBaseDependency):
    """Read-only model describing the running ArchiveBox package itself."""

    singleton_instance_id = 1

    id = models.AutoField(primary_key=True)

    NAME = "ARCHIVEBOX"
    LABEL = "ArchiveBox"
    REQUIRED = True

    PARENT_DEPENDENCIES = [
        "PipEnvironmentDependency",
        "DjangoDependency",
        "SQLiteDependency",
    ]

    BIN_DEPENDENCIES = ["archivebox"]
    APT_DEPENDENCIES = []
    BREW_DEPENDENCIES = []
    PIP_PACKAGES = ["archivebox"]
    NPM_PACKAGES = []

    DEFAULT_BINARY = "archivebox"
    DEFAULT_START_CMD = "{BINARY} server 0.0.0.0:8000"
    # NOTE(review): a list is used as the default of the ARGS CharField below,
    # while sibling models use a str -- confirm this is intended.
    DEFAULT_ARGS = []
    VERSION_CMD = "archivebox --version"

    # none of the columns can be edited for this internal dependency
    ENABLED = models.BooleanField(default=True, editable=False)
    BINARY = models.CharField(max_length=255, default=DEFAULT_BINARY, editable=False)
    ARGS = models.CharField(max_length=255, default=DEFAULT_ARGS, editable=False)

    class Meta:
        abstract = False
        app_label = "system"
        verbose_name = "Internal Dependency: archivebox"

    @cached_property
    def bin_path(self):
        """Return `sys.argv[0]`, or the module-level bin_path('archivebox') lookup if it is empty."""
        entrypoint = sys.argv[0]
        if entrypoint:
            return entrypoint
        # inside the method body `bin_path` resolves to the function imported from config
        return bin_path('archivebox')

    @cached_property
    def bin_version(self):
        """Return the VERSION constant imported from config."""
        # return config['VERSION']
        return VERSION
|
||||
|
3
archivebox/plugins/system/settings.py
Normal file
3
archivebox/plugins/system/settings.py
Normal file
|
@ -0,0 +1,3 @@
|
|||
from django.conf import settings
|
||||
|
||||
from plugins.defaults import register_plugin_settings
|
118
archivebox/templates/static/sw.js
Normal file
118
archivebox/templates/static/sw.js
Normal file
File diff suppressed because one or more lines are too long
3390
archivebox/templates/static/ui.js
Normal file
3390
archivebox/templates/static/ui.js
Normal file
File diff suppressed because one or more lines are too long
|
@ -271,7 +271,11 @@ def get_headers(url: str, timeout: int=None) -> str:
|
|||
|
||||
return pyjson.dumps(
|
||||
{
|
||||
'URL': url,
|
||||
'Status-Code': response.status_code,
|
||||
'Elapsed': response.elapsed,
|
||||
'Encoding': response.encoding,
|
||||
'Apparent-Encoding': response.apparent_encoding,
|
||||
**dict(response.headers),
|
||||
},
|
||||
indent=4,
|
||||
|
|
|
@ -64,7 +64,7 @@ if [[ -d "$DATA_DIR/archive" ]]; then
|
|||
rm -f "$DATA_DIR/archive/.permissions_test_safe_to_delete"
|
||||
# echo "[√] Permissions are correct"
|
||||
else
|
||||
# the only time this fails is if the host filesystem doesn't allow us to write as root (e.g. some NFS mapall/maproot problems, connection issues, drive disappeared, etc.)
|
||||
# the only time this fails is if the host filesystem doesn't allow us to write as root (e.g. some NFS mapall/maproot problems, connection issues, drive disappeared, etc.)
|
||||
echo -e "\n[X] Error: archivebox user (PUID=$PUID) is not able to write to your ./data/archive dir (currently owned by $(stat -c '%u' "$DATA_DIR/archive"):$(stat -c '%g' "$DATA_DIR/archive")." > /dev/stderr
|
||||
echo -e " Change ./data to be owned by PUID=$PUID PGID=$PGID on the host and retry:" > /dev/stderr
|
||||
echo -e " \$ chown -R $PUID:$PGID ./data\n" > /dev/stderr
|
||||
|
@ -89,7 +89,8 @@ if ! chown $PUID:$PGID "$DATA_DIR"/* > /dev/null 2>&1; then
|
|||
find "$DATA_DIR" -type d -not -path "$DATA_DIR/archive*" -exec chown $PUID:$PGID {} \; > /dev/null 2>&1
|
||||
find "$DATA_DIR" -type f -not -path "$DATA_DIR/archive/*" -exec chown $PUID:$PGID {} \; > /dev/null 2>&1
|
||||
fi
|
||||
|
||||
mkdir -p /var/spool/cron/crontabs
|
||||
chown -R $PUID:$PGID /var/spool/cron/crontabs > /dev/null 2>&1 &
|
||||
|
||||
# also chown BROWSERS_DIR because otherwise 'archivebox setup' wont be able to 'playwright install chromium' at runtime
|
||||
export PLAYWRIGHT_BROWSERS_PATH="${PLAYWRIGHT_BROWSERS_PATH:-/browsers}"
|
||||
|
@ -191,9 +192,11 @@ if [[ "$1" == /* || "$1" == "bash" || "$1" == "sh" || "$1" == "echo" || "$1" ==
|
|||
# "docker run archivebox /bin/bash -c '...'"
|
||||
# "docker run archivebox cat /VERSION.txt"
|
||||
exec gosu "$PUID" /bin/bash -c "exec $(printf ' %q' "$@")"
|
||||
# WARNING: make sure to test extensively if you change this line, there are many edge-cases with nested quotes, special character, etc.
|
||||
# printf requotes shell parameters properly https://stackoverflow.com/a/39463371/2156113
|
||||
# gosu spawns an ephemeral bash process owned by archivebox user (bash wrapper is needed to load env vars, PATH, and setup terminal TTY)
|
||||
# outermost exec hands over current process ID to inner bash process, inner exec hands over inner bash PID to user's command
|
||||
# - https://github.com/ArchiveBox/ArchiveBox/issues/1191
|
||||
else
|
||||
# handle "docker run archivebox add some subcommand --with=args abc" by calling archivebox to run as args as CLI subcommand
|
||||
# e.g. "docker run archivebox help"
|
||||
|
|
38
bin/docker_ipc_listener.py
Executable file
38
bin/docker_ipc_listener.py
Executable file
|
@ -0,0 +1,38 @@
|
|||
#!/usr/bin/env python3

# Allow another docker container to run commands on this container
# This is the script to run on the server container.
# The client can connect and run a command like so:
#    $ echo whoami | nc servercontainername 2222
#    root
#
# WARNING: every command received on the socket is executed through a shell
# with no authentication. Only expose this port on a trusted, private network.

import socket
import subprocess as sp
from datetime import datetime

LISTEN_HOST = "0.0.0.0"
LISTEN_PORT = 2222


def handle_client(conn, addr):
    """Read one command from `conn`, run it in a shell, and stream its output back.

    `addr` is the peer address returned by accept(); it is used for logging only.
    Returns without doing anything if the client sent no data.
    """
    cmd = conn.recv(1024).decode(errors="replace")
    if not cmd:
        return

    timestamp = datetime.now().isoformat()
    # log the peer's address (getsockname() would report our own listening address)
    client_ip, client_port = addr[:2]
    print(f'\n[{timestamp}][{client_ip}:{client_port}] $', cmd)

    # stdin is /dev/null so a command that reads input sees EOF instead of
    # blocking the listener forever on a pipe nobody writes to
    with sp.Popen(cmd, shell=True, stdout=sp.PIPE, stderr=sp.STDOUT, stdin=sp.DEVNULL, bufsize=1, universal_newlines=True) as proc:
        for line in proc.stdout:
            print(line.strip(), flush=True)
            try:
                conn.sendall(line.encode("utf-8"))
            except OSError:
                # the client went away: stop the command rather than crash the listener
                proc.kill()
                break


def main():
    """Accept connections forever, handling one command per connection."""
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as server:
        server.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        server.bind((LISTEN_HOST, LISTEN_PORT))
        server.listen(1)
        print(f"Listening for shell commands on {LISTEN_HOST}:{LISTEN_PORT}", flush=True)

        while True:
            conn, addr = server.accept()
            # `with conn` closes every connection, including ones that sent nothing
            with conn:
                try:
                    handle_client(conn, addr)
                except OSError as err:
                    print(f'[!] connection error: {err}', flush=True)


if __name__ == "__main__":
    main()
|
|
@ -39,7 +39,6 @@ services:
|
|||
# dns:
|
||||
# - 172.20.0.53
|
||||
|
||||
|
||||
######## Optional Addons: tweak examples below as needed for your specific use case ########
|
||||
|
||||
### This optional container runs any scheduled tasks in the background, add new tasks like so:
|
||||
|
@ -188,6 +187,13 @@ services:
|
|||
# - ./wireguard.conf:/config/wg0.conf:ro
|
||||
|
||||
|
||||
### Example: Run browsertrix in parallel with ArchiveBox
|
||||
|
||||
# browsertrix:
|
||||
# image: webrecorder/browsertrix-crawler:latest
|
||||
# volumes:
|
||||
# - ./browsertrix:/crawls:z
|
||||
|
||||
### Example: Run PYWB in parallel and auto-import WARCs from ArchiveBox
|
||||
|
||||
# pywb:
|
||||
|
|
|
@ -7,7 +7,12 @@ wsgi-file = archivebox/core/wsgi.py
|
|||
processes = 4
|
||||
threads = 1
|
||||
stats = 127.0.0.1:9191
|
||||
static-map /static=./archivebox/templates/static
|
||||
static-map = /static=./archivebox/templates/static
|
||||
static-map = /static=./archivebox/plugins/replaywebpage/static
|
||||
static-map = /archive=$(PWD)/archive
|
||||
static-index = index.html
|
||||
harakiri = 172800
|
||||
post-buffering = 1
|
||||
disable-logging = True
|
||||
check-static
|
||||
honour-range = True
|
23
package.json
23
package.json
|
@ -1,13 +1,14 @@
|
|||
{
|
||||
"name": "archivebox",
|
||||
"version": "0.8.0",
|
||||
"description": "ArchiveBox: The self-hosted internet archive",
|
||||
"author": "Nick Sweeting <archivebox-npm@sweeting.me>",
|
||||
"repository": "github:ArchiveBox/ArchiveBox",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"@postlight/parser": "^2.2.3",
|
||||
"readability-extractor": "github:ArchiveBox/readability-extractor",
|
||||
"single-file-cli": "^1.1.54"
|
||||
}
|
||||
"name": "archivebox",
|
||||
"version": "0.7.3",
|
||||
"description": "ArchiveBox: The self-hosted internet archive",
|
||||
"author": "Nick Sweeting <archivebox-npm@sweeting.me>",
|
||||
"repository": "github:ArchiveBox/ArchiveBox",
|
||||
"license": "MIT",
|
||||
"dependencies":
|
||||
{
|
||||
"@postlight/parser": "^2.2.3",
|
||||
"readability-extractor": "github:ArchiveBox/readability-extractor",
|
||||
"single-file-cli": "^1.1.54"
|
||||
}
|
||||
}
|
||||
|
|
29
pdm.lock
29
pdm.lock
|
@ -359,6 +359,19 @@ files = [
|
|||
{file = "django_ninja-1.1.0.tar.gz", hash = "sha256:87bff046416a2653ed2fbef1408e101292bf8170684821bac82accfd73bef059"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "django-solo"
|
||||
version = "2.0.0"
|
||||
requires_python = ">=3.6"
|
||||
summary = "Django Solo helps working with singletons"
|
||||
dependencies = [
|
||||
"django>=2.2",
|
||||
]
|
||||
files = [
|
||||
{file = "django-solo-2.0.0.tar.gz", hash = "sha256:7c6dbe04ae858a4645b580ec83a31a960a067ad4525d8227cca50b7fc5983a62"},
|
||||
{file = "django_solo-2.0.0-py3-none-any.whl", hash = "sha256:9046eca738f2ed64dbef38c2107a02af1065a8899b4f9fabf61b06b8325de1b4"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "exceptiongroup"
|
||||
version = "1.2.1"
|
||||
|
@ -902,22 +915,12 @@ files = [
|
|||
|
||||
[[package]]
|
||||
name = "setuptools"
|
||||
version = "69.5.1"
|
||||
version = "69.0.3"
|
||||
requires_python = ">=3.8"
|
||||
summary = "Easily download, build, install, upgrade, and uninstall Python packages"
|
||||
groups = ["default"]
|
||||
files = [
|
||||
{file = "setuptools-69.5.1-py3-none-any.whl", hash = "sha256:c636ac361bc47580504644275c9ad802c50415c7522212252c033bd15f301f32"},
|
||||
{file = "setuptools-69.5.1.tar.gz", hash = "sha256:6c1fccdac05a97e598fb0ae3bbed5904ccb317337a51139dcd51453611bbb987"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "sgmllib3k"
|
||||
version = "1.0.0"
|
||||
summary = "Py3k port of sgmllib."
|
||||
groups = ["default"]
|
||||
files = [
|
||||
{file = "sgmllib3k-1.0.0.tar.gz", hash = "sha256:7868fb1c8bfa764c1ac563d3cf369c381d1325d36124933a726f29fcdaa812e9"},
|
||||
{file = "setuptools-69.0.3-py3-none-any.whl", hash = "sha256:385eb4edd9c9d5c17540511303e39a147ce2fc04bc55289c322b9e5904fe2c05"},
|
||||
{file = "setuptools-69.0.3.tar.gz", hash = "sha256:be1af57fc409f93647f2e8e4573a142ed38724b8cdd389706a867bb4efcf1e78"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
|
|
@ -16,6 +16,7 @@ dependencies = [
|
|||
"setuptools>=69.5.1",
|
||||
"django>=4.2.0,<5.0",
|
||||
"django-ninja>=1.1.0",
|
||||
"django-solo>=2.0.0",
|
||||
"django-extensions>=3.2.3",
|
||||
"mypy-extensions>=1.0.0",
|
||||
|
||||
|
@ -123,7 +124,9 @@ test = [
|
|||
lint = [
|
||||
"flake8",
|
||||
"mypy",
|
||||
"django-stubs",
|
||||
"django-stubs[compatible-mypy]>=4.2.7",
|
||||
"types-requests>=2.31.0.20240125",
|
||||
"pudb>=2024.1",
|
||||
]
|
||||
|
||||
[build-system]
|
||||
|
@ -133,6 +136,21 @@ build-backend = "pdm.backend"
|
|||
[project.scripts]
|
||||
archivebox = "archivebox.cli:main"
|
||||
|
||||
[tool.pyright]
|
||||
include = ["archivebox"]
|
||||
exclude = ["data", "data2", "data3", "data4", "data5", "pip_dist", "brew_dist", "dist", "vendor", "migrations", "tests"]
|
||||
|
||||
[tool.mypy]
|
||||
mypy_path = "archivebox"
|
||||
explicit_package_bases = true
|
||||
check_untyped_defs = true
|
||||
plugins = ["mypy_django_plugin.main"]
|
||||
# TODO: remove this eventually https://github.com/hauntsaninja/no_implicit_optional
|
||||
implicit_optional = true
|
||||
|
||||
[tool.django-stubs]
|
||||
django_settings_module = "core.settings"
|
||||
strict_settings = false
|
||||
|
||||
[tool.pdm.scripts]
|
||||
lint = "./bin/lint.sh"
|
||||
|
@ -142,19 +160,6 @@ test = "./bin/test.sh"
|
|||
[tool.pytest.ini_options]
|
||||
testpaths = [ "tests" ]
|
||||
|
||||
[tool.mypy]
|
||||
mypy_path = "archivebox"
|
||||
namespace_packages = true
|
||||
explicit_package_bases = true
|
||||
# follow_imports = "silent"
|
||||
# ignore_missing_imports = true
|
||||
# disallow_incomplete_defs = true
|
||||
# disallow_untyped_defs = true
|
||||
# disallow_untyped_decorators = true
|
||||
# exclude = "pdm/(pep582/|models/in_process/.+\\.py)"
|
||||
plugins = ["mypy_django_plugin.main"]
|
||||
|
||||
|
||||
|
||||
[project.urls]
|
||||
Homepage = "https://github.com/ArchiveBox/ArchiveBox"
|
||||
|
|
Loading…
Reference in a new issue