1
0
Fork 0
mirror of synced 2024-05-17 10:53:34 +12:00
This commit is contained in:
Nick Sweeting 2024-04-24 19:43:51 -07:00 committed by GitHub
commit 0c0ea7e4f5
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
66 changed files with 12650 additions and 114 deletions

4
.gitignore vendored
View file

@ -26,11 +26,9 @@ dist/
# Data folders
data/
data1/
data2/
data3/
data*/
output/
# vim
*.sw?
.vscode/

View file

@ -15,8 +15,8 @@
# Read more about [developing Archivebox](https://github.com/ArchiveBox/ArchiveBox#archivebox-development).
# Use Debian 12 w/ faster package updates: https://packages.debian.org/bookworm-backports/
FROM python:3.11-slim-bookworm
# Uses Debian 12 w/ faster-updating apt-lists added below: https://packages.debian.org/bookworm-backports/
LABEL name="archivebox" \
maintainer="Nick Sweeting <dockerfile@archivebox.io>" \
@ -127,9 +127,9 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$T
# 1. packaging dependencies
apt-transport-https ca-certificates apt-utils gnupg2 curl wget \
# 2. docker and init system dependencies
zlib1g-dev dumb-init gosu cron unzip grep \
zlib1g-dev dumb-init gosu cron unzip grep ncat \
# 3. frivolous CLI helpers to make debugging failed archiving easier
# nano iputils-ping dnsutils htop procps jq yq
# nano iputils-ping dnsutils htop procps jq yq \
&& rm -rf /var/lib/apt/lists/*
######### Language Environments ####################################

View file

@ -38,7 +38,7 @@ from hashlib import md5
from pathlib import Path
from datetime import datetime, timezone
from typing import Optional, Type, Tuple, Dict, Union, List
from subprocess import run, PIPE, DEVNULL
from subprocess import run, PIPE, STDOUT, DEVNULL
from configparser import ConfigParser
from collections import defaultdict
import importlib.metadata
@ -854,7 +854,7 @@ def hint(text: Union[Tuple[str, ...], List[str], str], prefix=' ', config: Op
# Dependency Metadata Helpers
def bin_version(binary: Optional[str]) -> Optional[str]:
def bin_version(binary: Optional[str], cmd=None) -> Optional[str]:
"""check the presence and return valid version line of a specified binary"""
abspath = bin_path(binary)
@ -863,11 +863,21 @@ def bin_version(binary: Optional[str]) -> Optional[str]:
try:
bin_env = os.environ | {'LANG': 'C'}
version_str = run([abspath, "--version"], stdout=PIPE, env=bin_env).stdout.strip().decode()
is_cmd_str = cmd and isinstance(cmd, str)
version_str = run(cmd or [abspath, "--version"], shell=is_cmd_str, stdout=PIPE, stderr=STDOUT, env=bin_env).stdout.strip().decode()
if not version_str:
version_str = run([abspath, "--version"], stdout=PIPE).stdout.strip().decode()
# take first 3 columns of first line of version info
return ' '.join(version_str.split('\n')[0].strip().split()[:3])
version_str = run(cmd or [abspath, "--version"], shell=is_cmd_str, stdout=PIPE, stderr=STDOUT).stdout.strip().decode()
version_ptn = re.compile(r"\d+?\.\d+?\.?\d*", re.MULTILINE)
try:
version_nums = version_ptn.findall(version_str.split('\n')[0])[0]
if version_nums:
return version_nums
else:
raise IndexError
except IndexError:
# take first 3 columns of first line of version info
return ' '.join(version_str.split('\n')[0].strip().split()[:3])
except OSError:
pass
# stderr(f'[X] Unable to find working version of dependency: {binary}', color='red')

View file

@ -9,6 +9,8 @@ SimpleConfigValueDict = Dict[str, SimpleConfigValue]
SimpleConfigValueGetter = Callable[[], SimpleConfigValue]
ConfigValue = Union[SimpleConfigValue, SimpleConfigValueDict, SimpleConfigValueGetter]
SHArgs = List[str] # shell command args list e.g. ["--something=1", "--someotherarg"]
class BaseConfig(TypedDict):
pass
@ -16,10 +18,10 @@ class BaseConfig(TypedDict):
class ConfigDict(BaseConfig, total=False):
"""
# Regenerate by pasting this quine into `archivebox shell` 🥚
from archivebox.config import ConfigDict, CONFIG_DEFAULTS
from archivebox.config import ConfigDict, CONFIG_SCHEMA
print('class ConfigDict(BaseConfig, total=False):')
print(' ' + '"'*3 + ConfigDict.__doc__ + '"'*3)
for section, configs in CONFIG_DEFAULTS.items():
for section, configs in CONFIG_SCHEMA.items():
for key, attrs in configs.items():
Type, default = attrs['type'], attrs['default']
if default is None:
@ -32,16 +34,23 @@ class ConfigDict(BaseConfig, total=False):
USE_COLOR: bool
SHOW_PROGRESS: bool
IN_DOCKER: bool
IN_QEMU: bool
PUID: int
PGID: int
PACKAGE_DIR: Path
OUTPUT_DIR: Path
CONFIG_FILE: Path
OUTPUT_DIR: Optional[str]
CONFIG_FILE: Optional[str]
ONLY_NEW: bool
TIMEOUT: int
MEDIA_TIMEOUT: int
OUTPUT_PERMISSIONS: str
RESTRICT_FILE_NAMES: str
URL_DENYLIST: str
URL_ALLOWLIST: Optional[str]
ADMIN_USERNAME: Optional[str]
ADMIN_PASSWORD: Optional[str]
ENFORCE_ATOMIC_WRITES: bool
TAG_SEPARATOR_PATTERN: str
SECRET_KEY: Optional[str]
BIND_ADDR: str
@ -49,7 +58,27 @@ class ConfigDict(BaseConfig, total=False):
DEBUG: bool
PUBLIC_INDEX: bool
PUBLIC_SNAPSHOTS: bool
PUBLIC_ADD_VIEW: bool
FOOTER_INFO: str
SNAPSHOTS_PER_PAGE: int
CUSTOM_TEMPLATES_DIR: Optional[str]
TIME_ZONE: str
TIMEZONE: str
REVERSE_PROXY_USER_HEADER: str
REVERSE_PROXY_WHITELIST: str
LOGOUT_REDIRECT_URL: str
PREVIEW_ORIGINALS: bool
LDAP: bool
LDAP_SERVER_URI: Optional[str]
LDAP_BIND_DN: Optional[str]
LDAP_BIND_PASSWORD: Optional[str]
LDAP_USER_BASE: Optional[str]
LDAP_USER_FILTER: Optional[str]
LDAP_USERNAME_ATTR: Optional[str]
LDAP_FIRSTNAME_ATTR: Optional[str]
LDAP_LASTNAME_ATTR: Optional[str]
LDAP_EMAIL_ATTR: Optional[str]
LDAP_CREATE_SUPERUSER: bool
SAVE_TITLE: bool
SAVE_FAVICON: bool
@ -58,25 +87,50 @@ class ConfigDict(BaseConfig, total=False):
SAVE_SINGLEFILE: bool
SAVE_READABILITY: bool
SAVE_MERCURY: bool
SAVE_HTMLTOTEXT: bool
SAVE_PDF: bool
SAVE_SCREENSHOT: bool
SAVE_DOM: bool
SAVE_HEADERS: bool
SAVE_WARC: bool
SAVE_GIT: bool
SAVE_MEDIA: bool
SAVE_ARCHIVE_DOT_ORG: bool
SAVE_ALLOWLIST: dict
SAVE_DENYLIST: dict
RESOLUTION: str
GIT_DOMAINS: str
CHECK_SSL_VALIDITY: bool
MEDIA_MAX_SIZE: str
CURL_USER_AGENT: str
WGET_USER_AGENT: str
CHROME_USER_AGENT: str
COOKIES_FILE: Union[str, Path, None]
CHROME_USER_DATA_DIR: Union[str, Path, None]
COOKIES_FILE: Optional[str]
CHROME_USER_DATA_DIR: Optional[str]
CHROME_TIMEOUT: int
CHROME_HEADLESS: bool
CHROME_SANDBOX: bool
YOUTUBEDL_ARGS: list
WGET_ARGS: list
CURL_ARGS: list
GIT_ARGS: list
SINGLEFILE_ARGS: Optional[list]
FAVICON_PROVIDER: str
USE_INDEXING_BACKEND: bool
USE_SEARCHING_BACKEND: bool
SEARCH_BACKEND_ENGINE: str
SEARCH_BACKEND_HOST_NAME: str
SEARCH_BACKEND_PORT: int
SEARCH_BACKEND_PASSWORD: str
SEARCH_PROCESS_HTML: bool
SONIC_COLLECTION: str
SONIC_BUCKET: str
SEARCH_BACKEND_TIMEOUT: int
FTS_SEPARATE_DATABASE: bool
FTS_TOKENIZERS: str
FTS_SQLITE_MAX_LENGTH: int
USE_CURL: bool
USE_WGET: bool
@ -85,7 +139,9 @@ class ConfigDict(BaseConfig, total=False):
USE_MERCURY: bool
USE_GIT: bool
USE_CHROME: bool
USE_NODE: bool
USE_YOUTUBEDL: bool
USE_RIPGREP: bool
CURL_BINARY: str
GIT_BINARY: str
WGET_BINARY: str
@ -93,13 +149,12 @@ class ConfigDict(BaseConfig, total=False):
READABILITY_BINARY: str
MERCURY_BINARY: str
YOUTUBEDL_BINARY: str
NODE_BINARY: str
RIPGREP_BINARY: str
CHROME_BINARY: Optional[str]
YOUTUBEDL_ARGS: List[str]
WGET_ARGS: List[str]
CURL_ARGS: List[str]
GIT_ARGS: List[str]
TAG_SEPARATOR_PATTERN: str
POCKET_CONSUMER_KEY: Optional[str]
POCKET_ACCESS_TOKENS: dict
READWISE_READER_TOKENS: dict
ConfigDefaultValueGetter = Callable[[ConfigDict], ConfigValue]

View file

@ -1,2 +1,3 @@
__package__ = 'archivebox.core'
# default_app_config = 'core.apps.CoreAppConfig'

View file

@ -12,6 +12,7 @@ from django.utils.html import format_html
from django.utils.safestring import mark_safe
from django.shortcuts import render, redirect
from django.contrib.auth import get_user_model
from django.contrib.auth.models import Group, Permission
from django import forms
from ..util import htmldecode, urldecode, ansi_to_html
@ -159,6 +160,7 @@ class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin):
action_form = SnapshotActionForm
def changelist_view(self, request, extra_context=None):
    """Render the changelist with GLOBAL_CONTEXT merged over any caller-supplied context."""
    # GLOBAL_CONTEXT is on the right-hand side of `|`, so its keys win on conflict
    merged_context = (extra_context or {}) | GLOBAL_CONTEXT
    return super().changelist_view(request, merged_context)

View file

@ -1,9 +1,16 @@
from django.apps import AppConfig
class CoreConfig(AppConfig):
class CoreAppConfig(AppConfig):
name = 'core'
# label = 'Archive Data'
verbose_name = "Archive Data"
# WIP: broken by Django 3.1.2 -> 4.0 migration
# default_auto_field = 'django.db.models.UUIDField'
def ready(self):
from .auth import register_signals

View file

@ -1,5 +1,7 @@
import os
from django.conf import settings
__package__ = 'archivebox.core'
from ..config import (
LDAP
)

View file

@ -50,7 +50,7 @@ class Tag(models.Model):
class Meta:
verbose_name = "Tag"
verbose_name_plural = "Tags"
verbose_name_plural = "🏷️ Tags"
def __str__(self):
return self.name
@ -98,6 +98,10 @@ class Snapshot(models.Model):
keys = ('url', 'timestamp', 'title', 'tags', 'updated')
class Meta:
verbose_name = "Snapshot"
verbose_name_plural = "⭐️ Archived Webpages (Snapshots)"
def __repr__(self) -> str:
title = self.title or '-'
return f'[{self.timestamp}] {self.url[:64]} ({title[:64]})'
@ -282,5 +286,9 @@ class ArchiveResult(models.Model):
objects = ArchiveResultManager()
class Meta:
verbose_name = "ArchiveResult"
verbose_name_plural = "📑 Logs (ArchiveResults)"
def __str__(self):
return self.extractor

View file

@ -1,5 +1,10 @@
__package__ = 'archivebox.core'
# TODO: add this after we upgrade to Django >=3.2
# https://github.com/typeddjango/django-stubs
# import django_stubs_ext
# django_stubs_ext.monkeypatch()
import os
import sys
import re
@ -59,13 +64,88 @@ INSTALLED_APPS = [
'django.contrib.messages',
'django.contrib.staticfiles',
'django.contrib.admin',
'solo',
'core',
'api',
# Plugins
'plugins.defaults',
'plugins.system',
# 'plugins.replaywebpage', # provides UI to view WARC files
# 'plugins.gallerydl', # provides gallerydl dependency + extractor
# 'plugins.browsertrix', # provides browsertrix dependency + extractor
# 'plugins.playwright', # provides playwright dependency
# ...
# someday we may have enough plugins to justify dynamic loading:
# *(path.parent.name for path in (Path(PACKAGE_DIR) / 'plugins').glob('*/apps.py')),
'django_extensions',
]
################################################################################
### Staticfile and Template Settings
################################################################################
STATIC_URL = '/static/'
STATIC_ROOT = Path(PACKAGE_DIR) / 'collected_static'
STATICFILES_DIRS = [
*([str(CUSTOM_TEMPLATES_DIR / 'static')] if CUSTOM_TEMPLATES_DIR else []),
str(Path(PACKAGE_DIR) / TEMPLATES_DIR_NAME / 'static'),
# Plugins
# str(Path(PACKAGE_DIR) / 'plugins/defaults/static'),
# str(Path(PACKAGE_DIR) / 'plugins/replaywebpage/static'),
# str(Path(PACKAGE_DIR) / 'plugins/gallerydl/static'),
# str(Path(PACKAGE_DIR) / 'plugins/browsertrix/static'),
# str(Path(PACKAGE_DIR) / 'plugins/playwright/static'),
# ...
# someday if there are many more plugins / user-addable plugins:
# *(str(path) for path in (Path(PACKAGE_DIR) / 'plugins').glob('*/static')),
]
MEDIA_URL = '/archive/'
MEDIA_ROOT = OUTPUT_DIR / 'archive'
TEMPLATE_DIRS = [
*([str(CUSTOM_TEMPLATES_DIR)] if CUSTOM_TEMPLATES_DIR else []),
str(Path(PACKAGE_DIR) / TEMPLATES_DIR_NAME / 'core'),
str(Path(PACKAGE_DIR) / TEMPLATES_DIR_NAME / 'admin'),
str(Path(PACKAGE_DIR) / TEMPLATES_DIR_NAME),
# Plugins
# added by plugins.<PluginName>.apps.<AppName>.ready -> .settings.register_plugin_settings
# str(Path(PACKAGE_DIR) / 'plugins/defaults/templates'),
# str(Path(PACKAGE_DIR) / 'plugins/replaywebpage/templates'),
# str(Path(PACKAGE_DIR) / 'plugins/gallerydl/templates'),
# str(Path(PACKAGE_DIR) / 'plugins/browsertrix/templates'),
# str(Path(PACKAGE_DIR) / 'plugins/playwright/templates'),
# ...
#
# someday if there are many more plugins / user-addable plugins:
# *(str(path) for path in (Path(PACKAGE_DIR) / 'plugins').glob('*/templates')),
]
TEMPLATES = [
{
'BACKEND': 'django.template.backends.django.DjangoTemplates',
'DIRS': TEMPLATE_DIRS,
'APP_DIRS': True,
'OPTIONS': {
'context_processors': [
'django.template.context_processors.debug',
'django.template.context_processors.request',
'django.contrib.auth.context_processors.auth',
'django.contrib.messages.context_processors.messages',
],
},
},
]
# For usage with https://www.jetadmin.io/integrations/django
# INSTALLED_APPS += ['jet_django']
@ -163,7 +243,7 @@ if DEBUG_TOOLBAR:
'debug_toolbar.panels.request.RequestPanel',
'debug_toolbar.panels.sql.SQLPanel',
'debug_toolbar.panels.staticfiles.StaticFilesPanel',
# 'debug_toolbar.panels.templates.TemplatesPanel',
# 'debug_toolbar.panels.templates.TemplatesPanel', # buggy/slow
'debug_toolbar.panels.cache.CachePanel',
'debug_toolbar.panels.signals.SignalsPanel',
'debug_toolbar.panels.logging.LoggingPanel',
@ -173,39 +253,6 @@ if DEBUG_TOOLBAR:
]
MIDDLEWARE = [*MIDDLEWARE, 'debug_toolbar.middleware.DebugToolbarMiddleware']
################################################################################
### Staticfile and Template Settings
################################################################################
STATIC_URL = '/static/'
STATICFILES_DIRS = [
*([str(CUSTOM_TEMPLATES_DIR / 'static')] if CUSTOM_TEMPLATES_DIR else []),
str(Path(PACKAGE_DIR) / TEMPLATES_DIR_NAME / 'static'),
]
TEMPLATE_DIRS = [
*([str(CUSTOM_TEMPLATES_DIR)] if CUSTOM_TEMPLATES_DIR else []),
str(Path(PACKAGE_DIR) / TEMPLATES_DIR_NAME / 'core'),
str(Path(PACKAGE_DIR) / TEMPLATES_DIR_NAME / 'admin'),
str(Path(PACKAGE_DIR) / TEMPLATES_DIR_NAME),
]
TEMPLATES = [
{
'BACKEND': 'django.template.backends.django.DjangoTemplates',
'DIRS': TEMPLATE_DIRS,
'APP_DIRS': True,
'OPTIONS': {
'context_processors': [
'django.template.context_processors.debug',
'django.template.context_processors.request',
'django.contrib.auth.context_processors.auth',
'django.contrib.messages.context_processors.messages',
],
},
},
]
################################################################################
@ -312,21 +359,21 @@ IGNORABLE_404_URLS = [
]
class NoisyRequestsFilter(logging.Filter):
    """Logging filter that drops request log lines which are pure noise.

    A record is dropped (``filter`` returns ``False``) when its message is:
      * a GET for one of the ignorable URL patterns that answered 200/30x/404, or
      * a GET under /static/ that answered 200/30x.
    Every other record is kept.
    """

    # static pattern: compiled once instead of on every log record
    STATICFILE_LOG_PATTERN = re.compile(r'"GET /static/.* HTTP/.*" (200|30.) .+', re.I | re.M)

    def __init__(self, name: str = '', ignorable_404_urls=None):
        """
        name: forwarded unchanged to logging.Filter.
        ignorable_404_urls: optional iterable of compiled URL regexes to ignore;
            when None the module-level IGNORABLE_404_URLS is read at filter time.
        """
        super().__init__(name)
        self.ignorable_404_urls = ignorable_404_urls

    def filter(self, record) -> bool:
        """Return False to drop the record, True to keep it."""
        logline = record.getMessage()

        url_patterns = IGNORABLE_404_URLS if self.ignorable_404_urls is None else self.ignorable_404_urls

        # ignore harmless 404s for the patterns in IGNORABLE_404_URLS
        for ignorable_url_pattern in url_patterns:
            # [:-1] drops the last character of the URL pattern (presumably a trailing `$`)
            # so the rest of the log line can follow it
            ignorable_log_pattern = re.compile(f'^"GET /.*/?{ignorable_url_pattern.pattern[:-1]} HTTP/.*" (200|30.|404) .+$', re.I | re.M)
            if ignorable_log_pattern.match(logline):
                return False

        # ignore staticfile requests that 200 or 30*
        if self.STATICFILE_LOG_PATTERN.match(logline):
            return False

        return True
if LOGS_DIR.exists():
ERROR_LOG = (LOGS_DIR / 'errors.log')

View file

@ -32,6 +32,10 @@ urlpatterns = [
path('archive/', RedirectView.as_view(url='/')),
path('archive/<path:path>', SnapshotView.as_view(), name='Snapshot'),
path('web/<path:path>', SnapshotView.as_view()), # support archive.org-style URLs
path('plugins/replaywebpage/', include('plugins.replaywebpage.urls')),
# ... dynamic load these someday if there are more of them
path('admin/core/snapshot/add/', RedirectView.as_view(url='/add/')),
path('add/', AddView.as_view(), name='add'),

View file

@ -56,12 +56,18 @@ class SnapshotView(View):
slug, archivefile = path.split('/', 1)[0], 'index.html'
# slug is a timestamp
if slug.replace('.','').isdigit():
if slug.replace('.', '').isdigit():
# missing trailing slash -> redirect to index
if '/' not in path:
return redirect(f'{path}/index.html')
# TODO: add support for archive.org-style URLs where timestamp may be a human-readable date
# https://web.archivebox.io / web / 2022-01 / https://example.com
# https://web.archivebox.io / web / 20220505103616 / https://example.com
# https://web.archivebox.io / web / 2022-05-05__0:36:16 / https://example.com
# use archivebox.util.parse_date (supports unix timestamps, ISO date strings, and more)
try:
try:
snapshot = Snapshot.objects.get(Q(timestamp=slug) | Q(id__startswith=slug))

View file

@ -7,7 +7,7 @@ if __name__ == '__main__':
# versions of ./manage.py commands whenever possible. When that's not possible
# (e.g. makemigrations), you can comment out this check temporarily
if not ('makemigrations' in sys.argv or 'migrate' in sys.argv):
if not ('makemigrations' in sys.argv or 'migrate' in sys.argv or 'collectstatic' in sys.argv):
print("[X] Don't run ./manage.py directly (unless you are a developer running makemigrations):")
print()
print(' Hint: Use these archivebox CLI commands instead of the ./manage.py equivalents:')

View file

@ -1,3 +0,0 @@
[mypy]
plugins =
mypy_django_plugin.main

View file

@ -0,0 +1,3 @@
__package__ = 'archivebox.plugins'

View file

@ -0,0 +1,3 @@
__package__ = 'archivebox.plugins.defaults'
default_app_config = 'plugins.defaults.apps.DefaultsPluginAppConfig'

View file

@ -0,0 +1,20 @@
from django.contrib import admin
from solo.admin import SingletonModelAdmin
from .models import (
ArchiveBoxDefaultDependency,
ArchiveBoxDefaultExtractor,
)
class DependencyAdmin(SingletonModelAdmin):
    """Singleton admin for the default dependency config; the fields below are display-only."""

    readonly_fields = (
        'id',
        'NAME',
        'LABEL',
        'REQUIRED',
        'bin_path',
        'bin_version',
        'is_valid',
        'is_enabled',
    )
class ExtractorAdmin(SingletonModelAdmin):
    """Singleton admin for the default extractor config; the fields below are display-only."""

    readonly_fields = (
        'id',
        'NAME',
        'LABEL',
        'DEFAULT_ENABLED',
        'DEFAULT_CMD',
        'CMD',
        'ARGS',
        'TIMEOUT',
        'dependency',
        'is_valid',
        'is_enabled',
    )
print('DefaultsPluginConfig.admin')
admin.site.register(ArchiveBoxDefaultDependency, DependencyAdmin)
admin.site.register(ArchiveBoxDefaultExtractor, ExtractorAdmin)

View file

@ -0,0 +1,24 @@
# __package__ = 'archivebox.plugins.defaults'
from django.apps import AppConfig
class DefaultsPluginAppConfig(AppConfig):
    """Django app config for the `plugins.defaults` plugin."""

    name = "plugins.defaults"
    # label = "ArchiveBox Defaults"
    verbose_name = "Plugin Configuration Defaults"
    default_auto_field = "django.db.models.AutoField"

    def ready(self):
        """Register this plugin's static/template dirs on the Django settings."""
        print('plugins.defaults.apps.DefaultsPluginConfig.ready')

        # imported here rather than at module level, as in the original
        from django.conf import settings as django_settings
        from .settings import register_plugin_settings as register_settings

        register_settings(django_settings, name=self.name)

View file

@ -0,0 +1,41 @@
# Generated by Django 3.1.14 on 2024-01-24 08:56
from django.db import migrations, models
class Migration(migrations.Migration):
    """Initial migration: creates the ArchiveBoxDefaultDependency and ArchiveBoxDefaultExtractor tables."""

    initial = True
    # no other migrations need to run first
    dependencies = [
    ]
    operations = [
        migrations.CreateModel(
            name='ArchiveBoxDefaultDependency',
            fields=[
                ('ENABLED', models.BooleanField(default=True, editable=False)),
                ('BINARY', models.CharField(default='/bin/false', max_length=255)),
                ('ARGS', models.CharField(default='', max_length=255)),
                # primary key defaults to 1 (the model's singleton_instance_id)
                ('id', models.AutoField(default=1, primary_key=True, serialize=False)),
            ],
            options={
                'verbose_name': 'Dependency Configuration Defaults',
                'abstract': False,
            },
        ),
        migrations.CreateModel(
            name='ArchiveBoxDefaultExtractor',
            fields=[
                ('ENABLED', models.BooleanField(default=True)),
                # NOTE(review): CMD and ARGS are CharFields but their defaults are lists,
                # so the stored default is presumably the str() of the list -- confirm this is intended
                ('CMD', models.CharField(default=['{DEPENDENCY.BINARY}', '{ARGS}', '{url}'], max_length=255)),
                ('ARGS', models.CharField(default=['--timeout={TIMEOUT}'], max_length=255)),
                ('TIMEOUT', models.CharField(default='{TIMEOUT}', max_length=255)),
                ('id', models.AutoField(default=1, primary_key=True, serialize=False)),
            ],
            options={
                'verbose_name': 'Extractor Configuration Defaults',
                'abstract': False,
            },
        ),
    ]

View file

@ -0,0 +1,31 @@
# Generated by Django 3.1.14 on 2024-01-24 09:43
from django.db import migrations, models
class Migration(migrations.Migration):
    """Renames both verbose_names and changes the dependency BINARY/ENABLED field definitions."""

    # must run after the initial migration of the `defaults` app
    dependencies = [
        ('defaults', '0001_initial'),
    ]
    operations = [
        migrations.AlterModelOptions(
            name='archiveboxdefaultdependency',
            options={'verbose_name': 'Default Configuration: Dependencies'},
        ),
        migrations.AlterModelOptions(
            name='archiveboxdefaultextractor',
            options={'verbose_name': 'Default Configuration: Extractors'},
        ),
        # BINARY default is now /bin/bash
        migrations.AlterField(
            model_name='archiveboxdefaultdependency',
            name='BINARY',
            field=models.CharField(default='/bin/bash', max_length=255),
        ),
        # ENABLED is redefined without editable=False
        migrations.AlterField(
            model_name='archiveboxdefaultdependency',
            name='ENABLED',
            field=models.BooleanField(default=True),
        ),
    ]

View file

@ -0,0 +1,385 @@
__package__ = 'archivebox.plugins.defaults'
# import shutil
import re
from typing import List, Dict, Any
from pathlib import Path
from django.db import models, transaction
from django.utils.functional import cached_property
from solo.models import SingletonModel # type: ignore[import-untyped]
from config import bin_path, bin_version
ConfigDict = Dict[str, Any]
# def bin_path(binary: str) -> str | None:
# return shutil.which(str(Path(binary).expanduser())) or shutil.which(str(binary)) or binary
# def bin_version(bin_path: str, cmd: str | None=None) -> str | None:
# return '0.0.0'
# def pretty_path(path: Path) -> str:
# """take a Path object and return the path as a string relative to the current directory"""
# if not path:
# return ''
# return str(path.expanduser().resolve().relative_to(Path.cwd().resolve()))
class ArchiveBoxBaseDependency(models.Model):
    """Abstract base model describing one external binary dependency.

    Class-level constants (NAME, LABEL, *_DEPENDENCIES, DEFAULT_*) describe the
    dependency; the ENABLED / BINARY / ARGS model fields hold the stored,
    user-editable configuration.
    """
    singleton_instance_id = 1

    id = models.AutoField(default=singleton_instance_id, primary_key=True)

    NAME = 'DEFAULT'
    LABEL = "Default"
    REQUIRED = False

    PARENT_DEPENDENCIES: List[str] = []
    BIN_DEPENDENCIES: List[str] = []
    APT_DEPENDENCIES: List[str] = []
    BREW_DEPENDENCIES: List[str] = []
    PIP_DEPENDENCIES: List[str] = []
    NPM_DEPENDENCIES: List[str] = []

    DEFAULT_BINARY: str | None = '/bin/bash'
    DEFAULT_START_CMD: str | None = '/bin/bash -c "while true; do sleep 1; done"'
    DEFAULT_PID_FILE: str | None = 'logs/{NAME}_WORKER.pid'
    DEFAULT_STOP_CMD: str | None = 'kill "$(<{PID_FILE})"'
    DEFAULT_VERSION_COMMAND: str | None = '{BINARY} --version'
    DEFAULT_ARGS: str | None = ''

    VERSION_CMD = '{BINARY} --version'

    # NOTE: keep the field declaration order stable, Django uses it for field ordering
    ENABLED = models.BooleanField(default=True, editable=False)
    BINARY = models.CharField(max_length=255, default=DEFAULT_BINARY)
    ARGS = models.CharField(max_length=255, default=DEFAULT_ARGS)

    # START_CMD = models.CharField(max_length=255, default=DEFAULT_START_CMD)
    # WORKERS = models.IntegerField(default=1)

    class Meta:
        abstract = True
        app_label = 'defaults'

    def __str__(self):
        return f"{self.LABEL} Dependency Configuration"

    def __json__(self):
        """Return a plain-dict snapshot of this dependency's identity and stored config."""
        return {
            'type': 'ArchiveBoxDependency',
            '__class__': self.__class__.__name__,
            'NAME': self.NAME,
            'LABEL': self.LABEL,
            'ENABLED': self.ENABLED,
            'BINARY': self.BINARY,
            'ARGS': self.ARGS,
            # 'START_CMD': self.START_CMD,
            # 'WORKERS': self.WORKERS,
        }

    @cached_property
    def bin_path(self) -> str:
        """Resolved path of the configured binary (falls back to DEFAULT_BINARY when BINARY is empty)."""
        return bin_path(self.BINARY or self.DEFAULT_BINARY)

    @cached_property
    def bin_version(self) -> str | None:
        """Version string reported by running VERSION_CMD with BINARY substituted in."""
        print(f'ArchiveBoxBaseDependency.bin_version({self.bin_path}, cmd={self.VERSION_CMD.format(BINARY=self.BINARY)})')
        return bin_version(self.bin_path, cmd=self.VERSION_CMD.format(BINARY=self.BINARY))
        # return bin_version(self.bin_path, cmd=self.VERSION_CMD)

    @cached_property
    def is_valid(self) -> bool:
        """True when both a binary path and a version string could be determined."""
        return bool(self.bin_path and self.bin_version)

    @cached_property
    def is_enabled(self) -> bool:
        """True when the dependency is switched on AND valid."""
        return bool(self.ENABLED and self.is_valid)

    @cached_property
    def pretty_version(self) -> str:
        """One-line, ANSI-colored status summary: symbol, name, version, note, path.

        NOTE(review): `ANSI` and `pretty_path` are not imported anywhere in the
        visible part of this module -- they must be brought into scope before
        this property can run.
        """
        # Branch on the raw ENABLED flag: `is_enabled` already implies `is_valid`,
        # so branching on it made the "invalid" state unreachable and reported
        # enabled-but-broken dependencies as "disabled".
        if not self.ENABLED:
            color, symbol, note, version = 'lightyellow', '-', 'disabled', '-'
        elif self.is_valid:
            color, symbol, note, version = 'green', '', 'valid', ''
            parsed_version_num = re.search(r'[\d\.]+', self.bin_version)
            if parsed_version_num:
                version = f'v{parsed_version_num[0]}'
        else:
            color, symbol, note, version = 'red', 'X', 'invalid', '?'

        path = pretty_path(self.bin_path)

        return ' '.join((
            ANSI[color],
            symbol,
            ANSI['reset'],
            self.NAME.ljust(21),   # was the undefined bare name `name`
            version.ljust(14),
            ANSI[color],
            note.ljust(8),
            ANSI['reset'],
            path.ljust(76),
        ))

    # @helper
    def install_parents(self, config):
        """Return a dict mapping each entry of PARENT_DEPENDENCIES to itself (actual install is still a TODO)."""
        return {
            # parent_dependency.NAME: parent_dependency.get_solo().install_self()
            parent_dependency: parent_dependency
            for parent_dependency in self.PARENT_DEPENDENCIES
        }

    # @helper
    def install_self(self, config):
        """Install every declared package group, then return the detected version.

        NOTE(review): the *EnvironmentDependency classes used below are not
        defined or imported in the visible part of this module.
        """
        assert all(self.install_parents(config=config).values())

        BashEnvironmentDependency.get_solo().install_pkgs(self.BIN_DEPENDENCIES)
        AptEnvironmentDependency.get_solo().install_pkgs(self.APT_DEPENDENCIES)
        BrewEnvironmentDependency.get_solo().install_pkgs(self.BREW_DEPENDENCIES)
        PipEnvironmentDependency.get_solo().install_pkgs(self.PIP_DEPENDENCIES)
        NPMEnvironmentDependency.get_solo().install_pkgs(self.NPM_DEPENDENCIES)

        assert self.is_valid
        return self.bin_version

    # @task
    def run(self, args, pwd, timeout):
        """Run the binary with `args` and return `(proc, timer, errors)`.

        `proc` is None and `errors` holds the exception when the command could
        not be run; the timer is always ended.

        NOTE(review): `TimedProgress` and the module-level `run` helper called
        below are not imported in the visible part of this module.
        """
        errors = None
        proc = None   # so the return below cannot hit an unbound local on failure
        timer = TimedProgress(timeout, prefix=' ')
        try:
            # inside a method body the bare name `run` resolves to the module-level helper
            proc = run(cmd=[self.bin_path, *args], pwd=pwd, timeout=timeout)
        except Exception as err:
            errors = err
        finally:
            timer.end()

        return proc, timer, errors
class ArchiveBoxDefaultDependency(ArchiveBoxBaseDependency, SingletonModel):
    """Concrete (non-abstract) singleton model for the default dependency configuration."""
    singleton_instance_id = 1
    id = models.AutoField(default=singleton_instance_id, primary_key=True)
    # redeclared with editable=True (the base class declares it with editable=False)
    ENABLED = models.BooleanField(default=True, editable=True)

    class Meta:  # pyright: ignore [reportIncompatibleVariableOverride]
        abstract = False
        app_label = 'defaults'
        verbose_name = 'Default Configuration: Dependencies'
class ArchiveBoxBaseExtractor(models.Model):
    """Abstract base model holding the configuration of one extractor.

    An extractor is tied to a dependency model (DEPENDENCY) and stores the
    command / args / timeout templates used to run it against a URL.

    NOTE(review): CMD and ARGS are CharFields whose defaults are lists, while
    `format_args` iterates its input item by item -- a value loaded back from
    the DB as a plain string would be iterated character by character.
    """
    singleton_instance_id = 1

    id = models.AutoField(default=singleton_instance_id, primary_key=True)

    NAME = 'DEFAULT'
    LABEL = 'Default'

    DEFAULT_DEPENDENCY = ArchiveBoxDefaultDependency
    DEPENDENCY = DEFAULT_DEPENDENCY

    DEFAULT_ENABLED = True
    DEFAULT_CMD = ['{DEPENDENCY.BINARY}', '{ARGS}', '{url}']
    DEFAULT_ARGS = ['--timeout={TIMEOUT}']
    DEFAULT_TIMEOUT = '{TIMEOUT}'
    # DEFAULT_USER_AGENT = '{USER_AGENT}'
    # DEFAULT_COOKIES_TXT = '{COOKIES_TXT}'

    # NOTE: keep the field declaration order stable, Django uses it for field ordering
    ENABLED = models.BooleanField(default=DEFAULT_ENABLED, editable=True)
    CMD = models.CharField(max_length=255, default=DEFAULT_CMD)
    ARGS = models.CharField(max_length=255, default=DEFAULT_ARGS)
    TIMEOUT = models.CharField(max_length=255, default=DEFAULT_TIMEOUT)

    # NOTE: evaluated once, in this class body, so the f-strings use this class's NAME;
    # a subclass overriding NAME does not get updated aliases
    ALIASES = {
        'ENABLED': (f'SAVE_{NAME}', f'USE_{NAME}', f'FETCH_{NAME}'),
    }

    def __str__(self):
        return f"{self.LABEL} Extractor Configuration"

    class Meta:  # pyright: ignore [reportIncompatibleVariableOverride]
        abstract = True
        verbose_name = "Default Extractor Configuration"
        app_label = 'defaults'

    @cached_property
    def dependency(self):
        """The singleton row of this extractor's DEPENDENCY model."""
        return self.DEPENDENCY.get_solo()

    def __json__(self):
        """Return a plain-dict snapshot of this extractor's config and status."""
        return {
            'type': 'ArchiveBoxExtractor',
            '__class__': self.__class__.__name__,
            'NAME': self.NAME,
            'LABEL': self.LABEL,
            'ENABLED': self.ENABLED,
            'DEPENDENCY': self.dependency.__json__(),
            'ARGS': self.ARGS,
            'CMD': self.CMD,
            'TIMEOUT': self.TIMEOUT,
            'is_valid': self.is_valid,
            'is_enabled': self.is_enabled,
        }

    def format_args(self, csv: List[str], **config):
        """str.format() each item of `csv` against the merged config.

        Later sources win on key conflicts: passed `config`, then this
        extractor's own values, then the same values prefixed with `{NAME}_`.
        """
        un_prefixed_config = {**self.__json__()}          # e.g. ENABLED=True
        prefixed_config = {                               # e.g. GALLERYDL_ENABLED=True
            f'{self.NAME}_{key}': value
            for key, value in un_prefixed_config.items()
        }

        merged_config = {
            **config,                                     # e.g. TIMEOUT=60
            **un_prefixed_config,                         # e.g. ENABLED=True
            **prefixed_config,                            # e.g. GALLERYDL_ENABLED=True
        }
        formatted_config = [
            arg.format(**merged_config)
            for arg in csv
        ]

        return formatted_config

    @cached_property
    def is_valid(self):
        """True when the underlying dependency is valid (no further checks are implemented yet)."""
        if not self.dependency.is_valid:
            return False

        # TIMEOUT must be at least 5 seconds
        # if self.TIMEOUT < 5:
        #     return False

        # assert Path(self.COOKIES_TXT).exists()
        # TODO: validate user agent with uaparser
        # TODO: validate args, cookies.txt?
        return True

    @cached_property
    def is_enabled(self):
        return self.ENABLED and self.is_valid and self.dependency.is_enabled

    def save(self, *args, **kwargs):
        """Save inside a transaction, then print a description of the change."""
        # assert self.is_valid

        with transaction.atomic():
            result = super().save(*args, **kwargs)
            # post to message bus:
            print({
                'type': f'{self.__class__.__name__}.save',
                'diff': self.__json__(),
                'kwargs': kwargs,
            })
            # potential consumers of this event:
            #    - event logger: write to events.log
            #    - config file updater: writes to ArchiveBox.conf
            #    - supervisor: restarts relevant dependencies/extractors
            #    - etc...

        return result

    def out_dir(self, url: str, snapshot_dir: Path, config: ConfigDict):
        """Directory this extractor writes into: `<snapshot_dir>/<NAME>`."""
        return (snapshot_dir / self.NAME)

    def create_out_dir(self, url: str, snapshot_dir: Path, config: ConfigDict):
        """Create the output directory if needed and return its path.

        Previously this returned the result of `mkdir()`, which is always None.
        """
        out_dir = self.out_dir(url=url, snapshot_dir=snapshot_dir, config=config)
        out_dir.mkdir(exist_ok=True)
        return out_dir

    def should_extract(self, url: str, snapshot_dir: Path, config: ConfigDict):
        """Return True when enabled, the output dir has no entries, and it is writable."""
        import os   # not imported at module level in the visible part of this file

        # return False if extractor is disabled
        if not self.is_enabled:
            return False

        out_dir = self.out_dir(url=url, snapshot_dir=snapshot_dir, config=config)

        # glob() returns a generator, which is always truthy: any() is needed to
        # actually test whether the directory contains something
        if any(out_dir.glob('*')):
            return False

        if not os.access(out_dir, os.W_OK | os.X_OK):
            return False

        return True

    def get_dependency_cmd(self, url: str, extractor_dir: Path, config: ConfigDict):
        """Build the flat argument list: formatted CMD items, the url, formatted ARGS items.

        NOTE(review): DEFAULT_CMD contains a '{url}' placeholder but `url` is not
        passed to format_args here -- unless `config` carries a 'url' key the
        format call raises KeyError; confirm the intended template contract.
        """
        return [
            *self.format_args(self.CMD, **config),   # splat: this used to be nested as a list inside the list
            url,
            *self.format_args(self.ARGS, **config),  # TODO: split and requote this properly
        ]

    # @requires_config('HOSTNAME', 'TIMEOUT', 'USER_AGENT', 'CHECK_SSL_VALIDITY')
    def extract(self, url: str, snapshot_dir: Path, config: ConfigDict):
        """Run the extractor for `url` and return an ArchiveResult (None when disabled).

        NOTE(review): `ArchiveError`, `ArchiveResult` and `get_dir_size` are not
        imported in the visible part of this module, and TIMEOUT is passed on as
        the raw stored string (a '{TIMEOUT}' template by default).
        """
        if not self.ENABLED:
            return

        # was: self.create_extractor_directory(snapshot_dir), which is not defined on this class
        extractor_dir = self.create_out_dir(url=url, snapshot_dir=snapshot_dir, config=config)

        cmd = self.get_dependency_cmd(url=url, extractor_dir=extractor_dir, config=config)

        status, stdout, stderr, output_path = 'failed', '', '', None
        timer = None   # stays None when the dependency could not be run at all
        try:
            # the dependency's run() names its working-directory parameter `pwd`
            proc, timer, errors = self.dependency.run(cmd, pwd=extractor_dir, timeout=self.TIMEOUT)
            if errors is not None:
                raise errors
            stdout, stderr = proc.stdout, proc.stderr

            if 'ERROR: Unsupported URL' in stderr:
                hints = ('gallery-dl doesnt support this type of url yet',)
                raise ArchiveError('Failed to save gallerydl', hints)

            if proc.returncode == 0 and 'finished' in stdout:
                output_path = extractor_dir / 'index.html'
                status = 'succeeded'

        except Exception as err:
            stderr += str(err)   # an exception object cannot be added to a str

        num_bytes, num_dirs, num_files = get_dir_size(extractor_dir)

        return ArchiveResult(
            cmd=cmd,
            pwd=str(extractor_dir),            # was the undefined bare name `out_dir`
            cmd_version=self.dependency.bin_version,
            cmd_path=self.dependency.bin_path,
            cmd_hostname=config['HOSTNAME'],   # ConfigDict is a plain Dict[str, Any]
            output_path=output_path,
            stdout=stdout,
            stderr=stderr,
            status=status,
            num_bytes=num_bytes,
            num_files=num_files,
            num_dirs=num_dirs,
            **(timer.stats if timer is not None else {}),
        )
class ArchiveBoxDefaultExtractor(ArchiveBoxBaseExtractor, SingletonModel):
    """Concrete (non-abstract) singleton model for the default extractor configuration."""
    singleton_instance_id = 1
    id = models.AutoField(default=singleton_instance_id, primary_key=True)

    # dependency model whose singleton row this extractor uses
    DEPENDENCY = ArchiveBoxDefaultDependency

    ENABLED = models.BooleanField(default=True, editable=True)

    class Meta:
        abstract = False
        app_label = 'defaults'
        verbose_name = 'Default Configuration: Extractors'

View file

@ -0,0 +1,15 @@
from pathlib import Path
from django.conf import settings
def register_plugin_settings(settings=None, name='defaults'):
    """Add a plugin's static and template directories to the given settings.

    settings: object exposing PACKAGE_DIR, STATICFILES_DIRS and TEMPLATE_DIRS;
        defaults to `django.conf.settings`, resolved at call time.
    name: plugin directory name under `<PACKAGE_DIR>/plugins/`.

    Safe to call more than once for the same plugin: a directory that is
    already registered is not appended again.
    """
    if settings is None:
        from django.conf import settings

    static_dir = str(Path(settings.PACKAGE_DIR) / f'plugins/{name}/static')
    templates_dir = str(Path(settings.PACKAGE_DIR) / f'plugins/{name}/templates')

    # `+=` extends the existing list objects in place, so anything else holding
    # a reference to the same lists sees the additions too
    if static_dir not in settings.STATICFILES_DIRS:
        settings.STATICFILES_DIRS += [static_dir]
    if templates_dir not in settings.TEMPLATE_DIRS:
        settings.TEMPLATE_DIRS += [templates_dir]

    print('REGISTERED PLUGIN SETTINGS', name)

View file

@ -0,0 +1 @@
__package__ = 'archivebox.plugins.replaywebpage'

View file

@ -0,0 +1,8 @@
from django.contrib import admin
from solo.admin import SingletonModelAdmin
from .models import GalleryDLDependency, GalleryDLExtractor
admin.site.register(GalleryDLDependency, SingletonModelAdmin)
admin.site.register(GalleryDLExtractor, SingletonModelAdmin)

View file

@ -0,0 +1,13 @@
from django.apps import AppConfig
class GalleryDLAppConfig(AppConfig):
    """Django app config for the gallery-dl plugin."""

    # An app label has to be a valid Python identifier, so the hyphenated
    # human-readable name lives in verbose_name instead.
    label = "gallerydl"
    verbose_name = "Gallery-DL"
    # NOTE(review): the defaults plugin uses a dotted `plugins.<name>` path for `name`;
    # confirm "plugin_gallerydl" is really the importable module path of this app
    name = "plugin_gallerydl"
    default_auto_field = "django.db.models.BigAutoField"

    def ready(self):
        # querying models is ok, but don't fetch rows from DB or perform stateful actions here
        print('√ Loaded GalleryDL Plugin')

View file

@ -0,0 +1,50 @@
# browsertrix extractor
def save_browsertrix(link, out_dir, timeout, config):
    """Trigger a browsertrix crawl for `link` and copy the resulting WACZ into `out_dir/browsertrix`.

    WIP: nothing is returned yet, and `status` / `cmd_output` are computed but unused.

    NOTE(review): `run`, `TimedProgress` and `copy_and_overwrite` are not
    imported in the visible part of this module.
    """
    from pathlib import Path   # not imported at module level in the visible part of this file

    browsertrix_dir = out_dir / 'browsertrix'
    browsertrix_dir.mkdir(exist_ok=True)

    crawl_id = link.timestamp

    # NOTE(review): this argument list is built but never appended to remote_cmd;
    # the example at the bottom of this file suggests it should follow `env CRAWL_ID=...`
    browsertrix_crawler_cmd = [
        'crawl',
        '--url', link.url,
        f'--collection={crawl_id}',
        '--scopeType=page',
        '--generateWACZ',
        '--text=final-to-warc',
        '--timeLimit=60',
    ]

    # f-string so the crawl id is actually substituted (it used to be sent as the literal text "{crawl_id}")
    remote_cmd = f"""
rm /tmp/dump.rdb;
rm -rf /crawls/collections;
mkdir /crawls/collections;
env CRAWL_ID={crawl_id}
"""

    local_cmd = ['nc', 'browsertrix', '2222']

    status = 'succeeded'
    timer = TimedProgress(timeout, prefix=' ')
    try:
        # NOTE(review): stdout is decoded as bytes below while `input` is a str -- confirm what `run` expects
        result = run(local_cmd, cwd=str(out_dir), input=remote_cmd, timeout=timeout)

        cmd_output = result.stdout.decode()

        # the `.wacz` suffix belongs inside the f-string (it used to be an attribute access on the str)
        wacz_output_file = Path('/browsertrix/crawls') / crawl_id / f'{crawl_id}.wacz'

        copy_and_overwrite(wacz_output_file, browsertrix_dir / wacz_output_file.name)
    finally:
        # the `try` had no handler at all, which is a syntax error; always stop the timer
        timer.end()
TEMPLATE = """
"""
# rm /tmp/dump.rdb;
# rm -rf /crawls/collections;
# mkdir /crawls/collections;
# env CRAWL_ID=tec2342 crawl --url 'https://example.com' --scopeType page --generateWACZ --collection tec2342 --text final-to-warc --timeLimit 60

View file

@ -0,0 +1,121 @@
from django.db import models
from django.utils.functional import cached_property
from solo.models import SingletonModel
from archivebox.plugins.defaults.models import (
ArchiveBoxDefaultDependency,
ArchiveBoxDefaultExtractor,
BashEnvironmentDependency,
PipEnvironmentDependency,
)
class GalleryDLDependency(ArchiveBoxDefaultDependency, SingletonModel):
    """Singleton settings model describing the optional gallery-dl binary."""

    NAME = 'GALLERYDL'
    LABEL = "GalleryDL"
    REQUIRED = False

    PARENT_DEPENDENCIES = [
        BashEnvironmentDependency,
        PipEnvironmentDependency,
    ]

    # NOTE(review): other dependency classes in this commit use both
    # PIP_PACKAGES/NPM_PACKAGES and PIP_DEPENDENCIES/NPM_DEPENDENCIES —
    # confirm which attribute names the base class actually reads.
    BIN_DEPENDENCIES = ['gallery-dl']
    APT_DEPENDENCIES = []
    BREW_DEPENDENCIES = []
    PIP_PACKAGES = ['gallery-dl']
    NPM_PACKAGES = []

    DEFAULT_BINARY = 'gallery-dl'
    DEFAULT_START_CMD = None
    DEFAULT_ARGS = []
    VERSION_CMD = '{BINARY} --version'

    ENABLED = models.BooleanField(default=True)
    BINARY = models.CharField(max_length=255, default=DEFAULT_BINARY)
    # An IntegerField default must be an int, not the string '1'.
    WORKERS = models.IntegerField(default=1)
class GalleryDLExtractor(ArchiveBoxDefaultExtractor, SingletonModel):
    """Singleton settings + runner for archiving a URL with gallery-dl.

    https://github.com/mikf/gallery-dl
    """

    NAME = 'GALLERYDL'
    LABEL = 'gallery-dl'

    # Layout of the final argv: binary, expanded extra args, target URL.
    DEFAULT_CMD = [
        '{DEPENDENCY.BINARY}',
        '{ARGS}',
        '{url}',
    ]
    # Extra arguments only. The standard flags (--timeout, --cookies,
    # --user-agent, --verify) are added by extract() from the dedicated
    # settings below; the previous definition referenced `self`/`config` at
    # class scope, which raises NameError at import time.
    DEFAULT_ARGS = []

    ENABLED = models.BooleanField(default=True)
    # CharField defaults must be strings (django has no models.CSVField).
    # NOTE(review): extract() does not expand the CMD template generically yet.
    CMD = models.CharField(max_length=255, default=' '.join(DEFAULT_CMD))
    ARGS = models.CharField(max_length=255, default=' '.join(DEFAULT_ARGS), blank=True)

    # Each setting is a str.format() template resolved against the config
    # passed to extract(), so by default it mirrors the global option.
    TIMEOUT = models.CharField(max_length=255, default='{TIMEOUT}')
    USER_AGENT = models.CharField(max_length=255, default='{USER_AGENT}')
    COOKIES_TXT = models.CharField(max_length=255, default='{COOKIES_TXT}')
    CHECK_SSL_VALIDITY = models.CharField(max_length=255, default='{CHECK_SSL_VALIDITY}')

    @cached_property
    def DEPENDENCY(self):
        """Return the GalleryDLDependency singleton row.

        Resolved lazily on first access: calling get_solo() in the class body
        would hit the database while the module is still being imported.
        """
        return GalleryDLDependency.get_solo()

    # @task
    # @requires_config('HOSTNAME', 'TIMEOUT', 'USER_AGENT', 'CHECK_SSL_VALIDITY')
    def extract(self, url: str, out_dir: 'Path', config: 'ConfigDict'):
        """Run gallery-dl against ``url`` and describe the outcome.

        Args:
            url: the page to archive.
            out_dir: snapshot directory; the extractor directory is created
                inside it via ``create_extractor_directory``.
            config: mapping used to fill the ``{PLACEHOLDER}`` templates.

        Returns:
            ``None`` when the extractor is disabled, otherwise an
            ``ArchiveResult`` whose ``status`` is 'succeeded' or 'failed'.
        """
        # NOTE(review): Path, ConfigDict, split_args, ArchiveError, get_dir_size
        # and ArchiveResult are not imported by this module yet.
        if not self.ENABLED:
            return

        extractor_dir = self.create_extractor_directory(out_dir)

        timeout = self.TIMEOUT.format(**config)

        # NOTE(review): confirm these flag names against `gallery-dl --help`.
        cmd = [
            self.DEPENDENCY.BINARY,
            '--timeout', timeout,
            '--cookies', self.COOKIES_TXT.format(**config),
            '--user-agent', self.USER_AGENT.format(**config),
            '--verify', self.CHECK_SSL_VALIDITY.format(**config),
            *split_args(self.ARGS.format(**config)),
            url,
        ]

        status, stdout, stderr, output_path = 'failed', '', '', None
        timer = None   # stays None if run() raises before returning a timer
        try:
            proc, timer, errors = self.DEPENDENCY.run(cmd, cwd=extractor_dir, timeout=float(timeout))
            stdout, stderr = proc.stdout, proc.stderr

            if 'ERROR: Unsupported URL' in stderr:
                hints = ('gallery-dl doesnt support this type of url yet',)
                raise ArchiveError('Failed to save gallerydl', hints)

            if proc.returncode == 0 and 'finished' in stdout:
                output_path = extractor_dir / 'index.html'
                status = 'succeeded'
        except Exception as err:
            # best-effort: record the failure in the result instead of raising
            stderr += str(err)

        num_bytes, num_dirs, num_files = get_dir_size(extractor_dir)

        return ArchiveResult(
            cmd=cmd,
            pwd=str(out_dir),
            cmd_version=self.DEPENDENCY.bin_version,
            cmd_path=self.DEPENDENCY.bin_path,
            cmd_hostname=config.HOSTNAME,
            output_path=output_path,
            stdout=stdout,
            stderr=stderr,
            status=status,
            num_bytes=num_bytes,
            num_files=num_files,
            num_dirs=num_dirs,
            **(timer.stats if timer is not None else {}),
        )

View file

@ -0,0 +1,59 @@
# Declarative description of the gallery-dl plugin: one dependency and one
# extractor. Values containing {PLACEHOLDER} are templates, so they must be
# quoted whenever they START with "{" (otherwise YAML reads a flow mapping).
dependencies:
  GalleryDLDependency:
    ID: gallerydl
    LABEL: GalleryDL
    REQUIRED: false

    PARENT_DEPENDENCIES:
      - BashEnvironmentDependency
      - PipEnvironmentDependency

    PIP_DEPENDENCIES:
      - gallery-dl

    USER_CONFIG:
      ENABLED: models.BooleanField(default={DEFAULT_CONFIG.ENABLED})
      BINARY: models.CharField(max_length=255, default={DEFAULT_CONFIG.BINARY})

    DEFAULT_CONFIG:
      ENABLED: true
      BINARY: 'gallery-dl'

    CONFIG_ALIASES:
      - SAVE_GALLERYDL: ENABLED
      - USE_GALLERYDL: ENABLED
      - GALLERYDL_ENABLED: ENABLED
      - GALLERYDL_BINARY: BINARY

    TASKS:
      # plugins.GalleryDLDependency
      run_dependency: plugins.gallerydl.models.GalleryDLDependency.run_dependency

extractors:
  GalleryDLExtractor:
    ID: GALLERYDL
    LABEL: GalleryDL
    ENABLED: true

    DEPENDENCY: GalleryDLDependency

    CONFIG:
      ENABLED: models.BooleanField(default={DEFAULT_CONFIG.ENABLED})
      CMD: models.CharField(max_length=255, default={DEFAULT_CONFIG.CMD})
      ARGS: models.CharField(max_length=255, default={DEFAULT_CONFIG.ARGS})
      USER_AGENT: models.CharField(max_length=255, default={DEFAULT_CONFIG.USER_AGENT})
      CHECK_SSL_VALIDITY: models.CharField(max_length=255, default={DEFAULT_CONFIG.CHECK_SSL_VALIDITY})

    DEFAULT_CONFIG:
      ENABLED: true
      CMD: gallery-dl {args} {url}
      ARGS: --user-agent={USER_AGENT} --check-ssl={CHECK_SSL_VALIDITY}
      CHECK_SSL_VALIDITY: '{CHECK_SSL_VALIDITY}'
      USER_AGENT: '{USER_AGENT}'

    TASKS:
      CREATE_OUT_DIR: plugins.gallerydl.tasks.create_out_dir
      SHOULD_EXTRACT: plugins.gallerydl.tasks.should_extract
      EXTRACT: plugins.gallerydl.tasks.extract

File diff suppressed because one or more lines are too long

View file

@ -0,0 +1 @@
test content this should be visible

Binary file not shown.

File diff suppressed because one or more lines are too long

View file

@ -0,0 +1,40 @@
{% load tz core_tags static %}
{# Embeds the ReplayWeb.page viewer for a snapshot. #}
{# Context used below: title, snapshot, timestamp, warc_filename. #}
<!DOCTYPE html>
<html lang="en">
    <head>
        {# charset and viewport must be declared in separate <meta> elements #}
        <meta charset="utf-8">
        <meta name="viewport" content="width=device-width, initial-scale=1">
        <title>{{title}}</title>
        <style>
            html, body {
                width: 100%;
                height: 100%;
                background-color: #ddd;
            }
        </style>
    </head>
    <body>
        ReplayWeb.page for: {{snapshot.url}} ({{timestamp}}) /{{warc_filename}}

        {{snapshot}}

        <script>
            // https://cdn.jsdelivr.net/npm/replaywebpage@1.8.14/sw.min.js
            // https://cdn.jsdelivr.net/npm/replaywebpage@1.8.14/ui.min.js
        </script>
        <style>
        </style>
        <script src="/static/ui.js"></script>
        {# TODO: source/url are hard-coded test values, not the snapshot's own WACZ #}
        <replay-web-page
            style="height: 600px"
            embed="replay"
            replayBase="/static/"
            source="/static/test.wacz"
            url="https://example.com/">
        </replay-web-page>
    </body>
</html>

View file

@ -0,0 +1,12 @@
from django.urls import path
from .views import GalleryDLIconView, GalleryDLEmbedView, GalleryDLOutputView, GalleryDLDependencyView, GalleryDLExtractorView
# URL routes for the gallery-dl plugin views.
# Routes must not start with "/" (Django system check urls.W002): the
# resolver has already consumed the leading slash before matching.
# NOTE(review): GalleryDLDependencyView and GalleryDLExtractorView are imported
# above but no such classes are visible in the views module — confirm they exist.
urlpatterns = [
    path('plugins/gallerydl/icon/<path:path>', GalleryDLIconView.as_view(), name='gallerydl_icon'),
    path('plugins/gallerydl/embed/<path:path>', GalleryDLEmbedView.as_view(), name='gallerydl_embed'),
    path('plugins/gallerydl/output/<path:path>', GalleryDLOutputView.as_view(), name='gallerydl_output'),

    path('plugins/gallerydl/dependency/', GalleryDLDependencyView.as_view(), name='gallerydl_dependency'),
    path('plugins/gallerydl/extractor/', GalleryDLExtractorView.as_view(), name='gallerydl_extractor'),
]

View file

@ -0,0 +1,78 @@
import os
import sys
from pathlib import Path
from django.views import View
from django.shortcuts import render
from django.db.models import Q
from core.models import Snapshot
# from archivebox.config import PUBLIC_SNAPSHOTS
PUBLIC_SNAPSHOTS = True
class GalleryDLIconView(View):
    """Render the gallery-dl icon template for the requested path."""

    template_name = 'plugin_gallerydl__icon.html'

    def get_context_data(self, **kwargs):
        """Return the base template context (currently empty)."""
        return {
            # **super().get_context_data(**kwargs),
            # 'VERSION': VERSION,
            # 'COMMIT_HASH': COMMIT_HASH,
            # 'FOOTER_INFO': FOOTER_INFO,
        }

    def get(self, request, path):
        """Redirect anonymous users to the login page unless snapshots are public, else render."""
        # imported locally: the module-level import only brings in `render`
        from django.shortcuts import redirect

        if not request.user.is_authenticated and not PUBLIC_SNAPSHOTS:
            return redirect(f'/admin/login/?next={request.path}')

        # ...

        # `context` was previously referenced without ever being defined
        context = self.get_context_data()
        return render(template_name=self.template_name, request=self.request, context=context)
class GalleryDLEmbedView(View):
    """Render the gallery-dl embed template for the requested path."""

    template_name = 'plugin_gallerydl__embed.html'

    def get_context_data(self, **kwargs):
        """Return the base template context (currently empty)."""
        return {
            # **super().get_context_data(**kwargs),
            # 'VERSION': VERSION,
            # 'COMMIT_HASH': COMMIT_HASH,
            # 'FOOTER_INFO': FOOTER_INFO,
        }

    def get(self, request, path):
        """Redirect anonymous users to the login page unless snapshots are public, else render."""
        # imported locally: the module-level import only brings in `render`
        from django.shortcuts import redirect

        if not request.user.is_authenticated and not PUBLIC_SNAPSHOTS:
            return redirect(f'/admin/login/?next={request.path}')

        # ...

        # `context` was previously referenced without ever being defined
        context = self.get_context_data()
        return render(template_name=self.template_name, request=self.request, context=context)
class GalleryDLOutputView(View):
    """Render the gallery-dl output template for the requested path."""

    template_name = 'plugin_gallerydl__output.html'

    def get_context_data(self, **kwargs):
        """Return the base template context (currently empty)."""
        return {
            # **super().get_context_data(**kwargs),
            # 'VERSION': VERSION,
            # 'COMMIT_HASH': COMMIT_HASH,
            # 'FOOTER_INFO': FOOTER_INFO,
        }

    def get(self, request, path):
        """Redirect anonymous users to the login page unless snapshots are public, else render."""
        # imported locally: the module-level import only brings in `render`
        from django.shortcuts import redirect

        if not request.user.is_authenticated and not PUBLIC_SNAPSHOTS:
            return redirect(f'/admin/login/?next={request.path}')

        # ...

        # `context` was previously referenced without ever being defined
        context = self.get_context_data()
        return render(template_name=self.template_name, request=self.request, context=context)

View file

@ -0,0 +1 @@
__package__ = 'archivebox.plugins.replaywebpage'

View file

@ -0,0 +1,8 @@
from django.apps import AppConfig
class ReplayWebPageConfig(AppConfig):
    """Django application config for the ReplayWeb.page viewer plugin."""

    # App labels must be valid Python identifiers: Django splits
    # "app_label.ModelName" references on the dot, so "ReplayWeb.Page" cannot
    # be a label. The display name moves to `verbose_name`.
    label = "replaywebpage"
    verbose_name = "ReplayWeb.Page"
    # NOTE(review): `name` must be the importable dotted path of the app
    # package — confirm "plugin_replaywebpage" is actually importable.
    name = "plugin_replaywebpage"
    default_auto_field = "django.db.models.BigAutoField"

View file

@ -0,0 +1,50 @@
# browsertrix extractor
def save_browsertrix(link, out_dir, timeout, config):
    """Ask the browsertrix container to crawl ``link.url`` and copy back the WACZ.

    A small shell script is piped over ``nc`` to the command listener on the
    ``browsertrix`` host (port 2222). Afterwards the generated ``.wacz`` file
    is copied into ``<out_dir>/browsertrix/``.

    Args:
        link: object exposing ``.url`` and ``.timestamp``.
        out_dir: ``pathlib.Path`` of the snapshot output directory.
        timeout: seconds allowed for the remote command.
        config: currently unused.

    Raises:
        subprocess.TimeoutExpired: if the remote command exceeds ``timeout``.
        Anything raised by ``copy_and_overwrite`` (e.g. the WACZ is missing).
    """
    # stdlib helpers are imported locally because this module has no imports yet.
    # NOTE(review): TimedProgress and copy_and_overwrite are project helpers
    # that still need to be imported at module level.
    from pathlib import Path
    from shlex import join as shell_join, quote as shell_quote
    from subprocess import PIPE, STDOUT, run

    browsertrix_dir = out_dir / 'browsertrix'
    browsertrix_dir.mkdir(exist_ok=True)

    crawl_id = str(link.timestamp)

    browsertrix_crawler_cmd = [
        'crawl',
        '--url', link.url,
        f'--collection={crawl_id}',
        '--scopeType=page',
        '--generateWACZ',
        '--text=final-to-warc',
        '--timeLimit=60',
    ]

    # The script is executed by a shell on the remote side, so every argument
    # (the URL in particular is untrusted input) is shell-quoted.
    remote_cmd = '\n'.join((
        '',
        'rm /tmp/dump.rdb;',
        'rm -rf /crawls/collections;',
        'mkdir /crawls/collections;',
        f'env CRAWL_ID={shell_quote(crawl_id)} {shell_join(browsertrix_crawler_cmd)}',
        '',
    ))
    local_cmd = ['nc', 'browsertrix', '2222']

    # TODO: wrap status/cmd_output in a result object once the return type of
    # this extractor is decided; for now errors simply propagate to the caller.
    status = 'succeeded'
    timer = TimedProgress(timeout, prefix=' ')
    try:
        result = run(
            local_cmd,
            cwd=str(out_dir),
            input=remote_cmd.encode(),   # stdin is a byte stream unless text=True
            stdout=PIPE,                 # without a pipe, result.stdout is None
            stderr=STDOUT,
            timeout=timeout,
        )
        cmd_output = result.stdout.decode(errors='replace')

        # NOTE(review): assumes the container's crawl dir is mounted locally at
        # /browsertrix/crawls/<id>/ — confirm against the docker volume layout.
        wacz_output_file = Path('/browsertrix/crawls') / crawl_id / f'{crawl_id}.wacz'
        copy_and_overwrite(wacz_output_file, browsertrix_dir / wacz_output_file.name)
    finally:
        # assumes TimedProgress exposes end() to stop the progress bar — TODO confirm
        timer.end()
TEMPLATE = """
"""
# rm /tmp/dump.rdb;
# rm -rf /crawls/collections;
# mkdir /crawls/collections;
# env CRAWL_ID=tec2342 crawl --url 'https://example.com' --scopeType page --generateWACZ --collection tec2342 --text final-to-warc --timeLimit 60

View file

@ -0,0 +1,12 @@
# from solo.models import SingletonModel
# class ReplayWebPageConfiguration(SingletonModel):
# site_name = models.CharField(max_length=255, default='Site Name')
# maintenance_mode = models.BooleanField(default=False)
# def __str__(self):
# return "Site Configuration"
# class Meta:
# verbose_name = "Site Configuration"

File diff suppressed because one or more lines are too long

View file

@ -0,0 +1 @@
test content this should be visible

Binary file not shown.

File diff suppressed because one or more lines are too long

View file

@ -0,0 +1,40 @@
{% load tz core_tags static %}
{# Embeds the ReplayWeb.page viewer for a snapshot. #}
{# Context used below: title, snapshot, timestamp, warc_filename. #}
<!DOCTYPE html>
<html lang="en">
    <head>
        {# charset and viewport must be declared in separate <meta> elements #}
        <meta charset="utf-8">
        <meta name="viewport" content="width=device-width, initial-scale=1">
        <title>{{title}}</title>
        <style>
            html, body {
                width: 100%;
                height: 100%;
                background-color: #ddd;
            }
        </style>
    </head>
    <body>
        ReplayWeb.page for: {{snapshot.url}} ({{timestamp}}) /{{warc_filename}}

        {{snapshot}}

        <script>
            // https://cdn.jsdelivr.net/npm/replaywebpage@1.8.14/sw.min.js
            // https://cdn.jsdelivr.net/npm/replaywebpage@1.8.14/ui.min.js
        </script>
        <style>
        </style>
        <script src="/static/ui.js"></script>
        {# TODO: source/url are hard-coded test values, not the snapshot's own WACZ #}
        <replay-web-page
            style="height: 600px"
            embed="replay"
            replayBase="/static/"
            source="/static/test.wacz"
            url="https://example.com/">
        </replay-web-page>
    </body>
</html>

View file

@ -0,0 +1,7 @@
from django.urls import path
from .views import ReplayWebPageViewer
# Single catch-all route: everything after the include prefix is handed to
# ReplayWebPageViewer.get() as its `path` argument.
_viewer_route = path(
    '<path:path>',
    ReplayWebPageViewer.as_view(),
    name='plugin_replaywebpage__viewer',
)

urlpatterns = [_viewer_route]

View file

@ -0,0 +1,47 @@
import os
import sys
from pathlib import Path
from django.views import View
from django.shortcuts import render
from django.db.models import Q
from core.models import Snapshot
# from archivebox.config import PUBLIC_SNAPSHOTS
PUBLIC_SNAPSHOTS = True
class ReplayWebPageViewer(View):
    """Render the ReplayWeb.page viewer template for a snapshot.

    The URL path has the form ``<timestamp-or-id-prefix>[/<warc_filename>]``.
    """

    template_name = 'plugin_replaywebpage__viewer.html'

    def get_context_data(self, **kwargs):
        """Return the base template context (currently empty)."""
        return {
            # **super().get_context_data(**kwargs),
            # 'VERSION': VERSION,
            # 'COMMIT_HASH': COMMIT_HASH,
            # 'FOOTER_INFO': FOOTER_INFO,
        }

    def get(self, request, path):
        """Look up the snapshot named by ``path`` and render the viewer.

        Anonymous users are redirected to the admin login when snapshots are
        not public. Raises Http404 when no snapshot — or more than one —
        matches the requested timestamp / id prefix.
        """
        # imported locally: the module-level import only brings in `render`
        from django.http import Http404
        from django.shortcuts import redirect

        if not request.user.is_authenticated and not PUBLIC_SNAPSHOTS:
            return redirect(f'/admin/login/?next={request.path}')

        # "<timestamp>/<file>" -> (timestamp, file); "<timestamp>" -> (timestamp, '')
        timestamp, _, warc_filename = path.partition('/')

        try:
            snapshot = Snapshot.objects.get(Q(timestamp=timestamp) | Q(id__startswith=timestamp))
        except (Snapshot.DoesNotExist, Snapshot.MultipleObjectsReturned) as err:
            # a bad or ambiguous URL is a client error, not a server crash
            raise Http404(f'No unique snapshot matches "{timestamp}"') from err

        context = self.get_context_data()
        context.update({
            "snapshot": snapshot,
            "timestamp": timestamp,
            "warc_filename": warc_filename,
        })
        return render(template_name=self.template_name, request=self.request, context=context)

View file

@ -0,0 +1,3 @@
__package__ = 'archivebox.plugins.system'
default_app_config = 'plugins.system.apps.SystemPluginAppConfig'

View file

@ -0,0 +1,49 @@
from django.contrib import admin
from solo.admin import SingletonModelAdmin
from plugins.defaults.admin import DependencyAdmin, ExtractorAdmin
from .models import (
BashEnvironmentDependency,
PythonEnvironmentDependency,
NodeJSEnvironmentDependency,
AptEnvironmentDependency,
BrewEnvironmentDependency,
PipEnvironmentDependency,
NPMEnvironmentDependency,
SQLiteDependency,
DjangoDependency,
ArchiveBoxDependency,
# ArchiveBoxDefaultExtractor,
)
print('DefaultsPluginConfig.admin')
class MultiDependencyAdmin(admin.ModelAdmin):
    """Changelist-style admin for a dependency model, one column per setting."""

    # Same read-only fields as the DependencyAdmin imported above.
    readonly_fields = DependencyAdmin.readonly_fields
    list_display = (
        'id',
        'NAME',
        'ENABLED',
        'BINARY',
        'ARGS',
        'bin_path',
        'bin_version',
        'is_valid',
        'is_enabled',
    )
class MultiExtractorAdmin(admin.ModelAdmin):
    """Changelist-style admin for an extractor model."""

    # NOTE(review): this borrows DependencyAdmin's read-only fields although
    # ExtractorAdmin is imported as well — confirm which one is intended.
    readonly_fields = DependencyAdmin.readonly_fields
    list_display = (
        'id',
        'NAME',
        'CMD',
        'ARGS',
        'is_valid',
        'is_enabled',
    )
# Register every host-system dependency model with the Django admin.
# BashEnvironmentDependency uses the list-style MultiDependencyAdmin; all
# other models use DependencyAdmin.
# admin.site.register(BashEnvironmentDependency, DependencyAdmin)
admin.site.register(BashEnvironmentDependency, MultiDependencyAdmin)
admin.site.register(PythonEnvironmentDependency, DependencyAdmin)
admin.site.register(NodeJSEnvironmentDependency, DependencyAdmin)
admin.site.register(AptEnvironmentDependency, DependencyAdmin)
admin.site.register(BrewEnvironmentDependency, DependencyAdmin)
admin.site.register(PipEnvironmentDependency, DependencyAdmin)
admin.site.register(NPMEnvironmentDependency, DependencyAdmin)
admin.site.register(SQLiteDependency, DependencyAdmin)
admin.site.register(DjangoDependency, DependencyAdmin)
admin.site.register(ArchiveBoxDependency, DependencyAdmin)
# admin.site.register(ArchiveBoxDefaultExtractor, ExtractorAdmin)

View file

@ -0,0 +1,21 @@
__package__ = 'archivebox.plugins.system'
from django.apps import AppConfig
class SystemPluginAppConfig(AppConfig):
    """App config for the plugin that models host-system dependencies."""

    name = "plugins.system"
    verbose_name = "Host System Configuration"
    default_auto_field = "django.db.models.AutoField"

    def ready(self):
        """Register this plugin's settings once the app registry is ready."""
        print('plugins.system.apps.SystemPluginConfig.ready')

        # Imported here rather than at module level, as in the original.
        from django.conf import settings
        from plugins.defaults.settings import register_plugin_settings

        # NOTE(review): self.name is the dotted path "plugins.system" — confirm
        # register_plugin_settings expects that rather than the bare "system".
        plugin_name = self.name
        register_plugin_settings(settings, name=plugin_name)

View file

@ -0,0 +1,144 @@
# Generated by Django 3.1.14 on 2024-01-24 08:56
from django.db import migrations, models
class Migration(migrations.Migration):
    """Initial schema: one table per host-system dependency model.

    Every model has the same columns (id, ENABLED, BINARY, ARGS); they differ
    only in defaults, in which fields are editable, and in verbose_name.
    """

    initial = True

    dependencies = [
    ]

    operations = [
        # --- Package Manager: apt ---
        migrations.CreateModel(
            name='AptEnvironmentDependency',
            fields=[
                ('id', models.AutoField(primary_key=True, serialize=False)),
                ('ENABLED', models.BooleanField(default=True)),
                ('BINARY', models.CharField(default='apt-get', max_length=255)),
                ('ARGS', models.CharField(default='-qq', max_length=255)),
            ],
            options={
                'verbose_name': 'Package Manager: apt',
                'abstract': False,
            },
        ),
        # --- Internal Dependency: ArchiveBox Package ---
        migrations.CreateModel(
            name='ArchiveBoxDependency',
            fields=[
                ('id', models.AutoField(primary_key=True, serialize=False)),
                ('ENABLED', models.BooleanField(default=True, editable=False)),
                ('BINARY', models.CharField(default='archivebox', editable=False, max_length=255)),
                ('ARGS', models.CharField(default=[], editable=False, max_length=255)),
            ],
            options={
                'verbose_name': 'Internal Dependency: ArchiveBox Package',
                'abstract': False,
            },
        ),
        # --- Shell Environment: bash ---
        migrations.CreateModel(
            name='BashEnvironmentDependency',
            fields=[
                ('id', models.AutoField(primary_key=True, serialize=False)),
                ('ENABLED', models.BooleanField(default=True, editable=False)),
                ('BINARY', models.CharField(default='bash', max_length=255)),
                ('ARGS', models.CharField(default='-c', max_length=255)),
            ],
            options={
                'verbose_name': 'Shell Environment: bash',
                'abstract': False,
            },
        ),
        # --- Package Manager: brew ---
        migrations.CreateModel(
            name='BrewEnvironmentDependency',
            fields=[
                ('id', models.AutoField(primary_key=True, serialize=False)),
                ('ENABLED', models.BooleanField(default=True)),
                ('BINARY', models.CharField(default='brew', max_length=255)),
                ('ARGS', models.CharField(default='', max_length=255)),
            ],
            options={
                'verbose_name': 'Package Manager: brew',
                'abstract': False,
            },
        ),
        # --- Internal Dependency: Django Package ---
        migrations.CreateModel(
            name='DjangoDependency',
            fields=[
                ('id', models.AutoField(primary_key=True, serialize=False)),
                ('ENABLED', models.BooleanField(default=True, editable=False)),
                ('BINARY', models.CharField(default='django-admin.py', editable=False, max_length=255)),
                ('ARGS', models.CharField(default=[], editable=False, max_length=255)),
            ],
            options={
                'verbose_name': 'Internal Dependency: Django Package',
                'abstract': False,
            },
        ),
        # --- Shell Environment: NodeJS ---
        migrations.CreateModel(
            name='NodeJSEnvironmentDependency',
            fields=[
                ('id', models.AutoField(primary_key=True, serialize=False)),
                ('ENABLED', models.BooleanField(default=True)),
                ('BINARY', models.CharField(default='node', max_length=255)),
                ('ARGS', models.CharField(default='-c', max_length=255)),
            ],
            options={
                'verbose_name': 'Shell Environment: NodeJS',
                'abstract': False,
            },
        ),
        # --- Package Manager: npm ---
        migrations.CreateModel(
            name='NPMEnvironmentDependency',
            fields=[
                ('id', models.AutoField(primary_key=True, serialize=False)),
                ('ENABLED', models.BooleanField(default=True)),
                ('BINARY', models.CharField(default='node', max_length=255)),
                ('ARGS', models.CharField(default='', max_length=255)),
            ],
            options={
                'verbose_name': 'Package Manager: npm',
                'abstract': False,
            },
        ),
        # --- Package Manager: pip ---
        migrations.CreateModel(
            name='PipEnvironmentDependency',
            fields=[
                ('id', models.AutoField(primary_key=True, serialize=False)),
                ('ENABLED', models.BooleanField(default=True)),
                ('BINARY', models.CharField(default='pip3', max_length=255)),
                ('ARGS', models.CharField(default='', max_length=255)),
            ],
            options={
                'verbose_name': 'Package Manager: pip',
                'abstract': False,
            },
        ),
        # --- Shell Environment: Python3 ---
        migrations.CreateModel(
            name='PythonEnvironmentDependency',
            fields=[
                ('id', models.AutoField(primary_key=True, serialize=False)),
                ('ENABLED', models.BooleanField(default=True, editable=False)),
                ('BINARY', models.CharField(default='python3', max_length=255)),
                ('ARGS', models.CharField(default='-c', max_length=255)),
            ],
            options={
                'verbose_name': 'Shell Environment: Python3',
                'abstract': False,
            },
        ),
        # --- Internal Dependency: SQLite3 Package ---
        migrations.CreateModel(
            name='SQLiteDependency',
            fields=[
                ('id', models.AutoField(primary_key=True, serialize=False)),
                ('ENABLED', models.BooleanField(default=True, editable=False)),
                ('BINARY', models.CharField(default='sqlite3', editable=False, max_length=255)),
                ('ARGS', models.CharField(default=[], editable=False, max_length=255)),
            ],
            options={
                'verbose_name': 'Internal Dependency: SQLite3 Package',
                'abstract': False,
            },
        ),
    ]

View file

@ -0,0 +1,33 @@
# Generated by Django 3.1.14 on 2024-01-24 09:43
from django.db import migrations
class Migration(migrations.Migration):
    """Rename the verbose_name of five dependency models to their binary names."""

    dependencies = [
        ('system', '0001_initial'),
    ]

    # Each operation only replaces the model's Meta options; no schema change.
    operations = [
        migrations.AlterModelOptions(
            name='archiveboxdependency',
            options={'verbose_name': 'Internal Dependency: archivebox'},
        ),
        migrations.AlterModelOptions(
            name='djangodependency',
            options={'verbose_name': 'Internal Dependency: django'},
        ),
        migrations.AlterModelOptions(
            name='nodejsenvironmentdependency',
            options={'verbose_name': 'Shell Environment: node'},
        ),
        migrations.AlterModelOptions(
            name='pythonenvironmentdependency',
            options={'verbose_name': 'Shell Environment: python3'},
        ),
        migrations.AlterModelOptions(
            name='sqlitedependency',
            options={'verbose_name': 'Internal Dependency: sqlite3'},
        ),
    ]

View file

@ -0,0 +1,22 @@
# Generated by Django 3.1.14 on 2024-01-24 09:56
from django.db import migrations, models
class Migration(migrations.Migration):
    """Give the bash dependency a plural verbose name and a VERSION_CMD column."""

    dependencies = [
        ('system', '0002_auto_20240124_0943'),
    ]

    operations = [
        # Meta-only change: adds verbose_name_plural.
        migrations.AlterModelOptions(
            name='bashenvironmentdependency',
            options={
                'verbose_name': 'Shell Environment: bash',
                'verbose_name_plural': 'Shell Environments: bash',
            },
        ),
        # Schema change: new VERSION_CMD template column.
        migrations.AddField(
            model_name='bashenvironmentdependency',
            name='VERSION_CMD',
            field=models.CharField(default='{BINARY} --version', max_length=255),
        ),
    ]

View file

@ -0,0 +1,448 @@
# __package__ = 'archivebox.plugins.system'
import os
import shutil
import sys
import inspect
import django
from sqlite3 import dbapi2 as sqlite3
from pathlib import Path
from typing import List, Dict, Any
from django.db import models
from django.utils.functional import cached_property
from solo.models import SingletonModel
from config import bin_path, bin_version, VERSION
from plugins.defaults.models import ArchiveBoxBaseDependency
ConfigDict = Dict[str, Any]
class BashEnvironmentDependency(ArchiveBoxBaseDependency):
    """Required host dependency: the ``bash`` shell environment."""

    singleton_instance_id = 1

    id = models.AutoField(primary_key=True)

    # --- identity ---
    NAME = 'BASH'
    LABEL = "Bash"
    REQUIRED = True

    # --- what this dependency needs in order to be installed ---
    PARENT_DEPENDENCIES = []
    BIN_DEPENDENCIES: List[str] = ['bash']
    APT_DEPENDENCIES: List[str] = []
    BREW_DEPENDENCIES: List[str] = []
    PIP_DEPENDENCIES: List[str] = []
    NPM_DEPENDENCIES: List[str] = []

    # --- defaults for the user-editable fields below ---
    DEFAULT_BINARY = 'bash'
    DEFAULT_START_CMD = None
    DEFAULT_STOP_CMD = None
    DEFAULT_PID_FILE = None
    DEFAULT_ARGS = '-c'

    # ENABLED is only editable when the dependency is optional.
    ENABLED = models.BooleanField(default=True, editable=not REQUIRED)
    BINARY = models.CharField(max_length=255, default=DEFAULT_BINARY)
    ARGS = models.CharField(max_length=255, default=DEFAULT_ARGS)
    VERSION_CMD = models.CharField(max_length=255, default='{BINARY} --version')

    # START_CMD = models.CharField(max_length=255, default=DEFAULT_START_CMD)
    # WORKERS = models.IntegerField(default=1)

    class Meta:
        abstract = False
        app_label = 'system'
        verbose_name = "Shell Environment: bash"
        verbose_name_plural = "Shell Environments: bash"

    # @task
    def install_pkgs(self, os_pkgs=()):
        """Assert that each binary named in ``os_pkgs`` resolves via ``bin_path``.

        Nothing is installed here; a missing binary raises AssertionError.
        Returns True when every check passes.
        """
        assert self.is_valid, 'Bash environment is not available on this host'

        for binary_name in os_pkgs:
            assert bin_path(binary_name)

        return True
class PythonEnvironmentDependency(ArchiveBoxBaseDependency):
    """Required host dependency: the ``python3`` interpreter."""

    singleton_instance_id = 1

    id = models.AutoField(primary_key=True)

    # --- identity ---
    NAME = 'PYTHON'
    LABEL = "Python"
    REQUIRED = True

    # --- what this dependency needs in order to be installed ---
    PARENT_DEPENDENCIES = []
    BIN_DEPENDENCIES = ['python3']
    APT_DEPENDENCIES = []
    BREW_DEPENDENCIES = []
    PIP_DEPENDENCIES = []
    NPM_DEPENDENCIES = []

    # --- defaults for the user-editable fields below ---
    DEFAULT_BINARY = 'python3'
    DEFAULT_START_CMD = None
    DEFAULT_STOP_CMD = None
    DEFAULT_PID_FILE = None
    DEFAULT_ARGS = '-c'
    VERSION_CMD = '{BINARY} --version'

    # ENABLED is only editable when the dependency is optional.
    ENABLED = models.BooleanField(default=True, editable=not REQUIRED)
    BINARY = models.CharField(max_length=255, default=DEFAULT_BINARY)
    ARGS = models.CharField(max_length=255, default=DEFAULT_ARGS)

    # START_CMD = models.CharField(max_length=255, default=DEFAULT_START_CMD)
    # WORKERS = models.IntegerField(default=1)

    class Meta:
        abstract = False
        app_label = 'system'
        verbose_name = "Shell Environment: python3"
class NodeJSEnvironmentDependency(ArchiveBoxBaseDependency, SingletonModel):
    """Required host dependency: the ``node`` JavaScript runtime."""

    singleton_instance_id = 1

    id = models.AutoField(primary_key=True)

    # --- identity ---
    NAME = 'NODEJS'
    LABEL = "NodeJS"
    REQUIRED = True

    # --- what this dependency needs in order to be installed ---
    PARENT_DEPENDENCIES = []
    BIN_DEPENDENCIES = ['node']
    APT_DEPENDENCIES = []
    BREW_DEPENDENCIES = []
    PIP_DEPENDENCIES = []
    NPM_DEPENDENCIES = []

    # --- defaults for the user-editable fields below ---
    DEFAULT_BINARY = 'node'
    DEFAULT_START_CMD = None
    DEFAULT_STOP_CMD = None
    DEFAULT_PID_FILE = None
    # NOTE(review): for node, `-c` is `--check` (syntax check only) while `-e`
    # evaluates a script — confirm which one is intended here.
    DEFAULT_ARGS = '-c'
    VERSION_CMD = '{BINARY} --version'

    ENABLED = models.BooleanField(default=True, editable=True)
    BINARY = models.CharField(max_length=255, default=DEFAULT_BINARY)
    ARGS = models.CharField(max_length=255, default=DEFAULT_ARGS)

    # START_CMD = models.CharField(max_length=255, default=DEFAULT_START_CMD)
    # WORKERS = models.IntegerField(default=1)

    class Meta:
        abstract = False
        app_label = 'system'
        verbose_name = "Shell Environment: node"
class AptEnvironmentDependency(ArchiveBoxBaseDependency, SingletonModel):
    """Optional host dependency: the ``apt-get`` package manager."""

    singleton_instance_id = 1

    id = models.AutoField(primary_key=True)

    NAME = 'APT'
    LABEL = "apt"
    REQUIRED = False

    PARENT_DEPENDENCIES = ['BashEnvironmentDependency']

    BIN_DEPENDENCIES = ['apt-get']
    APT_DEPENDENCIES = []
    BREW_DEPENDENCIES = []
    PIP_PACKAGES = []
    NPM_PACKAGES = []

    DEFAULT_BINARY = 'apt-get'
    DEFAULT_START_CMD = None
    DEFAULT_STOP_CMD = None
    DEFAULT_PID_FILE = None
    DEFAULT_ARGS = '-qq'

    ENABLED = models.BooleanField(default=True, editable=not REQUIRED)
    BINARY = models.CharField(max_length=255, default=DEFAULT_BINARY)
    ARGS = models.CharField(max_length=255, default=DEFAULT_ARGS)

    class Meta:
        abstract = False
        app_label = 'system'
        verbose_name = "Package Manager: apt"

    # @task
    def install_pkgs(self, apt_pkgs=()):
        """Refresh the apt package lists, then install each package in ``apt_pkgs``.

        Returns True when every command succeeds.

        Raises:
            AssertionError: if this dependency is not valid on the host.
            subprocess.CalledProcessError: if any apt-get command fails.
        """
        # `run` was never imported by this module and was called with a `cmd=`
        # keyword that subprocess.run does not accept; import it locally.
        from subprocess import run

        assert self.is_valid, 'Apt environment is not available on this host'

        # NOTE(review): the commands use DEFAULT_BINARY, not the user-configured
        # BINARY field — confirm that is intended.
        # with huey.lock_task('apt-install'):
        # check=True so a failed install raises instead of being reported as success
        run([self.DEFAULT_BINARY, '-qq', 'update'], check=True)
        for apt_package in apt_pkgs:
            run([self.DEFAULT_BINARY, 'install', '-y', apt_package], check=True)

        return True
class BrewEnvironmentDependency(ArchiveBoxBaseDependency, SingletonModel):
    """Optional host dependency: the Homebrew (``brew``) package manager."""

    singleton_instance_id = 1

    id = models.AutoField(primary_key=True)

    NAME = 'BREW'
    LABEL = "homebrew"
    REQUIRED = False

    PARENT_DEPENDENCIES = ['BashEnvironmentDependency']

    BIN_DEPENDENCIES = ['brew']
    APT_DEPENDENCIES = []
    BREW_DEPENDENCIES = []
    PIP_PACKAGES = []
    NPM_PACKAGES = []

    DEFAULT_BINARY = 'brew'
    DEFAULT_START_CMD = None
    DEFAULT_STOP_CMD = None
    DEFAULT_PID_FILE = None
    DEFAULT_ARGS = ''

    ENABLED = models.BooleanField(default=True, editable=True)
    BINARY = models.CharField(max_length=255, default=DEFAULT_BINARY)
    ARGS = models.CharField(max_length=255, default=DEFAULT_ARGS)

    class Meta:
        abstract = False
        app_label = 'system'
        verbose_name = "Package Manager: brew"

    # @task
    def install_pkgs(self, brew_pkgs=()):
        """Run ``brew update``, then install each formula in ``brew_pkgs``.

        Returns True when every command succeeds.

        Raises:
            AssertionError: if this dependency is not valid on the host.
            subprocess.CalledProcessError: if any brew command fails.
        """
        # `run` was never imported by this module and was called with a `cmd=`
        # keyword that subprocess.run does not accept; import it locally.
        from subprocess import run

        assert self.is_valid, 'Brew environment is not available on this host'

        # NOTE(review): the commands use DEFAULT_BINARY, not the user-configured
        # BINARY field — confirm that is intended.
        # check=True so a failed install raises instead of being reported as success
        run([self.DEFAULT_BINARY, 'update'], check=True)
        for brew_pkg in brew_pkgs:
            run([self.DEFAULT_BINARY, 'install', brew_pkg], check=True)

        return True
class PipEnvironmentDependency(ArchiveBoxBaseDependency, SingletonModel):
    """Optional host dependency: the ``pip3`` Python package manager."""

    singleton_instance_id = 1

    id = models.AutoField(primary_key=True)

    NAME = 'PIP'
    LABEL = "pip"
    REQUIRED = False

    PARENT_DEPENDENCIES = ['BashEnvironmentDependency']

    BIN_DEPENDENCIES = ['python3', 'pip3']
    # The Debian/Ubuntu package that provides pip3 is `python3-pip`.
    APT_DEPENDENCIES = ['python3.11', 'python3-pip', 'pipx']
    BREW_DEPENDENCIES = ['python@3.11', 'pipx']
    PIP_PACKAGES = ['setuptools', 'pipx']
    NPM_PACKAGES = []

    DEFAULT_BINARY = 'pip3'
    DEFAULT_START_CMD = None
    DEFAULT_STOP_CMD = None
    DEFAULT_PID_FILE = None
    DEFAULT_ARGS = ''
    VERSION_CMD = '{BINARY} --version'

    ENABLED = models.BooleanField(default=True, editable=True)
    BINARY = models.CharField(max_length=255, default=DEFAULT_BINARY)
    ARGS = models.CharField(max_length=255, default=DEFAULT_ARGS)

    class Meta:
        abstract = False
        app_label = 'system'
        verbose_name = "Package Manager: pip"

    # @task
    def install_pkgs(self, pip_pkgs=()):
        """Install (or upgrade) each package in ``pip_pkgs`` with pip.

        Returns True when every command succeeds.

        Raises:
            AssertionError: if this dependency is not valid on the host.
            subprocess.CalledProcessError: if any pip command fails.
        """
        # `run` was never imported by this module and was called with a `cmd=`
        # keyword that subprocess.run does not accept; import it locally.
        from subprocess import run

        assert self.is_valid, 'Pip environment is not available on this host'

        # NOTE(review): the commands use DEFAULT_BINARY, not the user-configured
        # BINARY field — confirm that is intended.
        for pip_pkg in pip_pkgs:
            # pip's flag is --upgrade; `--update` is rejected as an unknown option
            run([self.DEFAULT_BINARY, 'install', '--upgrade', '--ignore-installed', pip_pkg], check=True)

        return True
class NPMEnvironmentDependency(ArchiveBoxBaseDependency, SingletonModel):
    """Optional host dependency: the ``npm`` JavaScript package manager."""

    singleton_instance_id = 1

    id = models.AutoField(primary_key=True)

    # Previously 'NODEJS'/"NodeJS", duplicating NodeJSEnvironmentDependency.
    NAME = 'NPM'
    LABEL = "npm"
    REQUIRED = False

    PARENT_DEPENDENCIES = ['BashEnvironmentDependency']

    BIN_DEPENDENCIES = ['node', 'npm']
    # The Debian/Ubuntu package for node is `nodejs`.
    APT_DEPENDENCIES = ['nodejs', 'npm']
    # NOTE(review): Homebrew's `node` formula normally ships npm — confirm a
    # separate `npm` formula is really needed.
    BREW_DEPENDENCIES = ['node', 'npm']
    PIP_PACKAGES = []
    NPM_PACKAGES = ['npm']

    # Was 'node', which made install_pkgs() run `node install <pkg>`.
    # NOTE(review): migration 0001 still records 'node' as the BINARY default;
    # run makemigrations to pick up this change.
    DEFAULT_BINARY = 'npm'
    DEFAULT_START_CMD = None
    DEFAULT_STOP_CMD = None
    DEFAULT_PID_FILE = None
    DEFAULT_ARGS = ''
    VERSION_CMD = '{BINARY} --version'

    ENABLED = models.BooleanField(default=True, editable=True)
    BINARY = models.CharField(max_length=255, default=DEFAULT_BINARY)
    ARGS = models.CharField(max_length=255, default=DEFAULT_ARGS)

    class Meta:
        abstract = False
        app_label = 'system'
        verbose_name = "Package Manager: npm"

    # @task
    def install_pkgs(self, npm_pkgs=()):
        """Install each package in ``npm_pkgs`` with ``npm install``.

        Returns True when every command succeeds.

        Raises:
            AssertionError: if this dependency is not valid on the host.
            subprocess.CalledProcessError: if any npm command fails.
        """
        # `run` was never imported by this module and was called with a `cmd=`
        # keyword that subprocess.run does not accept; import it locally.
        from subprocess import run

        assert self.is_valid, 'NPM environment is not available on this host'

        for npm_pkg in npm_pkgs:
            run([self.DEFAULT_BINARY, 'install', npm_pkg], check=True)

        return True
class DjangoDependency(ArchiveBoxBaseDependency, SingletonModel):
    """Required internal dependency: the installed Django package."""

    singleton_instance_id = 1

    id = models.AutoField(primary_key=True)

    # --- identity ---
    NAME = 'DJANGO'
    LABEL = "Django"
    REQUIRED = True

    # --- what this dependency needs in order to be installed ---
    PARENT_DEPENDENCIES = []
    BIN_DEPENDENCIES = ['django-admin.py']
    APT_DEPENDENCIES = []
    BREW_DEPENDENCIES = []
    PIP_PACKAGES = ['django==3.1.14']
    NPM_PACKAGES = []

    # --- defaults ---
    DEFAULT_BINARY = 'django-admin.py'
    DEFAULT_START_CMD = 'archivebox server 0.0.0.0:8000'
    DEFAULT_PID_FILE = 'logs/{NAME}_WORKER.pid'
    DEFAULT_STOP_CMD = 'kill "$(<{PID_FILE})"'
    DEFAULT_ARGS = []
    VERSION_CMD = '{BINARY} --version'

    # None of the fields are editable.
    ENABLED = models.BooleanField(default=True, editable=False)
    BINARY = models.CharField(max_length=255, default=DEFAULT_BINARY, editable=False)
    ARGS = models.CharField(max_length=255, default=DEFAULT_ARGS, editable=False)

    class Meta:
        abstract = False
        app_label = 'system'
        verbose_name = "Internal Dependency: django"

    @cached_property
    def bin_path(self):
        """Return the file the imported ``django`` package was loaded from."""
        django_file = inspect.getfile(django)
        return django_file

    @cached_property
    def bin_version(self):
        """Return the first three components of ``django.VERSION`` joined by dots."""
        release_parts = [str(part) for part in django.VERSION[:3]]
        return '.'.join(release_parts)
class SQLiteDependency(ArchiveBoxBaseDependency, SingletonModel):
    """Required internal dependency: the SQLite library used via Python's sqlite3."""

    singleton_instance_id = 1

    id = models.AutoField(primary_key=True)

    NAME = 'SQLITE'
    LABEL = "SQLite"
    REQUIRED = True

    PARENT_DEPENDENCIES = []

    BIN_DEPENDENCIES = []
    APT_DEPENDENCIES = []
    BREW_DEPENDENCIES = []
    PIP_PACKAGES = []
    NPM_PACKAGES = []

    DEFAULT_BINARY = 'sqlite3'
    DEFAULT_START_CMD = None
    DEFAULT_STOP_CMD = None
    DEFAULT_PID_FILE = None
    DEFAULT_ARGS = []
    VERSION_CMD = 'python3 -c ""'

    ENABLED = models.BooleanField(default=True, editable=False)
    BINARY = models.CharField(max_length=255, default=DEFAULT_BINARY, editable=False)
    ARGS = models.CharField(max_length=255, default=DEFAULT_ARGS, editable=False)

    class Meta:
        abstract = False
        app_label = 'system'
        verbose_name = "Internal Dependency: sqlite3"

    @cached_property
    def bin_path(self):
        """Return the file the ``sqlite3.dbapi2`` module was loaded from."""
        return inspect.getfile(sqlite3)

    @cached_property
    def bin_version(self):
        """Return the version of the SQLite library linked into Python."""
        # `sqlite3.version` is the DB-API module's own version string (and is
        # deprecated); `sqlite_version` is the actual SQLite library version.
        return sqlite3.sqlite_version
class ArchiveBoxDependency(ArchiveBoxBaseDependency):
    """Required internal dependency: the ArchiveBox package itself."""

    singleton_instance_id = 1

    id = models.AutoField(primary_key=True)

    # --- identity ---
    NAME = 'ARCHIVEBOX'
    LABEL = "ArchiveBox"
    REQUIRED = True

    # --- what this dependency needs in order to be installed ---
    PARENT_DEPENDENCIES = [
        'PipEnvironmentDependency',
        'DjangoDependency',
        'SQLiteDependency',
    ]
    BIN_DEPENDENCIES = ['archivebox']
    APT_DEPENDENCIES = []
    BREW_DEPENDENCIES = []
    PIP_PACKAGES = ['archivebox']
    NPM_PACKAGES = []

    # --- defaults ---
    DEFAULT_BINARY = 'archivebox'
    DEFAULT_START_CMD = '{BINARY} server 0.0.0.0:8000'
    DEFAULT_ARGS = []
    VERSION_CMD = 'archivebox --version'

    # None of the fields are editable.
    ENABLED = models.BooleanField(default=True, editable=False)
    BINARY = models.CharField(max_length=255, default=DEFAULT_BINARY, editable=False)
    ARGS = models.CharField(max_length=255, default=DEFAULT_ARGS, editable=False)

    class Meta:
        abstract = False
        app_label = 'system'
        verbose_name = "Internal Dependency: archivebox"

    @cached_property
    def bin_path(self):
        """Return ``sys.argv[0]``, or look up 'archivebox' when argv[0] is empty."""
        entrypoint = sys.argv[0]
        if entrypoint:
            return entrypoint
        # inside a method, `bin_path` resolves to the module-level helper
        return bin_path('archivebox')

    @cached_property
    def bin_version(self):
        """Return the VERSION constant imported from ``config``."""
        # return config['VERSION']
        return VERSION

View file

@ -0,0 +1,3 @@
from django.conf import settings
from plugins.defaults import register_plugin_settings

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View file

@ -271,7 +271,11 @@ def get_headers(url: str, timeout: int=None) -> str:
return pyjson.dumps(
{
'URL': url,
'Status-Code': response.status_code,
'Elapsed': response.elapsed,
'Encoding': response.encoding,
'Apparent-Encoding': response.apparent_encoding,
**dict(response.headers),
},
indent=4,

View file

@ -64,7 +64,7 @@ if [[ -d "$DATA_DIR/archive" ]]; then
rm -f "$DATA_DIR/archive/.permissions_test_safe_to_delete"
# echo "[√] Permissions are correct"
else
# the only time this fails is if the host filesystem doesn't allow us to write as root (e.g. some NFS mapall/maproot problems, connection issues, drive disappeared, etc.)
# the only time this fails is if the host filesystem doesn't allow us to write as root (e.g. some NFS mapall/maproot problems, connection issues, drive disappeared, etc.)
echo -e "\n[X] Error: archivebox user (PUID=$PUID) is not able to write to your ./data/archive dir (currently owned by $(stat -c '%u' "$DATA_DIR/archive"):$(stat -c '%g' "$DATA_DIR/archive")." > /dev/stderr
echo -e " Change ./data to be owned by PUID=$PUID PGID=$PGID on the host and retry:" > /dev/stderr
echo -e " \$ chown -R $PUID:$PGID ./data\n" > /dev/stderr
@ -89,7 +89,8 @@ if ! chown $PUID:$PGID "$DATA_DIR"/* > /dev/null 2>&1; then
find "$DATA_DIR" -type d -not -path "$DATA_DIR/archive*" -exec chown $PUID:$PGID {} \; > /dev/null 2>&1
find "$DATA_DIR" -type f -not -path "$DATA_DIR/archive/*" -exec chown $PUID:$PGID {} \; > /dev/null 2>&1
fi
mkdir -p /var/spool/cron/crontabs
chown -R $PUID:$PGID /var/spool/cron/crontabs > /dev/null 2>&1 &
# also chown BROWSERS_DIR because otherwise 'archivebox setup' won't be able to 'playwright install chromium' at runtime
export PLAYWRIGHT_BROWSERS_PATH="${PLAYWRIGHT_BROWSERS_PATH:-/browsers}"
@ -191,9 +192,11 @@ if [[ "$1" == /* || "$1" == "bash" || "$1" == "sh" || "$1" == "echo" || "$1" ==
# "docker run archivebox /bin/bash -c '...'"
# "docker run archivebox cat /VERSION.txt"
exec gosu "$PUID" /bin/bash -c "exec $(printf ' %q' "$@")"
# WARNING: make sure to test extensively if you change this line, there are many edge-cases with nested quotes, special characters, etc.
# printf requotes shell parameters properly https://stackoverflow.com/a/39463371/2156113
# gosu spawns an ephemeral bash process owned by archivebox user (bash wrapper is needed to load env vars, PATH, and setup terminal TTY)
# outermost exec hands over current process ID to inner bash process, inner exec hands over inner bash PID to user's command
# - https://github.com/ArchiveBox/ArchiveBox/issues/1191
else
# handle "docker run archivebox add some subcommand --with=args abc" by calling archivebox with the args as a CLI subcommand
# e.g. "docker run archivebox help"

38
bin/docker_ipc_listener.py Executable file
View file

@ -0,0 +1,38 @@
#!/usr/bin/env python3
# Allow another docker container to run commands on this container
# This is the script to run on the server container.
# The client can connect and run a command like so:
# $ echo whoami | nc servercontainername 2222
# root
#
# WARNING: this executes whatever it receives through a shell, with no
# authentication. Only expose the port on a trusted, private docker network.

import socket
import subprocess as sp
from datetime import datetime

LISTEN_PORT = 2222


def run_command(conn, cmd, client_addr):
    """Run `cmd` in a shell and stream its combined stdout/stderr to `conn`.

    conn:        connected socket that receives the command output
    cmd:         shell command line exactly as received from the client
    client_addr: (ip, port) of the remote peer, used only for the log line
    """
    timestamp = datetime.now().isoformat()
    # log the *peer* address returned by accept(); conn.getsockname() would
    # report this listener's own address instead of the client's
    client_ip, client_port = client_addr[:2]
    print(f'\n[{timestamp}][{client_ip}:{client_port}] $', cmd)

    with sp.Popen(cmd, shell=True, stdout=sp.PIPE, stderr=sp.STDOUT, stdin=sp.PIPE, bufsize=1, universal_newlines=True) as p:
        for line in p.stdout:
            print(line.strip(), flush=True)
            conn.sendall(line.encode("utf-8"))


def serve(port=LISTEN_PORT):
    """Accept connections forever, running one shell command per connection."""
    s1 = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    s1.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
    s1.bind(("0.0.0.0", port))
    s1.listen(1)
    print(f"Listening for shell commands on 0.0.0.0:{port}", flush=True)

    while True:
        conn, addr = s1.accept()
        # `with` closes the connection on every path, including when the
        # client disconnects without sending anything or the command fails
        with conn:
            cmd = conn.recv(1024).decode()
            if not cmd:
                continue
            run_command(conn, cmd, addr)


if __name__ == "__main__":
    serve()

View file

@ -39,7 +39,6 @@ services:
# dns:
# - 172.20.0.53
######## Optional Addons: tweak examples below as needed for your specific use case ########
### This optional container runs any scheduled tasks in the background, add new tasks like so:
@ -188,6 +187,13 @@ services:
# - ./wireguard.conf:/config/wg0.conf:ro
### Example: Run browsertrix in parallel with ArchiveBox
# browsertrix:
# image: webrecorder/browsertrix-crawler:latest
# volumes:
# - ./browsertrix:/crawls:z
### Example: Run PYWB in parallel and auto-import WARCs from ArchiveBox
# pywb:

View file

@ -7,7 +7,12 @@ wsgi-file = archivebox/core/wsgi.py
processes = 4
threads = 1
stats = 127.0.0.1:9191
static-map /static=./archivebox/templates/static
static-map = /static=./archivebox/templates/static
static-map = /static=./archivebox/plugins/replaywebpage/static
static-map = /archive=$(PWD)/archive
; serve index.html when a directory under a static mapping is requested
static-index = index.html
harakiri = 172800
post-buffering = 1
disable-logging = True
check-static
honour-range = True

View file

@ -1,13 +1,14 @@
{
"name": "archivebox",
"version": "0.8.0",
"description": "ArchiveBox: The self-hosted internet archive",
"author": "Nick Sweeting <archivebox-npm@sweeting.me>",
"repository": "github:ArchiveBox/ArchiveBox",
"license": "MIT",
"dependencies": {
"@postlight/parser": "^2.2.3",
"readability-extractor": "github:ArchiveBox/readability-extractor",
"single-file-cli": "^1.1.54"
}
"name": "archivebox",
"version": "0.7.3",
"description": "ArchiveBox: The self-hosted internet archive",
"author": "Nick Sweeting <archivebox-npm@sweeting.me>",
"repository": "github:ArchiveBox/ArchiveBox",
"license": "MIT",
"dependencies":
{
"@postlight/parser": "^2.2.3",
"readability-extractor": "github:ArchiveBox/readability-extractor",
"single-file-cli": "^1.1.54"
}
}

View file

@ -359,6 +359,19 @@ files = [
{file = "django_ninja-1.1.0.tar.gz", hash = "sha256:87bff046416a2653ed2fbef1408e101292bf8170684821bac82accfd73bef059"},
]
[[package]]
name = "django-solo"
version = "2.0.0"
requires_python = ">=3.6"
summary = "Django Solo helps working with singletons"
dependencies = [
"django>=2.2",
]
files = [
{file = "django-solo-2.0.0.tar.gz", hash = "sha256:7c6dbe04ae858a4645b580ec83a31a960a067ad4525d8227cca50b7fc5983a62"},
{file = "django_solo-2.0.0-py3-none-any.whl", hash = "sha256:9046eca738f2ed64dbef38c2107a02af1065a8899b4f9fabf61b06b8325de1b4"},
]
[[package]]
name = "exceptiongroup"
version = "1.2.1"
@ -902,22 +915,12 @@ files = [
[[package]]
name = "setuptools"
version = "69.5.1"
version = "69.0.3"
requires_python = ">=3.8"
summary = "Easily download, build, install, upgrade, and uninstall Python packages"
groups = ["default"]
files = [
{file = "setuptools-69.5.1-py3-none-any.whl", hash = "sha256:c636ac361bc47580504644275c9ad802c50415c7522212252c033bd15f301f32"},
{file = "setuptools-69.5.1.tar.gz", hash = "sha256:6c1fccdac05a97e598fb0ae3bbed5904ccb317337a51139dcd51453611bbb987"},
]
[[package]]
name = "sgmllib3k"
version = "1.0.0"
summary = "Py3k port of sgmllib."
groups = ["default"]
files = [
{file = "sgmllib3k-1.0.0.tar.gz", hash = "sha256:7868fb1c8bfa764c1ac563d3cf369c381d1325d36124933a726f29fcdaa812e9"},
{file = "setuptools-69.0.3-py3-none-any.whl", hash = "sha256:385eb4edd9c9d5c17540511303e39a147ce2fc04bc55289c322b9e5904fe2c05"},
{file = "setuptools-69.0.3.tar.gz", hash = "sha256:be1af57fc409f93647f2e8e4573a142ed38724b8cdd389706a867bb4efcf1e78"},
]
[[package]]

View file

@ -16,6 +16,7 @@ dependencies = [
"setuptools>=69.5.1",
"django>=4.2.0,<5.0",
"django-ninja>=1.1.0",
"django-solo>=2.0.0",
"django-extensions>=3.2.3",
"mypy-extensions>=1.0.0",
@ -123,7 +124,9 @@ test = [
lint = [
"flake8",
"mypy",
"django-stubs",
"django-stubs[compatible-mypy]>=4.2.7",
"types-requests>=2.31.0.20240125",
"pudb>=2024.1",
]
[build-system]
@ -133,6 +136,21 @@ build-backend = "pdm.backend"
[project.scripts]
archivebox = "archivebox.cli:main"
[tool.pyright]
include = ["archivebox"]
exclude = ["data", "data2", "data3", "data4", "data5", "pip_dist", "brew_dist", "dist", "vendor", "migrations", "tests"]
[tool.mypy]
mypy_path = "archivebox"
explicit_package_bases = true
check_untyped_defs = true
plugins = ["mypy_django_plugin.main"]
# TODO: remove this eventually https://github.com/hauntsaninja/no_implicit_optional
implicit_optional = true
[tool.django-stubs]
django_settings_module = "core.settings"
strict_settings = false
[tool.pdm.scripts]
lint = "./bin/lint.sh"
@ -142,19 +160,6 @@ test = "./bin/test.sh"
[tool.pytest.ini_options]
testpaths = [ "tests" ]
[tool.mypy]
mypy_path = "archivebox"
namespace_packages = true
explicit_package_bases = true
# follow_imports = "silent"
# ignore_missing_imports = true
# disallow_incomplete_defs = true
# disallow_untyped_defs = true
# disallow_untyped_decorators = true
# exclude = "pdm/(pep582/|models/in_process/.+\\.py)"
plugins = ["mypy_django_plugin.main"]
[project.urls]
Homepage = "https://github.com/ArchiveBox/ArchiveBox"