
Merge branch 'dev' into issue1316

Nick Sweeting, 2024-05-06 23:14:16 -07:00 (committed by GitHub)
commit ef856e8051
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
50 changed files with 1469 additions and 1694 deletions


@@ -17,6 +17,11 @@ venv/
.venv-old/
.docker-venv/
node_modules/
chrome/
chromeprofile/
pdm.dev.lock
pdm.lock
docs/
build/

.github/FUNDING.yml (vendored, 5 lines changed)

@@ -1,3 +1,2 @@
github: pirate
patreon: theSquashSH
custom: ["https://hcb.hackclub.com/donations/start/archivebox", "https://paypal.me/NicholasSweeting"]
github: ["ArchiveBox", "pirate"]
custom: ["https://donate.archivebox.io", "https://paypal.me/NicholasSweeting"]


@@ -11,7 +11,7 @@ on:
env:
DOCKER_IMAGE: archivebox-ci
jobs:
buildx:
runs-on: ubuntu-latest
@@ -24,21 +24,21 @@ jobs:
- name: Set up QEMU
uses: docker/setup-qemu-action@v3
- name: Set up Docker Buildx
id: buildx
uses: docker/setup-buildx-action@v3
with:
version: latest
install: true
platforms: linux/amd64,linux/arm64,linux/arm/v7
platforms: linux/amd64,linux/arm64
- name: Builder instance name
run: echo ${{ steps.buildx.outputs.name }}
- name: Available platforms
run: echo ${{ steps.buildx.outputs.platforms }}
- name: Cache Docker layers
uses: actions/cache@v3
with:
@@ -51,21 +51,27 @@ jobs:
uses: docker/login-action@v3
if: github.event_name != 'pull_request'
with:
username: ${{ secrets.DOCKER_USERNAME }}
password: ${{ secrets.DOCKER_PASSWORD }}
username: ${{ secrets.DOCKER_USERNAME }}
password: ${{ secrets.DOCKER_PASSWORD }}
- name: Collect Docker tags
# https://github.com/docker/metadata-action
id: docker_meta
uses: docker/metadata-action@v5
with:
images: archivebox/archivebox,nikisweeting/archivebox
tags: |
# :stable
type=ref,event=branch
# :0.7.3
type=semver,pattern={{version}}
# :0.7
type=semver,pattern={{major}}.{{minor}}
# :sha-463ea54
type=sha
type=raw,value=latest,enable={{is_default_branch}}
# :latest
type=raw,value=latest,enable=${{ github.ref == format('refs/heads/{0}', 'stable') }}
- name: Build and push
id: docker_build
uses: docker/build-push-action@v5
@@ -77,7 +83,7 @@ jobs:
tags: ${{ steps.docker_meta.outputs.tags }}
cache-from: type=local,src=/tmp/.buildx-cache
cache-to: type=local,dest=/tmp/.buildx-cache-new
platforms: linux/amd64,linux/arm64,linux/arm/v7
platforms: linux/amd64,linux/arm64
- name: Image digest
run: echo ${{ steps.docker_build.outputs.digest }}
@@ -88,7 +94,7 @@
username: ${{ secrets.DOCKER_USERNAME }}
password: ${{ secrets.DOCKER_PASSWORD }}
repository: archivebox/archivebox
# This ugly bit is necessary if you don't want your cache to grow forever
# until it hits GitHub's limit of 5GB.
# Temp fix

.gitignore (vendored, 6 lines changed)

@@ -13,8 +13,9 @@ venv/
node_modules/
# Ignore dev lockfiles (should always be built fresh)
requirements-dev.txt
pdm.lock
pdm.dev.lock
requirements-dev.txt
# Packaging artifacts
.pdm-python
@@ -26,9 +27,6 @@ dist/
# Data folders
data/
data1/
data2/
data3/
data*/
output/

Dockerfile

@@ -37,7 +37,7 @@ LABEL name="archivebox" \
com.docker.extension.detailed-description='See here for detailed documentation: https://wiki.archivebox.io' \
com.docker.extension.changelog='See here for release notes: https://github.com/ArchiveBox/ArchiveBox/releases' \
com.docker.extension.categories='database,utility-tools'
ARG TARGETPLATFORM
ARG TARGETOS
ARG TARGETARCH
@@ -87,7 +87,9 @@ COPY --chown=root:root --chmod=755 package.json "$CODE_DIR/"
RUN grep '"version": ' "${CODE_DIR}/package.json" | awk -F'"' '{print $4}' > /VERSION.txt
# Force apt to leave downloaded binaries in /var/cache/apt (massively speeds up Docker builds)
RUN echo 'Binary::apt::APT::Keep-Downloaded-Packages "true";' > /etc/apt/apt.conf.d/keep-cache \
RUN echo 'Binary::apt::APT::Keep-Downloaded-Packages "1";' > /etc/apt/apt.conf.d/99keep-cache \
&& echo 'APT::Install-Recommends "0";' > /etc/apt/apt.conf.d/99no-install-recommends \
&& echo 'APT::Install-Suggests "0";' > /etc/apt/apt.conf.d/99no-install-suggests \
&& rm -f /etc/apt/apt.conf.d/docker-clean
# Print debug info about build and save it to disk, for human eyes only, not used by anything else
@@ -120,10 +122,10 @@ RUN echo "[*] Setting up $ARCHIVEBOX_USER user uid=${DEFAULT_PUID}..." \
# Install system apt dependencies (adding backports to access more recent apt updates)
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$TARGETVARIANT \
echo "[+] Installing APT base system dependencies for $TARGETPLATFORM..." \
&& echo 'deb https://deb.debian.org/debian bookworm-backports main contrib non-free' >> /etc/apt/sources.list.d/backports.list \
&& echo 'deb https://deb.debian.org/debian bookworm-backports main contrib non-free' > /etc/apt/sources.list.d/backports.list \
&& mkdir -p /etc/apt/keyrings \
&& apt-get update -qq \
&& apt-get install -qq -y -t bookworm-backports --no-install-recommends \
&& apt-get install -qq -y -t bookworm-backports \
# 1. packaging dependencies
apt-transport-https ca-certificates apt-utils gnupg2 curl wget \
# 2. docker and init system dependencies
@@ -134,27 +136,13 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$T
######### Language Environments ####################################
# Install Node environment
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$TARGETVARIANT --mount=type=cache,target=/root/.npm,sharing=locked,id=npm-$TARGETARCH$TARGETVARIANT \
echo "[+] Installing Node $NODE_VERSION environment in $NODE_MODULES..." \
&& echo "deb [signed-by=/etc/apt/keyrings/nodesource.gpg] https://deb.nodesource.com/node_${NODE_VERSION}.x nodistro main" >> /etc/apt/sources.list.d/nodejs.list \
&& curl -fsSL "https://deb.nodesource.com/gpgkey/nodesource-repo.gpg.key" | gpg --dearmor | gpg --dearmor -o /etc/apt/keyrings/nodesource.gpg \
&& apt-get update -qq \
&& apt-get install -qq -y -t bookworm-backports --no-install-recommends \
nodejs libatomic1 python3-minimal \
&& rm -rf /var/lib/apt/lists/* \
# Update NPM to latest version
&& npm i -g npm --cache /root/.npm \
# Save version info
&& ( \
which node && node --version \
&& which npm && npm --version \
&& echo -e '\n\n' \
) | tee -a /VERSION.txt
# Install Python environment
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$TARGETVARIANT --mount=type=cache,target=/root/.cache/pip,sharing=locked,id=pip-$TARGETARCH$TARGETVARIANT \
echo "[+] Setting up Python $PYTHON_VERSION runtime..." \
# && apt-get update -qq \
# && apt-get install -qq -y -t bookworm-backports --no-upgrade \
# python${PYTHON_VERSION} python${PYTHON_VERSION}-minimal python3-pip \
# && rm -rf /var/lib/apt/lists/* \
# tell PDM to allow using global system python site packages
# && rm /usr/lib/python3*/EXTERNALLY-MANAGED \
# create global virtual environment GLOBAL_VENV to use (better than using pip install --global)
@@ -171,13 +159,34 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$T
&& echo -e '\n\n' \
) | tee -a /VERSION.txt
# Install Node environment
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$TARGETVARIANT --mount=type=cache,target=/root/.npm,sharing=locked,id=npm-$TARGETARCH$TARGETVARIANT \
echo "[+] Installing Node $NODE_VERSION environment in $NODE_MODULES..." \
&& echo "deb [signed-by=/etc/apt/keyrings/nodesource.gpg] https://deb.nodesource.com/node_${NODE_VERSION}.x nodistro main" >> /etc/apt/sources.list.d/nodejs.list \
&& curl -fsSL "https://deb.nodesource.com/gpgkey/nodesource-repo.gpg.key" | gpg --dearmor -o /etc/apt/keyrings/nodesource.gpg \
&& apt-get update -qq \
&& apt-get install -qq -y -t bookworm-backports --no-upgrade libatomic1 \
&& apt-get install -y -t bookworm-backports --no-upgrade \
nodejs \
&& rm -rf /var/lib/apt/lists/* \
# Update NPM to latest version
&& npm i -g npm --cache /root/.npm \
# Save version info
&& ( \
which node && node --version \
&& which npm && npm --version \
&& echo -e '\n\n' \
) | tee -a /VERSION.txt
######### Extractor Dependencies ##################################
# Install apt dependencies
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$TARGETVARIANT --mount=type=cache,target=/root/.cache/pip,sharing=locked,id=pip-$TARGETARCH$TARGETVARIANT \
echo "[+] Installing APT extractor dependencies globally using apt..." \
&& apt-get update -qq \
&& apt-get install -qq -y -t bookworm-backports --no-install-recommends \
&& apt-get install -qq -y -t bookworm-backports \
curl wget git yt-dlp ffmpeg ripgrep \
# Packages we have also needed in the past:
# youtube-dl wget2 aria2 python3-pyxattr rtmpdump libfribidi-bin mpv \
@@ -196,25 +205,21 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$T
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$TARGETVARIANT --mount=type=cache,target=/root/.cache/pip,sharing=locked,id=pip-$TARGETARCH$TARGETVARIANT --mount=type=cache,target=/root/.cache/ms-playwright,sharing=locked,id=browsers-$TARGETARCH$TARGETVARIANT \
echo "[+] Installing Browser binary dependencies to $PLAYWRIGHT_BROWSERS_PATH..." \
&& apt-get update -qq \
&& apt-get install -qq -y -t bookworm-backports --no-install-recommends \
&& apt-get install -qq -y -t bookworm-backports \
fontconfig fonts-ipafont-gothic fonts-wqy-zenhei fonts-thai-tlwg fonts-khmeros fonts-kacst fonts-symbola fonts-noto fonts-freefont-ttf \
at-spi2-common fonts-liberation fonts-noto-color-emoji fonts-tlwg-loma-otf fonts-unifont libatk-bridge2.0-0 libatk1.0-0 libatspi2.0-0 libavahi-client3 \
libavahi-common-data libavahi-common3 libcups2 libfontenc1 libice6 libnspr4 libnss3 libsm6 libunwind8 \
libxaw7 libxcomposite1 libxdamage1 libxfont2 \
libxkbfile1 libxmu6 libxpm4 libxt6 x11-xkb-utils xfonts-encodings \
# xfonts-scalable xfonts-utils xserver-common xvfb \
# chrome can run without dbus/upower technically, it complains about missing dbus but should run ok anyway
# libxss1 dbus dbus-x11 upower \
# && service dbus start \
&& if [[ "$TARGETPLATFORM" == *amd64* || "$TARGETPLATFORM" == *arm64* ]]; then \
# install Chromium using playwright
pip install playwright \
&& cp -r /root/.cache/ms-playwright "$PLAYWRIGHT_BROWSERS_PATH" \
&& playwright install --with-deps chromium \
&& export CHROME_BINARY="$(python -c 'from playwright.sync_api import sync_playwright; print(sync_playwright().start().chromium.executable_path)')"; \
else \
# fall back to installing Chromium via apt-get on platforms not supported by playwright (e.g. risc, ARMv7, etc.)
# apt-get install -qq -y -t bookworm-backports --no-install-recommends \
# chromium \
# && export CHROME_BINARY="$(which chromium)"; \
echo 'armv7 no longer supported in versions after v0.7.3' \
&& exit 1; \
fi \
# install Chromium using playwright
&& pip install playwright \
&& cp -r /root/.cache/ms-playwright "$PLAYWRIGHT_BROWSERS_PATH" \
&& playwright install chromium \
&& export CHROME_BINARY="$(python -c 'from playwright.sync_api import sync_playwright; print(sync_playwright().start().chromium.executable_path)')" \
&& rm -rf /var/lib/apt/lists/* \
&& ln -s "$CHROME_BINARY" /usr/bin/chromium-browser \
&& mkdir -p "/home/${ARCHIVEBOX_USER}/.config/chromium/Crash Reports/pending/" \
@@ -247,8 +252,8 @@ COPY --chown=root:root --chmod=755 "./pyproject.toml" "requirements.txt" "$CODE_
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$TARGETVARIANT --mount=type=cache,target=/root/.cache/pip,sharing=locked,id=pip-$TARGETARCH$TARGETVARIANT \
echo "[+] Installing PIP ArchiveBox dependencies from requirements.txt for ${TARGETPLATFORM}..." \
&& apt-get update -qq \
&& apt-get install -qq -y -t bookworm-backports --no-install-recommends \
build-essential \
&& apt-get install -qq -y -t bookworm-backports \
# build-essential \
libssl-dev libldap2-dev libsasl2-dev \
python3-ldap python3-msgpack python3-mutagen python3-regex python3-pycryptodome procps \
# && ln -s "$GLOBAL_VENV" "$APP_VENV" \
@@ -258,8 +263,8 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$T
# && pdm export -o requirements.txt --without-hashes \
# && source $GLOBAL_VENV/bin/activate \
&& pip install -r requirements.txt \
&& apt-get purge -y \
build-essential \
# && apt-get purge -y \
# build-essential \
&& apt-get autoremove -y \
&& rm -rf /var/lib/apt/lists/*
@@ -269,7 +274,7 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$T
echo "[*] Installing PIP ArchiveBox package from $CODE_DIR..." \
# && apt-get update -qq \
# install C compiler to build deps on platforms that dont have 32-bit wheels available on pypi
# && apt-get install -qq -y -t bookworm-backports --no-install-recommends \
# && apt-get install -qq -y -t bookworm-backports \
# build-essential \
# INSTALL ARCHIVEBOX python package globally from CODE_DIR, with all optional dependencies
&& pip install -e "$CODE_DIR"[sonic,ldap] \

README.md

@@ -407,7 +407,7 @@ See <a href="#%EF%B8%8F-cli-usage">below</a> for usage examples using the CLI, W
> *Warning: These are contributed by external volunteers and may lag behind the official `pip` channel.*
<ul>
<li>TrueNAS: <a href="https://truecharts.org/charts/incubator/archivebox/">Official ArchiveBox TrueChart</a> / <a href="https://dev.to/finloop/setting-up-archivebox-on-truenas-scale-1788">Custom App Guide</a></li>
<li>TrueNAS: <a href="https://truecharts.org/charts/stable/archivebox/">Official ArchiveBox TrueChart</a> / <a href="https://dev.to/finloop/setting-up-archivebox-on-truenas-scale-1788">Custom App Guide</a></li>
<li><a href="https://unraid.net/community/apps?q=archivebox#r">UnRaid</a></li>
<li><a href="https://github.com/YunoHost-Apps/archivebox_ynh">Yunohost</a></li>
<li><a href="https://www.cloudron.io/store/io.archivebox.cloudronapp.html">Cloudron</a></li>
@@ -445,6 +445,9 @@ Other providers of paid ArchiveBox hosting (not officially endorsed):<br/>
<li><a href="https://fly.io/">
<img src="https://img.shields.io/badge/Unmanaged_App-Fly.io-%239a2de6.svg?style=flat" height="22px"/>
</a> (USD $10-50+/mo, <a href="https://fly.io/docs/hands-on/start/">instructions</a>)</li>
<li><a href="https://railway.app/template/2Vvhmy">
<img src="https://img.shields.io/badge/Unmanaged_App-Railway-%23A11BE6.svg?style=flat" height="22px"/>
</a> (USD $0-5+/mo)</li>
<li><a href="https://aws.amazon.com/marketplace/pp/Linnovate-Open-Source-Innovation-Support-For-Archi/B08RVW6MJ2"><img src="https://img.shields.io/badge/Unmanaged_VPS-AWS-%23ee8135.svg?style=flat" height="22px"/></a> (USD $60-200+/mo)</li>
<li><a href="https://azuremarketplace.microsoft.com/en-us/marketplace/apps/meanio.archivebox?ocid=gtmrewards_whatsnewblog_archivebox_vol118"><img src="https://img.shields.io/badge/Unmanaged_VPS-Azure-%237cb300.svg?style=flat" height="22px"/></a> (USD $60-200+/mo)</li>
<br/>

archivebox/__init__.py

@@ -1 +1,7 @@
__package__ = 'archivebox'
# monkey patch django timezone to add back utc (it was removed in Django 5.0)
import datetime
from django.utils import timezone
timezone.utc = datetime.timezone.utc
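For context, Django 5.0 removed the timezone.utc alias, so any dependency that still references django.utils.timezone.utc raises AttributeError until this shim runs. A minimal sketch of the call pattern it preserves (the caller below is hypothetical):

import datetime
from django.utils import timezone  # Django >= 5.0 no longer defines timezone.utc itself

created_at = datetime.datetime.now(timezone.utc)  # works again thanks to the monkey patch above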

archivebox/api/__init__.py

@@ -0,0 +1 @@
__package__ = 'archivebox.api'

archivebox/api/apps.py

@@ -1,3 +1,5 @@
__package__ = 'archivebox.api'
from django.apps import AppConfig

archivebox/api/archive.py (deleted)

@@ -1,184 +0,0 @@
# archivebox_api.py
from typing import List, Optional
from enum import Enum
from pydantic import BaseModel
from ninja import Router
from main import (
add,
remove,
update,
list_all,
ONLY_NEW,
) # Assuming these functions are defined in main.py
# Schemas
class StatusChoices(str, Enum):
indexed = 'indexed'
archived = 'archived'
unarchived = 'unarchived'
present = 'present'
valid = 'valid'
invalid = 'invalid'
duplicate = 'duplicate'
orphaned = 'orphaned'
corrupted = 'corrupted'
unrecognized = 'unrecognized'
class AddURLSchema(BaseModel):
urls: List[str]
tag: str = ""
depth: int = 0
update: bool = not ONLY_NEW # Default to the opposite of ONLY_NEW
update_all: bool = False
index_only: bool = False
overwrite: bool = False
init: bool = False
extractors: str = ""
parser: str = "auto"
class RemoveURLSchema(BaseModel):
yes: bool = False
delete: bool = False
before: Optional[float] = None
after: Optional[float] = None
filter_type: str = "exact"
filter_patterns: Optional[List[str]] = None
class UpdateSchema(BaseModel):
resume: Optional[float] = None
only_new: Optional[bool] = None
index_only: Optional[bool] = False
overwrite: Optional[bool] = False
before: Optional[float] = None
after: Optional[float] = None
status: Optional[StatusChoices] = None
filter_type: Optional[str] = 'exact'
filter_patterns: Optional[List[str]] = None
extractors: Optional[str] = ""
class ListAllSchema(BaseModel):
filter_patterns: Optional[List[str]] = None
filter_type: str = 'exact'
status: Optional[StatusChoices] = None
after: Optional[float] = None
before: Optional[float] = None
sort: Optional[str] = None
csv: Optional[str] = None
json: bool = False
html: bool = False
with_headers: bool = False
# API Router
router = Router()
@router.post("/add", response={200: dict})
def api_add(request, payload: AddURLSchema):
try:
result = add(
urls=payload.urls,
tag=payload.tag,
depth=payload.depth,
update=payload.update,
update_all=payload.update_all,
index_only=payload.index_only,
overwrite=payload.overwrite,
init=payload.init,
extractors=payload.extractors,
parser=payload.parser,
)
# Currently the add function returns a list of ALL items in the DB, ideally only return new items
return {
"status": "success",
"message": "URLs added successfully.",
"result": str(result),
}
except Exception as e:
# Handle exceptions raised by the add function or during processing
return {"status": "error", "message": str(e)}
@router.post("/remove", response={200: dict})
def api_remove(request, payload: RemoveURLSchema):
try:
result = remove(
yes=payload.yes,
delete=payload.delete,
before=payload.before,
after=payload.after,
filter_type=payload.filter_type,
filter_patterns=payload.filter_patterns,
)
return {
"status": "success",
"message": "URLs removed successfully.",
"result": result,
}
except Exception as e:
# Handle exceptions raised by the remove function or during processing
return {"status": "error", "message": str(e)}
@router.post("/update", response={200: dict})
def api_update(request, payload: UpdateSchema):
try:
result = update(
resume=payload.resume,
only_new=payload.only_new,
index_only=payload.index_only,
overwrite=payload.overwrite,
before=payload.before,
after=payload.after,
status=payload.status,
filter_type=payload.filter_type,
filter_patterns=payload.filter_patterns,
extractors=payload.extractors,
)
return {
"status": "success",
"message": "Archive updated successfully.",
"result": result,
}
except Exception as e:
# Handle exceptions raised by the update function or during processing
return {"status": "error", "message": str(e)}
@router.post("/list_all", response={200: dict})
def api_list_all(request, payload: ListAllSchema):
try:
result = list_all(
filter_patterns=payload.filter_patterns,
filter_type=payload.filter_type,
status=payload.status,
after=payload.after,
before=payload.before,
sort=payload.sort,
csv=payload.csv,
json=payload.json,
html=payload.html,
with_headers=payload.with_headers,
)
# TODO: This is kind of bad, make the format a choice field
if payload.json:
return {"status": "success", "format": "json", "data": result}
elif payload.html:
return {"status": "success", "format": "html", "data": result}
elif payload.csv:
return {"status": "success", "format": "csv", "data": result}
else:
return {
"status": "success",
"message": "List generated successfully.",
"data": result,
}
except Exception as e:
# Handle exceptions raised by the list_all function or during processing
return {"status": "error", "message": str(e)}

archivebox/api/auth.py

@@ -1,48 +1,107 @@
__package__ = 'archivebox.api'
from typing import Optional
from django.http import HttpRequest
from django.contrib.auth import login
from django.contrib.auth import authenticate
from ninja import Form, Router, Schema
from ninja.security import HttpBearer
from django.contrib.auth.models import AbstractBaseUser
from api.models import Token
router = Router()
from ninja.security import HttpBearer, APIKeyQuery, APIKeyHeader, HttpBasicAuth, django_auth_superuser
class GlobalAuth(HttpBearer):
def authenticate(self, request, token):
def auth_using_token(token, request: Optional[HttpRequest]=None) -> Optional[AbstractBaseUser]:
"""Given an API token string, check if a corresponding non-expired APIToken exists, and return its user"""
from api.models import APIToken # lazy import model to avoid loading it at urls.py import time
user = None
submitted_empty_form = token in ('string', '', None)
if submitted_empty_form:
user = request.user # see if user is authed via django session and use that as the default
else:
try:
return Token.objects.get(token=token).user
except Token.DoesNotExist:
token = APIToken.objects.get(token=token)
if token.is_valid():
user = token.user
except APIToken.DoesNotExist:
pass
if not user:
print('[❌] Failed to authenticate API user using API Key:', request)
class AuthSchema(Schema):
email: str
password: str
return None
@router.post("/authenticate", auth=None) # overriding global auth
def get_token(request, auth_data: AuthSchema):
user = authenticate(username=auth_data.email, password=auth_data.password)
if user:
# Assuming a user can have multiple tokens and you want to create a new one every time
new_token = Token.objects.create(user=user)
return {"token": new_token.token, "expires": new_token.expiry_as_iso8601}
def auth_using_password(username, password, request: Optional[HttpRequest]=None) -> Optional[AbstractBaseUser]:
"""Given a username and password, check if they are valid and return the corresponding user"""
user = None
submitted_empty_form = (username, password) in (('string', 'string'), ('', ''), (None, None))
if submitted_empty_form:
user = request.user # see if user is authed via django session and use that as the default
else:
return {"error": "Invalid credentials"}
user = authenticate(
username=username,
password=password,
)
if not user:
print('[❌] Failed to authenticate API user using username & password:', request)
return user
class TokenValidationSchema(Schema):
token: str
### Base Auth Types
class APITokenAuthCheck:
"""The base class for authentication methods that use an api.models.APIToken"""
def authenticate(self, request: HttpRequest, key: Optional[str]=None) -> Optional[AbstractBaseUser]:
user = auth_using_token(
token=key,
request=request,
)
if user is not None:
login(request, user, backend='django.contrib.auth.backends.ModelBackend')
return user
class UserPassAuthCheck:
"""The base class for authentication methods that use a username & password"""
def authenticate(self, request: HttpRequest, username: Optional[str]=None, password: Optional[str]=None) -> Optional[AbstractBaseUser]:
user = auth_using_password(
username=username,
password=password,
request=request,
)
if user is not None:
login(request, user, backend='django.contrib.auth.backends.ModelBackend')
return user
@router.post("/validate_token", auth=None) # No authentication required for this endpoint
def validate_token(request, token_data: TokenValidationSchema):
try:
# Attempt to authenticate using the provided token
user = GlobalAuth().authenticate(request, token_data.token)
if user:
return {"status": "valid"}
else:
return {"status": "invalid"}
except Token.DoesNotExist:
return {"status": "invalid"}
### Django-Ninja-Provided Auth Methods
class UsernameAndPasswordAuth(UserPassAuthCheck, HttpBasicAuth):
"""Allow authenticating by passing username & password via HTTP Basic Authentication (not recommended)"""
pass
class QueryParamTokenAuth(APITokenAuthCheck, APIKeyQuery):
"""Allow authenticating by passing api_key=xyz as a GET/POST query parameter"""
param_name = "api_key"
class HeaderTokenAuth(APITokenAuthCheck, APIKeyHeader):
"""Allow authenticating by passing X-API-Key=xyz as a request header"""
param_name = "X-API-Key"
class BearerTokenAuth(APITokenAuthCheck, HttpBearer):
"""Allow authenticating by passing Bearer=xyz as a request header"""
pass
### Enabled Auth Methods
API_AUTH_METHODS = [
QueryParamTokenAuth(),
HeaderTokenAuth(),
BearerTokenAuth(),
django_auth_superuser,
UsernameAndPasswordAuth(),
]
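With all four methods enabled, the same endpoint accepts a key however the client prefers to send it. A sketch of each style using python-requests (the localhost URL and token value are placeholders; the /api/v1/core/snapshots path comes from the v1 routers defined below):

import requests

token = '1234567890abcdef1234567890abcdef'  # hypothetical APIToken.token value

# 1. as an api_key=xyz query parameter (QueryParamTokenAuth)
requests.get('http://localhost:8000/api/v1/core/snapshots', params={'api_key': token})

# 2. as an X-API-Key request header (HeaderTokenAuth)
requests.get('http://localhost:8000/api/v1/core/snapshots', headers={'X-API-Key': token})

# 3. as a Bearer token (BearerTokenAuth)
requests.get('http://localhost:8000/api/v1/core/snapshots', headers={'Authorization': f'Bearer {token}'})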

archivebox/api/migrations/0001_initial.py

@@ -1,9 +1,10 @@
# Generated by Django 3.1.14 on 2024-04-09 18:52
# Generated by Django 4.2.11 on 2024-04-25 04:19
import api.models
from django.conf import settings
from django.db import migrations, models
import django.db.models.deletion
import uuid
class Migration(migrations.Migration):
@@ -16,13 +17,13 @@ class Migration(migrations.Migration):
operations = [
migrations.CreateModel(
name='Token',
name='APIToken',
fields=[
('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
('token', models.CharField(default=auth.models.hex_uuid, max_length=32, unique=True)),
('id', models.UUIDField(default=uuid.uuid4, editable=False, primary_key=True, serialize=False)),
('token', models.CharField(default=api.models.generate_secret_token, max_length=32, unique=True)),
('created', models.DateTimeField(auto_now_add=True)),
('expiry', models.DateTimeField(blank=True, null=True)),
('user', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='tokens', to=settings.AUTH_USER_MODEL)),
('expires', models.DateTimeField(blank=True, null=True)),
('user', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL)),
],
),
]


@@ -0,0 +1,17 @@
# Generated by Django 5.0.4 on 2024-04-26 05:28
from django.db import migrations
class Migration(migrations.Migration):
dependencies = [
('api', '0001_initial'),
]
operations = [
migrations.AlterModelOptions(
name='apitoken',
options={'verbose_name': 'API Key', 'verbose_name_plural': 'API Keys'},
),
]

archivebox/api/models.py

@@ -1,30 +1,63 @@
__package__ = 'archivebox.api'
import uuid
import secrets
from datetime import timedelta
from django.conf import settings
from django.db import models
from django.utils import timezone
from django.utils.translation import gettext_lazy as _
def hex_uuid():
return uuid.uuid4().hex
from django_stubs_ext.db.models import TypedModelMeta
class Token(models.Model):
user = models.ForeignKey(
settings.AUTH_USER_MODEL, on_delete=models.CASCADE, related_name="tokens"
)
token = models.CharField(max_length=32, default=hex_uuid, unique=True)
def generate_secret_token() -> str:
# returns cryptographically secure string with len() == 32
return secrets.token_hex(16)
class APIToken(models.Model):
id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False)
user = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE)
token = models.CharField(max_length=32, default=generate_secret_token, unique=True)
created = models.DateTimeField(auto_now_add=True)
expiry = models.DateTimeField(null=True, blank=True)
expires = models.DateTimeField(null=True, blank=True)
class Meta(TypedModelMeta):
verbose_name = "API Key"
verbose_name_plural = "API Keys"
def __str__(self) -> str:
return self.token
def __repr__(self) -> str:
return f'<APIToken user={self.user.username} token=************{self.token[-4:]}>'
def __json__(self) -> dict:
return {
"TYPE": "APIToken",
"id": str(self.id),
"user_id": str(self.user.id),
"user_username": self.user.username,
"token": self.token,
"created": self.created.isoformat(),
"expires": self.expires_as_iso8601,
}
@property
def expiry_as_iso8601(self):
def expires_as_iso8601(self):
"""Returns the expiry date of the token in ISO 8601 format or a date 100 years in the future if none."""
expiry_date = (
self.expiry if self.expiry else timezone.now() + timedelta(days=365 * 100)
)
expiry_date = self.expires or (timezone.now() + timedelta(days=365 * 100))
return expiry_date.isoformat()
def __str__(self):
return self.token
def is_valid(self, for_date=None):
for_date = for_date or timezone.now()
if self.expires and self.expires < for_date:
return False
return True
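A quick sketch of how the new model behaves, e.g. inside a Django shell (the user lookup is illustrative):

from datetime import timedelta
from django.contrib.auth import get_user_model
from django.utils import timezone
from api.models import APIToken

user = get_user_model().objects.first()
token = APIToken.objects.create(user=user)          # token string auto-generated: 32 hex chars
assert token.is_valid()                             # True: no expiry set, treated as non-expiring
token.expires = timezone.now() - timedelta(days=1)
assert not token.is_valid()                         # False: the expiry date has passed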

archivebox/api/tests.py

@@ -1,27 +1,30 @@
__package__ = 'archivebox.api'
from django.test import TestCase
from ninja.testing import TestClient
from archivebox.api.archive import router as archive_router
class ArchiveBoxAPITestCase(TestCase):
from .routes_cli import router
class ArchiveBoxCLIAPITestCase(TestCase):
def setUp(self):
self.client = TestClient(archive_router)
self.client = TestClient(router)
def test_add_endpoint(self):
response = self.client.post("/add", json={"urls": ["http://example.com"], "tag": "test"})
response = self.client.post("/add", json={"urls": ["http://example.com"], "tag": "testTag1,testTag2"})
self.assertEqual(response.status_code, 200)
self.assertEqual(response.json()["status"], "success")
self.assertTrue(response.json()["success"])
def test_remove_endpoint(self):
response = self.client.post("/remove", json={"filter_patterns": ["http://example.com"]})
self.assertEqual(response.status_code, 200)
self.assertEqual(response.json()["status"], "success")
self.assertTrue(response.json()["success"])
def test_update_endpoint(self):
response = self.client.post("/update", json={})
self.assertEqual(response.status_code, 200)
self.assertEqual(response.json()["status"], "success")
self.assertTrue(response.json()["success"])
def test_list_all_endpoint(self):
response = self.client.post("/list_all", json={})
self.assertEqual(response.status_code, 200)
self.assertTrue("success" in response.json()["status"])
self.assertTrue(response.json()["success"])

archivebox/api/urls.py (new file, 17 lines)

@@ -0,0 +1,17 @@
__package__ = 'archivebox.api'
from django.urls import path
from django.views.generic.base import RedirectView
from .v1_api import urls as v1_api_urls
urlpatterns = [
path("", RedirectView.as_view(url='/api/v1')),
path("v1/", v1_api_urls),
path("v1", RedirectView.as_view(url='/api/v1/docs')),
# ... v2 can be added here ...
# path("v2/", v2_api_urls),
# path("v2", RedirectView.as_view(url='/api/v2/docs')),
]

archivebox/api/v1_api.py (new file, 111 lines)

@@ -0,0 +1,111 @@
__package__ = 'archivebox.api'
from io import StringIO
from traceback import format_exception
from contextlib import redirect_stdout, redirect_stderr
from django.http import HttpRequest, HttpResponse
from django.core.exceptions import ObjectDoesNotExist, EmptyResultSet, PermissionDenied
from ninja import NinjaAPI, Swagger
# TODO: explore adding https://eadwincode.github.io/django-ninja-extra/
from api.auth import API_AUTH_METHODS
from ..config import VERSION, COMMIT_HASH
COMMIT_HASH = COMMIT_HASH or 'unknown'
html_description=f'''
<h3>Welcome to your ArchiveBox server's REST API <code>[v1 ALPHA]</code> homepage!</h3>
<br/>
<i><b>WARNING: This API is still in an early development stage and may change!</b></i>
<br/>
<ul>
<li> Manage your server: <a href="/admin/api/"><b>Setup API Keys</b></a>, <a href="/admin/">Go to your Server Admin UI</a>, <a href="/">Go to your Snapshots list</a>
<li>💬 Ask questions and get help here: <a href="https://zulip.archivebox.io">ArchiveBox Chat Forum</a></li>
<li>🐞 Report API bugs here: <a href="https://github.com/ArchiveBox/ArchiveBox/issues">Github Issues</a></li>
<li>📚 ArchiveBox Documentation: <a href="https://github.com/ArchiveBox/ArchiveBox/wiki">Github Wiki</a></li>
<li>📜 See the API source code: <a href="https://github.com/ArchiveBox/ArchiveBox/blob/dev/archivebox/api"><code>archivebox/api/</code></a></li>
</ul>
<small>Served by ArchiveBox v{VERSION} (<a href="https://github.com/ArchiveBox/ArchiveBox/commit/{COMMIT_HASH}"><code>{COMMIT_HASH[:8]}</code></a>), API powered by <a href="https://django-ninja.dev/"><code>django-ninja</code></a>.</small>
'''
def register_urls(api: NinjaAPI) -> NinjaAPI:
api.add_router('/auth/', 'api.v1_auth.router')
api.add_router('/core/', 'api.v1_core.router')
api.add_router('/cli/', 'api.v1_cli.router')
return api
class NinjaAPIWithIOCapture(NinjaAPI):
def create_temporal_response(self, request: HttpRequest) -> HttpResponse:
stdout, stderr = StringIO(), StringIO()
with redirect_stderr(stderr):
with redirect_stdout(stdout):
request.stdout = stdout
request.stderr = stderr
response = super().create_temporal_response(request)
print('RESPONDING NOW', response)
return response
api = NinjaAPIWithIOCapture(
title='ArchiveBox API',
description=html_description,
version='1.0.0',
csrf=False,
auth=API_AUTH_METHODS,
urls_namespace="api",
docs=Swagger(settings={"persistAuthorization": True}),
# docs_decorator=login_required,
# renderer=ORJSONRenderer(),
)
api = register_urls(api)
urls = api.urls
@api.exception_handler(Exception)
def generic_exception_handler(request, err):
status = 503
if isinstance(err, (ObjectDoesNotExist, EmptyResultSet, PermissionDenied)):
status = 404
print(''.join(format_exception(err)))
return api.create_response(
request,
{
"succeeded": False,
"message": f'{err.__class__.__name__}: {err}',
"errors": [
''.join(format_exception(err)),
# or send simpler parent-only traceback:
# *([str(err.__context__)] if getattr(err, '__context__', None) else []),
],
},
status=status,
)
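Uncaught exceptions thus come back to the client as one predictable JSON envelope; a representative payload (field values are illustrative):

# HTTP 404 for ObjectDoesNotExist/EmptyResultSet/PermissionDenied, HTTP 503 for anything else
{
    "succeeded": False,
    "message": "ObjectDoesNotExist: Snapshot matching query does not exist.",
    "errors": [
        "<the full format_exception() traceback, joined into one string>",
    ],
}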
# import orjson
# from ninja.renderers import BaseRenderer
# class ORJSONRenderer(BaseRenderer):
# media_type = "application/json"
# def render(self, request, data, *, response_status):
# return {
# "success": True,
# "errors": [],
# "result": data,
# "stdout": ansi_to_html(stdout.getvalue().strip()),
# "stderr": ansi_to_html(stderr.getvalue().strip()),
# }
# return orjson.dumps(data)

archivebox/api/v1_auth.py (new file, 52 lines)

@@ -0,0 +1,52 @@
__package__ = 'archivebox.api'
from typing import Optional
from ninja import Router, Schema
from api.models import APIToken
from api.auth import auth_using_token, auth_using_password
router = Router(tags=['Authentication'])
class PasswordAuthSchema(Schema):
"""Schema for a /get_api_token request"""
username: Optional[str] = None
password: Optional[str] = None
@router.post("/get_api_token", auth=None, summary='Generate an API token for a given username & password (or currently logged-in user)') # auth=None because they are not authed yet
def get_api_token(request, auth_data: PasswordAuthSchema):
user = auth_using_password(
username=auth_data.username,
password=auth_data.password,
request=request,
)
if user:
# TODO: support multiple tokens in the future, for now we just have one per user
api_token, created = APIToken.objects.get_or_create(user=user)
return api_token.__json__()
return {"success": False, "errors": ["Invalid credentials"]}
class TokenAuthSchema(Schema):
"""Schema for a /check_api_token request"""
token: str
@router.post("/check_api_token", auth=None, summary='Validate an API token to make sure its valid and non-expired') # auth=None because they are not authed yet
def check_api_token(request, token_data: TokenAuthSchema):
user = auth_using_token(
token=token_data.token,
request=request,
)
if user:
return {"success": True, "user_id": str(user.id)}
return {"success": False, "user_id": None}

archivebox/api/v1_cli.py (new file, 234 lines)

@@ -0,0 +1,234 @@
__package__ = 'archivebox.api'
from typing import List, Dict, Any, Optional
from enum import Enum
from ninja import Router, Schema
from ..main import (
add,
remove,
update,
list_all,
schedule,
)
from ..util import ansi_to_html
from ..config import ONLY_NEW
# router for API that exposes archivebox cli subcommands as REST endpoints
router = Router(tags=['ArchiveBox CLI Sub-Commands'])
# Schemas
JSONType = List[Any] | Dict[str, Any] | bool | int | str | None
class CLICommandResponseSchema(Schema):
success: bool
errors: List[str]
result: JSONType
stdout: str
stderr: str
class FilterTypeChoices(str, Enum):
exact = 'exact'
substring = 'substring'
regex = 'regex'
domain = 'domain'
tag = 'tag'
timestamp = 'timestamp'
class StatusChoices(str, Enum):
indexed = 'indexed'
archived = 'archived'
unarchived = 'unarchived'
present = 'present'
valid = 'valid'
invalid = 'invalid'
duplicate = 'duplicate'
orphaned = 'orphaned'
corrupted = 'corrupted'
unrecognized = 'unrecognized'
class AddCommandSchema(Schema):
urls: List[str]
tag: str = ""
depth: int = 0
update: bool = not ONLY_NEW # Default to the opposite of ONLY_NEW
update_all: bool = False
index_only: bool = False
overwrite: bool = False
init: bool = False
extractors: str = ""
parser: str = "auto"
class UpdateCommandSchema(Schema):
resume: Optional[float] = 0
only_new: bool = ONLY_NEW
index_only: bool = False
overwrite: bool = False
after: Optional[float] = 0
before: Optional[float] = 999999999999999
status: Optional[StatusChoices] = StatusChoices.unarchived
filter_type: Optional[str] = FilterTypeChoices.substring
filter_patterns: Optional[List[str]] = ['https://example.com']
extractors: Optional[str] = ""
class ScheduleCommandSchema(Schema):
import_path: Optional[str] = None
add: bool = False
show: bool = False
every: Optional[str] = None
tag: str = ''
depth: int = 0
overwrite: bool = False
update: bool = not ONLY_NEW
clear: bool = False
class ListCommandSchema(Schema):
filter_patterns: Optional[List[str]] = ['https://example.com']
filter_type: str = FilterTypeChoices.substring
status: Optional[StatusChoices] = StatusChoices.indexed
after: Optional[float] = 0
before: Optional[float] = 999999999999999
sort: str = 'added'
as_json: bool = True
as_html: bool = False
as_csv: str | bool = 'timestamp,url'
with_headers: bool = False
class RemoveCommandSchema(Schema):
delete: bool = True
after: Optional[float] = 0
before: Optional[float] = 999999999999999
filter_type: str = FilterTypeChoices.exact
filter_patterns: Optional[List[str]] = ['https://example.com']
@router.post("/add", response=CLICommandResponseSchema, summary='archivebox add [args] [urls]')
def cli_add(request, args: AddCommandSchema):
result = add(
urls=args.urls,
tag=args.tag,
depth=args.depth,
update=args.update,
update_all=args.update_all,
index_only=args.index_only,
overwrite=args.overwrite,
init=args.init,
extractors=args.extractors,
parser=args.parser,
)
return {
"success": True,
"errors": [],
"result": result,
"stdout": ansi_to_html(request.stdout.getvalue().strip()),
"stderr": ansi_to_html(request.stderr.getvalue().strip()),
}
@router.post("/update", response=CLICommandResponseSchema, summary='archivebox update [args] [filter_patterns]')
def cli_update(request, args: UpdateCommandSchema):
result = update(
resume=args.resume,
only_new=args.only_new,
index_only=args.index_only,
overwrite=args.overwrite,
before=args.before,
after=args.after,
status=args.status,
filter_type=args.filter_type,
filter_patterns=args.filter_patterns,
extractors=args.extractors,
)
return {
"success": True,
"errors": [],
"result": result,
"stdout": ansi_to_html(request.stdout.getvalue().strip()),
"stderr": ansi_to_html(request.stderr.getvalue().strip()),
}
@router.post("/schedule", response=CLICommandResponseSchema, summary='archivebox schedule [args] [import_path]')
def cli_schedule(request, args: ScheduleCommandSchema):
result = schedule(
import_path=args.import_path,
add=args.add,
show=args.show,
clear=args.clear,
every=args.every,
tag=args.tag,
depth=args.depth,
overwrite=args.overwrite,
update=args.update,
)
return {
"success": True,
"errors": [],
"result": result,
"stdout": ansi_to_html(request.stdout.getvalue().strip()),
"stderr": ansi_to_html(request.stderr.getvalue().strip()),
}
@router.post("/list", response=CLICommandResponseSchema, summary='archivebox list [args] [filter_patterns]')
def cli_list(request, args: ListCommandSchema):
result = list_all(
filter_patterns=args.filter_patterns,
filter_type=args.filter_type,
status=args.status,
after=args.after,
before=args.before,
sort=args.sort,
csv=args.as_csv,
json=args.as_json,
html=args.as_html,
with_headers=args.with_headers,
)
result_format = 'txt'
if args.as_json:
result_format = "json"
elif args.as_html:
result_format = "html"
elif args.as_csv:
result_format = "csv"
return {
"success": True,
"errors": [],
"result": result,
"result_format": result_format,
"stdout": ansi_to_html(request.stdout.getvalue().strip()),
"stderr": ansi_to_html(request.stderr.getvalue().strip()),
}
@router.post("/remove", response=CLICommandResponseSchema, summary='archivebox remove [args] [filter_patterns]')
def cli_remove(request, args: RemoveCommandSchema):
result = remove(
yes=True, # no way to interactively ask for confirmation via API, so we force yes
delete=args.delete,
before=args.before,
after=args.after,
filter_type=args.filter_type,
filter_patterns=args.filter_patterns,
)
return {
"success": True,
"errors": [],
"result": result,
"stdout": ansi_to_html(request.stdout.getvalue().strip()),
"stderr": ansi_to_html(request.stderr.getvalue().strip()),
}
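Each endpoint mirrors its CLI counterpart, so driving the server remotely looks like this sketch (URL and token are placeholders; any of the auth methods from api/auth.py work):

import requests

resp = requests.post(
    'http://localhost:8000/api/v1/cli/add',
    headers={'X-API-Key': '1234567890abcdef1234567890abcdef'},  # hypothetical token
    json={'urls': ['https://example.com'], 'tag': 'docs', 'depth': 0},
)
result = resp.json()
print(result['success'])  # True on success
print(result['stdout'])   # the CLI's ANSI output, converted to HTML by ansi_to_html()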

archivebox/api/v1_core.py (new file, 210 lines)

@@ -0,0 +1,210 @@
__package__ = 'archivebox.api'
from uuid import UUID
from typing import List, Optional
from datetime import datetime
from django.shortcuts import get_object_or_404
from ninja import Router, Schema, FilterSchema, Field, Query
from ninja.pagination import paginate
from core.models import Snapshot, ArchiveResult, Tag
router = Router(tags=['Core Models'])
### ArchiveResult #########################################################################
class ArchiveResultSchema(Schema):
id: UUID
snapshot_id: UUID
snapshot_url: str
snapshot_tags: str
extractor: str
cmd: List[str]
pwd: str
cmd_version: str
output: str
status: str
created: datetime
@staticmethod
def resolve_id(obj):
return obj.uuid
@staticmethod
def resolve_created(obj):
return obj.start_ts
@staticmethod
def resolve_snapshot_url(obj):
return obj.snapshot.url
@staticmethod
def resolve_snapshot_tags(obj):
return obj.snapshot.tags_str()
class ArchiveResultFilterSchema(FilterSchema):
id: Optional[UUID] = Field(None, q='uuid')
search: Optional[str] = Field(None, q=['snapshot__url__icontains', 'snapshot__title__icontains', 'snapshot__tags__name__icontains', 'extractor', 'output__icontains'])
snapshot_id: Optional[UUID] = Field(None, q='snapshot_id')
snapshot_url: Optional[str] = Field(None, q='snapshot__url')
snapshot_tag: Optional[str] = Field(None, q='snapshot__tags__name')
status: Optional[str] = Field(None, q='status')
output: Optional[str] = Field(None, q='output__icontains')
extractor: Optional[str] = Field(None, q='extractor__icontains')
cmd: Optional[str] = Field(None, q='cmd__0__icontains')
pwd: Optional[str] = Field(None, q='pwd__icontains')
cmd_version: Optional[str] = Field(None, q='cmd_version')
created: Optional[datetime] = Field(None, q='updated')
created__gte: Optional[datetime] = Field(None, q='updated__gte')
created__lt: Optional[datetime] = Field(None, q='updated__lt')
@router.get("/archiveresults", response=List[ArchiveResultSchema])
@paginate
def list_archiveresults(request, filters: ArchiveResultFilterSchema = Query(...)):
qs = ArchiveResult.objects.all()
results = filters.filter(qs)
return results
@router.get("/archiveresult/{archiveresult_id}", response=ArchiveResultSchema)
def get_archiveresult(request, archiveresult_id: str):
archiveresult = get_object_or_404(ArchiveResult, id=archiveresult_id)
return archiveresult
# @router.post("/archiveresult", response=ArchiveResultSchema)
# def create_archiveresult(request, payload: ArchiveResultSchema):
# archiveresult = ArchiveResult.objects.create(**payload.dict())
# return archiveresult
#
# @router.put("/archiveresult/{archiveresult_id}", response=ArchiveResultSchema)
# def update_archiveresult(request, archiveresult_id: str, payload: ArchiveResultSchema):
# archiveresult = get_object_or_404(ArchiveResult, id=archiveresult_id)
#
# for attr, value in payload.dict().items():
# setattr(archiveresult, attr, value)
# archiveresult.save()
#
# return archiveresult
#
# @router.delete("/archiveresult/{archiveresult_id}")
# def delete_archiveresult(request, archiveresult_id: str):
# archiveresult = get_object_or_404(ArchiveResult, id=archiveresult_id)
# archiveresult.delete()
# return {"success": True}
### Snapshot #########################################################################
class SnapshotSchema(Schema):
id: UUID
url: str
tags: str
title: Optional[str]
timestamp: str
bookmarked: datetime
added: datetime
updated: datetime
archive_path: str
archiveresults: List[ArchiveResultSchema]
# @staticmethod
# def resolve_id(obj):
# return str(obj.id)
@staticmethod
def resolve_tags(obj):
return obj.tags_str()
@staticmethod
def resolve_archiveresults(obj, context):
if context['request'].with_archiveresults:
return obj.archiveresult_set.all().distinct()
return ArchiveResult.objects.none()
class SnapshotFilterSchema(FilterSchema):
id: Optional[UUID] = Field(None, q='id')
search: Optional[str] = Field(None, q=['url__icontains', 'title__icontains', 'tags__name__icontains'])
url: Optional[str] = Field(None, q='url')
tag: Optional[str] = Field(None, q='tags__name')
title: Optional[str] = Field(None, q='title__icontains')
timestamp: Optional[str] = Field(None, q='timestamp__startswith')
added: Optional[datetime] = Field(None, q='added')
added__gte: Optional[datetime] = Field(None, q='added__gte')
added__lt: Optional[datetime] = Field(None, q='added__lt')
@router.get("/snapshots", response=List[SnapshotSchema])
@paginate
def list_snapshots(request, filters: SnapshotFilterSchema = Query(...), with_archiveresults: bool=True):
request.with_archiveresults = with_archiveresults
qs = Snapshot.objects.all()
results = filters.filter(qs)
return results
@router.get("/snapshot/{snapshot_id}", response=SnapshotSchema)
def get_snapshot(request, snapshot_id: str, with_archiveresults: bool=True):
request.with_archiveresults = with_archiveresults
snapshot = get_object_or_404(Snapshot, id=snapshot_id)
return snapshot
# @router.post("/snapshot", response=SnapshotSchema)
# def create_snapshot(request, payload: SnapshotSchema):
# snapshot = Snapshot.objects.create(**payload.dict())
# return snapshot
#
# @router.put("/snapshot/{snapshot_id}", response=SnapshotSchema)
# def update_snapshot(request, snapshot_id: str, payload: SnapshotSchema):
# snapshot = get_object_or_404(Snapshot, id=snapshot_id)
#
# for attr, value in payload.dict().items():
# setattr(snapshot, attr, value)
# snapshot.save()
#
# return snapshot
#
# @router.delete("/snapshot/{snapshot_id}")
# def delete_snapshot(request, snapshot_id: str):
# snapshot = get_object_or_404(Snapshot, id=snapshot_id)
# snapshot.delete()
# return {"success": True}
### Tag #########################################################################
class TagSchema(Schema):
name: str
slug: str
@router.get("/tags", response=List[TagSchema])
def list_tags(request):
return Tag.objects.all()
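Combined with FilterSchema and @paginate, the list endpoints accept any subset of the filter fields as query parameters; a sketch of a filtered listing (URL and token are placeholders):

import requests

resp = requests.get(
    'http://localhost:8000/api/v1/core/snapshots',
    params={'search': 'example.com', 'with_archiveresults': 'false'},
    headers={'X-API-Key': '1234567890abcdef1234567890abcdef'},  # hypothetical token
)
page = resp.json()  # django-ninja's default paginator returns {"items": [per-snapshot dicts], "count": total}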

archivebox/config.py

@@ -112,7 +112,7 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
'LDAP_FIRSTNAME_ATTR': {'type': str, 'default': None},
'LDAP_LASTNAME_ATTR': {'type': str, 'default': None},
'LDAP_EMAIL_ATTR': {'type': str, 'default': None},
'LDAP_CREATE_SUPERUSER': {'type': bool, 'default': False},
'LDAP_CREATE_SUPERUSER': {'type': bool, 'default': False},
},
'ARCHIVE_METHOD_TOGGLES': {
@@ -265,7 +265,7 @@ CONFIG_ALIASES = {
for key, default in section.items()
for alias in default.get('aliases', ())
}
USER_CONFIG = {key for section in CONFIG_SCHEMA.values() for key in section.keys()}
USER_CONFIG = {key: section[key] for section in CONFIG_SCHEMA.values() for key in section.keys()}
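# i.e. USER_CONFIG is no longer a flat set of key names but a key -> schema mapping,
# so callers can look up a key's type and default directly, e.g. (sketch):
#   USER_CONFIG['LDAP_CREATE_SUPERUSER']   # -> {'type': bool, 'default': False}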
def get_real_name(key: str) -> str:
"""get the current canonical name for a given deprecated config key"""
@@ -282,6 +282,7 @@ ARCHIVE_DIR_NAME = 'archive'
SOURCES_DIR_NAME = 'sources'
LOGS_DIR_NAME = 'logs'
PERSONAS_DIR_NAME = 'personas'
CRONTABS_DIR_NAME = 'crontabs'
SQL_INDEX_FILENAME = 'index.sqlite3'
JSON_INDEX_FILENAME = 'index.json'
HTML_INDEX_FILENAME = 'index.html'
@@ -355,7 +356,7 @@ ALLOWED_IN_OUTPUT_DIR = {
'static',
'sonic',
'search.sqlite3',
'crontabs',
CRONTABS_DIR_NAME,
ARCHIVE_DIR_NAME,
SOURCES_DIR_NAME,
LOGS_DIR_NAME,
@@ -598,7 +599,6 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
'DEPENDENCIES': {'default': lambda c: get_dependency_info(c)},
'CODE_LOCATIONS': {'default': lambda c: get_code_locations(c)},
'EXTERNAL_LOCATIONS': {'default': lambda c: get_external_locations(c)},
'DATA_LOCATIONS': {'default': lambda c: get_data_locations(c)},
'CHROME_OPTIONS': {'default': lambda c: get_chrome_info(c)},
'CHROME_EXTRA_ARGS': {'default': lambda c: c['CHROME_EXTRA_ARGS'] or []},
@@ -985,11 +985,6 @@ def get_code_locations(config: ConfigDict) -> SimpleConfigValueDict:
'enabled': True,
'is_valid': (config['TEMPLATES_DIR'] / 'static').exists(),
},
'CUSTOM_TEMPLATES_DIR': {
'path': config['CUSTOM_TEMPLATES_DIR'] and Path(config['CUSTOM_TEMPLATES_DIR']).resolve(),
'enabled': bool(config['CUSTOM_TEMPLATES_DIR']),
'is_valid': config['CUSTOM_TEMPLATES_DIR'] and Path(config['CUSTOM_TEMPLATES_DIR']).exists(),
},
# 'NODE_MODULES_DIR': {
# 'path': ,
# 'enabled': ,
@@ -997,50 +992,25 @@ def get_code_locations(config: ConfigDict) -> SimpleConfigValueDict:
# },
}
def get_external_locations(config: ConfigDict) -> ConfigValue:
abspath = lambda path: None if path is None else Path(path).resolve()
return {
'CHROME_USER_DATA_DIR': {
'path': abspath(config['CHROME_USER_DATA_DIR']),
'enabled': config['USE_CHROME'] and config['CHROME_USER_DATA_DIR'],
'is_valid': False if config['CHROME_USER_DATA_DIR'] is None else (Path(config['CHROME_USER_DATA_DIR']) / 'Default').exists(),
},
'COOKIES_FILE': {
'path': abspath(config['COOKIES_FILE']),
'enabled': config['USE_WGET'] and config['COOKIES_FILE'],
'is_valid': False if config['COOKIES_FILE'] is None else Path(config['COOKIES_FILE']).exists(),
},
}
def get_data_locations(config: ConfigDict) -> ConfigValue:
return {
# OLD: migrating to personas
# 'CHROME_USER_DATA_DIR': {
# 'path': os.path.abspath(config['CHROME_USER_DATA_DIR']),
# 'enabled': config['USE_CHROME'] and config['CHROME_USER_DATA_DIR'],
# 'is_valid': False if config['CHROME_USER_DATA_DIR'] is None else (Path(config['CHROME_USER_DATA_DIR']) / 'Default').exists(),
# },
# 'COOKIES_FILE': {
# 'path': os.path.abspath(config['COOKIES_FILE']),
# 'enabled': config['USE_WGET'] and config['COOKIES_FILE'],
# 'is_valid': False if config['COOKIES_FILE'] is None else Path(config['COOKIES_FILE']).exists(),
# },
'OUTPUT_DIR': {
'path': config['OUTPUT_DIR'].resolve(),
'enabled': True,
'is_valid': (config['OUTPUT_DIR'] / SQL_INDEX_FILENAME).exists(),
'is_mount': os.path.ismount(config['OUTPUT_DIR'].resolve()),
},
'SOURCES_DIR': {
'path': config['SOURCES_DIR'].resolve(),
'enabled': True,
'is_valid': config['SOURCES_DIR'].exists(),
},
'LOGS_DIR': {
'path': config['LOGS_DIR'].resolve(),
'enabled': True,
'is_valid': config['LOGS_DIR'].exists(),
},
'PERSONAS_DIR': {
'path': config['PERSONAS_DIR'].resolve(),
'enabled': True,
'is_valid': config['PERSONAS_DIR'].exists(),
},
'ARCHIVE_DIR': {
'path': config['ARCHIVE_DIR'].resolve(),
'enabled': True,
'is_valid': config['ARCHIVE_DIR'].exists(),
'is_mount': os.path.ismount(config['ARCHIVE_DIR'].resolve()),
},
'CONFIG_FILE': {
'path': config['CONFIG_FILE'].resolve(),
'enabled': True,
@@ -1052,6 +1022,38 @@ def get_data_locations(config: ConfigDict) -> ConfigValue:
'is_valid': (config['OUTPUT_DIR'] / SQL_INDEX_FILENAME).exists(),
'is_mount': os.path.ismount((config['OUTPUT_DIR'] / SQL_INDEX_FILENAME).resolve()),
},
'ARCHIVE_DIR': {
'path': config['ARCHIVE_DIR'].resolve(),
'enabled': True,
'is_valid': config['ARCHIVE_DIR'].exists(),
'is_mount': os.path.ismount(config['ARCHIVE_DIR'].resolve()),
},
'SOURCES_DIR': {
'path': config['SOURCES_DIR'].resolve(),
'enabled': True,
'is_valid': config['SOURCES_DIR'].exists(),
},
'LOGS_DIR': {
'path': config['LOGS_DIR'].resolve(),
'enabled': True,
'is_valid': config['LOGS_DIR'].exists(),
},
'CUSTOM_TEMPLATES_DIR': {
'path': config['CUSTOM_TEMPLATES_DIR'] and Path(config['CUSTOM_TEMPLATES_DIR']).resolve(),
'enabled': bool(config['CUSTOM_TEMPLATES_DIR']),
'is_valid': config['CUSTOM_TEMPLATES_DIR'] and Path(config['CUSTOM_TEMPLATES_DIR']).exists(),
},
'PERSONAS_DIR': {
'path': config['PERSONAS_DIR'].resolve(),
'enabled': True,
'is_valid': config['PERSONAS_DIR'].exists(),
},
# managed by bin/docker_entrypoint.sh and python-crontab:
# 'CRONTABS_DIR': {
# 'path': config['CRONTABS_DIR'].resolve(),
# 'enabled': True,
# 'is_valid': config['CRONTABS_DIR'].exists(),
# },
}
def get_dependency_info(config: ConfigDict) -> ConfigValue:
@@ -1366,6 +1368,7 @@ def check_data_folder(out_dir: Union[str, Path, None]=None, config: ConfigDict=C
stderr(' archivebox init')
raise SystemExit(2)
def check_migrations(out_dir: Union[str, Path, None]=None, config: ConfigDict=CONFIG):
output_dir = out_dir or config['OUTPUT_DIR']
from .index.sql import list_migrations

archivebox/core/admin.py

@@ -14,12 +14,17 @@ from django.shortcuts import render, redirect
from django.contrib.auth import get_user_model
from django import forms
from signal_webhooks.apps import DjangoSignalWebhooksConfig
from signal_webhooks.admin import WebhookAdmin, WebhookModel
from ..util import htmldecode, urldecode, ansi_to_html
from core.models import Snapshot, ArchiveResult, Tag
from core.forms import AddLinkForm
from core.mixins import SearchResultsAdminMixin
from api.models import APIToken
from index.html import snapshot_icons
from logging_util import printable_filesize
@@ -98,10 +103,32 @@ class ArchiveBoxAdmin(admin.AdminSite):
return render(template_name='add.html', request=request, context=context)
# monkey patch django-signals-webhooks to change how it shows up in Admin UI
DjangoSignalWebhooksConfig.verbose_name = 'API'
WebhookModel._meta.get_field('name').help_text = 'Give your webhook a descriptive name (e.g. Notify ACME Slack channel of any new ArchiveResults).'
WebhookModel._meta.get_field('signal').help_text = 'The type of event the webhook should fire for (e.g. Create, Update, Delete).'
WebhookModel._meta.get_field('ref').help_text = 'Dot import notation of the model the webhook should fire for (e.g. core.models.Snapshot or core.models.ArchiveResult).'
WebhookModel._meta.get_field('endpoint').help_text = 'External URL to POST the webhook notification to (e.g. https://someapp.example.com/webhook/some-webhook-receiver).'
WebhookModel._meta.app_label = 'api'
archivebox_admin = ArchiveBoxAdmin()
archivebox_admin.register(get_user_model())
archivebox_admin.register(APIToken)
archivebox_admin.register(WebhookModel, WebhookAdmin)
archivebox_admin.disable_action('delete_selected')
# patch admin with methods to add data views
from admin_data_views.admin import get_app_list, admin_data_index_view, get_admin_data_urls, get_urls
archivebox_admin.get_app_list = get_app_list.__get__(archivebox_admin, ArchiveBoxAdmin)
archivebox_admin.admin_data_index_view = admin_data_index_view.__get__(archivebox_admin, ArchiveBoxAdmin)
archivebox_admin.get_admin_data_urls = get_admin_data_urls.__get__(archivebox_admin, ArchiveBoxAdmin)
archivebox_admin.get_urls = get_urls(archivebox_admin.get_urls).__get__(archivebox_admin, ArchiveBoxAdmin)
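(The __get__ calls above use Python's descriptor protocol to graft plain functions onto the already-instantiated admin site as bound methods; a minimal standalone sketch of the same trick:)

class Site:
    pass

def get_app_list(self):
    # stand-in for admin_data_views' replacement method
    return ['app1', 'app2']

site = Site()
site.get_app_list = get_app_list.__get__(site, Site)  # binds the function, so self is supplied
assert site.get_app_list() == ['app1', 'app2']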
class ArchiveResultInline(admin.TabularInline):
model = ArchiveResult

archivebox/core/apps.py

@@ -1,3 +1,5 @@
__package__ = 'archivebox.core'
from django.apps import AppConfig
@@ -5,6 +7,22 @@ class CoreConfig(AppConfig):
name = 'core'
def ready(self):
# register our custom admin as the primary django admin
from django.contrib import admin
from django.contrib.admin import sites
from core.admin import archivebox_admin
admin.site = archivebox_admin
sites.site = archivebox_admin
# register signal handlers
from .auth import register_signals
register_signals()
# from django.contrib.admin.apps import AdminConfig
# class CoreAdminConfig(AdminConfig):
# default_site = "core.admin.get_admin_site"

archivebox/core/auth.py

@@ -1,5 +1,6 @@
import os
from django.conf import settings
__package__ = 'archivebox.core'
from ..config import (
LDAP
)

archivebox/core/auth_ldap.py

@@ -1,10 +1,8 @@
from django.conf import settings
from ..config import (
LDAP_CREATE_SUPERUSER
)
def create_user(sender, user=None, ldap_user=None, **kwargs):
if not user.id and LDAP_CREATE_SUPERUSER:
user.is_superuser = True

archivebox/core/settings.py

@@ -18,6 +18,7 @@ from ..config import (
CUSTOM_TEMPLATES_DIR,
SQL_INDEX_FILENAME,
OUTPUT_DIR,
ARCHIVE_DIR,
LOGS_DIR,
TIMEZONE,
@@ -63,6 +64,9 @@ INSTALLED_APPS = [
'core',
'api',
'admin_data_views',
'signal_webhooks',
'django_extensions',
]
@@ -173,6 +177,17 @@ if DEBUG_TOOLBAR:
]
MIDDLEWARE = [*MIDDLEWARE, 'debug_toolbar.middleware.DebugToolbarMiddleware']
# https://github.com/bensi94/Django-Requests-Tracker (improved version of django-debug-toolbar)
# Must delete archivebox/templates/admin to use because it relies on some things we override
# visit /__requests_tracker__/ to access
DEBUG_REQUESTS_TRACKER = False
if DEBUG_REQUESTS_TRACKER:
INSTALLED_APPS += ["requests_tracker"]
MIDDLEWARE += ["requests_tracker.middleware.requests_tracker_middleware"]
INTERNAL_IPS = ["127.0.0.1", "10.0.2.2", "0.0.0.0", "*"]
################################################################################
### Staticfile and Template Settings
################################################################################
@@ -242,6 +257,29 @@ CACHES = {
EMAIL_BACKEND = 'django.core.mail.backends.console.EmailBackend'
STORAGES = {
"default": {
"BACKEND": "django.core.files.storage.FileSystemStorage",
},
"staticfiles": {
"BACKEND": "django.contrib.staticfiles.storage.StaticFilesStorage",
},
"archive": {
"BACKEND": "django.core.files.storage.FileSystemStorage",
"OPTIONS": {
"base_url": "/archive/",
"location": ARCHIVE_DIR,
},
},
# "personas": {
# "BACKEND": "django.core.files.storage.FileSystemStorage",
# "OPTIONS": {
# "base_url": "/personas/",
# "location": PERSONAS_DIR,
# },
# },
}
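As a usage sketch (assumptions: Django 4.2+ storage registry, and a made-up snapshot path), the named 'archive' storage defined above can be looked up and used like any other Storage backend:

# sketch: accessing the named storage defined in STORAGES above
from django.core.files.storage import storages

archive_storage = storages['archive']      # FileSystemStorage rooted at ARCHIVE_DIR
snapshot_file = '1713935445.0/index.json'  # hypothetical path inside ARCHIVE_DIR
if archive_storage.exists(snapshot_file):
    with archive_storage.open(snapshot_file) as f:
        data = f.read()
    public_url = archive_storage.url(snapshot_file)  # -> '/archive/1713935445.0/index.json'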
################################################################################
### Security Settings
################################################################################
@ -368,3 +406,32 @@ LOGGING = {
}
},
}
# Add default webhook configuration to the User model
SIGNAL_WEBHOOKS = {
"HOOKS": {
"django.contrib.auth.models.User": ...,
"core.models.Snapshot": ...,
"core.models.ArchiveResult": ...,
"core.models.Tag": ...,
"api.models.APIToken": ...,
},
}
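Note the bare ... values: that is Python's Ellipsis singleton, which django-signal-webhooks appears to treat as a sentinel meaning "attach the library's default create/update/delete hooks for this model" (a reading of its docs, not verified here). A tiny illustration of the sentinel pattern itself:

# '...' is the Ellipsis singleton, comparable by identity like None
assert ... is Ellipsis
HOOKS = {'core.models.Snapshot': ...}
if HOOKS['core.models.Snapshot'] is ...:
    pass  # i.e. fall back to the library's default webhook behavior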
ADMIN_DATA_VIEWS = {
"NAME": "configuration",
"URLS": [
{
"route": "live/",
"view": "core.views.live_config_list_view",
"name": "live",
"items": {
"route": "<str:key>/",
"view": "core.views.live_config_value_view",
"name": "live_config_value",
},
},
],
}

View file

@ -1,4 +1,4 @@
from .admin import archivebox_admin
__package__ = 'archivebox.core'
from django.urls import path, include
from django.views import static
@ -6,14 +6,9 @@ from django.contrib.staticfiles.urls import staticfiles_urlpatterns
from django.conf import settings
from django.views.generic.base import RedirectView
from core.views import HomepageView, SnapshotView, PublicIndexView, AddView, HealthCheckView
from .admin import archivebox_admin
from .views import HomepageView, SnapshotView, PublicIndexView, AddView, HealthCheckView
from ninja import NinjaAPI
from api.auth import GlobalAuth
api = NinjaAPI(auth=GlobalAuth())
api.add_router("/auth/", "api.auth.router")
api.add_router("/archive/", "api.archive.router")
# GLOBAL_CONTEXT doesn't work as-is, disabled for now: https://github.com/ArchiveBox/ArchiveBox/discussions/1306
# from config import VERSION, VERSIONS_AVAILABLE, CAN_UPGRADE
@ -43,10 +38,10 @@ urlpatterns = [
path('accounts/', include('django.contrib.auth.urls')),
path('admin/', archivebox_admin.urls),
path("api/", api.urls),
path("api/", include('api.urls')),
path('health/', HealthCheckView.as_view(), name='healthcheck'),
path('error/', lambda _: 1/0),
path('error/', lambda *_: 1/0),
# path('jet_api/', include('jet_django.urls')), Enable to use https://www.jetadmin.io/integrations/django
@ -57,10 +52,10 @@ urlpatterns = [
urlpatterns += staticfiles_urlpatterns()
if settings.DEBUG_TOOLBAR:
import debug_toolbar
urlpatterns += [
path('__debug__/', include(debug_toolbar.urls)),
]
urlpatterns += [path('__debug__/', include("debug_toolbar.urls"))]
if settings.DEBUG_REQUESTS_TRACKER:
urlpatterns += [path("__requests_tracker__/", include("requests_tracker.urls"))]
# # Proposed FUTURE URLs spec

View file

@ -1,10 +1,12 @@
__package__ = 'archivebox.core'
from typing import Callable
from io import StringIO
from contextlib import redirect_stdout
from django.shortcuts import render, redirect
from django.http import HttpResponse, Http404
from django.http import HttpRequest, HttpResponse, Http404
from django.utils.html import format_html, mark_safe
from django.views import View, static
from django.views.generic.list import ListView
@ -14,6 +16,10 @@ from django.contrib.auth.mixins import UserPassesTestMixin
from django.views.decorators.csrf import csrf_exempt
from django.utils.decorators import method_decorator
from admin_data_views.typing import TableContext, ItemContext
from admin_data_views.utils import render_with_table_view, render_with_item_view, ItemLink
from core.models import Snapshot
from core.forms import AddLinkForm
@ -26,6 +32,10 @@ from ..config import (
COMMIT_HASH,
FOOTER_INFO,
SNAPSHOTS_PER_PAGE,
CONFIG,
CONFIG_SCHEMA,
DYNAMIC_CONFIG_SCHEMA,
USER_CONFIG,
)
from ..main import add
from ..util import base_url, ansi_to_html
@ -124,9 +134,9 @@ class SnapshotView(View):
'<center><br/><br/><br/>'
f'Snapshot <a href="/archive/{snapshot.timestamp}/index.html" target="_top"><b><code>[{snapshot.timestamp}]</code></b></a> exists in DB, but resource <b><code>{snapshot.timestamp}/'
'{}'
f'</code></b> does not exist in <a href="/archive/{snapshot.timestamp}/" target="_top">snapshot dir</a> yet.<br/><br/>'
'Maybe this resource type is not availabe for this Snapshot,<br/>or the archiving process has not completed yet?<br/>'
f'<pre><code># run this cmd to finish archiving this Snapshot<br/>archivebox update -t timestamp {snapshot.timestamp}</code></pre><br/><br/>'
f'</code></b> does not exist in the <a href="/archive/{snapshot.timestamp}/" target="_top">snapshot dir</a> yet.<br/><br/>'
'It\'s possible that this resource type is not available for the Snapshot,<br/>or that the archiving process has not completed yet.<br/>'
f'<pre><code># if interrupted, run this cmd to finish archiving this Snapshot<br/>archivebox update -t timestamp {snapshot.timestamp}</code></pre><br/><br/>'
'<div class="text-align: left; width: 100%; max-width: 400px">'
'<i><b>Next steps:</i></b><br/>'
f'- list all the <a href="/archive/{snapshot.timestamp}/" target="_top">Snapshot files <code>.*</code></a><br/>'
@ -312,3 +322,124 @@ class HealthCheckView(View):
content_type='text/plain',
status=200
)
def find_config_section(key: str) -> str:
matching_sections = [
name for name, opts in CONFIG_SCHEMA.items() if key in opts
]
section = matching_sections[0] if matching_sections else 'DYNAMIC'
return section
def find_config_default(key: str) -> str | None:
default_val = USER_CONFIG.get(key, {}).get('default', lambda: None)
if isinstance(default_val, Callable):
return None
else:
default_val = repr(default_val)
return default_val
def find_config_type(key: str) -> str:
if key in USER_CONFIG:
return USER_CONFIG[key]['type'].__name__
elif key in DYNAMIC_CONFIG_SCHEMA:
return type(CONFIG[key]).__name__
return 'str'
def key_is_safe(key: str) -> bool:
for term in ('key', 'password', 'secret', 'token'):
if term in key.lower():
return False
return True
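Quick illustration of the redaction predicate above (hypothetical keys):

assert key_is_safe('TIMEOUT') is True
assert key_is_safe('LDAP_PASSWORD') is False  # contains 'password', so the UI shows ********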
@render_with_table_view
def live_config_list_view(request: HttpRequest, **kwargs) -> TableContext:
assert request.user.is_superuser, 'Must be a superuser to view configuration settings.'
rows = {
"Section": [],
"Key": [],
"Type": [],
"Value": [],
"Default": [],
# "Documentation": [],
"Aliases": [],
}
for section in CONFIG_SCHEMA.keys():
for key in CONFIG_SCHEMA[section].keys():
rows['Section'].append(section.replace('_', ' ').title().replace(' Config', ''))
rows['Key'].append(ItemLink(key, key=key))
rows['Type'].append(mark_safe(f'<code>{find_config_type(key)}</code>'))
rows['Value'].append(mark_safe(f'<code>{CONFIG[key]}</code>') if key_is_safe(key) else '******** (redacted)')
rows['Default'].append(mark_safe(f'<a href="https://github.com/search?q=repo%3AArchiveBox%2FArchiveBox+path%3Aconfig.py+%27{key}%27&type=code"><code style="text-decoration: underline">{find_config_default(key) or "See here..."}</code></a>'))
# rows['Documentation'].append(mark_safe(f'Wiki: <a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#{key.lower()}">{key}</a>'))
rows['Aliases'].append(', '.join(CONFIG_SCHEMA[section][key].get('aliases', [])))
section = 'DYNAMIC'
for key in DYNAMIC_CONFIG_SCHEMA.keys():
rows['Section'].append(section.replace('_', ' ').title().replace(' Config', ''))
rows['Key'].append(ItemLink(key, key=key))
rows['Type'].append(mark_safe(f'<code>{find_config_type(key)}</code>'))
rows['Value'].append(mark_safe(f'<code>{CONFIG[key]}</code>') if key_is_safe(key) else '******** (redacted)')
rows['Default'].append(mark_safe(f'<a href="https://github.com/search?q=repo%3AArchiveBox%2FArchiveBox+path%3Aconfig.py+%27{key}%27&type=code"><code style="text-decoration: underline">{find_config_default(key) or "See here..."}</code></a>'))
# rows['Documentation'].append(mark_safe(f'Wiki: <a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#{key.lower()}">{key}</a>'))
rows['Aliases'].append(ItemLink(key, key=key) if key in USER_CONFIG else '')
return TableContext(
title="Computed Configuration Values",
table=rows,
)
@render_with_item_view
def live_config_value_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
assert request.user.is_superuser, 'Must be a superuser to view configuration settings.'
aliases = USER_CONFIG.get(key, {}).get("aliases", [])
return ItemContext(
slug=key,
title=key,
data=[
{
"name": mark_safe(f'data / ArchiveBox.conf &nbsp; [{find_config_section(key)}] &nbsp; <b><code style="color: lightgray">{key}</code></b>' if key in USER_CONFIG else f'[DYNAMIC CONFIG] &nbsp; <b><code style="color: lightgray">{key}</code></b> &nbsp; <small>(calculated at runtime)</small>'),
"description": None,
"fields": {
'Key': key,
'Type': find_config_type(key),
'Value': CONFIG[key] if key_is_safe(key) else '********',
},
"help_texts": {
'Key': mark_safe(f'''
<a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#{key.lower()}">Documentation</a> &nbsp;
<span style="display: {"inline" if aliases else "none"}">
Aliases: {", ".join(aliases)}
</span>
'''),
'Type': mark_safe(f'''
<a href="https://github.com/search?q=repo%3AArchiveBox%2FArchiveBox+path%3Aconfig.py+%27{key}%27&type=code">
See full definition in <code>archivebox/config.py</code>...
</a>
'''),
'Value': mark_safe(f'''
{'<b style="color: red">Value is redacted for your security. (Passwords, secrets, API tokens, etc. cannot be viewed in the Web UI)</b><br/><br/>' if not key_is_safe(key) else ''}
Default: <a href="https://github.com/search?q=repo%3AArchiveBox%2FArchiveBox+path%3Aconfig.py+%27{key}%27&type=code">
<code>{find_config_default(key) or 'See here...'}</code>
</a>
<br/><br/>
<p style="display: {"block" if key in USER_CONFIG else "none"}">
<i>To change this value, edit <code>data/ArchiveBox.conf</code> or run:</i>
<br/><br/>
<code>archivebox config --set {key}="{
val.strip("'")
if (val := find_config_default(key)) else
(repr(CONFIG[key] if key_is_safe(key) else '********')).strip("'")
}"</code>
</p>
'''),
},
},
],
)

View file

@ -4,6 +4,7 @@ WARNING: THIS FILE IS ALL LEGACY CODE TO BE REMOVED.
DO NOT ADD ANY NEW FEATURES TO THIS FILE, NEW CODE GOES HERE: core/models.py
These are the old types we used to use before ArchiveBox v0.4 (before we switched to Django).
"""
__package__ = 'archivebox.index'

View file

@ -494,12 +494,12 @@ def log_removal_started(links: List["Link"], yes: bool, delete: bool):
if delete:
file_counts = [link.num_outputs for link in links if Path(link.link_dir).exists()]
print(
f' {len(links)} Links will be de-listed from the main index, and their archived content folders will be deleted from disk.\n' +
f' {len(links)} Links will be de-listed from the main index, and their archived content folders will be deleted from disk.\n'
f' ({len(file_counts)} data folders with {sum(file_counts)} archived files will be deleted!)'
)
else:
print(
' Matching links will be de-listed from the main index, but their archived content folders will remain in place on disk.\n' +
' Matching links will be de-listed from the main index, but their archived content folders will remain in place on disk.\n'
' (Pass --delete if you also want to permanently delete the data folders)'
)
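The rationale for dropping the trailing '+' in this hunk: adjacent Python string literals are concatenated at compile time, so the operator (and the runtime concat it implies) is redundant inside parentheses. A minimal illustration:

# adjacent string literals are merged at compile time, no '+' needed
msg = (
    'line one\n'
    'line two'
)
assert msg == 'line one\nline two'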

View file

@ -104,7 +104,6 @@ from .config import (
COMMIT_HASH,
BUILD_TIME,
CODE_LOCATIONS,
EXTERNAL_LOCATIONS,
DATA_LOCATIONS,
DEPENDENCIES,
CHROME_BINARY,
@ -231,7 +230,7 @@ def version(quiet: bool=False,
p = platform.uname()
print(
'ArchiveBox v{}'.format(get_version(CONFIG)),
*((f'COMMIT_HASH={COMMIT_HASH[:7]}',) if COMMIT_HASH else ()),
f'COMMIT_HASH={COMMIT_HASH[:7] if COMMIT_HASH else "unknown"}',
f'BUILD_TIME={BUILD_TIME}',
)
print(
@ -272,11 +271,6 @@ def version(quiet: bool=False,
for name, path in CODE_LOCATIONS.items():
print(printable_folder_status(name, path))
print()
print('{white}[i] Secrets locations:{reset}'.format(**ANSI))
for name, path in EXTERNAL_LOCATIONS.items():
print(printable_folder_status(name, path))
print()
if DATA_LOCATIONS['OUTPUT_DIR']['is_valid']:
print('{white}[i] Data locations:{reset}'.format(**ANSI))
@ -695,7 +689,7 @@ def add(urls: Union[str, List[str]],
if CAN_UPGRADE:
hint(f"There's a new version of ArchiveBox available! Your current version is {VERSION}. You can upgrade to {VERSIONS_AVAILABLE['recommended_version']['tag_name']} ({VERSIONS_AVAILABLE['recommended_version']['html_url']}). For more on how to upgrade: https://github.com/ArchiveBox/ArchiveBox/wiki/Upgrading-or-Merging-Archives\n")
return all_links
return new_links
@enforce_types
def remove(filter_str: Optional[str]=None,
@ -1362,7 +1356,7 @@ def manage(args: Optional[List[str]]=None, out_dir: Path=OUTPUT_DIR) -> None:
if (args and "createsuperuser" in args) and (IN_DOCKER and not IS_TTY):
stderr('[!] Warning: you need to pass -it to use interactive commands in docker', color='lightyellow')
stderr(' docker run -it archivebox manage {}'.format(' '.join(args or ['...'])), color='lightyellow')
stderr()
stderr('')
execute_from_command_line([f'{ARCHIVEBOX_BINARY} manage', *(args or ['help'])])

View file

@ -7,7 +7,7 @@ if __name__ == '__main__':
# versions of ./manage.py commands whenever possible. When that's not possible
# (e.g. makemigrations), you can comment out this check temporarily
if not ('makemigrations' in sys.argv or 'migrate' in sys.argv):
if not ('makemigrations' in sys.argv or 'migrate' in sys.argv or 'startapp' in sys.argv):
print("[X] Don't run ./manage.py directly (unless you are a developer running makemigrations):")
print()
print(' Hint: Use these archivebox CLI commands instead of the ./manage.py equivalents:')

View file

@ -7,7 +7,6 @@ For examples of supported import formats see tests/.
__package__ = 'archivebox.parsers'
import re
from io import StringIO
from typing import IO, Tuple, List, Optional
@ -28,7 +27,6 @@ from ..util import (
htmldecode,
download_url,
enforce_types,
URL_REGEX,
)
from ..index.schema import Link
from ..logging_util import TimedProgress, log_source_saved
@ -202,54 +200,3 @@ def save_file_as_source(path: str, timeout: int=TIMEOUT, filename: str='{ts}-{ba
log_source_saved(source_file=source_path)
return source_path
# Check that plain text regex URL parsing works as expected
# this is last-line-of-defense to make sure the URL_REGEX isn't
# misbehaving due to some OS-level or environment level quirks (e.g. bad regex lib)
# the consequences of bad URL parsing could be disastrous and lead to many
# incorrect/badly parsed links being added to the archive, so this is worth the cost of checking
_test_url_strs = {
'example.com': 0,
'/example.com': 0,
'//example.com': 0,
':/example.com': 0,
'://example.com': 0,
'htt://example8.com': 0,
'/htt://example.com': 0,
'https://example': 1,
'https://localhost/2345': 1,
'https://localhost:1234/123': 1,
'://': 0,
'https://': 0,
'http://': 0,
'ftp://': 0,
'ftp://example.com': 0,
'https://example.com': 1,
'https://example.com/': 1,
'https://a.example.com': 1,
'https://a.example.com/': 1,
'https://a.example.com/what/is/happening.html': 1,
'https://a.example.com/what/ís/happening.html': 1,
'https://a.example.com/what/is/happening.html?what=1&2%20b#höw-about-this=1a': 1,
'https://a.example.com/what/is/happéning/?what=1&2%20b#how-aboüt-this=1a': 1,
'HTtpS://a.example.com/what/is/happening/?what=1&2%20b#how-about-this=1af&2f%20b': 1,
'https://example.com/?what=1#how-about-this=1&2%20baf': 1,
'https://example.com?what=1#how-about-this=1&2%20baf': 1,
'<test>http://example7.com</test>': 1,
'https://<test>': 0,
'https://[test]': 0,
'http://"test"': 0,
'http://\'test\'': 0,
'[https://example8.com/what/is/this.php?what=1]': 1,
'[and http://example9.com?what=1&other=3#and-thing=2]': 1,
'<what>https://example10.com#and-thing=2 "</about>': 1,
'abc<this["https://example11.com/what/is#and-thing=2?whoami=23&where=1"]that>def': 1,
'sdflkf[what](https://example12.com/who/what.php?whoami=1#whatami=2)?am=hi': 1,
'<or>http://examplehttp://15.badc</that>': 2,
'https://a.example.com/one.html?url=http://example.com/inside/of/another?=http://': 2,
'[https://a.example.com/one.html?url=http://example.com/inside/of/another?=](http://a.example.com)': 3,
}
for url_str, num_urls in _test_url_strs.items():
assert len(re.findall(URL_REGEX, url_str)) == num_urls, (
f'{url_str} does not contain {num_urls} urls')

View file

@ -10,7 +10,7 @@ from ..index.schema import Link
from ..util import (
htmldecode,
enforce_types,
URL_REGEX,
find_all_urls,
)
from html.parser import HTMLParser
from urllib.parse import urljoin
@ -40,10 +40,22 @@ def parse_generic_html_export(html_file: IO[str], root_url: Optional[str]=None,
parser.feed(line)
for url in parser.urls:
if root_url:
# resolve relative urls /home.html -> https://example.com/home.html
url = urljoin(root_url, url)
for archivable_url in re.findall(URL_REGEX, url):
url_is_absolute = (url.lower().startswith('http://') or url.lower().startswith('https://'))
# url = https://abc.com => True
# url = /page.php?next=https://example.com => False
if not url_is_absolute: # resolve it by joining it with root_url
relative_path = url
url = urljoin(root_url, relative_path) # https://example.com/somepage.html + /home.html
# => https://example.com/home.html
# special case to handle bug around // handling, crucial for urls that contain sub-urls
# e.g. https://web.archive.org/web/https://example.com
if did_urljoin_misbehave(root_url, relative_path, url):
url = fix_urljoin_bug(url)
for archivable_url in find_all_urls(url):
yield Link(
url=htmldecode(archivable_url),
timestamp=str(datetime.now(timezone.utc).timestamp()),
@ -56,3 +68,74 @@ def parse_generic_html_export(html_file: IO[str], root_url: Optional[str]=None,
KEY = 'html'
NAME = 'Generic HTML'
PARSER = parse_generic_html_export
#### WORKAROUND CODE FOR https://github.com/python/cpython/issues/96015 ####
def did_urljoin_misbehave(root_url: str, relative_path: str, final_url: str) -> bool:
"""
Handle urljoin edge case bug where multiple slashes get turned into a single slash:
- https://github.com/python/cpython/issues/96015
- https://github.com/ArchiveBox/ArchiveBox/issues/1411
This workaround only fixes the most common case of a sub-URL inside an outer URL, e.g.:
https://web.archive.org/web/https://example.com/some/inner/url
But there are other valid URLs containing // that are not fixed by this workaround, e.g.:
https://example.com/drives/C//some/file
"""
# if relative path is actually an absolute url, cut off its own scheme so we check the path component only
relative_path = relative_path.lower()
if relative_path.startswith('http://') or relative_path.startswith('https://'):
relative_path = relative_path.split('://', 1)[-1]
# TODO: properly fix all double // getting stripped by urljoin, not just ://
original_path_had_suburl = '://' in relative_path
original_root_had_suburl = '://' in root_url[8:] # ignore first 8 chars because root always starts with https://
final_joined_has_suburl = '://' in final_url[8:] # ignore first 8 chars because final always starts with https://
urljoin_broke_suburls = (
(original_root_had_suburl or original_path_had_suburl)
and not final_joined_has_suburl
)
return urljoin_broke_suburls
def fix_urljoin_bug(url: str, nesting_limit=5):
"""
recursively replace broken suburls .../http:/... with .../http://...
basically equivalent to this for 99.9% of cases:
url = url.replace('/http:/', '/http://')
url = url.replace('/https:/', '/https://')
except this handles:
other schemes besides http/https (e.g. https://example.com/link/git+ssh://github.com/example)
other preceding separators besides / (e.g. https://example.com/login/?next=https://example.com/home)
fixing multiple suburls recursively
"""
input_url = url
for _ in range(nesting_limit):
url = re.sub(
r'(?P<root>.+?)' # https://web.archive.org/web
+ r'(?P<separator>[-=/_&+%$#@!*\(\\])' # /
+ r'(?P<subscheme>[a-zA-Z0-9+_-]{1,32}?):/' # http:/
+ r'(?P<suburl>[^/\\]+)', # example.com
r"\1\2\3://\4",
input_url,
flags=re.IGNORECASE | re.UNICODE,  # must be passed as flags=; the 4th positional arg of re.sub is count
)
if url == input_url:
break # nothing left to replace, all suburls are fixed
input_url = url
return url
# sanity check to make sure the workaround code works as expected and doesn't introduce *more* bugs
assert did_urljoin_misbehave('https://web.archive.org/web/https://example.com', 'abc.html', 'https://web.archive.org/web/https:/example.com/abc.html') == True
assert did_urljoin_misbehave('http://example.com', 'https://web.archive.org/web/http://example.com/abc.html', 'https://web.archive.org/web/http:/example.com/abc.html') == True
assert fix_urljoin_bug('https:/example.com') == 'https:/example.com' # should not modify original url's scheme, only sub-urls
assert fix_urljoin_bug('https://web.archive.org/web/https:/example.com/abc.html') == 'https://web.archive.org/web/https://example.com/abc.html'
assert fix_urljoin_bug('http://example.com/link/git+ssh:/github.com/example?next=ftp:/example.com') == 'http://example.com/link/git+ssh://github.com/example?next=ftp://example.com'
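Putting the two helpers together, a usage sketch mirroring the parser loop above (the input/output values are taken verbatim from the asserts; the mangled form is what urljoin can emit on affected CPython versions):

root_url = 'https://web.archive.org/web/https://example.com'
relative_path = 'abc.html'
mangled = 'https://web.archive.org/web/https:/example.com/abc.html'  # inner // collapsed

fixed = mangled
if did_urljoin_misbehave(root_url, relative_path, mangled):
    fixed = fix_urljoin_bug(mangled)
assert fixed == 'https://web.archive.org/web/https://example.com/abc.html'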

View file

@ -72,21 +72,13 @@ def parse_generic_json_export(json_file: IO[str], **_kwargs) -> Iterable[Link]:
json_file.seek(0)
try:
links = json.load(json_file)
if type(links) != list:
raise Exception('JSON parser expects list of objects, maybe this is JSONL?')
except json.decoder.JSONDecodeError:
# sometimes the first line is a comment or other junk, so try without
json_file.seek(0)
first_line = json_file.readline()
#print(' > Trying JSON parser without first line: "', first_line.strip(), '"', sep= '')
links = json.load(json_file)
# we may fail again, which means we really don't know what to do
links = json.load(json_file)
if type(links) != list:
raise Exception('JSON parser expects list of objects, maybe this is JSONL?')
for link in links:
if link:
yield jsonObjectToLink(link,json_file.name)
yield jsonObjectToLink(link, json_file.name)
KEY = 'json'
NAME = 'Generic JSON'
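A concrete illustration of the retry logic above, as a standalone sketch with a made-up export whose first line is junk:

import io
import json

fake_export = io.StringIO('// comment some tools prepend\n[{"url": "https://example.com"}]')
try:
    links = json.load(fake_export)
except json.decoder.JSONDecodeError:
    fake_export.seek(0)
    fake_export.readline()  # skip the junk first line, as the parser above does
    links = json.load(fake_export)
assert links[0]['url'] == 'https://example.com'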

View file

@ -3,11 +3,9 @@ __package__ = 'archivebox.parsers'
import json
from typing import IO, Iterable
from datetime import datetime, timezone
from ..index.schema import Link
from ..util import (
htmldecode,
enforce_types,
)

View file

@ -1,8 +1,6 @@
__package__ = 'archivebox.parsers'
__description__ = 'Plain Text'
import re
from typing import IO, Iterable
from datetime import datetime, timezone
from pathlib import Path
@ -11,7 +9,7 @@ from ..index.schema import Link
from ..util import (
htmldecode,
enforce_types,
URL_REGEX
find_all_urls,
)
@ -39,7 +37,7 @@ def parse_generic_txt_export(text_file: IO[str], **_kwargs) -> Iterable[Link]:
pass
# otherwise look for anything that looks like a URL in the line
for url in re.findall(URL_REGEX, line):
for url in find_all_urls(line):
yield Link(
url=htmldecode(url),
timestamp=str(datetime.now(timezone.utc).timestamp()),
@ -48,17 +46,6 @@ def parse_generic_txt_export(text_file: IO[str], **_kwargs) -> Iterable[Link]:
sources=[text_file.name],
)
# look inside the URL for any sub-urls, e.g. for archive.org links
# https://web.archive.org/web/20200531203453/https://www.reddit.com/r/socialism/comments/gu24ke/nypd_officers_claim_they_are_protecting_the_rule/fsfq0sw/
# -> https://www.reddit.com/r/socialism/comments/gu24ke/nypd_officers_claim_they_are_protecting_the_rule/fsfq0sw/
for sub_url in re.findall(URL_REGEX, line[1:]):
yield Link(
url=htmldecode(sub_url),
timestamp=str(datetime.now(timezone.utc).timestamp()),
title=None,
tags=None,
sources=[text_file.name],
)
KEY = 'txt'
NAME = 'Generic TXT'

View file

@ -6,6 +6,7 @@
<a href="/admin/core/tag/">Tags</a> |
<a href="/admin/core/archiveresult/?o=-1">Log</a> &nbsp; &nbsp;
<a href="{% url 'Docs' %}" target="_blank" rel="noopener noreferrer">Docs</a> |
<a href="/api">API</a> |
<a href="{% url 'public-index' %}">Public</a> |
<a href="/admin/">Admin</a>
&nbsp; &nbsp;
@ -16,7 +17,7 @@
{% endblock %}
{% block userlinks %}
{% if user.has_usable_password %}
<a href="{% url 'admin:password_change' %}">Account</a> /
<a href="{% url 'admin:password_change' %}" title="Change your account password">Account</a> /
{% endif %}
<a href="{% url 'admin:logout' %}">{% trans 'Log out' %}</a>
{% endblock %}

View file

@ -62,12 +62,12 @@ COLOR_REGEX = re.compile(r'\[(?P<arg_1>\d+)(;(?P<arg_2>\d+)(;(?P<arg_3>\d+))?)?m
# https://mathiasbynens.be/demo/url-regex
URL_REGEX = re.compile(
r'(?=(' +
r'http[s]?://' + # start matching from allowed schemes
r'(?:[a-zA-Z]|[0-9]' + # followed by allowed alphanum characters
r'|[-_$@.&+!*\(\),]' + # or allowed symbols (keep hyphen first to match literal hyphen)
r'|[^\u0000-\u007F])+' + # or allowed unicode bytes
r'[^\]\[<>"\'\s]+' + # stop parsing at these symbols
r'(?=('
r'http[s]?://' # start matching from allowed schemes
r'(?:[a-zA-Z]|[0-9]' # followed by allowed alphanum characters
r'|[-_$@.&+!*\(\),]' # or allowed symbols (keep hyphen first to match literal hyphen)
r'|[^\u0000-\u007F])+' # or allowed unicode bytes
r'[^\]\[<>"\'\s]+' # stop parsing at these symbols
r'))',
re.IGNORECASE | re.UNICODE,
)
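Why the whole pattern is wrapped in (?=(...)): a zero-width lookahead with an inner capture group makes findall report overlapping matches, which is how URLs nested inside other URLs get caught. A sketch with a simplified stand-in pattern (not the real URL_REGEX):

import re

pat = re.compile(r'(?=(https?://[^\s\]\[<>"\']+))', re.IGNORECASE)
matches = pat.findall('https://a.example.com/one.html?url=http://example.com/page')
# -> ['https://a.example.com/one.html?url=http://example.com/page',
#     'http://example.com/page']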
@ -90,6 +90,11 @@ def fix_url_from_markdown(url_str: str) -> str:
helpful to fix URLs parsed from markdown e.g.
input: https://wikipedia.org/en/some_article_(Disambiguation).html?abc=def).somemoretext
result: https://wikipedia.org/en/some_article_(Disambiguation).html?abc=def
IMPORTANT ASSUMPTION: valid urls won't have unbalanced or incorrectly nested parentheses
e.g. this will fail if the user actually wants to ingest a url like 'https://example.com/some_wei)(rd_url'
in that case it will return 'https://example.com/some_wei' (truncated at the first unbalanced paren)
This assumption holds 99.9999% of the time, and for the rare edge case the user can use the url_list parser.
"""
trimmed_url = url_str
@ -353,7 +358,8 @@ def chrome_cleanup():
if IN_DOCKER and lexists("/home/archivebox/.config/chromium/SingletonLock"):
remove_file("/home/archivebox/.config/chromium/SingletonLock")
def ansi_to_html(text):
@enforce_types
def ansi_to_html(text: str) -> str:
"""
Based on: https://stackoverflow.com/questions/19212665/python-converting-ansi-color-codes-to-html
"""
@ -439,11 +445,14 @@ class ExtendedEncoder(pyjson.JSONEncoder):
### URL PARSING TESTS / ASSERTIONS
# they run at runtime because I like having them inline in this file,
# I like the peace of mind knowing it's enforced at runtime across all OS's (in case the regex engine ever has any weird locale-specific quirks),
# and these assertions are basically instant, so not a big performance cost to do it on startup
assert fix_url_from_markdown('/a(b)c).x(y)z') == '/a(b)c'
# Check that plain text regex URL parsing works as expected
# this is a last line of defense to make sure the URL_REGEX isn't
# misbehaving due to some OS-level or environment-level quirks (e.g. regex engine / cpython / locale differences)
# the consequences of bad URL parsing could be disastrous and lead to many
# incorrect/badly parsed links being added to the archive, so this is worth the cost of checking
assert fix_url_from_markdown('http://example.com/a(b)c).x(y)z') == 'http://example.com/a(b)c'
assert fix_url_from_markdown('https://wikipedia.org/en/some_article_(Disambiguation).html?abc=def).link(with)_trailingtext') == 'https://wikipedia.org/en/some_article_(Disambiguation).html?abc=def'
URL_REGEX_TESTS = [
@ -482,3 +491,50 @@ URL_REGEX_TESTS = [
for urls_str, expected_url_matches in URL_REGEX_TESTS:
url_matches = list(find_all_urls(urls_str))
assert url_matches == expected_url_matches, 'FAILED URL_REGEX CHECK!'
# More test cases
_test_url_strs = {
'example.com': 0,
'/example.com': 0,
'//example.com': 0,
':/example.com': 0,
'://example.com': 0,
'htt://example8.com': 0,
'/htt://example.com': 0,
'https://example': 1,
'https://localhost/2345': 1,
'https://localhost:1234/123': 1,
'://': 0,
'https://': 0,
'http://': 0,
'ftp://': 0,
'ftp://example.com': 0,
'https://example.com': 1,
'https://example.com/': 1,
'https://a.example.com': 1,
'https://a.example.com/': 1,
'https://a.example.com/what/is/happening.html': 1,
'https://a.example.com/what/ís/happening.html': 1,
'https://a.example.com/what/is/happening.html?what=1&2%20b#höw-about-this=1a': 1,
'https://a.example.com/what/is/happéning/?what=1&2%20b#how-aboüt-this=1a': 1,
'HTtpS://a.example.com/what/is/happening/?what=1&2%20b#how-about-this=1af&2f%20b': 1,
'https://example.com/?what=1#how-about-this=1&2%20baf': 1,
'https://example.com?what=1#how-about-this=1&2%20baf': 1,
'<test>http://example7.com</test>': 1,
'https://<test>': 0,
'https://[test]': 0,
'http://"test"': 0,
'http://\'test\'': 0,
'[https://example8.com/what/is/this.php?what=1]': 1,
'[and http://example9.com?what=1&other=3#and-thing=2]': 1,
'<what>https://example10.com#and-thing=2 "</about>': 1,
'abc<this["https://example11.com/what/is#and-thing=2?whoami=23&where=1"]that>def': 1,
'sdflkf[what](https://example12.com/who/what.php?whoami=1#whatami=2)?am=hi': 1,
'<or>http://examplehttp://15.badc</that>': 2,
'https://a.example.com/one.html?url=http://example.com/inside/of/another?=http://': 2,
'[https://a.example.com/one.html?url=http://example.com/inside/of/another?=](http://a.example.com)': 3,
}
for url_str, num_urls in _test_url_strs.items():
assert len(list(find_all_urls(url_str))) == num_urls, (
f'{url_str} does not contain {num_urls} urls')

View file

@ -18,7 +18,7 @@ which docker > /dev/null || exit 1
which jq > /dev/null || exit 1
# which pdm > /dev/null || exit 1
SUPPORTED_PLATFORMS="linux/amd64,linux/arm64,linux/arm/v7"
SUPPORTED_PLATFORMS="linux/amd64,linux/arm64"
TAG_NAME="${1:-$(git rev-parse --abbrev-ref HEAD)}"
VERSION="$(jq -r '.version' < "$REPO_DIR/package.json")"
@ -80,20 +80,20 @@ echo "[+] Building archivebox:$VERSION docker image..."
# docker build . --no-cache -t archivebox-dev \
# replace --load with --push to deploy
docker buildx build --platform "$SELECTED_PLATFORMS" --load . \
-t archivebox/archivebox \
# -t archivebox/archivebox \
-t archivebox/archivebox:$TAG_NAME \
-t archivebox/archivebox:$VERSION \
-t archivebox/archivebox:$SHORT_VERSION \
# -t archivebox/archivebox:$VERSION \
# -t archivebox/archivebox:$SHORT_VERSION \
-t archivebox/archivebox:$GIT_SHA \
-t archivebox/archivebox:latest \
-t nikisweeting/archivebox \
# -t archivebox/archivebox:latest \
# -t nikisweeting/archivebox \
-t nikisweeting/archivebox:$TAG_NAME \
-t nikisweeting/archivebox:$VERSION \
-t nikisweeting/archivebox:$SHORT_VERSION \
# -t nikisweeting/archivebox:$VERSION \
# -t nikisweeting/archivebox:$SHORT_VERSION \
-t nikisweeting/archivebox:$GIT_SHA \
-t nikisweeting/archivebox:latest \
# -t nikisweeting/archivebox:latest \
-t ghcr.io/archivebox/archivebox/archivebox:$TAG_NAME \
-t ghcr.io/archivebox/archivebox/archivebox:$VERSION \
-t ghcr.io/archivebox/archivebox/archivebox:$SHORT_VERSION \
# -t ghcr.io/archivebox/archivebox/archivebox:$VERSION \
# -t ghcr.io/archivebox/archivebox/archivebox:$SHORT_VERSION \
-t ghcr.io/archivebox/archivebox/archivebox:$GIT_SHA \
-t ghcr.io/archivebox/archivebox/archivebox:latest
# -t ghcr.io/archivebox/archivebox/archivebox:latest

View file

@ -18,6 +18,7 @@
# https://www.gnu.org/software/bash/manual/html_node/The-Set-Builtin.html
# set -o xtrace
# set -o nounset
shopt -s nullglob
set -o errexit
set -o errtrace
set -o pipefail

View file

@ -15,7 +15,7 @@ DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && cd .. && pwd )"
source "$DIR/.venv/bin/activate"
echo "[*] Running flake8..."
cd archivebox
cd "$DIR/archivebox"
flake8 . && echo "√ No errors found."
echo

View file

@ -48,7 +48,7 @@ echo
echo "[+] Generating dev & prod requirements.txt & pdm.lock from pyproject.toml..."
pip install --upgrade pip setuptools
pdm self update
pdm self update >/dev/null 2>&1 || true
pdm venv create 3.12
echo
echo "pyproject.toml: archivebox $(grep 'version = ' pyproject.toml | awk '{print $3}' | jq -r)"
@ -73,7 +73,7 @@ cp ./pdm.dev.lock ./pip_dist/
cp ./requirements-dev.txt ./pip_dist/
echo
echo "[+]] Generating package-lock.json from package.json..."
echo "[+] Generating package-lock.json from package.json..."
npm install -g npm
echo
echo "package.json: archivebox $(jq -r '.version' package.json)"

View file

@ -27,9 +27,9 @@ if (which docker-compose > /dev/null && docker pull archivebox/archivebox:latest
if [ -f "./index.sqlite3" ]; then
mv -i ~/archivebox/* ~/archivebox/data/
fi
curl -fsSL 'https://raw.githubusercontent.com/ArchiveBox/ArchiveBox/main/docker-compose.yml' > docker-compose.yml
curl -fsSL 'https://raw.githubusercontent.com/ArchiveBox/ArchiveBox/stable/docker-compose.yml' > docker-compose.yml
mkdir -p ./etc
curl -fsSL 'https://raw.githubusercontent.com/ArchiveBox/ArchiveBox/main/etc/sonic.cfg' > ./etc/sonic.cfg
curl -fsSL 'https://raw.githubusercontent.com/ArchiveBox/ArchiveBox/stable/etc/sonic.cfg' > ./etc/sonic.cfg
docker compose run --rm archivebox init --setup
echo
echo "[+] Starting ArchiveBox server using: docker compose up -d..."

View file

@ -48,17 +48,17 @@ services:
# $ docker compose restart archivebox_scheduler
archivebox_scheduler:
image: archivebox/archivebox:latest
command: schedule --foreground --update --every=day
environment:
- TIMEOUT=120 # use a higher timeout than the main container to give slow tasks more time when retrying
# - PUID=502 # set to your host user's UID & GID if you encounter permissions issues
# - PGID=20
volumes:
- ./data:/data
# cpus: 2 # uncomment / edit these values to limit scheduler container resource consumption
# mem_limit: 2048m
# restart: always
image: archivebox/archivebox:latest
command: schedule --foreground --update --every=day
environment:
- TIMEOUT=120 # use a higher timeout than the main container to give slow tasks more time when retrying
# - PUID=502 # set to your host user's UID & GID if you encounter permissions issues
# - PGID=20
volumes:
- ./data:/data
# cpus: 2 # uncomment / edit these values to limit scheduler container resource consumption
# mem_limit: 2048m
# restart: always
### This runs the optional Sonic full-text search backend (much faster than default rg backend).
@ -72,7 +72,7 @@ services:
# not needed after first run / if you have already have ./etc/sonic.cfg present
dockerfile_inline: |
FROM quay.io/curl/curl:latest AS config_downloader
RUN curl -fsSL 'https://raw.githubusercontent.com/ArchiveBox/ArchiveBox/main/etc/sonic.cfg' > /tmp/sonic.cfg
RUN curl -fsSL 'https://raw.githubusercontent.com/ArchiveBox/ArchiveBox/stable/etc/sonic.cfg' > /tmp/sonic.cfg
FROM valeriansaliou/sonic:latest
COPY --from=config_downloader /tmp/sonic.cfg /etc/sonic.cfg
expose:
@ -99,7 +99,7 @@ services:
# restricted to access from localhost by default because it has no authentication
- 127.0.0.1:8080:8080
### Example: Put Nginx in front of the ArchiveBox server for SSL termination and static file serving.
# You can also any other ingress provider for SSL like Apache, Caddy, Traefik, Cloudflare Tunnels, etc.
@ -173,7 +173,7 @@ services:
### Example: run all your ArchiveBox traffic through a WireGuard VPN tunnel to avoid IP blocks.
# You can also use any other VPN that works at the docker IP level, e.g. Tailscale, OpenVPN, etc.
# wireguard:
# image: linuxserver/wireguard:latest
# network_mode: 'service:archivebox'

2
docs

@ -1 +1 @@
Subproject commit a1b69c51ba9b249c0b2a6efd141dbb792fc36ad2
Subproject commit f23abba9773b67ad9f2fd04d6f2e8e056dfa6521

50
package-lock.json generated
View file

@ -25,9 +25,9 @@
}
},
"node_modules/@babel/runtime-corejs2": {
"version": "7.24.4",
"resolved": "https://registry.npmjs.org/@babel/runtime-corejs2/-/runtime-corejs2-7.24.4.tgz",
"integrity": "sha512-ZCKqyUKt/Coimg+3Kafu43yNetgYnTXzNbEGAgxc81J5sI0qFNbQ613w7PNny+SmijAmGVroL0GDvx5rG/JI5Q==",
"version": "7.24.5",
"resolved": "https://registry.npmjs.org/@babel/runtime-corejs2/-/runtime-corejs2-7.24.5.tgz",
"integrity": "sha512-cC9jiO6s/IN+xwCHYy1AGrcFJ4bwgIwb8HX1KaoEpRsznLlO4x9eBP6AX7RIeMSWlQqEj2WHox637OS8cDq6Ew==",
"dependencies": {
"core-js": "^2.6.12",
"regenerator-runtime": "^0.14.0"
@ -203,9 +203,9 @@
"integrity": "sha512-C5Mc6rdnsaJDjO3UpGW/CQTHtCKaYlScZTly4JIu97Jxo/odCiH0ITnDXSJPTOrEKk/ycSZ0AOgTmkDtkOsvIA=="
},
"node_modules/@types/node": {
"version": "20.12.7",
"resolved": "https://registry.npmjs.org/@types/node/-/node-20.12.7.tgz",
"integrity": "sha512-wq0cICSkRLVaf3UGLMGItu/PtdY7oaXaI/RVU+xliKVOtRna3PRY57ZDfztpDL0n11vfymMUnXv8QwYCO7L1wg==",
"version": "20.12.8",
"resolved": "https://registry.npmjs.org/@types/node/-/node-20.12.8.tgz",
"integrity": "sha512-NU0rJLJnshZWdE/097cdCBbyW1h4hEg0xpovcoAQYHl8dnEyp/NAOiE45pvc+Bd1Dt+2r94v2eGFpQJ4R7g+2w==",
"optional": true,
"dependencies": {
"undici-types": "~5.26.4"
@ -713,9 +713,9 @@
"integrity": "sha512-3VdM/SXBZX2omc9JF9nOPCtDaYQ67BGp5CoLpIQlO2KCAPETs8TcDHacF26jXadGbvUteZzRTeos2fhID5+ucQ=="
},
"node_modules/dompurify": {
"version": "3.1.0",
"resolved": "https://registry.npmjs.org/dompurify/-/dompurify-3.1.0.tgz",
"integrity": "sha512-yoU4rhgPKCo+p5UrWWWNKiIq+ToGqmVVhk0PmMYBK4kRsR3/qhemNFL8f6CFmBd4gMwm3F4T7HBoydP5uY07fA=="
"version": "3.1.2",
"resolved": "https://registry.npmjs.org/dompurify/-/dompurify-3.1.2.tgz",
"integrity": "sha512-hLGGBI1tw5N8qTELr3blKjAML/LY4ANxksbS612UiJyDfyf/2D092Pvm+S7pmeTGJRqvlJkFzBoHBQKgQlOQVg=="
},
"node_modules/domutils": {
"version": "1.5.1",
@ -1655,6 +1655,26 @@
"node": ">=18"
}
},
"node_modules/puppeteer-core/node_modules/ws": {
"version": "8.16.0",
"resolved": "https://registry.npmjs.org/ws/-/ws-8.16.0.tgz",
"integrity": "sha512-HS0c//TP7Ina87TfiPUz1rQzMhHrl/SG2guqRcTOIUYD2q8uhUdNHZYJUaQ8aTGPzCh+c6oawMKW35nFl1dxyQ==",
"engines": {
"node": ">=10.0.0"
},
"peerDependencies": {
"bufferutil": "^4.0.1",
"utf-8-validate": ">=5.0.2"
},
"peerDependenciesMeta": {
"bufferutil": {
"optional": true
},
"utf-8-validate": {
"optional": true
}
}
},
"node_modules/qs": {
"version": "6.5.3",
"resolved": "https://registry.npmjs.org/qs/-/qs-6.5.3.tgz",
@ -2071,9 +2091,9 @@
}
},
"node_modules/tough-cookie": {
"version": "4.1.3",
"resolved": "https://registry.npmjs.org/tough-cookie/-/tough-cookie-4.1.3.tgz",
"integrity": "sha512-aX/y5pVRkfRnfmuX+OdbSdXvPe6ieKX/G2s7e98f4poJHnqH3281gDPm/metm6E/WRamfx7WC4HUqkWHfQHprw==",
"version": "4.1.4",
"resolved": "https://registry.npmjs.org/tough-cookie/-/tough-cookie-4.1.4.tgz",
"integrity": "sha512-Loo5UUvLD9ScZ6jh8beX1T6sO1w2/MpCRpEP7V280GKMVUQ0Jzar2U3UJPsrdbziLEMMhu3Ujnq//rhiFuIeag==",
"dependencies": {
"psl": "^1.1.33",
"punycode": "^2.1.1",
@ -2276,9 +2296,9 @@
"integrity": "sha512-l4Sp/DRseor9wL6EvV2+TuQn63dMkPjZ/sp9XkghTEbV9KlPS1xUsZ3u7/IQO4wxtcFB4bgpQPRcR3QCvezPcQ=="
},
"node_modules/ws": {
"version": "8.16.0",
"resolved": "https://registry.npmjs.org/ws/-/ws-8.16.0.tgz",
"integrity": "sha512-HS0c//TP7Ina87TfiPUz1rQzMhHrl/SG2guqRcTOIUYD2q8uhUdNHZYJUaQ8aTGPzCh+c6oawMKW35nFl1dxyQ==",
"version": "8.17.0",
"resolved": "https://registry.npmjs.org/ws/-/ws-8.17.0.tgz",
"integrity": "sha512-uJq6108EgZMAl20KagGkzCKfMEjxmKvZHG7Tlq0Z6nOky7YF7aq4mOx6xK8TJ/i1LeK4Qus7INktacctDgY8Ow==",
"engines": {
"node": ">=10.0.0"
},

1128
pdm.lock

File diff suppressed because it is too large

View file

@ -12,32 +12,31 @@ readme = "README.md"
# pdm install
# pdm update --unconstrained
dependencies = [
# Last Bumped: 2024-04-25
# Base Framework and Language Dependencies
"setuptools>=69.5.1",
"django>=4.2.0,<5.0",
"django>=5.0.4,<6.0",
"django-ninja>=1.1.0",
"django-extensions>=3.2.3",
"mypy-extensions>=1.0.0",
# Python Helper Libraries
"requests>=2.31.0",
"dateparser>=1.0.0",
"feedparser>=6.0.11",
"w3lib>=1.22.0",
"w3lib>=2.1.2",
# Feature-Specific Dependencies
"python-crontab>=2.5.1", # for: archivebox schedule
"croniter>=0.3.34", # for: archivebox schedule
"ipython>5.0.0", # for: archivebox shell
"python-crontab>=3.0.0", # for: archivebox schedule
"croniter>=2.0.5", # for: archivebox schedule
"ipython>=8.23.0", # for: archivebox shell
# Extractor Dependencies
"yt-dlp>=2024.4.9", # for: media
"playwright>=1.43.0; platform_machine != 'armv7l'", # WARNING: playwright doesn't have any sdist, causes trouble on build systems that refuse to install wheel-only packages
# "playwright>=1.43.0; platform_machine != 'armv7l'", # WARNING: playwright doesn't have any sdist, causes trouble on build systems that refuse to install wheel-only packages
# TODO: add more extractors
# - gallery-dl
# - scihubdl
# - See Github issues for more...
"django-signal-webhooks>=0.3.0",
"django-admin-data-views>=0.3.1",
]
homepage = "https://github.com/ArchiveBox/ArchiveBox"
@ -59,9 +58,6 @@ classifiers = [
"Natural Language :: English",
"Operating System :: OS Independent",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.7",
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
@ -100,10 +96,10 @@ ldap = [
# pdm update --dev --unconstrained
[tool.pdm.dev-dependencies]
build = [
# "pdm", # usually installed by apt/brew, dont double-install with pip
"setuptools>=69.5.1",
"pip",
"wheel",
"pdm",
"homebrew-pypi-poet>=0.10.0", # for: generating archivebox.rb brewfile list of python packages
]
docs = [
@ -115,10 +111,11 @@ debug = [
"django-debug-toolbar",
"djdt_flamegraph",
"ipdb",
"requests-tracker>=0.3.3",
]
test = [
"pdm[pytest]",
"pytest",
"bottle",
]
lint = [
"flake8",
@ -126,6 +123,12 @@ lint = [
"django-stubs",
]
[tool.pdm.scripts]
lint = "./bin/lint.sh"
test = "./bin/test.sh"
# all = {composite = ["lint mypackage/", "test -v tests/"]}
[build-system]
requires = ["pdm-backend"]
build-backend = "pdm.backend"
@ -134,11 +137,6 @@ build-backend = "pdm.backend"
archivebox = "archivebox.cli:main"
[tool.pdm.scripts]
lint = "./bin/lint.sh"
test = "./bin/test.sh"
# all = {composite = ["lint mypackage/", "test -v tests/"]}
[tool.pytest.ini_options]
testpaths = [ "tests" ]
@ -154,6 +152,8 @@ explicit_package_bases = true
# exclude = "pdm/(pep582/|models/in_process/.+\\.py)"
plugins = ["mypy_django_plugin.main"]
[tool.django-stubs]
django_settings_module = "core.settings"
[project.urls]

View file

@ -2,54 +2,59 @@
# Please do not edit it manually.
annotated-types==0.6.0
anyio==4.3.0
asgiref==3.8.1
asttokens==2.4.1
brotli==1.1.0; implementation_name == "cpython"
brotlicffi==1.1.0.0; implementation_name != "cpython"
certifi==2024.2.2
cffi==1.16.0; implementation_name != "cpython"
cffi==1.16.0; platform_python_implementation != "PyPy" or implementation_name != "cpython"
charset-normalizer==3.3.2
colorama==0.4.6; sys_platform == "win32"
croniter==2.0.5
cryptography==42.0.7
dateparser==1.2.0
decorator==5.1.1
django==4.2.11
django==5.0.4
django-auth-ldap==4.8.0
django-extensions==3.2.3
django-ninja==1.1.0
django-settings-holder==0.1.2
django-signal-webhooks==0.3.0
exceptiongroup==1.2.1; python_version < "3.11"
executing==2.0.1
feedparser==6.0.11
greenlet==3.0.3; platform_machine != "armv7l"
h11==0.14.0
httpcore==1.0.5
httpx==0.27.0
idna==3.7
ipython==8.23.0
ipython==8.24.0
jedi==0.19.1
matplotlib-inline==0.1.7
mutagen==1.47.0
mypy-extensions==1.0.0
parso==0.8.4
pexpect==4.9.0; sys_platform != "win32" and sys_platform != "emscripten"
playwright==1.43.0; platform_machine != "armv7l"
prompt-toolkit==3.0.43
ptyprocess==0.7.0; sys_platform != "win32" and sys_platform != "emscripten"
pure-eval==0.2.2
pyasn1==0.6.0
pyasn1-modules==0.4.0
pycparser==2.22; implementation_name != "cpython"
pycparser==2.22; platform_python_implementation != "PyPy" or implementation_name != "cpython"
pycryptodomex==3.20.0
pydantic==2.7.1
pydantic-core==2.18.2
pyee==11.1.0; platform_machine != "armv7l"
pygments==2.17.2
pygments==2.18.0
python-crontab==3.0.0
python-dateutil==2.9.0.post0
python-ldap==3.4.4
pytz==2024.1
regex==2024.4.16
regex==2024.4.28
requests==2.31.0
setuptools==69.5.1
sgmllib3k==1.0.0
six==1.16.0
sniffio==1.3.1
sonic-client==1.0.0
sqlparse==0.5.0
stack-data==0.6.3