
Compare commits


No commits in common. "dev" and "v0.7.2" have entirely different histories.
dev...v0.7.2

150 changed files with 3763 additions and 11988 deletions

@@ -17,11 +17,6 @@ venv/
.venv-old/
.docker-venv/
node_modules/
chrome/
chromeprofile/
pdm.dev.lock
pdm.lock
docs/
build/
@@ -33,7 +28,4 @@ assets/
docker/
data/
data*/
output/
index.sqlite3
index.sqlite3-wal

.gitattributes (2 lines changed)

@@ -1,2 +0,0 @@
**/*.lock
**/*-lock.json

.github/FUNDING.yml (5 lines changed)

@@ -1,2 +1,3 @@
github: ["ArchiveBox", "pirate"]
custom: ["https://donate.archivebox.io", "https://paypal.me/NicholasSweeting"]
github: pirate
patreon: theSquashSH
custom: ["https://twitter.com/ArchiveBoxApp", "https://paypal.me/NicholasSweeting", "https://www.blockchain.com/eth/address/0x5D4c34D4a121Fe08d1dDB7969F07550f2dB9f471", "https://www.blockchain.com/btc/address/1HuxXriPE2Bbnag3jJrqa3bkNHrs297dYH"]

@@ -6,7 +6,6 @@ labels: ''
assignees: ''
---
<!-- If you prefer, you can make a PR to https://github.com/ArchiveBox/docs instead of opening an issue -->
## Wiki Page URL
<!-- e.g. https://github.com/pirate/ArchiveBox/wiki/Configuration#use_color -->

@@ -1,25 +0,0 @@
# To get started with Dependabot version updates, you'll need to specify which
# package ecosystems to update and where the package manifests are located.
# Please see the documentation for all configuration options:
# https://docs.github.com/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file
version: 2
updates:
- package-ecosystem: "pip"
directory: "/"
target-branch: "dev"
schedule:
interval: "monthly"
groups:
pip:
patterns:
- "*"
- package-ecosystem: "npm"
directory: "/"
target-branch: "dev"
schedule:
interval: "monthly"
groups:
npm:
patterns:
- "*"

.github/workflows/codeql-analysis.yml (new file, 32 lines changed)

@@ -0,0 +1,32 @@
name: "CodeQL"
on:
push:
branches: [ dev ]
pull_request:
branches: [ dev ]
schedule:
- cron: '43 1 * * 2'
jobs:
analyze:
name: Analyze
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
language: [ 'python' ]
steps:
- name: Checkout repository
uses: actions/checkout@v2
- name: Initialize CodeQL
uses: github/codeql-action/init@v1
with:
languages: ${{ matrix.language }}
queries: security-extended
- name: Perform CodeQL Analysis
uses: github/codeql-action/analyze@v1

@@ -1,92 +0,0 @@
# For most projects, this workflow file will not need changing; you simply need
# to commit it to your repository.
#
# You may wish to alter this file to override the set of languages analyzed,
# or to provide custom queries or build logic.
#
# ******** NOTE ********
# We have attempted to detect the languages in your repository. Please check
# the `language` matrix defined below to confirm you have the correct set of
# supported CodeQL languages.
#
name: "CodeQL"
on:
push:
branches: [ "dev" ]
pull_request:
branches: [ "dev" ]
schedule:
- cron: '33 17 * * 6'
jobs:
analyze:
name: Analyze (${{ matrix.language }})
# Runner size impacts CodeQL analysis time. To learn more, please see:
# - https://gh.io/recommended-hardware-resources-for-running-codeql
# - https://gh.io/supported-runners-and-hardware-resources
# - https://gh.io/using-larger-runners (GitHub.com only)
# Consider using larger runners or machines with greater resources for possible analysis time improvements.
runs-on: ${{ (matrix.language == 'swift' && 'macos-latest') || 'ubuntu-latest' }}
timeout-minutes: ${{ (matrix.language == 'swift' && 120) || 360 }}
permissions:
# required for all workflows
security-events: write
# required to fetch internal or private CodeQL packs
packages: read
# only required for workflows in private repositories
actions: read
contents: read
strategy:
fail-fast: false
matrix:
include:
- language: python
build-mode: none
# CodeQL supports the following values keywords for 'language': 'c-cpp', 'csharp', 'go', 'java-kotlin', 'javascript-typescript', 'python', 'ruby', 'swift'
# Use `c-cpp` to analyze code written in C, C++ or both
# Use 'java-kotlin' to analyze code written in Java, Kotlin or both
# Use 'javascript-typescript' to analyze code written in JavaScript, TypeScript or both
# To learn more about changing the languages that are analyzed or customizing the build mode for your analysis,
# see https://docs.github.com/en/code-security/code-scanning/creating-an-advanced-setup-for-code-scanning/customizing-your-advanced-setup-for-code-scanning.
# If you are analyzing a compiled language, you can modify the 'build-mode' for that language to customize how
# your codebase is analyzed, see https://docs.github.com/en/code-security/code-scanning/creating-an-advanced-setup-for-code-scanning/codeql-code-scanning-for-compiled-languages
steps:
- name: Checkout repository
uses: actions/checkout@v4
# Initializes the CodeQL tools for scanning.
- name: Initialize CodeQL
uses: github/codeql-action/init@v3
with:
languages: ${{ matrix.language }}
build-mode: ${{ matrix.build-mode }}
# If you wish to specify custom queries, you can do so here or in a config file.
# By default, queries listed here will override any specified in a config file.
# Prefix the list here with "+" to use these queries and those in the config file.
# For more details on CodeQL's query packs, refer to: https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs
# queries: security-extended,security-and-quality
# If the analyze step fails for one of the languages you are analyzing with
# "We were unable to automatically build your code", modify the matrix above
# to set the build mode to "manual" for that language. Then modify this step
# to build your code.
# Command-line programs to run using the OS shell.
# 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun
- if: matrix.build-mode == 'manual'
run: |
echo 'If you are using a "manual" build mode for one or more of the' \
'languages you are analyzing, replace this with the commands to build' \
'your code, for example:'
echo ' make bootstrap'
echo ' make release'
exit 1
- name: Perform CodeQL Analysis
uses: github/codeql-action/analyze@v3
with:
category: "/language:${{matrix.language}}"

@@ -11,7 +11,7 @@ on:
env:
DOCKER_IMAGE: archivebox-ci
jobs:
buildx:
runs-on: ubuntu-latest
@@ -24,21 +24,21 @@ jobs:
- name: Set up QEMU
uses: docker/setup-qemu-action@v3
- name: Set up Docker Buildx
id: buildx
uses: docker/setup-buildx-action@v3
with:
version: latest
install: true
platforms: linux/amd64,linux/arm64
platforms: linux/amd64,linux/arm64,linux/arm/v7
- name: Builder instance name
run: echo ${{ steps.buildx.outputs.name }}
- name: Available platforms
run: echo ${{ steps.buildx.outputs.platforms }}
- name: Cache Docker layers
uses: actions/cache@v3
with:
@@ -51,27 +51,21 @@ jobs:
uses: docker/login-action@v3
if: github.event_name != 'pull_request'
with:
username: ${{ secrets.DOCKER_USERNAME }}
password: ${{ secrets.DOCKER_PASSWORD }}
username: ${{ secrets.DOCKER_USERNAME }}
password: ${{ secrets.DOCKER_PASSWORD }}
- name: Collect Docker tags
# https://github.com/docker/metadata-action
id: docker_meta
uses: docker/metadata-action@v5
with:
images: archivebox/archivebox,nikisweeting/archivebox
tags: |
# :stable
type=ref,event=branch
# :0.7.3
type=semver,pattern={{version}}
# :0.7
type=semver,pattern={{major}}.{{minor}}
# :sha-463ea54
type=sha
# :latest
type=raw,value=latest,enable=${{ github.ref == format('refs/heads/{0}', 'stable') }}
type=raw,value=latest,enable={{is_default_branch}}
- name: Build and push
id: docker_build
uses: docker/build-push-action@v5
@@ -83,18 +77,11 @@ jobs:
tags: ${{ steps.docker_meta.outputs.tags }}
cache-from: type=local,src=/tmp/.buildx-cache
cache-to: type=local,dest=/tmp/.buildx-cache-new
platforms: linux/amd64,linux/arm64
platforms: linux/amd64,linux/arm64,linux/arm/v7
- name: Image digest
run: echo ${{ steps.docker_build.outputs.digest }}
- name: Update README
uses: peter-evans/dockerhub-description@v4
with:
username: ${{ secrets.DOCKER_USERNAME }}
password: ${{ secrets.DOCKER_PASSWORD }}
repository: archivebox/archivebox
# This ugly bit is necessary if you don't want your cache to grow forever
# until it hits GitHub's limit of 5GB.
# Temp fix

@@ -35,7 +35,7 @@ jobs:
cache: true
- name: Install dependencies
run: pdm install --fail-fast --no-lock --dev --group=':all' --no-self
run: pdm install --fail-fast --no-lock --group :all --no-self
- name: Build package
run: |

.gitignore (13 lines changed)

@@ -12,11 +12,6 @@ venv/
.docker-venv/
node_modules/
# Ignore dev lockfiles (should always be built fresh)
pdm.lock
pdm.dev.lock
requirements-dev.txt
# Packaging artifacts
.pdm-python
.pdm-build
@@ -27,12 +22,10 @@ dist/
# Data folders
data/
data*/
data1/
data2/
data3/
output/
index.sqlite3
*.sqlite*
data.*
# vim
*.sw?
.vscode

@@ -30,4 +30,5 @@ formats:
# See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html
python:
install:
- requirements: docs/requirements.txt
- requirements: requirements.txt
- requirements: docs/requirements.txt

@@ -10,7 +10,7 @@
# docker run -v "$PWD/data":/data -p 8000:8000 archivebox server
# Multi-arch build:
# docker buildx create --use
# docker buildx build . --platform=linux/amd64,linux/arm64 --push -t archivebox/archivebox:latest -t archivebox/archivebox:dev
# docker buildx build . --platform=linux/amd64,linux/arm64,linux/arm/v7 --push -t archivebox/archivebox:latest -t archivebox/archivebox:dev
#
# Read more about [developing Archivebox](https://github.com/ArchiveBox/ArchiveBox#archivebox-development).
@@ -20,23 +20,9 @@ FROM python:3.11-slim-bookworm
LABEL name="archivebox" \
maintainer="Nick Sweeting <dockerfile@archivebox.io>" \
description="All-in-one self-hosted internet archiving solution" \
description="All-in-one personal internet archiving container" \
homepage="https://github.com/ArchiveBox/ArchiveBox" \
documentation="https://github.com/ArchiveBox/ArchiveBox/wiki/Docker" \
org.opencontainers.image.title="ArchiveBox" \
org.opencontainers.image.vendor="ArchiveBox" \
org.opencontainers.image.description="All-in-one self-hosted internet archiving solution" \
org.opencontainers.image.source="https://github.com/ArchiveBox/ArchiveBox" \
com.docker.image.source.entrypoint="Dockerfile" \
# TODO: release ArchiveBox as a Docker Desktop extension (requires these labels):
# https://docs.docker.com/desktop/extensions-sdk/architecture/metadata/
com.docker.desktop.extension.api.version=">= 1.4.7" \
com.docker.desktop.extension.icon="https://archivebox.io/icon.png" \
com.docker.extension.publisher-url="https://archivebox.io" \
com.docker.extension.screenshots='[{"alt": "Screenshot of Admin UI", "url": "https://github.com/ArchiveBox/ArchiveBox/assets/511499/e8e0b6f8-8fdf-4b7f-8124-c10d8699bdb2"}]' \
com.docker.extension.detailed-description='See here for detailed documentation: https://wiki.archivebox.io' \
com.docker.extension.changelog='See here for release notes: https://github.com/ArchiveBox/ArchiveBox/releases' \
com.docker.extension.categories='database,utility-tools'
documentation="https://github.com/ArchiveBox/ArchiveBox/wiki/Docker#docker"
ARG TARGETPLATFORM
ARG TARGETOS
@@ -87,9 +73,7 @@ COPY --chown=root:root --chmod=755 package.json "$CODE_DIR/"
RUN grep '"version": ' "${CODE_DIR}/package.json" | awk -F'"' '{print $4}' > /VERSION.txt
# Force apt to leave downloaded binaries in /var/cache/apt (massively speeds up Docker builds)
RUN echo 'Binary::apt::APT::Keep-Downloaded-Packages "1";' > /etc/apt/apt.conf.d/99keep-cache \
&& echo 'APT::Install-Recommends "0";' > /etc/apt/apt.conf.d/99no-intall-recommends \
&& echo 'APT::Install-Suggests "0";' > /etc/apt/apt.conf.d/99no-intall-suggests \
RUN echo 'Binary::apt::APT::Keep-Downloaded-Packages "true";' > /etc/apt/apt.conf.d/keep-cache \
&& rm -f /etc/apt/apt.conf.d/docker-clean
# Print debug info about build and save it to disk, for human eyes only, not used by anything else
@@ -122,10 +106,10 @@ RUN echo "[*] Setting up $ARCHIVEBOX_USER user uid=${DEFAULT_PUID}..." \
# Install system apt dependencies (adding backports to access more recent apt updates)
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$TARGETVARIANT \
echo "[+] Installing APT base system dependencies for $TARGETPLATFORM..." \
&& echo 'deb https://deb.debian.org/debian bookworm-backports main contrib non-free' > /etc/apt/sources.list.d/backports.list \
&& echo 'deb https://deb.debian.org/debian bookworm-backports main contrib non-free' >> /etc/apt/sources.list.d/backports.list \
&& mkdir -p /etc/apt/keyrings \
&& apt-get update -qq \
&& apt-get install -qq -y -t bookworm-backports \
&& apt-get install -qq -y -t bookworm-backports --no-install-recommends \
# 1. packaging dependencies
apt-transport-https ca-certificates apt-utils gnupg2 curl wget \
# 2. docker and init system dependencies
@@ -136,13 +120,27 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$T
######### Language Environments ####################################
# Install Node environment
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$TARGETVARIANT --mount=type=cache,target=/root/.npm,sharing=locked,id=npm-$TARGETARCH$TARGETVARIANT \
echo "[+] Installing Node $NODE_VERSION environment in $NODE_MODULES..." \
&& echo "deb [signed-by=/etc/apt/keyrings/nodesource.gpg] https://deb.nodesource.com/node_${NODE_VERSION}.x nodistro main" >> /etc/apt/sources.list.d/nodejs.list \
&& curl -fsSL "https://deb.nodesource.com/gpgkey/nodesource-repo.gpg.key" | gpg --dearmor | gpg --dearmor -o /etc/apt/keyrings/nodesource.gpg \
&& apt-get update -qq \
&& apt-get install -qq -y -t bookworm-backports --no-install-recommends \
nodejs libatomic1 python3-minimal \
&& rm -rf /var/lib/apt/lists/* \
# Update NPM to latest version
&& npm i -g npm --cache /root/.npm \
# Save version info
&& ( \
which node && node --version \
&& which npm && npm --version \
&& echo -e '\n\n' \
) | tee -a /VERSION.txt
# Install Python environment
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$TARGETVARIANT --mount=type=cache,target=/root/.cache/pip,sharing=locked,id=pip-$TARGETARCH$TARGETVARIANT \
echo "[+] Setting up Python $PYTHON_VERSION runtime..." \
# && apt-get update -qq \
# && apt-get install -qq -y -t bookworm-backports --no-upgrade \
# python${PYTHON_VERSION} python${PYTHON_VERSION}-minimal python3-pip \
# && rm -rf /var/lib/apt/lists/* \
# tell PDM to allow using global system python site packages
# && rm /usr/lib/python3*/EXTERNALLY-MANAGED \
# create global virtual environment GLOBAL_VENV to use (better than using pip install --global)
@@ -159,37 +157,17 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$T
&& echo -e '\n\n' \
) | tee -a /VERSION.txt
# Install Node environment
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$TARGETVARIANT --mount=type=cache,target=/root/.npm,sharing=locked,id=npm-$TARGETARCH$TARGETVARIANT \
echo "[+] Installing Node $NODE_VERSION environment in $NODE_MODULES..." \
&& echo "deb [signed-by=/etc/apt/keyrings/nodesource.gpg] https://deb.nodesource.com/node_${NODE_VERSION}.x nodistro main" >> /etc/apt/sources.list.d/nodejs.list \
&& curl -fsSL "https://deb.nodesource.com/gpgkey/nodesource-repo.gpg.key" | gpg --dearmor | gpg --dearmor -o /etc/apt/keyrings/nodesource.gpg \
&& apt-get update -qq \
&& apt-get install -qq -y -t bookworm-backports --no-upgrade libatomic1 \
&& apt-get install -y -t bookworm-backports --no-upgrade \
nodejs \
&& rm -rf /var/lib/apt/lists/* \
# Update NPM to latest version
&& npm i -g npm --cache /root/.npm \
# Save version info
&& ( \
which node && node --version \
&& which npm && npm --version \
&& echo -e '\n\n' \
) | tee -a /VERSION.txt
######### Extractor Dependencies ##################################
# Install apt dependencies
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$TARGETVARIANT --mount=type=cache,target=/root/.cache/pip,sharing=locked,id=pip-$TARGETARCH$TARGETVARIANT \
echo "[+] Installing APT extractor dependencies globally using apt..." \
&& apt-get update -qq \
&& apt-get install -qq -y -t bookworm-backports \
&& apt-get install -qq -y -t bookworm-backports --no-install-recommends \
curl wget git yt-dlp ffmpeg ripgrep \
# Packages we have also needed in the past:
# youtube-dl wget2 aria2 python3-pyxattr rtmpdump libfribidi-bin mpv \
# fontconfig fonts-ipafont-gothic fonts-wqy-zenhei fonts-thai-tlwg fonts-kacst fonts-symbola fonts-noto fonts-freefont-ttf \
&& rm -rf /var/lib/apt/lists/* \
# Save version info
&& ( \
@@ -205,21 +183,18 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$T
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$TARGETVARIANT --mount=type=cache,target=/root/.cache/pip,sharing=locked,id=pip-$TARGETARCH$TARGETVARIANT --mount=type=cache,target=/root/.cache/ms-playwright,sharing=locked,id=browsers-$TARGETARCH$TARGETVARIANT \
echo "[+] Installing Browser binary dependencies to $PLAYWRIGHT_BROWSERS_PATH..." \
&& apt-get update -qq \
&& apt-get install -qq -y -t bookworm-backports \
fontconfig fonts-ipafont-gothic fonts-wqy-zenhei fonts-thai-tlwg fonts-khmeros fonts-kacst fonts-symbola fonts-noto fonts-freefont-ttf \
at-spi2-common fonts-liberation fonts-noto-color-emoji fonts-tlwg-loma-otf fonts-unifont libatk-bridge2.0-0 libatk1.0-0 libatspi2.0-0 libavahi-client3 \
libavahi-common-data libavahi-common3 libcups2 libfontenc1 libice6 libnspr4 libnss3 libsm6 libunwind8 \
libxaw7 libxcomposite1 libxdamage1 libxfont2 \
libxkbfile1 libxmu6 libxpm4 libxt6 x11-xkb-utils xfonts-encodings \
# xfonts-scalable xfonts-utils xserver-common xvfb \
# chrome can run without dbus/upower technically, it complains about missing dbus but should run ok anyway
# libxss1 dbus dbus-x11 upower \
# && service dbus start \
# install Chromium using playwright
&& pip install playwright \
&& cp -r /root/.cache/ms-playwright "$PLAYWRIGHT_BROWSERS_PATH" \
&& playwright install chromium \
&& export CHROME_BINARY="$(python -c 'from playwright.sync_api import sync_playwright; print(sync_playwright().start().chromium.executable_path)')" \
&& if [[ "$TARGETPLATFORM" == *amd64* || "$TARGETPLATFORM" == *arm64* ]]; then \
# install Chromium using playwright
pip install playwright \
&& cp -r /root/.cache/ms-playwright "$PLAYWRIGHT_BROWSERS_PATH" \
&& playwright install --with-deps chromium \
&& export CHROME_BINARY="$(python -c 'from playwright.sync_api import sync_playwright; print(sync_playwright().start().chromium.executable_path)')"; \
else \
# fall back to installing Chromium via apt-get on platforms not supported by playwright (e.g. risc, ARMv7, etc.)
apt-get install -qq -y -t bookworm-backports --no-install-recommends \
chromium fontconfig fonts-ipafont-gothic fonts-wqy-zenhei fonts-thai-tlwg fonts-kacst fonts-symbola fonts-noto fonts-freefont-ttf \
&& export CHROME_BINARY="$(which chromium)"; \
fi \
&& rm -rf /var/lib/apt/lists/* \
&& ln -s "$CHROME_BINARY" /usr/bin/chromium-browser \
&& mkdir -p "/home/${ARCHIVEBOX_USER}/.config/chromium/Crash Reports/pending/" \
@@ -252,7 +227,7 @@ COPY --chown=root:root --chmod=755 "./pyproject.toml" "requirements.txt" "$CODE_
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$TARGETVARIANT --mount=type=cache,target=/root/.cache/pip,sharing=locked,id=pip-$TARGETARCH$TARGETVARIANT \
echo "[+] Installing PIP ArchiveBox dependencies from requirements.txt for ${TARGETPLATFORM}..." \
&& apt-get update -qq \
&& apt-get install -qq -y -t bookworm-backports \
&& apt-get install -qq -y -t bookworm-backports --no-install-recommends \
build-essential \
libssl-dev libldap2-dev libsasl2-dev \
python3-ldap python3-msgpack python3-mutagen python3-regex python3-pycryptodome procps \
@@ -274,8 +249,8 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$T
echo "[*] Installing PIP ArchiveBox package from $CODE_DIR..." \
# && apt-get update -qq \
# install C compiler to build deps on platforms that dont have 32-bit wheels available on pypi
# && apt-get install -qq -y -t bookworm-backports \
# build-essential \
# && apt-get install -qq -y -t bookworm-backports --no-install-recommends \
# build-essential \
# INSTALL ARCHIVEBOX python package globally from CODE_DIR, with all optional dependencies
&& pip install -e "$CODE_DIR"[sonic,ldap] \
# save docker image size and always remove compilers / build tools after building is complete
@@ -287,14 +262,9 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$T
# Setup ArchiveBox runtime config
WORKDIR "$DATA_DIR"
ENV IN_DOCKER=True \
DISPLAY=novnc:0.0 \
CUSTOM_TEMPLATES_DIR=/data/templates \
GOOGLE_API_KEY=no \
GOOGLE_DEFAULT_CLIENT_ID=no \
GOOGLE_DEFAULT_CLIENT_SECRET=no \
ALLOWED_HOSTS=*
ENV IN_DOCKER=True
## No need to set explicitly, these values will be autodetected by archivebox in docker:
# CHROME_SANDBOX=False \
# WGET_BINARY="wget" \
# YOUTUBEDL_BINARY="yt-dlp" \
# CHROME_BINARY="/usr/bin/chromium-browser" \
@@ -319,8 +289,9 @@ WORKDIR "$DATA_DIR"
VOLUME "$DATA_DIR"
EXPOSE 8000
HEALTHCHECK --interval=30s --timeout=20s --retries=15 \
CMD curl --silent 'http://localhost:8000/health/' | grep -q 'OK'
# Optional:
# HEALTHCHECK --interval=30s --timeout=20s --retries=15 \
# CMD curl --silent 'http://localhost:8000/admin/login/' || exit 1
ENTRYPOINT ["dumb-init", "--", "/app/bin/docker_entrypoint.sh"]
CMD ["archivebox", "server", "--quick-init", "0.0.0.0:8000"]

README.md (1190 lines changed)

File diff suppressed because it is too large.

@@ -1,4 +1 @@
__package__ = 'archivebox'
from .monkey_patches import *

@@ -1 +0,0 @@
__package__ = 'abid_utils'

@@ -1,191 +0,0 @@
from typing import NamedTuple, Any, Union, Optional
import ulid
import uuid6
import hashlib
from urllib.parse import urlparse
from uuid import UUID
from typeid import TypeID # type: ignore[import-untyped]
from datetime import datetime
ABID_PREFIX_LEN = 4
ABID_SUFFIX_LEN = 26
ABID_LEN = 30
ABID_TS_LEN = 10
ABID_URI_LEN = 8
ABID_SUBTYPE_LEN = 2
ABID_RAND_LEN = 6
DEFAULT_ABID_PREFIX = 'obj_'
class ABID(NamedTuple):
"""
e.g. ABID('obj_01HX9FPYTRE4A5CCD901ZYEBQE')
"""
prefix: str # e.g. obj_
ts: str # e.g. 01HX9FPYTR
uri: str # e.g. E4A5CCD9
subtype: str # e.g. 01
rand: str # e.g. ZYEBQE
def __getattr__(self, attr: str) -> Any:
return getattr(self.ulid, attr)
def __eq__(self, other: Any) -> bool:
try:
return self.ulid == other.ulid
except AttributeError:
return NotImplemented
def __str__(self) -> str:
return self.prefix + self.suffix
def __len__(self) -> int:
return len(self.prefix + self.suffix)
@classmethod
def parse(cls, buffer: Union[str, UUID, ulid.ULID, TypeID, 'ABID'], prefix=DEFAULT_ABID_PREFIX) -> 'ABID':
assert buffer, f'Attempted to create ABID from null value {buffer}'
buffer = str(buffer)
if '_' in buffer:
prefix, suffix = buffer.split('_')
else:
prefix, suffix = prefix.strip('_'), buffer
assert len(prefix) == ABID_PREFIX_LEN - 1 # length without trailing _
assert len(suffix) == ABID_SUFFIX_LEN, f'Suffix {suffix} from {buffer} was not {ABID_SUFFIX_LEN} chars long'
return cls(
prefix=abid_part_from_prefix(prefix),
ts=suffix[0:10].upper(),
uri=suffix[10:18].upper(),
subtype=suffix[18:20].upper(),
rand=suffix[20:26].upper(),
)
@property
def suffix(self):
return ''.join((self.ts, self.uri, self.subtype, self.rand))
@property
def ulid(self) -> ulid.ULID:
return ulid.parse(self.suffix)
@property
def uuid(self) -> UUID:
return self.ulid.uuid
@property
def uuid6(self) -> uuid6.UUID:
return uuid6.UUID(hex=self.uuid.hex)
@property
def typeid(self) -> TypeID:
return TypeID.from_uuid(prefix=self.prefix.strip('_'), suffix=self.uuid6)
@property
def datetime(self) -> datetime:
return self.ulid.timestamp().datetime
####################################################
def uri_hash(uri: Union[str, bytes]) -> str:
"""
'E4A5CCD9AF4ED2A6E0954DF19FD274E9CDDB4853051F033FD518BFC90AA1AC25'
"""
if isinstance(uri, bytes):
uri_str: str = uri.decode()
else:
uri_str = uri
# only hash the domain part of URLs
if '://' in uri_str:
try:
domain = urlparse(uri_str).netloc
if domain:
uri_str = domain
except AttributeError:
pass
uri_bytes = uri_str.encode('utf-8')
return hashlib.sha256(uri_bytes).hexdigest().upper()
def abid_part_from_prefix(prefix: Optional[str]) -> str:
"""
'snp_'
"""
if prefix is None:
return 'obj_'
prefix = prefix.strip('_').lower()
assert len(prefix) == 3
return prefix + '_'
def abid_part_from_uri(uri: str) -> str:
"""
'E4A5CCD9' # takes first 8 characters of sha256(url)
"""
uri = str(uri)
return uri_hash(uri)[:ABID_URI_LEN]
def abid_part_from_ts(ts: Optional[datetime]) -> str:
"""
'01HX9FPYTR' # produces 10 character Timestamp section of ulid based on added date
"""
return str(ulid.from_timestamp(ts) if ts else ulid.new())[:ABID_TS_LEN]
def abid_part_from_subtype(subtype: str) -> str:
"""
Snapshots have 01 type, other objects have other subtypes like wget/media/etc.
Also allows us to change the ulid spec later by putting special sigil values here.
"""
subtype = str(subtype)
if len(subtype) == ABID_SUBTYPE_LEN:
return subtype
return hashlib.sha256(subtype.encode('utf-8')).hexdigest()[:ABID_SUBTYPE_LEN].upper()
def abid_part_from_rand(rand: Union[str, UUID, None, int]) -> str:
"""
'ZYEBQE' # takes last 6 characters of randomness from existing legacy uuid db field
"""
if rand is None:
# if it's None we generate a new random 6 character hex string
return str(ulid.new())[-ABID_RAND_LEN:]
elif isinstance(rand, UUID):
# if it's a uuid we take the last 6 characters of the ULID representation of it
return str(ulid.from_uuid(rand))[-ABID_RAND_LEN:]
elif isinstance(rand, int):
# if it's a BigAutoInteger field we convert it from an int to a 0-padded string
rand_str = str(rand)[-ABID_RAND_LEN:]
padding_needed = ABID_RAND_LEN - len(rand_str)
rand_str = ('0'*padding_needed) + rand_str
return rand_str
# otherwise treat it as a string, take the last 6 characters of it verbatim
return str(rand)[-ABID_RAND_LEN:].upper()
def abid_from_values(prefix, ts, uri, subtype, rand) -> ABID:
"""
Return a freshly derived ABID (assembled from attrs defined in ABIDModel.abid_*_src).
"""
abid = ABID(
prefix=abid_part_from_prefix(prefix),
ts=abid_part_from_ts(ts),
uri=abid_part_from_uri(uri),
subtype=abid_part_from_subtype(subtype),
rand=abid_part_from_rand(rand),
)
assert abid.ulid and abid.uuid and abid.typeid, f'Failed to calculate {prefix}_ABID for ts={ts} uri={uri} subtyp={subtype} rand={rand}'
return abid
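For orientation, a minimal usage sketch of the helpers defined above (the abid_utils.abid import path and the sample values are assumptions, not part of the diff):

import uuid
from datetime import datetime, timezone
from abid_utils.abid import ABID, abid_from_values   # assumed import path for the module above

# Assemble an ABID from its four source values, the same way ABIDModel.get_abid() does
abid = abid_from_values(
    prefix='snp_',                                 # 3-char prefix + '_'
    ts=datetime(2024, 5, 6, tzinfo=timezone.utc),  # becomes the 10-char ULID timestamp section
    uri='https://example.com/some/page',           # first 8 chars of sha256() of the URL's domain
    subtype='01',                                  # 2-char subtype
    rand=uuid.uuid4(),                             # last 6 chars of its ULID representation
)

print(str(abid))                       # 30 chars total: 'snp_' + 26-char suffix
print(abid.uuid)                       # the same identifier as a uuid.UUID
assert ABID.parse(str(abid)) == abid   # round-trips through ABID.parse()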

@@ -1,7 +0,0 @@
from django.apps import AppConfig
class AbidUtilsConfig(AppConfig):
default_auto_field = 'django.db.models.BigAutoField'
name = 'abid_utils'

@@ -1,314 +0,0 @@
"""
This file provides the Django ABIDField and ABIDModel base model to inherit from.
It implements the ArchiveBox ID (ABID) interfaces including abid_values, get_abid, .abid, .uuid, .id.
"""
from typing import Any, Dict, Union, List, Set, NamedTuple, cast
from ulid import ULID
from uuid import uuid4, UUID
from typeid import TypeID # type: ignore[import-untyped]
from datetime import datetime
from functools import partial
from charidfield import CharIDField # type: ignore[import-untyped]
from django.conf import settings
from django.db import models
from django.db.utils import OperationalError
from django.contrib.auth import get_user_model
from django_stubs_ext.db.models import TypedModelMeta
from .abid import (
ABID,
ABID_LEN,
ABID_RAND_LEN,
ABID_SUFFIX_LEN,
DEFAULT_ABID_PREFIX,
abid_part_from_prefix,
abid_from_values
)
####################################################
# Database Field for typeid/ulid style IDs with a prefix, e.g. snp_01BJQMF54D093DXEAWZ6JYRPAQ
ABIDField = partial(
CharIDField,
max_length=ABID_LEN,
help_text="ABID-format identifier for this entity (e.g. snp_01BJQMF54D093DXEAWZ6JYRPAQ)",
default=None,
null=True,
blank=True,
db_index=True,
unique=True,
)
def get_or_create_system_user_pk(username='system'):
"""Get or create a system user with is_superuser=True to be the default owner for new DB rows"""
User = get_user_model()
# if only one user exists total, return that user
if User.objects.filter(is_superuser=True).count() == 1:
return User.objects.filter(is_superuser=True).values_list('pk', flat=True)[0]
# otherwise, create a dedicated "system" user
user, created = User.objects.get_or_create(username=username, is_staff=True, is_superuser=True, defaults={'email': '', 'password': ''})
return user.pk
class ABIDModel(models.Model):
"""
Abstract Base Model for other models to depend on. Provides ArchiveBox ID (ABID) interface.
"""
abid_prefix: str = DEFAULT_ABID_PREFIX # e.g. 'tag_'
abid_ts_src = 'None' # e.g. 'self.created'
abid_uri_src = 'None' # e.g. 'self.uri'
abid_subtype_src = 'None' # e.g. 'self.extractor'
abid_rand_src = 'None' # e.g. 'self.uuid' or 'self.id'
id = models.UUIDField(primary_key=True, default=uuid4, editable=True)
uuid = models.UUIDField(blank=True, null=True, editable=True, unique=True)
abid = ABIDField(prefix=abid_prefix)
created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk)
created = models.DateTimeField(auto_now_add=True)
modified = models.DateTimeField(auto_now=True)
class Meta(TypedModelMeta):
abstract = True
def save(self, *args: Any, **kwargs: Any) -> None:
if hasattr(self, 'abid'):
# self.abid = ABID.parse(self.abid) if self.abid else self.get_abid()
self.abid = self.get_abid()
else:
print(f'[!] WARNING: {self.__class__.__name__}.abid is not a DB field so ABID will not be persisted!')
self.abid = self.get_abid()
super().save(*args, **kwargs)
@property
def abid_values(self) -> Dict[str, Any]:
return {
'prefix': self.abid_prefix,
'ts': eval(self.abid_ts_src),
'uri': eval(self.abid_uri_src),
'subtype': eval(self.abid_subtype_src),
'rand': eval(self.abid_rand_src),
}
def get_abid(self) -> ABID:
"""
Return a freshly derived ABID (assembled from attrs defined in ABIDModel.abid_*_src).
"""
prefix, ts, uri, subtype, rand = self.abid_values.values()
if (not prefix) or prefix == DEFAULT_ABID_PREFIX:
suggested_abid = self.__class__.__name__[:3].lower()
raise Exception(f'{self.__class__.__name__}.abid_prefix must be defined to calculate ABIDs (suggested: {suggested_abid})')
if not ts:
ts = datetime.utcfromtimestamp(0)
print(f'[!] WARNING: Generating ABID with ts=0000000000 placeholder because {self.__class__.__name__}.abid_ts_src={self.abid_ts_src} is unset!', ts.isoformat())
if not uri:
uri = str(self)
print(f'[!] WARNING: Generating ABID with uri=str(self) placeholder because {self.__class__.__name__}.abid_uri_src={self.abid_uri_src} is unset!', uri)
if not subtype:
subtype = self.__class__.__name__
print(f'[!] WARNING: Generating ABID with subtype={subtype} placeholder because {self.__class__.__name__}.abid_subtype_src={self.abid_subtype_src} is unset!', subtype)
if not rand:
rand = getattr(self, 'uuid', None) or getattr(self, 'id', None) or getattr(self, 'pk')
print(f'[!] WARNING: Generating ABID with rand=self.id placeholder because {self.__class__.__name__}.abid_rand_src={self.abid_rand_src} is unset!', rand)
abid = abid_from_values(
prefix=prefix,
ts=ts,
uri=uri,
subtype=subtype,
rand=rand,
)
assert abid.ulid and abid.uuid and abid.typeid, f'Failed to calculate {prefix}_ABID for {self.__class__.__name__}'
return abid
@property
def ABID(self) -> ABID:
"""
ULIDParts(timestamp='01HX9FPYTR', url='E4A5CCD9', subtype='00', randomness='ZYEBQE')
"""
return ABID.parse(self.abid) if getattr(self, 'abid', None) else self.get_abid()
@property
def ULID(self) -> ULID:
"""
Get a ulid.ULID representation of the object's ABID.
"""
return self.ABID.ulid
@property
def UUID(self) -> UUID:
"""
Get a uuid.UUID (v4) representation of the object's ABID.
"""
return self.ABID.uuid
@property
def TypeID(self) -> TypeID:
"""
Get a typeid.TypeID (stripe-style) representation of the object's ABID.
"""
return self.ABID.typeid
####################################################
# Django helpers
def find_all_abid_prefixes() -> Dict[str, type[models.Model]]:
"""
Return the mapping of all ABID prefixes to their models.
e.g. {'tag_': core.models.Tag, 'snp_': core.models.Snapshot, ...}
"""
import django.apps
prefix_map = {}
for model in django.apps.apps.get_models():
abid_prefix = getattr(model, 'abid_prefix', None)
if abid_prefix:
prefix_map[abid_prefix] = model
return prefix_map
def find_prefix_for_abid(abid: ABID) -> str:
"""
Find the correct prefix for a given ABID that may be missing a prefix (slow).
e.g. ABID('obj_01BJQMF54D093DXEAWZ6JYRPAQ') -> 'snp_'
"""
# if existing abid prefix is correct, lookup is easy
model = find_model_from_abid(abid)
if model:
assert issubclass(model, ABIDModel)
return model.abid_prefix
# prefix might be obj_ or missing, fuzzy-search to find any object that matches
return find_obj_from_abid_rand(abid)[0].abid_prefix
def find_model_from_abid_prefix(prefix: str) -> type[ABIDModel] | None:
"""
Return the Django Model that corresponds to a given ABID prefix.
e.g. 'tag_' -> core.models.Tag
"""
prefix = abid_part_from_prefix(prefix)
import django.apps
for model in django.apps.apps.get_models():
if not issubclass(model, ABIDModel): continue # skip non-ABID-enabled models
if not hasattr(model, 'objects'): continue # skip abstract models
if (model.abid_prefix == prefix):
return model
return None
def find_model_from_abid(abid: ABID) -> type[models.Model] | None:
"""
Shortcut for find_model_from_abid_prefix(abid.prefix)
"""
return find_model_from_abid_prefix(abid.prefix)
def find_obj_from_abid_rand(rand: Union[ABID, str], model=None) -> List[ABIDModel]:
"""
Find an object corresponding to an ABID by exhaustively searching using its random suffix (slow).
e.g. 'obj_....................JYRPAQ' -> Snapshot('snp_01BJQMF54D093DXEAWZ6JYRPAQ')
"""
# convert str to ABID if necessary
if isinstance(rand, ABID):
abid: ABID = rand
else:
rand = str(rand)
if len(rand) < ABID_SUFFIX_LEN:
padding_needed = ABID_SUFFIX_LEN - len(rand)
rand = ('0'*padding_needed) + rand
abid = ABID.parse(rand)
import django.apps
partial_matches: List[ABIDModel] = []
models_to_try = cast(Set[type[models.Model]], set(filter(bool, (
model,
find_model_from_abid(abid),
*django.apps.apps.get_models(),
))))
# print(abid, abid.rand, abid.uuid, models_to_try)
for model in models_to_try:
if not issubclass(model, ABIDModel): continue # skip Models that arent ABID-enabled
if not hasattr(model, 'objects'): continue # skip abstract Models
assert hasattr(model, 'objects') # force-fix for type hint nit about missing manager https://github.com/typeddjango/django-stubs/issues/1684
# continue on to try fuzzy searching by randomness portion derived from uuid field
try:
qs = []
if hasattr(model, 'abid'):
qs = model.objects.filter(abid__endswith=abid.rand)
elif hasattr(model, 'uuid'):
qs = model.objects.filter(uuid__endswith=str(abid.uuid)[-ABID_RAND_LEN:])
elif hasattr(model, 'id'):
# NOTE: this only works on SQLite where every column is a string
# other DB backends like postgres dont let you do __endswith if this is a BigAutoInteger field
# try to search for uuid=...-2354352
# try to search for id=...2354352
# try to search for id=2354352
qs = model.objects.filter(
models.Q(id__endswith=str(abid.uuid)[-ABID_RAND_LEN:])
| models.Q(id__endswith=abid.rand)
| models.Q(id__startswith=str(int(abid.rand)) if abid.rand.isdigit() else abid.rand)
)
for obj in qs:
if obj.get_abid() == abid:
# found exact match, no need to keep iterating
return [obj]
partial_matches.append(obj)
except OperationalError as err:
print(f'[!] WARNING: Got error while trying to iterate through QuerySet for {model}:', err, '\n')
return partial_matches
def find_obj_from_abid(abid: ABID, model=None, fuzzy=False) -> Any:
"""
Find an object with a given ABID by filtering possible models for a matching abid/uuid/id (fast).
e.g. 'snp_01BJQMF54D093DXEAWZ6JYRPAQ' -> Snapshot('snp_01BJQMF54D093DXEAWZ6JYRPAQ')
"""
model = model or find_model_from_abid(abid)
assert model, f'Could not find model that could match this ABID type: {abid}'
try:
if hasattr(model, 'abid'):
return model.objects.get(abid__endswith=abid.suffix)
if hasattr(model, 'uuid'):
return model.objects.get(uuid=abid.uuid)
return model.objects.get(id=abid.uuid)
except model.DoesNotExist:
# if the model has an abid field then it shouldve matched, pointless to fuzzy search in that case
if hasattr(model, 'abid') or (not fuzzy):
raise
# continue on to try fuzzy searching by randomness portion derived from uuid field
match_by_rand = find_obj_from_abid_rand(abid, model=model)
if match_by_rand:
if match_by_rand[0].abid_prefix != abid.prefix:
print(f'[!] WARNING: fetched object {match_by_rand} even though prefix {abid.prefix} doesnt match!', abid, '\n')
return match_by_rand
raise model.DoesNotExist
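To make the abid_*_src contract above concrete, here is a hypothetical model sketch (mirroring how api/models.py later in this diff uses ABIDModel); the Bookmark model and its fields are illustrative only:

from django.db import models
from abid_utils.models import ABIDModel   # the abstract base class defined above

class Bookmark(ABIDModel):                 # illustrative model, not part of ArchiveBox
    abid_prefix = 'bkm_'                   # 3-char prefix + '_'
    abid_ts_src = 'self.created'           # expressions eval()ed by ABIDModel.abid_values
    abid_uri_src = 'self.url'
    abid_subtype_src = '"01"'
    abid_rand_src = 'self.id'

    url = models.URLField()

# On save(), ABIDModel.save() fills in self.abid via get_abid(), falling back to
# placeholder values (with a printed warning) for any *_src attribute that is unset.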

@@ -1,3 +0,0 @@
from django.test import TestCase
# Create your tests here.

@@ -1 +0,0 @@
__package__ = 'archivebox.api'

@@ -1,11 +0,0 @@
__package__ = 'archivebox.api'
from django.apps import AppConfig
class APIConfig(AppConfig):
name = 'api'
def ready(self):
pass

@@ -1,107 +0,0 @@
__package__ = 'archivebox.api'
from typing import Optional
from django.http import HttpRequest
from django.contrib.auth import login
from django.contrib.auth import authenticate
from django.contrib.auth.models import AbstractBaseUser
from ninja.security import HttpBearer, APIKeyQuery, APIKeyHeader, HttpBasicAuth, django_auth_superuser
def auth_using_token(token, request: Optional[HttpRequest]=None) -> Optional[AbstractBaseUser]:
"""Given an API token string, check if a corresponding non-expired APIToken exists, and return its user"""
from api.models import APIToken # lazy import model to avoid loading it at urls.py import time
user = None
submitted_empty_form = token in ('string', '', None)
if submitted_empty_form:
user = request.user # see if user is authed via django session and use that as the default
else:
try:
token = APIToken.objects.get(token=token)
if token.is_valid():
user = token.user
except APIToken.DoesNotExist:
pass
if not user:
print('[❌] Failed to authenticate API user using API Key:', request)
return None
def auth_using_password(username, password, request: Optional[HttpRequest]=None) -> Optional[AbstractBaseUser]:
"""Given a username and password, check if they are valid and return the corresponding user"""
user = None
submitted_empty_form = (username, password) in (('string', 'string'), ('', ''), (None, None))
if submitted_empty_form:
user = request.user # see if user is authed via django session and use that as the default
else:
user = authenticate(
username=username,
password=password,
)
if not user:
print('[❌] Failed to authenticate API user using API Key:', request)
return user
### Base Auth Types
class APITokenAuthCheck:
"""The base class for authentication methods that use an api.models.APIToken"""
def authenticate(self, request: HttpRequest, key: Optional[str]=None) -> Optional[AbstractBaseUser]:
user = auth_using_token(
token=key,
request=request,
)
if user is not None:
login(request, user, backend='django.contrib.auth.backends.ModelBackend')
return user
class UserPassAuthCheck:
"""The base class for authentication methods that use a username & password"""
def authenticate(self, request: HttpRequest, username: Optional[str]=None, password: Optional[str]=None) -> Optional[AbstractBaseUser]:
user = auth_using_password(
username=username,
password=password,
request=request,
)
if user is not None:
login(request, user, backend='django.contrib.auth.backends.ModelBackend')
return user
### Django-Ninja-Provided Auth Methods
class HeaderTokenAuth(APITokenAuthCheck, APIKeyHeader):
"""Allow authenticating by passing X-API-Key=xyz as a request header"""
param_name = "X-ArchiveBox-API-Key"
class BearerTokenAuth(APITokenAuthCheck, HttpBearer):
"""Allow authenticating by passing Bearer=xyz as a request header"""
pass
class QueryParamTokenAuth(APITokenAuthCheck, APIKeyQuery):
"""Allow authenticating by passing api_key=xyz as a GET/POST query parameter"""
param_name = "api_key"
class UsernameAndPasswordAuth(UserPassAuthCheck, HttpBasicAuth):
"""Allow authenticating by passing username & password via HTTP Basic Authentication (not recommended)"""
pass
### Enabled Auth Methods
API_AUTH_METHODS = [
HeaderTokenAuth(),
BearerTokenAuth(),
QueryParamTokenAuth(),
django_auth_superuser,
UsernameAndPasswordAuth(),
]
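For reference, a hypothetical client-side sketch of the three token-based methods enabled above (it assumes a server at localhost:8000, an existing APIToken value, and the /api/v1/cli/add route defined later in this diff):

import requests

API_KEY = '0123456789abcdef0123456789abcdef'   # placeholder for an api.models.APIToken.token
PAYLOAD = {'urls': ['https://example.com'], 'tag': 'api-test'}
ADD_URL = 'http://localhost:8000/api/v1/cli/add'

# HeaderTokenAuth: custom X-ArchiveBox-API-Key request header
requests.post(ADD_URL, headers={'X-ArchiveBox-API-Key': API_KEY}, json=PAYLOAD)

# BearerTokenAuth: standard Authorization: Bearer header
requests.post(ADD_URL, headers={'Authorization': f'Bearer {API_KEY}'}, json=PAYLOAD)

# QueryParamTokenAuth: api_key=... query parameter
requests.post(ADD_URL, params={'api_key': API_KEY}, json=PAYLOAD)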

@@ -1,29 +0,0 @@
# Generated by Django 4.2.11 on 2024-04-25 04:19
import api.models
from django.conf import settings
from django.db import migrations, models
import django.db.models.deletion
import uuid
class Migration(migrations.Migration):
initial = True
dependencies = [
migrations.swappable_dependency(settings.AUTH_USER_MODEL),
]
operations = [
migrations.CreateModel(
name='APIToken',
fields=[
('id', models.UUIDField(default=uuid.uuid4, editable=False, primary_key=True, serialize=False)),
('token', models.CharField(default=api.models.generate_secret_token, max_length=32, unique=True)),
('created', models.DateTimeField(auto_now_add=True)),
('expires', models.DateTimeField(blank=True, null=True)),
('user', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL)),
],
),
]

@@ -1,17 +0,0 @@
# Generated by Django 5.0.4 on 2024-04-26 05:28
from django.db import migrations
class Migration(migrations.Migration):
dependencies = [
('api', '0001_initial'),
]
operations = [
migrations.AlterModelOptions(
name='apitoken',
options={'verbose_name': 'API Key', 'verbose_name_plural': 'API Keys'},
),
]

@@ -1,77 +0,0 @@
# Generated by Django 5.0.6 on 2024-06-03 01:52
import abid_utils.models
import charidfield.fields
import django.db.models.deletion
import signal_webhooks.fields
import signal_webhooks.utils
import uuid
from django.conf import settings
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('api', '0002_alter_apitoken_options'),
migrations.swappable_dependency(settings.AUTH_USER_MODEL),
]
operations = [
migrations.RenameField(
model_name='apitoken',
old_name='user',
new_name='created_by',
),
migrations.AddField(
model_name='apitoken',
name='abid',
field=charidfield.fields.CharIDField(blank=True, db_index=True, default=None, help_text='ABID-format identifier for this entity (e.g. snp_01BJQMF54D093DXEAWZ6JYRPAQ)', max_length=30, null=True, prefix='apt_', unique=True),
),
migrations.AddField(
model_name='apitoken',
name='modified',
field=models.DateTimeField(auto_now=True),
),
migrations.AddField(
model_name='apitoken',
name='uuid',
field=models.UUIDField(blank=True, null=True, unique=True),
),
migrations.AlterField(
model_name='apitoken',
name='id',
field=models.UUIDField(default=uuid.uuid4, primary_key=True, serialize=False),
),
migrations.CreateModel(
name='OutboundWebhook',
fields=[
('name', models.CharField(db_index=True, help_text='Give your webhook a descriptive name (e.g. Notify ACME Slack channel of any new ArchiveResults).', max_length=255, unique=True, verbose_name='name')),
('signal', models.CharField(choices=[('CREATE', 'Create'), ('UPDATE', 'Update'), ('DELETE', 'Delete'), ('M2M', 'M2M changed'), ('CREATE_OR_UPDATE', 'Create or Update'), ('CREATE_OR_DELETE', 'Create or Delete'), ('CREATE_OR_M2M', 'Create or M2M changed'), ('UPDATE_OR_DELETE', 'Update or Delete'), ('UPDATE_OR_M2M', 'Update or M2M changed'), ('DELETE_OR_M2M', 'Delete or M2M changed'), ('CREATE_UPDATE_OR_DELETE', 'Create, Update or Delete'), ('CREATE_UPDATE_OR_M2M', 'Create, Update or M2M changed'), ('CREATE_DELETE_OR_M2M', 'Create, Delete or M2M changed'), ('UPDATE_DELETE_OR_M2M', 'Update, Delete or M2M changed'), ('CREATE_UPDATE_DELETE_OR_M2M', 'Create, Update or Delete, or M2M changed')], help_text='The type of event the webhook should fire for (e.g. Create, Update, Delete).', max_length=255, verbose_name='signal')),
('ref', models.CharField(db_index=True, help_text='Dot import notation of the model the webhook should fire for (e.g. core.models.Snapshot or core.models.ArchiveResult).', max_length=1023, validators=[signal_webhooks.utils.model_from_reference], verbose_name='referenced model')),
('endpoint', models.URLField(help_text='External URL to POST the webhook notification to (e.g. https://someapp.example.com/webhook/some-webhook-receiver).', max_length=2047, verbose_name='endpoint')),
('headers', models.JSONField(blank=True, default=dict, help_text='Headers to send with the webhook request.', validators=[signal_webhooks.utils.is_dict], verbose_name='headers')),
('auth_token', signal_webhooks.fields.TokenField(blank=True, default='', help_text='Authentication token to use in an Authorization header.', max_length=8000, validators=[signal_webhooks.utils.decode_cipher_key], verbose_name='authentication token')),
('enabled', models.BooleanField(default=True, help_text='Is this webhook enabled?', verbose_name='enabled')),
('keep_last_response', models.BooleanField(default=False, help_text='Should the webhook keep a log of the latest response it got?', verbose_name='keep last response')),
('updated', models.DateTimeField(auto_now=True, help_text='When the webhook was last updated.', verbose_name='updated')),
('last_response', models.CharField(blank=True, default='', help_text='Latest response to this webhook.', max_length=8000, verbose_name='last response')),
('last_success', models.DateTimeField(default=None, help_text='When the webhook last succeeded.', null=True, verbose_name='last success')),
('last_failure', models.DateTimeField(default=None, help_text='When the webhook last failed.', null=True, verbose_name='last failure')),
('created', models.DateTimeField(auto_now_add=True)),
('modified', models.DateTimeField(auto_now=True)),
('id', models.UUIDField(blank=True, null=True, unique=True)),
('uuid', models.UUIDField(default=uuid.uuid4, primary_key=True, serialize=False)),
('abid', charidfield.fields.CharIDField(blank=True, db_index=True, default=None, help_text='ABID-format identifier for this entity (e.g. snp_01BJQMF54D093DXEAWZ6JYRPAQ)', max_length=30, null=True, prefix='whk_', unique=True)),
('created_by', models.ForeignKey(default=abid_utils.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL)),
],
options={
'verbose_name': 'API Outbound Webhook',
'abstract': False,
},
),
migrations.AddConstraint(
model_name='outboundwebhook',
constraint=models.UniqueConstraint(fields=('ref', 'endpoint'), name='prevent_duplicate_hooks_api_outboundwebhook'),
),
]

@@ -1,115 +0,0 @@
__package__ = 'archivebox.api'
import uuid
import secrets
from datetime import timedelta
from django.conf import settings
from django.db import models
from django.utils import timezone
from signal_webhooks.models import WebhookBase
from django_stubs_ext.db.models import TypedModelMeta
from abid_utils.models import ABIDModel, ABIDField
def generate_secret_token() -> str:
# returns cryptographically secure string with len() == 32
return secrets.token_hex(16)
class APIToken(ABIDModel):
"""
A secret key generated by a User that's used to authenticate REST API requests to ArchiveBox.
"""
# ABID: apt_<created_ts>_<token_hash>_<user_id_hash>_<uuid_rand>
abid_prefix = 'apt_'
abid_ts_src = 'self.created'
abid_uri_src = 'self.token'
abid_subtype_src = 'self.user_id'
abid_rand_src = 'self.id'
id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=True)
uuid = models.UUIDField(blank=True, null=True, editable=True, unique=True)
abid = ABIDField(prefix=abid_prefix)
created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE)
token = models.CharField(max_length=32, default=generate_secret_token, unique=True)
created = models.DateTimeField(auto_now_add=True)
expires = models.DateTimeField(null=True, blank=True)
class Meta(TypedModelMeta):
verbose_name = "API Key"
verbose_name_plural = "API Keys"
def __str__(self) -> str:
return self.token
def __repr__(self) -> str:
return f'<APIToken user={self.user.username} token=************{self.token[-4:]}>'
def __json__(self) -> dict:
return {
"TYPE": "APIToken",
"uuid": str(self.id),
"abid": str(self.get_abid()),
"user_id": str(self.user.id),
"user_username": self.user.username,
"token": self.token,
"created": self.created.isoformat(),
"expires": self.expires_as_iso8601,
}
@property
def expires_as_iso8601(self):
"""Returns the expiry date of the token in ISO 8601 format or a date 100 years in the future if none."""
expiry_date = self.expires or (timezone.now() + timedelta(days=365 * 100))
return expiry_date.isoformat()
def is_valid(self, for_date=None):
for_date = for_date or timezone.now()
if self.expires and self.expires < for_date:
return False
return True
# monkey patch django-signals-webhooks to change how it shows up in Admin UI
class OutboundWebhook(ABIDModel, WebhookBase):
"""
Model used in place of (extending) signals_webhooks.models.WebhookModel. Swapped using:
settings.SIGNAL_WEBHOOKS_CUSTOM_MODEL = 'api.models.OutboundWebhook'
"""
abid_prefix = 'whk_'
abid_ts_src = 'self.created'
abid_uri_src = 'self.endpoint'
abid_subtype_src = 'self.ref'
abid_rand_src = 'self.id'
id = models.UUIDField(blank=True, null=True, unique=True, editable=True)
uuid = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=True)
abid = ABIDField(prefix=abid_prefix)
WebhookBase._meta.get_field('name').help_text = (
'Give your webhook a descriptive name (e.g. Notify ACME Slack channel of any new ArchiveResults).')
WebhookBase._meta.get_field('signal').help_text = (
'The type of event the webhook should fire for (e.g. Create, Update, Delete).')
WebhookBase._meta.get_field('ref').help_text = (
'Dot import notation of the model the webhook should fire for (e.g. core.models.Snapshot or core.models.ArchiveResult).')
WebhookBase._meta.get_field('endpoint').help_text = (
'External URL to POST the webhook notification to (e.g. https://someapp.example.com/webhook/some-webhook-receiver).')
class Meta(WebhookBase.Meta):
verbose_name = 'API Outbound Webhook'

@@ -1,30 +0,0 @@
__package__ = 'archivebox.api'
from django.test import TestCase
from ninja.testing import TestClient
from .routes_cli import router
class ArchiveBoxCLIAPITestCase(TestCase):
def setUp(self):
self.client = TestClient(router)
def test_add_endpoint(self):
response = self.client.post("/add", json={"urls": ["http://example.com"], "tag": "testTag1,testTag2"})
self.assertEqual(response.status_code, 200)
self.assertTrue(response.json()["success"])
def test_remove_endpoint(self):
response = self.client.post("/remove", json={"filter_patterns": ["http://example.com"]})
self.assertEqual(response.status_code, 200)
self.assertTrue(response.json()["success"])
def test_update_endpoint(self):
response = self.client.post("/update", json={})
self.assertEqual(response.status_code, 200)
self.assertTrue(response.json()["success"])
def test_list_all_endpoint(self):
response = self.client.post("/list_all", json={})
self.assertEqual(response.status_code, 200)
self.assertTrue(response.json()["success"])

@@ -1,17 +0,0 @@
__package__ = 'archivebox.api'
from django.urls import path
from django.views.generic.base import RedirectView
from .v1_api import urls as v1_api_urls
urlpatterns = [
path("", RedirectView.as_view(url='/api/v1')),
path("v1/", v1_api_urls),
path("v1", RedirectView.as_view(url='/api/v1/docs')),
# ... v2 can be added here ...
# path("v2/", v2_api_urls),
# path("v2", RedirectView.as_view(url='/api/v2/docs')),
]

@@ -1,111 +0,0 @@
__package__ = 'archivebox.api'
from io import StringIO
from traceback import format_exception
from contextlib import redirect_stdout, redirect_stderr
from django.http import HttpRequest, HttpResponse
from django.core.exceptions import ObjectDoesNotExist, EmptyResultSet, PermissionDenied
from ninja import NinjaAPI, Swagger
# TODO: explore adding https://eadwincode.github.io/django-ninja-extra/
from api.auth import API_AUTH_METHODS
from ..config import VERSION, COMMIT_HASH
COMMIT_HASH = COMMIT_HASH or 'unknown'
html_description=f'''
<h3>Welcome to your ArchiveBox server's REST API <code>[v1 ALPHA]</code> homepage!</h3>
<br/>
<i><b>WARNING: This API is still in an early development stage and may change!</b></i>
<br/>
<ul>
<li> Manage your server: <a href="/admin/api/"><b>Setup API Keys</b></a>, <a href="/admin/">Go to your Server Admin UI</a>, <a href="/">Go to your Snapshots list</a>
<li>💬 Ask questions and get help here: <a href="https://zulip.archivebox.io">ArchiveBox Chat Forum</a></li>
<li>🐞 Report API bugs here: <a href="https://github.com/ArchiveBox/ArchiveBox/issues">Github Issues</a></li>
<li>📚 ArchiveBox Documentation: <a href="https://github.com/ArchiveBox/ArchiveBox/wiki">Github Wiki</a></li>
<li>📜 See the API source code: <a href="https://github.com/ArchiveBox/ArchiveBox/blob/dev/archivebox/api"><code>archivebox/api/</code></a></li>
</ul>
<small>Served by ArchiveBox v{VERSION} (<a href="https://github.com/ArchiveBox/ArchiveBox/commit/{COMMIT_HASH}"><code>{COMMIT_HASH[:8]}</code></a>), API powered by <a href="https://django-ninja.dev/"><code>django-ninja</code></a>.</small>
'''
def register_urls(api: NinjaAPI) -> NinjaAPI:
api.add_router('/auth/', 'api.v1_auth.router')
api.add_router('/core/', 'api.v1_core.router')
api.add_router('/cli/', 'api.v1_cli.router')
return api
class NinjaAPIWithIOCapture(NinjaAPI):
def create_temporal_response(self, request: HttpRequest) -> HttpResponse:
stdout, stderr = StringIO(), StringIO()
with redirect_stderr(stderr):
with redirect_stdout(stdout):
request.stdout = stdout
request.stderr = stderr
response = super().create_temporal_response(request)
print('RESPONDING NOW', response)
return response
api = NinjaAPIWithIOCapture(
title='ArchiveBox API',
description=html_description,
version='1.0.0',
csrf=False,
auth=API_AUTH_METHODS,
urls_namespace="api",
docs=Swagger(settings={"persistAuthorization": True}),
# docs_decorator=login_required,
# renderer=ORJSONRenderer(),
)
api = register_urls(api)
urls = api.urls
@api.exception_handler(Exception)
def generic_exception_handler(request, err):
status = 503
if isinstance(err, (ObjectDoesNotExist, EmptyResultSet, PermissionDenied)):
status = 404
print(''.join(format_exception(err)))
return api.create_response(
request,
{
"succeeded": False,
"message": f'{err.__class__.__name__}: {err}',
"errors": [
''.join(format_exception(err)),
# or send simpler parent-only traceback:
# *([str(err.__context__)] if getattr(err, '__context__', None) else []),
],
},
status=status,
)
# import orjson
# from ninja.renderers import BaseRenderer
# class ORJSONRenderer(BaseRenderer):
# media_type = "application/json"
# def render(self, request, data, *, response_status):
# return {
# "success": True,
# "errors": [],
# "result": data,
# "stdout": ansi_to_html(stdout.getvalue().strip()),
# "stderr": ansi_to_html(stderr.getvalue().strip()),
# }
# return orjson.dumps(data)

@@ -1,52 +0,0 @@
__package__ = 'archivebox.api'
from typing import Optional
from ninja import Router, Schema
from api.models import APIToken
from api.auth import auth_using_token, auth_using_password
router = Router(tags=['Authentication'])
class PasswordAuthSchema(Schema):
"""Schema for a /get_api_token request"""
username: Optional[str] = None
password: Optional[str] = None
@router.post("/get_api_token", auth=None, summary='Generate an API token for a given username & password (or currently logged-in user)') # auth=None because they are not authed yet
def get_api_token(request, auth_data: PasswordAuthSchema):
user = auth_using_password(
username=auth_data.username,
password=auth_data.password,
request=request,
)
if user:
# TODO: support multiple tokens in the future, for now we just have one per user
api_token, created = APIToken.objects.get_or_create(user=user)
return api_token.__json__()
return {"success": False, "errors": ["Invalid credentials"]}
class TokenAuthSchema(Schema):
"""Schema for a /check_api_token request"""
token: str
@router.post("/check_api_token", auth=None, summary='Validate an API token to make sure its valid and non-expired') # auth=None because they are not authed yet
def check_api_token(request, token_data: TokenAuthSchema):
user = auth_using_token(
token=token_data.token,
request=request,
)
if user:
return {"success": True, "user_id": str(user.pk)}
return {"success": False, "user_id": None}

@@ -1,234 +0,0 @@
__package__ = 'archivebox.api'
from typing import List, Dict, Any, Optional
from enum import Enum
from ninja import Router, Schema
from ..main import (
add,
remove,
update,
list_all,
schedule,
)
from ..util import ansi_to_html
from ..config import ONLY_NEW
# router for API that exposes archivebox cli subcommands as REST endpoints
router = Router(tags=['ArchiveBox CLI Sub-Commands'])
# Schemas
JSONType = List[Any] | Dict[str, Any] | bool | int | str | None
class CLICommandResponseSchema(Schema):
success: bool
errors: List[str]
result: JSONType
stdout: str
stderr: str
class FilterTypeChoices(str, Enum):
exact = 'exact'
substring = 'substring'
regex = 'regex'
domain = 'domain'
tag = 'tag'
timestamp = 'timestamp'
class StatusChoices(str, Enum):
indexed = 'indexed'
archived = 'archived'
unarchived = 'unarchived'
present = 'present'
valid = 'valid'
invalid = 'invalid'
duplicate = 'duplicate'
orphaned = 'orphaned'
corrupted = 'corrupted'
unrecognized = 'unrecognized'
class AddCommandSchema(Schema):
urls: List[str]
tag: str = ""
depth: int = 0
update: bool = not ONLY_NEW # Default to the opposite of ONLY_NEW
update_all: bool = False
index_only: bool = False
overwrite: bool = False
init: bool = False
extractors: str = ""
parser: str = "auto"
class UpdateCommandSchema(Schema):
resume: Optional[float] = 0
only_new: bool = ONLY_NEW
index_only: bool = False
overwrite: bool = False
after: Optional[float] = 0
before: Optional[float] = 999999999999999
status: Optional[StatusChoices] = StatusChoices.unarchived
filter_type: Optional[str] = FilterTypeChoices.substring
filter_patterns: Optional[List[str]] = ['https://example.com']
extractors: Optional[str] = ""
class ScheduleCommandSchema(Schema):
import_path: Optional[str] = None
add: bool = False
show: bool = False
every: Optional[str] = None
tag: str = ''
depth: int = 0
overwrite: bool = False
update: bool = not ONLY_NEW
clear: bool = False
class ListCommandSchema(Schema):
filter_patterns: Optional[List[str]] = ['https://example.com']
filter_type: str = FilterTypeChoices.substring
status: Optional[StatusChoices] = StatusChoices.indexed
after: Optional[float] = 0
before: Optional[float] = 999999999999999
sort: str = 'added'
as_json: bool = True
as_html: bool = False
as_csv: str | bool = 'timestamp,url'
with_headers: bool = False
class RemoveCommandSchema(Schema):
delete: bool = True
after: Optional[float] = 0
before: Optional[float] = 999999999999999
filter_type: str = FilterTypeChoices.exact
filter_patterns: Optional[List[str]] = ['https://example.com']
@router.post("/add", response=CLICommandResponseSchema, summary='archivebox add [args] [urls]')
def cli_add(request, args: AddCommandSchema):
result = add(
urls=args.urls,
tag=args.tag,
depth=args.depth,
update=args.update,
update_all=args.update_all,
index_only=args.index_only,
overwrite=args.overwrite,
init=args.init,
extractors=args.extractors,
parser=args.parser,
)
return {
"success": True,
"errors": [],
"result": result,
"stdout": ansi_to_html(request.stdout.getvalue().strip()),
"stderr": ansi_to_html(request.stderr.getvalue().strip()),
}
@router.post("/update", response=CLICommandResponseSchema, summary='archivebox update [args] [filter_patterns]')
def cli_update(request, args: UpdateCommandSchema):
result = update(
resume=args.resume,
only_new=args.only_new,
index_only=args.index_only,
overwrite=args.overwrite,
before=args.before,
after=args.after,
status=args.status,
filter_type=args.filter_type,
filter_patterns=args.filter_patterns,
extractors=args.extractors,
)
return {
"success": True,
"errors": [],
"result": result,
"stdout": ansi_to_html(request.stdout.getvalue().strip()),
"stderr": ansi_to_html(request.stderr.getvalue().strip()),
}
@router.post("/schedule", response=CLICommandResponseSchema, summary='archivebox schedule [args] [import_path]')
def cli_schedule(request, args: ScheduleCommandSchema):
result = schedule(
import_path=args.import_path,
add=args.add,
show=args.show,
clear=args.clear,
every=args.every,
tag=args.tag,
depth=args.depth,
overwrite=args.overwrite,
update=args.update,
)
return {
"success": True,
"errors": [],
"result": result,
"stdout": ansi_to_html(request.stdout.getvalue().strip()),
"stderr": ansi_to_html(request.stderr.getvalue().strip()),
}
@router.post("/list", response=CLICommandResponseSchema, summary='archivebox list [args] [filter_patterns]')
def cli_list(request, args: ListCommandSchema):
result = list_all(
filter_patterns=args.filter_patterns,
filter_type=args.filter_type,
status=args.status,
after=args.after,
before=args.before,
sort=args.sort,
csv=args.as_csv,
json=args.as_json,
html=args.as_html,
with_headers=args.with_headers,
)
result_format = 'txt'
if args.as_json:
result_format = "json"
elif args.as_html:
result_format = "html"
elif args.as_csv:
result_format = "csv"
return {
"success": True,
"errors": [],
"result": result,
"result_format": result_format,
"stdout": ansi_to_html(request.stdout.getvalue().strip()),
"stderr": ansi_to_html(request.stderr.getvalue().strip()),
}
@router.post("/remove", response=CLICommandResponseSchema, summary='archivebox remove [args] [filter_patterns]')
def cli_remove(request, args: RemoveCommandSchema):
result = remove(
yes=True, # no way to interactively ask for confirmation via API, so we force yes
delete=args.delete,
before=args.before,
after=args.after,
filter_type=args.filter_type,
filter_patterns=args.filter_patterns,
)
return {
"success": True,
"errors": [],
"result": result,
"stdout": ansi_to_html(request.stdout.getvalue().strip()),
"stderr": ansi_to_html(request.stderr.getvalue().strip()),
}
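A hedged example of driving one of these CLI endpoints over HTTP; the base URL and the X-ArchiveBox-API-Key header name are assumptions (which auth schemes are actually accepted depends on API_AUTH_METHODS, defined elsewhere):
import requests

resp = requests.post(
    'http://127.0.0.1:8000/api/v1/cli/add',
    json={'urls': ['https://example.com'], 'tag': 'demo', 'depth': 0},
    headers={'X-ArchiveBox-API-Key': '<your-api-token>'},   # assumed header-based token auth
)
result = resp.json()
print(result['success'], result['errors'])
print(result['stdout'])   # CLI output, ANSI converted to HTML by ansi_to_html()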

View file

@ -1,291 +0,0 @@
__package__ = 'archivebox.api'
from uuid import UUID
from typing import List, Optional
from datetime import datetime
from django.db.models import Q
from django.shortcuts import get_object_or_404
from ninja import Router, Schema, FilterSchema, Field, Query
from ninja.pagination import paginate
from core.models import Snapshot, ArchiveResult, Tag
from abid_utils.abid import ABID
router = Router(tags=['Core Models'])
### ArchiveResult #########################################################################
class ArchiveResultSchema(Schema):
abid: str
uuid: UUID
pk: str
modified: datetime
created: datetime
created_by_id: str
snapshot_abid: str
snapshot_url: str
snapshot_tags: str
extractor: str
cmd_version: str
cmd: List[str]
pwd: str
status: str
output: str
@staticmethod
def resolve_created_by_id(obj):
return str(obj.created_by_id)
@staticmethod
def resolve_pk(obj):
return str(obj.pk)
@staticmethod
def resolve_uuid(obj):
return str(obj.uuid)
@staticmethod
def resolve_abid(obj):
return str(obj.ABID)
@staticmethod
def resolve_created(obj):
return obj.start_ts
@staticmethod
def resolve_snapshot_url(obj):
return obj.snapshot.url
@staticmethod
def resolve_snapshot_abid(obj):
return str(obj.snapshot.ABID)
@staticmethod
def resolve_snapshot_tags(obj):
return obj.snapshot.tags_str()
class ArchiveResultFilterSchema(FilterSchema):
uuid: Optional[UUID] = Field(None, q='uuid')
# abid: Optional[str] = Field(None, q='abid')
search: Optional[str] = Field(None, q=['snapshot__url__icontains', 'snapshot__title__icontains', 'snapshot__tags__name__icontains', 'extractor', 'output__icontains'])
snapshot_uuid: Optional[UUID] = Field(None, q='snapshot_uuid__icontains')
snapshot_url: Optional[str] = Field(None, q='snapshot__url__icontains')
snapshot_tag: Optional[str] = Field(None, q='snapshot__tags__name__icontains')
status: Optional[str] = Field(None, q='status')
output: Optional[str] = Field(None, q='output__icontains')
extractor: Optional[str] = Field(None, q='extractor__icontains')
cmd: Optional[str] = Field(None, q='cmd__0__icontains')
pwd: Optional[str] = Field(None, q='pwd__icontains')
cmd_version: Optional[str] = Field(None, q='cmd_version')
created: Optional[datetime] = Field(None, q='updated')
created__gte: Optional[datetime] = Field(None, q='updated__gte')
created__lt: Optional[datetime] = Field(None, q='updated__lt')
@router.get("/archiveresults", response=List[ArchiveResultSchema])
@paginate
def list_archiveresults(request, filters: ArchiveResultFilterSchema = Query(...)):
"""List all ArchiveResult entries matching these filters."""
qs = ArchiveResult.objects.all()
results = filters.filter(qs)
return results
@router.get("/archiveresult/{archiveresult_id}", response=ArchiveResultSchema)
def get_archiveresult(request, archiveresult_id: str):
"""Get a specific ArchiveResult by abid, uuid, or pk."""
return ArchiveResult.objects.get(Q(pk__icontains=archiveresult_id) | Q(abid__icontains=archiveresult_id) | Q(uuid__icontains=archiveresult_id))
# @router.post("/archiveresult", response=ArchiveResultSchema)
# def create_archiveresult(request, payload: ArchiveResultSchema):
# archiveresult = ArchiveResult.objects.create(**payload.dict())
# return archiveresult
#
# @router.put("/archiveresult/{archiveresult_id}", response=ArchiveResultSchema)
# def update_archiveresult(request, archiveresult_id: str, payload: ArchiveResultSchema):
# archiveresult = get_object_or_404(ArchiveResult, id=archiveresult_id)
#
# for attr, value in payload.dict().items():
# setattr(archiveresult, attr, value)
# archiveresult.save()
#
# return archiveresult
#
# @router.delete("/archiveresult/{archiveresult_id}")
# def delete_archiveresult(request, archiveresult_id: str):
# archiveresult = get_object_or_404(ArchiveResult, id=archiveresult_id)
# archiveresult.delete()
# return {"success": True}
### Snapshot #########################################################################
class SnapshotSchema(Schema):
abid: str
uuid: UUID
pk: str
modified: datetime
created: datetime
created_by_id: str
url: str
tags: str
title: Optional[str]
timestamp: str
archive_path: str
bookmarked: datetime
added: datetime
updated: Optional[datetime]
num_archiveresults: int
archiveresults: List[ArchiveResultSchema]
@staticmethod
def resolve_created_by_id(obj):
return str(obj.created_by_id)
@staticmethod
def resolve_pk(obj):
return str(obj.pk)
@staticmethod
def resolve_uuid(obj):
return str(obj.uuid)
@staticmethod
def resolve_abid(obj):
return str(obj.ABID)
@staticmethod
def resolve_tags(obj):
return obj.tags_str()
@staticmethod
def resolve_num_archiveresults(obj, context):
return obj.archiveresult_set.all().distinct().count()
@staticmethod
def resolve_archiveresults(obj, context):
if context['request'].with_archiveresults:
return obj.archiveresult_set.all().distinct()
return ArchiveResult.objects.none()
class SnapshotFilterSchema(FilterSchema):
abid: Optional[str] = Field(None, q='abid__icontains')
uuid: Optional[str] = Field(None, q='uuid__icontains')
pk: Optional[str] = Field(None, q='pk__icontains')
created_by_id: str = Field(None, q='created_by_id__icontains')
created__gte: datetime = Field(None, q='created__gte')
created__lt: datetime = Field(None, q='created__lt')
created: datetime = Field(None, q='created')
modified: datetime = Field(None, q='modified')
modified__gte: datetime = Field(None, q='modified__gte')
modified__lt: datetime = Field(None, q='modified__lt')
search: Optional[str] = Field(None, q=['url__icontains', 'title__icontains', 'tags__name__icontains', 'abid__icontains', 'uuid__icontains'])
url: Optional[str] = Field(None, q='url')
tag: Optional[str] = Field(None, q='tags__name')
title: Optional[str] = Field(None, q='title__icontains')
timestamp: Optional[str] = Field(None, q='timestamp__startswith')
added__gte: Optional[datetime] = Field(None, q='added__gte')
added__lt: Optional[datetime] = Field(None, q='added__lt')
@router.get("/snapshots", response=List[SnapshotSchema])
@paginate
def list_snapshots(request, filters: SnapshotFilterSchema = Query(...), with_archiveresults: bool=True):
"""List all Snapshot entries matching these filters."""
request.with_archiveresults = with_archiveresults
qs = Snapshot.objects.all()
results = filters.filter(qs)
return results
@router.get("/snapshot/{snapshot_id}", response=SnapshotSchema)
def get_snapshot(request, snapshot_id: str, with_archiveresults: bool=True):
"""Get a specific Snapshot by abid, uuid, or pk."""
request.with_archiveresults = with_archiveresults
snapshot = None
try:
snapshot = Snapshot.objects.get(Q(uuid__startswith=snapshot_id) | Q(abid__startswith=snapshot_id)| Q(pk__startswith=snapshot_id))
except Snapshot.DoesNotExist:
pass
try:
snapshot = snapshot or Snapshot.objects.get()
except Snapshot.DoesNotExist:
pass
try:
snapshot = snapshot or Snapshot.objects.get(Q(uuid__icontains=snapshot_id) | Q(abid__icontains=snapshot_id))
except Snapshot.DoesNotExist:
pass
return snapshot
# @router.post("/snapshot", response=SnapshotSchema)
# def create_snapshot(request, payload: SnapshotSchema):
# snapshot = Snapshot.objects.create(**payload.dict())
# return snapshot
#
# @router.put("/snapshot/{snapshot_uuid}", response=SnapshotSchema)
# def update_snapshot(request, snapshot_uuid: str, payload: SnapshotSchema):
# snapshot = get_object_or_404(Snapshot, uuid=snapshot_uuid)
#
# for attr, value in payload.dict().items():
# setattr(snapshot, attr, value)
# snapshot.save()
#
# return snapshot
#
# @router.delete("/snapshot/{snapshot_uuid}")
# def delete_snapshot(request, snapshot_uuid: str):
# snapshot = get_object_or_404(Snapshot, uuid=snapshot_uuid)
# snapshot.delete()
# return {"success": True}
### Tag #########################################################################
class TagSchema(Schema):
abid: Optional[UUID] = Field(None, q='abid')
uuid: Optional[UUID] = Field(None, q='uuid')
pk: Optional[UUID] = Field(None, q='pk')
modified: datetime
created: datetime
created_by_id: str
name: str
slug: str
@staticmethod
def resolve_created_by_id(obj):
return str(obj.created_by_id)
@router.get("/tags", response=List[TagSchema])
def list_tags(request):
return Tag.objects.all()
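A hedged sketch of querying the read-only endpoints above; the base URL, the header-based token auth, and the limit/offset pagination params (django-ninja's default paginator) are assumptions, and the snapshot id is a placeholder:
import requests

BASE = 'http://127.0.0.1:8000/api/v1'
headers = {'X-ArchiveBox-API-Key': '<your-api-token>'}   # assumed auth header

# list snapshots matching a search term, 10 at a time
page = requests.get(f'{BASE}/core/snapshots',
                    params={'search': 'example.com', 'limit': 10, 'offset': 0},
                    headers=headers).json()

# fetch a single snapshot by abid/uuid/pk prefix, without embedding its archiveresults
snap = requests.get(f'{BASE}/core/snapshot/<abid-or-uuid-prefix>',   # placeholder id
                    params={'with_archiveresults': 'false'},
                    headers=headers).json()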

View file

@ -4,18 +4,14 @@ __command__ = 'archivebox'
import os
import sys
import argparse
import threading
from time import sleep
from typing import Optional, Dict, List, IO, Union, Iterable
from typing import Optional, Dict, List, IO, Union
from pathlib import Path
from ..config import OUTPUT_DIR, check_data_folder, check_migrations, stderr
from ..config import OUTPUT_DIR, check_data_folder, check_migrations
from importlib import import_module
BUILTIN_LIST = list
CLI_DIR = Path(__file__).resolve().parent
# these common commands will appear sorted before any others for ease-of-use
@ -37,40 +33,6 @@ is_valid_cli_module = lambda module, subcommand: (
)
IGNORED_BG_THREADS = ('MainThread', 'ThreadPoolExecutor', 'IPythonHistorySavingThread') # threads we don't have to wait for before exiting
def wait_for_bg_threads_to_exit(thread_names: Iterable[str]=(), ignore_names: Iterable[str]=IGNORED_BG_THREADS, timeout: int=60) -> int:
"""
Block until the specified threads exit. e.g. pass thread_names=('default_hook_handler',) to wait for webhooks.
Useful for waiting for signal handlers, webhooks, etc. to finish running after a mgmt command completes.
"""
wait_for_all: bool = thread_names == ()
thread_matches = lambda thread, ptns: any(ptn in repr(thread) for ptn in ptns)
should_wait = lambda thread: (
not thread_matches(thread, ignore_names)
and (wait_for_all or thread_matches(thread, thread_names)))
for tries in range(timeout):
all_threads = [*threading.enumerate()]
blocking_threads = [*filter(should_wait, all_threads)]
threads_summary = ', '.join(repr(t) for t in blocking_threads)
if blocking_threads:
sleep(1)
if tries == 5: # only show stderr message if we need to wait more than 5s
stderr(
f'[…] Waiting up to {timeout}s for background jobs (e.g. webhooks) to finish...',
threads_summary,
)
else:
return tries
raise Exception(f'Background threads failed to exit after {tries}s: {threads_summary}')
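A minimal usage sketch of the helper above, reusing the thread name from its own docstring example ('default_hook_handler' is illustrative, not a guaranteed thread name):
# wait up to 60s for webhook-handler threads to drain before exiting
wait_for_bg_threads_to_exit(thread_names=('default_hook_handler',), timeout=60)

# or wait for all non-ignored background threads, as run_subcommand() does further down
wait_for_bg_threads_to_exit(timeout=60)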
def list_subcommands() -> Dict[str, str]:
"""find and import all valid archivebox_<subcommand>.py files in CLI_DIR"""
@ -117,9 +79,6 @@ def run_subcommand(subcommand: str,
module = import_module('.archivebox_{}'.format(subcommand), __package__)
module.main(args=subcommand_args, stdin=stdin, pwd=pwd) # type: ignore
# wait for webhooks, signals, and other background jobs to finish before exit
wait_for_bg_threads_to_exit(timeout=60)
SUBCOMMANDS = list_subcommands()

View file

@ -37,7 +37,7 @@ from sqlite3 import dbapi2 as sqlite3
from hashlib import md5
from pathlib import Path
from datetime import datetime, timezone
from typing import Optional, Type, Tuple, Dict, Union, List, Any
from typing import Optional, Type, Tuple, Dict, Union, List
from subprocess import run, PIPE, DEVNULL
from configparser import ConfigParser
from collections import defaultdict
@ -72,7 +72,7 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
'TIMEOUT': {'type': int, 'default': 60},
'MEDIA_TIMEOUT': {'type': int, 'default': 3600},
'OUTPUT_PERMISSIONS': {'type': str, 'default': '644'},
'RESTRICT_FILE_NAMES': {'type': str, 'default': 'windows'}, # TODO: move this to be a default WGET_ARGS
'RESTRICT_FILE_NAMES': {'type': str, 'default': 'windows'},
'URL_DENYLIST': {'type': str, 'default': r'\.(css|js|otf|ttf|woff|woff2|gstatic\.com|googleapis\.com/css)(\?.*)?$', 'aliases': ('URL_BLACKLIST',)}, # to avoid downloading code assets as their own pages
'URL_ALLOWLIST': {'type': str, 'default': None, 'aliases': ('URL_WHITELIST',)},
@ -112,7 +112,6 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
'LDAP_FIRSTNAME_ATTR': {'type': str, 'default': None},
'LDAP_LASTNAME_ATTR': {'type': str, 'default': None},
'LDAP_EMAIL_ATTR': {'type': str, 'default': None},
'LDAP_CREATE_SUPERUSER': {'type': bool, 'default': False},
},
'ARCHIVE_METHOD_TOGGLES': {
@ -137,15 +136,14 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
},
'ARCHIVE_METHOD_OPTIONS': {
'RESOLUTION': {'type': str, 'default': '1440,2000', 'aliases': ('SCREENSHOT_RESOLUTION','WINDOW_SIZE')},
'GIT_DOMAINS': {'type': str, 'default': 'github.com,bitbucket.org,gitlab.com,gist.github.com,codeberg.org,gitea.com,git.sr.ht'},
'RESOLUTION': {'type': str, 'default': '1440,2000', 'aliases': ('SCREENSHOT_RESOLUTION',)},
'GIT_DOMAINS': {'type': str, 'default': 'github.com,bitbucket.org,gitlab.com,gist.github.com'},
'CHECK_SSL_VALIDITY': {'type': bool, 'default': True},
'MEDIA_MAX_SIZE': {'type': str, 'default': '750m'},
'USER_AGENT': {'type': str, 'default': None},
'CURL_USER_AGENT': {'type': str, 'default': lambda c: c['USER_AGENT'] or 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/) curl/{CURL_VERSION}'},
'WGET_USER_AGENT': {'type': str, 'default': lambda c: c['USER_AGENT'] or 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/) wget/{WGET_VERSION}'},
'CHROME_USER_AGENT': {'type': str, 'default': lambda c: c['USER_AGENT'] or 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/)'},
'CURL_USER_AGENT': {'type': str, 'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/) curl/{CURL_VERSION}'},
'WGET_USER_AGENT': {'type': str, 'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/) wget/{WGET_VERSION}'},
'CHROME_USER_AGENT': {'type': str, 'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/)'},
'COOKIES_FILE': {'type': str, 'default': None},
'CHROME_USER_DATA_DIR': {'type': str, 'default': None},
@ -153,11 +151,7 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
'CHROME_TIMEOUT': {'type': int, 'default': 0},
'CHROME_HEADLESS': {'type': bool, 'default': True},
'CHROME_SANDBOX': {'type': bool, 'default': lambda c: not c['IN_DOCKER']},
'CHROME_EXTRA_ARGS': {'type': list, 'default': None},
'YOUTUBEDL_ARGS': {'type': list, 'default': lambda c: [
'--restrict-filenames',
'--trim-filenames', '128',
'--write-description',
'--write-info-json',
'--write-annotations',
@ -179,7 +173,6 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
'--add-metadata',
'--format=(bv*+ba/b)[filesize<={}][filesize_approx<=?{}]/(bv*+ba/b)'.format(c['MEDIA_MAX_SIZE'], c['MEDIA_MAX_SIZE']),
]},
'YOUTUBEDL_EXTRA_ARGS': {'type': list, 'default': None},
'WGET_ARGS': {'type': list, 'default': ['--no-verbose',
@ -191,17 +184,12 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
'--no-parent',
'-e', 'robots=off',
]},
'WGET_EXTRA_ARGS': {'type': list, 'default': None},
'CURL_ARGS': {'type': list, 'default': ['--silent',
'--location',
'--compressed'
]},
'CURL_EXTRA_ARGS': {'type': list, 'default': None},
'GIT_ARGS': {'type': list, 'default': ['--recursive']},
'SINGLEFILE_ARGS': {'type': list, 'default': None},
'SINGLEFILE_EXTRA_ARGS': {'type': list, 'default': None},
'MERCURY_ARGS': {'type': list, 'default': ['--format=text']},
'MERCURY_EXTRA_ARGS': {'type': list, 'default': None},
'SINGLEFILE_ARGS': {'type': list, 'default' : None},
'FAVICON_PROVIDER': {'type': str, 'default': 'https://www.google.com/s2/favicons?domain={}'},
},
@ -265,7 +253,7 @@ CONFIG_ALIASES = {
for key, default in section.items()
for alias in default.get('aliases', ())
}
USER_CONFIG = {key: section[key] for section in CONFIG_SCHEMA.values() for key in section.keys()}
USER_CONFIG = {key for section in CONFIG_SCHEMA.values() for key in section.keys()}
def get_real_name(key: str) -> str:
"""get the current canonical name for a given deprecated config key"""
@ -281,9 +269,6 @@ TEMPLATES_DIR_NAME = 'templates'
ARCHIVE_DIR_NAME = 'archive'
SOURCES_DIR_NAME = 'sources'
LOGS_DIR_NAME = 'logs'
CACHE_DIR_NAME = 'cache'
PERSONAS_DIR_NAME = 'personas'
CRONTABS_DIR_NAME = 'crontabs'
SQL_INDEX_FILENAME = 'index.sqlite3'
JSON_INDEX_FILENAME = 'index.json'
HTML_INDEX_FILENAME = 'index.html'
@ -357,12 +342,9 @@ ALLOWED_IN_OUTPUT_DIR = {
'static',
'sonic',
'search.sqlite3',
CRONTABS_DIR_NAME,
ARCHIVE_DIR_NAME,
SOURCES_DIR_NAME,
LOGS_DIR_NAME,
CACHE_DIR_NAME,
PERSONAS_DIR_NAME,
SQL_INDEX_FILENAME,
f'{SQL_INDEX_FILENAME}-wal',
f'{SQL_INDEX_FILENAME}-shm',
@ -381,32 +363,24 @@ ALLOWDENYLIST_REGEX_FLAGS: int = re.IGNORECASE | re.UNICODE | re.MULTILINE
############################## Version Config ##################################
def get_system_user() -> str:
# some host OS's are unable to provide a username (k3s, Windows), making this complicated
# uid 999 is especially problematic and breaks many attempts
SYSTEM_USER = None
FALLBACK_USER_PLACEHOLDER = f'user_{os.getuid()}'
# Option 1
def get_system_user():
SYSTEM_USER = getpass.getuser() or os.getlogin()
try:
import pwd
SYSTEM_USER = SYSTEM_USER or pwd.getpwuid(os.geteuid()).pw_name
except (ModuleNotFoundError, Exception):
return pwd.getpwuid(os.geteuid()).pw_name or SYSTEM_USER
except KeyError:
# Process' UID might not map to a user in cases such as running the Docker image
# (where `archivebox` is 999) as a different UID.
pass
except ModuleNotFoundError:
# pwd doesn't exist on windows
pass
# Option 2
try:
SYSTEM_USER = SYSTEM_USER or getpass.getuser()
except Exception:
# this should never happen, uncomment to debug
# raise
pass
# Option 3
try:
SYSTEM_USER = SYSTEM_USER or os.getlogin()
except Exception:
pass
return SYSTEM_USER or FALLBACK_USER_PLACEHOLDER
return SYSTEM_USER
def get_version(config):
try:
@ -513,11 +487,9 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
'ARCHIVE_DIR': {'default': lambda c: c['OUTPUT_DIR'] / ARCHIVE_DIR_NAME},
'SOURCES_DIR': {'default': lambda c: c['OUTPUT_DIR'] / SOURCES_DIR_NAME},
'LOGS_DIR': {'default': lambda c: c['OUTPUT_DIR'] / LOGS_DIR_NAME},
'CACHE_DIR': {'default': lambda c: c['OUTPUT_DIR'] / CACHE_DIR_NAME},
'PERSONAS_DIR': {'default': lambda c: c['OUTPUT_DIR'] / PERSONAS_DIR_NAME},
'CONFIG_FILE': {'default': lambda c: Path(c['CONFIG_FILE']).resolve() if c['CONFIG_FILE'] else c['OUTPUT_DIR'] / CONFIG_FILENAME},
'COOKIES_FILE': {'default': lambda c: c['COOKIES_FILE'] and Path(c['COOKIES_FILE']).resolve()},
'CHROME_USER_DATA_DIR': {'default': lambda c: Path(c['CHROME_USER_DATA_DIR']).resolve() if c['CHROME_USER_DATA_DIR'] else None},
'CHROME_USER_DATA_DIR': {'default': lambda c: find_chrome_data_dir() if c['CHROME_USER_DATA_DIR'] is None else (Path(c['CHROME_USER_DATA_DIR']).resolve() if c['CHROME_USER_DATA_DIR'] else None)}, # None means unset, so we autodetect it with find_chrome_data_dir(), but empty string '' means the user manually set it to '', and we should store it as None
'URL_DENYLIST_PTN': {'default': lambda c: c['URL_DENYLIST'] and re.compile(c['URL_DENYLIST'] or '', ALLOWDENYLIST_REGEX_FLAGS)},
'URL_ALLOWLIST_PTN': {'default': lambda c: c['URL_ALLOWLIST'] and re.compile(c['URL_ALLOWLIST'] or '', ALLOWDENYLIST_REGEX_FLAGS)},
'DIR_OUTPUT_PERMISSIONS': {'default': lambda c: c['OUTPUT_PERMISSIONS'].replace('6', '7').replace('4', '5')}, # exec is always needed to list directories
@ -547,7 +519,6 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
'CURL_VERSION': {'default': lambda c: bin_version(c['CURL_BINARY']) if c['USE_CURL'] else None},
'CURL_USER_AGENT': {'default': lambda c: c['CURL_USER_AGENT'].format(**c)},
'CURL_ARGS': {'default': lambda c: c['CURL_ARGS'] or []},
'CURL_EXTRA_ARGS': {'default': lambda c: c['CURL_EXTRA_ARGS'] or []},
'SAVE_FAVICON': {'default': lambda c: c['USE_CURL'] and c['SAVE_FAVICON']},
'SAVE_ARCHIVE_DOT_ORG': {'default': lambda c: c['USE_CURL'] and c['SAVE_ARCHIVE_DOT_ORG']},
@ -558,22 +529,18 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
'SAVE_WGET': {'default': lambda c: c['USE_WGET'] and c['SAVE_WGET']},
'SAVE_WARC': {'default': lambda c: c['USE_WGET'] and c['SAVE_WARC']},
'WGET_ARGS': {'default': lambda c: c['WGET_ARGS'] or []},
'WGET_EXTRA_ARGS': {'default': lambda c: c['WGET_EXTRA_ARGS'] or []},
'RIPGREP_VERSION': {'default': lambda c: bin_version(c['RIPGREP_BINARY']) if c['USE_RIPGREP'] else None},
'USE_SINGLEFILE': {'default': lambda c: c['USE_SINGLEFILE'] and c['SAVE_SINGLEFILE']},
'SINGLEFILE_VERSION': {'default': lambda c: bin_version(c['SINGLEFILE_BINARY']) if c['USE_SINGLEFILE'] else None},
'SINGLEFILE_ARGS': {'default': lambda c: c['SINGLEFILE_ARGS'] or []},
'SINGLEFILE_EXTRA_ARGS': {'default': lambda c: c['SINGLEFILE_EXTRA_ARGS'] or []},
'USE_READABILITY': {'default': lambda c: c['USE_READABILITY'] and c['SAVE_READABILITY']},
'READABILITY_VERSION': {'default': lambda c: bin_version(c['READABILITY_BINARY']) if c['USE_READABILITY'] else None},
'USE_MERCURY': {'default': lambda c: c['USE_MERCURY'] and c['SAVE_MERCURY']},
'MERCURY_VERSION': {'default': lambda c: '1.0.0' if shutil.which(str(bin_path(c['MERCURY_BINARY']))) else None}, # mercury doesn't expose version info until this is merged https://github.com/postlight/parser/pull/750
'MERCURY_ARGS': {'default': lambda c: c['MERCURY_ARGS'] or []},
'MERCURY_EXTRA_ARGS': {'default': lambda c: c['MERCURY_EXTRA_ARGS'] or []},
'USE_GIT': {'default': lambda c: c['USE_GIT'] and c['SAVE_GIT']},
'GIT_VERSION': {'default': lambda c: bin_version(c['GIT_BINARY']) if c['USE_GIT'] else None},
@ -583,7 +550,6 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
'YOUTUBEDL_VERSION': {'default': lambda c: bin_version(c['YOUTUBEDL_BINARY']) if c['USE_YOUTUBEDL'] else None},
'SAVE_MEDIA': {'default': lambda c: c['USE_YOUTUBEDL'] and c['SAVE_MEDIA']},
'YOUTUBEDL_ARGS': {'default': lambda c: c['YOUTUBEDL_ARGS'] or []},
'YOUTUBEDL_EXTRA_ARGS': {'default': lambda c: c['YOUTUBEDL_EXTRA_ARGS'] or []},
'CHROME_BINARY': {'default': lambda c: c['CHROME_BINARY'] or find_chrome_binary()},
'USE_CHROME': {'default': lambda c: c['USE_CHROME'] and c['CHROME_BINARY'] and (c['SAVE_PDF'] or c['SAVE_SCREENSHOT'] or c['SAVE_DOM'] or c['SAVE_SINGLEFILE'])},
@ -602,9 +568,9 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
'DEPENDENCIES': {'default': lambda c: get_dependency_info(c)},
'CODE_LOCATIONS': {'default': lambda c: get_code_locations(c)},
'EXTERNAL_LOCATIONS': {'default': lambda c: get_external_locations(c)},
'DATA_LOCATIONS': {'default': lambda c: get_data_locations(c)},
'CHROME_OPTIONS': {'default': lambda c: get_chrome_info(c)},
'CHROME_EXTRA_ARGS': {'default': lambda c: c['CHROME_EXTRA_ARGS'] or []},
'SAVE_ALLOWLIST_PTN': {'default': lambda c: c['SAVE_ALLOWLIST'] and {re.compile(k, ALLOWDENYLIST_REGEX_FLAGS): v for k, v in c['SAVE_ALLOWLIST'].items()}},
'SAVE_DENYLIST_PTN': {'default': lambda c: c['SAVE_DENYLIST'] and {re.compile(k, ALLOWDENYLIST_REGEX_FLAGS): v for k, v in c['SAVE_DENYLIST'].items()}},
}
@ -933,36 +899,27 @@ def find_chrome_binary() -> Optional[str]:
def find_chrome_data_dir() -> Optional[str]:
"""find any installed chrome user data directories in the default locations"""
# deprecated because this is DANGEROUS, do not re-implement/uncomment this behavior.
# Going forward we want to discourage people from using their main chrome profile for archiving.
# Session tokens, personal data, and cookies are often returned in server responses;
# once archived, they are effectively burned, because anyone who can view the archive
# can use that data to masquerade as the logged-in user who did the archiving.
# For this reason users should always create dedicated burner profiles for archiving and not use
# their daily driver main accounts.
# # Precedence: Chromium, Chrome, Beta, Canary, Unstable, Dev
# # make sure data dir finding precedence order always matches binary finding order
# default_profile_paths = (
# '~/.config/chromium',
# '~/Library/Application Support/Chromium',
# '~/AppData/Local/Chromium/User Data',
# '~/.config/chrome',
# '~/.config/google-chrome',
# '~/Library/Application Support/Google/Chrome',
# '~/AppData/Local/Google/Chrome/User Data',
# '~/.config/google-chrome-stable',
# '~/.config/google-chrome-beta',
# '~/Library/Application Support/Google/Chrome Canary',
# '~/AppData/Local/Google/Chrome SxS/User Data',
# '~/.config/google-chrome-unstable',
# '~/.config/google-chrome-dev',
# )
# for path in default_profile_paths:
# full_path = Path(path).resolve()
# if full_path.exists():
# return full_path
# Precedence: Chromium, Chrome, Beta, Canary, Unstable, Dev
# make sure data dir finding precedence order always matches binary finding order
default_profile_paths = (
'~/.config/chromium',
'~/Library/Application Support/Chromium',
'~/AppData/Local/Chromium/User Data',
'~/.config/chrome',
'~/.config/google-chrome',
'~/Library/Application Support/Google/Chrome',
'~/AppData/Local/Google/Chrome/User Data',
'~/.config/google-chrome-stable',
'~/.config/google-chrome-beta',
'~/Library/Application Support/Google/Chrome Canary',
'~/AppData/Local/Google/Chrome SxS/User Data',
'~/.config/google-chrome-unstable',
'~/.config/google-chrome-dev',
)
for path in default_profile_paths:
full_path = Path(path).resolve()
if full_path.exists():
return full_path
return None
def wget_supports_compression(config):
@ -988,6 +945,11 @@ def get_code_locations(config: ConfigDict) -> SimpleConfigValueDict:
'enabled': True,
'is_valid': (config['TEMPLATES_DIR'] / 'static').exists(),
},
'CUSTOM_TEMPLATES_DIR': {
'path': config['CUSTOM_TEMPLATES_DIR'] and Path(config['CUSTOM_TEMPLATES_DIR']).resolve(),
'enabled': bool(config['CUSTOM_TEMPLATES_DIR']),
'is_valid': config['CUSTOM_TEMPLATES_DIR'] and Path(config['CUSTOM_TEMPLATES_DIR']).exists(),
},
# 'NODE_MODULES_DIR': {
# 'path': ,
# 'enabled': ,
@ -995,25 +957,45 @@ def get_code_locations(config: ConfigDict) -> SimpleConfigValueDict:
# },
}
def get_external_locations(config: ConfigDict) -> ConfigValue:
abspath = lambda path: None if path is None else Path(path).resolve()
return {
'CHROME_USER_DATA_DIR': {
'path': abspath(config['CHROME_USER_DATA_DIR']),
'enabled': config['USE_CHROME'] and config['CHROME_USER_DATA_DIR'],
'is_valid': False if config['CHROME_USER_DATA_DIR'] is None else (Path(config['CHROME_USER_DATA_DIR']) / 'Default').exists(),
},
'COOKIES_FILE': {
'path': abspath(config['COOKIES_FILE']),
'enabled': config['USE_WGET'] and config['COOKIES_FILE'],
'is_valid': False if config['COOKIES_FILE'] is None else Path(config['COOKIES_FILE']).exists(),
},
}
def get_data_locations(config: ConfigDict) -> ConfigValue:
return {
# OLD: migrating to personas
# 'CHROME_USER_DATA_DIR': {
# 'path': os.path.abspath(config['CHROME_USER_DATA_DIR']),
# 'enabled': config['USE_CHROME'] and config['CHROME_USER_DATA_DIR'],
# 'is_valid': False if config['CHROME_USER_DATA_DIR'] is None else (Path(config['CHROME_USER_DATA_DIR']) / 'Default').exists(),
# },
# 'COOKIES_FILE': {
# 'path': os.path.abspath(config['COOKIES_FILE']),
# 'enabled': config['USE_WGET'] and config['COOKIES_FILE'],
# 'is_valid': False if config['COOKIES_FILE'] is None else Path(config['COOKIES_FILE']).exists(),
# },
'OUTPUT_DIR': {
'path': config['OUTPUT_DIR'].resolve(),
'enabled': True,
'is_valid': (config['OUTPUT_DIR'] / SQL_INDEX_FILENAME).exists(),
'is_mount': os.path.ismount(config['OUTPUT_DIR'].resolve()),
},
'SOURCES_DIR': {
'path': config['SOURCES_DIR'].resolve(),
'enabled': True,
'is_valid': config['SOURCES_DIR'].exists(),
},
'LOGS_DIR': {
'path': config['LOGS_DIR'].resolve(),
'enabled': True,
'is_valid': config['LOGS_DIR'].exists(),
},
'ARCHIVE_DIR': {
'path': config['ARCHIVE_DIR'].resolve(),
'enabled': True,
'is_valid': config['ARCHIVE_DIR'].exists(),
'is_mount': os.path.ismount(config['ARCHIVE_DIR'].resolve()),
},
'CONFIG_FILE': {
'path': config['CONFIG_FILE'].resolve(),
'enabled': True,
@ -1025,43 +1007,6 @@ def get_data_locations(config: ConfigDict) -> ConfigValue:
'is_valid': (config['OUTPUT_DIR'] / SQL_INDEX_FILENAME).exists(),
'is_mount': os.path.ismount((config['OUTPUT_DIR'] / SQL_INDEX_FILENAME).resolve()),
},
'ARCHIVE_DIR': {
'path': config['ARCHIVE_DIR'].resolve(),
'enabled': True,
'is_valid': config['ARCHIVE_DIR'].exists(),
'is_mount': os.path.ismount(config['ARCHIVE_DIR'].resolve()),
},
'SOURCES_DIR': {
'path': config['SOURCES_DIR'].resolve(),
'enabled': True,
'is_valid': config['SOURCES_DIR'].exists(),
},
'LOGS_DIR': {
'path': config['LOGS_DIR'].resolve(),
'enabled': True,
'is_valid': config['LOGS_DIR'].exists(),
},
'CACHE_DIR': {
'path': config['CACHE_DIR'].resolve(),
'enabled': True,
'is_valid': config['CACHE_DIR'].exists(),
},
'CUSTOM_TEMPLATES_DIR': {
'path': config['CUSTOM_TEMPLATES_DIR'] and Path(config['CUSTOM_TEMPLATES_DIR']).resolve(),
'enabled': bool(config['CUSTOM_TEMPLATES_DIR']),
'is_valid': config['CUSTOM_TEMPLATES_DIR'] and Path(config['CUSTOM_TEMPLATES_DIR']).exists(),
},
'PERSONAS_DIR': {
'path': config['PERSONAS_DIR'].resolve(),
'enabled': True,
'is_valid': config['PERSONAS_DIR'].exists(),
},
# managed by bin/docker_entrypoint.sh and python-crontab:
# 'CRONTABS_DIR': {
# 'path': config['CRONTABS_DIR'].resolve(),
# 'enabled': True,
# 'is_valid': config['CRONTABS_DIR'].exists(),
# },
}
def get_dependency_info(config: ConfigDict) -> ConfigValue:
@ -1296,7 +1241,7 @@ def check_system_config(config: ConfigDict=CONFIG) -> None:
# stderr('[i] Using Chrome binary: {}'.format(shutil.which(CHROME_BINARY) or CHROME_BINARY))
# stderr('[i] Using Chrome data dir: {}'.format(os.path.abspath(CHROME_USER_DATA_DIR)))
if config['CHROME_USER_DATA_DIR'] is not None and Path(config['CHROME_USER_DATA_DIR']).exists():
if config['CHROME_USER_DATA_DIR'] is not None:
if not (Path(config['CHROME_USER_DATA_DIR']) / 'Default').exists():
stderr('[X] Could not find profile "Default" in CHROME_USER_DATA_DIR.', color='red')
stderr(f' {config["CHROME_USER_DATA_DIR"]}')
@ -1306,13 +1251,8 @@ def check_system_config(config: ConfigDict=CONFIG) -> None:
if '/Default' in str(config['CHROME_USER_DATA_DIR']):
stderr()
stderr(' Try removing /Default from the end e.g.:')
stderr(' CHROME_USER_DATA_DIR="{}"'.format(str(config['CHROME_USER_DATA_DIR']).split('/Default')[0]))
# hard error is too annoying here, instead just set it to nothing
# raise SystemExit(2)
config['CHROME_USER_DATA_DIR'] = None
else:
config['CHROME_USER_DATA_DIR'] = None
stderr(' CHROME_USER_DATA_DIR="{}"'.format(config['CHROME_USER_DATA_DIR'].split('/Default')[0]))
raise SystemExit(2)
def check_dependencies(config: ConfigDict=CONFIG, show_help: bool=True) -> None:
@ -1381,7 +1321,6 @@ def check_data_folder(out_dir: Union[str, Path, None]=None, config: ConfigDict=C
stderr(' archivebox init')
raise SystemExit(2)
def check_migrations(out_dir: Union[str, Path, None]=None, config: ConfigDict=CONFIG):
output_dir = out_dir or config['OUTPUT_DIR']
from .index.sql import list_migrations
@ -1398,9 +1337,6 @@ def check_migrations(out_dir: Union[str, Path, None]=None, config: ConfigDict=CO
(Path(output_dir) / SOURCES_DIR_NAME).mkdir(exist_ok=True)
(Path(output_dir) / LOGS_DIR_NAME).mkdir(exist_ok=True)
(Path(output_dir) / CACHE_DIR_NAME).mkdir(exist_ok=True)
(Path(output_dir) / PERSONAS_DIR_NAME).mkdir(exist_ok=True)
(Path(output_dir) / PERSONAS_DIR_NAME / 'Default').mkdir(exist_ok=True)

View file

@ -1,2 +1 @@
__package__ = 'archivebox.core'

View file

@ -6,7 +6,6 @@ from contextlib import redirect_stdout
from datetime import datetime, timezone
from django.contrib import admin
from django.db.models import Count
from django.urls import path
from django.utils.html import format_html
from django.utils.safestring import mark_safe
@ -14,32 +13,18 @@ from django.shortcuts import render, redirect
from django.contrib.auth import get_user_model
from django import forms
from signal_webhooks.admin import WebhookAdmin, get_webhook_model
# from plugantic.admin import CustomPlugin
from ..util import htmldecode, urldecode, ansi_to_html
from core.models import Snapshot, ArchiveResult, Tag
from core.forms import AddLinkForm
from core.mixins import SearchResultsAdminMixin
from api.models import APIToken
from index.html import snapshot_icons
from logging_util import printable_filesize
from main import add, remove
from config import OUTPUT_DIR, SNAPSHOTS_PER_PAGE
from extractors import archive_links
from config import (
OUTPUT_DIR,
SNAPSHOTS_PER_PAGE,
VERSION,
VERSIONS_AVAILABLE,
CAN_UPGRADE
)
GLOBAL_CONTEXT = {'VERSION': VERSION, 'VERSIONS_AVAILABLE': VERSIONS_AVAILABLE, 'CAN_UPGRADE': CAN_UPGRADE}
# Admin URLs
# /admin/
@ -54,11 +39,343 @@ GLOBAL_CONTEXT = {'VERSION': VERSION, 'VERSIONS_AVAILABLE': VERSIONS_AVAILABLE,
# TODO: https://stackoverflow.com/questions/40760880/add-custom-button-to-django-admin-panel
class ArchiveResultInline(admin.TabularInline):
model = ArchiveResult
class TagInline(admin.TabularInline):
model = Snapshot.tags.through
from django.contrib.admin.helpers import ActionForm
from django.contrib.admin.widgets import AutocompleteSelectMultiple
# WIP: broken by Django 3.1.2 -> 4.0 migration
class AutocompleteTags:
model = Tag
search_fields = ['name']
name = 'tags'
class AutocompleteTagsAdminStub:
name = 'admin'
class SnapshotActionForm(ActionForm):
tags = forms.ModelMultipleChoiceField(
queryset=Tag.objects.all(),
required=False,
# WIP: broken by Django 3.1.2 -> 4.0 migration
widget=AutocompleteSelectMultiple(
AutocompleteTags(),
AutocompleteTagsAdminStub(),
),
)
# TODO: allow selecting actions for specific extractors? is this useful?
# EXTRACTOR_CHOICES = [
# (name, name.title())
# for name, _, _ in get_default_archive_methods()
# ]
# extractor = forms.ChoiceField(
# choices=EXTRACTOR_CHOICES,
# required=False,
# widget=forms.MultipleChoiceField(attrs={'class': "form-control"})
# )
class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin):
list_display = ('added', 'title_str', 'files', 'size', 'url_str')
sort_fields = ('title_str', 'url_str', 'added', 'files')
readonly_fields = ('info', 'bookmarked', 'added', 'updated')
search_fields = ('id', 'url', 'timestamp', 'title', 'tags__name')
fields = ('timestamp', 'url', 'title', 'tags', *readonly_fields)
list_filter = ('added', 'updated', 'tags', 'archiveresult__status')
ordering = ['-added']
actions = ['add_tags', 'remove_tags', 'update_titles', 'update_snapshots', 'resnapshot_snapshot', 'overwrite_snapshots', 'delete_snapshots']
autocomplete_fields = ['tags']
inlines = [ArchiveResultInline]
list_per_page = SNAPSHOTS_PER_PAGE
action_form = SnapshotActionForm
def get_urls(self):
urls = super().get_urls()
custom_urls = [
path('grid/', self.admin_site.admin_view(self.grid_view), name='grid')
]
return custom_urls + urls
def get_queryset(self, request):
self.request = request
return super().get_queryset(request).prefetch_related('tags')
def tag_list(self, obj):
return ', '.join(obj.tags.values_list('name', flat=True))
# TODO: figure out a different way to do this, you can't nest forms so this doesn't work
# def action(self, obj):
# # csrfmiddlewaretoken: Wa8UcQ4fD3FJibzxqHN3IYrrjLo4VguWynmbzzcPYoebfVUnDovon7GEMYFRgsh0
# # action: update_snapshots
# # select_across: 0
# # _selected_action: 76d29b26-2a88-439e-877c-a7cca1b72bb3
# return format_html(
# '''
# <form action="/admin/core/snapshot/" method="post" onsubmit="e => e.stopPropagation()">
# <input type="hidden" name="csrfmiddlewaretoken" value="{}">
# <input type="hidden" name="_selected_action" value="{}">
# <button name="update_snapshots">Check</button>
# <button name="update_titles">Pull title + favicon</button>
# <button name="update_snapshots">Update</button>
# <button name="overwrite_snapshots">Re-Archive (overwrite)</button>
# <button name="delete_snapshots">Permanently delete</button>
# </form>
# ''',
# csrf.get_token(self.request),
# obj.id,
# )
def info(self, obj):
return format_html(
'''
UUID: <code style="font-size: 10px; user-select: all">{}</code> &nbsp; &nbsp;
Timestamp: <code style="font-size: 10px; user-select: all">{}</code> &nbsp; &nbsp;
URL Hash: <code style="font-size: 10px; user-select: all">{}</code><br/>
Archived: {} ({} files {}) &nbsp; &nbsp;
Favicon: <img src="{}" style="height: 20px"/> &nbsp; &nbsp;
Status code: {} &nbsp; &nbsp;
Server: {} &nbsp; &nbsp;
Content type: {} &nbsp; &nbsp;
Extension: {} &nbsp; &nbsp;
<br/><br/>
<a href="/archive/{}">View Snapshot index </a> &nbsp; &nbsp;
<a href="/admin/core/snapshot/?id__exact={}">View actions </a>
''',
obj.id,
obj.timestamp,
obj.url_hash,
'' if obj.is_archived else '',
obj.num_outputs,
self.size(obj),
f'/archive/{obj.timestamp}/favicon.ico',
obj.status_code or '?',
obj.headers and obj.headers.get('Server') or '?',
obj.headers and obj.headers.get('Content-Type') or '?',
obj.extension or '?',
obj.timestamp,
obj.id,
)
def title_str(self, obj):
canon = obj.as_link().canonical_outputs()
tags = ''.join(
format_html('<a href="/admin/core/snapshot/?tags__id__exact={}"><span class="tag">{}</span></a> ', tag.id, tag)
for tag in obj.tags.all()
if str(tag).strip()
)
return format_html(
'<a href="/{}">'
'<img src="/{}/{}" class="favicon" onerror="this.remove()">'
'</a>'
'<a href="/{}/index.html">'
'<b class="status-{}">{}</b>'
'</a>',
obj.archive_path,
obj.archive_path, canon['favicon_path'],
obj.archive_path,
'fetched' if obj.latest_title or obj.title else 'pending',
urldecode(htmldecode(obj.latest_title or obj.title or ''))[:128] or 'Pending...'
) + mark_safe(f' <span class="tags">{tags}</span>')
def files(self, obj):
return snapshot_icons(obj)
files.admin_order_field = 'updated'
files.short_description = 'Files Saved'
def size(self, obj):
archive_size = (Path(obj.link_dir) / 'index.html').exists() and obj.archive_size
if archive_size:
size_txt = printable_filesize(archive_size)
if archive_size > 52428800:
size_txt = mark_safe(f'<b>{size_txt}</b>')
else:
size_txt = mark_safe('<span style="opacity: 0.3">...</span>')
return format_html(
'<a href="/{}" title="View all files">{}</a>',
obj.archive_path,
size_txt,
)
size.admin_order_field = 'archiveresult__count'
def url_str(self, obj):
return format_html(
'<a href="{}"><code style="user-select: all;">{}</code></a>',
obj.url,
obj.url,
)
def grid_view(self, request, extra_context=None):
# cl = self.get_changelist_instance(request)
# Save before monkey patching to restore for changelist list view
saved_change_list_template = self.change_list_template
saved_list_per_page = self.list_per_page
saved_list_max_show_all = self.list_max_show_all
# Monkey patch here plus core_tags.py
self.change_list_template = 'private_index_grid.html'
self.list_per_page = SNAPSHOTS_PER_PAGE
self.list_max_show_all = self.list_per_page
# Call monkey patched view
rendered_response = self.changelist_view(request, extra_context=extra_context)
# Restore values
self.change_list_template = saved_change_list_template
self.list_per_page = saved_list_per_page
self.list_max_show_all = saved_list_max_show_all
return rendered_response
# for debugging, uncomment this to print all requests:
# def changelist_view(self, request, extra_context=None):
# print('[*] Got request', request.method, request.POST)
# return super().changelist_view(request, extra_context=None)
def update_snapshots(self, request, queryset):
archive_links([
snapshot.as_link()
for snapshot in queryset
], out_dir=OUTPUT_DIR)
update_snapshots.short_description = "Pull"
def update_titles(self, request, queryset):
archive_links([
snapshot.as_link()
for snapshot in queryset
], overwrite=True, methods=('title','favicon'), out_dir=OUTPUT_DIR)
update_titles.short_description = "⬇️ Title"
def resnapshot_snapshot(self, request, queryset):
for snapshot in queryset:
timestamp = datetime.now(timezone.utc).isoformat('T', 'seconds')
new_url = snapshot.url.split('#')[0] + f'#{timestamp}'
add(new_url, tag=snapshot.tags_str())
resnapshot_snapshot.short_description = "Re-Snapshot"
def overwrite_snapshots(self, request, queryset):
archive_links([
snapshot.as_link()
for snapshot in queryset
], overwrite=True, out_dir=OUTPUT_DIR)
overwrite_snapshots.short_description = "Reset"
def delete_snapshots(self, request, queryset):
remove(snapshots=queryset, yes=True, delete=True, out_dir=OUTPUT_DIR)
delete_snapshots.short_description = "Delete"
def add_tags(self, request, queryset):
tags = request.POST.getlist('tags')
print('[+] Adding tags', tags, 'to Snapshots', queryset)
for obj in queryset:
obj.tags.add(*tags)
add_tags.short_description = "+"
def remove_tags(self, request, queryset):
tags = request.POST.getlist('tags')
print('[-] Removing tags', tags, 'from Snapshots', queryset)
for obj in queryset:
obj.tags.remove(*tags)
remove_tags.short_description = ""
title_str.short_description = 'Title'
url_str.short_description = 'Original URL'
title_str.admin_order_field = 'title'
url_str.admin_order_field = 'url'
class TagAdmin(admin.ModelAdmin):
list_display = ('slug', 'name', 'num_snapshots', 'snapshots', 'id')
sort_fields = ('id', 'name', 'slug')
readonly_fields = ('id', 'num_snapshots', 'snapshots')
search_fields = ('id', 'name', 'slug')
fields = (*readonly_fields, 'name', 'slug')
actions = ['delete_selected']
ordering = ['-id']
def num_snapshots(self, obj):
return format_html(
'<a href="/admin/core/snapshot/?tags__id__exact={}">{} total</a>',
obj.id,
obj.snapshot_set.count(),
)
def snapshots(self, obj):
total_count = obj.snapshot_set.count()
return mark_safe('<br/>'.join(
format_html(
'{} <code><a href="/admin/core/snapshot/{}/change"><b>[{}]</b></a> {}</code>',
snap.updated.strftime('%Y-%m-%d %H:%M') if snap.updated else 'pending...',
snap.id,
snap.timestamp,
snap.url,
)
for snap in obj.snapshot_set.order_by('-updated')[:10]
) + (f'<br/><a href="/admin/core/snapshot/?tags__id__exact={obj.id}">and {total_count-10} more...</a>' if obj.snapshot_set.count() > 10 else ''))
class ArchiveResultAdmin(admin.ModelAdmin):
list_display = ('id', 'start_ts', 'extractor', 'snapshot_str', 'tags_str', 'cmd_str', 'status', 'output_str')
sort_fields = ('start_ts', 'extractor', 'status')
readonly_fields = ('id', 'uuid', 'snapshot_str', 'tags_str')
search_fields = ('id', 'uuid', 'snapshot__url', 'extractor', 'output', 'cmd_version', 'cmd', 'snapshot__timestamp')
fields = (*readonly_fields, 'snapshot', 'extractor', 'status', 'start_ts', 'end_ts', 'output', 'pwd', 'cmd', 'cmd_version')
autocomplete_fields = ['snapshot']
list_filter = ('status', 'extractor', 'start_ts', 'cmd_version')
ordering = ['-start_ts']
list_per_page = SNAPSHOTS_PER_PAGE
def snapshot_str(self, obj):
return format_html(
'<a href="/archive/{}/index.html"><b><code>[{}]</code></b></a><br/>'
'<small>{}</small>',
obj.snapshot.timestamp,
obj.snapshot.timestamp,
obj.snapshot.url[:128],
)
def tags_str(self, obj):
return obj.snapshot.tags_str()
def cmd_str(self, obj):
return format_html(
'<pre>{}</pre>',
' '.join(obj.cmd) if isinstance(obj.cmd, list) else str(obj.cmd),
)
def output_str(self, obj):
return format_html(
'<a href="/archive/{}/{}" class="output-link">↗️</a><pre>{}</pre>',
obj.snapshot.timestamp,
obj.output if (obj.status == 'succeeded') and obj.extractor not in ('title', 'archive_org') else 'index.html',
obj.output,
)
tags_str.short_description = 'tags'
snapshot_str.short_description = 'snapshot'
class ArchiveBoxAdmin(admin.AdminSite):
site_header = 'ArchiveBox'
index_title = 'Links'
site_title = 'Index'
namespace = 'admin'
def get_urls(self):
return [
@ -104,424 +421,9 @@ class ArchiveBoxAdmin(admin.AdminSite):
return render(template_name='add.html', request=request, context=context)
archivebox_admin = ArchiveBoxAdmin()
archivebox_admin.register(get_user_model())
archivebox_admin.register(APIToken)
archivebox_admin.register(get_webhook_model(), WebhookAdmin)
archivebox_admin.disable_action('delete_selected')
# archivebox_admin.register(CustomPlugin)
# patch admin with methods to add data views (implemented by admin_data_views package)
############### Additional sections are defined in settings.ADMIN_DATA_VIEWS #########
from admin_data_views.admin import get_app_list, admin_data_index_view, get_admin_data_urls, get_urls
archivebox_admin.get_app_list = get_app_list.__get__(archivebox_admin, ArchiveBoxAdmin)
archivebox_admin.admin_data_index_view = admin_data_index_view.__get__(archivebox_admin, ArchiveBoxAdmin)
archivebox_admin.get_admin_data_urls = get_admin_data_urls.__get__(archivebox_admin, ArchiveBoxAdmin)
archivebox_admin.get_urls = get_urls(archivebox_admin.get_urls).__get__(archivebox_admin, ArchiveBoxAdmin)
class ArchiveResultInline(admin.TabularInline):
model = ArchiveResult
class TagInline(admin.TabularInline):
model = Snapshot.tags.through
from django.contrib.admin.helpers import ActionForm
from django.contrib.admin.widgets import AutocompleteSelectMultiple
class AutocompleteTags:
model = Tag
search_fields = ['name']
name = 'tags'
remote_field = TagInline
class AutocompleteTagsAdminStub:
name = 'admin'
class SnapshotActionForm(ActionForm):
tags = forms.ModelMultipleChoiceField(
queryset=Tag.objects.all(),
required=False,
widget=AutocompleteSelectMultiple(
AutocompleteTags(),
AutocompleteTagsAdminStub(),
),
)
# TODO: allow selecting actions for specific extractors? is this useful?
# EXTRACTOR_CHOICES = [
# (name, name.title())
# for name, _, _ in get_default_archive_methods()
# ]
# extractor = forms.ChoiceField(
# choices=EXTRACTOR_CHOICES,
# required=False,
# widget=forms.MultipleChoiceField(attrs={'class': "form-control"})
# )
def get_abid_info(self, obj):
return format_html(
# URL Hash: <code style="font-size: 10px; user-select: all">{}</code><br/>
'''
&nbsp; &nbsp; ABID:&nbsp; <code style="font-size: 16px; user-select: all"><b>{}</b></code><br/>
&nbsp; &nbsp; TS: &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;<code style="font-size: 10px; user-select: all"><b>{}</b></code> ({})<br/>
&nbsp; &nbsp; URI: &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; <code style="font-size: 10px; user-select: all"><b>{}</b></code> ({})<br/>
&nbsp; &nbsp; SUBTYPE: &nbsp; &nbsp; &nbsp; <code style="font-size: 10px; user-select: all"><b>{}</b></code> ({})<br/>
&nbsp; &nbsp; RAND: &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;&nbsp; <code style="font-size: 10px; user-select: all"><b>{}</b></code> ({})<br/><br/>
&nbsp; &nbsp; ABID AS UUID:&nbsp; <code style="font-size: 10px; user-select: all">{}</code> &nbsp; &nbsp;<br/><br/>
&nbsp; &nbsp; .uuid: &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; <code style="font-size: 10px; user-select: all">{}</code> &nbsp; &nbsp;<br/>
&nbsp; &nbsp; .id: &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;&nbsp; <code style="font-size: 10px; user-select: all">{}</code> &nbsp; &nbsp;<br/>
&nbsp; &nbsp; .pk: &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; <code style="font-size: 10px; user-select: all">{}</code> &nbsp; &nbsp;<br/><br/>
''',
obj.abid,
obj.ABID.ts, obj.abid_values['ts'].isoformat() if isinstance(obj.abid_values['ts'], datetime) else obj.abid_values['ts'],
obj.ABID.uri, str(obj.abid_values['uri']),
obj.ABID.subtype, str(obj.abid_values['subtype']),
obj.ABID.rand, str(obj.abid_values['rand'])[-7:],
obj.ABID.uuid,
obj.uuid,
obj.id,
obj.pk,
)
@admin.register(Snapshot, site=archivebox_admin)
class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin):
list_display = ('added', 'title_str', 'files', 'size', 'url_str')
sort_fields = ('title_str', 'url_str', 'added', 'files')
readonly_fields = ('admin_actions', 'status_info', 'bookmarked', 'added', 'updated', 'created', 'modified', 'identifiers')
search_fields = ('id', 'url', 'abid', 'uuid', 'timestamp', 'title', 'tags__name')
fields = ('url', 'timestamp', 'created_by', 'tags', 'title', *readonly_fields)
list_filter = ('added', 'updated', 'tags', 'archiveresult__status', 'created_by')
ordering = ['-added']
actions = ['add_tags', 'remove_tags', 'update_titles', 'update_snapshots', 'resnapshot_snapshot', 'overwrite_snapshots', 'delete_snapshots']
autocomplete_fields = ['tags']
inlines = [ArchiveResultInline]
list_per_page = SNAPSHOTS_PER_PAGE
action_form = SnapshotActionForm
def changelist_view(self, request, extra_context=None):
extra_context = extra_context or {}
return super().changelist_view(request, extra_context | GLOBAL_CONTEXT)
def get_urls(self):
urls = super().get_urls()
custom_urls = [
path('grid/', self.admin_site.admin_view(self.grid_view), name='grid')
]
return custom_urls + urls
def get_queryset(self, request):
self.request = request
return super().get_queryset(request).prefetch_related('tags').annotate(archiveresult_count=Count('archiveresult'))
def tag_list(self, obj):
return ', '.join(obj.tags.values_list('name', flat=True))
# TODO: figure out a different way to do this, you can't nest forms so this doesn't work
# def action(self, obj):
# # csrfmiddlewaretoken: Wa8UcQ4fD3FJibzxqHN3IYrrjLo4VguWynmbzzcPYoebfVUnDovon7GEMYFRgsh0
# # action: update_snapshots
# # select_across: 0
# # _selected_action: 76d29b26-2a88-439e-877c-a7cca1b72bb3
# return format_html(
# '''
# <form action="/admin/core/snapshot/" method="post" onsubmit="e => e.stopPropagation()">
# <input type="hidden" name="csrfmiddlewaretoken" value="{}">
# <input type="hidden" name="_selected_action" value="{}">
# <button name="update_snapshots">Check</button>
# <button name="update_titles">Pull title + favicon</button>
# <button name="update_snapshots">Update</button>
# <button name="overwrite_snapshots">Re-Archive (overwrite)</button>
# <button name="delete_snapshots">Permanently delete</button>
# </form>
# ''',
# csrf.get_token(self.request),
# obj.pk,
# )
def admin_actions(self, obj):
return format_html(
# URL Hash: <code style="font-size: 10px; user-select: all">{}</code><br/>
'''
<a class="btn" style="font-size: 18px; display: inline-block; border-radius: 10px; border: 3px solid #eee; padding: 4px 8px" href="/archive/{}">Summary page </a> &nbsp; &nbsp;
<a class="btn" style="font-size: 18px; display: inline-block; border-radius: 10px; border: 3px solid #eee; padding: 4px 8px" href="/archive/{}/index.html#all">Result files 📑</a> &nbsp; &nbsp;
<a class="btn" style="font-size: 18px; display: inline-block; border-radius: 10px; border: 3px solid #eee; padding: 4px 8px" href="/admin/core/snapshot/?id__exact={}">Admin actions </a>
''',
obj.timestamp,
obj.timestamp,
obj.pk,
)
def status_info(self, obj):
return format_html(
# URL Hash: <code style="font-size: 10px; user-select: all">{}</code><br/>
'''
Archived: {} ({} files {}) &nbsp; &nbsp;
Favicon: <img src="{}" style="height: 20px"/> &nbsp; &nbsp;
Status code: {} &nbsp; &nbsp;<br/>
Server: {} &nbsp; &nbsp;
Content type: {} &nbsp; &nbsp;
Extension: {} &nbsp; &nbsp;
''',
'' if obj.is_archived else '',
obj.num_outputs,
self.size(obj) or '0kb',
f'/archive/{obj.timestamp}/favicon.ico',
obj.status_code or '-',
obj.headers and obj.headers.get('Server') or '-',
obj.headers and obj.headers.get('Content-Type') or '-',
obj.extension or '-',
)
def identifiers(self, obj):
return get_abid_info(self, obj)
@admin.display(
description='Title',
ordering='title',
)
def title_str(self, obj):
canon = obj.as_link().canonical_outputs()
tags = ''.join(
format_html('<a href="/admin/core/snapshot/?tags__id__exact={}"><span class="tag">{}</span></a> ', tag.id, tag)
for tag in obj.tags.all()
if str(tag).strip()
)
return format_html(
'<a href="/{}">'
'<img src="/{}/{}" class="favicon" onerror="this.remove()">'
'</a>'
'<a href="/{}/index.html">'
'<b class="status-{}">{}</b>'
'</a>',
obj.archive_path,
obj.archive_path, canon['favicon_path'],
obj.archive_path,
'fetched' if obj.latest_title or obj.title else 'pending',
urldecode(htmldecode(obj.latest_title or obj.title or ''))[:128] or 'Pending...'
) + mark_safe(f' <span class="tags">{tags}</span>')
@admin.display(
description='Files Saved',
ordering='archiveresult_count',
)
def files(self, obj):
return snapshot_icons(obj)
@admin.display(
ordering='archiveresult_count'
)
def size(self, obj):
archive_size = (Path(obj.link_dir) / 'index.html').exists() and obj.archive_size
if archive_size:
size_txt = printable_filesize(archive_size)
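# bold the displayed size when it exceeds 50 MB (52428800 bytes)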
if archive_size > 52428800:
size_txt = mark_safe(f'<b>{size_txt}</b>')
else:
size_txt = mark_safe('<span style="opacity: 0.3">...</span>')
return format_html(
'<a href="/{}" title="View all files">{}</a>',
obj.archive_path,
size_txt,
)
@admin.display(
description='Original URL',
ordering='url',
)
def url_str(self, obj):
return format_html(
'<a href="{}"><code style="user-select: all;">{}</code></a>',
obj.url,
obj.url[:128],
)
def grid_view(self, request, extra_context=None):
# cl = self.get_changelist_instance(request)
# Save before monkey patching to restore for changelist list view
saved_change_list_template = self.change_list_template
saved_list_per_page = self.list_per_page
saved_list_max_show_all = self.list_max_show_all
# Monkey patch here plus core_tags.py
self.change_list_template = 'private_index_grid.html'
self.list_per_page = SNAPSHOTS_PER_PAGE
self.list_max_show_all = self.list_per_page
# Call monkey patched view
rendered_response = self.changelist_view(request, extra_context=extra_context)
# Restore values
self.change_list_template = saved_change_list_template
self.list_per_page = saved_list_per_page
self.list_max_show_all = saved_list_max_show_all
return rendered_response
# for debugging, uncomment this to print all requests:
# def changelist_view(self, request, extra_context=None):
# print('[*] Got request', request.method, request.POST)
# return super().changelist_view(request, extra_context=None)
@admin.action(
description="Pull"
)
def update_snapshots(self, request, queryset):
archive_links([
snapshot.as_link()
for snapshot in queryset
], out_dir=OUTPUT_DIR)
@admin.action(
description="⬇️ Title"
)
def update_titles(self, request, queryset):
archive_links([
snapshot.as_link()
for snapshot in queryset
], overwrite=True, methods=('title','favicon'), out_dir=OUTPUT_DIR)
@admin.action(
description="Re-Snapshot"
)
def resnapshot_snapshot(self, request, queryset):
for snapshot in queryset:
timestamp = datetime.now(timezone.utc).isoformat('T', 'seconds')
new_url = snapshot.url.split('#')[0] + f'#{timestamp}'
add(new_url, tag=snapshot.tags_str())
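# e.g. https://example.com/page becomes https://example.com/page#2024-05-13T10:56:00+00:00 (hypothetical example),
# which sidesteps the unique constraint on Snapshot.url so add() records a fresh Snapshot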
@admin.action(
description="Reset"
)
def overwrite_snapshots(self, request, queryset):
archive_links([
snapshot.as_link()
for snapshot in queryset
], overwrite=True, out_dir=OUTPUT_DIR)
@admin.action(
description="Delete"
)
def delete_snapshots(self, request, queryset):
remove(snapshots=queryset, yes=True, delete=True, out_dir=OUTPUT_DIR)
@admin.action(
description="+"
)
def add_tags(self, request, queryset):
tags = request.POST.getlist('tags')
print('[+] Adding tags', tags, 'to Snapshots', queryset)
for obj in queryset:
obj.tags.add(*tags)
@admin.action(
description="–"
)
def remove_tags(self, request, queryset):
tags = request.POST.getlist('tags')
print('[-] Removing tags', tags, 'from Snapshots', queryset)
for obj in queryset:
obj.tags.remove(*tags)
@admin.register(Tag, site=archivebox_admin)
class TagAdmin(admin.ModelAdmin):
list_display = ('slug', 'name', 'num_snapshots', 'snapshots', 'abid')
sort_fields = ('id', 'name', 'slug', 'abid')
readonly_fields = ('created', 'modified', 'identifiers', 'num_snapshots', 'snapshots')
search_fields = ('id', 'abid', 'uuid', 'name', 'slug')
fields = ('name', 'slug', 'created_by', *readonly_fields, )
actions = ['delete_selected']
ordering = ['-id']
def identifiers(self, obj):
return get_abid_info(self, obj)
def num_snapshots(self, tag):
return format_html(
'<a href="/admin/core/snapshot/?tags__id__exact={}">{} total</a>',
tag.id,
tag.snapshot_set.count(),
)
def snapshots(self, tag):
total_count = tag.snapshot_set.count()
return mark_safe('<br/>'.join(
format_html(
'{} <code><a href="/admin/core/snapshot/{}/change"><b>[{}]</b></a> {}</code>',
snap.updated.strftime('%Y-%m-%d %H:%M') if snap.updated else 'pending...',
snap.pk,
snap.abid,
snap.url,
)
for snap in tag.snapshot_set.order_by('-updated')[:10]
) + (f'<br/><a href="/admin/core/snapshot/?tags__id__exact={tag.id}">and {total_count-10} more...<a>' if tag.snapshot_set.count() > 10 else ''))
@admin.register(ArchiveResult, site=archivebox_admin)
class ArchiveResultAdmin(admin.ModelAdmin):
list_display = ('start_ts', 'snapshot_info', 'tags_str', 'extractor', 'cmd_str', 'status', 'output_str')
sort_fields = ('start_ts', 'extractor', 'status')
readonly_fields = ('snapshot_info', 'tags_str', 'created_by', 'created', 'modified', 'identifiers')
search_fields = ('id', 'uuid', 'abid', 'snapshot__url', 'extractor', 'output', 'cmd_version', 'cmd', 'snapshot__timestamp')
fields = ('snapshot', 'extractor', 'status', 'output', 'pwd', 'cmd', 'start_ts', 'end_ts', 'cmd_version', *readonly_fields)
autocomplete_fields = ['snapshot']
list_filter = ('status', 'extractor', 'start_ts', 'cmd_version')
ordering = ['-start_ts']
list_per_page = SNAPSHOTS_PER_PAGE
@admin.display(
description='Snapshot Info'
)
def snapshot_info(self, result):
return format_html(
'<a href="/archive/{}/index.html"><b><code>[{}]</code></b> &nbsp; {} &nbsp; {}</a><br/>',
result.snapshot.timestamp,
result.snapshot.abid,
result.snapshot.added.strftime('%Y-%m-%d %H:%M'),
result.snapshot.url[:128],
)
def identifiers(self, obj):
return get_abid_info(self, obj)
@admin.display(
description='Snapshot Tags'
)
def tags_str(self, result):
return result.snapshot.tags_str()
def cmd_str(self, result):
return format_html(
'<pre>{}</pre>',
' '.join(result.cmd) if isinstance(result.cmd, list) else str(result.cmd),
)
def output_str(self, result):
return format_html(
'<a href="/archive/{}/{}" class="output-link">↗️</a><pre>{}</pre>',
result.snapshot.timestamp,
result.output if (result.status == 'succeeded') and result.extractor not in ('title', 'archive_org') else 'index.html',
result.output,
)
admin.site = ArchiveBoxAdmin()
admin.site.register(get_user_model())
admin.site.register(Snapshot, SnapshotAdmin)
admin.site.register(Tag, TagAdmin)
admin.site.register(ArchiveResult, ArchiveResultAdmin)
admin.site.disable_action('delete_selected')


@ -1,28 +1,7 @@
__package__ = 'archivebox.core'
from django.apps import AppConfig
class CoreConfig(AppConfig):
name = 'core'
def ready(self):
# register our custom admin as the primary django admin
from django.contrib import admin
from django.contrib.admin import sites
from core.admin import archivebox_admin
admin.site = archivebox_admin
sites.site = archivebox_admin
# register signal handlers
from .auth import register_signals
register_signals()
# from django.contrib.admin.apps import AdminConfig
# class CoreAdminConfig(AdminConfig):
# default_site = "core.admin.get_admin_site"
# WIP: broken by Django 3.1.2 -> 4.0 migration
default_auto_field = 'django.db.models.UUIDField'


@ -1,14 +0,0 @@
__package__ = 'archivebox.core'
from ..config import (
LDAP
)
def register_signals():
if LDAP:
import django_auth_ldap.backend
from .auth_ldap import create_user
django_auth_ldap.backend.populate_user.connect(create_user)


@ -1,10 +0,0 @@
from ..config import (
LDAP_CREATE_SUPERUSER
)
def create_user(sender, user=None, ldap_user=None, **kwargs):
if not user.id and LDAP_CREATE_SUPERUSER:
user.is_superuser = True
user.is_staff = True
print(f'[!] WARNING: Creating new user {user} based on LDAP user {ldap_user} (is_staff={user.is_staff}, is_superuser={user.is_superuser})')


@ -17,6 +17,8 @@ except AttributeError:
def forwards_func(apps, schema_editor):
from core.models import EXTRACTORS
Snapshot = apps.get_model("core", "Snapshot")
ArchiveResult = apps.get_model("core", "ArchiveResult")


@ -1,43 +0,0 @@
# Generated by Django 5.0.6 on 2024-05-13 10:56
import charidfield.fields
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('core', '0022_auto_20231023_2008'),
]
operations = [
migrations.AlterModelOptions(
name='archiveresult',
options={'verbose_name': 'Result'},
),
migrations.AddField(
model_name='archiveresult',
name='abid',
field=charidfield.fields.CharIDField(blank=True, db_index=True, default=None, help_text='ABID-format identifier for this entity (e.g. snp_01BJQMF54D093DXEAWZ6JYRPAQ)', max_length=30, null=True, prefix='res_', unique=True),
),
migrations.AddField(
model_name='snapshot',
name='abid',
field=charidfield.fields.CharIDField(blank=True, db_index=True, default=None, help_text='ABID-format identifier for this entity (e.g. snp_01BJQMF54D093DXEAWZ6JYRPAQ)', max_length=30, null=True, prefix='snp_', unique=True),
),
migrations.AddField(
model_name='snapshot',
name='uuid',
field=models.UUIDField(blank=True, null=True, unique=True),
),
migrations.AddField(
model_name='tag',
name='abid',
field=charidfield.fields.CharIDField(blank=True, db_index=True, default=None, help_text='ABID-format identifier for this entity (e.g. snp_01BJQMF54D093DXEAWZ6JYRPAQ)', max_length=30, null=True, prefix='tag_', unique=True),
),
migrations.AlterField(
model_name='archiveresult',
name='extractor',
field=models.CharField(choices=[('htmltotext', 'htmltotext'), ('git', 'git'), ('singlefile', 'singlefile'), ('media', 'media'), ('archive_org', 'archive_org'), ('readability', 'readability'), ('mercury', 'mercury'), ('favicon', 'favicon'), ('pdf', 'pdf'), ('headers', 'headers'), ('screenshot', 'screenshot'), ('dom', 'dom'), ('title', 'title'), ('wget', 'wget')], max_length=32),
),
]


@ -1,98 +0,0 @@
# Generated by Django 5.0.6 on 2024-05-13 11:43
from django.db import migrations
from datetime import datetime
from abid_utils.abid import abid_from_values
def calculate_abid(self):
"""
Return a freshly derived ABID (assembled from attrs defined in ABIDModel.abid_*_src).
"""
prefix = self.abid_prefix
ts = eval(self.abid_ts_src)
uri = eval(self.abid_uri_src)
subtype = eval(self.abid_subtype_src)
rand = eval(self.abid_rand_src)
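# e.g. for a Snapshot these eval to (self.added, self.url, "01", self.uuid),
# matching the abid_*_src values assigned in generate_snapshot_abids() below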
if (not prefix) or prefix == 'obj_':
suggested_abid = self.__class__.__name__[:3].lower()
raise Exception(f'{self.__class__.__name__}.abid_prefix must be defined to calculate ABIDs (suggested: {suggested_abid})')
if not ts:
ts = datetime.utcfromtimestamp(0)
print(f'[!] WARNING: Generating ABID with ts=0000000000 placeholder because {self.__class__.__name__}.abid_ts_src={self.abid_ts_src} is unset!', ts.isoformat())
if not uri:
uri = str(self)
print(f'[!] WARNING: Generating ABID with uri=str(self) placeholder because {self.__class__.__name__}.abid_uri_src={self.abid_uri_src} is unset!', uri)
if not subtype:
subtype = self.__class__.__name__
print(f'[!] WARNING: Generating ABID with subtype={subtype} placeholder because {self.__class__.__name__}.abid_subtype_src={self.abid_subtype_src} is unset!', subtype)
if not rand:
rand = getattr(self, 'uuid', None) or getattr(self, 'id', None) or getattr(self, 'pk')
print(f'[!] WARNING: Generating ABID with rand=self.id placeholder because {self.__class__.__name__}.abid_rand_src={self.abid_rand_src} is unset!', rand)
abid = abid_from_values(
prefix=prefix,
ts=ts,
uri=uri,
subtype=subtype,
rand=rand,
)
assert abid.ulid and abid.uuid and abid.typeid, f'Failed to calculate {prefix}_ABID for {self.__class__.__name__}'
return abid
def copy_snapshot_uuids(apps, schema_editor):
print(' Copying snapshot.id -> snapshot.uuid...')
Snapshot = apps.get_model("core", "Snapshot")
for snapshot in Snapshot.objects.all():
snapshot.uuid = snapshot.id
snapshot.save(update_fields=["uuid"])
def generate_snapshot_abids(apps, schema_editor):
print(' Generating snapshot.abid values...')
Snapshot = apps.get_model("core", "Snapshot")
for snapshot in Snapshot.objects.all():
snapshot.abid_prefix = 'snp_'
snapshot.abid_ts_src = 'self.added'
snapshot.abid_uri_src = 'self.url'
snapshot.abid_subtype_src = '"01"'
snapshot.abid_rand_src = 'self.uuid'
snapshot.abid = calculate_abid(snapshot)
snapshot.save(update_fields=["abid"])
def generate_archiveresult_abids(apps, schema_editor):
print(' Generating ArchiveResult.abid values... (may take an hour or longer for large collections...)')
ArchiveResult = apps.get_model("core", "ArchiveResult")
Snapshot = apps.get_model("core", "Snapshot")
for result in ArchiveResult.objects.all():
result.abid_prefix = 'res_'
result.snapshot = Snapshot.objects.get(pk=result.snapshot_id)
result.snapshot_added = result.snapshot.added
result.snapshot_url = result.snapshot.url
result.abid_ts_src = 'self.snapshot_added'
result.abid_uri_src = 'self.snapshot_url'
result.abid_subtype_src = 'self.extractor'
result.abid_rand_src = 'self.id'
result.abid = calculate_abid(result)
result.uuid = result.abid.uuid
result.save(update_fields=["abid", "uuid"])
class Migration(migrations.Migration):
dependencies = [
('core', '0023_alter_archiveresult_options_archiveresult_abid_and_more'),
]
operations = [
migrations.RunPython(copy_snapshot_uuids, reverse_code=migrations.RunPython.noop),
migrations.RunPython(generate_snapshot_abids, reverse_code=migrations.RunPython.noop),
migrations.RunPython(generate_archiveresult_abids, reverse_code=migrations.RunPython.noop),
]


@ -1,19 +0,0 @@
# Generated by Django 5.0.6 on 2024-05-13 12:08
import uuid
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('core', '0024_auto_20240513_1143'),
]
operations = [
migrations.AlterField(
model_name='archiveresult',
name='uuid',
field=models.UUIDField(default=uuid.uuid4, editable=False, unique=True),
),
]


@ -1,76 +0,0 @@
# Generated by Django 5.0.6 on 2024-05-13 13:01
import abid_utils.models
import django.db.models.deletion
import django.utils.timezone
from django.conf import settings
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('core', '0025_alter_archiveresult_uuid'),
migrations.swappable_dependency(settings.AUTH_USER_MODEL),
]
operations = [
migrations.AddField(
model_name='archiveresult',
name='created',
field=models.DateTimeField(auto_now_add=True, default=django.utils.timezone.now),
preserve_default=False,
),
migrations.AddField(
model_name='archiveresult',
name='created_by',
field=models.ForeignKey(default=abid_utils.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL),
),
migrations.AddField(
model_name='archiveresult',
name='modified',
field=models.DateTimeField(auto_now=True),
),
migrations.AddField(
model_name='snapshot',
name='created',
field=models.DateTimeField(auto_now_add=True, default=django.utils.timezone.now),
preserve_default=False,
),
migrations.AddField(
model_name='snapshot',
name='created_by',
field=models.ForeignKey(default=abid_utils.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL),
),
migrations.AddField(
model_name='snapshot',
name='modified',
field=models.DateTimeField(auto_now=True),
),
migrations.AddField(
model_name='tag',
name='created',
field=models.DateTimeField(auto_now_add=True, default=django.utils.timezone.now),
preserve_default=False,
),
migrations.AddField(
model_name='tag',
name='created_by',
field=models.ForeignKey(default=abid_utils.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL),
),
migrations.AddField(
model_name='tag',
name='modified',
field=models.DateTimeField(auto_now=True),
),
migrations.AddField(
model_name='tag',
name='uuid',
field=models.UUIDField(blank=True, null=True, unique=True),
),
migrations.AlterField(
model_name='archiveresult',
name='uuid',
field=models.UUIDField(blank=True, null=True, unique=True),
),
]


@ -10,7 +10,7 @@ class SearchResultsAdminMixin:
search_term = search_term.strip()
if not search_term:
return qs.distinct(), use_distinct
return qs, use_distinct
try:
qsearch = query_search_index(search_term)
qs = qs | qsearch
@ -18,4 +18,4 @@ class SearchResultsAdminMixin:
print(f'[!] Error while using search backend: {err.__class__.__name__} {err}')
messages.add_message(request, messages.WARNING, f'Error from the search backend, only showing results from default admin search fields - Error: {err}')
return qs.distinct(), use_distinct
return qs, use_distinct


@ -1,14 +1,11 @@
__package__ = 'archivebox.core'
from typing import Optional, List, Dict
from django_stubs_ext.db.models import TypedModelMeta
import uuid
import json
import uuid
from uuid import uuid4
from pathlib import Path
from typing import Optional, List
from django.db import models
from django.utils.functional import cached_property
@ -18,58 +15,40 @@ from django.urls import reverse
from django.db.models import Case, When, Value, IntegerField
from django.contrib.auth.models import User # noqa
from abid_utils.models import ABIDModel, ABIDField
from ..config import ARCHIVE_DIR, ARCHIVE_DIR_NAME
from ..system import get_dir_size
from ..util import parse_date, base_url
from ..util import parse_date, base_url, hashurl
from ..index.schema import Link
from ..index.html import snapshot_icons
from ..extractors import ARCHIVE_METHODS_INDEXING_PRECEDENCE, EXTRACTORS
from ..extractors import get_default_archive_methods, ARCHIVE_METHODS_INDEXING_PRECEDENCE
EXTRACTOR_CHOICES = [(extractor_name, extractor_name) for extractor_name in EXTRACTORS.keys()]
EXTRACTORS = [(extractor[0], extractor[0]) for extractor in get_default_archive_methods()]
STATUS_CHOICES = [
("succeeded", "succeeded"),
("failed", "failed"),
("skipped", "skipped")
]
try:
JSONField = models.JSONField
except AttributeError:
import jsonfield
JSONField = jsonfield.JSONField
# class BaseModel(models.Model):
# # TODO: migrate all models to a shared base class with all our standard fields and helpers:
# # ulid/created/modified/owner/is_deleted/as_json/from_json/etc.
# #
# # id = models.AutoField(primary_key=True, serialize=False, verbose_name='ID')
# # ulid = models.CharField(max_length=26, null=True, blank=True, db_index=True, unique=True)
# class Meta(TypedModelMeta):
# abstract = True
class Tag(ABIDModel):
class Tag(models.Model):
"""
Based on django-taggit model + ABID base.
Based on django-taggit model
"""
abid_prefix = 'tag_'
abid_ts_src = 'self.created' # TODO: add created/modified time
abid_uri_src = 'self.name'
abid_subtype_src = '"03"'
abid_rand_src = 'self.id'
# id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=True)
id = models.AutoField(primary_key=True, serialize=False, verbose_name='ID')
uuid = models.UUIDField(blank=True, null=True, editable=True, unique=True)
abid = ABIDField(prefix=abid_prefix)
name = models.CharField(unique=True, blank=False, max_length=100)
slug = models.SlugField(unique=True, blank=True, max_length=100)
# slug is autoset on save from name, never set it manually
slug = models.SlugField(unique=True, blank=True, max_length=100)
class Meta(TypedModelMeta):
class Meta:
verbose_name = "Tag"
verbose_name_plural = "Tags"
@ -105,16 +84,8 @@ class Tag(ABIDModel):
return super().save(*args, **kwargs)
class Snapshot(ABIDModel):
abid_prefix = 'snp_'
abid_ts_src = 'self.added'
abid_uri_src = 'self.url'
abid_subtype_src = '"01"'
abid_rand_src = 'self.id'
id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False) # legacy pk
uuid = models.UUIDField(blank=True, null=True, editable=True, unique=True)
abid = ABIDField(prefix=abid_prefix)
class Snapshot(models.Model):
id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False)
url = models.URLField(unique=True, db_index=True)
timestamp = models.CharField(max_length=32, unique=True, db_index=True)
@ -127,7 +98,6 @@ class Snapshot(ABIDModel):
keys = ('url', 'timestamp', 'title', 'tags', 'updated')
def __repr__(self) -> str:
title = self.title or '-'
return f'[{self.timestamp}] {self.url[:64]} ({title[:64]})'
@ -156,8 +126,8 @@ class Snapshot(ABIDModel):
from ..index import load_link_details
return load_link_details(self.as_link())
def tags_str(self, nocache=True) -> str | None:
cache_key = f'{self.pk}-{(self.updated or self.added).timestamp()}-tags'
def tags_str(self, nocache=True) -> str:
cache_key = f'{self.id}-{(self.updated or self.added).timestamp()}-tags'
calc_tags_str = lambda: ','.join(self.tags.order_by('name').values_list('name', flat=True))
if nocache:
tags_str = calc_tags_str()
@ -187,9 +157,13 @@ class Snapshot(ABIDModel):
return self.as_link().is_archived
@cached_property
def num_outputs(self) -> int:
def num_outputs(self):
return self.archiveresult_set.filter(status='succeeded').count()
@cached_property
def url_hash(self):
return hashurl(self.url)
@cached_property
def base_url(self):
return base_url(self.url)
@ -204,7 +178,7 @@ class Snapshot(ABIDModel):
@cached_property
def archive_size(self):
cache_key = f'{str(self.pk)[:12]}-{(self.updated or self.added).timestamp()}-size'
cache_key = f'{str(self.id)[:12]}-{(self.updated or self.added).timestamp()}-size'
def calc_dir_size():
try:
@ -225,7 +199,7 @@ class Snapshot(ABIDModel):
return None
@cached_property
def headers(self) -> Optional[Dict[str, str]]:
def headers(self) -> Optional[dict]:
try:
return json.loads((Path(self.link_dir) / 'headers.json').read_text(encoding='utf-8').strip())
except Exception:
@ -276,37 +250,11 @@ class Snapshot(ABIDModel):
tags_id = []
for tag in tags:
if tag.strip():
tags_id.append(Tag.objects.get_or_create(name=tag)[0].pk)
tags_id.append(Tag.objects.get_or_create(name=tag)[0].id)
self.tags.clear()
self.tags.add(*tags_id)
# def get_storage_dir(self, create=True, symlink=True) -> Path:
# date_str = self.added.strftime('%Y%m%d')
# domain_str = domain(self.url)
# abs_storage_dir = Path(ARCHIVE_DIR) / 'snapshots' / date_str / domain_str / str(self.ulid)
# if create and not abs_storage_dir.is_dir():
# abs_storage_dir.mkdir(parents=True, exist_ok=True)
# if symlink:
# LINK_PATHS = [
# Path(ARCHIVE_DIR).parent / 'index' / 'all_by_id' / str(self.ulid),
# # Path(ARCHIVE_DIR).parent / 'index' / 'snapshots_by_id' / str(self.ulid),
# Path(ARCHIVE_DIR).parent / 'index' / 'snapshots_by_date' / date_str / domain_str / str(self.ulid),
# Path(ARCHIVE_DIR).parent / 'index' / 'snapshots_by_domain' / domain_str / date_str / str(self.ulid),
# ]
# for link_path in LINK_PATHS:
# link_path.parent.mkdir(parents=True, exist_ok=True)
# try:
# link_path.symlink_to(abs_storage_dir)
# except FileExistsError:
# link_path.unlink()
# link_path.symlink_to(abs_storage_dir)
# return abs_storage_dir
class ArchiveResultManager(models.Manager):
def indexable(self, sorted: bool = True):
INDEXABLE_METHODS = [ r[0] for r in ARCHIVE_METHODS_INDEXING_PRECEDENCE ]
@ -318,22 +266,13 @@ class ArchiveResultManager(models.Manager):
return qs
class ArchiveResult(ABIDModel):
abid_prefix = 'res_'
abid_ts_src = 'self.snapshot.added'
abid_uri_src = 'self.snapshot.url'
abid_subtype_src = 'self.extractor'
abid_rand_src = 'self.uuid'
EXTRACTOR_CHOICES = EXTRACTOR_CHOICES
# id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False)
id = models.AutoField(primary_key=True, serialize=False, verbose_name='ID') # legacy pk
uuid = models.UUIDField(blank=True, null=True, editable=True, unique=True)
abid = ABIDField(prefix=abid_prefix)
class ArchiveResult(models.Model):
id = models.AutoField(primary_key=True, serialize=False, verbose_name='ID')
uuid = models.UUIDField(default=uuid.uuid4, editable=False)
snapshot = models.ForeignKey(Snapshot, on_delete=models.CASCADE)
extractor = models.CharField(choices=EXTRACTOR_CHOICES, max_length=32)
cmd = models.JSONField()
extractor = models.CharField(choices=EXTRACTORS, max_length=32)
cmd = JSONField()
pwd = models.CharField(max_length=256)
cmd_version = models.CharField(max_length=128, default=None, null=True, blank=True)
output = models.CharField(max_length=1024)
@ -343,69 +282,5 @@ class ArchiveResult(ABIDModel):
objects = ArchiveResultManager()
class Meta(TypedModelMeta):
verbose_name = 'Result'
def __str__(self):
return self.extractor
@cached_property
def snapshot_dir(self):
return Path(self.snapshot.link_dir)
@property
def extractor_module(self):
return EXTRACTORS[self.extractor]
def output_path(self) -> str:
"""return the canonical output filename or directory name within the snapshot dir"""
return self.extractor_module.get_output_path()
def embed_path(self) -> str:
"""
return the actual runtime-calculated path to the file on-disk that
should be used for user-facing iframe embeds of this result
"""
if hasattr(self.extractor_module, 'get_embed_path'):
return self.extractor_module.get_embed_path(self)
return self.extractor_module.get_output_path()
def legacy_output_path(self):
link = self.snapshot.as_link()
return link.canonical_outputs().get(f'{self.extractor}_path')
def output_exists(self) -> bool:
return Path(self.output_path()).exists()
# def get_storage_dir(self, create=True, symlink=True):
# date_str = self.snapshot.added.strftime('%Y%m%d')
# domain_str = domain(self.snapshot.url)
# abs_storage_dir = Path(ARCHIVE_DIR) / 'results' / date_str / domain_str / self.extractor / str(self.ulid)
# if create and not abs_storage_dir.is_dir():
# abs_storage_dir.mkdir(parents=True, exist_ok=True)
# if symlink:
# LINK_PATHS = [
# Path(ARCHIVE_DIR).parent / 'index' / 'all_by_id' / str(self.ulid),
# # Path(ARCHIVE_DIR).parent / 'index' / 'results_by_id' / str(self.ulid),
# # Path(ARCHIVE_DIR).parent / 'index' / 'results_by_date' / date_str / domain_str / self.extractor / str(self.ulid),
# Path(ARCHIVE_DIR).parent / 'index' / 'results_by_domain' / domain_str / date_str / self.extractor / str(self.ulid),
# Path(ARCHIVE_DIR).parent / 'index' / 'results_by_type' / self.extractor / date_str / domain_str / str(self.ulid),
# ]
# for link_path in LINK_PATHS:
# link_path.parent.mkdir(parents=True, exist_ok=True)
# try:
# link_path.symlink_to(abs_storage_dir)
# except FileExistsError:
# link_path.unlink()
# link_path.symlink_to(abs_storage_dir)
# return abs_storage_dir
# def symlink_index(self, create=True):
# abs_result_dir = self.get_storage_dir(create=create)


@ -10,7 +10,6 @@ from pathlib import Path
from django.utils.crypto import get_random_string
from ..config import (
CONFIG,
DEBUG,
SECRET_KEY,
ALLOWED_HOSTS,
@ -19,9 +18,7 @@ from ..config import (
CUSTOM_TEMPLATES_DIR,
SQL_INDEX_FILENAME,
OUTPUT_DIR,
ARCHIVE_DIR,
LOGS_DIR,
CACHE_DIR,
TIMEZONE,
LDAP,
@ -55,26 +52,6 @@ APPEND_SLASH = True
DEBUG = DEBUG or ('--debug' in sys.argv)
# add plugins folders to system path, and load plugins in installed_apps
BUILTIN_PLUGINS_DIR = PACKAGE_DIR / 'plugins'
USER_PLUGINS_DIR = OUTPUT_DIR / 'plugins'
sys.path.insert(0, str(BUILTIN_PLUGINS_DIR))
sys.path.insert(0, str(USER_PLUGINS_DIR))
def find_plugins(plugins_dir):
return {
# plugin_entrypoint.parent.name: import_module(plugin_entrypoint.parent.name).METADATA
plugin_entrypoint.parent.name: plugin_entrypoint.parent
for plugin_entrypoint in plugins_dir.glob('*/apps.py')
}
INSTALLED_PLUGINS = {
**find_plugins(BUILTIN_PLUGINS_DIR),
**find_plugins(USER_PLUGINS_DIR),
}
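# Illustrative sketch (hypothetical name): a user plugin that find_plugins() above would
# discover needs an importable package under OUTPUT_DIR/plugins/ containing an apps.py,
# since the directory name itself gets appended to INSTALLED_APPS below, e.g.:
#
#   # plugins/example_plugin/__init__.py  (empty)
#   # plugins/example_plugin/apps.py
#   from django.apps import AppConfig
#
#   class ExamplePluginConfig(AppConfig):
#       name = 'example_plugin'   # must match the plugin directory name
#
#       def ready(self):
#           pass                  # register the plugin's hooks/signals here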
INSTALLED_APPS = [
'django.contrib.auth',
'django.contrib.contenttypes',
@ -82,17 +59,8 @@ INSTALLED_APPS = [
'django.contrib.messages',
'django.contrib.staticfiles',
'django.contrib.admin',
'django_jsonform',
'signal_webhooks',
'abid_utils',
'plugantic',
'core',
'api',
*INSTALLED_PLUGINS.keys(),
'admin_data_views',
'django_extensions',
]
@ -204,17 +172,6 @@ if DEBUG_TOOLBAR:
]
MIDDLEWARE = [*MIDDLEWARE, 'debug_toolbar.middleware.DebugToolbarMiddleware']
# https://github.com/bensi94/Django-Requests-Tracker (improved version of django-debug-toolbar)
# Must delete archivebox/templates/admin to use because it relies on some things we override
# visit /__requests_tracker__/ to access
DEBUG_REQUESTS_TRACKER = False
if DEBUG_REQUESTS_TRACKER:
INSTALLED_APPS += ["requests_tracker"]
MIDDLEWARE += ["requests_tracker.middleware.requests_tracker_middleware"]
INTERNAL_IPS = ["127.0.0.1", "10.0.2.2", "0.0.0.0", "*"]
################################################################################
### Staticfile and Template Settings
################################################################################
@ -254,11 +211,6 @@ TEMPLATES = [
### External Service Settings
################################################################################
CACHE_DB_FILENAME = 'cache.sqlite3'
CACHE_DB_PATH = CACHE_DIR / CACHE_DB_FILENAME
CACHE_DB_TABLE = 'django_cache'
DATABASE_FILE = Path(OUTPUT_DIR) / SQL_INDEX_FILENAME
DATABASE_NAME = os.environ.get("ARCHIVEBOX_DATABASE_NAME", str(DATABASE_FILE))
@ -272,56 +224,23 @@ DATABASES = {
},
'TIME_ZONE': TIMEZONE,
# DB setup is sometimes modified at runtime by setup_django() in config.py
},
# 'cache': {
# 'ENGINE': 'django.db.backends.sqlite3',
# 'NAME': CACHE_DB_PATH,
# 'OPTIONS': {
# 'timeout': 60,
# 'check_same_thread': False,
# },
# 'TIME_ZONE': TIMEZONE,
# },
}
}
MIGRATION_MODULES = {'signal_webhooks': None}
# as much as I'd love this to be a UUID or ULID field, it's not supported yet as of Django 5.0
DEFAULT_AUTO_FIELD = 'django.db.models.BigAutoField'
CACHE_BACKEND = 'django.core.cache.backends.locmem.LocMemCache'
# CACHE_BACKEND = 'django.core.cache.backends.db.DatabaseCache'
# CACHE_BACKEND = 'django.core.cache.backends.dummy.DummyCache'
CACHES = {
'default': {'BACKEND': 'django.core.cache.backends.locmem.LocMemCache'},
# 'sqlite': {'BACKEND': 'django.core.cache.backends.db.DatabaseCache', 'LOCATION': 'cache'},
# 'dummy': {'BACKEND': 'django.core.cache.backends.dummy.DummyCache'},
# 'filebased': {"BACKEND": "django.core.cache.backends.filebased.FileBasedCache", "LOCATION": CACHE_DIR / 'cache_filebased'},
'default': {
'BACKEND': CACHE_BACKEND,
'LOCATION': 'django_cache_default',
}
}
EMAIL_BACKEND = 'django.core.mail.backends.console.EmailBackend'
STORAGES = {
"default": {
"BACKEND": "django.core.files.storage.FileSystemStorage",
},
"staticfiles": {
"BACKEND": "django.contrib.staticfiles.storage.StaticFilesStorage",
},
"archive": {
"BACKEND": "django.core.files.storage.FileSystemStorage",
"OPTIONS": {
"base_url": "/archive/",
"location": ARCHIVE_DIR,
},
},
# "personas": {
# "BACKEND": "django.core.files.storage.FileSystemStorage",
# "OPTIONS": {
# "base_url": "/personas/",
# "location": PERSONAS_DIR,
# },
# },
}
################################################################################
### Security Settings
################################################################################
@ -350,6 +269,9 @@ AUTH_PASSWORD_VALIDATORS = [
{'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator'},
]
# WIP: broken by Django 3.1.2 -> 4.0 migration
DEFAULT_AUTO_FIELD = 'django.db.models.UUIDField'
################################################################################
### Shell Settings
################################################################################
@ -368,6 +290,7 @@ if IS_SHELL:
LANGUAGE_CODE = 'en-us'
USE_I18N = True
USE_L10N = True
USE_TZ = True
DATETIME_FORMAT = 'Y-m-d g:iA'
SHORT_DATETIME_FORMAT = 'Y-m-d h:iA'
@ -448,54 +371,3 @@ LOGGING = {
}
},
}
# Add default webhook configuration to the User model
SIGNAL_WEBHOOKS_CUSTOM_MODEL = 'api.models.OutboundWebhook'
SIGNAL_WEBHOOKS = {
"HOOKS": {
# ... is a special sigil value that means "use the default autogenerated hooks"
"django.contrib.auth.models.User": ...,
"core.models.Snapshot": ...,
"core.models.ArchiveResult": ...,
"core.models.Tag": ...,
"api.models.APIToken": ...,
},
}
ADMIN_DATA_VIEWS = {
"NAME": "Environment",
"URLS": [
{
"route": "config/",
"view": "core.views.live_config_list_view",
"name": "Configuration",
"items": {
"route": "<str:key>/",
"view": "core.views.live_config_value_view",
"name": "config_val",
},
},
{
"route": "binaries/",
"view": "plugantic.views.binaries_list_view",
"name": "Binaries",
"items": {
"route": "<str:key>/",
"view": "plugantic.views.binary_detail_view",
"name": "binary",
},
},
{
"route": "plugins/",
"view": "plugantic.views.plugins_list_view",
"name": "Plugins",
"items": {
"route": "<str:key>/",
"view": "plugantic.views.plugin_detail_view",
"name": "plugin",
},
},
],
}


@ -1,4 +1,4 @@
__package__ = 'archivebox.core'
from django.contrib import admin
from django.urls import path, include
from django.views import static
@ -6,9 +6,7 @@ from django.contrib.staticfiles.urls import staticfiles_urlpatterns
from django.conf import settings
from django.views.generic.base import RedirectView
from .admin import archivebox_admin
from .views import HomepageView, SnapshotView, PublicIndexView, AddView, HealthCheckView
from core.views import HomepageView, SnapshotView, PublicIndexView, AddView, HealthCheckView
# GLOBAL_CONTEXT doesn't work as-is, disabled for now: https://github.com/ArchiveBox/ArchiveBox/discussions/1306
# from config import VERSION, VERSIONS_AVAILABLE, CAN_UPGRADE
@ -36,12 +34,13 @@ urlpatterns = [
path('accounts/', include('django.contrib.auth.urls')),
path('admin/', archivebox_admin.urls),
path('admin/', admin.site.urls),
path("api/", include('api.urls')),
# do not add extra_context like this, since not all admin views (e.g. ModelAdmin.autocomplete_view) accept extra kwargs
# path('admin/', admin.site.urls, {'extra_context': GLOBAL_CONTEXT}),
path('health/', HealthCheckView.as_view(), name='healthcheck'),
path('error/', lambda *_: 1/0),
path('error/', lambda _: 1/0),
# path('jet_api/', include('jet_django.urls')), Enable to use https://www.jetadmin.io/integrations/django
@ -52,10 +51,10 @@ urlpatterns = [
urlpatterns += staticfiles_urlpatterns()
if settings.DEBUG_TOOLBAR:
urlpatterns += [path('__debug__/', include("debug_toolbar.urls"))]
if settings.DEBUG_REQUESTS_TRACKER:
urlpatterns += [path("__requests_tracker__/", include("requests_tracker.urls"))]
import debug_toolbar
urlpatterns += [
path('__debug__/', include(debug_toolbar.urls)),
]
# # Proposed FUTURE URLs spec


@ -1,13 +1,10 @@
__package__ = 'archivebox.core'
from typing import Callable
from io import StringIO
from pathlib import Path
from contextlib import redirect_stdout
from django.shortcuts import render, redirect
from django.http import HttpRequest, HttpResponse, Http404
from django.http import HttpResponse, Http404
from django.utils.html import format_html, mark_safe
from django.views import View, static
from django.views.generic.list import ListView
@ -17,10 +14,6 @@ from django.contrib.auth.mixins import UserPassesTestMixin
from django.views.decorators.csrf import csrf_exempt
from django.utils.decorators import method_decorator
from admin_data_views.typing import TableContext, ItemContext
from admin_data_views.utils import render_with_table_view, render_with_item_view, ItemLink
from core.models import Snapshot
from core.forms import AddLinkForm
@ -33,18 +26,10 @@ from ..config import (
COMMIT_HASH,
FOOTER_INFO,
SNAPSHOTS_PER_PAGE,
CONFIG,
CONFIG_SCHEMA,
DYNAMIC_CONFIG_SCHEMA,
USER_CONFIG,
SAVE_ARCHIVE_DOT_ORG,
PREVIEW_ORIGINALS,
)
from ..logging_util import printable_filesize
from ..main import add
from ..util import base_url, ansi_to_html, htmlencode, urldecode, urlencode, ts_to_date_str
from ..util import base_url, ansi_to_html
from ..search import query_search_index
from ..extractors.wget import wget_output_path
class HomepageView(View):
@ -61,120 +46,10 @@ class HomepageView(View):
class SnapshotView(View):
# render static html index from filesystem archive/<timestamp>/index.html
@staticmethod
def render_live_index(request, snapshot):
TITLE_LOADING_MSG = 'Not yet archived...'
HIDDEN_RESULTS = ('favicon', 'headers', 'title', 'htmltotext', 'warc', 'archive_org')
archiveresults = {}
results = snapshot.archiveresult_set.all()
for result in results:
embed_path = result.embed_path()
abs_path = result.snapshot_dir / (embed_path or 'None')
if (result.status == 'succeeded'
and (result.extractor not in HIDDEN_RESULTS)
and embed_path
and abs_path.exists()):
if abs_path.is_dir() and not any(abs_path.glob('*.*')):
continue
result_info = {
'name': result.extractor,
'path': embed_path,
'ts': ts_to_date_str(result.end_ts),
'size': abs_path.stat().st_size or '?',
}
archiveresults[result.extractor] = result_info
existing_files = {result['path'] for result in archiveresults.values()}
min_size_threshold = 10_000 # bytes
allowed_extensions = {
'txt',
'html',
'htm',
'png',
'jpg',
'jpeg',
'gif',
'webp'
'svg',
'webm',
'mp4',
'mp3',
'pdf',
'md',
}
# iterate through all the files in the snapshot dir and add the biggest ones to the result list
snap_dir = Path(snapshot.link_dir)
for result_file in (*snap_dir.glob('*'), *snap_dir.glob('*/*')):
extension = result_file.suffix.lstrip('.').lower()
if result_file.is_dir() or result_file.name.startswith('.') or extension not in allowed_extensions:
continue
if result_file.name in existing_files or result_file.name == 'index.html':
continue
file_size = result_file.stat().st_size or 0
if file_size > min_size_threshold:
archiveresults[result_file.name] = {
'name': result_file.stem,
'path': result_file.relative_to(snap_dir),
'ts': ts_to_date_str(result_file.stat().st_mtime or 0),
'size': file_size,
}
preferred_types = ('singlefile', 'wget', 'screenshot', 'dom', 'media', 'pdf', 'readability', 'mercury')
all_types = preferred_types + tuple(result_type for result_type in archiveresults.keys() if result_type not in preferred_types)
best_result = {'path': 'None'}
for result_type in preferred_types:
if result_type in archiveresults:
best_result = archiveresults[result_type]
break
link = snapshot.as_link()
link_info = link._asdict(extended=True)
try:
warc_path = 'warc/' + list(Path(snap_dir).glob('warc/*.warc.*'))[0].name
except IndexError:
warc_path = 'warc/'
context = {
**link_info,
**link_info['canonical'],
'title': htmlencode(
link.title
or (link.base_url if link.is_archived else TITLE_LOADING_MSG)
),
'extension': link.extension or 'html',
'tags': link.tags or 'untagged',
'size': printable_filesize(link.archive_size) if link.archive_size else 'pending',
'status': 'archived' if link.is_archived else 'not yet archived',
'status_color': 'success' if link.is_archived else 'danger',
'oldest_archive_date': ts_to_date_str(link.oldest_archive_date),
'warc_path': warc_path,
'SAVE_ARCHIVE_DOT_ORG': SAVE_ARCHIVE_DOT_ORG,
'PREVIEW_ORIGINALS': PREVIEW_ORIGINALS,
'archiveresults': sorted(archiveresults.values(), key=lambda r: all_types.index(r['name']) if r['name'] in all_types else -r['size']),
'best_result': best_result,
# 'tags_str': 'somealskejrewlkrjwer,werlmwrwlekrjewlkrjwer324m532l,4m32,23m324234',
}
return render(template_name='core/snapshot_live.html', request=request, context=context)
def get(self, request, path):
if not request.user.is_authenticated and not PUBLIC_SNAPSHOTS:
return redirect(f'/admin/login/?next={request.path}')
snapshot = None
try:
slug, archivefile = path.split('/', 1)
except (IndexError, ValueError):
@ -190,11 +65,7 @@ class SnapshotView(View):
try:
try:
snapshot = Snapshot.objects.get(Q(timestamp=slug) | Q(id__startswith=slug))
if archivefile == 'index.html':
# if they requested snapshot index, serve live rendered template instead of static html
response = self.render_live_index(request, snapshot)
else:
response = static.serve(request, archivefile, document_root=snapshot.link_dir, show_indexes=True)
response = static.serve(request, archivefile, document_root=snapshot.link_dir, show_indexes=True)
response["Link"] = f'<{snapshot.url}>; rel="canonical"'
return response
except Snapshot.DoesNotExist:
@ -246,33 +117,26 @@ class SnapshotView(View):
status=404,
)
except Http404:
assert snapshot # (Snapshot.DoesNotExist is already handled above)
# Snapshot dir exists but file within does not e.g. 124235.324234/screenshot.png
return HttpResponse(
format_html(
(
'<center><br/><br/><br/>'
f'Snapshot <a href="/archive/{snapshot.timestamp}/index.html" target="_top"><b><code>[{snapshot.timestamp}]</code></b></a>: <a href="{snapshot.url}" target="_blank" rel="noreferrer">{snapshot.url}</a><br/>'
f'was queued on {str(snapshot.added).split(".")[0]}, '
f'but no files have been saved yet in:<br/><b><a href="/archive/{snapshot.timestamp}/" target="_top"><code>{snapshot.timestamp}</code></a><code>/'
f'Snapshot <a href="/archive/{snapshot.timestamp}/index.html" target="_top"><b><code>[{snapshot.timestamp}]</code></b></a> exists in DB, but resource <b><code>{snapshot.timestamp}/'
'{}'
f'</code></b><br/><br/>'
'It\'s possible {} '
f'during the last capture on {str(snapshot.added).split(".")[0]},<br/>or that the archiving process has not completed yet.<br/>'
f'<pre><code># run this cmd to finish/retry archiving this Snapshot</code><br/>'
f'<code style="user-select: all; color: #333">archivebox update -t timestamp {snapshot.timestamp}</code></pre><br/><br/>'
f'</code></b> does not exist in <a href="/archive/{snapshot.timestamp}/" target="_top">snapshot dir</a> yet.<br/><br/>'
'Maybe this resource type is not available for this Snapshot,<br/>or the archiving process has not completed yet?<br/>'
f'<pre><code># run this cmd to finish archiving this Snapshot<br/>archivebox update -t timestamp {snapshot.timestamp}</code></pre><br/><br/>'
'<div class="text-align: left; width: 100%; max-width: 400px">'
'<i><b>Next steps:</b></i><br/>'
f'- list all the <a href="/archive/{snapshot.timestamp}/" target="_top">Snapshot files <code>.*</code></a><br/>'
f'- view the <a href="/archive/{snapshot.timestamp}/index.html" target="_top">Snapshot <code>./index.html</code></a><br/>'
f'- go to the <a href="/admin/core/snapshot/{snapshot.pk}/change/" target="_top">Snapshot admin</a> to edit<br/>'
f'- go to the <a href="/admin/core/snapshot/?uuid__startswith={snapshot.uuid}" target="_top">Snapshot actions</a> to re-archive<br/>'
f'- go to the <a href="/admin/core/snapshot/{snapshot.id}/change/" target="_top">Snapshot admin</a> to edit<br/>'
f'- go to the <a href="/admin/core/snapshot/?id__startswith={snapshot.id}" target="_top">Snapshot actions</a> to re-archive<br/>'
'- or return to <a href="/" target="_top">the main index...</a></div>'
'</center>'
),
archivefile if str(archivefile) != 'None' else '',
f'the {archivefile} resource could not be fetched' if str(archivefile) != 'None' else 'the original site was not available',
archivefile,
),
content_type="text/html",
status=404,
@ -367,7 +231,7 @@ class PublicIndexView(ListView):
qs = qs | query_search_index(query)
except Exception as err:
print(f'[!] Error while using search backend: {err.__class__.__name__} {err}')
return qs.distinct()
return qs
def get(self, *args, **kwargs):
if PUBLIC_INDEX or self.request.user.is_authenticated:
@ -448,124 +312,3 @@ class HealthCheckView(View):
content_type='text/plain',
status=200
)
def find_config_section(key: str) -> str:
matching_sections = [
name for name, opts in CONFIG_SCHEMA.items() if key in opts
]
section = matching_sections[0] if matching_sections else 'DYNAMIC'
return section
def find_config_default(key: str) -> str:
default_val = USER_CONFIG.get(key, {}).get('default', lambda: None)
if isinstance(default_val, Callable):
return None
else:
default_val = repr(default_val)
return default_val
def find_config_type(key: str) -> str:
if key in USER_CONFIG:
return USER_CONFIG[key]['type'].__name__
elif key in DYNAMIC_CONFIG_SCHEMA:
return type(CONFIG[key]).__name__
return 'str'
def key_is_safe(key: str) -> bool:
for term in ('key', 'password', 'secret', 'token'):
if term in key.lower():
return False
return True
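# e.g. key_is_safe('TIMEOUT') -> True but key_is_safe('LDAP_PASSWORD') -> False,
# so secret-looking values get rendered as '********' by the config views below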
@render_with_table_view
def live_config_list_view(request: HttpRequest, **kwargs) -> TableContext:
assert request.user.is_superuser, 'Must be a superuser to view configuration settings.'
rows = {
"Section": [],
"Key": [],
"Type": [],
"Value": [],
"Default": [],
# "Documentation": [],
"Aliases": [],
}
for section in CONFIG_SCHEMA.keys():
for key in CONFIG_SCHEMA[section].keys():
rows['Section'].append(section) # section.replace('_', ' ').title().replace(' Config', '')
rows['Key'].append(ItemLink(key, key=key))
rows['Type'].append(mark_safe(f'<code>{find_config_type(key)}</code>'))
rows['Value'].append(mark_safe(f'<code>{CONFIG[key]}</code>') if key_is_safe(key) else '******** (redacted)')
rows['Default'].append(mark_safe(f'<a href="https://github.com/search?q=repo%3AArchiveBox%2FArchiveBox+path%3Aconfig.py+%27{key}%27&type=code"><code style="text-decoration: underline">{find_config_default(key) or "See here..."}</code></a>'))
# rows['Documentation'].append(mark_safe(f'Wiki: <a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#{key.lower()}">{key}</a>'))
rows['Aliases'].append(', '.join(CONFIG_SCHEMA[section][key].get('aliases', [])))
section = 'DYNAMIC'
for key in DYNAMIC_CONFIG_SCHEMA.keys():
rows['Section'].append(section) # section.replace('_', ' ').title().replace(' Config', '')
rows['Key'].append(ItemLink(key, key=key))
rows['Type'].append(mark_safe(f'<code>{find_config_type(key)}</code>'))
rows['Value'].append(mark_safe(f'<code>{CONFIG[key]}</code>') if key_is_safe(key) else '******** (redacted)')
rows['Default'].append(mark_safe(f'<a href="https://github.com/search?q=repo%3AArchiveBox%2FArchiveBox+path%3Aconfig.py+%27{key}%27&type=code"><code style="text-decoration: underline">{find_config_default(key) or "See here..."}</code></a>'))
# rows['Documentation'].append(mark_safe(f'Wiki: <a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#{key.lower()}">{key}</a>'))
rows['Aliases'].append(ItemLink(key, key=key) if key in USER_CONFIG else '')
return TableContext(
title="Computed Configuration Values",
table=rows,
)
@render_with_item_view
def live_config_value_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
assert request.user.is_superuser, 'Must be a superuser to view configuration settings.'
aliases = USER_CONFIG.get(key, {}).get("aliases", [])
return ItemContext(
slug=key,
title=key,
data=[
{
"name": mark_safe(f'data / ArchiveBox.conf &nbsp; [{find_config_section(key)}] &nbsp; <b><code style="color: lightgray">{key}</code></b>' if key in USER_CONFIG else f'[DYNAMIC CONFIG] &nbsp; <b><code style="color: lightgray">{key}</code></b> &nbsp; <small>(calculated at runtime)</small>'),
"description": None,
"fields": {
'Key': key,
'Type': find_config_type(key),
'Value': CONFIG[key] if key_is_safe(key) else '********',
},
"help_texts": {
'Key': mark_safe(f'''
<a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#{key.lower()}">Documentation</a> &nbsp;
<span style="display: {"inline" if aliases else "none"}">
Aliases: {", ".join(aliases)}
</span>
'''),
'Type': mark_safe(f'''
<a href="https://github.com/search?q=repo%3AArchiveBox%2FArchiveBox+path%3Aconfig.py+%27{key}%27&type=code">
See full definition in <code>archivebox/config.py</code>...
</a>
'''),
'Value': mark_safe(f'''
{'<b style="color: red">Value is redacted for your security. (Passwords, secrets, API tokens, etc. cannot be viewed in the Web UI)</b><br/><br/>' if not key_is_safe(key) else ''}
Default: <a href="https://github.com/search?q=repo%3AArchiveBox%2FArchiveBox+path%3Aconfig.py+%27{key}%27&type=code">
<code>{find_config_default(key) or 'See here...'}</code>
</a>
<br/><br/>
<p style="display: {"block" if key in USER_CONFIG else "none"}">
<i>To change this value, edit <code>data/ArchiveBox.conf</code> or run:</i>
<br/><br/>
<code>archivebox config --set {key}="{
val.strip("'")
if (val := find_config_default(key)) else
(repr(CONFIG[key] if key_is_safe(key) else '********')).strip("'")
}"</code>
</p>
'''),
},
},
],
)


@ -1,13 +1,11 @@
__package__ = 'archivebox.extractors'
from typing import Callable, Optional, Dict, List, Iterable, Union, Protocol, cast
import os
import sys
from pathlib import Path
from importlib import import_module
from datetime import datetime, timezone
from typing import Callable, Optional, List, Iterable, Union
from datetime import datetime, timezone
from django.db.models import QuerySet
from ..config import (
@ -133,7 +131,7 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s
link = load_link_details(link, out_dir=out_dir)
write_link_details(link, out_dir=out_dir, skip_sql_index=False)
log_link_archiving_started(link, str(out_dir), is_new)
log_link_archiving_started(link, out_dir, is_new)
link = link.overwrite(updated=datetime.now(timezone.utc))
stats = {'skipped': 0, 'succeeded': 0, 'failed': 0}
start_ts = datetime.now(timezone.utc)
@ -160,13 +158,23 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s
# bump the updated time on the main Snapshot here, this is critical
# to be able to cache summaries of the ArchiveResults for a given
# snapshot without having to load all the results from the DB each time.
# (we use {Snapshot.pk}-{Snapshot.updated} as the cache key and assume
# (we use {Snapshot.id}-{Snapshot.updated} as the cache key and assume
# ArchiveResults are unchanged as long as the updated timestamp is unchanged)
snapshot.save()
else:
# print('{black} X {}{reset}'.format(method_name, **ANSI))
stats['skipped'] += 1
except Exception as e:
# Disabled until https://github.com/ArchiveBox/ArchiveBox/issues/984
# and https://github.com/ArchiveBox/ArchiveBox/issues/1014
# are fixed.
"""
raise Exception('Exception in archive_methods.save_{}(Link(url={}))'.format(
method_name,
link.url,
)) from e
"""
# Instead, use the kludgy workaround from
# https://github.com/ArchiveBox/ArchiveBox/issues/984#issuecomment-1150541627
with open(ERROR_LOG, "a", encoding='utf-8') as f:
command = ' '.join(sys.argv)
@ -178,13 +186,6 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s
ts
) + "\n" + str(e) + "\n"))
#f.write(f"\n> {command}; ts={ts} version={config['VERSION']} docker={config['IN_DOCKER']} is_tty={config['IS_TTY']}\n")
# print(f' ERROR: {method_name} {e.__class__.__name__}: {e} {getattr(e, "hints", "")}', ts, link.url, command)
raise Exception('Exception in archive_methods.save_{}(Link(url={}))'.format(
method_name,
link.url,
)) from e
# print(' ', stats)
@ -217,7 +218,7 @@ def archive_links(all_links: Union[Iterable[Link], QuerySet], overwrite: bool=Fa
if type(all_links) is QuerySet:
num_links: int = all_links.count()
get_link = lambda x: x.as_link_with_details()
get_link = lambda x: x.as_link()
all_links = all_links.iterator()
else:
num_links: int = len(all_links)
@ -242,37 +243,3 @@ def archive_links(all_links: Union[Iterable[Link], QuerySet], overwrite: bool=Fa
log_archiving_finished(num_links)
return all_links
EXTRACTORS_DIR = Path(__file__).parent
class ExtractorModuleProtocol(Protocol):
"""Type interface for an Extractor Module (WIP)"""
get_output_path: Callable
# TODO:
# get_embed_path: Callable | None
# should_extract(Snapshot)
# extract(Snapshot)
def get_extractors(dir: Path=EXTRACTORS_DIR) -> Dict[str, ExtractorModuleProtocol]:
"""iterate through archivebox/extractors/*.py and load extractor modules"""
EXTRACTORS = {}
for filename in EXTRACTORS_DIR.glob('*.py'):
if filename.name.startswith('__'):
continue
extractor_name = filename.name.replace('.py', '')
extractor_module = cast(ExtractorModuleProtocol, import_module(f'.{extractor_name}', package=__package__))
assert getattr(extractor_module, 'get_output_path')
EXTRACTORS[extractor_name] = extractor_module
return EXTRACTORS
EXTRACTORS = get_extractors(EXTRACTORS_DIR)
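# Illustrative sketch (hypothetical extractor name): a module dropped into archivebox/extractors/
# only needs get_output_path() to satisfy the assert above; get_embed_path() is optional, and the
# built-in extractors below also follow the should_save_*/save_* convention, e.g.:
#
#   # archivebox/extractors/example.py
#   def get_output_path():
#       return 'example.txt'
#
#   def get_embed_path(archiveresult=None):
#       return get_output_path()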


@ -10,12 +10,10 @@ from ..system import run, chmod_file
from ..util import (
enforce_types,
is_static_file,
dedupe,
)
from ..config import (
TIMEOUT,
CURL_ARGS,
CURL_EXTRA_ARGS,
CHECK_SSL_VALIDITY,
SAVE_ARCHIVE_DOT_ORG,
CURL_BINARY,
@ -24,8 +22,6 @@ from ..config import (
)
from ..logging_util import TimedProgress
def get_output_path():
return 'archive.org.txt'
@enforce_types
@ -34,7 +30,7 @@ def should_save_archive_dot_org(link: Link, out_dir: Optional[Path]=None, overwr
return False
out_dir = out_dir or Path(link.link_dir)
if not overwrite and (out_dir / get_output_path()).exists():
if not overwrite and (out_dir / 'archive.org.txt').exists():
# if open(path, 'r', encoding='utf-8').read().strip() != 'None':
return False
@ -45,21 +41,16 @@ def save_archive_dot_org(link: Link, out_dir: Optional[Path]=None, timeout: int=
"""submit site to archive.org for archiving via their service, save returned archive url"""
out_dir = out_dir or Path(link.link_dir)
output: ArchiveOutput = get_output_path()
output: ArchiveOutput = 'archive.org.txt'
archive_org_url = None
submit_url = 'https://web.archive.org/save/{}'.format(link.url)
# later options take precedence
options = [
cmd = [
CURL_BINARY,
*CURL_ARGS,
*CURL_EXTRA_ARGS,
'--head',
'--max-time', str(timeout),
*(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []),
*([] if CHECK_SSL_VALIDITY else ['--insecure']),
]
cmd = [
CURL_BINARY,
*dedupe(options),
submit_url,
]
status = 'succeeded'
@ -90,7 +81,7 @@ def save_archive_dot_org(link: Link, out_dir: Optional[Path]=None, timeout: int=
archive_org_url = archive_org_url or submit_url
with open(str(out_dir / output), 'w', encoding='utf-8') as f:
f.write(archive_org_url)
chmod_file(str(out_dir / output), cwd=str(out_dir))
chmod_file('archive.org.txt', cwd=str(out_dir))
output = archive_org_url
return ArchiveResult(


@ -19,9 +19,6 @@ from ..config import (
from ..logging_util import TimedProgress
def get_output_path():
return 'output.html'
@enforce_types
def should_save_dom(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
@ -29,8 +26,8 @@ def should_save_dom(link: Link, out_dir: Optional[Path]=None, overwrite: Optiona
return False
out_dir = out_dir or Path(link.link_dir)
if not overwrite and (out_dir / get_output_path()).exists():
if (out_dir / get_output_path()).stat().st_size > 1:
if not overwrite and (out_dir / 'output.html').exists():
if (out_dir / 'output.html').stat().st_size > 1:
return False
return SAVE_DOM
@ -40,7 +37,7 @@ def save_dom(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) ->
"""print HTML of site to file using chrome --dump-html"""
out_dir = out_dir or Path(link.link_dir)
output: ArchiveOutput = get_output_path()
output: ArchiveOutput = 'output.html'
output_path = out_dir / output
cmd = [
*chrome_args(),


@ -6,18 +6,13 @@ from typing import Optional
from ..index.schema import Link, ArchiveResult, ArchiveOutput
from ..system import chmod_file, run
from ..util import (
enforce_types,
domain,
dedupe,
)
from ..util import enforce_types, domain
from ..config import (
TIMEOUT,
SAVE_FAVICON,
FAVICON_PROVIDER,
CURL_BINARY,
CURL_ARGS,
CURL_EXTRA_ARGS,
CURL_VERSION,
CHECK_SSL_VALIDITY,
CURL_USER_AGENT,
@ -33,29 +28,19 @@ def should_save_favicon(link: Link, out_dir: Optional[str]=None, overwrite: Opti
return SAVE_FAVICON
@enforce_types
def get_output_path():
return 'favicon.ico'
@enforce_types
def save_favicon(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
"""download site favicon from google's favicon api"""
out_dir = out_dir or link.link_dir
output: ArchiveOutput = 'favicon.ico'
# later options take precedence
options = [
cmd = [
CURL_BINARY,
*CURL_ARGS,
*CURL_EXTRA_ARGS,
'--max-time', str(timeout),
'--output', str(output),
*(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []),
*([] if CHECK_SSL_VALIDITY else ['--insecure']),
]
cmd = [
CURL_BINARY,
*dedupe(options),
FAVICON_PROVIDER.format(domain(link.url)),
]
status = 'failed'


@ -26,19 +26,6 @@ from ..config import (
from ..logging_util import TimedProgress
def get_output_path():
return 'git/'
def get_embed_path(archiveresult=None):
if not archiveresult:
return get_output_path()
try:
return get_output_path() + list((archiveresult.snapshot_dir / get_output_path()).glob('*'))[0].name + '/'
except IndexError:
pass
return get_output_path()
@enforce_types
def should_save_git(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
@ -46,7 +33,7 @@ def should_save_git(link: Link, out_dir: Optional[Path]=None, overwrite: Optiona
return False
out_dir = out_dir or Path(link.link_dir)
if not overwrite and (out_dir / get_output_path()).exists():
if not overwrite and (out_dir / 'git').exists():
return False
is_clonable_url = (
@ -64,7 +51,7 @@ def save_git(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) ->
"""download full site using git"""
out_dir = out_dir or Path(link.link_dir)
output: ArchiveOutput = get_output_path()
output: ArchiveOutput = 'git'
output_path = out_dir / output
output_path.mkdir(exist_ok=True)
cmd = [

View file

@ -9,13 +9,11 @@ from ..system import atomic_write
from ..util import (
enforce_types,
get_headers,
dedupe,
)
from ..config import (
TIMEOUT,
CURL_BINARY,
CURL_ARGS,
CURL_EXTRA_ARGS,
CURL_USER_AGENT,
CURL_VERSION,
CHECK_SSL_VALIDITY,
@ -23,14 +21,10 @@ from ..config import (
)
from ..logging_util import TimedProgress
def get_output_path():
return 'headers.json'
@enforce_types
def should_save_headers(link: Link, out_dir: Optional[str]=None, overwrite: Optional[bool]=False) -> bool:
out_dir = out_dir or Path(link.link_dir)
if not overwrite and (out_dir / get_output_path()).exists():
if not overwrite and (out_dir / 'headers.json').exists():
return False
return SAVE_HEADERS
@ -42,28 +36,24 @@ def save_headers(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT)
out_dir = Path(out_dir or link.link_dir)
output_folder = out_dir.absolute()
output: ArchiveOutput = get_output_path()
output: ArchiveOutput = 'headers.json'
status = 'succeeded'
timer = TimedProgress(timeout, prefix=' ')
# later options take precedence
options = [
cmd = [
CURL_BINARY,
*CURL_ARGS,
*CURL_EXTRA_ARGS,
'--head',
'--max-time', str(timeout),
*(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []),
*([] if CHECK_SSL_VALIDITY else ['--insecure']),
]
cmd = [
CURL_BINARY,
*dedupe(options),
link.url,
]
try:
json_headers = get_headers(link.url, timeout=timeout)
output_folder.mkdir(exist_ok=True)
atomic_write(str(output_folder / get_output_path()), json_headers)
atomic_write(str(output_folder / "headers.json"), json_headers)
except (Exception, OSError) as err:
status = 'failed'
output = err


@ -19,12 +19,6 @@ from ..util import (
)
from .title import get_html
def get_output_path():
return "htmltotext.txt"
class HTMLTextExtractor(HTMLParser):
TEXT_ATTRS = [
"alt", "cite", "href", "label",
@ -115,7 +109,7 @@ def should_save_htmltotext(link: Link, out_dir: Optional[Path]=None, overwrite:
return False
out_dir = out_dir or Path(link.link_dir)
if not overwrite and (out_dir / get_output_path()).exists():
if not overwrite and (out_dir / 'htmltotext.txt').exists():
return False
return SAVE_HTMLTOTEXT
@ -126,12 +120,10 @@ def save_htmltotext(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO
"""extract search-indexing-friendly text from an HTML document"""
out_dir = Path(out_dir or link.link_dir)
output = get_output_path()
cmd = ['(internal) archivebox.extractors.htmltotext', './{singlefile,dom}.html']
output = "htmltotext.txt"
timer = TimedProgress(timeout, prefix=' ')
extracted_text = None
status = 'failed'
try:
extractor = HTMLTextExtractor()
document = get_html(link, out_dir)
@ -144,9 +136,10 @@ def save_htmltotext(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO
extracted_text = str(extractor)
atomic_write(str(out_dir / output), extracted_text)
status = 'succeeded'
except (Exception, OSError) as err:
status = 'failed'
output = err
cmd = ['(internal) archivebox.extractors.htmltotext', './{singlefile,dom}.html']
finally:
timer.end()
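For context on the HTMLTextExtractor(HTMLParser) class referenced above, here is a heavily simplified, hypothetical sketch of the html.parser approach it is built on (the real class also harvests text-bearing attributes like alt/cite/href/label via TEXT_ATTRS and handles many more edge cases):

from html.parser import HTMLParser

class SimpleTextExtractor(HTMLParser):
    """Collect visible text from an HTML document, skipping <script>/<style> bodies."""
    def __init__(self):
        super().__init__()
        self.chunks = []
        self.skip_depth = 0                      # >0 while inside <script> or <style>

    def handle_starttag(self, tag, attrs):
        if tag in ('script', 'style'):
            self.skip_depth += 1

    def handle_endtag(self, tag):
        if tag in ('script', 'style') and self.skip_depth:
            self.skip_depth -= 1

    def handle_data(self, data):
        if not self.skip_depth and data.strip():
            self.chunks.append(data.strip())

    def __str__(self):
        return '\n'.join(self.chunks)

extractor = SimpleTextExtractor()
extractor.feed('<html><body><h1>Hi</h1><script>var x = 1;</script><p>there</p></body></html>')
print(str(extractor))                            # -> 'Hi\nthere'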


@ -8,13 +8,11 @@ from ..system import run, chmod_file
from ..util import (
enforce_types,
is_static_file,
dedupe,
)
from ..config import (
MEDIA_TIMEOUT,
SAVE_MEDIA,
YOUTUBEDL_ARGS,
YOUTUBEDL_EXTRA_ARGS,
YOUTUBEDL_BINARY,
YOUTUBEDL_VERSION,
CHECK_SSL_VALIDITY
@ -22,27 +20,13 @@ from ..config import (
from ..logging_util import TimedProgress
def get_output_path():
return 'media/'
def get_embed_path(archiveresult=None):
if not archiveresult:
return get_output_path()
out_dir = archiveresult.snapshot_dir / get_output_path()
try:
return get_output_path() + list(out_dir.glob('*.mp4'))[0].name
except IndexError:
return get_output_path()
@enforce_types
def should_save_media(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
if is_static_file(link.url):
return False
out_dir = out_dir or Path(link.link_dir)
if not overwrite and (out_dir / get_output_path()).exists():
if not overwrite and (out_dir / 'media').exists():
return False
return SAVE_MEDIA
@ -52,19 +36,14 @@ def save_media(link: Link, out_dir: Optional[Path]=None, timeout: int=MEDIA_TIME
"""Download playlists or individual video, audio, and subtitles using youtube-dl or yt-dlp"""
out_dir = out_dir or Path(link.link_dir)
output: ArchiveOutput = get_output_path()
output: ArchiveOutput = 'media'
output_path = out_dir / output
output_path.mkdir(exist_ok=True)
# later options take precedence
options = [
*YOUTUBEDL_ARGS,
*YOUTUBEDL_EXTRA_ARGS,
*([] if CHECK_SSL_VALIDITY else ['--no-check-certificate']),
# TODO: add --cookies-from-browser={CHROME_USER_DATA_DIR}
]
cmd = [
YOUTUBEDL_BINARY,
*dedupe(options),
*YOUTUBEDL_ARGS,
*([] if CHECK_SSL_VALIDITY else ['--no-check-certificate']),
# TODO: add --cookies-from-browser={CHROME_USER_DATA_DIR}
link.url,
]
status = 'succeeded'


@ -11,25 +11,17 @@ from ..system import run, atomic_write
from ..util import (
enforce_types,
is_static_file,
dedupe,
)
from ..config import (
TIMEOUT,
SAVE_MERCURY,
DEPENDENCIES,
MERCURY_VERSION,
MERCURY_ARGS,
MERCURY_EXTRA_ARGS,
)
from ..logging_util import TimedProgress
def get_output_path():
return 'mercury/'
def get_embed_path(archiveresult=None):
return get_output_path() + 'content.html'
@enforce_types
def ShellError(cmd: List[str], result: CompletedProcess, lines: int=20) -> ArchiveError:
@ -50,7 +42,7 @@ def should_save_mercury(link: Link, out_dir: Optional[str]=None, overwrite: Opti
return False
out_dir = out_dir or Path(link.link_dir)
if not overwrite and (out_dir / get_output_path()).exists():
if not overwrite and (out_dir / 'mercury').exists():
return False
return SAVE_MERCURY
@ -61,23 +53,19 @@ def save_mercury(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT)
"""download reader friendly version using @postlight/mercury-parser"""
out_dir = Path(out_dir or link.link_dir)
output_folder = out_dir.absolute() / get_output_path()
output = get_output_path()
output_folder = out_dir.absolute() / "mercury"
output = "mercury"
status = 'succeeded'
timer = TimedProgress(timeout, prefix=' ')
try:
output_folder.mkdir(exist_ok=True)
# later options take precedence
options = [
*MERCURY_ARGS,
*MERCURY_EXTRA_ARGS,
]
# By default, get plain text version of article
# Get plain text version of article
cmd = [
DEPENDENCIES['MERCURY_BINARY']['path'],
link.url,
*dedupe(options)
"--format=text"
]
result = run(cmd, cwd=out_dir, timeout=timeout)
try:


@ -19,17 +19,13 @@ from ..config import (
from ..logging_util import TimedProgress
def get_output_path():
return 'output.pdf'
@enforce_types
def should_save_pdf(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
if is_static_file(link.url):
return False
out_dir = out_dir or Path(link.link_dir)
if not overwrite and (out_dir / get_output_path()).exists():
if not overwrite and (out_dir / 'output.pdf').exists():
return False
return SAVE_PDF
@ -40,7 +36,7 @@ def save_pdf(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) ->
"""print PDF of site to file using chrome --headless"""
out_dir = out_dir or Path(link.link_dir)
output: ArchiveOutput = get_output_path()
output: ArchiveOutput = 'output.pdf'
cmd = [
*chrome_args(),
'--print-to-pdf',
@ -55,7 +51,7 @@ def save_pdf(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) ->
hints = (result.stderr or result.stdout).decode()
raise ArchiveError('Failed to save PDF', hints)
chmod_file(get_output_path(), cwd=str(out_dir))
chmod_file('output.pdf', cwd=str(out_dir))
except Exception as err:
status = 'failed'
output = err


@ -22,12 +22,6 @@ from ..config import (
from ..logging_util import TimedProgress
from .title import get_html
def get_output_path():
return 'readability/'
def get_embed_path(archiveresult=None):
return get_output_path() + 'content.html'
@enforce_types
def should_save_readability(link: Link, out_dir: Optional[str]=None, overwrite: Optional[bool]=False) -> bool:
@ -35,7 +29,7 @@ def should_save_readability(link: Link, out_dir: Optional[str]=None, overwrite:
return False
out_dir = out_dir or Path(link.link_dir)
if not overwrite and (out_dir / get_output_path()).exists():
if not overwrite and (out_dir / 'readability').exists():
return False
return SAVE_READABILITY
@ -46,8 +40,8 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO
"""download reader friendly version using @mozilla/readability"""
out_dir = Path(out_dir or link.link_dir)
output_folder = out_dir.absolute() / get_output_path()
output = get_output_path()
output_folder = out_dir.absolute() / "readability"
output = "readability"
# Readability Docs: https://github.com/mozilla/readability


@ -19,9 +19,6 @@ from ..config import (
from ..logging_util import TimedProgress
def get_output_path():
return 'screenshot.png'
@enforce_types
def should_save_screenshot(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
@ -29,7 +26,7 @@ def should_save_screenshot(link: Link, out_dir: Optional[Path]=None, overwrite:
return False
out_dir = out_dir or Path(link.link_dir)
if not overwrite and (out_dir / get_output_path()).exists():
if not overwrite and (out_dir / 'screenshot.png').exists():
return False
return SAVE_SCREENSHOT
@ -39,7 +36,7 @@ def save_screenshot(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO
"""take screenshot of site using chrome --headless"""
out_dir = out_dir or Path(link.link_dir)
output: ArchiveOutput = get_output_path()
output: ArchiveOutput = 'screenshot.png'
cmd = [
*chrome_args(),
'--screenshot',


@ -11,7 +11,6 @@ from ..util import (
enforce_types,
is_static_file,
chrome_args,
dedupe,
)
from ..config import (
TIMEOUT,
@ -19,24 +18,18 @@ from ..config import (
DEPENDENCIES,
SINGLEFILE_VERSION,
SINGLEFILE_ARGS,
SINGLEFILE_EXTRA_ARGS,
CHROME_BINARY,
COOKIES_FILE,
)
from ..logging_util import TimedProgress
def get_output_path():
return 'singlefile.html'
@enforce_types
def should_save_singlefile(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
if is_static_file(link.url):
return False
out_dir = out_dir or Path(link.link_dir)
if not overwrite and (out_dir / get_output_path()).exists():
if not overwrite and (out_dir / 'singlefile.html').exists():
return False
return SAVE_SINGLEFILE
@ -47,30 +40,43 @@ def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO
"""download full site using single-file"""
out_dir = out_dir or Path(link.link_dir)
output = get_output_path()
output = "singlefile.html"
browser_args = chrome_args(CHROME_TIMEOUT=0)
# SingleFile CLI Docs: https://github.com/gildas-lormeau/SingleFile/tree/master/cli
browser_args = '--browser-args={}'.format(json.dumps(browser_args[1:]))
# later options take precedence
options = [
'--browser-executable-path={}'.format(CHROME_BINARY),
*(["--browser-cookies-file={}".format(COOKIES_FILE)] if COOKIES_FILE else []),
browser_args,
*SINGLEFILE_ARGS,
*SINGLEFILE_EXTRA_ARGS,
'--browser-executable-path={}'.format(CHROME_BINARY),
browser_args,
]
# Deduplicate options (single-file doesn't like when you use the same option two times)
#
# NOTE: Option names that come first clobber conflicting names that come later.
# My logic is that SINGLEFILE_ARGS affects the single-file command with the most
# specificity, therefore the user sets it with the most intent, therefore it should take precedence,
# kind of like the ergonomic principle of lexical scope in programming languages.
seen_option_names = []
def test_seen(argument):
option_name = argument.split("=")[0]
if option_name in seen_option_names:
return False
else:
seen_option_names.append(option_name)
return True
deduped_options = list(filter(test_seen, options))
cmd = [
DEPENDENCIES['SINGLEFILE_BINARY']['path'],
*dedupe(options),
*deduped_options,
link.url,
output,
]
status = 'succeeded'
timer = TimedProgress(timeout, prefix=' ')
result = None
try:
result = run(cmd, cwd=str(out_dir), timeout=timeout)
@ -78,7 +84,7 @@ def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO
# "Downloaded: 76 files, 4.0M in 1.6s (2.52 MB/s)"
output_tail = [
line.strip()
for line in (result.stdout + result.stderr).decode().rsplit('\n', 5)[-5:]
for line in (result.stdout + result.stderr).decode().rsplit('\n', 3)[-3:]
if line.strip()
]
hints = (
@ -88,14 +94,12 @@ def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO
# Check for common failure cases
if (result.returncode > 0) or not (out_dir / output).is_file():
raise ArchiveError(f'SingleFile was not able to archive the page (status={result.returncode})', hints)
raise ArchiveError('SingleFile was not able to archive the page', hints)
chmod_file(output, cwd=str(out_dir))
except (Exception, OSError) as err:
status = 'failed'
# TODO: Make this prettier. This is necessary to run the command (escape JSON internal quotes).
cmd[2] = browser_args.replace('"', "\\\"")
if result:
err.hints = (result.stdout + result.stderr).decode().split('\n')
output = err
finally:
timer.end()
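Note the precedence difference between the two variants in this hunk: the dev-branch dedupe() keeps whichever value appears later in the options list, while the test_seen filter above keeps the first occurrence of each option name, which is why the two versions place SINGLEFILE_ARGS at opposite ends of the list. A quick illustration of the first-wins behavior (the paths are made up for the example):

options = [
    '--browser-executable-path=/usr/bin/chromium',          # from SINGLEFILE_ARGS, listed first
    '--browser-args=["--headless=new"]',
    '--browser-executable-path=/usr/bin/google-chrome',     # extractor default, same option name
]

seen_option_names = []
def test_seen(argument):
    option_name = argument.split("=")[0]
    if option_name in seen_option_names:
        return False
    seen_option_names.append(option_name)
    return True

print(list(filter(test_seen, options)))
# -> ['--browser-executable-path=/usr/bin/chromium', '--browser-args=["--headless=new"]']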


@ -10,7 +10,6 @@ from ..util import (
enforce_types,
download_url,
htmldecode,
dedupe,
)
from ..config import (
TIMEOUT,
@ -18,7 +17,6 @@ from ..config import (
SAVE_TITLE,
CURL_BINARY,
CURL_ARGS,
CURL_EXTRA_ARGS,
CURL_VERSION,
CURL_USER_AGENT,
)
@ -60,7 +58,6 @@ class TitleParser(HTMLParser):
if tag.lower() == "title":
self.inside_title_tag = False
@enforce_types
def get_html(link: Link, path: Path, timeout: int=TIMEOUT) -> str:
"""
@ -78,20 +75,13 @@ def get_html(link: Link, path: Path, timeout: int=TIMEOUT) -> str:
with open(abs_path / source, "r", encoding="utf-8") as f:
document = f.read()
break
except (FileNotFoundError, TypeError, UnicodeDecodeError):
except (FileNotFoundError, TypeError):
continue
if document is None:
return download_url(link.url, timeout=timeout)
else:
return document
def get_output_path():
# TODO: actually save title to this file
# (currently only saved in ArchiveResult.output as charfield value, not saved to filesystem)
return 'title.json'
@enforce_types
def should_save_title(link: Link, out_dir: Optional[str]=None, overwrite: Optional[bool]=False) -> bool:
# if link already has valid title, skip it
@ -112,17 +102,12 @@ def save_title(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -
from core.models import Snapshot
output: ArchiveOutput = None
# later options take precedence
options = [
cmd = [
CURL_BINARY,
*CURL_ARGS,
*CURL_EXTRA_ARGS,
'--max-time', str(timeout),
*(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []),
*([] if CHECK_SSL_VALIDITY else ['--insecure']),
]
cmd = [
CURL_BINARY,
*dedupe(options),
link.url,
]
status = 'succeeded'


@ -15,11 +15,9 @@ from ..util import (
path,
domain,
urldecode,
dedupe,
)
from ..config import (
WGET_ARGS,
WGET_EXTRA_ARGS,
TIMEOUT,
SAVE_WGET,
SAVE_WARC,
@ -35,18 +33,6 @@ from ..config import (
from ..logging_util import TimedProgress
def get_output_path():
# TODO: actually save output into this folder, instead of do {domain}/**/index.html
return 'wget/'
def get_embed_path(archiveresult=None):
if not archiveresult:
return get_output_path()
link = archiveresult.snapshot.as_link()
return wget_output_path(link)
@enforce_types
def should_save_wget(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
output_path = wget_output_path(link)
@ -69,10 +55,10 @@ def save_wget(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) ->
# WGET CLI Docs: https://www.gnu.org/software/wget/manual/wget.html
output: ArchiveOutput = None
# later options take precedence
options = [
cmd = [
WGET_BINARY,
# '--server-response', # print headers for better error parsing
*WGET_ARGS,
*WGET_EXTRA_ARGS,
'--timeout={}'.format(timeout),
*(['--restrict-file-names={}'.format(RESTRICT_FILE_NAMES)] if RESTRICT_FILE_NAMES else []),
*(['--warc-file={}'.format(str(warc_path))] if SAVE_WARC else []),
@ -82,11 +68,6 @@ def save_wget(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) ->
*(['--compression=auto'] if WGET_AUTO_COMPRESSION else []),
*([] if SAVE_WARC else ['--timestamping']),
*([] if CHECK_SSL_VALIDITY else ['--no-check-certificate', '--no-hsts']),
# '--server-response', # print headers for better error parsing
]
cmd = [
WGET_BINARY,
*dedupe(options),
link.url,
]
@ -145,38 +126,64 @@ def save_wget(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) ->
@enforce_types
def unsafe_wget_output_path(link: Link) -> Optional[str]:
# There used to be a bunch of complex reverse-engineering path mapping logic here,
# but it was removed in favor of just walking through the output folder recursively to try to find the
# html file that wget produced. It's *much much much* slower than deriving it statically, and is currently
# one of the main bottlenecks of ArchiveBox's performance (the output data is often on a slow HDD or network mount).
# But it's STILL better than trying to figure out URL -> html filepath mappings ourselves from first principles.
def wget_output_path(link: Link) -> Optional[str]:
"""calculate the path to the wgetted .html file, since wget may
adjust some paths to be different than the base_url path.
See docs on wget --adjust-extension (-E)
"""
# Wget downloads can save in a number of different ways depending on the url:
# https://example.com
# > example.com/index.html
# https://example.com?v=zzVa_tX1OiI
# > example.com/index.html?v=zzVa_tX1OiI.html
# https://www.example.com/?v=zzVa_tX1OiI
# > example.com/index.html?v=zzVa_tX1OiI.html
# https://example.com/abc
# > example.com/abc.html
# https://example.com/abc/
# > example.com/abc/index.html
# https://example.com/abc?v=zzVa_tX1OiI.html
# > example.com/abc?v=zzVa_tX1OiI.html
# https://example.com/abc/?v=zzVa_tX1OiI.html
# > example.com/abc/index.html?v=zzVa_tX1OiI.html
# https://example.com/abc/test.html
# > example.com/abc/test.html
# https://example.com/abc/test?v=zzVa_tX1OiI
# > example.com/abc/test?v=zzVa_tX1OiI.html
# https://example.com/abc/test/?v=zzVa_tX1OiI
# > example.com/abc/test/index.html?v=zzVa_tX1OiI.html
# There's also lots of complexity around how the urlencoding and renaming
# is done for pages with query and hash fragments or extensions like shtml / htm / php / etc
# Since the wget algorithm for -E (appending .html) is incredibly complex
# and there's no way to get the computed output path from wget
# in order to avoid having to reverse-engineer how they calculate it,
# we just look in the output folder and read the filename wget used from the filesystem
full_path = without_fragment(without_query(path(link.url))).strip('/')
search_dir = Path(link.link_dir) / domain(link.url).replace(":", "+") / urldecode(full_path)
for _ in range(4):
try:
if search_dir.exists():
if search_dir.is_dir():
html_files = [
f for f in search_dir.iterdir()
if re.search(".+\\.[Ss]?[Hh][Tt][Mm][Ll]?$", str(f), re.I | re.M)
]
if html_files:
return str(html_files[0].relative_to(link.link_dir))
if search_dir.exists():
if search_dir.is_dir():
html_files = [
f for f in search_dir.iterdir()
if re.search(".+\\.[Ss]?[Hh][Tt][Mm][Ll]?$", str(f), re.I | re.M)
]
if html_files:
return str(html_files[0].relative_to(link.link_dir))
# sometimes wget'd URLs have no ext and return non-html
# e.g. /some/example/rss/all -> some RSS XML content)
# /some/other/url.o4g -> some binary unrecognized ext)
# test this with archivebox add --depth=1 https://getpocket.com/users/nikisweeting/feed/all
last_part_of_url = urldecode(full_path.rsplit('/', 1)[-1])
for file_present in search_dir.iterdir():
if file_present == last_part_of_url:
return str((search_dir / file_present).relative_to(link.link_dir))
except OSError:
# OSError 36 and others can happen here, caused by trying to check for impossible paths
# (paths derived from URLs can often contain illegal unicode characters or be too long,
# causing the OS / filesystem to reject trying to open them with a system-level error)
pass
# sometimes wget'd URLs have no ext and return non-html
# e.g. /some/example/rss/all -> some RSS XML content)
# /some/other/url.o4g -> some binary unrecognized ext)
# test this with archivebox add --depth=1 https://getpocket.com/users/nikisweeting/feed/all
last_part_of_url = urldecode(full_path.rsplit('/', 1)[-1])
for file_present in search_dir.iterdir():
if file_present == last_part_of_url:
return str((search_dir / file_present).relative_to(link.link_dir))
# Move up one directory level
search_dir = search_dir.parent
@ -186,101 +193,13 @@ def unsafe_wget_output_path(link: Link) -> Optional[str]:
# check for literally any file present that isnt an empty folder
domain_dir = Path(domain(link.url).replace(":", "+"))
files_within = [path for path in (Path(link.link_dir) / domain_dir).glob('**/*.*') if not str(path).endswith('.orig')]
files_within = list((Path(link.link_dir) / domain_dir).glob('**/*.*'))
if files_within:
return str((domain_dir / files_within[-1]).relative_to(link.link_dir))
# abandon all hope, wget either never downloaded, or it produced an output path so horribly mutilated
# that it's better we just pretend it doesnt exist
# this is why ArchiveBox specializes in REDUNDANTLY saving copies of sites with multiple different tools
return None
@enforce_types
def wget_output_path(link: Link) -> Optional[str]:
"""calculate the path to the wgetted .html file, since wget may
adjust some paths to be different than the base_url path.
See docs on: wget --adjust-extension (-E), --restrict-file-names=windows|unix|ascii, --convert-links
WARNING: this function is extremely error prone because mapping URLs to filesystem paths deterministically
is basically impossible. Every OS and filesystem have different requirements on what special characters are
allowed, and URLs are *full* of all kinds of special characters, illegal unicode, and generally unsafe strings
that you dont want anywhere near your filesystem. Also URLs can be obscenely long, but most filesystems dont
accept paths longer than 250 characters. On top of all that, this function only exists to try to reverse engineer
wget's approach to solving this problem, so this is a shittier, less tested version of their already insanely
complicated attempt to do this. Here be dragons:
- https://github.com/ArchiveBox/ArchiveBox/issues/549
- https://github.com/ArchiveBox/ArchiveBox/issues/1373
- https://stackoverflow.com/questions/9532499/check-whether-a-path-is-valid-in-python-without-creating-a-file-at-the-paths-ta
- and probably many more that I didn't realize were caused by this...
The only constructive thing we could possibly do to this function is to figure out how to remove it.
Preach loudly to anyone who will listen: never attempt to map URLs to filesystem paths,
and pray you never have to deal with the aftermath of someone else's attempt to do so...
"""
# Wget downloads can save in a number of different ways depending on the url:
# https://example.com
# > example.com/index.html
# https://example.com?v=zzVa_tX1OiI
# > example.com/index.html@v=zzVa_tX1OiI.html
# https://www.example.com/?v=zzVa_tX1OiI
# > example.com/index.html@v=zzVa_tX1OiI.html
# https://example.com/abc
# > example.com/abc.html
# https://example.com/abc/
# > example.com/abc/index.html
# https://example.com/abc?v=zzVa_tX1OiI.html
# > example.com/abc@v=zzVa_tX1OiI.html
# https://example.com/abc/?v=zzVa_tX1OiI.html
# > example.com/abc/index.html@v=zzVa_tX1OiI.html
# https://example.com/abc/test.html
# > example.com/abc/test.html
# https://example.com/abc/test?v=zzVa_tX1OiI
# > example.com/abc/test@v=zzVa_tX1OiI.html
# https://example.com/abc/test/?v=zzVa_tX1OiI
# > example.com/abc/test/index.html@v=zzVa_tX1OiI.html
# There's also lots of complexity around how the urlencoding and renaming
# is done for pages with query and hash fragments, extensions like shtml / htm / php / etc,
# unicode escape sequences, punycode domain names, unicode double-width characters, extensions longer than
# 4 characters, paths with multiple extensions, etc. the list goes on...
output_path = None
try:
output_path = unsafe_wget_output_path(link)
except Exception as err:
pass # better to pretend it just failed to download than expose gnarly OSErrors to users
# check for unprintable unicode characters
# https://github.com/ArchiveBox/ArchiveBox/issues/1373
if output_path:
safe_path = output_path.encode('utf-8', 'replace').decode()
if output_path != safe_path:
# contains unprintable unicode characters that will break other parts of archivebox
# better to pretend it doesnt exist and fallback to parent dir than crash archivebox
output_path = None
# check for a path that is just too long to safely handle across different OS's
# https://github.com/ArchiveBox/ArchiveBox/issues/549
if output_path and len(output_path) > 250:
output_path = None
if output_path:
return output_path
# fallback to just the domain dir
search_dir = Path(link.link_dir) / domain(link.url).replace(":", "+")
if search_dir.is_dir():
return domain(link.url).replace(":", "+")
# fallback to just the domain dir without port
search_dir = Path(link.link_dir) / domain(link.url).split(":", 1)[0]
if search_dir.is_dir():
return domain(link.url).split(":", 1)[0]
return None
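To make the URL-to-filesystem mapping described in the comments above more concrete, here is a rough standalone sketch of how the initial search directory is derived, using urllib.parse directly instead of ArchiveBox's domain()/path()/urldecode() helpers (so details like urlencoding, punycode, and userinfo handling will differ from the real code, and the example paths are hypothetical):

from pathlib import Path
from urllib.parse import urlsplit, unquote

def wget_search_dir(link_dir: str, url: str) -> Path:
    """Approximate the folder that wget_output_path() starts searching in for a given URL."""
    parts = urlsplit(url)
    domain_dir = parts.netloc.replace(':', '+')   # example.com:8080 -> example.com+8080
    full_path = unquote(parts.path).strip('/')    # query and fragment dropped, slashes trimmed
    return Path(link_dir) / domain_dir / full_path

print(wget_search_dir('/data/archive/1556862152', 'https://example.com:8080/abc/test?v=zzVa_tX1OiI'))
# -> /data/archive/1556862152/example.com+8080/abc/test

From that starting point the code above walks up through parent directories looking for an .html file, then falls back to the bare domain folder (with or without the port) before giving up and returning None.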


@ -250,7 +250,7 @@ def load_main_index(out_dir: Path=OUTPUT_DIR, warn: bool=True) -> List[Link]:
"""parse and load existing index with any new links from import_path merged in"""
from core.models import Snapshot
try:
return Snapshot.objects.all().only('id')
return Snapshot.objects.all()
except (KeyboardInterrupt, SystemExit):
raise SystemExit(0)
@ -407,7 +407,7 @@ def snapshot_filter(snapshots: QuerySet, filter_patterns: List[str], filter_type
def get_indexed_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
"""indexed links without checking archive status or data directory validity"""
links = (snapshot.as_link() for snapshot in snapshots.iterator())
links = [snapshot.as_link_with_details() for snapshot in snapshots.iterator()]
return {
link.link_dir: link
for link in links
@ -415,7 +415,7 @@ def get_indexed_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Option
def get_archived_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
"""indexed links that are archived with a valid data directory"""
links = (snapshot.as_link() for snapshot in snapshots.iterator())
links = [snapshot.as_link_with_details() for snapshot in snapshots.iterator()]
return {
link.link_dir: link
for link in filter(is_archived, links)
@ -423,7 +423,7 @@ def get_archived_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optio
def get_unarchived_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
"""indexed links that are unarchived with no data directory or an empty data directory"""
links = (snapshot.as_link() for snapshot in snapshots.iterator())
links = [snapshot.as_link_with_details() for snapshot in snapshots.iterator()]
return {
link.link_dir: link
for link in filter(is_unarchived, links)


@ -118,10 +118,10 @@ def render_django_template(template: str, context: Mapping[str, str]) -> str:
def snapshot_icons(snapshot) -> str:
cache_key = f'{snapshot.pk}-{(snapshot.updated or snapshot.added).timestamp()}-snapshot-icons'
cache_key = f'{snapshot.id}-{(snapshot.updated or snapshot.added).timestamp()}-snapshot-icons'
def calc_snapshot_icons():
from core.models import EXTRACTOR_CHOICES
from core.models import EXTRACTORS
# start = datetime.now(timezone.utc)
archive_results = snapshot.archiveresult_set.filter(status="succeeded", output__isnull=False)
@ -147,12 +147,12 @@ def snapshot_icons(snapshot) -> str:
# Missing specific entry for WARC
extractor_outputs = defaultdict(lambda: None)
for extractor, _ in EXTRACTOR_CHOICES:
for extractor, _ in EXTRACTORS:
for result in archive_results:
if result.extractor == extractor and result:
extractor_outputs[extractor] = result
for extractor, _ in EXTRACTOR_CHOICES:
for extractor, _ in EXTRACTORS:
if extractor not in exclude:
existing = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output
# Check filesystem to see if anything is actually present (too slow, needs optimization/caching)


@ -4,7 +4,6 @@ WARNING: THIS FILE IS ALL LEGACY CODE TO BE REMOVED.
DO NOT ADD ANY NEW FEATURES TO THIS FILE, NEW CODE GOES HERE: core/models.py
These are the old types we used to use before ArchiveBox v0.4 (before we switched to Django).
"""
__package__ = 'archivebox.index'
@ -192,9 +191,6 @@ class Link:
if extended:
info.update({
'snapshot_id': self.snapshot_id,
'snapshot_uuid': self.snapshot_uuid,
'snapshot_abid': self.snapshot_abid,
'link_dir': self.link_dir,
'archive_path': self.archive_path,
@ -263,22 +259,10 @@ class Link:
return to_csv(self, cols=cols or self.field_names(), separator=separator, ljust=ljust)
@cached_property
def snapshot(self):
from core.models import Snapshot
return Snapshot.objects.only('uuid').get(url=self.url)
@cached_property
def snapshot_id(self):
return str(self.snapshot.pk)
@cached_property
def snapshot_uuid(self):
return str(self.snapshot.uuid)
@cached_property
def snapshot_abid(self):
return str(self.snapshot.ABID)
from core.models import Snapshot
return str(Snapshot.objects.only('id').get(url=self.url).id)
@classmethod
def field_names(cls):
@ -395,15 +379,11 @@ class Link:
output_paths = (
domain(self.url),
'output.html',
'output.pdf',
'screenshot.png',
'singlefile.html',
'readability/content.html',
'mercury/content.html',
'htmltotext.txt',
'output.html',
'media',
'git',
'singlefile.html'
)
return any(


@ -45,8 +45,7 @@ def write_link_to_sql_index(link: Link):
info.pop('tags')
try:
snapshot = Snapshot.objects.get(url=link.url)
info["timestamp"] = snapshot.timestamp
info["timestamp"] = Snapshot.objects.get(url=link.url).timestamp
except Snapshot.DoesNotExist:
while Snapshot.objects.filter(timestamp=info["timestamp"]).exists():
info["timestamp"] = str(float(info["timestamp"]) + 1.0)
@ -58,7 +57,7 @@ def write_link_to_sql_index(link: Link):
for entry in entries:
if isinstance(entry, dict):
result, _ = ArchiveResult.objects.get_or_create(
snapshot_id=snapshot.pk,
snapshot_id=snapshot.id,
extractor=extractor,
start_ts=parse_date(entry['start_ts']),
defaults={
@ -72,7 +71,7 @@ def write_link_to_sql_index(link: Link):
)
else:
result, _ = ArchiveResult.objects.update_or_create(
snapshot_id=snapshot.pk,
snapshot_id=snapshot.id,
extractor=extractor,
start_ts=parse_date(entry.start_ts),
defaults={
@ -143,12 +142,7 @@ def list_migrations(out_dir: Path=OUTPUT_DIR) -> List[Tuple[bool, str]]:
def apply_migrations(out_dir: Path=OUTPUT_DIR) -> List[str]:
from django.core.management import call_command
null, out = StringIO(), StringIO()
try:
call_command("makemigrations", interactive=False, stdout=null)
except Exception as e:
print('[!] Failed to create some migrations. Please open an issue and copy paste this output for help: {}'.format(e))
print()
call_command("makemigrations", interactive=False, stdout=null)
call_command("migrate", interactive=False, stdout=out)
out.seek(0)


@ -432,14 +432,12 @@ def log_archive_method_finished(result: "ArchiveResult"):
**ANSI,
),
]
# import pudb; pudb.set_trace()
# Prettify error output hints string and limit to five lines
hints = getattr(result.output, 'hints', None) or ()
if hints:
if isinstance(hints, (list, tuple, type(_ for _ in ()))):
hints = [hint.decode() if isinstance(hint, bytes) else str(hint) for hint in hints]
hints = [hint.decode() for hint in hints if isinstance(hint, bytes)]
else:
if isinstance(hints, bytes):
hints = hints.decode()
@ -638,15 +636,17 @@ def printable_folder_status(name: str, folder: Dict) -> str:
@enforce_types
def printable_dependency_version(name: str, dependency: Dict) -> str:
color, symbol, note, version = 'red', 'X', 'invalid', '?'
version = None
if dependency['enabled']:
if dependency['is_valid']:
color, symbol, note = 'green', '', 'valid'
color, symbol, note, version = 'green', '', 'valid', ''
parsed_version_num = re.search(r'[\d\.]+', dependency['version'])
if parsed_version_num:
version = f'v{parsed_version_num[0]}'
if not version:
color, symbol, note, version = 'red', 'X', 'invalid', '?'
else:
color, symbol, note, version = 'lightyellow', '-', 'disabled', '-'
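The version parsing above just grabs the first run of digits and dots out of whatever the binary printed on its version line. A quick hypothetical illustration:

import re

raw = 'curl 8.4.0 (x86_64-pc-linux-gnu) libcurl/8.4.0 OpenSSL/3.0.2'   # example output, not real config
parsed_version_num = re.search(r'[\d\.]+', raw)
print(f'v{parsed_version_num[0]}' if parsed_version_num else '?')      # -> v8.4.0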


@ -104,6 +104,7 @@ from .config import (
COMMIT_HASH,
BUILD_TIME,
CODE_LOCATIONS,
EXTERNAL_LOCATIONS,
DATA_LOCATIONS,
DEPENDENCIES,
CHROME_BINARY,
@ -230,7 +231,7 @@ def version(quiet: bool=False,
p = platform.uname()
print(
'ArchiveBox v{}'.format(get_version(CONFIG)),
f'COMMIT_HASH={COMMIT_HASH[:7] if COMMIT_HASH else "unknown"}',
*((f'COMMIT_HASH={COMMIT_HASH[:7]}',) if COMMIT_HASH else ()),
f'BUILD_TIME={BUILD_TIME}',
)
print(
@ -271,6 +272,11 @@ def version(quiet: bool=False,
for name, path in CODE_LOCATIONS.items():
print(printable_folder_status(name, path))
print()
print('{white}[i] Secrets locations:{reset}'.format(**ANSI))
for name, path in EXTERNAL_LOCATIONS.items():
print(printable_folder_status(name, path))
print()
if DATA_LOCATIONS['OUTPUT_DIR']['is_valid']:
print('{white}[i] Data locations:{reset}'.format(**ANSI))
@ -689,7 +695,7 @@ def add(urls: Union[str, List[str]],
if CAN_UPGRADE:
hint(f"There's a new version of ArchiveBox available! Your current version is {VERSION}. You can upgrade to {VERSIONS_AVAILABLE['recommended_version']['tag_name']} ({VERSIONS_AVAILABLE['recommended_version']['html_url']}). For more on how to upgrade: https://github.com/ArchiveBox/ArchiveBox/wiki/Upgrading-or-Merging-Archives\n")
return new_links
return all_links
@enforce_types
def remove(filter_str: Optional[str]=None,
@ -785,8 +791,6 @@ def update(resume: Optional[float]=None,
out_dir: Path=OUTPUT_DIR) -> List[Link]:
"""Import any new links from subscriptions and retry any previously failed/skipped links"""
from core.models import ArchiveResult
check_data_folder(out_dir=out_dir)
check_dependencies()
new_links: List[Link] = [] # TODO: Remove input argument: only_new
@ -794,23 +798,19 @@ def update(resume: Optional[float]=None,
extractors = extractors.split(",") if extractors else []
# Step 1: Filter for selected_links
print('[*] Finding matching Snapshots to update...')
print(f' - Filtering by {" ".join(filter_patterns)} ({filter_type}) {before=} {after=} {status=}...')
matching_snapshots = list_links(
filter_patterns=filter_patterns,
filter_type=filter_type,
before=before,
after=after,
)
print(f' - Checking {matching_snapshots.count()} snapshot folders for existing data with {status=}...')
matching_folders = list_folders(
links=matching_snapshots,
status=status,
out_dir=out_dir,
)
all_links = (link for link in matching_folders.values() if link)
print(' - Sorting by most unfinished -> least unfinished + date archived...')
all_links = sorted(all_links, key=lambda link: (ArchiveResult.objects.filter(snapshot__url=link.url).count(), link.timestamp))
all_links = [link for link in matching_folders.values() if link]
if index_only:
for link in all_links:
@ -836,7 +836,6 @@ def update(resume: Optional[float]=None,
if extractors:
archive_kwargs["methods"] = extractors
archive_links(to_archive, overwrite=overwrite, **archive_kwargs)
# Step 4: Re-write links index with updated titles, icons, and resources
@ -1356,7 +1355,7 @@ def manage(args: Optional[List[str]]=None, out_dir: Path=OUTPUT_DIR) -> None:
if (args and "createsuperuser" in args) and (IN_DOCKER and not IS_TTY):
stderr('[!] Warning: you need to pass -it to use interactive commands in docker', color='lightyellow')
stderr(' docker run -it archivebox manage {}'.format(' '.join(args or ['...'])), color='lightyellow')
stderr('')
stderr()
execute_from_command_line([f'{ARCHIVEBOX_BINARY} manage', *(args or ['help'])])


@ -7,7 +7,7 @@ if __name__ == '__main__':
# versions of ./manage.py commands whenever possible. When that's not possible
# (e.g. makemigrations), you can comment out this check temporarily
if not ('makemigrations' in sys.argv or 'migrate' in sys.argv or 'startapp' in sys.argv):
if not ('makemigrations' in sys.argv or 'migrate' in sys.argv):
print("[X] Don't run ./manage.py directly (unless you are a developer running makemigrations):")
print()
print(' Hint: Use these archivebox CLI commands instead of the ./manage.py equivalents:')


@ -1,16 +0,0 @@
__package__ = 'archivebox'
import django_stubs_ext
django_stubs_ext.monkeypatch()
# monkey patch django timezone to add back utc (it was removed in Django 5.0)
import datetime
from django.utils import timezone
timezone.utc = datetime.timezone.utc
# monkey patch django-signals-webhooks to change how it shows up in Admin UI
# from signal_webhooks.apps import DjangoSignalWebhooksConfig
# DjangoSignalWebhooksConfig.verbose_name = 'API'

File diff suppressed because it is too large


@ -1,6 +1,6 @@
{
"name": "archivebox",
"version": "0.8.1",
"version": "0.7.2",
"description": "ArchiveBox: The self-hosted internet archive",
"author": "Nick Sweeting <archivebox-npm@sweeting.me>",
"repository": "github:ArchiveBox/ArchiveBox",
@ -8,6 +8,6 @@
"dependencies": {
"@postlight/parser": "^2.2.3",
"readability-extractor": "github:ArchiveBox/readability-extractor",
"single-file-cli": "^1.1.54"
"single-file-cli": "^1.1.46"
}
}


@ -7,6 +7,7 @@ For examples of supported import formats see tests/.
__package__ = 'archivebox.parsers'
import re
from io import StringIO
from typing import IO, Tuple, List, Optional
@ -27,6 +28,7 @@ from ..util import (
htmldecode,
download_url,
enforce_types,
URL_REGEX,
)
from ..index.schema import Link
from ..logging_util import TimedProgress, log_source_saved
@ -42,7 +44,6 @@ from . import medium_rss
from . import netscape_html
from . import generic_rss
from . import generic_json
from . import generic_jsonl
from . import generic_html
from . import generic_txt
from . import url_list
@ -62,7 +63,6 @@ PARSERS = {
netscape_html.KEY: (netscape_html.NAME, netscape_html.PARSER),
generic_rss.KEY: (generic_rss.NAME, generic_rss.PARSER),
generic_json.KEY: (generic_json.NAME, generic_json.PARSER),
generic_jsonl.KEY: (generic_jsonl.NAME, generic_jsonl.PARSER),
generic_html.KEY: (generic_html.NAME, generic_html.PARSER),
# Catchall fallback parser
@ -200,3 +200,54 @@ def save_file_as_source(path: str, timeout: int=TIMEOUT, filename: str='{ts}-{ba
log_source_saved(source_file=source_path)
return source_path
# Check that plain text regex URL parsing works as expected
# this is last-line-of-defense to make sure the URL_REGEX isn't
# misbehaving due to some OS-level or environment level quirks (e.g. bad regex lib)
# the consequences of bad URL parsing could be disastrous and lead to many
# incorrect/badly parsed links being added to the archive, so this is worth the cost of checking
_test_url_strs = {
'example.com': 0,
'/example.com': 0,
'//example.com': 0,
':/example.com': 0,
'://example.com': 0,
'htt://example8.com': 0,
'/htt://example.com': 0,
'https://example': 1,
'https://localhost/2345': 1,
'https://localhost:1234/123': 1,
'://': 0,
'https://': 0,
'http://': 0,
'ftp://': 0,
'ftp://example.com': 0,
'https://example.com': 1,
'https://example.com/': 1,
'https://a.example.com': 1,
'https://a.example.com/': 1,
'https://a.example.com/what/is/happening.html': 1,
'https://a.example.com/what/ís/happening.html': 1,
'https://a.example.com/what/is/happening.html?what=1&2%20b#höw-about-this=1a': 1,
'https://a.example.com/what/is/happéning/?what=1&2%20b#how-aboüt-this=1a': 1,
'HTtpS://a.example.com/what/is/happening/?what=1&2%20b#how-about-this=1af&2f%20b': 1,
'https://example.com/?what=1#how-about-this=1&2%20baf': 1,
'https://example.com?what=1#how-about-this=1&2%20baf': 1,
'<test>http://example7.com</test>': 1,
'https://<test>': 0,
'https://[test]': 0,
'http://"test"': 0,
'http://\'test\'': 0,
'[https://example8.com/what/is/this.php?what=1]': 1,
'[and http://example9.com?what=1&other=3#and-thing=2]': 1,
'<what>https://example10.com#and-thing=2 "</about>': 1,
'abc<this["https://example11.com/what/is#and-thing=2?whoami=23&where=1"]that>def': 1,
'sdflkf[what](https://example12.com/who/what.php?whoami=1#whatami=2)?am=hi': 1,
'<or>http://examplehttp://15.badc</that>': 2,
'https://a.example.com/one.html?url=http://example.com/inside/of/another?=http://': 2,
'[https://a.example.com/one.html?url=http://example.com/inside/of/another?=](http://a.example.com)': 3,
}
for url_str, num_urls in _test_url_strs.items():
assert len(re.findall(URL_REGEX, url_str)) == num_urls, (
f'{url_str} does not contain {num_urls} urls')


@ -10,7 +10,7 @@ from ..index.schema import Link
from ..util import (
htmldecode,
enforce_types,
find_all_urls,
URL_REGEX,
)
from html.parser import HTMLParser
from urllib.parse import urljoin
@ -40,22 +40,10 @@ def parse_generic_html_export(html_file: IO[str], root_url: Optional[str]=None,
parser.feed(line)
for url in parser.urls:
if root_url:
url_is_absolute = (url.lower().startswith('http://') or url.lower().startswith('https://'))
# url = https://abc.com => True
# url = /page.php?next=https://example.com => False
if not url_is_absolute: # resolve it by joining it with root_url
relative_path = url
url = urljoin(root_url, relative_path) # https://example.com/somepage.html + /home.html
# => https://example.com/home.html
# special case to handle bug around // handling, crucial for urls that contain sub-urls
# e.g. https://web.archive.org/web/https://example.com
if did_urljoin_misbehave(root_url, relative_path, url):
url = fix_urljoin_bug(url)
for archivable_url in find_all_urls(url):
# resolve relative urls /home.html -> https://example.com/home.html
url = urljoin(root_url, url)
for archivable_url in re.findall(URL_REGEX, url):
yield Link(
url=htmldecode(archivable_url),
timestamp=str(datetime.now(timezone.utc).timestamp()),
@ -68,74 +56,3 @@ def parse_generic_html_export(html_file: IO[str], root_url: Optional[str]=None,
KEY = 'html'
NAME = 'Generic HTML'
PARSER = parse_generic_html_export
#### WORKAROUND CODE FOR https://github.com/python/cpython/issues/96015 ####
def did_urljoin_misbehave(root_url: str, relative_path: str, final_url: str) -> bool:
"""
Handle urljoin edge case bug where multiple slashes get turned into a single slash:
- https://github.com/python/cpython/issues/96015
- https://github.com/ArchiveBox/ArchiveBox/issues/1411
This workaround only fixes the most common case of a sub-URL inside an outer URL, e.g.:
https://web.archive.org/web/https://example.com/some/inner/url
But there are other valid URLs containing // that are not fixed by this workaround, e.g.:
https://example.com/drives/C//some/file
"""
# if relative path is actually an absolute url, cut off its own scheme so we check the path component only
relative_path = relative_path.lower()
if relative_path.startswith('http://') or relative_path.startswith('https://'):
relative_path = relative_path.split('://', 1)[-1]
# TODO: properly fix all double // getting stripped by urljoin, not just ://
original_path_had_suburl = '://' in relative_path
original_root_had_suburl = '://' in root_url[8:] # ignore first 8 chars because root always starts with https://
final_joined_has_suburl = '://' in final_url[8:] # ignore first 8 chars because final always starts with https://
urljoin_broke_suburls = (
(original_root_had_suburl or original_path_had_suburl)
and not final_joined_has_suburl
)
return urljoin_broke_suburls
def fix_urljoin_bug(url: str, nesting_limit=5):
"""
recursively replace broken suburls .../http:/... with .../http://...
basically equivalent to this for 99.9% of cases:
url = url.replace('/http:/', '/http://')
url = url.replace('/https:/', '/https://')
except this handles:
other schemes besides http/https (e.g. https://example.com/link/git+ssh://github.com/example)
other preceding separators besides / (e.g. https://example.com/login/?next=https://example.com/home)
fixing multiple suburls recursively
"""
input_url = url
for _ in range(nesting_limit):
url = re.sub(
r'(?P<root>.+?)' # https://web.archive.org/web
+ r'(?P<separator>[-=/_&+%$#@!*\(\\])' # /
+ r'(?P<subscheme>[a-zA-Z0-9+_-]{1,32}?):/' # http:/
+ r'(?P<suburl>[^/\\]+)', # example.com
r"\1\2\3://\4",
input_url,
re.IGNORECASE | re.UNICODE,
)
if url == input_url:
break # nothing left to replace, all suburls are fixed
input_url = url
return url
# sanity check to make sure workaround code works as expected and doesnt introduce *more* bugs
assert did_urljoin_misbehave('https://web.archive.org/web/https://example.com', 'abc.html', 'https://web.archive.org/web/https:/example.com/abc.html') == True
assert did_urljoin_misbehave('http://example.com', 'https://web.archive.org/web/http://example.com/abc.html', 'https://web.archive.org/web/http:/example.com/abc.html') == True
assert fix_urljoin_bug('https:/example.com') == 'https:/example.com' # should not modify original url's scheme, only sub-urls
assert fix_urljoin_bug('https://web.archive.org/web/https:/example.com/abc.html') == 'https://web.archive.org/web/https://example.com/abc.html'
assert fix_urljoin_bug('http://example.com/link/git+ssh:/github.com/example?next=ftp:/example.com') == 'http://example.com/link/git+ssh://github.com/example?next=ftp://example.com'


@ -11,60 +11,6 @@ from ..util import (
enforce_types,
)
# This gets used by generic_jsonl, too
def jsonObjectToLink(link: str, source: str):
json_date = lambda s: datetime.strptime(s, '%Y-%m-%dT%H:%M:%S%z')
# example line
# {"href":"http:\/\/www.reddit.com\/r\/example","description":"title here","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"reddit android"}]
# Parse URL
url = link.get('href') or link.get('url') or link.get('URL')
if not url:
raise Exception('JSON must contain URL in each entry [{"url": "http://...", ...}, ...]')
# Parse the timestamp
ts_str = str(datetime.now(timezone.utc).timestamp())
if link.get('timestamp'):
# chrome/ff histories use a very precise timestamp
ts_str = str(link['timestamp'] / 10000000)
elif link.get('time'):
ts_str = str(json_date(link['time'].split(',', 1)[0]).timestamp())
elif link.get('created_at'):
ts_str = str(json_date(link['created_at']).timestamp())
elif link.get('created'):
ts_str = str(json_date(link['created']).timestamp())
elif link.get('date'):
ts_str = str(json_date(link['date']).timestamp())
elif link.get('bookmarked'):
ts_str = str(json_date(link['bookmarked']).timestamp())
elif link.get('saved'):
ts_str = str(json_date(link['saved']).timestamp())
# Parse the title
title = None
if link.get('title'):
title = link['title'].strip()
elif link.get('description'):
title = link['description'].replace(' — Readability', '').strip()
elif link.get('name'):
title = link['name'].strip()
# if we have a list, join it with commas
tags = link.get('tags')
if type(tags) == list:
tags = ','.join(tags)
elif type(tags) == str:
# if there's no comma, assume it was space-separated
if ',' not in tags:
tags = tags.replace(' ', ',')
return Link(
url=htmldecode(url),
timestamp=ts_str,
title=htmldecode(title) or None,
tags=htmldecode(tags),
sources=[source],
)
@enforce_types
def parse_generic_json_export(json_file: IO[str], **_kwargs) -> Iterable[Link]:
@ -72,13 +18,55 @@ def parse_generic_json_export(json_file: IO[str], **_kwargs) -> Iterable[Link]:
json_file.seek(0)
links = json.load(json_file)
if type(links) != list:
raise Exception('JSON parser expects list of objects, maybe this is JSONL?')
# sometimes the first line is a comment or filepath, so we get everything after the first {
json_file_json_str = '{' + json_file.read().split('{', 1)[-1]
links = json.loads(json_file_json_str)
json_date = lambda s: datetime.strptime(s, '%Y-%m-%dT%H:%M:%S%z')
for link in links:
# example line
# {"href":"http:\/\/www.reddit.com\/r\/example","description":"title here","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"reddit android"}]
if link:
yield jsonObjectToLink(link, json_file.name)
# Parse URL
url = link.get('href') or link.get('url') or link.get('URL')
if not url:
raise Exception('JSON must contain URL in each entry [{"url": "http://...", ...}, ...]')
# Parse the timestamp
ts_str = str(datetime.now(timezone.utc).timestamp())
if link.get('timestamp'):
# chrome/ff histories use a very precise timestamp
ts_str = str(link['timestamp'] / 10000000)
elif link.get('time'):
ts_str = str(json_date(link['time'].split(',', 1)[0]).timestamp())
elif link.get('created_at'):
ts_str = str(json_date(link['created_at']).timestamp())
elif link.get('created'):
ts_str = str(json_date(link['created']).timestamp())
elif link.get('date'):
ts_str = str(json_date(link['date']).timestamp())
elif link.get('bookmarked'):
ts_str = str(json_date(link['bookmarked']).timestamp())
elif link.get('saved'):
ts_str = str(json_date(link['saved']).timestamp())
# Parse the title
title = None
if link.get('title'):
title = link['title'].strip()
elif link.get('description'):
title = link['description'].replace(' — Readability', '').strip()
elif link.get('name'):
title = link['name'].strip()
yield Link(
url=htmldecode(url),
timestamp=ts_str,
title=htmldecode(title) or None,
tags=htmldecode(link.get('tags')) or '',
sources=[json_file.name],
)
KEY = 'json'
NAME = 'Generic JSON'
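For contrast with the generic_jsonl parser shown further below, the two formats differ only in framing: generic_json expects a single JSON list of objects (the dev branch now raises a hint if it is handed JSONL instead), while generic_jsonl feeds each non-blank line to the same jsonObjectToLink() helper. Hypothetical example inputs for each:

# accepted by parse_generic_json_export(): a single JSON list of objects
json_input = '''[
  {"href": "http://www.reddit.com/r/example", "description": "title here",
   "time": "2014-06-14T15:51:42Z", "tags": "reddit android"}
]'''

# accepted by parse_generic_jsonl_export(): one JSON object per line, blank lines skipped
jsonl_input = '''{"url": "https://example.com/some/page", "title": "Example", "tags": ["example", "test"]}
{"href": "http://www.reddit.com/r/example", "description": "title here", "time": "2014-06-14T15:51:42Z"}'''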


@ -1,32 +0,0 @@
__package__ = 'archivebox.parsers'
import json
from typing import IO, Iterable
from ..index.schema import Link
from ..util import (
enforce_types,
)
from .generic_json import jsonObjectToLink
def parse_line(line: str):
if line.strip() != "":
return json.loads(line)
@enforce_types
def parse_generic_jsonl_export(json_file: IO[str], **_kwargs) -> Iterable[Link]:
"""Parse JSONL format bookmarks export files"""
json_file.seek(0)
links = [ parse_line(line) for line in json_file ]
for link in links:
if link:
yield jsonObjectToLink(link,json_file.name)
KEY = 'jsonl'
NAME = 'Generic JSONL'
PARSER = parse_generic_jsonl_export


@ -2,13 +2,13 @@ __package__ = 'archivebox.parsers'
from typing import IO, Iterable
from time import mktime
from feedparser import parse as feedparser
from datetime import datetime
from ..index.schema import Link
from ..util import (
htmldecode,
enforce_types
enforce_types,
str_between,
)
@enforce_types
@ -16,27 +16,35 @@ def parse_generic_rss_export(rss_file: IO[str], **_kwargs) -> Iterable[Link]:
"""Parse RSS XML-format files into links"""
rss_file.seek(0)
feed = feedparser(rss_file.read())
for item in feed.entries:
url = item.link
title = item.title
time = mktime(item.updated_parsed)
items = rss_file.read().split('<item>')
items = items[1:] if items else []
for item in items:
# example item:
# <item>
# <title><![CDATA[How JavaScript works: inside the V8 engine]]></title>
# <category>Unread</category>
# <link>https://blog.sessionstack.com/how-javascript-works-inside</link>
# <guid>https://blog.sessionstack.com/how-javascript-works-inside</guid>
# <pubDate>Mon, 21 Aug 2017 14:21:58 -0500</pubDate>
# </item>
try:
tags = ','.join(map(lambda tag: tag.term, item.tags))
except AttributeError:
tags = ''
trailing_removed = item.split('</item>', 1)[0]
leading_removed = trailing_removed.split('<item>', 1)[-1].strip()
rows = leading_removed.split('\n')
if url is None:
# Yielding a Link with no URL will
# crash on a URL validation assertion
continue
def get_row(key):
return [r for r in rows if r.strip().startswith('<{}>'.format(key))][0]
url = str_between(get_row('link'), '<link>', '</link>')
ts_str = str_between(get_row('pubDate'), '<pubDate>', '</pubDate>')
time = datetime.strptime(ts_str, "%a, %d %b %Y %H:%M:%S %z")
title = str_between(get_row('title'), '<![CDATA[', ']]').strip()
yield Link(
url=htmldecode(url),
timestamp=str(time),
timestamp=str(time.timestamp()),
title=htmldecode(title) or None,
tags=tags,
tags=None,
sources=[rss_file.name],
)
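The dev side of this hunk replaces the hand-rolled <item> splitting with the feedparser library. A minimal hypothetical example of the feedparser calls it relies on, reusing the sample item from the comment above:

from feedparser import parse as feedparser

feed = feedparser('''<?xml version="1.0"?>
<rss version="2.0"><channel><item>
  <title>How JavaScript works: inside the V8 engine</title>
  <link>https://blog.sessionstack.com/how-javascript-works-inside</link>
  <pubDate>Mon, 21 Aug 2017 14:21:58 -0500</pubDate>
  <category>Unread</category>
</item></channel></rss>''')

for item in feed.entries:
    print(item.link)     # https://blog.sessionstack.com/how-javascript-works-inside
    print(item.title)    # How JavaScript works: inside the V8 engine
    try:
        tags = ','.join(tag.term for tag in item.tags)   # 'Unread' (from <category>)
    except AttributeError:
        tags = ''        # items without <category> have no .tags, as handled above
    print(tags)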


@ -1,6 +1,8 @@
__package__ = 'archivebox.parsers'
__description__ = 'Plain Text'
import re
from typing import IO, Iterable
from datetime import datetime, timezone
from pathlib import Path
@ -9,7 +11,7 @@ from ..index.schema import Link
from ..util import (
htmldecode,
enforce_types,
find_all_urls,
URL_REGEX
)
@ -37,7 +39,7 @@ def parse_generic_txt_export(text_file: IO[str], **_kwargs) -> Iterable[Link]:
pass
# otherwise look for anything that looks like a URL in the line
for url in find_all_urls(line):
for url in re.findall(URL_REGEX, line):
yield Link(
url=htmldecode(url),
timestamp=str(datetime.now(timezone.utc).timestamp()),
@ -46,6 +48,17 @@ def parse_generic_txt_export(text_file: IO[str], **_kwargs) -> Iterable[Link]:
sources=[text_file.name],
)
# look inside the URL for any sub-urls, e.g. for archive.org links
# https://web.archive.org/web/20200531203453/https://www.reddit.com/r/socialism/comments/gu24ke/nypd_officers_claim_they_are_protecting_the_rule/fsfq0sw/
# -> https://www.reddit.com/r/socialism/comments/gu24ke/nypd_officers_claim_they_are_protecting_the_rule/fsfq0sw/
for sub_url in re.findall(URL_REGEX, line[1:]):
yield Link(
url=htmldecode(sub_url),
timestamp=str(datetime.now(timezone.utc).timestamp()),
title=None,
tags=None,
sources=[text_file.name],
)
KEY = 'txt'
NAME = 'Generic TXT'


@ -2,41 +2,50 @@ __package__ = 'archivebox.parsers'
from typing import IO, Iterable
from time import mktime
from feedparser import parse as feedparser
from datetime import datetime, timezone
from xml.etree import ElementTree
from ..index.schema import Link
from ..util import (
htmldecode,
enforce_types
enforce_types,
)
@enforce_types
def parse_pinboard_rss_export(rss_file: IO[str], **_kwargs) -> Iterable[Link]:
"""Parse Pinboard RSS feed files into links"""
rss_file.seek(0)
feed = feedparser(rss_file.read())
for item in feed.entries:
url = item.link
# title will start with "[priv] " if pin was marked private. useful?
title = item.title
time = mktime(item.updated_parsed)
root = ElementTree.parse(rss_file).getroot()
items = root.findall("{http://purl.org/rss/1.0/}item")
for item in items:
find = lambda p: item.find(p).text.strip() if item.find(p) is not None else None # type: ignore
# all tags are in one entry.tags with spaces in it. annoying!
try:
tags = item.tags[0].term.replace(' ', ',')
except AttributeError:
tags = ''
url = find("{http://purl.org/rss/1.0/}link")
tags = find("{http://purl.org/dc/elements/1.1/}subject")
title = find("{http://purl.org/rss/1.0/}title")
ts_str = find("{http://purl.org/dc/elements/1.1/}date")
if url is None:
# Yielding a Link with no URL will
# crash on a URL validation assertion
continue
# Pinboard includes a colon in its date stamp timezone offsets, which
# Python can't parse. Remove it:
if ts_str and ts_str[-3:-2] == ":":
ts_str = ts_str[:-3]+ts_str[-2:]
if ts_str:
time = datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%S%z")
else:
time = datetime.now(timezone.utc)
yield Link(
url=htmldecode(url),
timestamp=str(time),
timestamp=str(time.timestamp()),
title=htmldecode(title) or None,
tags=htmldecode(tags) or None,
sources=[rss_file.name],


@ -1,17 +0,0 @@
__package__ = 'archivebox.plugantic'
from .binproviders import BinProvider
from .binaries import Binary
from .extractors import Extractor
from .replayers import Replayer
from .configs import ConfigSet
from .plugins import Plugin
# __all__ = [
# 'BinProvider',
# 'Binary',
# 'Extractor',
# 'Replayer',
# 'ConfigSet',
# 'Plugin',
# ]


@ -1,26 +0,0 @@
# from django.contrib import admin
# from django import forms
# from django_jsonform.widgets import JSONFormWidget
# from django_pydantic_field.v2.fields import PydanticSchemaField
# from .models import CustomPlugin
# class PluginForm(forms.ModelForm):
# class Meta:
# model = CustomPlugin
# fields = '__all__'
# widgets = {
# 'items': JSONFormWidget(schema=PluginSchema),
# }
# class PluginAdmin(admin.ModelAdmin):
# formfield_overrides = {
# PydanticSchemaField: {"widget": JSONFormWidget},
# }
# form = PluginForm


@ -1,6 +0,0 @@
from django.apps import AppConfig
class PluganticConfig(AppConfig):
default_auto_field = 'django.db.models.BigAutoField'
name = 'plugantic'


@ -1,323 +0,0 @@
__package__ = 'archivebox.plugantic'
import sys
import inspect
import importlib
from pathlib import Path
from typing import Any, Optional, Dict, List
from typing_extensions import Self
from subprocess import run, PIPE
from pydantic_core import ValidationError
from pydantic import BaseModel, Field, model_validator, computed_field, field_validator, validate_call, field_serializer
from .binproviders import (
SemVer,
BinName,
BinProviderName,
HostBinPath,
BinProvider,
EnvProvider,
AptProvider,
BrewProvider,
PipProvider,
ProviderLookupDict,
bin_name,
bin_abspath,
path_is_script,
path_is_executable,
)
class Binary(BaseModel):
name: BinName
description: str = Field(default='')
providers_supported: List[BinProvider] = Field(default=[EnvProvider()], alias='providers')
provider_overrides: Dict[BinProviderName, ProviderLookupDict] = Field(default={}, alias='overrides')
loaded_provider: Optional[BinProviderName] = Field(default=None, alias='provider')
loaded_abspath: Optional[HostBinPath] = Field(default=None, alias='abspath')
loaded_version: Optional[SemVer] = Field(default=None, alias='version')
# bin_filename: see below
# is_executable: see below
# is_script
# is_valid: see below
@model_validator(mode='after')
def validate(self):
self.loaded_abspath = bin_abspath(self.name) or self.name
self.description = self.description or self.name
assert self.providers_supported, f'No providers were given for package {self.name}'
# pull in any overrides from the binproviders
for provider in self.providers_supported:
overrides_by_provider = provider.get_providers_for_bin(self.name)
if overrides_by_provider:
self.provider_overrides[provider.name] = {
**overrides_by_provider,
**self.provider_overrides.get(provider.name, {}),
}
return self
@field_validator('loaded_abspath', mode='before')
def parse_abspath(cls, value: Any):
return bin_abspath(value)
@field_validator('loaded_version', mode='before')
def parse_version(cls, value: Any):
return value and SemVer(value)
@field_serializer('provider_overrides', when_used='json')
def serialize_overrides(self, provider_overrides: Dict[BinProviderName, ProviderLookupDict]) -> Dict[BinProviderName, Dict[str, str]]:
return {
provider_name: {
key: str(val)
for key, val in overrides.items()
}
for provider_name, overrides in provider_overrides.items()
}
@computed_field # type: ignore[misc] # see mypy issue #1362
@property
def bin_filename(self) -> BinName:
if self.is_script:
# e.g. '.../Python.framework/Versions/3.11/lib/python3.11/sqlite3/__init__.py' -> sqlite
name = self.name
elif self.loaded_abspath:
# e.g. '/opt/homebrew/bin/wget' -> wget
name = bin_name(self.loaded_abspath)
else:
# e.g. 'ytdlp' -> 'yt-dlp'
name = bin_name(self.name)
return name
@computed_field # type: ignore[misc] # see mypy issue #1362
@property
def is_executable(self) -> bool:
try:
assert self.loaded_abspath and path_is_executable(self.loaded_abspath)
return True
except (ValidationError, AssertionError):
return False
@computed_field # type: ignore[misc] # see mypy issue #1362
@property
def is_script(self) -> bool:
try:
assert self.loaded_abspath and path_is_script(self.loaded_abspath)
return True
except (ValidationError, AssertionError):
return False
@computed_field # type: ignore[misc] # see mypy issue #1362
@property
def is_valid(self) -> bool:
return bool(
self.name
and self.loaded_abspath
and self.loaded_version
and (self.is_executable or self.is_script)
)
@validate_call
def install(self) -> Self:
if not self.providers_supported:
return self
exc = Exception('No providers were able to install binary', self.name, self.providers_supported)
for provider in self.providers_supported:
try:
installed_bin = provider.install(self.name, overrides=self.provider_overrides.get(provider.name))
if installed_bin:
# print('INSTALLED', self.name, installed_bin)
return self.model_copy(update={
'loaded_provider': provider.name,
'loaded_abspath': installed_bin.abspath,
'loaded_version': installed_bin.version,
})
except Exception as err:
print(err)
exc = err
raise exc
@validate_call
def load(self, cache=True) -> Self:
if self.is_valid:
return self
if not self.providers_supported:
return self
exc = Exception('No providers were able to install binary', self.name, self.providers_supported)
for provider in self.providers_supported:
try:
installed_bin = provider.load(self.name, cache=cache, overrides=self.provider_overrides.get(provider.name))
if installed_bin:
# print('LOADED', provider, self.name, installed_bin)
return self.model_copy(update={
'loaded_provider': provider.name,
'loaded_abspath': installed_bin.abspath,
'loaded_version': installed_bin.version,
})
except Exception as err:
print(err)
exc = err
raise exc
@validate_call
def load_or_install(self, cache=True) -> Self:
if self.is_valid:
return self
if not self.providers_supported:
return self
exc = Exception('No providers were able to install binary', self.name, self.providers_supported)
for provider in self.providers_supported:
try:
installed_bin = provider.load_or_install(self.name, overrides=self.provider_overrides.get(provider.name), cache=cache)
if installed_bin:
# print('LOADED_OR_INSTALLED', self.name, installed_bin)
return self.model_copy(update={
'loaded_provider': provider.name,
'loaded_abspath': installed_bin.abspath,
'loaded_version': installed_bin.version,
})
except Exception as err:
print(err)
exc = err
raise exc
@validate_call
def exec(self, args=(), pwd='.'):
assert self.loaded_abspath
assert self.loaded_version
return run([self.loaded_abspath, *args], stdout=PIPE, stderr=PIPE, cwd=pwd)  # subprocess.run expects cwd=, not pwd=
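# Illustrative usage sketch (not part of the original file; assumes wget is on $PATH or apt is available):
#   wget = Binary(name='wget', providers=[EnvProvider(), AptProvider()])
#   wget = wget.load_or_install()
#   print(wget.loaded_provider, wget.loaded_abspath, wget.loaded_version)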
class SystemPythonHelpers:
@staticmethod
def get_subdeps() -> str:
return 'python3 python3-minimal python3-pip python3-virtualenv'
@staticmethod
def get_abspath() -> str:
return sys.executable
@staticmethod
def get_version() -> str:
return '{}.{}.{}'.format(*sys.version_info[:3])
class SqliteHelpers:
@staticmethod
def get_abspath() -> Path:
import sqlite3
importlib.reload(sqlite3)
return Path(inspect.getfile(sqlite3))
@staticmethod
def get_version() -> SemVer:
import sqlite3
importlib.reload(sqlite3)
version = sqlite3.version
assert version
return SemVer(version)
class DjangoHelpers:
@staticmethod
def get_django_abspath() -> str:
import django
return inspect.getfile(django)
@staticmethod
def get_django_version() -> str:
import django
return '{}.{}.{} {} ({})'.format(*django.VERSION)
class YtdlpHelpers:
@staticmethod
def get_ytdlp_subdeps() -> str:
return 'yt-dlp ffmpeg'
@staticmethod
def get_ytdlp_version() -> str:
import yt_dlp
importlib.reload(yt_dlp)
version = yt_dlp.version.__version__
assert version
return version
class PythonBinary(Binary):
name: BinName = 'python'
providers_supported: List[BinProvider] = [
EnvProvider(
subdeps_provider={'python': 'plugantic.binaries.SystemPythonHelpers.get_subdeps'},
abspath_provider={'python': 'plugantic.binaries.SystemPythonHelpers.get_abspath'},
version_provider={'python': 'plugantic.binaries.SystemPythonHelpers.get_version'},
),
]
class SqliteBinary(Binary):
name: BinName = 'sqlite'
providers_supported: List[BinProvider] = [
EnvProvider(
version_provider={'sqlite': 'plugantic.binaries.SqliteHelpers.get_version'},
abspath_provider={'sqlite': 'plugantic.binaries.SqliteHelpers.get_abspath'},
),
]
class DjangoBinary(Binary):
name: BinName = 'django'
providers_supported: List[BinProvider] = [
EnvProvider(
abspath_provider={'django': 'plugantic.binaries.DjangoHelpers.get_django_abspath'},
version_provider={'django': 'plugantic.binaries.DjangoHelpers.get_django_version'},
),
]
class YtdlpBinary(Binary):
name: BinName = 'yt-dlp'
providers_supported: List[BinProvider] = [
# EnvProvider(),
PipProvider(version_provider={'yt-dlp': 'plugantic.binaries.YtdlpHelpers.get_ytdlp_version'}),
BrewProvider(subdeps_provider={'yt-dlp': 'plugantic.binaries.YtdlpHelpers.get_ytdlp_subdeps'}),
# AptProvider(subdeps_provider={'yt-dlp': lambda: 'yt-dlp ffmpeg'}),
]
class WgetBinary(Binary):
name: BinName = 'wget'
providers_supported: List[BinProvider] = [EnvProvider(), AptProvider()]
# if __name__ == '__main__':
# PYTHON_BINARY = PythonBinary()
# SQLITE_BINARY = SqliteBinary()
# DJANGO_BINARY = DjangoBinary()
# WGET_BINARY = WgetBinary()
# YTDLP_BINARY = YtdlpBinary()
# print('-------------------------------------DEFINING BINARIES---------------------------------')
# print(PYTHON_BINARY)
# print(SQLITE_BINARY)
# print(DJANGO_BINARY)
# print(WGET_BINARY)
# print(YTDLP_BINARY)

View file

@@ -1,561 +0,0 @@
__package__ = 'archivebox.plugantic'
import os
import shutil
import operator
from typing import Callable, Any, Optional, Type, Dict, Annotated, ClassVar, Literal, cast, TYPE_CHECKING
from typing_extensions import Self
from abc import ABC, abstractmethod
from collections import namedtuple
from pathlib import Path
from subprocess import run, PIPE
from pydantic_core import core_schema, ValidationError
from pydantic import BaseModel, Field, TypeAdapter, AfterValidator, validate_call, GetCoreSchemaHandler
def func_takes_args_or_kwargs(lambda_func: Callable[..., Any]) -> bool:
"""returns True if a lambda func takes args/kwargs of any kind, otherwise false if it's pure/argless"""
code = lambda_func.__code__
has_args = code.co_argcount > 0
has_varargs = code.co_flags & 0x04 != 0
has_varkw = code.co_flags & 0x08 != 0
return has_args or has_varargs or has_varkw
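# e.g. (illustrative) func_takes_args_or_kwargs(lambda: 'yt-dlp ffmpeg') -> False
#      func_takes_args_or_kwargs(lambda bin_name, **kwargs: bin_name) -> True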
def is_semver_str(semver: Any) -> bool:
if isinstance(semver, str):
return (semver.count('.') == 2 and semver.replace('.', '').isdigit())
return False
def semver_to_str(semver: tuple[int, int, int] | str) -> str:
if isinstance(semver, (list, tuple)):
return '.'.join(str(chunk) for chunk in semver)
if is_semver_str(semver):
return semver
raise ValidationError('Tried to convert invalid SemVer: {}'.format(semver))
SemVerTuple = namedtuple('SemVerTuple', ('major', 'minor', 'patch'), defaults=(0, 0, 0))
SemVerParsableTypes = str | tuple[str | int, ...] | list[str | int]
class SemVer(SemVerTuple):
major: int
minor: int = 0
patch: int = 0
if TYPE_CHECKING:
full_text: str | None = ''
def __new__(cls, *args, full_text=None, **kwargs):
# '1.1.1'
if len(args) == 1 and is_semver_str(args[0]):
result = SemVer.parse(args[0])
# ('1', '2', '3')
elif len(args) == 1 and isinstance(args[0], (tuple, list)):
result = SemVer.parse(args[0])
# (1, '2', None)
elif not all(isinstance(arg, (int, type(None))) for arg in args):
result = SemVer.parse(args)
# (None)
elif all(chunk in ('', 0, None) for chunk in (*args, *kwargs.values())):
result = None
# 1, 2, 3
else:
result = SemVerTuple.__new__(cls, *args, **kwargs)
if result is not None:
# add first line as extra hidden metadata so it can be logged without having to re-run version cmd
result.full_text = full_text or str(result)
return result
@classmethod
def parse(cls, version_stdout: SemVerParsableTypes) -> Self | None:
"""
parses a version string (formatted like the examples below) into (major, minor, patch) ints
'Google Chrome 124.0.6367.208' -> (124, 0, 6367)
'GNU Wget 1.24.5 built on darwin23.2.0.' -> (1, 24, 5)
'curl 8.4.0 (x86_64-apple-darwin23.0) ...' -> (8, 4, 0)
'2024.04.09' -> (2024, 4, 9)
"""
# print('INITIAL_VALUE', type(version_stdout).__name__, version_stdout)
if isinstance(version_stdout, (tuple, list)):
version_stdout = '.'.join(str(chunk) for chunk in version_stdout)
elif isinstance(version_stdout, bytes):
version_stdout = version_stdout.decode()
elif not isinstance(version_stdout, str):
version_stdout = str(version_stdout)
# no text to work with, return None immediately
if not version_stdout.strip():
# raise Exception('Tried to parse semver from empty version output (is binary installed and available?)')
return None
just_numbers = lambda col: col.lower().strip('v').split('+')[0].split('-')[0].split('_')[0]
contains_semver = lambda col: (
col.count('.') in (1, 2, 3)
and all(chunk.isdigit() for chunk in col.split('.')[:3]) # first 3 chunks can only be nums
)
full_text = version_stdout.split('\n')[0].strip()
first_line_columns = full_text.split()[:4]
version_columns = list(filter(contains_semver, map(just_numbers, first_line_columns)))
# could not find any column of first line that looks like a version number, despite there being some text
if not version_columns:
# raise Exception('Failed to parse semver from version command output: {}'.format(' '.join(first_line_columns)))
return None
# take first col containing a semver, and truncate it to 3 chunks (e.g. 2024.04.09.91) -> (2024, 04, 09)
first_version_tuple = version_columns[0].split('.', 3)[:3]
# print('FINAL_VALUE', first_version_tuple)
return cls(*(int(chunk) for chunk in first_version_tuple), full_text=full_text)
def __str__(self):
return '.'.join(str(chunk) for chunk in self)
# @classmethod
# def __get_pydantic_core_schema__(cls, source: Type[Any], handler: GetCoreSchemaHandler) -> core_schema.CoreSchema:
# default_schema = handler(source)
# return core_schema.no_info_after_validator_function(
# cls.parse,
# default_schema,
# serialization=core_schema.plain_serializer_function_ser_schema(
# lambda semver: str(semver),
# info_arg=False,
# return_schema=core_schema.str_schema(),
# ),
# )
assert SemVer(None) == None
assert SemVer('') == None
assert SemVer.parse('') == None
assert SemVer(1) == (1, 0, 0)
assert SemVer(1, 2) == (1, 2, 0)
assert SemVer('1.2+234234') == (1, 2, 0)
assert SemVer((1, 2, 3)) == (1, 2, 3)
assert getattr(SemVer((1, 2, 3)), 'full_text') == '1.2.3'
assert SemVer(('1', '2', '3')) == (1, 2, 3)
assert SemVer.parse('5.6.7') == (5, 6, 7)
assert SemVer.parse('124.0.6367.208') == (124, 0, 6367)
assert SemVer.parse('Google Chrome 124.1+234.234') == (124, 1, 0)
assert SemVer.parse('Google Ch1rome 124.0.6367.208') == (124, 0, 6367)
assert SemVer.parse('Google Chrome 124.0.6367.208+beta_234. 234.234.123\n123.456.324') == (124, 0, 6367)
assert getattr(SemVer.parse('Google Chrome 124.0.6367.208+beta_234. 234.234.123\n123.456.324'), 'full_text') == 'Google Chrome 124.0.6367.208+beta_234. 234.234.123'
assert SemVer.parse('Google Chrome') == None
@validate_call
def bin_name(bin_path_or_name: str | Path) -> str:
name = Path(bin_path_or_name).name
assert len(name) > 1
assert name.replace('-', '').replace('_', '').replace('.', '').isalnum(), (
f'Binary name can only contain a-Z0-9-_.: {name}')
return name
BinName = Annotated[str, AfterValidator(bin_name)]
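# e.g. (illustrative) bin_name('/opt/homebrew/bin/wget') -> 'wget'; bin_name('yt-dlp') -> 'yt-dlp'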
@validate_call
def path_is_file(path: Path | str) -> Path:
path = Path(path) if isinstance(path, str) else path
assert path.is_file(), f'Path is not a file: {path}'
return path
HostExistsPath = Annotated[Path, AfterValidator(path_is_file)]
@validate_call
def path_is_executable(path: HostExistsPath) -> HostExistsPath:
assert os.access(path, os.X_OK), f'Path is not executable (fix by running chmod +x {path})'
return path
@validate_call
def path_is_script(path: HostExistsPath) -> HostExistsPath:
SCRIPT_EXTENSIONS = ('.py', '.js', '.sh')
assert path.suffix.lower() in SCRIPT_EXTENSIONS, 'Path is not a script (does not end in {})'.format(', '.join(SCRIPT_EXTENSIONS))
return path
HostExecutablePath = Annotated[HostExistsPath, AfterValidator(path_is_executable)]
@validate_call
def path_is_abspath(path: Path) -> Path:
return path.resolve()
HostAbsPath = Annotated[HostExistsPath, AfterValidator(path_is_abspath)]
HostBinPath = Annotated[Path, AfterValidator(path_is_abspath), AfterValidator(path_is_file)]
@validate_call
def bin_abspath(bin_path_or_name: BinName | Path) -> HostBinPath | None:
assert bin_path_or_name
if str(bin_path_or_name).startswith('/'):
# already a path, get its absolute form
abspath = Path(bin_path_or_name).resolve()
else:
# not a path yet, get path using os.which
binpath = shutil.which(bin_path_or_name)
if not binpath:
return None
abspath = Path(binpath).resolve()
try:
return TypeAdapter(HostBinPath).validate_python(abspath)
except ValidationError:
return None
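# e.g. (illustrative) bin_abspath('wget') -> Path('/usr/bin/wget') on a typical Linux host,
# or None if the name cannot be resolved to an existing file on $PATH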
@validate_call
def bin_version(bin_path: HostBinPath, args=('--version',)) -> SemVer | None:
return SemVer(run([bin_path, *args], stdout=PIPE).stdout.strip().decode())
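# e.g. (illustrative) bin_version(Path('/usr/bin/wget')) runs `wget --version` and parses
# output like "GNU Wget 1.24.5 built on ..." into SemVer(1, 24, 5)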
class InstalledBin(BaseModel):
abspath: HostBinPath
version: SemVer
def is_valid_install_string(pkgs_str: str) -> str:
"""Make sure a string is a valid install string for a package manager, e.g. 'yt-dlp ffmpeg'"""
assert pkgs_str
assert all(len(pkg) > 1 for pkg in pkgs_str.split(' '))
return pkgs_str
def is_valid_python_dotted_import(import_str: str) -> str:
assert import_str and import_str.replace('.', '').replace('_', '').isalnum()
return import_str
InstallStr = Annotated[str, AfterValidator(is_valid_install_string)]
LazyImportStr = Annotated[str, AfterValidator(is_valid_python_dotted_import)]
ProviderHandler = Callable[..., Any] | Callable[[], Any] # must take no args [], or [bin_name: str, **kwargs]
#ProviderHandlerStr = Annotated[str, AfterValidator(lambda s: s.startswith('self.'))]
ProviderHandlerRef = LazyImportStr | ProviderHandler
ProviderLookupDict = Dict[str, LazyImportStr]
ProviderType = Literal['abspath', 'version', 'subdeps', 'install']
# class Host(BaseModel):
# machine: str
# system: str
# platform: str
# in_docker: bool
# in_qemu: bool
# python: str
BinProviderName = Literal['env', 'pip', 'apt', 'brew', 'npm', 'vendor']
class BinProvider(ABC, BaseModel):
name: BinProviderName
abspath_provider: ProviderLookupDict = Field(default={'*': 'self.on_get_abspath'}, exclude=True)
version_provider: ProviderLookupDict = Field(default={'*': 'self.on_get_version'}, exclude=True)
subdeps_provider: ProviderLookupDict = Field(default={'*': 'self.on_get_subdeps'}, exclude=True)
install_provider: ProviderLookupDict = Field(default={'*': 'self.on_install'}, exclude=True)
_abspath_cache: ClassVar = {}
_version_cache: ClassVar = {}
_install_cache: ClassVar = {}
# def provider_version(self) -> SemVer | None:
# """Version of the actual underlying package manager (e.g. pip v20.4.1)"""
# if self.name in ('env', 'vendor'):
# return SemVer('0.0.0')
# installer_binpath = Path(shutil.which(self.name)).resolve()
# return bin_version(installer_binpath)
# def provider_host(self) -> Host:
# """Information about the host env, archictecture, and OS needed to select & build packages"""
# p = platform.uname()
# return Host(
# machine=p.machine,
# system=p.system,
# platform=platform.platform(),
# python=sys.implementation.name,
# in_docker=os.environ.get('IN_DOCKER', '').lower() == 'true',
# in_qemu=os.environ.get('IN_QEMU', '').lower() == 'true',
# )
def get_default_providers(self):
return self.get_providers_for_bin('*')
def resolve_provider_func(self, provider_func: ProviderHandlerRef | None) -> ProviderHandler | None:
if provider_func is None:
return None
# if provider_func is a dotted path to a function on self, swap it for the actual function
if isinstance(provider_func, str) and provider_func.startswith('self.'):
provider_func = getattr(self, provider_func.split('self.', 1)[-1])
# if provider_func is a dot-formatted import string, import the function
if isinstance(provider_func, str):
from django.utils.module_loading import import_string
package_name, module_name, classname, path = provider_func.split('.', 3) # -> abc, def, ghi.jkl
# get .ghi.jkl nested attr present on module abc.def
imported_module = import_string(f'{package_name}.{module_name}.{classname}')
provider_func = operator.attrgetter(path)(imported_module)
# # abc.def.ghi.jkl -> 1, 2, 3
# for idx in range(1, len(path)):
# parent_path = '.'.join(path[:-idx]) # abc.def.ghi
# try:
# parent_module = import_string(parent_path)
# provider_func = getattr(parent_module, path[-idx])
# except AttributeError, ImportError:
# continue
assert TypeAdapter(ProviderHandler).validate_python(provider_func), (
f'{self.__class__.__name__} provider func for {bin_name} was not a function or dotted-import path: {provider_func}')
return provider_func
@validate_call
def get_providers_for_bin(self, bin_name: str) -> ProviderLookupDict:
providers_for_bin = {
'abspath': self.abspath_provider.get(bin_name),
'version': self.version_provider.get(bin_name),
'subdeps': self.subdeps_provider.get(bin_name),
'install': self.install_provider.get(bin_name),
}
only_set_providers_for_bin = {k: v for k, v in providers_for_bin.items() if v is not None}
return only_set_providers_for_bin
@validate_call
def get_provider_for_action(self, bin_name: BinName, provider_type: ProviderType, default_provider: Optional[ProviderHandlerRef]=None, overrides: Optional[ProviderLookupDict]=None) -> ProviderHandler:
"""
Get the provider func for a given key + Dict of provider callbacks + fallback default provider.
e.g. get_provider_for_action(bin_name='yt-dlp', 'install', default_provider=self.on_install, ...) -> Callable
"""
provider_func_ref = (
(overrides or {}).get(provider_type)
or self.get_providers_for_bin(bin_name).get(provider_type)
or self.get_default_providers().get(provider_type)
or default_provider
)
# print('getting provider for action', bin_name, provider_type, provider_func)
provider_func = self.resolve_provider_func(provider_func_ref)
assert provider_func, f'No {self.name} provider func was found for {bin_name} in: {self.__class__.__name__}.'
return provider_func
@validate_call
def call_provider_for_action(self, bin_name: BinName, provider_type: ProviderType, default_provider: Optional[ProviderHandlerRef]=None, overrides: Optional[ProviderLookupDict]=None, **kwargs) -> Any:
provider_func: ProviderHandler = self.get_provider_for_action(
bin_name=bin_name,
provider_type=provider_type,
default_provider=default_provider,
overrides=overrides,
)
if not func_takes_args_or_kwargs(provider_func):
# if it's a pure argless lambda, don't pass bin_name or other **kwargs
provider_func_without_args = cast(Callable[[], Any], provider_func)
return provider_func_without_args()
provider_func = cast(Callable[..., Any], provider_func)
return provider_func(bin_name, **kwargs)
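# Note (comment added for clarity): provider funcs are resolved in priority order:
# explicit overrides > per-binary entries on this BinProvider > the '*' default entries
# > the built-in on_get_abspath/on_get_version/on_get_subdeps/on_install fallbacks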
def on_get_abspath(self, bin_name: BinName, **_) -> HostBinPath | None:
print(f'[*] {self.__class__.__name__}: Getting abspath for {bin_name}...')
try:
return bin_abspath(bin_name)
except ValidationError:
return None
def on_get_version(self, bin_name: BinName, abspath: Optional[HostBinPath]=None, **_) -> SemVer | None:
abspath = abspath or self._abspath_cache.get(bin_name) or self.get_abspath(bin_name)
if not abspath: return None
print(f'[*] {self.__class__.__name__}: Getting version for {bin_name}...')
try:
return bin_version(abspath)
except ValidationError:
return None
def on_get_subdeps(self, bin_name: BinName, **_) -> InstallStr:
print(f'[*] {self.__class__.__name__}: Getting subdependencies for {bin_name}')
# ... subdependency calculation logic here
return TypeAdapter(InstallStr).validate_python(bin_name)
@abstractmethod
def on_install(self, bin_name: BinName, subdeps: Optional[InstallStr]=None, **_):
subdeps = subdeps or self.get_subdeps(bin_name)
print(f'[*] {self.__class__.__name__}: Installing subdependencies for {bin_name} ({subdeps})')
# ... install logic here
assert True
@validate_call
def get_abspath(self, bin_name: BinName, overrides: Optional[ProviderLookupDict]=None) -> HostBinPath | None:
abspath = self.call_provider_for_action(
bin_name=bin_name,
provider_type='abspath',
default_provider=self.on_get_abspath,
overrides=overrides,
)
if not abspath:
return None
result = TypeAdapter(HostBinPath).validate_python(abspath)
self._abspath_cache[bin_name] = result
return result
@validate_call
def get_version(self, bin_name: BinName, abspath: Optional[HostBinPath]=None, overrides: Optional[ProviderLookupDict]=None) -> SemVer | None:
version = self.call_provider_for_action(
bin_name=bin_name,
provider_type='version',
default_provider=self.on_get_version,
overrides=overrides,
abspath=abspath,
)
if not version:
return None
result = SemVer(version)
self._version_cache[bin_name] = result
return result
@validate_call
def get_subdeps(self, bin_name: BinName, overrides: Optional[ProviderLookupDict]=None) -> InstallStr:
subdeps = self.call_provider_for_action(
bin_name=bin_name,
provider_type='subdeps',
default_provider=self.on_get_subdeps,
overrides=overrides,
)
if not subdeps:
subdeps = bin_name
result = TypeAdapter(InstallStr).validate_python(subdeps)
return result
@validate_call
def install(self, bin_name: BinName, overrides: Optional[ProviderLookupDict]=None) -> InstalledBin | None:
subdeps = self.get_subdeps(bin_name, overrides=overrides)
self.call_provider_for_action(
bin_name=bin_name,
provider_type='install',
default_provider=self.on_install,
overrides=overrides,
subdeps=subdeps,
)
installed_abspath = self.get_abspath(bin_name)
assert installed_abspath, f'Unable to find {bin_name} abspath after installing with {self.name}'
installed_version = self.get_version(bin_name, abspath=installed_abspath)
assert installed_version, f'Unable to find {bin_name} version after installing with {self.name}'
result = InstalledBin(abspath=installed_abspath, version=installed_version)
self._install_cache[bin_name] = result
return result
@validate_call
def load(self, bin_name: BinName, overrides: Optional[ProviderLookupDict]=None, cache: bool=False) -> InstalledBin | None:
installed_abspath = None
installed_version = None
if cache:
installed_bin = self._install_cache.get(bin_name)
if installed_bin:
return installed_bin
installed_abspath = self._abspath_cache.get(bin_name)
installed_version = self._version_cache.get(bin_name)
installed_abspath = installed_abspath or self.get_abspath(bin_name, overrides=overrides)
if not installed_abspath:
return None
installed_version = installed_version or self.get_version(bin_name, abspath=installed_abspath, overrides=overrides)
if not installed_version:
return None
return InstalledBin(abspath=installed_abspath, version=installed_version)
@validate_call
def load_or_install(self, bin_name: BinName, overrides: Optional[ProviderLookupDict]=None, cache: bool=True) -> InstalledBin | None:
installed = self.load(bin_name, overrides=overrides, cache=cache)
if not installed:
installed = self.install(bin_name, overrides=overrides)
return installed
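# Illustrative usage sketch (not part of the original file; assumes wget is already on $PATH):
#   installed = EnvProvider().load('wget')
#   if installed:
#       print(installed.abspath, installed.version)  # InstalledBin fields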
class PipProvider(BinProvider):
name: BinProviderName = 'pip'
def on_install(self, bin_name: str, subdeps: Optional[InstallStr]=None, **_):
subdeps = subdeps or self.on_get_subdeps(bin_name)
print(f'[*] {self.__class__.__name__}: Installing subdependencies for {bin_name} ({subdeps})')
proc = run(['pip', 'install', '--upgrade', *subdeps.split(' ')], stdout=PIPE, stderr=PIPE)
if proc.returncode != 0:
print(proc.stdout.strip().decode())
print(proc.stderr.strip().decode())
raise Exception(f'{self.__class__.__name__}: install got returncode {proc.returncode} while installing {subdeps}: {subdeps}')
class AptProvider(BinProvider):
name: BinProviderName = 'apt'
subdeps_provider: ProviderLookupDict = {
'yt-dlp': lambda: 'yt-dlp ffmpeg',
}
def on_install(self, bin_name: BinName, subdeps: Optional[InstallStr]=None, **_):
subdeps = subdeps or self.on_get_subdeps(bin_name)
print(f'[*] {self.__class__.__name__}: Installing subdependencies for {bin_name} ({subdeps})')
run(['apt-get', 'update', '-qq'])
proc = run(['apt-get', 'install', '-y', *subdeps.split(' ')], stdout=PIPE, stderr=PIPE)
if proc.returncode != 0:
print(proc.stdout.strip().decode())
print(proc.stderr.strip().decode())
raise Exception(f'{self.__class__.__name__} install got returncode {proc.returncode} while installing {subdeps}: {subdeps}')
class BrewProvider(BinProvider):
name: BinProviderName = 'brew'
def on_install(self, bin_name: str, subdeps: Optional[InstallStr]=None, **_):
subdeps = subdeps or self.on_get_subdeps(bin_name)
print(f'[*] {self.__class__.__name__}: Installing subdependencies for {bin_name} ({subdeps})')
proc = run(['brew', 'install', *subdeps.split(' ')], stdout=PIPE, stderr=PIPE)
if proc.returncode != 0:
print(proc.stdout.strip().decode())
print(proc.stderr.strip().decode())
raise Exception(f'{self.__class__.__name__} install got returncode {proc.returncode} while installing {subdeps}: {subdeps}')
class EnvProvider(BinProvider):
name: BinProviderName = 'env'
abspath_provider: ProviderLookupDict = {
# 'python': lambda: Path('/opt/homebrew/Cellar/python@3.10/3.10.14/Frameworks/Python.framework/Versions/3.10/bin/python3.10'),
}
version_provider: ProviderLookupDict = {
# 'python': lambda: '{}.{}.{}'.format(*sys.version_info[:3]),
}
def on_install(self, bin_name: BinName, subdeps: Optional[InstallStr]=None, **_):
"""The env provider is ready-only and does not install any packages, so this is a no-op"""
pass

View file

@@ -1,53 +0,0 @@
__package__ = 'archivebox.plugantic'
from typing import Optional, List, Literal
from pathlib import Path
from pydantic import BaseModel, Field
ConfigSectionName = Literal['GENERAL_CONFIG', 'ARCHIVE_METHOD_TOGGLES', 'ARCHIVE_METHOD_OPTIONS', 'DEPENDENCY_CONFIG']
class ConfigSet(BaseModel):
section: ConfigSectionName = 'GENERAL_CONFIG'
class WgetToggleConfig(ConfigSet):
section: ConfigSectionName = 'ARCHIVE_METHOD_TOGGLES'
SAVE_WGET: bool = True
SAVE_WARC: bool = True
class WgetDependencyConfig(ConfigSet):
section: ConfigSectionName = 'DEPENDENCY_CONFIG'
WGET_BINARY: str = Field(default='wget')
WGET_ARGS: Optional[List[str]] = Field(default=None)
WGET_EXTRA_ARGS: List[str] = []
WGET_DEFAULT_ARGS: List[str] = ['--timeout={TIMEOUT-10}']
class WgetOptionsConfig(ConfigSet):
section: ConfigSectionName = 'ARCHIVE_METHOD_OPTIONS'
# loaded from shared config
WGET_AUTO_COMPRESSION: bool = Field(default=True)
SAVE_WGET_REQUISITES: bool = Field(default=True)
WGET_USER_AGENT: str = Field(default='', alias='USER_AGENT')
WGET_TIMEOUT: int = Field(default=60, alias='TIMEOUT')
WGET_CHECK_SSL_VALIDITY: bool = Field(default=True, alias='CHECK_SSL_VALIDITY')
WGET_RESTRICT_FILE_NAMES: str = Field(default='windows', alias='RESTRICT_FILE_NAMES')
WGET_COOKIES_FILE: Optional[Path] = Field(default=None, alias='COOKIES_FILE')
CONFIG = {
'CHECK_SSL_VALIDITY': False,
'SAVE_WARC': False,
'TIMEOUT': 999,
}
WGET_CONFIG = [
WgetToggleConfig(**CONFIG),
WgetDependencyConfig(**CONFIG),
WgetOptionsConfig(**CONFIG),
]
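# Illustrative effect of the shared CONFIG above (based on the aliases defined in these models):
#   WgetToggleConfig(**CONFIG).SAVE_WARC -> False (overrides the default of True)
#   WgetOptionsConfig(**CONFIG).WGET_TIMEOUT -> 999 (populated via the TIMEOUT alias)
#   WgetOptionsConfig(**CONFIG).WGET_CHECK_SSL_VALIDITY -> False (via the CHECK_SSL_VALIDITY alias)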

View file

@@ -1,118 +0,0 @@
__package__ = 'archivebox.plugantic'
from typing import Optional, List, Literal, Annotated, Dict, Any
from typing_extensions import Self
from abc import ABC
from pathlib import Path
from pydantic import BaseModel, model_validator, field_serializer, AfterValidator
from .binaries import (
Binary,
YtdlpBinary,
WgetBinary,
)
# stubs
class Snapshot:
pass
class ArchiveResult:
pass
def get_wget_output_path(*args, **kwargs) -> Path:
return Path('.').resolve()
def no_empty_args(args: List[str]) -> List[str]:
assert all(len(arg) for arg in args)
return args
ExtractorName = Literal['wget', 'warc', 'media']
def is_self_method_ref(s: str) -> str:
    assert s.startswith('self.'), f'handler func must be a "self.*" method reference: {s}'
    return s
HandlerFuncStr = Annotated[str, AfterValidator(is_self_method_ref)]
CmdArgsList = Annotated[List[str], AfterValidator(no_empty_args)]
class Extractor(ABC, BaseModel):
name: ExtractorName
binary: Binary
output_path_func: HandlerFuncStr = 'self.get_output_path'
should_extract_func: HandlerFuncStr = 'self.should_extract'
extract_func: HandlerFuncStr = 'self.extract'
exec_func: HandlerFuncStr = 'self.exec'
default_args: CmdArgsList = []
extra_args: CmdArgsList = []
args: Optional[CmdArgsList] = None
@model_validator(mode='after')
def validate_model(self) -> Self:
if self.args is None:
self.args = [*self.default_args, *self.extra_args]
return self
@field_serializer('binary', when_used='json')
def dump_binary(binary) -> str:
return binary.name
def get_output_path(self, snapshot) -> Path:
return Path(self.name)
def should_extract(self, snapshot) -> bool:
output_dir = self.get_output_path(snapshot)
if any(output_dir.glob('*.*')):  # glob() returns a generator, which is always truthy by itself
return False
return True
def extract(self, url: str, **kwargs) -> Dict[str, Any]:
output_dir = self.get_output_path(url, **kwargs)
cmd = [url, *self.args] if self.args is not None else [url, *self.default_args, *self.extra_args]
proc = self.exec(cmd, pwd=output_dir)
return {
'status': 'succeeded' if proc.returncode == 0 else 'failed',
'output': proc.stdout.decode().strip().split('\n')[-1],
'output_files': list(output_dir.glob('*.*')),
'stdout': proc.stdout.decode().strip(),
'stderr': proc.stderr.decode().strip(),
'returncode': proc.returncode,
}
def exec(self, args: CmdArgsList, pwd: Optional[Path]=None):
pwd = pwd or Path('.')
assert self.binary.loaded_provider
return self.binary.exec(args, pwd=pwd)
class YtdlpExtractor(Extractor):
name: ExtractorName = 'media'
binary: Binary = YtdlpBinary()
def get_output_path(self, snapshot) -> Path:
return Path(self.name)
class WgetExtractor(Extractor):
name: ExtractorName = 'wget'
binary: Binary = WgetBinary()
def get_output_path(self, snapshot) -> Path:
return get_wget_output_path(snapshot)
class WarcExtractor(Extractor):
name: ExtractorName = 'warc'
binary: Binary = WgetBinary()
def get_output_path(self, snapshot) -> Path:
return get_wget_output_path(snapshot)
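# Illustrative usage sketch (not part of the original file; the binary must be loaded first
# so that loaded_provider is set before exec() is called):
#   extractor = WgetExtractor(binary=WgetBinary().load())
#   result = extractor.extract('https://example.com')
#   result['status'], result['returncode'], result['output_files']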

View file

@@ -1,396 +0,0 @@
from typing import Dict, Any, List
import configparser
import json
import ast
JSONValue = str | bool | int | None | List['JSONValue']
def load_ini_value(val: str) -> JSONValue:
"""Convert lax INI values into strict TOML-compliant (JSON) values"""
if val.lower() in ('true', 'yes', '1'):
return True
if val.lower() in ('false', 'no', '0'):
return False
if val.isdigit():
return int(val)
try:
return ast.literal_eval(val)
except Exception:
pass
try:
return json.loads(val)
except Exception as err:
pass
return val
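# e.g. (illustrative) load_ini_value('True') -> True, load_ini_value('20') -> 20,
# load_ini_value("['--foo']") -> ['--foo'], load_ini_value('hello world') -> 'hello world'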
def convert(ini_str: str) -> str:
"""Convert a string of INI config into its TOML equivalent (warning: strips comments)"""
config = configparser.ConfigParser()
config.optionxform = str  # preserve key case (configparser lowercases keys by default)
config.read_string(ini_str)
# Initialize an empty dictionary to store the TOML representation
toml_dict = {}
# Iterate over each section in the INI configuration
for section in config.sections():
toml_dict[section.upper()] = {}
# Iterate over each key-value pair in the section
for key, value in config.items(section):
parsed_value = load_ini_value(value)
# Convert the parsed value to its TOML-compatible JSON representation
toml_dict[section.upper()][key.upper()] = json.dumps(parsed_value)
# Build the TOML string
toml_str = ""
for section, items in toml_dict.items():
toml_str += f"[{section}]\n"
for key, value in items.items():
toml_str += f"{key} = {value}\n"
toml_str += "\n"
return toml_str.strip()
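# Quick illustrative example of the conversion above:
#   convert("[GENERAL_CONFIG]\nUSE_COLOR=False\nTIMEOUT=60\nCURL_ARGS=['--silent']")
#   -> '[GENERAL_CONFIG]\nUSE_COLOR = false\nTIMEOUT = 60\nCURL_ARGS = ["--silent"]'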
### Basic Assertions
test_input = """
[SERVER_CONFIG]
IS_TTY=False
USE_COLOR=False
SHOW_PROGRESS=False
IN_DOCKER=False
IN_QEMU=False
PUID=501
PGID=20
OUTPUT_DIR=/opt/archivebox/data
CONFIG_FILE=/opt/archivebox/data/ArchiveBox.conf
ONLY_NEW=True
TIMEOUT=60
MEDIA_TIMEOUT=3600
OUTPUT_PERMISSIONS=644
RESTRICT_FILE_NAMES=windows
URL_DENYLIST=\.(css|js|otf|ttf|woff|woff2|gstatic\.com|googleapis\.com/css)(\?.*)?$
URL_ALLOWLIST=None
ADMIN_USERNAME=None
ADMIN_PASSWORD=None
ENFORCE_ATOMIC_WRITES=True
TAG_SEPARATOR_PATTERN=[,]
SECRET_KEY=xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
BIND_ADDR=127.0.0.1:8000
ALLOWED_HOSTS=*
DEBUG=False
PUBLIC_INDEX=True
PUBLIC_SNAPSHOTS=True
PUBLIC_ADD_VIEW=False
FOOTER_INFO=Content is hosted for personal archiving purposes only. Contact server owner for any takedown requests.
SNAPSHOTS_PER_PAGE=40
CUSTOM_TEMPLATES_DIR=None
TIME_ZONE=UTC
TIMEZONE=UTC
REVERSE_PROXY_USER_HEADER=Remote-User
REVERSE_PROXY_WHITELIST=
LOGOUT_REDIRECT_URL=/
PREVIEW_ORIGINALS=True
LDAP=False
LDAP_SERVER_URI=None
LDAP_BIND_DN=None
LDAP_BIND_PASSWORD=None
LDAP_USER_BASE=None
LDAP_USER_FILTER=None
LDAP_USERNAME_ATTR=None
LDAP_FIRSTNAME_ATTR=None
LDAP_LASTNAME_ATTR=None
LDAP_EMAIL_ATTR=None
LDAP_CREATE_SUPERUSER=False
SAVE_TITLE=True
SAVE_FAVICON=True
SAVE_WGET=True
SAVE_WGET_REQUISITES=True
SAVE_SINGLEFILE=True
SAVE_READABILITY=True
SAVE_MERCURY=True
SAVE_HTMLTOTEXT=True
SAVE_PDF=True
SAVE_SCREENSHOT=True
SAVE_DOM=True
SAVE_HEADERS=True
SAVE_WARC=True
SAVE_GIT=True
SAVE_MEDIA=True
SAVE_ARCHIVE_DOT_ORG=True
RESOLUTION=1440,2000
GIT_DOMAINS=github.com,bitbucket.org,gitlab.com,gist.github.com,codeberg.org,gitea.com,git.sr.ht
CHECK_SSL_VALIDITY=True
MEDIA_MAX_SIZE=750m
USER_AGENT=None
CURL_USER_AGENT=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 ArchiveBox/0.8.0 (+https://github.com/ArchiveBox/ArchiveBox/) curl/curl 8.4.0 (x86_64-apple-darwin23.0)
WGET_USER_AGENT=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 ArchiveBox/0.8.0 (+https://github.com/ArchiveBox/ArchiveBox/) wget/GNU Wget 1.24.5
CHROME_USER_AGENT=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 ArchiveBox/0.8.0 (+https://github.com/ArchiveBox/ArchiveBox/)
COOKIES_FILE=None
CHROME_USER_DATA_DIR=None
CHROME_TIMEOUT=0
CHROME_HEADLESS=True
CHROME_SANDBOX=True
CHROME_EXTRA_ARGS=[]
YOUTUBEDL_ARGS=['--restrict-filenames', '--trim-filenames', '128', '--write-description', '--write-info-json', '--write-annotations', '--write-thumbnail', '--no-call-home', '--write-sub', '--write-auto-subs', '--convert-subs=srt', '--yes-playlist', '--continue', '--no-abort-on-error', '--ignore-errors', '--geo-bypass', '--add-metadata', '--format=(bv*+ba/b)[filesize<=750m][filesize_approx<=?750m]/(bv*+ba/b)']
YOUTUBEDL_EXTRA_ARGS=[]
WGET_ARGS=['--no-verbose', '--adjust-extension', '--convert-links', '--force-directories', '--backup-converted', '--span-hosts', '--no-parent', '-e', 'robots=off']
WGET_EXTRA_ARGS=[]
CURL_ARGS=['--silent', '--location', '--compressed']
CURL_EXTRA_ARGS=[]
GIT_ARGS=['--recursive']
SINGLEFILE_ARGS=[]
SINGLEFILE_EXTRA_ARGS=[]
MERCURY_ARGS=['--format=text']
MERCURY_EXTRA_ARGS=[]
FAVICON_PROVIDER=https://www.google.com/s2/favicons?domain={}
USE_INDEXING_BACKEND=True
USE_SEARCHING_BACKEND=True
SEARCH_BACKEND_ENGINE=ripgrep
SEARCH_BACKEND_HOST_NAME=localhost
SEARCH_BACKEND_PORT=1491
SEARCH_BACKEND_PASSWORD=SecretPassword
SEARCH_PROCESS_HTML=True
SONIC_COLLECTION=archivebox
SONIC_BUCKET=snapshots
SEARCH_BACKEND_TIMEOUT=90
FTS_SEPARATE_DATABASE=True
FTS_TOKENIZERS=porter unicode61 remove_diacritics 2
FTS_SQLITE_MAX_LENGTH=1000000000
USE_CURL=True
USE_WGET=True
USE_SINGLEFILE=True
USE_READABILITY=True
USE_MERCURY=True
USE_GIT=True
USE_CHROME=True
USE_NODE=True
USE_YOUTUBEDL=True
USE_RIPGREP=True
CURL_BINARY=curl
GIT_BINARY=git
WGET_BINARY=wget
SINGLEFILE_BINARY=single-file
READABILITY_BINARY=readability-extractor
MERCURY_BINARY=postlight-parser
YOUTUBEDL_BINARY=yt-dlp
NODE_BINARY=node
RIPGREP_BINARY=rg
CHROME_BINARY=chrome
POCKET_CONSUMER_KEY=None
USER=squash
PACKAGE_DIR=/opt/archivebox/archivebox
TEMPLATES_DIR=/opt/archivebox/archivebox/templates
ARCHIVE_DIR=/opt/archivebox/data/archive
SOURCES_DIR=/opt/archivebox/data/sources
LOGS_DIR=/opt/archivebox/data/logs
PERSONAS_DIR=/opt/archivebox/data/personas
URL_DENYLIST_PTN=re.compile('\\.(css|js|otf|ttf|woff|woff2|gstatic\\.com|googleapis\\.com/css)(\\?.*)?$', re.IGNORECASE|re.MULTILINE)
URL_ALLOWLIST_PTN=None
DIR_OUTPUT_PERMISSIONS=755
ARCHIVEBOX_BINARY=/opt/archivebox/.venv/bin/archivebox
VERSION=0.8.0
COMMIT_HASH=102e87578c6036bb0132dd1ebd17f8f05ffc880f
BUILD_TIME=2024-05-15 03:28:05 1715768885
VERSIONS_AVAILABLE=None
CAN_UPGRADE=False
PYTHON_BINARY=/opt/archivebox/.venv/bin/python3.10
PYTHON_ENCODING=UTF-8
PYTHON_VERSION=3.10.14
DJANGO_BINARY=/opt/archivebox/.venv/lib/python3.10/site-packages/django/__init__.py
DJANGO_VERSION=5.0.6 final (0)
SQLITE_BINARY=/opt/homebrew/Cellar/python@3.10/3.10.14/Frameworks/Python.framework/Versions/3.10/lib/python3.10/sqlite3/dbapi2.py
SQLITE_VERSION=2.6.0
CURL_VERSION=curl 8.4.0 (x86_64-apple-darwin23.0)
WGET_VERSION=GNU Wget 1.24.5
WGET_AUTO_COMPRESSION=True
RIPGREP_VERSION=ripgrep 14.1.0
SINGLEFILE_VERSION=None
READABILITY_VERSION=None
MERCURY_VERSION=None
GIT_VERSION=git version 2.44.0
YOUTUBEDL_VERSION=2024.04.09
CHROME_VERSION=Google Chrome 124.0.6367.207
NODE_VERSION=v21.7.3
"""
expected_output = '''[SERVER_CONFIG]
IS_TTY = false
USE_COLOR = false
SHOW_PROGRESS = false
IN_DOCKER = false
IN_QEMU = false
PUID = 501
PGID = 20
OUTPUT_DIR = "/opt/archivebox/data"
CONFIG_FILE = "/opt/archivebox/data/ArchiveBox.conf"
ONLY_NEW = true
TIMEOUT = 60
MEDIA_TIMEOUT = 3600
OUTPUT_PERMISSIONS = 644
RESTRICT_FILE_NAMES = "windows"
URL_DENYLIST = "\\\\.(css|js|otf|ttf|woff|woff2|gstatic\\\\.com|googleapis\\\\.com/css)(\\\\?.*)?$"
URL_ALLOWLIST = null
ADMIN_USERNAME = null
ADMIN_PASSWORD = null
ENFORCE_ATOMIC_WRITES = true
TAG_SEPARATOR_PATTERN = "[,]"
SECRET_KEY = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
BIND_ADDR = "127.0.0.1:8000"
ALLOWED_HOSTS = "*"
DEBUG = false
PUBLIC_INDEX = true
PUBLIC_SNAPSHOTS = true
PUBLIC_ADD_VIEW = false
FOOTER_INFO = "Content is hosted for personal archiving purposes only. Contact server owner for any takedown requests."
SNAPSHOTS_PER_PAGE = 40
CUSTOM_TEMPLATES_DIR = null
TIME_ZONE = "UTC"
TIMEZONE = "UTC"
REVERSE_PROXY_USER_HEADER = "Remote-User"
REVERSE_PROXY_WHITELIST = ""
LOGOUT_REDIRECT_URL = "/"
PREVIEW_ORIGINALS = true
LDAP = false
LDAP_SERVER_URI = null
LDAP_BIND_DN = null
LDAP_BIND_PASSWORD = null
LDAP_USER_BASE = null
LDAP_USER_FILTER = null
LDAP_USERNAME_ATTR = null
LDAP_FIRSTNAME_ATTR = null
LDAP_LASTNAME_ATTR = null
LDAP_EMAIL_ATTR = null
LDAP_CREATE_SUPERUSER = false
SAVE_TITLE = true
SAVE_FAVICON = true
SAVE_WGET = true
SAVE_WGET_REQUISITES = true
SAVE_SINGLEFILE = true
SAVE_READABILITY = true
SAVE_MERCURY = true
SAVE_HTMLTOTEXT = true
SAVE_PDF = true
SAVE_SCREENSHOT = true
SAVE_DOM = true
SAVE_HEADERS = true
SAVE_WARC = true
SAVE_GIT = true
SAVE_MEDIA = true
SAVE_ARCHIVE_DOT_ORG = true
RESOLUTION = [1440, 2000]
GIT_DOMAINS = "github.com,bitbucket.org,gitlab.com,gist.github.com,codeberg.org,gitea.com,git.sr.ht"
CHECK_SSL_VALIDITY = true
MEDIA_MAX_SIZE = "750m"
USER_AGENT = null
CURL_USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 ArchiveBox/0.8.0 (+https://github.com/ArchiveBox/ArchiveBox/) curl/curl 8.4.0 (x86_64-apple-darwin23.0)"
WGET_USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 ArchiveBox/0.8.0 (+https://github.com/ArchiveBox/ArchiveBox/) wget/GNU Wget 1.24.5"
CHROME_USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 ArchiveBox/0.8.0 (+https://github.com/ArchiveBox/ArchiveBox/)"
COOKIES_FILE = null
CHROME_USER_DATA_DIR = null
CHROME_TIMEOUT = false
CHROME_HEADLESS = true
CHROME_SANDBOX = true
CHROME_EXTRA_ARGS = []
YOUTUBEDL_ARGS = ["--restrict-filenames", "--trim-filenames", "128", "--write-description", "--write-info-json", "--write-annotations", "--write-thumbnail", "--no-call-home", "--write-sub", "--write-auto-subs", "--convert-subs=srt", "--yes-playlist", "--continue", "--no-abort-on-error", "--ignore-errors", "--geo-bypass", "--add-metadata", "--format=(bv*+ba/b)[filesize<=750m][filesize_approx<=?750m]/(bv*+ba/b)"]
YOUTUBEDL_EXTRA_ARGS = []
WGET_ARGS = ["--no-verbose", "--adjust-extension", "--convert-links", "--force-directories", "--backup-converted", "--span-hosts", "--no-parent", "-e", "robots=off"]
WGET_EXTRA_ARGS = []
CURL_ARGS = ["--silent", "--location", "--compressed"]
CURL_EXTRA_ARGS = []
GIT_ARGS = ["--recursive"]
SINGLEFILE_ARGS = []
SINGLEFILE_EXTRA_ARGS = []
MERCURY_ARGS = ["--format=text"]
MERCURY_EXTRA_ARGS = []
FAVICON_PROVIDER = "https://www.google.com/s2/favicons?domain={}"
USE_INDEXING_BACKEND = true
USE_SEARCHING_BACKEND = true
SEARCH_BACKEND_ENGINE = "ripgrep"
SEARCH_BACKEND_HOST_NAME = "localhost"
SEARCH_BACKEND_PORT = 1491
SEARCH_BACKEND_PASSWORD = "SecretPassword"
SEARCH_PROCESS_HTML = true
SONIC_COLLECTION = "archivebox"
SONIC_BUCKET = "snapshots"
SEARCH_BACKEND_TIMEOUT = 90
FTS_SEPARATE_DATABASE = true
FTS_TOKENIZERS = "porter unicode61 remove_diacritics 2"
FTS_SQLITE_MAX_LENGTH = 1000000000
USE_CURL = true
USE_WGET = true
USE_SINGLEFILE = true
USE_READABILITY = true
USE_MERCURY = true
USE_GIT = true
USE_CHROME = true
USE_NODE = true
USE_YOUTUBEDL = true
USE_RIPGREP = true
CURL_BINARY = "curl"
GIT_BINARY = "git"
WGET_BINARY = "wget"
SINGLEFILE_BINARY = "single-file"
READABILITY_BINARY = "readability-extractor"
MERCURY_BINARY = "postlight-parser"
YOUTUBEDL_BINARY = "yt-dlp"
NODE_BINARY = "node"
RIPGREP_BINARY = "rg"
CHROME_BINARY = "chrome"
POCKET_CONSUMER_KEY = null
USER = "squash"
PACKAGE_DIR = "/opt/archivebox/archivebox"
TEMPLATES_DIR = "/opt/archivebox/archivebox/templates"
ARCHIVE_DIR = "/opt/archivebox/data/archive"
SOURCES_DIR = "/opt/archivebox/data/sources"
LOGS_DIR = "/opt/archivebox/data/logs"
PERSONAS_DIR = "/opt/archivebox/data/personas"
URL_DENYLIST_PTN = "re.compile(\'\\\\.(css|js|otf|ttf|woff|woff2|gstatic\\\\.com|googleapis\\\\.com/css)(\\\\?.*)?$\', re.IGNORECASE|re.MULTILINE)"
URL_ALLOWLIST_PTN = null
DIR_OUTPUT_PERMISSIONS = 755
ARCHIVEBOX_BINARY = "/opt/archivebox/.venv/bin/archivebox"
VERSION = "0.8.0"
COMMIT_HASH = "102e87578c6036bb0132dd1ebd17f8f05ffc880f"
BUILD_TIME = "2024-05-15 03:28:05 1715768885"
VERSIONS_AVAILABLE = null
CAN_UPGRADE = false
PYTHON_BINARY = "/opt/archivebox/.venv/bin/python3.10"
PYTHON_ENCODING = "UTF-8"
PYTHON_VERSION = "3.10.14"
DJANGO_BINARY = "/opt/archivebox/.venv/lib/python3.10/site-packages/django/__init__.py"
DJANGO_VERSION = "5.0.6 final (0)"
SQLITE_BINARY = "/opt/homebrew/Cellar/python@3.10/3.10.14/Frameworks/Python.framework/Versions/3.10/lib/python3.10/sqlite3/dbapi2.py"
SQLITE_VERSION = "2.6.0"
CURL_VERSION = "curl 8.4.0 (x86_64-apple-darwin23.0)"
WGET_VERSION = "GNU Wget 1.24.5"
WGET_AUTO_COMPRESSION = true
RIPGREP_VERSION = "ripgrep 14.1.0"
SINGLEFILE_VERSION = null
READABILITY_VERSION = null
MERCURY_VERSION = null
GIT_VERSION = "git version 2.44.0"
YOUTUBEDL_VERSION = "2024.04.09"
CHROME_VERSION = "Google Chrome 124.0.6367.207"
NODE_VERSION = "v21.7.3"'''
first_output = convert(test_input) # make sure ini -> toml parses correctly
second_output = convert(first_output) # make sure toml -> toml parses/dumps consistently
assert first_output == second_output == expected_output # make sure parsing is idempotent
# # DEBUGGING
# import sys
# import difflib
# sys.stdout.writelines(difflib.context_diff(first_output, second_output, fromfile='first', tofile='second'))
# print(repr(second_output))

View file

@@ -1,38 +0,0 @@
# Generated by Django 5.0.6 on 2024-05-18 00:16
import abid_utils.models
import archivebox.plugantic.plugins
import charidfield.fields
import django.core.serializers.json
import django.db.models.deletion
import django_pydantic_field.fields
import uuid
from django.conf import settings
from django.db import migrations, models
class Migration(migrations.Migration):
initial = True
dependencies = [
migrations.swappable_dependency(settings.AUTH_USER_MODEL),
]
operations = [
migrations.CreateModel(
name='Plugin',
fields=[
('created', models.DateTimeField(auto_now_add=True)),
('modified', models.DateTimeField(auto_now=True)),
('id', models.UUIDField(default=uuid.uuid4, editable=False, primary_key=True, serialize=False)),
('uuid', models.UUIDField(blank=True, null=True, unique=True)),
('abid', charidfield.fields.CharIDField(blank=True, db_index=True, default=None, help_text='ABID-format identifier for this entity (e.g. snp_01BJQMF54D093DXEAWZ6JYRPAQ)', max_length=30, null=True, prefix='plg_', unique=True)),
('schema', django_pydantic_field.fields.PydanticSchemaField(config=None, encoder=django.core.serializers.json.DjangoJSONEncoder, schema=archivebox.plugantic.plugins.Plugin)),
('created_by', models.ForeignKey(default=abid_utils.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL)),
],
options={
'abstract': False,
},
),
]

View file

@@ -1,21 +0,0 @@
# Generated by Django 5.0.6 on 2024-05-18 01:16
import archivebox.plugantic.plugins
import django.core.serializers.json
import django_pydantic_field.fields
from django.db import migrations
class Migration(migrations.Migration):
dependencies = [
('plugantic', '0001_initial'),
]
operations = [
migrations.AlterField(
model_name='plugin',
name='schema',
field=django_pydantic_field.fields.PydanticSchemaField(config=None, default=None, encoder=django.core.serializers.json.DjangoJSONEncoder, schema=archivebox.plugantic.plugins.Plugin),
),
]

View file

@@ -1,21 +0,0 @@
# Generated by Django 5.0.6 on 2024-05-18 01:25
import archivebox.plugantic.replayers
import django.core.serializers.json
import django_pydantic_field.fields
from django.db import migrations
class Migration(migrations.Migration):
dependencies = [
('plugantic', '0002_alter_plugin_schema'),
]
operations = [
migrations.AlterField(
model_name='plugin',
name='schema',
field=django_pydantic_field.fields.PydanticSchemaField(config=None, default={'embed_template': 'plugins/generic_replayer/templates/embed.html', 'fullpage_template': 'plugins/generic_replayer/templates/fullpage.html', 'name': 'GenericReplayer', 'row_template': 'plugins/generic_replayer/templates/row.html', 'url_pattern': '*'}, encoder=django.core.serializers.json.DjangoJSONEncoder, schema=archivebox.plugantic.replayers.Replayer),
),
]

View file

@@ -1,32 +0,0 @@
# Generated by Django 5.0.6 on 2024-05-18 01:28
import archivebox.plugantic.configs
import django.core.serializers.json
import django_pydantic_field.compat.django
import django_pydantic_field.fields
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('plugantic', '0003_alter_plugin_schema'),
]
operations = [
migrations.RemoveField(
model_name='plugin',
name='schema',
),
migrations.AddField(
model_name='plugin',
name='configs',
field=django_pydantic_field.fields.PydanticSchemaField(config=None, default=[], encoder=django.core.serializers.json.DjangoJSONEncoder, schema=django_pydantic_field.compat.django.GenericContainer(list, (archivebox.plugantic.configs.ConfigSet,))),
),
migrations.AddField(
model_name='plugin',
name='name',
field=models.CharField(default='name', max_length=64, unique=True),
preserve_default=False,
),
]

View file

@@ -1,39 +0,0 @@
# Generated by Django 5.0.6 on 2024-05-18 01:42
import abid_utils.models
import charidfield.fields
import django.db.models.deletion
import pathlib
import uuid
from django.conf import settings
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('plugantic', '0004_remove_plugin_schema_plugin_configs_plugin_name'),
migrations.swappable_dependency(settings.AUTH_USER_MODEL),
]
operations = [
migrations.CreateModel(
name='CustomPlugin',
fields=[
('created', models.DateTimeField(auto_now_add=True)),
('modified', models.DateTimeField(auto_now=True)),
('id', models.UUIDField(default=uuid.uuid4, editable=False, primary_key=True, serialize=False)),
('uuid', models.UUIDField(blank=True, null=True, unique=True)),
('abid', charidfield.fields.CharIDField(blank=True, db_index=True, default=None, help_text='ABID-format identifier for this entity (e.g. snp_01BJQMF54D093DXEAWZ6JYRPAQ)', max_length=30, null=True, prefix='plg_', unique=True)),
('name', models.CharField(max_length=64, unique=True)),
('path', models.FilePathField(path=pathlib.PurePosixPath('/Volumes/NVME/Users/squash/Local/Code/archiveboxes/ArchiveBox/archivebox/plugins'))),
('created_by', models.ForeignKey(default=abid_utils.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL)),
],
options={
'abstract': False,
},
),
migrations.DeleteModel(
name='Plugin',
),
]

View file

@@ -1,19 +0,0 @@
# Generated by Django 5.0.6 on 2024-05-18 01:45
import pathlib
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('plugantic', '0005_customplugin_delete_plugin'),
]
operations = [
migrations.AlterField(
model_name='customplugin',
name='path',
field=models.FilePathField(allow_files=False, allow_folders=True, path=pathlib.PurePosixPath('/Volumes/NVME/Users/squash/Local/Code/archiveboxes/ArchiveBox/archivebox/plugins'), recursive=True),
),
]

View file

@@ -1,19 +0,0 @@
# Generated by Django 5.0.6 on 2024-05-18 01:46
import pathlib
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('plugantic', '0006_alter_customplugin_path'),
]
operations = [
migrations.AlterField(
model_name='customplugin',
name='path',
field=models.FilePathField(allow_files=False, allow_folders=True, path=pathlib.PurePosixPath('/Volumes/NVME/Users/squash/Local/Code/archiveboxes/ArchiveBox/data/plugins'), recursive=True),
),
]

View file

@@ -1,19 +0,0 @@
# Generated by Django 5.0.6 on 2024-05-18 01:47
import pathlib
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('plugantic', '0007_alter_customplugin_path'),
]
operations = [
migrations.AlterField(
model_name='customplugin',
name='path',
field=models.FilePathField(allow_files=False, allow_folders=True, path=pathlib.PurePosixPath('/Volumes/NVME/Users/squash/Local/Code/archiveboxes/ArchiveBox/data'), recursive=True),
),
]

View file

@@ -1,18 +0,0 @@
# Generated by Django 5.0.6 on 2024-05-18 01:48
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('plugantic', '0008_alter_customplugin_path'),
]
operations = [
migrations.AlterField(
model_name='customplugin',
name='path',
field=models.FilePathField(allow_files=False, allow_folders=True, path='/Volumes/NVME/Users/squash/Local/Code/archiveboxes/ArchiveBox/data', recursive=True),
),
]

Some files were not shown because too many files have changed in this diff.