Compare commits

164 commits:

ba14ee0e5e, 99e6f0c93f, 5b369246fd, d11173eaa4, de489d3c60, 78f0ae469e, 8d1d39b21e, 508809eab5, de62976975, 59fa687768, 22be8dcf03, ff18adf0e4, 5418e70526, f574d34357, c63917a22d, 1cd62ecc61, c570674798, 4af743e9e0, ac7b3eabea, 729f05ab93, 11a24d9640, b3f2a71c93, 07d80db5aa, 925b6d943d, 8eac7f09ce, 3114980eeb, 99b19e1917, 9677282dc5, f72bae8eec, 48becde9b4, e4176dbf7a, 774ce3fda7, 29c7aa26bc, acfd346440, a1afd0211f, fdf6f465db, 406f57031a, a4cc10d7f8, 241a7c6ab2, 1ba8215072, 206e7e74b3, 0420662174, 9733b8d04c, 4f9f22e024, f896e5dbeb, e97d779cd3, b5ad134264, ce833e8ead, 33bc4622a0, 0529099639, e1a04729b3, 0438924491, 2b4b6e5b3a, 27809f2976, baf24d2d6a, 8841e8b181, 457c42bf84, c7f55fc3ba, 102e87578c, 913590ee39, 3882b1ee22, 471cf06d89, 340fc95f75, 75a3f03149, 2e9512adfd, 1c76193704, e23c7cb3db, 7a8ed9cd55, 72f52d5dd5, 3ce801a182, 8b75788644, 7673b42117, 03296e2200, e522810a20, 69579a73ec, f5f8d091c3, 51601632c2, b489338555, d03f447555, 68a859ccfd, 6baf2b2f69, d451636224, 208c16c611, 16d1b92fd6, b90ba6c909, 09360fd191, 4c5a3fba8b, f2729c9dc7, cf9ef88aa8, 9b21ce490e, f62cb5fb43, f770bba3cf, ce42472732, ef856e8051, 27d5d1ddc8, 664e09f0b4, f472705d10, 3095265880, 60df0c3137, 32aea66913, 8ccd606973, 94ee394339, 027c029316, 8667ed29f1, f998647350, 29c794925e, 641a07b08a, c30d697904, d782bafe2e, 47666ec26b, f067451267, c7fc9c004f, 08931edbe0, 9dc7065506, 12a990c178, f95b369f0d, 90b7a7f40d, 3805a1730d, 2094ed842b, 8d7dd47c43, e20eb52f15, 17b35496cc, 1c9f9fe1b7, 8f3901dd36, 18a5b6e99c, 6a6ae7468e, 1d9e7ec66a, 8cbc1a4adc, 4a5ad32040, af669d2f37, 716ba52450, 75153252dc, e5aba0dc2e, 6cb357e76c, 128419f991, beb3932d80, 3afdd3d96f, 463ea54616, c6d644be29, 8e9cfc8869, 98c5e69203, 8dcfa93ec6, e28f33fcd0, 665a2e505f, 17f40f3ada, c6f8a33a63, 24175f5b4a, a1a877f47f, 63fc317229, 756e159dfe, 667cf38fc6, 11acc9ceea, 55d6bde7db, bc0b0303ea, 82b38df8ec, 8ced9fd4bb, e4dc2701ef, 99502bd928, b76875aab6, 9ad99d86c1, 5f9aac18f2, 4ae765ec27, 9d4cc361e6, 279883d6bb
```diff
@@ -17,6 +17,11 @@ venv/
 .venv-old/
 .docker-venv/
 node_modules/
+chrome/
+chromeprofile/
+
+pdm.dev.lock
+pdm.lock
 
 docs/
 build/
@@ -30,3 +35,5 @@ docker/
 data/
 data*/
 output/
+index.sqlite3
+index.sqlite3-wal
```
**.gitattributes** (vendored, new file, +2 lines)

```
**/*.lock
**/*-lock.json
```
**.github/FUNDING.yml** (vendored, 5 lines changed)

```diff
@@ -1,3 +1,2 @@
-github: pirate
-patreon: theSquashSH
-custom: ["https://hcb.hackclub.com/donations/start/archivebox", "https://paypal.me/NicholasSweeting"]
+github: ["ArchiveBox", "pirate"]
+custom: ["https://donate.archivebox.io", "https://paypal.me/NicholasSweeting"]
```
**.github/dependabot.yml** (vendored, new file, +25 lines)

```yaml
# To get started with Dependabot version updates, you'll need to specify which
# package ecosystems to update and where the package manifests are located.
# Please see the documentation for all configuration options:
# https://docs.github.com/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file

version: 2
updates:
  - package-ecosystem: "pip"
    directory: "/"
    target-branch: "dev"
    schedule:
      interval: "monthly"
    groups:
      pip:
        patterns:
          - "*"
  - package-ecosystem: "npm"
    directory: "/"
    target-branch: "dev"
    schedule:
      interval: "monthly"
    groups:
      npm:
        patterns:
          - "*"
```
**.github/workflows/codeql.yml** (vendored, new file, +92 lines)

```yaml
# For most projects, this workflow file will not need changing; you simply need
# to commit it to your repository.
#
# You may wish to alter this file to override the set of languages analyzed,
# or to provide custom queries or build logic.
#
# ******** NOTE ********
# We have attempted to detect the languages in your repository. Please check
# the `language` matrix defined below to confirm you have the correct set of
# supported CodeQL languages.
#
name: "CodeQL"

on:
  push:
    branches: [ "dev" ]
  pull_request:
    branches: [ "dev" ]
  schedule:
    - cron: '33 17 * * 6'

jobs:
  analyze:
    name: Analyze (${{ matrix.language }})
    # Runner size impacts CodeQL analysis time. To learn more, please see:
    #   - https://gh.io/recommended-hardware-resources-for-running-codeql
    #   - https://gh.io/supported-runners-and-hardware-resources
    #   - https://gh.io/using-larger-runners (GitHub.com only)
    # Consider using larger runners or machines with greater resources for possible analysis time improvements.
    runs-on: ${{ (matrix.language == 'swift' && 'macos-latest') || 'ubuntu-latest' }}
    timeout-minutes: ${{ (matrix.language == 'swift' && 120) || 360 }}
    permissions:
      # required for all workflows
      security-events: write

      # required to fetch internal or private CodeQL packs
      packages: read

      # only required for workflows in private repositories
      actions: read
      contents: read

    strategy:
      fail-fast: false
      matrix:
        include:
        - language: python
          build-mode: none
        # CodeQL supports the following values keywords for 'language': 'c-cpp', 'csharp', 'go', 'java-kotlin', 'javascript-typescript', 'python', 'ruby', 'swift'
        # Use `c-cpp` to analyze code written in C, C++ or both
        # Use 'java-kotlin' to analyze code written in Java, Kotlin or both
        # Use 'javascript-typescript' to analyze code written in JavaScript, TypeScript or both
        # To learn more about changing the languages that are analyzed or customizing the build mode for your analysis,
        # see https://docs.github.com/en/code-security/code-scanning/creating-an-advanced-setup-for-code-scanning/customizing-your-advanced-setup-for-code-scanning.
        # If you are analyzing a compiled language, you can modify the 'build-mode' for that language to customize how
        # your codebase is analyzed, see https://docs.github.com/en/code-security/code-scanning/creating-an-advanced-setup-for-code-scanning/codeql-code-scanning-for-compiled-languages
    steps:
    - name: Checkout repository
      uses: actions/checkout@v4

    # Initializes the CodeQL tools for scanning.
    - name: Initialize CodeQL
      uses: github/codeql-action/init@v3
      with:
        languages: ${{ matrix.language }}
        build-mode: ${{ matrix.build-mode }}
        # If you wish to specify custom queries, you can do so here or in a config file.
        # By default, queries listed here will override any specified in a config file.
        # Prefix the list here with "+" to use these queries and those in the config file.

        # For more details on CodeQL's query packs, refer to: https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs
        # queries: security-extended,security-and-quality

    # If the analyze step fails for one of the languages you are analyzing with
    # "We were unable to automatically build your code", modify the matrix above
    # to set the build mode to "manual" for that language. Then modify this step
    # to build your code.
    # ℹ️ Command-line programs to run using the OS shell.
    # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun
    - if: matrix.build-mode == 'manual'
      run: |
        echo 'If you are using a "manual" build mode for one or more of the' \
          'languages you are analyzing, replace this with the commands to build' \
          'your code, for example:'
        echo '  make bootstrap'
        echo '  make release'
        exit 1

    - name: Perform CodeQL Analysis
      uses: github/codeql-action/analyze@v3
      with:
        category: "/language:${{matrix.language}}"
```
**.github/workflows/docker.yml** (vendored, 32 lines changed)

```diff
@@ -11,7 +11,7 @@ on:
 
 env:
   DOCKER_IMAGE: archivebox-ci
 
 jobs:
   buildx:
     runs-on: ubuntu-latest
@@ -24,21 +24,21 @@ jobs:
 
       - name: Set up QEMU
         uses: docker/setup-qemu-action@v3
 
       - name: Set up Docker Buildx
         id: buildx
         uses: docker/setup-buildx-action@v3
         with:
           version: latest
           install: true
-          platforms: linux/amd64,linux/arm64,linux/arm/v7
+          platforms: linux/amd64,linux/arm64
 
       - name: Builder instance name
         run: echo ${{ steps.buildx.outputs.name }}
 
       - name: Available platforms
         run: echo ${{ steps.buildx.outputs.platforms }}
 
       - name: Cache Docker layers
         uses: actions/cache@v3
         with:
@@ -51,21 +51,27 @@ jobs:
         uses: docker/login-action@v3
         if: github.event_name != 'pull_request'
         with:
           username: ${{ secrets.DOCKER_USERNAME }}
           password: ${{ secrets.DOCKER_PASSWORD }}
 
       - name: Collect Docker tags
+        # https://github.com/docker/metadata-action
         id: docker_meta
         uses: docker/metadata-action@v5
         with:
           images: archivebox/archivebox,nikisweeting/archivebox
           tags: |
+            # :stable
             type=ref,event=branch
+            # :0.7.3
             type=semver,pattern={{version}}
+            # :0.7
             type=semver,pattern={{major}}.{{minor}}
+            # :sha-463ea54
             type=sha
-            type=raw,value=latest,enable={{is_default_branch}}
+            # :latest
+            type=raw,value=latest,enable=${{ github.ref == format('refs/heads/{0}', 'stable') }}
 
       - name: Build and push
         id: docker_build
         uses: docker/build-push-action@v5
@@ -77,7 +83,7 @@ jobs:
           tags: ${{ steps.docker_meta.outputs.tags }}
           cache-from: type=local,src=/tmp/.buildx-cache
           cache-to: type=local,dest=/tmp/.buildx-cache-new
-          platforms: linux/amd64,linux/arm64,linux/arm/v7
+          platforms: linux/amd64,linux/arm64
 
       - name: Image digest
         run: echo ${{ steps.docker_build.outputs.digest }}
@@ -88,7 +94,7 @@ jobs:
           username: ${{ secrets.DOCKER_USERNAME }}
           password: ${{ secrets.DOCKER_PASSWORD }}
           repository: archivebox/archivebox
 
       # This ugly bit is necessary if you don't want your cache to grow forever
       # until it hits GitHub's limit of 5GB.
       # Temp fix
```
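As an aside, the inline `# :tag` comments added to the `tags:` block above document what each docker/metadata-action rule is expected to emit. Here is a hypothetical sketch of that mapping for a v0.7.3 build from the `stable` branch, derived purely from those comments rather than from captured CI output:

```python
# Hypothetical mapping, derived from the "# :stable", "# :0.7.3", etc. comments
# in the workflow diff above, not from real CI output.
expected_tags = {
    'type=ref,event=branch':                   'archivebox/archivebox:stable',
    'type=semver,pattern={{version}}':         'archivebox/archivebox:0.7.3',
    'type=semver,pattern={{major}}.{{minor}}': 'archivebox/archivebox:0.7',
    'type=sha':                                'archivebox/archivebox:sha-463ea54',
    'type=raw,value=latest,enable=...':        'archivebox/archivebox:latest',  # only on refs/heads/stable
}
```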
**.gitignore** (vendored, 12 lines changed)

```diff
@@ -12,6 +12,11 @@ venv/
 .docker-venv/
 node_modules/
+
+# Ignore dev lockfiles (should always be built fresh)
+pdm.lock
+pdm.dev.lock
+requirements-dev.txt
 
 # Packaging artifacts
 .pdm-python
 .pdm-build
@@ -22,11 +27,12 @@ dist/
 
 # Data folders
 data/
-data1/
-data2/
-data3/
 data*/
 output/
+index.sqlite3
+*.sqlite*
+data.*
 
 # vim
 *.sw?
+.vscode
```
**Dockerfile** (104 lines changed)

```diff
@@ -20,9 +20,23 @@ FROM python:3.11-slim-bookworm
 
 LABEL name="archivebox" \
     maintainer="Nick Sweeting <dockerfile@archivebox.io>" \
-    description="All-in-one personal internet archiving container" \
+    description="All-in-one self-hosted internet archiving solution" \
     homepage="https://github.com/ArchiveBox/ArchiveBox" \
-    documentation="https://github.com/ArchiveBox/ArchiveBox/wiki/Docker#docker"
+    documentation="https://github.com/ArchiveBox/ArchiveBox/wiki/Docker" \
+    org.opencontainers.image.title="ArchiveBox" \
+    org.opencontainers.image.vendor="ArchiveBox" \
+    org.opencontainers.image.description="All-in-one self-hosted internet archiving solution" \
+    org.opencontainers.image.source="https://github.com/ArchiveBox/ArchiveBox" \
+    com.docker.image.source.entrypoint="Dockerfile" \
+    # TODO: release ArchiveBox as a Docker Desktop extension (requires these labels):
+    # https://docs.docker.com/desktop/extensions-sdk/architecture/metadata/
+    com.docker.desktop.extension.api.version=">= 1.4.7" \
+    com.docker.desktop.extension.icon="https://archivebox.io/icon.png" \
+    com.docker.extension.publisher-url="https://archivebox.io" \
+    com.docker.extension.screenshots='[{"alt": "Screenshot of Admin UI", "url": "https://github.com/ArchiveBox/ArchiveBox/assets/511499/e8e0b6f8-8fdf-4b7f-8124-c10d8699bdb2"}]' \
+    com.docker.extension.detailed-description='See here for detailed documentation: https://wiki.archivebox.io' \
+    com.docker.extension.changelog='See here for release notes: https://github.com/ArchiveBox/ArchiveBox/releases' \
+    com.docker.extension.categories='database,utility-tools'
 
 ARG TARGETPLATFORM
 ARG TARGETOS
@@ -73,7 +87,9 @@ COPY --chown=root:root --chmod=755 package.json "$CODE_DIR/"
 RUN grep '"version": ' "${CODE_DIR}/package.json" | awk -F'"' '{print $4}' > /VERSION.txt
 
 # Force apt to leave downloaded binaries in /var/cache/apt (massively speeds up Docker builds)
-RUN echo 'Binary::apt::APT::Keep-Downloaded-Packages "true";' > /etc/apt/apt.conf.d/keep-cache \
+RUN echo 'Binary::apt::APT::Keep-Downloaded-Packages "1";' > /etc/apt/apt.conf.d/99keep-cache \
+    && echo 'APT::Install-Recommends "0";' > /etc/apt/apt.conf.d/99no-intall-recommends \
+    && echo 'APT::Install-Suggests "0";' > /etc/apt/apt.conf.d/99no-intall-suggests \
    && rm -f /etc/apt/apt.conf.d/docker-clean
 
 # Print debug info about build and save it to disk, for human eyes only, not used by anything else
@@ -106,10 +122,10 @@ RUN echo "[*] Setting up $ARCHIVEBOX_USER user uid=${DEFAULT_PUID}..." \
 # Install system apt dependencies (adding backports to access more recent apt updates)
 RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$TARGETVARIANT \
     echo "[+] Installing APT base system dependencies for $TARGETPLATFORM..." \
-    && echo 'deb https://deb.debian.org/debian bookworm-backports main contrib non-free' >> /etc/apt/sources.list.d/backports.list \
+    && echo 'deb https://deb.debian.org/debian bookworm-backports main contrib non-free' > /etc/apt/sources.list.d/backports.list \
    && mkdir -p /etc/apt/keyrings \
    && apt-get update -qq \
-    && apt-get install -qq -y -t bookworm-backports --no-install-recommends \
+    && apt-get install -qq -y -t bookworm-backports \
        # 1. packaging dependencies
        apt-transport-https ca-certificates apt-utils gnupg2 curl wget \
        # 2. docker and init system dependencies
@@ -120,27 +136,13 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$T
 
 ######### Language Environments ####################################
 
-# Install Node environment
-RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$TARGETVARIANT --mount=type=cache,target=/root/.npm,sharing=locked,id=npm-$TARGETARCH$TARGETVARIANT \
-    echo "[+] Installing Node $NODE_VERSION environment in $NODE_MODULES..." \
-    && echo "deb [signed-by=/etc/apt/keyrings/nodesource.gpg] https://deb.nodesource.com/node_${NODE_VERSION}.x nodistro main" >> /etc/apt/sources.list.d/nodejs.list \
-    && curl -fsSL "https://deb.nodesource.com/gpgkey/nodesource-repo.gpg.key" | gpg --dearmor | gpg --dearmor -o /etc/apt/keyrings/nodesource.gpg \
-    && apt-get update -qq \
-    && apt-get install -qq -y -t bookworm-backports --no-install-recommends \
-        nodejs libatomic1 python3-minimal \
-    && rm -rf /var/lib/apt/lists/* \
-    # Update NPM to latest version
-    && npm i -g npm --cache /root/.npm \
-    # Save version info
-    && ( \
-        which node && node --version \
-        && which npm && npm --version \
-        && echo -e '\n\n' \
-    ) | tee -a /VERSION.txt
-
 # Install Python environment
 RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$TARGETVARIANT --mount=type=cache,target=/root/.cache/pip,sharing=locked,id=pip-$TARGETARCH$TARGETVARIANT \
     echo "[+] Setting up Python $PYTHON_VERSION runtime..." \
+    # && apt-get update -qq \
+    # && apt-get install -qq -y -t bookworm-backports --no-upgrade \
+    #     python${PYTHON_VERSION} python${PYTHON_VERSION}-minimal python3-pip \
+    # && rm -rf /var/lib/apt/lists/* \
    # tell PDM to allow using global system python site packages
    # && rm /usr/lib/python3*/EXTERNALLY-MANAGED \
    # create global virtual environment GLOBAL_VENV to use (better than using pip install --global)
@@ -157,13 +159,34 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$T
        && echo -e '\n\n' \
    ) | tee -a /VERSION.txt
 
+
+# Install Node environment
+RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$TARGETVARIANT --mount=type=cache,target=/root/.npm,sharing=locked,id=npm-$TARGETARCH$TARGETVARIANT \
+    echo "[+] Installing Node $NODE_VERSION environment in $NODE_MODULES..." \
+    && echo "deb [signed-by=/etc/apt/keyrings/nodesource.gpg] https://deb.nodesource.com/node_${NODE_VERSION}.x nodistro main" >> /etc/apt/sources.list.d/nodejs.list \
+    && curl -fsSL "https://deb.nodesource.com/gpgkey/nodesource-repo.gpg.key" | gpg --dearmor | gpg --dearmor -o /etc/apt/keyrings/nodesource.gpg \
+    && apt-get update -qq \
+    && apt-get install -qq -y -t bookworm-backports --no-upgrade libatomic1 \
+    && apt-get install -y -t bookworm-backports --no-upgrade \
+        nodejs \
+    && rm -rf /var/lib/apt/lists/* \
+    # Update NPM to latest version
+    && npm i -g npm --cache /root/.npm \
+    # Save version info
+    && ( \
+        which node && node --version \
+        && which npm && npm --version \
+        && echo -e '\n\n' \
+    ) | tee -a /VERSION.txt
+
+
 ######### Extractor Dependencies ##################################
 
 # Install apt dependencies
 RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$TARGETVARIANT --mount=type=cache,target=/root/.cache/pip,sharing=locked,id=pip-$TARGETARCH$TARGETVARIANT \
     echo "[+] Installing APT extractor dependencies globally using apt..." \
    && apt-get update -qq \
-    && apt-get install -qq -y -t bookworm-backports --no-install-recommends \
+    && apt-get install -qq -y -t bookworm-backports \
        curl wget git yt-dlp ffmpeg ripgrep \
        # Packages we have also needed in the past:
        # youtube-dl wget2 aria2 python3-pyxattr rtmpdump libfribidi-bin mpv \
@@ -182,25 +205,21 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$T
 RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$TARGETVARIANT --mount=type=cache,target=/root/.cache/pip,sharing=locked,id=pip-$TARGETARCH$TARGETVARIANT --mount=type=cache,target=/root/.cache/ms-playwright,sharing=locked,id=browsers-$TARGETARCH$TARGETVARIANT \
     echo "[+] Installing Browser binary dependencies to $PLAYWRIGHT_BROWSERS_PATH..." \
    && apt-get update -qq \
-    && apt-get install -qq -y -t bookworm-backports --no-install-recommends \
+    && apt-get install -qq -y -t bookworm-backports \
        fontconfig fonts-ipafont-gothic fonts-wqy-zenhei fonts-thai-tlwg fonts-khmeros fonts-kacst fonts-symbola fonts-noto fonts-freefont-ttf \
+        at-spi2-common fonts-liberation fonts-noto-color-emoji fonts-tlwg-loma-otf fonts-unifont libatk-bridge2.0-0 libatk1.0-0 libatspi2.0-0 libavahi-client3 \
+        libavahi-common-data libavahi-common3 libcups2 libfontenc1 libice6 libnspr4 libnss3 libsm6 libunwind8 \
+        libxaw7 libxcomposite1 libxdamage1 libxfont2 \
+        libxkbfile1 libxmu6 libxpm4 libxt6 x11-xkb-utils xfonts-encodings \
+        # xfonts-scalable xfonts-utils xserver-common xvfb \
        # chrome can run without dbus/upower technically, it complains about missing dbus but should run ok anyway
        # libxss1 dbus dbus-x11 upower \
    # && service dbus start \
-    && if [[ "$TARGETPLATFORM" == *amd64* || "$TARGETPLATFORM" == *arm64* ]]; then \
-        # install Chromium using playwright
-        pip install playwright \
-        && cp -r /root/.cache/ms-playwright "$PLAYWRIGHT_BROWSERS_PATH" \
-        && playwright install --with-deps chromium \
-        && export CHROME_BINARY="$(python -c 'from playwright.sync_api import sync_playwright; print(sync_playwright().start().chromium.executable_path)')"; \
-    else \
-        # fall back to installing Chromium via apt-get on platforms not supported by playwright (e.g. risc, ARMv7, etc.)
-        # apt-get install -qq -y -t bookworm-backports --no-install-recommends \
-        #     chromium \
-        # && export CHROME_BINARY="$(which chromium)"; \
-        echo 'armv7 no longer supported in versions after v0.7.3' \
-        exit 1; \
-    fi \
+    # install Chromium using playwright
+    && pip install playwright \
+    && cp -r /root/.cache/ms-playwright "$PLAYWRIGHT_BROWSERS_PATH" \
+    && playwright install chromium \
+    && export CHROME_BINARY="$(python -c 'from playwright.sync_api import sync_playwright; print(sync_playwright().start().chromium.executable_path)')" \
    && rm -rf /var/lib/apt/lists/* \
    && ln -s "$CHROME_BINARY" /usr/bin/chromium-browser \
    && mkdir -p "/home/${ARCHIVEBOX_USER}/.config/chromium/Crash Reports/pending/" \
@@ -233,7 +252,7 @@ COPY --chown=root:root --chmod=755 "./pyproject.toml" "requirements.txt" "$CODE_
 RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$TARGETVARIANT --mount=type=cache,target=/root/.cache/pip,sharing=locked,id=pip-$TARGETARCH$TARGETVARIANT \
     echo "[+] Installing PIP ArchiveBox dependencies from requirements.txt for ${TARGETPLATFORM}..." \
    && apt-get update -qq \
-    && apt-get install -qq -y -t bookworm-backports --no-install-recommends \
+    && apt-get install -qq -y -t bookworm-backports \
        build-essential \
        libssl-dev libldap2-dev libsasl2-dev \
        python3-ldap python3-msgpack python3-mutagen python3-regex python3-pycryptodome procps \
@@ -255,8 +274,8 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$T
     echo "[*] Installing PIP ArchiveBox package from $CODE_DIR..." \
    # && apt-get update -qq \
    # install C compiler to build deps on platforms that dont have 32-bit wheels available on pypi
-    # && apt-get install -qq -y -t bookworm-backports --no-install-recommends \
+    # && apt-get install -qq -y -t bookworm-backports \
    #     build-essential \
    # INSTALL ARCHIVEBOX python package globally from CODE_DIR, with all optional dependencies
    && pip install -e "$CODE_DIR"[sonic,ldap] \
    # save docker image size and always remove compilers / build tools after building is complete
@@ -271,7 +290,6 @@ WORKDIR "$DATA_DIR"
 ENV IN_DOCKER=True \
     DISPLAY=novnc:0.0 \
     CUSTOM_TEMPLATES_DIR=/data/templates \
-    CHROME_USER_DATA_DIR=/data/personas/Default/chromium \
     GOOGLE_API_KEY=no \
     GOOGLE_DEFAULT_CLIENT_ID=no \
     GOOGLE_DEFAULT_CLIENT_SECRET=no \
```
62
README.md
62
README.md
|
@ -93,7 +93,7 @@ docker run -it -v $PWD:/data archivebox/archivebox init --setup
|
||||||
pip install archivebox
|
pip install archivebox
|
||||||
mkdir -p ~/archivebox/data && cd ~/archivebox/data
|
mkdir -p ~/archivebox/data && cd ~/archivebox/data
|
||||||
archivebox init --setup
|
archivebox init --setup
|
||||||
# archviebox add 'https://example.com'
|
# archivebox add 'https://example.com'
|
||||||
# archivebox help
|
# archivebox help
|
||||||
# archivebox server 0.0.0.0:8000
|
# archivebox server 0.0.0.0:8000
|
||||||
<br/>
|
<br/>
|
||||||
|
@ -124,8 +124,8 @@ curl -fsSL 'https://get.archivebox.io' | sh
|
||||||
|
|
||||||
## Key Features
|
## Key Features
|
||||||
|
|
||||||
- [**Free & open source**](https://github.com/ArchiveBox/ArchiveBox/blob/dev/LICENSE), doesn't require signing up online, stores all data locally
|
- [**Free & open source**](https://github.com/ArchiveBox/ArchiveBox/blob/dev/LICENSE), own your own data & maintain your privacy by self-hosting
|
||||||
- [**Powerful, intuitive command line interface**](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#CLI-Usage) with [modular optional dependencies](#dependencies)
|
- [**Powerful CLI**](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#CLI-Usage) with [modular dependencies](#dependencies) and [support for Google Drive/NFS/SMB/S3/B2/etc.](https://github.com/ArchiveBox/ArchiveBox/wiki/Setting-Up-Storage)
|
||||||
- [**Comprehensive documentation**](https://github.com/ArchiveBox/ArchiveBox/wiki), [active development](https://github.com/ArchiveBox/ArchiveBox/wiki/Roadmap), and [rich community](https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community)
|
- [**Comprehensive documentation**](https://github.com/ArchiveBox/ArchiveBox/wiki), [active development](https://github.com/ArchiveBox/ArchiveBox/wiki/Roadmap), and [rich community](https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community)
|
||||||
- [**Extracts a wide variety of content out-of-the-box**](https://github.com/ArchiveBox/ArchiveBox/issues/51): [media (yt-dlp), articles (readability), code (git), etc.](#output-formats)
|
- [**Extracts a wide variety of content out-of-the-box**](https://github.com/ArchiveBox/ArchiveBox/issues/51): [media (yt-dlp), articles (readability), code (git), etc.](#output-formats)
|
||||||
- [**Supports scheduled/realtime importing**](https://github.com/ArchiveBox/ArchiveBox/wiki/Scheduled-Archiving) from [many types of sources](#input-formats)
|
- [**Supports scheduled/realtime importing**](https://github.com/ArchiveBox/ArchiveBox/wiki/Scheduled-Archiving) from [many types of sources](#input-formats)
|
||||||
|
@ -152,8 +152,8 @@ ArchiveBox is free for everyone to self-host, but we also provide support, secur
|
||||||
- **Governments:**
|
- **Governments:**
|
||||||
`snapshoting public service sites`, `recordkeeping compliance`
|
`snapshoting public service sites`, `recordkeeping compliance`
|
||||||
|
|
||||||
> ***[Contact us](https://zulip.archivebox.io/#narrow/stream/167-enterprise/topic/welcome/near/1191102)** if your org wants help using ArchiveBox professionally.*
|
> ***[Contact us](https://zulip.archivebox.io/#narrow/stream/167-enterprise/topic/welcome/near/1191102)** if your org wants help using ArchiveBox professionally.* (we are also seeking [grant funding](https://github.com/ArchiveBox/ArchiveBox/issues/1126#issuecomment-1487431394))
|
||||||
> We offer: setup & support, hosting, custom features, security, hashing & audit logging/chain-of-custody, etc.
|
> We offer: setup & support, CAPTCHA/ratelimit unblocking, SSO, audit logging/chain-of-custody, and more
|
||||||
> *ArchiveBox has 🏛️ 501(c)(3) [nonprofit status](https://hackclub.com/hcb/) and all our work supports open-source development.*
|
> *ArchiveBox has 🏛️ 501(c)(3) [nonprofit status](https://hackclub.com/hcb/) and all our work supports open-source development.*
|
||||||
|
|
||||||
<br/>
|
<br/>
|
||||||
|
@ -407,11 +407,12 @@ See <a href="#%EF%B8%8F-cli-usage">below</a> for usage examples using the CLI, W
|
||||||
> *Warning: These are contributed by external volunteers and may lag behind the official `pip` channel.*
|
> *Warning: These are contributed by external volunteers and may lag behind the official `pip` channel.*
|
||||||
|
|
||||||
<ul>
|
<ul>
|
||||||
<li>TrueNAS: <a href="https://truecharts.org/charts/incubator/archivebox/">Official ArchiveBox TrueChart</a> / <a href="https://dev.to/finloop/setting-up-archivebox-on-truenas-scale-1788">Custom App Guide</a></li>
|
<li>TrueNAS: <a href="https://truecharts.org/charts/stable/archivebox/">Official ArchiveBox TrueChart</a> / <a href="https://dev.to/finloop/setting-up-archivebox-on-truenas-scale-1788">Custom App Guide</a></li>
|
||||||
<li><a href="https://unraid.net/community/apps?q=archivebox#r">UnRaid</a></li>
|
<li><a href="https://unraid.net/community/apps?q=archivebox#r">UnRaid</a></li>
|
||||||
<li><a href="https://github.com/YunoHost-Apps/archivebox_ynh">Yunohost</a></li>
|
<li><a href="https://github.com/YunoHost-Apps/archivebox_ynh">Yunohost</a></li>
|
||||||
<li><a href="https://www.cloudron.io/store/io.archivebox.cloudronapp.html">Cloudron</a></li>
|
<li><a href="https://www.cloudron.io/store/io.archivebox.cloudronapp.html">Cloudron</a></li>
|
||||||
<li><a href="https://github.com/ArchiveBox/ArchiveBox/pull/922/files#diff-00f0606e18b2618c3cc1667ca7c2b703b537af690ca71eba1330633587dcb1ee">AppImage</a></li>
|
<li><a href="https://github.com/ArchiveBox/ArchiveBox/pull/922/files#diff-00f0606e18b2618c3cc1667ca7c2b703b537af690ca71eba1330633587dcb1ee">AppImage</a></li>
|
||||||
|
<li><a href="https://runtipi.io/docs/apps-available#:~:text=for%20AI%20Chats.-,ArchiveBox,Open%20source%20self%2Dhosted%20web%20archiving.,-Atuin%20Server">Runtipi</a></li>
|
||||||
<li><a href="https://github.com/ArchiveBox/ArchiveBox/issues/986">Umbrel</a> (need contributors...)</li>
|
<li><a href="https://github.com/ArchiveBox/ArchiveBox/issues/986">Umbrel</a> (need contributors...)</li>
|
||||||
|
|
||||||
<li>More: <a href="https://github.com/ArchiveBox/ArchiveBox/issues/new"><i>contribute another distribution...!</i></a></li>
|
<li>More: <a href="https://github.com/ArchiveBox/ArchiveBox/issues/new"><i>contribute another distribution...!</i></a></li>
|
||||||
|
@ -445,6 +446,9 @@ Other providers of paid ArchiveBox hosting (not officially endorsed):<br/>
|
||||||
<li><a href="https://fly.io/">
|
<li><a href="https://fly.io/">
|
||||||
<img src="https://img.shields.io/badge/Unmanaged_App-Fly.io-%239a2de6.svg?style=flat" height="22px"/>
|
<img src="https://img.shields.io/badge/Unmanaged_App-Fly.io-%239a2de6.svg?style=flat" height="22px"/>
|
||||||
</a> (USD $10-50+/mo, <a href="https://fly.io/docs/hands-on/start/">instructions</a>)</li>
|
</a> (USD $10-50+/mo, <a href="https://fly.io/docs/hands-on/start/">instructions</a>)</li>
|
||||||
|
<li><a href="https://railway.app/template/2Vvhmy">
|
||||||
|
<img src="https://img.shields.io/badge/Unmanaged_App-Railway-%23A11BE6.svg?style=flat" height="22px"/>
|
||||||
|
</a> (USD $0-5+/mo)</li>
|
||||||
<li><a href="https://aws.amazon.com/marketplace/pp/Linnovate-Open-Source-Innovation-Support-For-Archi/B08RVW6MJ2"><img src="https://img.shields.io/badge/Unmanaged_VPS-AWS-%23ee8135.svg?style=flat" height="22px"/></a> (USD $60-200+/mo)</li>
|
<li><a href="https://aws.amazon.com/marketplace/pp/Linnovate-Open-Source-Innovation-Support-For-Archi/B08RVW6MJ2"><img src="https://img.shields.io/badge/Unmanaged_VPS-AWS-%23ee8135.svg?style=flat" height="22px"/></a> (USD $60-200+/mo)</li>
|
||||||
<li><a href="https://azuremarketplace.microsoft.com/en-us/marketplace/apps/meanio.archivebox?ocid=gtmrewards_whatsnewblog_archivebox_vol118"><img src="https://img.shields.io/badge/Unmanaged_VPS-Azure-%237cb300.svg?style=flat" height="22px"/></a> (USD $60-200+/mo)</li>
|
<li><a href="https://azuremarketplace.microsoft.com/en-us/marketplace/apps/meanio.archivebox?ocid=gtmrewards_whatsnewblog_archivebox_vol118"><img src="https://img.shields.io/badge/Unmanaged_VPS-Azure-%237cb300.svg?style=flat" height="22px"/></a> (USD $60-200+/mo)</li>
|
||||||
<br/>
|
<br/>
|
||||||
|
@ -669,7 +673,7 @@ docker run -it -v $PWD:/data archivebox/archivebox add --depth=1 'https://exampl
|
||||||
```bash
|
```bash
|
||||||
# archivebox add --help
|
# archivebox add --help
|
||||||
archivebox add 'https://example.com/some/page'
|
archivebox add 'https://example.com/some/page'
|
||||||
archivebox add < ~/Downloads/firefox_bookmarks_export.html
|
archivebox add --parser=generic_rss < ~/Downloads/some_feed.xml
|
||||||
archivebox add --depth=1 'https://news.ycombinator.com#2020-12-12'
|
archivebox add --depth=1 'https://news.ycombinator.com#2020-12-12'
|
||||||
echo 'http://example.com' | archivebox add
|
echo 'http://example.com' | archivebox add
|
||||||
echo 'any text with <a href="https://example.com">urls</a> in it' | archivebox add
|
echo 'any text with <a href="https://example.com">urls</a> in it' | archivebox add
|
||||||
|
@ -865,6 +869,7 @@ Each snapshot subfolder <code>data/archive/TIMESTAMP/</code> includes a static <
|
||||||
|
|
||||||
<h4>Learn More</h4>
|
<h4>Learn More</h4>
|
||||||
<ul>
|
<ul>
|
||||||
|
<li><a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Setting-Up-Storage">Wiki: Setting Up Storage (SMB, NFS, S3, B2, Google Drive, etc.)</a></li>
|
||||||
<li><a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#Disk-Layout">Wiki: Usage (Disk Layout)</a></li>
|
<li><a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#Disk-Layout">Wiki: Usage (Disk Layout)</a></li>
|
||||||
<li><a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#large-archives">Wiki: Usage (Large Archives)</a></li>
|
<li><a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#large-archives">Wiki: Usage (Large Archives)</a></li>
|
||||||
<li><a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview#output-folder">Wiki: Security Overview (Output Folder)</a></li>
|
<li><a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview#output-folder">Wiki: Security Overview (Output Folder)</a></li>
|
||||||
|
@ -1007,7 +1012,7 @@ https://127.0.0.1:8000/archive/*
|
||||||
|
|
||||||
### Working Around Sites that Block Archiving
|
### Working Around Sites that Block Archiving
|
||||||
|
|
||||||
For various reasons, many large sites (Reddit, Twitter, Cloudflare, etc.) actively block archiving or bots in general. There are a number of approaches to work around this.
|
For various reasons, many large sites (Reddit, Twitter, Cloudflare, etc.) actively block archiving or bots in general. There are a number of approaches to work around this, and we also provide <a href="https://docs.monadical.com/s/archivebox-consulting-services">consulting services</a> to help here.
|
||||||
|
|
||||||
<br/>
|
<br/>
|
||||||
<details>
|
<details>
|
||||||
|
@ -1018,7 +1023,7 @@ For various reasons, many large sites (Reddit, Twitter, Cloudflare, etc.) active
|
||||||
<ul>
|
<ul>
|
||||||
<li>Set <a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#curl_user_agent"><code>CHROME_USER_AGENT</code>, <code>WGET_USER_AGENT</code>, <code>CURL_USER_AGENT</code></a> to impersonate a real browser (by default, ArchiveBox reveals that it's a bot when using the default user agent settings)</li>
|
<li>Set <a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#curl_user_agent"><code>CHROME_USER_AGENT</code>, <code>WGET_USER_AGENT</code>, <code>CURL_USER_AGENT</code></a> to impersonate a real browser (by default, ArchiveBox reveals that it's a bot when using the default user agent settings)</li>
|
||||||
<li>Set up a logged-in browser session for archiving using <a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Chromium-Install#setting-up-a-chromium-user-profile"><code>CHROME_USER_DATA_DIR</code> & <code>COOKIES_FILE</code></a></li>
|
<li>Set up a logged-in browser session for archiving using <a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Chromium-Install#setting-up-a-chromium-user-profile"><code>CHROME_USER_DATA_DIR</code> & <code>COOKIES_FILE</code></a></li>
|
||||||
<li>Rewrite your URLs before archiving to swap in an alternative frontend thats more bot-friendly e.g.<br>
|
<li>Rewrite your URLs before archiving to swap in alternative frontends that are more bot-friendly e.g.<br>
|
||||||
<code>reddit.com/some/url</code> -> <code>teddit.net/some/url</code>: <a href="https://github.com/mendel5/alternative-front-ends">https://github.com/mendel5/alternative-front-ends</a></li>
|
<code>reddit.com/some/url</code> -> <code>teddit.net/some/url</code>: <a href="https://github.com/mendel5/alternative-front-ends">https://github.com/mendel5/alternative-front-ends</a></li>
|
||||||
</ul>
|
</ul>
|
||||||
|
|
||||||
|
@ -1174,7 +1179,7 @@ ArchiveBox's stance is that duplication of other people's content is only ethica
|
||||||
- A. doesn't deprive the original creators of revenue and
|
- A. doesn't deprive the original creators of revenue and
|
||||||
- B. is responsibly curated by an individual/institution.
|
- B. is responsibly curated by an individual/institution.
|
||||||
|
|
||||||
In the U.S., <a href="https://guides.library.oregonstate.edu/copyright/libraries">libraries, researchers, and archivists</a> are allowed to duplicate copyrighted materials under <a href="https://libguides.ala.org/copyright/fairuse">"fair use"</a> for <a href="https://guides.cuny.edu/cunyfairuse/librarians#:~:text=One%20of%20these%20specified%20conditions,may%20be%20liable%20for%20copyright">private study, scholarship, or research</a>. Archive.org's preservation work is covered under this exemption, as they are as a non-profit providing public service, and they respond to <a href="https://cardozoaelj.com/2015/03/20/use-of-copyright-law-to-take-down-revenge-porn/">unethical content</a>/<a href="https://help.archive.org/help/rights/">DMCA</a>/<a href="https://gdpr.eu/right-to-be-forgotten/#:~:text=An%20individual%20has%20the%20right,that%20individual%20withdraws%20their%20consent.">GDPR</a> removal requests.
|
In the U.S., <a href="https://guides.library.oregonstate.edu/copyright/libraries">libraries, researchers, and archivists</a> are allowed to duplicate copyrighted materials under <a href="https://libguides.ala.org/copyright/fairuse">"fair use"</a> for <a href="https://guides.cuny.edu/cunyfairuse/librarians#:~:text=One%20of%20these%20specified%20conditions,may%20be%20liable%20for%20copyright">private study, scholarship, or research</a>. Archive.org's non-profit preservation work is <a href="https://blog.archive.org/2024/03/01/fair-use-in-action-at-the-internet-archive/">covered under fair use</a> in the US, and they properly handle <a href="https://cardozoaelj.com/2015/03/20/use-of-copyright-law-to-take-down-revenge-porn/">unethical content</a>/<a href="https://help.archive.org/help/rights/">DMCA</a>/<a href="https://gdpr.eu/right-to-be-forgotten/#:~:text=An%20individual%20has%20the%20right,that%20individual%20withdraws%20their%20consent.">GDPR</a> removal requests to maintain good standing in the eyes of the law.
|
||||||
|
|
||||||
As long as you A. don't try to profit off pirating copyrighted content and B. have processes in place to respond to removal requests, many countries allow you to use sofware like ArchiveBox to ethically and responsibly archive any web content you can view. That being said, ArchiveBox is not liable for how you choose to operate the software. You must research your own local laws and regulations, and get proper legal council if you plan to host a public instance (start by putting your DMCA/GDPR contact info in <a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#footer_info"><code>FOOTER_INFO</code></a> and changing your instance's branding using <a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#custom_templates_dir"><code>CUSTOM_TEMPLATES_DIR</code></a>).
|
As long as you A. don't try to profit off pirating copyrighted content and B. have processes in place to respond to removal requests, many countries allow you to use sofware like ArchiveBox to ethically and responsibly archive any web content you can view. That being said, ArchiveBox is not liable for how you choose to operate the software. You must research your own local laws and regulations, and get proper legal council if you plan to host a public instance (start by putting your DMCA/GDPR contact info in <a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#footer_info"><code>FOOTER_INFO</code></a> and changing your instance's branding using <a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#custom_templates_dir"><code>CUSTOM_TEMPLATES_DIR</code></a>).
|
||||||
|
|
||||||
|
@ -1187,21 +1192,25 @@ As long as you A. don't try to profit off pirating copyrighted content and B. ha
|
||||||
<img src="https://github.com/ArchiveBox/ArchiveBox/assets/511499/4cac62a9-e8fb-425b-85a3-ca644aa6dd42" width="5%" align="right" alt="comparison" style="float: right"/>
|
<img src="https://github.com/ArchiveBox/ArchiveBox/assets/511499/4cac62a9-e8fb-425b-85a3-ca644aa6dd42" width="5%" align="right" alt="comparison" style="float: right"/>
|
||||||
|
|
||||||
|
|
||||||
> **Check out our [community wiki](https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community) for a list of web archiving tools and orgs.**
|
> **Check out our [community wiki](https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community) for a list of alternative web archiving tools and orgs.**
|
||||||
|
|
||||||
A variety of open and closed-source archiving projects exist, but few provide a nice UI and CLI to manage a large, high-fidelity collection over time.
|
ArchiveBox gained momentum in the internet archiving industry because it uniquely combines 3 things:
|
||||||
|
|
||||||
|
- **it's distributed:** users own their data instead of entrusting it to one big central provider
|
||||||
|
- **it's future-proof:** saving in *multiple formats* and extracting out raw TXT, PNG, PDF, MP4, etc. files
|
||||||
|
- **it's extensible:** with powerful APIs, flexible storage, and a big community adding new extractors regularly
|
||||||
|
|
||||||
<br/>
|
<br/>
|
||||||
<details>
|
<details>
|
||||||
<summary><i>Click to read about how we differ from other centralized archiving services and open source tools...</i></summary><br/>
|
<summary><i>Expand for a more direct comparison to Archive.org and specific open-source alternatives...</i></summary><br/>
|
||||||
|
|
||||||
ArchiveBox tries to be a robust, set-and-forget archiving solution suitable for archiving RSS feeds, bookmarks, or your entire browsing history (beware, it may be too big to store), including private/authenticated content that you wouldn't otherwise share with a centralized service.
|
ArchiveBox tries to be a robust, set-and-forget archiving solution suitable for archiving RSS feeds, bookmarks, or your entire browsing history (beware, it may be too big to store), including private/authenticated content that you wouldn't otherwise share with a centralized service like Archive.org.
|
||||||
|
|
||||||
<h3>Comparison With Centralized Public Archives</h3>
|
<h3>Comparison With Centralized Public Archives</h3>
|
||||||
|
|
||||||
Not all content is suitable to be archived in a centralized collection, whether because it's private, copyrighted, too large, or too complex. ArchiveBox hopes to fill that gap.
|
Not all content is suitable to be archived on a centralized, publicly accessible platform. Archive.org doesn't offer the ability to save things behind login walls for good reason, as the content may not have been intended for a public audience. ArchiveBox exists to fill that gap by letting everyone save what they have access to on an individual basis, and to encourage decentralized archiving that's less succeptible to censorship or natural disasters.
|
||||||
|
|
||||||
By having each user store their own content locally, we can save much larger portions of everyone's browsing history than a shared centralized service would be able to handle. The eventual goal is to work towards federated archiving where users can share portions of their collections with each other.
|
By having users store their content locally or within their organizations, we can also save much larger portions of the internet than a centralized service has the disk capcity handle. The eventual goal is to work towards federated archiving where users can share portions of their collections with each other, and with central archives on a case-by-case basis.
|
||||||
|
|
||||||
<h3>Comparison With Other Self-Hosted Archiving Options</h3>
|
<h3>Comparison With Other Self-Hosted Archiving Options</h3>
|
||||||
|
|
||||||
|
@ -1251,7 +1260,7 @@ ArchiveBox is neither the highest fidelity nor the simplest tool available for s
|
||||||
|
|
||||||
**Need help building a custom archiving solution?**
|
**Need help building a custom archiving solution?**
|
||||||
|
|
||||||
> ✨ **[Hire the team that built Archivebox](https://zulip.archivebox.io/#narrow/stream/167-enterprise/topic/welcome/near/1191102) to work on your project.** ([@ArchiveBoxApp](https://twitter.com/ArchiveBoxApp))
|
> ✨ **[Hire the team that built Archivebox](https://zulip.archivebox.io/#narrow/stream/167-enterprise/topic/welcome/near/1191102) to solve archiving for your org.** ([@ArchiveBoxApp](https://twitter.com/ArchiveBoxApp))
|
||||||
|
|
||||||
<br/>
|
<br/>
|
||||||
|
|
||||||
|
@ -1264,9 +1273,11 @@ ArchiveBox is neither the highest fidelity nor the simplest tool available for s
|
||||||
|
|
||||||
<img src="https://read-the-docs-guidelines.readthedocs-hosted.com/_images/logo-dark.png" width="13%" align="right" style="float: right"/>
|
<img src="https://read-the-docs-guidelines.readthedocs-hosted.com/_images/logo-dark.png" width="13%" align="right" style="float: right"/>
|
||||||
|
|
||||||
We use the [GitHub wiki system](https://github.com/ArchiveBox/ArchiveBox/wiki) and [Read the Docs](https://archivebox.readthedocs.io/en/latest/) (WIP) for documentation.
|
We use the [ArchiveBox GitHub Wiki](https://github.com/ArchiveBox/ArchiveBox/wiki) for documentation.
|
||||||
|
|
||||||
You can also access the docs locally by looking in the [`ArchiveBox/docs/`](https://github.com/ArchiveBox/ArchiveBox/wiki/Home) folder.
|
<sub>There is also a mirror available on <a href="https://archivebox.readthedocs.io/en/latest/">Read the Docs</a> (though it's sometimes outdated).</sub>
|
||||||
|
|
||||||
|
> ✏️ You can submit docs changes & suggestions in our dedicated repo [`ArchiveBox/docs`](https://github.com/ArchiveBox/docs).
|
||||||
|
|
||||||
## Getting Started
|
## Getting Started
|
||||||
|
|
||||||
@@ -1277,16 +1288,19 @@ You can also access the docs locally by looking in the [`ArchiveBox/docs/`](http
 - [Configuration](https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration)
 - [Supported Sources](https://github.com/ArchiveBox/ArchiveBox/wiki/Quickstart#2-get-your-list-of-urls-to-archive)
 - [Supported Outputs](https://github.com/ArchiveBox/ArchiveBox/wiki#can-save-these-things-for-each-site)
+- [Scheduled Archiving](https://github.com/ArchiveBox/ArchiveBox/wiki/Scheduled-Archiving)

 ## Advanced

-- [Troubleshooting](https://github.com/ArchiveBox/ArchiveBox/wiki/Troubleshooting)
-- [Scheduled Archiving](https://github.com/ArchiveBox/ArchiveBox/wiki/Scheduled-Archiving)
-- [Publishing Your Archive](https://github.com/ArchiveBox/ArchiveBox/wiki/Publishing-Your-Archive)
-- [Chromium Install](https://github.com/ArchiveBox/ArchiveBox/wiki/Chromium-Install)
-- [Cookies & Sessions Setup](https://github.com/ArchiveBox/ArchiveBox/wiki/Chromium-Install#setting-up-a-chromium-user-profile)
 - [Security Overview](https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview)
+- [Cookies & Sessions Setup](https://github.com/ArchiveBox/ArchiveBox/wiki/Chromium-Install#setting-up-a-chromium-user-profile) (archiving sites that require logins)
+- [Setting up the Search Backends](https://github.com/ArchiveBox/ArchiveBox/wiki/Setting-up-Search) (choosing ripgrep, Sonic, or FTS5)
+- [Setting up Local/Remote Storages](https://github.com/ArchiveBox/ArchiveBox/wiki/Setting-up-Storage) (S3/B2/Google Drive/SMB/NFS/etc.)
+- [Setting up Authentication & Permissions](https://github.com/ArchiveBox/ArchiveBox/wiki/Setting-up-Authentication) (SSO/LDAP/OAuth/API Keys/etc.)
+- [Publishing Your Archive](https://github.com/ArchiveBox/ArchiveBox/wiki/Publishing-Your-Archive) (sharing your archive server with others)
+- [Chromium Install Options](https://github.com/ArchiveBox/ArchiveBox/wiki/Chromium-Install) (installing and configuring ArchiveBox's Chrome)
 - [Upgrading or Merging Archives](https://github.com/ArchiveBox/ArchiveBox/wiki/Upgrading-or-Merging-Archives)
+- [Troubleshooting](https://github.com/ArchiveBox/ArchiveBox/wiki/Troubleshooting)

 ## Developers
@@ -1 +1,4 @@
 __package__ = 'archivebox'
+
+
+from .monkey_patches import *


archivebox/abid_utils/__init__.py  (new file, 1 line)

@@ -0,0 +1 @@
__package__ = 'abid_utils'
archivebox/abid_utils/abid.py  (new file, 191 lines)

@@ -0,0 +1,191 @@
from typing import NamedTuple, Any, Union, Optional

import ulid
import uuid6
import hashlib
from urllib.parse import urlparse

from uuid import UUID
from typeid import TypeID  # type: ignore[import-untyped]
from datetime import datetime


ABID_PREFIX_LEN = 4
ABID_SUFFIX_LEN = 26
ABID_LEN = 30
ABID_TS_LEN = 10
ABID_URI_LEN = 8
ABID_SUBTYPE_LEN = 2
ABID_RAND_LEN = 6

DEFAULT_ABID_PREFIX = 'obj_'


class ABID(NamedTuple):
    """
    e.g. ABID('obj_01HX9FPYTRE4A5CCD901ZYEBQE')
    """
    prefix: str     # e.g. obj_
    ts: str         # e.g. 01HX9FPYTR
    uri: str        # e.g. E4A5CCD9
    subtype: str    # e.g. 01
    rand: str       # e.g. ZYEBQE

    def __getattr__(self, attr: str) -> Any:
        return getattr(self.ulid, attr)

    def __eq__(self, other: Any) -> bool:
        try:
            return self.ulid == other.ulid
        except AttributeError:
            return NotImplemented

    def __str__(self) -> str:
        return self.prefix + self.suffix

    def __len__(self) -> int:
        return len(self.prefix + self.suffix)

    @classmethod
    def parse(cls, buffer: Union[str, UUID, ulid.ULID, TypeID, 'ABID'], prefix=DEFAULT_ABID_PREFIX) -> 'ABID':
        assert buffer, f'Attempted to create ABID from null value {buffer}'

        buffer = str(buffer)
        if '_' in buffer:
            prefix, suffix = buffer.split('_')
        else:
            prefix, suffix = prefix.strip('_'), buffer

        assert len(prefix) == ABID_PREFIX_LEN - 1   # length without trailing _
        assert len(suffix) == ABID_SUFFIX_LEN, f'Suffix {suffix} from {buffer} was not {ABID_SUFFIX_LEN} chars long'

        return cls(
            prefix=abid_part_from_prefix(prefix),
            ts=suffix[0:10].upper(),
            uri=suffix[10:18].upper(),
            subtype=suffix[18:20].upper(),
            rand=suffix[20:26].upper(),
        )

    @property
    def suffix(self):
        return ''.join((self.ts, self.uri, self.subtype, self.rand))

    @property
    def ulid(self) -> ulid.ULID:
        return ulid.parse(self.suffix)

    @property
    def uuid(self) -> UUID:
        return self.ulid.uuid

    @property
    def uuid6(self) -> uuid6.UUID:
        return uuid6.UUID(hex=self.uuid.hex)

    @property
    def typeid(self) -> TypeID:
        return TypeID.from_uuid(prefix=self.prefix.strip('_'), suffix=self.uuid6)

    @property
    def datetime(self) -> datetime:
        return self.ulid.timestamp().datetime


####################################################


def uri_hash(uri: Union[str, bytes]) -> str:
    """
    'E4A5CCD9AF4ED2A6E0954DF19FD274E9CDDB4853051F033FD518BFC90AA1AC25'
    """
    if isinstance(uri, bytes):
        uri_str: str = uri.decode()
    else:
        uri_str = uri

    # only hash the domain part of URLs
    if '://' in uri_str:
        try:
            domain = urlparse(uri_str).netloc
            if domain:
                uri_str = domain
        except AttributeError:
            pass

    uri_bytes = uri_str.encode('utf-8')

    return hashlib.sha256(uri_bytes).hexdigest().upper()


def abid_part_from_prefix(prefix: Optional[str]) -> str:
    """
    'snp_'
    """
    if prefix is None:
        return 'obj_'

    prefix = prefix.strip('_').lower()
    assert len(prefix) == 3
    return prefix + '_'


def abid_part_from_uri(uri: str) -> str:
    """
    'E4A5CCD9'  # takes the first 8 characters of sha256(url)
    """
    uri = str(uri)
    return uri_hash(uri)[:ABID_URI_LEN]


def abid_part_from_ts(ts: Optional[datetime]) -> str:
    """
    '01HX9FPYTR'  # produces the 10-character Timestamp section of a ulid based on the added date
    """
    return str(ulid.from_timestamp(ts) if ts else ulid.new())[:ABID_TS_LEN]


def abid_part_from_subtype(subtype: str) -> str:
    """
    Snapshots have the 01 type, other objects have other subtypes like wget/media/etc.
    Also allows us to change the ulid spec later by putting special sigil values here.
    """
    subtype = str(subtype)
    if len(subtype) == ABID_SUBTYPE_LEN:
        return subtype

    return hashlib.sha256(subtype.encode('utf-8')).hexdigest()[:ABID_SUBTYPE_LEN].upper()


def abid_part_from_rand(rand: Union[str, UUID, None, int]) -> str:
    """
    'ZYEBQE'  # takes the last 6 characters of randomness from an existing legacy uuid db field
    """
    if rand is None:
        # if it's None we generate a new random 6-character hex string
        return str(ulid.new())[-ABID_RAND_LEN:]
    elif isinstance(rand, UUID):
        # if it's a uuid we take the last 6 characters of the ULID representation of it
        return str(ulid.from_uuid(rand))[-ABID_RAND_LEN:]
    elif isinstance(rand, int):
        # if it's a BigAutoInteger field we convert it from an int to a 0-padded string
        rand_str = str(rand)[-ABID_RAND_LEN:]
        padding_needed = ABID_RAND_LEN - len(rand_str)
        rand_str = ('0'*padding_needed) + rand_str
        return rand_str

    # otherwise treat it as a string, take the last 6 characters of it verbatim
    return str(rand)[-ABID_RAND_LEN:].upper()


def abid_from_values(prefix, ts, uri, subtype, rand) -> ABID:
    """
    Return a freshly derived ABID (assembled from attrs defined in ABIDModel.abid_*_src).
    """
    abid = ABID(
        prefix=abid_part_from_prefix(prefix),
        ts=abid_part_from_ts(ts),
        uri=abid_part_from_uri(uri),
        subtype=abid_part_from_subtype(subtype),
        rand=abid_part_from_rand(rand),
    )
    assert abid.ulid and abid.uuid and abid.typeid, f'Failed to calculate {prefix}_ABID for ts={ts} uri={uri} subtype={subtype} rand={rand}'
    return abid
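For orientation, here is a quick sketch of how these helpers fit together (an illustrative example, not part of the diff; it assumes the `ulid`, `uuid6`, and `typeid` dependencies imported above are installed):

from datetime import datetime, timezone

from abid_utils.abid import ABID, abid_from_values

# assemble an ABID from its source values, the same way ABIDModel.get_abid() does
abid = abid_from_values(
    prefix='snp_',
    ts=datetime(2024, 5, 6, tzinfo=timezone.utc),   # becomes the 10-char ULID timestamp section
    uri='https://example.com/some/page',            # only the domain gets sha256-hashed into 8 chars
    subtype='01',                                   # 2-char subtypes are passed through as-is
    rand=123456,                                    # ints are zero-padded to the last 6 chars
)
print(str(abid))    # e.g. 'snp_01HX9FPYTRE4A5CCD901123456'

# the string form round-trips through the parser
assert ABID.parse(str(abid)) == abid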
archivebox/abid_utils/apps.py  (new file, 7 lines)

@@ -0,0 +1,7 @@
from django.apps import AppConfig


class AbidUtilsConfig(AppConfig):
    default_auto_field = 'django.db.models.BigAutoField'

    name = 'abid_utils'
archivebox/abid_utils/migrations/__init__.py  (new file, 0 lines)

archivebox/abid_utils/models.py  (new file, 314 lines)

@@ -0,0 +1,314 @@
"""
This file provides the Django ABIDField and ABIDModel base model to inherit from.

It implements the ArchiveBox ID (ABID) interfaces including abid_values, get_abid, .abid, .uuid, .id.
"""

from typing import Any, Dict, Union, List, Set, NamedTuple, cast

from ulid import ULID
from uuid import uuid4, UUID
from typeid import TypeID  # type: ignore[import-untyped]
from datetime import datetime
from functools import partial
from charidfield import CharIDField  # type: ignore[import-untyped]

from django.conf import settings
from django.db import models
from django.db.utils import OperationalError
from django.contrib.auth import get_user_model

from django_stubs_ext.db.models import TypedModelMeta

from .abid import (
    ABID,
    ABID_LEN,
    ABID_RAND_LEN,
    ABID_SUFFIX_LEN,
    DEFAULT_ABID_PREFIX,
    abid_part_from_prefix,
    abid_from_values,
)

####################################################


# Database Field for typeid/ulid-style IDs with a prefix, e.g. snp_01BJQMF54D093DXEAWZ6JYRPAQ
ABIDField = partial(
    CharIDField,
    max_length=ABID_LEN,
    help_text="ABID-format identifier for this entity (e.g. snp_01BJQMF54D093DXEAWZ6JYRPAQ)",
    default=None,
    null=True,
    blank=True,
    db_index=True,
    unique=True,
)

def get_or_create_system_user_pk(username='system'):
    """Get or create a system user with is_superuser=True to be the default owner for new DB rows"""

    User = get_user_model()

    # if only one superuser exists total, return that user
    if User.objects.filter(is_superuser=True).count() == 1:
        return User.objects.filter(is_superuser=True).values_list('pk', flat=True)[0]

    # otherwise, create a dedicated "system" user
    user, created = User.objects.get_or_create(username=username, is_staff=True, is_superuser=True, defaults={'email': '', 'password': ''})
    return user.pk


class ABIDModel(models.Model):
    """
    Abstract Base Model for other models to depend on. Provides the ArchiveBox ID (ABID) interface.
    """
    abid_prefix: str = DEFAULT_ABID_PREFIX  # e.g. 'tag_'
    abid_ts_src = 'None'                    # e.g. 'self.created'
    abid_uri_src = 'None'                   # e.g. 'self.uri'
    abid_subtype_src = 'None'               # e.g. 'self.extractor'
    abid_rand_src = 'None'                  # e.g. 'self.uuid' or 'self.id'

    id = models.UUIDField(primary_key=True, default=uuid4, editable=True)
    uuid = models.UUIDField(blank=True, null=True, editable=True, unique=True)
    abid = ABIDField(prefix=abid_prefix)

    created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk)
    created = models.DateTimeField(auto_now_add=True)
    modified = models.DateTimeField(auto_now=True)

    class Meta(TypedModelMeta):
        abstract = True

    def save(self, *args: Any, **kwargs: Any) -> None:
        if hasattr(self, 'abid'):
            # self.abid = ABID.parse(self.abid) if self.abid else self.get_abid()
            self.abid = self.get_abid()
        else:
            print(f'[!] WARNING: {self.__class__.__name__}.abid is not a DB field so ABID will not be persisted!')
            self.abid = self.get_abid()

        super().save(*args, **kwargs)

    @property
    def abid_values(self) -> Dict[str, Any]:
        return {
            'prefix': self.abid_prefix,
            'ts': eval(self.abid_ts_src),
            'uri': eval(self.abid_uri_src),
            'subtype': eval(self.abid_subtype_src),
            'rand': eval(self.abid_rand_src),
        }

    def get_abid(self) -> ABID:
        """
        Return a freshly derived ABID (assembled from attrs defined in ABIDModel.abid_*_src).
        """
        prefix, ts, uri, subtype, rand = self.abid_values.values()

        if (not prefix) or prefix == DEFAULT_ABID_PREFIX:
            suggested_abid = self.__class__.__name__[:3].lower()
            raise Exception(f'{self.__class__.__name__}.abid_prefix must be defined to calculate ABIDs (suggested: {suggested_abid})')

        if not ts:
            ts = datetime.utcfromtimestamp(0)
            print(f'[!] WARNING: Generating ABID with ts=0000000000 placeholder because {self.__class__.__name__}.abid_ts_src={self.abid_ts_src} is unset!', ts.isoformat())

        if not uri:
            uri = str(self)
            print(f'[!] WARNING: Generating ABID with uri=str(self) placeholder because {self.__class__.__name__}.abid_uri_src={self.abid_uri_src} is unset!', uri)

        if not subtype:
            subtype = self.__class__.__name__
            print(f'[!] WARNING: Generating ABID with subtype={subtype} placeholder because {self.__class__.__name__}.abid_subtype_src={self.abid_subtype_src} is unset!', subtype)

        if not rand:
            rand = getattr(self, 'uuid', None) or getattr(self, 'id', None) or getattr(self, 'pk')
            print(f'[!] WARNING: Generating ABID with rand=self.id placeholder because {self.__class__.__name__}.abid_rand_src={self.abid_rand_src} is unset!', rand)

        abid = abid_from_values(
            prefix=prefix,
            ts=ts,
            uri=uri,
            subtype=subtype,
            rand=rand,
        )
        assert abid.ulid and abid.uuid and abid.typeid, f'Failed to calculate {prefix}_ABID for {self.__class__.__name__}'
        return abid

    @property
    def ABID(self) -> ABID:
        """
        ULIDParts(timestamp='01HX9FPYTR', url='E4A5CCD9', subtype='00', randomness='ZYEBQE')
        """
        return ABID.parse(self.abid) if getattr(self, 'abid', None) else self.get_abid()

    @property
    def ULID(self) -> ULID:
        """
        Get a ulid.ULID representation of the object's ABID.
        """
        return self.ABID.ulid

    @property
    def UUID(self) -> UUID:
        """
        Get a uuid.UUID (v4) representation of the object's ABID.
        """
        return self.ABID.uuid

    @property
    def TypeID(self) -> TypeID:
        """
        Get a typeid.TypeID (stripe-style) representation of the object's ABID.
        """
        return self.ABID.typeid


####################################################

# Django helpers

def find_all_abid_prefixes() -> Dict[str, type[models.Model]]:
    """
    Return the mapping of all ABID prefixes to their models.
    e.g. {'tag_': core.models.Tag, 'snp_': core.models.Snapshot, ...}
    """
    import django.apps
    prefix_map = {}

    for model in django.apps.apps.get_models():
        abid_prefix = getattr(model, 'abid_prefix', None)
        if abid_prefix:
            prefix_map[abid_prefix] = model
    return prefix_map

def find_prefix_for_abid(abid: ABID) -> str:
    """
    Find the correct prefix for a given ABID that may be missing a prefix (slow).
    e.g. ABID('obj_01BJQMF54D093DXEAWZ6JYRPAQ') -> 'snp_'
    """
    # if the existing abid prefix is correct, lookup is easy
    model = find_model_from_abid(abid)
    if model:
        assert issubclass(model, ABIDModel)
        return model.abid_prefix

    # prefix might be obj_ or missing, fuzzy-search to find any object that matches
    return find_obj_from_abid_rand(abid)[0].abid_prefix

def find_model_from_abid_prefix(prefix: str) -> type[ABIDModel] | None:
    """
    Return the Django Model that corresponds to a given ABID prefix.
    e.g. 'tag_' -> core.models.Tag
    """
    prefix = abid_part_from_prefix(prefix)

    import django.apps

    for model in django.apps.apps.get_models():
        if not issubclass(model, ABIDModel): continue   # skip non-ABID-enabled models
        if not hasattr(model, 'objects'): continue      # skip abstract models

        if (model.abid_prefix == prefix):
            return model

    return None

def find_model_from_abid(abid: ABID) -> type[models.Model] | None:
    """
    Shortcut for find_model_from_abid_prefix(abid.prefix)
    """
    return find_model_from_abid_prefix(abid.prefix)

def find_obj_from_abid_rand(rand: Union[ABID, str], model=None) -> List[ABIDModel]:
    """
    Find an object corresponding to an ABID by exhaustively searching using its random suffix (slow).
    e.g. 'obj_....................JYRPAQ' -> Snapshot('snp_01BJQMF54D093DXEAWZ6JYRPAQ')
    """

    # convert str to ABID if necessary
    if isinstance(rand, ABID):
        abid: ABID = rand
    else:
        rand = str(rand)
        if len(rand) < ABID_SUFFIX_LEN:
            padding_needed = ABID_SUFFIX_LEN - len(rand)
            rand = ('0'*padding_needed) + rand
        abid = ABID.parse(rand)

    import django.apps

    partial_matches: List[ABIDModel] = []

    models_to_try = cast(Set[type[models.Model]], set(filter(bool, (
        model,
        find_model_from_abid(abid),
        *django.apps.apps.get_models(),
    ))))
    # print(abid, abid.rand, abid.uuid, models_to_try)

    for model in models_to_try:
        if not issubclass(model, ABIDModel): continue   # skip Models that aren't ABID-enabled
        if not hasattr(model, 'objects'): continue      # skip abstract Models
        assert hasattr(model, 'objects')                # force-fix for type hint nit about missing manager https://github.com/typeddjango/django-stubs/issues/1684

        # try fuzzy searching by the randomness portion derived from the uuid field
        try:
            qs = []
            if hasattr(model, 'abid'):
                qs = model.objects.filter(abid__endswith=abid.rand)
            elif hasattr(model, 'uuid'):
                qs = model.objects.filter(uuid__endswith=str(abid.uuid)[-ABID_RAND_LEN:])
            elif hasattr(model, 'id'):
                # NOTE: this only works on SQLite, where every column is a string
                # other DB backends like postgres don't let you do __endswith if this is a BigAutoInteger field

                # try to search for uuid=...-2354352
                # try to search for id=...2354352
                # try to search for id=2354352
                qs = model.objects.filter(
                    models.Q(id__endswith=str(abid.uuid)[-ABID_RAND_LEN:])
                    | models.Q(id__endswith=abid.rand)
                    | models.Q(id__startswith=str(int(abid.rand)) if abid.rand.isdigit() else abid.rand)
                )

            for obj in qs:
                if obj.get_abid() == abid:
                    # found exact match, no need to keep iterating
                    return [obj]
                partial_matches.append(obj)
        except OperationalError as err:
            print(f'[!] WARNING: Got error while trying to iterate through QuerySet for {model}:', err, '\n')

    return partial_matches

def find_obj_from_abid(abid: ABID, model=None, fuzzy=False) -> Any:
    """
    Find an object with a given ABID by filtering possible models for a matching abid/uuid/id (fast).
    e.g. 'snp_01BJQMF54D093DXEAWZ6JYRPAQ' -> Snapshot('snp_01BJQMF54D093DXEAWZ6JYRPAQ')
    """

    model = model or find_model_from_abid(abid)
    assert model, f'Could not find model that could match this ABID type: {abid}'

    try:
        if hasattr(model, 'abid'):
            return model.objects.get(abid__endswith=abid.suffix)
        if hasattr(model, 'uuid'):
            return model.objects.get(uuid=abid.uuid)
        return model.objects.get(id=abid.uuid)
    except model.DoesNotExist:
        # if the model has an abid field then it should've matched, pointless to fuzzy search in that case
        if hasattr(model, 'abid') or (not fuzzy):
            raise

        # fall back to fuzzy searching by the randomness portion derived from the uuid field
        match_by_rand = find_obj_from_abid_rand(abid, model=model)
        if match_by_rand:
            if match_by_rand[0].abid_prefix != abid.prefix:
                print(f"[!] WARNING: fetched object {match_by_rand} even though prefix {abid.prefix} doesn't match!", abid, '\n')
            return match_by_rand

        raise model.DoesNotExist
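To make the base model concrete, a subclass might look like this (a hypothetical sketch, not from the ArchiveBox codebase; the `Bookmark`, `url`, and `added` names are made up, mirroring the pattern APIToken uses below):

from django.db import models

from abid_utils.models import ABIDModel, ABIDField


class Bookmark(ABIDModel):
    abid_prefix = 'bkm_'
    abid_ts_src = 'self.added'     # the abid_*_src strings are eval()'d against the instance by abid_values
    abid_uri_src = 'self.url'
    abid_subtype_src = '"01"'      # a quoted literal also works, since the source is eval()'d
    abid_rand_src = 'self.id'

    abid = ABIDField(prefix=abid_prefix)   # override the abstract base's field to bake in the prefix

    url = models.URLField()
    added = models.DateTimeField(auto_now_add=True)

# bookmark.get_abid() then derives something like ABID('bkm_01HX9FPYTRE4A5CCD901ZYEBQE')
# from the timestamp + hash(url) + subtype + randomness sources declared above.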
archivebox/abid_utils/tests.py  (new file, 3 lines)

@@ -0,0 +1,3 @@
from django.test import TestCase

# Create your tests here.
archivebox/api/__init__.py  (new file, 1 line)

@@ -0,0 +1 @@
__package__ = 'archivebox.api'
archivebox/api/apps.py  (new file, 11 lines)

@@ -0,0 +1,11 @@
__package__ = 'archivebox.api'

from django.apps import AppConfig


class APIConfig(AppConfig):
    name = 'api'

    def ready(self):
        pass
archivebox/api/auth.py  (new file, 107 lines)

@@ -0,0 +1,107 @@
__package__ = 'archivebox.api'

from typing import Optional

from django.http import HttpRequest
from django.contrib.auth import login
from django.contrib.auth import authenticate
from django.contrib.auth.models import AbstractBaseUser

from ninja.security import HttpBearer, APIKeyQuery, APIKeyHeader, HttpBasicAuth, django_auth_superuser


def auth_using_token(token, request: Optional[HttpRequest]=None) -> Optional[AbstractBaseUser]:
    """Given an API token string, check if a corresponding non-expired APIToken exists, and return its user"""
    from api.models import APIToken   # lazy import model to avoid loading it at urls.py import time

    user = None

    submitted_empty_form = token in ('string', '', None)   # 'string' is the placeholder value the Swagger UI sends
    if submitted_empty_form:
        user = request.user   # see if user is authed via django session and use that as the default
    else:
        try:
            token = APIToken.objects.get(token=token)
            if token.is_valid():
                user = token.created_by
        except APIToken.DoesNotExist:
            pass

    if not user:
        print('[❌] Failed to authenticate API user using API Key:', request)

    return user


def auth_using_password(username, password, request: Optional[HttpRequest]=None) -> Optional[AbstractBaseUser]:
    """Given a username and password, check if they are valid and return the corresponding user"""
    user = None

    submitted_empty_form = (username, password) in (('string', 'string'), ('', ''), (None, None))
    if submitted_empty_form:
        user = request.user   # see if user is authed via django session and use that as the default
    else:
        user = authenticate(
            username=username,
            password=password,
        )

    if not user:
        print('[❌] Failed to authenticate API user using username & password:', request)

    return user


### Base Auth Types

class APITokenAuthCheck:
    """The base class for authentication methods that use an api.models.APIToken"""
    def authenticate(self, request: HttpRequest, key: Optional[str]=None) -> Optional[AbstractBaseUser]:
        user = auth_using_token(
            token=key,
            request=request,
        )
        if user is not None:
            login(request, user, backend='django.contrib.auth.backends.ModelBackend')
        return user

class UserPassAuthCheck:
    """The base class for authentication methods that use a username & password"""
    def authenticate(self, request: HttpRequest, username: Optional[str]=None, password: Optional[str]=None) -> Optional[AbstractBaseUser]:
        user = auth_using_password(
            username=username,
            password=password,
            request=request,
        )
        if user is not None:
            login(request, user, backend='django.contrib.auth.backends.ModelBackend')
        return user


### Django-Ninja-Provided Auth Methods

class HeaderTokenAuth(APITokenAuthCheck, APIKeyHeader):
    """Allow authenticating by passing X-API-Key=xyz as a request header"""
    param_name = "X-ArchiveBox-API-Key"

class BearerTokenAuth(APITokenAuthCheck, HttpBearer):
    """Allow authenticating by passing Bearer=xyz as a request header"""
    pass

class QueryParamTokenAuth(APITokenAuthCheck, APIKeyQuery):
    """Allow authenticating by passing api_key=xyz as a GET/POST query parameter"""
    param_name = "api_key"

class UsernameAndPasswordAuth(UserPassAuthCheck, HttpBasicAuth):
    """Allow authenticating by passing username & password via HTTP Basic Authentication (not recommended)"""
    pass


### Enabled Auth Methods

API_AUTH_METHODS = [
    HeaderTokenAuth(),
    BearerTokenAuth(),
    QueryParamTokenAuth(),
    django_auth_superuser,
    UsernameAndPasswordAuth(),
]
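In practice, a client can use any of these methods interchangeably; a rough sketch using `requests` (the server URL and token value are placeholders, assuming the api app is mounted at /api/ per archivebox/api/urls.py):

import requests

SERVER = 'http://localhost:8000'   # hypothetical local ArchiveBox server
TOKEN = '1234abcd...'              # obtained from /api/v1/auth/get_api_token

# 1. custom header (HeaderTokenAuth)
requests.post(f'{SERVER}/api/v1/cli/list', json={}, headers={'X-ArchiveBox-API-Key': TOKEN})

# 2. standard Bearer token (BearerTokenAuth)
requests.post(f'{SERVER}/api/v1/cli/list', json={}, headers={'Authorization': f'Bearer {TOKEN}'})

# 3. query parameter (QueryParamTokenAuth)
requests.post(f'{SERVER}/api/v1/cli/list', params={'api_key': TOKEN}, json={})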
archivebox/api/migrations/0001_initial.py  (new file, 29 lines)

@@ -0,0 +1,29 @@
# Generated by Django 4.2.11 on 2024-04-25 04:19

import api.models
from django.conf import settings
from django.db import migrations, models
import django.db.models.deletion
import uuid


class Migration(migrations.Migration):

    initial = True

    dependencies = [
        migrations.swappable_dependency(settings.AUTH_USER_MODEL),
    ]

    operations = [
        migrations.CreateModel(
            name='APIToken',
            fields=[
                ('id', models.UUIDField(default=uuid.uuid4, editable=False, primary_key=True, serialize=False)),
                ('token', models.CharField(default=api.models.generate_secret_token, max_length=32, unique=True)),
                ('created', models.DateTimeField(auto_now_add=True)),
                ('expires', models.DateTimeField(blank=True, null=True)),
                ('user', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL)),
            ],
        ),
    ]
archivebox/api/migrations/0002_alter_apitoken_options.py  (new file, 17 lines)

@@ -0,0 +1,17 @@
# Generated by Django 5.0.4 on 2024-04-26 05:28

from django.db import migrations


class Migration(migrations.Migration):

    dependencies = [
        ('api', '0001_initial'),
    ]

    operations = [
        migrations.AlterModelOptions(
            name='apitoken',
            options={'verbose_name': 'API Key', 'verbose_name_plural': 'API Keys'},
        ),
    ]
@@ -0,0 +1,77 @@
# Generated by Django 5.0.6 on 2024-06-03 01:52

import abid_utils.models
import charidfield.fields
import django.db.models.deletion
import signal_webhooks.fields
import signal_webhooks.utils
import uuid
from django.conf import settings
from django.db import migrations, models


class Migration(migrations.Migration):

    dependencies = [
        ('api', '0002_alter_apitoken_options'),
        migrations.swappable_dependency(settings.AUTH_USER_MODEL),
    ]

    operations = [
        migrations.RenameField(
            model_name='apitoken',
            old_name='user',
            new_name='created_by',
        ),
        migrations.AddField(
            model_name='apitoken',
            name='abid',
            field=charidfield.fields.CharIDField(blank=True, db_index=True, default=None, help_text='ABID-format identifier for this entity (e.g. snp_01BJQMF54D093DXEAWZ6JYRPAQ)', max_length=30, null=True, prefix='apt_', unique=True),
        ),
        migrations.AddField(
            model_name='apitoken',
            name='modified',
            field=models.DateTimeField(auto_now=True),
        ),
        migrations.AddField(
            model_name='apitoken',
            name='uuid',
            field=models.UUIDField(blank=True, null=True, unique=True),
        ),
        migrations.AlterField(
            model_name='apitoken',
            name='id',
            field=models.UUIDField(default=uuid.uuid4, primary_key=True, serialize=False),
        ),
        migrations.CreateModel(
            name='OutboundWebhook',
            fields=[
                ('name', models.CharField(db_index=True, help_text='Give your webhook a descriptive name (e.g. Notify ACME Slack channel of any new ArchiveResults).', max_length=255, unique=True, verbose_name='name')),
                ('signal', models.CharField(choices=[('CREATE', 'Create'), ('UPDATE', 'Update'), ('DELETE', 'Delete'), ('M2M', 'M2M changed'), ('CREATE_OR_UPDATE', 'Create or Update'), ('CREATE_OR_DELETE', 'Create or Delete'), ('CREATE_OR_M2M', 'Create or M2M changed'), ('UPDATE_OR_DELETE', 'Update or Delete'), ('UPDATE_OR_M2M', 'Update or M2M changed'), ('DELETE_OR_M2M', 'Delete or M2M changed'), ('CREATE_UPDATE_OR_DELETE', 'Create, Update or Delete'), ('CREATE_UPDATE_OR_M2M', 'Create, Update or M2M changed'), ('CREATE_DELETE_OR_M2M', 'Create, Delete or M2M changed'), ('UPDATE_DELETE_OR_M2M', 'Update, Delete or M2M changed'), ('CREATE_UPDATE_DELETE_OR_M2M', 'Create, Update or Delete, or M2M changed')], help_text='The type of event the webhook should fire for (e.g. Create, Update, Delete).', max_length=255, verbose_name='signal')),
                ('ref', models.CharField(db_index=True, help_text='Dot import notation of the model the webhook should fire for (e.g. core.models.Snapshot or core.models.ArchiveResult).', max_length=1023, validators=[signal_webhooks.utils.model_from_reference], verbose_name='referenced model')),
                ('endpoint', models.URLField(help_text='External URL to POST the webhook notification to (e.g. https://someapp.example.com/webhook/some-webhook-receiver).', max_length=2047, verbose_name='endpoint')),
                ('headers', models.JSONField(blank=True, default=dict, help_text='Headers to send with the webhook request.', validators=[signal_webhooks.utils.is_dict], verbose_name='headers')),
                ('auth_token', signal_webhooks.fields.TokenField(blank=True, default='', help_text='Authentication token to use in an Authorization header.', max_length=8000, validators=[signal_webhooks.utils.decode_cipher_key], verbose_name='authentication token')),
                ('enabled', models.BooleanField(default=True, help_text='Is this webhook enabled?', verbose_name='enabled')),
                ('keep_last_response', models.BooleanField(default=False, help_text='Should the webhook keep a log of the latest response it got?', verbose_name='keep last response')),
                ('updated', models.DateTimeField(auto_now=True, help_text='When the webhook was last updated.', verbose_name='updated')),
                ('last_response', models.CharField(blank=True, default='', help_text='Latest response to this webhook.', max_length=8000, verbose_name='last response')),
                ('last_success', models.DateTimeField(default=None, help_text='When the webhook last succeeded.', null=True, verbose_name='last success')),
                ('last_failure', models.DateTimeField(default=None, help_text='When the webhook last failed.', null=True, verbose_name='last failure')),
                ('created', models.DateTimeField(auto_now_add=True)),
                ('modified', models.DateTimeField(auto_now=True)),
                ('id', models.UUIDField(blank=True, null=True, unique=True)),
                ('uuid', models.UUIDField(default=uuid.uuid4, primary_key=True, serialize=False)),
                ('abid', charidfield.fields.CharIDField(blank=True, db_index=True, default=None, help_text='ABID-format identifier for this entity (e.g. snp_01BJQMF54D093DXEAWZ6JYRPAQ)', max_length=30, null=True, prefix='whk_', unique=True)),
                ('created_by', models.ForeignKey(default=abid_utils.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL)),
            ],
            options={
                'verbose_name': 'API Outbound Webhook',
                'abstract': False,
            },
        ),
        migrations.AddConstraint(
            model_name='outboundwebhook',
            constraint=models.UniqueConstraint(fields=('ref', 'endpoint'), name='prevent_duplicate_hooks_api_outboundwebhook'),
        ),
    ]
archivebox/api/migrations/__init__.py  (new file, 0 lines)

archivebox/api/models.py  (new file, 115 lines)

@@ -0,0 +1,115 @@
__package__ = 'archivebox.api'

import uuid
import secrets
from datetime import timedelta

from django.conf import settings
from django.db import models
from django.utils import timezone

from signal_webhooks.models import WebhookBase

from django_stubs_ext.db.models import TypedModelMeta

from abid_utils.models import ABIDModel, ABIDField


def generate_secret_token() -> str:
    # returns a cryptographically secure string with len() == 32
    return secrets.token_hex(16)


class APIToken(ABIDModel):
    """
    A secret key generated by a User that's used to authenticate REST API requests to ArchiveBox.
    """
    # ABID: apt_<created_ts>_<token_hash>_<user_id_hash>_<uuid_rand>
    abid_prefix = 'apt_'
    abid_ts_src = 'self.created'
    abid_uri_src = 'self.token'
    abid_subtype_src = 'self.user_id'
    abid_rand_src = 'self.id'

    id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=True)
    uuid = models.UUIDField(blank=True, null=True, editable=True, unique=True)
    abid = ABIDField(prefix=abid_prefix)

    created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE)
    token = models.CharField(max_length=32, default=generate_secret_token, unique=True)

    created = models.DateTimeField(auto_now_add=True)
    expires = models.DateTimeField(null=True, blank=True)


    class Meta(TypedModelMeta):
        verbose_name = "API Key"
        verbose_name_plural = "API Keys"

    def __str__(self) -> str:
        return self.token

    def __repr__(self) -> str:
        return f'<APIToken user={self.created_by.username} token=************{self.token[-4:]}>'

    def __json__(self) -> dict:
        return {
            "TYPE": "APIToken",
            "uuid": str(self.id),
            "abid": str(self.get_abid()),
            "user_id": str(self.created_by.id),
            "user_username": self.created_by.username,
            "token": self.token,
            "created": self.created.isoformat(),
            "expires": self.expires_as_iso8601,
        }

    @property
    def expires_as_iso8601(self):
        """Returns the expiry date of the token in ISO 8601 format or a date 100 years in the future if none."""
        expiry_date = self.expires or (timezone.now() + timedelta(days=365 * 100))

        return expiry_date.isoformat()

    def is_valid(self, for_date=None):
        for_date = for_date or timezone.now()

        if self.expires and self.expires < for_date:
            return False

        return True


# monkey-patch django-signal-webhooks to change how it shows up in the Admin UI

class OutboundWebhook(ABIDModel, WebhookBase):
    """
    Model used in place of (extending) signals_webhooks.models.WebhookModel. Swapped using:
    settings.SIGNAL_WEBHOOKS_CUSTOM_MODEL = 'api.models.OutboundWebhook'
    """
    abid_prefix = 'whk_'
    abid_ts_src = 'self.created'
    abid_uri_src = 'self.endpoint'
    abid_subtype_src = 'self.ref'
    abid_rand_src = 'self.id'

    id = models.UUIDField(blank=True, null=True, unique=True, editable=True)
    uuid = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=True)
    abid = ABIDField(prefix=abid_prefix)

    WebhookBase._meta.get_field('name').help_text = (
        'Give your webhook a descriptive name (e.g. Notify ACME Slack channel of any new ArchiveResults).')
    WebhookBase._meta.get_field('signal').help_text = (
        'The type of event the webhook should fire for (e.g. Create, Update, Delete).')
    WebhookBase._meta.get_field('ref').help_text = (
        'Dot import notation of the model the webhook should fire for (e.g. core.models.Snapshot or core.models.ArchiveResult).')
    WebhookBase._meta.get_field('endpoint').help_text = (
        'External URL to POST the webhook notification to (e.g. https://someapp.example.com/webhook/some-webhook-receiver).')

    class Meta(WebhookBase.Meta):
        verbose_name = 'API Outbound Webhook'
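A rough sketch of the resulting model behavior from a Django shell (illustrative only; it assumes a superuser named 'admin' already exists):

from django.contrib.auth import get_user_model
from api.models import APIToken

user = get_user_model().objects.get(username='admin')

token = APIToken.objects.create(created_by=user)   # token string is filled in by generate_secret_token
print(token.token)          # 32-char hex secret
print(str(token.ABID))      # e.g. 'apt_01HX9FPYTRE4A5CCD901ZYEBQE', derived on save()
print(token.is_valid())     # True until `expires` is set to a past datetime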
archivebox/api/tests.py  (new file, 30 lines)

@@ -0,0 +1,30 @@
__package__ = 'archivebox.api'

from django.test import TestCase
from ninja.testing import TestClient

from .v1_cli import router

class ArchiveBoxCLIAPITestCase(TestCase):
    def setUp(self):
        self.client = TestClient(router)

    def test_add_endpoint(self):
        response = self.client.post("/add", json={"urls": ["http://example.com"], "tag": "testTag1,testTag2"})
        self.assertEqual(response.status_code, 200)
        self.assertTrue(response.json()["success"])

    def test_remove_endpoint(self):
        response = self.client.post("/remove", json={"filter_patterns": ["http://example.com"]})
        self.assertEqual(response.status_code, 200)
        self.assertTrue(response.json()["success"])

    def test_update_endpoint(self):
        response = self.client.post("/update", json={})
        self.assertEqual(response.status_code, 200)
        self.assertTrue(response.json()["success"])

    def test_list_endpoint(self):
        response = self.client.post("/list", json={})
        self.assertEqual(response.status_code, 200)
        self.assertTrue(response.json()["success"])
archivebox/api/urls.py  (new file, 17 lines)

@@ -0,0 +1,17 @@
__package__ = 'archivebox.api'

from django.urls import path
from django.views.generic.base import RedirectView

from .v1_api import urls as v1_api_urls

urlpatterns = [
    path("", RedirectView.as_view(url='/api/v1')),

    path("v1/", v1_api_urls),
    path("v1", RedirectView.as_view(url='/api/v1/docs')),

    # ... v2 can be added here ...
    # path("v2/", v2_api_urls),
    # path("v2", RedirectView.as_view(url='/api/v2/docs')),
]
archivebox/api/v1_api.py  (new file, 111 lines)

@@ -0,0 +1,111 @@
__package__ = 'archivebox.api'


from io import StringIO
from traceback import format_exception
from contextlib import redirect_stdout, redirect_stderr

from django.http import HttpRequest, HttpResponse
from django.core.exceptions import ObjectDoesNotExist, EmptyResultSet, PermissionDenied

from ninja import NinjaAPI, Swagger

# TODO: explore adding https://eadwincode.github.io/django-ninja-extra/

from api.auth import API_AUTH_METHODS
from ..config import VERSION, COMMIT_HASH


COMMIT_HASH = COMMIT_HASH or 'unknown'

html_description = f'''
<h3>Welcome to your ArchiveBox server's REST API <code>[v1 ALPHA]</code> homepage!</h3>
<br/>
<i><b>WARNING: This API is still in an early development stage and may change!</b></i>
<br/>
<ul>
<li>⬅️ Manage your server: <a href="/admin/api/"><b>Setup API Keys</b></a>, <a href="/admin/">Go to your Server Admin UI</a>, <a href="/">Go to your Snapshots list</a></li>
<li>💬 Ask questions and get help here: <a href="https://zulip.archivebox.io">ArchiveBox Chat Forum</a></li>
<li>🐞 Report API bugs here: <a href="https://github.com/ArchiveBox/ArchiveBox/issues">Github Issues</a></li>
<li>📚 ArchiveBox Documentation: <a href="https://github.com/ArchiveBox/ArchiveBox/wiki">Github Wiki</a></li>
<li>📜 See the API source code: <a href="https://github.com/ArchiveBox/ArchiveBox/blob/dev/archivebox/api"><code>archivebox/api/</code></a></li>
</ul>
<small>Served by ArchiveBox v{VERSION} (<a href="https://github.com/ArchiveBox/ArchiveBox/commit/{COMMIT_HASH}"><code>{COMMIT_HASH[:8]}</code></a>), API powered by <a href="https://django-ninja.dev/"><code>django-ninja</code></a>.</small>
'''


def register_urls(api: NinjaAPI) -> NinjaAPI:
    api.add_router('/auth/', 'api.v1_auth.router')
    api.add_router('/core/', 'api.v1_core.router')
    api.add_router('/cli/',  'api.v1_cli.router')
    return api


class NinjaAPIWithIOCapture(NinjaAPI):
    def create_temporal_response(self, request: HttpRequest) -> HttpResponse:
        stdout, stderr = StringIO(), StringIO()

        with redirect_stderr(stderr):
            with redirect_stdout(stdout):
                request.stdout = stdout
                request.stderr = stderr

                response = super().create_temporal_response(request)

        print('RESPONDING NOW', response)

        return response


api = NinjaAPIWithIOCapture(
    title='ArchiveBox API',
    description=html_description,
    version='1.0.0',
    csrf=False,
    auth=API_AUTH_METHODS,
    urls_namespace="api",
    docs=Swagger(settings={"persistAuthorization": True}),
    # docs_decorator=login_required,
    # renderer=ORJSONRenderer(),
)
api = register_urls(api)
urls = api.urls


@api.exception_handler(Exception)
def generic_exception_handler(request, err):
    status = 503
    if isinstance(err, (ObjectDoesNotExist, EmptyResultSet, PermissionDenied)):
        status = 404

    print(''.join(format_exception(err)))

    return api.create_response(
        request,
        {
            "succeeded": False,
            "message": f'{err.__class__.__name__}: {err}',
            "errors": [
                ''.join(format_exception(err)),
                # or send simpler parent-only traceback:
                # *([str(err.__context__)] if getattr(err, '__context__', None) else []),
            ],
        },
        status=status,
    )


# import orjson
# from ninja.renderers import BaseRenderer
# class ORJSONRenderer(BaseRenderer):
#     media_type = "application/json"
#     def render(self, request, data, *, response_status):
#         return {
#             "success": True,
#             "errors": [],
#             "result": data,
#             "stdout": ansi_to_html(stdout.getvalue().strip()),
#             "stderr": ansi_to_html(stderr.getvalue().strip()),
#         }
#         return orjson.dumps(data)
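Given the exception handler above, any uncaught error reaches API clients as a body shaped roughly like the dict it builds (values here are illustrative, e.g. a lookup of a missing object returned with HTTP 404):

# example payload produced by generic_exception_handler:
{
    "succeeded": False,
    "message": "ObjectDoesNotExist: Snapshot matching query does not exist.",
    "errors": ["Traceback (most recent call last): ..."],
}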
archivebox/api/v1_auth.py  (new file, 52 lines)

@@ -0,0 +1,52 @@
__package__ = 'archivebox.api'

from typing import Optional

from ninja import Router, Schema

from api.models import APIToken
from api.auth import auth_using_token, auth_using_password


router = Router(tags=['Authentication'])


class PasswordAuthSchema(Schema):
    """Schema for a /get_api_token request"""
    username: Optional[str] = None
    password: Optional[str] = None


@router.post("/get_api_token", auth=None, summary='Generate an API token for a given username & password (or the currently logged-in user)')   # auth=None because they are not authed yet
def get_api_token(request, auth_data: PasswordAuthSchema):
    user = auth_using_password(
        username=auth_data.username,
        password=auth_data.password,
        request=request,
    )

    if user:
        # TODO: support multiple tokens in the future, for now we just have one per user
        api_token, created = APIToken.objects.get_or_create(created_by=user)

        return api_token.__json__()

    return {"success": False, "errors": ["Invalid credentials"]}


class TokenAuthSchema(Schema):
    """Schema for a /check_api_token request"""
    token: str


@router.post("/check_api_token", auth=None, summary="Validate an API token to make sure it's valid and non-expired")   # auth=None because they are not authed yet
def check_api_token(request, token_data: TokenAuthSchema):
    user = auth_using_token(
        token=token_data.token,
        request=request,
    )
    if user:
        return {"success": True, "user_id": str(user.pk)}

    return {"success": False, "user_id": None}
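The intended token bootstrap flow, sketched with `requests` (the URL and credentials are placeholders):

import requests

SERVER = 'http://localhost:8000'   # hypothetical local ArchiveBox server

# exchange a username & password for an API token...
resp = requests.post(f'{SERVER}/api/v1/auth/get_api_token',
                     json={'username': 'admin', 'password': 'hunter2'})
token = resp.json()['token']

# ...then verify it later without re-sending the password
check = requests.post(f'{SERVER}/api/v1/auth/check_api_token', json={'token': token})
assert check.json()['success'] is True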
234
archivebox/api/v1_cli.py
Normal file
234
archivebox/api/v1_cli.py
Normal file
|
@ -0,0 +1,234 @@
|
||||||
|
__package__ = 'archivebox.api'
|
||||||
|
|
||||||
|
from typing import List, Dict, Any, Optional
|
||||||
|
from enum import Enum
|
||||||
|
|
||||||
|
from ninja import Router, Schema
|
||||||
|
|
||||||
|
from ..main import (
|
||||||
|
add,
|
||||||
|
remove,
|
||||||
|
update,
|
||||||
|
list_all,
|
||||||
|
schedule,
|
||||||
|
)
|
||||||
|
from ..util import ansi_to_html
|
||||||
|
from ..config import ONLY_NEW
|
||||||
|
|
||||||
|
|
||||||
|
# router for API that exposes archivebox cli subcommands as REST endpoints
|
||||||
|
router = Router(tags=['ArchiveBox CLI Sub-Commands'])
|
||||||
|
|
||||||
|
|
||||||
|
# Schemas
|
||||||
|
|
||||||
|
JSONType = List[Any] | Dict[str, Any] | bool | int | str | None
|
||||||
|
|
||||||
|
class CLICommandResponseSchema(Schema):
|
||||||
|
success: bool
|
||||||
|
errors: List[str]
|
||||||
|
result: JSONType
|
||||||
|
stdout: str
|
||||||
|
stderr: str
|
||||||
|
|
||||||
|
class FilterTypeChoices(str, Enum):
|
||||||
|
exact = 'exact'
|
||||||
|
substring = 'substring'
|
||||||
|
regex = 'regex'
|
||||||
|
domain = 'domain'
|
||||||
|
tag = 'tag'
|
||||||
|
timestamp = 'timestamp'
|
||||||
|
|
||||||
|
class StatusChoices(str, Enum):
|
||||||
|
indexed = 'indexed'
|
||||||
|
archived = 'archived'
|
||||||
|
unarchived = 'unarchived'
|
||||||
|
present = 'present'
|
||||||
|
valid = 'valid'
|
||||||
|
invalid = 'invalid'
|
||||||
|
duplicate = 'duplicate'
|
||||||
|
orphaned = 'orphaned'
|
||||||
|
corrupted = 'corrupted'
|
||||||
|
unrecognized = 'unrecognized'
|
||||||
|
|
||||||
|
|
||||||
|
class AddCommandSchema(Schema):
|
||||||
|
urls: List[str]
|
||||||
|
tag: str = ""
|
||||||
|
depth: int = 0
|
||||||
|
update: bool = not ONLY_NEW # Default to the opposite of ONLY_NEW
|
||||||
|
update_all: bool = False
|
||||||
|
index_only: bool = False
|
||||||
|
overwrite: bool = False
|
||||||
|
init: bool = False
|
||||||
|
extractors: str = ""
|
||||||
|
parser: str = "auto"
|
||||||
|
|
||||||
|
class UpdateCommandSchema(Schema):
|
||||||
|
resume: Optional[float] = 0
|
||||||
|
only_new: bool = ONLY_NEW
|
||||||
|
index_only: bool = False
|
||||||
|
overwrite: bool = False
|
||||||
|
after: Optional[float] = 0
|
||||||
|
before: Optional[float] = 999999999999999
|
||||||
|
status: Optional[StatusChoices] = StatusChoices.unarchived
|
||||||
|
filter_type: Optional[str] = FilterTypeChoices.substring
|
||||||
|
filter_patterns: Optional[List[str]] = ['https://example.com']
|
||||||
|
extractors: Optional[str] = ""
|
||||||
|
|
||||||
|
class ScheduleCommandSchema(Schema):
|
||||||
|
import_path: Optional[str] = None
|
||||||
|
add: bool = False
|
||||||
|
every: Optional[str] = None
|
||||||
|
tag: str = ''
|
||||||
|
depth: int = 0
|
||||||
|
overwrite: bool = False
|
||||||
|
update: bool = not ONLY_NEW
|
||||||
|
clear: bool = False
|
||||||
|
|
||||||
|
class ListCommandSchema(Schema):
|
||||||
|
filter_patterns: Optional[List[str]] = ['https://example.com']
|
||||||
|
filter_type: str = FilterTypeChoices.substring
|
||||||
|
status: Optional[StatusChoices] = StatusChoices.indexed
|
||||||
|
after: Optional[float] = 0
|
||||||
|
before: Optional[float] = 999999999999999
|
||||||
|
sort: str = 'added'
|
||||||
|
as_json: bool = True
|
||||||
|
as_html: bool = False
|
||||||
|
as_csv: str | bool = 'timestamp,url'
|
||||||
|
with_headers: bool = False
|
||||||
|
|
||||||
|
class RemoveCommandSchema(Schema):
|
||||||
|
delete: bool = True
|
||||||
|
after: Optional[float] = 0
|
||||||
|
before: Optional[float] = 999999999999999
|
||||||
|
filter_type: str = FilterTypeChoices.exact
|
||||||
|
filter_patterns: Optional[List[str]] = ['https://example.com']
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/add", response=CLICommandResponseSchema, summary='archivebox add [args] [urls]')
|
||||||
|
def cli_add(request, args: AddCommandSchema):
|
||||||
|
result = add(
|
||||||
|
urls=args.urls,
|
||||||
|
tag=args.tag,
|
||||||
|
depth=args.depth,
|
||||||
|
update=args.update,
|
||||||
|
update_all=args.update_all,
|
||||||
|
index_only=args.index_only,
|
||||||
|
overwrite=args.overwrite,
|
||||||
|
init=args.init,
|
||||||
|
extractors=args.extractors,
|
||||||
|
parser=args.parser,
|
||||||
|
)
|
||||||
|
|
||||||
|
return {
|
||||||
|
"success": True,
|
||||||
|
"errors": [],
|
||||||
|
"result": result,
|
||||||
|
"stdout": ansi_to_html(request.stdout.getvalue().strip()),
|
||||||
|
"stderr": ansi_to_html(request.stderr.getvalue().strip()),
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/update", response=CLICommandResponseSchema, summary='archivebox update [args] [filter_patterns]')
|
||||||
|
def cli_update(request, args: UpdateCommandSchema):
|
||||||
|
result = update(
|
||||||
|
resume=args.resume,
|
||||||
|
only_new=args.only_new,
|
||||||
|
index_only=args.index_only,
|
||||||
|
overwrite=args.overwrite,
|
||||||
|
before=args.before,
|
||||||
|
after=args.after,
|
||||||
|
status=args.status,
|
||||||
|
filter_type=args.filter_type,
|
||||||
|
filter_patterns=args.filter_patterns,
|
||||||
|
extractors=args.extractors,
|
||||||
|
)
|
||||||
|
return {
|
||||||
|
"success": True,
|
||||||
|
"errors": [],
|
||||||
|
"result": result,
|
||||||
|
"stdout": ansi_to_html(request.stdout.getvalue().strip()),
|
||||||
|
"stderr": ansi_to_html(request.stderr.getvalue().strip()),
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/schedule", response=CLICommandResponseSchema, summary='archivebox schedule [args] [import_path]')
|
||||||
|
def cli_schedule(request, args: ScheduleCommandSchema):
|
||||||
|
result = schedule(
|
||||||
|
import_path=args.import_path,
|
||||||
|
add=args.add,
|
||||||
|
show=args.show,
|
||||||
|
clear=args.clear,
|
||||||
|
every=args.every,
|
||||||
|
tag=args.tag,
|
||||||
|
depth=args.depth,
|
||||||
|
overwrite=args.overwrite,
|
||||||
|
update=args.update,
|
||||||
|
)
|
||||||
|
|
||||||
|
return {
|
||||||
|
"success": True,
|
||||||
|
"errors": [],
|
||||||
|
"result": result,
|
||||||
|
"stdout": ansi_to_html(request.stdout.getvalue().strip()),
|
||||||
|
"stderr": ansi_to_html(request.stderr.getvalue().strip()),
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/list", response=CLICommandResponseSchema, summary='archivebox list [args] [filter_patterns]')
|
||||||
|
def cli_list(request, args: ListCommandSchema):
|
||||||
|
result = list_all(
|
||||||
|
filter_patterns=args.filter_patterns,
|
||||||
|
filter_type=args.filter_type,
|
||||||
|
status=args.status,
|
||||||
|
after=args.after,
|
||||||
|
before=args.before,
|
||||||
|
sort=args.sort,
|
||||||
|
csv=args.as_csv,
|
||||||
|
json=args.as_json,
|
||||||
|
html=args.as_html,
|
||||||
|
with_headers=args.with_headers,
|
||||||
|
)
|
||||||
|
|
||||||
|
result_format = 'txt'
|
||||||
|
if args.as_json:
|
||||||
|
result_format = "json"
|
||||||
|
elif args.as_html:
|
||||||
|
result_format = "html"
|
||||||
|
elif args.as_csv:
|
||||||
|
result_format = "csv"
|
||||||
|
|
||||||
|
return {
|
||||||
|
"success": True,
|
||||||
|
"errors": [],
|
||||||
|
"result": result,
|
||||||
|
"result_format": result_format,
|
||||||
|
"stdout": ansi_to_html(request.stdout.getvalue().strip()),
|
||||||
|
"stderr": ansi_to_html(request.stderr.getvalue().strip()),
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/remove", response=CLICommandResponseSchema, summary='archivebox remove [args] [filter_patterns]')
|
||||||
|
def cli_remove(request, args: RemoveCommandSchema):
|
||||||
|
result = remove(
|
||||||
|
yes=True, # no way to interactively ask for confirmation via API, so we force yes
|
||||||
|
delete=args.delete,
|
||||||
|
before=args.before,
|
||||||
|
after=args.after,
|
||||||
|
filter_type=args.filter_type,
|
||||||
|
filter_patterns=args.filter_patterns,
|
||||||
|
)
|
||||||
|
return {
|
||||||
|
"success": True,
|
||||||
|
"errors": [],
|
||||||
|
"result": result,
|
||||||
|
"stdout": ansi_to_html(request.stdout.getvalue().strip()),
|
||||||
|
"stderr": ansi_to_html(request.stderr.getvalue().strip()),
|
||||||
|
}
|
||||||
|
|
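For illustration, once this router is served these endpoints accept the same options as their CLI counterparts as a JSON body; a minimal client-side sketch using requests (the base URL, the /cli/ mount point, and the auth header name are assumptions, not confirmed by this diff):

    import requests   # hypothetical client sketch, not part of this diff

    BASE = 'http://127.0.0.1:8000/api/v1'                  # assumed mount point
    HEADERS = {'X-ArchiveBox-API-Key': '<your-token>'}     # assumed auth header name

    # field names mirror UpdateCommandSchema above
    resp = requests.post(f'{BASE}/cli/update', headers=HEADERS, json={
        'only_new': True,
        'filter_type': 'substring',
        'filter_patterns': ['example.com'],
    })
    print(resp.json()['stdout'])   # CLI output comes back ANSI-converted to HTML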
archivebox/api/v1_core.py  (new file, 291 lines)
@@ -0,0 +1,291 @@
__package__ = 'archivebox.api'

from uuid import UUID
from typing import List, Optional
from datetime import datetime

from django.db.models import Q
from django.shortcuts import get_object_or_404

from ninja import Router, Schema, FilterSchema, Field, Query
from ninja.pagination import paginate

from core.models import Snapshot, ArchiveResult, Tag
from abid_utils.abid import ABID

router = Router(tags=['Core Models'])


### ArchiveResult #########################################################################

class ArchiveResultSchema(Schema):
    abid: str
    uuid: UUID
    pk: str
    modified: datetime
    created: datetime
    created_by_id: str

    snapshot_abid: str
    snapshot_url: str
    snapshot_tags: str

    extractor: str
    cmd_version: str
    cmd: List[str]
    pwd: str
    status: str
    output: str

    @staticmethod
    def resolve_created_by_id(obj):
        return str(obj.created_by_id)

    @staticmethod
    def resolve_pk(obj):
        return str(obj.pk)

    @staticmethod
    def resolve_uuid(obj):
        return str(obj.uuid)

    @staticmethod
    def resolve_abid(obj):
        return str(obj.ABID)

    @staticmethod
    def resolve_created(obj):
        return obj.start_ts

    @staticmethod
    def resolve_snapshot_url(obj):
        return obj.snapshot.url

    @staticmethod
    def resolve_snapshot_abid(obj):
        return str(obj.snapshot.ABID)

    @staticmethod
    def resolve_snapshot_tags(obj):
        return obj.snapshot.tags_str()

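A note on the resolve_* staticmethods above: django-ninja Schemas are pydantic models, and a resolve_<field> hook overrides how that field is populated from the ORM object during serialization. A hedged interactive sketch (assumes a populated collection, e.g. inside an archivebox shell):

    # illustrative only
    result = ArchiveResult.objects.first()
    data = ArchiveResultSchema.from_orm(result).dict()
    print(data['abid'], data['snapshot_url'])   # filled in by resolve_abid / resolve_snapshot_url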
class ArchiveResultFilterSchema(FilterSchema):
    uuid: Optional[UUID] = Field(None, q='uuid')
    # abid: Optional[str] = Field(None, q='abid')

    search: Optional[str] = Field(None, q=['snapshot__url__icontains', 'snapshot__title__icontains', 'snapshot__tags__name__icontains', 'extractor', 'output__icontains'])
    snapshot_uuid: Optional[UUID] = Field(None, q='snapshot_uuid__icontains')
    snapshot_url: Optional[str] = Field(None, q='snapshot__url__icontains')
    snapshot_tag: Optional[str] = Field(None, q='snapshot__tags__name__icontains')

    status: Optional[str] = Field(None, q='status')
    output: Optional[str] = Field(None, q='output__icontains')
    extractor: Optional[str] = Field(None, q='extractor__icontains')
    cmd: Optional[str] = Field(None, q='cmd__0__icontains')
    pwd: Optional[str] = Field(None, q='pwd__icontains')
    cmd_version: Optional[str] = Field(None, q='cmd_version')

    created: Optional[datetime] = Field(None, q='updated')
    created__gte: Optional[datetime] = Field(None, q='updated__gte')
    created__lt: Optional[datetime] = Field(None, q='updated__lt')

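Each Field(..., q=...) above maps one query parameter onto an ORM lookup (a list of lookups is OR-ed together, separate parameters are AND-ed), so a request like the sketch below is translated into roughly Q(status='succeeded') & Q(extractor__icontains='wget') by filters.filter(qs) in the endpoint that follows. Base URL and auth header are assumptions:

    import requests   # illustrative client sketch

    resp = requests.get(
        'http://127.0.0.1:8000/api/v1/core/archiveresults',                # assumed mount point
        params={'status': 'succeeded', 'extractor': 'wget', 'limit': 10},  # limit/offset provided by @paginate
        headers={'X-ArchiveBox-API-Key': '<your-token>'},                  # assumed auth header
    )
    for row in resp.json()['items']:   # default ninja pagination wraps results as {"items": [...], "count": N}
        print(row['abid'], row['status'], row['output'])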
@router.get("/archiveresults", response=List[ArchiveResultSchema])
|
||||||
|
@paginate
|
||||||
|
def list_archiveresults(request, filters: ArchiveResultFilterSchema = Query(...)):
|
||||||
|
"""List all ArchiveResult entries matching these filters."""
|
||||||
|
qs = ArchiveResult.objects.all()
|
||||||
|
results = filters.filter(qs)
|
||||||
|
return results
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/archiveresult/{archiveresult_id}", response=ArchiveResultSchema)
|
||||||
|
def get_archiveresult(request, archiveresult_id: str):
|
||||||
|
"""Get a specific ArchiveResult by abid, uuid, or pk."""
|
||||||
|
return ArchiveResult.objects.get(Q(pk__icontains=archiveresult_id) | Q(abid__icontains=archiveresult_id) | Q(uuid__icontains=archiveresult_id))
|
||||||
|
|
||||||
|
|
||||||
|
# @router.post("/archiveresult", response=ArchiveResultSchema)
|
||||||
|
# def create_archiveresult(request, payload: ArchiveResultSchema):
|
||||||
|
# archiveresult = ArchiveResult.objects.create(**payload.dict())
|
||||||
|
# return archiveresult
|
||||||
|
#
|
||||||
|
# @router.put("/archiveresult/{archiveresult_id}", response=ArchiveResultSchema)
|
||||||
|
# def update_archiveresult(request, archiveresult_id: str, payload: ArchiveResultSchema):
|
||||||
|
# archiveresult = get_object_or_404(ArchiveResult, id=archiveresult_id)
|
||||||
|
#
|
||||||
|
# for attr, value in payload.dict().items():
|
||||||
|
# setattr(archiveresult, attr, value)
|
||||||
|
# archiveresult.save()
|
||||||
|
#
|
||||||
|
# return archiveresult
|
||||||
|
#
|
||||||
|
# @router.delete("/archiveresult/{archiveresult_id}")
|
||||||
|
# def delete_archiveresult(request, archiveresult_id: str):
|
||||||
|
# archiveresult = get_object_or_404(ArchiveResult, id=archiveresult_id)
|
||||||
|
# archiveresult.delete()
|
||||||
|
# return {"success": True}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
### Snapshot #########################################################################

class SnapshotSchema(Schema):
    abid: str
    uuid: UUID
    pk: str
    modified: datetime
    created: datetime
    created_by_id: str

    url: str
    tags: str
    title: Optional[str]
    timestamp: str
    archive_path: str

    bookmarked: datetime
    added: datetime
    updated: Optional[datetime]

    num_archiveresults: int
    archiveresults: List[ArchiveResultSchema]

    @staticmethod
    def resolve_created_by_id(obj):
        return str(obj.created_by_id)

    @staticmethod
    def resolve_pk(obj):
        return str(obj.pk)

    @staticmethod
    def resolve_uuid(obj):
        return str(obj.uuid)

    @staticmethod
    def resolve_abid(obj):
        return str(obj.ABID)

    @staticmethod
    def resolve_tags(obj):
        return obj.tags_str()

    @staticmethod
    def resolve_num_archiveresults(obj, context):
        return obj.archiveresult_set.all().distinct().count()

    @staticmethod
    def resolve_archiveresults(obj, context):
        if context['request'].with_archiveresults:
            return obj.archiveresult_set.all().distinct()
        return ArchiveResult.objects.none()

class SnapshotFilterSchema(FilterSchema):
    abid: Optional[str] = Field(None, q='abid__icontains')
    uuid: Optional[str] = Field(None, q='uuid__icontains')
    pk: Optional[str] = Field(None, q='pk__icontains')
    created_by_id: str = Field(None, q='created_by_id__icontains')
    created__gte: datetime = Field(None, q='created__gte')
    created__lt: datetime = Field(None, q='created__lt')
    created: datetime = Field(None, q='created')
    modified: datetime = Field(None, q='modified')
    modified__gte: datetime = Field(None, q='modified__gte')
    modified__lt: datetime = Field(None, q='modified__lt')

    search: Optional[str] = Field(None, q=['url__icontains', 'title__icontains', 'tags__name__icontains', 'abid__icontains', 'uuid__icontains'])
    url: Optional[str] = Field(None, q='url')
    tag: Optional[str] = Field(None, q='tags__name')
    title: Optional[str] = Field(None, q='title__icontains')
    timestamp: Optional[str] = Field(None, q='timestamp__startswith')

    added__gte: Optional[datetime] = Field(None, q='added__gte')
    added__lt: Optional[datetime] = Field(None, q='added__lt')

@router.get("/snapshots", response=List[SnapshotSchema])
|
||||||
|
@paginate
|
||||||
|
def list_snapshots(request, filters: SnapshotFilterSchema = Query(...), with_archiveresults: bool=True):
|
||||||
|
"""List all Snapshot entries matching these filters."""
|
||||||
|
request.with_archiveresults = with_archiveresults
|
||||||
|
|
||||||
|
qs = Snapshot.objects.all()
|
||||||
|
results = filters.filter(qs)
|
||||||
|
return results
|
||||||
|
|
||||||
|
@router.get("/snapshot/{snapshot_id}", response=SnapshotSchema)
|
||||||
|
def get_snapshot(request, snapshot_id: str, with_archiveresults: bool=True):
|
||||||
|
"""Get a specific Snapshot by abid, uuid, or pk."""
|
||||||
|
request.with_archiveresults = with_archiveresults
|
||||||
|
snapshot = None
|
||||||
|
try:
|
||||||
|
snapshot = Snapshot.objects.get(Q(uuid__startswith=snapshot_id) | Q(abid__startswith=snapshot_id)| Q(pk__startswith=snapshot_id))
|
||||||
|
except Snapshot.DoesNotExist:
|
||||||
|
pass
|
||||||
|
|
||||||
|
try:
|
||||||
|
snapshot = snapshot or Snapshot.objects.get()
|
||||||
|
except Snapshot.DoesNotExist:
|
||||||
|
pass
|
||||||
|
|
||||||
|
try:
|
||||||
|
snapshot = snapshot or Snapshot.objects.get(Q(uuid__icontains=snapshot_id) | Q(abid__icontains=snapshot_id))
|
||||||
|
except Snapshot.DoesNotExist:
|
||||||
|
pass
|
||||||
|
|
||||||
|
return snapshot
|
||||||
|
|
||||||
|
|
||||||
|
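The lookup cascade in get_snapshot above (startswith on uuid/abid/pk, then a bare .get() that can only succeed in a single-snapshot collection, then a looser icontains match) means any unique identifier fragment resolves; a hedged sketch (URL and token are placeholders, the ABID fragment follows the example format shown in the migration help_text further down):

    import requests   # illustrative client sketch

    resp = requests.get(
        'http://127.0.0.1:8000/api/v1/core/snapshot/snp_01BJQMF54D',  # assumed mount point; any unique abid/uuid/pk fragment
        params={'with_archiveresults': False},             # skip nested results for a lighter payload
        headers={'X-ArchiveBox-API-Key': '<your-token>'},  # assumed auth header
    )
    snapshot = resp.json()
    print(snapshot['url'], snapshot['num_archiveresults'])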
# @router.post("/snapshot", response=SnapshotSchema)
|
||||||
|
# def create_snapshot(request, payload: SnapshotSchema):
|
||||||
|
# snapshot = Snapshot.objects.create(**payload.dict())
|
||||||
|
# return snapshot
|
||||||
|
#
|
||||||
|
# @router.put("/snapshot/{snapshot_uuid}", response=SnapshotSchema)
|
||||||
|
# def update_snapshot(request, snapshot_uuid: str, payload: SnapshotSchema):
|
||||||
|
# snapshot = get_object_or_404(Snapshot, uuid=snapshot_uuid)
|
||||||
|
#
|
||||||
|
# for attr, value in payload.dict().items():
|
||||||
|
# setattr(snapshot, attr, value)
|
||||||
|
# snapshot.save()
|
||||||
|
#
|
||||||
|
# return snapshot
|
||||||
|
#
|
||||||
|
# @router.delete("/snapshot/{snapshot_uuid}")
|
||||||
|
# def delete_snapshot(request, snapshot_uuid: str):
|
||||||
|
# snapshot = get_object_or_404(Snapshot, uuid=snapshot_uuid)
|
||||||
|
# snapshot.delete()
|
||||||
|
# return {"success": True}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
### Tag #########################################################################
|
||||||
|
|
||||||
|
|
||||||
|
class TagSchema(Schema):
|
||||||
|
abid: Optional[UUID] = Field(None, q='abid')
|
||||||
|
uuid: Optional[UUID] = Field(None, q='uuid')
|
||||||
|
pk: Optional[UUID] = Field(None, q='pk')
|
||||||
|
modified: datetime
|
||||||
|
created: datetime
|
||||||
|
created_by_id: str
|
||||||
|
|
||||||
|
name: str
|
||||||
|
slug: str
|
||||||
|
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def resolve_created_by_id(obj):
|
||||||
|
return str(obj.created_by_id)
|
||||||
|
|
||||||
|
@router.get("/tags", response=List[TagSchema])
|
||||||
|
def list_tags(request):
|
||||||
|
return Tag.objects.all()
|
|
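The router defined above is presumably mounted on a NinjaAPI instance elsewhere (not shown in this diff); a minimal sketch of the usual django-ninja wiring, with all names and paths assumed:

    # hypothetical api setup / urls.py, not part of this diff
    from django.urls import path
    from ninja import NinjaAPI
    from api.v1_core import router as core_router

    api = NinjaAPI(title='ArchiveBox API', version='v1')   # names/values assumed
    api.add_router('/core/', core_router)

    urlpatterns = [
        path('api/v1/', api.urls),
    ]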
@@ -4,14 +4,18 @@ __command__ = 'archivebox'
 import os
 import sys
 import argparse
+import threading
+from time import sleep
 
-from typing import Optional, Dict, List, IO, Union
+from typing import Optional, Dict, List, IO, Union, Iterable
 from pathlib import Path
 
-from ..config import OUTPUT_DIR, check_data_folder, check_migrations
+from ..config import OUTPUT_DIR, check_data_folder, check_migrations, stderr
 
 from importlib import import_module
 
+BUILTIN_LIST = list
+
 CLI_DIR = Path(__file__).resolve().parent
 
 # these common commands will appear sorted before any others for ease-of-use
@@ -33,6 +37,40 @@ is_valid_cli_module = lambda module, subcommand: (
 )
 
+
+IGNORED_BG_THREADS = ('MainThread', 'ThreadPoolExecutor', 'IPythonHistorySavingThread')    # threads we don't have to wait for before exiting
+
+
+def wait_for_bg_threads_to_exit(thread_names: Iterable[str]=(), ignore_names: Iterable[str]=IGNORED_BG_THREADS, timeout: int=60) -> int:
+    """
+    Block until the specified threads exit. e.g. pass thread_names=('default_hook_handler',) to wait for webhooks.
+    Useful for waiting for signal handlers, webhooks, etc. to finish running after a mgmt command completes.
+    """
+    wait_for_all: bool = thread_names == ()
+
+    thread_matches = lambda thread, ptns: any(ptn in repr(thread) for ptn in ptns)
+
+    should_wait = lambda thread: (
+        not thread_matches(thread, ignore_names)
+        and (wait_for_all or thread_matches(thread, thread_names)))
+
+    for tries in range(timeout):
+        all_threads = [*threading.enumerate()]
+        blocking_threads = [*filter(should_wait, all_threads)]
+        threads_summary = ', '.join(repr(t) for t in blocking_threads)
+        if blocking_threads:
+            sleep(1)
+            if tries == 5:   # only show stderr message if we need to wait more than 5s
+                stderr(
+                    f'[…] Waiting up to {timeout}s for background jobs (e.g. webhooks) to finish...',
+                    threads_summary,
+                )
+        else:
+            return tries
+
+    raise Exception(f'Background threads failed to exit after {tries}s: {threads_summary}')
+
+
 def list_subcommands() -> Dict[str, str]:
     """find and import all valid archivebox_<subcommand>.py files in CLI_DIR"""
 
@@ -79,6 +117,9 @@ def run_subcommand(subcommand: str,
     module = import_module('.archivebox_{}'.format(subcommand), __package__)
     module.main(args=subcommand_args, stdin=stdin, pwd=pwd)    # type: ignore
 
+    # wait for webhooks, signals, and other background jobs to finish before exit
+    wait_for_bg_threads_to_exit(timeout=60)
+
 
 SUBCOMMANDS = list_subcommands()
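Because wait_for_bg_threads_to_exit() matches its patterns against repr(thread), a caller can block on one specific named worker; a toy sketch assuming the function above is importable (the thread name mirrors the docstring's webhook example):

    import threading
    from time import sleep

    def slow_webhook():          # stand-in for a webhook POST still in flight
        sleep(3)

    threading.Thread(target=slow_webhook, name='default_hook_handler').start()

    # returns how many 1-second polls it took for all matching threads to finish
    waited = wait_for_bg_threads_to_exit(thread_names=('default_hook_handler',), timeout=10)
    print(f'waited ~{waited}s for background jobs')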
@@ -37,7 +37,7 @@ from sqlite3 import dbapi2 as sqlite3
 from hashlib import md5
 from pathlib import Path
 from datetime import datetime, timezone
-from typing import Optional, Type, Tuple, Dict, Union, List
+from typing import Optional, Type, Tuple, Dict, Union, List, Any
 from subprocess import run, PIPE, DEVNULL
 from configparser import ConfigParser
 from collections import defaultdict
@@ -72,7 +72,7 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
     'TIMEOUT': {'type': int, 'default': 60},
     'MEDIA_TIMEOUT': {'type': int, 'default': 3600},
     'OUTPUT_PERMISSIONS': {'type': str, 'default': '644'},
-    'RESTRICT_FILE_NAMES': {'type': str, 'default': 'windows'},
+    'RESTRICT_FILE_NAMES': {'type': str, 'default': 'windows'},    # TODO: move this to be a default WGET_ARGS
 
     'URL_DENYLIST': {'type': str, 'default': r'\.(css|js|otf|ttf|woff|woff2|gstatic\.com|googleapis\.com/css)(\?.*)?$', 'aliases': ('URL_BLACKLIST',)},    # to avoid downloading code assets as their own pages
     'URL_ALLOWLIST': {'type': str, 'default': None, 'aliases': ('URL_WHITELIST',)},
@@ -112,7 +112,7 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
     'LDAP_FIRSTNAME_ATTR': {'type': str, 'default': None},
     'LDAP_LASTNAME_ATTR': {'type': str, 'default': None},
     'LDAP_EMAIL_ATTR': {'type': str, 'default': None},
     'LDAP_CREATE_SUPERUSER': {'type': bool, 'default': False},
 },
 
 'ARCHIVE_METHOD_TOGGLES': {
@@ -265,7 +265,7 @@ CONFIG_ALIASES = {
     for key, default in section.items()
     for alias in default.get('aliases', ())
 }
-USER_CONFIG = {key for section in CONFIG_SCHEMA.values() for key in section.keys()}
+USER_CONFIG = {key: section[key] for section in CONFIG_SCHEMA.values() for key in section.keys()}
 
 def get_real_name(key: str) -> str:
     """get the current canonical name for a given deprecated config key"""
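The USER_CONFIG change above upgrades a set of key names into a dict mapping each key to its schema entry, so callers can read metadata instead of only testing membership; the practical difference (values follow the CONFIG_SCHEMA entries shown earlier):

    'TIMEOUT' in USER_CONFIG           # membership test still works (dict keys)

    USER_CONFIG['TIMEOUT']['type']     # -> int   (new: schema metadata is reachable)
    USER_CONFIG['TIMEOUT']['default']  # -> 60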
@@ -281,7 +281,9 @@ TEMPLATES_DIR_NAME = 'templates'
 ARCHIVE_DIR_NAME = 'archive'
 SOURCES_DIR_NAME = 'sources'
 LOGS_DIR_NAME = 'logs'
+CACHE_DIR_NAME = 'cache'
 PERSONAS_DIR_NAME = 'personas'
+CRONTABS_DIR_NAME = 'crontabs'
 SQL_INDEX_FILENAME = 'index.sqlite3'
 JSON_INDEX_FILENAME = 'index.json'
 HTML_INDEX_FILENAME = 'index.html'
@@ -355,10 +357,11 @@ ALLOWED_IN_OUTPUT_DIR = {
     'static',
     'sonic',
     'search.sqlite3',
-    'crontabs',
+    CRONTABS_DIR_NAME,
     ARCHIVE_DIR_NAME,
     SOURCES_DIR_NAME,
     LOGS_DIR_NAME,
+    CACHE_DIR_NAME,
     PERSONAS_DIR_NAME,
     SQL_INDEX_FILENAME,
     f'{SQL_INDEX_FILENAME}-wal',
@@ -510,6 +513,7 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
     'ARCHIVE_DIR': {'default': lambda c: c['OUTPUT_DIR'] / ARCHIVE_DIR_NAME},
     'SOURCES_DIR': {'default': lambda c: c['OUTPUT_DIR'] / SOURCES_DIR_NAME},
     'LOGS_DIR': {'default': lambda c: c['OUTPUT_DIR'] / LOGS_DIR_NAME},
+    'CACHE_DIR': {'default': lambda c: c['OUTPUT_DIR'] / CACHE_DIR_NAME},
     'PERSONAS_DIR': {'default': lambda c: c['OUTPUT_DIR'] / PERSONAS_DIR_NAME},
     'CONFIG_FILE': {'default': lambda c: Path(c['CONFIG_FILE']).resolve() if c['CONFIG_FILE'] else c['OUTPUT_DIR'] / CONFIG_FILENAME},
     'COOKIES_FILE': {'default': lambda c: c['COOKIES_FILE'] and Path(c['COOKIES_FILE']).resolve()},
|
||||||
|
|
||||||
'DEPENDENCIES': {'default': lambda c: get_dependency_info(c)},
|
'DEPENDENCIES': {'default': lambda c: get_dependency_info(c)},
|
||||||
'CODE_LOCATIONS': {'default': lambda c: get_code_locations(c)},
|
'CODE_LOCATIONS': {'default': lambda c: get_code_locations(c)},
|
||||||
'EXTERNAL_LOCATIONS': {'default': lambda c: get_external_locations(c)},
|
|
||||||
'DATA_LOCATIONS': {'default': lambda c: get_data_locations(c)},
|
'DATA_LOCATIONS': {'default': lambda c: get_data_locations(c)},
|
||||||
'CHROME_OPTIONS': {'default': lambda c: get_chrome_info(c)},
|
'CHROME_OPTIONS': {'default': lambda c: get_chrome_info(c)},
|
||||||
'CHROME_EXTRA_ARGS': {'default': lambda c: c['CHROME_EXTRA_ARGS'] or []},
|
'CHROME_EXTRA_ARGS': {'default': lambda c: c['CHROME_EXTRA_ARGS'] or []},
|
||||||
|
@ -985,11 +988,6 @@ def get_code_locations(config: ConfigDict) -> SimpleConfigValueDict:
|
||||||
'enabled': True,
|
'enabled': True,
|
||||||
'is_valid': (config['TEMPLATES_DIR'] / 'static').exists(),
|
'is_valid': (config['TEMPLATES_DIR'] / 'static').exists(),
|
||||||
},
|
},
|
||||||
'CUSTOM_TEMPLATES_DIR': {
|
|
||||||
'path': config['CUSTOM_TEMPLATES_DIR'] and Path(config['CUSTOM_TEMPLATES_DIR']).resolve(),
|
|
||||||
'enabled': bool(config['CUSTOM_TEMPLATES_DIR']),
|
|
||||||
'is_valid': config['CUSTOM_TEMPLATES_DIR'] and Path(config['CUSTOM_TEMPLATES_DIR']).exists(),
|
|
||||||
},
|
|
||||||
# 'NODE_MODULES_DIR': {
|
# 'NODE_MODULES_DIR': {
|
||||||
# 'path': ,
|
# 'path': ,
|
||||||
# 'enabled': ,
|
# 'enabled': ,
|
||||||
|
@@ -997,50 +995,25 @@ def get_code_locations(config: ConfigDict) -> SimpleConfigValueDict:
     # },
 }
 
-def get_external_locations(config: ConfigDict) -> ConfigValue:
-    abspath = lambda path: None if path is None else Path(path).resolve()
-    return {
-        'CHROME_USER_DATA_DIR': {
-            'path': abspath(config['CHROME_USER_DATA_DIR']),
-            'enabled': config['USE_CHROME'] and config['CHROME_USER_DATA_DIR'],
-            'is_valid': False if config['CHROME_USER_DATA_DIR'] is None else (Path(config['CHROME_USER_DATA_DIR']) / 'Default').exists(),
-        },
-        'COOKIES_FILE': {
-            'path': abspath(config['COOKIES_FILE']),
-            'enabled': config['USE_WGET'] and config['COOKIES_FILE'],
-            'is_valid': False if config['COOKIES_FILE'] is None else Path(config['COOKIES_FILE']).exists(),
-        },
-    }
 
 def get_data_locations(config: ConfigDict) -> ConfigValue:
     return {
+        # OLD: migrating to personas
+        # 'CHROME_USER_DATA_DIR': {
+        #     'path': os.path.abspath(config['CHROME_USER_DATA_DIR']),
+        #     'enabled': config['USE_CHROME'] and config['CHROME_USER_DATA_DIR'],
+        #     'is_valid': False if config['CHROME_USER_DATA_DIR'] is None else (Path(config['CHROME_USER_DATA_DIR']) / 'Default').exists(),
+        # },
+        # 'COOKIES_FILE': {
+        #     'path': os.path.abspath(config['COOKIES_FILE']),
+        #     'enabled': config['USE_WGET'] and config['COOKIES_FILE'],
+        #     'is_valid': False if config['COOKIES_FILE'] is None else Path(config['COOKIES_FILE']).exists(),
+        # },
         'OUTPUT_DIR': {
             'path': config['OUTPUT_DIR'].resolve(),
             'enabled': True,
             'is_valid': (config['OUTPUT_DIR'] / SQL_INDEX_FILENAME).exists(),
             'is_mount': os.path.ismount(config['OUTPUT_DIR'].resolve()),
         },
-        'SOURCES_DIR': {
-            'path': config['SOURCES_DIR'].resolve(),
-            'enabled': True,
-            'is_valid': config['SOURCES_DIR'].exists(),
-        },
-        'LOGS_DIR': {
-            'path': config['LOGS_DIR'].resolve(),
-            'enabled': True,
-            'is_valid': config['LOGS_DIR'].exists(),
-        },
-        'PERSONAS_DIR': {
-            'path': config['PERSONAS_DIR'].resolve(),
-            'enabled': True,
-            'is_valid': config['PERSONAS_DIR'].exists(),
-        },
-        'ARCHIVE_DIR': {
-            'path': config['ARCHIVE_DIR'].resolve(),
-            'enabled': True,
-            'is_valid': config['ARCHIVE_DIR'].exists(),
-            'is_mount': os.path.ismount(config['ARCHIVE_DIR'].resolve()),
-        },
         'CONFIG_FILE': {
             'path': config['CONFIG_FILE'].resolve(),
             'enabled': True,
@@ -1052,6 +1025,43 @@ def get_data_locations(config: ConfigDict) -> ConfigValue:
             'is_valid': (config['OUTPUT_DIR'] / SQL_INDEX_FILENAME).exists(),
             'is_mount': os.path.ismount((config['OUTPUT_DIR'] / SQL_INDEX_FILENAME).resolve()),
         },
+        'ARCHIVE_DIR': {
+            'path': config['ARCHIVE_DIR'].resolve(),
+            'enabled': True,
+            'is_valid': config['ARCHIVE_DIR'].exists(),
+            'is_mount': os.path.ismount(config['ARCHIVE_DIR'].resolve()),
+        },
+        'SOURCES_DIR': {
+            'path': config['SOURCES_DIR'].resolve(),
+            'enabled': True,
+            'is_valid': config['SOURCES_DIR'].exists(),
+        },
+        'LOGS_DIR': {
+            'path': config['LOGS_DIR'].resolve(),
+            'enabled': True,
+            'is_valid': config['LOGS_DIR'].exists(),
+        },
+        'CACHE_DIR': {
+            'path': config['CACHE_DIR'].resolve(),
+            'enabled': True,
+            'is_valid': config['CACHE_DIR'].exists(),
+        },
+        'CUSTOM_TEMPLATES_DIR': {
+            'path': config['CUSTOM_TEMPLATES_DIR'] and Path(config['CUSTOM_TEMPLATES_DIR']).resolve(),
+            'enabled': bool(config['CUSTOM_TEMPLATES_DIR']),
+            'is_valid': config['CUSTOM_TEMPLATES_DIR'] and Path(config['CUSTOM_TEMPLATES_DIR']).exists(),
+        },
+        'PERSONAS_DIR': {
+            'path': config['PERSONAS_DIR'].resolve(),
+            'enabled': True,
+            'is_valid': config['PERSONAS_DIR'].exists(),
+        },
+        # managed by bin/docker_entrypoint.sh and python-crontab:
+        # 'CRONTABS_DIR': {
+        #     'path': config['CRONTABS_DIR'].resolve(),
+        #     'enabled': True,
+        #     'is_valid': config['CRONTABS_DIR'].exists(),
+        # },
     }
 
 def get_dependency_info(config: ConfigDict) -> ConfigValue:
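Every entry returned by get_data_locations() now shares the same path/enabled/is_valid shape (plus is_mount where a bind-mount is possible), which makes a status report trivial; a hedged sketch:

    # illustrative status printout over the dict returned above
    for name, info in get_data_locations(CONFIG).items():
        marker = '✅' if info['is_valid'] else '❌'
        mount = ' (mount)' if info.get('is_mount') else ''
        print(f'{marker} {name}: {info["path"]}{mount}')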
@@ -1286,7 +1296,7 @@ def check_system_config(config: ConfigDict=CONFIG) -> None:
 
     # stderr('[i] Using Chrome binary: {}'.format(shutil.which(CHROME_BINARY) or CHROME_BINARY))
     # stderr('[i] Using Chrome data dir: {}'.format(os.path.abspath(CHROME_USER_DATA_DIR)))
-    if config['CHROME_USER_DATA_DIR'] is not None:
+    if config['CHROME_USER_DATA_DIR'] is not None and Path(config['CHROME_USER_DATA_DIR']).exists():
         if not (Path(config['CHROME_USER_DATA_DIR']) / 'Default').exists():
             stderr('[X] Could not find profile "Default" in CHROME_USER_DATA_DIR.', color='red')
             stderr(f'    {config["CHROME_USER_DATA_DIR"]}')
@@ -1296,8 +1306,13 @@ def check_system_config(config: ConfigDict=CONFIG) -> None:
             if '/Default' in str(config['CHROME_USER_DATA_DIR']):
                 stderr()
                 stderr('    Try removing /Default from the end e.g.:')
-                stderr('    CHROME_USER_DATA_DIR="{}"'.format(config['CHROME_USER_DATA_DIR'].split('/Default')[0]))
-            raise SystemExit(2)
+                stderr('    CHROME_USER_DATA_DIR="{}"'.format(str(config['CHROME_USER_DATA_DIR']).split('/Default')[0]))
+
+            # hard error is too annoying here, instead just set it to nothing
+            # raise SystemExit(2)
+            config['CHROME_USER_DATA_DIR'] = None
+    else:
+        config['CHROME_USER_DATA_DIR'] = None
 
 
 def check_dependencies(config: ConfigDict=CONFIG, show_help: bool=True) -> None:
@@ -1366,6 +1381,7 @@ def check_data_folder(out_dir: Union[str, Path, None]=None, config: ConfigDict=CONFIG)
         stderr('        archivebox init')
         raise SystemExit(2)
 
+
 def check_migrations(out_dir: Union[str, Path, None]=None, config: ConfigDict=CONFIG):
     output_dir = out_dir or config['OUTPUT_DIR']
     from .index.sql import list_migrations
@@ -1382,6 +1398,7 @@ def check_migrations(out_dir: Union[str, Path, None]=None, config: ConfigDict=CONFIG):
 
     (Path(output_dir) / SOURCES_DIR_NAME).mkdir(exist_ok=True)
     (Path(output_dir) / LOGS_DIR_NAME).mkdir(exist_ok=True)
+    (Path(output_dir) / CACHE_DIR_NAME).mkdir(exist_ok=True)
     (Path(output_dir) / PERSONAS_DIR_NAME).mkdir(exist_ok=True)
     (Path(output_dir) / PERSONAS_DIR_NAME / 'Default').mkdir(exist_ok=True)
@@ -14,12 +14,17 @@ from django.shortcuts import render, redirect
 from django.contrib.auth import get_user_model
 from django import forms
 
+
+from signal_webhooks.admin import WebhookAdmin, get_webhook_model
+# from plugantic.admin import CustomPlugin
+
 from ..util import htmldecode, urldecode, ansi_to_html
+
 from core.models import Snapshot, ArchiveResult, Tag
 from core.forms import AddLinkForm
 
 from core.mixins import SearchResultsAdminMixin
+from api.models import APIToken
+
 from index.html import snapshot_icons
 from logging_util import printable_filesize
@@ -33,6 +38,7 @@ from config import (
     CAN_UPGRADE
 )
 
+
 GLOBAL_CONTEXT = {'VERSION': VERSION, 'VERSIONS_AVAILABLE': VERSIONS_AVAILABLE, 'CAN_UPGRADE': CAN_UPGRADE}
 
 # Admin URLs
@@ -98,10 +104,25 @@ class ArchiveBoxAdmin(admin.AdminSite):
 
         return render(template_name='add.html', request=request, context=context)
 
+
 archivebox_admin = ArchiveBoxAdmin()
 archivebox_admin.register(get_user_model())
+archivebox_admin.register(APIToken)
+archivebox_admin.register(get_webhook_model(), WebhookAdmin)
 archivebox_admin.disable_action('delete_selected')
+
+# archivebox_admin.register(CustomPlugin)
+
+# patch admin with methods to add data views (implemented by admin_data_views package)
+############### Additional sections are defined in settings.ADMIN_DATA_VIEWS #########
+from admin_data_views.admin import get_app_list, admin_data_index_view, get_admin_data_urls, get_urls
+
+archivebox_admin.get_app_list = get_app_list.__get__(archivebox_admin, ArchiveBoxAdmin)
+archivebox_admin.admin_data_index_view = admin_data_index_view.__get__(archivebox_admin, ArchiveBoxAdmin)
+archivebox_admin.get_admin_data_urls = get_admin_data_urls.__get__(archivebox_admin, ArchiveBoxAdmin)
+archivebox_admin.get_urls = get_urls(archivebox_admin.get_urls).__get__(archivebox_admin, ArchiveBoxAdmin)
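The .__get__(archivebox_admin, ArchiveBoxAdmin) calls above use the descriptor protocol to turn loose functions from admin_data_views into bound methods of the existing admin-site instance; the same trick in isolation (toy names only):

    # toy demonstration of function.__get__ as used for the monkeypatching above
    class Site:
        name = 'archivebox'

    def get_name(self):          # plain function; `self` is supplied by the binding
        return self.name

    site = Site()
    site.get_name = get_name.__get__(site, Site)   # now a bound method on this one instance
    assert site.get_name() == 'archivebox'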
 
 
 class ArchiveResultInline(admin.TabularInline):
     model = ArchiveResult
 
@@ -143,14 +164,41 @@ class SnapshotActionForm(ActionForm):
 #     )
 
 
+def get_abid_info(self, obj):
+    return format_html(
+        # URL Hash: <code style="font-size: 10px; user-select: all">{}</code><br/>
+        '''
+        ABID: <code style="font-size: 16px; user-select: all"><b>{}</b></code><br/>
+        TS: <code style="font-size: 10px; user-select: all"><b>{}</b></code> ({})<br/>
+        URI: <code style="font-size: 10px; user-select: all"><b>{}</b></code> ({})<br/>
+        SUBTYPE: <code style="font-size: 10px; user-select: all"><b>{}</b></code> ({})<br/>
+        RAND: <code style="font-size: 10px; user-select: all"><b>{}</b></code> ({})<br/><br/>
+        ABID AS UUID: <code style="font-size: 10px; user-select: all">{}</code> <br/><br/>
+
+        .uuid: <code style="font-size: 10px; user-select: all">{}</code> <br/>
+        .id: <code style="font-size: 10px; user-select: all">{}</code> <br/>
+        .pk: <code style="font-size: 10px; user-select: all">{}</code> <br/><br/>
+        ''',
+        obj.abid,
+        obj.ABID.ts, obj.abid_values['ts'].isoformat() if isinstance(obj.abid_values['ts'], datetime) else obj.abid_values['ts'],
+        obj.ABID.uri, str(obj.abid_values['uri']),
+        obj.ABID.subtype, str(obj.abid_values['subtype']),
+        obj.ABID.rand, str(obj.abid_values['rand'])[-7:],
+        obj.ABID.uuid,
+        obj.uuid,
+        obj.id,
+        obj.pk,
+    )
+
+
 @admin.register(Snapshot, site=archivebox_admin)
 class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin):
     list_display = ('added', 'title_str', 'files', 'size', 'url_str')
     sort_fields = ('title_str', 'url_str', 'added', 'files')
-    readonly_fields = ('info', 'bookmarked', 'added', 'updated')
-    search_fields = ('id', 'url', 'timestamp', 'title', 'tags__name')
-    fields = ('timestamp', 'url', 'title', 'tags', *readonly_fields)
-    list_filter = ('added', 'updated', 'tags', 'archiveresult__status')
+    readonly_fields = ('admin_actions', 'status_info', 'bookmarked', 'added', 'updated', 'created', 'modified', 'identifiers')
+    search_fields = ('id', 'url', 'abid', 'uuid', 'timestamp', 'title', 'tags__name')
+    fields = ('url', 'timestamp', 'created_by', 'tags', 'title', *readonly_fields)
+    list_filter = ('added', 'updated', 'tags', 'archiveresult__status', 'created_by')
     ordering = ['-added']
     actions = ['add_tags', 'remove_tags', 'update_titles', 'update_snapshots', 'resnapshot_snapshot', 'overwrite_snapshots', 'delete_snapshots']
     autocomplete_fields = ['tags']
@@ -196,40 +244,46 @@ class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin):
     #     </form>
     #     ''',
     #     csrf.get_token(self.request),
-    #     obj.id,
+    #     obj.pk,
     # )
 
-    def info(self, obj):
+    def admin_actions(self, obj):
         return format_html(
+            # URL Hash: <code style="font-size: 10px; user-select: all">{}</code><br/>
+            '''
+            <a class="btn" style="font-size: 18px; display: inline-block; border-radius: 10px; border: 3px solid #eee; padding: 4px 8px" href="/archive/{}">Summary page ➡️</a>
+            <a class="btn" style="font-size: 18px; display: inline-block; border-radius: 10px; border: 3px solid #eee; padding: 4px 8px" href="/archive/{}/index.html#all">Result files 📑</a>
+            <a class="btn" style="font-size: 18px; display: inline-block; border-radius: 10px; border: 3px solid #eee; padding: 4px 8px" href="/admin/core/snapshot/?id__exact={}">Admin actions ⚙️</a>
+            ''',
+            obj.timestamp,
+            obj.timestamp,
+            obj.pk,
+        )
+
+    def status_info(self, obj):
+        return format_html(
+            # URL Hash: <code style="font-size: 10px; user-select: all">{}</code><br/>
             '''
-            UUID: <code style="font-size: 10px; user-select: all">{}</code>
-            Timestamp: <code style="font-size: 10px; user-select: all">{}</code>
-            URL Hash: <code style="font-size: 10px; user-select: all">{}</code><br/>
             Archived: {} ({} files {})
             Favicon: <img src="{}" style="height: 20px"/>
-            Status code: {}
+            Status code: {} <br/>
             Server: {}
            Content type: {}
             Extension: {}
-            <br/><br/>
-            <a href="/archive/{}">View Snapshot index ➡️</a>
-            <a href="/admin/core/snapshot/?id__exact={}">View actions ⚙️</a>
             ''',
-            obj.id,
-            obj.timestamp,
-            obj.url_hash,
             '✅' if obj.is_archived else '❌',
             obj.num_outputs,
-            self.size(obj),
+            self.size(obj) or '0kb',
             f'/archive/{obj.timestamp}/favicon.ico',
-            obj.status_code or '?',
-            obj.headers and obj.headers.get('Server') or '?',
-            obj.headers and obj.headers.get('Content-Type') or '?',
-            obj.extension or '?',
-            obj.timestamp,
-            obj.id,
+            obj.status_code or '-',
+            obj.headers and obj.headers.get('Server') or '-',
+            obj.headers and obj.headers.get('Content-Type') or '-',
+            obj.extension or '-',
         )
 
+    def identifiers(self, obj):
+        return get_abid_info(self, obj)
+
     @admin.display(
         description='Title',
         ordering='title',
@@ -289,7 +343,7 @@ class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin):
         return format_html(
             '<a href="{}"><code style="user-select: all;">{}</code></a>',
             obj.url,
-            obj.url,
+            obj.url[:128],
         )
 
     def grid_view(self, request, extra_context=None):
@@ -392,42 +446,45 @@ class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin):
 
 @admin.register(Tag, site=archivebox_admin)
 class TagAdmin(admin.ModelAdmin):
-    list_display = ('slug', 'name', 'num_snapshots', 'snapshots', 'id')
-    sort_fields = ('id', 'name', 'slug')
-    readonly_fields = ('id', 'num_snapshots', 'snapshots')
-    search_fields = ('id', 'name', 'slug')
-    fields = (*readonly_fields, 'name', 'slug')
+    list_display = ('slug', 'name', 'num_snapshots', 'snapshots', 'abid')
+    sort_fields = ('id', 'name', 'slug', 'abid')
+    readonly_fields = ('created', 'modified', 'identifiers', 'num_snapshots', 'snapshots')
+    search_fields = ('id', 'abid', 'uuid', 'name', 'slug')
+    fields = ('name', 'slug', 'created_by', *readonly_fields, )
     actions = ['delete_selected']
     ordering = ['-id']
 
-    def num_snapshots(self, obj):
+    def identifiers(self, obj):
+        return get_abid_info(self, obj)
+
+    def num_snapshots(self, tag):
         return format_html(
             '<a href="/admin/core/snapshot/?tags__id__exact={}">{} total</a>',
-            obj.id,
-            obj.snapshot_set.count(),
+            tag.id,
+            tag.snapshot_set.count(),
         )
 
-    def snapshots(self, obj):
-        total_count = obj.snapshot_set.count()
+    def snapshots(self, tag):
+        total_count = tag.snapshot_set.count()
         return mark_safe('<br/>'.join(
             format_html(
                 '{} <code><a href="/admin/core/snapshot/{}/change"><b>[{}]</b></a> {}</code>',
                 snap.updated.strftime('%Y-%m-%d %H:%M') if snap.updated else 'pending...',
-                snap.id,
-                snap.timestamp,
+                snap.pk,
+                snap.abid,
                 snap.url,
             )
-            for snap in obj.snapshot_set.order_by('-updated')[:10]
-        ) + (f'<br/><a href="/admin/core/snapshot/?tags__id__exact={obj.id}">and {total_count-10} more...<a>' if obj.snapshot_set.count() > 10 else ''))
+            for snap in tag.snapshot_set.order_by('-updated')[:10]
+        ) + (f'<br/><a href="/admin/core/snapshot/?tags__id__exact={tag.id}">and {total_count-10} more...<a>' if tag.snapshot_set.count() > 10 else ''))
 
 
 @admin.register(ArchiveResult, site=archivebox_admin)
 class ArchiveResultAdmin(admin.ModelAdmin):
-    list_display = ('id', 'start_ts', 'extractor', 'snapshot_str', 'tags_str', 'cmd_str', 'status', 'output_str')
+    list_display = ('start_ts', 'snapshot_info', 'tags_str', 'extractor', 'cmd_str', 'status', 'output_str')
     sort_fields = ('start_ts', 'extractor', 'status')
-    readonly_fields = ('id', 'uuid', 'snapshot_str', 'tags_str')
-    search_fields = ('id', 'uuid', 'snapshot__url', 'extractor', 'output', 'cmd_version', 'cmd', 'snapshot__timestamp')
-    fields = (*readonly_fields, 'snapshot', 'extractor', 'status', 'start_ts', 'end_ts', 'output', 'pwd', 'cmd', 'cmd_version')
+    readonly_fields = ('snapshot_info', 'tags_str', 'created_by', 'created', 'modified', 'identifiers')
+    search_fields = ('id', 'uuid', 'abid', 'snapshot__url', 'extractor', 'output', 'cmd_version', 'cmd', 'snapshot__timestamp')
+    fields = ('snapshot', 'extractor', 'status', 'output', 'pwd', 'cmd', 'start_ts', 'end_ts', 'cmd_version', *readonly_fields)
     autocomplete_fields = ['snapshot']
 
     list_filter = ('status', 'extractor', 'start_ts', 'cmd_version')
@@ -435,33 +492,36 @@ class ArchiveResultAdmin(admin.ModelAdmin):
     list_per_page = SNAPSHOTS_PER_PAGE
 
     @admin.display(
-        description='snapshot'
+        description='Snapshot Info'
     )
-    def snapshot_str(self, obj):
+    def snapshot_info(self, result):
         return format_html(
-            '<a href="/archive/{}/index.html"><b><code>[{}]</code></b></a><br/>'
-            '<small>{}</small>',
-            obj.snapshot.timestamp,
-            obj.snapshot.timestamp,
-            obj.snapshot.url[:128],
+            '<a href="/archive/{}/index.html"><b><code>[{}]</code></b> {} {}</a><br/>',
+            result.snapshot.timestamp,
+            result.snapshot.abid,
+            result.snapshot.added.strftime('%Y-%m-%d %H:%M'),
+            result.snapshot.url[:128],
         )
 
+    def identifiers(self, obj):
+        return get_abid_info(self, obj)
+
     @admin.display(
-        description='tags'
+        description='Snapshot Tags'
     )
-    def tags_str(self, obj):
-        return obj.snapshot.tags_str()
+    def tags_str(self, result):
+        return result.snapshot.tags_str()
 
-    def cmd_str(self, obj):
+    def cmd_str(self, result):
         return format_html(
             '<pre>{}</pre>',
-            ' '.join(obj.cmd) if isinstance(obj.cmd, list) else str(obj.cmd),
+            ' '.join(result.cmd) if isinstance(result.cmd, list) else str(result.cmd),
         )
 
-    def output_str(self, obj):
+    def output_str(self, result):
         return format_html(
             '<a href="/archive/{}/{}" class="output-link">↗️</a><pre>{}</pre>',
-            obj.snapshot.timestamp,
-            obj.output if (obj.status == 'succeeded') and obj.extractor not in ('title', 'archive_org') else 'index.html',
-            obj.output,
+            result.snapshot.timestamp,
+            result.output if (result.status == 'succeeded') and result.extractor not in ('title', 'archive_org') else 'index.html',
+            result.output,
         )
@@ -1,3 +1,5 @@
+__package__ = 'archivebox.core'
+
 from django.apps import AppConfig
 
 
@@ -5,6 +7,22 @@ class CoreConfig(AppConfig):
     name = 'core'
 
     def ready(self):
+        # register our custom admin as the primary django admin
+        from django.contrib import admin
+        from django.contrib.admin import sites
+        from core.admin import archivebox_admin
+
+        admin.site = archivebox_admin
+        sites.site = archivebox_admin
+
+        # register signal handlers
         from .auth import register_signals
 
         register_signals()
+
+
+# from django.contrib.admin.apps import AdminConfig
+# class CoreAdminConfig(AdminConfig):
+#     default_site = "core.admin.get_admin_site"
@@ -1,5 +1,6 @@
-import os
-from django.conf import settings
+__package__ = 'archivebox.core'
 
 from ..config import (
     LDAP
 )
@@ -1,10 +1,8 @@
-from django.conf import settings
 from ..config import (
     LDAP_CREATE_SUPERUSER
 )
 
 def create_user(sender, user=None, ldap_user=None, **kwargs):
-
     if not user.id and LDAP_CREATE_SUPERUSER:
         user.is_superuser = True
@@ -17,8 +17,6 @@
 
 def forwards_func(apps, schema_editor):
-    from core.models import EXTRACTORS
-
     Snapshot = apps.get_model("core", "Snapshot")
     ArchiveResult = apps.get_model("core", "ArchiveResult")
@@ -0,0 +1,43 @@
# Generated by Django 5.0.6 on 2024-05-13 10:56

import charidfield.fields
from django.db import migrations, models


class Migration(migrations.Migration):

    dependencies = [
        ('core', '0022_auto_20231023_2008'),
    ]

    operations = [
        migrations.AlterModelOptions(
            name='archiveresult',
            options={'verbose_name': 'Result'},
        ),
        migrations.AddField(
            model_name='archiveresult',
            name='abid',
            field=charidfield.fields.CharIDField(blank=True, db_index=True, default=None, help_text='ABID-format identifier for this entity (e.g. snp_01BJQMF54D093DXEAWZ6JYRPAQ)', max_length=30, null=True, prefix='res_', unique=True),
        ),
        migrations.AddField(
            model_name='snapshot',
            name='abid',
            field=charidfield.fields.CharIDField(blank=True, db_index=True, default=None, help_text='ABID-format identifier for this entity (e.g. snp_01BJQMF54D093DXEAWZ6JYRPAQ)', max_length=30, null=True, prefix='snp_', unique=True),
        ),
        migrations.AddField(
            model_name='snapshot',
            name='uuid',
            field=models.UUIDField(blank=True, null=True, unique=True),
        ),
        migrations.AddField(
            model_name='tag',
            name='abid',
            field=charidfield.fields.CharIDField(blank=True, db_index=True, default=None, help_text='ABID-format identifier for this entity (e.g. snp_01BJQMF54D093DXEAWZ6JYRPAQ)', max_length=30, null=True, prefix='tag_', unique=True),
        ),
        migrations.AlterField(
            model_name='archiveresult',
            name='extractor',
            field=models.CharField(choices=[('htmltotext', 'htmltotext'), ('git', 'git'), ('singlefile', 'singlefile'), ('media', 'media'), ('archive_org', 'archive_org'), ('readability', 'readability'), ('mercury', 'mercury'), ('favicon', 'favicon'), ('pdf', 'pdf'), ('headers', 'headers'), ('screenshot', 'screenshot'), ('dom', 'dom'), ('title', 'title'), ('wget', 'wget')], max_length=32),
        ),
    ]
archivebox/core/migrations/0024_auto_20240513_1143.py  (new file, 98 lines)
@@ -0,0 +1,98 @@
# Generated by Django 5.0.6 on 2024-05-13 11:43

from django.db import migrations
from datetime import datetime
from abid_utils.abid import abid_from_values


def calculate_abid(self):
    """
    Return a freshly derived ABID (assembled from attrs defined in ABIDModel.abid_*_src).
    """
    prefix = self.abid_prefix
    ts = eval(self.abid_ts_src)
    uri = eval(self.abid_uri_src)
    subtype = eval(self.abid_subtype_src)
    rand = eval(self.abid_rand_src)

    if (not prefix) or prefix == 'obj_':
        suggested_abid = self.__class__.__name__[:3].lower()
        raise Exception(f'{self.__class__.__name__}.abid_prefix must be defined to calculate ABIDs (suggested: {suggested_abid})')

    if not ts:
        ts = datetime.utcfromtimestamp(0)
        print(f'[!] WARNING: Generating ABID with ts=0000000000 placeholder because {self.__class__.__name__}.abid_ts_src={self.abid_ts_src} is unset!', ts.isoformat())

    if not uri:
        uri = str(self)
        print(f'[!] WARNING: Generating ABID with uri=str(self) placeholder because {self.__class__.__name__}.abid_uri_src={self.abid_uri_src} is unset!', uri)

    if not subtype:
        subtype = self.__class__.__name__
        print(f'[!] WARNING: Generating ABID with subtype={subtype} placeholder because {self.__class__.__name__}.abid_subtype_src={self.abid_subtype_src} is unset!', subtype)

    if not rand:
        rand = getattr(self, 'uuid', None) or getattr(self, 'id', None) or getattr(self, 'pk')
        print(f'[!] WARNING: Generating ABID with rand=self.id placeholder because {self.__class__.__name__}.abid_rand_src={self.abid_rand_src} is unset!', rand)

    abid = abid_from_values(
        prefix=prefix,
        ts=ts,
        uri=uri,
        subtype=subtype,
        rand=rand,
    )
    assert abid.ulid and abid.uuid and abid.typeid, f'Failed to calculate {prefix}_ABID for {self.__class__.__name__}'
    return abid
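Worth noting: calculate_abid() treats the abid_*_src attributes as Python expression strings and eval()s them with self in scope, which is how the data-migration functions below can point them at per-model fields without subclassing. A toy sketch of that mechanism (class and values are illustrative only):

    class FakeSnapshot:
        added = '2024-05-13T11:43:00'   # stand-in for Snapshot.added
        abid_ts_src = 'self.added'      # expression string, eval'd against the instance

    self = FakeSnapshot()               # `self` must be in scope for eval() to resolve it
    assert eval(self.abid_ts_src) == FakeSnapshot.added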

def copy_snapshot_uuids(apps, schema_editor):
    print('    Copying snapshot.id -> snapshot.uuid...')
    Snapshot = apps.get_model("core", "Snapshot")
    for snapshot in Snapshot.objects.all():
        snapshot.uuid = snapshot.id
        snapshot.save(update_fields=["uuid"])

def generate_snapshot_abids(apps, schema_editor):
    print('    Generating snapshot.abid values...')
    Snapshot = apps.get_model("core", "Snapshot")
    for snapshot in Snapshot.objects.all():
        snapshot.abid_prefix = 'snp_'
        snapshot.abid_ts_src = 'self.added'
        snapshot.abid_uri_src = 'self.url'
        snapshot.abid_subtype_src = '"01"'
        snapshot.abid_rand_src = 'self.uuid'

        snapshot.abid = calculate_abid(snapshot)
        snapshot.save(update_fields=["abid"])

def generate_archiveresult_abids(apps, schema_editor):
    print('    Generating ArchiveResult.abid values... (may take an hour or longer for large collections...)')
    ArchiveResult = apps.get_model("core", "ArchiveResult")
    Snapshot = apps.get_model("core", "Snapshot")
    for result in ArchiveResult.objects.all():
        result.abid_prefix = 'res_'
        result.snapshot = Snapshot.objects.get(pk=result.snapshot_id)
        result.snapshot_added = result.snapshot.added
        result.snapshot_url = result.snapshot.url
        result.abid_ts_src = 'self.snapshot_added'
        result.abid_uri_src = 'self.snapshot_url'
        result.abid_subtype_src = 'self.extractor'
        result.abid_rand_src = 'self.id'

        result.abid = calculate_abid(result)
        result.uuid = result.abid.uuid
        result.save(update_fields=["abid", "uuid"])


class Migration(migrations.Migration):

    dependencies = [
        ('core', '0023_alter_archiveresult_options_archiveresult_abid_and_more'),
    ]

    operations = [
        migrations.RunPython(copy_snapshot_uuids, reverse_code=migrations.RunPython.noop),
        migrations.RunPython(generate_snapshot_abids, reverse_code=migrations.RunPython.noop),
        migrations.RunPython(generate_archiveresult_abids, reverse_code=migrations.RunPython.noop),
    ]
19  archivebox/core/migrations/0025_alter_archiveresult_uuid.py  Normal file
@ -0,0 +1,19 @@
# Generated by Django 5.0.6 on 2024-05-13 12:08

import uuid
from django.db import migrations, models


class Migration(migrations.Migration):

    dependencies = [
        ('core', '0024_auto_20240513_1143'),
    ]

    operations = [
        migrations.AlterField(
            model_name='archiveresult',
            name='uuid',
            field=models.UUIDField(default=uuid.uuid4, editable=False, unique=True),
        ),
    ]
@ -0,0 +1,76 @@
# Generated by Django 5.0.6 on 2024-05-13 13:01

import abid_utils.models
import django.db.models.deletion
import django.utils.timezone
from django.conf import settings
from django.db import migrations, models


class Migration(migrations.Migration):

    dependencies = [
        ('core', '0025_alter_archiveresult_uuid'),
        migrations.swappable_dependency(settings.AUTH_USER_MODEL),
    ]

    operations = [
        migrations.AddField(
            model_name='archiveresult',
            name='created',
            field=models.DateTimeField(auto_now_add=True, default=django.utils.timezone.now),
            preserve_default=False,
        ),
        migrations.AddField(
            model_name='archiveresult',
            name='created_by',
            field=models.ForeignKey(default=abid_utils.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL),
        ),
        migrations.AddField(
            model_name='archiveresult',
            name='modified',
            field=models.DateTimeField(auto_now=True),
        ),
        migrations.AddField(
            model_name='snapshot',
            name='created',
            field=models.DateTimeField(auto_now_add=True, default=django.utils.timezone.now),
            preserve_default=False,
        ),
        migrations.AddField(
            model_name='snapshot',
            name='created_by',
            field=models.ForeignKey(default=abid_utils.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL),
        ),
        migrations.AddField(
            model_name='snapshot',
            name='modified',
            field=models.DateTimeField(auto_now=True),
        ),
        migrations.AddField(
            model_name='tag',
            name='created',
            field=models.DateTimeField(auto_now_add=True, default=django.utils.timezone.now),
            preserve_default=False,
        ),
        migrations.AddField(
            model_name='tag',
            name='created_by',
            field=models.ForeignKey(default=abid_utils.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL),
        ),
        migrations.AddField(
            model_name='tag',
            name='modified',
            field=models.DateTimeField(auto_now=True),
        ),
        migrations.AddField(
            model_name='tag',
            name='uuid',
            field=models.UUIDField(blank=True, null=True, unique=True),
        ),
        migrations.AlterField(
            model_name='archiveresult',
            name='uuid',
            field=models.UUIDField(blank=True, null=True, unique=True),
        ),
    ]
archivebox/core/admin.py

@ -10,7 +10,7 @@ class SearchResultsAdminMixin:
        search_term = search_term.strip()
        if not search_term:
-            return qs, use_distinct
+            return qs.distinct(), use_distinct
        try:
            qsearch = query_search_index(search_term)
            qs = qs | qsearch
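The `.distinct()` change above matters because the mixin later ORs the base queryset with full-text search results (`qs = qs | qsearch`), and OR-ing querysets that join through related tables can yield duplicate rows. A rough illustration of the underlying Django behavior, using the Snapshot model from this PR (filter values hypothetical):

# Each Snapshot matching both filters would otherwise appear more than once:
qs = Snapshot.objects.filter(tags__name__icontains='news')   # joins through the M2M table
qsearch = Snapshot.objects.filter(title__icontains='news')
combined = (qs | qsearch).distinct()                         # de-duplicated union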
archivebox/core/models.py

@ -1,11 +1,14 @@
__package__ = 'archivebox.core'

-import uuid
+from typing import Optional, List, Dict
+from django_stubs_ext.db.models import TypedModelMeta
+
import json

+import uuid
+from uuid import uuid4
from pathlib import Path
-from typing import Optional, List

from django.db import models
from django.utils.functional import cached_property

@ -15,40 +18,58 @@ from django.urls import reverse
from django.db.models import Case, When, Value, IntegerField
from django.contrib.auth.models import User   # noqa

+from abid_utils.models import ABIDModel, ABIDField
+
from ..config import ARCHIVE_DIR, ARCHIVE_DIR_NAME
from ..system import get_dir_size
-from ..util import parse_date, base_url, hashurl
+from ..util import parse_date, base_url
from ..index.schema import Link
from ..index.html import snapshot_icons
-from ..extractors import get_default_archive_methods, ARCHIVE_METHODS_INDEXING_PRECEDENCE
+from ..extractors import ARCHIVE_METHODS_INDEXING_PRECEDENCE, EXTRACTORS

-EXTRACTORS = [(extractor[0], extractor[0]) for extractor in get_default_archive_methods()]
+EXTRACTOR_CHOICES = [(extractor_name, extractor_name) for extractor_name in EXTRACTORS.keys()]
STATUS_CHOICES = [
    ("succeeded", "succeeded"),
    ("failed", "failed"),
    ("skipped", "skipped")
]

-try:
-    JSONField = models.JSONField
-except AttributeError:
-    import jsonfield
-    JSONField = jsonfield.JSONField
-
-class Tag(models.Model):
+# class BaseModel(models.Model):
+#     # TODO: migrate all models to a shared base class with all our standard fields and helpers:
+#     # ulid/created/modified/owner/is_deleted/as_json/from_json/etc.
+#     #
+#     # id = models.AutoField(primary_key=True, serialize=False, verbose_name='ID')
+#     # ulid = models.CharField(max_length=26, null=True, blank=True, db_index=True, unique=True)
+
+#     class Meta(TypedModelMeta):
+#         abstract = True
+
+
+class Tag(ABIDModel):
    """
-    Based on django-taggit model
+    Based on django-taggit model + ABID base.
    """
+    abid_prefix = 'tag_'
+    abid_ts_src = 'self.created'    # TODO: add created/modified time
+    abid_uri_src = 'self.name'
+    abid_subtype_src = '"03"'
+    abid_rand_src = 'self.id'
+
+    # id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=True)
    id = models.AutoField(primary_key=True, serialize=False, verbose_name='ID')
+    uuid = models.UUIDField(blank=True, null=True, editable=True, unique=True)
+    abid = ABIDField(prefix=abid_prefix)

    name = models.CharField(unique=True, blank=False, max_length=100)

-    # slug is autoset on save from name, never set it manually
    slug = models.SlugField(unique=True, blank=True, max_length=100)
+    # slug is autoset on save from name, never set it manually


-    class Meta:
+    class Meta(TypedModelMeta):
        verbose_name = "Tag"
        verbose_name_plural = "Tags"

@ -84,8 +105,16 @@ class Tag(models.Model):
        return super().save(*args, **kwargs)


-class Snapshot(models.Model):
-    id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False)
+class Snapshot(ABIDModel):
+    abid_prefix = 'snp_'
+    abid_ts_src = 'self.added'
+    abid_uri_src = 'self.url'
+    abid_subtype_src = '"01"'
+    abid_rand_src = 'self.id'
+
+    id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False)   # legacy pk
+    uuid = models.UUIDField(blank=True, null=True, editable=True, unique=True)
+    abid = ABIDField(prefix=abid_prefix)

    url = models.URLField(unique=True, db_index=True)
    timestamp = models.CharField(max_length=32, unique=True, db_index=True)

@ -98,6 +127,7 @@ class Snapshot(models.Model):
    keys = ('url', 'timestamp', 'title', 'tags', 'updated')
+
    def __repr__(self) -> str:
        title = self.title or '-'
        return f'[{self.timestamp}] {self.url[:64]} ({title[:64]})'

@ -126,8 +156,8 @@ class Snapshot(models.Model):
        from ..index import load_link_details
        return load_link_details(self.as_link())

-    def tags_str(self, nocache=True) -> str:
-        cache_key = f'{self.id}-{(self.updated or self.added).timestamp()}-tags'
+    def tags_str(self, nocache=True) -> str | None:
+        cache_key = f'{self.pk}-{(self.updated or self.added).timestamp()}-tags'
        calc_tags_str = lambda: ','.join(self.tags.order_by('name').values_list('name', flat=True))
        if nocache:
            tags_str = calc_tags_str()

@ -157,13 +187,9 @@ class Snapshot(models.Model):
        return self.as_link().is_archived

    @cached_property
-    def num_outputs(self):
+    def num_outputs(self) -> int:
        return self.archiveresult_set.filter(status='succeeded').count()

-    @cached_property
-    def url_hash(self):
-        return hashurl(self.url)
-
    @cached_property
    def base_url(self):
        return base_url(self.url)

@ -178,7 +204,7 @@ class Snapshot(models.Model):

    @cached_property
    def archive_size(self):
-        cache_key = f'{str(self.id)[:12]}-{(self.updated or self.added).timestamp()}-size'
+        cache_key = f'{str(self.pk)[:12]}-{(self.updated or self.added).timestamp()}-size'

        def calc_dir_size():
            try:

@ -199,7 +225,7 @@ class Snapshot(models.Model):
            return None

    @cached_property
-    def headers(self) -> Optional[dict]:
+    def headers(self) -> Optional[Dict[str, str]]:
        try:
            return json.loads((Path(self.link_dir) / 'headers.json').read_text(encoding='utf-8').strip())
        except Exception:

@ -250,11 +276,37 @@ class Snapshot(models.Model):
        tags_id = []
        for tag in tags:
            if tag.strip():
-                tags_id.append(Tag.objects.get_or_create(name=tag)[0].id)
+                tags_id.append(Tag.objects.get_or_create(name=tag)[0].pk)
        self.tags.clear()
        self.tags.add(*tags_id)

+    # def get_storage_dir(self, create=True, symlink=True) -> Path:
+    #     date_str = self.added.strftime('%Y%m%d')
+    #     domain_str = domain(self.url)
+    #     abs_storage_dir = Path(ARCHIVE_DIR) / 'snapshots' / date_str / domain_str / str(self.ulid)
+
+    #     if create and not abs_storage_dir.is_dir():
+    #         abs_storage_dir.mkdir(parents=True, exist_ok=True)
+
+    #     if symlink:
+    #         LINK_PATHS = [
+    #             Path(ARCHIVE_DIR).parent / 'index' / 'all_by_id' / str(self.ulid),
+    #             # Path(ARCHIVE_DIR).parent / 'index' / 'snapshots_by_id' / str(self.ulid),
+    #             Path(ARCHIVE_DIR).parent / 'index' / 'snapshots_by_date' / date_str / domain_str / str(self.ulid),
+    #             Path(ARCHIVE_DIR).parent / 'index' / 'snapshots_by_domain' / domain_str / date_str / str(self.ulid),
+    #         ]
+    #         for link_path in LINK_PATHS:
+    #             link_path.parent.mkdir(parents=True, exist_ok=True)
+    #             try:
+    #                 link_path.symlink_to(abs_storage_dir)
+    #             except FileExistsError:
+    #                 link_path.unlink()
+    #                 link_path.symlink_to(abs_storage_dir)
+
+    #     return abs_storage_dir


class ArchiveResultManager(models.Manager):
    def indexable(self, sorted: bool = True):
        INDEXABLE_METHODS = [ r[0] for r in ARCHIVE_METHODS_INDEXING_PRECEDENCE ]

@ -266,13 +318,22 @@ class ArchiveResultManager(models.Manager):
        return qs


-class ArchiveResult(models.Model):
-    id = models.AutoField(primary_key=True, serialize=False, verbose_name='ID')
-    uuid = models.UUIDField(default=uuid.uuid4, editable=False)
+class ArchiveResult(ABIDModel):
+    abid_prefix = 'res_'
+    abid_ts_src = 'self.snapshot.added'
+    abid_uri_src = 'self.snapshot.url'
+    abid_subtype_src = 'self.extractor'
+    abid_rand_src = 'self.uuid'
+    EXTRACTOR_CHOICES = EXTRACTOR_CHOICES
+
+    # id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False)
+    id = models.AutoField(primary_key=True, serialize=False, verbose_name='ID')   # legacy pk
+    uuid = models.UUIDField(blank=True, null=True, editable=True, unique=True)
+    abid = ABIDField(prefix=abid_prefix)

    snapshot = models.ForeignKey(Snapshot, on_delete=models.CASCADE)
-    extractor = models.CharField(choices=EXTRACTORS, max_length=32)
-    cmd = JSONField()
+    extractor = models.CharField(choices=EXTRACTOR_CHOICES, max_length=32)
+    cmd = models.JSONField()
    pwd = models.CharField(max_length=256)
    cmd_version = models.CharField(max_length=128, default=None, null=True, blank=True)
    output = models.CharField(max_length=1024)

@ -282,5 +343,69 @@ class ArchiveResult(models.Model):

    objects = ArchiveResultManager()

+    class Meta(TypedModelMeta):
+        verbose_name = 'Result'
+
    def __str__(self):
        return self.extractor

+    @cached_property
+    def snapshot_dir(self):
+        return Path(self.snapshot.link_dir)
+
+    @property
+    def extractor_module(self):
+        return EXTRACTORS[self.extractor]
+
+    def output_path(self) -> str:
+        """return the canonical output filename or directory name within the snapshot dir"""
+        return self.extractor_module.get_output_path()
+
+    def embed_path(self) -> str:
+        """
+        return the actual runtime-calculated path to the file on-disk that
+        should be used for user-facing iframe embeds of this result
+        """
+
+        if hasattr(self.extractor_module, 'get_embed_path'):
+            return self.extractor_module.get_embed_path(self)
+
+        return self.extractor_module.get_output_path()
+
+    def legacy_output_path(self):
+        link = self.snapshot.as_link()
+        return link.canonical_outputs().get(f'{self.extractor}_path')
+
+    def output_exists(self) -> bool:
+        return Path(self.output_path()).exists()
+
+
+    # def get_storage_dir(self, create=True, symlink=True):
+    #     date_str = self.snapshot.added.strftime('%Y%m%d')
+    #     domain_str = domain(self.snapshot.url)
+    #     abs_storage_dir = Path(ARCHIVE_DIR) / 'results' / date_str / domain_str / self.extractor / str(self.ulid)
+
+    #     if create and not abs_storage_dir.is_dir():
+    #         abs_storage_dir.mkdir(parents=True, exist_ok=True)
+
+    #     if symlink:
+    #         LINK_PATHS = [
+    #             Path(ARCHIVE_DIR).parent / 'index' / 'all_by_id' / str(self.ulid),
+    #             # Path(ARCHIVE_DIR).parent / 'index' / 'results_by_id' / str(self.ulid),
+    #             # Path(ARCHIVE_DIR).parent / 'index' / 'results_by_date' / date_str / domain_str / self.extractor / str(self.ulid),
+    #             Path(ARCHIVE_DIR).parent / 'index' / 'results_by_domain' / domain_str / date_str / self.extractor / str(self.ulid),
+    #             Path(ARCHIVE_DIR).parent / 'index' / 'results_by_type' / self.extractor / date_str / domain_str / str(self.ulid),
+    #         ]
+    #         for link_path in LINK_PATHS:
+    #             link_path.parent.mkdir(parents=True, exist_ok=True)
+    #             try:
+    #                 link_path.symlink_to(abs_storage_dir)
+    #             except FileExistsError:
+    #                 link_path.unlink()
+    #                 link_path.symlink_to(abs_storage_dir)
+
+    #     return abs_storage_dir

+    # def symlink_index(self, create=True):
+    #     abs_result_dir = self.get_storage_dir(create=create)
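Note on the `abid_*_src` class attributes introduced above: they are string expressions (`'self.added'`, `'self.snapshot.url'`, or a quoted literal like `'"01"'`) rather than direct references. A hedged sketch of how such a source string could be resolved, assuming `ABIDModel` evaluates each dotted expression against the instance; the real resolver lives in `abid_utils/models.py` and may differ:

def resolve_abid_src(instance, src: str):
    # Assumption: quoted sources are literals, dotted sources are attribute paths.
    if src.startswith('"') and src.endswith('"'):
        return src.strip('"')            # e.g. '"01"' -> '01'
    obj = instance
    for attr in src.split('.')[1:]:      # skip the leading 'self'
        obj = getattr(obj, attr)         # e.g. 'self.snapshot.url' -> instance.snapshot.url
    return obj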
archivebox/core/settings.py

@ -10,6 +10,7 @@ from pathlib import Path
from django.utils.crypto import get_random_string

from ..config import (
+    CONFIG,
    DEBUG,
    SECRET_KEY,
    ALLOWED_HOSTS,

@ -18,7 +19,9 @@ from ..config import (
    CUSTOM_TEMPLATES_DIR,
    SQL_INDEX_FILENAME,
    OUTPUT_DIR,
+    ARCHIVE_DIR,
    LOGS_DIR,
+    CACHE_DIR,
    TIMEZONE,

    LDAP,

@ -52,6 +55,26 @@ APPEND_SLASH = True

DEBUG = DEBUG or ('--debug' in sys.argv)

+
+# add plugins folders to system path, and load plugins in installed_apps
+BUILTIN_PLUGINS_DIR = PACKAGE_DIR / 'plugins'
+USER_PLUGINS_DIR = OUTPUT_DIR / 'plugins'
+sys.path.insert(0, str(BUILTIN_PLUGINS_DIR))
+sys.path.insert(0, str(USER_PLUGINS_DIR))
+
+def find_plugins(plugins_dir):
+    return {
+        # plugin_entrypoint.parent.name: import_module(plugin_entrypoint.parent.name).METADATA
+        plugin_entrypoint.parent.name: plugin_entrypoint.parent
+        for plugin_entrypoint in plugins_dir.glob('*/apps.py')
+    }
+
+INSTALLED_PLUGINS = {
+    **find_plugins(BUILTIN_PLUGINS_DIR),
+    **find_plugins(USER_PLUGINS_DIR),
+}
+
+
INSTALLED_APPS = [
    'django.contrib.auth',
    'django.contrib.contenttypes',

@ -59,8 +82,17 @@ INSTALLED_APPS = [
    'django.contrib.messages',
    'django.contrib.staticfiles',
    'django.contrib.admin',
+    'django_jsonform',
+
+    'signal_webhooks',
+    'abid_utils',
+    'plugantic',
    'core',
+    'api',
+
+    *INSTALLED_PLUGINS.keys(),
+
+    'admin_data_views',
+
    'django_extensions',
]

@ -172,6 +204,17 @@ if DEBUG_TOOLBAR:
    ]
    MIDDLEWARE = [*MIDDLEWARE, 'debug_toolbar.middleware.DebugToolbarMiddleware']

+
+# https://github.com/bensi94/Django-Requests-Tracker (improved version of django-debug-toolbar)
+# Must delete archivebox/templates/admin to use because it relies on some things we override
+# visit /__requests_tracker__/ to access
+DEBUG_REQUESTS_TRACKER = False
+if DEBUG_REQUESTS_TRACKER:
+    INSTALLED_APPS += ["requests_tracker"]
+    MIDDLEWARE += ["requests_tracker.middleware.requests_tracker_middleware"]
+    INTERNAL_IPS = ["127.0.0.1", "10.0.2.2", "0.0.0.0", "*"]
+
+
################################################################################
### Staticfile and Template Settings
################################################################################

@ -211,6 +254,11 @@ TEMPLATES = [
### External Service Settings
################################################################################

+
+CACHE_DB_FILENAME = 'cache.sqlite3'
+CACHE_DB_PATH = CACHE_DIR / CACHE_DB_FILENAME
+CACHE_DB_TABLE = 'django_cache'
+
DATABASE_FILE = Path(OUTPUT_DIR) / SQL_INDEX_FILENAME
DATABASE_NAME = os.environ.get("ARCHIVEBOX_DATABASE_NAME", str(DATABASE_FILE))

@ -224,23 +272,56 @@ DATABASES = {
        },
        'TIME_ZONE': TIMEZONE,
        # DB setup is sometimes modified at runtime by setup_django() in config.py
-    }
+    },
+    # 'cache': {
+    #     'ENGINE': 'django.db.backends.sqlite3',
+    #     'NAME': CACHE_DB_PATH,
+    #     'OPTIONS': {
+    #         'timeout': 60,
+    #         'check_same_thread': False,
+    #     },
+    #     'TIME_ZONE': TIMEZONE,
+    # },
}
+MIGRATION_MODULES = {'signal_webhooks': None}
+
+# as much as I'd love this to be a UUID or ULID field, it's not supported yet as of Django 5.0
+DEFAULT_AUTO_FIELD = 'django.db.models.BigAutoField'

-CACHE_BACKEND = 'django.core.cache.backends.locmem.LocMemCache'
-# CACHE_BACKEND = 'django.core.cache.backends.db.DatabaseCache'
-# CACHE_BACKEND = 'django.core.cache.backends.dummy.DummyCache'
-
CACHES = {
-    'default': {
-        'BACKEND': CACHE_BACKEND,
-        'LOCATION': 'django_cache_default',
-    }
+    'default': {'BACKEND': 'django.core.cache.backends.locmem.LocMemCache'},
+    # 'sqlite': {'BACKEND': 'django.core.cache.backends.db.DatabaseCache', 'LOCATION': 'cache'},
+    # 'dummy': {'BACKEND': 'django.core.cache.backends.dummy.DummyCache'},
+    # 'filebased': {"BACKEND": "django.core.cache.backends.filebased.FileBasedCache", "LOCATION": CACHE_DIR / 'cache_filebased'},
}

EMAIL_BACKEND = 'django.core.mail.backends.console.EmailBackend'

+
+STORAGES = {
+    "default": {
+        "BACKEND": "django.core.files.storage.FileSystemStorage",
+    },
+    "staticfiles": {
+        "BACKEND": "django.contrib.staticfiles.storage.StaticFilesStorage",
+    },
+    "archive": {
+        "BACKEND": "django.core.files.storage.FileSystemStorage",
+        "OPTIONS": {
+            "base_url": "/archive/",
+            "location": ARCHIVE_DIR,
+        },
+    },
+    # "personas": {
+    #     "BACKEND": "django.core.files.storage.FileSystemStorage",
+    #     "OPTIONS": {
+    #         "base_url": "/personas/",
+    #         "location": PERSONAS_DIR,
+    #     },
+    # },
+}
+
################################################################################
### Security Settings
################################################################################

@ -367,3 +448,54 @@ LOGGING = {
        }
    },
}

+
+# Add default webhook configuration to the User model
+SIGNAL_WEBHOOKS_CUSTOM_MODEL = 'api.models.OutboundWebhook'
+SIGNAL_WEBHOOKS = {
+    "HOOKS": {
+        # ... is a special sigil value that means "use the default autogenerated hooks"
+        "django.contrib.auth.models.User": ...,
+        "core.models.Snapshot": ...,
+        "core.models.ArchiveResult": ...,
+        "core.models.Tag": ...,
+        "api.models.APIToken": ...,
+    },
+}
+
+
+ADMIN_DATA_VIEWS = {
+    "NAME": "Environment",
+    "URLS": [
+        {
+            "route": "config/",
+            "view": "core.views.live_config_list_view",
+            "name": "Configuration",
+            "items": {
+                "route": "<str:key>/",
+                "view": "core.views.live_config_value_view",
+                "name": "config_val",
+            },
+        },
+        {
+            "route": "binaries/",
+            "view": "plugantic.views.binaries_list_view",
+            "name": "Binaries",
+            "items": {
+                "route": "<str:key>/",
+                "view": "plugantic.views.binary_detail_view",
+                "name": "binary",
+            },
+        },
+        {
+            "route": "plugins/",
+            "view": "plugantic.views.plugins_list_view",
+            "name": "Plugins",
+            "items": {
+                "route": "<str:key>/",
+                "view": "plugantic.views.plugin_detail_view",
+                "name": "plugin",
+            },
+        },
+    ],
+}
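With the discovery logic above, any folder matching `<plugins_dir>/*/apps.py` is treated as a plugin: its folder name becomes a key in `INSTALLED_PLUGINS` and is splatted into `INSTALLED_APPS`, and both plugin dirs are prepended to `sys.path` so the package imports by name. A hypothetical user plugin (folder and class names invented for illustration) would only need:

# data/plugins/example_plugin/apps.py   (hypothetical, not part of this diff)
from django.apps import AppConfig

class ExamplePluginConfig(AppConfig):
    name = 'example_plugin'   # must match the folder name picked up by find_plugins()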
archivebox/core/urls.py

@ -1,4 +1,4 @@
-from .admin import archivebox_admin
+__package__ = 'archivebox.core'

from django.urls import path, include
from django.views import static

@ -6,7 +6,14 @@ from django.contrib.staticfiles.urls import staticfiles_urlpatterns
from django.conf import settings
from django.views.generic.base import RedirectView

-from core.views import HomepageView, SnapshotView, PublicIndexView, AddView, HealthCheckView
+from .admin import archivebox_admin
+from .views import HomepageView, SnapshotView, PublicIndexView, AddView, HealthCheckView
+
+
+# GLOBAL_CONTEXT doesn't work as-is, disabled for now: https://github.com/ArchiveBox/ArchiveBox/discussions/1306
+# from config import VERSION, VERSIONS_AVAILABLE, CAN_UPGRADE
+# GLOBAL_CONTEXT = {'VERSION': VERSION, 'VERSIONS_AVAILABLE': VERSIONS_AVAILABLE, 'CAN_UPGRADE': CAN_UPGRADE}
+

# print('DEBUG', settings.DEBUG)

@ -31,8 +38,10 @@ urlpatterns = [
    path('accounts/', include('django.contrib.auth.urls')),
    path('admin/', archivebox_admin.urls),

+    path("api/", include('api.urls')),
+
    path('health/', HealthCheckView.as_view(), name='healthcheck'),
-    path('error/', lambda _: 1/0),
+    path('error/', lambda *_: 1/0),

    # path('jet_api/', include('jet_django.urls')), Enable to use https://www.jetadmin.io/integrations/django

@ -43,10 +52,10 @@ urlpatterns = [
urlpatterns += staticfiles_urlpatterns()

if settings.DEBUG_TOOLBAR:
-    import debug_toolbar
-    urlpatterns += [
-        path('__debug__/', include(debug_toolbar.urls)),
-    ]
+    urlpatterns += [path('__debug__/', include("debug_toolbar.urls"))]
+
+if settings.DEBUG_REQUESTS_TRACKER:
+    urlpatterns += [path("__requests_tracker__/", include("requests_tracker.urls"))]

# # Proposed FUTURE URLs spec
archivebox/core/views.py

@ -1,10 +1,13 @@
__package__ = 'archivebox.core'

+from typing import Callable
+
from io import StringIO
+from pathlib import Path
from contextlib import redirect_stdout

from django.shortcuts import render, redirect
-from django.http import HttpResponse, Http404
+from django.http import HttpRequest, HttpResponse, Http404
from django.utils.html import format_html, mark_safe
from django.views import View, static
from django.views.generic.list import ListView

@ -14,6 +17,10 @@ from django.contrib.auth.mixins import UserPassesTestMixin
from django.views.decorators.csrf import csrf_exempt
from django.utils.decorators import method_decorator

+from admin_data_views.typing import TableContext, ItemContext
+from admin_data_views.utils import render_with_table_view, render_with_item_view, ItemLink
+
from core.models import Snapshot
from core.forms import AddLinkForm

@ -26,10 +33,18 @@ from ..config import (
    COMMIT_HASH,
    FOOTER_INFO,
    SNAPSHOTS_PER_PAGE,
+    CONFIG,
+    CONFIG_SCHEMA,
+    DYNAMIC_CONFIG_SCHEMA,
+    USER_CONFIG,
+    SAVE_ARCHIVE_DOT_ORG,
+    PREVIEW_ORIGINALS,
)
+from ..logging_util import printable_filesize
from ..main import add
-from ..util import base_url, ansi_to_html
+from ..util import base_url, ansi_to_html, htmlencode, urldecode, urlencode, ts_to_date_str
from ..search import query_search_index
+from ..extractors.wget import wget_output_path


class HomepageView(View):

@ -46,10 +61,120 @@ class HomepageView(View):
class SnapshotView(View):
    # render static html index from filesystem archive/<timestamp>/index.html

+    @staticmethod
+    def render_live_index(request, snapshot):
+        TITLE_LOADING_MSG = 'Not yet archived...'
+        HIDDEN_RESULTS = ('favicon', 'headers', 'title', 'htmltotext', 'warc', 'archive_org')
+
+        archiveresults = {}
+
+        results = snapshot.archiveresult_set.all()
+
+        for result in results:
+            embed_path = result.embed_path()
+            abs_path = result.snapshot_dir / (embed_path or 'None')
+
+            if (result.status == 'succeeded'
+                and (result.extractor not in HIDDEN_RESULTS)
+                and embed_path
+                and abs_path.exists()):
+                if abs_path.is_dir() and not any(abs_path.glob('*.*')):
+                    continue
+
+                result_info = {
+                    'name': result.extractor,
+                    'path': embed_path,
+                    'ts': ts_to_date_str(result.end_ts),
+                    'size': abs_path.stat().st_size or '?',
+                }
+                archiveresults[result.extractor] = result_info
+
+        existing_files = {result['path'] for result in archiveresults.values()}
+        min_size_threshold = 10_000   # bytes
+        allowed_extensions = {
+            'txt',
+            'html',
+            'htm',
+            'png',
+            'jpg',
+            'jpeg',
+            'gif',
+            'webp',
+            'svg',
+            'webm',
+            'mp4',
+            'mp3',
+            'pdf',
+            'md',
+        }
+
+        # iterate through all the files in the snapshot dir and add the biggest ones to the result list
+        snap_dir = Path(snapshot.link_dir)
+        for result_file in (*snap_dir.glob('*'), *snap_dir.glob('*/*')):
+            extension = result_file.suffix.lstrip('.').lower()
+            if result_file.is_dir() or result_file.name.startswith('.') or extension not in allowed_extensions:
+                continue
+            if result_file.name in existing_files or result_file.name == 'index.html':
+                continue
+
+            file_size = result_file.stat().st_size or 0
+
+            if file_size > min_size_threshold:
+                archiveresults[result_file.name] = {
+                    'name': result_file.stem,
+                    'path': result_file.relative_to(snap_dir),
+                    'ts': ts_to_date_str(result_file.stat().st_mtime or 0),
+                    'size': file_size,
+                }
+
+        preferred_types = ('singlefile', 'wget', 'screenshot', 'dom', 'media', 'pdf', 'readability', 'mercury')
+        all_types = preferred_types + tuple(result_type for result_type in archiveresults.keys() if result_type not in preferred_types)
+
+        best_result = {'path': 'None'}
+        for result_type in preferred_types:
+            if result_type in archiveresults:
+                best_result = archiveresults[result_type]
+                break
+
+        link = snapshot.as_link()
+
+        link_info = link._asdict(extended=True)
+
+        try:
+            warc_path = 'warc/' + list(Path(snap_dir).glob('warc/*.warc.*'))[0].name
+        except IndexError:
+            warc_path = 'warc/'
+
+        context = {
+            **link_info,
+            **link_info['canonical'],
+            'title': htmlencode(
+                link.title
+                or (link.base_url if link.is_archived else TITLE_LOADING_MSG)
+            ),
+            'extension': link.extension or 'html',
+            'tags': link.tags or 'untagged',
+            'size': printable_filesize(link.archive_size) if link.archive_size else 'pending',
+            'status': 'archived' if link.is_archived else 'not yet archived',
+            'status_color': 'success' if link.is_archived else 'danger',
+            'oldest_archive_date': ts_to_date_str(link.oldest_archive_date),
+            'warc_path': warc_path,
+            'SAVE_ARCHIVE_DOT_ORG': SAVE_ARCHIVE_DOT_ORG,
+            'PREVIEW_ORIGINALS': PREVIEW_ORIGINALS,
+            'archiveresults': sorted(archiveresults.values(), key=lambda r: all_types.index(r['name']) if r['name'] in all_types else -r['size']),
+            'best_result': best_result,
+            # 'tags_str': 'somealskejrewlkrjwer,werlmwrwlekrjewlkrjwer324m532l,4m32,23m324234',
+        }
+        return render(template_name='core/snapshot_live.html', request=request, context=context)
+
+
    def get(self, request, path):
        if not request.user.is_authenticated and not PUBLIC_SNAPSHOTS:
            return redirect(f'/admin/login/?next={request.path}')

+        snapshot = None
+
        try:
            slug, archivefile = path.split('/', 1)
        except (IndexError, ValueError):

@ -65,7 +190,11 @@ class SnapshotView(View):
        try:
            try:
                snapshot = Snapshot.objects.get(Q(timestamp=slug) | Q(id__startswith=slug))
-                response = static.serve(request, archivefile, document_root=snapshot.link_dir, show_indexes=True)
+                if archivefile == 'index.html':
+                    # if they requested snapshot index, serve live rendered template instead of static html
+                    response = self.render_live_index(request, snapshot)
+                else:
+                    response = static.serve(request, archivefile, document_root=snapshot.link_dir, show_indexes=True)
                response["Link"] = f'<{snapshot.url}>; rel="canonical"'
                return response
            except Snapshot.DoesNotExist:

@ -117,26 +246,33 @@ class SnapshotView(View):
                status=404,
            )
        except Http404:
+            assert snapshot   # (Snapshot.DoesNotExist is already handled above)
+
            # Snapshot dir exists but file within does not e.g. 124235.324234/screenshot.png
            return HttpResponse(
                format_html(
                    (
                        '<center><br/><br/><br/>'
-                        f'Snapshot <a href="/archive/{snapshot.timestamp}/index.html" target="_top"><b><code>[{snapshot.timestamp}]</code></b></a> exists in DB, but resource <b><code>{snapshot.timestamp}/'
+                        f'Snapshot <a href="/archive/{snapshot.timestamp}/index.html" target="_top"><b><code>[{snapshot.timestamp}]</code></b></a>: <a href="{snapshot.url}" target="_blank" rel="noreferrer">{snapshot.url}</a><br/>'
+                        f'was queued on {str(snapshot.added).split(".")[0]}, '
+                        f'but no files have been saved yet in:<br/><b><a href="/archive/{snapshot.timestamp}/" target="_top"><code>{snapshot.timestamp}</code></a><code>/'
                        '{}'
-                        f'</code></b> does not exist in <a href="/archive/{snapshot.timestamp}/" target="_top">snapshot dir</a> yet.<br/><br/>'
-                        'Maybe this resource type is not availabe for this Snapshot,<br/>or the archiving process has not completed yet?<br/>'
-                        f'<pre><code># run this cmd to finish archiving this Snapshot<br/>archivebox update -t timestamp {snapshot.timestamp}</code></pre><br/><br/>'
+                        f'</code></b><br/><br/>'
+                        'It\'s possible {} '
+                        f'during the last capture on {str(snapshot.added).split(".")[0]},<br/>or that the archiving process has not completed yet.<br/>'
+                        f'<pre><code># run this cmd to finish/retry archiving this Snapshot</code><br/>'
+                        f'<code style="user-select: all; color: #333">archivebox update -t timestamp {snapshot.timestamp}</code></pre><br/><br/>'
                        '<div class="text-align: left; width: 100%; max-width: 400px">'
                        '<i><b>Next steps:</i></b><br/>'
                        f'- list all the <a href="/archive/{snapshot.timestamp}/" target="_top">Snapshot files <code>.*</code></a><br/>'
                        f'- view the <a href="/archive/{snapshot.timestamp}/index.html" target="_top">Snapshot <code>./index.html</code></a><br/>'
-                        f'- go to the <a href="/admin/core/snapshot/{snapshot.id}/change/" target="_top">Snapshot admin</a> to edit<br/>'
-                        f'- go to the <a href="/admin/core/snapshot/?id__startswith={snapshot.id}" target="_top">Snapshot actions</a> to re-archive<br/>'
+                        f'- go to the <a href="/admin/core/snapshot/{snapshot.pk}/change/" target="_top">Snapshot admin</a> to edit<br/>'
+                        f'- go to the <a href="/admin/core/snapshot/?uuid__startswith={snapshot.uuid}" target="_top">Snapshot actions</a> to re-archive<br/>'
                        '- or return to <a href="/" target="_top">the main index...</a></div>'
                        '</center>'
                    ),
-                    archivefile,
+                    archivefile if str(archivefile) != 'None' else '',
+                    f'the {archivefile} resource could not be fetched' if str(archivefile) != 'None' else 'the original site was not available',
                ),
                content_type="text/html",
                status=404,

@ -312,3 +448,124 @@ class HealthCheckView(View):
            content_type='text/plain',
            status=200
        )
+
+
+def find_config_section(key: str) -> str:
+    matching_sections = [
+        name for name, opts in CONFIG_SCHEMA.items() if key in opts
+    ]
+    section = matching_sections[0] if matching_sections else 'DYNAMIC'
+    return section
+
+def find_config_default(key: str) -> str:
+    default_val = USER_CONFIG.get(key, {}).get('default', lambda: None)
+    if isinstance(default_val, Callable):
+        return None
+    else:
+        default_val = repr(default_val)
+    return default_val
+
+def find_config_type(key: str) -> str:
+    if key in USER_CONFIG:
+        return USER_CONFIG[key]['type'].__name__
+    elif key in DYNAMIC_CONFIG_SCHEMA:
+        return type(CONFIG[key]).__name__
+    return 'str'
+
+def key_is_safe(key: str) -> bool:
+    for term in ('key', 'password', 'secret', 'token'):
+        if term in key.lower():
+            return False
+    return True
+
+@render_with_table_view
+def live_config_list_view(request: HttpRequest, **kwargs) -> TableContext:
+
+    assert request.user.is_superuser, 'Must be a superuser to view configuration settings.'
+
+    rows = {
+        "Section": [],
+        "Key": [],
+        "Type": [],
+        "Value": [],
+        "Default": [],
+        # "Documentation": [],
+        "Aliases": [],
+    }
+
+    for section in CONFIG_SCHEMA.keys():
+        for key in CONFIG_SCHEMA[section].keys():
+            rows['Section'].append(section)   # section.replace('_', ' ').title().replace(' Config', '')
+            rows['Key'].append(ItemLink(key, key=key))
+            rows['Type'].append(mark_safe(f'<code>{find_config_type(key)}</code>'))
+            rows['Value'].append(mark_safe(f'<code>{CONFIG[key]}</code>') if key_is_safe(key) else '******** (redacted)')
+            rows['Default'].append(mark_safe(f'<a href="https://github.com/search?q=repo%3AArchiveBox%2FArchiveBox+path%3Aconfig.py+%27{key}%27&type=code"><code style="text-decoration: underline">{find_config_default(key) or "See here..."}</code></a>'))
+            # rows['Documentation'].append(mark_safe(f'Wiki: <a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#{key.lower()}">{key}</a>'))
+            rows['Aliases'].append(', '.join(CONFIG_SCHEMA[section][key].get('aliases', [])))
+
+    section = 'DYNAMIC'
+    for key in DYNAMIC_CONFIG_SCHEMA.keys():
+        rows['Section'].append(section)   # section.replace('_', ' ').title().replace(' Config', '')
+        rows['Key'].append(ItemLink(key, key=key))
+        rows['Type'].append(mark_safe(f'<code>{find_config_type(key)}</code>'))
+        rows['Value'].append(mark_safe(f'<code>{CONFIG[key]}</code>') if key_is_safe(key) else '******** (redacted)')
+        rows['Default'].append(mark_safe(f'<a href="https://github.com/search?q=repo%3AArchiveBox%2FArchiveBox+path%3Aconfig.py+%27{key}%27&type=code"><code style="text-decoration: underline">{find_config_default(key) or "See here..."}</code></a>'))
+        # rows['Documentation'].append(mark_safe(f'Wiki: <a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#{key.lower()}">{key}</a>'))
+        rows['Aliases'].append(ItemLink(key, key=key) if key in USER_CONFIG else '')
+
+    return TableContext(
+        title="Computed Configuration Values",
+        table=rows,
+    )
+
+@render_with_item_view
+def live_config_value_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
+
+    assert request.user.is_superuser, 'Must be a superuser to view configuration settings.'
+
+    aliases = USER_CONFIG.get(key, {}).get("aliases", [])
+
+    return ItemContext(
+        slug=key,
+        title=key,
+        data=[
+            {
+                "name": mark_safe(f'data / ArchiveBox.conf [{find_config_section(key)}]  <b><code style="color: lightgray">{key}</code></b>' if key in USER_CONFIG else f'[DYNAMIC CONFIG]  <b><code style="color: lightgray">{key}</code></b>  <small>(calculated at runtime)</small>'),
+                "description": None,
+                "fields": {
+                    'Key': key,
+                    'Type': find_config_type(key),
+                    'Value': CONFIG[key] if key_is_safe(key) else '********',
+                },
+                "help_texts": {
+                    'Key': mark_safe(f'''
+                        <a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#{key.lower()}">Documentation</a>
+                        <span style="display: {"inline" if aliases else "none"}">
+                            Aliases: {", ".join(aliases)}
+                        </span>
+                    '''),
+                    'Type': mark_safe(f'''
+                        <a href="https://github.com/search?q=repo%3AArchiveBox%2FArchiveBox+path%3Aconfig.py+%27{key}%27&type=code">
+                            See full definition in <code>archivebox/config.py</code>...
+                        </a>
+                    '''),
+                    'Value': mark_safe(f'''
+                        {'<b style="color: red">Value is redacted for your security. (Passwords, secrets, API tokens, etc. cannot be viewed in the Web UI)</b><br/><br/>' if not key_is_safe(key) else ''}
+                        Default: <a href="https://github.com/search?q=repo%3AArchiveBox%2FArchiveBox+path%3Aconfig.py+%27{key}%27&type=code">
+                            <code>{find_config_default(key) or 'See here...'}</code>
+                        </a>
+                        <br/><br/>
+                        <p style="display: {"block" if key in USER_CONFIG else "none"}">
+                            <i>To change this value, edit <code>data/ArchiveBox.conf</code> or run:</i>
+                            <br/><br/>
+                            <code>archivebox config --set {key}="{
+                                val.strip("'")
+                                if (val := find_config_default(key)) else
+                                (repr(CONFIG[key] if key_is_safe(key) else '********')).strip("'")
+                            }"</code>
+                        </p>
+                    '''),
+                },
+            },
+        ],
+    )
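The redaction rule in `key_is_safe()` above is a plain substring check, so any key containing `key`, `password`, `secret`, or `token` is masked in both config views (key names below are illustrative inputs):

key_is_safe('TIMEOUT')             # True  -> value rendered normally
key_is_safe('LDAP_PASSWORD')       # False -> shown as '******** (redacted)'
key_is_safe('API_TOKEN_LIFETIME')  # False -> masked too, since 'token' is a substring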
archivebox/extractors/__init__.py

@ -1,11 +1,13 @@
__package__ = 'archivebox.extractors'

+from typing import Callable, Optional, Dict, List, Iterable, Union, Protocol, cast
+
import os
import sys
from pathlib import Path
+from importlib import import_module
-from typing import Callable, Optional, List, Iterable, Union
from datetime import datetime, timezone

from django.db.models import QuerySet

from ..config import (

@ -158,7 +160,7 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s
            # bump the updated time on the main Snapshot here, this is critical
            # to be able to cache summaries of the ArchiveResults for a given
            # snapshot without having to load all the results from the DB each time.
-            # (we use {Snapshot.id}-{Snapshot.updated} as the cache key and assume
+            # (we use {Snapshot.pk}-{Snapshot.updated} as the cache key and assume
            # ArchiveResults are unchanged as long as the updated timestamp is unchanged)
            snapshot.save()
        else:

@ -240,3 +242,37 @@ def archive_links(all_links: Union[Iterable[Link], QuerySet], overwrite: bool=Fa

    log_archiving_finished(num_links)
    return all_links
+
+
+EXTRACTORS_DIR = Path(__file__).parent
+
+class ExtractorModuleProtocol(Protocol):
+    """Type interface for an Extractor Module (WIP)"""
+
+    get_output_path: Callable
+
+    # TODO:
+    # get_embed_path: Callable | None
+    # should_extract(Snapshot)
+    # extract(Snapshot)
+
+
+def get_extractors(dir: Path=EXTRACTORS_DIR) -> Dict[str, ExtractorModuleProtocol]:
+    """iterate through archivebox/extractors/*.py and load extractor modules"""
+    EXTRACTORS = {}
+
+    for filename in EXTRACTORS_DIR.glob('*.py'):
+        if filename.name.startswith('__'):
+            continue
+
+        extractor_name = filename.name.replace('.py', '')
+
+        extractor_module = cast(ExtractorModuleProtocol, import_module(f'.{extractor_name}', package=__package__))
+
+        assert getattr(extractor_module, 'get_output_path')
+        EXTRACTORS[extractor_name] = extractor_module
+
+    return EXTRACTORS
+
+EXTRACTORS = get_extractors(EXTRACTORS_DIR)
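After this change, `EXTRACTORS` is a registry of extractor modules keyed by filename, which is what the new `ArchiveResult.extractor_module` property and the per-extractor `get_output_path()` helpers below plug into. Rough usage, assuming the stock extractor modules are present on disk:

from archivebox.extractors import EXTRACTORS

wget = EXTRACTORS['wget']        # the archivebox/extractors/wget.py module
print(wget.get_output_path())    # guaranteed to exist by the assert in get_extractors()

# optional per-module hook used by ArchiveResult.embed_path():
if hasattr(wget, 'get_embed_path'):
    print(wget.get_embed_path())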
archivebox/extractors/archive_org.py

@ -24,6 +24,8 @@ from ..config import (
)
from ..logging_util import TimedProgress

+def get_output_path():
+    return 'archive.org.txt'
+

@enforce_types

@ -32,7 +34,7 @@ def should_save_archive_dot_org(link: Link, out_dir: Optional[Path]=None, overwr
        return False

    out_dir = out_dir or Path(link.link_dir)
-    if not overwrite and (out_dir / 'archive.org.txt').exists():
+    if not overwrite and (out_dir / get_output_path()).exists():
        # if open(path, 'r', encoding='utf-8').read().strip() != 'None':
        return False

@ -43,7 +45,7 @@ def save_archive_dot_org(link: Link, out_dir: Optional[Path]=None, timeout: int=
    """submit site to archive.org for archiving via their service, save returned archive url"""

    out_dir = out_dir or Path(link.link_dir)
-    output: ArchiveOutput = 'archive.org.txt'
+    output: ArchiveOutput = get_output_path()
    archive_org_url = None
    submit_url = 'https://web.archive.org/save/{}'.format(link.url)
    # later options take precedence

@ -88,7 +90,7 @@ def save_archive_dot_org(link: Link, out_dir: Optional[Path]=None, timeout: int=
        archive_org_url = archive_org_url or submit_url
        with open(str(out_dir / output), 'w', encoding='utf-8') as f:
            f.write(archive_org_url)
-        chmod_file('archive.org.txt', cwd=str(out_dir))
+        chmod_file(str(out_dir / output), cwd=str(out_dir))
        output = archive_org_url

    return ArchiveResult(
archivebox/extractors/dom.py

@ -19,6 +19,9 @@ from ..config import (
from ..logging_util import TimedProgress


+def get_output_path():
+    return 'output.html'
+

@enforce_types
def should_save_dom(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:

@ -26,8 +29,8 @@ def should_save_dom(link: Link, out_dir: Optional[Path]=None, overwrite: Optiona
        return False

    out_dir = out_dir or Path(link.link_dir)
-    if not overwrite and (out_dir / 'output.html').exists():
-        if (out_dir / 'output.html').stat().st_size > 1:
+    if not overwrite and (out_dir / get_output_path()).exists():
+        if (out_dir / get_output_path()).stat().st_size > 1:
            return False

    return SAVE_DOM

@ -37,7 +40,7 @@ def save_dom(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) ->
    """print HTML of site to file using chrome --dump-html"""

    out_dir = out_dir or Path(link.link_dir)
-    output: ArchiveOutput = 'output.html'
+    output: ArchiveOutput = get_output_path()
    output_path = out_dir / output
    cmd = [
        *chrome_args(),
@@ -8,8 +8,8 @@ from ..index.schema import Link, ArchiveResult, ArchiveOutput
 from ..system import chmod_file, run
 from ..util import (
     enforce_types,
     domain,
     dedupe,
 )
 from ..config import (
     TIMEOUT,
@@ -33,6 +33,11 @@ def should_save_favicon(link: Link, out_dir: Optional[str]=None, overwrite: Opti
     return SAVE_FAVICON


+@enforce_types
+def get_output_path():
+    return 'favicon.ico'
+
+
 @enforce_types
 def save_favicon(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
     """download site favicon from google's favicon api"""
@@ -26,6 +26,19 @@ from ..config import (
 from ..logging_util import TimedProgress


+def get_output_path():
+    return 'git/'
+
+def get_embed_path(archiveresult=None):
+    if not archiveresult:
+        return get_output_path()
+
+    try:
+        return get_output_path() + list((archiveresult.snapshot_dir / get_output_path()).glob('*'))[0].name + '/'
+    except IndexError:
+        pass
+
+    return get_output_path()
+
+
 @enforce_types
 def should_save_git(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
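The new get_embed_path() returns the subfolder of the first cloned repo so the UI can embed it directly, falling back to the bare git/ folder when nothing was cloned. A runnable sketch of that lookup logic under an assumed snapshot layout (SimpleNamespace stands in for an ArchiveResult row):

    import tempfile
    from pathlib import Path
    from types import SimpleNamespace

    with tempfile.TemporaryDirectory() as snapshot_dir:
        (Path(snapshot_dir) / 'git' / 'ArchiveBox').mkdir(parents=True)
        result = SimpleNamespace(snapshot_dir=Path(snapshot_dir))
        # mirrors the get_embed_path() logic above:
        embed = 'git/' + list((result.snapshot_dir / 'git/').glob('*'))[0].name + '/'
        assert embed == 'git/ArchiveBox/'
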
@@ -33,7 +46,7 @@ def should_save_git(link: Link, out_dir: Optional[Path]=None, overwrite: Optiona
         return False

     out_dir = out_dir or Path(link.link_dir)
-    if not overwrite and (out_dir / 'git').exists():
+    if not overwrite and (out_dir / get_output_path()).exists():
         return False

     is_clonable_url = (
@@ -51,7 +64,7 @@ def save_git(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) ->
     """download full site using git"""

     out_dir = out_dir or Path(link.link_dir)
-    output: ArchiveOutput = 'git'
+    output: ArchiveOutput = get_output_path()
     output_path = out_dir / output
     output_path.mkdir(exist_ok=True)
     cmd = [
@@ -23,10 +23,14 @@ from ..config import (
 )
 from ..logging_util import TimedProgress

+def get_output_path():
+    return 'headers.json'
+
+
 @enforce_types
 def should_save_headers(link: Link, out_dir: Optional[str]=None, overwrite: Optional[bool]=False) -> bool:
     out_dir = out_dir or Path(link.link_dir)
-    if not overwrite and (out_dir / 'headers.json').exists():
+    if not overwrite and (out_dir / get_output_path()).exists():
         return False

     return SAVE_HEADERS
@@ -38,7 +42,7 @@ def save_headers(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT)

     out_dir = Path(out_dir or link.link_dir)
     output_folder = out_dir.absolute()
-    output: ArchiveOutput = 'headers.json'
+    output: ArchiveOutput = get_output_path()

     status = 'succeeded'
     timer = TimedProgress(timeout, prefix='      ')
@@ -59,7 +63,7 @@ def save_headers(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT)
     try:
         json_headers = get_headers(link.url, timeout=timeout)
         output_folder.mkdir(exist_ok=True)
-        atomic_write(str(output_folder / "headers.json"), json_headers)
+        atomic_write(str(output_folder / get_output_path()), json_headers)
     except (Exception, OSError) as err:
         status = 'failed'
         output = err
@@ -19,6 +19,12 @@ from ..util import (
 )
 from .title import get_html


+def get_output_path():
+    return "htmltotext.txt"
+
+
 class HTMLTextExtractor(HTMLParser):
     TEXT_ATTRS = [
         "alt", "cite", "href", "label",
@@ -109,7 +115,7 @@ def should_save_htmltotext(link: Link, out_dir: Optional[Path]=None, overwrite:
         return False

     out_dir = out_dir or Path(link.link_dir)
-    if not overwrite and (out_dir / 'htmltotext.txt').exists():
+    if not overwrite and (out_dir / get_output_path()).exists():
         return False

     return SAVE_HTMLTOTEXT
@@ -120,7 +126,7 @@ def save_htmltotext(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO
     """extract search-indexing-friendly text from an HTML document"""

     out_dir = Path(out_dir or link.link_dir)
-    output = "htmltotext.txt"
+    output = get_output_path()
     cmd = ['(internal) archivebox.extractors.htmltotext', './{singlefile,dom}.html']

     timer = TimedProgress(timeout, prefix='      ')
@@ -22,13 +22,27 @@ from ..config import (
 from ..logging_util import TimedProgress


+def get_output_path():
+    return 'media/'
+
+def get_embed_path(archiveresult=None):
+    if not archiveresult:
+        return get_output_path()
+
+    out_dir = archiveresult.snapshot_dir / get_output_path()
+    try:
+        return get_output_path() + list(out_dir.glob('*.mp4'))[0].name
+    except IndexError:
+        return get_output_path()
+
+
 @enforce_types
 def should_save_media(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
     if is_static_file(link.url):
         return False

     out_dir = out_dir or Path(link.link_dir)
-    if not overwrite and (out_dir / 'media').exists():
+    if not overwrite and (out_dir / get_output_path()).exists():
         return False

     return SAVE_MEDIA
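Likewise for media: get_embed_path() prefers the first .mp4 found in the output folder and falls back to the media/ directory itself. A runnable sketch of the fallback branch (a temporary directory stands in for a real snapshot dir):

    import tempfile
    from pathlib import Path

    with tempfile.TemporaryDirectory() as snapshot_dir:
        media_dir = Path(snapshot_dir) / 'media'
        media_dir.mkdir()
        try:
            embed = 'media/' + list(media_dir.glob('*.mp4'))[0].name
        except IndexError:
            embed = 'media/'  # no playable file found, embed the folder listing instead
        assert embed == 'media/'
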
@@ -38,7 +52,7 @@ def save_media(link: Link, out_dir: Optional[Path]=None, timeout: int=MEDIA_TIME
     """Download playlists or individual video, audio, and subtitles using youtube-dl or yt-dlp"""

     out_dir = out_dir or Path(link.link_dir)
-    output: ArchiveOutput = 'media'
+    output: ArchiveOutput = get_output_path()
     output_path = out_dir / output
     output_path.mkdir(exist_ok=True)
     # later options take precedence
@@ -24,6 +24,12 @@ from ..config import (
 from ..logging_util import TimedProgress


+def get_output_path():
+    return 'mercury/'
+
+def get_embed_path(archiveresult=None):
+    return get_output_path() + 'content.html'
+
+
 @enforce_types
 def ShellError(cmd: List[str], result: CompletedProcess, lines: int=20) -> ArchiveError:
@@ -44,7 +50,7 @@ def should_save_mercury(link: Link, out_dir: Optional[str]=None, overwrite: Opti
         return False

     out_dir = out_dir or Path(link.link_dir)
-    if not overwrite and (out_dir / 'mercury').exists():
+    if not overwrite and (out_dir / get_output_path()).exists():
         return False

     return SAVE_MERCURY
@@ -55,8 +61,8 @@ def save_mercury(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT)
     """download reader friendly version using @postlight/mercury-parser"""

     out_dir = Path(out_dir or link.link_dir)
-    output_folder = out_dir.absolute() / "mercury"
-    output = "mercury"
+    output_folder = out_dir.absolute() / get_output_path()
+    output = get_output_path()

     status = 'succeeded'
     timer = TimedProgress(timeout, prefix='      ')
@@ -19,13 +19,17 @@ from ..config import (
 from ..logging_util import TimedProgress


+def get_output_path():
+    return 'output.pdf'
+
+
 @enforce_types
 def should_save_pdf(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
     if is_static_file(link.url):
         return False

     out_dir = out_dir or Path(link.link_dir)
-    if not overwrite and (out_dir / 'output.pdf').exists():
+    if not overwrite and (out_dir / get_output_path()).exists():
         return False

     return SAVE_PDF
@@ -36,7 +40,7 @@ def save_pdf(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) ->
     """print PDF of site to file using chrome --headless"""

     out_dir = out_dir or Path(link.link_dir)
-    output: ArchiveOutput = 'output.pdf'
+    output: ArchiveOutput = get_output_path()
     cmd = [
         *chrome_args(),
         '--print-to-pdf',
@@ -51,7 +55,7 @@ def save_pdf(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) ->
             hints = (result.stderr or result.stdout).decode()
             raise ArchiveError('Failed to save PDF', hints)

-        chmod_file('output.pdf', cwd=str(out_dir))
+        chmod_file(get_output_path(), cwd=str(out_dir))
     except Exception as err:
         status = 'failed'
         output = err
@@ -22,6 +22,12 @@ from ..config import (
 from ..logging_util import TimedProgress
 from .title import get_html

+def get_output_path():
+    return 'readability/'
+
+def get_embed_path(archiveresult=None):
+    return get_output_path() + 'content.html'
+
+
 @enforce_types
 def should_save_readability(link: Link, out_dir: Optional[str]=None, overwrite: Optional[bool]=False) -> bool:
@@ -29,7 +35,7 @@ def should_save_readability(link: Link, out_dir: Optional[str]=None, overwrite:
         return False

     out_dir = out_dir or Path(link.link_dir)
-    if not overwrite and (out_dir / 'readability').exists():
+    if not overwrite and (out_dir / get_output_path()).exists():
         return False

     return SAVE_READABILITY
@@ -40,8 +46,8 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO
     """download reader friendly version using @mozilla/readability"""

     out_dir = Path(out_dir or link.link_dir)
-    output_folder = out_dir.absolute() / "readability"
-    output = "readability"
+    output_folder = out_dir.absolute() / get_output_path()
+    output = get_output_path()

     # Readability Docs: https://github.com/mozilla/readability
@@ -19,6 +19,9 @@ from ..config import (
 from ..logging_util import TimedProgress


+def get_output_path():
+    return 'screenshot.png'
+
+
 @enforce_types
 def should_save_screenshot(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
@@ -26,7 +29,7 @@ def should_save_screenshot(link: Link, out_dir: Optional[Path]=None, overwrite:
         return False

     out_dir = out_dir or Path(link.link_dir)
-    if not overwrite and (out_dir / 'screenshot.png').exists():
+    if not overwrite and (out_dir / get_output_path()).exists():
         return False

     return SAVE_SCREENSHOT
@@ -36,7 +39,7 @@ def save_screenshot(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO
     """take screenshot of site using chrome --headless"""

     out_dir = out_dir or Path(link.link_dir)
-    output: ArchiveOutput = 'screenshot.png'
+    output: ArchiveOutput = get_output_path()
     cmd = [
         *chrome_args(),
         '--screenshot',
@@ -26,13 +26,17 @@ from ..config import (
 from ..logging_util import TimedProgress


+def get_output_path():
+    return 'singlefile.html'
+
+
 @enforce_types
 def should_save_singlefile(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
     if is_static_file(link.url):
         return False

     out_dir = out_dir or Path(link.link_dir)
-    if not overwrite and (out_dir / 'singlefile.html').exists():
+    if not overwrite and (out_dir / get_output_path()).exists():
         return False

     return SAVE_SINGLEFILE
@@ -43,7 +47,7 @@ def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO
     """download full site using single-file"""

     out_dir = out_dir or Path(link.link_dir)
-    output = "singlefile.html"
+    output = get_output_path()

     browser_args = chrome_args(CHROME_TIMEOUT=0)
@@ -90,7 +94,8 @@ def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO
         status = 'failed'
         # TODO: Make this prettier. This is necessary to run the command (escape JSON internal quotes).
        cmd[2] = browser_args.replace('"', "\\\"")
-        err.hints = (result.stdout + result.stderr).decode().split('\n')
+        if result:
+            err.hints = (result.stdout + result.stderr).decode().split('\n')
         output = err
     finally:
         timer.end()
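The new `if result:` guard matters because the subprocess runner can fail before ever producing a CompletedProcess, leaving result as None. A minimal repro of the crash the guard prevents (values are made up):

    result = None  # e.g. the single-file subprocess never launched
    hints = None
    if result:
        hints = (result.stdout + result.stderr).decode().split('\n')
    # without the guard, accessing result.stdout would raise:
    # AttributeError: 'NoneType' object has no attribute 'stdout'
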
@@ -60,6 +60,7 @@ class TitleParser(HTMLParser):
         if tag.lower() == "title":
             self.inside_title_tag = False


 @enforce_types
 def get_html(link: Link, path: Path, timeout: int=TIMEOUT) -> str:
     """
@@ -84,6 +85,13 @@ def get_html(link: Link, path: Path, timeout: int=TIMEOUT) -> str:
     else:
         return document


+def get_output_path():
+    # TODO: actually save title to this file
+    # (currently only saved in ArchiveResult.output as charfield value, not saved to filesystem)
+    return 'title.json'
+
+
 @enforce_types
 def should_save_title(link: Link, out_dir: Optional[str]=None, overwrite: Optional[bool]=False) -> bool:
     # if link already has valid title, skip it
@@ -35,6 +35,18 @@ from ..config import (
 from ..logging_util import TimedProgress


+def get_output_path():
+    # TODO: actually save output into this folder, instead of do {domain}/**/index.html
+    return 'wget/'
+
+def get_embed_path(archiveresult=None):
+    if not archiveresult:
+        return get_output_path()
+
+    link = archiveresult.snapshot.as_link()
+    return wget_output_path(link)
+
+
 @enforce_types
 def should_save_wget(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
     output_path = wget_output_path(link)
@@ -133,64 +145,38 @@ def save_wget(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) ->


 @enforce_types
-def wget_output_path(link: Link) -> Optional[str]:
-    """calculate the path to the wgetted .html file, since wget may
-    adjust some paths to be different than the base_url path.
-
-    See docs on wget --adjust-extension (-E)
-    """
-
-    # Wget downloads can save in a number of different ways depending on the url:
-    #    https://example.com
-    #       > example.com/index.html
-    #    https://example.com?v=zzVa_tX1OiI
-    #       > example.com/index.html?v=zzVa_tX1OiI.html
-    #    https://www.example.com/?v=zzVa_tX1OiI
-    #       > example.com/index.html?v=zzVa_tX1OiI.html
-
-    #    https://example.com/abc
-    #       > example.com/abc.html
-    #    https://example.com/abc/
-    #       > example.com/abc/index.html
-    #    https://example.com/abc?v=zzVa_tX1OiI.html
-    #       > example.com/abc?v=zzVa_tX1OiI.html
-    #    https://example.com/abc/?v=zzVa_tX1OiI.html
-    #       > example.com/abc/index.html?v=zzVa_tX1OiI.html
-
-    #    https://example.com/abc/test.html
-    #       > example.com/abc/test.html
-    #    https://example.com/abc/test?v=zzVa_tX1OiI
-    #       > example.com/abc/test?v=zzVa_tX1OiI.html
-    #    https://example.com/abc/test/?v=zzVa_tX1OiI
-    #       > example.com/abc/test/index.html?v=zzVa_tX1OiI.html
-
-    # There's also lots of complexity around how the urlencoding and renaming
-    # is done for pages with query and hash fragments or extensions like shtml / htm / php / etc
-
-    # Since the wget algorithm for -E (appending .html) is incredibly complex
-    # and there's no way to get the computed output path from wget
-    # in order to avoid having to reverse-engineer how they calculate it,
-    # we just look in the output folder read the filename wget used from the filesystem
+def unsafe_wget_output_path(link: Link) -> Optional[str]:
+    # There used to be a bunch of complex reverse-engineering path mapping logic here,
+    # but it was removed in favor of just walking through the output folder recursively to try to find the
+    # html file that wget produced. It's *much much much* slower than deriving it statically, and is currently
+    # one of the main bottlenecks of ArchiveBox's performance (the output data is often on a slow HDD or network mount).
+    # But it's STILL better than trying to figure out URL -> html filepath mappings ourselves from first principles.
     full_path = without_fragment(without_query(path(link.url))).strip('/')
     search_dir = Path(link.link_dir) / domain(link.url).replace(":", "+") / urldecode(full_path)
     for _ in range(4):
-        if search_dir.exists():
-            if search_dir.is_dir():
-                html_files = [
-                    f for f in search_dir.iterdir()
-                    if re.search(".+\\.[Ss]?[Hh][Tt][Mm][Ll]?$", str(f), re.I | re.M)
-                ]
-                if html_files:
-                    return str(html_files[0].relative_to(link.link_dir))
-
-                # sometimes wget'd URLs have no ext and return non-html
-                # e.g. /some/example/rss/all -> some RSS XML content)
-                #      /some/other/url.o4g -> some binary unrecognized ext)
-                # test this with archivebox add --depth=1 https://getpocket.com/users/nikisweeting/feed/all
-                last_part_of_url = urldecode(full_path.rsplit('/', 1)[-1])
-                for file_present in search_dir.iterdir():
-                    if file_present == last_part_of_url:
-                        return str((search_dir / file_present).relative_to(link.link_dir))
+        try:
+            if search_dir.exists():
+                if search_dir.is_dir():
+                    html_files = [
+                        f for f in search_dir.iterdir()
+                        if re.search(".+\\.[Ss]?[Hh][Tt][Mm][Ll]?$", str(f), re.I | re.M)
+                    ]
+                    if html_files:
+                        return str(html_files[0].relative_to(link.link_dir))
+
+                    # sometimes wget'd URLs have no ext and return non-html
+                    # e.g. /some/example/rss/all -> some RSS XML content)
+                    #      /some/other/url.o4g -> some binary unrecognized ext)
+                    # test this with archivebox add --depth=1 https://getpocket.com/users/nikisweeting/feed/all
+                    last_part_of_url = urldecode(full_path.rsplit('/', 1)[-1])
+                    for file_present in search_dir.iterdir():
+                        if file_present == last_part_of_url:
+                            return str((search_dir / file_present).relative_to(link.link_dir))
+        except OSError:
+            # OSError 36 and others can happen here, caused by trying to check for impossible paths
+            # (paths derived from URLs can often contain illegal unicode characters or be too long,
+            # causing the OS / filesystem to reject trying to open them with a system-level error)
+            pass

         # Move up one directory level
         search_dir = search_dir.parent
@@ -200,10 +186,93 @@ def wget_output_path(link: Link) -> Optional[str]:

     # check for literally any file present that isnt an empty folder
     domain_dir = Path(domain(link.url).replace(":", "+"))
-    files_within = list((Path(link.link_dir) / domain_dir).glob('**/*.*'))
+    files_within = [path for path in (Path(link.link_dir) / domain_dir).glob('**/*.*') if not str(path).endswith('.orig')]
     if files_within:
         return str((domain_dir / files_within[-1]).relative_to(link.link_dir))

+    # abandon all hope, wget either never downloaded, or it produced an output path so horribly mutilated
+    # that it's better we just pretend it doesnt exist
+    # this is why ArchiveBox's specializes in REDUNDANTLY saving copies of sites with multiple different tools
+    return None
+
+
+@enforce_types
+def wget_output_path(link: Link) -> Optional[str]:
+    """calculate the path to the wgetted .html file, since wget may
+    adjust some paths to be different than the base_url path.
+
+    See docs on: wget --adjust-extension (-E), --restrict-file-names=windows|unix|ascii, --convert-links
+
+    WARNING: this function is extremely error prone because mapping URLs to filesystem paths deterministically
+    is basically impossible. Every OS and filesystem have different requirements on what special characters are
+    allowed, and URLs are *full* of all kinds of special characters, illegal unicode, and generally unsafe strings
+    that you dont want anywhere near your filesystem. Also URLs can be obscenely long, but most filesystems dont
+    accept paths longer than 250 characters. On top of all that, this function only exists to try to reverse engineer
+    wget's approach to solving this problem, so this is a shittier, less tested version of their already insanely
+    complicated attempt to do this. Here be dragons:
+        - https://github.com/ArchiveBox/ArchiveBox/issues/549
+        - https://github.com/ArchiveBox/ArchiveBox/issues/1373
+        - https://stackoverflow.com/questions/9532499/check-whether-a-path-is-valid-in-python-without-creating-a-file-at-the-paths-ta
+        - and probably many more that I didn't realize were caused by this...
+
+    The only constructive thing we could possibly do to this function is to figure out how to remove it.
+
+    Preach loudly to anyone who will listen: never attempt to map URLs to filesystem paths,
+    and pray you never have to deal with the aftermath of someone else's attempt to do so...
+    """
+
+    # Wget downloads can save in a number of different ways depending on the url:
+    #    https://example.com
+    #       > example.com/index.html
+    #    https://example.com?v=zzVa_tX1OiI
+    #       > example.com/index.html@v=zzVa_tX1OiI.html
+    #    https://www.example.com/?v=zzVa_tX1OiI
+    #       > example.com/index.html@v=zzVa_tX1OiI.html
+
+    #    https://example.com/abc
+    #       > example.com/abc.html
+    #    https://example.com/abc/
+    #       > example.com/abc/index.html
+    #    https://example.com/abc?v=zzVa_tX1OiI.html
+    #       > example.com/abc@v=zzVa_tX1OiI.html
+    #    https://example.com/abc/?v=zzVa_tX1OiI.html
+    #       > example.com/abc/index.html@v=zzVa_tX1OiI.html
+
+    #    https://example.com/abc/test.html
+    #       > example.com/abc/test.html
+    #    https://example.com/abc/test?v=zzVa_tX1OiI
+    #       > example.com/abc/test@v=zzVa_tX1OiI.html
+    #    https://example.com/abc/test/?v=zzVa_tX1OiI
+    #       > example.com/abc/test/index.html@v=zzVa_tX1OiI.html
+
+    # There's also lots of complexity around how the urlencoding and renaming
+    # is done for pages with query and hash fragments, extensions like shtml / htm / php / etc,
+    # unicode escape sequences, punycode domain names, unicode double-width characters, extensions longer than
+    # 4 characters, paths with multipe extensions, etc. the list goes on...
+
+    output_path = None
+    try:
+        output_path = unsafe_wget_output_path(link)
+    except Exception as err:
+        pass  # better to pretend it just failed to download than expose gnarly OSErrors to users
+
+    # check for unprintable unicode characters
+    # https://github.com/ArchiveBox/ArchiveBox/issues/1373
+    if output_path:
+        safe_path = output_path.encode('utf-8', 'replace').decode()
+        if output_path != safe_path:
+            # contains unprintable unicode characters that will break other parts of archivebox
+            # better to pretend it doesnt exist and fallback to parent dir than crash archivebox
+            output_path = None
+
+    # check for a path that is just too long to safely handle across different OS's
+    # https://github.com/ArchiveBox/ArchiveBox/issues/549
+    if output_path and len(output_path) > 250:
+        output_path = None
+
+    if output_path:
+        return output_path
+
     # fallback to just the domain dir
     search_dir = Path(link.link_dir) / domain(link.url).replace(":", "+")
     if search_dir.is_dir():
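The two validity checks added in wget_output_path() can be exercised standalone. A runnable sketch with made-up example values (not taken from a real archive):

    # unprintable/unencodable characters -> treat the path as missing
    output_path = 'example.com/t\udcffitle.html'           # contains a lone surrogate
    safe_path = output_path.encode('utf-8', 'replace').decode()
    assert safe_path != output_path                         # would be rejected above

    # paths over 250 chars -> also treated as missing
    too_long = 'example.com/' + 'a' * 300 + '.html'
    assert len(too_long) > 250
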
@@ -118,10 +118,10 @@ def render_django_template(template: str, context: Mapping[str, str]) -> str:


 def snapshot_icons(snapshot) -> str:
-    cache_key = f'{snapshot.id}-{(snapshot.updated or snapshot.added).timestamp()}-snapshot-icons'
+    cache_key = f'{snapshot.pk}-{(snapshot.updated or snapshot.added).timestamp()}-snapshot-icons'

     def calc_snapshot_icons():
-        from core.models import EXTRACTORS
+        from core.models import EXTRACTOR_CHOICES
         # start = datetime.now(timezone.utc)

         archive_results = snapshot.archiveresult_set.filter(status="succeeded", output__isnull=False)
@@ -147,12 +147,12 @@ def snapshot_icons(snapshot) -> str:
     # Missing specific entry for WARC

     extractor_outputs = defaultdict(lambda: None)
-    for extractor, _ in EXTRACTORS:
+    for extractor, _ in EXTRACTOR_CHOICES:
         for result in archive_results:
             if result.extractor == extractor and result:
                 extractor_outputs[extractor] = result

-    for extractor, _ in EXTRACTORS:
+    for extractor, _ in EXTRACTOR_CHOICES:
         if extractor not in exclude:
             existing = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output
             # Check filesystsem to see if anything is actually present (too slow, needs optimization/caching)
@@ -4,6 +4,7 @@ WARNING: THIS FILE IS ALL LEGACY CODE TO BE REMOVED.

 DO NOT ADD ANY NEW FEATURES TO THIS FILE, NEW CODE GOES HERE: core/models.py

+These are the old types we used to use before ArchiveBox v0.4 (before we switched to Django).
 """

 __package__ = 'archivebox.index'
@@ -191,6 +192,9 @@ class Link:
         if extended:
             info.update({
                 'snapshot_id': self.snapshot_id,
+                'snapshot_uuid': self.snapshot_uuid,
+                'snapshot_abid': self.snapshot_abid,
+
                 'link_dir': self.link_dir,
                 'archive_path': self.archive_path,
@@ -260,9 +264,21 @@ class Link:
         return to_csv(self, cols=cols or self.field_names(), separator=separator, ljust=ljust)

     @cached_property
-    def snapshot_id(self):
+    def snapshot(self):
         from core.models import Snapshot
-        return str(Snapshot.objects.only('id').get(url=self.url).id)
+        return Snapshot.objects.only('uuid').get(url=self.url)
+
+    @cached_property
+    def snapshot_id(self):
+        return str(self.snapshot.pk)
+
+    @cached_property
+    def snapshot_uuid(self):
+        return str(self.snapshot.uuid)
+
+    @cached_property
+    def snapshot_abid(self):
+        return str(self.snapshot.ABID)

     @classmethod
     def field_names(cls):
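Because `snapshot` is a cached_property, the three derived properties share a single database lookup per Link instance. A self-contained demo of that caching behavior (a dict stands in for the Snapshot row; the real code calls Snapshot.objects.get()):

    from functools import cached_property

    class LinkSketch:
        lookups = 0

        @cached_property
        def snapshot(self):
            LinkSketch.lookups += 1   # stands in for Snapshot.objects.get(url=...)
            return {'pk': 1, 'uuid': 'abc123', 'ABID': 'snp_01xyz'}

        @property
        def snapshot_id(self):
            return str(self.snapshot['pk'])

        @property
        def snapshot_uuid(self):
            return str(self.snapshot['uuid'])

    link = LinkSketch()
    _ = (link.snapshot_id, link.snapshot_uuid, link.snapshot['ABID'])
    assert LinkSketch.lookups == 1    # all three accesses share one lookup
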
@@ -45,7 +45,8 @@ def write_link_to_sql_index(link: Link):
     info.pop('tags')

     try:
-        info["timestamp"] = Snapshot.objects.get(url=link.url).timestamp
+        snapshot = Snapshot.objects.get(url=link.url)
+        info["timestamp"] = snapshot.timestamp
     except Snapshot.DoesNotExist:
         while Snapshot.objects.filter(timestamp=info["timestamp"]).exists():
             info["timestamp"] = str(float(info["timestamp"]) + 1.0)
@@ -57,7 +58,7 @@ def write_link_to_sql_index(link: Link):
     for entry in entries:
         if isinstance(entry, dict):
             result, _ = ArchiveResult.objects.get_or_create(
-                snapshot_id=snapshot.id,
+                snapshot_id=snapshot.pk,
                 extractor=extractor,
                 start_ts=parse_date(entry['start_ts']),
                 defaults={
@@ -71,7 +72,7 @@ def write_link_to_sql_index(link: Link):
             )
         else:
             result, _ = ArchiveResult.objects.update_or_create(
-                snapshot_id=snapshot.id,
+                snapshot_id=snapshot.pk,
                 extractor=extractor,
                 start_ts=parse_date(entry.start_ts),
                 defaults={
@@ -142,7 +143,12 @@ def list_migrations(out_dir: Path=OUTPUT_DIR) -> List[Tuple[bool, str]]:
 def apply_migrations(out_dir: Path=OUTPUT_DIR) -> List[str]:
     from django.core.management import call_command
     null, out = StringIO(), StringIO()
-    call_command("makemigrations", interactive=False, stdout=null)
+    try:
+        call_command("makemigrations", interactive=False, stdout=null)
+    except Exception as e:
+        print('[!] Failed to create some migrations. Please open an issue and copy paste this output for help: {}'.format(e))
+        print()
+
     call_command("migrate", interactive=False, stdout=out)
     out.seek(0)
@@ -638,17 +638,15 @@ def printable_folder_status(name: str, folder: Dict) -> str:

 @enforce_types
 def printable_dependency_version(name: str, dependency: Dict) -> str:
-    version = None
+    color, symbol, note, version = 'red', 'X', 'invalid', '?'
+
     if dependency['enabled']:
         if dependency['is_valid']:
-            color, symbol, note, version = 'green', '√', 'valid', ''
+            color, symbol, note = 'green', '√', 'valid'

             parsed_version_num = re.search(r'[\d\.]+', dependency['version'])
             if parsed_version_num:
                 version = f'v{parsed_version_num[0]}'
-
-        if not version:
-            color, symbol, note, version = 'red', 'X', 'invalid', '?'
     else:
         color, symbol, note, version = 'lightyellow', '-', 'disabled', '-'
@@ -104,7 +104,6 @@ from .config import (
     COMMIT_HASH,
     BUILD_TIME,
     CODE_LOCATIONS,
-    EXTERNAL_LOCATIONS,
     DATA_LOCATIONS,
     DEPENDENCIES,
     CHROME_BINARY,
@@ -231,7 +230,7 @@ def version(quiet: bool=False,
     p = platform.uname()
     print(
         'ArchiveBox v{}'.format(get_version(CONFIG)),
-        *((f'COMMIT_HASH={COMMIT_HASH[:7]}',) if COMMIT_HASH else ()),
+        f'COMMIT_HASH={COMMIT_HASH[:7] if COMMIT_HASH else "unknown"}',
         f'BUILD_TIME={BUILD_TIME}',
     )
     print(
@@ -272,11 +271,6 @@ def version(quiet: bool=False,
     for name, path in CODE_LOCATIONS.items():
         print(printable_folder_status(name, path))

-    print()
-    print('{white}[i] Secrets locations:{reset}'.format(**ANSI))
-    for name, path in EXTERNAL_LOCATIONS.items():
-        print(printable_folder_status(name, path))
-
     print()
     if DATA_LOCATIONS['OUTPUT_DIR']['is_valid']:
         print('{white}[i] Data locations:{reset}'.format(**ANSI))
@@ -695,7 +689,7 @@ def add(urls: Union[str, List[str]],
     if CAN_UPGRADE:
         hint(f"There's a new version of ArchiveBox available! Your current version is {VERSION}. You can upgrade to {VERSIONS_AVAILABLE['recommended_version']['tag_name']} ({VERSIONS_AVAILABLE['recommended_version']['html_url']}). For more on how to upgrade: https://github.com/ArchiveBox/ArchiveBox/wiki/Upgrading-or-Merging-Archives\n")

-    return all_links
+    return new_links

 @enforce_types
 def remove(filter_str: Optional[str]=None,
@@ -1362,7 +1356,7 @@ def manage(args: Optional[List[str]]=None, out_dir: Path=OUTPUT_DIR) -> None:
     if (args and "createsuperuser" in args) and (IN_DOCKER and not IS_TTY):
         stderr('[!] Warning: you need to pass -it to use interactive commands in docker', color='lightyellow')
         stderr('    docker run -it archivebox manage {}'.format(' '.join(args or ['...'])), color='lightyellow')
-        stderr()
+        stderr('')

     execute_from_command_line([f'{ARCHIVEBOX_BINARY} manage', *(args or ['help'])])
@@ -7,7 +7,7 @@ if __name__ == '__main__':
     # versions of ./manage.py commands whenever possible. When that's not possible
     # (e.g. makemigrations), you can comment out this check temporarily

-    if not ('makemigrations' in sys.argv or 'migrate' in sys.argv):
+    if not ('makemigrations' in sys.argv or 'migrate' in sys.argv or 'startapp' in sys.argv):
         print("[X] Don't run ./manage.py directly (unless you are a developer running makemigrations):")
         print()
         print('    Hint: Use these archivebox CLI commands instead of the ./manage.py equivalents:')
archivebox/monkey_patches.py (new file, 16 lines)
@@ -0,0 +1,16 @@
+__package__ = 'archivebox'
+
+import django_stubs_ext
+
+django_stubs_ext.monkeypatch()
+
+
+# monkey patch django timezone to add back utc (it was removed in Django 5.0)
+import datetime
+from django.utils import timezone
+timezone.utc = datetime.timezone.utc
+
+
+# monkey patch django-signals-webhooks to change how it shows up in Admin UI
+# from signal_webhooks.apps import DjangoSignalWebhooksConfig
+# DjangoSignalWebhooksConfig.verbose_name = 'API'
archivebox/package-lock.json (generated, new file, 2679 lines; diff suppressed because it is too large)
@@ -1,6 +1,6 @@
 {
     "name": "archivebox",
-    "version": "0.7.3",
+    "version": "0.8.1",
     "description": "ArchiveBox: The self-hosted internet archive",
     "author": "Nick Sweeting <archivebox-npm@sweeting.me>",
     "repository": "github:ArchiveBox/ArchiveBox",
|
@ -8,6 +8,6 @@
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
"@postlight/parser": "^2.2.3",
|
"@postlight/parser": "^2.2.3",
|
||||||
"readability-extractor": "github:ArchiveBox/readability-extractor",
|
"readability-extractor": "github:ArchiveBox/readability-extractor",
|
||||||
"single-file-cli": "^1.1.46"
|
"single-file-cli": "^1.1.54"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@@ -7,7 +7,6 @@ For examples of supported import formats see tests/.

 __package__ = 'archivebox.parsers'

-import re
 from io import StringIO

 from typing import IO, Tuple, List, Optional
@@ -28,7 +27,6 @@ from ..util import (
     htmldecode,
     download_url,
     enforce_types,
-    URL_REGEX,
 )
 from ..index.schema import Link
 from ..logging_util import TimedProgress, log_source_saved
@@ -202,54 +200,3 @@ def save_file_as_source(path: str, timeout: int=TIMEOUT, filename: str='{ts}-{ba
     log_source_saved(source_file=source_path)

     return source_path
-
-
-# Check that plain text regex URL parsing works as expected
-# this is last-line-of-defense to make sure the URL_REGEX isn't
-# misbehaving due to some OS-level or environment level quirks (e.g. bad regex lib)
-# the consequences of bad URL parsing could be disastrous and lead to many
-# incorrect/badly parsed links being added to the archive, so this is worth the cost of checking
-_test_url_strs = {
-    'example.com': 0,
-    '/example.com': 0,
-    '//example.com': 0,
-    ':/example.com': 0,
-    '://example.com': 0,
-    'htt://example8.com': 0,
-    '/htt://example.com': 0,
-    'https://example': 1,
-    'https://localhost/2345': 1,
-    'https://localhost:1234/123': 1,
-    '://': 0,
-    'https://': 0,
-    'http://': 0,
-    'ftp://': 0,
-    'ftp://example.com': 0,
-    'https://example.com': 1,
-    'https://example.com/': 1,
-    'https://a.example.com': 1,
-    'https://a.example.com/': 1,
-    'https://a.example.com/what/is/happening.html': 1,
-    'https://a.example.com/what/ís/happening.html': 1,
-    'https://a.example.com/what/is/happening.html?what=1&2%20b#höw-about-this=1a': 1,
-    'https://a.example.com/what/is/happéning/?what=1&2%20b#how-aboüt-this=1a': 1,
-    'HTtpS://a.example.com/what/is/happening/?what=1&2%20b#how-about-this=1af&2f%20b': 1,
-    'https://example.com/?what=1#how-about-this=1&2%20baf': 1,
-    'https://example.com?what=1#how-about-this=1&2%20baf': 1,
-    '<test>http://example7.com</test>': 1,
-    'https://<test>': 0,
-    'https://[test]': 0,
-    'http://"test"': 0,
-    'http://\'test\'': 0,
-    '[https://example8.com/what/is/this.php?what=1]': 1,
-    '[and http://example9.com?what=1&other=3#and-thing=2]': 1,
-    '<what>https://example10.com#and-thing=2 "</about>': 1,
-    'abc<this["https://example11.com/what/is#and-thing=2?whoami=23&where=1"]that>def': 1,
-    'sdflkf[what](https://example12.com/who/what.php?whoami=1#whatami=2)?am=hi': 1,
-    '<or>http://examplehttp://15.badc</that>': 2,
-    'https://a.example.com/one.html?url=http://example.com/inside/of/another?=http://': 2,
-    '[https://a.example.com/one.html?url=http://example.com/inside/of/another?=](http://a.example.com)': 3,
-}
-for url_str, num_urls in _test_url_strs.items():
-    assert len(re.findall(URL_REGEX, url_str)) == num_urls, (
-        f'{url_str} does not contain {num_urls} urls')
@@ -10,7 +10,7 @@ from ..index.schema import Link
 from ..util import (
     htmldecode,
     enforce_types,
-    URL_REGEX,
+    find_all_urls,
 )
 from html.parser import HTMLParser
 from urllib.parse import urljoin
@@ -40,10 +40,22 @@ def parse_generic_html_export(html_file: IO[str], root_url: Optional[str]=None,
         parser.feed(line)
         for url in parser.urls:
             if root_url:
-                # resolve relative urls /home.html -> https://example.com/home.html
-                url = urljoin(root_url, url)
+                url_is_absolute = (url.lower().startswith('http://') or url.lower().startswith('https://'))
+                # url = https://abc.com                      => True
+                # url = /page.php?next=https://example.com   => False
+
+                if not url_is_absolute:  # resolve it by joining it with root_url
+                    relative_path = url
+                    url = urljoin(root_url, relative_path)  # https://example.com/somepage.html + /home.html
+                                                            # => https://example.com/home.html
+
+                    # special case to handle bug around // handling, crucial for urls that contain sub-urls
+                    # e.g. https://web.archive.org/web/https://example.com
+                    if did_urljoin_misbehave(root_url, relative_path, url):
+                        url = fix_urljoin_bug(url)

-            for archivable_url in re.findall(URL_REGEX, url):
+            for archivable_url in find_all_urls(url):
                 yield Link(
                     url=htmldecode(archivable_url),
                     timestamp=str(datetime.now(timezone.utc).timestamp()),
@@ -56,3 +68,74 @@ def parse_generic_html_export(html_file: IO[str], root_url: Optional[str]=None,
 KEY = 'html'
 NAME = 'Generic HTML'
 PARSER = parse_generic_html_export
+
+
+#### WORKAROUND CODE FOR https://github.com/python/cpython/issues/96015 ####
+
+def did_urljoin_misbehave(root_url: str, relative_path: str, final_url: str) -> bool:
+    """
+    Handle urljoin edge case bug where multiple slashes get turned into a single slash:
+    - https://github.com/python/cpython/issues/96015
+    - https://github.com/ArchiveBox/ArchiveBox/issues/1411
+
+    This workaround only fixes the most common case of a sub-URL inside an outer URL, e.g.:
+       https://web.archive.org/web/https://example.com/some/inner/url
+
+    But there are other valid URLs containing // that are not fixed by this workaround, e.g.:
+       https://example.com/drives/C//some/file
+    """
+
+    # if relative path is actually an absolute url, cut off its own scheme so we check the path component only
+    relative_path = relative_path.lower()
+    if relative_path.startswith('http://') or relative_path.startswith('https://'):
+        relative_path = relative_path.split('://', 1)[-1]
+
+    # TODO: properly fix all double // getting stripped by urljoin, not just ://
+    original_path_had_suburl = '://' in relative_path
+    original_root_had_suburl = '://' in root_url[8:]    # ignore first 8 chars because root always starts with https://
+    final_joined_has_suburl = '://' in final_url[8:]    # ignore first 8 chars because final always starts with https://
+
+    urljoin_broke_suburls = (
+        (original_root_had_suburl or original_path_had_suburl)
+        and not final_joined_has_suburl
+    )
+    return urljoin_broke_suburls
+
+
+def fix_urljoin_bug(url: str, nesting_limit=5):
+    """
+    recursively replace broken suburls .../http:/... with .../http://...
+
+    basically equivalent to this for 99.9% of cases:
+        url = url.replace('/http:/', '/http://')
+        url = url.replace('/https:/', '/https://')
+    except this handles:
+        other schemes besides http/https         (e.g. https://example.com/link/git+ssh://github.com/example)
+        other preceding separators besides /     (e.g. https://example.com/login/?next=https://example.com/home)
+        fixing multiple suburls recursively
+    """
+    input_url = url
+    for _ in range(nesting_limit):
+        url = re.sub(
+            r'(?P<root>.+?)'                             # https://web.archive.org/web
+            + r'(?P<separator>[-=/_&+%$#@!*\(\\])'       # /
+            + r'(?P<subscheme>[a-zA-Z0-9+_-]{1,32}?):/'  # http:/
+            + r'(?P<suburl>[^/\\]+)',                    # example.com
+            r"\1\2\3://\4",
+            input_url,
+            re.IGNORECASE | re.UNICODE,
+        )
+        if url == input_url:
+            break  # nothing left to replace, all suburls are fixed
+        input_url = url
+
+    return url
+
+
+# sanity check to make sure workaround code works as expected and doesnt introduce *more* bugs
+assert did_urljoin_misbehave('https://web.archive.org/web/https://example.com', 'abc.html', 'https://web.archive.org/web/https:/example.com/abc.html') == True
+assert did_urljoin_misbehave('http://example.com', 'https://web.archive.org/web/http://example.com/abc.html', 'https://web.archive.org/web/http:/example.com/abc.html') == True
+assert fix_urljoin_bug('https:/example.com') == 'https:/example.com'  # should not modify original url's scheme, only sub-urls
+assert fix_urljoin_bug('https://web.archive.org/web/https:/example.com/abc.html') == 'https://web.archive.org/web/https://example.com/abc.html'
+assert fix_urljoin_bug('http://example.com/link/git+ssh:/github.com/example?next=ftp:/example.com') == 'http://example.com/link/git+ssh://github.com/example?next=ftp://example.com'
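The bug these helpers work around is easy to reproduce with the stdlib alone; on CPython releases affected by python/cpython#96015 (all current 3.x at the time of this changeset), urljoin collapses the inner URL's double slash:

    from urllib.parse import urljoin

    joined = urljoin('https://web.archive.org/web/https://example.com/', 'abc.html')
    assert joined == 'https://web.archive.org/web/https:/example.com/abc.html'
    #                                             ^ '//' collapsed to '/'
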
@@ -72,21 +72,13 @@ def parse_generic_json_export(json_file: IO[str], **_kwargs) -> Iterable[Link]:

     json_file.seek(0)

-    try:
-        links = json.load(json_file)
-        if type(links) != list:
-            raise Exception('JSON parser expects list of objects, maybe this is JSONL?')
-    except json.decoder.JSONDecodeError:
-        # sometimes the first line is a comment or other junk, so try without
-        json_file.seek(0)
-        first_line = json_file.readline()
-        #print('      > Trying JSON parser without first line: "', first_line.strip(), '"', sep= '')
-        links = json.load(json_file)
-        # we may fail again, which means we really don't know what to do
+    links = json.load(json_file)
+    if type(links) != list:
+        raise Exception('JSON parser expects list of objects, maybe this is JSONL?')

     for link in links:
         if link:
-            yield jsonObjectToLink(link,json_file.name)
+            yield jsonObjectToLink(link, json_file.name)

 KEY = 'json'
 NAME = 'Generic JSON'
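The simplification surfaces JSONL input as a hard error instead of silently retrying without the first line. A quick repro of the failure mode the new error message hints at (input shapes are made-up examples):

    import io
    import json

    jsonl = io.StringIO('{"url": "https://example.com"}\n{"url": "https://example.org"}\n')
    try:
        json.load(jsonl)          # JSONL is not a single JSON document, so this raises
    except json.decoder.JSONDecodeError:
        pass                      # the parser now lets this propagate instead of guessing
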
@@ -3,11 +3,9 @@ __package__ = 'archivebox.parsers'

 import json

 from typing import IO, Iterable
-from datetime import datetime, timezone

 from ..index.schema import Link
 from ..util import (
-    htmldecode,
     enforce_types,
 )
@@ -1,8 +1,6 @@
 __package__ = 'archivebox.parsers'
 __description__ = 'Plain Text'

-import re

 from typing import IO, Iterable
 from datetime import datetime, timezone
 from pathlib import Path
@ -11,7 +9,7 @@ from ..index.schema import Link
|
||||||
from ..util import (
|
from ..util import (
|
||||||
htmldecode,
|
htmldecode,
|
||||||
enforce_types,
|
enforce_types,
|
||||||
URL_REGEX
|
find_all_urls,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@ -39,7 +37,7 @@ def parse_generic_txt_export(text_file: IO[str], **_kwargs) -> Iterable[Link]:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
# otherwise look for anything that looks like a URL in the line
|
# otherwise look for anything that looks like a URL in the line
|
||||||
for url in re.findall(URL_REGEX, line):
|
for url in find_all_urls(line):
|
||||||
yield Link(
|
yield Link(
|
||||||
url=htmldecode(url),
|
url=htmldecode(url),
|
||||||
timestamp=str(datetime.now(timezone.utc).timestamp()),
|
timestamp=str(datetime.now(timezone.utc).timestamp()),
|
||||||
|
@ -48,17 +46,6 @@ def parse_generic_txt_export(text_file: IO[str], **_kwargs) -> Iterable[Link]:
|
||||||
sources=[text_file.name],
|
sources=[text_file.name],
|
||||||
)
|
)
|
||||||
|
|
||||||
# look inside the URL for any sub-urls, e.g. for archive.org links
|
|
||||||
# https://web.archive.org/web/20200531203453/https://www.reddit.com/r/socialism/comments/gu24ke/nypd_officers_claim_they_are_protecting_the_rule/fsfq0sw/
|
|
||||||
# -> https://www.reddit.com/r/socialism/comments/gu24ke/nypd_officers_claim_they_are_protecting_the_rule/fsfq0sw/
|
|
||||||
for sub_url in re.findall(URL_REGEX, line[1:]):
|
|
||||||
yield Link(
|
|
||||||
url=htmldecode(sub_url),
|
|
||||||
timestamp=str(datetime.now(timezone.utc).timestamp()),
|
|
||||||
title=None,
|
|
||||||
tags=None,
|
|
||||||
sources=[text_file.name],
|
|
||||||
)
|
|
||||||
|
|
||||||
KEY = 'txt'
|
KEY = 'txt'
|
||||||
NAME = 'Generic TXT'
|
NAME = 'Generic TXT'
|
||||||
|
|
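Dropping the separate sub-URL pass suggests that `find_all_urls` (imported from `..util`) is expected to surface nested sub-URLs itself, e.g. for web.archive.org wrapper links. Its real implementation is not shown in this compare view; one plausible shape, purely as an illustration:

import re

# placeholder pattern for illustration only; ArchiveBox's real URL_REGEX is stricter
URL_REGEX = re.compile(r'https?://[^\s<>"\']+', re.IGNORECASE)

def find_all_urls(text: str):
    """Yield every URL in text, including URLs nested inside other URLs."""
    for url in URL_REGEX.findall(text):
        yield url
        # rescan inside the match (offset by one char so the outer URL isn't re-matched),
        # catching wrapped links like https://web.archive.org/web/<ts>/https://example.com
        yield from find_all_urls(url[1:])
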
archivebox/plugantic/__init__.py (new file, +17 lines)
@@ -0,0 +1,17 @@
__package__ = 'archivebox.plugantic'

from .binproviders import BinProvider
from .binaries import Binary
from .extractors import Extractor
from .replayers import Replayer
from .configs import ConfigSet
from .plugins import Plugin

# __all__ = [
#     'BinProvider',
#     'Binary',
#     'Extractor',
#     'Replayer',
#     'ConfigSet',
#     'Plugin',
# ]
archivebox/plugantic/admin.py (new file, +26 lines)
@@ -0,0 +1,26 @@
# from django.contrib import admin
# from django import forms

# from django_jsonform.widgets import JSONFormWidget

# from django_pydantic_field.v2.fields import PydanticSchemaField

# from .models import CustomPlugin


# class PluginForm(forms.ModelForm):
#     class Meta:
#         model = CustomPlugin
#         fields = '__all__'
#         widgets = {
#             'items': JSONFormWidget(schema=PluginSchema),
#         }


# class PluginAdmin(admin.ModelAdmin):
#     formfield_overrides = {
#         PydanticSchemaField: {"widget": JSONFormWidget},
#     }
#     form = PluginForm

archivebox/plugantic/apps.py (new file, +6 lines)
@@ -0,0 +1,6 @@
from django.apps import AppConfig


class PluganticConfig(AppConfig):
    default_auto_field = 'django.db.models.BigAutoField'
    name = 'plugantic'
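As with any Django app, this AppConfig only takes effect once the app is listed in INSTALLED_APPS; that settings change is not part of the files shown here, but it would look roughly like:

# settings.py (sketch)
INSTALLED_APPS = [
    # ... django.contrib apps ...
    'plugantic',   # matches PluganticConfig.name above
]
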
archivebox/plugantic/binaries.py (new file, +323 lines)
@@ -0,0 +1,323 @@
__package__ = 'archivebox.plugantic'

import sys
import inspect
import importlib
from pathlib import Path

from typing import Any, Optional, Dict, List
from typing_extensions import Self
from subprocess import run, PIPE

from pydantic_core import ValidationError
from pydantic import BaseModel, Field, model_validator, computed_field, field_validator, validate_call, field_serializer

from .binproviders import (
    SemVer,
    BinName,
    BinProviderName,
    HostBinPath,
    BinProvider,
    EnvProvider,
    AptProvider,
    BrewProvider,
    PipProvider,
    ProviderLookupDict,
    bin_name,
    bin_abspath,
    path_is_script,
    path_is_executable,
)


class Binary(BaseModel):
    name: BinName
    description: str = Field(default='')

    providers_supported: List[BinProvider] = Field(default=[EnvProvider()], alias='providers')
    provider_overrides: Dict[BinProviderName, ProviderLookupDict] = Field(default={}, alias='overrides')

    loaded_provider: Optional[BinProviderName] = Field(default=None, alias='provider')
    loaded_abspath: Optional[HostBinPath] = Field(default=None, alias='abspath')
    loaded_version: Optional[SemVer] = Field(default=None, alias='version')

    # bin_filename: see below
    # is_executable: see below
    # is_script
    # is_valid: see below

    @model_validator(mode='after')
    def validate(self):
        self.loaded_abspath = bin_abspath(self.name) or self.name
        self.description = self.description or self.name

        assert self.providers_supported, f'No providers were given for package {self.name}'

        # pull in any overrides from the binproviders
        for provider in self.providers_supported:
            overrides_by_provider = provider.get_providers_for_bin(self.name)
            if overrides_by_provider:
                self.provider_overrides[provider.name] = {
                    **overrides_by_provider,
                    **self.provider_overrides.get(provider.name, {}),
                }
        return self

    @field_validator('loaded_abspath', mode='before')
    def parse_abspath(cls, value: Any):
        return bin_abspath(value)

    @field_validator('loaded_version', mode='before')
    def parse_version(cls, value: Any):
        return value and SemVer(value)

    @field_serializer('provider_overrides', when_used='json')
    def serialize_overrides(self, provider_overrides: Dict[BinProviderName, ProviderLookupDict]) -> Dict[BinProviderName, Dict[str, str]]:
        return {
            provider_name: {
                key: str(val)
                for key, val in overrides.items()
            }
            for provider_name, overrides in provider_overrides.items()
        }

    @computed_field       # type: ignore[misc]  # see mypy issue #1362
    @property
    def bin_filename(self) -> BinName:
        if self.is_script:
            # e.g. '.../Python.framework/Versions/3.11/lib/python3.11/sqlite3/__init__.py' -> sqlite
            name = self.name
        elif self.loaded_abspath:
            # e.g. '/opt/homebrew/bin/wget' -> wget
            name = bin_name(self.loaded_abspath)
        else:
            # e.g. 'ytdlp' -> 'yt-dlp'
            name = bin_name(self.name)
        return name

    @computed_field       # type: ignore[misc]  # see mypy issue #1362
    @property
    def is_executable(self) -> bool:
        try:
            assert self.loaded_abspath and path_is_executable(self.loaded_abspath)
            return True
        except (ValidationError, AssertionError):
            return False

    @computed_field       # type: ignore[misc]  # see mypy issue #1362
    @property
    def is_script(self) -> bool:
        try:
            assert self.loaded_abspath and path_is_script(self.loaded_abspath)
            return True
        except (ValidationError, AssertionError):
            return False

    @computed_field       # type: ignore[misc]  # see mypy issue #1362
    @property
    def is_valid(self) -> bool:
        return bool(
            self.name
            and self.loaded_abspath
            and self.loaded_version
            and (self.is_executable or self.is_script)
        )

    @validate_call
    def install(self) -> Self:
        if not self.providers_supported:
            return self

        exc = Exception('No providers were able to install binary', self.name, self.providers_supported)
        for provider in self.providers_supported:
            try:
                installed_bin = provider.install(self.name, overrides=self.provider_overrides.get(provider.name))
                if installed_bin:
                    # print('INSTALLED', self.name, installed_bin)
                    return self.model_copy(update={
                        'loaded_provider': provider.name,
                        'loaded_abspath': installed_bin.abspath,
                        'loaded_version': installed_bin.version,
                    })
            except Exception as err:
                print(err)
                exc = err
        raise exc

    @validate_call
    def load(self, cache=True) -> Self:
        if self.is_valid:
            return self

        if not self.providers_supported:
            return self

        exc = Exception('No providers were able to install binary', self.name, self.providers_supported)
        for provider in self.providers_supported:
            try:
                installed_bin = provider.load(self.name, cache=cache, overrides=self.provider_overrides.get(provider.name))
                if installed_bin:
                    # print('LOADED', provider, self.name, installed_bin)
                    return self.model_copy(update={
                        'loaded_provider': provider.name,
                        'loaded_abspath': installed_bin.abspath,
                        'loaded_version': installed_bin.version,
                    })
            except Exception as err:
                print(err)
                exc = err
        raise exc

    @validate_call
    def load_or_install(self, cache=True) -> Self:
        if self.is_valid:
            return self

        if not self.providers_supported:
            return self

        exc = Exception('No providers were able to install binary', self.name, self.providers_supported)
        for provider in self.providers_supported:
            try:
                installed_bin = provider.load_or_install(self.name, overrides=self.provider_overrides.get(provider.name), cache=cache)
                if installed_bin:
                    # print('LOADED_OR_INSTALLED', self.name, installed_bin)
                    return self.model_copy(update={
                        'loaded_provider': provider.name,
                        'loaded_abspath': installed_bin.abspath,
                        'loaded_version': installed_bin.version,
                    })
            except Exception as err:
                print(err)
                exc = err
        raise exc

    @validate_call
    def exec(self, args=(), pwd='.'):
        assert self.loaded_abspath
        assert self.loaded_version
        return run([self.loaded_abspath, *args], stdout=PIPE, stderr=PIPE, cwd=pwd)   # subprocess.run takes cwd=, not pwd=


class SystemPythonHelpers:
    @staticmethod
    def get_subdeps() -> str:
        return 'python3 python3-minimal python3-pip python3-virtualenv'

    @staticmethod
    def get_abspath() -> str:
        return sys.executable

    @staticmethod
    def get_version() -> str:
        return '{}.{}.{}'.format(*sys.version_info[:3])


class SqliteHelpers:
    @staticmethod
    def get_abspath() -> Path:
        import sqlite3
        importlib.reload(sqlite3)
        return Path(inspect.getfile(sqlite3))

    @staticmethod
    def get_version() -> SemVer:
        import sqlite3
        importlib.reload(sqlite3)
        version = sqlite3.version
        assert version
        return SemVer(version)


class DjangoHelpers:
    @staticmethod
    def get_django_abspath() -> str:
        import django
        return inspect.getfile(django)

    @staticmethod
    def get_django_version() -> str:
        import django
        return '{}.{}.{} {} ({})'.format(*django.VERSION)


class YtdlpHelpers:
    @staticmethod
    def get_ytdlp_subdeps() -> str:
        return 'yt-dlp ffmpeg'

    @staticmethod
    def get_ytdlp_version() -> str:
        import yt_dlp
        importlib.reload(yt_dlp)

        version = yt_dlp.version.__version__
        assert version
        return version


class PythonBinary(Binary):
    name: BinName = 'python'

    providers_supported: List[BinProvider] = [
        EnvProvider(
            subdeps_provider={'python': 'plugantic.binaries.SystemPythonHelpers.get_subdeps'},
            abspath_provider={'python': 'plugantic.binaries.SystemPythonHelpers.get_abspath'},
            version_provider={'python': 'plugantic.binaries.SystemPythonHelpers.get_version'},
        ),
    ]

class SqliteBinary(Binary):
    name: BinName = 'sqlite'
    providers_supported: List[BinProvider] = [
        EnvProvider(
            version_provider={'sqlite': 'plugantic.binaries.SqliteHelpers.get_version'},
            abspath_provider={'sqlite': 'plugantic.binaries.SqliteHelpers.get_abspath'},
        ),
    ]

class DjangoBinary(Binary):
    name: BinName = 'django'
    providers_supported: List[BinProvider] = [
        EnvProvider(
            abspath_provider={'django': 'plugantic.binaries.DjangoHelpers.get_django_abspath'},
            version_provider={'django': 'plugantic.binaries.DjangoHelpers.get_django_version'},
        ),
    ]


class YtdlpBinary(Binary):
    name: BinName = 'yt-dlp'
    providers_supported: List[BinProvider] = [
        # EnvProvider(),
        PipProvider(version_provider={'yt-dlp': 'plugantic.binaries.YtdlpHelpers.get_ytdlp_version'}),
        BrewProvider(subdeps_provider={'yt-dlp': 'plugantic.binaries.YtdlpHelpers.get_ytdlp_subdeps'}),
        # AptProvider(subdeps_provider={'yt-dlp': lambda: 'yt-dlp ffmpeg'}),
    ]


class WgetBinary(Binary):
    name: BinName = 'wget'
    providers_supported: List[BinProvider] = [EnvProvider(), AptProvider()]


# if __name__ == '__main__':
#     PYTHON_BINARY = PythonBinary()
#     SQLITE_BINARY = SqliteBinary()
#     DJANGO_BINARY = DjangoBinary()
#     WGET_BINARY = WgetBinary()
#     YTDLP_BINARY = YtdlpBinary()

#     print('-------------------------------------DEFINING BINARIES---------------------------------')
#     print(PYTHON_BINARY)
#     print(SQLITE_BINARY)
#     print(DJANGO_BINARY)
#     print(WGET_BINARY)
#     print(YTDLP_BINARY)
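Taken together, the intended lifecycle of a Binary appears to be declare, then load-or-install, then exec. A short usage sketch based only on the methods above (no output values are guaranteed):

wget = WgetBinary()              # resolvable via the env or apt providers
wget = wget.load_or_install()    # returns a copy with loaded_provider/loaded_abspath/loaded_version filled in
assert wget.is_valid             # needs an abspath, a parsed version, and an executable or script

proc = wget.exec(args=['--version'])   # runs the resolved abspath via subprocess.run
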
archivebox/plugantic/binproviders.py (new file, +561 lines)
@@ -0,0 +1,561 @@
__package__ = 'archivebox.plugantic'

import os
import shutil
import operator

from typing import Callable, Any, Optional, Type, Dict, Annotated, ClassVar, Literal, cast, TYPE_CHECKING
from typing_extensions import Self
from abc import ABC, abstractmethod
from collections import namedtuple
from pathlib import Path
from subprocess import run, PIPE

from pydantic_core import core_schema, ValidationError
from pydantic import BaseModel, Field, TypeAdapter, AfterValidator, validate_call, GetCoreSchemaHandler


def func_takes_args_or_kwargs(lambda_func: Callable[..., Any]) -> bool:
    """returns True if a lambda func takes args/kwargs of any kind, otherwise false if it's pure/argless"""
    code = lambda_func.__code__
    has_args = code.co_argcount > 0
    has_varargs = code.co_flags & 0x04 != 0
    has_varkw = code.co_flags & 0x08 != 0
    return has_args or has_varargs or has_varkw


def is_semver_str(semver: Any) -> bool:
    if isinstance(semver, str):
        return (semver.count('.') == 2 and semver.replace('.', '').isdigit())
    return False

def semver_to_str(semver: tuple[int, int, int] | str) -> str:
    if isinstance(semver, (list, tuple)):
        return '.'.join(str(chunk) for chunk in semver)
    if is_semver_str(semver):
        return semver
    raise ValidationError('Tried to convert invalid SemVer: {}'.format(semver))


SemVerTuple = namedtuple('SemVerTuple', ('major', 'minor', 'patch'), defaults=(0, 0, 0))
SemVerParsableTypes = str | tuple[str | int, ...] | list[str | int]

class SemVer(SemVerTuple):
    major: int
    minor: int = 0
    patch: int = 0

    if TYPE_CHECKING:
        full_text: str | None = ''

    def __new__(cls, *args, full_text=None, **kwargs):
        # '1.1.1'
        if len(args) == 1 and is_semver_str(args[0]):
            result = SemVer.parse(args[0])

        # ('1', '2', '3')
        elif len(args) == 1 and isinstance(args[0], (tuple, list)):
            result = SemVer.parse(args[0])

        # (1, '2', None)
        elif not all(isinstance(arg, (int, type(None))) for arg in args):
            result = SemVer.parse(args)

        # (None)
        elif all(chunk in ('', 0, None) for chunk in (*args, *kwargs.values())):
            result = None

        # 1, 2, 3
        else:
            result = SemVerTuple.__new__(cls, *args, **kwargs)

        if result is not None:
            # add first line as extra hidden metadata so it can be logged without having to re-run version cmd
            result.full_text = full_text or str(result)
        return result

    @classmethod
    def parse(cls, version_stdout: SemVerParsableTypes) -> Self | None:
        """
        parses a version tag string into (major, minor, patch) ints
        'Google Chrome 124.0.6367.208'              -> (124, 0, 6367)
        'GNU Wget 1.24.5 built on darwin23.2.0.'    -> (1, 24, 5)
        'curl 8.4.0 (x86_64-apple-darwin23.0) ...'  -> (8, 4, 0)
        '2024.04.09'                                -> (2024, 4, 9)
        """
        # print('INITIAL_VALUE', type(version_stdout).__name__, version_stdout)

        if isinstance(version_stdout, (tuple, list)):
            version_stdout = '.'.join(str(chunk) for chunk in version_stdout)
        elif isinstance(version_stdout, bytes):
            version_stdout = version_stdout.decode()
        elif not isinstance(version_stdout, str):
            version_stdout = str(version_stdout)

        # no text to work with, return None immediately
        if not version_stdout.strip():
            # raise Exception('Tried to parse semver from empty version output (is binary installed and available?)')
            return None

        just_numbers = lambda col: col.lower().strip('v').split('+')[0].split('-')[0].split('_')[0]
        contains_semver = lambda col: (
            col.count('.') in (1, 2, 3)
            and all(chunk.isdigit() for chunk in col.split('.')[:3])   # first 3 chunks can only be nums
        )

        full_text = version_stdout.split('\n')[0].strip()
        first_line_columns = full_text.split()[:4]
        version_columns = list(filter(contains_semver, map(just_numbers, first_line_columns)))

        # could not find any column of first line that looks like a version number, despite there being some text
        if not version_columns:
            # raise Exception('Failed to parse semver from version command output: {}'.format(' '.join(first_line_columns)))
            return None

        # take first col containing a semver, and truncate it to 3 chunks (e.g. 2024.04.09.91) -> (2024, 04, 09)
        first_version_tuple = version_columns[0].split('.', 3)[:3]

        # print('FINAL_VALUE', first_version_tuple)

        return cls(*(int(chunk) for chunk in first_version_tuple), full_text=full_text)

    def __str__(self):
        return '.'.join(str(chunk) for chunk in self)

    # @classmethod
    # def __get_pydantic_core_schema__(cls, source: Type[Any], handler: GetCoreSchemaHandler) -> core_schema.CoreSchema:
    #     default_schema = handler(source)
    #     return core_schema.no_info_after_validator_function(
    #         cls.parse,
    #         default_schema,
    #         serialization=core_schema.plain_serializer_function_ser_schema(
    #             lambda semver: str(semver),
    #             info_arg=False,
    #             return_schema=core_schema.str_schema(),
    #         ),
    #     )

assert SemVer(None) == None
assert SemVer('') == None
assert SemVer.parse('') == None
assert SemVer(1) == (1, 0, 0)
assert SemVer(1, 2) == (1, 2, 0)
assert SemVer('1.2+234234') == (1, 2, 0)
assert SemVer((1, 2, 3)) == (1, 2, 3)
assert getattr(SemVer((1, 2, 3)), 'full_text') == '1.2.3'
assert SemVer(('1', '2', '3')) == (1, 2, 3)
assert SemVer.parse('5.6.7') == (5, 6, 7)
assert SemVer.parse('124.0.6367.208') == (124, 0, 6367)
assert SemVer.parse('Google Chrome 124.1+234.234') == (124, 1, 0)
assert SemVer.parse('Google Ch1rome 124.0.6367.208') == (124, 0, 6367)
assert SemVer.parse('Google Chrome 124.0.6367.208+beta_234. 234.234.123\n123.456.324') == (124, 0, 6367)
assert getattr(SemVer.parse('Google Chrome 124.0.6367.208+beta_234. 234.234.123\n123.456.324'), 'full_text') == 'Google Chrome 124.0.6367.208+beta_234. 234.234.123'
assert SemVer.parse('Google Chrome') == None


@validate_call
def bin_name(bin_path_or_name: str | Path) -> str:
    name = Path(bin_path_or_name).name
    assert len(name) > 1
    assert name.replace('-', '').replace('_', '').replace('.', '').isalnum(), (
        f'Binary name can only contain a-Z0-9-_.: {name}')
    return name

BinName = Annotated[str, AfterValidator(bin_name)]

@validate_call
def path_is_file(path: Path | str) -> Path:
    path = Path(path) if isinstance(path, str) else path
    assert path.is_file(), f'Path is not a file: {path}'
    return path

HostExistsPath = Annotated[Path, AfterValidator(path_is_file)]

@validate_call
def path_is_executable(path: HostExistsPath) -> HostExistsPath:
    assert os.access(path, os.X_OK), f'Path is not executable (fix by running chmod +x {path})'
    return path

@validate_call
def path_is_script(path: HostExistsPath) -> HostExistsPath:
    SCRIPT_EXTENSIONS = ('.py', '.js', '.sh')
    assert path.suffix.lower() in SCRIPT_EXTENSIONS, 'Path is not a script (does not end in {})'.format(', '.join(SCRIPT_EXTENSIONS))
    return path

HostExecutablePath = Annotated[HostExistsPath, AfterValidator(path_is_executable)]

@validate_call
def path_is_abspath(path: Path) -> Path:
    return path.resolve()

HostAbsPath = Annotated[HostExistsPath, AfterValidator(path_is_abspath)]
HostBinPath = Annotated[Path, AfterValidator(path_is_abspath), AfterValidator(path_is_file)]


@validate_call
def bin_abspath(bin_path_or_name: BinName | Path) -> HostBinPath | None:
    assert bin_path_or_name

    if str(bin_path_or_name).startswith('/'):
        # already a path, get its absolute form
        abspath = Path(bin_path_or_name).resolve()
    else:
        # not a path yet, get path using shutil.which
        binpath = shutil.which(bin_path_or_name)
        if not binpath:
            return None
        abspath = Path(binpath).resolve()

    try:
        return TypeAdapter(HostBinPath).validate_python(abspath)
    except ValidationError:
        return None


@validate_call
def bin_version(bin_path: HostBinPath, args=('--version',)) -> SemVer | None:
    return SemVer(run([bin_path, *args], stdout=PIPE).stdout.strip().decode())


class InstalledBin(BaseModel):
    abspath: HostBinPath
    version: SemVer


def is_valid_install_string(pkgs_str: str) -> str:
    """Make sure a string is a valid install string for a package manager, e.g. 'yt-dlp ffmpeg'"""
    assert pkgs_str
    assert all(len(pkg) > 1 for pkg in pkgs_str.split(' '))
    return pkgs_str

def is_valid_python_dotted_import(import_str: str) -> str:
    assert import_str and import_str.replace('.', '').replace('_', '').isalnum()
    return import_str

InstallStr = Annotated[str, AfterValidator(is_valid_install_string)]

LazyImportStr = Annotated[str, AfterValidator(is_valid_python_dotted_import)]

ProviderHandler = Callable[..., Any] | Callable[[], Any]   # must take no args [], or [bin_name: str, **kwargs]
#ProviderHandlerStr = Annotated[str, AfterValidator(lambda s: s.startswith('self.'))]
ProviderHandlerRef = LazyImportStr | ProviderHandler
ProviderLookupDict = Dict[str, LazyImportStr]
ProviderType = Literal['abspath', 'version', 'subdeps', 'install']


# class Host(BaseModel):
#     machine: str
#     system: str
#     platform: str
#     in_docker: bool
#     in_qemu: bool
#     python: str

BinProviderName = Literal['env', 'pip', 'apt', 'brew', 'npm', 'vendor']


class BinProvider(ABC, BaseModel):
    name: BinProviderName

    abspath_provider: ProviderLookupDict = Field(default={'*': 'self.on_get_abspath'}, exclude=True)
    version_provider: ProviderLookupDict = Field(default={'*': 'self.on_get_version'}, exclude=True)
    subdeps_provider: ProviderLookupDict = Field(default={'*': 'self.on_get_subdeps'}, exclude=True)
    install_provider: ProviderLookupDict = Field(default={'*': 'self.on_install'}, exclude=True)

    _abspath_cache: ClassVar = {}
    _version_cache: ClassVar = {}
    _install_cache: ClassVar = {}

    # def provider_version(self) -> SemVer | None:
    #     """Version of the actual underlying package manager (e.g. pip v20.4.1)"""
    #     if self.name in ('env', 'vendor'):
    #         return SemVer('0.0.0')
    #     installer_binpath = Path(shutil.which(self.name)).resolve()
    #     return bin_version(installer_binpath)

    # def provider_host(self) -> Host:
    #     """Information about the host env, architecture, and OS needed to select & build packages"""
    #     p = platform.uname()
    #     return Host(
    #         machine=p.machine,
    #         system=p.system,
    #         platform=platform.platform(),
    #         python=sys.implementation.name,
    #         in_docker=os.environ.get('IN_DOCKER', '').lower() == 'true',
    #         in_qemu=os.environ.get('IN_QEMU', '').lower() == 'true',
    #     )

    def get_default_providers(self):
        return self.get_providers_for_bin('*')

    def resolve_provider_func(self, provider_func: ProviderHandlerRef | None) -> ProviderHandler | None:
        if provider_func is None:
            return None

        # if provider_func is a dotted path to a function on self, swap it for the actual function
        if isinstance(provider_func, str) and provider_func.startswith('self.'):
            provider_func = getattr(self, provider_func.split('self.', 1)[-1])

        # if provider_func is a dot-formatted import string, import the function
        if isinstance(provider_func, str):
            from django.utils.module_loading import import_string

            package_name, module_name, classname, path = provider_func.split('.', 3)   # -> abc, def, ghi.jkl

            # get .ghi.jkl nested attr present on module abc.def
            imported_module = import_string(f'{package_name}.{module_name}.{classname}')
            provider_func = operator.attrgetter(path)(imported_module)

            # # abc.def.ghi.jkl -> 1, 2, 3
            # for idx in range(1, len(path)):
            #     parent_path = '.'.join(path[:-idx])   # abc.def.ghi
            #     try:
            #         parent_module = import_string(parent_path)
            #         provider_func = getattr(parent_module, path[-idx])
            #     except (AttributeError, ImportError):
            #         continue

        assert TypeAdapter(ProviderHandler).validate_python(provider_func), (
            f'{self.__class__.__name__} provider func for {bin_name} was not a function or dotted-import path: {provider_func}')

        return provider_func

    @validate_call
    def get_providers_for_bin(self, bin_name: str) -> ProviderLookupDict:
        providers_for_bin = {
            'abspath': self.abspath_provider.get(bin_name),
            'version': self.version_provider.get(bin_name),
            'subdeps': self.subdeps_provider.get(bin_name),
            'install': self.install_provider.get(bin_name),
        }
        only_set_providers_for_bin = {k: v for k, v in providers_for_bin.items() if v is not None}

        return only_set_providers_for_bin

    @validate_call
    def get_provider_for_action(self, bin_name: BinName, provider_type: ProviderType, default_provider: Optional[ProviderHandlerRef]=None, overrides: Optional[ProviderLookupDict]=None) -> ProviderHandler:
        """
        Get the provider func for a given key + Dict of provider callbacks + fallback default provider.
        e.g. get_provider_for_action(bin_name='yt-dlp', provider_type='install', default_provider=self.on_install, ...) -> Callable
        """

        provider_func_ref = (
            (overrides or {}).get(provider_type)
            or self.get_providers_for_bin(bin_name).get(provider_type)
            or self.get_default_providers().get(provider_type)
            or default_provider
        )
        # print('getting provider for action', bin_name, provider_type, provider_func_ref)

        provider_func = self.resolve_provider_func(provider_func_ref)

        assert provider_func, f'No {self.name} provider func was found for {bin_name} in: {self.__class__.__name__}.'

        return provider_func

    @validate_call
    def call_provider_for_action(self, bin_name: BinName, provider_type: ProviderType, default_provider: Optional[ProviderHandlerRef]=None, overrides: Optional[ProviderLookupDict]=None, **kwargs) -> Any:
        provider_func: ProviderHandler = self.get_provider_for_action(
            bin_name=bin_name,
            provider_type=provider_type,
            default_provider=default_provider,
            overrides=overrides,
        )
        if not func_takes_args_or_kwargs(provider_func):
            # if it's a pure argless lambda, don't pass bin_name and other **kwargs
            provider_func_without_args = cast(Callable[[], Any], provider_func)
            return provider_func_without_args()

        provider_func = cast(Callable[..., Any], provider_func)
        return provider_func(bin_name, **kwargs)


    def on_get_abspath(self, bin_name: BinName, **_) -> HostBinPath | None:
        print(f'[*] {self.__class__.__name__}: Getting abspath for {bin_name}...')
        try:
            return bin_abspath(bin_name)
        except ValidationError:
            return None

    def on_get_version(self, bin_name: BinName, abspath: Optional[HostBinPath]=None, **_) -> SemVer | None:
        abspath = abspath or self._abspath_cache.get(bin_name) or self.get_abspath(bin_name)
        if not abspath: return None

        print(f'[*] {self.__class__.__name__}: Getting version for {bin_name}...')
        try:
            return bin_version(abspath)
        except ValidationError:
            return None

    def on_get_subdeps(self, bin_name: BinName, **_) -> InstallStr:
        print(f'[*] {self.__class__.__name__}: Getting subdependencies for {bin_name}')
        # ... subdependency calculation logic here
        return TypeAdapter(InstallStr).validate_python(bin_name)

    @abstractmethod
    def on_install(self, bin_name: BinName, subdeps: Optional[InstallStr]=None, **_):
        subdeps = subdeps or self.get_subdeps(bin_name)
        print(f'[*] {self.__class__.__name__}: Installing subdependencies for {bin_name} ({subdeps})')
        # ... install logic here
        assert True


    @validate_call
    def get_abspath(self, bin_name: BinName, overrides: Optional[ProviderLookupDict]=None) -> HostBinPath | None:
        abspath = self.call_provider_for_action(
            bin_name=bin_name,
            provider_type='abspath',
            default_provider=self.on_get_abspath,
            overrides=overrides,
        )
        if not abspath:
            return None
        result = TypeAdapter(HostBinPath).validate_python(abspath)
        self._abspath_cache[bin_name] = result
        return result

    @validate_call
    def get_version(self, bin_name: BinName, abspath: Optional[HostBinPath]=None, overrides: Optional[ProviderLookupDict]=None) -> SemVer | None:
        version = self.call_provider_for_action(
            bin_name=bin_name,
            provider_type='version',
            default_provider=self.on_get_version,
            overrides=overrides,
            abspath=abspath,
        )
        if not version:
            return None
        result = SemVer(version)
        self._version_cache[bin_name] = result
        return result

    @validate_call
    def get_subdeps(self, bin_name: BinName, overrides: Optional[ProviderLookupDict]=None) -> InstallStr:
        subdeps = self.call_provider_for_action(
            bin_name=bin_name,
            provider_type='subdeps',
            default_provider=self.on_get_subdeps,
            overrides=overrides,
        )
        if not subdeps:
            subdeps = bin_name
        result = TypeAdapter(InstallStr).validate_python(subdeps)
        return result

    @validate_call
    def install(self, bin_name: BinName, overrides: Optional[ProviderLookupDict]=None) -> InstalledBin | None:
        subdeps = self.get_subdeps(bin_name, overrides=overrides)

        self.call_provider_for_action(
            bin_name=bin_name,
            provider_type='install',
            default_provider=self.on_install,
            overrides=overrides,
            subdeps=subdeps,
        )

        installed_abspath = self.get_abspath(bin_name)
        assert installed_abspath, f'Unable to find {bin_name} abspath after installing with {self.name}'

        installed_version = self.get_version(bin_name, abspath=installed_abspath)
        assert installed_version, f'Unable to find {bin_name} version after installing with {self.name}'

        result = InstalledBin(abspath=installed_abspath, version=installed_version)
        self._install_cache[bin_name] = result
        return result

    @validate_call
    def load(self, bin_name: BinName, overrides: Optional[ProviderLookupDict]=None, cache: bool=False) -> InstalledBin | None:
        installed_abspath = None
        installed_version = None

        if cache:
            installed_bin = self._install_cache.get(bin_name)
            if installed_bin:
                return installed_bin
            installed_abspath = self._abspath_cache.get(bin_name)
            installed_version = self._version_cache.get(bin_name)


        installed_abspath = installed_abspath or self.get_abspath(bin_name, overrides=overrides)
        if not installed_abspath:
            return None

        installed_version = installed_version or self.get_version(bin_name, abspath=installed_abspath, overrides=overrides)
        if not installed_version:
            return None

        return InstalledBin(abspath=installed_abspath, version=installed_version)

    @validate_call
    def load_or_install(self, bin_name: BinName, overrides: Optional[ProviderLookupDict]=None, cache: bool=True) -> InstalledBin | None:
        installed = self.load(bin_name, overrides=overrides, cache=cache)
        if not installed:
            installed = self.install(bin_name, overrides=overrides)
        return installed


class PipProvider(BinProvider):
    name: BinProviderName = 'pip'

    def on_install(self, bin_name: str, subdeps: Optional[InstallStr]=None, **_):
        subdeps = subdeps or self.on_get_subdeps(bin_name)
        print(f'[*] {self.__class__.__name__}: Installing subdependencies for {bin_name} ({subdeps})')

        proc = run(['pip', 'install', '--upgrade', *subdeps.split(' ')], stdout=PIPE, stderr=PIPE)

        if proc.returncode != 0:
            print(proc.stdout.strip().decode())
            print(proc.stderr.strip().decode())
            raise Exception(f'{self.__class__.__name__}: install got returncode {proc.returncode} while installing {subdeps}: {subdeps}')


class AptProvider(BinProvider):
    name: BinProviderName = 'apt'

    subdeps_provider: ProviderLookupDict = {
        'yt-dlp': lambda: 'yt-dlp ffmpeg',
    }

    def on_install(self, bin_name: BinName, subdeps: Optional[InstallStr]=None, **_):
        subdeps = subdeps or self.on_get_subdeps(bin_name)
        print(f'[*] {self.__class__.__name__}: Installing subdependencies for {bin_name} ({subdeps})')

        run(['apt-get', 'update', '-qq'])
        proc = run(['apt-get', 'install', '-y', *subdeps.split(' ')], stdout=PIPE, stderr=PIPE)

        if proc.returncode != 0:
            print(proc.stdout.strip().decode())
            print(proc.stderr.strip().decode())
            raise Exception(f'{self.__class__.__name__} install got returncode {proc.returncode} while installing {subdeps}: {subdeps}')

class BrewProvider(BinProvider):
    name: BinProviderName = 'brew'

    def on_install(self, bin_name: str, subdeps: Optional[InstallStr]=None, **_):
        subdeps = subdeps or self.on_get_subdeps(bin_name)
        print(f'[*] {self.__class__.__name__}: Installing subdependencies for {bin_name} ({subdeps})')

        proc = run(['brew', 'install', *subdeps.split(' ')], stdout=PIPE, stderr=PIPE)

        if proc.returncode != 0:
            print(proc.stdout.strip().decode())
            print(proc.stderr.strip().decode())
            raise Exception(f'{self.__class__.__name__} install got returncode {proc.returncode} while installing {subdeps}: {subdeps}')


class EnvProvider(BinProvider):
    name: BinProviderName = 'env'

    abspath_provider: ProviderLookupDict = {
        # 'python': lambda: Path('/opt/homebrew/Cellar/python@3.10/3.10.14/Frameworks/Python.framework/Versions/3.10/bin/python3.10'),
    }
    version_provider: ProviderLookupDict = {
        # 'python': lambda: '{}.{}.{}'.format(*sys.version_info[:3]),
    }

    def on_install(self, bin_name: BinName, subdeps: Optional[InstallStr]=None, **_):
        """The env provider is read-only and does not install any packages, so this is a no-op"""
        pass
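BinProvider leaves only on_install abstract, so a new provider mostly just shells out to its package manager. A sketch of a hypothetical npm-backed provider mirroring PipProvider above ('npm' is already included in the BinProviderName literal, but no such class exists in this diff):

class NpmProvider(BinProvider):
    name: BinProviderName = 'npm'

    def on_install(self, bin_name: str, subdeps: Optional[InstallStr]=None, **_):
        subdeps = subdeps or self.on_get_subdeps(bin_name)
        # install globally so the binary lands on $PATH for bin_abspath() to find
        proc = run(['npm', 'install', '-g', *subdeps.split(' ')], stdout=PIPE, stderr=PIPE)
        if proc.returncode != 0:
            raise Exception(f'npm install got returncode {proc.returncode} while installing {subdeps}')
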
archivebox/plugantic/configs.py (new file, +53 lines)
@@ -0,0 +1,53 @@
__package__ = 'archivebox.plugantic'


from typing import Optional, List, Literal
from pathlib import Path
from pydantic import BaseModel, Field


ConfigSectionName = Literal['GENERAL_CONFIG', 'ARCHIVE_METHOD_TOGGLES', 'ARCHIVE_METHOD_OPTIONS', 'DEPENDENCY_CONFIG']


class ConfigSet(BaseModel):
    section: ConfigSectionName = 'GENERAL_CONFIG'

class WgetToggleConfig(ConfigSet):
    section: ConfigSectionName = 'ARCHIVE_METHOD_TOGGLES'

    SAVE_WGET: bool = True
    SAVE_WARC: bool = True

class WgetDependencyConfig(ConfigSet):
    section: ConfigSectionName = 'DEPENDENCY_CONFIG'

    WGET_BINARY: str = Field(default='wget')
    WGET_ARGS: Optional[List[str]] = Field(default=None)
    WGET_EXTRA_ARGS: List[str] = []
    WGET_DEFAULT_ARGS: List[str] = ['--timeout={TIMEOUT-10}']

class WgetOptionsConfig(ConfigSet):
    section: ConfigSectionName = 'ARCHIVE_METHOD_OPTIONS'

    # loaded from shared config
    WGET_AUTO_COMPRESSION: bool = Field(default=True)
    SAVE_WGET_REQUISITES: bool = Field(default=True)
    WGET_USER_AGENT: str = Field(default='', alias='USER_AGENT')
    WGET_TIMEOUT: int = Field(default=60, alias='TIMEOUT')
    WGET_CHECK_SSL_VALIDITY: bool = Field(default=True, alias='CHECK_SSL_VALIDITY')
    WGET_RESTRICT_FILE_NAMES: str = Field(default='windows', alias='RESTRICT_FILE_NAMES')
    WGET_COOKIES_FILE: Optional[Path] = Field(default=None, alias='COOKIES_FILE')


CONFIG = {
    'CHECK_SSL_VALIDITY': False,
    'SAVE_WARC': False,
    'TIMEOUT': 999,
}


WGET_CONFIG = [
    WgetToggleConfig(**CONFIG),
    WgetDependencyConfig(**CONFIG),
    WgetOptionsConfig(**CONFIG),
]
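Because the Wget*Config fields declare aliases, the shared CONFIG dict above populates them under their global names (a usage sketch, assuming pydantic's default behavior of accepting aliased fields by alias and ignoring unknown keys):

opts = WgetOptionsConfig(**CONFIG)
assert opts.WGET_TIMEOUT == 999               # filled via the 'TIMEOUT' alias
assert opts.WGET_CHECK_SSL_VALIDITY is False  # filled via the 'CHECK_SSL_VALIDITY' alias
assert opts.WGET_USER_AGENT == ''             # 'USER_AGENT' not in CONFIG, falls back to the default
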
archivebox/plugantic/extractors.py (new file, +118 lines)
@@ -0,0 +1,118 @@
__package__ = 'archivebox.plugantic'

from typing import Optional, List, Literal, Annotated, Dict, Any
from typing_extensions import Self

from abc import ABC
from pathlib import Path

from pydantic import BaseModel, model_validator, field_serializer, AfterValidator

from .binaries import (
    Binary,
    YtdlpBinary,
    WgetBinary,
)


# stubs
class Snapshot:
    pass

class ArchiveResult:
    pass

def get_wget_output_path(*args, **kwargs) -> Path:
    return Path('.').resolve()



def no_empty_args(args: List[str]) -> List[str]:
    assert all(len(arg) for arg in args)
    return args

ExtractorName = Literal['wget', 'warc', 'media']

HandlerFuncStr = Annotated[str, AfterValidator(lambda s: s.startswith('self.'))]
CmdArgsList = Annotated[List[str], AfterValidator(no_empty_args)]


class Extractor(ABC, BaseModel):
    name: ExtractorName
    binary: Binary

    output_path_func: HandlerFuncStr = 'self.get_output_path'
    should_extract_func: HandlerFuncStr = 'self.should_extract'
    extract_func: HandlerFuncStr = 'self.extract'
    exec_func: HandlerFuncStr = 'self.exec'

    default_args: CmdArgsList = []
    extra_args: CmdArgsList = []
    args: Optional[CmdArgsList] = None

    @model_validator(mode='after')
    def validate_model(self) -> Self:
        if self.args is None:
            self.args = [*self.default_args, *self.extra_args]
        return self

    @field_serializer('binary', when_used='json')
    def dump_binary(binary) -> str:
        return binary.name

    def get_output_path(self, snapshot) -> Path:
        return Path(self.name)

    def should_extract(self, snapshot) -> bool:
        output_dir = self.get_output_path(snapshot)
        if output_dir.glob('*.*'):
            return False
        return True


    def extract(self, url: str, **kwargs) -> Dict[str, Any]:
        output_dir = self.get_output_path(url, **kwargs)

        cmd = [url, *self.args] if self.args is not None else [url, *self.default_args, *self.extra_args]
        proc = self.exec(cmd, pwd=output_dir)

        return {
            'status': 'succeeded' if proc.returncode == 0 else 'failed',
            'output': proc.stdout.decode().strip().split('\n')[-1],
            'output_files': list(output_dir.glob('*.*')),

            'stdout': proc.stdout.decode().strip(),
            'stderr': proc.stderr.decode().strip(),
            'returncode': proc.returncode,
        }

    def exec(self, args: CmdArgsList, pwd: Optional[Path]=None):
        pwd = pwd or Path('.')
        assert self.binary.loaded_provider
        return self.binary.exec(args, pwd=pwd)


class YtdlpExtractor(Extractor):
    name: ExtractorName = 'media'
    binary: Binary = YtdlpBinary()

    def get_output_path(self, snapshot) -> Path:
        return Path(self.name)


class WgetExtractor(Extractor):
    name: ExtractorName = 'wget'
    binary: Binary = WgetBinary()

    def get_output_path(self, snapshot) -> Path:
        return get_wget_output_path(snapshot)


class WarcExtractor(Extractor):
    name: ExtractorName = 'warc'
    binary: Binary = WgetBinary()

    def get_output_path(self, snapshot) -> Path:
        return get_wget_output_path(snapshot)

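A rough sketch of how these extractors look intended to be driven, using only the methods defined above (the Snapshot/ArchiveResult wiring is still stubbed out, so a plain URL stands in for a snapshot here):

extractor = WgetExtractor()
extractor.binary = extractor.binary.load_or_install()   # resolve wget before running

url = 'https://example.com'
if extractor.should_extract(url):
    result = extractor.extract(url)   # returns a dict with status/output/stdout/stderr/returncode
    print(result['status'], result['returncode'])
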
archivebox/plugantic/ini_to_toml.py (new file, +396 lines)
@@ -0,0 +1,396 @@
from typing import Dict, Any, List

import configparser
import json
import ast

JSONValue = str | bool | int | None | List['JSONValue']

def load_ini_value(val: str) -> JSONValue:
    """Convert lax INI values into strict TOML-compliant (JSON) values"""
    if val.lower() in ('true', 'yes', '1'):
        return True
    if val.lower() in ('false', 'no', '0'):
        return False
    if val.isdigit():
        return int(val)

    try:
        return ast.literal_eval(val)
    except Exception:
        pass

    try:
        return json.loads(val)
    except Exception:
        pass

    return val


def convert(ini_str: str) -> str:
    """Convert a string of INI config into its TOML equivalent (warning: strips comments)"""

    config = configparser.ConfigParser()
    config.optionxform = str   # preserve the case of key names instead of lowercasing them
    config.read_string(ini_str)

    # Initialize an empty dictionary to store the TOML representation
    toml_dict = {}

    # Iterate over each section in the INI configuration
    for section in config.sections():
        toml_dict[section] = {}

        # Iterate over each key-value pair in the section
        for key, value in config.items(section):
            parsed_value = load_ini_value(value)

            # Convert the parsed value to its TOML-compatible JSON representation
            toml_dict[section.upper()][key.upper()] = json.dumps(parsed_value)

    # Build the TOML string
    toml_str = ""
    for section, items in toml_dict.items():
        toml_str += f"[{section}]\n"
        for key, value in items.items():
            toml_str += f"{key} = {value}\n"
        toml_str += "\n"

    return toml_str.strip()

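A minimal usage example of convert(), following the parsing rules above (the module's own fixtures below exercise it on a full ArchiveBox config):

ini = """
[GENERAL_CONFIG]
TIMEOUT=60
USE_COLOR=False
"""
print(convert(ini))
# [GENERAL_CONFIG]
# TIMEOUT = 60
# USE_COLOR = false
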
|
||||||
|
|
||||||
|
### Basic Assertions
|
||||||
|
|
||||||
|
test_input = """
|
||||||
|
[SERVER_CONFIG]
|
||||||
|
IS_TTY=False
|
||||||
|
USE_COLOR=False
|
||||||
|
SHOW_PROGRESS=False
|
||||||
|
IN_DOCKER=False
|
||||||
|
IN_QEMU=False
|
||||||
|
PUID=501
|
||||||
|
PGID=20
|
||||||
|
OUTPUT_DIR=/opt/archivebox/data
|
||||||
|
CONFIG_FILE=/opt/archivebox/data/ArchiveBox.conf
|
||||||
|
ONLY_NEW=True
|
||||||
|
TIMEOUT=60
|
||||||
|
MEDIA_TIMEOUT=3600
|
||||||
|
OUTPUT_PERMISSIONS=644
|
||||||
|
RESTRICT_FILE_NAMES=windows
|
||||||
|
URL_DENYLIST=\.(css|js|otf|ttf|woff|woff2|gstatic\.com|googleapis\.com/css)(\?.*)?$
|
||||||
|
URL_ALLOWLIST=None
|
||||||
|
ADMIN_USERNAME=None
|
||||||
|
ADMIN_PASSWORD=None
|
||||||
|
ENFORCE_ATOMIC_WRITES=True
|
||||||
|
TAG_SEPARATOR_PATTERN=[,]
|
||||||
|
SECRET_KEY=xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
|
||||||
|
BIND_ADDR=127.0.0.1:8000
|
||||||
|
ALLOWED_HOSTS=*
|
||||||
|
DEBUG=False
|
||||||
|
PUBLIC_INDEX=True
|
||||||
|
PUBLIC_SNAPSHOTS=True
|
||||||
|
PUBLIC_ADD_VIEW=False
|
||||||
|
FOOTER_INFO=Content is hosted for personal archiving purposes only. Contact server owner for any takedown requests.
|
||||||
|
SNAPSHOTS_PER_PAGE=40
|
||||||
|
CUSTOM_TEMPLATES_DIR=None
|
||||||
|
TIME_ZONE=UTC
|
||||||
|
TIMEZONE=UTC
|
||||||
|
REVERSE_PROXY_USER_HEADER=Remote-User
|
||||||
|
REVERSE_PROXY_WHITELIST=
|
||||||
|
LOGOUT_REDIRECT_URL=/
|
||||||
|
PREVIEW_ORIGINALS=True
|
||||||
|
LDAP=False
|
||||||
|
LDAP_SERVER_URI=None
|
||||||
|
LDAP_BIND_DN=None
|
||||||
|
LDAP_BIND_PASSWORD=None
|
||||||
|
LDAP_USER_BASE=None
|
||||||
|
LDAP_USER_FILTER=None
|
||||||
|
LDAP_USERNAME_ATTR=None
|
||||||
|
LDAP_FIRSTNAME_ATTR=None
|
||||||
|
LDAP_LASTNAME_ATTR=None
|
||||||
|
LDAP_EMAIL_ATTR=None
|
||||||
|
LDAP_CREATE_SUPERUSER=False
|
||||||
|
SAVE_TITLE=True
|
||||||
|
SAVE_FAVICON=True
|
||||||
|
SAVE_WGET=True
|
||||||
|
SAVE_WGET_REQUISITES=True
|
||||||
|
SAVE_SINGLEFILE=True
|
||||||
|
SAVE_READABILITY=True
|
||||||
|
SAVE_MERCURY=True
|
||||||
|
SAVE_HTMLTOTEXT=True
|
||||||
|
SAVE_PDF=True
|
||||||
|
SAVE_SCREENSHOT=True
|
||||||
|
SAVE_DOM=True
|
||||||
|
SAVE_HEADERS=True
|
||||||
|
SAVE_WARC=True
|
||||||
|
SAVE_GIT=True
|
||||||
|
SAVE_MEDIA=True
|
||||||
|
SAVE_ARCHIVE_DOT_ORG=True
|
||||||
|
RESOLUTION=1440,2000
|
||||||
|
GIT_DOMAINS=github.com,bitbucket.org,gitlab.com,gist.github.com,codeberg.org,gitea.com,git.sr.ht
|
||||||
|
CHECK_SSL_VALIDITY=True
|
||||||
|
MEDIA_MAX_SIZE=750m
|
||||||
|
USER_AGENT=None
|
||||||
|
CURL_USER_AGENT=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 ArchiveBox/0.8.0 (+https://github.com/ArchiveBox/ArchiveBox/) curl/curl 8.4.0 (x86_64-apple-darwin23.0)
|
||||||
|
WGET_USER_AGENT=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 ArchiveBox/0.8.0 (+https://github.com/ArchiveBox/ArchiveBox/) wget/GNU Wget 1.24.5
CHROME_USER_AGENT=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 ArchiveBox/0.8.0 (+https://github.com/ArchiveBox/ArchiveBox/)
COOKIES_FILE=None
CHROME_USER_DATA_DIR=None
CHROME_TIMEOUT=0
CHROME_HEADLESS=True
CHROME_SANDBOX=True
CHROME_EXTRA_ARGS=[]
YOUTUBEDL_ARGS=['--restrict-filenames', '--trim-filenames', '128', '--write-description', '--write-info-json', '--write-annotations', '--write-thumbnail', '--no-call-home', '--write-sub', '--write-auto-subs', '--convert-subs=srt', '--yes-playlist', '--continue', '--no-abort-on-error', '--ignore-errors', '--geo-bypass', '--add-metadata', '--format=(bv*+ba/b)[filesize<=750m][filesize_approx<=?750m]/(bv*+ba/b)']
YOUTUBEDL_EXTRA_ARGS=[]
WGET_ARGS=['--no-verbose', '--adjust-extension', '--convert-links', '--force-directories', '--backup-converted', '--span-hosts', '--no-parent', '-e', 'robots=off']
WGET_EXTRA_ARGS=[]
CURL_ARGS=['--silent', '--location', '--compressed']
CURL_EXTRA_ARGS=[]
GIT_ARGS=['--recursive']
SINGLEFILE_ARGS=[]
SINGLEFILE_EXTRA_ARGS=[]
MERCURY_ARGS=['--format=text']
MERCURY_EXTRA_ARGS=[]
FAVICON_PROVIDER=https://www.google.com/s2/favicons?domain={}
USE_INDEXING_BACKEND=True
USE_SEARCHING_BACKEND=True
SEARCH_BACKEND_ENGINE=ripgrep
SEARCH_BACKEND_HOST_NAME=localhost
SEARCH_BACKEND_PORT=1491
SEARCH_BACKEND_PASSWORD=SecretPassword
SEARCH_PROCESS_HTML=True
SONIC_COLLECTION=archivebox
SONIC_BUCKET=snapshots
SEARCH_BACKEND_TIMEOUT=90
FTS_SEPARATE_DATABASE=True
FTS_TOKENIZERS=porter unicode61 remove_diacritics 2
FTS_SQLITE_MAX_LENGTH=1000000000
USE_CURL=True
USE_WGET=True
USE_SINGLEFILE=True
USE_READABILITY=True
USE_MERCURY=True
USE_GIT=True
USE_CHROME=True
USE_NODE=True
USE_YOUTUBEDL=True
USE_RIPGREP=True
CURL_BINARY=curl
GIT_BINARY=git
WGET_BINARY=wget
SINGLEFILE_BINARY=single-file
READABILITY_BINARY=readability-extractor
MERCURY_BINARY=postlight-parser
YOUTUBEDL_BINARY=yt-dlp
NODE_BINARY=node
RIPGREP_BINARY=rg
CHROME_BINARY=chrome
POCKET_CONSUMER_KEY=None
USER=squash
PACKAGE_DIR=/opt/archivebox/archivebox
TEMPLATES_DIR=/opt/archivebox/archivebox/templates
ARCHIVE_DIR=/opt/archivebox/data/archive
SOURCES_DIR=/opt/archivebox/data/sources
LOGS_DIR=/opt/archivebox/data/logs
PERSONAS_DIR=/opt/archivebox/data/personas
URL_DENYLIST_PTN=re.compile('\\.(css|js|otf|ttf|woff|woff2|gstatic\\.com|googleapis\\.com/css)(\\?.*)?$', re.IGNORECASE|re.MULTILINE)
URL_ALLOWLIST_PTN=None
DIR_OUTPUT_PERMISSIONS=755
ARCHIVEBOX_BINARY=/opt/archivebox/.venv/bin/archivebox
VERSION=0.8.0
COMMIT_HASH=102e87578c6036bb0132dd1ebd17f8f05ffc880f
BUILD_TIME=2024-05-15 03:28:05 1715768885
VERSIONS_AVAILABLE=None
CAN_UPGRADE=False
PYTHON_BINARY=/opt/archivebox/.venv/bin/python3.10
PYTHON_ENCODING=UTF-8
PYTHON_VERSION=3.10.14
DJANGO_BINARY=/opt/archivebox/.venv/lib/python3.10/site-packages/django/__init__.py
DJANGO_VERSION=5.0.6 final (0)
SQLITE_BINARY=/opt/homebrew/Cellar/python@3.10/3.10.14/Frameworks/Python.framework/Versions/3.10/lib/python3.10/sqlite3/dbapi2.py
SQLITE_VERSION=2.6.0
CURL_VERSION=curl 8.4.0 (x86_64-apple-darwin23.0)
WGET_VERSION=GNU Wget 1.24.5
WGET_AUTO_COMPRESSION=True
RIPGREP_VERSION=ripgrep 14.1.0
SINGLEFILE_VERSION=None
READABILITY_VERSION=None
MERCURY_VERSION=None
GIT_VERSION=git version 2.44.0
YOUTUBEDL_VERSION=2024.04.09
CHROME_VERSION=Google Chrome 124.0.6367.207
NODE_VERSION=v21.7.3
"""

expected_output = '''[SERVER_CONFIG]
IS_TTY = false
USE_COLOR = false
SHOW_PROGRESS = false
IN_DOCKER = false
IN_QEMU = false
PUID = 501
PGID = 20
OUTPUT_DIR = "/opt/archivebox/data"
CONFIG_FILE = "/opt/archivebox/data/ArchiveBox.conf"
ONLY_NEW = true
TIMEOUT = 60
MEDIA_TIMEOUT = 3600
OUTPUT_PERMISSIONS = 644
RESTRICT_FILE_NAMES = "windows"
URL_DENYLIST = "\\\\.(css|js|otf|ttf|woff|woff2|gstatic\\\\.com|googleapis\\\\.com/css)(\\\\?.*)?$"
URL_ALLOWLIST = null
ADMIN_USERNAME = null
ADMIN_PASSWORD = null
ENFORCE_ATOMIC_WRITES = true
TAG_SEPARATOR_PATTERN = "[,]"
SECRET_KEY = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
BIND_ADDR = "127.0.0.1:8000"
ALLOWED_HOSTS = "*"
DEBUG = false
PUBLIC_INDEX = true
PUBLIC_SNAPSHOTS = true
PUBLIC_ADD_VIEW = false
FOOTER_INFO = "Content is hosted for personal archiving purposes only. Contact server owner for any takedown requests."
SNAPSHOTS_PER_PAGE = 40
CUSTOM_TEMPLATES_DIR = null
TIME_ZONE = "UTC"
TIMEZONE = "UTC"
REVERSE_PROXY_USER_HEADER = "Remote-User"
REVERSE_PROXY_WHITELIST = ""
LOGOUT_REDIRECT_URL = "/"
PREVIEW_ORIGINALS = true
LDAP = false
LDAP_SERVER_URI = null
LDAP_BIND_DN = null
LDAP_BIND_PASSWORD = null
LDAP_USER_BASE = null
LDAP_USER_FILTER = null
LDAP_USERNAME_ATTR = null
LDAP_FIRSTNAME_ATTR = null
LDAP_LASTNAME_ATTR = null
LDAP_EMAIL_ATTR = null
LDAP_CREATE_SUPERUSER = false
SAVE_TITLE = true
SAVE_FAVICON = true
SAVE_WGET = true
SAVE_WGET_REQUISITES = true
SAVE_SINGLEFILE = true
SAVE_READABILITY = true
SAVE_MERCURY = true
SAVE_HTMLTOTEXT = true
SAVE_PDF = true
SAVE_SCREENSHOT = true
SAVE_DOM = true
SAVE_HEADERS = true
SAVE_WARC = true
SAVE_GIT = true
SAVE_MEDIA = true
SAVE_ARCHIVE_DOT_ORG = true
RESOLUTION = [1440, 2000]
GIT_DOMAINS = "github.com,bitbucket.org,gitlab.com,gist.github.com,codeberg.org,gitea.com,git.sr.ht"
CHECK_SSL_VALIDITY = true
MEDIA_MAX_SIZE = "750m"
USER_AGENT = null
CURL_USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 ArchiveBox/0.8.0 (+https://github.com/ArchiveBox/ArchiveBox/) curl/curl 8.4.0 (x86_64-apple-darwin23.0)"
WGET_USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 ArchiveBox/0.8.0 (+https://github.com/ArchiveBox/ArchiveBox/) wget/GNU Wget 1.24.5"
CHROME_USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 ArchiveBox/0.8.0 (+https://github.com/ArchiveBox/ArchiveBox/)"
COOKIES_FILE = null
CHROME_USER_DATA_DIR = null
CHROME_TIMEOUT = false
CHROME_HEADLESS = true
CHROME_SANDBOX = true
CHROME_EXTRA_ARGS = []
YOUTUBEDL_ARGS = ["--restrict-filenames", "--trim-filenames", "128", "--write-description", "--write-info-json", "--write-annotations", "--write-thumbnail", "--no-call-home", "--write-sub", "--write-auto-subs", "--convert-subs=srt", "--yes-playlist", "--continue", "--no-abort-on-error", "--ignore-errors", "--geo-bypass", "--add-metadata", "--format=(bv*+ba/b)[filesize<=750m][filesize_approx<=?750m]/(bv*+ba/b)"]
YOUTUBEDL_EXTRA_ARGS = []
WGET_ARGS = ["--no-verbose", "--adjust-extension", "--convert-links", "--force-directories", "--backup-converted", "--span-hosts", "--no-parent", "-e", "robots=off"]
WGET_EXTRA_ARGS = []
CURL_ARGS = ["--silent", "--location", "--compressed"]
CURL_EXTRA_ARGS = []
GIT_ARGS = ["--recursive"]
SINGLEFILE_ARGS = []
SINGLEFILE_EXTRA_ARGS = []
MERCURY_ARGS = ["--format=text"]
MERCURY_EXTRA_ARGS = []
FAVICON_PROVIDER = "https://www.google.com/s2/favicons?domain={}"
USE_INDEXING_BACKEND = true
USE_SEARCHING_BACKEND = true
SEARCH_BACKEND_ENGINE = "ripgrep"
SEARCH_BACKEND_HOST_NAME = "localhost"
SEARCH_BACKEND_PORT = 1491
SEARCH_BACKEND_PASSWORD = "SecretPassword"
SEARCH_PROCESS_HTML = true
SONIC_COLLECTION = "archivebox"
SONIC_BUCKET = "snapshots"
SEARCH_BACKEND_TIMEOUT = 90
FTS_SEPARATE_DATABASE = true
FTS_TOKENIZERS = "porter unicode61 remove_diacritics 2"
FTS_SQLITE_MAX_LENGTH = 1000000000
USE_CURL = true
USE_WGET = true
USE_SINGLEFILE = true
USE_READABILITY = true
USE_MERCURY = true
USE_GIT = true
USE_CHROME = true
USE_NODE = true
USE_YOUTUBEDL = true
USE_RIPGREP = true
CURL_BINARY = "curl"
GIT_BINARY = "git"
WGET_BINARY = "wget"
SINGLEFILE_BINARY = "single-file"
READABILITY_BINARY = "readability-extractor"
MERCURY_BINARY = "postlight-parser"
YOUTUBEDL_BINARY = "yt-dlp"
NODE_BINARY = "node"
RIPGREP_BINARY = "rg"
CHROME_BINARY = "chrome"
POCKET_CONSUMER_KEY = null
USER = "squash"
PACKAGE_DIR = "/opt/archivebox/archivebox"
TEMPLATES_DIR = "/opt/archivebox/archivebox/templates"
ARCHIVE_DIR = "/opt/archivebox/data/archive"
SOURCES_DIR = "/opt/archivebox/data/sources"
LOGS_DIR = "/opt/archivebox/data/logs"
PERSONAS_DIR = "/opt/archivebox/data/personas"
URL_DENYLIST_PTN = "re.compile(\'\\\\.(css|js|otf|ttf|woff|woff2|gstatic\\\\.com|googleapis\\\\.com/css)(\\\\?.*)?$\', re.IGNORECASE|re.MULTILINE)"
URL_ALLOWLIST_PTN = null
DIR_OUTPUT_PERMISSIONS = 755
ARCHIVEBOX_BINARY = "/opt/archivebox/.venv/bin/archivebox"
VERSION = "0.8.0"
COMMIT_HASH = "102e87578c6036bb0132dd1ebd17f8f05ffc880f"
BUILD_TIME = "2024-05-15 03:28:05 1715768885"
VERSIONS_AVAILABLE = null
CAN_UPGRADE = false
PYTHON_BINARY = "/opt/archivebox/.venv/bin/python3.10"
PYTHON_ENCODING = "UTF-8"
PYTHON_VERSION = "3.10.14"
DJANGO_BINARY = "/opt/archivebox/.venv/lib/python3.10/site-packages/django/__init__.py"
DJANGO_VERSION = "5.0.6 final (0)"
SQLITE_BINARY = "/opt/homebrew/Cellar/python@3.10/3.10.14/Frameworks/Python.framework/Versions/3.10/lib/python3.10/sqlite3/dbapi2.py"
SQLITE_VERSION = "2.6.0"
CURL_VERSION = "curl 8.4.0 (x86_64-apple-darwin23.0)"
WGET_VERSION = "GNU Wget 1.24.5"
WGET_AUTO_COMPRESSION = true
RIPGREP_VERSION = "ripgrep 14.1.0"
SINGLEFILE_VERSION = null
READABILITY_VERSION = null
MERCURY_VERSION = null
GIT_VERSION = "git version 2.44.0"
YOUTUBEDL_VERSION = "2024.04.09"
CHROME_VERSION = "Google Chrome 124.0.6367.207"
NODE_VERSION = "v21.7.3"'''


first_output = convert(test_input)  # make sure ini -> toml parses correctly
second_output = convert(first_output)  # make sure toml -> toml parses/dumps consistently
assert first_output == second_output == expected_output  # make sure parsing is idempotent

# # DEBUGGING
# import sys
# import difflib
# sys.stdout.writelines(difflib.context_diff(first_output, second_output, fromfile='first', tofile='second'))
# print(repr(second_output))
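For reference, a minimal sketch of the kind of INI -> TOML conversion this test exercises. This is illustrative only: the real convert() under test is defined elsewhere in this PR and handles many more cases (list values, regex patterns, escaping); naive_ini_to_toml and its quoting rules here are assumptions.

    import configparser
    import json

    def naive_ini_to_toml(text: str) -> str:
        # Hypothetical helper, NOT the convert() under test: parses INI-style
        # KEY=VALUE pairs and re-emits them as TOML-style KEY = "VALUE" lines.
        parser = configparser.ConfigParser(interpolation=None)
        parser.optionxform = str  # preserve UPPER_SNAKE_CASE key casing
        parser.read_string(text)
        out_lines = []
        for section in parser.sections():
            out_lines.append(f'[{section}]')
            for key, raw in parser[section].items():
                if raw in ('True', 'False'):
                    value = raw.lower()      # booleans: True -> true
                elif raw == 'None':
                    value = 'null'           # mirrors the expected output above
                elif raw.isdigit():
                    value = raw              # bare integers stay unquoted
                else:
                    value = json.dumps(raw)  # everything else gets quoted
                out_lines.append(f'{key} = {value}')
        return '\n'.join(out_lines)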
38 archivebox/plugantic/migrations/0001_initial.py Normal file
@@ -0,0 +1,38 @@
# Generated by Django 5.0.6 on 2024-05-18 00:16

import abid_utils.models
import archivebox.plugantic.plugins
import charidfield.fields
import django.core.serializers.json
import django.db.models.deletion
import django_pydantic_field.fields
import uuid
from django.conf import settings
from django.db import migrations, models


class Migration(migrations.Migration):

    initial = True

    dependencies = [
        migrations.swappable_dependency(settings.AUTH_USER_MODEL),
    ]

    operations = [
        migrations.CreateModel(
            name='Plugin',
            fields=[
                ('created', models.DateTimeField(auto_now_add=True)),
                ('modified', models.DateTimeField(auto_now=True)),
                ('id', models.UUIDField(default=uuid.uuid4, editable=False, primary_key=True, serialize=False)),
                ('uuid', models.UUIDField(blank=True, null=True, unique=True)),
                ('abid', charidfield.fields.CharIDField(blank=True, db_index=True, default=None, help_text='ABID-format identifier for this entity (e.g. snp_01BJQMF54D093DXEAWZ6JYRPAQ)', max_length=30, null=True, prefix='plg_', unique=True)),
                ('schema', django_pydantic_field.fields.PydanticSchemaField(config=None, encoder=django.core.serializers.json.DjangoJSONEncoder, schema=archivebox.plugantic.plugins.Plugin)),
                ('created_by', models.ForeignKey(default=abid_utils.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL)),
            ],
            options={
                'abstract': False,
            },
        ),
    ]
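For context, the schema column above comes from django-pydantic-field, which stores a pydantic model as validated JSON. A hedged sketch of roughly what the model declaration behind this migration could look like (field names taken from the migration; the exact model lives in archivebox/plugantic and may differ):

    from django.db import models
    from django_pydantic_field import SchemaField

    from archivebox.plugantic.plugins import Plugin as PluginSchema  # pydantic model referenced by the migration

    class Plugin(models.Model):
        # SchemaField serializes the pydantic schema to a JSON column and
        # validates it on assignment/load; in migrations it is rendered as
        # django_pydantic_field.fields.PydanticSchemaField.
        schema = SchemaField(schema=PluginSchema)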
21 archivebox/plugantic/migrations/0002_alter_plugin_schema.py Normal file
@@ -0,0 +1,21 @@
# Generated by Django 5.0.6 on 2024-05-18 01:16

import archivebox.plugantic.plugins
import django.core.serializers.json
import django_pydantic_field.fields
from django.db import migrations


class Migration(migrations.Migration):

    dependencies = [
        ('plugantic', '0001_initial'),
    ]

    operations = [
        migrations.AlterField(
            model_name='plugin',
            name='schema',
            field=django_pydantic_field.fields.PydanticSchemaField(config=None, default=None, encoder=django.core.serializers.json.DjangoJSONEncoder, schema=archivebox.plugantic.plugins.Plugin),
        ),
    ]
21 archivebox/plugantic/migrations/0003_alter_plugin_schema.py Normal file
@@ -0,0 +1,21 @@
# Generated by Django 5.0.6 on 2024-05-18 01:25

import archivebox.plugantic.replayers
import django.core.serializers.json
import django_pydantic_field.fields
from django.db import migrations


class Migration(migrations.Migration):

    dependencies = [
        ('plugantic', '0002_alter_plugin_schema'),
    ]

    operations = [
        migrations.AlterField(
            model_name='plugin',
            name='schema',
            field=django_pydantic_field.fields.PydanticSchemaField(config=None, default={'embed_template': 'plugins/generic_replayer/templates/embed.html', 'fullpage_template': 'plugins/generic_replayer/templates/fullpage.html', 'name': 'GenericReplayer', 'row_template': 'plugins/generic_replayer/templates/row.html', 'url_pattern': '*'}, encoder=django.core.serializers.json.DjangoJSONEncoder, schema=archivebox.plugantic.replayers.Replayer),
        ),
    ]
32 archivebox/plugantic/migrations/0004_remove_plugin_schema_plugin_configs_plugin_name.py Normal file
@@ -0,0 +1,32 @@
# Generated by Django 5.0.6 on 2024-05-18 01:28

import archivebox.plugantic.configs
import django.core.serializers.json
import django_pydantic_field.compat.django
import django_pydantic_field.fields
from django.db import migrations, models


class Migration(migrations.Migration):

    dependencies = [
        ('plugantic', '0003_alter_plugin_schema'),
    ]

    operations = [
        migrations.RemoveField(
            model_name='plugin',
            name='schema',
        ),
        migrations.AddField(
            model_name='plugin',
            name='configs',
            field=django_pydantic_field.fields.PydanticSchemaField(config=None, default=[], encoder=django.core.serializers.json.DjangoJSONEncoder, schema=django_pydantic_field.compat.django.GenericContainer(list, (archivebox.plugantic.configs.ConfigSet,))),
        ),
        migrations.AddField(
            model_name='plugin',
            name='name',
            field=models.CharField(default='name', max_length=64, unique=True),
            preserve_default=False,
        ),
    ]
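The GenericContainer(list, (ConfigSet,)) in the configs field above is how django-pydantic-field serializes a parameterized type into migration files. In model code this would plausibly be declared as follows (a hedged sketch; the class and default shown are assumptions):

    from django.db import models
    from django_pydantic_field import SchemaField

    from archivebox.plugantic.configs import ConfigSet

    class Plugin(models.Model):
        # A list of pydantic ConfigSet objects stored in one JSON column;
        # migrations render list[ConfigSet] as GenericContainer(list, (ConfigSet,)).
        configs = SchemaField(schema=list[ConfigSet], default=list)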
39 archivebox/plugantic/migrations/0005_customplugin_delete_plugin.py Normal file
@@ -0,0 +1,39 @@
# Generated by Django 5.0.6 on 2024-05-18 01:42

import abid_utils.models
import charidfield.fields
import django.db.models.deletion
import pathlib
import uuid
from django.conf import settings
from django.db import migrations, models


class Migration(migrations.Migration):

    dependencies = [
        ('plugantic', '0004_remove_plugin_schema_plugin_configs_plugin_name'),
        migrations.swappable_dependency(settings.AUTH_USER_MODEL),
    ]

    operations = [
        migrations.CreateModel(
            name='CustomPlugin',
            fields=[
                ('created', models.DateTimeField(auto_now_add=True)),
                ('modified', models.DateTimeField(auto_now=True)),
                ('id', models.UUIDField(default=uuid.uuid4, editable=False, primary_key=True, serialize=False)),
                ('uuid', models.UUIDField(blank=True, null=True, unique=True)),
                ('abid', charidfield.fields.CharIDField(blank=True, db_index=True, default=None, help_text='ABID-format identifier for this entity (e.g. snp_01BJQMF54D093DXEAWZ6JYRPAQ)', max_length=30, null=True, prefix='plg_', unique=True)),
                ('name', models.CharField(max_length=64, unique=True)),
                ('path', models.FilePathField(path=pathlib.PurePosixPath('/Volumes/NVME/Users/squash/Local/Code/archiveboxes/ArchiveBox/archivebox/plugins'))),
                ('created_by', models.ForeignKey(default=abid_utils.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL)),
            ],
            options={
                'abstract': False,
            },
        ),
        migrations.DeleteModel(
            name='Plugin',
        ),
    ]
19 archivebox/plugantic/migrations/0006_alter_customplugin_path.py Normal file
@@ -0,0 +1,19 @@
# Generated by Django 5.0.6 on 2024-05-18 01:45

import pathlib
from django.db import migrations, models


class Migration(migrations.Migration):

    dependencies = [
        ('plugantic', '0005_customplugin_delete_plugin'),
    ]

    operations = [
        migrations.AlterField(
            model_name='customplugin',
            name='path',
            field=models.FilePathField(allow_files=False, allow_folders=True, path=pathlib.PurePosixPath('/Volumes/NVME/Users/squash/Local/Code/archiveboxes/ArchiveBox/archivebox/plugins'), recursive=True),
        ),
    ]
19 archivebox/plugantic/migrations/0007_alter_customplugin_path.py Normal file
@@ -0,0 +1,19 @@
# Generated by Django 5.0.6 on 2024-05-18 01:46

import pathlib
from django.db import migrations, models


class Migration(migrations.Migration):

    dependencies = [
        ('plugantic', '0006_alter_customplugin_path'),
    ]

    operations = [
        migrations.AlterField(
            model_name='customplugin',
            name='path',
            field=models.FilePathField(allow_files=False, allow_folders=True, path=pathlib.PurePosixPath('/Volumes/NVME/Users/squash/Local/Code/archiveboxes/ArchiveBox/data/plugins'), recursive=True),
        ),
    ]
19 archivebox/plugantic/migrations/0008_alter_customplugin_path.py Normal file
@@ -0,0 +1,19 @@
# Generated by Django 5.0.6 on 2024-05-18 01:47

import pathlib
from django.db import migrations, models


class Migration(migrations.Migration):

    dependencies = [
        ('plugantic', '0007_alter_customplugin_path'),
    ]

    operations = [
        migrations.AlterField(
            model_name='customplugin',
            name='path',
            field=models.FilePathField(allow_files=False, allow_folders=True, path=pathlib.PurePosixPath('/Volumes/NVME/Users/squash/Local/Code/archiveboxes/ArchiveBox/data'), recursive=True),
        ),
    ]
18 archivebox/plugantic/migrations/0009_alter_customplugin_path.py Normal file
@@ -0,0 +1,18 @@
# Generated by Django 5.0.6 on 2024-05-18 01:48

from django.db import migrations, models


class Migration(migrations.Migration):

    dependencies = [
        ('plugantic', '0008_alter_customplugin_path'),
    ]

    operations = [
        migrations.AlterField(
            model_name='customplugin',
            name='path',
            field=models.FilePathField(allow_files=False, allow_folders=True, path='/Volumes/NVME/Users/squash/Local/Code/archiveboxes/ArchiveBox/data', recursive=True),
        ),
    ]
18 archivebox/plugantic/migrations/0010_alter_customplugin_path.py Normal file
@@ -0,0 +1,18 @@
# Generated by Django 5.0.6 on 2024-05-18 01:48

from django.db import migrations, models


class Migration(migrations.Migration):

    dependencies = [
        ('plugantic', '0009_alter_customplugin_path'),
    ]

    operations = [
        migrations.AlterField(
            model_name='customplugin',
            name='path',
            field=models.FilePathField(allow_files=False, allow_folders=True, match='/plugins/*', path='/Volumes/NVME/Users/squash/Local/Code/archiveboxes/ArchiveBox/data', recursive=True),
        ),
    ]
18 archivebox/plugantic/migrations/0011_alter_customplugin_path.py Normal file
@@ -0,0 +1,18 @@
# Generated by Django 5.0.6 on 2024-05-18 01:48

from django.db import migrations, models


class Migration(migrations.Migration):

    dependencies = [
        ('plugantic', '0010_alter_customplugin_path'),
    ]

    operations = [
        migrations.AlterField(
            model_name='customplugin',
            name='path',
            field=models.FilePathField(allow_files=False, allow_folders=True, match='plugins/*', path='/Volumes/NVME/Users/squash/Local/Code/archiveboxes/ArchiveBox/data', recursive=True),
        ),
    ]
18 archivebox/plugantic/migrations/0012_alter_customplugin_path.py Normal file
@@ -0,0 +1,18 @@
# Generated by Django 5.0.6 on 2024-05-18 01:49

from django.db import migrations, models


class Migration(migrations.Migration):

    dependencies = [
        ('plugantic', '0011_alter_customplugin_path'),
    ]

    operations = [
        migrations.AlterField(
            model_name='customplugin',
            name='path',
            field=models.FilePathField(allow_files=False, allow_folders=True, default='example_plugin', match='plugins/*', path='/Volumes/NVME/Users/squash/Local/Code/archiveboxes/ArchiveBox/data', recursive=True),
        ),
    ]
18 archivebox/plugantic/migrations/0013_alter_customplugin_path.py Normal file
@@ -0,0 +1,18 @@
# Generated by Django 5.0.6 on 2024-05-18 01:49

from django.db import migrations, models


class Migration(migrations.Migration):

    dependencies = [
        ('plugantic', '0012_alter_customplugin_path'),
    ]

    operations = [
        migrations.AlterField(
            model_name='customplugin',
            name='path',
            field=models.FilePathField(allow_files=False, allow_folders=True, default='/plugins/example_plugin', match='plugins/*', path='/Volumes/NVME/Users/squash/Local/Code/archiveboxes/ArchiveBox/data', recursive=True),
        ),
    ]
18 archivebox/plugantic/migrations/0014_alter_customplugin_path.py Normal file
@@ -0,0 +1,18 @@
# Generated by Django 5.0.6 on 2024-05-18 01:50

from django.db import migrations, models


class Migration(migrations.Migration):

    dependencies = [
        ('plugantic', '0013_alter_customplugin_path'),
    ]

    operations = [
        migrations.AlterField(
            model_name='customplugin',
            name='path',
            field=models.FilePathField(allow_files=False, allow_folders=True, default='/plugins/example_plugin', match='*', path='/Volumes/NVME/Users/squash/Local/Code/archiveboxes/ArchiveBox/data/plugins', recursive=True),
        ),
    ]
18 archivebox/plugantic/migrations/0015_alter_customplugin_path.py Normal file
@@ -0,0 +1,18 @@
# Generated by Django 5.0.6 on 2024-05-18 01:51

from django.db import migrations, models


class Migration(migrations.Migration):

    dependencies = [
        ('plugantic', '0014_alter_customplugin_path'),
    ]

    operations = [
        migrations.AlterField(
            model_name='customplugin',
            name='path',
            field=models.FilePathField(allow_files=False, allow_folders=True, match='*', path='/Volumes/NVME/Users/squash/Local/Code/archiveboxes/ArchiveBox/data/plugins', recursive=True),
        ),
    ]
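Migrations 0005-0015 above keep rewriting the path argument because FilePathField evaluates it eagerly, baking the developer's absolute checkout path into every migration file. A hedged alternative sketch, assuming Django >= 3.0 (where path accepts a callable, which migrations serialize by reference instead of by value) and a hypothetical plugins_dir() helper:

    from django.conf import settings
    from django.db import models

    def plugins_dir() -> str:
        # Hypothetical helper: resolves the plugins directory from settings at
        # runtime, so no machine-specific path is serialized into migrations.
        return str(settings.OUTPUT_DIR / 'plugins')  # assumes OUTPUT_DIR is a pathlib.Path setting

    class CustomPlugin(models.Model):
        path = models.FilePathField(path=plugins_dir, allow_files=False, allow_folders=True, recursive=True)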
16 archivebox/plugantic/migrations/0016_delete_customplugin.py Normal file
@@ -0,0 +1,16 @@
# Generated by Django 5.0.6 on 2024-05-18 01:57

from django.db import migrations


class Migration(migrations.Migration):

    dependencies = [
        ('plugantic', '0015_alter_customplugin_path'),
    ]

    operations = [
        migrations.DeleteModel(
            name='CustomPlugin',
        ),
    ]
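Since 0016 deletes CustomPlugin again, the whole 0001-0016 plugantic history churns through models that no longer exist by its end; before a release it could likely be collapsed with Django's standard squash command (not something this diff does):

    python manage.py squashmigrations plugantic 0016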
0 archivebox/plugantic/migrations/__init__.py Normal file
Some files were not shown because too many files have changed in this diff.