Compare commits
406 commits
102e87578c
913590ee39
3882b1ee22
471cf06d89
340fc95f75
75a3f03149
2e9512adfd
1c76193704
e23c7cb3db
7a8ed9cd55
72f52d5dd5
3ce801a182
8b75788644
7673b42117
03296e2200
e522810a20
69579a73ec
f5f8d091c3
51601632c2
b489338555
d03f447555
68a859ccfd
6baf2b2f69
d451636224
208c16c611
16d1b92fd6
b90ba6c909
09360fd191
4c5a3fba8b
f2729c9dc7
cf9ef88aa8
9b21ce490e
f62cb5fb43
f770bba3cf
ce42472732
ef856e8051
27d5d1ddc8
664e09f0b4
f472705d10
3095265880
60df0c3137
32aea66913
8ccd606973
94ee394339
027c029316
8667ed29f1
f998647350
29c794925e
641a07b08a
c30d697904
d782bafe2e
47666ec26b
f067451267
c7fc9c004f
08931edbe0
9dc7065506
12a990c178
f95b369f0d
90b7a7f40d
3805a1730d
2094ed842b
8d7dd47c43
e20eb52f15
17b35496cc
1c9f9fe1b7
8f3901dd36
18a5b6e99c
6a6ae7468e
1d9e7ec66a
8cbc1a4adc
4a5ad32040
af669d2f37
716ba52450
75153252dc
e5aba0dc2e
6cb357e76c
128419f991
beb3932d80
3afdd3d96f
463ea54616
c6d644be29
8e9cfc8869
98c5e69203
8dcfa93ec6
e28f33fcd0
665a2e505f
17f40f3ada
c6f8a33a63
24175f5b4a
a1a877f47f
63fc317229
756e159dfe
667cf38fc6
11acc9ceea
55d6bde7db
bc0b0303ea
82b38df8ec
8ced9fd4bb
e4dc2701ef
99502bd928
b76875aab6
9ad99d86c1
5f9aac18f2
4ae765ec27
9d4cc361e6
e48159b8a0
ac73fb5129
b4c3aa5097
a4453b6f87
6981837a0b
8b1b01e508
1d49bee90b
0521379464
ee2809eb4f
88f21d0d70
2c6704b1d0
1dbe08872c
2220a5350c
a1ef5f6035
28e85e0b95
67baea172e
d9beebdee7
d32413d74b
37c9a33c8b
b921efb0e0
e00845f58c
8007e97c3f
c0b5dbcecb
c5bb99dce1
1fc5d7c5c8
0872c84ba7
06a0580430
5478d13d52
ca2c484a8e
48f4b12ae2
099f7d00fe
3512dc7e60
db3fee1845
86c3e271ad
f4deb97f59
d8cf09c21e
1cf0f37a95
5082d61613
4686da91e6
a729480b75
62183b4c85
d74ddd42ae
741ff5f1a8
0f402df42f
e7119adb0b
9f462a87a8
1f828d9441
a577d1ed23
89ab18c772
7b042c854a
fe11e1c2f4
ccabda4c7d
178e676e0f
68326a60ee
22f9a289d3
f02b27920c
597f1a39e0
4d9c5a7b4b
ab8f395e0a
a00b34cc13
c7cdc2fc27
ab225104c5
4e69d2c9e1
31d05d8526
8b9bc3dec8
6a4e568d1b
2d32f05a62
3afd7b0cf0
0899c298c0
743e1ca7ad
52faf81096
9026726a00
113895277d
b3be86f2cf
8a25502650
87a86dd111
9b1df2b381
6de23cd8fe
335e3aadb0
20575e7cc8
841c01ec2c
f554ae903e
fa24136ff7
b706ab390d
84fe8e1c75
8519c81711
9373a2853e
0b8ff4c84c
0cdfab7f45
e9f3b041ec
968cd8f19c
ae7cbf4c54
f9ad8e911b
bf0983ce77
dd6bb4b274
a79bfde0db
74e3645c6a
2a845d1976
903c72fa88
91c4641199
c94ed53570
3ad32509e9
00d2d20a63
19aefc85e6
f5aaeb6de7
eb62b44036
babd273fc0
9f8ad4b126
402aac2366
a4bd441077
2b6bd42a2a
dcd9b7bd14
ea0563d85b
13e3322993
5ee85107e6
31392f8c34
da38950cea
26481d77c7
3f50922f72
55f1ec5b4b
53732c2958
114002aa5d
dd2864128d
c908d3e8d0
b15bc27bb3
dd1216546c
2c51430a31
3073922440
e42a7390fb
eed9148592
b004aa5170
85be7f891a
a3dc7106ee
780dac3b12
1f5c6d1df8
68d12b4ccb
21584cdd72
6f87bf014e
ba851b17a6
f184a5522f
b72a8ab654
d936b9eb8a
c840887281
bd19b794e5
a4be98dd2b
2d26728c2a
2c49e1be18
28cf2eae39
62dc631e73
d2b3a3ed1e
8c6efd20ae
a54655512a
225ec2dc32
a69eb009c8
d544ddb8df
614263d65d
ec964758ab
c9d21353d2
e98607bdd5
c050552b53
84b6412b78
c4e09c37d0
bde7869beb
7058f7d0a6
abd9dc6bfd
77a0c71074
4d1af6352a
b122702569
beeb313144
3a38e7d26d
065e2228ea
b245e90871
2ea4133615
35de1a5a55
d9b8d19675
f673c1bfe9
11a2b2186f
e4a8a891e6
6dc35097fc
ea71871af1
4d9019ada8
e6d6b7cb6d
dfd8cd487d
6be974f0fa
15a7143238
56a7525822
2577a8a3be
269bf3f7f3
17fdf76178
1161f08b55
4e7da217bb
e222d518e4
3839e2d4ff
bae2f3a09d
4ae58884ca
9f86ec31a0
d40f46a985
202ccc812c
0ec2fbf8b2
5517bc7c0a
fffc872470
d4703d1e16
43ceb24c50
dbcbdc7691
b2d1083453
bd290aa282
b708303dd4
0abca0b547
82c9c691c0
8387e02d3c
7d0f734f43
22aae92e95
51f2382407
092e0b6dfa
224df914ec
e8772513ca
5de9d934af
1daa2d86ac
3f48560e9c
914df75ee7
95580ee743
bddea22dac
e193e48aff
2cddb61877
4146c1d673
1e7cc8e082
ab7170ddd3
48f476c56f
016935f8c1
3b36928bdc
93781c58ce
adf877a2d1
c8094887f8
481554c521
dd17d2b602
7036428d7e
702c70fcfa
cf7babebd4
5bdcbaeebd
c7cab7cadc
279883d6bb
10922d426e
233388f94d
a97886e47a
3b19b59300
ea1a808261
aca0512926
22eff07a11
702b81f9e6
6edc650673
81affa0a91
d0cd84a2af
1773146833
0a25495520
ef1a697588
2e77c3964f
a2cfe764d4
30fbb5af0c
a1456f1f14
ac0c03da6f
857f585369
4344f759d6
7090d49c18
a7fa19917d
39e3d03c82
131c9f5d83
4e20cef406
a74486689c
2de06c14e1
f839f4f16e
8e03755a7a
d6a8e12482
aa49456dad
1c86dfff51
414a499511
fd2a91b55b
ed49301cf0
75d8ed3fec
8ccb4f73ec
8c07b7e127
9766de1b70
ff5aef521e
6184f659dc
014c1e1dc7
df11a8a3ba
eb2112e5b1
825b882633
4adb214812
23a9c538c2
3d2c4c70d2
5de45dbf30
e43babb7ac
a232b45b61
710167e967
84e026d862
975b1b5ae1
e085b1d13b
bc2bfc1cc7
a6ea05820b
e888869abc
3108966070
@@ -17,6 +17,11 @@ venv/
.venv-old/
.docker-venv/
node_modules/
chrome/
chromeprofile/

pdm.dev.lock
pdm.lock

docs/
build/

@@ -28,4 +33,5 @@ assets/
docker/

data/
data*/
output/
.github/FUNDING.yml (vendored): 5 changes

@@ -1,3 +1,2 @@
github: pirate
patreon: theSquashSH
custom: ["https://twitter.com/ArchiveBoxApp", "https://paypal.me/NicholasSweeting", "https://www.blockchain.com/eth/address/0x5D4c34D4a121Fe08d1dDB7969F07550f2dB9f471", "https://www.blockchain.com/btc/address/1HuxXriPE2Bbnag3jJrqa3bkNHrs297dYH"]
github: ["ArchiveBox", "pirate"]
custom: ["https://donate.archivebox.io", "https://paypal.me/NicholasSweeting"]
@@ -6,6 +6,7 @@ labels: ''
assignees: ''

---
<!-- If you perfer, you can make a PR to https://github.com/ArchiveBox/docs instead of opening an issue -->

## Wiki Page URL
<!-- e.g. https://github.com/pirate/ArchiveBox/wiki/Configuration#use_color -->
.github/dependabot.yml (vendored, new file): 12 lines

@@ -0,0 +1,12 @@
# To get started with Dependabot version updates, you'll need to specify which
# package ecosystems to update and where the package manifests are located.
# Please see the documentation for all configuration options:
# https://docs.github.com/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file

version: 2
updates:
  - package-ecosystem: "pip" # See documentation for possible values
    directory: "/"
    target-branch: "dev"
    schedule:
      interval: "weekly"
.github/workflows/codeql-analysis.yml (vendored, deleted): 32 lines

@@ -1,32 +0,0 @@
name: "CodeQL"

on:
  push:
    branches: [ dev ]
  pull_request:
    branches: [ dev ]
  schedule:
    - cron: '43 1 * * 2'

jobs:
  analyze:
    name: Analyze
    runs-on: ubuntu-latest

    strategy:
      fail-fast: false
      matrix:
        language: [ 'python' ]

    steps:
      - name: Checkout repository
        uses: actions/checkout@v2

      - name: Initialize CodeQL
        uses: github/codeql-action/init@v1
        with:
          languages: ${{ matrix.language }}
          queries: security-extended

      - name: Perform CodeQL Analysis
        uses: github/codeql-action/analyze@v1
.github/workflows/codeql.yml (vendored, new file): 92 lines

@@ -0,0 +1,92 @@
# For most projects, this workflow file will not need changing; you simply need
# to commit it to your repository.
#
# You may wish to alter this file to override the set of languages analyzed,
# or to provide custom queries or build logic.
#
# ******** NOTE ********
# We have attempted to detect the languages in your repository. Please check
# the `language` matrix defined below to confirm you have the correct set of
# supported CodeQL languages.
#
name: "CodeQL"

on:
  push:
    branches: [ "dev" ]
  pull_request:
    branches: [ "dev" ]
  schedule:
    - cron: '33 17 * * 6'

jobs:
  analyze:
    name: Analyze (${{ matrix.language }})
    # Runner size impacts CodeQL analysis time. To learn more, please see:
    #   - https://gh.io/recommended-hardware-resources-for-running-codeql
    #   - https://gh.io/supported-runners-and-hardware-resources
    #   - https://gh.io/using-larger-runners (GitHub.com only)
    # Consider using larger runners or machines with greater resources for possible analysis time improvements.
    runs-on: ${{ (matrix.language == 'swift' && 'macos-latest') || 'ubuntu-latest' }}
    timeout-minutes: ${{ (matrix.language == 'swift' && 120) || 360 }}
    permissions:
      # required for all workflows
      security-events: write

      # required to fetch internal or private CodeQL packs
      packages: read

      # only required for workflows in private repositories
      actions: read
      contents: read

    strategy:
      fail-fast: false
      matrix:
        include:
          - language: python
            build-mode: none
        # CodeQL supports the following values keywords for 'language': 'c-cpp', 'csharp', 'go', 'java-kotlin', 'javascript-typescript', 'python', 'ruby', 'swift'
        # Use `c-cpp` to analyze code written in C, C++ or both
        # Use 'java-kotlin' to analyze code written in Java, Kotlin or both
        # Use 'javascript-typescript' to analyze code written in JavaScript, TypeScript or both
        # To learn more about changing the languages that are analyzed or customizing the build mode for your analysis,
        # see https://docs.github.com/en/code-security/code-scanning/creating-an-advanced-setup-for-code-scanning/customizing-your-advanced-setup-for-code-scanning.
        # If you are analyzing a compiled language, you can modify the 'build-mode' for that language to customize how
        # your codebase is analyzed, see https://docs.github.com/en/code-security/code-scanning/creating-an-advanced-setup-for-code-scanning/codeql-code-scanning-for-compiled-languages
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      # Initializes the CodeQL tools for scanning.
      - name: Initialize CodeQL
        uses: github/codeql-action/init@v3
        with:
          languages: ${{ matrix.language }}
          build-mode: ${{ matrix.build-mode }}
          # If you wish to specify custom queries, you can do so here or in a config file.
          # By default, queries listed here will override any specified in a config file.
          # Prefix the list here with "+" to use these queries and those in the config file.

          # For more details on CodeQL's query packs, refer to: https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs
          # queries: security-extended,security-and-quality

      # If the analyze step fails for one of the languages you are analyzing with
      # "We were unable to automatically build your code", modify the matrix above
      # to set the build mode to "manual" for that language. Then modify this step
      # to build your code.
      # ℹ️ Command-line programs to run using the OS shell.
      # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun
      - if: matrix.build-mode == 'manual'
        run: |
          echo 'If you are using a "manual" build mode for one or more of the' \
            'languages you are analyzing, replace this with the commands to build' \
            'your code, for example:'
          echo '  make bootstrap'
          echo '  make release'
          exit 1

      - name: Perform CodeQL Analysis
        uses: github/codeql-action/analyze@v3
        with:
          category: "/language:${{matrix.language}}"
.github/workflows/docker.yml (vendored): 39 changes

@@ -11,7 +11,7 @@ on:

env:
  DOCKER_IMAGE: archivebox-ci


jobs:
  buildx:
    runs-on: ubuntu-latest

@@ -24,21 +24,21 @@ jobs:
      - name: Set up QEMU
        uses: docker/setup-qemu-action@v3


      - name: Set up Docker Buildx
        id: buildx
        uses: docker/setup-buildx-action@v3
        with:
          version: latest
          install: true
          platforms: linux/amd64,linux/arm64,linux/arm/v7

          platforms: linux/amd64,linux/arm64

      - name: Builder instance name
        run: echo ${{ steps.buildx.outputs.name }}


      - name: Available platforms
        run: echo ${{ steps.buildx.outputs.platforms }}


      - name: Cache Docker layers
        uses: actions/cache@v3
        with:

@@ -51,21 +51,27 @@
        uses: docker/login-action@v3
        if: github.event_name != 'pull_request'
        with:
          username: ${{ secrets.DOCKER_USERNAME }}
          password: ${{ secrets.DOCKER_PASSWORD }}

          username: ${{ secrets.DOCKER_USERNAME }}
          password: ${{ secrets.DOCKER_PASSWORD }}

      - name: Collect Docker tags
        # https://github.com/docker/metadata-action
        id: docker_meta
        uses: docker/metadata-action@v5
        with:
          images: archivebox/archivebox,nikisweeting/archivebox
          tags: |
            # :stable
            type=ref,event=branch
            # :0.7.3
            type=semver,pattern={{version}}
            # :0.7
            type=semver,pattern={{major}}.{{minor}}
            # :sha-463ea54
            type=sha
            type=raw,value=latest,enable={{is_default_branch}}

            # :latest
            type=raw,value=latest,enable=${{ github.ref == format('refs/heads/{0}', 'stable') }}

      - name: Build and push
        id: docker_build
        uses: docker/build-push-action@v5

@@ -77,11 +83,18 @@
          tags: ${{ steps.docker_meta.outputs.tags }}
          cache-from: type=local,src=/tmp/.buildx-cache
          cache-to: type=local,dest=/tmp/.buildx-cache-new
          platforms: linux/amd64,linux/arm64,linux/arm/v7
          platforms: linux/amd64,linux/arm64

      - name: Image digest
        run: echo ${{ steps.docker_build.outputs.digest }}


      - name: Update README
        uses: peter-evans/dockerhub-description@v4
        with:
          username: ${{ secrets.DOCKER_USERNAME }}
          password: ${{ secrets.DOCKER_PASSWORD }}
          repository: archivebox/archivebox

      # This ugly bit is necessary if you don't want your cache to grow forever
      # until it hits GitHub's limit of 5GB.
      # Temp fix
.github/workflows/pip.yml (vendored): 2 changes

@@ -35,7 +35,7 @@ jobs:
          cache: true

      - name: Install dependencies
        run: pdm install --fail-fast --no-lock --group :all --no-self
        run: pdm install --fail-fast --no-lock --dev --group=':all' --no-self

      - name: Build package
        run: |
.gitignore (vendored): 9 changes

@@ -12,6 +12,11 @@ venv/
.docker-venv/
node_modules/

# Ignore dev lockfiles (should always be built fresh)
pdm.lock
pdm.dev.lock
requirements-dev.txt

# Packaging artifacts
.pdm-python
.pdm-build

@@ -22,9 +27,7 @@ dist/

# Data folders
data/
data1/
data2/
data3/
data*/
output/

# vim
@@ -30,5 +30,4 @@ formats:
# See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html
python:
  install:
    - requirements: requirements.txt
    - requirements: docs/requirements.txt
    - requirements: docs/requirements.txt
Dockerfile: 126 changes

@@ -10,7 +10,7 @@
# docker run -v "$PWD/data":/data -p 8000:8000 archivebox server
# Multi-arch build:
# docker buildx create --use
# docker buildx build . --platform=linux/amd64,linux/arm64,linux/arm/v7 --push -t archivebox/archivebox:latest -t archivebox/archivebox:dev
# docker buildx build . --platform=linux/amd64,linux/arm64--push -t archivebox/archivebox:latest -t archivebox/archivebox:dev
#
# Read more about [developing Archivebox](https://github.com/ArchiveBox/ArchiveBox#archivebox-development).

@@ -20,9 +20,23 @@ FROM python:3.11-slim-bookworm

LABEL name="archivebox" \
    maintainer="Nick Sweeting <dockerfile@archivebox.io>" \
    description="All-in-one personal internet archiving container" \
    description="All-in-one self-hosted internet archiving solution" \
    homepage="https://github.com/ArchiveBox/ArchiveBox" \
    documentation="https://github.com/ArchiveBox/ArchiveBox/wiki/Docker#docker"
    documentation="https://github.com/ArchiveBox/ArchiveBox/wiki/Docker" \
    org.opencontainers.image.title="ArchiveBox" \
    org.opencontainers.image.vendor="ArchiveBox" \
    org.opencontainers.image.description="All-in-one self-hosted internet archiving solution" \
    org.opencontainers.image.source="https://github.com/ArchiveBox/ArchiveBox" \
    com.docker.image.source.entrypoint="Dockerfile" \
    # TODO: release ArchiveBox as a Docker Desktop extension (requires these labels):
    # https://docs.docker.com/desktop/extensions-sdk/architecture/metadata/
    com.docker.desktop.extension.api.version=">= 1.4.7" \
    com.docker.desktop.extension.icon="https://archivebox.io/icon.png" \
    com.docker.extension.publisher-url="https://archivebox.io" \
    com.docker.extension.screenshots='[{"alt": "Screenshot of Admin UI", "url": "https://github.com/ArchiveBox/ArchiveBox/assets/511499/e8e0b6f8-8fdf-4b7f-8124-c10d8699bdb2"}]' \
    com.docker.extension.detailed-description='See here for detailed documentation: https://wiki.archivebox.io' \
    com.docker.extension.changelog='See here for release notes: https://github.com/ArchiveBox/ArchiveBox/releases' \
    com.docker.extension.categories='database,utility-tools'

ARG TARGETPLATFORM
ARG TARGETOS

@@ -73,7 +87,9 @@ COPY --chown=root:root --chmod=755 package.json "$CODE_DIR/"
RUN grep '"version": ' "${CODE_DIR}/package.json" | awk -F'"' '{print $4}' > /VERSION.txt

# Force apt to leave downloaded binaries in /var/cache/apt (massively speeds up Docker builds)
RUN echo 'Binary::apt::APT::Keep-Downloaded-Packages "true";' > /etc/apt/apt.conf.d/keep-cache \
RUN echo 'Binary::apt::APT::Keep-Downloaded-Packages "1";' > /etc/apt/apt.conf.d/99keep-cache \
    && echo 'APT::Install-Recommends "0";' > /etc/apt/apt.conf.d/99no-intall-recommends \
    && echo 'APT::Install-Suggests "0";' > /etc/apt/apt.conf.d/99no-intall-suggests \
    && rm -f /etc/apt/apt.conf.d/docker-clean

# Print debug info about build and save it to disk, for human eyes only, not used by anything else

@@ -106,10 +122,10 @@ RUN echo "[*] Setting up $ARCHIVEBOX_USER user uid=${DEFAULT_PUID}..." \
# Install system apt dependencies (adding backports to access more recent apt updates)
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$TARGETVARIANT \
    echo "[+] Installing APT base system dependencies for $TARGETPLATFORM..." \
    && echo 'deb https://deb.debian.org/debian bookworm-backports main contrib non-free' >> /etc/apt/sources.list.d/backports.list \
    && echo 'deb https://deb.debian.org/debian bookworm-backports main contrib non-free' > /etc/apt/sources.list.d/backports.list \
    && mkdir -p /etc/apt/keyrings \
    && apt-get update -qq \
    && apt-get install -qq -y -t bookworm-backports --no-install-recommends \
    && apt-get install -qq -y -t bookworm-backports \
        # 1. packaging dependencies
        apt-transport-https ca-certificates apt-utils gnupg2 curl wget \
        # 2. docker and init system dependencies

@@ -120,27 +136,13 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$T

######### Language Environments ####################################

# Install Node environment
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$TARGETVARIANT --mount=type=cache,target=/root/.npm,sharing=locked,id=npm-$TARGETARCH$TARGETVARIANT \
    echo "[+] Installing Node $NODE_VERSION environment in $NODE_MODULES..." \
    && echo "deb [signed-by=/etc/apt/keyrings/nodesource.gpg] https://deb.nodesource.com/node_${NODE_VERSION}.x nodistro main" >> /etc/apt/sources.list.d/nodejs.list \
    && curl -fsSL "https://deb.nodesource.com/gpgkey/nodesource-repo.gpg.key" | gpg --dearmor | gpg --dearmor -o /etc/apt/keyrings/nodesource.gpg \
    && apt-get update -qq \
    && apt-get install -qq -y -t bookworm-backports --no-install-recommends \
        nodejs libatomic1 python3-minimal \
    && rm -rf /var/lib/apt/lists/* \
    # Update NPM to latest version
    && npm i -g npm --cache /root/.npm \
    # Save version info
    && ( \
        which node && node --version \
        && which npm && npm --version \
        && echo -e '\n\n' \
    ) | tee -a /VERSION.txt

# Install Python environment
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$TARGETVARIANT --mount=type=cache,target=/root/.cache/pip,sharing=locked,id=pip-$TARGETARCH$TARGETVARIANT \
    echo "[+] Setting up Python $PYTHON_VERSION runtime..." \
    # && apt-get update -qq \
    # && apt-get install -qq -y -t bookworm-backports --no-upgrade \
    #     python${PYTHON_VERSION} python${PYTHON_VERSION}-minimal python3-pip \
    # && rm -rf /var/lib/apt/lists/* \
    # tell PDM to allow using global system python site packages
    # && rm /usr/lib/python3*/EXTERNALLY-MANAGED \
    # create global virtual environment GLOBAL_VENV to use (better than using pip install --global)

@@ -157,17 +159,37 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$T
    && echo -e '\n\n' \
    ) | tee -a /VERSION.txt


# Install Node environment
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$TARGETVARIANT --mount=type=cache,target=/root/.npm,sharing=locked,id=npm-$TARGETARCH$TARGETVARIANT \
    echo "[+] Installing Node $NODE_VERSION environment in $NODE_MODULES..." \
    && echo "deb [signed-by=/etc/apt/keyrings/nodesource.gpg] https://deb.nodesource.com/node_${NODE_VERSION}.x nodistro main" >> /etc/apt/sources.list.d/nodejs.list \
    && curl -fsSL "https://deb.nodesource.com/gpgkey/nodesource-repo.gpg.key" | gpg --dearmor | gpg --dearmor -o /etc/apt/keyrings/nodesource.gpg \
    && apt-get update -qq \
    && apt-get install -qq -y -t bookworm-backports --no-upgrade libatomic1 \
    && apt-get install -y -t bookworm-backports --no-upgrade \
        nodejs \
    && rm -rf /var/lib/apt/lists/* \
    # Update NPM to latest version
    && npm i -g npm --cache /root/.npm \
    # Save version info
    && ( \
        which node && node --version \
        && which npm && npm --version \
        && echo -e '\n\n' \
    ) | tee -a /VERSION.txt


######### Extractor Dependencies ##################################

# Install apt dependencies
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$TARGETVARIANT --mount=type=cache,target=/root/.cache/pip,sharing=locked,id=pip-$TARGETARCH$TARGETVARIANT \
    echo "[+] Installing APT extractor dependencies globally using apt..." \
    && apt-get update -qq \
    && apt-get install -qq -y -t bookworm-backports --no-install-recommends \
    && apt-get install -qq -y -t bookworm-backports \
        curl wget git yt-dlp ffmpeg ripgrep \
        # Packages we have also needed in the past:
        # youtube-dl wget2 aria2 python3-pyxattr rtmpdump libfribidi-bin mpv \
        # fontconfig fonts-ipafont-gothic fonts-wqy-zenhei fonts-thai-tlwg fonts-kacst fonts-symbola fonts-noto fonts-freefont-ttf \
    && rm -rf /var/lib/apt/lists/* \
    # Save version info
    && ( \

@@ -183,18 +205,21 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$T
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$TARGETVARIANT --mount=type=cache,target=/root/.cache/pip,sharing=locked,id=pip-$TARGETARCH$TARGETVARIANT --mount=type=cache,target=/root/.cache/ms-playwright,sharing=locked,id=browsers-$TARGETARCH$TARGETVARIANT \
    echo "[+] Installing Browser binary dependencies to $PLAYWRIGHT_BROWSERS_PATH..." \
    && apt-get update -qq \
    && if [[ "$TARGETPLATFORM" == *amd64* || "$TARGETPLATFORM" == *arm64* ]]; then \
        # install Chromium using playwright
        pip install playwright \
        && cp -r /root/.cache/ms-playwright "$PLAYWRIGHT_BROWSERS_PATH" \
        && playwright install --with-deps chromium \
        && export CHROME_BINARY="$(python -c 'from playwright.sync_api import sync_playwright; print(sync_playwright().start().chromium.executable_path)')"; \
    else \
        # fall back to installing Chromium via apt-get on platforms not supported by playwright (e.g. risc, ARMv7, etc.)
        apt-get install -qq -y -t bookworm-backports --no-install-recommends \
            chromium fontconfig fonts-ipafont-gothic fonts-wqy-zenhei fonts-thai-tlwg fonts-kacst fonts-symbola fonts-noto fonts-freefont-ttf \
        && export CHROME_BINARY="$(which chromium)"; \
    fi \
    && apt-get install -qq -y -t bookworm-backports \
        fontconfig fonts-ipafont-gothic fonts-wqy-zenhei fonts-thai-tlwg fonts-khmeros fonts-kacst fonts-symbola fonts-noto fonts-freefont-ttf \
        at-spi2-common fonts-liberation fonts-noto-color-emoji fonts-tlwg-loma-otf fonts-unifont libatk-bridge2.0-0 libatk1.0-0 libatspi2.0-0 libavahi-client3 \
        libavahi-common-data libavahi-common3 libcups2 libfontenc1 libice6 libnspr4 libnss3 libsm6 libunwind8 \
        libxaw7 libxcomposite1 libxdamage1 libxfont2 \
        libxkbfile1 libxmu6 libxpm4 libxt6 x11-xkb-utils xfonts-encodings \
        # xfonts-scalable xfonts-utils xserver-common xvfb \
    # chrome can run without dbus/upower technically, it complains about missing dbus but should run ok anyway
    # libxss1 dbus dbus-x11 upower \
    # && service dbus start \
    # install Chromium using playwright
    && pip install playwright \
    && cp -r /root/.cache/ms-playwright "$PLAYWRIGHT_BROWSERS_PATH" \
    && playwright install chromium \
    && export CHROME_BINARY="$(python -c 'from playwright.sync_api import sync_playwright; print(sync_playwright().start().chromium.executable_path)')" \
    && rm -rf /var/lib/apt/lists/* \
    && ln -s "$CHROME_BINARY" /usr/bin/chromium-browser \
    && mkdir -p "/home/${ARCHIVEBOX_USER}/.config/chromium/Crash Reports/pending/" \

@@ -227,8 +252,8 @@ COPY --chown=root:root --chmod=755 "./pyproject.toml" "requirements.txt" "$CODE_
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$TARGETVARIANT --mount=type=cache,target=/root/.cache/pip,sharing=locked,id=pip-$TARGETARCH$TARGETVARIANT \
    echo "[+] Installing PIP ArchiveBox dependencies from requirements.txt for ${TARGETPLATFORM}..." \
    && apt-get update -qq \
    && apt-get install -qq -y -t bookworm-backports --no-install-recommends \
        build-essential \
    && apt-get install -qq -y -t bookworm-backports \
        # build-essential \
        libssl-dev libldap2-dev libsasl2-dev \
        python3-ldap python3-msgpack python3-mutagen python3-regex python3-pycryptodome procps \
    # && ln -s "$GLOBAL_VENV" "$APP_VENV" \

@@ -238,8 +263,8 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$T
    # && pdm export -o requirements.txt --without-hashes \
    # && source $GLOBAL_VENV/bin/activate \
    && pip install -r requirements.txt \
    && apt-get purge -y \
        build-essential \
    # && apt-get purge -y \
    #     build-essential \
    && apt-get autoremove -y \
    && rm -rf /var/lib/apt/lists/*

@@ -249,7 +274,7 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$T
    echo "[*] Installing PIP ArchiveBox package from $CODE_DIR..." \
    # && apt-get update -qq \
    # install C compiler to build deps on platforms that dont have 32-bit wheels available on pypi
    # && apt-get install -qq -y -t bookworm-backports --no-install-recommends \
    # && apt-get install -qq -y -t bookworm-backports \
    #     build-essential \
    # INSTALL ARCHIVEBOX python package globally from CODE_DIR, with all optional dependencies
    && pip install -e "$CODE_DIR"[sonic,ldap] \

@@ -262,9 +287,15 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$T

# Setup ArchiveBox runtime config
WORKDIR "$DATA_DIR"
ENV IN_DOCKER=True
ENV IN_DOCKER=True \
    DISPLAY=novnc:0.0 \
    CUSTOM_TEMPLATES_DIR=/data/templates \
    CHROME_USER_DATA_DIR=/data/personas/Default/chromium \
    GOOGLE_API_KEY=no \
    GOOGLE_DEFAULT_CLIENT_ID=no \
    GOOGLE_DEFAULT_CLIENT_SECRET=no \
    ALLOWED_HOSTS=*
## No need to set explicitly, these values will be autodetected by archivebox in docker:
    # CHROME_SANDBOX=False \
    # WGET_BINARY="wget" \
    # YOUTUBEDL_BINARY="yt-dlp" \
    # CHROME_BINARY="/usr/bin/chromium-browser" \

@@ -289,9 +320,8 @@ WORKDIR "$DATA_DIR"
VOLUME "$DATA_DIR"
EXPOSE 8000

# Optional:
# HEALTHCHECK --interval=30s --timeout=20s --retries=15 \
#     CMD curl --silent 'http://localhost:8000/admin/login/' || exit 1
HEALTHCHECK --interval=30s --timeout=20s --retries=15 \
    CMD curl --silent 'http://localhost:8000/health/' | grep -q 'OK'

ENTRYPOINT ["dumb-init", "--", "/app/bin/docker_entrypoint.sh"]
CMD ["archivebox", "server", "--quick-init", "0.0.0.0:8000"]
@@ -1 +1,7 @@
__package__ = 'archivebox'


# monkey patch django timezone to add back utc (it was removed in Django 5.0)
import datetime
from django.utils import timezone
timezone.utc = datetime.timezone.utc
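For context, this shim exists because code written against older Django may still reference `timezone.utc`, which Django 5.0 removed. A minimal sketch of the effect, assuming Django 5.0 or later:

```python
import datetime
from django.utils import timezone

# Django 5.0 removed the timezone.utc alias, so referencing timezone.utc
# would raise AttributeError without the patch:
timezone.utc = datetime.timezone.utc   # the same statement the diff above adds at import time

# Legacy call sites keep working unchanged:
now = datetime.datetime.now(timezone.utc)
print(now.isoformat())
```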
archivebox/api/__init__.py (new file): 1 line

@@ -0,0 +1 @@
__package__ = 'archivebox.api'
archivebox/api/apps.py (new file): 7 lines

@@ -0,0 +1,7 @@
__package__ = 'archivebox.api'

from django.apps import AppConfig


class APIConfig(AppConfig):
    name = 'api'
archivebox/api/auth.py (new file): 107 lines

@@ -0,0 +1,107 @@
__package__ = 'archivebox.api'

from typing import Optional

from django.http import HttpRequest
from django.contrib.auth import login
from django.contrib.auth import authenticate
from django.contrib.auth.models import AbstractBaseUser

from ninja.security import HttpBearer, APIKeyQuery, APIKeyHeader, HttpBasicAuth, django_auth_superuser


def auth_using_token(token, request: Optional[HttpRequest]=None) -> Optional[AbstractBaseUser]:
    """Given an API token string, check if a corresponding non-expired APIToken exists, and return its user"""
    from api.models import APIToken    # lazy import model to avoid loading it at urls.py import time

    user = None

    submitted_empty_form = token in ('string', '', None)
    if submitted_empty_form:
        user = request.user    # see if user is authed via django session and use that as the default
    else:
        try:
            token = APIToken.objects.get(token=token)
            if token.is_valid():
                user = token.user
        except APIToken.DoesNotExist:
            pass

    if not user:
        print('[❌] Failed to authenticate API user using API Key:', request)

    return None

def auth_using_password(username, password, request: Optional[HttpRequest]=None) -> Optional[AbstractBaseUser]:
    """Given a username and password, check if they are valid and return the corresponding user"""
    user = None

    submitted_empty_form = (username, password) in (('string', 'string'), ('', ''), (None, None))
    if submitted_empty_form:
        user = request.user    # see if user is authed via django session and use that as the default
    else:
        user = authenticate(
            username=username,
            password=password,
        )

    if not user:
        print('[❌] Failed to authenticate API user using API Key:', request)

    return user


### Base Auth Types

class APITokenAuthCheck:
    """The base class for authentication methods that use an api.models.APIToken"""
    def authenticate(self, request: HttpRequest, key: Optional[str]=None) -> Optional[AbstractBaseUser]:
        user = auth_using_token(
            token=key,
            request=request,
        )
        if user is not None:
            login(request, user, backend='django.contrib.auth.backends.ModelBackend')
        return user

class UserPassAuthCheck:
    """The base class for authentication methods that use a username & password"""
    def authenticate(self, request: HttpRequest, username: Optional[str]=None, password: Optional[str]=None) -> Optional[AbstractBaseUser]:
        user = auth_using_password(
            username=username,
            password=password,
            request=request,
        )
        if user is not None:
            login(request, user, backend='django.contrib.auth.backends.ModelBackend')
        return user


### Django-Ninja-Provided Auth Methods

class HeaderTokenAuth(APITokenAuthCheck, APIKeyHeader):
    """Allow authenticating by passing X-API-Key=xyz as a request header"""
    param_name = "X-ArchiveBox-API-Key"

class BearerTokenAuth(APITokenAuthCheck, HttpBearer):
    """Allow authenticating by passing Bearer=xyz as a request header"""
    pass

class QueryParamTokenAuth(APITokenAuthCheck, APIKeyQuery):
    """Allow authenticating by passing api_key=xyz as a GET/POST query parameter"""
    param_name = "api_key"

class UsernameAndPasswordAuth(UserPassAuthCheck, HttpBasicAuth):
    """Allow authenticating by passing username & password via HTTP Basic Authentication (not recommended)"""
    pass


### Enabled Auth Methods

API_AUTH_METHODS = [
    HeaderTokenAuth(),
    BearerTokenAuth(),
    QueryParamTokenAuth(),
    django_auth_superuser,
    UsernameAndPasswordAuth(),
]
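Taken together, the three token transports registered above all accept the same APIToken value. A hedged client-side sketch (the base URL and token are placeholders; the /core/archiveresults route used here is defined later in this diff in v1_core.py):

```python
import requests

BASE = "http://localhost:8000"                 # placeholder ArchiveBox server
TOKEN = "0123456789abcdef0123456789abcdef"     # placeholder 32-char APIToken

# 1. Custom header (HeaderTokenAuth, param_name="X-ArchiveBox-API-Key")
requests.get(f"{BASE}/api/v1/core/archiveresults",
             headers={"X-ArchiveBox-API-Key": TOKEN})

# 2. Standard Bearer token (BearerTokenAuth)
requests.get(f"{BASE}/api/v1/core/archiveresults",
             headers={"Authorization": f"Bearer {TOKEN}"})

# 3. Query parameter (QueryParamTokenAuth, param_name="api_key")
requests.get(f"{BASE}/api/v1/core/archiveresults",
             params={"api_key": TOKEN})
```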
archivebox/api/migrations/0001_initial.py (new file): 29 lines

@@ -0,0 +1,29 @@
# Generated by Django 4.2.11 on 2024-04-25 04:19

import api.models
from django.conf import settings
from django.db import migrations, models
import django.db.models.deletion
import uuid


class Migration(migrations.Migration):

    initial = True

    dependencies = [
        migrations.swappable_dependency(settings.AUTH_USER_MODEL),
    ]

    operations = [
        migrations.CreateModel(
            name='APIToken',
            fields=[
                ('id', models.UUIDField(default=uuid.uuid4, editable=False, primary_key=True, serialize=False)),
                ('token', models.CharField(default=api.models.generate_secret_token, max_length=32, unique=True)),
                ('created', models.DateTimeField(auto_now_add=True)),
                ('expires', models.DateTimeField(blank=True, null=True)),
                ('user', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL)),
            ],
        ),
    ]
archivebox/api/migrations/0002_alter_apitoken_options.py (new file): 17 lines

@@ -0,0 +1,17 @@
# Generated by Django 5.0.4 on 2024-04-26 05:28

from django.db import migrations


class Migration(migrations.Migration):

    dependencies = [
        ('api', '0001_initial'),
    ]

    operations = [
        migrations.AlterModelOptions(
            name='apitoken',
            options={'verbose_name': 'API Key', 'verbose_name_plural': 'API Keys'},
        ),
    ]
archivebox/api/migrations/__init__.py (new file): 0 lines
archivebox/api/models.py (new file): 63 lines

@@ -0,0 +1,63 @@
__package__ = 'archivebox.api'

import uuid
import secrets
from datetime import timedelta

from django.conf import settings
from django.db import models
from django.utils import timezone

from django_stubs_ext.db.models import TypedModelMeta


def generate_secret_token() -> str:
    # returns cryptographically secure string with len() == 32
    return secrets.token_hex(16)


class APIToken(models.Model):
    id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False)

    user = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE)
    token = models.CharField(max_length=32, default=generate_secret_token, unique=True)

    created = models.DateTimeField(auto_now_add=True)
    expires = models.DateTimeField(null=True, blank=True)

    class Meta(TypedModelMeta):
        verbose_name = "API Key"
        verbose_name_plural = "API Keys"

    def __str__(self) -> str:
        return self.token

    def __repr__(self) -> str:
        return f'<APIToken user={self.user.username} token=************{self.token[-4:]}>'

    def __json__(self) -> dict:
        return {
            "TYPE": "APIToken",
            "id": str(self.id),
            "user_id": str(self.user.id),
            "user_username": self.user.username,
            "token": self.token,
            "created": self.created.isoformat(),
            "expires": self.expires_as_iso8601,
        }

    @property
    def expires_as_iso8601(self):
        """Returns the expiry date of the token in ISO 8601 format or a date 100 years in the future if none."""
        expiry_date = self.expires or (timezone.now() + timedelta(days=365 * 100))

        return expiry_date.isoformat()

    def is_valid(self, for_date=None):
        for_date = for_date or timezone.now()

        if self.expires and self.expires < for_date:
            return False

        return True
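A small sketch of the expiry semantics defined by is_valid() above, using unsaved in-memory instances inside a configured Django context (e.g. a Django shell); this is illustration, not part of the diff:

```python
from datetime import timedelta
from django.utils import timezone
from api.models import APIToken

token = APIToken()                     # token string auto-generated by the field default
assert len(token.token) == 32          # secrets.token_hex(16) -> 32 hex chars
assert token.is_valid()                # expires=None means the token never expires

token.expires = timezone.now() - timedelta(days=1)
assert not token.is_valid()            # expiry date in the past -> invalid

# is_valid() can also be evaluated against an arbitrary point in time:
assert token.is_valid(for_date=timezone.now() - timedelta(days=2))
```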
archivebox/api/tests.py (new file): 30 lines

@@ -0,0 +1,30 @@
__package__ = 'archivebox.api'

from django.test import TestCase
from ninja.testing import TestClient

from .routes_cli import router

class ArchiveBoxCLIAPITestCase(TestCase):
    def setUp(self):
        self.client = TestClient(router)

    def test_add_endpoint(self):
        response = self.client.post("/add", json={"urls": ["http://example.com"], "tag": "testTag1,testTag2"})
        self.assertEqual(response.status_code, 200)
        self.assertTrue(response.json()["success"])

    def test_remove_endpoint(self):
        response = self.client.post("/remove", json={"filter_patterns": ["http://example.com"]})
        self.assertEqual(response.status_code, 200)
        self.assertTrue(response.json()["success"])

    def test_update_endpoint(self):
        response = self.client.post("/update", json={})
        self.assertEqual(response.status_code, 200)
        self.assertTrue(response.json()["success"])

    def test_list_all_endpoint(self):
        response = self.client.post("/list_all", json={})
        self.assertEqual(response.status_code, 200)
        self.assertTrue(response.json()["success"])
archivebox/api/urls.py (new file): 17 lines

@@ -0,0 +1,17 @@
__package__ = 'archivebox.api'

from django.urls import path
from django.views.generic.base import RedirectView

from .v1_api import urls as v1_api_urls

urlpatterns = [
    path("", RedirectView.as_view(url='/api/v1')),

    path("v1/", v1_api_urls),
    path("v1", RedirectView.as_view(url='/api/v1/docs')),

    # ... v2 can be added here ...
    # path("v2/", v2_api_urls),
    # path("v2", RedirectView.as_view(url='/api/v2/docs')),
]
archivebox/api/v1_api.py (new file): 111 lines

@@ -0,0 +1,111 @@
__package__ = 'archivebox.api'


from io import StringIO
from traceback import format_exception
from contextlib import redirect_stdout, redirect_stderr

from django.http import HttpRequest, HttpResponse
from django.core.exceptions import ObjectDoesNotExist, EmptyResultSet, PermissionDenied

from ninja import NinjaAPI, Swagger

# TODO: explore adding https://eadwincode.github.io/django-ninja-extra/

from api.auth import API_AUTH_METHODS
from ..config import VERSION, COMMIT_HASH


COMMIT_HASH = COMMIT_HASH or 'unknown'

html_description=f'''
<h3>Welcome to your ArchiveBox server's REST API <code>[v1 ALPHA]</code> homepage!</h3>
<br/>
<i><b>WARNING: This API is still in an early development stage and may change!</b></i>
<br/>
<ul>
<li>⬅️ Manage your server: <a href="/admin/api/"><b>Setup API Keys</b></a>, <a href="/admin/">Go to your Server Admin UI</a>, <a href="/">Go to your Snapshots list</a>
<li>💬 Ask questions and get help here: <a href="https://zulip.archivebox.io">ArchiveBox Chat Forum</a></li>
<li>🐞 Report API bugs here: <a href="https://github.com/ArchiveBox/ArchiveBox/issues">Github Issues</a></li>
<li>📚 ArchiveBox Documentation: <a href="https://github.com/ArchiveBox/ArchiveBox/wiki">Github Wiki</a></li>
<li>📜 See the API source code: <a href="https://github.com/ArchiveBox/ArchiveBox/blob/dev/archivebox/api"><code>archivebox/api/</code></a></li>
</ul>
<small>Served by ArchiveBox v{VERSION} (<a href="https://github.com/ArchiveBox/ArchiveBox/commit/{COMMIT_HASH}"><code>{COMMIT_HASH[:8]}</code></a>), API powered by <a href="https://django-ninja.dev/"><code>django-ninja</code></a>.</small>
'''


def register_urls(api: NinjaAPI) -> NinjaAPI:
    api.add_router('/auth/', 'api.v1_auth.router')
    api.add_router('/core/', 'api.v1_core.router')
    api.add_router('/cli/', 'api.v1_cli.router')
    return api


class NinjaAPIWithIOCapture(NinjaAPI):
    def create_temporal_response(self, request: HttpRequest) -> HttpResponse:
        stdout, stderr = StringIO(), StringIO()

        with redirect_stderr(stderr):
            with redirect_stdout(stdout):
                request.stdout = stdout
                request.stderr = stderr

                response = super().create_temporal_response(request)

        print('RESPONDING NOW', response)

        return response


api = NinjaAPIWithIOCapture(
    title='ArchiveBox API',
    description=html_description,
    version='1.0.0',
    csrf=False,
    auth=API_AUTH_METHODS,
    urls_namespace="api",
    docs=Swagger(settings={"persistAuthorization": True}),
    # docs_decorator=login_required,
    # renderer=ORJSONRenderer(),
)
api = register_urls(api)
urls = api.urls


@api.exception_handler(Exception)
def generic_exception_handler(request, err):
    status = 503
    if isinstance(err, (ObjectDoesNotExist, EmptyResultSet, PermissionDenied)):
        status = 404

    print(''.join(format_exception(err)))

    return api.create_response(
        request,
        {
            "succeeded": False,
            "message": f'{err.__class__.__name__}: {err}',
            "errors": [
                ''.join(format_exception(err)),
                # or send simpler parent-only traceback:
                # *([str(err.__context__)] if getattr(err, '__context__', None) else []),
            ],
        },
        status=status,
    )



# import orjson
# from ninja.renderers import BaseRenderer
# class ORJSONRenderer(BaseRenderer):
#     media_type = "application/json"
#     def render(self, request, data, *, response_status):
#         return {
#             "success": True,
#             "errors": [],
#             "result": data,
#             "stdout": ansi_to_html(stdout.getvalue().strip()),
#             "stderr": ansi_to_html(stderr.getvalue().strip()),
#         }
#         return orjson.dumps(data)
archivebox/api/v1_auth.py (new file): 52 lines

@@ -0,0 +1,52 @@
__package__ = 'archivebox.api'

from typing import Optional

from ninja import Router, Schema

from api.models import APIToken
from api.auth import auth_using_token, auth_using_password


router = Router(tags=['Authentication'])


class PasswordAuthSchema(Schema):
    """Schema for a /get_api_token request"""
    username: Optional[str] = None
    password: Optional[str] = None


@router.post("/get_api_token", auth=None, summary='Generate an API token for a given username & password (or currently logged-in user)')    # auth=None because they are not authed yet
def get_api_token(request, auth_data: PasswordAuthSchema):
    user = auth_using_password(
        username=auth_data.username,
        password=auth_data.password,
        request=request,
    )

    if user:
        # TODO: support multiple tokens in the future, for now we just have one per user
        api_token, created = APIToken.objects.get_or_create(user=user)

        return api_token.__json__()

    return {"success": False, "errors": ["Invalid credentials"]}



class TokenAuthSchema(Schema):
    """Schema for a /check_api_token request"""
    token: str


@router.post("/check_api_token", auth=None, summary='Validate an API token to make sure its valid and non-expired')    # auth=None because they are not authed yet
def check_api_token(request, token_data: TokenAuthSchema):
    user = auth_using_token(
        token=token_data.token,
        request=request,
    )
    if user:
        return {"success": True, "user_id": str(user.id)}

    return {"success": False, "user_id": None}
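An end-to-end sketch of the bootstrap flow these two unauthenticated routes enable (server URL and credentials are placeholders):

```python
import requests

BASE = "http://localhost:8000"   # placeholder ArchiveBox server

# Exchange a username & password for an API token (get_or_create, one per user)
resp = requests.post(f"{BASE}/api/v1/auth/get_api_token",
                     json={"username": "admin", "password": "<your-password>"})
token = resp.json()["token"]     # APIToken.__json__() includes the raw token string

# Confirm the token is valid and non-expired
check = requests.post(f"{BASE}/api/v1/auth/check_api_token",
                      json={"token": token})
assert check.json()["success"] is True
```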
archivebox/api/v1_cli.py (new file): 234 lines

@@ -0,0 +1,234 @@
__package__ = 'archivebox.api'

from typing import List, Dict, Any, Optional
from enum import Enum

from ninja import Router, Schema

from ..main import (
    add,
    remove,
    update,
    list_all,
    schedule,
)
from ..util import ansi_to_html
from ..config import ONLY_NEW


# router for API that exposes archivebox cli subcommands as REST endpoints
router = Router(tags=['ArchiveBox CLI Sub-Commands'])


# Schemas

JSONType = List[Any] | Dict[str, Any] | bool | int | str | None

class CLICommandResponseSchema(Schema):
    success: bool
    errors: List[str]
    result: JSONType
    stdout: str
    stderr: str

class FilterTypeChoices(str, Enum):
    exact = 'exact'
    substring = 'substring'
    regex = 'regex'
    domain = 'domain'
    tag = 'tag'
    timestamp = 'timestamp'

class StatusChoices(str, Enum):
    indexed = 'indexed'
    archived = 'archived'
    unarchived = 'unarchived'
    present = 'present'
    valid = 'valid'
    invalid = 'invalid'
    duplicate = 'duplicate'
    orphaned = 'orphaned'
    corrupted = 'corrupted'
    unrecognized = 'unrecognized'


class AddCommandSchema(Schema):
    urls: List[str]
    tag: str = ""
    depth: int = 0
    update: bool = not ONLY_NEW    # Default to the opposite of ONLY_NEW
    update_all: bool = False
    index_only: bool = False
    overwrite: bool = False
    init: bool = False
    extractors: str = ""
    parser: str = "auto"

class UpdateCommandSchema(Schema):
    resume: Optional[float] = 0
    only_new: bool = ONLY_NEW
    index_only: bool = False
    overwrite: bool = False
    after: Optional[float] = 0
    before: Optional[float] = 999999999999999
    status: Optional[StatusChoices] = StatusChoices.unarchived
    filter_type: Optional[str] = FilterTypeChoices.substring
    filter_patterns: Optional[List[str]] = ['https://example.com']
    extractors: Optional[str] = ""

class ScheduleCommandSchema(Schema):
    import_path: Optional[str] = None
    add: bool = False
    every: Optional[str] = None
    tag: str = ''
    depth: int = 0
    overwrite: bool = False
    update: bool = not ONLY_NEW
    clear: bool = False

class ListCommandSchema(Schema):
    filter_patterns: Optional[List[str]] = ['https://example.com']
    filter_type: str = FilterTypeChoices.substring
    status: Optional[StatusChoices] = StatusChoices.indexed
    after: Optional[float] = 0
    before: Optional[float] = 999999999999999
    sort: str = 'added'
    as_json: bool = True
    as_html: bool = False
    as_csv: str | bool = 'timestamp,url'
    with_headers: bool = False

class RemoveCommandSchema(Schema):
    delete: bool = True
    after: Optional[float] = 0
    before: Optional[float] = 999999999999999
    filter_type: str = FilterTypeChoices.exact
    filter_patterns: Optional[List[str]] = ['https://example.com']




@router.post("/add", response=CLICommandResponseSchema, summary='archivebox add [args] [urls]')
def cli_add(request, args: AddCommandSchema):
    result = add(
        urls=args.urls,
        tag=args.tag,
        depth=args.depth,
        update=args.update,
        update_all=args.update_all,
        index_only=args.index_only,
        overwrite=args.overwrite,
        init=args.init,
        extractors=args.extractors,
        parser=args.parser,
    )

    return {
        "success": True,
        "errors": [],
        "result": result,
        "stdout": ansi_to_html(request.stdout.getvalue().strip()),
        "stderr": ansi_to_html(request.stderr.getvalue().strip()),
    }


@router.post("/update", response=CLICommandResponseSchema, summary='archivebox update [args] [filter_patterns]')
def cli_update(request, args: UpdateCommandSchema):
    result = update(
        resume=args.resume,
        only_new=args.only_new,
        index_only=args.index_only,
        overwrite=args.overwrite,
        before=args.before,
        after=args.after,
        status=args.status,
        filter_type=args.filter_type,
        filter_patterns=args.filter_patterns,
        extractors=args.extractors,
    )
    return {
        "success": True,
        "errors": [],
        "result": result,
        "stdout": ansi_to_html(request.stdout.getvalue().strip()),
        "stderr": ansi_to_html(request.stderr.getvalue().strip()),
    }


@router.post("/schedule", response=CLICommandResponseSchema, summary='archivebox schedule [args] [import_path]')
def cli_schedule(request, args: ScheduleCommandSchema):
    result = schedule(
        import_path=args.import_path,
        add=args.add,
        show=args.show,
        clear=args.clear,
        every=args.every,
        tag=args.tag,
        depth=args.depth,
        overwrite=args.overwrite,
        update=args.update,
    )

    return {
        "success": True,
        "errors": [],
        "result": result,
        "stdout": ansi_to_html(request.stdout.getvalue().strip()),
        "stderr": ansi_to_html(request.stderr.getvalue().strip()),
    }



@router.post("/list", response=CLICommandResponseSchema, summary='archivebox list [args] [filter_patterns]')
def cli_list(request, args: ListCommandSchema):
    result = list_all(
        filter_patterns=args.filter_patterns,
        filter_type=args.filter_type,
        status=args.status,
        after=args.after,
        before=args.before,
        sort=args.sort,
        csv=args.as_csv,
        json=args.as_json,
        html=args.as_html,
        with_headers=args.with_headers,
    )

    result_format = 'txt'
    if args.as_json:
        result_format = "json"
    elif args.as_html:
        result_format = "html"
    elif args.as_csv:
        result_format = "csv"

    return {
        "success": True,
        "errors": [],
        "result": result,
        "result_format": result_format,
        "stdout": ansi_to_html(request.stdout.getvalue().strip()),
        "stderr": ansi_to_html(request.stderr.getvalue().strip()),
    }



@router.post("/remove", response=CLICommandResponseSchema, summary='archivebox remove [args] [filter_patterns]')
def cli_remove(request, args: RemoveCommandSchema):
    result = remove(
        yes=True,    # no way to interactively ask for confirmation via API, so we force yes
        delete=args.delete,
        before=args.before,
        after=args.after,
        filter_type=args.filter_type,
        filter_patterns=args.filter_patterns,
    )
    return {
        "success": True,
        "errors": [],
        "result": result,
        "stdout": ansi_to_html(request.stdout.getvalue().strip()),
        "stderr": ansi_to_html(request.stderr.getvalue().strip()),
    }
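To make the request/response shape concrete, a minimal sketch of calling the /add route above through the mounted /api/v1/cli/ prefix (server URL and token are placeholders; the fields mirror AddCommandSchema):

```python
import requests

resp = requests.post(
    "http://localhost:8000/api/v1/cli/add",      # placeholder server; router mounted at /cli/
    headers={"X-ArchiveBox-API-Key": "<your-token>"},
    json={
        "urls": ["https://example.com"],
        "tag": "testTag1,testTag2",
        "depth": 0,
        "parser": "auto",
    },
)

body = resp.json()                               # CLICommandResponseSchema
print(body["success"], body["errors"])
print(body["stdout"])                            # CLI output, ANSI converted to HTML
```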
210
archivebox/api/v1_core.py
Normal file
210
archivebox/api/v1_core.py
Normal file
|
@ -0,0 +1,210 @@
__package__ = 'archivebox.api'

from uuid import UUID
from typing import List, Optional
from datetime import datetime

from django.shortcuts import get_object_or_404

from ninja import Router, Schema, FilterSchema, Field, Query
from ninja.pagination import paginate

from core.models import Snapshot, ArchiveResult, Tag


router = Router(tags=['Core Models'])


### ArchiveResult #########################################################################

class ArchiveResultSchema(Schema):
    id: UUID

    snapshot_id: UUID
    snapshot_url: str
    snapshot_tags: str

    extractor: str
    cmd: List[str]
    pwd: str
    cmd_version: str
    output: str
    status: str

    created: datetime

    @staticmethod
    def resolve_id(obj):
        return obj.uuid

    @staticmethod
    def resolve_created(obj):
        return obj.start_ts

    @staticmethod
    def resolve_snapshot_url(obj):
        return obj.snapshot.url

    @staticmethod
    def resolve_snapshot_tags(obj):
        return obj.snapshot.tags_str()


class ArchiveResultFilterSchema(FilterSchema):
    id: Optional[UUID] = Field(None, q='uuid')

    search: Optional[str] = Field(None, q=['snapshot__url__icontains', 'snapshot__title__icontains', 'snapshot__tags__name__icontains', 'extractor', 'output__icontains'])
    snapshot_id: Optional[UUID] = Field(None, q='snapshot_id')
    snapshot_url: Optional[str] = Field(None, q='snapshot__url')
    snapshot_tag: Optional[str] = Field(None, q='snapshot__tags__name')

    status: Optional[str] = Field(None, q='status')
    output: Optional[str] = Field(None, q='output__icontains')
    extractor: Optional[str] = Field(None, q='extractor__icontains')
    cmd: Optional[str] = Field(None, q='cmd__0__icontains')
    pwd: Optional[str] = Field(None, q='pwd__icontains')
    cmd_version: Optional[str] = Field(None, q='cmd_version')

    created: Optional[datetime] = Field(None, q='updated')
    created__gte: Optional[datetime] = Field(None, q='updated__gte')
    created__lt: Optional[datetime] = Field(None, q='updated__lt')


@router.get("/archiveresults", response=List[ArchiveResultSchema])
@paginate
def list_archiveresults(request, filters: ArchiveResultFilterSchema = Query(...)):
    qs = ArchiveResult.objects.all()
    results = filters.filter(qs)
    return results


@router.get("/archiveresult/{archiveresult_id}", response=ArchiveResultSchema)
def get_archiveresult(request, archiveresult_id: str):
    archiveresult = get_object_or_404(ArchiveResult, id=archiveresult_id)
    return archiveresult
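
Every field on the FilterSchema above becomes an optional query parameter (note that the created/created__gte/created__lt filters query the model's updated column, per the q= mappings), and @paginate wraps the list in django-ninja's limit/offset envelope. A quick sketch, assuming the router is mounted under /api/v1/core/ (the mount point is not shown in this diff) and auth is already handled:

import requests  # illustrative client only

resp = requests.get(
    'http://127.0.0.1:8000/api/v1/core/archiveresults',   # assumed prefix
    params={'status': 'succeeded', 'extractor': 'wget', 'limit': 10},
)
for item in resp.json()['items']:   # ninja's default paginator returns {'items': [...], 'count': N}
    print(item['snapshot_url'], item['output'])
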

# @router.post("/archiveresult", response=ArchiveResultSchema)
# def create_archiveresult(request, payload: ArchiveResultSchema):
#     archiveresult = ArchiveResult.objects.create(**payload.dict())
#     return archiveresult
#
# @router.put("/archiveresult/{archiveresult_id}", response=ArchiveResultSchema)
# def update_archiveresult(request, archiveresult_id: str, payload: ArchiveResultSchema):
#     archiveresult = get_object_or_404(ArchiveResult, id=archiveresult_id)
#
#     for attr, value in payload.dict().items():
#         setattr(archiveresult, attr, value)
#     archiveresult.save()
#
#     return archiveresult
#
# @router.delete("/archiveresult/{archiveresult_id}")
# def delete_archiveresult(request, archiveresult_id: str):
#     archiveresult = get_object_or_404(ArchiveResult, id=archiveresult_id)
#     archiveresult.delete()
#     return {"success": True}


### Snapshot #########################################################################


class SnapshotSchema(Schema):
    id: UUID

    url: str
    tags: str
    title: Optional[str]
    timestamp: str
    bookmarked: datetime
    added: datetime
    updated: datetime
    archive_path: str

    archiveresults: List[ArchiveResultSchema]

    # @staticmethod
    # def resolve_id(obj):
    #     return str(obj.id)

    @staticmethod
    def resolve_tags(obj):
        return obj.tags_str()

    @staticmethod
    def resolve_archiveresults(obj, context):
        if context['request'].with_archiveresults:
            return obj.archiveresult_set.all().distinct()
        return ArchiveResult.objects.none()


class SnapshotFilterSchema(FilterSchema):
    id: Optional[UUID] = Field(None, q='id')

    search: Optional[str] = Field(None, q=['url__icontains', 'title__icontains', 'tags__name__icontains'])
    url: Optional[str] = Field(None, q='url')
    tag: Optional[str] = Field(None, q='tags__name')
    title: Optional[str] = Field(None, q='title__icontains')

    timestamp: Optional[str] = Field(None, q='timestamp__startswith')

    added: Optional[datetime] = Field(None, q='added')
    added__gte: Optional[datetime] = Field(None, q='added__gte')
    added__lt: Optional[datetime] = Field(None, q='added__lt')


@router.get("/snapshots", response=List[SnapshotSchema])
@paginate
def list_snapshots(request, filters: SnapshotFilterSchema = Query(...), with_archiveresults: bool=True):
    request.with_archiveresults = with_archiveresults

    qs = Snapshot.objects.all()
    results = filters.filter(qs)
    return results


@router.get("/snapshot/{snapshot_id}", response=SnapshotSchema)
def get_snapshot(request, snapshot_id: str, with_archiveresults: bool=True):
    request.with_archiveresults = with_archiveresults
    snapshot = get_object_or_404(Snapshot, id=snapshot_id)
    return snapshot
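
SnapshotSchema embeds the full list of ArchiveResults per snapshot, which can be expensive for large archives, so the with_archiveresults flag is threaded through the request object into resolve_archiveresults(). A sketch of both modes, under the same assumed /api/v1/core/ prefix as above:

import requests  # illustrative client only

base = 'http://127.0.0.1:8000/api/v1/core'   # assumed prefix
snapshot_id = '6b2bb6f0-0000-0000-0000-000000000000'   # placeholder UUID

# full payload, including every ArchiveResult for the snapshot:
full = requests.get(f'{base}/snapshot/{snapshot_id}', params={'with_archiveresults': True}).json()

# metadata only; resolve_archiveresults() returns ArchiveResult.objects.none():
slim = requests.get(f'{base}/snapshot/{snapshot_id}', params={'with_archiveresults': False}).json()
assert slim['archiveresults'] == []
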

# @router.post("/snapshot", response=SnapshotSchema)
# def create_snapshot(request, payload: SnapshotSchema):
#     snapshot = Snapshot.objects.create(**payload.dict())
#     return snapshot
#
# @router.put("/snapshot/{snapshot_id}", response=SnapshotSchema)
# def update_snapshot(request, snapshot_id: str, payload: SnapshotSchema):
#     snapshot = get_object_or_404(Snapshot, id=snapshot_id)
#
#     for attr, value in payload.dict().items():
#         setattr(snapshot, attr, value)
#     snapshot.save()
#
#     return snapshot
#
# @router.delete("/snapshot/{snapshot_id}")
# def delete_snapshot(request, snapshot_id: str):
#     snapshot = get_object_or_404(Snapshot, id=snapshot_id)
#     snapshot.delete()
#     return {"success": True}


### Tag #########################################################################


class TagSchema(Schema):
    name: str
    slug: str


@router.get("/tags", response=List[TagSchema])
def list_tags(request):
    return Tag.objects.all()

archivebox/config.py

@@ -72,7 +72,7 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
        'TIMEOUT': {'type': int, 'default': 60},
        'MEDIA_TIMEOUT': {'type': int, 'default': 3600},
        'OUTPUT_PERMISSIONS': {'type': str, 'default': '644'},
-       'RESTRICT_FILE_NAMES': {'type': str, 'default': 'windows'},
+       'RESTRICT_FILE_NAMES': {'type': str, 'default': 'windows'},  # TODO: move this to be a default WGET_ARGS

        'URL_DENYLIST': {'type': str, 'default': r'\.(css|js|otf|ttf|woff|woff2|gstatic\.com|googleapis\.com/css)(\?.*)?$', 'aliases': ('URL_BLACKLIST',)},  # to avoid downloading code assets as their own pages
        'URL_ALLOWLIST': {'type': str, 'default': None, 'aliases': ('URL_WHITELIST',)},

@@ -112,6 +112,7 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
        'LDAP_FIRSTNAME_ATTR': {'type': str, 'default': None},
        'LDAP_LASTNAME_ATTR': {'type': str, 'default': None},
        'LDAP_EMAIL_ATTR': {'type': str, 'default': None},
+       'LDAP_CREATE_SUPERUSER': {'type': bool, 'default': False},
    },

    'ARCHIVE_METHOD_TOGGLES': {

@@ -136,14 +137,15 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
    },

    'ARCHIVE_METHOD_OPTIONS': {
-       'RESOLUTION': {'type': str, 'default': '1440,2000', 'aliases': ('SCREENSHOT_RESOLUTION',)},
-       'GIT_DOMAINS': {'type': str, 'default': 'github.com,bitbucket.org,gitlab.com,gist.github.com'},
+       'RESOLUTION': {'type': str, 'default': '1440,2000', 'aliases': ('SCREENSHOT_RESOLUTION','WINDOW_SIZE')},
+       'GIT_DOMAINS': {'type': str, 'default': 'github.com,bitbucket.org,gitlab.com,gist.github.com,codeberg.org,gitea.com,git.sr.ht'},
        'CHECK_SSL_VALIDITY': {'type': bool, 'default': True},
        'MEDIA_MAX_SIZE': {'type': str, 'default': '750m'},

-       'CURL_USER_AGENT': {'type': str, 'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/) curl/{CURL_VERSION}'},
-       'WGET_USER_AGENT': {'type': str, 'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/) wget/{WGET_VERSION}'},
-       'CHROME_USER_AGENT': {'type': str, 'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/)'},
+       'USER_AGENT': {'type': str, 'default': None},
+       'CURL_USER_AGENT': {'type': str, 'default': lambda c: c['USER_AGENT'] or 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/) curl/{CURL_VERSION}'},
+       'WGET_USER_AGENT': {'type': str, 'default': lambda c: c['USER_AGENT'] or 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/) wget/{WGET_VERSION}'},
+       'CHROME_USER_AGENT': {'type': str, 'default': lambda c: c['USER_AGENT'] or 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/)'},

        'COOKIES_FILE': {'type': str, 'default': None},
        'CHROME_USER_DATA_DIR': {'type': str, 'default': None},
@@ -151,7 +153,11 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
        'CHROME_TIMEOUT': {'type': int, 'default': 0},
        'CHROME_HEADLESS': {'type': bool, 'default': True},
        'CHROME_SANDBOX': {'type': bool, 'default': lambda c: not c['IN_DOCKER']},
+       'CHROME_EXTRA_ARGS': {'type': list, 'default': None},

        'YOUTUBEDL_ARGS': {'type': list, 'default': lambda c: [
            '--restrict-filenames',
            '--trim-filenames', '128',
            '--write-description',
            '--write-info-json',
            '--write-annotations',

@@ -173,6 +179,7 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
            '--add-metadata',
            '--format=(bv*+ba/b)[filesize<={}][filesize_approx<=?{}]/(bv*+ba/b)'.format(c['MEDIA_MAX_SIZE'], c['MEDIA_MAX_SIZE']),
        ]},
+       'YOUTUBEDL_EXTRA_ARGS': {'type': list, 'default': None},

        'WGET_ARGS': {'type': list, 'default': ['--no-verbose',

@@ -184,12 +191,17 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
            '--no-parent',
            '-e', 'robots=off',
        ]},
+       'WGET_EXTRA_ARGS': {'type': list, 'default': None},
        'CURL_ARGS': {'type': list, 'default': ['--silent',
            '--location',
            '--compressed'
        ]},
+       'CURL_EXTRA_ARGS': {'type': list, 'default': None},
        'GIT_ARGS': {'type': list, 'default': ['--recursive']},
-       'SINGLEFILE_ARGS': {'type': list, 'default' : None},
+       'SINGLEFILE_ARGS': {'type': list, 'default': None},
+       'SINGLEFILE_EXTRA_ARGS': {'type': list, 'default': None},
+       'MERCURY_ARGS': {'type': list, 'default': ['--format=text']},
+       'MERCURY_EXTRA_ARGS': {'type': list, 'default': None},
        'FAVICON_PROVIDER': {'type': str, 'default': 'https://www.google.com/s2/favicons?domain={}'},
    },

@@ -253,7 +265,7 @@ CONFIG_ALIASES = {
    for key, default in section.items()
    for alias in default.get('aliases', ())
}
-USER_CONFIG = {key for section in CONFIG_SCHEMA.values() for key in section.keys()}
+USER_CONFIG = {key: section[key] for section in CONFIG_SCHEMA.values() for key in section.keys()}

def get_real_name(key: str) -> str:
    """get the current canonical name for a given deprecated config key"""
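
Turning USER_CONFIG from a set of key names into a dict of schema entries is what lets the new admin config views (in core/views.py later in this diff) look up a key's type, default, and aliases in one step. A small sketch of the difference, using the TIMEOUT entry shown earlier:

# before: USER_CONFIG was only a set of names
'TIMEOUT' in USER_CONFIG                  # -> True, but that is all it could answer

# after: each key maps to its full schema entry
USER_CONFIG['TIMEOUT']                    # -> {'type': int, 'default': 60}
USER_CONFIG['TIMEOUT']['type'].__name__   # -> 'int', as used by find_config_type() below

Membership tests like 'TIMEOUT' in USER_CONFIG behave the same for a dict as for the old set, so existing call sites keep working.
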
@@ -269,6 +281,8 @@ TEMPLATES_DIR_NAME = 'templates'
ARCHIVE_DIR_NAME = 'archive'
SOURCES_DIR_NAME = 'sources'
LOGS_DIR_NAME = 'logs'
+PERSONAS_DIR_NAME = 'personas'
+CRONTABS_DIR_NAME = 'crontabs'
SQL_INDEX_FILENAME = 'index.sqlite3'
JSON_INDEX_FILENAME = 'index.json'
HTML_INDEX_FILENAME = 'index.html'

@@ -342,9 +356,11 @@ ALLOWED_IN_OUTPUT_DIR = {
    'static',
    'sonic',
    'search.sqlite3',
+   CRONTABS_DIR_NAME,
    ARCHIVE_DIR_NAME,
    SOURCES_DIR_NAME,
    LOGS_DIR_NAME,
+   PERSONAS_DIR_NAME,
    SQL_INDEX_FILENAME,
    f'{SQL_INDEX_FILENAME}-wal',
    f'{SQL_INDEX_FILENAME}-shm',
@@ -363,24 +379,32 @@ ALLOWDENYLIST_REGEX_FLAGS: int = re.IGNORECASE | re.UNICODE | re.MULTILINE

############################## Version Config ##################################

-def get_system_user():
-    SYSTEM_USER = getpass.getuser() or os.getlogin()
+def get_system_user() -> str:
+    # some host OS's are unable to provide a username (k3s, Windows), making this complicated
+    # uid 999 is especially problematic and breaks many attempts
+    SYSTEM_USER = None
+    FALLBACK_USER_PLACEHOLDER = f'user_{os.getuid()}'
+
+    # Option 1
    try:
        import pwd
-       return pwd.getpwuid(os.geteuid()).pw_name or SYSTEM_USER
-   except KeyError:
-       # Process' UID might not map to a user in cases such as running the Docker image
-       # (where `archivebox` is 999) as a different UID.
-       pass
-   except ModuleNotFoundError:
-       # pwd doesn't exist on windows
-       pass
-   except Exception:
-       # this should never happen, uncomment to debug
-       # raise
+       SYSTEM_USER = SYSTEM_USER or pwd.getpwuid(os.geteuid()).pw_name
+   except (ModuleNotFoundError, Exception):
        pass

-   return SYSTEM_USER
+   # Option 2
+   try:
+       SYSTEM_USER = SYSTEM_USER or getpass.getuser()
+   except Exception:
+       pass
+
+   # Option 3
+   try:
+       SYSTEM_USER = SYSTEM_USER or os.getlogin()
+   except Exception:
+       pass
+
+   return SYSTEM_USER or FALLBACK_USER_PLACEHOLDER

def get_version(config):
    try:
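
The fallback chain exists because pwd.getpwuid() raises KeyError when the process UID has no passwd entry, which is common when the Docker image's archivebox user (uid 999) is overridden with --user. A quick, POSIX-only illustration of the failure mode it guards against:

import os
import pwd  # POSIX-only; on Windows the import itself raises ModuleNotFoundError

try:
    name = pwd.getpwuid(os.geteuid()).pw_name
except KeyError:
    # an unmapped UID lands here, so get_system_user() falls through to
    # getpass/os.getlogin() and finally the synthetic 'user_<uid>' placeholder
    name = f'user_{os.getuid()}'
print(name)
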
@@ -487,9 +511,10 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
    'ARCHIVE_DIR': {'default': lambda c: c['OUTPUT_DIR'] / ARCHIVE_DIR_NAME},
    'SOURCES_DIR': {'default': lambda c: c['OUTPUT_DIR'] / SOURCES_DIR_NAME},
    'LOGS_DIR': {'default': lambda c: c['OUTPUT_DIR'] / LOGS_DIR_NAME},
+   'PERSONAS_DIR': {'default': lambda c: c['OUTPUT_DIR'] / PERSONAS_DIR_NAME},
    'CONFIG_FILE': {'default': lambda c: Path(c['CONFIG_FILE']).resolve() if c['CONFIG_FILE'] else c['OUTPUT_DIR'] / CONFIG_FILENAME},
    'COOKIES_FILE': {'default': lambda c: c['COOKIES_FILE'] and Path(c['COOKIES_FILE']).resolve()},
-   'CHROME_USER_DATA_DIR': {'default': lambda c: find_chrome_data_dir() if c['CHROME_USER_DATA_DIR'] is None else (Path(c['CHROME_USER_DATA_DIR']).resolve() if c['CHROME_USER_DATA_DIR'] else None)},  # None means unset, so we autodetect it with find_chrome_Data_dir(), but emptystring '' means user manually set it to '', and we should store it as None
+   'CHROME_USER_DATA_DIR': {'default': lambda c: Path(c['CHROME_USER_DATA_DIR']).resolve() if c['CHROME_USER_DATA_DIR'] else None},
    'URL_DENYLIST_PTN': {'default': lambda c: c['URL_DENYLIST'] and re.compile(c['URL_DENYLIST'] or '', ALLOWDENYLIST_REGEX_FLAGS)},
    'URL_ALLOWLIST_PTN': {'default': lambda c: c['URL_ALLOWLIST'] and re.compile(c['URL_ALLOWLIST'] or '', ALLOWDENYLIST_REGEX_FLAGS)},
    'DIR_OUTPUT_PERMISSIONS': {'default': lambda c: c['OUTPUT_PERMISSIONS'].replace('6', '7').replace('4', '5')},  # exec is always needed to list directories

@@ -519,6 +544,7 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
    'CURL_VERSION': {'default': lambda c: bin_version(c['CURL_BINARY']) if c['USE_CURL'] else None},
    'CURL_USER_AGENT': {'default': lambda c: c['CURL_USER_AGENT'].format(**c)},
    'CURL_ARGS': {'default': lambda c: c['CURL_ARGS'] or []},
+   'CURL_EXTRA_ARGS': {'default': lambda c: c['CURL_EXTRA_ARGS'] or []},
    'SAVE_FAVICON': {'default': lambda c: c['USE_CURL'] and c['SAVE_FAVICON']},
    'SAVE_ARCHIVE_DOT_ORG': {'default': lambda c: c['USE_CURL'] and c['SAVE_ARCHIVE_DOT_ORG']},

@@ -529,18 +555,22 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
    'SAVE_WGET': {'default': lambda c: c['USE_WGET'] and c['SAVE_WGET']},
    'SAVE_WARC': {'default': lambda c: c['USE_WGET'] and c['SAVE_WARC']},
    'WGET_ARGS': {'default': lambda c: c['WGET_ARGS'] or []},
+   'WGET_EXTRA_ARGS': {'default': lambda c: c['WGET_EXTRA_ARGS'] or []},

    'RIPGREP_VERSION': {'default': lambda c: bin_version(c['RIPGREP_BINARY']) if c['USE_RIPGREP'] else None},

    'USE_SINGLEFILE': {'default': lambda c: c['USE_SINGLEFILE'] and c['SAVE_SINGLEFILE']},
    'SINGLEFILE_VERSION': {'default': lambda c: bin_version(c['SINGLEFILE_BINARY']) if c['USE_SINGLEFILE'] else None},
+   'SINGLEFILE_ARGS': {'default': lambda c: c['SINGLEFILE_ARGS'] or []},
+   'SINGLEFILE_EXTRA_ARGS': {'default': lambda c: c['SINGLEFILE_EXTRA_ARGS'] or []},

    'USE_READABILITY': {'default': lambda c: c['USE_READABILITY'] and c['SAVE_READABILITY']},
    'READABILITY_VERSION': {'default': lambda c: bin_version(c['READABILITY_BINARY']) if c['USE_READABILITY'] else None},

    'USE_MERCURY': {'default': lambda c: c['USE_MERCURY'] and c['SAVE_MERCURY']},
    'MERCURY_VERSION': {'default': lambda c: '1.0.0' if shutil.which(str(bin_path(c['MERCURY_BINARY']))) else None},  # mercury doesnt expose version info until this is merged https://github.com/postlight/parser/pull/750
+   'MERCURY_ARGS': {'default': lambda c: c['MERCURY_ARGS'] or []},
+   'MERCURY_EXTRA_ARGS': {'default': lambda c: c['MERCURY_EXTRA_ARGS'] or []},

    'USE_GIT': {'default': lambda c: c['USE_GIT'] and c['SAVE_GIT']},
    'GIT_VERSION': {'default': lambda c: bin_version(c['GIT_BINARY']) if c['USE_GIT'] else None},

@@ -550,6 +580,7 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
    'YOUTUBEDL_VERSION': {'default': lambda c: bin_version(c['YOUTUBEDL_BINARY']) if c['USE_YOUTUBEDL'] else None},
    'SAVE_MEDIA': {'default': lambda c: c['USE_YOUTUBEDL'] and c['SAVE_MEDIA']},
    'YOUTUBEDL_ARGS': {'default': lambda c: c['YOUTUBEDL_ARGS'] or []},
+   'YOUTUBEDL_EXTRA_ARGS': {'default': lambda c: c['YOUTUBEDL_EXTRA_ARGS'] or []},

    'CHROME_BINARY': {'default': lambda c: c['CHROME_BINARY'] or find_chrome_binary()},
    'USE_CHROME': {'default': lambda c: c['USE_CHROME'] and c['CHROME_BINARY'] and (c['SAVE_PDF'] or c['SAVE_SCREENSHOT'] or c['SAVE_DOM'] or c['SAVE_SINGLEFILE'])},

@@ -568,9 +599,9 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {

    'DEPENDENCIES': {'default': lambda c: get_dependency_info(c)},
    'CODE_LOCATIONS': {'default': lambda c: get_code_locations(c)},
-   'EXTERNAL_LOCATIONS': {'default': lambda c: get_external_locations(c)},
    'DATA_LOCATIONS': {'default': lambda c: get_data_locations(c)},
    'CHROME_OPTIONS': {'default': lambda c: get_chrome_info(c)},
+   'CHROME_EXTRA_ARGS': {'default': lambda c: c['CHROME_EXTRA_ARGS'] or []},
    'SAVE_ALLOWLIST_PTN': {'default': lambda c: c['SAVE_ALLOWLIST'] and {re.compile(k, ALLOWDENYLIST_REGEX_FLAGS): v for k, v in c['SAVE_ALLOWLIST'].items()}},
    'SAVE_DENYLIST_PTN': {'default': lambda c: c['SAVE_DENYLIST'] and {re.compile(k, ALLOWDENYLIST_REGEX_FLAGS): v for k, v in c['SAVE_DENYLIST'].items()}},
}
@@ -899,27 +930,36 @@ def find_chrome_binary() -> Optional[str]:

def find_chrome_data_dir() -> Optional[str]:
    """find any installed chrome user data directories in the default locations"""
-   # Precedence: Chromium, Chrome, Beta, Canary, Unstable, Dev
-   # make sure data dir finding precedence order always matches binary finding order
-   default_profile_paths = (
-       '~/.config/chromium',
-       '~/Library/Application Support/Chromium',
-       '~/AppData/Local/Chromium/User Data',
-       '~/.config/chrome',
-       '~/.config/google-chrome',
-       '~/Library/Application Support/Google/Chrome',
-       '~/AppData/Local/Google/Chrome/User Data',
-       '~/.config/google-chrome-stable',
-       '~/.config/google-chrome-beta',
-       '~/Library/Application Support/Google/Chrome Canary',
-       '~/AppData/Local/Google/Chrome SxS/User Data',
-       '~/.config/google-chrome-unstable',
-       '~/.config/google-chrome-dev',
-   )
-   for path in default_profile_paths:
-       full_path = Path(path).resolve()
-       if full_path.exists():
-           return full_path
+   # deprecated because this is DANGEROUS, do not re-implement/uncomment this behavior.
+
+   # Going forward we want to discourage people from using their main chrome profile for archiving.
+   # Session tokens, personal data, and cookies are often returned in server responses,
+   # when they get archived, they are essentially burned as anyone who can view the archive
+   # can use that data to masquerade as the logged-in user that did the archiving.
+   # For this reason users should always create dedicated burner profiles for archiving and not use
+   # their daily driver main accounts.
+
+   # # Precedence: Chromium, Chrome, Beta, Canary, Unstable, Dev
+   # # make sure data dir finding precedence order always matches binary finding order
+   # default_profile_paths = (
+   #     '~/.config/chromium',
+   #     '~/Library/Application Support/Chromium',
+   #     '~/AppData/Local/Chromium/User Data',
+   #     '~/.config/chrome',
+   #     '~/.config/google-chrome',
+   #     '~/Library/Application Support/Google/Chrome',
+   #     '~/AppData/Local/Google/Chrome/User Data',
+   #     '~/.config/google-chrome-stable',
+   #     '~/.config/google-chrome-beta',
+   #     '~/Library/Application Support/Google/Chrome Canary',
+   #     '~/AppData/Local/Google/Chrome SxS/User Data',
+   #     '~/.config/google-chrome-unstable',
+   #     '~/.config/google-chrome-dev',
+   # )
+   # for path in default_profile_paths:
+   #     full_path = Path(path).resolve()
+   #     if full_path.exists():
+   #         return full_path
    return None

def wget_supports_compression(config):
@@ -945,11 +985,6 @@ def get_code_locations(config: ConfigDict) -> SimpleConfigValueDict:
        'enabled': True,
        'is_valid': (config['TEMPLATES_DIR'] / 'static').exists(),
    },
-   'CUSTOM_TEMPLATES_DIR': {
-       'path': config['CUSTOM_TEMPLATES_DIR'] and Path(config['CUSTOM_TEMPLATES_DIR']).resolve(),
-       'enabled': bool(config['CUSTOM_TEMPLATES_DIR']),
-       'is_valid': config['CUSTOM_TEMPLATES_DIR'] and Path(config['CUSTOM_TEMPLATES_DIR']).exists(),
-   },
    # 'NODE_MODULES_DIR': {
    #     'path': ,
    #     'enabled': ,

@@ -957,45 +992,25 @@
    # },
}

-def get_external_locations(config: ConfigDict) -> ConfigValue:
-   abspath = lambda path: None if path is None else Path(path).resolve()
-   return {
-       'CHROME_USER_DATA_DIR': {
-           'path': abspath(config['CHROME_USER_DATA_DIR']),
-           'enabled': config['USE_CHROME'] and config['CHROME_USER_DATA_DIR'],
-           'is_valid': False if config['CHROME_USER_DATA_DIR'] is None else (Path(config['CHROME_USER_DATA_DIR']) / 'Default').exists(),
-       },
-       'COOKIES_FILE': {
-           'path': abspath(config['COOKIES_FILE']),
-           'enabled': config['USE_WGET'] and config['COOKIES_FILE'],
-           'is_valid': False if config['COOKIES_FILE'] is None else Path(config['COOKIES_FILE']).exists(),
-       },
-   }

def get_data_locations(config: ConfigDict) -> ConfigValue:
    return {
+       # OLD: migrating to personas
+       # 'CHROME_USER_DATA_DIR': {
+       #     'path': os.path.abspath(config['CHROME_USER_DATA_DIR']),
+       #     'enabled': config['USE_CHROME'] and config['CHROME_USER_DATA_DIR'],
+       #     'is_valid': False if config['CHROME_USER_DATA_DIR'] is None else (Path(config['CHROME_USER_DATA_DIR']) / 'Default').exists(),
+       # },
+       # 'COOKIES_FILE': {
+       #     'path': os.path.abspath(config['COOKIES_FILE']),
+       #     'enabled': config['USE_WGET'] and config['COOKIES_FILE'],
+       #     'is_valid': False if config['COOKIES_FILE'] is None else Path(config['COOKIES_FILE']).exists(),
+       # },
        'OUTPUT_DIR': {
            'path': config['OUTPUT_DIR'].resolve(),
            'enabled': True,
            'is_valid': (config['OUTPUT_DIR'] / SQL_INDEX_FILENAME).exists(),
+           'is_mount': os.path.ismount(config['OUTPUT_DIR'].resolve()),
        },
-       'SOURCES_DIR': {
-           'path': config['SOURCES_DIR'].resolve(),
-           'enabled': True,
-           'is_valid': config['SOURCES_DIR'].exists(),
-       },
-       'LOGS_DIR': {
-           'path': config['LOGS_DIR'].resolve(),
-           'enabled': True,
-           'is_valid': config['LOGS_DIR'].exists(),
-       },
-       'ARCHIVE_DIR': {
-           'path': config['ARCHIVE_DIR'].resolve(),
-           'enabled': True,
-           'is_valid': config['ARCHIVE_DIR'].exists(),
-           'is_mount': os.path.ismount(config['ARCHIVE_DIR'].resolve()),
-       },
        'CONFIG_FILE': {
            'path': config['CONFIG_FILE'].resolve(),
            'enabled': True,
@@ -1007,6 +1022,38 @@ def get_data_locations(config: ConfigDict) -> ConfigValue:
            'is_valid': (config['OUTPUT_DIR'] / SQL_INDEX_FILENAME).exists(),
+           'is_mount': os.path.ismount((config['OUTPUT_DIR'] / SQL_INDEX_FILENAME).resolve()),
        },
+       'ARCHIVE_DIR': {
+           'path': config['ARCHIVE_DIR'].resolve(),
+           'enabled': True,
+           'is_valid': config['ARCHIVE_DIR'].exists(),
+           'is_mount': os.path.ismount(config['ARCHIVE_DIR'].resolve()),
+       },
+       'SOURCES_DIR': {
+           'path': config['SOURCES_DIR'].resolve(),
+           'enabled': True,
+           'is_valid': config['SOURCES_DIR'].exists(),
+       },
+       'LOGS_DIR': {
+           'path': config['LOGS_DIR'].resolve(),
+           'enabled': True,
+           'is_valid': config['LOGS_DIR'].exists(),
+       },
+       'CUSTOM_TEMPLATES_DIR': {
+           'path': config['CUSTOM_TEMPLATES_DIR'] and Path(config['CUSTOM_TEMPLATES_DIR']).resolve(),
+           'enabled': bool(config['CUSTOM_TEMPLATES_DIR']),
+           'is_valid': config['CUSTOM_TEMPLATES_DIR'] and Path(config['CUSTOM_TEMPLATES_DIR']).exists(),
+       },
+       'PERSONAS_DIR': {
+           'path': config['PERSONAS_DIR'].resolve(),
+           'enabled': True,
+           'is_valid': config['PERSONAS_DIR'].exists(),
+       },
+       # managed by bin/docker_entrypoint.sh and python-crontab:
+       # 'CRONTABS_DIR': {
+       #     'path': config['CRONTABS_DIR'].resolve(),
+       #     'enabled': True,
+       #     'is_valid': config['CRONTABS_DIR'].exists(),
+       # },
    }

def get_dependency_info(config: ConfigDict) -> ConfigValue:

@@ -1321,6 +1368,7 @@ def check_data_folder(out_dir: Union[str, Path, None]=None, config: ConfigDict=CONFIG):
        stderr(' archivebox init')
        raise SystemExit(2)


def check_migrations(out_dir: Union[str, Path, None]=None, config: ConfigDict=CONFIG):
    output_dir = out_dir or config['OUTPUT_DIR']
    from .index.sql import list_migrations

@@ -1337,6 +1385,8 @@ def check_migrations(out_dir: Union[str, Path, None]=None, config: ConfigDict=CONFIG):

    (Path(output_dir) / SOURCES_DIR_NAME).mkdir(exist_ok=True)
    (Path(output_dir) / LOGS_DIR_NAME).mkdir(exist_ok=True)
+   (Path(output_dir) / PERSONAS_DIR_NAME).mkdir(exist_ok=True)
+   (Path(output_dir) / PERSONAS_DIR_NAME / 'Default').mkdir(exist_ok=True)

archivebox/core/__init__.py

@@ -1 +1,2 @@
+__package__ = 'archivebox.core'

archivebox/core/admin.py

@@ -6,6 +6,7 @@ from contextlib import redirect_stdout
from datetime import datetime, timezone

from django.contrib import admin
+from django.db.models import Count
from django.urls import path
from django.utils.html import format_html
from django.utils.safestring import mark_safe

@@ -13,18 +14,31 @@ from django.shortcuts import render, redirect
from django.contrib.auth import get_user_model
from django import forms

+from signal_webhooks.apps import DjangoSignalWebhooksConfig
+from signal_webhooks.admin import WebhookAdmin, WebhookModel
+
from ..util import htmldecode, urldecode, ansi_to_html

from core.models import Snapshot, ArchiveResult, Tag
from core.forms import AddLinkForm

from core.mixins import SearchResultsAdminMixin
+from api.models import APIToken

from index.html import snapshot_icons
from logging_util import printable_filesize
from main import add, remove
-from config import OUTPUT_DIR, SNAPSHOTS_PER_PAGE
from extractors import archive_links
+from config import (
+    OUTPUT_DIR,
+    SNAPSHOTS_PER_PAGE,
+    VERSION,
+    VERSIONS_AVAILABLE,
+    CAN_UPGRADE
+)
+
+GLOBAL_CONTEXT = {'VERSION': VERSION, 'VERSIONS_AVAILABLE': VERSIONS_AVAILABLE, 'CAN_UPGRADE': CAN_UPGRADE}

# Admin URLs
# /admin/
@@ -39,6 +53,82 @@ from extractors import archive_links
# TODO: https://stackoverflow.com/questions/40760880/add-custom-button-to-django-admin-panel


+class ArchiveBoxAdmin(admin.AdminSite):
+    site_header = 'ArchiveBox'
+    index_title = 'Links'
+    site_title = 'Index'
+    namespace = 'admin'
+
+    def get_urls(self):
+        return [
+            path('core/snapshot/add/', self.add_view, name='Add'),
+        ] + super().get_urls()
+
+    def add_view(self, request):
+        if not request.user.is_authenticated:
+            return redirect(f'/admin/login/?next={request.path}')
+
+        request.current_app = self.name
+        context = {
+            **self.each_context(request),
+            'title': 'Add URLs',
+        }
+
+        if request.method == 'GET':
+            context['form'] = AddLinkForm()
+
+        elif request.method == 'POST':
+            form = AddLinkForm(request.POST)
+            if form.is_valid():
+                url = form.cleaned_data["url"]
+                print(f'[+] Adding URL: {url}')
+                depth = 0 if form.cleaned_data["depth"] == "0" else 1
+                input_kwargs = {
+                    "urls": url,
+                    "depth": depth,
+                    "update_all": False,
+                    "out_dir": OUTPUT_DIR,
+                }
+                add_stdout = StringIO()
+                with redirect_stdout(add_stdout):
+                    add(**input_kwargs)
+                print(add_stdout.getvalue())
+
+                context.update({
+                    "stdout": ansi_to_html(add_stdout.getvalue().strip()),
+                    "form": AddLinkForm()
+                })
+            else:
+                context["form"] = form
+
+        return render(template_name='add.html', request=request, context=context)
+
+
+# monkey patch django-signals-webhooks to change how it shows up in Admin UI
+DjangoSignalWebhooksConfig.verbose_name = 'API'
+WebhookModel._meta.get_field('name').help_text = 'Give your webhook a descriptive name (e.g. Notify ACME Slack channel of any new ArchiveResults).'
+WebhookModel._meta.get_field('signal').help_text = 'The type of event the webhook should fire for (e.g. Create, Update, Delete).'
+WebhookModel._meta.get_field('ref').help_text = 'Dot import notation of the model the webhook should fire for (e.g. core.models.Snapshot or core.models.ArchiveResult).'
+WebhookModel._meta.get_field('endpoint').help_text = 'External URL to POST the webhook notification to (e.g. https://someapp.example.com/webhook/some-webhook-receiver).'
+WebhookModel._meta.app_label = 'api'
+
+
+archivebox_admin = ArchiveBoxAdmin()
+archivebox_admin.register(get_user_model())
+archivebox_admin.register(APIToken)
+archivebox_admin.register(WebhookModel, WebhookAdmin)
+archivebox_admin.disable_action('delete_selected')
+
+
+# patch admin with methods to add data views
+from admin_data_views.admin import get_app_list, admin_data_index_view, get_admin_data_urls, get_urls
+
+archivebox_admin.get_app_list = get_app_list.__get__(archivebox_admin, ArchiveBoxAdmin)
+archivebox_admin.admin_data_index_view = admin_data_index_view.__get__(archivebox_admin, ArchiveBoxAdmin)
+archivebox_admin.get_admin_data_urls = get_admin_data_urls.__get__(archivebox_admin, ArchiveBoxAdmin)
+archivebox_admin.get_urls = get_urls(archivebox_admin.get_urls).__get__(archivebox_admin, ArchiveBoxAdmin)
+
+
class ArchiveResultInline(admin.TabularInline):
    model = ArchiveResult
@@ -48,11 +138,11 @@ class TagInline(admin.TabularInline):
from django.contrib.admin.helpers import ActionForm
from django.contrib.admin.widgets import AutocompleteSelectMultiple

+# WIP: broken by Django 3.1.2 -> 4.0 migration
class AutocompleteTags:
    model = Tag
    search_fields = ['name']
    name = 'tags'
    remote_field = TagInline

class AutocompleteTagsAdminStub:
    name = 'admin'

@@ -62,7 +152,6 @@ class SnapshotActionForm(ActionForm):
    tags = forms.ModelMultipleChoiceField(
        queryset=Tag.objects.all(),
        required=False,
-       # WIP: broken by Django 3.1.2 -> 4.0 migration
        widget=AutocompleteSelectMultiple(
            AutocompleteTags(),
            AutocompleteTagsAdminStub(),

@@ -81,6 +170,7 @@ class SnapshotActionForm(ActionForm):
    # )


+@admin.register(Snapshot, site=archivebox_admin)
class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin):
    list_display = ('added', 'title_str', 'files', 'size', 'url_str')
    sort_fields = ('title_str', 'url_str', 'added', 'files')
@@ -96,6 +186,10 @@ class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin):

    action_form = SnapshotActionForm

+    def changelist_view(self, request, extra_context=None):
+        extra_context = extra_context or {}
+        return super().changelist_view(request, extra_context | GLOBAL_CONTEXT)
+
    def get_urls(self):
        urls = super().get_urls()
        custom_urls = [
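
The changelist_view override above merges the caller's context with GLOBAL_CONTEXT via the dict union operator, so the version-banner values always win on key collisions. A standalone sketch of that operator (Python 3.9+, with illustrative values only):

extra_context = {'title': 'Snapshots', 'VERSION': 'unknown'}
GLOBAL_CONTEXT = {'VERSION': '0.7.0', 'CAN_UPGRADE': False}

merged = extra_context | GLOBAL_CONTEXT   # right-hand side wins on duplicate keys
print(merged)   # {'title': 'Snapshots', 'VERSION': '0.7.0', 'CAN_UPGRADE': False}
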
@@ -105,7 +199,7 @@ class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin):

    def get_queryset(self, request):
        self.request = request
-       return super().get_queryset(request).prefetch_related('tags')
+       return super().get_queryset(request).prefetch_related('tags').annotate(archiveresult_count=Count('archiveresult'))

    def tag_list(self, obj):
        return ', '.join(obj.tags.values_list('name', flat=True))

@@ -163,6 +257,10 @@ class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin):
            obj.id,
        )

+    @admin.display(
+        description='Title',
+        ordering='title',
+    )
    def title_str(self, obj):
        canon = obj.as_link().canonical_outputs()
        tags = ''.join(

@@ -184,12 +282,17 @@ class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin):
            urldecode(htmldecode(obj.latest_title or obj.title or ''))[:128] or 'Pending...'
        ) + mark_safe(f' <span class="tags">{tags}</span>')

+    @admin.display(
+        description='Files Saved',
+        ordering='archiveresult_count',
+    )
    def files(self, obj):
        return snapshot_icons(obj)

-    files.admin_order_field = 'updated'
-    files.short_description = 'Files Saved'
-
+    @admin.display(
+        ordering='archiveresult_count'
+    )
    def size(self, obj):
        archive_size = (Path(obj.link_dir) / 'index.html').exists() and obj.archive_size
        if archive_size:

@@ -204,8 +307,11 @@ class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin):
            size_txt,
        )

-    size.admin_order_field = 'archiveresult__count'
-
+    @admin.display(
+        description='Original URL',
+        ordering='url',
+    )
    def url_str(self, obj):
        return format_html(
            '<a href="{}"><code style="user-select: all;">{}</code></a>',
@@ -242,65 +348,76 @@ class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin):
    #     print('[*] Got request', request.method, request.POST)
    #     return super().changelist_view(request, extra_context=None)

+    @admin.action(
+        description="Pull"
+    )
    def update_snapshots(self, request, queryset):
        archive_links([
            snapshot.as_link()
            for snapshot in queryset
        ], out_dir=OUTPUT_DIR)
-    update_snapshots.short_description = "Pull"

+    @admin.action(
+        description="⬇️ Title"
+    )
    def update_titles(self, request, queryset):
        archive_links([
            snapshot.as_link()
            for snapshot in queryset
        ], overwrite=True, methods=('title','favicon'), out_dir=OUTPUT_DIR)
-    update_titles.short_description = "⬇️ Title"

+    @admin.action(
+        description="Re-Snapshot"
+    )
    def resnapshot_snapshot(self, request, queryset):
        for snapshot in queryset:
            timestamp = datetime.now(timezone.utc).isoformat('T', 'seconds')
            new_url = snapshot.url.split('#')[0] + f'#{timestamp}'
            add(new_url, tag=snapshot.tags_str())
-    resnapshot_snapshot.short_description = "Re-Snapshot"

+    @admin.action(
+        description="Reset"
+    )
    def overwrite_snapshots(self, request, queryset):
        archive_links([
            snapshot.as_link()
            for snapshot in queryset
        ], overwrite=True, out_dir=OUTPUT_DIR)
-    overwrite_snapshots.short_description = "Reset"

+    @admin.action(
+        description="Delete"
+    )
    def delete_snapshots(self, request, queryset):
        remove(snapshots=queryset, yes=True, delete=True, out_dir=OUTPUT_DIR)

-    delete_snapshots.short_description = "Delete"

+    @admin.action(
+        description="+"
+    )
    def add_tags(self, request, queryset):
        tags = request.POST.getlist('tags')
        print('[+] Adding tags', tags, 'to Snapshots', queryset)
        for obj in queryset:
            obj.tags.add(*tags)

-    add_tags.short_description = "+"

+    @admin.action(
+        description="–"
+    )
    def remove_tags(self, request, queryset):
        tags = request.POST.getlist('tags')
        print('[-] Removing tags', tags, 'from Snapshots', queryset)
        for obj in queryset:
            obj.tags.remove(*tags)

-    remove_tags.short_description = "–"
-
-
-    title_str.short_description = 'Title'
-    url_str.short_description = 'Original URL'
-
-    title_str.admin_order_field = 'title'
-    url_str.admin_order_field = 'url'


+@admin.register(Tag, site=archivebox_admin)
class TagAdmin(admin.ModelAdmin):
    list_display = ('slug', 'name', 'num_snapshots', 'snapshots', 'id')
    sort_fields = ('id', 'name', 'slug')

@@ -331,6 +448,7 @@ class TagAdmin(admin.ModelAdmin):
        ) + (f'<br/><a href="/admin/core/snapshot/?tags__id__exact={obj.id}">and {total_count-10} more...<a>' if obj.snapshot_set.count() > 10 else ''))


+@admin.register(ArchiveResult, site=archivebox_admin)
class ArchiveResultAdmin(admin.ModelAdmin):
    list_display = ('id', 'start_ts', 'extractor', 'snapshot_str', 'tags_str', 'cmd_str', 'status', 'output_str')
    sort_fields = ('start_ts', 'extractor', 'status')

@@ -343,6 +461,9 @@ class ArchiveResultAdmin(admin.ModelAdmin):
    ordering = ['-start_ts']
    list_per_page = SNAPSHOTS_PER_PAGE

+    @admin.display(
+        description='snapshot'
+    )
    def snapshot_str(self, obj):
        return format_html(
            '<a href="/archive/{}/index.html"><b><code>[{}]</code></b></a><br/>'

@@ -352,6 +473,9 @@ class ArchiveResultAdmin(admin.ModelAdmin):
            obj.snapshot.url[:128],
        )

+    @admin.display(
+        description='tags'
+    )
    def tags_str(self, obj):
        return obj.snapshot.tags_str()

@@ -368,62 +492,3 @@ class ArchiveResultAdmin(admin.ModelAdmin):
            obj.output if (obj.status == 'succeeded') and obj.extractor not in ('title', 'archive_org') else 'index.html',
            obj.output,
        )
-
-    tags_str.short_description = 'tags'
-    snapshot_str.short_description = 'snapshot'
-
-class ArchiveBoxAdmin(admin.AdminSite):
-    site_header = 'ArchiveBox'
-    index_title = 'Links'
-    site_title = 'Index'
-
-    def get_urls(self):
-        return [
-            path('core/snapshot/add/', self.add_view, name='Add'),
-        ] + super().get_urls()
-
-    def add_view(self, request):
-        if not request.user.is_authenticated:
-            return redirect(f'/admin/login/?next={request.path}')
-
-        request.current_app = self.name
-        context = {
-            **self.each_context(request),
-            'title': 'Add URLs',
-        }
-
-        if request.method == 'GET':
-            context['form'] = AddLinkForm()
-
-        elif request.method == 'POST':
-            form = AddLinkForm(request.POST)
-            if form.is_valid():
-                url = form.cleaned_data["url"]
-                print(f'[+] Adding URL: {url}')
-                depth = 0 if form.cleaned_data["depth"] == "0" else 1
-                input_kwargs = {
-                    "urls": url,
-                    "depth": depth,
-                    "update_all": False,
-                    "out_dir": OUTPUT_DIR,
-                }
-                add_stdout = StringIO()
-                with redirect_stdout(add_stdout):
-                    add(**input_kwargs)
-                print(add_stdout.getvalue())
-
-                context.update({
-                    "stdout": ansi_to_html(add_stdout.getvalue().strip()),
-                    "form": AddLinkForm()
-                })
-            else:
-                context["form"] = form
-
-        return render(template_name='add.html', request=request, context=context)
-
-admin.site = ArchiveBoxAdmin()
-admin.site.register(get_user_model())
-admin.site.register(Snapshot, SnapshotAdmin)
-admin.site.register(Tag, TagAdmin)
-admin.site.register(ArchiveResult, ArchiveResultAdmin)
-admin.site.disable_action('delete_selected')

archivebox/core/apps.py

@@ -1,7 +1,28 @@
+__package__ = 'archivebox.core'

from django.apps import AppConfig


class CoreConfig(AppConfig):
    name = 'core'
+    # WIP: broken by Django 3.1.2 -> 4.0 migration
+    default_auto_field = 'django.db.models.UUIDField'

+    def ready(self):
+        # register our custom admin as the primary django admin
+        from django.contrib import admin
+        from django.contrib.admin import sites
+        from core.admin import archivebox_admin
+
+        admin.site = archivebox_admin
+        sites.site = archivebox_admin
+
+        # register signal handlers
+        from .auth import register_signals
+
+        register_signals()
+
+
+# from django.contrib.admin.apps import AdminConfig
+# class CoreAdminConfig(AdminConfig):
+#     default_site = "core.admin.get_admin_site"

archivebox/core/auth.py (new file, +14 lines)

@@ -0,0 +1,14 @@
__package__ = 'archivebox.core'


from ..config import (
    LDAP
)

def register_signals():

    if LDAP:
        import django_auth_ldap.backend
        from .auth_ldap import create_user

        django_auth_ldap.backend.populate_user.connect(create_user)

archivebox/core/auth_ldap.py (new file, +10 lines)

@@ -0,0 +1,10 @@
from ..config import (
    LDAP_CREATE_SUPERUSER
)

def create_user(sender, user=None, ldap_user=None, **kwargs):
    if not user.id and LDAP_CREATE_SUPERUSER:
        user.is_superuser = True

    user.is_staff = True
    print(f'[!] WARNING: Creating new user {user} based on LDAP user {ldap_user} (is_staff={user.is_staff}, is_superuser={user.is_superuser})')
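
create_user is wired up lazily in core/auth.py via django_auth_ldap's populate_user signal, which fires while the backend builds the local Django user from the LDAP entry. A minimal stand-in using django.dispatch directly shows the connect/send mechanics (illustrative only; the real signal and its payload come from django_auth_ldap):

import django.dispatch

# stands in for django_auth_ldap.backend.populate_user
populate_user = django.dispatch.Signal()

def on_populate_user(sender, user=None, ldap_user=None, **kwargs):
    print(f'would promote {user} based on {ldap_user}')

populate_user.connect(on_populate_user)
populate_user.send(sender=None, user='alice', ldap_user='uid=alice,ou=people')   # -> would promote alice ...
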

archivebox/core/mixins.py

@@ -10,7 +10,7 @@ class SearchResultsAdminMixin:

        search_term = search_term.strip()
        if not search_term:
-           return qs, use_distinct
+           return qs.distinct(), use_distinct
        try:
            qsearch = query_search_index(search_term)
            qs = qs | qsearch

@@ -18,4 +18,4 @@ class SearchResultsAdminMixin:
            print(f'[!] Error while using search backend: {err.__class__.__name__} {err}')
            messages.add_message(request, messages.WARNING, f'Error from the search backend, only showing results from default admin search fields - Error: {err}')

-       return qs, use_distinct
+       return qs.distinct(), use_distinct

archivebox/core/settings.py

@@ -18,6 +18,7 @@ from ..config import (
    CUSTOM_TEMPLATES_DIR,
    SQL_INDEX_FILENAME,
    OUTPUT_DIR,
+   ARCHIVE_DIR,
    LOGS_DIR,
    TIMEZONE,

@@ -61,7 +62,11 @@ INSTALLED_APPS = [
    'django.contrib.admin',

    'core',
+   'api',
+
+   'admin_data_views',
+
+   'signal_webhooks',
    'django_extensions',
]

@@ -172,6 +177,17 @@ if DEBUG_TOOLBAR:
    ]
    MIDDLEWARE = [*MIDDLEWARE, 'debug_toolbar.middleware.DebugToolbarMiddleware']


+# https://github.com/bensi94/Django-Requests-Tracker (improved version of django-debug-toolbar)
+# Must delete archivebox/templates/admin to use because it relies on some things we override
+# visit /__requests_tracker__/ to access
+DEBUG_REQUESTS_TRACKER = False
+if DEBUG_REQUESTS_TRACKER:
+    INSTALLED_APPS += ["requests_tracker"]
+    MIDDLEWARE += ["requests_tracker.middleware.requests_tracker_middleware"]
+    INTERNAL_IPS = ["127.0.0.1", "10.0.2.2", "0.0.0.0", "*"]
+
+
################################################################################
### Staticfile and Template Settings
################################################################################
@@ -241,6 +257,29 @@ CACHES = {
EMAIL_BACKEND = 'django.core.mail.backends.console.EmailBackend'


+STORAGES = {
+    "default": {
+        "BACKEND": "django.core.files.storage.FileSystemStorage",
+    },
+    "staticfiles": {
+        "BACKEND": "django.contrib.staticfiles.storage.StaticFilesStorage",
+    },
+    "archive": {
+        "BACKEND": "django.core.files.storage.FileSystemStorage",
+        "OPTIONS": {
+            "base_url": "/archive/",
+            "location": ARCHIVE_DIR,
+        },
+    },
+    # "personas": {
+    #     "BACKEND": "django.core.files.storage.FileSystemStorage",
+    #     "OPTIONS": {
+    #         "base_url": "/personas/",
+    #         "location": PERSONAS_DIR,
+    #     },
+    # },
+}
+
################################################################################
### Security Settings
################################################################################
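
Once STORAGES is defined, other code can resolve the named archive backend through Django's storage registry instead of hand-building filesystem paths. A sketch, assuming Django 4.2+ (where the STORAGES setting and the storages registry were introduced) and a configured settings module; the file path is hypothetical:

from django.core.files.storage import storages

archive_storage = storages["archive"]          # FileSystemStorage rooted at ARCHIVE_DIR
index_path = "1712345678.0/index.json"         # hypothetical snapshot file
if archive_storage.exists(index_path):
    with archive_storage.open(index_path) as f:
        print(f.read(200))
print(archive_storage.url(index_path))         # resolves under base_url, e.g. /archive/1712345678.0/index.json
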
@@ -269,9 +308,6 @@ AUTH_PASSWORD_VALIDATORS = [
    {'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator'},
]

-# WIP: broken by Django 3.1.2 -> 4.0 migration
-DEFAULT_AUTO_FIELD = 'django.db.models.UUIDField'
-
################################################################################
### Shell Settings
################################################################################

@@ -290,7 +326,6 @@ if IS_SHELL:

LANGUAGE_CODE = 'en-us'
USE_I18N = True
-USE_L10N = True
USE_TZ = True
DATETIME_FORMAT = 'Y-m-d g:iA'
SHORT_DATETIME_FORMAT = 'Y-m-d h:iA'
@@ -371,3 +406,32 @@ LOGGING = {
        }
    },
}


+# Add default webhook configuration to the User model
+SIGNAL_WEBHOOKS = {
+    "HOOKS": {
+        "django.contrib.auth.models.User": ...,  # ... is a special value that means "use the default autogenerated hooks"
+        "core.models.Snapshot": ...,
+        "core.models.ArchiveResult": ...,
+        "core.models.Tag": ...,
+        "api.models.APIToken": ...,
+    },
+}
+
+
+ADMIN_DATA_VIEWS = {
+    "NAME": "configuration",
+    "URLS": [
+        {
+            "route": "live/",
+            "view": "core.views.live_config_list_view",
+            "name": "live",
+            "items": {
+                "route": "<str:key>/",
+                "view": "core.views.live_config_value_view",
+                "name": "live_config_value",
+            },
+        },
+    ],
+}

archivebox/core/urls.py

@@ -1,4 +1,4 @@
-from django.contrib import admin
+__package__ = 'archivebox.core'

from django.urls import path, include
from django.views import static

@@ -6,7 +6,9 @@ from django.contrib.staticfiles.urls import staticfiles_urlpatterns
from django.conf import settings
from django.views.generic.base import RedirectView

-from core.views import HomepageView, SnapshotView, PublicIndexView, AddView, HealthCheckView
+from .admin import archivebox_admin
+from .views import HomepageView, SnapshotView, PublicIndexView, AddView, HealthCheckView
+
+# GLOBAL_CONTEXT doesn't work as-is, disabled for now: https://github.com/ArchiveBox/ArchiveBox/discussions/1306
+# from config import VERSION, VERSIONS_AVAILABLE, CAN_UPGRADE

@@ -34,13 +36,12 @@ urlpatterns = [

    path('accounts/', include('django.contrib.auth.urls')),
-   path('admin/', admin.site.urls),
+   path('admin/', archivebox_admin.urls),
+   # do not add extra_context like this, as not all admin views (e.g. ModelAdmin.autocomplete_view) accept extra kwargs
+   # path('admin/', admin.site.urls, {'extra_context': GLOBAL_CONTEXT}),
    path("api/", include('api.urls')),

    path('health/', HealthCheckView.as_view(), name='healthcheck'),
-   path('error/', lambda _: 1/0),
+   path('error/', lambda *_: 1/0),

    # path('jet_api/', include('jet_django.urls')), Enable to use https://www.jetadmin.io/integrations/django

@@ -51,10 +52,10 @@ urlpatterns = [
urlpatterns += staticfiles_urlpatterns()

if settings.DEBUG_TOOLBAR:
-    import debug_toolbar
-    urlpatterns += [
-        path('__debug__/', include(debug_toolbar.urls)),
-    ]
+    urlpatterns += [path('__debug__/', include("debug_toolbar.urls"))]

+if settings.DEBUG_REQUESTS_TRACKER:
+    urlpatterns += [path("__requests_tracker__/", include("requests_tracker.urls"))]


# # Proposed FUTURE URLs spec

archivebox/core/views.py

@@ -1,10 +1,12 @@
__package__ = 'archivebox.core'

+from typing import Callable
+
from io import StringIO
from contextlib import redirect_stdout

from django.shortcuts import render, redirect
-from django.http import HttpResponse, Http404
+from django.http import HttpRequest, HttpResponse, Http404
from django.utils.html import format_html, mark_safe
from django.views import View, static
from django.views.generic.list import ListView

@@ -14,6 +16,10 @@ from django.contrib.auth.mixins import UserPassesTestMixin
from django.views.decorators.csrf import csrf_exempt
from django.utils.decorators import method_decorator

+from admin_data_views.typing import TableContext, ItemContext
+from admin_data_views.utils import render_with_table_view, render_with_item_view, ItemLink
+
+
from core.models import Snapshot
from core.forms import AddLinkForm

@@ -26,6 +32,10 @@ from ..config import (
    COMMIT_HASH,
    FOOTER_INFO,
    SNAPSHOTS_PER_PAGE,
+   CONFIG,
+   CONFIG_SCHEMA,
+   DYNAMIC_CONFIG_SCHEMA,
+   USER_CONFIG,
)
from ..main import add
from ..util import base_url, ansi_to_html
@@ -124,9 +134,9 @@ class SnapshotView(View):
                '<center><br/><br/><br/>'
                f'Snapshot <a href="/archive/{snapshot.timestamp}/index.html" target="_top"><b><code>[{snapshot.timestamp}]</code></b></a> exists in DB, but resource <b><code>{snapshot.timestamp}/'
                '{}'
-               f'</code></b> does not exist in <a href="/archive/{snapshot.timestamp}/" target="_top">snapshot dir</a> yet.<br/><br/>'
-               'Maybe this resource type is not availabe for this Snapshot,<br/>or the archiving process has not completed yet?<br/>'
-               f'<pre><code># run this cmd to finish archiving this Snapshot<br/>archivebox update -t timestamp {snapshot.timestamp}</code></pre><br/><br/>'
+               f'</code></b> does not exist in the <a href="/archive/{snapshot.timestamp}/" target="_top">snapshot dir</a> yet.<br/><br/>'
+               'It\'s possible that this resource type is not available for the Snapshot,<br/>or that the archiving process has not completed yet.<br/>'
+               f'<pre><code># if interrupted, run this cmd to finish archiving this Snapshot<br/>archivebox update -t timestamp {snapshot.timestamp}</code></pre><br/><br/>'
                '<div class="text-align: left; width: 100%; max-width: 400px">'
                '<i><b>Next steps:</i></b><br/>'
                f'- list all the <a href="/archive/{snapshot.timestamp}/" target="_top">Snapshot files <code>.*</code></a><br/>'

@@ -231,7 +241,7 @@ class PublicIndexView(ListView):
                qs = qs | query_search_index(query)
            except Exception as err:
                print(f'[!] Error while using search backend: {err.__class__.__name__} {err}')
-       return qs
+       return qs.distinct()

    def get(self, *args, **kwargs):
        if PUBLIC_INDEX or self.request.user.is_authenticated:
@@ -312,3 +322,124 @@ class HealthCheckView(View):
            content_type='text/plain',
            status=200
        )


+def find_config_section(key: str) -> str:
+    matching_sections = [
+        name for name, opts in CONFIG_SCHEMA.items() if key in opts
+    ]
+    section = matching_sections[0] if matching_sections else 'DYNAMIC'
+    return section
+
+def find_config_default(key: str) -> str:
+    default_val = USER_CONFIG.get(key, {}).get('default', lambda: None)
+    if isinstance(default_val, Callable):
+        return None
+    else:
+        default_val = repr(default_val)
+    return default_val
+
+def find_config_type(key: str) -> str:
+    if key in USER_CONFIG:
+        return USER_CONFIG[key]['type'].__name__
+    elif key in DYNAMIC_CONFIG_SCHEMA:
+        return type(CONFIG[key]).__name__
+    return 'str'
+
+def key_is_safe(key: str) -> bool:
+    for term in ('key', 'password', 'secret', 'token'):
+        if term in key.lower():
+            return False
+    return True
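
These helpers drive the two admin data views below; their behavior is easy to spot-check in a shell (the values here follow directly from the definitions above and the TIMEOUT schema entry earlier in this diff):

find_config_type('TIMEOUT')     # -> 'int'  (TIMEOUT is declared with {'type': int, 'default': 60})
find_config_default('TIMEOUT')  # -> '60'   (repr of the static default; callable defaults return None)
key_is_safe('TIMEOUT')          # -> True
key_is_safe('SECRET_KEY')       # -> False, so its value is rendered as '********' below
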
|
||||
@render_with_table_view
|
||||
def live_config_list_view(request: HttpRequest, **kwargs) -> TableContext:
|
||||
|
||||
assert request.user.is_superuser, 'Must be a superuser to view configuration settings.'
|
||||
|
||||
rows = {
|
||||
"Section": [],
|
||||
"Key": [],
|
||||
"Type": [],
|
||||
"Value": [],
|
||||
"Default": [],
|
||||
# "Documentation": [],
|
||||
"Aliases": [],
|
||||
}
|
||||
|
||||
for section in CONFIG_SCHEMA.keys():
|
||||
for key in CONFIG_SCHEMA[section].keys():
|
||||
rows['Section'].append(section.replace('_', ' ').title().replace(' Config', ''))
|
||||
rows['Key'].append(ItemLink(key, key=key))
|
||||
rows['Type'].append(mark_safe(f'<code>{find_config_type(key)}</code>'))
|
||||
rows['Value'].append(mark_safe(f'<code>{CONFIG[key]}</code>') if key_is_safe(key) else '******** (redacted)')
|
||||
rows['Default'].append(mark_safe(f'<a href="https://github.com/search?q=repo%3AArchiveBox%2FArchiveBox+path%3Aconfig.py+%27{key}%27&type=code"><code style="text-decoration: underline">{find_config_default(key) or 'See here...'}</code></a>'))
|
||||
# rows['Documentation'].append(mark_safe(f'Wiki: <a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#{key.lower()}">{key}</a>'))
|
||||
rows['Aliases'].append(', '.join(CONFIG_SCHEMA[section][key].get('aliases', [])))
|
||||
|
||||
section = 'DYNAMIC'
|
||||
for key in DYNAMIC_CONFIG_SCHEMA.keys():
|
||||
rows['Section'].append(section.replace('_', ' ').title().replace(' Config', ''))
|
||||
rows['Key'].append(ItemLink(key, key=key))
|
||||
rows['Type'].append(mark_safe(f'<code>{find_config_type(key)}</code>'))
|
||||
rows['Value'].append(mark_safe(f'<code>{CONFIG[key]}</code>') if key_is_safe(key) else '******** (redacted)')
|
||||
rows['Default'].append(mark_safe(f'<a href="https://github.com/search?q=repo%3AArchiveBox%2FArchiveBox+path%3Aconfig.py+%27{key}%27&type=code"><code style="text-decoration: underline">{find_config_default(key) or 'See here...'}</code></a>'))
|
||||
# rows['Documentation'].append(mark_safe(f'Wiki: <a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#{key.lower()}">{key}</a>'))
|
||||
rows['Aliases'].append(ItemLink(key, key=key) if key in USER_CONFIG else '')
|
||||
|
||||
return TableContext(
|
||||
title="Computed Configuration Values",
|
||||
table=rows,
|
||||
)
|
||||
|
||||
@render_with_item_view
|
||||
def live_config_value_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
|
||||
|
||||
assert request.user.is_superuser, 'Must be a superuser to view configuration settings.'
|
||||
|
||||
aliases = USER_CONFIG.get(key, {}).get("aliases", [])
|
||||
|
||||
return ItemContext(
|
||||
slug=key,
|
||||
title=key,
|
||||
data=[
|
||||
{
|
||||
"name": mark_safe(f'data / ArchiveBox.conf [{find_config_section(key)}] <b><code style="color: lightgray">{key}</code></b>' if key in USER_CONFIG else f'[DYNAMIC CONFIG] <b><code style="color: lightgray">{key}</code></b> <small>(calculated at runtime)</small>'),
|
||||
"description": None,
|
||||
"fields": {
|
||||
'Key': key,
|
||||
'Type': find_config_type(key),
|
||||
'Value': CONFIG[key] if key_is_safe(key) else '********',
|
||||
},
|
||||
"help_texts": {
|
||||
'Key': mark_safe(f'''
|
||||
<a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#{key.lower()}">Documentation</a>
|
||||
<span style="display: {"inline" if aliases else "none"}">
|
||||
Aliases: {", ".join(aliases)}
|
||||
</span>
|
||||
'''),
|
||||
'Type': mark_safe(f'''
|
||||
<a href="https://github.com/search?q=repo%3AArchiveBox%2FArchiveBox+path%3Aconfig.py+%27{key}%27&type=code">
|
||||
See full definition in <code>archivebox/config.py</code>...
|
||||
</a>
|
||||
'''),
|
||||
'Value': mark_safe(f'''
|
||||
{'<b style="color: red">Value is redacted for your security. (Passwords, secrets, API tokens, etc. cannot be viewed in the Web UI)</b><br/><br/>' if not key_is_safe(key) else ''}
|
||||
Default: <a href="https://github.com/search?q=repo%3AArchiveBox%2FArchiveBox+path%3Aconfig.py+%27{key}%27&type=code">
|
||||
<code>{find_config_default(key) or 'See here...'}</code>
|
||||
</a>
|
||||
<br/><br/>
|
||||
<p style="display: {"block" if key in USER_CONFIG else "none"}">
|
||||
<i>To change this value, edit <code>data/ArchiveBox.conf</code> or run:</i>
|
||||
<br/><br/>
|
||||
<code>archivebox config --set {key}="{
|
||||
val.strip("'")
|
||||
if (val := find_config_default(key)) else
|
||||
(repr(CONFIG[key] if key_is_safe(key) else '********')).strip("'")
|
||||
}"</code>
|
||||
</p>
|
||||
'''),
|
||||
},
|
||||
},
|
||||
],
|
||||
)
|
||||
|
|
|
@ -131,7 +131,7 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s
|
|||
|
||||
link = load_link_details(link, out_dir=out_dir)
|
||||
write_link_details(link, out_dir=out_dir, skip_sql_index=False)
|
||||
log_link_archiving_started(link, out_dir, is_new)
|
||||
log_link_archiving_started(link, str(out_dir), is_new)
|
||||
link = link.overwrite(updated=datetime.now(timezone.utc))
|
||||
stats = {'skipped': 0, 'succeeded': 0, 'failed': 0}
|
||||
start_ts = datetime.now(timezone.utc)
|
||||
|
@ -165,16 +165,6 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s
|
|||
# print('{black} X {}{reset}'.format(method_name, **ANSI))
|
||||
stats['skipped'] += 1
|
||||
except Exception as e:
|
||||
# Disabled until https://github.com/ArchiveBox/ArchiveBox/issues/984
|
||||
# and https://github.com/ArchiveBox/ArchiveBox/issues/1014
|
||||
# are fixed.
|
||||
"""
|
||||
raise Exception('Exception in archive_methods.save_{}(Link(url={}))'.format(
|
||||
method_name,
|
||||
link.url,
|
||||
)) from e
|
||||
"""
|
||||
# Instead, use the kludgy workaround from
|
||||
# https://github.com/ArchiveBox/ArchiveBox/issues/984#issuecomment-1150541627
|
||||
with open(ERROR_LOG, "a", encoding='utf-8') as f:
|
||||
command = ' '.join(sys.argv)
|
||||
|
@ -186,6 +176,13 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s
|
|||
ts
|
||||
) + "\n" + str(e) + "\n"))
|
||||
#f.write(f"\n> {command}; ts={ts} version={config['VERSION']} docker={config['IN_DOCKER']} is_tty={config['IS_TTY']}\n")
|
||||
|
||||
# print(f' ERROR: {method_name} {e.__class__.__name__}: {e} {getattr(e, "hints", "")}', ts, link.url, command)
|
||||
raise Exception('Exception in archive_methods.save_{}(Link(url={}))'.format(
|
||||
method_name,
|
||||
link.url,
|
||||
)) from e
|
||||
|
||||
|
||||
# print(' ', stats)
|
||||
|
||||
|
@ -218,7 +215,7 @@ def archive_links(all_links: Union[Iterable[Link], QuerySet], overwrite: bool=Fa
|
|||
|
||||
if type(all_links) is QuerySet:
|
||||
num_links: int = all_links.count()
|
||||
get_link = lambda x: x.as_link()
|
||||
get_link = lambda x: x.as_link_with_details()
|
||||
all_links = all_links.iterator()
|
||||
else:
|
||||
num_links: int = len(all_links)
|
||||
|
|
|
@ -10,10 +10,12 @@ from ..system import run, chmod_file
|
|||
from ..util import (
|
||||
enforce_types,
|
||||
is_static_file,
|
||||
dedupe,
|
||||
)
|
||||
from ..config import (
|
||||
TIMEOUT,
|
||||
CURL_ARGS,
|
||||
CURL_EXTRA_ARGS,
|
||||
CHECK_SSL_VALIDITY,
|
||||
SAVE_ARCHIVE_DOT_ORG,
|
||||
CURL_BINARY,
|
||||
|
@ -44,13 +46,18 @@ def save_archive_dot_org(link: Link, out_dir: Optional[Path]=None, timeout: int=
|
|||
output: ArchiveOutput = 'archive.org.txt'
|
||||
archive_org_url = None
|
||||
submit_url = 'https://web.archive.org/save/{}'.format(link.url)
|
||||
cmd = [
|
||||
CURL_BINARY,
|
||||
# later options take precedence
|
||||
options = [
|
||||
*CURL_ARGS,
|
||||
*CURL_EXTRA_ARGS,
|
||||
'--head',
|
||||
'--max-time', str(timeout),
|
||||
*(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []),
|
||||
*([] if CHECK_SSL_VALIDITY else ['--insecure']),
|
||||
]
|
||||
cmd = [
|
||||
CURL_BINARY,
|
||||
*dedupe(options),
|
||||
submit_url,
|
||||
]
|
||||
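This options/cmd split (repeated for every curl, wget, and yt-dlp extractor below) relies on the dedupe() helper imported from ..util, whose implementation is not shown in this diff. A minimal sketch of the intended behavior, assuming options are keyed on the name before '=' so that later entries win, per the "later options take precedence" comment:

    from typing import List

    def dedupe(options: List[str]) -> List[str]:
        """Keep one copy of each option; later --name=value entries clobber earlier ones."""
        deduped = {}
        for option in options:
            deduped[option.split('=')[0]] = option  # key on the option name
        return list(deduped.values())

    # dedupe(['--timeout=30', '--quiet', '--timeout=60'])
    # -> ['--timeout=60', '--quiet']
    # Note: only '--name=value' style flags merge; space-separated pairs like
    # ('--max-time', '60') are separate list items and pass through unchanged.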
status = 'succeeded'
|
||||
|
|
|
@ -6,13 +6,18 @@ from typing import Optional
|
|||
|
||||
from ..index.schema import Link, ArchiveResult, ArchiveOutput
|
||||
from ..system import chmod_file, run
|
||||
from ..util import enforce_types, domain
|
||||
from ..util import (
|
||||
enforce_types,
|
||||
domain,
|
||||
dedupe,
|
||||
)
|
||||
from ..config import (
|
||||
TIMEOUT,
|
||||
SAVE_FAVICON,
|
||||
FAVICON_PROVIDER,
|
||||
CURL_BINARY,
|
||||
CURL_ARGS,
|
||||
CURL_EXTRA_ARGS,
|
||||
CURL_VERSION,
|
||||
CHECK_SSL_VALIDITY,
|
||||
CURL_USER_AGENT,
|
||||
|
@ -34,13 +39,18 @@ def save_favicon(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT)
|
|||
|
||||
out_dir = out_dir or link.link_dir
|
||||
output: ArchiveOutput = 'favicon.ico'
|
||||
cmd = [
|
||||
CURL_BINARY,
|
||||
# later options take precedence
|
||||
options = [
|
||||
*CURL_ARGS,
|
||||
*CURL_EXTRA_ARGS,
|
||||
'--max-time', str(timeout),
|
||||
'--output', str(output),
|
||||
*(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []),
|
||||
*([] if CHECK_SSL_VALIDITY else ['--insecure']),
|
||||
]
|
||||
cmd = [
|
||||
CURL_BINARY,
|
||||
*dedupe(options),
|
||||
FAVICON_PROVIDER.format(domain(link.url)),
|
||||
]
|
||||
status = 'failed'
|
||||
|
|
|
@ -9,11 +9,13 @@ from ..system import atomic_write
|
|||
from ..util import (
|
||||
enforce_types,
|
||||
get_headers,
|
||||
dedupe,
|
||||
)
|
||||
from ..config import (
|
||||
TIMEOUT,
|
||||
CURL_BINARY,
|
||||
CURL_ARGS,
|
||||
CURL_EXTRA_ARGS,
|
||||
CURL_USER_AGENT,
|
||||
CURL_VERSION,
|
||||
CHECK_SSL_VALIDITY,
|
||||
|
@ -40,14 +42,18 @@ def save_headers(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT)
|
|||
|
||||
status = 'succeeded'
|
||||
timer = TimedProgress(timeout, prefix=' ')
|
||||
|
||||
cmd = [
|
||||
CURL_BINARY,
|
||||
# later options take precedence
|
||||
options = [
|
||||
*CURL_ARGS,
|
||||
*CURL_EXTRA_ARGS,
|
||||
'--head',
|
||||
'--max-time', str(timeout),
|
||||
*(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []),
|
||||
*([] if CHECK_SSL_VALIDITY else ['--insecure']),
|
||||
]
|
||||
cmd = [
|
||||
CURL_BINARY,
|
||||
*dedupe(options),
|
||||
link.url,
|
||||
]
|
||||
try:
|
||||
|
|
|
@ -121,9 +121,11 @@ def save_htmltotext(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO
|
|||
|
||||
out_dir = Path(out_dir or link.link_dir)
|
||||
output = "htmltotext.txt"
|
||||
cmd = ['(internal) archivebox.extractors.htmltotext', './{singlefile,dom}.html']
|
||||
|
||||
timer = TimedProgress(timeout, prefix=' ')
|
||||
extracted_text = None
|
||||
status = 'failed'
|
||||
try:
|
||||
extractor = HTMLTextExtractor()
|
||||
document = get_html(link, out_dir)
|
||||
|
@ -136,10 +138,9 @@ def save_htmltotext(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO
|
|||
extracted_text = str(extractor)
|
||||
|
||||
atomic_write(str(out_dir / output), extracted_text)
|
||||
status = 'succeeded'
|
||||
except (Exception, OSError) as err:
|
||||
status = 'failed'
|
||||
output = err
|
||||
cmd = ['(internal) archivebox.extractors.htmltotext', './{singlefile,dom}.html']
|
||||
finally:
|
||||
timer.end()
|
||||
|
||||
|
|
|
@ -8,11 +8,13 @@ from ..system import run, chmod_file
|
|||
from ..util import (
|
||||
enforce_types,
|
||||
is_static_file,
|
||||
dedupe,
|
||||
)
|
||||
from ..config import (
|
||||
MEDIA_TIMEOUT,
|
||||
SAVE_MEDIA,
|
||||
YOUTUBEDL_ARGS,
|
||||
YOUTUBEDL_EXTRA_ARGS,
|
||||
YOUTUBEDL_BINARY,
|
||||
YOUTUBEDL_VERSION,
|
||||
CHECK_SSL_VALIDITY
|
||||
|
@ -39,11 +41,16 @@ def save_media(link: Link, out_dir: Optional[Path]=None, timeout: int=MEDIA_TIME
|
|||
output: ArchiveOutput = 'media'
|
||||
output_path = out_dir / output
|
||||
output_path.mkdir(exist_ok=True)
|
||||
cmd = [
|
||||
YOUTUBEDL_BINARY,
|
||||
# later options take precedence
|
||||
options = [
|
||||
*YOUTUBEDL_ARGS,
|
||||
*YOUTUBEDL_EXTRA_ARGS,
|
||||
*([] if CHECK_SSL_VALIDITY else ['--no-check-certificate']),
|
||||
# TODO: add --cookies-from-browser={CHROME_USER_DATA_DIR}
|
||||
]
|
||||
cmd = [
|
||||
YOUTUBEDL_BINARY,
|
||||
*dedupe(options),
|
||||
link.url,
|
||||
]
|
||||
status = 'succeeded'
|
||||
|
|
|
@ -11,13 +11,15 @@ from ..system import run, atomic_write
|
|||
from ..util import (
|
||||
enforce_types,
|
||||
is_static_file,
|
||||
|
||||
dedupe,
|
||||
)
|
||||
from ..config import (
|
||||
TIMEOUT,
|
||||
SAVE_MERCURY,
|
||||
DEPENDENCIES,
|
||||
MERCURY_VERSION,
|
||||
MERCURY_ARGS,
|
||||
MERCURY_EXTRA_ARGS,
|
||||
)
|
||||
from ..logging_util import TimedProgress
|
||||
|
||||
|
@ -60,12 +62,16 @@ def save_mercury(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT)
|
|||
timer = TimedProgress(timeout, prefix=' ')
|
||||
try:
|
||||
output_folder.mkdir(exist_ok=True)
|
||||
|
||||
# Get plain text version of article
|
||||
# later options take precedence
|
||||
options = [
|
||||
*MERCURY_ARGS,
|
||||
*MERCURY_EXTRA_ARGS,
|
||||
]
|
||||
# By default, get plain text version of article
|
||||
cmd = [
|
||||
DEPENDENCIES['MERCURY_BINARY']['path'],
|
||||
link.url,
|
||||
"--format=text"
|
||||
*dedupe(options)
|
||||
]
|
||||
result = run(cmd, cwd=out_dir, timeout=timeout)
|
||||
try:
|
||||
|
|
|
@ -11,6 +11,7 @@ from ..util import (
|
|||
enforce_types,
|
||||
is_static_file,
|
||||
chrome_args,
|
||||
dedupe,
|
||||
)
|
||||
from ..config import (
|
||||
TIMEOUT,
|
||||
|
@ -18,7 +19,9 @@ from ..config import (
|
|||
DEPENDENCIES,
|
||||
SINGLEFILE_VERSION,
|
||||
SINGLEFILE_ARGS,
|
||||
SINGLEFILE_EXTRA_ARGS,
|
||||
CHROME_BINARY,
|
||||
COOKIES_FILE,
|
||||
)
|
||||
from ..logging_util import TimedProgress
|
||||
|
||||
|
@ -46,37 +49,24 @@ def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO
|
|||
|
||||
# SingleFile CLI Docs: https://github.com/gildas-lormeau/SingleFile/tree/master/cli
|
||||
browser_args = '--browser-args={}'.format(json.dumps(browser_args[1:]))
|
||||
# later options take precedence
|
||||
options = [
|
||||
*SINGLEFILE_ARGS,
|
||||
'--browser-executable-path={}'.format(CHROME_BINARY),
|
||||
*(["--browser-cookies-file={}".format(COOKIES_FILE)] if COOKIES_FILE else []),
|
||||
browser_args,
|
||||
*SINGLEFILE_ARGS,
|
||||
*SINGLEFILE_EXTRA_ARGS,
|
||||
]
|
||||
|
||||
# Deduplicate options (single-file doesn't like when you use the same option two times)
|
||||
#
|
||||
# NOTE: Options names that come first clobber conflicting names that come later
|
||||
# My logic is SINGLEFILE_ARGS is the option that affects the singlefile command with most
|
||||
# specificity, therefore the user sets it with a lot of intent, therefore it should take precedence
|
||||
# kind of like the ergonomic principle of lexical scope in programming languages.
|
||||
seen_option_names = []
|
||||
def test_seen(argument):
|
||||
option_name = argument.split("=")[0]
|
||||
if option_name in seen_option_names:
|
||||
return False
|
||||
else:
|
||||
seen_option_names.append(option_name)
|
||||
return True
|
||||
deduped_options = list(filter(test_seen, options))
|
||||
|
||||
cmd = [
|
||||
DEPENDENCIES['SINGLEFILE_BINARY']['path'],
|
||||
*deduped_options,
|
||||
*dedupe(options),
|
||||
link.url,
|
||||
output,
|
||||
]
|
||||
|
||||
status = 'succeeded'
|
||||
timer = TimedProgress(timeout, prefix=' ')
|
||||
result = None
|
||||
try:
|
||||
result = run(cmd, cwd=str(out_dir), timeout=timeout)
|
||||
|
||||
|
@ -84,7 +74,7 @@ def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO
|
|||
# "Downloaded: 76 files, 4.0M in 1.6s (2.52 MB/s)"
|
||||
output_tail = [
|
||||
line.strip()
|
||||
for line in (result.stdout + result.stderr).decode().rsplit('\n', 3)[-3:]
|
||||
for line in (result.stdout + result.stderr).decode().rsplit('\n', 5)[-5:]
|
||||
if line.strip()
|
||||
]
|
||||
hints = (
|
||||
|
@ -94,12 +84,13 @@ def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO
|
|||
|
||||
# Check for common failure cases
|
||||
if (result.returncode > 0) or not (out_dir / output).is_file():
|
||||
raise ArchiveError('SingleFile was not able to archive the page', hints)
|
||||
raise ArchiveError(f'SingleFile was not able to archive the page (status={result.returncode})', hints)
|
||||
chmod_file(output, cwd=str(out_dir))
|
||||
except (Exception, OSError) as err:
|
||||
status = 'failed'
|
||||
# TODO: Make this prettier. This is necessary to run the command (escape JSON internal quotes).
|
||||
cmd[2] = browser_args.replace('"', "\\\"")
|
||||
err.hints = (result.stdout + result.stderr).decode().split('\n')
|
||||
output = err
|
||||
finally:
|
||||
timer.end()
|
||||
|
|
|
@ -10,6 +10,7 @@ from ..util import (
|
|||
enforce_types,
|
||||
download_url,
|
||||
htmldecode,
|
||||
dedupe,
|
||||
)
|
||||
from ..config import (
|
||||
TIMEOUT,
|
||||
|
@ -17,6 +18,7 @@ from ..config import (
|
|||
SAVE_TITLE,
|
||||
CURL_BINARY,
|
||||
CURL_ARGS,
|
||||
CURL_EXTRA_ARGS,
|
||||
CURL_VERSION,
|
||||
CURL_USER_AGENT,
|
||||
)
|
||||
|
@ -75,7 +77,7 @@ def get_html(link: Link, path: Path, timeout: int=TIMEOUT) -> str:
|
|||
with open(abs_path / source, "r", encoding="utf-8") as f:
|
||||
document = f.read()
|
||||
break
|
||||
except (FileNotFoundError, TypeError):
|
||||
except (FileNotFoundError, TypeError, UnicodeDecodeError):
|
||||
continue
|
||||
if document is None:
|
||||
return download_url(link.url, timeout=timeout)
|
||||
|
@ -102,12 +104,17 @@ def save_title(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -
|
|||
from core.models import Snapshot
|
||||
|
||||
output: ArchiveOutput = None
|
||||
cmd = [
|
||||
CURL_BINARY,
|
||||
# later options take precedence
|
||||
options = [
|
||||
*CURL_ARGS,
|
||||
*CURL_EXTRA_ARGS,
|
||||
'--max-time', str(timeout),
|
||||
*(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []),
|
||||
*([] if CHECK_SSL_VALIDITY else ['--insecure']),
|
||||
]
|
||||
cmd = [
|
||||
CURL_BINARY,
|
||||
*dedupe(options),
|
||||
link.url,
|
||||
]
|
||||
status = 'succeeded'
|
||||
|
|
|
@ -15,9 +15,11 @@ from ..util import (
|
|||
path,
|
||||
domain,
|
||||
urldecode,
|
||||
dedupe,
|
||||
)
|
||||
from ..config import (
|
||||
WGET_ARGS,
|
||||
WGET_EXTRA_ARGS,
|
||||
TIMEOUT,
|
||||
SAVE_WGET,
|
||||
SAVE_WARC,
|
||||
|
@ -55,10 +57,10 @@ def save_wget(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) ->
|
|||
|
||||
# WGET CLI Docs: https://www.gnu.org/software/wget/manual/wget.html
|
||||
output: ArchiveOutput = None
|
||||
cmd = [
|
||||
WGET_BINARY,
|
||||
# '--server-response', # print headers for better error parsing
|
||||
# later options take precedence
|
||||
options = [
|
||||
*WGET_ARGS,
|
||||
*WGET_EXTRA_ARGS,
|
||||
'--timeout={}'.format(timeout),
|
||||
*(['--restrict-file-names={}'.format(RESTRICT_FILE_NAMES)] if RESTRICT_FILE_NAMES else []),
|
||||
*(['--warc-file={}'.format(str(warc_path))] if SAVE_WARC else []),
|
||||
|
@ -68,6 +70,11 @@ def save_wget(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) ->
|
|||
*(['--compression=auto'] if WGET_AUTO_COMPRESSION else []),
|
||||
*([] if SAVE_WARC else ['--timestamping']),
|
||||
*([] if CHECK_SSL_VALIDITY else ['--no-check-certificate', '--no-hsts']),
|
||||
# '--server-response', # print headers for better error parsing
|
||||
]
|
||||
cmd = [
|
||||
WGET_BINARY,
|
||||
*dedupe(options),
|
||||
link.url,
|
||||
]
|
||||
|
||||
|
@ -126,64 +133,38 @@ def save_wget(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) ->
|
|||
|
||||
|
||||
@enforce_types
|
||||
def wget_output_path(link: Link) -> Optional[str]:
|
||||
"""calculate the path to the wgetted .html file, since wget may
|
||||
adjust some paths to be different than the base_url path.
|
||||
|
||||
See docs on wget --adjust-extension (-E)
|
||||
"""
|
||||
|
||||
# Wget downloads can save in a number of different ways depending on the url:
|
||||
# https://example.com
|
||||
# > example.com/index.html
|
||||
# https://example.com?v=zzVa_tX1OiI
|
||||
# > example.com/index.html?v=zzVa_tX1OiI.html
|
||||
# https://www.example.com/?v=zzVa_tX1OiI
|
||||
# > example.com/index.html?v=zzVa_tX1OiI.html
|
||||
|
||||
# https://example.com/abc
|
||||
# > example.com/abc.html
|
||||
# https://example.com/abc/
|
||||
# > example.com/abc/index.html
|
||||
# https://example.com/abc?v=zzVa_tX1OiI.html
|
||||
# > example.com/abc?v=zzVa_tX1OiI.html
|
||||
# https://example.com/abc/?v=zzVa_tX1OiI.html
|
||||
# > example.com/abc/index.html?v=zzVa_tX1OiI.html
|
||||
|
||||
# https://example.com/abc/test.html
|
||||
# > example.com/abc/test.html
|
||||
# https://example.com/abc/test?v=zzVa_tX1OiI
|
||||
# > example.com/abc/test?v=zzVa_tX1OiI.html
|
||||
# https://example.com/abc/test/?v=zzVa_tX1OiI
|
||||
# > example.com/abc/test/index.html?v=zzVa_tX1OiI.html
|
||||
|
||||
# There's also lots of complexity around how the urlencoding and renaming
|
||||
# is done for pages with query and hash fragments or extensions like shtml / htm / php / etc
|
||||
|
||||
# Since the wget algorithm for -E (appending .html) is incredibly complex
|
||||
# and there's no way to get the computed output path from wget
|
||||
# in order to avoid having to reverse-engineer how they calculate it,
|
||||
# we just look in the output folder read the filename wget used from the filesystem
|
||||
def unsafe_wget_output_path(link: Link) -> Optional[str]:
|
||||
# There used to be a bunch of complex reverse-engineering path mapping logic here,
|
||||
# but it was removed in favor of just walking through the output folder recursively to try to find the
|
||||
# html file that wget produced. It's *much much much* slower than deriving it statically, and is currently
|
||||
# one of the main bottlenecks of ArchiveBox's performance (the output data is often on a slow HDD or network mount).
|
||||
# But it's STILL better than trying to figure out URL -> html filepath mappings ourselves from first principles.
|
||||
full_path = without_fragment(without_query(path(link.url))).strip('/')
|
||||
search_dir = Path(link.link_dir) / domain(link.url).replace(":", "+") / urldecode(full_path)
|
||||
for _ in range(4):
|
||||
if search_dir.exists():
|
||||
if search_dir.is_dir():
|
||||
html_files = [
|
||||
f for f in search_dir.iterdir()
|
||||
if re.search(".+\\.[Ss]?[Hh][Tt][Mm][Ll]?$", str(f), re.I | re.M)
|
||||
]
|
||||
if html_files:
|
||||
return str(html_files[0].relative_to(link.link_dir))
|
||||
try:
|
||||
if search_dir.exists():
|
||||
if search_dir.is_dir():
|
||||
html_files = [
|
||||
f for f in search_dir.iterdir()
|
||||
if re.search(".+\\.[Ss]?[Hh][Tt][Mm][Ll]?$", str(f), re.I | re.M)
|
||||
]
|
||||
if html_files:
|
||||
return str(html_files[0].relative_to(link.link_dir))
|
||||
|
||||
# sometimes wget'd URLs have no ext and return non-html
|
||||
# e.g. /some/example/rss/all -> some RSS XML content)
|
||||
# /some/other/url.o4g -> some binary unrecognized ext)
|
||||
# test this with archivebox add --depth=1 https://getpocket.com/users/nikisweeting/feed/all
|
||||
last_part_of_url = urldecode(full_path.rsplit('/', 1)[-1])
|
||||
for file_present in search_dir.iterdir():
|
||||
if file_present == last_part_of_url:
|
||||
return str((search_dir / file_present).relative_to(link.link_dir))
|
||||
# sometimes wget'd URLs have no ext and return non-html
|
||||
# e.g. /some/example/rss/all -> some RSS XML content)
|
||||
# /some/other/url.o4g -> some binary unrecognized ext)
|
||||
# test this with archivebox add --depth=1 https://getpocket.com/users/nikisweeting/feed/all
|
||||
last_part_of_url = urldecode(full_path.rsplit('/', 1)[-1])
|
||||
for file_present in search_dir.iterdir():
|
||||
if file_present == last_part_of_url:
|
||||
return str((search_dir / file_present).relative_to(link.link_dir))
|
||||
except OSError:
|
||||
# OSError 36 and others can happen here, caused by trying to check for impossible paths
|
||||
# (paths derived from URLs can often contain illegal unicode characters or be too long,
|
||||
# causing the OS / filesystem to reject trying to open them with a system-level error)
|
||||
pass
|
||||
|
||||
# Move up one directory level
|
||||
search_dir = search_dir.parent
|
||||
|
@ -193,13 +174,101 @@ def wget_output_path(link: Link) -> Optional[str]:
|
|||
|
||||
# check for literally any file present that isnt an empty folder
|
||||
domain_dir = Path(domain(link.url).replace(":", "+"))
|
||||
files_within = list((Path(link.link_dir) / domain_dir).glob('**/*.*'))
|
||||
files_within = [path for path in (Path(link.link_dir) / domain_dir).glob('**/*.*') if not str(path).endswith('.orig')]
|
||||
if files_within:
|
||||
return str((domain_dir / files_within[-1]).relative_to(link.link_dir))
|
||||
|
||||
# abandon all hope, wget either never downloaded, or it produced an output path so horribly mutilated
|
||||
# that it's better we just pretend it doesnt exist
|
||||
# this is why ArchiveBox's specializes in REDUNDANTLY saving copies of sites with multiple different tools
|
||||
return None
|
||||
|
||||
|
||||
@enforce_types
|
||||
def wget_output_path(link: Link) -> Optional[str]:
|
||||
"""calculate the path to the wgetted .html file, since wget may
|
||||
adjust some paths to be different than the base_url path.
|
||||
|
||||
See docs on: wget --adjust-extension (-E), --restrict-file-names=windows|unix|ascii, --convert-links
|
||||
|
||||
WARNING: this function is extremely error prone because mapping URLs to filesystem paths deterministically
|
||||
is basically impossible. Every OS and filesystem have different requirements on what special characters are
|
||||
allowed, and URLs are *full* of all kinds of special characters, illegal unicode, and generally unsafe strings
|
||||
that you dont want anywhere near your filesystem. Also URLs can be obscenely long, but most filesystems dont
|
||||
accept paths longer than 250 characters. On top of all that, this function only exists to try to reverse engineer
|
||||
wget's approach to solving this problem, so this is a shittier, less tested version of their already insanely
|
||||
complicated attempt to do this. Here be dragons:
|
||||
- https://github.com/ArchiveBox/ArchiveBox/issues/549
|
||||
- https://github.com/ArchiveBox/ArchiveBox/issues/1373
|
||||
- https://stackoverflow.com/questions/9532499/check-whether-a-path-is-valid-in-python-without-creating-a-file-at-the-paths-ta
|
||||
- and probably many more that I didn't realize were caused by this...
|
||||
|
||||
The only constructive thing we could possibly do to this function is to figure out how to remove it.
|
||||
|
||||
Preach loudly to anyone who will listen: never attempt to map URLs to filesystem paths,
|
||||
and pray you never have to deal with the aftermath of someone else's attempt to do so...
|
||||
"""
|
||||
|
||||
# Wget downloads can save in a number of different ways depending on the url:
|
||||
# https://example.com
|
||||
# > example.com/index.html
|
||||
# https://example.com?v=zzVa_tX1OiI
|
||||
# > example.com/index.html@v=zzVa_tX1OiI.html
|
||||
# https://www.example.com/?v=zzVa_tX1OiI
|
||||
# > example.com/index.html@v=zzVa_tX1OiI.html
|
||||
|
||||
# https://example.com/abc
|
||||
# > example.com/abc.html
|
||||
# https://example.com/abc/
|
||||
# > example.com/abc/index.html
|
||||
# https://example.com/abc?v=zzVa_tX1OiI.html
|
||||
# > example.com/abc@v=zzVa_tX1OiI.html
|
||||
# https://example.com/abc/?v=zzVa_tX1OiI.html
|
||||
# > example.com/abc/index.html@v=zzVa_tX1OiI.html
|
||||
|
||||
# https://example.com/abc/test.html
|
||||
# > example.com/abc/test.html
|
||||
# https://example.com/abc/test?v=zzVa_tX1OiI
|
||||
# > example.com/abc/test@v=zzVa_tX1OiI.html
|
||||
# https://example.com/abc/test/?v=zzVa_tX1OiI
|
||||
# > example.com/abc/test/index.html@v=zzVa_tX1OiI.html
|
||||
|
||||
# There's also lots of complexity around how the urlencoding and renaming
|
||||
# is done for pages with query and hash fragments, extensions like shtml / htm / php / etc,
|
||||
# unicode escape sequences, punycode domain names, unicode double-width characters, extensions longer than
|
||||
# 4 characters, paths with multiple extensions, etc. the list goes on...
|
||||
|
||||
output_path = None
|
||||
try:
|
||||
output_path = unsafe_wget_output_path(link)
|
||||
except Exception as err:
|
||||
pass # better to pretend it just failed to download than expose gnarly OSErrors to users
|
||||
|
||||
# check for unprintable unicode characters
|
||||
# https://github.com/ArchiveBox/ArchiveBox/issues/1373
|
||||
if output_path:
|
||||
safe_path = output_path.encode('utf-8', 'replace').decode()
|
||||
if output_path != safe_path:
|
||||
# contains unprintable unicode characters that will break other parts of archivebox
|
||||
# better to pretend it doesnt exist and fallback to parent dir than crash archivebox
|
||||
output_path = None
|
||||
|
||||
# check for a path that is just too long to safely handle across different OS's
|
||||
# https://github.com/ArchiveBox/ArchiveBox/issues/549
|
||||
if output_path and len(output_path) > 250:
|
||||
output_path = None
|
||||
|
||||
if output_path:
|
||||
return output_path
|
||||
|
||||
# fallback to just the domain dir
|
||||
search_dir = Path(link.link_dir) / domain(link.url).replace(":", "+")
|
||||
if search_dir.is_dir():
|
||||
return domain(link.url).replace(":", "+")
|
||||
|
||||
# fallback to just the domain dir without port
|
||||
search_dir = Path(link.link_dir) / domain(link.url).split(":", 1)[0]
|
||||
if search_dir.is_dir():
|
||||
return domain(link.url).split(":", 1)[0]
|
||||
|
||||
return None
|
||||
|
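The two guards added in wget_output_path() above are worth seeing in isolation: they reject candidate paths that either round-trip badly through UTF-8 (e.g. lone surrogates from mangled filenames) or exceed common filesystem length limits. A standalone sketch (the helper name is made up for illustration):

    def is_usable_output_path(output_path: str) -> bool:
        # same checks as wget_output_path() above, extracted for illustration
        if output_path != output_path.encode('utf-8', 'replace').decode():
            return False  # unprintable/invalid unicode (issue #1373)
        if len(output_path) > 250:
            return False  # too long for many filesystems (issue #549)
        return True

    assert is_usable_output_path('example.com/index.html')
    assert not is_usable_output_path('example.com/index\udc80.html')  # lone surrogate
    assert not is_usable_output_path('example.com/' + 'a' * 300)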
|
0
archivebox/index.sqlite3
Normal file
|
@ -250,7 +250,7 @@ def load_main_index(out_dir: Path=OUTPUT_DIR, warn: bool=True) -> List[Link]:
|
|||
"""parse and load existing index with any new links from import_path merged in"""
|
||||
from core.models import Snapshot
|
||||
try:
|
||||
return Snapshot.objects.all()
|
||||
return Snapshot.objects.all().only('id')
|
||||
|
||||
except (KeyboardInterrupt, SystemExit):
|
||||
raise SystemExit(0)
|
||||
|
@ -407,7 +407,7 @@ def snapshot_filter(snapshots: QuerySet, filter_patterns: List[str], filter_type
|
|||
|
||||
def get_indexed_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
|
||||
"""indexed links without checking archive status or data directory validity"""
|
||||
links = [snapshot.as_link_with_details() for snapshot in snapshots.iterator()]
|
||||
links = (snapshot.as_link() for snapshot in snapshots.iterator())
|
||||
return {
|
||||
link.link_dir: link
|
||||
for link in links
|
||||
|
@ -415,7 +415,7 @@ def get_indexed_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Option
|
|||
|
||||
def get_archived_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
|
||||
"""indexed links that are archived with a valid data directory"""
|
||||
links = [snapshot.as_link_with_details() for snapshot in snapshots.iterator()]
|
||||
links = (snapshot.as_link() for snapshot in snapshots.iterator())
|
||||
return {
|
||||
link.link_dir: link
|
||||
for link in filter(is_archived, links)
|
||||
|
@ -423,7 +423,7 @@ def get_archived_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optio
|
|||
|
||||
def get_unarchived_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
|
||||
"""indexed links that are unarchived with no data directory or an empty data directory"""
|
||||
links = [snapshot.as_link_with_details() for snapshot in snapshots.iterator()]
|
||||
links = (snapshot.as_link() for snapshot in snapshots.iterator())
|
||||
return {
|
||||
link.link_dir: link
|
||||
for link in filter(is_unarchived, links)
|
||||
|
|
|
@ -4,6 +4,7 @@ WARNING: THIS FILE IS ALL LEGACY CODE TO BE REMOVED.
|
|||
|
||||
DO NOT ADD ANY NEW FEATURES TO THIS FILE, NEW CODE GOES HERE: core/models.py
|
||||
|
||||
These are the old types we used to use before ArchiveBox v0.4 (before we switched to Django).
|
||||
"""
|
||||
|
||||
__package__ = 'archivebox.index'
|
||||
|
@ -379,11 +380,15 @@ class Link:
|
|||
|
||||
output_paths = (
|
||||
domain(self.url),
|
||||
'output.html',
|
||||
'output.pdf',
|
||||
'screenshot.png',
|
||||
'output.html',
|
||||
'singlefile.html',
|
||||
'readability/content.html',
|
||||
'mercury/content.html',
|
||||
'htmltotext.txt',
|
||||
'media',
|
||||
'singlefile.html'
|
||||
'git',
|
||||
)
|
||||
|
||||
return any(
|
||||
|
|
|
@ -432,12 +432,14 @@ def log_archive_method_finished(result: "ArchiveResult"):
|
|||
**ANSI,
|
||||
),
|
||||
]
|
||||
|
||||
# import pudb; pudb.set_trace()
|
||||
|
||||
# Prettify error output hints string and limit to five lines
|
||||
hints = getattr(result.output, 'hints', None) or ()
|
||||
if hints:
|
||||
if isinstance(hints, (list, tuple, type(_ for _ in ()))):
|
||||
hints = [hint.decode() for hint in hints if isinstance(hint, bytes)]
|
||||
hints = [hint.decode() if isinstance(hint, bytes) else str(hint) for hint in hints]
|
||||
else:
|
||||
if isinstance(hints, bytes):
|
||||
hints = hints.decode()
|
||||
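Note: type(_ for _ in ()) in the isinstance() check above is simply a way to obtain the built-in generator type, so the normalization accepts lists, tuples, and generators alike before decoding any bytes hints.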
|
@ -636,17 +638,15 @@ def printable_folder_status(name: str, folder: Dict) -> str:
|
|||
|
||||
@enforce_types
|
||||
def printable_dependency_version(name: str, dependency: Dict) -> str:
|
||||
version = None
|
||||
color, symbol, note, version = 'red', 'X', 'invalid', '?'
|
||||
|
||||
if dependency['enabled']:
|
||||
if dependency['is_valid']:
|
||||
color, symbol, note, version = 'green', '√', 'valid', ''
|
||||
color, symbol, note = 'green', '√', 'valid'
|
||||
|
||||
parsed_version_num = re.search(r'[\d\.]+', dependency['version'])
|
||||
if parsed_version_num:
|
||||
version = f'v{parsed_version_num[0]}'
|
||||
|
||||
if not version:
|
||||
color, symbol, note, version = 'red', 'X', 'invalid', '?'
|
||||
else:
|
||||
color, symbol, note, version = 'lightyellow', '-', 'disabled', '-'
|
||||
|
||||
|
|
|
@ -104,7 +104,6 @@ from .config import (
|
|||
COMMIT_HASH,
|
||||
BUILD_TIME,
|
||||
CODE_LOCATIONS,
|
||||
EXTERNAL_LOCATIONS,
|
||||
DATA_LOCATIONS,
|
||||
DEPENDENCIES,
|
||||
CHROME_BINARY,
|
||||
|
@ -231,7 +230,7 @@ def version(quiet: bool=False,
|
|||
p = platform.uname()
|
||||
print(
|
||||
'ArchiveBox v{}'.format(get_version(CONFIG)),
|
||||
*((f'COMMIT_HASH={COMMIT_HASH[:7]}',) if COMMIT_HASH else ()),
|
||||
f'COMMIT_HASH={COMMIT_HASH[:7] if COMMIT_HASH else "unknown"}',
|
||||
f'BUILD_TIME={BUILD_TIME}',
|
||||
)
|
||||
print(
|
||||
|
@ -272,11 +271,6 @@ def version(quiet: bool=False,
|
|||
for name, path in CODE_LOCATIONS.items():
|
||||
print(printable_folder_status(name, path))
|
||||
|
||||
print()
|
||||
print('{white}[i] Secrets locations:{reset}'.format(**ANSI))
|
||||
for name, path in EXTERNAL_LOCATIONS.items():
|
||||
print(printable_folder_status(name, path))
|
||||
|
||||
print()
|
||||
if DATA_LOCATIONS['OUTPUT_DIR']['is_valid']:
|
||||
print('{white}[i] Data locations:{reset}'.format(**ANSI))
|
||||
|
@ -695,7 +689,7 @@ def add(urls: Union[str, List[str]],
|
|||
if CAN_UPGRADE:
|
||||
hint(f"There's a new version of ArchiveBox available! Your current version is {VERSION}. You can upgrade to {VERSIONS_AVAILABLE['recommended_version']['tag_name']} ({VERSIONS_AVAILABLE['recommended_version']['html_url']}). For more on how to upgrade: https://github.com/ArchiveBox/ArchiveBox/wiki/Upgrading-or-Merging-Archives\n")
|
||||
|
||||
return all_links
|
||||
return new_links
|
||||
|
||||
@enforce_types
|
||||
def remove(filter_str: Optional[str]=None,
|
||||
|
@ -791,6 +785,8 @@ def update(resume: Optional[float]=None,
|
|||
out_dir: Path=OUTPUT_DIR) -> List[Link]:
|
||||
"""Import any new links from subscriptions and retry any previously failed/skipped links"""
|
||||
|
||||
from core.models import ArchiveResult
|
||||
|
||||
check_data_folder(out_dir=out_dir)
|
||||
check_dependencies()
|
||||
new_links: List[Link] = [] # TODO: Remove input argument: only_new
|
||||
|
@ -798,19 +794,23 @@ def update(resume: Optional[float]=None,
|
|||
extractors = extractors.split(",") if extractors else []
|
||||
|
||||
# Step 1: Filter for selected_links
|
||||
print('[*] Finding matching Snapshots to update...')
|
||||
print(f' - Filtering by {" ".join(filter_patterns)} ({filter_type}) {before=} {after=} {status=}...')
|
||||
matching_snapshots = list_links(
|
||||
filter_patterns=filter_patterns,
|
||||
filter_type=filter_type,
|
||||
before=before,
|
||||
after=after,
|
||||
)
|
||||
|
||||
print(f' - Checking {matching_snapshots.count()} snapshot folders for existing data with {status=}...')
|
||||
matching_folders = list_folders(
|
||||
links=matching_snapshots,
|
||||
status=status,
|
||||
out_dir=out_dir,
|
||||
)
|
||||
all_links = [link for link in matching_folders.values() if link]
|
||||
all_links = (link for link in matching_folders.values() if link)
|
||||
print(' - Sorting by most unfinished -> least unfinished + date archived...')
|
||||
all_links = sorted(all_links, key=lambda link: (ArchiveResult.objects.filter(snapshot__url=link.url).count(), link.timestamp))
|
||||
|
||||
if index_only:
|
||||
for link in all_links:
|
||||
|
@ -836,6 +836,7 @@ def update(resume: Optional[float]=None,
|
|||
if extractors:
|
||||
archive_kwargs["methods"] = extractors
|
||||
|
||||
|
||||
archive_links(to_archive, overwrite=overwrite, **archive_kwargs)
|
||||
|
||||
# Step 4: Re-write links index with updated titles, icons, and resources
|
||||
|
@ -1355,7 +1356,7 @@ def manage(args: Optional[List[str]]=None, out_dir: Path=OUTPUT_DIR) -> None:
|
|||
if (args and "createsuperuser" in args) and (IN_DOCKER and not IS_TTY):
|
||||
stderr('[!] Warning: you need to pass -it to use interactive commands in docker', color='lightyellow')
|
||||
stderr(' docker run -it archivebox manage {}'.format(' '.join(args or ['...'])), color='lightyellow')
|
||||
stderr()
|
||||
stderr('')
|
||||
|
||||
execute_from_command_line([f'{ARCHIVEBOX_BINARY} manage', *(args or ['help'])])
|
||||
|
||||
|
|
|
@ -7,7 +7,7 @@ if __name__ == '__main__':
|
|||
# versions of ./manage.py commands whenever possible. When that's not possible
|
||||
# (e.g. makemigrations), you can comment out this check temporarily
|
||||
|
||||
if not ('makemigrations' in sys.argv or 'migrate' in sys.argv):
|
||||
if not ('makemigrations' in sys.argv or 'migrate' in sys.argv or 'startapp' in sys.argv):
|
||||
print("[X] Don't run ./manage.py directly (unless you are a developer running makemigrations):")
|
||||
print()
|
||||
print(' Hint: Use these archivebox CLI commands instead of the ./manage.py equivalents:')
|
||||
|
|
2391
archivebox/package-lock.json
generated
Normal file
File diff suppressed because it is too large
|
@ -1,6 +1,6 @@
|
|||
{
|
||||
"name": "archivebox",
|
||||
"version": "0.7.2",
|
||||
"version": "0.8.0",
|
||||
"description": "ArchiveBox: The self-hosted internet archive",
|
||||
"author": "Nick Sweeting <archivebox-npm@sweeting.me>",
|
||||
"repository": "github:ArchiveBox/ArchiveBox",
|
||||
|
@ -8,6 +8,6 @@
|
|||
"dependencies": {
|
||||
"@postlight/parser": "^2.2.3",
|
||||
"readability-extractor": "github:ArchiveBox/readability-extractor",
|
||||
"single-file-cli": "^1.1.46"
|
||||
"single-file-cli": "^1.1.54"
|
||||
}
|
||||
}
|
||||
|
|
|
@ -7,7 +7,6 @@ For examples of supported import formats see tests/.
|
|||
|
||||
__package__ = 'archivebox.parsers'
|
||||
|
||||
import re
|
||||
from io import StringIO
|
||||
|
||||
from typing import IO, Tuple, List, Optional
|
||||
|
@ -28,7 +27,6 @@ from ..util import (
|
|||
htmldecode,
|
||||
download_url,
|
||||
enforce_types,
|
||||
URL_REGEX,
|
||||
)
|
||||
from ..index.schema import Link
|
||||
from ..logging_util import TimedProgress, log_source_saved
|
||||
|
@ -44,6 +42,7 @@ from . import medium_rss
|
|||
from . import netscape_html
|
||||
from . import generic_rss
|
||||
from . import generic_json
|
||||
from . import generic_jsonl
|
||||
from . import generic_html
|
||||
from . import generic_txt
|
||||
from . import url_list
|
||||
|
@ -63,6 +62,7 @@ PARSERS = {
|
|||
netscape_html.KEY: (netscape_html.NAME, netscape_html.PARSER),
|
||||
generic_rss.KEY: (generic_rss.NAME, generic_rss.PARSER),
|
||||
generic_json.KEY: (generic_json.NAME, generic_json.PARSER),
|
||||
generic_jsonl.KEY: (generic_jsonl.NAME, generic_jsonl.PARSER),
|
||||
generic_html.KEY: (generic_html.NAME, generic_html.PARSER),
|
||||
|
||||
# Catchall fallback parser
|
||||
|
@ -200,54 +200,3 @@ def save_file_as_source(path: str, timeout: int=TIMEOUT, filename: str='{ts}-{ba
|
|||
log_source_saved(source_file=source_path)
|
||||
|
||||
return source_path
|
||||
|
||||
|
||||
# Check that plain text regex URL parsing works as expected
|
||||
# this is last-line-of-defense to make sure the URL_REGEX isn't
|
||||
# misbehaving due to some OS-level or environment level quirks (e.g. bad regex lib)
|
||||
# the consequences of bad URL parsing could be disastrous and lead to many
|
||||
# incorrect/badly parsed links being added to the archive, so this is worth the cost of checking
|
||||
_test_url_strs = {
|
||||
'example.com': 0,
|
||||
'/example.com': 0,
|
||||
'//example.com': 0,
|
||||
':/example.com': 0,
|
||||
'://example.com': 0,
|
||||
'htt://example8.com': 0,
|
||||
'/htt://example.com': 0,
|
||||
'https://example': 1,
|
||||
'https://localhost/2345': 1,
|
||||
'https://localhost:1234/123': 1,
|
||||
'://': 0,
|
||||
'https://': 0,
|
||||
'http://': 0,
|
||||
'ftp://': 0,
|
||||
'ftp://example.com': 0,
|
||||
'https://example.com': 1,
|
||||
'https://example.com/': 1,
|
||||
'https://a.example.com': 1,
|
||||
'https://a.example.com/': 1,
|
||||
'https://a.example.com/what/is/happening.html': 1,
|
||||
'https://a.example.com/what/ís/happening.html': 1,
|
||||
'https://a.example.com/what/is/happening.html?what=1&2%20b#höw-about-this=1a': 1,
|
||||
'https://a.example.com/what/is/happéning/?what=1&2%20b#how-aboüt-this=1a': 1,
|
||||
'HTtpS://a.example.com/what/is/happening/?what=1&2%20b#how-about-this=1af&2f%20b': 1,
|
||||
'https://example.com/?what=1#how-about-this=1&2%20baf': 1,
|
||||
'https://example.com?what=1#how-about-this=1&2%20baf': 1,
|
||||
'<test>http://example7.com</test>': 1,
|
||||
'https://<test>': 0,
|
||||
'https://[test]': 0,
|
||||
'http://"test"': 0,
|
||||
'http://\'test\'': 0,
|
||||
'[https://example8.com/what/is/this.php?what=1]': 1,
|
||||
'[and http://example9.com?what=1&other=3#and-thing=2]': 1,
|
||||
'<what>https://example10.com#and-thing=2 "</about>': 1,
|
||||
'abc<this["https://example11.com/what/is#and-thing=2?whoami=23&where=1"]that>def': 1,
|
||||
'sdflkf[what](https://example12.com/who/what.php?whoami=1#whatami=2)?am=hi': 1,
|
||||
'<or>http://examplehttp://15.badc</that>': 2,
|
||||
'https://a.example.com/one.html?url=http://example.com/inside/of/another?=http://': 2,
|
||||
'[https://a.example.com/one.html?url=http://example.com/inside/of/another?=](http://a.example.com)': 3,
|
||||
}
|
||||
for url_str, num_urls in _test_url_strs.items():
|
||||
assert len(re.findall(URL_REGEX, url_str)) == num_urls, (
|
||||
f'{url_str} does not contain {num_urls} urls')
|
||||
|
|
|
@ -10,7 +10,7 @@ from ..index.schema import Link
|
|||
from ..util import (
|
||||
htmldecode,
|
||||
enforce_types,
|
||||
URL_REGEX,
|
||||
find_all_urls,
|
||||
)
|
||||
from html.parser import HTMLParser
|
||||
from urllib.parse import urljoin
|
||||
|
@ -40,10 +40,22 @@ def parse_generic_html_export(html_file: IO[str], root_url: Optional[str]=None,
|
|||
parser.feed(line)
|
||||
for url in parser.urls:
|
||||
if root_url:
|
||||
# resolve relative urls /home.html -> https://example.com/home.html
|
||||
url = urljoin(root_url, url)
|
||||
|
||||
for archivable_url in re.findall(URL_REGEX, url):
|
||||
url_is_absolute = (url.lower().startswith('http://') or url.lower().startswith('https://'))
|
||||
# url = https://abc.com => True
|
||||
# url = /page.php?next=https://example.com => False
|
||||
|
||||
if not url_is_absolute: # resolve it by joining it with root_url
|
||||
relative_path = url
|
||||
|
||||
url = urljoin(root_url, relative_path) # https://example.com/somepage.html + /home.html
|
||||
# => https://example.com/home.html
|
||||
|
||||
# special case to handle bug around // handling, crucial for urls that contain sub-urls
|
||||
# e.g. https://web.archive.org/web/https://example.com
|
||||
if did_urljoin_misbehave(root_url, relative_path, url):
|
||||
url = fix_urljoin_bug(url)
|
||||
|
||||
for archivable_url in find_all_urls(url):
|
||||
yield Link(
|
||||
url=htmldecode(archivable_url),
|
||||
timestamp=str(datetime.now(timezone.utc).timestamp()),
|
||||
|
@ -56,3 +68,74 @@ def parse_generic_html_export(html_file: IO[str], root_url: Optional[str]=None,
|
|||
KEY = 'html'
|
||||
NAME = 'Generic HTML'
|
||||
PARSER = parse_generic_html_export
|
||||
|
||||
|
||||
#### WORKAROUND CODE FOR https://github.com/python/cpython/issues/96015 ####
|
||||
|
||||
def did_urljoin_misbehave(root_url: str, relative_path: str, final_url: str) -> bool:
|
||||
"""
|
||||
Handle urljoin edge case bug where multiple slashes get turned into a single slash:
|
||||
- https://github.com/python/cpython/issues/96015
|
||||
- https://github.com/ArchiveBox/ArchiveBox/issues/1411
|
||||
|
||||
This workaround only fixes the most common case of a sub-URL inside an outer URL, e.g.:
|
||||
https://web.archive.org/web/https://example.com/some/inner/url
|
||||
|
||||
But there are other valid URLs containing // that are not fixed by this workaround, e.g.:
|
||||
https://example.com/drives/C//some/file
|
||||
"""
|
||||
|
||||
# if relative path is actually an absolute url, cut off its own scheme so we check the path component only
|
||||
relative_path = relative_path.lower()
|
||||
if relative_path.startswith('http://') or relative_path.startswith('https://'):
|
||||
relative_path = relative_path.split('://', 1)[-1]
|
||||
|
||||
# TODO: properly fix all double // getting stripped by urljoin, not just ://
|
||||
original_path_had_suburl = '://' in relative_path
|
||||
original_root_had_suburl = '://' in root_url[8:] # ignore first 8 chars because root always starts with https://
|
||||
final_joined_has_suburl = '://' in final_url[8:] # ignore first 8 chars because final always starts with https://
|
||||
|
||||
urljoin_broke_suburls = (
|
||||
(original_root_had_suburl or original_path_had_suburl)
|
||||
and not final_joined_has_suburl
|
||||
)
|
||||
return urljoin_broke_suburls
|
||||
|
||||
|
||||
def fix_urljoin_bug(url: str, nesting_limit=5):
|
||||
"""
|
||||
recursively replace broken suburls .../http:/... with .../http://...
|
||||
|
||||
basically equivalent to this for 99.9% of cases:
|
||||
url = url.replace('/http:/', '/http://')
|
||||
url = url.replace('/https:/', '/https://')
|
||||
except this handles:
|
||||
other schemes besides http/https (e.g. https://example.com/link/git+ssh://github.com/example)
|
||||
other preceding separators besides / (e.g. https://example.com/login/?next=https://example.com/home)
|
||||
fixing multiple suburls recursively
|
||||
"""
|
||||
input_url = url
|
||||
for _ in range(nesting_limit):
|
||||
url = re.sub(
|
||||
r'(?P<root>.+?)' # https://web.archive.org/web
|
||||
+ r'(?P<separator>[-=/_&+%$#@!*\(\\])' # /
|
||||
+ r'(?P<subscheme>[a-zA-Z0-9+_-]{1,32}?):/' # http:/
|
||||
+ r'(?P<suburl>[^/\\]+)', # example.com
|
||||
r"\1\2\3://\4",
|
||||
input_url,
|
||||
re.IGNORECASE | re.UNICODE,
|
||||
)
|
||||
if url == input_url:
|
||||
break # nothing left to replace, all suburls are fixed
|
||||
input_url = url
|
||||
|
||||
return url
|
||||
|
||||
|
||||
# sanity check to make sure workaround code works as expected and doesnt introduce *more* bugs
|
||||
assert did_urljoin_misbehave('https://web.archive.org/web/https://example.com', 'abc.html', 'https://web.archive.org/web/https:/example.com/abc.html') == True
|
||||
assert did_urljoin_misbehave('http://example.com', 'https://web.archive.org/web/http://example.com/abc.html', 'https://web.archive.org/web/http:/example.com/abc.html') == True
|
||||
assert fix_urljoin_bug('https:/example.com') == 'https:/example.com' # should not modify original url's scheme, only sub-urls
|
||||
assert fix_urljoin_bug('https://web.archive.org/web/https:/example.com/abc.html') == 'https://web.archive.org/web/https://example.com/abc.html'
|
||||
assert fix_urljoin_bug('http://example.com/link/git+ssh:/github.com/example?next=ftp:/example.com') == 'http://example.com/link/git+ssh://github.com/example?next=ftp://example.com'
|
||||
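Putting the two helpers together, the call site in parse_generic_html_export() effectively performs the detect-and-repair flow sketched below, reusing the same example values as the asserts above:

    root = 'https://web.archive.org/web/https://example.com'
    relative_path = 'abc.html'
    final = 'https://web.archive.org/web/https:/example.com/abc.html'  # '//' collapsed
    if did_urljoin_misbehave(root, relative_path, final):
        final = fix_urljoin_bug(final)
    assert final == 'https://web.archive.org/web/https://example.com/abc.html'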
|
||||
|
|
|
@ -11,6 +11,60 @@ from ..util import (
|
|||
enforce_types,
|
||||
)
|
||||
|
||||
# This gets used by generic_jsonl, too
|
||||
def jsonObjectToLink(link: str, source: str):
|
||||
json_date = lambda s: datetime.strptime(s, '%Y-%m-%dT%H:%M:%S%z')
|
||||
|
||||
# example line
|
||||
# {"href":"http:\/\/www.reddit.com\/r\/example","description":"title here","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"reddit android"}]
|
||||
# Parse URL
|
||||
url = link.get('href') or link.get('url') or link.get('URL')
|
||||
if not url:
|
||||
raise Exception('JSON must contain URL in each entry [{"url": "http://...", ...}, ...]')
|
||||
|
||||
# Parse the timestamp
|
||||
ts_str = str(datetime.now(timezone.utc).timestamp())
|
||||
if link.get('timestamp'):
|
||||
# chrome/ff histories use a very precise timestamp
|
||||
ts_str = str(link['timestamp'] / 10000000)
|
||||
elif link.get('time'):
|
||||
ts_str = str(json_date(link['time'].split(',', 1)[0]).timestamp())
|
||||
elif link.get('created_at'):
|
||||
ts_str = str(json_date(link['created_at']).timestamp())
|
||||
elif link.get('created'):
|
||||
ts_str = str(json_date(link['created']).timestamp())
|
||||
elif link.get('date'):
|
||||
ts_str = str(json_date(link['date']).timestamp())
|
||||
elif link.get('bookmarked'):
|
||||
ts_str = str(json_date(link['bookmarked']).timestamp())
|
||||
elif link.get('saved'):
|
||||
ts_str = str(json_date(link['saved']).timestamp())
|
||||
|
||||
# Parse the title
|
||||
title = None
|
||||
if link.get('title'):
|
||||
title = link['title'].strip()
|
||||
elif link.get('description'):
|
||||
title = link['description'].replace(' — Readability', '').strip()
|
||||
elif link.get('name'):
|
||||
title = link['name'].strip()
|
||||
|
||||
# if we have a list, join it with commas
|
||||
tags = link.get('tags')
|
||||
if type(tags) == list:
|
||||
tags = ','.join(tags)
|
||||
elif type(tags) == str:
|
||||
# if there's no comma, assume it was space-separated
|
||||
if ',' not in tags:
|
||||
tags = tags.replace(' ', ',')
|
||||
|
||||
return Link(
|
||||
url=htmldecode(url),
|
||||
timestamp=ts_str,
|
||||
title=htmldecode(title) or None,
|
||||
tags=htmldecode(tags),
|
||||
sources=[source],
|
||||
)
|
||||
|
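A usage sketch feeding the example line from the comment above through jsonObjectToLink() (the source filename is invented):

    entry = {
        "href": "http://www.reddit.com/r/example",
        "description": "title here",
        "time": "2014-06-14T15:51:42Z",
        "tags": "reddit android",
    }
    link = jsonObjectToLink(entry, 'bookmarks.json')
    # link.url       == 'http://www.reddit.com/r/example'  (first of href/url/URL)
    # link.title     == 'title here'                       (falls back to 'description')
    # link.tags      == 'reddit,android'                   (spaces become commas)
    # link.timestamp is parsed from the ISO-8601 'time' field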
||||
@enforce_types
|
||||
def parse_generic_json_export(json_file: IO[str], **_kwargs) -> Iterable[Link]:
|
||||
|
@ -18,55 +72,13 @@ def parse_generic_json_export(json_file: IO[str], **_kwargs) -> Iterable[Link]:
|
|||
|
||||
json_file.seek(0)
|
||||
|
||||
# sometimes the first line is a comment or filepath, so we get everything after the first {
|
||||
json_file_json_str = '{' + json_file.read().split('{', 1)[-1]
|
||||
links = json.loads(json_file_json_str)
|
||||
json_date = lambda s: datetime.strptime(s, '%Y-%m-%dT%H:%M:%S%z')
|
||||
|
||||
links = json.load(json_file)
|
||||
if type(links) != list:
|
||||
raise Exception('JSON parser expects list of objects, maybe this is JSONL?')
|
||||
|
||||
for link in links:
|
||||
# example line
|
||||
# {"href":"http:\/\/www.reddit.com\/r\/example","description":"title here","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"reddit android"}]
|
||||
if link:
|
||||
# Parse URL
|
||||
url = link.get('href') or link.get('url') or link.get('URL')
|
||||
if not url:
|
||||
raise Exception('JSON must contain URL in each entry [{"url": "http://...", ...}, ...]')
|
||||
|
||||
# Parse the timestamp
|
||||
ts_str = str(datetime.now(timezone.utc).timestamp())
|
||||
if link.get('timestamp'):
|
||||
# chrome/ff histories use a very precise timestamp
|
||||
ts_str = str(link['timestamp'] / 10000000)
|
||||
elif link.get('time'):
|
||||
ts_str = str(json_date(link['time'].split(',', 1)[0]).timestamp())
|
||||
elif link.get('created_at'):
|
||||
ts_str = str(json_date(link['created_at']).timestamp())
|
||||
elif link.get('created'):
|
||||
ts_str = str(json_date(link['created']).timestamp())
|
||||
elif link.get('date'):
|
||||
ts_str = str(json_date(link['date']).timestamp())
|
||||
elif link.get('bookmarked'):
|
||||
ts_str = str(json_date(link['bookmarked']).timestamp())
|
||||
elif link.get('saved'):
|
||||
ts_str = str(json_date(link['saved']).timestamp())
|
||||
|
||||
# Parse the title
|
||||
title = None
|
||||
if link.get('title'):
|
||||
title = link['title'].strip()
|
||||
elif link.get('description'):
|
||||
title = link['description'].replace(' — Readability', '').strip()
|
||||
elif link.get('name'):
|
||||
title = link['name'].strip()
|
||||
|
||||
yield Link(
|
||||
url=htmldecode(url),
|
||||
timestamp=ts_str,
|
||||
title=htmldecode(title) or None,
|
||||
tags=htmldecode(link.get('tags')) or '',
|
||||
sources=[json_file.name],
|
||||
)
|
||||
|
||||
yield jsonObjectToLink(link, json_file.name)
|
||||
|
||||
KEY = 'json'
|
||||
NAME = 'Generic JSON'
|
||||
|
|
32
archivebox/parsers/generic_jsonl.py
Normal file
|
@ -0,0 +1,32 @@
|
|||
__package__ = 'archivebox.parsers'
|
||||
|
||||
import json
|
||||
|
||||
from typing import IO, Iterable
|
||||
|
||||
from ..index.schema import Link
|
||||
from ..util import (
|
||||
enforce_types,
|
||||
)
|
||||
|
||||
from .generic_json import jsonObjectToLink
|
||||
|
||||
def parse_line(line: str):
|
||||
if line.strip() != "":
|
||||
return json.loads(line)
|
||||
|
||||
@enforce_types
|
||||
def parse_generic_jsonl_export(json_file: IO[str], **_kwargs) -> Iterable[Link]:
|
||||
"""Parse JSONL format bookmarks export files"""
|
||||
|
||||
json_file.seek(0)
|
||||
|
||||
links = [ parse_line(line) for line in json_file ]
|
||||
|
||||
for link in links:
|
||||
if link:
|
||||
yield jsonObjectToLink(link,json_file.name)
|
||||
|
||||
KEY = 'jsonl'
|
||||
NAME = 'Generic JSONL'
|
||||
PARSER = parse_generic_jsonl_export
|
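A usage sketch of the new JSONL parser (file contents are invented; StringIO is subclassed only because the parser records json_file.name in Link.sources):

    import io

    class NamedStringIO(io.StringIO):
        name = 'bookmarks.jsonl'

    jsonl_file = NamedStringIO(
        '{"url": "https://example.com", "title": "Example", "tags": ["a", "b"]}\n'
        '\n'  # blank lines yield None from parse_line() and are skipped
        '{"href": "https://example.org"}\n'
    )
    links = list(parse_generic_jsonl_export(jsonl_file))
    # -> two Link objects; the ["a", "b"] tag list is joined into 'a,b'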
|
@ -2,13 +2,13 @@ __package__ = 'archivebox.parsers'
|
|||
|
||||
|
||||
from typing import IO, Iterable
|
||||
from datetime import datetime
|
||||
from time import mktime
|
||||
from feedparser import parse as feedparser
|
||||
|
||||
from ..index.schema import Link
|
||||
from ..util import (
|
||||
htmldecode,
|
||||
enforce_types,
|
||||
str_between,
|
||||
enforce_types
|
||||
)
|
||||
|
||||
@enforce_types
|
||||
|
@ -16,35 +16,27 @@ def parse_generic_rss_export(rss_file: IO[str], **_kwargs) -> Iterable[Link]:
|
|||
"""Parse RSS XML-format files into links"""
|
||||
|
||||
rss_file.seek(0)
|
||||
items = rss_file.read().split('<item>')
|
||||
items = items[1:] if items else []
|
||||
for item in items:
|
||||
# example item:
|
||||
# <item>
|
||||
# <title><![CDATA[How JavaScript works: inside the V8 engine]]></title>
|
||||
# <category>Unread</category>
|
||||
# <link>https://blog.sessionstack.com/how-javascript-works-inside</link>
|
||||
# <guid>https://blog.sessionstack.com/how-javascript-works-inside</guid>
|
||||
# <pubDate>Mon, 21 Aug 2017 14:21:58 -0500</pubDate>
|
||||
# </item>
|
||||
feed = feedparser(rss_file.read())
|
||||
for item in feed.entries:
|
||||
url = item.link
|
||||
title = item.title
|
||||
time = mktime(item.updated_parsed)
|
||||
|
||||
trailing_removed = item.split('</item>', 1)[0]
|
||||
leading_removed = trailing_removed.split('<item>', 1)[-1].strip()
|
||||
rows = leading_removed.split('\n')
|
||||
try:
|
||||
tags = ','.join(map(lambda tag: tag.term, item.tags))
|
||||
except AttributeError:
|
||||
tags = ''
|
||||
|
||||
def get_row(key):
|
||||
return [r for r in rows if r.strip().startswith('<{}>'.format(key))][0]
|
||||
|
||||
url = str_between(get_row('link'), '<link>', '</link>')
|
||||
ts_str = str_between(get_row('pubDate'), '<pubDate>', '</pubDate>')
|
||||
time = datetime.strptime(ts_str, "%a, %d %b %Y %H:%M:%S %z")
|
||||
title = str_between(get_row('title'), '<![CDATA[', ']]').strip()
|
||||
if url is None:
|
||||
# Yielding a Link with no URL will
|
||||
# crash on a URL validation assertion
|
||||
continue
|
||||
|
||||
yield Link(
|
||||
url=htmldecode(url),
|
||||
timestamp=str(time.timestamp()),
|
||||
timestamp=str(time),
|
||||
title=htmldecode(title) or None,
|
||||
tags=None,
|
||||
tags=tags,
|
||||
sources=[rss_file.name],
|
||||
)
|
||||
|
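For reference, the feedparser entry attributes used above map onto the old sample <item> roughly as follows (a sketch; which fields are populated depends on the feed and feedparser version):

    from feedparser import parse as feedparser

    feed = feedparser(
        '<rss version="2.0"><channel><item>'
        '<title>How JavaScript works: inside the V8 engine</title>'
        '<link>https://blog.sessionstack.com/how-javascript-works-inside</link>'
        '<category>Unread</category>'
        '<pubDate>Mon, 21 Aug 2017 14:21:58 -0500</pubDate>'
        '</item></channel></rss>'
    )
    entry = feed.entries[0]
    # entry.link           -> url
    # entry.title          -> title
    # entry.updated_parsed -> time (via mktime); some feedparser versions expose the
    #                         date only as entry.published_parsed for pubDate feeds
    # entry.tags[0].term   -> 'Unread' (hence the AttributeError fallback to tags = '')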
||||
|
|
|
@ -1,8 +1,6 @@
__package__ = 'archivebox.parsers'
__description__ = 'Plain Text'

import re

from typing import IO, Iterable
from datetime import datetime, timezone
from pathlib import Path

@ -11,7 +9,7 @@ from ..index.schema import Link
from ..util import (
    htmldecode,
    enforce_types,
    URL_REGEX
    find_all_urls,
)


@ -39,7 +37,7 @@ def parse_generic_txt_export(text_file: IO[str], **_kwargs) -> Iterable[Link]:
        pass

        # otherwise look for anything that looks like a URL in the line
        for url in re.findall(URL_REGEX, line):
        for url in find_all_urls(line):
            yield Link(
                url=htmldecode(url),
                timestamp=str(datetime.now(timezone.utc).timestamp()),

@ -48,17 +46,6 @@ def parse_generic_txt_export(text_file: IO[str], **_kwargs) -> Iterable[Link]:
                sources=[text_file.name],
            )

        # look inside the URL for any sub-urls, e.g. for archive.org links
        # https://web.archive.org/web/20200531203453/https://www.reddit.com/r/socialism/comments/gu24ke/nypd_officers_claim_they_are_protecting_the_rule/fsfq0sw/
        # -> https://www.reddit.com/r/socialism/comments/gu24ke/nypd_officers_claim_they_are_protecting_the_rule/fsfq0sw/
        for sub_url in re.findall(URL_REGEX, line[1:]):
            yield Link(
                url=htmldecode(sub_url),
                timestamp=str(datetime.now(timezone.utc).timestamp()),
                title=None,
                tags=None,
                sources=[text_file.name],
            )

KEY = 'txt'
NAME = 'Generic TXT'
@ -2,50 +2,41 @@ __package__ = 'archivebox.parsers'


from typing import IO, Iterable
from datetime import datetime, timezone

from xml.etree import ElementTree
from time import mktime
from feedparser import parse as feedparser

from ..index.schema import Link
from ..util import (
    htmldecode,
    enforce_types,
    enforce_types
)


@enforce_types
def parse_pinboard_rss_export(rss_file: IO[str], **_kwargs) -> Iterable[Link]:
    """Parse Pinboard RSS feed files into links"""

    rss_file.seek(0)
    root = ElementTree.parse(rss_file).getroot()
    items = root.findall("{http://purl.org/rss/1.0/}item")
    for item in items:
        find = lambda p: item.find(p).text.strip() if item.find(p) is not None else None  # type: ignore
    feed = feedparser(rss_file.read())
    for item in feed.entries:
        url = item.link
        # title will start with "[priv] " if pin was marked private. useful?
        title = item.title
        time = mktime(item.updated_parsed)

        url = find("{http://purl.org/rss/1.0/}link")
        tags = find("{http://purl.org/dc/elements/1.1/}subject")
        title = find("{http://purl.org/rss/1.0/}title")
        ts_str = find("{http://purl.org/dc/elements/1.1/}date")
        # all tags are in one entry.tags with spaces in it. annoying!
        try:
            tags = item.tags[0].term.replace(' ', ',')
        except AttributeError:
            tags = ''

        if url is None:
            # Yielding a Link with no URL will
            # crash on a URL validation assertion
            continue

        # Pinboard includes a colon in its date stamp timezone offsets, which
        # Python can't parse. Remove it:
        if ts_str and ts_str[-3:-2] == ":":
            ts_str = ts_str[:-3]+ts_str[-2:]

        if ts_str:
            time = datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%S%z")
        else:
            time = datetime.now(timezone.utc)

        yield Link(
            url=htmldecode(url),
            timestamp=str(time.timestamp()),
            timestamp=str(time),
            title=htmldecode(title) or None,
            tags=htmldecode(tags) or None,
            sources=[rss_file.name],
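A side note on the colon-stripping above: it targets RFC 3339 offsets like 2020-01-01T00:00:00+05:00. A minimal standalone check, assuming Python 3.7+ where strptime's %z also accepts the colon form directly:

from datetime import datetime

# both parse to the same aware datetime on Python 3.7+
print(datetime.strptime('2020-01-01T00:00:00+0500', '%Y-%m-%dT%H:%M:%S%z'))
print(datetime.strptime('2020-01-01T00:00:00+05:00', '%Y-%m-%dT%H:%M:%S%z'))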
@ -30,8 +30,7 @@ def run(cmd, *args, input=None, capture_output=True, timeout=None, check=False,

    if capture_output:
        if ('stdout' in kwargs) or ('stderr' in kwargs):
            raise ValueError('stdout and stderr arguments may not be used '
                             'with capture_output.')
            raise ValueError('stdout and stderr arguments may not be used with capture_output.')
        kwargs['stdout'] = PIPE
        kwargs['stderr'] = PIPE
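This guard mirrors the stdlib behavior: subprocess.run()'s capture_output=True is shorthand for stdout=PIPE, stderr=PIPE, so passing either one explicitly alongside it is ambiguous. A quick standalone illustration:

from subprocess import run, PIPE

result = run(['echo', 'hi'], capture_output=True, text=True)
print(result.stdout.strip())  # 'hi'

# run(['echo', 'hi'], capture_output=True, stdout=PIPE)  # raises ValueError, same as the guard above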
@ -146,20 +145,24 @@ def get_dir_size(path: Union[str, Path], recursive: bool=True, pattern: Optional
    recursively and limiting to a given filter list
    """
    num_bytes, num_dirs, num_files = 0, 0, 0
    for entry in os.scandir(path):
        if (pattern is not None) and (pattern not in entry.path):
            continue
        if entry.is_dir(follow_symlinks=False):
            if not recursive:
    try:
        for entry in os.scandir(path):
            if (pattern is not None) and (pattern not in entry.path):
                continue
            num_dirs += 1
            bytes_inside, dirs_inside, files_inside = get_dir_size(entry.path)
            num_bytes += bytes_inside
            num_dirs += dirs_inside
            num_files += files_inside
        else:
            num_bytes += entry.stat(follow_symlinks=False).st_size
            num_files += 1
            if entry.is_dir(follow_symlinks=False):
                if not recursive:
                    continue
                num_dirs += 1
                bytes_inside, dirs_inside, files_inside = get_dir_size(entry.path)
                num_bytes += bytes_inside
                num_dirs += dirs_inside
                num_files += files_inside
            else:
                num_bytes += entry.stat(follow_symlinks=False).st_size
                num_files += 1
    except OSError:
        # e.g. FileNameTooLong or other error while trying to read dir
        pass
    return num_bytes, num_dirs, num_files
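A standalone sketch of the same pattern (recursive os.scandir with the new OSError guard), boiled down to just total bytes:

import os

def dir_size(path: str) -> int:
    """Sum file sizes under path, skipping anything the OS refuses to stat/list."""
    total = 0
    try:
        for entry in os.scandir(path):
            if entry.is_dir(follow_symlinks=False):
                total += dir_size(entry.path)   # recurse into subdirs
            else:
                total += entry.stat(follow_symlinks=False).st_size
    except OSError:
        pass  # e.g. filename too long, permission denied, dir vanished mid-scan
    return total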
@ -171,7 +174,7 @@ def dedupe_cron_jobs(cron: CronTab) -> CronTab:
    deduped: Set[Tuple[str, str]] = set()

    for job in list(cron):
        unique_tuple = (str(job.slices), job.command)
        unique_tuple = (str(job.slices), str(job.command))
        if unique_tuple not in deduped:
            deduped.add(unique_tuple)
        cron.remove(job)
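The str(job.command) cast matters because python-crontab can hand back a non-str command wrapper, which would make otherwise-identical tuples compare unequal. A hedged standalone sketch of the same dedupe idea, assuming the python-crontab package: remove everything, then re-add one job per unique (schedule, command) pair:

from crontab import CronTab

cron = CronTab(user=True)
deduped = set()
for job in list(cron):
    deduped.add((str(job.slices), str(job.command)))  # cast both halves to str
    cron.remove(job)
for schedule, command in deduped:
    job = cron.new(command=command)
    job.setall(schedule)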
@ -5,7 +5,8 @@
    <a href="{% url 'Home' %}">Snapshots</a> |
    <a href="/admin/core/tag/">Tags</a> |
    <a href="/admin/core/archiveresult/?o=-1">Log</a>
    <a href="{% url 'Docs' %}">Docs</a> |
    <a href="{% url 'Docs' %}" target="_blank" rel="noopener noreferrer">Docs</a> |
    <a href="/api">API</a> |
    <a href="{% url 'public-index' %}">Public</a> |
    <a href="/admin/">Admin</a>


@ -16,7 +17,7 @@
    {% endblock %}
    {% block userlinks %}
        {% if user.has_usable_password %}
            <a href="{% url 'admin:password_change' %}">Account</a> /
            <a href="{% url 'admin:password_change' %}" title="Change your account password">Account</a> /
        {% endif %}
        <a href="{% url 'admin:logout' %}">{% trans 'Log out' %}</a>
    {% endblock %}


File diff suppressed because one or more lines are too long
@ -3,6 +3,7 @@ __package__ = 'archivebox'

import re
import requests
import json as pyjson
import http.cookiejar

from typing import List, Optional, Any
from pathlib import Path

@ -56,19 +57,62 @@ short_ts = lambda ts: str(parse_date(ts).timestamp()).split('.')[0]
ts_to_date_str = lambda ts: ts and parse_date(ts).strftime('%Y-%m-%d %H:%M')
ts_to_iso = lambda ts: ts and parse_date(ts).isoformat()

COLOR_REGEX = re.compile(r'\[(?P<arg_1>\d+)(;(?P<arg_2>\d+)(;(?P<arg_3>\d+))?)?m')


# https://mathiasbynens.be/demo/url-regex
URL_REGEX = re.compile(
    r'(?=('
    r'http[s]?://'                    # start matching from allowed schemes
    r'(?:[a-zA-Z]|[0-9]'              # followed by allowed alphanum characters
    r'|[-_$@.&+!*\(\),]'              # or allowed symbols (keep hyphen first to match literal hyphen)
    r'|(?:%[0-9a-fA-F][0-9a-fA-F]))'  # or allowed unicode bytes
    r'[^\]\[\(\)<>"\'\s]+'            # stop parsing at these symbols
    r'(?=('
    r'http[s]?://'                    # start matching from allowed schemes
    r'(?:[a-zA-Z]|[0-9]'              # followed by allowed alphanum characters
    r'|[-_$@.&+!*\(\),]'              # or allowed symbols (keep hyphen first to match literal hyphen)
    r'|[^\u0000-\u007F])+'            # or allowed unicode bytes
    r'[^\]\[<>"\'\s]+'                # stop parsing at these symbols
    r'))',
    re.IGNORECASE,
    re.IGNORECASE | re.UNICODE,
)

COLOR_REGEX = re.compile(r'\[(?P<arg_1>\d+)(;(?P<arg_2>\d+)(;(?P<arg_3>\d+))?)?m')

def parens_are_matched(string: str, open_char='(', close_char=')'):
    """check that all parentheses in a string are balanced and nested properly"""
    count = 0
    for c in string:
        if c == open_char:
            count += 1
        elif c == close_char:
            count -= 1
            if count < 0:
                return False
    return count == 0

def fix_url_from_markdown(url_str: str) -> str:
    """
    cleanup a regex-parsed url that may contain dangling trailing parens from markdown link syntax
    helpful to fix URLs parsed from markdown e.g.
    input:  https://wikipedia.org/en/some_article_(Disambiguation).html?abc=def).somemoretext
    result: https://wikipedia.org/en/some_article_(Disambiguation).html?abc=def

    IMPORTANT ASSUMPTION: valid urls won't have unbalanced or incorrectly nested parentheses
    e.g. this will fail if the user actually wants to ingest a url like 'https://example.com/some_wei)(rd_url'
    in that case it will return https://example.com/some_wei (truncated up to the first unbalanced paren)
    This assumption is true 99.9999% of the time, and for the rare edge case the user can use the url_list parser.
    """
    trimmed_url = url_str

    # cut off one trailing character at a time
    # until parens are balanced e.g. /a(b)c).x(y)z -> /a(b)c
    while not parens_are_matched(trimmed_url):
        trimmed_url = trimmed_url[:-1]

    # make sure trimmed url is still valid
    if re.findall(URL_REGEX, trimmed_url):
        return trimmed_url

    return url_str

def find_all_urls(urls_str: str):
    for url in re.findall(URL_REGEX, urls_str):
        yield fix_url_from_markdown(url)
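A quick usage sketch of the helpers above: URL_REGEX now allows parens mid-URL, and find_all_urls() trims any dangling markdown paren afterwards, so on inputs like the test cases further down one would expect:

md = 'see [the article](https://wikipedia.org/wiki/Example_(band)) for details'
print(list(find_all_urls(md)))
# expected: ['https://wikipedia.org/wiki/Example_(band)']

print(fix_url_from_markdown('http://example.com/a(b)c).x(y)z'))
# expected: 'http://example.com/a(b)c' (this exact case is asserted in the self-tests below)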

def is_static_file(url: str):
    # TODO: the proper way is with MIME type detection + ext, not only extension
@ -164,9 +208,22 @@ def parse_date(date: Any) -> Optional[datetime]:
@enforce_types
def download_url(url: str, timeout: int=None) -> str:
    """Download the contents of a remote url and return the text"""
    from .config import TIMEOUT, CHECK_SSL_VALIDITY, WGET_USER_AGENT
    from .config import (
        TIMEOUT,
        CHECK_SSL_VALIDITY,
        WGET_USER_AGENT,
        COOKIES_FILE,
    )
    timeout = timeout or TIMEOUT
    response = requests.get(
    session = requests.Session()

    if COOKIES_FILE and Path(COOKIES_FILE).is_file():
        cookie_jar = http.cookiejar.MozillaCookieJar(COOKIES_FILE)
        cookie_jar.load(ignore_discard=True, ignore_expires=True)
        for cookie in cookie_jar:
            session.cookies.set(cookie.name, cookie.value, domain=cookie.domain, path=cookie.path)

    response = session.get(
        url,
        headers={'User-Agent': WGET_USER_AGENT},
        verify=CHECK_SSL_VALIDITY,

@ -179,7 +236,11 @@ def download_url(url: str, timeout: int=None) -> str:
    if encoding is not None:
        response.encoding = encoding

    return response.text
    try:
        return response.text
    except UnicodeDecodeError:
        # if response is non-text (e.g. an image or other binary file), just return the filename instead
        return url.rsplit('/', 1)[-1]
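The cookie-loading pattern above in standalone form: http.cookiejar.MozillaCookieJar reads the Netscape cookies.txt format, and each cookie is copied onto a requests Session. The 'cookies.txt' path here is a hypothetical stand-in for COOKIES_FILE:

import http.cookiejar
import requests

session = requests.Session()
cookie_jar = http.cookiejar.MozillaCookieJar('cookies.txt')  # hypothetical path
cookie_jar.load(ignore_discard=True, ignore_expires=True)    # keep session + expired cookies too
for cookie in cookie_jar:
    session.cookies.set(cookie.name, cookie.value, domain=cookie.domain, path=cookie.path)

response = session.get('https://example.com', timeout=60)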

@enforce_types
def get_headers(url: str, timeout: int=None) -> str:
@ -221,7 +282,13 @@ def get_headers(url: str, timeout: int=None) -> str:
def chrome_args(**options) -> List[str]:
    """helper to build up a chrome shell command with arguments"""

    from .config import CHROME_OPTIONS, CHROME_VERSION
    # Chrome CLI flag documentation: https://peter.sh/experiments/chromium-command-line-switches/

    from .config import (
        CHROME_OPTIONS,
        CHROME_VERSION,
        CHROME_EXTRA_ARGS,
    )

    options = {**CHROME_OPTIONS, **options}


@ -230,6 +297,8 @@ def chrome_args(**options) -> List[str]:

    cmd_args = [options['CHROME_BINARY']]

    cmd_args += CHROME_EXTRA_ARGS

    if options['CHROME_HEADLESS']:
        chrome_major_version = int(re.search(r'\s(\d+)\.\d', CHROME_VERSION)[1])
        if chrome_major_version >= 111:

@ -248,14 +317,19 @@ def chrome_args(**options) -> List[str]:
        "--disable-software-rasterizer",
        "--run-all-compositor-stages-before-draw",
        "--hide-scrollbars",
        "--window-size=1440,2000",
        "--autoplay-policy=no-user-gesture-required",
        "--no-first-run",
        "--use-fake-ui-for-media-stream",
        "--use-fake-device-for-media-stream",
        "--disable-sync",
        # "--password-store=basic",
    )

    # disable automatic updating when running headless, as there's no user to see the upgrade prompts
    cmd_args += ("--simulate-outdated-no-au='Tue, 31 Dec 2099 23:59:59 GMT'",)

    # set window size for screenshot/pdf/etc. rendering
    cmd_args += ('--window-size={}'.format(options['RESOLUTION']),)

    if not options['CHECK_SSL_VALIDITY']:
        cmd_args += ('--disable-web-security', '--ignore-certificate-errors')

@ -263,16 +337,15 @@ def chrome_args(**options) -> List[str]:
    if options['CHROME_USER_AGENT']:
        cmd_args += ('--user-agent={}'.format(options['CHROME_USER_AGENT']),)

    if options['RESOLUTION']:
        cmd_args += ('--window-size={}'.format(options['RESOLUTION']),)

    if options['CHROME_TIMEOUT']:
        cmd_args += ('--timeout={}'.format(options['CHROME_TIMEOUT'] * 1000),)

    if options['CHROME_USER_DATA_DIR']:
        cmd_args.append('--user-data-dir={}'.format(options['CHROME_USER_DATA_DIR']))

    return cmd_args
        cmd_args.append('--profile-directory=Default')

    return dedupe(cmd_args)


def chrome_cleanup():
    """

@ -285,7 +358,8 @@ def chrome_cleanup():
    if IN_DOCKER and lexists("/home/archivebox/.config/chromium/SingletonLock"):
        remove_file("/home/archivebox/.config/chromium/SingletonLock")

def ansi_to_html(text):
@enforce_types
def ansi_to_html(text: str) -> str:
    """
    Based on: https://stackoverflow.com/questions/19212665/python-converting-ansi-color-codes-to-html
    """

@ -309,6 +383,20 @@ def ansi_to_html(text):
    return COLOR_REGEX.sub(single_sub, text)


@enforce_types
def dedupe(options: List[str]) -> List[str]:
    """
    Deduplicates the given options. Options that come later clobber earlier
    conflicting options.
    """
    deduped = {}

    for option in options:
        deduped[option.split('=')[0]] = option

    return list(deduped.values())
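dedupe() keys each flag on the part before '=', so a later flag silently replaces an earlier one while keeping its original position, which is why CHROME_EXTRA_ARGS and RESOLUTION can safely override the defaults baked in above:

print(dedupe([
    '--window-size=1440,2000',
    '--timeout=60000',
    '--window-size=1920,1080',  # later value clobbers the default above
]))
# -> ['--window-size=1920,1080', '--timeout=60000']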

class AttributeDict(dict):
    """Helper to allow accessing dict values via Example.key or Example['key']"""

@ -355,3 +443,98 @@ class ExtendedEncoder(pyjson.JSONEncoder):

        return pyjson.JSONEncoder.default(self, obj)


### URL PARSING TESTS / ASSERTIONS

# Check that plain text regex URL parsing works as expected
# this is last-line-of-defense to make sure the URL_REGEX isn't
# misbehaving due to some OS-level or environment level quirks (e.g. regex engine / cpython / locale differences)
# the consequences of bad URL parsing could be disastrous and lead to many
# incorrect/badly parsed links being added to the archive, so this is worth the cost of checking

assert fix_url_from_markdown('http://example.com/a(b)c).x(y)z') == 'http://example.com/a(b)c'
assert fix_url_from_markdown('https://wikipedia.org/en/some_article_(Disambiguation).html?abc=def).link(with)_trailingtext') == 'https://wikipedia.org/en/some_article_(Disambiguation).html?abc=def'

URL_REGEX_TESTS = [
    ('https://example.com', ['https://example.com']),
    ('http://abc-file234example.com/abc?def=abc&23423=sdfsdf#abc=234&234=a234', ['http://abc-file234example.com/abc?def=abc&23423=sdfsdf#abc=234&234=a234']),

    ('https://twitter.com/share?url=https://akaao.success-corp.co.jp&text=ア@サ!ト&hashtags=ア%オ,元+ア.ア-オ_イ*シ$ロ abc', ['https://twitter.com/share?url=https://akaao.success-corp.co.jp&text=ア@サ!ト&hashtags=ア%オ,元+ア.ア-オ_イ*シ$ロ', 'https://akaao.success-corp.co.jp&text=ア@サ!ト&hashtags=ア%オ,元+ア.ア-オ_イ*シ$ロ']),
    ('<a href="https://twitter.com/share#url=https://akaao.success-corp.co.jp&text=ア@サ!ト?hashtags=ア%オ,元+ア&abc=.ア-オ_イ*シ$ロ"> abc', ['https://twitter.com/share#url=https://akaao.success-corp.co.jp&text=ア@サ!ト?hashtags=ア%オ,元+ア&abc=.ア-オ_イ*シ$ロ', 'https://akaao.success-corp.co.jp&text=ア@サ!ト?hashtags=ア%オ,元+ア&abc=.ア-オ_イ*シ$ロ']),

    ('///a', []),
    ('http://', []),
    ('http://../', ['http://../']),
    ('http://-error-.invalid/', ['http://-error-.invalid/']),
    ('https://a(b)c+1#2?3&4/', ['https://a(b)c+1#2?3&4/']),
    ('http://उदाहरण.परीक्षा', ['http://उदाहरण.परीक्षा']),
    ('http://例子.测试', ['http://例子.测试']),
    ('http://➡.ws/䨹 htps://abc.1243?234', ['http://➡.ws/䨹']),
    ('http://⌘.ws">https://exa+mple.com//:abc ', ['http://⌘.ws', 'https://exa+mple.com//:abc']),
    ('http://مثال.إختبار/abc?def=ت&ب=abc#abc=234', ['http://مثال.إختبار/abc?def=ت&ب=abc#abc=234']),
    ('http://-.~_!$&()*+,;=:%40:80%2f::::::@example.c\'om', ['http://-.~_!$&()*+,;=:%40:80%2f::::::@example.c']),

    ('http://us:pa@ex.co:42/http://ex.co:19/a?_d=4#-a=2.3', ['http://us:pa@ex.co:42/http://ex.co:19/a?_d=4#-a=2.3', 'http://ex.co:19/a?_d=4#-a=2.3']),
    ('http://code.google.com/events/#&product=browser', ['http://code.google.com/events/#&product=browser']),
    ('http://foo.bar?q=Spaces should be encoded', ['http://foo.bar?q=Spaces']),
    ('http://foo.com/blah_(wikipedia)#c(i)t[e]-1', ['http://foo.com/blah_(wikipedia)#c(i)t']),
    ('http://foo.com/(something)?after=parens', ['http://foo.com/(something)?after=parens']),
    ('http://foo.com/unicode_(✪)_in_parens) abc', ['http://foo.com/unicode_(✪)_in_parens']),
    ('http://foo.bar/?q=Test%20URL-encoded%20stuff', ['http://foo.bar/?q=Test%20URL-encoded%20stuff']),

    ('[xyz](http://a.b/?q=(Test)%20U)RL-encoded%20stuff', ['http://a.b/?q=(Test)%20U']),
    ('[xyz](http://a.b/?q=(Test)%20U)-ab https://abc+123', ['http://a.b/?q=(Test)%20U', 'https://abc+123']),
    ('[xyz](http://a.b/?q=(Test)%20U) https://a(b)c+12)3', ['http://a.b/?q=(Test)%20U', 'https://a(b)c+12']),
    ('[xyz](http://a.b/?q=(Test)a\nabchttps://a(b)c+12)3', ['http://a.b/?q=(Test)a', 'https://a(b)c+12']),
    ('http://foo.bar/?q=Test%20URL-encoded%20stuff', ['http://foo.bar/?q=Test%20URL-encoded%20stuff']),
]
for urls_str, expected_url_matches in URL_REGEX_TESTS:
    url_matches = list(find_all_urls(urls_str))
    assert url_matches == expected_url_matches, 'FAILED URL_REGEX CHECK!'


# More test cases
_test_url_strs = {
    'example.com': 0,
    '/example.com': 0,
    '//example.com': 0,
    ':/example.com': 0,
    '://example.com': 0,
    'htt://example8.com': 0,
    '/htt://example.com': 0,
    'https://example': 1,
    'https://localhost/2345': 1,
    'https://localhost:1234/123': 1,
    '://': 0,
    'https://': 0,
    'http://': 0,
    'ftp://': 0,
    'ftp://example.com': 0,
    'https://example.com': 1,
    'https://example.com/': 1,
    'https://a.example.com': 1,
    'https://a.example.com/': 1,
    'https://a.example.com/what/is/happening.html': 1,
    'https://a.example.com/what/ís/happening.html': 1,
    'https://a.example.com/what/is/happening.html?what=1&2%20b#höw-about-this=1a': 1,
    'https://a.example.com/what/is/happéning/?what=1&2%20b#how-aboüt-this=1a': 1,
    'HTtpS://a.example.com/what/is/happening/?what=1&2%20b#how-about-this=1af&2f%20b': 1,
    'https://example.com/?what=1#how-about-this=1&2%20baf': 1,
    'https://example.com?what=1#how-about-this=1&2%20baf': 1,
    '<test>http://example7.com</test>': 1,
    'https://<test>': 0,
    'https://[test]': 0,
    'http://"test"': 0,
    'http://\'test\'': 0,
    '[https://example8.com/what/is/this.php?what=1]': 1,
    '[and http://example9.com?what=1&other=3#and-thing=2]': 1,
    '<what>https://example10.com#and-thing=2 "</about>': 1,
    'abc<this["https://example11.com/what/is#and-thing=2?whoami=23&where=1"]that>def': 1,
    'sdflkf[what](https://example12.com/who/what.php?whoami=1#whatami=2)?am=hi': 1,
    '<or>http://examplehttp://15.badc</that>': 2,
    'https://a.example.com/one.html?url=http://example.com/inside/of/another?=http://': 2,
    '[https://a.example.com/one.html?url=http://example.com/inside/of/another?=](http://a.example.com)': 3,
}
for url_str, num_urls in _test_url_strs.items():
    assert len(list(find_all_urls(url_str))) == num_urls, (
        f'{url_str} does not contain {num_urls} urls')
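Why the nested-URL cases above yield two or three matches: wrapping the whole pattern in a zero-width lookahead group, (?=(...)), makes re.findall report a capture at every starting position instead of consuming the matched text, so overlapping URLs are all found. A simplified illustration, not the full URL_REGEX:

import re

NESTED = re.compile(r'(?=(http[s]?://\S+))')  # simplified stand-in for URL_REGEX
print(NESTED.findall('http://us:pa@ex.co:42/http://ex.co:19/a'))
# -> ['http://us:pa@ex.co:42/http://ex.co:19/a', 'http://ex.co:19/a']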
archivebox/vendor/requirements.txt (vendored, new file, 6 lines)

@ -0,0 +1,6 @@
# this folder contains vendored versions of these packages

atomicwrites==1.4.0
pocket==0.3.7
django-taggit==1.3.0
base32-crockford==0.3.0
@ -10,7 +10,7 @@ set -o nounset
set -o pipefail
IFS=$'\n'

REPO_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && cd .. && pwd )"
REPO_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" > /dev/null 2>&1 && cd .. && pwd )"


CURRENT_PLAFORM="$(uname)"

@ -30,6 +30,14 @@ echo
echo "[+] Uninstalling any existing archivebox versions..."
brew uninstall archivebox || true
brew untap archivebox/archivebox || true
brew uninstall --ignore-dependencies yt-dlp || true
brew uninstall python-mutagen || true
brew uninstall python-brotli || true

pip3 uninstall archivebox || true
pip3 uninstall mutagen || true
pip3 uninstall brotli || true
pip3 uninstall yt-dlp || true

# echo "[*] Running Formula linters and test build..."
# brew test-bot --tap=ArchiveBox/homebrew-archivebox archivebox/archivebox/archivebox || true

@ -37,7 +45,7 @@ brew untap archivebox/archivebox || true
# brew untap archivebox/archivebox || true

echo
echo "[+] Installing and building homebrew bottle from https://Github.com/ArchiveBox/homebrew-archivebox#main"
echo "[+] Installing and building homebrew bottle from https://github.com/ArchiveBox/homebrew-archivebox#main"
brew tap archivebox/archivebox
brew install --build-bottle archivebox
brew bottle archivebox
@ -31,6 +31,20 @@ else
    echo "[!] Warning: No virtualenv present in $REPO_DIR.venv"
fi


# Build python package lists
# https://pdm-project.org/latest/usage/lockfile/
echo "[+] Generating requirements.txt and pdm.lock from pyproject.toml..."
pdm lock --group=':all' --production --lockfile pdm.lock --strategy="cross_platform"
pdm sync --group=':all' --production --lockfile pdm.lock --clean || pdm sync --group=':all' --production --lockfile pdm.lock --clean
pdm export --group=':all' --production --lockfile pdm.lock --without-hashes -o requirements.txt

pdm lock --group=':all' --dev --lockfile pdm.dev.lock --strategy="cross_platform"
pdm sync --group=':all' --dev --lockfile pdm.dev.lock --clean || pdm sync --group=':all' --dev --lockfile pdm.dev.lock --clean
pdm export --group=':all' --dev --lockfile pdm.dev.lock --without-hashes -o requirements-dev.txt


# cleanup build artifacts
rm -Rf build deb_dist dist archivebox-*.tar.gz
@ -21,6 +21,20 @@ VERSION="$(jq -r '.version' < "$REPO_DIR/package.json")"
SHORT_VERSION="$(echo "$VERSION" | perl -pe 's/(\d+)\.(\d+)\.(\d+)/$1.$2/g')"
REQUIRED_PLATFORMS="${2:-"linux/arm64,linux/amd64,linux/arm/v7"}"


# Build python package lists
# https://pdm-project.org/latest/usage/lockfile/
echo "[+] Generating requirements.txt and pdm.lock from pyproject.toml..."
pdm lock --group=':all' --production --lockfile pdm.lock --strategy="cross_platform"
pdm sync --group=':all' --production --lockfile pdm.lock --clean || pdm sync --group=':all' --production --lockfile pdm.lock --clean
pdm export --group=':all' --production --lockfile pdm.lock --without-hashes -o requirements.txt

pdm lock --group=':all' --dev --lockfile pdm.dev.lock --strategy="cross_platform"
pdm sync --group=':all' --dev --lockfile pdm.dev.lock --clean || pdm sync --group=':all' --dev --lockfile pdm.dev.lock --clean
pdm export --group=':all' --dev --lockfile pdm.dev.lock --without-hashes -o requirements-dev.txt


echo "[+] Building Docker image: tag=$TAG_NAME version=$SHORT_VERSION arch=$REQUIRED_PLATFORMS"


@ -32,4 +46,4 @@ docker build . --no-cache -t archivebox-dev --load
# -t archivebox \
# -t archivebox:$TAG_NAME \
# -t archivebox:$VERSION \
# -t archivebox:$SHORT_VERSION
@ -18,7 +18,7 @@ which docker > /dev/null || exit 1
which jq > /dev/null || exit 1
# which pdm > /dev/null || exit 1

SUPPORTED_PLATFORMS="linux/amd64,linux/arm64,linux/arm/v7"
SUPPORTED_PLATFORMS="linux/amd64,linux/arm64"

TAG_NAME="${1:-$(git rev-parse --abbrev-ref HEAD)}"
VERSION="$(jq -r '.version' < "$REPO_DIR/package.json")"

@ -71,10 +71,8 @@ docker buildx use xbuilder 2>&1 >/dev/null || create_builder
check_platforms || (recreate_builder && check_platforms) || exit 1


# Build python package lists
echo "[+] Generating requirements.txt and pdm.lock from pyproject.toml..."
pdm lock --group=':all' --strategy="cross_platform" --production
pdm export --group=':all' --production --without-hashes -o requirements.txt
# Make sure pyproject.toml, pdm{.dev}.lock, requirements{-dev}.txt, package{-lock}.json are all up-to-date
bash ./bin/lock_pkgs.sh


echo "[+] Building archivebox:$VERSION docker image..."

@ -82,20 +80,20 @@ echo "[+] Building archivebox:$VERSION docker image..."
# docker build . --no-cache -t archivebox-dev \
# replace --load with --push to deploy
docker buildx build --platform "$SELECTED_PLATFORMS" --load . \
    -t archivebox/archivebox \
    # -t archivebox/archivebox \
    -t archivebox/archivebox:$TAG_NAME \
    -t archivebox/archivebox:$VERSION \
    -t archivebox/archivebox:$SHORT_VERSION \
    # -t archivebox/archivebox:$VERSION \
    # -t archivebox/archivebox:$SHORT_VERSION \
    -t archivebox/archivebox:$GIT_SHA \
    -t archivebox/archivebox:latest \
    -t nikisweeting/archivebox \
    # -t archivebox/archivebox:latest \
    # -t nikisweeting/archivebox \
    -t nikisweeting/archivebox:$TAG_NAME \
    -t nikisweeting/archivebox:$VERSION \
    -t nikisweeting/archivebox:$SHORT_VERSION \
    # -t nikisweeting/archivebox:$VERSION \
    # -t nikisweeting/archivebox:$SHORT_VERSION \
    -t nikisweeting/archivebox:$GIT_SHA \
    -t nikisweeting/archivebox:latest \
    # -t nikisweeting/archivebox:latest \
    -t ghcr.io/archivebox/archivebox/archivebox:$TAG_NAME \
    -t ghcr.io/archivebox/archivebox/archivebox:$VERSION \
    -t ghcr.io/archivebox/archivebox/archivebox:$SHORT_VERSION \
    # -t ghcr.io/archivebox/archivebox/archivebox:$VERSION \
    # -t ghcr.io/archivebox/archivebox/archivebox:$SHORT_VERSION \
    -t ghcr.io/archivebox/archivebox/archivebox:$GIT_SHA \
    -t ghcr.io/archivebox/archivebox/archivebox:latest
    # -t ghcr.io/archivebox/archivebox/archivebox:latest
@ -20,20 +20,13 @@ else
fi
cd "$REPO_DIR"

echo "[*] Cleaning up build dirs"
cd "$REPO_DIR"
rm -Rf build dist
# Generate pdm.lock, requirements.txt, and package-lock.json
bash ./bin/lock_pkgs.sh

echo "[+] Building sdist, bdist_wheel, and egg_info"
rm -f archivebox/package.json
cp package.json archivebox/package.json

pdm self update
pdm install
rm -Rf build dist
pdm build
pdm export --without-hashes -o ./pip_dist/requirements.txt

cp dist/* ./pip_dist/

echo
echo "[√] Finished. Don't forget to commit the new sdist and wheel files in ./pip_dist/"
@ -18,6 +18,7 @@
# https://www.gnu.org/software/bash/manual/html_node/The-Set-Builtin.html
# set -o xtrace
# set -o nounset
shopt -s nullglob
set -o errexit
set -o errtrace
set -o pipefail

@ -35,7 +36,7 @@ export DEFAULT_PGID=911
if [[ "$PUID" == "0" ]]; then
    echo -e "\n[X] Error: Got PUID=$PUID and PGID=$PGID but ArchiveBox is not allowed to be run as root, please change or unset PUID & PGID and try again." > /dev/stderr
    echo -e "    Hint: some NFS/SMB/FUSE/etc. filesystems force-remap/ignore all permissions," > /dev/stderr
    echo -e "          leave PUID/PGID unset, or use values the filesystem prefers (defaults to $DEFAULT_PUID:$DEFAULT_PGID)" > /dev/stderr
    echo -e "          leave PUID/PGID unset, disable root_squash, or use values the drive prefers (default is $DEFAULT_PUID:$DEFAULT_PGID)" > /dev/stderr
    echo -e "          https://linux.die.net/man/8/mount.cifs#:~:text=does%20not%20provide%20unix%20ownership" > /dev/stderr
    exit 3
fi

@ -46,6 +47,7 @@ export DETECTED_PGID="$(stat -c '%g' "$DATA_DIR/logs/errors.log" 2>/dev/null ||

# If data directory exists but is owned by root, use defaults instead of root because root is not allowed
[[ "$DETECTED_PUID" == "0" ]] && export DETECTED_PUID="$DEFAULT_PUID"
# (PGID / DETECTED_PGID is allowed to be 0 though)

# Set archivebox user and group ids to desired PUID/PGID
usermod -o -u "${PUID:-$DETECTED_PUID}" "$ARCHIVEBOX_USER" > /dev/null 2>&1

@ -64,32 +66,42 @@ if [[ -d "$DATA_DIR/archive" ]]; then
        # echo "[√] Permissions are correct"
    else
        # the only time this fails is if the host filesystem doesn't allow us to write as root (e.g. some NFS mapall/maproot problems, connection issues, drive disappeared, etc.)
        echo -e "\n[X] Error: archivebox user (PUID=$PUID) is not able to write to your ./data dir (currently owned by $(stat -c '%u' "$DATA_DIR"):$(stat -c '%g' "$DATA_DIR")." >&2
        echo -e "\n[X] Error: archivebox user (PUID=$PUID) is not able to write to your ./data/archive dir (currently owned by $(stat -c '%u' "$DATA_DIR/archive"):$(stat -c '%g' "$DATA_DIR/archive")." > /dev/stderr
        echo -e "    Change ./data to be owned by PUID=$PUID PGID=$PGID on the host and retry:" > /dev/stderr
        echo -e "       \$ chown -R $PUID:$PGID ./data\n" > /dev/stderr
        echo -e "    Configure the PUID & PGID environment variables to change the desired owner:" > /dev/stderr
        echo -e "       https://docs.linuxserver.io/general/understanding-puid-and-pgid\n" > /dev/stderr
        echo -e "    Hint: some NFS/SMB/FUSE/etc. filesystems force-remap/ignore all permissions," > /dev/stderr
        echo -e "          leave PUID/PGID unset, or use values the filesystem prefers (defaults to $DEFAULT_PUID:$DEFAULT_PGID)" > /dev/stderr
        echo -e "          leave PUID/PGID unset, disable root_squash, or use values the drive prefers (default is $DEFAULT_PUID:$DEFAULT_PGID)" > /dev/stderr
        echo -e "          https://linux.die.net/man/8/mount.cifs#:~:text=does%20not%20provide%20unix%20ownership" > /dev/stderr
        exit 3
    fi
else
    # create data directory (and logs, since it's the first dir ArchiveBox needs to write to)
    mkdir -p "$DATA_DIR/logs"
fi

# force set the ownership of the data dir contents to the archivebox user and group
# this is needed because Docker Desktop often does not map user permissions from the host properly
chown $PUID:$PGID "$DATA_DIR"
chown $PUID:$PGID "$DATA_DIR"/*
if ! chown $PUID:$PGID "$DATA_DIR"/* > /dev/null 2>&1; then
    # users may store the ./data/archive folder on a network mount that prevents chmod/chown
    # fall back to chowning everything else in ./data and leaving ./data/archive alone
    find "$DATA_DIR" -type d -not -path "$DATA_DIR/archive*" -exec chown $PUID:$PGID {} \; > /dev/null 2>&1
    find "$DATA_DIR" -type f -not -path "$DATA_DIR/archive/*" -exec chown $PUID:$PGID {} \; > /dev/null 2>&1
fi


# also chown BROWSERS_DIR because otherwise 'archivebox setup' won't be able to install chrome at runtime
# also chown BROWSERS_DIR because otherwise 'archivebox setup' won't be able to 'playwright install chromium' at runtime
export PLAYWRIGHT_BROWSERS_PATH="${PLAYWRIGHT_BROWSERS_PATH:-/browsers}"
mkdir -p "$PLAYWRIGHT_BROWSERS_PATH/permissions_test_safe_to_delete"
chown $PUID:$PGID "$PLAYWRIGHT_BROWSERS_PATH"
chown $PUID:$PGID "$PLAYWRIGHT_BROWSERS_PATH"/*
rm -Rf "$PLAYWRIGHT_BROWSERS_PATH/permissions_test_safe_to_delete"
chown $PUID:$PGID "$PLAYWRIGHT_BROWSERS_PATH"
if [[ -d "$PLAYWRIGHT_BROWSERS_PATH/.links" ]]; then
    chown $PUID:$PGID "$PLAYWRIGHT_BROWSERS_PATH"/*
    chown $PUID:$PGID "$PLAYWRIGHT_BROWSERS_PATH"/.*
    chown -h $PUID:$PGID "$PLAYWRIGHT_BROWSERS_PATH"/.links/*
fi


# (this check is written in blood in 2023, QEMU silently breaks things in ways that are not obvious)

@ -100,7 +112,7 @@ if [[ "$IN_QEMU" == "True" ]]; then
    echo -e "    See here for more info: https://github.com/microsoft/playwright/issues/17395#issuecomment-1250830493\n" > /dev/stderr
fi

# check disk space free on / and /data, warn on <500Mb free, error on <100Mb free
# check disk space free on /, /data, and /data/archive, warn on <500Mb free, error on <100Mb free
export ROOT_USAGE="$(df --output=pcent,avail / | tail -n 1 | xargs)"
export ROOT_USED_PCT="${ROOT_USAGE%%%*}"
export ROOT_AVAIL_KB="$(echo "$ROOT_USAGE" | awk '{print $2}')"

@ -117,22 +129,58 @@ elif [[ "$ROOT_USED_PCT" -ge 99 ]] || [[ "$ROOT_AVAIL_KB" -lt 500000 ]]; then
    df -kh / > /dev/stderr
fi

export DATA_USAGE="$(df --output=pcent,avail /data | tail -n 1 | xargs)"
export DATA_USAGE="$(df --output=pcent,avail "$DATA_DIR" | tail -n 1 | xargs)"
export DATA_USED_PCT="${DATA_USAGE%%%*}"
export DATA_AVAIL_KB="$(echo "$DATA_USAGE" | awk '{print $2}')"
if [[ "$DATA_AVAIL_KB" -lt 100000 ]]; then
    echo -e "\n[!] Warning: Docker data volume is completely out of space! (${DATA_USED_PCT}% used on /data)" > /dev/stderr
    echo -e "\n[!] Warning: Docker data volume is completely out of space! (${DATA_USED_PCT}% used on $DATA_DIR)" > /dev/stderr
    echo -e "     you need to free up at least 100Mb on the drive holding your data directory" > /dev/stderr
    echo -e "     \$ ncdu -x data\n" > /dev/stderr
    df -kh /data > /dev/stderr
    df -kh "$DATA_DIR" > /dev/stderr
    sleep 5
elif [[ "$DATA_USED_PCT" -ge 99 ]] || [[ "$ROOT_AVAIL_KB" -lt 500000 ]]; then
    echo -e "\n[!] Warning: Docker data volume is running out of space! (${DATA_USED_PCT}% used on /data)" > /dev/stderr
    echo -e "\n[!] Warning: Docker data volume is running out of space! (${DATA_USED_PCT}% used on $DATA_DIR)" > /dev/stderr
    echo -e "     you may need to free up space on the drive holding your data directory soon" > /dev/stderr
    echo -e "     \$ ncdu -x data\n" > /dev/stderr
    df -kh /data > /dev/stderr
    df -kh "$DATA_DIR" > /dev/stderr
else
    # data/ has space available, but check data/archive separately, because it might be on a network mount or external drive
    if [[ -d "$DATA_DIR/archive" ]]; then
        export ARCHIVE_USAGE="$(df --output=pcent,avail "$DATA_DIR/archive" | tail -n 1 | xargs)"
        export ARCHIVE_USED_PCT="${ARCHIVE_USAGE%%%*}"
        export ARCHIVE_AVAIL_KB="$(echo "$ARCHIVE_USAGE" | awk '{print $2}')"
        if [[ "$ARCHIVE_AVAIL_KB" -lt 100000 ]]; then
            echo -e "\n[!] Warning: data/archive folder is completely out of space! (${ARCHIVE_USED_PCT}% used on $DATA_DIR/archive)" > /dev/stderr
            echo -e "     you need to free up at least 100Mb on the drive holding your data/archive directory" > /dev/stderr
            echo -e "     \$ ncdu -x data/archive\n" > /dev/stderr
            df -kh "$DATA_DIR/archive" > /dev/stderr
            sleep 5
        elif [[ "$ARCHIVE_USED_PCT" -ge 99 ]] || [[ "$ROOT_AVAIL_KB" -lt 500000 ]]; then
            echo -e "\n[!] Warning: data/archive folder is running out of space! (${ARCHIVE_USED_PCT}% used on $DATA_DIR/archive)" > /dev/stderr
            echo -e "     you may need to free up space on the drive holding your data/archive directory soon" > /dev/stderr
            echo -e "     \$ ncdu -x data/archive\n" > /dev/stderr
            df -kh "$DATA_DIR/archive" > /dev/stderr
        fi
    fi
fi
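The df-based checks above boil down to: error under ~100MB free, warn at ≥99% used or under ~500MB free. A rough Python equivalent of the same thresholds using shutil.disk_usage, for illustration only; the entrypoint itself stays in bash:

import shutil

usage = shutil.disk_usage('/data')            # assumes the container's /data mount
free_mb = usage.free // (1024 * 1024)
used_pct = round(100 * usage.used / usage.total)

if free_mb < 100:
    print(f'[!] Warning: completely out of space! ({used_pct}% used on /data)')
elif used_pct >= 99 or free_mb < 500:
    print(f'[!] Warning: running out of space! ({used_pct}% used on /data)')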

# symlink etc crontabs into place
mkdir -p "$DATA_DIR/crontabs"
if ! test -L /var/spool/cron/crontabs; then
    # move files from old location into new data dir location
    for existing_file in /var/spool/cron/crontabs/*; do
        mv "$existing_file" "$DATA_DIR/crontabs/"
    done
    # replace old system path with symlink to data dir location
    rm -Rf /var/spool/cron/crontabs
    ln -sf "$DATA_DIR/crontabs" /var/spool/cron/crontabs
fi

# set DBUS_SYSTEM_BUS_ADDRESS & DBUS_SESSION_BUS_ADDRESS
# (dbus is not actually needed; it makes chrome log fewer warnings but isn't worth making our docker images bigger)
# service dbus start >/dev/null 2>&1 &
# export $(dbus-launch --close-stderr)


export ARCHIVEBOX_BIN_PATH="$(which archivebox)"
@ -15,7 +15,7 @@ DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && cd .. && pwd )"
source "$DIR/.venv/bin/activate"

echo "[*] Running flake8..."
cd archivebox
cd "$DIR/archivebox"
flake8 . && echo "√ No errors found."

echo
bin/lock_pkgs.sh (executable, new file, 101 lines)

@ -0,0 +1,101 @@
#!/usr/bin/env bash

### Bash Environment Setup
# http://redsymbol.net/articles/unofficial-bash-strict-mode/
# https://www.gnu.org/software/bash/manual/html_node/The-Set-Builtin.html
# set -o xtrace
set -o errexit
set -o errtrace
set -o nounset
set -o pipefail
IFS=$'\n'

REPO_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && cd .. && pwd )"

cd "$REPO_DIR"

py_version="$(grep 'version = ' pyproject.toml | awk '{print $3}' | jq -r)"
js_version="$(jq -r '.version' package.json)"

if [[ "$py_version" != "$js_version" ]]; then
    echo "[❌] Version in pyproject.toml ($py_version) does not match version in package.json ($js_version)!"
    exit 1
fi

echo "[🔒] Locking all ArchiveBox dependencies (pip, npm)"
echo
echo "pyproject.toml:    archivebox $py_version"
echo "package.json:      archivebox $js_version"
echo
echo

echo "[*] Cleaning up old lockfiles and build files"
deactivate 2>/dev/null || true
rm -Rf build dist
rm -f pdm.lock
rm -f pdm.dev.lock
rm -f requirements.txt
rm -f requirements-dev.txt
rm -f package-lock.json
rm -f archivebox/package.json
rm -f archivebox/package-lock.json
rm -Rf ./.venv
rm -Rf ./node_modules
rm -Rf ./archivebox/node_modules

echo
echo

echo "[+] Generating dev & prod requirements.txt & pdm.lock from pyproject.toml..."
pip install --upgrade pip setuptools
pdm self update >/dev/null 2>&1 || true
pdm venv create 3.10
echo
echo "pyproject.toml: archivebox $(grep 'version = ' pyproject.toml | awk '{print $3}' | jq -r)"
echo "$(which python): $(python --version | head -n 1)"
echo "$(which pdm): $(pdm --version | head -n 1)"
pdm info --env
pdm info

echo
# https://pdm-project.org/latest/usage/lockfile/
# prod
pdm lock --group=':all' --production --lockfile pdm.lock --strategy="cross_platform"
pdm sync --group=':all' --production --lockfile pdm.lock --clean
pdm export --group=':all' --production --lockfile pdm.lock --without-hashes -o requirements.txt
cp ./pdm.lock ./pip_dist/
cp ./requirements.txt ./pip_dist/
# dev
pdm lock --group=':all' --dev --lockfile pdm.dev.lock --strategy="cross_platform"
pdm sync --group=':all' --dev --lockfile pdm.dev.lock --clean
pdm export --group=':all' --dev --lockfile pdm.dev.lock --without-hashes -o requirements-dev.txt
cp ./pdm.dev.lock ./pip_dist/
cp ./requirements-dev.txt ./pip_dist/

echo
echo "[+] Generating package-lock.json from package.json..."
npm install -g npm
echo
echo "package.json: archivebox $(jq -r '.version' package.json)"
echo
echo "$(which node): $(node --version | head -n 1)"
echo "$(which npm): $(npm --version | head -n 1)"

echo
npm install --package-lock-only
cp package.json archivebox/package.json
cp package-lock.json archivebox/package-lock.json

echo
echo "[√] Finished. Don't forget to commit the new lockfiles:"
echo
ls "pyproject.toml" | cat
ls "pdm.lock" | cat
ls "pdm.dev.lock" | cat
ls "requirements.txt" | cat
ls "requirements-dev.txt" | cat
echo
ls "package.json" | cat
ls "package-lock.json" | cat
ls "archivebox/package.json" | cat
ls "archivebox/package-lock.json" | cat
bin/setup.sh (156 lines changed)

@ -1,66 +1,70 @@
#!/usr/bin/env sh
# ArchiveBox Setup Script: https://github.com/ArchiveBox/ArchiveBox
# Supported Platforms: Ubuntu/Debian/FreeBSD/macOS
# Usage:
#     curl -sSL 'https://raw.githubusercontent.com/ArchiveBox/ArchiveBox/dev/bin/setup.sh' | sh
# ArchiveBox Setup Script (Ubuntu/Debian/FreeBSD/macOS)
# - Project Homepage: https://github.com/ArchiveBox/ArchiveBox
# - Install Documentation: https://github.com/ArchiveBox/ArchiveBox/wiki/Install
# Script Usage:
#    curl -fsSL 'https://raw.githubusercontent.com/ArchiveBox/ArchiveBox/dev/bin/setup.sh' | sh
#    (aka https://docker-compose.archivebox.io)

clear

if [ $(id -u) -eq 0 ]; then
    echo ""
    echo
    echo "[X] You cannot run this script as root. You must run it as a non-root user with sudo ability."
    echo "    Create a new non-privileged user 'archivebox' if necessary."
    echo "        adduser archivebox && usermod -a archivebox -G sudo && su archivebox"
    echo "        https://www.digitalocean.com/community/tutorials/how-to-create-a-new-sudo-enabled-user-on-ubuntu-20-04-quickstart"
    echo "        https://www.vultr.com/docs/create-a-sudo-user-on-freebsd"
    echo "    Then re-run this script as the non-root user."
    echo ""
    echo
    exit 2
fi

if (which docker-compose > /dev/null && docker pull archivebox/archivebox:latest); then
    echo "[+] Initializing an ArchiveBox data folder at ~/archivebox/data using Docker Compose..."
    mkdir -p ~/archivebox
    mkdir -p ~/archivebox/data
    cd ~/archivebox
    mkdir -p data
    if [ -f "./index.sqlite3" ]; then
        mv ~/archivebox/* ~/archivebox/data/
        mv -i ~/archivebox/* ~/archivebox/data/
    fi
    curl -O 'https://raw.githubusercontent.com/ArchiveBox/ArchiveBox/master/docker-compose.yml'
    docker-compose run --rm archivebox init --setup
    curl -fsSL 'https://raw.githubusercontent.com/ArchiveBox/ArchiveBox/stable/docker-compose.yml' > docker-compose.yml
    mkdir -p ./etc
    curl -fsSL 'https://raw.githubusercontent.com/ArchiveBox/ArchiveBox/stable/etc/sonic.cfg' > ./etc/sonic.cfg
    docker compose run --rm archivebox init --setup
    echo
    echo "[+] Starting ArchiveBox server using: docker-compose up -d..."
    docker-compose up -d
    echo "[+] Starting ArchiveBox server using: docker compose up -d..."
    docker compose up -d
    sleep 7
    open http://127.0.0.1:8000 || true
    which open > /dev/null && open "http://127.0.0.1:8000" || true
    echo
    echo "[√] Server started on http://0.0.0.0:8000 and data directory initialized in ~/archivebox/data. Usage:"
    echo "    cd ~/archivebox"
    echo "    docker-compose ps"
    echo "    docker-compose down"
    echo "    docker-compose pull"
    echo "    docker-compose up"
    echo "    docker-compose run archivebox manage createsuperuser"
    echo "    docker-compose run archivebox add 'https://example.com'"
    echo "    docker-compose run archivebox list"
    echo "    docker-compose run archivebox help"
    echo "    docker compose ps"
    echo "    docker compose down"
    echo "    docker compose pull"
    echo "    docker compose up"
    echo "    docker compose run archivebox manage createsuperuser"
    echo "    docker compose run archivebox add 'https://example.com'"
    echo "    docker compose run archivebox list"
    echo "    docker compose run archivebox help"
    exit 0
elif (which docker > /dev/null && docker pull archivebox/archivebox:latest); then
    echo "[+] Initializing an ArchiveBox data folder at ~/archivebox using Docker..."
    mkdir -p ~/archivebox
    echo "[+] Initializing an ArchiveBox data folder at ~/archivebox/data using Docker..."
    mkdir -p ~/archivebox/data
    cd ~/archivebox
    if [ -f "./data/index.sqlite3" ]; then
        cd ./data
    if [ -f "./index.sqlite3" ]; then
        mv -i ~/archivebox/* ~/archivebox/data/
    fi
    cd ./data
    docker run -v "$PWD":/data -it --rm archivebox/archivebox:latest init --setup
    echo
    echo "[+] Starting ArchiveBox server using: docker run -d archivebox/archivebox..."
    docker run -v "$PWD":/data -it -d -p 8000:8000 --name=archivebox archivebox/archivebox:latest
    sleep 7
    open http://127.0.0.1:8000 || true
    which open > /dev/null && open "http://127.0.0.1:8000" || true
    echo
    echo "[√] Server started on http://0.0.0.0:8000 and data directory initialized in ~/archivebox. Usage:"
    echo "    cd ~/archivebox"
    echo "[√] Server started on http://0.0.0.0:8000 and data directory initialized in ~/archivebox/data. Usage:"
    echo "    cd ~/archivebox/data"
    echo "    docker ps --filter name=archivebox"
    echo "    docker kill archivebox"
    echo "    docker pull archivebox/archivebox"
@ -72,37 +76,37 @@ elif (which docker > /dev/null && docker pull archivebox/archivebox:latest); the
    exit 0
fi

echo ""
echo
echo "[!] It's highly recommended to use ArchiveBox with Docker, but Docker wasn't found."
echo ""
echo
echo "    ⚠️ If you want to use Docker, press [Ctrl-C] to cancel now. ⚠️"
echo "    Get Docker: https://docs.docker.com/get-docker/"
echo "    After you've installed Docker, run this script again."
echo ""
echo "Otherwise, install will continue with apt/brew/pip in 12s... (press [Ctrl+C] to cancel)"
echo ""
echo
echo "Otherwise, install will continue with apt/brew/pkg + pip in 12s... (press [Ctrl+C] to cancel)"
echo
sleep 12 || exit 1
echo "Proceeding with system package manager..."
echo ""
echo

echo "[i] ArchiveBox Setup Script 📦"
echo ""
echo
echo "    This is a helper script which installs the ArchiveBox dependencies on your system using brew/apt/pip3."
echo "    You may be prompted for a sudo password in order to install the following:"
echo ""
echo
echo "        - archivebox"
echo "        - python3, pip, nodejs, npm (languages used by ArchiveBox, and its extractor modules)"
echo "        - curl, wget, git, youtube-dl, yt-dlp (used for extracting title, favicon, git, media, and more)"
echo "        - chromium (skips this if any Chrome/Chromium version is already installed)"
echo ""
echo
echo "    If you'd rather install these manually as-needed, you can find detailed documentation here:"
echo "        https://github.com/ArchiveBox/ArchiveBox/wiki/Install"
echo ""
echo
echo "Continuing in 12s... (press [Ctrl+C] to cancel)"
echo ""
echo
sleep 12 || exit 1
echo "Proceeding to install dependencies..."
echo ""
echo

# On Linux:
if which apt-get > /dev/null; then
@ -115,41 +119,42 @@ if which apt-get > /dev/null; then
    fi
    echo
    echo "[+] Installing ArchiveBox system dependencies using apt..."
    sudo apt-get install -y git python3 python3-pip python3-distutils wget curl youtube-dl yt-dlp ffmpeg git nodejs npm ripgrep
    sudo apt-get install -y git python3 python3-pip python3-distutils wget curl yt-dlp ffmpeg git nodejs npm ripgrep
    sudo apt-get install -y libgtk2.0-0 libgtk-3-0 libnotify-dev libgconf-2-4 libnss3 libxss1 libasound2 libxtst6 xauth xvfb libgbm-dev || sudo apt-get install -y chromium || sudo apt-get install -y chromium-browser || true
    sudo apt-get install -y archivebox
    sudo apt-get --only-upgrade install -y archivebox
    echo ""
    echo
    echo "[+] Installing ArchiveBox python dependencies using pip3..."
    sudo python3 -m pip install --upgrade --ignore-installed archivebox
    sudo python3 -m pip install --upgrade --ignore-installed archivebox yt-dlp playwright
# On Mac:
elif which brew > /dev/null; then
    echo "[+] Installing ArchiveBox system dependencies using brew..."
    brew tap archivebox/archivebox
    brew update
    brew install python3 node git wget curl yt-dlp ripgrep
    brew install --fetch-HEAD -f archivebox
    echo ""
    echo
    echo "[+] Installing ArchiveBox python dependencies using pip3..."
    python3 -m pip install --upgrade --ignore-installed archivebox
    python3 -m pip install --upgrade --ignore-installed archivebox yt-dlp playwright
elif which pkg > /dev/null; then
    echo "[+] Installing ArchiveBox system dependencies using pkg and pip (python3.9)..."
    sudo pkg install -y python3 py39-pip py39-sqlite3 npm wget curl youtube_dl ffmpeg git ripgrep
    sudo pkg install -y chromium
    echo ""
    echo
    echo "[+] Installing ArchiveBox python dependencies using pip..."
    # don't use sudo here so that pip installs in $HOME/.local instead of into /usr/local
    python3 -m pip install --upgrade --ignore-installed archivebox
    python3 -m pip install --upgrade --ignore-installed archivebox yt-dlp playwright
else
    echo "[!] Warning: Could not find aptitude/homebrew/pkg! May not be able to install all dependencies automatically."
    echo ""
    echo
    echo "    If you're on macOS, make sure you have homebrew installed: https://brew.sh/"
    echo "    If you're on Linux, only Ubuntu/Debian/BSD systems are officially supported with this script."
    echo "    If you're on Windows, this script is not officially supported (Docker is recommended instead)."
    echo ""
    echo
    echo "See the README.md for Manual Setup & Troubleshooting instructions if you're unable to run ArchiveBox after this script completes."
fi

echo ""
echo

if ! (python3 --version && python3 -m pip --version && python3 -m django --version); then
    echo "[X] Python 3 pip was not found on your system!"
@ -160,41 +165,46 @@ if ! (python3 --version && python3 -m pip --version && python3 -m django --versi
    exit 1
fi

if ! (python3 -m django --version && python3 -m archivebox version --quiet); then
if ! (python3 -m django --version && python3 -m pip show archivebox && which -a archivebox); then
    echo "[X] Django and ArchiveBox were not found after installing!"
    echo "    Check to see if a previous step failed."
    echo ""
    echo
    exit 1
fi

# echo ""
# echo
# echo "[+] Upgrading npm and pip..."
# sudo npm i -g npm || true
# sudo python3 -m pip install --upgrade pip setuptools || true

echo
echo "[+] Initializing ArchiveBox data folder at ~/archivebox..."
mkdir -p ~/archivebox
cd ~/archivebox
if [ -f "./data/index.sqlite3" ]; then
    cd ./data
fi
: | python3 -m archivebox init --setup || true   # pipe in empty command to make sure stdin is closed
echo "[+] Installing Chromium binary using playwright..."
python3 -m playwright install --with-deps chromium || true
echo

echo
echo "[+] Initializing ArchiveBox data folder at ~/archivebox/data..."
mkdir -p ~/archivebox/data
cd ~/archivebox
if [ -f "./index.sqlite3" ]; then
    mv -i ~/archivebox/* ~/archivebox/data/
fi
cd ./data
: | python3 -m archivebox init --setup || true   # pipe in empty command to make sure stdin is closed
# init shows version output at the end too
echo
echo "[+] Starting ArchiveBox server using: nohup archivebox server &..."
nohup python3 -m archivebox server 0.0.0.0:8000 > ./logs/server.log 2>&1 &
sleep 7
which open > /dev/null && open http://127.0.0.1:8000 || true

which open > /dev/null && open "http://127.0.0.1:8000" || true
echo
echo "[√] Server started on http://0.0.0.0:8000 and data directory initialized in ~/archivebox. Usage:"
echo "    cd ~/archivebox"
echo "    ps aux | grep archivebox"
echo "    pkill -f archivebox"
echo "    python3 -m pip install --upgrade archivebox"
echo "    archivebox server --quick-init 0.0.0.0:8000"
echo "    archivebox manage createsuperuser"
echo "    archivebox add 'https://example.com'"
echo "    archivebox list"
echo "    archivebox help"
echo "[√] Server started on http://0.0.0.0:8000 and data directory initialized in ~/archivebox/data. Usage:"
echo "    cd ~/archivebox/data                          # see your data dir"
echo "    archivebox server --quick-init 0.0.0.0:8000   # start server process"
echo "    archivebox manage createsuperuser             # add an admin user+pass"
echo "    ps aux | grep archivebox                      # see server process pid"
echo "    pkill -f archivebox                           # stop the server"
echo "    pip install --upgrade archivebox; archivebox init   # update versions"
echo "    archivebox add 'https://example.com'          # archive a new URL"
echo "    archivebox list                               # see URLs archived"
echo "    archivebox help                               # see more help & examples"
|
@ -14,4 +14,4 @@ DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && cd .. && pwd )"
|
|||
|
||||
source "$DIR/.venv/bin/activate"
|
||||
|
||||
pytest -s --basetemp=tests/out --ignore=archivebox/vendor --ignore=deb_dist --ignore=pip_dist --ignore=brew_dist
|
||||
pytest -s --basetemp=tests/out "$@"
|
||||
|
|
|
@@ -1,39 +1,31 @@
# Usage:
# docker compose run archivebox init --setup
# docker compose up
# echo "https://example.com" | docker compose run archivebox archivebox add
# docker compose run archivebox add --depth=1 https://example.com/some/feed.rss
# docker compose run archivebox config --set MEDIA_MAX_SIZE=750m
# echo 'https://example.com' | docker compose run -T archivebox add
# docker compose run archivebox add --depth=1 'https://news.ycombinator.com'
# docker compose run archivebox config --set SAVE_ARCHIVE_DOT_ORG=False
# docker compose run archivebox help
# Documentation:
# https://github.com/ArchiveBox/ArchiveBox/wiki/Docker#docker-compose

version: '3.9'

services:
    archivebox:
        #image: ${DOCKER_IMAGE:-archivebox/archivebox:dev}
        image: archivebox/archivebox:dev
        command: server --quick-init 0.0.0.0:8000
        image: archivebox/archivebox:latest
        ports:
            - 8000:8000
        volumes:
            - ./data:/data
            # - ./etc/crontabs:/var/spool/cron/crontabs   # uncomment this and archivebox_scheduler below to set up automatic recurring archive jobs
            # - ./archivebox:/app/archivebox              # uncomment this to mount the ArchiveBox source code at runtime (for developers working on archivebox)
        # build: .                                        # uncomment this to build the image from source code at buildtime (for developers working on archivebox)
        environment:
            - ALLOWED_HOSTS=*                    # restrict this to only accept incoming traffic via specific domain name
            # - PUBLIC_INDEX=True                # set to False to prevent anonymous users from viewing snapshot list
            # - PUBLIC_SNAPSHOTS=True            # set to False to prevent anonymous users from viewing snapshot content
            # - PUBLIC_ADD_VIEW=False            # set to True to allow anonymous users to submit new URLs to archive
            # - ADMIN_USERNAME=admin             # create an admin user on first run with the given user/pass combo
            # - ADMIN_PASSWORD=SomeSecretPassword
            - ALLOWED_HOSTS=*                    # restrict this to only accept incoming traffic via specific domain name
            - PUBLIC_INDEX=True                  # set to False to prevent anonymous users from viewing snapshot list
            - PUBLIC_SNAPSHOTS=True              # set to False to prevent anonymous users from viewing snapshot content
            - PUBLIC_ADD_VIEW=False              # set to True to allow anonymous users to submit new URLs to archive
            - SEARCH_BACKEND_ENGINE=sonic        # tells ArchiveBox to use sonic container below for fast full-text search
            - SEARCH_BACKEND_HOST_NAME=sonic
            - SEARCH_BACKEND_PASSWORD=SomeSecretPassword
            # - PUID=911                         # set to your host user's UID & GID if you encounter permissions issues
            # - PGID=911
            # - SEARCH_BACKEND_ENGINE=sonic      # uncomment these and sonic container below for better full-text search
            # - SEARCH_BACKEND_HOST_NAME=sonic
            # - SEARCH_BACKEND_PASSWORD=SomeSecretPassword
            # - PGID=911                         # UID/GIDs <500 may clash with existing users and are not recommended
            # - MEDIA_MAX_SIZE=750m              # increase this filesize limit to allow archiving larger audio/video files
            # - TIMEOUT=60                       # increase this number to 120+ seconds if you see many slow downloads timing out
            # - CHECK_SSL_VALIDITY=True          # set to False to disable strict SSL checking (allows saving URLs w/ broken certs)
@@ -41,8 +33,7 @@ services:
            # ...
            # add further configuration options from archivebox/config.py as needed (to apply them only to this container)
            # or set using `docker compose run archivebox config --set SOME_KEY=someval` (to persist config across all containers)

        # For ad-blocking during archiving, uncomment this section and pihole service section below
        # networks:
        #     - dns
        # dns:
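The two configuration mechanisms mentioned above behave differently; a hedged sketch of each, using keys that appear in this file:

$ docker compose run archivebox config --set SAVE_ARCHIVE_DOT_ORG=False   # persisted in ./data, applies to all containers
$ docker compose up -d --force-recreate                                   # environment: entries apply per-container on (re)create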
@@ -51,29 +42,85 @@ services:

    ######## Optional Addons: tweak examples below as needed for your specific use case ########

    ### Example: To run the Sonic full-text search backend, first download the config file to sonic.cfg
    # $ curl -O https://raw.githubusercontent.com/ArchiveBox/ArchiveBox/master/etc/sonic.cfg
    # After starting, backfill any existing Snapshots into the full-text index:
    ### This optional container runs any scheduled tasks in the background; add new tasks like so:
    # $ docker compose run archivebox schedule --add --every=day --depth=1 'https://example.com/some/rss/feed.xml'
    # then restart the scheduler container to apply any changes to the scheduled task list:
    # $ docker compose restart archivebox_scheduler

    archivebox_scheduler:
        image: archivebox/archivebox:latest
        command: schedule --foreground --update --every=day
        environment:
            - TIMEOUT=120                  # use a higher timeout than the main container to give slow tasks more time when retrying
            # - PUID=502                   # set to your host user's UID & GID if you encounter permissions issues
            # - PGID=20
        volumes:
            - ./data:/data
        # cpus: 2                          # uncomment / edit these values to limit scheduler container resource consumption
        # mem_limit: 2048m
        # restart: always
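A sketch combining the scheduling commands above with the optional crontab mount from the main service (paths assume the volumes shown in this file):

$ docker compose run archivebox schedule --add --every=day --depth=1 'https://example.com/some/rss/feed.xml'
$ docker compose restart archivebox_scheduler   # apply the updated task list
$ cat ./etc/crontabs/root                       # inspect the generated cron entries, if ./etc/crontabs is mounted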
    ### This runs the optional Sonic full-text search backend (much faster than default rg backend).
    # If Sonic is ever started after not running for a while, update its full-text index by running:
    # $ docker-compose run archivebox update --index-only

    # sonic:
    #    image: valeriansaliou/sonic:latest
    #    expose:
    #        - 1491
    #    environment:
    #        - SEARCH_BACKEND_PASSWORD=SomeSecretPassword
    #    volumes:
    #        - ./sonic.cfg:/etc/sonic.cfg:ro
    #        - ./data/sonic:/var/lib/sonic/store


    sonic:
        image: valeriansaliou/sonic:latest
        build:
            # custom build just auto-downloads archivebox's default sonic.cfg as a convenience
            # not needed after the first run, or if you already have ./etc/sonic.cfg present
            dockerfile_inline: |
                FROM quay.io/curl/curl:latest AS config_downloader
                RUN curl -fsSL 'https://raw.githubusercontent.com/ArchiveBox/ArchiveBox/stable/etc/sonic.cfg' > /tmp/sonic.cfg
                FROM valeriansaliou/sonic:latest
                COPY --from=config_downloader /tmp/sonic.cfg /etc/sonic.cfg
        expose:
            - 1491
        environment:
            - SEARCH_BACKEND_PASSWORD=SomeSecretPassword
        volumes:
            - ./sonic.cfg:/etc/sonic.cfg
            - ./data/sonic:/var/lib/sonic/store
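After bringing the sonic container up, the commands from the comments above combine into an enable-and-backfill sequence (a sketch, using the service names defined in this file):

$ curl -O https://raw.githubusercontent.com/ArchiveBox/ArchiveBox/master/etc/sonic.cfg
$ docker compose up -d sonic
$ docker compose run archivebox update --index-only   # backfill existing Snapshots into the new index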
    ### This container runs xvfb+noVNC so you can watch the ArchiveBox browser as it archives things,
    # or remote control it to set up a chrome profile w/ login credentials for sites you want to archive.
    # https://github.com/ArchiveBox/ArchiveBox/wiki/Chromium-Install#setting-up-a-chromium-user-profile

    novnc:
        image: theasp/novnc:latest
        environment:
            - DISPLAY_WIDTH=1920
            - DISPLAY_HEIGHT=1080
            - RUN_XTERM=no
        ports:
            # to view/control ArchiveBox's browser, visit: http://127.0.0.1:8080/vnc.html
            # restricted to access from localhost by default because it has no authentication
            - 127.0.0.1:8080:8080
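Since the noVNC port is bound to localhost only, one way to reach it from another machine is a plain SSH tunnel (a sketch; replace user@archivebox-host with your server):

$ ssh -N -L 8080:127.0.0.1:8080 user@archivebox-host
# then browse to http://127.0.0.1:8080/vnc.html locally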
    ### Example: Put Nginx in front of the ArchiveBox server for SSL termination and static file serving.
    # You can also use any other ingress provider for SSL like Apache, Caddy, Traefik, Cloudflare Tunnels, etc.

    # nginx:
    #    image: nginx:alpine
    #    ports:
    #        - 443:443
    #        - 80:80
    #    volumes:
    #        - ./etc/nginx.conf:/etc/nginx/nginx.conf
    #        - ./data:/var/www
    ### Example: To run pihole in order to block ad/tracker requests during archiving,
    # uncomment this block and set up pihole using its admin interface

    # pihole:
    #    image: pihole/pihole:latest
    #    ports:
    #        - 127.0.0.1:8090:80        # uncomment to access the admin HTTP interface on http://localhost:8090
    #        # access the admin HTTP interface on http://localhost:8090
    #        - 127.0.0.1:8090:80
    #    environment:
    #        - WEBPASSWORD=SET_THIS_TO_SOME_SECRET_PASSWORD_FOR_ADMIN_DASHBOARD
    #        - DNSMASQ_LISTENING=all
@@ -94,7 +141,7 @@ services:
    # $ docker compose restart archivebox_scheduler

    # archivebox_scheduler:
    #    image: ${DOCKER_IMAGE:-archivebox/archivebox:dev}
    #    image: archivebox/archivebox:latest
    #    command: schedule --foreground
    #    environment:
    #        - MEDIA_MAX_SIZE=750m      # increase this number to allow archiving larger audio/video files
@@ -124,7 +171,8 @@ services:
    #        - ./data:/var/www


    ### Example: run all your ArchiveBox traffic through a WireGuard VPN tunnel
    ### Example: run all your ArchiveBox traffic through a WireGuard VPN tunnel to avoid IP blocks.
    # You can also use any other VPN that works at the docker IP level, e.g. Tailscale, OpenVPN, etc.

    # wireguard:
    #    image: linuxserver/wireguard:latest
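One way to verify traffic is actually egressing through the VPN once a tunnel container is wired up (a sketch; assumes curl is available inside the archivebox container):

$ curl -s https://ipinfo.io/ip                                   # your host's public IP
$ docker compose exec archivebox curl -s https://ipinfo.io/ip    # should print the VPN endpoint's IP instead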
@@ -155,10 +203,30 @@ services:


networks:

    # network needed for pihole container to offer :53 dns resolving on fixed ip for archivebox container
    # network just used for pihole container to offer :53 dns resolving on fixed ip for archivebox container
    dns:
        ipam:
            driver: default
            config:
                - subnet: 172.20.0.0/24


# To use remote storage for your ./data/archive (e.g. Amazon S3, Backblaze B2, Google Drive, OneDrive, SFTP, etc.)
# Follow the steps here to set up the Docker RClone Plugin https://rclone.org/docker/
# $ docker plugin install rclone/docker-volume-rclone:amd64 --grant-all-permissions --alias rclone
# $ nano /var/lib/docker-plugins/rclone/config/rclone.conf
#    [examplegdrive]
#    type = drive
#    scope = drive
#    drive_id = 1234567...
#    root_folder_id = 0Abcd...
#    token = {"access_token":...}

# volumes:
#     archive:
#         driver: rclone
#         driver_opts:
#             remote: 'examplegdrive:archivebox'
#             allow_other: 'true'
#             vfs_cache_mode: full
#             poll_interval: 0
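The rclone volume above can be exercised directly before wiring it into the compose file (a sketch; the remote name examplegdrive matches the config stanza above):

$ docker plugin install rclone/docker-volume-rclone:amd64 --grant-all-permissions --alias rclone
$ docker plugin ls                                    # confirm the plugin is enabled
$ docker volume create -d rclone -o remote='examplegdrive:archivebox' -o allow_other=true -o vfs_cache_mode=full archive
$ docker run --rm -v archive:/mnt alpine ls /mnt      # smoke-test that the remote mounts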
docs (submodule, 2 changes)
@@ -1 +1 @@
Subproject commit a1b69c51ba9b249c0b2a6efd141dbb792fc36ad2
Subproject commit f23abba9773b67ad9f2fd04d6f2e8e056dfa6521
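The two submodule pins in this diff (docs here, pip_dist below) check out with standard git submodule commands:

$ git submodule update --init --recursive
$ git -C docs log -1 --oneline    # should show the new pin, f23abba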
etc/sonic.cfg
@@ -6,6 +6,7 @@

[server]

# log_level = "debug"
log_level = "warn"
package-lock.json (generated, 480 changes)
@@ -1,23 +1,33 @@
{
    "name": "archivebox",
    "version": "0.7.2",
    "version": "0.8.0",
    "lockfileVersion": 3,
    "requires": true,
    "packages": {
        "": {
            "name": "archivebox",
            "version": "0.7.2",
            "version": "0.8.0",
            "license": "MIT",
            "dependencies": {
                "@postlight/parser": "^2.2.3",
                "readability-extractor": "github:ArchiveBox/readability-extractor",
                "single-file-cli": "^1.1.46"
                "single-file-cli": "^1.1.54"
            }
        },
"node_modules/@asamuzakjp/dom-selector": {
|
||||
"version": "2.0.2",
|
||||
"resolved": "https://registry.npmjs.org/@asamuzakjp/dom-selector/-/dom-selector-2.0.2.tgz",
|
||||
"integrity": "sha512-x1KXOatwofR6ZAYzXRBL5wrdV0vwNxlTCK9NCuLqAzQYARqGcvFwiJA6A1ERuh+dgeA4Dxm3JBYictIes+SqUQ==",
|
||||
"dependencies": {
|
||||
"bidi-js": "^1.0.3",
|
||||
"css-tree": "^2.3.1",
|
||||
"is-potential-custom-element-name": "^1.0.1"
|
||||
}
|
||||
},
|
||||
"node_modules/@babel/runtime-corejs2": {
|
||||
"version": "7.23.7",
|
||||
"resolved": "https://registry.npmjs.org/@babel/runtime-corejs2/-/runtime-corejs2-7.23.7.tgz",
|
||||
"integrity": "sha512-JmMk2t1zGDNkvsY2MsLLksocjY+ufGzSk8UlcNcxzfrzAPu4nMx0HRFakzIg2bhcqQq6xBI2nUaW/sHoaYIHdQ==",
|
||||
"version": "7.24.5",
|
||||
"resolved": "https://registry.npmjs.org/@babel/runtime-corejs2/-/runtime-corejs2-7.24.5.tgz",
|
||||
"integrity": "sha512-cC9jiO6s/IN+xwCHYy1AGrcFJ4bwgIwb8HX1KaoEpRsznLlO4x9eBP6AX7RIeMSWlQqEj2WHox637OS8cDq6Ew==",
|
||||
"dependencies": {
|
||||
"core-js": "^2.6.12",
|
||||
"regenerator-runtime": "^0.14.0"
|
||||
|
@ -168,9 +178,9 @@
|
|||
}
|
||||
},
|
||||
"node_modules/@puppeteer/browsers": {
|
||||
"version": "1.8.0",
|
||||
"resolved": "https://registry.npmjs.org/@puppeteer/browsers/-/browsers-1.8.0.tgz",
|
||||
"integrity": "sha512-TkRHIV6k2D8OlUe8RtG+5jgOF/H98Myx0M6AOafC8DdNVOFiBSFa5cpRDtpm8LXOa9sVwe0+e6Q3FC56X/DZfg==",
|
||||
"version": "2.0.0",
|
||||
"resolved": "https://registry.npmjs.org/@puppeteer/browsers/-/browsers-2.0.0.tgz",
|
||||
"integrity": "sha512-3PS82/5+tnpEaUWonjAFFvlf35QHF15xqyGd34GBa5oP5EPVfFXRsbSxIGYf1M+vZlqBZ3oxT1kRg9OYhtt8ng==",
|
||||
"dependencies": {
|
||||
"debug": "4.3.4",
|
||||
"extract-zip": "2.0.1",
|
||||
|
@ -184,7 +194,7 @@
|
|||
"browsers": "lib/cjs/main-cli.js"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=16.3.0"
|
||||
"node": ">=18"
|
||||
}
|
||||
},
|
||||
"node_modules/@tootallnate/quickjs-emscripten": {
|
||||
|
@ -193,9 +203,9 @@
|
|||
"integrity": "sha512-C5Mc6rdnsaJDjO3UpGW/CQTHtCKaYlScZTly4JIu97Jxo/odCiH0ITnDXSJPTOrEKk/ycSZ0AOgTmkDtkOsvIA=="
|
||||
},
|
||||
"node_modules/@types/node": {
|
||||
"version": "20.10.6",
|
||||
"resolved": "https://registry.npmjs.org/@types/node/-/node-20.10.6.tgz",
|
||||
"integrity": "sha512-Vac8H+NlRNNlAmDfGUP7b5h/KA+AtWIzuXy0E6OyP8f1tCLYAtPvKRRDJjAPqhpCb0t6U2j7/xqAuLEebW2kiw==",
|
||||
"version": "20.12.11",
|
||||
"resolved": "https://registry.npmjs.org/@types/node/-/node-20.12.11.tgz",
|
||||
"integrity": "sha512-vDg9PZ/zi+Nqp6boSOT7plNuthRugEKixDv5sFTIpkE89MmNtEArAShI4mxuX2+UrLEe9pxC1vm2cjm9YlWbJw==",
|
||||
"optional": true,
|
||||
"dependencies": {
|
||||
"undici-types": "~5.26.4"
|
||||
|
@ -211,9 +221,9 @@
|
|||
}
|
||||
},
|
||||
"node_modules/agent-base": {
|
||||
"version": "7.1.0",
|
||||
"resolved": "https://registry.npmjs.org/agent-base/-/agent-base-7.1.0.tgz",
|
||||
"integrity": "sha512-o/zjMZRhJxny7OyEF+Op8X+efiELC7k7yOjMzgfzVqOzXqkBkWI79YoTdOtsuWd5BWhAGAuOY/Xa6xpiaWXiNg==",
|
||||
"version": "7.1.1",
|
||||
"resolved": "https://registry.npmjs.org/agent-base/-/agent-base-7.1.1.tgz",
|
||||
"integrity": "sha512-H0TSyFNDMomMNJQBn8wFV5YC/2eJ+VXECwOadZJT554xP6cODZHPX3H9QMQECxvrgiSOP1pHjy1sMWQVYJOUOA==",
|
||||
"dependencies": {
|
||||
"debug": "^4.3.4"
|
||||
},
|
||||
|
@ -304,14 +314,15 @@
|
|||
"integrity": "sha512-NmWvPnx0F1SfrQbYwOi7OeaNGokp9XhzNioJ/CSBs8Qa4vxug81mhJEAVZwxXuBmYB5KDRfMq/F3RR0BIU7sWg=="
|
||||
},
|
||||
"node_modules/b4a": {
|
||||
"version": "1.6.4",
|
||||
"resolved": "https://registry.npmjs.org/b4a/-/b4a-1.6.4.tgz",
|
||||
"integrity": "sha512-fpWrvyVHEKyeEvbKZTVOeZF3VSKKWtJxFIxX/jaVPf+cLbGUSitjb49pHLqPV2BUNNZ0LcoeEGfE/YCpyDYHIw=="
|
||||
"version": "1.6.6",
|
||||
"resolved": "https://registry.npmjs.org/b4a/-/b4a-1.6.6.tgz",
|
||||
"integrity": "sha512-5Tk1HLk6b6ctmjIkAcU/Ujv/1WqiDl0F0JdRCR80VsOcUlHcu7pWeWRlOqQLHfDEsVx9YH/aif5AG4ehoCtTmg=="
|
||||
},
|
||||
"node_modules/balanced-match": {
|
||||
"version": "1.0.2",
|
||||
"resolved": "https://registry.npmjs.org/balanced-match/-/balanced-match-1.0.2.tgz",
|
||||
"integrity": "sha512-3oSeUO0TMV67hN1AmbXsK4yaqU7tjiHlbxRDZOpH0KW9+CeX4bRAaX0Anxt0tx2MrpRpWwQaPwIlISEJhYU5Pw=="
|
||||
"node_modules/bare-events": {
|
||||
"version": "2.2.2",
|
||||
"resolved": "https://registry.npmjs.org/bare-events/-/bare-events-2.2.2.tgz",
|
||||
"integrity": "sha512-h7z00dWdG0PYOQEvChhOSWvOfkIKsdZGkWr083FgN/HyoQuebSew/cgirYqh9SCuy/hRvxc5Vy6Fw8xAmYHLkQ==",
|
||||
"optional": true
|
||||
},
|
||||
"node_modules/base64-js": {
|
||||
"version": "1.5.1",
|
||||
|
@ -333,9 +344,9 @@
|
|||
]
|
||||
},
|
||||
"node_modules/basic-ftp": {
|
||||
"version": "5.0.4",
|
||||
"resolved": "https://registry.npmjs.org/basic-ftp/-/basic-ftp-5.0.4.tgz",
|
||||
"integrity": "sha512-8PzkB0arJFV4jJWSGOYR+OEic6aeKMu/osRhBULN6RY0ykby6LKhbmuQ5ublvaas5BOwboah5D87nrHyuh8PPA==",
|
||||
"version": "5.0.5",
|
||||
"resolved": "https://registry.npmjs.org/basic-ftp/-/basic-ftp-5.0.5.tgz",
|
||||
"integrity": "sha512-4Bcg1P8xhUuqcii/S0Z9wiHIrQVPMermM1any+MX5GeGD7faD3/msQUDGLol9wOcz4/jbg/WJnGqoJF6LiBdtg==",
|
||||
"engines": {
|
||||
"node": ">=10.0.0"
|
||||
}
|
||||
|
@ -348,6 +359,14 @@
|
|||
"tweetnacl": "^0.14.3"
|
||||
}
|
||||
},
|
||||
"node_modules/bidi-js": {
|
||||
"version": "1.0.3",
|
||||
"resolved": "https://registry.npmjs.org/bidi-js/-/bidi-js-1.0.3.tgz",
|
||||
"integrity": "sha512-RKshQI1R3YQ+n9YJz2QQ147P66ELpa1FQEg20Dk8oW9t2KgLbpDLLp9aGZ7y8WHSshDknG0bknqGw5/tyCs5tw==",
|
||||
"dependencies": {
|
||||
"require-from-string": "^2.0.2"
|
||||
}
|
||||
},
|
||||
"node_modules/bluebird": {
|
||||
"version": "2.11.0",
|
||||
"resolved": "https://registry.npmjs.org/bluebird/-/bluebird-2.11.0.tgz",
|
||||
|
@ -358,15 +377,6 @@
|
|||
"resolved": "https://registry.npmjs.org/boolbase/-/boolbase-1.0.0.tgz",
|
||||
"integrity": "sha512-JZOSA7Mo9sNGB8+UjSgzdLtokWAky1zbztM3WRLCbZ70/3cTANmQmOdR7y2g+J0e2WXywy1yS468tY+IruqEww=="
|
||||
},
|
||||
"node_modules/brace-expansion": {
|
||||
"version": "1.1.11",
|
||||
"resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-1.1.11.tgz",
|
||||
"integrity": "sha512-iCuPHDFgrHX7H2vEI/5xpz07zSHB00TpugqhmYtVmMO6518mCuRMoOYFldEBl0g187ufozdaHgWKcYFb61qGiA==",
|
||||
"dependencies": {
|
||||
"balanced-match": "^1.0.0",
|
||||
"concat-map": "0.0.1"
|
||||
}
|
||||
},
|
||||
"node_modules/brotli": {
|
||||
"version": "1.3.3",
|
||||
"resolved": "https://registry.npmjs.org/brotli/-/brotli-1.3.3.tgz",
|
||||
|
@ -446,12 +456,12 @@
|
|||
}
|
||||
},
|
||||
"node_modules/chromium-bidi": {
|
||||
"version": "0.4.33",
|
||||
"resolved": "https://registry.npmjs.org/chromium-bidi/-/chromium-bidi-0.4.33.tgz",
|
||||
"integrity": "sha512-IxoFM5WGQOIAd95qrSXzJUv4eXIrh+RvU3rwwqIiwYuvfE7U/Llj4fejbsJnjJMUYCuGtVQsY2gv7oGl4aTNSQ==",
|
||||
"version": "0.5.8",
|
||||
"resolved": "https://registry.npmjs.org/chromium-bidi/-/chromium-bidi-0.5.8.tgz",
|
||||
"integrity": "sha512-blqh+1cEQbHBKmok3rVJkBlBxt9beKBgOsxbFgs7UJcoVbbeZ+K7+6liAsjgpc8l1Xd55cQUy14fXZdGSb4zIw==",
|
||||
"dependencies": {
|
||||
"mitt": "3.0.1",
|
||||
"urlpattern-polyfill": "9.0.0"
|
||||
"urlpattern-polyfill": "10.0.0"
|
||||
},
|
||||
"peerDependencies": {
|
||||
"devtools-protocol": "*"
|
||||
|
@ -497,11 +507,6 @@
|
|||
"node": ">= 0.8"
|
||||
}
|
||||
},
|
||||
"node_modules/concat-map": {
|
||||
"version": "0.0.1",
|
||||
"resolved": "https://registry.npmjs.org/concat-map/-/concat-map-0.0.1.tgz",
|
||||
"integrity": "sha512-/Srv4dswyQNBfohGpz9o6Yb3Gz3SrUDqBH5rTuhGR7ahtlbYKnVxw2bCFMRljaA7EXHaXZ8wsHdodFvbkhKmqg=="
|
||||
},
|
||||
"node_modules/core-js": {
|
||||
"version": "2.6.12",
|
||||
"resolved": "https://registry.npmjs.org/core-js/-/core-js-2.6.12.tgz",
|
||||
|
@ -533,6 +538,18 @@
|
|||
"nth-check": "~1.0.1"
|
||||
}
|
||||
},
|
||||
"node_modules/css-tree": {
|
||||
"version": "2.3.1",
|
||||
"resolved": "https://registry.npmjs.org/css-tree/-/css-tree-2.3.1.tgz",
|
||||
"integrity": "sha512-6Fv1DV/TYw//QF5IzQdqsNDjx/wc8TrMBZsqjL9eW01tWb7R7k/mq+/VXfJCl7SoD5emsJop9cOByJZfs8hYIw==",
|
||||
"dependencies": {
|
||||
"mdn-data": "2.0.30",
|
||||
"source-map-js": "^1.0.1"
|
||||
},
|
||||
"engines": {
|
||||
"node": "^10 || ^12.20.0 || ^14.13.0 || >=15.0.0"
|
||||
}
|
||||
},
|
||||
"node_modules/css-what": {
|
||||
"version": "2.1.3",
|
||||
"resolved": "https://registry.npmjs.org/css-what/-/css-what-2.1.3.tgz",
|
||||
|
@ -542,14 +559,14 @@
|
|||
}
|
||||
},
|
||||
"node_modules/cssstyle": {
|
||||
"version": "3.0.0",
|
||||
"resolved": "https://registry.npmjs.org/cssstyle/-/cssstyle-3.0.0.tgz",
|
||||
"integrity": "sha512-N4u2ABATi3Qplzf0hWbVCdjenim8F3ojEXpBDF5hBpjzW182MjNGLqfmQ0SkSPeQ+V86ZXgeH8aXj6kayd4jgg==",
|
||||
"version": "4.0.1",
|
||||
"resolved": "https://registry.npmjs.org/cssstyle/-/cssstyle-4.0.1.tgz",
|
||||
"integrity": "sha512-8ZYiJ3A/3OkDd093CBT/0UKDWry7ak4BdPTFP2+QEP7cmhouyq/Up709ASSj2cK02BbZiMgk7kYjZNS4QP5qrQ==",
|
||||
"dependencies": {
|
||||
"rrweb-cssom": "^0.6.0"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=14"
|
||||
"node": ">=18"
|
||||
}
|
||||
},
|
||||
"node_modules/dashdash": {
|
||||
|
@ -564,9 +581,9 @@
|
|||
}
|
||||
},
|
||||
"node_modules/data-uri-to-buffer": {
|
||||
"version": "6.0.1",
|
||||
"resolved": "https://registry.npmjs.org/data-uri-to-buffer/-/data-uri-to-buffer-6.0.1.tgz",
|
||||
"integrity": "sha512-MZd3VlchQkp8rdend6vrx7MmVDJzSNTBvghvKjirLkD+WTChA3KUf0jkE68Q4UyctNqI11zZO9/x2Yx+ub5Cvg==",
|
||||
"version": "6.0.2",
|
||||
"resolved": "https://registry.npmjs.org/data-uri-to-buffer/-/data-uri-to-buffer-6.0.2.tgz",
|
||||
"integrity": "sha512-7hvf7/GW8e86rW0ptuwS3OcBGDjIi6SZva7hCyWC0yYry2cOPmLIjXAUHI6DK2HsnwJd9ifmt57i8eV2n4YNpw==",
|
||||
"engines": {
|
||||
"node": ">= 14"
|
||||
}
|
||||
|
@ -657,9 +674,9 @@
|
|||
}
|
||||
},
|
||||
"node_modules/devtools-protocol": {
|
||||
"version": "0.0.1203626",
|
||||
"resolved": "https://registry.npmjs.org/devtools-protocol/-/devtools-protocol-0.0.1203626.tgz",
|
||||
"integrity": "sha512-nEzHZteIUZfGCZtTiS1fRpC8UZmsfD1SiyPvaUNvS13dvKf666OAm8YTi0+Ca3n1nLEyu49Cy4+dPWpaHFJk9g=="
|
||||
"version": "0.0.1232444",
|
||||
"resolved": "https://registry.npmjs.org/devtools-protocol/-/devtools-protocol-0.0.1232444.tgz",
|
||||
"integrity": "sha512-pM27vqEfxSxRkTMnF+XCmxSEb6duO5R+t8A9DEEJgy4Wz2RVanje2mmj99B6A3zv2r/qGfYlOvYznUhuokizmg=="
|
||||
},
|
||||
"node_modules/difflib": {
|
||||
"version": "0.2.6",
|
||||
|
@ -696,9 +713,9 @@
|
|||
"integrity": "sha512-3VdM/SXBZX2omc9JF9nOPCtDaYQ67BGp5CoLpIQlO2KCAPETs8TcDHacF26jXadGbvUteZzRTeos2fhID5+ucQ=="
|
||||
},
|
||||
"node_modules/dompurify": {
|
||||
"version": "3.0.7",
|
||||
"resolved": "https://registry.npmjs.org/dompurify/-/dompurify-3.0.7.tgz",
|
||||
"integrity": "sha512-BViYTZoqP3ak/ULKOc101y+CtHDUvBsVgSxIF1ku0HmK6BRf+C03MC+tArMvOPtVtZp83DDh5puywKDu4sbVjQ=="
|
||||
"version": "3.1.3",
|
||||
"resolved": "https://registry.npmjs.org/dompurify/-/dompurify-3.1.3.tgz",
|
||||
"integrity": "sha512-5sOWYSNPaxz6o2MUPvtyxTTqR4D3L77pr5rUQoWgD5ROQtVIZQgJkXbo1DLlK3vj11YGw5+LnF4SYti4gZmwng=="
|
||||
},
|
||||
"node_modules/domutils": {
|
||||
"version": "1.5.1",
|
||||
|
@ -726,6 +743,11 @@
|
|||
"safer-buffer": "^2.1.0"
|
||||
}
|
||||
},
|
||||
"node_modules/ecc-jsbn/node_modules/jsbn": {
|
||||
"version": "0.1.1",
|
||||
"resolved": "https://registry.npmjs.org/jsbn/-/jsbn-0.1.1.tgz",
|
||||
"integrity": "sha512-UVU9dibq2JcFWxQPA6KCqj5O42VOmAY3zQUfEKxU0KpTGXwNoCjkX1e13eHNvw/xPynt6pU0rZ1htjWTNTSXsg=="
|
||||
},
|
||||
"node_modules/ellipsize": {
|
||||
"version": "0.1.0",
|
||||
"resolved": "https://registry.npmjs.org/ellipsize/-/ellipsize-0.1.0.tgz",
|
||||
|
@ -750,9 +772,9 @@
|
|||
"integrity": "sha512-f2LZMYl1Fzu7YSBKg+RoROelpOaNrcGmE9AZubeDfrCEia483oW4MI4VyFd5VNHIgQ/7qm1I0wUHK1eJnn2y2w=="
|
||||
},
|
||||
"node_modules/escalade": {
|
||||
"version": "3.1.1",
|
||||
"resolved": "https://registry.npmjs.org/escalade/-/escalade-3.1.1.tgz",
|
||||
"integrity": "sha512-k0er2gUkLf8O0zKJiAhmkTnJlTvINGv7ygDNPbeIsX/TJjGJZHuh9B2UxbsaEkmlEo9MfhrSzmhIlhRlI2GXnw==",
|
||||
"version": "3.1.2",
|
||||
"resolved": "https://registry.npmjs.org/escalade/-/escalade-3.1.2.tgz",
|
||||
"integrity": "sha512-ErCHMCae19vR8vQGe50xIsVomy19rg6gFu3+r3jkEO46suLMWBksvVyoGgQV+jOfl84ZSOSlmv6Gxa89PmTGmA==",
|
||||
"engines": {
|
||||
"node": ">=6"
|
||||
}
|
||||
|
@ -890,31 +912,26 @@
|
|||
}
|
||||
},
|
||||
"node_modules/fs-extra": {
|
||||
"version": "8.1.0",
|
||||
"resolved": "https://registry.npmjs.org/fs-extra/-/fs-extra-8.1.0.tgz",
|
||||
"integrity": "sha512-yhlQgA6mnOJUKOsRUFsgJdQCvkKhcz8tlZG5HBQfReYZy46OwLcY+Zia0mtdHsOo9y/hP+CxMN0TU9QxoOtG4g==",
|
||||
"version": "11.2.0",
|
||||
"resolved": "https://registry.npmjs.org/fs-extra/-/fs-extra-11.2.0.tgz",
|
||||
"integrity": "sha512-PmDi3uwK5nFuXh7XDTlVnS17xJS7vW36is2+w3xcv8SVxiB4NyATf4ctkVY5bkSjX0Y4nbvZCq1/EjtEyr9ktw==",
|
||||
"dependencies": {
|
||||
"graceful-fs": "^4.2.0",
|
||||
"jsonfile": "^4.0.0",
|
||||
"universalify": "^0.1.0"
|
||||
"jsonfile": "^6.0.1",
|
||||
"universalify": "^2.0.0"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=6 <7 || >=8"
|
||||
"node": ">=14.14"
|
||||
}
|
||||
},
|
||||
"node_modules/fs-extra/node_modules/universalify": {
|
||||
"version": "0.1.2",
|
||||
"resolved": "https://registry.npmjs.org/universalify/-/universalify-0.1.2.tgz",
|
||||
"integrity": "sha512-rBJeI5CXAlmy1pV+617WB9J63U6XcazHHF2f2dbJix4XzpUF0RS3Zbj0FGIOCAva5P/d/GBOYaACQ1w+0azUkg==",
|
||||
"version": "2.0.1",
|
||||
"resolved": "https://registry.npmjs.org/universalify/-/universalify-2.0.1.tgz",
|
||||
"integrity": "sha512-gptHNQghINnc/vTGIk0SOFGFNXw7JVrlRUtConJRlvaw6DuX0wO5Jeko9sWrMBhh+PsYAZ7oXAiOnf/UKogyiw==",
|
||||
"engines": {
|
||||
"node": ">= 4.0.0"
|
||||
"node": ">= 10.0.0"
|
||||
}
|
||||
},
|
||||
"node_modules/fs.realpath": {
|
||||
"version": "1.0.0",
|
||||
"resolved": "https://registry.npmjs.org/fs.realpath/-/fs.realpath-1.0.0.tgz",
|
||||
"integrity": "sha512-OO0pH2lK6a0hZnAdau5ItzHPI6pUlvI7jMVnxUQRtw4owF2wk8lOSabtGDCTP4Ggrg2MbGnWO9X8K1t4+fGMDw=="
|
||||
},
|
||||
"node_modules/get-caller-file": {
|
||||
"version": "2.0.5",
|
||||
"resolved": "https://registry.npmjs.org/get-caller-file/-/get-caller-file-2.0.5.tgz",
|
||||
|
@ -938,14 +955,14 @@
|
|||
}
|
||||
},
|
||||
"node_modules/get-uri": {
|
||||
"version": "6.0.2",
|
||||
"resolved": "https://registry.npmjs.org/get-uri/-/get-uri-6.0.2.tgz",
|
||||
"integrity": "sha512-5KLucCJobh8vBY1K07EFV4+cPZH3mrV9YeAruUseCQKHB58SGjjT2l9/eA9LD082IiuMjSlFJEcdJ27TXvbZNw==",
|
||||
"version": "6.0.3",
|
||||
"resolved": "https://registry.npmjs.org/get-uri/-/get-uri-6.0.3.tgz",
|
||||
"integrity": "sha512-BzUrJBS9EcUb4cFol8r4W3v1cPsSyajLSthNkz5BxbpDcHN5tIrM10E2eNvfnvBn3DaT3DUgx0OpsBKkaOpanw==",
|
||||
"dependencies": {
|
||||
"basic-ftp": "^5.0.2",
|
||||
"data-uri-to-buffer": "^6.0.0",
|
||||
"data-uri-to-buffer": "^6.0.2",
|
||||
"debug": "^4.3.4",
|
||||
"fs-extra": "^8.1.0"
|
||||
"fs-extra": "^11.2.0"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">= 14"
|
||||
|
@ -959,25 +976,6 @@
|
|||
"assert-plus": "^1.0.0"
|
||||
}
|
||||
},
|
||||
"node_modules/glob": {
|
||||
"version": "7.2.3",
|
||||
"resolved": "https://registry.npmjs.org/glob/-/glob-7.2.3.tgz",
|
||||
"integrity": "sha512-nFR0zLpU2YCaRxwoCJvL6UvCH2JFyFVIvwTLsIf21AuHlMskA1hhTdk+LlYJtOlYt9v6dvszD2BGRqBL+iQK9Q==",
|
||||
"dependencies": {
|
||||
"fs.realpath": "^1.0.0",
|
||||
"inflight": "^1.0.4",
|
||||
"inherits": "2",
|
||||
"minimatch": "^3.1.1",
|
||||
"once": "^1.3.0",
|
||||
"path-is-absolute": "^1.0.0"
|
||||
},
|
||||
"engines": {
|
||||
"node": "*"
|
||||
},
|
||||
"funding": {
|
||||
"url": "https://github.com/sponsors/isaacs"
|
||||
}
|
||||
},
|
||||
"node_modules/graceful-fs": {
|
||||
"version": "4.2.11",
|
||||
"resolved": "https://registry.npmjs.org/graceful-fs/-/graceful-fs-4.2.11.tgz",
|
||||
|
@ -1034,9 +1032,9 @@
|
|||
}
|
||||
},
|
||||
"node_modules/http-proxy-agent": {
|
||||
"version": "7.0.0",
|
||||
"resolved": "https://registry.npmjs.org/http-proxy-agent/-/http-proxy-agent-7.0.0.tgz",
|
||||
"integrity": "sha512-+ZT+iBxVUQ1asugqnD6oWoRiS25AkjNfG085dKJGtGxkdwLQrMKU5wJr2bOOFAXzKcTuqq+7fZlTMgG3SRfIYQ==",
|
||||
"version": "7.0.2",
|
||||
"resolved": "https://registry.npmjs.org/http-proxy-agent/-/http-proxy-agent-7.0.2.tgz",
|
||||
"integrity": "sha512-T1gkAiYYDWYx3V5Bmyu7HcfcvL7mUrTWiM6yOfa3PIphViJ/gFPbvidQ+veqSOHci/PxBcDabeUNCzpOODJZig==",
|
||||
"dependencies": {
|
||||
"agent-base": "^7.1.0",
|
||||
"debug": "^4.3.4"
|
||||
|
@ -1059,9 +1057,9 @@
|
|||
}
|
||||
},
|
||||
"node_modules/https-proxy-agent": {
|
||||
"version": "7.0.2",
|
||||
"resolved": "https://registry.npmjs.org/https-proxy-agent/-/https-proxy-agent-7.0.2.tgz",
|
||||
"integrity": "sha512-NmLNjm6ucYwtcUmL7JQC1ZQ57LmHP4lT15FQ8D61nak1rO6DH+fz5qNK2Ap5UN4ZapYICE3/0KodcLYSPsPbaA==",
|
||||
"version": "7.0.4",
|
||||
"resolved": "https://registry.npmjs.org/https-proxy-agent/-/https-proxy-agent-7.0.4.tgz",
|
||||
"integrity": "sha512-wlwpilI7YdjSkWaQ/7omYBMTliDcmCN8OLihO6I9B86g06lMyAoqgoDpV0XqoaPOKj+0DIdAvnsWfyAAhmimcg==",
|
||||
"dependencies": {
|
||||
"agent-base": "^7.0.2",
|
||||
"debug": "4"
|
||||
|
@ -1105,24 +1103,22 @@
|
|||
"resolved": "https://registry.npmjs.org/immediate/-/immediate-3.0.6.tgz",
|
||||
"integrity": "sha512-XXOFtyqDjNDAQxVfYxuF7g9Il/IbWmmlQg2MYKOH8ExIT1qg6xc4zyS3HaEEATgs1btfzxq15ciUiY7gjSXRGQ=="
|
||||
},
|
||||
"node_modules/inflight": {
|
||||
"version": "1.0.6",
|
||||
"resolved": "https://registry.npmjs.org/inflight/-/inflight-1.0.6.tgz",
|
||||
"integrity": "sha512-k92I/b08q4wvFscXCLvqfsHCrjrF7yiXsQuIVvVE7N82W3+aqpzuUdBbfhWcy/FZR3/4IgflMgKLOsvPDrGCJA==",
|
||||
"dependencies": {
|
||||
"once": "^1.3.0",
|
||||
"wrappy": "1"
|
||||
}
|
||||
},
|
||||
"node_modules/inherits": {
|
||||
"version": "2.0.4",
|
||||
"resolved": "https://registry.npmjs.org/inherits/-/inherits-2.0.4.tgz",
|
||||
"integrity": "sha512-k/vGaX4/Yla3WzyMCvTQOXYeIHvqOKtnqBduzTHpzpQZzAskKMhZ2K+EnBiSM9zGSoIFeMpXKxa4dYeZIQqewQ=="
|
||||
},
|
||||
"node_modules/ip": {
|
||||
"version": "1.1.8",
|
||||
"resolved": "https://registry.npmjs.org/ip/-/ip-1.1.8.tgz",
|
||||
"integrity": "sha512-PuExPYUiu6qMBQb4l06ecm6T6ujzhmh+MeJcW9wa89PoAz5pvd4zPgN5WJV104mb6S2T1AwNIAaB70JNrLQWhg=="
|
||||
"node_modules/ip-address": {
|
||||
"version": "9.0.5",
|
||||
"resolved": "https://registry.npmjs.org/ip-address/-/ip-address-9.0.5.tgz",
|
||||
"integrity": "sha512-zHtQzGojZXTwZTHQqra+ETKd4Sn3vgi7uBmlPoXVWZqYvuKmtI0l/VZTjqGmJY9x88GGOaZ9+G9ES8hC4T4X8g==",
|
||||
"dependencies": {
|
||||
"jsbn": "1.1.0",
|
||||
"sprintf-js": "^1.1.3"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">= 12"
|
||||
}
|
||||
},
|
||||
"node_modules/is-fullwidth-code-point": {
|
||||
"version": "3.0.0",
|
||||
|
@ -1153,16 +1149,17 @@
|
|||
"integrity": "sha512-Yljz7ffyPbrLpLngrMtZ7NduUgVvi6wG9RJ9IUcyCd59YQ911PBJphODUcbOVbqYfxe1wuYf/LJ8PauMRwsM/g=="
|
||||
},
|
||||
"node_modules/jsbn": {
|
||||
"version": "0.1.1",
|
||||
"resolved": "https://registry.npmjs.org/jsbn/-/jsbn-0.1.1.tgz",
|
||||
"integrity": "sha512-UVU9dibq2JcFWxQPA6KCqj5O42VOmAY3zQUfEKxU0KpTGXwNoCjkX1e13eHNvw/xPynt6pU0rZ1htjWTNTSXsg=="
|
||||
"version": "1.1.0",
|
||||
"resolved": "https://registry.npmjs.org/jsbn/-/jsbn-1.1.0.tgz",
|
||||
"integrity": "sha512-4bYVV3aAMtDTTu4+xsDYa6sy9GyJ69/amsu9sYF2zqjiEoZA5xJi3BrfX3uY+/IekIu7MwdObdbDWpoZdBv3/A=="
|
||||
},
|
||||
"node_modules/jsdom": {
|
||||
"version": "23.0.1",
|
||||
"resolved": "https://registry.npmjs.org/jsdom/-/jsdom-23.0.1.tgz",
|
||||
"integrity": "sha512-2i27vgvlUsGEBO9+/kJQRbtqtm+191b5zAZrU/UezVmnC2dlDAFLgDYJvAEi94T4kjsRKkezEtLQTgsNEsW2lQ==",
|
||||
"version": "23.2.0",
|
||||
"resolved": "https://registry.npmjs.org/jsdom/-/jsdom-23.2.0.tgz",
|
||||
"integrity": "sha512-L88oL7D/8ufIES+Zjz7v0aes+oBMh2Xnh3ygWvL0OaICOomKEPKuPnIfBJekiXr+BHbbMjrWn/xqrDQuxFTeyA==",
|
||||
"dependencies": {
|
||||
"cssstyle": "^3.0.0",
|
||||
"@asamuzakjp/dom-selector": "^2.0.1",
|
||||
"cssstyle": "^4.0.1",
|
||||
"data-urls": "^5.0.0",
|
||||
"decimal.js": "^10.4.3",
|
||||
"form-data": "^4.0.0",
|
||||
|
@ -1170,7 +1167,6 @@
|
|||
"http-proxy-agent": "^7.0.0",
|
||||
"https-proxy-agent": "^7.0.2",
|
||||
"is-potential-custom-element-name": "^1.0.1",
|
||||
"nwsapi": "^2.2.7",
|
||||
"parse5": "^7.1.2",
|
||||
"rrweb-cssom": "^0.6.0",
|
||||
"saxes": "^6.0.0",
|
||||
|
@ -1181,7 +1177,7 @@
|
|||
"whatwg-encoding": "^3.1.1",
|
||||
"whatwg-mimetype": "^4.0.0",
|
||||
"whatwg-url": "^14.0.0",
|
||||
"ws": "^8.14.2",
|
||||
"ws": "^8.16.0",
|
||||
"xml-name-validator": "^5.0.0"
|
||||
},
|
||||
"engines": {
|
||||
|
@ -1235,13 +1231,24 @@
|
|||
"integrity": "sha512-ZClg6AaYvamvYEE82d3Iyd3vSSIjQ+odgjaTzRuO3s7toCdFKczob2i0zCh7JE8kWn17yvAWhUVxvqGwUalsRA=="
|
||||
},
|
||||
"node_modules/jsonfile": {
|
||||
"version": "4.0.0",
|
||||
"resolved": "https://registry.npmjs.org/jsonfile/-/jsonfile-4.0.0.tgz",
|
||||
"integrity": "sha512-m6F1R3z8jjlf2imQHS2Qez5sjKWQzbuuhuJ/FKYFRZvPE3PuHcSMVZzfsLhGVOkfd20obL5SWEBew5ShlquNxg==",
|
||||
"version": "6.1.0",
|
||||
"resolved": "https://registry.npmjs.org/jsonfile/-/jsonfile-6.1.0.tgz",
|
||||
"integrity": "sha512-5dgndWOriYSm5cnYaJNhalLNDKOqFwyDB/rr1E9ZsGciGvKPs8R2xYGCacuf3z6K1YKDz182fd+fY3cn3pMqXQ==",
|
||||
"dependencies": {
|
||||
"universalify": "^2.0.0"
|
||||
},
|
||||
"optionalDependencies": {
|
||||
"graceful-fs": "^4.1.6"
|
||||
}
|
||||
},
|
||||
"node_modules/jsonfile/node_modules/universalify": {
|
||||
"version": "2.0.1",
|
||||
"resolved": "https://registry.npmjs.org/universalify/-/universalify-2.0.1.tgz",
|
||||
"integrity": "sha512-gptHNQghINnc/vTGIk0SOFGFNXw7JVrlRUtConJRlvaw6DuX0wO5Jeko9sWrMBhh+PsYAZ7oXAiOnf/UKogyiw==",
|
||||
"engines": {
|
||||
"node": ">= 10.0.0"
|
||||
}
|
||||
},
|
||||
"node_modules/jsprim": {
|
||||
"version": "2.0.2",
|
||||
"resolved": "https://registry.npmjs.org/jsprim/-/jsprim-2.0.2.tgz",
|
||||
|
@ -1375,6 +1382,11 @@
|
|||
"node": ">=12"
|
||||
}
|
||||
},
|
||||
"node_modules/mdn-data": {
|
||||
"version": "2.0.30",
|
||||
"resolved": "https://registry.npmjs.org/mdn-data/-/mdn-data-2.0.30.tgz",
|
||||
"integrity": "sha512-GaqWWShW4kv/G9IEucWScBx9G1/vsFZZJUO+tD26M8J8z3Kw5RDQjaoZe03YAClgeS/SWPOcb4nkFBTEi5DUEA=="
|
||||
},
|
||||
"node_modules/mime-db": {
|
||||
"version": "1.52.0",
|
||||
"resolved": "https://registry.npmjs.org/mime-db/-/mime-db-1.52.0.tgz",
|
||||
|
@ -1394,17 +1406,6 @@
|
|||
"node": ">= 0.6"
|
||||
}
|
||||
},
|
||||
"node_modules/minimatch": {
|
||||
"version": "3.1.2",
|
||||
"resolved": "https://registry.npmjs.org/minimatch/-/minimatch-3.1.2.tgz",
|
||||
"integrity": "sha512-J7p63hRiAjw1NDEww1W7i37+ByIrOWO5XQQAzZ3VOcL0PNybwpfmV/N05zFAzwQ9USyEcX6t3UO+K5aqBQOIHw==",
|
||||
"dependencies": {
|
||||
"brace-expansion": "^1.1.7"
|
||||
},
|
||||
"engines": {
|
||||
"node": "*"
|
||||
}
|
||||
},
|
||||
"node_modules/mitt": {
|
||||
"version": "3.0.1",
|
||||
"resolved": "https://registry.npmjs.org/mitt/-/mitt-3.0.1.tgz",
|
||||
|
@ -1461,9 +1462,9 @@
|
|||
}
|
||||
},
|
||||
"node_modules/nwsapi": {
|
||||
"version": "2.2.7",
|
||||
"resolved": "https://registry.npmjs.org/nwsapi/-/nwsapi-2.2.7.tgz",
|
||||
"integrity": "sha512-ub5E4+FBPKwAZx0UwIQOjYWGHTEq5sPqHQNRN8Z9e4A7u3Tj1weLJsL59yH9vmvqEtBHaOmT6cYQKIZOxp35FQ=="
|
||||
"version": "2.2.9",
|
||||
"resolved": "https://registry.npmjs.org/nwsapi/-/nwsapi-2.2.9.tgz",
|
||||
"integrity": "sha512-2f3F0SEEer8bBu0dsNCFF50N0cTThV1nWFYcEYFZttdW0lDAoybv9cQoK7X7/68Z89S7FoRrVjP1LPX4XRf9vg=="
|
||||
},
|
||||
"node_modules/oauth-sign": {
|
||||
"version": "0.9.0",
|
||||
|
@ -1500,12 +1501,11 @@
|
|||
}
|
||||
},
|
||||
"node_modules/pac-resolver": {
|
||||
"version": "7.0.0",
|
||||
"resolved": "https://registry.npmjs.org/pac-resolver/-/pac-resolver-7.0.0.tgz",
|
||||
"integrity": "sha512-Fd9lT9vJbHYRACT8OhCbZBbxr6KRSawSovFpy8nDGshaK99S/EBhVIHp9+crhxrsZOuvLpgL1n23iyPg6Rl2hg==",
|
||||
"version": "7.0.1",
|
||||
"resolved": "https://registry.npmjs.org/pac-resolver/-/pac-resolver-7.0.1.tgz",
|
||||
"integrity": "sha512-5NPgf87AT2STgwa2ntRMr45jTKrYBGkVU36yT0ig/n/GMAa3oPqhZfIQ2kMEimReg0+t9kZViDVZ83qfVUlckg==",
|
||||
"dependencies": {
|
||||
"degenerator": "^5.0.0",
|
||||
"ip": "^1.1.8",
|
||||
"netmask": "^2.0.2"
|
||||
},
|
||||
"engines": {
|
||||
|
@ -1539,14 +1539,6 @@
|
|||
"url": "https://github.com/fb55/entities?sponsor=1"
|
||||
}
|
||||
},
|
||||
"node_modules/path-is-absolute": {
|
||||
"version": "1.0.1",
|
||||
"resolved": "https://registry.npmjs.org/path-is-absolute/-/path-is-absolute-1.0.1.tgz",
|
||||
"integrity": "sha512-AVbw3UJ2e9bq64vSaS9Am0fje1Pa8pbGqTTsmXfaIiMpnr5DlDhfJOuLj9Sf95ZPVDAUerDfEk88MPmPe7UCQg==",
|
||||
"engines": {
|
||||
"node": ">=0.10.0"
|
||||
}
|
||||
},
|
||||
"node_modules/pend": {
|
||||
"version": "1.2.0",
|
||||
"resolved": "https://registry.npmjs.org/pend/-/pend-1.2.0.tgz",
|
||||
|
@ -1648,25 +1640,25 @@
|
|||
}
|
||||
},
|
||||
"node_modules/puppeteer-core": {
|
||||
"version": "21.5.2",
|
||||
"resolved": "https://registry.npmjs.org/puppeteer-core/-/puppeteer-core-21.5.2.tgz",
|
||||
"integrity": "sha512-v4T0cWnujSKs+iEfmb8ccd7u4/x8oblEyKqplqKnJ582Kw8PewYAWvkH4qUWhitN3O2q9RF7dzkvjyK5HbzjLA==",
|
||||
"version": "22.0.0",
|
||||
"resolved": "https://registry.npmjs.org/puppeteer-core/-/puppeteer-core-22.0.0.tgz",
|
||||
"integrity": "sha512-S3s91rLde0A86PWVeNY82h+P0fdS7CTiNWAicCVH/bIspRP4nS2PnO5j+VTFqCah0ZJizGzpVPAmxVYbLxTc9w==",
|
||||
"dependencies": {
|
||||
"@puppeteer/browsers": "1.8.0",
|
||||
"chromium-bidi": "0.4.33",
|
||||
"@puppeteer/browsers": "2.0.0",
|
||||
"chromium-bidi": "0.5.8",
|
||||
"cross-fetch": "4.0.0",
|
||||
"debug": "4.3.4",
|
||||
"devtools-protocol": "0.0.1203626",
|
||||
"ws": "8.14.2"
|
||||
"devtools-protocol": "0.0.1232444",
|
||||
"ws": "8.16.0"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=16.13.2"
|
||||
"node": ">=18"
|
||||
}
|
||||
},
|
||||
"node_modules/puppeteer-core/node_modules/ws": {
|
||||
"version": "8.14.2",
|
||||
"resolved": "https://registry.npmjs.org/ws/-/ws-8.14.2.tgz",
|
||||
"integrity": "sha512-wEBG1ftX4jcglPxgFCMJmZ2PLtSbJ2Peg6TmpJFTbe9GZYOQCDPdMYu/Tm0/bGZkw8paZnJY45J4K2PZrLYq8g==",
|
||||
"version": "8.16.0",
|
||||
"resolved": "https://registry.npmjs.org/ws/-/ws-8.16.0.tgz",
|
||||
"integrity": "sha512-HS0c//TP7Ina87TfiPUz1rQzMhHrl/SG2guqRcTOIUYD2q8uhUdNHZYJUaQ8aTGPzCh+c6oawMKW35nFl1dxyQ==",
|
||||
"engines": {
|
||||
"node": ">=10.0.0"
|
||||
},
|
||||
|
@ -1703,8 +1695,7 @@
|
|||
},
|
||||
"node_modules/readability-extractor": {
|
||||
"version": "0.0.11",
|
||||
"resolved": "git+ssh://git@github.com/ArchiveBox/readability-extractor.git#2fb4689a65c6433036453dcbee7a268840604eb9",
|
||||
"license": "MIT",
|
||||
"resolved": "git+ssh://git@github.com/ArchiveBox/readability-extractor.git#057f2046f9535cfc6df7b8d551aaad32a9e6226c",
|
||||
"dependencies": {
|
||||
"@mozilla/readability": "^0.5.0",
|
||||
"dompurify": "^3.0.6",
|
||||
|
@ -1740,25 +1731,19 @@
|
|||
"node": ">=0.10.0"
|
||||
}
|
||||
},
|
||||
"node_modules/require-from-string": {
|
||||
"version": "2.0.2",
|
||||
"resolved": "https://registry.npmjs.org/require-from-string/-/require-from-string-2.0.2.tgz",
|
||||
"integrity": "sha512-Xf0nWe6RseziFMu+Ap9biiUbmplq6S9/p+7w7YXP/JBHhrUDDUhwa+vANyubuqfZWTveU//DYVGsDG7RKL/vEw==",
|
||||
"engines": {
|
||||
"node": ">=0.10.0"
|
||||
}
|
||||
},
|
||||
"node_modules/requires-port": {
|
||||
"version": "1.0.0",
|
||||
"resolved": "https://registry.npmjs.org/requires-port/-/requires-port-1.0.0.tgz",
|
||||
"integrity": "sha512-KigOCHcocU3XODJxsu8i/j8T9tzT4adHiecwORRQ0ZZFcp7ahwXuRU1m+yuO90C5ZUyGeGfocHDI14M3L3yDAQ=="
|
||||
},
|
||||
"node_modules/rimraf": {
|
||||
"version": "3.0.2",
|
||||
"resolved": "https://registry.npmjs.org/rimraf/-/rimraf-3.0.2.tgz",
|
||||
"integrity": "sha512-JZkJMZkAGFFPP2YqXZXPbMlMBgsxzE8ILs4lMIX/2o0L9UBw9O/Y3o6wFw/i9YLapcUJWwqbi3kdxIPdC62TIA==",
|
||||
"dependencies": {
|
||||
"glob": "^7.1.3"
|
||||
},
|
||||
"bin": {
|
||||
"rimraf": "bin.js"
|
||||
},
|
||||
"funding": {
|
||||
"url": "https://github.com/sponsors/isaacs"
|
||||
}
|
||||
},
|
||||
"node_modules/rrweb-cssom": {
|
||||
"version": "0.6.0",
|
||||
"resolved": "https://registry.npmjs.org/rrweb-cssom/-/rrweb-cssom-0.6.0.tgz",
|
||||
|
@ -1800,9 +1785,9 @@
|
|||
}
|
||||
},
|
||||
"node_modules/selenium-webdriver": {
|
||||
"version": "4.15.0",
|
||||
"resolved": "https://registry.npmjs.org/selenium-webdriver/-/selenium-webdriver-4.15.0.tgz",
|
||||
"integrity": "sha512-BNG1bq+KWiBGHcJ/wULi0eKY0yaDqFIbEmtbsYJmfaEghdCkXBsx1akgOorhNwjBipOr0uwpvNXqT6/nzl+zjg==",
|
||||
"version": "4.17.0",
|
||||
"resolved": "https://registry.npmjs.org/selenium-webdriver/-/selenium-webdriver-4.17.0.tgz",
|
||||
"integrity": "sha512-e2E+2XBlGepzwgFbyQfSwo9Cbj6G5fFfs9MzAS00nC99EewmcS2rwn2MwtgfP7I5p1e7DYv4HQJXtWedsu6DvA==",
|
||||
"dependencies": {
|
||||
"jszip": "^3.10.1",
|
||||
"tmp": "^0.2.1",
|
||||
|
@ -1818,16 +1803,16 @@
|
|||
"integrity": "sha512-MATJdZp8sLqDl/68LfQmbP8zKPLQNV6BIZoIgrscFDQ+RsvK/BxeDQOgyxKKoh0y/8h3BqVFnCqQ/gd+reiIXA=="
|
||||
},
|
||||
"node_modules/single-file-cli": {
|
||||
"version": "1.1.46",
|
||||
"resolved": "https://registry.npmjs.org/single-file-cli/-/single-file-cli-1.1.46.tgz",
|
||||
"integrity": "sha512-+vFj0a5Y4ESqpMwH0T6738pg8ZA9KVhhl6OlIOsicamGNU9DnMa+q9dL1S2KnLWHoauKjU0BThhR/YKUleJSxw==",
|
||||
"version": "1.1.54",
|
||||
"resolved": "https://registry.npmjs.org/single-file-cli/-/single-file-cli-1.1.54.tgz",
|
||||
"integrity": "sha512-wnVPg7BklhswwFVrtuFXbmluI4piHxg2dC0xATxYTeXAld6PnRPlnp7ufallRKArjFBZdP2u+ihMkOIp7A38XA==",
|
||||
"dependencies": {
|
||||
"file-url": "3.0.0",
|
||||
"iconv-lite": "0.6.3",
|
||||
"jsdom": "23.0.0",
|
||||
"puppeteer-core": "21.5.2",
|
||||
"selenium-webdriver": "4.15.0",
|
||||
"single-file-core": "1.3.15",
|
||||
"jsdom": "24.0.0",
|
||||
"puppeteer-core": "22.0.0",
|
||||
"selenium-webdriver": "4.17.0",
|
||||
"single-file-core": "1.3.24",
|
||||
"strong-data-uri": "1.0.6",
|
||||
"yargs": "17.7.2"
|
||||
},
|
||||
|
@ -1847,11 +1832,11 @@
|
|||
}
|
||||
},
|
||||
"node_modules/single-file-cli/node_modules/jsdom": {
|
||||
"version": "23.0.0",
|
||||
"resolved": "https://registry.npmjs.org/jsdom/-/jsdom-23.0.0.tgz",
|
||||
"integrity": "sha512-cbL/UCtohJguhFC7c2/hgW6BeZCNvP7URQGnx9tSJRYKCdnfbfWOrtuLTMfiB2VxKsx5wPHVsh/J0aBy9lIIhQ==",
|
||||
"version": "24.0.0",
|
||||
"resolved": "https://registry.npmjs.org/jsdom/-/jsdom-24.0.0.tgz",
|
||||
"integrity": "sha512-UDS2NayCvmXSXVP6mpTj+73JnNQadZlr9N68189xib2tx5Mls7swlTNao26IoHv46BZJFvXygyRtyXd1feAk1A==",
|
||||
"dependencies": {
|
||||
"cssstyle": "^3.0.0",
|
||||
"cssstyle": "^4.0.1",
|
||||
"data-urls": "^5.0.0",
|
||||
"decimal.js": "^10.4.3",
|
||||
"form-data": "^4.0.0",
|
||||
|
@ -1870,14 +1855,14 @@
|
|||
"whatwg-encoding": "^3.1.1",
|
||||
"whatwg-mimetype": "^4.0.0",
|
||||
"whatwg-url": "^14.0.0",
|
||||
"ws": "^8.14.2",
|
||||
"ws": "^8.16.0",
|
||||
"xml-name-validator": "^5.0.0"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=18"
|
||||
},
|
||||
"peerDependencies": {
|
||||
"canvas": "^3.0.0"
|
||||
"canvas": "^2.11.2"
|
||||
},
|
||||
"peerDependenciesMeta": {
|
||||
"canvas": {
|
||||
|
@ -1909,9 +1894,9 @@
|
|||
}
|
||||
},
|
||||
"node_modules/single-file-core": {
|
||||
"version": "1.3.15",
|
||||
"resolved": "https://registry.npmjs.org/single-file-core/-/single-file-core-1.3.15.tgz",
|
||||
"integrity": "sha512-/YNpHBwASWNxmSmZXz0xRolmXf0+PGAbwpVrwn6A8tYeuAdezxxde5RYTTQ7V4Zv68+H4JMhE2DwCRV0sVUGNA=="
|
||||
"version": "1.3.24",
|
||||
"resolved": "https://registry.npmjs.org/single-file-core/-/single-file-core-1.3.24.tgz",
|
||||
"integrity": "sha512-1B256mKBbNV8jXAV+hRyEv0aMa7tn0C0Ci+zx7Ya4ZXZB3b9/1MgKsB/fxVwDiL28WJSU0pxzh8ftIYubCNn9w=="
|
||||
},
|
||||
"node_modules/smart-buffer": {
|
||||
"version": "4.2.0",
|
||||
|
@ -1923,24 +1908,24 @@
|
|||
}
|
||||
},
|
||||
"node_modules/socks": {
|
||||
"version": "2.7.1",
|
||||
"resolved": "https://registry.npmjs.org/socks/-/socks-2.7.1.tgz",
|
||||
"integrity": "sha512-7maUZy1N7uo6+WVEX6psASxtNlKaNVMlGQKkG/63nEDdLOWNbiUMoLK7X4uYoLhQstau72mLgfEWcXcwsaHbYQ==",
|
||||
"version": "2.8.3",
|
||||
"resolved": "https://registry.npmjs.org/socks/-/socks-2.8.3.tgz",
|
||||
"integrity": "sha512-l5x7VUUWbjVFbafGLxPWkYsHIhEvmF85tbIeFZWc8ZPtoMyybuEhL7Jye/ooC4/d48FgOjSJXgsF/AJPYCW8Zw==",
|
||||
"dependencies": {
|
||||
"ip": "^2.0.0",
|
||||
"ip-address": "^9.0.5",
|
||||
"smart-buffer": "^4.2.0"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">= 10.13.0",
|
||||
"node": ">= 10.0.0",
|
||||
"npm": ">= 3.0.0"
|
||||
}
|
||||
},
|
||||
"node_modules/socks-proxy-agent": {
|
||||
"version": "8.0.2",
|
||||
"resolved": "https://registry.npmjs.org/socks-proxy-agent/-/socks-proxy-agent-8.0.2.tgz",
|
||||
"integrity": "sha512-8zuqoLv1aP/66PHF5TqwJ7Czm3Yv32urJQHrVyhD7mmA6d61Zv8cIXQYPTWwmg6qlupnPvs/QKDmfa4P/qct2g==",
|
||||
"version": "8.0.3",
|
||||
"resolved": "https://registry.npmjs.org/socks-proxy-agent/-/socks-proxy-agent-8.0.3.tgz",
|
||||
"integrity": "sha512-VNegTZKhuGq5vSD6XNKlbqWhyt/40CgoEw8XxD6dhnm8Jq9IEa3nIa4HwnM8XOqU0CdB0BwWVXusqiFXfHB3+A==",
|
||||
"dependencies": {
|
||||
"agent-base": "^7.0.2",
|
||||
"agent-base": "^7.1.1",
|
||||
"debug": "^4.3.4",
|
||||
"socks": "^2.7.1"
|
||||
},
|
||||
|
@ -1948,11 +1933,6 @@
|
|||
"node": ">= 14"
|
||||
}
|
||||
},
|
||||
"node_modules/socks/node_modules/ip": {
|
||||
"version": "2.0.0",
|
||||
"resolved": "https://registry.npmjs.org/ip/-/ip-2.0.0.tgz",
|
||||
"integrity": "sha512-WKa+XuLG1A1R0UWhl2+1XQSi+fZWMsYKffMZTTYsiZaUD8k2yDAj5atimTUD2TZkyCkNEeYE5NhFZmupOGtjYQ=="
|
||||
},
|
||||
"node_modules/source-map": {
|
||||
"version": "0.6.1",
|
||||
"resolved": "https://registry.npmjs.org/source-map/-/source-map-0.6.1.tgz",
|
||||
|
@ -1962,6 +1942,19 @@
|
|||
"node": ">=0.10.0"
|
||||
}
|
||||
},
|
||||
"node_modules/source-map-js": {
|
||||
"version": "1.2.0",
|
||||
"resolved": "https://registry.npmjs.org/source-map-js/-/source-map-js-1.2.0.tgz",
|
||||
"integrity": "sha512-itJW8lvSA0TXEphiRoawsCksnlf8SyvmFzIhltqAHluXd88pkCd+cXJVHTDwdCr0IzwptSm035IHQktUu1QUMg==",
|
||||
"engines": {
|
||||
"node": ">=0.10.0"
|
||||
}
|
||||
},
|
||||
"node_modules/sprintf-js": {
|
||||
"version": "1.1.3",
|
||||
"resolved": "https://registry.npmjs.org/sprintf-js/-/sprintf-js-1.1.3.tgz",
|
||||
"integrity": "sha512-Oo+0REFV59/rz3gfJNKQiBlwfHaSESl1pcGyABQsnnIfWOFt6JNj5gCog2U6MLZ//IGYD+nA8nI+mTShREReaA=="
|
||||
},
|
||||
"node_modules/sshpk": {
|
||||
"version": "1.18.0",
|
||||
"resolved": "https://registry.npmjs.org/sshpk/-/sshpk-1.18.0.tgz",
|
||||
|
@ -1986,6 +1979,11 @@
|
|||
"node": ">=0.10.0"
|
||||
}
|
||||
},
|
||||
"node_modules/sshpk/node_modules/jsbn": {
|
||||
"version": "0.1.1",
|
||||
"resolved": "https://registry.npmjs.org/jsbn/-/jsbn-0.1.1.tgz",
|
||||
"integrity": "sha512-UVU9dibq2JcFWxQPA6KCqj5O42VOmAY3zQUfEKxU0KpTGXwNoCjkX1e13eHNvw/xPynt6pU0rZ1htjWTNTSXsg=="
|
||||
},
|
||||
"node_modules/stream-length": {
|
||||
"version": "1.0.2",
|
||||
"resolved": "https://registry.npmjs.org/stream-length/-/stream-length-1.0.2.tgz",
|
||||
|
@ -1995,12 +1993,15 @@
|
|||
}
|
||||
},
|
||||
"node_modules/streamx": {
|
||||
"version": "2.15.6",
|
||||
"resolved": "https://registry.npmjs.org/streamx/-/streamx-2.15.6.tgz",
|
||||
"integrity": "sha512-q+vQL4AAz+FdfT137VF69Cc/APqUbxy+MDOImRrMvchJpigHj9GksgDU2LYbO9rx7RX6osWgxJB2WxhYv4SZAw==",
|
||||
"version": "2.16.1",
|
||||
"resolved": "https://registry.npmjs.org/streamx/-/streamx-2.16.1.tgz",
|
||||
"integrity": "sha512-m9QYj6WygWyWa3H1YY69amr4nVgy61xfjys7xO7kviL5rfIEc2naf+ewFiOA+aEJD7y0JO3h2GoiUv4TDwEGzQ==",
|
||||
"dependencies": {
|
||||
"fast-fifo": "^1.1.0",
|
||||
"queue-tick": "^1.0.1"
|
||||
},
|
||||
"optionalDependencies": {
|
||||
"bare-events": "^2.2.0"
|
||||
}
|
||||
},
|
||||
"node_modules/string_decoder": {
|
||||
|
@ -2067,9 +2068,9 @@
|
|||
}
|
||||
},
|
||||
"node_modules/tar-stream": {
|
||||
"version": "3.1.6",
|
||||
"resolved": "https://registry.npmjs.org/tar-stream/-/tar-stream-3.1.6.tgz",
|
||||
"integrity": "sha512-B/UyjYwPpMBv+PaFSWAmtYjwdrlEaZQEhMIBFNC5oEG8lpiW8XjcSdmEaClj28ArfKScKHs2nshz3k2le6crsg==",
|
||||
"version": "3.1.7",
|
||||
"resolved": "https://registry.npmjs.org/tar-stream/-/tar-stream-3.1.7.tgz",
|
||||
"integrity": "sha512-qJj60CXt7IU1Ffyc3NJMjh6EkuCFej46zUqJ4J7pqYlThyd9bO0XBTmcOIhSzZJVWfsLks0+nle/j538YAW9RQ==",
|
||||
"dependencies": {
|
||||
"b4a": "^1.6.4",
|
||||
"fast-fifo": "^1.2.0",
|
||||
|
@ -2082,20 +2083,17 @@
|
|||
"integrity": "sha512-w89qg7PI8wAdvX60bMDP+bFoD5Dvhm9oLheFp5O4a2QF0cSBGsBX4qZmadPMvVqlLJBBci+WqGGOAPvcDeNSVg=="
|
||||
},
|
||||
"node_modules/tmp": {
|
||||
"version": "0.2.1",
|
||||
"resolved": "https://registry.npmjs.org/tmp/-/tmp-0.2.1.tgz",
|
||||
"integrity": "sha512-76SUhtfqR2Ijn+xllcI5P1oyannHNHByD80W1q447gU3mp9G9PSpGdWmjUOHRDPiHYacIk66W7ubDTuPF3BEtQ==",
|
||||
"dependencies": {
|
||||
"rimraf": "^3.0.0"
|
||||
},
|
||||
"version": "0.2.3",
|
||||
"resolved": "https://registry.npmjs.org/tmp/-/tmp-0.2.3.tgz",
|
||||
"integrity": "sha512-nZD7m9iCPC5g0pYmcaxogYKggSfLsdxl8of3Q/oIbqCqLLIO9IAF0GWjX1z9NZRHPiXv8Wex4yDCaZsgEw0Y8w==",
|
||||
"engines": {
|
||||
"node": ">=8.17.0"
|
||||
"node": ">=14.14"
|
||||
}
|
||||
},
|
||||
"node_modules/tough-cookie": {
|
||||
"version": "4.1.3",
|
||||
"resolved": "https://registry.npmjs.org/tough-cookie/-/tough-cookie-4.1.3.tgz",
|
||||
"integrity": "sha512-aX/y5pVRkfRnfmuX+OdbSdXvPe6ieKX/G2s7e98f4poJHnqH3281gDPm/metm6E/WRamfx7WC4HUqkWHfQHprw==",
|
||||
"version": "4.1.4",
|
||||
"resolved": "https://registry.npmjs.org/tough-cookie/-/tough-cookie-4.1.4.tgz",
|
||||
"integrity": "sha512-Loo5UUvLD9ScZ6jh8beX1T6sO1w2/MpCRpEP7V280GKMVUQ0Jzar2U3UJPsrdbziLEMMhu3Ujnq//rhiFuIeag==",
|
||||
"dependencies": {
|
||||
"psl": "^1.1.33",
|
||||
"punycode": "^2.1.1",
|
||||
|
@ -2125,9 +2123,9 @@
|
|||
"integrity": "sha512-AEYxH93jGFPn/a2iVAwW87VuUIkR1FVUKB77NwMF7nBTDkDrrT/Hpt/IrCJ0QXhW27jTBDcf5ZY7w6RiqTMw2Q=="
|
||||
},
|
||||
"node_modules/turndown": {
|
||||
"version": "7.1.2",
|
||||
"resolved": "https://registry.npmjs.org/turndown/-/turndown-7.1.2.tgz",
|
||||
"integrity": "sha512-ntI9R7fcUKjqBP6QU8rBK2Ehyt8LAzt3UBT9JR9tgo6GtuKvyUzpayWmeMKJw1DPdXzktvtIT8m2mVXz+bL/Qg==",
|
||||
"version": "7.1.3",
|
||||
"resolved": "https://registry.npmjs.org/turndown/-/turndown-7.1.3.tgz",
|
||||
"integrity": "sha512-Z3/iJ6IWh8VBiACWQJaA5ulPQE5E1QwvBHj00uGzdQxdRnd8fh1DPqNOJqzQDu6DkOstORrtXzf/9adB+vMtEA==",
|
||||
"dependencies": {
|
||||
"domino": "^2.1.6"
|
||||
}
|
||||
|
@ -2178,9 +2176,9 @@
|
|||
}
|
||||
},
|
||||
"node_modules/urlpattern-polyfill": {
|
||||
"version": "9.0.0",
|
||||
"resolved": "https://registry.npmjs.org/urlpattern-polyfill/-/urlpattern-polyfill-9.0.0.tgz",
|
||||
"integrity": "sha512-WHN8KDQblxd32odxeIgo83rdVDE2bvdkb86it7bMhYZwWKJz0+O0RK/eZiHYnM+zgt/U7hAHOlCQGfjjvSkw2g=="
|
||||
"version": "10.0.0",
|
||||
"resolved": "https://registry.npmjs.org/urlpattern-polyfill/-/urlpattern-polyfill-10.0.0.tgz",
|
||||
"integrity": "sha512-H/A06tKD7sS1O1X2SshBVeA5FLycRpjqiBeqGKmBwBDBy28EnRjORxTNe269KSSr5un5qyWi1iL61wLxpd+ZOg=="
|
||||
},
|
||||
"node_modules/util-deprecate": {
|
||||
"version": "1.0.2",
|
||||
|
@ -2298,9 +2296,9 @@
|
|||
"integrity": "sha512-l4Sp/DRseor9wL6EvV2+TuQn63dMkPjZ/sp9XkghTEbV9KlPS1xUsZ3u7/IQO4wxtcFB4bgpQPRcR3QCvezPcQ=="
|
||||
},
|
||||
"node_modules/ws": {
|
||||
"version": "8.16.0",
|
||||
"resolved": "https://registry.npmjs.org/ws/-/ws-8.16.0.tgz",
|
||||
"integrity": "sha512-HS0c//TP7Ina87TfiPUz1rQzMhHrl/SG2guqRcTOIUYD2q8uhUdNHZYJUaQ8aTGPzCh+c6oawMKW35nFl1dxyQ==",
|
||||
"version": "8.17.0",
|
||||
"resolved": "https://registry.npmjs.org/ws/-/ws-8.17.0.tgz",
|
||||
"integrity": "sha512-uJq6108EgZMAl20KagGkzCKfMEjxmKvZHG7Tlq0Z6nOky7YF7aq4mOx6xK8TJ/i1LeK4Qus7INktacctDgY8Ow==",
|
||||
"engines": {
|
||||
"node": ">=10.0.0"
|
||||
},
|
||||
|
|
|
package.json
@@ -1,6 +1,6 @@
{
    "name": "archivebox",
    "version": "0.7.2",
    "version": "0.8.0",
    "description": "ArchiveBox: The self-hosted internet archive",
    "author": "Nick Sweeting <archivebox-npm@sweeting.me>",
    "repository": "github:ArchiveBox/ArchiveBox",
@@ -8,6 +8,6 @@
    "dependencies": {
        "@postlight/parser": "^2.2.3",
        "readability-extractor": "github:ArchiveBox/readability-extractor",
        "single-file-cli": "^1.1.46"
        "single-file-cli": "^1.1.54"
    }
}
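To see which of these pinned JS packages are actually installed in a checkout or image, standard npm commands suffice (a sketch):

$ npm ls single-file-cli @postlight/parser readability-extractor
$ npm install    # re-resolves package-lock.json after hand-editing package.json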
pip_dist (submodule, 2 changes)
@@ -1 +1 @@
Subproject commit 5323fc773d33ef3f219c35c946f3b353b1251d37
Subproject commit 1380be7e4ef156d85957dfef8c6d154ef9880578
pyproject.toml (148 changes)
@@ -1,26 +1,48 @@
[project]
name = "archivebox"
version = "0.7.2"
version = "0.8.0"
package-dir = "archivebox"
requires-python = ">=3.10,<3.13"
platform = "py3-none-any"
description = "Self-hosted internet archiving solution."
authors = [
    {name = "Nick Sweeting", email = "pyproject.toml@archivebox.io"},
]
dependencies = [
    "croniter>=0.3.34",
    "dateparser>=1.0.0",
    "django-extensions>=3.0.3",
    "django>=3.1.3,<3.2",
    "ipython>5.0.0",
    "mypy-extensions>=0.4.3",
    "python-crontab>=2.5.1",
    "requests>=2.24.0",
    "w3lib>=1.22.0",
    "yt-dlp>=2023.10.13",
    # "playwright>=1.39.0; platform_machine != 'armv7l'",
]
requires-python = ">=3.9,<3.12"
readme = "README.md"
authors = [{name = "Nick Sweeting", email = "pyproject.toml@archivebox.io"}]
license = {text = "MIT"}
readme = "README.md"

# pdm install
# pdm update --unconstrained
dependencies = [
    # Last Bumped: 2024-04-25
    # Base Framework and Language Dependencies
    "setuptools>=69.5.1",
    "django>=5.0.4,<6.0",
    "django-ninja>=1.1.0",
    "django-extensions>=3.2.3",
    "mypy-extensions>=1.0.0",
    # Python Helper Libraries
    "requests>=2.31.0",
    "dateparser>=1.0.0",
    "feedparser>=6.0.11",
    "w3lib>=2.1.2",
    # Feature-Specific Dependencies
    "python-crontab>=3.0.0",    # for: archivebox schedule
    "croniter>=2.0.5",          # for: archivebox schedule
    "ipython>=8.23.0",          # for: archivebox shell
    # Extractor Dependencies
    "yt-dlp>=2024.4.9",         # for: media
    # "playwright>=1.43.0; platform_machine != 'armv7l'",    # WARNING: playwright doesn't have any sdist, causes trouble on build systems that refuse to install wheel-only packages
    # TODO: add more extractors
    #   - gallery-dl
    #   - scihubdl
    #   - See Github issues for more...
    "django-signal-webhooks>=0.3.0",
    "django-admin-data-views>=0.3.1",
]

homepage = "https://github.com/ArchiveBox/ArchiveBox"
repository = "https://github.com/ArchiveBox/ArchiveBox"
documentation = "https://github.com/ArchiveBox/ArchiveBox/wiki"
keywords = ["internet archiving", "web archiving", "digipres", "warc", "preservation", "backups", "archiving", "web", "bookmarks", "puppeteer", "browser", "download"]
classifiers = [
    "Development Status :: 4 - Beta",
    "Environment :: Console",
@@ -36,9 +58,6 @@ classifiers = [
    "Natural Language :: English",
    "Operating System :: OS Independent",
    "Programming Language :: Python :: 3",
    "Programming Language :: Python :: 3.7",
    "Programming Language :: Python :: 3.8",
    "Programming Language :: Python :: 3.9",
    "Programming Language :: Python :: 3.10",
    "Programming Language :: Python :: 3.11",
    "Programming Language :: Python :: 3.12",
@@ -53,27 +72,52 @@ classifiers = [
    "Topic :: Utilities",
    "Typing :: Typed",
]
# dynamic = ["version"]    # TODO: programmatically fetch version from package.json at build time

# pdm lock -G:all
# pdm lock --group=':all'
# pdm install -G:all
# pdm update --group=':all' --unconstrained
[project.optional-dependencies]
sonic = [
    # echo "deb [signed-by=/usr/share/keyrings/valeriansaliou_sonic.gpg] https://packagecloud.io/valeriansaliou/sonic/debian/ bookworm main" > /etc/apt/sources.list.d/valeriansaliou_sonic.list
    # curl -fsSL https://packagecloud.io/valeriansaliou/sonic/gpgkey | gpg --dearmor -o /usr/share/keyrings/valeriansaliou_sonic.gpg
    # apt install sonic
    "sonic-client>=1.0.0",
]
ldap = [
    # apt install libldap2-dev libsasl2-dev python3-ldap
    "python-ldap>=3.4.3",
    "django-auth-ldap>=4.1.0",
]


# pdm lock --group=':all' --dev
# pdm install -G:all --dev
# pdm update --dev --unconstrained
[tool.pdm.dev-dependencies]
dev = [
    # build
    "setuptools>=69.0.3",
build = [
    # "pdm",    # usually installed by apt/brew, don't double-install with pip
    "setuptools>=69.5.1",
    "pip",
    "wheel",
    "pdm",
    "homebrew-pypi-poet>=0.10.0",
    # docs
    "homebrew-pypi-poet>=0.10.0",    # for: generating archivebox.rb brewfile list of python packages
]
docs = [
    "recommonmark",
    "sphinx",
    "sphinx-rtd-theme",
    # debug
]
debug = [
    "django-debug-toolbar",
    "djdt_flamegraph",
    "ipdb",
    # test
    "requests-tracker>=0.3.3",
]
test = [
    "pytest",
    # lint
    "bottle",
]
lint = [
    "flake8",
    "mypy",
    "django-stubs",
@ -84,29 +128,33 @@ lint = "./bin/lint.sh"
|
|||
test = "./bin/test.sh"
|
||||
# all = {composite = ["lint mypackage/", "test -v tests/"]}
|
||||
|
||||
[project.optional-dependencies]
|
||||
sonic = [
|
||||
# echo "deb [signed-by=/usr/share/keyrings/valeriansaliou_sonic.gpg] https://packagecloud.io/valeriansaliou/sonic/debian/ bookworm main" > /etc/apt/sources.list.d/valeriansaliou_sonic.list
|
||||
# curl -fsSL https://packagecloud.io/valeriansaliou/sonic/gpgkey | gpg --dearmor -o /usr/share/keyrings/valeriansaliou_sonic.gpg
|
||||
"sonic-client>=0.0.5",
|
||||
]
|
||||
ldap = [
|
||||
# apt install libldap2-dev libsasl2-dev
|
||||
"python-ldap>=3.4.3",
|
||||
"django-auth-ldap>=4.1.0",
|
||||
]
|
||||
# playwright = [
|
||||
# platform_machine isnt respected by pdm export -o requirements.txt, this breaks arm/v7
|
||||
# "playwright>=1.39.0; platform_machine != 'armv7l'",
|
||||
# ]
|
||||
|
||||
[project.scripts]
|
||||
archivebox = "archivebox.cli:main"
|
||||
|
||||
[build-system]
|
||||
requires = ["pdm-backend"]
|
||||
build-backend = "pdm.backend"
|
||||
|
||||
[project.scripts]
|
||||
archivebox = "archivebox.cli:main"
|
||||
|
||||
|
||||
[tool.pytest.ini_options]
testpaths = [ "tests" ]

[tool.mypy]
mypy_path = "archivebox"
namespace_packages = true
explicit_package_bases = true
# follow_imports = "silent"
# ignore_missing_imports = true
# disallow_incomplete_defs = true
# disallow_untyped_defs = true
# disallow_untyped_decorators = true
# exclude = "pdm/(pep582/|models/in_process/.+\\.py)"
plugins = ["mypy_django_plugin.main"]

[tool.django-stubs]
django_settings_module = "core.settings"

[project.urls]
Homepage = "https://github.com/ArchiveBox/ArchiveBox"
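
For reference, the [project.scripts] entry above is what makes installing the package produce an archivebox executable: the installer generates a wrapper script that imports archivebox.cli and calls its main(). A minimal sketch of the shape such an entry point takes (illustrative only; the real archivebox.cli.main in the repo handles argument parsing and command dispatch):

    # sketch of a console-script target, not the actual ArchiveBox CLI code
    import sys

    def main(args=None):
        args = sys.argv[1:] if args is None else list(args)
        print('archivebox invoked with:', args)

    if __name__ == '__main__':
        main()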

@@ -1,54 +1,70 @@
# This file is @generated by PDM.
# Please do not edit it manually.

asgiref==3.7.2
annotated-types==0.6.0
anyio==4.3.0
asgiref==3.8.1
asttokens==2.4.1
brotli==1.1.0; implementation_name == "cpython"
brotlicffi==1.1.0.0; implementation_name != "cpython"
certifi==2023.11.17
cffi==1.16.0; implementation_name != "cpython"
certifi==2024.2.2
cffi==1.16.0; platform_python_implementation != "PyPy" or implementation_name != "cpython"
charset-normalizer==3.3.2
colorama==0.4.6; sys_platform == "win32"
croniter==2.0.1
croniter==2.0.5
cryptography==42.0.7
dateparser==1.2.0
decorator==5.1.1
django==3.1.14
django-auth-ldap==4.1.0
django-extensions==3.1.5
exceptiongroup==1.2.0; python_version < "3.11"
django==5.0.6
django-admin-data-views==0.3.1
django-auth-ldap==4.8.0
django-extensions==3.2.3
django-ninja==1.1.0
django-settings-holder==0.1.2
django-signal-webhooks==0.3.0
exceptiongroup==1.2.1; python_version < "3.11"
executing==2.0.1
idna==3.6
ipython==8.18.1
feedparser==6.0.11
h11==0.14.0
httpcore==1.0.5
httpx==0.27.0
idna==3.7
ipython==8.24.0
jedi==0.19.1
matplotlib-inline==0.1.6
matplotlib-inline==0.1.7
mutagen==1.47.0
mypy-extensions==1.0.0
parso==0.8.3
pexpect==4.9.0; sys_platform != "win32"
parso==0.8.4
pexpect==4.9.0; sys_platform != "win32" and sys_platform != "emscripten"
prompt-toolkit==3.0.43
ptyprocess==0.7.0; sys_platform != "win32"
ptyprocess==0.7.0; sys_platform != "win32" and sys_platform != "emscripten"
pure-eval==0.2.2
pyasn1==0.5.1
pyasn1-modules==0.3.0
pycparser==2.21; implementation_name != "cpython"
pycryptodomex==3.19.1
pygments==2.17.2
pyasn1==0.6.0
pyasn1-modules==0.4.0
pycparser==2.22; platform_python_implementation != "PyPy" or implementation_name != "cpython"
pycryptodomex==3.20.0
pydantic==2.7.1
pydantic-core==2.18.2
pygments==2.18.0
python-crontab==3.0.0
python-dateutil==2.8.2
python-dateutil==2.9.0.post0
python-ldap==3.4.4
pytz==2023.3.post1
regex==2023.12.25
pytz==2024.1
regex==2024.5.10
requests==2.31.0
setuptools==69.5.1
sgmllib3k==1.0.0
six==1.16.0
sniffio==1.3.1
sonic-client==1.0.0
sqlparse==0.4.4
sqlparse==0.5.0
stack-data==0.6.3
traitlets==5.14.1
typing-extensions==4.9.0; python_version < "3.11"
tzdata==2023.4; platform_system == "Windows"
traitlets==5.14.3
typing-extensions==4.11.0
tzdata==2024.1; sys_platform == "win32" or platform_system == "Windows"
tzlocal==5.2
urllib3==2.1.0
urllib3==2.2.1
w3lib==2.1.2
wcwidth==0.2.12
wcwidth==0.2.13
websockets==12.0
yt-dlp==2023.12.30
yt-dlp==2024.4.9
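
The "; ..." suffixes in this lockfile are PEP 508 environment markers: the pinned package is only installed when the marker evaluates to true on the target interpreter, which is how one lockfile can serve Windows, macOS, Linux, CPython, and PyPy at once. A quick illustration using the packaging library (assumed available for the demo; it is not itself pinned in this file):

    # illustration: evaluate an environment marker like the ones above
    from packaging.markers import Marker

    marker = Marker('sys_platform != "win32" and sys_platform != "emscripten"')
    print(marker.evaluate())  # True on Linux/macOS, False on Windows
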
@@ -50,4 +50,4 @@ def redirect_to_static(filename):


def start():
    run(host='localhost', port=8080)
    run(host='localhost', port=8080, quiet=True)
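
The only functional change here is quiet=True, which tells bottle to suppress its startup banner and per-request log lines so the mock server does not pollute test output. A self-contained sketch of the same pattern (hypothetical route, not the repo's actual mock server):

    from bottle import route, run

    @route('/static/<filename>')
    def serve(filename):  # hypothetical handler for illustration
        return 'would serve %s here' % filename

    if __name__ == '__main__':
        run(host='localhost', port=8080, quiet=True)  # no banner, no access log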

tests/mock_server/templates/example-single.jsonl (Normal file, +1 line)

@@ -0,0 +1 @@
{"href":"http://127.0.0.1:8080/static/example.com.html","description":"Example","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"Tag1 Tag2","trap":"http://www.example.com/should-not-exist"}
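
JSONL ("JSON Lines") means one complete JSON object per line, so a consumer can stream the file line by line instead of parsing one large array. A minimal sketch of how a fixture like this can be read (illustrative, not ArchiveBox's parser code):

    import json

    # path assumed for the sketch
    with open('tests/mock_server/templates/example-single.jsonl', encoding='utf-8') as f:
        for line in f:
            if line.strip():
                entry = json.loads(line)
                print(entry['href'], entry.get('tags', ''))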

tests/mock_server/templates/example.atom (Normal file, +24 lines)

@@ -0,0 +1,24 @@
<?xml version="1.0" encoding="utf-8"?>
<feed
  xml:lang="en"
  xmlns="http://www.w3.org/2005/Atom"
>
  <id>http://www.example.com/</id>
  <title>Example of an Atom feed</title>
  <link rel="self" type="application/atom+xml" href="http://www.example.com/index.atom" />
  <link rel="alternate" type="text/html" href="http://www.example.com/" />
  <author>
    <name>Jim Winstead</name>
  </author>
  <updated>2024-02-26T03:18:26Z</updated>
  <entry>
    <title>Example</title>
    <link rel="alternate" type="text/html" href="http://127.0.0.1:8080/static/example.com.html" />
    <id>tag:example.com,2024-02-25:3319</id>
    <updated>2024-02-26T03:18:26Z</updated>
    <published>2024-02-25T19:18:25-08:00</published>
    <category term="Tag1" scheme="http://example.com/archive" />
    <category term="Tag2" scheme="http://example.com/archive" />
    <content type="html">This is some <b>content</b></content>
  </entry>
</feed>
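
This Atom fixture is deliberately fed through --parser=rss in test_atom below; feedparser (pinned as feedparser==6.0.11 in the lockfile above) handles RSS and Atom through the same API. A quick sketch of what it yields for this file:

    import feedparser

    d = feedparser.parse('tests/mock_server/templates/example.atom')  # path assumed
    entry = d.entries[0]
    print(entry.link)                        # http://127.0.0.1:8080/static/example.com.html
    print([tag.term for tag in entry.tags])  # ['Tag1', 'Tag2'] from the <category> elements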

tests/mock_server/templates/example.json (Normal file, +6 lines)

@@ -0,0 +1,6 @@
[
{"href":"http://127.0.0.1:8080/static/example.com.html","description":"Example","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"Tag1 Tag2","trap":"http://www.example.com/should-not-exist"},
{"href":"http://127.0.0.1:8080/static/iana.org.html","description":"Example 2","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:43Z","shared":"no","toread":"no","tags":"Tag3,Tag4 with Space"},
{"href":"http://127.0.0.1:8080/static/shift_jis.html","description":"Example 2","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:44Z","shared":"no","toread":"no","tags":["Tag5","Tag6 with Space"]},
{"href":"http://127.0.0.1:8080/static/title_og_with_html","description":"Example 2","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:45Z","shared":"no","toread":"no"}
]
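
Note the extra "trap" key in the first record: it holds a URL that no parser should ever import. If the JSON parser failed and ArchiveBox fell back to a looser parser that scans stdin for anything URL-shaped, the trap URL would be indexed too; that is what the "should-not-exist" assertions in the tests below detect. Roughly (illustrative regex, not the real fallback parser):

    import re

    blob = '{"href": "http://good.example/page", "trap": "http://www.example.com/should-not-exist"}'
    print(re.findall(r'https?://[^\s"\']+', blob))
    # both URLs match, so seeing the trap URL in the index means a fallback parser ran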

tests/mock_server/templates/example.json.bad (Normal file, +2 lines)

@@ -0,0 +1,2 @@
this line would cause problems but --parser=json will actually skip it
[{"href":"http://127.0.0.1:8080/static/example.com.html","description":"Example","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"Tag1 Tag2","trap":"http://www.example.com/should-not-exist"}]
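
The first line above is deliberate garbage: it pins down that --parser=json tolerates one junk line before the JSON list (some browser and bookmark exports prepend one). One plausible way to implement that recovery, as a sketch only and not ArchiveBox's actual code:

    import json

    def load_forgiving_json(text):
        # sketch: accept a JSON list even when a stray first line precedes it
        try:
            return json.loads(text)
        except json.JSONDecodeError:
            return json.loads(text.split('\n', 1)[1])  # drop the junk line, retry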

tests/mock_server/templates/example.jsonl (Normal file, +4 lines)

@@ -0,0 +1,4 @@
{"href":"http://127.0.0.1:8080/static/example.com.html","description":"Example","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"Tag1 Tag2","trap":"http://www.example.com/should-not-exist"}
|
||||
{"href":"http://127.0.0.1:8080/static/iana.org.html","description":"Example 2","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:43Z","shared":"no","toread":"no","tags":"Tag3,Tag4 with Space"}
|
||||
{"href":"http://127.0.0.1:8080/static/shift_jis.html","description":"Example 2","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:44Z","shared":"no","toread":"no","tags":["Tag5","Tag6 with Space"]}
|
||||
{"href":"http://127.0.0.1:8080/static/title_og_with_html","description":"Example 2","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:45Z","shared":"no","toread":"no"}
|

tests/mock_server/templates/example.rss (Normal file, +32 lines)

@@ -0,0 +1,32 @@
<?xml version="1.0" encoding="utf-8"?>
<rss version="2.0"
  xmlns:dc="http://purl.org/dc/elements/1.1/"
  xmlns:admin="http://webns.net/mvcb/"
  xmlns:content="http://purl.org/rss/1.0/modules/content/"
  xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
  <channel>
    <title>Sample Feed</title>
    <link>http://example.org/</link>
    <description>For documentation only</description>
    <dc:language>en-us</dc:language>
    <dc:creator>Nobody (nobody@example.org)</dc:creator>
    <dc:rights>Public domain</dc:rights>
    <dc:date>2024-02-26T17:28:12-08:00</dc:date>
    <admin:generatorAgent rdf:resource="http://www.example.org/"/>
    <admin:errorReportsTo rdf:resource="mailto:nobody@example.org"/>

    <item>
      <title>First!</title>
      <link>http://127.0.0.1:8080/static/example.com.html</link>
      <guid isPermaLink="false">just-an@example.org</guid>
      <description>
        This has a description.
      </description>
      <dc:subject>Tag1 Tag2</dc:subject>
      <dc:date>2024-02-26T17:28:12-08:00</dc:date>
      <content:encoded><![CDATA[
        This has a <b>description</b>.]]>
      </content:encoded>
    </item>
  </channel>
</rss>
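
The same RSS fixture backs two tests below with different expectations: the generic rss parser keeps the <dc:subject>Tag1 Tag2</dc:subject> text as a single literal tag, while the pinboard_rss parser applies Pinboard's convention of space-separated tags and splits it. In miniature (an illustration of the asserted behaviour, not parser code):

    subject = 'Tag1 Tag2'            # the <dc:subject> text above
    generic_tags = [subject]         # --parser=rss          -> ['Tag1 Tag2']
    pinboard_tags = subject.split()  # --parser=pinboard_rss -> ['Tag1', 'Tag2']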

@@ -91,3 +91,198 @@ def test_extract_input_uses_only_passed_extractors(tmp_path, process):

    assert (archived_item_path / "warc").exists()
    assert not (archived_item_path / "singlefile.html").exists()

def test_json(tmp_path, process, disable_extractors_dict):
    with open('../../mock_server/templates/example.json', 'r', encoding='utf-8') as f:
        arg_process = subprocess.run(
            ["archivebox", "add", "--index-only", "--parser=json"],
            stdin=f,
            capture_output=True,
            env=disable_extractors_dict,
        )

    conn = sqlite3.connect("index.sqlite3")
    c = conn.cursor()
    urls = c.execute("SELECT url from core_snapshot").fetchall()
    tags = c.execute("SELECT name from core_tag").fetchall()
    conn.commit()
    conn.close()

    urls = list(map(lambda x: x[0], urls))
    assert "http://127.0.0.1:8080/static/example.com.html" in urls
    assert "http://127.0.0.1:8080/static/iana.org.html" in urls
    assert "http://127.0.0.1:8080/static/shift_jis.html" in urls
    assert "http://127.0.0.1:8080/static/title_og_with_html" in urls
    # if the following URL appears, we must have fallen back to another parser
    assert not "http://www.example.com/should-not-exist" in urls

    tags = list(map(lambda x: x[0], tags))
    assert "Tag1" in tags
    assert "Tag2" in tags
    assert "Tag3" in tags
    assert "Tag4 with Space" in tags
    assert "Tag5" in tags
    assert "Tag6 with Space" in tags

def test_json_with_leading_garbage(tmp_path, process, disable_extractors_dict):
    with open('../../mock_server/templates/example.json.bad', 'r', encoding='utf-8') as f:
        arg_process = subprocess.run(
            ["archivebox", "add", "--index-only", "--parser=json"],
            stdin=f,
            capture_output=True,
            env=disable_extractors_dict,
        )

    conn = sqlite3.connect("index.sqlite3")
    c = conn.cursor()
    urls = c.execute("SELECT url from core_snapshot").fetchall()
    tags = c.execute("SELECT name from core_tag").fetchall()
    conn.commit()
    conn.close()

    urls = list(map(lambda x: x[0], urls))
    assert "http://127.0.0.1:8080/static/example.com.html" in urls
    # if the following URL appears, we must have fallen back to another parser
    assert not "http://www.example.com/should-not-exist" in urls

    tags = list(map(lambda x: x[0], tags))
    assert "Tag1" in tags
    assert "Tag2" in tags

def test_generic_rss(tmp_path, process, disable_extractors_dict):
    with open('../../mock_server/templates/example.rss', 'r', encoding='utf-8') as f:
        arg_process = subprocess.run(
            ["archivebox", "add", "--index-only", "--parser=rss"],
            stdin=f,
            capture_output=True,
            env=disable_extractors_dict,
        )

    conn = sqlite3.connect("index.sqlite3")
    c = conn.cursor()
    urls = c.execute("SELECT url from core_snapshot").fetchall()
    tags = c.execute("SELECT name from core_tag").fetchall()
    conn.commit()
    conn.close()

    urls = list(map(lambda x: x[0], urls))
    assert "http://127.0.0.1:8080/static/example.com.html" in urls
    # if the following URL appears, we must have fallen back to another parser
    assert not "http://purl.org/dc/elements/1.1/" in urls

    tags = list(map(lambda x: x[0], tags))
    assert "Tag1 Tag2" in tags

def test_pinboard_rss(tmp_path, process, disable_extractors_dict):
    with open('../../mock_server/templates/example.rss', 'r', encoding='utf-8') as f:
        arg_process = subprocess.run(
            ["archivebox", "add", "--index-only", "--parser=pinboard_rss"],
            stdin=f,
            capture_output=True,
            env=disable_extractors_dict,
        )

    conn = sqlite3.connect("index.sqlite3")
    c = conn.cursor()
    tags = c.execute("SELECT name from core_tag").fetchall()
    conn.commit()
    conn.close()

    tags = list(map(lambda x: x[0], tags))
    assert "Tag1" in tags
    assert "Tag2" in tags

def test_atom(tmp_path, process, disable_extractors_dict):
    with open('../../mock_server/templates/example.atom', 'r', encoding='utf-8') as f:
        arg_process = subprocess.run(
            ["archivebox", "add", "--index-only", "--parser=rss"],
            stdin=f,
            capture_output=True,
            env=disable_extractors_dict,
        )

    conn = sqlite3.connect("index.sqlite3")
    c = conn.cursor()
    urls = c.execute("SELECT url from core_snapshot").fetchall()
    tags = c.execute("SELECT name from core_tag").fetchall()
    conn.commit()
    conn.close()

    urls = list(map(lambda x: x[0], urls))
    assert "http://127.0.0.1:8080/static/example.com.html" in urls
    # if the following URL appears, we must have fallen back to another parser
    assert not "http://www.w3.org/2005/Atom" in urls

    tags = list(map(lambda x: x[0], tags))
    assert "Tag1" in tags
    assert "Tag2" in tags

def test_jsonl(tmp_path, process, disable_extractors_dict):
    with open('../../mock_server/templates/example.jsonl', 'r', encoding='utf-8') as f:
        arg_process = subprocess.run(
            ["archivebox", "add", "--index-only", "--parser=jsonl"],
            stdin=f,
            capture_output=True,
            env=disable_extractors_dict,
        )

    conn = sqlite3.connect("index.sqlite3")
    c = conn.cursor()
    urls = c.execute("SELECT url from core_snapshot").fetchall()
    tags = c.execute("SELECT name from core_tag").fetchall()
    conn.commit()
    conn.close()

    urls = list(map(lambda x: x[0], urls))
    assert "http://127.0.0.1:8080/static/example.com.html" in urls
    assert "http://127.0.0.1:8080/static/iana.org.html" in urls
    assert "http://127.0.0.1:8080/static/shift_jis.html" in urls
    assert "http://127.0.0.1:8080/static/title_og_with_html" in urls
    # if the following URL appears, we must have fallen back to another parser
    assert not "http://www.example.com/should-not-exist" in urls

    tags = list(map(lambda x: x[0], tags))
    assert "Tag1" in tags
    assert "Tag2" in tags
    assert "Tag3" in tags
    assert "Tag4 with Space" in tags
    assert "Tag5" in tags
    assert "Tag6 with Space" in tags

def test_jsonl_single(tmp_path, process, disable_extractors_dict):
    with open('../../mock_server/templates/example-single.jsonl', 'r', encoding='utf-8') as f:
        arg_process = subprocess.run(
            ["archivebox", "add", "--index-only", "--parser=jsonl"],
            stdin=f,
            capture_output=True,
            env=disable_extractors_dict,
        )

    conn = sqlite3.connect("index.sqlite3")
    c = conn.cursor()
    urls = c.execute("SELECT url from core_snapshot").fetchall()
    tags = c.execute("SELECT name from core_tag").fetchall()
    conn.commit()
    conn.close()

    urls = list(map(lambda x: x[0], urls))
    assert "http://127.0.0.1:8080/static/example.com.html" in urls
    # if the following URL appears, we must have fallen back to another parser
    assert not "http://www.example.com/should-not-exist" in urls

    tags = list(map(lambda x: x[0], tags))
    assert "Tag1" in tags
    assert "Tag2" in tags

# make sure that JSON parser rejects a single line of JSONL which is valid
# JSON but not our expected format
def test_json_single(tmp_path, process, disable_extractors_dict):
    with open('../../mock_server/templates/example-single.jsonl', 'r', encoding='utf-8') as f:
        arg_process = subprocess.run(
            ["archivebox", "add", "--index-only", "--parser=json"],
            stdin=f,
            capture_output=True,
            env=disable_extractors_dict,
        )

    assert 'expects list of objects' in arg_process.stderr.decode("utf-8")