1
0
Fork 0
mirror of synced 2024-06-26 10:00:19 +12:00

Merge pull request #1249 from ArchiveBox/pdm

This commit is contained in:
Nick Sweeting 2023-10-20 04:23:25 -07:00 committed by GitHub
commit a58535baff
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
17 changed files with 2992 additions and 1594 deletions

View file

@ -5,16 +5,21 @@ __pycache__/
.mypy_cache/
.pytest_cache/
.github/
.git/
.pdm-build/
.pdm-python/
.eggs/
venv/
.venv/
.docker-venv/
node_modules/
build/
dist/
pip_dist/
!pip_dist/archivebox.egg-info/requires.txt
brew_dist/
deb_dist/
pip_dist/
assets/
data/

2
.gitignore vendored
View file

@ -13,6 +13,8 @@ venv/
node_modules/
# Packaging artifacts
.pdm-python
.pdm-build
archivebox.egg-info
archivebox-*.tar.gz
build/

View file

@ -16,15 +16,17 @@
# Archivebox](https://github.com/ArchiveBox/ArchiveBox#archivebox-development).
FROM python:3.11-slim-bullseye
FROM debian:bookworm-backports
LABEL name="archivebox" \
maintainer="Nick Sweeting <archivebox-docker@sweeting.me>" \
maintainer="Nick Sweeting <dockerfile@archivebox.io>" \
description="All-in-one personal internet archiving container" \
homepage="https://github.com/ArchiveBox/ArchiveBox" \
documentation="https://github.com/ArchiveBox/ArchiveBox/wiki/Docker#docker"
# System-level base config
######### Base System Setup ####################################
# Global system-level config
ENV TZ=UTC \
LANGUAGE=en_US:en \
LC_ALL=C.UTF-8 \
@ -32,103 +34,146 @@ ENV TZ=UTC \
PYTHONIOENCODING=UTF-8 \
PYTHONUNBUFFERED=1 \
DEBIAN_FRONTEND=noninteractive \
APT_KEY_DONT_WARN_ON_DANGEROUS_USAGE=1
APT_KEY_DONT_WARN_ON_DANGEROUS_USAGE=1 \
npm_config_loglevel=error
# Application-level base config
# Application-level config
ENV CODE_DIR=/app \
VENV_PATH=/venv \
DATA_DIR=/data \
NODE_DIR=/node \
GLOBAL_VENV=/venv \
APP_VENV=/app/.venv \
NODE_MODULES=/app/node_modules \
ARCHIVEBOX_USER="archivebox"
ENV PATH="$PATH:$GLOBAL_VENV/bin:$APP_VENV/bin:$NODE_MODULES/.bin"
# Create non-privileged user for archivebox and chrome
RUN groupadd --system $ARCHIVEBOX_USER \
&& useradd --system --create-home --gid $ARCHIVEBOX_USER --groups audio,video $ARCHIVEBOX_USER
RUN echo "[*] Setting up system environment..." \
&& groupadd --system $ARCHIVEBOX_USER \
&& useradd --system --create-home --gid $ARCHIVEBOX_USER --groups audio,video $ARCHIVEBOX_USER \
&& mkdir -p /etc/apt/keyrings
# Install system dependencies
RUN apt-get update -qq \
&& apt-get install -qq -y --no-install-recommends \
apt-transport-https ca-certificates gnupg2 zlib1g-dev \
dumb-init gosu cron unzip curl \
# Install system apt dependencies (adding backports to access more recent apt updates)
RUN echo "[+] Installing system dependencies..." \
&& echo 'deb https://deb.debian.org/debian bullseye-backports main contrib non-free' >> /etc/apt/sources.list.d/backports.list \
&& apt-get update -qq \
&& apt-get install -qq -y \
apt-transport-https ca-certificates gnupg2 curl wget \
zlib1g-dev dumb-init gosu cron unzip \
nano iputils-ping dnsutils htop procps \
# 1. packaging dependencies
# 2. docker and init system dependencies
# 3. frivolous CLI helpers to make debugging failed archiving easier
&& mkdir -p /etc/apt/keyrings \
&& rm -rf /var/lib/apt/lists/*
# Install apt dependencies
RUN apt-get update -qq \
&& apt-get install -qq -y --no-install-recommends \
wget curl chromium git ffmpeg youtube-dl ripgrep \
fontconfig fonts-ipafont-gothic fonts-wqy-zenhei fonts-thai-tlwg fonts-kacst fonts-symbola fonts-noto fonts-freefont-ttf \
&& ln -s /usr/bin/chromium /usr/bin/chromium-browser \
&& rm -rf /var/lib/apt/lists/*
######### Language Environments ####################################
# Install Node environment
RUN curl -s https://deb.nodesource.com/gpgkey/nodesource.gpg.key | apt-key add - \
&& echo 'deb https://deb.nodesource.com/node_18.x buster main' >> /etc/apt/sources.list \
RUN echo "[+] Installing Node environment..." \
&& echo 'deb [signed-by=/etc/apt/keyrings/nodesource.gpg] https://deb.nodesource.com/node_20.x nodistro main' >> /etc/apt/sources.list.d/nodejs.list \
&& curl -fsSL https://deb.nodesource.com/gpgkey/nodesource-repo.gpg.key | gpg --dearmor | gpg --dearmor -o /etc/apt/keyrings/nodesource.gpg \
&& apt-get update -qq \
&& apt-get install -qq -y --no-install-recommends \
nodejs \
# && npm install -g npm \
&& apt-get install -qq -y nodejs \
&& npm i -g npm \
&& node --version \
&& npm --version
# Install Python environment
RUN echo "[+] Installing Python environment..." \
&& apt-get update -qq \
&& apt-get install -qq -y -t bookworm-backports --no-install-recommends \
python3 python3-pip python3-venv python3-setuptools python3-wheel python-dev-is-python3 \
python3-ldap libldap2-dev libsasl2-dev libssl-dev \
&& rm /usr/lib/python3*/EXTERNALLY-MANAGED \
&& python3 -m venv --system-site-packages --symlinks $GLOBAL_VENV \
&& $GLOBAL_VENV/bin/pip install --upgrade pip pdm setuptools wheel python-ldap \
&& rm -rf /var/lib/apt/lists/*
######### Extractor Dependencies ##################################
# Install apt dependencies
RUN echo "[+] Installing extractor APT dependencies..." \
&& apt-get update -qq \
&& apt-get install -qq -y -t bookworm-backports --no-install-recommends \
curl wget git yt-dlp ffmpeg ripgrep \
# Packages we have also needed in the past:
# youtube-dl wget2 aria2 python3-pyxattr rtmpdump libfribidi-bin mpv \
# fontconfig fonts-ipafont-gothic fonts-wqy-zenhei fonts-thai-tlwg fonts-kacst fonts-symbola fonts-noto fonts-freefont-ttf \
&& rm -rf /var/lib/apt/lists/*
# Install chromium browser using playwright
ENV PLAYWRIGHT_BROWSERS_PATH="/browsers"
RUN echo "[+] Installing extractor Chromium dependency..." \
&& apt-get update -qq \
&& $GLOBAL_VENV/bin/pip install playwright \
&& $GLOBAL_VENV/bin/playwright install --with-deps chromium \
&& CHROME_BINARY="$($GLOBAL_VENV/bin/python -c 'from playwright.sync_api import sync_playwright; print(sync_playwright().start().chromium.executable_path)')" \
&& ln -s "$CHROME_BINARY" /usr/bin/chromium-browser \
&& mkdir -p "/home/${ARCHIVEBOX_USER}/.config/chromium/Crash Reports/pending/" \
&& chown -R $ARCHIVEBOX_USER "/home/${ARCHIVEBOX_USER}/.config"
# Install Node dependencies
WORKDIR "$NODE_DIR"
ENV PATH="${PATH}:$NODE_DIR/node_modules/.bin" \
npm_config_loglevel=error
ADD ./package.json ./package.json
ADD ./package-lock.json ./package-lock.json
RUN npm ci
# Install Python dependencies
WORKDIR "$CODE_DIR"
ENV PATH="${PATH}:$VENV_PATH/bin"
RUN python -m venv --clear --symlinks "$VENV_PATH" \
&& pip install --upgrade --quiet pip setuptools \
&& mkdir -p "$CODE_DIR/archivebox"
ADD "./setup.py" "$CODE_DIR/"
ADD "./package.json" "$CODE_DIR/archivebox/"
RUN apt-get update -qq \
&& apt-get install -qq -y --no-install-recommends \
build-essential python-dev python3-dev libldap2-dev libsasl2-dev \
&& echo 'empty placeholder for setup.py to use' > "$CODE_DIR/archivebox/README.md" \
&& python3 -c 'from distutils.core import run_setup; result = run_setup("./setup.py", stop_after="init"); print("\n".join(result.install_requires + result.extras_require["sonic"]))' > /tmp/requirements.txt \
&& pip install -r /tmp/requirements.txt \
&& pip install --upgrade youtube-dl yt-dlp \
&& apt-get purge -y build-essential python-dev python3-dev libldap2-dev libsasl2-dev \
&& apt-get autoremove -y \
&& rm -rf /var/lib/apt/lists/*
COPY --chown=root:root --chmod=755 "package.json" "package-lock.json" "$CODE_DIR/"
RUN echo "[+] Installing extractor Node dependencies..." \
&& npm ci --prefer-offline --no-audit \
&& npm version
# Install apt development dependencies
# RUN apt-get install -qq \
# && apt-get install -qq -y --no-install-recommends \
# python3 python3-dev python3-pip python3-venv python3-all \
# dh-python debhelper devscripts dput software-properties-common \
# python3-distutils python3-setuptools python3-wheel python3-stdeb
# RUN python3 -c 'from distutils.core import run_setup; result = run_setup("./setup.py", stop_after="init"); print("\n".join(result.extras_require["dev"]))' > /tmp/dev_requirements.txt \
# && pip install --quiet -r /tmp/dev_requirements.txt
######### Build Dependencies ####################################
# Install ArchiveBox Python package and its dependencies
WORKDIR "$CODE_DIR"
ADD . "$CODE_DIR"
RUN chown -R root:root . && chmod a+rX -R . && pip install -e .
# # Installing Python dependencies to build from source
# WORKDIR "$CODE_DIR"
# COPY --chown=root:root --chmod=755 "./pyproject.toml" "./pdm.lock" "$CODE_DIR/"
# RUN echo "[+] Installing project Python dependencies..." \
# && apt-get update -qq \
# && apt-get install -qq -y -t bookworm-backports --no-install-recommends \
# build-essential libssl-dev libldap2-dev libsasl2-dev \
# && pdm use -f $GLOBAL_VENV \
# && pdm install --fail-fast --no-lock --group :all --no-self \
# && pdm build \
# && apt-get purge -y \
# build-essential libssl-dev libldap2-dev libsasl2-dev \
# # these are only needed to build CPython libs, we discard after build phase to shrink layer size
# && apt-get autoremove -y \
# && rm -rf /var/lib/apt/lists/*
# Install ArchiveBox Python package from source
COPY --chown=root:root --chmod=755 "." "$CODE_DIR/"
RUN echo "[*] Installing ArchiveBox package from /app..." \
&& apt-get update -qq \
&& $GLOBAL_VENV/bin/pip install -e "$CODE_DIR"[sonic,ldap]
####################################################
# Setup ArchiveBox runtime config
WORKDIR "$DATA_DIR"
ENV IN_DOCKER=True \
WGET_BINARY="wget" \
YOUTUBEDL_BINARY="yt-dlp" \
CHROME_SANDBOX=False \
CHROME_BINARY="/usr/bin/chromium-browser" \
USE_SINGLEFILE=True \
SINGLEFILE_BINARY="$NODE_DIR/node_modules/.bin/single-file" \
SINGLEFILE_BINARY="$NODE_MODULES/.bin/single-file" \
USE_READABILITY=True \
READABILITY_BINARY="$NODE_DIR/node_modules/.bin/readability-extractor" \
READABILITY_BINARY="$NODE_MODULES/.bin/readability-extractor" \
USE_MERCURY=True \
MERCURY_BINARY="$NODE_DIR/node_modules/.bin/mercury-parser" \
YOUTUBEDL_BINARY="yt-dlp"
MERCURY_BINARY="$NODE_MODULES/.bin/postlight-parser"
# Print version for nice docker finish summary
# RUN archivebox version
RUN /app/bin/docker_entrypoint.sh archivebox version
RUN echo "[√] Finished Docker build succesfully. Saving build summary in: /version_info.txt" \
&& uname -a | tee -a /version_info.txt \
&& env --chdir="$NODE_DIR" npm version | tee -a /version_info.txt \
&& env --chdir="$CODE_DIR" pdm info | tee -a /version_info.txt \
&& "$CODE_DIR/bin/docker_entrypoint.sh" archivebox version 2>&1 | tee -a /version_info.txt
####################################################
# Open up the interfaces to the outside world
VOLUME "$DATA_DIR"
VOLUME "/data"
EXPOSE 8000
# Optional:

View file

@ -231,12 +231,11 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
'CURL_BINARY': {'type': str, 'default': 'curl'},
'GIT_BINARY': {'type': str, 'default': 'git'},
'WGET_BINARY': {'type': str, 'default': 'wget'},
'WGET_BINARY': {'type': str, 'default': 'wget'}, # also can accept wget2
'SINGLEFILE_BINARY': {'type': str, 'default': lambda c: bin_path('single-file')},
'READABILITY_BINARY': {'type': str, 'default': lambda c: bin_path('readability-extractor')},
'MERCURY_BINARY': {'type': str, 'default': lambda c: bin_path('mercury-parser')},
#'YOUTUBEDL_BINARY': {'type': str, 'default': 'youtube-dl'},
'YOUTUBEDL_BINARY': {'type': str, 'default': 'yt-dlp'},
'MERCURY_BINARY': {'type': str, 'default': lambda c: bin_path('postlight-parser')},
'YOUTUBEDL_BINARY': {'type': str, 'default': 'yt-dlp'}, # also can accept youtube-dl
'NODE_BINARY': {'type': str, 'default': 'node'},
'RIPGREP_BINARY': {'type': str, 'default': 'rg'},
'CHROME_BINARY': {'type': str, 'default': None},
@ -435,7 +434,7 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
'READABILITY_VERSION': {'default': lambda c: bin_version(c['READABILITY_BINARY']) if c['USE_READABILITY'] else None},
'USE_MERCURY': {'default': lambda c: c['USE_MERCURY'] and c['SAVE_MERCURY']},
'MERCURY_VERSION': {'default': lambda c: '1.0.0' if shutil.which(str(bin_path(c['MERCURY_BINARY']))) else None}, # mercury is unversioned
'MERCURY_VERSION': {'default': lambda c: '1.0.0' if shutil.which(str(bin_path(c['MERCURY_BINARY']))) else None}, # mercury doesnt expose version info until this is merged https://github.com/postlight/parser/pull/750
'USE_GIT': {'default': lambda c: c['USE_GIT'] and c['SAVE_GIT']},
'GIT_VERSION': {'default': lambda c: bin_version(c['GIT_BINARY']) if c['USE_GIT'] else None},

View file

@ -6,9 +6,6 @@ import re
import logging
import tempfile
import ldap
from django_auth_ldap.config import LDAPSearch
from pathlib import Path
from django.utils.crypto import get_random_string
@ -97,33 +94,43 @@ AUTHENTICATION_BACKENDS = [
]
if LDAP:
global AUTH_LDAP_SERVER_URI
AUTH_LDAP_SERVER_URI = LDAP_SERVER_URI
try:
import ldap
from django_auth_ldap.config import LDAPSearch
global AUTH_LDAP_BIND_DN
AUTH_LDAP_BIND_DN = LDAP_BIND_DN
global AUTH_LDAP_SERVER_URI
global AUTH_LDAP_BIND_DN
global AUTH_LDAP_BIND_PASSWORD
global AUTH_LDAP_USER_SEARCH
global AUTH_LDAP_USER_ATTR_MAP
global AUTH_LDAP_BIND_PASSWORD
AUTH_LDAP_BIND_PASSWORD = LDAP_BIND_PASSWORD
AUTH_LDAP_SERVER_URI = LDAP_SERVER_URI
AUTH_LDAP_BIND_DN = LDAP_BIND_DN
AUTH_LDAP_BIND_PASSWORD = LDAP_BIND_PASSWORD
global AUTH_LDAP_USER_SEARCH
AUTH_LDAP_USER_SEARCH = LDAPSearch(
LDAP_USER_BASE,
ldap.SCOPE_SUBTREE,
'(&(' + LDAP_USERNAME_ATTR + '=%(user)s)' + LDAP_USER_FILTER + ')',
)
assert AUTH_LDAP_SERVER_URI and LDAP_USERNAME_ATTR and LDAP_USER_FILTER, 'LDAP_* config options must all be set if LDAP=True'
global AUTH_LDAP_USER_ATTR_MAP
AUTH_LDAP_USER_ATTR_MAP = {
'username': LDAP_USERNAME_ATTR,
'first_name': LDAP_FIRSTNAME_ATTR,
'last_name': LDAP_LASTNAME_ATTR,
'email': LDAP_EMAIL_ATTR,
}
AUTH_LDAP_USER_SEARCH = LDAPSearch(
LDAP_USER_BASE,
ldap.SCOPE_SUBTREE,
'(&(' + LDAP_USERNAME_ATTR + '=%(user)s)' + LDAP_USER_FILTER + ')',
)
AUTH_LDAP_USER_ATTR_MAP = {
'username': LDAP_USERNAME_ATTR,
'first_name': LDAP_FIRSTNAME_ATTR,
'last_name': LDAP_LASTNAME_ATTR,
'email': LDAP_EMAIL_ATTR,
}
AUTHENTICATION_BACKENDS = [
'django_auth_ldap.backend.LDAPBackend',
]
except ModuleNotFoundError:
sys.stderr.write('[X] Error: Found LDAP=True config but LDAP packages not installed. You may need to run: pip install archivebox[ldap]\n\n')
# dont hard exit here. in case the user is just running "archivebox version" or "archivebox help", we still want those to work despite broken ldap
# sys.exit(1)
AUTHENTICATION_BACKENDS = [
'django_auth_ldap.backend.LDAPBackend',
]
################################################################################
### Debug Settings

View file

@ -71,7 +71,7 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO
result = run(cmd, cwd=out_dir, timeout=timeout)
try:
result_json = json.loads(result.stdout)
assert result_json and 'content' in result_json
assert result_json and 'content' in result_json, 'Readability output is not valid JSON'
except json.JSONDecodeError:
raise ArchiveError('Readability was not able to archive the page', result.stdout + result.stderr)
@ -85,7 +85,7 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO
# "Downloaded: 76 files, 4.0M in 1.6s (2.52 MB/s)"
output_tail = [
line.strip()
for line in (result.stdout + result.stderr).decode().rsplit('\n', 3)[-3:]
for line in (result.stdout + result.stderr).decode().rsplit('\n', 5)[-5:]
if line.strip()
]
hints = (

View file

@ -533,11 +533,27 @@ def log_shell_welcome_msg():
### Helpers
@enforce_types
def pretty_path(path: Union[Path, str]) -> str:
def pretty_path(path: Union[Path, str], pwd: Union[Path, str]=OUTPUT_DIR) -> str:
"""convert paths like .../ArchiveBox/archivebox/../output/abc into output/abc"""
pwd = Path('.').resolve()
# parent = os.path.abspath(os.path.join(pwd, os.path.pardir))
return str(path).replace(str(pwd) + '/', './')
pwd = str(Path(pwd)) # .resolve()
path = str(path)
if not path:
return path
# replace long absolute paths with ./ relative ones to save on terminal output width
if path.startswith(pwd) and (pwd != '/'):
path = path.replace(pwd, '.', 1)
# quote paths containing spaces
if ' ' in path:
path = f'"{path}"'
# if path is just a plain dot, replace it back with the absolute path for clarity
if path == '.':
path = pwd
return path
@enforce_types
@ -578,6 +594,7 @@ def printable_folder_status(name: str, folder: Dict) -> str:
else:
color, symbol, note, num_files = 'lightyellow', '-', 'disabled', '-'
if folder['path']:
if Path(folder['path']).exists():
num_files = (
@ -592,13 +609,7 @@ def printable_folder_status(name: str, folder: Dict) -> str:
# add symbol @ next to filecount if path is a remote filesystem mount
num_files = f'{num_files} @' if num_files else '@'
path = str(folder['path']).replace(str(OUTPUT_DIR), '.') if folder['path'] else ''
if path and ' ' in path:
path = f'"{path}"'
# if path is just a plain dot, replace it back with the full path for clarity
if path == '.':
path = str(OUTPUT_DIR)
path = pretty_path(folder['path'])
return ' '.join((
ANSI[color],
@ -629,9 +640,7 @@ def printable_dependency_version(name: str, dependency: Dict) -> str:
else:
color, symbol, note, version = 'lightyellow', '-', 'disabled', '-'
path = str(dependency["path"]).replace(str(OUTPUT_DIR), '.') if dependency["path"] else ''
if path and ' ' in path:
path = f'"{path}"'
path = pretty_path(dependency['path'])
return ' '.join((
ANSI[color],

View file

@ -218,7 +218,7 @@ def version(quiet: bool=False,
if not quiet:
# 0.6.3
# ArchiveBox v0.6.3 Cpython Linux Linux-4.19.121-linuxkit-x86_64-with-glibc2.28 x86_64 (in Docker) (in TTY)
# DEBUG=False IN_DOCKER=True IS_TTY=True TZ=UTC FS_ATOMIC=True FS_REMOTE=False FS_PERMS=644 501:20 SEARCH_BACKEND=ripgrep
# DEBUG=False IN_DOCKER=True IS_TTY=True TZ=UTC FS_ATOMIC=True FS_REMOTE=False FS_PERMS=644 FS_USER=501:20 SEARCH_BACKEND=ripgrep
p = platform.uname()
print(
@ -238,7 +238,8 @@ def version(quiet: bool=False,
#f'DB=django.db.backends.sqlite3 (({CONFIG["SQLITE_JOURNAL_MODE"]})', # add this if we have more useful info to show eventually
f'FS_ATOMIC={ENFORCE_ATOMIC_WRITES}',
f'FS_REMOTE={OUTPUT_IS_REMOTE_FS}',
f'FS_PERMS={OUTPUT_PERMISSIONS} {PUID}:{PGID}',
f'FS_USER={PUID}:{PGID}',
f'FS_PERMS={OUTPUT_PERMISSIONS}',
f'SEARCH_BACKEND={SEARCH_BACKEND_ENGINE}',
)
print()
@ -253,19 +254,19 @@ def version(quiet: bool=False,
print()
print('{white}[i] Source-code locations:{reset}'.format(**ANSI))
for name, folder in CODE_LOCATIONS.items():
print(printable_folder_status(name, folder))
for name, path in CODE_LOCATIONS.items():
print(printable_folder_status(name, path))
print()
print('{white}[i] Secrets locations:{reset}'.format(**ANSI))
for name, folder in EXTERNAL_LOCATIONS.items():
print(printable_folder_status(name, folder))
for name, path in EXTERNAL_LOCATIONS.items():
print(printable_folder_status(name, path))
print()
if DATA_LOCATIONS['OUTPUT_DIR']['is_valid']:
print('{white}[i] Data locations:{reset}'.format(**ANSI))
for name, folder in DATA_LOCATIONS.items():
print(printable_folder_status(name, folder))
for name, path in DATA_LOCATIONS.items():
print(printable_folder_status(name, path))
else:
print()
print('{white}[i] Data locations:{reset}'.format(**ANSI))

View file

@ -1,62 +1,3 @@
{% extends "base.html" %}
{% load static %}
{% block body %}
<div id="toolbar">
<form id="changelist-search" action="{% url 'public-index' %}" method="get">
<div>
<label for="searchbar"><img src="/static/admin/img/search.svg" alt="Search"></label>
<input type="text" size="40" name="q" value="" id="searchbar" autofocus placeholder="Title, URL, tags, timestamp, or content...".>
<input type="submit" value="Search" style="height: 36px; padding-top: 6px; margin: 8px"/>
<input type="button"
value="♺"
title="Refresh..."
onclick="location.href='{% url 'public-index' %}'"
style="background-color: rgba(121, 174, 200, 0.8); height: 30px; font-size: 0.8em; margin-top: 12px; padding-top: 6px; float:right">
</input>
</div>
</form>
</div>
<table id="table-bookmarks">
<thead>
<tr>
<th style="width: 100px;">Bookmarked</th>
<th style="width: 26vw;">Snapshot ({{object_list|length}})</th>
<th style="width: 140px">Files</th>
<th style="width: 16vw;whitespace:nowrap;overflow-x:hidden;">Original URL</th>
</tr>
</thead>
<tbody>
{% for link in object_list %}
{% include 'main_index_row.html' with link=link %}
{% endfor %}
</tbody>
</table>
<center>
<span class="step-links">
{% if page_obj.has_previous %}
<a href="{% url 'public-index' %}?page=1">&laquo; first</a>
<a href="{% url 'public-index' %}?page={{ page_obj.previous_page_number }}">previous</a>
{% endif %}
<span class="current">
Page {{ page_obj.number }} of {{ page_obj.paginator.num_pages }}.
</span>
{% if page_obj.has_next %}
<a href="{% url 'public-index' %}?page={{ page_obj.next_page_number }}">next </a>
<a href="{% url 'public-index' %}?page={{ page_obj.paginator.num_pages }}">last &raquo;</a>
{% endif %}
</span>
{% if page_obj.has_next %}
<a href="{% url 'public-index' %}?page={{ page_obj.next_page_number }}">next </a>
<a href="{% url 'public-index' %}?page={{ page_obj.paginator.num_pages }}">last &raquo;</a>
{% endif %}
</span>
<br>
</center>
{% endblock %}
{% extends "admin/base_site.html" %}
{% load i18n admin_urls static admin_list %}
{% load core_tags %}

View file

@ -33,7 +33,7 @@
<br/>
<div class="loader"></div>
<br/>
Check the server log or the <a href="/admin/core/archiveresult/?o=-1">Log</a> page for progress...
Check the server log or the <a href="/admin/core/archiveresult/?o=-1">Log</a> page for detailed progress...
</center>
</div>
<form id="add-form" method="POST" class="p-form">{% csrf_token %}
@ -46,19 +46,22 @@
</form>
<br/><br/><br/>
<center id="delay-warning" style="display: none">
<small>(it's safe to leave this page, adding will continue in the background)</small>
<small>(you will be redirected to your <a href="/">Snapshot list</a> momentarily, its safe to close this page at any time)</small>
</center>
{% if absolute_add_path %}
<center id="bookmarklet">
<!-- <center id="bookmarklet">
<p>Bookmark this link to quickly add to your archive:
<a href="javascript:void(window.open('{{ absolute_add_path }}?url='+encodeURIComponent(document.location.href)));">Add to ArchiveBox</a></p>
</center>
</center> -->
{% endif %}
<script>
document.getElementById('add-form').addEventListener('submit', function(event) {
document.getElementById('in-progress').style.display = 'block'
document.getElementById('add-form').style.display = 'none'
document.getElementById('delay-warning').style.display = 'block'
setTimeout(function() {
window.location = '/'
}, 2000)
return true
})
</script>

View file

@ -65,7 +65,8 @@ check_platforms || (recreate_builder && check_platforms) || exit 1
echo "[+] Building archivebox:$VERSION docker image..."
#docker build . \
# docker builder prune
# docker build . --no-cache -t archivebox-dev \
docker buildx build --platform "$REQUIRED_PLATFORMS" --load . \
-t archivebox \
-t archivebox:$TAG_NAME \

View file

@ -12,8 +12,8 @@ if [[ -n "$PGID" && "$PGID" != 0 ]]; then
groupmod -g "$PGID" "$ARCHIVEBOX_USER" > /dev/null 2>&1
fi
PUID="$(id -u archivebox)"
PGID="$(id -g archivebox)"
export PUID="$(id -u archivebox)"
export PGID="$(id -g archivebox)"
# Check the permissions of the data dir (or create if it doesn't exist)
if [[ -d "$DATA_DIR/archive" ]]; then
@ -33,7 +33,6 @@ else
fi
chown $ARCHIVEBOX_USER:$ARCHIVEBOX_USER "$DATA_DIR" "$DATA_DIR"/*
# Drop permissions to run commands as the archivebox user
if [[ "$1" == /* || "$1" == "echo" || "$1" == "archivebox" ]]; then
# arg 1 is a binary, execute it verbatim

1718
package-lock.json generated

File diff suppressed because it is too large Load diff

View file

@ -1,14 +1,13 @@
{
"name": "archivebox",
"version": "0.6.3",
"version": "0.7.0",
"description": "ArchiveBox: The self-hosted internet archive",
"author": "Nick Sweeting <archivebox-npm@sweeting.me>",
"repository": "github:ArchiveBox/ArchiveBox",
"license": "MIT",
"dependencies": {
"@postlight/mercury-parser": "git+https://github.com/postlight/mercury-parser.git",
"playwright": "^1.37.1",
"@postlight/parser": "^2.2.3",
"readability-extractor": "git+https://github.com/ArchiveBox/readability-extractor.git",
"single-file-cli": "^1.0.63"
"single-file-cli": "^1.1.12"
}
}

2077
pdm.lock Normal file

File diff suppressed because it is too large Load diff

121
pyproject.toml Normal file
View file

@ -0,0 +1,121 @@
[project]
name = "archivebox"
version = "0.7.0"
description = "Self-hosted internet archiving solution."
authors = [
{name = "Nick Sweeting", email = "setup.py@archivebox.io"},
]
dependencies = [
"setuptools>=68.2.2",
"croniter>=0.3.34",
"dateparser>=1.0.0",
"django-extensions>=3.0.3",
"django>=3.1.3,<3.2",
"ipython>5.0.0",
"mypy-extensions>=0.4.3",
"python-crontab>=2.5.1",
"requests>=2.24.0",
"w3lib>=1.22.0",
# "youtube-dl>=2021.04.17",
"yt-dlp>=2021.4.11",
"playwright>=1.39.0",
]
requires-python = ">=3.9"
readme = "README.md"
license = {text = "MIT"}
classifiers = [
"Development Status :: 4 - Beta",
"Environment :: Console",
"Environment :: Web Environment",
"Framework :: Django",
"Intended Audience :: Developers",
"Intended Audience :: Education",
"Intended Audience :: End Users/Desktop",
"Intended Audience :: Information Technology",
"Intended Audience :: Legal Industry",
"Intended Audience :: System Administrators",
"License :: OSI Approved :: MIT License",
"Natural Language :: English",
"Operating System :: OS Independent",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.7",
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
"Topic :: Internet :: WWW/HTTP",
"Topic :: Internet :: WWW/HTTP :: Indexing/Search",
"Topic :: Internet :: WWW/HTTP :: WSGI :: Application",
"Topic :: Sociology :: History",
"Topic :: Software Development :: Libraries :: Python Modules",
"Topic :: System :: Archiving",
"Topic :: System :: Archiving :: Backup",
"Topic :: System :: Recovery Tools",
"Topic :: Utilities",
"Typing :: Typed",
]
# pdm lock -G:all
# pdm install -G:all
[tool.pdm.dev-dependencies]
build = [
"pdm",
"bottle",
"setuptools",
"stdeb",
"twine",
"wheel",
]
lint = [
"flake8",
"mypy",
"django-stubs",
]
test = [
"pytest",
]
debug = [
"django-debug-toolbar",
"djdt_flamegraph",
"ipdb",
]
doc = [
"recommonmark",
"sphinx",
"sphinx-rtd-theme",
]
[project.optional-dependencies]
sonic = [
# echo "deb [signed-by=/usr/share/keyrings/valeriansaliou_sonic.gpg] https://packagecloud.io/valeriansaliou/sonic/debian/ bookworm main" > /etc/apt/sources.list.d/valeriansaliou_sonic.list
# curl -fsSL https://packagecloud.io/valeriansaliou/sonic/gpgkey | gpg --dearmor -o /usr/share/keyrings/valeriansaliou_sonic.gpg
"sonic-client>=0.0.5",
]
ldap = [
# apt install libldap2-dev libsasl2-dev
"django-auth-ldap>=4.1.0",
]
[project.scripts]
archivebox = "archivebox.cli:main"
[tool.pdm.scripts]
lint = "./bin/lint.sh"
test = "./bin/test.sh"
# all = {composite = ["lint mypackage/", "test -v tests/"]}
[build-system]
requires = ["pdm-backend"]
build-backend = "pdm.backend"
[project.urls]
Homepage = "https://github.com/ArchiveBox/ArchiveBox"
Source = "https://github.com/ArchiveBox/ArchiveBox"
Documentation = "https://github.com/ArchiveBox/ArchiveBox/wiki"
"Bug Tracker" = "https://github.com/ArchiveBox/ArchiveBox/issues"
Changelog = "https://github.com/ArchiveBox/ArchiveBox/releases"
Roadmap = "https://github.com/ArchiveBox/ArchiveBox/wiki/Roadmap"
Community = "https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community"
Demo = "https://demo.archivebox.io"
Donate = "https://github.com/ArchiveBox/ArchiveBox/wiki/Donations"

269
setup.py
View file

@ -1,149 +1,150 @@
import json
import setuptools
from setuptools.command.test import test
#####################################################################################
# THIS FILE IS DEPRECATED AND WILL BE REMOVED EVENTUALLU
# ALL FUTURE CHANGES SHOULD HAPPEN IN pyproject.toml with pdm
#####################################################################################
from pathlib import Path
# import json
# import setuptools
# from setuptools.command.test import test
# from pathlib import Path
PKG_NAME = "archivebox"
DESCRIPTION = "Self-hosted internet archiving solution."
LICENSE = "MIT"
AUTHOR = "Nick Sweeting"
AUTHOR_EMAIL="git@nicksweeting.com"
REPO_URL = "https://github.com/ArchiveBox/ArchiveBox"
PROJECT_URLS = {
"Source": f"{REPO_URL}",
"Documentation": f"{REPO_URL}/wiki",
"Bug Tracker": f"{REPO_URL}/issues",
"Changelog": f"{REPO_URL}/releases",
"Roadmap": f"{REPO_URL}/wiki/Roadmap",
"Community": f"{REPO_URL}/wiki/Web-Archiving-Community",
"Demo": f"https://demo.archivebox.io",
"Donate": f"{REPO_URL}/wiki/Donations",
}
# PKG_NAME = "archivebox"
# DESCRIPTION = "Self-hosted internet archiving solution."
# LICENSE = "MIT"
# AUTHOR = "Nick Sweeting"
# AUTHOR_EMAIL="setup.py@archivebox.io"
# REPO_URL = "https://github.com/ArchiveBox/ArchiveBox"
# PROJECT_URLS = {
# "Source": f"{REPO_URL}",
# "Documentation": f"{REPO_URL}/wiki",
# "Bug Tracker": f"{REPO_URL}/issues",
# "Changelog": f"{REPO_URL}/releases",
# "Roadmap": f"{REPO_URL}/wiki/Roadmap",
# "Community": f"{REPO_URL}/wiki/Web-Archiving-Community",
# "Demo": f"https://demo.archivebox.io",
# "Donate": f"{REPO_URL}/wiki/Donations",
# }
ROOT_DIR = Path(__file__).parent.resolve()
PACKAGE_DIR = ROOT_DIR / PKG_NAME
# ROOT_DIR = Path(__file__).parent.resolve()
# PACKAGE_DIR = ROOT_DIR / PKG_NAME
README = (PACKAGE_DIR / "README.md").read_text(encoding='utf-8', errors='ignore')
VERSION = json.loads((PACKAGE_DIR / "package.json").read_text().strip())['version']
# README = (PACKAGE_DIR / "README.md").read_text(encoding='utf-8', errors='ignore')
# VERSION = json.loads((PACKAGE_DIR / "package.json").read_text().strip())['version']
PYTHON_REQUIRES = ">=3.9"
SETUP_REQUIRES = ["wheel"]
INSTALL_REQUIRES = [
# only add things here that have corresponding apt python3-packages available
# anything added here also needs to be added to our package dependencies in
# stdeb.cfg (apt), archivebox.rb (brew), Dockerfile, etc.
# if there is no apt python3-package equivalent, then vendor it instead in
# ./archivebox/vendor/
"requests>=2.24.0",
"mypy-extensions>=0.4.3",
"django>=3.1.3,<3.2",
"django-extensions>=3.0.3",
"dateparser>=1.0.0",
"youtube-dl>=2021.04.17",
"yt-dlp>=2021.4.11",
"python-crontab>=2.5.1",
"croniter>=0.3.34",
"w3lib>=1.22.0",
"ipython>5.0.0",
]
EXTRAS_REQUIRE = {
'sonic': [
"sonic-client>=0.0.5",
],
'ldap': [
"django-auth-ldap>=4.1.0",
],
'dev': [
"build",
"setuptools",
"twine",
"wheel",
"flake8",
"ipdb",
"mypy",
"django-stubs",
"sphinx",
"sphinx-rtd-theme",
"recommonmark",
"pytest",
"bottle",
"stdeb",
"django-debug-toolbar",
"djdt_flamegraph",
],
}
# class DisabledTestCommand(test):
# def run(self):
# # setup.py test is deprecated, disable it here by force so stdeb doesnt run it
# print('\n[X] Running tests via setup.py test is deprecated.')
# print(' Hint: Use the ./bin/test.sh script or pytest instead')
# To see when setup.py gets called (uncomment for debugging):
# import sys
# print(PACKAGE_DIR, f" (v{VERSION})")
# print('>', sys.executable, *sys.argv)
# PYTHON_REQUIRES = ">=3.9"
# SETUP_REQUIRES = ["wheel"]
# INSTALL_REQUIRES = [
# # only add things here that have corresponding apt python3-packages available
# # anything added here also needs to be added to our package dependencies in
# # stdeb.cfg (apt), archivebox.rb (brew), Dockerfile, etc.
# # if there is no apt python3-package equivalent, then vendor it instead in
# # ./archivebox/vendor/
# "requests>=2.24.0",
# "mypy-extensions>=0.4.3",
# "django>=3.1.3,<3.2",
# "django-extensions>=3.0.3",
# "dateparser>=1.0.0",
# "youtube-dl>=2021.04.17",
# "yt-dlp>=2021.4.11",
# "python-crontab>=2.5.1",
# "croniter>=0.3.34",
# "w3lib>=1.22.0",
# "ipython>5.0.0",
# ]
# EXTRAS_REQUIRE = {
# 'sonic': [
# "sonic-client>=0.0.5",
# ],
# 'ldap': [
# "django-auth-ldap>=4.1.0",
# ],
# 'dev': [
# "setuptools",
# "twine",
# "wheel",
# "flake8",
# "ipdb",
# "mypy",
# "django-stubs",
# "sphinx",
# "sphinx-rtd-theme",
# "recommonmark",
# "pytest",
# "bottle",
# "stdeb",
# "django-debug-toolbar",
# "djdt_flamegraph",
# ],
# }
#
# setuptools.setup(
# name=PKG_NAME,
# version=VERSION,
# license=LICENSE,
# author=AUTHOR,
# author_email=AUTHOR_EMAIL,
# description=DESCRIPTION,
# long_description=README,
# long_description_content_type="text/markdown",
# url=REPO_URL,
# project_urls=PROJECT_URLS,
# python_requires=PYTHON_REQUIRES,
# setup_requires=SETUP_REQUIRES,
# install_requires=INSTALL_REQUIRES,
# extras_require=EXTRAS_REQUIRE,
# packages=[PKG_NAME],
# include_package_data=True, # see MANIFEST.in
# entry_points={
# "console_scripts": [
# f"{PKG_NAME} = {PKG_NAME}.cli:main",
# ],
# },
# classifiers=[
# "License :: OSI Approved :: MIT License",
# "Natural Language :: English",
# "Operating System :: OS Independent",
# "Development Status :: 4 - Beta",
class DisabledTestCommand(test):
def run(self):
# setup.py test is deprecated, disable it here by force so stdeb doesnt run it
print()
print('[X] Running tests via setup.py test is deprecated.')
print(' Hint: Use the ./bin/test.sh script or pytest instead')
# "Topic :: Utilities",
# "Topic :: System :: Archiving",
# "Topic :: System :: Archiving :: Backup",
# "Topic :: System :: Recovery Tools",
# "Topic :: Sociology :: History",
# "Topic :: Internet :: WWW/HTTP",
# "Topic :: Internet :: WWW/HTTP :: Indexing/Search",
# "Topic :: Internet :: WWW/HTTP :: WSGI :: Application",
# "Topic :: Software Development :: Libraries :: Python Modules",
setuptools.setup(
name=PKG_NAME,
version=VERSION,
license=LICENSE,
author=AUTHOR,
author_email=AUTHOR_EMAIL,
description=DESCRIPTION,
long_description=README,
long_description_content_type="text/markdown",
url=REPO_URL,
project_urls=PROJECT_URLS,
python_requires=PYTHON_REQUIRES,
setup_requires=SETUP_REQUIRES,
install_requires=INSTALL_REQUIRES,
extras_require=EXTRAS_REQUIRE,
packages=[PKG_NAME],
include_package_data=True, # see MANIFEST.in
entry_points={
"console_scripts": [
f"{PKG_NAME} = {PKG_NAME}.cli:main",
],
},
classifiers=[
"License :: OSI Approved :: MIT License",
"Natural Language :: English",
"Operating System :: OS Independent",
"Development Status :: 4 - Beta",
"Topic :: Utilities",
"Topic :: System :: Archiving",
"Topic :: System :: Archiving :: Backup",
"Topic :: System :: Recovery Tools",
"Topic :: Sociology :: History",
"Topic :: Internet :: WWW/HTTP",
"Topic :: Internet :: WWW/HTTP :: Indexing/Search",
"Topic :: Internet :: WWW/HTTP :: WSGI :: Application",
"Topic :: Software Development :: Libraries :: Python Modules",
"Intended Audience :: Developers",
"Intended Audience :: Education",
"Intended Audience :: End Users/Desktop",
"Intended Audience :: Information Technology",
"Intended Audience :: Legal Industry",
"Intended Audience :: System Administrators",
# "Intended Audience :: Developers",
# "Intended Audience :: Education",
# "Intended Audience :: End Users/Desktop",
# "Intended Audience :: Information Technology",
# "Intended Audience :: Legal Industry",
# "Intended Audience :: System Administrators",
"Environment :: Console",
"Environment :: Web Environment",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.7",
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
"Framework :: Django",
"Typing :: Typed",
],
cmdclass={
"test": DisabledTestCommand,
},
)
# "Environment :: Console",
# "Environment :: Web Environment",
# "Programming Language :: Python :: 3",
# "Programming Language :: Python :: 3.7",
# "Programming Language :: Python :: 3.8",
# "Programming Language :: Python :: 3.9",
# "Framework :: Django",
# "Typing :: Typed",
# ],
# cmdclass={
# "test": DisabledTestCommand,
# },
# )