
Merge branch 'dev' into fix-URL_REGEX

Commit 17f40f3ada
Author: Nick Sweeting
Date: 2024-04-23 19:53:58 -07:00 (committed via GitHub)
67 changed files with 4341 additions and 1844 deletions


@ -28,4 +28,5 @@ assets/
docker/
data/
data*/
output/


@ -81,6 +81,13 @@ jobs:
- name: Image digest
run: echo ${{ steps.docker_build.outputs.digest }}
- name: Update README
uses: peter-evans/dockerhub-description@v4
with:
username: ${{ secrets.DOCKER_USERNAME }}
password: ${{ secrets.DOCKER_PASSWORD }}
repository: archivebox/archivebox
# This ugly bit is necessary if you don't want your cache to grow forever
# until it hits GitHub's limit of 5GB.


@ -35,7 +35,7 @@ jobs:
cache: true
- name: Install dependencies
run: pdm install --fail-fast --no-lock --group :all --no-self
run: pdm install --fail-fast --no-lock --dev --group=':all' --no-self
- name: Build package
run: |

.gitignore (5 changed lines)

@ -12,6 +12,10 @@ venv/
.docker-venv/
node_modules/
# Ignore dev lockfiles (should always be built fresh)
requirements-dev.txt
pdm.dev.lock
# Packaging artifacts
.pdm-python
.pdm-build
@ -25,6 +29,7 @@ data/
data1/
data2/
data3/
data*/
output/
# vim


@ -10,7 +10,7 @@
# docker run -v "$PWD/data":/data -p 8000:8000 archivebox server
# Multi-arch build:
# docker buildx create --use
# docker buildx build . --platform=linux/amd64,linux/arm64,linux/arm/v7 --push -t archivebox/archivebox:latest -t archivebox/archivebox:dev
# docker buildx build . --platform=linux/amd64,linux/arm64 --push -t archivebox/archivebox:latest -t archivebox/archivebox:dev
#
# Read more about [developing Archivebox](https://github.com/ArchiveBox/ArchiveBox#archivebox-development).
@ -20,10 +20,24 @@ FROM python:3.11-slim-bookworm
LABEL name="archivebox" \
maintainer="Nick Sweeting <dockerfile@archivebox.io>" \
description="All-in-one personal internet archiving container" \
description="All-in-one self-hosted internet archiving solution" \
homepage="https://github.com/ArchiveBox/ArchiveBox" \
documentation="https://github.com/ArchiveBox/ArchiveBox/wiki/Docker#docker"
documentation="https://github.com/ArchiveBox/ArchiveBox/wiki/Docker" \
org.opencontainers.image.title="ArchiveBox" \
org.opencontainers.image.vendor="ArchiveBox" \
org.opencontainers.image.description="All-in-one self-hosted internet archiving solution" \
org.opencontainers.image.source="https://github.com/ArchiveBox/ArchiveBox" \
com.docker.image.source.entrypoint="Dockerfile" \
# TODO: release ArchiveBox as a Docker Desktop extension (requires these labels):
# https://docs.docker.com/desktop/extensions-sdk/architecture/metadata/
com.docker.desktop.extension.api.version=">= 1.4.7" \
com.docker.desktop.extension.icon="https://archivebox.io/icon.png" \
com.docker.extension.publisher-url="https://archivebox.io" \
com.docker.extension.screenshots='[{"alt": "Screenshot of Admin UI", "url": "https://github.com/ArchiveBox/ArchiveBox/assets/511499/e8e0b6f8-8fdf-4b7f-8124-c10d8699bdb2"}]' \
com.docker.extension.detailed-description='See here for detailed documentation: https://wiki.archivebox.io' \
com.docker.extension.changelog='See here for release notes: https://github.com/ArchiveBox/ArchiveBox/releases' \
com.docker.extension.categories='database,utility-tools'
ARG TARGETPLATFORM
ARG TARGETOS
ARG TARGETARCH
@ -167,7 +181,6 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$T
curl wget git yt-dlp ffmpeg ripgrep \
# Packages we have also needed in the past:
# youtube-dl wget2 aria2 python3-pyxattr rtmpdump libfribidi-bin mpv \
# fontconfig fonts-ipafont-gothic fonts-wqy-zenhei fonts-thai-tlwg fonts-kacst fonts-symbola fonts-noto fonts-freefont-ttf \
&& rm -rf /var/lib/apt/lists/* \
# Save version info
&& ( \
@ -183,6 +196,11 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$T
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$TARGETVARIANT --mount=type=cache,target=/root/.cache/pip,sharing=locked,id=pip-$TARGETARCH$TARGETVARIANT --mount=type=cache,target=/root/.cache/ms-playwright,sharing=locked,id=browsers-$TARGETARCH$TARGETVARIANT \
echo "[+] Installing Browser binary dependencies to $PLAYWRIGHT_BROWSERS_PATH..." \
&& apt-get update -qq \
&& apt-get install -qq -y -t bookworm-backports --no-install-recommends \
fontconfig fonts-ipafont-gothic fonts-wqy-zenhei fonts-thai-tlwg fonts-khmeros fonts-kacst fonts-symbola fonts-noto fonts-freefont-ttf \
# chrome can run without dbus/upower technically, it complains about missing dbus but should run ok anyway
# libxss1 dbus dbus-x11 upower \
# && service dbus start \
&& if [[ "$TARGETPLATFORM" == *amd64* || "$TARGETPLATFORM" == *arm64* ]]; then \
# install Chromium using playwright
pip install playwright \
@ -190,10 +208,12 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$T
&& playwright install --with-deps chromium \
&& export CHROME_BINARY="$(python -c 'from playwright.sync_api import sync_playwright; print(sync_playwright().start().chromium.executable_path)')"; \
else \
# fall back to installing Chromium via apt-get on platforms not supported by playwright (e.g. risc, ARMv7, etc.)
apt-get install -qq -y -t bookworm-backports --no-install-recommends \
chromium fontconfig fonts-ipafont-gothic fonts-wqy-zenhei fonts-thai-tlwg fonts-kacst fonts-symbola fonts-noto fonts-freefont-ttf \
&& export CHROME_BINARY="$(which chromium)"; \
# fall back to installing Chromium via apt-get on platforms not supported by playwright (e.g. risc, ARMv7, etc.)
# apt-get install -qq -y -t bookworm-backports --no-install-recommends \
# chromium \
# && export CHROME_BINARY="$(which chromium)"; \
echo 'armv7 no longer supported in versions after v0.7.3' && \
exit 1; \
fi \
&& rm -rf /var/lib/apt/lists/* \
&& ln -s "$CHROME_BINARY" /usr/bin/chromium-browser \
@ -262,9 +282,15 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$T
# Setup ArchiveBox runtime config
WORKDIR "$DATA_DIR"
ENV IN_DOCKER=True
ENV IN_DOCKER=True \
DISPLAY=novnc:0.0 \
CUSTOM_TEMPLATES_DIR=/data/templates \
CHROME_USER_DATA_DIR=/data/personas/Default/chromium \
GOOGLE_API_KEY=no \
GOOGLE_DEFAULT_CLIENT_ID=no \
GOOGLE_DEFAULT_CLIENT_SECRET=no \
ALLOWED_HOSTS=*
## No need to set explicitly, these values will be autodetected by archivebox in docker:
# CHROME_SANDBOX=False \
# WGET_BINARY="wget" \
# YOUTUBEDL_BINARY="yt-dlp" \
# CHROME_BINARY="/usr/bin/chromium-browser" \
@ -289,9 +315,8 @@ WORKDIR "$DATA_DIR"
VOLUME "$DATA_DIR"
EXPOSE 8000
# Optional:
# HEALTHCHECK --interval=30s --timeout=20s --retries=15 \
# CMD curl --silent 'http://localhost:8000/admin/login/' || exit 1
HEALTHCHECK --interval=30s --timeout=20s --retries=15 \
CMD curl --silent 'http://localhost:8000/health/' | grep -q 'OK'
ENTRYPOINT ["dumb-init", "--", "/app/bin/docker_entrypoint.sh"]
CMD ["archivebox", "server", "--quick-init", "0.0.0.0:8000"]


@ -1076,7 +1076,7 @@ Because ArchiveBox is designed to ingest a large volume of URLs with multiple co
<li><strong>Don't store large collections on older filesystems like EXT3/FAT</strong> as they may not be able to handle more than 50k directory entries in the <code>data/archive/</code> folder.
</li>
<li><strong>Try to keep the <code>data/index.sqlite3</code> file on local drive (not a network mount)</strong> or SSD for maximum performance, however the <code>data/archive/</code> folder can be on a network mount or slower HDD.</li>
<li>If using Docker or NFS/SMB/FUSE for the `data/archive/` folder, you may need to set <a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#puid--pgid"><code>PUID</code> & <code>PGID</code></a> and <a href="https://github.com/ArchiveBox/ArchiveBox/issues/1304">disable <code>root_squash</code></a> on your fileshare server.
<li>If using Docker or NFS/SMB/FUSE for the <code>data/archive/</code> folder, you may need to set <a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#puid--pgid"><code>PUID</code> & <code>PGID</code></a> and <a href="https://github.com/ArchiveBox/ArchiveBox/issues/1304">disable <code>root_squash</code></a> on your fileshare server.
</li>
</ul>


@ -112,6 +112,7 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
'LDAP_FIRSTNAME_ATTR': {'type': str, 'default': None},
'LDAP_LASTNAME_ATTR': {'type': str, 'default': None},
'LDAP_EMAIL_ATTR': {'type': str, 'default': None},
'LDAP_CREATE_SUPERUSER': {'type': bool, 'default': False},
},
'ARCHIVE_METHOD_TOGGLES': {
@ -136,14 +137,15 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
},
'ARCHIVE_METHOD_OPTIONS': {
'RESOLUTION': {'type': str, 'default': '1440,2000', 'aliases': ('SCREENSHOT_RESOLUTION',)},
'GIT_DOMAINS': {'type': str, 'default': 'github.com,bitbucket.org,gitlab.com,gist.github.com'},
'RESOLUTION': {'type': str, 'default': '1440,2000', 'aliases': ('SCREENSHOT_RESOLUTION','WINDOW_SIZE')},
'GIT_DOMAINS': {'type': str, 'default': 'github.com,bitbucket.org,gitlab.com,gist.github.com,codeberg.org,gitea.com,git.sr.ht'},
'CHECK_SSL_VALIDITY': {'type': bool, 'default': True},
'MEDIA_MAX_SIZE': {'type': str, 'default': '750m'},
'CURL_USER_AGENT': {'type': str, 'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/) curl/{CURL_VERSION}'},
'WGET_USER_AGENT': {'type': str, 'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/) wget/{WGET_VERSION}'},
'CHROME_USER_AGENT': {'type': str, 'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/)'},
'USER_AGENT': {'type': str, 'default': None},
'CURL_USER_AGENT': {'type': str, 'default': lambda c: c['USER_AGENT'] or 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/) curl/{CURL_VERSION}'},
'WGET_USER_AGENT': {'type': str, 'default': lambda c: c['USER_AGENT'] or 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/) wget/{WGET_VERSION}'},
'CHROME_USER_AGENT': {'type': str, 'default': lambda c: c['USER_AGENT'] or 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/)'},
'COOKIES_FILE': {'type': str, 'default': None},
'CHROME_USER_DATA_DIR': {'type': str, 'default': None},
@ -151,7 +153,11 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
'CHROME_TIMEOUT': {'type': int, 'default': 0},
'CHROME_HEADLESS': {'type': bool, 'default': True},
'CHROME_SANDBOX': {'type': bool, 'default': lambda c: not c['IN_DOCKER']},
'CHROME_EXTRA_ARGS': {'type': list, 'default': None},
'YOUTUBEDL_ARGS': {'type': list, 'default': lambda c: [
'--restrict-filenames',
'--trim-filenames', '128',
'--write-description',
'--write-info-json',
'--write-annotations',
@ -173,6 +179,7 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
'--add-metadata',
'--format=(bv*+ba/b)[filesize<={}][filesize_approx<=?{}]/(bv*+ba/b)'.format(c['MEDIA_MAX_SIZE'], c['MEDIA_MAX_SIZE']),
]},
'YOUTUBEDL_EXTRA_ARGS': {'type': list, 'default': None},
'WGET_ARGS': {'type': list, 'default': ['--no-verbose',
@ -184,12 +191,17 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
'--no-parent',
'-e', 'robots=off',
]},
'WGET_EXTRA_ARGS': {'type': list, 'default': None},
'CURL_ARGS': {'type': list, 'default': ['--silent',
'--location',
'--compressed'
]},
'CURL_EXTRA_ARGS': {'type': list, 'default': None},
'GIT_ARGS': {'type': list, 'default': ['--recursive']},
'SINGLEFILE_ARGS': {'type': list, 'default' : None},
'SINGLEFILE_ARGS': {'type': list, 'default': None},
'SINGLEFILE_EXTRA_ARGS': {'type': list, 'default': None},
'MERCURY_ARGS': {'type': list, 'default': ['--format=text']},
'MERCURY_EXTRA_ARGS': {'type': list, 'default': None},
'FAVICON_PROVIDER': {'type': str, 'default': 'https://www.google.com/s2/favicons?domain={}'},
},
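These hunks consolidate the per-tool user agents behind a single USER_AGENT option (the defaults become lambdas evaluated against the partially-built config dict c) and introduce *_EXTRA_ARGS list options that are appended after the stock *_ARGS. A minimal sketch of how callable defaults like these can be resolved; the function name and loop are hypothetical, ArchiveBox's real loader lives elsewhere in config.py:

    # hypothetical resolver for a schema shaped like {'KEY': {'type': ..., 'default': value_or_callable}}
    def resolve_defaults(schema: dict, user_config: dict) -> dict:
        config = dict(user_config)
        for key, opts in schema.items():
            if key in config:
                continue                  # user-supplied values win over defaults
            default = opts['default']
            # callable defaults see the config built so far, so USER_AGENT must resolve
            # before CURL_USER_AGENT / WGET_USER_AGENT / CHROME_USER_AGENT consume it
            config[key] = default(config) if callable(default) else default
        return config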
@ -269,6 +281,7 @@ TEMPLATES_DIR_NAME = 'templates'
ARCHIVE_DIR_NAME = 'archive'
SOURCES_DIR_NAME = 'sources'
LOGS_DIR_NAME = 'logs'
PERSONAS_DIR_NAME = 'personas'
SQL_INDEX_FILENAME = 'index.sqlite3'
JSON_INDEX_FILENAME = 'index.json'
HTML_INDEX_FILENAME = 'index.html'
@ -342,9 +355,11 @@ ALLOWED_IN_OUTPUT_DIR = {
'static',
'sonic',
'search.sqlite3',
'crontabs',
ARCHIVE_DIR_NAME,
SOURCES_DIR_NAME,
LOGS_DIR_NAME,
PERSONAS_DIR_NAME,
SQL_INDEX_FILENAME,
f'{SQL_INDEX_FILENAME}-wal',
f'{SQL_INDEX_FILENAME}-shm',
@ -363,24 +378,32 @@ ALLOWDENYLIST_REGEX_FLAGS: int = re.IGNORECASE | re.UNICODE | re.MULTILINE
############################## Version Config ##################################
def get_system_user():
SYSTEM_USER = getpass.getuser() or os.getlogin()
def get_system_user() -> str:
# some host OS's are unable to provide a username (k3s, Windows), making this complicated
# uid 999 is especially problematic and breaks many attempts
SYSTEM_USER = None
FALLBACK_USER_PLACHOLDER = f'user_{os.getuid()}'
# Option 1
try:
import pwd
return pwd.getpwuid(os.geteuid()).pw_name or SYSTEM_USER
except KeyError:
# Process' UID might not map to a user in cases such as running the Docker image
# (where `archivebox` is 999) as a different UID.
pass
except ModuleNotFoundError:
# pwd doesn't exist on windows
pass
except Exception:
# this should never happen, uncomment to debug
# raise
SYSTEM_USER = SYSTEM_USER or pwd.getpwuid(os.geteuid()).pw_name
except (ModuleNotFoundError, Exception):
pass
return SYSTEM_USER
# Option 2
try:
SYSTEM_USER = SYSTEM_USER or getpass.getuser()
except Exception:
pass
# Option 3
try:
SYSTEM_USER = SYSTEM_USER or os.getlogin()
except Exception:
pass
return SYSTEM_USER or FALLBACK_USER_PLACHOLDER
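Illustrative only (this scenario is an assumption, not a test from the repo): inside the Docker image run as an unmapped uid such as 999, all three lookups above fail and the placeholder is returned.

    # pwd.getpwuid() raises KeyError, getpass.getuser() and os.getlogin() also fail,
    # so the function falls back to the uid-based placeholder:
    import os
    assert get_system_user() == f'user_{os.getuid()}'   # e.g. 'user_999'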
def get_version(config):
try:
@ -487,9 +510,10 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
'ARCHIVE_DIR': {'default': lambda c: c['OUTPUT_DIR'] / ARCHIVE_DIR_NAME},
'SOURCES_DIR': {'default': lambda c: c['OUTPUT_DIR'] / SOURCES_DIR_NAME},
'LOGS_DIR': {'default': lambda c: c['OUTPUT_DIR'] / LOGS_DIR_NAME},
'PERSONAS_DIR': {'default': lambda c: c['OUTPUT_DIR'] / PERSONAS_DIR_NAME},
'CONFIG_FILE': {'default': lambda c: Path(c['CONFIG_FILE']).resolve() if c['CONFIG_FILE'] else c['OUTPUT_DIR'] / CONFIG_FILENAME},
'COOKIES_FILE': {'default': lambda c: c['COOKIES_FILE'] and Path(c['COOKIES_FILE']).resolve()},
'CHROME_USER_DATA_DIR': {'default': lambda c: find_chrome_data_dir() if c['CHROME_USER_DATA_DIR'] is None else (Path(c['CHROME_USER_DATA_DIR']).resolve() if c['CHROME_USER_DATA_DIR'] else None)}, # None means unset, so we autodetect it with find_chrome_Data_dir(), but emptystring '' means user manually set it to '', and we should store it as None
'CHROME_USER_DATA_DIR': {'default': lambda c: Path(c['CHROME_USER_DATA_DIR']).resolve() if c['CHROME_USER_DATA_DIR'] else None},
'URL_DENYLIST_PTN': {'default': lambda c: c['URL_DENYLIST'] and re.compile(c['URL_DENYLIST'] or '', ALLOWDENYLIST_REGEX_FLAGS)},
'URL_ALLOWLIST_PTN': {'default': lambda c: c['URL_ALLOWLIST'] and re.compile(c['URL_ALLOWLIST'] or '', ALLOWDENYLIST_REGEX_FLAGS)},
'DIR_OUTPUT_PERMISSIONS': {'default': lambda c: c['OUTPUT_PERMISSIONS'].replace('6', '7').replace('4', '5')}, # exec is always needed to list directories
@ -519,6 +543,7 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
'CURL_VERSION': {'default': lambda c: bin_version(c['CURL_BINARY']) if c['USE_CURL'] else None},
'CURL_USER_AGENT': {'default': lambda c: c['CURL_USER_AGENT'].format(**c)},
'CURL_ARGS': {'default': lambda c: c['CURL_ARGS'] or []},
'CURL_EXTRA_ARGS': {'default': lambda c: c['CURL_EXTRA_ARGS'] or []},
'SAVE_FAVICON': {'default': lambda c: c['USE_CURL'] and c['SAVE_FAVICON']},
'SAVE_ARCHIVE_DOT_ORG': {'default': lambda c: c['USE_CURL'] and c['SAVE_ARCHIVE_DOT_ORG']},
@ -529,18 +554,22 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
'SAVE_WGET': {'default': lambda c: c['USE_WGET'] and c['SAVE_WGET']},
'SAVE_WARC': {'default': lambda c: c['USE_WGET'] and c['SAVE_WARC']},
'WGET_ARGS': {'default': lambda c: c['WGET_ARGS'] or []},
'WGET_EXTRA_ARGS': {'default': lambda c: c['WGET_EXTRA_ARGS'] or []},
'RIPGREP_VERSION': {'default': lambda c: bin_version(c['RIPGREP_BINARY']) if c['USE_RIPGREP'] else None},
'USE_SINGLEFILE': {'default': lambda c: c['USE_SINGLEFILE'] and c['SAVE_SINGLEFILE']},
'SINGLEFILE_VERSION': {'default': lambda c: bin_version(c['SINGLEFILE_BINARY']) if c['USE_SINGLEFILE'] else None},
'SINGLEFILE_ARGS': {'default': lambda c: c['SINGLEFILE_ARGS'] or []},
'SINGLEFILE_EXTRA_ARGS': {'default': lambda c: c['SINGLEFILE_EXTRA_ARGS'] or []},
'USE_READABILITY': {'default': lambda c: c['USE_READABILITY'] and c['SAVE_READABILITY']},
'READABILITY_VERSION': {'default': lambda c: bin_version(c['READABILITY_BINARY']) if c['USE_READABILITY'] else None},
'USE_MERCURY': {'default': lambda c: c['USE_MERCURY'] and c['SAVE_MERCURY']},
'MERCURY_VERSION': {'default': lambda c: '1.0.0' if shutil.which(str(bin_path(c['MERCURY_BINARY']))) else None}, # mercury doesnt expose version info until this is merged https://github.com/postlight/parser/pull/750
'MERCURY_ARGS': {'default': lambda c: c['MERCURY_ARGS'] or []},
'MERCURY_EXTRA_ARGS': {'default': lambda c: c['MERCURY_EXTRA_ARGS'] or []},
'USE_GIT': {'default': lambda c: c['USE_GIT'] and c['SAVE_GIT']},
'GIT_VERSION': {'default': lambda c: bin_version(c['GIT_BINARY']) if c['USE_GIT'] else None},
@ -550,6 +579,7 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
'YOUTUBEDL_VERSION': {'default': lambda c: bin_version(c['YOUTUBEDL_BINARY']) if c['USE_YOUTUBEDL'] else None},
'SAVE_MEDIA': {'default': lambda c: c['USE_YOUTUBEDL'] and c['SAVE_MEDIA']},
'YOUTUBEDL_ARGS': {'default': lambda c: c['YOUTUBEDL_ARGS'] or []},
'YOUTUBEDL_EXTRA_ARGS': {'default': lambda c: c['YOUTUBEDL_EXTRA_ARGS'] or []},
'CHROME_BINARY': {'default': lambda c: c['CHROME_BINARY'] or find_chrome_binary()},
'USE_CHROME': {'default': lambda c: c['USE_CHROME'] and c['CHROME_BINARY'] and (c['SAVE_PDF'] or c['SAVE_SCREENSHOT'] or c['SAVE_DOM'] or c['SAVE_SINGLEFILE'])},
@ -571,6 +601,7 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
'EXTERNAL_LOCATIONS': {'default': lambda c: get_external_locations(c)},
'DATA_LOCATIONS': {'default': lambda c: get_data_locations(c)},
'CHROME_OPTIONS': {'default': lambda c: get_chrome_info(c)},
'CHROME_EXTRA_ARGS': {'default': lambda c: c['CHROME_EXTRA_ARGS'] or []},
'SAVE_ALLOWLIST_PTN': {'default': lambda c: c['SAVE_ALLOWLIST'] and {re.compile(k, ALLOWDENYLIST_REGEX_FLAGS): v for k, v in c['SAVE_ALLOWLIST'].items()}},
'SAVE_DENYLIST_PTN': {'default': lambda c: c['SAVE_DENYLIST'] and {re.compile(k, ALLOWDENYLIST_REGEX_FLAGS): v for k, v in c['SAVE_DENYLIST'].items()}},
}
@ -899,27 +930,36 @@ def find_chrome_binary() -> Optional[str]:
def find_chrome_data_dir() -> Optional[str]:
"""find any installed chrome user data directories in the default locations"""
# Precedence: Chromium, Chrome, Beta, Canary, Unstable, Dev
# make sure data dir finding precedence order always matches binary finding order
default_profile_paths = (
'~/.config/chromium',
'~/Library/Application Support/Chromium',
'~/AppData/Local/Chromium/User Data',
'~/.config/chrome',
'~/.config/google-chrome',
'~/Library/Application Support/Google/Chrome',
'~/AppData/Local/Google/Chrome/User Data',
'~/.config/google-chrome-stable',
'~/.config/google-chrome-beta',
'~/Library/Application Support/Google/Chrome Canary',
'~/AppData/Local/Google/Chrome SxS/User Data',
'~/.config/google-chrome-unstable',
'~/.config/google-chrome-dev',
)
for path in default_profile_paths:
full_path = Path(path).resolve()
if full_path.exists():
return full_path
# deprecated because this is DANGEROUS, do not re-implement/uncomment this behavior.
# Going forward we want to discourage people from using their main chrome profile for archiving.
# Session tokens, personal data, and cookies are often returned in server responses,
# when they get archived, they are essentially burned as anyone who can view the archive
# can use that data to masquerade as the logged-in user that did the archiving.
# For this reason users should always create dedicated burner profiles for archiving and not use
# their daily driver main accounts.
# # Precedence: Chromium, Chrome, Beta, Canary, Unstable, Dev
# # make sure data dir finding precedence order always matches binary finding order
# default_profile_paths = (
# '~/.config/chromium',
# '~/Library/Application Support/Chromium',
# '~/AppData/Local/Chromium/User Data',
# '~/.config/chrome',
# '~/.config/google-chrome',
# '~/Library/Application Support/Google/Chrome',
# '~/AppData/Local/Google/Chrome/User Data',
# '~/.config/google-chrome-stable',
# '~/.config/google-chrome-beta',
# '~/Library/Application Support/Google/Chrome Canary',
# '~/AppData/Local/Google/Chrome SxS/User Data',
# '~/.config/google-chrome-unstable',
# '~/.config/google-chrome-dev',
# )
# for path in default_profile_paths:
# full_path = Path(path).resolve()
# if full_path.exists():
# return full_path
return None
def wget_supports_compression(config):
@ -990,6 +1030,11 @@ def get_data_locations(config: ConfigDict) -> ConfigValue:
'enabled': True,
'is_valid': config['LOGS_DIR'].exists(),
},
'PERSONAS_DIR': {
'path': config['PERSONAS_DIR'].resolve(),
'enabled': True,
'is_valid': config['PERSONAS_DIR'].exists(),
},
'ARCHIVE_DIR': {
'path': config['ARCHIVE_DIR'].resolve(),
'enabled': True,
@ -1337,6 +1382,8 @@ def check_migrations(out_dir: Union[str, Path, None]=None, config: ConfigDict=CO
(Path(output_dir) / SOURCES_DIR_NAME).mkdir(exist_ok=True)
(Path(output_dir) / LOGS_DIR_NAME).mkdir(exist_ok=True)
(Path(output_dir) / PERSONAS_DIR_NAME).mkdir(exist_ok=True)
(Path(output_dir) / PERSONAS_DIR_NAME / 'Default').mkdir(exist_ok=True)
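Combined with the new PERSONAS_DIR entries above and the CHROME_USER_DATA_DIR default set in the Dockerfile, a freshly initialized data directory now looks roughly like this (illustrative, not exhaustive):

    data/
        archive/
        sources/
        logs/
        personas/
            Default/
                chromium/    # default CHROME_USER_DATA_DIR inside the Docker image
        index.sqlite3 (+ index.sqlite3-wal / index.sqlite3-shm)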


@ -1 +1,2 @@
__package__ = 'archivebox.core'


@ -6,6 +6,7 @@ from contextlib import redirect_stdout
from datetime import datetime, timezone
from django.contrib import admin
from django.db.models import Count
from django.urls import path
from django.utils.html import format_html
from django.utils.safestring import mark_safe
@ -23,8 +24,16 @@ from core.mixins import SearchResultsAdminMixin
from index.html import snapshot_icons
from logging_util import printable_filesize
from main import add, remove
from config import OUTPUT_DIR, SNAPSHOTS_PER_PAGE
from extractors import archive_links
from config import (
OUTPUT_DIR,
SNAPSHOTS_PER_PAGE,
VERSION,
VERSIONS_AVAILABLE,
CAN_UPGRADE
)
GLOBAL_CONTEXT = {'VERSION': VERSION, 'VERSIONS_AVAILABLE': VERSIONS_AVAILABLE, 'CAN_UPGRADE': CAN_UPGRADE}
# Admin URLs
# /admin/
@ -39,6 +48,60 @@ from extractors import archive_links
# TODO: https://stackoverflow.com/questions/40760880/add-custom-button-to-django-admin-panel
class ArchiveBoxAdmin(admin.AdminSite):
site_header = 'ArchiveBox'
index_title = 'Links'
site_title = 'Index'
namespace = 'admin'
def get_urls(self):
return [
path('core/snapshot/add/', self.add_view, name='Add'),
] + super().get_urls()
def add_view(self, request):
if not request.user.is_authenticated:
return redirect(f'/admin/login/?next={request.path}')
request.current_app = self.name
context = {
**self.each_context(request),
'title': 'Add URLs',
}
if request.method == 'GET':
context['form'] = AddLinkForm()
elif request.method == 'POST':
form = AddLinkForm(request.POST)
if form.is_valid():
url = form.cleaned_data["url"]
print(f'[+] Adding URL: {url}')
depth = 0 if form.cleaned_data["depth"] == "0" else 1
input_kwargs = {
"urls": url,
"depth": depth,
"update_all": False,
"out_dir": OUTPUT_DIR,
}
add_stdout = StringIO()
with redirect_stdout(add_stdout):
add(**input_kwargs)
print(add_stdout.getvalue())
context.update({
"stdout": ansi_to_html(add_stdout.getvalue().strip()),
"form": AddLinkForm()
})
else:
context["form"] = form
return render(template_name='add.html', request=request, context=context)
archivebox_admin = ArchiveBoxAdmin()
archivebox_admin.register(get_user_model())
archivebox_admin.disable_action('delete_selected')
class ArchiveResultInline(admin.TabularInline):
model = ArchiveResult
@ -48,11 +111,11 @@ class TagInline(admin.TabularInline):
from django.contrib.admin.helpers import ActionForm
from django.contrib.admin.widgets import AutocompleteSelectMultiple
# WIP: broken by Django 3.1.2 -> 4.0 migration
class AutocompleteTags:
model = Tag
search_fields = ['name']
name = 'tags'
remote_field = TagInline
class AutocompleteTagsAdminStub:
name = 'admin'
@ -62,7 +125,6 @@ class SnapshotActionForm(ActionForm):
tags = forms.ModelMultipleChoiceField(
queryset=Tag.objects.all(),
required=False,
# WIP: broken by Django 3.1.2 -> 4.0 migration
widget=AutocompleteSelectMultiple(
AutocompleteTags(),
AutocompleteTagsAdminStub(),
@ -81,6 +143,7 @@ class SnapshotActionForm(ActionForm):
# )
@admin.register(Snapshot, site=archivebox_admin)
class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin):
list_display = ('added', 'title_str', 'files', 'size', 'url_str')
sort_fields = ('title_str', 'url_str', 'added', 'files')
@ -96,6 +159,10 @@ class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin):
action_form = SnapshotActionForm
def changelist_view(self, request, extra_context=None):
extra_context = extra_context or {}
return super().changelist_view(request, extra_context | GLOBAL_CONTEXT)
def get_urls(self):
urls = super().get_urls()
custom_urls = [
@ -105,7 +172,7 @@ class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin):
def get_queryset(self, request):
self.request = request
return super().get_queryset(request).prefetch_related('tags')
return super().get_queryset(request).prefetch_related('tags').annotate(archiveresult_count=Count('archiveresult'))
def tag_list(self, obj):
return ', '.join(obj.tags.values_list('name', flat=True))
@ -163,6 +230,10 @@ class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin):
obj.id,
)
@admin.display(
description='Title',
ordering='title',
)
def title_str(self, obj):
canon = obj.as_link().canonical_outputs()
tags = ''.join(
@ -184,12 +255,17 @@ class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin):
urldecode(htmldecode(obj.latest_title or obj.title or ''))[:128] or 'Pending...'
) + mark_safe(f' <span class="tags">{tags}</span>')
@admin.display(
description='Files Saved',
ordering='archiveresult_count',
)
def files(self, obj):
return snapshot_icons(obj)
files.admin_order_field = 'updated'
files.short_description = 'Files Saved'
@admin.display(
ordering='archiveresult_count'
)
def size(self, obj):
archive_size = (Path(obj.link_dir) / 'index.html').exists() and obj.archive_size
if archive_size:
@ -204,8 +280,11 @@ class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin):
size_txt,
)
size.admin_order_field = 'archiveresult__count'
@admin.display(
description='Original URL',
ordering='url',
)
def url_str(self, obj):
return format_html(
'<a href="{}"><code style="user-select: all;">{}</code></a>',
@ -242,65 +321,76 @@ class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin):
# print('[*] Got request', request.method, request.POST)
# return super().changelist_view(request, extra_context=None)
@admin.action(
description="Pull"
)
def update_snapshots(self, request, queryset):
archive_links([
snapshot.as_link()
for snapshot in queryset
], out_dir=OUTPUT_DIR)
update_snapshots.short_description = "Pull"
@admin.action(
description="⬇️ Title"
)
def update_titles(self, request, queryset):
archive_links([
snapshot.as_link()
for snapshot in queryset
], overwrite=True, methods=('title','favicon'), out_dir=OUTPUT_DIR)
update_titles.short_description = "⬇️ Title"
@admin.action(
description="Re-Snapshot"
)
def resnapshot_snapshot(self, request, queryset):
for snapshot in queryset:
timestamp = datetime.now(timezone.utc).isoformat('T', 'seconds')
new_url = snapshot.url.split('#')[0] + f'#{timestamp}'
add(new_url, tag=snapshot.tags_str())
resnapshot_snapshot.short_description = "Re-Snapshot"
@admin.action(
description="Reset"
)
def overwrite_snapshots(self, request, queryset):
archive_links([
snapshot.as_link()
for snapshot in queryset
], overwrite=True, out_dir=OUTPUT_DIR)
overwrite_snapshots.short_description = "Reset"
@admin.action(
description="Delete"
)
def delete_snapshots(self, request, queryset):
remove(snapshots=queryset, yes=True, delete=True, out_dir=OUTPUT_DIR)
delete_snapshots.short_description = "Delete"
@admin.action(
description="+"
)
def add_tags(self, request, queryset):
tags = request.POST.getlist('tags')
print('[+] Adding tags', tags, 'to Snapshots', queryset)
for obj in queryset:
obj.tags.add(*tags)
add_tags.short_description = "+"
@admin.action(
description=""
)
def remove_tags(self, request, queryset):
tags = request.POST.getlist('tags')
print('[-] Removing tags', tags, 'to Snapshots', queryset)
for obj in queryset:
obj.tags.remove(*tags)
remove_tags.short_description = ""
title_str.short_description = 'Title'
url_str.short_description = 'Original URL'
title_str.admin_order_field = 'title'
url_str.admin_order_field = 'url'
@admin.register(Tag, site=archivebox_admin)
class TagAdmin(admin.ModelAdmin):
list_display = ('slug', 'name', 'num_snapshots', 'snapshots', 'id')
sort_fields = ('id', 'name', 'slug')
@ -331,6 +421,7 @@ class TagAdmin(admin.ModelAdmin):
) + (f'<br/><a href="/admin/core/snapshot/?tags__id__exact={obj.id}">and {total_count-10} more...<a>' if obj.snapshot_set.count() > 10 else ''))
@admin.register(ArchiveResult, site=archivebox_admin)
class ArchiveResultAdmin(admin.ModelAdmin):
list_display = ('id', 'start_ts', 'extractor', 'snapshot_str', 'tags_str', 'cmd_str', 'status', 'output_str')
sort_fields = ('start_ts', 'extractor', 'status')
@ -343,6 +434,9 @@ class ArchiveResultAdmin(admin.ModelAdmin):
ordering = ['-start_ts']
list_per_page = SNAPSHOTS_PER_PAGE
@admin.display(
description='snapshot'
)
def snapshot_str(self, obj):
return format_html(
'<a href="/archive/{}/index.html"><b><code>[{}]</code></b></a><br/>'
@ -352,6 +446,9 @@ class ArchiveResultAdmin(admin.ModelAdmin):
obj.snapshot.url[:128],
)
@admin.display(
description='tags'
)
def tags_str(self, obj):
return obj.snapshot.tags_str()
@ -368,62 +465,3 @@ class ArchiveResultAdmin(admin.ModelAdmin):
obj.output if (obj.status == 'succeeded') and obj.extractor not in ('title', 'archive_org') else 'index.html',
obj.output,
)
tags_str.short_description = 'tags'
snapshot_str.short_description = 'snapshot'
class ArchiveBoxAdmin(admin.AdminSite):
site_header = 'ArchiveBox'
index_title = 'Links'
site_title = 'Index'
def get_urls(self):
return [
path('core/snapshot/add/', self.add_view, name='Add'),
] + super().get_urls()
def add_view(self, request):
if not request.user.is_authenticated:
return redirect(f'/admin/login/?next={request.path}')
request.current_app = self.name
context = {
**self.each_context(request),
'title': 'Add URLs',
}
if request.method == 'GET':
context['form'] = AddLinkForm()
elif request.method == 'POST':
form = AddLinkForm(request.POST)
if form.is_valid():
url = form.cleaned_data["url"]
print(f'[+] Adding URL: {url}')
depth = 0 if form.cleaned_data["depth"] == "0" else 1
input_kwargs = {
"urls": url,
"depth": depth,
"update_all": False,
"out_dir": OUTPUT_DIR,
}
add_stdout = StringIO()
with redirect_stdout(add_stdout):
add(**input_kwargs)
print(add_stdout.getvalue())
context.update({
"stdout": ansi_to_html(add_stdout.getvalue().strip()),
"form": AddLinkForm()
})
else:
context["form"] = form
return render(template_name='add.html', request=request, context=context)
admin.site = ArchiveBoxAdmin()
admin.site.register(get_user_model())
admin.site.register(Snapshot, SnapshotAdmin)
admin.site.register(Tag, TagAdmin)
admin.site.register(ArchiveResult, ArchiveResultAdmin)
admin.site.disable_action('delete_selected')

View file

@ -3,5 +3,8 @@ from django.apps import AppConfig
class CoreConfig(AppConfig):
name = 'core'
# WIP: broken by Django 3.1.2 -> 4.0 migration
default_auto_field = 'django.db.models.UUIDField'
def ready(self):
from .auth import register_signals
register_signals()

archivebox/core/auth.py (new file, 13 lines)

@ -0,0 +1,13 @@
import os
from django.conf import settings
from ..config import (
LDAP
)
def register_signals():
if LDAP:
import django_auth_ldap.backend
from .auth_ldap import create_user
django_auth_ldap.backend.populate_user.connect(create_user)


@ -0,0 +1,12 @@
from django.conf import settings
from ..config import (
LDAP_CREATE_SUPERUSER
)
def create_user(sender, user=None, ldap_user=None, **kwargs):
if not user.id and LDAP_CREATE_SUPERUSER:
user.is_superuser = True
user.is_staff = True
print(f'[!] WARNING: Creating new user {user} based on LDAP user {ldap_user} (is_staff={user.is_staff}, is_superuser={user.is_superuser})')


@ -18,4 +18,4 @@ class SearchResultsAdminMixin:
print(f'[!] Error while using search backend: {err.__class__.__name__} {err}')
messages.add_message(request, messages.WARNING, f'Error from the search backend, only showing results from default admin search fields - Error: {err}')
return qs, use_distinct
return qs.distinct(), use_distinct


@ -269,9 +269,6 @@ AUTH_PASSWORD_VALIDATORS = [
{'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator'},
]
# WIP: broken by Django 3.1.2 -> 4.0 migration
DEFAULT_AUTO_FIELD = 'django.db.models.UUIDField'
################################################################################
### Shell Settings
################################################################################
@ -290,7 +287,6 @@ if IS_SHELL:
LANGUAGE_CODE = 'en-us'
USE_I18N = True
USE_L10N = True
USE_TZ = True
DATETIME_FORMAT = 'Y-m-d g:iA'
SHORT_DATETIME_FORMAT = 'Y-m-d h:iA'


@ -1,4 +1,4 @@
from django.contrib import admin
from .admin import archivebox_admin
from django.urls import path, include
from django.views import static
@ -8,11 +8,6 @@ from django.views.generic.base import RedirectView
from core.views import HomepageView, SnapshotView, PublicIndexView, AddView, HealthCheckView
# GLOBAL_CONTEXT doesn't work as-is, disabled for now: https://github.com/ArchiveBox/ArchiveBox/discussions/1306
# from config import VERSION, VERSIONS_AVAILABLE, CAN_UPGRADE
# GLOBAL_CONTEXT = {'VERSION': VERSION, 'VERSIONS_AVAILABLE': VERSIONS_AVAILABLE, 'CAN_UPGRADE': CAN_UPGRADE}
# print('DEBUG', settings.DEBUG)
urlpatterns = [
@ -34,11 +29,8 @@ urlpatterns = [
path('accounts/', include('django.contrib.auth.urls')),
path('admin/', admin.site.urls),
path('admin/', archivebox_admin.urls),
# do not add extra_context like this as not all admin views (e.g. ModelAdmin.autocomplete_view accept extra kwargs)
# path('admin/', admin.site.urls, {'extra_context': GLOBAL_CONTEXT}),
path('health/', HealthCheckView.as_view(), name='healthcheck'),
path('error/', lambda _: 1/0),


@ -231,7 +231,7 @@ class PublicIndexView(ListView):
qs = qs | query_search_index(query)
except Exception as err:
print(f'[!] Error while using search backend: {err.__class__.__name__} {err}')
return qs
return qs.distinct()
def get(self, *args, **kwargs):
if PUBLIC_INDEX or self.request.user.is_authenticated:


@ -131,7 +131,7 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s
link = load_link_details(link, out_dir=out_dir)
write_link_details(link, out_dir=out_dir, skip_sql_index=False)
log_link_archiving_started(link, out_dir, is_new)
log_link_archiving_started(link, str(out_dir), is_new)
link = link.overwrite(updated=datetime.now(timezone.utc))
stats = {'skipped': 0, 'succeeded': 0, 'failed': 0}
start_ts = datetime.now(timezone.utc)
@ -165,16 +165,6 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s
# print('{black} X {}{reset}'.format(method_name, **ANSI))
stats['skipped'] += 1
except Exception as e:
# Disabled until https://github.com/ArchiveBox/ArchiveBox/issues/984
# and https://github.com/ArchiveBox/ArchiveBox/issues/1014
# are fixed.
"""
raise Exception('Exception in archive_methods.save_{}(Link(url={}))'.format(
method_name,
link.url,
)) from e
"""
# Instead, use the kludgy workaround from
# https://github.com/ArchiveBox/ArchiveBox/issues/984#issuecomment-1150541627
with open(ERROR_LOG, "a", encoding='utf-8') as f:
command = ' '.join(sys.argv)
@ -186,6 +176,13 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s
ts
) + "\n" + str(e) + "\n"))
#f.write(f"\n> {command}; ts={ts} version={config['VERSION']} docker={config['IN_DOCKER']} is_tty={config['IS_TTY']}\n")
# print(f' ERROR: {method_name} {e.__class__.__name__}: {e} {getattr(e, "hints", "")}', ts, link.url, command)
raise Exception('Exception in archive_methods.save_{}(Link(url={}))'.format(
method_name,
link.url,
)) from e
# print(' ', stats)
@ -218,7 +215,7 @@ def archive_links(all_links: Union[Iterable[Link], QuerySet], overwrite: bool=Fa
if type(all_links) is QuerySet:
num_links: int = all_links.count()
get_link = lambda x: x.as_link()
get_link = lambda x: x.as_link_with_details()
all_links = all_links.iterator()
else:
num_links: int = len(all_links)


@ -10,10 +10,12 @@ from ..system import run, chmod_file
from ..util import (
enforce_types,
is_static_file,
dedupe,
)
from ..config import (
TIMEOUT,
CURL_ARGS,
CURL_EXTRA_ARGS,
CHECK_SSL_VALIDITY,
SAVE_ARCHIVE_DOT_ORG,
CURL_BINARY,
@ -44,13 +46,18 @@ def save_archive_dot_org(link: Link, out_dir: Optional[Path]=None, timeout: int=
output: ArchiveOutput = 'archive.org.txt'
archive_org_url = None
submit_url = 'https://web.archive.org/save/{}'.format(link.url)
cmd = [
CURL_BINARY,
# later options take precedence
options = [
*CURL_ARGS,
*CURL_EXTRA_ARGS,
'--head',
'--max-time', str(timeout),
*(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []),
*([] if CHECK_SSL_VALIDITY else ['--insecure']),
]
cmd = [
CURL_BINARY,
*dedupe(options),
submit_url,
]
status = 'succeeded'
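The extractor hunks in the rest of this commit all follow the same pattern: stock *_ARGS come first, user-supplied *_EXTRA_ARGS after, then the combined list goes through a shared dedupe() helper imported from ..util so that later flags override earlier ones. A minimal sketch of such a helper, assuming options are keyed by the text before '='; the real implementation in archivebox/util.py may differ:

    def dedupe(options: list) -> list:
        """Keep one entry per option name; later occurrences ('later options take precedence') win."""
        deduped = {}
        for option in options:
            deduped[option.split('=')[0]] = option   # '--insecure' and '--user-agent=...' are keyed by flag name
        return list(deduped.values())

Under this scheme only '--flag=value'-style options are actually merged by name; flags passed as two separate list items (e.g. '--max-time', '60') keep their value as its own entry.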


@ -6,13 +6,18 @@ from typing import Optional
from ..index.schema import Link, ArchiveResult, ArchiveOutput
from ..system import chmod_file, run
from ..util import enforce_types, domain
from ..util import (
enforce_types,
domain,
dedupe,
)
from ..config import (
TIMEOUT,
SAVE_FAVICON,
FAVICON_PROVIDER,
CURL_BINARY,
CURL_ARGS,
CURL_EXTRA_ARGS,
CURL_VERSION,
CHECK_SSL_VALIDITY,
CURL_USER_AGENT,
@ -34,13 +39,18 @@ def save_favicon(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT)
out_dir = out_dir or link.link_dir
output: ArchiveOutput = 'favicon.ico'
cmd = [
CURL_BINARY,
# later options take precedence
options = [
*CURL_ARGS,
*CURL_EXTRA_ARGS,
'--max-time', str(timeout),
'--output', str(output),
*(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []),
*([] if CHECK_SSL_VALIDITY else ['--insecure']),
]
cmd = [
CURL_BINARY,
*dedupe(options),
FAVICON_PROVIDER.format(domain(link.url)),
]
status = 'failed'


@ -9,11 +9,13 @@ from ..system import atomic_write
from ..util import (
enforce_types,
get_headers,
dedupe,
)
from ..config import (
TIMEOUT,
CURL_BINARY,
CURL_ARGS,
CURL_EXTRA_ARGS,
CURL_USER_AGENT,
CURL_VERSION,
CHECK_SSL_VALIDITY,
@ -40,14 +42,18 @@ def save_headers(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT)
status = 'succeeded'
timer = TimedProgress(timeout, prefix=' ')
cmd = [
CURL_BINARY,
# later options take precedence
options = [
*CURL_ARGS,
*CURL_EXTRA_ARGS,
'--head',
'--max-time', str(timeout),
*(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []),
*([] if CHECK_SSL_VALIDITY else ['--insecure']),
]
cmd = [
CURL_BINARY,
*dedupe(options),
link.url,
]
try:


@ -121,9 +121,11 @@ def save_htmltotext(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO
out_dir = Path(out_dir or link.link_dir)
output = "htmltotext.txt"
cmd = ['(internal) archivebox.extractors.htmltotext', './{singlefile,dom}.html']
timer = TimedProgress(timeout, prefix=' ')
extracted_text = None
status = 'failed'
try:
extractor = HTMLTextExtractor()
document = get_html(link, out_dir)
@ -136,10 +138,9 @@ def save_htmltotext(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO
extracted_text = str(extractor)
atomic_write(str(out_dir / output), extracted_text)
status = 'succeeded'
except (Exception, OSError) as err:
status = 'failed'
output = err
cmd = ['(internal) archivebox.extractors.htmltotext', './{singlefile,dom}.html']
finally:
timer.end()


@ -8,11 +8,13 @@ from ..system import run, chmod_file
from ..util import (
enforce_types,
is_static_file,
dedupe,
)
from ..config import (
MEDIA_TIMEOUT,
SAVE_MEDIA,
YOUTUBEDL_ARGS,
YOUTUBEDL_EXTRA_ARGS,
YOUTUBEDL_BINARY,
YOUTUBEDL_VERSION,
CHECK_SSL_VALIDITY
@ -39,11 +41,16 @@ def save_media(link: Link, out_dir: Optional[Path]=None, timeout: int=MEDIA_TIME
output: ArchiveOutput = 'media'
output_path = out_dir / output
output_path.mkdir(exist_ok=True)
cmd = [
YOUTUBEDL_BINARY,
# later options take precedence
options = [
*YOUTUBEDL_ARGS,
*YOUTUBEDL_EXTRA_ARGS,
*([] if CHECK_SSL_VALIDITY else ['--no-check-certificate']),
# TODO: add --cookies-from-browser={CHROME_USER_DATA_DIR}
]
cmd = [
YOUTUBEDL_BINARY,
*dedupe(options),
link.url,
]
status = 'succeeded'


@ -11,13 +11,15 @@ from ..system import run, atomic_write
from ..util import (
enforce_types,
is_static_file,
dedupe,
)
from ..config import (
TIMEOUT,
SAVE_MERCURY,
DEPENDENCIES,
MERCURY_VERSION,
MERCURY_ARGS,
MERCURY_EXTRA_ARGS,
)
from ..logging_util import TimedProgress
@ -60,12 +62,16 @@ def save_mercury(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT)
timer = TimedProgress(timeout, prefix=' ')
try:
output_folder.mkdir(exist_ok=True)
# Get plain text version of article
# later options take precedence
options = [
*MERCURY_ARGS,
*MERCURY_EXTRA_ARGS,
]
# By default, get plain text version of article
cmd = [
DEPENDENCIES['MERCURY_BINARY']['path'],
link.url,
"--format=text"
*dedupe(options)
]
result = run(cmd, cwd=out_dir, timeout=timeout)
try:


@ -11,6 +11,7 @@ from ..util import (
enforce_types,
is_static_file,
chrome_args,
dedupe,
)
from ..config import (
TIMEOUT,
@ -18,6 +19,7 @@ from ..config import (
DEPENDENCIES,
SINGLEFILE_VERSION,
SINGLEFILE_ARGS,
SINGLEFILE_EXTRA_ARGS,
CHROME_BINARY,
COOKIES_FILE,
)
@ -47,38 +49,24 @@ def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO
# SingleFile CLI Docs: https://github.com/gildas-lormeau/SingleFile/tree/master/cli
browser_args = '--browser-args={}'.format(json.dumps(browser_args[1:]))
# later options take precedence
options = [
*SINGLEFILE_ARGS,
*(["--browser-cookies-file={}".format(COOKIES_FILE)] if COOKIES_FILE else []),
'--browser-executable-path={}'.format(CHROME_BINARY),
*(["--browser-cookies-file={}".format(COOKIES_FILE)] if COOKIES_FILE else []),
browser_args,
*SINGLEFILE_ARGS,
*SINGLEFILE_EXTRA_ARGS,
]
# Deduplicate options (single-file doesn't like when you use the same option two times)
#
# NOTE: Options names that come first clobber conflicting names that come later
# My logic is SINGLEFILE_ARGS is the option that affects the singlefile command with most
# specificity, therefore the user sets it with a lot intent, therefore it should take precedence
# kind of like the ergonomic principle of lexical scope in programming languages.
seen_option_names = []
def test_seen(argument):
option_name = argument.split("=")[0]
if option_name in seen_option_names:
return False
else:
seen_option_names.append(option_name)
return True
deduped_options = list(filter(test_seen, options))
cmd = [
DEPENDENCIES['SINGLEFILE_BINARY']['path'],
*deduped_options,
*dedupe(options),
link.url,
output,
]
status = 'succeeded'
timer = TimedProgress(timeout, prefix=' ')
result = None
try:
result = run(cmd, cwd=str(out_dir), timeout=timeout)
@ -86,7 +74,7 @@ def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO
# "Downloaded: 76 files, 4.0M in 1.6s (2.52 MB/s)"
output_tail = [
line.strip()
for line in (result.stdout + result.stderr).decode().rsplit('\n', 3)[-3:]
for line in (result.stdout + result.stderr).decode().rsplit('\n', 5)[-5:]
if line.strip()
]
hints = (
@ -96,12 +84,13 @@ def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO
# Check for common failure cases
if (result.returncode > 0) or not (out_dir / output).is_file():
raise ArchiveError('SingleFile was not able to archive the page', hints)
raise ArchiveError(f'SingleFile was not able to archive the page (status={result.returncode})', hints)
chmod_file(output, cwd=str(out_dir))
except (Exception, OSError) as err:
status = 'failed'
# TODO: Make this prettier. This is necessary to run the command (escape JSON internal quotes).
cmd[2] = browser_args.replace('"', "\\\"")
err.hints = (result.stdout + result.stderr).decode().split('\n')
output = err
finally:
timer.end()


@ -10,6 +10,7 @@ from ..util import (
enforce_types,
download_url,
htmldecode,
dedupe,
)
from ..config import (
TIMEOUT,
@ -17,6 +18,7 @@ from ..config import (
SAVE_TITLE,
CURL_BINARY,
CURL_ARGS,
CURL_EXTRA_ARGS,
CURL_VERSION,
CURL_USER_AGENT,
)
@ -75,7 +77,7 @@ def get_html(link: Link, path: Path, timeout: int=TIMEOUT) -> str:
with open(abs_path / source, "r", encoding="utf-8") as f:
document = f.read()
break
except (FileNotFoundError, TypeError):
except (FileNotFoundError, TypeError, UnicodeDecodeError):
continue
if document is None:
return download_url(link.url, timeout=timeout)
@ -102,12 +104,17 @@ def save_title(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -
from core.models import Snapshot
output: ArchiveOutput = None
cmd = [
CURL_BINARY,
# later options take precedence
options = [
*CURL_ARGS,
*CURL_EXTRA_ARGS,
'--max-time', str(timeout),
*(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []),
*([] if CHECK_SSL_VALIDITY else ['--insecure']),
]
cmd = [
CURL_BINARY,
*dedupe(options),
link.url,
]
status = 'succeeded'


@ -15,9 +15,11 @@ from ..util import (
path,
domain,
urldecode,
dedupe,
)
from ..config import (
WGET_ARGS,
WGET_EXTRA_ARGS,
TIMEOUT,
SAVE_WGET,
SAVE_WARC,
@ -55,10 +57,10 @@ def save_wget(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) ->
# WGET CLI Docs: https://www.gnu.org/software/wget/manual/wget.html
output: ArchiveOutput = None
cmd = [
WGET_BINARY,
# '--server-response', # print headers for better error parsing
# later options take precedence
options = [
*WGET_ARGS,
*WGET_EXTRA_ARGS,
'--timeout={}'.format(timeout),
*(['--restrict-file-names={}'.format(RESTRICT_FILE_NAMES)] if RESTRICT_FILE_NAMES else []),
*(['--warc-file={}'.format(str(warc_path))] if SAVE_WARC else []),
@ -68,6 +70,11 @@ def save_wget(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) ->
*(['--compression=auto'] if WGET_AUTO_COMPRESSION else []),
*([] if SAVE_WARC else ['--timestamping']),
*([] if CHECK_SSL_VALIDITY else ['--no-check-certificate', '--no-hsts']),
# '--server-response', # print headers for better error parsing
]
cmd = [
WGET_BINARY,
*dedupe(options),
link.url,
]
@ -202,4 +209,9 @@ def wget_output_path(link: Link) -> Optional[str]:
if search_dir.is_dir():
return domain(link.url).replace(":", "+")
# fallback to just the domain dir without port
search_dir = Path(link.link_dir) / domain(link.url).split(":", 1)[0]
if search_dir.is_dir():
return domain(link.url).split(":", 1)[0]
return None


@ -250,7 +250,7 @@ def load_main_index(out_dir: Path=OUTPUT_DIR, warn: bool=True) -> List[Link]:
"""parse and load existing index with any new links from import_path merged in"""
from core.models import Snapshot
try:
return Snapshot.objects.all()
return Snapshot.objects.all().only('id')
except (KeyboardInterrupt, SystemExit):
raise SystemExit(0)
@ -407,7 +407,7 @@ def snapshot_filter(snapshots: QuerySet, filter_patterns: List[str], filter_type
def get_indexed_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
"""indexed links without checking archive status or data directory validity"""
links = [snapshot.as_link_with_details() for snapshot in snapshots.iterator()]
links = (snapshot.as_link() for snapshot in snapshots.iterator())
return {
link.link_dir: link
for link in links
@ -415,7 +415,7 @@ def get_indexed_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Option
def get_archived_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
"""indexed links that are archived with a valid data directory"""
links = [snapshot.as_link_with_details() for snapshot in snapshots.iterator()]
links = (snapshot.as_link() for snapshot in snapshots.iterator())
return {
link.link_dir: link
for link in filter(is_archived, links)
@ -423,7 +423,7 @@ def get_archived_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optio
def get_unarchived_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
"""indexed links that are unarchived with no data directory or an empty data directory"""
links = [snapshot.as_link_with_details() for snapshot in snapshots.iterator()]
links = (snapshot.as_link() for snapshot in snapshots.iterator())
return {
link.link_dir: link
for link in filter(is_unarchived, links)


@ -379,11 +379,15 @@ class Link:
output_paths = (
domain(self.url),
'output.html',
'output.pdf',
'screenshot.png',
'output.html',
'singlefile.html',
'readability/content.html',
'mercury/content.html',
'htmltotext.txt',
'media',
'singlefile.html'
'git',
)
return any(


@ -432,12 +432,14 @@ def log_archive_method_finished(result: "ArchiveResult"):
**ANSI,
),
]
# import pudb; pudb.set_trace()
# Prettify error output hints string and limit to five lines
hints = getattr(result.output, 'hints', None) or ()
if hints:
if isinstance(hints, (list, tuple, type(_ for _ in ()))):
hints = [hint.decode() for hint in hints if isinstance(hint, bytes)]
hints = [hint.decode() if isinstance(hint, bytes) else str(hint) for hint in hints]
else:
if isinstance(hints, bytes):
hints = hints.decode()
@ -492,12 +494,12 @@ def log_removal_started(links: List["Link"], yes: bool, delete: bool):
if delete:
file_counts = [link.num_outputs for link in links if Path(link.link_dir).exists()]
print(
f' {len(links)} Links will be de-listed from the main index, and their archived content folders will be deleted from disk.\n'
f' {len(links)} Links will be de-listed from the main index, and their archived content folders will be deleted from disk.\n' +
f' ({len(file_counts)} data folders with {sum(file_counts)} archived files will be deleted!)'
)
else:
print(
' Matching links will be de-listed from the main index, but their archived content folders will remain in place on disk.\n'
' Matching links will be de-listed from the main index, but their archived content folders will remain in place on disk.\n' +
' (Pass --delete if you also want to permanently delete the data folders)'
)
@ -636,17 +638,15 @@ def printable_folder_status(name: str, folder: Dict) -> str:
@enforce_types
def printable_dependency_version(name: str, dependency: Dict) -> str:
version = None
color, symbol, note, version = 'red', 'X', 'invalid', '?'
if dependency['enabled']:
if dependency['is_valid']:
color, symbol, note, version = 'green', '', 'valid', ''
color, symbol, note = 'green', '', 'valid'
parsed_version_num = re.search(r'[\d\.]+', dependency['version'])
if parsed_version_num:
version = f'v{parsed_version_num[0]}'
if not version:
color, symbol, note, version = 'red', 'X', 'invalid', '?'
else:
color, symbol, note, version = 'lightyellow', '-', 'disabled', '-'


@ -791,6 +791,8 @@ def update(resume: Optional[float]=None,
out_dir: Path=OUTPUT_DIR) -> List[Link]:
"""Import any new links from subscriptions and retry any previously failed/skipped links"""
from core.models import ArchiveResult
check_data_folder(out_dir=out_dir)
check_dependencies()
new_links: List[Link] = [] # TODO: Remove input argument: only_new
@ -798,19 +800,23 @@ def update(resume: Optional[float]=None,
extractors = extractors.split(",") if extractors else []
# Step 1: Filter for selected_links
print('[*] Finding matching Snapshots to update...')
print(f' - Filtering by {" ".join(filter_patterns)} ({filter_type}) {before=} {after=} {status=}...')
matching_snapshots = list_links(
filter_patterns=filter_patterns,
filter_type=filter_type,
before=before,
after=after,
)
print(f' - Checking {matching_snapshots.count()} snapshot folders for existing data with {status=}...')
matching_folders = list_folders(
links=matching_snapshots,
status=status,
out_dir=out_dir,
)
all_links = [link for link in matching_folders.values() if link]
all_links = (link for link in matching_folders.values() if link)
print(' - Sorting by most unfinished -> least unfinished + date archived...')
all_links = sorted(all_links, key=lambda link: (ArchiveResult.objects.filter(snapshot__url=link.url).count(), link.timestamp))
if index_only:
for link in all_links:
@ -836,6 +842,7 @@ def update(resume: Optional[float]=None,
if extractors:
archive_kwargs["methods"] = extractors
archive_links(to_archive, overwrite=overwrite, **archive_kwargs)
# Step 4: Re-write links index with updated titles, icons, and resources

archivebox/package-lock.json (generated, 2371 changed lines; diff omitted because it is too large)


@ -1,6 +1,6 @@
{
"name": "archivebox",
"version": "0.7.2",
"version": "0.8.0",
"description": "ArchiveBox: The self-hosted internet archive",
"author": "Nick Sweeting <archivebox-npm@sweeting.me>",
"repository": "github:ArchiveBox/ArchiveBox",
@ -8,6 +8,6 @@
"dependencies": {
"@postlight/parser": "^2.2.3",
"readability-extractor": "github:ArchiveBox/readability-extractor",
"single-file-cli": "^1.1.46"
"single-file-cli": "^1.1.54"
}
}


@ -44,6 +44,7 @@ from . import medium_rss
from . import netscape_html
from . import generic_rss
from . import generic_json
from . import generic_jsonl
from . import generic_html
from . import generic_txt
from . import url_list
@ -63,6 +64,7 @@ PARSERS = {
netscape_html.KEY: (netscape_html.NAME, netscape_html.PARSER),
generic_rss.KEY: (generic_rss.NAME, generic_rss.PARSER),
generic_json.KEY: (generic_json.NAME, generic_json.PARSER),
generic_jsonl.KEY: (generic_jsonl.NAME, generic_jsonl.PARSER),
generic_html.KEY: (generic_html.NAME, generic_html.PARSER),
# Catchall fallback parser


@ -11,6 +11,60 @@ from ..util import (
enforce_types,
)
# This gets used by generic_jsonl, too
def jsonObjectToLink(link: str, source: str):
json_date = lambda s: datetime.strptime(s, '%Y-%m-%dT%H:%M:%S%z')
# example line
# {"href":"http:\/\/www.reddit.com\/r\/example","description":"title here","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"reddit android"}]
# Parse URL
url = link.get('href') or link.get('url') or link.get('URL')
if not url:
raise Exception('JSON must contain URL in each entry [{"url": "http://...", ...}, ...]')
# Parse the timestamp
ts_str = str(datetime.now(timezone.utc).timestamp())
if link.get('timestamp'):
# chrome/ff histories use a very precise timestamp
ts_str = str(link['timestamp'] / 10000000)
elif link.get('time'):
ts_str = str(json_date(link['time'].split(',', 1)[0]).timestamp())
elif link.get('created_at'):
ts_str = str(json_date(link['created_at']).timestamp())
elif link.get('created'):
ts_str = str(json_date(link['created']).timestamp())
elif link.get('date'):
ts_str = str(json_date(link['date']).timestamp())
elif link.get('bookmarked'):
ts_str = str(json_date(link['bookmarked']).timestamp())
elif link.get('saved'):
ts_str = str(json_date(link['saved']).timestamp())
# Parse the title
title = None
if link.get('title'):
title = link['title'].strip()
elif link.get('description'):
title = link['description'].replace(' — Readability', '').strip()
elif link.get('name'):
title = link['name'].strip()
# if we have a list, join it with commas
tags = link.get('tags')
if type(tags) == list:
tags = ','.join(tags)
elif type(tags) == str:
# if there's no comma, assume it was space-separated
if ',' not in tags:
tags = tags.replace(' ', ',')
return Link(
url=htmldecode(url),
timestamp=ts_str,
title=htmldecode(title) or None,
tags=htmldecode(tags),
sources=[source],
)
@enforce_types
def parse_generic_json_export(json_file: IO[str], **_kwargs) -> Iterable[Link]:
@ -18,55 +72,21 @@ def parse_generic_json_export(json_file: IO[str], **_kwargs) -> Iterable[Link]:
json_file.seek(0)
# sometimes the first line is a comment or filepath, so we get everything after the first {
json_file_json_str = '{' + json_file.read().split('{', 1)[-1]
links = json.loads(json_file_json_str)
json_date = lambda s: datetime.strptime(s, '%Y-%m-%dT%H:%M:%S%z')
try:
links = json.load(json_file)
if type(links) != list:
raise Exception('JSON parser expects list of objects, maybe this is JSONL?')
except json.decoder.JSONDecodeError:
# sometimes the first line is a comment or other junk, so try without
json_file.seek(0)
first_line = json_file.readline()
#print(' > Trying JSON parser without first line: "', first_line.strip(), '"', sep= '')
links = json.load(json_file)
# we may fail again, which means we really don't know what to do
for link in links:
# example line
# {"href":"http:\/\/www.reddit.com\/r\/example","description":"title here","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"reddit android"}]
if link:
# Parse URL
url = link.get('href') or link.get('url') or link.get('URL')
if not url:
raise Exception('JSON must contain URL in each entry [{"url": "http://...", ...}, ...]')
# Parse the timestamp
ts_str = str(datetime.now(timezone.utc).timestamp())
if link.get('timestamp'):
# chrome/ff histories use a very precise timestamp
ts_str = str(link['timestamp'] / 10000000)
elif link.get('time'):
ts_str = str(json_date(link['time'].split(',', 1)[0]).timestamp())
elif link.get('created_at'):
ts_str = str(json_date(link['created_at']).timestamp())
elif link.get('created'):
ts_str = str(json_date(link['created']).timestamp())
elif link.get('date'):
ts_str = str(json_date(link['date']).timestamp())
elif link.get('bookmarked'):
ts_str = str(json_date(link['bookmarked']).timestamp())
elif link.get('saved'):
ts_str = str(json_date(link['saved']).timestamp())
# Parse the title
title = None
if link.get('title'):
title = link['title'].strip()
elif link.get('description'):
title = link['description'].replace(' — Readability', '').strip()
elif link.get('name'):
title = link['name'].strip()
yield Link(
url=htmldecode(url),
timestamp=ts_str,
title=htmldecode(title) or None,
tags=htmldecode(link.get('tags')) or '',
sources=[json_file.name],
)
yield jsonObjectToLink(link,json_file.name)
KEY = 'json'
NAME = 'Generic JSON'
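With the refactor above, the list-based JSON format can be exercised directly; a minimal sketch (assumes an ArchiveBox checkout on the PYTHONPATH, sample data adapted from the example line in the comment above):

import io
from archivebox.parsers.generic_json import parse_generic_json_export

sample = io.StringIO(
    '[{"href": "http://www.reddit.com/r/example", '
    '"description": "title here", '
    '"time": "2014-06-14T15:51:42Z", '
    '"tags": "reddit android"}]'
)
sample.name = 'bookmarks.json'   # parsers record the source filename in Link.sources

for link in parse_generic_json_export(sample):
    # prints the URL, a unix-epoch timestamp string, the title, and 'reddit,android'
    print(link.url, link.timestamp, link.title, link.tags)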

View file

@ -0,0 +1,34 @@
__package__ = 'archivebox.parsers'
import json
from typing import IO, Iterable
from datetime import datetime, timezone
from ..index.schema import Link
from ..util import (
htmldecode,
enforce_types,
)
from .generic_json import jsonObjectToLink
def parse_line(line: str):
if line.strip() != "":
return json.loads(line)
@enforce_types
def parse_generic_jsonl_export(json_file: IO[str], **_kwargs) -> Iterable[Link]:
"""Parse JSONL format bookmarks export files"""
json_file.seek(0)
links = [ parse_line(line) for line in json_file ]
for link in links:
if link:
yield jsonObjectToLink(link,json_file.name)
KEY = 'jsonl'
NAME = 'Generic JSONL'
PARSER = parse_generic_jsonl_export
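The JSONL parser reuses jsonObjectToLink() on one JSON object per line, skipping blank lines. A minimal usage sketch (sample data invented):

import io
from archivebox.parsers.generic_jsonl import parse_generic_jsonl_export

sample = io.StringIO(
    '{"url": "https://example.com/a", "title": "First", "tags": ["news", "tech"]}\n'
    '\n'
    '{"url": "https://example.com/b", "title": "Second", "tags": "selfhosted docker", "created": "2024-01-01T00:00:00+00:00"}\n'
)
sample.name = 'bookmarks.jsonl'

for link in parse_generic_jsonl_export(sample):
    # list tags come out comma-joined ('news,tech'); space-separated string tags become 'selfhosted,docker'
    print(link.url, link.title, link.tags)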

View file

@ -2,13 +2,13 @@ __package__ = 'archivebox.parsers'
from typing import IO, Iterable
from datetime import datetime
from time import mktime
from feedparser import parse as feedparser
from ..index.schema import Link
from ..util import (
htmldecode,
enforce_types,
str_between,
enforce_types
)
@enforce_types
@ -16,35 +16,27 @@ def parse_generic_rss_export(rss_file: IO[str], **_kwargs) -> Iterable[Link]:
"""Parse RSS XML-format files into links"""
rss_file.seek(0)
items = rss_file.read().split('<item>')
items = items[1:] if items else []
for item in items:
# example item:
# <item>
# <title><![CDATA[How JavaScript works: inside the V8 engine]]></title>
# <category>Unread</category>
# <link>https://blog.sessionstack.com/how-javascript-works-inside</link>
# <guid>https://blog.sessionstack.com/how-javascript-works-inside</guid>
# <pubDate>Mon, 21 Aug 2017 14:21:58 -0500</pubDate>
# </item>
feed = feedparser(rss_file.read())
for item in feed.entries:
url = item.link
title = item.title
time = mktime(item.updated_parsed)
trailing_removed = item.split('</item>', 1)[0]
leading_removed = trailing_removed.split('<item>', 1)[-1].strip()
rows = leading_removed.split('\n')
try:
tags = ','.join(map(lambda tag: tag.term, item.tags))
except AttributeError:
tags = ''
def get_row(key):
return [r for r in rows if r.strip().startswith('<{}>'.format(key))][0]
url = str_between(get_row('link'), '<link>', '</link>')
ts_str = str_between(get_row('pubDate'), '<pubDate>', '</pubDate>')
time = datetime.strptime(ts_str, "%a, %d %b %Y %H:%M:%S %z")
title = str_between(get_row('title'), '<![CDATA[', ']]').strip()
if url is None:
# Yielding a Link with no URL will
# crash on a URL validation assertion
continue
yield Link(
url=htmldecode(url),
timestamp=str(time.timestamp()),
timestamp=str(time),
title=htmldecode(title) or None,
tags=None,
tags=tags,
sources=[rss_file.name],
)
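Since the hand-rolled <item> splitting is replaced by feedparser here, the example item from the removed comment can be fed straight through the parser; a quick sketch (assumes the feedparser dependency is installed):

import io
from archivebox.parsers.generic_rss import parse_generic_rss_export

rss = io.StringIO("""<?xml version="1.0"?>
<rss version="2.0"><channel><title>Example feed</title>
  <item>
    <title>How JavaScript works: inside the V8 engine</title>
    <category>Unread</category>
    <link>https://blog.sessionstack.com/how-javascript-works-inside</link>
    <guid>https://blog.sessionstack.com/how-javascript-works-inside</guid>
    <pubDate>Mon, 21 Aug 2017 14:21:58 -0500</pubDate>
  </item>
</channel></rss>""")
rss.name = 'feed.rss'

for link in parse_generic_rss_export(rss):
    print(link.url, link.timestamp, link.tags)   # tags now include the <category> value: 'Unread'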

View file

@ -2,50 +2,41 @@ __package__ = 'archivebox.parsers'
from typing import IO, Iterable
from datetime import datetime, timezone
from xml.etree import ElementTree
from time import mktime
from feedparser import parse as feedparser
from ..index.schema import Link
from ..util import (
htmldecode,
enforce_types,
enforce_types
)
@enforce_types
def parse_pinboard_rss_export(rss_file: IO[str], **_kwargs) -> Iterable[Link]:
"""Parse Pinboard RSS feed files into links"""
rss_file.seek(0)
root = ElementTree.parse(rss_file).getroot()
items = root.findall("{http://purl.org/rss/1.0/}item")
for item in items:
find = lambda p: item.find(p).text.strip() if item.find(p) is not None else None # type: ignore
feed = feedparser(rss_file.read())
for item in feed.entries:
url = item.link
# title will start with "[priv] " if pin was marked private. useful?
title = item.title
time = mktime(item.updated_parsed)
url = find("{http://purl.org/rss/1.0/}link")
tags = find("{http://purl.org/dc/elements/1.1/}subject")
title = find("{http://purl.org/rss/1.0/}title")
ts_str = find("{http://purl.org/dc/elements/1.1/}date")
# all tags are in one entry.tags with spaces in it. annoying!
try:
tags = item.tags[0].term.replace(' ', ',')
except AttributeError:
tags = ''
if url is None:
# Yielding a Link with no URL will
# crash on a URL validation assertion
continue
# Pinboard includes a colon in its date stamp timezone offsets, which
# Python can't parse. Remove it:
if ts_str and ts_str[-3:-2] == ":":
ts_str = ts_str[:-3]+ts_str[-2:]
if ts_str:
time = datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%S%z")
else:
time = datetime.now(timezone.utc)
yield Link(
url=htmldecode(url),
timestamp=str(time.timestamp()),
timestamp=str(time),
title=htmldecode(title) or None,
tags=htmldecode(tags) or None,
sources=[rss_file.name],
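Pinboard's RSS 1.0 export puts all tags into a single space-separated dc:subject value, which the new code converts to a comma-separated list; a rough sketch (feed contents invented):

import io
from archivebox.parsers.pinboard_rss import parse_pinboard_rss_export

pinboard = io.StringIO("""<?xml version="1.0" encoding="UTF-8"?>
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
         xmlns="http://purl.org/rss/1.0/"
         xmlns:dc="http://purl.org/dc/elements/1.1/">
  <channel rdf:about="https://pinboard.in/u:example/">
    <title>Pinboard (example)</title>
    <link>https://pinboard.in/u:example/</link>
  </channel>
  <item rdf:about="https://example.com/article">
    <title>[priv] Example article</title>
    <link>https://example.com/article</link>
    <dc:date>2024-01-02T03:04:05+00:00</dc:date>
    <dc:subject>python archiving selfhosted</dc:subject>
  </item>
</rdf:RDF>""")
pinboard.name = 'pinboard_export.rss'

for link in parse_pinboard_rss_export(pinboard):
    print(link.url, link.tags)   # tags -> 'python,archiving,selfhosted'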

View file

@ -30,8 +30,7 @@ def run(cmd, *args, input=None, capture_output=True, timeout=None, check=False,
if capture_output:
if ('stdout' in kwargs) or ('stderr' in kwargs):
raise ValueError('stdout and stderr arguments may not be used '
'with capture_output.')
raise ValueError('stdout and stderr arguments may not be used with capture_output.')
kwargs['stdout'] = PIPE
kwargs['stderr'] = PIPE
@ -146,20 +145,24 @@ def get_dir_size(path: Union[str, Path], recursive: bool=True, pattern: Optional
recursively and limiting to a given filter list
"""
num_bytes, num_dirs, num_files = 0, 0, 0
for entry in os.scandir(path):
if (pattern is not None) and (pattern not in entry.path):
continue
if entry.is_dir(follow_symlinks=False):
if not recursive:
try:
for entry in os.scandir(path):
if (pattern is not None) and (pattern not in entry.path):
continue
num_dirs += 1
bytes_inside, dirs_inside, files_inside = get_dir_size(entry.path)
num_bytes += bytes_inside
num_dirs += dirs_inside
num_files += files_inside
else:
num_bytes += entry.stat(follow_symlinks=False).st_size
num_files += 1
if entry.is_dir(follow_symlinks=False):
if not recursive:
continue
num_dirs += 1
bytes_inside, dirs_inside, files_inside = get_dir_size(entry.path)
num_bytes += bytes_inside
num_dirs += dirs_inside
num_files += files_inside
else:
num_bytes += entry.stat(follow_symlinks=False).st_size
num_files += 1
except OSError:
# e.g. FileNameTooLong or other error while trying to read dir
pass
return num_bytes, num_dirs, num_files
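The reworked get_dir_size() above now tolerates unreadable entries (e.g. over-long filenames raising OSError) instead of crashing; usage is unchanged. A quick sketch (assumes it is run from a collection folder that has an ./archive subdirectory):

from archivebox.system import get_dir_size

# total bytes, dir count, and file count under ./archive,
# only counting entries whose path contains 'warc'
num_bytes, num_dirs, num_files = get_dir_size('./archive', recursive=True, pattern='warc')
print(f'{num_bytes} bytes across {num_files} files in {num_dirs} dirs')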
@ -171,7 +174,7 @@ def dedupe_cron_jobs(cron: CronTab) -> CronTab:
deduped: Set[Tuple[str, str]] = set()
for job in list(cron):
unique_tuple = (str(job.slices), job.command)
unique_tuple = (str(job.slices), str(job.command))
if unique_tuple not in deduped:
deduped.add(unique_tuple)
cron.remove(job)

View file

@ -5,7 +5,7 @@
<a href="{% url 'Home' %}">Snapshots</a> |
<a href="/admin/core/tag/">Tags</a> |
<a href="/admin/core/archiveresult/?o=-1">Log</a> &nbsp; &nbsp;
<a href="{% url 'Docs' %}">Docs</a> |
<a href="{% url 'Docs' %}" target="_blank" rel="noopener noreferrer">Docs</a> |
<a href="{% url 'public-index' %}">Public</a> |
<a href="/admin/">Admin</a>
&nbsp; &nbsp;

File diff suppressed because one or more lines are too long

View file

@ -3,6 +3,7 @@ __package__ = 'archivebox'
import re
import requests
import json as pyjson
import http.cookiejar
from typing import List, Optional, Any
from pathlib import Path
@ -200,9 +201,22 @@ def parse_date(date: Any) -> Optional[datetime]:
@enforce_types
def download_url(url: str, timeout: int=None) -> str:
"""Download the contents of a remote url and return the text"""
from .config import TIMEOUT, CHECK_SSL_VALIDITY, WGET_USER_AGENT
from .config import (
TIMEOUT,
CHECK_SSL_VALIDITY,
WGET_USER_AGENT,
COOKIES_FILE,
)
timeout = timeout or TIMEOUT
response = requests.get(
session = requests.Session()
if COOKIES_FILE and Path(COOKIES_FILE).is_file():
cookie_jar = http.cookiejar.MozillaCookieJar(COOKIES_FILE)
cookie_jar.load(ignore_discard=True, ignore_expires=True)
for cookie in cookie_jar:
session.cookies.set(cookie.name, cookie.value, domain=cookie.domain, path=cookie.path)
response = session.get(
url,
headers={'User-Agent': WGET_USER_AGENT},
verify=CHECK_SSL_VALIDITY,
@ -215,7 +229,11 @@ def download_url(url: str, timeout: int=None) -> str:
if encoding is not None:
response.encoding = encoding
return response.text
try:
return response.text
except UnicodeDecodeError:
# if the response is non-text (e.g. an image or other binary file), just return the filename instead
return url.rsplit('/', 1)[-1]
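For reference, the cookie handling added above mirrors this standalone pattern: load a Netscape/Mozilla-format cookies.txt into a requests session before fetching (a minimal sketch; the path and URL are placeholders):

import http.cookiejar
import requests

cookie_jar = http.cookiejar.MozillaCookieJar('/path/to/cookies.txt')   # placeholder path
cookie_jar.load(ignore_discard=True, ignore_expires=True)

session = requests.Session()
for cookie in cookie_jar:
    session.cookies.set(cookie.name, cookie.value, domain=cookie.domain, path=cookie.path)

response = session.get('https://example.com/members-only', timeout=60)   # placeholder URL
print(response.status_code, len(response.text))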
@enforce_types
def get_headers(url: str, timeout: int=None) -> str:
@ -257,7 +275,13 @@ def get_headers(url: str, timeout: int=None) -> str:
def chrome_args(**options) -> List[str]:
"""helper to build up a chrome shell command with arguments"""
from .config import CHROME_OPTIONS, CHROME_VERSION
# Chrome CLI flag documentation: https://peter.sh/experiments/chromium-command-line-switches/
from .config import (
CHROME_OPTIONS,
CHROME_VERSION,
CHROME_EXTRA_ARGS,
)
options = {**CHROME_OPTIONS, **options}
@ -266,6 +290,8 @@ def chrome_args(**options) -> List[str]:
cmd_args = [options['CHROME_BINARY']]
cmd_args += CHROME_EXTRA_ARGS
if options['CHROME_HEADLESS']:
chrome_major_version = int(re.search(r'\s(\d+)\.\d', CHROME_VERSION)[1])
if chrome_major_version >= 111:
@ -284,14 +310,19 @@ def chrome_args(**options) -> List[str]:
"--disable-software-rasterizer",
"--run-all-compositor-stages-before-draw",
"--hide-scrollbars",
"--window-size=1440,2000",
"--autoplay-policy=no-user-gesture-required",
"--no-first-run",
"--use-fake-ui-for-media-stream",
"--use-fake-device-for-media-stream",
"--disable-sync",
# "--password-store=basic",
)
# disable automatic updating when running headless, as there's no user to see the upgrade prompts
cmd_args += ("--simulate-outdated-no-au='Tue, 31 Dec 2099 23:59:59 GMT'",)
# set window size for screenshot/pdf/etc. rendering
cmd_args += ('--window-size={}'.format(options['RESOLUTION']),)
if not options['CHECK_SSL_VALIDITY']:
cmd_args += ('--disable-web-security', '--ignore-certificate-errors')
@ -299,16 +330,15 @@ def chrome_args(**options) -> List[str]:
if options['CHROME_USER_AGENT']:
cmd_args += ('--user-agent={}'.format(options['CHROME_USER_AGENT']),)
if options['RESOLUTION']:
cmd_args += ('--window-size={}'.format(options['RESOLUTION']),)
if options['CHROME_TIMEOUT']:
cmd_args += ('--timeout={}'.format(options['CHROME_TIMEOUT'] * 1000),)
if options['CHROME_USER_DATA_DIR']:
cmd_args.append('--user-data-dir={}'.format(options['CHROME_USER_DATA_DIR']))
return cmd_args
cmd_args.append('--profile-directory=Default')
return dedupe(cmd_args)
def chrome_cleanup():
"""
@ -345,6 +375,20 @@ def ansi_to_html(text):
return COLOR_REGEX.sub(single_sub, text)
@enforce_types
def dedupe(options: List[str]) -> List[str]:
"""
Deduplicates the given options. Options that come later clobber earlier
conflicting options.
"""
deduped = {}
for option in options:
deduped[option.split('=')[0]] = option
return list(deduped.values())
class AttributeDict(dict):
"""Helper to allow accessing dict values via Example.key or Example['key']"""

6
archivebox/vendor/requirements.txt vendored Normal file
View file

@ -0,0 +1,6 @@
# this folder contains vendored versions of these packages
atomicwrites==1.4.0
pocket==0.3.7
django-taggit==1.3.0
base32-crockford==0.3.0

View file

@ -10,7 +10,7 @@ set -o nounset
set -o pipefail
IFS=$'\n'
REPO_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && cd .. && pwd )"
REPO_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" > /dev/null 2>&1 && cd .. && pwd )"
CURRENT_PLAFORM="$(uname)"
@ -45,7 +45,7 @@ pip3 uninstall yt-dlp || true
# brew untap archivebox/archivebox || true
echo
echo "[+] Installing and building hombrew bottle from https://Github.com/ArchiveBox/homebrew-archivebox#main"
echo "[+] Installing and building hombrew bottle from https://github.com/ArchiveBox/homebrew-archivebox#main"
brew tap archivebox/archivebox
brew install --build-bottle archivebox
brew bottle archivebox

View file

@ -31,6 +31,20 @@ else
echo "[!] Warning: No virtualenv presesnt in $REPO_DIR.venv"
fi
# Build python package lists
# https://pdm-project.org/latest/usage/lockfile/
echo "[+] Generating requirements.txt and pdm.lock from pyproject.toml..."
pdm lock --group=':all' --production --lockfile pdm.lock --strategy="cross_platform"
pdm sync --group=':all' --production --lockfile pdm.lock --clean || pdm sync --group=':all' --production --lockfile pdm.lock --clean
pdm export --group=':all' --production --lockfile pdm.lock --without-hashes -o requirements.txt
pdm lock --group=':all' --dev --lockfile pdm.dev.lock --strategy="cross_platform"
pdm sync --group=':all' --dev --lockfile pdm.dev.lock --clean || pdm sync --group=':all' --dev --lockfile pdm.dev.lock --clean
pdm export --group=':all' --dev --lockfile pdm.dev.lock --without-hashes -o requirements-dev.txt
# cleanup build artifacts
rm -Rf build deb_dist dist archivebox-*.tar.gz

View file

@ -21,6 +21,20 @@ VERSION="$(jq -r '.version' < "$REPO_DIR/package.json")"
SHORT_VERSION="$(echo "$VERSION" | perl -pe 's/(\d+)\.(\d+)\.(\d+)/$1.$2/g')"
REQUIRED_PLATFORMS="${2:-"linux/arm64,linux/amd64,linux/arm/v7"}"
# Build python package lists
# https://pdm-project.org/latest/usage/lockfile/
echo "[+] Generating requirements.txt and pdm.lock from pyproject.toml..."
pdm lock --group=':all' --production --lockfile pdm.lock --strategy="cross_platform"
pdm sync --group=':all' --production --lockfile pdm.lock --clean || pdm sync --group=':all' --production --lockfile pdm.lock --clean
pdm export --group=':all' --production --lockfile pdm.lock --without-hashes -o requirements.txt
pdm lock --group=':all' --dev --lockfile pdm.dev.lock --strategy="cross_platform"
pdm sync --group=':all' --dev --lockfile pdm.dev.lock --clean || pdm sync --group=':all' --dev --lockfile pdm.dev.lock --clean
pdm export --group=':all' --dev --lockfile pdm.dev.lock --without-hashes -o requirements-dev.txt
echo "[+] Building Docker image: tag=$TAG_NAME version=$SHORT_VERSION arch=$REQUIRED_PLATFORMS"
@ -32,4 +46,4 @@ docker build . --no-cache -t archivebox-dev --load
# -t archivebox \
# -t archivebox:$TAG_NAME \
# -t archivebox:$VERSION \
# -t archivebox:$SHORT_VERSION
# -t archivebox:$SHORT_VERSION

View file

@ -71,10 +71,8 @@ docker buildx use xbuilder 2>&1 >/dev/null || create_builder
check_platforms || (recreate_builder && check_platforms) || exit 1
# Build python package lists
echo "[+] Generating requirements.txt and pdm.lock from pyproject.toml..."
pdm lock --group=':all' --strategy="cross_platform" --production
pdm export --group=':all' --production --without-hashes -o requirements.txt
# Make sure pyproject.toml, pdm{.dev}.lock, requirements{-dev}.txt, package{-lock}.json are all up-to-date
bash ./bin/lock_pkgs.sh
echo "[+] Building archivebox:$VERSION docker image..."

View file

@ -20,20 +20,13 @@ else
fi
cd "$REPO_DIR"
echo "[*] Cleaning up build dirs"
cd "$REPO_DIR"
rm -Rf build dist
# Generate pdm.lock, requirements.txt, and package-lock.json
bash ./bin/lock_pkgs.sh
echo "[+] Building sdist, bdist_wheel, and egg_info"
rm -f archivebox/package.json
cp package.json archivebox/package.json
pdm self update
pdm install
rm -Rf build dist
pdm build
pdm export --without-hashes -o ./pip_dist/requirements.txt
cp dist/* ./pip_dist/
echo
echo "[√] Finished. Don't forget to commit the new sdist and wheel files in ./pip_dist/"
echo "[√] Finished. Don't forget to commit the new sdist and wheel files in ./pip_dist/"

View file

@ -35,7 +35,7 @@ export DEFAULT_PGID=911
if [[ "$PUID" == "0" ]]; then
echo -e "\n[X] Error: Got PUID=$PUID and PGID=$PGID but ArchiveBox is not allowed to be run as root, please change or unset PUID & PGID and try again." > /dev/stderr
echo -e " Hint: some NFS/SMB/FUSE/etc. filesystems force-remap/ignore all permissions," > /dev/stderr
echo -e " leave PUID/PGID unset, or use values the filesystem prefers (defaults to $DEFAULT_PUID:$DEFAULT_PGID)" > /dev/stderr
echo -e " leave PUID/PGID unset, disable root_squash, or use values the drive prefers (default is $DEFAULT_PUID:$DEFAULT_PGID)" > /dev/stderr
echo -e " https://linux.die.net/man/8/mount.cifs#:~:text=does%20not%20provide%20unix%20ownership" > /dev/stderr
exit 3
fi
@ -46,6 +46,7 @@ export DETECTED_PGID="$(stat -c '%g' "$DATA_DIR/logs/errors.log" 2>/dev/null ||
# If data directory exists but is owned by root, use defaults instead of root because root is not allowed
[[ "$DETECTED_PUID" == "0" ]] && export DETECTED_PUID="$DEFAULT_PUID"
# (PGID / DETECTED_PGID is allowed to be 0 though)
# Set archivebox user and group ids to desired PUID/PGID
usermod -o -u "${PUID:-$DETECTED_PUID}" "$ARCHIVEBOX_USER" > /dev/null 2>&1
@ -64,32 +65,42 @@ if [[ -d "$DATA_DIR/archive" ]]; then
# echo "[√] Permissions are correct"
else
# the only time this fails is if the host filesystem doesn't allow us to write as root (e.g. some NFS mapall/maproot problems, connection issues, drive disappeared, etc.)
echo -e "\n[X] Error: archivebox user (PUID=$PUID) is not able to write to your ./data dir (currently owned by $(stat -c '%u' "$DATA_DIR"):$(stat -c '%g' "$DATA_DIR")." >&2
echo -e "\n[X] Error: archivebox user (PUID=$PUID) is not able to write to your ./data/archive dir (currently owned by $(stat -c '%u' "$DATA_DIR/archive"):$(stat -c '%g' "$DATA_DIR/archive")." > /dev/stderr
echo -e " Change ./data to be owned by PUID=$PUID PGID=$PGID on the host and retry:" > /dev/stderr
echo -e " \$ chown -R $PUID:$PGID ./data\n" > /dev/stderr
echo -e " Configure the PUID & PGID environment variables to change the desired owner:" > /dev/stderr
echo -e " https://docs.linuxserver.io/general/understanding-puid-and-pgid\n" > /dev/stderr
echo -e " Hint: some NFS/SMB/FUSE/etc. filesystems force-remap/ignore all permissions," > /dev/stderr
echo -e " leave PUID/PGID unset, or use values the filesystem prefers (defaults to $DEFAULT_PUID:$DEFAULT_PGID)" > /dev/stderr
echo -e " leave PUID/PGID unset, disable root_squash, or use values the drive prefers (default is $DEFAULT_PUID:$DEFAULT_PGID)" > /dev/stderr
echo -e " https://linux.die.net/man/8/mount.cifs#:~:text=does%20not%20provide%20unix%20ownership" > /dev/stderr
exit 3
fi
else
# create data directory
# create data directory (and logs, since it's the first dir ArchiveBox needs to write to)
mkdir -p "$DATA_DIR/logs"
fi
# force set the ownership of the data dir contents to the archivebox user and group
# this is needed because Docker Desktop often does not map user permissions from the host properly
chown $PUID:$PGID "$DATA_DIR"
chown $PUID:$PGID "$DATA_DIR"/*
if ! chown $PUID:$PGID "$DATA_DIR"/* > /dev/null 2>&1; then
# users may store the ./data/archive folder on a network mount that prevents chmod/chown
# fallback to chowning everything else in ./data and leaving ./data/archive alone
find "$DATA_DIR" -type d -not -path "$DATA_DIR/archive*" -exec chown $PUID:$PGID {} \; > /dev/null 2>&1
find "$DATA_DIR" -type f -not -path "$DATA_DIR/archive/*" -exec chown $PUID:$PGID {} \; > /dev/null 2>&1
fi
# also chown BROWSERS_DIR because otherwise 'archivebox setup' won't be able to install chrome at runtime
# also chown BROWSERS_DIR because otherwise 'archivebox setup' won't be able to 'playwright install chromium' at runtime
export PLAYWRIGHT_BROWSERS_PATH="${PLAYWRIGHT_BROWSERS_PATH:-/browsers}"
mkdir -p "$PLAYWRIGHT_BROWSERS_PATH/permissions_test_safe_to_delete"
chown $PUID:$PGID "$PLAYWRIGHT_BROWSERS_PATH"
chown $PUID:$PGID "$PLAYWRIGHT_BROWSERS_PATH"/*
rm -Rf "$PLAYWRIGHT_BROWSERS_PATH/permissions_test_safe_to_delete"
chown $PUID:$PGID "$PLAYWRIGHT_BROWSERS_PATH"
if [[ -d "$PLAYWRIGHT_BROWSERS_PATH/.links" ]]; then
chown $PUID:$PGID "$PLAYWRIGHT_BROWSERS_PATH"/*
chown $PUID:$PGID "$PLAYWRIGHT_BROWSERS_PATH"/.*
chown -h $PUID:$PGID "$PLAYWRIGHT_BROWSERS_PATH"/.links/*
fi
# (this check is written in blood in 2023, QEMU silently breaks things in ways that are not obvious)
@ -100,7 +111,7 @@ if [[ "$IN_QEMU" == "True" ]]; then
echo -e " See here for more info: https://github.com/microsoft/playwright/issues/17395#issuecomment-1250830493\n" > /dev/stderr
fi
# check disk space free on / and /data, warn on <500Mb free, error on <100Mb free
# check disk space free on /, /data, and /data/archive, warn on <500Mb free, error on <100Mb free
export ROOT_USAGE="$(df --output=pcent,avail / | tail -n 1 | xargs)"
export ROOT_USED_PCT="${ROOT_USAGE%%%*}"
export ROOT_AVAIL_KB="$(echo "$ROOT_USAGE" | awk '{print $2}')"
@ -117,22 +128,58 @@ elif [[ "$ROOT_USED_PCT" -ge 99 ]] || [[ "$ROOT_AVAIL_KB" -lt 500000 ]]; then
df -kh / > /dev/stderr
fi
export DATA_USAGE="$(df --output=pcent,avail /data | tail -n 1 | xargs)"
export DATA_USAGE="$(df --output=pcent,avail "$DATA_DIR" | tail -n 1 | xargs)"
export DATA_USED_PCT="${DATA_USAGE%%%*}"
export DATA_AVAIL_KB="$(echo "$DATA_USAGE" | awk '{print $2}')"
if [[ "$DATA_AVAIL_KB" -lt 100000 ]]; then
echo -e "\n[!] Warning: Docker data volume is completely out of space! (${DATA_USED_PCT}% used on /data)" > /dev/stderr
echo -e "\n[!] Warning: Docker data volume is completely out of space! (${DATA_USED_PCT}% used on $DATA_DIR)" > /dev/stderr
echo -e " you need to free up at least 100Mb on the drive holding your data directory" > /dev/stderr
echo -e " \$ ncdu -x data\n" > /dev/stderr
df -kh /data > /dev/stderr
df -kh "$DATA_DIR" > /dev/stderr
sleep 5
elif [[ "$DATA_USED_PCT" -ge 99 ]] || [[ "$ROOT_AVAIL_KB" -lt 500000 ]]; then
echo -e "\n[!] Warning: Docker data volume is running out of space! (${DATA_USED_PCT}% used on /data)" > /dev/stderr
echo -e "\n[!] Warning: Docker data volume is running out of space! (${DATA_USED_PCT}% used on $DATA_DIR)" > /dev/stderr
echo -e " you may need to free up space on the drive holding your data directory soon" > /dev/stderr
echo -e " \$ ncdu -x data\n" > /dev/stderr
df -kh /data > /dev/stderr
df -kh "$DATA_DIR" > /dev/stderr
else
# data/ has space available, but check data/archive separately, because it might be on a network mount or external drive
if [[ -d "$DATA_DIR/archive" ]]; then
export ARCHIVE_USAGE="$(df --output=pcent,avail "$DATA_DIR/archive" | tail -n 1 | xargs)"
export ARCHIVE_USED_PCT="${ARCHIVE_USAGE%%%*}"
export ARCHIVE_AVAIL_KB="$(echo "$ARCHIVE_USAGE" | awk '{print $2}')"
if [[ "$ARCHIVE_AVAIL_KB" -lt 100000 ]]; then
echo -e "\n[!] Warning: data/archive folder is completely out of space! (${ARCHIVE_USED_PCT}% used on $DATA_DIR/archive)" > /dev/stderr
echo -e " you need to free up at least 100Mb on the drive holding your data/archive directory" > /dev/stderr
echo -e " \$ ncdu -x data/archive\n" > /dev/stderr
df -kh "$DATA_DIR/archive" > /dev/stderr
sleep 5
elif [[ "$ARCHIVE_USED_PCT" -ge 99 ]] || [[ "$ROOT_AVAIL_KB" -lt 500000 ]]; then
echo -e "\n[!] Warning: data/archive folder is running out of space! (${ARCHIVE_USED_PCT}% used on $DATA_DIR/archive)" > /dev/stderr
echo -e " you may need to free up space on the drive holding your data/archive directory soon" > /dev/stderr
echo -e " \$ ncdu -x data/archive\n" > /dev/stderr
df -kh "$DATA_DIR/archive" > /dev/stderr
fi
fi
fi
# symlink etc crontabs into place
mkdir -p "$DATA_DIR/crontabs"
if ! test -L /var/spool/cron/crontabs; then
# copy files from old location into new data dir location
for file in $(ls /var/spool/cron/crontabs); do
cp /var/spool/cron/crontabs/"$file" "$DATA_DIR/crontabs"
done
# replace old system path with symlink to data dir location
rm -Rf /var/spool/cron/crontabs
ln -s "$DATA_DIR/crontabs" /var/spool/cron/crontabs
fi
# set DBUS_SYSTEM_BUS_ADDRESS & DBUS_SESSION_BUS_ADDRESS
# (dbus is not actually needed, it makes chrome log fewer warnings but isn't worth making our docker images bigger)
# service dbus start >/dev/null 2>&1 &
# export $(dbus-launch --close-stderr)
export ARCHIVEBOX_BIN_PATH="$(which archivebox)"

100
bin/lock_pkgs.sh Executable file
View file

@ -0,0 +1,100 @@
#!/usr/bin/env bash
### Bash Environment Setup
# http://redsymbol.net/articles/unofficial-bash-strict-mode/
# https://www.gnu.org/software/bash/manual/html_node/The-Set-Builtin.html
# set -o xtrace
set -o errexit
set -o errtrace
set -o nounset
set -o pipefail
IFS=$'\n'
REPO_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && cd .. && pwd )"
cd "$REPO_DIR"
py_version="$(grep 'version = ' pyproject.toml | awk '{print $3}' | jq -r)"
js_version="$(jq -r '.version' package.json)"
if [[ "$py_version" != "$js_version" ]]; then
echo "[❌] Version in pyproject.toml ($py_version) does not match version in package.json ($js_version)!"
exit 1
fi
echo "[🔒] Locking all ArchiveBox dependencies (pip, npm)"
echo
echo "pyproject.toml: archivebox $py_version"
echo "package.json: archivebox $js_version"
echo
echo
echo "[*] Cleaning up old lockfiles and build files"
rm -Rf build dist
rm -f pdm.lock
rm -f pdm.dev.lock
rm -f requirements.txt
rm -f requirements-dev.txt
rm -f package-lock.json
rm -f archivebox/package.json
rm -f archivebox/package-lock.json
rm -Rf ./.venv
rm -Rf ./node_modules
rm -Rf ./archivebox/node_modules
echo
echo
echo "[+] Generating dev & prod requirements.txt & pdm.lock from pyproject.toml..."
pip install --upgrade pip setuptools
pdm self update
pdm venv create 3.12
echo
echo "pyproject.toml: archivebox $(grep 'version = ' pyproject.toml | awk '{print $3}' | jq -r)"
echo "$(which python): $(python --version | head -n 1)"
echo "$(which pdm): $(pdm --version | head -n 1)"
pdm info --env
pdm info
echo
# https://pdm-project.org/latest/usage/lockfile/
# prod
pdm lock --group=':all' --production --lockfile pdm.lock --strategy="cross_platform"
pdm sync --group=':all' --production --lockfile pdm.lock --clean
pdm export --group=':all' --production --lockfile pdm.lock --without-hashes -o requirements.txt
cp ./pdm.lock ./pip_dist/
cp ./requirements.txt ./pip_dist/
# dev
pdm lock --group=':all' --dev --lockfile pdm.dev.lock --strategy="cross_platform"
pdm sync --group=':all' --dev --lockfile pdm.dev.lock --clean
pdm export --group=':all' --dev --lockfile pdm.dev.lock --without-hashes -o requirements-dev.txt
cp ./pdm.dev.lock ./pip_dist/
cp ./requirements-dev.txt ./pip_dist/
echo
echo "[+]] Generating package-lock.json from package.json..."
npm install -g npm
echo
echo "package.json: archivebox $(jq -r '.version' package.json)"
echo
echo "$(which node): $(node --version | head -n 1)"
echo "$(which npm): $(npm --version | head -n 1)"
echo
npm install --package-lock-only
cp package.json archivebox/package.json
cp package-lock.json archivebox/package-lock.json
echo
echo "[√] Finished. Don't forget to commit the new lockfiles:"
echo
ls "pyproject.toml" | cat
ls "pdm.lock" | cat
ls "pdm.dev.lock" | cat
ls "requirements.txt" | cat
ls "requirements-dev.txt" | cat
echo
ls "package.json" | cat
ls "package-lock.json" | cat
ls "archivebox/package.json" | cat
ls "archivebox/package-lock.json" | cat

View file

@ -165,7 +165,7 @@ if ! (python3 --version && python3 -m pip --version && python3 -m django --versi
exit 1
fi
if ! (python3 -m django --version && which -a archivebox); then
if ! (python3 -m django --version && python3 -m pip show archivebox && which -a archivebox); then
echo "[X] Django and ArchiveBox were not found after installing!"
echo " Check to see if a previous step failed."
echo

View file

@ -14,4 +14,4 @@ DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && cd .. && pwd )"
source "$DIR/.venv/bin/activate"
pytest -s --basetemp=tests/out --ignore=archivebox/vendor --ignore=deb_dist --ignore=pip_dist --ignore=brew_dist
pytest -s --basetemp=tests/out "$@"

View file

@ -1,39 +1,31 @@
# Usage:
# docker compose run archivebox init --setup
# docker compose up
# echo "https://example.com" | docker compose run archivebox archivebox add
# docker compose run archivebox add --depth=1 https://example.com/some/feed.rss
# docker compose run archivebox config --set MEDIA_MAX_SIZE=750m
# echo 'https://example.com' | docker compose run -T archivebox add
# docker compose run archivebox add --depth=1 'https://news.ycombinator.com'
# docker compose run archivebox config --set SAVE_ARCHIVE_DOT_ORG=False
# docker compose run archivebox help
# Documentation:
# https://github.com/ArchiveBox/ArchiveBox/wiki/Docker#docker-compose
version: '3.9'
services:
archivebox:
#image: ${DOCKER_IMAGE:-archivebox/archivebox:dev}
image: archivebox/archivebox:dev
command: server --quick-init 0.0.0.0:8000
image: archivebox/archivebox:latest
ports:
- 8000:8000
volumes:
- ./data:/data
# - ./etc/crontabs:/var/spool/cron/crontabs # uncomment this and archivebox_scheduler below to set up automatic recurring archive jobs
# - ./archivebox:/app/archivebox # uncomment this to mount the ArchiveBox source code at runtime (for developers working on archivebox)
# build: . # uncomment this to build the image from source code at buildtime (for developers working on archivebox)
environment:
- ALLOWED_HOSTS=* # restrict this to only accept incoming traffic via specific domain name
# - PUBLIC_INDEX=True # set to False to prevent anonymous users from viewing snapshot list
# - PUBLIC_SNAPSHOTS=True # set to False to prevent anonymous users from viewing snapshot content
# - PUBLIC_ADD_VIEW=False # set to True to allow anonymous users to submit new URLs to archive
# - ADMIN_USERNAME=admin # create an admin user on first run with the given user/pass combo
# - ADMIN_PASSWORD=SomeSecretPassword
- ALLOWED_HOSTS=* # restrict this to only accept incoming traffic via specific domain name
- PUBLIC_INDEX=True # set to False to prevent anonymous users from viewing snapshot list
- PUBLIC_SNAPSHOTS=True # set to False to prevent anonymous users from viewing snapshot content
- PUBLIC_ADD_VIEW=False # set to True to allow anonymous users to submit new URLs to archive
- SEARCH_BACKEND_ENGINE=sonic # tells ArchiveBox to use sonic container below for fast full-text search
- SEARCH_BACKEND_HOST_NAME=sonic
- SEARCH_BACKEND_PASSWORD=SomeSecretPassword
# - PUID=911 # set to your host user's UID & GID if you encounter permissions issues
# - PGID=911
# - SEARCH_BACKEND_ENGINE=sonic # uncomment these and sonic container below for better full-text search
# - SEARCH_BACKEND_HOST_NAME=sonic
# - SEARCH_BACKEND_PASSWORD=SomeSecretPassword
# - PGID=911 # UID/GIDs <500 may clash with existing users and are not recommended
# - MEDIA_MAX_SIZE=750m # increase this filesize limit to allow archiving larger audio/video files
# - TIMEOUT=60 # increase this number to 120+ seconds if you see many slow downloads timing out
# - CHECK_SSL_VALIDITY=True # set to False to disable strict SSL checking (allows saving URLs w/ broken certs)
@ -41,8 +33,7 @@ services:
# ...
# add further configuration options from archivebox/config.py as needed (to apply them only to this container)
# or set using `docker compose run archivebox config --set SOME_KEY=someval` (to persist config across all containers)
# For ad-blocking during archiving, uncomment this section and pihole service section below
# For ad-blocking during archiving, uncomment this section and pihole service section below
# networks:
# - dns
# dns:
@ -51,29 +42,85 @@ services:
######## Optional Addons: tweak examples below as needed for your specific use case ########
### Example: To run the Sonic full-text search backend, first download the config file to sonic.cfg
# $ curl -O https://raw.githubusercontent.com/ArchiveBox/ArchiveBox/master/etc/sonic.cfg
# After starting, backfill any existing Snapshots into the full-text index:
### This optional container runs any scheduled tasks in the background, add new tasks like so:
# $ docker compose run archivebox schedule --add --every=day --depth=1 'https://example.com/some/rss/feed.xml'
# then restart the scheduler container to apply any changes to the scheduled task list:
# $ docker compose restart archivebox_scheduler
archivebox_scheduler:
image: archivebox/archivebox:latest
command: schedule --foreground --update --every=day
environment:
- TIMEOUT=120 # use a higher timeout than the main container to give slow tasks more time when retrying
# - PUID=502 # set to your host user's UID & GID if you encounter permissions issues
# - PGID=20
volumes:
- ./data:/data
# cpus: 2 # uncomment / edit these values to limit scheduler container resource consumption
# mem_limit: 2048m
# restart: always
### This runs the optional Sonic full-text search backend (much faster than default rg backend).
# If Sonic is ever started after not running for a while, update its full-text index by running:
# $ docker-compose run archivebox update --index-only
# sonic:
# image: valeriansaliou/sonic:latest
# expose:
# - 1491
# environment:
# - SEARCH_BACKEND_PASSWORD=SomeSecretPassword
# volumes:
# - ./sonic.cfg:/etc/sonic.cfg:ro
# - ./data/sonic:/var/lib/sonic/store
sonic:
image: valeriansaliou/sonic:latest
build:
# custom build just auto-downloads archivebox's default sonic.cfg as a convenience
# not needed after the first run / if you already have ./etc/sonic.cfg present
dockerfile_inline: |
FROM quay.io/curl/curl:latest AS config_downloader
RUN curl -fsSL 'https://raw.githubusercontent.com/ArchiveBox/ArchiveBox/main/etc/sonic.cfg' > /tmp/sonic.cfg
FROM valeriansaliou/sonic:latest
COPY --from=config_downloader /tmp/sonic.cfg /etc/sonic.cfg
expose:
- 1491
environment:
- SEARCH_BACKEND_PASSWORD=SomeSecretPassword
volumes:
- ./sonic.cfg:/etc/sonic.cfg
- ./data/sonic:/var/lib/sonic/store
### This container runs xvfb+noVNC so you can watch the ArchiveBox browser as it archives things,
# or remote control it to set up a chrome profile w/ login credentials for sites you want to archive.
# https://github.com/ArchiveBox/ArchiveBox/wiki/Chromium-Install#setting-up-a-chromium-user-profile
novnc:
image: theasp/novnc:latest
environment:
- DISPLAY_WIDTH=1920
- DISPLAY_HEIGHT=1080
- RUN_XTERM=no
ports:
# to view/control ArchiveBox's browser, visit: http://127.0.0.1:8080/vnc.html
# restricted to access from localhost by default because it has no authentication
- 127.0.0.1:8080:8080
### Example: Put Nginx in front of the ArchiveBox server for SSL termination and static file serving.
# You can also use any other ingress provider for SSL, like Apache, Caddy, Traefik, Cloudflare Tunnels, etc.
# nginx:
# image: nginx:alpine
# ports:
# - 443:443
# - 80:80
# volumes:
# - ./etc/nginx.conf:/etc/nginx/nginx.conf
# - ./data:/var/www
### Example: To run pihole in order to block ad/tracker requests during archiving,
# uncomment this block and set up pihole using its admin interface
# pihole:
# image: pihole/pihole:latest
# ports:
# - 127.0.0.1:8090:80 # uncomment to access the admin HTTP interface on http://localhost:8090
# # access the admin HTTP interface on http://localhost:8090
# - 127.0.0.1:8090:80
# environment:
# - WEBPASSWORD=SET_THIS_TO_SOME_SECRET_PASSWORD_FOR_ADMIN_DASHBOARD
# - DNSMASQ_LISTENING=all
@ -88,43 +135,8 @@ services:
# - ./etc/dnsmasq:/etc/dnsmasq.d
### Example: Enable ability to run regularly scheduled archiving tasks by uncommenting this container
# $ docker compose run archivebox schedule --every=day --depth=1 'https://example.com/some/rss/feed.xml'
# then restart the scheduler container to apply the changes to the schedule
# $ docker compose restart archivebox_scheduler
# archivebox_scheduler:
# image: ${DOCKER_IMAGE:-archivebox/archivebox:dev}
# command: schedule --foreground
# environment:
# - MEDIA_MAX_SIZE=750m # increase this number to allow archiving larger audio/video files
# # - TIMEOUT=60 # increase if you see timeouts often during archiving / on slow networks
# # - ONLY_NEW=True # set to False to retry previously failed URLs when re-adding instead of skipping them
# # - CHECK_SSL_VALIDITY=True # set to False to allow saving URLs w/ broken SSL certs
# # - SAVE_ARCHIVE_DOT_ORG=True # set to False to disable submitting URLs to Archive.org when archiving
# # - PUID=502 # set to your host user's UID & GID if you encounter permissions issues
# # - PGID=20
# volumes:
# - ./data:/data
# - ./etc/crontabs:/var/spool/cron/crontabs
# # cpus: 2 # uncomment / edit these values to limit container resource consumption
# # mem_limit: 2048m
# # shm_size: 1024m
### Example: Put Nginx in front of the ArchiveBox server for SSL termination
# nginx:
# image: nginx:alpine
# ports:
# - 443:443
# - 80:80
# volumes:
# - ./etc/nginx.conf:/etc/nginx/nginx.conf
# - ./data:/var/www
### Example: run all your ArchiveBox traffic through a WireGuard VPN tunnel
### Example: run all your ArchiveBox traffic through a WireGuard VPN tunnel to avoid IP blocks.
# You can also use any other VPN that works at the docker IP level, e.g. Tailscale, OpenVPN, etc.
# wireguard:
# image: linuxserver/wireguard:latest
@ -155,10 +167,30 @@ services:
networks:
# network needed for pihole container to offer :53 dns resolving on fixed ip for archivebox container
# network just used for pihole container to offer :53 dns resolving on fixed ip for archivebox container
dns:
ipam:
driver: default
config:
- subnet: 172.20.0.0/24
# To use remote storage for your ./data/archive (e.g. Amazon S3, Backblaze B2, Google Drive, OneDrive, SFTP, etc.)
# Follow the steps here to set up the Docker RClone Plugin https://rclone.org/docker/
# $ docker plugin install rclone/docker-volume-rclone:amd64 --grant-all-permissions --alias rclone
# $ nano /var/lib/docker-plugins/rclone/config/rclone.conf
# [examplegdrive]
# type = drive
# scope = drive
# drive_id = 1234567...
# root_folder_id = 0Abcd...
# token = {"access_token":...}
# volumes:
# archive:
# driver: rclone
# driver_opts:
# remote: 'examplegdrive:archivebox'
# allow_other: 'true'
# vfs_cache_mode: full
# poll_interval: 0

View file

@ -6,6 +6,7 @@
[server]
# log_level = "debug"
log_level = "warn"

482
package-lock.json generated
View file

@ -1,23 +1,33 @@
{
"name": "archivebox",
"version": "0.7.2",
"version": "0.8.0",
"lockfileVersion": 3,
"requires": true,
"packages": {
"": {
"name": "archivebox",
"version": "0.7.2",
"version": "0.8.0",
"license": "MIT",
"dependencies": {
"@postlight/parser": "^2.2.3",
"readability-extractor": "github:ArchiveBox/readability-extractor",
"single-file-cli": "^1.1.46"
"single-file-cli": "^1.1.54"
}
},
"node_modules/@asamuzakjp/dom-selector": {
"version": "2.0.2",
"resolved": "https://registry.npmjs.org/@asamuzakjp/dom-selector/-/dom-selector-2.0.2.tgz",
"integrity": "sha512-x1KXOatwofR6ZAYzXRBL5wrdV0vwNxlTCK9NCuLqAzQYARqGcvFwiJA6A1ERuh+dgeA4Dxm3JBYictIes+SqUQ==",
"dependencies": {
"bidi-js": "^1.0.3",
"css-tree": "^2.3.1",
"is-potential-custom-element-name": "^1.0.1"
}
},
"node_modules/@babel/runtime-corejs2": {
"version": "7.23.7",
"resolved": "https://registry.npmjs.org/@babel/runtime-corejs2/-/runtime-corejs2-7.23.7.tgz",
"integrity": "sha512-JmMk2t1zGDNkvsY2MsLLksocjY+ufGzSk8UlcNcxzfrzAPu4nMx0HRFakzIg2bhcqQq6xBI2nUaW/sHoaYIHdQ==",
"version": "7.24.4",
"resolved": "https://registry.npmjs.org/@babel/runtime-corejs2/-/runtime-corejs2-7.24.4.tgz",
"integrity": "sha512-ZCKqyUKt/Coimg+3Kafu43yNetgYnTXzNbEGAgxc81J5sI0qFNbQ613w7PNny+SmijAmGVroL0GDvx5rG/JI5Q==",
"dependencies": {
"core-js": "^2.6.12",
"regenerator-runtime": "^0.14.0"
@ -168,9 +178,9 @@
}
},
"node_modules/@puppeteer/browsers": {
"version": "1.8.0",
"resolved": "https://registry.npmjs.org/@puppeteer/browsers/-/browsers-1.8.0.tgz",
"integrity": "sha512-TkRHIV6k2D8OlUe8RtG+5jgOF/H98Myx0M6AOafC8DdNVOFiBSFa5cpRDtpm8LXOa9sVwe0+e6Q3FC56X/DZfg==",
"version": "2.0.0",
"resolved": "https://registry.npmjs.org/@puppeteer/browsers/-/browsers-2.0.0.tgz",
"integrity": "sha512-3PS82/5+tnpEaUWonjAFFvlf35QHF15xqyGd34GBa5oP5EPVfFXRsbSxIGYf1M+vZlqBZ3oxT1kRg9OYhtt8ng==",
"dependencies": {
"debug": "4.3.4",
"extract-zip": "2.0.1",
@ -184,7 +194,7 @@
"browsers": "lib/cjs/main-cli.js"
},
"engines": {
"node": ">=16.3.0"
"node": ">=18"
}
},
"node_modules/@tootallnate/quickjs-emscripten": {
@ -193,9 +203,9 @@
"integrity": "sha512-C5Mc6rdnsaJDjO3UpGW/CQTHtCKaYlScZTly4JIu97Jxo/odCiH0ITnDXSJPTOrEKk/ycSZ0AOgTmkDtkOsvIA=="
},
"node_modules/@types/node": {
"version": "20.10.6",
"resolved": "https://registry.npmjs.org/@types/node/-/node-20.10.6.tgz",
"integrity": "sha512-Vac8H+NlRNNlAmDfGUP7b5h/KA+AtWIzuXy0E6OyP8f1tCLYAtPvKRRDJjAPqhpCb0t6U2j7/xqAuLEebW2kiw==",
"version": "20.12.7",
"resolved": "https://registry.npmjs.org/@types/node/-/node-20.12.7.tgz",
"integrity": "sha512-wq0cICSkRLVaf3UGLMGItu/PtdY7oaXaI/RVU+xliKVOtRna3PRY57ZDfztpDL0n11vfymMUnXv8QwYCO7L1wg==",
"optional": true,
"dependencies": {
"undici-types": "~5.26.4"
@ -211,9 +221,9 @@
}
},
"node_modules/agent-base": {
"version": "7.1.0",
"resolved": "https://registry.npmjs.org/agent-base/-/agent-base-7.1.0.tgz",
"integrity": "sha512-o/zjMZRhJxny7OyEF+Op8X+efiELC7k7yOjMzgfzVqOzXqkBkWI79YoTdOtsuWd5BWhAGAuOY/Xa6xpiaWXiNg==",
"version": "7.1.1",
"resolved": "https://registry.npmjs.org/agent-base/-/agent-base-7.1.1.tgz",
"integrity": "sha512-H0TSyFNDMomMNJQBn8wFV5YC/2eJ+VXECwOadZJT554xP6cODZHPX3H9QMQECxvrgiSOP1pHjy1sMWQVYJOUOA==",
"dependencies": {
"debug": "^4.3.4"
},
@ -304,14 +314,15 @@
"integrity": "sha512-NmWvPnx0F1SfrQbYwOi7OeaNGokp9XhzNioJ/CSBs8Qa4vxug81mhJEAVZwxXuBmYB5KDRfMq/F3RR0BIU7sWg=="
},
"node_modules/b4a": {
"version": "1.6.4",
"resolved": "https://registry.npmjs.org/b4a/-/b4a-1.6.4.tgz",
"integrity": "sha512-fpWrvyVHEKyeEvbKZTVOeZF3VSKKWtJxFIxX/jaVPf+cLbGUSitjb49pHLqPV2BUNNZ0LcoeEGfE/YCpyDYHIw=="
"version": "1.6.6",
"resolved": "https://registry.npmjs.org/b4a/-/b4a-1.6.6.tgz",
"integrity": "sha512-5Tk1HLk6b6ctmjIkAcU/Ujv/1WqiDl0F0JdRCR80VsOcUlHcu7pWeWRlOqQLHfDEsVx9YH/aif5AG4ehoCtTmg=="
},
"node_modules/balanced-match": {
"version": "1.0.2",
"resolved": "https://registry.npmjs.org/balanced-match/-/balanced-match-1.0.2.tgz",
"integrity": "sha512-3oSeUO0TMV67hN1AmbXsK4yaqU7tjiHlbxRDZOpH0KW9+CeX4bRAaX0Anxt0tx2MrpRpWwQaPwIlISEJhYU5Pw=="
"node_modules/bare-events": {
"version": "2.2.2",
"resolved": "https://registry.npmjs.org/bare-events/-/bare-events-2.2.2.tgz",
"integrity": "sha512-h7z00dWdG0PYOQEvChhOSWvOfkIKsdZGkWr083FgN/HyoQuebSew/cgirYqh9SCuy/hRvxc5Vy6Fw8xAmYHLkQ==",
"optional": true
},
"node_modules/base64-js": {
"version": "1.5.1",
@ -333,9 +344,9 @@
]
},
"node_modules/basic-ftp": {
"version": "5.0.4",
"resolved": "https://registry.npmjs.org/basic-ftp/-/basic-ftp-5.0.4.tgz",
"integrity": "sha512-8PzkB0arJFV4jJWSGOYR+OEic6aeKMu/osRhBULN6RY0ykby6LKhbmuQ5ublvaas5BOwboah5D87nrHyuh8PPA==",
"version": "5.0.5",
"resolved": "https://registry.npmjs.org/basic-ftp/-/basic-ftp-5.0.5.tgz",
"integrity": "sha512-4Bcg1P8xhUuqcii/S0Z9wiHIrQVPMermM1any+MX5GeGD7faD3/msQUDGLol9wOcz4/jbg/WJnGqoJF6LiBdtg==",
"engines": {
"node": ">=10.0.0"
}
@ -348,6 +359,14 @@
"tweetnacl": "^0.14.3"
}
},
"node_modules/bidi-js": {
"version": "1.0.3",
"resolved": "https://registry.npmjs.org/bidi-js/-/bidi-js-1.0.3.tgz",
"integrity": "sha512-RKshQI1R3YQ+n9YJz2QQ147P66ELpa1FQEg20Dk8oW9t2KgLbpDLLp9aGZ7y8WHSshDknG0bknqGw5/tyCs5tw==",
"dependencies": {
"require-from-string": "^2.0.2"
}
},
"node_modules/bluebird": {
"version": "2.11.0",
"resolved": "https://registry.npmjs.org/bluebird/-/bluebird-2.11.0.tgz",
@ -358,15 +377,6 @@
"resolved": "https://registry.npmjs.org/boolbase/-/boolbase-1.0.0.tgz",
"integrity": "sha512-JZOSA7Mo9sNGB8+UjSgzdLtokWAky1zbztM3WRLCbZ70/3cTANmQmOdR7y2g+J0e2WXywy1yS468tY+IruqEww=="
},
"node_modules/brace-expansion": {
"version": "1.1.11",
"resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-1.1.11.tgz",
"integrity": "sha512-iCuPHDFgrHX7H2vEI/5xpz07zSHB00TpugqhmYtVmMO6518mCuRMoOYFldEBl0g187ufozdaHgWKcYFb61qGiA==",
"dependencies": {
"balanced-match": "^1.0.0",
"concat-map": "0.0.1"
}
},
"node_modules/brotli": {
"version": "1.3.3",
"resolved": "https://registry.npmjs.org/brotli/-/brotli-1.3.3.tgz",
@ -446,12 +456,12 @@
}
},
"node_modules/chromium-bidi": {
"version": "0.4.33",
"resolved": "https://registry.npmjs.org/chromium-bidi/-/chromium-bidi-0.4.33.tgz",
"integrity": "sha512-IxoFM5WGQOIAd95qrSXzJUv4eXIrh+RvU3rwwqIiwYuvfE7U/Llj4fejbsJnjJMUYCuGtVQsY2gv7oGl4aTNSQ==",
"version": "0.5.8",
"resolved": "https://registry.npmjs.org/chromium-bidi/-/chromium-bidi-0.5.8.tgz",
"integrity": "sha512-blqh+1cEQbHBKmok3rVJkBlBxt9beKBgOsxbFgs7UJcoVbbeZ+K7+6liAsjgpc8l1Xd55cQUy14fXZdGSb4zIw==",
"dependencies": {
"mitt": "3.0.1",
"urlpattern-polyfill": "9.0.0"
"urlpattern-polyfill": "10.0.0"
},
"peerDependencies": {
"devtools-protocol": "*"
@ -497,11 +507,6 @@
"node": ">= 0.8"
}
},
"node_modules/concat-map": {
"version": "0.0.1",
"resolved": "https://registry.npmjs.org/concat-map/-/concat-map-0.0.1.tgz",
"integrity": "sha512-/Srv4dswyQNBfohGpz9o6Yb3Gz3SrUDqBH5rTuhGR7ahtlbYKnVxw2bCFMRljaA7EXHaXZ8wsHdodFvbkhKmqg=="
},
"node_modules/core-js": {
"version": "2.6.12",
"resolved": "https://registry.npmjs.org/core-js/-/core-js-2.6.12.tgz",
@ -533,6 +538,18 @@
"nth-check": "~1.0.1"
}
},
"node_modules/css-tree": {
"version": "2.3.1",
"resolved": "https://registry.npmjs.org/css-tree/-/css-tree-2.3.1.tgz",
"integrity": "sha512-6Fv1DV/TYw//QF5IzQdqsNDjx/wc8TrMBZsqjL9eW01tWb7R7k/mq+/VXfJCl7SoD5emsJop9cOByJZfs8hYIw==",
"dependencies": {
"mdn-data": "2.0.30",
"source-map-js": "^1.0.1"
},
"engines": {
"node": "^10 || ^12.20.0 || ^14.13.0 || >=15.0.0"
}
},
"node_modules/css-what": {
"version": "2.1.3",
"resolved": "https://registry.npmjs.org/css-what/-/css-what-2.1.3.tgz",
@ -542,14 +559,14 @@
}
},
"node_modules/cssstyle": {
"version": "3.0.0",
"resolved": "https://registry.npmjs.org/cssstyle/-/cssstyle-3.0.0.tgz",
"integrity": "sha512-N4u2ABATi3Qplzf0hWbVCdjenim8F3ojEXpBDF5hBpjzW182MjNGLqfmQ0SkSPeQ+V86ZXgeH8aXj6kayd4jgg==",
"version": "4.0.1",
"resolved": "https://registry.npmjs.org/cssstyle/-/cssstyle-4.0.1.tgz",
"integrity": "sha512-8ZYiJ3A/3OkDd093CBT/0UKDWry7ak4BdPTFP2+QEP7cmhouyq/Up709ASSj2cK02BbZiMgk7kYjZNS4QP5qrQ==",
"dependencies": {
"rrweb-cssom": "^0.6.0"
},
"engines": {
"node": ">=14"
"node": ">=18"
}
},
"node_modules/dashdash": {
@ -564,9 +581,9 @@
}
},
"node_modules/data-uri-to-buffer": {
"version": "6.0.1",
"resolved": "https://registry.npmjs.org/data-uri-to-buffer/-/data-uri-to-buffer-6.0.1.tgz",
"integrity": "sha512-MZd3VlchQkp8rdend6vrx7MmVDJzSNTBvghvKjirLkD+WTChA3KUf0jkE68Q4UyctNqI11zZO9/x2Yx+ub5Cvg==",
"version": "6.0.2",
"resolved": "https://registry.npmjs.org/data-uri-to-buffer/-/data-uri-to-buffer-6.0.2.tgz",
"integrity": "sha512-7hvf7/GW8e86rW0ptuwS3OcBGDjIi6SZva7hCyWC0yYry2cOPmLIjXAUHI6DK2HsnwJd9ifmt57i8eV2n4YNpw==",
"engines": {
"node": ">= 14"
}
@ -657,9 +674,9 @@
}
},
"node_modules/devtools-protocol": {
"version": "0.0.1203626",
"resolved": "https://registry.npmjs.org/devtools-protocol/-/devtools-protocol-0.0.1203626.tgz",
"integrity": "sha512-nEzHZteIUZfGCZtTiS1fRpC8UZmsfD1SiyPvaUNvS13dvKf666OAm8YTi0+Ca3n1nLEyu49Cy4+dPWpaHFJk9g=="
"version": "0.0.1232444",
"resolved": "https://registry.npmjs.org/devtools-protocol/-/devtools-protocol-0.0.1232444.tgz",
"integrity": "sha512-pM27vqEfxSxRkTMnF+XCmxSEb6duO5R+t8A9DEEJgy4Wz2RVanje2mmj99B6A3zv2r/qGfYlOvYznUhuokizmg=="
},
"node_modules/difflib": {
"version": "0.2.6",
@ -696,9 +713,9 @@
"integrity": "sha512-3VdM/SXBZX2omc9JF9nOPCtDaYQ67BGp5CoLpIQlO2KCAPETs8TcDHacF26jXadGbvUteZzRTeos2fhID5+ucQ=="
},
"node_modules/dompurify": {
"version": "3.0.7",
"resolved": "https://registry.npmjs.org/dompurify/-/dompurify-3.0.7.tgz",
"integrity": "sha512-BViYTZoqP3ak/ULKOc101y+CtHDUvBsVgSxIF1ku0HmK6BRf+C03MC+tArMvOPtVtZp83DDh5puywKDu4sbVjQ=="
"version": "3.1.0",
"resolved": "https://registry.npmjs.org/dompurify/-/dompurify-3.1.0.tgz",
"integrity": "sha512-yoU4rhgPKCo+p5UrWWWNKiIq+ToGqmVVhk0PmMYBK4kRsR3/qhemNFL8f6CFmBd4gMwm3F4T7HBoydP5uY07fA=="
},
"node_modules/domutils": {
"version": "1.5.1",
@ -726,6 +743,11 @@
"safer-buffer": "^2.1.0"
}
},
"node_modules/ecc-jsbn/node_modules/jsbn": {
"version": "0.1.1",
"resolved": "https://registry.npmjs.org/jsbn/-/jsbn-0.1.1.tgz",
"integrity": "sha512-UVU9dibq2JcFWxQPA6KCqj5O42VOmAY3zQUfEKxU0KpTGXwNoCjkX1e13eHNvw/xPynt6pU0rZ1htjWTNTSXsg=="
},
"node_modules/ellipsize": {
"version": "0.1.0",
"resolved": "https://registry.npmjs.org/ellipsize/-/ellipsize-0.1.0.tgz",
@ -750,9 +772,9 @@
"integrity": "sha512-f2LZMYl1Fzu7YSBKg+RoROelpOaNrcGmE9AZubeDfrCEia483oW4MI4VyFd5VNHIgQ/7qm1I0wUHK1eJnn2y2w=="
},
"node_modules/escalade": {
"version": "3.1.1",
"resolved": "https://registry.npmjs.org/escalade/-/escalade-3.1.1.tgz",
"integrity": "sha512-k0er2gUkLf8O0zKJiAhmkTnJlTvINGv7ygDNPbeIsX/TJjGJZHuh9B2UxbsaEkmlEo9MfhrSzmhIlhRlI2GXnw==",
"version": "3.1.2",
"resolved": "https://registry.npmjs.org/escalade/-/escalade-3.1.2.tgz",
"integrity": "sha512-ErCHMCae19vR8vQGe50xIsVomy19rg6gFu3+r3jkEO46suLMWBksvVyoGgQV+jOfl84ZSOSlmv6Gxa89PmTGmA==",
"engines": {
"node": ">=6"
}
@ -890,31 +912,26 @@
}
},
"node_modules/fs-extra": {
"version": "8.1.0",
"resolved": "https://registry.npmjs.org/fs-extra/-/fs-extra-8.1.0.tgz",
"integrity": "sha512-yhlQgA6mnOJUKOsRUFsgJdQCvkKhcz8tlZG5HBQfReYZy46OwLcY+Zia0mtdHsOo9y/hP+CxMN0TU9QxoOtG4g==",
"version": "11.2.0",
"resolved": "https://registry.npmjs.org/fs-extra/-/fs-extra-11.2.0.tgz",
"integrity": "sha512-PmDi3uwK5nFuXh7XDTlVnS17xJS7vW36is2+w3xcv8SVxiB4NyATf4ctkVY5bkSjX0Y4nbvZCq1/EjtEyr9ktw==",
"dependencies": {
"graceful-fs": "^4.2.0",
"jsonfile": "^4.0.0",
"universalify": "^0.1.0"
"jsonfile": "^6.0.1",
"universalify": "^2.0.0"
},
"engines": {
"node": ">=6 <7 || >=8"
"node": ">=14.14"
}
},
"node_modules/fs-extra/node_modules/universalify": {
"version": "0.1.2",
"resolved": "https://registry.npmjs.org/universalify/-/universalify-0.1.2.tgz",
"integrity": "sha512-rBJeI5CXAlmy1pV+617WB9J63U6XcazHHF2f2dbJix4XzpUF0RS3Zbj0FGIOCAva5P/d/GBOYaACQ1w+0azUkg==",
"version": "2.0.1",
"resolved": "https://registry.npmjs.org/universalify/-/universalify-2.0.1.tgz",
"integrity": "sha512-gptHNQghINnc/vTGIk0SOFGFNXw7JVrlRUtConJRlvaw6DuX0wO5Jeko9sWrMBhh+PsYAZ7oXAiOnf/UKogyiw==",
"engines": {
"node": ">= 4.0.0"
"node": ">= 10.0.0"
}
},
"node_modules/fs.realpath": {
"version": "1.0.0",
"resolved": "https://registry.npmjs.org/fs.realpath/-/fs.realpath-1.0.0.tgz",
"integrity": "sha512-OO0pH2lK6a0hZnAdau5ItzHPI6pUlvI7jMVnxUQRtw4owF2wk8lOSabtGDCTP4Ggrg2MbGnWO9X8K1t4+fGMDw=="
},
"node_modules/get-caller-file": {
"version": "2.0.5",
"resolved": "https://registry.npmjs.org/get-caller-file/-/get-caller-file-2.0.5.tgz",
@ -938,14 +955,14 @@
}
},
"node_modules/get-uri": {
"version": "6.0.2",
"resolved": "https://registry.npmjs.org/get-uri/-/get-uri-6.0.2.tgz",
"integrity": "sha512-5KLucCJobh8vBY1K07EFV4+cPZH3mrV9YeAruUseCQKHB58SGjjT2l9/eA9LD082IiuMjSlFJEcdJ27TXvbZNw==",
"version": "6.0.3",
"resolved": "https://registry.npmjs.org/get-uri/-/get-uri-6.0.3.tgz",
"integrity": "sha512-BzUrJBS9EcUb4cFol8r4W3v1cPsSyajLSthNkz5BxbpDcHN5tIrM10E2eNvfnvBn3DaT3DUgx0OpsBKkaOpanw==",
"dependencies": {
"basic-ftp": "^5.0.2",
"data-uri-to-buffer": "^6.0.0",
"data-uri-to-buffer": "^6.0.2",
"debug": "^4.3.4",
"fs-extra": "^8.1.0"
"fs-extra": "^11.2.0"
},
"engines": {
"node": ">= 14"
@ -959,25 +976,6 @@
"assert-plus": "^1.0.0"
}
},
"node_modules/glob": {
"version": "7.2.3",
"resolved": "https://registry.npmjs.org/glob/-/glob-7.2.3.tgz",
"integrity": "sha512-nFR0zLpU2YCaRxwoCJvL6UvCH2JFyFVIvwTLsIf21AuHlMskA1hhTdk+LlYJtOlYt9v6dvszD2BGRqBL+iQK9Q==",
"dependencies": {
"fs.realpath": "^1.0.0",
"inflight": "^1.0.4",
"inherits": "2",
"minimatch": "^3.1.1",
"once": "^1.3.0",
"path-is-absolute": "^1.0.0"
},
"engines": {
"node": "*"
},
"funding": {
"url": "https://github.com/sponsors/isaacs"
}
},
"node_modules/graceful-fs": {
"version": "4.2.11",
"resolved": "https://registry.npmjs.org/graceful-fs/-/graceful-fs-4.2.11.tgz",
@ -1034,9 +1032,9 @@
}
},
"node_modules/http-proxy-agent": {
"version": "7.0.0",
"resolved": "https://registry.npmjs.org/http-proxy-agent/-/http-proxy-agent-7.0.0.tgz",
"integrity": "sha512-+ZT+iBxVUQ1asugqnD6oWoRiS25AkjNfG085dKJGtGxkdwLQrMKU5wJr2bOOFAXzKcTuqq+7fZlTMgG3SRfIYQ==",
"version": "7.0.2",
"resolved": "https://registry.npmjs.org/http-proxy-agent/-/http-proxy-agent-7.0.2.tgz",
"integrity": "sha512-T1gkAiYYDWYx3V5Bmyu7HcfcvL7mUrTWiM6yOfa3PIphViJ/gFPbvidQ+veqSOHci/PxBcDabeUNCzpOODJZig==",
"dependencies": {
"agent-base": "^7.1.0",
"debug": "^4.3.4"
@ -1059,9 +1057,9 @@
}
},
"node_modules/https-proxy-agent": {
"version": "7.0.2",
"resolved": "https://registry.npmjs.org/https-proxy-agent/-/https-proxy-agent-7.0.2.tgz",
"integrity": "sha512-NmLNjm6ucYwtcUmL7JQC1ZQ57LmHP4lT15FQ8D61nak1rO6DH+fz5qNK2Ap5UN4ZapYICE3/0KodcLYSPsPbaA==",
"version": "7.0.4",
"resolved": "https://registry.npmjs.org/https-proxy-agent/-/https-proxy-agent-7.0.4.tgz",
"integrity": "sha512-wlwpilI7YdjSkWaQ/7omYBMTliDcmCN8OLihO6I9B86g06lMyAoqgoDpV0XqoaPOKj+0DIdAvnsWfyAAhmimcg==",
"dependencies": {
"agent-base": "^7.0.2",
"debug": "4"
@ -1105,24 +1103,22 @@
"resolved": "https://registry.npmjs.org/immediate/-/immediate-3.0.6.tgz",
"integrity": "sha512-XXOFtyqDjNDAQxVfYxuF7g9Il/IbWmmlQg2MYKOH8ExIT1qg6xc4zyS3HaEEATgs1btfzxq15ciUiY7gjSXRGQ=="
},
"node_modules/inflight": {
"version": "1.0.6",
"resolved": "https://registry.npmjs.org/inflight/-/inflight-1.0.6.tgz",
"integrity": "sha512-k92I/b08q4wvFscXCLvqfsHCrjrF7yiXsQuIVvVE7N82W3+aqpzuUdBbfhWcy/FZR3/4IgflMgKLOsvPDrGCJA==",
"dependencies": {
"once": "^1.3.0",
"wrappy": "1"
}
},
"node_modules/inherits": {
"version": "2.0.4",
"resolved": "https://registry.npmjs.org/inherits/-/inherits-2.0.4.tgz",
"integrity": "sha512-k/vGaX4/Yla3WzyMCvTQOXYeIHvqOKtnqBduzTHpzpQZzAskKMhZ2K+EnBiSM9zGSoIFeMpXKxa4dYeZIQqewQ=="
},
"node_modules/ip": {
"version": "1.1.8",
"resolved": "https://registry.npmjs.org/ip/-/ip-1.1.8.tgz",
"integrity": "sha512-PuExPYUiu6qMBQb4l06ecm6T6ujzhmh+MeJcW9wa89PoAz5pvd4zPgN5WJV104mb6S2T1AwNIAaB70JNrLQWhg=="
"node_modules/ip-address": {
"version": "9.0.5",
"resolved": "https://registry.npmjs.org/ip-address/-/ip-address-9.0.5.tgz",
"integrity": "sha512-zHtQzGojZXTwZTHQqra+ETKd4Sn3vgi7uBmlPoXVWZqYvuKmtI0l/VZTjqGmJY9x88GGOaZ9+G9ES8hC4T4X8g==",
"dependencies": {
"jsbn": "1.1.0",
"sprintf-js": "^1.1.3"
},
"engines": {
"node": ">= 12"
}
},
"node_modules/is-fullwidth-code-point": {
"version": "3.0.0",
@ -1153,16 +1149,17 @@
"integrity": "sha512-Yljz7ffyPbrLpLngrMtZ7NduUgVvi6wG9RJ9IUcyCd59YQ911PBJphODUcbOVbqYfxe1wuYf/LJ8PauMRwsM/g=="
},
"node_modules/jsbn": {
"version": "0.1.1",
"resolved": "https://registry.npmjs.org/jsbn/-/jsbn-0.1.1.tgz",
"integrity": "sha512-UVU9dibq2JcFWxQPA6KCqj5O42VOmAY3zQUfEKxU0KpTGXwNoCjkX1e13eHNvw/xPynt6pU0rZ1htjWTNTSXsg=="
"version": "1.1.0",
"resolved": "https://registry.npmjs.org/jsbn/-/jsbn-1.1.0.tgz",
"integrity": "sha512-4bYVV3aAMtDTTu4+xsDYa6sy9GyJ69/amsu9sYF2zqjiEoZA5xJi3BrfX3uY+/IekIu7MwdObdbDWpoZdBv3/A=="
},
"node_modules/jsdom": {
"version": "23.0.1",
"resolved": "https://registry.npmjs.org/jsdom/-/jsdom-23.0.1.tgz",
"integrity": "sha512-2i27vgvlUsGEBO9+/kJQRbtqtm+191b5zAZrU/UezVmnC2dlDAFLgDYJvAEi94T4kjsRKkezEtLQTgsNEsW2lQ==",
"version": "23.2.0",
"resolved": "https://registry.npmjs.org/jsdom/-/jsdom-23.2.0.tgz",
"integrity": "sha512-L88oL7D/8ufIES+Zjz7v0aes+oBMh2Xnh3ygWvL0OaICOomKEPKuPnIfBJekiXr+BHbbMjrWn/xqrDQuxFTeyA==",
"dependencies": {
"cssstyle": "^3.0.0",
"@asamuzakjp/dom-selector": "^2.0.1",
"cssstyle": "^4.0.1",
"data-urls": "^5.0.0",
"decimal.js": "^10.4.3",
"form-data": "^4.0.0",
@ -1170,7 +1167,6 @@
"http-proxy-agent": "^7.0.0",
"https-proxy-agent": "^7.0.2",
"is-potential-custom-element-name": "^1.0.1",
"nwsapi": "^2.2.7",
"parse5": "^7.1.2",
"rrweb-cssom": "^0.6.0",
"saxes": "^6.0.0",
@ -1181,7 +1177,7 @@
"whatwg-encoding": "^3.1.1",
"whatwg-mimetype": "^4.0.0",
"whatwg-url": "^14.0.0",
"ws": "^8.14.2",
"ws": "^8.16.0",
"xml-name-validator": "^5.0.0"
},
"engines": {
@ -1235,13 +1231,24 @@
"integrity": "sha512-ZClg6AaYvamvYEE82d3Iyd3vSSIjQ+odgjaTzRuO3s7toCdFKczob2i0zCh7JE8kWn17yvAWhUVxvqGwUalsRA=="
},
"node_modules/jsonfile": {
"version": "4.0.0",
"resolved": "https://registry.npmjs.org/jsonfile/-/jsonfile-4.0.0.tgz",
"integrity": "sha512-m6F1R3z8jjlf2imQHS2Qez5sjKWQzbuuhuJ/FKYFRZvPE3PuHcSMVZzfsLhGVOkfd20obL5SWEBew5ShlquNxg==",
"version": "6.1.0",
"resolved": "https://registry.npmjs.org/jsonfile/-/jsonfile-6.1.0.tgz",
"integrity": "sha512-5dgndWOriYSm5cnYaJNhalLNDKOqFwyDB/rr1E9ZsGciGvKPs8R2xYGCacuf3z6K1YKDz182fd+fY3cn3pMqXQ==",
"dependencies": {
"universalify": "^2.0.0"
},
"optionalDependencies": {
"graceful-fs": "^4.1.6"
}
},
"node_modules/jsonfile/node_modules/universalify": {
"version": "2.0.1",
"resolved": "https://registry.npmjs.org/universalify/-/universalify-2.0.1.tgz",
"integrity": "sha512-gptHNQghINnc/vTGIk0SOFGFNXw7JVrlRUtConJRlvaw6DuX0wO5Jeko9sWrMBhh+PsYAZ7oXAiOnf/UKogyiw==",
"engines": {
"node": ">= 10.0.0"
}
},
"node_modules/jsprim": {
"version": "2.0.2",
"resolved": "https://registry.npmjs.org/jsprim/-/jsprim-2.0.2.tgz",
@ -1375,6 +1382,11 @@
"node": ">=12"
}
},
"node_modules/mdn-data": {
"version": "2.0.30",
"resolved": "https://registry.npmjs.org/mdn-data/-/mdn-data-2.0.30.tgz",
"integrity": "sha512-GaqWWShW4kv/G9IEucWScBx9G1/vsFZZJUO+tD26M8J8z3Kw5RDQjaoZe03YAClgeS/SWPOcb4nkFBTEi5DUEA=="
},
"node_modules/mime-db": {
"version": "1.52.0",
"resolved": "https://registry.npmjs.org/mime-db/-/mime-db-1.52.0.tgz",
@ -1394,17 +1406,6 @@
"node": ">= 0.6"
}
},
"node_modules/minimatch": {
"version": "3.1.2",
"resolved": "https://registry.npmjs.org/minimatch/-/minimatch-3.1.2.tgz",
"integrity": "sha512-J7p63hRiAjw1NDEww1W7i37+ByIrOWO5XQQAzZ3VOcL0PNybwpfmV/N05zFAzwQ9USyEcX6t3UO+K5aqBQOIHw==",
"dependencies": {
"brace-expansion": "^1.1.7"
},
"engines": {
"node": "*"
}
},
"node_modules/mitt": {
"version": "3.0.1",
"resolved": "https://registry.npmjs.org/mitt/-/mitt-3.0.1.tgz",
@ -1461,9 +1462,9 @@
}
},
"node_modules/nwsapi": {
"version": "2.2.7",
"resolved": "https://registry.npmjs.org/nwsapi/-/nwsapi-2.2.7.tgz",
"integrity": "sha512-ub5E4+FBPKwAZx0UwIQOjYWGHTEq5sPqHQNRN8Z9e4A7u3Tj1weLJsL59yH9vmvqEtBHaOmT6cYQKIZOxp35FQ=="
"version": "2.2.9",
"resolved": "https://registry.npmjs.org/nwsapi/-/nwsapi-2.2.9.tgz",
"integrity": "sha512-2f3F0SEEer8bBu0dsNCFF50N0cTThV1nWFYcEYFZttdW0lDAoybv9cQoK7X7/68Z89S7FoRrVjP1LPX4XRf9vg=="
},
"node_modules/oauth-sign": {
"version": "0.9.0",
@ -1500,12 +1501,11 @@
}
},
"node_modules/pac-resolver": {
"version": "7.0.0",
"resolved": "https://registry.npmjs.org/pac-resolver/-/pac-resolver-7.0.0.tgz",
"integrity": "sha512-Fd9lT9vJbHYRACT8OhCbZBbxr6KRSawSovFpy8nDGshaK99S/EBhVIHp9+crhxrsZOuvLpgL1n23iyPg6Rl2hg==",
"version": "7.0.1",
"resolved": "https://registry.npmjs.org/pac-resolver/-/pac-resolver-7.0.1.tgz",
"integrity": "sha512-5NPgf87AT2STgwa2ntRMr45jTKrYBGkVU36yT0ig/n/GMAa3oPqhZfIQ2kMEimReg0+t9kZViDVZ83qfVUlckg==",
"dependencies": {
"degenerator": "^5.0.0",
"ip": "^1.1.8",
"netmask": "^2.0.2"
},
"engines": {
@ -1539,14 +1539,6 @@
"url": "https://github.com/fb55/entities?sponsor=1"
}
},
"node_modules/path-is-absolute": {
"version": "1.0.1",
"resolved": "https://registry.npmjs.org/path-is-absolute/-/path-is-absolute-1.0.1.tgz",
"integrity": "sha512-AVbw3UJ2e9bq64vSaS9Am0fje1Pa8pbGqTTsmXfaIiMpnr5DlDhfJOuLj9Sf95ZPVDAUerDfEk88MPmPe7UCQg==",
"engines": {
"node": ">=0.10.0"
}
},
"node_modules/pend": {
"version": "1.2.0",
"resolved": "https://registry.npmjs.org/pend/-/pend-1.2.0.tgz",
@ -1648,39 +1640,19 @@
}
},
"node_modules/puppeteer-core": {
"version": "21.5.2",
"resolved": "https://registry.npmjs.org/puppeteer-core/-/puppeteer-core-21.5.2.tgz",
"integrity": "sha512-v4T0cWnujSKs+iEfmb8ccd7u4/x8oblEyKqplqKnJ582Kw8PewYAWvkH4qUWhitN3O2q9RF7dzkvjyK5HbzjLA==",
"version": "22.0.0",
"resolved": "https://registry.npmjs.org/puppeteer-core/-/puppeteer-core-22.0.0.tgz",
"integrity": "sha512-S3s91rLde0A86PWVeNY82h+P0fdS7CTiNWAicCVH/bIspRP4nS2PnO5j+VTFqCah0ZJizGzpVPAmxVYbLxTc9w==",
"dependencies": {
"@puppeteer/browsers": "1.8.0",
"chromium-bidi": "0.4.33",
"@puppeteer/browsers": "2.0.0",
"chromium-bidi": "0.5.8",
"cross-fetch": "4.0.0",
"debug": "4.3.4",
"devtools-protocol": "0.0.1203626",
"ws": "8.14.2"
"devtools-protocol": "0.0.1232444",
"ws": "8.16.0"
},
"engines": {
"node": ">=16.13.2"
}
},
"node_modules/puppeteer-core/node_modules/ws": {
"version": "8.14.2",
"resolved": "https://registry.npmjs.org/ws/-/ws-8.14.2.tgz",
"integrity": "sha512-wEBG1ftX4jcglPxgFCMJmZ2PLtSbJ2Peg6TmpJFTbe9GZYOQCDPdMYu/Tm0/bGZkw8paZnJY45J4K2PZrLYq8g==",
"engines": {
"node": ">=10.0.0"
},
"peerDependencies": {
"bufferutil": "^4.0.1",
"utf-8-validate": ">=5.0.2"
},
"peerDependenciesMeta": {
"bufferutil": {
"optional": true
},
"utf-8-validate": {
"optional": true
}
"node": ">=18"
}
},
"node_modules/qs": {
@ -1703,8 +1675,7 @@
},
"node_modules/readability-extractor": {
"version": "0.0.11",
"resolved": "git+ssh://git@github.com/ArchiveBox/readability-extractor.git#2fb4689a65c6433036453dcbee7a268840604eb9",
"license": "MIT",
"resolved": "git+ssh://git@github.com/ArchiveBox/readability-extractor.git#057f2046f9535cfc6df7b8d551aaad32a9e6226c",
"dependencies": {
"@mozilla/readability": "^0.5.0",
"dompurify": "^3.0.6",
@ -1740,25 +1711,19 @@
"node": ">=0.10.0"
}
},
"node_modules/require-from-string": {
"version": "2.0.2",
"resolved": "https://registry.npmjs.org/require-from-string/-/require-from-string-2.0.2.tgz",
"integrity": "sha512-Xf0nWe6RseziFMu+Ap9biiUbmplq6S9/p+7w7YXP/JBHhrUDDUhwa+vANyubuqfZWTveU//DYVGsDG7RKL/vEw==",
"engines": {
"node": ">=0.10.0"
}
},
"node_modules/requires-port": {
"version": "1.0.0",
"resolved": "https://registry.npmjs.org/requires-port/-/requires-port-1.0.0.tgz",
"integrity": "sha512-KigOCHcocU3XODJxsu8i/j8T9tzT4adHiecwORRQ0ZZFcp7ahwXuRU1m+yuO90C5ZUyGeGfocHDI14M3L3yDAQ=="
},
"node_modules/rimraf": {
"version": "3.0.2",
"resolved": "https://registry.npmjs.org/rimraf/-/rimraf-3.0.2.tgz",
"integrity": "sha512-JZkJMZkAGFFPP2YqXZXPbMlMBgsxzE8ILs4lMIX/2o0L9UBw9O/Y3o6wFw/i9YLapcUJWwqbi3kdxIPdC62TIA==",
"dependencies": {
"glob": "^7.1.3"
},
"bin": {
"rimraf": "bin.js"
},
"funding": {
"url": "https://github.com/sponsors/isaacs"
}
},
"node_modules/rrweb-cssom": {
"version": "0.6.0",
"resolved": "https://registry.npmjs.org/rrweb-cssom/-/rrweb-cssom-0.6.0.tgz",
@ -1800,9 +1765,9 @@
}
},
"node_modules/selenium-webdriver": {
"version": "4.15.0",
"resolved": "https://registry.npmjs.org/selenium-webdriver/-/selenium-webdriver-4.15.0.tgz",
"integrity": "sha512-BNG1bq+KWiBGHcJ/wULi0eKY0yaDqFIbEmtbsYJmfaEghdCkXBsx1akgOorhNwjBipOr0uwpvNXqT6/nzl+zjg==",
"version": "4.17.0",
"resolved": "https://registry.npmjs.org/selenium-webdriver/-/selenium-webdriver-4.17.0.tgz",
"integrity": "sha512-e2E+2XBlGepzwgFbyQfSwo9Cbj6G5fFfs9MzAS00nC99EewmcS2rwn2MwtgfP7I5p1e7DYv4HQJXtWedsu6DvA==",
"dependencies": {
"jszip": "^3.10.1",
"tmp": "^0.2.1",
@ -1818,16 +1783,16 @@
"integrity": "sha512-MATJdZp8sLqDl/68LfQmbP8zKPLQNV6BIZoIgrscFDQ+RsvK/BxeDQOgyxKKoh0y/8h3BqVFnCqQ/gd+reiIXA=="
},
"node_modules/single-file-cli": {
"version": "1.1.46",
"resolved": "https://registry.npmjs.org/single-file-cli/-/single-file-cli-1.1.46.tgz",
"integrity": "sha512-+vFj0a5Y4ESqpMwH0T6738pg8ZA9KVhhl6OlIOsicamGNU9DnMa+q9dL1S2KnLWHoauKjU0BThhR/YKUleJSxw==",
"version": "1.1.54",
"resolved": "https://registry.npmjs.org/single-file-cli/-/single-file-cli-1.1.54.tgz",
"integrity": "sha512-wnVPg7BklhswwFVrtuFXbmluI4piHxg2dC0xATxYTeXAld6PnRPlnp7ufallRKArjFBZdP2u+ihMkOIp7A38XA==",
"dependencies": {
"file-url": "3.0.0",
"iconv-lite": "0.6.3",
"jsdom": "23.0.0",
"puppeteer-core": "21.5.2",
"selenium-webdriver": "4.15.0",
"single-file-core": "1.3.15",
"jsdom": "24.0.0",
"puppeteer-core": "22.0.0",
"selenium-webdriver": "4.17.0",
"single-file-core": "1.3.24",
"strong-data-uri": "1.0.6",
"yargs": "17.7.2"
},
@ -1847,11 +1812,11 @@
}
},
"node_modules/single-file-cli/node_modules/jsdom": {
"version": "23.0.0",
"resolved": "https://registry.npmjs.org/jsdom/-/jsdom-23.0.0.tgz",
"integrity": "sha512-cbL/UCtohJguhFC7c2/hgW6BeZCNvP7URQGnx9tSJRYKCdnfbfWOrtuLTMfiB2VxKsx5wPHVsh/J0aBy9lIIhQ==",
"version": "24.0.0",
"resolved": "https://registry.npmjs.org/jsdom/-/jsdom-24.0.0.tgz",
"integrity": "sha512-UDS2NayCvmXSXVP6mpTj+73JnNQadZlr9N68189xib2tx5Mls7swlTNao26IoHv46BZJFvXygyRtyXd1feAk1A==",
"dependencies": {
"cssstyle": "^3.0.0",
"cssstyle": "^4.0.1",
"data-urls": "^5.0.0",
"decimal.js": "^10.4.3",
"form-data": "^4.0.0",
@ -1870,14 +1835,14 @@
"whatwg-encoding": "^3.1.1",
"whatwg-mimetype": "^4.0.0",
"whatwg-url": "^14.0.0",
"ws": "^8.14.2",
"ws": "^8.16.0",
"xml-name-validator": "^5.0.0"
},
"engines": {
"node": ">=18"
},
"peerDependencies": {
"canvas": "^3.0.0"
"canvas": "^2.11.2"
},
"peerDependenciesMeta": {
"canvas": {
@ -1909,9 +1874,9 @@
}
},
"node_modules/single-file-core": {
"version": "1.3.15",
"resolved": "https://registry.npmjs.org/single-file-core/-/single-file-core-1.3.15.tgz",
"integrity": "sha512-/YNpHBwASWNxmSmZXz0xRolmXf0+PGAbwpVrwn6A8tYeuAdezxxde5RYTTQ7V4Zv68+H4JMhE2DwCRV0sVUGNA=="
"version": "1.3.24",
"resolved": "https://registry.npmjs.org/single-file-core/-/single-file-core-1.3.24.tgz",
"integrity": "sha512-1B256mKBbNV8jXAV+hRyEv0aMa7tn0C0Ci+zx7Ya4ZXZB3b9/1MgKsB/fxVwDiL28WJSU0pxzh8ftIYubCNn9w=="
},
"node_modules/smart-buffer": {
"version": "4.2.0",
@ -1923,24 +1888,24 @@
}
},
"node_modules/socks": {
"version": "2.7.1",
"resolved": "https://registry.npmjs.org/socks/-/socks-2.7.1.tgz",
"integrity": "sha512-7maUZy1N7uo6+WVEX6psASxtNlKaNVMlGQKkG/63nEDdLOWNbiUMoLK7X4uYoLhQstau72mLgfEWcXcwsaHbYQ==",
"version": "2.8.3",
"resolved": "https://registry.npmjs.org/socks/-/socks-2.8.3.tgz",
"integrity": "sha512-l5x7VUUWbjVFbafGLxPWkYsHIhEvmF85tbIeFZWc8ZPtoMyybuEhL7Jye/ooC4/d48FgOjSJXgsF/AJPYCW8Zw==",
"dependencies": {
"ip": "^2.0.0",
"ip-address": "^9.0.5",
"smart-buffer": "^4.2.0"
},
"engines": {
"node": ">= 10.13.0",
"node": ">= 10.0.0",
"npm": ">= 3.0.0"
}
},
"node_modules/socks-proxy-agent": {
"version": "8.0.2",
"resolved": "https://registry.npmjs.org/socks-proxy-agent/-/socks-proxy-agent-8.0.2.tgz",
"integrity": "sha512-8zuqoLv1aP/66PHF5TqwJ7Czm3Yv32urJQHrVyhD7mmA6d61Zv8cIXQYPTWwmg6qlupnPvs/QKDmfa4P/qct2g==",
"version": "8.0.3",
"resolved": "https://registry.npmjs.org/socks-proxy-agent/-/socks-proxy-agent-8.0.3.tgz",
"integrity": "sha512-VNegTZKhuGq5vSD6XNKlbqWhyt/40CgoEw8XxD6dhnm8Jq9IEa3nIa4HwnM8XOqU0CdB0BwWVXusqiFXfHB3+A==",
"dependencies": {
"agent-base": "^7.0.2",
"agent-base": "^7.1.1",
"debug": "^4.3.4",
"socks": "^2.7.1"
},
@ -1948,11 +1913,6 @@
"node": ">= 14"
}
},
"node_modules/socks/node_modules/ip": {
"version": "2.0.0",
"resolved": "https://registry.npmjs.org/ip/-/ip-2.0.0.tgz",
"integrity": "sha512-WKa+XuLG1A1R0UWhl2+1XQSi+fZWMsYKffMZTTYsiZaUD8k2yDAj5atimTUD2TZkyCkNEeYE5NhFZmupOGtjYQ=="
},
"node_modules/source-map": {
"version": "0.6.1",
"resolved": "https://registry.npmjs.org/source-map/-/source-map-0.6.1.tgz",
@ -1962,6 +1922,19 @@
"node": ">=0.10.0"
}
},
"node_modules/source-map-js": {
"version": "1.2.0",
"resolved": "https://registry.npmjs.org/source-map-js/-/source-map-js-1.2.0.tgz",
"integrity": "sha512-itJW8lvSA0TXEphiRoawsCksnlf8SyvmFzIhltqAHluXd88pkCd+cXJVHTDwdCr0IzwptSm035IHQktUu1QUMg==",
"engines": {
"node": ">=0.10.0"
}
},
"node_modules/sprintf-js": {
"version": "1.1.3",
"resolved": "https://registry.npmjs.org/sprintf-js/-/sprintf-js-1.1.3.tgz",
"integrity": "sha512-Oo+0REFV59/rz3gfJNKQiBlwfHaSESl1pcGyABQsnnIfWOFt6JNj5gCog2U6MLZ//IGYD+nA8nI+mTShREReaA=="
},
"node_modules/sshpk": {
"version": "1.18.0",
"resolved": "https://registry.npmjs.org/sshpk/-/sshpk-1.18.0.tgz",
@ -1986,6 +1959,11 @@
"node": ">=0.10.0"
}
},
"node_modules/sshpk/node_modules/jsbn": {
"version": "0.1.1",
"resolved": "https://registry.npmjs.org/jsbn/-/jsbn-0.1.1.tgz",
"integrity": "sha512-UVU9dibq2JcFWxQPA6KCqj5O42VOmAY3zQUfEKxU0KpTGXwNoCjkX1e13eHNvw/xPynt6pU0rZ1htjWTNTSXsg=="
},
"node_modules/stream-length": {
"version": "1.0.2",
"resolved": "https://registry.npmjs.org/stream-length/-/stream-length-1.0.2.tgz",
@ -1995,12 +1973,15 @@
}
},
"node_modules/streamx": {
"version": "2.15.6",
"resolved": "https://registry.npmjs.org/streamx/-/streamx-2.15.6.tgz",
"integrity": "sha512-q+vQL4AAz+FdfT137VF69Cc/APqUbxy+MDOImRrMvchJpigHj9GksgDU2LYbO9rx7RX6osWgxJB2WxhYv4SZAw==",
"version": "2.16.1",
"resolved": "https://registry.npmjs.org/streamx/-/streamx-2.16.1.tgz",
"integrity": "sha512-m9QYj6WygWyWa3H1YY69amr4nVgy61xfjys7xO7kviL5rfIEc2naf+ewFiOA+aEJD7y0JO3h2GoiUv4TDwEGzQ==",
"dependencies": {
"fast-fifo": "^1.1.0",
"queue-tick": "^1.0.1"
},
"optionalDependencies": {
"bare-events": "^2.2.0"
}
},
"node_modules/string_decoder": {
@ -2067,9 +2048,9 @@
}
},
"node_modules/tar-stream": {
"version": "3.1.6",
"resolved": "https://registry.npmjs.org/tar-stream/-/tar-stream-3.1.6.tgz",
"integrity": "sha512-B/UyjYwPpMBv+PaFSWAmtYjwdrlEaZQEhMIBFNC5oEG8lpiW8XjcSdmEaClj28ArfKScKHs2nshz3k2le6crsg==",
"version": "3.1.7",
"resolved": "https://registry.npmjs.org/tar-stream/-/tar-stream-3.1.7.tgz",
"integrity": "sha512-qJj60CXt7IU1Ffyc3NJMjh6EkuCFej46zUqJ4J7pqYlThyd9bO0XBTmcOIhSzZJVWfsLks0+nle/j538YAW9RQ==",
"dependencies": {
"b4a": "^1.6.4",
"fast-fifo": "^1.2.0",
@ -2082,14 +2063,11 @@
"integrity": "sha512-w89qg7PI8wAdvX60bMDP+bFoD5Dvhm9oLheFp5O4a2QF0cSBGsBX4qZmadPMvVqlLJBBci+WqGGOAPvcDeNSVg=="
},
"node_modules/tmp": {
"version": "0.2.1",
"resolved": "https://registry.npmjs.org/tmp/-/tmp-0.2.1.tgz",
"integrity": "sha512-76SUhtfqR2Ijn+xllcI5P1oyannHNHByD80W1q447gU3mp9G9PSpGdWmjUOHRDPiHYacIk66W7ubDTuPF3BEtQ==",
"dependencies": {
"rimraf": "^3.0.0"
},
"version": "0.2.3",
"resolved": "https://registry.npmjs.org/tmp/-/tmp-0.2.3.tgz",
"integrity": "sha512-nZD7m9iCPC5g0pYmcaxogYKggSfLsdxl8of3Q/oIbqCqLLIO9IAF0GWjX1z9NZRHPiXv8Wex4yDCaZsgEw0Y8w==",
"engines": {
"node": ">=8.17.0"
"node": ">=14.14"
}
},
"node_modules/tough-cookie": {
@ -2125,9 +2103,9 @@
"integrity": "sha512-AEYxH93jGFPn/a2iVAwW87VuUIkR1FVUKB77NwMF7nBTDkDrrT/Hpt/IrCJ0QXhW27jTBDcf5ZY7w6RiqTMw2Q=="
},
"node_modules/turndown": {
"version": "7.1.2",
"resolved": "https://registry.npmjs.org/turndown/-/turndown-7.1.2.tgz",
"integrity": "sha512-ntI9R7fcUKjqBP6QU8rBK2Ehyt8LAzt3UBT9JR9tgo6GtuKvyUzpayWmeMKJw1DPdXzktvtIT8m2mVXz+bL/Qg==",
"version": "7.1.3",
"resolved": "https://registry.npmjs.org/turndown/-/turndown-7.1.3.tgz",
"integrity": "sha512-Z3/iJ6IWh8VBiACWQJaA5ulPQE5E1QwvBHj00uGzdQxdRnd8fh1DPqNOJqzQDu6DkOstORrtXzf/9adB+vMtEA==",
"dependencies": {
"domino": "^2.1.6"
}
@ -2178,9 +2156,9 @@
}
},
"node_modules/urlpattern-polyfill": {
"version": "9.0.0",
"resolved": "https://registry.npmjs.org/urlpattern-polyfill/-/urlpattern-polyfill-9.0.0.tgz",
"integrity": "sha512-WHN8KDQblxd32odxeIgo83rdVDE2bvdkb86it7bMhYZwWKJz0+O0RK/eZiHYnM+zgt/U7hAHOlCQGfjjvSkw2g=="
"version": "10.0.0",
"resolved": "https://registry.npmjs.org/urlpattern-polyfill/-/urlpattern-polyfill-10.0.0.tgz",
"integrity": "sha512-H/A06tKD7sS1O1X2SshBVeA5FLycRpjqiBeqGKmBwBDBy28EnRjORxTNe269KSSr5un5qyWi1iL61wLxpd+ZOg=="
},
"node_modules/util-deprecate": {
"version": "1.0.2",

View file

@ -1,6 +1,6 @@
{
"name": "archivebox",
"version": "0.7.2",
"version": "0.8.0",
"description": "ArchiveBox: The self-hosted internet archive",
"author": "Nick Sweeting <archivebox-npm@sweeting.me>",
"repository": "github:ArchiveBox/ArchiveBox",
@ -8,6 +8,6 @@
"dependencies": {
"@postlight/parser": "^2.2.3",
"readability-extractor": "github:ArchiveBox/readability-extractor",
"single-file-cli": "^1.1.46"
"single-file-cli": "^1.1.54"
}
}

pdm.lock (1391 changed lines)

File diff suppressed because it is too large

@ -1 +1 @@
Subproject commit 5323fc773d33ef3f219c35c946f3b353b1251d37
Subproject commit 1380be7e4ef156d85957dfef8c6d154ef9880578

View file

@ -1,26 +1,48 @@
[project]
name = "archivebox"
version = "0.7.2"
version = "0.8.0"
package-dir = "archivebox"
requires-python = ">=3.10,<3.13"
platform = "py3-none-any"
description = "Self-hosted internet archiving solution."
authors = [
{name = "Nick Sweeting", email = "pyproject.toml@archivebox.io"},
]
dependencies = [
"croniter>=0.3.34",
"dateparser>=1.0.0",
"django-extensions>=3.0.3",
"django>=3.1.3,<3.2",
"ipython>5.0.0",
"mypy-extensions>=0.4.3",
"python-crontab>=2.5.1",
"requests>=2.24.0",
"w3lib>=1.22.0",
"yt-dlp>=2023.10.13",
# "playwright>=1.39.0; platform_machine != 'armv7l'",
]
requires-python = ">=3.9,<3.12"
readme = "README.md"
authors = [{name = "Nick Sweeting", email = "pyproject.toml@archivebox.io"}]
license = {text = "MIT"}
readme = "README.md"
# pdm install
# pdm update --unconstrained
dependencies = [
# Base Framework and Language Dependencies
"setuptools>=69.5.1",
"django>=4.2.0,<5.0",
"django-extensions>=3.2.3",
"mypy-extensions>=1.0.0",
# Python Helper Libraries
"requests>=2.31.0",
"dateparser>=1.0.0",
"feedparser>=6.0.11",
"w3lib>=1.22.0",
# Feature-Specific Dependencies
"python-crontab>=2.5.1", # for: archivebox schedule
"croniter>=0.3.34", # for: archivebox schedule
"ipython>5.0.0", # for: archivebox shell
# Extractor Dependencies
"yt-dlp>=2024.4.9", # for: media
"playwright>=1.43.0; platform_machine != 'armv7l'", # WARNING: playwright doesn't have any sdist, causes trouble on build systems that refuse to install wheel-only packages
# TODO: add more extractors
# - gallery-dl
# - scihubdl
# - See Github issues for more...
]
homepage = "https://github.com/ArchiveBox/ArchiveBox"
repository = "https://github.com/ArchiveBox/ArchiveBox"
documentation = "https://github.com/ArchiveBox/ArchiveBox/wiki"
keywords = ["internet archiving", "web archiving", "digipres", "warc", "preservation", "backups", "archiving", "web", "bookmarks", "puppeteer", "browser", "download"]
classifiers = [
"Development Status :: 4 - Beta",
"Environment :: Console",
@ -53,59 +75,84 @@ classifiers = [
"Topic :: Utilities",
"Typing :: Typed",
]
# dynamic = ["version"] # TODO: programatticaly fetch version from package.json at build time
# pdm lock -G:all
# pdm lock --group=':all'
# pdm install -G:all
# pdm update --group=':all' --unconstrained
[project.optional-dependencies]
sonic = [
# echo "deb [signed-by=/usr/share/keyrings/valeriansaliou_sonic.gpg] https://packagecloud.io/valeriansaliou/sonic/debian/ bookworm main" > /etc/apt/sources.list.d/valeriansaliou_sonic.list
# curl -fsSL https://packagecloud.io/valeriansaliou/sonic/gpgkey | gpg --dearmor -o /usr/share/keyrings/valeriansaliou_sonic.gpg
# apt install sonic
"sonic-client>=1.0.0",
]
ldap = [
# apt install libldap2-dev libsasl2-dev python3-ldap
"python-ldap>=3.4.3",
"django-auth-ldap>=4.1.0",
]
# pdm lock --group=':all' --dev
# pdm install -G:all --dev
# pdm update --dev --unconstrained
[tool.pdm.dev-dependencies]
dev = [
# build
"setuptools>=69.0.3",
build = [
"setuptools>=69.5.1",
"pip",
"wheel",
"pdm",
"homebrew-pypi-poet>=0.10.0",
# docs
"homebrew-pypi-poet>=0.10.0", # for: generating archivebox.rb brewfile list of python packages
]
docs = [
"recommonmark",
"sphinx",
"sphinx-rtd-theme",
# debug
]
debug = [
"django-debug-toolbar",
"djdt_flamegraph",
"ipdb",
# test
]
test = [
"pdm[pytest]",
"pytest",
# lint
]
lint = [
"flake8",
"mypy",
"django-stubs",
]
[build-system]
requires = ["pdm-backend"]
build-backend = "pdm.backend"
[project.scripts]
archivebox = "archivebox.cli:main"
[tool.pdm.scripts]
lint = "./bin/lint.sh"
test = "./bin/test.sh"
# all = {composite = ["lint mypackage/", "test -v tests/"]}
[project.optional-dependencies]
sonic = [
# echo "deb [signed-by=/usr/share/keyrings/valeriansaliou_sonic.gpg] https://packagecloud.io/valeriansaliou/sonic/debian/ bookworm main" > /etc/apt/sources.list.d/valeriansaliou_sonic.list
# curl -fsSL https://packagecloud.io/valeriansaliou/sonic/gpgkey | gpg --dearmor -o /usr/share/keyrings/valeriansaliou_sonic.gpg
"sonic-client>=0.0.5",
]
ldap = [
# apt install libldap2-dev libsasl2-dev
"python-ldap>=3.4.3",
"django-auth-ldap>=4.1.0",
]
# playwright = [
# platform_machine isn't respected by pdm export -o requirements.txt, which breaks arm/v7
# "playwright>=1.39.0; platform_machine != 'armv7l'",
# ]
[tool.pytest.ini_options]
testpaths = [ "tests" ]
[project.scripts]
archivebox = "archivebox.cli:main"
[tool.mypy]
mypy_path = "archivebox"
namespace_packages = true
explicit_package_bases = true
# follow_imports = "silent"
# ignore_missing_imports = true
# disallow_incomplete_defs = true
# disallow_untyped_defs = true
# disallow_untyped_decorators = true
# exclude = "pdm/(pep582/|models/in_process/.+\\.py)"
plugins = ["mypy_django_plugin.main"]
[build-system]
requires = ["pdm-backend"]
build-backend = "pdm.backend"
[project.urls]
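
A quick way to sanity-check the reorganized dependency groups above is to read them back out of pyproject.toml. A minimal sketch using the standard-library tomllib (Python 3.11+); this is an illustration, not part of this PR:

# sketch: list the optional-dependency extras declared in pyproject.toml (assumes Python 3.11+ for tomllib)
import tomllib

with open("pyproject.toml", "rb") as f:
    pyproject = tomllib.load(f)

for extra, deps in pyproject.get("project", {}).get("optional-dependencies", {}).items():
    print(f"extra '{extra}': {len(deps)} package(s)")
    for dep in deps:
        print("  -", dep)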

View file

@ -1,54 +1,60 @@
# This file is @generated by PDM.
# Please do not edit it manually.
asgiref==3.7.2
asgiref==3.8.1
asttokens==2.4.1
brotli==1.1.0; implementation_name == "cpython"
brotlicffi==1.1.0.0; implementation_name != "cpython"
certifi==2023.11.17
certifi==2024.2.2
cffi==1.16.0; implementation_name != "cpython"
charset-normalizer==3.3.2
colorama==0.4.6; sys_platform == "win32"
croniter==2.0.1
croniter==2.0.5
dateparser==1.2.0
decorator==5.1.1
django==3.1.14
django-auth-ldap==4.1.0
django-extensions==3.1.5
exceptiongroup==1.2.0; python_version < "3.11"
django==4.2.11
django-auth-ldap==4.8.0
django-extensions==3.2.3
exceptiongroup==1.2.1; python_version < "3.11"
executing==2.0.1
idna==3.6
ipython==8.18.1
feedparser==6.0.11
greenlet==3.0.3; platform_machine != "armv7l"
idna==3.7
ipython==8.23.0
jedi==0.19.1
matplotlib-inline==0.1.6
matplotlib-inline==0.1.7
mutagen==1.47.0
mypy-extensions==1.0.0
parso==0.8.3
pexpect==4.9.0; sys_platform != "win32"
parso==0.8.4
pexpect==4.9.0; sys_platform != "win32" and sys_platform != "emscripten"
playwright==1.43.0; platform_machine != "armv7l"
prompt-toolkit==3.0.43
ptyprocess==0.7.0; sys_platform != "win32"
ptyprocess==0.7.0; sys_platform != "win32" and sys_platform != "emscripten"
pure-eval==0.2.2
pyasn1==0.5.1
pyasn1-modules==0.3.0
pycparser==2.21; implementation_name != "cpython"
pycryptodomex==3.19.1
pyasn1==0.6.0
pyasn1-modules==0.4.0
pycparser==2.22; implementation_name != "cpython"
pycryptodomex==3.20.0
pyee==11.1.0; platform_machine != "armv7l"
pygments==2.17.2
python-crontab==3.0.0
python-dateutil==2.8.2
python-dateutil==2.9.0.post0
python-ldap==3.4.4
pytz==2023.3.post1
regex==2023.12.25
pytz==2024.1
regex==2024.4.16
requests==2.31.0
setuptools==69.5.1
sgmllib3k==1.0.0
six==1.16.0
sonic-client==1.0.0
sqlparse==0.4.4
sqlparse==0.5.0
stack-data==0.6.3
traitlets==5.14.1
typing-extensions==4.9.0; python_version < "3.11"
tzdata==2023.4; platform_system == "Windows"
traitlets==5.14.3
typing-extensions==4.11.0; python_version < "3.12" or platform_machine != "armv7l"
tzdata==2024.1; sys_platform == "win32" or platform_system == "Windows"
tzlocal==5.2
urllib3==2.1.0
urllib3==2.2.1
w3lib==2.1.2
wcwidth==0.2.12
wcwidth==0.2.13
websockets==12.0
yt-dlp==2023.12.30
yt-dlp==2024.4.9

View file

@ -50,4 +50,4 @@ def redirect_to_static(filename):
def start():
    run(host='localhost', port=8080)
    run(host='localhost', port=8080, quiet=True)
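
For context, bottle's run() logs every request to stdout by default; passing quiet=True keeps the mock server from cluttering pytest output. A rough sketch of the bottle-based static-file server pattern the tests rely on (the route and directory names here are illustrative assumptions, not the repo's actual mock_server code):

# illustrative sketch of a bottle static-file mock server (not the repo's actual mock_server code)
from bottle import route, run, static_file

@route('/static/<filename>')
def serve_static(filename):
    # serve test fixture pages from a local templates/ directory (path is an assumption)
    return static_file(filename, root='./templates')

def start():
    # quiet=True suppresses bottle's per-request logging so test output stays readable
    run(host='localhost', port=8080, quiet=True)

if __name__ == '__main__':
    start()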

View file

@ -0,0 +1 @@
{"href":"http://127.0.0.1:8080/static/example.com.html","description":"Example","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"Tag1 Tag2","trap":"http://www.example.com/should-not-exist"}

View file

@ -0,0 +1,24 @@
<?xml version="1.0" encoding="utf-8"?>
<feed
xml:lang="en"
xmlns="http://www.w3.org/2005/Atom"
>
<id>http://www.example.com/</id>
<title>Example of an Atom feed</title>
<link rel="self" type="application/atom+xml" href="http://www.example.com/index.atom" />
<link rel="alternate" type="text/html" href="http://www.example.com/" />
<author>
<name>Jim Winstead</name>
</author>
<updated>2024-02-26T03:18:26Z</updated>
<entry>
<title>Example</title>
<link rel="alternate" type="text/html" href="http://127.0.0.1:8080/static/example.com.html" />
<id>tag:example.com,2024-02-25:3319</id>
<updated>2024-02-26T03:18:26Z</updated>
<published>2024-02-25T19:18:25-08:00</published>
<category term="Tag1" scheme="http://example.com/archive" />
<category term="Tag2" scheme="http://example.com/archive" />
<content type="html">This is some &lt;b&gt;content&lt;/b&gt;</content>
</entry>
</feed>
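
The Atom fixture above is what the new --parser=rss tests pipe into archivebox add. As a rough illustration of how such a feed can be read with feedparser (added to the Python dependencies in this release), here is a sketch; it is not ArchiveBox's actual parser code:

# sketch: pull links and tags out of an Atom/RSS feed with feedparser (not ArchiveBox's real parser)
import feedparser

feed = feedparser.parse('example.atom')  # accepts a file path, URL, or raw string
for entry in feed.entries:
    url = entry.link                                    # -> http://127.0.0.1:8080/static/example.com.html
    tags = [tag.term for tag in entry.get('tags', [])]  # -> ['Tag1', 'Tag2']
    print(url, tags)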

View file

@ -0,0 +1,6 @@
[
{"href":"http://127.0.0.1:8080/static/example.com.html","description":"Example","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"Tag1 Tag2","trap":"http://www.example.com/should-not-exist"},
{"href":"http://127.0.0.1:8080/static/iana.org.html","description":"Example 2","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:43Z","shared":"no","toread":"no","tags":"Tag3,Tag4 with Space"},
{"href":"http://127.0.0.1:8080/static/shift_jis.html","description":"Example 2","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:44Z","shared":"no","toread":"no","tags":["Tag5","Tag6 with Space"]},
{"href":"http://127.0.0.1:8080/static/title_og_with_html","description":"Example 2","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:45Z","shared":"no","toread":"no"}
]
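
Note that the fixture above deliberately mixes tag formats: a space-separated string, a comma-separated string, a JSON list, and a missing field. A minimal sketch of how such Pinboard-style entries could be normalized (an illustration only, not ArchiveBox's parser):

# sketch: normalize the 'tags' field of a Pinboard-style JSON export
# (it may be a space-separated string, a comma-separated string, a list, or absent)
import json

def normalize_tags(entry):
    tags = entry.get('tags', [])
    if isinstance(tags, str):
        sep = ',' if ',' in tags else ' '
        tags = [t.strip() for t in tags.split(sep) if t.strip()]
    return tags

with open('example.json', 'r', encoding='utf-8') as f:
    for entry in json.load(f):
        print(entry['href'], normalize_tags(entry))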

View file

@ -0,0 +1,2 @@
this line would cause problems but --parser=json will actually skip it
[{"href":"http://127.0.0.1:8080/static/example.com.html","description":"Example","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"Tag1 Tag2","trap":"http://www.example.com/should-not-exist"}]

View file

@ -0,0 +1,4 @@
{"href":"http://127.0.0.1:8080/static/example.com.html","description":"Example","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"Tag1 Tag2","trap":"http://www.example.com/should-not-exist"}
{"href":"http://127.0.0.1:8080/static/iana.org.html","description":"Example 2","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:43Z","shared":"no","toread":"no","tags":"Tag3,Tag4 with Space"}
{"href":"http://127.0.0.1:8080/static/shift_jis.html","description":"Example 2","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:44Z","shared":"no","toread":"no","tags":["Tag5","Tag6 with Space"]}
{"href":"http://127.0.0.1:8080/static/title_og_with_html","description":"Example 2","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:45Z","shared":"no","toread":"no"}

View file

@ -0,0 +1,32 @@
<?xml version="1.0" encoding="utf-8"?>
<rss version="2.0"
xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:admin="http://webns.net/mvcb/"
xmlns:content="http://purl.org/rss/1.0/modules/content/"
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
<channel>
<title>Sample Feed</title>
<link>http://example.org/</link>
<description>For documentation only</description>
<dc:language>en-us</dc:language>
<dc:creator>Nobody (nobody@example.org)</dc:creator>
<dc:rights>Public domain</dc:rights>
<dc:date>2024-02-26T17:28:12-08:00</dc:date>
<admin:generatorAgent rdf:resource="http://www.example.org/"/>
<admin:errorReportsTo rdf:resource="mailto:nobody@example.org"/>
<item>
<title>First!</title>
<link>http://127.0.0.1:8080/static/example.com.html</link>
<guid isPermaLink="false">just-an@example.org</guid>
<description>
This has a description.
</description>
<dc:subject>Tag1 Tag2</dc:subject>
<dc:date>2024-02-26T17:28:12-08:00</dc:date>
<content:encoded><![CDATA[
This has a <b>description</b>.]]>
</content:encoded>
</item>
</channel>
</rss>

View file

@ -91,3 +91,198 @@ def test_extract_input_uses_only_passed_extractors(tmp_path, process):
assert (archived_item_path / "warc").exists()
assert not (archived_item_path / "singlefile.html").exists()
def test_json(tmp_path, process, disable_extractors_dict):
with open('../../mock_server/templates/example.json', 'r', encoding='utf-8') as f:
arg_process = subprocess.run(
["archivebox", "add", "--index-only", "--parser=json"],
stdin=f,
capture_output=True,
env=disable_extractors_dict,
)
conn = sqlite3.connect("index.sqlite3")
c = conn.cursor()
urls = c.execute("SELECT url from core_snapshot").fetchall()
tags = c.execute("SELECT name from core_tag").fetchall()
conn.commit()
conn.close()
urls = list(map(lambda x: x[0], urls))
assert "http://127.0.0.1:8080/static/example.com.html" in urls
assert "http://127.0.0.1:8080/static/iana.org.html" in urls
assert "http://127.0.0.1:8080/static/shift_jis.html" in urls
assert "http://127.0.0.1:8080/static/title_og_with_html" in urls
# if the following URL appears, we must have fallen back to another parser
assert not "http://www.example.com/should-not-exist" in urls
tags = list(map(lambda x: x[0], tags))
assert "Tag1" in tags
assert "Tag2" in tags
assert "Tag3" in tags
assert "Tag4 with Space" in tags
assert "Tag5" in tags
assert "Tag6 with Space" in tags
def test_json_with_leading_garbage(tmp_path, process, disable_extractors_dict):
with open('../../mock_server/templates/example.json.bad', 'r', encoding='utf-8') as f:
arg_process = subprocess.run(
["archivebox", "add", "--index-only", "--parser=json"],
stdin=f,
capture_output=True,
env=disable_extractors_dict,
)
conn = sqlite3.connect("index.sqlite3")
c = conn.cursor()
urls = c.execute("SELECT url from core_snapshot").fetchall()
tags = c.execute("SELECT name from core_tag").fetchall()
conn.commit()
conn.close()
urls = list(map(lambda x: x[0], urls))
assert "http://127.0.0.1:8080/static/example.com.html" in urls
# if the following URL appears, we must have fallen back to another parser
assert not "http://www.example.com/should-not-exist" in urls
tags = list(map(lambda x: x[0], tags))
assert "Tag1" in tags
assert "Tag2" in tags
def test_generic_rss(tmp_path, process, disable_extractors_dict):
with open('../../mock_server/templates/example.rss', 'r', encoding='utf-8') as f:
arg_process = subprocess.run(
["archivebox", "add", "--index-only", "--parser=rss"],
stdin=f,
capture_output=True,
env=disable_extractors_dict,
)
conn = sqlite3.connect("index.sqlite3")
c = conn.cursor()
urls = c.execute("SELECT url from core_snapshot").fetchall()
tags = c.execute("SELECT name from core_tag").fetchall()
conn.commit()
conn.close()
urls = list(map(lambda x: x[0], urls))
assert "http://127.0.0.1:8080/static/example.com.html" in urls
# if the following URL appears, we must have fallen back to another parser
assert not "http://purl.org/dc/elements/1.1/" in urls
tags = list(map(lambda x: x[0], tags))
assert "Tag1 Tag2" in tags
def test_pinboard_rss(tmp_path, process, disable_extractors_dict):
with open('../../mock_server/templates/example.rss', 'r', encoding='utf-8') as f:
arg_process = subprocess.run(
["archivebox", "add", "--index-only", "--parser=pinboard_rss"],
stdin=f,
capture_output=True,
env=disable_extractors_dict,
)
conn = sqlite3.connect("index.sqlite3")
c = conn.cursor()
tags = c.execute("SELECT name from core_tag").fetchall()
conn.commit()
conn.close()
tags = list(map(lambda x: x[0], tags))
assert "Tag1" in tags
assert "Tag2" in tags
def test_atom(tmp_path, process, disable_extractors_dict):
with open('../../mock_server/templates/example.atom', 'r', encoding='utf-8') as f:
arg_process = subprocess.run(
["archivebox", "add", "--index-only", "--parser=rss"],
stdin=f,
capture_output=True,
env=disable_extractors_dict,
)
conn = sqlite3.connect("index.sqlite3")
c = conn.cursor()
urls = c.execute("SELECT url from core_snapshot").fetchall()
tags = c.execute("SELECT name from core_tag").fetchall()
conn.commit()
conn.close()
urls = list(map(lambda x: x[0], urls))
assert "http://127.0.0.1:8080/static/example.com.html" in urls
# if the following URL appears, we must have fallen back to another parser
assert not "http://www.w3.org/2005/Atom" in urls
tags = list(map(lambda x: x[0], tags))
assert "Tag1" in tags
assert "Tag2" in tags
def test_jsonl(tmp_path, process, disable_extractors_dict):
with open('../../mock_server/templates/example.jsonl', 'r', encoding='utf-8') as f:
arg_process = subprocess.run(
["archivebox", "add", "--index-only", "--parser=jsonl"],
stdin=f,
capture_output=True,
env=disable_extractors_dict,
)
conn = sqlite3.connect("index.sqlite3")
c = conn.cursor()
urls = c.execute("SELECT url from core_snapshot").fetchall()
tags = c.execute("SELECT name from core_tag").fetchall()
conn.commit()
conn.close()
urls = list(map(lambda x: x[0], urls))
assert "http://127.0.0.1:8080/static/example.com.html" in urls
assert "http://127.0.0.1:8080/static/iana.org.html" in urls
assert "http://127.0.0.1:8080/static/shift_jis.html" in urls
assert "http://127.0.0.1:8080/static/title_og_with_html" in urls
# if the following URL appears, we must have fallen back to another parser
assert not "http://www.example.com/should-not-exist" in urls
tags = list(map(lambda x: x[0], tags))
assert "Tag1" in tags
assert "Tag2" in tags
assert "Tag3" in tags
assert "Tag4 with Space" in tags
assert "Tag5" in tags
assert "Tag6 with Space" in tags
def test_jsonl_single(tmp_path, process, disable_extractors_dict):
with open('../../mock_server/templates/example-single.jsonl', 'r', encoding='utf-8') as f:
arg_process = subprocess.run(
["archivebox", "add", "--index-only", "--parser=jsonl"],
stdin=f,
capture_output=True,
env=disable_extractors_dict,
)
conn = sqlite3.connect("index.sqlite3")
c = conn.cursor()
urls = c.execute("SELECT url from core_snapshot").fetchall()
tags = c.execute("SELECT name from core_tag").fetchall()
conn.commit()
conn.close()
urls = list(map(lambda x: x[0], urls))
assert "http://127.0.0.1:8080/static/example.com.html" in urls
# if the following URL appears, we must have fallen back to another parser
assert not "http://www.example.com/should-not-exist" in urls
tags = list(map(lambda x: x[0], tags))
assert "Tag1" in tags
assert "Tag2" in tags
# make sure that JSON parser rejects a single line of JSONL which is valid
# JSON but not our expected format
def test_json_single(tmp_path, process, disable_extractors_dict):
with open('../../mock_server/templates/example-single.jsonl', 'r', encoding='utf-8') as f:
arg_process = subprocess.run(
["archivebox", "add", "--index-only", "--parser=json"],
stdin=f,
capture_output=True,
env=disable_extractors_dict,
)
assert 'expects list of objects' in arg_process.stderr.decode("utf-8")
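
All of the new tests repeat the same open-fixture / archivebox add --index-only / query-index.sqlite3 pattern. If that repetition ever becomes a maintenance burden, the SQLite lookups could be pulled into a small helper along these lines (hypothetical, not part of this PR):

# hypothetical helper, not part of this PR: fetch snapshot URLs and tag names from the index
import sqlite3

def snapshot_urls_and_tags(db_path="index.sqlite3"):
    conn = sqlite3.connect(db_path)
    try:
        c = conn.cursor()
        urls = [row[0] for row in c.execute("SELECT url FROM core_snapshot").fetchall()]
        tags = [row[0] for row in c.execute("SELECT name FROM core_tag").fetchall()]
    finally:
        conn.close()
    return urls, tags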