diff --git a/Dockerfile b/Dockerfile index 82647329..fbb56a78 100644 --- a/Dockerfile +++ b/Dockerfile @@ -10,7 +10,7 @@ # docker run -v "$PWD/data":/data -p 8000:8000 archivebox server # Multi-arch build: # docker buildx create --use -# docker buildx build . --platform=linux/amd64,linux/arm64,linux/arm/v7 --push -t archivebox/archivebox:latest -t archivebox/archivebox:dev +# docker buildx build . --platform=linux/amd64,linux/arm64 --push -t archivebox/archivebox:latest -t archivebox/archivebox:dev # # Read more about [developing Archivebox](https://github.com/ArchiveBox/ArchiveBox#archivebox-development). @@ -194,10 +194,12 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$T && playwright install --with-deps chromium \ && export CHROME_BINARY="$(python -c 'from playwright.sync_api import sync_playwright; print(sync_playwright().start().chromium.executable_path)')"; \ else \ - # fall back to installing Chromium via apt-get on platforms not supported by playwright (e.g. risc, ARMv7, etc.) - apt-get install -qq -y -t bookworm-backports --no-install-recommends \ - chromium \ - && export CHROME_BINARY="$(which chromium)"; \ + # fall back to installing Chromium via apt-get on platforms not supported by playwright (e.g. risc, ARMv7, etc.)
+ # apt-get install -qq -y -t bookworm-backports --no-install-recommends \ + # chromium \ + # && export CHROME_BINARY="$(which chromium)"; \ + echo 'armv7 no longer supported in versions after v0.7.3' && \ + exit 1; \ fi \ && rm -rf /var/lib/apt/lists/* \ && ln -s "$CHROME_BINARY" /usr/bin/chromium-browser \ @@ -275,7 +277,6 @@ ENV IN_DOCKER=True \ GOOGLE_DEFAULT_CLIENT_SECRET=no \ ALLOWED_HOSTS=* ## No need to set explicitly, these values will be autodetected by archivebox in docker: - # CHROME_SANDBOX=False \ # WGET_BINARY="wget" \ # YOUTUBEDL_BINARY="yt-dlp" \ # CHROME_BINARY="/usr/bin/chromium-browser" \ diff --git a/README.md b/README.md index 27a84956..4d1bcf0d 100644 --- a/README.md +++ b/README.md @@ -1076,7 +1076,7 @@ Because ArchiveBox is designed to ingest a large volume of URLs with multiple co
  • Don't store large collections on older filesystems like EXT3/FAT as they may not be able to handle more than 50k directory entries in the data/archive/ folder.
  • Try to keep the data/index.sqlite3 file on local drive (not a network mount) or SSD for maximum performance, however the data/archive/ folder can be on a network mount or slower HDD.
  • -
  • If using Docker or NFS/SMB/FUSE for the `data/archive/` folder, you may need to set PUID & PGID and disable root_squash on your fileshare server. +
  • If using Docker or NFS/SMB/FUSE for the data/archive/ folder, you may need to set PUID & PGID and disable root_squash on your fileshare server.
  • diff --git a/archivebox/index/__init__.py b/archivebox/index/__init__.py index 9912b4c7..fb3688f3 100644 --- a/archivebox/index/__init__.py +++ b/archivebox/index/__init__.py @@ -250,7 +250,7 @@ def load_main_index(out_dir: Path=OUTPUT_DIR, warn: bool=True) -> List[Link]: """parse and load existing index with any new links from import_path merged in""" from core.models import Snapshot try: - return Snapshot.objects.all() + return Snapshot.objects.all().only('id') except (KeyboardInterrupt, SystemExit): raise SystemExit(0) diff --git a/docker-compose.yml b/docker-compose.yml index ea3d3ab7..a8293705 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -8,32 +8,26 @@ # Documentation: # https://github.com/ArchiveBox/ArchiveBox/wiki/Docker#docker-compose -version: '3.9' services: archivebox: - #image: ${DOCKER_IMAGE:-archivebox/archivebox:dev} - image: archivebox/archivebox:dev - command: server --quick-init 0.0.0.0:8000 + image: archivebox/archivebox ports: - 8000:8000 volumes: - ./data:/data - # - ./etc/crontabs:/var/spool/cron/crontabs # uncomment this and archivebox_scheduler below to set up automatic recurring archive jobs - # - ./archivebox:/app/archivebox # uncomment this to mount the ArchiveBox source code at runtime (for developers working on archivebox) - # build: . 
# uncomment this to build the image from source code at buildtime (for developers working on archivebox) environment: - ALLOWED_HOSTS=* # restrict this to only accept incoming traffic via specific domain name - # - PUBLIC_INDEX=True # set to False to prevent anonymous users from viewing snapshot list - # - PUBLIC_SNAPSHOTS=True # set to False to prevent anonymous users from viewing snapshot content - # - PUBLIC_ADD_VIEW=False # set to True to allow anonymous users to submit new URLs to archive # - ADMIN_USERNAME=admin # create an admin user on first run with the given user/pass combo # - ADMIN_PASSWORD=SomeSecretPassword # - PUID=911 # set to your host user's UID & GID if you encounter permissions issues # - PGID=911 - # - SEARCH_BACKEND_ENGINE=sonic # uncomment these and sonic container below for better full-text search - # - SEARCH_BACKEND_HOST_NAME=sonic - # - SEARCH_BACKEND_PASSWORD=SomeSecretPassword + # - PUBLIC_INDEX=True # set to False to prevent anonymous users from viewing snapshot list + # - PUBLIC_SNAPSHOTS=True # set to False to prevent anonymous users from viewing snapshot content + # - PUBLIC_ADD_VIEW=False # set to True to allow anonymous users to submit new URLs to archive + - SEARCH_BACKEND_ENGINE=sonic # sonic full-text search is enabled by default (see sonic container below) + - SEARCH_BACKEND_HOST_NAME=sonic + - SEARCH_BACKEND_PASSWORD=SomeSecretPassword # - MEDIA_MAX_SIZE=750m # increase this filesize limit to allow archiving larger audio/video files # - TIMEOUT=60 # increase this number to 120+ seconds if you see many slow downloads timing out # - CHECK_SSL_VALIDITY=True # set to False to disable strict SSL checking (allows saving URLs w/ broken certs) @@ -42,7 +36,7 @@ services: # add further configuration options from archivebox/config.py as needed (to apply them only to this container) # or set using `docker compose run archivebox config --set SOME_KEY=someval` (to persist config across all containers) - # For ad-blocking during archiving,
uncomment this section and pihole service section below + # For ad-blocking during archiving, uncomment this section and pihole service section below # networks: # - dns # dns: @@ -51,22 +45,26 @@ services: ######## Optional Addons: tweak examples below as needed for your specific use case ######## - ### Example: To run the Sonic full-text search backend, first download the config file to sonic.cfg - # $ curl -O https://raw.githubusercontent.com/ArchiveBox/ArchiveBox/master/etc/sonic.cfg - # After starting, backfill any existing Snapshots into the full-text index: + ### Runs the Sonic full-text search backend, config file is auto-downloaded into sonic.cfg: + # After starting, backfill any existing Snapshots into the full-text index: # $ docker-compose run archivebox update --index-only - # sonic: - # image: valeriansaliou/sonic:latest - # expose: - # - 1491 - # environment: - # - SEARCH_BACKEND_PASSWORD=SomeSecretPassword - # volumes: - # - ./sonic.cfg:/etc/sonic.cfg:ro - # - ./data/sonic:/var/lib/sonic/store - - + sonic: + image: valeriansaliou/sonic + build: + dockerfile_inline: | + FROM quay.io/curl/curl:latest AS setup + RUN curl -fsSL 'https://raw.githubusercontent.com/ArchiveBox/ArchiveBox/main/etc/sonic.cfg' > /tmp/sonic.cfg + FROM valeriansaliou/sonic:latest + COPY --from=setup /tmp/sonic.cfg /etc/sonic.cfg + expose: + - 1491 + environment: + - SEARCH_BACKEND_PASSWORD=SomeSecretPassword + volumes: + # - ./etc/sonic.cfg:/etc/sonic.cfg # config is baked into the image by the build step above; a bind mount of a missing host file would shadow it with an empty directory + - ./data/sonic:/var/lib/sonic/store + ### Example: To run pihole in order to block ad/tracker requests during archiving, # uncomment this block and set up pihole using its admin interface
b/pyproject.toml index 969b6318..98a1a055 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,15 +15,16 @@ dependencies = [ "dateparser>=1.0.0", "django-extensions>=3.2.3", "django>=4.2.0,<5.0", + "setuptools>=69.0.3", "feedparser>=6.0.11", "ipython>5.0.0", "mypy-extensions>=0.4.3", "python-crontab>=2.5.1", "requests>=2.24.0", "w3lib>=1.22.0", - "yt-dlp>=2023.10.13", + "yt-dlp>=2024.3.10", - # dont add playwright becuase packages without sdists cause trouble on many build systems that refuse to install wheel-only packages - # "playwright>=1.39.0; platform_machine != 'armv7l'", + # playwright is now a direct dependency (skipped on armv7l, which has no prebuilt wheels) + "playwright>=1.39.0; platform_machine != 'armv7l'", ] classifiers = [ @@ -64,11 +65,11 @@ classifiers = [ sonic = [ # echo "deb [signed-by=/usr/share/keyrings/valeriansaliou_sonic.gpg] https://packagecloud.io/valeriansaliou/sonic/debian/ bookworm main" > /etc/apt/sources.list.d/valeriansaliou_sonic.list # curl -fsSL https://packagecloud.io/valeriansaliou/sonic/gpgkey | gpg --dearmor -o /usr/share/keyrings/valeriansaliou_sonic.gpg + # apt install sonic "sonic-client>=0.0.5", ] ldap = [ # apt install libldap2-dev libsasl2-dev python3-ldap - "setuptools>=69.0.3", "python-ldap>=3.4.3", "django-auth-ldap>=4.1.0", ] @@ -83,7 +84,6 @@ ldap = [ [tool.pdm.dev-dependencies] dev = [ # building - "setuptools>=69.0.3", "wheel", "pdm", "homebrew-pypi-poet>=0.10.0",