diff --git a/Dockerfile b/Dockerfile
index 28019ad5..65003800 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -24,117 +24,192 @@ LABEL name="archivebox" \
     homepage="https://github.com/ArchiveBox/ArchiveBox" \
     documentation="https://github.com/ArchiveBox/ArchiveBox/wiki/Docker#docker"
 
-######### Base System Setup ####################################
+ARG TARGETPLATFORM
+ARG TARGETARCH
+ARG TARGETVARIANT
+
+######### Environment Variables #################################
 
 # Global system-level config
 ENV TZ=UTC \
     LANGUAGE=en_US:en \
     LC_ALL=C.UTF-8 \
     LANG=C.UTF-8 \
-    PYTHONIOENCODING=UTF-8 \
-    PYTHONUNBUFFERED=1 \
     DEBIAN_FRONTEND=noninteractive \
     APT_KEY_DONT_WARN_ON_DANGEROUS_USAGE=1 \
+    PYTHONIOENCODING=UTF-8 \
+    PYTHONUNBUFFERED=1 \
     npm_config_loglevel=error
 
-# Application-level config
+# Version config
+ENV PYTHON_VERSION=3.11 \
+    NODE_VERSION=21
+
+# User config
+ENV ARCHIVEBOX_USER="archivebox" \
+    DEFAULT_PUID=911 \
+    DEFAULT_PGID=911
+
+# Global paths
 ENV CODE_DIR=/app \
     DATA_DIR=/data \
     GLOBAL_VENV=/venv \
-    APP_VENV=/app/.venv \
-    NODE_MODULES=/app/node_modules \
-    ARCHIVEBOX_USER="archivebox"
+    PLAYWRIGHT_BROWSERS_PATH=/browsers
 
+# Application-level paths
+ENV APP_VENV=/app/.venv \
+    NODE_MODULES=/app/node_modules
+
+# Build shell config
 ENV PATH="$PATH:$GLOBAL_VENV/bin:$APP_VENV/bin:$NODE_MODULES/.bin"
-SHELL ["/bin/bash", "-c"]
-ARG TARGETPLATFORM
-ARG TARGETARCH
-ARG TARGETVARIANT
-RUN printf "[i] Building for TARGETPLATFORM=${TARGETPLATFORM}" \
-    && printf ", TARGETARCH=${TARGETARCH}" \
-    && printf ", TARGETVARIANT=${TARGETVARIANT} \n" \
-    && printf "uname -a : " && uname -a
+SHELL ["/bin/bash", "-o", "pipefail", "-c"]
 
+######### System Environment ####################################
+
+# Detect ArchiveBox version number by reading package.json
+COPY --chown=root:root --chmod=755 package.json "$CODE_DIR/"
+RUN grep '"version": ' "${CODE_DIR}/package.json" | awk -F'"' '{print $4}' > /VERSION.txt
+
+# Print debug info about build and save it to disk
+RUN (echo "[i] Docker build for ArchiveBox $(cat /VERSION.txt) starting..." \
+    && echo "PLATFORM=${TARGETPLATFORM} ARCH=$(uname -m) ($(uname -s) ${TARGETARCH} ${TARGETVARIANT})" \
+    && echo "BUILD_START_TIME=$(date +"%Y-%m-%d %H:%M:%S %s") TZ=${TZ} LANG=${LANG}" \
+    && echo \
+    && echo "GLOBAL_VENV=${GLOBAL_VENV} APP_VENV=${APP_VENV} NODE_MODULES=${NODE_MODULES}" \
+    && echo "PYTHON=${PYTHON_VERSION} NODE=${NODE_VERSION} PATH=${PATH}" \
+    && echo "CODE_DIR=${CODE_DIR} DATA_DIR=${DATA_DIR}" \
+    && echo \
+    && uname -a \
+    && cat /etc/os-release | head -n7 \
+    && which bash && bash --version | head -n1 \
+    && which dpkg && dpkg --version | head -n1 \
+    && echo -e '\n\n' && env && echo -e '\n\n' \
+    ) | tee -a /VERSION.txt
 
 # Create non-privileged user for archivebox and chrome
-RUN echo "[*] Setting up system environment..." \
+RUN echo "[*] Setting up $ARCHIVEBOX_USER user ${DEFAULT_PUID}..." \
     && groupadd --system $ARCHIVEBOX_USER \
     && useradd --system --create-home --gid $ARCHIVEBOX_USER --groups audio,video $ARCHIVEBOX_USER \
-    && mkdir -p /etc/apt/keyrings
+    && usermod -u "$DEFAULT_PUID" "$ARCHIVEBOX_USER" \
+    && groupmod -g "$DEFAULT_PGID" "$ARCHIVEBOX_USER" \
+    && echo -e "\nARCHIVEBOX_USER=$ARCHIVEBOX_USER PUID=$(id -u $ARCHIVEBOX_USER) PGID=$(id -g $ARCHIVEBOX_USER)\n\n" \
+    | tee -a /VERSION.txt
+    # DEFAULT_PUID and DEFAULT_PGID are overridden by PUID and PGID in /bin/docker_entrypoint.sh at runtime
+    # https://docs.linuxserver.io/general/understanding-puid-and-pgid
 
 # Install system apt dependencies (adding backports to access more recent apt updates)
-RUN echo "[+] Installing system dependencies..." \
-    && echo 'deb https://deb.debian.org/debian bookworm-backports main contrib non-free' >> /etc/apt/sources.list.d/backports.list \
-    && apt-get update -qq \
-    && apt-get install -qq -y \
-        apt-transport-https ca-certificates gnupg2 curl wget \
-        zlib1g-dev dumb-init gosu cron unzip \
-        # nano iputils-ping dnsutils htop procps \
-        # 1. packaging dependencies
-        # 2. docker and init system dependencies
-        # 3. frivolous CLI helpers to make debugging failed archiving easier
+RUN echo "[+] Installing system dependencies for $TARGETPLATFORM..." \
+    && echo 'deb https://deb.debian.org/debian bookworm-backports main contrib non-free' >> /etc/apt/sources.list.d/backports.list \
     && mkdir -p /etc/apt/keyrings \
+    && apt-get update -qq \
+    && apt-get install -qq -y --no-install-recommends \
+        # 1. packaging dependencies
+        apt-transport-https ca-certificates gnupg2 curl wget \
+        # 2. docker and init system dependencies
+        zlib1g-dev dumb-init gosu cron unzip grep \
+        # 3. frivolous CLI helpers to make debugging failed archiving easier
+        # nano iputils-ping dnsutils htop procps jq yq
     && rm -rf /var/lib/apt/lists/*
 
-
 ######### Language Environments ####################################
 
 # Install Node environment
-RUN echo "[+] Installing Node environment..." \
-    && echo 'deb [signed-by=/etc/apt/keyrings/nodesource.gpg] https://deb.nodesource.com/node_21.x nodistro main' >> /etc/apt/sources.list.d/nodejs.list \
+RUN echo "[+] Installing Node $NODE_VERSION environment..." \
+    && echo "deb [signed-by=/etc/apt/keyrings/nodesource.gpg] https://deb.nodesource.com/node_${NODE_VERSION}.x nodistro main" >> /etc/apt/sources.list.d/nodejs.list \
     && curl -fsSL https://deb.nodesource.com/gpgkey/nodesource-repo.gpg.key | gpg --dearmor | gpg --dearmor -o /etc/apt/keyrings/nodesource.gpg \
     && apt-get update -qq \
-    && apt-get install -qq -y nodejs libatomic1 \
+    && apt-get install -qq -y -t bookworm-backports --no-install-recommends \
+        nodejs libatomic1 \
+    && rm -rf /var/lib/apt/lists/* \
+    # Update NPM to latest version
     && npm i -g npm \
-    && node --version \
-    && npm --version
+    # Save version info
+    && ( \
+        which node && node --version \
+        && which npm && npm --version \
+        && echo -e '\n\n' \
+    ) | tee -a /VERSION.txt
 
 # Install Python environment
-RUN echo "[+] Installing Python environment..." \
+RUN echo "[+] Installing Python $PYTHON_VERSION environment..." \
     && apt-get update -qq \
     && apt-get install -qq -y -t bookworm-backports --no-install-recommends \
         python3 python3-pip python3-venv python3-setuptools python3-wheel python-dev-is-python3 \
         python3-ldap libldap2-dev libsasl2-dev libssl-dev python3-msgpack \
+    && rm -rf /var/lib/apt/lists/* \
+    # tell PDM to allow using global system python site packages
     && rm /usr/lib/python3*/EXTERNALLY-MANAGED \
+    # create global virtual environment GLOBAL_VENV to use (better than using pip install --global)
     && python3 -m venv --system-site-packages --symlinks $GLOBAL_VENV \
+    # install global dependencies / python build dependencies in GLOBAL_VENV
     && $GLOBAL_VENV/bin/pip install --upgrade pip pdm setuptools wheel python-ldap \
-    && rm -rf /var/lib/apt/lists/*
+    # Save version info
+    && ( \
+        which python3 && python3 --version | grep " $PYTHON_VERSION" \
+        && which pip3 && pip3 --version \
+        && which pdm && pdm --version \
+        && echo -e '\n\n' \
+    ) | tee -a /VERSION.txt
 
 ######### Extractor Dependencies ##################################
 
 # Install apt dependencies
-RUN echo "[+] Installing extractor APT dependencies..." \
+RUN echo "[+] Installing APT extractor dependencies..." \
     && apt-get update -qq \
     && apt-get install -qq -y -t bookworm-backports --no-install-recommends \
         curl wget git yt-dlp ffmpeg ripgrep \
         # Packages we have also needed in the past:
         # youtube-dl wget2 aria2 python3-pyxattr rtmpdump libfribidi-bin mpv \
         # fontconfig fonts-ipafont-gothic fonts-wqy-zenhei fonts-thai-tlwg fonts-kacst fonts-symbola fonts-noto fonts-freefont-ttf \
-    && rm -rf /var/lib/apt/lists/*
+    && rm -rf /var/lib/apt/lists/* \
+    # Save version info
+    && ( \
+        which curl && curl --version | head -n1 \
+        && which wget && wget --version | head -n1 \
+        && which yt-dlp && yt-dlp --version | head -n1 \
+        && which git && git --version | head -n1 \
+        && which rg && rg --version | head -n1 \
+        && echo -e '\n\n' \
+    ) | tee -a /VERSION.txt
 
 # Install chromium browser using playwright
-ENV PLAYWRIGHT_BROWSERS_PATH="/browsers"
-RUN echo "[+] Installing extractor Chromium dependency..." \
+RUN echo "[+] Installing Browser binary dependencies for $TARGETPLATFORM..." \
     && apt-get update -qq \
-    && $GLOBAL_VENV/bin/pip install playwright \
-    && $GLOBAL_VENV/bin/playwright install --with-deps chromium \
-    && CHROME_BINARY="$($GLOBAL_VENV/bin/python -c 'from playwright.sync_api import sync_playwright; print(sync_playwright().start().chromium.executable_path)')" \
+    && if [[ "$TARGETPLATFORM" == "linux/amd64" || "$TARGETPLATFORM" == "linux/arm64" ]]; then \
+        # install Chromium using playwright
+        $GLOBAL_VENV/bin/pip install playwright \
+        && $GLOBAL_VENV/bin/playwright install --with-deps chromium \
+        && export CHROME_BINARY="$($GLOBAL_VENV/bin/python -c 'from playwright.sync_api import sync_playwright; print(sync_playwright().start().chromium.executable_path)')"; \
+    else \
+        # install Chromium on platforms not supported by playwright (e.g. risc, ARMv7, etc.)
+        apt-get install -qq -y -t bookworm-backports --no-install-recommends \
+            chromium fontconfig fonts-ipafont-gothic fonts-wqy-zenhei fonts-thai-tlwg fonts-kacst fonts-symbola fonts-noto fonts-freefont-ttf \
+        && export CHROME_BINARY="$(which chromium)"; \
+    fi \
+    && rm -rf /var/lib/apt/lists/* \
     && ln -s "$CHROME_BINARY" /usr/bin/chromium-browser \
     && mkdir -p "/home/${ARCHIVEBOX_USER}/.config/chromium/Crash Reports/pending/" \
     && chown -R $ARCHIVEBOX_USER "/home/${ARCHIVEBOX_USER}/.config" \
-    || if [[ "$TARGETPLATFORM" == "linux/arm/v7" ]]; then exit 0; else exit 1; fi
-    # ignore failure for architectures where no playwright release is available yet
+    # Save version info
+    && ( \
+        which chromium-browser && /usr/bin/chromium-browser --version \
+        && echo -e '\n\n' \
+    ) | tee -a /VERSION.txt
 
 # Install Node dependencies
 WORKDIR "$CODE_DIR"
 COPY --chown=root:root --chmod=755 "package.json" "package-lock.json" "$CODE_DIR/"
-RUN echo "[+] Installing extractor Node dependencies..." \
+RUN echo "[+] Installing NPM extractor dependencies..." \
     && npm ci --prefer-offline --no-audit \
-    && npm version
+    && ( \
+        which node && node --version \
+        && which npm && npm version \
+        && echo -e '\n\n' \
+    ) | tee -a /VERSION.txt
 
 ######### Build Dependencies ####################################
 
-# # Building ArchiveBox from source with all pdm dev dependencies
+# # Install ArchiveBox development dependencies
 # WORKDIR "$CODE_DIR"
 # COPY --chown=root:root --chmod=755 "./pyproject.toml" "./pdm.lock" "$CODE_DIR/"
 # RUN echo "[+] Installing project Python dependencies..." \
@@ -152,38 +227,48 @@ RUN echo "[+] Installing extractor Node dependencies..." \
 
 # Install ArchiveBox Python package from source
 COPY --chown=root:root --chmod=755 "." "$CODE_DIR/"
-RUN echo "[*] Installing ArchiveBox package from /app..." \
+RUN echo "[*] Installing PIP ArchiveBox package from $CODE_DIR..." \
     && apt-get update -qq \
-    && $GLOBAL_VENV/bin/pip install -e "$CODE_DIR"[sonic,ldap]
+    # install a C compiler only on linux/arm/v7, where pypi doesn't have 32-bit wheels (if/fi so a false test on other platforms cannot abort the && chain)
+    && if [[ "$TARGETPLATFORM" == "linux/arm/v7" ]]; then \
+        apt-get install -qq -y --no-install-recommends build-essential python3-regex; fi \
+    # INSTALL ARCHIVEBOX python package globally from CODE_DIR, with all optional dependencies
+    && $GLOBAL_VENV/bin/pip3 install -e "$CODE_DIR"[sonic,ldap] \
+    # save docker image size and always remove compilers / build tools after building is complete
+    && apt-get purge -y build-essential \
+    && apt-get autoremove -y \
+    && rm -rf /var/lib/apt/lists/*
 
 ####################################################
 
 # Setup ArchiveBox runtime config
 WORKDIR "$DATA_DIR"
-ENV IN_DOCKER=True \
-    WGET_BINARY="wget" \
-    YOUTUBEDL_BINARY="yt-dlp" \
-    CHROME_SANDBOX=False \
-    CHROME_BINARY="/usr/bin/chromium-browser" \
-    USE_SINGLEFILE=True \
-    SINGLEFILE_BINARY="$NODE_MODULES/.bin/single-file" \
-    USE_READABILITY=True \
-    READABILITY_BINARY="$NODE_MODULES/.bin/readability-extractor" \
-    USE_MERCURY=True \
-    MERCURY_BINARY="$NODE_MODULES/.bin/postlight-parser"
+ENV IN_DOCKER=True
+    ## No need to set explicitly, these values will be autodetected by archivebox in docker:
+    # CHROME_SANDBOX=False \
+    # WGET_BINARY="wget" \
+    # YOUTUBEDL_BINARY="yt-dlp" \
+    # CHROME_BINARY="/usr/bin/chromium-browser" \
+    # USE_SINGLEFILE=True \
+    # SINGLEFILE_BINARY="$NODE_MODULES/.bin/single-file" \
+    # USE_READABILITY=True \
+    # READABILITY_BINARY="$NODE_MODULES/.bin/readability-extractor" \
+    # USE_MERCURY=True \
+    # MERCURY_BINARY="$NODE_MODULES/.bin/postlight-parser"
 
 # Print version for nice docker finish summary
-# RUN archivebox version
-RUN echo "[√] Finished Docker build succesfully. Saving build summary in: /version_info.txt" \
-    && uname -a | tee -a /version_info.txt \
-    && env --chdir="$NODE_DIR" npm version | tee -a /version_info.txt \
-    && env --chdir="$CODE_DIR" pdm info | tee -a /version_info.txt \
-    && "$CODE_DIR/bin/docker_entrypoint.sh" archivebox version 2>&1 | tee -a /version_info.txt
+RUN (echo -e "\n\n[√] Finished Docker build succesfully. Saving build summary in: /VERSION.txt" \
+    && echo -e "PLATFORM=${TARGETPLATFORM} ARCH=$(uname -m) ($(uname -s) ${TARGETARCH} ${TARGETVARIANT})" \
+    && echo -e "BUILD_END_TIME=$(date +"%Y-%m-%d %H:%M:%S %s") TZ=${TZ}\n\n" \
+    && "$CODE_DIR/bin/docker_entrypoint.sh" \
+        archivebox version 2>&1 \
+    ) | tee -a /VERSION.txt
 
 ####################################################
 
 # Open up the interfaces to the outside world
-VOLUME "/data"
+WORKDIR "$DATA_DIR"
+VOLUME "$DATA_DIR"
 EXPOSE 8000
 
 # Optional:
diff --git a/bin/docker_entrypoint.sh b/bin/docker_entrypoint.sh
index 29153908..4b31fb08 100755
--- a/bin/docker_entrypoint.sh
+++ b/bin/docker_entrypoint.sh
@@ -3,18 +3,18 @@
 DATA_DIR="${DATA_DIR:-/data}"
 ARCHIVEBOX_USER="${ARCHIVEBOX_USER:-archivebox}"
 
+export PUID=${PUID:-911}
+export PGID=${PGID:-911}
 
 # Set the archivebox user UID & GID
-if [[ -n "$PUID" && "$PUID" != 0 ]]; then
-    usermod -u "$PUID" "$ARCHIVEBOX_USER" > /dev/null 2>&1
-fi
-if [[ -n "$PGID" && "$PGID" != 0 ]]; then
-    groupmod -g "$PGID" "$ARCHIVEBOX_USER" > /dev/null 2>&1
-fi
+if [[ "$PUID" != 0 ]]; then usermod -o -u "$PUID" "$ARCHIVEBOX_USER" > /dev/null 2>&1; fi    # never remap the unprivileged user onto root's uid
+if [[ "$PGID" != 0 ]]; then groupmod -o -g "$PGID" "$ARCHIVEBOX_USER" > /dev/null 2>&1; fi  # never remap the unprivileged group onto root's gid
 
 export PUID="$(id -u archivebox)"
 export PGID="$(id -g archivebox)"
 
+chown "$ARCHIVEBOX_USER:$ARCHIVEBOX_USER" "$DATA_DIR"
+
 # Check the permissions of the data dir (or create if it doesn't exist)
 if [[ -d "$DATA_DIR/archive" ]]; then
     if touch "$DATA_DIR/archive/.permissions_test_safe_to_delete" 2>/dev/null; then
@@ -22,9 +22,11 @@ if [[ -d "$DATA_DIR/archive" ]]; then
         rm "$DATA_DIR/archive/.permissions_test_safe_to_delete"
         # echo "[√] Permissions are correct"
     else
-        echo "[X] Error: ArchiveBox (uid=$PUID) is not able to write to your ./data dir. Fix the permissions and retry:" >&2
-        echo " \$ chown -R $PUID:$PGID data" >&2
-        echo " You may need to pass PUID & PGID to the Docker container: https://docs.linuxserver.io/general/understanding-puid-and-pgid" >&2
+        echo -e "\n[X] Error: archivebox user (PUID=$PUID) is not able to write to your ./data dir." >&2
+        echo -e " Change ./data to be owned by PUID=$PUID PGID=$PGID on the host and retry:" >&2
+        echo -e " \$ chown -R $PUID:$PGID ./data\n" >&2
+        echo -e " Configure the PUID & PGID environment variables to change the desired owner:" >&2
+        echo -e " https://docs.linuxserver.io/general/understanding-puid-and-pgid\n" >&2
         exit 1
     fi
 else
@@ -34,19 +36,19 @@ fi
 
 # force set the ownership of the data dir contents to the archivebox user and group
 # this is needed because Docker Desktop often does not map user permissions from the host properly
-chown $ARCHIVEBOX_USER:$ARCHIVEBOX_USER "$DATA_DIR" "$DATA_DIR"/*
+chown "$ARCHIVEBOX_USER:$ARCHIVEBOX_USER" "$DATA_DIR"/*
 
 # Drop permissions to run commands as the archivebox user
-if [[ "$1" == /* || "$1" == "bash" || "$1" == "sh" || "$1" == "echo" || "$1" == "archivebox" ]]; then
-    # arg 1 is a binary, execute it verbatim
-    # e.g. "archivebox init"
-    #      "/bin/bash"
-    #      "echo"
+if [[ "$1" == /* || "$1" == "bash" || "$1" == "sh" || "$1" == "echo" || "$1" == "cat" || "$1" == "archivebox" ]]; then
+    # handle "docker run archivebox /some/non-archivebox/command" by executing args as direct bash command
+    # e.g. "docker run archivebox /venv/bin/archivebox-alt init"
+    #      "docker run archivebox /bin/bash -c '...'"
+    #      "docker run archivebox echo test"
     exec gosu "$ARCHIVEBOX_USER" bash -c "$*"
 else
-    # no command given, assume args were meant to be passed to archivebox cmd
-    # e.g. "add https://example.com"
-    #      "manage createsupseruser"
-    #      "server 0.0.0.0:8000"
+    # handle "docker run archivebox add ..." by running args as archivebox $subcommand
+    # e.g. "docker run archivebox add https://example.com"
+    #      "docker run archivebox manage createsuperuser"
+    #      "docker run archivebox server 0.0.0.0:8000"
     exec gosu "$ARCHIVEBOX_USER" bash -c "archivebox $*"
 fi
diff --git a/pyproject.toml b/pyproject.toml
index acd8571a..b696600c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -63,6 +63,7 @@ build = [
     "stdeb",
     "twine",
     "wheel",
+    "regex==2021.9.30; platform_machine == 'armv7l'",
 ]
 lint = [
     "flake8",