diff --git a/.dockerignore b/.dockerignore index 0a7034e1..8cebf35e 100644 --- a/.dockerignore +++ b/.dockerignore @@ -3,6 +3,8 @@ *.pyc __pycache__/ .mypy_cache/ +.pytest_cache/ +.github/ venv/ .venv/ @@ -10,6 +12,10 @@ venv/ build/ dist/ +pip_dist/ +!pip_dist/archivebox.egg-info/requires.txt +brew_dist/ +assets/ data/ output/ diff --git a/.github/workflows/debian.yml b/.github/workflows/debian.yml new file mode 100644 index 00000000..82a635d0 --- /dev/null +++ b/.github/workflows/debian.yml @@ -0,0 +1,76 @@ +name: Build Debian package + +on: + workflow_dispatch: + push: + +env: + DEB_BUILD_OPTIONS: nocheck + +jobs: + build: + runs-on: ubuntu-20.04 + + steps: + - uses: actions/checkout@v2 + with: + submodules: true + fetch-depth: 1 + + - name: Install packaging dependencies + run: | + sudo apt install -y \ + python3 python3-dev python3-pip python3-venv python3-all \ + dh-python debhelper devscripts dput software-properties-common \ + python3-distutils python3-setuptools python3-wheel python3-stdeb + + - name: Build Debian/Apt sdist_dsc + run: | + rm -Rf deb_dist/* + python3 setup.py --command-packages=stdeb.command sdist_dsc + + - name: Build Debian/Apt bdist_deb + run: | + python3 setup.py --command-packages=stdeb.command bdist_deb + + - name: Install archivebox from deb + run: | + cd deb_dist/ + sudo apt install ./archivebox*.deb + + - name: Check ArchiveBox version + run: | + # must create dir needed for snaps to run as non-root on github actions + sudo mkdir -p /run/user/1001 && sudo chmod -R 777 /run/user/1001 + mkdir "${{ github.workspace }}/data" && cd "${{ github.workspace }}/data" + archivebox init + archivebox config --set SAVE_READABILITY=False + archivebox config --set SAVE_MERCURY=False + archivebox config --set SAVE_SINGLEFILE=False + archivebox --version + + - name: Add some links to test + run: | + cd "${{ github.workspace }}/data" + archivebox add 'https://example.com' + archivebox status + + # - name: Commit built package + # run: | + # cd 
deb_dist/ + # git config --local user.email "action@github.com" + # git config --local user.name "GitHub Action" + # git commit -m "Debian package autobuild" -a + + # - name: Push build to Github + # uses: ad-m/github-push-action@master + # with: + # github_token: ${{ secrets.GITHUB_TOKEN }} + # repository: ArchiveBox/debian-archivebox + # branch: ${{ github.ref }} + # directory: deb_dist + + # - name: Push build to Launchpad PPA + # run: | + # debsign -k "$PGP_KEY_ID" "deb_dist/archivebox_${VERSION}-${DEBIAN_VERSION}_source.changes" + # dput archivebox "deb_dist/archivebox_${VERSION}-${DEBIAN_VERSION}_source.changes" diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index e6361fde..1d8c14e7 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -1,4 +1,4 @@ -name: Docker Push +name: Build Docker image on: push: @@ -8,6 +8,10 @@ on: types: - created +env: + DOCKER_IMAGE: archivebox-ci + + jobs: buildx: runs-on: ubuntu-latest @@ -17,20 +21,29 @@ jobs: with: username: ${{ secrets.DOCKER_USERNAME }} password: ${{ secrets.DOCKER_PASSWORD }} + - name: Checkout uses: actions/checkout@v2 + with: + submodules: true + fetch-depth: 1 + - name: Set up QEMU uses: docker/setup-qemu-action@v1 + - name: Set up Docker Buildx id: buildx uses: docker/setup-buildx-action@v1 with: version: latest install: true + - name: Builder instance name run: echo ${{ steps.buildx.outputs.name }} + - name: Available platforms run: echo ${{ steps.buildx.outputs.platforms }} + - name: Cache Docker layers uses: actions/cache@v2 with: @@ -38,6 +51,7 @@ jobs: key: ${{ runner.os }}-buildx-${{ github.sha }} restore-keys: | ${{ runner.os }}-buildx- + - name: Build and push id: docker_build uses: docker/build-push-action@v2 @@ -54,5 +68,6 @@ jobs: cache-from: type=local,src=/tmp/.buildx-cache cache-to: type=local,dest=/tmp/.buildx-cache platforms: linux/amd64,linux/arm64,linux/arm/v7 + - name: Image digest run: echo ${{ steps.docker_build.outputs.digest }} 
diff --git a/.github/workflows/homebrew.yml b/.github/workflows/homebrew.yml new file mode 100644 index 00000000..d9bb05f1 --- /dev/null +++ b/.github/workflows/homebrew.yml @@ -0,0 +1,50 @@ +name: Build Homebrew package + +on: + workflow_dispatch: + push: + + +jobs: + build: + runs-on: macos-latest + + steps: + - uses: actions/checkout@v2 + with: + submodules: true + fetch-depth: 1 + + # TODO: modify archivebox.rb to update src url, hashes, and dependencies + + - name: Build Homebrew Bottle + run: | + pip3 install --upgrade pip setuptools wheel + cd brew_dist/ + brew install --build-bottle ./archivebox.rb + # brew bottle archivebox + + - name: Add some links to test + run: | + mkdir data && cd data + archivebox init + archivebox add 'https://example.com' + archivebox version + archivebox status + + # - name: Commit built package + # run: | + # cd brew_dist/ + # git config --local user.email "action@github.com" + # git config --local user.name "GitHub Action" + # git commit -m "Homebrew package autobuild" -a + + # - name: Push build to Github + # uses: ad-m/github-push-action@master + # with: + # github_token: ${{ secrets.GITHUB_TOKEN }} + # repository: ArchiveBox/homebrew-archivebox + # branch: ${{ github.ref }} + # directory: brew_dist + + # TODO: push bottle homebrew core PR with latest changes diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml new file mode 100644 index 00000000..80f4f19f --- /dev/null +++ b/.github/workflows/lint.yml @@ -0,0 +1,34 @@ +name: Run linters + +on: + workflow_dispatch: + push: + +env: + MAX_LINE_LENGTH: 110 + +jobs: + lint: + runs-on: ubuntu-20.04 + steps: + - uses: actions/checkout@v2 + with: + submodules: true + fetch-depth: 1 + + - name: Set up Python + uses: actions/setup-python@v1 + with: + python-version: 3.9 + architecture: x64 + + - name: Install flake8 + run: | + pip install flake8 + + - name: Lint with flake8 + run: | + # one pass for show-stopper syntax errors or undefined names + flake8 archivebox 
--count --show-source --statistics + # one pass for small stylistic things + flake8 archivebox --count --max-line-length="$MAX_LINE_LENGTH" --statistics diff --git a/.github/workflows/pip.yml b/.github/workflows/pip.yml new file mode 100644 index 00000000..36153189 --- /dev/null +++ b/.github/workflows/pip.yml @@ -0,0 +1,61 @@ +name: Build pip package + +on: + workflow_dispatch: + push: + + +jobs: + build: + runs-on: ubuntu-20.04 + + steps: + - uses: actions/checkout@v2 + with: + submodules: true + fetch-depth: 1 + + - name: Set up Python + uses: actions/setup-python@v1 + with: + python-version: 3.9 + architecture: x64 + + - name: Build Python Package + run: | + pip3 install --upgrade pip setuptools wheel + rm -Rf pip_dist/*.whl + python3 setup.py \ + sdist --dist-dir=./pip_dist \ + bdist_wheel --dist-dir=./pip_dist \ + egg_info --egg-base=./pip_dist + pip install pip_dist/archivebox-*.whl + + - name: Add some links to test + run: | + mkdir data && cd data + archivebox init + archivebox add 'https://example.com' + archivebox version + archivebox status + + # - name: Commit built package + # run: | + # cd pip_dist/ + # git config --local user.email "action@github.com" + # git config --local user.name "GitHub Action" + # git commit -m "Pip package autobuild" -a + + # - name: Push build to Github + # uses: ad-m/github-push-action@master + # with: + # github_token: ${{ secrets.GITHUB_TOKEN }} + # repository: ArchiveBox/pip-archivebox + # branch: ${{ github.ref }} + # directory: pip_dist + + # - name: Push build to PyPI + # run: | + # cd pip_dist/ + # python3 -m twine upload --repository testpypi pip_dist/*.{whl,tar.gz} + # python3 -m twine upload --repository pypi pip_dist/*.{whl,tar.gz} diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 8b26eca6..b5ab84b9 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -1,44 +1,25 @@ -name: 'Lint, Test, and Build' +name: Run tests on: [push] env: - MAX_LINE_LENGTH: 110 DOCKER_IMAGE: 
archivebox-ci + PYTHONIOENCODING: utf-8 + PYTHONLEGACYWINDOWSSTDIO: utf-8 + USE_COLOR: False jobs: - lint: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v2 - - - name: Set up Python - uses: actions/setup-python@v1 - with: - python-version: 3.8 - architecture: x64 - - - name: Install flake8 - run: | - pip install flake8 - - - name: Lint with flake8 - run: | - # one pass for show-stopper syntax errors or undefined names - flake8 archivebox --count --show-source --statistics - # one pass for small stylistic things - flake8 archivebox --count --max-line-length="$MAX_LINE_LENGTH" --statistics - - test: + python_tests: runs-on: ${{ matrix.os }} strategy: matrix: - os: [ubuntu-latest, macos-latest] - python: [3.7, 3.8] + os: [ubuntu-20.04, macos-latest, windows-latest] + python: [3.7] steps: - uses: actions/checkout@v2 with: + submodules: true fetch-depth: 1 ### Setup Python & JS Languages @@ -70,8 +51,9 @@ jobs: - name: Install pip dependencies run: | + python -m pip install --upgrade pip setuptools wheel pytest bottle + ./bin/build_pip.sh python -m pip install . 
- python -m pip install pytest bottle - name: Get npm cache dir id: npm-cache @@ -98,19 +80,25 @@ jobs: - name: Directory listing for debugging run: | pwd - ls -a ./ + ls + + - name: Archivebox version + run: | archivebox version - name: Test built package with pytest + # TODO: remove this exception for windows once we get tests passing on that platform + if: ${{ !contains(matrix.os, 'windows') }} run: | - python -m pytest -s + python -m pytest -s --ignore=archivebox/vendor - docker-test: + docker_tests: runs-on: ubuntu-latest steps: - uses: actions/checkout@v2 with: + submodules: true fetch-depth: 1 # TODO: as of 2020-11 this helper layer broke, upgrade and re-enable this once it's usable again @@ -122,8 +110,8 @@ jobs: - name: Init data dir run: | - mkdir data - docker run -v "$PWD"/data:/data "$DOCKER_IMAGE" init + mkdir "${{ github.workspace }}/data" + docker run -v "${{ github.workspace }}/data":/data "$DOCKER_IMAGE" init - name: Run test server run: | @@ -149,7 +137,7 @@ jobs: docker-compose up -d sleep 5 curl --silent --location 'http://127.0.0.1:8000' | grep 'ArchiveBox' - curl --silent --location 'http://127.0.0.1:8000/static/admin/js/jquery.init.js' | grep 'django.jQuery' + curl --silent --location 'http://127.0.0.1:8000/static/admin/js/jquery.init.js' | grep 'window.django' - name: Check added urls show up in index run: | diff --git a/.gitignore b/.gitignore index 68717afb..e29719e4 100644 --- a/.gitignore +++ b/.gitignore @@ -3,6 +3,7 @@ *.pyc __pycache__/ .mypy_cache/ +tests/out/ # Python and Node dependencies venv/ @@ -11,9 +12,9 @@ venv/ node_modules/ # Packaging artifacts +archivebox.egg-info archivebox-*.tar.gz build/ -deb_dist/ dist/ # Data folders diff --git a/.gitmodules b/.gitmodules index 9bbb6b2c..0993934a 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,25 @@ [submodule "docs"] - path = docs - url = https://github.com/pirate/ArchiveBox.wiki.git + path = docs + url = https://github.com/ArchiveBox/ArchiveBox.wiki.git + +[submodule 
"deb_dist"] + path = deb_dist + url = https://github.com/ArchiveBox/debian-archivebox.git +[submodule "brew_dist"] + path = brew_dist + url = https://github.com/ArchiveBox/homebrew-archivebox.git +[submodule "pip_dist"] + path = pip_dist + url = https://github.com/ArchiveBox/pip-archivebox.git +[submodule "docker"] + path = docker + url = https://github.com/ArchiveBox/docker-archivebox.git +[submodule "archivebox/vendor/base32-crockford"] + path = archivebox/vendor/base32-crockford + url = https://github.com/jbittel/base32-crockford +[submodule "archivebox/vendor/pocket"] + path = archivebox/vendor/pocket + url = https://github.com/tapanpandita/pocket +[submodule "archivebox/vendor/django-taggit"] + path = archivebox/vendor/django-taggit + url = https://github.com/jazzband/django-taggit diff --git a/Dockerfile b/Dockerfile index a9b3c639..507ee6ac 100644 --- a/Dockerfile +++ b/Dockerfile @@ -7,7 +7,7 @@ # docker run -v "$PWD/data":/data -it archivebox manage createsuperuser # docker run -v "$PWD/data":/data -p 8000:8000 archivebox server -FROM python:3.8-slim-buster +FROM python:3.9-slim-buster LABEL name="archivebox" \ maintainer="Nick Sweeting " \ @@ -46,13 +46,20 @@ RUN apt-get update -qq \ # Install apt dependencies RUN apt-get update -qq \ && apt-get install -qq -y --no-install-recommends \ - wget curl chromium git ffmpeg youtube-dl \ + wget curl chromium git ffmpeg youtube-dl ripgrep \ fontconfig fonts-ipafont-gothic fonts-wqy-zenhei fonts-thai-tlwg fonts-kacst fonts-symbola fonts-noto fonts-freefont-ttf \ && rm -rf /var/lib/apt/lists/* +# Install apt development dependencies +# RUN apt-get install -qq \ +# && apt-get install -qq -y --no-install-recommends \ +# python3 python3-dev python3-pip python3-venv python3-all \ +# dh-python debhelper devscripts dput software-properties-common \ +# python3-distutils python3-setuptools python3-wheel python3-stdeb + # Install Node environment RUN curl -s https://deb.nodesource.com/gpgkey/nodesource.gpg.key | apt-key add 
- \ - && echo 'deb https://deb.nodesource.com/node_14.x buster main' >> /etc/apt/sources.list \ + && echo 'deb https://deb.nodesource.com/node_15.x buster main' >> /etc/apt/sources.list \ && apt-get update -qq \ && apt-get install -qq -y --no-install-recommends \ nodejs \ @@ -62,7 +69,6 @@ RUN curl -s https://deb.nodesource.com/gpgkey/nodesource.gpg.key | apt-key add - WORKDIR "$NODE_DIR" ENV PATH="${PATH}:$NODE_DIR/node_modules/.bin" \ npm_config_loglevel=error -RUN npm install -g npm ADD ./package.json ./package.json ADD ./package-lock.json ./package-lock.json RUN npm ci @@ -72,16 +78,17 @@ WORKDIR "$CODE_DIR" ENV PATH="${PATH}:$VENV_PATH/bin" RUN python -m venv --clear --symlinks "$VENV_PATH" \ && pip install --upgrade --quiet pip setuptools -ADD ./archivebox.egg-info/requires.txt "$CODE_DIR/archivebox.egg-info/requires.txt" +ADD ./pip_dist/archivebox.egg-info/requires.txt "$CODE_DIR/pip_dist/archivebox.egg-info/requires.txt" RUN apt-get update -qq \ && apt-get install -qq -y --no-install-recommends \ build-essential python-dev python3-dev \ - && grep -B 1000 -E '^$' "$CODE_DIR/archivebox.egg-info/requires.txt" | pip install --quiet -r /dev/stdin \ + && grep -B 1000 -E '^$' "$CODE_DIR/pip_dist/archivebox.egg-info/requires.txt" | pip install --quiet -r /dev/stdin \ + && pip install --quiet "sonic-client==0.0.5" \ && apt-get purge -y build-essential python-dev python3-dev \ && apt-get autoremove -y \ && rm -rf /var/lib/apt/lists/* -# Install ArchiveBox Python package +# Install ArchiveBox Python package and its dependencies WORKDIR "$CODE_DIR" ADD . "$CODE_DIR" RUN pip install -e . 
@@ -99,7 +106,8 @@ ENV IN_DOCKER=True \ MERCURY_BINARY="$NODE_DIR/node_modules/.bin/mercury-parser" # Print version for nice docker finish summary -RUN archivebox version +# RUN archivebox version +RUN /app/bin/docker_entrypoint.sh archivebox version # Open up the interfaces to the outside world VOLUME "$DATA_DIR" diff --git a/README.md b/README.md index 56afe775..29f23f74 100644 --- a/README.md +++ b/README.md @@ -26,62 +26,175 @@
-ArchiveBox is a powerful self-hosted internet archiving solution written in Python 3. You feed it URLs of pages you want to archive, and it saves them to disk in a varitety of formats depending on the configuration and the content it detects. ArchiveBox can be installed via [Docker](https://docs.docker.com/get-docker/) (recommended), [`apt`](https://launchpad.net/~archivebox/+archive/ubuntu/archivebox/+packages), [`brew`](https://github.com/ArchiveBox/homebrew-archivebox), or [`pip`](https://www.python.org/downloads/). It works on macOS, Windows, and Linux/BSD (both armv7 and amd64). +ArchiveBox is a powerful self-hosted internet archiving solution written in Python 3. You feed it URLs of pages you want to archive, and it saves them to disk in a variety of formats depending on the configuration and the content it detects. -Once installed, URLs can be added via the command line `archivebox add` or the built-in Web UI `archivebox server`. It can ingest bookmarks from a service like Pocket/Pinboard, your entire browsing history, RSS feeds, or URLs one at a time. +Your archive can be managed through the command line with commands like `archivebox add`, through the built-in Web UI `archivebox server`, or via the Python library API (beta). It can ingest bookmarks from a browser or service like Pocket/Pinboard, your entire browsing history, RSS feeds, or URLs one at a time. You can also schedule regular/realtime imports with `archivebox schedule`. -The main index is a self-contained `data/index.sqlite3` file, and each snapshot is stored as a folder `data/archive//`, with an easy-to-read `index.html` and `index.json` within. For each page, ArchiveBox auto-extracts many types of assets/media and saves them in standard formats, with out-of-the-box support for: 3 types of HTML snapshots (wget, Chrome headless, singlefile), a PDF snapshot, a screenshot, a WARC archive, git repositories, images, audio, video, subtitles, article text, and more. 
The snapshots are browseable and managable offline through the filesystem, the built-in webserver, or the Python API. +The main index is a self-contained `index.sqlite3` file, and each snapshot is stored as a folder `data/archive/<timestamp>/`, with an easy-to-read `index.html` and `index.json` within. For each page, ArchiveBox auto-extracts many types of assets/media and saves them in standard formats, with out-of-the-box support for: several types of HTML snapshots (wget, Chrome headless, singlefile), PDF snapshotting, screenshotting, WARC archiving, git repositories, images, audio, video, subtitles, article text, and more. The snapshots are browseable and manageable offline through the filesystem, the built-in webserver, or the Python library API. -#### Quickstart +### Quickstart + +It works on Linux/BSD (Intel and ARM CPUs with `docker`/`apt`/`pip3`), macOS (with `docker`/`brew`/`pip3`), and Windows (beta with `docker`/`pip3`). -**First, get ArchiveBox using your system package manager, Docker, or pip:** ```bash -# You can run it with Docker or Docker Compose (recommended) -docker pull archivebox/archivebox -# https://raw.githubusercontent.com/ArchiveBox/ArchiveBox/master/docker-compose.yml +pip3 install archivebox +archivebox --version +# install extras as-needed, or use one of full setup methods below to get everything out-of-the-box -# or Ubuntu/Debian +mkdir ~/archivebox && cd ~/archivebox # this can be anywhere +archivebox init + +archivebox add 'https://example.com' +archivebox add --depth=1 'https://example.com' +archivebox schedule --every=day https://getpocket.com/users/USERNAME/feed/all +archivebox oneshot --extract=title,favicon,media https://www.youtube.com/watch?v=dQw4w9WgXcQ +archivebox help # to see more options +``` + +*(click to expand the sections below for full setup instructions)* + +
+Get ArchiveBox with docker-compose on any platform (recommended, everything included out-of-the-box) + +First make sure you have Docker installed: https://docs.docker.com/get-docker/ +

+This is the recommended way to run ArchiveBox because it includes *all* the extractors like chrome, wget, youtube-dl, git, etc., as well as full-text search with sonic, and many other great features. + +```bash +# create a new empty directory and initialize your collection (can be anywhere) +mkdir ~/archivebox && cd ~/archivebox +curl -O https://raw.githubusercontent.com/ArchiveBox/ArchiveBox/master/docker-compose.yml +docker-compose run archivebox init +docker-compose run archivebox --version + +# start the webserver and open the UI (optional) +docker-compose run archivebox manage createsuperuser +docker-compose up -d +open http://127.0.0.1:8000 + +# you can also add links and manage your archive via the CLI: +docker-compose run archivebox add 'https://example.com' +docker-compose run archivebox status +docker-compose run archivebox help # to see more options +``` + +
+ +
+Get ArchiveBox with docker on any platform + +First make sure you have Docker installed: https://docs.docker.com/get-docker/
+```bash +# create a new empty directory and initialize your collection (can be anywhere) +mkdir ~/archivebox && cd ~/archivebox +docker run -v $PWD:/data -it archivebox/archivebox init +docker run -v $PWD:/data -it archivebox/archivebox --version + +# start the webserver and open the UI (optional) +docker run -v $PWD:/data -it archivebox/archivebox manage createsuperuser +docker run -v $PWD:/data -p 8000:8000 archivebox/archivebox server 0.0.0.0:8000 +open http://127.0.0.1:8000 + +# you can also add links and manage your archive via the CLI: +docker run -v $PWD:/data -it archivebox/archivebox add 'https://example.com' +docker run -v $PWD:/data -it archivebox/archivebox status +docker run -v $PWD:/data -it archivebox/archivebox help # to see more options +``` + +
+ +
+Get ArchiveBox with apt on Ubuntu >=20.04 + +```bash sudo add-apt-repository -u ppa:archivebox/archivebox -apt install archivebox +sudo apt install archivebox -# or macOS +# create a new empty directory and initialize your collection (can be anywhere) +mkdir ~/archivebox && cd ~/archivebox +npm install --prefix . 'git+https://github.com/ArchiveBox/ArchiveBox.git' +archivebox init +archivebox --version + +# start the webserver and open the web UI (optional) +archivebox manage createsuperuser +archivebox server 0.0.0.0:8000 +open http://127.0.0.1:8000 + +# you can also add URLs and manage the archive via the CLI and filesystem: +archivebox add 'https://example.com' +archivebox status +archivebox list --html --with-headers > index.html +archivebox list --json --with-headers > index.json +archivebox help # to see more options +``` + +For other Debian-based systems or older Ubuntu systems you can add these sources to `/etc/apt/sources.list`: +```bash +deb http://ppa.launchpad.net/archivebox/archivebox/ubuntu focal main +deb-src http://ppa.launchpad.net/archivebox/archivebox/ubuntu focal main +``` +(you may need to install some other dependencies manually however) + +
+ +
+Get ArchiveBox with brew on macOS >=10.13 + +```bash brew install archivebox/archivebox/archivebox -# or for the Python version only, without wget/git/chrome/etc. included +# create a new empty directory and initialize your collection (can be anywhere) +mkdir ~/archivebox && cd ~/archivebox +npm install --prefix . 'git+https://github.com/ArchiveBox/ArchiveBox.git' +archivebox init +archivebox --version + +# start the webserver and open the web UI (optional) +archivebox manage createsuperuser +archivebox server 0.0.0.0:8000 +open http://127.0.0.1:8000 + +# you can also add URLs and manage the archive via the CLI and filesystem: +archivebox add 'https://example.com' +archivebox status +archivebox list --html --with-headers > index.html +archivebox list --json --with-headers > index.json +archivebox help # to see more options +``` + +
+ +
+Get ArchiveBox with pip on any platform + +```bash pip3 install archivebox -# If you're using an apt/brew/pip install you can run archivebox commands normally -# archivebox [subcommand] [...args] -# If you're using Docker you'll have to run the commands like this -# docker run -v $PWD:/data -it archivebox/archivebox [subcommand] [...args] -# And the equivalent in Docker Compose: -# docker-compose run archivebox [subcommand] [...args] -``` - -Check that everything installed correctly with `archivebox --version` - -**To start using archivebox, you have to create a data folder and `cd` into it:** - -```bash -mkdir ~/archivebox && cd ~/archivebox # you can put the collection dir anywhere +# create a new empty directory and initialize your collection (can be anywhere) +mkdir ~/archivebox && cd ~/archivebox +npm install --prefix . 'git+https://github.com/ArchiveBox/ArchiveBox.git' archivebox init +archivebox --version +# Install any missing extras like wget/git/chrome/etc. manually as needed + +# start the webserver and open the web UI (optional) +archivebox manage createsuperuser +archivebox server 0.0.0.0:8000 +open http://127.0.0.1:8000 + +# you can also add URLs and manage the archive via the CLI and filesystem: +archivebox add 'https://example.com' +archivebox status +archivebox list --html --with-headers > index.html +archivebox list --json --with-headers > index.json +archivebox help # to see more options ``` -**Then Add some URLs to your archive collection:** -```bash -archivebox add https://github.com/ArchiveBox/ArchiveBox -archivebox add --depth=1 https://example.com -``` - -**View the snapshots of the URLs you added via the self-hosted web UI:** -```bash -archivebox manage createsuperuser # create an admin acct -archivebox server 0.0.0.0:8000 # start the web server -open http://127.0.0.1:8000/ # open the interactive admin panel -ls ~/archivebox/archive/*/index.html # or browse the snapshots on disk -``` - - +
+ +--- +

@@ -97,9 +210,9 @@ For more information, see the .gz` gzipped WARC of all the resources fetched while archiving - **PDF:** `output.pdf` Printed PDF of site using headless chrome - **Screenshot:** `screenshot.png` 1440x900 screenshot of site using headless chrome - **DOM Dump:** `output.html` DOM Dump of the HTML after rendering using headless chrome +- **Readability:** `article.html/json` Article text extraction using Readability - **URL to Archive.org:** `archive.org.txt` A link to the saved site on archive.org - **Audio & Video:** `media/` all audio/video files + playlists, including subtitles & metadata with youtube-dl - **Source Code:** `git/` clone of any repository found on github, bitbucket, or gitlab links @@ -191,8 +307,8 @@ archivebox add 'https://example.com/any/url/you/want/to/keep/secret/' # without first disabling share the URL with 3rd party APIs: archivebox config --set SAVE_ARCHIVE_DOT_ORG=False # disable saving all URLs in Archive.org -archivebox config --set SAVE_FAVICON=False # optional: only the domain is leaked, not full URL -archivebox config --get CHROME_VERSION # optional: set this to chromium instead of chrome if you don't like Google +archivebox config --set SAVE_FAVICON=False # optional: only the domain is leaked, not full URL +archivebox config --set CHROME_BINARY=chromium # optional: switch to chromium to avoid Chrome phoning home to Google ``` Be aware that malicious archived JS can also read the contents of other pages in your archive due to snapshot CSRF and XSS protections being imperfect. See the [Security Overview](https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview#stealth-mode) page for more details. @@ -215,95 +331,6 @@ archivebox add 'https://example.com#2020-10-25' --- -# Setup - -## Docker Compose - -*This is the recommended way of running ArchiveBox.* - -It comes with everything working out of the box, including all extractors, -a headless browser runtime, a full webserver, and CLI interface. 
- -```bash -# docker-compose run archivebox [args] - -mkdir archivebox && cd archivebox -wget 'https://raw.githubusercontent.com/ArchiveBox/ArchiveBox/master/docker-compose.yml' -docker-compose run archivebox init -docker-compose run archivebox add 'https://example.com' -docker-compose run archivebox manage createsuperuser -docker-compose up -open http://127.0.0.1:8000 -``` - -## Docker - -```bash -# docker run -v $PWD:/data -it archivebox/archivebox [args] - -mkdir archivebox && cd archivebox -docker run -v $PWD:/data -it archivebox/archivebox init -docker run -v $PWD:/data -it archivebox/archivebox add 'https://example.com' -docker run -v $PWD:/data -it archivebox/archivebox manage createsuperuser - -# run the webserver to access the web UI -docker run -v $PWD:/data -it -p 8000:8000 archivebox/archivebox server 0.0.0.0:8000 -open http://127.0.0.1:8000 - -# or export a static version of the index if you dont want to run a server -docker run -v $PWD:/data -it archivebox/archivebox list --html --with-headers > index.html -docker run -v $PWD:/data -it archivebox/archivebox list --json --with-headers > index.json -open ./index.html -``` - - -## Bare Metal - -```bash -# archivebox [args] - -# on Debian/Ubuntu -sudo add-apt-repository -u ppa:archivebox/archivebox -apt install archivebox - -# on macOS -brew install archivebox/archivebox/archivebox -``` - -Initialize your archive in a directory somewhere and add some links: -```bash -mkdir ~/archivebox && cd archivebox -npm install --prefix . 
'git+https://github.com/ArchiveBox/ArchiveBox.git' -archivebox init -archivebox add 'https://example.com' # add URLs as args pipe them in via stdin -archivebox add --depth=1 https://example.com/table-of-contents.html -# it can injest links from many formats, including RSS/JSON/XML/MD/TXT and more -curl https://getpocket.com/users/USERNAME/feed/all | archivebox add -``` - -Start the webserver to access the web UI: -```bash -archivebox manage createsuperuser -archivebox server 0.0.0.0:8000 - -open http://127.0.0.1:8000 -``` - -Or export a static HTML version of the index if you don't want to run a webserver: -```bash -archivebox list --html --with-headers > index.html -archivebox list --json --with-headers > index.json -open ./index.html -``` - -To view more information about your dependencies, data, or the CLI: -```bash -archivebox version -archivebox status -archivebox help -``` ---- -
@@ -418,20 +445,19 @@ All contributions to ArchiveBox are welcomed! Check our [issues](https://github. First, install the system dependencies from the "Bare Metal" section above. Then you can clone the ArchiveBox repo and install ```python3 -git clone https://github.com/ArchiveBox/ArchiveBox -cd ArchiveBox +git clone https://github.com/ArchiveBox/ArchiveBox && cd ArchiveBox git checkout master # or the branch you want to test -git pull +git submodule update --init --recursive +git pull --recurse-submodules # Install ArchiveBox + python dependencies python3 -m venv .venv && source .venv/bin/activate && pip install -e .[dev] -# or -pipenv install --dev && pipenv shell +# or with pipenv: pipenv install --dev && pipenv shell # Install node dependencies npm install -# Optional: install the extractor dependencies +# Optional: install extractor dependencies manually or with helper script ./bin/setup.sh # Optional: develop via docker by mounting the code dir into the container @@ -463,6 +489,17 @@ You can also run all these in Docker. For more examples see the Github Actions C ``` (uses `pytest -s`) +#### Make migrations or enter a django shell + +```bash +cd archivebox/ +./manage.py makemigrations + +cd data/ +archivebox shell +``` +(uses `./manage.py makemigrations` and `archivebox shell`) + #### Build the docs, pip package, and docker image ```bash @@ -471,6 +508,8 @@ You can also run all these in Docker. For more examples see the Github Actions C # or individually: ./bin/build_docs.sh ./bin/build_pip.sh +./bin/build_deb.sh +./bin/build_brew.sh ./bin/build_docker.sh ``` diff --git a/archivebox.egg-info/PKG-INFO b/archivebox.egg-info/PKG-INFO deleted file mode 100644 index 1d528824..00000000 --- a/archivebox.egg-info/PKG-INFO +++ /dev/null @@ -1,541 +0,0 @@ -Metadata-Version: 2.1 -Name: archivebox -Version: 0.4.24 -Summary: The self-hosted internet archive. 
-Home-page: https://github.com/ArchiveBox/ArchiveBox -Author: Nick Sweeting -Author-email: git@nicksweeting.com -License: MIT -Project-URL: Source, https://github.com/ArchiveBox/ArchiveBox -Project-URL: Documentation, https://github.com/ArchiveBox/ArchiveBox/wiki -Project-URL: Bug Tracker, https://github.com/ArchiveBox/ArchiveBox/issues -Project-URL: Changelog, https://github.com/ArchiveBox/ArchiveBox/wiki/Changelog -Project-URL: Roadmap, https://github.com/ArchiveBox/ArchiveBox/wiki/Roadmap -Project-URL: Community, https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community -Project-URL: Donate, https://github.com/ArchiveBox/ArchiveBox/wiki/Donations -Description:
- -

ArchiveBox
The open-source self-hosted web archive.

- - ▶️
Quickstart | - Demo | - Github | - Documentation | - Info & Motivation | - Community | - Roadmap - -
-        "Your own personal internet archive" (网站存档 / 爬虫)
-        
- - - - - - - - - - -
-
- - ArchiveBox is a powerful self-hosted internet archiving solution written in Python 3. You feed it URLs of pages you want to archive, and it saves them to disk in a varitety of formats depending on the configuration and the content it detects. ArchiveBox can be installed via [Docker](https://docs.docker.com/get-docker/) (recommended) or [`pip`](https://www.python.org/downloads/). It works on macOS, Windows, and Linux/BSD (both armv7 and amd64). - - Once installed, URLs can be added via the command line `archivebox add` or the built-in Web UI `archivebox server`. It can ingest bookmarks from a service like Pocket/Pinboard, your entire browsing history, RSS feeds, or URLs one at a time. - - The main index is a self-contained `data/index.sqlite3` file, and each snapshot is stored as a folder `data/archive//`, with an easy-to-read `index.html` and `index.json` within. For each page, ArchiveBox auto-extracts many types of assets/media and saves them in standard formats, with out-of-the-box support for: 3 types of HTML snapshots (wget, Chrome headless, singlefile), a PDF snapshot, a screenshot, a WARC archive, git repositories, images, audio, video, subtitles, article text, and more. The snapshots are browseable and managable offline through the filesystem, the built-in webserver, or the Python API. - - - #### Quickstart - - ```bash - # 1. Create a folder somewhere to hold your ArchiveBox data - mkdir ~/archivebox && cd ~/archivebox - docker run -v $PWD:/data -it archivebox/archivebox init - - # 2. Archive some URLs to get started - docker run -v $PWD:/data -t archivebox/archivebox add https://github.com/ArchiveBox/ArchiveBox - docker run -v $PWD:/data -t archivebox/archivebox add --depth=1 https://example.com - - # 3. 
Then view the snapshots of the URLs you added via the self-hosted web UI - docker run -v $PWD:/data -it archivebox/archivebox manage createsuperuser # create an admin acct - docker run -v $PWD:/data -p 8000:8000 archivebox/archivebox # start the web server - open http://127.0.0.1:8000/ # open the interactive admin panel - ls archive/*/index.html # or just browse snapshots on disk - ``` - -
- -
- - DEMO: archivebox.zervice.io/ - For more information, see the full Quickstart guide, Usage, and Configuration docs. -
- - --- - - - # Overview - - ArchiveBox is a command line tool, self-hostable web-archiving server, and Python library all-in-one. It's available as a Python3 package or a Docker image, both methods provide the same CLI, Web UI, and on-disk data format. - - It works on Docker, macOS, and Linux/BSD. Windows is not officially supported, but users have reported getting it working using the WSL2 + Docker. - - To use ArchiveBox you start by creating a folder for your data to live in (it can be anywhere on your system), and running `archivebox init` inside of it. That will create a sqlite3 index and an `ArchiveBox.conf` file. After that, you can continue to add/remove/search/import/export/manage/config/etc using the CLI `archivebox help`, or you can run the Web UI (recommended): - ```bash - archivebox manage createsuperuser - archivebox server 0.0.0.0:8000 - open http://127.0.0.1:8000 - ``` - - The CLI is considered "stable", the ArchiveBox Python API and REST APIs are in "beta", and the [desktop app](https://github.com/ArchiveBox/desktop) is in "alpha" stage. - - At the end of the day, the goal is to sleep soundly knowing that the part of the internet you care about will be automatically preserved in multiple, durable long-term formats that will be accessible for decades (or longer). You can also self-host your archivebox server on a public domain to provide archive.org-style public access to your site snapshots. - -
- CLI Screenshot - Desktop index screenshot - Desktop details page Screenshot - Desktop details page Screenshot
- Demo | Usage | Screenshots -
- . . . . . . . . . . . . . . . . . . . . . . . . . . . . -

- - - ## Key Features - - - [**Free & open source**](https://github.com/ArchiveBox/ArchiveBox/blob/master/LICENSE), doesn't require signing up for anything, stores all data locally - - [**Few dependencies**](https://github.com/ArchiveBox/ArchiveBox/wiki/Install#dependencies) and [simple command line interface](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#CLI-Usage) - - [**Comprehensive documentation**](https://github.com/ArchiveBox/ArchiveBox/wiki), [active development](https://github.com/ArchiveBox/ArchiveBox/wiki/Roadmap), and [rich community](https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community) - - Easy to set up **[scheduled importing](https://github.com/ArchiveBox/ArchiveBox/wiki/Scheduled-Archiving) from multiple sources** - - Uses common, **durable, [long-term formats](#saves-lots-of-useful-stuff-for-each-imported-link)** like HTML, JSON, PDF, PNG, and WARC - - ~~**Suitable for paywalled / [authenticated content](https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#chrome_user_data_dir)** (can use your cookies)~~ (do not do this until v0.5 is released with some security fixes) - - **Doesn't require a constantly-running daemon**, proxy, or native app - - Provides a CLI, Python API, self-hosted web UI, and REST API (WIP) - - Architected to be able to run [**many varieties of scripts during archiving**](https://github.com/ArchiveBox/ArchiveBox/issues/51), e.g. to extract media, summarize articles, [scroll pages](https://github.com/ArchiveBox/ArchiveBox/issues/80), [close modals](https://github.com/ArchiveBox/ArchiveBox/issues/175), expand comment threads, etc. - - Can also [**mirror content to 3rd-party archiving services**](https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#submit_archive_dot_org) automatically for redundancy - - ## Input formats - - ArchiveBox supports many input formats for URLs, including Pocket & Pinboard exports, Browser bookmarks, Browser history, plain text, HTML, markdown, and more! 
- - ```bash - echo 'http://example.com' | archivebox add - archivebox add 'https://example.com/some/page' - archivebox add < ~/Downloads/firefox_bookmarks_export.html - archivebox add < any_text_with_urls_in_it.txt - archivebox add --depth=1 'https://example.com/some/downloads.html' - archivebox add --depth=1 'https://news.ycombinator.com#2020-12-12' - ``` - - - Browser history or bookmarks exports (Chrome, Firefox, Safari, IE, Opera, and more) - - RSS, XML, JSON, CSV, SQL, HTML, Markdown, TXT, or any other text-based format - - Pocket, Pinboard, Instapaper, Shaarli, Delicious, Reddit Saved Posts, Wallabag, Unmark.it, OneTab, and more - - See the [Usage: CLI](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#CLI-Usage) page for documentation and examples. - - It also includes a built-in scheduled import feature and browser bookmarklet, so you can ingest URLs from RSS feeds, websites, or the filesystem regularly. - - ## Output formats - - All of ArchiveBox's state (including the index, snapshot data, and config file) is stored in a single folder called the "ArchiveBox data folder". All `archivebox` CLI commands must be run from inside this folder, and you first create it by running `archivebox init`. - - The on-disk layout is optimized to be easy to browse by hand and durable long-term. The main index is a standard sqlite3 database (it can also be exported as static JSON/HTML), and the archive snapshots are organized by date-added timestamp in the `archive/` subfolder. Each snapshot subfolder includes a static JSON and HTML index describing its contents, and the snapshot extrator outputs are plain files within the folder (e.g. `media/example.mp4`, `git/somerepo.git`, `static/someimage.png`, etc.) 
- - ```bash - ls ./archive// - ``` - - - **Index:** `index.html` & `index.json` HTML and JSON index files containing metadata and details - - **Title:** `title` title of the site - - **Favicon:** `favicon.ico` favicon of the site - - **WGET Clone:** `example.com/page-name.html` wget clone of the site, with .html appended if not present - - **WARC:** `warc/.gz` gzipped WARC of all the resources fetched while archiving - - **PDF:** `output.pdf` Printed PDF of site using headless chrome - - **Screenshot:** `screenshot.png` 1440x900 screenshot of site using headless chrome - - **DOM Dump:** `output.html` DOM Dump of the HTML after rendering using headless chrome - - **URL to Archive.org:** `archive.org.txt` A link to the saved site on archive.org - - **Audio & Video:** `media/` all audio/video files + playlists, including subtitles & metadata with youtube-dl - - **Source Code:** `git/` clone of any repository found on github, bitbucket, or gitlab links - - _More coming soon! See the [Roadmap](https://github.com/ArchiveBox/ArchiveBox/wiki/Roadmap)..._ - - It does everything out-of-the-box by default, but you can disable or tweak [individual archive methods](https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration) via environment variables or config file. - - ## Dependencies - - You don't need to install all the dependencies, ArchiveBox will automatically enable the relevant modules based on whatever you have available, but it's recommended to use the official [Docker image](https://github.com/ArchiveBox/ArchiveBox/wiki/Docker) with everything preinstalled. - - If you so choose, you can also install ArchiveBox and its dependencies directly on any Linux or macOS systems using the [automated setup script](https://github.com/ArchiveBox/ArchiveBox/wiki/Quickstart) or the [system package manager](https://github.com/ArchiveBox/ArchiveBox/wiki/Install). - - ArchiveBox is written in Python 3 so it requires `python3` and `pip3` available on your system. 
It also uses a set of optional, but highly recommended external dependencies for archiving sites: `wget` (for plain HTML, static files, and WARC saving), `chromium` (for screenshots, PDFs, JS execution, and more), `youtube-dl` (for audio and video), `git` (for cloning git repos), and `nodejs` (for readability and singlefile), and more. - - ## Caveats - - If you're importing URLs containing secret slugs or pages with private content (e.g Google Docs, CodiMD notepads, etc), you may want to disable some of the extractor modules to avoid leaking private URLs to 3rd party APIs during the archiving process. - ```bash - # don't do this: - archivebox add 'https://docs.google.com/document/d/12345somelongsecrethere' - archivebox add 'https://example.com/any/url/you/want/to/keep/secret/' - - # without first disabling share the URL with 3rd party APIs: - archivebox config --set SAVE_ARCHIVE_DOT_ORG=False # disable saving all URLs in Archive.org - archivebox config --set SAVE_FAVICON=False # optional: only the domain is leaked, not full URL - archivebox config --get CHROME_VERSION # optional: set this to chromium instead of chrome if you don't like Google - ``` - - Be aware that malicious archived JS can also read the contents of other pages in your archive due to snapshot CSRF and XSS protections being imperfect. See the [Security Overview](https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview#stealth-mode) page for more details. - ```bash - # visiting an archived page with malicious JS: - https://127.0.0.1:8000/archive/1602401954/example.com/index.html - - # example.com/index.js can now make a request to read everything: - https://127.0.0.1:8000/index.html - https://127.0.0.1:8000/archive/* - # then example.com/index.js can send it off to some evil server - ``` - - Support for saving multiple snapshots of each site over time will be [added soon](https://github.com/ArchiveBox/ArchiveBox/issues/179) (along with the ability to view diffs of the changes between runs). 
For now ArchiveBox is designed to only archive each URL with each extractor type once. A workaround to take multiple snapshots of the same URL is to make them slightly different by adding a hash: - ```bash - archivebox add 'https://example.com#2020-10-24' - ... - archivebox add 'https://example.com#2020-10-25' - ``` - - --- - - # Setup - - ## Docker Compose - - *This is the recommended way of running ArchiveBox.* - - It comes with everything working out of the box, including all extractors, - a headless browser runtime, a full webserver, and CLI interface. - - ```bash - # docker-compose run archivebox [args] - - mkdir archivebox && cd archivebox - wget 'https://raw.githubusercontent.com/ArchiveBox/ArchiveBox/master/docker-compose.yml' - docker-compose run archivebox init - docker-compose run archivebox add 'https://example.com' - docker-compose run archivebox manage createsuperuser - docker-compose up - open http://127.0.0.1:8000 - ``` - - ## Docker - - ```bash - # docker run -v $PWD:/data -it archivebox/archivebox [args] - - mkdir archivebox && cd archivebox - docker run -v $PWD:/data -it archivebox/archivebox init - docker run -v $PWD:/data -it archivebox/archivebox add 'https://example.com' - docker run -v $PWD:/data -it archivebox/archivebox manage createsuperuser - - # run the webserver to access the web UI - docker run -v $PWD:/data -it -p 8000:8000 archivebox/archivebox server 0.0.0.0:8000 - open http://127.0.0.1:8000 - - # or export a static version of the index if you dont want to run a server - docker run -v $PWD:/data -it archivebox/archivebox list --html --with-headers > index.html - docker run -v $PWD:/data -it archivebox/archivebox list --json --with-headers > index.json - open ./index.html - ``` - - - ## Bare Metal - - ```bash - # archivebox [args] - ``` - - First install the system, pip, and npm dependencies: - ```bash - # Install main dependendencies using apt on Ubuntu/Debian, brew on mac, or pkg on BSD - apt install python3 python3-pip 
python3-dev git curl wget chromium-browser youtube-dl - - # Install Node runtime (used for headless browser scripts like Readability, Singlefile, Mercury, etc.) - curl -s https://deb.nodesource.com/gpgkey/nodesource.gpg.key | apt-key add - \ - && echo 'deb https://deb.nodesource.com/node_14.x $(lsb_release -cs) main' >> /etc/apt/sources.list \ - && apt-get update \ - && apt-get install --no-install-recommends nodejs - - # Make a directory to hold your collection - mkdir archivebox && cd archivebox # (can be anywhere, doesn't have to be called archivebox) - - # Install the archivebox python package in ./.venv - python3 -m venv .venv && source .venv/bin/activate - pip install --upgrade archivebox - - # Install node packages in ./node_modules (used for SingleFile, Readability, and Puppeteer) - npm install --prefix . 'git+https://github.com/ArchiveBox/ArchiveBox.git' - ``` - - Initialize your archive and add some links: - ```bash - archivebox init - archivebox add 'https://example.com' # add URLs as args pipe them in via stdin - archivebox add --depth=1 https://example.com/table-of-contents.html - # it can injest links from many formats, including RSS/JSON/XML/MD/TXT and more - curl https://getpocket.com/users/USERNAME/feed/all | archivebox add - ``` - - Start the webserver to access the web UI: - ```bash - archivebox manage createsuperuser - archivebox server 0.0.0.0:8000 - - open http://127.0.0.1:8000 - ``` - - Or export a static HTML version of the index if you don't want to run a webserver: - ```bash - archivebox list --html --with-headers > index.html - archivebox list --json --with-headers > index.json - open ./index.html - ``` - - To view more information about your dependencies, data, or the CLI: - ```bash - archivebox version - archivebox status - archivebox help - ``` - --- - -
- -
- - --- - - # Background & Motivation - - Vast treasure troves of knowledge are lost every day on the internet to link rot. As a society, we have an imperative to preserve some important parts of that treasure, just like we preserve our books, paintings, and music in physical libraries long after the originals go out of print or fade into obscurity. - - Whether it's to resist censorship by saving articles before they get taken down or edited, or - just to save a collection of early 2010's flash games you love to play, having the tools to - archive internet content enables to you save the stuff you care most about before it disappears. - -
-
- Image from WTF is Link Rot?...
-
- - The balance between the permanence and ephemeral nature of content on the internet is part of what makes it beautiful. - I don't think everything should be preserved in an automated fashion, making all content permanent and never removable, but I do think people should be able to decide for themselves and effectively archive specific content that they care about. - - Because modern websites are complicated and often rely on dynamic content, - ArchiveBox archives the sites in **several different formats** beyond what public archiving services like Archive.org and Archive.is are capable of saving. Using multiple methods and the market-dominant browser to execute JS ensures we can save even the most complex, finicky websites in at least a few high-quality, long-term data formats. - - All the archived links are stored by date bookmarked in `./archive/`, and everything is indexed nicely with JSON & HTML files. The intent is for all the content to be viewable with common software in 50 - 100 years without needing to run ArchiveBox in a VM. - - ## Comparison to Other Projects - - ▶ **Check out our [community page](https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community) for an index of web archiving initiatives and projects.** - - comparison The aim of ArchiveBox is to go beyond what the Wayback Machine and other public archiving services can do, by adding a headless browser to replay sessions accurately, and by automatically extracting all the content in multiple redundant formats that will survive being passed down to historians and archivists through many generations. 
- - #### User Interface & Intended Purpose - - ArchiveBox differentiates itself from [similar projects](https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community#Web-Archiving-Projects) by being a simple, one-shot CLI interface for users to ingest bulk feeds of URLs over extended periods, as opposed to being a backend service that ingests individual, manually-submitted URLs from a web UI. However, we also have the option to add urls via a web interface through our Django frontend. - - #### Private Local Archives vs Centralized Public Archives - - Unlike crawler software that starts from a seed URL and works outwards, or public tools like Archive.org designed for users to manually submit links from the public internet, ArchiveBox tries to be a set-and-forget archiver suitable for archiving your entire browsing history, RSS feeds, or bookmarks, ~~including private/authenticated content that you wouldn't otherwise share with a centralized service~~ (do not do this until v0.5 is released with some security fixes). Also by having each user store their own content locally, we can save much larger portions of everyone's browsing history than a shared centralized service would be able to handle. - - #### Storage Requirements - - Because ArchiveBox is designed to ingest a firehose of browser history and bookmark feeds to a local disk, it can be much more disk-space intensive than a centralized service like the Internet Archive or Archive.today. However, as storage space gets cheaper and compression improves, you should be able to use it continuously over the years without having to delete anything. In my experience, ArchiveBox uses about 5gb per 1000 articles, but your milage may vary depending on which options you have enabled and what types of sites you're archiving. 
By default, it archives everything in as many formats as possible, meaning it takes more space than a using a single method, but more content is accurately replayable over extended periods of time. Storage requirements can be reduced by using a compressed/deduplicated filesystem like ZFS/BTRFS, or by setting `SAVE_MEDIA=False` to skip audio & video files. - - ## Learn more - - Whether you want to learn which organizations are the big players in the web archiving space, want to find a specific open-source tool for your web archiving need, or just want to see where archivists hang out online, our Community Wiki page serves as an index of the broader web archiving community. Check it out to learn about some of the coolest web archiving projects and communities on the web! - - - - - [Community Wiki](https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community) - - [The Master Lists](https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community#The-Master-Lists) - _Community-maintained indexes of archiving tools and institutions._ - - [Web Archiving Software](https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community#Web-Archiving-Projects) - _Open source tools and projects in the internet archiving space._ - - [Reading List](https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community#Reading-List) - _Articles, posts, and blogs relevant to ArchiveBox and web archiving in general._ - - [Communities](https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community#Communities) - _A collection of the most active internet archiving communities and initiatives._ - - Check out the ArchiveBox [Roadmap](https://github.com/ArchiveBox/ArchiveBox/wiki/Roadmap) and [Changelog](https://github.com/ArchiveBox/ArchiveBox/wiki/Changelog) - - Learn why archiving the internet is important by reading the "[On the Importance of Web Archiving](https://parameters.ssrc.org/2018/09/on-the-importance-of-web-archiving/)" blog post. 
- - Or reach out to me for questions and comments via [@ArchiveBoxApp](https://twitter.com/ArchiveBoxApp) or [@theSquashSH](https://twitter.com/thesquashSH) on Twitter. - - --- - - # Documentation - - - - We use the [Github wiki system](https://github.com/ArchiveBox/ArchiveBox/wiki) and [Read the Docs](https://archivebox.readthedocs.io/en/latest/) (WIP) for documentation. - - You can also access the docs locally by looking in the [`ArchiveBox/docs/`](https://github.com/ArchiveBox/ArchiveBox/wiki/Home) folder. - - ## Getting Started - - - [Quickstart](https://github.com/ArchiveBox/ArchiveBox/wiki/Quickstart) - - [Install](https://github.com/ArchiveBox/ArchiveBox/wiki/Install) - - [Docker](https://github.com/ArchiveBox/ArchiveBox/wiki/Docker) - - ## Reference - - - [Usage](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage) - - [Configuration](https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration) - - [Supported Sources](https://github.com/ArchiveBox/ArchiveBox/wiki/Quickstart#2-get-your-list-of-urls-to-archive) - - [Supported Outputs](https://github.com/ArchiveBox/ArchiveBox/wiki#can-save-these-things-for-each-site) - - [Scheduled Archiving](https://github.com/ArchiveBox/ArchiveBox/wiki/Scheduled-Archiving) - - [Publishing Your Archive](https://github.com/ArchiveBox/ArchiveBox/wiki/Publishing-Your-Archive) - - [Chromium Install](https://github.com/ArchiveBox/ArchiveBox/wiki/Install-Chromium) - - [Security Overview](https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview) - - [Troubleshooting](https://github.com/ArchiveBox/ArchiveBox/wiki/Troubleshooting) - - [Python API](https://docs.archivebox.io/en/latest/modules.html) - - REST API (coming soon...) 
- - ## More Info - - - [Tickets](https://github.com/ArchiveBox/ArchiveBox/issues) - - [Roadmap](https://github.com/ArchiveBox/ArchiveBox/wiki/Roadmap) - - [Changelog](https://github.com/ArchiveBox/ArchiveBox/wiki/Changelog) - - [Donations](https://github.com/ArchiveBox/ArchiveBox/wiki/Donations) - - [Background & Motivation](https://github.com/ArchiveBox/ArchiveBox#background--motivation) - - [Web Archiving Community](https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community) - - --- - - # ArchiveBox Development - - All contributions to ArchiveBox are welcomed! Check our [issues](https://github.com/ArchiveBox/ArchiveBox/issues) and [Roadmap](https://github.com/ArchiveBox/ArchiveBox/wiki/Roadmap) for things to work on, and please open an issue to discuss your proposed implementation before working on things! Otherwise we may have to close your PR if it doesn't align with our roadmap. - - ### Setup the dev environment - - First, install the system dependencies from the "Bare Metal" section above. - Then you can clone the ArchiveBox repo and install - ```python3 - git clone https://github.com/ArchiveBox/ArchiveBox - cd ArchiveBox - git checkout master # or the branch you want to test - git pull - - # Install ArchiveBox + python dependencies - python3 -m venv .venv && source .venv/bin/activate && pip install -e .[dev] - # or - pipenv install --dev && pipenv shell - - # Install node dependencies - npm install - - # Optional: install the extractor dependencies - ./bin/setup.sh - - # Optional: develop via docker by mounting the code dir into the container - # if you edit e.g. ./archivebox/core/models.py on the docker host, runserver - # inside the container will reload and pick up your changes - docker build . 
-t archivebox - docker run -it -p 8000:8000 \ - -v $PWD/data:/data \ - -v $PWD/archivebox:/app/archivebox \ - archivebox server 0.0.0.0:8000 --debug --reload - ``` - - ### Common development tasks - - See the `./bin/` folder and read the source of the bash scripts within. - You can also run all these in Docker. For more examples see the Github Actions CI/CD tests that are run: `.github/workflows/*.yaml`. - - #### Run the linters - - ```bash - ./bin/lint.sh - ``` - (uses `flake8` and `mypy`) - - #### Run the integration tests - - ```bash - ./bin/test.sh - ``` - (uses `pytest -s`) - - #### Build the docs, pip package, and docker image - - ```bash - ./bin/build.sh - - # or individually: - ./bin/build_docs.sh - ./bin/build_pip.sh - ./bin/build_docker.sh - ``` - - #### Roll a release - - ```bash - ./bin/release.sh - ``` - (bumps the version, builds, and pushes a release to PyPI, Docker Hub, and Github Packages) - - - --- - -
-

- -
- This project is maintained mostly in my spare time with the help from generous contributors and Monadical.com. -

- -
- Sponsor us on Github -
-
- -
- - - - -

- -
- -Platform: UNKNOWN -Classifier: License :: OSI Approved :: MIT License -Classifier: Natural Language :: English -Classifier: Operating System :: OS Independent -Classifier: Development Status :: 4 - Beta -Classifier: Topic :: Utilities -Classifier: Topic :: System :: Archiving -Classifier: Topic :: System :: Archiving :: Backup -Classifier: Topic :: System :: Recovery Tools -Classifier: Topic :: Sociology :: History -Classifier: Topic :: Internet :: WWW/HTTP -Classifier: Topic :: Internet :: WWW/HTTP :: Indexing/Search -Classifier: Topic :: Internet :: WWW/HTTP :: WSGI :: Application -Classifier: Topic :: Software Development :: Libraries :: Python Modules -Classifier: Intended Audience :: Developers -Classifier: Intended Audience :: Education -Classifier: Intended Audience :: End Users/Desktop -Classifier: Intended Audience :: Information Technology -Classifier: Intended Audience :: Legal Industry -Classifier: Intended Audience :: System Administrators -Classifier: Environment :: Console -Classifier: Environment :: Web Environment -Classifier: Programming Language :: Python :: 3 -Classifier: Programming Language :: Python :: 3.7 -Classifier: Programming Language :: Python :: 3.8 -Classifier: Framework :: Django -Classifier: Typing :: Typed -Requires-Python: >=3.7 -Description-Content-Type: text/markdown -Provides-Extra: dev diff --git a/archivebox.egg-info/SOURCES.txt b/archivebox.egg-info/SOURCES.txt deleted file mode 100644 index 8f0d5d48..00000000 --- a/archivebox.egg-info/SOURCES.txt +++ /dev/null @@ -1,128 +0,0 @@ -MANIFEST.in -README.md -setup.py -archivebox/.flake8 -archivebox/LICENSE -archivebox/README.md -archivebox/__init__.py -archivebox/__main__.py -archivebox/base32_crockford.py -archivebox/config.py -archivebox/config_stubs.py -archivebox/logging_util.py -archivebox/main.py -archivebox/manage.py -archivebox/mypy.ini -archivebox/package.json -archivebox/system.py -archivebox/util.py -archivebox.egg-info/PKG-INFO -archivebox.egg-info/SOURCES.txt 
-archivebox.egg-info/dependency_links.txt -archivebox.egg-info/entry_points.txt -archivebox.egg-info/requires.txt -archivebox.egg-info/top_level.txt -archivebox/cli/__init__.py -archivebox/cli/archivebox_add.py -archivebox/cli/archivebox_config.py -archivebox/cli/archivebox_help.py -archivebox/cli/archivebox_init.py -archivebox/cli/archivebox_list.py -archivebox/cli/archivebox_manage.py -archivebox/cli/archivebox_oneshot.py -archivebox/cli/archivebox_remove.py -archivebox/cli/archivebox_schedule.py -archivebox/cli/archivebox_server.py -archivebox/cli/archivebox_shell.py -archivebox/cli/archivebox_status.py -archivebox/cli/archivebox_update.py -archivebox/cli/archivebox_version.py -archivebox/cli/tests.py -archivebox/core/__init__.py -archivebox/core/admin.py -archivebox/core/apps.py -archivebox/core/forms.py -archivebox/core/models.py -archivebox/core/settings.py -archivebox/core/tests.py -archivebox/core/urls.py -archivebox/core/utils.py -archivebox/core/utils_taggit.py -archivebox/core/views.py -archivebox/core/welcome_message.py -archivebox/core/wsgi.py -archivebox/core/management/commands/archivebox.py -archivebox/core/migrations/0001_initial.py -archivebox/core/migrations/0002_auto_20200625_1521.py -archivebox/core/migrations/0003_auto_20200630_1034.py -archivebox/core/migrations/0004_auto_20200713_1552.py -archivebox/core/migrations/0005_auto_20200728_0326.py -archivebox/core/migrations/0006_auto_20201012_1520.py -archivebox/core/migrations/__init__.py -archivebox/extractors/__init__.py -archivebox/extractors/archive_org.py -archivebox/extractors/dom.py -archivebox/extractors/favicon.py -archivebox/extractors/git.py -archivebox/extractors/headers.py -archivebox/extractors/media.py -archivebox/extractors/mercury.py -archivebox/extractors/pdf.py -archivebox/extractors/readability.py -archivebox/extractors/screenshot.py -archivebox/extractors/singlefile.py -archivebox/extractors/title.py -archivebox/extractors/wget.py -archivebox/index/__init__.py 
-archivebox/index/csv.py -archivebox/index/html.py -archivebox/index/json.py -archivebox/index/schema.py -archivebox/index/sql.py -archivebox/parsers/__init__.py -archivebox/parsers/generic_html.py -archivebox/parsers/generic_json.py -archivebox/parsers/generic_rss.py -archivebox/parsers/generic_txt.py -archivebox/parsers/medium_rss.py -archivebox/parsers/netscape_html.py -archivebox/parsers/pinboard_rss.py -archivebox/parsers/pocket_html.py -archivebox/parsers/shaarli_rss.py -archivebox/parsers/wallabag_atom.py -archivebox/themes/admin/actions_as_select.html -archivebox/themes/admin/app_index.html -archivebox/themes/admin/base.html -archivebox/themes/admin/login.html -archivebox/themes/default/add_links.html -archivebox/themes/default/base.html -archivebox/themes/default/main_index.html -archivebox/themes/default/core/snapshot_list.html -archivebox/themes/default/static/add.css -archivebox/themes/default/static/admin.css -archivebox/themes/default/static/archive.png -archivebox/themes/default/static/bootstrap.min.css -archivebox/themes/default/static/external.png -archivebox/themes/default/static/jquery.dataTables.min.css -archivebox/themes/default/static/jquery.dataTables.min.js -archivebox/themes/default/static/jquery.min.js -archivebox/themes/default/static/sort_asc.png -archivebox/themes/default/static/sort_both.png -archivebox/themes/default/static/sort_desc.png -archivebox/themes/default/static/spinner.gif -archivebox/themes/legacy/favicon.ico -archivebox/themes/legacy/link_details.html -archivebox/themes/legacy/main_index.html -archivebox/themes/legacy/main_index_minimal.html -archivebox/themes/legacy/main_index_row.html -archivebox/themes/legacy/robots.txt -archivebox/themes/legacy/static/archive.png -archivebox/themes/legacy/static/bootstrap.min.css -archivebox/themes/legacy/static/external.png -archivebox/themes/legacy/static/jquery.dataTables.min.css -archivebox/themes/legacy/static/jquery.dataTables.min.js -archivebox/themes/legacy/static/jquery.min.js 
-archivebox/themes/legacy/static/sort_asc.png -archivebox/themes/legacy/static/sort_both.png -archivebox/themes/legacy/static/sort_desc.png -archivebox/themes/legacy/static/spinner.gif \ No newline at end of file diff --git a/archivebox.egg-info/dependency_links.txt b/archivebox.egg-info/dependency_links.txt deleted file mode 100644 index 8b137891..00000000 --- a/archivebox.egg-info/dependency_links.txt +++ /dev/null @@ -1 +0,0 @@ - diff --git a/archivebox.egg-info/entry_points.txt b/archivebox.egg-info/entry_points.txt deleted file mode 100644 index 14fdb7e2..00000000 --- a/archivebox.egg-info/entry_points.txt +++ /dev/null @@ -1,3 +0,0 @@ -[console_scripts] -archivebox = archivebox.cli:main - diff --git a/archivebox.egg-info/requires.txt b/archivebox.egg-info/requires.txt deleted file mode 100644 index 457f64e5..00000000 --- a/archivebox.egg-info/requires.txt +++ /dev/null @@ -1,25 +0,0 @@ -requests==2.24.0 -atomicwrites==1.4.0 -mypy-extensions==0.4.3 -django==3.0.8 -django-extensions==3.0.3 -dateparser -ipython -youtube-dl -python-crontab==2.5.1 -croniter==0.3.34 -w3lib==1.22.0 - -[dev] -setuptools -twine -flake8 -ipdb -mypy -django-stubs -sphinx -sphinx-rtd-theme -recommonmark -pytest -bottle -stdeb diff --git a/archivebox.egg-info/top_level.txt b/archivebox.egg-info/top_level.txt deleted file mode 100644 index 74056b65..00000000 --- a/archivebox.egg-info/top_level.txt +++ /dev/null @@ -1 +0,0 @@ -archivebox diff --git a/archivebox/base32_crockford.py b/archivebox/base32_crockford.py deleted file mode 100644 index 07dac08c..00000000 --- a/archivebox/base32_crockford.py +++ /dev/null @@ -1,172 +0,0 @@ -""" -base32-crockford -================ - -A Python module implementing the alternate base32 encoding as described -by Douglas Crockford at: http://www.crockford.com/wrmg/base32.html. 
- -He designed the encoding to: - - * Be human and machine readable - * Be compact - * Be error resistant - * Be pronounceable - -It uses a symbol set of 10 digits and 22 letters, excluding I, L O and -U. Decoding is not case sensitive, and 'i' and 'l' are converted to '1' -and 'o' is converted to '0'. Encoding uses only upper-case characters. - -Hyphens may be present in symbol strings to improve readability, and -are removed when decoding. - -A check symbol can be appended to a symbol string to detect errors -within the string. - -""" - -import re -import sys - -PY3 = sys.version_info[0] == 3 - -if not PY3: - import string as str - - -__all__ = ["encode", "decode", "normalize"] - - -if PY3: - string_types = (str,) -else: - string_types = (basestring,) # noqa - -# The encoded symbol space does not include I, L, O or U -symbols = '0123456789ABCDEFGHJKMNPQRSTVWXYZ' -# These five symbols are exclusively for checksum values -check_symbols = '*~$=U' - -encode_symbols = dict((i, ch) for (i, ch) in enumerate(symbols + check_symbols)) -decode_symbols = dict((ch, i) for (i, ch) in enumerate(symbols + check_symbols)) -normalize_symbols = str.maketrans('IiLlOo', '111100') -valid_symbols = re.compile('^[%s]+[%s]?$' % (symbols, - re.escape(check_symbols))) - -base = len(symbols) -check_base = len(symbols + check_symbols) - - -def encode(number, checksum=False, split=0): - """Encode an integer into a symbol string. - - A ValueError is raised on invalid input. - - If checksum is set to True, a check symbol will be - calculated and appended to the string. - - If split is specified, the string will be divided into - clusters of that size separated by hyphens. - - The encoded string is returned. 
- """ - number = int(number) - if number < 0: - raise ValueError("number '%d' is not a positive integer" % number) - - split = int(split) - if split < 0: - raise ValueError("split '%d' is not a positive integer" % split) - - check_symbol = '' - if checksum: - check_symbol = encode_symbols[number % check_base] - - if number == 0: - return '0' + check_symbol - - symbol_string = '' - while number > 0: - remainder = number % base - number //= base - symbol_string = encode_symbols[remainder] + symbol_string - symbol_string = symbol_string + check_symbol - - if split: - chunks = [] - for pos in range(0, len(symbol_string), split): - chunks.append(symbol_string[pos:pos + split]) - symbol_string = '-'.join(chunks) - - return symbol_string - - -def decode(symbol_string, checksum=False, strict=False): - """Decode an encoded symbol string. - - If checksum is set to True, the string is assumed to have a - trailing check symbol which will be validated. If the - checksum validation fails, a ValueError is raised. - - If strict is set to True, a ValueError is raised if the - normalization step requires changes to the string. - - The decoded string is returned. - """ - symbol_string = normalize(symbol_string, strict=strict) - if checksum: - symbol_string, check_symbol = symbol_string[:-1], symbol_string[-1] - - number = 0 - for symbol in symbol_string: - number = number * base + decode_symbols[symbol] - - if checksum: - check_value = decode_symbols[check_symbol] - modulo = number % check_base - if check_value != modulo: - raise ValueError("invalid check symbol '%s' for string '%s'" % - (check_symbol, symbol_string)) - - return number - - -def normalize(symbol_string, strict=False): - """Normalize an encoded symbol string. - - Normalization provides error correction and prepares the - string for decoding. These transformations are applied: - - 1. Hyphens are removed - 2. 'I', 'i', 'L' or 'l' are converted to '1' - 3. 'O' or 'o' are converted to '0' - 4. 
All characters are converted to uppercase - - A TypeError is raised if an invalid string type is provided. - - A ValueError is raised if the normalized string contains - invalid characters. - - If the strict parameter is set to True, a ValueError is raised - if any of the above transformations are applied. - - The normalized string is returned. - """ - if isinstance(symbol_string, string_types): - if not PY3: - try: - symbol_string = symbol_string.encode('ascii') - except UnicodeEncodeError: - raise ValueError("string should only contain ASCII characters") - else: - raise TypeError("string is of invalid type %s" % - symbol_string.__class__.__name__) - - norm_string = symbol_string.replace('-', '').translate(normalize_symbols).upper() - - if not valid_symbols.match(norm_string): - raise ValueError("string '%s' contains invalid characters" % norm_string) - - if strict and norm_string != symbol_string: - raise ValueError("string '%s' requires normalization" % symbol_string) - - return norm_string diff --git a/archivebox/cli/__init__.py b/archivebox/cli/__init__.py index dd07fc51..f9a55efd 100644 --- a/archivebox/cli/__init__.py +++ b/archivebox/cli/__init__.py @@ -19,6 +19,8 @@ meta_cmds = ('help', 'version') main_cmds = ('init', 'info', 'config') archive_cmds = ('add', 'remove', 'update', 'list', 'status') +fake_db = ("oneshot",) + display_first = (*meta_cmds, *main_cmds, *archive_cmds) # every imported command module must have these properties in order to be valid @@ -59,6 +61,10 @@ def run_subcommand(subcommand: str, pwd: Union[Path, str, None]=None) -> None: """Run a given ArchiveBox subcommand with the given list of args""" + if subcommand not in meta_cmds: + from ..config import setup_django + setup_django(in_memory_db=subcommand in fake_db, check_db=subcommand in archive_cmds) + module = import_module('.archivebox_{}'.format(subcommand), __package__) module.main(args=subcommand_args, stdin=stdin, pwd=pwd) # type: ignore @@ -134,3 +140,5 @@ __all__ = ( 
'run_subcommand', *SUBCOMMANDS.keys(), ) + + diff --git a/archivebox/cli/archivebox_add.py b/archivebox/cli/archivebox_add.py index b4e65231..41c7554d 100644 --- a/archivebox/cli/archivebox_add.py +++ b/archivebox/cli/archivebox_add.py @@ -89,8 +89,8 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional index_only=command.index_only, overwrite=command.overwrite, init=command.init, - out_dir=pwd or OUTPUT_DIR, extractors=command.extract, + out_dir=pwd or OUTPUT_DIR, ) diff --git a/archivebox/cli/archivebox_list.py b/archivebox/cli/archivebox_list.py index 140810a6..3838cf60 100644 --- a/archivebox/cli/archivebox_list.py +++ b/archivebox/cli/archivebox_list.py @@ -98,7 +98,7 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional parser.add_argument( '--filter-type', type=str, - choices=('exact', 'substring', 'domain', 'regex','tag'), + choices=('exact', 'substring', 'domain', 'regex', 'tag', 'search'), default='exact', help='Type of pattern matching to use when filtering URLs', ) diff --git a/archivebox/cli/archivebox_oneshot.py b/archivebox/cli/archivebox_oneshot.py index 2353d101..af68bac2 100644 --- a/archivebox/cli/archivebox_oneshot.py +++ b/archivebox/cli/archivebox_oneshot.py @@ -36,6 +36,13 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional ' ~/Desktop/sites_list.csv\n' ) ) + parser.add_argument( + "--extract", + type=str, + help="Pass a list of the extractors to be used. If the method name is not correct, it will be ignored. 
\ + This does not take precedence over the configuration", + default="" + ) parser.add_argument( '--out-dir', type=str, @@ -55,6 +62,7 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional oneshot( url=stdin_url or url, out_dir=Path(command.out_dir).resolve(), + extractors=command.extract, ) diff --git a/archivebox/cli/archivebox_update.py b/archivebox/cli/archivebox_update.py index 9d483362..6748096e 100644 --- a/archivebox/cli/archivebox_update.py +++ b/archivebox/cli/archivebox_update.py @@ -91,7 +91,7 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional parser.add_argument( '--filter-type', type=str, - choices=('exact', 'substring', 'domain', 'regex'), + choices=('exact', 'substring', 'domain', 'regex', 'tag', 'search'), default='exact', help='Type of pattern matching to use when filtering URLs', ) @@ -102,6 +102,13 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional default=None, help='Update only URLs matching these filter patterns.' ) + parser.add_argument( + "--extract", + type=str, + help="Pass a list of the extractors to be used. If the method name is not correct, it will be ignored. \ + This does not take precedence over the configuration", + default="" + ) command = parser.parse_args(args or ()) filter_patterns_str = accept_stdin(stdin) @@ -117,6 +124,7 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional after=command.after, before=command.before, out_dir=pwd or OUTPUT_DIR, + extractors=command.extract, ) diff --git a/archivebox/config.py b/archivebox/config.py index dc50679d..9a3f9a77 100644 --- a/archivebox/config.py +++ b/archivebox/config.py @@ -1,3 +1,24 @@ +""" +ArchiveBox config definitons (including defaults and dynamic config options). + +Config Usage Example: + + archivebox config --set MEDIA_TIMEOUT=600 + env MEDIA_TIMEOUT=600 USE_COLOR=False ... archivebox [subcommand] ... + +Config Precedence Order: + + 1. 
cli args (--update-all / --index-only / etc.) + 2. shell environment vars (env USE_COLOR=False archivebox add '...') + 3. config file (echo "SAVE_FAVICON=False" >> ArchiveBox.conf) + 4. defaults (defined below in Python) + +Documentation: + + https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration + +""" + __package__ = 'archivebox' import os @@ -24,26 +45,9 @@ from .config_stubs import ( ConfigDefaultDict, ) -# precedence order for config: -# 1. cli args (e.g. ) -# 2. shell environment vars (env USE_COLOR=False archivebox add '...') -# 3. config file (echo "SAVE_FAVICON=False" >> ArchiveBox.conf) -# 4. defaults (defined below in Python) +############################### Config Schema ################################## -# -# env SHOW_PROGRESS=1 archivebox add '...' -# archivebox config --set TIMEOUT=600 -# - -# ****************************************************************************** -# Documentation: https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration -# Use the 'env' command to pass config options to ArchiveBox. 
e.g.: -# env USE_COLOR=True CHROME_BINARY=chromium archivebox add < example.html -# ****************************************************************************** - -################################# User Config ################################## - -CONFIG_DEFAULTS: Dict[str, ConfigDefaultDict] = { +CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = { 'SHELL_CONFIG': { 'IS_TTY': {'type': bool, 'default': lambda _: sys.stdout.isatty()}, 'USE_COLOR': {'type': bool, 'default': lambda c: c['IS_TTY']}, @@ -139,6 +143,18 @@ CONFIG_DEFAULTS: Dict[str, ConfigDefaultDict] = { 'GIT_ARGS': {'type': list, 'default': ['--recursive']}, }, + 'SEARCH_BACKEND_CONFIG' : { + 'USE_INDEXING_BACKEND': {'type': bool, 'default': True}, + 'USE_SEARCHING_BACKEND': {'type': bool, 'default': True}, + 'SEARCH_BACKEND_ENGINE': {'type': str, 'default': 'ripgrep'}, + 'SEARCH_BACKEND_HOST_NAME': {'type': str, 'default': 'localhost'}, + 'SEARCH_BACKEND_PORT': {'type': int, 'default': 1491}, + 'SEARCH_BACKEND_PASSWORD': {'type': str, 'default': 'SecretPassword'}, + # SONIC + 'SONIC_COLLECTION': {'type': str, 'default': 'archivebox'}, + 'SONIC_BUCKET': {'type': str, 'default': 'snapshots'}, + }, + 'DEPENDENCY_CONFIG': { 'USE_CURL': {'type': bool, 'default': True}, 'USE_WGET': {'type': bool, 'default': True}, @@ -149,7 +165,8 @@ CONFIG_DEFAULTS: Dict[str, ConfigDefaultDict] = { 'USE_CHROME': {'type': bool, 'default': True}, 'USE_NODE': {'type': bool, 'default': True}, 'USE_YOUTUBEDL': {'type': bool, 'default': True}, - + 'USE_RIPGREP': {'type': bool, 'default': True}, + 'CURL_BINARY': {'type': str, 'default': 'curl'}, 'GIT_BINARY': {'type': str, 'default': 'git'}, 'WGET_BINARY': {'type': str, 'default': 'wget'}, @@ -158,25 +175,48 @@ CONFIG_DEFAULTS: Dict[str, ConfigDefaultDict] = { 'MERCURY_BINARY': {'type': str, 'default': 'mercury-parser'}, 'YOUTUBEDL_BINARY': {'type': str, 'default': 'youtube-dl'}, 'NODE_BINARY': {'type': str, 'default': 'node'}, + 'RIPGREP_BINARY': {'type': str, 'default': 'rg'}, 
'CHROME_BINARY': {'type': str, 'default': None}, + + 'POCKET_CONSUMER_KEY': {'type': str, 'default': None}, + 'POCKET_ACCESS_TOKENS': {'type': dict, 'default': {}}, }, } + +########################## Backwards-Compatibility ############################# + + # for backwards compatibility with old config files, check old/deprecated names for each key CONFIG_ALIASES = { alias: key - for section in CONFIG_DEFAULTS.values() + for section in CONFIG_SCHEMA.values() for key, default in section.items() for alias in default.get('aliases', ()) } -USER_CONFIG = {key for section in CONFIG_DEFAULTS.values() for key in section.keys()} +USER_CONFIG = {key for section in CONFIG_SCHEMA.values() for key in section.keys()} def get_real_name(key: str) -> str: + """get the current canonical name for a given deprecated config key""" return CONFIG_ALIASES.get(key.upper().strip(), key.upper().strip()) -############################## Derived Config ############################## -# Constants + +################################ Constants ##################################### + +PACKAGE_DIR_NAME = 'archivebox' +TEMPLATES_DIR_NAME = 'themes' + +ARCHIVE_DIR_NAME = 'archive' +SOURCES_DIR_NAME = 'sources' +LOGS_DIR_NAME = 'logs' +STATIC_DIR_NAME = 'static' +SQL_INDEX_FILENAME = 'index.sqlite3' +JSON_INDEX_FILENAME = 'index.json' +HTML_INDEX_FILENAME = 'index.html' +ROBOTS_TXT_FILENAME = 'robots.txt' +FAVICON_FILENAME = 'favicon.ico' +CONFIG_FILENAME = 'ArchiveBox.conf' DEFAULT_CLI_COLORS = { 'reset': '\033[00;00m', @@ -225,42 +265,18 @@ STATICFILE_EXTENSIONS = { # html, htm, shtml, xhtml, xml, aspx, php, cgi } -PACKAGE_DIR_NAME = 'archivebox' -TEMPLATES_DIR_NAME = 'themes' - -ARCHIVE_DIR_NAME = 'archive' -SOURCES_DIR_NAME = 'sources' -LOGS_DIR_NAME = 'logs' -STATIC_DIR_NAME = 'static' -SQL_INDEX_FILENAME = 'index.sqlite3' -JSON_INDEX_FILENAME = 'index.json' -HTML_INDEX_FILENAME = 'index.html' -ROBOTS_TXT_FILENAME = 'robots.txt' -FAVICON_FILENAME = 'favicon.ico' -CONFIG_FILENAME = 
'ArchiveBox.conf' - -CONFIG_HEADER = ( -"""# This is the config file for your ArchiveBox collection. -# -# You can add options here manually in INI format, or automatically by running: -# archivebox config --set KEY=VALUE -# -# If you modify this file manually, make sure to update your archive after by running: -# archivebox init -# -# A list of all possible config with documentation and examples can be found here: -# https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration - -""") -DERIVED_CONFIG_DEFAULTS: ConfigDefaultDict = { +############################## Derived Config ################################## + + +DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = { 'TERM_WIDTH': {'default': lambda c: lambda: shutil.get_terminal_size((100, 10)).columns}, 'USER': {'default': lambda c: getpass.getuser() or os.getlogin()}, 'ANSI': {'default': lambda c: DEFAULT_CLI_COLORS if c['USE_COLOR'] else {k: '' for k in DEFAULT_CLI_COLORS.keys()}}, 'PACKAGE_DIR': {'default': lambda c: Path(__file__).resolve().parent}, - 'TEMPLATES_DIR': {'default': lambda c: c['PACKAGE_DIR'] / TEMPLATES_DIR_NAME / 'legacy'}, + 'TEMPLATES_DIR': {'default': lambda c: c['PACKAGE_DIR'] / TEMPLATES_DIR_NAME}, 'OUTPUT_DIR': {'default': lambda c: Path(c['OUTPUT_DIR']).resolve() if c['OUTPUT_DIR'] else Path(os.curdir).resolve()}, 'ARCHIVE_DIR': {'default': lambda c: c['OUTPUT_DIR'] / ARCHIVE_DIR_NAME}, @@ -297,6 +313,7 @@ DERIVED_CONFIG_DEFAULTS: ConfigDefaultDict = { 'SAVE_WARC': {'default': lambda c: c['USE_WGET'] and c['SAVE_WARC']}, 'WGET_ARGS': {'default': lambda c: c['WGET_ARGS'] or []}, + 'RIPGREP_VERSION': {'default': lambda c: bin_version(c['RIPGREP_BINARY']) if c['USE_RIPGREP'] else None}, 'USE_SINGLEFILE': {'default': lambda c: c['USE_SINGLEFILE'] and c['SAVE_SINGLEFILE']}, 'SINGLEFILE_VERSION': {'default': lambda c: bin_version(c['SINGLEFILE_BINARY']) if c['USE_SINGLEFILE'] else None}, @@ -305,7 +322,7 @@ DERIVED_CONFIG_DEFAULTS: ConfigDefaultDict = { 'READABILITY_VERSION': {'default': lambda c: 
bin_version(c['READABILITY_BINARY']) if c['USE_READABILITY'] else None}, 'USE_MERCURY': {'default': lambda c: c['USE_MERCURY'] and c['SAVE_MERCURY']}, - 'MERCURY_VERSION': {'default': lambda c: '1.0.0' if (c['USE_MERCURY'] and c['MERCURY_BINARY']) else None}, # mercury is unversioned + 'MERCURY_VERSION': {'default': lambda c: '1.0.0' if shutil.which(str(bin_path(c['MERCURY_BINARY']))) else None}, # mercury is unversioned 'USE_GIT': {'default': lambda c: c['USE_GIT'] and c['SAVE_GIT']}, 'GIT_VERSION': {'default': lambda c: bin_version(c['GIT_BINARY']) if c['USE_GIT'] else None}, @@ -319,8 +336,6 @@ DERIVED_CONFIG_DEFAULTS: ConfigDefaultDict = { 'USE_CHROME': {'default': lambda c: c['USE_CHROME'] and (c['SAVE_PDF'] or c['SAVE_SCREENSHOT'] or c['SAVE_DOM'] or c['SAVE_SINGLEFILE'])}, 'CHROME_BINARY': {'default': lambda c: c['CHROME_BINARY'] if c['CHROME_BINARY'] else find_chrome_binary()}, 'CHROME_VERSION': {'default': lambda c: bin_version(c['CHROME_BINARY']) if c['USE_CHROME'] else None}, - 'USE_NODE': {'default': lambda c: c['USE_NODE'] and (c['SAVE_READABILITY'] or c['SAVE_SINGLEFILE'])}, - 'NODE_VERSION': {'default': lambda c: bin_version(c['NODE_BINARY']) if c['USE_NODE'] else None}, 'SAVE_PDF': {'default': lambda c: c['USE_CHROME'] and c['SAVE_PDF']}, 'SAVE_SCREENSHOT': {'default': lambda c: c['USE_CHROME'] and c['SAVE_SCREENSHOT']}, @@ -328,6 +343,9 @@ DERIVED_CONFIG_DEFAULTS: ConfigDefaultDict = { 'SAVE_SINGLEFILE': {'default': lambda c: c['USE_CHROME'] and c['SAVE_SINGLEFILE'] and c['USE_NODE']}, 'SAVE_READABILITY': {'default': lambda c: c['USE_READABILITY'] and c['USE_NODE']}, 'SAVE_MERCURY': {'default': lambda c: c['USE_MERCURY'] and c['USE_NODE']}, + + 'USE_NODE': {'default': lambda c: c['USE_NODE'] and (c['SAVE_READABILITY'] or c['SAVE_SINGLEFILE'] or c['SAVE_MERCURY'])}, + 'NODE_VERSION': {'default': lambda c: bin_version(c['NODE_BINARY']) if c['USE_NODE'] else None}, 'DEPENDENCIES': {'default': lambda c: get_dependency_info(c)}, 'CODE_LOCATIONS': 
{'default': lambda c: get_code_locations(c)}, @@ -340,6 +358,7 @@ DERIVED_CONFIG_DEFAULTS: ConfigDefaultDict = { ################################### Helpers #################################### + def load_config_val(key: str, default: ConfigDefaultValue=None, type: Optional[Type]=None, @@ -386,7 +405,7 @@ def load_config_val(key: str, raise ValueError(f'Invalid configuration option {key}={val} (expected an integer)') return int(val) - elif type is list: + elif type is list or type is dict: return json.loads(val) raise Exception('Config values can only be str, bool, int or json') @@ -418,6 +437,20 @@ def write_config_file(config: Dict[str, str], out_dir: str=None) -> ConfigDict: from .system import atomic_write + CONFIG_HEADER = ( + """# This is the config file for your ArchiveBox collection. + # + # You can add options here manually in INI format, or automatically by running: + # archivebox config --set KEY=VALUE + # + # If you modify this file manually, make sure to update your archive after by running: + # archivebox init + # + # A list of all possible config with documentation and examples can be found here: + # https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration + + """) + out_dir = out_dir or Path(os.getenv('OUTPUT_DIR', '.')).resolve() config_path = Path(out_dir) / CONFIG_FILENAME @@ -431,7 +464,7 @@ def write_config_file(config: Dict[str, str], out_dir: str=None) -> ConfigDict: with open(config_path, 'r') as old: atomic_write(f'{config_path}.bak', old.read()) - find_section = lambda key: [name for name, opts in CONFIG_DEFAULTS.items() if key in opts][0] + find_section = lambda key: [name for name, opts in CONFIG_SCHEMA.items() if key in opts][0] # Set up sections in empty config file for key, val in config.items(): @@ -520,6 +553,8 @@ def load_config(defaults: ConfigDefaultDict, # with open(os.path.join(config['OUTPUT_DIR'], CONFIG_FILENAME), 'w+') as f: + +# Logging Helpers def stdout(*args, color: Optional[str]=None, prefix: str='', config: 
Optional[ConfigDict]=None) -> None: ansi = DEFAULT_CLI_COLORS if (config or {}).get('USE_COLOR') else ANSI @@ -551,6 +586,7 @@ def hint(text: Union[Tuple[str, ...], List[str], str], prefix=' ', config: Op stderr('{} {}'.format(prefix, line)) +# Dependency Metadata Helpers def bin_version(binary: Optional[str]) -> Optional[str]: """check the presence and return valid version line of a specified binary""" @@ -580,7 +616,7 @@ def bin_path(binary: Optional[str]) -> Optional[str]: if node_modules_bin.exists(): return str(node_modules_bin.resolve()) - return shutil.which(Path(binary).expanduser()) or binary + return shutil.which(str(Path(binary).expanduser())) or shutil.which(str(binary)) or binary def bin_hash(binary: Optional[str]) -> Optional[str]: if binary is None: @@ -667,7 +703,7 @@ def get_code_locations(config: ConfigDict) -> SimpleConfigValueDict: 'TEMPLATES_DIR': { 'path': (config['TEMPLATES_DIR']).resolve(), 'enabled': True, - 'is_valid': (config['TEMPLATES_DIR'] / 'static').exists(), + 'is_valid': (config['TEMPLATES_DIR'] / config['ACTIVE_THEME'] / 'static').exists(), }, # 'NODE_MODULES_DIR': { # 'path': , @@ -811,6 +847,21 @@ def get_dependency_info(config: ConfigDict) -> ConfigValue: 'enabled': config['USE_CHROME'], 'is_valid': bool(config['CHROME_VERSION']), }, + 'RIPGREP_BINARY': { + 'path': bin_path(config['RIPGREP_BINARY']), + 'version': config['RIPGREP_VERSION'], + 'hash': bin_hash(config['RIPGREP_BINARY']), + 'enabled': config['USE_RIPGREP'], + 'is_valid': bool(config['RIPGREP_VERSION']), + }, + # TODO: add an entry for the sonic search backend? 
+ # 'SONIC_BINARY': { + # 'path': bin_path(config['SONIC_BINARY']), + # 'version': config['SONIC_VERSION'], + # 'hash': bin_hash(config['SONIC_BINARY']), + # 'enabled': config['USE_SONIC'], + # 'is_valid': bool(config['SONIC_VERSION']), + # }, } def get_chrome_info(config: ConfigDict) -> ConfigValue: @@ -826,28 +877,51 @@ def get_chrome_info(config: ConfigDict) -> ConfigValue: } -################################## Load Config ################################# +# ****************************************************************************** +# ****************************************************************************** +# ******************************** Load Config ********************************* +# ******* (compile the defaults, configs, and metadata all into CONFIG) ******** +# ****************************************************************************** +# ****************************************************************************** def load_all_config(): CONFIG: ConfigDict = {} - for section_name, section_config in CONFIG_DEFAULTS.items(): + for section_name, section_config in CONFIG_SCHEMA.items(): CONFIG = load_config(section_config, CONFIG) - return load_config(DERIVED_CONFIG_DEFAULTS, CONFIG) + return load_config(DYNAMIC_CONFIG_SCHEMA, CONFIG) +# add all final config values in CONFIG to globals in this file CONFIG = load_all_config() globals().update(CONFIG) +# this lets us do: from .config import DEBUG, MEDIA_TIMEOUT, ... 
-# Timezone set as UTC + +# ****************************************************************************** +# ****************************************************************************** +# ****************************************************************************** +# ****************************************************************************** +# ****************************************************************************** + + + +########################### System Environment Setup ########################### + + +# Set timezone to UTC and umask to OUTPUT_PERMISSIONS os.environ["TZ"] = 'UTC' +os.umask(0o777 - int(OUTPUT_PERMISSIONS, base=8)) # noqa: F821 # add ./node_modules/.bin to $PATH so we can use node scripts in extractors NODE_BIN_PATH = str((Path(CONFIG["OUTPUT_DIR"]).absolute() / 'node_modules' / '.bin')) sys.path.append(NODE_BIN_PATH) -############################## Importable Checkers ############################# + + +########################### Config Validity Checkers ########################### + def check_system_config(config: ConfigDict=CONFIG) -> None: ### Check system environment @@ -936,7 +1010,7 @@ def check_dependencies(config: ConfigDict=CONFIG, show_help: bool=True) -> None: stderr(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#save_media') stderr() -def check_data_folder(out_dir: Optional[str]=None, config: ConfigDict=CONFIG) -> None: +def check_data_folder(out_dir: Union[str, Path, None]=None, config: ConfigDict=CONFIG) -> None: output_dir = out_dir or config['OUTPUT_DIR'] assert isinstance(output_dir, (str, Path)) @@ -976,7 +1050,7 @@ def check_data_folder(out_dir: Optional[str]=None, config: ConfigDict=CONFIG) -> -def setup_django(out_dir: Path=None, check_db=False, config: ConfigDict=CONFIG) -> None: +def setup_django(out_dir: Path=None, check_db=False, config: ConfigDict=CONFIG, in_memory_db=False) -> None: check_system_config() output_dir = out_dir or Path(config['OUTPUT_DIR']) @@ -989,7 +1063,15 @@ def 
setup_django(out_dir: Path=None, check_db=False, config: ConfigDict=CONFIG) os.environ.setdefault('OUTPUT_DIR', str(output_dir)) assert (config['PACKAGE_DIR'] / 'core' / 'settings.py').exists(), 'settings.py was not found at archivebox/core/settings.py' os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'core.settings') - django.setup() + + if in_memory_db: + # Put the db in memory and run migrations in case any command requires it + from django.core.management import call_command + os.environ.setdefault("ARCHIVEBOX_DATABASE_NAME", ":memory:") + django.setup() + call_command("migrate", interactive=False, verbosity=0) + else: + django.setup() if check_db: sql_index_path = Path(output_dir) / SQL_INDEX_FILENAME @@ -997,5 +1079,3 @@ def setup_django(out_dir: Path=None, check_db=False, config: ConfigDict=CONFIG) f'No database file {SQL_INDEX_FILENAME} found in OUTPUT_DIR: {config["OUTPUT_DIR"]}') except KeyboardInterrupt: raise SystemExit(2) - -os.umask(0o777 - int(OUTPUT_PERMISSIONS, base=8)) # noqa: F821 diff --git a/archivebox/core/admin.py b/archivebox/core/admin.py index b15507a4..832bea38 100644 --- a/archivebox/core/admin.py +++ b/archivebox/core/admin.py @@ -13,8 +13,10 @@ from django import forms from core.models import Snapshot, Tag from core.forms import AddLinkForm, TagField -from core.utils import get_icons +from core.mixins import SearchResultsAdminMixin + +from index.html import snapshot_icons from util import htmldecode, urldecode, ansi_to_html from logging_util import printable_filesize from main import add, remove @@ -82,7 +84,7 @@ class SnapshotAdminForm(forms.ModelForm): return instance -class SnapshotAdmin(admin.ModelAdmin): +class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin): list_display = ('added', 'title_str', 'url_str', 'files', 'size') sort_fields = ('title_str', 'url_str', 'added') readonly_fields = ('id', 'url', 'timestamp', 'num_outputs', 'is_archived', 'url_hash', 'added', 'updated') @@ -94,6 +96,13 @@ class 
SnapshotAdmin(admin.ModelAdmin): actions_template = 'admin/actions_as_select.html' form = SnapshotAdminForm + def get_urls(self): + urls = super().get_urls() + custom_urls = [ + path('grid/', self.admin_site.admin_view(self.grid_view),name='grid') + ] + return custom_urls + urls + def get_queryset(self, request): return super().get_queryset(request).prefetch_related('tags') @@ -128,7 +137,7 @@ class SnapshotAdmin(admin.ModelAdmin): ) + mark_safe(f' {tags}') def files(self, obj): - return get_icons(obj) + return snapshot_icons(obj) def size(self, obj): archive_size = obj.archive_size @@ -151,6 +160,31 @@ class SnapshotAdmin(admin.ModelAdmin): obj.url.split('://www.', 1)[-1].split('://', 1)[-1][:64], ) + def grid_view(self, request): + + # cl = self.get_changelist_instance(request) + + # Save before monkey patching to restore for changelist list view + saved_change_list_template = self.change_list_template + saved_list_per_page = self.list_per_page + saved_list_max_show_all = self.list_max_show_all + + # Monkey patch here plus core_tags.py + self.change_list_template = 'admin/grid_change_list.html' + self.list_per_page = 20 + self.list_max_show_all = self.list_per_page + + # Call monkey patched view + rendered_response = self.changelist_view(request) + + # Restore values + self.change_list_template = saved_change_list_template + self.list_per_page = saved_list_per_page + self.list_max_show_all = saved_list_max_show_all + + return rendered_response + + id_str.short_description = 'ID' title_str.short_description = 'Title' url_str.short_description = 'Original URL' @@ -216,7 +250,6 @@ class ArchiveBoxAdmin(admin.AdminSite): return render(template_name='add_links.html', request=request, context=context) - admin.site = ArchiveBoxAdmin() admin.site.register(get_user_model()) admin.site.register(Snapshot, SnapshotAdmin) diff --git a/archivebox/core/forms.py b/archivebox/core/forms.py index 8f48929b..86b29bb7 100644 --- a/archivebox/core/forms.py +++ 
b/archivebox/core/forms.py @@ -3,18 +3,29 @@ __package__ = 'archivebox.core' from django import forms from ..util import URL_REGEX -from .utils_taggit import edit_string_for_tags, parse_tags +from ..vendor.taggit_utils import edit_string_for_tags, parse_tags CHOICES = ( ('0', 'depth = 0 (archive just these URLs)'), ('1', 'depth = 1 (archive these URLs and all URLs one hop away)'), ) +from ..extractors import get_default_archive_methods + +ARCHIVE_METHODS = [ + (name, name) + for name, _, _ in get_default_archive_methods() +] + + class AddLinkForm(forms.Form): url = forms.RegexField(label="URLs (one per line)", regex=URL_REGEX, min_length='6', strip=True, widget=forms.Textarea, required=True) depth = forms.ChoiceField(label="Archive depth", choices=CHOICES, widget=forms.RadioSelect, initial='0') - - + archive_methods = forms.MultipleChoiceField( + required=False, + widget=forms.SelectMultiple, + choices=ARCHIVE_METHODS, + ) class TagWidgetMixin: def format_value(self, value): if value is not None and not isinstance(value, str): diff --git a/archivebox/core/migrations/0007_archiveresult.py b/archivebox/core/migrations/0007_archiveresult.py new file mode 100644 index 00000000..a780376f --- /dev/null +++ b/archivebox/core/migrations/0007_archiveresult.py @@ -0,0 +1,97 @@ +# Generated by Django 3.0.8 on 2020-11-04 12:25 + +import json +from pathlib import Path + +from django.db import migrations, models +import django.db.models.deletion + +from config import CONFIG +from index.json import to_json + +try: + JSONField = models.JSONField +except AttributeError: + import jsonfield + JSONField = jsonfield.JSONField + + +def forwards_func(apps, schema_editor): + from core.models import EXTRACTORS + + Snapshot = apps.get_model("core", "Snapshot") + ArchiveResult = apps.get_model("core", "ArchiveResult") + + snapshots = Snapshot.objects.all() + for snapshot in snapshots: + out_dir = Path(CONFIG['ARCHIVE_DIR']) / snapshot.timestamp + + try: + with open(out_dir / "index.json", 
"r") as f: + fs_index = json.load(f) + except Exception as e: + continue + + history = fs_index["history"] + + for extractor in history: + for result in history[extractor]: + ArchiveResult.objects.create(extractor=extractor, snapshot=snapshot, cmd=result["cmd"], cmd_version=result["cmd_version"], + start_ts=result["start_ts"], end_ts=result["end_ts"], status=result["status"], pwd=result["pwd"], output=result["output"]) + + +def verify_json_index_integrity(snapshot): + results = snapshot.archiveresult_set.all() + out_dir = Path(CONFIG['ARCHIVE_DIR']) / snapshot.timestamp + with open(out_dir / "index.json", "r") as f: + index = json.load(f) + + history = index["history"] + index_results = [result for extractor in history for result in history[extractor]] + flattened_results = [result["start_ts"] for result in index_results] + + missing_results = [result for result in results if result.start_ts.isoformat() not in flattened_results] + + for missing in missing_results: + index["history"][missing.extractor].append({"cmd": missing.cmd, "cmd_version": missing.cmd_version, "end_ts": missing.end_ts.isoformat(), + "start_ts": missing.start_ts.isoformat(), "pwd": missing.pwd, "output": missing.output, + "schema": "ArchiveResult", "status": missing.status}) + + json_index = to_json(index) + with open(out_dir / "index.json", "w") as f: + f.write(json_index) + + +def reverse_func(apps, schema_editor): + Snapshot = apps.get_model("core", "Snapshot") + ArchiveResult = apps.get_model("core", "ArchiveResult") + for snapshot in Snapshot.objects.all(): + verify_json_index_integrity(snapshot) + + ArchiveResult.objects.all().delete() + + +class Migration(migrations.Migration): + + dependencies = [ + ('core', '0006_auto_20201012_1520'), + ] + + operations = [ + migrations.CreateModel( + name='ArchiveResult', + fields=[ + ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('cmd', JSONField()), + ('pwd', models.CharField(max_length=256)), + 
('cmd_version', models.CharField(max_length=32)), + ('status', models.CharField(choices=[('succeeded', 'succeeded'), ('failed', 'failed'), ('skipped', 'skipped')], max_length=16)), + ('output', models.CharField(max_length=512)), + ('start_ts', models.DateTimeField()), + ('end_ts', models.DateTimeField()), + ('extractor', models.CharField(choices=[('title', 'title'), ('favicon', 'favicon'), ('wget', 'wget'), ('singlefile', 'singlefile'), ('pdf', 'pdf'), ('screenshot', 'screenshot'), ('dom', 'dom'), ('readability', 'readability'), ('mercury', 'mercury'), ('git', 'git'), ('media', 'media'), ('headers', 'headers'), ('archive_org', 'archive_org')], max_length=32)), + ('snapshot', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='core.Snapshot')), + ], + ), + migrations.RunPython(forwards_func, reverse_func), + ] diff --git a/archivebox/core/mixins.py b/archivebox/core/mixins.py new file mode 100644 index 00000000..538ca1e3 --- /dev/null +++ b/archivebox/core/mixins.py @@ -0,0 +1,23 @@ +from django.contrib import messages + +from archivebox.search import query_search_index + +class SearchResultsAdminMixin(object): + def get_search_results(self, request, queryset, search_term): + ''' Enhances the search queryset with results from the search backend. 
+ ''' + qs, use_distinct = \ + super(SearchResultsAdminMixin, self).get_search_results( + request, queryset, search_term) + + search_term = search_term.strip() + if not search_term: + return qs, use_distinct + try: + qsearch = query_search_index(search_term) + except Exception as err: + messages.add_message(request, messages.WARNING, f'Error from the search backend, only showing results from default admin search fields - Error: {err}') + else: + qs = queryset & qsearch + finally: + return qs, use_distinct diff --git a/archivebox/core/models.py b/archivebox/core/models.py index f43fc631..d50e8f40 100644 --- a/archivebox/core/models.py +++ b/archivebox/core/models.py @@ -5,9 +5,24 @@ import uuid from django.db import models, transaction from django.utils.functional import cached_property from django.utils.text import slugify +from django.db.models import Case, When, Value, IntegerField from ..util import parse_date from ..index.schema import Link +from ..extractors import get_default_archive_methods, ARCHIVE_METHODS_INDEXING_PRECEDENCE + +EXTRACTORS = [(extractor[0], extractor[0]) for extractor in get_default_archive_methods()] +STATUS_CHOICES = [ + ("succeeded", "succeeded"), + ("failed", "failed"), + ("skipped", "skipped") +] + +try: + JSONField = models.JSONField +except AttributeError: + import jsonfield + JSONField = jsonfield.JSONField class Tag(models.Model): @@ -51,6 +66,7 @@ class Tag(models.Model): else: return super().save(*args, **kwargs) + class Snapshot(models.Model): id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False) @@ -83,7 +99,7 @@ class Snapshot(models.Model): return { key: getattr(self, key) if key != 'tags' else self.tags_str() - for key in args + for key in args } def as_link(self) -> Link: @@ -92,7 +108,7 @@ class Snapshot(models.Model): def as_link_with_details(self) -> Link: from ..index import load_link_details return load_link_details(self.as_link()) - + def tags_str(self) -> str: return 
','.join(self.tags.order_by('name').values_list('name', flat=True)) @@ -106,7 +122,7 @@ class Snapshot(models.Model): @cached_property def num_outputs(self): - return self.as_link().num_outputs + return self.archiveresult_set.filter(status='succeeded').count() @cached_property def url_hash(self): @@ -130,8 +146,8 @@ class Snapshot(models.Model): @cached_property def history(self): - from ..index import load_link_details - return load_link_details(self.as_link()).history + # TODO: use ArchiveResult for this instead of json + return self.as_link_with_details().history @cached_property def latest_title(self): @@ -142,9 +158,37 @@ class Snapshot(models.Model): return self.history['title'][-1].output.strip() return None - def save_tags(self, tags=[]): + def save_tags(self, tags=()): tags_id = [] for tag in tags: tags_id.append(Tag.objects.get_or_create(name=tag)[0].id) self.tags.clear() self.tags.add(*tags_id) + + +class ArchiveResultManager(models.Manager): + def indexable(self, sorted: bool = True): + INDEXABLE_METHODS = [ r[0] for r in ARCHIVE_METHODS_INDEXING_PRECEDENCE ] + qs = self.get_queryset().filter(extractor__in=INDEXABLE_METHODS,status='succeeded') + + if sorted: + precedence = [ When(extractor=method, then=Value(precedence)) for method, precedence in ARCHIVE_METHODS_INDEXING_PRECEDENCE ] + qs = qs.annotate(indexing_precedence=Case(*precedence, default=Value(1000),output_field=IntegerField())).order_by('indexing_precedence') + return qs + + +class ArchiveResult(models.Model): + snapshot = models.ForeignKey(Snapshot, on_delete=models.CASCADE) + cmd = JSONField() + pwd = models.CharField(max_length=256) + cmd_version = models.CharField(max_length=32) + output = models.CharField(max_length=512) + start_ts = models.DateTimeField() + end_ts = models.DateTimeField() + status = models.CharField(max_length=16, choices=STATUS_CHOICES) + extractor = models.CharField(choices=EXTRACTORS, max_length=32) + + objects = ArchiveResultManager() + + def __str__(self): + return 
self.extractor diff --git a/archivebox/core/settings.py b/archivebox/core/settings.py index 28a3e1fe..e8ed6b16 100644 --- a/archivebox/core/settings.py +++ b/archivebox/core/settings.py @@ -12,6 +12,7 @@ from ..config import ( ALLOWED_HOSTS, PACKAGE_DIR, ACTIVE_THEME, + TEMPLATES_DIR_NAME, SQL_INDEX_FILENAME, OUTPUT_DIR, ) @@ -68,14 +69,14 @@ AUTHENTICATION_BACKENDS = [ STATIC_URL = '/static/' STATICFILES_DIRS = [ - str(Path(PACKAGE_DIR) / 'themes' / ACTIVE_THEME / 'static'), - str(Path(PACKAGE_DIR) / 'themes' / 'default' / 'static'), + str(Path(PACKAGE_DIR) / TEMPLATES_DIR_NAME / ACTIVE_THEME / 'static'), + str(Path(PACKAGE_DIR) / TEMPLATES_DIR_NAME / 'default' / 'static'), ] TEMPLATE_DIRS = [ - str(Path(PACKAGE_DIR) / 'themes' / ACTIVE_THEME), - str(Path(PACKAGE_DIR) / 'themes' / 'default'), - str(Path(PACKAGE_DIR) / 'themes'), + str(Path(PACKAGE_DIR) / TEMPLATES_DIR_NAME / ACTIVE_THEME), + str(Path(PACKAGE_DIR) / TEMPLATES_DIR_NAME / 'default'), + str(Path(PACKAGE_DIR) / TEMPLATES_DIR_NAME), ] TEMPLATES = [ @@ -100,10 +101,12 @@ TEMPLATES = [ ################################################################################ DATABASE_FILE = Path(OUTPUT_DIR) / SQL_INDEX_FILENAME +DATABASE_NAME = os.environ.get("ARCHIVEBOX_DATABASE_NAME", DATABASE_FILE) + DATABASES = { 'default': { 'ENGINE': 'django.db.backends.sqlite3', - 'NAME': str(DATABASE_FILE), + 'NAME': DATABASE_NAME, } } diff --git a/archivebox/core/templatetags/__init__.py b/archivebox/core/templatetags/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/archivebox/core/templatetags/core_tags.py b/archivebox/core/templatetags/core_tags.py new file mode 100644 index 00000000..25f06852 --- /dev/null +++ b/archivebox/core/templatetags/core_tags.py @@ -0,0 +1,47 @@ +from django import template +from django.urls import reverse +from django.contrib.admin.templatetags.base import InclusionAdminNode +from django.templatetags.static import static + + +from typing import Union + +from core.models 
import ArchiveResult + +register = template.Library() + +@register.simple_tag +def snapshot_image(snapshot): + result = ArchiveResult.objects.filter(snapshot=snapshot, extractor='screenshot', status='succeeded').first() + if result: + return reverse('LinkAssets', args=[f'{str(snapshot.timestamp)}/{result.output}']) + + return static('archive.png') + +@register.filter +def file_size(num_bytes: Union[int, float]) -> str: + for count in ['Bytes','KB','MB','GB']: + if num_bytes > -1024.0 and num_bytes < 1024.0: + return '%3.1f %s' % (num_bytes, count) + num_bytes /= 1024.0 + return '%3.1f %s' % (num_bytes, 'TB') + +def result_list(cl): + """ + Monkey patched result + """ + num_sorted_fields = 0 + return { + 'cl': cl, + 'num_sorted_fields': num_sorted_fields, + 'results': cl.result_list, + } + +@register.tag(name='snapshots_grid') +def result_list_tag(parser, token): + return InclusionAdminNode( + parser, token, + func=result_list, + template_name='snapshots_grid.html', + takes_context=False, + ) diff --git a/archivebox/core/utils.py b/archivebox/core/utils.py deleted file mode 100644 index 75c9c4e7..00000000 --- a/archivebox/core/utils.py +++ /dev/null @@ -1,39 +0,0 @@ -from pathlib import Path - -from django.utils.html import format_html - -from core.models import Snapshot - - -def get_icons(snapshot: Snapshot) -> str: - link = snapshot.as_link() - canon = link.canonical_outputs() - out_dir = Path(link.link_dir) - - # slow version: highlights icons based on whether files exist or not for that output - # link_tuple = lambda link, method: (link.archive_path, canon[method] or '', canon[method] and (out_dir / (canon[method] or 'notdone')).exists()) - # fast version: all icons are highlighted without checking for outputs in filesystem - link_tuple = lambda link, method: (link.archive_path, canon[method] or '', canon[method] and (out_dir / (canon[method] or 'notdone')).exists()) - - return format_html( - '' - '' - '🆆 ' - '🅷 ' - '📄 ' - '💻 ' - '📦 ' - '📼 ' - '🅶 ' - '🏛 ' - '', 
- *link_tuple(link, 'singlefile_path'), - *link_tuple(link, 'wget_path')[:2], any((out_dir / link.domain).glob('*')), - *link_tuple(link, 'dom_path'), - *link_tuple(link, 'pdf_path'), - *link_tuple(link, 'screenshot_path'), - *link_tuple(link, 'warc_path')[:2], any((out_dir / canon['warc_path']).glob('*.warc.gz')), - *link_tuple(link, 'media_path')[:2], any((out_dir / canon['media_path']).glob('*')), - *link_tuple(link, 'git_path')[:2], any((out_dir / canon['git_path']).glob('*')), - canon['archive_org_path'], (out_dir / 'archive.org.txt').exists(), - ) diff --git a/archivebox/core/utils_taggit.py b/archivebox/core/utils_taggit.py deleted file mode 100644 index 5a2d511d..00000000 --- a/archivebox/core/utils_taggit.py +++ /dev/null @@ -1,113 +0,0 @@ -# Taken from https://github.com/jazzband/django-taggit/blob/3b56adb637ab95aca5036c37a358402c825a367c/taggit/utils.py - -def parse_tags(tagstring): - """ - Parses tag input, with multiple word input being activated and - delineated by commas and double quotes. Quotes take precedence, so - they may contain commas. - - Returns a sorted list of unique tag names. - - Ported from Jonathan Buchanan's `django-tagging - `_ - """ - if not tagstring: - return [] - - # Special case - if there are no commas or double quotes in the - # input, we don't *do* a recall... I mean, we know we only need to - # split on spaces. - if "," not in tagstring and '"' not in tagstring: - words = list(set(split_strip(tagstring, " "))) - words.sort() - return words - - words = [] - buffer = [] - # Defer splitting of non-quoted sections until we know if there are - # any unquoted commas. 
- to_be_split = [] - saw_loose_comma = False - open_quote = False - i = iter(tagstring) - try: - while True: - c = next(i) - if c == '"': - if buffer: - to_be_split.append("".join(buffer)) - buffer = [] - # Find the matching quote - open_quote = True - c = next(i) - while c != '"': - buffer.append(c) - c = next(i) - if buffer: - word = "".join(buffer).strip() - if word: - words.append(word) - buffer = [] - open_quote = False - else: - if not saw_loose_comma and c == ",": - saw_loose_comma = True - buffer.append(c) - except StopIteration: - # If we were parsing an open quote which was never closed treat - # the buffer as unquoted. - if buffer: - if open_quote and "," in buffer: - saw_loose_comma = True - to_be_split.append("".join(buffer)) - if to_be_split: - if saw_loose_comma: - delimiter = "," - else: - delimiter = " " - for chunk in to_be_split: - words.extend(split_strip(chunk, delimiter)) - words = list(set(words)) - words.sort() - return words - - -def split_strip(string, delimiter=","): - """ - Splits ``string`` on ``delimiter``, stripping each resulting string - and returning a list of non-empty strings. - - Ported from Jonathan Buchanan's `django-tagging - `_ - """ - if not string: - return [] - - words = [w.strip() for w in string.split(delimiter)] - return [w for w in words if w] - - -def edit_string_for_tags(tags): - """ - Given list of ``Tag`` instances, creates a string representation of - the list suitable for editing by the user, such that submitting the - given string representation back without changing it will give the - same list of tags. - - Tag names which contain commas will be double quoted. - - If any tag name which isn't being quoted contains whitespace, the - resulting string of tag names will be comma-delimited, otherwise - it will be space-delimited. 
- - Ported from Jonathan Buchanan's `django-tagging - `_ - """ - names = [] - for tag in tags: - name = tag.name - if "," in name or " " in name: - names.append('"%s"' % name) - else: - names.append(name) - return ", ".join(sorted(names)) diff --git a/archivebox/core/views.py b/archivebox/core/views.py index dfea7700..b46e364e 100644 --- a/archivebox/core/views.py +++ b/archivebox/core/views.py @@ -12,17 +12,19 @@ from django.views.generic import FormView from django.contrib.auth.mixins import UserPassesTestMixin from core.models import Snapshot -from core.utils import get_icons from core.forms import AddLinkForm from ..config import ( OUTPUT_DIR, PUBLIC_INDEX, PUBLIC_SNAPSHOTS, - PUBLIC_ADD_VIEW + PUBLIC_ADD_VIEW, + VERSION, + FOOTER_INFO, ) from main import add from ..util import base_url, ansi_to_html +from ..index.html import snapshot_icons class MainIndex(View): @@ -94,13 +96,20 @@ class PublicArchiveView(ListView): paginate_by = 100 ordering = ['title'] + def get_context_data(self, **kwargs): + return { + **super().get_context_data(**kwargs), + 'VERSION': VERSION, + 'FOOTER_INFO': FOOTER_INFO, + } + def get_queryset(self, **kwargs): qs = super().get_queryset(**kwargs) query = self.request.GET.get('q') if query: qs = qs.filter(title__icontains=query) for snapshot in qs: - snapshot.icons = get_icons(snapshot) + snapshot.icons = snapshot_icons(snapshot) return qs def get(self, *args, **kwargs): @@ -127,23 +136,29 @@ class AddView(UserPassesTestMixin, FormView): def test_func(self): return PUBLIC_ADD_VIEW or self.request.user.is_authenticated - def get_context_data(self, *args, **kwargs): - context = super().get_context_data(*args, **kwargs) - context["title"] = "Add URLs" - # We can't just call request.build_absolute_uri in the template, because it would include query parameters - context["absolute_add_path"] = self.request.build_absolute_uri(self.request.path) - return context + def get_context_data(self, **kwargs): + return { + 
**super().get_context_data(**kwargs), + 'title': "Add URLs", + # We can't just call request.build_absolute_uri in the template, because it would include query parameters + 'absolute_add_path': self.request.build_absolute_uri(self.request.path), + 'VERSION': VERSION, + 'FOOTER_INFO': FOOTER_INFO, + } def form_valid(self, form): url = form.cleaned_data["url"] print(f'[+] Adding URL: {url}') depth = 0 if form.cleaned_data["depth"] == "0" else 1 + extractors = ','.join(form.cleaned_data["archive_methods"]) input_kwargs = { "urls": url, "depth": depth, "update_all": False, "out_dir": OUTPUT_DIR, } + if extractors: + input_kwargs.update({"extractors": extractors}) add_stdout = StringIO() with redirect_stdout(add_stdout): add(**input_kwargs) diff --git a/archivebox/extractors/__init__.py b/archivebox/extractors/__init__.py index 60f20adf..a4acef0b 100644 --- a/archivebox/extractors/__init__.py +++ b/archivebox/extractors/__init__.py @@ -8,6 +8,7 @@ from datetime import datetime from django.db.models import QuerySet from ..index.schema import Link +from ..index.sql import write_link_to_sql_index from ..index import ( load_link_details, write_link_details, @@ -22,6 +23,7 @@ from ..logging_util import ( log_archive_method_started, log_archive_method_finished, ) +from ..search import write_search_index from .title import should_save_title, save_title from .favicon import should_save_favicon, save_favicon @@ -37,6 +39,7 @@ from .media import should_save_media, save_media from .archive_org import should_save_archive_dot_org, save_archive_dot_org from .headers import should_save_headers, save_headers + def get_default_archive_methods(): return [ ('title', should_save_title, save_title), @@ -54,6 +57,8 @@ def get_default_archive_methods(): ('archive_org', should_save_archive_dot_org, save_archive_dot_org), ] +ARCHIVE_METHODS_INDEXING_PRECEDENCE = [('readability', 1), ('singlefile', 2), ('dom', 3), ('wget', 4)] + @enforce_types def ignore_methods(to_ignore: List[str]): 
ARCHIVE_METHODS = get_default_archive_methods() @@ -62,9 +67,16 @@ def ignore_methods(to_ignore: List[str]): return list(methods) @enforce_types -def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[str]]=None, out_dir: Optional[Path]=None, skip_index: bool=False) -> Link: +def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[str]]=None, out_dir: Optional[Path]=None) -> Link: """download the DOM, PDF, and a screenshot into a folder named after the link's timestamp""" + # TODO: Remove when the input is changed to be a snapshot. Suboptimal approach. + from core.models import Snapshot, ArchiveResult + try: + snapshot = Snapshot.objects.get(url=link.url) # TODO: This will be unnecessary once everything is a snapshot + except Snapshot.DoesNotExist: + snapshot = write_link_to_sql_index(link) + ARCHIVE_METHODS = get_default_archive_methods() if methods: @@ -80,7 +92,7 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s os.makedirs(out_dir) link = load_link_details(link, out_dir=out_dir) - write_link_details(link, out_dir=out_dir, skip_sql_index=skip_index) + write_link_details(link, out_dir=out_dir, skip_sql_index=False) log_link_archiving_started(link, out_dir, is_new) link = link.overwrite(updated=datetime.now()) stats = {'skipped': 0, 'succeeded': 0, 'failed': 0} @@ -99,6 +111,10 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s stats[result.status] += 1 log_archive_method_finished(result) + write_search_index(link=link, texts=result.index_texts) + ArchiveResult.objects.create(snapshot=snapshot, extractor=method_name, cmd=result.cmd, cmd_version=result.cmd_version, + output=result.output, pwd=result.pwd, start_ts=result.start_ts, end_ts=result.end_ts, status=result.status) + else: # print('{black} X {}{reset}'.format(method_name, **ANSI)) stats['skipped'] += 1 @@ -117,7 +133,7 @@ def archive_link(link: Link, overwrite: bool=False, methods: 
Optional[Iterable[s except Exception: pass - write_link_details(link, out_dir=out_dir, skip_sql_index=skip_index) + write_link_details(link, out_dir=out_dir, skip_sql_index=False) log_link_archiving_finished(link, link.link_dir, is_new, stats) diff --git a/archivebox/extractors/readability.py b/archivebox/extractors/readability.py index bd45e9d5..9da620b4 100644 --- a/archivebox/extractors/readability.py +++ b/archivebox/extractors/readability.py @@ -71,6 +71,7 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO CURL_BINARY, link.url ] + readability_content = None timer = TimedProgress(timeout, prefix=' ') try: document = get_html(link, out_dir) @@ -86,8 +87,9 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO result = run(cmd, cwd=out_dir, timeout=timeout) result_json = json.loads(result.stdout) output_folder.mkdir(exist_ok=True) + readability_content = result_json.pop("textContent") atomic_write(str(output_folder / "content.html"), result_json.pop("content")) - atomic_write(str(output_folder / "content.txt"), result_json.pop("textContent")) + atomic_write(str(output_folder / "content.txt"), readability_content) atomic_write(str(output_folder / "article.json"), result_json) # parse out number of files downloaded from last line of stderr: @@ -117,5 +119,6 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO cmd_version=READABILITY_VERSION, output=output, status=status, - **timer.stats, + index_texts= [readability_content] if readability_content else [], + **timer.stats, ) diff --git a/archivebox/extractors/title.py b/archivebox/extractors/title.py index ff70f689..28cb128f 100644 --- a/archivebox/extractors/title.py +++ b/archivebox/extractors/title.py @@ -20,7 +20,6 @@ from ..config import ( CURL_ARGS, CURL_VERSION, CURL_USER_AGENT, - setup_django, ) from ..logging_util import TimedProgress @@ -81,7 +80,6 @@ def extract_title_with_regex(html): def save_title(link: Link, 
out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult: """try to guess the page's title from its content""" - setup_django(out_dir=out_dir) from core.models import Snapshot output: ArchiveOutput = None diff --git a/archivebox/index/__init__.py b/archivebox/index/__init__.py index dfc1c839..4f4ac3d4 100644 --- a/archivebox/index/__init__.py +++ b/archivebox/index/__init__.py @@ -18,7 +18,6 @@ from ..util import ( ExtendedEncoder, ) from ..config import ( - setup_django, ARCHIVE_DIR_NAME, SQL_INDEX_FILENAME, JSON_INDEX_FILENAME, @@ -51,6 +50,8 @@ from .sql import ( write_sql_link_details, ) +from ..search import search_backend_enabled, query_search_index + ### Link filtering and checking @enforce_types @@ -221,7 +222,7 @@ def timed_index_update(out_path: Path): @enforce_types -def write_main_index(links: List[Link], out_dir: Path=OUTPUT_DIR, finished: bool=False) -> None: +def write_main_index(links: List[Link], out_dir: Path=OUTPUT_DIR) -> None: """Writes links to sqlite3 file for a given list of links""" log_indexing_process_started(len(links)) @@ -241,16 +242,9 @@ def write_main_index(links: List[Link], out_dir: Path=OUTPUT_DIR, finished: bool log_indexing_process_finished() -@enforce_types -def get_empty_snapshot_queryset(out_dir: Path=OUTPUT_DIR): - setup_django(out_dir, check_db=True) - from core.models import Snapshot - return Snapshot.objects.none() - @enforce_types def load_main_index(out_dir: Path=OUTPUT_DIR, warn: bool=True) -> List[Link]: """parse and load existing index with any new links from import_path merged in""" - setup_django(out_dir, check_db=True) from core.models import Snapshot try: return Snapshot.objects.all() @@ -365,7 +359,7 @@ LINK_FILTERS = { } @enforce_types -def snapshot_filter(snapshots: QuerySet, filter_patterns: List[str], filter_type: str='exact') -> QuerySet: +def q_filter(snapshots: QuerySet, filter_patterns: List[str], filter_type: str='exact') -> QuerySet: q_filter = Q() for pattern in filter_patterns: try: @@ 
-380,10 +374,36 @@ def snapshot_filter(snapshots: QuerySet, filter_patterns: List[str], filter_type raise SystemExit(2) return snapshots.filter(q_filter) +def search_filter(snapshots: QuerySet, filter_patterns: List[str], filter_type: str='search') -> QuerySet: + if not search_backend_enabled(): + stderr() + stderr( + '[X] The search backend is not enabled, set config.USE_SEARCHING_BACKEND = True', + color='red', + ) + raise SystemExit(2) + from core.models import Snapshot + + qsearch = Snapshot.objects.none() + for pattern in filter_patterns: + try: + qsearch |= query_search_index(pattern) + except: + raise SystemExit(2) + + return snapshots & qsearch + +@enforce_types +def snapshot_filter(snapshots: QuerySet, filter_patterns: List[str], filter_type: str='exact') -> QuerySet: + if filter_type != 'search': + return q_filter(snapshots, filter_patterns, filter_type) + else: + return search_filter(snapshots, filter_patterns, filter_type) + def get_indexed_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]: """indexed links without checking archive status or data directory validity""" - links = [snapshot.as_link() for snapshot in snapshots.iterator()] + links = [snapshot.as_link_with_details() for snapshot in snapshots.iterator()] return { link.link_dir: link for link in links @@ -391,7 +411,7 @@ def get_indexed_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Option def get_archived_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]: """indexed links that are archived with a valid data directory""" - links = [snapshot.as_link() for snapshot in snapshots.iterator()] + links = [snapshot.as_link_with_details() for snapshot in snapshots.iterator()] return { link.link_dir: link for link in filter(is_archived, links) @@ -399,7 +419,7 @@ def get_archived_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optio def get_unarchived_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]: """indexed 
links that are unarchived with no data directory or an empty data directory""" - links = [snapshot.as_link() for snapshot in snapshots.iterator()] + links = [snapshot.as_link_with_details() for snapshot in snapshots.iterator()] return { link.link_dir: link for link in filter(is_unarchived, links) @@ -424,7 +444,7 @@ def get_present_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Option def get_valid_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]: """dirs with a valid index matched to the main index and archived content""" - links = [snapshot.as_link() for snapshot in snapshots.iterator()] + links = [snapshot.as_link_with_details() for snapshot in snapshots.iterator()] return { link.link_dir: link for link in filter(is_valid, links) diff --git a/archivebox/index/html.py b/archivebox/index/html.py index 793a60af..a62e2c7e 100644 --- a/archivebox/index/html.py +++ b/archivebox/index/html.py @@ -1,12 +1,14 @@ __package__ = 'archivebox.index' -from string import Template from datetime import datetime from typing import List, Optional, Iterator, Mapping from pathlib import Path +from django.utils.html import format_html +from collections import defaultdict + from .schema import Link -from ..system import atomic_write, copy_and_overwrite +from ..system import atomic_write from ..logging_util import printable_filesize from ..util import ( enforce_types, @@ -17,21 +19,15 @@ from ..util import ( ) from ..config import ( OUTPUT_DIR, - TEMPLATES_DIR, VERSION, GIT_SHA, FOOTER_INFO, - ARCHIVE_DIR_NAME, HTML_INDEX_FILENAME, - STATIC_DIR_NAME, - ROBOTS_TXT_FILENAME, - FAVICON_FILENAME, ) -MAIN_INDEX_TEMPLATE = str(Path(TEMPLATES_DIR) / 'main_index.html') -MINIMAL_INDEX_TEMPLATE = str(Path(TEMPLATES_DIR) / 'main_index_minimal.html') -MAIN_INDEX_ROW_TEMPLATE = str(Path(TEMPLATES_DIR) / 'main_index_row.html') -LINK_DETAILS_TEMPLATE = str(Path(TEMPLATES_DIR) / 'link_details.html') +MAIN_INDEX_TEMPLATE = 'main_index.html' +MINIMAL_INDEX_TEMPLATE = 
'main_index_minimal.html' +LINK_DETAILS_TEMPLATE = 'link_details.html' TITLE_LOADING_MSG = 'Not yet archived...' @@ -50,62 +46,25 @@ def parse_html_main_index(out_dir: Path=OUTPUT_DIR) -> Iterator[str]: return () @enforce_types -def write_html_main_index(links: List[Link], out_dir: Path=OUTPUT_DIR, finished: bool=False) -> None: - """write the html link index to a given path""" - - copy_and_overwrite(str(Path(TEMPLATES_DIR) / FAVICON_FILENAME), str(out_dir / FAVICON_FILENAME)) - copy_and_overwrite(str(Path(TEMPLATES_DIR) / ROBOTS_TXT_FILENAME), str(out_dir / ROBOTS_TXT_FILENAME)) - copy_and_overwrite(str(Path(TEMPLATES_DIR) / STATIC_DIR_NAME), str(out_dir / STATIC_DIR_NAME)) - - rendered_html = main_index_template(links, finished=finished) - atomic_write(str(out_dir / HTML_INDEX_FILENAME), rendered_html) - +def generate_index_from_links(links: List[Link], with_headers: bool): + if with_headers: + output = main_index_template(links) + else: + output = main_index_template(links, template=MINIMAL_INDEX_TEMPLATE) + return output @enforce_types -def main_index_template(links: List[Link], finished: bool=True, template: str=MAIN_INDEX_TEMPLATE) -> str: +def main_index_template(links: List[Link], template: str=MAIN_INDEX_TEMPLATE) -> str: """render the template for the entire main index""" - return render_legacy_template(template, { + return render_django_template(template, { 'version': VERSION, 'git_sha': GIT_SHA, 'num_links': str(len(links)), - 'status': 'finished' if finished else 'running', 'date_updated': datetime.now().strftime('%Y-%m-%d'), 'time_updated': datetime.now().strftime('%Y-%m-%d %H:%M'), - 'rows': '\n'.join( - main_index_row_template(link) - for link in links - ), - 'footer_info': FOOTER_INFO, - }) - - -@enforce_types -def main_index_row_template(link: Link) -> str: - """render the template for an individual link row of the main index""" - - from ..extractors.wget import wget_output_path - - return render_legacy_template(MAIN_INDEX_ROW_TEMPLATE, { - 
**link._asdict(extended=True), - - # before pages are finished archiving, show loading msg instead of title - 'title': htmlencode( - link.title - or (link.base_url if link.is_archived else TITLE_LOADING_MSG) - ), - - # before pages are finished archiving, show fallback loading favicon - 'favicon_url': ( - str(Path(ARCHIVE_DIR_NAME) / link.timestamp / 'favicon.ico') - # if link['is_archived'] else '' - ), - - # before pages are finished archiving, show the details page instead - 'wget_url': urlencode(wget_output_path(link) or 'index.html'), - - # replace commas in tags with spaces, or file extension if it's static - 'tags': (link.tags or '') + (' {}'.format(link.extension) if link.is_static else ''), + 'links': [link._asdict(extended=True) for link in links], + 'FOOTER_INFO': FOOTER_INFO, }) @@ -126,7 +85,7 @@ def link_details_template(link: Link) -> str: link_info = link._asdict(extended=True) - return render_legacy_template(LINK_DETAILS_TEMPLATE, { + return render_django_template(LINK_DETAILS_TEMPLATE, { **link_info, **link_info['canonical'], 'title': htmlencode( @@ -146,12 +105,60 @@ def link_details_template(link: Link) -> str: 'oldest_archive_date': ts_to_date(link.oldest_archive_date), }) - @enforce_types -def render_legacy_template(template_path: str, context: Mapping[str, str]) -> str: +def render_django_template(template: str, context: Mapping[str, str]) -> str: """render a given html template string with the given template content""" + from django.template.loader import render_to_string - # will be replaced by django templates in the future - with open(template_path, 'r', encoding='utf-8') as template: - template_str = template.read() - return Template(template_str).substitute(**context) + return render_to_string(template, context) + + +def snapshot_icons(snapshot) -> str: + from core.models import EXTRACTORS + + archive_results = snapshot.archiveresult_set.filter(status="succeeded") + link = snapshot.as_link() + path = link.archive_path + canon = 
link.canonical_outputs() + output = "" + output_template = '{} ' + icons = { + "singlefile": "❶", + "wget": "🆆", + "dom": "🅷", + "pdf": "📄", + "screenshot": "💻", + "media": "📼", + "git": "🅶", + "archive_org": "🏛", + "readability": "🆁", + "mercury": "🅼", + "warc": "📦" + } + exclude = ["favicon", "title", "headers", "archive_org"] + # Missing specific entry for WARC + + extractor_items = defaultdict(lambda: None) + for extractor, _ in EXTRACTORS: + for result in archive_results: + if result.extractor == extractor: + extractor_items[extractor] = result + + for extractor, _ in EXTRACTORS: + if extractor not in exclude: + exists = extractor_items[extractor] is not None + output += output_template.format(path, canon[f"{extractor}_path"], str(exists), + extractor, icons.get(extractor, "?")) + if extractor == "wget": + # warc isn't technically it's own extractor, so we have to add it after wget + exists = list((Path(path) / canon["warc_path"]).glob("*.warc.gz")) + output += output_template.format(exists[0] if exists else '#', canon["warc_path"], str(bool(exists)), "warc", icons.get("warc", "?")) + + if extractor == "archive_org": + # The check for archive_org is different, so it has to be handled separately + target_path = Path(path) / "archive.org.txt" + exists = target_path.exists() + output += '{} '.format(canon["archive_org_path"], str(exists), + "archive_org", icons.get("archive_org", "?")) + + return format_html(f'{output}') diff --git a/archivebox/index/json.py b/archivebox/index/json.py index 1c3ce6e8..f24b969f 100644 --- a/archivebox/index/json.py +++ b/archivebox/index/json.py @@ -8,7 +8,7 @@ from pathlib import Path from datetime import datetime from typing import List, Optional, Iterator, Any, Union -from .schema import Link, ArchiveResult +from .schema import Link from ..system import atomic_write from ..util import enforce_types from ..config import ( @@ -39,7 +39,20 @@ MAIN_INDEX_HEADER = { }, } -### Main Links Index +@enforce_types +def 
generate_json_index_from_links(links: List[Link], with_headers: bool): + if with_headers: + output = { + **MAIN_INDEX_HEADER, + 'num_links': len(links), + 'updated': datetime.now(), + 'last_run_cmd': sys.argv, + 'links': links, + } + else: + output = links + return to_json(output, indent=4, sort_keys=True) + @enforce_types def parse_json_main_index(out_dir: Path=OUTPUT_DIR) -> Iterator[Link]: @@ -65,30 +78,6 @@ def parse_json_main_index(out_dir: Path=OUTPUT_DIR) -> Iterator[Link]: continue return () -@enforce_types -def write_json_main_index(links: List[Link], out_dir: Path=OUTPUT_DIR) -> None: - """write the json link index to a given path""" - - assert isinstance(links, List), 'Links must be a list, not a generator.' - assert not links or isinstance(links[0].history, dict) - assert not links or isinstance(links[0].sources, list) - - if links and links[0].history.get('title'): - assert isinstance(links[0].history['title'][0], ArchiveResult) - - if links and links[0].sources: - assert isinstance(links[0].sources[0], str) - - main_index_json = { - **MAIN_INDEX_HEADER, - 'num_links': len(links), - 'updated': datetime.now(), - 'last_run_cmd': sys.argv, - 'links': links, - } - atomic_write(str(Path(out_dir) / JSON_INDEX_FILENAME), main_index_json) - - ### Link Details Index @enforce_types diff --git a/archivebox/index/schema.py b/archivebox/index/schema.py index 68d840a2..bc3a25da 100644 --- a/archivebox/index/schema.py +++ b/archivebox/index/schema.py @@ -1,3 +1,11 @@ +""" + +WARNING: THIS FILE IS ALL LEGACY CODE TO BE REMOVED. 
+ +DO NOT ADD ANY NEW FEATURES TO THIS FILE, NEW CODE GOES HERE: core/models.py + +""" + __package__ = 'archivebox.index' from pathlib import Path @@ -31,6 +39,7 @@ class ArchiveResult: status: str start_ts: datetime end_ts: datetime + index_texts: Union[List[str], None] = None schema: str = 'ArchiveResult' def __post_init__(self): @@ -207,6 +216,10 @@ class Link: }) return info + def as_snapshot(self): + from core.models import Snapshot + return Snapshot.objects.get(url=self.url) + @classmethod def from_json(cls, json_info, guess=False): from ..util import parse_date @@ -339,7 +352,7 @@ class Link: ### Archive Status Helpers @property def num_outputs(self) -> int: - return len(tuple(filter(None, self.latest_outputs().values()))) + return self.as_snapshot().num_outputs @property def num_failures(self) -> int: diff --git a/archivebox/index/sql.py b/archivebox/index/sql.py index aa7c8817..1e99f67c 100644 --- a/archivebox/index/sql.py +++ b/archivebox/index/sql.py @@ -4,17 +4,17 @@ from io import StringIO from pathlib import Path from typing import List, Tuple, Iterator from django.db.models import QuerySet +from django.db import transaction from .schema import Link from ..util import enforce_types -from ..config import setup_django, OUTPUT_DIR +from ..config import OUTPUT_DIR ### Main Links Index @enforce_types def parse_sql_main_index(out_dir: Path=OUTPUT_DIR) -> Iterator[Link]: - setup_django(out_dir, check_db=True) from core.models import Snapshot return ( @@ -24,9 +24,6 @@ def parse_sql_main_index(out_dir: Path=OUTPUT_DIR) -> Iterator[Link]: @enforce_types def remove_from_sql_main_index(snapshots: QuerySet, out_dir: Path=OUTPUT_DIR) -> None: - setup_django(out_dir, check_db=True) - from django.db import transaction - with transaction.atomic(): snapshots.delete() @@ -51,9 +48,6 @@ def write_link_to_sql_index(link: Link): @enforce_types def write_sql_main_index(links: List[Link], out_dir: Path=OUTPUT_DIR) -> None: - setup_django(out_dir, check_db=True) - from 
django.db import transaction - with transaction.atomic(): for link in links: write_link_to_sql_index(link) @@ -61,9 +55,7 @@ def write_sql_main_index(links: List[Link], out_dir: Path=OUTPUT_DIR) -> None: @enforce_types def write_sql_link_details(link: Link, out_dir: Path=OUTPUT_DIR) -> None: - setup_django(out_dir, check_db=True) from core.models import Snapshot - from django.db import transaction with transaction.atomic(): try: @@ -84,7 +76,6 @@ def write_sql_link_details(link: Link, out_dir: Path=OUTPUT_DIR) -> None: @enforce_types def list_migrations(out_dir: Path=OUTPUT_DIR) -> List[Tuple[bool, str]]: - setup_django(out_dir, check_db=False) from django.core.management import call_command out = StringIO() call_command("showmigrations", list=True, stdout=out) @@ -101,7 +92,6 @@ def list_migrations(out_dir: Path=OUTPUT_DIR) -> List[Tuple[bool, str]]: @enforce_types def apply_migrations(out_dir: Path=OUTPUT_DIR) -> List[str]: - setup_django(out_dir, check_db=False) from django.core.management import call_command null, out = StringIO(), StringIO() call_command("makemigrations", interactive=False, stdout=null) @@ -112,6 +102,5 @@ def apply_migrations(out_dir: Path=OUTPUT_DIR) -> List[str]: @enforce_types def get_admins(out_dir: Path=OUTPUT_DIR) -> List[str]: - setup_django(out_dir, check_db=False) from django.contrib.auth.models import User return User.objects.filter(is_superuser=True) diff --git a/archivebox/logging_util.py b/archivebox/logging_util.py index aa4659f0..f2b86735 100644 --- a/archivebox/logging_util.py +++ b/archivebox/logging_util.py @@ -19,6 +19,7 @@ if TYPE_CHECKING: from .util import enforce_types from .config import ( ConfigDict, + OUTPUT_DIR, PYTHON_ENCODING, ANSI, IS_TTY, @@ -443,7 +444,7 @@ def log_shell_welcome_msg(): from .cli import list_subcommands print('{green}# ArchiveBox Imports{reset}'.format(**ANSI)) - print('{green}from archivebox.core.models import Snapshot, User{reset}'.format(**ANSI)) + print('{green}from core.models import 
Snapshot, User{reset}'.format(**ANSI)) print('{green}from archivebox import *\n {}{reset}'.format("\n ".join(list_subcommands().keys()), **ANSI)) print() print('[i] Welcome to the ArchiveBox Shell!') @@ -477,39 +478,7 @@ def printable_filesize(num_bytes: Union[int, float]) -> str: @enforce_types def printable_folders(folders: Dict[str, Optional["Link"]], - json: bool=False, - html: bool=False, - csv: Optional[str]=None, with_headers: bool=False) -> str: - - from .index.json import MAIN_INDEX_HEADER - - links = folders.values() - if json: - from .index.json import to_json - if with_headers: - output = { - **MAIN_INDEX_HEADER, - 'num_links': len(links), - 'updated': datetime.now(), - 'last_run_cmd': sys.argv, - 'links': links, - } - else: - output = links - return to_json(output, indent=4, sort_keys=True) - elif html: - from .index.html import main_index_template - if with_headers: - output = main_index_template(links, True) - else: - from .index.html import MINIMAL_INDEX_TEMPLATE - output = main_index_template(links, True, MINIMAL_INDEX_TEMPLATE) - return output - elif csv: - from .index.csv import links_to_csv - return links_to_csv(folders.values(), cols=csv.split(','), header=with_headers) - return '\n'.join( f'{folder} {link and link.url} "{link and link.title}"' for folder, link in folders.items() @@ -546,19 +515,24 @@ def printable_folder_status(name: str, folder: Dict) -> str: else: num_files = 'missing' - if ' ' in str(folder['path']): - folder['path'] = f'"{folder["path"]}"' + path = str(folder['path']).replace(str(OUTPUT_DIR), '.') if folder['path'] else '' + if path and ' ' in path: + path = f'"{path}"' + + # if path is just a plain dot, replace it back with the full path for clarity + if path == '.': + path = str(OUTPUT_DIR) return ' '.join(( ANSI[color], symbol, ANSI['reset'], - name.ljust(22), - (str(folder["path"]) or '').ljust(76), + name.ljust(21), num_files.ljust(14), ANSI[color], - note, + note.ljust(8), ANSI['reset'], + path.ljust(76), )) @@ 
-578,17 +552,18 @@ def printable_dependency_version(name: str, dependency: Dict) -> str: else: color, symbol, note, version = 'lightyellow', '-', 'disabled', '-' - if ' ' in (dependency["path"] or ''): - dependency["path"] = f'"{dependency["path"]}"' + path = str(dependency["path"]).replace(str(OUTPUT_DIR), '.') if dependency["path"] else '' + if path and ' ' in path: + path = f'"{path}"' return ' '.join(( ANSI[color], symbol, ANSI['reset'], - name.ljust(22), - (dependency["path"] or '').ljust(76), + name.ljust(21), version.ljust(14), ANSI[color], - note, + note.ljust(8), ANSI['reset'], + path.ljust(76), )) diff --git a/archivebox/main.py b/archivebox/main.py index 66b9248f..eb8cd6a0 100644 --- a/archivebox/main.py +++ b/archivebox/main.py @@ -29,7 +29,6 @@ from .util import enforce_types # type: ignore from .system import get_dir_size, dedupe_cron_jobs, CRON_COMMENT from .index import ( load_main_index, - get_empty_snapshot_queryset, parse_links_from_source, dedupe_links, write_main_index, @@ -45,16 +44,22 @@ from .index import ( get_corrupted_folders, get_unrecognized_folders, fix_invalid_folder_locations, + write_link_details, ) from .index.json import ( parse_json_main_index, parse_json_links_details, + generate_json_index_from_links, ) from .index.sql import ( get_admins, apply_migrations, remove_from_sql_main_index, ) +from .index.html import ( + generate_index_from_links, +) +from .index.csv import links_to_csv from .extractors import archive_links, archive_link, ignore_methods from .config import ( stderr, @@ -83,7 +88,6 @@ from .config import ( check_dependencies, check_data_folder, write_config_file, - setup_django, VERSION, CODE_LOCATIONS, EXTERNAL_LOCATIONS, @@ -110,6 +114,7 @@ from .logging_util import ( printable_dependency_version, ) +from .search import flush_search_index, index_links ALLOWED_IN_OUTPUT_DIR = { 'lost+found', @@ -212,7 +217,7 @@ def version(quiet: bool=False, else: print('ArchiveBox v{}'.format(VERSION)) p = platform.uname() - 
print(p.system, platform.platform(), p.machine) + print(sys.implementation.name.title(), p.system, platform.platform(), p.machine, '(in Docker)' if IN_DOCKER else '(not in Docker)') print() print('{white}[i] Dependency versions:{reset}'.format(**ANSI)) @@ -259,6 +264,7 @@ def run(subcommand: str, @enforce_types def init(force: bool=False, out_dir: Path=OUTPUT_DIR) -> None: """Initialize a new ArchiveBox collection in the current directory""" + from core.models import Snapshot Path(out_dir).mkdir(exist_ok=True) is_empty = not len(set(os.listdir(out_dir)) - ALLOWED_IN_OUTPUT_DIR) @@ -312,7 +318,6 @@ def init(force: bool=False, out_dir: Path=OUTPUT_DIR) -> None: else: print('\n{green}[+] Building main SQL index and running migrations...{reset}'.format(**ANSI)) - setup_django(out_dir, check_db=False) DATABASE_FILE = Path(out_dir) / SQL_INDEX_FILENAME print(f' √ {DATABASE_FILE}') print() @@ -330,7 +335,7 @@ def init(force: bool=False, out_dir: Path=OUTPUT_DIR) -> None: print() print('{green}[*] Collecting links from any existing indexes and archive folders...{reset}'.format(**ANSI)) - all_links = get_empty_snapshot_queryset() + all_links = Snapshot.objects.none() pending_links: Dict[str, Link] = {} if existing_index: @@ -378,7 +383,7 @@ def init(force: bool=False, out_dir: Path=OUTPUT_DIR) -> None: print(' archivebox list --status=invalid') - write_main_index(list(pending_links.values()), out_dir=out_dir, finished=True) + write_main_index(list(pending_links.values()), out_dir=out_dir) print('\n{green}------------------------------------------------------------------{reset}'.format(**ANSI)) if existing_index: @@ -506,7 +511,7 @@ def status(out_dir: Path=OUTPUT_DIR) -> None: @enforce_types -def oneshot(url: str, out_dir: Path=OUTPUT_DIR): +def oneshot(url: str, extractors: str="", out_dir: Path=OUTPUT_DIR): """ Create a single URL archive folder with an index.json and index.html, and all the archive method outputs. 
You can run this to archive single pages without needing to create a whole collection with archivebox init. @@ -518,8 +523,9 @@ def oneshot(url: str, out_dir: Path=OUTPUT_DIR): color='red' ) raise SystemExit(2) - methods = ignore_methods(['title']) - archive_link(oneshot_link[0], out_dir=out_dir, methods=methods, skip_index=True) + + methods = extractors.split(",") if extractors else ignore_methods(['title']) + archive_link(oneshot_link[0], out_dir=out_dir, methods=methods) return oneshot_link @enforce_types @@ -529,8 +535,8 @@ def add(urls: Union[str, List[str]], index_only: bool=False, overwrite: bool=False, init: bool=False, - out_dir: Path=OUTPUT_DIR, - extractors: str="") -> List[Link]: + extractors: str="", + out_dir: Path=OUTPUT_DIR) -> List[Link]: """Add a new URL or list of URLs to your archive""" assert depth in (0, 1), 'Depth must be 0 or 1 (depth >1 is not supported yet)' @@ -567,7 +573,7 @@ def add(urls: Union[str, List[str]], imported_links = list({link.url: link for link in (new_links + new_links_depth)}.values()) new_links = dedupe_links(all_links, imported_links) - write_main_index(links=new_links, out_dir=out_dir, finished=not new_links) + write_main_index(links=new_links, out_dir=out_dir) all_links = load_main_index(out_dir=out_dir) if index_only: @@ -585,7 +591,7 @@ def add(urls: Union[str, List[str]], archive_links(imported_links, overwrite=True, **archive_kwargs) elif new_links: archive_links(new_links, overwrite=False, **archive_kwargs) - + return all_links @enforce_types @@ -660,6 +666,7 @@ def remove(filter_str: Optional[str]=None, to_remove = snapshots.count() + flush_search_index(snapshots=snapshots) remove_from_sql_main_index(snapshots=snapshots, out_dir=out_dir) all_snapshots = load_main_index(out_dir=out_dir) log_removal_finished(all_snapshots.count(), to_remove) @@ -677,6 +684,7 @@ def update(resume: Optional[float]=None, status: Optional[str]=None, after: Optional[str]=None, before: Optional[str]=None, + extractors: str="", out_dir: 
Path=OUTPUT_DIR) -> List[Link]: """Import any new links from subscriptions and retry any previously failed/skipped links""" @@ -684,6 +692,8 @@ def update(resume: Optional[float]=None, check_dependencies() new_links: List[Link] = [] # TODO: Remove input argument: only_new + extractors = extractors.split(",") if extractors else [] + # Step 1: Filter for selected_links matching_snapshots = list_links( filter_patterns=filter_patterns, @@ -700,6 +710,9 @@ def update(resume: Optional[float]=None, all_links = [link for link in matching_folders.values() if link] if index_only: + for link in all_links: + write_link_details(link, out_dir=out_dir, skip_sql_index=True) + index_links(all_links, out_dir=out_dir) return all_links # Step 2: Run the archive methods for each link @@ -714,7 +727,13 @@ def update(resume: Optional[float]=None, stderr(f'[√] Nothing found to resume after {resume}', color='green') return all_links - archive_links(to_archive, overwrite=overwrite, out_dir=out_dir) + archive_kwargs = { + "out_dir": out_dir, + } + if extractors: + archive_kwargs["methods"] = extractors + + archive_links(to_archive, overwrite=overwrite, **archive_kwargs) # Step 4: Re-write links index with updated titles, icons, and resources all_links = load_main_index(out_dir=out_dir) @@ -747,7 +766,6 @@ def list_all(filter_patterns_str: Optional[str]=None, elif filter_patterns_str: filter_patterns = filter_patterns_str.split('\n') - snapshots = list_links( filter_patterns=filter_patterns, filter_type=filter_type, @@ -763,8 +781,16 @@ def list_all(filter_patterns_str: Optional[str]=None, status=status, out_dir=out_dir, ) - - print(printable_folders(folders, json=json, csv=csv, html=html, with_headers=with_headers)) + + if json: + output = generate_json_index_from_links(folders.values(), with_headers) + elif html: + output = generate_index_from_links(folders.values(), with_headers) + elif csv: + output = links_to_csv(folders.values(), cols=csv.split(','), header=with_headers) + else: + 
output = printable_folders(folders, with_headers=with_headers) + print(output) return folders @@ -1048,7 +1074,6 @@ def server(runserver_args: Optional[List[str]]=None, config.DEBUG = config.DEBUG or debug check_data_folder(out_dir=out_dir) - setup_django(out_dir) from django.core.management import call_command from django.contrib.auth.models import User @@ -1085,7 +1110,6 @@ def manage(args: Optional[List[str]]=None, out_dir: Path=OUTPUT_DIR) -> None: """Run an ArchiveBox Django management command""" check_data_folder(out_dir=out_dir) - setup_django(out_dir) from django.core.management import execute_from_command_line if (args and "createsuperuser" in args) and (IN_DOCKER and not IS_TTY): @@ -1102,7 +1126,6 @@ def shell(out_dir: Path=OUTPUT_DIR) -> None: check_data_folder(out_dir=out_dir) - setup_django(OUTPUT_DIR) from django.core.management import call_command call_command("shell_plus") diff --git a/archivebox/manage.py b/archivebox/manage.py index 6951d8f7..1a9b2975 100755 --- a/archivebox/manage.py +++ b/archivebox/manage.py @@ -8,10 +8,9 @@ if __name__ == '__main__': # (e.g. 
makemigrations), you can comment out this check temporarily if not ('makemigrations' in sys.argv or 'migrate' in sys.argv): - print("[X] Don't run ./manage.py directly, use the archivebox CLI instead e.g.:") - print(' archivebox manage createsuperuser') + print("[X] Don't run ./manage.py directly (unless you are a developer running makemigrations):") print() - print(' Hint: Use these archivebox commands instead of the ./manage.py equivalents:') + print(' Hint: Use these archivebox CLI commands instead of the ./manage.py equivalents:') print(' archivebox init (migrates the databse to latest version)') print(' archivebox server (runs the Django web server)') print(' archivebox shell (opens an iPython Django shell with all models imported)') diff --git a/archivebox/parsers/__init__.py b/archivebox/parsers/__init__.py index 42b2464e..441c08ac 100644 --- a/archivebox/parsers/__init__.py +++ b/archivebox/parsers/__init__.py @@ -32,6 +32,7 @@ from ..index.schema import Link from ..logging_util import TimedProgress, log_source_saved from .pocket_html import parse_pocket_html_export +from .pocket_api import parse_pocket_api_export from .pinboard_rss import parse_pinboard_rss_export from .wallabag_atom import parse_wallabag_atom_export from .shaarli_rss import parse_shaarli_rss_export @@ -44,6 +45,7 @@ from .generic_txt import parse_generic_txt_export PARSERS = ( # Specialized parsers + ('Pocket API', parse_pocket_api_export), ('Wallabag ATOM', parse_wallabag_atom_export), ('Pocket HTML', parse_pocket_html_export), ('Pinboard RSS', parse_pinboard_rss_export), diff --git a/archivebox/parsers/pocket_api.py b/archivebox/parsers/pocket_api.py new file mode 100644 index 00000000..bf3a292b --- /dev/null +++ b/archivebox/parsers/pocket_api.py @@ -0,0 +1,113 @@ +__package__ = 'archivebox.parsers' + + +import re + +from typing import IO, Iterable, Optional +from configparser import ConfigParser + +from pathlib import Path +from ..vendor.pocket import Pocket + +from ..index.schema 
import Link +from ..util import enforce_types +from ..system import atomic_write +from ..config import ( + SOURCES_DIR, + POCKET_CONSUMER_KEY, + POCKET_ACCESS_TOKENS, +) + + +COUNT_PER_PAGE = 500 +API_DB_PATH = Path(SOURCES_DIR) / 'pocket_api.db' + +# search for broken protocols that sometimes come from the Pocket API +_BROKEN_PROTOCOL_RE = re.compile('^(http[s]?)(:/(?!/))') + + +def get_pocket_articles(api: Pocket, since=None, page=0): + body, headers = api.get( + state='archive', + sort='oldest', + since=since, + count=COUNT_PER_PAGE, + offset=page * COUNT_PER_PAGE, + ) + + articles = body['list'].values() if isinstance(body['list'], dict) else body['list'] + returned_count = len(articles) + + yield from articles + + if returned_count == COUNT_PER_PAGE: + yield from get_pocket_articles(api, since=since, page=page + 1) + else: + api.last_since = body['since'] + + +def link_from_article(article: dict, sources: list): + url: str = article['resolved_url'] or article['given_url'] + broken_protocol = _BROKEN_PROTOCOL_RE.match(url) + if broken_protocol: + url = url.replace(f'{broken_protocol.group(1)}:/', f'{broken_protocol.group(1)}://') + title = article['resolved_title'] or article['given_title'] or url + + return Link( + url=url, + timestamp=article['time_read'], + title=title, + tags=article.get('tags'), + sources=sources + ) + + +def write_since(username: str, since: str): + if not API_DB_PATH.exists(): + atomic_write(API_DB_PATH, '') + + since_file = ConfigParser() + since_file.optionxform = str + since_file.read(API_DB_PATH) + + since_file[username] = { + 'since': since + } + + with open(API_DB_PATH, 'w+') as new: + since_file.write(new) + + +def read_since(username: str) -> Optional[str]: + if not API_DB_PATH.exists(): + atomic_write(API_DB_PATH, '') + + config_file = ConfigParser() + config_file.optionxform = str + config_file.read(API_DB_PATH) + + return config_file.get(username, 'since', fallback=None) + + +@enforce_types +def 
should_parse_as_pocket_api(text: str) -> bool: + return text.startswith('pocket://') + + +@enforce_types +def parse_pocket_api_export(input_buffer: IO[str], **_kwargs) -> Iterable[Link]: + """Parse bookmarks from the Pocket API""" + + input_buffer.seek(0) + pattern = re.compile(r"^pocket:\/\/(\w+)") + for line in input_buffer: + if should_parse_as_pocket_api(line): + + username = pattern.search(line).group(1) + api = Pocket(POCKET_CONSUMER_KEY, POCKET_ACCESS_TOKENS[username]) + api.last_since = None + + for article in get_pocket_articles(api, since=read_since(username)): + yield link_from_article(article, sources=[line]) + + write_since(username, api.last_since) diff --git a/archivebox/search/__init__.py b/archivebox/search/__init__.py new file mode 100644 index 00000000..6191ede9 --- /dev/null +++ b/archivebox/search/__init__.py @@ -0,0 +1,108 @@ +from typing import List, Union +from pathlib import Path +from importlib import import_module + +from django.db.models import QuerySet + +from archivebox.index.schema import Link +from archivebox.util import enforce_types +from archivebox.config import stderr, OUTPUT_DIR, USE_INDEXING_BACKEND, USE_SEARCHING_BACKEND, SEARCH_BACKEND_ENGINE + +from .utils import get_indexable_content, log_index_started + +def indexing_enabled(): + return USE_INDEXING_BACKEND + +def search_backend_enabled(): + return USE_SEARCHING_BACKEND + +def get_backend(): + return f'search.backends.{SEARCH_BACKEND_ENGINE}' + +def import_backend(): + backend_string = get_backend() + try: + backend = import_module(backend_string) + except Exception as err: + raise Exception("Could not load '%s' as a backend: %s" % (backend_string, err)) + return backend + +@enforce_types +def write_search_index(link: Link, texts: Union[List[str], None]=None, out_dir: Path=OUTPUT_DIR, skip_text_index: bool=False) -> None: + if not indexing_enabled(): + return + + if not skip_text_index and texts: + from core.models import Snapshot + + snap = 
Snapshot.objects.filter(url=link.url).first() + backend = import_backend() + if snap: + try: + backend.index(snapshot_id=str(snap.id), texts=texts) + except Exception as err: + stderr() + stderr( + f'[X] The search backend threw an exception={err}:', + color='red', + ) + +@enforce_types +def query_search_index(query: str, out_dir: Path=OUTPUT_DIR) -> QuerySet: + from core.models import Snapshot + + if search_backend_enabled(): + backend = import_backend() + try: + snapshot_ids = backend.search(query) + except Exception as err: + stderr() + stderr( + f'[X] The search backend threw an exception={err}:', + color='red', + ) + raise + else: + # TODO preserve ordering from backend + qsearch = Snapshot.objects.filter(pk__in=snapshot_ids) + return qsearch + + return Snapshot.objects.none() + +@enforce_types +def flush_search_index(snapshots: QuerySet): + if not indexing_enabled() or not snapshots: + return + backend = import_backend() + snapshot_ids=(str(pk) for pk in snapshots.values_list('pk',flat=True)) + try: + backend.flush(snapshot_ids) + except Exception as err: + stderr() + stderr( + f'[X] The search backend threw an exception={err}:', + color='red', + ) + +@enforce_types +def index_links(links: Union[List[Link],None], out_dir: Path=OUTPUT_DIR): + if not links: + return + + from core.models import Snapshot, ArchiveResult + + for link in links: + snap = Snapshot.objects.filter(url=link.url).first() + if snap: + results = ArchiveResult.objects.indexable().filter(snapshot=snap) + log_index_started(link.url) + try: + texts = get_indexable_content(results) + except Exception as err: + stderr() + stderr( + f'[X] An Exception ocurred reading the indexable content={err}:', + color='red', + ) + else: + write_search_index(link, texts, out_dir=out_dir) diff --git a/archivebox/search/backends/__init__.py b/archivebox/search/backends/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/archivebox/search/backends/ripgrep.py 
b/archivebox/search/backends/ripgrep.py new file mode 100644 index 00000000..840d2d2d --- /dev/null +++ b/archivebox/search/backends/ripgrep.py @@ -0,0 +1,45 @@ +import re +from subprocess import run, PIPE +from typing import List, Generator + +from archivebox.config import ARCHIVE_DIR, RIPGREP_VERSION +from archivebox.util import enforce_types + +RG_IGNORE_EXTENSIONS = ('css','js','orig','svg') + +RG_ADD_TYPE = '--type-add' +RG_IGNORE_ARGUMENTS = f"ignore:*.{{{','.join(RG_IGNORE_EXTENSIONS)}}}" +RG_DEFAULT_ARGUMENTS = "-ilTignore" # Case insensitive(i), matching files results(l) +RG_REGEX_ARGUMENT = '-e' + +TIMESTAMP_REGEX = r'\/([\d]+\.[\d]+)\/' + +ts_regex = re.compile(TIMESTAMP_REGEX) + +@enforce_types +def index(snapshot_id: str, texts: List[str]): + return + +@enforce_types +def flush(snapshot_ids: Generator[str, None, None]): + return + +@enforce_types +def search(text: str) -> List[str]: + if not RIPGREP_VERSION: + raise Exception("ripgrep binary not found, install ripgrep to use this search backend") + + from core.models import Snapshot + + rg_cmd = ['rg', RG_ADD_TYPE, RG_IGNORE_ARGUMENTS, RG_DEFAULT_ARGUMENTS, RG_REGEX_ARGUMENT, text, str(ARCHIVE_DIR)] + rg = run(rg_cmd, stdout=PIPE, stderr=PIPE, timeout=60) + file_paths = [p.decode() for p in rg.stdout.splitlines()] + timestamps = set() + for path in file_paths: + ts = ts_regex.findall(path) + if ts: + timestamps.add(ts[0]) + + snap_ids = [str(id) for id in Snapshot.objects.filter(timestamp__in=timestamps).values_list('pk', flat=True)] + + return snap_ids diff --git a/archivebox/search/backends/sonic.py b/archivebox/search/backends/sonic.py new file mode 100644 index 00000000..f0beaddd --- /dev/null +++ b/archivebox/search/backends/sonic.py @@ -0,0 +1,28 @@ +from typing import List, Generator + +from sonic import IngestClient, SearchClient + +from archivebox.util import enforce_types +from archivebox.config import SEARCH_BACKEND_HOST_NAME, SEARCH_BACKEND_PORT, SEARCH_BACKEND_PASSWORD, SONIC_BUCKET, 
SONIC_COLLECTION + +MAX_SONIC_TEXT_LENGTH = 20000 + +@enforce_types +def index(snapshot_id: str, texts: List[str]): + with IngestClient(SEARCH_BACKEND_HOST_NAME, SEARCH_BACKEND_PORT, SEARCH_BACKEND_PASSWORD) as ingestcl: + for text in texts: + chunks = [text[i:i+MAX_SONIC_TEXT_LENGTH] for i in range(0, len(text), MAX_SONIC_TEXT_LENGTH)] + for chunk in chunks: + ingestcl.push(SONIC_COLLECTION, SONIC_BUCKET, snapshot_id, str(chunk)) + +@enforce_types +def search(text: str) -> List[str]: + with SearchClient(SEARCH_BACKEND_HOST_NAME, SEARCH_BACKEND_PORT, SEARCH_BACKEND_PASSWORD) as querycl: + snap_ids = querycl.query(SONIC_COLLECTION, SONIC_BUCKET, text) + return snap_ids + +@enforce_types +def flush(snapshot_ids: Generator[str, None, None]): + with IngestClient(SEARCH_BACKEND_HOST_NAME, SEARCH_BACKEND_PORT, SEARCH_BACKEND_PASSWORD) as ingestcl: + for id in snapshot_ids: + ingestcl.flush_object(SONIC_COLLECTION, SONIC_BUCKET, str(id)) diff --git a/archivebox/search/utils.py b/archivebox/search/utils.py new file mode 100644 index 00000000..55c97e75 --- /dev/null +++ b/archivebox/search/utils.py @@ -0,0 +1,44 @@ +from django.db.models import QuerySet + +from archivebox.util import enforce_types +from archivebox.config import ANSI + +def log_index_started(url): + print('{green}[*] Indexing url: {} in the search index {reset}'.format(url, **ANSI)) + print( ) + +def get_file_result_content(res, extra_path, use_pwd=False): + if use_pwd: + fpath = f'{res.pwd}/{res.output}' + else: + fpath = f'{res.output}' + + if extra_path: + fpath = f'{fpath}/{extra_path}' + + with open(fpath, 'r') as file: + data = file.read() + if data: + return [data] + return [] + + +# This should be abstracted by a plugin interface for extractors +@enforce_types +def get_indexable_content(results: QuerySet): + if not results: + return [] + # Only use the first method available + res, method = results.first(), results.first().extractor + if method not in ('readability', 'singlefile', 'dom', 'wget'): + 
return [] + # This should come from a plugin interface + + if method == 'readability': + return get_file_result_content(res, 'content.txt') + elif method == 'singlefile': + return get_file_result_content(res, '') + elif method == 'dom': + return get_file_result_content(res,'',use_pwd=True) + elif method == 'wget': + return get_file_result_content(res,'',use_pwd=True) diff --git a/archivebox/themes/admin/base.html b/archivebox/themes/admin/base.html index 2802555e..d8ad8d00 100644 --- a/archivebox/themes/admin/base.html +++ b/archivebox/themes/admin/base.html @@ -107,6 +107,9 @@ {% trans 'Change password' %} / {% endif %} {% trans 'Log out' %} + | + + ⣿⣿ {% endblock %}
{% endif %} @@ -179,8 +182,63 @@ }); } }; + + function redirectWithQuery(uri){ + uri_query = uri + document.location.search; + window.location = uri_query; + + }; + + function selectSnapshotListView(){ + localStorage.setItem('currentSnapshotView', 'List'); + redirectWithQuery("{% url 'admin:core_snapshot_changelist' %}"); + }; + + function selectSnapshotGridView(){ + localStorage.setItem('currentSnapshotView', 'Grid'); + redirectWithQuery("{% url 'admin:grid' %}"); + }; + + function setPreferredSnapshotView(view){ + urlPath = window.location.pathname; + + if((view==="Grid") && urlPath == "{% url 'admin:core_snapshot_changelist' %}"){ + selectSnapshotGridView(); + } + + {% comment %} + else if((view==="List") && urlPath == "{% url 'admin:grid' %}"){ + selectSnapshotListView(); + + } + {% endcomment %} + }; + + function setupSnapshotViews() { + const preferredSnapshotView = localStorage.getItem('currentSnapshotView'); + setPreferredSnapshotView(preferredSnapshotView); + + $( document ).ready(function() { + + $("#snapshotListView").click(function() { + selectSnapshotListView(); + }); + $("#snapshotGridView").click(function() { + selectSnapshotGridView(); + }); + + $('input:checkbox').change(function(){ + if($(this).is(':checked')) + $(this).parent().parent().parent().parent().addClass('selected-card'); + else + $(this).parent().parent().parent().parent().removeClass('selected-card') + }); + + }); + }; $(function () { fix_actions(); + setupSnapshotViews(); }); })(django.jQuery); diff --git a/archivebox/themes/admin/grid_change_list.html b/archivebox/themes/admin/grid_change_list.html new file mode 100644 index 00000000..6894efd7 --- /dev/null +++ b/archivebox/themes/admin/grid_change_list.html @@ -0,0 +1,91 @@ +{% extends "admin/base_site.html" %} +{% load i18n admin_urls static admin_list %} +{% load core_tags %} + +{% block extrastyle %} + {{ block.super }} + + {% if cl.formset %} + + {% endif %} + {% if cl.formset or action_form %} + + {% endif %} + {{ media.css }} 
+ {% if not actions_on_top and not actions_on_bottom %} + + {% endif %} +{% endblock %} + +{% block extrahead %} +{{ block.super }} +{{ media.js }} +{% endblock %} + +{% block bodyclass %}{{ block.super }} app-{{ opts.app_label }} model-{{ opts.model_name }} change-list{% endblock %} + +{% if not is_popup %} +{% block breadcrumbs %} + +{% endblock %} +{% endif %} + +{% block coltype %}{% endblock %} + +{% block content %} +
+ {% block object-tools %} +
    + {% block object-tools-items %} + {% change_list_object_tools %} + {% endblock %} +
+ {% endblock %} + {% if cl.formset and cl.formset.errors %} +

+ {% if cl.formset.total_error_count == 1 %}{% translate "Please correct the error below." %}{% else %}{% translate "Please correct the errors below." %}{% endif %} +

+ {{ cl.formset.non_form_errors }} + {% endif %} +
+
+ {% block search %}{% search_form cl %}{% endblock %} + {% block date_hierarchy %}{% if cl.date_hierarchy %}{% date_hierarchy cl %}{% endif %}{% endblock %} + +
{% csrf_token %} + {% if cl.formset %} +
{{ cl.formset.management_form }}
+ {% endif %} + + {% block result_list %} + {% if action_form and actions_on_top and cl.show_admin_actions %}{% admin_actions %}{% endif %} + {% comment %} + Table grid + {% result_list cl %} + {% endcomment %} + {% snapshots_grid cl %} + {% if action_form and actions_on_bottom and cl.show_admin_actions %}{% admin_actions %}{% endif %} + {% endblock %} + {% block pagination %}{% pagination cl %}{% endblock %} +
+
+ {% block filters %} + {% if cl.has_filters %} +
+

{% translate 'Filter' %}

+ {% if cl.has_active_filters %}

+ ✖ {% translate "Clear all filters" %} +

{% endif %} + {% for spec in cl.filter_specs %}{% admin_list_filter cl spec %}{% endfor %} +
+ {% endif %} + {% endblock %} +
+
+{% endblock %} \ No newline at end of file diff --git a/archivebox/themes/admin/snapshots_grid.html b/archivebox/themes/admin/snapshots_grid.html new file mode 100644 index 00000000..a7a2d4f9 --- /dev/null +++ b/archivebox/themes/admin/snapshots_grid.html @@ -0,0 +1,162 @@ +{% load i18n admin_urls static admin_list %} +{% load core_tags %} + +{% block extrastyle %} + + +{% endblock %} + +{% block content %} +
+ {% for obj in results %} +
+ + + + + +
+ {% if obj.tags_str %} +

{{obj.tags_str}}

+ {% endif %} + {% if obj.title %} + +

{{obj.title|truncatechars:55 }}

+
+ {% endif %} + {% comment %}

TEXT If needed.

{% endcomment %} +
+
+ +
+
+ {% endfor %} +
+ +{% endblock %} \ No newline at end of file diff --git a/archivebox/themes/default/base.html b/archivebox/themes/default/base.html index 4a5a76c6..a70430ea 100644 --- a/archivebox/themes/default/base.html +++ b/archivebox/themes/default/base.html @@ -187,13 +187,6 @@ display: none; } - body[data-status~=finished] .files-spinner { - display: none; - } - - /*body[data-status~=running] .in-progress { - display: inline-block; - }*/ tr td a.favicon img { padding-left: 6px; padding-right: 12px; @@ -224,12 +217,10 @@ color: black; } - tr td a.exists-True { - opacity: 1; - } - tr td a.exists-False { - opacity: 0.1; - filter: grayscale(100%); + .exists-False { + opacity: 0.1; + filter: grayscale(100%); + pointer-events: none; } @@ -280,10 +271,9 @@
- Archive created using ArchiveBox   | -   - Download index as JSON -

+ Archive created using ArchiveBox version + v{{VERSION}}. +

{{FOOTER_INFO}}
diff --git a/archivebox/themes/default/core/snapshot_list.html b/archivebox/themes/default/core/snapshot_list.html index a5beceb8..ce2b2faa 100644 --- a/archivebox/themes/default/core/snapshot_list.html +++ b/archivebox/themes/default/core/snapshot_list.html @@ -2,44 +2,25 @@ {% load static %} {% block body %} -
-
- - - -
- - - - - - - - - +
+ + + + + +
BookmarkedSaved Link ({{num_links}})FilesOriginal URL
+ + + + + + + + {% for link in object_list %} - - - - - - + {% include 'main_index_row.html' with link=link %} {% endfor %}
BookmarkedSnapshot ({{object_list|length}})FilesOriginal URL
{{link.added}} - {% if link.is_archived %} - - {% else %} - - {% endif %} - - {{link.title|default:'Loading...'}} - {{link.tags_str}} - - - 📄 - {{link.icons}} - - {{link.url}}
@@ -59,6 +40,12 @@ last » {% endif %} -
+ + {% if page_obj.has_next %} + next + last » + {% endif %} + +
- {% endblock %} +{% endblock %} diff --git a/archivebox/themes/legacy/link_details.html b/archivebox/themes/default/link_details.html similarity index 80% rename from archivebox/themes/legacy/link_details.html rename to archivebox/themes/default/link_details.html index efb7274b..b1edcfe0 100644 --- a/archivebox/themes/legacy/link_details.html +++ b/archivebox/themes/default/link_details.html @@ -1,7 +1,7 @@ - $title + {{title}}