From 402aac2366b17ea3186730f20457812c77b4266d Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Wed, 31 Jan 2024 11:48:43 -0800 Subject: [PATCH 1/5] Update README.md --- README.md | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index b7bd0ff2..66743966 100644 --- a/README.md +++ b/README.md @@ -190,7 +190,7 @@ curl -sSL 'https://docker-compose.archivebox.io' > docker-compose.yml
  • Next steps: Start the server then login to the Web UI http://127.0.0.1:8000 ⇢ Admin.
    docker compose up
     # completely optional, CLI can always be used without running a server
    -# docker compose run [-T] archivebox [subcommand] [--args]
    +# docker compose run [-T] archivebox [subcommand] [--help]
     docker compose run archivebox add 'https://example.com'
     docker compose run archivebox help
     
  • @@ -213,7 +213,7 @@ docker run -v $PWD:/data -it archivebox/archivebox init --setup
  • Optional: Start the server then login to the Web UI http://127.0.0.1:8000 ⇢ Admin.
    docker run -v $PWD:/data -p 8000:8000 archivebox/archivebox
     # completely optional, CLI can always be used without running a server
    -# docker run -v $PWD:/data -it [subcommand] [--args]
    +# docker run -v $PWD:/data -it [subcommand] [--help]
     docker run -v $PWD:/data -it archivebox/archivebox help
     
  • @@ -265,7 +265,7 @@ archivebox init --setup
  • Optional: Start the server then login to the Web UI http://127.0.0.1:8000 ⇢ Admin.
    archivebox server 0.0.0.0:8000
     # completely optional, CLI can always be used without running a server
    -# archivebox [subcommand] [--args]
    +# archivebox [subcommand] [--help]
     archivebox help
     
  • @@ -301,7 +301,7 @@ archivebox init --setup # if any problems, install with pip instead
  • Optional: Start the server then login to the Web UI http://127.0.0.1:8000 ⇢ Admin.
    archivebox server 0.0.0.0:8000
     # completely optional, CLI can always be used without running a server
    -# archivebox [subcommand] [--args]
    +# archivebox [subcommand] [--help]
     archivebox help
     
  • @@ -330,7 +330,7 @@ archivebox init --setup # if any problems, install with pip instead
  • Optional: Start the server then login to the Web UI http://127.0.0.1:8000 ⇢ Admin.
    archivebox server 0.0.0.0:8000
     # completely optional, CLI can always be used without running a server
    -# archivebox [subcommand] [--args]
    +# archivebox [subcommand] [--help]
     archivebox help
     
  • @@ -458,13 +458,13 @@ ArchiveBox commands can be run in a terminal directly on your host, or via Docke mkdir -p ~/archivebox/data # create a new data dir anywhere cd ~/archivebox/data # IMPORTANT: cd into the directory -# archivebox [subcommand] [--args] +# archivebox [subcommand] [--help] archivebox help -# equivalent: docker compose run archivebox [subcommand [--args] +# equivalent: docker compose run archivebox [subcommand [--help] docker compose run archivebox help -# equivalent: docker run -it -v $PWD:/data archivebox/archivebox [subcommand [--args] +# equivalent: docker run -it -v $PWD:/data archivebox/archivebox [subcommand [--help] docker run -it -v $PWD:/data archivebox/archivebox help ``` @@ -482,7 +482,7 @@ docker compose run archivebox help
    
     # make sure you have pip-installed ArchiveBox and it's available in your $PATH first  
     
    -# archivebox [subcommand] [--args] +# archivebox [subcommand] [--help] archivebox init --setup # safe to run init multiple times (also how you update versions) archivebox version # get archivebox version info + check dependencies archivebox help # get list of archivebox subcommands that can be run @@ -498,7 +498,7 @@ archivebox add --depth=1 'https://news.ycombinator.com'
    
     # make sure you have `docker-compose.yml` from the Quickstart instructions first
     
    -# docker compose run archivebox [subcommand [--args] +# docker compose run archivebox [subcommand [--help] docker compose run archivebox init --setup docker compose run archivebox version docker compose run archivebox help @@ -515,7 +515,7 @@ docker compose run archivebox add --depth=1 'https://news.ycombinator.com'
    
     # make sure you create and cd into in a new empty directory first  
     
    -# docker run -it -v $PWD:/data archivebox/archivebox [subcommand [--args] +# docker run -it -v $PWD:/data archivebox/archivebox [subcommand [--help] docker run -v $PWD:/data -it archivebox/archivebox init --setup docker run -v $PWD:/data -it archivebox/archivebox version docker run -v $PWD:/data -it archivebox/archivebox help From 9f8ad4b126959f5593d6f22a0b8ecc1eb5a9e697 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Thu, 1 Feb 2024 01:13:04 -0800 Subject: [PATCH 2/5] fix missing closing square brackets in readme cli examples --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 66743966..c5230039 100644 --- a/README.md +++ b/README.md @@ -461,10 +461,10 @@ cd ~/archivebox/data # IMPORTANT: cd into the directory # archivebox [subcommand] [--help] archivebox help -# equivalent: docker compose run archivebox [subcommand [--help] +# equivalent: docker compose run archivebox [subcommand] [--help] docker compose run archivebox help -# equivalent: docker run -it -v $PWD:/data archivebox/archivebox [subcommand [--help] +# equivalent: docker run -it -v $PWD:/data archivebox/archivebox [subcommand] [--help] docker run -it -v $PWD:/data archivebox/archivebox help ``` @@ -498,7 +498,7 @@ archivebox add --depth=1 'https://news.ycombinator.com'
    
     # make sure you have `docker-compose.yml` from the Quickstart instructions first
     
    -# docker compose run archivebox [subcommand [--help] +# docker compose run archivebox [subcommand] [--help] docker compose run archivebox init --setup docker compose run archivebox version docker compose run archivebox help @@ -515,7 +515,7 @@ docker compose run archivebox add --depth=1 'https://news.ycombinator.com'
    
     # make sure you create and cd into in a new empty directory first  
     
    -# docker run -it -v $PWD:/data archivebox/archivebox [subcommand [--help] +# docker run -it -v $PWD:/data archivebox/archivebox [subcommand] [--help] docker run -v $PWD:/data -it archivebox/archivebox init --setup docker run -v $PWD:/data -it archivebox/archivebox version docker run -v $PWD:/data -it archivebox/archivebox help From babd273fc0e63809932b81fa46ddc68805a74f04 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Thu, 1 Feb 2024 01:40:33 -0800 Subject: [PATCH 3/5] Update README.md --- README.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index c5230039..e8492472 100644 --- a/README.md +++ b/README.md @@ -25,7 +25,6 @@ curl -sSL 'https://get.archivebox.io' | sh # (or see pip/brew/Docker instruct Without active preservation effort, everything on the internet eventually dissapears or degrades. Archive.org does a great job as a centralized service, but saved URLs have to be public, and they can't save every type of content. *ArchiveBox is an open source tool that lets organizations & individuals archive both public & private web content while retaining control over their data. It can be used to save copies of bookmarks, preserve evidence for legal cases, backup photos from FB/Insta/Flickr or media from YT/Soundcloud/etc., save research papers, and more...* -
    > ➡️ Get ArchiveBox with `pip install archivebox` on [Linux](#quickstart), [macOS](#quickstart), and [Windows](#quickstart) (WSL2), or via **[Docker](#quickstart)** ⭐️. @@ -51,10 +50,13 @@ It also detects any content featured *inside* pages & extracts it out into a fol - 💾 **Github**/**Gitlab**/etc. links ➡️ `clone of GIT source code`, `README`, `images`, ... - ✨ *and more, see [Output Formats](#output-formats) below...* +You can run ArchiveBox as a Docker web app to manage these snapshots, or continue accessing the same collection using the `pip`-installed CLI, Python API, and SQLite3 APIs. +All the ways of using it are equivalent, and provide matching features like adding tags, scheduling regular crawls, viewing logs, and more... +

    -🛠️ ArchiveBox uses [standard tools](#dependencies) like Chrome, `wget`, & `yt-dlp`, and stores data in [ordinary files & folders](#archive-layout). +🛠️ ArchiveBox uses [standard tools](#dependencies) like Chrome, [`wget`](https://www.gnu.org/software/wget/), & [`yt-dlp`](https://github.com/yt-dlp/yt-dlp), and stores data in [ordinary files & folders](#archive-layout). *(no complex proprietary formats, all data is readable without needing to run ArchiveBox)* The goal is to sleep soundly knowing the part of the internet you care about will be automatically preserved in durable, easily accessible formats [for decades](#background--motivation) after it goes down. From 00d2d20a631ca507c50c951b67859e2eb52ed7f4 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 12 Feb 2024 02:04:07 -0800 Subject: [PATCH 4/5] Update README.md --- README.md | 66 +++++++++++++++++++++++++++++++------------------------ 1 file changed, 37 insertions(+), 29 deletions(-) diff --git a/README.md b/README.md index e8492472..b8892b06 100644 --- a/README.md +++ b/README.md @@ -13,7 +13,7 @@     @@ -72,10 +72,9 @@ The goal is to sleep soundly knowing the part of the internet you care about wil
      Expand for quick copy-pastable install commands...   ⤵️
    -
    mkdir ~/archivebox; cd ~/archivebox    # create a dir somewhere for your archivebox data
    -
    -# Option A: Get ArchiveBox with Docker Compose (recommended): -curl -sSL 'https://docker-compose.archivebox.io' > docker-compose.yml # edit options in this file as-needed +
    # Option A: Get ArchiveBox with Docker Compose (recommended):
    +mkdir -p ~/archivebox/data && cd ~/archivebox
    +curl -fsSL 'https://docker-compose.archivebox.io' > docker-compose.yml   # edit options in this file as-needed
     docker compose run archivebox init --setup
     # docker compose run archivebox add 'https://example.com'
     # docker compose run archivebox help
    @@ -83,6 +82,7 @@ docker compose run archivebox init --setup
     

    # Option B: Or use it as a plain Docker container: +mkdir -p ~/archivebox/data && cd ~/archivebox/data docker run -it -v $PWD:/data archivebox/archivebox init --setup # docker run -it -v $PWD:/data archivebox/archivebox add 'https://example.com' # docker run -it -v $PWD:/data archivebox/archivebox help @@ -91,6 +91,7 @@ docker run -it -v $PWD:/data archivebox/archivebox init --setup
    # Option C: Or install it with your preferred pkg manager (see Quickstart below for apt, brew, and more) pip install archivebox +mkdir -p ~/archivebox/data && cd ~/archivebox/data archivebox init --setup # archviebox add 'https://example.com' # archivebox help @@ -98,7 +99,7 @@ archivebox init --setup

    # Option D: Or use the optional auto setup script to install it -curl -sSL 'https://get.archivebox.io' | sh +curl -fsSL 'https://get.archivebox.io' | sh

    Open http://localhost:8000 to see your server's Web UI ➡️ @@ -182,9 +183,9 @@ ArchiveBox is free for everyone to self-host, but we also provide support, secur
    1. Install Docker on your system (if not already installed).
    2. Download the docker-compose.yml file into a new empty directory (can be anywhere). -
      mkdir ~/archivebox && cd ~/archivebox
      +
      mkdir -p ~/archivebox/data && cd ~/archivebox
       # Read and edit docker-compose.yml options as-needed after downloading
      -curl -sSL 'https://docker-compose.archivebox.io' > docker-compose.yml
      +curl -fsSL 'https://docker-compose.archivebox.io' > docker-compose.yml
       
    3. Run the initial setup to create an admin user (or set ADMIN_USER/PASS in docker-compose.yml)
      docker compose run archivebox init --setup
      @@ -208,7 +209,7 @@ See below for more usage examples using the C
       
      1. Install Docker on your system (if not already installed).
      2. Create a new empty directory and initialize your collection (can be anywhere). -
        mkdir ~/archivebox && cd ~/archivebox
        +
        mkdir -p ~/archivebox/data && cd ~/archivebox/data
         docker run -v $PWD:/data -it archivebox/archivebox init --setup
         
      3. @@ -231,7 +232,7 @@ See below for more usage examples using the C
        1. Install Docker on your system (optional, highly recommended but not required).
        2. Run the automatic setup script. -
          curl -sSL 'https://get.archivebox.io' | sh
          +
          curl -fsSL 'https://get.archivebox.io' | sh
        @@ -256,12 +257,16 @@ See "Against curl | sh as a
      4. Install Python >= v3.10 and Node >= v18 on your system (if not already installed).
      5. Install the ArchiveBox package using pip3 (or pipx).
        pip3 install archivebox
        +archivebox version
        +# install any missing extras shown using apt/brew/pkg/etc.
        +#    python@3.10 node curl wget git ripgrep ...
         
        +See the Install: Bare Metal Wiki for full install instructions for each OS...
      6. Create a new empty directory and initialize your collection (can be anywhere). -
        mkdir ~/archivebox && cd ~/archivebox
        -archivebox init --setup
        -# install any missing extras like wget/git/ripgrep/etc. manually as needed
        +
        mkdir -p ~/archivebox/data && cd ~/archivebox/data   # for example
        +archivebox init --setup   # initialize a new collection
        +# (--setup auto-installs and links JS dependencies: singlefile, readability, etc.)
         
      7. Optional: Start the server then login to the Web UI http://127.0.0.1:8000 ⇢ Admin. @@ -274,7 +279,8 @@ archivebox help
      See below for more usage examples using the CLI, Web UI, or filesystem/SQL/Python to manage your archive.
      -See the pip-archivebox repo for more details about this distribution. +
      +See the pip-archivebox repo for more details about this distribution.

    @@ -295,10 +301,10 @@ sudo python3 -m pip install --upgrade --ignore-installed archivebox # pip need
  • Create a new empty directory and initialize your collection (can be anywhere). -
    mkdir ~/archivebox && cd ~/archivebox
    +
    mkdir -p ~/archivebox/data && cd ~/archivebox/data
     archivebox init --setup           # if any problems, install with pip instead
     
    -Note: If you encounter issues with NPM/NodeJS, install a more recent version.

    +Note: If you encounter issues or want more granular instructions, see the Install: Bare Metal Wiki.

  • Optional: Start the server then login to the Web UI http://127.0.0.1:8000 ⇢ Admin.
    archivebox server 0.0.0.0:8000
    @@ -323,9 +329,10 @@ See the debian-a
     
    brew tap archivebox/archivebox
     brew install archivebox
     
    +See the
    Install: Bare Metal Wiki for more granular instructions for macOS... ➡️
  • Create a new empty directory and initialize your collection (can be anywhere). -
    mkdir ~/archivebox && cd ~/archivebox
    +
    mkdir -p ~/archivebox/data && cd ~/archivebox/data
     archivebox init --setup         # if any problems, install with pip instead
     
  • @@ -334,7 +341,7 @@ archivebox init --setup # if any problems, install with pip instead # completely optional, CLI can always be used without running a server # archivebox [subcommand] [--help] archivebox help -
    +

    @@ -351,7 +358,7 @@ See the homebr
    • Arch: yay -S archivebox (contributed by @imlonghao)
    • -
    • FreeBSD: curl -sSL 'https://get.archivebox.io' | sh (uses pkg + pip3 under-the-hood)
    • +
    • FreeBSD: curl -fsSL 'https://get.archivebox.io' | sh (uses pkg + pip3 under-the-hood)
    • Nix: nix-env --install archivebox (contributed by @siraben)
    • Guix: guix install archivebox (contributed by @rakino)
    • More: contribute another distribution...!
    • @@ -461,13 +468,14 @@ mkdir -p ~/archivebox/data # create a new data dir anywhere cd ~/archivebox/data # IMPORTANT: cd into the directory # archivebox [subcommand] [--help] +archivebox version archivebox help # equivalent: docker compose run archivebox [subcommand] [--help] docker compose run archivebox help # equivalent: docker run -it -v $PWD:/data archivebox/archivebox [subcommand] [--help] - docker run -it -v $PWD:/data archivebox/archivebox help +docker run -it -v $PWD:/data archivebox/archivebox help ``` #### ArchiveBox Subcommands @@ -677,7 +685,7 @@ It uses all available methods out-of-the-box, but you can disable extractors and Expand to see the full list of ways it saves each page... -./archive/{Snapshot.id}/
      +data/archive/{Snapshot.id}/
      • Index: index.html & index.json HTML and JSON index files containing metadata and details
      • Title, Favicon, Headers Response headers, site favicon, and parsed site title
      • @@ -808,18 +816,18 @@ All of ArchiveBox's state (SQLite DB, content, config, logs, etc.) is stored in
        Expand to learn more about the layout of Archivebox's data on-disk...
        -Data folders can be created anywhere (`~/archivebox` or `$PWD/data` as seen in our examples), and you can create as many data folders as you want to hold different collections. +Data folders can be created anywhere (`~/archivebox/data` or `$PWD/data` as seen in our examples), and you can create as many data folders as you want to hold different collections. All archivebox CLI commands are designed to be run from inside an ArchiveBox data folder, starting with archivebox init to initialize a new collection inside an empty directory. -
        mkdir ~/archivebox && cd ~/archivebox   # just an example, can be anywhere
        +
        mkdir -p ~/archivebox/data && cd ~/archivebox/data   # just an example, can be anywhere
         archivebox init
        -The on-disk layout is optimized to be easy to browse by hand and durable long-term. The main index is a standard index.sqlite3 database in the root of the data folder (it can also be exported as static JSON/HTML), and the archive snapshots are organized by date-added timestamp in the ./archive/ subfolder. +The on-disk layout is optimized to be easy to browse by hand and durable long-term. The main index is a standard index.sqlite3 database in the root of the data folder (it can also be exported as static JSON/HTML), and the archive snapshots are organized by date-added timestamp in the data/archive/ subfolder. -
        /data/
        +
        data/
             index.sqlite3
             ArchiveBox.conf
             archive/
        @@ -834,7 +842,7 @@ The on-disk layout is optimized to be easy to browse by hand and durable long-te
                     ...
         
        -Each snapshot subfolder ./archive/TIMESTAMP/ includes a static index.json and index.html describing its contents, and the snapshot extractor outputs are plain files within the folder. +Each snapshot subfolder data/archive/TIMESTAMP/ includes a static index.json and index.html describing its contents, and the snapshot extractor outputs are plain files within the folder.

        Learn More

          @@ -1048,9 +1056,9 @@ Because ArchiveBox is designed to ingest a large volume of URLs with multiple co Disk usage can be reduced by using a compressed/deduplicated filesystem like ZFS/BTRFS, or by turning off extractors methods you don't need. You can also deduplicate content with a tool like [fdupes](https://github.com/adrianlopezroche/fdupes) or [rdfind](https://github.com/pauldreik/rdfind). -**Don't store large collections on older filesystems like EXT3/FAT** as they may not be able to handle more than 50k directory entries in the `archive/` folder. +**Don't store large collections on older filesystems like EXT3/FAT** as they may not be able to handle more than 50k directory entries in the `data/archive/` folder. -**Try to keep the `index.sqlite3` file on local drive (not a network mount)** or SSD for maximum performance, however the `archive/` folder can be on a network mount or slower HDD. +**Try to keep the `data/index.sqlite3` file on local drive (not a network mount)** or SSD for maximum performance, however the `data/archive/` folder can be on a network mount or slower HDD. If using Docker or NFS/SMB/FUSE for the `data/archive/` folder, you may need to set [`PUID` & `PGID`](https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#puid--pgid) and [disable `root_squash`](https://github.com/ArchiveBox/ArchiveBox/issues/1304) on your fileshare server. @@ -1441,7 +1449,7 @@ https://stackoverflow.com/questions/1074212/how-can-i-see-the-raw-sql-queries-dj ArchiveBox [`extractors`](https://github.com/ArchiveBox/ArchiveBox/blob/dev/archivebox/extractors/media.py) are external binaries or Python/Node scripts that ArchiveBox runs to archive content on a page. 
-Extractors take the URL of a page to archive, write their output to the filesystem `archive/TIMESTAMP/EXTRACTOR/...`, and return an [`ArchiveResult`](https://github.com/ArchiveBox/ArchiveBox/blob/dev/archivebox/core/models.py#:~:text=return%20qs-,class%20ArchiveResult,-(models.Model)%3A) entry which is saved to the database (visible on the `Log` page in the UI). +Extractors take the URL of a page to archive, write their output to the filesystem `data/archive/TIMESTAMP/EXTRACTOR/...`, and return an [`ArchiveResult`](https://github.com/ArchiveBox/ArchiveBox/blob/dev/archivebox/core/models.py#:~:text=return%20qs-,class%20ArchiveResult,-(models.Model)%3A) entry which is saved to the database (visible on the `Log` page in the UI). *Check out how we added **[`archivebox/extractors/singlefile.py`](https://github.com/ArchiveBox/ArchiveBox/blob/dev/archivebox/extractors/singlefile.py)** as an example of the process: [Issue #399](https://github.com/ArchiveBox/ArchiveBox/issues/399) + [PR #403](https://github.com/ArchiveBox/ArchiveBox/pull/403).* From 3ad32509e985236f82f3558f31b856623b1eb261 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 12 Feb 2024 02:09:39 -0800 Subject: [PATCH 5/5] Update FUNDING.yml --- .github/FUNDING.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/FUNDING.yml b/.github/FUNDING.yml index ff0edb0f..d3fbf26a 100644 --- a/.github/FUNDING.yml +++ b/.github/FUNDING.yml @@ -1,3 +1,3 @@ github: pirate patreon: theSquashSH -custom: ["https://twitter.com/ArchiveBoxApp", "https://paypal.me/NicholasSweeting", "https://www.blockchain.com/eth/address/0x5D4c34D4a121Fe08d1dDB7969F07550f2dB9f471", "https://www.blockchain.com/btc/address/1HuxXriPE2Bbnag3jJrqa3bkNHrs297dYH"] +custom: ["https://hcb.hackclub.com/donations/start/archivebox", "https://paypal.me/NicholasSweeting"]