diff --git a/.gitignore b/.gitignore index 884e1da4..68717afb 100644 --- a/.gitignore +++ b/.gitignore @@ -4,13 +4,21 @@ __pycache__/ .mypy_cache/ +# Python and Node dependencies venv/ .venv/ .docker-venv/ - -build/ -dist/ node_modules/ +# Packaging artifacts +archivebox-*.tar.gz +build/ +deb_dist/ +dist/ + +# Data folders data/ +data1/ +data2/ +data3/ output/ diff --git a/archivebox.egg-info/PKG-INFO b/archivebox.egg-info/PKG-INFO index eb80f48d..3940b731 100644 --- a/archivebox.egg-info/PKG-INFO +++ b/archivebox.egg-info/PKG-INFO @@ -14,7 +14,7 @@ Project-URL: Roadmap, https://github.com/ArchiveBox/ArchiveBox/wiki/Roadmap Project-URL: Community, https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community Project-URL: Donate, https://github.com/ArchiveBox/ArchiveBox/wiki/Donations Description:
- +

ArchiveBox
The open-source self-hosted web archive.

▶️ Quickstart | @@ -41,7 +41,7 @@ Description:

- ArchiveBox is a powerful self-hosted internet archiving solution written in Python 3. You feed it URLs of pages you want to archive, and it saves them to disk in a varitety of formats depending on the configuration and the content it detects. ArchiveBox can be installed via [Docker](https://docs.docker.com/get-docker/) or [`pip3`](https://wiki.python.org/moin/BeginnersGuide/Download). + ArchiveBox is a powerful self-hosted internet archiving solution written in Python 3. You feed it URLs of pages you want to archive, and it saves them to disk in a variety of formats depending on the configuration and the content it detects. ArchiveBox can be installed via [Docker](https://docs.docker.com/get-docker/) (recommended) or [`pip`](https://www.python.org/downloads/). It works on macOS, Windows, and Linux/BSD (both armv7 and amd64). Once installed, URLs can be added via the command line `archivebox add` or the built-in Web UI `archivebox server`. It can ingest bookmarks from a service like Pocket/Pinboard, your entire browsing history, RSS feeds, or URLs one at a time. @@ -51,19 +51,27 @@ Description:
#### Quickstart ```bash - docker run -d -it -v ~/archivebox:/data -p 8000:8000 archivebox/archivebox server --init 0.0.0.0:8000 - docker run -v ~/archivebox:/data -it archivebox/archivebox manage createsuperuser - docker run -v ~/archivebox:/data -it archivebox/archivebox add 'https://example.com' + # 1. Create a folder somewhere to hold your ArchiveBox data + mkdir ~/archivebox && cd ~/archivebox + docker run -v $PWD:/data -it archivebox/archivebox init - open http://127.0.0.1:8000/admin/login/ # then click "Add" in the navbar + # 2. Archive some URLs to get started + docker run -v $PWD:/data -t archivebox/archivebox add https://github.com/ArchiveBox/ArchiveBox + docker run -v $PWD:/data -t archivebox/archivebox add --depth=1 https://example.com + + # 3. Then view the snapshots of the URLs you added via the self-hosted web UI + docker run -v $PWD:/data -it archivebox/archivebox manage createsuperuser # create an admin acct + docker run -v $PWD:/data -p 8000:8000 archivebox/archivebox # start the web server + open http://127.0.0.1:8000/ # open the interactive admin panel + ls archive/*/index.html # or just browse snapshots on disk ```

- [DEMO: archivebox.zervice.io/](https://archivebox.zervice.io) - For more information, see the [full Quickstart guide](https://github.com/ArchiveBox/ArchiveBox/wiki/Quickstart), [Usage](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage), and [Configuration](https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration) docs. + DEMO: archivebox.zervice.io/ + For more information, see the full Quickstart guide, Usage, and Configuration docs.
--- @@ -82,7 +90,7 @@ Description:
open http://127.0.0.1:8000 ``` - The CLI is considered "stable", and the ArchiveBox Python API and REST APIs are in "beta". + The CLI is considered "stable", the ArchiveBox Python API and REST APIs are in "beta", and the [desktop app](https://github.com/ArchiveBox/desktop) is in "alpha" stage. At the end of the day, the goal is to sleep soundly knowing that the part of the internet you care about will be automatically preserved in multiple, durable long-term formats that will be accessible for decades (or longer). You can also self-host your archivebox server on a public domain to provide archive.org-style public access to your site snapshots. @@ -331,7 +339,7 @@ Description:
▶ **Check out our [community page](https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community) for an index of web archiving initiatives and projects.** - The aim of ArchiveBox is to go beyond what the Wayback Machine and other public archiving services can do, by adding a headless browser to replay sessions accurately, and by automatically extracting all the content in multiple redundant formats that will survive being passed down to historians and archivists through many generations. + comparison The aim of ArchiveBox is to go beyond what the Wayback Machine and other public archiving services can do, by adding a headless browser to replay sessions accurately, and by automatically extracting all the content in multiple redundant formats that will survive being passed down to historians and archivists through many generations. #### User Interface & Intended Purpose @@ -362,7 +370,7 @@ Description:
_A collection of the most active internet archiving communities and initiatives._ - Check out the ArchiveBox [Roadmap](https://github.com/ArchiveBox/ArchiveBox/wiki/Roadmap) and [Changelog](https://github.com/ArchiveBox/ArchiveBox/wiki/Changelog) - Learn why archiving the internet is important by reading the "[On the Importance of Web Archiving](https://parameters.ssrc.org/2018/09/on-the-importance-of-web-archiving/)" blog post. - - Or reach out to me for questions and comments via [@theSquashSH](https://twitter.com/thesquashSH) on Twitter. + - Or reach out to me for questions and comments via [@ArchiveBoxApp](https://twitter.com/ArchiveBoxApp) or [@theSquashSH](https://twitter.com/thesquashSH) on Twitter. --- @@ -494,7 +502,7 @@ Description:

- +

diff --git a/archivebox.egg-info/SOURCES.txt b/archivebox.egg-info/SOURCES.txt index eee55cc5..8f0d5d48 100644 --- a/archivebox.egg-info/SOURCES.txt +++ b/archivebox.egg-info/SOURCES.txt @@ -6,6 +6,7 @@ archivebox/LICENSE archivebox/README.md archivebox/__init__.py archivebox/__main__.py +archivebox/base32_crockford.py archivebox/config.py archivebox/config_stubs.py archivebox/logging_util.py diff --git a/archivebox/base32_crockford.py b/archivebox/base32_crockford.py new file mode 100644 index 00000000..bafb69b4 --- /dev/null +++ b/archivebox/base32_crockford.py @@ -0,0 +1,172 @@ +""" +base32-crockford +================ + +A Python module implementing the alternate base32 encoding as described +by Douglas Crockford at: http://www.crockford.com/wrmg/base32.html. + +He designed the encoding to: + + * Be human and machine readable + * Be compact + * Be error resistant + * Be pronounceable + +It uses a symbol set of 10 digits and 22 letters, excluding I, L, O and +U. Decoding is not case sensitive, and 'i' and 'l' are converted to '1' +and 'o' is converted to '0'. Encoding uses only upper-case characters. + +Hyphens may be present in symbol strings to improve readability, and +are removed when decoding. + +A check symbol can be appended to a symbol string to detect errors +within the string. 
+ +""" + +import re +import sys + +PY3 = sys.version_info[0] == 3 + +if not PY3: + import string as str + + +__all__ = ["encode", "decode", "normalize"] + + +if PY3: + string_types = str, +else: + string_types = basestring, + +# The encoded symbol space does not include I, L, O or U +symbols = '0123456789ABCDEFGHJKMNPQRSTVWXYZ' +# These five symbols are exclusively for checksum values +check_symbols = '*~$=U' + +encode_symbols = dict((i, ch) for (i, ch) in enumerate(symbols + check_symbols)) +decode_symbols = dict((ch, i) for (i, ch) in enumerate(symbols + check_symbols)) +normalize_symbols = str.maketrans('IiLlOo', '111100') +valid_symbols = re.compile('^[%s]+[%s]?$' % (symbols, + re.escape(check_symbols))) + +base = len(symbols) +check_base = len(symbols + check_symbols) + + +def encode(number, checksum=False, split=0): + """Encode an integer into a symbol string. + + A ValueError is raised on invalid input. + + If checksum is set to True, a check symbol will be + calculated and appended to the string. + + If split is specified, the string will be divided into + clusters of that size separated by hyphens. + + The encoded string is returned. 
+ """ + number = int(number) + if number < 0: + raise ValueError("number '%d' is not a positive integer" % number) + + split = int(split) + if split < 0: + raise ValueError("split '%d' is not a positive integer" % split) + + check_symbol = '' + if checksum: + check_symbol = encode_symbols[number % check_base] + + if number == 0: + return '0' + check_symbol + + symbol_string = '' + while number > 0: + remainder = number % base + number //= base + symbol_string = encode_symbols[remainder] + symbol_string + symbol_string = symbol_string + check_symbol + + if split: + chunks = [] + for pos in range(0, len(symbol_string), split): + chunks.append(symbol_string[pos:pos + split]) + symbol_string = '-'.join(chunks) + + return symbol_string + + +def decode(symbol_string, checksum=False, strict=False): + """Decode an encoded symbol string. + + If checksum is set to True, the string is assumed to have a + trailing check symbol which will be validated. If the + checksum validation fails, a ValueError is raised. + + If strict is set to True, a ValueError is raised if the + normalization step requires changes to the string. + + The decoded integer is returned. + """ + symbol_string = normalize(symbol_string, strict=strict) + if checksum: + symbol_string, check_symbol = symbol_string[:-1], symbol_string[-1] + + number = 0 + for symbol in symbol_string: + number = number * base + decode_symbols[symbol] + + if checksum: + check_value = decode_symbols[check_symbol] + modulo = number % check_base + if check_value != modulo: + raise ValueError("invalid check symbol '%s' for string '%s'" % + (check_symbol, symbol_string)) + + return number + + +def normalize(symbol_string, strict=False): + """Normalize an encoded symbol string. + + Normalization provides error correction and prepares the + string for decoding. These transformations are applied: + + 1. Hyphens are removed + 2. 'I', 'i', 'L' or 'l' are converted to '1' + 3. 'O' or 'o' are converted to '0' + 4. 
All characters are converted to uppercase + + A TypeError is raised if an invalid string type is provided. + + A ValueError is raised if the normalized string contains + invalid characters. + + If the strict parameter is set to True, a ValueError is raised + if any of the above transformations are applied. + + The normalized string is returned. + """ + if isinstance(symbol_string, string_types): + if not PY3: + try: + symbol_string = symbol_string.encode('ascii') + except UnicodeEncodeError: + raise ValueError("string should only contain ASCII characters") + else: + raise TypeError("string is of invalid type %s" % + symbol_string.__class__.__name__) + + norm_string = symbol_string.replace('-', '').translate(normalize_symbols).upper() + + if not valid_symbols.match(norm_string): + raise ValueError("string '%s' contains invalid characters" % norm_string) + + if strict and norm_string != symbol_string: + raise ValueError("string '%s' requires normalization" % symbol_string) + + return norm_string diff --git a/archivebox/util.py b/archivebox/util.py index ae827899..4e55e30d 100644 --- a/archivebox/util.py +++ b/archivebox/util.py @@ -16,7 +16,7 @@ from dateparser import parse as dateparser import requests from requests.exceptions import RequestException, ReadTimeout -from base32_crockford import encode as base32_encode # type: ignore +from .base32_crockford import encode as base32_encode # type: ignore from w3lib.encoding import html_body_declared_encoding, http_content_type_encoding try: diff --git a/assets/css/style.scss b/assets/css/style.scss index a4bd9890..44657267 100644 --- a/assets/css/style.scss +++ b/assets/css/style.scss @@ -6,11 +6,13 @@ div.shell { width: 80%; max-width: 1300px; + min-width: 300px; } span.banner-fix { width: 80%; max-width: 1300px; + min-width: 300px; } header h1 { diff --git a/bin/build.sh b/bin/build.sh index 7b1c3232..693c2bbe 100755 --- a/bin/build.sh +++ b/bin/build.sh @@ -16,6 +16,7 @@ cd "$REPO_DIR" ./bin/build_docs.sh 
./bin/build_pip.sh +./bin/build_deb.sh ./bin/build_docker.sh echo "[√] Done. Install the built package by running:" diff --git a/bin/build_deb.sh b/bin/build_deb.sh new file mode 100755 index 00000000..6f5e418c --- /dev/null +++ b/bin/build_deb.sh @@ -0,0 +1,42 @@ +#!/usr/bin/env bash + +### Bash Environment Setup +# http://redsymbol.net/articles/unofficial-bash-strict-mode/ +# https://www.gnu.org/software/bash/manual/html_node/The-Set-Builtin.html +# set -o xtrace +set -o errexit +set -o errtrace +set -o nounset +set -o pipefail +IFS=$'\n' + +REPO_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && cd .. && pwd )" + +source "$REPO_DIR/.venv/bin/activate" +cd "$REPO_DIR" + +VERSION="$(jq -r '.version' < "$REPO_DIR/package.json")" +DEBIAN_VERSION="1" +PGP_KEY_ID="7D5695D3B618872647861D51C38137A7C1675988" +# make sure you have this in ~/.dput.cf: +# [archivebox-ppa] +# fqdn: ppa.launchpad.net +# method: ftp +# incoming: ~archivebox/ubuntu/archivebox/ +# login: anonymous +# allow_unsigned_uploads: 0 + + +# cleanup build artifacts +rm -Rf build deb_dist dist archivebox-*.tar.gz + +# build source and binary packages +python3 setup.py --command-packages=stdeb.command \ + sdist_dsc --debian-version=$DEBIAN_VERSION \ + bdist_deb + +# sign the build with your PGP key ID +debsign -k "$PGP_KEY_ID" "deb_dist/archivebox_${VERSION}-${DEBIAN_VERSION}_source.changes" + +# push the build to launchpad ppa +# dput archivebox "deb_dist/archivebox_${VERSION}-${DEBIAN_VERSION}_source.changes" diff --git a/bin/release.sh b/bin/release.sh index f01eb1d3..d9c9b52d 100755 --- a/bin/release.sh +++ b/bin/release.sh @@ -42,6 +42,7 @@ echo "${contents}" > package.json # Build docs, python package, and docker image ./bin/build_docs.sh ./bin/build_pip.sh +./bin/build_deb.sh ./bin/build_docker.sh @@ -64,11 +65,14 @@ python3 -m twine upload --repository testpypi dist/* echo "[^] Uploading to pypi.org" python3 -m twine upload --repository pypi dist/* +echo "[^] Uploading to 
launchpad.net" +python3 -m dput archivebox "deb_dist/archivebox_${NEW_VERSION}-1_source.changes" + echo "[^] Uploading docker image" # docker login --username=nikisweeting # docker login docker.pkg.github.com --username=pirate docker push docker.io/nikisweeting/archivebox docker push docker.io/archivebox/archivebox -docker push docker.pkg.github.com/pirate/archivebox/archivebox +docker push docker.pkg.github.com/archivebox/archivebox/archivebox echo "[√] Done. Published version v$NEW_VERSION" diff --git a/icon.png b/icon.png new file mode 100644 index 00000000..04462b21 Binary files /dev/null and b/icon.png differ diff --git a/package.json b/package.json index f342ece6..70f58f61 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "archivebox", - "version": "0.4.21", + "version": "0.4.22", "description": "ArchiveBox: The self-hosted internet archive", "author": "Nick Sweeting ", "license": "MIT", diff --git a/setup.py b/setup.py index af643c9e..6b40b803 100755 --- a/setup.py +++ b/setup.py @@ -51,7 +51,6 @@ setuptools.setup( "requests==2.24.0", "atomicwrites==1.4.0", "mypy-extensions==0.4.3", - "base32-crockford==0.3.0", "django==3.0.8", "django-extensions==3.0.3", @@ -80,6 +79,7 @@ setuptools.setup( "recommonmark", "pytest", "bottle", + "stdeb", ], # 'redis': ['redis', 'django-redis'], # 'pywb': ['pywb', 'redis'], diff --git a/stdeb.cfg b/stdeb.cfg new file mode 100644 index 00000000..6eaa8f2d --- /dev/null +++ b/stdeb.cfg @@ -0,0 +1,6 @@ +[DEFAULT] +Package: archivebox +Suite: focal +Build-Depends: dh-python +Depends: nodejs, chromium-browser, wget, curl, git, ffmpeg, youtube-dl, python3-atomicwrites, python3-croniter, python3-crontab, python3-dateparser, python3-django, python3-django-extensions, python3-mypy-extensions, python3-requests, python3-w3lib +XS-Python-Version: >= 3.7