diff --git a/.dockerignore b/.dockerignore index d8810a34..44d23b6f 100644 --- a/.dockerignore +++ b/.dockerignore @@ -1,7 +1,16 @@ -output -__pycache__ .DS_Store -venv -.venv -.docker-venv -data +._* +*.pyc +__pycache__/ +.mypy_cache/ + +venv/ +.venv/ +.docker-venv/ + +*.egg-info/ +build/ +dist/ + +data/ +output/ diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md index 76273c54..5eeae85b 100644 --- a/.github/CONTRIBUTING.md +++ b/.github/CONTRIBUTING.md @@ -1 +1,40 @@ -Make sure check in with me first or confirm your desired features line up with our roadmap: https://github.com/pirate/ArchiveBox#roadmap +# Contribution Process + +1. Confirm your desired features fit into our bigger project goals roadmap: https://github.com/pirate/ArchiveBox#roadmap +2. Open an issue with your planned implementation to discuss +3. Check in with me before starting development to make sure your work won't conflict with or duplicate existing work +4. Set up your dev environment, make some changes, and test using the test input files +5. Commit, push, and submit a PR and wait for review feedback +6. Have patience, don't abandon your PR! We love contributors but we all have day jobs and don't always have time to respond to notifications instantly. If you want a faster response, ping @theSquashSH on twitter or Patreon. + +**Useful links:** + +- https://github.com/pirate/ArchiveBox/issues +- https://github.com/pirate/ArchiveBox/pulls +- https://github.com/pirate/ArchiveBox/wiki/Roadmap +- https://github.com/pirate/ArchiveBox/wiki/Install#manual-setup + +### Development Setup + +```bash +git clone https://github.com/pirate/ArchiveBox +cd ArchiveBox +# Optionally create a virtualenv +pip install -r requirements.txt +pip install -e . 
+``` + +### Running Tests + +```bash +./bin/archive tests/* +# look for errors in stdout/stderr +# then confirm output html looks right + +# if on >v0.4 run the django test suite: +archivebox manage test +``` + +### Getting Help + +Open issues on Github or contact me https://sweeting.me/#contact. diff --git a/.github/FUNDING.yml b/.github/FUNDING.yml new file mode 100644 index 00000000..766165b2 --- /dev/null +++ b/.github/FUNDING.yml @@ -0,0 +1,3 @@ +github: pirate +patreon: theSquashSH +custom: ["https://paypal.me/NicholasSweeting", "https://www.blockchain.com/eth/address/0x5D4c34D4a121Fe08d1dDB7969F07550f2dB9f471", "https://www.blockchain.com/btc/address/1HuxXriPE2Bbnag3jJrqa3bkNHrs297dYH"] diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md index b350fb28..c2bf8b23 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.md +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -1,30 +1,41 @@ --- name: 🐞 Bug report about: Create a report to help us improve -title: '' -labels: '' +title: 'Bugfix: ...' +labels: 'changes: bugfixes' assignees: '' --- -(please fill out the following information, feel free to delete sections if they're not applicable) + -## Describe the bug -A description of what the bug is, what you expected to happen, +#### Describe the bug + -## Steps to reproduce - +#### Steps to reproduce + -## Screenshots or log output +#### Screenshots or log output + -## Software versions +#### Software versions - OS: ([e.g. macOS 10.14] the operating system you're running ArchiveBox on) - ArchiveBox version: (`git rev-parse HEAD | head -c7` [e.g. 
d798117] commit ID of the version you're running) diff --git a/.github/ISSUE_TEMPLATE/documentation_change.md b/.github/ISSUE_TEMPLATE/documentation_change.md index dc3c2741..a02e9374 100644 --- a/.github/ISSUE_TEMPLATE/documentation_change.md +++ b/.github/ISSUE_TEMPLATE/documentation_change.md @@ -1,15 +1,16 @@ --- name: 📑 Documentation change about: Submit a suggestion for the Wiki documentation -title: '' +title: 'Documentation: Improvement request ...' labels: '' assignees: '' --- ## Wiki Page URL + ## Suggested Edit + -... diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md index 0f9423f5..3361571d 100644 --- a/.github/ISSUE_TEMPLATE/feature_request.md +++ b/.github/ISSUE_TEMPLATE/feature_request.md @@ -1,38 +1,50 @@ --- name: 💡 Feature request about: Suggest an idea for this project -title: '' -labels: '' +title: 'Feature Request: ...' +labels: 'changes: behavior,status: idea phase' assignees: '' --- -(feel free to delete this template and write your own issue description if you don't find it helpful) + ## Type - - [ ] General Question or Disussion + - [ ] General question or discussion - [ ] Propose a brand new feature - [ ] Request modification of existing behavior or design ## What is the problem that your feature request solves + ## Describe the ideal specific solution you'd want, and whether it fits into any broader scope of changes -e.g. I specifically need a new archive method to look for multilingual subtitle files related to pages. + ## What hacks or alternative solutions have you tried to solve the problem? -A clear and concise description of any alternative solutions or features you've considered. + ## How badly do you want this new feature? 
- - [ ] It's an urgent deal-breaker, I cant live without it + - [ ] It's an urgent deal-breaker, I can't live without it - [ ] It's important to add it in the near-mid term future - [ ] It would be nice to have eventually --- - - [ ] I'm willing to contribute to development / fixing this issue + - [ ] I'm willing to contribute dev time / money to fix this issue - [ ] I like ArchiveBox so far / would recommend it to a friend + - [ ] I've had a lot of difficulty getting ArchiveBox set up diff --git a/.github/ISSUE_TEMPLATE/question_or_discussion.md b/.github/ISSUE_TEMPLATE/question_or_discussion.md new file mode 100644 index 00000000..4b7fb02f --- /dev/null +++ b/.github/ISSUE_TEMPLATE/question_or_discussion.md @@ -0,0 +1,9 @@ +--- +name: 💬 Question, discussion, or support request +about: Start a discussion or ask a question about ArchiveBox +title: 'Question: ...' +labels: '' +assignees: '' + +--- + diff --git a/.gitignore b/.gitignore index 7afd85be..44d23b6f 100644 --- a/.gitignore +++ b/.gitignore @@ -1,27 +1,16 @@ -# OS cruft .DS_Store ._* - -# python +*.pyc __pycache__/ .mypy_cache/ -venv -.venv -archivebox/.venv -archivebox/venv -archivebox/docs/_build -# vim -.swp* +venv/ +.venv/ +.docker-venv/ -# output artifacts -output -output/ -data -data/ -archivebox/output -archivebox/data - -archivebox.egg-info/ +*.egg-info/ build/ dist/ + +data/ +output/ diff --git a/Dockerfile b/Dockerfile index af75b709..2bad4144 100644 --- a/Dockerfile +++ b/Dockerfile @@ -2,20 +2,21 @@ # git, curl, wget, python3, youtube-dl, google-chrome-stable, ArchiveBox # Usage: # docker build . 
-t archivebox:latest -# docker run -v=./data:/data archivebox:latest init -# docker run -v=./data:/data archivebox:latest add 'https://example.com' +# docker run -v=$PWD/data:/data archivebox:latest archivebox init +# echo 'https://example.com' | docker run -v=$PWD/data:/data -i archivebox:latest archivebox add # Documentation: # https://github.com/pirate/ArchiveBox/wiki/Docker#docker FROM python:3.8-slim-buster + LABEL name="archivebox" \ maintainer="Nick Sweeting " \ - version="0.4.3" \ description="All-in-one personal internet archiving container" -ENV LANG=C.UTF-8 \ +ENV TZ=UTC \ LANGUAGE=en_US:en \ LC_ALL=C.UTF-8 \ + LANG=C.UTF-8 \ PYTHONIOENCODING=UTF-8 \ PYTHONUNBUFFERED=1 \ APT_KEY_DONT_WARN_ON_DANGEROUS_USAGE=1 \ @@ -23,15 +24,15 @@ ENV LANG=C.UTF-8 \ VENV_PATH=/venv \ DATA_PATH=/data -# Install latest chrome package and fonts to support major charsets (Chinese, Japanese, Arabic, Hebrew, Thai and a few others) +# First install CLI utils and base deps, then Chrome + Fonts RUN echo 'debconf debconf/frontend select Noninteractive' | debconf-set-selections \ && apt-get update -qq \ && apt-get install -qq -y --no-install-recommends \ - apt-transport-https ca-certificates apt-utils gnupg gnupg2 libgconf-2-4 zlib1g-dev dumb-init \ - wget curl youtube-dl jq git ffmpeg avconv \ - && curl -sSL https://dl.google.com/linux/linux_signing_key.pub | apt-key add - \ + apt-transport-https ca-certificates apt-utils gnupg gnupg2 libgconf-2-4 zlib1g-dev \ + dumb-init jq git wget curl youtube-dl ffmpeg \ + && curl -sSL "https://dl.google.com/linux/linux_signing_key.pub" | apt-key add - \ && echo "deb https://dl.google.com/linux/chrome/deb/ stable main" > /etc/apt/sources.list.d/google-chrome.list \ - && apt-get update \ + && apt-get update -qq \ && apt-get install -qq -y --no-install-recommends \ google-chrome-stable \ fontconfig \ @@ -42,36 +43,27 @@ RUN echo 'debconf debconf/frontend select Noninteractive' | debconf-set-selectio fonts-symbola \ fonts-noto \ fonts-freefont-ttf 
\ - && rm -rf /var/lib/apt/lists/* - -# Add user so we don't need --no-sandbox to run chrome -RUN groupadd -r archivebox && useradd -r -g archivebox -G audio,video archivebox \ - && mkdir -p /home/archivebox/Downloads \ - && chown -R archivebox:archivebox /home/archivebox - -WORKDIR "$CODE_PATH" -ADD . "$CODE_PATH" -VOLUME "$CODE_PATH" -RUN chown -R archivebox:archivebox "$CODE_PATH" - -ENV PATH="$VENV_PATH/bin:${PATH}" -RUN python --version \ - && python -m venv "$VENV_PATH" \ - && pip install --upgrade pip \ - && pip install -e . \ - && chown -R archivebox:archivebox "$VENV_PATH" - -WORKDIR "$DATA_PATH" -VOLUME "$DATA_PATH" -RUN chown -R archivebox:archivebox "$DATA_PATH" + && rm -rf /var/lib/apt/lists/* \ + && pip install --upgrade --no-cache-dir pip setuptools # Run everything from here on out as non-privileged user -USER archivebox +RUN groupadd --system archivebox \ + && useradd --system --gid archivebox --groups audio,video archivebox + +ADD . "$CODE_PATH" +WORKDIR "$CODE_PATH" +ENV PATH="$VENV_PATH/bin:${PATH}" +RUN python -m venv --clear --symlinks "$VENV_PATH" \ + && pip install -e . 
+ +VOLUME "$DATA_PATH" +WORKDIR "$DATA_PATH" +USER archivebox:archivebox +EXPOSE 8000 ENV CHROME_BINARY=google-chrome \ - CHROME_SANDBOX=False \ - OUTPUT_DIR="$DATA_PATH" + CHROME_SANDBOX=False RUN archivebox version -ENTRYPOINT ["dumb-init", "--"] -CMD ["archivebox"] +ENTRYPOINT ["dumb-init", "--", "archivebox"] +CMD ["server", "0.0.0.0:8000"] diff --git a/Pipfile b/Pipfile index 7c7e05ce..0359bc1d 100644 --- a/Pipfile +++ b/Pipfile @@ -3,27 +3,13 @@ name = "pypi" url = "https://pypi.org/simple" verify_ssl = true -[dev-packages] -ipdb = "*" -flake8 = "*" -mypy = "*" -django-stubs = "*" -setuptools = "*" -sphinx = "*" -recommonmark = "*" -sphinx-rtd-theme = "*" -twine = "*" +[requires] +python_version = "3.8" [packages] -dataclasses = "*" -base32-crockford = "*" -django = "*" -django-extensions = "*" -youtube-dl = "*" -python-crontab = "*" -croniter = "*" -ipython = "*" -mypy-extensions = "*" +# see setup.py for package dependency list +"e1839a8" = {path = ".", editable = true} -[requires] -python_version = "3.7" +[dev-packages] +# see setup.py for dev package dependency list +"e1839a8" = {path = ".", extras = ["dev"], editable = true} diff --git a/Pipfile.lock b/Pipfile.lock deleted file mode 100644 index 64a9bae2..00000000 --- a/Pipfile.lock +++ /dev/null @@ -1,693 +0,0 @@ -{ - "_meta": { - "hash": { - "sha256": "5a1618caef76ff53b66c5e8674d8e639d25f75068f7026ad799e217d307628fc" - }, - "pipfile-spec": 6, - "requires": { - "python_version": "3.7" - }, - "sources": [ - { - "name": "pypi", - "url": "https://pypi.org/simple", - "verify_ssl": true - } - ] - }, - "default": { - "appnope": { - "hashes": [ - "sha256:5b26757dc6f79a3b7dc9fab95359328d5747fcb2409d331ea66d0272b90ab2a0", - "sha256:8b995ffe925347a2138d7ac0fe77155e4311a0ea6d6da4f5128fe4b3cbe5ed71" - ], - "markers": "sys_platform == 'darwin'", - "version": "==0.1.0" - }, - "backcall": { - "hashes": [ - "sha256:38ecd85be2c1e78f77fd91700c76e14667dc21e2713b63876c0eb901196e01e4", - 
"sha256:bbbf4b1e5cd2bdb08f915895b51081c041bac22394fdfcfdfbe9f14b77c08bf2" - ], - "version": "==0.1.0" - }, - "base32-crockford": { - "hashes": [ - "sha256:115f5bd32ae32b724035cb02eb65069a8824ea08c08851eb80c8b9f63443a969", - "sha256:295ef5ffbf6ed96b6e739ffd36be98fa7e90a206dd18c39acefb15777eedfe6e" - ], - "index": "pypi", - "version": "==0.3.0" - }, - "croniter": { - "hashes": [ - "sha256:0d905dbe6f131a910fd3dde792f0129788cd2cb3a8048c5f7aaa212670b0cef2", - "sha256:538adeb3a7f7816c3cdec6db974c441620d764c25ff4ed0146ee7296b8a50590" - ], - "index": "pypi", - "version": "==0.3.30" - }, - "dataclasses": { - "hashes": [ - "sha256:454a69d788c7fda44efd71e259be79577822f5e3f53f029a22d08004e951dc9f", - "sha256:6988bd2b895eef432d562370bb707d540f32f7360ab13da45340101bc2307d84" - ], - "index": "pypi", - "version": "==0.6" - }, - "decorator": { - "hashes": [ - "sha256:86156361c50488b84a3f148056ea716ca587df2f0de1d34750d35c21312725de", - "sha256:f069f3a01830ca754ba5258fde2278454a0b5b79e0d7f5c13b3b97e57d4acff6" - ], - "version": "==4.4.0" - }, - "django": { - "hashes": [ - "sha256:6fcc3cbd55b16f9a01f37de8bcbe286e0ea22e87096557f1511051780338eaea", - "sha256:bb407d0bb46395ca1241f829f5bd03f7e482f97f7d1936e26e98dacb201ed4ec" - ], - "index": "pypi", - "version": "==2.2.1" - }, - "django-extensions": { - "hashes": [ - "sha256:109004f80b6f45ad1f56addaa59debca91d94aa0dc1cb19678b9364b4fe9b6f4", - "sha256:307766e5e6c1caffe76c5d99239d8115d14ae3f7cab2cd991fcffd763dad904b" - ], - "index": "pypi", - "version": "==2.1.6" - }, - "ipython": { - "hashes": [ - "sha256:54c5a8aa1eadd269ac210b96923688ccf01ebb2d0f21c18c3c717909583579a8", - "sha256:e840810029224b56cd0d9e7719dc3b39cf84d577f8ac686547c8ba7a06eeab26" - ], - "index": "pypi", - "version": "==7.5.0" - }, - "ipython-genutils": { - "hashes": [ - "sha256:72dd37233799e619666c9f639a9da83c34013a73e8bbc79a7a6348d93c61fab8", - "sha256:eb2e116e75ecef9d4d228fdc66af54269afa26ab4463042e33785b887c628ba8" - ], - "version": "==0.2.0" - }, - "jedi": { - "hashes": [ 
- "sha256:2bb0603e3506f708e792c7f4ad8fc2a7a9d9c2d292a358fbbd58da531695595b", - "sha256:2c6bcd9545c7d6440951b12b44d373479bf18123a401a52025cf98563fbd826c" - ], - "version": "==0.13.3" - }, - "mypy-extensions": { - "hashes": [ - "sha256:37e0e956f41369209a3d5f34580150bcacfabaa57b33a15c0b25f4b5725e0812", - "sha256:b16cabe759f55e3409a7d231ebd2841378fb0c27a5d1994719e340e4f429ac3e" - ], - "index": "pypi", - "version": "==0.4.1" - }, - "parso": { - "hashes": [ - "sha256:17cc2d7a945eb42c3569d4564cdf49bde221bc2b552af3eca9c1aad517dcdd33", - "sha256:2e9574cb12e7112a87253e14e2c380ce312060269d04bd018478a3c92ea9a376" - ], - "version": "==0.4.0" - }, - "pexpect": { - "hashes": [ - "sha256:2094eefdfcf37a1fdbfb9aa090862c1a4878e5c7e0e7e7088bdb511c558e5cd1", - "sha256:9e2c1fd0e6ee3a49b28f95d4b33bc389c89b20af6a1255906e90ff1262ce62eb" - ], - "markers": "sys_platform != 'win32'", - "version": "==4.7.0" - }, - "pickleshare": { - "hashes": [ - "sha256:87683d47965c1da65cdacaf31c8441d12b8044cdec9aca500cd78fc2c683afca", - "sha256:9649af414d74d4df115d5d718f82acb59c9d418196b7b4290ed47a12ce62df56" - ], - "version": "==0.7.5" - }, - "prompt-toolkit": { - "hashes": [ - "sha256:11adf3389a996a6d45cc277580d0d53e8a5afd281d0c9ec71b28e6f121463780", - "sha256:2519ad1d8038fd5fc8e770362237ad0364d16a7650fb5724af6997ed5515e3c1", - "sha256:977c6583ae813a37dc1c2e1b715892461fcbdaa57f6fc62f33a528c4886c8f55" - ], - "version": "==2.0.9" - }, - "ptyprocess": { - "hashes": [ - "sha256:923f299cc5ad920c68f2bc0bc98b75b9f838b93b599941a6b63ddbc2476394c0", - "sha256:d7cc528d76e76342423ca640335bd3633420dc1366f258cb31d05e865ef5ca1f" - ], - "version": "==0.6.0" - }, - "pygments": { - "hashes": [ - "sha256:5ffada19f6203563680669ee7f53b64dabbeb100eb51b61996085e99c03b284a", - "sha256:e8218dd399a61674745138520d0d4cf2621d7e032439341bc3f647bff125818d" - ], - "version": "==2.3.1" - }, - "python-crontab": { - "hashes": [ - "sha256:91ce4b245ee5e5c117aa0b21b485bc43f2d80df854a36e922b707643f50d7923" - ], - "index": "pypi", - "version": 
"==2.3.6" - }, - "python-dateutil": { - "hashes": [ - "sha256:7e6584c74aeed623791615e26efd690f29817a27c73085b78e4bad02493df2fb", - "sha256:c89805f6f4d64db21ed966fda138f8a5ed7a4fdbc1a8ee329ce1b74e3c74da9e" - ], - "version": "==2.8.0" - }, - "pytz": { - "hashes": [ - "sha256:303879e36b721603cc54604edcac9d20401bdbe31e1e4fdee5b9f98d5d31dfda", - "sha256:d747dd3d23d77ef44c6a3526e274af6efeb0a6f1afd5a69ba4d5be4098c8e141" - ], - "version": "==2019.1" - }, - "six": { - "hashes": [ - "sha256:3350809f0555b11f552448330d0b52d5f24c91a322ea4a15ef22629740f3761c", - "sha256:d16a0141ec1a18405cd4ce8b4613101da75da0e9a7aec5bdd4fa804d0e0eba73" - ], - "version": "==1.12.0" - }, - "sqlparse": { - "hashes": [ - "sha256:40afe6b8d4b1117e7dff5504d7a8ce07d9a1b15aeeade8a2d10f130a834f8177", - "sha256:7c3dca29c022744e95b547e867cee89f4fce4373f3549ccd8797d8eb52cdb873" - ], - "version": "==0.3.0" - }, - "traitlets": { - "hashes": [ - "sha256:9c4bd2d267b7153df9152698efb1050a5d84982d3384a37b2c1f7723ba3e7835", - "sha256:c6cb5e6f57c5a9bdaa40fa71ce7b4af30298fbab9ece9815b5d995ab6217c7d9" - ], - "version": "==4.3.2" - }, - "wcwidth": { - "hashes": [ - "sha256:3df37372226d6e63e1b1e1eda15c594bca98a22d33a23832a90998faa96bc65e", - "sha256:f4ebe71925af7b40a864553f761ed559b43544f8f71746c2d756c7fe788ade7c" - ], - "version": "==0.1.7" - }, - "youtube-dl": { - "hashes": [ - "sha256:31844229a4f4d7003e03ab309ff2caff1b16ce0acbd3cfb7a13276058af13056", - "sha256:a751bd293e2d7ee963910de14b3eb95b88837021899be488fade0b8abe815650" - ], - "index": "pypi", - "version": "==2019.4.30" - } - }, - "develop": { - "alabaster": { - "hashes": [ - "sha256:446438bdcca0e05bd45ea2de1668c1d9b032e1a9154c2c259092d77031ddd359", - "sha256:a661d72d58e6ea8a57f7a86e37d86716863ee5e92788398526d58b26a4e4dc02" - ], - "version": "==0.7.12" - }, - "appnope": { - "hashes": [ - "sha256:5b26757dc6f79a3b7dc9fab95359328d5747fcb2409d331ea66d0272b90ab2a0", - "sha256:8b995ffe925347a2138d7ac0fe77155e4311a0ea6d6da4f5128fe4b3cbe5ed71" - ], - "markers": 
"sys_platform == 'darwin'", - "version": "==0.1.0" - }, - "babel": { - "hashes": [ - "sha256:6778d85147d5d85345c14a26aada5e478ab04e39b078b0745ee6870c2b5cf669", - "sha256:8cba50f48c529ca3fa18cf81fa9403be176d374ac4d60738b839122dfaaa3d23" - ], - "version": "==2.6.0" - }, - "backcall": { - "hashes": [ - "sha256:38ecd85be2c1e78f77fd91700c76e14667dc21e2713b63876c0eb901196e01e4", - "sha256:bbbf4b1e5cd2bdb08f915895b51081c041bac22394fdfcfdfbe9f14b77c08bf2" - ], - "version": "==0.1.0" - }, - "bleach": { - "hashes": [ - "sha256:213336e49e102af26d9cde77dd2d0397afabc5a6bf2fed985dc35b5d1e285a16", - "sha256:3fdf7f77adcf649c9911387df51254b813185e32b2c6619f690b593a617e19fa" - ], - "version": "==3.1.0" - }, - "certifi": { - "hashes": [ - "sha256:59b7658e26ca9c7339e00f8f4636cdfe59d34fa37b9b04f6f9e9926b3cece1a5", - "sha256:b26104d6835d1f5e49452a26eb2ff87fe7090b89dfcaee5ea2212697e1e1d7ae" - ], - "version": "==2019.3.9" - }, - "chardet": { - "hashes": [ - "sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae", - "sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691" - ], - "version": "==3.0.4" - }, - "commonmark": { - "hashes": [ - "sha256:14c3df31e8c9c463377e287b2a1eefaa6019ab97b22dad36e2f32be59d61d68d", - "sha256:867fc5db078ede373ab811e16b6789e9d033b15ccd7296f370ca52d1ee792ce0" - ], - "version": "==0.9.0" - }, - "decorator": { - "hashes": [ - "sha256:86156361c50488b84a3f148056ea716ca587df2f0de1d34750d35c21312725de", - "sha256:f069f3a01830ca754ba5258fde2278454a0b5b79e0d7f5c13b3b97e57d4acff6" - ], - "version": "==4.4.0" - }, - "django-stubs": { - "hashes": [ - "sha256:9c06a4b28fc8c18f6abee4f199f8ee29cb5cfcecf349e912ded31cb3526ea2b6", - "sha256:9ef230843a24b5d74f2ebd4c60f9bea09c21911bc119d0325e8bb47e2f495e70" - ], - "index": "pypi", - "version": "==0.12.1" - }, - "docutils": { - "hashes": [ - "sha256:02aec4bd92ab067f6ff27a38a38a41173bf01bed8f89157768c1573f53e474a6", - "sha256:51e64ef2ebfb29cae1faa133b3710143496eca21c530f3f71424d77687764274", - 
"sha256:7a4bd47eaf6596e1295ecb11361139febe29b084a87bf005bf899f9a42edc3c6" - ], - "version": "==0.14" - }, - "entrypoints": { - "hashes": [ - "sha256:589f874b313739ad35be6e0cd7efde2a4e9b6fea91edcc34e58ecbb8dbe56d19", - "sha256:c70dd71abe5a8c85e55e12c19bd91ccfeec11a6e99044204511f9ed547d48451" - ], - "version": "==0.3" - }, - "flake8": { - "hashes": [ - "sha256:859996073f341f2670741b51ec1e67a01da142831aa1fdc6242dbf88dffbe661", - "sha256:a796a115208f5c03b18f332f7c11729812c8c3ded6c46319c59b53efd3819da8" - ], - "index": "pypi", - "version": "==3.7.7" - }, - "future": { - "hashes": [ - "sha256:67045236dcfd6816dc439556d009594abf643e5eb48992e36beac09c2ca659b8" - ], - "version": "==0.17.1" - }, - "idna": { - "hashes": [ - "sha256:c357b3f628cf53ae2c4c05627ecc484553142ca23264e593d327bcde5e9c3407", - "sha256:ea8b7f6188e6fa117537c3df7da9fc686d485087abf6ac197f9c46432f7e4a3c" - ], - "version": "==2.8" - }, - "imagesize": { - "hashes": [ - "sha256:3f349de3eb99145973fefb7dbe38554414e5c30abd0c8e4b970a7c9d09f3a1d8", - "sha256:f3832918bc3c66617f92e35f5d70729187676313caa60c187eb0f28b8fe5e3b5" - ], - "version": "==1.1.0" - }, - "ipdb": { - "hashes": [ - "sha256:dce2112557edfe759742ca2d0fee35c59c97b0cc7a05398b791079d78f1519ce" - ], - "index": "pypi", - "version": "==0.12" - }, - "ipython": { - "hashes": [ - "sha256:54c5a8aa1eadd269ac210b96923688ccf01ebb2d0f21c18c3c717909583579a8", - "sha256:e840810029224b56cd0d9e7719dc3b39cf84d577f8ac686547c8ba7a06eeab26" - ], - "index": "pypi", - "version": "==7.5.0" - }, - "ipython-genutils": { - "hashes": [ - "sha256:72dd37233799e619666c9f639a9da83c34013a73e8bbc79a7a6348d93c61fab8", - "sha256:eb2e116e75ecef9d4d228fdc66af54269afa26ab4463042e33785b887c628ba8" - ], - "version": "==0.2.0" - }, - "jedi": { - "hashes": [ - "sha256:2bb0603e3506f708e792c7f4ad8fc2a7a9d9c2d292a358fbbd58da531695595b", - "sha256:2c6bcd9545c7d6440951b12b44d373479bf18123a401a52025cf98563fbd826c" - ], - "version": "==0.13.3" - }, - "jinja2": { - "hashes": [ - 
"sha256:065c4f02ebe7f7cf559e49ee5a95fb800a9e4528727aec6f24402a5374c65013", - "sha256:14dd6caf1527abb21f08f86c784eac40853ba93edb79552aa1e4b8aef1b61c7b" - ], - "version": "==2.10.1" - }, - "markupsafe": { - "hashes": [ - "sha256:00bc623926325b26bb9605ae9eae8a215691f33cae5df11ca5424f06f2d1f473", - "sha256:09027a7803a62ca78792ad89403b1b7a73a01c8cb65909cd876f7fcebd79b161", - "sha256:09c4b7f37d6c648cb13f9230d847adf22f8171b1ccc4d5682398e77f40309235", - "sha256:1027c282dad077d0bae18be6794e6b6b8c91d58ed8a8d89a89d59693b9131db5", - "sha256:24982cc2533820871eba85ba648cd53d8623687ff11cbb805be4ff7b4c971aff", - "sha256:29872e92839765e546828bb7754a68c418d927cd064fd4708fab9fe9c8bb116b", - "sha256:43a55c2930bbc139570ac2452adf3d70cdbb3cfe5912c71cdce1c2c6bbd9c5d1", - "sha256:46c99d2de99945ec5cb54f23c8cd5689f6d7177305ebff350a58ce5f8de1669e", - "sha256:500d4957e52ddc3351cabf489e79c91c17f6e0899158447047588650b5e69183", - "sha256:535f6fc4d397c1563d08b88e485c3496cf5784e927af890fb3c3aac7f933ec66", - "sha256:62fe6c95e3ec8a7fad637b7f3d372c15ec1caa01ab47926cfdf7a75b40e0eac1", - "sha256:6dd73240d2af64df90aa7c4e7481e23825ea70af4b4922f8ede5b9e35f78a3b1", - "sha256:717ba8fe3ae9cc0006d7c451f0bb265ee07739daf76355d06366154ee68d221e", - "sha256:79855e1c5b8da654cf486b830bd42c06e8780cea587384cf6545b7d9ac013a0b", - "sha256:7c1699dfe0cf8ff607dbdcc1e9b9af1755371f92a68f706051cc8c37d447c905", - "sha256:88e5fcfb52ee7b911e8bb6d6aa2fd21fbecc674eadd44118a9cc3863f938e735", - "sha256:8defac2f2ccd6805ebf65f5eeb132adcf2ab57aa11fdf4c0dd5169a004710e7d", - "sha256:98c7086708b163d425c67c7a91bad6e466bb99d797aa64f965e9d25c12111a5e", - "sha256:9add70b36c5666a2ed02b43b335fe19002ee5235efd4b8a89bfcf9005bebac0d", - "sha256:9bf40443012702a1d2070043cb6291650a0841ece432556f784f004937f0f32c", - "sha256:ade5e387d2ad0d7ebf59146cc00c8044acbd863725f887353a10df825fc8ae21", - "sha256:b00c1de48212e4cc9603895652c5c410df699856a2853135b3967591e4beebc2", - "sha256:b1282f8c00509d99fef04d8ba936b156d419be841854fe901d8ae224c59f0be5", - 
"sha256:b2051432115498d3562c084a49bba65d97cf251f5a331c64a12ee7e04dacc51b", - "sha256:ba59edeaa2fc6114428f1637ffff42da1e311e29382d81b339c1817d37ec93c6", - "sha256:c8716a48d94b06bb3b2524c2b77e055fb313aeb4ea620c8dd03a105574ba704f", - "sha256:cd5df75523866410809ca100dc9681e301e3c27567cf498077e8551b6d20e42f", - "sha256:e249096428b3ae81b08327a63a485ad0878de3fb939049038579ac0ef61e17e7" - ], - "version": "==1.1.1" - }, - "mccabe": { - "hashes": [ - "sha256:ab8a6258860da4b6677da4bd2fe5dc2c659cff31b3ee4f7f5d64e79735b80d42", - "sha256:dd8d182285a0fe56bace7f45b5e7d1a6ebcbf524e8f3bd87eb0f125271b8831f" - ], - "version": "==0.6.1" - }, - "mypy": { - "hashes": [ - "sha256:2afe51527b1f6cdc4a5f34fc90473109b22bf7f21086ba3e9451857cf11489e6", - "sha256:56a16df3e0abb145d8accd5dbb70eba6c4bd26e2f89042b491faa78c9635d1e2", - "sha256:5764f10d27b2e93c84f70af5778941b8f4aa1379b2430f85c827e0f5464e8714", - "sha256:5bbc86374f04a3aa817622f98e40375ccb28c4836f36b66706cf3c6ccce86eda", - "sha256:6a9343089f6377e71e20ca734cd8e7ac25d36478a9df580efabfe9059819bf82", - "sha256:6c9851bc4a23dc1d854d3f5dfd5f20a016f8da86bcdbb42687879bb5f86434b0", - "sha256:b8e85956af3fcf043d6f87c91cbe8705073fc67029ba6e22d3468bfee42c4823", - "sha256:b9a0af8fae490306bc112229000aa0c2ccc837b49d29a5c42e088c132a2334dd", - "sha256:bbf643528e2a55df2c1587008d6e3bda5c0445f1240dfa85129af22ae16d7a9a", - "sha256:c46ab3438bd21511db0f2c612d89d8344154c0c9494afc7fbc932de514cf8d15", - "sha256:f7a83d6bd805855ef83ec605eb01ab4fa42bcef254b13631e451cbb44914a9b0" - ], - "index": "pypi", - "version": "==0.701" - }, - "mypy-extensions": { - "hashes": [ - "sha256:37e0e956f41369209a3d5f34580150bcacfabaa57b33a15c0b25f4b5725e0812", - "sha256:b16cabe759f55e3409a7d231ebd2841378fb0c27a5d1994719e340e4f429ac3e" - ], - "index": "pypi", - "version": "==0.4.1" - }, - "packaging": { - "hashes": [ - "sha256:0c98a5d0be38ed775798ece1b9727178c4469d9c3b4ada66e8e6b7849f8732af", - "sha256:9e1cbf8c12b1f1ce0bb5344b8d7ecf66a6f8a6e91bcb0c84593ed6d3ab5c4ab3" - ], - "version": 
"==19.0" - }, - "parso": { - "hashes": [ - "sha256:17cc2d7a945eb42c3569d4564cdf49bde221bc2b552af3eca9c1aad517dcdd33", - "sha256:2e9574cb12e7112a87253e14e2c380ce312060269d04bd018478a3c92ea9a376" - ], - "version": "==0.4.0" - }, - "pexpect": { - "hashes": [ - "sha256:2094eefdfcf37a1fdbfb9aa090862c1a4878e5c7e0e7e7088bdb511c558e5cd1", - "sha256:9e2c1fd0e6ee3a49b28f95d4b33bc389c89b20af6a1255906e90ff1262ce62eb" - ], - "markers": "sys_platform != 'win32'", - "version": "==4.7.0" - }, - "pickleshare": { - "hashes": [ - "sha256:87683d47965c1da65cdacaf31c8441d12b8044cdec9aca500cd78fc2c683afca", - "sha256:9649af414d74d4df115d5d718f82acb59c9d418196b7b4290ed47a12ce62df56" - ], - "version": "==0.7.5" - }, - "pkginfo": { - "hashes": [ - "sha256:7424f2c8511c186cd5424bbf31045b77435b37a8d604990b79d4e70d741148bb", - "sha256:a6d9e40ca61ad3ebd0b72fbadd4fba16e4c0e4df0428c041e01e06eb6ee71f32" - ], - "version": "==1.5.0.1" - }, - "prompt-toolkit": { - "hashes": [ - "sha256:11adf3389a996a6d45cc277580d0d53e8a5afd281d0c9ec71b28e6f121463780", - "sha256:2519ad1d8038fd5fc8e770362237ad0364d16a7650fb5724af6997ed5515e3c1", - "sha256:977c6583ae813a37dc1c2e1b715892461fcbdaa57f6fc62f33a528c4886c8f55" - ], - "version": "==2.0.9" - }, - "ptyprocess": { - "hashes": [ - "sha256:923f299cc5ad920c68f2bc0bc98b75b9f838b93b599941a6b63ddbc2476394c0", - "sha256:d7cc528d76e76342423ca640335bd3633420dc1366f258cb31d05e865ef5ca1f" - ], - "version": "==0.6.0" - }, - "pycodestyle": { - "hashes": [ - "sha256:95a2219d12372f05704562a14ec30bc76b05a5b297b21a5dfe3f6fac3491ae56", - "sha256:e40a936c9a450ad81df37f549d676d127b1b66000a6c500caa2b085bc0ca976c" - ], - "version": "==2.5.0" - }, - "pyflakes": { - "hashes": [ - "sha256:17dbeb2e3f4d772725c777fabc446d5634d1038f234e77343108ce445ea69ce0", - "sha256:d976835886f8c5b31d47970ed689944a0262b5f3afa00a5a7b4dc81e5449f8a2" - ], - "version": "==2.1.1" - }, - "pygments": { - "hashes": [ - "sha256:5ffada19f6203563680669ee7f53b64dabbeb100eb51b61996085e99c03b284a", - 
"sha256:e8218dd399a61674745138520d0d4cf2621d7e032439341bc3f647bff125818d" - ], - "version": "==2.3.1" - }, - "pyparsing": { - "hashes": [ - "sha256:1873c03321fc118f4e9746baf201ff990ceb915f433f23b395f5580d1840cb2a", - "sha256:9b6323ef4ab914af344ba97510e966d64ba91055d6b9afa6b30799340e89cc03" - ], - "version": "==2.4.0" - }, - "pytz": { - "hashes": [ - "sha256:303879e36b721603cc54604edcac9d20401bdbe31e1e4fdee5b9f98d5d31dfda", - "sha256:d747dd3d23d77ef44c6a3526e274af6efeb0a6f1afd5a69ba4d5be4098c8e141" - ], - "version": "==2019.1" - }, - "readme-renderer": { - "hashes": [ - "sha256:bb16f55b259f27f75f640acf5e00cf897845a8b3e4731b5c1a436e4b8529202f", - "sha256:c8532b79afc0375a85f10433eca157d6b50f7d6990f337fa498c96cd4bfc203d" - ], - "version": "==24.0" - }, - "recommonmark": { - "hashes": [ - "sha256:a520b8d25071a51ae23a27cf6252f2fe387f51bdc913390d83b2b50617f5bb48", - "sha256:c85228b9b7aea7157662520e74b4e8791c5eacd375332ec68381b52bf10165be" - ], - "index": "pypi", - "version": "==0.5.0" - }, - "requests": { - "hashes": [ - "sha256:502a824f31acdacb3a35b6690b5fbf0bc41d63a24a45c4004352b0242707598e", - "sha256:7bf2a778576d825600030a110f3c0e3e8edc51dfaafe1c146e39a2027784957b" - ], - "version": "==2.21.0" - }, - "requests-toolbelt": { - "hashes": [ - "sha256:380606e1d10dc85c3bd47bf5a6095f815ec007be7a8b69c878507068df059e6f", - "sha256:968089d4584ad4ad7c171454f0a5c6dac23971e9472521ea3b6d49d610aa6fc0" - ], - "version": "==0.9.1" - }, - "six": { - "hashes": [ - "sha256:3350809f0555b11f552448330d0b52d5f24c91a322ea4a15ef22629740f3761c", - "sha256:d16a0141ec1a18405cd4ce8b4613101da75da0e9a7aec5bdd4fa804d0e0eba73" - ], - "version": "==1.12.0" - }, - "snowballstemmer": { - "hashes": [ - "sha256:919f26a68b2c17a7634da993d91339e288964f93c274f1343e3bbbe2096e1128", - "sha256:9f3bcd3c401c3e862ec0ebe6d2c069ebc012ce142cce209c098ccb5b09136e89" - ], - "version": "==1.2.1" - }, - "sphinx": { - "hashes": [ - "sha256:423280646fb37944dd3c85c58fb92a20d745793a9f6c511f59da82fa97cd404b", - 
"sha256:de930f42600a4fef993587633984cc5027dedba2464bcf00ddace26b40f8d9ce" - ], - "index": "pypi", - "version": "==2.0.1" - }, - "sphinx-rtd-theme": { - "hashes": [ - "sha256:00cf895504a7895ee433807c62094cf1e95f065843bf3acd17037c3e9a2becd4", - "sha256:728607e34d60456d736cc7991fd236afb828b21b82f956c5ea75f94c8414040a" - ], - "index": "pypi", - "version": "==0.4.3" - }, - "sphinxcontrib-applehelp": { - "hashes": [ - "sha256:edaa0ab2b2bc74403149cb0209d6775c96de797dfd5b5e2a71981309efab3897", - "sha256:fb8dee85af95e5c30c91f10e7eb3c8967308518e0f7488a2828ef7bc191d0d5d" - ], - "version": "==1.0.1" - }, - "sphinxcontrib-devhelp": { - "hashes": [ - "sha256:6c64b077937330a9128a4da74586e8c2130262f014689b4b89e2d08ee7294a34", - "sha256:9512ecb00a2b0821a146736b39f7aeb90759834b07e81e8cc23a9c70bacb9981" - ], - "version": "==1.0.1" - }, - "sphinxcontrib-htmlhelp": { - "hashes": [ - "sha256:4670f99f8951bd78cd4ad2ab962f798f5618b17675c35c5ac3b2132a14ea8422", - "sha256:d4fd39a65a625c9df86d7fa8a2d9f3cd8299a3a4b15db63b50aac9e161d8eff7" - ], - "version": "==1.0.2" - }, - "sphinxcontrib-jsmath": { - "hashes": [ - "sha256:2ec2eaebfb78f3f2078e73666b1415417a116cc848b72e5172e596c871103178", - "sha256:a9925e4a4587247ed2191a22df5f6970656cb8ca2bd6284309578f2153e0c4b8" - ], - "version": "==1.0.1" - }, - "sphinxcontrib-qthelp": { - "hashes": [ - "sha256:513049b93031beb1f57d4daea74068a4feb77aa5630f856fcff2e50de14e9a20", - "sha256:79465ce11ae5694ff165becda529a600c754f4bc459778778c7017374d4d406f" - ], - "version": "==1.0.2" - }, - "sphinxcontrib-serializinghtml": { - "hashes": [ - "sha256:c0efb33f8052c04fd7a26c0a07f1678e8512e0faec19f4aa8f2473a8b81d5227", - "sha256:db6615af393650bf1151a6cd39120c29abaf93cc60db8c48eb2dddbfdc3a9768" - ], - "version": "==1.1.3" - }, - "tqdm": { - "hashes": [ - "sha256:d385c95361699e5cf7622485d9b9eae2d4864b21cd5a2374a9c381ffed701021", - "sha256:e22977e3ebe961f72362f6ddfb9197cc531c9737aaf5f607ef09740c849ecd05" - ], - "version": "==4.31.1" - }, - "traitlets": { - "hashes": [ - 
"sha256:9c4bd2d267b7153df9152698efb1050a5d84982d3384a37b2c1f7723ba3e7835", - "sha256:c6cb5e6f57c5a9bdaa40fa71ce7b4af30298fbab9ece9815b5d995ab6217c7d9" - ], - "version": "==4.3.2" - }, - "twine": { - "hashes": [ - "sha256:0fb0bfa3df4f62076cab5def36b1a71a2e4acb4d1fa5c97475b048117b1a6446", - "sha256:d6c29c933ecfc74e9b1d9fa13aa1f87c5d5770e119f5a4ce032092f0ff5b14dc" - ], - "index": "pypi", - "version": "==1.13.0" - }, - "typed-ast": { - "hashes": [ - "sha256:132eae51d6ef3ff4a8c47c393a4ef5ebf0d1aecc96880eb5d6c8ceab7017cc9b", - "sha256:18141c1484ab8784006c839be8b985cfc82a2e9725837b0ecfa0203f71c4e39d", - "sha256:2baf617f5bbbfe73fd8846463f5aeafc912b5ee247f410700245d68525ec584a", - "sha256:3d90063f2cbbe39177e9b4d888e45777012652d6110156845b828908c51ae462", - "sha256:4304b2218b842d610aa1a1d87e1dc9559597969acc62ce717ee4dfeaa44d7eee", - "sha256:4983ede548ffc3541bae49a82675996497348e55bafd1554dc4e4a5d6eda541a", - "sha256:5315f4509c1476718a4825f45a203b82d7fdf2a6f5f0c8f166435975b1c9f7d4", - "sha256:6cdfb1b49d5345f7c2b90d638822d16ba62dc82f7616e9b4caa10b72f3f16649", - "sha256:7b325f12635598c604690efd7a0197d0b94b7d7778498e76e0710cd582fd1c7a", - "sha256:8d3b0e3b8626615826f9a626548057c5275a9733512b137984a68ba1598d3d2f", - "sha256:8f8631160c79f53081bd23446525db0bc4c5616f78d04021e6e434b286493fd7", - "sha256:912de10965f3dc89da23936f1cc4ed60764f712e5fa603a09dd904f88c996760", - "sha256:b010c07b975fe853c65d7bbe9d4ac62f1c69086750a574f6292597763781ba18", - "sha256:c908c10505904c48081a5415a1e295d8403e353e0c14c42b6d67f8f97fae6616", - "sha256:c94dd3807c0c0610f7c76f078119f4ea48235a953512752b9175f9f98f5ae2bd", - "sha256:ce65dee7594a84c466e79d7fb7d3303e7295d16a83c22c7c4037071b059e2c21", - "sha256:eaa9cfcb221a8a4c2889be6f93da141ac777eb8819f077e1d09fb12d00a09a93", - "sha256:f3376bc31bad66d46d44b4e6522c5c21976bf9bca4ef5987bb2bf727f4506cbb", - "sha256:f9202fa138544e13a4ec1a6792c35834250a85958fde1251b6a22e07d1260ae7" - ], - "version": "==1.3.5" - }, - "typing-extensions": { - "hashes": [ - 
"sha256:07b2c978670896022a43c4b915df8958bec4a6b84add7f2c87b2b728bda3ba64", - "sha256:f3f0e67e1d42de47b5c67c32c9b26641642e9170fe7e292991793705cd5fef7c", - "sha256:fb2cd053238d33a8ec939190f30cfd736c00653a85a2919415cecf7dc3d9da71" - ], - "version": "==3.7.2" - }, - "urllib3": { - "hashes": [ - "sha256:2393a695cd12afedd0dcb26fe5d50d0cf248e5a66f75dbd89a3d4eb333a61af4", - "sha256:a637e5fae88995b256e3409dc4d52c2e2e0ba32c42a6365fee8bbd2238de3cfb" - ], - "version": "==1.24.3" - }, - "wcwidth": { - "hashes": [ - "sha256:3df37372226d6e63e1b1e1eda15c594bca98a22d33a23832a90998faa96bc65e", - "sha256:f4ebe71925af7b40a864553f761ed559b43544f8f71746c2d756c7fe788ade7c" - ], - "version": "==0.1.7" - }, - "webencodings": { - "hashes": [ - "sha256:a0af1213f3c2226497a97e2b3aa01a7e4bee4f403f95be16fc9acd2947514a78", - "sha256:b36a1c245f2d304965eb4e0a82848379241dc04b865afcc4aab16748587e1923" - ], - "version": "==0.5.1" - } - } -} diff --git a/README.md b/README.md index 80fe591d..6ce54b97 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@

ArchiveBox
The open-source self-hosted web archive.

▶️ Quickstart | -Demo | +Demo | Github | Documentation | Info & Motivation | @@ -14,7 +14,7 @@ "Your own personal internet archive" (网站存档 / 爬虫) - + @@ -23,11 +23,20 @@
+
+💥 Attention: Big API changes are coming soon (including a proper config file format and pip install archivebox)! Check out v0.4 and help us test it! 💥 +

+Note: There are some important security design issues that need to be fixed before v0.4 can be pushed; all help is appreciated!
+(This project is not abandoned, it's my primary side-project for the foreseeable future; my day job is very busy right now.)
+See the v0.4 release PR for more information.
+
+
+ **ArchiveBox takes a list of website URLs you want to archive, and creates a local, static, browsable HTML clone of the content from those websites (it saves HTML, JS, media files, PDFs, images and more).** -You can use it to preserve access to websites you care about by storing them locally offline. ArchiveBox imports lists of URLs, renders the pages in a headless, autheticated, user-scriptable browser, and then archives the content in multiple redundant common formats (HTML, PDF, PNG, WARC) that will last long after the originals disappear off the internet. It automatically extracts assets and media from pages and saves them in easily-accessible folders, with out-of-the-box support for extracting git repositories, audio, video, subtitles, images, PDFs, and more. +You can use it to preserve access to websites you care about by storing them locally offline. ArchiveBox imports lists of URLs, renders the pages in a headless, authenticated, user-scriptable browser, and then archives the content in multiple redundant common formats (HTML, PDF, PNG, WARC) that will last long after the originals disappear off the internet. It automatically extracts assets and media from pages and saves them in easily-accessible folders, with out-of-the-box support for extracting git repositories, audio, video, subtitles, images, PDFs, and more. #### How does it work? @@ -36,7 +45,7 @@ echo 'http://example.com' | ./archive ``` After installing the dependencies, just pipe some new links into the `./archive` command to start your archive. -ArchiveBox is written in Python 3.5 and uses wget, Chrome headless, youtube-dl, pywb, and other common unix tools to save each page you add in multiple redundant formats. It doesn't require a constantly running server or backend, just open the generated `output/index.html` in a browser to view the archive. It can import and export links as JSON (among other formats), so it's easy to script or hook up to other APIs. 
If you run it on a schedule and import from browser history or bookmarks regularly, you can sleep soundly knowing that the slice of the internet you care about will be automatically preserved in multiple, durable long-term formats that will be accessible for decades (or longer). +ArchiveBox is written in Python 3.7 and uses wget, Chrome headless, youtube-dl, pywb, and other common UNIX tools to save each page you add in multiple redundant formats. It doesn't require a constantly running server or backend, just open the generated `output/index.html` in a browser to view the archive. It can import and export links as JSON (among other formats), so it's easy to script or hook up to other APIs. If you run it on a schedule and import from browser history or bookmarks regularly, you can sleep soundly knowing that the slice of the internet you care about will be automatically preserved in multiple, durable long-term formats that will be accessible for decades (or longer).
@@ -50,7 +59,7 @@ ArchiveBox is written in Python 3.5 and uses wget, Chrome headless, youtube-dl, ## Quickstart -ArchiveBox has [3 main dependencies](https://github.com/pirate/ArchiveBox/wiki/Install#dependencies) beyond `python3`: `wget`, `chromium`, and `youtube-dl`. +ArchiveBox is written in `python3.7` and has [3 main binary dependencies](https://github.com/pirate/ArchiveBox/wiki/Install#dependencies): `wget`, `chromium`, and `youtube-dl`. To get started, you can [install them manually](https://github.com/pirate/ArchiveBox/wiki/Install) using your system's package manager, use the [automated helper script](https://github.com/pirate/ArchiveBox/wiki/Quickstart), or use the official [Docker](https://github.com/pirate/ArchiveBox/wiki/Docker) container. All three dependencies are optional if [disabled](https://github.com/pirate/ArchiveBox/wiki/Configuration#archive-method-toggles) in settings. ```bash @@ -66,7 +75,7 @@ echo 'https://example.com' | ./archive # pass URLs to archive v ./archive https://getpocket.com/users/example/feed/all # or import an RSS/JSON/XML/TXT feed ``` -One you've added your first links, open `output/index.html` in a browser to view the archive. [DEMO: archive.sweeting.me](https://archive.sweeting.me) +Once you've added your first links, open `output/index.html` in a browser to view the archive. [DEMO: archivebox.zervice.io/](https://archivebox.zervice.io) For more information, see the [full Quickstart guide](https://github.com/pirate/ArchiveBox/wiki/Quickstart), [Usage](https://github.com/pirate/ArchiveBox/wiki/Usage), and [Configuration](https://github.com/pirate/ArchiveBox/wiki/Configuration) docs. 
*(`pip install archivebox` will be available in the near future, follow our [Roadmap](https://github.com/pirate/ArchiveBox/wiki/Roadmap) for progress)* @@ -135,7 +144,7 @@ If you're importing URLs with secret tokens in them (e.g Google Docs, CodiMD not - **Doesn't require a constantly-running server**, proxy, or native app - Easy to set up **[scheduled importing](https://github.com/pirate/ArchiveBox/wiki/Scheduled-Archiving) from multiple sources** - Uses common, **durable, [long-term formats](#saves-lots-of-useful-stuff-for-each-imported-link)** like HTML, JSON, PDF, PNG, and WARC - - **Suitable for paywalled / [authenticated content](https://github.com/pirate/ArchiveBox/wiki/Configuration#chrome_user_data_dir)** (can use your cookies) + - ~~**Suitable for paywalled / [authenticated content](https://github.com/pirate/ArchiveBox/wiki/Configuration#chrome_user_data_dir)** (can use your cookies)~~ (do not do this until v0.4 is released with some security fixes) - Can [**run scripts during archiving**](https://github.com/pirate/ArchiveBox/issues/51) to [scroll pages](https://github.com/pirate/ArchiveBox/issues/80), [close modals](https://github.com/pirate/ArchiveBox/issues/175), expand comment threads, etc. - Can also [**mirror content to 3rd-party archiving services**](https://github.com/pirate/ArchiveBox/wiki/Configuration#submit_archive_dot_org) automatically for redundancy @@ -164,13 +173,13 @@ I don't think everything should be preserved in an automated fashion, making all #### User Interface & Intended Purpose -ArchiveBox differentiates itself from [similar projects](https://github.com/pirate/ArchiveBox/wiki/Web-Archiving-Community#Web-Archiving-Projects) by being a simple, one-shot CLI inferface for users to ingest built feeds of URLs over extended periods, as opposed to being a backend service that ingests individual, manually-submitted URLs from a web UI. 
+ArchiveBox differentiates itself from [similar projects](https://github.com/pirate/ArchiveBox/wiki/Web-Archiving-Community#Web-Archiving-Projects) by being a simple, one-shot CLI interface for users to ingest bulk feeds of URLs over extended periods, as opposed to being a backend service that ingests individual, manually-submitted URLs from a web UI. -An alternative tool [pywb](https://github.com/webrecorder/pywb) allows you to run a browser through an always-running archiving proxy which records the traffic to WARC files. ArchiveBox intends to support this style of live proxy-archiving using `pywb` in the future, but for now it only ingests lists of links at a time via browser history, bookmarks, RSS, etc. +An alternative tool [pywb](https://github.com/webrecorder/pywb) allows you to run a browser through an always-running archiving proxy which records the traffic to WARC files. ArchiveBox intends to support this style of live proxy-archiving using `pywb` in the future, but for now, it only ingests lists of links at a time via browser history, bookmarks, RSS, etc. #### Private Local Archives vs Centralized Public Archives -Unlike crawler software that starts from a seed URL and works outwards, or public tools like Archive.org designed for users to manually submit links from the public internet, ArchiveBox tries to be a set-and-forget archiver suitable for archiving your entire browsing history, RSS feeds, or bookmarks, including private/authenticated content that you wouldn't otherwise share with a centralized service. Also by having each user store their own content locally, we can save much larger portions of everyone's browsing history than a shared centralized service would be able to handle. 
+Unlike crawler software that starts from a seed URL and works outwards, or public tools like Archive.org designed for users to manually submit links from the public internet, ArchiveBox tries to be a set-and-forget archiver suitable for archiving your entire browsing history, RSS feeds, or bookmarks, ~~including private/authenticated content that you wouldn't otherwise share with a centralized service~~ (do not do this until v0.4 is released with some security fixes). Also by having each user store their own content locally, we can save much larger portions of everyone's browsing history than a shared centralized service would be able to handle. #### Storage Requirements @@ -178,9 +187,9 @@ Because ArchiveBox is designed to ingest a firehose of browser history and bookm ## Learn more -▶ **Join out our [community chat](http://webchat.freenode.net?channels=ArchiveBox&uio=d4) hosted on IRC freenode.net:`#ArchiveBox`!** + -Whether you want learn which organizations are the big players in the web archiving space, want to find a specific open source tool for your web archiving need, or just want to see where archivists hang out online, our Community Wiki page serves as an index of the broader web archiving community. Check it out to learn about some of the coolest web archiving projects and communities on the web! +Whether you want to learn which organizations are the big players in the web archiving space, want to find a specific open-source tool for your web archiving need, or just want to see where archivists hang out online, our Community Wiki page serves as an index of the broader web archiving community. Check it out to learn about some of the coolest web archiving projects and communities on the web! @@ -278,7 +287,7 @@ Contributor Spotlight:


- +

diff --git a/archivebox/VERSION b/archivebox/VERSION index 2b7c5ae0..17b2ccd9 100644 --- a/archivebox/VERSION +++ b/archivebox/VERSION @@ -1 +1 @@ -0.4.2 +0.4.3 diff --git a/archivebox/__init__.py b/archivebox/__init__.py index 485a340d..56b6f16e 100644 --- a/archivebox/__init__.py +++ b/archivebox/__init__.py @@ -3,4 +3,5 @@ __package__ = 'archivebox' from . import core from . import cli +# The main CLI source code, is in 'archivebox/main.py' from .main import * diff --git a/archivebox/config/__init__.py b/archivebox/config/__init__.py index f54f33ef..921c258a 100644 --- a/archivebox/config/__init__.py +++ b/archivebox/config/__init__.py @@ -44,6 +44,7 @@ CONFIG_DEFAULTS: Dict[str, ConfigDefaultDict] = { 'TIMEOUT': {'type': int, 'default': 60}, 'MEDIA_TIMEOUT': {'type': int, 'default': 3600}, 'OUTPUT_PERMISSIONS': {'type': str, 'default': '755'}, + 'RESTRICT_FILE_NAMES': {'type': str, 'default': 'windows'}, 'URL_BLACKLIST': {'type': str, 'default': None}, }, @@ -77,6 +78,7 @@ CONFIG_DEFAULTS: Dict[str, ConfigDefaultDict] = { 'GIT_DOMAINS': {'type': str, 'default': 'github.com,bitbucket.org,gitlab.com'}, 'CHECK_SSL_VALIDITY': {'type': bool, 'default': True}, + 'CURL_USER_AGENT': {'type': str, 'default': 'ArchiveBox/{VERSION} (+https://github.com/pirate/ArchiveBox/) curl/{CURL_VERSION}'} 'WGET_USER_AGENT': {'type': str, 'default': 'ArchiveBox/{VERSION} (+https://github.com/pirate/ArchiveBox/) wget/{WGET_VERSION}'}, 'CHROME_USER_AGENT': {'type': str, 'default': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36'}, @@ -85,6 +87,7 @@ CONFIG_DEFAULTS: Dict[str, ConfigDefaultDict] = { 'CHROME_HEADLESS': {'type': bool, 'default': True}, 'CHROME_SANDBOX': {'type': bool, 'default': True}, + }, 'DEPENDENCY_CONFIG': { @@ -130,7 +133,7 @@ DEFAULT_CLI_COLORS = { ANSI = {k: '' for k in DEFAULT_CLI_COLORS.keys()} STATICFILE_EXTENSIONS = { - # 99.999% of the time, URLs ending in these extentions are static files + # 
99.999% of the time, URLs ending in these extensions are static files # that can be downloaded as-is, not html pages that need to be rendered 'gif', 'jpeg', 'jpg', 'png', 'tif', 'tiff', 'wbmp', 'ico', 'jng', 'bmp', 'svg', 'svgz', 'webp', 'ps', 'eps', 'ai', @@ -147,7 +150,7 @@ STATICFILE_EXTENSIONS = { # pl pm, prc pdb, rar, rpm, sea, sit, tcl tk, der, pem, crt, xpi, xspf, # ra, mng, asx, asf, 3gpp, 3gp, mid, midi, kar, jad, wml, htc, mml - # Thse are always treated as pages, not as static files, never add them: + # These are always treated as pages, not as static files, never add them: # html, htm, shtml, xhtml, xml, aspx, php, cgi } @@ -210,8 +213,9 @@ DERIVED_CONFIG_DEFAULTS: ConfigDefaultDict = { 'DJANGO_BINARY': {'default': lambda c: django.__file__.replace('__init__.py', 'bin/django-admin.py')}, 'DJANGO_VERSION': {'default': lambda c: '{}.{}.{} {} ({})'.format(*django.VERSION)}, - 'USE_CURL': {'default': lambda c: c['USE_CURL'] and (c['SAVE_FAVICON'] or c['SAVE_ARCHIVE_DOT_ORG'])}, + 'USE_CURL': {'default': lambda c: c['USE_CURL'] and (c['SAVE_FAVICON'] or c['FETCH_TITLE'] or c['SAVE_ARCHIVE_DOT_ORG'])}, 'CURL_VERSION': {'default': lambda c: bin_version(c['CURL_BINARY']) if c['USE_CURL'] else None}, + 'CURL_USER_AGENT': {'default': lambda c: c['CURL_USER_AGENT'].format(**c)}, 'SAVE_FAVICON': {'default': lambda c: c['USE_CURL'] and c['SAVE_FAVICON']}, 'SAVE_ARCHIVE_DOT_ORG': {'default': lambda c: c['USE_CURL'] and c['SAVE_ARCHIVE_DOT_ORG']}, @@ -480,6 +484,7 @@ def find_chrome_binary() -> Optional[str]: 'chromium-browser', 'chromium', '/Applications/Chromium.app/Contents/MacOS/Chromium', + 'chrome', 'google-chrome', '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome', 'google-chrome-stable', @@ -506,6 +511,7 @@ def find_chrome_data_dir() -> Optional[str]: '~/.config/chromium', '~/Library/Application Support/Chromium', '~/AppData/Local/Chromium/User Data', + '~/.config/chrome', '~/.config/google-chrome', '~/Library/Application Support/Google/Chrome', 
'~/AppData/Local/Google/Chrome/User Data', diff --git a/archivebox/extractors/favicon.py b/archivebox/extractors/favicon.py index 6f68fccf..40433a69 100644 --- a/archivebox/extractors/favicon.py +++ b/archivebox/extractors/favicon.py @@ -13,6 +13,7 @@ from ..config import ( CURL_BINARY, CURL_VERSION, CHECK_SSL_VALIDITY, + CURL_USER_AGENT, ) from ..cli.logging import TimedProgress @@ -37,14 +38,16 @@ def save_favicon(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) '--max-time', str(timeout), '--location', '--output', str(output), + *(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else [], *([] if CHECK_SSL_VALIDITY else ['--insecure']), 'https://www.google.com/s2/favicons?domain={}'.format(domain(link.url)), ] - status = 'succeeded' + status = 'pending' timer = TimedProgress(timeout, prefix=' ') try: run(cmd, stdout=PIPE, stderr=PIPE, cwd=out_dir, timeout=timeout) chmod_file(output, cwd=out_dir) + status = 'succeeded' except Exception as err: status = 'failed' output = err diff --git a/archivebox/extractors/wget.py b/archivebox/extractors/wget.py index 782d6d31..4f6d7000 100644 --- a/archivebox/extractors/wget.py +++ b/archivebox/extractors/wget.py @@ -24,6 +24,7 @@ from ..config import ( SAVE_WARC, WGET_BINARY, WGET_VERSION, + RESTRICT_FILE_NAMES, CHECK_SSL_VALIDITY, SAVE_WGET_REQUISITES, WGET_AUTO_COMPRESSION, @@ -66,14 +67,14 @@ def save_wget(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> '--span-hosts', '--no-parent', '-e', 'robots=off', - '--restrict-file-names=windows', '--timeout={}'.format(timeout), - *([] if SAVE_WARC else ['--timestamping']), + *(['--restrict-file-names={}'.format(RESTRICT_FILE_NAMES)] if RESTRICT_FILE_NAMES else []), *(['--warc-file={}'.format(warc_path)] if SAVE_WARC else []), *(['--page-requisites'] if SAVE_WGET_REQUISITES else []), *(['--user-agent={}'.format(WGET_USER_AGENT)] if WGET_USER_AGENT else []), *(['--load-cookies', COOKIES_FILE] if COOKIES_FILE else []), 
*(['--compression=auto'] if WGET_AUTO_COMPRESSION else []), + *([] if SAVE_WARC else ['--timestamping']), *([] if CHECK_SSL_VALIDITY else ['--no-check-certificate', '--no-hsts']), link.url, ] diff --git a/archivebox/index/__init__.py b/archivebox/index/__init__.py index 25f4f3a3..4ac4c4af 100644 --- a/archivebox/index/__init__.py +++ b/archivebox/index/__init__.py @@ -325,7 +325,8 @@ def patch_main_index(link: Link, out_dir: str=OUTPUT_DIR) -> None: # Patch HTML main index html_path = os.path.join(out_dir, 'index.html') with open(html_path, 'r') as f: - html = f.read().split('\n') + html = f.read().splitlines() + for idx, line in enumerate(html): if title and (' None: html[idx] = '{}'.format(successful) break - atomic_write('\n'.join(html), html_path) + atomic_write(html_path, '\n'.join(html)) ### Link Details Index diff --git a/archivebox/index/html.py b/archivebox/index/html.py index ea890276..60d41049 100644 --- a/archivebox/index/html.py +++ b/archivebox/index/html.py @@ -41,7 +41,7 @@ TITLE_LOADING_MSG = 'Not yet archived...' 
def parse_html_main_index(out_dir: str=OUTPUT_DIR) -> Iterator[str]: """parse an archive index html file and return the list of urls""" - index_path = os.path.join(out_dir, HTML_INDEX_FILENAME) + index_path = join(out_dir, HTML_INDEX_FILENAME) if os.path.exists(index_path): with open(index_path, 'r', encoding='utf-8') as f: for line in f: @@ -58,7 +58,7 @@ def write_html_main_index(links: List[Link], out_dir: str=OUTPUT_DIR, finished: copy_and_overwrite(join(TEMPLATES_DIR, STATIC_DIR_NAME), join(out_dir, STATIC_DIR_NAME)) rendered_html = main_index_template(links, finished=finished) - atomic_write(rendered_html, join(out_dir, HTML_INDEX_FILENAME)) + atomic_write(join(out_dir, HTML_INDEX_FILENAME), rendered_html) @enforce_types @@ -116,7 +116,7 @@ def write_html_link_details(link: Link, out_dir: Optional[str]=None) -> None: out_dir = out_dir or link.link_dir rendered_html = link_details_template(link) - atomic_write(rendered_html, join(out_dir, HTML_INDEX_FILENAME)) + atomic_write(join(out_dir, HTML_INDEX_FILENAME), rendered_html) @enforce_types diff --git a/archivebox/index/json.py b/archivebox/index/json.py index a11dba5d..deca4bea 100644 --- a/archivebox/index/json.py +++ b/archivebox/index/json.py @@ -74,7 +74,7 @@ def write_json_main_index(links: List[Link], out_dir: str=OUTPUT_DIR) -> None: 'last_run_cmd': sys.argv, 'links': links, } - atomic_write(main_index_json, os.path.join(out_dir, JSON_INDEX_FILENAME)) + atomic_write(os.path.join(out_dir, JSON_INDEX_FILENAME), main_index_json) ### Link Details Index @@ -86,7 +86,7 @@ def write_json_link_details(link: Link, out_dir: Optional[str]=None) -> None: out_dir = out_dir or link.link_dir path = os.path.join(out_dir, JSON_INDEX_FILENAME) - atomic_write(link._asdict(extended=True), path) + atomic_write(path, link._asdict(extended=True)) @enforce_types diff --git a/archivebox/parsers/__init__.py b/archivebox/parsers/__init__.py index 4de3f8ab..479d4e2c 100644 --- a/archivebox/parsers/__init__.py +++ 
b/archivebox/parsers/__init__.py @@ -13,7 +13,6 @@ import os from typing import Tuple, List from datetime import datetime -from ..index.schema import Link from ..system import atomic_write from ..config import ( ANSI, @@ -29,6 +28,7 @@ from ..util import ( enforce_types, URL_REGEX, ) +from ..index.schema import Link from ..cli.logging import pretty_path, TimedProgress from .pocket_html import parse_pocket_html_export from .pinboard_rss import parse_pinboard_rss_export @@ -93,8 +93,7 @@ def save_stdin_to_sources(raw_text: str, out_dir: str=OUTPUT_DIR) -> str: ts = str(datetime.now().timestamp()).split('.', 1)[0] source_path = os.path.join(sources_dir, '{}-{}.txt'.format('stdin', ts)) - - atomic_write(raw_text, source_path) + atomic_write(source_path, raw_text) return source_path @@ -112,6 +111,7 @@ def save_file_to_sources(path: str, timeout: int=TIMEOUT, out_dir: str=OUTPUT_DI source_path = os.path.join(sources_dir, '{}-{}.txt'.format(basename(path), ts)) if any(path.startswith(s) for s in ('http://', 'https://', 'ftp://')): + # Source is a URL that needs to be downloaded source_path = os.path.join(sources_dir, '{}-{}.txt'.format(domain(path), ts)) print('{}[*] [{}] Downloading {}{}'.format( ANSI['green'], @@ -134,10 +134,11 @@ def save_file_to_sources(path: str, timeout: int=TIMEOUT, out_dir: str=OUTPUT_DI raise SystemExit(1) else: + # Source is a path to a local file on the filesystem with open(path, 'r') as f: raw_source_text = f.read() - atomic_write(raw_source_text, source_path) + atomic_write(source_path, raw_source_text) print(' > {}'.format(pretty_path(source_path))) diff --git a/archivebox/system.py b/archivebox/system.py index b6063ac2..4200ec9b 100644 --- a/archivebox/system.py +++ b/archivebox/system.py @@ -8,6 +8,7 @@ import json as pyjson from typing import Optional, Union, Set, Tuple from crontab import CronTab +from atomicwrites import atomic_write as awrite from subprocess import ( Popen, @@ -22,10 +23,10 @@ from .util import enforce_types, 
ExtendedEncoder from .config import OUTPUT_PERMISSIONS + def run(*popenargs, input=None, capture_output=False, timeout=None, check=False, **kwargs): """Patched of subprocess.run to fix blocking io making timeout=innefective""" - if input is not None: if 'stdin' in kwargs: raise ValueError('stdin and input arguments may not both be used.') @@ -59,30 +60,14 @@ def run(*popenargs, input=None, capture_output=False, timeout=None, check=False, return CompletedProcess(process.args, retcode, stdout, stderr) -def atomic_write(contents: Union[dict, str, bytes], path: str) -> None: +def atomic_write(path: str, contents: Union[dict, str, bytes], overwrite: bool=True) -> None: """Safe atomic write to filesystem by writing to temp file + atomic rename""" - try: - tmp_file = '{}.tmp'.format(path) - - if isinstance(contents, bytes): - args = {'mode': 'wb+'} + + with awrite(path, overwrite=overwrite) as f: + if isinstance(contents, dict): + pyjson.dump(contents, f, indent=4, sort_keys=True, cls=ExtendedEncoder) else: - args = {'mode': 'w+', 'encoding': 'utf-8'} - - with open(tmp_file, **args) as f: - if isinstance(contents, dict): - pyjson.dump(contents, f, indent=4, sort_keys=True, cls=ExtendedEncoder) - else: - f.write(contents) - - os.fsync(f.fileno()) - - os.rename(tmp_file, path) - chmod_file(path) - finally: - if os.path.exists(tmp_file): - os.remove(tmp_file) - + f.write(contents) @enforce_types def chmod_file(path: str, cwd: str='.', permissions: str=OUTPUT_PERMISSIONS, timeout: int=30) -> None: @@ -105,7 +90,8 @@ def copy_and_overwrite(from_path: str, to_path: str): shutil.copytree(from_path, to_path) else: with open(from_path, 'rb') as src: - atomic_write(src.read(), to_path) + contents = src.read() + atomic_write(to_path, contents) @enforce_types diff --git a/archivebox/themes/default/main_index.html b/archivebox/themes/default/main_index.html index 24ee8906..e587ff75 100644 --- a/archivebox/themes/default/main_index.html +++ b/archivebox/themes/default/main_index.html 
@@ -6,6 +6,37 @@ Archived Sites