From 90ef5e14b5b690e655e41e935ae53008fa22f35b Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Tue, 9 Feb 2021 12:36:30 -0500 Subject: [PATCH 001/137] fix dangling small and update apt sources instructions --- README.md | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index 76b51be3..86c0217b 100644 --- a/README.md +++ b/README.md @@ -152,7 +152,7 @@ docker run -v $PWD:/data -it archivebox/archivebox help # to see more options
Get ArchiveBox with apt on Ubuntu >=20.04 -First make sure you're on Ubuntu >= 20.04, or scroll down for older/non-Ubuntu instructions. +First make sure you're on Ubuntu >= 20.04, or scroll down for older/non-Ubuntu system instructions.

 # add the repo to your sources and install the archivebox package using apt
@@ -181,12 +181,15 @@ archivebox help  # to see more options
 For other Debian-based systems or older Ubuntu systems you can add these sources to `/etc/apt/sources.list`:
 
 

-deb http://ppa.launchpad.net/archivebox/archivebox/ubuntu focal main
-deb-src http://ppa.launchpad.net/archivebox/archivebox/ubuntu focal main
+echo "deb http://ppa.launchpad.net/archivebox/archivebox/ubuntu focal main" > /etc/apt/sources.list.d/archivebox.list
+echo "deb-src http://ppa.launchpad.net/archivebox/archivebox/ubuntu focal main" >> /etc/apt/sources.list.d/archivebox.list
+sudo apt-key adv --keyserver keyserver.ubuntu.com --recv-keys C258F79DCC02E369
+sudo apt update
+sudo apt install archivebox
+archivebox --version
+# then continue the instructions above
 
-Then run `apt update; apt install archivebox; archivebox --version`. - (you may need to install some other dependencies manually however)
@@ -252,13 +255,11 @@ archivebox help # to see more options No matter which install method you choose, they all roughly follow this 3-step process and all provide the same CLI, Web UI, and on-disk data format. - - -1. Install ArchiveBox: `apt/brew/pip3 install archivebox` -2. Start a collection: `archivebox init` -3. Start archiving: `archivebox add 'https://example.com'` - - +
    +
  1. Install ArchiveBox: apt/brew/pip3 install archivebox
  2. +
  3. Start a collection: archivebox init
  4. +
  5. Start archiving: archivebox add 'https://example.com'
  6. +

From 7b1b1a237496a81225f6ff7796d3c2d09292cf26 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Tue, 9 Feb 2021 12:45:24 -0500 Subject: [PATCH 002/137] make arm support for apt explicit --- README.md | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 86c0217b..8a38d6fb 100644 --- a/README.md +++ b/README.md @@ -150,16 +150,21 @@ docker run -v $PWD:/data -it archivebox/archivebox help # to see more options
-Get ArchiveBox with apt on Ubuntu >=20.04 +Get ArchiveBox with apt on Ubuntu/Debian -First make sure you're on Ubuntu >= 20.04, or scroll down for older/non-Ubuntu system instructions. +This method should work on all Ubuntu/Debian based systems, including x86, amd64, arm7, and arm8 CPUs (e.g. Raspberry Pis >=3). + +If you're on Ubuntu >= 20.04, add the `apt` repository like so:

 # add the repo to your sources and install the archivebox package using apt
 sudo apt install software-properties-common
 sudo add-apt-repository -u ppa:archivebox/archivebox
 sudo apt install archivebox
+
+(♰ otherwise follow the instructions below for other systems) +

 # create a new empty directory and initalize your collection (can be anywhere)
 mkdir ~/archivebox && cd ~/archivebox
 npm install --prefix . 'git+https://github.com/ArchiveBox/ArchiveBox.git'
@@ -178,7 +183,7 @@ archivebox list --json --with-headers > index.json
 archivebox help  # to see more options
 
-For other Debian-based systems or older Ubuntu systems you can add these sources to `/etc/apt/sources.list`: +♰ For other Debian-based systems or older Ubuntu systems you can add these sources to `/etc/apt/sources.list`:

 echo "deb http://ppa.launchpad.net/archivebox/archivebox/ubuntu focal main" > /etc/apt/sources.list.d/archivebox.list

From 9ac1f8c5a1bd05b2dcb7fd4acc45848fe9183e17 Mon Sep 17 00:00:00 2001
From: Nick Sweeting 
Date: Tue, 9 Feb 2021 21:38:25 -0500
Subject: [PATCH 003/137] Update README.md

---
 README.md | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index 8a38d6fb..a4da114e 100644
--- a/README.md
+++ b/README.md
@@ -154,15 +154,15 @@ docker run -v $PWD:/data -it archivebox/archivebox help  # to see more options
 
 This method should work on all Ubuntu/Debian based systems, including x86, amd64, arm7, and arm8 CPUs (e.g. Raspberry Pis >=3).
 
-If you're on Ubuntu >= 20.04, add the `apt` repository like so:
+If you're on Ubuntu >= 20.04, add the `apt` repository with `add-apt-repository`:
 
+(on other Ubuntu/Debian-based systems follow the ♰ instructions below)
 

 # add the repo to your sources and install the archivebox package using apt
 sudo apt install software-properties-common
 sudo add-apt-repository -u ppa:archivebox/archivebox
 sudo apt install archivebox
 
-(♰ otherwise follow the instructions below for other systems)

 # create a new empty directory and initalize your collection (can be anywhere)
@@ -183,7 +183,7 @@ archivebox list --json --with-headers > index.json
 archivebox help  # to see more options
 
-♰ For other Debian-based systems or older Ubuntu systems you can add these sources to `/etc/apt/sources.list`: +♰ On other Ubuntu/Debian-based systems add these sources directly to /etc/apt/sources.list.d/archivebox.list:

 echo "deb http://ppa.launchpad.net/archivebox/archivebox/ubuntu focal main" > /etc/apt/sources.list.d/archivebox.list
@@ -191,8 +191,9 @@ echo "deb-src http://ppa.launchpad.net/archivebox/archivebox/ubuntu focal main"
 sudo apt-key adv --keyserver keyserver.ubuntu.com --recv-keys C258F79DCC02E369
 sudo apt update
 sudo apt install archivebox
+sudo snap install chromium
 archivebox --version
-# then continue the instructions above
+# then scroll back up and continue the initialization instructions above
 
(you may need to install some other dependencies manually however) From 7d62fc23fae7dbedf5f8bfaa67184350c41d30f4 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Tue, 9 Feb 2021 21:40:46 -0500 Subject: [PATCH 004/137] fix macOS unnecessary version limitation --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index a4da114e..30c70aca 100644 --- a/README.md +++ b/README.md @@ -201,7 +201,7 @@ archivebox --version
-Get ArchiveBox with brew on macOS >=10.13 +Get ArchiveBox with brew on macOS First make sure you have Homebrew installed: https://brew.sh/#install From 2b9282e754771f7aa06c6c736e889ed3796bd435 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Tue, 9 Feb 2021 21:45:28 -0500 Subject: [PATCH 005/137] send some love to monadical --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 30c70aca..ccd03819 100644 --- a/README.md +++ b/README.md @@ -541,7 +541,8 @@ Whether you want to learn which organizations are the big players in the web arc _A collection of the most active internet archiving communities and initiatives._ - Check out the ArchiveBox [Roadmap](https://github.com/ArchiveBox/ArchiveBox/wiki/Roadmap) and [Changelog](https://github.com/ArchiveBox/ArchiveBox/wiki/Changelog) - Learn why archiving the internet is important by reading the "[On the Importance of Web Archiving](https://parameters.ssrc.org/2018/09/on-the-importance-of-web-archiving/)" blog post. -- Or reach out to me for questions and comments via [@ArchiveBoxApp](https://twitter.com/ArchiveBoxApp) or [@theSquashSH](https://twitter.com/thesquashSH) on Twitter. +- Reach out to me for questions and comments via [@ArchiveBoxApp](https://twitter.com/ArchiveBoxApp) or [@theSquashSH](https://twitter.com/thesquashSH) on Twitter +- Hire us to develop an internet archiving solution for you [@MonadicalSAS](https://twitter.com/MonadicalSAS) [Monadical.com](https://monadical.com)
From af09ac0e7f5c6993d984c45f587c0f9a72d7b930 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Tue, 9 Feb 2021 21:52:55 -0500 Subject: [PATCH 006/137] Update README.md --- README.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index ccd03819..47fcd285 100644 --- a/README.md +++ b/README.md @@ -727,7 +727,11 @@ archivebox manage dbshell


-This project is maintained mostly in my spare time with the help from generous contributors and Monadical.com. + +This project is maintained mostly in my spare time with the help from generous contributors and Monadical. +

+✨ Monadical is a software consultancy specializing in Internet Archiving and Python+JS development, hire us for your projects! +



From 7e18fb87652cf4a48f13531bcd8e1ac23ff61817 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Tue, 9 Feb 2021 21:56:40 -0500 Subject: [PATCH 007/137] Update README.md --- README.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 47fcd285..8f10c790 100644 --- a/README.md +++ b/README.md @@ -727,11 +727,11 @@ archivebox manage dbshell


- -This project is maintained mostly in my spare time with the help from generous contributors and Monadical. -

-✨ Monadical is a software consultancy specializing in Internet Archiving and Python+JS development, hire us for your projects! -
+ +This project is maintained mostly in my spare time with the help from generous contributors and Monadical. +/sub>

+✨ Monadical is our software consultancy specializing in Python+JS development, and it helps support this project. Check us out if you need internet archiving or full-stack development! ✨ +



From 8439f3f532936537f5916d3a28b761d56576d973 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Tue, 9 Feb 2021 21:57:53 -0500 Subject: [PATCH 008/137] Update README.md --- README.md | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 8f10c790..002696ee 100644 --- a/README.md +++ b/README.md @@ -728,9 +728,8 @@ archivebox manage dbshell
-This project is maintained mostly in my spare time with the help from generous contributors and Monadical. -/sub>

-✨ Monadical is our software consultancy specializing in Python+JS development, and it helps support this project. Check us out if you need internet archiving or full-stack development! ✨ +This project is maintained mostly in my spare time with the help from generous contributors and Monadical (✨ [hire them](https://monadical.com) for dev work!). +


From e8069f8043999dcc9d481c826a4ceb10f76e6bcc Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Tue, 9 Feb 2021 21:58:25 -0500 Subject: [PATCH 009/137] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 002696ee..38850745 100644 --- a/README.md +++ b/README.md @@ -728,7 +728,7 @@ archivebox manage dbshell
-This project is maintained mostly in my spare time with the help from generous contributors and Monadical (✨ [hire them](https://monadical.com) for dev work!). +This project is maintained mostly in my spare time with the help from generous contributors and Monadical (✨ hire them for dev work!).

From 188670eb8be643ed7d38d4db32a2d8fe1eb99b4e Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 15 Feb 2021 13:38:32 -0500 Subject: [PATCH 010/137] disable sonic by default in docker-compose and add instructions --- docker-compose.yml | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index 0b4cad24..37c92cb4 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -20,27 +20,27 @@ services: - 8000:8000 environment: - USE_COLOR=True - - SHOW_PROGRESS=False - - SEARCH_BACKEND_ENGINE=sonic - - SEARCH_BACKEND_HOST_NAME=sonic - - SEARCH_BACKEND_PASSWORD=SecretPassword + # - SEARCH_BACKEND_ENGINE=sonic + # - SEARCH_BACKEND_HOST_NAME=sonic + # - SEARCH_BACKEND_PASSWORD=SecretPassword volumes: - ./data:/data - depends_on: - - sonic - # Run sonic search backend - sonic: - image: valeriansaliou/sonic:v1.3.0 - ports: - - 1491:1491 - environment: - - SEARCH_BACKEND_PASSWORD=SecretPassword - volumes: - - ./etc/sonic/config.cfg:/etc/sonic.cfg - - ./data:/var/lib/sonic/store/ + # To run the Sonic full-text search backend, create an ./etc/sonic folder + # and download the sonic config file from here into that folder: + # https://raw.githubusercontent.com/ArchiveBox/ArchiveBox/master/etc/sonic/config.cfg + # sonic: + # image: valeriansaliou/sonic:v1.3.0 + # expose: + # - 1491 + # environment: + # - SEARCH_BACKEND_PASSWORD=SecretPassword + # volumes: + # - ./etc/sonic/config.cfg:/etc/sonic.cfg + # - ./data/sonic:/var/lib/sonic/store - # Optional Addons: tweak these examples as needed for your specific use case + + ### Optional Addons: tweak these examples as needed for your specific use case # Example: Run scheduled imports in a docker instead of using cron on the # host machine, add tasks and see more info with archivebox schedule --help From 611216765d7e0006bff9431f900f8571d50c037c Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 15 Feb 2021 13:39:03 -0500 Subject: [PATCH 
011/137] switch sqlite to use WAL mode by default to prevent database locked errors --- archivebox/core/settings.py | 3 +++ archivebox/main.py | 2 ++ 2 files changed, 5 insertions(+) diff --git a/archivebox/core/settings.py b/archivebox/core/settings.py index e73c93d9..853a4863 100644 --- a/archivebox/core/settings.py +++ b/archivebox/core/settings.py @@ -107,6 +107,9 @@ DATABASES = { 'default': { 'ENGINE': 'django.db.backends.sqlite3', 'NAME': DATABASE_NAME, + 'OPTIONS': { + 'init_command': 'PRAGMA journal_mode=wal;', + } } } diff --git a/archivebox/main.py b/archivebox/main.py index c1751528..a1e58619 100644 --- a/archivebox/main.py +++ b/archivebox/main.py @@ -125,10 +125,12 @@ ALLOWED_IN_OUTPUT_DIR = { 'node_modules', 'package-lock.json', 'static', + 'sonic', ARCHIVE_DIR_NAME, SOURCES_DIR_NAME, LOGS_DIR_NAME, SQL_INDEX_FILENAME, + f'{SQL_INDEX_FILENAME}-wal', JSON_INDEX_FILENAME, HTML_INDEX_FILENAME, ROBOTS_TXT_FILENAME, From 128e7f0e8d542b1d91260cef74c4413c12c356ef Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 15 Feb 2021 13:39:37 -0500 Subject: [PATCH 012/137] dont show progress bars in docker by default --- docker-compose.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/docker-compose.yml b/docker-compose.yml index 37c92cb4..96b90a35 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -20,6 +20,7 @@ services: - 8000:8000 environment: - USE_COLOR=True + - SHOW_PROGRESS=False # - SEARCH_BACKEND_ENGINE=sonic # - SEARCH_BACKEND_HOST_NAME=sonic # - SEARCH_BACKEND_PASSWORD=SecretPassword From 0407d03b6bcbca92a0e99470b66ce89e3d61b288 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 15 Feb 2021 13:39:49 -0500 Subject: [PATCH 013/137] add cli tests file back --- archivebox/cli/tests.py | 227 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 227 insertions(+) create mode 100644 archivebox/cli/tests.py diff --git a/archivebox/cli/tests.py b/archivebox/cli/tests.py new file mode 100644 index 00000000..4d7016aa --- /dev/null +++ 
b/archivebox/cli/tests.py @@ -0,0 +1,227 @@ +#!/usr/bin/env python3 + +__package__ = 'archivebox.cli' + + +import os +import sys +import shutil +import unittest +from pathlib import Path + +from contextlib import contextmanager + +TEST_CONFIG = { + 'USE_COLOR': 'False', + 'SHOW_PROGRESS': 'False', + + 'OUTPUT_DIR': 'data.tests', + + 'SAVE_ARCHIVE_DOT_ORG': 'False', + 'SAVE_TITLE': 'False', + + 'USE_CURL': 'False', + 'USE_WGET': 'False', + 'USE_GIT': 'False', + 'USE_CHROME': 'False', + 'USE_YOUTUBEDL': 'False', +} + +OUTPUT_DIR = 'data.tests' +os.environ.update(TEST_CONFIG) + +from ..main import init +from ..index import load_main_index +from ..config import ( + SQL_INDEX_FILENAME, + JSON_INDEX_FILENAME, + HTML_INDEX_FILENAME, +) + +from . import ( + archivebox_init, + archivebox_add, + archivebox_remove, +) + +HIDE_CLI_OUTPUT = True + +test_urls = ''' +https://example1.com/what/is/happening.html?what=1#how-about-this=1 +https://example2.com/what/is/happening/?what=1#how-about-this=1 +HTtpS://example3.com/what/is/happening/?what=1#how-about-this=1f +https://example4.com/what/is/happening.html +https://example5.com/ +https://example6.com + +http://example7.com +[https://example8.com/what/is/this.php?what=1] +[and http://example9.com?what=1&other=3#and-thing=2] +https://example10.com#and-thing=2 " +abcdef +sdflkf[what](https://subb.example12.com/who/what.php?whoami=1#whatami=2)?am=hi +example13.bada +and example14.badb +htt://example15.badc +''' + +stdout = sys.stdout +stderr = sys.stderr + + +@contextmanager +def output_hidden(show_failing=True): + if not HIDE_CLI_OUTPUT: + yield + return + + sys.stdout = open('stdout.txt', 'w+') + sys.stderr = open('stderr.txt', 'w+') + try: + yield + sys.stdout.close() + sys.stderr.close() + sys.stdout = stdout + sys.stderr = stderr + except: + sys.stdout.close() + sys.stderr.close() + sys.stdout = stdout + sys.stderr = stderr + if show_failing: + with open('stdout.txt', 'r') as f: + print(f.read()) + with open('stderr.txt', 'r') 
as f: + print(f.read()) + raise + finally: + os.remove('stdout.txt') + os.remove('stderr.txt') + + +class TestInit(unittest.TestCase): + def setUp(self): + os.makedirs(OUTPUT_DIR, exist_ok=True) + + def tearDown(self): + shutil.rmtree(OUTPUT_DIR, ignore_errors=True) + + def test_basic_init(self): + with output_hidden(): + archivebox_init.main([]) + + assert (Path(OUTPUT_DIR) / SQL_INDEX_FILENAME).exists() + assert (Path(OUTPUT_DIR) / JSON_INDEX_FILENAME).exists() + assert (Path(OUTPUT_DIR) / HTML_INDEX_FILENAME).exists() + assert len(load_main_index(out_dir=OUTPUT_DIR)) == 0 + + def test_conflicting_init(self): + with open(Path(OUTPUT_DIR) / 'test_conflict.txt', 'w+') as f: + f.write('test') + + try: + with output_hidden(show_failing=False): + archivebox_init.main([]) + assert False, 'Init should have exited with an exception' + except SystemExit: + pass + + assert not (Path(OUTPUT_DIR) / SQL_INDEX_FILENAME).exists() + assert not (Path(OUTPUT_DIR) / JSON_INDEX_FILENAME).exists() + assert not (Path(OUTPUT_DIR) / HTML_INDEX_FILENAME).exists() + try: + load_main_index(out_dir=OUTPUT_DIR) + assert False, 'load_main_index should raise an exception when no index is present' + except: + pass + + def test_no_dirty_state(self): + with output_hidden(): + init() + shutil.rmtree(OUTPUT_DIR, ignore_errors=True) + with output_hidden(): + init() + + +class TestAdd(unittest.TestCase): + def setUp(self): + os.makedirs(OUTPUT_DIR, exist_ok=True) + with output_hidden(): + init() + + def tearDown(self): + shutil.rmtree(OUTPUT_DIR, ignore_errors=True) + + def test_add_arg_url(self): + with output_hidden(): + archivebox_add.main(['https://getpocket.com/users/nikisweeting/feed/all']) + + all_links = load_main_index(out_dir=OUTPUT_DIR) + assert len(all_links) == 30 + + def test_add_arg_file(self): + test_file = Path(OUTPUT_DIR) / 'test.txt' + with open(test_file, 'w+') as f: + f.write(test_urls) + + with output_hidden(): + archivebox_add.main([test_file]) + + all_links = 
load_main_index(out_dir=OUTPUT_DIR) + assert len(all_links) == 12 + os.remove(test_file) + + def test_add_stdin_url(self): + with output_hidden(): + archivebox_add.main([], stdin=test_urls) + + all_links = load_main_index(out_dir=OUTPUT_DIR) + assert len(all_links) == 12 + + +class TestRemove(unittest.TestCase): + def setUp(self): + os.makedirs(OUTPUT_DIR, exist_ok=True) + with output_hidden(): + init() + archivebox_add.main([], stdin=test_urls) + + # def tearDown(self): + # shutil.rmtree(OUTPUT_DIR, ignore_errors=True) + + + def test_remove_exact(self): + with output_hidden(): + archivebox_remove.main(['--yes', '--delete', 'https://example5.com/']) + + all_links = load_main_index(out_dir=OUTPUT_DIR) + assert len(all_links) == 11 + + def test_remove_regex(self): + with output_hidden(): + archivebox_remove.main(['--yes', '--delete', '--filter-type=regex', r'http(s)?:\/\/(.+\.)?(example\d\.com)']) + + all_links = load_main_index(out_dir=OUTPUT_DIR) + assert len(all_links) == 4 + + def test_remove_domain(self): + with output_hidden(): + archivebox_remove.main(['--yes', '--delete', '--filter-type=domain', 'example5.com', 'example6.com']) + + all_links = load_main_index(out_dir=OUTPUT_DIR) + assert len(all_links) == 10 + + def test_remove_none(self): + try: + with output_hidden(show_failing=False): + archivebox_remove.main(['--yes', '--delete', 'https://doesntexist.com']) + assert False, 'Should raise if no URLs match' + except: + pass + + +if __name__ == '__main__': + if '--verbose' in sys.argv or '-v' in sys.argv: + HIDE_CLI_OUTPUT = False + + unittest.main() From e61e12c889c937e3fc29ef95fab8cfd2512fc71a Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 15 Feb 2021 14:51:32 -0500 Subject: [PATCH 014/137] use setup.py to determine dependencies in Dockerfile instead of egg-info requires.txt --- Dockerfile | 8 ++-- archivebox/config.py | 5 +++ archivebox/core/settings.py | 3 -- setup.py | 88 ++++++++++++++++++++----------------- 4 files changed, 57 insertions(+), 
47 deletions(-) diff --git a/Dockerfile b/Dockerfile index 8cf2da30..bb750721 100644 --- a/Dockerfile +++ b/Dockerfile @@ -79,13 +79,13 @@ WORKDIR "$CODE_DIR" ENV PATH="${PATH}:$VENV_PATH/bin" RUN python -m venv --clear --symlinks "$VENV_PATH" \ && pip install --upgrade --quiet pip setuptools -ADD ./pip_dist/archivebox.egg-info/requires.txt "$CODE_DIR/pip_dist/archivebox.egg-info/requires.txt" +ADD "./setup.py" "$CODE_DIR/" +ADD "./README.md" "./package.json" "$CODE_DIR/archivebox/" RUN apt-get update -qq \ && apt-get install -qq -y --no-install-recommends \ build-essential python-dev python3-dev \ - # && pip install --upgrade pip \ - && grep -B 1000 -E '^$' "$CODE_DIR/pip_dist/archivebox.egg-info/requires.txt" | pip install --quiet -r /dev/stdin \ - && pip install --quiet "sonic-client==0.0.5" \ + && python3 -c 'from distutils.core import run_setup; result = run_setup("./setup.py", stop_after="init"); print("\n".join(result.install_requires + result.extras_require["sonic"]))' > /tmp/requirements.txt \ + && pip install --quiet -r /tmp/requirements.txt \ && apt-get purge -y build-essential python-dev python3-dev \ && apt-get autoremove -y \ && rm -rf /var/lib/apt/lists/* diff --git a/archivebox/config.py b/archivebox/config.py index 3d48344f..b8acb3f7 100644 --- a/archivebox/config.py +++ b/archivebox/config.py @@ -1075,6 +1075,11 @@ def setup_django(out_dir: Path=None, check_db=False, config: ConfigDict=CONFIG, call_command("migrate", interactive=False, verbosity=0) else: django.setup() + + # Enable WAL mode in sqlite3 + from django.db import connection + with connection.cursor() as cursor: + cursor.execute("PRAGMA journal_mode=wal;") if check_db: sql_index_path = Path(output_dir) / SQL_INDEX_FILENAME diff --git a/archivebox/core/settings.py b/archivebox/core/settings.py index 853a4863..e73c93d9 100644 --- a/archivebox/core/settings.py +++ b/archivebox/core/settings.py @@ -107,9 +107,6 @@ DATABASES = { 'default': { 'ENGINE': 'django.db.backends.sqlite3', 'NAME': 
DATABASE_NAME, - 'OPTIONS': { - 'init_command': 'PRAGMA journal_mode=wal;', - } } } diff --git a/setup.py b/setup.py index 962db8d8..3ab4f238 100755 --- a/setup.py +++ b/setup.py @@ -27,6 +27,47 @@ PACKAGE_DIR = ROOT_DIR / PKG_NAME README = (PACKAGE_DIR / "README.md").read_text(encoding='utf-8', errors='ignore') VERSION = json.loads((PACKAGE_DIR / "package.json").read_text().strip())['version'] +PYTHON_REQUIRES = ">=3.7" +SETUP_REQUIRES = ["wheel"] +INSTALL_REQUIRES = [ + # only add things here that have corresponding apt python3-packages available + # anything added here also needs to be added to our package dependencies in + # stdeb.cfg (apt), archivebox.rb (brew), Dockerfile, etc. + # if there is no apt python3-package equivalent, then vendor it instead in + # ./archivebox/vendor/ + "requests>=2.24.0", + "atomicwrites>=1.4.0", + "mypy-extensions>=0.4.3", + "django>=3.1.3", + "django-extensions>=3.0.3", + "dateparser", + "ipython", + "youtube-dl", + "python-crontab>=2.5.1", + "croniter>=0.3.34", + "w3lib>=1.22.0", +] +EXTRAS_REQUIRE = { + 'sonic': [ + "sonic-client>=0.0.5", + ], + 'dev': [ + "setuptools", + "twine", + "wheel", + "flake8", + "ipdb", + "mypy", + "django-stubs", + "sphinx", + "sphinx-rtd-theme", + "recommonmark", + "pytest", + "bottle", + "stdeb", + ], +} + # To see when setup.py gets called (uncomment for debugging): # import sys # print(PACKAGE_DIR, f" (v{VERSION})") @@ -36,7 +77,9 @@ VERSION = json.loads((PACKAGE_DIR / "package.json").read_text().strip())['versio class DisabledTestCommand(test): def run(self): # setup.py test is deprecated, disable it here by force so stdeb doesnt run it - print('Use the ./bin/test.sh script to run tests, not setup.py test.') + print() + print('[X] Running tests via setup.py test is deprecated.') + print(' Hint: Use the ./bin/test.sh script or pytest instead') setuptools.setup( @@ -50,45 +93,10 @@ setuptools.setup( long_description_content_type="text/markdown", url=REPO_URL, project_urls=PROJECT_URLS, - 
python_requires=">=3.7", - setup_requires=[ - "wheel", - ], - install_requires=[ - # only add things here that have corresponding apt python3-packages available - # anything added here also needs to be added to our package dependencies in - # stdeb.cfg (apt), archivebox.rb (brew), Dockerfile, etc. - # if there is no apt python3-package equivalent, then vendor it instead in - # ./archivebox/vendor/ - "requests==2.24.0", - "atomicwrites==1.4.0", - "mypy-extensions==0.4.3", - "django==3.1.3", - "django-extensions==3.0.3", - "dateparser", - "ipython", - "youtube-dl", - "python-crontab==2.5.1", - "croniter==0.3.34", - "w3lib==1.22.0", - ], - extras_require={ - 'dev': [ - "setuptools", - "twine", - "wheel", - "flake8", - "ipdb", - "mypy", - "django-stubs", - "sphinx", - "sphinx-rtd-theme", - "recommonmark", - "pytest", - "bottle", - "stdeb", - ], - }, + python_requires=PYTHON_REQUIRES, + setup_requires=SETUP_REQUIRES, + install_requires=INSTALL_REQUIRES, + extras_require=EXTRAS_REQUIRE, packages=[PKG_NAME], include_package_data=True, # see MANIFEST.in entry_points={ From 3c3bae02d2937a9824d74f0e1db766f657ba7996 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 15 Feb 2021 14:52:10 -0500 Subject: [PATCH 015/137] add quick-init option to skip reimporting all snapshot dirs on init --- archivebox/cli/archivebox_init.py | 6 +++ archivebox/cli/archivebox_server.py | 8 ++- archivebox/main.py | 82 +++++++++++++++-------------- docker-compose.yml | 2 +- 4 files changed, 56 insertions(+), 42 deletions(-) diff --git a/archivebox/cli/archivebox_init.py b/archivebox/cli/archivebox_init.py index 6255ef26..5753269c 100755 --- a/archivebox/cli/archivebox_init.py +++ b/archivebox/cli/archivebox_init.py @@ -27,11 +27,17 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional action='store_true', help='Ignore unrecognized files in current directory and initialize anyway', ) + parser.add_argument( + '--quick', '-q', + action='store_true', + help='Run any 
updates or migrations without rechecking all snapshot dirs', + ) command = parser.parse_args(args or ()) reject_stdin(__command__, stdin) init( force=command.force, + quick=command.quick, out_dir=pwd or OUTPUT_DIR, ) diff --git a/archivebox/cli/archivebox_server.py b/archivebox/cli/archivebox_server.py index a4d96dc9..a6ec987e 100644 --- a/archivebox/cli/archivebox_server.py +++ b/archivebox/cli/archivebox_server.py @@ -41,7 +41,12 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional parser.add_argument( '--init', action='store_true', - help='Run archivebox init before starting the server', + help='Run a full archivebox init/upgrade before starting the server', + ) + parser.add_argument( + '--quick-init', '-i', + action='store_true', + help='Run quick archivebox init/upgrade before starting the server', ) parser.add_argument( '--createsuperuser', @@ -56,6 +61,7 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional reload=command.reload, debug=command.debug, init=command.init, + quick_init=command.quick_init, createsuperuser=command.createsuperuser, out_dir=pwd or OUTPUT_DIR, ) diff --git a/archivebox/main.py b/archivebox/main.py index a1e58619..bad93706 100644 --- a/archivebox/main.py +++ b/archivebox/main.py @@ -263,7 +263,7 @@ def run(subcommand: str, @enforce_types -def init(force: bool=False, out_dir: Path=OUTPUT_DIR) -> None: +def init(force: bool=False, quick: bool=False, out_dir: Path=OUTPUT_DIR) -> None: """Initialize a new ArchiveBox collection in the current directory""" from core.models import Snapshot @@ -345,48 +345,49 @@ def init(force: bool=False, out_dir: Path=OUTPUT_DIR) -> None: all_links = load_main_index(out_dir=out_dir, warn=False) print(' √ Loaded {} links from existing main index.'.format(all_links.count())) - # Links in data folders that dont match their timestamp - fixed, cant_fix = fix_invalid_folder_locations(out_dir=out_dir) - if fixed: - print(' {lightyellow}√ Fixed {} data 
directory locations that didn\'t match their link timestamps.{reset}'.format(len(fixed), **ANSI)) - if cant_fix: - print(' {lightyellow}! Could not fix {} data directory locations due to conflicts with existing folders.{reset}'.format(len(cant_fix), **ANSI)) + if not quick: + # Links in data folders that dont match their timestamp + fixed, cant_fix = fix_invalid_folder_locations(out_dir=out_dir) + if fixed: + print(' {lightyellow}√ Fixed {} data directory locations that didn\'t match their link timestamps.{reset}'.format(len(fixed), **ANSI)) + if cant_fix: + print(' {lightyellow}! Could not fix {} data directory locations due to conflicts with existing folders.{reset}'.format(len(cant_fix), **ANSI)) - # Links in JSON index but not in main index - orphaned_json_links = { - link.url: link - for link in parse_json_main_index(out_dir) - if not all_links.filter(url=link.url).exists() - } - if orphaned_json_links: - pending_links.update(orphaned_json_links) - print(' {lightyellow}√ Added {} orphaned links from existing JSON index...{reset}'.format(len(orphaned_json_links), **ANSI)) + # Links in JSON index but not in main index + orphaned_json_links = { + link.url: link + for link in parse_json_main_index(out_dir) + if not all_links.filter(url=link.url).exists() + } + if orphaned_json_links: + pending_links.update(orphaned_json_links) + print(' {lightyellow}√ Added {} orphaned links from existing JSON index...{reset}'.format(len(orphaned_json_links), **ANSI)) - # Links in data dir indexes but not in main index - orphaned_data_dir_links = { - link.url: link - for link in parse_json_links_details(out_dir) - if not all_links.filter(url=link.url).exists() - } - if orphaned_data_dir_links: - pending_links.update(orphaned_data_dir_links) - print(' {lightyellow}√ Added {} orphaned links from existing archive directories.{reset}'.format(len(orphaned_data_dir_links), **ANSI)) + # Links in data dir indexes but not in main index + orphaned_data_dir_links = { + link.url: link + for 
link in parse_json_links_details(out_dir) + if not all_links.filter(url=link.url).exists() + } + if orphaned_data_dir_links: + pending_links.update(orphaned_data_dir_links) + print(' {lightyellow}√ Added {} orphaned links from existing archive directories.{reset}'.format(len(orphaned_data_dir_links), **ANSI)) - # Links in invalid/duplicate data dirs - invalid_folders = { - folder: link - for folder, link in get_invalid_folders(all_links, out_dir=out_dir).items() - } - if invalid_folders: - print(' {lightyellow}! Skipped adding {} invalid link data directories.{reset}'.format(len(invalid_folders), **ANSI)) - print(' X ' + '\n X '.join(f'{folder} {link}' for folder, link in invalid_folders.items())) - print() - print(' {lightred}Hint:{reset} For more information about the link data directories that were skipped, run:'.format(**ANSI)) - print(' archivebox status') - print(' archivebox list --status=invalid') + # Links in invalid/duplicate data dirs + invalid_folders = { + folder: link + for folder, link in get_invalid_folders(all_links, out_dir=out_dir).items() + } + if invalid_folders: + print(' {lightyellow}! 
Skipped adding {} invalid link data directories.{reset}'.format(len(invalid_folders), **ANSI)) + print(' X ' + '\n X '.join(f'{folder} {link}' for folder, link in invalid_folders.items())) + print() + print(' {lightred}Hint:{reset} For more information about the link data directories that were skipped, run:'.format(**ANSI)) + print(' archivebox status') + print(' archivebox list --status=invalid') - write_main_index(list(pending_links.values()), out_dir=out_dir) + write_main_index(list(pending_links.values()), out_dir=out_dir) print('\n{green}------------------------------------------------------------------{reset}'.format(**ANSI)) if existing_index: @@ -1063,14 +1064,15 @@ def server(runserver_args: Optional[List[str]]=None, reload: bool=False, debug: bool=False, init: bool=False, + quick_init: bool=False, createsuperuser: bool=False, out_dir: Path=OUTPUT_DIR) -> None: """Run the ArchiveBox HTTP server""" runserver_args = runserver_args or [] - if init: - run_subcommand('init', stdin=None, pwd=out_dir) + if init or quick_init: + run_subcommand('init', quick=quick_init, stdin=None, pwd=out_dir) if createsuperuser: run_subcommand('manage', subcommand_args=['createsuperuser'], pwd=out_dir) diff --git a/docker-compose.yml b/docker-compose.yml index 96b90a35..a8fd08a9 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -13,7 +13,7 @@ services: archivebox: # build: . 
image: ${DOCKER_IMAGE:-archivebox/archivebox:latest} - command: server 0.0.0.0:8000 + command: server --quick-init 0.0.0.0:8000 stdin_open: true tty: true ports: From 00ae1f15a75b0511db47f4c058ed8bb627049421 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 15 Feb 2021 14:52:37 -0500 Subject: [PATCH 016/137] ignore shm db file and config files in archivebox data dir on init --- archivebox/main.py | 4 ++++ docker-compose.yml | 3 ++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/archivebox/main.py b/archivebox/main.py index bad93706..169921fd 100644 --- a/archivebox/main.py +++ b/archivebox/main.py @@ -76,6 +76,7 @@ from .config import ( ARCHIVE_DIR, LOGS_DIR, CONFIG_FILE, + CONFIG_FILENAME, ARCHIVE_DIR_NAME, SOURCES_DIR_NAME, LOGS_DIR_NAME, @@ -131,10 +132,13 @@ ALLOWED_IN_OUTPUT_DIR = { LOGS_DIR_NAME, SQL_INDEX_FILENAME, f'{SQL_INDEX_FILENAME}-wal', + f'{SQL_INDEX_FILENAME}-shm', JSON_INDEX_FILENAME, HTML_INDEX_FILENAME, ROBOTS_TXT_FILENAME, FAVICON_FILENAME, + CONFIG_FILENAME, + f'{CONFIG_FILENAME}.bak', } @enforce_types diff --git a/docker-compose.yml b/docker-compose.yml index a8fd08a9..b6338e0a 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -21,11 +21,12 @@ services: environment: - USE_COLOR=True - SHOW_PROGRESS=False - # - SEARCH_BACKEND_ENGINE=sonic + # - SEARCH_BACKEND_ENGINE=sonic # uncomment these if you enable sonic below # - SEARCH_BACKEND_HOST_NAME=sonic # - SEARCH_BACKEND_PASSWORD=SecretPassword volumes: - ./data:/data + # - ./archivebox:/app/archivebox # for developers working on archivebox # To run the Sonic full-text search backend, create an ./etc/sonic folder # and download the sonic config file from here into that folder: From 3e96871386c72f649c2889b2b57d10da31580ecb Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 15 Feb 2021 14:53:03 -0500 Subject: [PATCH 017/137] add comment explaining commented out lines in docker-compose file --- docker-compose.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) 
diff --git a/docker-compose.yml b/docker-compose.yml index b6338e0a..43e0d843 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -11,7 +11,7 @@ version: '3.7' services: archivebox: - # build: . + # build: . # for developers working on archivebox image: ${DOCKER_IMAGE:-archivebox/archivebox:latest} command: server --quick-init 0.0.0.0:8000 stdin_open: true From 9cd4ba38f068869495326693863d03dea5196de5 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 15 Feb 2021 20:42:00 -0500 Subject: [PATCH 018/137] add new SNAPSHOTS_PER_PAGE pagination limit config --- archivebox/config.py | 1 + archivebox/core/admin.py | 4 ++-- archivebox/core/views.py | 3 ++- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/archivebox/config.py b/archivebox/config.py index b8acb3f7..f42668b9 100644 --- a/archivebox/config.py +++ b/archivebox/config.py @@ -77,6 +77,7 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = { 'PUBLIC_SNAPSHOTS': {'type': bool, 'default': True}, 'PUBLIC_ADD_VIEW': {'type': bool, 'default': False}, 'FOOTER_INFO': {'type': str, 'default': 'Content is hosted for personal archiving purposes only. 
Contact server owner for any takedown requests.'}, + 'SNAPSHOTS_PER_PAGE': {'type': int, 'default': 40}, }, 'ARCHIVE_METHOD_TOGGLES': { diff --git a/archivebox/core/admin.py b/archivebox/core/admin.py index bacc53c0..e959a7d0 100644 --- a/archivebox/core/admin.py +++ b/archivebox/core/admin.py @@ -21,7 +21,7 @@ from core.mixins import SearchResultsAdminMixin from index.html import snapshot_icons from logging_util import printable_filesize from main import add, remove -from config import OUTPUT_DIR +from config import OUTPUT_DIR, SNAPSHOTS_PER_PAGE from extractors import archive_links # Admin URLs @@ -106,7 +106,7 @@ class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin): actions = [delete_snapshots, overwrite_snapshots, update_snapshots, update_titles, verify_snapshots] actions_template = 'admin/actions_as_select.html' form = SnapshotAdminForm - list_per_page = 40 + list_per_page = SNAPSHOTS_PER_PAGE def get_urls(self): urls = super().get_urls() diff --git a/archivebox/core/views.py b/archivebox/core/views.py index 0e19fad6..9c8313f0 100644 --- a/archivebox/core/views.py +++ b/archivebox/core/views.py @@ -22,6 +22,7 @@ from ..config import ( PUBLIC_ADD_VIEW, VERSION, FOOTER_INFO, + SNAPSHOTS_PER_PAGE, ) from main import add from ..util import base_url, ansi_to_html @@ -94,7 +95,7 @@ class SnapshotView(View): class PublicIndexView(ListView): template_name = 'public_index.html' model = Snapshot - paginate_by = 100 + paginate_by = SNAPSHOTS_PER_PAGE ordering = ['title'] def get_context_data(self, **kwargs): From 78463c243a6da6243452c157ca868e4593566378 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 15 Feb 2021 20:42:33 -0500 Subject: [PATCH 019/137] remove unused GIT_SHA config option --- archivebox/config.py | 1 - archivebox/index/html.py | 3 +-- archivebox/index/json.py | 3 +-- 3 files changed, 2 insertions(+), 5 deletions(-) diff --git a/archivebox/config.py b/archivebox/config.py index f42668b9..10a84a48 100644 --- a/archivebox/config.py +++ 
b/archivebox/config.py @@ -288,7 +288,6 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = { 'ARCHIVEBOX_BINARY': {'default': lambda c: sys.argv[0]}, 'VERSION': {'default': lambda c: json.loads((Path(c['PACKAGE_DIR']) / 'package.json').read_text().strip())['version']}, - 'GIT_SHA': {'default': lambda c: c['VERSION'].split('+')[-1] or 'unknown'}, 'PYTHON_BINARY': {'default': lambda c: sys.executable}, 'PYTHON_ENCODING': {'default': lambda c: sys.stdout.encoding.upper()}, diff --git a/archivebox/index/html.py b/archivebox/index/html.py index ebfe7d78..30922269 100644 --- a/archivebox/index/html.py +++ b/archivebox/index/html.py @@ -20,7 +20,6 @@ from ..util import ( from ..config import ( OUTPUT_DIR, VERSION, - GIT_SHA, FOOTER_INFO, HTML_INDEX_FILENAME, SAVE_ARCHIVE_DOT_ORG, @@ -60,7 +59,7 @@ def main_index_template(links: List[Link], template: str=MAIN_INDEX_TEMPLATE) -> return render_django_template(template, { 'version': VERSION, - 'git_sha': GIT_SHA, + 'git_sha': VERSION, # not used anymore, but kept for backwards compatibility 'num_links': str(len(links)), 'date_updated': datetime.now().strftime('%Y-%m-%d'), 'time_updated': datetime.now().strftime('%Y-%m-%d %H:%M'), diff --git a/archivebox/index/json.py b/archivebox/index/json.py index f24b969f..441e6854 100644 --- a/archivebox/index/json.py +++ b/archivebox/index/json.py @@ -15,7 +15,6 @@ from ..config import ( VERSION, OUTPUT_DIR, FOOTER_INFO, - GIT_SHA, DEPENDENCIES, JSON_INDEX_FILENAME, ARCHIVE_DIR_NAME, @@ -30,7 +29,7 @@ MAIN_INDEX_HEADER = { 'meta': { 'project': 'ArchiveBox', 'version': VERSION, - 'git_sha': GIT_SHA, + 'git_sha': VERSION, # not used anymore, but kept for backwards compatibility 'website': 'https://ArchiveBox.io', 'docs': 'https://github.com/ArchiveBox/ArchiveBox/wiki', 'source': 'https://github.com/ArchiveBox/ArchiveBox', From c28ad8bd1be0a84c370a384f353ceb5915eecf77 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 15 Feb 2021 20:42:59 -0500 Subject: [PATCH 020/137] fix AddLinkForm 
widget complaining about missing template var class --- archivebox/core/forms.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/archivebox/core/forms.py b/archivebox/core/forms.py index ed584c68..5521ff1d 100644 --- a/archivebox/core/forms.py +++ b/archivebox/core/forms.py @@ -20,7 +20,7 @@ ARCHIVE_METHODS = [ class AddLinkForm(forms.Form): url = forms.RegexField(label="URLs (one per line)", regex=URL_REGEX, min_length='6', strip=True, widget=forms.Textarea, required=True) - depth = forms.ChoiceField(label="Archive depth", choices=CHOICES, widget=forms.RadioSelect, initial='0') + depth = forms.ChoiceField(label="Archive depth", choices=CHOICES, initial='0', widget=forms.RadioSelect(attrs={"class": "depth-selection"})) archive_methods = forms.MultipleChoiceField( label="Archive methods (select at least 1, otherwise all will be used by default)", required=False, From 9ce3bd5bdc0ce3c94fa4e865b8a25b74bcce3a41 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 15 Feb 2021 20:43:36 -0500 Subject: [PATCH 021/137] use index.LINK_FILTERS to validate filter-type args instead of hardocding them twice --- archivebox/cli/archivebox_list.py | 5 +++-- archivebox/cli/archivebox_update.py | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/archivebox/cli/archivebox_list.py b/archivebox/cli/archivebox_list.py index 3838cf60..7cfeeb95 100644 --- a/archivebox/cli/archivebox_list.py +++ b/archivebox/cli/archivebox_list.py @@ -12,6 +12,7 @@ from ..main import list_all from ..util import docstring from ..config import OUTPUT_DIR from ..index import ( + LINK_FILTERS, get_indexed_folders, get_archived_folders, get_unarchived_folders, @@ -96,9 +97,9 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional ) ) parser.add_argument( - '--filter-type', + '--filter-type', '-t', type=str, - choices=('exact', 'substring', 'domain', 'regex', 'tag', 'search'), + choices=(*LINK_FILTERS.keys(), 'search'), default='exact', 
help='Type of pattern matching to use when filtering URLs', ) diff --git a/archivebox/cli/archivebox_update.py b/archivebox/cli/archivebox_update.py index 6748096e..bf3c15f8 100644 --- a/archivebox/cli/archivebox_update.py +++ b/archivebox/cli/archivebox_update.py @@ -12,6 +12,7 @@ from ..main import update from ..util import docstring from ..config import OUTPUT_DIR from ..index import ( + LINK_FILTERS, get_indexed_folders, get_archived_folders, get_unarchived_folders, @@ -89,9 +90,9 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional ) ) parser.add_argument( - '--filter-type', + '--filter-type', '-t', type=str, - choices=('exact', 'substring', 'domain', 'regex', 'tag', 'search'), + choices=(*LINK_FILTERS.keys(), 'search'), default='exact', help='Type of pattern matching to use when filtering URLs', ) From 4faef03ba3f3fbb21c11f3a41a31c66d7e83bb75 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 15 Feb 2021 20:44:08 -0500 Subject: [PATCH 022/137] compute snapshot properties directly without loading whole Link --- archivebox/core/models.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/archivebox/core/models.py b/archivebox/core/models.py index 13d75b66..7be705c3 100644 --- a/archivebox/core/models.py +++ b/archivebox/core/models.py @@ -7,7 +7,8 @@ from django.utils.functional import cached_property from django.utils.text import slugify from django.db.models import Case, When, Value, IntegerField -from ..util import parse_date +from ..config import ARCHIVE_DIR +from ..util import parse_date, base_url, hashurl from ..index.schema import Link from ..extractors import get_default_archive_methods, ARCHIVE_METHODS_INDEXING_PRECEDENCE @@ -116,6 +117,11 @@ class Snapshot(models.Model): def bookmarked(self): return parse_date(self.timestamp) + @cached_property + def bookmarked_date(self): + # TODO: remove this + return self.bookmarked + @cached_property def is_archived(self): return 
self.as_link().is_archived @@ -126,15 +132,15 @@ class Snapshot(models.Model): @cached_property def url_hash(self): - return self.as_link().url_hash + return hashurl(self.url) @cached_property def base_url(self): - return self.as_link().base_url + return base_url(self.url) @cached_property def link_dir(self): - return self.as_link().link_dir + return str(ARCHIVE_DIR / self.timestamp) @cached_property def archive_path(self): From 0c9db1c554eb0679bc271c8e301549939c6bfea3 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 15 Feb 2021 20:45:42 -0500 Subject: [PATCH 023/137] remove symbols from random secret key for easier copy pastin --- archivebox/config.py | 2 +- archivebox/core/settings.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/archivebox/config.py b/archivebox/config.py index 10a84a48..c5495ba2 100644 --- a/archivebox/config.py +++ b/archivebox/config.py @@ -480,7 +480,7 @@ def write_config_file(config: Dict[str, str], out_dir: str=None) -> ConfigDict: if (not existing_secret_key) or ('not a valid secret' in existing_secret_key): from django.utils.crypto import get_random_string - chars = 'abcdefghijklmnopqrstuvwxyz0123456789-_+!.' 
+ chars = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_' random_secret_key = get_random_string(50, chars) if 'SERVER_CONFIG' in config_file: config_file['SERVER_CONFIG']['SECRET_KEY'] = random_secret_key diff --git a/archivebox/core/settings.py b/archivebox/core/settings.py index e73c93d9..b11c5857 100644 --- a/archivebox/core/settings.py +++ b/archivebox/core/settings.py @@ -117,7 +117,7 @@ EMAIL_BACKEND = 'django.core.mail.backends.console.EmailBackend' ### Security Settings ################################################################################ -SECRET_KEY = SECRET_KEY or get_random_string(50, 'abcdefghijklmnopqrstuvwxyz0123456789-_+!.') +SECRET_KEY = SECRET_KEY or get_random_string(50, 'abcdefghijklmnopqrstuvwxyz0123456789_') ALLOWED_HOSTS = ALLOWED_HOSTS.split(',') From 33d180afe7592b3486691ab59933f9969ba3f732 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 15 Feb 2021 20:48:35 -0500 Subject: [PATCH 024/137] allow filtering snapshots by timestamp in list, update, and remove cmds --- archivebox/index/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/archivebox/index/__init__.py b/archivebox/index/__init__.py index 04ab0a8d..d3d1bedc 100644 --- a/archivebox/index/__init__.py +++ b/archivebox/index/__init__.py @@ -356,6 +356,7 @@ LINK_FILTERS = { 'regex': lambda pattern: Q(url__iregex=pattern), 'domain': lambda pattern: Q(url__istartswith=f"http://{pattern}") | Q(url__istartswith=f"https://{pattern}") | Q(url__istartswith=f"ftp://{pattern}"), 'tag': lambda pattern: Q(tags__name=pattern), + 'timestamp': lambda pattern: Q(timestamp=pattern), } @enforce_types From 8e98cef7adcc78465d5ca6754b126de362ea1a3a Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 15 Feb 2021 20:48:51 -0500 Subject: [PATCH 025/137] fix after and before args flipped when filtering --- archivebox/main.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/archivebox/main.py b/archivebox/main.py index 169921fd..afcaaeff 
100644 --- a/archivebox/main.py +++ b/archivebox/main.py @@ -818,11 +818,15 @@ def list_links(snapshots: Optional[QuerySet]=None, all_snapshots = load_main_index(out_dir=out_dir) if after is not None: - all_snapshots = all_snapshots.filter(timestamp__lt=after) + all_snapshots = all_snapshots.filter(timestamp__gte=after) if before is not None: - all_snapshots = all_snapshots.filter(timestamp__gt=before) + all_snapshots = all_snapshots.filter(timestamp__lt=before) if filter_patterns: all_snapshots = snapshot_filter(all_snapshots, filter_patterns, filter_type) + + if not all_snapshots: + stderr('[!] No Snapshots matched your filters:', filter_patterns, f'({filter_type})', color='lightyellow') + return all_snapshots @enforce_types From b06e256ad9c11238db589c2bf2bbebe8d9cecdbd Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 15 Feb 2021 20:49:23 -0500 Subject: [PATCH 026/137] fix add command not updating snapshot detail index pages when passed index-only and overwrite flags together --- archivebox/main.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/archivebox/main.py b/archivebox/main.py index afcaaeff..a10ad212 100644 --- a/archivebox/main.py +++ b/archivebox/main.py @@ -585,6 +585,10 @@ def add(urls: Union[str, List[str]], all_links = load_main_index(out_dir=out_dir) if index_only: + if overwrite: + archive_links(imported_links, overwrite=overwrite, methods=['index_only'], out_dir=out_dir) + else: + archive_links(new_links, overwrite=False, methods=['index_only'], out_dir=out_dir) return all_links # Run the archive methods for each link @@ -593,6 +597,7 @@ def add(urls: Union[str, List[str]], } if extractors: archive_kwargs["methods"] = extractors + if update_all: archive_links(all_links, overwrite=overwrite, **archive_kwargs) elif overwrite: From b3a50a2c10e6b9973a8283d0115288dcc54b6d3e Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 15 Feb 2021 20:49:40 -0500 Subject: [PATCH 027/137] fix server quick-init param not being passed properly to 
subcommand --- archivebox/main.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/archivebox/main.py b/archivebox/main.py index a10ad212..8a823597 100644 --- a/archivebox/main.py +++ b/archivebox/main.py @@ -1084,8 +1084,11 @@ def server(runserver_args: Optional[List[str]]=None, runserver_args = runserver_args or [] - if init or quick_init: - run_subcommand('init', quick=quick_init, stdin=None, pwd=out_dir) + if init: + run_subcommand('init', stdin=None, pwd=out_dir) + + if quick_init: + run_subcommand('init', subcommand_args=['--quick'], stdin=None, pwd=out_dir) if createsuperuser: run_subcommand('manage', subcommand_args=['createsuperuser'], pwd=out_dir) From 0ec9bfb9719cf7eaab375c1508a563bb18dfd29f Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 15 Feb 2021 20:50:12 -0500 Subject: [PATCH 028/137] fix dead missing template variables --- archivebox/templates/admin/base.html | 2 +- archivebox/templates/core/base.html | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/archivebox/templates/admin/base.html b/archivebox/templates/admin/base.html index d8ad8d00..d581337f 100644 --- a/archivebox/templates/admin/base.html +++ b/archivebox/templates/admin/base.html @@ -20,7 +20,7 @@ -
@@ -316,8 +323,13 @@ archivebox add < ~/Downloads/firefox_bookmarks_export.html archivebox add < any_text_with_urls_in_it.txt archivebox add --depth=1 'https://example.com/some/downloads.html' archivebox add --depth=1 'https://news.ycombinator.com#2020-12-12' -``` +# (if using docker add -i when passing via stdin) +echo 'https://example.com' | docker run -v $PWD:/data -i archivebox/archivebox add + +# (if using docker-compose add -T when passing via stdin) +echo 'https://example.com' | docker-compose run -T archivebox add +``` - TXT, RSS, XML, JSON, CSV, SQL, HTML, Markdown, or [any other text-based format...](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#Import-a-list-of-URLs-from-a-text-file) - [Browser history](https://github.com/ArchiveBox/ArchiveBox/wiki/Quickstart#2-get-your-list-of-urls-to-archive) or [browser bookmarks](https://github.com/ArchiveBox/ArchiveBox/wiki/Quickstart#2-get-your-list-of-urls-to-archive) (see instructions for: [Chrome](https://support.google.com/chrome/answer/96816?hl=en), [Firefox](https://support.mozilla.org/en-US/kb/export-firefox-bookmarks-to-backup-or-transfer), [Safari](http://i.imgur.com/AtcvUZA.png), [IE](https://support.microsoft.com/en-us/help/211089/how-to-import-and-export-the-internet-explorer-favorites-folder-to-a-32-bit-version-of-windows), [Opera](http://help.opera.com/Windows/12.10/en/importexport.html), [and more...](https://github.com/ArchiveBox/ArchiveBox/wiki/Quickstart#2-get-your-list-of-urls-to-archive)) @@ -337,6 +349,8 @@ The on-disk layout is optimized to be easy to browse by hand and durable long-te # to browse your index statically without running the archivebox server, run: archivebox list --html --with-headers > index.html archivebox list --json --with-headers > index.json +# if running these commands with docker-compose, add -T: +# docker-compose run -T archivebox list ... 
# then open the static index in a browser open index.html From 49939f3eaa472a2d04aa9787799444c65a20732d Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Tue, 16 Feb 2021 01:20:47 -0500 Subject: [PATCH 037/137] only accept stdin if args are not passed, fix stdin hang in docker --- archivebox/cli/archivebox_add.py | 6 +++++- archivebox/cli/archivebox_config.py | 5 ++++- archivebox/cli/archivebox_list.py | 5 ++--- archivebox/cli/archivebox_oneshot.py | 5 ++++- archivebox/cli/archivebox_remove.py | 5 ++++- archivebox/cli/archivebox_update.py | 5 ++++- archivebox/logging_util.py | 29 ++++++++++++++++++++++------ 7 files changed, 46 insertions(+), 14 deletions(-) diff --git a/archivebox/cli/archivebox_add.py b/archivebox/cli/archivebox_add.py index 41c7554d..7266a571 100644 --- a/archivebox/cli/archivebox_add.py +++ b/archivebox/cli/archivebox_add.py @@ -75,7 +75,11 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional ) command = parser.parse_args(args or ()) urls = command.urls - stdin_urls = accept_stdin(stdin) + + stdin_urls = '' + if not urls: + stdin_urls = accept_stdin(stdin) + if (stdin_urls and urls) or (not stdin and not urls): stderr( '[X] You must pass URLs/paths to add via stdin or CLI arguments.\n', diff --git a/archivebox/cli/archivebox_config.py b/archivebox/cli/archivebox_config.py index f81286c6..25621972 100644 --- a/archivebox/cli/archivebox_config.py +++ b/archivebox/cli/archivebox_config.py @@ -45,7 +45,10 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional help='KEY or KEY=VALUE formatted config values to get or set', ) command = parser.parse_args(args or ()) - config_options_str = accept_stdin(stdin) + + config_options_str = '' + if not command.config_options: + config_options_str = accept_stdin(stdin) config( config_options_str=config_options_str, diff --git a/archivebox/cli/archivebox_list.py b/archivebox/cli/archivebox_list.py index 7cfeeb95..1f2ee8c5 100644 --- 
a/archivebox/cli/archivebox_list.py +++ b/archivebox/cli/archivebox_list.py @@ -24,7 +24,7 @@ from ..index import ( get_corrupted_folders, get_unrecognized_folders, ) -from ..logging_util import SmartFormatter, accept_stdin, stderr +from ..logging_util import SmartFormatter, reject_stdin, stderr @docstring(list_all.__doc__) @@ -111,7 +111,7 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional help='List only URLs matching these filter patterns.' ) command = parser.parse_args(args or ()) - filter_patterns_str = accept_stdin(stdin) + reject_stdin(stdin) if command.with_headers and not (command.json or command.html or command.csv): stderr( @@ -121,7 +121,6 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional raise SystemExit(2) matching_folders = list_all( - filter_patterns_str=filter_patterns_str, filter_patterns=command.filter_patterns, filter_type=command.filter_type, status=command.status, diff --git a/archivebox/cli/archivebox_oneshot.py b/archivebox/cli/archivebox_oneshot.py index af68bac2..411cce8b 100644 --- a/archivebox/cli/archivebox_oneshot.py +++ b/archivebox/cli/archivebox_oneshot.py @@ -50,8 +50,11 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional help= "Path to save the single archive folder to, e.g. 
./example.com_archive" ) command = parser.parse_args(args or ()) + stdin_url = None url = command.url - stdin_url = accept_stdin(stdin) + if not url: + stdin_url = accept_stdin(stdin) + if (stdin_url and url) or (not stdin and not url): stderr( '[X] You must pass a URL/path to add via stdin or CLI arguments.\n', diff --git a/archivebox/cli/archivebox_remove.py b/archivebox/cli/archivebox_remove.py index cb073e95..dadf2654 100644 --- a/archivebox/cli/archivebox_remove.py +++ b/archivebox/cli/archivebox_remove.py @@ -61,7 +61,10 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional help='URLs matching this filter pattern will be removed from the index.' ) command = parser.parse_args(args or ()) - filter_str = accept_stdin(stdin) + + filter_str = None + if not command.filter_patterns: + filter_str = accept_stdin(stdin) remove( filter_str=filter_str, diff --git a/archivebox/cli/archivebox_update.py b/archivebox/cli/archivebox_update.py index bf3c15f8..500d4c07 100644 --- a/archivebox/cli/archivebox_update.py +++ b/archivebox/cli/archivebox_update.py @@ -111,7 +111,10 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional default="" ) command = parser.parse_args(args or ()) - filter_patterns_str = accept_stdin(stdin) + + filter_patterns_str = None + if not command.filter_patterns: + filter_patterns_str = accept_stdin(stdin) update( resume=command.resume, diff --git a/archivebox/logging_util.py b/archivebox/logging_util.py index f2b86735..2fbcbb35 100644 --- a/archivebox/logging_util.py +++ b/archivebox/logging_util.py @@ -62,22 +62,40 @@ class SmartFormatter(argparse.HelpFormatter): def reject_stdin(caller: str, stdin: Optional[IO]=sys.stdin) -> None: """Tell the user they passed stdin to a command that doesn't accept it""" - if stdin and not stdin.isatty(): - stdin_raw_text = stdin.read().strip() + if not stdin: + return None + + if IN_DOCKER: + # when TTY is disabled in docker we cant tell if stdin is being 
piped in or not + # if we try to read stdin when its not piped we will hang indefinitely waiting for it + return None + + if not stdin.isatty(): + # stderr('READING STDIN TO REJECT...') + stdin_raw_text = stdin.read() if stdin_raw_text: + # stderr('GOT STDIN!', len(stdin_str)) stderr(f'[X] The "{caller}" command does not accept stdin.', color='red') stderr(f' Run archivebox "{caller} --help" to see usage and examples.') stderr() raise SystemExit(1) + return None def accept_stdin(stdin: Optional[IO]=sys.stdin) -> Optional[str]: """accept any standard input and return it as a string or None""" + if not stdin: return None - elif stdin and not stdin.isatty(): - stdin_str = stdin.read().strip() - return stdin_str or None + + if not stdin.isatty(): + # stderr('READING STDIN TO ACCEPT...') + stdin_str = stdin.read() + + if stdin_str: + # stderr('GOT STDIN...', len(stdin_str)) + return stdin_str + return None @@ -174,7 +192,6 @@ def progress_bar(seconds: int, prefix: str='') -> None: def log_cli_command(subcommand: str, subcommand_args: List[str], stdin: Optional[str], pwd: str): - from .config import VERSION, ANSI cmd = ' '.join(('archivebox', subcommand, *subcommand_args)) stderr('{black}[i] [{now}] ArchiveBox v{VERSION}: {cmd}{reset}'.format( now=datetime.now().strftime('%Y-%m-%d %H:%M:%S'), From 10d687c55c4a0f084e298beb994907a0dc82987a Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Tue, 16 Feb 2021 01:21:19 -0500 Subject: [PATCH 038/137] add new MEDIA_MAX_SIZE=750m size option for limiting youtubedl downloads --- archivebox/config.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/archivebox/config.py b/archivebox/config.py index c5495ba2..9079f063 100644 --- a/archivebox/config.py +++ b/archivebox/config.py @@ -102,6 +102,7 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = { 'RESOLUTION': {'type': str, 'default': '1440,2000', 'aliases': ('SCREENSHOT_RESOLUTION',)}, 'GIT_DOMAINS': {'type': str, 'default': 
'github.com,bitbucket.org,gitlab.com'}, 'CHECK_SSL_VALIDITY': {'type': bool, 'default': True}, + 'MEDIA_MAX_SIZE': {'type': str, 'default': '750m'}, 'CURL_USER_AGENT': {'type': str, 'default': 'ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/) curl/{CURL_VERSION}'}, 'WGET_USER_AGENT': {'type': str, 'default': 'ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/) wget/{WGET_VERSION}'}, @@ -112,7 +113,7 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = { 'CHROME_HEADLESS': {'type': bool, 'default': True}, 'CHROME_SANDBOX': {'type': bool, 'default': lambda c: not c['IN_DOCKER']}, - 'YOUTUBEDL_ARGS': {'type': list, 'default': ['--write-description', + 'YOUTUBEDL_ARGS': {'type': list, 'default': lambda c: ['--write-description', '--write-info-json', '--write-annotations', '--write-thumbnail', @@ -123,7 +124,7 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = { '--ignore-errors', '--geo-bypass', '--add-metadata', - '--max-filesize=750m', + '--max-filesize={}'.format(c['MEDIA_MAX_SIZE']), ]}, From 19f7c907e08a4cd69ffecd32167a17a31f2a8566 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Tue, 16 Feb 2021 01:22:36 -0500 Subject: [PATCH 039/137] exec archivebox from docker entrypoint script to avoid nesting pid under bash --- archivebox/config.py | 1 + bin/docker_entrypoint.sh | 6 ++++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/archivebox/config.py b/archivebox/config.py index 9079f063..41e8c34c 100644 --- a/archivebox/config.py +++ b/archivebox/config.py @@ -1086,5 +1086,6 @@ def setup_django(out_dir: Path=None, check_db=False, config: ConfigDict=CONFIG, sql_index_path = Path(output_dir) / SQL_INDEX_FILENAME assert sql_index_path.exists(), ( f'No database file {SQL_INDEX_FILENAME} found in: {config["OUTPUT_DIR"]} (Are you in an ArchiveBox collection directory?)') + except KeyboardInterrupt: raise SystemExit(2) diff --git a/bin/docker_entrypoint.sh b/bin/docker_entrypoint.sh index 65a4c1f6..c924e788 100755 --- 
a/bin/docker_entrypoint.sh +++ b/bin/docker_entrypoint.sh @@ -3,6 +3,7 @@ DATA_DIR="${DATA_DIR:-/data}" ARCHIVEBOX_USER="${ARCHIVEBOX_USER:-archivebox}" + # Set the archivebox user UID & GID if [[ -n "$PUID" && "$PUID" != 0 ]]; then usermod -u "$PUID" "$ARCHIVEBOX_USER" > /dev/null 2>&1 @@ -11,6 +12,7 @@ if [[ -n "$PGID" && "$PGID" != 0 ]]; then groupmod -g "$PGID" "$ARCHIVEBOX_USER" > /dev/null 2>&1 fi + # Set the permissions of the data dir to match the archivebox user if [[ -d "$DATA_DIR/archive" ]]; then # check data directory permissions @@ -33,11 +35,11 @@ if [[ "$1" == /* || "$1" == "echo" || "$1" == "archivebox" ]]; then # e.g. "archivebox init" # "/bin/bash" # "echo" - gosu "$ARCHIVEBOX_USER" bash -c "$*" + exec gosu "$ARCHIVEBOX_USER" bash -c "$*" else # no command given, assume args were meant to be passed to archivebox cmd # e.g. "add https://example.com" # "manage createsupseruser" # "server 0.0.0.0:8000" - gosu "$ARCHIVEBOX_USER" bash -c "archivebox $*" + exec gosu "$ARCHIVEBOX_USER" bash -c "archivebox $*" fi From 22da885148f01b1101e52e47ea3a641bf8948c20 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Tue, 16 Feb 2021 01:23:01 -0500 Subject: [PATCH 040/137] log every archivebox command run to the errors.log --- archivebox/core/settings.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/archivebox/core/settings.py b/archivebox/core/settings.py index eeb92e23..1e2be75f 100644 --- a/archivebox/core/settings.py +++ b/archivebox/core/settings.py @@ -6,10 +6,14 @@ import re import logging from pathlib import Path +from datetime import datetime from django.utils.crypto import get_random_string from ..config import ( # noqa: F401 DEBUG, + IS_TTY, + VERSION, + IN_DOCKER, SECRET_KEY, ALLOWED_HOSTS, PACKAGE_DIR, @@ -197,6 +201,8 @@ class NoisyRequestsFilter(logging.Filter): return 1 +ERROR_LOG = LOGS_DIR / 'errors.log' + LOGGING = { 'version': 1, 'disable_existing_loggers': False, @@ -207,7 +213,7 @@ LOGGING = { 'logfile': { 
'level': 'ERROR', 'class': 'logging.handlers.RotatingFileHandler', - 'filename': LOGS_DIR / 'errors.log', + 'filename': ERROR_LOG, 'maxBytes': 1024 * 1024 * 25, # 25 MB 'backupCount': 10, }, @@ -231,3 +237,9 @@ LOGGING = { }, } + +# log startup message to the error log +with open(ERROR_LOG, "a+") as f: + command = ' '.join(sys.argv) + ts = datetime.now().strftime('%Y-%m-%d__%H:%M:%S') + f.write(f"\n> {command}; ts={ts} version={VERSION} docker={IN_DOCKER} is_tty={IS_TTY}\n") From 82de67db340bc9e1b69f10e8c73f8486eef87360 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Tue, 16 Feb 2021 01:23:31 -0500 Subject: [PATCH 041/137] fix missing/outdated template variables --- archivebox/index/schema.py | 3 +++ archivebox/templates/core/minimal_index.html | 2 +- archivebox/templates/core/static_index.html | 4 ++-- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/archivebox/index/schema.py b/archivebox/index/schema.py index 1ca4e801..9c83c4cf 100644 --- a/archivebox/index/schema.py +++ b/archivebox/index/schema.py @@ -201,6 +201,9 @@ class Link: 'basename': self.basename, 'extension': self.extension, 'is_static': self.is_static, + + 'tags_str': self.tags, # only used to render static index in index/html.py, remove if no longer needed there + 'icons': None, # only used to render static index in index/html.py, remove if no longer needed there 'bookmarked_date': self.bookmarked_date, 'updated_date': self.updated_date, diff --git a/archivebox/templates/core/minimal_index.html b/archivebox/templates/core/minimal_index.html index 3c69a831..f50007a6 100644 --- a/archivebox/templates/core/minimal_index.html +++ b/archivebox/templates/core/minimal_index.html @@ -4,7 +4,7 @@ Archived Sites - + diff --git a/archivebox/templates/core/static_index.html b/archivebox/templates/core/static_index.html index 07066e27..4e97b83b 100644 --- a/archivebox/templates/core/static_index.html +++ b/archivebox/templates/core/static_index.html @@ -209,7 +209,7 @@
+ diff --git a/archivebox/templates/core/snapshot.html b/archivebox/templates/core/snapshot.html index c7b99c56..3c6fd64b 100644 --- a/archivebox/templates/core/snapshot.html +++ b/archivebox/templates/core/snapshot.html @@ -335,13 +335,15 @@ diff --git a/archivebox/templates/static/admin.css b/archivebox/templates/static/admin.css index 142e1b89..d8673dc7 100644 --- a/archivebox/templates/static/admin.css +++ b/archivebox/templates/static/admin.css @@ -237,3 +237,26 @@ body.model-snapshot.change-list #content .object-tools { opacity: 0.1; filter: grayscale(100%); } + + +#result_list tbody td.field-cmd_str pre, +#result_list tbody td.field-output_str pre { + max-width: 22vw; + word-wrap: anywhere; + white-space: break-spaces; + max-height: 40px; + overflow: hidden; + margin: 2px; + background-color: rgba(0,0,0,0.05); + padding: 1px 4px 16px 8px; + border-radius: 4px; +} + +#result_list tbody td.field-extractor { + font-weight: 800; + font-variant: small-caps; +} + +#result_list tbody td.field-status { + font-variant: small-caps; +} From 908262b133eb660c8f8934fae478f1aaa073980c Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Tue, 16 Feb 2021 15:55:47 -0500 Subject: [PATCH 069/137] add dev dependencies install commented out in Dockerfile --- Dockerfile | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/Dockerfile b/Dockerfile index bb750721..26dd42bf 100644 --- a/Dockerfile +++ b/Dockerfile @@ -50,13 +50,6 @@ RUN apt-get update -qq \ fontconfig fonts-ipafont-gothic fonts-wqy-zenhei fonts-thai-tlwg fonts-kacst fonts-symbola fonts-noto fonts-freefont-ttf \ && rm -rf /var/lib/apt/lists/* -# Install apt development dependencies -# RUN apt-get install -qq \ -# && apt-get install -qq -y --no-install-recommends \ -# python3 python3-dev python3-pip python3-venv python3-all \ -# dh-python debhelper devscripts dput software-properties-common \ -# python3-distutils python3-setuptools python3-wheel python3-stdeb - # Install Node environment RUN 
curl -s https://deb.nodesource.com/gpgkey/nodesource.gpg.key | apt-key add - \ && echo 'deb https://deb.nodesource.com/node_15.x buster main' >> /etc/apt/sources.list \ @@ -90,6 +83,15 @@ RUN apt-get update -qq \ && apt-get autoremove -y \ && rm -rf /var/lib/apt/lists/* +# Install apt development dependencies +# RUN apt-get install -qq \ +# && apt-get install -qq -y --no-install-recommends \ +# python3 python3-dev python3-pip python3-venv python3-all \ +# dh-python debhelper devscripts dput software-properties-common \ +# python3-distutils python3-setuptools python3-wheel python3-stdeb +# RUN python3 -c 'from distutils.core import run_setup; result = run_setup("./setup.py", stop_after="init"); print("\n".join(result.extras_require["dev"]))' > /tmp/dev_requirements.txt \ + # && pip install --quiet -r /tmp/dev_requirements.txt + # Install ArchiveBox Python package and its dependencies WORKDIR "$CODE_DIR" ADD . "$CODE_DIR" From 68e22fdaba5b39443d6fa0a8b71e22febbfb493e Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Tue, 16 Feb 2021 15:56:46 -0500 Subject: [PATCH 070/137] add uwsgi config file --- uwsgi.ini | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 uwsgi.ini diff --git a/uwsgi.ini b/uwsgi.ini new file mode 100644 index 00000000..9fa83abe --- /dev/null +++ b/uwsgi.ini @@ -0,0 +1,13 @@ +[uwsgi] +socket = 127.0.0.1:3031 +chdir = ../ +http = 0.0.0.0:8001 +env = OUTPUT_DIR=./data +wsgi-file = archivebox/core/wsgi.py +processes = 4 +threads = 1 +stats = 127.0.0.1:9191 +static-map /static=./archivebox/templates/static +harakiri = 172800 +post-buffering = 1 +disable-logging = True From 71cf8d5224f3479e7436a89aaee2451c1b6637f9 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Tue, 16 Feb 2021 15:57:13 -0500 Subject: [PATCH 071/137] add migrations --- .../migrations/0009_auto_20210216_1038.py | 18 ++++++++++++++ .../migrations/0010_auto_20210216_1055.py | 18 ++++++++++++++ .../migrations/0011_auto_20210216_1331.py | 24 +++++++++++++++++++ 
.../migrations/0012_auto_20210216_1425.py | 23 ++++++++++++++++++ 4 files changed, 83 insertions(+) create mode 100644 archivebox/core/migrations/0009_auto_20210216_1038.py create mode 100644 archivebox/core/migrations/0010_auto_20210216_1055.py create mode 100644 archivebox/core/migrations/0011_auto_20210216_1331.py create mode 100644 archivebox/core/migrations/0012_auto_20210216_1425.py diff --git a/archivebox/core/migrations/0009_auto_20210216_1038.py b/archivebox/core/migrations/0009_auto_20210216_1038.py new file mode 100644 index 00000000..2817fe54 --- /dev/null +++ b/archivebox/core/migrations/0009_auto_20210216_1038.py @@ -0,0 +1,18 @@ +# Generated by Django 3.1.3 on 2021-02-16 10:38 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('core', '0008_auto_20210105_1421'), + ] + + operations = [ + migrations.AlterField( + model_name='snapshot', + name='updated', + field=models.DateTimeField(auto_now=True, db_index=True, null=True), + ), + ] diff --git a/archivebox/core/migrations/0010_auto_20210216_1055.py b/archivebox/core/migrations/0010_auto_20210216_1055.py new file mode 100644 index 00000000..0af61a39 --- /dev/null +++ b/archivebox/core/migrations/0010_auto_20210216_1055.py @@ -0,0 +1,18 @@ +# Generated by Django 3.1.3 on 2021-02-16 10:55 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('core', '0009_auto_20210216_1038'), + ] + + operations = [ + migrations.AlterField( + model_name='archiveresult', + name='start_ts', + field=models.DateTimeField(db_index=True), + ), + ] diff --git a/archivebox/core/migrations/0011_auto_20210216_1331.py b/archivebox/core/migrations/0011_auto_20210216_1331.py new file mode 100644 index 00000000..d2226674 --- /dev/null +++ b/archivebox/core/migrations/0011_auto_20210216_1331.py @@ -0,0 +1,24 @@ +# Generated by Django 3.1.3 on 2021-02-16 13:31 + +from django.db import migrations, models +import uuid + + 
+class Migration(migrations.Migration): + + dependencies = [ + ('core', '0010_auto_20210216_1055'), + ] + + operations = [ + migrations.AddField( + model_name='archiveresult', + name='uuid', + field=models.UUIDField(default=uuid.uuid4, editable=False), + ), + migrations.AlterField( + model_name='archiveresult', + name='extractor', + field=models.CharField(choices=[('title', 'title'), ('favicon', 'favicon'), ('headers', 'headers'), ('singlefile', 'singlefile'), ('pdf', 'pdf'), ('screenshot', 'screenshot'), ('dom', 'dom'), ('wget', 'wget'), ('readability', 'readability'), ('mercury', 'mercury'), ('git', 'git'), ('media', 'media'), ('archive_org', 'archive_org')], max_length=32), + ), + ] diff --git a/archivebox/core/migrations/0012_auto_20210216_1425.py b/archivebox/core/migrations/0012_auto_20210216_1425.py new file mode 100644 index 00000000..310058ac --- /dev/null +++ b/archivebox/core/migrations/0012_auto_20210216_1425.py @@ -0,0 +1,23 @@ +# Generated by Django 3.1.3 on 2021-02-16 14:25 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('core', '0011_auto_20210216_1331'), + ] + + operations = [ + migrations.AlterField( + model_name='archiveresult', + name='cmd_version', + field=models.CharField(blank=True, default=None, max_length=128, null=True), + ), + migrations.AlterField( + model_name='archiveresult', + name='output', + field=models.CharField(max_length=1024), + ), + ] From d89034dcde37be3a4371f99108a2a31dfc245fce Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Tue, 16 Feb 2021 16:23:09 -0500 Subject: [PATCH 072/137] disable debug_toolbar by default --- archivebox/core/settings.py | 42 +++++++++++++++++++++---------------- 1 file changed, 24 insertions(+), 18 deletions(-) diff --git a/archivebox/core/settings.py b/archivebox/core/settings.py index a8002da9..a90538be 100644 --- a/archivebox/core/settings.py +++ b/archivebox/core/settings.py @@ -40,8 +40,7 @@ LOGOUT_REDIRECT_URL = '/' 
PASSWORD_RESET_URL = '/accounts/password_reset/' APPEND_SLASH = True -DEBUG = True # DEBUG or ('--debug' in sys.argv) -DEBUG_TOOLBAR = True +DEBUG = DEBUG or ('--debug' in sys.argv) INSTALLED_APPS = [ 'django.contrib.auth', @@ -55,6 +54,29 @@ INSTALLED_APPS = [ 'django_extensions', ] + + +MIDDLEWARE = [ + 'django.middleware.security.SecurityMiddleware', + 'django.contrib.sessions.middleware.SessionMiddleware', + 'django.middleware.common.CommonMiddleware', + 'django.middleware.csrf.CsrfViewMiddleware', + 'django.contrib.auth.middleware.AuthenticationMiddleware', + 'django.contrib.messages.middleware.MessageMiddleware', +] + +AUTHENTICATION_BACKENDS = [ + 'django.contrib.auth.backends.ModelBackend', +] + +DEBUG_TOOLBAR = False +if DEBUG: + try: + import debug_toolbar + DEBUG_TOOLBAR = True + except ImportError: + pass + if DEBUG_TOOLBAR: INSTALLED_APPS = [*INSTALLED_APPS, 'debug_toolbar'] INTERNAL_IPS = ['0.0.0.0', '127.0.0.1', '*'] @@ -79,24 +101,8 @@ if DEBUG_TOOLBAR: 'debug_toolbar.panels.profiling.ProfilingPanel', 'djdt_flamegraph.FlamegraphPanel', ] - - -MIDDLEWARE = [ - 'django.middleware.security.SecurityMiddleware', - 'django.contrib.sessions.middleware.SessionMiddleware', - 'django.middleware.common.CommonMiddleware', - 'django.middleware.csrf.CsrfViewMiddleware', - 'django.contrib.auth.middleware.AuthenticationMiddleware', - 'django.contrib.messages.middleware.MessageMiddleware', -] -if DEBUG_TOOLBAR: MIDDLEWARE = [*MIDDLEWARE, 'debug_toolbar.middleware.DebugToolbarMiddleware'] -AUTHENTICATION_BACKENDS = [ - 'django.contrib.auth.backends.ModelBackend', -] - - ################################################################################ ### Staticfile and Template Settings ################################################################################ From 6f0eec92eba1ecd086345014a5995b8fb67e13b3 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Tue, 16 Feb 2021 16:26:48 -0500 Subject: [PATCH 073/137] fix lint errors --- archivebox/config.py | 2 +- 
archivebox/core/settings.py | 1 - archivebox/core/wsgi.py | 1 - archivebox/index/sql.py | 1 - archivebox/logging_util.py | 7 ++++++- 5 files changed, 7 insertions(+), 5 deletions(-) diff --git a/archivebox/config.py b/archivebox/config.py index 6c96a9f8..07fe4a4b 100644 --- a/archivebox/config.py +++ b/archivebox/config.py @@ -1097,7 +1097,7 @@ def setup_django(out_dir: Path=None, check_db=False, config: ConfigDict=CONFIG, with open(settings.ERROR_LOG, "a+") as f: command = ' '.join(sys.argv) ts = datetime.now().strftime('%Y-%m-%d__%H:%M:%S') - f.write(f"\n> {command}; ts={ts} version={VERSION} docker={IN_DOCKER} is_tty={IS_TTY}\n") + f.write(f"\n> {command}; ts={ts} version={config['VERSION']} docker={config['IN_DOCKER']} is_tty={config['IS_TTY']}\n") if check_db: sql_index_path = Path(output_dir) / SQL_INDEX_FILENAME diff --git a/archivebox/core/settings.py b/archivebox/core/settings.py index a90538be..9c716c16 100644 --- a/archivebox/core/settings.py +++ b/archivebox/core/settings.py @@ -6,7 +6,6 @@ import re import logging from pathlib import Path -from datetime import datetime from django.utils.crypto import get_random_string from ..config import ( # noqa: F401 diff --git a/archivebox/core/wsgi.py b/archivebox/core/wsgi.py index 59b3d75a..94993b92 100644 --- a/archivebox/core/wsgi.py +++ b/archivebox/core/wsgi.py @@ -7,7 +7,6 @@ For more information on this file, see https://docs.djangoproject.com/en/2.1/howto/deployment/wsgi/ """ -import os from archivebox.config import setup_django setup_django(in_memory_db=False, check_db=True) diff --git a/archivebox/index/sql.py b/archivebox/index/sql.py index c453df1c..d74b836c 100644 --- a/archivebox/index/sql.py +++ b/archivebox/index/sql.py @@ -30,7 +30,6 @@ def remove_from_sql_main_index(snapshots: QuerySet, out_dir: Path=OUTPUT_DIR) -> @enforce_types def write_link_to_sql_index(link: Link): from core.models import Snapshot, ArchiveResult - from index.schema import ArchiveResult as LegacyArchiveResult info = {k: v 
for k, v in link._asdict().items() if k in Snapshot.keys} tags = info.pop("tags") if tags is None: diff --git a/archivebox/logging_util.py b/archivebox/logging_util.py index d07ec6e4..ab487c04 100644 --- a/archivebox/logging_util.py +++ b/archivebox/logging_util.py @@ -68,7 +68,12 @@ def get_fd_info(fd) -> Dict[str, Any]: IS_TERMINAL = not (IS_PIPE or IS_FILE) IS_LINE_BUFFERED = fd.line_buffering IS_READABLE = fd.readable() - return {key: val for key, val in locals().items() if val is not fd} + return { + 'NAME': NAME, 'FILENO': FILENO, 'MODE': MODE, + 'IS_TTY': IS_TTY, 'IS_PIPE': IS_PIPE, 'IS_FILE': IS_FILE, + 'IS_TERMINAL': IS_TERMINAL, 'IS_LINE_BUFFERED': IS_LINE_BUFFERED, + 'IS_READABLE': IS_READABLE, + } # # Log debug information about stdin, stdout, and stderr From 265bcc0264eb0414226eeabcc34a104de935bfaf Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Tue, 16 Feb 2021 16:29:41 -0500 Subject: [PATCH 074/137] fix lint errors2 --- archivebox/core/settings.py | 2 +- archivebox/index/schema.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/archivebox/core/settings.py b/archivebox/core/settings.py index 9c716c16..746d6dbd 100644 --- a/archivebox/core/settings.py +++ b/archivebox/core/settings.py @@ -71,7 +71,7 @@ AUTHENTICATION_BACKENDS = [ DEBUG_TOOLBAR = False if DEBUG: try: - import debug_toolbar + import debug_toolbar # noqa DEBUG_TOOLBAR = True except ImportError: pass diff --git a/archivebox/index/schema.py b/archivebox/index/schema.py index 889d74b1..00831e19 100644 --- a/archivebox/index/schema.py +++ b/archivebox/index/schema.py @@ -178,7 +178,6 @@ class Link: raise def _asdict(self, extended=False): - from core.models import Snapshot info = { 'schema': 'Link', 'url': self.url, From 084cf7ff51aed0350f767b42777439aae52fc423 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Wed, 17 Feb 2021 13:34:46 -0500 Subject: [PATCH 075/137] add more explanation about snapshot.save timestamp bump --- archivebox/extractors/__init__.py | 9 
++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/archivebox/extractors/__init__.py b/archivebox/extractors/__init__.py index 8d924415..09b56c66 100644 --- a/archivebox/extractors/__init__.py +++ b/archivebox/extractors/__init__.py @@ -114,7 +114,14 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s write_search_index(link=link, texts=result.index_texts) ArchiveResult.objects.create(snapshot=snapshot, extractor=method_name, cmd=result.cmd, cmd_version=result.cmd_version, output=result.output, pwd=result.pwd, start_ts=result.start_ts, end_ts=result.end_ts, status=result.status) - snapshot.save() # bump the updated time + + + # bump the updated time on the main Snapshot here, this is critical + # to be able to cache summaries of the ArchiveResults for a given + # snapshot without having to load all the results from the DB each time. + # (we use {Snapshot.id}-{Snapshot.updated} as the cache key and assume + # ArchiveResults are unchanged as long as the updated timestamp is unchanged) + snapshot.save() else: # print('{black} X {}{reset}'.format(method_name, **ANSI)) stats['skipped'] += 1 From 05dbb1c160021ccb03487536d336ad9b48be0e08 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Wed, 17 Feb 2021 18:24:38 -0500 Subject: [PATCH 076/137] add healthcheck to dockerfile --- Dockerfile | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Dockerfile b/Dockerfile index 26dd42bf..a31d36a0 100644 --- a/Dockerfile +++ b/Dockerfile @@ -117,5 +117,8 @@ RUN /app/bin/docker_entrypoint.sh archivebox version VOLUME "$DATA_DIR" EXPOSE 8000 +HEALTHCHECK --interval=30s --timeout=20s --retries=15 \ + CMD curl --silent 'http://localhost:8000/admin/login/' || exit 1 + ENTRYPOINT ["dumb-init", "--", "/app/bin/docker_entrypoint.sh"] CMD ["archivebox", "server", "0.0.0.0:8000"] From 3457773988904397f5b5b3bd26bd98ec436ea242 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Wed, 17 Feb 2021 18:25:01 -0500 Subject: [PATCH 077/137] fix 
ArchiveResult extractor showing up on top of dropdown in admin inline form --- archivebox/core/models.py | 2 +- archivebox/templates/static/admin.css | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/archivebox/core/models.py b/archivebox/core/models.py index aa5dc951..3c42dfa8 100644 --- a/archivebox/core/models.py +++ b/archivebox/core/models.py @@ -209,6 +209,7 @@ class ArchiveResult(models.Model): uuid = models.UUIDField(default=uuid.uuid4, editable=False) snapshot = models.ForeignKey(Snapshot, on_delete=models.CASCADE) + extractor = models.CharField(choices=EXTRACTORS, max_length=32) cmd = JSONField() pwd = models.CharField(max_length=256) cmd_version = models.CharField(max_length=128, default=None, null=True, blank=True) @@ -216,7 +217,6 @@ class ArchiveResult(models.Model): start_ts = models.DateTimeField(db_index=True) end_ts = models.DateTimeField() status = models.CharField(max_length=16, choices=STATUS_CHOICES) - extractor = models.CharField(choices=EXTRACTORS, max_length=32) objects = ArchiveResultManager() diff --git a/archivebox/templates/static/admin.css b/archivebox/templates/static/admin.css index d8673dc7..e94e0416 100644 --- a/archivebox/templates/static/admin.css +++ b/archivebox/templates/static/admin.css @@ -260,3 +260,7 @@ body.model-snapshot.change-list #content .object-tools { #result_list tbody td.field-status { font-variant: small-caps; } + +.inline-group .tabular td.original p { + margin-top: -33px; +} From a8a6752b0637cd7983e792b31b6af964e049c71b Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Wed, 17 Feb 2021 18:25:23 -0500 Subject: [PATCH 078/137] add CACHE_BACKEND options to settings for easier dev --- archivebox/core/settings.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/archivebox/core/settings.py b/archivebox/core/settings.py index 746d6dbd..2879c704 100644 --- a/archivebox/core/settings.py +++ b/archivebox/core/settings.py @@ -150,12 +150,16 @@ DATABASES = { } } -# 
CACHES = { -# 'default': { -# 'BACKEND': 'django.core.cache.backends.db.DatabaseCache', -# 'LOCATION': 'cache_default', -# } -# } +CACHE_BACKEND = 'django.core.cache.backends.locmem.LocMemCache' +# CACHE_BACKEND = 'django.core.cache.backends.db.DatabaseCache' +# CACHE_BACKEND = 'django.core.cache.backends.dummy.DummyCache' + +CACHES = { + 'default': { + 'BACKEND': CACHE_BACKEND, + 'LOCATION': 'django_cache_default', + } +} EMAIL_BACKEND = 'django.core.mail.backends.console.EmailBackend' From d60d73754d313144394c02a3a256311081a459f6 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Wed, 17 Feb 2021 18:25:47 -0500 Subject: [PATCH 079/137] fix favicon.ico and robots.txt served by runserver --- archivebox/core/urls.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/archivebox/core/urls.py b/archivebox/core/urls.py index 15ff24c7..d955f9f8 100644 --- a/archivebox/core/urls.py +++ b/archivebox/core/urls.py @@ -14,13 +14,13 @@ from core.views import HomepageView, SnapshotView, PublicIndexView, AddView urlpatterns = [ path('public/', PublicIndexView.as_view(), name='public-index'), - path('robots.txt', static.serve, {'document_root': settings.OUTPUT_DIR, 'path': 'robots.txt'}), - path('favicon.ico', static.serve, {'document_root': settings.OUTPUT_DIR, 'path': 'favicon.ico'}), + path('robots.txt', static.serve, {'document_root': settings.STATICFILES_DIRS[0], 'path': 'robots.txt'}), + path('favicon.ico', static.serve, {'document_root': settings.STATICFILES_DIRS[0], 'path': 'favicon.ico'}), path('docs/', RedirectView.as_view(url='https://github.com/ArchiveBox/ArchiveBox/wiki'), name='Docs'), path('archive/', RedirectView.as_view(url='/')), - path('archive/', SnapshotView.as_view(), name='Snapshot'), + path('archive/', SnapshotView.as_view(), name='snapshot'), path('admin/core/snapshot/add/', RedirectView.as_view(url='/add/')), path('add/', AddView.as_view(), name='add'), From 7b7aa239fd262764408faae4c3aff09a10738241 Mon Sep 17 00:00:00 2001 From: Nick 
Sweeting Date: Wed, 17 Feb 2021 18:26:04 -0500 Subject: [PATCH 080/137] autocreate db cache table when needed and reenable WAL mode --- archivebox/config.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/archivebox/config.py b/archivebox/config.py index 07fe4a4b..edbafc0f 100644 --- a/archivebox/config.py +++ b/archivebox/config.py @@ -1080,16 +1080,16 @@ def setup_django(out_dir: Path=None, check_db=False, config: ConfigDict=CONFIG, django.setup() # Enable WAL mode in sqlite3 - # from django.db import connection - # with connection.cursor() as cursor: - # cursor.execute("PRAGMA journal_mode=wal;") + from django.db import connection + with connection.cursor() as cursor: + cursor.execute("PRAGMA journal_mode=wal;") - # Create cache table in DB - # try: - # from django.core.cache import cache - # cache.get('test', None) - # except django.db.utils.OperationalError: - # call_command("createcachetable", verbosity=0) + # Create cache table in DB if needed + try: + from django.core.cache import cache + cache.get('test', None) + except django.db.utils.OperationalError: + call_command("createcachetable", verbosity=0) from django.conf import settings From c877d67e0cabace8578feeb77103c0d572eee2ed Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Thu, 18 Feb 2021 02:31:42 -0500 Subject: [PATCH 081/137] check if pragma is already wal before setting --- archivebox/config.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/archivebox/config.py b/archivebox/config.py index edbafc0f..33c92a38 100644 --- a/archivebox/config.py +++ b/archivebox/config.py @@ -1082,7 +1082,9 @@ def setup_django(out_dir: Path=None, check_db=False, config: ConfigDict=CONFIG, # Enable WAL mode in sqlite3 from django.db import connection with connection.cursor() as cursor: - cursor.execute("PRAGMA journal_mode=wal;") + current_mode = cursor.execute("PRAGMA journal_mode") + if current_mode != 'wal': + cursor.execute("PRAGMA journal_mode=wal;") # 
Create cache table in DB if needed try: From 4e5671dda92dd285190f788df473b4d34d957e05 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Thu, 18 Feb 2021 02:32:58 -0500 Subject: [PATCH 082/137] add ability to add and remove tags directly from snapshot list --- archivebox/core/admin.py | 149 +++++++++++++----- .../templates/admin/actions_as_select.html | 1 - archivebox/templates/admin/base.html | 8 +- 3 files changed, 114 insertions(+), 44 deletions(-) diff --git a/archivebox/core/admin.py b/archivebox/core/admin.py index 84c1d25a..d51698eb 100644 --- a/archivebox/core/admin.py +++ b/archivebox/core/admin.py @@ -37,38 +37,6 @@ from extractors import archive_links # TODO: https://stackoverflow.com/questions/40760880/add-custom-button-to-django-admin-panel -def update_snapshots(modeladmin, request, queryset): - archive_links([ - snapshot.as_link() - for snapshot in queryset - ], out_dir=OUTPUT_DIR) -update_snapshots.short_description = "Archive" - -def update_titles(modeladmin, request, queryset): - archive_links([ - snapshot.as_link() - for snapshot in queryset - ], overwrite=True, methods=('title','favicon'), out_dir=OUTPUT_DIR) -update_titles.short_description = "Pull title" - -def overwrite_snapshots(modeladmin, request, queryset): - archive_links([ - snapshot.as_link() - for snapshot in queryset - ], overwrite=True, out_dir=OUTPUT_DIR) -overwrite_snapshots.short_description = "Re-archive (overwrite)" - -def verify_snapshots(modeladmin, request, queryset): - for snapshot in queryset: - print(snapshot.timestamp, snapshot.url, snapshot.is_archived, snapshot.archive_size, len(snapshot.history)) - -verify_snapshots.short_description = "Check" - -def delete_snapshots(modeladmin, request, queryset): - remove(snapshots=queryset, yes=True, delete=True, out_dir=OUTPUT_DIR) - -delete_snapshots.short_description = "Delete" - class SnapshotAdminForm(forms.ModelForm): tags = TagField(required=False) @@ -98,6 +66,13 @@ class SnapshotAdminForm(forms.ModelForm): class 
ArchiveResultInline(admin.TabularInline): model = ArchiveResult +from django.contrib.admin.helpers import ActionForm + + +class SnapshotActionForm(ActionForm): + tag = forms.ModelChoiceField(queryset=Tag.objects.all(), required=False) + # pass + class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin): list_display = ('added', 'title_str', 'url_str', 'files', 'size') @@ -107,12 +82,13 @@ class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin): fields = (*readonly_fields, 'timestamp', 'url', 'title', 'tags') list_filter = ('added', 'updated', 'tags') ordering = ['-added'] - actions = [delete_snapshots, overwrite_snapshots, update_snapshots, update_titles, verify_snapshots] - actions_template = 'admin/actions_as_select.html' + actions = ['delete_snapshots', 'overwrite_snapshots', 'update_snapshots', 'update_titles', 'verify_snapshots', 'add_tag', 'remove_tag'] form = SnapshotAdminForm list_per_page = SNAPSHOTS_PER_PAGE inlines = [ArchiveResultInline] + action_form = SnapshotActionForm + def get_urls(self): urls = super().get_urls() custom_urls = [ @@ -121,11 +97,34 @@ class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin): return custom_urls + urls def get_queryset(self, request): + self.request = request return super().get_queryset(request).prefetch_related('tags') def tag_list(self, obj): return ', '.join(obj.tags.values_list('name', flat=True)) + # TODO: figure out a different way to do this, you can't nest forms so this doesn't work + # def action(self, obj): + # # csrfmiddlewaretoken: Wa8UcQ4fD3FJibzxqHN3IYrrjLo4VguWynmbzzcPYoebfVUnDovon7GEMYFRgsh0 + # # action: update_snapshots + # # select_across: 0 + # # _selected_action: 76d29b26-2a88-439e-877c-a7cca1b72bb3 + # return format_html( + # ''' + # + # + # + # + # + # + # + # + # + # ''', + # csrf.get_token(self.request), + # obj.id, + # ) + def id_str(self, obj): return format_html( '{}', @@ -200,6 +199,54 @@ class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin): self.list_max_show_all
= saved_list_max_show_all return rendered_response + + + def update_snapshots(modeladmin, request, queryset): + archive_links([ + snapshot.as_link() + for snapshot in queryset + ], out_dir=OUTPUT_DIR) + update_snapshots.short_description = "Archive" + + def update_titles(modeladmin, request, queryset): + archive_links([ + snapshot.as_link() + for snapshot in queryset + ], overwrite=True, methods=('title','favicon'), out_dir=OUTPUT_DIR) + update_titles.short_description = "Pull title" + + def overwrite_snapshots(modeladmin, request, queryset): + archive_links([ + snapshot.as_link() + for snapshot in queryset + ], overwrite=True, out_dir=OUTPUT_DIR) + overwrite_snapshots.short_description = "Re-archive (overwrite)" + + def verify_snapshots(modeladmin, request, queryset): + for snapshot in queryset: + print(snapshot.timestamp, snapshot.url, snapshot.is_archived, snapshot.archive_size, len(snapshot.history)) + + verify_snapshots.short_description = "Check" + + def delete_snapshots(modeladmin, request, queryset): + remove(snapshots=queryset, yes=True, delete=True, out_dir=OUTPUT_DIR) + + delete_snapshots.short_description = "Delete" + + def add_tag(modeladmin, request, queryset): + tag = request.POST['tag'] + for obj in queryset: + obj.tags.add(tag) + + add_tag.short_description = "Add tag" + + def remove_tag(modeladmin, request, queryset): + tag = request.POST['tag'] + for obj in queryset: + obj.tags.remove(tag) + + remove_tag.short_description = "Remove tag" + id_str.short_description = 'ID' @@ -210,25 +257,49 @@ class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin): title_str.admin_order_field = 'title' url_str.admin_order_field = 'url' + + class TagAdmin(admin.ModelAdmin): - list_display = ('slug', 'name', 'id') + list_display = ('slug', 'name', 'num_snapshots', 'snapshots', 'id') sort_fields = ('id', 'name', 'slug') - readonly_fields = ('id',) + readonly_fields = ('id', 'num_snapshots', 'snapshots') search_fields = ('id', 'name', 'slug') fields = 
(*readonly_fields, 'name', 'slug') + actions = ['delete_selected'] + + def num_snapshots(self, obj): + return format_html( + '{} total', + obj.id, + obj.snapshot_set.count(), + ) + + def snapshots(self, obj): + total_count = obj.snapshot_set.count() + return mark_safe('
'.join( + format_html( + '{} [{}] {}', + snap.updated.strftime('%Y-%m-%d %H:%M'), + snap.id, + snap.timestamp, + snap.url, + ) + for snap in obj.snapshot_set.order_by('-updated')[:10] + ) + (f'
and {total_count-10} more...' if obj.snapshot_set.count() > 10 else '')) + return + class ArchiveResultAdmin(admin.ModelAdmin): list_display = ('id', 'start_ts', 'extractor', 'snapshot_str', 'cmd_str', 'status', 'output_str') sort_fields = ('start_ts', 'extractor', 'status') readonly_fields = ('id', 'uuid', 'snapshot_str') search_fields = ('id', 'uuid', 'snapshot__url', 'extractor', 'output', 'cmd_version', 'cmd', 'snapshot__timestamp') - fields = (*readonly_fields, 'snapshot', 'extractor', 'status', 'start_ts', 'end_ts', 'pwd', 'cmd', 'cmd_version', 'output') + fields = (*readonly_fields, 'snapshot', 'snapshot__tags', 'extractor', 'status', 'start_ts', 'end_ts', 'pwd', 'cmd', 'cmd_version', 'output') list_filter = ('status', 'extractor', 'start_ts', 'cmd_version') ordering = ['-start_ts'] list_per_page = SNAPSHOTS_PER_PAGE - def snapshot_str(self, obj): return format_html( '[{}]
' @@ -246,7 +317,7 @@ class ArchiveResultAdmin(admin.ModelAdmin): def output_str(self, obj): return format_html( - '
{}
', + '↗️
{}
', obj.snapshot.timestamp, obj.output if (obj.status == 'succeeded') and obj.extractor not in ('title', 'archive_org') else 'index.html', obj.output, diff --git a/archivebox/templates/admin/actions_as_select.html b/archivebox/templates/admin/actions_as_select.html index 86a77190..e69de29b 100644 --- a/archivebox/templates/admin/actions_as_select.html +++ b/archivebox/templates/admin/actions_as_select.html @@ -1 +0,0 @@ -actions_as_select diff --git a/archivebox/templates/admin/base.html b/archivebox/templates/admin/base.html index dc71418a..a3d21ba9 100644 --- a/archivebox/templates/admin/base.html +++ b/archivebox/templates/admin/base.html @@ -157,15 +157,15 @@ function fix_actions() { var container = $('div.actions'); - if (container.find('option').length < 10) { - container.find('label, button').hide(); + if (container.find('select[name=action] option').length < 10) { + container.find('label:nth-child(1), button[value=0]').hide(); var buttons = $('
') - .prependTo(container) + .appendTo(container) .css('display', 'inline') .addClass('class', 'action-buttons'); - container.find('option:gt(0)').reverse().each(function () { + container.find('select[name=action] option:gt(0)').reverse().each(function () { const name = this.value $('
{{link.url}}{{link.url|truncatechars:128}}