From 188670eb8be643ed7d38d4db32a2d8fe1eb99b4e Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 15 Feb 2021 13:38:32 -0500 Subject: [PATCH 001/112] disable sonic by default in docker-compose and add instructions --- docker-compose.yml | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index 0b4cad24..37c92cb4 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -20,27 +20,27 @@ services: - 8000:8000 environment: - USE_COLOR=True - - SHOW_PROGRESS=False - - SEARCH_BACKEND_ENGINE=sonic - - SEARCH_BACKEND_HOST_NAME=sonic - - SEARCH_BACKEND_PASSWORD=SecretPassword + # - SEARCH_BACKEND_ENGINE=sonic + # - SEARCH_BACKEND_HOST_NAME=sonic + # - SEARCH_BACKEND_PASSWORD=SecretPassword volumes: - ./data:/data - depends_on: - - sonic - # Run sonic search backend - sonic: - image: valeriansaliou/sonic:v1.3.0 - ports: - - 1491:1491 - environment: - - SEARCH_BACKEND_PASSWORD=SecretPassword - volumes: - - ./etc/sonic/config.cfg:/etc/sonic.cfg - - ./data:/var/lib/sonic/store/ + # To run the Sonic full-text search backend, create an ./etc/sonic folder + # and download the sonic config file from here into that folder: + # https://raw.githubusercontent.com/ArchiveBox/ArchiveBox/master/etc/sonic/config.cfg + # sonic: + # image: valeriansaliou/sonic:v1.3.0 + # expose: + # - 1491 + # environment: + # - SEARCH_BACKEND_PASSWORD=SecretPassword + # volumes: + # - ./etc/sonic/config.cfg:/etc/sonic.cfg + # - ./data/sonic:/var/lib/sonic/store - # Optional Addons: tweak these examples as needed for your specific use case + + ### Optional Addons: tweak these examples as needed for your specific use case # Example: Run scheduled imports in a docker instead of using cron on the # host machine, add tasks and see more info with archivebox schedule --help From 611216765d7e0006bff9431f900f8571d50c037c Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 15 Feb 2021 13:39:03 -0500 Subject: [PATCH 002/112] switch sqlite to use WAL mode by default to prevent database locked errors --- archivebox/core/settings.py | 3 +++ archivebox/main.py | 2 ++ 2 files changed, 5 insertions(+) diff --git a/archivebox/core/settings.py b/archivebox/core/settings.py index e73c93d9..853a4863 100644 --- a/archivebox/core/settings.py +++ b/archivebox/core/settings.py @@ -107,6 +107,9 @@ DATABASES = { 'default': { 'ENGINE': 'django.db.backends.sqlite3', 'NAME': DATABASE_NAME, + 'OPTIONS': { + 'init_command': 'PRAGMA journal_mode=wal;', + } } } diff --git a/archivebox/main.py b/archivebox/main.py index c1751528..a1e58619 100644 --- a/archivebox/main.py +++ b/archivebox/main.py @@ -125,10 +125,12 @@ ALLOWED_IN_OUTPUT_DIR = { 'node_modules', 'package-lock.json', 'static', + 'sonic', ARCHIVE_DIR_NAME, SOURCES_DIR_NAME, LOGS_DIR_NAME, SQL_INDEX_FILENAME, + f'{SQL_INDEX_FILENAME}-wal', JSON_INDEX_FILENAME, HTML_INDEX_FILENAME, ROBOTS_TXT_FILENAME, From 128e7f0e8d542b1d91260cef74c4413c12c356ef Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 15 Feb 2021 13:39:37 -0500 Subject: [PATCH 003/112] dont show progress bars in docker by default --- docker-compose.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/docker-compose.yml b/docker-compose.yml index 37c92cb4..96b90a35 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -20,6 +20,7 @@ services: - 8000:8000 environment: - USE_COLOR=True + - SHOW_PROGRESS=False # - SEARCH_BACKEND_ENGINE=sonic # - SEARCH_BACKEND_HOST_NAME=sonic # - SEARCH_BACKEND_PASSWORD=SecretPassword From 0407d03b6bcbca92a0e99470b66ce89e3d61b288 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 15 Feb 2021 13:39:49 -0500 Subject: [PATCH 004/112] add cli tests file back --- archivebox/cli/tests.py | 227 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 227 insertions(+) create mode 100644 archivebox/cli/tests.py diff --git a/archivebox/cli/tests.py b/archivebox/cli/tests.py new file mode 100644 index 00000000..4d7016aa --- /dev/null +++ b/archivebox/cli/tests.py @@ -0,0 +1,227 @@ +#!/usr/bin/env python3 + +__package__ = 'archivebox.cli' + + +import os +import sys +import shutil +import unittest +from pathlib import Path + +from contextlib import contextmanager + +TEST_CONFIG = { + 'USE_COLOR': 'False', + 'SHOW_PROGRESS': 'False', + + 'OUTPUT_DIR': 'data.tests', + + 'SAVE_ARCHIVE_DOT_ORG': 'False', + 'SAVE_TITLE': 'False', + + 'USE_CURL': 'False', + 'USE_WGET': 'False', + 'USE_GIT': 'False', + 'USE_CHROME': 'False', + 'USE_YOUTUBEDL': 'False', +} + +OUTPUT_DIR = 'data.tests' +os.environ.update(TEST_CONFIG) + +from ..main import init +from ..index import load_main_index +from ..config import ( + SQL_INDEX_FILENAME, + JSON_INDEX_FILENAME, + HTML_INDEX_FILENAME, +) + +from . import ( + archivebox_init, + archivebox_add, + archivebox_remove, +) + +HIDE_CLI_OUTPUT = True + +test_urls = ''' +https://example1.com/what/is/happening.html?what=1#how-about-this=1 +https://example2.com/what/is/happening/?what=1#how-about-this=1 +HTtpS://example3.com/what/is/happening/?what=1#how-about-this=1f +https://example4.com/what/is/happening.html +https://example5.com/ +https://example6.com + +http://example7.com +[https://example8.com/what/is/this.php?what=1] +[and http://example9.com?what=1&other=3#and-thing=2] +https://example10.com#and-thing=2 " +abcdef +sdflkf[what](https://subb.example12.com/who/what.php?whoami=1#whatami=2)?am=hi +example13.bada +and example14.badb +htt://example15.badc +''' + +stdout = sys.stdout +stderr = sys.stderr + + +@contextmanager +def output_hidden(show_failing=True): + if not HIDE_CLI_OUTPUT: + yield + return + + sys.stdout = open('stdout.txt', 'w+') + sys.stderr = open('stderr.txt', 'w+') + try: + yield + sys.stdout.close() + sys.stderr.close() + sys.stdout = stdout + sys.stderr = stderr + except: + sys.stdout.close() + sys.stderr.close() + sys.stdout = stdout + sys.stderr = stderr + if show_failing: + with open('stdout.txt', 'r') as f: + print(f.read()) + with open('stderr.txt', 'r') as f: + print(f.read()) + raise + finally: + os.remove('stdout.txt') + os.remove('stderr.txt') + + +class TestInit(unittest.TestCase): + def setUp(self): + os.makedirs(OUTPUT_DIR, exist_ok=True) + + def tearDown(self): + shutil.rmtree(OUTPUT_DIR, ignore_errors=True) + + def test_basic_init(self): + with output_hidden(): + archivebox_init.main([]) + + assert (Path(OUTPUT_DIR) / SQL_INDEX_FILENAME).exists() + assert (Path(OUTPUT_DIR) / JSON_INDEX_FILENAME).exists() + assert (Path(OUTPUT_DIR) / HTML_INDEX_FILENAME).exists() + assert len(load_main_index(out_dir=OUTPUT_DIR)) == 0 + + def test_conflicting_init(self): + with open(Path(OUTPUT_DIR) / 'test_conflict.txt', 'w+') as f: + f.write('test') + + try: + with output_hidden(show_failing=False): + archivebox_init.main([]) + assert False, 'Init should have exited with an exception' + except SystemExit: + pass + + assert not (Path(OUTPUT_DIR) / SQL_INDEX_FILENAME).exists() + assert not (Path(OUTPUT_DIR) / JSON_INDEX_FILENAME).exists() + assert not (Path(OUTPUT_DIR) / HTML_INDEX_FILENAME).exists() + try: + load_main_index(out_dir=OUTPUT_DIR) + assert False, 'load_main_index should raise an exception when no index is present' + except: + pass + + def test_no_dirty_state(self): + with output_hidden(): + init() + shutil.rmtree(OUTPUT_DIR, ignore_errors=True) + with output_hidden(): + init() + + +class TestAdd(unittest.TestCase): + def setUp(self): + os.makedirs(OUTPUT_DIR, exist_ok=True) + with output_hidden(): + init() + + def tearDown(self): + shutil.rmtree(OUTPUT_DIR, ignore_errors=True) + + def test_add_arg_url(self): + with output_hidden(): + archivebox_add.main(['https://getpocket.com/users/nikisweeting/feed/all']) + + all_links = load_main_index(out_dir=OUTPUT_DIR) + assert len(all_links) == 30 + + def test_add_arg_file(self): + test_file = Path(OUTPUT_DIR) / 'test.txt' + with open(test_file, 'w+') as f: + f.write(test_urls) + + with output_hidden(): + archivebox_add.main([test_file]) + + all_links = load_main_index(out_dir=OUTPUT_DIR) + assert len(all_links) == 12 + os.remove(test_file) + + def test_add_stdin_url(self): + with output_hidden(): + archivebox_add.main([], stdin=test_urls) + + all_links = load_main_index(out_dir=OUTPUT_DIR) + assert len(all_links) == 12 + + +class TestRemove(unittest.TestCase): + def setUp(self): + os.makedirs(OUTPUT_DIR, exist_ok=True) + with output_hidden(): + init() + archivebox_add.main([], stdin=test_urls) + + # def tearDown(self): + # shutil.rmtree(OUTPUT_DIR, ignore_errors=True) + + + def test_remove_exact(self): + with output_hidden(): + archivebox_remove.main(['--yes', '--delete', 'https://example5.com/']) + + all_links = load_main_index(out_dir=OUTPUT_DIR) + assert len(all_links) == 11 + + def test_remove_regex(self): + with output_hidden(): + archivebox_remove.main(['--yes', '--delete', '--filter-type=regex', r'http(s)?:\/\/(.+\.)?(example\d\.com)']) + + all_links = load_main_index(out_dir=OUTPUT_DIR) + assert len(all_links) == 4 + + def test_remove_domain(self): + with output_hidden(): + archivebox_remove.main(['--yes', '--delete', '--filter-type=domain', 'example5.com', 'example6.com']) + + all_links = load_main_index(out_dir=OUTPUT_DIR) + assert len(all_links) == 10 + + def test_remove_none(self): + try: + with output_hidden(show_failing=False): + archivebox_remove.main(['--yes', '--delete', 'https://doesntexist.com']) + assert False, 'Should raise if no URLs match' + except: + pass + + +if __name__ == '__main__': + if '--verbose' in sys.argv or '-v' in sys.argv: + HIDE_CLI_OUTPUT = False + + unittest.main() From e61e12c889c937e3fc29ef95fab8cfd2512fc71a Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 15 Feb 2021 14:51:32 -0500 Subject: [PATCH 005/112] use setup.py to determine dependencies in Dockerfile instead of egg-info requires.txt --- Dockerfile | 8 ++-- archivebox/config.py | 5 +++ archivebox/core/settings.py | 3 -- setup.py | 88 ++++++++++++++++++++----------------- 4 files changed, 57 insertions(+), 47 deletions(-) diff --git a/Dockerfile b/Dockerfile index 8cf2da30..bb750721 100644 --- a/Dockerfile +++ b/Dockerfile @@ -79,13 +79,13 @@ WORKDIR "$CODE_DIR" ENV PATH="${PATH}:$VENV_PATH/bin" RUN python -m venv --clear --symlinks "$VENV_PATH" \ && pip install --upgrade --quiet pip setuptools -ADD ./pip_dist/archivebox.egg-info/requires.txt "$CODE_DIR/pip_dist/archivebox.egg-info/requires.txt" +ADD "./setup.py" "$CODE_DIR/" +ADD "./README.md" "./package.json" "$CODE_DIR/archivebox/" RUN apt-get update -qq \ && apt-get install -qq -y --no-install-recommends \ build-essential python-dev python3-dev \ - # && pip install --upgrade pip \ - && grep -B 1000 -E '^$' "$CODE_DIR/pip_dist/archivebox.egg-info/requires.txt" | pip install --quiet -r /dev/stdin \ - && pip install --quiet "sonic-client==0.0.5" \ + && python3 -c 'from distutils.core import run_setup; result = run_setup("./setup.py", stop_after="init"); print("\n".join(result.install_requires + result.extras_require["sonic"]))' > /tmp/requirements.txt \ + && pip install --quiet -r /tmp/requirements.txt \ && apt-get purge -y build-essential python-dev python3-dev \ && apt-get autoremove -y \ && rm -rf /var/lib/apt/lists/* diff --git a/archivebox/config.py b/archivebox/config.py index 3d48344f..b8acb3f7 100644 --- a/archivebox/config.py +++ b/archivebox/config.py @@ -1075,6 +1075,11 @@ def setup_django(out_dir: Path=None, check_db=False, config: ConfigDict=CONFIG, call_command("migrate", interactive=False, verbosity=0) else: django.setup() + + # Enable WAL mode in sqlite3 + from django.db import connection + with connection.cursor() as cursor: + cursor.execute("PRAGMA journal_mode=wal;") if check_db: sql_index_path = Path(output_dir) / SQL_INDEX_FILENAME diff --git a/archivebox/core/settings.py b/archivebox/core/settings.py index 853a4863..e73c93d9 100644 --- a/archivebox/core/settings.py +++ b/archivebox/core/settings.py @@ -107,9 +107,6 @@ DATABASES = { 'default': { 'ENGINE': 'django.db.backends.sqlite3', 'NAME': DATABASE_NAME, - 'OPTIONS': { - 'init_command': 'PRAGMA journal_mode=wal;', - } } } diff --git a/setup.py b/setup.py index 962db8d8..3ab4f238 100755 --- a/setup.py +++ b/setup.py @@ -27,6 +27,47 @@ PACKAGE_DIR = ROOT_DIR / PKG_NAME README = (PACKAGE_DIR / "README.md").read_text(encoding='utf-8', errors='ignore') VERSION = json.loads((PACKAGE_DIR / "package.json").read_text().strip())['version'] +PYTHON_REQUIRES = ">=3.7" +SETUP_REQUIRES = ["wheel"] +INSTALL_REQUIRES = [ + # only add things here that have corresponding apt python3-packages available + # anything added here also needs to be added to our package dependencies in + # stdeb.cfg (apt), archivebox.rb (brew), Dockerfile, etc. + # if there is no apt python3-package equivalent, then vendor it instead in + # ./archivebox/vendor/ + "requests>=2.24.0", + "atomicwrites>=1.4.0", + "mypy-extensions>=0.4.3", + "django>=3.1.3", + "django-extensions>=3.0.3", + "dateparser", + "ipython", + "youtube-dl", + "python-crontab>=2.5.1", + "croniter>=0.3.34", + "w3lib>=1.22.0", +] +EXTRAS_REQUIRE = { + 'sonic': [ + "sonic-client>=0.0.5", + ], + 'dev': [ + "setuptools", + "twine", + "wheel", + "flake8", + "ipdb", + "mypy", + "django-stubs", + "sphinx", + "sphinx-rtd-theme", + "recommonmark", + "pytest", + "bottle", + "stdeb", + ], +} + # To see when setup.py gets called (uncomment for debugging): # import sys # print(PACKAGE_DIR, f" (v{VERSION})") @@ -36,7 +77,9 @@ VERSION = json.loads((PACKAGE_DIR / "package.json").read_text().strip())['versio class DisabledTestCommand(test): def run(self): # setup.py test is deprecated, disable it here by force so stdeb doesnt run it - print('Use the ./bin/test.sh script to run tests, not setup.py test.') + print() + print('[X] Running tests via setup.py test is deprecated.') + print(' Hint: Use the ./bin/test.sh script or pytest instead') setuptools.setup( @@ -50,45 +93,10 @@ setuptools.setup( long_description_content_type="text/markdown", url=REPO_URL, project_urls=PROJECT_URLS, - python_requires=">=3.7", - setup_requires=[ - "wheel", - ], - install_requires=[ - # only add things here that have corresponding apt python3-packages available - # anything added here also needs to be added to our package dependencies in - # stdeb.cfg (apt), archivebox.rb (brew), Dockerfile, etc. - # if there is no apt python3-package equivalent, then vendor it instead in - # ./archivebox/vendor/ - "requests==2.24.0", - "atomicwrites==1.4.0", - "mypy-extensions==0.4.3", - "django==3.1.3", - "django-extensions==3.0.3", - "dateparser", - "ipython", - "youtube-dl", - "python-crontab==2.5.1", - "croniter==0.3.34", - "w3lib==1.22.0", - ], - extras_require={ - 'dev': [ - "setuptools", - "twine", - "wheel", - "flake8", - "ipdb", - "mypy", - "django-stubs", - "sphinx", - "sphinx-rtd-theme", - "recommonmark", - "pytest", - "bottle", - "stdeb", - ], - }, + python_requires=PYTHON_REQUIRES, + setup_requires=SETUP_REQUIRES, + install_requires=INSTALL_REQUIRES, + extras_require=EXTRAS_REQUIRE, packages=[PKG_NAME], include_package_data=True, # see MANIFEST.in entry_points={ From 3c3bae02d2937a9824d74f0e1db766f657ba7996 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 15 Feb 2021 14:52:10 -0500 Subject: [PATCH 006/112] add quick-init option to skip reimporting all snapshot dirs on init --- archivebox/cli/archivebox_init.py | 6 +++ archivebox/cli/archivebox_server.py | 8 ++- archivebox/main.py | 82 +++++++++++++++-------------- docker-compose.yml | 2 +- 4 files changed, 56 insertions(+), 42 deletions(-) diff --git a/archivebox/cli/archivebox_init.py b/archivebox/cli/archivebox_init.py index 6255ef26..5753269c 100755 --- a/archivebox/cli/archivebox_init.py +++ b/archivebox/cli/archivebox_init.py @@ -27,11 +27,17 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional action='store_true', help='Ignore unrecognized files in current directory and initialize anyway', ) + parser.add_argument( + '--quick', '-q', + action='store_true', + help='Run any updates or migrations without rechecking all snapshot dirs', + ) command = parser.parse_args(args or ()) reject_stdin(__command__, stdin) init( force=command.force, + quick=command.quick, out_dir=pwd or OUTPUT_DIR, ) diff --git a/archivebox/cli/archivebox_server.py b/archivebox/cli/archivebox_server.py index a4d96dc9..a6ec987e 100644 --- a/archivebox/cli/archivebox_server.py +++ b/archivebox/cli/archivebox_server.py @@ -41,7 +41,12 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional parser.add_argument( '--init', action='store_true', - help='Run archivebox init before starting the server', + help='Run a full archivebox init/upgrade before starting the server', + ) + parser.add_argument( + '--quick-init', '-i', + action='store_true', + help='Run quick archivebox init/upgrade before starting the server', ) parser.add_argument( '--createsuperuser', @@ -56,6 +61,7 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional reload=command.reload, debug=command.debug, init=command.init, + quick_init=command.quick_init, createsuperuser=command.createsuperuser, out_dir=pwd or OUTPUT_DIR, ) diff --git a/archivebox/main.py b/archivebox/main.py index a1e58619..bad93706 100644 --- a/archivebox/main.py +++ b/archivebox/main.py @@ -263,7 +263,7 @@ def run(subcommand: str, @enforce_types -def init(force: bool=False, out_dir: Path=OUTPUT_DIR) -> None: +def init(force: bool=False, quick: bool=False, out_dir: Path=OUTPUT_DIR) -> None: """Initialize a new ArchiveBox collection in the current directory""" from core.models import Snapshot @@ -345,48 +345,49 @@ def init(force: bool=False, out_dir: Path=OUTPUT_DIR) -> None: all_links = load_main_index(out_dir=out_dir, warn=False) print(' √ Loaded {} links from existing main index.'.format(all_links.count())) - # Links in data folders that dont match their timestamp - fixed, cant_fix = fix_invalid_folder_locations(out_dir=out_dir) - if fixed: - print(' {lightyellow}√ Fixed {} data directory locations that didn\'t match their link timestamps.{reset}'.format(len(fixed), **ANSI)) - if cant_fix: - print(' {lightyellow}! Could not fix {} data directory locations due to conflicts with existing folders.{reset}'.format(len(cant_fix), **ANSI)) + if not quick: + # Links in data folders that dont match their timestamp + fixed, cant_fix = fix_invalid_folder_locations(out_dir=out_dir) + if fixed: + print(' {lightyellow}√ Fixed {} data directory locations that didn\'t match their link timestamps.{reset}'.format(len(fixed), **ANSI)) + if cant_fix: + print(' {lightyellow}! Could not fix {} data directory locations due to conflicts with existing folders.{reset}'.format(len(cant_fix), **ANSI)) - # Links in JSON index but not in main index - orphaned_json_links = { - link.url: link - for link in parse_json_main_index(out_dir) - if not all_links.filter(url=link.url).exists() - } - if orphaned_json_links: - pending_links.update(orphaned_json_links) - print(' {lightyellow}√ Added {} orphaned links from existing JSON index...{reset}'.format(len(orphaned_json_links), **ANSI)) + # Links in JSON index but not in main index + orphaned_json_links = { + link.url: link + for link in parse_json_main_index(out_dir) + if not all_links.filter(url=link.url).exists() + } + if orphaned_json_links: + pending_links.update(orphaned_json_links) + print(' {lightyellow}√ Added {} orphaned links from existing JSON index...{reset}'.format(len(orphaned_json_links), **ANSI)) - # Links in data dir indexes but not in main index - orphaned_data_dir_links = { - link.url: link - for link in parse_json_links_details(out_dir) - if not all_links.filter(url=link.url).exists() - } - if orphaned_data_dir_links: - pending_links.update(orphaned_data_dir_links) - print(' {lightyellow}√ Added {} orphaned links from existing archive directories.{reset}'.format(len(orphaned_data_dir_links), **ANSI)) + # Links in data dir indexes but not in main index + orphaned_data_dir_links = { + link.url: link + for link in parse_json_links_details(out_dir) + if not all_links.filter(url=link.url).exists() + } + if orphaned_data_dir_links: + pending_links.update(orphaned_data_dir_links) + print(' {lightyellow}√ Added {} orphaned links from existing archive directories.{reset}'.format(len(orphaned_data_dir_links), **ANSI)) - # Links in invalid/duplicate data dirs - invalid_folders = { - folder: link - for folder, link in get_invalid_folders(all_links, out_dir=out_dir).items() - } - if invalid_folders: - print(' {lightyellow}! Skipped adding {} invalid link data directories.{reset}'.format(len(invalid_folders), **ANSI)) - print(' X ' + '\n X '.join(f'{folder} {link}' for folder, link in invalid_folders.items())) - print() - print(' {lightred}Hint:{reset} For more information about the link data directories that were skipped, run:'.format(**ANSI)) - print(' archivebox status') - print(' archivebox list --status=invalid') + # Links in invalid/duplicate data dirs + invalid_folders = { + folder: link + for folder, link in get_invalid_folders(all_links, out_dir=out_dir).items() + } + if invalid_folders: + print(' {lightyellow}! Skipped adding {} invalid link data directories.{reset}'.format(len(invalid_folders), **ANSI)) + print(' X ' + '\n X '.join(f'{folder} {link}' for folder, link in invalid_folders.items())) + print() + print(' {lightred}Hint:{reset} For more information about the link data directories that were skipped, run:'.format(**ANSI)) + print(' archivebox status') + print(' archivebox list --status=invalid') - write_main_index(list(pending_links.values()), out_dir=out_dir) + write_main_index(list(pending_links.values()), out_dir=out_dir) print('\n{green}------------------------------------------------------------------{reset}'.format(**ANSI)) if existing_index: @@ -1063,14 +1064,15 @@ def server(runserver_args: Optional[List[str]]=None, reload: bool=False, debug: bool=False, init: bool=False, + quick_init: bool=False, createsuperuser: bool=False, out_dir: Path=OUTPUT_DIR) -> None: """Run the ArchiveBox HTTP server""" runserver_args = runserver_args or [] - if init: - run_subcommand('init', stdin=None, pwd=out_dir) + if init or quick_init: + run_subcommand('init', quick=quick_init, stdin=None, pwd=out_dir) if createsuperuser: run_subcommand('manage', subcommand_args=['createsuperuser'], pwd=out_dir) diff --git a/docker-compose.yml b/docker-compose.yml index 96b90a35..a8fd08a9 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -13,7 +13,7 @@ services: archivebox: # build: . image: ${DOCKER_IMAGE:-archivebox/archivebox:latest} - command: server 0.0.0.0:8000 + command: server --quick-init 0.0.0.0:8000 stdin_open: true tty: true ports: From 00ae1f15a75b0511db47f4c058ed8bb627049421 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 15 Feb 2021 14:52:37 -0500 Subject: [PATCH 007/112] ignore shm db file and config files in archivebox data dir on init --- archivebox/main.py | 4 ++++ docker-compose.yml | 3 ++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/archivebox/main.py b/archivebox/main.py index bad93706..169921fd 100644 --- a/archivebox/main.py +++ b/archivebox/main.py @@ -76,6 +76,7 @@ from .config import ( ARCHIVE_DIR, LOGS_DIR, CONFIG_FILE, + CONFIG_FILENAME, ARCHIVE_DIR_NAME, SOURCES_DIR_NAME, LOGS_DIR_NAME, @@ -131,10 +132,13 @@ ALLOWED_IN_OUTPUT_DIR = { LOGS_DIR_NAME, SQL_INDEX_FILENAME, f'{SQL_INDEX_FILENAME}-wal', + f'{SQL_INDEX_FILENAME}-shm', JSON_INDEX_FILENAME, HTML_INDEX_FILENAME, ROBOTS_TXT_FILENAME, FAVICON_FILENAME, + CONFIG_FILENAME, + f'{CONFIG_FILENAME}.bak', } @enforce_types diff --git a/docker-compose.yml b/docker-compose.yml index a8fd08a9..b6338e0a 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -21,11 +21,12 @@ services: environment: - USE_COLOR=True - SHOW_PROGRESS=False - # - SEARCH_BACKEND_ENGINE=sonic + # - SEARCH_BACKEND_ENGINE=sonic # uncomment these if you enable sonic below # - SEARCH_BACKEND_HOST_NAME=sonic # - SEARCH_BACKEND_PASSWORD=SecretPassword volumes: - ./data:/data + # - ./archivebox:/app/archivebox # for developers working on archivebox # To run the Sonic full-text search backend, create an ./etc/sonic folder # and download the sonic config file from here into that folder: From 3e96871386c72f649c2889b2b57d10da31580ecb Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 15 Feb 2021 14:53:03 -0500 Subject: [PATCH 008/112] add comment explaining commented out lines in docker-compose file --- docker-compose.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker-compose.yml b/docker-compose.yml index b6338e0a..43e0d843 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -11,7 +11,7 @@ version: '3.7' services: archivebox: - # build: . + # build: . # for developers working on archivebox image: ${DOCKER_IMAGE:-archivebox/archivebox:latest} command: server --quick-init 0.0.0.0:8000 stdin_open: true From 9cd4ba38f068869495326693863d03dea5196de5 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 15 Feb 2021 20:42:00 -0500 Subject: [PATCH 009/112] add new SNAPSHOTS_PER_PAGE pagination limit config --- archivebox/config.py | 1 + archivebox/core/admin.py | 4 ++-- archivebox/core/views.py | 3 ++- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/archivebox/config.py b/archivebox/config.py index b8acb3f7..f42668b9 100644 --- a/archivebox/config.py +++ b/archivebox/config.py @@ -77,6 +77,7 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = { 'PUBLIC_SNAPSHOTS': {'type': bool, 'default': True}, 'PUBLIC_ADD_VIEW': {'type': bool, 'default': False}, 'FOOTER_INFO': {'type': str, 'default': 'Content is hosted for personal archiving purposes only. Contact server owner for any takedown requests.'}, + 'SNAPSHOTS_PER_PAGE': {'type': int, 'default': 40}, }, 'ARCHIVE_METHOD_TOGGLES': { diff --git a/archivebox/core/admin.py b/archivebox/core/admin.py index bacc53c0..e959a7d0 100644 --- a/archivebox/core/admin.py +++ b/archivebox/core/admin.py @@ -21,7 +21,7 @@ from core.mixins import SearchResultsAdminMixin from index.html import snapshot_icons from logging_util import printable_filesize from main import add, remove -from config import OUTPUT_DIR +from config import OUTPUT_DIR, SNAPSHOTS_PER_PAGE from extractors import archive_links # Admin URLs @@ -106,7 +106,7 @@ class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin): actions = [delete_snapshots, overwrite_snapshots, update_snapshots, update_titles, verify_snapshots] actions_template = 'admin/actions_as_select.html' form = SnapshotAdminForm - list_per_page = 40 + list_per_page = SNAPSHOTS_PER_PAGE def get_urls(self): urls = super().get_urls() diff --git a/archivebox/core/views.py b/archivebox/core/views.py index 0e19fad6..9c8313f0 100644 --- a/archivebox/core/views.py +++ b/archivebox/core/views.py @@ -22,6 +22,7 @@ from ..config import ( PUBLIC_ADD_VIEW, VERSION, FOOTER_INFO, + SNAPSHOTS_PER_PAGE, ) from main import add from ..util import base_url, ansi_to_html @@ -94,7 +95,7 @@ class SnapshotView(View): class PublicIndexView(ListView): template_name = 'public_index.html' model = Snapshot - paginate_by = 100 + paginate_by = SNAPSHOTS_PER_PAGE ordering = ['title'] def get_context_data(self, **kwargs): From 78463c243a6da6243452c157ca868e4593566378 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 15 Feb 2021 20:42:33 -0500 Subject: [PATCH 010/112] remove unused GIT_SHA config option --- archivebox/config.py | 1 - archivebox/index/html.py | 3 +-- archivebox/index/json.py | 3 +-- 3 files changed, 2 insertions(+), 5 deletions(-) diff --git a/archivebox/config.py b/archivebox/config.py index f42668b9..10a84a48 100644 --- a/archivebox/config.py +++ b/archivebox/config.py @@ -288,7 +288,6 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = { 'ARCHIVEBOX_BINARY': {'default': lambda c: sys.argv[0]}, 'VERSION': {'default': lambda c: json.loads((Path(c['PACKAGE_DIR']) / 'package.json').read_text().strip())['version']}, - 'GIT_SHA': {'default': lambda c: c['VERSION'].split('+')[-1] or 'unknown'}, 'PYTHON_BINARY': {'default': lambda c: sys.executable}, 'PYTHON_ENCODING': {'default': lambda c: sys.stdout.encoding.upper()}, diff --git a/archivebox/index/html.py b/archivebox/index/html.py index ebfe7d78..30922269 100644 --- a/archivebox/index/html.py +++ b/archivebox/index/html.py @@ -20,7 +20,6 @@ from ..util import ( from ..config import ( OUTPUT_DIR, VERSION, - GIT_SHA, FOOTER_INFO, HTML_INDEX_FILENAME, SAVE_ARCHIVE_DOT_ORG, @@ -60,7 +59,7 @@ def main_index_template(links: List[Link], template: str=MAIN_INDEX_TEMPLATE) -> return render_django_template(template, { 'version': VERSION, - 'git_sha': GIT_SHA, + 'git_sha': VERSION, # not used anymore, but kept for backwards compatibility 'num_links': str(len(links)), 'date_updated': datetime.now().strftime('%Y-%m-%d'), 'time_updated': datetime.now().strftime('%Y-%m-%d %H:%M'), diff --git a/archivebox/index/json.py b/archivebox/index/json.py index f24b969f..441e6854 100644 --- a/archivebox/index/json.py +++ b/archivebox/index/json.py @@ -15,7 +15,6 @@ from ..config import ( VERSION, OUTPUT_DIR, FOOTER_INFO, - GIT_SHA, DEPENDENCIES, JSON_INDEX_FILENAME, ARCHIVE_DIR_NAME, @@ -30,7 +29,7 @@ MAIN_INDEX_HEADER = { 'meta': { 'project': 'ArchiveBox', 'version': VERSION, - 'git_sha': GIT_SHA, + 'git_sha': VERSION, # not used anymore, but kept for backwards compatibility 'website': 'https://ArchiveBox.io', 'docs': 'https://github.com/ArchiveBox/ArchiveBox/wiki', 'source': 'https://github.com/ArchiveBox/ArchiveBox', From c28ad8bd1be0a84c370a384f353ceb5915eecf77 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 15 Feb 2021 20:42:59 -0500 Subject: [PATCH 011/112] fix AddLinkForm widget complaining about missing template var class --- archivebox/core/forms.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/archivebox/core/forms.py b/archivebox/core/forms.py index ed584c68..5521ff1d 100644 --- a/archivebox/core/forms.py +++ b/archivebox/core/forms.py @@ -20,7 +20,7 @@ ARCHIVE_METHODS = [ class AddLinkForm(forms.Form): url = forms.RegexField(label="URLs (one per line)", regex=URL_REGEX, min_length='6', strip=True, widget=forms.Textarea, required=True) - depth = forms.ChoiceField(label="Archive depth", choices=CHOICES, widget=forms.RadioSelect, initial='0') + depth = forms.ChoiceField(label="Archive depth", choices=CHOICES, initial='0', widget=forms.RadioSelect(attrs={"class": "depth-selection"})) archive_methods = forms.MultipleChoiceField( label="Archive methods (select at least 1, otherwise all will be used by default)", required=False, From 9ce3bd5bdc0ce3c94fa4e865b8a25b74bcce3a41 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 15 Feb 2021 20:43:36 -0500 Subject: [PATCH 012/112] use index.LINK_FILTERS to validate filter-type args instead of hardocding them twice --- archivebox/cli/archivebox_list.py | 5 +++-- archivebox/cli/archivebox_update.py | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/archivebox/cli/archivebox_list.py b/archivebox/cli/archivebox_list.py index 3838cf60..7cfeeb95 100644 --- a/archivebox/cli/archivebox_list.py +++ b/archivebox/cli/archivebox_list.py @@ -12,6 +12,7 @@ from ..main import list_all from ..util import docstring from ..config import OUTPUT_DIR from ..index import ( + LINK_FILTERS, get_indexed_folders, get_archived_folders, get_unarchived_folders, @@ -96,9 +97,9 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional ) ) parser.add_argument( - '--filter-type', + '--filter-type', '-t', type=str, - choices=('exact', 'substring', 'domain', 'regex', 'tag', 'search'), + choices=(*LINK_FILTERS.keys(), 'search'), default='exact', help='Type of pattern matching to use when filtering URLs', ) diff --git a/archivebox/cli/archivebox_update.py b/archivebox/cli/archivebox_update.py index 6748096e..bf3c15f8 100644 --- a/archivebox/cli/archivebox_update.py +++ b/archivebox/cli/archivebox_update.py @@ -12,6 +12,7 @@ from ..main import update from ..util import docstring from ..config import OUTPUT_DIR from ..index import ( + LINK_FILTERS, get_indexed_folders, get_archived_folders, get_unarchived_folders, @@ -89,9 +90,9 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional ) ) parser.add_argument( - '--filter-type', + '--filter-type', '-t', type=str, - choices=('exact', 'substring', 'domain', 'regex', 'tag', 'search'), + choices=(*LINK_FILTERS.keys(), 'search'), default='exact', help='Type of pattern matching to use when filtering URLs', ) From 4faef03ba3f3fbb21c11f3a41a31c66d7e83bb75 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 15 Feb 2021 20:44:08 -0500 Subject: [PATCH 013/112] compute snapshot properties directly without loading whole Link --- archivebox/core/models.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/archivebox/core/models.py b/archivebox/core/models.py index 13d75b66..7be705c3 100644 --- a/archivebox/core/models.py +++ b/archivebox/core/models.py @@ -7,7 +7,8 @@ from django.utils.functional import cached_property from django.utils.text import slugify from django.db.models import Case, When, Value, IntegerField -from ..util import parse_date +from ..config import ARCHIVE_DIR +from ..util import parse_date, base_url, hashurl from ..index.schema import Link from ..extractors import get_default_archive_methods, ARCHIVE_METHODS_INDEXING_PRECEDENCE @@ -116,6 +117,11 @@ class Snapshot(models.Model): def bookmarked(self): return parse_date(self.timestamp) + @cached_property + def bookmarked_date(self): + # TODO: remove this + return self.bookmarked + @cached_property def is_archived(self): return self.as_link().is_archived @@ -126,15 +132,15 @@ class Snapshot(models.Model): @cached_property def url_hash(self): - return self.as_link().url_hash + return hashurl(self.url) @cached_property def base_url(self): - return self.as_link().base_url + return base_url(self.url) @cached_property def link_dir(self): - return self.as_link().link_dir + return str(ARCHIVE_DIR / self.timestamp) @cached_property def archive_path(self): From 0c9db1c554eb0679bc271c8e301549939c6bfea3 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 15 Feb 2021 20:45:42 -0500 Subject: [PATCH 014/112] remove symbols from random secret key for easier copy pastin --- archivebox/config.py | 2 +- archivebox/core/settings.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/archivebox/config.py b/archivebox/config.py index 10a84a48..c5495ba2 100644 --- a/archivebox/config.py +++ b/archivebox/config.py @@ -480,7 +480,7 @@ def write_config_file(config: Dict[str, str], out_dir: str=None) -> ConfigDict: if (not existing_secret_key) or ('not a valid secret' in existing_secret_key): from django.utils.crypto import get_random_string - chars = 'abcdefghijklmnopqrstuvwxyz0123456789-_+!.' + chars = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_' random_secret_key = get_random_string(50, chars) if 'SERVER_CONFIG' in config_file: config_file['SERVER_CONFIG']['SECRET_KEY'] = random_secret_key diff --git a/archivebox/core/settings.py b/archivebox/core/settings.py index e73c93d9..b11c5857 100644 --- a/archivebox/core/settings.py +++ b/archivebox/core/settings.py @@ -117,7 +117,7 @@ EMAIL_BACKEND = 'django.core.mail.backends.console.EmailBackend' ### Security Settings ################################################################################ -SECRET_KEY = SECRET_KEY or get_random_string(50, 'abcdefghijklmnopqrstuvwxyz0123456789-_+!.') +SECRET_KEY = SECRET_KEY or get_random_string(50, 'abcdefghijklmnopqrstuvwxyz0123456789_') ALLOWED_HOSTS = ALLOWED_HOSTS.split(',') From 33d180afe7592b3486691ab59933f9969ba3f732 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 15 Feb 2021 20:48:35 -0500 Subject: [PATCH 015/112] allow filtering snapshots by timestamp in list, update, and remove cmds --- archivebox/index/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/archivebox/index/__init__.py b/archivebox/index/__init__.py index 04ab0a8d..d3d1bedc 100644 --- a/archivebox/index/__init__.py +++ b/archivebox/index/__init__.py @@ -356,6 +356,7 @@ LINK_FILTERS = { 'regex': lambda pattern: Q(url__iregex=pattern), 'domain': lambda pattern: Q(url__istartswith=f"http://{pattern}") | Q(url__istartswith=f"https://{pattern}") | Q(url__istartswith=f"ftp://{pattern}"), 'tag': lambda pattern: Q(tags__name=pattern), + 'timestamp': lambda pattern: Q(timestamp=pattern), } @enforce_types From 8e98cef7adcc78465d5ca6754b126de362ea1a3a Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 15 Feb 2021 20:48:51 -0500 Subject: [PATCH 016/112] fix after and before args flipped when filtering --- archivebox/main.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/archivebox/main.py b/archivebox/main.py index 169921fd..afcaaeff 100644 --- a/archivebox/main.py +++ b/archivebox/main.py @@ -818,11 +818,15 @@ def list_links(snapshots: Optional[QuerySet]=None, all_snapshots = load_main_index(out_dir=out_dir) if after is not None: - all_snapshots = all_snapshots.filter(timestamp__lt=after) + all_snapshots = all_snapshots.filter(timestamp__gte=after) if before is not None: - all_snapshots = all_snapshots.filter(timestamp__gt=before) + all_snapshots = all_snapshots.filter(timestamp__lt=before) if filter_patterns: all_snapshots = snapshot_filter(all_snapshots, filter_patterns, filter_type) + + if not all_snapshots: + stderr('[!] No Snapshots matched your filters:', filter_patterns, f'({filter_type})', color='lightyellow') + return all_snapshots @enforce_types From b06e256ad9c11238db589c2bf2bbebe8d9cecdbd Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 15 Feb 2021 20:49:23 -0500 Subject: [PATCH 017/112] fix add command not updating snapshot detail index pages when passed index-only and overwrite flags together --- archivebox/main.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/archivebox/main.py b/archivebox/main.py index afcaaeff..a10ad212 100644 --- a/archivebox/main.py +++ b/archivebox/main.py @@ -585,6 +585,10 @@ def add(urls: Union[str, List[str]], all_links = load_main_index(out_dir=out_dir) if index_only: + if overwrite: + archive_links(imported_links, overwrite=overwrite, methods=['index_only'], out_dir=out_dir) + else: + archive_links(new_links, overwrite=False, methods=['index_only'], out_dir=out_dir) return all_links # Run the archive methods for each link @@ -593,6 +597,7 @@ def add(urls: Union[str, List[str]], } if extractors: archive_kwargs["methods"] = extractors + if update_all: archive_links(all_links, overwrite=overwrite, **archive_kwargs) elif overwrite: From b3a50a2c10e6b9973a8283d0115288dcc54b6d3e Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 15 Feb 2021 20:49:40 -0500 Subject: [PATCH 018/112] fix server quick-init param not being passed properly to subcommand --- archivebox/main.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/archivebox/main.py b/archivebox/main.py index a10ad212..8a823597 100644 --- a/archivebox/main.py +++ b/archivebox/main.py @@ -1084,8 +1084,11 @@ def server(runserver_args: Optional[List[str]]=None, runserver_args = runserver_args or [] - if init or quick_init: - run_subcommand('init', quick=quick_init, stdin=None, pwd=out_dir) + if init: + run_subcommand('init', stdin=None, pwd=out_dir) + + if quick_init: + run_subcommand('init', subcommand_args=['--quick'], stdin=None, pwd=out_dir) if createsuperuser: run_subcommand('manage', subcommand_args=['createsuperuser'], pwd=out_dir) From 0ec9bfb9719cf7eaab375c1508a563bb18dfd29f Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 15 Feb 2021 20:50:12 -0500 Subject: [PATCH 019/112] fix dead missing template variables --- archivebox/templates/admin/base.html | 2 +- archivebox/templates/core/base.html | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/archivebox/templates/admin/base.html b/archivebox/templates/admin/base.html index d8ad8d00..d581337f 100644 --- a/archivebox/templates/admin/base.html +++ b/archivebox/templates/admin/base.html @@ -20,7 +20,7 @@ -