diff --git a/Dockerfile b/Dockerfile index 8cf2da30..b11d3382 100644 --- a/Dockerfile +++ b/Dockerfile @@ -50,13 +50,6 @@ RUN apt-get update -qq \ fontconfig fonts-ipafont-gothic fonts-wqy-zenhei fonts-thai-tlwg fonts-kacst fonts-symbola fonts-noto fonts-freefont-ttf \ && rm -rf /var/lib/apt/lists/* -# Install apt development dependencies -# RUN apt-get install -qq \ -# && apt-get install -qq -y --no-install-recommends \ -# python3 python3-dev python3-pip python3-venv python3-all \ -# dh-python debhelper devscripts dput software-properties-common \ -# python3-distutils python3-setuptools python3-wheel python3-stdeb - # Install Node environment RUN curl -s https://deb.nodesource.com/gpgkey/nodesource.gpg.key | apt-key add - \ && echo 'deb https://deb.nodesource.com/node_15.x buster main' >> /etc/apt/sources.list \ @@ -79,17 +72,26 @@ WORKDIR "$CODE_DIR" ENV PATH="${PATH}:$VENV_PATH/bin" RUN python -m venv --clear --symlinks "$VENV_PATH" \ && pip install --upgrade --quiet pip setuptools -ADD ./pip_dist/archivebox.egg-info/requires.txt "$CODE_DIR/pip_dist/archivebox.egg-info/requires.txt" +ADD "./setup.py" "$CODE_DIR/" +ADD "./README.md" "./package.json" "$CODE_DIR/archivebox/" RUN apt-get update -qq \ && apt-get install -qq -y --no-install-recommends \ build-essential python-dev python3-dev \ - # && pip install --upgrade pip \ - && grep -B 1000 -E '^$' "$CODE_DIR/pip_dist/archivebox.egg-info/requires.txt" | pip install --quiet -r /dev/stdin \ - && pip install --quiet "sonic-client==0.0.5" \ + && python3 -c 'from distutils.core import run_setup; result = run_setup("./setup.py", stop_after="init"); print("\n".join(result.install_requires + result.extras_require["sonic"]))' > /tmp/requirements.txt \ + && pip install --quiet -r /tmp/requirements.txt \ && apt-get purge -y build-essential python-dev python3-dev \ && apt-get autoremove -y \ && rm -rf /var/lib/apt/lists/* +# Install apt development dependencies +# RUN apt-get install -qq \ +# && apt-get install -qq -y --no-install-recommends \ +# python3 python3-dev python3-pip python3-venv python3-all \ +# dh-python debhelper devscripts dput software-properties-common \ +# python3-distutils python3-setuptools python3-wheel python3-stdeb +# RUN python3 -c 'from distutils.core import run_setup; result = run_setup("./setup.py", stop_after="init"); print("\n".join(result.extras_require["dev"]))' > /tmp/dev_requirements.txt \ + # && pip install --quiet -r /tmp/dev_requirements.txt + # Install ArchiveBox Python package and its dependencies WORKDIR "$CODE_DIR" ADD . 
"$CODE_DIR" @@ -115,5 +117,8 @@ RUN /app/bin/docker_entrypoint.sh archivebox version VOLUME "$DATA_DIR" EXPOSE 8000 +HEALTHCHECK --interval=30s --timeout=20s --retries=15 \ + CMD curl --silent 'http://localhost:8000/admin/login/' || exit 1 + ENTRYPOINT ["dumb-init", "--", "/app/bin/docker_entrypoint.sh"] -CMD ["archivebox", "server", "0.0.0.0:8000"] +CMD ["archivebox", "server", "--quick-init", "0.0.0.0:8000"] diff --git a/archivebox/cli/__init__.py b/archivebox/cli/__init__.py index f9a55efd..890065a4 100644 --- a/archivebox/cli/__init__.py +++ b/archivebox/cli/__init__.py @@ -63,7 +63,11 @@ def run_subcommand(subcommand: str, if subcommand not in meta_cmds: from ..config import setup_django - setup_django(in_memory_db=subcommand in fake_db, check_db=subcommand in archive_cmds) + + cmd_requires_db = subcommand in archive_cmds + init_pending = '--init' in subcommand_args or '--quick-init' in subcommand_args + + setup_django(in_memory_db=subcommand in fake_db, check_db=cmd_requires_db and not init_pending) module = import_module('.archivebox_{}'.format(subcommand), __package__) module.main(args=subcommand_args, stdin=stdin, pwd=pwd) # type: ignore diff --git a/archivebox/cli/archivebox_add.py b/archivebox/cli/archivebox_add.py index 41c7554d..a96888b0 100644 --- a/archivebox/cli/archivebox_add.py +++ b/archivebox/cli/archivebox_add.py @@ -22,6 +22,12 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional add_help=True, formatter_class=SmartFormatter, ) + parser.add_argument( + '--tag', '-t', + type=str, + default='', + help="Tag the added URLs with the provided tags e.g. --tag=tag1,tag2,tag3", + ) parser.add_argument( '--update-all', #'-n', action='store_true', @@ -75,7 +81,11 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional ) command = parser.parse_args(args or ()) urls = command.urls - stdin_urls = accept_stdin(stdin) + + stdin_urls = '' + if not urls: + stdin_urls = accept_stdin(stdin) + if (stdin_urls and urls) or (not stdin and not urls): stderr( '[X] You must pass URLs/paths to add via stdin or CLI arguments.\n', @@ -85,6 +95,7 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional add( urls=stdin_urls or urls, depth=command.depth, + tag=command.tag, update_all=command.update_all, index_only=command.index_only, overwrite=command.overwrite, diff --git a/archivebox/cli/archivebox_config.py b/archivebox/cli/archivebox_config.py index f81286c6..25621972 100644 --- a/archivebox/cli/archivebox_config.py +++ b/archivebox/cli/archivebox_config.py @@ -45,7 +45,10 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional help='KEY or KEY=VALUE formatted config values to get or set', ) command = parser.parse_args(args or ()) - config_options_str = accept_stdin(stdin) + + config_options_str = '' + if not command.config_options: + config_options_str = accept_stdin(stdin) config( config_options_str=config_options_str, diff --git a/archivebox/cli/archivebox_init.py b/archivebox/cli/archivebox_init.py index 6255ef26..5753269c 100755 --- a/archivebox/cli/archivebox_init.py +++ b/archivebox/cli/archivebox_init.py @@ -27,11 +27,17 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional action='store_true', help='Ignore unrecognized files in current directory and initialize anyway', ) + parser.add_argument( + '--quick', '-q', + action='store_true', + help='Run any updates or migrations without rechecking all snapshot dirs', + ) command = parser.parse_args(args 
or ()) reject_stdin(__command__, stdin) init( force=command.force, + quick=command.quick, out_dir=pwd or OUTPUT_DIR, ) diff --git a/archivebox/cli/archivebox_list.py b/archivebox/cli/archivebox_list.py index 3838cf60..5477bfc8 100644 --- a/archivebox/cli/archivebox_list.py +++ b/archivebox/cli/archivebox_list.py @@ -12,6 +12,7 @@ from ..main import list_all from ..util import docstring from ..config import OUTPUT_DIR from ..index import ( + LINK_FILTERS, get_indexed_folders, get_archived_folders, get_unarchived_folders, @@ -23,7 +24,7 @@ from ..index import ( get_corrupted_folders, get_unrecognized_folders, ) -from ..logging_util import SmartFormatter, accept_stdin, stderr +from ..logging_util import SmartFormatter, reject_stdin, stderr @docstring(list_all.__doc__) @@ -44,7 +45,7 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional group.add_argument( '--json', #'-j', action='store_true', - help="Print the output in JSON format with all columns included.", + help="Print the output in JSON format with all columns included", ) group.add_argument( '--html', @@ -59,19 +60,19 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional parser.add_argument( '--sort', #'-s', type=str, - help="List the links sorted using the given key, e.g. timestamp or updated.", + help="List the links sorted using the given key, e.g. timestamp or updated", default=None, ) parser.add_argument( '--before', #'-b', type=float, - help="List only links bookmarked before the given timestamp.", + help="List only links bookmarked before (less than) the given timestamp", default=None, ) parser.add_argument( '--after', #'-a', type=float, - help="List only links bookmarked after the given timestamp.", + help="List only links bookmarked after (greater than or equal to) the given timestamp", default=None, ) parser.add_argument( @@ -96,9 +97,9 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional ) ) parser.add_argument( - '--filter-type', + '--filter-type', '-t', type=str, - choices=('exact', 'substring', 'domain', 'regex', 'tag', 'search'), + choices=(*LINK_FILTERS.keys(), 'search'), default='exact', help='Type of pattern matching to use when filtering URLs', ) @@ -107,20 +108,19 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional nargs='*', type=str, default=None, - help='List only URLs matching these filter patterns.' + help='List only URLs matching these filter patterns' ) command = parser.parse_args(args or ()) - filter_patterns_str = accept_stdin(stdin) + reject_stdin(stdin) if command.with_headers and not (command.json or command.html or command.csv): stderr( - '[X] --with-headers can only be used with --json, --html or --csv options.\n', + '[X] --with-headers can only be used with --json, --html or --csv options\n', color='red', ) raise SystemExit(2) matching_folders = list_all( - filter_patterns_str=filter_patterns_str, filter_patterns=command.filter_patterns, filter_type=command.filter_type, status=command.status, diff --git a/archivebox/cli/archivebox_oneshot.py b/archivebox/cli/archivebox_oneshot.py index af68bac2..411cce8b 100644 --- a/archivebox/cli/archivebox_oneshot.py +++ b/archivebox/cli/archivebox_oneshot.py @@ -50,8 +50,11 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional help= "Path to save the single archive folder to, e.g. 
./example.com_archive" ) command = parser.parse_args(args or ()) + stdin_url = None url = command.url - stdin_url = accept_stdin(stdin) + if not url: + stdin_url = accept_stdin(stdin) + if (stdin_url and url) or (not stdin and not url): stderr( '[X] You must pass a URL/path to add via stdin or CLI arguments.\n', diff --git a/archivebox/cli/archivebox_remove.py b/archivebox/cli/archivebox_remove.py index cb073e95..dadf2654 100644 --- a/archivebox/cli/archivebox_remove.py +++ b/archivebox/cli/archivebox_remove.py @@ -61,7 +61,10 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional help='URLs matching this filter pattern will be removed from the index.' ) command = parser.parse_args(args or ()) - filter_str = accept_stdin(stdin) + + filter_str = None + if not command.filter_patterns: + filter_str = accept_stdin(stdin) remove( filter_str=filter_str, diff --git a/archivebox/cli/archivebox_server.py b/archivebox/cli/archivebox_server.py index a4d96dc9..4cc050dd 100644 --- a/archivebox/cli/archivebox_server.py +++ b/archivebox/cli/archivebox_server.py @@ -38,10 +38,20 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional action='store_true', help='Enable DEBUG=True mode with more verbose errors', ) + parser.add_argument( + '--nothreading', + action='store_true', + help='Force runserver to run in single-threaded mode', + ) parser.add_argument( '--init', action='store_true', - help='Run archivebox init before starting the server', + help='Run a full archivebox init/upgrade before starting the server', + ) + parser.add_argument( + '--quick-init', '-i', + action='store_true', + help='Run quick archivebox init/upgrade before starting the server', ) parser.add_argument( '--createsuperuser', @@ -52,10 +62,11 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional reject_stdin(__command__, stdin) server( - runserver_args=command.runserver_args, + runserver_args=command.runserver_args + (['--nothreading'] if command.nothreading else []), reload=command.reload, debug=command.debug, init=command.init, + quick_init=command.quick_init, createsuperuser=command.createsuperuser, out_dir=pwd or OUTPUT_DIR, ) diff --git a/archivebox/cli/archivebox_update.py b/archivebox/cli/archivebox_update.py index 6748096e..500d4c07 100644 --- a/archivebox/cli/archivebox_update.py +++ b/archivebox/cli/archivebox_update.py @@ -12,6 +12,7 @@ from ..main import update from ..util import docstring from ..config import OUTPUT_DIR from ..index import ( + LINK_FILTERS, get_indexed_folders, get_archived_folders, get_unarchived_folders, @@ -89,9 +90,9 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional ) ) parser.add_argument( - '--filter-type', + '--filter-type', '-t', type=str, - choices=('exact', 'substring', 'domain', 'regex', 'tag', 'search'), + choices=(*LINK_FILTERS.keys(), 'search'), default='exact', help='Type of pattern matching to use when filtering URLs', ) @@ -110,7 +111,10 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional default="" ) command = parser.parse_args(args or ()) - filter_patterns_str = accept_stdin(stdin) + + filter_patterns_str = None + if not command.filter_patterns: + filter_patterns_str = accept_stdin(stdin) update( resume=command.resume, diff --git a/archivebox/cli/tests.py b/archivebox/cli/tests.py new file mode 100644 index 00000000..04c54df8 --- /dev/null +++ b/archivebox/cli/tests.py @@ -0,0 +1,227 @@ +#!/usr/bin/env python3 + +__package__ = 
'archivebox.cli' + + +import os +import sys +import shutil +import unittest +from pathlib import Path + +from contextlib import contextmanager + +TEST_CONFIG = { + 'USE_COLOR': 'False', + 'SHOW_PROGRESS': 'False', + + 'OUTPUT_DIR': 'data.tests', + + 'SAVE_ARCHIVE_DOT_ORG': 'False', + 'SAVE_TITLE': 'False', + + 'USE_CURL': 'False', + 'USE_WGET': 'False', + 'USE_GIT': 'False', + 'USE_CHROME': 'False', + 'USE_YOUTUBEDL': 'False', +} + +OUTPUT_DIR = 'data.tests' +os.environ.update(TEST_CONFIG) + +from ..main import init +from ..index import load_main_index +from ..config import ( + SQL_INDEX_FILENAME, + JSON_INDEX_FILENAME, + HTML_INDEX_FILENAME, +) + +from . import ( + archivebox_init, + archivebox_add, + archivebox_remove, +) + +HIDE_CLI_OUTPUT = True + +test_urls = ''' +https://example1.com/what/is/happening.html?what=1#how-about-this=1 +https://example2.com/what/is/happening/?what=1#how-about-this=1 +HTtpS://example3.com/what/is/happening/?what=1#how-about-this=1f +https://example4.com/what/is/happening.html +https://example5.com/ +https://example6.com + +http://example7.com +[https://example8.com/what/is/this.php?what=1] +[and http://example9.com?what=1&other=3#and-thing=2] +https://example10.com#and-thing=2 " +abcdef +sdflkf[what](https://subb.example12.com/who/what.php?whoami=1#whatami=2)?am=hi +example13.bada +and example14.badb +htt://example15.badc +''' + +stdout = sys.stdout +stderr = sys.stderr + + +@contextmanager +def output_hidden(show_failing=True): + if not HIDE_CLI_OUTPUT: + yield + return + + sys.stdout = open('stdout.txt', 'w+', encoding='utf-8') + sys.stderr = open('stderr.txt', 'w+', encoding='utf-8') + try: + yield + sys.stdout.close() + sys.stderr.close() + sys.stdout = stdout + sys.stderr = stderr + except Exception: + sys.stdout.close() + sys.stderr.close() + sys.stdout = stdout + sys.stderr = stderr + if show_failing: + with open('stdout.txt', 'r', encoding='utf-8') as f: + print(f.read()) + with open('stderr.txt', 'r', encoding='utf-8') as f: + print(f.read()) + raise + finally: + os.remove('stdout.txt') + os.remove('stderr.txt') + + +class TestInit(unittest.TestCase): + def setUp(self): + os.makedirs(OUTPUT_DIR, exist_ok=True) + + def tearDown(self): + shutil.rmtree(OUTPUT_DIR, ignore_errors=True) + + def test_basic_init(self): + with output_hidden(): + archivebox_init.main([]) + + assert (Path(OUTPUT_DIR) / SQL_INDEX_FILENAME).exists() + assert (Path(OUTPUT_DIR) / JSON_INDEX_FILENAME).exists() + assert (Path(OUTPUT_DIR) / HTML_INDEX_FILENAME).exists() + assert len(load_main_index(out_dir=OUTPUT_DIR)) == 0 + + def test_conflicting_init(self): + with open(Path(OUTPUT_DIR) / 'test_conflict.txt', 'w+', encoding='utf-8') as f: + f.write('test') + + try: + with output_hidden(show_failing=False): + archivebox_init.main([]) + assert False, 'Init should have exited with an exception' + except SystemExit: + pass + + assert not (Path(OUTPUT_DIR) / SQL_INDEX_FILENAME).exists() + assert not (Path(OUTPUT_DIR) / JSON_INDEX_FILENAME).exists() + assert not (Path(OUTPUT_DIR) / HTML_INDEX_FILENAME).exists() + try: + load_main_index(out_dir=OUTPUT_DIR) + assert False, 'load_main_index should raise an exception when no index is present' + except Exception: + pass + + def test_no_dirty_state(self): + with output_hidden(): + init() + shutil.rmtree(OUTPUT_DIR, ignore_errors=True) + with output_hidden(): + init() + + +class TestAdd(unittest.TestCase): + def setUp(self): + os.makedirs(OUTPUT_DIR, exist_ok=True) + with output_hidden(): + init() + + def tearDown(self): + 
shutil.rmtree(OUTPUT_DIR, ignore_errors=True) + + def test_add_arg_url(self): + with output_hidden(): + archivebox_add.main(['https://getpocket.com/users/nikisweeting/feed/all']) + + all_links = load_main_index(out_dir=OUTPUT_DIR) + assert len(all_links) == 30 + + def test_add_arg_file(self): + test_file = Path(OUTPUT_DIR) / 'test.txt' + with open(test_file, 'w+', encoding='utf') as f: + f.write(test_urls) + + with output_hidden(): + archivebox_add.main([test_file]) + + all_links = load_main_index(out_dir=OUTPUT_DIR) + assert len(all_links) == 12 + os.remove(test_file) + + def test_add_stdin_url(self): + with output_hidden(): + archivebox_add.main([], stdin=test_urls) + + all_links = load_main_index(out_dir=OUTPUT_DIR) + assert len(all_links) == 12 + + +class TestRemove(unittest.TestCase): + def setUp(self): + os.makedirs(OUTPUT_DIR, exist_ok=True) + with output_hidden(): + init() + archivebox_add.main([], stdin=test_urls) + + # def tearDown(self): + # shutil.rmtree(OUTPUT_DIR, ignore_errors=True) + + + def test_remove_exact(self): + with output_hidden(): + archivebox_remove.main(['--yes', '--delete', 'https://example5.com/']) + + all_links = load_main_index(out_dir=OUTPUT_DIR) + assert len(all_links) == 11 + + def test_remove_regex(self): + with output_hidden(): + archivebox_remove.main(['--yes', '--delete', '--filter-type=regex', r'http(s)?:\/\/(.+\.)?(example\d\.com)']) + + all_links = load_main_index(out_dir=OUTPUT_DIR) + assert len(all_links) == 4 + + def test_remove_domain(self): + with output_hidden(): + archivebox_remove.main(['--yes', '--delete', '--filter-type=domain', 'example5.com', 'example6.com']) + + all_links = load_main_index(out_dir=OUTPUT_DIR) + assert len(all_links) == 10 + + def test_remove_none(self): + try: + with output_hidden(show_failing=False): + archivebox_remove.main(['--yes', '--delete', 'https://doesntexist.com']) + assert False, 'Should raise if no URLs match' + except Exception: + pass + + +if __name__ == '__main__': + if '--verbose' in sys.argv or '-v' in sys.argv: + HIDE_CLI_OUTPUT = False + + unittest.main() diff --git a/archivebox/config.py b/archivebox/config.py index 3d48344f..803e4d19 100644 --- a/archivebox/config.py +++ b/archivebox/config.py @@ -29,10 +29,12 @@ import json import getpass import platform import shutil +import sqlite3 import django from hashlib import md5 from pathlib import Path +from datetime import datetime from typing import Optional, Type, Tuple, Dict, Union, List from subprocess import run, PIPE, DEVNULL from configparser import ConfigParser @@ -77,6 +79,7 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = { 'PUBLIC_SNAPSHOTS': {'type': bool, 'default': True}, 'PUBLIC_ADD_VIEW': {'type': bool, 'default': False}, 'FOOTER_INFO': {'type': str, 'default': 'Content is hosted for personal archiving purposes only. 
Contact server owner for any takedown requests.'}, + 'SNAPSHOTS_PER_PAGE': {'type': int, 'default': 40}, }, 'ARCHIVE_METHOD_TOGGLES': { @@ -99,8 +102,9 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = { 'ARCHIVE_METHOD_OPTIONS': { 'RESOLUTION': {'type': str, 'default': '1440,2000', 'aliases': ('SCREENSHOT_RESOLUTION',)}, - 'GIT_DOMAINS': {'type': str, 'default': 'github.com,bitbucket.org,gitlab.com'}, + 'GIT_DOMAINS': {'type': str, 'default': 'github.com,bitbucket.org,gitlab.com,gist.github.com'}, 'CHECK_SSL_VALIDITY': {'type': bool, 'default': True}, + 'MEDIA_MAX_SIZE': {'type': str, 'default': '750m'}, 'CURL_USER_AGENT': {'type': str, 'default': 'ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/) curl/{CURL_VERSION}'}, 'WGET_USER_AGENT': {'type': str, 'default': 'ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/) wget/{WGET_VERSION}'}, @@ -111,7 +115,7 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = { 'CHROME_HEADLESS': {'type': bool, 'default': True}, 'CHROME_SANDBOX': {'type': bool, 'default': lambda c: not c['IN_DOCKER']}, - 'YOUTUBEDL_ARGS': {'type': list, 'default': ['--write-description', + 'YOUTUBEDL_ARGS': {'type': list, 'default': lambda c: ['--write-description', '--write-info-json', '--write-annotations', '--write-thumbnail', @@ -122,7 +126,7 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = { '--ignore-errors', '--geo-bypass', '--add-metadata', - '--max-filesize=750m', + '--max-filesize={}'.format(c['MEDIA_MAX_SIZE']), ]}, @@ -287,7 +291,6 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = { 'ARCHIVEBOX_BINARY': {'default': lambda c: sys.argv[0]}, 'VERSION': {'default': lambda c: json.loads((Path(c['PACKAGE_DIR']) / 'package.json').read_text().strip())['version']}, - 'GIT_SHA': {'default': lambda c: c['VERSION'].split('+')[-1] or 'unknown'}, 'PYTHON_BINARY': {'default': lambda c: sys.executable}, 'PYTHON_ENCODING': {'default': lambda c: sys.stdout.encoding.upper()}, @@ -459,7 +462,7 @@ def write_config_file(config: Dict[str, str], out_dir: str=None) -> ConfigDict: config_file.optionxform = str config_file.read(config_path) - with open(config_path, 'r') as old: + with open(config_path, 'r', encoding='utf-8') as old: atomic_write(f'{config_path}.bak', old.read()) find_section = lambda key: [name for name, opts in CONFIG_SCHEMA.items() if key in opts][0] @@ -480,14 +483,14 @@ def write_config_file(config: Dict[str, str], out_dir: str=None) -> ConfigDict: if (not existing_secret_key) or ('not a valid secret' in existing_secret_key): from django.utils.crypto import get_random_string - chars = 'abcdefghijklmnopqrstuvwxyz0123456789-_+!.' 
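The replacement charset on the next line restricts the generated SECRET_KEY to alphanumerics plus underscore, presumably so the value can be written into the INI-style ArchiveBox.conf and pasted into env files without quoting. A minimal standalone sketch of the same call (only Django's get_random_string is assumed; this is illustrative, not the surrounding ArchiveBox code):

    from django.utils.crypto import get_random_string

    # 50-character key drawn only from [a-zA-Z0-9_], matching the new charset below
    chars = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_'
    print(get_random_string(50, chars))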
+ chars = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_' random_secret_key = get_random_string(50, chars) if 'SERVER_CONFIG' in config_file: config_file['SERVER_CONFIG']['SECRET_KEY'] = random_secret_key else: config_file['SERVER_CONFIG'] = {'SECRET_KEY': random_secret_key} - with open(config_path, 'w+') as new: + with open(config_path, 'w+', encoding='utf-8') as new: config_file.write(new) try: @@ -499,7 +502,7 @@ def write_config_file(config: Dict[str, str], out_dir: str=None) -> ConfigDict: } except: # something went horribly wrong, rever to the previous version - with open(f'{config_path}.bak', 'r') as old: + with open(f'{config_path}.bak', 'r', encoding='utf-8') as old: atomic_write(config_path, old.read()) if Path(f'{config_path}.bak').exists(): @@ -1062,23 +1065,72 @@ def setup_django(out_dir: Path=None, check_db=False, config: ConfigDict=CONFIG, try: import django + from django.core.management import call_command + sys.path.append(str(config['PACKAGE_DIR'])) os.environ.setdefault('OUTPUT_DIR', str(output_dir)) assert (config['PACKAGE_DIR'] / 'core' / 'settings.py').exists(), 'settings.py was not found at archivebox/core/settings.py' os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'core.settings') + # Check to make sure JSON extension is available in our Sqlite3 instance + try: + cursor = sqlite3.connect(':memory:').cursor() + cursor.execute('SELECT JSON(\'{"a": "b"}\')') + except sqlite3.OperationalError as exc: + stderr('[X] Your SQLite3 version is missing the required JSON1 extension', color='red') + hint([ + 'Upgrade your Python version or install the extension manually:', + 'https://code.djangoproject.com/wiki/JSON1Extension' + ]) + if in_memory_db: - # Put the db in memory and run migrations in case any command requires it - from django.core.management import call_command + # some commands (e.g. oneshot) dont store a long-lived sqlite3 db file on disk. 
+ # in those cases we create a temporary in-memory db and run the migrations + # immediately to get a usable in-memory-database at startup os.environ.setdefault("ARCHIVEBOX_DATABASE_NAME", ":memory:") django.setup() call_command("migrate", interactive=False, verbosity=0) else: + # Otherwise use default sqlite3 file-based database and initialize django + # without running migrations automatically (user runs them manually by calling init) django.setup() + + + from django.conf import settings + + # log startup message to the error log + with open(settings.ERROR_LOG, "a+", encoding='utf-8') as f: + command = ' '.join(sys.argv) + ts = datetime.now().strftime('%Y-%m-%d__%H:%M:%S') + f.write(f"\n> {command}; ts={ts} version={config['VERSION']} docker={config['IN_DOCKER']} is_tty={config['IS_TTY']}\n") + if check_db: + # Enable WAL mode in sqlite3 + from django.db import connection + with connection.cursor() as cursor: + current_mode = cursor.execute("PRAGMA journal_mode") + if current_mode != 'wal': + cursor.execute("PRAGMA journal_mode=wal;") + + # Create cache table in DB if needed + try: + from django.core.cache import cache + cache.get('test', None) + except django.db.utils.OperationalError: + call_command("createcachetable", verbosity=0) + + + # if archivebox gets imported multiple times, we have to close + # the sqlite3 whenever we init from scratch to avoid multiple threads + # sharing the same connection by accident + from django.db import connections + for conn in connections.all(): + conn.close_if_unusable_or_obsolete() + sql_index_path = Path(output_dir) / SQL_INDEX_FILENAME assert sql_index_path.exists(), ( f'No database file {SQL_INDEX_FILENAME} found in: {config["OUTPUT_DIR"]} (Are you in an ArchiveBox collection directory?)') + except KeyboardInterrupt: raise SystemExit(2) diff --git a/archivebox/core/admin.py b/archivebox/core/admin.py index bacc53c0..517ec79b 100644 --- a/archivebox/core/admin.py +++ b/archivebox/core/admin.py @@ -1,6 +1,7 @@ __package__ = 'archivebox.core' from io import StringIO +from pathlib import Path from contextlib import redirect_stdout from django.contrib import admin @@ -13,15 +14,15 @@ from django import forms from ..util import htmldecode, urldecode, ansi_to_html -from core.models import Snapshot, Tag -from core.forms import AddLinkForm, TagField +from core.models import Snapshot, ArchiveResult, Tag +from core.forms import AddLinkForm from core.mixins import SearchResultsAdminMixin from index.html import snapshot_icons from logging_util import printable_filesize from main import add, remove -from config import OUTPUT_DIR +from config import OUTPUT_DIR, SNAPSHOTS_PER_PAGE from extractors import archive_links # Admin URLs @@ -36,77 +37,34 @@ from extractors import archive_links # TODO: https://stackoverflow.com/questions/40760880/add-custom-button-to-django-admin-panel -def update_snapshots(modeladmin, request, queryset): - archive_links([ - snapshot.as_link() - for snapshot in queryset - ], out_dir=OUTPUT_DIR) -update_snapshots.short_description = "Archive" -def update_titles(modeladmin, request, queryset): - archive_links([ - snapshot.as_link() - for snapshot in queryset - ], overwrite=True, methods=('title','favicon'), out_dir=OUTPUT_DIR) -update_titles.short_description = "Pull title" +class ArchiveResultInline(admin.TabularInline): + model = ArchiveResult -def overwrite_snapshots(modeladmin, request, queryset): - archive_links([ - snapshot.as_link() - for snapshot in queryset - ], overwrite=True, out_dir=OUTPUT_DIR) 
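The setup_django() changes in the config.py hunk above add a runtime probe for SQLite's JSON1 extension and switch the on-disk database to WAL journal mode. Below is a standalone sketch of both checks using only the stdlib sqlite3 module ('index.sqlite3' is a placeholder path, not necessarily your collection's database). Note that reading a PRAGMA value requires fetchone(): the cursor object returned by execute() never compares equal to the string 'wal', so the check above effectively always re-issues the pragma (harmless, since setting WAL repeatedly is a no-op):

    import sqlite3

    # 1. JSON1 extension probe (same query the hunk above runs against an in-memory db)
    try:
        sqlite3.connect(':memory:').execute('SELECT JSON(\'{"a": "b"}\')')
        print('JSON1 extension: available')
    except sqlite3.OperationalError:
        print('JSON1 extension: missing, see https://code.djangoproject.com/wiki/JSON1Extension')

    # 2. WAL journal-mode check, reading the actual mode string with fetchone()
    conn = sqlite3.connect('index.sqlite3')  # placeholder path for illustration
    mode = conn.execute('PRAGMA journal_mode').fetchone()[0]
    if mode.lower() != 'wal':
        conn.execute('PRAGMA journal_mode=wal;')
    print('journal_mode:', conn.execute('PRAGMA journal_mode').fetchone()[0])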
-overwrite_snapshots.short_description = "Re-archive (overwrite)" +class TagInline(admin.TabularInline): + model = Snapshot.tags.through -def verify_snapshots(modeladmin, request, queryset): - for snapshot in queryset: - print(snapshot.timestamp, snapshot.url, snapshot.is_archived, snapshot.archive_size, len(snapshot.history)) - -verify_snapshots.short_description = "Check" - -def delete_snapshots(modeladmin, request, queryset): - remove(snapshots=queryset, yes=True, delete=True, out_dir=OUTPUT_DIR) - -delete_snapshots.short_description = "Delete" +from django.contrib.admin.helpers import ActionForm -class SnapshotAdminForm(forms.ModelForm): - tags = TagField(required=False) - - class Meta: - model = Snapshot - fields = "__all__" - - def save(self, commit=True): - # Based on: https://stackoverflow.com/a/49933068/3509554 - - # Get the unsave instance - instance = forms.ModelForm.save(self, False) - tags = self.cleaned_data.pop("tags") - - #update save_m2m - def new_save_m2m(): - instance.save_tags(tags) - - # Do we need to save all changes now? - self.save_m2m = new_save_m2m - if commit: - instance.save() - - return instance +class SnapshotActionForm(ActionForm): + tag = forms.ModelChoiceField(queryset=Tag.objects.all(), required=False) class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin): list_display = ('added', 'title_str', 'url_str', 'files', 'size') sort_fields = ('title_str', 'url_str', 'added') - readonly_fields = ('id', 'url', 'timestamp', 'num_outputs', 'is_archived', 'url_hash', 'added', 'updated') + readonly_fields = ('uuid', 'num_outputs', 'is_archived', 'url_hash', 'added', 'updated') search_fields = ['url__icontains', 'timestamp', 'title', 'tags__name'] - fields = (*readonly_fields, 'title', 'tags') + fields = ('timestamp', 'url', 'title', 'tags', *readonly_fields) list_filter = ('added', 'updated', 'tags') ordering = ['-added'] - actions = [delete_snapshots, overwrite_snapshots, update_snapshots, update_titles, verify_snapshots] - actions_template = 'admin/actions_as_select.html' - form = SnapshotAdminForm - list_per_page = 40 + actions = ['delete_snapshots', 'overwrite_snapshots', 'update_snapshots', 'update_titles', 'verify_snapshots', 'add_tag', 'remove_tag'] + autocomplete_fields = ['tags'] + inlines = [ArchiveResultInline] + list_per_page = SNAPSHOTS_PER_PAGE + + action_form = SnapshotActionForm def get_urls(self): urls = super().get_urls() @@ -116,21 +74,46 @@ class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin): return custom_urls + urls def get_queryset(self, request): + self.request = request return super().get_queryset(request).prefetch_related('tags') def tag_list(self, obj): return ', '.join(obj.tags.values_list('name', flat=True)) - def id_str(self, obj): + # TODO: figure out a different way to do this, you cant nest forms so this doenst work + # def action(self, obj): + # # csrfmiddlewaretoken: Wa8UcQ4fD3FJibzxqHN3IYrrjLo4VguWynmbzzcPYoebfVUnDovon7GEMYFRgsh0 + # # action: update_snapshots + # # select_across: 0 + # # _selected_action: 76d29b26-2a88-439e-877c-a7cca1b72bb3 + # return format_html( + # ''' + #
+ # + # + # + # + # + # + # + #
+ # ''', + # csrf.get_token(self.request), + # obj.id, + # ) + + def uuid(self, obj): return format_html( - '{}', - obj.url_hash[:8], + '{}
View index ➡️     View actions ⚙️', + obj.id, + obj.timestamp, + obj.id, ) def title_str(self, obj): canon = obj.as_link().canonical_outputs() tags = ''.join( - format_html('{} ', tag.id, tag) + format_html('{} ', tag.id, tag) for tag in obj.tags.all() if str(tag).strip() ) @@ -152,7 +135,7 @@ class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin): return snapshot_icons(obj) def size(self, obj): - archive_size = obj.archive_size + archive_size = (Path(obj.link_dir) / 'index.html').exists() and obj.archive_size if archive_size: size_txt = printable_filesize(archive_size) if archive_size > 52428800: @@ -190,28 +173,136 @@ class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin): rendered_response = self.changelist_view(request) # Restore values - self.change_list_template = saved_change_list_template + self.change_list_template = saved_change_list_template self.list_per_page = saved_list_per_page self.list_max_show_all = saved_list_max_show_all return rendered_response + + + def update_snapshots(self, request, queryset): + archive_links([ + snapshot.as_link() + for snapshot in queryset + ], out_dir=OUTPUT_DIR) + update_snapshots.short_description = "Archive" + + def update_titles(self, request, queryset): + archive_links([ + snapshot.as_link() + for snapshot in queryset + ], overwrite=True, methods=('title','favicon'), out_dir=OUTPUT_DIR) + update_titles.short_description = "Pull title" + + def overwrite_snapshots(self, request, queryset): + archive_links([ + snapshot.as_link() + for snapshot in queryset + ], overwrite=True, out_dir=OUTPUT_DIR) + overwrite_snapshots.short_description = "Re-archive (overwrite)" + + def verify_snapshots(self, request, queryset): + for snapshot in queryset: + print(snapshot.timestamp, snapshot.url, snapshot.is_archived, snapshot.archive_size, len(snapshot.history)) + + verify_snapshots.short_description = "Check" + + def delete_snapshots(self, request, queryset): + remove(snapshots=queryset, yes=True, delete=True, out_dir=OUTPUT_DIR) + + delete_snapshots.short_description = "Delete" + + def add_tag(self, request, queryset): + if tag and tag.id: + tag = request.POST['tag'] + for obj in queryset: + obj.tags.add(tag) + + add_tag.short_description = "Add tag" + + def remove_tag(self, request, queryset): + tag = request.POST['tag'] + for obj in queryset: + obj.tags.remove(tag) + + remove_tag.short_description = "Remove tag" + - id_str.short_description = 'ID' title_str.short_description = 'Title' url_str.short_description = 'Original URL' - id_str.admin_order_field = 'id' title_str.admin_order_field = 'title' url_str.admin_order_field = 'url' + + class TagAdmin(admin.ModelAdmin): - list_display = ('slug', 'name', 'id') + list_display = ('slug', 'name', 'num_snapshots', 'snapshots', 'id') sort_fields = ('id', 'name', 'slug') - readonly_fields = ('id',) + readonly_fields = ('id', 'num_snapshots', 'snapshots') search_fields = ('id', 'name', 'slug') fields = (*readonly_fields, 'name', 'slug') + actions = ['delete_selected'] + ordering = ['-id'] + def num_snapshots(self, obj): + return format_html( + '{} total', + obj.id, + obj.snapshot_set.count(), + ) + + def snapshots(self, obj): + total_count = obj.snapshot_set.count() + return mark_safe('
'.join( + format_html( + '{} [{}] {}', + snap.updated.strftime('%Y-%m-%d %H:%M') if snap.updated else 'pending...', + snap.id, + snap.timestamp, + snap.url, + ) + for snap in obj.snapshot_set.order_by('-updated')[:10] + ) + (f'
and {total_count-10} more...' if obj.snapshot_set.count() > 10 else '')) + + +class ArchiveResultAdmin(admin.ModelAdmin): + list_display = ('id', 'start_ts', 'extractor', 'snapshot_str', 'cmd_str', 'status', 'output_str') + sort_fields = ('start_ts', 'extractor', 'status') + readonly_fields = ('id', 'uuid', 'snapshot_str') + search_fields = ('id', 'uuid', 'snapshot__url', 'extractor', 'output', 'cmd_version', 'cmd', 'snapshot__timestamp') + fields = (*readonly_fields, 'snapshot', 'snapshot__tags', 'extractor', 'status', 'start_ts', 'end_ts', 'pwd', 'cmd', 'cmd_version', 'output') + autocomplete_fields = ['snapshot'] + + list_filter = ('status', 'extractor', 'start_ts', 'cmd_version') + ordering = ['-start_ts'] + list_per_page = SNAPSHOTS_PER_PAGE + + def snapshot_str(self, obj): + return format_html( + '[{}]
' + '{}', + obj.snapshot.timestamp, + obj.snapshot.timestamp, + obj.snapshot.url[:128], + ) + + def cmd_str(self, obj): + return format_html( + '
{}
', + ' '.join(obj.cmd) if isinstance(obj.cmd, list) else str(obj.cmd), + ) + + def output_str(self, obj): + return format_html( + '↗️
{}
', + obj.snapshot.timestamp, + obj.output if (obj.status == 'succeeded') and obj.extractor not in ('title', 'archive_org') else 'index.html', + obj.output, + ) + + snapshot_str.short_description = 'snapshot' class ArchiveBoxAdmin(admin.AdminSite): site_header = 'ArchiveBox' @@ -266,4 +357,5 @@ admin.site = ArchiveBoxAdmin() admin.site.register(get_user_model()) admin.site.register(Snapshot, SnapshotAdmin) admin.site.register(Tag, TagAdmin) +admin.site.register(ArchiveResult, ArchiveResultAdmin) admin.site.disable_action('delete_selected') diff --git a/archivebox/core/forms.py b/archivebox/core/forms.py index ed584c68..e3e904df 100644 --- a/archivebox/core/forms.py +++ b/archivebox/core/forms.py @@ -20,7 +20,8 @@ ARCHIVE_METHODS = [ class AddLinkForm(forms.Form): url = forms.RegexField(label="URLs (one per line)", regex=URL_REGEX, min_length='6', strip=True, widget=forms.Textarea, required=True) - depth = forms.ChoiceField(label="Archive depth", choices=CHOICES, widget=forms.RadioSelect, initial='0') + tag = forms.CharField(label="Tags (comma separated tag1,tag2,tag3)", strip=True, required=False) + depth = forms.ChoiceField(label="Archive depth", choices=CHOICES, initial='0', widget=forms.RadioSelect(attrs={"class": "depth-selection"})) archive_methods = forms.MultipleChoiceField( label="Archive methods (select at least 1, otherwise all will be used by default)", required=False, diff --git a/archivebox/core/migrations/0009_auto_20210216_1038.py b/archivebox/core/migrations/0009_auto_20210216_1038.py new file mode 100644 index 00000000..2817fe54 --- /dev/null +++ b/archivebox/core/migrations/0009_auto_20210216_1038.py @@ -0,0 +1,18 @@ +# Generated by Django 3.1.3 on 2021-02-16 10:38 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('core', '0008_auto_20210105_1421'), + ] + + operations = [ + migrations.AlterField( + model_name='snapshot', + name='updated', + field=models.DateTimeField(auto_now=True, db_index=True, null=True), + ), + ] diff --git a/archivebox/core/migrations/0010_auto_20210216_1055.py b/archivebox/core/migrations/0010_auto_20210216_1055.py new file mode 100644 index 00000000..0af61a39 --- /dev/null +++ b/archivebox/core/migrations/0010_auto_20210216_1055.py @@ -0,0 +1,18 @@ +# Generated by Django 3.1.3 on 2021-02-16 10:55 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('core', '0009_auto_20210216_1038'), + ] + + operations = [ + migrations.AlterField( + model_name='archiveresult', + name='start_ts', + field=models.DateTimeField(db_index=True), + ), + ] diff --git a/archivebox/core/migrations/0011_auto_20210216_1331.py b/archivebox/core/migrations/0011_auto_20210216_1331.py new file mode 100644 index 00000000..d2226674 --- /dev/null +++ b/archivebox/core/migrations/0011_auto_20210216_1331.py @@ -0,0 +1,24 @@ +# Generated by Django 3.1.3 on 2021-02-16 13:31 + +from django.db import migrations, models +import uuid + + +class Migration(migrations.Migration): + + dependencies = [ + ('core', '0010_auto_20210216_1055'), + ] + + operations = [ + migrations.AddField( + model_name='archiveresult', + name='uuid', + field=models.UUIDField(default=uuid.uuid4, editable=False), + ), + migrations.AlterField( + model_name='archiveresult', + name='extractor', + field=models.CharField(choices=[('title', 'title'), ('favicon', 'favicon'), ('headers', 'headers'), ('singlefile', 'singlefile'), ('pdf', 'pdf'), ('screenshot', 'screenshot'), ('dom', 'dom'), ('wget', 'wget'), 
('readability', 'readability'), ('mercury', 'mercury'), ('git', 'git'), ('media', 'media'), ('archive_org', 'archive_org')], max_length=32), + ), + ] diff --git a/archivebox/core/migrations/0012_auto_20210216_1425.py b/archivebox/core/migrations/0012_auto_20210216_1425.py new file mode 100644 index 00000000..310058ac --- /dev/null +++ b/archivebox/core/migrations/0012_auto_20210216_1425.py @@ -0,0 +1,23 @@ +# Generated by Django 3.1.3 on 2021-02-16 14:25 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('core', '0011_auto_20210216_1331'), + ] + + operations = [ + migrations.AlterField( + model_name='archiveresult', + name='cmd_version', + field=models.CharField(blank=True, default=None, max_length=128, null=True), + ), + migrations.AlterField( + model_name='archiveresult', + name='output', + field=models.CharField(max_length=1024), + ), + ] diff --git a/archivebox/core/migrations/0013_auto_20210218_0729.py b/archivebox/core/migrations/0013_auto_20210218_0729.py new file mode 100644 index 00000000..d3fe3b4f --- /dev/null +++ b/archivebox/core/migrations/0013_auto_20210218_0729.py @@ -0,0 +1,18 @@ +# Generated by Django 3.1.3 on 2021-02-18 07:29 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('core', '0012_auto_20210216_1425'), + ] + + operations = [ + migrations.AlterField( + model_name='snapshot', + name='title', + field=models.CharField(blank=True, db_index=True, max_length=256, null=True), + ), + ] diff --git a/archivebox/core/migrations/0014_auto_20210218_0729.py b/archivebox/core/migrations/0014_auto_20210218_0729.py new file mode 100644 index 00000000..db81934f --- /dev/null +++ b/archivebox/core/migrations/0014_auto_20210218_0729.py @@ -0,0 +1,18 @@ +# Generated by Django 3.1.3 on 2021-02-18 07:29 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('core', '0013_auto_20210218_0729'), + ] + + operations = [ + migrations.AlterField( + model_name='snapshot', + name='title', + field=models.CharField(blank=True, db_index=True, max_length=1024, null=True), + ), + ] diff --git a/archivebox/core/migrations/0015_auto_20210218_0730.py b/archivebox/core/migrations/0015_auto_20210218_0730.py new file mode 100644 index 00000000..b782a217 --- /dev/null +++ b/archivebox/core/migrations/0015_auto_20210218_0730.py @@ -0,0 +1,18 @@ +# Generated by Django 3.1.3 on 2021-02-18 07:30 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('core', '0014_auto_20210218_0729'), + ] + + operations = [ + migrations.AlterField( + model_name='snapshot', + name='title', + field=models.CharField(blank=True, db_index=True, max_length=512, null=True), + ), + ] diff --git a/archivebox/core/migrations/0016_auto_20210218_1204.py b/archivebox/core/migrations/0016_auto_20210218_1204.py new file mode 100644 index 00000000..4637feab --- /dev/null +++ b/archivebox/core/migrations/0016_auto_20210218_1204.py @@ -0,0 +1,18 @@ +# Generated by Django 3.1.3 on 2021-02-18 12:04 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('core', '0015_auto_20210218_0730'), + ] + + operations = [ + migrations.AlterField( + model_name='snapshot', + name='tags', + field=models.ManyToManyField(blank=True, to='core.Tag'), + ), + ] diff --git a/archivebox/core/migrations/0017_auto_20210219_0211.py 
b/archivebox/core/migrations/0017_auto_20210219_0211.py new file mode 100644 index 00000000..221a250b --- /dev/null +++ b/archivebox/core/migrations/0017_auto_20210219_0211.py @@ -0,0 +1,18 @@ +# Generated by Django 3.1.3 on 2021-02-19 02:11 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('core', '0016_auto_20210218_1204'), + ] + + operations = [ + migrations.AlterField( + model_name='tag', + name='slug', + field=models.SlugField(blank=True, max_length=100, unique=True, verbose_name='slug'), + ), + ] diff --git a/archivebox/core/models.py b/archivebox/core/models.py index 13d75b66..e7741920 100644 --- a/archivebox/core/models.py +++ b/archivebox/core/models.py @@ -2,12 +2,15 @@ __package__ = 'archivebox.core' import uuid -from django.db import models, transaction +from django.db import models from django.utils.functional import cached_property from django.utils.text import slugify +from django.core.cache import cache from django.db.models import Case, When, Value, IntegerField -from ..util import parse_date +from ..config import ARCHIVE_DIR, ARCHIVE_DIR_NAME +from ..system import get_dir_size +from ..util import parse_date, base_url, hashurl from ..index.schema import Link from ..extractors import get_default_archive_methods, ARCHIVE_METHODS_INDEXING_PRECEDENCE @@ -29,8 +32,11 @@ class Tag(models.Model): """ Based on django-taggit model """ - name = models.CharField(verbose_name="name", unique=True, blank=False, max_length=100) - slug = models.SlugField(verbose_name="slug", unique=True, max_length=100) + name = models.CharField(unique=True, blank=False, max_length=100) + + # slug is autoset on save from name, never set it manually + slug = models.SlugField(unique=True, blank=True, max_length=100) + class Meta: verbose_name = "Tag" @@ -49,20 +55,21 @@ class Tag(models.Model): if self._state.adding and not self.slug: self.slug = self.slugify(self.name) - with transaction.atomic(): - slugs = set( - type(self) - ._default_manager.filter(slug__startswith=self.slug) - .values_list("slug", flat=True) - ) + # if name is different but slug conficts with another tags slug, append a counter + # with transaction.atomic(): + slugs = set( + type(self) + ._default_manager.filter(slug__startswith=self.slug) + .values_list("slug", flat=True) + ) - i = None - while True: - slug = self.slugify(self.name, i) - if slug not in slugs: - self.slug = slug - return super().save(*args, **kwargs) - i = 1 if i is None else i+1 + i = None + while True: + slug = self.slugify(self.name, i) + if slug not in slugs: + self.slug = slug + return super().save(*args, **kwargs) + i = 1 if i is None else i+1 else: return super().save(*args, **kwargs) @@ -73,11 +80,11 @@ class Snapshot(models.Model): url = models.URLField(unique=True) timestamp = models.CharField(max_length=32, unique=True, db_index=True) - title = models.CharField(max_length=128, null=True, blank=True, db_index=True) + title = models.CharField(max_length=512, null=True, blank=True, db_index=True) added = models.DateTimeField(auto_now_add=True, db_index=True) - updated = models.DateTimeField(null=True, blank=True, db_index=True) - tags = models.ManyToManyField(Tag) + updated = models.DateTimeField(auto_now=True, blank=True, null=True, db_index=True) + tags = models.ManyToManyField(Tag, blank=True) keys = ('url', 'timestamp', 'title', 'tags', 'updated') @@ -109,13 +116,24 @@ class Snapshot(models.Model): from ..index import load_link_details return load_link_details(self.as_link()) - def tags_str(self) 
-> str: - return ','.join(self.tags.order_by('name').values_list('name', flat=True)) + def tags_str(self, nocache=True) -> str: + cache_key = f'{self.id}-{(self.updated or self.added).timestamp()}-tags' + calc_tags_str = lambda: ','.join(self.tags.order_by('name').values_list('name', flat=True)) + if nocache: + tags_str = calc_tags_str() + cache.set(cache_key, tags_str) + return tags_str + return cache.get_or_set(cache_key, calc_tags_str) @cached_property def bookmarked(self): return parse_date(self.timestamp) + @cached_property + def bookmarked_date(self): + # TODO: remove this + return self.bookmarked + @cached_property def is_archived(self): return self.as_link().is_archived @@ -126,23 +144,31 @@ class Snapshot(models.Model): @cached_property def url_hash(self): - return self.as_link().url_hash + return hashurl(self.url) @cached_property def base_url(self): - return self.as_link().base_url + return base_url(self.url) @cached_property def link_dir(self): - return self.as_link().link_dir + return str(ARCHIVE_DIR / self.timestamp) @cached_property def archive_path(self): - return self.as_link().archive_path + return '{}/{}'.format(ARCHIVE_DIR_NAME, self.timestamp) @cached_property def archive_size(self): - return self.as_link().archive_size + cache_key = f'{str(self.id)[:12]}-{(self.updated or self.added).timestamp()}-size' + + def calc_dir_size(): + try: + return get_dir_size(self.link_dir)[0] + except Exception: + return 0 + + return cache.get_or_set(cache_key, calc_dir_size) @cached_property def history(self): @@ -151,17 +177,40 @@ class Snapshot(models.Model): @cached_property def latest_title(self): - if ('title' in self.history - and self.history['title'] - and (self.history['title'][-1].status == 'succeeded') - and self.history['title'][-1].output.strip()): - return self.history['title'][-1].output.strip() + if self.title: + return self.title # whoopdedoo that was easy + + try: + # take longest successful title from ArchiveResult db history + return sorted( + self.archiveresult_set\ + .filter(extractor='title', status='succeeded', output__isnull=False)\ + .values_list('output', flat=True), + key=lambda r: len(r), + )[-1] + except IndexError: + pass + + try: + # take longest successful title from Link json index file history + return sorted( + ( + result.output.strip() + for result in self.history['title'] + if result.status == 'succeeded' and result.output.strip() + ), + key=lambda r: len(r), + )[-1] + except (KeyError, IndexError): + pass + return None def save_tags(self, tags=()): tags_id = [] for tag in tags: - tags_id.append(Tag.objects.get_or_create(name=tag)[0].id) + if tag.strip(): + tags_id.append(Tag.objects.get_or_create(name=tag)[0].id) self.tags.clear() self.tags.add(*tags_id) @@ -178,15 +227,18 @@ class ArchiveResultManager(models.Manager): class ArchiveResult(models.Model): + id = models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID') + uuid = models.UUIDField(default=uuid.uuid4, editable=False) + snapshot = models.ForeignKey(Snapshot, on_delete=models.CASCADE) + extractor = models.CharField(choices=EXTRACTORS, max_length=32) cmd = JSONField() pwd = models.CharField(max_length=256) - cmd_version = models.CharField(max_length=32, default=None, null=True, blank=True) - output = models.CharField(max_length=512) - start_ts = models.DateTimeField() + cmd_version = models.CharField(max_length=128, default=None, null=True, blank=True) + output = models.CharField(max_length=1024) + start_ts = models.DateTimeField(db_index=True) end_ts = 
models.DateTimeField() status = models.CharField(max_length=16, choices=STATUS_CHOICES) - extractor = models.CharField(choices=EXTRACTORS, max_length=32) objects = ArchiveResultManager() diff --git a/archivebox/core/settings.py b/archivebox/core/settings.py index e73c93d9..6a795702 100644 --- a/archivebox/core/settings.py +++ b/archivebox/core/settings.py @@ -2,6 +2,9 @@ __package__ = 'archivebox.core' import os import sys +import re +import logging +import tempfile from pathlib import Path from django.utils.crypto import get_random_string @@ -14,6 +17,7 @@ from ..config import ( TEMPLATES_DIR_NAME, SQL_INDEX_FILENAME, OUTPUT_DIR, + LOGS_DIR, ) @@ -62,6 +66,40 @@ AUTHENTICATION_BACKENDS = [ 'django.contrib.auth.backends.ModelBackend', ] +# only enable debug toolbar when in DEBUG mode with --nothreading (it doesnt work in multithreaded mode) +DEBUG_TOOLBAR = DEBUG and ('--nothreading' in sys.argv) and ('--reload' not in sys.argv) +if DEBUG_TOOLBAR: + try: + import debug_toolbar # noqa + DEBUG_TOOLBAR = True + except ImportError: + DEBUG_TOOLBAR = False + +if DEBUG_TOOLBAR: + INSTALLED_APPS = [*INSTALLED_APPS, 'debug_toolbar'] + INTERNAL_IPS = ['0.0.0.0', '127.0.0.1', '*'] + DEBUG_TOOLBAR_CONFIG = { + "SHOW_TOOLBAR_CALLBACK": lambda request: True, + "RENDER_PANELS": True, + } + DEBUG_TOOLBAR_PANELS = [ + 'debug_toolbar.panels.history.HistoryPanel', + 'debug_toolbar.panels.versions.VersionsPanel', + 'debug_toolbar.panels.timer.TimerPanel', + 'debug_toolbar.panels.settings.SettingsPanel', + 'debug_toolbar.panels.headers.HeadersPanel', + 'debug_toolbar.panels.request.RequestPanel', + 'debug_toolbar.panels.sql.SQLPanel', + 'debug_toolbar.panels.staticfiles.StaticFilesPanel', + # 'debug_toolbar.panels.templates.TemplatesPanel', + 'debug_toolbar.panels.cache.CachePanel', + 'debug_toolbar.panels.signals.SignalsPanel', + 'debug_toolbar.panels.logging.LoggingPanel', + 'debug_toolbar.panels.redirects.RedirectsPanel', + 'debug_toolbar.panels.profiling.ProfilingPanel', + 'djdt_flamegraph.FlamegraphPanel', + ] + MIDDLEWARE = [*MIDDLEWARE, 'debug_toolbar.middleware.DebugToolbarMiddleware'] ################################################################################ ### Staticfile and Template Settings @@ -107,6 +145,22 @@ DATABASES = { 'default': { 'ENGINE': 'django.db.backends.sqlite3', 'NAME': DATABASE_NAME, + 'OPTIONS': { + 'timeout': 60, + 'check_same_thread': False, + }, + # DB setup is sometimes modified at runtime by setup_django() in config.py + } +} + +CACHE_BACKEND = 'django.core.cache.backends.locmem.LocMemCache' +# CACHE_BACKEND = 'django.core.cache.backends.db.DatabaseCache' +# CACHE_BACKEND = 'django.core.cache.backends.dummy.DummyCache' + +CACHES = { + 'default': { + 'BACKEND': CACHE_BACKEND, + 'LOCATION': 'django_cache_default', } } @@ -117,7 +171,7 @@ EMAIL_BACKEND = 'django.core.mail.backends.console.EmailBackend' ### Security Settings ################################################################################ -SECRET_KEY = SECRET_KEY or get_random_string(50, 'abcdefghijklmnopqrstuvwxyz0123456789-_+!.') +SECRET_KEY = SECRET_KEY or get_random_string(50, 'abcdefghijklmnopqrstuvwxyz0123456789_') ALLOWED_HOSTS = ALLOWED_HOSTS.split(',') @@ -131,6 +185,8 @@ SESSION_COOKIE_AGE = 1209600 # 2 weeks SESSION_EXPIRE_AT_BROWSER_CLOSE = False SESSION_SAVE_EVERY_REQUEST = True +SESSION_ENGINE = "django.contrib.sessions.backends.db" + AUTH_PASSWORD_VALIDATORS = [ {'NAME': 'django.contrib.auth.password_validation.UserAttributeSimilarityValidator'}, {'NAME': 
'django.contrib.auth.password_validation.MinimumLengthValidator'}, @@ -163,3 +219,73 @@ USE_TZ = False DATETIME_FORMAT = 'Y-m-d g:iA' SHORT_DATETIME_FORMAT = 'Y-m-d h:iA' + + +################################################################################ +### Logging Settings +################################################################################ + +IGNORABLE_404_URLS = [ + re.compile(r'apple-touch-icon.*\.png$'), + re.compile(r'favicon\.ico$'), + re.compile(r'robots\.txt$'), + re.compile(r'.*\.(css|js)\.map$'), +] + +class NoisyRequestsFilter(logging.Filter): + def filter(self, record): + logline = record.getMessage() + + # ignore harmless 404s for the patterns in IGNORABLE_404_URLS + for ignorable_url_pattern in IGNORABLE_404_URLS: + ignorable_log_pattern = re.compile(f'^"GET /.*/?{ignorable_url_pattern.pattern[:-1]} HTTP/.*" (200|30.|404) .+$', re.I | re.M) + if ignorable_log_pattern.match(logline): + return 0 + + # ignore staticfile requests that 200 or 30* + ignoreable_200_log_pattern = re.compile(r'"GET /static/.* HTTP/.*" (200|30.) .+', re.I | re.M) + if ignoreable_200_log_pattern.match(logline): + return 0 + + return 1 + +if LOGS_DIR.exists(): + ERROR_LOG = (LOGS_DIR / 'errors.log') +else: + # meh too many edge cases here around creating log dir w/ correct permissions + # cant be bothered, just trash the log and let them figure it out via stdout/stderr + ERROR_LOG = tempfile.NamedTemporaryFile().name + +LOGGING = { + 'version': 1, + 'disable_existing_loggers': False, + 'handlers': { + 'console': { + 'class': 'logging.StreamHandler', + }, + 'logfile': { + 'level': 'ERROR', + 'class': 'logging.handlers.RotatingFileHandler', + 'filename': ERROR_LOG, + 'maxBytes': 1024 * 1024 * 25, # 25 MB + 'backupCount': 10, + }, + }, + 'filters': { + 'noisyrequestsfilter': { + '()': NoisyRequestsFilter, + } + }, + 'loggers': { + 'django': { + 'handlers': ['console', 'logfile'], + 'level': 'INFO', + 'filters': ['noisyrequestsfilter'], + }, + 'django.server': { + 'handlers': ['console', 'logfile'], + 'level': 'INFO', + 'filters': ['noisyrequestsfilter'], + } + }, +} diff --git a/archivebox/core/urls.py b/archivebox/core/urls.py index 182e4dca..87a302b8 100644 --- a/archivebox/core/urls.py +++ b/archivebox/core/urls.py @@ -2,6 +2,7 @@ from django.contrib import admin from django.urls import path, include from django.views import static +from django.contrib.staticfiles.urls import staticfiles_urlpatterns from django.conf import settings from django.views.generic.base import RedirectView @@ -13,8 +14,8 @@ from core.views import HomepageView, SnapshotView, PublicIndexView, AddView urlpatterns = [ path('public/', PublicIndexView.as_view(), name='public-index'), - path('robots.txt', static.serve, {'document_root': settings.OUTPUT_DIR, 'path': 'robots.txt'}), - path('favicon.ico', static.serve, {'document_root': settings.OUTPUT_DIR, 'path': 'favicon.ico'}), + path('robots.txt', static.serve, {'document_root': settings.STATICFILES_DIRS[0], 'path': 'robots.txt'}), + path('favicon.ico', static.serve, {'document_root': settings.STATICFILES_DIRS[0], 'path': 'favicon.ico'}), path('docs/', RedirectView.as_view(url='https://github.com/ArchiveBox/ArchiveBox/wiki'), name='Docs'), @@ -35,35 +36,43 @@ urlpatterns = [ path('index.json', static.serve, {'document_root': settings.OUTPUT_DIR, 'path': 'index.json'}), path('', HomepageView.as_view(), name='Home'), ] +urlpatterns += staticfiles_urlpatterns() - # # Proposed UI URLs spec - # path('', HomepageView) - # path('/add', AddView) - # path('/public', 
PublicIndexView) - # path('/snapshot/:slug', SnapshotView) - - # path('/admin', admin.site.urls) - # path('/accounts', django.contrib.auth.urls) +if settings.DEBUG_TOOLBAR: + import debug_toolbar + urlpatterns += [ + path('__debug__/', include(debug_toolbar.urls)), + ] - # # Prposed REST API spec - # # :slugs can be uuid, short_uuid, or any of the unique index_fields - # path('api/v1/'), - # path('api/v1/core/' [GET]) - # path('api/v1/core/snapshot/', [GET, POST, PUT]), - # path('api/v1/core/snapshot/:slug', [GET, PATCH, DELETE]), - # path('api/v1/core/archiveresult', [GET, POST, PUT]), - # path('api/v1/core/archiveresult/:slug', [GET, PATCH, DELETE]), - # path('api/v1/core/tag/', [GET, POST, PUT]), - # path('api/v1/core/tag/:slug', [GET, PATCH, DELETE]), - # path('api/v1/cli/', [GET]) - # path('api/v1/cli/{add,list,config,...}', [POST]), # pass query as kwargs directly to `run_subcommand` and return stdout, stderr, exitcode +# # Proposed FUTURE URLs spec +# path('', HomepageView) +# path('/add', AddView) +# path('/public', PublicIndexView) +# path('/snapshot/:slug', SnapshotView) - # path('api/v1/extractors/', [GET]) - # path('api/v1/extractors/:extractor/', [GET]), - # path('api/v1/extractors/:extractor/:func', [GET, POST]), # pass query as args directly to chosen function +# path('/admin', admin.site.urls) +# path('/accounts', django.contrib.auth.urls) - # future, just an idea: - # path('api/v1/scheduler/', [GET]) - # path('api/v1/scheduler/task/', [GET, POST, PUT]), - # path('api/v1/scheduler/task/:slug', [GET, PATCH, DELETE]), +# # Prposed REST API spec +# # :slugs can be uuid, short_uuid, or any of the unique index_fields +# path('api/v1/'), +# path('api/v1/core/' [GET]) +# path('api/v1/core/snapshot/', [GET, POST, PUT]), +# path('api/v1/core/snapshot/:slug', [GET, PATCH, DELETE]), +# path('api/v1/core/archiveresult', [GET, POST, PUT]), +# path('api/v1/core/archiveresult/:slug', [GET, PATCH, DELETE]), +# path('api/v1/core/tag/', [GET, POST, PUT]), +# path('api/v1/core/tag/:slug', [GET, PATCH, DELETE]), + +# path('api/v1/cli/', [GET]) +# path('api/v1/cli/{add,list,config,...}', [POST]), # pass query as kwargs directly to `run_subcommand` and return stdout, stderr, exitcode + +# path('api/v1/extractors/', [GET]) +# path('api/v1/extractors/:extractor/', [GET]), +# path('api/v1/extractors/:extractor/:func', [GET, POST]), # pass query as args directly to chosen function + +# future, just an idea: +# path('api/v1/scheduler/', [GET]) +# path('api/v1/scheduler/task/', [GET, POST, PUT]), +# path('api/v1/scheduler/task/:slug', [GET, PATCH, DELETE]), diff --git a/archivebox/core/views.py b/archivebox/core/views.py index 0e19fad6..36794a8d 100644 --- a/archivebox/core/views.py +++ b/archivebox/core/views.py @@ -4,8 +4,8 @@ from io import StringIO from contextlib import redirect_stdout from django.shortcuts import render, redirect - -from django.http import HttpResponse +from django.http import HttpResponse, Http404 +from django.utils.html import format_html, mark_safe from django.views import View, static from django.views.generic.list import ListView from django.views.generic import FormView @@ -22,6 +22,7 @@ from ..config import ( PUBLIC_ADD_VIEW, VERSION, FOOTER_INFO, + SNAPSHOTS_PER_PAGE, ) from main import add from ..util import base_url, ansi_to_html @@ -43,10 +44,6 @@ class SnapshotView(View): # render static html index from filesystem archive//index.html def get(self, request, path): - # missing trailing slash -> redirect to index - if '/' not in path: - return 
redirect(f'{path}/index.html') - if not request.user.is_authenticated and not PUBLIC_SNAPSHOTS: return redirect(f'/admin/login/?next={request.path}') @@ -55,46 +52,163 @@ class SnapshotView(View): except (IndexError, ValueError): slug, archivefile = path.split('/', 1)[0], 'index.html' - all_pages = list(Snapshot.objects.all()) - # slug is a timestamp - by_ts = {page.timestamp: page for page in all_pages} - try: - # print('SERVING STATICFILE', by_ts[slug].link_dir, request.path, path) - response = static.serve(request, archivefile, document_root=by_ts[slug].link_dir, show_indexes=True) - response["Link"] = f'<{by_ts[slug].url}>; rel="canonical"' - return response - except KeyError: - pass + if slug.replace('.','').isdigit(): - # slug is a hash - by_hash = {page.url_hash: page for page in all_pages} - try: - timestamp = by_hash[slug].timestamp - return redirect(f'/archive/{timestamp}/{archivefile}') - except KeyError: - pass + # missing trailing slash -> redirect to index + if '/' not in path: + return redirect(f'{path}/index.html') + try: + try: + snapshot = Snapshot.objects.get(Q(timestamp=slug) | Q(id__startswith=slug)) + response = static.serve(request, archivefile, document_root=snapshot.link_dir, show_indexes=True) + response["Link"] = f'<{snapshot.url}>; rel="canonical"' + return response + except Snapshot.DoesNotExist: + if Snapshot.objects.filter(timestamp__startswith=slug).exists(): + raise Snapshot.MultipleObjectsReturned + else: + raise + except Snapshot.DoesNotExist: + # Snapshot does not exist + return HttpResponse( + format_html( + ( + '<br/><br/><br/>' + 'No Snapshot directories match the given timestamp or UUID: {}<br/><br/>' + 'You can add a new Snapshot, or return to the Main Index' + '<br/><br/>' + ), + slug, + path, + ), + content_type="text/html", + status=404, + )
+ except Snapshot.MultipleObjectsReturned: + snapshot_hrefs = mark_safe('<br/>').join( + format_html( + '{} <a href="/archive/{}/index.html">{}</a> {} {}', + snap.added.strftime('%Y-%m-%d %H:%M:%S'), + snap.timestamp, + snap.timestamp, + snap.url, + snap.title or '', + ) + for snap in Snapshot.objects.filter(timestamp__startswith=slug).only('url', 'timestamp', 'title', 'added').order_by('-added') + ) + return HttpResponse( + format_html( + ( + 'Multiple Snapshots match the given timestamp/UUID {}<br/><pre>' + ), + slug, + ) + snapshot_hrefs + format_html( + ( + '</pre><br/>' + 'Choose a Snapshot to proceed or go back to the Main Index' + ) + ), + content_type="text/html", + status=404, + )
+ except Http404: + # Snapshot dir exists but file within does not e.g. 124235.324234/screenshot.png + return HttpResponse( + format_html( + ( + '<br/><br/><br/>' + f'Snapshot [{snapshot.timestamp}] exists in DB, but resource {snapshot.timestamp}/' + '{}' + f' does not exist in snapshot dir yet.<br/><br/>' + 'Maybe this resource type is not available for this Snapshot,<br/> or the archiving process has not completed yet?<br/>' + f'<pre># run this cmd to finish archiving this Snapshot<br/>archivebox update -t timestamp {snapshot.timestamp}</pre><br/><br/>' + '<div>' + 'Next steps:<br/>' + f'- list all the Snapshot files .*<br/>' + f'- view the Snapshot ./index.html<br/>' + f'- go to the Snapshot admin to edit<br/>' + f'- go to the Snapshot actions to re-archive<br/>' + '- or return to the main index...<br/>' + '</div>' + ), + archivefile, + ), + content_type="text/html", + status=404, + )
# slug is a URL - by_url = {page.base_url: page for page in all_pages} try: - # TODO: add multiple snapshot support by showing index of all snapshots - # for given url instead of redirecting to timestamp index - timestamp = by_url[base_url(path)].timestamp - return redirect(f'/archive/{timestamp}/index.html') - except KeyError: - pass - - return HttpResponse( - 'No archived link matches the given timestamp or hash.', - content_type="text/plain", - status=404, - ) + try: + try: + # try exact match on full url first + snapshot = Snapshot.objects.get( + Q(url='http://' + path) | Q(url='https://' + path) | Q(id__startswith=path) + ) + except Snapshot.DoesNotExist: + # fall back to match on exact base_url + try: + snapshot = Snapshot.objects.get( + Q(url='http://' + base_url(path)) | Q(url='https://' + base_url(path)) + ) + except Snapshot.DoesNotExist: + # fall back to matching base_url as prefix + snapshot = Snapshot.objects.get( + Q(url__startswith='http://' + base_url(path)) | Q(url__startswith='https://' + base_url(path)) + ) + return redirect(f'/archive/{snapshot.timestamp}/index.html')
+ except Snapshot.DoesNotExist: + return HttpResponse( + format_html( + ( + '<br/><br/><br/>' + 'No Snapshots match the given url: {}<br/><br/>' + 'Return to the Main Index, or:<br/>' + '+ <a href="/add/?url={}">Add a new Snapshot for {}</a><br/>' + '<br/>' + ), + base_url(path), + path if '://' in path else f'https://{path}', + path, + ), + content_type="text/html", + status=404, + )
+ except Snapshot.MultipleObjectsReturned: + snapshot_hrefs = mark_safe('<br/>').join( + format_html( + '{} <a href="/archive/{}/index.html">{}</a> {} {}', + snap.added.strftime('%Y-%m-%d %H:%M:%S'), + snap.timestamp, + snap.timestamp, + snap.url, + snap.title or '', + ) + for snap in Snapshot.objects.filter( + Q(url__startswith='http://' + base_url(path)) | Q(url__startswith='https://' + base_url(path)) + ).only('url', 'timestamp', 'title', 'added').order_by('-added') + ) + return HttpResponse( + format_html( + ( + 'Multiple Snapshots match the given URL {}<br/><pre>' + ), + base_url(path), + ) + snapshot_hrefs + format_html( + ( + '</pre><br/>
' + 'Choose a Snapshot to proceed or go back to the Main Index' + ) + ), + content_type="text/html", + status=404, + ) + class PublicIndexView(ListView): template_name = 'public_index.html' model = Snapshot - paginate_by = 100 + paginate_by = SNAPSHOTS_PER_PAGE ordering = ['title'] def get_context_data(self, **kwargs): @@ -105,12 +219,14 @@ class PublicIndexView(ListView): } def get_queryset(self, **kwargs): - qs = super().get_queryset(**kwargs) + qs = super().get_queryset(**kwargs) query = self.request.GET.get('q') if query: qs = qs.filter(Q(title__icontains=query) | Q(url__icontains=query) | Q(timestamp__icontains=query) | Q(tags__name__icontains=query)) + for snapshot in qs: - snapshot.icons = snapshot_icons(snapshot) + # lazy load snapshot icons, otherwise it will load icons for entire index at once + snapshot.icons = lambda: snapshot_icons(snapshot) return qs def get(self, *args, **kwargs): @@ -130,9 +246,9 @@ class AddView(UserPassesTestMixin, FormView): if self.request.method == 'GET': url = self.request.GET.get('url', None) if url: - return {'url': url} - else: - return super().get_initial() + return {'url': url if '://' in url else f'https://{url}'} + + return super().get_initial() def test_func(self): return PUBLIC_ADD_VIEW or self.request.user.is_authenticated @@ -145,15 +261,18 @@ class AddView(UserPassesTestMixin, FormView): 'absolute_add_path': self.request.build_absolute_uri(self.request.path), 'VERSION': VERSION, 'FOOTER_INFO': FOOTER_INFO, + 'stdout': '', } def form_valid(self, form): url = form.cleaned_data["url"] print(f'[+] Adding URL: {url}') + tag = form.cleaned_data["tag"] depth = 0 if form.cleaned_data["depth"] == "0" else 1 extractors = ','.join(form.cleaned_data["archive_methods"]) input_kwargs = { "urls": url, + "tag": tag, "depth": depth, "update_all": False, "out_dir": OUTPUT_DIR, diff --git a/archivebox/core/wsgi.py b/archivebox/core/wsgi.py index f933afae..94993b92 100644 --- a/archivebox/core/wsgi.py +++ b/archivebox/core/wsgi.py @@ -7,10 +7,10 @@ For more information on this file, see https://docs.djangoproject.com/en/2.1/howto/deployment/wsgi/ """ -import os + +from archivebox.config import setup_django +setup_django(in_memory_db=False, check_db=True) from django.core.wsgi import get_wsgi_application -os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.settings') - application = get_wsgi_application() diff --git a/archivebox/extractors/__init__.py b/archivebox/extractors/__init__.py index 15968097..09b56c66 100644 --- a/archivebox/extractors/__init__.py +++ b/archivebox/extractors/__init__.py @@ -44,16 +44,16 @@ def get_default_archive_methods(): return [ ('title', should_save_title, save_title), ('favicon', should_save_favicon, save_favicon), - ('wget', should_save_wget, save_wget), + ('headers', should_save_headers, save_headers), ('singlefile', should_save_singlefile, save_singlefile), ('pdf', should_save_pdf, save_pdf), ('screenshot', should_save_screenshot, save_screenshot), ('dom', should_save_dom, save_dom), - ('readability', should_save_readability, save_readability), #keep readability below wget and singlefile, as it depends on them + ('wget', should_save_wget, save_wget), + ('readability', should_save_readability, save_readability), # keep readability below wget and singlefile, as it depends on them ('mercury', should_save_mercury, save_mercury), ('git', should_save_git, save_git), ('media', should_save_media, save_media), - ('headers', should_save_headers, save_headers), ('archive_org', should_save_archive_dot_org, save_archive_dot_org), 
] @@ -115,6 +115,13 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s ArchiveResult.objects.create(snapshot=snapshot, extractor=method_name, cmd=result.cmd, cmd_version=result.cmd_version, output=result.output, pwd=result.pwd, start_ts=result.start_ts, end_ts=result.end_ts, status=result.status) + + # bump the updated time on the main Snapshot here, this is critical + # to be able to cache summaries of the ArchiveResults for a given + # snapshot without having to load all the results from the DB each time. + # (we use {Snapshot.id}-{Snapshot.updated} as the cache key and assume + # ArchiveResults are unchanged as long as the updated timestamp is unchanged) + snapshot.save() else: # print('{black} X {}{reset}'.format(method_name, **ANSI)) stats['skipped'] += 1 diff --git a/archivebox/extractors/archive_org.py b/archivebox/extractors/archive_org.py index 1f382190..a0883113 100644 --- a/archivebox/extractors/archive_org.py +++ b/archivebox/extractors/archive_org.py @@ -31,7 +31,7 @@ def should_save_archive_dot_org(link: Link, out_dir: Optional[Path]=None, overwr out_dir = out_dir or Path(link.link_dir) if not overwrite and (out_dir / 'archive.org.txt').exists(): - # if open(path, 'r').read().strip() != 'None': + # if open(path, 'r', encoding='utf-8').read().strip() != 'None': return False return SAVE_ARCHIVE_DOT_ORG diff --git a/archivebox/extractors/mercury.py b/archivebox/extractors/mercury.py index d9e32c0a..e7d20362 100644 --- a/archivebox/extractors/mercury.py +++ b/archivebox/extractors/mercury.py @@ -54,11 +54,13 @@ def save_mercury(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) out_dir = Path(out_dir or link.link_dir) output_folder = out_dir.absolute() / "mercury" - output = str(output_folder) + output = "mercury" status = 'succeeded' timer = TimedProgress(timeout, prefix=' ') try: + output_folder.mkdir(exist_ok=True) + # Get plain text version of article cmd = [ DEPENDENCIES['MERCURY_BINARY']['path'], @@ -71,6 +73,11 @@ def save_mercury(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) except json.JSONDecodeError: raise ShellError(cmd, result) + if article_text.get('failed'): + raise ArchiveError('Mercury was not able to get article text from the URL') + + atomic_write(str(output_folder / "content.txt"), article_text["content"]) + # Get HTML version of article cmd = [ DEPENDENCIES['MERCURY_BINARY']['path'], @@ -82,9 +89,10 @@ def save_mercury(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) except json.JSONDecodeError: raise ShellError(cmd, result) - output_folder.mkdir(exist_ok=True) + if article_text.get('failed'): + raise ArchiveError('Mercury was not able to get article HTML from the URL') + atomic_write(str(output_folder / "content.html"), article_json.pop("content")) - atomic_write(str(output_folder / "content.txt"), article_text["content"]) atomic_write(str(output_folder / "article.json"), article_json) # Check for common failure cases diff --git a/archivebox/extractors/readability.py b/archivebox/extractors/readability.py index 6e48cd9a..d7c1e303 100644 --- a/archivebox/extractors/readability.py +++ b/archivebox/extractors/readability.py @@ -35,7 +35,7 @@ def get_html(link: Link, path: Path) -> str: document = None for source in sources: try: - with open(abs_path / source, "r") as f: + with open(abs_path / source, "r", encoding="utf-8") as f: document = f.read() break except (FileNotFoundError, TypeError): @@ -63,7 +63,7 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: 
int=TIMEO out_dir = Path(out_dir or link.link_dir) output_folder = out_dir.absolute() / "readability" - output = str(output_folder) + output = "readability" # Readability Docs: https://github.com/mozilla/readability @@ -81,13 +81,20 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO temp_doc.write(document.encode("utf-8")) temp_doc.close() + if not document or len(document) < 10: + raise ArchiveError('Readability could not find HTML to parse for article text') + cmd = [ DEPENDENCIES['READABILITY_BINARY']['path'], - temp_doc.name + temp_doc.name, ] result = run(cmd, cwd=out_dir, timeout=timeout) - result_json = json.loads(result.stdout) + try: + result_json = json.loads(result.stdout) + except json.JSONDecodeError: + raise ArchiveError('Readability was not able to archive the page', result.stdout + result.stderr) + output_folder.mkdir(exist_ok=True) readability_content = result_json.pop("textContent") atomic_write(str(output_folder / "content.html"), result_json.pop("content")) @@ -112,6 +119,7 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO except (Exception, OSError) as err: status = 'failed' output = err + cmd = [cmd[0], './{singlefile,dom}.html'] finally: timer.end() @@ -121,6 +129,6 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO cmd_version=READABILITY_VERSION, output=output, status=status, - index_texts= [readability_content] if readability_content else [], + index_texts=[readability_content] if readability_content else [], **timer.stats, ) diff --git a/archivebox/index/__init__.py b/archivebox/index/__init__.py index 04ab0a8d..d3d1bedc 100644 --- a/archivebox/index/__init__.py +++ b/archivebox/index/__init__.py @@ -356,6 +356,7 @@ LINK_FILTERS = { 'regex': lambda pattern: Q(url__iregex=pattern), 'domain': lambda pattern: Q(url__istartswith=f"http://{pattern}") | Q(url__istartswith=f"https://{pattern}") | Q(url__istartswith=f"ftp://{pattern}"), 'tag': lambda pattern: Q(tags__name=pattern), + 'timestamp': lambda pattern: Q(timestamp=pattern), } @enforce_types diff --git a/archivebox/index/html.py b/archivebox/index/html.py index ebfe7d78..c4f66f55 100644 --- a/archivebox/index/html.py +++ b/archivebox/index/html.py @@ -1,11 +1,12 @@ __package__ = 'archivebox.index' -from datetime import datetime -from typing import List, Optional, Iterator, Mapping from pathlib import Path +from datetime import datetime +from collections import defaultdict +from typing import List, Optional, Iterator, Mapping from django.utils.html import format_html, mark_safe -from collections import defaultdict +from django.core.cache import cache from .schema import Link from ..system import atomic_write @@ -20,7 +21,6 @@ from ..util import ( from ..config import ( OUTPUT_DIR, VERSION, - GIT_SHA, FOOTER_INFO, HTML_INDEX_FILENAME, SAVE_ARCHIVE_DOT_ORG, @@ -60,7 +60,7 @@ def main_index_template(links: List[Link], template: str=MAIN_INDEX_TEMPLATE) -> return render_django_template(template, { 'version': VERSION, - 'git_sha': GIT_SHA, + 'git_sha': VERSION, # not used anymore, but kept for backwards compatibility 'num_links': str(len(links)), 'date_updated': datetime.now().strftime('%Y-%m-%d'), 'time_updated': datetime.now().strftime('%Y-%m-%d %H:%M'), @@ -116,71 +116,78 @@ def render_django_template(template: str, context: Mapping[str, str]) -> str: def snapshot_icons(snapshot) -> str: - from core.models import EXTRACTORS + cache_key = f'{str(snapshot.id)[:12]}-{(snapshot.updated or 
snapshot.added).timestamp()}-snapshot-icons' + + def calc_snapshot_icons(): + from core.models import EXTRACTORS + # start = datetime.now() - # start = datetime.now() + archive_results = snapshot.archiveresult_set.filter(status="succeeded", output__isnull=False) + link = snapshot.as_link() + path = link.archive_path + canon = link.canonical_outputs() + output = "" + output_template = '{}  ' + icons = { + "singlefile": "❶", + "wget": "🆆", + "dom": "🅷", + "pdf": "📄", + "screenshot": "💻", + "media": "📼", + "git": "🅶", + "archive_org": "🏛", + "readability": "🆁", + "mercury": "🅼", + "warc": "📦" + } + exclude = ["favicon", "title", "headers", "archive_org"] + # Missing specific entry for WARC - archive_results = snapshot.archiveresult_set.filter(status="succeeded") - link = snapshot.as_link() - path = link.archive_path - canon = link.canonical_outputs() - output = "" - output_template = '{}  ' - icons = { - "singlefile": "❶", - "wget": "🆆", - "dom": "🅷", - "pdf": "📄", - "screenshot": "💻", - "media": "📼", - "git": "🅶", - "archive_org": "🏛", - "readability": "🆁", - "mercury": "🅼", - "warc": "📦" - } - exclude = ["favicon", "title", "headers", "archive_org"] - # Missing specific entry for WARC + extractor_outputs = defaultdict(lambda: None) + for extractor, _ in EXTRACTORS: + for result in archive_results: + if result.extractor == extractor and result: + extractor_outputs[extractor] = result - extractor_outputs = defaultdict(lambda: None) - for extractor, _ in EXTRACTORS: - for result in archive_results: - if result.extractor == extractor and result: - extractor_outputs[extractor] = result + for extractor, _ in EXTRACTORS: + if extractor not in exclude: + existing = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output + # Check filesystsem to see if anything is actually present (too slow, needs optimization/caching) + # if existing: + # existing = (Path(path) / existing) + # if existing.is_file(): + # existing = True + # elif existing.is_dir(): + # existing = any(existing.glob('*.*')) + output += format_html(output_template, path, canon[f"{extractor}_path"], str(bool(existing)), + extractor, icons.get(extractor, "?")) + if extractor == "wget": + # warc isn't technically it's own extractor, so we have to add it after wget + + # get from db (faster but less thurthful) + exists = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output + # get from filesystem (slower but more accurate) + # exists = list((Path(path) / canon["warc_path"]).glob("*.warc.gz")) + output += format_html(output_template, path, canon["warc_path"], str(bool(exists)), "warc", icons.get("warc", "?")) - for extractor, _ in EXTRACTORS: - if extractor not in exclude: - existing = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output - # Check filesystsem to see if anything is actually present (too slow, needs optimization/caching) - # if existing: - # existing = (Path(path) / existing) - # if existing.is_file(): - # existing = True - # elif existing.is_dir(): - # existing = any(existing.glob('*.*')) - output += format_html(output_template, path, canon[f"{extractor}_path"], str(bool(existing)), - extractor, icons.get(extractor, "?")) - if extractor == "wget": - # warc isn't technically it's own extractor, so we have to add it after wget - - # get from db (faster but less thurthful) - exists = extractor_outputs[extractor] and 
extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output - # get from filesystem (slower but more accurate) - # exists = list((Path(path) / canon["warc_path"]).glob("*.warc.gz")) - output += format_html(output_template, 'warc/', canon["warc_path"], str(bool(exists)), "warc", icons.get("warc", "?")) + if extractor == "archive_org": + # The check for archive_org is different, so it has to be handled separately - if extractor == "archive_org": - # The check for archive_org is different, so it has to be handled separately + # get from db (faster) + exists = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output + # get from filesystem (slower) + # target_path = Path(path) / "archive.org.txt" + # exists = target_path.exists() + output += '{} '.format(canon["archive_org_path"], str(exists), + "archive_org", icons.get("archive_org", "?")) - # get from db (faster) - exists = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output - # get from filesystem (slower) - # target_path = Path(path) / "archive.org.txt" - # exists = target_path.exists() - output += '{} '.format(canon["archive_org_path"], str(exists), - "archive_org", icons.get("archive_org", "?")) + result = format_html('{}', mark_safe(output)) + # end = datetime.now() + # print(((end - start).total_seconds()*1000) // 1, 'ms') + return result - result = format_html('{}', mark_safe(output)) - # end = datetime.now() - # print(((end - start).total_seconds()*1000) // 1, 'ms') - return result + return cache.get_or_set(cache_key, calc_snapshot_icons) + # return calc_snapshot_icons() + + diff --git a/archivebox/index/json.py b/archivebox/index/json.py index f24b969f..441e6854 100644 --- a/archivebox/index/json.py +++ b/archivebox/index/json.py @@ -15,7 +15,6 @@ from ..config import ( VERSION, OUTPUT_DIR, FOOTER_INFO, - GIT_SHA, DEPENDENCIES, JSON_INDEX_FILENAME, ARCHIVE_DIR_NAME, @@ -30,7 +29,7 @@ MAIN_INDEX_HEADER = { 'meta': { 'project': 'ArchiveBox', 'version': VERSION, - 'git_sha': GIT_SHA, + 'git_sha': VERSION, # not used anymore, but kept for backwards compatibility 'website': 'https://ArchiveBox.io', 'docs': 'https://github.com/ArchiveBox/ArchiveBox/wiki', 'source': 'https://github.com/ArchiveBox/ArchiveBox', diff --git a/archivebox/index/schema.py b/archivebox/index/schema.py index 1ca4e801..00831e19 100644 --- a/archivebox/index/schema.py +++ b/archivebox/index/schema.py @@ -16,6 +16,7 @@ from typing import List, Dict, Any, Optional, Union from dataclasses import dataclass, asdict, field, fields +from django.utils.functional import cached_property from ..system import get_dir_size @@ -133,7 +134,6 @@ class Link: updated: Optional[datetime] = None schema: str = 'Link' - def __str__(self) -> str: return f'[{self.timestamp}] {self.url} "{self.title}"' @@ -190,6 +190,7 @@ class Link: } if extended: info.update({ + 'snapshot_id': self.snapshot_id, 'link_dir': self.link_dir, 'archive_path': self.archive_path, @@ -201,6 +202,9 @@ class Link: 'basename': self.basename, 'extension': self.extension, 'is_static': self.is_static, + + 'tags_str': self.tags, # only used to render static index in index/html.py, remove if no longer needed there + 'icons': None, # only used to render static index in index/html.py, remove if no longer needed there 'bookmarked_date': self.bookmarked_date, 'updated_date': self.updated_date, @@ -255,6 +259,11 @@ class Link: return to_csv(self, cols=cols or 
self.field_names(), separator=separator, ljust=ljust) + @cached_property + def snapshot_id(self): + from core.models import Snapshot + return str(Snapshot.objects.only('id').get(url=self.url).id) + @classmethod def field_names(cls): return [f.name for f in fields(cls)] diff --git a/archivebox/index/sql.py b/archivebox/index/sql.py index 1e99f67c..2fcabd61 100644 --- a/archivebox/index/sql.py +++ b/archivebox/index/sql.py @@ -7,7 +7,7 @@ from django.db.models import QuerySet from django.db import transaction from .schema import Link -from ..util import enforce_types +from ..util import enforce_types, parse_date from ..config import OUTPUT_DIR @@ -23,13 +23,15 @@ def parse_sql_main_index(out_dir: Path=OUTPUT_DIR) -> Iterator[Link]: ) @enforce_types -def remove_from_sql_main_index(snapshots: QuerySet, out_dir: Path=OUTPUT_DIR) -> None: - with transaction.atomic(): - snapshots.delete() +def remove_from_sql_main_index(snapshots: QuerySet, atomic: bool=False, out_dir: Path=OUTPUT_DIR) -> None: + if atomic: + with transaction.atomic(): + return snapshots.delete() + return snapshots.delete() @enforce_types def write_link_to_sql_index(link: Link): - from core.models import Snapshot + from core.models import Snapshot, ArchiveResult info = {k: v for k, v in link._asdict().items() if k in Snapshot.keys} tags = info.pop("tags") if tags is None: @@ -41,36 +43,74 @@ def write_link_to_sql_index(link: Link): while Snapshot.objects.filter(timestamp=info["timestamp"]).exists(): info["timestamp"] = str(float(info["timestamp"]) + 1.0) - snapshot, _ = Snapshot.objects.update_or_create(url=link.url, defaults=info) + snapshot, _ = Snapshot.objects.update_or_create(url=link.url, defaults=info) snapshot.save_tags(tags) + + for extractor, entries in link.history.items(): + for entry in entries: + if isinstance(entry, dict): + result, _ = ArchiveResult.objects.get_or_create( + snapshot_id=snapshot.id, + extractor=extractor, + start_ts=parse_date(entry['start_ts']), + defaults={ + 'end_ts': parse_date(entry['end_ts']), + 'cmd': entry['cmd'], + 'output': entry['output'], + 'cmd_version': entry.get('cmd_version') or 'unknown', + 'pwd': entry['pwd'], + 'status': entry['status'], + } + ) + else: + result, _ = ArchiveResult.objects.update_or_create( + snapshot_id=snapshot.id, + extractor=extractor, + start_ts=parse_date(entry.start_ts), + defaults={ + 'end_ts': parse_date(entry.end_ts), + 'cmd': entry.cmd, + 'output': entry.output, + 'cmd_version': entry.cmd_version or 'unknown', + 'pwd': entry.pwd, + 'status': entry.status, + } + ) + return snapshot @enforce_types def write_sql_main_index(links: List[Link], out_dir: Path=OUTPUT_DIR) -> None: - with transaction.atomic(): - for link in links: - write_link_to_sql_index(link) + for link in links: + # with transaction.atomic(): + # write_link_to_sql_index(link) + write_link_to_sql_index(link) @enforce_types def write_sql_link_details(link: Link, out_dir: Path=OUTPUT_DIR) -> None: from core.models import Snapshot - with transaction.atomic(): - try: - snap = Snapshot.objects.get(url=link.url) - except Snapshot.DoesNotExist: - snap = write_link_to_sql_index(link) - snap.title = link.title + # with transaction.atomic(): + # try: + # snap = Snapshot.objects.get(url=link.url) + # except Snapshot.DoesNotExist: + # snap = write_link_to_sql_index(link) + # snap.title = link.title + try: + snap = Snapshot.objects.get(url=link.url) + except Snapshot.DoesNotExist: + snap = write_link_to_sql_index(link) + snap.title = link.title - tag_set = ( - set(tag.strip() for tag in (link.tags or 
'').split(',')) - ) - tag_list = list(tag_set) or [] + tag_set = ( + set(tag.strip() for tag in (link.tags or '').split(',')) + ) + tag_list = list(tag_set) or [] - snap.save() - snap.save_tags(tag_list) + snap.save() + snap.save_tags(tag_list) diff --git a/archivebox/logging_util.py b/archivebox/logging_util.py index f2b86735..492ae55e 100644 --- a/archivebox/logging_util.py +++ b/archivebox/logging_util.py @@ -3,6 +3,7 @@ __package__ = 'archivebox' import re import os import sys +import stat import time import argparse from math import log @@ -11,18 +12,21 @@ from pathlib import Path from datetime import datetime from dataclasses import dataclass -from typing import Optional, List, Dict, Union, IO, TYPE_CHECKING +from typing import Any, Optional, List, Dict, Union, IO, TYPE_CHECKING if TYPE_CHECKING: from .index.schema import Link, ArchiveResult +from .system import get_dir_size from .util import enforce_types from .config import ( ConfigDict, OUTPUT_DIR, PYTHON_ENCODING, + VERSION, ANSI, IS_TTY, + IN_DOCKER, TERM_WIDTH, SHOW_PROGRESS, SOURCES_DIR_NAME, @@ -50,6 +54,37 @@ class RuntimeStats: _LAST_RUN_STATS = RuntimeStats() +def debug_dict_summary(obj: Dict[Any, Any]) -> None: + stderr(' '.join(f'{key}={str(val).ljust(6)}' for key, val in obj.items())) + + +def get_fd_info(fd) -> Dict[str, Any]: + NAME = fd.name[1:-1] + FILENO = fd.fileno() + MODE = os.fstat(FILENO).st_mode + IS_TTY = hasattr(fd, 'isatty') and fd.isatty() + IS_PIPE = stat.S_ISFIFO(MODE) + IS_FILE = stat.S_ISREG(MODE) + IS_TERMINAL = not (IS_PIPE or IS_FILE) + IS_LINE_BUFFERED = fd.line_buffering + IS_READABLE = fd.readable() + return { + 'NAME': NAME, 'FILENO': FILENO, 'MODE': MODE, + 'IS_TTY': IS_TTY, 'IS_PIPE': IS_PIPE, 'IS_FILE': IS_FILE, + 'IS_TERMINAL': IS_TERMINAL, 'IS_LINE_BUFFERED': IS_LINE_BUFFERED, + 'IS_READABLE': IS_READABLE, + } + + +# # Log debug information about stdin, stdout, and stderr +# sys.stdout.write('[>&1] this is python stdout\n') +# sys.stderr.write('[>&2] this is python stderr\n') + +# debug_dict_summary(get_fd_info(sys.stdin)) +# debug_dict_summary(get_fd_info(sys.stdout)) +# debug_dict_summary(get_fd_info(sys.stderr)) + + class SmartFormatter(argparse.HelpFormatter): """Patched formatter that prints newlines in argparse help strings""" @@ -62,22 +97,40 @@ class SmartFormatter(argparse.HelpFormatter): def reject_stdin(caller: str, stdin: Optional[IO]=sys.stdin) -> None: """Tell the user they passed stdin to a command that doesn't accept it""" - if stdin and not stdin.isatty(): - stdin_raw_text = stdin.read().strip() + if not stdin: + return None + + if IN_DOCKER: + # when TTY is disabled in docker we cant tell if stdin is being piped in or not + # if we try to read stdin when its not piped we will hang indefinitely waiting for it + return None + + if not stdin.isatty(): + # stderr('READING STDIN TO REJECT...') + stdin_raw_text = stdin.read() if stdin_raw_text: + # stderr('GOT STDIN!', len(stdin_str)) stderr(f'[X] The "{caller}" command does not accept stdin.', color='red') stderr(f' Run archivebox "{caller} --help" to see usage and examples.') stderr() raise SystemExit(1) + return None def accept_stdin(stdin: Optional[IO]=sys.stdin) -> Optional[str]: """accept any standard input and return it as a string or None""" + if not stdin: return None - elif stdin and not stdin.isatty(): - stdin_str = stdin.read().strip() - return stdin_str or None + + if not stdin.isatty(): + # stderr('READING STDIN TO ACCEPT...') + stdin_str = stdin.read() + + if stdin_str: + # stderr('GOT STDIN...', len(stdin_str)) 
+ return stdin_str + return None @@ -174,7 +227,6 @@ def progress_bar(seconds: int, prefix: str='') -> None: def log_cli_command(subcommand: str, subcommand_args: List[str], stdin: Optional[str], pwd: str): - from .config import VERSION, ANSI cmd = ' '.join(('archivebox', subcommand, *subcommand_args)) stderr('{black}[i] [{now}] ArchiveBox v{VERSION}: {cmd}{reset}'.format( now=datetime.now().strftime('%Y-%m-%d %H:%M:%S'), @@ -233,11 +285,11 @@ def log_indexing_process_finished(): def log_indexing_started(out_path: str): if IS_TTY: - sys.stdout.write(f' > {out_path}') + sys.stdout.write(f' > ./{Path(out_path).relative_to(OUTPUT_DIR)}') def log_indexing_finished(out_path: str): - print(f'\r √ {out_path}') + print(f'\r √ ./{Path(out_path).relative_to(OUTPUT_DIR)}') ### Archiving Stage @@ -272,8 +324,6 @@ def log_archiving_paused(num_links: int, idx: int, timestamp: str): total=num_links, )) print() - print(' {lightred}Hint:{reset} To view your archive index, run:'.format(**ANSI)) - print(' archivebox server # then visit http://127.0.0.1:8000') print(' Continue archiving where you left off by running:') print(' archivebox update --resume={}'.format(timestamp)) @@ -331,6 +381,9 @@ def log_link_archiving_finished(link: "Link", link_dir: str, is_new: bool, stats else: _LAST_RUN_STATS.succeeded += 1 + size = get_dir_size(link_dir) + print(' {black}{} files ({}){reset}'.format(size[2], printable_filesize(size[0]), **ANSI)) + def log_archive_method_started(method: str): print(' > {}'.format(method)) diff --git a/archivebox/main.py b/archivebox/main.py index c1751528..5c697c55 100644 --- a/archivebox/main.py +++ b/archivebox/main.py @@ -67,6 +67,7 @@ from .config import ( ConfigDict, ANSI, IS_TTY, + DEBUG, IN_DOCKER, USER, ARCHIVEBOX_BINARY, @@ -76,6 +77,7 @@ from .config import ( ARCHIVE_DIR, LOGS_DIR, CONFIG_FILE, + CONFIG_FILENAME, ARCHIVE_DIR_NAME, SOURCES_DIR_NAME, LOGS_DIR_NAME, @@ -84,6 +86,7 @@ from .config import ( SQL_INDEX_FILENAME, ROBOTS_TXT_FILENAME, FAVICON_FILENAME, + SEARCH_BACKEND_ENGINE, check_dependencies, check_data_folder, write_config_file, @@ -125,14 +128,19 @@ ALLOWED_IN_OUTPUT_DIR = { 'node_modules', 'package-lock.json', 'static', + 'sonic', ARCHIVE_DIR_NAME, SOURCES_DIR_NAME, LOGS_DIR_NAME, SQL_INDEX_FILENAME, + f'{SQL_INDEX_FILENAME}-wal', + f'{SQL_INDEX_FILENAME}-shm', JSON_INDEX_FILENAME, HTML_INDEX_FILENAME, ROBOTS_TXT_FILENAME, FAVICON_FILENAME, + CONFIG_FILENAME, + f'{CONFIG_FILENAME}.bak', } @enforce_types @@ -214,9 +222,23 @@ def version(quiet: bool=False, if quiet: print(VERSION) else: + # ArchiveBox v0.5.6 + # Cpython Linux Linux-4.19.121-linuxkit-x86_64-with-glibc2.28 x86_64 (in Docker) (in TTY) print('ArchiveBox v{}'.format(VERSION)) p = platform.uname() - print(sys.implementation.name.title(), p.system, platform.platform(), p.machine, '(in Docker)' if IN_DOCKER else '(not in Docker)') + print( + sys.implementation.name.title(), + p.system, + platform.platform(), + p.machine, + ) + print( + f'IN_DOCKER={IN_DOCKER}', + f'DEBUG={DEBUG}', + f'IS_TTY={IS_TTY}', + f'TZ={os.environ.get("TZ", "UTC")}', + f'SEARCH_BACKEND_ENGINE={SEARCH_BACKEND_ENGINE}', + ) print() print('{white}[i] Dependency versions:{reset}'.format(**ANSI)) @@ -261,7 +283,7 @@ def run(subcommand: str, @enforce_types -def init(force: bool=False, out_dir: Path=OUTPUT_DIR) -> None: +def init(force: bool=False, quick: bool=False, out_dir: Path=OUTPUT_DIR) -> None: """Initialize a new ArchiveBox collection in the current directory""" from core.models import Snapshot @@ -276,13 +298,12 @@ def init(force: 
bool=False, out_dir: Path=OUTPUT_DIR) -> None: existing_index = (Path(out_dir) / SQL_INDEX_FILENAME).exists() if is_empty and not existing_index: - print('{green}[+] Initializing a new ArchiveBox collection in this folder...{reset}'.format(**ANSI)) - print(f' {out_dir}') - print('{green}------------------------------------------------------------------{reset}'.format(**ANSI)) + print('{green}[+] Initializing a new ArchiveBox v{} collection...{reset}'.format(VERSION, **ANSI)) + print('{green}----------------------------------------------------------------------{reset}'.format(**ANSI)) elif existing_index: - print('{green}[*] Updating existing ArchiveBox collection in this folder...{reset}'.format(**ANSI)) - print(f' {out_dir}') - print('{green}------------------------------------------------------------------{reset}'.format(**ANSI)) + # TODO: properly detect and print the existing version in current index as well + print('{green}[^] Verifying and updating existing ArchiveBox collection to v{}...{reset}'.format(VERSION, **ANSI)) + print('{green}----------------------------------------------------------------------{reset}'.format(**ANSI)) else: if force: stderr('[!] This folder appears to already have files in it, but no index.sqlite3 is present.', color='lightyellow') @@ -303,30 +324,25 @@ def init(force: bool=False, out_dir: Path=OUTPUT_DIR) -> None: else: print('\n{green}[+] Building archive folder structure...{reset}'.format(**ANSI)) + print(f' + ./{ARCHIVE_DIR.relative_to(OUTPUT_DIR)}, ./{SOURCES_DIR.relative_to(OUTPUT_DIR)}, ./{LOGS_DIR.relative_to(OUTPUT_DIR)}...') Path(SOURCES_DIR).mkdir(exist_ok=True) - print(f' √ {SOURCES_DIR}') - Path(ARCHIVE_DIR).mkdir(exist_ok=True) - print(f' √ {ARCHIVE_DIR}') - Path(LOGS_DIR).mkdir(exist_ok=True) - print(f' √ {LOGS_DIR}') - + print(f' + ./{CONFIG_FILE.relative_to(OUTPUT_DIR)}...') write_config_file({}, out_dir=out_dir) - print(f' √ {CONFIG_FILE}') + if (Path(out_dir) / SQL_INDEX_FILENAME).exists(): - print('\n{green}[*] Verifying main SQL index and running migrations...{reset}'.format(**ANSI)) + print('\n{green}[*] Verifying main SQL index and running any migrations needed...{reset}'.format(**ANSI)) else: - print('\n{green}[+] Building main SQL index and running migrations...{reset}'.format(**ANSI)) + print('\n{green}[+] Building main SQL index and running initial migrations...{reset}'.format(**ANSI)) DATABASE_FILE = Path(out_dir) / SQL_INDEX_FILENAME - print(f' √ {DATABASE_FILE}') - print() for migration_line in apply_migrations(out_dir): print(f' {migration_line}') - assert DATABASE_FILE.exists() + print() + print(f' √ ./{DATABASE_FILE.relative_to(OUTPUT_DIR)}') # from django.contrib.auth.models import User # if IS_TTY and not User.objects.filter(is_superuser=True).exists(): @@ -334,7 +350,7 @@ def init(force: bool=False, out_dir: Path=OUTPUT_DIR) -> None: # call_command("createsuperuser", interactive=True) print() - print('{green}[*] Collecting links from any existing indexes and archive folders...{reset}'.format(**ANSI)) + print('{green}[*] Checking links from indexes and archive folders (safe to Ctrl+C)...{reset}'.format(**ANSI)) all_links = Snapshot.objects.none() pending_links: Dict[str, Link] = {} @@ -343,63 +359,77 @@ def init(force: bool=False, out_dir: Path=OUTPUT_DIR) -> None: all_links = load_main_index(out_dir=out_dir, warn=False) print(' √ Loaded {} links from existing main index.'.format(all_links.count())) - # Links in data folders that dont match their timestamp - fixed, cant_fix = fix_invalid_folder_locations(out_dir=out_dir) 
- if fixed: - print(' {lightyellow}√ Fixed {} data directory locations that didn\'t match their link timestamps.{reset}'.format(len(fixed), **ANSI)) - if cant_fix: - print(' {lightyellow}! Could not fix {} data directory locations due to conflicts with existing folders.{reset}'.format(len(cant_fix), **ANSI)) + if quick: + print(' > Skipping full snapshot directory check (quick mode)') + else: + try: + # Links in data folders that dont match their timestamp + fixed, cant_fix = fix_invalid_folder_locations(out_dir=out_dir) + if fixed: + print(' {lightyellow}√ Fixed {} data directory locations that didn\'t match their link timestamps.{reset}'.format(len(fixed), **ANSI)) + if cant_fix: + print(' {lightyellow}! Could not fix {} data directory locations due to conflicts with existing folders.{reset}'.format(len(cant_fix), **ANSI)) - # Links in JSON index but not in main index - orphaned_json_links = { - link.url: link - for link in parse_json_main_index(out_dir) - if not all_links.filter(url=link.url).exists() - } - if orphaned_json_links: - pending_links.update(orphaned_json_links) - print(' {lightyellow}√ Added {} orphaned links from existing JSON index...{reset}'.format(len(orphaned_json_links), **ANSI)) + # Links in JSON index but not in main index + orphaned_json_links = { + link.url: link + for link in parse_json_main_index(out_dir) + if not all_links.filter(url=link.url).exists() + } + if orphaned_json_links: + pending_links.update(orphaned_json_links) + print(' {lightyellow}√ Added {} orphaned links from existing JSON index...{reset}'.format(len(orphaned_json_links), **ANSI)) - # Links in data dir indexes but not in main index - orphaned_data_dir_links = { - link.url: link - for link in parse_json_links_details(out_dir) - if not all_links.filter(url=link.url).exists() - } - if orphaned_data_dir_links: - pending_links.update(orphaned_data_dir_links) - print(' {lightyellow}√ Added {} orphaned links from existing archive directories.{reset}'.format(len(orphaned_data_dir_links), **ANSI)) + # Links in data dir indexes but not in main index + orphaned_data_dir_links = { + link.url: link + for link in parse_json_links_details(out_dir) + if not all_links.filter(url=link.url).exists() + } + if orphaned_data_dir_links: + pending_links.update(orphaned_data_dir_links) + print(' {lightyellow}√ Added {} orphaned links from existing archive directories.{reset}'.format(len(orphaned_data_dir_links), **ANSI)) - # Links in invalid/duplicate data dirs - invalid_folders = { - folder: link - for folder, link in get_invalid_folders(all_links, out_dir=out_dir).items() - } - if invalid_folders: - print(' {lightyellow}! Skipped adding {} invalid link data directories.{reset}'.format(len(invalid_folders), **ANSI)) - print(' X ' + '\n X '.join(f'{folder} {link}' for folder, link in invalid_folders.items())) - print() - print(' {lightred}Hint:{reset} For more information about the link data directories that were skipped, run:'.format(**ANSI)) - print(' archivebox status') - print(' archivebox list --status=invalid') + # Links in invalid/duplicate data dirs + invalid_folders = { + folder: link + for folder, link in get_invalid_folders(all_links, out_dir=out_dir).items() + } + if invalid_folders: + print(' {lightyellow}! 
Skipped adding {} invalid link data directories.{reset}'.format(len(invalid_folders), **ANSI)) + print(' X ' + '\n X '.join(f'./{Path(folder).relative_to(OUTPUT_DIR)} {link}' for folder, link in invalid_folders.items())) + print() + print(' {lightred}Hint:{reset} For more information about the link data directories that were skipped, run:'.format(**ANSI)) + print(' archivebox status') + print(' archivebox list --status=invalid') + except (KeyboardInterrupt, SystemExit): + stderr() + stderr('[x] Stopped checking archive directories due to Ctrl-C/SIGTERM', color='red') + stderr(' Your archive data is safe, but you should re-run `archivebox init` to finish the process later.') + stderr() + stderr(' {lightred}Hint:{reset} In the future you can run a quick init without checking dirs like so:'.format(**ANSI)) + stderr(' archivebox init --quick') + raise SystemExit(1) + + write_main_index(list(pending_links.values()), out_dir=out_dir) - write_main_index(list(pending_links.values()), out_dir=out_dir) - - print('\n{green}------------------------------------------------------------------{reset}'.format(**ANSI)) + print('\n{green}----------------------------------------------------------------------{reset}'.format(**ANSI)) if existing_index: print('{green}[√] Done. Verified and updated the existing ArchiveBox collection.{reset}'.format(**ANSI)) else: - print('{green}[√] Done. A new ArchiveBox collection was initialized ({} links).{reset}'.format(len(all_links), **ANSI)) - print() - print(' {lightred}Hint:{reset} To view your archive index, run:'.format(**ANSI)) - print(' archivebox server # then visit http://127.0.0.1:8000') - print() - print(' To add new links, you can run:') - print(" archivebox add ~/some/path/or/url/to/list_of_links.txt") - print() - print(' For more usage and examples, run:') - print(' archivebox help') + print('{green}[√] Done. 
A new ArchiveBox collection was initialized ({} links).{reset}'.format(len(all_links) + len(pending_links), **ANSI)) + + if Snapshot.objects.count() < 25: # hide the hints for experienced users + print() + print(' {lightred}Hint:{reset} To view your archive index, run:'.format(**ANSI)) + print(' archivebox server # then visit http://127.0.0.1:8000') + print() + print(' To add new links, you can run:') + print(" archivebox add ~/some/path/or/url/to/list_of_links.txt") + print() + print(' For more usage and examples, run:') + print(' archivebox help') json_index = Path(out_dir) / JSON_INDEX_FILENAME html_index = Path(out_dir) / HTML_INDEX_FILENAME @@ -531,6 +561,7 @@ def oneshot(url: str, extractors: str="", out_dir: Path=OUTPUT_DIR): @enforce_types def add(urls: Union[str, List[str]], + tag: str='', depth: int=0, update_all: bool=not ONLY_NEW, index_only: bool=False, @@ -540,6 +571,8 @@ def add(urls: Union[str, List[str]], out_dir: Path=OUTPUT_DIR) -> List[Link]: """Add a new URL or list of URLs to your archive""" + from core.models import Tag + assert depth in (0, 1), 'Depth must be 0 or 1 (depth >1 is not supported yet)' extractors = extractors.split(",") if extractors else [] @@ -572,26 +605,48 @@ def add(urls: Union[str, List[str]], new_links_depth += parse_links_from_source(downloaded_file, root_url=new_link.url) imported_links = list({link.url: link for link in (new_links + new_links_depth)}.values()) + new_links = dedupe_links(all_links, imported_links) write_main_index(links=new_links, out_dir=out_dir) all_links = load_main_index(out_dir=out_dir) if index_only: - return all_links + # mock archive all the links using the fake index_only extractor method in order to update their state + if overwrite: + archive_links(imported_links, overwrite=overwrite, methods=['index_only'], out_dir=out_dir) + else: + archive_links(new_links, overwrite=False, methods=['index_only'], out_dir=out_dir) + else: + # fully run the archive extractor methods for each link + archive_kwargs = { + "out_dir": out_dir, + } + if extractors: + archive_kwargs["methods"] = extractors + + if update_all: + archive_links(all_links, overwrite=overwrite, **archive_kwargs) + elif overwrite: + archive_links(imported_links, overwrite=True, **archive_kwargs) + elif new_links: + archive_links(new_links, overwrite=False, **archive_kwargs) + + + # add any tags to imported links + tags = [ + Tag.objects.get_or_create(name=name.strip())[0] + for name in tag.split(',') + if name.strip() + ] + if tags: + for link in imported_links: + snapshot = link.as_snapshot() + snapshot.tags.add(*tags) + snapshot.tags_str(nocache=True) + snapshot.save() + # print(f' √ Tagged {len(imported_links)} Snapshots with {len(tags)} tags {tags_str}') - # Run the archive methods for each link - archive_kwargs = { - "out_dir": out_dir, - } - if extractors: - archive_kwargs["methods"] = extractors - if update_all: - archive_links(all_links, overwrite=overwrite, **archive_kwargs) - elif overwrite: - archive_links(imported_links, overwrite=True, **archive_kwargs) - elif new_links: - archive_links(new_links, overwrite=False, **archive_kwargs) return all_links @@ -811,11 +866,15 @@ def list_links(snapshots: Optional[QuerySet]=None, all_snapshots = load_main_index(out_dir=out_dir) if after is not None: - all_snapshots = all_snapshots.filter(timestamp__lt=after) + all_snapshots = all_snapshots.filter(timestamp__gte=after) if before is not None: - all_snapshots = all_snapshots.filter(timestamp__gt=before) + all_snapshots = all_snapshots.filter(timestamp__lt=before) 
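    # Descriptive note (assumed semantics, not part of the patch): with the corrected
    # operators above, --after=T keeps snapshots with timestamp >= T and --before=T
    # keeps timestamp < T, i.e. the half-open window [after, before), so both flags
    # can be combined without overlapping at the boundary.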
if filter_patterns: all_snapshots = snapshot_filter(all_snapshots, filter_patterns, filter_type) + + if not all_snapshots: + stderr('[!] No Snapshots matched your filters:', filter_patterns, f'({filter_type})', color='lightyellow') + return all_snapshots @enforce_types @@ -1061,6 +1120,7 @@ def server(runserver_args: Optional[List[str]]=None, reload: bool=False, debug: bool=False, init: bool=False, + quick_init: bool=False, createsuperuser: bool=False, out_dir: Path=OUTPUT_DIR) -> None: """Run the ArchiveBox HTTP server""" @@ -1069,9 +1129,14 @@ def server(runserver_args: Optional[List[str]]=None, if init: run_subcommand('init', stdin=None, pwd=out_dir) + print() + elif quick_init: + run_subcommand('init', subcommand_args=['--quick'], stdin=None, pwd=out_dir) + print() if createsuperuser: run_subcommand('manage', subcommand_args=['createsuperuser'], pwd=out_dir) + print() # setup config for django runserver from . import config @@ -1083,12 +1148,9 @@ def server(runserver_args: Optional[List[str]]=None, from django.core.management import call_command from django.contrib.auth.models import User - admin_user = User.objects.filter(is_superuser=True).order_by('date_joined').only('username').last() - print('{green}[+] Starting ArchiveBox webserver...{reset}'.format(**ANSI)) - if admin_user: - hint('The admin username is{lightblue} {}{reset}\n'.format(admin_user.username, **ANSI)) - else: + print(' > Logging errors to ./logs/errors.log') + if not User.objects.filter(is_superuser=True).exists(): print('{lightyellow}[!] No admin users exist yet, you will not be able to edit links in the UI.{reset}'.format(**ANSI)) print() print(' To create an admin user, run:') @@ -1106,7 +1168,6 @@ def server(runserver_args: Optional[List[str]]=None, config.SHOW_PROGRESS = False config.DEBUG = config.DEBUG or debug - call_command("runserver", *runserver_args) diff --git a/archivebox/parsers/__init__.py b/archivebox/parsers/__init__.py index 441c08ac..4af2c5ac 100644 --- a/archivebox/parsers/__init__.py +++ b/archivebox/parsers/__init__.py @@ -68,7 +68,6 @@ def parse_links_memory(urls: List[str], root_url: Optional[str]=None): """ parse a list of URLS without touching the filesystem """ - check_url_parsing_invariants() timer = TimedProgress(TIMEOUT * 4) #urls = list(map(lambda x: x + "\n", urls)) @@ -89,8 +88,6 @@ def parse_links(source_file: str, root_url: Optional[str]=None) -> Tuple[List[Li RSS feed, bookmarks export, or text file """ - check_url_parsing_invariants() - timer = TimedProgress(TIMEOUT * 4) with open(source_file, 'r', encoding='utf-8') as file: links, parser = run_parser_functions(file, timer, root_url=root_url) @@ -173,31 +170,48 @@ def save_file_as_source(path: str, timeout: int=TIMEOUT, filename: str='{ts}-{ba return source_path -def check_url_parsing_invariants() -> None: - """Check that plain text regex URL parsing works as expected""" - - # this is last-line-of-defense to make sure the URL_REGEX isn't - # misbehaving, as the consequences could be disastrous and lead to many - # incorrect/badly parsed links being added to the archive - - test_urls = ''' - https://example1.com/what/is/happening.html?what=1#how-about-this=1 - https://example2.com/what/is/happening/?what=1#how-about-this=1 - HTtpS://example3.com/what/is/happening/?what=1#how-about-this=1f - https://example4.com/what/is/happening.html - https://example5.com/ - https://example6.com - - http://example7.com - [https://example8.com/what/is/this.php?what=1] - [and http://example9.com?what=1&other=3#and-thing=2] - 
https://example10.com#and-thing=2 " - abcdef - sdflkf[what](https://example12.com/who/what.php?whoami=1#whatami=2)?am=hi - example13.bada - and example14.badb - htt://example15.badc - ''' - # print('\n'.join(re.findall(URL_REGEX, test_urls))) - assert len(re.findall(URL_REGEX, test_urls)) == 12 - +# Check that plain text regex URL parsing works as expected +# this is last-line-of-defense to make sure the URL_REGEX isn't +# misbehaving due to some OS-level or environment level quirks (e.g. bad regex lib) +# the consequences of bad URL parsing could be disastrous and lead to many +# incorrect/badly parsed links being added to the archive, so this is worth the cost of checking +_test_url_strs = { + 'example.com': 0, + '/example.com': 0, + '//example.com': 0, + ':/example.com': 0, + '://example.com': 0, + 'htt://example8.com': 0, + '/htt://example.com': 0, + 'https://example': 1, + 'https://localhost/2345': 1, + 'https://localhost:1234/123': 1, + '://': 0, + 'https://': 0, + 'http://': 0, + 'ftp://': 0, + 'ftp://example.com': 0, + 'https://example.com': 1, + 'https://example.com/': 1, + 'https://a.example.com': 1, + 'https://a.example.com/': 1, + 'https://a.example.com/what/is/happening.html': 1, + 'https://a.example.com/what/ís/happening.html': 1, + 'https://a.example.com/what/is/happening.html?what=1&2%20b#höw-about-this=1a': 1, + 'https://a.example.com/what/is/happéning/?what=1&2%20b#how-aboüt-this=1a': 1, + 'HTtpS://a.example.com/what/is/happening/?what=1&2%20b#how-about-this=1af&2f%20b': 1, + 'https://example.com/?what=1#how-about-this=1&2%20baf': 1, + 'https://example.com?what=1#how-about-this=1&2%20baf': 1, + 'http://example7.com': 1, + '[https://example8.com/what/is/this.php?what=1]': 1, + '[and http://example9.com?what=1&other=3#and-thing=2]': 1, + 'https://example10.com#and-thing=2 "': 1, + 'abcdef': 1, + 'sdflkf[what](https://example12.com/who/what.php?whoami=1#whatami=2)?am=hi': 1, + 'http://examplehttp://15.badc': 2, + 'https://a.example.com/one.html?url=http://example.com/inside/of/another?=http://': 2, + '[https://a.example.com/one.html?url=http://example.com/inside/of/another?=](http://a.example.com)': 3, +} +for url_str, num_urls in _test_url_strs.items(): + assert len(re.findall(URL_REGEX, url_str)) == num_urls, ( + f'{url_str} does not contain {num_urls} urls') diff --git a/archivebox/search/utils.py b/archivebox/search/utils.py index e6d15455..82d1880e 100644 --- a/archivebox/search/utils.py +++ b/archivebox/search/utils.py @@ -16,7 +16,7 @@ def get_file_result_content(res, extra_path, use_pwd=False): if extra_path: fpath = f'{fpath}/{extra_path}' - with open(fpath, 'r') as file: + with open(fpath, 'r', encoding='utf-8') as file: data = file.read() if data: return [data] diff --git a/archivebox/system.py b/archivebox/system.py index 2191c70a..3adf2e73 100644 --- a/archivebox/system.py +++ b/archivebox/system.py @@ -37,10 +37,11 @@ def atomic_write(path: Union[Path, str], contents: Union[dict, str, bytes], over """Safe atomic write to filesystem by writing to temp file + atomic rename""" mode = 'wb+' if isinstance(contents, bytes) else 'w' + encoding = None if isinstance(contents, bytes) else 'utf-8' # enforce utf-8 on all text writes # print('\n> Atomic Write:', mode, path, len(contents), f'overwrite={overwrite}') try: - with lib_atomic_write(path, mode=mode, overwrite=overwrite) as f: + with lib_atomic_write(path, mode=mode, overwrite=overwrite, encoding=encoding) as f: if isinstance(contents, dict): dump(contents, f, indent=4, sort_keys=True, cls=ExtendedEncoder) elif 
isinstance(contents, (bytes, str)): diff --git a/archivebox/templates/admin/actions_as_select.html b/archivebox/templates/admin/actions_as_select.html index 86a77190..e69de29b 100644 --- a/archivebox/templates/admin/actions_as_select.html +++ b/archivebox/templates/admin/actions_as_select.html @@ -1 +0,0 @@ -actions_as_select diff --git a/archivebox/templates/admin/base.html b/archivebox/templates/admin/base.html index d8ad8d00..a3d21ba9 100644 --- a/archivebox/templates/admin/base.html +++ b/archivebox/templates/admin/base.html @@ -20,7 +20,7 @@ -
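The snapshot_icons() caching added in archivebox/index/html.py works together with the snapshot.save() call added in archivebox/extractors/__init__.py: the cache key embeds the Snapshot id and its updated timestamp, so re-archiving any result bumps the key and a stale icon summary is never served. A minimal sketch of that invalidation pattern using Django's low-level cache API (the helper name and the 'snapshot-icons' suffix are illustrative; only cache.get_or_set and the key format come from the diff, and a configured Django cache backend is assumed):

from django.core.cache import cache

def cached_for_snapshot(snapshot, suffix, compute):
    # The key changes whenever the Snapshot row is saved, because archive_link()
    # now calls snapshot.save() after writing each ArchiveResult, so any value
    # derived from the snapshot's ArchiveResults is invalidated implicitly.
    cache_key = f'{str(snapshot.id)[:12]}-{(snapshot.updated or snapshot.added).timestamp()}-{suffix}'
    return cache.get_or_set(cache_key, compute)

# e.g. inside snapshot_icons():
#     return cached_for_snapshot(snapshot, 'snapshot-icons', calc_snapshot_icons)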