From e1d0b8bce73c8a919da9915e99a5c51089b0a138 Mon Sep 17 00:00:00 2001 From: Cristian Date: Mon, 26 Oct 2020 07:45:21 -0500 Subject: [PATCH 001/235] feat: Initialize django at the beginning --- archivebox/__init__.py | 4 ++++ archivebox/index/sql.py | 9 +-------- archivebox/main.py | 5 ----- tests/fixtures.py | 1 + tests/test_add.py | 1 + 5 files changed, 7 insertions(+), 13 deletions(-) diff --git a/archivebox/__init__.py b/archivebox/__init__.py index b0c00b61..d41776ba 100644 --- a/archivebox/__init__.py +++ b/archivebox/__init__.py @@ -1 +1,5 @@ __package__ = 'archivebox' +from .config import setup_django, OUTPUT_DIR + +print(OUTPUT_DIR) +setup_django() diff --git a/archivebox/index/sql.py b/archivebox/index/sql.py index aa7c8817..42d29b66 100644 --- a/archivebox/index/sql.py +++ b/archivebox/index/sql.py @@ -7,14 +7,13 @@ from django.db.models import QuerySet from .schema import Link from ..util import enforce_types -from ..config import setup_django, OUTPUT_DIR +from ..config import OUTPUT_DIR ### Main Links Index @enforce_types def parse_sql_main_index(out_dir: Path=OUTPUT_DIR) -> Iterator[Link]: - setup_django(out_dir, check_db=True) from core.models import Snapshot return ( @@ -24,7 +23,6 @@ def parse_sql_main_index(out_dir: Path=OUTPUT_DIR) -> Iterator[Link]: @enforce_types def remove_from_sql_main_index(snapshots: QuerySet, out_dir: Path=OUTPUT_DIR) -> None: - setup_django(out_dir, check_db=True) from django.db import transaction with transaction.atomic(): @@ -51,7 +49,6 @@ def write_link_to_sql_index(link: Link): @enforce_types def write_sql_main_index(links: List[Link], out_dir: Path=OUTPUT_DIR) -> None: - setup_django(out_dir, check_db=True) from django.db import transaction with transaction.atomic(): @@ -61,7 +58,6 @@ def write_sql_main_index(links: List[Link], out_dir: Path=OUTPUT_DIR) -> None: @enforce_types def write_sql_link_details(link: Link, out_dir: Path=OUTPUT_DIR) -> None: - setup_django(out_dir, check_db=True) from core.models import 
Snapshot from django.db import transaction @@ -84,7 +80,6 @@ def write_sql_link_details(link: Link, out_dir: Path=OUTPUT_DIR) -> None: @enforce_types def list_migrations(out_dir: Path=OUTPUT_DIR) -> List[Tuple[bool, str]]: - setup_django(out_dir, check_db=False) from django.core.management import call_command out = StringIO() call_command("showmigrations", list=True, stdout=out) @@ -101,7 +96,6 @@ def list_migrations(out_dir: Path=OUTPUT_DIR) -> List[Tuple[bool, str]]: @enforce_types def apply_migrations(out_dir: Path=OUTPUT_DIR) -> List[str]: - setup_django(out_dir, check_db=False) from django.core.management import call_command null, out = StringIO(), StringIO() call_command("makemigrations", interactive=False, stdout=null) @@ -112,6 +106,5 @@ def apply_migrations(out_dir: Path=OUTPUT_DIR) -> List[str]: @enforce_types def get_admins(out_dir: Path=OUTPUT_DIR) -> List[str]: - setup_django(out_dir, check_db=False) from django.contrib.auth.models import User return User.objects.filter(is_superuser=True) diff --git a/archivebox/main.py b/archivebox/main.py index 72d5009c..8e695a56 100644 --- a/archivebox/main.py +++ b/archivebox/main.py @@ -82,7 +82,6 @@ from .config import ( check_dependencies, check_data_folder, write_config_file, - setup_django, VERSION, CODE_LOCATIONS, EXTERNAL_LOCATIONS, @@ -305,7 +304,6 @@ def init(force: bool=False, out_dir: Path=OUTPUT_DIR) -> None: else: print('\n{green}[+] Building main SQL index and running migrations...{reset}'.format(**ANSI)) - setup_django(out_dir, check_db=False) DATABASE_FILE = Path(out_dir) / SQL_INDEX_FILENAME print(f' √ {DATABASE_FILE}') print() @@ -1033,7 +1031,6 @@ def server(runserver_args: Optional[List[str]]=None, config.DEBUG = config.DEBUG or debug check_data_folder(out_dir=out_dir) - setup_django(out_dir) from django.core.management import call_command from django.contrib.auth.models import User @@ -1070,7 +1067,6 @@ def manage(args: Optional[List[str]]=None, out_dir: Path=OUTPUT_DIR) -> None: """Run an 
ArchiveBox Django management command""" check_data_folder(out_dir=out_dir) - setup_django(out_dir) from django.core.management import execute_from_command_line if (args and "createsuperuser" in args) and (IN_DOCKER and not IS_TTY): @@ -1087,7 +1083,6 @@ def shell(out_dir: Path=OUTPUT_DIR) -> None: check_data_folder(out_dir=out_dir) - setup_django(OUTPUT_DIR) from django.core.management import call_command call_command("shell_plus") diff --git a/tests/fixtures.py b/tests/fixtures.py index 6dd4cb28..fbff5e0f 100644 --- a/tests/fixtures.py +++ b/tests/fixtures.py @@ -6,6 +6,7 @@ import pytest @pytest.fixture def process(tmp_path): os.chdir(tmp_path) + print("should be at", tmp_path) process = subprocess.run(['archivebox', 'init'], capture_output=True) return process diff --git a/tests/test_add.py b/tests/test_add.py index 5e672e8d..37b13653 100644 --- a/tests/test_add.py +++ b/tests/test_add.py @@ -32,6 +32,7 @@ def test_depth_flag_0_crawls_only_the_arg_page(tmp_path, process, disable_extrac env=disable_extractors_dict, ) + breakpoint() archived_item_path = list(tmp_path.glob('archive/**/*'))[0] with open(archived_item_path / "index.json", "r") as f: output_json = json.load(f) From a6bee5f11100f0fb0901205db5fb646b46777b22 Mon Sep 17 00:00:00 2001 From: Cristian Date: Mon, 26 Oct 2020 08:02:04 -0500 Subject: [PATCH 002/235] feat: Move setup_django to an inner module --- archivebox/__init__.py | 4 ---- archivebox/cli/__init__.py | 4 ++++ tests/test_add.py | 1 - 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/archivebox/__init__.py b/archivebox/__init__.py index d41776ba..b0c00b61 100644 --- a/archivebox/__init__.py +++ b/archivebox/__init__.py @@ -1,5 +1 @@ __package__ = 'archivebox' -from .config import setup_django, OUTPUT_DIR - -print(OUTPUT_DIR) -setup_django() diff --git a/archivebox/cli/__init__.py b/archivebox/cli/__init__.py index 83055e8e..f41ba5dc 100644 --- a/archivebox/cli/__init__.py +++ b/archivebox/cli/__init__.py @@ -134,3 +134,7 @@ __all__ 
= ( 'run_subcommand', *SUBCOMMANDS.keys(), ) + + +from ..config import setup_django +setup_django() diff --git a/tests/test_add.py b/tests/test_add.py index 37b13653..5e672e8d 100644 --- a/tests/test_add.py +++ b/tests/test_add.py @@ -32,7 +32,6 @@ def test_depth_flag_0_crawls_only_the_arg_page(tmp_path, process, disable_extrac env=disable_extractors_dict, ) - breakpoint() archived_item_path = list(tmp_path.glob('archive/**/*'))[0] with open(archived_item_path / "index.json", "r") as f: output_json = json.load(f) From f6ce1de8827367997dea32ec6cd691803d3acec5 Mon Sep 17 00:00:00 2001 From: Cristian Date: Tue, 27 Oct 2020 09:11:41 -0500 Subject: [PATCH 003/235] fix: archivebox version was being called as root --- Dockerfile | 3 ++- tests/fixtures.py | 3 +-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Dockerfile b/Dockerfile index 5f16e658..d67220a4 100644 --- a/Dockerfile +++ b/Dockerfile @@ -99,7 +99,8 @@ ENV IN_DOCKER=True \ MERCURY_BINARY="$NODE_DIR/node_modules/.bin/mercury-parser" # Print version for nice docker finish summary -RUN archivebox version +# RUN archivebox version +RUN /app/bin/docker_entrypoint.sh archivebox version # Open up the interfaces to the outside world VOLUME "$DATA_DIR" diff --git a/tests/fixtures.py b/tests/fixtures.py index fbff5e0f..cca722f3 100644 --- a/tests/fixtures.py +++ b/tests/fixtures.py @@ -6,7 +6,6 @@ import pytest @pytest.fixture def process(tmp_path): os.chdir(tmp_path) - print("should be at", tmp_path) process = subprocess.run(['archivebox', 'init'], capture_output=True) return process @@ -26,4 +25,4 @@ def disable_extractors_dict(): "SAVE_MEDIA": "false", "SAVE_ARCHIVE_DOT_ORG": "false" }) - return env \ No newline at end of file + return env From 8f3c03a0f9f79a88842afcb73d41adb6004cfb2d Mon Sep 17 00:00:00 2001 From: Cristian Date: Tue, 3 Nov 2020 09:54:02 -0500 Subject: [PATCH 004/235] feat: Initial (and naive) ArchiveResult model --- .../core/migrations/0007_archiveresult.py | 27 +++++++++++++++++++ 
archivebox/core/models.py | 10 +++++++ 2 files changed, 37 insertions(+) create mode 100644 archivebox/core/migrations/0007_archiveresult.py diff --git a/archivebox/core/migrations/0007_archiveresult.py b/archivebox/core/migrations/0007_archiveresult.py new file mode 100644 index 00000000..56f4143e --- /dev/null +++ b/archivebox/core/migrations/0007_archiveresult.py @@ -0,0 +1,27 @@ +# Generated by Django 3.0.8 on 2020-11-03 14:52 + +from django.db import migrations, models +import django.db.models.deletion + + +class Migration(migrations.Migration): + + dependencies = [ + ('core', '0006_auto_20201012_1520'), + ] + + operations = [ + migrations.CreateModel( + name='ArchiveResult', + fields=[ + ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('cmd', models.CharField(default='', max_length=500)), + ('pwd', models.CharField(default='', max_length=200)), + ('cmd_version', models.CharField(default='', max_length=20)), + ('output', models.CharField(default='', max_length=500)), + ('start_ts', models.DateTimeField()), + ('end_ts', models.DateTimeField()), + ('snapshot', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='core.Snapshot')), + ], + ), + ] diff --git a/archivebox/core/models.py b/archivebox/core/models.py index f43fc631..53c43e29 100644 --- a/archivebox/core/models.py +++ b/archivebox/core/models.py @@ -148,3 +148,13 @@ class Snapshot(models.Model): tags_id.append(Tag.objects.get_or_create(name=tag)[0].id) self.tags.clear() self.tags.add(*tags_id) + + +class ArchiveResult(models.Model): + snapshot = models.ForeignKey(Snapshot, on_delete=models.CASCADE) + cmd = models.CharField(max_length=500, default="") + pwd = models.CharField(max_length=200, default="") + cmd_version = models.CharField(max_length=20, default="") + output = models.CharField(max_length=500, default="") + start_ts = models.DateTimeField() + end_ts = models.DateTimeField() \ No newline at end of file From 
309a87e8fecdcd291d64d66add47c46d766dd9e0 Mon Sep 17 00:00:00 2001 From: Cristian Date: Wed, 4 Nov 2020 07:28:02 -0500 Subject: [PATCH 005/235] feat: Add extractor field to the database --- archivebox/core/migrations/0007_archiveresult.py | 3 ++- archivebox/core/models.py | 6 +++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/archivebox/core/migrations/0007_archiveresult.py b/archivebox/core/migrations/0007_archiveresult.py index 56f4143e..1d0da342 100644 --- a/archivebox/core/migrations/0007_archiveresult.py +++ b/archivebox/core/migrations/0007_archiveresult.py @@ -1,4 +1,4 @@ -# Generated by Django 3.0.8 on 2020-11-03 14:52 +# Generated by Django 3.0.8 on 2020-11-04 12:25 from django.db import migrations, models import django.db.models.deletion @@ -21,6 +21,7 @@ class Migration(migrations.Migration): ('output', models.CharField(default='', max_length=500)), ('start_ts', models.DateTimeField()), ('end_ts', models.DateTimeField()), + ('extractor', models.CharField(choices=[('title', 'title'), ('favicon', 'favicon'), ('wget', 'wget'), ('singlefile', 'singlefile'), ('pdf', 'pdf'), ('screenshot', 'screenshot'), ('dom', 'dom'), ('readability', 'readability'), ('mercury', 'mercury'), ('git', 'git'), ('media', 'media'), ('headers', 'headers'), ('archive_org', 'archive_org')], max_length=20)), ('snapshot', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='core.Snapshot')), ], ), diff --git a/archivebox/core/models.py b/archivebox/core/models.py index 53c43e29..944d8612 100644 --- a/archivebox/core/models.py +++ b/archivebox/core/models.py @@ -8,6 +8,9 @@ from django.utils.text import slugify from ..util import parse_date from ..index.schema import Link +from ..extractors import get_default_archive_methods + +EXTRACTORS = [(extractor[0], extractor[0]) for extractor in get_default_archive_methods()] class Tag(models.Model): @@ -157,4 +160,5 @@ class ArchiveResult(models.Model): cmd_version = models.CharField(max_length=20, default="") 
output = models.CharField(max_length=500, default="") start_ts = models.DateTimeField() - end_ts = models.DateTimeField() \ No newline at end of file + end_ts = models.DateTimeField() + extractor = models.CharField(choices=EXTRACTORS, blank=False, max_length=20) \ No newline at end of file From b3e0400bc0b0b24891a63ded515526b0dba38420 Mon Sep 17 00:00:00 2001 From: Cristian Date: Wed, 4 Nov 2020 10:31:20 -0500 Subject: [PATCH 006/235] feat: initial functional version with icons calculated based on archive results --- .../core/migrations/0007_archiveresult.py | 37 ++++++++ archivebox/core/models.py | 6 +- archivebox/core/utils.py | 90 +++++++++++++------ 3 files changed, 104 insertions(+), 29 deletions(-) diff --git a/archivebox/core/migrations/0007_archiveresult.py b/archivebox/core/migrations/0007_archiveresult.py index 1d0da342..c0e1393b 100644 --- a/archivebox/core/migrations/0007_archiveresult.py +++ b/archivebox/core/migrations/0007_archiveresult.py @@ -1,8 +1,43 @@ # Generated by Django 3.0.8 on 2020-11-04 12:25 +import json +from pathlib import Path + from django.db import migrations, models import django.db.models.deletion +from config import CONFIG + + +def forwards_func(apps, schema_editor): + from core.models import EXTRACTORS + + Snapshot = apps.get_model("core", "Snapshot") + ArchiveResult = apps.get_model("core", "ArchiveResult") + + snapshots = Snapshot.objects.all() + for snapshot in snapshots: + out_dir = Path(CONFIG['ARCHIVE_DIR']) / snapshot.timestamp + + try: + with open(out_dir / "index.json", "r") as f: + fs_index = json.load(f) + except Exception as e: + continue + + history = fs_index["history"] + + for extractor in history: + for result in history[extractor]: + ArchiveResult.objects.create(extractor=extractor, snapshot=snapshot, cmd=json.dumps(result["cmd"]), cmd_version=result["cmd_version"], + start_ts=result["start_ts"], end_ts=result["end_ts"], status=result["status"], pwd=result["pwd"], output=result["output"]) + + + +def 
reverse_func(apps, schema_editor): + ArchiveResult = apps.get_model("core", "ArchiveResult") + ArchiveResult.objects.all().delete() + class Migration(migrations.Migration): @@ -18,6 +53,7 @@ class Migration(migrations.Migration): ('cmd', models.CharField(default='', max_length=500)), ('pwd', models.CharField(default='', max_length=200)), ('cmd_version', models.CharField(default='', max_length=20)), + ('status', models.CharField(max_length=10)), ('output', models.CharField(default='', max_length=500)), ('start_ts', models.DateTimeField()), ('end_ts', models.DateTimeField()), @@ -25,4 +61,5 @@ class Migration(migrations.Migration): ('snapshot', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='core.Snapshot')), ], ), + migrations.RunPython(forwards_func, reverse_func), ] diff --git a/archivebox/core/models.py b/archivebox/core/models.py index 944d8612..41976348 100644 --- a/archivebox/core/models.py +++ b/archivebox/core/models.py @@ -161,4 +161,8 @@ class ArchiveResult(models.Model): output = models.CharField(max_length=500, default="") start_ts = models.DateTimeField() end_ts = models.DateTimeField() - extractor = models.CharField(choices=EXTRACTORS, blank=False, max_length=20) \ No newline at end of file + status = models.CharField(max_length=10) + extractor = models.CharField(choices=EXTRACTORS, blank=False, max_length=20) + + def __str__(self): + return self.extractor diff --git a/archivebox/core/utils.py b/archivebox/core/utils.py index 0bb8fceb..56c74b5c 100644 --- a/archivebox/core/utils.py +++ b/archivebox/core/utils.py @@ -2,38 +2,72 @@ from pathlib import Path from django.utils.html import format_html -from core.models import Snapshot +from core.models import Snapshot, ArchiveResult, EXTRACTORS def get_icons(snapshot: Snapshot) -> str: + archive_results = snapshot.archiveresult_set link = snapshot.as_link() canon = link.canonical_outputs() - out_dir = Path(link.link_dir) + output = "" + output_template = '{} ' + icons = { + "singlefile": 
"❶", + "wget": "🆆", + "dom": "🅷", + "pdf": "📄", + "screenshot": "💻", + "media": "📼", + "git": "🅶", + "archive_org": "🏛", + "readability": "🆁", + "mercury": "🅼", + } + exclude = ["favicon"] + # Missing specific entry for WARC - # slow version: highlights icons based on whether files exist or not for that output - # link_tuple = lambda link, method: (link.archive_path, canon[method] or '', canon[method] and (out_dir / (canon[method] or 'notdone')).exists()) - # fast version: all icons are highlighted without checking for outputs in filesystem - link_tuple = lambda link, method: (link.archive_path, canon[method] or '', canon[method] and (out_dir / (canon[method] or 'notdone')).exists()) - return format_html( - '' - '' - '🆆 ' - '🅷 ' - '📄 ' - '💻 ' - '📦 ' - '📼 ' - '🅶 ' - '🏛 ' - '', - *link_tuple(link, 'singlefile_path'), - *link_tuple(link, 'wget_path')[:2], any((out_dir / link.domain).glob('*')), - *link_tuple(link, 'pdf_path'), - *link_tuple(link, 'screenshot_path'), - *link_tuple(link, 'dom_path'), - *link_tuple(link, 'warc_path')[:2], any((out_dir / canon['warc_path']).glob('*.warc.gz')), - *link_tuple(link, 'media_path')[:2], any((out_dir / canon['media_path']).glob('*')), - *link_tuple(link, 'git_path')[:2], any((out_dir / canon['git_path']).glob('*')), - canon['archive_org_path'], (out_dir / 'archive.org.txt').exists(), - ) + for extractor in EXTRACTORS: + result = archive_results.filter(extractor=extractor[0]) + try: + if extractor[0] not in exclude: + output += output_template.format(link.archive_path, canon[f"{extractor[0]}_path"], + result.exists(), extractor[0], icons.get(extractor[0], "?")) + except Exception as e: + print(e) + + return format_html(f'{output}') + +#def get_icons(snapshot: Snapshot) -> str: +# link = snapshot.as_link() +# canon = link.canonical_outputs() +# out_dir = Path(link.link_dir) +# +# # slow version: highlights icons based on whether files exist or not for that output +# # link_tuple = lambda link, method: (link.archive_path, 
canon[method] or '', canon[method] and (out_dir / (canon[method] or 'notdone')).exists()) +# # fast version: all icons are highlighted without checking for outputs in filesystem +# link_tuple = lambda link, method: (link.archive_path, canon[method] or '', canon[method] and (out_dir / (canon[method] or 'notdone')).exists()) +# +# return format_html( +# '' +# '' +# '🆆 ' +# '🅷 ' +# '📄 ' +# '💻 ' +# '📦 ' +# '📼 ' +# '🅶 ' +# '🏛 ' +# '', +# *link_tuple(link, 'singlefile_path'), +# *link_tuple(link, 'wget_path')[:2], any((out_dir / link.domain).glob('*')), +# *link_tuple(link, 'pdf_path'), +# *link_tuple(link, 'screenshot_path'), +# *link_tuple(link, 'dom_path'), +# *link_tuple(link, 'warc_path')[:2], any((out_dir / canon['warc_path']).glob('*.warc.gz')), +# *link_tuple(link, 'media_path')[:2], any((out_dir / canon['media_path']).glob('*')), +# *link_tuple(link, 'git_path')[:2], any((out_dir / canon['git_path']).glob('*')), +# canon['archive_org_path'], (out_dir / 'archive.org.txt').exists(), +# ) +# \ No newline at end of file From 4484491fb77aeafe116aa5226d4c0cfd12e5de61 Mon Sep 17 00:00:00 2001 From: Cristian Date: Wed, 4 Nov 2020 11:22:55 -0500 Subject: [PATCH 007/235] feat: Create ArchiveResult after finishing an extractor process --- archivebox/core/utils.py | 6 ++---- archivebox/extractors/__init__.py | 7 +++++++ 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/archivebox/core/utils.py b/archivebox/core/utils.py index 56c74b5c..78d0cff5 100644 --- a/archivebox/core/utils.py +++ b/archivebox/core/utils.py @@ -1,8 +1,6 @@ -from pathlib import Path - from django.utils.html import format_html -from core.models import Snapshot, ArchiveResult, EXTRACTORS +from core.models import Snapshot, EXTRACTORS def get_icons(snapshot: Snapshot) -> str: @@ -70,4 +68,4 @@ def get_icons(snapshot: Snapshot) -> str: # *link_tuple(link, 'git_path')[:2], any((out_dir / canon['git_path']).glob('*')), # canon['archive_org_path'], (out_dir / 'archive.org.txt').exists(), # ) -# \ No 
newline at end of file +# diff --git a/archivebox/extractors/__init__.py b/archivebox/extractors/__init__.py index 60f20adf..d5d8832f 100644 --- a/archivebox/extractors/__init__.py +++ b/archivebox/extractors/__init__.py @@ -65,6 +65,10 @@ def ignore_methods(to_ignore: List[str]): def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[str]]=None, out_dir: Optional[Path]=None, skip_index: bool=False) -> Link: """download the DOM, PDF, and a screenshot into a folder named after the link's timestamp""" + # TODO: Remove when the input is changed to be a snapshot. Suboptimal approach. + from core.models import Snapshot, ArchiveResult + snapshot = Snapshot.objects.get(url=link.url) + ARCHIVE_METHODS = get_default_archive_methods() if methods: @@ -99,6 +103,9 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s stats[result.status] += 1 log_archive_method_finished(result) + ArchiveResult.objects.create(snapshot=snapshot, extractor=method_name, cmd=result.cmd, cmd_version=result.cmd_version, + output=result.output, pwd=result.pwd, start_ts=result.start_ts, end_ts=result.end_ts, status=result.status) + else: # print('{black} X {}{reset}'.format(method_name, **ANSI)) stats['skipped'] += 1 From f292cface27e6de0a552d2fc1e78fd99f6aa9219 Mon Sep 17 00:00:00 2001 From: Cristian Date: Wed, 4 Nov 2020 14:40:44 -0500 Subject: [PATCH 008/235] fix: Add condition for oneshot when archiving links --- archivebox/extractors/__init__.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/archivebox/extractors/__init__.py b/archivebox/extractors/__init__.py index d5d8832f..23a4f5ef 100644 --- a/archivebox/extractors/__init__.py +++ b/archivebox/extractors/__init__.py @@ -66,8 +66,9 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s """download the DOM, PDF, and a screenshot into a folder named after the link's timestamp""" # TODO: Remove when the input is changed to be a snapshot. 
Suboptimal approach. - from core.models import Snapshot, ArchiveResult - snapshot = Snapshot.objects.get(url=link.url) + if not skip_index: + from core.models import Snapshot, ArchiveResult + snapshot = Snapshot.objects.get(url=link.url) ARCHIVE_METHODS = get_default_archive_methods() @@ -103,7 +104,8 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s stats[result.status] += 1 log_archive_method_finished(result) - ArchiveResult.objects.create(snapshot=snapshot, extractor=method_name, cmd=result.cmd, cmd_version=result.cmd_version, + if not skip_index: + ArchiveResult.objects.create(snapshot=snapshot, extractor=method_name, cmd=result.cmd, cmd_version=result.cmd_version, output=result.output, pwd=result.pwd, start_ts=result.start_ts, end_ts=result.end_ts, status=result.status) else: From d064a3eeffa0a6cb52462ce1f2edb0d6be8f753a Mon Sep 17 00:00:00 2001 From: Cristian Date: Wed, 4 Nov 2020 15:02:54 -0500 Subject: [PATCH 009/235] fix: Handle case when update tries to re-add a link that is not in the sql index --- archivebox/extractors/__init__.py | 6 +++++- tests/test_update.py | 3 ++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/archivebox/extractors/__init__.py b/archivebox/extractors/__init__.py index 23a4f5ef..e27b9d80 100644 --- a/archivebox/extractors/__init__.py +++ b/archivebox/extractors/__init__.py @@ -8,6 +8,7 @@ from datetime import datetime from django.db.models import QuerySet from ..index.schema import Link +from ..index.sql import write_link_to_sql_index from ..index import ( load_link_details, write_link_details, @@ -68,7 +69,10 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s # TODO: Remove when the input is changed to be a snapshot. Suboptimal approach. 
if not skip_index: from core.models import Snapshot, ArchiveResult - snapshot = Snapshot.objects.get(url=link.url) + try: + snapshot = Snapshot.objects.get(url=link.url) # TODO: This will be unnecessary once everything is a snapshot + except Snapshot.DoesNotExist: + write_link_to_sql_index(link) ARCHIVE_METHODS = get_default_archive_methods() diff --git a/tests/test_update.py b/tests/test_update.py index 238a92d9..29db0174 100644 --- a/tests/test_update.py +++ b/tests/test_update.py @@ -6,7 +6,7 @@ def test_update_status_invalid(tmp_path, process, disable_extractors_dict): subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], capture_output=True, env=disable_extractors_dict) assert list((tmp_path / "archive").iterdir()) != [] - subprocess.run(['archivebox', 'remove', 'http://127.0.0.1:8080/static/example.com.html', '--yes'], capture_output=True) + a_process = subprocess.run(['archivebox', 'remove', 'http://127.0.0.1:8080/static/example.com.html', '--yes'], capture_output=True) conn = sqlite3.connect(str(tmp_path / "index.sqlite3")) c = conn.cursor() @@ -17,6 +17,7 @@ def test_update_status_invalid(tmp_path, process, disable_extractors_dict): assert link is None update_process = subprocess.run(['archivebox', 'update', '--status=invalid'], capture_output=True, env=disable_extractors_dict) + #breakpoint() conn = sqlite3.connect(str(tmp_path / "index.sqlite3")) c = conn.cursor() From 33182fd53c0d96f46576ee38551a7ac4a50ee534 Mon Sep 17 00:00:00 2001 From: Cristian Date: Wed, 4 Nov 2020 15:07:45 -0500 Subject: [PATCH 010/235] fix: Add missing assignation --- archivebox/extractors/__init__.py | 2 +- tests/test_update.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/archivebox/extractors/__init__.py b/archivebox/extractors/__init__.py index e27b9d80..ef5ef446 100644 --- a/archivebox/extractors/__init__.py +++ b/archivebox/extractors/__init__.py @@ -72,7 +72,7 @@ def archive_link(link: Link, overwrite: bool=False, 
methods: Optional[Iterable[s try: snapshot = Snapshot.objects.get(url=link.url) # TODO: This will be unnecessary once everything is a snapshot except Snapshot.DoesNotExist: - write_link_to_sql_index(link) + snapshot = write_link_to_sql_index(link) ARCHIVE_METHODS = get_default_archive_methods() diff --git a/tests/test_update.py b/tests/test_update.py index 29db0174..95a61ce9 100644 --- a/tests/test_update.py +++ b/tests/test_update.py @@ -17,7 +17,6 @@ def test_update_status_invalid(tmp_path, process, disable_extractors_dict): assert link is None update_process = subprocess.run(['archivebox', 'update', '--status=invalid'], capture_output=True, env=disable_extractors_dict) - #breakpoint() conn = sqlite3.connect(str(tmp_path / "index.sqlite3")) c = conn.cursor() From 71655220ad8554458978a078e604cb2b57fa2e1c Mon Sep 17 00:00:00 2001 From: Cristian Date: Thu, 5 Nov 2020 07:54:40 -0500 Subject: [PATCH 011/235] feat: Add warc to list and limit check to succeeded archive results --- archivebox/core/utils.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/archivebox/core/utils.py b/archivebox/core/utils.py index 78d0cff5..1a073fa4 100644 --- a/archivebox/core/utils.py +++ b/archivebox/core/utils.py @@ -20,17 +20,21 @@ def get_icons(snapshot: Snapshot) -> str: "archive_org": "🏛", "readability": "🆁", "mercury": "🅼", + "warc": "📦" } exclude = ["favicon"] # Missing specific entry for WARC - for extractor in EXTRACTORS: - result = archive_results.filter(extractor=extractor[0]) + result = archive_results.filter(extractor=extractor[0], status="succeeded") try: if extractor[0] not in exclude: output += output_template.format(link.archive_path, canon[f"{extractor[0]}_path"], result.exists(), extractor[0], icons.get(extractor[0], "?")) + if extractor[0] == "wget": + extractor = "warc" + output += output_template.format(link.archive_path, canon[f"{extractor}_path"], + result.exists(), extractor, icons.get(extractor, "?")) except Exception as e: print(e) From 
508a0bb06ebd15bcb63407328a5d4747fb10d977 Mon Sep 17 00:00:00 2001 From: Cristian Date: Tue, 10 Nov 2020 12:38:29 -0500 Subject: [PATCH 012/235] refactor: Unpack extractors tuple instead of using the index to access the relevant information --- archivebox/core/utils.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/archivebox/core/utils.py b/archivebox/core/utils.py index 1a073fa4..228918d4 100644 --- a/archivebox/core/utils.py +++ b/archivebox/core/utils.py @@ -25,16 +25,19 @@ def get_icons(snapshot: Snapshot) -> str: exclude = ["favicon"] # Missing specific entry for WARC - for extractor in EXTRACTORS: - result = archive_results.filter(extractor=extractor[0], status="succeeded") + for extractor, _ in EXTRACTORS: + result = archive_results.filter(extractor=extractor, status="succeeded") + path, exists = link.archive_path, result.exists() try: - if extractor[0] not in exclude: - output += output_template.format(link.archive_path, canon[f"{extractor[0]}_path"], - result.exists(), extractor[0], icons.get(extractor[0], "?")) - if extractor[0] == "wget": - extractor = "warc" - output += output_template.format(link.archive_path, canon[f"{extractor}_path"], - result.exists(), extractor, icons.get(extractor, "?")) + if extractor not in exclude: + output += output_template.format(path, canon[f"{extractor}_path"], + exists, extractor, icons.get(extractor, "?")) + if extractor == "wget": + # warc isn't technically it's own extractor, so we have to add it after wget + + output += output_template.format(path, canon[f"warc_path"], + exists, "warc", icons.get("warc", "?")) + except Exception as e: print(e) From f7f0bebdcc021623a438e7975982523cdbe8bea8 Mon Sep 17 00:00:00 2001 From: Cristian Date: Wed, 11 Nov 2020 15:26:54 -0500 Subject: [PATCH 013/235] feat: Modify migration reverse function to restore index (WIP) --- .../core/migrations/0007_archiveresult.py | 22 ++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff 
--git a/archivebox/core/migrations/0007_archiveresult.py b/archivebox/core/migrations/0007_archiveresult.py index c0e1393b..74d3a6b5 100644 --- a/archivebox/core/migrations/0007_archiveresult.py +++ b/archivebox/core/migrations/0007_archiveresult.py @@ -33,9 +33,29 @@ def forwards_func(apps, schema_editor): start_ts=result["start_ts"], end_ts=result["end_ts"], status=result["status"], pwd=result["pwd"], output=result["output"]) +def verify_json_index_integrity(results): + results = snapshot.archiveresult_set.all() + out_dir = Path(CONFIG['ARCHIVE_DIR']) / snapshot.timestamp + with open(out_dir / "index.json", "r") as f: + index = json.load(f) + + history = index["history"] + extractors = [extractor for extractor in history] + index_results = [(result, extractor) for result in history[extractor]] + flattened_results = [(result["start_ts"], extractor) for result, extractor in index_results] + + missing = [result for result in results if result.start_ts not in flattened_results] + + #process missing elements here. Re-add to the index.json + + + def reverse_func(apps, schema_editor): - ArchiveResult = apps.get_model("core", "ArchiveResult") + Snapshot = apps.get_model("core", "Snapshot") + for snapshot in Snapshot.objects.all(): + verify_json_index_integrity(snapshot) + ArchiveResult.objects.all().delete() From b237e412df2c63399394a7ad0370096f7cd1009d Mon Sep 17 00:00:00 2001 From: Cristian Date: Thu, 12 Nov 2020 10:30:41 -0500 Subject: [PATCH 014/235] feat: Finish reversal. 
Add ArchiveResults that are not found in the index.json --- .../core/migrations/0007_archiveresult.py | 20 ++++++++++++------- archivebox/core/utils.py | 2 +- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/archivebox/core/migrations/0007_archiveresult.py b/archivebox/core/migrations/0007_archiveresult.py index 74d3a6b5..5da97e29 100644 --- a/archivebox/core/migrations/0007_archiveresult.py +++ b/archivebox/core/migrations/0007_archiveresult.py @@ -7,6 +7,7 @@ from django.db import migrations, models import django.db.models.deletion from config import CONFIG +from index.json import to_json def forwards_func(apps, schema_editor): @@ -33,26 +34,31 @@ def forwards_func(apps, schema_editor): start_ts=result["start_ts"], end_ts=result["end_ts"], status=result["status"], pwd=result["pwd"], output=result["output"]) -def verify_json_index_integrity(results): +def verify_json_index_integrity(snapshot): results = snapshot.archiveresult_set.all() out_dir = Path(CONFIG['ARCHIVE_DIR']) / snapshot.timestamp with open(out_dir / "index.json", "r") as f: index = json.load(f) history = index["history"] - extractors = [extractor for extractor in history] - index_results = [(result, extractor) for result in history[extractor]] - flattened_results = [(result["start_ts"], extractor) for result, extractor in index_results] + index_results = [result for extractor in history for result in history[extractor]] + flattened_results = [result["start_ts"] for result in index_results] - missing = [result for result in results if result.start_ts not in flattened_results] - - #process missing elements here. 
Re-add to the index.json + missing_results = [result for result in results if result.start_ts.isoformat() not in flattened_results] + for missing in missing_results: + index["history"][missing.extractor].append({"cmd": missing.cmd, "cmd_version": missing.cmd_version, "end_ts": missing.end_ts.isoformat(), + "start_ts": missing.start_ts.isoformat(), "pwd": missing.pwd, "output": missing.output, + "schema": "ArchiveResult", "status": missing.status}) + json_index = to_json(index) + with open(out_dir / "index.json", "w") as f: + f.write(json_index) def reverse_func(apps, schema_editor): Snapshot = apps.get_model("core", "Snapshot") + ArchiveResult = apps.get_model("core", "ArchiveResult") for snapshot in Snapshot.objects.all(): verify_json_index_integrity(snapshot) diff --git a/archivebox/core/utils.py b/archivebox/core/utils.py index 228918d4..a5fa2669 100644 --- a/archivebox/core/utils.py +++ b/archivebox/core/utils.py @@ -35,7 +35,7 @@ def get_icons(snapshot: Snapshot) -> str: if extractor == "wget": # warc isn't technically it's own extractor, so we have to add it after wget - output += output_template.format(path, canon[f"warc_path"], + output += output_template.format(path, canon["warc_path"], exists, "warc", icons.get("warc", "?")) except Exception as e: From e594e6a75a2895077029d97b88d7b6f8b580885f Mon Sep 17 00:00:00 2001 From: Cristian Date: Thu, 12 Nov 2020 10:57:31 -0500 Subject: [PATCH 015/235] feat: WARC link points to the first warc result in target path --- archivebox/core/utils.py | 8 +++++--- archivebox/themes/default/base.html | 6 +++++- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/archivebox/core/utils.py b/archivebox/core/utils.py index a5fa2669..67b8004d 100644 --- a/archivebox/core/utils.py +++ b/archivebox/core/utils.py @@ -1,6 +1,7 @@ from django.utils.html import format_html from core.models import Snapshot, EXTRACTORS +from pathlib import Path def get_icons(snapshot: Snapshot) -> str: @@ -34,9 +35,10 @@ def 
get_icons(snapshot: Snapshot) -> str: exists, extractor, icons.get(extractor, "?")) if extractor == "wget": # warc isn't technically it's own extractor, so we have to add it after wget - - output += output_template.format(path, canon["warc_path"], - exists, "warc", icons.get("warc", "?")) + exists = list((Path(path) / canon["warc_path"]).glob("*.warc.gz")) + if exists: + output += output_template.format(exists[0], "", + True, "warc", icons.get("warc", "?")) except Exception as e: print(e) diff --git a/archivebox/themes/default/base.html b/archivebox/themes/default/base.html index ed7d1be9..cacd0597 100644 --- a/archivebox/themes/default/base.html +++ b/archivebox/themes/default/base.html @@ -223,6 +223,10 @@ .title-col a { color: black; } + + .exists-False { + display: none; + } @@ -283,4 +287,4 @@ - \ No newline at end of file + From 8cfad64271cf72ed4572c4d3a2c5ff6885bc8b95 Mon Sep 17 00:00:00 2001 From: Cristian Date: Thu, 12 Nov 2020 11:09:34 -0500 Subject: [PATCH 016/235] feat: Add specific logic for archive_org icon --- archivebox/core/utils.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/archivebox/core/utils.py b/archivebox/core/utils.py index 67b8004d..6266024b 100644 --- a/archivebox/core/utils.py +++ b/archivebox/core/utils.py @@ -23,7 +23,7 @@ def get_icons(snapshot: Snapshot) -> str: "mercury": "🅼", "warc": "📦" } - exclude = ["favicon"] + exclude = ["favicon", "archive_org"] # Missing specific entry for WARC for extractor, _ in EXTRACTORS: @@ -40,6 +40,14 @@ def get_icons(snapshot: Snapshot) -> str: output += output_template.format(exists[0], "", True, "warc", icons.get("warc", "?")) + if extractor == "archive_org" and exists: + # The check for archive_org is different, so it has to be handled separately + target_path = Path(path) / "archive.org.txt" + exists = target_path.exists() + if exists: + output += '{} '.format(canon["archive_org_path"], + True, "archive_org", icons.get("archive_org", "?")) + except Exception as e: 
print(e) From c565fad75cf5f6256a0ce70febb7c2246cbd1b42 Mon Sep 17 00:00:00 2001 From: Cristian Date: Thu, 12 Nov 2020 11:37:56 -0500 Subject: [PATCH 017/235] feat: Use prefetch related to reduce the number of queries to the database on public index view --- archivebox/core/utils.py | 83 +++++++++++++--------------------------- archivebox/core/views.py | 1 + 2 files changed, 27 insertions(+), 57 deletions(-) diff --git a/archivebox/core/utils.py b/archivebox/core/utils.py index 6266024b..14c40eaf 100644 --- a/archivebox/core/utils.py +++ b/archivebox/core/utils.py @@ -1,15 +1,16 @@ from django.utils.html import format_html from core.models import Snapshot, EXTRACTORS +from core.settings import DEBUG from pathlib import Path def get_icons(snapshot: Snapshot) -> str: - archive_results = snapshot.archiveresult_set + archive_results = list(snapshot.archiveresult_set.all()) link = snapshot.as_link() canon = link.canonical_outputs() output = "" - output_template = '{} ' + output_template = '{} ' icons = { "singlefile": "❶", "wget": "🆆", @@ -27,62 +28,30 @@ def get_icons(snapshot: Snapshot) -> str: # Missing specific entry for WARC for extractor, _ in EXTRACTORS: - result = archive_results.filter(extractor=extractor, status="succeeded") - path, exists = link.archive_path, result.exists() - try: - if extractor not in exclude: - output += output_template.format(path, canon[f"{extractor}_path"], - exists, extractor, icons.get(extractor, "?")) - if extractor == "wget": - # warc isn't technically it's own extractor, so we have to add it after wget - exists = list((Path(path) / canon["warc_path"]).glob("*.warc.gz")) - if exists: - output += output_template.format(exists[0], "", - True, "warc", icons.get("warc", "?")) + for result in archive_results: + if result.extractor != extractor or result.status != "succeeded": + continue + path = link.archive_path + try: + if extractor not in exclude: + output += output_template.format(path, canon[f"{extractor}_path"], + extractor, 
icons.get(extractor, "?")) + if extractor == "wget": + # warc isn't technically it's own extractor, so we have to add it after wget + exists = list((Path(path) / canon["warc_path"]).glob("*.warc.gz")) + if exists: + output += output_template.format(exists[0], "", + "warc", icons.get("warc", "?")) - if extractor == "archive_org" and exists: - # The check for archive_org is different, so it has to be handled separately - target_path = Path(path) / "archive.org.txt" - exists = target_path.exists() - if exists: - output += '{} '.format(canon["archive_org_path"], - True, "archive_org", icons.get("archive_org", "?")) + if extractor == "archive_org": + # The check for archive_org is different, so it has to be handled separately + target_path = Path(path) / "archive.org.txt" + exists = target_path.exists() + if exists: + output += '{} '.format(canon["archive_org_path"], + "archive_org", icons.get("archive_org", "?")) - except Exception as e: - print(e) + except Exception as e: + print(e) return format_html(f'{output}') - -#def get_icons(snapshot: Snapshot) -> str: -# link = snapshot.as_link() -# canon = link.canonical_outputs() -# out_dir = Path(link.link_dir) -# -# # slow version: highlights icons based on whether files exist or not for that output -# # link_tuple = lambda link, method: (link.archive_path, canon[method] or '', canon[method] and (out_dir / (canon[method] or 'notdone')).exists()) -# # fast version: all icons are highlighted without checking for outputs in filesystem -# link_tuple = lambda link, method: (link.archive_path, canon[method] or '', canon[method] and (out_dir / (canon[method] or 'notdone')).exists()) -# -# return format_html( -# '' -# '' -# '🆆 ' -# '🅷 ' -# '📄 ' -# '💻 ' -# '📦 ' -# '📼 ' -# '🅶 ' -# '🏛 ' -# '', -# *link_tuple(link, 'singlefile_path'), -# *link_tuple(link, 'wget_path')[:2], any((out_dir / link.domain).glob('*')), -# *link_tuple(link, 'pdf_path'), -# *link_tuple(link, 'screenshot_path'), -# *link_tuple(link, 'dom_path'), -# 
*link_tuple(link, 'warc_path')[:2], any((out_dir / canon['warc_path']).glob('*.warc.gz')), -# *link_tuple(link, 'media_path')[:2], any((out_dir / canon['media_path']).glob('*')), -# *link_tuple(link, 'git_path')[:2], any((out_dir / canon['git_path']).glob('*')), -# canon['archive_org_path'], (out_dir / 'archive.org.txt').exists(), -# ) -# diff --git a/archivebox/core/views.py b/archivebox/core/views.py index 7cd8b104..ee540821 100644 --- a/archivebox/core/views.py +++ b/archivebox/core/views.py @@ -98,6 +98,7 @@ class PublicArchiveView(ListView): query = self.request.GET.get('q') if query: qs = Snapshot.objects.filter(title__icontains=query) + qs = qs.prefetch_related("archiveresult_set").all() for snapshot in qs: snapshot.icons = get_icons(snapshot) return qs From 0f13087a0949800a54753880c1dc5d35c95bef05 Mon Sep 17 00:00:00 2001 From: Cristian Date: Thu, 12 Nov 2020 13:58:13 -0500 Subject: [PATCH 018/235] refactor: Remove unneeded prefetch related --- archivebox/core/utils.py | 5 ++--- archivebox/core/views.py | 1 - 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/archivebox/core/utils.py b/archivebox/core/utils.py index 14c40eaf..3c310525 100644 --- a/archivebox/core/utils.py +++ b/archivebox/core/utils.py @@ -1,12 +1,11 @@ from django.utils.html import format_html from core.models import Snapshot, EXTRACTORS -from core.settings import DEBUG from pathlib import Path def get_icons(snapshot: Snapshot) -> str: - archive_results = list(snapshot.archiveresult_set.all()) + archive_results = snapshot.archiveresult_set.filter(status="succeeded") link = snapshot.as_link() canon = link.canonical_outputs() output = "" @@ -29,7 +28,7 @@ def get_icons(snapshot: Snapshot) -> str: for extractor, _ in EXTRACTORS: for result in archive_results: - if result.extractor != extractor or result.status != "succeeded": + if result.extractor != extractor: continue path = link.archive_path try: diff --git a/archivebox/core/views.py b/archivebox/core/views.py index 
ee540821..7cd8b104 100644 --- a/archivebox/core/views.py +++ b/archivebox/core/views.py @@ -98,7 +98,6 @@ class PublicArchiveView(ListView): query = self.request.GET.get('q') if query: qs = Snapshot.objects.filter(title__icontains=query) - qs = qs.prefetch_related("archiveresult_set").all() for snapshot in qs: snapshot.icons = get_icons(snapshot) return qs From 34a1a6d30dd588b6d840c1e9162809e191f652ba Mon Sep 17 00:00:00 2001 From: Cristian Date: Mon, 23 Nov 2020 18:28:43 -0500 Subject: [PATCH 019/235] fix: Update model according to code review --- .../core/migrations/0007_archiveresult.py | 12 ++++++------ archivebox/core/models.py | 17 +++++++++++------ archivebox/themes/default/base.html | 2 +- 3 files changed, 18 insertions(+), 13 deletions(-) diff --git a/archivebox/core/migrations/0007_archiveresult.py b/archivebox/core/migrations/0007_archiveresult.py index 5da97e29..4b8a074b 100644 --- a/archivebox/core/migrations/0007_archiveresult.py +++ b/archivebox/core/migrations/0007_archiveresult.py @@ -76,14 +76,14 @@ class Migration(migrations.Migration): name='ArchiveResult', fields=[ ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), - ('cmd', models.CharField(default='', max_length=500)), - ('pwd', models.CharField(default='', max_length=200)), - ('cmd_version', models.CharField(default='', max_length=20)), - ('status', models.CharField(max_length=10)), - ('output', models.CharField(default='', max_length=500)), + ('cmd', models.CharField(max_length=500)), + ('pwd', models.CharField(max_length=200)), + ('cmd_version', models.CharField(max_length=32)), + ('status', models.CharField(choices=[('succeeded', 'succeeded'), ('failed', 'failed'), ('skipped', 'skipped')], max_length=16)), + ('output', models.CharField(max_length=500)), ('start_ts', models.DateTimeField()), ('end_ts', models.DateTimeField()), - ('extractor', models.CharField(choices=[('title', 'title'), ('favicon', 'favicon'), ('wget', 'wget'), 
('singlefile', 'singlefile'), ('pdf', 'pdf'), ('screenshot', 'screenshot'), ('dom', 'dom'), ('readability', 'readability'), ('mercury', 'mercury'), ('git', 'git'), ('media', 'media'), ('headers', 'headers'), ('archive_org', 'archive_org')], max_length=20)), + ('extractor', models.CharField(choices=[('title', 'title'), ('favicon', 'favicon'), ('wget', 'wget'), ('singlefile', 'singlefile'), ('pdf', 'pdf'), ('screenshot', 'screenshot'), ('dom', 'dom'), ('readability', 'readability'), ('mercury', 'mercury'), ('git', 'git'), ('media', 'media'), ('headers', 'headers'), ('archive_org', 'archive_org')], max_length=32)), ('snapshot', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='core.Snapshot')), ], ), diff --git a/archivebox/core/models.py b/archivebox/core/models.py index 41976348..c273c072 100644 --- a/archivebox/core/models.py +++ b/archivebox/core/models.py @@ -11,6 +11,11 @@ from ..index.schema import Link from ..extractors import get_default_archive_methods EXTRACTORS = [(extractor[0], extractor[0]) for extractor in get_default_archive_methods()] +STATUS_CHOICES = [ + ("succeeded", "succeeded"), + ("failed", "failed"), + ("skipped", "skipped") +] class Tag(models.Model): @@ -155,14 +160,14 @@ class Snapshot(models.Model): class ArchiveResult(models.Model): snapshot = models.ForeignKey(Snapshot, on_delete=models.CASCADE) - cmd = models.CharField(max_length=500, default="") - pwd = models.CharField(max_length=200, default="") - cmd_version = models.CharField(max_length=20, default="") - output = models.CharField(max_length=500, default="") + cmd = models.CharField(max_length=500) + pwd = models.CharField(max_length=200) + cmd_version = models.CharField(max_length=32) + output = models.CharField(max_length=500) start_ts = models.DateTimeField() end_ts = models.DateTimeField() - status = models.CharField(max_length=10) - extractor = models.CharField(choices=EXTRACTORS, blank=False, max_length=20) + status = models.CharField(max_length=16, 
choices=STATUS_CHOICES) + extractor = models.CharField(choices=EXTRACTORS, blank=False, max_length=32) def __str__(self): return self.extractor diff --git a/archivebox/themes/default/base.html b/archivebox/themes/default/base.html index cacd0597..f778da16 100644 --- a/archivebox/themes/default/base.html +++ b/archivebox/themes/default/base.html @@ -225,7 +225,7 @@ } .exists-False { - display: none; + opacity: 0.1; } From f84f288befd9a1cb773c146b6da7ba05273ac3d7 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Fri, 27 Nov 2020 00:01:34 -0500 Subject: [PATCH 020/235] Apply suggestions from code review minor nit --- archivebox/core/models.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/archivebox/core/models.py b/archivebox/core/models.py index c273c072..48ebd43d 100644 --- a/archivebox/core/models.py +++ b/archivebox/core/models.py @@ -161,13 +161,13 @@ class Snapshot(models.Model): class ArchiveResult(models.Model): snapshot = models.ForeignKey(Snapshot, on_delete=models.CASCADE) cmd = models.CharField(max_length=500) - pwd = models.CharField(max_length=200) + pwd = models.CharField(max_length=256) cmd_version = models.CharField(max_length=32) - output = models.CharField(max_length=500) + output = models.CharField(max_length=512) start_ts = models.DateTimeField() end_ts = models.DateTimeField() status = models.CharField(max_length=16, choices=STATUS_CHOICES) - extractor = models.CharField(choices=EXTRACTORS, blank=False, max_length=32) + extractor = models.CharField(choices=EXTRACTORS, max_length=32) def __str__(self): return self.extractor From f61e6a74bb1279124fc6ee20d7a053e73eb5bf3d Mon Sep 17 00:00:00 2001 From: Cristian Date: Fri, 27 Nov 2020 15:53:34 -0500 Subject: [PATCH 021/235] feat: Re-add unused icons in list view --- archivebox/core/utils.py | 47 ++++++++++++++--------------- archivebox/themes/default/base.html | 1 + 2 files changed, 24 insertions(+), 24 deletions(-) diff --git a/archivebox/core/utils.py 
b/archivebox/core/utils.py index 3c310525..9804d6ee 100644 --- a/archivebox/core/utils.py +++ b/archivebox/core/utils.py @@ -1,4 +1,5 @@ from django.utils.html import format_html +from collections import defaultdict from core.models import Snapshot, EXTRACTORS from pathlib import Path @@ -7,9 +8,10 @@ from pathlib import Path def get_icons(snapshot: Snapshot) -> str: archive_results = snapshot.archiveresult_set.filter(status="succeeded") link = snapshot.as_link() + path = link.archive_path canon = link.canonical_outputs() output = "" - output_template = '{} ' + output_template = '{} ' icons = { "singlefile": "❶", "wget": "🆆", @@ -23,34 +25,31 @@ def get_icons(snapshot: Snapshot) -> str: "mercury": "🅼", "warc": "📦" } - exclude = ["favicon", "archive_org"] + exclude = ["favicon", "title", "headers", "archive_org"] # Missing specific entry for WARC + extractor_items = defaultdict(lambda: None) for extractor, _ in EXTRACTORS: for result in archive_results: - if result.extractor != extractor: - continue - path = link.archive_path - try: - if extractor not in exclude: - output += output_template.format(path, canon[f"{extractor}_path"], - extractor, icons.get(extractor, "?")) - if extractor == "wget": - # warc isn't technically it's own extractor, so we have to add it after wget - exists = list((Path(path) / canon["warc_path"]).glob("*.warc.gz")) - if exists: - output += output_template.format(exists[0], "", - "warc", icons.get("warc", "?")) + if result.extractor == extractor: + extractor_items[extractor] = result - if extractor == "archive_org": - # The check for archive_org is different, so it has to be handled separately - target_path = Path(path) / "archive.org.txt" - exists = target_path.exists() - if exists: - output += '{} '.format(canon["archive_org_path"], - "archive_org", icons.get("archive_org", "?")) + for extractor, _ in EXTRACTORS: + if extractor not in exclude: + exists = extractor_items[extractor] is not None + output += output_template.format(path, 
canon[f"{extractor}_path"], str(exists), + extractor, icons.get(extractor, "?")) + if extractor == "wget": + # warc isn't technically it's own extractor, so we have to add it after wget + exists = list((Path(path) / canon["warc_path"]).glob("*.warc.gz")) + if exists: + output += output_template.format(exists[0], "", str(bool(exists)), "warc", icons.get("warc", "?")) - except Exception as e: - print(e) + if extractor == "archive_org": + # The check for archive_org is different, so it has to be handled separately + target_path = Path(path) / "archive.org.txt" + exists = target_path.exists() + output += '{} '.format(canon["archive_org_path"], str(exists), + "archive_org", icons.get("archive_org", "?")) return format_html(f'{output}') diff --git a/archivebox/themes/default/base.html b/archivebox/themes/default/base.html index f778da16..77d912d5 100644 --- a/archivebox/themes/default/base.html +++ b/archivebox/themes/default/base.html @@ -226,6 +226,7 @@ .exists-False { opacity: 0.1; + pointer-events: none; } From 4b3f72202b92d2ab04baa99780a41fa302bf94e6 Mon Sep 17 00:00:00 2001 From: Cristian Date: Fri, 27 Nov 2020 16:23:27 -0500 Subject: [PATCH 022/235] feat: Bump django, update migration and change cmd to use JSONField --- archivebox.egg-info/SOURCES.txt | 1 + archivebox.egg-info/requires.txt | 3 +-- archivebox/core/migrations/0007_archiveresult.py | 8 ++++---- archivebox/core/models.py | 2 +- setup.py | 2 +- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/archivebox.egg-info/SOURCES.txt b/archivebox.egg-info/SOURCES.txt index eee55cc5..471100ad 100644 --- a/archivebox.egg-info/SOURCES.txt +++ b/archivebox.egg-info/SOURCES.txt @@ -57,6 +57,7 @@ archivebox/core/migrations/0003_auto_20200630_1034.py archivebox/core/migrations/0004_auto_20200713_1552.py archivebox/core/migrations/0005_auto_20200728_0326.py archivebox/core/migrations/0006_auto_20201012_1520.py +archivebox/core/migrations/0007_archiveresult.py archivebox/core/migrations/__init__.py 
archivebox/extractors/__init__.py archivebox/extractors/archive_org.py diff --git a/archivebox.egg-info/requires.txt b/archivebox.egg-info/requires.txt index 71dc253d..e0e17f19 100644 --- a/archivebox.egg-info/requires.txt +++ b/archivebox.egg-info/requires.txt @@ -2,7 +2,7 @@ requests==2.24.0 atomicwrites==1.4.0 mypy-extensions==0.4.3 base32-crockford==0.3.0 -django==3.0.8 +django==3.1.3 django-extensions==3.0.3 dateparser ipython @@ -13,7 +13,6 @@ w3lib==1.22.0 [dev] setuptools -wheel twine flake8 ipdb diff --git a/archivebox/core/migrations/0007_archiveresult.py b/archivebox/core/migrations/0007_archiveresult.py index 4b8a074b..898e0f93 100644 --- a/archivebox/core/migrations/0007_archiveresult.py +++ b/archivebox/core/migrations/0007_archiveresult.py @@ -30,7 +30,7 @@ def forwards_func(apps, schema_editor): for extractor in history: for result in history[extractor]: - ArchiveResult.objects.create(extractor=extractor, snapshot=snapshot, cmd=json.dumps(result["cmd"]), cmd_version=result["cmd_version"], + ArchiveResult.objects.create(extractor=extractor, snapshot=snapshot, cmd=result["cmd"], cmd_version=result["cmd_version"], start_ts=result["start_ts"], end_ts=result["end_ts"], status=result["status"], pwd=result["pwd"], output=result["output"]) @@ -76,11 +76,11 @@ class Migration(migrations.Migration): name='ArchiveResult', fields=[ ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), - ('cmd', models.CharField(max_length=500)), - ('pwd', models.CharField(max_length=200)), + ('cmd', models.JSONField()), + ('pwd', models.CharField(max_length=256)), ('cmd_version', models.CharField(max_length=32)), ('status', models.CharField(choices=[('succeeded', 'succeeded'), ('failed', 'failed'), ('skipped', 'skipped')], max_length=16)), - ('output', models.CharField(max_length=500)), + ('output', models.CharField(max_length=512)), ('start_ts', models.DateTimeField()), ('end_ts', models.DateTimeField()), ('extractor', 
models.CharField(choices=[('title', 'title'), ('favicon', 'favicon'), ('wget', 'wget'), ('singlefile', 'singlefile'), ('pdf', 'pdf'), ('screenshot', 'screenshot'), ('dom', 'dom'), ('readability', 'readability'), ('mercury', 'mercury'), ('git', 'git'), ('media', 'media'), ('headers', 'headers'), ('archive_org', 'archive_org')], max_length=32)), diff --git a/archivebox/core/models.py b/archivebox/core/models.py index 48ebd43d..9d893490 100644 --- a/archivebox/core/models.py +++ b/archivebox/core/models.py @@ -160,7 +160,7 @@ class Snapshot(models.Model): class ArchiveResult(models.Model): snapshot = models.ForeignKey(Snapshot, on_delete=models.CASCADE) - cmd = models.CharField(max_length=500) + cmd = models.JSONField() pwd = models.CharField(max_length=256) cmd_version = models.CharField(max_length=32) output = models.CharField(max_length=512) diff --git a/setup.py b/setup.py index cdec8133..c540bc07 100755 --- a/setup.py +++ b/setup.py @@ -52,7 +52,7 @@ setuptools.setup( "atomicwrites==1.4.0", "mypy-extensions==0.4.3", "base32-crockford==0.3.0", - "django==3.0.8", + "django==3.1.3", "django-extensions==3.0.3", "dateparser", From 00bb55203ec7f585e5b31d233b9f8a94dc53f830 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Fri, 27 Nov 2020 23:45:49 -0500 Subject: [PATCH 023/235] always show WARC icon with opacity set based on exists --- archivebox/core/utils.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/archivebox/core/utils.py b/archivebox/core/utils.py index 9804d6ee..3df46a51 100644 --- a/archivebox/core/utils.py +++ b/archivebox/core/utils.py @@ -42,8 +42,7 @@ def get_icons(snapshot: Snapshot) -> str: if extractor == "wget": # warc isn't technically it's own extractor, so we have to add it after wget exists = list((Path(path) / canon["warc_path"]).glob("*.warc.gz")) - if exists: - output += output_template.format(exists[0], "", str(bool(exists)), "warc", icons.get("warc", "?")) + output += output_template.format(exists[0] if exists else '#', 
canon["warc_path"], str(bool(exists)), "warc", icons.get("warc", "?")) if extractor == "archive_org": # The check for archive_org is different, so it has to be handled separately From c2bd71667c970393b1221347dd4f3bd9f97e008a Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Sat, 28 Nov 2020 00:15:31 -0500 Subject: [PATCH 024/235] fix github actions check --- .github/workflows/test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 8b26eca6..769d9bd5 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -149,7 +149,7 @@ jobs: docker-compose up -d sleep 5 curl --silent --location 'http://127.0.0.1:8000' | grep 'ArchiveBox' - curl --silent --location 'http://127.0.0.1:8000/static/admin/js/jquery.init.js' | grep 'django.jQuery' + curl --silent --location 'http://127.0.0.1:8000/static/admin/js/jquery.init.js' | grep 'window.django' - name: Check added urls show up in index run: | From afca9cb3bde9ab575a1c74dbfb8ffcc544668336 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Sat, 28 Nov 2020 00:16:15 -0500 Subject: [PATCH 025/235] bump package version --- archivebox.egg-info/PKG-INFO | 103 +++++++++++++++++++---------------- package.json | 2 +- 2 files changed, 58 insertions(+), 47 deletions(-) diff --git a/archivebox.egg-info/PKG-INFO b/archivebox.egg-info/PKG-INFO index 1d528824..7b763c7e 100644 --- a/archivebox.egg-info/PKG-INFO +++ b/archivebox.egg-info/PKG-INFO @@ -1,6 +1,6 @@ Metadata-Version: 2.1 Name: archivebox -Version: 0.4.24 +Version: 0.5.0 Summary: The self-hosted internet archive. Home-page: https://github.com/ArchiveBox/ArchiveBox Author: Nick Sweeting @@ -41,31 +41,62 @@ Description:

- ArchiveBox is a powerful self-hosted internet archiving solution written in Python 3. You feed it URLs of pages you want to archive, and it saves them to disk in a varitety of formats depending on the configuration and the content it detects. ArchiveBox can be installed via [Docker](https://docs.docker.com/get-docker/) (recommended) or [`pip`](https://www.python.org/downloads/). It works on macOS, Windows, and Linux/BSD (both armv7 and amd64). + ArchiveBox is a powerful self-hosted internet archiving solution written in Python 3. You feed it URLs of pages you want to archive, and it saves them to disk in a varitety of formats depending on the configuration and the content it detects. ArchiveBox can be installed via [Docker](https://docs.docker.com/get-docker/) (recommended), [`apt`](https://launchpad.net/~archivebox/+archive/ubuntu/archivebox/+packages), [`brew`](https://github.com/ArchiveBox/homebrew-archivebox), or [`pip`](https://www.python.org/downloads/). It works on macOS, Windows, and Linux/BSD (both armv7 and amd64). Once installed, URLs can be added via the command line `archivebox add` or the built-in Web UI `archivebox server`. It can ingest bookmarks from a service like Pocket/Pinboard, your entire browsing history, RSS feeds, or URLs one at a time. The main index is a self-contained `data/index.sqlite3` file, and each snapshot is stored as a folder `data/archive//`, with an easy-to-read `index.html` and `index.json` within. For each page, ArchiveBox auto-extracts many types of assets/media and saves them in standard formats, with out-of-the-box support for: 3 types of HTML snapshots (wget, Chrome headless, singlefile), a PDF snapshot, a screenshot, a WARC archive, git repositories, images, audio, video, subtitles, article text, and more. The snapshots are browseable and managable offline through the filesystem, the built-in webserver, or the Python API. 
- #### Quickstart + **First, get ArchiveBox using your system package manager, Docker, or pip:** ```bash - # 1. Create a folder somewhere to hold your ArchiveBox data - mkdir ~/archivebox && cd ~/archivebox - docker run -v $PWD:/data -it archivebox/archivebox init + # You can run it with Docker or Docker Compose (recommended) + docker pull archivebox/archivebox + # https://raw.githubusercontent.com/ArchiveBox/ArchiveBox/master/docker-compose.yml - # 2. Archive some URLs to get started - docker run -v $PWD:/data -t archivebox/archivebox add https://github.com/ArchiveBox/ArchiveBox - docker run -v $PWD:/data -t archivebox/archivebox add --depth=1 https://example.com + # or Ubuntu/Debian + sudo add-apt-repository -u ppa:archivebox/archivebox + apt install archivebox - # 3. Then view the snapshots of the URLs you added via the self-hosted web UI - docker run -v $PWD:/data -it archivebox/archivebox manage createsuperuser # create an admin acct - docker run -v $PWD:/data -p 8000:8000 archivebox/archivebox # start the web server - open http://127.0.0.1:8000/ # open the interactive admin panel - ls archive/*/index.html # or just browse snapshots on disk + # or macOS + brew install archivebox/archivebox/archivebox + + # or for the Python version only, without wget/git/chrome/etc. 
included + pip3 install archivebox + + # If you're using an apt/brew/pip install you can run archivebox commands normally + # archivebox [subcommand] [...args] + # If you're using Docker you'll have to run the commands like this + # docker run -v $PWD:/data -it archivebox/archivebox [subcommand] [...args] + # And the equivalent in Docker Compose: + # docker-compose run archivebox [subcommand] [...args] ``` + Check that everything installed correctly with `archivebox --version` + + **To start using archivebox, you have to create a data folder and `cd` into it:** + + ```bash + mkdir ~/archivebox && cd ~/archivebox # you can put the collection dir anywhere + archivebox init + ``` + + **Then Add some URLs to your archive collection:** + ```bash + archivebox add https://github.com/ArchiveBox/ArchiveBox + archivebox add --depth=1 https://example.com + ``` + + **View the snapshots of the URLs you added via the self-hosted web UI:** + ```bash + archivebox manage createsuperuser # create an admin acct + archivebox server 0.0.0.0:8000 # start the web server + open http://127.0.0.1:8000/ # open the interactive admin panel + ls ~/archivebox/archive/*/index.html # or browse the snapshots on disk + ``` + +

@@ -79,16 +110,9 @@ Description:
# Overview - ArchiveBox is a command line tool, self-hostable web-archiving server, and Python library all-in-one. It's available as a Python3 package or a Docker image, both methods provide the same CLI, Web UI, and on-disk data format. + ArchiveBox is a command line tool, self-hostable web-archiving server, and Python library all-in-one. It can be installed on Docker, macOS, Linux/BSD, and Windows. You can download and install it as a Debian/Ubuntu package, Homebrew package, Python3 package, or a Docker image. No matter which install method you choose, they all provide the same CLI, Web UI, and on-disk data format. - It works on Docker, macOS, and Linux/BSD. Windows is not officially supported, but users have reported getting it working using the WSL2 + Docker. - - To use ArchiveBox you start by creating a folder for your data to live in (it can be anywhere on your system), and running `archivebox init` inside of it. That will create a sqlite3 index and an `ArchiveBox.conf` file. After that, you can continue to add/remove/search/import/export/manage/config/etc using the CLI `archivebox help`, or you can run the Web UI (recommended): - ```bash - archivebox manage createsuperuser - archivebox server 0.0.0.0:8000 - open http://127.0.0.1:8000 - ``` + To use ArchiveBox you start by creating a folder for your data to live in (it can be anywhere on your system), and running `archivebox init` inside of it. That will create a sqlite3 index and an `ArchiveBox.conf` file. After that, you can continue to add/export/manage/etc using the CLI `archivebox help`, or you can run the Web UI (recommended). The CLI is considered "stable", the ArchiveBox Python API and REST APIs are in "beta", and the [desktop app](https://github.com/ArchiveBox/desktop) is in "alpha" stage. @@ -252,32 +276,19 @@ Description:
```bash # archivebox [args] + + # on Debian/Ubuntu + sudo add-apt-repository -u ppa:archivebox/archivebox + apt install archivebox + + # on macOS + brew install archivebox/archivebox/archivebox ``` - First install the system, pip, and npm dependencies: + Initialize your archive in a directory somewhere and add some links: ```bash - # Install main dependendencies using apt on Ubuntu/Debian, brew on mac, or pkg on BSD - apt install python3 python3-pip python3-dev git curl wget chromium-browser youtube-dl - - # Install Node runtime (used for headless browser scripts like Readability, Singlefile, Mercury, etc.) - curl -s https://deb.nodesource.com/gpgkey/nodesource.gpg.key | apt-key add - \ - && echo 'deb https://deb.nodesource.com/node_14.x $(lsb_release -cs) main' >> /etc/apt/sources.list \ - && apt-get update \ - && apt-get install --no-install-recommends nodejs - - # Make a directory to hold your collection - mkdir archivebox && cd archivebox # (can be anywhere, doesn't have to be called archivebox) - - # Install the archivebox python package in ./.venv - python3 -m venv .venv && source .venv/bin/activate - pip install --upgrade archivebox - - # Install node packages in ./node_modules (used for SingleFile, Readability, and Puppeteer) + mkdir ~/archivebox && cd ~/archivebox npm install --prefix . 'git+https://github.com/ArchiveBox/ArchiveBox.git' - ``` - - Initialize your archive and add some links: - ```bash archivebox init archivebox add 'https://example.com' # add URLs as args pipe them in via stdin archivebox add --depth=1 https://example.com/table-of-contents.html @@ -396,7 +407,7 @@ Description:
- [Supported Outputs](https://github.com/ArchiveBox/ArchiveBox/wiki#can-save-these-things-for-each-site) - [Scheduled Archiving](https://github.com/ArchiveBox/ArchiveBox/wiki/Scheduled-Archiving) - [Publishing Your Archive](https://github.com/ArchiveBox/ArchiveBox/wiki/Publishing-Your-Archive) - - [Chromium Install](https://github.com/ArchiveBox/ArchiveBox/wiki/Install-Chromium) + - [Chromium Install](https://github.com/ArchiveBox/ArchiveBox/wiki/Chromium-Install) - [Security Overview](https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview) - [Troubleshooting](https://github.com/ArchiveBox/ArchiveBox/wiki/Troubleshooting) - [Python API](https://docs.archivebox.io/en/latest/modules.html) diff --git a/package.json b/package.json index c7a61c1e..8d88a3fd 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "archivebox", - "version": "0.4.24", + "version": "0.5.0", "description": "ArchiveBox: The self-hosted internet archive", "author": "Nick Sweeting ", "license": "MIT", From 7f2c834ea361ba5999309381b8dcfbc95b087c2e Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Sat, 28 Nov 2020 01:05:35 -0500 Subject: [PATCH 026/235] fix check_data_folder mypy types --- archivebox/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/archivebox/config.py b/archivebox/config.py index dc50679d..91871a94 100644 --- a/archivebox/config.py +++ b/archivebox/config.py @@ -936,7 +936,7 @@ def check_dependencies(config: ConfigDict=CONFIG, show_help: bool=True) -> None: stderr(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#save_media') stderr() -def check_data_folder(out_dir: Optional[str]=None, config: ConfigDict=CONFIG) -> None: +def check_data_folder(out_dir: Union[str, Path, None]=None, config: ConfigDict=CONFIG) -> None: output_dir = out_dir or config['OUTPUT_DIR'] assert isinstance(output_dir, (str, Path)) From 411fdcac875460824186ae618d4093c229fb85bb Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Sat, 28 Nov 2020 
01:05:53 -0500 Subject: [PATCH 027/235] use database for num_outputs instead of legacy json --- archivebox/core/models.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/archivebox/core/models.py b/archivebox/core/models.py index 9d893490..5dd9cfc3 100644 --- a/archivebox/core/models.py +++ b/archivebox/core/models.py @@ -114,7 +114,7 @@ class Snapshot(models.Model): @cached_property def num_outputs(self): - return self.as_link().num_outputs + return self.archiveresult_set.filter(status='succeeded').count() @cached_property def url_hash(self): @@ -138,6 +138,7 @@ class Snapshot(models.Model): @cached_property def history(self): + # TODO: use ArchiveResult for this instead of json from ..index import load_link_details return load_link_details(self.as_link()).history From 910f3d65c7ff5bb417f8557e33977d1a27196730 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Sat, 28 Nov 2020 01:06:11 -0500 Subject: [PATCH 028/235] default function args can never be mutable --- archivebox/core/models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/archivebox/core/models.py b/archivebox/core/models.py index 5dd9cfc3..d938c53f 100644 --- a/archivebox/core/models.py +++ b/archivebox/core/models.py @@ -151,7 +151,7 @@ class Snapshot(models.Model): return self.history['title'][-1].output.strip() return None - def save_tags(self, tags=[]): + def save_tags(self, tags=()): tags_id = [] for tag in tags: tags_id.append(Tag.objects.get_or_create(name=tag)[0].id) From 9661c863b3303261c7b87b117a33f204bf467b84 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Sat, 28 Nov 2020 01:06:23 -0500 Subject: [PATCH 029/235] css style tweaks for icons --- archivebox/core/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/archivebox/core/utils.py b/archivebox/core/utils.py index 3df46a51..39dca220 100644 --- a/archivebox/core/utils.py +++ b/archivebox/core/utils.py @@ -48,7 +48,7 @@ def get_icons(snapshot: Snapshot) -> str: # The check 
for archive_org is different, so it has to be handled separately target_path = Path(path) / "archive.org.txt" exists = target_path.exists() - output += '{} '.format(canon["archive_org_path"], str(exists), + output += '{} '.format(canon["archive_org_path"], str(exists), "archive_org", icons.get("archive_org", "?")) - return format_html(f'{output}') + return format_html(f'{output}') From c9162a6d0947d2b6b88643508eed55cc402bd5ff Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Sat, 28 Nov 2020 01:07:02 -0500 Subject: [PATCH 030/235] remove finished/not finished spinners --- archivebox/index/__init__.py | 2 +- archivebox/index/html.py | 14 +-- archivebox/logging_util.py | 4 +- archivebox/main.py | 6 +- archivebox/themes/default/base.html | 7 -- .../themes/default/core/snapshot_list.html | 112 +++++++++--------- archivebox/themes/default/main_index.html | 12 +- archivebox/themes/default/static/admin.css | 1 + archivebox/themes/legacy/main_index.html | 8 +- archivebox/themes/legacy/main_index_row.html | 2 +- 10 files changed, 69 insertions(+), 99 deletions(-) diff --git a/archivebox/index/__init__.py b/archivebox/index/__init__.py index 890777c8..9e460dc7 100644 --- a/archivebox/index/__init__.py +++ b/archivebox/index/__init__.py @@ -221,7 +221,7 @@ def timed_index_update(out_path: Path): @enforce_types -def write_main_index(links: List[Link], out_dir: Path=OUTPUT_DIR, finished: bool=False) -> None: +def write_main_index(links: List[Link], out_dir: Path=OUTPUT_DIR) -> None: """Writes links to sqlite3 file for a given list of links""" log_indexing_process_started(len(links)) diff --git a/archivebox/index/html.py b/archivebox/index/html.py index 793a60af..8b37c142 100644 --- a/archivebox/index/html.py +++ b/archivebox/index/html.py @@ -49,27 +49,15 @@ def parse_html_main_index(out_dir: Path=OUTPUT_DIR) -> Iterator[str]: yield line.split('"')[1] return () -@enforce_types -def write_html_main_index(links: List[Link], out_dir: Path=OUTPUT_DIR, finished: bool=False) -> 
None: - """write the html link index to a given path""" - - copy_and_overwrite(str(Path(TEMPLATES_DIR) / FAVICON_FILENAME), str(out_dir / FAVICON_FILENAME)) - copy_and_overwrite(str(Path(TEMPLATES_DIR) / ROBOTS_TXT_FILENAME), str(out_dir / ROBOTS_TXT_FILENAME)) - copy_and_overwrite(str(Path(TEMPLATES_DIR) / STATIC_DIR_NAME), str(out_dir / STATIC_DIR_NAME)) - - rendered_html = main_index_template(links, finished=finished) - atomic_write(str(out_dir / HTML_INDEX_FILENAME), rendered_html) - @enforce_types -def main_index_template(links: List[Link], finished: bool=True, template: str=MAIN_INDEX_TEMPLATE) -> str: +def main_index_template(links: List[Link], template: str=MAIN_INDEX_TEMPLATE) -> str: """render the template for the entire main index""" return render_legacy_template(template, { 'version': VERSION, 'git_sha': GIT_SHA, 'num_links': str(len(links)), - 'status': 'finished' if finished else 'running', 'date_updated': datetime.now().strftime('%Y-%m-%d'), 'time_updated': datetime.now().strftime('%Y-%m-%d %H:%M'), 'rows': '\n'.join( diff --git a/archivebox/logging_util.py b/archivebox/logging_util.py index aa4659f0..8648e0a4 100644 --- a/archivebox/logging_util.py +++ b/archivebox/logging_util.py @@ -501,10 +501,10 @@ def printable_folders(folders: Dict[str, Optional["Link"]], elif html: from .index.html import main_index_template if with_headers: - output = main_index_template(links, True) + output = main_index_template(links) else: from .index.html import MINIMAL_INDEX_TEMPLATE - output = main_index_template(links, True, MINIMAL_INDEX_TEMPLATE) + output = main_index_template(links, template=MINIMAL_INDEX_TEMPLATE) return output elif csv: from .index.csv import links_to_csv diff --git a/archivebox/main.py b/archivebox/main.py index c3ffcc0b..94658a8f 100644 --- a/archivebox/main.py +++ b/archivebox/main.py @@ -376,7 +376,7 @@ def init(force: bool=False, out_dir: Path=OUTPUT_DIR) -> None: print(' archivebox list --status=invalid') - 
write_main_index(list(pending_links.values()), out_dir=out_dir, finished=True) + write_main_index(list(pending_links.values()), out_dir=out_dir) print('\n{green}------------------------------------------------------------------{reset}'.format(**ANSI)) if existing_index: @@ -565,7 +565,7 @@ def add(urls: Union[str, List[str]], imported_links = list({link.url: link for link in (new_links + new_links_depth)}.values()) new_links = dedupe_links(all_links, imported_links) - write_main_index(links=new_links, out_dir=out_dir, finished=not new_links) + write_main_index(links=new_links, out_dir=out_dir) all_links = load_main_index(out_dir=out_dir) if index_only: @@ -583,7 +583,7 @@ def add(urls: Union[str, List[str]], archive_links(imported_links, overwrite=True, **archive_kwargs) elif new_links: archive_links(new_links, overwrite=False, **archive_kwargs) - + return all_links @enforce_types diff --git a/archivebox/themes/default/base.html b/archivebox/themes/default/base.html index fe1fee08..84be962f 100644 --- a/archivebox/themes/default/base.html +++ b/archivebox/themes/default/base.html @@ -187,13 +187,6 @@ display: none; } - body[data-status~=finished] .files-spinner { - display: none; - } - - /*body[data-status~=running] .in-progress { - display: inline-block; - }*/ tr td a.favicon img { padding-left: 6px; padding-right: 12px; diff --git a/archivebox/themes/default/core/snapshot_list.html b/archivebox/themes/default/core/snapshot_list.html index a5beceb8..20d3cd66 100644 --- a/archivebox/themes/default/core/snapshot_list.html +++ b/archivebox/themes/default/core/snapshot_list.html @@ -2,63 +2,63 @@ {% load static %} {% block body %} -
-
- - - -
- - +
+ + + + + +
+ + + + + + + + + + {% for link in object_list %} - - - - + + + + - - - {% for link in object_list %} - - - - - - - {% endfor %} - -
BookmarkedSaved Link ({{num_links}})FilesOriginal URL
BookmarkedSaved Link ({{num_links}})FilesOriginal URL{{link.added}} + {% if link.is_archived %} + + {% else %} + + {% endif %} + + {{link.title|default:'Loading...'}} + {{link.tags_str}} + + + {{link.icons}} + + {{link.url}} +
{{link.added}} - {% if link.is_archived %} - - {% else %} - - {% endif %} - - {{link.title|default:'Loading...'}} - {{link.tags_str}} - - - 📄 - {{link.icons}} - - {{link.url}}
-
- - {% if page_obj.has_previous %} - « first - previous - {% endif %} - - - Page {{ page_obj.number }} of {{ page_obj.paginator.num_pages }}. - - - {% if page_obj.has_next %} - next - last » - {% endif %} + {% endfor %} + + +
+ + {% if page_obj.has_previous %} + « first + previous + {% endif %} + + + Page {{ page_obj.number }} of {{ page_obj.paginator.num_pages }}. -
+ + {% if page_obj.has_next %} + next + last » + {% endif %} +
+
- {% endblock %} +{% endblock %} diff --git a/archivebox/themes/default/main_index.html b/archivebox/themes/default/main_index.html index d5135688..11c6a9a8 100644 --- a/archivebox/themes/default/main_index.html +++ b/archivebox/themes/default/main_index.html @@ -161,12 +161,6 @@ .in-progress { display: none; } - body[data-status~=finished] .files-spinner { - display: none; - } - /*body[data-status~=running] .in-progress { - display: inline-block; - }*/ tr td a.favicon img { padding-left: 6px; padding-right: 12px; @@ -210,7 +204,7 @@ }); - +