diff --git a/archivebox.egg-info/SOURCES.txt b/archivebox.egg-info/SOURCES.txt index 8f0d5d48..1fc92194 100644 --- a/archivebox.egg-info/SOURCES.txt +++ b/archivebox.egg-info/SOURCES.txt @@ -58,6 +58,7 @@ archivebox/core/migrations/0003_auto_20200630_1034.py archivebox/core/migrations/0004_auto_20200713_1552.py archivebox/core/migrations/0005_auto_20200728_0326.py archivebox/core/migrations/0006_auto_20201012_1520.py +archivebox/core/migrations/0007_archiveresult.py archivebox/core/migrations/__init__.py archivebox/extractors/__init__.py archivebox/extractors/archive_org.py diff --git a/archivebox.egg-info/requires.txt b/archivebox.egg-info/requires.txt index 457f64e5..732866b9 100644 --- a/archivebox.egg-info/requires.txt +++ b/archivebox.egg-info/requires.txt @@ -1,7 +1,7 @@ requests==2.24.0 atomicwrites==1.4.0 mypy-extensions==0.4.3 -django==3.0.8 +django==3.1.3 django-extensions==3.0.3 dateparser ipython diff --git a/archivebox/core/migrations/0007_archiveresult.py b/archivebox/core/migrations/0007_archiveresult.py new file mode 100644 index 00000000..898e0f93 --- /dev/null +++ b/archivebox/core/migrations/0007_archiveresult.py @@ -0,0 +1,91 @@ +# Generated by Django 3.0.8 on 2020-11-04 12:25 + +import json +from pathlib import Path + +from django.db import migrations, models +import django.db.models.deletion + +from config import CONFIG +from index.json import to_json + + +def forwards_func(apps, schema_editor): + from core.models import EXTRACTORS + + Snapshot = apps.get_model("core", "Snapshot") + ArchiveResult = apps.get_model("core", "ArchiveResult") + + snapshots = Snapshot.objects.all() + for snapshot in snapshots: + out_dir = Path(CONFIG['ARCHIVE_DIR']) / snapshot.timestamp + + try: + with open(out_dir / "index.json", "r") as f: + fs_index = json.load(f) + except Exception as e: + continue + + history = fs_index["history"] + + for extractor in history: + for result in history[extractor]: + ArchiveResult.objects.create(extractor=extractor, 
snapshot=snapshot, cmd=result["cmd"], cmd_version=result["cmd_version"], + start_ts=result["start_ts"], end_ts=result["end_ts"], status=result["status"], pwd=result["pwd"], output=result["output"]) + + +def verify_json_index_integrity(snapshot): + results = snapshot.archiveresult_set.all() + out_dir = Path(CONFIG['ARCHIVE_DIR']) / snapshot.timestamp + with open(out_dir / "index.json", "r") as f: + index = json.load(f) + + history = index["history"] + index_results = [result for extractor in history for result in history[extractor]] + flattened_results = [result["start_ts"] for result in index_results] + + missing_results = [result for result in results if result.start_ts.isoformat() not in flattened_results] + + for missing in missing_results: + index["history"][missing.extractor].append({"cmd": missing.cmd, "cmd_version": missing.cmd_version, "end_ts": missing.end_ts.isoformat(), + "start_ts": missing.start_ts.isoformat(), "pwd": missing.pwd, "output": missing.output, + "schema": "ArchiveResult", "status": missing.status}) + + json_index = to_json(index) + with open(out_dir / "index.json", "w") as f: + f.write(json_index) + + +def reverse_func(apps, schema_editor): + Snapshot = apps.get_model("core", "Snapshot") + ArchiveResult = apps.get_model("core", "ArchiveResult") + for snapshot in Snapshot.objects.all(): + verify_json_index_integrity(snapshot) + + ArchiveResult.objects.all().delete() + + +class Migration(migrations.Migration): + + dependencies = [ + ('core', '0006_auto_20201012_1520'), + ] + + operations = [ + migrations.CreateModel( + name='ArchiveResult', + fields=[ + ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('cmd', models.JSONField()), + ('pwd', models.CharField(max_length=256)), + ('cmd_version', models.CharField(max_length=32)), + ('status', models.CharField(choices=[('succeeded', 'succeeded'), ('failed', 'failed'), ('skipped', 'skipped')], max_length=16)), + ('output', 
models.CharField(max_length=512)), + ('start_ts', models.DateTimeField()), + ('end_ts', models.DateTimeField()), + ('extractor', models.CharField(choices=[('title', 'title'), ('favicon', 'favicon'), ('wget', 'wget'), ('singlefile', 'singlefile'), ('pdf', 'pdf'), ('screenshot', 'screenshot'), ('dom', 'dom'), ('readability', 'readability'), ('mercury', 'mercury'), ('git', 'git'), ('media', 'media'), ('headers', 'headers'), ('archive_org', 'archive_org')], max_length=32)), + ('snapshot', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='core.Snapshot')), + ], + ), + migrations.RunPython(forwards_func, reverse_func), + ] diff --git a/archivebox/core/models.py b/archivebox/core/models.py index f43fc631..9d893490 100644 --- a/archivebox/core/models.py +++ b/archivebox/core/models.py @@ -8,6 +8,14 @@ from django.utils.text import slugify from ..util import parse_date from ..index.schema import Link +from ..extractors import get_default_archive_methods + +EXTRACTORS = [(extractor[0], extractor[0]) for extractor in get_default_archive_methods()] +STATUS_CHOICES = [ + ("succeeded", "succeeded"), + ("failed", "failed"), + ("skipped", "skipped") +] class Tag(models.Model): @@ -148,3 +156,18 @@ class Snapshot(models.Model): tags_id.append(Tag.objects.get_or_create(name=tag)[0].id) self.tags.clear() self.tags.add(*tags_id) + + +class ArchiveResult(models.Model): + snapshot = models.ForeignKey(Snapshot, on_delete=models.CASCADE) + cmd = models.JSONField() + pwd = models.CharField(max_length=256) + cmd_version = models.CharField(max_length=32) + output = models.CharField(max_length=512) + start_ts = models.DateTimeField() + end_ts = models.DateTimeField() + status = models.CharField(max_length=16, choices=STATUS_CHOICES) + extractor = models.CharField(choices=EXTRACTORS, max_length=32) + + def __str__(self): + return self.extractor diff --git a/archivebox/core/utils.py b/archivebox/core/utils.py index 75c9c4e7..3df46a51 100644 --- a/archivebox/core/utils.py +++ 
b/archivebox/core/utils.py @@ -1,39 +1,54 @@ -from pathlib import Path - from django.utils.html import format_html +from collections import defaultdict -from core.models import Snapshot +from core.models import Snapshot, EXTRACTORS +from pathlib import Path def get_icons(snapshot: Snapshot) -> str: + archive_results = snapshot.archiveresult_set.filter(status="succeeded") link = snapshot.as_link() + path = link.archive_path canon = link.canonical_outputs() - out_dir = Path(link.link_dir) + output = "" + output_template = '{} ' + icons = { + "singlefile": "❶", + "wget": "🆆", + "dom": "🅷", + "pdf": "📄", + "screenshot": "💻", + "media": "📼", + "git": "🅶", + "archive_org": "🏛", + "readability": "🆁", + "mercury": "🅼", + "warc": "📦" + } + exclude = ["favicon", "title", "headers", "archive_org"] + # Missing specific entry for WARC - # slow version: highlights icons based on whether files exist or not for that output - # link_tuple = lambda link, method: (link.archive_path, canon[method] or '', canon[method] and (out_dir / (canon[method] or 'notdone')).exists()) - # fast version: all icons are highlighted without checking for outputs in filesystem - link_tuple = lambda link, method: (link.archive_path, canon[method] or '', canon[method] and (out_dir / (canon[method] or 'notdone')).exists()) + extractor_items = defaultdict(lambda: None) + for extractor, _ in EXTRACTORS: + for result in archive_results: + if result.extractor == extractor: + extractor_items[extractor] = result - return format_html( - '' - '' - '🆆 ' - '🅷 ' - '📄 ' - '💻 ' - '📦 ' - '📼 ' - '🅶 ' - '🏛 ' - '', - *link_tuple(link, 'singlefile_path'), - *link_tuple(link, 'wget_path')[:2], any((out_dir / link.domain).glob('*')), - *link_tuple(link, 'dom_path'), - *link_tuple(link, 'pdf_path'), - *link_tuple(link, 'screenshot_path'), - *link_tuple(link, 'warc_path')[:2], any((out_dir / canon['warc_path']).glob('*.warc.gz')), - *link_tuple(link, 'media_path')[:2], any((out_dir / canon['media_path']).glob('*')), - 
*link_tuple(link, 'git_path')[:2], any((out_dir / canon['git_path']).glob('*')), - canon['archive_org_path'], (out_dir / 'archive.org.txt').exists(), - ) + for extractor, _ in EXTRACTORS: + if extractor not in exclude: + exists = extractor_items[extractor] is not None + output += output_template.format(path, canon[f"{extractor}_path"], str(exists), + extractor, icons.get(extractor, "?")) + if extractor == "wget": + # warc isn't technically its own extractor, so we have to add it after wget + exists = list((Path(path) / canon["warc_path"]).glob("*.warc.gz")) + output += output_template.format(exists[0] if exists else '#', canon["warc_path"], str(bool(exists)), "warc", icons.get("warc", "?")) + + if extractor == "archive_org": + # The check for archive_org is different, so it has to be handled separately + target_path = Path(path) / "archive.org.txt" + exists = target_path.exists() + output += '{} '.format(canon["archive_org_path"], str(exists), + "archive_org", icons.get("archive_org", "?")) + + return format_html(f'{output}') diff --git a/archivebox/extractors/__init__.py b/archivebox/extractors/__init__.py index 60f20adf..ef5ef446 100644 --- a/archivebox/extractors/__init__.py +++ b/archivebox/extractors/__init__.py @@ -8,6 +8,7 @@ from datetime import datetime from django.db.models import QuerySet from ..index.schema import Link +from ..index.sql import write_link_to_sql_index from ..index import ( load_link_details, write_link_details, @@ -65,6 +66,14 @@ def ignore_methods(to_ignore: List[str]): def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[str]]=None, out_dir: Optional[Path]=None, skip_index: bool=False) -> Link: """download the DOM, PDF, and a screenshot into a folder named after the link's timestamp""" + # TODO: Remove when the input is changed to be a snapshot. Suboptimal approach.
+ if not skip_index: + from core.models import Snapshot, ArchiveResult + try: + snapshot = Snapshot.objects.get(url=link.url) # TODO: This will be unnecessary once everything is a snapshot + except Snapshot.DoesNotExist: + snapshot = write_link_to_sql_index(link) + ARCHIVE_METHODS = get_default_archive_methods() if methods: @@ -99,6 +108,10 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s stats[result.status] += 1 log_archive_method_finished(result) + if not skip_index: + ArchiveResult.objects.create(snapshot=snapshot, extractor=method_name, cmd=result.cmd, cmd_version=result.cmd_version, + output=result.output, pwd=result.pwd, start_ts=result.start_ts, end_ts=result.end_ts, status=result.status) + else: # print('{black} X {}{reset}'.format(method_name, **ANSI)) stats['skipped'] += 1 diff --git a/archivebox/themes/default/base.html b/archivebox/themes/default/base.html index 4a5a76c6..fe1fee08 100644 --- a/archivebox/themes/default/base.html +++ b/archivebox/themes/default/base.html @@ -224,12 +224,10 @@ color: black; } - tr td a.exists-True { - opacity: 1; - } - tr td a.exists-False { - opacity: 0.1; - filter: grayscale(100%); + .exists-False { + opacity: 0.1; + filter: grayscale(100%); + pointer-events: none; } diff --git a/setup.py b/setup.py index 6b40b803..f65ead27 100755 --- a/setup.py +++ b/setup.py @@ -51,9 +51,8 @@ setuptools.setup( "requests==2.24.0", "atomicwrites==1.4.0", "mypy-extensions==0.4.3", - "django==3.0.8", + "django==3.1.3", "django-extensions==3.0.3", - "dateparser", "ipython", "youtube-dl", diff --git a/tests/test_update.py b/tests/test_update.py index 238a92d9..95a61ce9 100644 --- a/tests/test_update.py +++ b/tests/test_update.py @@ -6,7 +6,7 @@ def test_update_status_invalid(tmp_path, process, disable_extractors_dict): subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], capture_output=True, env=disable_extractors_dict) assert list((tmp_path / "archive").iterdir()) != [] 
- subprocess.run(['archivebox', 'remove', 'http://127.0.0.1:8080/static/example.com.html', '--yes'], capture_output=True) + a_process = subprocess.run(['archivebox', 'remove', 'http://127.0.0.1:8080/static/example.com.html', '--yes'], capture_output=True) conn = sqlite3.connect(str(tmp_path / "index.sqlite3")) c = conn.cursor()