feat: initial functional version with icons calculated based on archive results

2024-06-28 11:00:35 +12:00 · 2020-11-04 10:31:20 -05:00 · 2020-11-04 10:31:20 -05:00 · b3e0400bc0
parent 309a87e8fe
commit b3e0400bc0
3 changed files with 104 additions and 29 deletions
--- a/archivebox/core/migrations/0007_archiveresult.py
+++ b/archivebox/core/migrations/0007_archiveresult.py
@ -1,8 +1,43 @@
 # Generated by Django 3.0.8 on 2020-11-04 12:25

+import json
+from pathlib import Path
+
 from django.db import migrations, models
 import django.db.models.deletion

+from config import CONFIG
+
+
+def forwards_func(apps, schema_editor):
+    """Create ArchiveResult rows from the `history` of each snapshot's on-disk index.json."""
+    Snapshot = apps.get_model("core", "Snapshot")
+    ArchiveResult = apps.get_model("core", "ArchiveResult")
+
+    for snapshot in Snapshot.objects.all():
+        index_path = Path(CONFIG['ARCHIVE_DIR']) / snapshot.timestamp / "index.json"
+        try:
+            with open(index_path, "r") as f:
+                fs_index = json.load(f)
+        except (OSError, ValueError):
+            # Best effort: skip snapshots whose index is missing, unreadable or not valid JSON.
+            continue
+
+        # An index without a "history" entry simply contributes no rows.
+        for extractor, results in (fs_index.get("history") or {}).items():
+            for result in results:
+                ArchiveResult.objects.create(
+                    extractor=extractor, snapshot=snapshot, cmd=json.dumps(result["cmd"]),
+                    cmd_version=result["cmd_version"], start_ts=result["start_ts"], end_ts=result["end_ts"],
+                    status=result["status"], pwd=result["pwd"], output=result["output"],
+                )
+
+
+
+def reverse_func(apps, schema_editor):
+    """Undo forwards_func by deleting every ArchiveResult row."""
+    apps.get_model("core", "ArchiveResult").objects.all().delete()
+

 class Migration(migrations.Migration):

@ -18,6 +53,7 @@ class Migration(migrations.Migration):
                ('cmd', models.CharField(default='', max_length=500)),
                ('pwd', models.CharField(default='', max_length=200)),
                ('cmd_version', models.CharField(default='', max_length=20)),
+                ('status', models.CharField(max_length=10)),
                ('output', models.CharField(default='', max_length=500)),
                ('start_ts', models.DateTimeField()),
                ('end_ts', models.DateTimeField()),
@ -25,4 +61,5 @@ class Migration(migrations.Migration):
                ('snapshot', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='core.Snapshot')),
            ],
        ),
+        migrations.RunPython(forwards_func, reverse_func),
    ]
--- a/archivebox/core/models.py
+++ b/archivebox/core/models.py
@ -161,4 +161,8 @@ class ArchiveResult(models.Model):
    output = models.CharField(max_length=500, default="")
    start_ts = models.DateTimeField()
    end_ts = models.DateTimeField()
-    extractor = models.CharField(choices=EXTRACTORS, blank=False, max_length=20)
+    status = models.CharField(max_length=10)
+    extractor = models.CharField(choices=EXTRACTORS, blank=False, max_length=20)
+
+    def __str__(self):
+        return self.extractor  # a result's string form is just its extractor name
--- a/archivebox/core/utils.py
+++ b/archivebox/core/utils.py
@ -2,38 +2,72 @@ from pathlib import Path

 from django.utils.html import format_html

-from core.models import Snapshot
+from core.models import Snapshot, ArchiveResult, EXTRACTORS


 def get_icons(snapshot: Snapshot) -> str:
+    """Return safe HTML icon links for the snapshot; an icon is marked `exists-True` when its extractor has an ArchiveResult."""
+    from django.utils.html import format_html_join  # local import: only format_html is imported at file level
     link = snapshot.as_link()
     canon = link.canonical_outputs()
-    out_dir = Path(link.link_dir)
+    done = set(snapshot.archiveresult_set.values_list("extractor", flat=True))  # one query, not one per extractor
+    icons = {
+        "singlefile": "❶",
+        "wget": "🆆",
+        "dom": "🅷",
+        "pdf": "📄",
+        "screenshot": "💻",
+        "media": "📼",
+        "git": "🅶",
+        "archive_org": "🏛",
+        "readability": "🆁",
+        "mercury": "🅼",
+    }
+    exclude = ["favicon"]
+    # Missing specific entry for WARC

-    # slow version: highlights icons based on whether files exist or not for that output
-    # link_tuple = lambda link, method: (link.archive_path, canon[method] or '', canon[method] and (out_dir / (canon[method] or 'notdone')).exists())
-    # fast version: all icons are highlighted without checking for outputs in filesystem
-    link_tuple = lambda link, method: (link.archive_path, canon[method] or '', canon[method] and (out_dir / (canon[method] or 'notdone')).exists())

-    return format_html(
-            '<span class="files-icons" style="font-size: 1.2em; opacity: 0.8">'
-                '<a href="/{}/{}" class="exists-{}" title="SingleFile">❶ </a>'
-                '<a href="/{}/{}" class="exists-{}" title="Wget clone">🆆 </a> '
-                '<a href="/{}/{}" class="exists-{}" title="HTML dump">🅷 </a> '
-                '<a href="/{}/{}" class="exists-{}" title="PDF">📄 </a> '
-                '<a href="/{}/{}" class="exists-{}" title="Screenshot">💻 </a> '
-                '<a href="/{}/{}" class="exists-{}" title="WARC">📦 </a> '
-                '<a href="/{}/{}/" class="exists-{}" title="Media files">📼 </a> '
-                '<a href="/{}/{}/" class="exists-{}" title="Git repos">🅶 </a> '
-                '<a href="{}" class="exists-{}" title="Archive.org snapshot">🏛 </a> '
-            '</span>',
-            *link_tuple(link, 'singlefile_path'),
-            *link_tuple(link, 'wget_path')[:2], any((out_dir / link.domain).glob('*')),
-            *link_tuple(link, 'pdf_path'),
-            *link_tuple(link, 'screenshot_path'),
-            *link_tuple(link, 'dom_path'),
-            *link_tuple(link, 'warc_path')[:2], any((out_dir / canon['warc_path']).glob('*.warc.gz')),
-            *link_tuple(link, 'media_path')[:2], any((out_dir / canon['media_path']).glob('*')),
-            *link_tuple(link, 'git_path')[:2], any((out_dir / canon['git_path']).glob('*')),
-            canon['archive_org_path'], (out_dir / 'archive.org.txt').exists(),
-        )
+    anchors = []
+    for name in (choice[0] for choice in EXTRACTORS):
+        key = f"{name}_path"
+        if name in exclude or key not in canon:
+            continue  # excluded, or no canonical output to link to
+        # archive.org's output is linked as-is (as the replaced code did); the rest live under the archive path.
+        href = (canon[key] or "") if name == "archive_org" else f"/{link.archive_path}/{canon[key] or ''}"
+        anchors.append((href, name in done, name, icons.get(name, "?")))
+    links = format_html_join("", '<a href="{}" class="exists-{}" title="{}">{} </a>', anchors)
+    return format_html('<span class="files-icons" style="font-size: 1.2em; opacity: 0.8">{}</span>', links)
+
+#def get_icons(snapshot: Snapshot) -> str:
+#    link = snapshot.as_link()
+#    canon = link.canonical_outputs()
+#    out_dir = Path(link.link_dir)
+#
+#    # slow version: highlights icons based on whether files exist or not for that output
+#    # link_tuple = lambda link, method: (link.archive_path, canon[method] or '', canon[method] and (out_dir / (canon[method] or 'notdone')).exists())
+#    # fast version: all icons are highlighted without checking for outputs in filesystem
+#    link_tuple = lambda link, method: (link.archive_path, canon[method] or '', canon[method] and (out_dir / (canon[method] or 'notdone')).exists())
+#
+#    return format_html(
+#            '<span class="files-icons" style="font-size: 1.2em; opacity: 0.8">'
+#                '<a href="/{}/{}" class="exists-{}" title="SingleFile">❶ </a>'
+#                '<a href="/{}/{}" class="exists-{}" title="Wget clone">🆆 </a> '
+#                '<a href="/{}/{}" class="exists-{}" title="HTML dump">🅷 </a> '
+#                '<a href="/{}/{}" class="exists-{}" title="PDF">📄 </a> '
+#                '<a href="/{}/{}" class="exists-{}" title="Screenshot">💻 </a> '
+#                '<a href="/{}/{}" class="exists-{}" title="WARC">📦 </a> '
+#                '<a href="/{}/{}/" class="exists-{}" title="Media files">📼 </a> '
+#                '<a href="/{}/{}/" class="exists-{}" title="Git repos">🅶 </a> '
+#                '<a href="{}" class="exists-{}" title="Archive.org snapshot">🏛 </a> '
+#            '</span>',
+#            *link_tuple(link, 'singlefile_path'),
+#            *link_tuple(link, 'wget_path')[:2], any((out_dir / link.domain).glob('*')),
+#            *link_tuple(link, 'pdf_path'),
+#            *link_tuple(link, 'screenshot_path'),
+#            *link_tuple(link, 'dom_path'),
+#            *link_tuple(link, 'warc_path')[:2], any((out_dir / canon['warc_path']).glob('*.warc.gz')),
+#            *link_tuple(link, 'media_path')[:2], any((out_dir / canon['media_path']).glob('*')),
+#            *link_tuple(link, 'git_path')[:2], any((out_dir / canon['git_path']).glob('*')),
+#            canon['archive_org_path'], (out_dir / 'archive.org.txt').exists(),
+#        )
+#