From 8f3c03a0f9f79a88842afcb73d41adb6004cfb2d Mon Sep 17 00:00:00 2001
From: Cristian <cristian@swapps.com>
Date: Tue, 3 Nov 2020 09:54:02 -0500
Subject: [PATCH 01/20] feat: Initial (and naive) ArchiveResult model

---
 .../core/migrations/0007_archiveresult.py     | 27 +++++++++++++++++++
 archivebox/core/models.py                     | 10 +++++++
 2 files changed, 37 insertions(+)
 create mode 100644 archivebox/core/migrations/0007_archiveresult.py

diff --git a/archivebox/core/migrations/0007_archiveresult.py b/archivebox/core/migrations/0007_archiveresult.py
new file mode 100644
index 00000000..56f4143e
--- /dev/null
+++ b/archivebox/core/migrations/0007_archiveresult.py
@@ -0,0 +1,27 @@
+# Generated by Django 3.0.8 on 2020-11-03 14:52
+
+from django.db import migrations, models
+import django.db.models.deletion
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('core', '0006_auto_20201012_1520'),
+    ]
+
+    operations = [
+        migrations.CreateModel(
+            name='ArchiveResult',
+            fields=[
+                ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
+                ('cmd', models.CharField(default='', max_length=500)),
+                ('pwd', models.CharField(default='', max_length=200)),
+                ('cmd_version', models.CharField(default='', max_length=20)),
+                ('output', models.CharField(default='', max_length=500)),
+                ('start_ts', models.DateTimeField()),
+                ('end_ts', models.DateTimeField()),
+                ('snapshot', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='core.Snapshot')),
+            ],
+        ),
+    ]
diff --git a/archivebox/core/models.py b/archivebox/core/models.py
index f43fc631..53c43e29 100644
--- a/archivebox/core/models.py
+++ b/archivebox/core/models.py
@@ -148,3 +148,13 @@ class Snapshot(models.Model):
             tags_id.append(Tag.objects.get_or_create(name=tag)[0].id)
         self.tags.clear()
         self.tags.add(*tags_id)
+
+
+class ArchiveResult(models.Model):
+    snapshot = models.ForeignKey(Snapshot, on_delete=models.CASCADE)
+    cmd = models.CharField(max_length=500, default="")
+    pwd = models.CharField(max_length=200, default="")
+    cmd_version = models.CharField(max_length=20, default="")
+    output = models.CharField(max_length=500, default="")
+    start_ts = models.DateTimeField()
+    end_ts = models.DateTimeField()
\ No newline at end of file

From 309a87e8fecdcd291d64d66add47c46d766dd9e0 Mon Sep 17 00:00:00 2001
From: Cristian <cristian@swapps.com>
Date: Wed, 4 Nov 2020 07:28:02 -0500
Subject: [PATCH 02/20] feat: Add extractor field to the database

---
 archivebox/core/migrations/0007_archiveresult.py | 3 ++-
 archivebox/core/models.py                        | 6 +++++-
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/archivebox/core/migrations/0007_archiveresult.py b/archivebox/core/migrations/0007_archiveresult.py
index 56f4143e..1d0da342 100644
--- a/archivebox/core/migrations/0007_archiveresult.py
+++ b/archivebox/core/migrations/0007_archiveresult.py
@@ -1,4 +1,4 @@
-# Generated by Django 3.0.8 on 2020-11-03 14:52
+# Generated by Django 3.0.8 on 2020-11-04 12:25
 
 from django.db import migrations, models
 import django.db.models.deletion
@@ -21,6 +21,7 @@ class Migration(migrations.Migration):
                 ('output', models.CharField(default='', max_length=500)),
                 ('start_ts', models.DateTimeField()),
                 ('end_ts', models.DateTimeField()),
+                ('extractor', models.CharField(choices=[('title', 'title'), ('favicon', 'favicon'), ('wget', 'wget'), ('singlefile', 'singlefile'), ('pdf', 'pdf'), ('screenshot', 'screenshot'), ('dom', 'dom'), ('readability', 'readability'), ('mercury', 'mercury'), ('git', 'git'), ('media', 'media'), ('headers', 'headers'), ('archive_org', 'archive_org')], max_length=20)),
                 ('snapshot', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='core.Snapshot')),
             ],
         ),
diff --git a/archivebox/core/models.py b/archivebox/core/models.py
index 53c43e29..944d8612 100644
--- a/archivebox/core/models.py
+++ b/archivebox/core/models.py
@@ -8,6 +8,9 @@ from django.utils.text import slugify
 
 from ..util import parse_date
 from ..index.schema import Link
+from ..extractors import get_default_archive_methods
+
+EXTRACTORS = [(extractor[0], extractor[0]) for extractor in get_default_archive_methods()]
 
 
 class Tag(models.Model):
@@ -157,4 +160,5 @@ class ArchiveResult(models.Model):
     cmd_version = models.CharField(max_length=20, default="")
     output = models.CharField(max_length=500, default="")
     start_ts = models.DateTimeField()
-    end_ts = models.DateTimeField()
\ No newline at end of file
+    end_ts = models.DateTimeField()
+    extractor = models.CharField(choices=EXTRACTORS, blank=False, max_length=20)
\ No newline at end of file

From b3e0400bc0b0b24891a63ded515526b0dba38420 Mon Sep 17 00:00:00 2001
From: Cristian <cristian@swapps.com>
Date: Wed, 4 Nov 2020 10:31:20 -0500
Subject: [PATCH 03/20] feat: initial functional version with icons calculated
 based on archive results

---
 .../core/migrations/0007_archiveresult.py     | 37 ++++++++
 archivebox/core/models.py                     |  6 +-
 archivebox/core/utils.py                      | 90 +++++++++++++------
 3 files changed, 104 insertions(+), 29 deletions(-)

diff --git a/archivebox/core/migrations/0007_archiveresult.py b/archivebox/core/migrations/0007_archiveresult.py
index 1d0da342..c0e1393b 100644
--- a/archivebox/core/migrations/0007_archiveresult.py
+++ b/archivebox/core/migrations/0007_archiveresult.py
@@ -1,8 +1,43 @@
 # Generated by Django 3.0.8 on 2020-11-04 12:25
 
+import json
+from pathlib import Path
+
 from django.db import migrations, models
 import django.db.models.deletion
 
+from config import CONFIG
+
+
+def forwards_func(apps, schema_editor):
+    from core.models import EXTRACTORS
+
+    Snapshot = apps.get_model("core", "Snapshot")
+    ArchiveResult = apps.get_model("core", "ArchiveResult")
+
+    snapshots = Snapshot.objects.all()
+    for snapshot in snapshots:
+        out_dir = Path(CONFIG['ARCHIVE_DIR']) / snapshot.timestamp
+
+        try:
+            with open(out_dir / "index.json", "r") as f:
+                fs_index = json.load(f)
+        except Exception as e:
+            continue
+
+        history = fs_index["history"]
+
+        for extractor in history:
+            for result in history[extractor]:
+                ArchiveResult.objects.create(extractor=extractor, snapshot=snapshot, cmd=json.dumps(result["cmd"]), cmd_version=result["cmd_version"], 
+                start_ts=result["start_ts"], end_ts=result["end_ts"], status=result["status"], pwd=result["pwd"], output=result["output"])
+
+
+
+def reverse_func(apps, schema_editor):
+    ArchiveResult = apps.get_model("core", "ArchiveResult")
+    ArchiveResult.objects.all().delete()
+
 
 class Migration(migrations.Migration):
 
@@ -18,6 +53,7 @@ class Migration(migrations.Migration):
                 ('cmd', models.CharField(default='', max_length=500)),
                 ('pwd', models.CharField(default='', max_length=200)),
                 ('cmd_version', models.CharField(default='', max_length=20)),
+                ('status', models.CharField(max_length=10)),
                 ('output', models.CharField(default='', max_length=500)),
                 ('start_ts', models.DateTimeField()),
                 ('end_ts', models.DateTimeField()),
@@ -25,4 +61,5 @@ class Migration(migrations.Migration):
                 ('snapshot', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='core.Snapshot')),
             ],
         ),
+        migrations.RunPython(forwards_func, reverse_func),
     ]
diff --git a/archivebox/core/models.py b/archivebox/core/models.py
index 944d8612..41976348 100644
--- a/archivebox/core/models.py
+++ b/archivebox/core/models.py
@@ -161,4 +161,8 @@ class ArchiveResult(models.Model):
     output = models.CharField(max_length=500, default="")
     start_ts = models.DateTimeField()
     end_ts = models.DateTimeField()
-    extractor = models.CharField(choices=EXTRACTORS, blank=False, max_length=20)
\ No newline at end of file
+    status = models.CharField(max_length=10)
+    extractor = models.CharField(choices=EXTRACTORS, blank=False, max_length=20)
+
+    def __str__(self):
+        return self.extractor
diff --git a/archivebox/core/utils.py b/archivebox/core/utils.py
index 0bb8fceb..56c74b5c 100644
--- a/archivebox/core/utils.py
+++ b/archivebox/core/utils.py
@@ -2,38 +2,72 @@ from pathlib import Path
 
 from django.utils.html import format_html
 
-from core.models import Snapshot
+from core.models import Snapshot, ArchiveResult, EXTRACTORS
 
 
 def get_icons(snapshot: Snapshot) -> str:
+    archive_results = snapshot.archiveresult_set
     link = snapshot.as_link()
     canon = link.canonical_outputs()
-    out_dir = Path(link.link_dir)
+    output = ""
+    output_template = '<a href="/{}/{}" class="exists-{}" title="{}">{} </a>'
+    icons = {
+        "singlefile": "❶",
+        "wget": "🆆",
+        "dom": "🅷",
+        "pdf": "📄",
+        "screenshot": "💻",
+        "media": "📼",
+        "git": "🅶",
+        "archive_org": "🏛",
+        "readability": "🆁",
+        "mercury": "🅼",
+    }
+    exclude = ["favicon"]
+    # Missing specific entry for WARC
 
-    # slow version: highlights icons based on whether files exist or not for that output
-    # link_tuple = lambda link, method: (link.archive_path, canon[method] or '', canon[method] and (out_dir / (canon[method] or 'notdone')).exists())
-    # fast version: all icons are highlighted without checking for outputs in filesystem
-    link_tuple = lambda link, method: (link.archive_path, canon[method] or '', canon[method] and (out_dir / (canon[method] or 'notdone')).exists())
 
-    return format_html(
-            '<span class="files-icons" style="font-size: 1.2em; opacity: 0.8">'
-                '<a href="/{}/{}" class="exists-{}" title="SingleFile">❶ </a>'
-                '<a href="/{}/{}" class="exists-{}" title="Wget clone">🆆 </a> '
-                '<a href="/{}/{}" class="exists-{}" title="HTML dump">🅷 </a> '
-                '<a href="/{}/{}" class="exists-{}" title="PDF">📄 </a> '
-                '<a href="/{}/{}" class="exists-{}" title="Screenshot">💻 </a> '
-                '<a href="/{}/{}" class="exists-{}" title="WARC">📦 </a> '
-                '<a href="/{}/{}/" class="exists-{}" title="Media files">📼 </a> '
-                '<a href="/{}/{}/" class="exists-{}" title="Git repos">🅶 </a> '
-                '<a href="{}" class="exists-{}" title="Archive.org snapshot">🏛 </a> '
-            '</span>',
-            *link_tuple(link, 'singlefile_path'),
-            *link_tuple(link, 'wget_path')[:2], any((out_dir / link.domain).glob('*')),
-            *link_tuple(link, 'pdf_path'),
-            *link_tuple(link, 'screenshot_path'),
-            *link_tuple(link, 'dom_path'),
-            *link_tuple(link, 'warc_path')[:2], any((out_dir / canon['warc_path']).glob('*.warc.gz')),
-            *link_tuple(link, 'media_path')[:2], any((out_dir / canon['media_path']).glob('*')),
-            *link_tuple(link, 'git_path')[:2], any((out_dir / canon['git_path']).glob('*')),
-            canon['archive_org_path'], (out_dir / 'archive.org.txt').exists(),
-        )
+    for extractor in EXTRACTORS:
+        result = archive_results.filter(extractor=extractor[0])
+        try:
+            if extractor[0] not in exclude:
+                output += output_template.format(link.archive_path, canon[f"{extractor[0]}_path"],
+                                                 result.exists(), extractor[0], icons.get(extractor[0], "?"))
+        except Exception as e:
+            print(e)
+
+    return format_html(f'<span class="files-icons" style="font-size: 1.2em; opacity: 0.8">{output}<span>')
+
+#def get_icons(snapshot: Snapshot) -> str:
+#    link = snapshot.as_link()
+#    canon = link.canonical_outputs()
+#    out_dir = Path(link.link_dir)
+#
+#    # slow version: highlights icons based on whether files exist or not for that output
+#    # link_tuple = lambda link, method: (link.archive_path, canon[method] or '', canon[method] and (out_dir / (canon[method] or 'notdone')).exists())
+#    # fast version: all icons are highlighted without checking for outputs in filesystem
+#    link_tuple = lambda link, method: (link.archive_path, canon[method] or '', canon[method] and (out_dir / (canon[method] or 'notdone')).exists())
+#
+#    return format_html(
+#            '<span class="files-icons" style="font-size: 1.2em; opacity: 0.8">'
+#                '<a href="/{}/{}" class="exists-{}" title="SingleFile">❶ </a>'
+#                '<a href="/{}/{}" class="exists-{}" title="Wget clone">🆆 </a> '
+#                '<a href="/{}/{}" class="exists-{}" title="HTML dump">🅷 </a> '
+#                '<a href="/{}/{}" class="exists-{}" title="PDF">📄 </a> '
+#                '<a href="/{}/{}" class="exists-{}" title="Screenshot">💻 </a> '
+#                '<a href="/{}/{}" class="exists-{}" title="WARC">📦 </a> '
+#                '<a href="/{}/{}/" class="exists-{}" title="Media files">📼 </a> '
+#                '<a href="/{}/{}/" class="exists-{}" title="Git repos">🅶 </a> '
+#                '<a href="{}" class="exists-{}" title="Archive.org snapshot">🏛 </a> '
+#            '</span>',
+#            *link_tuple(link, 'singlefile_path'),
+#            *link_tuple(link, 'wget_path')[:2], any((out_dir / link.domain).glob('*')),
+#            *link_tuple(link, 'pdf_path'),
+#            *link_tuple(link, 'screenshot_path'),
+#            *link_tuple(link, 'dom_path'),
+#            *link_tuple(link, 'warc_path')[:2], any((out_dir / canon['warc_path']).glob('*.warc.gz')),
+#            *link_tuple(link, 'media_path')[:2], any((out_dir / canon['media_path']).glob('*')),
+#            *link_tuple(link, 'git_path')[:2], any((out_dir / canon['git_path']).glob('*')),
+#            canon['archive_org_path'], (out_dir / 'archive.org.txt').exists(),
+#        )
+#
\ No newline at end of file

From 4484491fb77aeafe116aa5226d4c0cfd12e5de61 Mon Sep 17 00:00:00 2001
From: Cristian <cristian@swapps.com>
Date: Wed, 4 Nov 2020 11:22:55 -0500
Subject: [PATCH 04/20] feat: Create ArchiveResult after finishing an extractor
 process

---
 archivebox/core/utils.py          | 6 ++----
 archivebox/extractors/__init__.py | 7 +++++++
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/archivebox/core/utils.py b/archivebox/core/utils.py
index 56c74b5c..78d0cff5 100644
--- a/archivebox/core/utils.py
+++ b/archivebox/core/utils.py
@@ -1,8 +1,6 @@
-from pathlib import Path
-
 from django.utils.html import format_html
 
-from core.models import Snapshot, ArchiveResult, EXTRACTORS
+from core.models import Snapshot, EXTRACTORS
 
 
 def get_icons(snapshot: Snapshot) -> str:
@@ -70,4 +68,4 @@ def get_icons(snapshot: Snapshot) -> str:
 #            *link_tuple(link, 'git_path')[:2], any((out_dir / canon['git_path']).glob('*')),
 #            canon['archive_org_path'], (out_dir / 'archive.org.txt').exists(),
 #        )
-#
\ No newline at end of file
+#
diff --git a/archivebox/extractors/__init__.py b/archivebox/extractors/__init__.py
index 60f20adf..d5d8832f 100644
--- a/archivebox/extractors/__init__.py
+++ b/archivebox/extractors/__init__.py
@@ -65,6 +65,10 @@ def ignore_methods(to_ignore: List[str]):
 def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[str]]=None, out_dir: Optional[Path]=None, skip_index: bool=False) -> Link:
     """download the DOM, PDF, and a screenshot into a folder named after the link's timestamp"""
 
+    # TODO: Remove when the input is changed to be a snapshot. Suboptimal approach.
+    from core.models import Snapshot, ArchiveResult
+    snapshot = Snapshot.objects.get(url=link.url)
+
     ARCHIVE_METHODS = get_default_archive_methods()
     
     if methods:
@@ -99,6 +103,9 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s
 
                     stats[result.status] += 1
                     log_archive_method_finished(result)
+                    ArchiveResult.objects.create(snapshot=snapshot, extractor=method_name, cmd=result.cmd, cmd_version=result.cmd_version,
+                                                 output=result.output, pwd=result.pwd, start_ts=result.start_ts, end_ts=result.end_ts, status=result.status)
+
                 else:
                     # print('{black}      X {}{reset}'.format(method_name, **ANSI))
                     stats['skipped'] += 1

From f292cface27e6de0a552d2fc1e78fd99f6aa9219 Mon Sep 17 00:00:00 2001
From: Cristian <cristian@swapps.com>
Date: Wed, 4 Nov 2020 14:40:44 -0500
Subject: [PATCH 05/20] fix: Add condition for oneshot when archiving links

---
 archivebox/extractors/__init__.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/archivebox/extractors/__init__.py b/archivebox/extractors/__init__.py
index d5d8832f..23a4f5ef 100644
--- a/archivebox/extractors/__init__.py
+++ b/archivebox/extractors/__init__.py
@@ -66,8 +66,9 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s
     """download the DOM, PDF, and a screenshot into a folder named after the link's timestamp"""
 
     # TODO: Remove when the input is changed to be a snapshot. Suboptimal approach.
-    from core.models import Snapshot, ArchiveResult
-    snapshot = Snapshot.objects.get(url=link.url)
+    if not skip_index:
+        from core.models import Snapshot, ArchiveResult
+        snapshot = Snapshot.objects.get(url=link.url)
 
     ARCHIVE_METHODS = get_default_archive_methods()
     
@@ -103,7 +104,8 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s
 
                     stats[result.status] += 1
                     log_archive_method_finished(result)
-                    ArchiveResult.objects.create(snapshot=snapshot, extractor=method_name, cmd=result.cmd, cmd_version=result.cmd_version,
+                    if not skip_index:
+                        ArchiveResult.objects.create(snapshot=snapshot, extractor=method_name, cmd=result.cmd, cmd_version=result.cmd_version,
                                                  output=result.output, pwd=result.pwd, start_ts=result.start_ts, end_ts=result.end_ts, status=result.status)
 
                 else:

From d064a3eeffa0a6cb52462ce1f2edb0d6be8f753a Mon Sep 17 00:00:00 2001
From: Cristian <cristian@swapps.com>
Date: Wed, 4 Nov 2020 15:02:54 -0500
Subject: [PATCH 06/20] fix: Handle case when update tries to re-add a link
 that is not in the sql index

---
 archivebox/extractors/__init__.py | 6 +++++-
 tests/test_update.py              | 3 ++-
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/archivebox/extractors/__init__.py b/archivebox/extractors/__init__.py
index 23a4f5ef..e27b9d80 100644
--- a/archivebox/extractors/__init__.py
+++ b/archivebox/extractors/__init__.py
@@ -8,6 +8,7 @@ from datetime import datetime
 from django.db.models import QuerySet
 
 from ..index.schema import Link
+from ..index.sql import write_link_to_sql_index
 from ..index import (
     load_link_details,
     write_link_details,
@@ -68,7 +69,10 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s
     # TODO: Remove when the input is changed to be a snapshot. Suboptimal approach.
     if not skip_index:
         from core.models import Snapshot, ArchiveResult
-        snapshot = Snapshot.objects.get(url=link.url)
+        try:
+            snapshot = Snapshot.objects.get(url=link.url) # TODO: This will be unnecessary once everything is a snapshot
+        except Snapshot.DoesNotExist:
+            write_link_to_sql_index(link)
 
     ARCHIVE_METHODS = get_default_archive_methods()
     
diff --git a/tests/test_update.py b/tests/test_update.py
index 238a92d9..29db0174 100644
--- a/tests/test_update.py
+++ b/tests/test_update.py
@@ -6,7 +6,7 @@ def test_update_status_invalid(tmp_path, process, disable_extractors_dict):
     subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], capture_output=True, env=disable_extractors_dict)
     assert list((tmp_path / "archive").iterdir()) != []
 
-    subprocess.run(['archivebox', 'remove', 'http://127.0.0.1:8080/static/example.com.html', '--yes'], capture_output=True)
+    a_process = subprocess.run(['archivebox', 'remove', 'http://127.0.0.1:8080/static/example.com.html', '--yes'], capture_output=True)
 
     conn = sqlite3.connect(str(tmp_path / "index.sqlite3"))
     c = conn.cursor()
@@ -17,6 +17,7 @@ def test_update_status_invalid(tmp_path, process, disable_extractors_dict):
     assert link is None
 
     update_process = subprocess.run(['archivebox', 'update', '--status=invalid'], capture_output=True, env=disable_extractors_dict)
+    #breakpoint()
 
     conn = sqlite3.connect(str(tmp_path / "index.sqlite3"))
     c = conn.cursor()

From 33182fd53c0d96f46576ee38551a7ac4a50ee534 Mon Sep 17 00:00:00 2001
From: Cristian <cristian@swapps.com>
Date: Wed, 4 Nov 2020 15:07:45 -0500
Subject: [PATCH 07/20] fix: Add missing assignment

---
 archivebox/extractors/__init__.py | 2 +-
 tests/test_update.py              | 1 -
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/archivebox/extractors/__init__.py b/archivebox/extractors/__init__.py
index e27b9d80..ef5ef446 100644
--- a/archivebox/extractors/__init__.py
+++ b/archivebox/extractors/__init__.py
@@ -72,7 +72,7 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s
         try:
             snapshot = Snapshot.objects.get(url=link.url) # TODO: This will be unnecessary once everything is a snapshot
         except Snapshot.DoesNotExist:
-            write_link_to_sql_index(link)
+            snapshot = write_link_to_sql_index(link)
 
     ARCHIVE_METHODS = get_default_archive_methods()
     
diff --git a/tests/test_update.py b/tests/test_update.py
index 29db0174..95a61ce9 100644
--- a/tests/test_update.py
+++ b/tests/test_update.py
@@ -17,7 +17,6 @@ def test_update_status_invalid(tmp_path, process, disable_extractors_dict):
     assert link is None
 
     update_process = subprocess.run(['archivebox', 'update', '--status=invalid'], capture_output=True, env=disable_extractors_dict)
-    #breakpoint()
 
     conn = sqlite3.connect(str(tmp_path / "index.sqlite3"))
     c = conn.cursor()

From 71655220ad8554458978a078e604cb2b57fa2e1c Mon Sep 17 00:00:00 2001
From: Cristian <cristian@swapps.com>
Date: Thu, 5 Nov 2020 07:54:40 -0500
Subject: [PATCH 08/20] feat: Add warc to list and limit check to succeeded
 archive results

---
 archivebox/core/utils.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/archivebox/core/utils.py b/archivebox/core/utils.py
index 78d0cff5..1a073fa4 100644
--- a/archivebox/core/utils.py
+++ b/archivebox/core/utils.py
@@ -20,17 +20,21 @@ def get_icons(snapshot: Snapshot) -> str:
         "archive_org": "🏛",
         "readability": "🆁",
         "mercury": "🅼",
+        "warc": "📦"
     }
     exclude = ["favicon"]
     # Missing specific entry for WARC
 
-
     for extractor in EXTRACTORS:
-        result = archive_results.filter(extractor=extractor[0])
+        result = archive_results.filter(extractor=extractor[0], status="succeeded")
         try:
             if extractor[0] not in exclude:
                 output += output_template.format(link.archive_path, canon[f"{extractor[0]}_path"],
                                                  result.exists(), extractor[0], icons.get(extractor[0], "?"))
+            if extractor[0] == "wget":
+                extractor = "warc"
+                output += output_template.format(link.archive_path, canon[f"{extractor}_path"],
+                                                 result.exists(), extractor, icons.get(extractor, "?"))
         except Exception as e:
             print(e)
 

From 508a0bb06ebd15bcb63407328a5d4747fb10d977 Mon Sep 17 00:00:00 2001
From: Cristian <cristian@swapps.com>
Date: Tue, 10 Nov 2020 12:38:29 -0500
Subject: [PATCH 09/20] refactor: Unpack extractors tuple instead of using the
 index to access the relevant information

---
 archivebox/core/utils.py | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/archivebox/core/utils.py b/archivebox/core/utils.py
index 1a073fa4..228918d4 100644
--- a/archivebox/core/utils.py
+++ b/archivebox/core/utils.py
@@ -25,16 +25,19 @@ def get_icons(snapshot: Snapshot) -> str:
     exclude = ["favicon"]
     # Missing specific entry for WARC
 
-    for extractor in EXTRACTORS:
-        result = archive_results.filter(extractor=extractor[0], status="succeeded")
+    for extractor, _ in EXTRACTORS:
+        result = archive_results.filter(extractor=extractor, status="succeeded")
+        path, exists = link.archive_path, result.exists()
         try:
-            if extractor[0] not in exclude:
-                output += output_template.format(link.archive_path, canon[f"{extractor[0]}_path"],
-                                                 result.exists(), extractor[0], icons.get(extractor[0], "?"))
-            if extractor[0] == "wget":
-                extractor = "warc"
-                output += output_template.format(link.archive_path, canon[f"{extractor}_path"],
-                                                 result.exists(), extractor, icons.get(extractor, "?"))
+            if extractor not in exclude:
+                output += output_template.format(path, canon[f"{extractor}_path"],
+                                                 exists, extractor, icons.get(extractor, "?"))
+            if extractor == "wget":
+                # warc isn't technically it's own extractor, so we have to add it after wget
+
+                output += output_template.format(path, canon[f"warc_path"],
+                                                 exists, "warc", icons.get("warc", "?"))
+
         except Exception as e:
             print(e)
 

From f7f0bebdcc021623a438e7975982523cdbe8bea8 Mon Sep 17 00:00:00 2001
From: Cristian <cristian@swapps.com>
Date: Wed, 11 Nov 2020 15:26:54 -0500
Subject: [PATCH 10/20] feat: Modify migration reverse function to restore
 index (WIP)

---
 .../core/migrations/0007_archiveresult.py     | 22 ++++++++++++++++++-
 1 file changed, 21 insertions(+), 1 deletion(-)

diff --git a/archivebox/core/migrations/0007_archiveresult.py b/archivebox/core/migrations/0007_archiveresult.py
index c0e1393b..74d3a6b5 100644
--- a/archivebox/core/migrations/0007_archiveresult.py
+++ b/archivebox/core/migrations/0007_archiveresult.py
@@ -33,9 +33,29 @@ def forwards_func(apps, schema_editor):
                 start_ts=result["start_ts"], end_ts=result["end_ts"], status=result["status"], pwd=result["pwd"], output=result["output"])
 
 
+def verify_json_index_integrity(results):
+    results = snapshot.archiveresult_set.all()
+    out_dir = Path(CONFIG['ARCHIVE_DIR']) / snapshot.timestamp
+    with open(out_dir / "index.json", "r") as f:
+        index = json.load(f)
+
+    history = index["history"]
+    extractors = [extractor for extractor in history]
+    index_results = [(result, extractor) for result in history[extractor]]
+    flattened_results = [(result["start_ts"], extractor) for result, extractor in index_results]
+    
+    missing = [result for result in results if result.start_ts not in flattened_results]
+
+    #process missing elements here. Re-add to the index.json
+
+
+
 
 def reverse_func(apps, schema_editor):
-    ArchiveResult = apps.get_model("core", "ArchiveResult")
+    Snapshot = apps.get_model("core", "Snapshot")
+    for snapshot in Snapshot.objects.all():
+        verify_json_index_integrity(snapshot)
+
     ArchiveResult.objects.all().delete()
 
 

From b237e412df2c63399394a7ad0370096f7cd1009d Mon Sep 17 00:00:00 2001
From: Cristian <cristian@swapps.com>
Date: Thu, 12 Nov 2020 10:30:41 -0500
Subject: [PATCH 11/20] feat: Finish reversal. Add ArchiveResults that are not
 found in the index.json

---
 .../core/migrations/0007_archiveresult.py     | 20 ++++++++++++-------
 archivebox/core/utils.py                      |  2 +-
 2 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/archivebox/core/migrations/0007_archiveresult.py b/archivebox/core/migrations/0007_archiveresult.py
index 74d3a6b5..5da97e29 100644
--- a/archivebox/core/migrations/0007_archiveresult.py
+++ b/archivebox/core/migrations/0007_archiveresult.py
@@ -7,6 +7,7 @@ from django.db import migrations, models
 import django.db.models.deletion
 
 from config import CONFIG
+from index.json import to_json
 
 
 def forwards_func(apps, schema_editor):
@@ -33,26 +34,31 @@ def forwards_func(apps, schema_editor):
                 start_ts=result["start_ts"], end_ts=result["end_ts"], status=result["status"], pwd=result["pwd"], output=result["output"])
 
 
-def verify_json_index_integrity(results):
+def verify_json_index_integrity(snapshot):
     results = snapshot.archiveresult_set.all()
     out_dir = Path(CONFIG['ARCHIVE_DIR']) / snapshot.timestamp
     with open(out_dir / "index.json", "r") as f:
         index = json.load(f)
 
     history = index["history"]
-    extractors = [extractor for extractor in history]
-    index_results = [(result, extractor) for result in history[extractor]]
-    flattened_results = [(result["start_ts"], extractor) for result, extractor in index_results]
+    index_results = [result for extractor in history for result in history[extractor]]
+    flattened_results = [result["start_ts"] for result in index_results]
     
-    missing = [result for result in results if result.start_ts not in flattened_results]
-
-    #process missing elements here. Re-add to the index.json
+    missing_results = [result for result in results if result.start_ts.isoformat() not in flattened_results]
 
+    for missing in missing_results:
+        index["history"][missing.extractor].append({"cmd": missing.cmd, "cmd_version": missing.cmd_version, "end_ts": missing.end_ts.isoformat(),
+                                                    "start_ts": missing.start_ts.isoformat(), "pwd": missing.pwd, "output": missing.output,
+                                                    "schema": "ArchiveResult", "status": missing.status})
 
+    json_index = to_json(index)
+    with open(out_dir / "index.json", "w") as f:
+        f.write(json_index)
 
 
 def reverse_func(apps, schema_editor):
     Snapshot = apps.get_model("core", "Snapshot")
+    ArchiveResult = apps.get_model("core", "ArchiveResult")
     for snapshot in Snapshot.objects.all():
         verify_json_index_integrity(snapshot)
 
diff --git a/archivebox/core/utils.py b/archivebox/core/utils.py
index 228918d4..a5fa2669 100644
--- a/archivebox/core/utils.py
+++ b/archivebox/core/utils.py
@@ -35,7 +35,7 @@ def get_icons(snapshot: Snapshot) -> str:
             if extractor == "wget":
                 # warc isn't technically it's own extractor, so we have to add it after wget
 
-                output += output_template.format(path, canon[f"warc_path"],
+                output += output_template.format(path, canon["warc_path"],
                                                  exists, "warc", icons.get("warc", "?"))
 
         except Exception as e:

From e594e6a75a2895077029d97b88d7b6f8b580885f Mon Sep 17 00:00:00 2001
From: Cristian <cristian@swapps.com>
Date: Thu, 12 Nov 2020 10:57:31 -0500
Subject: [PATCH 12/20] feat: WARC link points to the first warc result in
 target path

---
 archivebox/core/utils.py            | 8 +++++---
 archivebox/themes/default/base.html | 6 +++++-
 2 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/archivebox/core/utils.py b/archivebox/core/utils.py
index a5fa2669..67b8004d 100644
--- a/archivebox/core/utils.py
+++ b/archivebox/core/utils.py
@@ -1,6 +1,7 @@
 from django.utils.html import format_html
 
 from core.models import Snapshot, EXTRACTORS
+from pathlib import Path
 
 
 def get_icons(snapshot: Snapshot) -> str:
@@ -34,9 +35,10 @@ def get_icons(snapshot: Snapshot) -> str:
                                                  exists, extractor, icons.get(extractor, "?"))
             if extractor == "wget":
                 # warc isn't technically it's own extractor, so we have to add it after wget
-
-                output += output_template.format(path, canon["warc_path"],
-                                                 exists, "warc", icons.get("warc", "?"))
+                exists = list((Path(path) / canon["warc_path"]).glob("*.warc.gz"))
+                if exists:
+                    output += output_template.format(exists[0], "",
+                                                     True, "warc", icons.get("warc", "?"))
 
         except Exception as e:
             print(e)
diff --git a/archivebox/themes/default/base.html b/archivebox/themes/default/base.html
index ed7d1be9..cacd0597 100644
--- a/archivebox/themes/default/base.html
+++ b/archivebox/themes/default/base.html
@@ -223,6 +223,10 @@
         .title-col a {
             color: black;
         }
+
+        .exists-False {
+          display: none;
+        }
     </style>
     <link rel="stylesheet" href="{% static 'bootstrap.min.css' %}">
     <link rel="stylesheet" href="{% static 'jquery.dataTables.min.css' %}" />
@@ -283,4 +287,4 @@
         </footer>
     </body>
     
-    </html>
\ No newline at end of file
+    </html>

From 8cfad64271cf72ed4572c4d3a2c5ff6885bc8b95 Mon Sep 17 00:00:00 2001
From: Cristian <cristian@swapps.com>
Date: Thu, 12 Nov 2020 11:09:34 -0500
Subject: [PATCH 13/20] feat: Add specific logic for archive_org icon

---
 archivebox/core/utils.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/archivebox/core/utils.py b/archivebox/core/utils.py
index 67b8004d..6266024b 100644
--- a/archivebox/core/utils.py
+++ b/archivebox/core/utils.py
@@ -23,7 +23,7 @@ def get_icons(snapshot: Snapshot) -> str:
         "mercury": "🅼",
         "warc": "📦"
     }
-    exclude = ["favicon"]
+    exclude = ["favicon", "archive_org"]
     # Missing specific entry for WARC
 
     for extractor, _ in EXTRACTORS:
@@ -40,6 +40,14 @@ def get_icons(snapshot: Snapshot) -> str:
                     output += output_template.format(exists[0], "",
                                                      True, "warc", icons.get("warc", "?"))
 
+            if extractor == "archive_org" and exists:
+                # The check for archive_org is different, so it has to be handled separately
+                target_path = Path(path) / "archive.org.txt"
+                exists = target_path.exists()
+                if exists:
+                    output += '<a href="{}" class="exists-{}" title="{}">{} </a>'.format(canon["archive_org_path"],
+                                                                                         True, "archive_org", icons.get("archive_org", "?"))
+
         except Exception as e:
             print(e)
 

From c565fad75cf5f6256a0ce70febb7c2246cbd1b42 Mon Sep 17 00:00:00 2001
From: Cristian <cristian@swapps.com>
Date: Thu, 12 Nov 2020 11:37:56 -0500
Subject: [PATCH 14/20] feat: Use prefetch related to reduce the number of
 queries to the database on public index view

---
 archivebox/core/utils.py | 83 +++++++++++++---------------------------
 archivebox/core/views.py |  1 +
 2 files changed, 27 insertions(+), 57 deletions(-)

diff --git a/archivebox/core/utils.py b/archivebox/core/utils.py
index 6266024b..14c40eaf 100644
--- a/archivebox/core/utils.py
+++ b/archivebox/core/utils.py
@@ -1,15 +1,16 @@
 from django.utils.html import format_html
 
 from core.models import Snapshot, EXTRACTORS
+from core.settings import DEBUG
 from pathlib import Path
 
 
 def get_icons(snapshot: Snapshot) -> str:
-    archive_results = snapshot.archiveresult_set
+    archive_results = list(snapshot.archiveresult_set.all())
     link = snapshot.as_link()
     canon = link.canonical_outputs()
     output = ""
-    output_template = '<a href="/{}/{}" class="exists-{}" title="{}">{} </a>'
+    output_template = '<a href="/{}/{}" class="exists-True" title="{}">{} </a>'
     icons = {
         "singlefile": "❶",
         "wget": "🆆",
@@ -27,62 +28,30 @@ def get_icons(snapshot: Snapshot) -> str:
     # Missing specific entry for WARC
 
     for extractor, _ in EXTRACTORS:
-        result = archive_results.filter(extractor=extractor, status="succeeded")
-        path, exists = link.archive_path, result.exists()
-        try:
-            if extractor not in exclude:
-                output += output_template.format(path, canon[f"{extractor}_path"],
-                                                 exists, extractor, icons.get(extractor, "?"))
-            if extractor == "wget":
-                # warc isn't technically it's own extractor, so we have to add it after wget
-                exists = list((Path(path) / canon["warc_path"]).glob("*.warc.gz"))
-                if exists:
-                    output += output_template.format(exists[0], "",
-                                                     True, "warc", icons.get("warc", "?"))
+        for result in archive_results:
+            if result.extractor != extractor or result.status != "succeeded":
+                continue
+            path = link.archive_path
+            try:
+                if extractor not in exclude:
+                    output += output_template.format(path, canon[f"{extractor}_path"],
+                                                     extractor, icons.get(extractor, "?"))
+                if extractor == "wget":
+                    # warc isn't technically it's own extractor, so we have to add it after wget
+                    exists = list((Path(path) / canon["warc_path"]).glob("*.warc.gz"))
+                    if exists:
+                        output += output_template.format(exists[0], "",
+                                                         "warc", icons.get("warc", "?"))
 
-            if extractor == "archive_org" and exists:
-                # The check for archive_org is different, so it has to be handled separately
-                target_path = Path(path) / "archive.org.txt"
-                exists = target_path.exists()
-                if exists:
-                    output += '<a href="{}" class="exists-{}" title="{}">{} </a>'.format(canon["archive_org_path"],
-                                                                                         True, "archive_org", icons.get("archive_org", "?"))
+                if extractor == "archive_org":
+                    # The check for archive_org is different, so it has to be handled separately
+                    target_path = Path(path) / "archive.org.txt"
+                    exists = target_path.exists()
+                    if exists:
+                        output += '<a href="{}" class="exists-True" title="{}">{} </a>'.format(canon["archive_org_path"],
+                                                                                               "archive_org", icons.get("archive_org", "?"))
 
-        except Exception as e:
-            print(e)
+            except Exception as e:
+                print(e)
 
     return format_html(f'<span class="files-icons" style="font-size: 1.2em; opacity: 0.8">{output}<span>')
-
-#def get_icons(snapshot: Snapshot) -> str:
-#    link = snapshot.as_link()
-#    canon = link.canonical_outputs()
-#    out_dir = Path(link.link_dir)
-#
-#    # slow version: highlights icons based on whether files exist or not for that output
-#    # link_tuple = lambda link, method: (link.archive_path, canon[method] or '', canon[method] and (out_dir / (canon[method] or 'notdone')).exists())
-#    # fast version: all icons are highlighted without checking for outputs in filesystem
-#    link_tuple = lambda link, method: (link.archive_path, canon[method] or '', canon[method] and (out_dir / (canon[method] or 'notdone')).exists())
-#
-#    return format_html(
-#            '<span class="files-icons" style="font-size: 1.2em; opacity: 0.8">'
-#                '<a href="/{}/{}" class="exists-{}" title="SingleFile">❶ </a>'
-#                '<a href="/{}/{}" class="exists-{}" title="Wget clone">🆆 </a> '
-#                '<a href="/{}/{}" class="exists-{}" title="HTML dump">🅷 </a> '
-#                '<a href="/{}/{}" class="exists-{}" title="PDF">📄 </a> '
-#                '<a href="/{}/{}" class="exists-{}" title="Screenshot">💻 </a> '
-#                '<a href="/{}/{}" class="exists-{}" title="WARC">📦 </a> '
-#                '<a href="/{}/{}/" class="exists-{}" title="Media files">📼 </a> '
-#                '<a href="/{}/{}/" class="exists-{}" title="Git repos">🅶 </a> '
-#                '<a href="{}" class="exists-{}" title="Archive.org snapshot">🏛 </a> '
-#            '</span>',
-#            *link_tuple(link, 'singlefile_path'),
-#            *link_tuple(link, 'wget_path')[:2], any((out_dir / link.domain).glob('*')),
-#            *link_tuple(link, 'pdf_path'),
-#            *link_tuple(link, 'screenshot_path'),
-#            *link_tuple(link, 'dom_path'),
-#            *link_tuple(link, 'warc_path')[:2], any((out_dir / canon['warc_path']).glob('*.warc.gz')),
-#            *link_tuple(link, 'media_path')[:2], any((out_dir / canon['media_path']).glob('*')),
-#            *link_tuple(link, 'git_path')[:2], any((out_dir / canon['git_path']).glob('*')),
-#            canon['archive_org_path'], (out_dir / 'archive.org.txt').exists(),
-#        )
-#
diff --git a/archivebox/core/views.py b/archivebox/core/views.py
index 7cd8b104..ee540821 100644
--- a/archivebox/core/views.py
+++ b/archivebox/core/views.py
@@ -98,6 +98,7 @@ class PublicArchiveView(ListView):
         query = self.request.GET.get('q')
         if query:
             qs = Snapshot.objects.filter(title__icontains=query)
+        qs = qs.prefetch_related("archiveresult_set").all()
         for snapshot in qs:
             snapshot.icons = get_icons(snapshot) 
         return qs

From 0f13087a0949800a54753880c1dc5d35c95bef05 Mon Sep 17 00:00:00 2001
From: Cristian <cristian@swapps.com>
Date: Thu, 12 Nov 2020 13:58:13 -0500
Subject: [PATCH 15/20] refactor: Remove unneeded prefetch related

---
 archivebox/core/utils.py | 5 ++---
 archivebox/core/views.py | 1 -
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/archivebox/core/utils.py b/archivebox/core/utils.py
index 14c40eaf..3c310525 100644
--- a/archivebox/core/utils.py
+++ b/archivebox/core/utils.py
@@ -1,12 +1,11 @@
 from django.utils.html import format_html
 
 from core.models import Snapshot, EXTRACTORS
-from core.settings import DEBUG
 from pathlib import Path
 
 
 def get_icons(snapshot: Snapshot) -> str:
-    archive_results = list(snapshot.archiveresult_set.all())
+    archive_results = snapshot.archiveresult_set.filter(status="succeeded")
     link = snapshot.as_link()
     canon = link.canonical_outputs()
     output = ""
@@ -29,7 +28,7 @@ def get_icons(snapshot: Snapshot) -> str:
 
     for extractor, _ in EXTRACTORS:
         for result in archive_results:
-            if result.extractor != extractor or result.status != "succeeded":
+            if result.extractor != extractor:
                 continue
             path = link.archive_path
             try:
diff --git a/archivebox/core/views.py b/archivebox/core/views.py
index ee540821..7cd8b104 100644
--- a/archivebox/core/views.py
+++ b/archivebox/core/views.py
@@ -98,7 +98,6 @@ class PublicArchiveView(ListView):
         query = self.request.GET.get('q')
         if query:
             qs = Snapshot.objects.filter(title__icontains=query)
-        qs = qs.prefetch_related("archiveresult_set").all()
         for snapshot in qs:
             snapshot.icons = get_icons(snapshot) 
         return qs

From 34a1a6d30dd588b6d840c1e9162809e191f652ba Mon Sep 17 00:00:00 2001
From: Cristian <cristian@swapps.com>
Date: Mon, 23 Nov 2020 18:28:43 -0500
Subject: [PATCH 16/20] fix: Update model according to code review

---
 .../core/migrations/0007_archiveresult.py       | 12 ++++++------
 archivebox/core/models.py                       | 17 +++++++++++------
 archivebox/themes/default/base.html             |  2 +-
 3 files changed, 18 insertions(+), 13 deletions(-)

diff --git a/archivebox/core/migrations/0007_archiveresult.py b/archivebox/core/migrations/0007_archiveresult.py
index 5da97e29..4b8a074b 100644
--- a/archivebox/core/migrations/0007_archiveresult.py
+++ b/archivebox/core/migrations/0007_archiveresult.py
@@ -76,14 +76,14 @@ class Migration(migrations.Migration):
             name='ArchiveResult',
             fields=[
                 ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
-                ('cmd', models.CharField(default='', max_length=500)),
-                ('pwd', models.CharField(default='', max_length=200)),
-                ('cmd_version', models.CharField(default='', max_length=20)),
-                ('status', models.CharField(max_length=10)),
-                ('output', models.CharField(default='', max_length=500)),
+                ('cmd', models.CharField(max_length=500)),
+                ('pwd', models.CharField(max_length=200)),
+                ('cmd_version', models.CharField(max_length=32)),
+                ('status', models.CharField(choices=[('succeeded', 'succeeded'), ('failed', 'failed'), ('skipped', 'skipped')], max_length=16)),
+                ('output', models.CharField(max_length=500)),
                 ('start_ts', models.DateTimeField()),
                 ('end_ts', models.DateTimeField()),
-                ('extractor', models.CharField(choices=[('title', 'title'), ('favicon', 'favicon'), ('wget', 'wget'), ('singlefile', 'singlefile'), ('pdf', 'pdf'), ('screenshot', 'screenshot'), ('dom', 'dom'), ('readability', 'readability'), ('mercury', 'mercury'), ('git', 'git'), ('media', 'media'), ('headers', 'headers'), ('archive_org', 'archive_org')], max_length=20)),
+                ('extractor', models.CharField(choices=[('title', 'title'), ('favicon', 'favicon'), ('wget', 'wget'), ('singlefile', 'singlefile'), ('pdf', 'pdf'), ('screenshot', 'screenshot'), ('dom', 'dom'), ('readability', 'readability'), ('mercury', 'mercury'), ('git', 'git'), ('media', 'media'), ('headers', 'headers'), ('archive_org', 'archive_org')], max_length=32)),
                 ('snapshot', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='core.Snapshot')),
             ],
         ),
diff --git a/archivebox/core/models.py b/archivebox/core/models.py
index 41976348..c273c072 100644
--- a/archivebox/core/models.py
+++ b/archivebox/core/models.py
@@ -11,6 +11,11 @@ from ..index.schema import Link
 from ..extractors import get_default_archive_methods
 
 EXTRACTORS = [(extractor[0], extractor[0]) for extractor in get_default_archive_methods()]
+STATUS_CHOICES = [
+    ("succeeded", "succeeded"),
+    ("failed", "failed"),
+    ("skipped", "skipped")
+]
 
 
 class Tag(models.Model):
@@ -155,14 +160,14 @@ class Snapshot(models.Model):
 
 class ArchiveResult(models.Model):
     snapshot = models.ForeignKey(Snapshot, on_delete=models.CASCADE)
-    cmd = models.CharField(max_length=500, default="")
-    pwd = models.CharField(max_length=200, default="")
-    cmd_version = models.CharField(max_length=20, default="")
-    output = models.CharField(max_length=500, default="")
+    cmd = models.CharField(max_length=500)
+    pwd = models.CharField(max_length=200)
+    cmd_version = models.CharField(max_length=32)
+    output = models.CharField(max_length=500)
     start_ts = models.DateTimeField()
     end_ts = models.DateTimeField()
-    status = models.CharField(max_length=10)
-    extractor = models.CharField(choices=EXTRACTORS, blank=False, max_length=20)
+    status = models.CharField(max_length=16, choices=STATUS_CHOICES)
+    extractor = models.CharField(choices=EXTRACTORS, blank=False, max_length=32)
 
     def __str__(self):
         return self.extractor
diff --git a/archivebox/themes/default/base.html b/archivebox/themes/default/base.html
index cacd0597..f778da16 100644
--- a/archivebox/themes/default/base.html
+++ b/archivebox/themes/default/base.html
@@ -225,7 +225,7 @@
         }
 
         .exists-False {
-          display: none;
+          opacity: 0.1;
         }
     </style>
     <link rel="stylesheet" href="{% static 'bootstrap.min.css' %}">

From f84f288befd9a1cb773c146b6da7ba05273ac3d7 Mon Sep 17 00:00:00 2001
From: Nick Sweeting <git@sweeting.me>
Date: Fri, 27 Nov 2020 00:01:34 -0500
Subject: [PATCH 17/20] Apply suggestions from code review

minor nit
---
 archivebox/core/models.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/archivebox/core/models.py b/archivebox/core/models.py
index c273c072..48ebd43d 100644
--- a/archivebox/core/models.py
+++ b/archivebox/core/models.py
@@ -161,13 +161,13 @@ class Snapshot(models.Model):
 class ArchiveResult(models.Model):
     snapshot = models.ForeignKey(Snapshot, on_delete=models.CASCADE)
     cmd = models.CharField(max_length=500)
-    pwd = models.CharField(max_length=200)
+    pwd = models.CharField(max_length=256)
     cmd_version = models.CharField(max_length=32)
-    output = models.CharField(max_length=500)
+    output = models.CharField(max_length=512)
     start_ts = models.DateTimeField()
     end_ts = models.DateTimeField()
     status = models.CharField(max_length=16, choices=STATUS_CHOICES)
-    extractor = models.CharField(choices=EXTRACTORS, blank=False, max_length=32)
+    extractor = models.CharField(choices=EXTRACTORS, max_length=32)
 
     def __str__(self):
         return self.extractor

From f61e6a74bb1279124fc6ee20d7a053e73eb5bf3d Mon Sep 17 00:00:00 2001
From: Cristian <cristianvargasvalencia@gmail.com>
Date: Fri, 27 Nov 2020 15:53:34 -0500
Subject: [PATCH 18/20] feat: Re-add unused icons in list view

---
 archivebox/core/utils.py            | 47 ++++++++++++++---------------
 archivebox/themes/default/base.html |  1 +
 2 files changed, 24 insertions(+), 24 deletions(-)

diff --git a/archivebox/core/utils.py b/archivebox/core/utils.py
index 3c310525..9804d6ee 100644
--- a/archivebox/core/utils.py
+++ b/archivebox/core/utils.py
@@ -1,4 +1,5 @@
 from django.utils.html import format_html
+from collections import defaultdict
 
 from core.models import Snapshot, EXTRACTORS
 from pathlib import Path
@@ -7,9 +8,10 @@ from pathlib import Path
 def get_icons(snapshot: Snapshot) -> str:
     archive_results = snapshot.archiveresult_set.filter(status="succeeded")
     link = snapshot.as_link()
+    path = link.archive_path
     canon = link.canonical_outputs()
     output = ""
-    output_template = '<a href="/{}/{}" class="exists-True" title="{}">{} </a>'
+    output_template = '<a href="/{}/{}" class="exists-{}" title="{}">{} </a>'
     icons = {
         "singlefile": "❶",
         "wget": "🆆",
@@ -23,34 +25,31 @@ def get_icons(snapshot: Snapshot) -> str:
         "mercury": "🅼",
         "warc": "📦"
     }
-    exclude = ["favicon", "archive_org"]
+    exclude = ["favicon", "title", "headers", "archive_org"]
     # Missing specific entry for WARC
 
+    extractor_items = defaultdict(lambda: None)
     for extractor, _ in EXTRACTORS:
         for result in archive_results:
-            if result.extractor != extractor:
-                continue
-            path = link.archive_path
-            try:
-                if extractor not in exclude:
-                    output += output_template.format(path, canon[f"{extractor}_path"],
-                                                     extractor, icons.get(extractor, "?"))
-                if extractor == "wget":
-                    # warc isn't technically it's own extractor, so we have to add it after wget
-                    exists = list((Path(path) / canon["warc_path"]).glob("*.warc.gz"))
-                    if exists:
-                        output += output_template.format(exists[0], "",
-                                                         "warc", icons.get("warc", "?"))
+            if result.extractor == extractor:
+                extractor_items[extractor] = result
 
-                if extractor == "archive_org":
-                    # The check for archive_org is different, so it has to be handled separately
-                    target_path = Path(path) / "archive.org.txt"
-                    exists = target_path.exists()
-                    if exists:
-                        output += '<a href="{}" class="exists-True" title="{}">{} </a>'.format(canon["archive_org_path"],
-                                                                                               "archive_org", icons.get("archive_org", "?"))
+    for extractor, _ in EXTRACTORS:
+        if extractor not in exclude:
+            exists = extractor_items[extractor] is not None
+            output += output_template.format(path, canon[f"{extractor}_path"], str(exists),
+                                             extractor, icons.get(extractor, "?"))
+        if extractor == "wget":
+            # warc isn't technically it's own extractor, so we have to add it after wget
+            exists = list((Path(path) / canon["warc_path"]).glob("*.warc.gz"))
+            if exists:
+                output += output_template.format(exists[0], "", str(bool(exists)), "warc", icons.get("warc", "?"))
 
-            except Exception as e:
-                print(e)
+        if extractor == "archive_org":
+            # The check for archive_org is different, so it has to be handled separately
+            target_path = Path(path) / "archive.org.txt"
+            exists = target_path.exists()
+            output += '<a href="{}" class="exists-{}" title="{}">{} </a>'.format(canon["archive_org_path"], str(exists),
+                                                                                        "archive_org", icons.get("archive_org", "?"))
 
     return format_html(f'<span class="files-icons" style="font-size: 1.2em; opacity: 0.8">{output}<span>')
diff --git a/archivebox/themes/default/base.html b/archivebox/themes/default/base.html
index f778da16..77d912d5 100644
--- a/archivebox/themes/default/base.html
+++ b/archivebox/themes/default/base.html
@@ -226,6 +226,7 @@
 
         .exists-False {
           opacity: 0.1;
+          pointer-events: none;
         }
     </style>
     <link rel="stylesheet" href="{% static 'bootstrap.min.css' %}">

From 4b3f72202b92d2ab04baa99780a41fa302bf94e6 Mon Sep 17 00:00:00 2001
From: Cristian <cristianvargasvalencia@gmail.com>
Date: Fri, 27 Nov 2020 16:23:27 -0500
Subject: [PATCH 19/20] feat: Bump django, update migration and change cmd to
 use JSONField

---
 archivebox.egg-info/SOURCES.txt                  | 1 +
 archivebox.egg-info/requires.txt                 | 3 +--
 archivebox/core/migrations/0007_archiveresult.py | 8 ++++----
 archivebox/core/models.py                        | 2 +-
 setup.py                                         | 2 +-
 5 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/archivebox.egg-info/SOURCES.txt b/archivebox.egg-info/SOURCES.txt
index eee55cc5..471100ad 100644
--- a/archivebox.egg-info/SOURCES.txt
+++ b/archivebox.egg-info/SOURCES.txt
@@ -57,6 +57,7 @@ archivebox/core/migrations/0003_auto_20200630_1034.py
 archivebox/core/migrations/0004_auto_20200713_1552.py
 archivebox/core/migrations/0005_auto_20200728_0326.py
 archivebox/core/migrations/0006_auto_20201012_1520.py
+archivebox/core/migrations/0007_archiveresult.py
 archivebox/core/migrations/__init__.py
 archivebox/extractors/__init__.py
 archivebox/extractors/archive_org.py
diff --git a/archivebox.egg-info/requires.txt b/archivebox.egg-info/requires.txt
index 71dc253d..e0e17f19 100644
--- a/archivebox.egg-info/requires.txt
+++ b/archivebox.egg-info/requires.txt
@@ -2,7 +2,7 @@ requests==2.24.0
 atomicwrites==1.4.0
 mypy-extensions==0.4.3
 base32-crockford==0.3.0
-django==3.0.8
+django==3.1.3
 django-extensions==3.0.3
 dateparser
 ipython
@@ -13,7 +13,6 @@ w3lib==1.22.0
 
 [dev]
 setuptools
-wheel
 twine
 flake8
 ipdb
diff --git a/archivebox/core/migrations/0007_archiveresult.py b/archivebox/core/migrations/0007_archiveresult.py
index 4b8a074b..898e0f93 100644
--- a/archivebox/core/migrations/0007_archiveresult.py
+++ b/archivebox/core/migrations/0007_archiveresult.py
@@ -30,7 +30,7 @@ def forwards_func(apps, schema_editor):
 
         for extractor in history:
             for result in history[extractor]:
-                ArchiveResult.objects.create(extractor=extractor, snapshot=snapshot, cmd=json.dumps(result["cmd"]), cmd_version=result["cmd_version"], 
+                ArchiveResult.objects.create(extractor=extractor, snapshot=snapshot, cmd=result["cmd"], cmd_version=result["cmd_version"], 
                 start_ts=result["start_ts"], end_ts=result["end_ts"], status=result["status"], pwd=result["pwd"], output=result["output"])
 
 
@@ -76,11 +76,11 @@ class Migration(migrations.Migration):
             name='ArchiveResult',
             fields=[
                 ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
-                ('cmd', models.CharField(max_length=500)),
-                ('pwd', models.CharField(max_length=200)),
+                ('cmd', models.JSONField()),
+                ('pwd', models.CharField(max_length=256)),
                 ('cmd_version', models.CharField(max_length=32)),
                 ('status', models.CharField(choices=[('succeeded', 'succeeded'), ('failed', 'failed'), ('skipped', 'skipped')], max_length=16)),
-                ('output', models.CharField(max_length=500)),
+                ('output', models.CharField(max_length=512)),
                 ('start_ts', models.DateTimeField()),
                 ('end_ts', models.DateTimeField()),
                 ('extractor', models.CharField(choices=[('title', 'title'), ('favicon', 'favicon'), ('wget', 'wget'), ('singlefile', 'singlefile'), ('pdf', 'pdf'), ('screenshot', 'screenshot'), ('dom', 'dom'), ('readability', 'readability'), ('mercury', 'mercury'), ('git', 'git'), ('media', 'media'), ('headers', 'headers'), ('archive_org', 'archive_org')], max_length=32)),
diff --git a/archivebox/core/models.py b/archivebox/core/models.py
index 48ebd43d..9d893490 100644
--- a/archivebox/core/models.py
+++ b/archivebox/core/models.py
@@ -160,7 +160,7 @@ class Snapshot(models.Model):
 
 class ArchiveResult(models.Model):
     snapshot = models.ForeignKey(Snapshot, on_delete=models.CASCADE)
-    cmd = models.CharField(max_length=500)
+    cmd = models.JSONField()
     pwd = models.CharField(max_length=256)
     cmd_version = models.CharField(max_length=32)
     output = models.CharField(max_length=512)
diff --git a/setup.py b/setup.py
index cdec8133..c540bc07 100755
--- a/setup.py
+++ b/setup.py
@@ -52,7 +52,7 @@ setuptools.setup(
         "atomicwrites==1.4.0",
         "mypy-extensions==0.4.3",
         "base32-crockford==0.3.0",
-        "django==3.0.8",
+        "django==3.1.3",
         "django-extensions==3.0.3",
 
         "dateparser",

From 00bb55203ec7f585e5b31d233b9f8a94dc53f830 Mon Sep 17 00:00:00 2001
From: Nick Sweeting <git@sweeting.me>
Date: Fri, 27 Nov 2020 23:45:49 -0500
Subject: [PATCH 20/20] always show WARC icon with opacity set based on exists

---
 archivebox/core/utils.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/archivebox/core/utils.py b/archivebox/core/utils.py
index 9804d6ee..3df46a51 100644
--- a/archivebox/core/utils.py
+++ b/archivebox/core/utils.py
@@ -42,8 +42,7 @@ def get_icons(snapshot: Snapshot) -> str:
         if extractor == "wget":
             # warc isn't technically it's own extractor, so we have to add it after wget
             exists = list((Path(path) / canon["warc_path"]).glob("*.warc.gz"))
-            if exists:
-                output += output_template.format(exists[0], "", str(bool(exists)), "warc", icons.get("warc", "?"))
+            output += output_template.format(exists[0] if exists else '#', canon["warc_path"], str(bool(exists)), "warc", icons.get("warc", "?"))
 
         if extractor == "archive_org":
             # The check for archive_org is different, so it has to be handled separately