diff --git a/archivebox/core/models.py b/archivebox/core/models.py
index bb7e53d2..76a90199 100644
--- a/archivebox/core/models.py
+++ b/archivebox/core/models.py
@@ -5,9 +5,11 @@ import uuid
from django.db import models, transaction
from django.utils.functional import cached_property
from django.utils.text import slugify
+from django.core.cache import cache
from django.db.models import Case, When, Value, IntegerField
-from ..config import ARCHIVE_DIR
+from ..config import ARCHIVE_DIR, ARCHIVE_DIR_NAME
+from ..system import get_dir_size
from ..util import parse_date, base_url, hashurl
from ..index.schema import Link
from ..extractors import get_default_archive_methods, ARCHIVE_METHODS_INDEXING_PRECEDENCE
@@ -111,7 +113,9 @@ class Snapshot(models.Model):
return load_link_details(self.as_link())
def tags_str(self) -> str:
- return ','.join(self.tags.order_by('name').values_list('name', flat=True))
+ cache_key = f'{self.id}-{(self.updated or self.added).timestamp()}-tags'
+ calc_tags_str = lambda: ','.join(self.tags.order_by('name').values_list('name', flat=True))
+ return cache.get_or_set(cache_key, calc_tags_str)
@cached_property
def bookmarked(self):
@@ -148,10 +152,15 @@ class Snapshot(models.Model):
@cached_property
def archive_size(self):
- try:
- return get_dir_size(self.link_dir)[0]
- except Exception:
- return 0
+ cache_key = f'{str(self.id)[:12]}-{(self.updated or self.added).timestamp()}-size'
+
+ def calc_dir_size():
+ try:
+ return get_dir_size(self.link_dir)[0]
+ except Exception:
+ return 0
+
+ return cache.get_or_set(cache_key, calc_dir_size)
@cached_property
def history(self):
diff --git a/archivebox/index/html.py b/archivebox/index/html.py
index c1b6df10..c4f66f55 100644
--- a/archivebox/index/html.py
+++ b/archivebox/index/html.py
@@ -6,6 +6,7 @@ from collections import defaultdict
from typing import List, Optional, Iterator, Mapping
from django.utils.html import format_html, mark_safe
+from django.core.cache import cache
from .schema import Link
from ..system import atomic_write
@@ -115,74 +116,78 @@ def render_django_template(template: str, context: Mapping[str, str]) -> str:
def snapshot_icons(snapshot) -> str:
- from core.models import EXTRACTORS
- # start = datetime.now()
+ cache_key = f'{str(snapshot.id)[:12]}-{(snapshot.updated or snapshot.added).timestamp()}-snapshot-icons'
+
+ def calc_snapshot_icons():
+ from core.models import EXTRACTORS
+ # start = datetime.now()
- archive_results = snapshot.archiveresult_set.filter(status="succeeded", output__isnull=False)
- link = snapshot.as_link()
- path = link.archive_path
- canon = link.canonical_outputs()
- output = ""
- output_template = '{} '
- icons = {
- "singlefile": "❶",
- "wget": "🆆",
- "dom": "🅷",
- "pdf": "📄",
- "screenshot": "💻",
- "media": "📼",
- "git": "🅶",
- "archive_org": "🏛",
- "readability": "🆁",
- "mercury": "🅼",
- "warc": "📦"
- }
- exclude = ["favicon", "title", "headers", "archive_org"]
- # Missing specific entry for WARC
+ archive_results = snapshot.archiveresult_set.filter(status="succeeded", output__isnull=False)
+ link = snapshot.as_link()
+ path = link.archive_path
+ canon = link.canonical_outputs()
+ output = ""
+ output_template = '{} '
+ icons = {
+ "singlefile": "❶",
+ "wget": "🆆",
+ "dom": "🅷",
+ "pdf": "📄",
+ "screenshot": "💻",
+ "media": "📼",
+ "git": "🅶",
+ "archive_org": "🏛",
+ "readability": "🆁",
+ "mercury": "🅼",
+ "warc": "📦"
+ }
+ exclude = ["favicon", "title", "headers", "archive_org"]
+ # Missing specific entry for WARC
- extractor_outputs = defaultdict(lambda: None)
- for extractor, _ in EXTRACTORS:
- for result in archive_results:
- if result.extractor == extractor and result:
- extractor_outputs[extractor] = result
+ extractor_outputs = defaultdict(lambda: None)
+ for extractor, _ in EXTRACTORS:
+ for result in archive_results:
+ if result.extractor == extractor and result:
+ extractor_outputs[extractor] = result
- for extractor, _ in EXTRACTORS:
- if extractor not in exclude:
- existing = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output
- # Check filesystsem to see if anything is actually present (too slow, needs optimization/caching)
- # if existing:
- # existing = (Path(path) / existing)
- # if existing.is_file():
- # existing = True
- # elif existing.is_dir():
- # existing = any(existing.glob('*.*'))
- output += format_html(output_template, path, canon[f"{extractor}_path"], str(bool(existing)),
- extractor, icons.get(extractor, "?"))
- if extractor == "wget":
- # warc isn't technically it's own extractor, so we have to add it after wget
-
- # get from db (faster but less thurthful)
- exists = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output
- # get from filesystem (slower but more accurate)
- # exists = list((Path(path) / canon["warc_path"]).glob("*.warc.gz"))
- output += format_html(output_template, path, canon["warc_path"], str(bool(exists)), "warc", icons.get("warc", "?"))
+ for extractor, _ in EXTRACTORS:
+ if extractor not in exclude:
+ existing = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output
+ # Check filesystem to see if anything is actually present (too slow, needs optimization/caching)
+ # if existing:
+ # existing = (Path(path) / existing)
+ # if existing.is_file():
+ # existing = True
+ # elif existing.is_dir():
+ # existing = any(existing.glob('*.*'))
+ output += format_html(output_template, path, canon[f"{extractor}_path"], str(bool(existing)),
+ extractor, icons.get(extractor, "?"))
+ if extractor == "wget":
+ # warc isn't technically its own extractor, so we have to add it after wget
+
+ # get from db (faster but less truthful)
+ exists = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output
+ # get from filesystem (slower but more accurate)
+ # exists = list((Path(path) / canon["warc_path"]).glob("*.warc.gz"))
+ output += format_html(output_template, path, canon["warc_path"], str(bool(exists)), "warc", icons.get("warc", "?"))
- if extractor == "archive_org":
- # The check for archive_org is different, so it has to be handled separately
+ if extractor == "archive_org":
+ # The check for archive_org is different, so it has to be handled separately
- # get from db (faster)
- exists = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output
- # get from filesystem (slower)
- # target_path = Path(path) / "archive.org.txt"
- # exists = target_path.exists()
- output += '{} '.format(canon["archive_org_path"], str(exists),
- "archive_org", icons.get("archive_org", "?"))
+ # get from db (faster)
+ exists = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output
+ # get from filesystem (slower)
+ # target_path = Path(path) / "archive.org.txt"
+ # exists = target_path.exists()
+ output += '{} '.format(canon["archive_org_path"], str(exists),
+ "archive_org", icons.get("archive_org", "?"))
- result = format_html('{}', mark_safe(output))
- # end = datetime.now()
- # print(((end - start).total_seconds()*1000) // 1, 'ms')
- return result
+ result = format_html('{}', mark_safe(output))
+ # end = datetime.now()
+ # print(((end - start).total_seconds()*1000) // 1, 'ms')
+ return result
- # return cache.get_or_set(cache_key, calc_snapshot_icons)
+ return cache.get_or_set(cache_key, calc_snapshot_icons)
+ # return calc_snapshot_icons()
diff --git a/archivebox/index/schema.py b/archivebox/index/schema.py
index 9c83c4cf..df43d7b7 100644
--- a/archivebox/index/schema.py
+++ b/archivebox/index/schema.py
@@ -16,6 +16,7 @@ from typing import List, Dict, Any, Optional, Union
from dataclasses import dataclass, asdict, field, fields
+from django.utils.functional import cached_property
from ..system import get_dir_size
@@ -133,7 +134,6 @@ class Link:
updated: Optional[datetime] = None
schema: str = 'Link'
-
def __str__(self) -> str:
return f'[{self.timestamp}] {self.url} "{self.title}"'