diff --git a/archivebox/core/models.py b/archivebox/core/models.py index bb7e53d2..76a90199 100644 --- a/archivebox/core/models.py +++ b/archivebox/core/models.py @@ -5,9 +5,11 @@ import uuid from django.db import models, transaction from django.utils.functional import cached_property from django.utils.text import slugify +from django.core.cache import cache from django.db.models import Case, When, Value, IntegerField -from ..config import ARCHIVE_DIR +from ..config import ARCHIVE_DIR, ARCHIVE_DIR_NAME +from ..system import get_dir_size from ..util import parse_date, base_url, hashurl from ..index.schema import Link from ..extractors import get_default_archive_methods, ARCHIVE_METHODS_INDEXING_PRECEDENCE @@ -111,7 +113,9 @@ class Snapshot(models.Model): return load_link_details(self.as_link()) def tags_str(self) -> str: - return ','.join(self.tags.order_by('name').values_list('name', flat=True)) + cache_key = f'{self.id}-{(self.updated or self.added).timestamp()}-tags' + calc_tags_str = lambda: ','.join(self.tags.order_by('name').values_list('name', flat=True)) + return cache.get_or_set(cache_key, calc_tags_str) @cached_property def bookmarked(self): @@ -148,10 +152,15 @@ class Snapshot(models.Model): @cached_property def archive_size(self): - try: - return get_dir_size(self.link_dir)[0] - except Exception: - return 0 + cache_key = f'{str(self.id)[:12]}-{(self.updated or self.added).timestamp()}-size' + + def calc_dir_size(): + try: + return get_dir_size(self.link_dir)[0] + except Exception: + return 0 + + return cache.get_or_set(cache_key, calc_dir_size) @cached_property def history(self): diff --git a/archivebox/index/html.py b/archivebox/index/html.py index c1b6df10..c4f66f55 100644 --- a/archivebox/index/html.py +++ b/archivebox/index/html.py @@ -6,6 +6,7 @@ from collections import defaultdict from typing import List, Optional, Iterator, Mapping from django.utils.html import format_html, mark_safe +from django.core.cache import cache from .schema import Link from ..system import atomic_write @@ -115,74 +116,78 @@ def render_django_template(template: str, context: Mapping[str, str]) -> str: def snapshot_icons(snapshot) -> str: - from core.models import EXTRACTORS - # start = datetime.now() + cache_key = f'{str(snapshot.id)[:12]}-{(snapshot.updated or snapshot.added).timestamp()}-snapshot-icons' + + def calc_snapshot_icons(): + from core.models import EXTRACTORS + # start = datetime.now() - archive_results = snapshot.archiveresult_set.filter(status="succeeded", output__isnull=False) - link = snapshot.as_link() - path = link.archive_path - canon = link.canonical_outputs() - output = "" - output_template = '{}  ' - icons = { - "singlefile": "❶", - "wget": "🆆", - "dom": "🅷", - "pdf": "📄", - "screenshot": "💻", - "media": "📼", - "git": "🅶", - "archive_org": "🏛", - "readability": "🆁", - "mercury": "🅼", - "warc": "📦" - } - exclude = ["favicon", "title", "headers", "archive_org"] - # Missing specific entry for WARC + archive_results = snapshot.archiveresult_set.filter(status="succeeded", output__isnull=False) + link = snapshot.as_link() + path = link.archive_path + canon = link.canonical_outputs() + output = "" + output_template = '{}  ' + icons = { + "singlefile": "❶", + "wget": "🆆", + "dom": "🅷", + "pdf": "📄", + "screenshot": "💻", + "media": "📼", + "git": "🅶", + "archive_org": "🏛", + "readability": "🆁", + "mercury": "🅼", + "warc": "📦" + } + exclude = ["favicon", "title", "headers", "archive_org"] + # Missing specific entry for WARC - extractor_outputs = defaultdict(lambda: None) - for extractor, _ in EXTRACTORS: - for result in archive_results: - if result.extractor == extractor and result: - extractor_outputs[extractor] = result + extractor_outputs = defaultdict(lambda: None) + for extractor, _ in EXTRACTORS: + for result in archive_results: + if result.extractor == extractor and result: + extractor_outputs[extractor] = result - for extractor, _ in EXTRACTORS: - if extractor not in exclude: - existing = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output - # Check filesystsem to see if anything is actually present (too slow, needs optimization/caching) - # if existing: - # existing = (Path(path) / existing) - # if existing.is_file(): - # existing = True - # elif existing.is_dir(): - # existing = any(existing.glob('*.*')) - output += format_html(output_template, path, canon[f"{extractor}_path"], str(bool(existing)), - extractor, icons.get(extractor, "?")) - if extractor == "wget": - # warc isn't technically it's own extractor, so we have to add it after wget - - # get from db (faster but less thurthful) - exists = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output - # get from filesystem (slower but more accurate) - # exists = list((Path(path) / canon["warc_path"]).glob("*.warc.gz")) - output += format_html(output_template, path, canon["warc_path"], str(bool(exists)), "warc", icons.get("warc", "?")) + for extractor, _ in EXTRACTORS: + if extractor not in exclude: + existing = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output + # Check filesystsem to see if anything is actually present (too slow, needs optimization/caching) + # if existing: + # existing = (Path(path) / existing) + # if existing.is_file(): + # existing = True + # elif existing.is_dir(): + # existing = any(existing.glob('*.*')) + output += format_html(output_template, path, canon[f"{extractor}_path"], str(bool(existing)), + extractor, icons.get(extractor, "?")) + if extractor == "wget": + # warc isn't technically it's own extractor, so we have to add it after wget + + # get from db (faster but less thurthful) + exists = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output + # get from filesystem (slower but more accurate) + # exists = list((Path(path) / canon["warc_path"]).glob("*.warc.gz")) + output += format_html(output_template, path, canon["warc_path"], str(bool(exists)), "warc", icons.get("warc", "?")) - if extractor == "archive_org": - # The check for archive_org is different, so it has to be handled separately + if extractor == "archive_org": + # The check for archive_org is different, so it has to be handled separately - # get from db (faster) - exists = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output - # get from filesystem (slower) - # target_path = Path(path) / "archive.org.txt" - # exists = target_path.exists() - output += '{} '.format(canon["archive_org_path"], str(exists), - "archive_org", icons.get("archive_org", "?")) + # get from db (faster) + exists = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output + # get from filesystem (slower) + # target_path = Path(path) / "archive.org.txt" + # exists = target_path.exists() + output += '{} '.format(canon["archive_org_path"], str(exists), + "archive_org", icons.get("archive_org", "?")) - result = format_html('{}', mark_safe(output)) - # end = datetime.now() - # print(((end - start).total_seconds()*1000) // 1, 'ms') - return result + result = format_html('{}', mark_safe(output)) + # end = datetime.now() + # print(((end - start).total_seconds()*1000) // 1, 'ms') + return result - # return cache.get_or_set(cache_key, calc_snapshot_icons) + return cache.get_or_set(cache_key, calc_snapshot_icons) + # return calc_snapshot_icons() diff --git a/archivebox/index/schema.py b/archivebox/index/schema.py index 9c83c4cf..df43d7b7 100644 --- a/archivebox/index/schema.py +++ b/archivebox/index/schema.py @@ -16,6 +16,7 @@ from typing import List, Dict, Any, Optional, Union from dataclasses import dataclass, asdict, field, fields +from django.utils.functional import cached_property from ..system import get_dir_size @@ -133,7 +134,6 @@ class Link: updated: Optional[datetime] = None schema: str = 'Link' - def __str__(self) -> str: return f'[{self.timestamp}] {self.url} "{self.title}"'