1
0
Fork 0
mirror of synced 2024-09-27 23:01:22 +12:00

cache dir size, snapshot icons, tags str, and title in django cache

This commit is contained in:
Nick Sweeting 2021-02-16 15:49:29 -05:00
parent 51440ede3a
commit 8b236b9367
3 changed files with 83 additions and 69 deletions

View file

@ -5,9 +5,11 @@ import uuid
from django.db import models, transaction from django.db import models, transaction
from django.utils.functional import cached_property from django.utils.functional import cached_property
from django.utils.text import slugify from django.utils.text import slugify
from django.core.cache import cache
from django.db.models import Case, When, Value, IntegerField from django.db.models import Case, When, Value, IntegerField
from ..config import ARCHIVE_DIR from ..config import ARCHIVE_DIR, ARCHIVE_DIR_NAME
from ..system import get_dir_size
from ..util import parse_date, base_url, hashurl from ..util import parse_date, base_url, hashurl
from ..index.schema import Link from ..index.schema import Link
from ..extractors import get_default_archive_methods, ARCHIVE_METHODS_INDEXING_PRECEDENCE from ..extractors import get_default_archive_methods, ARCHIVE_METHODS_INDEXING_PRECEDENCE
@ -111,7 +113,9 @@ class Snapshot(models.Model):
return load_link_details(self.as_link()) return load_link_details(self.as_link())
def tags_str(self) -> str: def tags_str(self) -> str:
return ','.join(self.tags.order_by('name').values_list('name', flat=True)) cache_key = f'{self.id}-{(self.updated or self.added).timestamp()}-tags'
calc_tags_str = lambda: ','.join(self.tags.order_by('name').values_list('name', flat=True))
return cache.get_or_set(cache_key, calc_tags_str)
@cached_property @cached_property
def bookmarked(self): def bookmarked(self):
@ -148,10 +152,15 @@ class Snapshot(models.Model):
@cached_property @cached_property
def archive_size(self): def archive_size(self):
try: cache_key = f'{str(self.id)[:12]}-{(self.updated or self.added).timestamp()}-size'
return get_dir_size(self.link_dir)[0]
except Exception: def calc_dir_size():
return 0 try:
return get_dir_size(self.link_dir)[0]
except Exception:
return 0
return cache.get_or_set(cache_key, calc_dir_size)
@cached_property @cached_property
def history(self): def history(self):

View file

@ -6,6 +6,7 @@ from collections import defaultdict
from typing import List, Optional, Iterator, Mapping from typing import List, Optional, Iterator, Mapping
from django.utils.html import format_html, mark_safe from django.utils.html import format_html, mark_safe
from django.core.cache import cache
from .schema import Link from .schema import Link
from ..system import atomic_write from ..system import atomic_write
@ -115,74 +116,78 @@ def render_django_template(template: str, context: Mapping[str, str]) -> str:
def snapshot_icons(snapshot) -> str: def snapshot_icons(snapshot) -> str:
from core.models import EXTRACTORS cache_key = f'{str(snapshot.id)[:12]}-{(snapshot.updated or snapshot.added).timestamp()}-snapshot-icons'
# start = datetime.now()
archive_results = snapshot.archiveresult_set.filter(status="succeeded", output__isnull=False) def calc_snapshot_icons():
link = snapshot.as_link() from core.models import EXTRACTORS
path = link.archive_path # start = datetime.now()
canon = link.canonical_outputs()
output = ""
output_template = '<a href="/{}/{}" class="exists-{}" title="{}">{}</a> &nbsp;'
icons = {
"singlefile": "",
"wget": "🆆",
"dom": "🅷",
"pdf": "📄",
"screenshot": "💻",
"media": "📼",
"git": "🅶",
"archive_org": "🏛",
"readability": "🆁",
"mercury": "🅼",
"warc": "📦"
}
exclude = ["favicon", "title", "headers", "archive_org"]
# Missing specific entry for WARC
extractor_outputs = defaultdict(lambda: None) archive_results = snapshot.archiveresult_set.filter(status="succeeded", output__isnull=False)
for extractor, _ in EXTRACTORS: link = snapshot.as_link()
for result in archive_results: path = link.archive_path
if result.extractor == extractor and result: canon = link.canonical_outputs()
extractor_outputs[extractor] = result output = ""
output_template = '<a href="/{}/{}" class="exists-{}" title="{}">{}</a> &nbsp;'
icons = {
"singlefile": "",
"wget": "🆆",
"dom": "🅷",
"pdf": "📄",
"screenshot": "💻",
"media": "📼",
"git": "🅶",
"archive_org": "🏛",
"readability": "🆁",
"mercury": "🅼",
"warc": "📦"
}
exclude = ["favicon", "title", "headers", "archive_org"]
# Missing specific entry for WARC
for extractor, _ in EXTRACTORS: extractor_outputs = defaultdict(lambda: None)
if extractor not in exclude: for extractor, _ in EXTRACTORS:
existing = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output for result in archive_results:
# Check filesystem to see if anything is actually present (too slow, needs optimization/caching) if result.extractor == extractor and result:
# if existing: extractor_outputs[extractor] = result
# existing = (Path(path) / existing)
# if existing.is_file():
# existing = True
# elif existing.is_dir():
# existing = any(existing.glob('*.*'))
output += format_html(output_template, path, canon[f"{extractor}_path"], str(bool(existing)),
extractor, icons.get(extractor, "?"))
if extractor == "wget":
# warc isn't technically its own extractor, so we have to add it after wget
# get from db (faster but less truthful) for extractor, _ in EXTRACTORS:
exists = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output if extractor not in exclude:
# get from filesystem (slower but more accurate) existing = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output
# exists = list((Path(path) / canon["warc_path"]).glob("*.warc.gz")) # Check filesystem to see if anything is actually present (too slow, needs optimization/caching)
output += format_html(output_template, path, canon["warc_path"], str(bool(exists)), "warc", icons.get("warc", "?")) # if existing:
# existing = (Path(path) / existing)
# if existing.is_file():
# existing = True
# elif existing.is_dir():
# existing = any(existing.glob('*.*'))
output += format_html(output_template, path, canon[f"{extractor}_path"], str(bool(existing)),
extractor, icons.get(extractor, "?"))
if extractor == "wget":
# warc isn't technically its own extractor, so we have to add it after wget
if extractor == "archive_org": # get from db (faster but less truthful)
# The check for archive_org is different, so it has to be handled separately exists = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output
# get from filesystem (slower but more accurate)
# exists = list((Path(path) / canon["warc_path"]).glob("*.warc.gz"))
output += format_html(output_template, path, canon["warc_path"], str(bool(exists)), "warc", icons.get("warc", "?"))
# get from db (faster) if extractor == "archive_org":
exists = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output # The check for archive_org is different, so it has to be handled separately
# get from filesystem (slower)
# target_path = Path(path) / "archive.org.txt"
# exists = target_path.exists()
output += '<a href="{}" class="exists-{}" title="{}">{}</a> '.format(canon["archive_org_path"], str(exists),
"archive_org", icons.get("archive_org", "?"))
result = format_html('<span class="files-icons" style="font-size: 1.1em; opacity: 0.8; min-width: 240px; display: inline-block">{}<span>', mark_safe(output)) # get from db (faster)
# end = datetime.now() exists = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output
# print(((end - start).total_seconds()*1000) // 1, 'ms') # get from filesystem (slower)
return result # target_path = Path(path) / "archive.org.txt"
# exists = target_path.exists()
output += '<a href="{}" class="exists-{}" title="{}">{}</a> '.format(canon["archive_org_path"], str(exists),
"archive_org", icons.get("archive_org", "?"))
# return cache.get_or_set(cache_key, calc_snapshot_icons) result = format_html('<span class="files-icons" style="font-size: 1.1em; opacity: 0.8; min-width: 240px; display: inline-block">{}<span>', mark_safe(output))
# end = datetime.now()
# print(((end - start).total_seconds()*1000) // 1, 'ms')
return result
return cache.get_or_set(cache_key, calc_snapshot_icons)
# return calc_snapshot_icons()

View file

@ -16,6 +16,7 @@ from typing import List, Dict, Any, Optional, Union
from dataclasses import dataclass, asdict, field, fields from dataclasses import dataclass, asdict, field, fields
from django.utils.functional import cached_property
from ..system import get_dir_size from ..system import get_dir_size
@ -133,7 +134,6 @@ class Link:
updated: Optional[datetime] = None updated: Optional[datetime] = None
schema: str = 'Link' schema: str = 'Link'
def __str__(self) -> str: def __str__(self) -> str:
return f'[{self.timestamp}] {self.url} "{self.title}"' return f'[{self.timestamp}] {self.url} "{self.title}"'