From 15d88be22922ecc993ed62e04e1c128122de9bb5 Mon Sep 17 00:00:00 2001
From: Cristian
Date: Mon, 4 Jan 2021 09:14:02 -0500
Subject: [PATCH] fix: Partially restore `server` command functionality (html still needs some refactoring)

---
Review notes (free text in this position is not part of the commit message):
 * Snapshot.oldest_archive_date orders the snapshot's ArchiveResults by
   ascending start_ts so that the first row is the earliest one; ordering by
   "-start_ts" would return the most recent timestamp instead.
 * NOTE(review): the bare `snapshot._asdict()` statement added to
   link_details_template discards its result -- confirm whether it is needed.

 archivebox/core/models.py | 54 +++++++++++++++++++++++++++++++++------
 archivebox/core/views.py  |  2 +-
 archivebox/index/html.py  | 39 ++++++++++++++--------------
 3 files changed, 66 insertions(+), 29 deletions(-)

diff --git a/archivebox/core/models.py b/archivebox/core/models.py
index 82f7ecef..388e3f70 100644
--- a/archivebox/core/models.py
+++ b/archivebox/core/models.py
@@ -3,6 +3,7 @@ __package__ = 'archivebox.core'
 import uuid
 from pathlib import Path
 from typing import Dict, Optional, List
+from datetime import datetime
 
 from django.db import models, transaction
 from django.utils.functional import cached_property
@@ -12,6 +13,7 @@ from django.db.models import Case, When, Value, IntegerField
 from ..util import parse_date
 from ..index.schema import Link
 from ..config import CONFIG
+from ..system import get_dir_size
 
 #EXTRACTORS = [(extractor[0], extractor[0]) for extractor in get_default_archive_methods()]
 EXTRACTORS = [("title", "title"), ("wget", "wget")]
@@ -133,8 +135,36 @@ class Snapshot(models.Model):
         return parse_date(self.timestamp)
 
     @cached_property
-    def is_archived(self):
-        return self.as_link().is_archived
+    def is_archived(self) -> bool:
+        from ..config import ARCHIVE_DIR
+        from ..util import domain
+
+        output_paths = (
+            domain(self.url),
+            'output.pdf',
+            'screenshot.png',
+            'output.html',
+            'media',
+            'singlefile.html'
+        )
+
+        return any(
+            (Path(ARCHIVE_DIR) / self.timestamp / path).exists()
+            for path in output_paths
+        )
+
+    @cached_property
+    def archive_dates(self) -> List[datetime]:
+        return [
+            result.start_ts
+            for result in self.archiveresult_set.all()
+        ]
+
+    @cached_property
+    def oldest_archive_date(self) -> Optional[datetime]:
+        oldest = self.archiveresult_set.all().order_by("start_ts")[:1]
+        if len(oldest) > 0:
+            return oldest[0].start_ts
 
     @cached_property
     def num_outputs(self):
@@ -145,8 +175,9 @@ class Snapshot(models.Model):
         return self.as_link().url_hash
 
     @cached_property
-    def base_url(self):
-        return self.as_link().base_url
+    def base_url(self) -> str:
+        from ..util import base_url
+        return base_url(self.url)
 
     @cached_property
     def snapshot_dir(self):
@@ -155,11 +186,15 @@ class Snapshot(models.Model):
 
     @cached_property
     def archive_path(self):
-        return self.as_link().archive_path
+        from ..config import ARCHIVE_DIR_NAME
+        return '{}/{}'.format(ARCHIVE_DIR_NAME, self.timestamp)
 
     @cached_property
-    def archive_size(self):
-        return self.as_link().archive_size
+    def archive_size(self) -> float:
+        try:
+            return get_dir_size(self.archive_path)[0]
+        except Exception:
+            return 0
 
     @cached_property
     def history(self):
@@ -191,7 +226,10 @@ class Snapshot(models.Model):
 
         # TODO: Define what details are, and return them accordingly
         return {"history": {}}
-
+    @property
+    def extension(self) -> str:
+        from ..util import extension
+        return extension(self.url)
 
     def canonical_outputs(self) -> Dict[str, Optional[str]]:
         """predict the expected output paths that should be present after archiving"""
diff --git a/archivebox/core/views.py b/archivebox/core/views.py
index ed549aa4..411cce29 100644
--- a/archivebox/core/views.py
+++ b/archivebox/core/views.py
@@ -61,7 +61,7 @@ class LinkDetails(View):
         by_ts = {page.timestamp: page for page in all_pages}
         try:
             # print('SERVING STATICFILE', by_ts[slug].link_dir, request.path, path)
-            response = static.serve(request, archivefile, document_root=by_ts[slug].link_dir, show_indexes=True)
+            response = static.serve(request, archivefile, document_root=by_ts[slug].snapshot_dir, show_indexes=True)
             response["Link"] = f'<{by_ts[slug].url}>; rel="canonical"'
             return response
         except KeyError:
diff --git a/archivebox/index/html.py b/archivebox/index/html.py
index b34d38bb..92ce988d 100644
--- a/archivebox/index/html.py
+++ b/archivebox/index/html.py
@@ -61,10 +61,10 @@ def main_index_template(snapshots: List[Model], template: str=MAIN_INDEX_TEMPLAT
     return render_django_template(template, {
         'version': VERSION,
         'git_sha': GIT_SHA,
-        'num_links': str(len(snapshots)),
+        'num_snapshots': str(len(snapshots)),
         'date_updated': datetime.now().strftime('%Y-%m-%d'),
         'time_updated': datetime.now().strftime('%Y-%m-%d %H:%M'),
-        'links': [snapshot.as_json() for snapshot in snapshots],
+        'snapshots': snapshots,
         'FOOTER_INFO': FOOTER_INFO,
     })
 
@@ -80,30 +80,30 @@ def write_html_snapshot_details(snapshot: Model, out_dir: Optional[str]=None) ->
 
 
 @enforce_types
-def link_details_template(link: Link) -> str:
+def link_details_template(snapshot: Model) -> str:
 
     from ..extractors.wget import wget_output_path
 
-    link_info = link._asdict(extended=True)
+    snapshot._asdict()
 
     return render_django_template(LINK_DETAILS_TEMPLATE, {
-        **link_info,
-        **link_info['canonical'],
+        **snapshot._asdict(),
+        **snapshot.canonical_outputs(),
         'title': htmlencode(
-            link.title
-            or (link.base_url if link.is_archived else TITLE_LOADING_MSG)
+            snapshot.title
+            or (snapshot.base_url if snapshot.is_archived else TITLE_LOADING_MSG)
         ),
-        'url_str': htmlencode(urldecode(link.base_url)),
+        'url_str': htmlencode(urldecode(snapshot.base_url)),
         'archive_url': urlencode(
-            wget_output_path(link)
-            or (link.domain if link.is_archived else '')
+            wget_output_path(snapshot)
+            or (snapshot.domain if snapshot.is_archived else '')
         ) or 'about:blank',
-        'extension': link.extension or 'html',
-        'tags': link.tags or 'untagged',
-        'size': printable_filesize(link.archive_size) if link.archive_size else 'pending',
-        'status': 'archived' if link.is_archived else 'not yet archived',
-        'status_color': 'success' if link.is_archived else 'danger',
-        'oldest_archive_date': ts_to_date(link.oldest_archive_date),
+        'extension': snapshot.extension or 'html',
+        'tags': snapshot.tags.all() or 'untagged', #TODO: Return a proper comma separated list. Leaving it like this for now to revisit when fixing tags
+        'size': printable_filesize(snapshot.archive_size) if snapshot.archive_size else 'pending',
+        'status': 'archived' if snapshot.is_archived else 'not yet archived',
+        'status_color': 'success' if snapshot.is_archived else 'danger',
+        'oldest_archive_date': ts_to_date(snapshot.oldest_archive_date),
     })
 
 @enforce_types
@@ -118,9 +118,8 @@ def snapshot_icons(snapshot) -> str:
     from core.models import EXTRACTORS
 
     archive_results = snapshot.archiveresult_set.filter(status="succeeded")
-    link = snapshot.as_link()
-    path = link.archive_path
-    canon = link.canonical_outputs()
+    path = snapshot.archive_path
+    canon = snapshot.canonical_outputs()
     output = ""
     output_template = '{} '
     icons = {