import os from datetime import datetime from typing import List, Dict, Any, Optional, Union from dataclasses import dataclass, asdict, field class ArchiveError(Exception): def __init__(self, message, hints=None): super().__init__(message) self.hints = hints LinkDict = Dict[str, Any] @dataclass(frozen=True) class ArchiveResult: cmd: List[str] pwd: Optional[str] cmd_version: Optional[str] output: Union[str, Exception, None] status: str start_ts: datetime end_ts: datetime def _asdict(self): return asdict(self) @property def duration(self) -> int: return (self.end_ts - self.start_ts).seconds @dataclass(frozen=True) class Link: timestamp: str url: str title: Optional[str] tags: Optional[str] sources: List[str] history: Dict[str, List[ArchiveResult]] = field(default_factory=lambda: {}) updated: Optional[str] = None def __hash__(self): return self.urlhash def __eq__(self, other): if not isinstance(other, Link): return NotImplemented return self.urlhash == other.urlhash def __gt__(self, other): if not isinstance(other, Link): return NotImplemented if not self.timestamp or not other.timestamp: return return float(self.timestamp) > float(other.timestamp) def _asdict(self, extended=False): info = { 'url': self.url, 'title': self.title or None, 'timestamp': self.timestamp, 'updated': self.updated or None, 'tags': self.tags or None, 'sources': self.sources or [], 'history': self.history or {}, } if extended: info.update({ 'link_dir': self.link_dir, 'archive_path': self.archive_path, 'bookmarked_date': self.bookmarked_date, 'updated_date': self.updated_date, 'domain': self.domain, 'path': self.path, 'basename': self.basename, 'extension': self.extension, 'base_url': self.base_url, 'is_static': self.is_static, 'is_archived': self.is_archived, 'num_outputs': self.num_outputs, }) return info @property def link_dir(self) -> str: from config import ARCHIVE_DIR return os.path.join(ARCHIVE_DIR, self.timestamp) @property def archive_path(self) -> str: from config import ARCHIVE_DIR_NAME return '{}/{}'.format(ARCHIVE_DIR_NAME, self.timestamp) ### URL Helpers @property def urlhash(self): from util import hashurl return hashurl(self.url) @property def extension(self) -> str: from util import extension return extension(self.url) @property def domain(self) -> str: from util import domain return domain(self.url) @property def path(self) -> str: from util import path return path(self.url) @property def basename(self) -> str: from util import basename return basename(self.url) @property def base_url(self) -> str: from util import base_url return base_url(self.url) ### Pretty Printing Helpers @property def bookmarked_date(self) -> Optional[str]: from util import ts_to_date return ts_to_date(self.timestamp) if self.timestamp else None @property def updated_date(self) -> Optional[str]: from util import ts_to_date return ts_to_date(self.updated) if self.updated else None ### Archive Status Helpers @property def num_outputs(self) -> int: return len(tuple(filter(None, self.latest_outputs().values()))) @property def is_static(self) -> bool: from util import is_static_file return is_static_file(self.url) @property def is_archived(self) -> bool: from config import ARCHIVE_DIR from util import domain return os.path.exists(os.path.join( ARCHIVE_DIR, self.timestamp, domain(self.url), )) def latest_outputs(self, status: str=None) -> Dict[str, Optional[str]]: """get the latest output that each archive method produced for link""" latest = { 'title': None, 'favicon': None, 'wget': None, 'warc': None, 'pdf': None, 'screenshot': None, 'dom': None, 'git': None, 'media': None, 'archive_org': None, } for archive_method in latest.keys(): # get most recent succesful result in history for each archive method history = self.history.get(archive_method) or [] history = filter(lambda result: result.output, reversed(history)) if status is not None: history = filter(lambda result: result.status == status, history) history = list(history) if history: latest[archive_method] = history[0].output return latest def canonical_outputs(self) -> Dict[str, Optional[str]]: from util import wget_output_path canonical = { 'index_url': 'index.html', 'favicon_url': 'favicon.ico', 'google_favicon_url': 'https://www.google.com/s2/favicons?domain={}'.format(self.domain), 'archive_url': wget_output_path(self), 'warc_url': 'warc', 'pdf_url': 'output.pdf', 'screenshot_url': 'screenshot.png', 'dom_url': 'output.html', 'archive_org_url': 'https://web.archive.org/web/{}'.format(self.base_url), 'git_url': 'git', 'media_url': 'media', } if self.is_static: # static binary files like PDF and images are handled slightly differently. # they're just downloaded once and aren't archived separately multiple times, # so the wget, screenshot, & pdf urls should all point to the same file static_url = wget_output_path(self) canonical.update({ 'title': self.basename, 'archive_url': static_url, 'pdf_url': static_url, 'screenshot_url': static_url, 'dom_url': static_url, }) return canonical @dataclass(frozen=True) class ArchiveIndex: info: str version: str source: str docs: str num_links: int updated: str links: List[Link] def _asdict(self): return asdict(self) @dataclass class RuntimeStats: skipped: int succeeded: int failed: int parse_start_ts: datetime parse_end_ts: datetime index_start_ts: datetime index_end_ts: datetime archiving_start_ts: datetime archiving_end_ts: datetime