# ArchiveBox/archivebox/schema.py

import os

from datetime import datetime

from typing import List, Dict, Any, Optional, Union

from dataclasses import dataclass, asdict, field


class ArchiveError(Exception):
    """Exception that carries a message plus optional ``hints``.

    ``message`` is forwarded to ``Exception`` (so it is what ``str(err)``
    shows); ``hints`` is stored unmodified on the instance and defaults to
    ``None``.
    """

    def __init__(self, message, hints=None):
        # Keep the extra context on the instance, then let Exception record
        # the message itself.
        self.hints = hints
        super().__init__(message)

# Loosely-typed mapping with string keys and arbitrary values.
# NOTE(review): presumably the plain-dict form of a Link (e.g. what
# Link._asdict() produces) -- confirm against callers.
LinkDict = Dict[str, Any]

@dataclass(frozen=True)
class ArchiveResult:
    """Immutable record of a single run of one archive method.

    Instances are stored per method name in ``Link.history``.
    """

    # Command line as a list of arguments.
    cmd: List[str]
    # Working directory associated with the command, if any.
    pwd: Optional[str]
    # Version string associated with the command, if any.
    cmd_version: Optional[str]
    # Either a string, the exception that was raised, or None.
    # ``Link.latest_outputs`` skips results whose output is falsy.
    output: Union[str, Exception, None]
    # Outcome label; ``Link.latest_outputs(status=...)`` filters on it.
    status: str
    start_ts: datetime
    end_ts: datetime

    def _asdict(self):
        """Return the fields as a plain dict (via ``dataclasses.asdict``)."""
        return asdict(self)

    @property
    def duration(self) -> int:
        """Whole seconds elapsed between ``start_ts`` and ``end_ts``.

        Uses ``total_seconds()`` rather than ``.seconds``: the latter is only
        the seconds *component* of the timedelta and silently drops whole
        days (and wraps around for negative intervals).
        """
        elapsed = self.end_ts - self.start_ts
        return int(elapsed.total_seconds())

@dataclass(frozen=True)
class Link:
    """A single bookmarked URL together with its archiving history.

    Identity (``==`` and ``hash()``) is derived from ``urlhash`` only, and
    ordering (``>``) from the numeric value of ``timestamp``.

    Helpers from ``util`` and values from ``config`` are imported inside the
    methods that use them instead of at module level.
    NOTE(review): presumably to avoid a circular import -- confirm.
    """

    # Parsed with float() for ordering; also the folder name under ARCHIVE_DIR.
    timestamp: str
    url: str
    title: Optional[str]
    tags: Optional[str]
    sources: List[str]
    # Results keyed by archive method name ('wget', 'pdf', ...).
    # ``latest_outputs`` treats the last entry of each list as the most recent.
    history: Dict[str, List[ArchiveResult]] = field(default_factory=dict)
    # Passed to ``ts_to_date`` by ``updated_date`` when set.
    updated: Optional[str] = None

    def __hash__(self) -> int:
        # __hash__ has to return an int. Going through hash() keeps that true
        # whatever type hashurl() returns, and equal urlhash values still
        # produce equal hashes, so the __eq__/__hash__ contract is preserved.
        return hash(self.urlhash)

    def __eq__(self, other):
        """Two links are equal when their ``urlhash`` values are equal."""
        if not isinstance(other, Link):
            return NotImplemented
        return self.urlhash == other.urlhash

    def __gt__(self, other):
        """Order links by ``float(timestamp)``.

        Returns ``NotImplemented`` for non-Link operands and ``False`` when
        either link has an empty timestamp (such a pair cannot be ordered).
        """
        if not isinstance(other, Link):
            return NotImplemented
        if not self.timestamp or not other.timestamp:
            # Answer with an explicit bool rather than falling through to None.
            return False
        return float(self.timestamp) > float(other.timestamp)

    def _asdict(self, extended=False):
        """Return the link as a plain dict.

        With ``extended=True`` the derived properties (paths, URL parts,
        dates, archive status) are included as well. ``history`` is passed
        through as-is, i.e. it still contains the stored result objects.
        """
        info = {
            'url': self.url,
            'title': self.title or None,
            'timestamp': self.timestamp,
            'updated': self.updated or None,
            'tags': self.tags or None,
            'sources': self.sources or [],
            'history': self.history or {},
        }
        if extended:
            info.update({
                'link_dir': self.link_dir,
                'archive_path': self.archive_path,
                'bookmarked_date': self.bookmarked_date,
                'updated_date': self.updated_date,
                'domain': self.domain,
                'path': self.path,
                'basename': self.basename,
                'extension': self.extension,
                'base_url': self.base_url,
                'is_static': self.is_static,
                'is_archived': self.is_archived,
                'num_outputs': self.num_outputs,
            })
        return info

    @property
    def link_dir(self) -> str:
        """Filesystem path of this link's folder: ``ARCHIVE_DIR/<timestamp>``."""
        from config import ARCHIVE_DIR
        return os.path.join(ARCHIVE_DIR, self.timestamp)

    @property
    def archive_path(self) -> str:
        """``'<ARCHIVE_DIR_NAME>/<timestamp>'``, always joined with a forward slash."""
        from config import ARCHIVE_DIR_NAME
        return '{}/{}'.format(ARCHIVE_DIR_NAME, self.timestamp)

    ### URL Helpers
    @property
    def urlhash(self):
        """Result of ``util.hashurl`` for this link's URL (basis of ``==``/``hash``)."""
        from util import hashurl

        return hashurl(self.url)

    @property
    def extension(self) -> str:
        """Result of ``util.extension`` for this link's URL."""
        from util import extension
        return extension(self.url)

    @property
    def domain(self) -> str:
        """Result of ``util.domain`` for this link's URL."""
        from util import domain
        return domain(self.url)

    @property
    def path(self) -> str:
        """Result of ``util.path`` for this link's URL."""
        from util import path
        return path(self.url)

    @property
    def basename(self) -> str:
        """Result of ``util.basename`` for this link's URL."""
        from util import basename
        return basename(self.url)

    @property
    def base_url(self) -> str:
        """Result of ``util.base_url`` for this link's URL."""
        from util import base_url
        return base_url(self.url)

    ### Pretty Printing Helpers
    @property
    def bookmarked_date(self) -> Optional[str]:
        """``ts_to_date(timestamp)``, or ``None`` when the timestamp is empty."""
        from util import ts_to_date
        return ts_to_date(self.timestamp) if self.timestamp else None

    @property
    def updated_date(self) -> Optional[str]:
        """``ts_to_date(updated)``, or ``None`` when ``updated`` is unset."""
        from util import ts_to_date
        return ts_to_date(self.updated) if self.updated else None

    ### Archive Status Helpers
    @property
    def num_outputs(self) -> int:
        """Number of archive methods that have a truthy latest output."""
        return sum(1 for output in self.latest_outputs().values() if output)

    @property
    def is_static(self) -> bool:
        """Result of ``util.is_static_file`` for this link's URL."""
        from util import is_static_file
        return is_static_file(self.url)

    @property
    def is_archived(self) -> bool:
        """Whether ``ARCHIVE_DIR/<timestamp>/<domain>`` exists on disk."""
        return os.path.exists(os.path.join(self.link_dir, self.domain))

    def latest_outputs(self, status: Optional[str] = None) -> Dict[str, Optional[str]]:
        """Get the latest output that each archive method produced for link.

        For every known method the history is scanned newest-first and the
        first result with a truthy ``output`` wins. When ``status`` is given,
        only results whose ``status`` equals it are considered. Methods with
        no matching result map to ``None``.
        """
        latest = {
            'title': None,
            'favicon': None,
            'wget': None,
            'warc': None,
            'pdf': None,
            'screenshot': None,
            'dom': None,
            'git': None,
            'media': None,
            'archive_org': None,
        }
        for archive_method in latest:
            # walk from the most recent entry backwards and stop at the first hit
            for result in reversed(self.history.get(archive_method) or []):
                if not result.output:
                    continue
                if status is not None and result.status != status:
                    continue
                latest[archive_method] = result.output
                break

        return latest

    def canonical_outputs(self) -> Dict[str, Optional[str]]:
        """Map each output kind to its canonical file name or URL.

        The relative entries ('index.html', 'output.pdf', ...) are bare file
        and folder names; the favicon and archive.org entries are absolute
        URLs built from ``domain`` and ``base_url``.
        """
        from util import wget_output_path

        # computed once and reused for the static-file overrides below
        wget_path = wget_output_path(self)
        canonical = {
            'index_url': 'index.html',
            'favicon_url': 'favicon.ico',
            'google_favicon_url': 'https://www.google.com/s2/favicons?domain={}'.format(self.domain),
            'archive_url': wget_path,
            'warc_url': 'warc',
            'pdf_url': 'output.pdf',
            'screenshot_url': 'screenshot.png',
            'dom_url': 'output.html',
            'archive_org_url': 'https://web.archive.org/web/{}'.format(self.base_url),
            'git_url': 'git',
            'media_url': 'media',
        }
        if self.is_static:
            # static binary files like PDF and images are handled slightly differently.
            # they're just downloaded once and aren't archived separately multiple times,
            # so the wget, screenshot, & pdf urls should all point to the same file
            canonical.update({
                'title': self.basename,
                'archive_url': wget_path,
                'pdf_url': wget_path,
                'screenshot_url': wget_path,
                'dom_url': wget_path,
            })
        return canonical


@dataclass(frozen=True)
class ArchiveIndex:
    """Immutable top-level index record: header metadata plus the list of links.

    NOTE(review): the exact meaning of the string header fields is not visible
    in this module -- confirm against the index writer before relying on them.
    """
    info: str
    version: str
    source: str
    docs: str
    # NOTE(review): presumably len(links); nothing here enforces that.
    num_links: int
    updated: str
    links: List[Link]

    def _asdict(self):
        """Return the fields as a plain dict via ``dataclasses.asdict``.

        ``asdict`` recurses into nested dataclasses, so each ``Link`` is
        converted field-by-field (not through ``Link._asdict``).
        """
        return asdict(self)

@dataclass
class RuntimeStats:
    """Mutable (non-frozen) container of counters and phase timestamps.

    NOTE(review): judging by the names, this tracks one run's per-link outcome
    counts and the start/end time of the parse, index and archiving phases --
    confirm against the code that fills it in.
    """
    # Outcome counters.
    skipped: int
    succeeded: int
    failed: int

    # Start/end of the parse phase.
    parse_start_ts: datetime
    parse_end_ts: datetime

    # Start/end of the index phase.
    index_start_ts: datetime
    index_end_ts: datetime

    # Start/end of the archiving phase.
    archiving_start_ts: datetime
    archiving_end_ts: datetime
switch to dataclasses, working Link type hints everywhere 2019-03-27 12:21:34 +13:00			`import os`

switch to strict type hints with NamedTuples instead of dicts 2019-03-26 22:33:34 +13:00			`from datetime import datetime`

switch to dataclasses, working Link type hints everywhere 2019-03-27 12:21:34 +13:00			`from typing import List, Dict, Any, Optional, Union`
switch to strict type hints with NamedTuples instead of dicts 2019-03-26 22:33:34 +13:00
switch to dataclasses, working Link type hints everywhere 2019-03-27 12:21:34 +13:00			`from dataclasses import dataclass, asdict, field`
switch to strict type hints with NamedTuples instead of dicts 2019-03-26 22:33:34 +13:00

switch to dataclasses, working Link type hints everywhere 2019-03-27 12:21:34 +13:00			`class ArchiveError(Exception):`
			`def __init__(self, message, hints=None):`
			`super().__init__(message)`
			`self.hints = hints`

			`LinkDict = Dict[str, Any]`

			`@dataclass(frozen=True)`
			`class ArchiveResult:`
switch to strict type hints with NamedTuples instead of dicts 2019-03-26 22:33:34 +13:00			`cmd: List[str]`
			`pwd: Optional[str]`
			`cmd_version: Optional[str]`
			`output: Union[str, Exception, None]`
			`status: str`
			`start_ts: datetime`
			`end_ts: datetime`

switch to dataclasses, working Link type hints everywhere 2019-03-27 12:21:34 +13:00			`def _asdict(self):`
			`return asdict(self)`
switch to strict type hints with NamedTuples instead of dicts 2019-03-26 22:33:34 +13:00
switch to dataclasses, working Link type hints everywhere 2019-03-27 12:21:34 +13:00			`@property`
			`def duration(self) -> int:`
			`return (self.end_ts - self.start_ts).seconds`
switch to strict type hints with NamedTuples instead of dicts 2019-03-26 22:33:34 +13:00
switch to dataclasses, working Link type hints everywhere 2019-03-27 12:21:34 +13:00			`@dataclass(frozen=True)`
			`class Link:`
switch to strict type hints with NamedTuples instead of dicts 2019-03-26 22:33:34 +13:00			`timestamp: str`
			`url: str`
			`title: Optional[str]`
switch to dataclasses, working Link type hints everywhere 2019-03-27 12:21:34 +13:00			`tags: Optional[str]`
switch to strict type hints with NamedTuples instead of dicts 2019-03-26 22:33:34 +13:00			`sources: List[str]`
switch to dataclasses, working Link type hints everywhere 2019-03-27 12:21:34 +13:00			`history: Dict[str, List[ArchiveResult]] = field(default_factory=lambda: {})`
			`updated: Optional[str] = None`

			`def __hash__(self):`
			`return self.urlhash`

			`def __eq__(self, other):`
			`if not isinstance(other, Link):`
			`return NotImplemented`
			`return self.urlhash == other.urlhash`

			`def __gt__(self, other):`
			`if not isinstance(other, Link):`
			`return NotImplemented`
			`if not self.timestamp or not other.timestamp:`
			`return`
			`return float(self.timestamp) > float(other.timestamp)`

			`def _asdict(self, extended=False):`
			`info = {`
			`'url': self.url,`
			`'title': self.title or None,`
			`'timestamp': self.timestamp,`
			`'updated': self.updated or None,`
			`'tags': self.tags or None,`
			`'sources': self.sources or [],`
			`'history': self.history or {},`
			`}`
			`if extended:`
			`info.update({`
			`'link_dir': self.link_dir,`
			`'archive_path': self.archive_path,`
			`'bookmarked_date': self.bookmarked_date,`
			`'updated_date': self.updated_date,`
			`'domain': self.domain,`
			`'path': self.path,`
			`'basename': self.basename,`
			`'extension': self.extension,`
			`'base_url': self.base_url,`
			`'is_static': self.is_static,`
			`'is_archived': self.is_archived,`
			`'num_outputs': self.num_outputs,`
			`})`
			`return info`

			`@property`
			`def link_dir(self) -> str:`
			`from config import ARCHIVE_DIR`
			`return os.path.join(ARCHIVE_DIR, self.timestamp)`

			`@property`
			`def archive_path(self) -> str:`
			`from config import ARCHIVE_DIR_NAME`
			`return '{}/{}'.format(ARCHIVE_DIR_NAME, self.timestamp)`

			`### URL Helpers`
			`@property`
			`def urlhash(self):`
			`from util import hashurl`

			`return hashurl(self.url)`

			`@property`
			`def extension(self) -> str:`
			`from util import extension`
			`return extension(self.url)`

			`@property`
			`def domain(self) -> str:`
			`from util import domain`
			`return domain(self.url)`

			`@property`
			`def path(self) -> str:`
			`from util import path`
			`return path(self.url)`

			`@property`
			`def basename(self) -> str:`
			`from util import basename`
			`return basename(self.url)`

			`@property`
			`def base_url(self) -> str:`
			`from util import base_url`
			`return base_url(self.url)`

			`### Pretty Printing Helpers`
			`@property`
			`def bookmarked_date(self) -> Optional[str]:`
			`from util import ts_to_date`
			`return ts_to_date(self.timestamp) if self.timestamp else None`

			`@property`
			`def updated_date(self) -> Optional[str]:`
			`from util import ts_to_date`
			`return ts_to_date(self.updated) if self.updated else None`

			`### Archive Status Helpers`
			`@property`
			`def num_outputs(self) -> int:`
			`return len(tuple(filter(None, self.latest_outputs().values())))`

			`@property`
			`def is_static(self) -> bool:`
			`from util import is_static_file`
			`return is_static_file(self.url)`

			`@property`
			`def is_archived(self) -> bool:`
			`from config import ARCHIVE_DIR`
			`from util import domain`

			`return os.path.exists(os.path.join(`
			`ARCHIVE_DIR,`
			`self.timestamp,`
			`domain(self.url),`
			`))`

			`def latest_outputs(self, status: str=None) -> Dict[str, Optional[str]]:`
			`"""get the latest output that each archive method produced for link"""`

			`latest = {`
			`'title': None,`
			`'favicon': None,`
			`'wget': None,`
			`'warc': None,`
			`'pdf': None,`
			`'screenshot': None,`
			`'dom': None,`
			`'git': None,`
			`'media': None,`
			`'archive_org': None,`
			`}`
			`for archive_method in latest.keys():`
			`# get most recent succesful result in history for each archive method`
			`history = self.history.get(archive_method) or []`
			`history = filter(lambda result: result.output, reversed(history))`
			`if status is not None:`
			`history = filter(lambda result: result.status == status, history)`

			`history = list(history)`
			`if history:`
			`latest[archive_method] = history[0].output`

			`return latest`

			`def canonical_outputs(self) -> Dict[str, Optional[str]]:`
			`from util import wget_output_path`
			`canonical = {`
			`'index_url': 'index.html',`
			`'favicon_url': 'favicon.ico',`
			`'google_favicon_url': 'https://www.google.com/s2/favicons?domain={}'.format(self.domain),`
			`'archive_url': wget_output_path(self),`
			`'warc_url': 'warc',`
			`'pdf_url': 'output.pdf',`
			`'screenshot_url': 'screenshot.png',`
			`'dom_url': 'output.html',`
			`'archive_org_url': 'https://web.archive.org/web/{}'.format(self.base_url),`
			`'git_url': 'git',`
			`'media_url': 'media',`
			`}`
			`if self.is_static:`
			`# static binary files like PDF and images are handled slightly differently.`
			`# they're just downloaded once and aren't archived separately multiple times,`
			`# so the wget, screenshot, & pdf urls should all point to the same file`

			`static_url = wget_output_path(self)`
			`canonical.update({`
			`'title': self.basename,`
			`'archive_url': static_url,`
			`'pdf_url': static_url,`
			`'screenshot_url': static_url,`
			`'dom_url': static_url,`
			`})`
			`return canonical`


			`@dataclass(frozen=True)`
			`class ArchiveIndex:`
			`info: str`
			`version: str`
			`source: str`
			`docs: str`
			`num_links: int`
			`updated: str`
			`links: List[Link]`
switch to strict type hints with NamedTuples instead of dicts 2019-03-26 22:33:34 +13:00
switch to dataclasses, working Link type hints everywhere 2019-03-27 12:21:34 +13:00			`def _asdict(self):`
			`return asdict(self)`
switch to strict type hints with NamedTuples instead of dicts 2019-03-26 22:33:34 +13:00
switch to dataclasses, working Link type hints everywhere 2019-03-27 12:21:34 +13:00			`@dataclass`
			`class RuntimeStats:`
switch to strict type hints with NamedTuples instead of dicts 2019-03-26 22:33:34 +13:00			`skipped: int`
			`succeeded: int`
			`failed: int`

			`parse_start_ts: datetime`
			`parse_end_ts: datetime`

			`index_start_ts: datetime`
			`index_end_ts: datetime`

			`archiving_start_ts: datetime`
			`archiving_end_ts: datetime`