ArchiveBox/archivebox/schema.py

import os

from datetime import datetime

from typing import List, Dict, Any, Optional, Union

from dataclasses import dataclass, asdict, field


class ArchiveError(Exception):
    def __init__(self, message, hints=None):
        super().__init__(message)
        self.hints = hints

LinkDict = Dict[str, Any]

@dataclass(frozen=True)
class ArchiveResult:
    cmd: List[str]
    pwd: Optional[str]
    cmd_version: Optional[str]
    output: Union[str, Exception, None]
    status: str
    start_ts: datetime
    end_ts: datetime
    schema: str = 'ArchiveResult'

    def __post_init__(self):
        assert self.schema == self.__class__.__name__

    def _asdict(self):
        return asdict(self)

    @property
    def duration(self) -> int:
        return (self.end_ts - self.start_ts).seconds

@dataclass(frozen=True)
class Link:
    timestamp: str
    url: str
    title: Optional[str]
    tags: Optional[str]
    sources: List[str]
    history: Dict[str, List[ArchiveResult]] = field(default_factory=lambda: {})
    updated: Optional[datetime] = None
    schema: str = 'Link'

    def __post_init__(self):
        """fix any history result items to be type-checked ArchiveResults"""
        assert self.schema == self.__class__.__name__
        cast_history = {}
        for method, method_history in self.history.items():
            cast_history[method] = []
            for result in method_history:
                if isinstance(result, dict):
                    result = ArchiveResult(**result)
                cast_history[method].append(result)

        object.__setattr__(self, 'history', cast_history)

    def overwrite(self, **kwargs):
        """pure functional version of dict.update that returns a new instance"""
        return Link(**{**self._asdict(), **kwargs})

    def __eq__(self, other):
        if not isinstance(other, Link):
            return NotImplemented
        return self.url == other.url

    def __gt__(self, other):
        if not isinstance(other, Link):
            return NotImplemented
        if not self.timestamp or not other.timestamp:
            return 
        return float(self.timestamp) > float(other.timestamp)
    
    def _asdict(self, extended=False):
        info = {
            'schema': 'Link',
            'url': self.url,
            'title': self.title or None,
            'timestamp': self.timestamp,
            'updated': self.updated or None,
            'tags': self.tags or None,
            'sources': self.sources or [],
            'history': self.history or {},
        }
        if extended:
            info.update({
                'link_dir': self.link_dir,
                'archive_path': self.archive_path,
                'bookmarked_date': self.bookmarked_date,
                'updated_date': self.updated_date,
                'domain': self.domain,
                'path': self.path,
                'basename': self.basename,
                'extension': self.extension,
                'base_url': self.base_url,
                'is_static': self.is_static,
                'is_archived': self.is_archived,
                'num_outputs': self.num_outputs,
                'num_failures': self.num_failures,
                'oldest_archive_date': self.oldest_archive_date,
                'newest_archive_date': self.newest_archive_date,
            })
        return info

    @property
    def link_dir(self) -> str:
        from .config import ARCHIVE_DIR
        return os.path.join(ARCHIVE_DIR, self.timestamp)

    @property
    def archive_path(self) -> str:
        from .config import ARCHIVE_DIR_NAME
        return '{}/{}'.format(ARCHIVE_DIR_NAME, self.timestamp)
    
    ### URL Helpers
    @property
    def urlhash(self):
        from .util import hashurl

        return hashurl(self.url)

    @property
    def extension(self) -> str:
        from .util import extension
        return extension(self.url)

    @property
    def domain(self) -> str:
        from .util import domain
        return domain(self.url)

    @property
    def path(self) -> str:
        from .util import path
        return path(self.url)

    @property
    def basename(self) -> str:
        from .util import basename
        return basename(self.url)

    @property
    def base_url(self) -> str:
        from .util import base_url
        return base_url(self.url)

    ### Pretty Printing Helpers
    @property
    def bookmarked_date(self) -> Optional[str]:
        from .util import ts_to_date
        return ts_to_date(self.timestamp) if self.timestamp else None

    @property
    def updated_date(self) -> Optional[str]:
        from .util import ts_to_date
        return ts_to_date(self.updated) if self.updated else None

    @property
    def oldest_archive_date(self) -> Optional[datetime]:
        from .util import ts_to_date

        most_recent = min(
            (ts_to_date(result.start_ts)
             for method in self.history.keys()
                for result in self.history[method]),
            default=None,
        )
        return ts_to_date(most_recent) if most_recent else None

    @property
    def newest_archive_date(self) -> Optional[datetime]:
        from .util import ts_to_date

        most_recent = max(
            (ts_to_date(result.start_ts)
             for method in self.history.keys()
                for result in self.history[method]),
            default=None,
        )
        return ts_to_date(most_recent) if most_recent else None

    ### Archive Status Helpers
    @property
    def num_outputs(self) -> int:
        return len(tuple(filter(None, self.latest_outputs().values())))

    @property
    def num_failures(self) -> int:
        return sum(1
                   for method in self.history.keys()
                       for result in self.history[method]
                            if result.status == 'failed')

    @property
    def is_static(self) -> bool:
        from .util import is_static_file
        return is_static_file(self.url)

    @property
    def is_archived(self) -> bool:
        from .config import ARCHIVE_DIR
        from .util import domain

        return os.path.exists(os.path.join(
            ARCHIVE_DIR,
            self.timestamp,
            domain(self.url),
        ))

    def latest_outputs(self, status: str=None) -> Dict[str, Optional[str]]:
        """get the latest output that each archive method produced for link"""
        
        latest = {
            'title': None,
            'favicon': None,
            'wget': None,
            'warc': None,
            'pdf': None,
            'screenshot': None,
            'dom': None,
            'git': None,
            'media': None,
            'archive_org': None,
        }
        for archive_method in latest.keys():
            # get most recent succesful result in history for each archive method
            history = self.history.get(archive_method) or []
            history = filter(lambda result: result.output, reversed(history))
            if status is not None:
                history = filter(lambda result: result.status == status, history)

            history = list(history)
            if history:
                latest[archive_method] = history[0].output

        return latest

    def canonical_outputs(self) -> Dict[str, Optional[str]]:
        from .util import wget_output_path
        canonical = {
            'index_url': 'index.html',
            'favicon_url': 'favicon.ico',
            'google_favicon_url': 'https://www.google.com/s2/favicons?domain={}'.format(self.domain),
            'archive_url': wget_output_path(self),
            'warc_url': 'warc',
            'pdf_url': 'output.pdf',
            'screenshot_url': 'screenshot.png',
            'dom_url': 'output.html',
            'archive_org_url': 'https://web.archive.org/web/{}'.format(self.base_url),
            'git_url': 'git',
            'media_url': 'media',
        }
        if self.is_static:
            # static binary files like PDF and images are handled slightly differently.
            # they're just downloaded once and aren't archived separately multiple times, 
            # so the wget, screenshot, & pdf urls should all point to the same file

            static_url = wget_output_path(self)
            canonical.update({
                'title': self.basename,
                'archive_url': static_url,
                'pdf_url': static_url,
                'screenshot_url': static_url,
                'dom_url': static_url,
            })
        return canonical
switch to dataclasses, working Link type hints everywhere 2019-03-27 12:21:34 +13:00			`import os`

switch to strict type hints with NamedTuples instead of dicts 2019-03-26 22:33:34 +13:00			`from datetime import datetime`

switch to dataclasses, working Link type hints everywhere 2019-03-27 12:21:34 +13:00			`from typing import List, Dict, Any, Optional, Union`
switch to strict type hints with NamedTuples instead of dicts 2019-03-26 22:33:34 +13:00
switch to dataclasses, working Link type hints everywhere 2019-03-27 12:21:34 +13:00			`from dataclasses import dataclass, asdict, field`
switch to strict type hints with NamedTuples instead of dicts 2019-03-26 22:33:34 +13:00

switch to dataclasses, working Link type hints everywhere 2019-03-27 12:21:34 +13:00			`class ArchiveError(Exception):`
			`def __init__(self, message, hints=None):`
			`super().__init__(message)`
			`self.hints = hints`

			`LinkDict = Dict[str, Any]`

			`@dataclass(frozen=True)`
			`class ArchiveResult:`
switch to strict type hints with NamedTuples instead of dicts 2019-03-26 22:33:34 +13:00			`cmd: List[str]`
			`pwd: Optional[str]`
			`cmd_version: Optional[str]`
			`output: Union[str, Exception, None]`
			`status: str`
			`start_ts: datetime`
			`end_ts: datetime`
full type-hinting coverage 2019-03-27 16:25:07 +13:00			`schema: str = 'ArchiveResult'`

			`def __post_init__(self):`
			`assert self.schema == self.__class__.__name__`
switch to strict type hints with NamedTuples instead of dicts 2019-03-26 22:33:34 +13:00
switch to dataclasses, working Link type hints everywhere 2019-03-27 12:21:34 +13:00			`def _asdict(self):`
			`return asdict(self)`
switch to strict type hints with NamedTuples instead of dicts 2019-03-26 22:33:34 +13:00
switch to dataclasses, working Link type hints everywhere 2019-03-27 12:21:34 +13:00			`@property`
			`def duration(self) -> int:`
			`return (self.end_ts - self.start_ts).seconds`
switch to strict type hints with NamedTuples instead of dicts 2019-03-26 22:33:34 +13:00
switch to dataclasses, working Link type hints everywhere 2019-03-27 12:21:34 +13:00			`@dataclass(frozen=True)`
			`class Link:`
switch to strict type hints with NamedTuples instead of dicts 2019-03-26 22:33:34 +13:00			`timestamp: str`
			`url: str`
			`title: Optional[str]`
switch to dataclasses, working Link type hints everywhere 2019-03-27 12:21:34 +13:00			`tags: Optional[str]`
switch to strict type hints with NamedTuples instead of dicts 2019-03-26 22:33:34 +13:00			`sources: List[str]`
switch to dataclasses, working Link type hints everywhere 2019-03-27 12:21:34 +13:00			`history: Dict[str, List[ArchiveResult]] = field(default_factory=lambda: {})`
working runtime type casting and enforcement for a wide range of types 2019-03-27 15:26:21 +13:00			`updated: Optional[datetime] = None`
full type-hinting coverage 2019-03-27 16:25:07 +13:00			`schema: str = 'Link'`
switch to dataclasses, working Link type hints everywhere 2019-03-27 12:21:34 +13:00
working runtime type casting and enforcement for a wide range of types 2019-03-27 15:26:21 +13:00			`def __post_init__(self):`
			`"""fix any history result items to be type-checked ArchiveResults"""`
full type-hinting coverage 2019-03-27 16:25:07 +13:00			`assert self.schema == self.__class__.__name__`
working runtime type casting and enforcement for a wide range of types 2019-03-27 15:26:21 +13:00			`cast_history = {}`
			`for method, method_history in self.history.items():`
			`cast_history[method] = []`
			`for result in method_history:`
			`if isinstance(result, dict):`
			`result = ArchiveResult(**result)`
			`cast_history[method].append(result)`

			`object.__setattr__(self, 'history', cast_history)`
switch to dataclasses, working Link type hints everywhere 2019-03-27 12:21:34 +13:00
new details page design 2019-03-27 20:49:39 +13:00			`def overwrite(self, **kwargs):`
			`"""pure functional version of dict.update that returns a new instance"""`
			`return Link({self._asdict(), **kwargs})`

switch to dataclasses, working Link type hints everywhere 2019-03-27 12:21:34 +13:00			`def __eq__(self, other):`
			`if not isinstance(other, Link):`
			`return NotImplemented`
working runtime type casting and enforcement for a wide range of types 2019-03-27 15:26:21 +13:00			`return self.url == other.url`
switch to dataclasses, working Link type hints everywhere 2019-03-27 12:21:34 +13:00
			`def __gt__(self, other):`
			`if not isinstance(other, Link):`
			`return NotImplemented`
			`if not self.timestamp or not other.timestamp:`
			`return`
			`return float(self.timestamp) > float(other.timestamp)`

			`def _asdict(self, extended=False):`
			`info = {`
full type-hinting coverage 2019-03-27 16:25:07 +13:00			`'schema': 'Link',`
switch to dataclasses, working Link type hints everywhere 2019-03-27 12:21:34 +13:00			`'url': self.url,`
			`'title': self.title or None,`
			`'timestamp': self.timestamp,`
			`'updated': self.updated or None,`
			`'tags': self.tags or None,`
			`'sources': self.sources or [],`
			`'history': self.history or {},`
			`}`
			`if extended:`
			`info.update({`
			`'link_dir': self.link_dir,`
			`'archive_path': self.archive_path,`
			`'bookmarked_date': self.bookmarked_date,`
			`'updated_date': self.updated_date,`
			`'domain': self.domain,`
			`'path': self.path,`
			`'basename': self.basename,`
			`'extension': self.extension,`
			`'base_url': self.base_url,`
			`'is_static': self.is_static,`
			`'is_archived': self.is_archived,`
			`'num_outputs': self.num_outputs,`
new details page design 2019-03-27 20:49:39 +13:00			`'num_failures': self.num_failures,`
			`'oldest_archive_date': self.oldest_archive_date,`
			`'newest_archive_date': self.newest_archive_date,`
switch to dataclasses, working Link type hints everywhere 2019-03-27 12:21:34 +13:00			`})`
			`return info`

			`@property`
			`def link_dir(self) -> str:`
new version handling and absolute imports 2019-03-28 08:35:13 +13:00			`from .config import ARCHIVE_DIR`
switch to dataclasses, working Link type hints everywhere 2019-03-27 12:21:34 +13:00			`return os.path.join(ARCHIVE_DIR, self.timestamp)`

			`@property`
			`def archive_path(self) -> str:`
new version handling and absolute imports 2019-03-28 08:35:13 +13:00			`from .config import ARCHIVE_DIR_NAME`
switch to dataclasses, working Link type hints everywhere 2019-03-27 12:21:34 +13:00			`return '{}/{}'.format(ARCHIVE_DIR_NAME, self.timestamp)`

			`### URL Helpers`
			`@property`
			`def urlhash(self):`
new version handling and absolute imports 2019-03-28 08:35:13 +13:00			`from .util import hashurl`
switch to dataclasses, working Link type hints everywhere 2019-03-27 12:21:34 +13:00
			`return hashurl(self.url)`

			`@property`
			`def extension(self) -> str:`
new version handling and absolute imports 2019-03-28 08:35:13 +13:00			`from .util import extension`
switch to dataclasses, working Link type hints everywhere 2019-03-27 12:21:34 +13:00			`return extension(self.url)`

			`@property`
			`def domain(self) -> str:`
new version handling and absolute imports 2019-03-28 08:35:13 +13:00			`from .util import domain`
switch to dataclasses, working Link type hints everywhere 2019-03-27 12:21:34 +13:00			`return domain(self.url)`

			`@property`
			`def path(self) -> str:`
new version handling and absolute imports 2019-03-28 08:35:13 +13:00			`from .util import path`
switch to dataclasses, working Link type hints everywhere 2019-03-27 12:21:34 +13:00			`return path(self.url)`

			`@property`
			`def basename(self) -> str:`
new version handling and absolute imports 2019-03-28 08:35:13 +13:00			`from .util import basename`
switch to dataclasses, working Link type hints everywhere 2019-03-27 12:21:34 +13:00			`return basename(self.url)`

			`@property`
			`def base_url(self) -> str:`
new version handling and absolute imports 2019-03-28 08:35:13 +13:00			`from .util import base_url`
switch to dataclasses, working Link type hints everywhere 2019-03-27 12:21:34 +13:00			`return base_url(self.url)`

			`### Pretty Printing Helpers`
			`@property`
			`def bookmarked_date(self) -> Optional[str]:`
new version handling and absolute imports 2019-03-28 08:35:13 +13:00			`from .util import ts_to_date`
switch to dataclasses, working Link type hints everywhere 2019-03-27 12:21:34 +13:00			`return ts_to_date(self.timestamp) if self.timestamp else None`

			`@property`
			`def updated_date(self) -> Optional[str]:`
new version handling and absolute imports 2019-03-28 08:35:13 +13:00			`from .util import ts_to_date`
switch to dataclasses, working Link type hints everywhere 2019-03-27 12:21:34 +13:00			`return ts_to_date(self.updated) if self.updated else None`

new details page design 2019-03-27 20:49:39 +13:00			`@property`
			`def oldest_archive_date(self) -> Optional[datetime]:`
new version handling and absolute imports 2019-03-28 08:35:13 +13:00			`from .util import ts_to_date`
new details page design 2019-03-27 20:49:39 +13:00
			`most_recent = min(`
support loading link indexes with extra keys 2019-03-28 03:31:07 +13:00			`(ts_to_date(result.start_ts)`
new details page design 2019-03-27 20:49:39 +13:00			`for method in self.history.keys()`
			`for result in self.history[method]),`
			`default=None,`
			`)`
			`return ts_to_date(most_recent) if most_recent else None`

			`@property`
			`def newest_archive_date(self) -> Optional[datetime]:`
new version handling and absolute imports 2019-03-28 08:35:13 +13:00			`from .util import ts_to_date`
new details page design 2019-03-27 20:49:39 +13:00
			`most_recent = max(`
support loading link indexes with extra keys 2019-03-28 03:31:07 +13:00			`(ts_to_date(result.start_ts)`
new details page design 2019-03-27 20:49:39 +13:00			`for method in self.history.keys()`
			`for result in self.history[method]),`
			`default=None,`
			`)`
			`return ts_to_date(most_recent) if most_recent else None`

switch to dataclasses, working Link type hints everywhere 2019-03-27 12:21:34 +13:00			`### Archive Status Helpers`
			`@property`
			`def num_outputs(self) -> int:`
			`return len(tuple(filter(None, self.latest_outputs().values())))`

new details page design 2019-03-27 20:49:39 +13:00			`@property`
			`def num_failures(self) -> int:`
			`return sum(1`
			`for method in self.history.keys()`
			`for result in self.history[method]`
			`if result.status == 'failed')`

switch to dataclasses, working Link type hints everywhere 2019-03-27 12:21:34 +13:00			`@property`
			`def is_static(self) -> bool:`
new version handling and absolute imports 2019-03-28 08:35:13 +13:00			`from .util import is_static_file`
switch to dataclasses, working Link type hints everywhere 2019-03-27 12:21:34 +13:00			`return is_static_file(self.url)`

			`@property`
			`def is_archived(self) -> bool:`
new version handling and absolute imports 2019-03-28 08:35:13 +13:00			`from .config import ARCHIVE_DIR`
			`from .util import domain`
switch to dataclasses, working Link type hints everywhere 2019-03-27 12:21:34 +13:00
			`return os.path.exists(os.path.join(`
			`ARCHIVE_DIR,`
			`self.timestamp,`
			`domain(self.url),`
			`))`

			`def latest_outputs(self, status: str=None) -> Dict[str, Optional[str]]:`
			`"""get the latest output that each archive method produced for link"""`

			`latest = {`
			`'title': None,`
			`'favicon': None,`
			`'wget': None,`
			`'warc': None,`
			`'pdf': None,`
			`'screenshot': None,`
			`'dom': None,`
			`'git': None,`
			`'media': None,`
			`'archive_org': None,`
			`}`
			`for archive_method in latest.keys():`
			`# get most recent succesful result in history for each archive method`
			`history = self.history.get(archive_method) or []`
			`history = filter(lambda result: result.output, reversed(history))`
			`if status is not None:`
			`history = filter(lambda result: result.status == status, history)`

			`history = list(history)`
			`if history:`
			`latest[archive_method] = history[0].output`

			`return latest`

			`def canonical_outputs(self) -> Dict[str, Optional[str]]:`
new version handling and absolute imports 2019-03-28 08:35:13 +13:00			`from .util import wget_output_path`
switch to dataclasses, working Link type hints everywhere 2019-03-27 12:21:34 +13:00			`canonical = {`
			`'index_url': 'index.html',`
			`'favicon_url': 'favicon.ico',`
			`'google_favicon_url': 'https://www.google.com/s2/favicons?domain={}'.format(self.domain),`
			`'archive_url': wget_output_path(self),`
			`'warc_url': 'warc',`
			`'pdf_url': 'output.pdf',`
			`'screenshot_url': 'screenshot.png',`
			`'dom_url': 'output.html',`
			`'archive_org_url': 'https://web.archive.org/web/{}'.format(self.base_url),`
			`'git_url': 'git',`
			`'media_url': 'media',`
			`}`
			`if self.is_static:`
			`# static binary files like PDF and images are handled slightly differently.`
			`# they're just downloaded once and aren't archived separately multiple times,`
			`# so the wget, screenshot, & pdf urls should all point to the same file`

			`static_url = wget_output_path(self)`
			`canonical.update({`
			`'title': self.basename,`
			`'archive_url': static_url,`
			`'pdf_url': static_url,`
			`'screenshot_url': static_url,`
			`'dom_url': static_url,`
			`})`
			`return canonical`