
better loading and saving storage mechanism

commit 9ce47431da (parent c95f893b61)
Author: Nick Sweeting
Date:   2019-04-17 02:25:28 -04:00

14 changed files with 395 additions and 238 deletions

View file

@@ -6,9 +6,9 @@ from datetime import datetime
 
 from .schema import Link, ArchiveResult, ArchiveOutput
 from .index import (
-    write_link_index,
-    patch_links_index,
-    load_json_link_index,
+    load_link_details,
+    write_link_details,
+    patch_main_index,
 )
 from .config import (
     CURL_BINARY,

View file

@@ -115,7 +115,6 @@ URL_BLACKLIST_PTN = re.compile(URL_BLACKLIST, re.IGNORECASE) if URL_BLACKLIST el
 VERSION = open(os.path.join(REPO_DIR, 'VERSION'), 'r').read().strip()
 GIT_SHA = VERSION.split('+')[-1] or 'unknown'
 
 HAS_INVALID_DEPENDENCIES = False
-HAS_INVALID_DB = not os.path.exists(os.path.join(OUTPUT_DIR, 'index.json'))
 
 ### Check system environment
 if USER == 'root':
@@ -429,13 +428,12 @@ def check_dependencies() -> None:
         raise SystemExit(1)
 
 
 def check_data_folder() -> None:
-    if HAS_INVALID_DB:
-        stderr('{red}[X] No archive data found in:{reset} {}'.format(OUTPUT_DIR, **ANSI))
+    if not os.path.exists(os.path.join(OUTPUT_DIR, 'index.json')):
+        stderr('{red}[X] No archive data was found in:{reset} {}'.format(OUTPUT_DIR, **ANSI))
         stderr('    Are you running archivebox in the right folder?')
-        stderr('        cd path/to/your/archive')
+        stderr('        cd path/to/your/archive/folder')
         stderr('        archivebox [command]')
         stderr()
-        stderr('    To create a new archive folder, run:')
-        stderr('        mkdir new_archive_dir && cd new_archive_dir')
+        stderr('    To create a new archive collection in this folder, run:')
        stderr('        archivebox init')
        raise SystemExit(1)
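Worth noting about this hunk: the old code computed HAS_INVALID_DB once at import time, so an index.json created later in the same process (for example by the new `archivebox init` command added in this commit) would still be reported as missing. The new code re-checks the path on every call. A minimal sketch of the difference, with a made-up OUTPUT_DIR for illustration:

    import os

    OUTPUT_DIR = '/tmp/archive'   # hypothetical data folder, not from this diff

    # import-time snapshot (old approach): evaluated once and never refreshed
    HAS_INVALID_DB = not os.path.exists(os.path.join(OUTPUT_DIR, 'index.json'))

    def check_data_folder_stale() -> bool:
        # keeps returning the value captured at import, even after init creates index.json
        return not HAS_INVALID_DB

    def check_data_folder_fresh() -> bool:
        # call-time check (new approach): sees folders initialized after import
        return os.path.exists(os.path.join(OUTPUT_DIR, 'index.json'))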

View file

@@ -1,33 +1,28 @@
 import os
 import json
 
-from datetime import datetime
-from string import Template
-from typing import List, Tuple, Iterator, Optional, Mapping, Iterable
+from typing import List, Tuple, Optional, Iterable
 from collections import OrderedDict
 
 from .schema import Link, ArchiveResult
 from .config import (
     OUTPUT_DIR,
-    TEMPLATES_DIR,
-    VERSION,
-    GIT_SHA,
-    FOOTER_INFO,
     TIMEOUT,
     URL_BLACKLIST_PTN,
     ANSI,
     stderr,
 )
+from .storage.html import write_html_main_index, write_html_link_details
+from .storage.json import (
+    parse_json_main_index,
+    write_json_main_index,
+    parse_json_link_details,
+    write_json_link_details,
+)
 from .util import (
     scheme,
-    ts_to_date,
-    urlencode,
-    htmlencode,
-    urldecode,
-    wget_output_path,
     enforce_types,
     TimedProgress,
-    copy_and_overwrite,
     atomic_write,
     ExtendedEncoder,
 )
@@ -40,8 +35,6 @@ from .logs import (
     log_parsing_finished,
 )
 
-TITLE_LOADING_MSG = 'Not yet archived...'
-
 
 ### Link filtering and checking
@@ -53,8 +46,10 @@ def merge_links(a: Link, b: Link) -> Link:
     """
     assert a.base_url == b.base_url, 'Cannot merge two links with different URLs'
 
+    # longest url wins (because a fuzzy url will always be shorter)
     url = a.url if len(a.url) > len(b.url) else b.url
 
+    # best title based on length and quality
     possible_titles = [
         title
         for title in (a.title, b.title)
@@ -66,20 +61,24 @@ def merge_links(a: Link, b: Link) -> Link:
     elif len(possible_titles) == 1:
         title = possible_titles[0]
 
+    # earliest valid timestamp
     timestamp = (
         a.timestamp
         if float(a.timestamp or 0) < float(b.timestamp or 0) else
         b.timestamp
     )
 
+    # all unique, truthy tags
     tags_set = (
         set(tag.strip() for tag in (a.tags or '').split(','))
         | set(tag.strip() for tag in (b.tags or '').split(','))
     )
     tags = ','.join(tags_set) or None
 
+    # all unique source entries
     sources = list(set(a.sources + b.sources))
 
+    # all unique history entries for the combined archive methods
     all_methods = set(list(a.history.keys()) + list(a.history.keys()))
     history = {
         method: (a.history.get(method) or []) + (b.history.get(method) or [])
@@ -95,7 +94,6 @@ def merge_links(a: Link, b: Link) -> Link:
             key=lambda result: result.start_ts,
         )))
 
-
     return Link(
         url=url,
         timestamp=timestamp,
@@ -105,6 +103,8 @@ def merge_links(a: Link, b: Link) -> Link:
         history=history,
     )
 
+
+@enforce_types
 def validate_links(links: Iterable[Link]) -> Iterable[Link]:
     links = archivable_links(links)  # remove chrome://, about:, mailto: etc.
     links = sorted_links(links)      # deterministically sort the links based on timstamp, url
@@ -121,6 +121,8 @@ def validate_links(links: Iterable[Link]) -> Iterable[Link]:
     return links
 
 
+@enforce_types
 def archivable_links(links: Iterable[Link]) -> Iterable[Link]:
     """remove chrome://, about:// or other schemed links that cant be archived"""
     for link in links:
@@ -130,6 +132,7 @@ def archivable_links(links: Iterable[Link]) -> Iterable[Link]:
             yield link
 
 
+@enforce_types
 def uniquefied_links(sorted_links: Iterable[Link]) -> Iterable[Link]:
     """
     ensures that all non-duplicate links have monotonically increasing timestamps
@@ -153,12 +156,14 @@ def uniquefied_links(sorted_links: Iterable[Link]) -> Iterable[Link]:
     return unique_timestamps.values()
 
 
+@enforce_types
 def sorted_links(links: Iterable[Link]) -> Iterable[Link]:
     sort_func = lambda link: (link.timestamp.split('.', 1)[0], link.url)
     return sorted(links, key=sort_func, reverse=True)
 
 
-def links_after_timestamp(links: Iterable[Link], resume: float=None) -> Iterable[Link]:
+@enforce_types
+def links_after_timestamp(links: Iterable[Link], resume: Optional[float]=None) -> Iterable[Link]:
     if not resume:
         yield from links
         return
@@ -171,6 +176,7 @@ def links_after_timestamp(links: Iterable[Link], resume: float=None) -> Iterable
             print('Resume value and all timestamp values must be valid numbers.')
 
 
+@enforce_types
 def lowest_uniq_timestamp(used_timestamps: OrderedDict, timestamp: str) -> str:
     """resolve duplicate timestamps by appending a decimal 1234, 1234 -> 1234.1, 1234.2"""
@@ -190,10 +196,10 @@ def lowest_uniq_timestamp(used_timestamps: OrderedDict, timestamp: str) -> str:
 
 
-### Homepage index for all the links
+### Main Links Index
 
 @enforce_types
-def write_links_index(links: List[Link], out_dir: str=OUTPUT_DIR, finished: bool=False) -> None:
+def write_main_index(links: List[Link], out_dir: str=OUTPUT_DIR, finished: bool=False) -> None:
     """create index.html file for a given list of links"""
 
     log_indexing_process_started()
@@ -201,7 +207,7 @@ def write_links_index(links: List[Link], out_dir: str=OUTPUT_DIR, finished: bool
     log_indexing_started(out_dir, 'index.json')
     timer = TimedProgress(TIMEOUT * 2, prefix='    ')
     try:
-        write_json_links_index(links, out_dir=out_dir)
+        write_json_main_index(links, out_dir=out_dir)
     finally:
         timer.end()
     log_indexing_finished(out_dir, 'index.json')
@@ -209,19 +215,19 @@ def write_links_index(links: List[Link], out_dir: str=OUTPUT_DIR, finished: bool
     log_indexing_started(out_dir, 'index.html')
     timer = TimedProgress(TIMEOUT * 2, prefix='    ')
     try:
-        write_html_links_index(links, out_dir=out_dir, finished=finished)
+        write_html_main_index(links, out_dir=out_dir, finished=finished)
     finally:
         timer.end()
     log_indexing_finished(out_dir, 'index.html')
 
 
 @enforce_types
-def load_links_index(out_dir: str=OUTPUT_DIR, import_path: Optional[str]=None) -> Tuple[List[Link], List[Link]]:
+def load_main_index(out_dir: str=OUTPUT_DIR, import_path: Optional[str]=None) -> Tuple[List[Link], List[Link]]:
     """parse and load existing index with any new links from import_path merged in"""
 
     existing_links: List[Link] = []
     if out_dir:
-        existing_links = list(parse_json_links_index(out_dir))
+        existing_links = list(parse_json_main_index(out_dir))
 
     new_links: List[Link] = []
     if import_path:
@@ -242,108 +248,16 @@ def load_links_index(out_dir: str=OUTPUT_DIR, import_path: Optional[str]=None) -
 
 
 @enforce_types
-def write_json_links_index(links: List[Link], out_dir: str=OUTPUT_DIR) -> None:
-    """write the json link index to a given path"""
-
-    assert isinstance(links, List), 'Links must be a list, not a generator.'
-    assert not links or isinstance(links[0].history, dict)
-    assert not links or isinstance(links[0].sources, list)
-
-    if links and links[0].history.get('title'):
-        assert isinstance(links[0].history['title'][0], ArchiveResult)
-
-    if links and links[0].sources:
-        assert isinstance(links[0].sources[0], str)
-
-    path = os.path.join(out_dir, 'index.json')
-    index_json = {
-        'info': 'ArchiveBox Index',
-        'source': 'https://github.com/pirate/ArchiveBox',
-        'docs': 'https://github.com/pirate/ArchiveBox/wiki',
-        'version': VERSION,
-        'num_links': len(links),
-        'updated': datetime.now(),
-        'links': links,
-    }
-    atomic_write(index_json, path)
-
-
-@enforce_types
-def parse_json_links_index(out_dir: str=OUTPUT_DIR) -> Iterator[Link]:
-    """parse a archive index json file and return the list of links"""
-    index_path = os.path.join(out_dir, 'index.json')
-    if os.path.exists(index_path):
-        with open(index_path, 'r', encoding='utf-8') as f:
-            links = json.load(f)['links']
-            for link_json in links:
-                yield Link.from_json(link_json)
-
-    return ()
-
-
-@enforce_types
-def write_html_links_index(links: List[Link], out_dir: str=OUTPUT_DIR, finished: bool=False) -> None:
-    """write the html link index to a given path"""
-
-    copy_and_overwrite(
-        os.path.join(TEMPLATES_DIR, 'static'),
-        os.path.join(out_dir, 'static'),
-    )
-
-    atomic_write('User-agent: *\nDisallow: /', os.path.join(out_dir, 'robots.txt'))
-
-    with open(os.path.join(TEMPLATES_DIR, 'index.html'), 'r', encoding='utf-8') as f:
-        index_html = f.read()
-
-    with open(os.path.join(TEMPLATES_DIR, 'index_row.html'), 'r', encoding='utf-8') as f:
-        link_row_html = f.read()
-
-    link_rows = []
-    for link in links:
-        template_row_vars: Mapping[str, str] = {
-            **derived_link_info(link),
-            'title': (
-                link.title
-                or (link.base_url if link.is_archived else TITLE_LOADING_MSG)
-            ),
-            'tags': (link.tags or '') + (' {}'.format(link.extension) if link.is_static else ''),
-            'favicon_url': (
-                os.path.join('archive', link.timestamp, 'favicon.ico')
-                # if link['is_archived'] else 'data:image/gif;base64,R0lGODlhAQABAAD/ACwAAAAAAQABAAACADs='
-            ),
-            'archive_url': urlencode(
-                wget_output_path(link) or 'index.html'
-            ),
-        }
-        link_rows.append(Template(link_row_html).substitute(**template_row_vars))
-
-    template_vars: Mapping[str, str] = {
-        'num_links': str(len(links)),
-        'date_updated': datetime.now().strftime('%Y-%m-%d'),
-        'time_updated': datetime.now().strftime('%Y-%m-%d %H:%M'),
-        'footer_info': FOOTER_INFO,
-        'version': VERSION,
-        'git_sha': GIT_SHA,
-        'rows': '\n'.join(link_rows),
-        'status': 'finished' if finished else 'running',
-    }
-    template_html = Template(index_html).substitute(**template_vars)
-    atomic_write(template_html, os.path.join(out_dir, 'index.html'))
-
-
-@enforce_types
-def patch_links_index(link: Link, out_dir: str=OUTPUT_DIR) -> None:
-    """hack to in-place update one row's info in the generated index html"""
-
-    title = link.title or link.latest_outputs()['title']
+def patch_main_index(link: Link, out_dir: str=OUTPUT_DIR) -> None:
+    """hack to in-place update one row's info in the generated index files"""
+
+    # TODO: remove this ASAP, it's ugly, error-prone, and potentially dangerous
+
+    title = link.title or link.latest_outputs(status='succeeded')['title']
     successful = link.num_outputs
 
-    # Patch JSON index
-    json_file_links = parse_json_links_index(out_dir)
+    # Patch JSON main index
+    json_file_links = parse_json_main_index(out_dir)
     patched_links = []
     for saved_link in json_file_links:
         if saved_link.url == link.url:
@@ -355,11 +269,12 @@ def patch_links_index(link: Link, out_dir: str=OUTPUT_DIR) -> None:
         else:
             patched_links.append(saved_link)
 
-    write_json_links_index(patched_links, out_dir=out_dir)
+    write_json_main_index(patched_links, out_dir=out_dir)
 
-    # Patch HTML index
+    # Patch HTML main index
     html_path = os.path.join(out_dir, 'index.html')
-    html = open(html_path, 'r').read().split('\n')
+    with open(html_path, 'r') as f:
+        html = f.read().split('\n')
     for idx, line in enumerate(html):
         if title and ('<span data-title-for="{}"'.format(link.url) in line):
             html[idx] = '<span>{}</span>'.format(title)
@@ -370,76 +285,31 @@ def patch_links_index(link: Link, out_dir: str=OUTPUT_DIR) -> None:
     atomic_write('\n'.join(html), html_path)
 
 
-### Individual link index
+### Link Details Index
 
 @enforce_types
-def write_link_index(link: Link, link_dir: Optional[str]=None) -> None:
-    link_dir = link_dir or link.link_dir
-    write_json_link_index(link, link_dir)
-    write_html_link_index(link, link_dir)
+def write_link_details(link: Link, out_dir: Optional[str]=None) -> None:
+    out_dir = out_dir or link.link_dir
+    write_json_link_details(link, out_dir=out_dir)
+    write_html_link_details(link, out_dir=out_dir)
 
 
 @enforce_types
-def write_json_link_index(link: Link, link_dir: Optional[str]=None) -> None:
-    """write a json file with some info about the link"""
-    link_dir = link_dir or link.link_dir
-    path = os.path.join(link_dir, 'index.json')
-    atomic_write(link._asdict(), path)
-
-
-@enforce_types
-def parse_json_link_index(link_dir: str) -> Optional[Link]:
-    """load the json link index from a given directory"""
-    existing_index = os.path.join(link_dir, 'index.json')
-    if os.path.exists(existing_index):
-        with open(existing_index, 'r', encoding='utf-8') as f:
-            link_json = json.load(f)
-            return Link.from_json(link_json)
-    return None
-
-
-@enforce_types
-def load_json_link_index(link: Link, link_dir: Optional[str]=None) -> Link:
+def load_link_details(link: Link, out_dir: Optional[str]=None) -> Link:
     """check for an existing link archive in the given directory,
     and load+merge it into the given link dict
     """
-    link_dir = link_dir or link.link_dir
-    existing_link = parse_json_link_index(link_dir)
+    out_dir = out_dir or link.link_dir
+
+    existing_link = parse_json_link_details(out_dir)
     if existing_link:
         return merge_links(existing_link, link)
+
     return link
-
-
-@enforce_types
-def write_html_link_index(link: Link, link_dir: Optional[str]=None) -> None:
-    link_dir = link_dir or link.link_dir
-    with open(os.path.join(TEMPLATES_DIR, 'link_index.html'), 'r', encoding='utf-8') as f:
-        link_html = f.read()
-
-    path = os.path.join(link_dir, 'index.html')
-
-    template_vars: Mapping[str, str] = {
-        **derived_link_info(link),
-        'title': (
-            link.title
-            or (link.base_url if link.is_archived else TITLE_LOADING_MSG)
-        ),
-        'url_str': htmlencode(urldecode(link.base_url)),
-        'archive_url': urlencode(
-            wget_output_path(link)
-            or (link.domain if link.is_archived else 'about:blank')
-        ),
-        'extension': link.extension or 'html',
-        'tags': link.tags or 'untagged',
-        'status': 'archived' if link.is_archived else 'not yet archived',
-        'status_color': 'success' if link.is_archived else 'danger',
-        'oldest_archive_date': ts_to_date(link.oldest_archive_date),
-    }
-
-    html_index = Template(link_html).substitute(**template_vars)
-    atomic_write(html_index, path)
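Taken together, the renames leave index.py with a small, symmetric API: load_main_index/write_main_index for the whole collection, and load_link_details/write_link_details for a single link's folder. A rough usage sketch, mirroring how update_archive_data (further down in this commit) calls them; paths are illustrative and the relative imports assume the code runs inside the archivebox.legacy package:

    from .index import load_main_index, write_main_index, write_link_details

    # parse the existing collection and merge in a newly imported source file
    all_links, new_links = load_main_index(out_dir='/path/to/archive',
                                           import_path='/path/to/bookmarks.html')

    # persist the merged, deduplicated main index (index.json + index.html)
    write_main_index(links=list(all_links), out_dir='/path/to/archive')

    # write each link's own details page into its folder (link.link_dir)
    for link in new_links:
        write_link_details(link, out_dir=link.link_dir)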

View file

@@ -1,3 +1,4 @@
+import os
 import re
 import shutil
@@ -7,13 +8,18 @@ from .schema import Link
 from .util import enforce_types, TimedProgress
 from .index import (
     links_after_timestamp,
-    load_links_index,
-    write_links_index,
+    load_main_index,
+    write_main_index,
 )
 from .archive_methods import archive_link
 from .config import (
+    stderr,
+    ANSI,
     ONLY_NEW,
     OUTPUT_DIR,
+    SOURCES_DIR,
+    ARCHIVE_DIR,
+    DATABASE_DIR,
     check_dependencies,
     check_data_folder,
 )
@@ -28,6 +34,51 @@ from .logs import (
 )
 
 
+@enforce_types
+def init():
+    os.makedirs(OUTPUT_DIR, exist_ok=True)
+
+    harmless_files = {'.DS_Store', '.venv', 'venv', 'virtualenv', '.virtualenv'}
+    is_empty = not len(set(os.listdir(OUTPUT_DIR)) - harmless_files)
+    existing_index = os.path.exists(os.path.join(OUTPUT_DIR, 'index.json'))
+
+    if not is_empty:
+        if existing_index:
+            stderr('{green}[√] You already have an archive index in: {}{reset}'.format(OUTPUT_DIR, **ANSI))
+            stderr('    To add new links, you can run:')
+            stderr("        archivebox add 'https://example.com'")
+            stderr()
+            stderr('    For more usage and examples, run:')
+            stderr('        archivebox help')
+            # TODO: import old archivebox version's archive data folder
+            raise SystemExit(1)
+        else:
+            stderr(
+                ("{red}[X] This folder already has files in it. You must run init inside a completely empty directory.{reset}"
+                 "\n\n"
+                 "    {lightred}Hint:{reset} To import a data folder created by an older version of ArchiveBox, \n"
+                 "    just cd into the folder and run the archivebox command to pick up where you left off.\n\n"
+                 "    (Always make sure your data folder is backed up first before updating ArchiveBox)"
+                ).format(OUTPUT_DIR, **ANSI)
+            )
+            raise SystemExit(1)
+
+    stderr('{green}[+] Initializing new archive directory: {}{reset}'.format(OUTPUT_DIR, **ANSI))
+    os.makedirs(SOURCES_DIR)
+    stderr(f'    > {SOURCES_DIR}')
+    os.makedirs(ARCHIVE_DIR)
+    stderr(f'    > {ARCHIVE_DIR}')
+    os.makedirs(DATABASE_DIR)
+    stderr(f'    > {DATABASE_DIR}')
+
+    write_main_index([], out_dir=OUTPUT_DIR, finished=True)
+    stderr('{green}[√] Done.{reset}'.format(**ANSI))
+
+
 @enforce_types
 def update_archive_data(import_path: Optional[str]=None, resume: Optional[float]=None, only_new: bool=False) -> List[Link]:
     """The main ArchiveBox entrancepoint. Everything starts here."""
@@ -37,19 +88,19 @@ def update_archive_data(import_path: Optional[str]=None, resume: Optional[float]
     # Step 1: Load list of links from the existing index
     #         merge in and dedupe new links from import_path
-    all_links, new_links = load_links_index(out_dir=OUTPUT_DIR, import_path=import_path)
+    all_links, new_links = load_main_index(out_dir=OUTPUT_DIR, import_path=import_path)
 
     # Step 2: Write updated index with deduped old and new links back to disk
-    write_links_index(links=list(all_links), out_dir=OUTPUT_DIR)
+    write_main_index(links=list(all_links), out_dir=OUTPUT_DIR)
 
     # Step 3: Run the archive methods for each link
     links = new_links if ONLY_NEW else all_links
     log_archiving_started(len(links), resume)
     idx: int = 0
-    link: Optional[Link] = None
+    link: Link = None                               # type: ignore
     try:
         for idx, link in enumerate(links_after_timestamp(links, resume)):
-            archive_link(link, link_dir=link.link_dir)
+            archive_link(link, out_dir=link.link_dir)
 
     except KeyboardInterrupt:
         log_archiving_paused(len(links), idx, link.timestamp if link else '0')
@@ -62,8 +113,8 @@ def update_archive_data(import_path: Optional[str]=None, resume: Optional[float]
     log_archiving_finished(len(links))
 
     # Step 4: Re-write links index with updated titles, icons, and resources
-    all_links, _ = load_links_index(out_dir=OUTPUT_DIR)
-    write_links_index(links=list(all_links), out_dir=OUTPUT_DIR, finished=True)
+    all_links, _ = load_main_index(out_dir=OUTPUT_DIR)
+    write_main_index(links=list(all_links), out_dir=OUTPUT_DIR, finished=True)
     return all_links
@@ -87,7 +138,7 @@ def link_matches_filter(link: Link, filter_patterns: List[str], filter_type: str
 def list_archive_data(filter_patterns: Optional[List[str]]=None, filter_type: str='exact',
                       after: Optional[float]=None, before: Optional[float]=None) -> Iterable[Link]:
 
-    all_links, _ = load_links_index(out_dir=OUTPUT_DIR)
+    all_links, _ = load_main_index(out_dir=OUTPUT_DIR)
 
     for link in all_links:
         if after is not None and float(link.timestamp) < after:
@@ -133,7 +184,7 @@ def remove_archive_links(filter_patterns: List[str], filter_type: str='exact',
     timer = TimedProgress(360, prefix='    ')
     try:
         to_keep = []
-        all_links, _ = load_links_index(out_dir=OUTPUT_DIR)
+        all_links, _ = load_main_index(out_dir=OUTPUT_DIR)
         for link in all_links:
             should_remove = (
                 (after is not None and float(link.timestamp) < after)
@@ -147,7 +198,7 @@ def remove_archive_links(filter_patterns: List[str], filter_type: str='exact',
     finally:
         timer.end()
 
-    write_links_index(links=to_keep, out_dir=OUTPUT_DIR, finished=True)
+    write_main_index(links=to_keep, out_dir=OUTPUT_DIR, finished=True)
     log_removal_finished(len(all_links), len(to_keep))
 
     return to_keep
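The new init() is the other half of the config change above: it creates the expected folder layout and an empty main index, so check_data_folder() passes on the next run. A hypothetical driver; the module name for this file isn't shown in the scrape, so the import path below is a guess:

    from archivebox.legacy.main import init, update_archive_data   # import path is a guess

    init()   # makes SOURCES_DIR, ARCHIVE_DIR, DATABASE_DIR and writes an empty index
    all_links = update_archive_data(import_path='bookmarks.html')
    print(len(all_links), 'links in the archive')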

View file

@@ -112,20 +112,25 @@ class Link:
         return float(self.timestamp) > float(other.timestamp)
 
     def typecheck(self) -> None:
-        assert self.schema == self.__class__.__name__
-        assert isinstance(self.timestamp, str) and self.timestamp
-        assert self.timestamp.replace('.', '').isdigit()
-        assert isinstance(self.url, str) and '://' in self.url
-        assert self.updated is None or isinstance(self.updated, datetime)
-        assert self.title is None or isinstance(self.title, str) and self.title
-        assert self.tags is None or isinstance(self.tags, str) and self.tags
-        assert isinstance(self.sources, list)
-        assert all(isinstance(source, str) and source for source in self.sources)
-        assert isinstance(self.history, dict)
-        for method, results in self.history.items():
-            assert isinstance(method, str) and method
-            assert isinstance(results, list)
-            assert all(isinstance(result, ArchiveResult) for result in results)
+        from .config import stderr, ANSI
+        try:
+            assert self.schema == self.__class__.__name__
+            assert isinstance(self.timestamp, str) and self.timestamp
+            assert self.timestamp.replace('.', '').isdigit()
+            assert isinstance(self.url, str) and '://' in self.url
+            assert self.updated is None or isinstance(self.updated, datetime)
+            assert self.title is None or (isinstance(self.title, str) and self.title)
+            assert self.tags is None or (isinstance(self.tags, str) and self.tags)
+            assert isinstance(self.sources, list)
+            assert all(isinstance(source, str) and source for source in self.sources)
+            assert isinstance(self.history, dict)
+            for method, results in self.history.items():
+                assert isinstance(method, str) and method
+                assert isinstance(results, list)
+                assert all(isinstance(result, ArchiveResult) for result in results)
+        except Exception:
+            stderr('{red}[X] Error while loading link! [{}] {} "{}"{reset}'.format(self.timestamp, self.url, self.title, **ANSI))
+            raise
 
     def _asdict(self, extended=False):
         info = {
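The change to typecheck() doesn't alter what is validated; it only wraps the asserts so a failure reports which link was being loaded before re-raising. The same pattern in miniature, using a plain dict instead of the Link dataclass:

    def typecheck_record(record: dict) -> None:
        try:
            assert isinstance(record.get('url'), str) and '://' in record['url']
            assert isinstance(record.get('timestamp'), str) and record['timestamp']
        except Exception:
            # say which record was bad, then let the original error propagate
            print('[X] Error while loading link! [{}] {}'.format(
                record.get('timestamp'), record.get('url')))
            raise

    typecheck_record({'url': 'https://example.com', 'timestamp': '1555470000.0'})   # passes silently
    # typecheck_record({'url': 'example.com', 'timestamp': ''})                     # prints, then raises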

View file

@@ -0,0 +1 @@
__package__ = 'archivebox.legacy.storage'

View file

@@ -0,0 +1,126 @@
import os

from datetime import datetime
from typing import List, Optional

from ..schema import Link
from ..config import (
    OUTPUT_DIR,
    TEMPLATES_DIR,
    VERSION,
    GIT_SHA,
    FOOTER_INFO,
    ARCHIVE_DIR_NAME,
)
from ..util import (
    enforce_types,
    ts_to_date,
    urlencode,
    htmlencode,
    urldecode,
    wget_output_path,
    render_template,
    atomic_write,
    copy_and_overwrite,
)

join = lambda *paths: os.path.join(*paths)
MAIN_INDEX_TEMPLATE = join(TEMPLATES_DIR, 'main_index.html')
MAIN_INDEX_ROW_TEMPLATE = join(TEMPLATES_DIR, 'main_index_row.html')
LINK_DETAILS_TEMPLATE = join(TEMPLATES_DIR, 'link_details.html')
TITLE_LOADING_MSG = 'Not yet archived...'


### Main Links Index

@enforce_types
def write_html_main_index(links: List[Link], out_dir: str=OUTPUT_DIR, finished: bool=False) -> None:
    """write the html link index to a given path"""

    copy_and_overwrite(join(TEMPLATES_DIR, 'favicon.ico'), join(out_dir, 'favicon.ico'))
    copy_and_overwrite(join(TEMPLATES_DIR, 'robots.txt'), join(out_dir, 'robots.txt'))
    copy_and_overwrite(join(TEMPLATES_DIR, 'static'), join(out_dir, 'static'))

    rendered_html = main_index_template(links, finished=finished)
    atomic_write(rendered_html, join(out_dir, 'index.html'))


@enforce_types
def main_index_template(links: List[Link], finished: bool=True) -> str:
    """render the template for the entire main index"""

    return render_template(MAIN_INDEX_TEMPLATE, {
        'version': VERSION,
        'git_sha': GIT_SHA,
        'num_links': str(len(links)),
        'status': 'finished' if finished else 'running',
        'date_updated': datetime.now().strftime('%Y-%m-%d'),
        'time_updated': datetime.now().strftime('%Y-%m-%d %H:%M'),
        'rows': '\n'.join(
            main_index_row_template(link)
            for link in links
        ),
        'footer_info': FOOTER_INFO,
    })


@enforce_types
def main_index_row_template(link: Link) -> str:
    """render the template for an individual link row of the main index"""

    return render_template(MAIN_INDEX_ROW_TEMPLATE, {
        **link._asdict(extended=True),

        # before pages are finished archiving, show loading msg instead of title
        'title': (
            link.title
            or (link.base_url if link.is_archived else TITLE_LOADING_MSG)
        ),

        # before pages are finished archiving, show fallback loading favicon
        'favicon_url': (
            join(ARCHIVE_DIR_NAME, link.timestamp, 'favicon.ico')
            # if link['is_archived'] else 'data:image/gif;base64,R0lGODlhAQABAAD/ACwAAAAAAQABAAACADs='
        ),

        # before pages are finished archiving, show the details page instead
        'wget_url': urlencode(wget_output_path(link) or 'index.html'),

        # replace commas in tags with spaces, or file extension if it's static
        'tags': (link.tags or '') + (' {}'.format(link.extension) if link.is_static else ''),
    })


### Link Details Index

@enforce_types
def write_html_link_details(link: Link, out_dir: Optional[str]=None) -> None:
    out_dir = out_dir or link.link_dir

    rendered_html = link_details_template(link)
    atomic_write(rendered_html, join(out_dir, 'index.html'))


@enforce_types
def link_details_template(link: Link) -> str:
    link_info = link._asdict(extended=True)

    return render_template(LINK_DETAILS_TEMPLATE, {
        **link_info,
        **link_info['canonical'],
        'title': (
            link.title
            or (link.base_url if link.is_archived else TITLE_LOADING_MSG)
        ),
        'url_str': htmlencode(urldecode(link.base_url)),
        'archive_url': urlencode(
            wget_output_path(link)
            or (link.domain if link.is_archived else 'about:blank')
        ),
        'extension': link.extension or 'html',
        'tags': link.tags or 'untagged',
        'status': 'archived' if link.is_archived else 'not yet archived',
        'status_color': 'success' if link.is_archived else 'danger',
        'oldest_archive_date': ts_to_date(link.oldest_archive_date),
    })
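All of the writers above funnel through render_template(), which (per the util.py change later in this commit) is just string.Template substitution. A minimal stand-in showing the kind of $placeholder filling the row template relies on; the template string here is invented for illustration:

    from string import Template

    row_template = '<tr><td title="$timestamp">$bookmarked_date</td><td>$title</td></tr>'
    context = {
        'timestamp': '1555470000.0',
        'bookmarked_date': '2019-04-17 02:25',
        'title': 'Example bookmark',
    }
    print(Template(row_template).substitute(**context))
    # <tr><td title="1555470000.0">2019-04-17 02:25</td><td>Example bookmark</td></tr>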

View file

@@ -0,0 +1,81 @@
import os
import json

from datetime import datetime
from typing import List, Optional, Iterator

from ..schema import Link, ArchiveResult
from ..config import (
    VERSION,
    OUTPUT_DIR,
)
from ..util import (
    enforce_types,
    atomic_write,
)


### Main Links Index

@enforce_types
def parse_json_main_index(out_dir: str=OUTPUT_DIR) -> Iterator[Link]:
    """parse a archive index json file and return the list of links"""
    index_path = os.path.join(out_dir, 'index.json')
    if os.path.exists(index_path):
        with open(index_path, 'r', encoding='utf-8') as f:
            links = json.load(f)['links']
            for link_json in links:
                yield Link.from_json(link_json)

    return ()


@enforce_types
def write_json_main_index(links: List[Link], out_dir: str=OUTPUT_DIR) -> None:
    """write the json link index to a given path"""

    assert isinstance(links, List), 'Links must be a list, not a generator.'
    assert not links or isinstance(links[0].history, dict)
    assert not links or isinstance(links[0].sources, list)

    if links and links[0].history.get('title'):
        assert isinstance(links[0].history['title'][0], ArchiveResult)

    if links and links[0].sources:
        assert isinstance(links[0].sources[0], str)

    path = os.path.join(out_dir, 'index.json')
    index_json = {
        'info': 'ArchiveBox Index',
        'source': 'https://github.com/pirate/ArchiveBox',
        'docs': 'https://github.com/pirate/ArchiveBox/wiki',
        'version': VERSION,
        'num_links': len(links),
        'updated': datetime.now(),
        'links': links,
    }
    atomic_write(index_json, path)


### Link Details Index

@enforce_types
def write_json_link_details(link: Link, out_dir: Optional[str]=None) -> None:
    """write a json file with some info about the link"""

    out_dir = out_dir or link.link_dir
    path = os.path.join(out_dir, 'index.json')
    atomic_write(link._asdict(extended=True), path)


@enforce_types
def parse_json_link_details(out_dir: str) -> Optional[Link]:
    """load the json link index from a given directory"""
    existing_index = os.path.join(out_dir, 'index.json')
    if os.path.exists(existing_index):
        with open(existing_index, 'r', encoding='utf-8') as f:
            link_json = json.load(f)
            return Link.from_json(link_json)

    return None
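A rough round trip through the new JSON storage helpers. The empty link list keeps the example free of Link construction details, and the import path assumes the package layout implied by storage/__init__.py above (archivebox.legacy.storage):

    import os
    from archivebox.legacy.storage.json import (
        write_json_main_index,
        parse_json_main_index,
    )

    out_dir = '/tmp/archivebox-demo'          # hypothetical collection folder
    os.makedirs(out_dir, exist_ok=True)

    write_json_main_index([], out_dir=out_dir)        # atomically writes out_dir/index.json
    links = list(parse_json_main_index(out_dir))      # -> [] until links are added
    print(len(links))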

Binary file not shown (image, 15 KiB).

View file

@@ -246,7 +246,7 @@
     </a>
 </div>
 <div class="col-lg-8">
-    <img src="$link_dir/$favicon_url" alt="Favicon">
+    <img src="$link_dir/favicon.ico" alt="Favicon">
     &nbsp;&nbsp;
     $title
     &nbsp;&nbsp;
@@ -325,36 +325,36 @@
 </div>
 <div class="col-lg-2">
     <div class="card">
-        <iframe class="card-img-top" src="$dom_url" sandbox="allow-same-origin allow-scripts allow-forms" scrolling="no"></iframe>
+        <iframe class="card-img-top" src="$dom_path" sandbox="allow-same-origin allow-scripts allow-forms" scrolling="no"></iframe>
         <div class="card-body">
-            <a href="$dom_url" style="float:right" title="Open in new tab..." target="_blank" rel="noopener">
+            <a href="$dom_path" style="float:right" title="Open in new tab..." target="_blank" rel="noopener">
                 <img src="../../static/external.png" class="external"/>
             </a>
-            <a href="$dom_url" target="preview"><h4 class="card-title">HTML</h4></a>
+            <a href="$dom_path" target="preview"><h4 class="card-title">HTML</h4></a>
             <p class="card-text">archive/output.html</p>
         </div>
     </div>
 </div>
 <div class="col-lg-2">
     <div class="card">
-        <iframe class="card-img-top pdf-frame" src="$pdf_url" scrolling="no"></iframe>
+        <iframe class="card-img-top pdf-frame" src="$pdf_path" scrolling="no"></iframe>
         <div class="card-body">
-            <a href="$pdf_url" style="float:right" title="Open in new tab..." target="_blank" rel="noopener">
+            <a href="$pdf_path" style="float:right" title="Open in new tab..." target="_blank" rel="noopener">
                 <img src="../../static/external.png" class="external"/>
             </a>
-            <a href="$pdf_url" target="preview" id="pdf-btn"><h4 class="card-title">PDF</h4></a>
+            <a href="$pdf_path" target="preview" id="pdf-btn"><h4 class="card-title">PDF</h4></a>
            <p class="card-text">archive/output.pdf</p>
         </div>
     </div>
 </div>
 <div class="col-lg-2">
     <div class="card">
-        <img class="card-img-top screenshot" src="$screenshot_url"></iframe>
+        <img class="card-img-top screenshot" src="$screenshot_path"></iframe>
         <div class="card-body">
-            <a href="$screenshot_url" style="float:right" title="Open in new tab..." target="_blank" rel="noopener">
+            <a href="$screenshot_path" style="float:right" title="Open in new tab..." target="_blank" rel="noopener">
                 <img src="../../static/external.png" class="external"/>
             </a>
-            <a href="$screenshot_url" target="preview"><h4 class="card-title">Screenshot</h4></a>
+            <a href="$screenshot_path" target="preview"><h4 class="card-title">Screenshot</h4></a>
             <p class="card-text">archive/screenshot.png</p>
         </div>
     </div>
@@ -373,12 +373,12 @@
 </div>
 <div class="col-lg-2">
     <div class="card">
-        <iframe class="card-img-top" src="$archive_org_url" sandbox="allow-same-origin allow-scripts allow-forms" scrolling="no"></iframe>
+        <iframe class="card-img-top" src="$archive_org_path" sandbox="allow-same-origin allow-scripts allow-forms" scrolling="no"></iframe>
         <div class="card-body">
-            <a href="$archive_org_url" style="float:right" title="Open in new tab..." target="_blank" rel="noopener">
+            <a href="$archive_org_path" style="float:right" title="Open in new tab..." target="_blank" rel="noopener">
                 <img src="../../static/external.png" class="external"/>
             </a>
-            <a href="$archive_org_url" target="preview"><h4 class="card-title">Archive.Org</h4></a>
+            <a href="$archive_org_path" target="preview"><h4 class="card-title">Archive.Org</h4></a>
             <p class="card-text">web.archive.org/web/...</p>
         </div>
     </div>

View file

@@ -1,14 +1,14 @@
 <tr>
     <td title="$timestamp">$bookmarked_date</td>
     <td class="title-col">
-        <a href="$archive_path/$index_url"><img src="$favicon_url" class="link-favicon" decoding="async"></a>
+        <a href="$archive_path/index.html"><img src="$favicon_url" class="link-favicon" decoding="async"></a>
         <a href="$archive_path/$wget_url" title="$title">
             <span data-title-for="$url" data-archived="$is_archived">$title</span>
             <small style="float:right">$tags</small>
         </a>
     </td>
     <td>
-        <a href="$archive_path/$index_url">📄
+        <a href="$archive_path/index.html">📄
         <span data-number-for="$url" title="Fetching any missing files...">$num_outputs <img src="static/spinner.gif" class="files-spinner" decoding="async"/></span>
         </a>
     </td>

View file

@@ -0,0 +1,2 @@
User-agent: *
Disallow: /

View file

@@ -5,8 +5,9 @@ import json
 import time
 import shutil
 
+from string import Template
 from json import JSONEncoder
-from typing import List, Optional, Any, Union, IO
+from typing import List, Optional, Any, Union, IO, Mapping
 from inspect import signature
 from functools import wraps
 from hashlib import sha256
@@ -396,10 +397,11 @@ def parse_date(date: Any) -> Optional[datetime]:
         try:
             return datetime.fromisoformat(date)
         except Exception:
-            try:
-                return datetime.strptime(date, '%Y-%m-%d %H:%M')
-            except Exception:
-                pass
+            pass
+        try:
+            return datetime.strptime(date, '%Y-%m-%d %H:%M')
+        except Exception:
+            pass
 
     raise ValueError('Tried to parse invalid date! {}'.format(date))
@@ -552,9 +554,12 @@ def chmod_file(path: str, cwd: str='.', permissions: str=OUTPUT_PERMISSIONS, tim
 
 @enforce_types
 def copy_and_overwrite(from_path: str, to_path: str):
-    if os.path.exists(to_path):
-        shutil.rmtree(to_path)
-    shutil.copytree(from_path, to_path)
+    if os.path.isdir(from_path):
+        shutil.rmtree(to_path, ignore_errors=True)
+        shutil.copytree(from_path, to_path)
+    else:
+        with open(from_path, 'rb') as src:
+            atomic_write(src.read(), to_path)
 
 
 @enforce_types
 def chrome_args(**options) -> List[str]:
@@ -642,11 +647,27 @@ def to_csv(links: List[Link], csv_cols: Optional[List[str]]=None,
     return '\n'.join((header_str, *row_strs))
 
 
-def atomic_write(contents: Union[dict, str], path: str) -> None:
+@enforce_types
+def render_template(template_path: str, context: Mapping[str, str]) -> str:
+    """render a given html template string with the given template content"""
+    # will be replaced by django templates in the future
+
+    with open(template_path, 'r', encoding='utf-8') as template:
+        template_str = template.read()
+        return Template(template_str).substitute(**context)
+
+
+def atomic_write(contents: Union[dict, str, bytes], path: str) -> None:
     """Safe atomic write to filesystem by writing to temp file + atomic rename"""
     try:
         tmp_file = '{}.tmp'.format(path)
-        with open(tmp_file, 'w+', encoding='utf-8') as f:
+
+        if isinstance(contents, bytes):
+            args = {'mode': 'wb+'}
+        else:
+            args = {'mode': 'w+', 'encoding': 'utf-8'}
+
+        with open(tmp_file, **args) as f:
             if isinstance(contents, dict):
                 to_json(contents, file=f)
             else:
@@ -678,3 +699,5 @@ def reject_stdin(caller: str) -> None:
         ))
         print()
         raise SystemExit(1)
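The util.py changes are what make the storage split work: atomic_write() now accepts bytes as well as str and dict (so copy_and_overwrite can fall back to it for single files like favicon.ico), and render_template() centralizes the string.Template logic that previously lived in index.py. A standalone sketch of the temp-file-plus-rename pattern with the new str/bytes mode switch (simplified: the real function also handles dicts via JSON):

    import os
    from typing import Union

    def atomic_write_sketch(contents: Union[str, bytes], path: str) -> None:
        tmp_file = '{}.tmp'.format(path)
        mode = 'wb+' if isinstance(contents, bytes) else 'w+'
        encoding = None if isinstance(contents, bytes) else 'utf-8'
        with open(tmp_file, mode, encoding=encoding) as f:
            f.write(contents)
        os.rename(tmp_file, path)   # rename is atomic on POSIX when both paths share a filesystem

    atomic_write_sketch('User-agent: *\nDisallow: /\n', '/tmp/robots.txt')
    atomic_write_sketch(b'\x89PNG\r\n', '/tmp/demo.bin')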