From 9ce47431daaae42cbc8243327ad934c58aaf0142 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Wed, 17 Apr 2019 02:25:28 -0400 Subject: [PATCH] better loading and saving storage mechanism --- archivebox/legacy/archive_methods.py | 6 +- archivebox/legacy/config.py | 10 +- archivebox/legacy/index.py | 228 ++++-------------- archivebox/legacy/main.py | 73 +++++- archivebox/legacy/schema.py | 33 +-- archivebox/legacy/storage/__init__.py | 1 + archivebox/legacy/storage/html.py | 126 ++++++++++ archivebox/legacy/storage/json.py | 81 +++++++ archivebox/legacy/templates/favicon.ico | Bin 0 -> 15086 bytes .../{link_index.html => link_details.html} | 26 +- .../templates/{index.html => main_index.html} | 0 .../{index_row.html => main_index_row.html} | 4 +- archivebox/legacy/templates/robots.txt | 2 + archivebox/legacy/util.py | 43 +++- 14 files changed, 395 insertions(+), 238 deletions(-) create mode 100644 archivebox/legacy/storage/__init__.py create mode 100644 archivebox/legacy/storage/html.py create mode 100644 archivebox/legacy/storage/json.py create mode 100644 archivebox/legacy/templates/favicon.ico rename archivebox/legacy/templates/{link_index.html => link_details.html} (93%) rename archivebox/legacy/templates/{index.html => main_index.html} (100%) rename archivebox/legacy/templates/{index_row.html => main_index_row.html} (84%) create mode 100644 archivebox/legacy/templates/robots.txt diff --git a/archivebox/legacy/archive_methods.py b/archivebox/legacy/archive_methods.py index 0abff907..56b415bf 100644 --- a/archivebox/legacy/archive_methods.py +++ b/archivebox/legacy/archive_methods.py @@ -6,9 +6,9 @@ from datetime import datetime from .schema import Link, ArchiveResult, ArchiveOutput from .index import ( - write_link_index, - patch_links_index, - load_json_link_index, + load_link_details, + write_link_details, + patch_main_index, ) from .config import ( CURL_BINARY, diff --git a/archivebox/legacy/config.py b/archivebox/legacy/config.py index 490f0a4b..c158e52b 100644 --- a/archivebox/legacy/config.py +++ b/archivebox/legacy/config.py @@ -115,7 +115,6 @@ URL_BLACKLIST_PTN = re.compile(URL_BLACKLIST, re.IGNORECASE) if URL_BLACKLIST el VERSION = open(os.path.join(REPO_DIR, 'VERSION'), 'r').read().strip() GIT_SHA = VERSION.split('+')[-1] or 'unknown' HAS_INVALID_DEPENDENCIES = False -HAS_INVALID_DB = not os.path.exists(os.path.join(OUTPUT_DIR, 'index.json')) ### Check system environment if USER == 'root': @@ -429,13 +428,12 @@ def check_dependencies() -> None: raise SystemExit(1) def check_data_folder() -> None: - if HAS_INVALID_DB: - stderr('{red}[X] No archive data found in:{reset} {}'.format(OUTPUT_DIR, **ANSI)) + if not os.path.exists(os.path.join(OUTPUT_DIR, 'index.json')): + stderr('{red}[X] No archive data was found in:{reset} {}'.format(OUTPUT_DIR, **ANSI)) stderr(' Are you running archivebox in the right folder?') - stderr(' cd path/to/your/archive') + stderr(' cd path/to/your/archive/folder') stderr(' archivebox [command]') stderr() - stderr(' To create a new archive folder, run:') - stderr(' mkdir new_archive_dir && cd new_archive_dir') + stderr(' To create a new archive collection in this folder, run:') stderr(' archivebox init') raise SystemExit(1) diff --git a/archivebox/legacy/index.py b/archivebox/legacy/index.py index 03cd52a9..4df15e30 100644 --- a/archivebox/legacy/index.py +++ b/archivebox/legacy/index.py @@ -1,33 +1,28 @@ import os import json -from datetime import datetime -from string import Template -from typing import List, Tuple, Iterator, Optional, Mapping, 
Iterable +from typing import List, Tuple, Optional, Iterable from collections import OrderedDict from .schema import Link, ArchiveResult from .config import ( OUTPUT_DIR, - TEMPLATES_DIR, - VERSION, - GIT_SHA, - FOOTER_INFO, TIMEOUT, URL_BLACKLIST_PTN, ANSI, stderr, ) +from .storage.html import write_html_main_index, write_html_link_details +from .storage.json import ( + parse_json_main_index, + write_json_main_index, + parse_json_link_details, + write_json_link_details, +) from .util import ( scheme, - ts_to_date, - urlencode, - htmlencode, - urldecode, - wget_output_path, enforce_types, TimedProgress, - copy_and_overwrite, atomic_write, ExtendedEncoder, ) @@ -40,8 +35,6 @@ from .logs import ( log_parsing_finished, ) -TITLE_LOADING_MSG = 'Not yet archived...' - ### Link filtering and checking @@ -53,8 +46,10 @@ def merge_links(a: Link, b: Link) -> Link: """ assert a.base_url == b.base_url, 'Cannot merge two links with different URLs' + # longest url wins (because a fuzzy url will always be shorter) url = a.url if len(a.url) > len(b.url) else b.url + # best title based on length and quality possible_titles = [ title for title in (a.title, b.title) @@ -66,20 +61,24 @@ def merge_links(a: Link, b: Link) -> Link: elif len(possible_titles) == 1: title = possible_titles[0] + # earliest valid timestamp timestamp = ( a.timestamp if float(a.timestamp or 0) < float(b.timestamp or 0) else b.timestamp ) + # all unique, truthy tags tags_set = ( set(tag.strip() for tag in (a.tags or '').split(',')) | set(tag.strip() for tag in (b.tags or '').split(',')) ) tags = ','.join(tags_set) or None + # all unique source entries sources = list(set(a.sources + b.sources)) + # all unique history entries for the combined archive methods all_methods = set(list(a.history.keys()) + list(a.history.keys())) history = { method: (a.history.get(method) or []) + (b.history.get(method) or []) @@ -95,7 +94,6 @@ def merge_links(a: Link, b: Link) -> Link: key=lambda result: result.start_ts, ))) - return Link( url=url, timestamp=timestamp, @@ -105,6 +103,8 @@ def merge_links(a: Link, b: Link) -> Link: history=history, ) + +@enforce_types def validate_links(links: Iterable[Link]) -> Iterable[Link]: links = archivable_links(links) # remove chrome://, about:, mailto: etc. 
links = sorted_links(links) # deterministically sort the links based on timstamp, url @@ -121,6 +121,8 @@ def validate_links(links: Iterable[Link]) -> Iterable[Link]: return links + +@enforce_types def archivable_links(links: Iterable[Link]) -> Iterable[Link]: """remove chrome://, about:// or other schemed links that cant be archived""" for link in links: @@ -130,6 +132,7 @@ def archivable_links(links: Iterable[Link]) -> Iterable[Link]: yield link +@enforce_types def uniquefied_links(sorted_links: Iterable[Link]) -> Iterable[Link]: """ ensures that all non-duplicate links have monotonically increasing timestamps @@ -153,12 +156,14 @@ def uniquefied_links(sorted_links: Iterable[Link]) -> Iterable[Link]: return unique_timestamps.values() +@enforce_types def sorted_links(links: Iterable[Link]) -> Iterable[Link]: sort_func = lambda link: (link.timestamp.split('.', 1)[0], link.url) return sorted(links, key=sort_func, reverse=True) -def links_after_timestamp(links: Iterable[Link], resume: float=None) -> Iterable[Link]: +@enforce_types +def links_after_timestamp(links: Iterable[Link], resume: Optional[float]=None) -> Iterable[Link]: if not resume: yield from links return @@ -171,6 +176,7 @@ def links_after_timestamp(links: Iterable[Link], resume: float=None) -> Iterable print('Resume value and all timestamp values must be valid numbers.') +@enforce_types def lowest_uniq_timestamp(used_timestamps: OrderedDict, timestamp: str) -> str: """resolve duplicate timestamps by appending a decimal 1234, 1234 -> 1234.1, 1234.2""" @@ -190,10 +196,10 @@ def lowest_uniq_timestamp(used_timestamps: OrderedDict, timestamp: str) -> str: -### Homepage index for all the links +### Main Links Index @enforce_types -def write_links_index(links: List[Link], out_dir: str=OUTPUT_DIR, finished: bool=False) -> None: +def write_main_index(links: List[Link], out_dir: str=OUTPUT_DIR, finished: bool=False) -> None: """create index.html file for a given list of links""" log_indexing_process_started() @@ -201,7 +207,7 @@ def write_links_index(links: List[Link], out_dir: str=OUTPUT_DIR, finished: bool log_indexing_started(out_dir, 'index.json') timer = TimedProgress(TIMEOUT * 2, prefix=' ') try: - write_json_links_index(links, out_dir=out_dir) + write_json_main_index(links, out_dir=out_dir) finally: timer.end() log_indexing_finished(out_dir, 'index.json') @@ -209,19 +215,19 @@ def write_links_index(links: List[Link], out_dir: str=OUTPUT_DIR, finished: bool log_indexing_started(out_dir, 'index.html') timer = TimedProgress(TIMEOUT * 2, prefix=' ') try: - write_html_links_index(links, out_dir=out_dir, finished=finished) + write_html_main_index(links, out_dir=out_dir, finished=finished) finally: timer.end() log_indexing_finished(out_dir, 'index.html') @enforce_types -def load_links_index(out_dir: str=OUTPUT_DIR, import_path: Optional[str]=None) -> Tuple[List[Link], List[Link]]: +def load_main_index(out_dir: str=OUTPUT_DIR, import_path: Optional[str]=None) -> Tuple[List[Link], List[Link]]: """parse and load existing index with any new links from import_path merged in""" existing_links: List[Link] = [] if out_dir: - existing_links = list(parse_json_links_index(out_dir)) + existing_links = list(parse_json_main_index(out_dir)) new_links: List[Link] = [] if import_path: @@ -242,108 +248,16 @@ def load_links_index(out_dir: str=OUTPUT_DIR, import_path: Optional[str]=None) - @enforce_types -def write_json_links_index(links: List[Link], out_dir: str=OUTPUT_DIR) -> None: - """write the json link index to a given path""" +def 
patch_main_index(link: Link, out_dir: str=OUTPUT_DIR) -> None: + """hack to in-place update one row's info in the generated index files""" - assert isinstance(links, List), 'Links must be a list, not a generator.' - assert not links or isinstance(links[0].history, dict) - assert not links or isinstance(links[0].sources, list) + # TODO: remove this ASAP, it's ugly, error-prone, and potentially dangerous - if links and links[0].history.get('title'): - assert isinstance(links[0].history['title'][0], ArchiveResult) - - if links and links[0].sources: - assert isinstance(links[0].sources[0], str) - - path = os.path.join(out_dir, 'index.json') - - index_json = { - 'info': 'ArchiveBox Index', - 'source': 'https://github.com/pirate/ArchiveBox', - 'docs': 'https://github.com/pirate/ArchiveBox/wiki', - 'version': VERSION, - 'num_links': len(links), - 'updated': datetime.now(), - 'links': links, - } - atomic_write(index_json, path) - - -@enforce_types -def parse_json_links_index(out_dir: str=OUTPUT_DIR) -> Iterator[Link]: - """parse a archive index json file and return the list of links""" - - index_path = os.path.join(out_dir, 'index.json') - if os.path.exists(index_path): - with open(index_path, 'r', encoding='utf-8') as f: - links = json.load(f)['links'] - for link_json in links: - yield Link.from_json(link_json) - - return () - - -@enforce_types -def write_html_links_index(links: List[Link], out_dir: str=OUTPUT_DIR, finished: bool=False) -> None: - """write the html link index to a given path""" - - copy_and_overwrite( - os.path.join(TEMPLATES_DIR, 'static'), - os.path.join(out_dir, 'static'), - ) - - atomic_write('User-agent: *\nDisallow: /', os.path.join(out_dir, 'robots.txt')) - - with open(os.path.join(TEMPLATES_DIR, 'index.html'), 'r', encoding='utf-8') as f: - index_html = f.read() - - with open(os.path.join(TEMPLATES_DIR, 'index_row.html'), 'r', encoding='utf-8') as f: - link_row_html = f.read() - - link_rows = [] - for link in links: - template_row_vars: Mapping[str, str] = { - **derived_link_info(link), - 'title': ( - link.title - or (link.base_url if link.is_archived else TITLE_LOADING_MSG) - ), - 'tags': (link.tags or '') + (' {}'.format(link.extension) if link.is_static else ''), - 'favicon_url': ( - os.path.join('archive', link.timestamp, 'favicon.ico') - # if link['is_archived'] else 'data:image/gif;base64,R0lGODlhAQABAAD/ACwAAAAAAQABAAACADs=' - ), - 'archive_url': urlencode( - wget_output_path(link) or 'index.html' - ), - } - link_rows.append(Template(link_row_html).substitute(**template_row_vars)) - - template_vars: Mapping[str, str] = { - 'num_links': str(len(links)), - 'date_updated': datetime.now().strftime('%Y-%m-%d'), - 'time_updated': datetime.now().strftime('%Y-%m-%d %H:%M'), - 'footer_info': FOOTER_INFO, - 'version': VERSION, - 'git_sha': GIT_SHA, - 'rows': '\n'.join(link_rows), - 'status': 'finished' if finished else 'running', - } - template_html = Template(index_html).substitute(**template_vars) - - atomic_write(template_html, os.path.join(out_dir, 'index.html')) - - - -@enforce_types -def patch_links_index(link: Link, out_dir: str=OUTPUT_DIR) -> None: - """hack to in-place update one row's info in the generated index html""" - - title = link.title or link.latest_outputs()['title'] + title = link.title or link.latest_outputs(status='succeeded')['title'] successful = link.num_outputs - # Patch JSON index - json_file_links = parse_json_links_index(out_dir) + # Patch JSON main index + json_file_links = parse_json_main_index(out_dir) patched_links = [] for saved_link in 
json_file_links: if saved_link.url == link.url: @@ -355,11 +269,12 @@ def patch_links_index(link: Link, out_dir: str=OUTPUT_DIR) -> None: else: patched_links.append(saved_link) - write_json_links_index(patched_links, out_dir=out_dir) + write_json_main_index(patched_links, out_dir=out_dir) - # Patch HTML index + # Patch HTML main index html_path = os.path.join(out_dir, 'index.html') - html = open(html_path, 'r').read().split('\n') + with open(html_path, 'r') as f: + html = f.read().split('\n') for idx, line in enumerate(html): if title and (' None: atomic_write('\n'.join(html), html_path) -### Individual link index +### Link Details Index @enforce_types -def write_link_index(link: Link, link_dir: Optional[str]=None) -> None: - link_dir = link_dir or link.link_dir +def write_link_details(link: Link, out_dir: Optional[str]=None) -> None: + out_dir = out_dir or link.link_dir - write_json_link_index(link, link_dir) - write_html_link_index(link, link_dir) + write_json_link_details(link, out_dir=out_dir) + write_html_link_details(link, out_dir=out_dir) @enforce_types -def write_json_link_index(link: Link, link_dir: Optional[str]=None) -> None: - """write a json file with some info about the link""" - - link_dir = link_dir or link.link_dir - path = os.path.join(link_dir, 'index.json') - - atomic_write(link._asdict(), path) - - -@enforce_types -def parse_json_link_index(link_dir: str) -> Optional[Link]: - """load the json link index from a given directory""" - existing_index = os.path.join(link_dir, 'index.json') - if os.path.exists(existing_index): - with open(existing_index, 'r', encoding='utf-8') as f: - link_json = json.load(f) - return Link.from_json(link_json) - return None - - -@enforce_types -def load_json_link_index(link: Link, link_dir: Optional[str]=None) -> Link: +def load_link_details(link: Link, out_dir: Optional[str]=None) -> Link: """check for an existing link archive in the given directory, and load+merge it into the given link dict """ - link_dir = link_dir or link.link_dir - existing_link = parse_json_link_index(link_dir) + out_dir = out_dir or link.link_dir + + existing_link = parse_json_link_details(out_dir) if existing_link: return merge_links(existing_link, link) + return link -@enforce_types -def write_html_link_index(link: Link, link_dir: Optional[str]=None) -> None: - link_dir = link_dir or link.link_dir - with open(os.path.join(TEMPLATES_DIR, 'link_index.html'), 'r', encoding='utf-8') as f: - link_html = f.read() - path = os.path.join(link_dir, 'index.html') - template_vars: Mapping[str, str] = { - **derived_link_info(link), - 'title': ( - link.title - or (link.base_url if link.is_archived else TITLE_LOADING_MSG) - ), - 'url_str': htmlencode(urldecode(link.base_url)), - 'archive_url': urlencode( - wget_output_path(link) - or (link.domain if link.is_archived else 'about:blank') - ), - 'extension': link.extension or 'html', - 'tags': link.tags or 'untagged', - 'status': 'archived' if link.is_archived else 'not yet archived', - 'status_color': 'success' if link.is_archived else 'danger', - 'oldest_archive_date': ts_to_date(link.oldest_archive_date), - } - html_index = Template(link_html).substitute(**template_vars) - - atomic_write(html_index, path) diff --git a/archivebox/legacy/main.py b/archivebox/legacy/main.py index 36f8cfc6..c437d5d4 100644 --- a/archivebox/legacy/main.py +++ b/archivebox/legacy/main.py @@ -1,3 +1,4 @@ +import os import re import shutil @@ -7,13 +8,18 @@ from .schema import Link from .util import enforce_types, TimedProgress from .index import ( 
links_after_timestamp, - load_links_index, - write_links_index, + load_main_index, + write_main_index, ) from .archive_methods import archive_link from .config import ( + stderr, + ANSI, ONLY_NEW, OUTPUT_DIR, + SOURCES_DIR, + ARCHIVE_DIR, + DATABASE_DIR, check_dependencies, check_data_folder, ) @@ -28,6 +34,51 @@ from .logs import ( ) +@enforce_types +def init(): + os.makedirs(OUTPUT_DIR, exist_ok=True) + + harmless_files = {'.DS_Store', '.venv', 'venv', 'virtualenv', '.virtualenv'} + is_empty = not len(set(os.listdir(OUTPUT_DIR)) - harmless_files) + existing_index = os.path.exists(os.path.join(OUTPUT_DIR, 'index.json')) + + if not is_empty: + if existing_index: + stderr('{green}[√] You already have an archive index in: {}{reset}'.format(OUTPUT_DIR, **ANSI)) + stderr(' To add new links, you can run:') + stderr(" archivebox add 'https://example.com'") + stderr() + stderr(' For more usage and examples, run:') + stderr(' archivebox help') + # TODO: import old archivebox version's archive data folder + + raise SystemExit(1) + else: + stderr( + ("{red}[X] This folder already has files in it. You must run init inside a completely empty directory.{reset}" + "\n\n" + " {lightred}Hint:{reset} To import a data folder created by an older version of ArchiveBox, \n" + " just cd into the folder and run the archivebox command to pick up where you left off.\n\n" + " (Always make sure your data folder is backed up first before updating ArchiveBox)" + ).format(OUTPUT_DIR, **ANSI) + ) + raise SystemExit(1) + + + stderr('{green}[+] Initializing new archive directory: {}{reset}'.format(OUTPUT_DIR, **ANSI)) + os.makedirs(SOURCES_DIR) + stderr(f' > {SOURCES_DIR}') + os.makedirs(ARCHIVE_DIR) + stderr(f' > {ARCHIVE_DIR}') + os.makedirs(DATABASE_DIR) + stderr(f' > {DATABASE_DIR}') + + write_main_index([], out_dir=OUTPUT_DIR, finished=True) + + stderr('{green}[√] Done.{reset}'.format(**ANSI)) + + + @enforce_types def update_archive_data(import_path: Optional[str]=None, resume: Optional[float]=None, only_new: bool=False) -> List[Link]: """The main ArchiveBox entrancepoint. 
Everything starts here.""" @@ -37,19 +88,19 @@ def update_archive_data(import_path: Optional[str]=None, resume: Optional[float] # Step 1: Load list of links from the existing index # merge in and dedupe new links from import_path - all_links, new_links = load_links_index(out_dir=OUTPUT_DIR, import_path=import_path) + all_links, new_links = load_main_index(out_dir=OUTPUT_DIR, import_path=import_path) # Step 2: Write updated index with deduped old and new links back to disk - write_links_index(links=list(all_links), out_dir=OUTPUT_DIR) + write_main_index(links=list(all_links), out_dir=OUTPUT_DIR) # Step 3: Run the archive methods for each link links = new_links if ONLY_NEW else all_links log_archiving_started(len(links), resume) idx: int = 0 - link: Optional[Link] = None + link: Link = None # type: ignore try: for idx, link in enumerate(links_after_timestamp(links, resume)): - archive_link(link, link_dir=link.link_dir) + archive_link(link, out_dir=link.link_dir) except KeyboardInterrupt: log_archiving_paused(len(links), idx, link.timestamp if link else '0') @@ -62,8 +113,8 @@ def update_archive_data(import_path: Optional[str]=None, resume: Optional[float] log_archiving_finished(len(links)) # Step 4: Re-write links index with updated titles, icons, and resources - all_links, _ = load_links_index(out_dir=OUTPUT_DIR) - write_links_index(links=list(all_links), out_dir=OUTPUT_DIR, finished=True) + all_links, _ = load_main_index(out_dir=OUTPUT_DIR) + write_main_index(links=list(all_links), out_dir=OUTPUT_DIR, finished=True) return all_links @@ -87,7 +138,7 @@ def link_matches_filter(link: Link, filter_patterns: List[str], filter_type: str def list_archive_data(filter_patterns: Optional[List[str]]=None, filter_type: str='exact', after: Optional[float]=None, before: Optional[float]=None) -> Iterable[Link]: - all_links, _ = load_links_index(out_dir=OUTPUT_DIR) + all_links, _ = load_main_index(out_dir=OUTPUT_DIR) for link in all_links: if after is not None and float(link.timestamp) < after: @@ -133,7 +184,7 @@ def remove_archive_links(filter_patterns: List[str], filter_type: str='exact', timer = TimedProgress(360, prefix=' ') try: to_keep = [] - all_links, _ = load_links_index(out_dir=OUTPUT_DIR) + all_links, _ = load_main_index(out_dir=OUTPUT_DIR) for link in all_links: should_remove = ( (after is not None and float(link.timestamp) < after) @@ -147,7 +198,7 @@ def remove_archive_links(filter_patterns: List[str], filter_type: str='exact', finally: timer.end() - write_links_index(links=to_keep, out_dir=OUTPUT_DIR, finished=True) + write_main_index(links=to_keep, out_dir=OUTPUT_DIR, finished=True) log_removal_finished(len(all_links), len(to_keep)) return to_keep diff --git a/archivebox/legacy/schema.py b/archivebox/legacy/schema.py index 743f3a14..38f2ec95 100644 --- a/archivebox/legacy/schema.py +++ b/archivebox/legacy/schema.py @@ -112,20 +112,25 @@ class Link: return float(self.timestamp) > float(other.timestamp) def typecheck(self) -> None: - assert self.schema == self.__class__.__name__ - assert isinstance(self.timestamp, str) and self.timestamp - assert self.timestamp.replace('.', '').isdigit() - assert isinstance(self.url, str) and '://' in self.url - assert self.updated is None or isinstance(self.updated, datetime) - assert self.title is None or isinstance(self.title, str) and self.title - assert self.tags is None or isinstance(self.tags, str) and self.tags - assert isinstance(self.sources, list) - assert all(isinstance(source, str) and source for source in self.sources) - assert 
isinstance(self.history, dict) - for method, results in self.history.items(): - assert isinstance(method, str) and method - assert isinstance(results, list) - assert all(isinstance(result, ArchiveResult) for result in results) + from .config import stderr, ANSI + try: + assert self.schema == self.__class__.__name__ + assert isinstance(self.timestamp, str) and self.timestamp + assert self.timestamp.replace('.', '').isdigit() + assert isinstance(self.url, str) and '://' in self.url + assert self.updated is None or isinstance(self.updated, datetime) + assert self.title is None or (isinstance(self.title, str) and self.title) + assert self.tags is None or (isinstance(self.tags, str) and self.tags) + assert isinstance(self.sources, list) + assert all(isinstance(source, str) and source for source in self.sources) + assert isinstance(self.history, dict) + for method, results in self.history.items(): + assert isinstance(method, str) and method + assert isinstance(results, list) + assert all(isinstance(result, ArchiveResult) for result in results) + except Exception: + stderr('{red}[X] Error while loading link! [{}] {} "{}"{reset}'.format(self.timestamp, self.url, self.title, **ANSI)) + raise def _asdict(self, extended=False): info = { diff --git a/archivebox/legacy/storage/__init__.py b/archivebox/legacy/storage/__init__.py new file mode 100644 index 00000000..40c7f113 --- /dev/null +++ b/archivebox/legacy/storage/__init__.py @@ -0,0 +1 @@ +__package__ = 'archivebox.legacy.storage' diff --git a/archivebox/legacy/storage/html.py b/archivebox/legacy/storage/html.py new file mode 100644 index 00000000..2ca4a2fc --- /dev/null +++ b/archivebox/legacy/storage/html.py @@ -0,0 +1,126 @@ +import os + +from datetime import datetime +from typing import List, Optional + +from ..schema import Link +from ..config import ( + OUTPUT_DIR, + TEMPLATES_DIR, + VERSION, + GIT_SHA, + FOOTER_INFO, + ARCHIVE_DIR_NAME, +) +from ..util import ( + enforce_types, + ts_to_date, + urlencode, + htmlencode, + urldecode, + wget_output_path, + render_template, + atomic_write, + copy_and_overwrite, +) + +join = lambda *paths: os.path.join(*paths) +MAIN_INDEX_TEMPLATE = join(TEMPLATES_DIR, 'main_index.html') +MAIN_INDEX_ROW_TEMPLATE = join(TEMPLATES_DIR, 'main_index_row.html') +LINK_DETAILS_TEMPLATE = join(TEMPLATES_DIR, 'link_details.html') +TITLE_LOADING_MSG = 'Not yet archived...' 
+ + +### Main Links Index + +@enforce_types +def write_html_main_index(links: List[Link], out_dir: str=OUTPUT_DIR, finished: bool=False) -> None: + """write the html link index to a given path""" + + copy_and_overwrite(join(TEMPLATES_DIR, 'favicon.ico'), join(out_dir, 'favicon.ico')) + copy_and_overwrite(join(TEMPLATES_DIR, 'robots.txt'), join(out_dir, 'robots.txt')) + copy_and_overwrite(join(TEMPLATES_DIR, 'static'), join(out_dir, 'static')) + + rendered_html = main_index_template(links, finished=finished) + atomic_write(rendered_html, join(out_dir, 'index.html')) + + +@enforce_types +def main_index_template(links: List[Link], finished: bool=True) -> str: + """render the template for the entire main index""" + + return render_template(MAIN_INDEX_TEMPLATE, { + 'version': VERSION, + 'git_sha': GIT_SHA, + 'num_links': str(len(links)), + 'status': 'finished' if finished else 'running', + 'date_updated': datetime.now().strftime('%Y-%m-%d'), + 'time_updated': datetime.now().strftime('%Y-%m-%d %H:%M'), + 'rows': '\n'.join( + main_index_row_template(link) + for link in links + ), + 'footer_info': FOOTER_INFO, + }) + + +@enforce_types +def main_index_row_template(link: Link) -> str: + """render the template for an individual link row of the main index""" + + return render_template(MAIN_INDEX_ROW_TEMPLATE, { + **link._asdict(extended=True), + + # before pages are finished archiving, show loading msg instead of title + 'title': ( + link.title + or (link.base_url if link.is_archived else TITLE_LOADING_MSG) + ), + + # before pages are finished archiving, show fallback loading favicon + 'favicon_url': ( + join(ARCHIVE_DIR_NAME, link.timestamp, 'favicon.ico') + # if link['is_archived'] else 'data:image/gif;base64,R0lGODlhAQABAAD/ACwAAAAAAQABAAACADs=' + ), + + # before pages are finished archiving, show the details page instead + 'wget_url': urlencode(wget_output_path(link) or 'index.html'), + + # replace commas in tags with spaces, or file extension if it's static + 'tags': (link.tags or '') + (' {}'.format(link.extension) if link.is_static else ''), + }) + + +### Link Details Index + +@enforce_types +def write_html_link_details(link: Link, out_dir: Optional[str]=None) -> None: + out_dir = out_dir or link.link_dir + + rendered_html = link_details_template(link) + atomic_write(rendered_html, join(out_dir, 'index.html')) + + +@enforce_types +def link_details_template(link: Link) -> str: + + link_info = link._asdict(extended=True) + + return render_template(LINK_DETAILS_TEMPLATE, { + **link_info, + **link_info['canonical'], + 'title': ( + link.title + or (link.base_url if link.is_archived else TITLE_LOADING_MSG) + ), + 'url_str': htmlencode(urldecode(link.base_url)), + 'archive_url': urlencode( + wget_output_path(link) + or (link.domain if link.is_archived else 'about:blank') + ), + 'extension': link.extension or 'html', + 'tags': link.tags or 'untagged', + 'status': 'archived' if link.is_archived else 'not yet archived', + 'status_color': 'success' if link.is_archived else 'danger', + 'oldest_archive_date': ts_to_date(link.oldest_archive_date), + }) diff --git a/archivebox/legacy/storage/json.py b/archivebox/legacy/storage/json.py new file mode 100644 index 00000000..de581910 --- /dev/null +++ b/archivebox/legacy/storage/json.py @@ -0,0 +1,81 @@ +import os +import json + +from datetime import datetime +from typing import List, Optional, Iterator + +from ..schema import Link, ArchiveResult +from ..config import ( + VERSION, + OUTPUT_DIR, +) +from ..util import ( + enforce_types, + atomic_write, +) + + +### 
Main Links Index + +@enforce_types +def parse_json_main_index(out_dir: str=OUTPUT_DIR) -> Iterator[Link]: + """parse a archive index json file and return the list of links""" + + index_path = os.path.join(out_dir, 'index.json') + if os.path.exists(index_path): + with open(index_path, 'r', encoding='utf-8') as f: + links = json.load(f)['links'] + for link_json in links: + yield Link.from_json(link_json) + + return () + +@enforce_types +def write_json_main_index(links: List[Link], out_dir: str=OUTPUT_DIR) -> None: + """write the json link index to a given path""" + + assert isinstance(links, List), 'Links must be a list, not a generator.' + assert not links or isinstance(links[0].history, dict) + assert not links or isinstance(links[0].sources, list) + + if links and links[0].history.get('title'): + assert isinstance(links[0].history['title'][0], ArchiveResult) + + if links and links[0].sources: + assert isinstance(links[0].sources[0], str) + + path = os.path.join(out_dir, 'index.json') + + index_json = { + 'info': 'ArchiveBox Index', + 'source': 'https://github.com/pirate/ArchiveBox', + 'docs': 'https://github.com/pirate/ArchiveBox/wiki', + 'version': VERSION, + 'num_links': len(links), + 'updated': datetime.now(), + 'links': links, + } + atomic_write(index_json, path) + + +### Link Details Index + +@enforce_types +def write_json_link_details(link: Link, out_dir: Optional[str]=None) -> None: + """write a json file with some info about the link""" + + out_dir = out_dir or link.link_dir + path = os.path.join(out_dir, 'index.json') + + atomic_write(link._asdict(extended=True), path) + + +@enforce_types +def parse_json_link_details(out_dir: str) -> Optional[Link]: + """load the json link index from a given directory""" + existing_index = os.path.join(out_dir, 'index.json') + if os.path.exists(existing_index): + with open(existing_index, 'r', encoding='utf-8') as f: + link_json = json.load(f) + return Link.from_json(link_json) + return None diff --git a/archivebox/legacy/templates/favicon.ico b/archivebox/legacy/templates/favicon.ico new file mode 100644 index 0000000000000000000000000000000000000000..835c37687d93dc28ba06b29dd59f1f3f4bcb8df3 GIT binary patch literal 15086 zcmeHOX-rgC6n-pjpa`;vfHHs-X*Dq}L`}4f#zdlMH?74Y(o#VGX-rM}$1UQYQrDQc zlpTvnV{D8I+C+_eKm-vGTC57P+n^>W>*;r%^T;^NeLQexcqF{!n>X{`z4v_gaPGb5 z+2#M|-%sx=t83Zk*@%@#8(+-QB%BJUnDfm@vUx zfv2aZx2@sr?d=m165^)T@3CXYdV>EcaLfmv0xvJGLZApJ1|Fb|65yc;O7Yz;fUX{m z88apadURFkLtt1~*wZs-&d|-9H|fTW8@j-)Tem1ZJ)NM#hv4p|!ar!%tXb91pFbzP zUB$)4(K>sP22{j15b)jNg1qobq0h`*bgTj=cB zvupdZA6s!$YuCvCAIyS<2^`CO_lwWw6rwR0xp03`gQ8=?p8A$`JK^z9%GK5EYd%GyfCf&o0^(v{`~p0 zYSk)AN=lOFBlv!Vwh3cQ6S(ZMWy|Qzn>Skd{|ldfiikgaX3hNT>+5OZ!i99_&K+4c z>o6`ZPUi36;6Pfy{Ff|QLanW>n)$o9xCr^fKh?~?p`n2mEm}l(@7}d1|0h_Z0!8{q zEHVxDQ?JGT2ZMh=K)_S1Q9}McK0bQq{|vFLL8SkQ6DR7PKi1eF5r1D_-)VE^%+a&` zJnMsh_)z9QY0@O--=O31tEzv*m%8Tc266_{c*0nLK$i^KaBLe*1J-tXzQoPYwJ((K~o=$HSf)z7q6Io?w;X-np(Cdkast<0WbM-akEv)+ptwo9kwQF0^x8>&Y5kJP9K7G3H1E+TeHsDX3sdqy57v}P@{;!Ap0k^BQqEY^r=JFB$ z#M(|?zO%FQHx}f7FY14L zd%L`!c>er(x^UqFUA%Zv2!5ZRpD*tnTT&JGA7}Mz&E+Hh5z4QttE1@XXgYG_2<_Oh zgZ*{FJVrRP?FFCz<< zfpcH(-MdF!U0uTQCy<{Y(!a%q2KbQYALJAEvOn~HP~~F}DseFTZ%6rrT$%Oe;~#ri z_UgY#ev+N^Psp|D)qk9E*iAlSjaWthSc`rIegUVyR<2x0TeohdEnBwO1l;!0rAvA4 z$^%kZHhkfpoGfsX*Hv5JO=aSH%OK-Z$R9MAPNPpGX_T!YnOrQJ_bGT=!T|K_Q{aEi zY;ZXsr!j8l&Yizu9sO z<3i3c$DTd0Pr`T39p%0$^|8O7Wl}#^kgU`}gk?qp+}0{#&wcP59?nsxKGl zP#6Qb;O~MxgRAfn8W%5KEa%2@er#XhvWP1hQ1=Sv fxKqGMw8j0=46eg<3088|Ns@M8B=iT6C13vo-`!7o literal 0 HcmV?d00001 diff --git 
a/archivebox/legacy/templates/link_index.html b/archivebox/legacy/templates/link_details.html similarity index 93% rename from archivebox/legacy/templates/link_index.html rename to archivebox/legacy/templates/link_details.html index efe8a7e8..f9019926 100644 --- a/archivebox/legacy/templates/link_index.html +++ b/archivebox/legacy/templates/link_details.html @@ -246,7 +246,7 @@
[template markup lost in extraction: only the visible labels survive from this hunk and the two that follow (@@ -325,36 +325,36 @@ and @@ -373,12 +373,12 @@). They cover the Favicon image and $title heading, and the HTML (archive/output.html), PDF (archive/output.pdf), Screenshot (archive/screenshot.png), and Archive.Org (web.archive.org/web/...) preview cards of link_details.html.]
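For context on how these templates are consumed: the new storage/html.py helpers build a context dict and hand it to util.render_template(), which is plain string.Template substitution. Below is a minimal sketch of that substitution; the inline one-line template and the example values are invented for illustration, while the $title/$tags/$status/$status_color placeholders and the substitute() call mirror what the patch introduces.

    from string import Template

    # Hypothetical one-line template; the real link_details.html is far larger.
    template_str = '<h1>$title</h1> <small>$tags</small> <span class="$status_color">$status</span>'

    context = {
        'title': 'Example Domain',   # example value; normally link.title or TITLE_LOADING_MSG
        'tags': 'untagged',          # fallback used by link_details_template()
        'status': 'archived',
        'status_color': 'success',
    }

    print(Template(template_str).substitute(**context))

Because substitute() raises KeyError for any missing placeholder, link_details_template() and main_index_row_template() assemble a complete context (falling back to TITLE_LOADING_MSG, 'untagged', etc.) before rendering.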
diff --git a/archivebox/legacy/templates/index.html b/archivebox/legacy/templates/main_index.html similarity index 100% rename from archivebox/legacy/templates/index.html rename to archivebox/legacy/templates/main_index.html diff --git a/archivebox/legacy/templates/index_row.html b/archivebox/legacy/templates/main_index_row.html similarity index 84% rename from archivebox/legacy/templates/index_row.html rename to archivebox/legacy/templates/main_index_row.html index 48f22802..5f851603 100644 --- a/archivebox/legacy/templates/index_row.html +++ b/archivebox/legacy/templates/main_index_row.html @@ -1,14 +1,14 @@ $bookmarked_date - + $title $tags - 📄 + 📄 $num_outputs diff --git a/archivebox/legacy/templates/robots.txt b/archivebox/legacy/templates/robots.txt new file mode 100644 index 00000000..b338083e --- /dev/null +++ b/archivebox/legacy/templates/robots.txt @@ -0,0 +1,2 @@ +User-agent: * + Disallow: / diff --git a/archivebox/legacy/util.py b/archivebox/legacy/util.py index a1c823ff..c4f14328 100644 --- a/archivebox/legacy/util.py +++ b/archivebox/legacy/util.py @@ -5,8 +5,9 @@ import json import time import shutil +from string import Template from json import JSONEncoder -from typing import List, Optional, Any, Union, IO +from typing import List, Optional, Any, Union, IO, Mapping from inspect import signature from functools import wraps from hashlib import sha256 @@ -396,10 +397,11 @@ def parse_date(date: Any) -> Optional[datetime]: try: return datetime.fromisoformat(date) except Exception: - try: - return datetime.strptime(date, '%Y-%m-%d %H:%M') - except Exception: - pass + pass + try: + return datetime.strptime(date, '%Y-%m-%d %H:%M') + except Exception: + pass raise ValueError('Tried to parse invalid date! {}'.format(date)) @@ -552,9 +554,12 @@ def chmod_file(path: str, cwd: str='.', permissions: str=OUTPUT_PERMISSIONS, tim @enforce_types def copy_and_overwrite(from_path: str, to_path: str): - if os.path.exists(to_path): - shutil.rmtree(to_path) - shutil.copytree(from_path, to_path) + if os.path.isdir(from_path): + shutil.rmtree(to_path, ignore_errors=True) + shutil.copytree(from_path, to_path) + else: + with open(from_path, 'rb') as src: + atomic_write(src.read(), to_path) @enforce_types def chrome_args(**options) -> List[str]: @@ -642,11 +647,27 @@ def to_csv(links: List[Link], csv_cols: Optional[List[str]]=None, return '\n'.join((header_str, *row_strs)) -def atomic_write(contents: Union[dict, str], path: str) -> None: +@enforce_types +def render_template(template_path: str, context: Mapping[str, str]) -> str: + """render a given html template string with the given template content""" + + # will be replaced by django templates in the future + with open(template_path, 'r', encoding='utf-8') as template: + template_str = template.read() + return Template(template_str).substitute(**context) + + +def atomic_write(contents: Union[dict, str, bytes], path: str) -> None: """Safe atomic write to filesystem by writing to temp file + atomic rename""" try: tmp_file = '{}.tmp'.format(path) - with open(tmp_file, 'w+', encoding='utf-8') as f: + + if isinstance(contents, bytes): + args = {'mode': 'wb+'} + else: + args = {'mode': 'w+', 'encoding': 'utf-8'} + + with open(tmp_file, **args) as f: if isinstance(contents, dict): to_json(contents, file=f) else: @@ -678,3 +699,5 @@ def reject_stdin(caller: str) -> None: )) print() raise SystemExit(1) + +
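As a closing illustration of the util.py change above: the reworked atomic_write() now accepts bytes as well as str, choosing a binary or utf-8 text mode for the temp file accordingly. The standalone function below is a simplified sketch of that pattern, not the real helper; the os.replace() rename and the cleanup step are assumptions (the hunk does not show how the temp file is moved into place), and the real function also serializes dicts through to_json().

    import os
    from typing import Union

    def atomic_write_sketch(contents: Union[str, bytes], path: str) -> None:
        """Simplified sketch: write to a temp file, then rename it into place."""
        tmp_file = '{}.tmp'.format(path)

        # bytes payloads (e.g. a copied favicon.ico) need a binary handle;
        # str payloads (rendered HTML, JSON text) get utf-8 text mode
        if isinstance(contents, bytes):
            args = {'mode': 'wb+'}
        else:
            args = {'mode': 'w+', 'encoding': 'utf-8'}

        try:
            with open(tmp_file, **args) as f:
                f.write(contents)
            os.replace(tmp_file, path)  # assumption: the real helper renames the temp file similarly
        finally:
            if os.path.exists(tmp_file):
                os.remove(tmp_file)

    # e.g.: atomic_write_sketch('User-agent: *\n Disallow: /\n', 'robots.txt')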