
switch to strict type hints with NamedTuples instead of dicts

Nick Sweeting, 2019-03-26 05:33:34 -04:00
commit 76abc58135, parent 0a44779b21
8 changed files with 201 additions and 98 deletions

archivebox/archive.py

@@ -12,6 +12,9 @@ Usage & Documentation:
 import os
 import sys

+from typing import List
+from schema import Link
 from links import links_after_timestamp
 from index import write_links_index, load_links_index
 from archive_methods import archive_link
@@ -50,7 +53,7 @@ def print_help():
     print("    ./archive 15109948213.123\n")

-def main(*args):
+def main(*args) -> List[Link]:
     if set(args).intersection(('-h', '--help', 'help')) or len(args) > 2:
         print_help()
         raise SystemExit(0)
@@ -95,10 +98,10 @@ def main(*args):
         import_path = save_remote_source(import_path)

     ### Run the main archive update process
-    update_archive_data(import_path=import_path, resume=resume)
+    return update_archive_data(import_path=import_path, resume=resume)

-def update_archive_data(import_path=None, resume=None):
+def update_archive_data(import_path: str=None, resume: float=None) -> List[Link]:
     """The main ArchiveBox entrancepoint. Everything starts here."""

     # Step 1: Load list of links from the existing index
@@ -111,14 +114,14 @@ def update_archive_data(import_path=None, resume=None):
     # Step 3: Run the archive methods for each link
     links = new_links if ONLY_NEW else all_links
     log_archiving_started(len(links), resume)
-    idx, link = 0, 0
+    idx, link = 0, {'timestamp': 0}
     try:
         for idx, link in enumerate(links_after_timestamp(links, resume)):
             link_dir = os.path.join(ARCHIVE_DIR, link['timestamp'])
             archive_link(link_dir, link)

     except KeyboardInterrupt:
-        log_archiving_paused(len(links), idx, link and link['timestamp'])
+        log_archiving_paused(len(links), idx, link['timestamp'])
         raise SystemExit(0)

     except:
@@ -130,7 +133,7 @@ def update_archive_data(import_path=None, resume=None):
     # Step 4: Re-write links index with updated titles, icons, and resources
     all_links, _ = load_links_index(out_dir=OUTPUT_DIR)
     write_links_index(out_dir=OUTPUT_DIR, links=all_links, finished=True)
+    return all_links

 if __name__ == '__main__':
     main(*sys.argv)
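Since main() and update_archive_data() now return List[Link] instead of nothing, an archive run can be driven from other Python code. A minimal sketch of that, assuming it runs inside the archivebox/ package with a configured OUTPUT_DIR; the source path is a made-up example:

from archive import update_archive_data

# 'output/sources/bookmarks.html' is a hypothetical, already-saved import file
all_links = update_archive_data(import_path='output/sources/bookmarks.html', resume=None)
print('index now tracks {} links'.format(len(all_links)))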

archivebox/archive_methods.py

@@ -1,10 +1,10 @@
 import os
-import json
-from typing import Union, Dict, List, Tuple, NamedTuple
+from typing import Dict, List, Tuple
 from collections import defaultdict
 from datetime import datetime

+from schema import Link, ArchiveResult, ArchiveError
 from index import (
     write_link_index,
     patch_links_index,
@@ -102,7 +102,7 @@ def archive_link(link_dir: str, link: Link, page=None) -> Link:
             link['history'][method_name].append(result._asdict())
             stats[result.status] += 1
-            log_archive_method_finished(result._asdict())
+            log_archive_method_finished(result)
         else:
             stats['skipped'] += 1

archivebox/index.py

@@ -11,6 +11,7 @@ except ImportError:
     print('[X] Missing "distutils" python package. To install it, run:')
     print('    pip install distutils')

+from schema import Link, ArchiveIndex
 from config import (
     OUTPUT_DIR,
     TEMPLATES_DIR,
@@ -25,7 +26,7 @@ from util import (
     check_links_structure,
     wget_output_path,
     latest_output,
-    Link,
+    ExtendedEncoder,
 )
 from parse import parse_links
 from links import validate_links
@@ -56,6 +57,7 @@ def write_links_index(out_dir: str, links: List[Link], finished: bool=False) ->
     write_html_links_index(out_dir, links, finished=finished)
     log_indexing_finished(out_dir, 'index.html')

+
 def load_links_index(out_dir: str=OUTPUT_DIR, import_path: str=None) -> Tuple[List[Link], List[Link]]:
     """parse and load existing index with any new links from import_path merged in"""
@@ -82,6 +84,7 @@ def load_links_index(out_dir: str=OUTPUT_DIR, import_path: str=None) -> Tuple[Li
     return all_links, new_links

+
 def write_json_links_index(out_dir: str, links: List[Link]) -> None:
     """write the json link index to a given path"""
@@ -89,20 +92,24 @@ def write_json_links_index(out_dir: str, links: List[Link]) -> None:
     path = os.path.join(out_dir, 'index.json')

-    index_json = {
-        'info': 'ArchiveBox Index',
-        'help': 'https://github.com/pirate/ArchiveBox',
-        'version': GIT_SHA,
-        'num_links': len(links),
-        'updated': str(datetime.now().timestamp()),
-        'links': links,
-    }
+    index_json = ArchiveIndex(
+        info='ArchiveBox Index',
+        source='https://github.com/pirate/ArchiveBox',
+        docs='https://github.com/pirate/ArchiveBox/wiki',
+        version=GIT_SHA,
+        num_links=len(links),
+        updated=str(datetime.now().timestamp()),
+        links=links,
+    )
+
+    assert isinstance(index_json._asdict(), dict)

     with open(path, 'w', encoding='utf-8') as f:
-        json.dump(index_json, f, indent=4, default=str)
+        json.dump(index_json._asdict(), f, indent=4, cls=ExtendedEncoder)

     chmod_file(path)

+
 def parse_json_links_index(out_dir: str=OUTPUT_DIR) -> List[Link]:
     """parse a archive index json file and return the list of links"""
     index_path = os.path.join(out_dir, 'index.json')
@@ -114,6 +121,7 @@ def parse_json_links_index(out_dir: str=OUTPUT_DIR) -> List[Link]:
     return []

+
 def write_html_links_index(out_dir: str, links: List[Link], finished: bool=False) -> None:
     """write the html link index to a given path"""
@@ -208,6 +216,7 @@ def write_link_index(out_dir: str, link: Link) -> None:
     write_json_link_index(out_dir, link)
     write_html_link_index(out_dir, link)

+
 def write_json_link_index(out_dir: str, link: Link) -> None:
     """write a json file with some info about the link"""
@@ -215,10 +224,11 @@ def write_json_link_index(out_dir: str, link: Link) -> None:
     path = os.path.join(out_dir, 'index.json')

     with open(path, 'w', encoding='utf-8') as f:
-        json.dump(link, f, indent=4, default=str)
+        json.dump(link, f, indent=4, cls=ExtendedEncoder)

     chmod_file(path)

+
 def parse_json_link_index(out_dir: str) -> dict:
     """load the json link index from a given directory"""
     existing_index = os.path.join(out_dir, 'index.json')
@@ -229,6 +239,7 @@ def parse_json_link_index(out_dir: str) -> dict:
             return link_json

     return {}

+
 def load_json_link_index(out_dir: str, link: Link) -> Link:
     """check for an existing link archive in the given directory,
        and load+merge it into the given link dict
@@ -244,6 +255,7 @@ def load_json_link_index(out_dir: str, link: Link) -> Link:
     check_link_structure(link)
     return link

+
 def write_html_link_index(out_dir: str, link: Link) -> None:
     check_link_structure(link)
     with open(os.path.join(TEMPLATES_DIR, 'link_index.html'), 'r', encoding='utf-8') as f:
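The index.json writer now builds an ArchiveIndex NamedTuple and serializes it via ._asdict() plus ExtendedEncoder instead of json.dump(..., default=str). A rough sketch of that round trip, assuming it runs inside the archivebox/ package with its dependencies installed; the field values here are made up:

import json
from schema import ArchiveIndex
from util import ExtendedEncoder

index = ArchiveIndex(
    info='ArchiveBox Index',
    version='abc1234',                 # stand-in for GIT_SHA
    source='https://github.com/pirate/ArchiveBox',
    docs='https://github.com/pirate/ArchiveBox/wiki',
    num_links=0,
    updated='1553592000.0',
    links=[],
)
print(json.dumps(index._asdict(), indent=4, cls=ExtendedEncoder))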

archivebox/links.py

@@ -19,17 +19,19 @@ Link {
 }
 """

-from html import unescape
+from typing import List, Iterable
 from collections import OrderedDict

+from schema import Link
 from util import (
     merge_links,
     check_link_structure,
     check_links_structure,
+    htmldecode,
 )

-def validate_links(links):
+def validate_links(links: Iterable[Link]) -> List[Link]:
     check_links_structure(links)
     links = archivable_links(links)     # remove chrome://, about:, mailto: etc.
     links = uniquefied_links(links)     # merge/dedupe duplicate timestamps & urls
@@ -40,13 +42,13 @@ def validate_links(links):
         raise SystemExit(1)

     for link in links:
-        link['title'] = unescape(link['title'].strip()) if link['title'] else None
+        link['title'] = htmldecode(link['title'].strip()) if link['title'] else None
         check_link_structure(link)

     return list(links)

-def archivable_links(links):
+def archivable_links(links: Iterable[Link]) -> Iterable[Link]:
     """remove chrome://, about:// or other schemed links that cant be archived"""
     return (
         link
@@ -55,12 +57,12 @@ def archivable_links(links):
     )

-def uniquefied_links(sorted_links):
+def uniquefied_links(sorted_links: Iterable[Link]) -> Iterable[Link]:
     """
     ensures that all non-duplicate links have monotonically increasing timestamps
     """

-    unique_urls = OrderedDict()
+    unique_urls: OrderedDict[str, Link] = OrderedDict()

     lower = lambda url: url.lower().strip()
     without_www = lambda url: url.replace('://www.', '://', 1)
@@ -73,7 +75,7 @@ def uniquefied_links(sorted_links):
             link = merge_links(unique_urls[fuzzy_url], link)
         unique_urls[fuzzy_url] = link

-    unique_timestamps = OrderedDict()
+    unique_timestamps: OrderedDict[str, Link] = OrderedDict()
     for link in unique_urls.values():
         link['timestamp'] = lowest_uniq_timestamp(unique_timestamps, link['timestamp'])
         unique_timestamps[link['timestamp']] = link
@@ -81,12 +83,12 @@ def uniquefied_links(sorted_links):
     return unique_timestamps.values()

-def sorted_links(links):
+def sorted_links(links: Iterable[Link]) -> Iterable[Link]:
     sort_func = lambda link: (link['timestamp'].split('.', 1)[0], link['url'])
     return sorted(links, key=sort_func, reverse=True)

-def links_after_timestamp(links, timestamp=None):
+def links_after_timestamp(links: Iterable[Link], timestamp: str=None) -> Iterable[Link]:
     if not timestamp:
         yield from links
         return
@@ -99,7 +101,7 @@ def links_after_timestamp(links, timestamp=None):
         print('Resume value and all timestamp values must be valid numbers.')

-def lowest_uniq_timestamp(used_timestamps, timestamp):
+def lowest_uniq_timestamp(used_timestamps: OrderedDict, timestamp: str) -> str:
     """resolve duplicate timestamps by appending a decimal 1234, 1234 -> 1234.1, 1234.2"""

     timestamp = timestamp.split('.')[0]
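Only the new signature and docstring of lowest_uniq_timestamp() appear in this hunk; its body is unchanged and not shown. A standalone sketch of the behavior the docstring describes (1234, 1234 -> 1234.1, 1234.2), not the repo's actual implementation:

from collections import OrderedDict

def lowest_uniq_timestamp_sketch(used_timestamps: OrderedDict, timestamp: str) -> str:
    """resolve duplicate timestamps by appending a decimal: 1234, 1234 -> 1234.1, 1234.2"""
    timestamp = timestamp.split('.')[0]
    if timestamp not in used_timestamps:
        return timestamp
    suffix = 1
    while '{}.{}'.format(timestamp, suffix) in used_timestamps:
        suffix += 1
    return '{}.{}'.format(timestamp, suffix)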

archivebox/logs.py

@@ -1,43 +1,44 @@
 import sys

 from datetime import datetime

+from schema import Link, ArchiveResult, RuntimeStats
 from config import ANSI, REPO_DIR, OUTPUT_DIR

 # globals are bad, mmkay
-_LAST_RUN_STATS = {
-    'skipped': 0,
-    'succeeded': 0,
-    'failed': 0,
-    'parsing_start_ts': 0,
-    'parsing_end_ts': 0,
-    'indexing_start_ts': 0,
-    'indexing_end_ts': 0,
-    'archiving_start_ts': 0,
-    'archiving_end_ts': 0,
-    'links': {},
-}
+_LAST_RUN_STATS = RuntimeStats(
+    skipped=0,
+    succeeded=0,
+    failed=0,
+    parse_start_ts=0,
+    parse_end_ts=0,
+    index_start_ts=0,
+    index_end_ts=0,
+    archiving_start_ts=0,
+    archiving_end_ts=0,
+)

-def pretty_path(path):
+def pretty_path(path: str) -> str:
     """convert paths like .../ArchiveBox/archivebox/../output/abc into output/abc"""
     return path.replace(REPO_DIR + '/', '')

 ### Parsing Stage

-def log_parsing_started(source_file):
+def log_parsing_started(source_file: str):
     start_ts = datetime.now()
-    _LAST_RUN_STATS['parse_start_ts'] = start_ts
+    _LAST_RUN_STATS.parse_start_ts = start_ts
     print('{green}[*] [{}] Parsing new links from output/sources/{}...{reset}'.format(
         start_ts.strftime('%Y-%m-%d %H:%M:%S'),
         source_file.rsplit('/', 1)[-1],
         **ANSI,
     ))

-def log_parsing_finished(num_new_links, parser_name):
+def log_parsing_finished(num_new_links: int, parser_name: str):
+    end_ts = datetime.now()
+    _LAST_RUN_STATS.parse_end_ts = end_ts
     print('    > Adding {} new links to index (parsed import as {})'.format(
         num_new_links,
         parser_name,
@@ -48,26 +49,26 @@ def log_parsing_finished(num_new_links, parser_name):

 def log_indexing_process_started():
     start_ts = datetime.now()
-    _LAST_RUN_STATS['index_start_ts'] = start_ts
+    _LAST_RUN_STATS.index_start_ts = start_ts
     print('{green}[*] [{}] Saving main index files...{reset}'.format(
         start_ts.strftime('%Y-%m-%d %H:%M:%S'),
         **ANSI,
     ))

-def log_indexing_started(out_dir, out_file):
+def log_indexing_started(out_dir: str, out_file: str):
     sys.stdout.write('    > {}/{}'.format(pretty_path(out_dir), out_file))

-def log_indexing_finished(out_dir, out_file):
+def log_indexing_finished(out_dir: str, out_file: str):
     end_ts = datetime.now()
-    _LAST_RUN_STATS['index_end_ts'] = end_ts
+    _LAST_RUN_STATS.index_end_ts = end_ts
     print('\r{}/{}'.format(pretty_path(out_dir), out_file))

 ### Archiving Stage

-def log_archiving_started(num_links, resume):
+def log_archiving_started(num_links: int, resume: float):
     start_ts = datetime.now()
-    _LAST_RUN_STATS['start_ts'] = start_ts
+    _LAST_RUN_STATS.archiving_start_ts = start_ts
     if resume:
         print('{green}[▶] [{}] Resuming archive updating for {} pages starting from {}...{reset}'.format(
             start_ts.strftime('%Y-%m-%d %H:%M:%S'),
@@ -82,9 +83,9 @@ def log_archiving_started(num_links, resume):
             **ANSI,
         ))

-def log_archiving_paused(num_links, idx, timestamp):
+def log_archiving_paused(num_links: int, idx: int, timestamp: str):
     end_ts = datetime.now()
-    _LAST_RUN_STATS['end_ts'] = end_ts
+    _LAST_RUN_STATS.archiving_end_ts = end_ts
     print()
     print('\n{lightyellow}[X] [{now}] Downloading paused on link {timestamp} ({idx}/{total}){reset}'.format(
         **ANSI,
@@ -100,10 +101,10 @@ def log_archiving_paused(num_links, idx, timestamp):
         timestamp,
     ))

-def log_archiving_finished(num_links):
+def log_archiving_finished(num_links: int):
     end_ts = datetime.now()
-    _LAST_RUN_STATS['end_ts'] = end_ts
+    _LAST_RUN_STATS.archiving_end_ts = end_ts
-    seconds = end_ts.timestamp() - _LAST_RUN_STATS['start_ts'].timestamp()
+    seconds = end_ts.timestamp() - _LAST_RUN_STATS.archiving_start_ts.timestamp()
     if seconds > 60:
         duration = '{0:.2f} min'.format(seconds / 60, 2)
     else:
@@ -116,13 +117,13 @@ def log_archiving_finished(num_links):
         duration,
         ANSI['reset'],
     ))
-    print('    - {} links skipped'.format(_LAST_RUN_STATS['skipped']))
-    print('    - {} links updated'.format(_LAST_RUN_STATS['succeeded']))
-    print('    - {} links had errors'.format(_LAST_RUN_STATS['failed']))
+    print('    - {} links skipped'.format(_LAST_RUN_STATS.skipped))
+    print('    - {} links updated'.format(_LAST_RUN_STATS.succeeded))
+    print('    - {} links had errors'.format(_LAST_RUN_STATS.failed))
     print('    To view your archive, open: {}/index.html'.format(OUTPUT_DIR.replace(REPO_DIR + '/', '')))

-def log_link_archiving_started(link_dir, link, is_new):
+def log_link_archiving_started(link_dir: str, link: Link, is_new: bool):
     # [*] [2019-03-22 13:46:45] "Log Structured Merge Trees - ben stopford"
     #     http://www.benstopford.com/2015/02/14/log-structured-merge-trees/
     #     > output/archive/1478739709
@@ -140,40 +141,34 @@ def log_link_archiving_started(link_dir, link, is_new):
         pretty_path(link_dir),
     ))

-def log_link_archiving_finished(link_dir, link, is_new, stats):
+def log_link_archiving_finished(link_dir: str, link: Link, is_new: bool, stats: dict):
     total = sum(stats.values())

     if stats['failed'] > 0 :
-        _LAST_RUN_STATS['failed'] += 1
+        _LAST_RUN_STATS.failed += 1
     elif stats['skipped'] == total:
-        _LAST_RUN_STATS['skipped'] += 1
+        _LAST_RUN_STATS.skipped += 1
     else:
-        _LAST_RUN_STATS['succeeded'] += 1
+        _LAST_RUN_STATS.succeeded += 1

-def log_archive_method_started(method):
+def log_archive_method_started(method: str):
     print('      > {}'.format(method))

-def log_archive_method_finished(result):
+def log_archive_method_finished(result: ArchiveResult):
     """quote the argument with whitespace in a command so the user can
        copy-paste the outputted string directly to run the cmd
     """
-    required_keys = ('cmd', 'pwd', 'output', 'status', 'start_ts', 'end_ts')
-    assert (
-        isinstance(result, dict)
-        and all(key in result for key in required_keys)
-        and ('output' in result)
-    ), 'Archive method did not return a valid result.'

     # Prettify CMD string and make it safe to copy-paste by quoting arguments
     quoted_cmd = ' '.join(
         '"{}"'.format(arg) if ' ' in arg else arg
-        for arg in result['cmd']
+        for arg in result.cmd
     )

-    if result['status'] == 'failed':
+    if result.status == 'failed':
         # Prettify error output hints string and limit to five lines
-        hints = getattr(result['output'], 'hints', None) or ()
+        hints = getattr(result.output, 'hints', None) or ()
         if hints:
             hints = hints if isinstance(hints, (list, tuple)) else hints.split('\n')
             hints = (
@@ -185,13 +180,13 @@ def log_archive_method_finished(result):
         output_lines = [
             '{}Failed:{} {}{}'.format(
                 ANSI['red'],
-                result['output'].__class__.__name__.replace('ArchiveError', ''),
-                result['output'],
+                result.output.__class__.__name__.replace('ArchiveError', ''),
+                result.output,
                 ANSI['reset']
             ),
             *hints,
             '{}Run to see full output:{}'.format(ANSI['lightred'], ANSI['reset']),
-            '    cd {};'.format(result['pwd']),
+            *(('    cd {};'.format(result.pwd),) if result.pwd else ()),
             '    {}'.format(quoted_cmd),
         ]
         print('\n'.join(
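_LAST_RUN_STATS is still mutated in place by these helpers (e.g. _LAST_RUN_STATS.failed += 1), which is why RuntimeStats is declared as a recordclass.RecordClass rather than a NamedTuple in schema.py. A small illustration of that difference, assuming the recordclass package is installed; FrozenStats and MutableStats are made-up names for comparison only:

from typing import NamedTuple
from recordclass import RecordClass

class FrozenStats(NamedTuple):      # NamedTuple fields are read-only
    failed: int

class MutableStats(RecordClass):    # same declaration style, but mutable
    failed: int

stats = MutableStats(failed=0)
stats.failed += 1                   # fine, mirrors what the log_* helpers do

frozen = FrozenStats(failed=0)
# frozen.failed += 1                # would raise AttributeError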

archivebox/parse.py

@@ -20,6 +20,7 @@ Link: {
 import re
 import json

+from typing import Tuple, List, IO, Iterable
 from datetime import datetime
 import xml.etree.ElementTree as etree
@@ -29,10 +30,11 @@ from util import (
     URL_REGEX,
     check_url_parsing_invariants,
     TimedProgress,
-    Link,
 )

-def parse_links(source_file):
+def parse_links(source_file: str) -> Tuple[List[Link], str]:
     """parse a list of URLs with their metadata from an
        RSS feed, bookmarks export, or text file
     """
@@ -74,7 +76,7 @@ def parse_links(source_file):

 ### Import Parser Functions

-def parse_pocket_html_export(html_file):
+def parse_pocket_html_export(html_file: IO[str]) -> Iterable[Link]:
     """Parse Pocket-format bookmarks export files (produced by getpocket.com/export/)"""

     html_file.seek(0)
@@ -98,7 +100,7 @@ def parse_pocket_html_export(html_file):
         }

-def parse_json_export(json_file):
+def parse_json_export(json_file: IO[str]) -> Iterable[Link]:
     """Parse JSON-format bookmarks export files (produced by pinboard.in/export/, or wallabag)"""

     json_file.seek(0)
@@ -150,7 +152,7 @@ def parse_json_export(json_file):
         }

-def parse_rss_export(rss_file):
+def parse_rss_export(rss_file: IO[str]) -> Iterable[Link]:
     """Parse RSS XML-format files into links"""

     rss_file.seek(0)
@@ -187,7 +189,7 @@ def parse_rss_export(rss_file):
         }

-def parse_shaarli_rss_export(rss_file):
+def parse_shaarli_rss_export(rss_file: IO[str]) -> Iterable[Link]:
     """Parse Shaarli-specific RSS XML-format files into links"""

     rss_file.seek(0)
@@ -224,7 +226,7 @@ def parse_shaarli_rss_export(rss_file):
         }

-def parse_netscape_html_export(html_file):
+def parse_netscape_html_export(html_file: IO[str]) -> Iterable[Link]:
     """Parse netscape-format bookmarks export files (produced by all browsers)"""

     html_file.seek(0)
@@ -247,7 +249,7 @@ def parse_netscape_html_export(html_file):
         }

-def parse_pinboard_rss_export(rss_file):
+def parse_pinboard_rss_export(rss_file: IO[str]) -> Iterable[Link]:
     """Parse Pinboard RSS feed files into links"""

     rss_file.seek(0)
@@ -278,7 +280,7 @@ def parse_pinboard_rss_export(rss_file):
         }

-def parse_medium_rss_export(rss_file):
+def parse_medium_rss_export(rss_file: IO[str]) -> Iterable[Link]:
     """Parse Medium RSS feed files into links"""

     rss_file.seek(0)
@@ -299,7 +301,7 @@ def parse_medium_rss_export(rss_file):
         }

-def parse_plain_text_export(text_file):
+def parse_plain_text_export(text_file: IO[str]) -> Iterable[Link]:
     """Parse raw links from each line in a text file"""

     text_file.seek(0)
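After this change every parser shares one shape: it takes an open text file (IO[str]) and yields Link dicts, where Link is still Dict[str, Any] per schema.py. A toy parser in that shape, not one from the repo; the emitted keys are an assumption based on the LinkDict fields in schema.py:

from datetime import datetime
from typing import IO, Iterable, Dict, Any

Link = Dict[str, Any]

def parse_one_url_per_line(text_file: IO[str]) -> Iterable[Link]:
    """yield a Link dict for every non-blank line in the file"""
    text_file.seek(0)
    for line in text_file:
        url = line.strip()
        if not url:
            continue
        yield {
            'url': url,
            'timestamp': str(datetime.now().timestamp()),
            'title': None,
            'tags': '',
            'sources': [getattr(text_file, 'name', 'stdin')],
        }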

archivebox/schema.py (new file, +55 lines)

@@ -0,0 +1,55 @@
from datetime import datetime
from typing import List, Dict, Any, Optional, Union, NamedTuple

from recordclass import RecordClass

Link = Dict[str, Any]


class ArchiveIndex(NamedTuple):
    info: str
    version: str
    source: str
    docs: str
    num_links: int
    updated: str
    links: List[Link]


class ArchiveResult(NamedTuple):
    cmd: List[str]
    pwd: Optional[str]
    cmd_version: Optional[str]
    output: Union[str, Exception, None]
    status: str
    start_ts: datetime
    end_ts: datetime
    duration: int


class ArchiveError(Exception):
    def __init__(self, message, hints=None):
        super().__init__(message)
        self.hints = hints


class LinkDict(NamedTuple):
    timestamp: str
    url: str
    title: Optional[str]
    tags: str
    sources: List[str]
    history: Dict[str, ArchiveResult]


class RuntimeStats(RecordClass):
    skipped: int
    succeeded: int
    failed: int
    parse_start_ts: datetime
    parse_end_ts: datetime
    index_start_ts: datetime
    index_end_ts: datetime
    archiving_start_ts: datetime
    archiving_end_ts: datetime
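ArchiveResult is the NamedTuple each archive method now returns; archive_link() appends result._asdict() to link['history'][method_name] (see the archive_methods.py hunk above). An illustrative construction with made-up values, assuming recordclass is installed so schema.py imports cleanly:

from datetime import datetime
from schema import ArchiveResult

start_ts = datetime.now()
result = ArchiveResult(
    cmd=['wget', '--mirror', 'https://example.com'],
    pwd='output/archive/1553592000',
    cmd_version='GNU Wget 1.20',
    output='example.com/index.html',
    status='succeeded',
    start_ts=start_ts,
    end_ts=datetime.now(),
    duration=0,
)

link = {'url': 'https://example.com', 'history': {'wget': []}}
link['history']['wget'].append(result._asdict())    # as archive_link() does above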

archivebox/util.py

@@ -3,11 +3,13 @@ import re
 import sys
 import time

-from typing import List, Dict, Any, Optional, Union
+from json import JSONEncoder
+from typing import List, Dict, Optional, Iterable
 from urllib.request import Request, urlopen
-from urllib.parse import urlparse, quote
-from decimal import Decimal
+from urllib.parse import urlparse, quote, unquote
+from html import escape, unescape
 from datetime import datetime
 from multiprocessing import Process
 from subprocess import (
@@ -19,6 +21,7 @@ from subprocess import (
     CalledProcessError,
 )

+from schema import Link
 from config import (
     ANSI,
     TERM_WIDTH,
@@ -38,7 +41,8 @@ from logs import pretty_path

 ### Parsing Helpers

-# Url Parsing: https://docs.python.org/3/library/urllib.parse.html#url-parsing
+# All of these are (str) -> str
+# shortcuts to: https://docs.python.org/3/library/urllib.parse.html#url-parsing
 scheme = lambda url: urlparse(url).scheme
 without_scheme = lambda url: urlparse(url)._replace(scheme='').geturl().strip('//')
 without_query = lambda url: urlparse(url)._replace(query='').geturl().strip('//')
@@ -54,6 +58,9 @@ base_url = lambda url: without_scheme(url)  # uniq base url used to dedupe links

 short_ts = lambda ts: ts.split('.')[0]
 urlencode = lambda s: quote(s, encoding='utf-8', errors='replace')
+urldecode = lambda s: unquote(s)
+htmlencode = lambda s: escape(s, quote=True)
+htmldecode = lambda s: unescape(s)

 URL_REGEX = re.compile(
     r'http[s]?://'          # start matching from allowed schemes
@@ -89,7 +96,7 @@ STATICFILE_EXTENSIONS = {
     # html, htm, shtml, xhtml, xml, aspx, php, cgi
 }

-Link = Dict[str, Any]

 ### Checks & Tests
@@ -105,7 +112,7 @@ def check_link_structure(link: Link) -> None:
         assert isinstance(key, str)
         assert isinstance(val, list), 'history must be a Dict[str, List], got: {}'.format(link['history'])

-def check_links_structure(links: List[Link]) -> None:
+def check_links_structure(links: Iterable[Link]) -> None:
     """basic sanity check invariants to make sure the data is valid"""
     assert isinstance(links, list)
     if links:
@@ -334,7 +341,7 @@ def derived_link_info(link: Link) -> dict:
     url = link['url']

-    to_date_str = lambda ts: datetime.fromtimestamp(Decimal(ts)).strftime('%Y-%m-%d %H:%M')
+    to_date_str = lambda ts: datetime.fromtimestamp(float(ts)).strftime('%Y-%m-%d %H:%M')

     extended_info = {
         **link,
@@ -582,3 +589,30 @@ def chrome_args(**options) -> List[str]:
         cmd_args.append('--user-data-dir={}'.format(options['CHROME_USER_DATA_DIR']))

     return cmd_args
+
+
+class ExtendedEncoder(JSONEncoder):
+    """
+    Extended json serializer that supports serializing several model
+    fields and objects
+    """
+    def default(self, obj):
+        cls_name = obj.__class__.__name__
+        if hasattr(obj, '_asdict'):
+            return obj._asdict()
+        elif isinstance(obj, bytes):
+            return obj.decode()
+        elif isinstance(obj, datetime):
+            return obj.isoformat()
+        elif isinstance(obj, Exception):
+            return '{}: {}'.format(obj.__class__.__name__, obj)
+        elif cls_name in ('dict_items', 'dict_keys', 'dict_values'):
+            return tuple(obj)
+        return JSONEncoder.default(self, obj)
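ExtendedEncoder is what index.py now passes as cls= to json.dump in place of default=str: anything with ._asdict() (the new NamedTuples), bytes, datetime, Exception instances, and dict views all become JSON-safe values. A quick demonstration with made-up data, assuming the ExtendedEncoder defined above is in scope:

import json
from datetime import datetime

payload = {
    'updated': datetime(2019, 3, 26, 5, 33),   # serialized as an ISO 8601 string
    'output': Exception('wget failed'),        # serialized as "Exception: wget failed"
    'raw': b'some bytes',                      # decoded to str
    'keys': {'a': 1}.keys(),                   # dict view becomes a JSON array
}
print(json.dumps(payload, indent=4, cls=ExtendedEncoder))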