From 76abc58135f43e49f645e5f5dfa860f47d69134a Mon Sep 17 00:00:00 2001
From: Nick Sweeting
Date: Tue, 26 Mar 2019 05:33:34 -0400
Subject: [PATCH] switch to strict type hints with NamedTuples instead of dicts

---
 archivebox/archive.py         | 15 +++---
 archivebox/archive_methods.py |  6 +--
 archivebox/index.py           | 34 ++++++++----
 archivebox/links.py           | 22 ++++----
 archivebox/logs.py            | 99 +++++++++++++++++------------------
 archivebox/parse.py           | 20 +++----
 archivebox/schema.py          | 55 +++++++++++++++++++
 archivebox/util.py            | 48 ++++++++++++++---
 8 files changed, 201 insertions(+), 98 deletions(-)
 create mode 100644 archivebox/schema.py

diff --git a/archivebox/archive.py b/archivebox/archive.py
index 5c0d195d..46ada292 100755
--- a/archivebox/archive.py
+++ b/archivebox/archive.py
@@ -12,6 +12,9 @@ Usage & Documentation:
 import os
 import sys
 
+from typing import List
+
+from schema import Link
 from links import links_after_timestamp
 from index import write_links_index, load_links_index
 from archive_methods import archive_link
@@ -50,7 +53,7 @@ def print_help():
     print("        ./archive 15109948213.123\n")
 
 
-def main(*args):
+def main(*args) -> List[Link]:
     if set(args).intersection(('-h', '--help', 'help')) or len(args) > 2:
         print_help()
         raise SystemExit(0)
@@ -95,10 +98,10 @@ def main(*args):
         import_path = save_remote_source(import_path)
 
     ### Run the main archive update process
-    update_archive_data(import_path=import_path, resume=resume)
+    return update_archive_data(import_path=import_path, resume=resume)
 
 
-def update_archive_data(import_path=None, resume=None):
+def update_archive_data(import_path: str=None, resume: float=None) -> List[Link]:
     """The main ArchiveBox entrancepoint. Everything starts here."""
 
     # Step 1: Load list of links from the existing index
@@ -111,14 +114,14 @@ def update_archive_data(import_path=None, resume=None):
     # Step 3: Run the archive methods for each link
     links = new_links if ONLY_NEW else all_links
     log_archiving_started(len(links), resume)
-    idx, link = 0, 0
+    idx, link = 0, {'timestamp': 0}
     try:
         for idx, link in enumerate(links_after_timestamp(links, resume)):
            link_dir = os.path.join(ARCHIVE_DIR, link['timestamp'])
            archive_link(link_dir, link)
 
     except KeyboardInterrupt:
-        log_archiving_paused(len(links), idx, link and link['timestamp'])
+        log_archiving_paused(len(links), idx, link['timestamp'])
         raise SystemExit(0)
 
     except:
@@ -130,7 +133,7 @@ def update_archive_data(import_path=None, resume=None):
     # Step 4: Re-write links index with updated titles, icons, and resources
     all_links, _ = load_links_index(out_dir=OUTPUT_DIR)
     write_links_index(out_dir=OUTPUT_DIR, links=all_links, finished=True)
-    
+    return all_links
 
 if __name__ == '__main__':
     main(*sys.argv)
diff --git a/archivebox/archive_methods.py b/archivebox/archive_methods.py
index 5f6f0e78..e214a909 100644
--- a/archivebox/archive_methods.py
+++ b/archivebox/archive_methods.py
@@ -1,10 +1,10 @@
 import os
-import json
 
-from typing import Union, Dict, List, Tuple, NamedTuple
+from typing import Dict, List, Tuple
 from collections import defaultdict
 from datetime import datetime
 
+from schema import Link, ArchiveResult, ArchiveError
 from index import (
     write_link_index,
     patch_links_index,
@@ -102,7 +102,7 @@ def archive_link(link_dir: str, link: Link, page=None) -> Link:
 
             link['history'][method_name].append(result._asdict())
             stats[result.status] += 1
-            log_archive_method_finished(result._asdict())
+            log_archive_method_finished(result)
         else:
             stats['skipped'] += 1
 
diff --git a/archivebox/index.py b/archivebox/index.py
index 503b82ad..3c31ac84 100644
--- a/archivebox/index.py
+++ b/archivebox/index.py
@@ -11,6 +11,7 @@ except ImportError:
     print('[X] Missing "distutils" python package. To install it, run:')
     print('    pip install distutils')
 
+from schema import Link, ArchiveIndex
 from config import (
     OUTPUT_DIR,
     TEMPLATES_DIR,
@@ -25,7 +26,7 @@ from util import (
     check_links_structure,
     wget_output_path,
     latest_output,
-    Link,
+    ExtendedEncoder,
 )
 from parse import parse_links
 from links import validate_links
@@ -56,6 +57,7 @@ def write_links_index(out_dir: str, links: List[Link], finished: bool=False) ->
     write_html_links_index(out_dir, links, finished=finished)
     log_indexing_finished(out_dir, 'index.html')
 
+
 def load_links_index(out_dir: str=OUTPUT_DIR, import_path: str=None) -> Tuple[List[Link], List[Link]]:
     """parse and load existing index with any new links from import_path merged in"""
 
@@ -82,6 +84,7 @@ def load_links_index(out_dir: str=OUTPUT_DIR, import_path: str=None) -> Tuple[Li
 
     return all_links, new_links
 
+
 def write_json_links_index(out_dir: str, links: List[Link]) -> None:
     """write the json link index to a given path"""
 
@@ -89,20 +92,24 @@ def write_json_links_index(out_dir: str, links: List[Link]) -> None:
 
     path = os.path.join(out_dir, 'index.json')
 
-    index_json = {
-        'info': 'ArchiveBox Index',
-        'help': 'https://github.com/pirate/ArchiveBox',
-        'version': GIT_SHA,
-        'num_links': len(links),
-        'updated': str(datetime.now().timestamp()),
-        'links': links,
-    }
+    index_json = ArchiveIndex(
+        info='ArchiveBox Index',
+        source='https://github.com/pirate/ArchiveBox',
+        docs='https://github.com/pirate/ArchiveBox/wiki',
+        version=GIT_SHA,
+        num_links=len(links),
+        updated=str(datetime.now().timestamp()),
+        links=links,
+    )
+
+    assert isinstance(index_json._asdict(), dict)
 
     with open(path, 'w', encoding='utf-8') as f:
-        json.dump(index_json, f, indent=4, default=str)
+        json.dump(index_json._asdict(), f, indent=4, cls=ExtendedEncoder)
 
     chmod_file(path)
 
+
 def parse_json_links_index(out_dir: str=OUTPUT_DIR) -> List[Link]:
     """parse a archive index json file and return the list of links"""
     index_path = os.path.join(out_dir, 'index.json')
@@ -114,6 +121,7 @@ def parse_json_links_index(out_dir: str=OUTPUT_DIR) -> List[Link]:
 
     return []
 
+
 def write_html_links_index(out_dir: str, links: List[Link], finished: bool=False) -> None:
     """write the html link index to a given path"""
 
@@ -208,6 +216,7 @@ def write_link_index(out_dir: str, link: Link) -> None:
     write_json_link_index(out_dir, link)
     write_html_link_index(out_dir, link)
 
+
 def write_json_link_index(out_dir: str, link: Link) -> None:
     """write a json file with some info about the link"""
 
@@ -215,10 +224,11 @@ def write_json_link_index(out_dir: str, link: Link) -> None:
     path = os.path.join(out_dir, 'index.json')
 
     with open(path, 'w', encoding='utf-8') as f:
-        json.dump(link, f, indent=4, default=str)
+        json.dump(link, f, indent=4, cls=ExtendedEncoder)
 
     chmod_file(path)
 
+
 def parse_json_link_index(out_dir: str) -> dict:
     """load the json link index from a given directory"""
     existing_index = os.path.join(out_dir, 'index.json')
@@ -229,6 +239,7 @@ def parse_json_link_index(out_dir: str) -> dict:
             return link_json
     return {}
 
+
 def load_json_link_index(out_dir: str, link: Link) -> Link:
     """check for an existing link archive in the given directory,
        and load+merge it into the given link dict
@@ -244,6 +255,7 @@ def load_json_link_index(out_dir: str, link: Link) -> Link:
     check_link_structure(link)
     return link
 
+
 def write_html_link_index(out_dir: str, link: Link) -> None:
     check_link_structure(link)
     with open(os.path.join(TEMPLATES_DIR, 'link_index.html'), 'r', encoding='utf-8') as f:
diff --git a/archivebox/links.py b/archivebox/links.py
index ba8057a5..41aceebc 100644
--- a/archivebox/links.py
+++ b/archivebox/links.py
@@ -19,17 +19,19 @@ Link {
 }
 """
 
-from html import unescape
+from typing import List, Iterable
 from collections import OrderedDict
 
+from schema import Link
 from util import (
     merge_links,
     check_link_structure,
     check_links_structure,
+    htmldecode,
 )
 
 
-def validate_links(links):
+def validate_links(links: Iterable[Link]) -> List[Link]:
     check_links_structure(links)
     links = archivable_links(links)     # remove chrome://, about:, mailto: etc.
     links = uniquefied_links(links)     # merge/dedupe duplicate timestamps & urls
@@ -40,13 +42,13 @@ def validate_links(links):
         raise SystemExit(1)
 
     for link in links:
-        link['title'] = unescape(link['title'].strip()) if link['title'] else None
+        link['title'] = htmldecode(link['title'].strip()) if link['title'] else None
         check_link_structure(link)
 
     return list(links)
 
 
-def archivable_links(links):
+def archivable_links(links: Iterable[Link]) -> Iterable[Link]:
     """remove chrome://, about:// or other schemed links that cant be archived"""
     return (
         link
@@ -55,12 +57,12 @@ def archivable_links(links):
     )
 
 
-def uniquefied_links(sorted_links):
+def uniquefied_links(sorted_links: Iterable[Link]) -> Iterable[Link]:
     """
     ensures that all non-duplicate links have monotonically increasing timestamps
     """
 
-    unique_urls = OrderedDict()
+    unique_urls: OrderedDict[str, Link] = OrderedDict()
 
     lower = lambda url: url.lower().strip()
     without_www = lambda url: url.replace('://www.', '://', 1)
@@ -73,7 +75,7 @@ def uniquefied_links(sorted_links):
             link = merge_links(unique_urls[fuzzy_url], link)
         unique_urls[fuzzy_url] = link
 
-    unique_timestamps = OrderedDict()
+    unique_timestamps: OrderedDict[str, Link] = OrderedDict()
     for link in unique_urls.values():
         link['timestamp'] = lowest_uniq_timestamp(unique_timestamps, link['timestamp'])
         unique_timestamps[link['timestamp']] = link
@@ -81,12 +83,12 @@ def uniquefied_links(sorted_links):
     return unique_timestamps.values()
 
 
-def sorted_links(links):
+def sorted_links(links: Iterable[Link]) -> Iterable[Link]:
     sort_func = lambda link: (link['timestamp'].split('.', 1)[0], link['url'])
     return sorted(links, key=sort_func, reverse=True)
 
 
-def links_after_timestamp(links, timestamp=None):
+def links_after_timestamp(links: Iterable[Link], timestamp: str=None) -> Iterable[Link]:
     if not timestamp:
         yield from links
         return
@@ -99,7 +101,7 @@ def links_after_timestamp(links, timestamp=None):
         print('Resume value and all timestamp values must be valid numbers.')
 
 
-def lowest_uniq_timestamp(used_timestamps, timestamp):
+def lowest_uniq_timestamp(used_timestamps: OrderedDict, timestamp: str) -> str:
     """resolve duplicate timestamps by appending a decimal 1234, 1234 -> 1234.1, 1234.2"""
 
     timestamp = timestamp.split('.')[0]
diff --git a/archivebox/logs.py b/archivebox/logs.py
index 4dc2c051..769257a6 100644
--- a/archivebox/logs.py
+++ b/archivebox/logs.py
@@ -1,43 +1,44 @@
 import sys
 from datetime import datetime
+
+from schema import Link, ArchiveResult, RuntimeStats
 from config import ANSI, REPO_DIR, OUTPUT_DIR
 
-
 # globals are bad, mmkay
-_LAST_RUN_STATS = {
-    'skipped': 0,
-    'succeeded': 0,
-    'failed': 0,
+_LAST_RUN_STATS = RuntimeStats(
+    skipped=0,
+    succeeded=0,
+    failed=0,
 
-    'parsing_start_ts': 0,
-    'parsing_end_ts': 0,
+    parse_start_ts=0,
+    parse_end_ts=0,
 
-    'indexing_start_ts': 0,
-    'indexing_end_ts': 0,
+    index_start_ts=0,
+    index_end_ts=0,
 
-    'archiving_start_ts': 0,
-    'archiving_end_ts': 0,
+    archiving_start_ts=0,
+    archiving_end_ts=0,
+)
 
-    'links': {},
-}
-
-def pretty_path(path):
+def pretty_path(path: str) -> str:
     """convert paths like .../ArchiveBox/archivebox/../output/abc into output/abc"""
     return path.replace(REPO_DIR + '/', '')
 
 
 ### Parsing Stage
 
-def log_parsing_started(source_file):
+def log_parsing_started(source_file: str):
     start_ts = datetime.now()
-    _LAST_RUN_STATS['parse_start_ts'] = start_ts
+    _LAST_RUN_STATS.parse_start_ts = start_ts
     print('{green}[*] [{}] Parsing new links from output/sources/{}...{reset}'.format(
         start_ts.strftime('%Y-%m-%d %H:%M:%S'),
         source_file.rsplit('/', 1)[-1],
         **ANSI,
     ))
 
-def log_parsing_finished(num_new_links, parser_name):
+def log_parsing_finished(num_new_links: int, parser_name: str):
+    end_ts = datetime.now()
+    _LAST_RUN_STATS.parse_end_ts = end_ts
     print('    > Adding {} new links to index (parsed import as {})'.format(
         num_new_links,
         parser_name,
@@ -48,26 +49,26 @@ def log_parsing_finished(num_new_links, parser_name):
 
 def log_indexing_process_started():
     start_ts = datetime.now()
-    _LAST_RUN_STATS['index_start_ts'] = start_ts
+    _LAST_RUN_STATS.index_start_ts = start_ts
     print('{green}[*] [{}] Saving main index files...{reset}'.format(
         start_ts.strftime('%Y-%m-%d %H:%M:%S'),
         **ANSI,
     ))
 
-def log_indexing_started(out_dir, out_file):
+def log_indexing_started(out_dir: str, out_file: str):
     sys.stdout.write('    > {}/{}'.format(pretty_path(out_dir), out_file))
 
-def log_indexing_finished(out_dir, out_file):
+def log_indexing_finished(out_dir: str, out_file: str):
     end_ts = datetime.now()
-    _LAST_RUN_STATS['index_end_ts'] = end_ts
+    _LAST_RUN_STATS.index_end_ts = end_ts
     print('\r    √ {}/{}'.format(pretty_path(out_dir), out_file))
 
 
 ### Archiving Stage
 
-def log_archiving_started(num_links, resume):
+def log_archiving_started(num_links: int, resume: float):
     start_ts = datetime.now()
-    _LAST_RUN_STATS['start_ts'] = start_ts
+    _LAST_RUN_STATS.archiving_start_ts = start_ts
     if resume:
         print('{green}[▶] [{}] Resuming archive updating for {} pages starting from {}...{reset}'.format(
             start_ts.strftime('%Y-%m-%d %H:%M:%S'),
@@ -82,9 +83,9 @@ def log_archiving_started(num_links, resume):
             **ANSI,
         ))
 
-def log_archiving_paused(num_links, idx, timestamp):
+def log_archiving_paused(num_links: int, idx: int, timestamp: str):
     end_ts = datetime.now()
-    _LAST_RUN_STATS['end_ts'] = end_ts
+    _LAST_RUN_STATS.archiving_end_ts = end_ts
     print()
     print('\n{lightyellow}[X] [{now}] Downloading paused on link {timestamp} ({idx}/{total}){reset}'.format(
         **ANSI,
@@ -100,10 +101,10 @@ def log_archiving_paused(num_links, idx, timestamp):
         timestamp,
     ))
 
-def log_archiving_finished(num_links):
+def log_archiving_finished(num_links: int):
     end_ts = datetime.now()
-    _LAST_RUN_STATS['end_ts'] = end_ts
-    seconds = end_ts.timestamp() - _LAST_RUN_STATS['start_ts'].timestamp()
+    _LAST_RUN_STATS.archiving_end_ts = end_ts
+    seconds = end_ts.timestamp() - _LAST_RUN_STATS.archiving_start_ts.timestamp()
     if seconds > 60:
         duration = '{0:.2f} min'.format(seconds / 60, 2)
     else:
@@ -116,13 +117,13 @@ def log_archiving_finished(num_links):
         duration,
         ANSI['reset'],
     ))
-    print('    - {} links skipped'.format(_LAST_RUN_STATS['skipped']))
-    print('    - {} links updated'.format(_LAST_RUN_STATS['succeeded']))
-    print('    - {} links had errors'.format(_LAST_RUN_STATS['failed']))
+    print('    - {} links skipped'.format(_LAST_RUN_STATS.skipped))
+    print('    - {} links updated'.format(_LAST_RUN_STATS.succeeded))
+    print('    - {} links had errors'.format(_LAST_RUN_STATS.failed))
     print('    To view your archive, open: {}/index.html'.format(OUTPUT_DIR.replace(REPO_DIR + '/', '')))
 
 
-def log_link_archiving_started(link_dir, link, is_new):
+def log_link_archiving_started(link_dir: str, link: Link, is_new: bool):
     # [*] [2019-03-22 13:46:45] "Log Structured Merge Trees - ben stopford"
     #     http://www.benstopford.com/2015/02/14/log-structured-merge-trees/
     #     > output/archive/1478739709
@@ -140,40 +141,34 @@ def log_link_archiving_started(link_dir, link, is_new):
         pretty_path(link_dir),
     ))
 
-def log_link_archiving_finished(link_dir, link, is_new, stats):
+def log_link_archiving_finished(link_dir: str, link: Link, is_new: bool, stats: dict):
     total = sum(stats.values())
 
     if stats['failed'] > 0 :
-        _LAST_RUN_STATS['failed'] += 1
+        _LAST_RUN_STATS.failed += 1
     elif stats['skipped'] == total:
-        _LAST_RUN_STATS['skipped'] += 1
+        _LAST_RUN_STATS.skipped += 1
     else:
-        _LAST_RUN_STATS['succeeded'] += 1
+        _LAST_RUN_STATS.succeeded += 1
 
-def log_archive_method_started(method):
+def log_archive_method_started(method: str):
     print('      > {}'.format(method))
 
+
-def log_archive_method_finished(result):
+def log_archive_method_finished(result: ArchiveResult):
     """quote the argument with whitespace in a command so the user can
       copy-paste the outputted string directly to run the cmd
    """
 
-    required_keys = ('cmd', 'pwd', 'output', 'status', 'start_ts', 'end_ts')
-    assert (
-        isinstance(result, dict)
-        and all(key in result for key in required_keys)
-        and ('output' in result)
-    ), 'Archive method did not return a valid result.'
-
     # Prettify CMD string and make it safe to copy-paste by quoting arguments
     quoted_cmd = ' '.join(
         '"{}"'.format(arg) if ' ' in arg else arg
-        for arg in result['cmd']
+        for arg in result.cmd
     )
 
-    if result['status'] == 'failed':
+    if result.status == 'failed':
         # Prettify error output hints string and limit to five lines
-        hints = getattr(result['output'], 'hints', None) or ()
+        hints = getattr(result.output, 'hints', None) or ()
         if hints:
             hints = hints if isinstance(hints, (list, tuple)) else hints.split('\n')
             hints = (
@@ -185,13 +180,13 @@ def log_archive_method_finished(result):
         output_lines = [
             '{}Failed:{} {}{}'.format(
                 ANSI['red'],
-                result['output'].__class__.__name__.replace('ArchiveError', ''),
-                result['output'],
+                result.output.__class__.__name__.replace('ArchiveError', ''),
+                result.output,
                 ANSI['reset']
             ),
             *hints,
             '{}Run to see full output:{}'.format(ANSI['lightred'], ANSI['reset']),
-            '    cd {};'.format(result['pwd']),
+            *(('    cd {};'.format(result.pwd),) if result.pwd else ()),
             '    {}'.format(quoted_cmd),
         ]
         print('\n'.join(
diff --git a/archivebox/parse.py b/archivebox/parse.py
index baaa447e..3da3cb35 100644
--- a/archivebox/parse.py
+++ b/archivebox/parse.py
@@ -20,6 +20,7 @@ Link: {
 import re
 import json
 
+from typing import Tuple, List, IO, Iterable
 from datetime import datetime
 import xml.etree.ElementTree as etree
 
@@ -29,10 +30,11 @@ from util import (
     URL_REGEX,
     check_url_parsing_invariants,
     TimedProgress,
+    Link,
 )
 
 
-def parse_links(source_file):
+def parse_links(source_file: str) -> Tuple[List[Link], str]:
     """parse a list of URLs with their metadata from an
       RSS feed, bookmarks export, or text file
    """
@@ -74,7 +76,7 @@ def parse_links(source_file):
 
 ### Import Parser Functions
 
-def parse_pocket_html_export(html_file):
+def parse_pocket_html_export(html_file: IO[str]) -> Iterable[Link]:
     """Parse Pocket-format bookmarks export files (produced by getpocket.com/export/)"""
 
     html_file.seek(0)
@@ -98,7 +100,7 @@ def parse_pocket_html_export(html_file):
         }
 
 
-def parse_json_export(json_file):
+def parse_json_export(json_file: IO[str]) -> Iterable[Link]:
     """Parse JSON-format bookmarks export files (produced by pinboard.in/export/, or wallabag)"""
 
     json_file.seek(0)
@@ -150,7 +152,7 @@ def parse_json_export(json_file):
         }
 
 
-def parse_rss_export(rss_file):
+def parse_rss_export(rss_file: IO[str]) -> Iterable[Link]:
     """Parse RSS XML-format files into links"""
 
     rss_file.seek(0)
@@ -187,7 +189,7 @@ def parse_rss_export(rss_file):
         }
 
 
-def parse_shaarli_rss_export(rss_file):
+def parse_shaarli_rss_export(rss_file: IO[str]) -> Iterable[Link]:
     """Parse Shaarli-specific RSS XML-format files into links"""
 
     rss_file.seek(0)
@@ -224,7 +226,7 @@ def parse_shaarli_rss_export(rss_file):
         }
 
 
-def parse_netscape_html_export(html_file):
+def parse_netscape_html_export(html_file: IO[str]) -> Iterable[Link]:
     """Parse netscape-format bookmarks export files (produced by all browsers)"""
 
     html_file.seek(0)
@@ -247,7 +249,7 @@ def parse_netscape_html_export(html_file):
         }
 
 
-def parse_pinboard_rss_export(rss_file):
+def parse_pinboard_rss_export(rss_file: IO[str]) -> Iterable[Link]:
     """Parse Pinboard RSS feed files into links"""
 
     rss_file.seek(0)
@@ -278,7 +280,7 @@ def parse_pinboard_rss_export(rss_file):
         }
 
 
-def parse_medium_rss_export(rss_file):
+def parse_medium_rss_export(rss_file: IO[str]) -> Iterable[Link]:
     """Parse Medium RSS feed files into links"""
 
     rss_file.seek(0)
@@ -299,7 +301,7 @@ def parse_medium_rss_export(rss_file):
         }
 
 
-def parse_plain_text_export(text_file):
+def parse_plain_text_export(text_file: IO[str]) -> Iterable[Link]:
     """Parse raw links from each line in a text file"""
 
     text_file.seek(0)
diff --git a/archivebox/schema.py b/archivebox/schema.py
new file mode 100644
index 00000000..719298e8
--- /dev/null
+++ b/archivebox/schema.py
@@ -0,0 +1,55 @@
+from datetime import datetime
+
+from typing import List, Dict, Any, Optional, Union, NamedTuple
+from recordclass import RecordClass
+
+Link = Dict[str, Any]
+
+class ArchiveIndex(NamedTuple):
+    info: str
+    version: str
+    source: str
+    docs: str
+    num_links: int
+    updated: str
+    links: List[Link]
+
+class ArchiveResult(NamedTuple):
+    cmd: List[str]
+    pwd: Optional[str]
+    cmd_version: Optional[str]
+    output: Union[str, Exception, None]
+    status: str
+    start_ts: datetime
+    end_ts: datetime
+    duration: int
+
+
+class ArchiveError(Exception):
+    def __init__(self, message, hints=None):
+        super().__init__(message)
+        self.hints = hints
+
+
+class LinkDict(NamedTuple):
+    timestamp: str
+    url: str
+    title: Optional[str]
+    tags: str
+    sources: List[str]
+    history: Dict[str, ArchiveResult]
+
+
+class RuntimeStats(RecordClass):
+    skipped: int
+    succeeded: int
+    failed: int
+
+    parse_start_ts: datetime
+    parse_end_ts: datetime
+
+    index_start_ts: datetime
+    index_end_ts: datetime
+
+    archiving_start_ts: datetime
+    archiving_end_ts: datetime
diff --git a/archivebox/util.py b/archivebox/util.py
index 1835bd16..2c2c6a05 100644
--- a/archivebox/util.py
+++ b/archivebox/util.py
@@ -3,11 +3,13 @@
 import re
 import sys
 import time
 
-from typing import List, Dict, Any, Optional, Union
+from json import JSONEncoder
+
+from typing import List, Dict, Optional, Iterable
 from urllib.request import Request, urlopen
-from urllib.parse import urlparse, quote
-from decimal import Decimal
+from urllib.parse import urlparse, quote, unquote
+from html import escape, unescape
 from datetime import datetime
 from multiprocessing import Process
 from subprocess import (
@@ -19,6 +21,7 @@ from subprocess import (
     CalledProcessError,
 )
 
+from schema import Link
 from config import (
     ANSI,
     TERM_WIDTH,
@@ -38,7 +41,8 @@ from logs import pretty_path
 
 ### Parsing Helpers
 
-# Url Parsing: https://docs.python.org/3/library/urllib.parse.html#url-parsing
+# All of these are (str) -> str
+# shortcuts to: https://docs.python.org/3/library/urllib.parse.html#url-parsing
 scheme = lambda url: urlparse(url).scheme
 without_scheme = lambda url: urlparse(url)._replace(scheme='').geturl().strip('//')
 without_query = lambda url: urlparse(url)._replace(query='').geturl().strip('//')
@@ -54,6 +58,9 @@ base_url = lambda url: without_scheme(url)  # uniq base url used to dedupe links
 
 short_ts = lambda ts: ts.split('.')[0]
 urlencode = lambda s: quote(s, encoding='utf-8', errors='replace')
+urldecode = lambda s: unquote(s)
+htmlencode = lambda s: escape(s, quote=True)
+htmldecode = lambda s: unescape(s)
 
 URL_REGEX = re.compile(
     r'http[s]?://'          # start matching from allowed schemes
@@ -89,7 +96,7 @@ STATICFILE_EXTENSIONS = {
     # html, htm, shtml, xhtml, xml, aspx, php, cgi
 }
 
-Link = Dict[str, Any]
+
 
 ### Checks & Tests
 
@@ -105,7 +112,7 @@ def check_link_structure(link: Link) -> None:
             assert isinstance(key, str)
             assert isinstance(val, list), 'history must be a Dict[str, List], got: {}'.format(link['history'])
 
-def check_links_structure(links: List[Link]) -> None:
+def check_links_structure(links: Iterable[Link]) -> None:
     """basic sanity check invariants to make sure the data is valid"""
     assert isinstance(links, list)
     if links:
@@ -334,7 +341,7 @@ def derived_link_info(link: Link) -> dict:
 
     url = link['url']
 
-    to_date_str = lambda ts: datetime.fromtimestamp(Decimal(ts)).strftime('%Y-%m-%d %H:%M')
+    to_date_str = lambda ts: datetime.fromtimestamp(float(ts)).strftime('%Y-%m-%d %H:%M')
 
     extended_info = {
         **link,
@@ -582,3 +589,30 @@ def chrome_args(**options) -> List[str]:
         cmd_args.append('--user-data-dir={}'.format(options['CHROME_USER_DATA_DIR']))
 
     return cmd_args
+
+
+class ExtendedEncoder(JSONEncoder):
+    """
+    Extended json serializer that supports serializing several model
+    fields and objects
+    """
+
+    def default(self, obj):
+        cls_name = obj.__class__.__name__
+
+        if hasattr(obj, '_asdict'):
+            return obj._asdict()
+
+        elif isinstance(obj, bytes):
+            return obj.decode()
+
+        elif isinstance(obj, datetime):
+            return obj.isoformat()
+
+        elif isinstance(obj, Exception):
+            return '{}: {}'.format(obj.__class__.__name__, obj)
+
+        elif cls_name in ('dict_items', 'dict_keys', 'dict_values'):
+            return tuple(obj)
+
+        return JSONEncoder.default(self, obj)
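
Usage sketch (not part of the diff above): the snippet below illustrates how the new pieces are meant to fit together — an ArchiveResult NamedTuple is turned into a plain dict with ._asdict() and then serialized with the new ExtendedEncoder, which is exactly the path write_json_link_index() and write_json_links_index() take after this patch. It re-declares trimmed copies of ArchiveResult and ExtendedEncoder from the diff so it runs standalone on the standard library; the wget command, paths, and values are invented placeholders, and RuntimeStats (which depends on the third-party recordclass package) is left out.

# Standalone sketch: trimmed copies of the patch's ArchiveResult and ExtendedEncoder.
# The command, paths, and values below are hypothetical placeholders, not real output.
import json
from datetime import datetime
from json import JSONEncoder
from typing import List, NamedTuple, Optional, Union


class ArchiveResult(NamedTuple):
    cmd: List[str]
    pwd: Optional[str]
    cmd_version: Optional[str]
    output: Union[str, Exception, None]
    status: str
    start_ts: datetime
    end_ts: datetime
    duration: int


class ExtendedEncoder(JSONEncoder):
    """Same dispatch as the encoder added to archivebox/util.py above."""

    def default(self, obj):
        if hasattr(obj, '_asdict'):          # NamedTuples become plain dicts
            return obj._asdict()
        elif isinstance(obj, bytes):
            return obj.decode()
        elif isinstance(obj, datetime):
            return obj.isoformat()
        elif isinstance(obj, Exception):     # e.g. ArchiveError -> readable string
            return '{}: {}'.format(obj.__class__.__name__, obj)
        return JSONEncoder.default(self, obj)


start_ts = datetime.now()
result = ArchiveResult(
    cmd=['wget', '--mirror', 'https://example.com'],
    pwd='output/archive/1553590000',
    cmd_version='1.20',
    output=Exception('non-zero exit status 8'),  # serialized as a string by the encoder
    status='failed',
    start_ts=start_ts,
    end_ts=datetime.now(),
    duration=1,
)

# archive_link() appends result._asdict() to link['history'][method_name];
# the index writers then persist it via json.dump(..., cls=ExtendedEncoder).
print(json.dumps(result._asdict(), indent=4, cls=ExtendedEncoder))

One design note on the split visible in schema.py: RuntimeStats is a mutable recordclass.RecordClass, presumably because logs.py increments its counters and overwrites its timestamps in place, while the immutable NamedTuples are reserved for records that are built once and then serialized.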