1
0
Fork 0
mirror of synced 2024-06-14 00:15:03 +12:00
ArchiveBox/archivebox/logs.py

198 lines
6.5 KiB
Python
Raw Normal View History

2019-03-21 18:28:12 +13:00
import sys
from datetime import datetime
from typing import Optional
from schema import Link, ArchiveResult, RuntimeStats
from config import ANSI, REPO_DIR, OUTPUT_DIR
2019-03-21 18:28:12 +13:00
# Module-level accumulator shared by the log_* functions in this file.
# Every field starts at 0; the *_ts fields are later overwritten with
# datetime objects as each stage starts or finishes.
_LAST_RUN_STATS = RuntimeStats(
    # per-link outcome counters
    skipped=0, succeeded=0, failed=0,
    # parsing stage
    parse_start_ts=0, parse_end_ts=0,
    # indexing stage
    index_start_ts=0, index_end_ts=0,
    # archiving stage
    archiving_start_ts=0, archiving_end_ts=0,
)
2019-03-21 18:28:12 +13:00
def pretty_path(path: str) -> str:
    """Shorten a path for display by dropping the repository directory prefix.

    Every occurrence of ``REPO_DIR + '/'`` inside *path* is removed, so a path
    under the repository such as ``<REPO_DIR>/output/abc`` becomes
    ``output/abc``. Paths that do not contain the prefix are returned as-is.
    """
    repo_prefix = REPO_DIR + '/'
    return path.replace(repo_prefix, '')
2019-03-23 14:38:24 +13:00
### Parsing Stage
2019-03-21 18:28:12 +13:00
def log_parsing_started(source_file: str):
    """Record the parse start time and announce the source file being parsed.

    Only the final path component of *source_file* is shown in the message.
    """
    started_at = datetime.now()
    _LAST_RUN_STATS.parse_start_ts = started_at

    stamp = started_at.strftime('%Y-%m-%d %H:%M:%S')
    file_name = source_file.rsplit('/', 1)[-1]
    template = '{green}[*] [{}] Parsing new links from output/sources/{}...{reset}'
    print(template.format(stamp, file_name, **ANSI))
def log_parsing_finished(num_parsed: int, num_new_links: int, parser_name: str):
    """Record the parse end time and print a two-line parsing summary.

    The first line reports how many links were parsed and by which parser;
    the second reports how many new links are added and the collection
    directory (``OUTPUT_DIR``).
    """
    finished_at = datetime.now()
    _LAST_RUN_STATS.parse_end_ts = finished_at

    parsed_line = ' > Parsed {} links as {}'.format(num_parsed, parser_name)
    print(parsed_line)

    added_line = ' > Adding {} new links to collection: {}'.format(num_new_links, OUTPUT_DIR)
    print(added_line)
2019-03-23 14:38:24 +13:00
### Indexing Stage
2019-03-23 08:09:39 +13:00
def log_indexing_process_started():
    """Record the indexing start time and announce that index files are being saved."""
    started_at = datetime.now()
    _LAST_RUN_STATS.index_start_ts = started_at

    stamp = started_at.strftime('%Y-%m-%d %H:%M:%S')
    template = '{green}[*] [{}] Saving main index files...{reset}'
    print(template.format(stamp, **ANSI))
def log_indexing_started(out_dir: str, out_file: str):
    """Write the index file currently being saved to stdout.

    No trailing newline is written, so the cursor stays on the same line.
    """
    shown_dir = pretty_path(out_dir)
    progress = ' > {}/{}'.format(shown_dir, out_file)
    sys.stdout.write(progress)
def log_indexing_finished(out_dir: str, out_file: str):
    """Record the indexing end time and print the saved index file path.

    The line begins with a carriage return so it is drawn from the start of
    the current terminal line.
    """
    finished_at = datetime.now()
    _LAST_RUN_STATS.index_end_ts = finished_at

    shown_dir = pretty_path(out_dir)
    print('\r{}/{}'.format(shown_dir, out_file))
2019-03-21 18:28:12 +13:00
2019-03-23 14:38:24 +13:00
### Archiving Stage
def log_archiving_started(num_links: int, resume: Optional[float]):
    """Record the archiving start time and announce the start of an archive run.

    When *resume* is truthy the message says archiving is resuming from that
    value; otherwise (``None`` or ``0``) it announces a regular update of
    *num_links* pages.
    """
    started_at = datetime.now()
    _LAST_RUN_STATS.archiving_start_ts = started_at
    stamp = started_at.strftime('%Y-%m-%d %H:%M:%S')

    if not resume:
        print('{green}[▶] [{}] Updating content for {} pages in archive...{reset}'.format(
            stamp,
            num_links,
            **ANSI,
        ))
        return

    print('{green}[▶] [{}] Resuming archive updating for {} pages starting from {}...{reset}'.format(
        stamp,
        num_links,
        resume,
        **ANSI,
    ))
def log_archiving_paused(num_links: int, idx: int, timestamp: str):
    """Record the archiving end time and print how to resume a paused run.

    *idx* is displayed as ``idx + 1`` out of *num_links*; *timestamp*
    identifies the link where downloading stopped and is echoed in the
    suggested resume command.
    """
    paused_at = datetime.now()
    _LAST_RUN_STATS.archiving_end_ts = paused_at

    print()
    headline = '\n{lightyellow}[X] [{now}] Downloading paused on link {timestamp} ({idx}/{total}){reset}'.format(
        now=paused_at.strftime('%Y-%m-%d %H:%M:%S'),
        timestamp=timestamp,
        idx=idx + 1,
        total=num_links,
        **ANSI,
    )
    print(headline)

    index_line = ' {}/index.html'.format(OUTPUT_DIR)
    resume_line = ' archivebox {}'.format(timestamp)
    print(' To view your archive, open:')
    print(index_line)
    print(' Continue archiving where you left off by running:')
    print(resume_line)
2019-03-21 18:28:12 +13:00
def log_archiving_finished(num_links: int):
    """Record the archiving end time and print the end-of-run summary.

    The summary shows the elapsed time since the recorded archiving start
    (in minutes when longer than 60 seconds, otherwise in seconds), the
    skipped / updated / errored link counters accumulated in
    ``_LAST_RUN_STATS``, and where to open the archive index.
    """
    end_ts = datetime.now()
    _LAST_RUN_STATS.archiving_end_ts = end_ts

    # archiving_start_ts is initialised to 0 and only becomes a datetime once
    # log_archiving_started() has run; fall back to end_ts (zero duration)
    # instead of crashing on ``(0).timestamp()``.
    start_ts = _LAST_RUN_STATS.archiving_start_ts or end_ts
    seconds = end_ts.timestamp() - start_ts.timestamp()

    if seconds > 60:
        duration = '{0:.2f} min'.format(seconds / 60)
    else:
        duration = '{0:.2f} sec'.format(seconds)

    print('{}[√] [{}] Update of {} pages complete ({}){}'.format(
        ANSI['green'],
        end_ts.strftime('%Y-%m-%d %H:%M:%S'),
        num_links,
        duration,
        ANSI['reset'],
    ))
    print(' - {} links skipped'.format(_LAST_RUN_STATS.skipped))
    print(' - {} links updated'.format(_LAST_RUN_STATS.succeeded))
    print(' - {} links had errors'.format(_LAST_RUN_STATS.failed))
    print(' To view your archive, open:')
    print(' {}/index.html'.format(OUTPUT_DIR))
2019-03-23 14:38:24 +13:00
def log_link_archiving_started(link_dir: str, link: Link, is_new: bool):
    """Print the three-line header shown when archiving of one link begins.

    The lines are: a timestamped title line (falling back to
    ``link.base_url`` when the link has no title), the link URL, and the
    display form of *link_dir*. New links get a green ``+`` symbol and a
    ``>`` marker; existing links get the ``black`` color and empty markers.
    """
    if is_new:
        color_name, symbol, dir_marker = 'green', '+', '>'
    else:
        color_name, symbol, dir_marker = 'black', '', ''

    title_line = '\n[{symbol_color}{symbol}{reset}] [{symbol_color}{now}{reset}] "{title}"'.format(
        symbol_color=ANSI[color_name],
        symbol=symbol,
        now=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        title=link.title or link.base_url,
        **ANSI,
    )
    print(title_line)

    url_line = ' {blue}{url}{reset}'.format(url=link.url, **ANSI)
    print(url_line)

    dir_line = ' {} {}'.format(dir_marker, pretty_path(link_dir))
    print(dir_line)
def log_link_archiving_finished(link_dir: str, link: Link, is_new: bool, stats: dict):
    """Fold one link's per-method results into the run-wide counters.

    *stats* maps outcome names to counts and must contain ``'failed'`` and
    ``'skipped'``. A link counts as failed if any method failed, as skipped
    if every counted method was skipped, and as succeeded otherwise.
    *link_dir*, *link* and *is_new* are accepted but not used here.
    """
    total = sum(stats.values())

    if stats['failed'] > 0:
        outcome = 'failed'
    elif stats['skipped'] == total:
        outcome = 'skipped'
    else:
        outcome = 'succeeded'

    # Bump exactly one of the failed / skipped / succeeded counters.
    setattr(_LAST_RUN_STATS, outcome, getattr(_LAST_RUN_STATS, outcome) + 1)
2019-03-23 14:38:24 +13:00
def log_archive_method_started(method: str):
    """Print the name of the archive method that is starting."""
    line = ' > {}'.format(method)
    print(line)
def log_archive_method_finished(result: ArchiveResult):
    """Print a failure report for an archive method, if it failed.

    Nothing is printed unless ``result.status == 'failed'``. For a failure
    the report contains, in order:

    * a ``Failed:`` line with the error class name (minus any
      ``ArchiveError`` part) and the error itself (``result.output``),
    * up to five hint lines taken from ``result.output.hints`` when present
      (a list/tuple of lines or a newline-separated string),
    * a copy-pasteable command to reproduce the failure: an optional
      ``cd <pwd>;`` followed by ``result.cmd`` with every argument
      shell-quoted.
    """
    # Local import keeps this block self-contained; shlex is standard library.
    import shlex

    if result.status != 'failed':
        return

    # shlex.quote leaves plain arguments untouched and single-quotes anything
    # containing spaces or shell metacharacters (&, ?, $, quotes, ...), so the
    # printed command is safe to paste into a shell. str() tolerates non-str
    # arguments.
    quoted_cmd = ' '.join(shlex.quote(str(arg)) for arg in result.cmd)

    # Prettify the error hints and limit them to the first five lines.
    hints = getattr(result.output, 'hints', None) or ()
    if hints:
        hint_lines = hints if isinstance(hints, (list, tuple)) else hints.split('\n')
        hints = [
            ' {}{}{}'.format(ANSI['lightyellow'], line.strip(), ANSI['reset'])
            for line in hint_lines[:5] if line.strip()
        ]

    # Collect the report lines; each one is prefixed with indentation below.
    output_lines = [
        '{}Failed:{} {}{}'.format(
            ANSI['red'],
            result.output.__class__.__name__.replace('ArchiveError', ''),
            result.output,
            ANSI['reset'],
        ),
        *hints,
        '{}Run to see full output:{}'.format(ANSI['lightred'], ANSI['reset']),
        *((' cd {};'.format(shlex.quote(str(result.pwd))),) if result.pwd else ()),
        ' {}'.format(quoted_cmd),
    ]
    print('\n'.join(
        ' {}'.format(line)
        for line in output_lines
        if line
    ))