
better logging during long output

Nick Sweeting 2019-03-22 15:09:39 -04:00
parent 1c5732d5c6
commit bd9f3e313f
6 changed files with 63 additions and 72 deletions

View file

@@ -1,6 +1,5 @@
 import os
-from functools import wraps
 from collections import defaultdict
 from datetime import datetime
@@ -50,10 +49,9 @@ from util import (
     run, PIPE, DEVNULL
 )
 from logs import (
-    _LAST_RUN_STATS,
     log_link_archiving_started,
     log_link_archiving_finished,
-    log_archive_method_starting,
+    log_archive_method_started,
     log_archive_method_finished,
 )
@@ -94,6 +92,7 @@ def archive_link(link_dir, link):
                 link['history'][method_name] = []
             if method_name not in link['latest']:
                 link['latest'][method_name] = None
+
             if not should_run(link_dir, link):
                 continue
@@ -101,7 +100,7 @@ def archive_link(link_dir, link):
             skipped_entirely = False
             print()
-            log_archive_method_starting(method_name)
+            log_archive_method_started(method_name)
             result = method_function(link_dir, link)
             log_archive_method_finished(result)
@@ -109,11 +108,6 @@ def archive_link(link_dir, link):
             if result['status'] == 'succeeded':
                 link['latest'][method_name] = result['output']
 
-            if result['status'] != 'skipped':
-                made_changes = True
-
-            _LAST_RUN_STATS[result['status']] += 1
-
             write_link_index(link_dir, link)
             patch_links_index(link)
@@ -126,6 +120,7 @@ def archive_link(link_dir, link):
     return link
 
 
+### Archive Method Functions
 
 def should_fetch_title(link_dir, link):
     # if link already has valid title, skip it
@@ -428,8 +423,8 @@ def should_fetch_git(link_dir, link):
         return False
 
     is_clonable_url = (
-        domain(link['url']) in GIT_DOMAINS
-        or extension(link['url']) == 'git'
+        (domain(link['url']) in GIT_DOMAINS)
+        or (extension(link['url']) == 'git')
     )
     if not is_clonable_url:
         return False
@@ -477,6 +472,7 @@ def fetch_git(link_dir, link, timeout=TIMEOUT):
         **timer.stats,
     }
 
+
 def should_fetch_media(link_dir, link):
     if is_static_file(link['url']):
         return False
@@ -547,21 +543,6 @@ def fetch_media(link_dir, link, timeout=MEDIA_TIMEOUT):
         **timer.stats,
     }
 
-def parse_archive_dot_org_response(response):
-    # Parse archive.org response headers
-    headers = defaultdict(list)
-    # lowercase all the header names and store in dict
-    for header in response.splitlines():
-        if b':' not in header or not header.strip():
-            continue
-        name, val = header.decode().split(':', 1)
-        headers[name.lower().strip()].append(val.strip())
-    # Get successful archive url in "content-location" header or any errors
-    content_location = headers['content-location']
-    errors = headers['x-archive-wayback-runtime-error']
-    return content_location, errors
-
 
 def should_fetch_archive_dot_org(link_dir, link):
     if is_static_file(link['url']):
@@ -627,4 +608,18 @@ def archive_dot_org(link_dir, link, timeout=TIMEOUT):
         **timer.stats,
     }
 
+
+def parse_archive_dot_org_response(response):
+    # Parse archive.org response headers
+    headers = defaultdict(list)
+    # lowercase all the header names and store in dict
+    for header in response.splitlines():
+        if b':' not in header or not header.strip():
+            continue
+        name, val = header.decode().split(':', 1)
+        headers[name.lower().strip()].append(val.strip())
+    # Get successful archive url in "content-location" header or any errors
+    content_location = headers['content-location']
+    errors = headers['x-archive-wayback-runtime-error']
+    return content_location, errors
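
Note: the relocated parse_archive_dot_org_response() helper collects the raw response headers into a lowercase-keyed multimap, then reads the archived snapshot path out of Content-Location plus any X-Archive-Wayback-Runtime-Error values. A minimal standalone sketch of the same parsing approach; the function name parse_wayback_headers and the sample header blob below are made up for illustration, not taken from a real archive.org reply:

from collections import defaultdict

def parse_wayback_headers(response):
    # Collect headers into lowercase-name -> list-of-values, as in the diff above.
    headers = defaultdict(list)
    for header in response.splitlines():
        if b':' not in header or not header.strip():
            continue
        name, val = header.decode().split(':', 1)
        headers[name.lower().strip()].append(val.strip())
    return headers['content-location'], headers['x-archive-wayback-runtime-error']

# Fabricated example input, roughly the shape of a Wayback Machine redirect response:
sample = (
    b'HTTP/1.1 302 Found\r\n'
    b'Content-Location: /web/20190322150939/https://example.com/\r\n'
    b'Server: nginx\r\n'
)
content_location, errors = parse_wayback_headers(sample)
print(content_location)   # ['/web/20190322150939/https://example.com/']
print(errors)             # []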

View file

@@ -26,6 +26,7 @@ from util import (
 from parse import parse_links
 from links import validate_links
 from logs import (
+    log_indexing_process_started,
     log_indexing_started,
     log_indexing_finished,
     log_parsing_started,
@@ -40,12 +41,14 @@ TITLE_LOADING_MSG = 'Not yet archived...'
 def write_links_index(out_dir, links, finished=False):
     """create index.html file for a given list of links"""
-    log_indexing_started()
+    log_indexing_process_started()
     check_links_structure(links)
 
+    log_indexing_started(out_dir, 'index.json')
     write_json_links_index(out_dir, links)
     log_indexing_finished(out_dir, 'index.json')
 
+    log_indexing_started(out_dir, 'index.html')
     write_html_links_index(out_dir, links, finished=finished)
     log_indexing_finished(out_dir, 'index.html')

View file

@@ -3,33 +3,26 @@ In ArchiveBox, a Link represents a single entry that we track in the
 json index. All links pass through all archiver functions and the latest,
 most up-to-date canonical output for each is stored in "latest".
 
 Link {
-    timestamp: str,  (how we uniquely id links)     _ _  _ _  ___
-    url: str,                                      | \ / \ |\| ' |
-    base_url: str,                                 |_/ \_/ | | |
-    domain: str,                                    _ _  _ _ _  _
-    tags: str,                                     |_) /| |\| | / `
-    type: str,                                     |  /"| | | | \_,
-    title: str,                                         ,-'"`-.
-    sources: [str],                                  /// / @ @ \ \\\\
-    latest: {                                        \ :=| ,._,. |=: /
-        ...,                                         || ,\  \_../ /. ||
-        pdf: 'output.pdf',                           ||','`-._))'`.`||
-        wget: 'example.com/1234/index.html'          `-'     (/    `-'
+    timestamp: str,  (how we uniquely id links)
+    url: str,
+    title: str,
+    tags: str,
+    sources: [str],
+    latest: {
+        ...,
+        pdf: 'output.pdf',
+        wget: 'example.com/1234/index.html',
+        screenshot: null,
     },
     history: {
-        ...
         pdf: [
-            {timestamp: 15444234325, status: 'skipped', result='output.pdf'},
+            {start_ts, end_ts, duration, cmd, pwd, status, output},
             ...
         ],
-        wget: [
-            {timestamp: 11534435345, status: 'succeded', result='donuts.com/eat/them.html'}
-        ]
+        ...
     },
 }
 """
 
 from html import unescape
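
Note: to make the reshaped docstring schema above concrete, here is a hypothetical Link entry written out as a Python dict; every value (URL, timestamp, command, path) is invented for illustration and not taken from a real index:

# Hypothetical Link entry following the updated schema; all values are made up.
link = {
    'timestamp': '1553281779',           # unique id for the link
    'url': 'https://example.com/page',
    'title': 'Example Page',
    'tags': 'docs,example',
    'sources': ['bookmarks_export.html'],
    'latest': {                          # newest canonical output per archive method
        'pdf': 'output.pdf',
        'wget': 'example.com/page/index.html',
        'screenshot': None,
    },
    'history': {                         # every attempt per method
        'pdf': [
            {
                'start_ts': '2019-03-22 15:09:39',
                'end_ts': '2019-03-22 15:09:45',
                'duration': 6,
                'cmd': ['chromium-browser', '--headless', '--print-to-pdf', 'https://example.com/page'],
                'pwd': 'output/archive/1553281779',
                'status': 'succeeded',
                'output': 'output.pdf',
            },
        ],
    },
}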

View file

@@ -45,13 +45,21 @@ def log_link_archiving_started(link_dir, link, is_new):
     ))
 
 def log_link_archiving_finished(link_dir, link, is_new, skipped_entirely):
+    if all(output == 'succeeded' for output in link['latest']):
+        _LAST_RUN_STATS['succeeded'] += 1
+    elif skipped_entirely or all(output == 'skipped' for output in link['latest']):
+        _LAST_RUN_STATS['skipped'] += 1
+    else:
+        _LAST_RUN_STATS['failed'] += 1
+        # import ipdb; ipdb.set_trace()
+
     if skipped_entirely:
         print('\r{}{}'.format(
             pretty_path(link_dir),
             ' (new)' if is_new else '',
         ))
 
-def log_archive_method_starting(method):
+def log_archive_method_started(method):
     print(' > {}'.format(method))
 
 def log_archive_method_finished(result):
@@ -117,7 +125,7 @@ def log_parsing_finished(num_new_links, parser_name):
         parser_name,
     ))
 
-def log_indexing_started():
+def log_indexing_process_started():
     start_ts = datetime.now()
     _LAST_RUN_STATS['index_start_ts'] = start_ts
     print('{green}[*] [{}] Saving main index files...{reset}'.format(
@@ -125,10 +133,13 @@ def log_indexing_started():
         **ANSI,
     ))
 
+def log_indexing_started(out_dir, out_file):
+    sys.stdout.write(' > {}/{}'.format(pretty_path(out_dir), out_file))
+
 def log_indexing_finished(out_dir, out_file):
     end_ts = datetime.now()
     _LAST_RUN_STATS['index_end_ts'] = end_ts
-    print('{}/{}'.format(pretty_path(out_dir), out_file))
+    print('\r{}/{}'.format(pretty_path(out_dir), out_file))
 
 def log_archiving_started(num_links, resume):
     start_ts = datetime.now()
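
Note: the new log_indexing_started()/log_indexing_finished() pair relies on a simple terminal trick for long-running steps: write the in-progress line with sys.stdout.write() and no newline, then start the finishing print() with '\r' so it rewrites that same line once the slow work completes. A self-contained sketch of the pattern; the file names, the '(done)' suffix, and the sleep are placeholders, not ArchiveBox code:

import sys
import time

def log_step_started(path):
    # No trailing newline: the line stays "open" while the slow step runs.
    sys.stdout.write(' > {}'.format(path))
    sys.stdout.flush()

def log_step_finished(path):
    # '\r' returns the cursor to column 0, so this replaces the pending line.
    print('\r > {} (done)'.format(path))

for fname in ('index.json', 'index.html'):   # placeholder file names
    log_step_started(fname)
    time.sleep(1)                            # stand-in for a slow write
    log_step_finished(fname)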

View file

@@ -314,10 +314,20 @@ def wget_output_path(link):
     # Wget downloads can save in a number of different ways depending on the url:
     #    https://example.com
     #      > output/archive/<timestamp>/example.com/index.html
+    #    https://example.com?v=zzVa_tX1OiI
+    #      > output/archive/<timestamp>/example.com/index.html?v=zzVa_tX1OiI.html
+    #    https://www.example.com/?v=zzVa_tX1OiI
+    #      > output/archive/<timestamp>/example.com/index.html?v=zzVa_tX1OiI.html
     #    https://example.com/abc
     #      > output/archive/<timestamp>/example.com/abc.html
     #    https://example.com/abc/
     #      > output/archive/<timestamp>/example.com/abc/index.html
+    #    https://example.com/abc?v=zzVa_tX1OiI.html
+    #      > output/archive/<timestamp>/example.com/abc?v=zzVa_tX1OiI.html
+    #    https://example.com/abc/?v=zzVa_tX1OiI.html
+    #      > output/archive/<timestamp>/example.com/abc/index.html?v=zzVa_tX1OiI.html
     #    https://example.com/abc/test.html
     #      > output/archive/<timestamp>/example.com/abc/test.html
     #    https://example.com/abc/test?v=zzVa_tX1OiI
@@ -326,7 +336,7 @@ def wget_output_path(link):
     #      > output/archive/<timestamp>/example.com/abc/test/index.html?v=zzVa_tX1OiI.html
 
     # There's also lots of complexity around how the urlencoding and renaming
-    # is done for pages with query and hash fragments or extensions like shtml / htm
+    # is done for pages with query and hash fragments or extensions like shtml / htm / php / etc
 
     # Since the wget algorithm for -E (appending .html) is incredibly complex
     # and there's no way to get the computed output path from wget
@@ -359,27 +369,6 @@ def wget_output_path(link):
 
     return None
 
-    # If finding the actual output file didn't work, fall back to the buggy
-    # implementation of the wget .html appending algorithm
-    # split_url = link['url'].split('#', 1)
-    # query = ('%3F' + link['url'].split('?', 1)[-1]) if '?' in link['url'] else ''
-
-    # if re.search(".+\\.[Hh][Tt][Mm][Ll]?$", split_url[0], re.I | re.M):
-    #     # already ends in .html
-    #     return urlencode(base_url(link['url']))
-    # else:
-    #     # .html needs to be appended
-    #     without_scheme = split_url[0].split('://', 1)[-1].split('?', 1)[0]
-    #     if without_scheme.endswith('/'):
-    #         if query:
-    #             return urlencode('#'.join([without_scheme + 'index.html' + query + '.html', *split_url[1:]]))
-    #         return urlencode('#'.join([without_scheme + 'index.html', *split_url[1:]]))
-    #     else:
-    #         if query:
-    #             return urlencode('#'.join([without_scheme + '/index.html' + query + '.html', *split_url[1:]]))
-    #         elif '/' in without_scheme:
-    #             return urlencode('#'.join([without_scheme + '.html', *split_url[1:]]))
-    #         return urlencode(base_url(link['url']) + '/index.html')
 
 
 ### String Manipulation & Logging Helpers
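
Note on the deleted fallback above: its own comment ("If finding the actual output file didn't work...") implies the surviving code path locates the file wget actually wrote instead of re-deriving the -E/--adjust-extension name. A rough, hypothetical sketch of that idea, scanning the per-link domain folder on disk; the function name, directory layout, and the .html heuristic are assumptions, not ArchiveBox's implementation:

import os
from urllib.parse import urlparse

def guess_wget_output_path(link_dir, url):
    # Look inside the <link_dir>/<domain>/ folder that wget creates and return
    # the first .html file found, relative to the link's archive folder.
    domain_dir = os.path.join(link_dir, urlparse(url).netloc)
    if not os.path.isdir(domain_dir):
        return None
    for root, _dirs, files in os.walk(domain_dir):
        for fname in sorted(files):
            if fname.endswith('.html'):
                return os.path.relpath(os.path.join(root, fname), link_dir)
    return None

# Example (paths are illustrative):
#   guess_wget_output_path('output/archive/1553281779', 'https://example.com/abc/')
#   -> 'example.com/abc/index.html'  (if that file exists on disk)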