better separation of archive method checking and running logic

Nick Sweeting 2019-03-21 05:35:41 -04:00
parent e6bd1f8ca8
commit d798117081
6 changed files with 424 additions and 479 deletions
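
The core of the refactor is visible in the first file below: each archive method is split into a should_fetch_* check and a fetch_* runner, and archive_link drives them from a table of (name, should_run, run) triples instead of decorated fetchers. A minimal, self-contained sketch of that control flow (the config toggle and the favicon fetcher body here are stand-ins, not the real implementations):

    import os

    FETCH_FAVICON = True  # stand-in for the real config toggle

    def should_fetch_favicon(link_dir, link):
        """Checker: decide whether the method needs to run at all."""
        if os.path.exists(os.path.join(link_dir, 'favicon.ico')):
            return False
        return FETCH_FAVICON

    def fetch_favicon(link_dir, link):
        """Runner: do the work and report back in the shared result format."""
        return {
            'cmd': ['curl', 'https://www.google.com/s2/favicons?domain=example.com'],
            'pwd': link_dir,
            'output': 'favicon.ico',
            'status': 'succeeded',
        }

    ARCHIVE_METHODS = (
        ('favicon', should_fetch_favicon, fetch_favicon),
        # ...one ('name', should_fetch_x, fetch_x) triple per archive method
    )

    def archive_link(link_dir, link):
        for method_name, should_run, run_method in ARCHIVE_METHODS:
            # make sure the per-method history/latest slots exist before running
            link['history'].setdefault(method_name, [])
            link['latest'].setdefault(method_name, None)

            if not should_run(link_dir, link):
                continue

            result = run_method(link_dir, link)
            link['history'][method_name].append(result)
            if result['status'] == 'succeeded':
                link['latest'][method_name] = result['output']
        return link

    # example call with a freshly initialized link dict
    link = archive_link('/output/archive/1553160941',
                        {'url': 'https://example.com', 'history': {}, 'latest': {}})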


@ -3,7 +3,6 @@ import os
from functools import wraps from functools import wraps
from collections import defaultdict from collections import defaultdict
from datetime import datetime from datetime import datetime
from stdlib_patches import run, PIPE, DEVNULL
from index import ( from index import (
write_link_index, write_link_index,
@ -43,16 +42,18 @@ from util import (
without_fragment, without_fragment,
fetch_page_title, fetch_page_title,
is_static_file, is_static_file,
progress, TimedProgress,
chmod_file, chmod_file,
check_link_structure,
wget_output_path, wget_output_path,
chrome_args, chrome_args,
check_link_structure,
run, PIPE, DEVNULL
) )
from logs import ( from logs import (
_LAST_RUN_STATS, _LAST_RUN_STATS,
log_link_archiving_started, log_link_archiving_started,
log_link_archiving_failed, log_archive_method_starting,
log_archive_method_finished,
) )
@ -63,21 +64,20 @@ class ArchiveError(Exception):
self.hints = hints self.hints = hints
def archive_link(link_dir, link, overwrite=True): def archive_link(link_dir, link):
"""download the DOM, PDF, and a screenshot into a folder named after the link's timestamp""" """download the DOM, PDF, and a screenshot into a folder named after the link's timestamp"""
ARCHIVE_METHODS = ( ARCHIVE_METHODS = (
(FETCH_TITLE, fetch_title), ('title', should_fetch_title, fetch_title),
(FETCH_FAVICON, fetch_favicon), ('favicon', should_fetch_favicon, fetch_favicon),
(FETCH_WGET, fetch_wget), ('wget', should_fetch_wget, fetch_wget),
(FETCH_PDF, fetch_pdf), ('pdf', should_fetch_pdf, fetch_pdf),
(FETCH_SCREENSHOT, fetch_screenshot), ('screenshot', should_fetch_screenshot, fetch_screenshot),
(FETCH_DOM, fetch_dom), ('dom', should_fetch_dom, fetch_dom),
(FETCH_GIT, fetch_git), ('git', should_fetch_git, fetch_git),
(FETCH_MEDIA, fetch_media), ('media', should_fetch_media, fetch_media),
(SUBMIT_ARCHIVE_DOT_ORG, archive_dot_org), ('archive_org', should_fetch_archive_dot_org, archive_dot_org),
) )
active_methods = [method for toggle, method in ARCHIVE_METHODS if toggle]
try: try:
is_new = not os.path.exists(link_dir) is_new = not os.path.exists(link_dir)
@ -87,109 +87,88 @@ def archive_link(link_dir, link, overwrite=True):
link = load_json_link_index(link_dir, link) link = load_json_link_index(link_dir, link)
log_link_archiving_started(link_dir, link, is_new) log_link_archiving_started(link_dir, link, is_new)
for archive_method in active_methods: for method_name, should_run, method_function in ARCHIVE_METHODS:
archive_method(link_dir, link, overwrite=overwrite) if method_name not in link['history']:
link['history'][method_name] = []
if method_name not in link['latest']:
link['latest'][method_name] = None
if not should_run(link_dir, link):
continue
log_archive_method_starting(method_name)
result = method_function(link_dir, link)
log_archive_method_finished(result)
link['history'][method_name].append(result)
if result['status'] == 'succeeded':
link['latest'][method_name] = result['output']
_LAST_RUN_STATS[result['status']] += 1
write_link_index(link_dir, link) write_link_index(link_dir, link)
patch_links_index(link) patch_links_index(link)
except Exception as err: except Exception as err:
print(' ! Failed to archive link: {}: {}'.format(err.__class__.__name__, err)) print(' ! Failed to archive link: {}: {}'.format(err.__class__.__name__, err))
raise
return link return link
def attach_result_to_link(method):
"""
Instead of returning a result={output:'...', status:'success'} object,
attach that result to the links's history & latest fields, then return
the updated link object.
"""
def decorator(fetch_func):
@wraps(fetch_func)
def timed_fetch_func(link_dir, link, overwrite=False, **kwargs):
# initialize methods and history json field on link
link['latest'] = link.get('latest') or {}
link['latest'][method] = link['latest'].get(method) or None
link['history'] = link.get('history') or {}
link['history'][method] = link['history'].get(method) or []
start_ts = datetime.now().timestamp() def should_fetch_title(link_dir, link):
# if link already has valid title, skip it
if link['title'] and not link['title'].lower().startswith('http'):
return False
# if a valid method output is already present, dont run the fetch function if is_static_file(link['url']):
if link['latest'][method] and not overwrite: return False
print('{}'.format(method))
result = None
else:
print(' > {}'.format(method))
result = fetch_func(link_dir, link, **kwargs)
end_ts = datetime.now().timestamp() return FETCH_TITLE
duration = str(end_ts * 1000 - start_ts * 1000).split('.')[0]
# append a history item recording fail/success
history_entry = {
'timestamp': str(start_ts).split('.')[0],
}
if result is None:
history_entry['status'] = 'skipped'
elif isinstance(result.get('output'), Exception):
history_entry['status'] = 'failed'
history_entry['duration'] = duration
history_entry.update(result or {})
link['history'][method].append(history_entry)
else:
history_entry['status'] = 'succeded'
history_entry['duration'] = duration
history_entry.update(result or {})
link['history'][method].append(history_entry)
link['latest'][method] = result['output']
_LAST_RUN_STATS[history_entry['status']] += 1
return link
return timed_fetch_func
return decorator
@attach_result_to_link('title')
def fetch_title(link_dir, link, timeout=TIMEOUT): def fetch_title(link_dir, link, timeout=TIMEOUT):
"""try to guess the page's title from its content""" """try to guess the page's title from its content"""
# if link already has valid title, skip it output = None
if link['title'] and not link['title'].lower().startswith('http'): cmd = [
return {'output': link['title'], 'status': 'skipped'} CURL_BINARY,
link['url'],
if is_static_file(link['url']): '|',
return {'output': None, 'status': 'skipped'} 'grep',
'<title>',
end = progress(timeout, prefix=' ') ]
status = 'succeeded'
timer = TimedProgress(timeout, prefix=' ')
try: try:
title = fetch_page_title(link['url'], timeout=timeout, progress=False) output = fetch_page_title(link['url'], timeout=timeout, progress=False)
end() if not output:
output = title raise ArchiveError('Unable to detect page title')
except Exception as e: except Exception as err:
end() status = 'failed'
output = e output = err
print(' {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset'])) finally:
timer.end()
if title and title.strip():
link['title'] = title
output = title
return { return {
'cmd': 'fetch_page_title("{}")'.format(link['url']), 'cmd': cmd,
'pwd': link_dir,
'output': output, 'output': output,
'status': status,
**timer.stats,
} }
@attach_result_to_link('favicon')
def should_fetch_favicon(link_dir, link):
if os.path.exists(os.path.join(link_dir, 'favicon.ico')):
return False
return FETCH_FAVICON
def fetch_favicon(link_dir, link, timeout=TIMEOUT): def fetch_favicon(link_dir, link, timeout=TIMEOUT):
"""download site favicon from google's favicon api""" """download site favicon from google's favicon api"""
output = 'favicon.ico' output = 'favicon.ico'
if os.path.exists(os.path.join(link_dir, output)): cmd = [
return {'output': output, 'status': 'skipped'}
CMD = [
CURL_BINARY, CURL_BINARY,
'--max-time', str(timeout), '--max-time', str(timeout),
'--location', '--location',
@ -197,37 +176,44 @@ def fetch_favicon(link_dir, link, timeout=TIMEOUT):
*(() if CHECK_SSL_VALIDITY else ('--insecure',)), *(() if CHECK_SSL_VALIDITY else ('--insecure',)),
'https://www.google.com/s2/favicons?domain={}'.format(domain(link['url'])), 'https://www.google.com/s2/favicons?domain={}'.format(domain(link['url'])),
] ]
end = progress(timeout, prefix=' ') status = 'succeeded'
timer = TimedProgress(timeout, prefix=' ')
try: try:
run(CMD, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout) run(cmd, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout)
end()
chmod_file(output, cwd=link_dir) chmod_file(output, cwd=link_dir)
except Exception as e: except Exception as err:
end() status = 'failed'
output = e output = err
print_error_hints(cmd=CMD, pwd=link_dir, err=e) finally:
timer.end()
return { return {
'cmd': CMD, 'cmd': cmd,
'pwd': link_dir,
'output': output, 'output': output,
'status': status,
**timer.stats,
} }
@attach_result_to_link('wget') def should_fetch_wget(link_dir, link):
output_path = wget_output_path(link)
if output_path and os.path.exists(os.path.join(link_dir, output_path)):
return False
return FETCH_WGET
def fetch_wget(link_dir, link, timeout=TIMEOUT): def fetch_wget(link_dir, link, timeout=TIMEOUT):
"""download full site using wget""" """download full site using wget"""
domain_dir = os.path.join(link_dir, domain(link['url']))
existing_file = wget_output_path(link)
if os.path.exists(domain_dir) and existing_file:
return {'output': existing_file, 'status': 'skipped'}
if FETCH_WARC: if FETCH_WARC:
warc_dir = os.path.join(link_dir, 'warc') warc_dir = os.path.join(link_dir, 'warc')
os.makedirs(warc_dir, exist_ok=True) os.makedirs(warc_dir, exist_ok=True)
warc_path = os.path.join('warc', str(int(datetime.now().timestamp()))) warc_path = os.path.join('warc', str(int(datetime.now().timestamp())))
# WGET CLI Docs: https://www.gnu.org/software/wget/manual/wget.html # WGET CLI Docs: https://www.gnu.org/software/wget/manual/wget.html
CMD = [ output = None
cmd = [
WGET_BINARY, WGET_BINARY,
# '--server-response', # print headers for better error parsing # '--server-response', # print headers for better error parsing
'--no-verbose', '--no-verbose',
@ -248,20 +234,19 @@ def fetch_wget(link_dir, link, timeout=TIMEOUT):
*((() if CHECK_SSL_VALIDITY else ('--no-check-certificate', '--no-hsts'))), *((() if CHECK_SSL_VALIDITY else ('--no-check-certificate', '--no-hsts'))),
link['url'], link['url'],
] ]
end = progress(timeout, prefix=' ') status = 'succeeded'
timer = TimedProgress(timeout, prefix=' ')
try: try:
result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout) result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout)
end()
output = wget_output_path(link) output = wget_output_path(link)
# parse out number of files downloaded from last line of stderr:
# "Downloaded: 76 files, 4.0M in 1.6s (2.52 MB/s)"
output_tail = [ output_tail = [
line.strip() line.strip()
for line in (result.stdout + result.stderr).decode().rsplit('\n', 3)[-3:] for line in (result.stdout + result.stderr).decode().rsplit('\n', 3)[-3:]
if line.strip() if line.strip()
] ]
# parse out number of files downloaded from last line of stderr:
# "Downloaded: 76 files, 4.0M in 1.6s (2.52 MB/s)"
files_downloaded = ( files_downloaded = (
int(output_tail[-1].strip().split(' ', 2)[1] or 0) int(output_tail[-1].strip().split(' ', 2)[1] or 0)
if 'Downloaded:' in output_tail[-1] if 'Downloaded:' in output_tail[-1]
@ -271,7 +256,7 @@ def fetch_wget(link_dir, link, timeout=TIMEOUT):
# Check for common failure cases # Check for common failure cases
if result.returncode > 0 and files_downloaded < 1: if result.returncode > 0 and files_downloaded < 1:
hints = ( hints = (
'Got wget response code {}:\n'.format(result.returncode), 'Got wget response code: {}.'.format(result.returncode),
*output_tail, *output_tail,
) )
if b'403: Forbidden' in result.stderr: if b'403: Forbidden' in result.stderr:
@ -281,144 +266,173 @@ def fetch_wget(link_dir, link, timeout=TIMEOUT):
if b'ERROR 500: Internal Server Error' in result.stderr: if b'ERROR 500: Internal Server Error' in result.stderr:
raise ArchiveError('500 Internal Server Error', hints) raise ArchiveError('500 Internal Server Error', hints)
raise ArchiveError('Got an error from the server', hints) raise ArchiveError('Got an error from the server', hints)
except Exception as e: except Exception as err:
end() status = 'failed'
output = e output = err
print_error_hints(cmd=CMD, pwd=link_dir, err=e) finally:
timer.end()
return { return {
'cmd': CMD, 'cmd': cmd,
'pwd': link_dir,
'output': output, 'output': output,
'status': status,
**timer.stats,
} }
@attach_result_to_link('pdf') def should_fetch_pdf(link_dir, link):
if is_static_file(link['url']):
return False
if os.path.exists(os.path.join(link_dir, 'output.pdf')):
return False
return FETCH_PDF
def fetch_pdf(link_dir, link, timeout=TIMEOUT): def fetch_pdf(link_dir, link, timeout=TIMEOUT):
"""print PDF of site to file using chrome --headless""" """print PDF of site to file using chrome --headless"""
if is_static_file(link['url']):
return {'output': None, 'status': 'skipped'}
output = 'output.pdf' output = 'output.pdf'
if os.path.exists(os.path.join(link_dir, output)): cmd = [
return {'output': output, 'status': 'skipped'}
CMD = [
*chrome_args(timeout=timeout), *chrome_args(timeout=timeout),
'--print-to-pdf', '--print-to-pdf',
link['url'] link['url'],
] ]
end = progress(timeout, prefix=' ') status = 'succeeded'
hints = None timer = TimedProgress(timeout, prefix=' ')
try: try:
result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout) result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout)
end()
if result.returncode: if result.returncode:
hints = (result.stderr or result.stdout).decode() hints = (result.stderr or result.stdout).decode()
raise ArchiveError('Failed to print PDF', hints) raise ArchiveError('Failed to print PDF', hints)
chmod_file('output.pdf', cwd=link_dir) chmod_file('output.pdf', cwd=link_dir)
except Exception as e: except Exception as err:
end() status = 'failed'
output = e output = err
print_error_hints(cmd=CMD, pwd=link_dir, err=e, hints=hints) finally:
timer.end()
return { return {
'cmd': CMD, 'cmd': cmd,
'pwd': link_dir,
'output': output, 'output': output,
'status': status,
**timer.stats,
} }
@attach_result_to_link('screenshot') def should_fetch_screenshot(link_dir, link):
if is_static_file(link['url']):
return False
if os.path.exists(os.path.join(link_dir, 'screenshot.png')):
return False
return FETCH_SCREENSHOT
def fetch_screenshot(link_dir, link, timeout=TIMEOUT): def fetch_screenshot(link_dir, link, timeout=TIMEOUT):
"""take screenshot of site using chrome --headless""" """take screenshot of site using chrome --headless"""
if is_static_file(link['url']):
return {'output': None, 'status': 'skipped'}
output = 'screenshot.png' output = 'screenshot.png'
if os.path.exists(os.path.join(link_dir, output)): cmd = [
return {'output': output, 'status': 'skipped'}
CMD = [
*chrome_args(timeout=timeout), *chrome_args(timeout=timeout),
'--screenshot', '--screenshot',
link['url'], link['url'],
] ]
end = progress(timeout, prefix=' ') status = 'succeeded'
timer = TimedProgress(timeout, prefix=' ')
try: try:
result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout) result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout)
end()
if result.returncode: if result.returncode:
hints = (result.stderr or result.stdout).decode() hints = (result.stderr or result.stdout).decode()
raise ArchiveError('Failed to take screenshot', hints) raise ArchiveError('Failed to take screenshot', hints)
chmod_file(output, cwd=link_dir) chmod_file(output, cwd=link_dir)
except Exception as e: except Exception as err:
end() status = 'failed'
output = e output = err
print_error_hints(cmd=CMD, pwd=link_dir, err=e) finally:
timer.end()
return { return {
'cmd': CMD, 'cmd': cmd,
'pwd': link_dir,
'output': output, 'output': output,
'status': status,
**timer.stats,
} }
def should_fetch_dom(link_dir, link):
if is_static_file(link['url']):
return False
if os.path.exists(os.path.join(link_dir, 'output.html')):
return False
return FETCH_DOM
@attach_result_to_link('dom')
def fetch_dom(link_dir, link, timeout=TIMEOUT): def fetch_dom(link_dir, link, timeout=TIMEOUT):
"""print HTML of site to file using chrome --dump-html""" """print HTML of site to file using chrome --dump-html"""
if is_static_file(link['url']):
return {'output': None, 'status': 'skipped'}
output = 'output.html' output = 'output.html'
output_path = os.path.join(link_dir, output) output_path = os.path.join(link_dir, output)
if os.path.exists(output_path): cmd = [
return {'output': output, 'status': 'skipped'}
CMD = [
*chrome_args(timeout=timeout), *chrome_args(timeout=timeout),
'--dump-dom', '--dump-dom',
link['url'] link['url']
] ]
end = progress(timeout, prefix=' ') status = 'succeeded'
timer = TimedProgress(timeout, prefix=' ')
try: try:
with open(output_path, 'w+') as f: with open(output_path, 'w+') as f:
result = run(CMD, stdout=f, stderr=PIPE, cwd=link_dir, timeout=timeout) result = run(cmd, stdout=f, stderr=PIPE, cwd=link_dir, timeout=timeout)
end()
if result.returncode: if result.returncode:
hints = result.stderr.decode() hints = result.stderr.decode()
raise ArchiveError('Failed to fetch DOM', hints) raise ArchiveError('Failed to fetch DOM', hints)
chmod_file(output, cwd=link_dir) chmod_file(output, cwd=link_dir)
except Exception as e: except Exception as err:
end() status = 'failed'
output = e output = err
print_error_hints(cmd=CMD, pwd=link_dir, err=e) finally:
timer.end()
return { return {
'cmd': CMD, 'cmd': cmd,
'pwd': link_dir,
'output': output, 'output': output,
'status': status,
**timer.stats,
} }
@attach_result_to_link('git') def should_fetch_git(link_dir, link):
def fetch_git(link_dir, link, timeout=TIMEOUT): if is_static_file(link['url']):
"""download full site using git""" return False
if os.path.exists(os.path.join(link_dir, 'git')):
return False
is_clonable_url = ( is_clonable_url = (
domain(link['url']) in GIT_DOMAINS domain(link['url']) in GIT_DOMAINS
or extension(link['url']) == 'git' or extension(link['url']) == 'git'
) )
if is_static_file(link['url']) or not is_clonable_url: if not is_clonable_url:
return {'output': None, 'status': 'skipped'} return False
return FETCH_GIT
def fetch_git(link_dir, link, timeout=TIMEOUT):
"""download full site using git"""
output = 'git' output = 'git'
output_path = os.path.join(link_dir, 'git') output_path = os.path.join(link_dir, 'git')
if os.path.exists(output_path):
return {'output': output, 'status': 'skipped'}
os.makedirs(output_path, exist_ok=True) os.makedirs(output_path, exist_ok=True)
CMD = [ cmd = [
GIT_BINARY, GIT_BINARY,
'clone', 'clone',
'--mirror', '--mirror',
@ -426,39 +440,48 @@ def fetch_git(link_dir, link, timeout=TIMEOUT):
*(() if CHECK_SSL_VALIDITY else ('-c', 'http.sslVerify=false')), *(() if CHECK_SSL_VALIDITY else ('-c', 'http.sslVerify=false')),
without_query(without_fragment(link['url'])), without_query(without_fragment(link['url'])),
] ]
end = progress(timeout, prefix=' ') status = 'succeeded'
timer = TimedProgress(timeout, prefix=' ')
try: try:
result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=output_path, timeout=timeout + 1) result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=output_path, timeout=timeout + 1)
end()
if result.returncode == 128: if result.returncode == 128:
# ignore failed re-download when the folder already exists # ignore failed re-download when the folder already exists
pass pass
elif result.returncode > 0: elif result.returncode > 0:
hints = 'got git response code {}:'.format(result.returncode) hints = 'Got git response code: {}.'.format(result.returncode)
raise ArchiveError('Failed git download', hints) raise ArchiveError('Failed git download', hints)
except Exception as e:
end() except Exception as err:
output = e status = 'failed'
print_error_hints(cmd=CMD, pwd=link_dir, err=e) output = err
finally:
timer.end()
return { return {
'cmd': CMD, 'cmd': cmd,
'pwd': link_dir,
'output': output, 'output': output,
'status': status,
**timer.stats,
} }
@attach_result_to_link('media') def should_fetch_media(link_dir, link):
def fetch_media(link_dir, link, timeout=MEDIA_TIMEOUT, overwrite=False): if is_static_file(link['url']):
return False
if os.path.exists(os.path.join(link_dir, 'media')):
return False
return FETCH_MEDIA
def fetch_media(link_dir, link, timeout=MEDIA_TIMEOUT):
"""Download playlists or individual video, audio, and subtitles using youtube-dl""" """Download playlists or individual video, audio, and subtitles using youtube-dl"""
output = 'media' output = 'media'
output_path = os.path.join(link_dir, 'media') output_path = os.path.join(link_dir, 'media')
if os.path.exists(output_path) and not overwrite:
return {'output': output, 'status': 'skipped'}
os.makedirs(output_path, exist_ok=True) os.makedirs(output_path, exist_ok=True)
CMD = [ cmd = [
YOUTUBEDL_BINARY, YOUTUBEDL_BINARY,
'--write-description', '--write-description',
'--write-info-json', '--write-info-json',
@ -480,12 +503,11 @@ def fetch_media(link_dir, link, timeout=MEDIA_TIMEOUT, overwrite=False):
*(() if CHECK_SSL_VALIDITY else ('--no-check-certificate',)), *(() if CHECK_SSL_VALIDITY else ('--no-check-certificate',)),
link['url'], link['url'],
] ]
status = 'succeeded'
end = progress(timeout, prefix=' ') timer = TimedProgress(timeout, prefix=' ')
try: try:
result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=output_path, timeout=timeout + 1) result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=output_path, timeout=timeout + 1)
chmod_file(output, cwd=link_dir) chmod_file(output, cwd=link_dir)
end()
if result.returncode: if result.returncode:
if (b'ERROR: Unsupported URL' in result.stderr if (b'ERROR: Unsupported URL' in result.stderr
or b'HTTP Error 404' in result.stderr or b'HTTP Error 404' in result.stderr
@ -496,18 +518,22 @@ def fetch_media(link_dir, link, timeout=MEDIA_TIMEOUT, overwrite=False):
pass pass
else: else:
hints = ( hints = (
'got youtubedl response code {}:'.format(result.returncode), 'Got youtube-dl response code: {}.'.format(result.returncode),
*result.stderr.decode().split('\n'), *result.stderr.decode().split('\n'),
) )
raise ArchiveError('Failed to download media', hints) raise ArchiveError('Failed to download media', hints)
except Exception as e: except Exception as err:
end() status = 'failed'
output = e output = err
print_error_hints(cmd=CMD, pwd=link_dir, err=e) finally:
timer.end()
return { return {
'cmd': CMD, 'cmd': cmd,
'pwd': link_dir,
'output': output, 'output': output,
'status': status,
**timer.stats,
} }
def parse_archive_dot_org_response(response): def parse_archive_dot_org_response(response):
@ -526,20 +552,23 @@ def parse_archive_dot_org_response(response):
errors = headers['x-archive-wayback-runtime-error'] errors = headers['x-archive-wayback-runtime-error']
return content_location, errors return content_location, errors
@attach_result_to_link('archive_org') def should_fetch_archive_dot_org(link_dir, link):
if is_static_file(link['url']):
return False
if os.path.exists(os.path.join(link_dir, 'archive.org.txt')):
# if open(path, 'r').read().strip() != 'None':
return False
return SUBMIT_ARCHIVE_DOT_ORG
def archive_dot_org(link_dir, link, timeout=TIMEOUT): def archive_dot_org(link_dir, link, timeout=TIMEOUT):
"""submit site to archive.org for archiving via their service, save returned archive url""" """submit site to archive.org for archiving via their service, save returned archive url"""
output = 'archive.org.txt' output = 'archive.org.txt'
archive_org_url = None archive_org_url = None
path = os.path.join(link_dir, output)
if os.path.exists(path):
archive_org_url = open(path, 'r').read().strip()
return {'output': archive_org_url, 'status': 'skipped'}
submit_url = 'https://web.archive.org/save/{}'.format(link['url']) submit_url = 'https://web.archive.org/save/{}'.format(link['url'])
CMD = [ cmd = [
CURL_BINARY, CURL_BINARY,
'--location', '--location',
'--head', '--head',
@ -548,10 +577,10 @@ def archive_dot_org(link_dir, link, timeout=TIMEOUT):
*(() if CHECK_SSL_VALIDITY else ('--insecure',)), *(() if CHECK_SSL_VALIDITY else ('--insecure',)),
submit_url, submit_url,
] ]
end = progress(timeout, prefix=' ') status = 'succeeded'
timer = TimedProgress(timeout, prefix=' ')
try: try:
result = run(CMD, stdout=PIPE, stderr=DEVNULL, cwd=link_dir, timeout=timeout) result = run(cmd, stdout=PIPE, stderr=DEVNULL, cwd=link_dir, timeout=timeout)
end()
content_location, errors = parse_archive_dot_org_response(result.stdout) content_location, errors = parse_archive_dot_org_response(result.stdout)
if content_location: if content_location:
archive_org_url = 'https://web.archive.org{}'.format(content_location[0]) archive_org_url = 'https://web.archive.org{}'.format(content_location[0])
@ -562,10 +591,11 @@ def archive_dot_org(link_dir, link, timeout=TIMEOUT):
raise ArchiveError(', '.join(errors)) raise ArchiveError(', '.join(errors))
else: else:
raise ArchiveError('Failed to find "content-location" URL header in Archive.org response.') raise ArchiveError('Failed to find "content-location" URL header in Archive.org response.')
except Exception as e: except Exception as err:
end() status = 'failed'
output = e output = err
print_error_hints(cmd=CMD, pwd=link_dir, err=e) finally:
timer.end()
if not isinstance(output, Exception): if not isinstance(output, Exception):
# instead of writing None when archive.org rejects the url write the # instead of writing None when archive.org rejects the url write the
@ -579,8 +609,11 @@ def archive_dot_org(link_dir, link, timeout=TIMEOUT):
output = archive_org_url output = archive_org_url
return { return {
'cmd': CMD, 'cmd': cmd,
'pwd': link_dir,
'output': output, 'output': output,
'status': status,
**timer.stats,
} }
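
Every fetch_* runner above now returns the same result shape, which archive_link appends to link['history'][method_name] and log_archive_method_finished (in the logs diff below) checks for required keys. An illustrative instance of that contract, with made-up values:

    from datetime import datetime

    # Illustrative result dict as returned by a fetch_* method (values are invented).
    # 'output' is the method's output (usually a path relative to link_dir) on success,
    # or the raised Exception on failure; 'start_ts', 'end_ts', and 'duration' come from
    # **timer.stats (see TimedProgress in the util diff below).
    result = {
        'cmd': ['curl', '--max-time', '60', '--location', 'https://example.com'],
        'pwd': '/output/archive/1553160941',
        'output': 'favicon.ico',
        'status': 'succeeded',
        'start_ts': datetime(2019, 3, 21, 5, 35, 41),
        'end_ts': datetime(2019, 3, 21, 5, 35, 42),
        'duration': 1,
    }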


@ -17,6 +17,7 @@ from config import (
) )
from util import ( from util import (
chmod_file, chmod_file,
urlencode,
derived_link_info, derived_link_info,
check_link_structure, check_link_structure,
check_links_structure, check_links_structure,
@ -137,7 +138,7 @@ def write_html_links_index(out_dir, links, finished=False):
os.path.join('archive', link['timestamp'], 'favicon.ico') os.path.join('archive', link['timestamp'], 'favicon.ico')
# if link['is_archived'] else 'data:image/gif;base64,R0lGODlhAQABAAD/ACwAAAAAAQABAAACADs=' # if link['is_archived'] else 'data:image/gif;base64,R0lGODlhAQABAAD/ACwAAAAAAQABAAACADs='
), ),
'archive_url': ( 'archive_url': urlencode(
wget_output_path(link) or 'index.html' wget_output_path(link) or 'index.html'
), ),
}) })
@ -174,6 +175,7 @@ def patch_links_index(link, out_dir=OUTPUT_DIR):
if saved_link['url'] == link['url']: if saved_link['url'] == link['url']:
saved_link['title'] = title saved_link['title'] = title
saved_link['latest'] = link['latest'] saved_link['latest'] = link['latest']
saved_link['history'] = link['history']
changed = True changed = True
break break
if changed: if changed:
@ -199,6 +201,7 @@ def write_link_index(out_dir, link):
link['updated'] = str(datetime.now().timestamp()) link['updated'] = str(datetime.now().timestamp())
write_json_link_index(out_dir, link) write_json_link_index(out_dir, link)
write_html_link_index(out_dir, link) write_html_link_index(out_dir, link)
# print(' √ index.html, index.json')
def write_json_link_index(out_dir, link): def write_json_link_index(out_dir, link):
"""write a json file with some info about the link""" """write a json file with some info about the link"""
@ -206,8 +209,6 @@ def write_json_link_index(out_dir, link):
check_link_structure(link) check_link_structure(link)
path = os.path.join(out_dir, 'index.json') path = os.path.join(out_dir, 'index.json')
print(' √ index.json')
with open(path, 'w', encoding='utf-8') as f: with open(path, 'w', encoding='utf-8') as f:
json.dump(link, f, indent=4, default=str) json.dump(link, f, indent=4, default=str)
@ -231,8 +232,13 @@ def load_json_link_index(out_dir, link):
**parse_json_link_index(out_dir), **parse_json_link_index(out_dir),
**link, **link,
} }
link.update({
'latest': link.get('latest') or {},
'history': link.get('history') or {},
})
check_link_structure(link) check_link_structure(link)
return link return link
def write_html_link_index(out_dir, link): def write_html_link_index(out_dir, link):
@ -242,8 +248,6 @@ def write_html_link_index(out_dir, link):
path = os.path.join(out_dir, 'index.html') path = os.path.join(out_dir, 'index.html')
print(' √ index.html')
link = derived_link_info(link) link = derived_link_info(link)
with open(path, 'w', encoding='utf-8') as f: with open(path, 'w', encoding='utf-8') as f:
@ -253,7 +257,7 @@ def write_html_link_index(out_dir, link):
link['title'] link['title']
or (link['base_url'] if link['is_archived'] else TITLE_LOADING_MSG) or (link['base_url'] if link['is_archived'] else TITLE_LOADING_MSG)
), ),
'archive_url': ( 'archive_url': urlencode(
wget_output_path(link) wget_output_path(link)
or (link['domain'] if link['is_archived'] else 'about:blank') or (link['domain'] if link['is_archived'] else 'about:blank')
), ),


@ -6,7 +6,7 @@ from config import ANSI, REPO_DIR, OUTPUT_DIR
# globals are bad, mmkay # globals are bad, mmkay
_LAST_RUN_STATS = { _LAST_RUN_STATS = {
'skipped': 0, 'skipped': 0,
'succeded': 0, 'succeeded': 0,
'failed': 0, 'failed': 0,
'parsing_start_ts': 0, 'parsing_start_ts': 0,
@ -38,41 +38,54 @@ def log_link_archiving_started(link_dir, link, is_new):
print(' > {}{}'.format(pretty_path(link_dir), ' (new)' if is_new else '')) print(' > {}{}'.format(pretty_path(link_dir), ' (new)' if is_new else ''))
def log_link_archiving_failed(cmd, pwd, err=None, hints=None, prefix=' '): def log_archive_method_starting(method):
print(' > {}'.format(method))
def log_archive_method_finished(result):
"""quote the argument with whitespace in a command so the user can """quote the argument with whitespace in a command so the user can
copy-paste the outputted string directly to run the cmd copy-paste the outputted string directly to run the cmd
""" """
required_keys = ('cmd', 'pwd', 'output', 'status', 'start_ts', 'end_ts')
assert (
isinstance(result, dict)
and all(key in result for key in required_keys)
and ('output' in result)
), 'Archive method did not return a valid result.'
# Prettify CMD string and make it save to copy-paste by quoting arguments # Prettify CMD string and make it save to copy-paste by quoting arguments
quoted_cmd = ' '.join( quoted_cmd = ' '.join(
'"{}"'.format(arg) if ' ' in arg else arg '"{}"'.format(arg) if ' ' in arg else arg
for arg in cmd for arg in result['cmd']
) )
# Prettify error output hints string and limit to five lines if result['status'] == 'failed':
hints = hints or getattr(err, 'hints', None) # Prettify error output hints string and limit to five lines
if hints: hints = getattr(result['output'], 'hints', None) or ()
hints = hints if isinstance(hints, (list, tuple)) else hints.split('\n') if hints:
hints = ( hints = hints if isinstance(hints, (list, tuple)) else hints.split('\n')
' {}{}{}'.format(ANSI['lightyellow'], line.strip(), ANSI['reset']) hints = (
for line in hints[:5] if line.strip() ' {}{}{}'.format(ANSI['lightyellow'], line.strip(), ANSI['reset'])
) for line in hints[:5] if line.strip()
else: )
hints = ()
output_lines = [ # Collect and prefix output lines with indentation
'{}Failed: {} {}{}'.format(ANSI['red'], err.__class__.__name__, err, ANSI['reset']), output_lines = [
*hints, '{}Failed:{} {}{}'.format(
'Run to see full output:' ANSI['red'],
' cd {};'.format(pwd), result['output'].__class__.__name__.replace('ArchiveError', ''),
' {}'.format(quoted_cmd), result['output'],
] ANSI['reset']
),
return '\n'.join( *hints,
'{}{}'.format(prefix, line) '{}Run to see full output:{}'.format(ANSI['lightred'], ANSI['reset']),
for line in output_lines ' cd {};'.format(result['pwd']),
if line ' {}'.format(quoted_cmd),
) ]
print('\n'.join(
' {}'.format(line)
for line in output_lines
if line
))
### Logging Helpers ### Logging Helpers
@ -102,7 +115,7 @@ def log_indexing_started():
def log_indexing_finished(out_dir, out_file): def log_indexing_finished(out_dir, out_file):
end_ts = datetime.now() end_ts = datetime.now()
_LAST_RUN_STATS['index_end_ts'] = end_ts _LAST_RUN_STATS['index_end_ts'] = end_ts
print(' > {}/{}'.format(pretty_path(out_dir), out_file)) print(' {}/{}'.format(pretty_path(out_dir), out_file))
def log_archiving_started(num_links, resume): def log_archiving_started(num_links, resume):
start_ts = datetime.now() start_ts = datetime.now()
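
The failure path above relies on the exception itself carrying the extra context: fetchers raise ArchiveError(message, hints), the except block stores the exception as the result's output, and log_archive_method_finished pulls the hints back off it with getattr. A condensed, self-contained sketch of that hand-off (the command, return code, and messages are invented):

    class ArchiveError(Exception):
        def __init__(self, message, hints=None):
            super().__init__(message)
            self.hints = hints

    def fetch_example(link_dir, link):
        output, status = 'example.com/index.html', 'succeeded'
        try:
            returncode, stderr = 8, '403: Forbidden'    # pretend subprocess result
            if returncode > 0:
                raise ArchiveError('Got an error from the server', hints=(
                    'Got wget response code: {}.'.format(returncode),
                    stderr,
                ))
        except Exception as err:
            status, output = 'failed', err              # the exception becomes the output
        return {'cmd': ['wget', link['url']], 'pwd': link_dir, 'output': output, 'status': status}

    # the logger can then recover the hints from the stored exception:
    result = fetch_example('/output/archive/1553160941', {'url': 'https://example.com'})
    hints = getattr(result['output'], 'hints', None) or ()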


@ -28,7 +28,7 @@ from util import (
str_between, str_between,
URL_REGEX, URL_REGEX,
check_url_parsing_invariants, check_url_parsing_invariants,
progress, TimedProgress,
) )
@ -53,13 +53,13 @@ def parse_links(source_file):
# Fallback parser # Fallback parser
('Plain Text', parse_plain_text_export), ('Plain Text', parse_plain_text_export),
) )
end = progress(TIMEOUT * 4, prefix=' ') timer = TimedProgress(TIMEOUT * 4)
with open(source_file, 'r', encoding='utf-8') as file: with open(source_file, 'r', encoding='utf-8') as file:
for parser_name, parser_func in PARSERS: for parser_name, parser_func in PARSERS:
try: try:
links = list(parser_func(file)) links = list(parser_func(file))
if links: if links:
end() timer.end()
return links, parser_name return links, parser_name
except Exception as err: except Exception as err:
# Parsers are tried one by one down the list, and the first one # Parsers are tried one by one down the list, and the first one
@ -68,7 +68,7 @@ def parse_links(source_file):
# print('[!] Parser {} failed: {} {}'.format(parser_name, err.__class__.__name__, err)) # print('[!] Parser {} failed: {} {}'.format(parser_name, err.__class__.__name__, err))
pass pass
end() timer.end()
return [], 'Plain Text' return [], 'Plain Text'


@ -1,167 +0,0 @@
"""
Patches, additions, and shortcuts for Python standard library functions.
"""
### subprocess
from subprocess import (
Popen,
PIPE,
DEVNULL,
CompletedProcess,
TimeoutExpired,
CalledProcessError,
)
def run(*popenargs, input=None, capture_output=False, timeout=None, check=False, **kwargs):
"""Patched of subprocess.run to fix blocking io making timeout=innefective"""
if input is not None:
if 'stdin' in kwargs:
raise ValueError('stdin and input arguments may not both be used.')
kwargs['stdin'] = PIPE
if capture_output:
if ('stdout' in kwargs) or ('stderr' in kwargs):
raise ValueError('stdout and stderr arguments may not be used '
'with capture_output.')
kwargs['stdout'] = PIPE
kwargs['stderr'] = PIPE
with Popen(*popenargs, **kwargs) as process:
try:
stdout, stderr = process.communicate(input, timeout=timeout)
except TimeoutExpired:
process.kill()
try:
stdout, stderr = process.communicate(input, timeout=2)
except:
pass
raise TimeoutExpired(popenargs[0][0], timeout)
except BaseException as err:
process.kill()
# We don't call process.wait() as .__exit__ does that for us.
raise
retcode = process.poll()
if check and retcode:
raise CalledProcessError(retcode, process.args,
output=stdout, stderr=stderr)
return CompletedProcess(process.args, retcode, stdout, stderr)
### collections
from sys import maxsize
from itertools import islice
from collections import deque
_marker = object()
class PeekableGenerator:
"""Peekable version of a normal python generator.
Useful when you don't want to evaluate the entire iterable to look at
a specific item at a given idx.
"""
def __init__(self, iterable):
self._it = iter(iterable)
self._cache = deque()
def __iter__(self):
return self
def __bool__(self):
try:
self.peek()
except StopIteration:
return False
return True
def __nonzero__(self):
# For Python 2 compatibility
return self.__bool__()
def peek(self, default=_marker):
"""Return the item that will be next returned from ``next()``.
Return ``default`` if there are no items left. If ``default`` is not
provided, raise ``StopIteration``.
"""
if not self._cache:
try:
self._cache.append(next(self._it))
except StopIteration:
if default is _marker:
raise
return default
return self._cache[0]
def prepend(self, *items):
"""Stack up items to be the next ones returned from ``next()`` or
``self.peek()``. The items will be returned in
first in, first out order::
>>> p = peekable([1, 2, 3])
>>> p.prepend(10, 11, 12)
>>> next(p)
10
>>> list(p)
[11, 12, 1, 2, 3]
It is possible, by prepending items, to "resurrect" a peekable that
previously raised ``StopIteration``.
>>> p = peekable([])
>>> next(p)
Traceback (most recent call last):
...
StopIteration
>>> p.prepend(1)
>>> next(p)
1
>>> next(p)
Traceback (most recent call last):
...
StopIteration
"""
self._cache.extendleft(reversed(items))
def __next__(self):
if self._cache:
return self._cache.popleft()
return next(self._it)
def _get_slice(self, index):
# Normalize the slice's arguments
step = 1 if (index.step is None) else index.step
if step > 0:
start = 0 if (index.start is None) else index.start
stop = maxsize if (index.stop is None) else index.stop
elif step < 0:
start = -1 if (index.start is None) else index.start
stop = (-maxsize - 1) if (index.stop is None) else index.stop
else:
raise ValueError('slice step cannot be zero')
# If either the start or stop index is negative, we'll need to cache
# the rest of the iterable in order to slice from the right side.
if (start < 0) or (stop < 0):
self._cache.extend(self._it)
# Otherwise we'll need to find the rightmost index and cache to that
# point.
else:
n = min(max(start, stop) + 1, maxsize)
cache_len = len(self._cache)
if n >= cache_len:
self._cache.extend(islice(self._it, n - cache_len))
return list(self._cache)[index]
def __getitem__(self, index):
if isinstance(index, slice):
return self._get_slice(index)
cache_len = len(self._cache)
if index < 0:
self._cache.extend(self._it)
elif index >= cache_len:
self._cache.extend(islice(self._it, index + 1 - cache_len))
return self._cache[index]


@ -8,12 +8,18 @@ from urllib.parse import urlparse, quote
from decimal import Decimal from decimal import Decimal
from datetime import datetime from datetime import datetime
from multiprocessing import Process from multiprocessing import Process
from subprocess import (
Popen,
PIPE,
DEVNULL,
CompletedProcess,
TimeoutExpired,
CalledProcessError,
)
from stdlib_patches import run, PIPE, DEVNULL
from config import ( from config import (
ANSI, ANSI,
TERM_WIDTH, TERM_WIDTH,
REPO_DIR,
SOURCES_DIR, SOURCES_DIR,
ARCHIVE_DIR, ARCHIVE_DIR,
OUTPUT_PERMISSIONS, OUTPUT_PERMISSIONS,
@ -43,6 +49,7 @@ from config import (
CHROME_HEADLESS, CHROME_HEADLESS,
CHROME_SANDBOX, CHROME_SANDBOX,
) )
from logs import pretty_path
### Parsing Helpers ### Parsing Helpers
@ -105,6 +112,17 @@ def check_link_structure(link):
assert isinstance(link.get('url'), str) assert isinstance(link.get('url'), str)
assert len(link['url']) > 2 assert len(link['url']) > 2
assert len(re.findall(URL_REGEX, link['url'])) == 1 assert len(re.findall(URL_REGEX, link['url'])) == 1
if 'history' in link:
assert isinstance(link['history'], dict), 'history must be a Dict'
for key, val in link['history'].items():
assert isinstance(key, str)
assert isinstance(val, list), 'history must be a Dict[str, List], got: {}'.format(link['history'])
if 'latest' in link:
assert isinstance(link['latest'], dict), 'latest must be a Dict'
for key, val in link['latest'].items():
assert isinstance(key, str)
assert (val is None) or isinstance(val, (str, Exception)), 'latest must be a Dict[str, Optional[str]], got: {}'.format(link['latest'])
def check_links_structure(links): def check_links_structure(links):
"""basic sanity check invariants to make sure the data is valid""" """basic sanity check invariants to make sure the data is valid"""
@ -236,12 +254,12 @@ def save_remote_source(url, timeout=TIMEOUT):
url, url,
ANSI['reset'], ANSI['reset'],
)) ))
end = progress(TIMEOUT, prefix=' ') timer = TimedProgress(timeout, prefix=' ')
try: try:
downloaded_xml = download_url(url, timeout=timeout) downloaded_xml = download_url(url, timeout=timeout)
end() timer.end()
except Exception as e: except Exception as e:
end() timer.end()
print('{}[!] Failed to download {}{}\n'.format( print('{}[!] Failed to download {}{}\n'.format(
ANSI['red'], ANSI['red'],
url, url,
@ -291,9 +309,9 @@ def wget_output_path(link):
return link['latest']['wget'] return link['latest']['wget']
if is_static_file(link['url']): if is_static_file(link['url']):
return urlencode(without_scheme(without_fragment(link['url']))) return without_scheme(without_fragment(link['url']))
# Wget downloads can save in a number of different ways depending on the url # Wget downloads can save in a number of different ways depending on the url:
# https://example.com # https://example.com
# > output/archive/<timestamp>/example.com/index.html # > output/archive/<timestamp>/example.com/index.html
# https://example.com/abc # https://example.com/abc
@ -302,6 +320,10 @@ def wget_output_path(link):
# > output/archive/<timestamp>/example.com/abc/index.html # > output/archive/<timestamp>/example.com/abc/index.html
# https://example.com/abc/test.html # https://example.com/abc/test.html
# > output/archive/<timestamp>/example.com/abc/test.html # > output/archive/<timestamp>/example.com/abc/test.html
# https://example.com/abc/test?v=zzVa_tX1OiI
# > output/archive/<timestamp>/example.com/abc/test?v=zzVa_tX1OiI.html
# https://example.com/abc/test/?v=zzVa_tX1OiI
# > output/archive/<timestamp>/example.com/abc/test/index.html?v=zzVa_tX1OiI.html
# There's also lots of complexity around how the urlencoding and renaming # There's also lots of complexity around how the urlencoding and renaming
# is done for pages with query and hash fragments or extensions like shtml / htm # is done for pages with query and hash fragments or extensions like shtml / htm
@ -327,7 +349,7 @@ def wget_output_path(link):
] ]
if html_files: if html_files:
path_from_link_dir = search_dir.split(link_dir)[-1].strip('/') path_from_link_dir = search_dir.split(link_dir)[-1].strip('/')
return urlencode(os.path.join(path_from_link_dir, html_files[0])) return os.path.join(path_from_link_dir, html_files[0])
# Move up one directory level # Move up one directory level
search_dir = search_dir.rsplit('/', 1)[0] search_dir = search_dir.rsplit('/', 1)[0]
@ -456,69 +478,109 @@ def derived_link_info(link):
### Python / System Helpers ### Python / System Helpers
def progress(seconds=TIMEOUT, prefix=''): def run(*popenargs, input=None, capture_output=False, timeout=None, check=False, **kwargs):
"""Show a (subprocess-controlled) progress bar with a <seconds> timeout, """Patched of subprocess.run to fix blocking io making timeout=innefective"""
returns end() function to instantly finish the progress
"""
if not SHOW_PROGRESS: if input is not None:
return lambda: None if 'stdin' in kwargs:
raise ValueError('stdin and input arguments may not both be used.')
kwargs['stdin'] = PIPE
def progress_bar(seconds, prefix): if capture_output:
"""show timer in the form of progress bar, with percentage and seconds remaining""" if ('stdout' in kwargs) or ('stderr' in kwargs):
chunk = '' if sys.stdout.encoding == 'UTF-8' else '#' raise ValueError('stdout and stderr arguments may not be used '
chunks = TERM_WIDTH - len(prefix) - 20 # number of progress chunks to show (aka max bar width) 'with capture_output.')
kwargs['stdout'] = PIPE
kwargs['stderr'] = PIPE
with Popen(*popenargs, **kwargs) as process:
try: try:
for s in range(seconds * chunks): stdout, stderr = process.communicate(input, timeout=timeout)
progress = s / chunks / seconds * 100 except TimeoutExpired:
bar_width = round(progress/(100/chunks)) process.kill()
try:
stdout, stderr = process.communicate(input, timeout=2)
except:
pass
raise TimeoutExpired(popenargs[0][0], timeout)
except BaseException:
process.kill()
# We don't call process.wait() as .__exit__ does that for us.
raise
retcode = process.poll()
if check and retcode:
raise CalledProcessError(retcode, process.args,
output=stdout, stderr=stderr)
return CompletedProcess(process.args, retcode, stdout, stderr)
# ████████████████████ 0.9% (1/60sec)
sys.stdout.write('\r{0}{1}{2}{3} {4}% ({5}/{6}sec)'.format(
prefix,
ANSI['green'],
(chunk * bar_width).ljust(chunks),
ANSI['reset'],
round(progress, 1),
round(s/chunks),
seconds,
))
sys.stdout.flush()
time.sleep(1 / chunks)
# ██████████████████████████████████ 100.0% (60/60sec) def progress_bar(seconds, prefix):
sys.stdout.write('\r{0}{1}{2}{3} {4}% ({5}/{6}sec)\n'.format( """show timer in the form of progress bar, with percentage and seconds remaining"""
chunk = '' if sys.stdout.encoding == 'UTF-8' else '#'
chunks = TERM_WIDTH - len(prefix) - 20 # number of progress chunks to show (aka max bar width)
try:
for s in range(seconds * chunks):
progress = s / chunks / seconds * 100
bar_width = round(progress/(100/chunks))
# ████████████████████ 0.9% (1/60sec)
sys.stdout.write('\r{0}{1}{2}{3} {4}% ({5}/{6}sec)'.format(
prefix, prefix,
ANSI['red'], ANSI['green'],
chunk * chunks, (chunk * bar_width).ljust(chunks),
ANSI['reset'], ANSI['reset'],
100.0, round(progress, 1),
seconds, round(s/chunks),
seconds, seconds,
)) ))
sys.stdout.flush() sys.stdout.flush()
except KeyboardInterrupt: time.sleep(1 / chunks)
print()
pass
p = Process(target=progress_bar, args=(seconds, prefix)) # ██████████████████████████████████ 100.0% (60/60sec)
p.start() sys.stdout.write('\r{0}{1}{2}{3} {4}% ({5}/{6}sec)\n'.format(
prefix,
ANSI['red'],
chunk * chunks,
ANSI['reset'],
100.0,
seconds,
seconds,
))
sys.stdout.flush()
except KeyboardInterrupt:
print()
pass
def end(): class TimedProgress:
def __init__(self, seconds, prefix=''):
if SHOW_PROGRESS:
self.p = Process(target=progress_bar, args=(seconds, prefix))
self.p.start()
self.stats = {
'start_ts': datetime.now(),
'end_ts': None,
'duration': None,
}
def end(self):
"""immediately finish progress and clear the progressbar line""" """immediately finish progress and clear the progressbar line"""
# protect from double termination end_ts = datetime.now()
#if p is None or not hasattr(p, 'kill'): self.stats.update({
# return 'end_ts': end_ts,
nonlocal p 'duration': (end_ts - self.stats['start_ts']).seconds,
if p is not None: })
p.terminate()
p = None
sys.stdout.write('\r{}{}\r'.format((' ' * TERM_WIDTH), ANSI['reset'])) # clear whole terminal line if SHOW_PROGRESS:
sys.stdout.flush() # protect from double termination
#if p is None or not hasattr(p, 'kill'):
# return
if self.p is not None:
self.p.terminate()
self.p = None
return end sys.stdout.write('\r{}{}\r'.format((' ' * TERM_WIDTH), ANSI['reset'])) # clear whole terminal line
sys.stdout.flush()
def download_url(url, timeout=TIMEOUT): def download_url(url, timeout=TIMEOUT):
req = Request(url, headers={'User-Agent': WGET_USER_AGENT}) req = Request(url, headers={'User-Agent': WGET_USER_AGENT})
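
TimedProgress above folds the old progress() helper and the start/end timestamps into one object: callers construct it, wrap the work in try/finally, and splat timer.stats into the result dict they return. A minimal usage sketch, assuming the TimedProgress class defined in the util diff above (the work function, binary name, and timeout value are placeholders):

    TIMEOUT = 60  # placeholder for the real config value

    def do_the_work(link_dir, link, timeout):
        """Placeholder for the actual subprocess call a fetch_* method makes."""
        pass

    def fetch_something(link_dir, link, timeout=TIMEOUT):
        output = 'output.bin'
        status = 'succeeded'
        timer = TimedProgress(timeout, prefix='    ')
        try:
            do_the_work(link_dir, link, timeout=timeout)
        except Exception as err:
            status = 'failed'
            output = err
        finally:
            timer.end()   # always stop the progress bar and record end_ts/duration

        return {
            'cmd': ['some-binary', link['url']],
            'pwd': link_dir,
            'output': output,
            'status': status,
            **timer.stats,   # start_ts, end_ts, duration
        }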