
major codebase-wide code cleanups

Nick Sweeting 2019-03-21 01:28:12 -04:00
parent c806068683
commit e6bd1f8ca8
8 changed files with 825 additions and 743 deletions

View file

@ -1,225 +1,132 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
# ArchiveBox """
# Nick Sweeting 2017 | MIT License ArchiveBox command line application.
# https://github.com/pirate/ArchiveBox
./archive and ./bin/archivebox both point to this file,
but you can also run it directly using `python3 archive.py`
Usage & Documentation:
https://github.com/pirate/ArchiveBox/Wiki
"""
import os import os
import sys import sys
from datetime import datetime from links import links_after_timestamp
from peekable import Peekable from index import write_links_index, load_links_index
from archive_methods import archive_link
from parse import parse_links
from links import validate_links, links_after_timestamp
from archive_methods import archive_link, _RESULTS_TOTALS
from index import (
write_links_index,
parse_json_links_index,
)
from config import ( from config import (
ARCHIVE_DIR, ARCHIVE_DIR,
ONLY_NEW, ONLY_NEW,
OUTPUT_DIR, OUTPUT_DIR,
REPO_DIR,
ANSI,
GIT_SHA, GIT_SHA,
) )
from util import ( from util import (
check_dependencies, check_dependencies,
save_remote_source, save_remote_source,
save_stdin_source, save_stdin_source,
pretty_path, )
check_links_structure, from logs import (
log_archiving_started,
log_archiving_paused,
log_archiving_finished,
) )
__AUTHOR__ = 'Nick Sweeting <git@nicksweeting.com>' __AUTHOR__ = 'Nick Sweeting <git@nicksweeting.com>'
__VERSION__ = GIT_SHA __VERSION__ = GIT_SHA
__DESCRIPTION__ = 'ArchiveBox Usage: Create a browsable html archive of a list of links.' __DESCRIPTION__ = 'ArchiveBox: The self-hosted internet archive.'
__DOCUMENTATION__ = 'https://github.com/pirate/ArchiveBox/wiki' __DOCUMENTATION__ = 'https://github.com/pirate/ArchiveBox/wiki'
def print_help(): def print_help():
print(__DESCRIPTION__) print('ArchiveBox: The self-hosted internet archive.\n')
print("Documentation: {}\n".format(__DOCUMENTATION__)) print("Documentation:")
print(" https://github.com/pirate/ArchiveBox/wiki\n")
print("Usage:") print("Usage:")
print(" ./bin/archivebox ~/Downloads/bookmarks_export.html\n")
print("")
print(" ./bin/archivebox https://example.com/feed.rss\n")
print("")
print(" echo 'https://examplecom' | ./bin/archivebox\n") print(" echo 'https://examplecom' | ./bin/archivebox\n")
print(" ./bin/archivebox ~/Downloads/bookmarks_export.html\n")
print(" ./bin/archivebox https://example.com/feed.rss\n")
print(" ./bin/archivebox 15109948213.123\n")
def load_links(archive_path=OUTPUT_DIR, import_path=None): def main(*args):
"""get new links from file and optionally append them to links in existing archive""" if set(args).intersection(('-h', '--help', 'help')) or len(args) > 2:
existing_links = []
if archive_path:
existing_links = parse_json_links_index(archive_path)
check_links_structure(existing_links)
new_links = []
if import_path:
# parse and validate the import file
raw_links, parser_name = parse_links(import_path)
new_links = validate_links(raw_links)
check_links_structure(new_links)
# merge existing links in archive_path and new links
all_links = validate_links(existing_links + new_links)
check_links_structure(all_links)
num_new_links = len(all_links) - len(existing_links)
if import_path and parser_name:
print(' > Adding {} new links to index (parsed import as {})'.format(
num_new_links,
parser_name,
))
return all_links, new_links
def update_archive(archive_path, links, source=None, resume=None, append=True):
"""update or create index.html+json given a path to an export file containing new links"""
start_ts = datetime.now().timestamp()
if resume:
print('{green}[▶] [{}] Resuming archive downloading from {}...{reset}'.format(
datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
resume,
**ANSI,
))
else:
print('{green}[▶] [{}] Updating content for {} pages in archive...{reset}'.format(
datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
len(links),
**ANSI,
))
check_links_structure(links)
# prefetch the first link off the generator so that if we pause or fail
# immediately we can show that we paused on the first link and not just None
to_archive = Peekable(links_after_timestamp(links, resume))
idx, link = 0, to_archive.peek(0)
# loop over links and archive them
try:
check_dependencies()
for idx, link in enumerate(to_archive):
link_dir = os.path.join(ARCHIVE_DIR, link['timestamp'])
archive_link(link_dir, link)
except (KeyboardInterrupt, SystemExit, Exception) as e:
# if isinstance(e, KeyboardInterrupt):
# # Step 4: Re-write links index with updated titles, icons, and resources
# all_links, _ = load_links(archive_path=out_dir)
# write_links_index(out_dir=out_dir, links=all_links, finished=True)
print()
print('\n{lightyellow}[X] [{now}] Downloading paused on link {timestamp} ({idx}/{total}){reset}'.format(
**ANSI,
now=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
idx=idx+1,
timestamp=link['timestamp'],
total=len(links),
))
print(' To view your archive, open: {}/index.html'.format(OUTPUT_DIR.replace(REPO_DIR + '/', '')))
print(' Continue where you left off by running:')
print(' {} {}'.format(
pretty_path(sys.argv[0]),
link['timestamp'],
))
if not isinstance(e, KeyboardInterrupt):
print()
raise e
raise SystemExit(1)
# print timing information & summary
end_ts = datetime.now().timestamp()
seconds = end_ts - start_ts
if seconds > 60:
duration = '{0:.2f} min'.format(seconds / 60, 2)
else:
duration = '{0:.2f} sec'.format(seconds, 2)
print('{}[√] [{}] Update of {} pages complete ({}){}'.format(
ANSI['green'],
datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
len(links),
duration,
ANSI['reset'],
))
print(' - {} entries skipped'.format(_RESULTS_TOTALS['skipped']))
print(' - {} entries updated'.format(_RESULTS_TOTALS['succeded']))
print(' - {} errors'.format(_RESULTS_TOTALS['failed']))
print(' To view your archive, open: {}/index.html'.format(OUTPUT_DIR.replace(REPO_DIR + '/', '')))
if __name__ == '__main__':
argc = len(sys.argv)
if set(sys.argv).intersection(('-h', '--help', 'help')):
print_help() print_help()
raise SystemExit(0) raise SystemExit(0)
source = sys.argv[1] if argc > 1 else None # path of links file to import ### Handle CLI arguments
resume = sys.argv[2] if argc > 2 else None # timestamp to resume downloading from # ./archive bookmarks.html
# ./archive 1523422111.234
stdin_raw_text = '' import_path, resume = None, None
if len(args) == 2:
# if the argument is a string, it's an import_path file to import
# if it's a number, it's a timestamp to resume archiving from
if args[1].replace('.', '').isdigit():
import_path, resume = None, args[1]
else:
import_path, resume = args[1], None
### Set up output folder
if not os.path.exists(OUTPUT_DIR):
os.makedirs(OUTPUT_DIR)
### Handle ingesting urls piped in through stdin
# (e.g. if user does cat example_urls.txt | ./archive)
if not sys.stdin.isatty(): if not sys.stdin.isatty():
stdin_raw_text = sys.stdin.read() stdin_raw_text = sys.stdin.read()
if stdin_raw_text and import_path:
print(
'[X] You should pass either a path as an argument, '
'or pass a list of links via stdin, but not both.\n'
)
print_help()
raise SystemExit(1)
if source and stdin_raw_text: import_path = save_stdin_source(stdin_raw_text)
print(
'[X] You should pass either a path as an argument, ' ### Handle ingesting urls from a remote file/feed
'or pass a list of links via stdin, but not both.\n' # (e.g. if an RSS feed URL is used as the import path)
) if import_path and any(import_path.startswith(s) for s in ('http://', 'https://', 'ftp://')):
print_help() import_path = save_remote_source(import_path)
raise SystemExit(1)
### Run the main archive update process
update_archive_data(import_path=import_path, resume=resume)
if argc == 1: def update_archive_data(import_path=None, resume=None):
source, resume = None, None """The main ArchiveBox entrancepoint. Everything starts here."""
elif argc == 2: check_dependencies()
if all(d.isdigit() for d in sys.argv[1].split('.')):
# argv[1] is a resume timestamp
source, resume = None, sys.argv[1]
else:
# argv[1] is a path to a file to import
source, resume = sys.argv[1].strip(), None
elif argc == 3:
source, resume = sys.argv[1].strip(), sys.argv[2]
else:
print_help()
raise SystemExit(1)
# See if archive folder already exists # Step 1: Load list of links from the existing index
for out_dir in (OUTPUT_DIR, 'bookmarks', 'pocket', 'pinboard', 'html'): # merge in and dedupe new links from import_path
if os.path.exists(out_dir): all_links, new_links = load_links_index(out_dir=OUTPUT_DIR, import_path=import_path)
break
else:
out_dir = OUTPUT_DIR
# Step 0: Download url to local file (only happens if a URL is specified instead of local path) # Step 2: Write updated index with deduped old and new links back to disk
if source and any(source.startswith(s) for s in ('http://', 'https://', 'ftp://')): write_links_index(out_dir=OUTPUT_DIR, links=all_links)
source = save_remote_source(source)
elif stdin_raw_text:
source = save_stdin_source(stdin_raw_text)
# Step 1: Parse the links and dedupe them with existing archive
all_links, new_links = load_links(archive_path=out_dir, import_path=source)
# Step 2: Write new index
write_links_index(out_dir=out_dir, links=all_links)
# Step 3: Run the archive methods for each link # Step 3: Run the archive methods for each link
if ONLY_NEW: links = new_links if ONLY_NEW else all_links
update_archive(out_dir, new_links, source=source, resume=resume, append=True) log_archiving_started(len(links), resume)
else: idx, link = 0, 0
update_archive(out_dir, all_links, source=source, resume=resume, append=True) try:
for idx, link in enumerate(links_after_timestamp(links, resume)):
link_dir = os.path.join(ARCHIVE_DIR, link['timestamp'])
archive_link(link_dir, link)
except KeyboardInterrupt:
log_archiving_paused(len(links), idx, link and link['timestamp'])
raise SystemExit(0)
except:
print()
raise
log_archiving_finished(len(links))
# Step 4: Re-write links index with updated titles, icons, and resources # Step 4: Re-write links index with updated titles, icons, and resources
all_links, _ = load_links(archive_path=out_dir) all_links, _ = load_links_index(out_dir=OUTPUT_DIR)
write_links_index(out_dir=out_dir, links=all_links, finished=True) write_links_index(out_dir=OUTPUT_DIR, links=all_links, finished=True)
if __name__ == '__main__':
main(*sys.argv)

View file

@ -3,18 +3,18 @@ import os
from functools import wraps from functools import wraps
from collections import defaultdict from collections import defaultdict
from datetime import datetime from datetime import datetime
from stdlib_patches import run, PIPE, DEVNULL
from index import ( from index import (
parse_json_link_index,
write_link_index, write_link_index,
update_main_index, patch_links_index,
load_json_link_index,
) )
from config import ( from config import (
CURL_BINARY, CURL_BINARY,
GIT_BINARY, GIT_BINARY,
WGET_BINARY, WGET_BINARY,
YOUTUBEDL_BINARY, YOUTUBEDL_BINARY,
CHROME_BINARY,
FETCH_FAVICON, FETCH_FAVICON,
FETCH_TITLE, FETCH_TITLE,
FETCH_WGET, FETCH_WGET,
@ -25,62 +25,37 @@ from config import (
FETCH_WARC, FETCH_WARC,
FETCH_GIT, FETCH_GIT,
FETCH_MEDIA, FETCH_MEDIA,
RESOLUTION,
CHECK_SSL_VALIDITY,
SUBMIT_ARCHIVE_DOT_ORG, SUBMIT_ARCHIVE_DOT_ORG,
COOKIES_FILE,
WGET_USER_AGENT,
CHROME_USER_AGENT,
CHROME_USER_DATA_DIR,
CHROME_HEADLESS,
CHROME_SANDBOX,
TIMEOUT, TIMEOUT,
MEDIA_TIMEOUT, MEDIA_TIMEOUT,
ANSI, ANSI,
ARCHIVE_DIR, OUTPUT_DIR,
GIT_DOMAINS, GIT_DOMAINS,
GIT_SHA, GIT_SHA,
WGET_USER_AGENT,
CHECK_SSL_VALIDITY,
COOKIES_FILE,
) )
from util import ( from util import (
domain, domain,
extension,
without_query, without_query,
without_fragment, without_fragment,
fetch_page_title, fetch_page_title,
is_static_file, is_static_file,
progress, progress,
chmod_file, chmod_file,
pretty_path,
print_error_hints,
check_link_structure, check_link_structure,
wget_output_path, wget_output_path,
run, PIPE, DEVNULL, chrome_args,
)
from logs import (
_LAST_RUN_STATS,
log_link_archiving_started,
log_link_archiving_failed,
) )
_RESULTS_TOTALS = { # globals are bad, mmkay
'skipped': 0,
'succeded': 0,
'failed': 0,
}
def load_link_index(link_dir, link):
"""check for an existing link archive in the given directory,
and load+merge it into the given link dict
"""
is_new = not os.path.exists(link_dir)
if is_new:
os.makedirs(link_dir)
else:
link = {
**parse_json_link_index(link_dir),
**link,
}
check_link_structure(link)
print_link_status_line(link_dir, link, is_new)
return link
class ArchiveError(Exception): class ArchiveError(Exception):
def __init__(self, message, hints=None): def __init__(self, message, hints=None):
@ -105,32 +80,24 @@ def archive_link(link_dir, link, overwrite=True):
active_methods = [method for toggle, method in ARCHIVE_METHODS if toggle] active_methods = [method for toggle, method in ARCHIVE_METHODS if toggle]
try: try:
link = load_link_index(link_dir, link) is_new = not os.path.exists(link_dir)
if is_new:
os.makedirs(link_dir)
link = load_json_link_index(link_dir, link)
log_link_archiving_started(link_dir, link, is_new)
for archive_method in active_methods: for archive_method in active_methods:
archive_method(link_dir, link, overwrite=overwrite) archive_method(link_dir, link, overwrite=overwrite)
write_link_index(link_dir, link) write_link_index(link_dir, link)
update_main_index(link) patch_links_index(link)
except Exception as err: except Exception as err:
print(' ! Failed to archive link: {}: {}'.format(err.__class__.__name__, err)) print(' ! Failed to archive link: {}: {}'.format(err.__class__.__name__, err))
return link return link
def print_link_status_line(link_dir, link, is_new):
print('[{symbol_color}{symbol}{reset}] [{now}] "{title}"\n {blue}{url}{reset}'.format(
symbol='+' if is_new else '*',
symbol_color=ANSI['green' if is_new else 'black'],
now=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
**{**link, 'title': link['title'] or link['url']},
**ANSI,
))
print(' > {}{}'.format(pretty_path(link_dir), ' (new)' if is_new else ''))
def attach_result_to_link(method): def attach_result_to_link(method):
""" """
@ -178,15 +145,75 @@ def attach_result_to_link(method):
link['history'][method].append(history_entry) link['history'][method].append(history_entry)
link['latest'][method] = result['output'] link['latest'][method] = result['output']
_RESULTS_TOTALS[history_entry['status']] += 1 _LAST_RUN_STATS[history_entry['status']] += 1
return link return link
return timed_fetch_func return timed_fetch_func
return decorator return decorator
@attach_result_to_link('title')
def fetch_title(link_dir, link, timeout=TIMEOUT):
"""try to guess the page's title from its content"""
# if link already has valid title, skip it
if link['title'] and not link['title'].lower().startswith('http'):
return {'output': link['title'], 'status': 'skipped'}
if is_static_file(link['url']):
return {'output': None, 'status': 'skipped'}
title = None # ensure title is defined even if fetch_page_title() raises before assigning it
end = progress(timeout, prefix=' ')
try:
title = fetch_page_title(link['url'], timeout=timeout, progress=False)
end()
output = title
except Exception as e:
end()
output = e
print(' {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
if title and title.strip():
link['title'] = title
output = title
return {
'cmd': 'fetch_page_title("{}")'.format(link['url']),
'output': output,
}
@attach_result_to_link('favicon')
def fetch_favicon(link_dir, link, timeout=TIMEOUT):
"""download site favicon from google's favicon api"""
output = 'favicon.ico'
if os.path.exists(os.path.join(link_dir, output)):
return {'output': output, 'status': 'skipped'}
CMD = [
CURL_BINARY,
'--max-time', str(timeout),
'--location',
'--output', output,
*(() if CHECK_SSL_VALIDITY else ('--insecure',)),
'https://www.google.com/s2/favicons?domain={}'.format(domain(link['url'])),
]
end = progress(timeout, prefix=' ')
try:
run(CMD, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout)
end()
chmod_file(output, cwd=link_dir)
except Exception as e:
end()
output = e
print_error_hints(cmd=CMD, pwd=link_dir, err=e)
return {
'cmd': CMD,
'output': output,
}
@attach_result_to_link('wget') @attach_result_to_link('wget')
def fetch_wget(link_dir, link, requisites=FETCH_WGET_REQUISITES, warc=FETCH_WARC, timeout=TIMEOUT): def fetch_wget(link_dir, link, timeout=TIMEOUT):
"""download full site using wget""" """download full site using wget"""
domain_dir = os.path.join(link_dir, domain(link['url'])) domain_dir = os.path.join(link_dir, domain(link['url']))
@ -194,7 +221,7 @@ def fetch_wget(link_dir, link, requisites=FETCH_WGET_REQUISITES, warc=FETCH_WARC
if os.path.exists(domain_dir) and existing_file: if os.path.exists(domain_dir) and existing_file:
return {'output': existing_file, 'status': 'skipped'} return {'output': existing_file, 'status': 'skipped'}
if warc: if FETCH_WARC:
warc_dir = os.path.join(link_dir, 'warc') warc_dir = os.path.join(link_dir, 'warc')
os.makedirs(warc_dir, exist_ok=True) os.makedirs(warc_dir, exist_ok=True)
warc_path = os.path.join('warc', str(int(datetime.now().timestamp()))) warc_path = os.path.join('warc', str(int(datetime.now().timestamp())))
@ -213,8 +240,8 @@ def fetch_wget(link_dir, link, requisites=FETCH_WGET_REQUISITES, warc=FETCH_WARC
'-e', 'robots=off', '-e', 'robots=off',
'--restrict-file-names=unix', '--restrict-file-names=unix',
'--timeout={}'.format(timeout), '--timeout={}'.format(timeout),
*(() if warc else ('--timestamping',)), *(() if FETCH_WARC else ('--timestamping',)),
*(('--warc-file={}'.format(warc_path),) if warc else ()), *(('--warc-file={}'.format(warc_path),) if FETCH_WARC else ()),
*(('--page-requisites',) if FETCH_WGET_REQUISITES else ()), *(('--page-requisites',) if FETCH_WGET_REQUISITES else ()),
*(('--user-agent={}'.format(WGET_USER_AGENT),) if WGET_USER_AGENT else ()), *(('--user-agent={}'.format(WGET_USER_AGENT),) if WGET_USER_AGENT else ()),
*(('--load-cookies', COOKIES_FILE) if COOKIES_FILE else ()), *(('--load-cookies', COOKIES_FILE) if COOKIES_FILE else ()),
@ -233,7 +260,8 @@ def fetch_wget(link_dir, link, requisites=FETCH_WGET_REQUISITES, warc=FETCH_WARC
if line.strip() if line.strip()
] ]
# parse out number of files downloaded from "Downloaded: 76 files, 4.0M in 1.6s (2.52 MB/s)" # parse out number of files downloaded from last line of stderr:
# "Downloaded: 76 files, 4.0M in 1.6s (2.52 MB/s)"
files_downloaded = ( files_downloaded = (
int(output_tail[-1].strip().split(' ', 2)[1] or 0) int(output_tail[-1].strip().split(' ', 2)[1] or 0)
if 'Downloaded:' in output_tail[-1] if 'Downloaded:' in output_tail[-1]
@ -263,20 +291,19 @@ def fetch_wget(link_dir, link, requisites=FETCH_WGET_REQUISITES, warc=FETCH_WARC
'output': output, 'output': output,
} }
@attach_result_to_link('pdf') @attach_result_to_link('pdf')
def fetch_pdf(link_dir, link, timeout=TIMEOUT, **chrome_kwargs): def fetch_pdf(link_dir, link, timeout=TIMEOUT):
"""print PDF of site to file using chrome --headless""" """print PDF of site to file using chrome --headless"""
if is_static_file(link['url']): if is_static_file(link['url']):
return {'output': wget_output_path(link), 'status': 'skipped'} return {'output': None, 'status': 'skipped'}
output = 'output.pdf' output = 'output.pdf'
if os.path.exists(os.path.join(link_dir, output)): if os.path.exists(os.path.join(link_dir, output)):
return {'output': output, 'status': 'skipped'} return {'output': output, 'status': 'skipped'}
CMD = [ CMD = [
*chrome_headless(timeout=timeout, **chrome_kwargs), *chrome_args(timeout=timeout),
'--print-to-pdf', '--print-to-pdf',
link['url'] link['url']
] ]
@ -302,18 +329,18 @@ def fetch_pdf(link_dir, link, timeout=TIMEOUT, **chrome_kwargs):
} }
@attach_result_to_link('screenshot') @attach_result_to_link('screenshot')
def fetch_screenshot(link_dir, link, timeout=TIMEOUT, **chrome_kwargs): def fetch_screenshot(link_dir, link, timeout=TIMEOUT):
"""take screenshot of site using chrome --headless""" """take screenshot of site using chrome --headless"""
if is_static_file(link['url']): if is_static_file(link['url']):
return {'output': wget_output_path(link), 'status': 'skipped'} return {'output': None, 'status': 'skipped'}
output = 'screenshot.png' output = 'screenshot.png'
if os.path.exists(os.path.join(link_dir, output)): if os.path.exists(os.path.join(link_dir, output)):
return {'output': output, 'status': 'skipped'} return {'output': output, 'status': 'skipped'}
CMD = [ CMD = [
*chrome_headless(timeout=timeout, **chrome_kwargs), *chrome_args(timeout=timeout),
'--screenshot', '--screenshot',
link['url'], link['url'],
] ]
@ -337,18 +364,19 @@ def fetch_screenshot(link_dir, link, timeout=TIMEOUT, **chrome_kwargs):
} }
@attach_result_to_link('dom') @attach_result_to_link('dom')
def fetch_dom(link_dir, link, timeout=TIMEOUT, **chrome_kwargs): def fetch_dom(link_dir, link, timeout=TIMEOUT):
"""print HTML of site to file using chrome --dump-html""" """print HTML of site to file using chrome --dump-html"""
if is_static_file(link['url']): if is_static_file(link['url']):
return {'output': wget_output_path(link), 'status': 'skipped'} return {'output': None, 'status': 'skipped'}
output = 'output.html' output = 'output.html'
if os.path.exists(os.path.join(link_dir, output)): output_path = os.path.join(link_dir, output)
if os.path.exists(output_path):
return {'output': output, 'status': 'skipped'} return {'output': output, 'status': 'skipped'}
CMD = [ CMD = [
*chrome_headless(timeout=timeout, **chrome_kwargs), *chrome_args(timeout=timeout),
'--dump-dom', '--dump-dom',
link['url'] link['url']
] ]
@ -372,6 +400,116 @@ def fetch_dom(link_dir, link, timeout=TIMEOUT, **chrome_kwargs):
'output': output, 'output': output,
} }
@attach_result_to_link('git')
def fetch_git(link_dir, link, timeout=TIMEOUT):
"""download full site using git"""
is_clonable_url = (
domain(link['url']) in GIT_DOMAINS
or extension(link['url']) == 'git'
)
if is_static_file(link['url']) or not is_clonable_url:
return {'output': None, 'status': 'skipped'}
output = 'git'
output_path = os.path.join(link_dir, 'git')
if os.path.exists(output_path):
return {'output': output, 'status': 'skipped'}
os.makedirs(output_path, exist_ok=True)
CMD = [
GIT_BINARY,
'clone',
'--mirror',
'--recursive',
*(() if CHECK_SSL_VALIDITY else ('-c', 'http.sslVerify=false')),
without_query(without_fragment(link['url'])),
]
end = progress(timeout, prefix=' ')
try:
result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=output_path, timeout=timeout + 1)
end()
if result.returncode == 128:
# ignore failed re-download when the folder already exists
pass
elif result.returncode > 0:
hints = 'got git response code {}:'.format(result.returncode)
raise ArchiveError('Failed git download', hints)
except Exception as e:
end()
output = e
print_error_hints(cmd=CMD, pwd=link_dir, err=e)
return {
'cmd': CMD,
'output': output,
}
@attach_result_to_link('media')
def fetch_media(link_dir, link, timeout=MEDIA_TIMEOUT, overwrite=False):
"""Download playlists or individual video, audio, and subtitles using youtube-dl"""
output = 'media'
output_path = os.path.join(link_dir, 'media')
if os.path.exists(output_path) and not overwrite:
return {'output': output, 'status': 'skipped'}
os.makedirs(output_path, exist_ok=True)
CMD = [
YOUTUBEDL_BINARY,
'--write-description',
'--write-info-json',
'--write-annotations',
'--yes-playlist',
'--write-thumbnail',
'--no-call-home',
'--no-check-certificate',
'--user-agent',
'--all-subs',
'--extract-audio',
'--keep-video',
'--ignore-errors',
'--geo-bypass',
'--audio-format', 'mp3',
'--audio-quality', '320K',
'--embed-thumbnail',
'--add-metadata',
*(() if CHECK_SSL_VALIDITY else ('--no-check-certificate',)),
link['url'],
]
end = progress(timeout, prefix=' ')
try:
result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=output_path, timeout=timeout + 1)
chmod_file(output, cwd=link_dir)
end()
if result.returncode:
if (b'ERROR: Unsupported URL' in result.stderr
or b'HTTP Error 404' in result.stderr
or b'HTTP Error 403' in result.stderr
or b'URL could be a direct video link' in result.stderr
or b'Unable to extract container ID' in result.stderr):
# These happen too frequently on non-media pages to warrant printing to console
pass
else:
hints = (
'got youtubedl response code {}:'.format(result.returncode),
*result.stderr.decode().split('\n'),
)
raise ArchiveError('Failed to download media', hints)
except Exception as e:
end()
output = e
print_error_hints(cmd=CMD, pwd=link_dir, err=e)
return {
'cmd': CMD,
'output': output,
}
def parse_archive_dot_org_response(response): def parse_archive_dot_org_response(response):
# Parse archive.org response headers # Parse archive.org response headers
headers = defaultdict(list) headers = defaultdict(list)
@ -445,226 +583,4 @@ def archive_dot_org(link_dir, link, timeout=TIMEOUT):
'output': output, 'output': output,
} }
@attach_result_to_link('favicon')
def fetch_favicon(link_dir, link, timeout=TIMEOUT):
"""download site favicon from google's favicon api"""
output = 'favicon.ico'
if os.path.exists(os.path.join(link_dir, output)):
return {'output': output, 'status': 'skipped'}
CMD = [
CURL_BINARY,
'--max-time', str(timeout),
'--location',
'--output', output,
*(() if CHECK_SSL_VALIDITY else ('--insecure',)),
'https://www.google.com/s2/favicons?domain={}'.format(domain(link['url'])),
]
end = progress(timeout, prefix=' ')
try:
run(CMD, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout)
end()
chmod_file(output, cwd=link_dir)
except Exception as e:
end()
output = e
print_error_hints(cmd=CMD, pwd=link_dir, err=e)
return {
'cmd': CMD,
'output': output,
}
@attach_result_to_link('title')
def fetch_title(link_dir, link, timeout=TIMEOUT):
"""try to guess the page's title from its content"""
# if link already has valid title, skip it
if link['title'] and not link['title'].lower().startswith('http'):
return {'output': link['title'], 'status': 'skipped'}
if is_static_file(link['url']):
return {'output': None, 'status': 'skipped'}
end = progress(timeout, prefix=' ')
try:
title = fetch_page_title(link['url'], timeout=timeout, progress=False)
end()
output = title
except Exception as e:
end()
output = e
print(' {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
if title and title.strip():
link['title'] = title
output = title
return {
'cmd': 'fetch_page_title("{}")'.format(link['url']),
'output': output,
}
@attach_result_to_link('media')
def fetch_media(link_dir, link, timeout=MEDIA_TIMEOUT, overwrite=False):
"""Download playlists or individual video, audio, and subtitles using youtube-dl"""
output = 'media'
output_path = os.path.join(link_dir, 'media')
if os.path.exists(output_path) and not overwrite:
return {'output': output, 'status': 'skipped'}
os.makedirs(output_path, exist_ok=True)
CMD = [
YOUTUBEDL_BINARY,
'--write-description',
'--write-info-json',
'--write-annotations',
'--yes-playlist',
'--write-thumbnail',
'--no-call-home',
'--no-check-certificate',
'--user-agent',
'--all-subs',
'--extract-audio',
'--keep-video',
'--ignore-errors',
'--geo-bypass',
'--audio-format', 'mp3',
'--audio-quality', '320K',
'--embed-thumbnail',
'--add-metadata',
*(() if CHECK_SSL_VALIDITY else ('--no-check-certificate',)),
link['url'],
]
end = progress(timeout, prefix=' ')
try:
result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=output_path, timeout=timeout + 1)
chmod_file(output, cwd=link_dir)
end()
if result.returncode:
if (b'ERROR: Unsupported URL' in result.stderr
or b'HTTP Error 404' in result.stderr
or b'HTTP Error 403' in result.stderr
or b'URL could be a direct video link' in result.stderr
or b'Unable to extract container ID' in result.stderr):
# These happen too frequently on non-media pages to warrant printing to console
pass
else:
hints = (
'got youtubedl response code {}:'.format(result.returncode),
*result.stderr.decode().split('\n'),
)
raise ArchiveError('Failed to download media', hints)
except Exception as e:
end()
output = e
print_error_hints(cmd=CMD, pwd=link_dir, err=e)
return {
'cmd': CMD,
'output': output,
}
@attach_result_to_link('git')
def fetch_git(link_dir, link, timeout=TIMEOUT):
"""download full site using git"""
url_is_clonable = (
domain(link['url']) in GIT_DOMAINS
or link['url'].endswith('.git')
)
if not url_is_clonable or is_static_file(link['url']):
return {'output': None, 'status': 'skipped'}
output = 'git'
output_path = os.path.join(link_dir, 'git')
if os.path.exists(output_path):
return {'output': output, 'status': 'skipped'}
os.makedirs(output_path, exist_ok=True)
CMD = [
GIT_BINARY,
'clone',
'--mirror',
'--recursive',
*(() if CHECK_SSL_VALIDITY else ('-c', 'http.sslVerify=false')),
without_query(without_fragment(link['url'])),
]
end = progress(timeout, prefix=' ')
try:
result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=output_path, timeout=timeout + 1)
end()
if result.returncode == 128:
# ignore failed re-download when the folder already exists
pass
elif result.returncode > 0:
hints = 'got git response code {}:'.format(result.returncode)
raise ArchiveError('Failed git download', hints)
except Exception as e:
end()
output = e
print_error_hints(cmd=CMD, pwd=link_dir, err=e)
return {
'cmd': CMD,
'output': output,
}
def chrome_headless(binary=CHROME_BINARY, user_data_dir=CHROME_USER_DATA_DIR, headless=CHROME_HEADLESS, sandbox=CHROME_SANDBOX, check_ssl_validity=CHECK_SSL_VALIDITY, user_agent=CHROME_USER_AGENT, resolution=RESOLUTION, timeout=TIMEOUT):
global CACHED_USER_DATA_DIR
user_data_dir = user_data_dir or CACHED_USER_DATA_DIR
cmd_args = [binary]
if headless:
cmd_args += ('--headless',)
if not sandbox:
# don't use GPU or sandbox when running inside docker container
cmd_args += ('--no-sandbox', '--disable-gpu')
if not check_ssl_validity:
cmd_args += ('--disable-web-security', '--ignore-certificate-errors')
if user_agent:
cmd_args += ('--user-agent={}'.format(user_agent),)
if resolution:
cmd_args += ('--window-size={}'.format(RESOLUTION),)
if timeout:
cmd_args += ('--timeout={}'.format((timeout) * 1000),)
# Find chrome user data directory
default_profile_paths = (
'~/.config/chromium',
'~/.config/google-chrome',
'~/.config/google-chrome-beta',
'~/.config/google-chrome-unstable',
'~/Library/Application Support/Chromium',
'~/Library/Application Support/Google/Chrome',
'~/Library/Application Support/Google/Chrome Canary',
'~/AppData/Local/Chromium/User Data',
'~/AppData/Local/Google/Chrome/User Data',
'~/AppData/Local/Google/Chrome SxS/User Data',
)
if user_data_dir:
cmd_args.append('--user-data-dir={}'.format(user_data_dir))
else:
for path in default_profile_paths:
full_path = os.path.expanduser(path)
if os.path.exists(full_path):
CACHED_USER_DATA_DIR = full_path
cmd_args.append('--user-data-dir={}'.format(full_path))
break
return cmd_args
CACHED_USER_DATA_DIR = CHROME_USER_DATA_DIR

View file

@ -12,18 +12,24 @@ except ImportError:
from config import ( from config import (
OUTPUT_DIR, OUTPUT_DIR,
TEMPLATES_DIR, TEMPLATES_DIR,
ANSI,
GIT_SHA, GIT_SHA,
FOOTER_INFO, FOOTER_INFO,
) )
from util import ( from util import (
chmod_file, chmod_file,
derived_link_info, derived_link_info,
pretty_path,
check_link_structure, check_link_structure,
check_links_structure, check_links_structure,
wget_output_path, wget_output_path,
) )
from parse import parse_links
from links import validate_links
from logs import (
log_indexing_started,
log_indexing_finished,
log_parsing_started,
log_parsing_finished,
)
TITLE_LOADING_MSG = 'Not yet archived...' TITLE_LOADING_MSG = 'Not yet archived...'
@ -33,21 +39,40 @@ TITLE_LOADING_MSG = 'Not yet archived...'
def write_links_index(out_dir, links, finished=False): def write_links_index(out_dir, links, finished=False):
"""create index.html file for a given list of links""" """create index.html file for a given list of links"""
log_indexing_started()
check_links_structure(links) check_links_structure(links)
if not os.path.exists(out_dir):
os.makedirs(out_dir)
print('{green}[*] [{}] Saving main index files...{reset}'.format(
datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
**ANSI,
))
write_json_links_index(out_dir, links) write_json_links_index(out_dir, links)
print(' > {}/index.json'.format(pretty_path(out_dir))) log_indexing_finished(out_dir, 'index.json')
write_html_links_index(out_dir, links, finished=finished) write_html_links_index(out_dir, links, finished=finished)
print(' > {}/index.html'.format(pretty_path(out_dir))) log_indexing_finished(out_dir, 'index.html')
def load_links_index(out_dir=OUTPUT_DIR, import_path=None):
"""parse and load existing index with any new links from import_path merged in"""
existing_links = []
if out_dir:
existing_links = parse_json_links_index(out_dir)
check_links_structure(existing_links)
new_links = []
if import_path:
# parse and validate the import file
log_parsing_started(import_path)
raw_links, parser_name = parse_links(import_path)
new_links = validate_links(raw_links)
check_links_structure(new_links)
# merge existing links in out_dir and new links
all_links = validate_links(existing_links + new_links)
check_links_structure(all_links)
num_new_links = len(all_links) - len(existing_links)
if import_path and parser_name:
log_parsing_finished(num_new_links, parser_name)
return all_links, new_links
def write_json_links_index(out_dir, links): def write_json_links_index(out_dir, links):
"""write the json link index to a given path""" """write the json link index to a given path"""
@ -70,8 +95,8 @@ def write_json_links_index(out_dir, links):
chmod_file(path) chmod_file(path)
def parse_json_links_index(out_dir): def parse_json_links_index(out_dir=OUTPUT_DIR):
"""load the index in a given directory and merge it with the given link""" """parse a archive index json file and return the list of links"""
index_path = os.path.join(out_dir, 'index.json') index_path = os.path.join(out_dir, 'index.json')
if os.path.exists(index_path): if os.path.exists(index_path):
with open(index_path, 'r', encoding='utf-8') as f: with open(index_path, 'r', encoding='utf-8') as f:
@ -136,31 +161,26 @@ def write_html_links_index(out_dir, links, finished=False):
chmod_file(path) chmod_file(path)
def update_main_index(link): def patch_links_index(link, out_dir=OUTPUT_DIR):
"""hack to in-place update one row's info in the generated index html""" """hack to in-place update one row's info in the generated index html"""
title = link['latest']['title'] title = link['latest']['title']
successful = len([entry for entry in link['latest'].values() if entry]) successful = len([entry for entry in link['latest'].values() if entry])
# Patch JSON index # Patch JSON index
json_path = os.path.join(OUTPUT_DIR, 'index.json')
links = parse_json_links_index(OUTPUT_DIR)
changed = False changed = False
for json_link in links: json_file_links = parse_json_links_index(out_dir)
if json_link['url'] == link['url']: for saved_link in json_file_links:
json_link['title'] = title if saved_link['url'] == link['url']:
json_link['latest'] = link['latest'] saved_link['title'] = title
saved_link['latest'] = link['latest']
changed = True changed = True
break break
if changed: if changed:
write_json_links_index(OUTPUT_DIR, links) write_json_links_index(out_dir, json_file_links)
# Patch HTML index # Patch HTML index
html_path = os.path.join(OUTPUT_DIR, 'index.html') html_path = os.path.join(out_dir, 'index.html')
html = open(html_path, 'r').read().split('\n') html = open(html_path, 'r').read().split('\n')
for idx, line in enumerate(html): for idx, line in enumerate(html):
if title and ('<span data-title-for="{}"'.format(link['url']) in line): if title and ('<span data-title-for="{}"'.format(link['url']) in line):
@ -172,6 +192,7 @@ def update_main_index(link):
with open(html_path, 'w') as f: with open(html_path, 'w') as f:
f.write('\n'.join(html)) f.write('\n'.join(html))
### Individual link index ### Individual link index
def write_link_index(out_dir, link): def write_link_index(out_dir, link):
@ -202,6 +223,18 @@ def parse_json_link_index(out_dir):
return link_json return link_json
return {} return {}
def load_json_link_index(out_dir, link):
"""check for an existing link archive in the given directory,
and load+merge it into the given link dict
"""
link = {
**parse_json_link_index(out_dir),
**link,
}
check_link_structure(link)
return link
def write_html_link_index(out_dir, link): def write_html_link_index(out_dir, link):
check_link_structure(link) check_link_structure(link)
with open(os.path.join(TEMPLATES_DIR, 'link_index.html'), 'r', encoding='utf-8') as f: with open(os.path.join(TEMPLATES_DIR, 'link_index.html'), 'r', encoding='utf-8') as f:
@ -224,7 +257,10 @@ def write_html_link_index(out_dir, link):
wget_output_path(link) wget_output_path(link)
or (link['domain'] if link['is_archived'] else 'about:blank') or (link['domain'] if link['is_archived'] else 'about:blank')
), ),
'extension': link['extension'] or 'HTML', 'extension': link['extension'] or 'html',
'tags': link['tags'].strip() or 'untagged',
'status': 'Archived' if link['is_archived'] else 'Not yet archived',
'status_color': 'success' if link['is_archived'] else 'danger',
})) }))
chmod_file(path) chmod_file(path)

archivebox/logs.py (new file, 161 lines)
View file

@ -0,0 +1,161 @@
import sys
from datetime import datetime
from config import ANSI, REPO_DIR, OUTPUT_DIR
# globals are bad, mmkay
_LAST_RUN_STATS = {
'skipped': 0,
'succeded': 0,
'failed': 0,
'parsing_start_ts': 0,
'parsing_end_ts': 0,
'indexing_start_ts': 0,
'indexing_end_ts': 0,
'archiving_start_ts': 0,
'archiving_end_ts': 0,
'links': {},
}
def pretty_path(path):
"""convert paths like .../ArchiveBox/archivebox/../output/abc into output/abc"""
return path.replace(REPO_DIR + '/', '')
def log_link_archiving_started(link_dir, link, is_new):
print('[{symbol_color}{symbol}{reset}] [{now}] "{title}"\n {blue}{url}{reset}'.format(
symbol='+' if is_new else '*',
symbol_color=ANSI['green' if is_new else 'black'],
now=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
**{**link, 'title': link['title'] or link['url']},
**ANSI,
))
print(' > {}{}'.format(pretty_path(link_dir), ' (new)' if is_new else ''))
def log_link_archiving_failed(cmd, pwd, err=None, hints=None, prefix=' '):
"""quote the argument with whitespace in a command so the user can
copy-paste the outputted string directly to run the cmd
"""
# Prettify CMD string and make it safe to copy-paste by quoting arguments
quoted_cmd = ' '.join(
'"{}"'.format(arg) if ' ' in arg else arg
for arg in cmd
)
# Prettify error output hints string and limit to five lines
hints = hints or getattr(err, 'hints', None)
if hints:
hints = hints if isinstance(hints, (list, tuple)) else hints.split('\n')
hints = (
' {}{}{}'.format(ANSI['lightyellow'], line.strip(), ANSI['reset'])
for line in hints[:5] if line.strip()
)
else:
hints = ()
output_lines = [
'{}Failed: {} {}{}'.format(ANSI['red'], err.__class__.__name__, err, ANSI['reset']),
*hints,
'Run to see full output:',
' cd {};'.format(pwd),
' {}'.format(quoted_cmd),
]
return '\n'.join(
'{}{}'.format(prefix, line)
for line in output_lines
if line
)
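For illustration, a hedged sketch of calling the helper above with made-up cmd, pwd, and error values: the one argument containing whitespace comes back double-quoted, so the suggested command can be copy-pasted and re-run verbatim.
# hypothetical values, for illustration only
message = log_link_archiving_failed(
    cmd=['wget', '--user-agent=Mozilla/5.0 (X11)', 'https://example.com'],
    pwd='output/archive/1544212312',
    err=Exception('non-zero exit status 8'),
)
# the argument containing a space is quoted, so the suggested re-run command is:
#     wget "--user-agent=Mozilla/5.0 (X11)" https://example.com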
### Logging Helpers
def log_parsing_started(source_file):
start_ts = datetime.now()
_LAST_RUN_STATS['parsing_start_ts'] = start_ts
print('{green}[*] [{}] Parsing new links from output/sources/{}...{reset}'.format(
start_ts.strftime('%Y-%m-%d %H:%M:%S'),
source_file.rsplit('/', 1)[-1],
**ANSI,
))
def log_parsing_finished(num_new_links, parser_name):
print(' > Adding {} new links to index (parsed import as {})'.format(
num_new_links,
parser_name,
))
def log_indexing_started():
start_ts = datetime.now()
_LAST_RUN_STATS['indexing_start_ts'] = start_ts
print('{green}[*] [{}] Saving main index files...{reset}'.format(
start_ts.strftime('%Y-%m-%d %H:%M:%S'),
**ANSI,
))
def log_indexing_finished(out_dir, out_file):
end_ts = datetime.now()
_LAST_RUN_STATS['indexing_end_ts'] = end_ts
print(' > {}/{}'.format(pretty_path(out_dir), out_file))
def log_archiving_started(num_links, resume):
start_ts = datetime.now()
_LAST_RUN_STATS['archiving_start_ts'] = start_ts
if resume:
print('{green}[▶] [{}] Resuming archive updating for {} pages starting from {}...{reset}'.format(
start_ts.strftime('%Y-%m-%d %H:%M:%S'),
num_links,
resume,
**ANSI,
))
else:
print('{green}[▶] [{}] Updating content for {} pages in archive...{reset}'.format(
start_ts.strftime('%Y-%m-%d %H:%M:%S'),
num_links,
**ANSI,
))
def log_archiving_paused(num_links, idx, timestamp):
end_ts = datetime.now()
_LAST_RUN_STATS['archiving_end_ts'] = end_ts
print()
print('\n{lightyellow}[X] [{now}] Downloading paused on link {timestamp} ({idx}/{total}){reset}'.format(
**ANSI,
now=end_ts.strftime('%Y-%m-%d %H:%M:%S'),
idx=idx+1,
timestamp=timestamp,
total=num_links,
))
print(' To view your archive, open: {}/index.html'.format(OUTPUT_DIR.replace(REPO_DIR + '/', '')))
print(' Continue where you left off by running:')
print(' {} {}'.format(
pretty_path(sys.argv[0]),
timestamp,
))
def log_archiving_finished(num_links):
end_ts = datetime.now()
_LAST_RUN_STATS['archiving_end_ts'] = end_ts
seconds = end_ts.timestamp() - _LAST_RUN_STATS['archiving_start_ts'].timestamp()
if seconds > 60:
duration = '{0:.2f} min'.format(seconds / 60, 2)
else:
duration = '{0:.2f} sec'.format(seconds, 2)
print('{}[√] [{}] Update of {} pages complete ({}){}'.format(
ANSI['green'],
end_ts.strftime('%Y-%m-%d %H:%M:%S'),
num_links,
duration,
ANSI['reset'],
))
print(' - {} entries skipped'.format(_LAST_RUN_STATS['skipped']))
print(' - {} entries updated'.format(_LAST_RUN_STATS['succeded']))
print(' - {} errors'.format(_LAST_RUN_STATS['failed']))
print(' To view your archive, open: {}/index.html'.format(OUTPUT_DIR.replace(REPO_DIR + '/', '')))

View file

@ -1,17 +1,19 @@
# coding: utf-8
""" """
Everything related to parsing links from bookmark services. Everything related to parsing links from input sources.
For a list of supported services, see the README.md. For a list of supported services, see the README.md.
For examples of supported files see examples/. For examples of supported import formats see tests/.
Parsed link schema: { Link: {
'url': 'https://example.com/example/?abc=123&xyc=345#lmnop', 'url': 'https://example.com/example/?abc=123&xyc=345#lmnop',
'timestamp': '15442123124234', 'timestamp': '1544212312.4234',
'title': 'Example.com Page Title', 'title': 'Example.com Page Title',
'sources': ['ril_export.html', 'downloads/getpocket.com.txt'],
'tags': 'abc,def', 'tags': 'abc,def',
'sources': [
'output/sources/ril_export.html',
'output/sources/getpocket.com-1523422111.txt',
'output/sources/stdin-234234112312.txt'
]
} }
""" """
@ -19,45 +21,59 @@ import re
import json import json
from datetime import datetime from datetime import datetime
from collections import OrderedDict
import xml.etree.ElementTree as etree import xml.etree.ElementTree as etree
from config import ANSI from config import TIMEOUT
from util import ( from util import (
str_between, str_between,
URL_REGEX, URL_REGEX,
check_url_parsing, check_url_parsing_invariants,
progress,
) )
def parse_links(path): def parse_links(source_file):
"""parse a list of links dictionaries from a bookmark export file""" """parse a list of URLs with their metadata from an
RSS feed, bookmarks export, or text file
check_url_parsing() """
links = [] check_url_parsing_invariants()
with open(path, 'r', encoding='utf-8') as file: PARSERS = (
print('{green}[*] [{}] Parsing new links from output/sources/{}...{reset}'.format( # Specialized parsers
datetime.now().strftime('%Y-%m-%d %H:%M:%S'), ('Pocket HTML', parse_pocket_html_export),
path.rsplit('/', 1)[-1], ('Pinboard RSS', parse_pinboard_rss_export),
**ANSI, ('Shaarli RSS', parse_shaarli_rss_export),
)) ('Medium RSS', parse_medium_rss_export),
# General parsers
('Netscape HTML', parse_netscape_html_export),
('Generic RSS', parse_rss_export),
('Generic JSON', parse_json_export),
for parser_name, parser_func in PARSERS.items(): # Fallback parser
('Plain Text', parse_plain_text_export),
)
end = progress(TIMEOUT * 4, prefix=' ')
with open(source_file, 'r', encoding='utf-8') as file:
for parser_name, parser_func in PARSERS:
try: try:
links += list(parser_func(file)) links = list(parser_func(file))
if links: if links:
break end()
return links, parser_name
except Exception as err: except Exception as err:
# we try each parser one by one, wrong parsers will throw exceptions # Parsers are tried one by one down the list, and the first one
# if unsupported and we accept the first one that passes # that succeeds is used. To see why a certain parser was not used
# uncomment the following line to see why the parser was unsupported for each attempted format # due to error or format incompatibility, uncomment this line:
# print('[!] Parser {} failed: {} {}'.format(parser_name, err.__class__.__name__, err)) # print('[!] Parser {} failed: {} {}'.format(parser_name, err.__class__.__name__, err))
pass pass
return links, parser_name end()
return [], 'Plain Text'
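A quick usage sketch of parse_links as defined above (the source file path here is hypothetical; any text file of URLs saved under output/sources/ would do): parsers are tried in order and the first one that yields links wins, so a bare list of URLs falls through to the 'Plain Text' parser.
# hypothetical import file path, for illustration only
links, parser_name = parse_links('output/sources/stdin-1544212312.txt')
print(parser_name)                            # e.g. 'Plain Text' for a bare list of URLs
print(links[0]['url'], links[0]['timestamp']) # each link follows the schema in the module docstring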
### Import Parser Functions
def parse_pocket_html_export(html_file): def parse_pocket_html_export(html_file):
"""Parse Pocket-format bookmarks export files (produced by getpocket.com/export/)""" """Parse Pocket-format bookmarks export files (produced by getpocket.com/export/)"""
@ -81,40 +97,57 @@ def parse_pocket_html_export(html_file):
'sources': [html_file.name], 'sources': [html_file.name],
} }
def parse_pinboard_json_export(json_file):
def parse_json_export(json_file):
"""Parse JSON-format bookmarks export files (produced by pinboard.in/export/, or wallabag)""" """Parse JSON-format bookmarks export files (produced by pinboard.in/export/, or wallabag)"""
json_file.seek(0) json_file.seek(0)
json_content = json.load(json_file) links = json.load(json_file)
for line in json_content: json_date = lambda s: datetime.strptime(s, '%Y-%m-%dT%H:%M:%S%z')
for link in links:
# example line # example line
# {"href":"http:\/\/www.reddit.com\/r\/example","description":"title here","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"reddit android"}] # {"href":"http:\/\/www.reddit.com\/r\/example","description":"title here","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"reddit android"}]
if line: if link:
erg = line # Parse URL
if erg.get('timestamp'): url = link.get('href') or link.get('url') or link.get('URL')
timestamp = str(erg['timestamp']/10000000) # chrome/ff histories use a very precise timestamp if not url:
elif erg.get('time'): raise Exception('JSON must contain URL in each entry [{"url": "http://...", ...}, ...]')
timestamp = str(datetime.strptime(erg['time'].split(',', 1)[0], '%Y-%m-%dT%H:%M:%SZ').timestamp())
elif erg.get('created_at'):
timestamp = str(datetime.strptime(erg['created_at'], '%Y-%m-%dT%H:%M:%S%z').timestamp())
else:
timestamp = str(datetime.now().timestamp())
if erg.get('href'):
url = erg['href']
else:
url = erg['url']
if erg.get('description'):
title = (erg.get('description') or '').replace(' — Readability', '')
else:
title = erg['title'].strip()
info = { # Parse the timestamp
ts_str = str(datetime.now().timestamp())
if link.get('timestamp'):
# chrome/ff histories use a very precise timestamp
ts_str = str(link['timestamp'] / 10000000)
elif link.get('time'):
ts_str = str(json_date(link['time'].split(',', 1)[0]).timestamp())
elif link.get('created_at'):
ts_str = str(json_date(link['created_at']).timestamp())
elif link.get('created'):
ts_str = str(json_date(link['created']).timestamp())
elif link.get('date'):
ts_str = str(json_date(link['date']).timestamp())
elif link.get('bookmarked'):
ts_str = str(json_date(link['bookmarked']).timestamp())
elif link.get('saved'):
ts_str = str(json_date(link['saved']).timestamp())
# Parse the title
title = None
if link.get('title'):
title = link['title'].strip() or None
elif link.get('description'):
title = link['description'].replace(' — Readability', '').strip() or None
elif link.get('name'):
title = link['name'].strip() or None
yield {
'url': url, 'url': url,
'timestamp': timestamp, 'timestamp': ts_str,
'title': title or None, 'title': title,
'tags': erg.get('tags') or '', 'tags': link.get('tags') or '',
'sources': [json_file.name], 'sources': [json_file.name],
} }
yield info
def parse_rss_export(rss_file): def parse_rss_export(rss_file):
@ -139,15 +172,15 @@ def parse_rss_export(rss_file):
def get_row(key): def get_row(key):
return [r for r in rows if r.strip().startswith('<{}>'.format(key))][0] return [r for r in rows if r.strip().startswith('<{}>'.format(key))][0]
title = str_between(get_row('title'), '<![CDATA[', ']]').strip()
url = str_between(get_row('link'), '<link>', '</link>') url = str_between(get_row('link'), '<link>', '</link>')
ts_str = str_between(get_row('pubDate'), '<pubDate>', '</pubDate>') ts_str = str_between(get_row('pubDate'), '<pubDate>', '</pubDate>')
time = datetime.strptime(ts_str, "%a, %d %b %Y %H:%M:%S %z") time = datetime.strptime(ts_str, "%a, %d %b %Y %H:%M:%S %z")
title = str_between(get_row('title'), '<![CDATA[', ']]').strip() or None
yield { yield {
'url': url, 'url': url,
'timestamp': str(time.timestamp()), 'timestamp': str(time.timestamp()),
'title': title or None, 'title': title,
'tags': '', 'tags': '',
'sources': [rss_file.name], 'sources': [rss_file.name],
} }
@ -224,9 +257,6 @@ def parse_pinboard_rss_export(rss_file):
tags = item.find("{http://purl.org/dc/elements/1.1/}subject").text if item.find("{http://purl.org/dc/elements/1.1/}subject") else None tags = item.find("{http://purl.org/dc/elements/1.1/}subject").text if item.find("{http://purl.org/dc/elements/1.1/}subject") else None
title = item.find("{http://purl.org/rss/1.0/}title").text.strip() if item.find("{http://purl.org/rss/1.0/}title").text.strip() else None title = item.find("{http://purl.org/rss/1.0/}title").text.strip() if item.find("{http://purl.org/rss/1.0/}title").text.strip() else None
ts_str = item.find("{http://purl.org/dc/elements/1.1/}date").text if item.find("{http://purl.org/dc/elements/1.1/}date").text else None ts_str = item.find("{http://purl.org/dc/elements/1.1/}date").text if item.find("{http://purl.org/dc/elements/1.1/}date").text else None
# = 🌈🌈🌈🌈
# = 🌈🌈🌈🌈
# = 🏆🏆🏆🏆
# Pinboard includes a colon in its date stamp timezone offsets, which # Pinboard includes a colon in its date stamp timezone offsets, which
# Python can't parse. Remove it: # Python can't parse. Remove it:
@ -254,8 +284,6 @@ def parse_medium_rss_export(rss_file):
root = etree.parse(rss_file).getroot() root = etree.parse(rss_file).getroot()
items = root.find("channel").findall("item") items = root.find("channel").findall("item")
for item in items: for item in items:
# for child in item:
# print(child.tag, child.text)
url = item.find("link").text url = item.find("link").text
title = item.find("title").text.strip() title = item.find("title").text.strip()
ts_str = item.find("pubDate").text ts_str = item.find("pubDate").text
@ -274,31 +302,13 @@ def parse_plain_text_export(text_file):
"""Parse raw links from each line in a text file""" """Parse raw links from each line in a text file"""
text_file.seek(0) text_file.seek(0)
text_content = text_file.readlines() for line in text_file.readlines():
for line in text_content: urls = re.findall(URL_REGEX, line) if line.strip() else ()
if line: for url in urls:
urls = re.findall(URL_REGEX, line) yield {
'url': url,
for url in urls: 'timestamp': str(datetime.now().timestamp()),
url = url.strip() 'title': None,
time = datetime.now() 'tags': '',
'sources': [text_file.name],
yield { }
'url': url,
'timestamp': str(time.timestamp()),
'title': None,
'tags': '',
'sources': [text_file.name],
}
PARSERS = OrderedDict([
('Pocket HTML', parse_pocket_html_export),
('Pinboard JSON', parse_pinboard_json_export),
('Netscape HTML', parse_netscape_html_export),
('RSS', parse_rss_export),
('Pinboard RSS', parse_pinboard_rss_export),
('Shaarli RSS', parse_shaarli_rss_export),
('Medium RSS', parse_medium_rss_export),
('Plain Text', parse_plain_text_export),
])

View file

@ -1,10 +1,64 @@
"""
Patches, additions, and shortcuts for Python standard library functions.
"""
### subprocess
from subprocess import (
Popen,
PIPE,
DEVNULL,
CompletedProcess,
TimeoutExpired,
CalledProcessError,
)
def run(*popenargs, input=None, capture_output=False, timeout=None, check=False, **kwargs):
"""Patched of subprocess.run to fix blocking io making timeout=innefective"""
if input is not None:
if 'stdin' in kwargs:
raise ValueError('stdin and input arguments may not both be used.')
kwargs['stdin'] = PIPE
if capture_output:
if ('stdout' in kwargs) or ('stderr' in kwargs):
raise ValueError('stdout and stderr arguments may not be used '
'with capture_output.')
kwargs['stdout'] = PIPE
kwargs['stderr'] = PIPE
with Popen(*popenargs, **kwargs) as process:
try:
stdout, stderr = process.communicate(input, timeout=timeout)
except TimeoutExpired:
process.kill()
try:
stdout, stderr = process.communicate(input, timeout=2)
except:
pass
raise TimeoutExpired(popenargs[0][0], timeout)
except BaseException as err:
process.kill()
# We don't call process.wait() as .__exit__ does that for us.
raise
retcode = process.poll()
if check and retcode:
raise CalledProcessError(retcode, process.args,
output=stdout, stderr=stderr)
return CompletedProcess(process.args, retcode, stdout, stderr)
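A small sketch of the patched run() in use (it assumes a POSIX sleep binary is on PATH): a process that outlives its timeout is killed and TimeoutExpired is raised, instead of hanging on blocked pipe reads.
try:
    # sleeps longer than the timeout, so the patched run() kills it after 2 seconds
    run(['sleep', '10'], stdout=PIPE, stderr=PIPE, timeout=2)
except TimeoutExpired:
    print('process was killed after the 2 second timeout')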
### collections
from sys import maxsize from sys import maxsize
from itertools import islice from itertools import islice
from collections import deque from collections import deque
_marker = object() _marker = object()
class Peekable(object): class PeekableGenerator:
"""Peekable version of a normal python generator. """Peekable version of a normal python generator.
Useful when you don't want to evaluate the entire iterable to look at Useful when you don't want to evaluate the entire iterable to look at
a specific item at a given idx. a specific item at a given idx.
@ -74,8 +128,6 @@ class Peekable(object):
return next(self._it) return next(self._it)
next = __next__ # For Python 2 compatibility
def _get_slice(self, index): def _get_slice(self, index):
# Normalize the slice's arguments # Normalize the slice's arguments
step = 1 if (index.step is None) else index.step step = 1 if (index.step is None) else index.step
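A hedged usage sketch of PeekableGenerator, mirroring how the old archive.py wrapped its links generator: the constructor is assumed to take any iterable, and peek(0) is assumed to return the next item without consuming it, matching the old to_archive.peek(0) call and the class docstring.
links_iter = PeekableGenerator(iter([{'timestamp': '1544212312.0'}, {'timestamp': '1544212313.0'}]))
first_link = links_iter.peek(0)   # look at the next pending item without advancing the iterator
remaining = list(links_iter)      # iteration still starts from first_link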

View file

@ -192,22 +192,27 @@
Bookmarked: <small title="Timestamp: $timestamp">$bookmarked_date</small> Bookmarked: <small title="Timestamp: $timestamp">$bookmarked_date</small>
&nbsp; | &nbsp; &nbsp; | &nbsp;
Last updated: <small title="Timestamp: $updated">$updated_date</small> Last updated: <small title="Timestamp: $updated">$updated_date</small>
&nbsp; | &nbsp;
Total files: <small title="Archive methods">🗃 $num_outputs</small>
</div> </div>
<div class="col-lg-4 alert well"> <div class="col-lg-4 alert well">
Type: Type:
<span class="badge badge-default">$extension</span> <span class="badge badge-default">$extension</span>
&nbsp; | &nbsp; &nbsp; | &nbsp;
Tags: Tags:
<span class="badge badge-success">$tags</span> <span class="badge badge-warning">$tags</span>
&nbsp; | &nbsp;
Status:
<span class="badge badge-$status_color">$status</span>
</div> </div>
<div class="col-lg-4 alert well"> <div class="col-lg-4 alert well">
Download: Archive Methods:
<a href="index.json" title="JSON summary of archived link.">JSON</a> | <a href="index.json" title="JSON summary of archived link.">JSON</a> |
<a href="warc/" title="Any WARC archives for the page">WARC</a> | <a href="warc/" title="Any WARC archives for the page">WARC</a> |
<a href="media/" title="Audio, Video, and Subtitle files.">Media</a> | <a href="media/" title="Audio, Video, and Subtitle files.">Media</a> |
<a href="git/" title="Any git repos at the url">Git Repos</a> | <a href="git/" title="Any git repos at the url">Git Repos</a> |
<a href="favicon.ico" title="Any git repos at the url">Favicon</a> | <a href="favicon.ico" title="Any git repos at the url">Favicon</a> |
<a href="." title="Webserver-provided index of files directory.">More files...</a> <a href="." title="Webserver-provided index of files directory.">See all files...</a>
</div> </div>
<hr/> <hr/>
<div class="col-lg-2"> <div class="col-lg-2">

View file

@ -8,8 +8,8 @@ from urllib.parse import urlparse, quote
from decimal import Decimal from decimal import Decimal
from datetime import datetime from datetime import datetime
from multiprocessing import Process from multiprocessing import Process
from subprocess import TimeoutExpired, Popen, PIPE, DEVNULL, CompletedProcess, CalledProcessError
from stdlib_patches import run, PIPE, DEVNULL
from config import ( from config import (
ANSI, ANSI,
TERM_WIDTH, TERM_WIDTH,
@ -19,8 +19,6 @@ from config import (
OUTPUT_PERMISSIONS, OUTPUT_PERMISSIONS,
TIMEOUT, TIMEOUT,
SHOW_PROGRESS, SHOW_PROGRESS,
CHECK_SSL_VALIDITY,
WGET_USER_AGENT,
CURL_BINARY, CURL_BINARY,
WGET_BINARY, WGET_BINARY,
CHROME_BINARY, CHROME_BINARY,
@ -37,6 +35,13 @@ from config import (
FETCH_MEDIA, FETCH_MEDIA,
SUBMIT_ARCHIVE_DOT_ORG, SUBMIT_ARCHIVE_DOT_ORG,
ARCHIVE_DIR_NAME, ARCHIVE_DIR_NAME,
RESOLUTION,
CHECK_SSL_VALIDITY,
WGET_USER_AGENT,
CHROME_USER_AGENT,
CHROME_USER_DATA_DIR,
CHROME_HEADLESS,
CHROME_SANDBOX,
) )
### Parsing Helpers ### Parsing Helpers
@@ -56,6 +61,7 @@ extension = lambda url: basename(url).rsplit('.', 1)[-1].lower() if '.' in basen
base_url = lambda url: without_scheme(url) # uniq base url used to dedupe links base_url = lambda url: without_scheme(url) # uniq base url used to dedupe links
short_ts = lambda ts: ts.split('.')[0] short_ts = lambda ts: ts.split('.')[0]
urlencode = lambda s: quote(s, encoding='utf-8', errors='replace')
URL_REGEX = re.compile( URL_REGEX = re.compile(
r'http[s]?://' # start matching from allowed schemes r'http[s]?://' # start matching from allowed schemes
@@ -109,66 +115,74 @@ def check_links_structure(links):
def check_dependencies(): def check_dependencies():
"""Check that all necessary dependencies are installed, and have valid versions""" """Check that all necessary dependencies are installed, and have valid versions"""
python_vers = float('{}.{}'.format(sys.version_info.major, sys.version_info.minor)) try:
if python_vers < 3.5: python_vers = float('{}.{}'.format(sys.version_info.major, sys.version_info.minor))
print('{}[X] Python version is not new enough: {} (>3.5 is required){}'.format(ANSI['red'], python_vers, ANSI['reset'])) if python_vers < 3.5:
print(' See https://github.com/pirate/ArchiveBox#troubleshooting for help upgrading your Python installation.') print('{}[X] Python version is not new enough: {} (>3.5 is required){}'.format(ANSI['red'], python_vers, ANSI['reset']))
print(' See https://github.com/pirate/ArchiveBox/wiki/Troubleshooting#python for help upgrading your Python installation.')
raise SystemExit(1)
if FETCH_FAVICON or SUBMIT_ARCHIVE_DOT_ORG:
if run(['which', CURL_BINARY], stdout=DEVNULL, stderr=DEVNULL).returncode or run([CURL_BINARY, '--version'], stdout=DEVNULL, stderr=DEVNULL).returncode:
print('{red}[X] Missing dependency: curl{reset}'.format(**ANSI))
print(' Install it, then confirm it works with: {} --version'.format(CURL_BINARY))
print(' See https://github.com/pirate/ArchiveBox/wiki/Install for help.')
raise SystemExit(1)
if FETCH_WGET or FETCH_WARC:
if run(['which', WGET_BINARY], stdout=DEVNULL, stderr=DEVNULL).returncode or run([WGET_BINARY, '--version'], stdout=DEVNULL, stderr=DEVNULL).returncode:
print('{red}[X] Missing dependency: wget{reset}'.format(**ANSI))
print(' Install it, then confirm it works with: {} --version'.format(WGET_BINARY))
print(' See https://github.com/pirate/ArchiveBox/wiki/Install for help.')
raise SystemExit(1)
if FETCH_PDF or FETCH_SCREENSHOT or FETCH_DOM:
if run(['which', CHROME_BINARY], stdout=DEVNULL, stderr=DEVNULL).returncode:
print('{}[X] Missing dependency: {}{}'.format(ANSI['red'], CHROME_BINARY, ANSI['reset']))
print(' Install it, then confirm it works with: {} --version'.format(CHROME_BINARY))
print(' See https://github.com/pirate/ArchiveBox/wiki/Install for help.')
raise SystemExit(1)
# parse chrome --version e.g. Google Chrome 61.0.3114.0 canary / Chromium 59.0.3029.110 built on Ubuntu, running on Ubuntu 16.04
try:
result = run([CHROME_BINARY, '--version'], stdout=PIPE)
version_str = result.stdout.decode('utf-8')
version_lines = re.sub("(Google Chrome|Chromium) (\\d+?)\\.(\\d+?)\\.(\\d+?).*?$", "\\2", version_str).split('\n')
version = [l for l in version_lines if l.isdigit()][-1]
if int(version) < 59:
print(version_lines)
print('{red}[X] Chrome version must be 59 or greater for headless PDF, screenshot, and DOM saving{reset}'.format(**ANSI))
print(' See https://github.com/pirate/ArchiveBox/wiki/Install for help.')
raise SystemExit(1)
except (IndexError, TypeError, OSError):
print('{red}[X] Failed to parse Chrome version, is it installed properly?{reset}'.format(**ANSI))
print(' Install it, then confirm it works with: {} --version'.format(CHROME_BINARY))
print(' See https://github.com/pirate/ArchiveBox/wiki/Install for help.')
raise SystemExit(1)
if FETCH_GIT:
if run(['which', GIT_BINARY], stdout=DEVNULL, stderr=DEVNULL).returncode or run([GIT_BINARY, '--version'], stdout=DEVNULL, stderr=DEVNULL).returncode:
print('{red}[X] Missing dependency: git{reset}'.format(**ANSI))
print(' Install it, then confirm it works with: {} --version'.format(GIT_BINARY))
print(' See https://github.com/pirate/ArchiveBox/wiki/Install for help.')
raise SystemExit(1)
if FETCH_MEDIA:
if run(['which', YOUTUBEDL_BINARY], stdout=DEVNULL, stderr=DEVNULL).returncode or run([YOUTUBEDL_BINARY, '--version'], stdout=DEVNULL, stderr=DEVNULL).returncode:
print('{red}[X] Missing dependency: youtube-dl{reset}'.format(**ANSI))
print(' Install it, then confirm it was installed with: {} --version'.format(YOUTUBEDL_BINARY))
print(' See https://github.com/pirate/ArchiveBox/wiki/Install for help.')
raise SystemExit(1)
except (KeyboardInterrupt, Exception):
raise SystemExit(1) raise SystemExit(1)
if FETCH_FAVICON or SUBMIT_ARCHIVE_DOT_ORG: def check_url_parsing_invariants():
if run(['which', CURL_BINARY], stdout=DEVNULL).returncode or run([CURL_BINARY, '--version'], stdout=DEVNULL).returncode:
print('{red}[X] Missing dependency: curl{reset}'.format(**ANSI))
print(' Run ./setup.sh, then confirm it was installed with: {} --version'.format(CURL_BINARY))
print(' See https://github.com/pirate/ArchiveBox for help.')
raise SystemExit(1)
if FETCH_WGET or FETCH_WARC:
if run(['which', WGET_BINARY], stdout=DEVNULL).returncode or run([WGET_BINARY, '--version'], stdout=DEVNULL).returncode:
print('{red}[X] Missing dependency: wget{reset}'.format(**ANSI))
print(' Run ./setup.sh, then confirm it was installed with: {} --version'.format(WGET_BINARY))
print(' See https://github.com/pirate/ArchiveBox for help.')
raise SystemExit(1)
if FETCH_PDF or FETCH_SCREENSHOT or FETCH_DOM:
if run(['which', CHROME_BINARY], stdout=DEVNULL).returncode:
print('{}[X] Missing dependency: {}{}'.format(ANSI['red'], CHROME_BINARY, ANSI['reset']))
print(' Run ./setup.sh, then confirm it was installed with: {} --version'.format(CHROME_BINARY))
print(' See https://github.com/pirate/ArchiveBox for help.')
raise SystemExit(1)
# parse chrome --version e.g. Google Chrome 61.0.3114.0 canary / Chromium 59.0.3029.110 built on Ubuntu, running on Ubuntu 16.04
try:
result = run([CHROME_BINARY, '--version'], stdout=PIPE)
version_str = result.stdout.decode('utf-8')
version_lines = re.sub("(Google Chrome|Chromium) (\\d+?)\\.(\\d+?)\\.(\\d+?).*?$", "\\2", version_str).split('\n')
version = [l for l in version_lines if l.isdigit()][-1]
if int(version) < 59:
print(version_lines)
print('{red}[X] Chrome version must be 59 or greater for headless PDF, screenshot, and DOM saving{reset}'.format(**ANSI))
print(' See https://github.com/pirate/ArchiveBox for help.')
raise SystemExit(1)
except (IndexError, TypeError, OSError):
print('{red}[X] Failed to parse Chrome version, is it installed properly?{reset}'.format(**ANSI))
print(' Run ./setup.sh, then confirm it was installed with: {} --version'.format(CHROME_BINARY))
print(' See https://github.com/pirate/ArchiveBox for help.')
raise SystemExit(1)
if FETCH_GIT:
if run(['which', GIT_BINARY], stdout=DEVNULL).returncode or run([GIT_BINARY, '--version'], stdout=DEVNULL).returncode:
print('{red}[X] Missing dependency: git{reset}'.format(**ANSI))
print(' Run ./setup.sh, then confirm it was installed with: {} --version'.format(GIT_BINARY))
print(' See https://github.com/pirate/ArchiveBox for help.')
raise SystemExit(1)
if FETCH_MEDIA:
if run(['which', YOUTUBEDL_BINARY], stdout=DEVNULL).returncode or run([YOUTUBEDL_BINARY, '--version'], stdout=DEVNULL).returncode:
print('{red}[X] Missing dependency: youtube-dl{reset}'.format(**ANSI))
print(' Run ./setup.sh, then confirm it was installed with: {} --version'.format(YOUTUBEDL_BINARY))
print(' See https://github.com/pirate/ArchiveBox for help.')
raise SystemExit(1)
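Every check above repeats the same pattern per dependency: confirm the binary is on the PATH with which, then confirm it actually runs with --version. A condensed sketch of that pattern as a standalone helper (the helper name is illustrative, not part of this codebase):

from subprocess import run, DEVNULL   # the stdlib run is enough for this sketch

def binary_is_usable(binary):
    """True if `binary` is on the PATH and `binary --version` exits with code 0."""
    if run(['which', binary], stdout=DEVNULL, stderr=DEVNULL).returncode != 0:
        return False
    return run([binary, '--version'], stdout=DEVNULL, stderr=DEVNULL).returncode == 0

# e.g. binary_is_usable('wget') should be True on a machine where FETCH_WGET can work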
def check_url_parsing():
"""Check that plain text regex URL parsing works as expected""" """Check that plain text regex URL parsing works as expected"""
# this is the last line of defense to make sure the URL_REGEX isn't
# misbehaving, as the consequences could be disastrous and lead to many
# incorrect/badly parsed links being added to the archive
test_urls = ''' test_urls = '''
https://example1.com/what/is/happening.html?what=1#how-about-this=1 https://example1.com/what/is/happening.html?what=1#how-about-this=1
https://example2.com/what/is/happening/?what=1#how-about-this=1 https://example2.com/what/is/happening/?what=1#how-about-this=1
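A rough sketch of the invariant this check enforces: every URL in a known plain-text sample must be matched exactly once. The simplified regex below is a stand-in for illustration, not the URL_REGEX defined in this file:

import re

SAMPLE = '''
https://example1.com/what/is/happening.html?what=1#how-about-this=1
https://example2.com/what/is/happening/?what=1#how-about-this=1
'''
SIMPLIFIED_URL_REGEX = re.compile(r'http[s]?://[^\s<>"]+')
matches = SIMPLIFIED_URL_REGEX.findall(SAMPLE)
assert len(matches) == 2, 'URL parsing is misbehaving: {}'.format(matches)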
@@ -276,22 +290,9 @@ def wget_output_path(link):
if link.get('latest', {}).get('wget'): if link.get('latest', {}).get('wget'):
return link['latest']['wget'] return link['latest']['wget']
urlencode = lambda s: quote(s, encoding='utf-8', errors='replace')
if is_static_file(link['url']): if is_static_file(link['url']):
return urlencode(without_scheme(without_fragment(link['url']))) return urlencode(without_scheme(without_fragment(link['url'])))
# Since the wget algorithm for -E (appending .html) is incredibly complex
# instead of trying to emulate it here, we just look in the output folder
# to see what html file wget actually created as the output
link_dir = os.path.join(ARCHIVE_DIR, link['timestamp'])
full_path = without_fragment(without_query(path(link['url']))).strip('/')
search_dir = os.path.join(
link_dir,
domain(link['url']),
full_path,
)
# Wget downloads can save in a number of different ways depending on the url # Wget downloads can save in a number of different ways depending on the url
# https://example.com # https://example.com
# > output/archive/<timestamp>/example.com/index.html # > output/archive/<timestamp>/example.com/index.html
@@ -304,6 +305,19 @@ def wget_output_path(link):
# There's also lots of complexity around how the urlencoding and renaming # There's also lots of complexity around how the urlencoding and renaming
# is done for pages with query and hash fragments or extensions like shtml / htm # is done for pages with query and hash fragments or extensions like shtml / htm
# Since the wget algorithm for -E (appending .html) is incredibly complex
# and there's no way to get the computed output path from wget
# in order to avoid having to reverse-engineer how they calculate it,
# we just look in the output folder and read the filename wget used from the filesystem
link_dir = os.path.join(ARCHIVE_DIR, link['timestamp'])
full_path = without_fragment(without_query(path(link['url']))).strip('/')
search_dir = os.path.join(
link_dir,
domain(link['url']),
full_path,
)
for _ in range(4): for _ in range(4):
if os.path.exists(search_dir): if os.path.exists(search_dir):
if os.path.isdir(search_dir): if os.path.isdir(search_dir):
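A sketch of the read-what-wget-actually-wrote strategy the comments above describe: rather than predicting wget's -E naming, scan the candidate output directory for an html file (the helper name and return convention are illustrative, not from this codebase):

import os

def find_wget_html_output(search_dir):
    """Return the first .html file found directly inside search_dir, or None."""
    if not os.path.isdir(search_dir):
        return None
    for fname in sorted(os.listdir(search_dir)):
        if fname.endswith('.html'):
            return os.path.join(search_dir, fname)
    return None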
@@ -356,47 +370,6 @@ def str_between(string, start, end=None):
return content return content
def pretty_path(path):
"""convert paths like .../ArchiveBox/archivebox/../output/abc into output/abc"""
return path.replace(REPO_DIR + '/', '')
def print_error_hints(cmd, pwd, err=None, hints=None, prefix=' '):
"""quote the argument with whitespace in a command so the user can
copy-paste the outputted string directly to run the cmd
"""
# Prettify CMD string and make it safe to copy-paste by quoting arguments
quoted_cmd = ' '.join(
'"{}"'.format(arg) if ' ' in arg else arg
for arg in cmd
)
# Prettify error output hints string and limit to five lines
hints = hints or getattr(err, 'hints', None)
if hints:
hints = hints if isinstance(hints, (list, tuple)) else hints.split('\n')
hints = (
' {}{}{}'.format(ANSI['lightyellow'], line.strip(), ANSI['reset'])
for line in hints[:5] if line.strip()
)
else:
hints = ()
output_lines = [
'{}Failed: {} {}{}'.format(ANSI['red'], err.__class__.__name__, err, ANSI['reset']),
*hints,
'Run to see full output:',
' cd {};'.format(pwd),
' {}'.format(quoted_cmd),
]
return '\n'.join(
'{}{}'.format(prefix, line)
for line in output_lines
if line
)
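The removed helper above quotes arguments by hand before joining them; the standard library's shlex.quote does the same job, shown here only as a comparison sketch rather than anything this codebase uses:

import shlex

def quoted_cmd_str(cmd):
    """Join a command list into a string that is safe to copy-paste into a shell."""
    return ' '.join(shlex.quote(arg) for arg in cmd)

# quoted_cmd_str(['wget', '-e', 'robots=off', 'https://example.com/a page'])
# -> "wget -e robots=off 'https://example.com/a page'"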
### Link Helpers ### Link Helpers
@@ -571,37 +544,59 @@ def chmod_file(path, cwd='.', permissions=OUTPUT_PERMISSIONS, timeout=30):
print(' ', chmod_result.stderr.decode()) print(' ', chmod_result.stderr.decode())
raise Exception('Failed to chmod {}/{}'.format(cwd, path)) raise Exception('Failed to chmod {}/{}'.format(cwd, path))
def run(*popenargs, input=None, capture_output=False, timeout=None, check=False, **kwargs):
"""Patched of subprocess.run to fix blocking io making timeout=innefective"""
if input is not None: CACHED_USER_DATA_DIR = CHROME_USER_DATA_DIR
if 'stdin' in kwargs:
raise ValueError('stdin and input arguments may not both be used.')
kwargs['stdin'] = PIPE
if capture_output: def chrome_args(binary=CHROME_BINARY, user_data_dir=CHROME_USER_DATA_DIR,
if ('stdout' in kwargs) or ('stderr' in kwargs): headless=CHROME_HEADLESS, sandbox=CHROME_SANDBOX,
raise ValueError('stdout and stderr arguments may not be used ' check_ssl_validity=CHECK_SSL_VALIDITY, user_agent=CHROME_USER_AGENT,
'with capture_output.') resolution=RESOLUTION, timeout=TIMEOUT):
kwargs['stdout'] = PIPE """helper to build up a chrome shell command with arguments"""
kwargs['stderr'] = PIPE
with Popen(*popenargs, **kwargs) as process: global CACHED_USER_DATA_DIR
try: user_data_dir = user_data_dir or CACHED_USER_DATA_DIR
stdout, stderr = process.communicate(input, timeout=timeout) cmd_args = [binary]
except TimeoutExpired:
process.kill() if headless:
try: cmd_args += ('--headless',)
stdout, stderr = process.communicate(input, timeout=2)
except: if not sandbox:
pass # don't use GPU or sandbox when running inside docker container
raise TimeoutExpired(popenargs[0][0], timeout) cmd_args += ('--no-sandbox', '--disable-gpu')
except BaseException as err:
process.kill() if not check_ssl_validity:
# We don't call process.wait() as .__exit__ does that for us. cmd_args += ('--disable-web-security', '--ignore-certificate-errors')
raise
retcode = process.poll() if user_agent:
if check and retcode: cmd_args += ('--user-agent={}'.format(user_agent),)
raise CalledProcessError(retcode, process.args,
output=stdout, stderr=stderr) if resolution:
return CompletedProcess(process.args, retcode, stdout, stderr) cmd_args += ('--window-size={}'.format(resolution),)
if timeout:
cmd_args += ('--timeout={}'.format((timeout) * 1000),)
# Find chrome user data directory
default_profile_paths = (
'~/.config/chromium',
'~/.config/google-chrome',
'~/.config/google-chrome-beta',
'~/.config/google-chrome-unstable',
'~/Library/Application Support/Chromium',
'~/Library/Application Support/Google/Chrome',
'~/Library/Application Support/Google/Chrome Canary',
'~/AppData/Local/Chromium/User Data',
'~/AppData/Local/Google/Chrome/User Data',
'~/AppData/Local/Google/Chrome SxS/User Data',
)
if user_data_dir:
cmd_args.append('--user-data-dir={}'.format(user_data_dir))
else:
for path in default_profile_paths:
full_path = os.path.expanduser(path)
if os.path.exists(full_path):
CACHED_USER_DATA_DIR = full_path
cmd_args.append('--user-data-dir={}'.format(full_path))
break
return cmd_args
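A hypothetical usage of the chrome_args() helper above: build the shared argument list, then append a page-dumping flag and the target url. The --dump-dom flag is a standard headless Chrome option, and run, PIPE, and TIMEOUT are the names imported earlier in this module; treat the snippet as a sketch, not a line from this commit:

# sketch: dump a page's DOM using the shared Chrome argument list built above
cmd = [*chrome_args(), '--dump-dom', 'https://example.com']   # --dump-dom assumed, see note above
result = run(cmd, stdout=PIPE, timeout=TIMEOUT)
dom_html = result.stdout.decode()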