
better link corruption guards, remove title prefetching, save index after run

Nick Sweeting 2019-02-21 17:45:28 -05:00
parent c95632883e
commit b03e9fade8
6 changed files with 165 additions and 93 deletions

archive.py

@@ -7,34 +7,31 @@ import os
 import sys

 from datetime import datetime
-from subprocess import run
+from peekable import Peekable

 from parse import parse_links
-from links import validate_links
-from archive_methods import archive_links, _RESULTS_TOTALS
+from links import validate_links, links_after_timestamp
+from archive_methods import archive_link, _RESULTS_TOTALS
 from index import (
     write_links_index,
-    write_link_index,
     parse_json_links_index,
-    parse_json_link_index,
 )
 from config import (
+    ARCHIVE_DIR,
     ONLY_NEW,
-    OUTPUT_PERMISSIONS,
     OUTPUT_DIR,
     REPO_DIR,
     ANSI,
-    TIMEOUT,
-    SHOW_PROGRESS,
     GIT_SHA,
 )
 from util import (
+    check_dependencies,
     download_url,
     save_source,
-    progress,
-    cleanup_archive,
     pretty_path,
     migrate_data,
+    check_links_structure,
 )

 __AUTHOR__ = 'Nick Sweeting <git@nicksweeting.com>'
@@ -42,6 +39,7 @@ __VERSION__ = GIT_SHA
 __DESCRIPTION__ = 'ArchiveBox Usage: Create a browsable html archive of a list of links.'
 __DOCUMENTATION__ = 'https://github.com/pirate/ArchiveBox/wiki'

+
 def print_help():
     print(__DESCRIPTION__)
     print("Documentation: {}\n".format(__DOCUMENTATION__))
@@ -55,21 +53,22 @@ def print_help():
 def load_links(archive_path=OUTPUT_DIR, import_path=None):
     """get new links from file and optionally append them to links in existing archive"""

     existing_links = []
     if archive_path:
         existing_links = parse_json_links_index(archive_path)
+        check_links_structure(existing_links)

     new_links = []
     if import_path:
         # parse and validate the import file
         raw_links, parser_name = parse_links(import_path)
         new_links = validate_links(raw_links)
-        if SHOW_PROGRESS:
-            print()
+        check_links_structure(new_links)

     # merge existing links in archive_path and new links
     all_links = validate_links(existing_links + new_links)
+    check_links_structure(all_links)
     num_new_links = len(all_links) - len(existing_links)

     if import_path and parser_name:
@@ -81,6 +80,7 @@ def load_links(archive_path=OUTPUT_DIR, import_path=None):
     return all_links, new_links

+
 def update_archive(archive_path, links, source=None, resume=None, append=True):
     """update or create index.html+json given a path to an export file containing new links"""
@@ -99,8 +99,38 @@ def update_archive(archive_path, links, source=None, resume=None, append=True):
         **ANSI,
     ))

+    check_links_structure(links)
+
+    # prefetch the first link off the generator so that if we pause or fail
+    # immediately we can show that we paused on the first link and not just None
+    to_archive = Peekable(links_after_timestamp(links, resume))
+    idx, link = 0, to_archive.peek(0)
+
     # loop over links and archive them
-    archive_links(archive_path, links, source=source, resume=resume)
+    try:
+        check_dependencies()
+        for idx, link in enumerate(to_archive):
+            link_dir = os.path.join(ARCHIVE_DIR, link['timestamp'])
+            archive_link(link_dir, link)
+
+    except (KeyboardInterrupt, SystemExit, Exception) as e:
+        print('\n{lightyellow}[X] [{now}] Downloading paused on link {timestamp} ({idx}/{total}){reset}'.format(
+            **ANSI,
+            now=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
+            idx=idx+1,
+            timestamp=link['timestamp'],
+            total=len(links),
+        ))
+        print('    To view your archive, open: {}/index.html'.format(OUTPUT_DIR.replace(REPO_DIR + '/', '')))
+        print('    Continue where you left off by running:')
+        print('        {} {}'.format(
+            pretty_path(sys.argv[0]),
+            link['timestamp'],
+        ))
+        if not isinstance(e, KeyboardInterrupt):
+            print()
+            raise e
+        raise SystemExit(1)

     # print timing information & summary
     end_ts = datetime.now().timestamp()
@@ -135,7 +165,7 @@ if __name__ == '__main__':
     source = sys.argv[1] if argc > 1 else None    # path of links file to import
     resume = sys.argv[2] if argc > 2 else None    # timestamp to resume dowloading from

-    stdin_raw_text = []
+    stdin_raw_text = ''

     if not sys.stdin.isatty():
         stdin_raw_text = sys.stdin.read()
@@ -192,3 +222,7 @@ if __name__ == '__main__':
         update_archive(out_dir, new_links, source=source, resume=resume, append=True)
     else:
         update_archive(out_dir, all_links, source=source, resume=resume, append=True)
+
+    # Step 5: Re-write links index with updated titles, icons, and resources
+    all_links, _ = load_links(archive_path=out_dir)
+    write_links_index(out_dir=out_dir, links=all_links)

archive_methods.py

@@ -1,16 +1,17 @@
 import os
-import re
-import sys

 from functools import wraps
 from collections import defaultdict
 from datetime import datetime
-from peekable import Peekable

-from index import wget_output_path, parse_json_link_index, write_link_index
-from links import links_after_timestamp
+from index import (
+    wget_output_path,
+    parse_json_link_index,
+    write_link_index,
+    patch_index_title_hack,
+)
 from config import (
+    OUTPUT_DIR,
     CURL_BINARY,
     GIT_BINARY,
     WGET_BINARY,
@@ -42,12 +43,12 @@ from config import (
 )
 from util import (
     without_fragment,
-    check_dependencies,
     fetch_page_title,
     progress,
     chmod_file,
     pretty_path,
-    run, PIPE, DEVNULL
+    check_link_structure,
+    run, PIPE, DEVNULL,
 )
@@ -57,38 +58,12 @@ _RESULTS_TOTALS = {  # globals are bad, mmkay
     'failed': 0,
 }

-def archive_links(archive_path, links, source=None, resume=None):
-    check_dependencies()
-
-    to_archive = Peekable(links_after_timestamp(links, resume))
-    idx, link = 0, to_archive.peek(0)
-
-    try:
-        for idx, link in enumerate(to_archive):
-            link_dir = os.path.join(ARCHIVE_DIR, link['timestamp'])
-            archive_link(link_dir, link)
-
-    except (KeyboardInterrupt, SystemExit, Exception) as e:
-        print('{lightyellow}[X] [{now}] Downloading paused on link {timestamp} ({idx}/{total}){reset}'.format(
-            **ANSI,
-            now=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
-            idx=idx+1,
-            timestamp=link['timestamp'],
-            total=len(links),
-        ))
-        print('    Continue where you left off by running:')
-        print('        {} {}'.format(
-            pretty_path(sys.argv[0]),
-            link['timestamp'],
-        ))
-        if not isinstance(e, KeyboardInterrupt):
-            raise e
-        raise SystemExit(1)
-
-
 def archive_link(link_dir, link, overwrite=True):
     """download the DOM, PDF, and a screenshot into a folder named after the link's timestamp"""

+    check_link_structure(link)
     try:
         update_existing = os.path.exists(link_dir)
         if update_existing:
@@ -99,7 +74,7 @@ def archive_link(link_dir, link, overwrite=True):
         else:
             os.makedirs(link_dir)

-        log_link_archive(link_dir, link, update_existing)
+        print_link_status_line(link_dir, link, update_existing)

         if FETCH_FAVICON:
             link = fetch_favicon(link_dir, link, overwrite=overwrite)
@@ -135,7 +110,7 @@ def archive_link(link_dir, link, overwrite=True):

     return link

-def log_link_archive(link_dir, link, update_existing):
+def print_link_status_line(link_dir, link, update_existing):
     print('[{symbol_color}{symbol}{reset}] [{now}] "{title}"\n {blue}{url}{reset}'.format(
         symbol='*' if update_existing else '+',
         symbol_color=ANSI['black' if update_existing else 'green'],
@@ -518,7 +493,7 @@ def fetch_title(link_dir, link, timeout=TIMEOUT):

     # if link already has valid title, skip it
     if link['title'] and not link['title'].lower().startswith('http'):
-        return {'output': link['title'], 'cmd': 'fetch_page_title("{}")'.format(link['url'])}
+        return {'output': link['title'], 'status': 'skipped'}

     end = progress(timeout, prefix=' ')
     try:
@@ -530,6 +505,13 @@ def fetch_title(link_dir, link, timeout=TIMEOUT):
         print(' {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
         output = e

+    # titles should show up in the global index immediatley for better UX,
+    # do a hacky immediate replacement to add them in as we're archiving
+    # TODO: figure out how to do this without gnarly string replacement
+    if title:
+        link['title'] = title
+        patch_index_title_hack(link['url'], title)
+
     return {
         'cmd': 'fetch_page_title("{}")'.format(link['url']),
         'output': output,
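The skip guard in fetch_title() above treats any pre-existing title that starts with "http" as a placeholder left over from parsing rather than a real title. A quick illustration of the condition (hypothetical link values, not part of this commit):

    link = {'url': 'https://example.com/post', 'title': 'https://example.com/post'}
    link['title'] and not link['title'].lower().startswith('http')   # False -> title gets re-fetched

    link = {'url': 'https://example.com/post', 'title': 'Example Post'}
    link['title'] and not link['title'].lower().startswith('http')   # True  -> returns {'output': ..., 'status': 'skipped'}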

index.py

@@ -6,6 +6,7 @@ from string import Template
 from distutils.dir_util import copy_tree

 from config import (
+    OUTPUT_DIR,
     TEMPLATES_DIR,
     OUTPUT_PERMISSIONS,
     ANSI,
@@ -17,6 +18,8 @@ from util import (
     wget_output_path,
     derived_link_info,
     pretty_path,
+    check_link_structure,
+    check_links_structure,
 )
@@ -25,6 +28,8 @@ from util import (
 def write_links_index(out_dir, links):
     """create index.html file for a given list of links"""

+    check_links_structure(links)
+
     if not os.path.exists(out_dir):
         os.makedirs(out_dir)
@@ -42,6 +47,8 @@ def write_links_index(out_dir, links):
 def write_json_links_index(out_dir, links):
     """write the json link index to a given path"""

+    check_links_structure(links)
+
     path = os.path.join(out_dir, 'index.json')

     index_json = {
@@ -63,13 +70,17 @@ def parse_json_links_index(out_dir):
     index_path = os.path.join(out_dir, 'index.json')
     if os.path.exists(index_path):
         with open(index_path, 'r', encoding='utf-8') as f:
-            return json.load(f)['links']
+            links = json.load(f)['links']
+            check_links_structure(links)
+            return links

     return []

 def write_html_links_index(out_dir, links):
     """write the html link index to a given path"""

+    check_links_structure(links)
+
     path = os.path.join(out_dir, 'index.html')

     copy_tree(os.path.join(TEMPLATES_DIR, 'static'), os.path.join(out_dir, 'static'))
@@ -104,6 +115,25 @@ def write_html_links_index(out_dir, links):
     chmod_file(path)

+
+def patch_index_title_hack(link_url, new_title):
+    """hack to update just one link's title in the link index json"""
+
+    json_path = os.path.join(OUTPUT_DIR, 'index.json')
+
+    links = parse_json_links_index(OUTPUT_DIR)
+
+    changed = False
+    for link in links:
+        if link['url'] == link_url:
+            link['title'] = new_title
+            changed = True
+            break
+
+    if changed:
+        write_json_links_index(OUTPUT_DIR, links)
+
+
 ### Individual link index

 def write_link_index(out_dir, link):
@@ -114,6 +144,7 @@ def write_link_index(out_dir, link):
 def write_json_link_index(out_dir, link):
     """write a json file with some info about the link"""

+    check_link_structure(link)
     path = os.path.join(out_dir, 'index.json')

     print(' √ index.json')
@@ -128,10 +159,13 @@ def parse_json_link_index(out_dir):
     existing_index = os.path.join(out_dir, 'index.json')
     if os.path.exists(existing_index):
         with open(existing_index, 'r', encoding='utf-8') as f:
-            return json.load(f)
+            link_json = json.load(f)
+            check_link_structure(link_json)
+            return link_json

     return {}

 def write_html_link_index(out_dir, link):
+    check_link_structure(link)
     with open(os.path.join(TEMPLATES_DIR, 'link_index_fancy.html'), 'r', encoding='utf-8') as f:
         link_html = f.read()
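patch_index_title_hack() above is called from fetch_title() in archive_methods.py; a minimal usage sketch (hypothetical URL and title):

    # after a title is fetched mid-run:
    patch_index_title_hack('https://example.com/post', 'Example Post Title')
    # re-reads the index.json under OUTPUT_DIR, swaps the title on the first link whose
    # 'url' matches, and only re-writes the JSON index if something actually changed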

links.py

@@ -32,34 +32,33 @@ Link {
 """

-import datetime
 from html import unescape
 from collections import OrderedDict

 from util import (
-    domain,
-    base_url,
-    str_between,
-    get_link_type,
     merge_links,
     wget_output_path,
+    check_link_structure,
+    check_links_structure,
 )
-from config import ANSI


 def validate_links(links):
+    check_links_structure(links)
+
     links = archivable_links(links)      # remove chrome://, about:, mailto: etc.
     links = uniquefied_links(links)      # merge/dedupe duplicate timestamps & urls
     links = sorted_links(links)          # deterministically sort the links based on timstamp, url

     if not links:
         print('[X] No links found :(')
         raise SystemExit(1)

     for link in links:
+        check_link_structure(link)
         link['title'] = unescape(link['title']) if link['title'] else None
         link['latest'] = link.get('latest') or {}

         latest = link['latest']
         if not link['latest'].get('wget'):
             link['latest']['wget'] = wget_output_path(link)
@@ -81,14 +80,16 @@ def validate_links(links):
     return list(links)

+
 def archivable_links(links):
     """remove chrome://, about:// or other schemed links that cant be archived"""
     return (
         link
         for link in links
-        if any(link['url'].startswith(s) for s in ('http://', 'https://', 'ftp://'))
+        if any(link['url'].lower().startswith(s) for s in ('http://', 'https://', 'ftp://'))
     )

+
 def uniquefied_links(sorted_links):
     """
     ensures that all non-duplicate links have monotonically increasing timestamps
@@ -114,10 +115,12 @@ def uniquefied_links(sorted_links):

     return unique_timestamps.values()

+
 def sorted_links(links):
     sort_func = lambda link: (link['timestamp'].split('.', 1)[0], link['url'])
     return sorted(links, key=sort_func, reverse=True)

+
 def links_after_timestamp(links, timestamp=None):
     if not timestamp:
         yield from links
@@ -130,6 +133,7 @@ def links_after_timestamp(links, timestamp=None):
         except (ValueError, TypeError):
             print('Resume value and all timestamp values must be valid numbers.')

+
 def lowest_uniq_timestamp(used_timestamps, timestamp):
     """resolve duplicate timestamps by appending a decimal 1234, 1234 -> 1234.1, 1234.2"""

parse.py

@@ -20,7 +20,6 @@ Parsed link schema: {
 import re
 import sys
 import json
-import urllib

 from collections import OrderedDict
 import xml.etree.ElementTree as etree
@@ -32,7 +31,6 @@ from util import (
     base_url,
     str_between,
     get_link_type,
-    fetch_page_title,
     URL_REGEX,
 )
@@ -56,13 +54,11 @@ def parse_links(path):
     links = []
     with open(path, 'r', encoding='utf-8') as file:
-        print('{green}[*] [{}] Parsing new links from output/sources/{} and fetching titles...{reset}'.format(
+        print('{green}[*] [{}] Parsing new links from output/sources/{}...{reset}'.format(
             datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
             path.rsplit('/', 1)[-1],
             **ANSI,
         ))
-        if SHOW_PROGRESS:
-            sys.stdout.write(' ')

         for parser_name, parser_func in get_parsers(file).items():
             # otherwise try all parsers until one works
@@ -98,7 +94,7 @@ def parse_pocket_html_export(html_file):
             'base_url': base_url(fixed_url),
             'timestamp': str(time.timestamp()),
             'tags': match.group(3),
-            'title': match.group(4).replace(' — Readability', '').replace('http://www.readability.com/read?url=', '') or fetch_page_title(fixed_url),
+            'title': match.group(4).replace(' — Readability', '').replace('http://www.readability.com/read?url=', '') or None,
             'sources': [html_file.name],
         }
         info['type'] = get_link_type(info)
@@ -135,7 +131,7 @@ def parse_pinboard_json_export(json_file):
             'base_url': base_url(url),
             'timestamp': timestamp,
             'tags': erg.get('tags') or '',
-            'title': title or fetch_page_title(url),
+            'title': title or None,
             'sources': [json_file.name],
         }
         info['type'] = get_link_type(info)
@@ -174,7 +170,7 @@ def parse_rss_export(rss_file):
             'base_url': base_url(url),
             'timestamp': str(time.timestamp()),
             'tags': '',
-            'title': title or fetch_page_title(url),
+            'title': title or None,
             'sources': [rss_file.name],
         }
         info['type'] = get_link_type(info)
@@ -215,7 +211,7 @@ def parse_shaarli_rss_export(rss_file):
             'base_url': base_url(url),
             'timestamp': str(time.timestamp()),
             'tags': '',
-            'title': title or fetch_page_title(url),
+            'title': title or None,
             'sources': [rss_file.name],
         }
         info['type'] = get_link_type(info)
@@ -242,7 +238,7 @@ def parse_netscape_html_export(html_file):
             'base_url': base_url(url),
             'timestamp': str(time.timestamp()),
             'tags': "",
-            'title': match.group(3).strip() or fetch_page_title(url),
+            'title': match.group(3).strip() or None,
             'sources': [html_file.name],
         }
         info['type'] = get_link_type(info)
@@ -275,7 +271,7 @@ def parse_pinboard_rss_export(rss_file):
             'base_url': base_url(url),
             'timestamp': str(time.timestamp()),
             'tags': tags,
-            'title': title or fetch_page_title(url),
+            'title': title or None,
             'sources': [rss_file.name],
         }
         info['type'] = get_link_type(info)
@@ -300,7 +296,7 @@ def parse_medium_rss_export(rss_file):
             'base_url': base_url(url),
             'timestamp': str(time.timestamp()),
             'tags': '',
-            'title': title or fetch_page_title(url),
+            'title': title or None,
             'sources': [rss_file.name],
         }
         info['type'] = get_link_type(info)
@@ -324,7 +320,7 @@ def parse_plain_text_export(text_file):
             'base_url': base_url(url),
             'timestamp': str(datetime.now().timestamp()),
             'tags': '',
-            'title': fetch_page_title(url),
+            'title': None,
             'sources': [text_file.name],
         }
         info['type'] = get_link_type(info)
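With title prefetching removed, every parser above now leaves 'title' empty unless the export file itself contains one; a representative parsed entry (hypothetical values; fields such as 'url' and 'type' are set outside the hunks shown) looks like:

    {
        'url': 'https://example.com/post',
        'base_url': 'example.com/post',
        'timestamp': '1550793928.0',
        'tags': '',
        'title': None,    # now filled in later by fetch_title() while archiving
        'sources': ['output/sources/bookmarks_export.html'],
    }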

util.py

@@ -3,8 +3,7 @@ import re
 import sys
 import time
 import json
-import signal
-from urllib.request import urlopen
+from urllib.request import Request, urlopen
 from urllib.parse import urlparse
 from decimal import Decimal
@@ -25,6 +24,7 @@ from config import (
     TIMEOUT,
     SHOW_PROGRESS,
     CHECK_SSL_VALIDITY,
+    WGET_USER_AGENT,
     CURL_BINARY,
     WGET_BINARY,
     CHROME_BINARY,
@@ -219,7 +219,21 @@ def save_source(raw_text):

     return source_path


-def download_url(url):
+def fetch_page_content(url, timeout=TIMEOUT):
+    req = Request(url, headers={'User-Agent': WGET_USER_AGENT})
+
+    if CHECK_SSL_VALIDITY:
+        resp = urlopen(req, timeout=timeout)
+    else:
+        import ssl
+        insecure = ssl._create_unverified_context()
+        resp = urlopen(req, timeout=timeout, context=insecure)
+
+    encoding = resp.headers.get_content_charset() or 'utf-8'
+    return resp.read().decode(encoding)
+
+
+def download_url(url, timeout=TIMEOUT):
     """download a given url's content into downloads/domain.txt"""

     if not os.path.exists(SOURCES_DIR):
@@ -236,7 +250,7 @@ def download_url(url):
     ))
     end = progress(TIMEOUT, prefix=' ')
     try:
-        downloaded_xml = urlopen(url).read().decode('utf-8')
+        downloaded_xml = fetch_page_content(url, timeout=timeout)
         end()
     except Exception as e:
         end()
@@ -260,19 +274,15 @@ def fetch_page_title(url, timeout=10, progress=SHOW_PROGRESS):
             sys.stdout.write('.')
             sys.stdout.flush()

-        if CHECK_SSL_VALIDITY:
-            html_content = urlopen(url, timeout=timeout)
-        else:
-            try:
-                import ssl
-                insecure = ssl._create_unverified_context()
-                html_content = urlopen(url, timeout=timeout, context=insecure)
-            except ImportError:
-                html_content = urlopen(url, timeout=timeout)
+        html = fetch_page_content(url, timeout=timeout)

-        match = re.search(HTML_TITLE_REGEX, html_content.read().decode('utf-8'))
+        match = re.search(HTML_TITLE_REGEX, html)
         return match.group(1).strip() if match else None
-    except Exception:
+    except Exception as err:
+        # print('[!] Failed to fetch title because of {}: {}'.format(
+        #     err.__class__.__name__,
+        #     err,
+        # ))
         return None
@@ -603,3 +613,15 @@ def run(*popenargs, input=None, capture_output=False, timeout=None, check=False,
             raise CalledProcessError(retcode, process.args,
                                      output=stdout, stderr=stderr)
     return CompletedProcess(process.args, retcode, stdout, stderr)
+
+
+def check_link_structure(link):
+    assert isinstance(link, dict)
+    assert isinstance(link.get('url'), str)
+    assert len(link['url']) > 2
+
+
+def check_links_structure(links):
+    assert isinstance(links, list)
+    if links:
+        check_link_structure(links[0])
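The two guards that close out util.py are intentionally cheap assertions rather than full schema validation: check_links_structure() only spot-checks the first item. A minimal sketch of how the rest of this commit uses them alongside fetch_page_content() (hypothetical directory and URL):

    links = parse_json_links_index('output')    # now runs check_links_structure() internally
    check_links_structure(links)                # update_archive() and validate_links() also guard explicitly

    html = fetch_page_content('https://example.com', timeout=60)   # sends WGET_USER_AGENT, honors CHECK_SSL_VALIDITY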