From 5a7d00a6399f2b3a256a6059920a9330134b5fd7 Mon Sep 17 00:00:00 2001
From: Nick Sweeting
Date: Tue, 19 Feb 2019 01:44:54 -0500
Subject: [PATCH] fetch page title during archiving process

---
 archivebox/archive_methods.py | 32 ++++++++++++++++++++++++++++++--
 archivebox/config.py          |  1 +
 archivebox/links.py           |  5 ++++-
 archivebox/util.py            | 20 ++++++++------------
 etc/ArchiveBox.conf.default   |  1 +
 5 files changed, 44 insertions(+), 15 deletions(-)

diff --git a/archivebox/archive_methods.py b/archivebox/archive_methods.py
index 0148849d..26530d22 100644
--- a/archivebox/archive_methods.py
+++ b/archivebox/archive_methods.py
@@ -12,6 +12,8 @@ from index import wget_output_path, parse_json_link_index, write_link_index
 from links import links_after_timestamp
 from config import (
     CHROME_BINARY,
+    FETCH_FAVICON,
+    FETCH_TITLE,
     FETCH_WGET,
     FETCH_WGET_REQUISITES,
     FETCH_PDF,
@@ -23,7 +25,6 @@ from config import (
     RESOLUTION,
     CHECK_SSL_VALIDITY,
     SUBMIT_ARCHIVE_DOT_ORG,
-    FETCH_FAVICON,
     WGET_USER_AGENT,
     CHROME_USER_DATA_DIR,
     CHROME_SANDBOX,
@@ -36,6 +37,7 @@ from config import (
 )
 from util import (
     check_dependencies,
+    fetch_page_title,
     progress,
     chmod_file,
     pretty_path,
@@ -96,6 +98,9 @@ def archive_link(link_dir, link, overwrite=True):
         if FETCH_FAVICON:
             link = fetch_favicon(link_dir, link, overwrite=overwrite)
 
+        if FETCH_TITLE:
+            link = fetch_title(link_dir, link, overwrite=overwrite)
+
         if FETCH_WGET:
             link = fetch_wget(link_dir, link, overwrite=overwrite)
 
@@ -129,7 +134,7 @@ def log_link_archive(link_dir, link, update_existing):
         symbol='*' if update_existing else '+',
         symbol_color=ANSI['black' if update_existing else 'green'],
         now=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
-        **link,
+        **{**link, 'title': link['title'] or link['url']},
         **ANSI,
     ))
 
@@ -492,6 +497,29 @@ def fetch_favicon(link_dir, link, timeout=TIMEOUT):
         'output': output,
     }
 
+@attach_result_to_link('title')
+def fetch_title(link_dir, link, timeout=TIMEOUT):
+    """try to guess the page's title from its content"""
+
+    # if link already has valid title, skip it
+    if link['title'] and not link['title'].lower().startswith('http'):
+        return {'output': link['title'], 'cmd': 'fetch_page_title("{}")'.format(link['url'])}
+
+    end = progress(timeout, prefix='      ')
+    try:
+        title = fetch_page_title(link['url'], timeout=timeout, progress=False)
+        end()
+        output = title
+    except Exception as e:
+        end()
+        print('        {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
+        output = e
+
+    return {
+        'cmd': 'fetch_page_title("{}")'.format(link['url']),
+        'output': output,
+    }
+
 @attach_result_to_link('media')
 def fetch_media(link_dir, link, timeout=MEDIA_TIMEOUT, overwrite=False):
     """Download playlists or individual video, audio, and subtitles using youtube-dl"""
diff --git a/archivebox/config.py b/archivebox/config.py
index c887c7f2..1202fd3c 100644
--- a/archivebox/config.py
+++ b/archivebox/config.py
@@ -27,6 +27,7 @@ FETCH_WARC = os.getenv('FETCH_WARC', 'True'
 FETCH_GIT = os.getenv('FETCH_GIT', 'True' ).lower() == 'true'
 FETCH_MEDIA = os.getenv('FETCH_MEDIA', 'True' ).lower() == 'true'
 FETCH_FAVICON = os.getenv('FETCH_FAVICON', 'True' ).lower() == 'true'
+FETCH_TITLE = os.getenv('FETCH_TITLE', 'True' ).lower() == 'true'
 SUBMIT_ARCHIVE_DOT_ORG = os.getenv('SUBMIT_ARCHIVE_DOT_ORG', 'True' ).lower() == 'true'
 CHECK_SSL_VALIDITY = os.getenv('CHECK_SSL_VALIDITY', 'True' ).lower() == 'true'
 
diff --git a/archivebox/links.py b/archivebox/links.py
index e544618a..1a88f793 100644
--- a/archivebox/links.py
+++ b/archivebox/links.py
@@ -57,7 +57,7 @@ def validate_links(links):
         raise SystemExit(1)
 
     for link in links:
-        link['title'] = unescape(link['title'])
+        link['title'] = unescape(link['title']) if link['title'] else None
         link['latest'] = link.get('latest') or {}
 
         latest = link['latest']
@@ -76,6 +76,9 @@ def validate_links(links):
         if not latest.get('favicon'):
             latest['favicon'] = None
 
+        if not link['latest'].get('title'):
+            link['latest']['title'] = link['title']
+
     return list(links)
 
 def new_links(all_links, existing_links):
diff --git a/archivebox/util.py b/archivebox/util.py
index 6a91dd76..89f95ccf 100644
--- a/archivebox/util.py
+++ b/archivebox/util.py
@@ -44,6 +44,7 @@ base_url = lambda url: without_scheme(url) # uniq base url used to dedupe links
 short_ts = lambda ts: ts.split('.')[0]
 
 URL_REGEX = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))[^<\""]+'
+HTML_TITLE_REGEX = '<title>(.[^<>]+)</title>'
 
 
 def check_dependencies():
@@ -227,22 +228,17 @@ def download_url(url):
     return source_path
 
 
-def fetch_page_title(url, default=True):
+def fetch_page_title(url, timeout=10, progress=SHOW_PROGRESS):
     """Attempt to guess a page's title by downloading the html"""
-    if default is True:
-        default = url
-
     try:
-        if SHOW_PROGRESS:
+        if progress:
             sys.stdout.write('.')
             sys.stdout.flush()
         html_content = urllib.request.urlopen(url, timeout=10).read().decode('utf-8')
         match = re.search('<title>(.*?)</title>', html_content)
         return match.group(1) if match else default or None
     except Exception:
-        if default is False:
-            raise
-        return default
+        return None
 
 
 def str_between(string, start, end=None):
@@ -277,19 +273,19 @@ def merge_links(a, b):
     """deterministially merge two links, favoring longer field values over shorter,
     and "cleaner" values over worse ones.
     """
-    longer = lambda key: a[key] if len(a[key]) > len(b[key]) else b[key]
+    longer = lambda key: (a[key] if len(a[key]) > len(b[key]) else b[key]) if (a[key] and b[key]) else (a[key] or b[key])
     earlier = lambda key: a[key] if a[key] < b[key] else b[key]
 
     url = longer('url')
     longest_title = longer('title')
-    cleanest_title = a['title'] if '://' not in a['title'] else b['title']
+    cleanest_title = a['title'] if '://' not in (a['title'] or '') else b['title']
     link = {
         'timestamp': earlier('timestamp'),
         'url': url,
         'domain': domain(url),
         'base_url': base_url(url),
         'tags': longer('tags'),
-        'title': longest_title if '://' not in longest_title else cleanest_title,
+        'title': longest_title if '://' not in (longest_title or '') else cleanest_title,
         'sources': list(set(a.get('sources', []) + b.get('sources', []))),
     }
     link['type'] = get_link_type(link)
@@ -532,7 +528,7 @@ def derived_link_info(link):
         'pdf_link': 'archive/{timestamp}/{base_url}'.format(**link),
         'screenshot_link': 'archive/{timestamp}/{base_url}'.format(**link),
         'dom_link': 'archive/{timestamp}/{base_url}'.format(**link),
-        'title': '{title} ({type})'.format(**link),
+        'title': link['title'] or basename(link['url']),
     })
 
     return link_info
diff --git a/etc/ArchiveBox.conf.default b/etc/ArchiveBox.conf.default
index 08fcbe1c..52dc5c4b 100644
--- a/etc/ArchiveBox.conf.default
+++ b/etc/ArchiveBox.conf.default
@@ -10,6 +10,7 @@
 # FETCH_MEDIA=False
 # FETCH_GIT=True
 # FETCH_FAVICON=True
+# FETCH_TITLE=True
 # SUBMIT_ARCHIVE_DOT_ORG=True
 
 ### To only download new links, and never attempt to update old ones, uncomment this line:
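
Note (not part of the patch): a minimal standalone sketch of the title-fetching
behavior this commit wires in, handy for spot-checking the regex against a live
page before running a full archive pass. The URL below is an arbitrary example,
and the function is a simplified stand-in mirroring util.fetch_page_title and
the HTML_TITLE_REGEX constant added above, not the exact shipped code.

    import re
    import urllib.request

    HTML_TITLE_REGEX = '<title>(.[^<>]+)</title>'

    def fetch_page_title(url, timeout=10):
        # download the page and pull the contents of the first <title> tag,
        # returning None on any network or parse failure (as util.py does above)
        try:
            html = urllib.request.urlopen(url, timeout=timeout).read().decode('utf-8')
            match = re.search(HTML_TITLE_REGEX, html)
            return match.group(1) if match else None
        except Exception:
            return None

    print(fetch_page_title('https://example.com'))  # prints: Example Domain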