
refactoring and fancy new link index

Nick Sweeting 2017-10-23 04:58:41 -05:00
parent 1249493fcd
commit a95912679e
7 changed files with 295 additions and 174 deletions

index.py

@@ -1,5 +1,4 @@
 import os
-import re
 import json
 from datetime import datetime

@@ -14,20 +13,15 @@ from config import (
     ANSI,
     GIT_SHA,
 )
-from util import chmod_file
+from util import (
+    chmod_file,
+    html_appended_url,
+    derived_link_info,
+)

 ### Homepage index for all the links

-def parse_json_links_index(out_dir):
-    """load the index in a given directory and merge it with the given link"""
-    index_path = os.path.join(out_dir, 'index.json')
-    if os.path.exists(index_path):
-        with open(index_path, 'r', encoding='utf-8') as f:
-            return json.load(f)['links']
-    return []
-
 def write_links_index(out_dir, links):
     """create index.html file for a given list of links"""

@@ -44,8 +38,6 @@ def write_links_index(out_dir, links):
     write_json_links_index(out_dir, links)
     write_html_links_index(out_dir, links)

-    chmod_file(out_dir, permissions=ARCHIVE_PERMISSIONS)
-
 def write_json_links_index(out_dir, links):
     """write the json link index to a given path"""

@@ -65,6 +57,15 @@ def write_json_links_index(out_dir, links):
     chmod_file(path)

+def parse_json_links_index(out_dir):
+    """load the index in a given directory and merge it with the given link"""
+    index_path = os.path.join(out_dir, 'index.json')
+    if os.path.exists(index_path):
+        with open(index_path, 'r', encoding='utf-8') as f:
+            return json.load(f)['links']
+    return []
+
 def write_html_links_index(out_dir, links):
     """write the html link index to a given path"""

@@ -91,17 +92,11 @@ def write_html_links_index(out_dir, links):
     with open(path, 'w', encoding='utf-8') as f:
         f.write(Template(index_html).substitute(**template_vars))

+    chmod_file(path)
+
 ### Individual link index

-def parse_json_link_index(out_dir):
-    """load the index in a given directory and merge it with the given link"""
-    existing_index = os.path.join(out_dir, 'index.json')
-    if os.path.exists(existing_index):
-        with open(existing_index, 'r', encoding='utf-8') as f:
-            return json.load(f)
-    return {}
-
 def write_link_index(out_dir, link):
     link['updated'] = str(datetime.now().timestamp())
     write_json_link_index(out_dir, link)

@@ -112,85 +107,39 @@ def write_json_link_index(out_dir, link):
     path = os.path.join(out_dir, 'index.json')

+    print(' √ Updating: index.json')
     with open(path, 'w', encoding='utf-8') as f:
         json.dump(link, f, indent=4, default=str)

     chmod_file(path)

+def parse_json_link_index(out_dir):
+    """load the json link index from a given directory"""
+    existing_index = os.path.join(out_dir, 'index.json')
+    if os.path.exists(existing_index):
+        with open(existing_index, 'r', encoding='utf-8') as f:
+            return json.load(f)
+    return {}
+
 def write_html_link_index(out_dir, link):
     with open(LINK_INDEX_TEMPLATE, 'r', encoding='utf-8') as f:
         link_html = f.read()

     path = os.path.join(out_dir, 'index.html')

+    print(' √ Updating: index.html')
     with open(path, 'w', encoding='utf-8') as f:
         f.write(Template(link_html).substitute({
             **link,
-            **link['methods'],
+            **link['latest'],
             'type': link['type'] or 'website',
-            'tags': link['tags'] or '',
+            'tags': link['tags'] or 'untagged',
             'bookmarked': datetime.fromtimestamp(float(link['timestamp'])).strftime('%Y-%m-%d %H:%M'),
             'updated': datetime.fromtimestamp(float(link['updated'])).strftime('%Y-%m-%d %H:%M'),
-            'archive_org': link['methods']['archive_org'] or 'https://web.archive.org/save/{}'.format(link['url']),
-            'wget': link['methods']['wget'] or link['domain'],
+            'archive_org': link['latest']['archive_org'] or 'https://web.archive.org/save/{}'.format(link['url']),
+            'wget': link['latest']['wget'] or link['domain'],
         }))

     chmod_file(path)

-def html_appended_url(link):
-    """calculate the path to the wgetted .html file, since wget may
-    adjust some paths to be different than the base_url path.
-    See docs on wget --adjust-extension."""
-    if link['type'] in ('PDF', 'image'):
-        return link['base_url']
-
-    split_url = link['url'].split('#', 1)
-    query = ('%3F' + link['url'].split('?', 1)[-1]) if '?' in link['url'] else ''
-
-    if re.search(".+\\.[Hh][Tt][Mm][Ll]?$", split_url[0], re.I | re.M):
-        # already ends in .html
-        return link['base_url']
-    else:
-        # .html needs to be appended
-        without_scheme = split_url[0].split('://', 1)[-1].split('?', 1)[0]
-        if without_scheme.endswith('/'):
-            if query:
-                return '#'.join([without_scheme + 'index.html' + query + '.html', *split_url[1:]])
-            return '#'.join([without_scheme + 'index.html', *split_url[1:]])
-        else:
-            if query:
-                return '#'.join([without_scheme + '/index.html' + query + '.html', *split_url[1:]])
-            elif '/' in without_scheme:
-                return '#'.join([without_scheme + '.html', *split_url[1:]])
-            return link['base_url'] + '/index.html'
-
-def derived_link_info(link):
-    """extend link info with the archive urls and other derived data"""
-    link_info = {
-        **link,
-        'date': datetime.fromtimestamp(float(link['timestamp'])).strftime('%Y-%m-%d %H:%M'),
-        'google_favicon_url': 'https://www.google.com/s2/favicons?domain={domain}'.format(**link),
-        'favicon_url': 'archive/{timestamp}/favicon.ico'.format(**link),
-        'files_url': 'archive/{timestamp}/'.format(**link),
-        'archive_url': 'archive/{}/{}'.format(link['timestamp'], html_appended_url(link)),
-        'pdf_link': 'archive/{timestamp}/output.pdf'.format(**link),
-        'screenshot_link': 'archive/{timestamp}/screenshot.png'.format(**link),
-        'archive_org_url': 'https://web.archive.org/web/{base_url}'.format(**link),
-    }
-
-    # PDF and images are handled slightly differently
-    # wget, screenshot, & pdf urls all point to the same file
-    if link['type'] in ('PDF', 'image'):
-        link_info.update({
-            'archive_url': 'archive/{timestamp}/{base_url}'.format(**link),
-            'pdf_link': 'archive/{timestamp}/{base_url}'.format(**link),
-            'screenshot_link': 'archive/{timestamp}/{base_url}'.format(**link),
-            'title': '{title} ({type})'.format(**link),
-        })
-
-    return link_info
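
For orientation, a minimal sketch of how the reshuffled index API above might be driven; the output directory and the example link dict are hypothetical, not part of this commit:

    # hypothetical usage sketch of the index.py functions above
    from index import parse_json_links_index, write_links_index

    out_dir = 'html'  # assumed output folder
    existing_links = parse_json_links_index(out_dir)  # [] if there is no index.json yet

    new_link = {  # fields follow the parsed-link schema; values are made up
        'url': 'https://example.com/page',
        'domain': 'example.com',
        'base_url': 'example.com/page',
        'timestamp': '1508745600.0',
        'tags': 'example',
        'title': 'Example Page',
        'sources': ['downloads/bookmarks.html'],
        'type': None,
    }

    write_links_index(out_dir, existing_links + [new_link])  # writes index.json and index.html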

@@ -1,18 +1,11 @@
-from util import (
-    domain,
-    base_url,
-    get_str_between,
-    get_link_type,
-)
 """
 In Bookmark Archiver, a Link represents a single entry that we track in the
 json index. All links pass through all archiver functions and the latest,
-most up-to-date canonical output for each is stored in "latest_archives".
+most up-to-date canonical output for each is stored in "latest".

 Link {
-    timestamp: float,  (how we uniquely id links)     _   _  _ _  ___
+    timestamp: str,    (how we uniquely id links)     _   _  _ _  ___
     url: str,                                        | \ / \ |\| ' |
     base_url: str,                                   |_/ \_/ | | |
     domain: str,                                      _  _ _  _ _  _
@@ -20,7 +13,7 @@ Link {
     type: str,                                       | /"| | | | \_,
     title: str,                                        ,-'"`-.
     sources: [str],                                 /// /  @ @  \ \\\\
-    latest_archives: {                              :=| ,._,. |=: /
+    latest: {                                     \ :=| ,._,. |=: /
         ...,                                        || ,\ \_../ /. ||
         pdf: 'output.pdf',                          ||','`-._))'`.`||
         wget: 'example.com/1234/index.html'         `-' (/          `-'
@@ -39,10 +32,18 @@ Link {
 """

+from util import (
+    domain,
+    base_url,
+    get_str_between,
+    get_link_type,
+)

 def validate_links(links):
-    links = valid_links(links)       # remove chrome://, about:, mailto: etc.
-    links = uniquefied_links(links)  # fix duplicate timestamps, returns sorted list
-    links = sorted_links(links)      # deterministically sort the links
+    links = archivable_links(links)  # remove chrome://, about:, mailto: etc.
+    links = uniquefied_links(links)  # merge/dedupe duplicate timestamps & urls
+    links = sorted_links(links)      # deterministically sort the links based on timestamp, url

     if not links:
         print('[X] No links found :(')
@@ -50,34 +51,14 @@ def validate_links(links):
     return list(links)

-def sorted_links(links):
-    return sorted(
-        links,
-        key=lambda link: (link['timestamp'], link['url']),
-        reverse=True,
-    )
-
-def merge_links(link1, link2):
-    """deterministically merge two links, favoring longer field values over shorter,
-    and "cleaner" values over worse ones.
-    """
-    longer = lambda a, b, key: a[key] if len(a[key]) > len(b[key]) else b[key]
-    earlier = lambda a, b, key: a[key] if a[key] < b[key] else b[key]
-
-    url = longer(link1, link2, 'url')
-    longest_title = longer(link1, link2, 'title')
-    cleanest_title = link1['title'] if '://' not in link1['title'] else link2['title']
-    link = {
-        'url': url,
-        'domain': domain(url),
-        'base_url': base_url(url),
-        'timestamp': earlier(link1, link2, 'timestamp'),
-        'tags': longer(link1, link2, 'tags'),
-        'title': longest_title if '://' not in longest_title else cleanest_title,
-        'sources': list(set(link1['sources'] + link2['sources'])),
-    }
-    link['type'] = get_link_type(link)
-    return link
+def archivable_links(links):
+    """remove chrome://, about:// or other schemed links that cant be archived"""
+    return (
+        link
+        for link in links
+        if any(link['url'].startswith(s) for s in ('http://', 'https://', 'ftp://'))
+    )

 def uniquefied_links(sorted_links):
     """
@@ -104,13 +85,33 @@ def uniquefied_links(sorted_links):
     return unique_timestamps.values()

-def valid_links(links):
-    """remove chrome://, about:// or other schemed links that cant be archived"""
-    return (
-        link
-        for link in links
-        if any(link['url'].startswith(s) for s in ('http://', 'https://', 'ftp://'))
-    )
+def sorted_links(links):
+    sort_func = lambda link: (link['timestamp'], link['url'])
+    return sorted(links, key=sort_func, reverse=True)
+
+def merge_links(a, b):
+    """deterministically merge two links, favoring longer field values over shorter,
+    and "cleaner" values over worse ones.
+    """
+    longer = lambda key: a[key] if len(a[key]) > len(b[key]) else b[key]
+    earlier = lambda key: a[key] if a[key] < b[key] else b[key]
+
+    url = longer('url')
+    longest_title = longer('title')
+    cleanest_title = a['title'] if '://' not in a['title'] else b['title']
+    link = {
+        'timestamp': earlier('timestamp'),
+        'url': url,
+        'domain': domain(url),
+        'base_url': base_url(url),
+        'tags': longer('tags'),
+        'title': longest_title if '://' not in longest_title else cleanest_title,
+        'sources': list(set(a.get('sources', []) + b.get('sources', []))),
+    }
+    link['type'] = get_link_type(link)
+    return link

 def links_after_timestamp(links, timestamp=None):
     if not timestamp:
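
To make the renamed dedupe/merge pipeline above concrete, here is a rough worked example, under the assumption that links are plain dicts shaped like the schema in the module docstring (both entries are invented):

    # two invented entries for the same URL, as merge_links(a, b) would see them
    a = {'url': 'https://example.com/post', 'timestamp': '1508745600.1',
         'tags': '', 'title': 'https://example.com/post', 'sources': ['export_a.html']}
    b = {'url': 'https://example.com/post', 'timestamp': '1508745600.0',
         'tags': 'blog', 'title': 'Example Post', 'sources': ['export_b.html']}

    merged = merge_links(a, b)
    # expected by the rules above: the earlier timestamp '1508745600.0',
    # the "cleaner" title 'Example Post' (a's title contains '://'),
    # the longer tags string 'blog', and the union of both sources lists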

@@ -1,32 +1,36 @@
+"""
+Everything related to parsing links from bookmark services.
+
+For a list of supported services, see the README.md.
+For examples of supported files see examples/.
+
+Parsed link schema: {
+    'url': 'https://example.com/example/?abc=123&xyc=345#lmnop',
+    'domain': 'example.com',
+    'base_url': 'example.com/example/',
+    'timestamp': '15442123124234',
+    'tags': 'abc,def',
+    'title': 'Example.com Page Title',
+    'sources': ['ril_export.html', 'downloads/getpocket.com.txt'],
+}
+"""

 import re
 import json

 from datetime import datetime

 from util import (
     domain,
     base_url,
-    get_str_between,
+    str_between,
     get_link_type,
 )

-def parse_export(path):
-    """parse a list of links dictionaries from a bookmark export file"""
-    links = []
-    with open(path, 'r', encoding='utf-8') as file:
-        for service, parser_func in get_parsers().items():
-            # otherwise try all parsers until one works
-            try:
-                links += list(parser_func(file))
-                if links:
-                    break
-            except Exception as e:
-                pass
-
-    return links
-
-def get_parsers():
+def get_parsers(file):
+    """return all parsers that work on a given file, defaults to all of them"""
     return {
         'pocket': parse_pocket_export,
         'pinboard': parse_json_export,
@@ -34,12 +38,32 @@ def get_parsers():
         'rss': parse_rss_export,
     }

+def parse_links(path):
+    """parse a list of links dictionaries from a bookmark export file"""
+    links = []
+    with open(path, 'r', encoding='utf-8') as file:
+        for parser_func in get_parsers(file).values():
+            # otherwise try all parsers until one works
+            try:
+                links += list(parser_func(file))
+                if links:
+                    break
+            except (ValueError, TypeError):
+                # parser not supported on this file
+                pass
+
+    return links

 def parse_pocket_export(html_file):
     """Parse Pocket-format bookmarks export files (produced by getpocket.com/export/)"""
     html_file.seek(0)
-    pattern = re.compile("^\\s*<li><a href=\"(.+)\" time_added=\"(\\d+)\" tags=\"(.*)\">(.+)</a></li>", re.UNICODE)  # see sample input in ./example_ril_export.html
+    pattern = re.compile("^\\s*<li><a href=\"(.+)\" time_added=\"(\\d+)\" tags=\"(.*)\">(.+)</a></li>", re.UNICODE)
     for line in html_file:
+        # example line
+        # <li><a href="http://example.com/ time_added="1478739709" tags="tag1,tag2">example title</a></li>
         match = pattern.search(line)
         if match:
             fixed_url = match.group(1).replace('http://www.readability.com/read?url=', '')  # remove old readability prefixes to get original url
@@ -62,6 +86,8 @@ def parse_json_export(json_file):
     json_file.seek(0)
     json_content = json.load(json_file)
     for line in json_content:
+        # example line
+        # {"href":"http:\/\/www.reddit.com\/r\/example","description":"title here","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"reddit android"}]
         if line:
             erg = line
             time = datetime.strptime(erg['time'].split(',', 1)[0], '%Y-%m-%dT%H:%M:%SZ')
@@ -96,11 +122,12 @@ def parse_rss_export(rss_file):
         leading_removed = trailing_removed.split('<item>', 1)[-1]
         rows = leading_removed.split('\n')

-        row = lambda key: [r for r in rows if r.startswith('<{}>'.format(key))][0]
+        def get_row(key):
+            return [r for r in rows if r.startswith('<{}>'.format(key))][0]

-        title = get_str_between(row('title'), '<![CDATA[', ']]')
-        url = get_str_between(row('link'), '<link>', '</link>')
-        ts_str = get_str_between(row('pubDate'), '<pubDate>', '</pubDate>')
+        title = str_between(get_row('title'), '<![CDATA[', ']]')
+        url = str_between(get_row('link'), '<link>', '</link>')
+        ts_str = str_between(get_row('pubDate'), '<pubDate>', '</pubDate>')
         time = datetime.strptime(ts_str, "%a, %d %b %Y %H:%M:%S %z")

         info = {
@@ -112,17 +139,20 @@ def parse_rss_export(rss_file):
             'title': title,
             'sources': [rss_file.name],
         }
         info['type'] = get_link_type(info)
+        # import ipdb; ipdb.set_trace()
         yield info

 def parse_bookmarks_export(html_file):
     """Parse netscape-format bookmarks export files (produced by all browsers)"""
     html_file.seek(0)
     pattern = re.compile("<a href=\"(.+?)\" add_date=\"(\\d+)\"[^>]*>(.+)</a>", re.UNICODE | re.IGNORECASE)
     for line in html_file:
+        # example line
+        # <DT><A HREF="https://example.com/?q=1+2" ADD_DATE="1497562974" LAST_MODIFIED="1497562974" ICON_URI="https://example.com/favicon.ico" ICON="data:image/png;base64,...">example bookmark title</A>
         match = pattern.search(line)
         if match:
             url = match.group(1)
@@ -137,6 +167,6 @@ def parse_bookmarks_export(html_file):
             'title': match.group(3),
             'sources': [html_file.name],
         }
         info['type'] = get_link_type(info)

         yield info
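
A quick sketch of how the renamed entry point is meant to be called; the export filename below is made up:

    # hypothetical call to the parse_links entry point defined above
    links = parse_links('downloads/bookmarks_export.html')  # tries each parser until one yields links
    for link in links:
        print(link['timestamp'], link['url'], link['title'])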

@@ -68,7 +68,7 @@
                 <img src="https://nicksweeting.com/images/archive.png" height="36px">
                 Archived Sites <img src="https://getpocket.com/favicon.ico" height="36px"> <br/>
                 <small>
-                    Archived with: <a href="https://github.com/pirate/bookmark-archiver">Bookmark Archiver</a> on $date_updated
+                    <a href="https://github.com/pirate/bookmark-archiver">Bookmark Archiver</a>
                 </small>
             </h1>
         </header>

@@ -4,7 +4,7 @@
         <img src="$favicon_url">
         $title <small style="background-color: #eee;border-radius:4px; float:right">$tags</small>
     </td>
-    <td style="text-align:center"><a href="$files_url/index.html" title="Files">📂</a></td>
+    <td style="text-align:center"><a href="$files_url" title="Files">📂</a></td>
     <td style="text-align:center"><a href="$pdf_link" title="PDF">📄</a></td>
     <td style="text-align:center"><a href="$screenshot_link" title="Screenshot">🖼</a></td>
    <td style="text-align:center"><a href="$archive_org_url" title="Archive.org">🏛</a></td>

@@ -140,7 +140,7 @@
             <a href="#" class="collapse-icon" title="Collapse Navbar">
                 [-]
             </a>
-            <a href="../../../index.html" class="nav-icon" title="Archived Sites">
+            <a href="./../../index.html" class="nav-icon" title="Archived Sites">
                 <img src="https://nicksweeting.com/images/archive.png" alt="Archive Icon">
             </a>
             $title<br/>
@@ -221,6 +221,7 @@
 </body>
 <script>
+    // show selected file in iframe when preview card is clicked
     jQuery('.card').on('click', function(e) {
         jQuery('.selected-card').removeClass('selected-card')
         jQuery(e.target).closest('.card').addClass('selected-card')
@@ -233,12 +234,16 @@
         }
         return true
     })
+
+    // un-sandbox iframes showing pdfs (required to display pdf viewer)
     jQuery('iframe').map(function() {
         if (this.src.endsWith('.pdf')) {
             this.removeAttribute('sandbox')
             this.src = this.src
         }
     })
+
+    // hide header when collapse icon is clicked
     jQuery('.collapse-icon').on('click', function() {
         if (jQuery('.collapse-icon').text().includes('[-]')) {
             jQuery('.collapse-icon').text('[+]')
@@ -251,6 +256,8 @@
         }
         return true
     })
+
+    // hide all preview iframes on small screens
     if (window.innerWidth < 1091) {
         jQuery('.card a[target=preview]').attr('target', '_self')
     }
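
The $title, $tags, $files_url, etc. placeholders in these templates are filled in with Python's string.Template (as seen in index.py above). A tiny illustration of the changed row, using a made-up files_url value that already ends in index.html, which is why the template no longer appends /index.html itself:

    from string import Template

    row = '<td style="text-align:center"><a href="$files_url" title="Files">📂</a></td>'
    print(Template(row).substitute(files_url='./archive/1508745600.0/index.html'))  # illustrative value
    # -> <td style="text-align:center"><a href="./archive/1508745600.0/index.html" title="Files">📂</a></td>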

util.py

@@ -1,6 +1,8 @@
 import os
+import re
 import sys
 import time
+import json

 import requests
 from datetime import datetime
@@ -24,6 +26,17 @@ from config import (
     SUBMIT_ARCHIVE_DOT_ORG,
 )

+# URL helpers
+without_scheme = lambda url: url.replace('http://', '').replace('https://', '').replace('ftp://', '')
+without_query = lambda url: url.split('?', 1)[0]
+without_hash = lambda url: url.split('#', 1)[0]
+without_path = lambda url: url.split('/', 1)[0]
+domain = lambda url: without_hash(without_query(without_path(without_scheme(url))))
+base_url = lambda url: without_query(without_scheme(url))
+
+short_ts = lambda ts: ts.split('.')[0]
+
 def check_dependencies():
     """Check that all necessary dependencies are installed, and have valid versions"""

@@ -149,11 +162,15 @@ def progress(seconds=TIMEOUT, prefix=''):

 def download_url(url):
-    if not os.path.exists(os.path.join(ARCHIVE_DIR, 'downloads')):
-        os.makedirs(os.path.join(ARCHIVE_DIR, 'downloads'))
+    """download a given url's content into downloads/domain.txt"""
+
+    download_dir = os.path.join(ARCHIVE_DIR, 'downloads')
+    if not os.path.exists(download_dir):
+        os.makedirs(download_dir)

     url_domain = url.split('/', 3)[2]
-    output_path = os.path.join(ARCHIVE_DIR, 'downloads', '{}.txt'.format(url_domain))
+    output_path = os.path.join(download_dir, '{}.txt'.format(url_domain))

     print('[*] [{}] Downloading {} > {}'.format(
         datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
@@ -172,10 +189,10 @@ def download_url(url):
     with open(output_path, 'w', encoding='utf-8') as f:
         f.write(downloaded_xml)
     return output_path

-def get_str_between(string, start, end=None):
+def str_between(string, start, end=None):
     """(<abc>12345</def>, <abc>, </def>) -> 12345"""
     content = string.split(start, 1)[-1]
@@ -184,9 +201,6 @@ def str_between(string, start, end=None):
     return content

 def get_link_type(link):
     """Certain types of links need to be handled specially, this figures out when that's the case"""
@@ -207,10 +221,130 @@ def get_link_type(link):
     return None

-# URL helpers
-without_scheme = lambda url: url.replace('http://', '').replace('https://', '').replace('ftp://', '')
-without_query = lambda url: url.split('?', 1)[0]
-without_hash = lambda url: url.split('#', 1)[0]
-without_path = lambda url: url.split('/', 1)[0]
-domain = lambda url: without_hash(without_query(without_path(without_scheme(url))))
-base_url = lambda url: without_query(without_scheme(url))
+def find_link(folder, links):
+    """for a given archive folder, find the corresponding link object in links"""
+    url = parse_url(folder)
+    if url:
+        for link in links:
+            if (link['base_url'] in url) or (url in link['url']):
+                return link
+
+    timestamp = folder.split('.')[0]
+    for link in links:
+        if link['timestamp'].startswith(timestamp):
+            if link['domain'] in os.listdir('./html/archive/' + folder):
+                return link  # careful now, this isn't safe for most ppl
+            if link['domain'] in parse_url(folder):
+                return link
+    return None
+
+def parse_url(folder):
+    """for a given archive folder, figure out what url it's for"""
+    link_json = os.path.join('./html/archive/' + folder, 'index.json')
+    if os.path.exists(link_json):
+        with open(link_json, 'r') as f:
+            link = json.load(f)
+            return link['base_url']
+
+    archive_org_txt = os.path.join('./html/archive/' + folder, 'archive.org.txt')
+    if os.path.exists(archive_org_txt):
+        with open(archive_org_txt, 'r') as f:
+            original_link = f.read().strip().split('/http', 1)[-1]
+            with_scheme = 'http{}'.format(original_link)
+            return with_scheme
+
+    return ''
+
+def merge_folders(folder, link):
+    """given a folder, merge it to the canonical 'correct' path for the given link object"""
+    base_url = parse_url(folder)
+    if not (base_url in link['base_url']
+            or link['base_url'] in base_url):
+        print(base_url, link['base_url'])
+        assert False
+    print('{} > {}'.format(folder, link['timestamp']))
+
+def cleanup_archive(path, links):
+    """move any incorrectly named folders to their canonical locations"""
+    # for each folder that exists, see if we can match it up with a known good link
+    # if we can, then merge the two folders, if not, move it to lost & found
+
+    # for each timestamp, find similar timestamped folders
+    # check each folder for a "domain.com" folder or
+    unmatched = []
+    for folder in os.listdir(path):
+        link = find_link(folder, links)
+        if link is None:
+            unmatched.append(folder)
+            continue
+
+        if folder != link['timestamp']:
+            merge_folders(folder, link)
+
+    if unmatched:
+        print('[!] Warning! {} unrecognized folders in html/archive/'.format(len(unmatched)))
+        print('\n '.join(unmatched))
+
+def html_appended_url(link):
+    """calculate the path to the wgetted .html file, since wget may
+    adjust some paths to be different than the base_url path.
+
+    See docs on wget --adjust-extension.
+    """
+    if link['type'] in ('PDF', 'image'):
+        return link['base_url']
+
+    split_url = link['url'].split('#', 1)
+    query = ('%3F' + link['url'].split('?', 1)[-1]) if '?' in link['url'] else ''
+
+    if re.search(".+\\.[Hh][Tt][Mm][Ll]?$", split_url[0], re.I | re.M):
+        # already ends in .html
+        return link['base_url']
+    else:
+        # .html needs to be appended
+        without_scheme = split_url[0].split('://', 1)[-1].split('?', 1)[0]
+        if without_scheme.endswith('/'):
+            if query:
+                return '#'.join([without_scheme + 'index.html' + query + '.html', *split_url[1:]])
+            return '#'.join([without_scheme + 'index.html', *split_url[1:]])
+        else:
+            if query:
+                return '#'.join([without_scheme + '/index.html' + query + '.html', *split_url[1:]])
+            elif '/' in without_scheme:
+                return '#'.join([without_scheme + '.html', *split_url[1:]])
+            return link['base_url'] + '/index.html'
+
+def derived_link_info(link):
+    """extend link info with the archive urls and other derived data"""
+    link_info = {
+        **link,
+        'date': datetime.fromtimestamp(float(link['timestamp'])).strftime('%Y-%m-%d %H:%M'),
+        'google_favicon_url': 'https://www.google.com/s2/favicons?domain={domain}'.format(**link),
+        'favicon_url': './archive/{timestamp}/favicon.ico'.format(**link),
+        'files_url': './archive/{timestamp}/index.html'.format(**link),
+        'archive_url': './archive/{}/{}'.format(link['timestamp'], html_appended_url(link)),
+        'pdf_link': './archive/{timestamp}/output.pdf'.format(**link),
+        'screenshot_link': './archive/{timestamp}/screenshot.png'.format(**link),
+        'archive_org_url': 'https://web.archive.org/web/{base_url}'.format(**link),
+    }
+
+    # PDF and images are handled slightly differently
+    # wget, screenshot, & pdf urls all point to the same file
+    if link['type'] in ('PDF', 'image'):
+        link_info.update({
+            'archive_url': 'archive/{timestamp}/{base_url}'.format(**link),
+            'pdf_link': 'archive/{timestamp}/{base_url}'.format(**link),
+            'screenshot_link': 'archive/{timestamp}/{base_url}'.format(**link),
+            'title': '{title} ({type})'.format(**link),
+        })
+
+    return link_info
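
To show what the new URL helpers and derived_link_info produce, a small worked example traced from the code above; the link dict values are invented:

    from util import domain, base_url, derived_link_info  # helpers defined above

    link = {
        'url': 'https://example.com/blog/post?id=42',
        'timestamp': '1508745600.0',
        'title': 'Example Post',
        'tags': '',
        'type': None,
        'sources': ['downloads/bookmarks.html'],
    }
    link['domain'] = domain(link['url'])      # 'example.com'
    link['base_url'] = base_url(link['url'])  # 'example.com/blog/post'

    info = derived_link_info(link)
    # info['files_url']   == './archive/1508745600.0/index.html'
    # info['archive_url'] == './archive/1508745600.0/example.com/blog/post/index.html%3Fid=42.html'
    # (html_appended_url adds index.html plus the percent-encoded query, mirroring wget --adjust-extension)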