From 5de6b3adc1ae8c8ad08af487a4753405d03ccad5 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Thu, 15 Jun 2017 17:33:01 -0500 Subject: [PATCH] de-duplicate timstamps if using url --- archive.py | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/archive.py b/archive.py index c6a0f2ba..100e123f 100755 --- a/archive.py +++ b/archive.py @@ -216,6 +216,52 @@ def fetch_favicon(out_dir, link, overwrite=False): ### ORCHESTRATION +def next_uniq_timestamp(used_timestamps, timestamp): + """resolve duplicate timestamps by appending a decimal""" + + if timestamp not in used_timestamps: + return timestamp + + if '.' in timestamp: + timestamp, nonce = timestamp.split('.') + nonce = int(nonce) + else: + nonce = 1 + + new_timestamp = '{}.{}'.format(timestamp, nonce) + + while new_timestamp in used_timestamps: + nonce += 1 + new_timestamp = '{}.{}'.format(timestamp, nonce) + + return new_timestamp + +def uniquefied_links(links): + """uniqueify link timestamps by de-duping using url, returns links sorted most recent -> oldest + + needed because firefox will produce exports where many links share the same timestamp, this func + ensures that all non-duplicate links have monotonically increasing timestamps""" + + links = list(reversed(sorted(links, key=lambda l: (l['timestamp'], l['url'])))) + seen_timestamps = {} + + for link in links: + t = link['timestamp'] + if t in seen_timestamps: + if link['url'] == seen_timestamps[t]['url']: + # don't create new unique timestamp if link is the same + continue + else: + # resolve duplicate timstamp by appending a decimal + link['timestamp'] = next_uniq_timestamp(seen_timestamps, link['timestamp']) + seen_timestamps[link['timestamp']] = link + + return links + +def valid_links(links): + return (link for link in links if link['url'].startswith('http') or link['url'].startswith('ftp')) + + def dump_index(links, service): with open(INDEX_TEMPLATE, 'r') as f: index_html = f.read() @@ -271,6 +317,10 @@ def dump_website(link, service, overwrite=False): if link['type']: print(' i Type: {}'.format(link['type'])) + if not link['url'].startswith('http'): + print(' X Skipping: invalid link.') + return + if FETCH_WGET: fetch_wget(out_dir, link, overwrite=overwrite)