1
0
Fork 0
mirror of synced 2024-10-02 18:17:07 +13:00

Merge pull request #96 from fgtham/fix_issue_74

fix unstable sorting between consecutive runs
This commit is contained in:
Nick Sweeting 2018-09-19 21:55:36 -04:00 committed by GitHub
commit d46caf604d
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23

View file

@@ -34,6 +34,7 @@ Link {
import datetime import datetime
from html import unescape from html import unescape
from collections import OrderedDict
from util import ( from util import (
domain, domain,
@@ -87,7 +88,7 @@ def uniquefied_links(sorted_links):
ensures that all non-duplicate links have monotonically increasing timestamps ensures that all non-duplicate links have monotonically increasing timestamps
""" """
unique_urls = {} unique_urls = OrderedDict()
lower = lambda url: url.lower().strip() lower = lambda url: url.lower().strip()
without_www = lambda url: url.replace('://www.', '://', 1) without_www = lambda url: url.replace('://www.', '://', 1)
@@ -100,7 +101,7 @@ def uniquefied_links(sorted_links):
link = merge_links(unique_urls[fuzzy_url], link) link = merge_links(unique_urls[fuzzy_url], link)
unique_urls[fuzzy_url] = link unique_urls[fuzzy_url] = link
unique_timestamps = {} unique_timestamps = OrderedDict()
for link in unique_urls.values(): for link in unique_urls.values():
link['timestamp'] = lowest_uniq_timestamp(unique_timestamps, link['timestamp']) link['timestamp'] = lowest_uniq_timestamp(unique_timestamps, link['timestamp'])
unique_timestamps[link['timestamp']] = link unique_timestamps[link['timestamp']] = link
@@ -108,7 +109,7 @@ def uniquefied_links(sorted_links):
return unique_timestamps.values() return unique_timestamps.values()
def sorted_links(links): def sorted_links(links):
sort_func = lambda link: (link['timestamp'], link['url']) sort_func = lambda link: (link['timestamp'].split('.', 1)[0], link['url'])
return sorted(links, key=sort_func, reverse=True) return sorted(links, key=sort_func, reverse=True)
def links_after_timestamp(links, timestamp=None): def links_after_timestamp(links, timestamp=None):