From 8502fa5cc3aa608a546fd93483f113a826b02332 Mon Sep 17 00:00:00 2001 From: mlazana Date: Wed, 27 Mar 2019 20:10:05 +0200 Subject: [PATCH] config.py: update function exclude_blacklisted(links) --- archivebox/links.py | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/archivebox/links.py b/archivebox/links.py index fd8985ec..5eff61f4 100644 --- a/archivebox/links.py +++ b/archivebox/links.py @@ -34,11 +34,11 @@ from config import ( def validate_links(links): check_links_structure(links) - links = archivable_links(links) # remove chrome://, about:, mailto: etc. - links = uniquefied_links(links) # merge/dedupe duplicate timestamps & urls - links = sorted_links(links) # deterministically sort the links based on timstamp, url - links = exclude_links(links) # exclude links that are in blacklist - + links = archivable_links(links) # remove chrome://, about:, mailto: etc. + links = uniquefied_links(links) # merge/dedupe duplicate timestamps & urls + links = sorted_links(links) # deterministically sort the links based on timestamp, url + links = list(exclude_blacklisted(links)) # exclude URLs that match the blacklisted url pattern regex + if not links: print('[X] No links found :(') raise SystemExit(1) @@ -46,7 +46,7 @@ def validate_links(links): for link in links: link['title'] = unescape(link['title'].strip()) if link['title'] else None check_link_structure(link) - + return list(links) @@ -120,9 +120,8 @@ def lowest_uniq_timestamp(used_timestamps, timestamp): return new_timestamp -def exclude_links(links): - """exclude links that are in blacklist""" - - links = [link for link in links if not URL_BLACKLIST.match(link['url'])] - - return links \ No newline at end of file +def exclude_blacklisted(links): + """exclude URLs that match the blacklisted url pattern regex""" + return (link for link in links if not URL_BLACKLIST.match(link['url'])) + + \ No newline at end of file