diff --git a/README.md b/README.md
index 87e94038..29ede58d 100644
--- a/README.md
+++ b/README.md
@@ -377,10 +377,11 @@ will run fast subsequent times because it only downloads new links that haven't
 ## Changelog
 
- - v0.0.2 released
+ - v0.0.3 released
     - support for chrome `--user-data-dir` to archive sites that need logins
     - fancy individual html & json indexes for each link
     - smartly append new links to existing index instead of overwriting
+ - v0.0.2 released
     - proper HTML templating instead of format strings (thanks to https://github.com/bardisty!)
     - refactored into separate files, wip audio & video archiving
  - v0.0.1 released
diff --git a/archive.py b/archive.py
index 89a0935c..fcecae7f 100755
--- a/archive.py
+++ b/archive.py
@@ -3,6 +3,7 @@
 # Nick Sweeting 2017 | MIT License
 # https://github.com/pirate/bookmark-archiver
 
+import os
 import sys
 
 from datetime import datetime
@@ -19,7 +20,6 @@ from index import (
 from config import (
     ARCHIVE_PERMISSIONS,
     HTML_FOLDER,
-    ARCHIVE_FOLDER,
     ANSI,
     TIMEOUT,
 )
@@ -33,19 +33,50 @@ from util import (
 __DESCRIPTION__ = 'Bookmark Archiver: Create a browsable html archive of a list of links.'
 __DOCUMENTATION__ = 'https://github.com/pirate/bookmark-archiver'
 
+def print_help():
+    print(__DESCRIPTION__)
+    print("Documentation: {}\n".format(__DOCUMENTATION__))
+    print("Usage:")
+    print("    ./archive.py ~/Downloads/bookmarks_export.html\n")
 
-def update_archive(export_path, links, resume=None, append=True):
+
+def get_links(new_links_file_path, archive_path=HTML_FOLDER):
+    """get new links from file and optionally append them to links in existing archive"""
+    # parse and validate the new_links_file
+    raw_links = parse_links(new_links_file_path)
+    valid_links = validate_links(raw_links)
+
+    # merge existing links in archive_path and new links
+    existing_links = []
+    if archive_path:
+        existing_links = parse_json_links_index(archive_path)
+        valid_links = validate_links(existing_links + valid_links)
+
+    num_new_links = len(valid_links) - len(existing_links)
+    print('[*] [{}] Adding {} new links from {} to index'.format(
+        datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
+        num_new_links,
+        new_links_file_path,
+    ))
+
+    return valid_links
+
+def update_archive(archive_path, links, source=None, resume=None, append=True):
     """update or create index.html+json given a path to an export file containing new links"""
 
     start_ts = datetime.now().timestamp()
 
     # loop over links and archive them
-    archive_links(ARCHIVE_FOLDER, links, export_path, resume=resume)
+    archive_links(archive_path, links, source=source, resume=resume)
 
     # print timing information & summary
     end_ts = datetime.now().timestamp()
-    seconds = round(end_ts - start_ts, 2)
-    duration = '{} min'.format(round(seconds / 60, 2)) if seconds > 60 else '{} sec'.format(seconds)
+    seconds = end_ts - start_ts
+    if seconds > 60:
+        duration = '{0:.2f} min'.format(seconds / 60)
+    else:
+        duration = '{0:.2f} sec'.format(seconds)
+
     print('{}[√] [{}] Archive update complete ({}){}'.format(
         ANSI['green'],
         datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
@@ -57,53 +88,37 @@ def update_archive(export_path, links, resume=None, append=True):
     print('    - {} errors'.format(_RESULTS_TOTALS['failed']))
 
-
-def update_index(export_path, resume=None, append=True):
-    """handling parsing new links into the json index, returns a set of clean links"""
-
-    # parse an validate the export file
-    new_links = validate_links(parse_links(export_path))
-
-    # load existing links if archive folder is present
-    existing_links = []
-    if append:
-        existing_links = parse_json_links_index(HTML_FOLDER)
-    links = validate_links(existing_links + new_links)
-
-
-    # merge existing links and new links
-    num_new_links = len(links) - len(existing_links)
-    print('[*] [{}] Adding {} new links from {} to index'.format(
-        datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
-        num_new_links,
-        export_path,
-    ))
-
-    # write link index html & json
-    write_links_index(HTML_FOLDER, links)
-
-    return links
-
-
 if __name__ == '__main__':
     argc = len(sys.argv)
 
-    if argc < 2 or sys.argv[1] in ('-h', '--help', 'help'):
-        print(__DESCRIPTION__)
-        print("Documentation: {}".format(__DOCUMENTATION__))
-        print("")
-        print("Usage:")
-        print("    ./archive.py ~/Downloads/bookmarks_export.html")
-        print("")
+    if argc < 2 or set(sys.argv) & {'-h', '--help', 'help'}:
+        print_help()
         raise SystemExit(0)
 
-    export_path = sys.argv[1]                        # path to export file
-    resume_from = sys.argv[2] if argc > 2 else None  # timestamp to resume dowloading from
+    source = sys.argv[1]                        # path to export file
+    resume = sys.argv[2] if argc > 2 else None  # timestamp to resume downloading from
+
+    # See if archive folder already exists
+    for out_folder in (HTML_FOLDER, 'bookmarks', 'pocket', 'pinboard', 'html'):
+        if os.path.exists(out_folder):
+            break
+    else:
+        out_folder = HTML_FOLDER
 
-    if any(export_path.startswith(s) for s in ('http://', 'https://', 'ftp://')):
-        export_path = download_url(export_path)
+    archive_path = os.path.join(out_folder, 'archive')
 
-    links = update_index(export_path, resume=resume_from, append=True)
+    # Step 0: Download url to local file (only happens if a URL is specified instead of local path)
+    if any(source.startswith(s) for s in ('http://', 'https://', 'ftp://')):
+        source = download_url(source)
 
-    # make sure folder structure is sane
-    cleanup_archive(ARCHIVE_FOLDER, links)
-    update_archive(export_path, links, resume=resume_from, append=True)
+    # Step 1: Parse the links and dedupe them with existing archive
+    links = get_links(source, archive_path=archive_path)
+
+    # Step 2: Write new index
+    write_links_index(archive_path, links)
+
+    # Step 3: Verify folder structure is 1:1 with index
+    cleanup_archive(archive_path, links)
+
+    # Step 4: Run the archive methods for each link
+    update_archive(archive_path, links, source=source, resume=resume, append=True)
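Usage note: with the new entry point in archive.py above, the script takes the export source (a local file or a URL to one) as its first argument and an optional link timestamp to resume from as its second. A minimal sketch of both invocations, using an illustrative export path and timestamp rather than values from the patch:

    # archive a fresh bookmarks/pocket/pinboard HTML export (or a URL pointing to one)
    ./archive.py ~/Downloads/bookmarks_export.html

    # resume an interrupted run, starting from the printed link timestamp
    ./archive.py ~/Downloads/bookmarks_export.html 1510000000
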
diff --git a/archive_methods.py b/archive_methods.py
index 06f2c58b..af934f17 100644
--- a/archive_methods.py
+++ b/archive_methods.py
@@ -36,24 +36,24 @@ _RESULTS_TOTALS = {  # globals are bad, mmkay
     'failed': 0,
 }
 
-def archive_links(out_dir, links, export_path, resume=None):
+def archive_links(archive_path, links, source=None, resume=None):
     check_dependencies()
 
     to_archive = links_after_timestamp(links, resume)
     try:
         for idx, link in enumerate(to_archive):
-            link_dir = os.path.join(out_dir, link['timestamp'])
+            link_dir = os.path.join(archive_path, link['timestamp'])
             archive_link(link_dir, link)
 
     except (KeyboardInterrupt, SystemExit, Exception) as e:
-        print('{red}[X] Archive update stopped on #{idx} out of {total} links{reset}'.format(
+        print('{red}[X] Index is up-to-date, archive update paused on link {idx}/{total}{reset}'.format(
             **ANSI,
             idx=idx,
             total=len(list(to_archive)),
         ))
         print('    Continue where you left off by running:')
         print('        ./archive.py {} {}'.format(
-            export_path,
+            source,
             link['timestamp'],
         ))
         if not isinstance(e, KeyboardInterrupt):
@@ -61,42 +61,46 @@
         raise SystemExit(1)
 
 
-def archive_link(out_dir, link, overwrite=False):
+def archive_link(link_dir, link, overwrite=False):
     """download the DOM, PDF, and a screenshot into a folder named after the link's timestamp"""
 
-    link = {**parse_json_link_index(out_dir), **link}
-    log_link_archive(out_dir, link)
+    update_existing = os.path.exists(link_dir)
+    if update_existing:
+        link = {
+            **parse_json_link_index(link_dir),
+            **link,
+        }
+    else:
+        os.makedirs(link_dir)
+
+    log_link_archive(link_dir, link, update_existing)
 
     if FETCH_WGET:
-        link = fetch_wget(out_dir, link, overwrite=overwrite)
+        link = fetch_wget(link_dir, link, overwrite=overwrite)
 
     if FETCH_PDF:
-        link = fetch_pdf(out_dir, link, overwrite=overwrite)
+        link = fetch_pdf(link_dir, link, overwrite=overwrite)
 
     if FETCH_SCREENSHOT:
-        link = fetch_screenshot(out_dir, link, overwrite=overwrite)
+        link = fetch_screenshot(link_dir, link, overwrite=overwrite)
 
     if SUBMIT_ARCHIVE_DOT_ORG:
-        link = archive_dot_org(out_dir, link, overwrite=overwrite)
+        link = archive_dot_org(link_dir, link, overwrite=overwrite)
 
     # if FETCH_AUDIO:
-    #     link = fetch_audio(out_dir, link, overwrite=overwrite)
+    #     link = fetch_audio(link_dir, link, overwrite=overwrite)
 
     # if FETCH_VIDEO:
-    #     link = fetch_video(out_dir, link, overwrite=overwrite)
+    #     link = fetch_video(link_dir, link, overwrite=overwrite)
 
     if FETCH_FAVICON:
-        link = fetch_favicon(out_dir, link, overwrite=overwrite)
+        link = fetch_favicon(link_dir, link, overwrite=overwrite)
 
-    write_link_index(out_dir, link)
+    write_link_index(link_dir, link)
 
     return link
 
-def log_link_archive(out_dir, link):
-    update_existing = os.path.exists(out_dir)
-    if not update_existing:
-        os.makedirs(out_dir)
-
+def log_link_archive(link_dir, link, update_existing):
     print('[{symbol_color}{symbol}{reset}] [{timestamp}] "{title}": {blue}{base_url}{reset}'.format(
         symbol='*' if update_existing else '+',
         symbol_color=ANSI['black' if update_existing else 'green'],
@@ -106,7 +110,7 @@ def log_link_archive(out_dir, link):
     if link['type']:
         print('    i Type: {}'.format(link['type']))
 
-    print('    {} ({})'.format(out_dir, 'updating' if update_existing else 'creating'))
+    print('    {} ({})'.format(link_dir, 'updating' if update_existing else 'creating'))
 
 
@@ -118,7 +122,7 @@ def attach_result_to_link(method):
     """
     def decorator(fetch_func):
         @wraps(fetch_func)
-        def timed_fetch_func(out_dir, link, overwrite=False, **kwargs):
+        def timed_fetch_func(link_dir, link, overwrite=False, **kwargs):
             # initialize methods and history json field on link
             link['latest'] = link.get('latest') or {}
             link['latest'][method] = link['latest'].get(method) or None
@@ -133,7 +137,7 @@ def attach_result_to_link(method):
                 result = None
             else:
                 print('    - Fetching: {}'.format(method))
-                result = fetch_func(out_dir, link, **kwargs)
+                result = fetch_func(link_dir, link, **kwargs)
 
             end_ts = datetime.now().timestamp()
             duration = str(end_ts * 1000 - start_ts * 1000).split('.')[0]
@@ -164,10 +168,10 @@ def attach_result_to_link(method):
 
 
 @attach_result_to_link('wget')
-def fetch_wget(out_dir, link, requisites=FETCH_WGET_REQUISITES, timeout=TIMEOUT):
+def fetch_wget(link_dir, link, requisites=FETCH_WGET_REQUISITES, timeout=TIMEOUT):
     """download full site using wget"""
 
-    if os.path.exists(os.path.join(out_dir, link['domain'])):
+    if os.path.exists(os.path.join(link_dir, link['domain'])):
         return {'output': html_appended_url(link), 'status': 'skipped'}
 
     CMD = [
@@ -178,7 +182,7 @@ def fetch_wget(out_dir, link, requisites=FETCH_WGET_REQUISITES, timeout=TIMEOUT)
     ]
     end = progress(timeout, prefix='      ')
     try:
-        result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=out_dir, timeout=timeout + 1)  # index.html
+        result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout + 1)  # index.html
         end()
         output = html_appended_url(link)
         if result.returncode > 0:
@@ -187,7 +191,7 @@ def fetch_wget(out_dir, link, requisites=FETCH_WGET_REQUISITES, timeout=TIMEOUT)
             # raise Exception('Failed to wget download')
     except Exception as e:
         end()
-        print('       Run to see full output:', 'cd {}; {}'.format(out_dir, ' '.join(CMD)))
+        print('       Run to see full output:', 'cd {}; {}'.format(link_dir, ' '.join(CMD)))
         print('       {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
         output = e
@@ -198,24 +202,23 @@ def fetch_wget(out_dir, link, requisites=FETCH_WGET_REQUISITES, timeout=TIMEOUT)
 
 
 @attach_result_to_link('pdf')
-def fetch_pdf(out_dir, link, timeout=TIMEOUT, user_data_dir=CHROME_USER_DATA_DIR):
+def fetch_pdf(link_dir, link, timeout=TIMEOUT, user_data_dir=CHROME_USER_DATA_DIR):
     """print PDF of site to file using chrome --headless"""
 
     if link['type'] in ('PDF', 'image'):
         return {'output': html_appended_url(link)}
 
-    if os.path.exists(os.path.join(out_dir, 'output.pdf')):
+    if os.path.exists(os.path.join(link_dir, 'output.pdf')):
         return {'output': 'output.pdf', 'status': 'skipped'}
 
     CMD = [
-        CHROME_BINARY,
-        *'--headless --disable-gpu --print-to-pdf'.split(' '),
-        *chrome_data_dir_args(user_data_dir),
+        *chrome_headless(user_data_dir=user_data_dir),
+        '--print-to-pdf',
         link['url']
     ]
     end = progress(timeout, prefix='      ')
     try:
-        result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=out_dir, timeout=timeout + 1)  # output.pdf
+        result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout + 1)  # output.pdf
         end()
         if result.returncode:
             print('     ', (result.stderr or result.stdout).decode())
@@ -223,7 +226,7 @@ def fetch_pdf(out_dir, link, timeout=TIMEOUT, user_data_dir=CHROME_USER_DATA_DIR
         output = 'output.pdf'
     except Exception as e:
         end()
-        print('       Run to see full output:', 'cd {}; {}'.format(out_dir, ' '.join(CMD)))
+        print('       Run to see full output:', 'cd {}; {}'.format(link_dir, ' '.join(CMD)))
         print('       {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
         output = e
@@ -234,34 +237,33 @@ def fetch_pdf(out_dir, link, timeout=TIMEOUT, user_data_dir=CHROME_USER_DATA_DIR
 
 
 @attach_result_to_link('screenshot')
-def fetch_screenshot(out_dir, link, timeout=TIMEOUT, user_data_dir=CHROME_USER_DATA_DIR, resolution=RESOLUTION):
+def fetch_screenshot(link_dir, link, timeout=TIMEOUT, user_data_dir=CHROME_USER_DATA_DIR, resolution=RESOLUTION):
     """take screenshot of site using chrome --headless"""
 
     if link['type'] in ('PDF', 'image'):
         return {'output': html_appended_url(link)}
 
-    if os.path.exists(os.path.join(out_dir, 'screenshot.png')):
+    if os.path.exists(os.path.join(link_dir, 'screenshot.png')):
         return {'output': 'screenshot.png', 'status': 'skipped'}
 
     CMD = [
-        CHROME_BINARY,
-        *'--headless --disable-gpu --screenshot'.split(' '),
-        *chrome_data_dir_args(user_data_dir),
+        *chrome_headless(user_data_dir=user_data_dir),
+        '--screenshot',
         '--window-size={}'.format(resolution),
         link['url']
     ]
     end = progress(timeout, prefix='      ')
     try:
-        result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=out_dir, timeout=timeout + 1)  # sreenshot.png
+        result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout + 1)  # screenshot.png
         end()
         if result.returncode:
             print('     ', (result.stderr or result.stdout).decode())
             raise Exception('Failed to take screenshot')
-        chmod_file('screenshot.png', cwd=out_dir)
+        chmod_file('screenshot.png', cwd=link_dir)
         output = 'screenshot.png'
     except Exception as e:
         end()
-        print('       Run to see full output:', 'cd {}; {}'.format(out_dir, ' '.join(CMD)))
+        print('       Run to see full output:', 'cd {}; {}'.format(link_dir, ' '.join(CMD)))
         print('       {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
         output = e
@@ -272,10 +274,10 @@ def fetch_screenshot(out_dir, link, timeout=TIMEOUT, user_data_dir=CHROME_USER_D
 
 
 @attach_result_to_link('archive_org')
-def archive_dot_org(out_dir, link, timeout=TIMEOUT):
+def archive_dot_org(link_dir, link, timeout=TIMEOUT):
     """submit site to archive.org for archiving via their service, save returned archive url"""
 
-    path = os.path.join(out_dir, 'archive.org.txt')
+    path = os.path.join(link_dir, 'archive.org.txt')
     if os.path.exists(path):
         archive_org_url = open(path, 'r').read().strip()
         return {'output': archive_org_url, 'status': 'skipped'}
@@ -286,7 +288,7 @@ def archive_dot_org(out_dir, link, timeout=TIMEOUT):
     CMD = ['curl', '-I', submit_url]
     end = progress(timeout, prefix='      ')
     try:
-        result = run(CMD, stdout=PIPE, stderr=DEVNULL, cwd=out_dir, timeout=timeout + 1)  # archive.org.txt
+        result = run(CMD, stdout=PIPE, stderr=DEVNULL, cwd=link_dir, timeout=timeout + 1)  # archive.org.txt
         end()
 
         # Parse archive.org response headers
@@ -313,9 +315,9 @@ def archive_dot_org(out_dir, link, timeout=TIMEOUT):
         output = e
 
     if success:
-        with open(os.path.join(out_dir, 'archive.org.txt'), 'w', encoding='utf-8') as f:
+        with open(os.path.join(link_dir, 'archive.org.txt'), 'w', encoding='utf-8') as f:
             f.write(saved_url)
-        chmod_file('archive.org.txt', cwd=out_dir)
+        chmod_file('archive.org.txt', cwd=link_dir)
         output = saved_url
 
     return {
@@ -324,20 +326,20 @@ def archive_dot_org(out_dir, link, timeout=TIMEOUT):
     }
 
 @attach_result_to_link('favicon')
-def fetch_favicon(out_dir, link, timeout=TIMEOUT):
+def fetch_favicon(link_dir, link, timeout=TIMEOUT):
     """download site favicon from google's favicon api"""
 
-    if os.path.exists(os.path.join(out_dir, 'favicon.ico')):
+    if os.path.exists(os.path.join(link_dir, 'favicon.ico')):
         return {'output': 'favicon.ico', 'status': 'skipped'}
 
     CMD = ['curl', 'https://www.google.com/s2/favicons?domain={domain}'.format(**link)]
-    fout = open('{}/favicon.ico'.format(out_dir), 'w')
+    fout = open('{}/favicon.ico'.format(link_dir), 'w')
     end = progress(timeout, prefix='      ')
     try:
-        run(CMD, stdout=fout, stderr=DEVNULL, cwd=out_dir, timeout=timeout + 1)  # favicon.ico
+        run(CMD, stdout=fout, stderr=DEVNULL, cwd=link_dir, timeout=timeout + 1)  # favicon.ico
         fout.close()
         end()
-        chmod_file('favicon.ico', cwd=out_dir)
+        chmod_file('favicon.ico', cwd=link_dir)
         output = 'favicon.ico'
     except Exception as e:
         fout.close()
@@ -352,14 +354,14 @@ def fetch_favicon(out_dir, link, timeout=TIMEOUT):
     }
 
 # @attach_result_to_link('audio')
-# def fetch_audio(out_dir, link, timeout=TIMEOUT):
+# def fetch_audio(link_dir, link, timeout=TIMEOUT):
 #     """Download audio rip using youtube-dl"""
 
 #     if link['type'] not in ('soundcloud',)\
 #        and 'audio' not in link['tags']:
 #         return
 
-#     path = os.path.join(out_dir, 'audio')
+#     path = os.path.join(link_dir, 'audio')
 
 #     if not os.path.exists(path) or overwrite:
 #         print('    - Downloading audio')
@@ -369,30 +371,30 @@ def fetch_favicon(out_dir, link, timeout=TIMEOUT):
 #         ]
 #         end = progress(timeout, prefix='      ')
 #         try:
-#             result = run(CMD, stdout=DEVNULL, stderr=DEVNULL, cwd=out_dir, timeout=timeout + 1)  # audio/audio.mp3
+#             result = run(CMD, stdout=DEVNULL, stderr=DEVNULL, cwd=link_dir, timeout=timeout + 1)  # audio/audio.mp3
 #             end()
 #             if result.returncode:
 #                 print('     ', result.stderr.decode())
 #                 raise Exception('Failed to download audio')
-#             chmod_file('audio.mp3', cwd=out_dir)
+#             chmod_file('audio.mp3', cwd=link_dir)
 #             return 'audio.mp3'
 #         except Exception as e:
 #             end()
-#             print('       Run to see full output:', 'cd {}; {}'.format(out_dir, ' '.join(CMD)))
+#             print('       Run to see full output:', 'cd {}; {}'.format(link_dir, ' '.join(CMD)))
 #             print('       {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
 #             raise
 #     else:
 #         print('    √ Skipping audio download')
 
 # @attach_result_to_link('video')
-# def fetch_video(out_dir, link, timeout=TIMEOUT):
+# def fetch_video(link_dir, link, timeout=TIMEOUT):
 #     """Download video rip using youtube-dl"""
 
 #     if link['type'] not in ('youtube', 'youku', 'vimeo')\
 #        and 'video' not in link['tags']:
 #         return
 
-#     path = os.path.join(out_dir, 'video')
+#     path = os.path.join(link_dir, 'video')
 
 #     if not os.path.exists(path) or overwrite:
 #         print('    - Downloading video')
@@ -402,26 +404,27 @@ def fetch_favicon(out_dir, link, timeout=TIMEOUT):
 #         ]
 #         end = progress(timeout, prefix='      ')
 #         try:
-#             result = run(CMD, stdout=DEVNULL, stderr=DEVNULL, cwd=out_dir, timeout=timeout + 1)  # video/movie.mp4
+#             result = run(CMD, stdout=DEVNULL, stderr=DEVNULL, cwd=link_dir, timeout=timeout + 1)  # video/movie.mp4
 #             end()
 #             if result.returncode:
 #                 print('     ', result.stderr.decode())
 #                 raise Exception('Failed to download video')
-#             chmod_file('video.mp4', cwd=out_dir)
+#             chmod_file('video.mp4', cwd=link_dir)
 #             return 'video.mp4'
 #         except Exception as e:
 #             end()
-#             print('       Run to see full output:', 'cd {}; {}'.format(out_dir, ' '.join(CMD)))
+#             print('       Run to see full output:', 'cd {}; {}'.format(link_dir, ' '.join(CMD)))
 #             print('       {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
 #             raise
 #     else:
 #         print('    √ Skipping video download')
 
 
-def chrome_data_dir_args(user_data_dir=CHROME_USER_DATA_DIR):
-    default = os.path.expanduser('~/Library/Application Support/Google/Chrome/Default')
+def chrome_headless(binary=CHROME_BINARY, user_data_dir=CHROME_USER_DATA_DIR):
+    args = [binary, '--headless', '--disable-gpu']
+    default_profile = os.path.expanduser('~/Library/Application Support/Google/Chrome/Default')
     if user_data_dir:
-        return ('--user-data-dir={}'.format(user_data_dir),)
-    elif os.path.exists(default):
-        return ('--user-data-dir={}'.format(default),)
-    return ()
+        args.append('--user-data-dir={}'.format(user_data_dir))
+    elif os.path.exists(default_profile):
+        args.append('--user-data-dir={}'.format(default_profile))
+    return args
diff --git a/util.py b/util.py
index f02afdea..d073e9a6 100644
--- a/util.py
+++ b/util.py
@@ -293,8 +293,8 @@ def manually_merge_folders(source, target):
 
     print('    {} and {} have conflicting files, which do you want to keep?'.format(fname(source), fname(target)))
     print('      - [enter]: do nothing (keep both)')
-    print('      - a: keep everything from {}'.format(source))
-    print('      - b: keep everything from {}'.format(target))
+    print('      - a: prefer files from {}'.format(source))
+    print('      - b: prefer files from {}'.format(target))
     print('      - q: quit and resolve the conflict manually')
     try:
         answer = input('> ').strip().lower()
@@ -311,7 +311,7 @@ def manually_merge_folders(source, target):
     files_in_source = set(os.listdir(source))
     files_in_target = set(os.listdir(target))
 
-    for file in files_in_source.intersection(files_in_target):
+    for file in files_in_source:
         if file in files_in_target:
             to_delete = target if answer == 'a' else source
             run(['rm', '-Rf', os.path.join(to_delete, file)])
@@ -320,27 +320,26 @@ def manually_merge_folders(source, target):
     if not set(os.listdir(source)):
         run(['rm', '-Rf', source])
 
-def merge_folders(path, folder, link):
+def fix_folder_path(archive_path, link_folder, link):
     """given a folder, merge it to the canonical 'correct' path for the given link object"""
-    source, target = os.path.join(path, folder), os.path.join(path, link['timestamp'])
+    source = os.path.join(archive_path, link_folder)
+    target = os.path.join(archive_path, link['timestamp'])
 
-    base_url = parse_url(source)
-    if not (base_url in link['base_url']
-            or link['base_url'] in base_url):
+    url_in_folder = parse_url(source)
+    if not (url_in_folder in link['base_url']
+            or link['base_url'] in url_in_folder):
         raise ValueError('The link does not match the url for this folder.')
 
     if not os.path.exists(target):
         # target doesn't exist so nothing needs merging, simply move A to B
-        if run(['mv', source, target]).returncode:
-            print('Failed to move {} to {}!'.format(source, target))
-            return False
+        run(['mv', source, target])
     else:
         # target folder exists, check for conflicting files and attempt manual merge
         files_in_source = set(os.listdir(source))
         files_in_target = set(os.listdir(target))
+        conflicting_files = files_in_source & files_in_target
 
-        if not files_in_source.intersection(files_in_target):
-            # no conflicts, move everything from A to B
+        if not conflicting_files:
             for file in files_in_source:
                 run(['mv', os.path.join(source, file), os.path.join(target, file)])
@@ -352,26 +351,25 @@ def merge_folders(path, folder, link):
 
         run(['rm', '-R', source])
 
-def cleanup_archive(path, links):
+def cleanup_archive(archive_path, links):
     """move any incorrectly named folders to their canonical locations"""
 
     # for each folder that exists, see if we can match it up with a known good link
-    # if we can, then merge the two folders, if not, move it to lost & found
-
-    # for each timestamp, find similar timestamped folders
-    # check each folder for a "domain.com" folder or
+    # if we can, then merge the two folders (TODO: if not, move it to lost & found)
 
     unmatched = []
    bad_folders = []
 
-    if not os.path.exists(path):
+    if not os.path.exists(archive_path):
        return
 
-    for folder in os.listdir(path):
-        if not os.listdir(os.path.join(path, folder)):
-            # delete empty folders
-            run(['rm', '-R', os.path.join(path, folder)])
-        else:
+    for folder in os.listdir(archive_path):
+        try:
+            files = os.listdir(os.path.join(archive_path, folder))
+        except NotADirectoryError:
+            continue
+
+        if files:
             link = find_link(folder, links)
             if link is None:
                 unmatched.append(folder)
@@ -379,11 +377,16 @@ def cleanup_archive(path, links):
             if folder != link['timestamp']:
                 bad_folders.append((folder, link))
+        else:
+            # delete empty folders
+            run(['rm', '-R', os.path.join(archive_path, folder)])
 
-    if bad_folders:
+    if bad_folders and IS_TTY and input('[!] Cleanup archive? y/[n]: ') == 'y':
         print('[!] Fixing {} improperly named folders in archive...'.format(len(bad_folders)))
         for folder, link in bad_folders:
-            merge_folders(path, folder, link)
+            fix_folder_path(archive_path, folder, link)
+    elif bad_folders:
+        print('[!] Warning! {} folders need to be merged, fix by running bookmark archiver.'.format(len(bad_folders)))
 
     if unmatched:
         print('[!] Warning! {} unrecognized folders in html/archive/'.format(len(unmatched)))
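
For reference, the chrome_headless() helper introduced above means the PDF and screenshot steps now shell out to commands shaped roughly like the following sketch; the binary name, profile path, window size, and URL are placeholders standing in for CHROME_BINARY, CHROME_USER_DATA_DIR (or the default profile), RESOLUTION, and link['url']:

    chrome --headless --disable-gpu --user-data-dir="$HOME/Library/Application Support/Google/Chrome/Default" --print-to-pdf 'https://example.com'
    chrome --headless --disable-gpu --user-data-dir="$HOME/Library/Application Support/Google/Chrome/Default" --screenshot --window-size=1440,900 'https://example.com'

Passing --user-data-dir is what enables the "sites that need logins" case from the changelog: headless Chrome reuses the cookies and sessions saved in that profile.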