feat: Refactor get_invalid_folders to work with a queryset instead of a list of links

Authored by Cristian on 2020-08-20 14:34:49 -05:00; committed by Cristian Vargas
parent dae606de6e
commit 404f333e17
3 changed files with 50 additions and 45 deletions
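
In practice the change means callers hand get_invalid_folders() a Django queryset of Snapshot rows instead of a pre-built list of Link objects; the helpers then walk it lazily with .iterator() and probe the index with .filter(...).exists(), as the hunks below show. A minimal sketch of the new calling convention, assuming the helpers live in archivebox.index and the Snapshot model in core.models (import paths and setup are illustrative, not part of this commit):

    # Sketch only: the import paths and the Snapshot queryset are assumptions
    # based on where these helpers appear to live, not code from this commit.
    from archivebox.index import get_invalid_folders   # assumed module path
    from core.models import Snapshot                    # assumed Django model

    def report_invalid_dirs(out_dir: str) -> None:
        snapshots = Snapshot.objects.all()              # lazy queryset, not a list
        # Before this commit, the equivalent call materialized every row first:
        #   invalid = get_invalid_folders([s.as_link() for s in snapshots], out_dir=out_dir)
        invalid = get_invalid_folders(snapshots, out_dir=out_dir)
        for folder, link in invalid.items():
            print(f'invalid folder: {folder} ({link})')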


@@ -54,7 +54,7 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
     oneshot(
         url=stdin_url or url,
-        out_dir=str(Path(command.out_dir).absolute()),
+        out_dir=str(Path(command.out_dir).resolve()),
     )
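
The only change in the hunk above swaps Path.absolute() for Path.resolve(). That is standard pathlib behavior rather than anything ArchiveBox-specific: absolute() merely prepends the current working directory, while resolve() also collapses '..' segments and follows symlinks, yielding a canonical path. A quick illustration (the paths shown are hypothetical):

    from pathlib import Path

    p = Path('archive/../archive')
    print(p.absolute())   # e.g. /home/user/archive/../archive  (not normalized)
    print(p.resolve())    # e.g. /home/user/archive             (normalized, symlinks followed)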


@@ -4,6 +4,7 @@ import re
 import os
 import shutil
 import json as pyjson
+from pathlib import Path
 from itertools import chain
 from typing import List, Tuple, Dict, Optional, Iterable
@@ -418,28 +419,32 @@ def get_valid_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
         for link in filter(is_valid, links)
     }

-def get_invalid_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+def get_invalid_folders(snapshots, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
     """dirs that are invalid for any reason: corrupted/duplicate/orphaned/unrecognized"""
-    duplicate = get_duplicate_folders(links, out_dir=OUTPUT_DIR)
-    orphaned = get_orphaned_folders(links, out_dir=OUTPUT_DIR)
-    corrupted = get_corrupted_folders(links, out_dir=OUTPUT_DIR)
-    unrecognized = get_unrecognized_folders(links, out_dir=OUTPUT_DIR)
+    duplicate = get_duplicate_folders(snapshots, out_dir=OUTPUT_DIR)
+    orphaned = get_orphaned_folders(snapshots, out_dir=OUTPUT_DIR)
+    corrupted = get_corrupted_folders(snapshots, out_dir=OUTPUT_DIR)
+    unrecognized = get_unrecognized_folders(snapshots, out_dir=OUTPUT_DIR)
     return {**duplicate, **orphaned, **corrupted, **unrecognized}

-def get_duplicate_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+def get_duplicate_folders(snapshots, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
     """dirs that conflict with other directories that have the same link URL or timestamp"""
-    links = list(links)
-    by_url = {link.url: 0 for link in links}
-    by_timestamp = {link.timestamp: 0 for link in links}
+    by_url = {}
+    by_timestamp = {}
+    indexed_folders = set()
+
+    for snapshot in snapshots.iterator():
+        link = snapshot.as_link()
+        by_url[link.url] = 0
+        by_timestamp[link.timestamp] = 0
+        indexed_folders.update([link.link_dir])

     duplicate_folders = {}
-    indexed_folders = {link.link_dir for link in links}
     data_folders = (
         entry.path
-        for entry in os.scandir(os.path.join(out_dir, ARCHIVE_DIR_NAME))
-        if entry.is_dir(follow_symlinks=True) and entry.path not in indexed_folders
+        for entry in (Path(out_dir) / ARCHIVE_DIR_NAME).iterdir()
+        if entry.is_dir() and str(entry) not in indexed_folders
     )

     for path in chain(sorted(indexed_folders), sorted(data_folders)):
@@ -462,71 +467,70 @@ def get_duplicate_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
     return duplicate_folders

-def get_orphaned_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+def get_orphaned_folders(snapshots, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
     """dirs that contain a valid index but aren't listed in the main index"""
-    links = list(links)
-    indexed_folders = {link.link_dir: link for link in links}
     orphaned_folders = {}

-    for entry in os.scandir(os.path.join(out_dir, ARCHIVE_DIR_NAME)):
-        if entry.is_dir(follow_symlinks=True):
+    for entry in (Path(out_dir) / ARCHIVE_DIR_NAME).iterdir():
+        if entry.is_dir():
             link = None
             try:
-                link = parse_json_link_details(entry.path)
+                link = parse_json_link_details(str(entry))
             except Exception:
                 pass

-            if link and entry.path not in indexed_folders:
+            if link and not snapshots.filter(timestamp=entry.name).exists():
                 # folder is a valid link data dir with index details, but it's not in the main index
-                orphaned_folders[entry.path] = link
+                orphaned_folders[str(entry)] = link

     return orphaned_folders

-def get_corrupted_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+def get_corrupted_folders(snapshots, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
     """dirs that don't contain a valid index and aren't listed in the main index"""
-    return {
-        link.link_dir: link
-        for link in filter(is_corrupt, links)
-    }
+    corrupted = {}
+    for snapshot in snapshots.iterator():
+        link = snapshot.as_link()
+        if is_corrupt(link):
+            corrupted[link.link_dir] = link
+    return corrupted

-def get_unrecognized_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+def get_unrecognized_folders(snapshots, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
     """dirs that don't contain recognizable archive data and aren't listed in the main index"""
-    by_timestamp = {link.timestamp: 0 for link in links}
     unrecognized_folders: Dict[str, Optional[Link]] = {}

-    for entry in os.scandir(os.path.join(out_dir, ARCHIVE_DIR_NAME)):
-        if entry.is_dir(follow_symlinks=True):
-            index_exists = os.path.exists(os.path.join(entry.path, 'index.json'))
+    for entry in (Path(out_dir) / ARCHIVE_DIR_NAME).iterdir():
+        if entry.is_dir():
+            index_exists = (entry / "index.json").exists()
             link = None
             try:
-                link = parse_json_link_details(entry.path)
+                link = parse_json_link_details(str(entry))
             except KeyError:
                 # Try to fix index
                 if index_exists:
                     try:
                         # Last attempt to repair the detail index
-                        link_guessed = parse_json_link_details(entry.path, guess=True)
-                        write_json_link_details(link_guessed, out_dir=entry.path)
-                        link = parse_json_link_details(entry.path)
+                        link_guessed = parse_json_link_details(str(entry), guess=True)
+                        write_json_link_details(link_guessed, out_dir=str(entry))
+                        link = parse_json_link_details(str(entry))
                     except Exception:
                         pass

             if index_exists and link is None:
                 # index exists but it's corrupted or unparseable
-                unrecognized_folders[entry.path] = link
+                unrecognized_folders[str(entry)] = link
             elif not index_exists:
                 # link details index doesn't exist and the folder isn't in the main index
-                timestamp = entry.path.rsplit('/', 1)[-1]
-                if timestamp not in by_timestamp:
-                    unrecognized_folders[entry.path] = link
+                timestamp = entry.name
+                if not snapshots.filter(timestamp=timestamp).exists():
+                    unrecognized_folders[str(entry)] = link

     return unrecognized_folders

 def is_valid(link: Link) -> bool:
-    dir_exists = os.path.exists(link.link_dir)
-    index_exists = os.path.exists(os.path.join(link.link_dir, 'index.json'))
+    dir_exists = Path(link.link_dir).exists()
+    index_exists = (Path(link.link_dir) / "index.json").exists()
     if not dir_exists:
         # unarchived links are not included in the valid list
         return False
@@ -541,7 +545,7 @@ def is_valid(link: Link) -> bool:
         return False

 def is_corrupt(link: Link) -> bool:
-    if not os.path.exists(link.link_dir):
+    if not Path(link.link_dir).exists():
         # unarchived links are not considered corrupt
         return False
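
A pattern worth noting across the hunks above: the in-memory lookups that required materializing every link up front (by_timestamp, indexed_folders) give way to per-folder database probes of the form snapshots.filter(timestamp=entry.name).exists(). A minimal sketch of that membership test, assuming snapshots is a Django queryset over a Snapshot model with a timestamp field, as the diff implies:

    # Sketch only, not ArchiveBox code: the queryset and field names are
    # assumptions taken from the usage shown in the diff above.
    from pathlib import Path

    def folder_is_indexed(snapshots, entry: Path) -> bool:
        # Each archive/<timestamp> folder is named after its snapshot, so
        # membership in the main index reduces to a single EXISTS query.
        return snapshots.filter(timestamp=entry.name).exists()

The trade-off is memory for round-trips: nothing is loaded into Python up front, but each directory now costs one query; whether the timestamp column is indexed is not shown in this diff.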


@@ -352,9 +352,10 @@ def init(force: bool=False, out_dir: str=OUTPUT_DIR) -> None:
        print('    {lightyellow}√ Added {} orphaned links from existing archive directories.{reset}'.format(len(orphaned_data_dir_links), **ANSI))

     # Links in invalid/duplicate data dirs
-    invalid_folders: Dict[str, Link] = {}
-    for link in all_links:
-        invalid_folders.update(get_invalid_folders([link.as_link()], out_dir=out_dir).items())
+    invalid_folders = {
+        folder: link
+        for folder, link in get_invalid_folders(all_links, out_dir=out_dir).items()
+    }
     if invalid_folders:
         print('    {lightyellow}! Skipped adding {} invalid link data directories.{reset}'.format(len(invalid_folders), **ANSI))
         print('        X ' + '\n        X '.join(f'{folder} {link}' for folder, link in invalid_folders.items()))