
refactor: update command is functional

Cristian 2020-12-30 12:25:32 -05:00
parent de3c82730c
commit d92083b928
4 changed files with 91 additions and 99 deletions

View file

@@ -88,7 +88,7 @@ def archive_snapshot(snapshot: Model, overwrite: bool=False, methods: Optional[I
             details = {"history": {}}
             write_snapshot_details(snapshot, out_dir=out_dir, skip_sql_index=False)
         else:
-            details = list(load_snapshot_details(snapshot))
+            details = load_snapshot_details(snapshot)
 
         #log_link_archiving_started(link, out_dir, is_new)
         stats = {'skipped': 0, 'succeeded': 0, 'failed': 0}
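
The only behavioral change in the hunk above is dropping the list() wrapper: load_snapshot_details() is assumed to hand back a single merged details dict rather than an iterator. A minimal, self-contained sketch of that branch, with stand-in helpers rather than the real archivebox functions:

def write_snapshot_details_stub(snapshot, out_dir):
    """Stand-in for write_snapshot_details(); the real one persists index.json."""
    pass

def load_snapshot_details_stub(snapshot):
    """Stand-in for load_snapshot_details(); assumed to return one merged dict."""
    return {"history": {"title": []}}

def details_for(snapshot, out_dir, is_new):
    if is_new:
        details = {"history": {}}                        # fresh snapshot starts with an empty history
        write_snapshot_details_stub(snapshot, out_dir)
    else:
        details = load_snapshot_details_stub(snapshot)   # already a dict, so no list() wrapper
    return details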

View file

@@ -42,6 +42,7 @@ from .html import (
     write_html_snapshot_details,
 )
 from .json import (
+    load_json_snapshot_details,
     parse_json_snapshot_details,
     write_json_snapshot_details,
 )
@@ -318,9 +319,9 @@ def load_snapshot_details(snapshot: Model, out_dir: Optional[str]=None) -> Model
     """check for an existing link archive in the given directory,
     and load+merge it into the given link dict
     """
-    out_dir = out_dir or snapshot.snapshot_dir
-    existing_snapshot = parse_json_snapshot_details(out_dir)
+    out_dir = out_dir or Path(snapshot.snapshot_dir)
+    existing_snapshot = load_json_snapshot_details(out_dir)
     if existing_snapshot:
         return merge_snapshots(existing_snapshot, snapshot)
@@ -379,56 +380,41 @@ def snapshot_filter(snapshots: QuerySet, filter_patterns: List[str], filter_type
     return search_filter(snapshots, filter_patterns, filter_type)
 
-def get_indexed_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+def get_indexed_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Model]]:
     """indexed links without checking archive status or data directory validity"""
-    links = [snapshot.as_link_with_details() for snapshot in snapshots.iterator()]
-    return {
-        link.link_dir: link
-        for link in links
-    }
+    return {snapshot.snapshot_dir: snapshot for snapshot in snapshots}
 
-def get_archived_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+def get_archived_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Model]]:
     """indexed links that are archived with a valid data directory"""
-    links = [snapshot.as_link_with_details() for snapshot in snapshots.iterator()]
-    return {
-        link.link_dir: link
-        for link in filter(is_archived, links)
-    }
+    return {snapshot.snapshot_dir: snapshot for snapshot in filter(is_archived, snapshots)}
 
-def get_unarchived_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+def get_unarchived_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Model]]:
     """indexed links that are unarchived with no data directory or an empty data directory"""
-    links = [snapshot.as_link_with_details() for snapshot in snapshots.iterator()]
-    return {
-        link.link_dir: link
-        for link in filter(is_unarchived, links)
-    }
+    return {snapshot.snapshot_dir: snapshot for snapshot in filter(is_unarchived, snapshots)}
 
-def get_present_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+def get_present_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Model]]:
     """dirs that actually exist in the archive/ folder"""
+    from core.models import Snapshot
 
     all_folders = {}
 
     for entry in (out_dir / ARCHIVE_DIR_NAME).iterdir():
         if entry.is_dir():
-            link = None
+            snapshot = None
             try:
-                link = parse_json_link_details(entry.path)
+                snapshot = parse_json_snapshot_details(entry.path)
             except Exception:
                 pass
 
-            all_folders[entry.name] = link
+            all_folders[entry.name] = snapshot
 
     return all_folders
 
-def get_valid_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+def get_valid_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Model]]:
     """dirs with a valid index matched to the main index and archived content"""
-    links = [snapshot.as_link_with_details() for snapshot in snapshots.iterator()]
-    return {
-        link.link_dir: link
-        for link in filter(is_valid, links)
-    }
+    return {snapshot.snapshot_dir: snapshot for snapshot in filter(is_valid, snapshots)}
 
-def get_invalid_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+def get_invalid_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Model]]:
     """dirs that are invalid for any reason: corrupted/duplicate/orphaned/unrecognized"""
     duplicate = get_duplicate_folders(snapshots, out_dir=OUTPUT_DIR)
     orphaned = get_orphaned_folders(snapshots, out_dir=OUTPUT_DIR)
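
The get_*_folders() helpers above now build their {snapshot_dir: snapshot} maps straight from the Snapshot queryset plus a predicate, instead of first materializing Link objects via as_link_with_details(). A rough, self-contained sketch of the pattern, using a plain list and a dummy class in place of the real queryset and model:

from typing import Dict, Optional

class DummySnapshot:
    """Illustrative stand-in for core.models.Snapshot."""
    def __init__(self, snapshot_dir: str, is_archived: bool):
        self.snapshot_dir = snapshot_dir
        self.is_archived = is_archived

def is_archived(snapshot: DummySnapshot) -> bool:
    # stand-in for the real is_archived() predicate defined later in this file
    return snapshot.is_archived

def get_archived_folders(snapshots) -> Dict[str, Optional[DummySnapshot]]:
    return {s.snapshot_dir: s for s in filter(is_archived, snapshots)}

snaps = [DummySnapshot("archive/1609459200", True), DummySnapshot("archive/1609459300", False)]
assert list(get_archived_folders(snaps)) == ["archive/1609459200"]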
@@ -437,7 +423,7 @@ def get_invalid_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Option
     return {**duplicate, **orphaned, **corrupted, **unrecognized}
 
-def get_duplicate_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+def get_duplicate_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Model]]:
     """dirs that conflict with other directories that have the same link URL or timestamp"""
     by_url = {}
     by_timestamp = {}
@@ -450,91 +436,92 @@ def get_duplicate_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Opti
     )
 
     for path in chain(snapshots.iterator(), data_folders):
-        link = None
+        snapshot = None
         if type(path) is not str:
-            path = path.as_link().link_dir
+            path = path.snapshot_dir
 
         try:
-            link = parse_json_link_details(path)
+            snapshot = parse_json_snapshot_details(path)
         except Exception:
             pass
 
-        if link:
-            # link folder has same timestamp as different link folder
-            by_timestamp[link.timestamp] = by_timestamp.get(link.timestamp, 0) + 1
-            if by_timestamp[link.timestamp] > 1:
-                duplicate_folders[path] = link
+        if snapshot:
+            # snapshot folder has same timestamp as different link folder
+            by_timestamp[snapshot.timestamp] = by_timestamp.get(snapshot.timestamp, 0) + 1
+            if by_timestamp[snapshot.timestamp] > 1:
+                duplicate_folders[path] = snapshot
 
             # link folder has same url as different link folder
-            by_url[link.url] = by_url.get(link.url, 0) + 1
-            if by_url[link.url] > 1:
-                duplicate_folders[path] = link
+            by_url[snapshot.url] = by_url.get(snapshot.url, 0) + 1
+            if by_url[snapshot.url] > 1:
+                duplicate_folders[path] = snapshot
 
     return duplicate_folders
 
-def get_orphaned_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+def get_orphaned_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Model]]:
     """dirs that contain a valid index but aren't listed in the main index"""
     orphaned_folders = {}
 
     for entry in (Path(out_dir) / ARCHIVE_DIR_NAME).iterdir():
         if entry.is_dir():
-            link = None
+            snapshot = None
             try:
-                link = parse_json_link_details(str(entry))
+                snapshot = parse_json_snapshot_details(str(entry))
             except Exception:
                 pass
 
-            if link and not snapshots.filter(timestamp=entry.name).exists():
+            if snapshot and not snapshots.filter(timestamp=entry.name).exists():
                 # folder is a valid link data dir with index details, but it's not in the main index
-                orphaned_folders[str(entry)] = link
+                orphaned_folders[str(entry)] = snapshot
 
     return orphaned_folders
 
-def get_corrupted_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+def get_corrupted_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Model]]:
     """dirs that don't contain a valid index and aren't listed in the main index"""
     corrupted = {}
     for snapshot in snapshots.iterator():
-        link = snapshot.as_link()
-        if is_corrupt(link):
-            corrupted[link.link_dir] = link
+        if is_corrupt(snapshot):
+            corrupted[snapshot.snapshot_dir] = snapshot
     return corrupted
 
-def get_unrecognized_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+def get_unrecognized_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Model]]:
     """dirs that don't contain recognizable archive data and aren't listed in the main index"""
-    unrecognized_folders: Dict[str, Optional[Link]] = {}
+    unrecognized_folders: Dict[str, Optional[Model]] = {}
 
     for entry in (Path(out_dir) / ARCHIVE_DIR_NAME).iterdir():
         if entry.is_dir():
             index_exists = (entry / "index.json").exists()
-            link = None
+            snapshot = None
             try:
-                link = parse_json_link_details(str(entry))
+                snapshot = parse_json_snapshot_details(str(entry))
             except KeyError:
                 # Try to fix index
                 if index_exists:
-                    try:
+                    pass
+                    # TODO: Implement the `guess` bit for snapshots
+                    # try:
                         # Last attempt to repair the detail index
-                        link_guessed = parse_json_link_details(str(entry), guess=True)
-                        write_json_link_details(link_guessed, out_dir=str(entry))
-                        link = parse_json_link_details(str(entry))
-                    except Exception:
-                        pass
+                    #     link_guessed = parse_json_snapshot_details(str(entry), guess=True)
+                    #     write_json_snapshot_details(link_guessed, out_dir=str(entry))
+                    #     link = parse_json_link_details(str(entry))
+                    # except Exception:
+                    #     pass
 
-            if index_exists and link is None:
+            if index_exists and snapshot is None:
                 # index exists but it's corrupted or unparseable
-                unrecognized_folders[str(entry)] = link
+                unrecognized_folders[str(entry)] = snapshot
 
             elif not index_exists:
                 # link details index doesn't exist and the folder isn't in the main index
                 timestamp = entry.name
                 if not snapshots.filter(timestamp=timestamp).exists():
-                    unrecognized_folders[str(entry)] = link
+                    unrecognized_folders[str(entry)] = snapshot
 
     return unrecognized_folders
 
-def is_valid(link: Link) -> bool:
-    dir_exists = Path(link.link_dir).exists()
-    index_exists = (Path(link.link_dir) / "index.json").exists()
+def is_valid(snapshot: Model) -> bool:
+    dir_exists = Path(snapshot.snapshot_dir).exists()
+    index_exists = (Path(snapshot.snapshot_dir) / "index.json").exists()
     if not dir_exists:
         # unarchived links are not included in the valid list
         return False
@@ -542,29 +529,30 @@ def is_valid(link: Link) -> bool:
         return False
     if dir_exists and index_exists:
         try:
-            parsed_link = parse_json_link_details(link.link_dir, guess=True)
-            return link.url == parsed_link.url
+            # TODO: review if the `guess` was necessary here
+            parsed_snapshot = parse_json_snapshot_details(snapshot.snapshot_dir)
+            return snapshot.url == parsed_snapshot.url
         except Exception:
             pass
     return False
 
-def is_corrupt(link: Link) -> bool:
-    if not Path(link.link_dir).exists():
+def is_corrupt(snapshot: Model) -> bool:
+    if not Path(snapshot.snapshot_dir).exists():
         # unarchived links are not considered corrupt
         return False
 
-    if is_valid(link):
+    if is_valid(snapshot):
         return False
 
     return True
 
-def is_archived(link: Link) -> bool:
-    return is_valid(link) and link.is_archived
+def is_archived(snapshot: Model) -> bool:
+    return is_valid(snapshot) and snapshot.is_archived
 
-def is_unarchived(link: Link) -> bool:
-    if not Path(link.link_dir).exists():
+def is_unarchived(snapshot: Model) -> bool:
+    if not Path(snapshot.snapshot_dir).exists():
         return True
-    return not link.is_archived
+    return not snapshot.is_archived
 
 def fix_invalid_folder_locations(out_dir: Path=OUTPUT_DIR) -> Tuple[List[str], List[str]]:
@@ -574,22 +562,22 @@ def fix_invalid_folder_locations(out_dir: Path=OUTPUT_DIR) -> Tuple[List[str], L
         if entry.is_dir(follow_symlinks=True):
             if (Path(entry.path) / 'index.json').exists():
                 try:
-                    link = parse_json_link_details(entry.path)
+                    snapshot = parse_json_snapshot_details(entry.path)
                 except KeyError:
-                    link = None
-                if not link:
+                    snapshot = None
+                if not snapshot:
                     continue
 
                 if not entry.path.endswith(f'/{link.timestamp}'):
-                    dest = out_dir / ARCHIVE_DIR_NAME / link.timestamp
+                    dest = out_dir / ARCHIVE_DIR_NAME / snapshot.timestamp
                     if dest.exists():
                         cant_fix.append(entry.path)
                     else:
                         shutil.move(entry.path, dest)
                         fixed.append(dest)
                         timestamp = entry.path.rsplit('/', 1)[-1]
-                        assert link.link_dir == entry.path
-                        assert link.timestamp == timestamp
-                        write_json_link_details(link, out_dir=entry.path)
+                        assert snapshot.snapshot_dir == entry.path
+                        assert snapshot.timestamp == timestamp
+                        write_json_snapshot_details(snapshot, out_dir=entry.path)
     return fixed, cant_fix
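
As the predicates above show, a snapshot directory only counts as valid when it exists, holds an index.json, and the URL recorded there matches the snapshot's own URL. A self-contained sketch of that check (FakeSnapshot and the direct json read are illustrative; the real code goes through parse_json_snapshot_details()):

import json
import tempfile
from pathlib import Path

class FakeSnapshot:
    """Illustrative stand-in for a Snapshot model instance."""
    def __init__(self, url: str, snapshot_dir: str):
        self.url = url
        self.snapshot_dir = snapshot_dir

def is_valid(snapshot: FakeSnapshot) -> bool:
    snap_dir = Path(snapshot.snapshot_dir)
    index = snap_dir / "index.json"
    if not (snap_dir.exists() and index.exists()):
        return False                      # missing dir or detail index -> not valid
    try:
        return json.loads(index.read_text())["url"] == snapshot.url
    except Exception:
        return False                      # unparseable index -> not valid

with tempfile.TemporaryDirectory() as d:
    (Path(d) / "index.json").write_text(json.dumps({"url": "https://example.com"}))
    assert is_valid(FakeSnapshot("https://example.com", d))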

View file

@@ -91,7 +91,7 @@ def write_json_snapshot_details(snapshot: Model, out_dir: Optional[str]=None) ->
 
 @enforce_types
-def load_snapshot_details(out_Dir: Path) -> Optional[Model]:
+def load_json_snapshot_details(out_dir: Path) -> Optional[Model]:
     """
     Loads the detail from the local json index
     """
@@ -99,7 +99,10 @@ def load_snapshot_details(out_Dir: Path) -> Optional[Model]:
     if existing_index.exists():
         with open(existing_index, 'r', encoding='utf-8') as f:
             try:
-                return pyjson.load(f)
+                output = pyjson.load(f)
+                if "history" not in output.keys():
+                    output["history"] = {}
+                return output
             except pyjson.JSONDecodeError:
                 pass
     return None
@@ -109,7 +112,7 @@ def load_snapshot_details(out_Dir: Path) -> Optional[Model]:
 
 def parse_json_snapshot_details(out_dir: Union[Path, str]) -> Iterator[dict]:
     """read through all the archive data folders and return the parsed links"""
-    for entry in os.scandir(Path(out_dir) / ARCHIVE_DIR_NAME):
+    for entry in os.scandir(Path(out_dir)):
         if entry.is_dir(follow_symlinks=True):
             if (Path(entry.path) / 'index.json').exists():
                 try:
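
The renamed load_json_snapshot_details() now also backfills a "history" key so callers can rely on it being present. A minimal sketch of the same behavior using the standard json module (the index filename constant and error handling mirror the hunk above; the body is illustrative, not the actual archivebox implementation):

import json
from pathlib import Path
from typing import Optional

JSON_INDEX_FILENAME = "index.json"   # assumed name of the per-snapshot detail index

def load_json_snapshot_details(out_dir: Path) -> Optional[dict]:
    """Parse <out_dir>/index.json, guaranteeing a "history" section in the result."""
    existing_index = Path(out_dir) / JSON_INDEX_FILENAME
    if existing_index.exists():
        try:
            output = json.loads(existing_index.read_text(encoding='utf-8'))
            if "history" not in output:
                output["history"] = {}   # older indexes may predate the history field
            return output
        except json.JSONDecodeError:
            pass
    return None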

View file

@@ -9,7 +9,7 @@ from datetime import date
 from typing import Dict, List, Optional, Iterable, IO, Union
 
 from crontab import CronTab, CronSlices
-from django.db.models import QuerySet
+from django.db.models import QuerySet, Model
 
 from .cli import (
     list_subcommands,
@@ -689,15 +689,16 @@ def update(resume: Optional[float]=None,
            extractors: str="",
            out_dir: Path=OUTPUT_DIR) -> List[Link]:
     """Import any new links from subscriptions and retry any previously failed/skipped links"""
+    from core.models import Snapshot
 
     check_data_folder(out_dir=out_dir)
     check_dependencies()
-    new_links: List[Link] = [] # TODO: Remove input argument: only_new
+    new_links: List[Snapshot] = [] # TODO: Remove input argument: only_new
 
     extractors = extractors.split(",") if extractors else []
 
     # Step 1: Filter for selected_links
-    matching_snapshots = list_links(
+    matching_snapshots = list_snapshots(
         filter_patterns=filter_patterns,
         filter_type=filter_type,
         before=before,
@@ -705,15 +706,15 @@ def update(resume: Optional[float]=None,
     )
 
     matching_folders = list_folders(
-        links=matching_snapshots,
+        snapshots=matching_snapshots,
         status=status,
         out_dir=out_dir,
     )
     all_links = [link for link in matching_folders.values() if link]
 
     if index_only:
-        for link in all_links:
-            write_snapshot_details(link, out_dir=out_dir, skip_sql_index=True)
+        for snapshot in all_snapshots:
+            write_snapshot_details(snapshot, out_dir=out_dir, skip_sql_index=True)
         index_links(all_links, out_dir=out_dir)
         return all_links
@@ -797,7 +798,7 @@ def list_all(filter_patterns_str: Optional[str]=None,
 
 @enforce_types
-def list_links(snapshots: Optional[QuerySet]=None,
+def list_snapshots(snapshots: Optional[QuerySet]=None,
                filter_patterns: Optional[List[str]]=None,
                filter_type: str='exact',
                after: Optional[float]=None,
@@ -820,9 +821,9 @@ def list_links(snapshots: Optional[QuerySet]=None,
     return all_snapshots
 
 @enforce_types
-def list_folders(links: List[Link],
+def list_folders(snapshots: List[Model],
                  status: str,
-                 out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+                 out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Model]]:
 
     check_data_folder(out_dir=out_dir)
@@ -840,7 +841,7 @@ def list_folders(links: List[Link],
     }
     try:
-        return STATUS_FUNCTIONS[status](links, out_dir=out_dir)
+        return STATUS_FUNCTIONS[status](snapshots, out_dir=out_dir)
     except KeyError:
         raise ValueError('Status not recognized.')
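
The update() path now works queryset-first: list_snapshots() filters the Snapshot queryset, list_folders() groups the results by status, and the status name is dispatched through a STATUS_FUNCTIONS table, with an unknown status surfaced as ValueError. A rough, self-contained sketch of that dispatch shape (the two lambdas stand in for the real get_indexed_folders()/get_archived_folders(); only the lookup-and-raise structure matches the diff):

from typing import Callable, Dict

STATUS_FUNCTIONS: Dict[str, Callable] = {
    'indexed':  lambda snapshots, out_dir: {s.snapshot_dir: s for s in snapshots},
    'archived': lambda snapshots, out_dir: {s.snapshot_dir: s for s in snapshots if s.is_archived},
}

def list_folders(snapshots, status: str, out_dir: str = '.'):
    try:
        # unknown status names raise KeyError here and are re-raised as ValueError
        return STATUS_FUNCTIONS[status](snapshots, out_dir=out_dir)
    except KeyError:
        raise ValueError('Status not recognized.')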