
refactor: update command is functional

Cristian 2020-12-30 12:25:32 -05:00
parent de3c82730c
commit d92083b928
4 changed files with 91 additions and 99 deletions
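The commit title says the update command is functional again after the Link to Snapshot model refactor. A minimal sketch of how that entry point might be exercised, assuming the update() signature shown in the last file of this diff and that it is importable from archivebox.main (the filter values below are purely illustrative):

    # illustrative only: drives the refactored update() with the parameters
    # visible in the hunks below; assumes an initialized ArchiveBox data folder
    from pathlib import Path
    from archivebox.main import update

    update(
        filter_patterns=['https://example.com'],   # hypothetical pattern
        filter_type='exact',
        status='unarchived',                       # assumed key into the folder-status helpers below
        out_dir=Path('.'),
    )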

View file

@@ -88,7 +88,7 @@ def archive_snapshot(snapshot: Model, overwrite: bool=False, methods: Optional[I
details = {"history": {}}
write_snapshot_details(snapshot, out_dir=out_dir, skip_sql_index=False)
else:
details = list(load_snapshot_details(snapshot))
details = load_snapshot_details(snapshot)
#log_link_archiving_started(link, out_dir, is_new)
stats = {'skipped': 0, 'succeeded': 0, 'failed': 0}

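The list(...) wrapper is dropped above because load_snapshot_details now returns a single merged snapshot rather than an iterator (see the index changes in the next file). A minimal sketch of the resulting call pattern, assuming a saved Snapshot row and that the helper is exported from archivebox.index as in the upstream layout:

    # sketch only: mirrors the else-branch above for an already-archived snapshot
    from archivebox.index import load_snapshot_details
    from core.models import Snapshot

    snapshot = Snapshot.objects.get(url='https://example.com')   # hypothetical existing row
    details = load_snapshot_details(snapshot)   # merged with any on-disk index.json; no list() needed
    stats = {'skipped': 0, 'succeeded': 0, 'failed': 0}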
View file

@@ -42,6 +42,7 @@ from .html import (
write_html_snapshot_details,
)
from .json import (
load_json_snapshot_details,
parse_json_snapshot_details,
write_json_snapshot_details,
)
@@ -318,9 +319,9 @@ def load_snapshot_details(snapshot: Model, out_dir: Optional[str]=None) -> Model
"""check for an existing link archive in the given directory,
and load+merge it into the given link dict
"""
out_dir = out_dir or snapshot.snapshot_dir
out_dir = out_dir or Path(snapshot.snapshot_dir)
existing_snapshot = parse_json_snapshot_details(out_dir)
existing_snapshot = load_json_snapshot_details(out_dir)
if existing_snapshot:
return merge_snapshots(existing_snapshot, snapshot)
@@ -379,56 +380,41 @@ def snapshot_filter(snapshots: QuerySet, filter_patterns: List[str], filter_type
return search_filter(snapshots, filter_patterns, filter_type)
def get_indexed_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
def get_indexed_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Model]]:
"""indexed links without checking archive status or data directory validity"""
links = [snapshot.as_link_with_details() for snapshot in snapshots.iterator()]
return {
link.link_dir: link
for link in links
}
return {snapshot.snapshot_dir: snapshot for snapshot in snapshots}
def get_archived_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
def get_archived_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Model]]:
"""indexed links that are archived with a valid data directory"""
links = [snapshot.as_link_with_details() for snapshot in snapshots.iterator()]
return {
link.link_dir: link
for link in filter(is_archived, links)
}
return {snapshot.snapshot_dir: snapshot for snapshot in filter(is_archived, snapshots)}
def get_unarchived_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
def get_unarchived_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Model]]:
"""indexed links that are unarchived with no data directory or an empty data directory"""
links = [snapshot.as_link_with_details() for snapshot in snapshots.iterator()]
return {
link.link_dir: link
for link in filter(is_unarchived, links)
}
return {snapshot.snapshot_dir: snapshot for snapshot in filter(is_unarchived, snapshots)}
def get_present_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
def get_present_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Model]]:
"""dirs that actually exist in the archive/ folder"""
from core.models import Snapshot
all_folders = {}
for entry in (out_dir / ARCHIVE_DIR_NAME).iterdir():
if entry.is_dir():
link = None
snapshot = None
try:
link = parse_json_link_details(entry.path)
snapshot = parse_json_snapshot_details(entry.path)
except Exception:
pass
all_folders[entry.name] = link
all_folders[entry.name] = snapshot
return all_folders
def get_valid_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
def get_valid_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Model]]:
"""dirs with a valid index matched to the main index and archived content"""
links = [snapshot.as_link_with_details() for snapshot in snapshots.iterator()]
return {
link.link_dir: link
for link in filter(is_valid, links)
}
return {snapshot.snapshot_dir: snapshot for snapshot in filter(is_valid, snapshots)}
def get_invalid_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
def get_invalid_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Model]]:
"""dirs that are invalid for any reason: corrupted/duplicate/orphaned/unrecognized"""
duplicate = get_duplicate_folders(snapshots, out_dir=OUTPUT_DIR)
orphaned = get_orphaned_folders(snapshots, out_dir=OUTPUT_DIR)
@@ -437,7 +423,7 @@ def get_invalid_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Option
return {**duplicate, **orphaned, **corrupted, **unrecognized}
def get_duplicate_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
def get_duplicate_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Model]]:
"""dirs that conflict with other directories that have the same link URL or timestamp"""
by_url = {}
by_timestamp = {}
@@ -450,91 +436,92 @@ def get_duplicate_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Opti
)
for path in chain(snapshots.iterator(), data_folders):
link = None
snapshot = None
if type(path) is not str:
path = path.as_link().link_dir
path = path.snapshot_dir
try:
link = parse_json_link_details(path)
snapshot = parse_json_snapshot_details(path)
except Exception:
pass
if link:
# link folder has same timestamp as different link folder
by_timestamp[link.timestamp] = by_timestamp.get(link.timestamp, 0) + 1
if by_timestamp[link.timestamp] > 1:
duplicate_folders[path] = link
if snapshot:
# snapshot folder has same timestamp as different link folder
by_timestamp[snapshot.timestamp] = by_timestamp.get(snapshot.timestamp, 0) + 1
if by_timestamp[snapshot.timestamp] > 1:
duplicate_folders[path] = snapshot
# link folder has same url as different link folder
by_url[link.url] = by_url.get(link.url, 0) + 1
if by_url[link.url] > 1:
duplicate_folders[path] = link
by_url[snapshot.url] = by_url.get(snapshot.url, 0) + 1
if by_url[snapshot.url] > 1:
duplicate_folders[path] = snapshot
return duplicate_folders
def get_orphaned_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
def get_orphaned_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Model]]:
"""dirs that contain a valid index but aren't listed in the main index"""
orphaned_folders = {}
for entry in (Path(out_dir) / ARCHIVE_DIR_NAME).iterdir():
if entry.is_dir():
link = None
snapshot = None
try:
link = parse_json_link_details(str(entry))
snapshot = parse_json_snapshot_details(str(entry))
except Exception:
pass
if link and not snapshots.filter(timestamp=entry.name).exists():
if snapshot and not snapshots.filter(timestamp=entry.name).exists():
# folder is a valid link data dir with index details, but it's not in the main index
orphaned_folders[str(entry)] = link
orphaned_folders[str(entry)] = snapshot
return orphaned_folders
def get_corrupted_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
def get_corrupted_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Model]]:
"""dirs that don't contain a valid index and aren't listed in the main index"""
corrupted = {}
for snapshot in snapshots.iterator():
link = snapshot.as_link()
if is_corrupt(link):
corrupted[link.link_dir] = link
if is_corrupt(snapshot):
corrupted[snapshot.snapshot_dir] = snapshot
return corrupted
def get_unrecognized_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
def get_unrecognized_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Model]]:
"""dirs that don't contain recognizable archive data and aren't listed in the main index"""
unrecognized_folders: Dict[str, Optional[Link]] = {}
unrecognized_folders: Dict[str, Optional[Model]] = {}
for entry in (Path(out_dir) / ARCHIVE_DIR_NAME).iterdir():
if entry.is_dir():
index_exists = (entry / "index.json").exists()
link = None
snapshot = None
try:
link = parse_json_link_details(str(entry))
snapshot = parse_json_snapshot_details(str(entry))
except KeyError:
# Try to fix index
if index_exists:
try:
pass
# TODO: Implement the `guess` bit for snapshots
# try:
# Last attempt to repair the detail index
link_guessed = parse_json_link_details(str(entry), guess=True)
write_json_link_details(link_guessed, out_dir=str(entry))
link = parse_json_link_details(str(entry))
except Exception:
pass
# link_guessed = parse_json_snapshot_details(str(entry), guess=True)
# write_json_snapshot_details(link_guessed, out_dir=str(entry))
# link = parse_json_link_details(str(entry))
# except Exception:
# pass
if index_exists and link is None:
if index_exists and snapshot is None:
# index exists but it's corrupted or unparseable
unrecognized_folders[str(entry)] = link
unrecognized_folders[str(entry)] = snapshot
elif not index_exists:
# link details index doesn't exist and the folder isn't in the main index
timestamp = entry.name
if not snapshots.filter(timestamp=timestamp).exists():
unrecognized_folders[str(entry)] = link
unrecognized_folders[str(entry)] = snapshot
return unrecognized_folders
def is_valid(link: Link) -> bool:
dir_exists = Path(link.link_dir).exists()
index_exists = (Path(link.link_dir) / "index.json").exists()
def is_valid(snapshot: Model) -> bool:
dir_exists = Path(snapshot.snapshot_dir).exists()
index_exists = (Path(snapshot.snapshot_dir) / "index.json").exists()
if not dir_exists:
# unarchived links are not included in the valid list
return False
@@ -542,29 +529,30 @@ def is_valid(link: Link) -> bool:
return False
if dir_exists and index_exists:
try:
parsed_link = parse_json_link_details(link.link_dir, guess=True)
return link.url == parsed_link.url
# TODO: review if the `guess` was necessary here
parsed_snapshot = parse_json_snapshot_details(snapshot.snapshot_dir)
return snapshot.url == parsed_snapshot.url
except Exception:
pass
return False
def is_corrupt(link: Link) -> bool:
if not Path(link.link_dir).exists():
def is_corrupt(snapshot: Model) -> bool:
if not Path(snapshot.snapshot_dir).exists():
# unarchived links are not considered corrupt
return False
if is_valid(link):
if is_valid(snapshot):
return False
return True
def is_archived(link: Link) -> bool:
return is_valid(link) and link.is_archived
def is_archived(snapshot: Model) -> bool:
return is_valid(snapshot) and snapshot.is_archived
def is_unarchived(link: Link) -> bool:
if not Path(link.link_dir).exists():
def is_unarchived(snapshot: Model) -> bool:
if not Path(snapshot.snapshot_dir).exists():
return True
return not link.is_archived
return not snapshot.is_archived
def fix_invalid_folder_locations(out_dir: Path=OUTPUT_DIR) -> Tuple[List[str], List[str]]:
@@ -574,22 +562,22 @@ def fix_invalid_folder_locations(out_dir: Path=OUTPUT_DIR) -> Tuple[List[str], L
if entry.is_dir(follow_symlinks=True):
if (Path(entry.path) / 'index.json').exists():
try:
link = parse_json_link_details(entry.path)
snapshot = parse_json_snapshot_details(entry.path)
except KeyError:
link = None
if not link:
snapshot = None
if not snapshot:
continue
if not entry.path.endswith(f'/{link.timestamp}'):
dest = out_dir / ARCHIVE_DIR_NAME / link.timestamp
dest = out_dir / ARCHIVE_DIR_NAME / snapshot.timestamp
if dest.exists():
cant_fix.append(entry.path)
else:
shutil.move(entry.path, dest)
fixed.append(dest)
timestamp = entry.path.rsplit('/', 1)[-1]
assert link.link_dir == entry.path
assert link.timestamp == timestamp
write_json_link_details(link, out_dir=entry.path)
assert snapshot.snapshot_dir == entry.path
assert snapshot.timestamp == timestamp
write_json_snapshot_details(snapshot, out_dir=entry.path)
return fixed, cant_fix

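The bulk of this file swaps the Link-based folder helpers for Snapshot-based ones: each get_*_folders helper now returns a dict keyed by snapshot.snapshot_dir instead of link.link_dir, and the is_valid / is_archived / is_corrupt predicates operate on the Snapshot model directly. A minimal sketch of how the helpers are consumed, assuming they are exported from archivebox.index as in the upstream layout:

    # sketch only: the dict shape returned by the refactored folder-status helpers
    from pathlib import Path
    from archivebox.index import get_indexed_folders, get_archived_folders
    from core.models import Snapshot

    snapshots = Snapshot.objects.all()
    out_dir = Path('.')                                         # assumes an initialized data folder

    indexed = get_indexed_folders(snapshots, out_dir=out_dir)   # {snapshot_dir: Snapshot}
    archived = get_archived_folders(snapshots, out_dir=out_dir)

    for folder, snapshot in archived.items():
        print(folder, snapshot.url)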
View file

@@ -91,7 +91,7 @@ def write_json_snapshot_details(snapshot: Model, out_dir: Optional[str]=None) ->
@enforce_types
def load_snapshot_details(out_Dir: Path) -> Optional[Model]:
def load_json_snapshot_details(out_dir: Path) -> Optional[Model]:
"""
Loads the detail from the local json index
"""
@@ -99,7 +99,10 @@ def load_snapshot_details(out_Dir: Path) -> Optional[Model]:
if existing_index.exists():
with open(existing_index, 'r', encoding='utf-8') as f:
try:
return pyjson.load(f)
output = pyjson.load(f)
if "history" not in output.keys():
output["history"] = {}
return output
except pyjson.JSONDecodeError:
pass
return None
@@ -109,7 +112,7 @@ def load_snapshot_details(out_Dir: Path) -> Optional[Model]:
def parse_json_snapshot_details(out_dir: Union[Path, str]) -> Iterator[dict]:
"""read through all the archive data folders and return the parsed links"""
for entry in os.scandir(Path(out_dir) / ARCHIVE_DIR_NAME):
for entry in os.scandir(Path(out_dir)):
if entry.is_dir(follow_symlinks=True):
if (Path(entry.path) / 'index.json').exists():
try:

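This file renames the per-folder loader to load_json_snapshot_details and has it backfill an empty "history" mapping when an older index.json lacks one, while parse_json_snapshot_details now scans whatever directory it is given instead of appending ARCHIVE_DIR_NAME itself. A minimal sketch of the loader's contract, assuming the helper lives in archivebox.index.json and a per-snapshot folder exists on disk:

    # sketch only: what the renamed loader is expected to return
    from pathlib import Path
    from archivebox.index.json import load_json_snapshot_details

    snapshot_dir = Path('archive/1609346732')         # hypothetical per-snapshot folder with an index.json
    details = load_json_snapshot_details(snapshot_dir)

    if details is not None:
        assert 'history' in details                   # guaranteed even for indexes written before this change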
View file

@@ -9,7 +9,7 @@ from datetime import date
from typing import Dict, List, Optional, Iterable, IO, Union
from crontab import CronTab, CronSlices
from django.db.models import QuerySet
from django.db.models import QuerySet, Model
from .cli import (
list_subcommands,
@@ -689,15 +689,16 @@ def update(resume: Optional[float]=None,
extractors: str="",
out_dir: Path=OUTPUT_DIR) -> List[Link]:
"""Import any new links from subscriptions and retry any previously failed/skipped links"""
from core.models import Snapshot
check_data_folder(out_dir=out_dir)
check_dependencies()
new_links: List[Link] = [] # TODO: Remove input argument: only_new
new_links: List[Snapshot] = [] # TODO: Remove input argument: only_new
extractors = extractors.split(",") if extractors else []
# Step 1: Filter for selected_links
matching_snapshots = list_links(
matching_snapshots = list_snapshots(
filter_patterns=filter_patterns,
filter_type=filter_type,
before=before,
@@ -705,15 +706,15 @@
)
matching_folders = list_folders(
links=matching_snapshots,
snapshots=matching_snapshots,
status=status,
out_dir=out_dir,
)
all_links = [link for link in matching_folders.values() if link]
if index_only:
for link in all_links:
write_snapshot_details(link, out_dir=out_dir, skip_sql_index=True)
for snapshot in all_snapshots:
write_snapshot_details(snapshot, out_dir=out_dir, skip_sql_index=True)
index_links(all_links, out_dir=out_dir)
return all_links
@@ -797,7 +798,7 @@ def list_all(filter_patterns_str: Optional[str]=None,
@enforce_types
def list_links(snapshots: Optional[QuerySet]=None,
def list_snapshots(snapshots: Optional[QuerySet]=None,
filter_patterns: Optional[List[str]]=None,
filter_type: str='exact',
after: Optional[float]=None,
@@ -820,9 +821,9 @@ def list_links(snapshots: Optional[QuerySet]=None,
return all_snapshots
@enforce_types
def list_folders(links: List[Link],
def list_folders(snapshots: List[Model],
status: str,
out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Model]]:
check_data_folder(out_dir=out_dir)
@@ -840,7 +841,7 @@ def list_folders(links: List[Link],
}
try:
return STATUS_FUNCTIONS[status](links, out_dir=out_dir)
return STATUS_FUNCTIONS[status](snapshots, out_dir=out_dir)
except KeyError:
raise ValueError('Status not recognized.')
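Finally, list_links becomes list_snapshots and list_folders now takes a snapshots queryset plus a status key that dispatches to one of the get_*_folders helpers above. A minimal sketch of the selection pipeline the refactored update command drives, assuming both functions are importable from archivebox.main and that the status keys mirror the helper names in the second file (e.g. 'indexed', 'archived', 'valid'):

    # sketch only: the filter -> status -> folders pipeline used by update()
    from pathlib import Path
    from archivebox.main import list_snapshots, list_folders

    matching = list_snapshots(
        filter_patterns=['example.com'],   # hypothetical pattern
        filter_type='substring',           # assumed to be one of the supported filter types
    )
    folders = list_folders(
        snapshots=matching,
        status='archived',
        out_dir=Path('.'),
    )
    for folder, snapshot in folders.items():
        if snapshot:
            print(folder, snapshot.url)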