diff --git a/archivebox/index/__init__.py b/archivebox/index/__init__.py index e2eed78d..b4e918b8 100644 --- a/archivebox/index/__init__.py +++ b/archivebox/index/__init__.py @@ -529,8 +529,16 @@ def get_unrecognized_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Option link = None try: link = parse_json_link_details(entry.path) - except Exception: - pass + except KeyError: + # Try to fix index + if index_exists: + try: + # Last attempt to repair the detail index + link_guessed = parse_json_link_details(entry.path, guess=True) + write_json_link_details(link_guessed, out_dir=entry.path) + link = parse_json_link_details(entry.path) + except Exception as e: + pass if index_exists and link is None: # index exists but it's corrupted or unparseable @@ -555,9 +563,9 @@ def is_valid(link: Link) -> bool: return False if dir_exists and index_exists: try: - parsed_link = parse_json_link_details(link.link_dir) + parsed_link = parse_json_link_details(link.link_dir, guess=True) return link.url == parsed_link.url - except Exception: + except Exception as e: pass return False diff --git a/archivebox/index/json.py b/archivebox/index/json.py index f4cb9e54..69021123 100644 --- a/archivebox/index/json.py +++ b/archivebox/index/json.py @@ -39,7 +39,6 @@ MAIN_INDEX_HEADER = { }, } - ### Main Links Index @enforce_types @@ -58,8 +57,12 @@ def parse_json_main_index(out_dir: str=OUTPUT_DIR) -> Iterator[Link]: detail_index_path = Path(OUTPUT_DIR) / ARCHIVE_DIR_NAME / link_json['timestamp'] yield parse_json_link_details(str(detail_index_path)) except KeyError: - print(" {lightyellow}! Failed to load the index.json from {}".format(detail_index_path, **ANSI)) - continue + # as a last effort, try to guess the missing values out of existing ones + try: + yield Link.from_json(link_json, guess=True) + except KeyError: + print(" {lightyellow}! Failed to load the index.json from {}".format(detail_index_path, **ANSI)) + continue return () @enforce_types @@ -94,19 +97,18 @@ def write_json_link_details(link: Link, out_dir: Optional[str]=None) -> None: out_dir = out_dir or link.link_dir path = os.path.join(out_dir, JSON_INDEX_FILENAME) - atomic_write(path, link._asdict(extended=True)) @enforce_types -def parse_json_link_details(out_dir: str) -> Optional[Link]: +def parse_json_link_details(out_dir: str, guess: Optional[bool]=False) -> Optional[Link]: """load the json link index from a given directory""" existing_index = os.path.join(out_dir, JSON_INDEX_FILENAME) if os.path.exists(existing_index): with open(existing_index, 'r', encoding='utf-8') as f: try: link_json = pyjson.load(f) - return Link.from_json(link_json) + return Link.from_json(link_json, guess) except pyjson.JSONDecodeError: pass return None diff --git a/archivebox/index/schema.py b/archivebox/index/schema.py index db17c269..cf6e809b 100644 --- a/archivebox/index/schema.py +++ b/archivebox/index/schema.py @@ -1,6 +1,7 @@ __package__ = 'archivebox.index' import os +from pathlib import Path from datetime import datetime, timedelta @@ -51,7 +52,15 @@ class ArchiveResult: assert self.output @classmethod - def from_json(cls, json_info): + def guess_ts(_cls, dict_info): + from ..util import parse_date + parsed_timestamp = parse_date(dict_info["timestamp"]) + start_ts = parsed_timestamp + end_ts = parsed_timestamp + timedelta(seconds=int(dict_info["duration"])) + return start_ts, end_ts + + @classmethod + def from_json(cls, json_info, guess=False): from ..util import parse_date info = { @@ -59,9 +68,23 @@ class ArchiveResult: for key, val in json_info.items() if key in cls.field_names() } - info['start_ts'] = parse_date(info['start_ts']) - info['end_ts'] = parse_date(info['end_ts']) - info['cmd_version'] = info.get('cmd_version') + if guess: + keys = info.keys() + if "start_ts" not in keys: + info["start_ts"], info["end_ts"] = cls.guess_ts(json_info) + else: + info['start_ts'] = parse_date(info['start_ts']) + info['end_ts'] = parse_date(info['end_ts']) + if "pwd" not in keys: + info["pwd"] = str(os.getcwd() / Path(f"archive/{json_info['timestamp']}")) + if "cmd_version" not in keys: + info["cmd_version"] = "Undefined" + if "cmd" not in keys: + info["cmd"] = [] + else: + info['start_ts'] = parse_date(info['start_ts']) + info['end_ts'] = parse_date(info['end_ts']) + info['cmd_version'] = info.get('cmd_version') return cls(**info) def to_dict(self, *keys) -> dict: @@ -182,7 +205,7 @@ class Link: return info @classmethod - def from_json(cls, json_info): + def from_json(cls, json_info, guess=False): from ..util import parse_date info = { @@ -200,7 +223,7 @@ class Link: cast_history[method] = [] for json_result in method_history: assert isinstance(json_result, dict), 'Items in Link["history"][method] must be dicts' - cast_result = ArchiveResult.from_json(json_result) + cast_result = ArchiveResult.from_json(json_result, guess) cast_history[method].append(cast_result) info['history'] = cast_history