diff --git a/archivebox/extractors/__init__.py b/archivebox/extractors/__init__.py
index 2515b8fd..0249897b 100644
--- a/archivebox/extractors/__init__.py
+++ b/archivebox/extractors/__init__.py
@@ -186,6 +186,13 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s
                         ts
                     ) + "\n" + str(e) + "\n"))
                     #f.write(f"\n> {command}; ts={ts} version={config['VERSION']} docker={config['IN_DOCKER']} is_tty={config['IS_TTY']}\n")
+
+                # print(f'    ERROR: {method_name} {e.__class__.__name__}: {e} {getattr(e, "hints", "")}', ts, link.url, command)
+                raise Exception('Exception in archive_methods.save_{}(Link(url={}))'.format(
+                    method_name,
+                    link.url,
+                )) from e
+
         # print('    ', stats)
@@ -218,7 +225,7 @@ def archive_links(all_links: Union[Iterable[Link], QuerySet], overwrite: bool=Fa
 
     if type(all_links) is QuerySet:
         num_links: int = all_links.count()
-        get_link = lambda x: x.as_link()
+        get_link = lambda x: x.as_link_with_details()
         all_links = all_links.iterator()
     else:
         num_links: int = len(all_links)
diff --git a/archivebox/extractors/htmltotext.py b/archivebox/extractors/htmltotext.py
index 18722f13..0686f76e 100644
--- a/archivebox/extractors/htmltotext.py
+++ b/archivebox/extractors/htmltotext.py
@@ -121,9 +121,11 @@ def save_htmltotext(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO
     out_dir = Path(out_dir or link.link_dir)
     output = "htmltotext.txt"
+    cmd = ['(internal) archivebox.extractors.htmltotext', './{singlefile,dom}.html']
 
     timer = TimedProgress(timeout, prefix='      ')
     extracted_text = None
+    status = 'failed'
     try:
         extractor = HTMLTextExtractor()
         document = get_html(link, out_dir)
@@ -136,10 +138,9 @@ def save_htmltotext(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO
             extracted_text = str(extractor)
             atomic_write(str(out_dir / output), extracted_text)
+            status = 'succeeded'
     except (Exception, OSError) as err:
-        status = 'failed'
         output = err
-        cmd = ['(internal) archivebox.extractors.htmltotext', './{singlefile,dom}.html']
     finally:
         timer.end()
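Reviewer note: the two extractor changes above share one pattern. In extractors/__init__.py the per-method exception is now re-raised with `raise ... from e`, so the original traceback is chained instead of swallowed. In htmltotext.py, `status` and `cmd` are hoisted out of the `except` block: `status` defaults to 'failed' before the `try` and is only flipped to 'succeeded' after the output file is written, so both names are always bound when the result is recorded. A minimal standalone sketch of that fail-safe pattern (`run_extractor` and `work` are hypothetical names, not ArchiveBox's actual API):

    def run_extractor(work):
        status = 'failed'           # pessimistic default: any raise below leaves this intact
        output = None
        try:
            output = work()         # may raise at any point
            status = 'succeeded'    # reached only if every step above completed
        except Exception as err:
            output = err            # record the exception itself as the result output
        return status, output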
diff --git a/archivebox/extractors/singlefile.py b/archivebox/extractors/singlefile.py
index e3860527..e50b3932 100644
--- a/archivebox/extractors/singlefile.py
+++ b/archivebox/extractors/singlefile.py
@@ -77,6 +77,7 @@ def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO
 
     status = 'succeeded'
     timer = TimedProgress(timeout, prefix='      ')
+    result = None
     try:
         result = run(cmd, cwd=str(out_dir), timeout=timeout)
 
@@ -84,7 +85,7 @@ def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO
         # "Downloaded: 76 files, 4.0M in 1.6s (2.52 MB/s)"
         output_tail = [
             line.strip()
-            for line in (result.stdout + result.stderr).decode().rsplit('\n', 3)[-3:]
+            for line in (result.stdout + result.stderr).decode().rsplit('\n', 5)[-5:]
             if line.strip()
         ]
         hints = (
@@ -94,12 +95,13 @@ def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO
 
         # Check for common failure cases
         if (result.returncode > 0) or not (out_dir / output).is_file():
-            raise ArchiveError('SingleFile was not able to archive the page', hints)
+            raise ArchiveError(f'SingleFile was not able to archive the page (status={result.returncode})', hints)
         chmod_file(output, cwd=str(out_dir))
     except (Exception, OSError) as err:
         status = 'failed'
         # TODO: Make this prettier. This is necessary to run the command (escape JSON internal quotes).
         cmd[2] = browser_args.replace('"', "\\\"")
+        err.hints = (result.stdout + result.stderr).decode().split('\n')
         output = err
     finally:
         timer.end()
diff --git a/archivebox/extractors/title.py b/archivebox/extractors/title.py
index 3505e03f..6b0e37f6 100644
--- a/archivebox/extractors/title.py
+++ b/archivebox/extractors/title.py
@@ -75,7 +75,7 @@ def get_html(link: Link, path: Path, timeout: int=TIMEOUT) -> str:
             with open(abs_path / source, "r", encoding="utf-8") as f:
                 document = f.read()
                 break
-        except (FileNotFoundError, TypeError):
+        except (FileNotFoundError, TypeError, UnicodeDecodeError):
             continue
     if document is None:
         return download_url(link.url, timeout=timeout)
diff --git a/archivebox/index/__init__.py b/archivebox/index/__init__.py
index b9d57aeb..9912b4c7 100644
--- a/archivebox/index/__init__.py
+++ b/archivebox/index/__init__.py
@@ -407,7 +407,7 @@ def snapshot_filter(snapshots: QuerySet, filter_patterns: List[str], filter_type
 
 def get_indexed_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
     """indexed links without checking archive status or data directory validity"""
-    links = [snapshot.as_link_with_details() for snapshot in snapshots.iterator()]
+    links = (snapshot.as_link() for snapshot in snapshots.iterator())
     return {
         link.link_dir: link
         for link in links
@@ -415,7 +415,7 @@ def get_indexed_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Option
 
 def get_archived_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
     """indexed links that are archived with a valid data directory"""
-    links = [snapshot.as_link_with_details() for snapshot in snapshots.iterator()]
+    links = (snapshot.as_link() for snapshot in snapshots.iterator())
     return {
         link.link_dir: link
         for link in filter(is_archived, links)
@@ -423,7 +423,7 @@ def get_archived_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optio
 
 def get_unarchived_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
     """indexed links that are unarchived with no data directory or an empty data directory"""
-    links = [snapshot.as_link_with_details() for snapshot in snapshots.iterator()]
+    links = (snapshot.as_link() for snapshot in snapshots.iterator())
     return {
         link.link_dir: link
         for link in filter(is_unarchived, links)
diff --git a/archivebox/logging_util.py b/archivebox/logging_util.py
index 3c688a3c..933214b9 100644
--- a/archivebox/logging_util.py
+++ b/archivebox/logging_util.py
@@ -432,12 +432,14 @@ def log_archive_method_finished(result: "ArchiveResult"):
             **ANSI,
         ),
     ]
+
+    # import pudb; pudb.set_trace()
 
     # Prettify error output hints string and limit to five lines
     hints = getattr(result.output, 'hints', None) or ()
     if hints:
         if isinstance(hints, (list, tuple, type(_ for _ in ()))):
-            hints = [hint.decode() for hint in hints if isinstance(hint, bytes)]
+            hints = [hint.decode() if isinstance(hint, bytes) else str(hint) for hint in hints]
         else:
             if isinstance(hints, bytes):
                 hints = hints.decode()
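Reviewer note: the logging_util.py change fixes silent hint loss. The old comprehension kept only `bytes` hints and dropped `str` hints entirely; the new one decodes `bytes` and stringifies everything else, preserving order. A standalone before/after sketch (the `normalize_hints_*` helpers are hypothetical, written only to illustrate the behavior change):

    def normalize_hints_old(hints):
        # silently drops any hint that is not bytes
        return [h.decode() for h in hints if isinstance(h, bytes)]

    def normalize_hints_new(hints):
        # decodes bytes, stringifies everything else
        return [h.decode() if isinstance(h, bytes) else str(h) for h in hints]

    assert normalize_hints_old([b'line 1', 'line 2', 3]) == ['line 1']
    assert normalize_hints_new([b'line 1', 'line 2', 3]) == ['line 1', 'line 2', '3']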
 extractors = extractors.split(",") if extractors else []
 
     # Step 1: Filter for selected_links
+    print('[*] Finding matching Snapshots to update...')
+    print(f'    - Filtering by {" ".join(filter_patterns)} ({filter_type}) {before=} {after=} {status=}...')
     matching_snapshots = list_links(
         filter_patterns=filter_patterns,
         filter_type=filter_type,
         before=before,
         after=after,
     )
-
+    print(f'    - Checking {matching_snapshots.count()} snapshot folders for existing data with {status=}...')
     matching_folders = list_folders(
         links=matching_snapshots,
         status=status,
         out_dir=out_dir,
     )
-    all_links = [link for link in matching_folders.values() if link]
+    all_links = (link for link in matching_folders.values() if link)
+    print('    - Sorting by most unfinished -> least unfinished + date archived...')
+    all_links = sorted(all_links, key=lambda link: (ArchiveResult.objects.filter(snapshot__url=link.url).count(), link.timestamp))
 
     if index_only:
         for link in all_links:
@@ -836,6 +842,7 @@ def update(resume: Optional[float]=None,
     if extractors:
         archive_kwargs["methods"] = extractors
 
+
     archive_links(to_archive, overwrite=overwrite, **archive_kwargs)
 
     # Step 4: Re-write links index with updated titles, icons, and resources
diff --git a/archivebox/util.py b/archivebox/util.py
index d7df7f3c..5321081c 100644
--- a/archivebox/util.py
+++ b/archivebox/util.py
@@ -179,7 +179,11 @@ def download_url(url: str, timeout: int=None) -> str:
     if encoding is not None:
         response.encoding = encoding
 
-    return response.text
+    try:
+        return response.text
+    except UnicodeDecodeError:
+        # if the response is non-text (e.g. an image or other binary file), just return the filename instead
+        return url.rsplit('/', 1)[-1]
 
 @enforce_types
 def get_headers(url: str, timeout: int=None) -> str:
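Reviewer note: the main.py change makes `update` retry the least-archived snapshots first. The sort key is the tuple (number of existing ArchiveResult rows, timestamp), so snapshots with no results sort before mostly-finished ones, and ties go to the oldest timestamp; note the lambda issues one COUNT query per snapshot. A standalone illustration of the ordering (the URLs, counts, and the `result_counts` dict standing in for `ArchiveResult.objects.filter(...).count()` are all made up):

    from collections import namedtuple

    Link = namedtuple('Link', 'url timestamp')
    result_counts = {
        'https://a.example': 5,   # mostly archived already
        'https://b.example': 0,   # never archived, newer
        'https://c.example': 0,   # never archived, older
    }
    links = [
        Link('https://a.example', '1610000000'),
        Link('https://b.example', '1630000000'),
        Link('https://c.example', '1620000000'),
    ]

    ordered = sorted(links, key=lambda link: (result_counts[link.url], link.timestamp))
    assert [l.url for l in ordered] == ['https://c.example', 'https://b.example', 'https://a.example']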