diff --git a/archivebox/extractors/wget.py b/archivebox/extractors/wget.py index 07471e29..666a98fd 100644 --- a/archivebox/extractors/wget.py +++ b/archivebox/extractors/wget.py @@ -174,23 +174,29 @@ def wget_output_path(link: Link) -> Optional[str]: full_path = without_fragment(without_query(path(link.url))).strip('/') search_dir = Path(link.link_dir) / domain(link.url).replace(":", "+") / urldecode(full_path) for _ in range(4): - if search_dir.exists(): - if search_dir.is_dir(): - html_files = [ - f for f in search_dir.iterdir() - if re.search(".+\\.[Ss]?[Hh][Tt][Mm][Ll]?$", str(f), re.I | re.M) - ] - if html_files: - return str(html_files[0].relative_to(link.link_dir)) + try: + if search_dir.exists(): + if search_dir.is_dir(): + html_files = [ + f for f in search_dir.iterdir() + if re.search(".+\\.[Ss]?[Hh][Tt][Mm][Ll]?$", str(f), re.I | re.M) + ] + if html_files: + return str(html_files[0].relative_to(link.link_dir)) - # sometimes wget'd URLs have no ext and return non-html - # e.g. /some/example/rss/all -> some RSS XML content) - # /some/other/url.o4g -> some binary unrecognized ext) - # test this with archivebox add --depth=1 https://getpocket.com/users/nikisweeting/feed/all - last_part_of_url = urldecode(full_path.rsplit('/', 1)[-1]) - for file_present in search_dir.iterdir(): - if file_present == last_part_of_url: - return str((search_dir / file_present).relative_to(link.link_dir)) + # sometimes wget'd URLs have no ext and return non-html + # e.g. /some/example/rss/all -> some RSS XML content) + # /some/other/url.o4g -> some binary unrecognized ext) + # test this with archivebox add --depth=1 https://getpocket.com/users/nikisweeting/feed/all + last_part_of_url = urldecode(full_path.rsplit('/', 1)[-1]) + for file_present in search_dir.iterdir(): + if file_present == last_part_of_url: + return str((search_dir / file_present).relative_to(link.link_dir)) + except OSError: + # OSError 36 and others can happen here, caused by trying to check for impossible paths + # (paths derived from URLs can often contain illegal unicode characters or be too long, + # causing the OS / filesystem to reject trying to open them with a system-level error) + pass # Move up one directory level search_dir = search_dir.parent