1
0
Fork 0
mirror of synced 2024-06-11 23:14:43 +12:00

fix OSError 36 caused by checking for path that is too long to exist

This commit is contained in:
Nick Sweeting 2024-05-07 04:12:07 -07:00
parent ce42472732
commit f770bba3cf
No known key found for this signature in database

View file

@ -174,23 +174,29 @@ def wget_output_path(link: Link) -> Optional[str]:
full_path = without_fragment(without_query(path(link.url))).strip('/')
search_dir = Path(link.link_dir) / domain(link.url).replace(":", "+") / urldecode(full_path)
for _ in range(4):
if search_dir.exists():
if search_dir.is_dir():
html_files = [
f for f in search_dir.iterdir()
if re.search(".+\\.[Ss]?[Hh][Tt][Mm][Ll]?$", str(f), re.I | re.M)
]
if html_files:
return str(html_files[0].relative_to(link.link_dir))
try:
if search_dir.exists():
if search_dir.is_dir():
html_files = [
f for f in search_dir.iterdir()
if re.search(".+\\.[Ss]?[Hh][Tt][Mm][Ll]?$", str(f), re.I | re.M)
]
if html_files:
return str(html_files[0].relative_to(link.link_dir))
# sometimes wget'd URLs have no ext and return non-html
# e.g. /some/example/rss/all -> some RSS XML content)
# /some/other/url.o4g -> some binary unrecognized ext)
# test this with archivebox add --depth=1 https://getpocket.com/users/nikisweeting/feed/all
last_part_of_url = urldecode(full_path.rsplit('/', 1)[-1])
for file_present in search_dir.iterdir():
if file_present == last_part_of_url:
return str((search_dir / file_present).relative_to(link.link_dir))
# sometimes wget'd URLs have no ext and return non-html
# e.g. /some/example/rss/all -> some RSS XML content)
# /some/other/url.o4g -> some binary unrecognized ext)
# test this with archivebox add --depth=1 https://getpocket.com/users/nikisweeting/feed/all
last_part_of_url = urldecode(full_path.rsplit('/', 1)[-1])
for file_present in search_dir.iterdir():
if file_present == last_part_of_url:
return str((search_dir / file_present).relative_to(link.link_dir))
except OSError:
# OSError 36 and others can happen here, caused by trying to check for impossible paths
# (paths derived from URLs can often contain illegal unicode characters or be too long,
# causing the OS / filesystem to reject trying to open them with a system-level error)
pass
# Move up one directory level
search_dir = search_dir.parent