From 4c5a3fba8bcee1eb5489bb93116d8ca8dfa44abd Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Tue, 7 May 2024 05:38:29 -0700 Subject: [PATCH] more fixes for wget_output_path --- archivebox/extractors/wget.py | 10 +++------- requirements.txt | 3 ++- 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/archivebox/extractors/wget.py b/archivebox/extractors/wget.py index 1619814b..86dba0ac 100644 --- a/archivebox/extractors/wget.py +++ b/archivebox/extractors/wget.py @@ -174,13 +174,12 @@ def unsafe_wget_output_path(link: Link) -> Optional[str]: # check for literally any file present that isnt an empty folder domain_dir = Path(domain(link.url).replace(":", "+")) - files_within = list((Path(link.link_dir) / domain_dir).glob('**/*.*')) + files_within = [path for path in (Path(link.link_dir) / domain_dir).glob('**/*.*') if not str(path).endswith('.orig')] if files_within: return str((domain_dir / files_within[-1]).relative_to(link.link_dir)) # abandon all hope, wget either never downloaded, or it produced an output path so horribly mutilated # that it's better we just pretend it doesnt exist - # this is why ArchiveBox's specializes in REDUNDANTLY saving copies of sites with multiple different tools return None @@ -243,26 +242,24 @@ def wget_output_path(link: Link) -> Optional[str]: try: output_path = unsafe_wget_output_path(link) except Exception as err: - # print(err) pass # better to pretend it just failed to download than expose gnarly OSErrors to users - # check for unprintable unicode characters # https://github.com/ArchiveBox/ArchiveBox/issues/1373 if output_path: safe_path = output_path.encode('utf-8', 'replace').decode() - if output_path != safe_path: # contains unprintable unicode characters that will break other parts of archivebox # better to pretend it doesnt exist and fallback to parent dir than crash archivebox output_path = None - # check for a path that is just too long to safely handle across different OS's # https://github.com/ArchiveBox/ArchiveBox/issues/549 if output_path and len(output_path) > 250: output_path = None + if output_path: + return output_path # fallback to just the domain dir search_dir = Path(link.link_dir) / domain(link.url).replace(":", "+") @@ -274,5 +271,4 @@ def wget_output_path(link: Link) -> Optional[str]: if search_dir.is_dir(): return domain(link.url).split(":", 1)[0] - return None diff --git a/requirements.txt b/requirements.txt index 260fb907..1f5acece 100644 --- a/requirements.txt +++ b/requirements.txt @@ -15,7 +15,8 @@ croniter==2.0.5 cryptography==42.0.7 dateparser==1.2.0 decorator==5.1.1 -django==5.0.4 +django==5.0.5 +django-admin-data-views==0.3.1 django-auth-ldap==4.8.0 django-extensions==3.2.3 django-ninja==1.1.0