From 4c5a3fba8bcee1eb5489bb93116d8ca8dfa44abd Mon Sep 17 00:00:00 2001
From: Nick Sweeting <github@sweeting.me>
Date: Tue, 7 May 2024 05:38:29 -0700
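Subject: [PATCH] more fixes for wget_output_path

- unsafe_wget_output_path: skip files ending in .orig when falling back to
  "any file present" inside the domain dir
- wget_output_path: return output_path as soon as it has passed the
  unicode and path-length checks, before the domain-dir fallback
- remove stray blank lines and a commented-out debug print
- requirements.txt: bump django 5.0.4 -> 5.0.5 and pin
  django-admin-data-views==0.3.1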

---
 archivebox/extractors/wget.py | 10 +++-------
 requirements.txt              |  3 ++-
 2 files changed, 5 insertions(+), 8 deletions(-)

diff --git a/archivebox/extractors/wget.py b/archivebox/extractors/wget.py
index 1619814b..86dba0ac 100644
--- a/archivebox/extractors/wget.py
+++ b/archivebox/extractors/wget.py
@@ -174,13 +174,12 @@ def unsafe_wget_output_path(link: Link) -> Optional[str]:
 
     # check for literally any file present that isnt an empty folder
     domain_dir = Path(domain(link.url).replace(":", "+"))
-    files_within = list((Path(link.link_dir) / domain_dir).glob('**/*.*'))
+    files_within = [path for path in (Path(link.link_dir) / domain_dir).glob('**/*.*') if not str(path).endswith('.orig')]
     if files_within:
         return str((domain_dir / files_within[-1]).relative_to(link.link_dir))
 
     # abandon all hope, wget either never downloaded, or it produced an output path so horribly mutilated
     # that it's better we just pretend it doesnt exist
-
     # this is why ArchiveBox's specializes in REDUNDANTLY saving copies of sites with multiple different tools
     return None
 
@@ -243,26 +242,24 @@ def wget_output_path(link: Link) -> Optional[str]:
     try:
         output_path = unsafe_wget_output_path(link)
     except Exception as err:
-        # print(err)
         pass           # better to pretend it just failed to download than expose gnarly OSErrors to users
 
-    
     # check for unprintable unicode characters
     # https://github.com/ArchiveBox/ArchiveBox/issues/1373
     if output_path:
         safe_path = output_path.encode('utf-8', 'replace').decode()
-        
         if output_path != safe_path:
             # contains unprintable unicode characters that will break other parts of archivebox
             # better to pretend it doesnt exist and fallback to parent dir than crash archivebox
             output_path = None
 
-
     # check for a path that is just too long to safely handle across different OS's
     # https://github.com/ArchiveBox/ArchiveBox/issues/549
     if output_path and len(output_path) > 250:
         output_path = None
 
+    if output_path:
+        return output_path
 
     # fallback to just the domain dir
     search_dir = Path(link.link_dir) / domain(link.url).replace(":", "+")
@@ -274,5 +271,4 @@ def wget_output_path(link: Link) -> Optional[str]:
     if search_dir.is_dir():
         return domain(link.url).split(":", 1)[0]
 
-   
     return None
diff --git a/requirements.txt b/requirements.txt
index 260fb907..1f5acece 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -15,7 +15,8 @@ croniter==2.0.5
 cryptography==42.0.7
 dateparser==1.2.0
 decorator==5.1.1
-django==5.0.4
+django==5.0.5
+django-admin-data-views==0.3.1
 django-auth-ldap==4.8.0
 django-extensions==3.2.3
 django-ninja==1.1.0