From f1823381d07218574e81402471d3f5d7ea546050 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Thu, 18 Feb 2021 02:33:28 -0500 Subject: [PATCH] improve latest title logic to take longest title --- archivebox/core/models.py | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/archivebox/core/models.py b/archivebox/core/models.py index 75898360..5ffde25a 100644 --- a/archivebox/core/models.py +++ b/archivebox/core/models.py @@ -170,18 +170,31 @@ class Snapshot(models.Model): @cached_property def latest_title(self): if self.title: - return self.title + return self.title # whoopdedoo that was easy try: - return self.archiveresult_set.filter(extractor='title', status='succeeded')[0].output - except ArchiveResult.DoesNotExist: + # take longest successful title from ArchiveResult db history + return sorted( + self.archiveresult_set\ + .filter(extractor='title', status='succeeded', output__isnull=False)\ + .values_list('output', flat=True), + key=lambda r: len(r), + )[-1] + except IndexError: pass - if ('title' in self.history - and self.history['title'] - and (self.history['title'][-1].status == 'succeeded') - and self.history['title'][-1].output.strip()): - return self.history['title'][-1].output.strip() + try: + # take longest successful title from Link json index file history + return sorted( + ( + result.output.strip() + for result in self.history['title'] + if result.status == 'succeeded' and result.output.strip() + ), + key=lambda r: len(r), + )[-1] + except (KeyError, IndexError): + pass return None