From b6d7c74680b2e732609c7510ba3967cafb33a45d Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 15 Feb 2021 20:52:08 -0500 Subject: [PATCH] speed up the Snapshot handling view and show index page when extractor output is missing or multiple snapshots returned --- archivebox/core/views.py | 181 +++++++++++++++++++++++++++++++-------- 1 file changed, 144 insertions(+), 37 deletions(-) diff --git a/archivebox/core/views.py b/archivebox/core/views.py index 9c8313f0..427ab1e4 100644 --- a/archivebox/core/views.py +++ b/archivebox/core/views.py @@ -4,8 +4,8 @@ from io import StringIO from contextlib import redirect_stdout from django.shortcuts import render, redirect - -from django.http import HttpResponse +from django.http import HttpResponse, Http404 +from django.utils.html import format_html, mark_safe from django.views import View, static from django.views.generic.list import ListView from django.views.generic import FormView @@ -44,10 +44,6 @@ class SnapshotView(View): # render static html index from filesystem archive//index.html def get(self, request, path): - # missing trailing slash -> redirect to index - if '/' not in path: - return redirect(f'{path}/index.html') - if not request.user.is_authenticated and not PUBLIC_SNAPSHOTS: return redirect(f'/admin/login/?next={request.path}') @@ -56,41 +52,152 @@ class SnapshotView(View): except (IndexError, ValueError): slug, archivefile = path.split('/', 1)[0], 'index.html' - all_pages = list(Snapshot.objects.all()) - # slug is a timestamp - by_ts = {page.timestamp: page for page in all_pages} - try: - # print('SERVING STATICFILE', by_ts[slug].link_dir, request.path, path) - response = static.serve(request, archivefile, document_root=by_ts[slug].link_dir, show_indexes=True) - response["Link"] = f'<{by_ts[slug].url}>; rel="canonical"' - return response - except KeyError: - pass + if slug.replace('.','').isdigit(): - # slug is a hash - by_hash = {page.url_hash: page for page in all_pages} - try: - timestamp = 
by_hash[slug].timestamp - return redirect(f'/archive/{timestamp}/{archivefile}') - except KeyError: - pass + # missing trailing slash -> redirect to index + if '/' not in path: + return redirect(f'{path}/index.html') + try: + try: + snapshot = Snapshot.objects.get(timestamp=slug) + except Snapshot.DoesNotExist: + if Snapshot.objects.filter(timestamp__startswith=slug).exists(): + raise Snapshot.MultipleObjectsReturned + response = static.serve(request, archivefile, document_root=snapshot.link_dir, show_indexes=True) + response["Link"] = f'<{snapshot.url}>; rel="canonical"' + return response + except Snapshot.DoesNotExist: + # Snapshot does not exist + return HttpResponse( + format_html( + ( + '



' + 'No Snapshots match the given timestamp: {}

' + 'You can add a new Snapshot, or return to the Main Index' + '
' + ), + slug, + path, + ), + content_type="text/html", + status=404, + ) + except Snapshot.MultipleObjectsReturned: + snapshot_hrefs = mark_safe('
').join( + format_html( + '{} {} {} {}', + snap.added.strftime('%Y-%m-%d %H:%M:%S'), + snap.timestamp, + snap.timestamp, + snap.url, + snap.title or '', + ) + for snap in Snapshot.objects.filter(timestamp__startswith=slug).only('url', 'timestamp', 'title', 'added').order_by('-added') + ) + return HttpResponse( + format_html( + ( + 'Multiple Snapshots match the given timestamp {}
'
+                        ),
+                        slug,
+                    ) + snapshot_hrefs + format_html(
+                        (
+                            '

' + 'Choose a Snapshot to proceed or go back to the Main Index' + ) + ), + content_type="text/html", + status=404, + ) + except Http404: + # Snapshot dir exists but file within does not e.g. 124235.324234/screenshot.png + return HttpResponse( + format_html( + ( + '



' + 'Snapshot {} exists but no file or folder /{} exists within.

' + 'Maybe this output type is not available for this URL,
or the archiving process has not completed for this Snapshot yet?
' + '
# run this cmd to finish archiving this Snapshot
archivebox update -t timestamp {}


' + 'You can go back to the Snapshot {} detail page, or return to the Main Index' + '
' + ), + snapshot.timestamp, + snapshot.timestamp, + archivefile, + snapshot.timestamp, + snapshot.timestamp, + snapshot.timestamp, + ), + content_type="text/html", + status=404, + ) # slug is a URL - by_url = {page.base_url: page for page in all_pages} - try: - # TODO: add multiple snapshot support by showing index of all snapshots - # for given url instead of redirecting to timestamp index - timestamp = by_url[base_url(path)].timestamp - return redirect(f'/archive/{timestamp}/index.html') - except KeyError: - pass - - return HttpResponse( - 'No archived link matches the given timestamp or hash.', - content_type="text/plain", - status=404, - ) + else: + try: + try: + # try exact match on full url first + snapshot = Snapshot.objects.get( + Q(url='http://' + path) | Q(url='https://' + path) + ) + except Snapshot.DoesNotExist: + # fall back to match on exact base_url + try: + snapshot = Snapshot.objects.get( + Q(url='http://' + base_url(path)) | Q(url='https://' + base_url(path)) + ) + except Snapshot.DoesNotExist: + # fall back to matching base_url as prefix + snapshot = Snapshot.objects.get( + Q(url__startswith='http://' + base_url(path)) | Q(url__startswith='https://' + base_url(path)) + ) + return redirect(f'/archive/{snapshot.timestamp}/index.html') + except Snapshot.DoesNotExist: + return HttpResponse( + format_html( + ( + '



' + 'No Snapshots match the given url: {}

' + 'You can add a new Snapshot, or return to the Main Index' + '
' + ), + base_url(path), + path, + ), + content_type="text/html", + status=404, + ) + except Snapshot.MultipleObjectsReturned: + snapshot_hrefs = mark_safe('
').join( + format_html( + '{} {} {} {}', + snap.added.strftime('%Y-%m-%d %H:%M:%S'), + snap.timestamp, + snap.timestamp, + snap.url, + snap.title or '', + ) + for snap in Snapshot.objects.filter( + Q(url__startswith='http://' + base_url(path)) | Q(url__startswith='https://' + base_url(path)) + ).only('url', 'timestamp', 'title', 'added').order_by('-added') + ) + return HttpResponse( + format_html( + ( + 'Multiple Snapshots match the given URL {}
'
+                        ),
+                        base_url(path),
+                    ) + snapshot_hrefs + format_html(
+                        (
+                            '

' + 'Choose a Snapshot to proceed or go back to the Main Index' + ) + ), + content_type="text/html", + status=404, + ) + class PublicIndexView(ListView): template_name = 'public_index.html'