2019-05-01 15:13:21 +12:00
__package__ = ' archivebox.core '
2019-04-03 09:36:41 +13:00
2024-05-07 06:06:42 +12:00
from typing import Callable, Optional
2020-08-26 06:15:42 +12:00
from io import StringIO
from contextlib import redirect_stdout
2019-05-01 15:13:21 +12:00
from django . shortcuts import render , redirect
2024-05-07 06:06:42 +12:00
from django . http import HttpRequest , HttpResponse , Http404
2021-02-16 14:52:08 +13:00
from django . utils . html import format_html , mark_safe
2019-05-01 15:13:21 +12:00
from django . views import View , static
2020-08-21 02:04:34 +12:00
from django . views . generic . list import ListView
2020-08-29 02:58:32 +12:00
from django . views . generic import FormView
2021-01-30 03:08:03 +13:00
from django . db . models import Q
2020-08-29 02:58:32 +12:00
from django . contrib . auth . mixins import UserPassesTestMixin
2021-07-02 12:55:51 +12:00
from django . views . decorators . csrf import csrf_exempt
from django . utils . decorators import method_decorator
2020-08-21 08:43:28 +12:00
2024-05-07 06:06:42 +12:00
from admin_data_views . typing import TableContext , ItemContext
from admin_data_views . utils import render_with_table_view , render_with_item_view , ItemLink
2019-05-01 15:44:51 +12:00
from core . models import Snapshot
2020-08-26 06:15:42 +12:00
from core . forms import AddLinkForm
2019-05-03 11:15:16 +12:00
from . . config import (
OUTPUT_DIR ,
PUBLIC_INDEX ,
PUBLIC_SNAPSHOTS ,
2020-11-28 19:29:34 +13:00
PUBLIC_ADD_VIEW ,
VERSION ,
2024-01-04 17:09:04 +13:00
COMMIT_HASH ,
2020-11-28 19:29:34 +13:00
FOOTER_INFO ,
2021-02-16 14:42:00 +13:00
SNAPSHOTS_PER_PAGE ,
2024-05-07 06:06:42 +12:00
CONFIG ,
CONFIG_SCHEMA ,
DYNAMIC_CONFIG_SCHEMA ,
USER_CONFIG ,
2019-05-03 11:15:16 +12:00
)
2021-04-10 21:13:56 +12:00
from . . main import add
2020-08-26 06:15:42 +12:00
from . . util import base_url , ansi_to_html
2021-04-10 21:13:56 +12:00
from . . search import query_search_index
2020-07-03 08:54:25 +12:00
2019-04-17 21:42:21 +12:00
2021-01-30 23:35:07 +13:00
class HomepageView(View):
    """Landing view for the site root: redirect the visitor somewhere useful."""

    def get(self, request):
        """Redirect based on login state and the PUBLIC_INDEX setting.

        - authenticated users go to the Snapshot admin changelist
        - anonymous users go to the public index when PUBLIC_INDEX is truthy
        - everyone else is sent to the admin login, with ``next`` set to the
          path that was requested
        """
        if request.user.is_authenticated:
            return redirect('/admin/core/snapshot/')

        if PUBLIC_INDEX:
            return redirect('/public')

        return redirect(f'/admin/login/?next={request.path}')
2019-04-23 11:08:01 +12:00
2021-01-30 23:35:07 +13:00
class SnapshotView(View):
    """Serve files from a Snapshot's archive dir, or resolve a URL/UUID to one.

    render static html index from filesystem archive/<timestamp>/index.html
    """

    # strftime pattern used when listing several candidate Snapshots
    _ADDED_FORMAT = '%Y-%m-%d %H:%M:%S'

    def get(self, request, path):
        """Handle ``GET <path>``.

        ``path`` is either ``<timestamp>[/<file>]`` (served from the Snapshot's
        ``link_dir``) or a URL / id prefix (redirected to the matching
        Snapshot's index). Anonymous users are sent to the login page unless
        PUBLIC_SNAPSHOTS is truthy. Every lookup failure is rendered as an
        HTML page with status 404.
        """
        if not request.user.is_authenticated and not PUBLIC_SNAPSHOTS:
            return redirect(f'/admin/login/?next={request.path}')

        if '/' in path:
            slug, archivefile = path.split('/', 1)
        else:
            slug, archivefile = path, 'index.html'

        # slug is a timestamp
        if slug.replace('.', '').isdigit():
            return self._serve_by_timestamp(request, path, slug, archivefile)

        # slug is a URL
        return self._redirect_by_url(path)

    @staticmethod
    def _html_404(html):
        """Wrap an HTML fragment in a ``text/html`` response with status 404."""
        return HttpResponse(html, content_type="text/html", status=404)

    @classmethod
    def _multiple_matches_response(cls, kind, needle, snapshots):
        """Render the "multiple Snapshots match" page listing ``snapshots``, newest first."""
        links = mark_safe('<br/>').join(
            format_html(
                '{} <a href="/archive/{}/index.html"><b><code>{}</code></b></a> {} <b>{}</b>',
                snap.added.strftime(cls._ADDED_FORMAT),
                snap.timestamp,
                snap.timestamp,
                snap.url,
                snap.title or '',
            )
            for snap in snapshots.only('url', 'timestamp', 'title', 'added').order_by('-added')
        )
        header = format_html(
            'Multiple Snapshots match the given {} <code>{}</code><br/><pre>',
            kind,
            needle,
        )
        # Static markup: mark_safe rather than format_html, because calling
        # format_html() without arguments is deprecated since Django 5.0.
        footer = mark_safe(
            '</pre><br/>'
            'Choose a Snapshot to proceed or go back to the <a href="/" target="_top">Main Index</a>'
        )
        return cls._html_404(header + links + footer)

    @classmethod
    def _missing_file_response(cls, snapshot, archivefile):
        """Render the page shown when the Snapshot row exists but the requested file does not."""
        # Snapshot dir exists but file within does not e.g. 124235.324234/screenshot.png
        return cls._html_404(
            format_html(
                (
                    '<center><br/><br/><br/>'
                    'Snapshot <a href="/archive/{timestamp}/index.html" target="_top"><b><code>[{timestamp}]</code></b></a> exists in DB, but resource <b><code>{timestamp}/'
                    '{archivefile}'
                    '</code></b> does not exist in the <a href="/archive/{timestamp}/" target="_top">snapshot dir</a> yet.<br/><br/>'
                    'It\'s possible that this resource type is not available for the Snapshot,<br/>or that the archiving process has not completed yet.<br/>'
                    '<pre><code># if interrupted, run this cmd to finish archiving this Snapshot<br/>archivebox update -t timestamp {timestamp}</code></pre><br/><br/>'
                    '<div style="text-align: left; width: 100%; max-width: 400px">'
                    '<i><b>Next steps:</b></i><br/>'
                    '- list all the <a href="/archive/{timestamp}/" target="_top">Snapshot files <code>.*</code></a><br/>'
                    '- view the <a href="/archive/{timestamp}/index.html" target="_top">Snapshot <code>./index.html</code></a><br/>'
                    '- go to the <a href="/admin/core/snapshot/{snapshot_id}/change/" target="_top">Snapshot admin</a> to edit<br/>'
                    '- go to the <a href="/admin/core/snapshot/?id__startswith={snapshot_id}" target="_top">Snapshot actions</a> to re-archive<br/>'
                    '- or return to <a href="/" target="_top">the main index...</a></div>'
                    '</center>'
                ),
                timestamp=snapshot.timestamp,
                snapshot_id=snapshot.id,
                archivefile=archivefile,
            )
        )

    def _serve_by_timestamp(self, request, path, slug, archivefile):
        """Serve ``archivefile`` from the Snapshot whose timestamp (or id prefix) is ``slug``."""
        # missing trailing slash -> redirect to index
        if '/' not in path:
            return redirect(f'{path}/index.html')

        candidates = Snapshot.objects.filter(timestamp__startswith=slug)
        try:
            snapshot = Snapshot.objects.get(Q(timestamp=slug) | Q(id__startswith=slug))
        except Snapshot.DoesNotExist:
            # No exact match: a timestamp prefix that matches rows is reported
            # as ambiguous instead of as missing.
            if candidates.exists():
                return self._multiple_matches_response('timestamp/UUID', slug, candidates)
            return self._html_404(
                format_html(
                    (
                        '<center><br/><br/><br/>'
                        'No Snapshot directories match the given timestamp or UUID: <code>{}</code><br/><br/>'
                        'You can <a href="/add/" target="_top">add a new Snapshot</a>, or return to the <a href="/" target="_top">Main Index</a>'
                        '</center>'
                    ),
                    slug,
                )
            )
        except Snapshot.MultipleObjectsReturned:
            return self._multiple_matches_response('timestamp/UUID', slug, candidates)

        try:
            response = static.serve(request, archivefile, document_root=snapshot.link_dir, show_indexes=True)
        except Http404:
            return self._missing_file_response(snapshot, archivefile)

        response["Link"] = f'<{snapshot.url}>; rel="canonical"'
        return response

    def _redirect_by_url(self, path):
        """Redirect to the index of the Snapshot whose URL (or id prefix) matches ``path``."""
        try:
            try:
                # try exact match on full url first
                snapshot = Snapshot.objects.get(
                    Q(url='http://' + path) | Q(url='https://' + path) | Q(id__startswith=path)
                )
            except Snapshot.DoesNotExist:
                # fall back to match on exact base_url
                try:
                    snapshot = Snapshot.objects.get(
                        Q(url='http://' + base_url(path)) | Q(url='https://' + base_url(path))
                    )
                except Snapshot.DoesNotExist:
                    # fall back to matching base_url as prefix
                    snapshot = Snapshot.objects.get(
                        Q(url__startswith='http://' + base_url(path)) | Q(url__startswith='https://' + base_url(path))
                    )
            return redirect(f'/archive/{snapshot.timestamp}/index.html')
        except Snapshot.DoesNotExist:
            return self._html_404(
                format_html(
                    (
                        '<center><br/><br/><br/>'
                        'No Snapshots match the given url: <code>{}</code><br/><br/><br/>'
                        'Return to the <a href="/" target="_top">Main Index</a>, or:<br/><br/>'
                        '+ <i><a href="/add/?url={}" target="_top">Add a new Snapshot for <code>{}</code></a><br/><br/></i>'
                        '</center>'
                    ),
                    base_url(path),
                    path if '://' in path else f'https://{path}',
                    path,
                )
            )
        except Snapshot.MultipleObjectsReturned:
            return self._multiple_matches_response(
                'URL',
                base_url(path),
                Snapshot.objects.filter(
                    Q(url__startswith='http://' + base_url(path)) | Q(url__startswith='https://' + base_url(path))
                ),
            )
2021-10-04 06:12:03 +13:00
2020-08-21 02:04:34 +12:00
2021-01-30 23:35:07 +13:00
class PublicIndexView(ListView):
    """Paginated list of Snapshots (newest first) with an optional ``?q=`` search."""

    template_name = 'public_index.html'
    model = Snapshot
    paginate_by = SNAPSHOTS_PER_PAGE
    ordering = ['-added']

    def get_context_data(self, **kwargs):
        """Extend the ListView context with version and footer values from config."""
        return {
            **super().get_context_data(**kwargs),
            'VERSION': VERSION,
            'COMMIT_HASH': COMMIT_HASH,
            'FOOTER_INFO': FOOTER_INFO,
        }

    def get_queryset(self, **kwargs):
        """Return the Snapshots to list, narrowed by the ``q`` GET parameter if given.

        A non-blank query matches title, url, timestamp or tag name
        (case-insensitive substring) and is unioned with whatever
        ``query_search_index`` returns. Search-backend errors are printed and
        otherwise ignored, so the database matches are still shown.
        """
        qs = super().get_queryset(**kwargs)

        # Search with the stripped text: the blank check already ignored the
        # surrounding whitespace, and a padded needle would match nothing.
        query = (self.request.GET.get('q') or '').strip()
        if query:
            qs = qs.filter(
                Q(title__icontains=query)
                | Q(url__icontains=query)
                | Q(timestamp__icontains=query)
                | Q(tags__name__icontains=query)
            )
            try:
                qs = qs | query_search_index(query)
            except Exception as err:
                # deliberate best-effort: a broken search backend must not break the index
                print(f'[!] Error while using search backend: {err.__class__.__name__} {err}')

        # the tag join and the backend union can both yield duplicate rows
        return qs.distinct()

    def get(self, *args, **kwargs):
        """Render the index when it is public or the user is logged in, else redirect to login."""
        if PUBLIC_INDEX or self.request.user.is_authenticated:
            return super().get(*args, **kwargs)
        return redirect(f'/admin/login/?next={self.request.path}')
2021-07-02 12:55:51 +12:00
# NOTE(review): CSRF protection is disabled for every HTTP method of this view;
# presumably so external tools can POST to it -- confirm this is still intended.
@method_decorator(csrf_exempt, name='dispatch')
class AddView(UserPassesTestMixin, FormView):
    """Form view that submits URLs to ``add()`` and shows its captured output."""

    template_name = "add.html"
    form_class = AddLinkForm

    def get_initial(self):
        """Prefill the AddLinkForm with the 'url' GET parameter"""
        if self.request.method == 'GET':
            url = self.request.GET.get('url', None)
            if url:
                # assume https when the parameter carries no scheme
                return {'url': url if '://' in url else f'https://{url}'}

        return super().get_initial()

    def test_func(self):
        """UserPassesTestMixin hook: allow when PUBLIC_ADD_VIEW is set or the user is logged in."""
        return PUBLIC_ADD_VIEW or self.request.user.is_authenticated

    def get_context_data(self, **kwargs):
        """Extend the FormView context with page title, version/footer info and empty stdout."""
        return {
            **super().get_context_data(**kwargs),
            'title': "Add URLs",
            # We can't just call request.build_absolute_uri in the template, because it would include query parameters
            'absolute_add_path': self.request.build_absolute_uri(self.request.path),
            'VERSION': VERSION,
            'FOOTER_INFO': FOOTER_INFO,
            'stdout': '',
        }

    def form_valid(self, form):
        """Run ``add()`` on the submitted data and re-render the page with its output.

        Everything ``add()`` prints is captured, echoed to the process stdout,
        converted with ``ansi_to_html`` and passed to the template as
        ``stdout`` together with a fresh, empty form.
        """
        data = form.cleaned_data
        url = data["url"]
        print(f'[+] Adding URL: {url}')

        input_kwargs = {
            "urls": url,
            "tag": data["tag"],
            # any depth value other than the string "0" is treated as depth 1
            "depth": 0 if data["depth"] == "0" else 1,
            "parser": data["parser"],
            "update_all": False,
            "out_dir": OUTPUT_DIR,
        }
        extractors = ','.join(data["archive_methods"])
        if extractors:
            input_kwargs["extractors"] = extractors

        add_stdout = StringIO()
        with redirect_stdout(add_stdout):
            add(**input_kwargs)

        # Echo after leaving the redirect so the log reaches the real stdout
        # and is not appended a second time to the captured buffer.
        captured = add_stdout.getvalue()
        print(captured)

        context = self.get_context_data()
        context.update({
            "stdout": ansi_to_html(captured.strip()),
            "form": AddLinkForm(),
        })
        return render(template_name=self.template_name, request=self.request, context=context)
2021-10-04 06:12:03 +13:00
class HealthCheckView(View):
    """
    A Django view that renders plain text "OK" for service discovery tools
    """

    def get(self, request):
        """
        Handle a GET request: always answer 200 with the text/plain body "OK"
        """
        return HttpResponse(
            'OK',
            content_type='text/plain',
            status=200,
        )
2024-05-07 06:06:42 +12:00
def find_config_section(key: str) -> str:
    """Return the name of the first CONFIG_SCHEMA section that contains *key*.

    Falls back to ``'DYNAMIC'`` when no section declares the key.
    """
    return next(
        (name for name, opts in CONFIG_SCHEMA.items() if key in opts),
        'DYNAMIC',
    )
def find_config_default(key: str) -> Optional[str]:
    """Return ``repr()`` of the default declared for *key* in USER_CONFIG.

    Returns ``None`` when the key has no entry or no ``'default'``, or when
    the declared default is a callable, since there is then no literal value
    to display.
    """
    # A callable sentinel makes "missing" take the same path as "computed".
    default_val = USER_CONFIG.get(key, {}).get('default', lambda: None)
    if callable(default_val):
        return None
    return repr(default_val)
def find_config_type(key: str) -> str:
    """Return the type name to display for config *key*.

    Uses the ``'type'`` declared in USER_CONFIG when the key is there,
    otherwise the runtime type of ``CONFIG[key]`` for keys in
    DYNAMIC_CONFIG_SCHEMA, and ``'str'`` for anything else.
    """
    if key in USER_CONFIG:
        return USER_CONFIG[key]['type'].__name__
    if key in DYNAMIC_CONFIG_SCHEMA:
        return type(CONFIG[key]).__name__
    return 'str'
def key_is_safe(key: str) -> bool:
    """Return False when *key* looks like it names a secret, True otherwise.

    A key is treated as sensitive when its lowercased name contains any of
    'key', 'password', 'secret' or 'token'.
    """
    lowered = key.lower()
    return not any(term in lowered for term in ('key', 'password', 'secret', 'token'))
@render_with_table_view
def live_config_list_view(request: HttpRequest, **kwargs) -> TableContext:
    """Build the table of every config key with its type, value, default and aliases.

    Lists each key of each CONFIG_SCHEMA section, then each key of
    DYNAMIC_CONFIG_SCHEMA under the 'DYNAMIC' section. Values of keys that
    ``key_is_safe`` rejects are shown redacted.

    Raises:
        PermissionDenied: if the requesting user is not a superuser.
    """
    # Local import: the file's import block is not part of this block.
    from django.core.exceptions import PermissionDenied

    # An explicit raise instead of ``assert``, which is stripped under ``python -O``.
    if not request.user.is_superuser:
        raise PermissionDenied('Must be a superuser to view configuration settings.')

    rows = {
        "Section": [],
        "Key": [],
        "Type": [],
        "Value": [],
        "Default": [],
        # "Documentation": [],
        "Aliases": [],
    }

    def add_row(section, key, aliases):
        """Append one table row; format_html escapes every interpolated value."""
        rows['Section'].append(section.replace('_', ' ').title().replace(' Config', ''))
        rows['Key'].append(ItemLink(key, key=key))
        rows['Type'].append(format_html('<code>{}</code>', find_config_type(key)))
        rows['Value'].append(
            format_html('<code>{}</code>', CONFIG[key]) if key_is_safe(key) else '******** (redacted)'
        )
        rows['Default'].append(
            format_html(
                '<a href="https://github.com/search?q=repo%3AArchiveBox%2FArchiveBox+path%3Aconfig.py+%27{}%27&type=code"><code style="text-decoration: underline">{}</code></a>',
                key,
                find_config_default(key) or 'See here...',
            )
        )
        rows['Aliases'].append(aliases)

    for section, options in CONFIG_SCHEMA.items():
        for key, spec in options.items():
            add_row(section, key, ', '.join(spec.get('aliases', [])))

    for key in DYNAMIC_CONFIG_SCHEMA.keys():
        # for dynamic keys the Aliases column links back to the user-config key of the same name
        add_row('DYNAMIC', key, ItemLink(key, key=key) if key in USER_CONFIG else '')

    return TableContext(
        title="Computed Configuration Values",
        table=rows,
    )
@render_with_item_view
def live_config_value_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
    """Build the detail page for a single config *key*.

    Shows the key, its type and its current value (redacted when
    ``key_is_safe`` rejects the key), with help text linking to the docs and
    the definition, plus an ``archivebox config --set`` example for keys in
    USER_CONFIG.

    Raises:
        PermissionDenied: if the requesting user is not a superuser.
    """
    # Local import: the file's import block is not part of this block.
    from django.core.exceptions import PermissionDenied

    # An explicit raise instead of ``assert``, which is stripped under ``python -O``.
    if not request.user.is_superuser:
        raise PermissionDenied('Must be a superuser to view configuration settings.')

    is_user_key = key in USER_CONFIG
    is_safe = key_is_safe(key)
    aliases = USER_CONFIG.get(key, {}).get("aliases", [])
    default_repr = find_config_default(key)
    shown_value = CONFIG[key] if is_safe else '********'
    # Value for the CLI example: the default if one is known, else the
    # current (possibly redacted) value, without repr()'s quotes.
    example_value = (default_repr or repr(shown_value)).strip("'")

    # ``key`` arrives as a view argument, so all markup below goes through
    # format_html, which escapes every interpolated value.
    if is_user_key:
        name = format_html(
            'data / ArchiveBox.conf [{}] <b><code style="color: lightgray">{}</code></b>',
            find_config_section(key),
            key,
        )
    else:
        name = format_html(
            '[DYNAMIC CONFIG] <b><code style="color: lightgray">{}</code></b> <small>(calculated at runtime)</small>',
            key,
        )

    redacted_notice = '' if is_safe else mark_safe(
        '<b style="color: red">Value is redacted for your security. (Passwords, secrets, API tokens, etc. cannot be viewed in the Web UI)</b><br/><br/>'
    )

    return ItemContext(
        slug=key,
        title=key,
        data=[
            {
                "name": name,
                "description": None,
                "fields": {
                    'Key': key,
                    'Type': find_config_type(key),
                    'Value': shown_value,
                },
                "help_texts": {
                    'Key': format_html(
                        '''
                        <a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#{anchor}">Documentation</a> &nbsp;
                        <span style="display: {aliases_display}">
                            Aliases: {aliases}
                        </span>
                        ''',
                        anchor=key.lower(),
                        aliases_display="inline" if aliases else "none",
                        aliases=", ".join(aliases),
                    ),
                    'Type': format_html(
                        '''
                        <a href="https://github.com/search?q=repo%3AArchiveBox%2FArchiveBox+path%3Aconfig.py+%27{key}%27&type=code">
                            See full definition in <code>archivebox/config.py</code>...
                        </a>
                        ''',
                        key=key,
                    ),
                    'Value': format_html(
                        '''
                        {redacted_notice}
                        Default: <a href="https://github.com/search?q=repo%3AArchiveBox%2FArchiveBox+path%3Aconfig.py+%27{key}%27&type=code">
                            <code>{default}</code>
                        </a>
                        <br/><br/>
                        <p style="display: {edit_display}">
                            <i>To change this value, edit <code>data/ArchiveBox.conf</code> or run:</i>
                            <br/><br/>
                            <code>archivebox config --set {key}="{example}"</code>
                        </p>
                        ''',
                        redacted_notice=redacted_notice,
                        key=key,
                        default=default_repr or 'See here...',
                        edit_display="block" if is_user_key else "none",
                        example=example_value,
                    ),
                },
            },
        ],
    )