
Merge pull request #449 from pirate/dev

commit 897a101c1f
Nick Sweeting, 2020-08-18 09:38:09 -04:00 (committed by GitHub)
19 changed files with 221 additions and 138 deletions


@@ -70,6 +70,7 @@ archivebox/index/json.py
 archivebox/index/schema.py
 archivebox/index/sql.py
 archivebox/parsers/__init__.py
+archivebox/parsers/generic_html.py
 archivebox/parsers/generic_json.py
 archivebox/parsers/generic_rss.py
 archivebox/parsers/generic_txt.py


@@ -108,8 +108,8 @@ CONFIG_DEFAULTS: Dict[str, ConfigDefaultDict] = {
     'DEPENDENCY_CONFIG': {
         'USE_CURL':         {'type': bool, 'default': True},
         'USE_WGET':         {'type': bool, 'default': True},
-        'USE_SINGLEFILE':   {'type': bool, 'default': False},
-        'USE_READABILITY':  {'type': bool, 'default': False},
+        'USE_SINGLEFILE':   {'type': bool, 'default': True},
+        'USE_READABILITY':  {'type': bool, 'default': True},
         'USE_GIT':          {'type': bool, 'default': True},
         'USE_CHROME':       {'type': bool, 'default': True},
         'USE_YOUTUBEDL':    {'type': bool, 'default': True},
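
This flips the SingleFile and Readability extractors on by default. A minimal sketch of how a bool default like these can still be switched off via the environment (the coercion helper below is an illustrative assumption, not ArchiveBox's exact parsing code in config.py):

    import os

    def load_bool_config(name: str, default: bool) -> bool:
        # hypothetical helper: real env coercion lives in archivebox/config.py
        raw = os.environ.get(name)
        if raw is None:
            return default
        return raw.strip().lower() in ('true', '1', 'yes', 'on')

    USE_SINGLEFILE = load_bool_config('USE_SINGLEFILE', default=True)
    USE_READABILITY = load_bool_config('USE_READABILITY', default=True)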


@@ -33,7 +33,7 @@ def update_titles(modeladmin, request, queryset):
     archive_links([
         snapshot.as_link()
         for snapshot in queryset
-    ], overwrite=True, methods=('title',), out_dir=OUTPUT_DIR)
+    ], overwrite=True, methods=('title','favicon'), out_dir=OUTPUT_DIR)
 update_titles.short_description = "Pull title"

 def overwrite_snapshots(modeladmin, request, queryset):

@@ -58,9 +58,9 @@ delete_snapshots.short_description = "Delete"

 class SnapshotAdmin(admin.ModelAdmin):
     list_display = ('added', 'title_str', 'url_str', 'files', 'size')
     sort_fields = ('title_str', 'url_str', 'added')
-    readonly_fields = ('id', 'url', 'timestamp', 'num_outputs', 'is_archived', 'url_hash', 'added', 'updated')
+    readonly_fields = ('id', 'url', 'timestamp', 'title', 'tags', 'num_outputs', 'is_archived', 'url_hash', 'added', 'updated')
     search_fields = ('url', 'timestamp', 'title', 'tags')
-    fields = ('title', 'tags', *readonly_fields)
+    fields = (*readonly_fields,)
     list_filter = ('added', 'updated', 'tags')
     ordering = ['-added']
     actions = [delete_snapshots, overwrite_snapshots, update_snapshots, update_titles, verify_snapshots]

@@ -82,12 +82,12 @@ class SnapshotAdmin(admin.ModelAdmin):
             '<a href="/{}">'
             '<img src="/{}/{}" class="favicon" onerror="this.remove()">'
             '</a>'
-            '<a href="/{}/{}">'
+            '<a href="/{}/index.html">'
             '<b class="status-{}">{}</b>'
             '</a>',
             obj.archive_path,
             obj.archive_path, canon['favicon_path'],
-            obj.archive_path, canon['wget_path'] or '',
+            obj.archive_path,
             'fetched' if obj.latest_title or obj.title else 'pending',
             urldecode(htmldecode(obj.latest_title or obj.title or ''))[:128] or 'Pending...'
         ) + mark_safe(f'<span class="tags">{tags}</span>')
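
The admin title link previously targeted the wget output path, which is empty until wget runs (or always, if wget is disabled); pointing at the snapshot's own index.html always resolves. A reduced sketch of the new markup, with plain str.format standing in for Django's format_html:

    archive_path = 'archive/1597700000.0'  # hypothetical snapshot folder
    title_link = (
        '<a href="/{}/index.html">'
        '<b class="status-{}">{}</b>'
        '</a>'
    ).format(archive_path, 'fetched', 'Example Title')
    print(title_link)  # the link resolves via the snapshot's own index.html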


@@ -62,10 +62,10 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s
     ARCHIVE_METHODS = get_default_archive_methods()

-    if methods is not None:
+    if methods:
         ARCHIVE_METHODS = [
             method for method in ARCHIVE_METHODS
-            if method[1] in methods
+            if method[0] in methods
         ]

     out_dir = out_dir or link.link_dir
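
Each ARCHIVE_METHODS entry is a (name, should_run, function) tuple (assuming the shape returned by get_default_archive_methods), so `method[1] in methods` compared a function object against method names and filtered everything out; `method[0]` is the name. A toy reproduction of the fix, with stand-in tuples:

    ARCHIVE_METHODS = [
        # stand-ins for ('title', should_save_title, save_title), etc.
        ('title',   lambda link: True, lambda link: 'saved title'),
        ('favicon', lambda link: True, lambda link: 'saved favicon'),
    ]

    def filter_methods(methods=None):
        if methods:  # `if methods:` also skips an empty tuple, not just None
            return [m for m in ARCHIVE_METHODS if m[0] in methods]
        return ARCHIVE_METHODS

    assert [m[0] for m in filter_methods(('title',))] == ['title']
    assert len(filter_methods(None)) == 2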


@@ -129,7 +129,7 @@ def validate_links(links: Iterable[Link]) -> List[Link]:
     try:
         links = archivable_links(links)  # remove chrome://, about:, mailto: etc.
         links = sorted_links(links)      # deterministically sort the links based on timstamp, url
-        links = uniquefied_links(links)  # merge/dedupe duplicate timestamps & urls
+        links = fix_duplicate_links(links)  # merge/dedupe duplicate timestamps & urls
     finally:
         timer.end()

@@ -144,34 +144,39 @@ def archivable_links(links: Iterable[Link]) -> Iterable[Link]:
             urlparse(link.url)
         except ValueError:
             continue
-        scheme_is_valid = scheme(link.url) in ('http', 'https', 'ftp')
-        not_blacklisted = (not URL_BLACKLIST_PTN.match(link.url)) if URL_BLACKLIST_PTN else True
-        if scheme_is_valid and not_blacklisted:
-            yield link
+        if scheme(link.url) not in ('http', 'https', 'ftp'):
+            continue
+        if URL_BLACKLIST_PTN and URL_BLACKLIST_PTN.search(link.url):
+            continue
+        yield link

 @enforce_types
-def uniquefied_links(sorted_links: Iterable[Link]) -> Iterable[Link]:
+def fix_duplicate_links(sorted_links: Iterable[Link]) -> Iterable[Link]:
     """
     ensures that all non-duplicate links have monotonically increasing timestamps
     """
+    # from core.models import Snapshot

     unique_urls: OrderedDict[str, Link] = OrderedDict()

     for link in sorted_links:
-        if link.base_url in unique_urls:
+        if link.url in unique_urls:
             # merge with any other links that share the same url
-            link = merge_links(unique_urls[link.base_url], link)
-        unique_urls[link.base_url] = link
+            link = merge_links(unique_urls[link.url], link)
+        unique_urls[link.url] = link

-    unique_timestamps: OrderedDict[str, Link] = OrderedDict()
-    for link in unique_urls.values():
-        new_link = link.overwrite(
-            timestamp=lowest_uniq_timestamp(unique_timestamps, link.timestamp),
-        )
-        unique_timestamps[new_link.timestamp] = new_link
-
-    return unique_timestamps.values()
+    # unique_timestamps: OrderedDict[str, Link] = OrderedDict()
+    # for link in unique_urls.values():
+    #     closest_non_duplicate_ts = lowest_uniq_timestamp(unique_timestamps, link.timestamp)
+    #     if closest_non_duplicate_ts != link.timestamp:
+    #         link = link.overwrite(timestamp=closest_non_duplicate_ts)
+    #         Snapshot.objects.filter(url=link.url).update(timestamp=link.timestamp)
+    #     unique_timestamps[link.timestamp] = link
+    # return unique_timestamps.values()
+
+    return unique_urls.values()

 @enforce_types

@@ -301,14 +306,14 @@ def load_main_index_meta(out_dir: str=OUTPUT_DIR) -> Optional[dict]:

 @enforce_types
-def parse_links_from_source(source_path: str) -> Tuple[List[Link], List[Link]]:
+def parse_links_from_source(source_path: str, root_url: Optional[str]=None) -> Tuple[List[Link], List[Link]]:

     from ..parsers import parse_links

     new_links: List[Link] = []

     # parse and validate the import file
-    raw_links, parser_name = parse_links(source_path)
+    raw_links, parser_name = parse_links(source_path, root_url=root_url)
     new_links = validate_links(raw_links)

     if parser_name:
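
Keying the dedupe on link.url instead of link.base_url means two bookmarks that share a base_url but differ in their full url (for example, only in scheme) are no longer collapsed into one snapshot. A toy demonstration, with Link reduced to a namedtuple and a merge_links stand-in that keeps the earliest timestamp and the newest non-empty title:

    from collections import OrderedDict, namedtuple

    Link = namedtuple('Link', 'url timestamp title')

    def merge_links(a, b):
        # stand-in for the real index.merge_links
        return Link(a.url, min(a.timestamp, b.timestamp), b.title or a.title)

    def fix_duplicate_links(sorted_links):
        unique_urls = OrderedDict()
        for link in sorted_links:
            if link.url in unique_urls:
                link = merge_links(unique_urls[link.url], link)
            unique_urls[link.url] = link
        return unique_urls.values()

    links = [
        Link('http://example.com/page', '1', None),
        Link('http://example.com/page', '2', 'Page'),   # exact duplicate url: merged
        Link('https://example.com/page', '3', 'Page'),  # same page over https: now kept separate
    ]
    print([l.url for l in fix_duplicate_links(links)])
    # ['http://example.com/page', 'https://example.com/page']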


@@ -8,6 +8,7 @@ from typing import List, Optional, Iterator, Mapping
 from .schema import Link
 from ..system import atomic_write, copy_and_overwrite
+from ..logging_util import printable_filesize
 from ..util import (
     enforce_types,
     ts_to_date,

@@ -140,6 +141,7 @@ def link_details_template(link: Link) -> str:
         ) or 'about:blank',
         'extension': link.extension or 'html',
         'tags': link.tags or 'untagged',
+        'size': printable_filesize(link.archive_size) if link.archive_size else 'pending',
         'status': 'archived' if link.is_archived else 'not yet archived',
         'status_color': 'success' if link.is_archived else 'danger',
         'oldest_archive_date': ts_to_date(link.oldest_archive_date),
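
The detail template gains a human-readable size line. printable_filesize comes from logging_util; a sketch of the kind of formatting it performs (an illustrative approximation, the real implementation may differ in rounding and units):

    def printable_filesize(num_bytes: float) -> str:
        # illustrative approximation of logging_util.printable_filesize
        for unit in ('bytes', 'KB', 'MB', 'GB'):
            if num_bytes < 1024 or unit == 'GB':
                return f'{num_bytes:.2f} {unit}'
            num_bytes /= 1024

    print(printable_filesize(3_400_000))  # 3.24 MB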


@@ -39,6 +39,10 @@ def write_sql_main_index(links: List[Link], out_dir: str=OUTPUT_DIR) -> None:
     with transaction.atomic():
         for link in links:
             info = {k: v for k, v in link._asdict().items() if k in Snapshot.keys}
+            try:
+                info['timestamp'] = Snapshot.objects.get(url=link.url).timestamp
+            except Snapshot.DoesNotExist:
+                pass
             Snapshot.objects.update_or_create(url=link.url, defaults=info)

 @enforce_types
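
Without the lookup, update_or_create() would overwrite an existing Snapshot's timestamp with the freshly parsed one, orphaning its archive/<timestamp>/ folder on disk. A plain-Python sketch of the invariant being protected (a dict stands in for the Snapshot table):

    db = {'https://example.com': {'timestamp': '1597700000.0', 'title': None}}

    def upsert(url, info):
        if url in db:
            info['timestamp'] = db[url]['timestamp']  # keep the on-disk folder name stable
        db.setdefault(url, {}).update(info)

    upsert('https://example.com', {'timestamp': '1597790000.0', 'title': 'Example'})
    assert db['https://example.com']['timestamp'] == '1597700000.0'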


@@ -548,7 +548,7 @@ def add(urls: Union[str, List[str]],
     # save verbatim args to sources
     write_ahead_log = save_text_as_source('\n'.join(urls), filename='{ts}-import.txt', out_dir=out_dir)

-    new_links += parse_links_from_source(write_ahead_log)
+    new_links += parse_links_from_source(write_ahead_log, root_url=None)

     # If we're going one level deeper, download each link and look for more links
     new_links_depth = []

@@ -556,9 +556,9 @@ def add(urls: Union[str, List[str]],
         log_crawl_started(new_links)
         for new_link in new_links:
             downloaded_file = save_file_as_source(new_link.url, filename=f'{new_link.timestamp}-crawl-{new_link.domain}.txt', out_dir=out_dir)
-            new_links_depth += parse_links_from_source(downloaded_file)
+            new_links_depth += parse_links_from_source(downloaded_file, root_url=new_link.url)

-    imported_links = new_links + new_links_depth
+    imported_links = list({link.url: link for link in (new_links + new_links_depth)}.values())
     all_links, new_links = dedupe_links(all_links, imported_links)
     write_main_index(links=all_links, out_dir=out_dir, finished=not new_links)
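
The imported_links one-liner dedupes the combined import + crawl results by url before the main dedupe pass. Shown in isolation (dicts stand in for Link objects, keyed the same way):

    new_links = [{'url': 'https://a.com', 'via': 'import'},
                 {'url': 'https://b.com', 'via': 'import'}]
    new_links_depth = [{'url': 'https://a.com', 'via': 'crawl'}]

    # last occurrence wins; dicts preserve first-seen key order (Python 3.7+)
    imported_links = list({link['url']: link for link in (new_links + new_links_depth)}.values())
    print([(l['url'], l['via']) for l in imported_links])
    # [('https://a.com', 'crawl'), ('https://b.com', 'import')]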


@@ -11,7 +11,7 @@ import re
 import os

 from io import StringIO
-from typing import IO, Tuple, List
+from typing import IO, Tuple, List, Optional
 from datetime import datetime

 from ..system import atomic_write

@@ -38,26 +38,29 @@ from .medium_rss import parse_medium_rss_export
 from .netscape_html import parse_netscape_html_export
 from .generic_rss import parse_generic_rss_export
 from .generic_json import parse_generic_json_export
+from .generic_html import parse_generic_html_export
 from .generic_txt import parse_generic_txt_export

 PARSERS = (
     # Specialized parsers
     ('Pocket HTML', parse_pocket_html_export),
     ('Pinboard RSS', parse_pinboard_rss_export),
     ('Shaarli RSS', parse_shaarli_rss_export),
     ('Medium RSS', parse_medium_rss_export),

     # General parsers
     ('Netscape HTML', parse_netscape_html_export),
     ('Generic RSS', parse_generic_rss_export),
     ('Generic JSON', parse_generic_json_export),
+    ('Generic HTML', parse_generic_html_export),

     # Fallback parser
     ('Plain Text', parse_generic_txt_export),
 )

 @enforce_types
-def parse_links_memory(urls: List[str]):
+def parse_links_memory(urls: List[str], root_url: Optional[str]=None):
     """
     parse a list of URLS without touching the filesystem
     """

@@ -68,17 +71,16 @@
     file = StringIO()
     file.writelines(urls)
     file.name = "io_string"
-    output = _parse(file, timer)
-    if output is not None:
-        return output
-
+    links, parser = run_parser_functions(file, timer, root_url=root_url)
     timer.end()
-    return [], 'Failed to parse'
+
+    if parser is None:
+        return [], 'Failed to parse'
+    return links, parser

 @enforce_types
-def parse_links(source_file: str) -> Tuple[List[Link], str]:
+def parse_links(source_file: str, root_url: Optional[str]=None) -> Tuple[List[Link], str]:
     """parse a list of URLs with their metadata from an
     RSS feed, bookmarks export, or text file
     """

@@ -87,28 +89,39 @@ def parse_links(source_file: str) -> Tuple[List[Link], str]:
     timer = TimedProgress(TIMEOUT * 4)
     with open(source_file, 'r', encoding='utf-8') as file:
-        output = _parse(file, timer)
-        if output is not None:
-            return output
-
+        links, parser = run_parser_functions(file, timer, root_url=root_url)
     timer.end()
-    return [], 'Failed to parse'
+
+    if parser is None:
+        return [], 'Failed to parse'
+    return links, parser
+

-def _parse(to_parse: IO[str], timer) -> Tuple[List[Link], str]:
+def run_parser_functions(to_parse: IO[str], timer, root_url: Optional[str]=None) -> Tuple[List[Link], Optional[str]]:
+    most_links: List[Link] = []
+    best_parser_name = None
+
     for parser_name, parser_func in PARSERS:
         try:
-            links = list(parser_func(to_parse))
-            if links:
-                timer.end()
-                return links, parser_name
+            parsed_links = list(parser_func(to_parse, root_url=root_url))
+            if not parsed_links:
+                raise Exception('no links found')
+
+            # print(f'[√] Parser {parser_name} succeeded: {len(parsed_links)} links parsed')
+            if len(parsed_links) > len(most_links):
+                most_links = parsed_links
+                best_parser_name = parser_name
+
         except Exception as err:   # noqa
             # Parsers are tried one by one down the list, and the first one
             # that succeeds is used. To see why a certain parser was not used
             # due to error or format incompatibility, uncomment this line:
             # print('[!] Parser {} failed: {} {}'.format(parser_name, err.__class__.__name__, err))
             # raise
             pass
+    timer.end()
+    return most_links, best_parser_name

 @enforce_types
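
run_parser_functions replaces the old first-match-wins _parse: every parser now runs over the input and the one that extracts the most links is kept, rather than the first one that extracts anything. A reduced sketch of the strategy with stand-in parser functions:

    PARSERS = (
        ('Generic RSS', lambda text: ['https://feed.example/1'] if '<rss' in text else []),
        ('Plain Text', lambda text: [w for w in text.split() if w.startswith('http')]),
    )

    def run_parser_functions(text):
        most_links, best_parser_name = [], None
        for parser_name, parser_func in PARSERS:
            try:
                parsed_links = list(parser_func(text))
                if not parsed_links:
                    raise Exception('no links found')
                if len(parsed_links) > len(most_links):
                    most_links, best_parser_name = parsed_links, parser_name
            except Exception:
                pass  # a failing parser just drops out of the running
        return most_links, best_parser_name

    print(run_parser_functions('http://a.com http://b.com'))
    # (['http://a.com', 'http://b.com'], 'Plain Text')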


@ -0,0 +1,53 @@
__package__ = 'archivebox.parsers'
import re
from typing import IO, Iterable, Optional
from datetime import datetime
from ..index.schema import Link
from ..util import (
htmldecode,
enforce_types,
URL_REGEX,
)
from html.parser import HTMLParser
from urllib.parse import urljoin
class HrefParser(HTMLParser):
def __init__(self):
super().__init__()
self.urls = []
def handle_starttag(self, tag, attrs):
if tag == "a":
for attr, value in attrs:
if attr == "href":
self.urls.append(value)
@enforce_types
def parse_generic_html_export(html_file: IO[str], root_url: Optional[str]=None, **_kwargs) -> Iterable[Link]:
"""Parse Generic HTML for href tags and use only the url (support for title coming later)"""
html_file.seek(0)
for line in html_file:
parser = HrefParser()
# example line
# <li><a href="http://example.com/ time_added="1478739709" tags="tag1,tag2">example title</a></li>
parser.feed(line)
for url in parser.urls:
if root_url:
# resolve relative urls /home.html -> https://example.com/home.html
url = urljoin(root_url, url)
for archivable_url in re.findall(URL_REGEX, url):
yield Link(
url=htmldecode(archivable_url),
timestamp=str(datetime.now().timestamp()),
title=None,
tags=None,
sources=[html_file.name],
)
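
A quick usage sketch for the new parser, assuming an environment where the archivebox package from this commit is importable (the file contents and names below are made up):

    from io import StringIO
    from archivebox.parsers.generic_html import parse_generic_html_export

    html = StringIO('<li><a href="/post/1">one</a></li>\n'
                    '<li><a href="https://other.example/x">two</a></li>\n')
    html.name = 'bookmarks.html'

    for link in parse_generic_html_export(html, root_url='https://example.com'):
        print(link.url)
    # https://example.com/post/1
    # https://other.example/x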


@@ -13,7 +13,7 @@ from ..util import (

 @enforce_types
-def parse_generic_json_export(json_file: IO[str]) -> Iterable[Link]:
+def parse_generic_json_export(json_file: IO[str], **_kwargs) -> Iterable[Link]:
     """Parse JSON-format bookmarks export files (produced by pinboard.in/export/, or wallabag)"""

     json_file.seek(0)


@@ -12,7 +12,7 @@ from ..util import (
 )

 @enforce_types
-def parse_generic_rss_export(rss_file: IO[str]) -> Iterable[Link]:
+def parse_generic_rss_export(rss_file: IO[str], **_kwargs) -> Iterable[Link]:
     """Parse RSS XML-format files into links"""

     rss_file.seek(0)


@@ -16,7 +16,7 @@ from ..util import (

 @enforce_types
-def parse_generic_txt_export(text_file: IO[str]) -> Iterable[Link]:
+def parse_generic_txt_export(text_file: IO[str], **_kwargs) -> Iterable[Link]:
     """Parse raw links from each line in a text file"""

     text_file.seek(0)


@@ -14,7 +14,7 @@ from ..util import (

 @enforce_types
-def parse_medium_rss_export(rss_file: IO[str]) -> Iterable[Link]:
+def parse_medium_rss_export(rss_file: IO[str], **_kwargs) -> Iterable[Link]:
     """Parse Medium RSS feed files into links"""

     rss_file.seek(0)


@@ -14,7 +14,7 @@ from ..util import (

 @enforce_types
-def parse_netscape_html_export(html_file: IO[str]) -> Iterable[Link]:
+def parse_netscape_html_export(html_file: IO[str], **_kwargs) -> Iterable[Link]:
     """Parse netscape-format bookmarks export files (produced by all browsers)"""

     html_file.seek(0)


@@ -14,7 +14,7 @@ from ..util import (

 @enforce_types
-def parse_pinboard_rss_export(rss_file: IO[str]) -> Iterable[Link]:
+def parse_pinboard_rss_export(rss_file: IO[str], **_kwargs) -> Iterable[Link]:
     """Parse Pinboard RSS feed files into links"""

     rss_file.seek(0)


@@ -14,7 +14,7 @@ from ..util import (

 @enforce_types
-def parse_pocket_html_export(html_file: IO[str]) -> Iterable[Link]:
+def parse_pocket_html_export(html_file: IO[str], **_kwargs) -> Iterable[Link]:
     """Parse Pocket-format bookmarks export files (produced by getpocket.com/export/)"""

     html_file.seek(0)


@@ -13,7 +13,7 @@ from ..util import (

 @enforce_types
-def parse_shaarli_rss_export(rss_file: IO[str]) -> Iterable[Link]:
+def parse_shaarli_rss_export(rss_file: IO[str], **_kwargs) -> Iterable[Link]:
     """Parse Shaarli-specific RSS XML-format files into links"""

     rss_file.seek(0)
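
Every parser gains **_kwargs for the same reason: run_parser_functions now calls each one as parser_func(to_parse, root_url=root_url), and only generic_html cares about root_url, so the others must accept and ignore it. The pattern in miniature (stand-in functions, made-up URLs):

    def parse_rss_like(f, **_kwargs):                   # ignores root_url
        return ['https://feed.example/item']

    def parse_html_like(f, root_url=None, **_kwargs):   # uses root_url
        return [root_url or 'https://fallback.example']

    for parser_func in (parse_rss_like, parse_html_like):
        print(parser_func(None, root_url='https://example.com'))
    # ['https://feed.example/item']
    # ['https://example.com']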


@@ -298,6 +298,10 @@
             <h5>Errors</h5>
             ❌ $num_failures
         </div>
+        <div class="info-chunk">
+            <h5>Size</h5>
+            $size
+        </div>
     </div>
     <div class="col-lg-4">
         <div class="info-chunk">
@@ -312,99 +316,100 @@
         </div>
     </div>
     <div class="row header-bottom-frames">
-        <div class="col-lg-2">
+        <div class="col-lg-3">
             <div class="card selected-card">
                 <iframe class="card-img-top" src="$archive_url" sandbox="allow-same-origin allow-scripts allow-forms" scrolling="no"></iframe>
                 <div class="card-body">
                     <a href="$archive_url" style="float:right" title="Open in new tab..." target="_blank" rel="noopener">
                         <img src="../../static/external.png" class="external"/>
                     </a>
-                    <a href="$archive_url" target="preview"><h4 class="card-title">Local Archive</h4></a>
+                    <a href="$archive_url" target="preview"><h4 class="card-title">Wget &gt; WARC</h4></a>
                     <p class="card-text">archive/$domain</p>
                 </div>
             </div>
         </div>
+        <div class="col-lg-3">
+            <div class="card">
+                <iframe class="card-img-top" src="$singlefile_path" sandbox="allow-same-origin allow-scripts allow-forms" scrolling="no"></iframe>
+                <div class="card-body">
+                    <a href="$singlefile_path" style="float:right" title="Open in new tab..." target="_blank" rel="noopener">
+                        <img src="../../static/external.png" class="external"/>
+                    </a>
+                    <a href="$singlefile_path" target="preview"><h4 class="card-title">Chrome &gt; SingleFile</h4></a>
+                    <p class="card-text">archive/singlefile.html</p>
+                </div>
+            </div>
+        </div>
-        <div class="col-lg-2">
+        <div class="col-lg-3">
             <div class="card">
-                <iframe class="card-img-top" src="$dom_path" sandbox="allow-same-origin allow-scripts allow-forms" scrolling="no"></iframe>
+                <iframe class="card-img-top" src="$archive_org_path" sandbox="allow-same-origin allow-scripts allow-forms" scrolling="no"></iframe>
                 <div class="card-body">
-                    <a href="$dom_path" style="float:right" title="Open in new tab..." target="_blank" rel="noopener">
+                    <a href="$archive_org_path" style="float:right" title="Open in new tab..." target="_blank" rel="noopener">
                         <img src="../../static/external.png" class="external"/>
                     </a>
-                    <a href="$dom_path" target="preview"><h4 class="card-title">HTML</h4></a>
-                    <p class="card-text">archive/output.html</p>
+                    <a href="$archive_org_path" target="preview"><h4 class="card-title">Archive.Org</h4></a>
+                    <p class="card-text">web.archive.org/web/...</p>
                 </div>
             </div>
         </div>
-        <div class="col-lg-2">
+        <div class="col-lg-3">
             <div class="card">
-                <iframe class="card-img-top" src="$singlefile_path" sandbox="allow-same-origin allow-scripts allow-forms" scrolling="no"></iframe>
+                <iframe class="card-img-top" src="$url" sandbox="allow-same-origin allow-scripts allow-forms" scrolling="no"></iframe>
                 <div class="card-body">
-                    <a href="$singlefile_path" style="float:right" title="Open in new tab..." target="_blank" rel="noopener">
+                    <a href="$url" style="float:right" title="Open in new tab..." target="_blank" rel="noopener">
                         <img src="../../static/external.png" class="external"/>
                     </a>
-                    <a href="$singlefile_path" target="preview"><h4 class="card-title">SingleFile</h4></a>
-                    <p class="card-text">archive/singlefile.html</p>
+                    <a href="$url" target="preview"><h4 class="card-title">Original</h4></a>
+                    <p class="card-text">$domain</p>
                 </div>
             </div>
         </div>
+        <br/>
-        <div class="col-lg-2">
+        <div class="col-lg-3">
             <div class="card">
-                <iframe class="card-img-top" src="$readability_path" sandbox="allow-same-origin allow-scripts allow-forms" scrolling="no"></iframe>
+                <iframe class="card-img-top pdf-frame" src="$pdf_path" scrolling="no"></iframe>
                 <div class="card-body">
-                    <a href="$readability_path" style="float:right" title="Open in new tab..." target="_blank" rel="noopener">
+                    <a href="$pdf_path" style="float:right" title="Open in new tab..." target="_blank" rel="noopener">
                         <img src="../../static/external.png" class="external"/>
                     </a>
-                    <a href="$readability_path" target="preview"><h4 class="card-title">Readability</h4></a>
-                    <p class="card-text">archive/readability/...</p>
+                    <a href="$pdf_path" target="preview" id="pdf-btn"><h4 class="card-title">Chrome &gt; PDF</h4></a>
+                    <p class="card-text">archive/output.pdf</p>
                 </div>
             </div>
         </div>
-        <div class="col-lg-2">
+        <div class="col-lg-3">
             <div class="card">
-                <iframe class="card-img-top pdf-frame" src="$pdf_path" scrolling="no"></iframe>
+                <img class="card-img-top screenshot" src="$screenshot_path"></iframe>
                 <div class="card-body">
-                    <a href="$pdf_path" style="float:right" title="Open in new tab..." target="_blank" rel="noopener">
+                    <a href="$screenshot_path" style="float:right" title="Open in new tab..." target="_blank" rel="noopener">
                         <img src="../../static/external.png" class="external"/>
                     </a>
-                    <a href="$pdf_path" target="preview" id="pdf-btn"><h4 class="card-title">PDF</h4></a>
-                    <p class="card-text">archive/output.pdf</p>
+                    <a href="$screenshot_path" target="preview"><h4 class="card-title">Chrome &gt; Screenshot</h4></a>
+                    <p class="card-text">archive/screenshot.png</p>
                 </div>
             </div>
         </div>
-        <div class="col-lg-2">
+        <div class="col-lg-3">
             <div class="card">
-                <img class="card-img-top screenshot" src="$screenshot_path"></iframe>
+                <iframe class="card-img-top" src="$dom_path" sandbox="allow-same-origin allow-scripts allow-forms" scrolling="no"></iframe>
                 <div class="card-body">
-                    <a href="$screenshot_path" style="float:right" title="Open in new tab..." target="_blank" rel="noopener">
+                    <a href="$dom_path" style="float:right" title="Open in new tab..." target="_blank" rel="noopener">
                         <img src="../../static/external.png" class="external"/>
                     </a>
-                    <a href="$screenshot_path" target="preview"><h4 class="card-title">Screenshot</h4></a>
-                    <p class="card-text">archive/screenshot.png</p>
+                    <a href="$dom_path" target="preview"><h4 class="card-title">Chrome &gt; HTML</h4></a>
+                    <p class="card-text">archive/output.html</p>
                 </div>
             </div>
         </div>
-        <div class="col-lg-2">
+        <div class="col-lg-3">
             <div class="card">
-                <iframe class="card-img-top" src="$archive_org_path" sandbox="allow-same-origin allow-scripts allow-forms" scrolling="no"></iframe>
+                <iframe class="card-img-top" src="$readability_path" sandbox="allow-same-origin allow-scripts allow-forms" scrolling="no"></iframe>
                 <div class="card-body">
-                    <a href="$archive_org_path" style="float:right" title="Open in new tab..." target="_blank" rel="noopener">
+                    <a href="$readability_path" style="float:right" title="Open in new tab..." target="_blank" rel="noopener">
                         <img src="../../static/external.png" class="external"/>
                     </a>
-                    <a href="$archive_org_path" target="preview"><h4 class="card-title">Archive.Org</h4></a>
-                    <p class="card-text">web.archive.org/web/...</p>
+                    <a href="$readability_path" target="preview"><h4 class="card-title">Readability</h4></a>
+                    <p class="card-text">archive/readability/...</p>
                 </div>
             </div>
         </div>
-        <div class="col-lg-2">
-            <div class="card">
-                <iframe class="card-img-top" src="$url" sandbox="allow-same-origin allow-scripts allow-forms" scrolling="no"></iframe>
-                <div class="card-body">
-                    <a href="$url" style="float:right" title="Open in new tab..." target="_blank" rel="noopener">
-                        <img src="../../static/external.png" class="external"/>
-                    </a>
-                    <a href="$url" target="preview"><h4 class="card-title">Original</h4></a>
-                    <p class="card-text">$domain</p>
-                </div>
-            </div>
-        </div>
     </div>