ArchiveBox/archivebox/core/utils.py

from django.utils.html import format_html

from core.models import Snapshot, EXTRACTORS


def get_icons(snapshot: Snapshot) -> str:
    """Render the row of output icons for *snapshot* as an HTML fragment.

    One ``<a>`` element is emitted per entry in ``EXTRACTORS`` (except those
    listed in ``exclude``), linking to that extractor's canonical output under
    the snapshot's archive path. Each link carries an ``exists-True`` /
    ``exists-False`` CSS class reflecting whether the snapshot has an
    ArchiveResult for that extractor with ``status="succeeded"``.

    A WARC link is appended right after the wget link and shares wget's
    ``exists`` flag, because warc is not iterated as its own extractor here.

    Extractors that have no ``<name>_path`` key in the canonical outputs are
    skipped.

    Returns:
        A safe HTML string: a ``<span class="files-icons">`` wrapping the links.
    """
    # Local import so this block does not depend on a change to the file's imports.
    from django.utils.safestring import mark_safe

    link = snapshot.as_link()
    canon = link.canonical_outputs()
    path = link.archive_path
    anchor_template = '<a href="/{}/{}" class="exists-{}" title="{}">{} </a>'
    icons = {
        "singlefile": "❶",
        "wget": "🆆",
        "dom": "🅷",
        "pdf": "📄",
        "screenshot": "💻",
        "media": "📼",
        "git": "🅶",
        "archive_org": "🏛",
        "readability": "🆁",
        "mercury": "🅼",
        "warc": "📦",
    }
    exclude = ["favicon"]

    # Fetch every succeeded extractor name in a single query instead of
    # issuing one .filter(...).exists() query per extractor.
    succeeded = set(
        snapshot.archiveresult_set
        .filter(status="succeeded")
        .values_list("extractor", flat=True)
    )

    anchors = []
    for extractor, _ in EXTRACTORS:
        exists = extractor in succeeded

        names = []
        if extractor not in exclude:
            names.append(extractor)
        if extractor == "wget":
            # warc isn't technically its own extractor, so it is added after
            # wget and reuses wget's success flag.
            names.append("warc")

        for name in names:
            try:
                target = canon[f"{name}_path"]
            except KeyError:
                # No canonical output is defined for this name: nothing to link to.
                continue
            # format_html() escapes every argument, so paths containing HTML
            # metacharacters or braces cannot break the markup or the template.
            anchors.append(
                format_html(anchor_template, path, target, exists, name, icons.get(name, "?"))
            )

    # Each anchor is already escaped and marked safe by format_html(); joining
    # them yields a plain str, so re-mark the joined fragment as safe.
    return format_html(
        '<span class="files-icons" style="font-size: 1.2em; opacity: 0.8">{}</span>',
        mark_safe("".join(anchors)),
    )

#def get_icons(snapshot: Snapshot) -> str:
#    link = snapshot.as_link()
#    canon = link.canonical_outputs()
#    out_dir = Path(link.link_dir)
#
#    # slow version: highlights icons based on whether files exist or not for that output
#    # link_tuple = lambda link, method: (link.archive_path, canon[method] or '', canon[method] and (out_dir / (canon[method] or 'notdone')).exists())
#    # "fast version" (NOTE: the lambda below is identical to the slow version above and still checks the filesystem for each output)
#    link_tuple = lambda link, method: (link.archive_path, canon[method] or '', canon[method] and (out_dir / (canon[method] or 'notdone')).exists())
#
#    return format_html(
#            '<span class="files-icons" style="font-size: 1.2em; opacity: 0.8">'
#                '<a href="/{}/{}" class="exists-{}" title="SingleFile">❶ </a>'
#                '<a href="/{}/{}" class="exists-{}" title="Wget clone">🆆 </a> '
#                '<a href="/{}/{}" class="exists-{}" title="HTML dump">🅷 </a> '
#                '<a href="/{}/{}" class="exists-{}" title="PDF">📄 </a> '
#                '<a href="/{}/{}" class="exists-{}" title="Screenshot">💻 </a> '
#                '<a href="/{}/{}" class="exists-{}" title="WARC">📦 </a> '
#                '<a href="/{}/{}/" class="exists-{}" title="Media files">📼 </a> '
#                '<a href="/{}/{}/" class="exists-{}" title="Git repos">🅶 </a> '
#                '<a href="{}" class="exists-{}" title="Archive.org snapshot">🏛 </a> '
#            '</span>',
#            *link_tuple(link, 'singlefile_path'),
#            *link_tuple(link, 'wget_path')[:2], any((out_dir / link.domain).glob('*')),
#            *link_tuple(link, 'pdf_path'),
#            *link_tuple(link, 'screenshot_path'),
#            *link_tuple(link, 'dom_path'),
#            *link_tuple(link, 'warc_path')[:2], any((out_dir / canon['warc_path']).glob('*.warc.gz')),
#            *link_tuple(link, 'media_path')[:2], any((out_dir / canon['media_path']).glob('*')),
#            *link_tuple(link, 'git_path')[:2], any((out_dir / canon['git_path']).glob('*')),
#            canon['archive_org_path'], (out_dir / 'archive.org.txt').exists(),
#        )
#
add icons to new public view 2020-08-26 05:35:06 +12:00			`from django.utils.html import format_html`

feat: Create ArchiveResult after finishing an extractor process 2020-11-05 05:22:55 +13:00			`from core.models import Snapshot, EXTRACTORS`
add icons to new public view 2020-08-26 05:35:06 +12:00

			`def get_icons(snapshot: Snapshot) -> str:`
feat: initial functional version with icons calculated based on archive results 2020-11-05 04:31:20 +13:00			`archive_results = snapshot.archiveresult_set`
add icons to new public view 2020-08-26 05:35:06 +12:00			`link = snapshot.as_link()`
			`canon = link.canonical_outputs()`
feat: initial functional version with icons calculated based on archive results 2020-11-05 04:31:20 +13:00			`output = ""`
			`output_template = '<a href="/{}/{}" class="exists-{}" title="{}">{} </a>'`
			`icons = {`
			`"singlefile": "❶",`
			`"wget": "🆆",`
			`"dom": "🅷",`
			`"pdf": "📄",`
			`"screenshot": "💻",`
			`"media": "📼",`
			`"git": "🅶",`
			`"archive_org": "🏛",`
			`"readability": "🆁",`
			`"mercury": "🅼",`
feat: Add warc to list and limit check to succeeded archive results 2020-11-06 01:54:40 +13:00			`"warc": "📦"`
feat: initial functional version with icons calculated based on archive results 2020-11-05 04:31:20 +13:00			`}`
			`exclude = ["favicon"]`
			`# Missing specific entry for WARC`

refactor: Unpack extractors tuple instead of using the index to access the relevant information 2020-11-11 06:38:29 +13:00			`for extractor, _ in EXTRACTORS:`
			`result = archive_results.filter(extractor=extractor, status="succeeded")`
			`path, exists = link.archive_path, result.exists()`
feat: initial functional version with icons calculated based on archive results 2020-11-05 04:31:20 +13:00			`try:`
refactor: Unpack extractors tuple instead of using the index to access the relevant information 2020-11-11 06:38:29 +13:00			`if extractor not in exclude:`
			`output += output_template.format(path, canon[f"{extractor}_path"],`
			`exists, extractor, icons.get(extractor, "?"))`
			`if extractor == "wget":`
			`# warc isn't technically it's own extractor, so we have to add it after wget`

feat: Finish reversal. Add ArchiveResults that are not found in the index.json 2020-11-13 04:30:41 +13:00			`output += output_template.format(path, canon["warc_path"],`
refactor: Unpack extractors tuple instead of using the index to access the relevant information 2020-11-11 06:38:29 +13:00			`exists, "warc", icons.get("warc", "?"))`

feat: initial functional version with icons calculated based on archive results 2020-11-05 04:31:20 +13:00			`except Exception as e:`
			`print(e)`

			`return format_html(f'<span class="files-icons" style="font-size: 1.2em; opacity: 0.8">{output}<span>')`

			`#def get_icons(snapshot: Snapshot) -> str:`
			`# link = snapshot.as_link()`
			`# canon = link.canonical_outputs()`
			`# out_dir = Path(link.link_dir)`
			`#`
			`# # slow version: highlights icons based on whether files exist or not for that output`
			`# # link_tuple = lambda link, method: (link.archive_path, canon[method] or '', canon[method] and (out_dir / (canon[method] or 'notdone')).exists())`
			`# # fast version: all icons are highlighted without checking for outputs in filesystem`
			`# link_tuple = lambda link, method: (link.archive_path, canon[method] or '', canon[method] and (out_dir / (canon[method] or 'notdone')).exists())`
			`#`
			`# return format_html(`
			`# '<span class="files-icons" style="font-size: 1.2em; opacity: 0.8">'`
			`# '<a href="/{}/{}" class="exists-{}" title="SingleFile">❶ </a>'`
			`# '<a href="/{}/{}" class="exists-{}" title="Wget clone">🆆 </a> '`
			`# '<a href="/{}/{}" class="exists-{}" title="HTML dump">🅷 </a> '`
			`# '<a href="/{}/{}" class="exists-{}" title="PDF">📄 </a> '`
			`# '<a href="/{}/{}" class="exists-{}" title="Screenshot">💻 </a> '`
			`# '<a href="/{}/{}" class="exists-{}" title="WARC">📦 </a> '`
			`# '<a href="/{}/{}/" class="exists-{}" title="Media files">📼 </a> '`
			`# '<a href="/{}/{}/" class="exists-{}" title="Git repos">🅶 </a> '`
			`# '<a href="{}" class="exists-{}" title="Archive.org snapshot">🏛 </a> '`
			`# '</span>',`
			`# *link_tuple(link, 'singlefile_path'),`
			`# link_tuple(link, 'wget_path')[:2], any((out_dir / link.domain).glob('')),`
			`# *link_tuple(link, 'pdf_path'),`
			`# *link_tuple(link, 'screenshot_path'),`
			`# *link_tuple(link, 'dom_path'),`
			`# link_tuple(link, 'warc_path')[:2], any((out_dir / canon['warc_path']).glob('.warc.gz')),`
			`# link_tuple(link, 'media_path')[:2], any((out_dir / canon['media_path']).glob('')),`
			`# link_tuple(link, 'git_path')[:2], any((out_dir / canon['git_path']).glob('')),`
			`# canon['archive_org_path'], (out_dir / 'archive.org.txt').exists(),`
			`# )`
feat: Create ArchiveResult after finishing an extractor process 2020-11-05 05:22:55 +13:00			`#`