
load EXTRACTORS dynamically using importlib.import_module

Nick Sweeting 2024-05-11 22:28:59 -07:00
parent c7f55fc3ba
commit 457c42bf84
18 changed files with 198 additions and 40 deletions

View file

@@ -17,8 +17,6 @@ except AttributeError:
 def forwards_func(apps, schema_editor):
-    from core.models import EXTRACTORS
 
     Snapshot = apps.get_model("core", "Snapshot")
    ArchiveResult = apps.get_model("core", "ArchiveResult")

View file

@@ -6,6 +6,7 @@ import json
 from pathlib import Path
 from typing import Optional, List
+from importlib import import_module
 
 from django.db import models
 from django.utils.functional import cached_property
@@ -20,9 +21,9 @@ from ..system import get_dir_size
 from ..util import parse_date, base_url, hashurl
 from ..index.schema import Link
 from ..index.html import snapshot_icons
-from ..extractors import get_default_archive_methods, ARCHIVE_METHODS_INDEXING_PRECEDENCE
+from ..extractors import get_default_archive_methods, ARCHIVE_METHODS_INDEXING_PRECEDENCE, EXTRACTORS
 
-EXTRACTORS = [(extractor[0], extractor[0]) for extractor in get_default_archive_methods()]
+EXTRACTOR_CHOICES = [(extractor_name, extractor_name) for extractor_name in EXTRACTORS.keys()]
 
 STATUS_CHOICES = [
     ("succeeded", "succeeded"),
     ("failed", "failed"),
@@ -267,11 +268,13 @@ class ArchiveResultManager(models.Manager):
 
 class ArchiveResult(models.Model):
+    EXTRACTOR_CHOICES = EXTRACTOR_CHOICES
+
     id = models.AutoField(primary_key=True, serialize=False, verbose_name='ID')
     uuid = models.UUIDField(default=uuid.uuid4, editable=False)
     snapshot = models.ForeignKey(Snapshot, on_delete=models.CASCADE)
-    extractor = models.CharField(choices=EXTRACTORS, max_length=32)
+    extractor = models.CharField(choices=EXTRACTOR_CHOICES, max_length=32)
     cmd = JSONField()
     pwd = models.CharField(max_length=256)
     cmd_version = models.CharField(max_length=128, default=None, null=True, blank=True)
@@ -284,3 +287,34 @@
 
     def __str__(self):
         return self.extractor
+
+    @cached_property
+    def snapshot_dir(self):
+        return Path(self.snapshot.link_dir)
+
+    @property
+    def extractor_module(self):
+        return EXTRACTORS[self.extractor]
+
+    def output_path(self) -> str:
+        """return the canonical output filename or directory name within the snapshot dir"""
+        return self.extractor_module.get_output_path()
+
+    def embed_path(self) -> str:
+        """
+        return the actual runtime-calculated path to the file on-disk that
+        should be used for user-facing iframe embeds of this result
+        """
+        if hasattr(self.extractor_module, 'get_embed_path'):
+            return self.extractor_module.get_embed_path(self)
+
+        return self.extractor_module.get_output_path()
+
+    def legacy_output_path(self):
+        link = self.snapshot.as_link()
+        return link.canonical_outputs().get(f'{self.extractor}_path')
+
+    def output_exists(self) -> bool:
+        return Path(self.output_path()).exists()
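
Taken together, these new properties let the ORM layer delegate all path logic to the extractor modules themselves. A rough sketch of how they behave from an `archivebox shell` session (the 'git' row and repo name here are hypothetical, not from this commit):

    from core.models import ArchiveResult

    result = ArchiveResult.objects.filter(extractor='git').first()
    result.extractor_module      # the archivebox.extractors.git module, via EXTRACTORS[...]
    result.output_path()         # 'git/', the module's canonical output location
    result.embed_path()          # e.g. 'git/example-repo/', via the module's get_embed_path()
    result.legacy_output_path()  # old behavior: looked up from Link.canonical_outputs()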

View file

@@ -1,11 +1,13 @@
 __package__ = 'archivebox.extractors'
 
+from typing import Callable, Optional, Dict, List, Iterable, Union, Protocol, cast
+
 import os
 import sys
 from pathlib import Path
-from typing import Callable, Optional, List, Iterable, Union
+from importlib import import_module
 from datetime import datetime, timezone
 
 from django.db.models import QuerySet
 
 from ..config import (
@@ -240,3 +242,37 @@ def archive_links(all_links: Union[Iterable[Link], QuerySet], overwrite: bool=Fa
 
     log_archiving_finished(num_links)
     return all_links
+
+
+EXTRACTORS_DIR = Path(__file__).parent
+
+
+class ExtractorModuleProtocol(Protocol):
+    """Type interface for an Extractor Module (WIP)"""
+
+    get_output_path: Callable
+
+    # TODO:
+    # get_embed_path: Callable | None
+    # should_extract(Snapshot)
+    # extract(Snapshot)
+
+
+def get_extractors(dir: Path=EXTRACTORS_DIR) -> Dict[str, ExtractorModuleProtocol]:
+    """iterate through archivebox/extractors/*.py and load extractor modules"""
+    EXTRACTORS = {}
+
+    for filename in EXTRACTORS_DIR.glob('*.py'):
+        if filename.name.startswith('__'):
+            continue
+
+        extractor_name = filename.name.replace('.py', '')
+
+        extractor_module = cast(ExtractorModuleProtocol, import_module(f'.{extractor_name}', package=__package__))
+
+        assert getattr(extractor_module, 'get_output_path')
+        EXTRACTORS[extractor_name] = extractor_module
+
+    return EXTRACTORS
+
+EXTRACTORS = get_extractors(EXTRACTORS_DIR)
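
The result is a name-to-module registry built from the *.py filenames at import time, so dropping a new extractor file into the package is enough to register it. A hedged sketch of what the registry exposes (the exact key list depends on the files present in archivebox/extractors/):

    from archivebox.extractors import EXTRACTORS

    sorted(EXTRACTORS.keys())                       # ['archive_org', 'dom', 'favicon', 'git', 'headers', ...]
    EXTRACTORS['git'].get_output_path()             # 'git/'
    hasattr(EXTRACTORS['pdf'], 'get_embed_path')    # False: pdf's canonical output is embeddable as-is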

View file

@@ -24,6 +24,8 @@ from ..config import (
 )
 from ..logging_util import TimedProgress
 
+def get_output_path():
+    return 'archive.org.txt'
 
 @enforce_types
@@ -32,7 +34,7 @@ def should_save_archive_dot_org(link: Link, out_dir: Optional[Path]=None, overwr
         return False
 
     out_dir = out_dir or Path(link.link_dir)
-    if not overwrite and (out_dir / 'archive.org.txt').exists():
+    if not overwrite and (out_dir / get_output_path()).exists():
         # if open(path, 'r', encoding='utf-8').read().strip() != 'None':
         return False
@@ -43,7 +45,7 @@ def save_archive_dot_org(link: Link, out_dir: Optional[Path]=None, timeout: int=
     """submit site to archive.org for archiving via their service, save returned archive url"""
 
     out_dir = out_dir or Path(link.link_dir)
-    output: ArchiveOutput = 'archive.org.txt'
+    output: ArchiveOutput = get_output_path()
     archive_org_url = None
     submit_url = 'https://web.archive.org/save/{}'.format(link.url)
     # later options take precedence
@@ -88,7 +90,7 @@ def save_archive_dot_org(link: Link, out_dir: Optional[Path]=None, timeout: int=
         archive_org_url = archive_org_url or submit_url
         with open(str(out_dir / output), 'w', encoding='utf-8') as f:
             f.write(archive_org_url)
-        chmod_file('archive.org.txt', cwd=str(out_dir))
+        chmod_file(str(out_dir / output), cwd=str(out_dir))
         output = archive_org_url
 
     return ArchiveResult(
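
The same mechanical refactor repeats across the remaining extractor modules below: the output filename that used to appear as a string literal in should_save_*(), save_*(), and the chmod/atomic_write calls now comes from a single get_output_path() function, which is also what ArchiveResult.output_path() reaches through the registry. The pattern in outline, using archive.org's filename as the example:

    def get_output_path():
        return 'archive.org.txt'

    output = get_output_path()                  # was: output = 'archive.org.txt'
    (out_dir / get_output_path()).exists()      # was: (out_dir / 'archive.org.txt').exists()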

View file

@@ -19,6 +19,9 @@ from ..config import (
 )
 from ..logging_util import TimedProgress
 
+def get_output_path():
+    return 'output.html'
+
 
 @enforce_types
 def should_save_dom(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
@@ -26,8 +29,8 @@ def should_save_dom(link: Link, out_dir: Optional[Path]=None, overwrite: Optiona
         return False
 
     out_dir = out_dir or Path(link.link_dir)
-    if not overwrite and (out_dir / 'output.html').exists():
-        if (out_dir / 'output.html').stat().st_size > 1:
+    if not overwrite and (out_dir / get_output_path()).exists():
+        if (out_dir / get_output_path()).stat().st_size > 1:
             return False
 
     return SAVE_DOM
@@ -37,7 +40,7 @@ def save_dom(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) ->
     """print HTML of site to file using chrome --dump-html"""
 
     out_dir = out_dir or Path(link.link_dir)
-    output: ArchiveOutput = 'output.html'
+    output: ArchiveOutput = get_output_path()
     output_path = out_dir / output
     cmd = [
         *chrome_args(),

View file

@@ -8,8 +8,8 @@ from ..index.schema import Link, ArchiveResult, ArchiveOutput
 from ..system import chmod_file, run
 from ..util import (
     enforce_types,
-    domain,
-    dedupe,
+    domain,
+    dedupe,
 )
 from ..config import (
     TIMEOUT,
@@ -33,6 +33,11 @@ def should_save_favicon(link: Link, out_dir: Optional[str]=None, overwrite: Opti
     return SAVE_FAVICON
 
+
+@enforce_types
+def get_output_path():
+    return 'favicon.ico'
+
 
 @enforce_types
 def save_favicon(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
     """download site favicon from google's favicon api"""

View file

@@ -26,6 +26,19 @@ from ..config import (
 )
 from ..logging_util import TimedProgress
 
+def get_output_path():
+    return 'git/'
+
+def get_embed_path(archiveresult=None):
+    if not archiveresult:
+        return get_output_path()
+
+    try:
+        return get_output_path() + list((archiveresult.snapshot_dir / get_output_path()).glob('*'))[0].name + '/'
+    except IndexError:
+        pass
+
+    return get_output_path()
 
 @enforce_types
 def should_save_git(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
@@ -33,7 +46,7 @@ def should_save_git(link: Link, out_dir: Optional[Path]=None, overwrite: Optiona
         return False
 
     out_dir = out_dir or Path(link.link_dir)
-    if not overwrite and (out_dir / 'git').exists():
+    if not overwrite and (out_dir / get_output_path()).exists():
         return False
 
     is_clonable_url = (
@@ -51,7 +64,7 @@ def save_git(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) ->
     """download full site using git"""
 
     out_dir = out_dir or Path(link.link_dir)
-    output: ArchiveOutput = 'git'
+    output: ArchiveOutput = get_output_path()
     output_path = out_dir / output
     output_path.mkdir(exist_ok=True)
     cmd = [
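
git is the first extractor above to define the optional get_embed_path() hook: a clone lands in a subdirectory named after the repo, so the embeddable path is the first entry inside git/ rather than git/ itself. A sketch of the lookup it performs (the snapshot path and repo name here are hypothetical):

    from pathlib import Path

    snapshot_dir = Path('archive/1715480939')              # hypothetical snapshot dir
    repo_dir = list((snapshot_dir / 'git/').glob('*'))[0]  # first (usually only) cloned repo
    embed_path = 'git/' + repo_dir.name + '/'              # e.g. 'git/example-repo/'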

View file

@@ -23,10 +23,14 @@ from ..config import (
 )
 from ..logging_util import TimedProgress
 
+def get_output_path():
+    return 'headers.json'
+
+
 @enforce_types
 def should_save_headers(link: Link, out_dir: Optional[str]=None, overwrite: Optional[bool]=False) -> bool:
     out_dir = out_dir or Path(link.link_dir)
-    if not overwrite and (out_dir / 'headers.json').exists():
+    if not overwrite and (out_dir / get_output_path()).exists():
         return False
 
     return SAVE_HEADERS
@@ -38,7 +42,7 @@ def save_headers(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT)
     out_dir = Path(out_dir or link.link_dir)
     output_folder = out_dir.absolute()
-    output: ArchiveOutput = 'headers.json'
+    output: ArchiveOutput = get_output_path()
 
     status = 'succeeded'
     timer = TimedProgress(timeout, prefix='    ')
@@ -59,7 +63,7 @@ def save_headers(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT)
     try:
         json_headers = get_headers(link.url, timeout=timeout)
         output_folder.mkdir(exist_ok=True)
-        atomic_write(str(output_folder / "headers.json"), json_headers)
+        atomic_write(str(output_folder / get_output_path()), json_headers)
     except (Exception, OSError) as err:
         status = 'failed'
         output = err

View file

@@ -19,6 +19,12 @@ from ..util import (
 )
 from .title import get_html
 
+def get_output_path():
+    return "htmltotext.txt"
+
+
 
 class HTMLTextExtractor(HTMLParser):
     TEXT_ATTRS = [
         "alt", "cite", "href", "label",
@@ -109,7 +115,7 @@ def should_save_htmltotext(link: Link, out_dir: Optional[Path]=None, overwrite:
         return False
 
     out_dir = out_dir or Path(link.link_dir)
-    if not overwrite and (out_dir / 'htmltotext.txt').exists():
+    if not overwrite and (out_dir / get_output_path()).exists():
         return False
 
     return SAVE_HTMLTOTEXT
@@ -120,7 +126,7 @@ def save_htmltotext(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO
     """extract search-indexing-friendly text from an HTML document"""
 
     out_dir = Path(out_dir or link.link_dir)
-    output = "htmltotext.txt"
+    output = get_output_path()
 
     cmd = ['(internal) archivebox.extractors.htmltotext', './{singlefile,dom}.html']
     timer = TimedProgress(timeout, prefix='    ')

View file

@@ -22,13 +22,27 @@ from ..config import (
 )
 from ..logging_util import TimedProgress
 
+def get_output_path():
+    return 'media/'
+
+def get_embed_path(archiveresult=None):
+    if not archiveresult:
+        return get_output_path()
+
+    out_dir = archiveresult.snapshot_dir / get_output_path()
+    try:
+        return get_output_path() + list(out_dir.glob('*.mp4'))[0].name
+    except IndexError:
+        return get_output_path()
+
 
 @enforce_types
 def should_save_media(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
     if is_static_file(link.url):
         return False
 
     out_dir = out_dir or Path(link.link_dir)
-    if not overwrite and (out_dir / 'media').exists():
+    if not overwrite and (out_dir / get_output_path()).exists():
         return False
 
     return SAVE_MEDIA
@@ -38,7 +52,7 @@ def save_media(link: Link, out_dir: Optional[Path]=None, timeout: int=MEDIA_TIME
     """Download playlists or individual video, audio, and subtitles using youtube-dl or yt-dlp"""
 
     out_dir = out_dir or Path(link.link_dir)
-    output: ArchiveOutput = 'media'
+    output: ArchiveOutput = get_output_path()
     output_path = out_dir / output
     output_path.mkdir(exist_ok=True)
     # later options take precedence
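
media/ gets a get_embed_path() hook similar to git's, but instead of the first directory entry it looks for the first *.mp4 inside media/ so the UI can embed a playable file, falling back to the folder itself when youtube-dl/yt-dlp produced no video. Roughly (paths and filenames hypothetical):

    from pathlib import Path

    out_dir = Path('archive/1715480939/media')               # hypothetical snapshot media dir
    try:
        embed = 'media/' + list(out_dir.glob('*.mp4'))[0].name   # e.g. 'media/talk.mp4'
    except IndexError:
        embed = 'media/'                                     # no mp4 found: embed the directory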

View file

@@ -24,6 +24,12 @@ from ..config import (
 )
 from ..logging_util import TimedProgress
 
+def get_output_path():
+    return 'mercury/'
+
+def get_embed_path(archiveresult=None):
+    return get_output_path() + 'content.html'
+
 
 @enforce_types
 def ShellError(cmd: List[str], result: CompletedProcess, lines: int=20) -> ArchiveError:
@@ -44,7 +50,7 @@ def should_save_mercury(link: Link, out_dir: Optional[str]=None, overwrite: Opti
         return False
 
     out_dir = out_dir or Path(link.link_dir)
-    if not overwrite and (out_dir / 'mercury').exists():
+    if not overwrite and (out_dir / get_output_path()).exists():
         return False
 
     return SAVE_MERCURY
@@ -55,8 +61,8 @@ def save_mercury(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT)
     """download reader friendly version using @postlight/mercury-parser"""
 
     out_dir = Path(out_dir or link.link_dir)
-    output_folder = out_dir.absolute() / "mercury"
-    output = "mercury"
+    output_folder = out_dir.absolute() / get_output_path()
+    output = get_output_path()
 
     status = 'succeeded'
     timer = TimedProgress(timeout, prefix='    ')

View file

@@ -19,13 +19,17 @@ from ..config import (
 )
 from ..logging_util import TimedProgress
 
+def get_output_path():
+    return 'output.pdf'
+
+
 @enforce_types
 def should_save_pdf(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
     if is_static_file(link.url):
         return False
 
     out_dir = out_dir or Path(link.link_dir)
-    if not overwrite and (out_dir / 'output.pdf').exists():
+    if not overwrite and (out_dir / get_output_path()).exists():
         return False
 
     return SAVE_PDF
@@ -36,7 +40,7 @@ def save_pdf(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) ->
     """print PDF of site to file using chrome --headless"""
 
     out_dir = out_dir or Path(link.link_dir)
-    output: ArchiveOutput = 'output.pdf'
+    output: ArchiveOutput = get_output_path()
     cmd = [
         *chrome_args(),
         '--print-to-pdf',
@@ -51,7 +55,7 @@ def save_pdf(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) ->
             hints = (result.stderr or result.stdout).decode()
             raise ArchiveError('Failed to save PDF', hints)
 
-        chmod_file('output.pdf', cwd=str(out_dir))
+        chmod_file(get_output_path(), cwd=str(out_dir))
     except Exception as err:
         status = 'failed'
         output = err

View file

@@ -22,6 +22,12 @@ from ..config import (
 from ..logging_util import TimedProgress
 from .title import get_html
 
+def get_output_path():
+    return 'readability/'
+
+def get_embed_path(archiveresult=None):
+    return get_output_path() + 'content.html'
+
 
 @enforce_types
 def should_save_readability(link: Link, out_dir: Optional[str]=None, overwrite: Optional[bool]=False) -> bool:
@@ -29,7 +35,7 @@ def should_save_readability(link: Link, out_dir: Optional[str]=None, overwrite:
         return False
 
     out_dir = out_dir or Path(link.link_dir)
-    if not overwrite and (out_dir / 'readability').exists():
+    if not overwrite and (out_dir / get_output_path()).exists():
         return False
 
     return SAVE_READABILITY
@@ -40,8 +46,8 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO
     """download reader friendly version using @mozilla/readability"""
 
     out_dir = Path(out_dir or link.link_dir)
-    output_folder = out_dir.absolute() / "readability"
-    output = "readability"
+    output_folder = out_dir.absolute() / get_output_path()
+    output = get_output_path()
 
     # Readability Docs: https://github.com/mozilla/readability

View file

@@ -19,6 +19,9 @@ from ..config import (
 )
 from ..logging_util import TimedProgress
 
+def get_output_path():
+    return 'screenshot.png'
+
 
 @enforce_types
 def should_save_screenshot(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
@@ -26,7 +29,7 @@ def should_save_screenshot(link: Link, out_dir: Optional[Path]=None, overwrite:
         return False
 
     out_dir = out_dir or Path(link.link_dir)
-    if not overwrite and (out_dir / 'screenshot.png').exists():
+    if not overwrite and (out_dir / get_output_path()).exists():
         return False
 
     return SAVE_SCREENSHOT
@@ -36,7 +39,7 @@ def save_screenshot(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO
     """take screenshot of site using chrome --headless"""
 
     out_dir = out_dir or Path(link.link_dir)
-    output: ArchiveOutput = 'screenshot.png'
+    output: ArchiveOutput = get_output_path()
     cmd = [
         *chrome_args(),
         '--screenshot',

View file

@@ -26,13 +26,17 @@ from ..config import (
 )
 from ..logging_util import TimedProgress
 
+def get_output_path():
+    return 'singlefile.html'
+
+
 @enforce_types
 def should_save_singlefile(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
     if is_static_file(link.url):
         return False
 
     out_dir = out_dir or Path(link.link_dir)
-    if not overwrite and (out_dir / 'singlefile.html').exists():
+    if not overwrite and (out_dir / get_output_path()).exists():
         return False
 
     return SAVE_SINGLEFILE
@@ -43,7 +47,7 @@ def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO
     """download full site using single-file"""
 
     out_dir = out_dir or Path(link.link_dir)
-    output = "singlefile.html"
+    output = get_output_path()
 
     browser_args = chrome_args(CHROME_TIMEOUT=0)

View file

@@ -60,6 +60,7 @@ class TitleParser(HTMLParser):
         if tag.lower() == "title":
             self.inside_title_tag = False
 
+@enforce_types
 def get_html(link: Link, path: Path, timeout: int=TIMEOUT) -> str:
     """
@@ -84,6 +85,13 @@ def get_html(link: Link, path: Path, timeout: int=TIMEOUT) -> str:
     else:
         return document
 
+
+def get_output_path():
+    # TODO: actually save title to this file
+    # (currently only saved in ArchiveResult.output as charfield value, not saved to filesystem)
+    return 'title.json'
+
+
 @enforce_types
 def should_save_title(link: Link, out_dir: Optional[str]=None, overwrite: Optional[bool]=False) -> bool:
     # if link already has valid title, skip it

View file

@@ -35,6 +35,18 @@ from ..config import (
 )
 from ..logging_util import TimedProgress
 
+def get_output_path():
+    # TODO: actually save output into this folder, instead of do {domain}/**/index.html
+    return 'wget/'
+
+
+def get_embed_path(archiveresult=None):
+    if not archiveresult:
+        return get_output_path()
+
+    link = archiveresult.snapshot.as_link()
+    return wget_output_path(link)
+
 
 @enforce_types
 def should_save_wget(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
     output_path = wget_output_path(link)
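
wget is the odd one out: as its TODO notes, output actually lands at {domain}/**/index.html rather than inside a wget/ folder, so get_embed_path() recomputes the real on-disk location from the link via the existing wget_output_path() helper. A loose approximation of what that helper derives for a simple URL (the real one also handles query strings, file extensions, and restricted filenames):

    from urllib.parse import urlparse

    def approx_wget_output_path(url):
        # simplified stand-in for archivebox.util.wget_output_path, for illustration only
        parts = urlparse(url)
        path = parts.path.strip('/')
        return f'{parts.netloc}/{path}/index.html' if path else f'{parts.netloc}/index.html'

    approx_wget_output_path('https://example.com/blog/post')   # 'example.com/blog/post/index.html'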

View file

@@ -121,7 +121,7 @@ def snapshot_icons(snapshot) -> str:
     cache_key = f'{snapshot.id}-{(snapshot.updated or snapshot.added).timestamp()}-snapshot-icons'
 
     def calc_snapshot_icons():
-        from core.models import EXTRACTORS
+        from core.models import EXTRACTOR_CHOICES
         # start = datetime.now(timezone.utc)
 
         archive_results = snapshot.archiveresult_set.filter(status="succeeded", output__isnull=False)
@@ -147,12 +147,12 @@ def snapshot_icons(snapshot) -> str:
         # Missing specific entry for WARC
 
         extractor_outputs = defaultdict(lambda: None)
-        for extractor, _ in EXTRACTORS:
+        for extractor, _ in EXTRACTOR_CHOICES:
             for result in archive_results:
                 if result.extractor == extractor and result:
                     extractor_outputs[extractor] = result
 
-        for extractor, _ in EXTRACTORS:
+        for extractor, _ in EXTRACTOR_CHOICES:
             if extractor not in exclude:
                 existing = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output
                 # Check filesystsem to see if anything is actually present (too slow, needs optimization/caching)
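
Because EXTRACTOR_CHOICES keeps the same list-of-(value, label)-tuples shape the old module-level EXTRACTORS constant had, call sites like these loops and the CharField(choices=...) keep working unchanged; only the source of the names moved to the module registry. Abbreviated illustration (the key list shown is partial):

    EXTRACTOR_CHOICES = [(name, name) for name in EXTRACTORS.keys()]
    # e.g. [('archive_org', 'archive_org'), ('dom', 'dom'), ('favicon', 'favicon'), ...]
    for extractor, _ in EXTRACTOR_CHOICES:
        ...  # unpacks exactly like the old hard-coded list of 2-tuples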