From 457c42bf84526d0c4b1c6013efae235023339085 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Sat, 11 May 2024 22:28:59 -0700 Subject: [PATCH] load EXTRACTORS dynamically using importlib.import_module --- .../core/migrations/0007_archiveresult.py | 2 - archivebox/core/models.py | 40 +++++++++++++++++-- archivebox/extractors/__init__.py | 40 ++++++++++++++++++- archivebox/extractors/archive_org.py | 8 ++-- archivebox/extractors/dom.py | 9 +++-- archivebox/extractors/favicon.py | 9 ++++- archivebox/extractors/git.py | 17 +++++++- archivebox/extractors/headers.py | 10 +++-- archivebox/extractors/htmltotext.py | 10 ++++- archivebox/extractors/media.py | 18 ++++++++- archivebox/extractors/mercury.py | 12 ++++-- archivebox/extractors/pdf.py | 10 +++-- archivebox/extractors/readability.py | 12 ++++-- archivebox/extractors/screenshot.py | 7 +++- archivebox/extractors/singlefile.py | 8 +++- archivebox/extractors/title.py | 8 ++++ archivebox/extractors/wget.py | 12 ++++++ archivebox/index/html.py | 6 +-- 18 files changed, 198 insertions(+), 40 deletions(-) diff --git a/archivebox/core/migrations/0007_archiveresult.py b/archivebox/core/migrations/0007_archiveresult.py index 29b269f6..3da3b93c 100644 --- a/archivebox/core/migrations/0007_archiveresult.py +++ b/archivebox/core/migrations/0007_archiveresult.py @@ -17,8 +17,6 @@ except AttributeError: def forwards_func(apps, schema_editor): - from core.models import EXTRACTORS - Snapshot = apps.get_model("core", "Snapshot") ArchiveResult = apps.get_model("core", "ArchiveResult") diff --git a/archivebox/core/models.py b/archivebox/core/models.py index 0c9733d0..b51f9a59 100644 --- a/archivebox/core/models.py +++ b/archivebox/core/models.py @@ -6,6 +6,7 @@ import json from pathlib import Path from typing import Optional, List +from importlib import import_module from django.db import models from django.utils.functional import cached_property @@ -20,9 +21,9 @@ from ..system import get_dir_size from ..util import parse_date, 
base_url, hashurl from ..index.schema import Link from ..index.html import snapshot_icons -from ..extractors import get_default_archive_methods, ARCHIVE_METHODS_INDEXING_PRECEDENCE +from ..extractors import get_default_archive_methods, ARCHIVE_METHODS_INDEXING_PRECEDENCE, EXTRACTORS -EXTRACTORS = [(extractor[0], extractor[0]) for extractor in get_default_archive_methods()] +EXTRACTOR_CHOICES = [(extractor_name, extractor_name) for extractor_name in EXTRACTORS.keys()] STATUS_CHOICES = [ ("succeeded", "succeeded"), ("failed", "failed"), @@ -267,11 +268,13 @@ class ArchiveResultManager(models.Manager): class ArchiveResult(models.Model): + EXTRACTOR_CHOICES = EXTRACTOR_CHOICES + id = models.AutoField(primary_key=True, serialize=False, verbose_name='ID') uuid = models.UUIDField(default=uuid.uuid4, editable=False) snapshot = models.ForeignKey(Snapshot, on_delete=models.CASCADE) - extractor = models.CharField(choices=EXTRACTORS, max_length=32) + extractor = models.CharField(choices=EXTRACTOR_CHOICES, max_length=32) cmd = JSONField() pwd = models.CharField(max_length=256) cmd_version = models.CharField(max_length=128, default=None, null=True, blank=True) @@ -284,3 +287,34 @@ class ArchiveResult(models.Model): def __str__(self): return self.extractor + + @cached_property + def snapshot_dir(self): + return Path(self.snapshot.link_dir) + + + @property + def extractor_module(self): + return EXTRACTORS[self.extractor] + + def output_path(self) -> str: + """return the canonical output filename or directory name within the snapshot dir""" + return self.extractor_module.get_output_path() + + def embed_path(self) -> str: + """ + return the actual runtime-calculated path to the file on-disk that + should be used for user-facing iframe embeds of this result + """ + + if hasattr(self.extractor_module, 'get_embed_path'): + return self.extractor_module.get_embed_path(self) + + return self.extractor_module.get_output_path() + + def legacy_output_path(self): + link = 
self.snapshot.as_link() + return link.canonical_outputs().get(f'{self.extractor}_path') + + def output_exists(self) -> bool: + return Path(self.output_path()).exists() diff --git a/archivebox/extractors/__init__.py b/archivebox/extractors/__init__.py index cb1c6841..1527cc98 100644 --- a/archivebox/extractors/__init__.py +++ b/archivebox/extractors/__init__.py @@ -1,11 +1,13 @@ __package__ = 'archivebox.extractors' +from typing import Callable, Optional, Dict, List, Iterable, Union, Protocol, cast + import os import sys from pathlib import Path - -from typing import Callable, Optional, List, Iterable, Union +from importlib import import_module from datetime import datetime, timezone + from django.db.models import QuerySet from ..config import ( @@ -240,3 +242,37 @@ def archive_links(all_links: Union[Iterable[Link], QuerySet], overwrite: bool=Fa log_archiving_finished(num_links) return all_links + + + +EXTRACTORS_DIR = Path(__file__).parent + +class ExtractorModuleProtocol(Protocol): + """Type interface for an Extractor Module (WIP)""" + + get_output_path: Callable + + # TODO: + # get_embed_path: Callable | None + # should_extract(Snapshot) + # extract(Snapshot) + + +def get_extractors(dir: Path=EXTRACTORS_DIR) -> Dict[str, ExtractorModuleProtocol]: + """iterate through archivebox/extractors/*.py and load extractor modules""" + EXTRACTORS = {} + + for filename in dir.glob('*.py'): + if filename.name.startswith('__'): + continue + + extractor_name = filename.name.replace('.py', '') + + extractor_module = cast(ExtractorModuleProtocol, import_module(f'.{extractor_name}', package=__package__)) + + assert getattr(extractor_module, 'get_output_path') + EXTRACTORS[extractor_name] = extractor_module + + return EXTRACTORS + +EXTRACTORS = get_extractors(EXTRACTORS_DIR) diff --git a/archivebox/extractors/archive_org.py b/archivebox/extractors/archive_org.py index 245315f1..5aa66fa7 100644 --- a/archivebox/extractors/archive_org.py +++ 
b/archivebox/extractors/archive_org.py @@ -24,6 +24,8 @@ from ..config import ( ) from ..logging_util import TimedProgress +def get_output_path(): + return 'archive.org.txt' @enforce_types @@ -32,7 +34,7 @@ def should_save_archive_dot_org(link: Link, out_dir: Optional[Path]=None, overwr return False out_dir = out_dir or Path(link.link_dir) - if not overwrite and (out_dir / 'archive.org.txt').exists(): + if not overwrite and (out_dir / get_output_path()).exists(): # if open(path, 'r', encoding='utf-8').read().strip() != 'None': return False @@ -43,7 +45,7 @@ def save_archive_dot_org(link: Link, out_dir: Optional[Path]=None, timeout: int= """submit site to archive.org for archiving via their service, save returned archive url""" out_dir = out_dir or Path(link.link_dir) - output: ArchiveOutput = 'archive.org.txt' + output: ArchiveOutput = get_output_path() archive_org_url = None submit_url = 'https://web.archive.org/save/{}'.format(link.url) # later options take precedence @@ -88,7 +90,7 @@ def save_archive_dot_org(link: Link, out_dir: Optional[Path]=None, timeout: int= archive_org_url = archive_org_url or submit_url with open(str(out_dir / output), 'w', encoding='utf-8') as f: f.write(archive_org_url) - chmod_file('archive.org.txt', cwd=str(out_dir)) + chmod_file(str(out_dir / output), cwd=str(out_dir)) output = archive_org_url return ArchiveResult( diff --git a/archivebox/extractors/dom.py b/archivebox/extractors/dom.py index 8a86026f..0035ec87 100644 --- a/archivebox/extractors/dom.py +++ b/archivebox/extractors/dom.py @@ -19,6 +19,9 @@ from ..config import ( from ..logging_util import TimedProgress +def get_output_path(): + return 'output.html' + @enforce_types def should_save_dom(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool: @@ -26,8 +29,8 @@ def should_save_dom(link: Link, out_dir: Optional[Path]=None, overwrite: Optiona return False out_dir = out_dir or Path(link.link_dir) - if not overwrite and (out_dir / 
'output.html').exists(): - if (out_dir / 'output.html').stat().st_size > 1: + if not overwrite and (out_dir / get_output_path()).exists(): + if (out_dir / get_output_path()).stat().st_size > 1: return False return SAVE_DOM @@ -37,7 +40,7 @@ def save_dom(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> """print HTML of site to file using chrome --dump-html""" out_dir = out_dir or Path(link.link_dir) - output: ArchiveOutput = 'output.html' + output: ArchiveOutput = get_output_path() output_path = out_dir / output cmd = [ *chrome_args(), diff --git a/archivebox/extractors/favicon.py b/archivebox/extractors/favicon.py index f793f8df..31473b1a 100644 --- a/archivebox/extractors/favicon.py +++ b/archivebox/extractors/favicon.py @@ -8,8 +8,8 @@ from ..index.schema import Link, ArchiveResult, ArchiveOutput from ..system import chmod_file, run from ..util import ( enforce_types, - domain, - dedupe, + domain, + dedupe, ) from ..config import ( TIMEOUT, @@ -33,6 +33,11 @@ def should_save_favicon(link: Link, out_dir: Optional[str]=None, overwrite: Opti return SAVE_FAVICON +@enforce_types +def get_output_path(): + return 'favicon.ico' + + @enforce_types def save_favicon(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult: """download site favicon from google's favicon api""" diff --git a/archivebox/extractors/git.py b/archivebox/extractors/git.py index efef37c2..029e8022 100644 --- a/archivebox/extractors/git.py +++ b/archivebox/extractors/git.py @@ -26,6 +26,19 @@ from ..config import ( from ..logging_util import TimedProgress +def get_output_path(): + return 'git/' + +def get_embed_path(archiveresult=None): + if not archiveresult: + return get_output_path() + + try: + return get_output_path() + list((archiveresult.snapshot_dir / get_output_path()).glob('*'))[0].name + '/' + except IndexError: + pass + + return get_output_path() @enforce_types def should_save_git(link: Link, out_dir: Optional[Path]=None, overwrite: 
Optional[bool]=False) -> bool: @@ -33,7 +46,7 @@ def should_save_git(link: Link, out_dir: Optional[Path]=None, overwrite: Optiona return False out_dir = out_dir or Path(link.link_dir) - if not overwrite and (out_dir / 'git').exists(): + if not overwrite and (out_dir / get_output_path()).exists(): return False is_clonable_url = ( @@ -51,7 +64,7 @@ def save_git(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> """download full site using git""" out_dir = out_dir or Path(link.link_dir) - output: ArchiveOutput = 'git' + output: ArchiveOutput = get_output_path() output_path = out_dir / output output_path.mkdir(exist_ok=True) cmd = [ diff --git a/archivebox/extractors/headers.py b/archivebox/extractors/headers.py index 975787ad..9fd48469 100644 --- a/archivebox/extractors/headers.py +++ b/archivebox/extractors/headers.py @@ -23,10 +23,14 @@ from ..config import ( ) from ..logging_util import TimedProgress +def get_output_path(): + return 'headers.json' + + @enforce_types def should_save_headers(link: Link, out_dir: Optional[str]=None, overwrite: Optional[bool]=False) -> bool: out_dir = out_dir or Path(link.link_dir) - if not overwrite and (out_dir / 'headers.json').exists(): + if not overwrite and (out_dir / get_output_path()).exists(): return False return SAVE_HEADERS @@ -38,7 +42,7 @@ def save_headers(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) out_dir = Path(out_dir or link.link_dir) output_folder = out_dir.absolute() - output: ArchiveOutput = 'headers.json' + output: ArchiveOutput = get_output_path() status = 'succeeded' timer = TimedProgress(timeout, prefix=' ') @@ -59,7 +63,7 @@ def save_headers(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) try: json_headers = get_headers(link.url, timeout=timeout) output_folder.mkdir(exist_ok=True) - atomic_write(str(output_folder / "headers.json"), json_headers) + atomic_write(str(output_folder / get_output_path()), json_headers) except (Exception, OSError) as err: status = 
'failed' output = err diff --git a/archivebox/extractors/htmltotext.py b/archivebox/extractors/htmltotext.py index 0686f76e..1957579a 100644 --- a/archivebox/extractors/htmltotext.py +++ b/archivebox/extractors/htmltotext.py @@ -19,6 +19,12 @@ from ..util import ( ) from .title import get_html + +def get_output_path(): + return "htmltotext.txt" + + + class HTMLTextExtractor(HTMLParser): TEXT_ATTRS = [ "alt", "cite", "href", "label", @@ -109,7 +115,7 @@ def should_save_htmltotext(link: Link, out_dir: Optional[Path]=None, overwrite: return False out_dir = out_dir or Path(link.link_dir) - if not overwrite and (out_dir / 'htmltotext.txt').exists(): + if not overwrite and (out_dir / get_output_path()).exists(): return False return SAVE_HTMLTOTEXT @@ -120,7 +126,7 @@ def save_htmltotext(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO """extract search-indexing-friendly text from an HTML document""" out_dir = Path(out_dir or link.link_dir) - output = "htmltotext.txt" + output = get_output_path() cmd = ['(internal) archivebox.extractors.htmltotext', './{singlefile,dom}.html'] timer = TimedProgress(timeout, prefix=' ') diff --git a/archivebox/extractors/media.py b/archivebox/extractors/media.py index ad4c9c4b..8c33e92d 100644 --- a/archivebox/extractors/media.py +++ b/archivebox/extractors/media.py @@ -22,13 +22,27 @@ from ..config import ( from ..logging_util import TimedProgress +def get_output_path(): + return 'media/' + +def get_embed_path(archiveresult=None): + if not archiveresult: + return get_output_path() + + out_dir = archiveresult.snapshot_dir / get_output_path() + try: + return get_output_path() + list(out_dir.glob('*.mp4'))[0].name + except IndexError: + return get_output_path() + + @enforce_types def should_save_media(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool: if is_static_file(link.url): return False out_dir = out_dir or Path(link.link_dir) - if not overwrite and (out_dir / 'media').exists(): + if not 
overwrite and (out_dir / get_output_path()).exists(): return False return SAVE_MEDIA @@ -38,7 +52,7 @@ def save_media(link: Link, out_dir: Optional[Path]=None, timeout: int=MEDIA_TIME """Download playlists or individual video, audio, and subtitles using youtube-dl or yt-dlp""" out_dir = out_dir or Path(link.link_dir) - output: ArchiveOutput = 'media' + output: ArchiveOutput = get_output_path() output_path = out_dir / output output_path.mkdir(exist_ok=True) # later options take precedence diff --git a/archivebox/extractors/mercury.py b/archivebox/extractors/mercury.py index a0f38434..71af1329 100644 --- a/archivebox/extractors/mercury.py +++ b/archivebox/extractors/mercury.py @@ -24,6 +24,12 @@ from ..config import ( from ..logging_util import TimedProgress +def get_output_path(): + return 'mercury/' + +def get_embed_path(archiveresult=None): + return get_output_path() + 'content.html' + @enforce_types def ShellError(cmd: List[str], result: CompletedProcess, lines: int=20) -> ArchiveError: @@ -44,7 +50,7 @@ def should_save_mercury(link: Link, out_dir: Optional[str]=None, overwrite: Opti return False out_dir = out_dir or Path(link.link_dir) - if not overwrite and (out_dir / 'mercury').exists(): + if not overwrite and (out_dir / get_output_path()).exists(): return False return SAVE_MERCURY @@ -55,8 +61,8 @@ def save_mercury(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) """download reader friendly version using @postlight/mercury-parser""" out_dir = Path(out_dir or link.link_dir) - output_folder = out_dir.absolute() / "mercury" - output = "mercury" + output_folder = out_dir.absolute() / get_output_path() + output = get_output_path() status = 'succeeded' timer = TimedProgress(timeout, prefix=' ') diff --git a/archivebox/extractors/pdf.py b/archivebox/extractors/pdf.py index a6b51948..17bdd47f 100644 --- a/archivebox/extractors/pdf.py +++ b/archivebox/extractors/pdf.py @@ -19,13 +19,17 @@ from ..config import ( from ..logging_util import TimedProgress 
+def get_output_path(): + return 'output.pdf' + + @enforce_types def should_save_pdf(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool: if is_static_file(link.url): return False out_dir = out_dir or Path(link.link_dir) - if not overwrite and (out_dir / 'output.pdf').exists(): + if not overwrite and (out_dir / get_output_path()).exists(): return False return SAVE_PDF @@ -36,7 +40,7 @@ def save_pdf(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> """print PDF of site to file using chrome --headless""" out_dir = out_dir or Path(link.link_dir) - output: ArchiveOutput = 'output.pdf' + output: ArchiveOutput = get_output_path() cmd = [ *chrome_args(), '--print-to-pdf', @@ -51,7 +55,7 @@ def save_pdf(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> hints = (result.stderr or result.stdout).decode() raise ArchiveError('Failed to save PDF', hints) - chmod_file('output.pdf', cwd=str(out_dir)) + chmod_file(get_output_path(), cwd=str(out_dir)) except Exception as err: status = 'failed' output = err diff --git a/archivebox/extractors/readability.py b/archivebox/extractors/readability.py index dc2a06b9..155438d3 100644 --- a/archivebox/extractors/readability.py +++ b/archivebox/extractors/readability.py @@ -22,6 +22,12 @@ from ..config import ( from ..logging_util import TimedProgress from .title import get_html +def get_output_path(): + return 'readability/' + +def get_embed_path(archiveresult=None): + return get_output_path() + 'content.html' + @enforce_types def should_save_readability(link: Link, out_dir: Optional[str]=None, overwrite: Optional[bool]=False) -> bool: @@ -29,7 +35,7 @@ def should_save_readability(link: Link, out_dir: Optional[str]=None, overwrite: return False out_dir = out_dir or Path(link.link_dir) - if not overwrite and (out_dir / 'readability').exists(): + if not overwrite and (out_dir / get_output_path()).exists(): return False return SAVE_READABILITY @@ -40,8 +46,8 @@ def 
save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO """download reader friendly version using @mozilla/readability""" out_dir = Path(out_dir or link.link_dir) - output_folder = out_dir.absolute() / "readability" - output = "readability" + output_folder = out_dir.absolute() / get_output_path() + output = get_output_path() # Readability Docs: https://github.com/mozilla/readability diff --git a/archivebox/extractors/screenshot.py b/archivebox/extractors/screenshot.py index 7ed8dd9d..ae380e6f 100644 --- a/archivebox/extractors/screenshot.py +++ b/archivebox/extractors/screenshot.py @@ -19,6 +19,9 @@ from ..config import ( from ..logging_util import TimedProgress +def get_output_path(): + return 'screenshot.png' + @enforce_types def should_save_screenshot(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool: @@ -26,7 +29,7 @@ def should_save_screenshot(link: Link, out_dir: Optional[Path]=None, overwrite: return False out_dir = out_dir or Path(link.link_dir) - if not overwrite and (out_dir / 'screenshot.png').exists(): + if not overwrite and (out_dir / get_output_path()).exists(): return False return SAVE_SCREENSHOT @@ -36,7 +39,7 @@ def save_screenshot(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO """take screenshot of site using chrome --headless""" out_dir = out_dir or Path(link.link_dir) - output: ArchiveOutput = 'screenshot.png' + output: ArchiveOutput = get_output_path() cmd = [ *chrome_args(), '--screenshot', diff --git a/archivebox/extractors/singlefile.py b/archivebox/extractors/singlefile.py index 1d5275dd..b07af788 100644 --- a/archivebox/extractors/singlefile.py +++ b/archivebox/extractors/singlefile.py @@ -26,13 +26,17 @@ from ..config import ( from ..logging_util import TimedProgress +def get_output_path(): + return 'singlefile.html' + + @enforce_types def should_save_singlefile(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool: if 
is_static_file(link.url): return False out_dir = out_dir or Path(link.link_dir) - if not overwrite and (out_dir / 'singlefile.html').exists(): + if not overwrite and (out_dir / get_output_path()).exists(): return False return SAVE_SINGLEFILE @@ -43,7 +47,7 @@ def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO """download full site using single-file""" out_dir = out_dir or Path(link.link_dir) - output = "singlefile.html" + output = get_output_path() browser_args = chrome_args(CHROME_TIMEOUT=0) diff --git a/archivebox/extractors/title.py b/archivebox/extractors/title.py index 5decc52c..a1cb769f 100644 --- a/archivebox/extractors/title.py +++ b/archivebox/extractors/title.py @@ -60,6 +60,7 @@ class TitleParser(HTMLParser): if tag.lower() == "title": self.inside_title_tag = False + @enforce_types def get_html(link: Link, path: Path, timeout: int=TIMEOUT) -> str: """ @@ -84,6 +85,13 @@ def get_html(link: Link, path: Path, timeout: int=TIMEOUT) -> str: else: return document + +def get_output_path(): + # TODO: actually save title to this file + # (currently only saved in ArchiveResult.output as charfield value, not saved to filesystem) + return 'title.json' + + @enforce_types def should_save_title(link: Link, out_dir: Optional[str]=None, overwrite: Optional[bool]=False) -> bool: # if link already has valid title, skip it diff --git a/archivebox/extractors/wget.py b/archivebox/extractors/wget.py index 86dba0ac..cd72be4e 100644 --- a/archivebox/extractors/wget.py +++ b/archivebox/extractors/wget.py @@ -35,6 +35,18 @@ from ..config import ( from ..logging_util import TimedProgress +def get_output_path(): + # TODO: actually save output into this folder, instead of doing {domain}/**/index.html + return 'wget/' + +def get_embed_path(archiveresult=None): + if not archiveresult: + return get_output_path() + + link = archiveresult.snapshot.as_link() + return wget_output_path(link) + + @enforce_types def should_save_wget(link: Link, out_dir: 
Optional[Path]=None, overwrite: Optional[bool]=False) -> bool: output_path = wget_output_path(link) diff --git a/archivebox/index/html.py b/archivebox/index/html.py index 6b914446..a5facc98 100644 --- a/archivebox/index/html.py +++ b/archivebox/index/html.py @@ -121,7 +121,7 @@ def snapshot_icons(snapshot) -> str: cache_key = f'{snapshot.id}-{(snapshot.updated or snapshot.added).timestamp()}-snapshot-icons' def calc_snapshot_icons(): - from core.models import EXTRACTORS + from core.models import EXTRACTOR_CHOICES # start = datetime.now(timezone.utc) archive_results = snapshot.archiveresult_set.filter(status="succeeded", output__isnull=False) @@ -147,12 +147,12 @@ def snapshot_icons(snapshot) -> str: # Missing specific entry for WARC extractor_outputs = defaultdict(lambda: None) - for extractor, _ in EXTRACTORS: + for extractor, _ in EXTRACTOR_CHOICES: for result in archive_results: if result.extractor == extractor and result: extractor_outputs[extractor] = result - for extractor, _ in EXTRACTORS: + for extractor, _ in EXTRACTOR_CHOICES: if extractor not in exclude: existing = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output # Check filesystsem to see if anything is actually present (too slow, needs optimization/caching)