1
0
Fork 0
mirror of synced 2024-06-28 11:00:35 +12:00

refactor: dom uses snapshot instead of link

This commit is contained in:
Cristian 2020-12-29 14:08:48 -05:00
parent eac29e3a7a
commit 5ea1b9e39f

View file

@ -3,6 +3,8 @@ __package__ = 'archivebox.extractors'
from pathlib import Path from pathlib import Path
from typing import Optional from typing import Optional
from django.db.models import Model
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
from ..system import run, chmod_file, atomic_write from ..system import run, chmod_file, atomic_write
from ..util import ( from ..util import (
@ -20,9 +22,9 @@ from ..logging_util import TimedProgress
@enforce_types @enforce_types
def should_save_dom(link: Link, out_dir: Optional[Path]=None) -> bool: def should_save_dom(snapshot: Model, out_dir: Optional[Path]=None) -> bool:
out_dir = out_dir or Path(link.link_dir) out_dir = out_dir or Path(snapshot.snapshot_dir)
if is_static_file(link.url): if is_static_file(snapshot.url):
return False return False
if (out_dir / 'output.html').exists(): if (out_dir / 'output.html').exists():
@ -31,16 +33,16 @@ def should_save_dom(link: Link, out_dir: Optional[Path]=None) -> bool:
return SAVE_DOM return SAVE_DOM
@enforce_types @enforce_types
def save_dom(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult: def save_dom(snapshot: Model, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
"""print HTML of site to file using chrome --dump-html""" """print HTML of site to file using chrome --dump-html"""
out_dir = out_dir or Path(link.link_dir) out_dir = out_dir or Path(snapshot.snapshot_dir)
output: ArchiveOutput = 'output.html' output: ArchiveOutput = 'output.html'
output_path = out_dir / output output_path = out_dir / output
cmd = [ cmd = [
*chrome_args(TIMEOUT=timeout), *chrome_args(TIMEOUT=timeout),
'--dump-dom', '--dump-dom',
link.url snapshot.url
] ]
status = 'succeeded' status = 'succeeded'
timer = TimedProgress(timeout, prefix=' ') timer = TimedProgress(timeout, prefix=' ')