1
0
Fork 0
mirror of synced 2024-06-28 11:00:35 +12:00

refactor: pdf uses snapshot instead of link

This commit is contained in:
Cristian 2020-12-29 13:58:51 -05:00
parent 5cf9ca0e2c
commit 756b7fc76d

View file

@ -3,6 +3,8 @@ __package__ = 'archivebox.extractors'
from pathlib import Path from pathlib import Path
from typing import Optional from typing import Optional
from django.db.models import Model
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
from ..system import run, chmod_file from ..system import run, chmod_file
from ..util import ( from ..util import (
@ -19,9 +21,9 @@ from ..logging_util import TimedProgress
@enforce_types @enforce_types
def should_save_pdf(link: Link, out_dir: Optional[Path]=None) -> bool: def should_save_pdf(snapshot: Model, out_dir: Optional[Path]=None) -> bool:
out_dir = out_dir or Path(link.link_dir) out_dir = out_dir or Path(snapshot.snapshot_dir)
if is_static_file(link.url): if is_static_file(snapshot.url):
return False return False
if (out_dir / "output.pdf").exists(): if (out_dir / "output.pdf").exists():
@ -31,15 +33,15 @@ def should_save_pdf(link: Link, out_dir: Optional[Path]=None) -> bool:
@enforce_types @enforce_types
def save_pdf(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult: def save_pdf(snapshot: Model, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
"""print PDF of site to file using chrome --headless""" """print PDF of site to file using chrome --headless"""
out_dir = out_dir or Path(link.link_dir) out_dir = out_dir or Path(snapshot.snapshot_dir)
output: ArchiveOutput = 'output.pdf' output: ArchiveOutput = 'output.pdf'
cmd = [ cmd = [
*chrome_args(TIMEOUT=timeout), *chrome_args(TIMEOUT=timeout),
'--print-to-pdf', '--print-to-pdf',
link.url, snapshot.url,
] ]
status = 'succeeded' status = 'succeeded'
timer = TimedProgress(timeout, prefix=' ') timer = TimedProgress(timeout, prefix=' ')