From 756b7fc76d4912597e83f68ba2b6c4e7d4d1a753 Mon Sep 17 00:00:00 2001 From: Cristian Date: Tue, 29 Dec 2020 13:58:51 -0500 Subject: [PATCH] refactor: pdf uses snapshot instead of link --- archivebox/extractors/pdf.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/archivebox/extractors/pdf.py b/archivebox/extractors/pdf.py index 1b0201e3..215a9de3 100644 --- a/archivebox/extractors/pdf.py +++ b/archivebox/extractors/pdf.py @@ -3,6 +3,8 @@ __package__ = 'archivebox.extractors' from pathlib import Path from typing import Optional +from django.db.models import Model + from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError from ..system import run, chmod_file from ..util import ( @@ -19,9 +21,9 @@ from ..logging_util import TimedProgress @enforce_types -def should_save_pdf(link: Link, out_dir: Optional[Path]=None) -> bool: - out_dir = out_dir or Path(link.link_dir) - if is_static_file(link.url): +def should_save_pdf(snapshot: Model, out_dir: Optional[Path]=None) -> bool: + out_dir = out_dir or Path(snapshot.snapshot_dir) + if is_static_file(snapshot.url): return False if (out_dir / "output.pdf").exists(): @@ -31,15 +33,15 @@ def should_save_pdf(link: Link, out_dir: Optional[Path]=None) -> bool: @enforce_types -def save_pdf(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult: +def save_pdf(snapshot: Model, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult: """print PDF of site to file using chrome --headless""" - out_dir = out_dir or Path(link.link_dir) + out_dir = out_dir or Path(snapshot.snapshot_dir) output: ArchiveOutput = 'output.pdf' cmd = [ *chrome_args(TIMEOUT=timeout), '--print-to-pdf', - link.url, + snapshot.url, ] status = 'succeeded' timer = TimedProgress(timeout, prefix=' ')