1
0
Fork 0
mirror of synced 2024-06-28 11:00:35 +12:00

refactor: git uses snapshot instead of link

This commit is contained in:
Cristian 2020-12-29 14:05:45 -05:00
parent b9489c971c
commit d1326c3660

View file

@@ -4,6 +4,8 @@ __package__ = 'archivebox.extractors'
from pathlib import Path from pathlib import Path
from typing import Optional from typing import Optional
from django.db.models import Model
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
from ..system import run, chmod_file from ..system import run, chmod_file
from ..util import ( from ..util import (
@@ -28,17 +30,17 @@ from ..logging_util import TimedProgress
@enforce_types @enforce_types
def should_save_git(link: Link, out_dir: Optional[Path]=None) -> bool: def should_save_git(snapshot: Model, out_dir: Optional[Path]=None) -> bool:
out_dir = out_dir or link.link_dir out_dir = out_dir or snapshot.snapshot_dir
if is_static_file(link.url): if is_static_file(snapshot.url):
return False return False
if (out_dir / "git").exists(): if (out_dir / "git").exists():
return False return False
is_clonable_url = ( is_clonable_url = (
(domain(link.url) in GIT_DOMAINS) (domain(snapshot.url) in GIT_DOMAINS)
or (extension(link.url) == 'git') or (extension(snapshot.url) == 'git')
) )
if not is_clonable_url: if not is_clonable_url:
return False return False
@@ -47,10 +49,10 @@ def should_save_git(link: Link, out_dir: Optional[Path]=None) -> bool:
@enforce_types @enforce_types
def save_git(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult: def save_git(snapshot: Model, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
"""download full site using git""" """download full site using git"""
out_dir = out_dir or Path(link.link_dir) out_dir = out_dir or Path(snapshot.snapshot_dir)
output: ArchiveOutput = 'git' output: ArchiveOutput = 'git'
output_path = out_dir / output output_path = out_dir / output
output_path.mkdir(exist_ok=True) output_path.mkdir(exist_ok=True)
@@ -59,7 +61,7 @@ def save_git(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) ->
'clone', 'clone',
*GIT_ARGS, *GIT_ARGS,
*([] if CHECK_SSL_VALIDITY else ['-c', 'http.sslVerify=false']), *([] if CHECK_SSL_VALIDITY else ['-c', 'http.sslVerify=false']),
without_query(without_fragment(link.url)), without_query(without_fragment(snapshot.url)),
] ]
status = 'succeeded' status = 'succeeded'
timer = TimedProgress(timeout, prefix=' ') timer = TimedProgress(timeout, prefix=' ')