1
0
Fork 0
mirror synced 2024-06-28 11:00:35 +12:00

refactor: wget uses snapshot instead of link

This commit is contained in:
Cristian 2020-12-29 12:38:36 -05:00
parent e0e65bf4b1
commit f6152ded44
2 changed files with 15 additions and 14 deletions

View file

@ -6,6 +6,8 @@ from pathlib import Path
from typing import Optional from typing import Optional
from datetime import datetime from datetime import datetime
from django.db.models import Model
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
from ..system import run, chmod_file from ..system import run, chmod_file
from ..util import ( from ..util import (
@ -36,9 +38,9 @@ from ..logging_util import TimedProgress
@enforce_types @enforce_types
def should_save_wget(link: Link, out_dir: Optional[Path]=None) -> bool: def should_save_wget(snapshot: Model, out_dir: Optional[Path]=None) -> bool:
output_path = wget_output_path(link) output_path = wget_output_path(snapshot)
out_dir = out_dir or Path(link.link_dir) out_dir = out_dir or Path(snapshot.snapshot_dir)
if output_path and (out_dir / output_path).exists(): if output_path and (out_dir / output_path).exists():
return False return False
@ -46,7 +48,7 @@ def should_save_wget(link: Link, out_dir: Optional[Path]=None) -> bool:
@enforce_types @enforce_types
def save_wget(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult: def save_wget(snapshot: Model, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
"""download full site using wget""" """download full site using wget"""
out_dir = out_dir or link.link_dir out_dir = out_dir or link.link_dir
@ -70,14 +72,14 @@ def save_wget(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) ->
*(['--compression=auto'] if WGET_AUTO_COMPRESSION else []), *(['--compression=auto'] if WGET_AUTO_COMPRESSION else []),
*([] if SAVE_WARC else ['--timestamping']), *([] if SAVE_WARC else ['--timestamping']),
*([] if CHECK_SSL_VALIDITY else ['--no-check-certificate', '--no-hsts']), *([] if CHECK_SSL_VALIDITY else ['--no-check-certificate', '--no-hsts']),
link.url, snapshot.url,
] ]
status = 'succeeded' status = 'succeeded'
timer = TimedProgress(timeout, prefix=' ') timer = TimedProgress(timeout, prefix=' ')
try: try:
result = run(cmd, cwd=str(out_dir), timeout=timeout) result = run(cmd, cwd=str(out_dir), timeout=timeout)
output = wget_output_path(link) output = wget_output_path(snapshot)
# parse out number of files downloaded from last line of stderr: # parse out number of files downloaded from last line of stderr:
# "Downloaded: 76 files, 4.0M in 1.6s (2.52 MB/s)" # "Downloaded: 76 files, 4.0M in 1.6s (2.52 MB/s)"
@ -123,14 +125,14 @@ def save_wget(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) ->
@enforce_types @enforce_types
def wget_output_path(link: Link) -> Optional[str]: def wget_output_path(snapshot: Model) -> Optional[str]:
"""calculate the path to the wgetted .html file, since wget may """calculate the path to the wgetted .html file, since wget may
adjust some paths to be different than the base_url path. adjust some paths to be different than the base_url path.
See docs on wget --adjust-extension (-E) See docs on wget --adjust-extension (-E)
""" """
if is_static_file(link.url): if is_static_file(snapshot.url):
return without_scheme(without_fragment(link.url)) return without_scheme(without_fragment(snapshot.url))
# Wget downloads can save in a number of different ways depending on the url: # Wget downloads can save in a number of different ways depending on the url:
# https://example.com # https://example.com
@ -163,8 +165,8 @@ def wget_output_path(link: Link) -> Optional[str]:
# and there's no way to get the computed output path from wget # and there's no way to get the computed output path from wget
# in order to avoid having to reverse-engineer how they calculate it, # in order to avoid having to reverse-engineer how they calculate it,
# we just look in the output folder read the filename wget used from the filesystem # we just look in the output folder read the filename wget used from the filesystem
full_path = without_fragment(without_query(path(link.url))).strip('/') full_path = without_fragment(without_query(path(snapshot.url))).strip('/')
search_dir = Path(link.link_dir) / domain(link.url).replace(":", "+") / urldecode(full_path) search_dir = Path(snapshot.snapshot_dir) / domain(snapshot.url).replace(":", "+") / urldecode(full_path)
for _ in range(4): for _ in range(4):
if search_dir.exists(): if search_dir.exists():
if search_dir.is_dir(): if search_dir.is_dir():
@ -173,12 +175,12 @@ def wget_output_path(link: Link) -> Optional[str]:
if re.search(".+\\.[Ss]?[Hh][Tt][Mm][Ll]?$", str(f), re.I | re.M) if re.search(".+\\.[Ss]?[Hh][Tt][Mm][Ll]?$", str(f), re.I | re.M)
] ]
if html_files: if html_files:
return str(html_files[0].relative_to(link.link_dir)) return str(html_files[0].relative_to(snapshot.snapshot_dir))
# Move up one directory level # Move up one directory level
search_dir = search_dir.parent search_dir = search_dir.parent
if str(search_dir) == link.link_dir: if str(search_dir) == snapshot.snapshot_dir:
break break
search_dir = Path(link.link_dir) / domain(link.url).replace(":", "+") / urldecode(full_path) search_dir = Path(link.link_dir) / domain(link.url).replace(":", "+") / urldecode(full_path)

View file

@ -87,7 +87,6 @@ def write_json_snapshot_details(snapshot: Model, out_dir: Optional[str]=None) ->
out_dir = out_dir or snapshot.snapshot_dir out_dir = out_dir or snapshot.snapshot_dir
path = Path(out_dir) / JSON_INDEX_FILENAME path = Path(out_dir) / JSON_INDEX_FILENAME
print(snapshot._asdict())
atomic_write(str(path), snapshot._asdict()) atomic_write(str(path), snapshot._asdict())