1
0
Fork 0
Mirror of the upstream repository, last synced 2024-06-18 18:34:51 +12:00
This commit is contained in:
Nick Sweeting 2020-06-25 22:14:40 -04:00
parent 1a16221752
commit 5c2bbe7efe
11 changed files with 44 additions and 61 deletions

View file

@ -78,7 +78,7 @@ CONFIG_DEFAULTS: Dict[str, ConfigDefaultDict] = {
'GIT_DOMAINS': {'type': str, 'default': 'github.com,bitbucket.org,gitlab.com'}, 'GIT_DOMAINS': {'type': str, 'default': 'github.com,bitbucket.org,gitlab.com'},
'CHECK_SSL_VALIDITY': {'type': bool, 'default': True}, 'CHECK_SSL_VALIDITY': {'type': bool, 'default': True},
'CURL_USER_AGENT': {'type': str, 'default': 'ArchiveBox/{VERSION} (+https://github.com/pirate/ArchiveBox/) curl/{CURL_VERSION}'} 'CURL_USER_AGENT': {'type': str, 'default': 'ArchiveBox/{VERSION} (+https://github.com/pirate/ArchiveBox/) curl/{CURL_VERSION}'},
'WGET_USER_AGENT': {'type': str, 'default': 'ArchiveBox/{VERSION} (+https://github.com/pirate/ArchiveBox/) wget/{WGET_VERSION}'}, 'WGET_USER_AGENT': {'type': str, 'default': 'ArchiveBox/{VERSION} (+https://github.com/pirate/ArchiveBox/) wget/{WGET_VERSION}'},
'CHROME_USER_AGENT': {'type': str, 'default': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36'}, 'CHROME_USER_AGENT': {'type': str, 'default': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36'},

View file

@ -6,18 +6,18 @@ from typing import Optional, List, Dict, Tuple
from collections import defaultdict from collections import defaultdict
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
from ..system import run, PIPE, DEVNULL, chmod_file from ..system import run, chmod_file
from ..util import ( from ..util import (
enforce_types, enforce_types,
is_static_file, is_static_file,
) )
from ..config import ( from ..config import (
VERSION,
TIMEOUT, TIMEOUT,
CHECK_SSL_VALIDITY,
SAVE_ARCHIVE_DOT_ORG, SAVE_ARCHIVE_DOT_ORG,
CURL_BINARY, CURL_BINARY,
CURL_VERSION, CURL_VERSION,
CHECK_SSL_VALIDITY CURL_USER_AGENT,
) )
from ..cli.logging import TimedProgress from ..cli.logging import TimedProgress
@ -45,17 +45,18 @@ def save_archive_dot_org(link: Link, out_dir: Optional[str]=None, timeout: int=T
submit_url = 'https://web.archive.org/save/{}'.format(link.url) submit_url = 'https://web.archive.org/save/{}'.format(link.url)
cmd = [ cmd = [
CURL_BINARY, CURL_BINARY,
'--silent',
'--location', '--location',
'--head', '--head',
'--user-agent', 'ArchiveBox/{} (+https://github.com/pirate/ArchiveBox/)'.format(VERSION), # be nice to the Archive.org people and show them where all this ArchiveBox traffic is coming from
'--max-time', str(timeout), '--max-time', str(timeout),
*(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []),
*([] if CHECK_SSL_VALIDITY else ['--insecure']), *([] if CHECK_SSL_VALIDITY else ['--insecure']),
submit_url, submit_url,
] ]
status = 'succeeded' status = 'succeeded'
timer = TimedProgress(timeout, prefix=' ') timer = TimedProgress(timeout, prefix=' ')
try: try:
result = run(cmd, stdout=PIPE, stderr=DEVNULL, cwd=out_dir, timeout=timeout) result = run(cmd, cwd=out_dir, timeout=timeout)
content_location, errors = parse_archive_dot_org_response(result.stdout) content_location, errors = parse_archive_dot_org_response(result.stdout)
if content_location: if content_location:
archive_org_url = 'https://web.archive.org{}'.format(content_location[0]) archive_org_url = 'https://web.archive.org{}'.format(content_location[0])

View file

@ -5,7 +5,7 @@ import os
from typing import Optional from typing import Optional
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
from ..system import run, PIPE, chmod_file from ..system import run, chmod_file
from ..util import ( from ..util import (
enforce_types, enforce_types,
is_static_file, is_static_file,
@ -47,7 +47,7 @@ def save_dom(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> A
timer = TimedProgress(timeout, prefix=' ') timer = TimedProgress(timeout, prefix=' ')
try: try:
with open(output_path, 'w+') as f: with open(output_path, 'w+') as f:
result = run(cmd, stdout=f, stderr=PIPE, cwd=out_dir, timeout=timeout) result = run(cmd, stdout=f, cwd=out_dir, timeout=timeout)
if result.returncode: if result.returncode:
hints = result.stderr.decode() hints = result.stderr.decode()

View file

@ -5,7 +5,7 @@ import os
from typing import Optional from typing import Optional
from ..index.schema import Link, ArchiveResult, ArchiveOutput from ..index.schema import Link, ArchiveResult, ArchiveOutput
from ..system import chmod_file, run, PIPE from ..system import chmod_file, run
from ..util import enforce_types, domain from ..util import enforce_types, domain
from ..config import ( from ..config import (
TIMEOUT, TIMEOUT,
@ -38,14 +38,14 @@ def save_favicon(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT)
'--max-time', str(timeout), '--max-time', str(timeout),
'--location', '--location',
'--output', str(output), '--output', str(output),
*(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else [], *(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []),
*([] if CHECK_SSL_VALIDITY else ['--insecure']), *([] if CHECK_SSL_VALIDITY else ['--insecure']),
'https://www.google.com/s2/favicons?domain={}'.format(domain(link.url)), 'https://www.google.com/s2/favicons?domain={}'.format(domain(link.url)),
] ]
status = 'pending' status = 'pending'
timer = TimedProgress(timeout, prefix=' ') timer = TimedProgress(timeout, prefix=' ')
try: try:
run(cmd, stdout=PIPE, stderr=PIPE, cwd=out_dir, timeout=timeout) run(cmd, cwd=out_dir, timeout=timeout)
chmod_file(output, cwd=out_dir) chmod_file(output, cwd=out_dir)
status = 'succeeded' status = 'succeeded'
except Exception as err: except Exception as err:

View file

@ -5,7 +5,7 @@ import os
from typing import Optional from typing import Optional
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
from ..system import run, PIPE, chmod_file from ..system import run, chmod_file
from ..util import ( from ..util import (
enforce_types, enforce_types,
is_static_file, is_static_file,
@ -64,7 +64,7 @@ def save_git(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> A
status = 'succeeded' status = 'succeeded'
timer = TimedProgress(timeout, prefix=' ') timer = TimedProgress(timeout, prefix=' ')
try: try:
result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=output_path, timeout=timeout + 1) result = run(cmd, cwd=output_path, timeout=timeout + 1)
if result.returncode == 128: if result.returncode == 128:
# ignore failed re-download when the folder already exists # ignore failed re-download when the folder already exists

View file

@ -5,7 +5,7 @@ import os
from typing import Optional from typing import Optional
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
from ..system import run, PIPE, chmod_file from ..system import run, chmod_file
from ..util import ( from ..util import (
enforce_types, enforce_types,
is_static_file, is_static_file,
@ -66,7 +66,7 @@ def save_media(link: Link, out_dir: Optional[str]=None, timeout: int=MEDIA_TIMEO
status = 'succeeded' status = 'succeeded'
timer = TimedProgress(timeout, prefix=' ') timer = TimedProgress(timeout, prefix=' ')
try: try:
result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=output_path, timeout=timeout + 1) result = run(cmd, cwd=output_path, timeout=timeout + 1)
chmod_file(output, cwd=out_dir) chmod_file(output, cwd=out_dir)
if result.returncode: if result.returncode:
if (b'ERROR: Unsupported URL' in result.stderr if (b'ERROR: Unsupported URL' in result.stderr

View file

@ -5,7 +5,7 @@ import os
from typing import Optional from typing import Optional
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
from ..system import run, PIPE, chmod_file from ..system import run, chmod_file
from ..util import ( from ..util import (
enforce_types, enforce_types,
is_static_file, is_static_file,
@ -45,7 +45,7 @@ def save_pdf(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> A
status = 'succeeded' status = 'succeeded'
timer = TimedProgress(timeout, prefix=' ') timer = TimedProgress(timeout, prefix=' ')
try: try:
result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=out_dir, timeout=timeout) result = run(cmd, cwd=out_dir, timeout=timeout)
if result.returncode: if result.returncode:
hints = (result.stderr or result.stdout).decode() hints = (result.stderr or result.stdout).decode()

View file

@ -5,7 +5,7 @@ import os
from typing import Optional from typing import Optional
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
from ..system import run, PIPE, chmod_file from ..system import run, chmod_file
from ..util import ( from ..util import (
enforce_types, enforce_types,
is_static_file, is_static_file,
@ -45,7 +45,7 @@ def save_screenshot(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOU
status = 'succeeded' status = 'succeeded'
timer = TimedProgress(timeout, prefix=' ') timer = TimedProgress(timeout, prefix=' ')
try: try:
result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=out_dir, timeout=timeout) result = run(cmd, cwd=out_dir, timeout=timeout)
if result.returncode: if result.returncode:
hints = (result.stderr or result.stdout).decode() hints = (result.stderr or result.stdout).decode()

View file

@ -12,9 +12,11 @@ from ..util import (
) )
from ..config import ( from ..config import (
TIMEOUT, TIMEOUT,
CHECK_SSL_VALIDITY,
SAVE_TITLE, SAVE_TITLE,
CURL_BINARY, CURL_BINARY,
CURL_VERSION, CURL_VERSION,
CURL_USER_AGENT,
) )
from ..cli.logging import TimedProgress from ..cli.logging import TimedProgress
@ -44,6 +46,11 @@ def save_title(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) ->
output: ArchiveOutput = None output: ArchiveOutput = None
cmd = [ cmd = [
CURL_BINARY, CURL_BINARY,
'--silent',
'--max-time', str(timeout),
'--location',
*(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []),
*([] if CHECK_SSL_VALIDITY else ['--insecure']),
link.url, link.url,
'|', '|',
'grep', 'grep',

View file

@ -7,7 +7,7 @@ from typing import Optional
from datetime import datetime from datetime import datetime
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
from ..system import run, PIPE from ..system import run
from ..util import ( from ..util import (
enforce_types, enforce_types,
is_static_file, is_static_file,
@ -81,7 +81,7 @@ def save_wget(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) ->
status = 'succeeded' status = 'succeeded'
timer = TimedProgress(timeout, prefix=' ') timer = TimedProgress(timeout, prefix=' ')
try: try:
result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=out_dir, timeout=timeout) result = run(cmd, cwd=out_dir, timeout=timeout)
output = wget_output_path(link) output = wget_output_path(link)
# parse out number of files downloaded from last line of stderr: # parse out number of files downloaded from last line of stderr:

View file

@ -4,69 +4,44 @@ __package__ = 'archivebox'
import os import os
import shutil import shutil
import json as pyjson from json import dump
from pathlib import Path
from typing import Optional, Union, Set, Tuple from typing import Optional, Union, Set, Tuple
from subprocess import run as subprocess_run
from crontab import CronTab from crontab import CronTab
from atomicwrites import atomic_write as awrite from atomicwrites import atomic_write as lib_atomic_write
from subprocess import (
Popen,
PIPE,
DEVNULL,
CompletedProcess,
TimeoutExpired,
CalledProcessError,
)
from .util import enforce_types, ExtendedEncoder from .util import enforce_types, ExtendedEncoder
from .config import OUTPUT_PERMISSIONS from .config import OUTPUT_PERMISSIONS
def run(*popenargs, input=None, capture_output=False, timeout=None, check=False, **kwargs): def run(*args, input=None, capture_output=True, text=True, timeout=None, check=False, **kwargs):
"""Patched version of subprocess.run to fix blocking io making timeout= ineffective""" """Patched version of subprocess.run to fix blocking io making timeout= ineffective"""
if input is not None: if input is not None:
if 'stdin' in kwargs: if 'stdin' in kwargs:
raise ValueError('stdin and input arguments may not both be used.') raise ValueError('stdin and input arguments may not both be used.')
kwargs['stdin'] = PIPE
if capture_output: if capture_output:
if ('stdout' in kwargs) or ('stderr' in kwargs): if ('stdout' in kwargs) or ('stderr' in kwargs):
raise ValueError('stdout and stderr arguments may not be used ' raise ValueError('stdout and stderr arguments may not be used '
'with capture_output.') 'with capture_output.')
kwargs['stdout'] = PIPE
kwargs['stderr'] = PIPE
with Popen(*popenargs, **kwargs) as process: return subprocess_run(*args, input=input, capture_output=capture_output, text=text, timeout=timeout, check=check, **kwargs)
try:
stdout, stderr = process.communicate(input, timeout=timeout)
except TimeoutExpired:
process.kill()
try:
stdout, stderr = process.communicate(input, timeout=2)
except:
pass
raise TimeoutExpired(popenargs[0][0], timeout)
except BaseException:
process.kill()
# We don't call process.wait() as .__exit__ does that for us.
raise
retcode = process.poll()
if check and retcode:
raise CalledProcessError(retcode, process.args,
output=stdout, stderr=stderr)
return CompletedProcess(process.args, retcode, stdout, stderr)
@enforce_types
def atomic_write(path: str, contents: Union[dict, str, bytes], overwrite: bool=True) -> None: def atomic_write(path: Union[Path, str], contents: Union[dict, str, bytes], overwrite: bool=True) -> None:
"""Safe atomic write to filesystem by writing to temp file + atomic rename""" """Safe atomic write to filesystem by writing to temp file + atomic rename"""
with awrite(path, overwrite=overwrite) as f: mode = 'wb+' if isinstance(contents, bytes) else 'w'
# print('\n> Atomic Write:', mode, path, len(contents), f'overwrite={overwrite}')
with lib_atomic_write(path, mode=mode, overwrite=overwrite) as f:
if isinstance(contents, dict): if isinstance(contents, dict):
pyjson.dump(contents, f, indent=4, sort_keys=True, cls=ExtendedEncoder) dump(contents, f, indent=4, sort_keys=True, cls=ExtendedEncoder)
else: elif isinstance(contents, (bytes, str)):
f.write(contents) f.write(contents)
@enforce_types @enforce_types
@ -76,7 +51,7 @@ def chmod_file(path: str, cwd: str='.', permissions: str=OUTPUT_PERMISSIONS, tim
if not os.path.exists(os.path.join(cwd, path)): if not os.path.exists(os.path.join(cwd, path)):
raise Exception('Failed to chmod: {} does not exist (did the previous step fail?)'.format(path)) raise Exception('Failed to chmod: {} does not exist (did the previous step fail?)'.format(path))
chmod_result = run(['chmod', '-R', permissions, path], cwd=cwd, stdout=DEVNULL, stderr=PIPE, timeout=timeout) chmod_result = run(['chmod', '-R', permissions, path], cwd=cwd, timeout=timeout)
if chmod_result.returncode == 1: if chmod_result.returncode == 1:
print(' ', chmod_result.stderr.decode()) print(' ', chmod_result.stderr.decode())
raise Exception('Failed to chmod {}/{}'.format(cwd, path)) raise Exception('Failed to chmod {}/{}'.format(cwd, path))