1
0
Fork 0
mirror of synced 2024-05-16 02:13:16 +12:00

Flip dedupe precedence order

This commit is contained in:
Ben Muthalaly 2024-03-01 14:50:32 -06:00
parent 4d9c5a7b4b
commit d74ddd42ae
8 changed files with 33 additions and 40 deletions

View file

@ -46,14 +46,14 @@ def save_archive_dot_org(link: Link, out_dir: Optional[Path]=None, timeout: int=
output: ArchiveOutput = 'archive.org.txt' output: ArchiveOutput = 'archive.org.txt'
archive_org_url = None archive_org_url = None
submit_url = 'https://web.archive.org/save/{}'.format(link.url) submit_url = 'https://web.archive.org/save/{}'.format(link.url)
# earlier options take precedence # later options take precedence
options = [ options = [
*CURL_ARGS,
*CURL_EXTRA_ARGS,
'--head', '--head',
'--max-time', str(timeout), '--max-time', str(timeout),
*(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []), *(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []),
*([] if CHECK_SSL_VALIDITY else ['--insecure']), *([] if CHECK_SSL_VALIDITY else ['--insecure']),
*CURL_EXTRA_ARGS,
*CURL_ARGS,
] ]
cmd = [ cmd = [
CURL_BINARY, CURL_BINARY,

View file

@ -39,14 +39,14 @@ def save_favicon(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT)
out_dir = out_dir or link.link_dir out_dir = out_dir or link.link_dir
output: ArchiveOutput = 'favicon.ico' output: ArchiveOutput = 'favicon.ico'
# earlier options take precedence # later options take precedence
options = [ options = [
*CURL_ARGS,
*CURL_EXTRA_ARGS,
'--max-time', str(timeout), '--max-time', str(timeout),
'--output', str(output), '--output', str(output),
*(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []), *(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []),
*([] if CHECK_SSL_VALIDITY else ['--insecure']), *([] if CHECK_SSL_VALIDITY else ['--insecure']),
*CURL_EXTRA_ARGS,
*CURL_ARGS,
] ]
cmd = [ cmd = [
CURL_BINARY, CURL_BINARY,

View file

@ -42,14 +42,14 @@ def save_headers(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT)
status = 'succeeded' status = 'succeeded'
timer = TimedProgress(timeout, prefix=' ') timer = TimedProgress(timeout, prefix=' ')
# earlier options take precedence # later options take precedence
options = [ options = [
*CURL_ARGS,
*CURL_EXTRA_ARGS,
'--head', '--head',
'--max-time', str(timeout), '--max-time', str(timeout),
*(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []), *(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []),
*([] if CHECK_SSL_VALIDITY else ['--insecure']), *([] if CHECK_SSL_VALIDITY else ['--insecure']),
*CURL_EXTRA_ARGS,
*CURL_ARGS,
] ]
cmd = [ cmd = [
CURL_BINARY, CURL_BINARY,

View file

@ -41,11 +41,12 @@ def save_media(link: Link, out_dir: Optional[Path]=None, timeout: int=MEDIA_TIME
output: ArchiveOutput = 'media' output: ArchiveOutput = 'media'
output_path = out_dir / output output_path = out_dir / output
output_path.mkdir(exist_ok=True) output_path.mkdir(exist_ok=True)
# later options take precedence
options = [ options = [
*YOUTUBEDL_ARGS,
*YOUTUBEDL_EXTRA_ARGS,
*([] if CHECK_SSL_VALIDITY else ['--no-check-certificate']), *([] if CHECK_SSL_VALIDITY else ['--no-check-certificate']),
# TODO: add --cookies-from-browser={CHROME_USER_DATA_DIR} # TODO: add --cookies-from-browser={CHROME_USER_DATA_DIR}
*YOUTUBEDL_EXTRA_ARGS,
*YOUTUBEDL_ARGS,
] ]
cmd = [ cmd = [
YOUTUBEDL_BINARY, YOUTUBEDL_BINARY,

View file

@ -48,18 +48,12 @@ def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO
# SingleFile CLI Docs: https://github.com/gildas-lormeau/SingleFile/tree/master/cli # SingleFile CLI Docs: https://github.com/gildas-lormeau/SingleFile/tree/master/cli
browser_args = '--browser-args={}'.format(json.dumps(browser_args[1:])) browser_args = '--browser-args={}'.format(json.dumps(browser_args[1:]))
# later options take precedence
# Deduplicate options (single-file doesn't like when you use the same option two times)
#
# NOTE: Option names that come first clobber conflicting names that come later
# My logic is SINGLEFILE_ARGS is the option that affects the singlefile command with the most
# specificity, therefore the user sets it with a lot of intent, therefore it should take precedence,
# kind of like the ergonomic principle of lexical scope in programming languages.
options = [ options = [
'--browser-executable-path={}'.format(CHROME_BINARY),
browser_args,
*SINGLEFILE_EXTRA_ARGS,
*SINGLEFILE_ARGS, *SINGLEFILE_ARGS,
*SINGLEFILE_EXTRA_ARGS,
browser_args,
'--browser-executable-path={}'.format(CHROME_BINARY),
] ]
cmd = [ cmd = [
DEPENDENCIES['SINGLEFILE_BINARY']['path'], DEPENDENCIES['SINGLEFILE_BINARY']['path'],

View file

@ -104,13 +104,13 @@ def save_title(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -
from core.models import Snapshot from core.models import Snapshot
output: ArchiveOutput = None output: ArchiveOutput = None
# earlier options take precedence # later options take precedence
options = [ options = [
*CURL_ARGS,
*CURL_EXTRA_ARGS,
'--max-time', str(timeout), '--max-time', str(timeout),
*(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []), *(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []),
*([] if CHECK_SSL_VALIDITY else ['--insecure']), *([] if CHECK_SSL_VALIDITY else ['--insecure']),
*CURL_EXTRA_ARGS,
*CURL_ARGS,
] ]
cmd = [ cmd = [
CURL_BINARY, CURL_BINARY,

View file

@ -57,8 +57,10 @@ def save_wget(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) ->
# WGET CLI Docs: https://www.gnu.org/software/wget/manual/wget.html # WGET CLI Docs: https://www.gnu.org/software/wget/manual/wget.html
output: ArchiveOutput = None output: ArchiveOutput = None
# earlier options take precedence # later options take precedence
options = [ options = [
*WGET_ARGS,
*WGET_EXTRA_ARGS,
'--timeout={}'.format(timeout), '--timeout={}'.format(timeout),
*(['--restrict-file-names={}'.format(RESTRICT_FILE_NAMES)] if RESTRICT_FILE_NAMES else []), *(['--restrict-file-names={}'.format(RESTRICT_FILE_NAMES)] if RESTRICT_FILE_NAMES else []),
*(['--warc-file={}'.format(str(warc_path))] if SAVE_WARC else []), *(['--warc-file={}'.format(str(warc_path))] if SAVE_WARC else []),
@ -69,8 +71,6 @@ def save_wget(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) ->
*([] if SAVE_WARC else ['--timestamping']), *([] if SAVE_WARC else ['--timestamping']),
*([] if CHECK_SSL_VALIDITY else ['--no-check-certificate', '--no-hsts']), *([] if CHECK_SSL_VALIDITY else ['--no-check-certificate', '--no-hsts']),
# '--server-response', # print headers for better error parsing # '--server-response', # print headers for better error parsing
*WGET_EXTRA_ARGS,
*WGET_ARGS,
] ]
cmd = [ cmd = [
WGET_BINARY, WGET_BINARY,

View file

@ -240,6 +240,8 @@ def chrome_args(**options) -> List[str]:
cmd_args = [options['CHROME_BINARY']] cmd_args = [options['CHROME_BINARY']]
cmd_args += CHROME_EXTRA_ARGS
if options['CHROME_HEADLESS']: if options['CHROME_HEADLESS']:
chrome_major_version = int(re.search(r'\s(\d+)\.\d', CHROME_VERSION)[1]) chrome_major_version = int(re.search(r'\s(\d+)\.\d', CHROME_VERSION)[1])
if chrome_major_version >= 111: if chrome_major_version >= 111:
@ -284,7 +286,6 @@ def chrome_args(**options) -> List[str]:
if options['CHROME_USER_DATA_DIR']: if options['CHROME_USER_DATA_DIR']:
cmd_args.append('--user-data-dir={}'.format(options['CHROME_USER_DATA_DIR'])) cmd_args.append('--user-data-dir={}'.format(options['CHROME_USER_DATA_DIR']))
cmd_args += CHROME_EXTRA_ARGS
return dedupe(*cmd_args) return dedupe(*cmd_args)
@ -324,20 +325,17 @@ def ansi_to_html(text):
@enforce_types @enforce_types
def dedupe(*options: List[str]) -> List[str]: def dedupe(*options: str) -> List[str]:
""" """
Deduplicates the given options. Options that come earlier in the list clobber Deduplicates the given options. Options that come later clobber earlier
later conflicting options. conflicting options.
""" """
seen_option_names = [] deduped = {}
def test_seen(argument):
option_name = argument.split("=")[0] for option in options:
if option_name in seen_option_names: deduped[option.split('=')[0]] = option
return False
else: return list(deduped.values())
seen_option_names.append(option_name)
return True
return list(filter(test_seen, options))
class AttributeDict(dict): class AttributeDict(dict):