
Merge branch 'dev' into plugins-browsertrix

Nick Sweeting 2024-03-15 15:45:56 -07:00 committed by GitHub
commit 11b067a1ae
25 changed files with 566 additions and 165 deletions

View file

@ -152,6 +152,8 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
'CHROME_TIMEOUT': {'type': int, 'default': 0},
'CHROME_HEADLESS': {'type': bool, 'default': True},
'CHROME_SANDBOX': {'type': bool, 'default': lambda c: not c['IN_DOCKER']},
'CHROME_EXTRA_ARGS': {'type': list, 'default': None},
'YOUTUBEDL_ARGS': {'type': list, 'default': lambda c: [
'--restrict-filenames',
'--trim-filenames', '128',
@ -176,6 +178,7 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
'--add-metadata',
'--format=(bv*+ba/b)[filesize<={}][filesize_approx<=?{}]/(bv*+ba/b)'.format(c['MEDIA_MAX_SIZE'], c['MEDIA_MAX_SIZE']),
]},
'YOUTUBEDL_EXTRA_ARGS': {'type': list, 'default': None},
'WGET_ARGS': {'type': list, 'default': ['--no-verbose',
@ -187,12 +190,17 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
'--no-parent',
'-e', 'robots=off',
]},
'WGET_EXTRA_ARGS': {'type': list, 'default': None},
'CURL_ARGS': {'type': list, 'default': ['--silent',
'--location',
'--compressed'
]},
'CURL_EXTRA_ARGS': {'type': list, 'default': None},
'GIT_ARGS': {'type': list, 'default': ['--recursive']},
'SINGLEFILE_ARGS': {'type': list, 'default' : None},
'SINGLEFILE_ARGS': {'type': list, 'default': None},
'SINGLEFILE_EXTRA_ARGS': {'type': list, 'default': None},
'MERCURY_ARGS': {'type': list, 'default': ['--format=text']},
'MERCURY_EXTRA_ARGS': {'type': list, 'default': None},
'FAVICON_PROVIDER': {'type': str, 'default': 'https://www.google.com/s2/favicons?domain={}'},
},
@ -500,7 +508,7 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
'LOGS_DIR': {'default': lambda c: c['OUTPUT_DIR'] / LOGS_DIR_NAME},
'CONFIG_FILE': {'default': lambda c: Path(c['CONFIG_FILE']).resolve() if c['CONFIG_FILE'] else c['OUTPUT_DIR'] / CONFIG_FILENAME},
'COOKIES_FILE': {'default': lambda c: c['COOKIES_FILE'] and Path(c['COOKIES_FILE']).resolve()},
'CHROME_USER_DATA_DIR': {'default': lambda c: find_chrome_data_dir() if c['CHROME_USER_DATA_DIR'] is None else (Path(c['CHROME_USER_DATA_DIR']).resolve() if c['CHROME_USER_DATA_DIR'] else None)}, # None means unset, so we autodetect it with find_chrome_Data_dir(), but emptystring '' means user manually set it to '', and we should store it as None
'CHROME_USER_DATA_DIR': {'default': lambda c: Path(c['CHROME_USER_DATA_DIR']).resolve() if c['CHROME_USER_DATA_DIR'] else None},
'URL_DENYLIST_PTN': {'default': lambda c: c['URL_DENYLIST'] and re.compile(c['URL_DENYLIST'] or '', ALLOWDENYLIST_REGEX_FLAGS)},
'URL_ALLOWLIST_PTN': {'default': lambda c: c['URL_ALLOWLIST'] and re.compile(c['URL_ALLOWLIST'] or '', ALLOWDENYLIST_REGEX_FLAGS)},
'DIR_OUTPUT_PERMISSIONS': {'default': lambda c: c['OUTPUT_PERMISSIONS'].replace('6', '7').replace('4', '5')}, # exec is always needed to list directories
@ -530,6 +538,7 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
'CURL_VERSION': {'default': lambda c: bin_version(c['CURL_BINARY']) if c['USE_CURL'] else None},
'CURL_USER_AGENT': {'default': lambda c: c['CURL_USER_AGENT'].format(**c)},
'CURL_ARGS': {'default': lambda c: c['CURL_ARGS'] or []},
'CURL_EXTRA_ARGS': {'default': lambda c: c['CURL_EXTRA_ARGS'] or []},
'SAVE_FAVICON': {'default': lambda c: c['USE_CURL'] and c['SAVE_FAVICON']},
'SAVE_ARCHIVE_DOT_ORG': {'default': lambda c: c['USE_CURL'] and c['SAVE_ARCHIVE_DOT_ORG']},
@ -540,18 +549,22 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
'SAVE_WGET': {'default': lambda c: c['USE_WGET'] and c['SAVE_WGET']},
'SAVE_WARC': {'default': lambda c: c['USE_WGET'] and c['SAVE_WARC']},
'WGET_ARGS': {'default': lambda c: c['WGET_ARGS'] or []},
'WGET_EXTRA_ARGS': {'default': lambda c: c['WGET_EXTRA_ARGS'] or []},
'RIPGREP_VERSION': {'default': lambda c: bin_version(c['RIPGREP_BINARY']) if c['USE_RIPGREP'] else None},
'USE_SINGLEFILE': {'default': lambda c: c['USE_SINGLEFILE'] and c['SAVE_SINGLEFILE']},
'SINGLEFILE_VERSION': {'default': lambda c: bin_version(c['SINGLEFILE_BINARY']) if c['USE_SINGLEFILE'] else None},
'SINGLEFILE_ARGS': {'default': lambda c: c['SINGLEFILE_ARGS'] or []},
'SINGLEFILE_EXTRA_ARGS': {'default': lambda c: c['SINGLEFILE_EXTRA_ARGS'] or []},
'USE_READABILITY': {'default': lambda c: c['USE_READABILITY'] and c['SAVE_READABILITY']},
'READABILITY_VERSION': {'default': lambda c: bin_version(c['READABILITY_BINARY']) if c['USE_READABILITY'] else None},
'USE_MERCURY': {'default': lambda c: c['USE_MERCURY'] and c['SAVE_MERCURY']},
'MERCURY_VERSION': {'default': lambda c: '1.0.0' if shutil.which(str(bin_path(c['MERCURY_BINARY']))) else None}, # mercury doesnt expose version info until this is merged https://github.com/postlight/parser/pull/750
'MERCURY_ARGS': {'default': lambda c: c['MERCURY_ARGS'] or []},
'MERCURY_EXTRA_ARGS': {'default': lambda c: c['MERCURY_EXTRA_ARGS'] or []},
'USE_GIT': {'default': lambda c: c['USE_GIT'] and c['SAVE_GIT']},
'GIT_VERSION': {'default': lambda c: bin_version(c['GIT_BINARY']) if c['USE_GIT'] else None},
@ -561,6 +574,7 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
'YOUTUBEDL_VERSION': {'default': lambda c: bin_version(c['YOUTUBEDL_BINARY']) if c['USE_YOUTUBEDL'] else None},
'SAVE_MEDIA': {'default': lambda c: c['USE_YOUTUBEDL'] and c['SAVE_MEDIA']},
'YOUTUBEDL_ARGS': {'default': lambda c: c['YOUTUBEDL_ARGS'] or []},
'YOUTUBEDL_EXTRA_ARGS': {'default': lambda c: c['YOUTUBEDL_EXTRA_ARGS'] or []},
'CHROME_BINARY': {'default': lambda c: c['CHROME_BINARY'] or find_chrome_binary()},
'USE_CHROME': {'default': lambda c: c['USE_CHROME'] and c['CHROME_BINARY'] and (c['SAVE_PDF'] or c['SAVE_SCREENSHOT'] or c['SAVE_DOM'] or c['SAVE_SINGLEFILE'])},
@ -582,6 +596,7 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
'EXTERNAL_LOCATIONS': {'default': lambda c: get_external_locations(c)},
'DATA_LOCATIONS': {'default': lambda c: get_data_locations(c)},
'CHROME_OPTIONS': {'default': lambda c: get_chrome_info(c)},
'CHROME_EXTRA_ARGS': {'default': lambda c: c['CHROME_EXTRA_ARGS'] or []},
'SAVE_ALLOWLIST_PTN': {'default': lambda c: c['SAVE_ALLOWLIST'] and {re.compile(k, ALLOWDENYLIST_REGEX_FLAGS): v for k, v in c['SAVE_ALLOWLIST'].items()}},
'SAVE_DENYLIST_PTN': {'default': lambda c: c['SAVE_DENYLIST'] and {re.compile(k, ALLOWDENYLIST_REGEX_FLAGS): v for k, v in c['SAVE_DENYLIST'].items()}},
}
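
For illustration only (not part of this commit), a minimal sketch of how the new *_EXTRA_ARGS settings resolve through the two schemas above: the static default is None until the user sets a value, and the dynamic `or []` lambdas coerce an unset value to an empty list so the extractors can always splat it:

# hedged sketch: mirrors the static default (None) plus the dynamic `or []` coercion shown above
c = {'CURL_EXTRA_ARGS': None}                       # static default: user never set it
c['CURL_EXTRA_ARGS'] = c['CURL_EXTRA_ARGS'] or []   # dynamic default: becomes []
assert c['CURL_EXTRA_ARGS'] == []

c = {'CURL_EXTRA_ARGS': ['--user-agent=MyArchiver/1.0']}   # hypothetical user value
c['CURL_EXTRA_ARGS'] = c['CURL_EXTRA_ARGS'] or []          # passes through unchanged
assert c['CURL_EXTRA_ARGS'] == ['--user-agent=MyArchiver/1.0']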
@ -920,27 +935,36 @@ def find_chrome_binary() -> Optional[str]:
def find_chrome_data_dir() -> Optional[str]:
"""find any installed chrome user data directories in the default locations"""
# Precedence: Chromium, Chrome, Beta, Canary, Unstable, Dev
# make sure data dir finding precedence order always matches binary finding order
default_profile_paths = (
'~/.config/chromium',
'~/Library/Application Support/Chromium',
'~/AppData/Local/Chromium/User Data',
'~/.config/chrome',
'~/.config/google-chrome',
'~/Library/Application Support/Google/Chrome',
'~/AppData/Local/Google/Chrome/User Data',
'~/.config/google-chrome-stable',
'~/.config/google-chrome-beta',
'~/Library/Application Support/Google/Chrome Canary',
'~/AppData/Local/Google/Chrome SxS/User Data',
'~/.config/google-chrome-unstable',
'~/.config/google-chrome-dev',
)
for path in default_profile_paths:
full_path = Path(path).resolve()
if full_path.exists():
return full_path
# deprecated because this is DANGEROUS, do not re-implement/uncomment this behavior.
# Going forward we want to discourage people from using their main chrome profile for archiving.
# Session tokens, personal data, and cookies are often returned in server responses;
# once those responses are archived, that data is essentially burned, because anyone who can view
# the archive can use it to masquerade as the logged-in user that did the archiving.
# For this reason users should always create dedicated burner profiles for archiving and not use
# their daily-driver main accounts.
# # Precedence: Chromium, Chrome, Beta, Canary, Unstable, Dev
# # make sure data dir finding precedence order always matches binary finding order
# default_profile_paths = (
# '~/.config/chromium',
# '~/Library/Application Support/Chromium',
# '~/AppData/Local/Chromium/User Data',
# '~/.config/chrome',
# '~/.config/google-chrome',
# '~/Library/Application Support/Google/Chrome',
# '~/AppData/Local/Google/Chrome/User Data',
# '~/.config/google-chrome-stable',
# '~/.config/google-chrome-beta',
# '~/Library/Application Support/Google/Chrome Canary',
# '~/AppData/Local/Google/Chrome SxS/User Data',
# '~/.config/google-chrome-unstable',
# '~/.config/google-chrome-dev',
# )
# for path in default_profile_paths:
# full_path = Path(path).resolve()
# if full_path.exists():
# return full_path
return None
def wget_supports_compression(config):

View file

@ -10,10 +10,12 @@ from ..system import run, chmod_file
from ..util import (
enforce_types,
is_static_file,
dedupe,
)
from ..config import (
TIMEOUT,
CURL_ARGS,
CURL_EXTRA_ARGS,
CHECK_SSL_VALIDITY,
SAVE_ARCHIVE_DOT_ORG,
CURL_BINARY,
@ -44,13 +46,18 @@ def save_archive_dot_org(link: Link, out_dir: Optional[Path]=None, timeout: int=
output: ArchiveOutput = 'archive.org.txt'
archive_org_url = None
submit_url = 'https://web.archive.org/save/{}'.format(link.url)
cmd = [
CURL_BINARY,
# later options take precedence
options = [
*CURL_ARGS,
*CURL_EXTRA_ARGS,
'--head',
'--max-time', str(timeout),
*(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []),
*([] if CHECK_SSL_VALIDITY else ['--insecure']),
]
cmd = [
CURL_BINARY,
*dedupe(options),
submit_url,
]
status = 'succeeded'
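
To make the "later options take precedence" comment concrete, here is a hedged sketch (hypothetical user config, not part of this file) of how the options list is assembled and deduplicated for this extractor:

# assumes CURL_ARGS at its defaults and a user-supplied CURL_EXTRA_ARGS
CURL_ARGS = ['--silent', '--location', '--compressed']
CURL_EXTRA_ARGS = ['--compressed', '--proxy=socks5h://127.0.0.1:9050']  # hypothetical

options = [*CURL_ARGS, *CURL_EXTRA_ARGS, '--head', '--max-time', '60']
deduped = {}
for option in options:               # same keying logic as util.dedupe() further down
    deduped[option.split('=')[0]] = option

cmd = ['curl', *deduped.values(), 'https://web.archive.org/save/https://example.com']
# -> ['curl', '--silent', '--location', '--compressed',
#     '--proxy=socks5h://127.0.0.1:9050', '--head', '--max-time', '60',
#     'https://web.archive.org/save/https://example.com']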

View file

@ -6,13 +6,18 @@ from typing import Optional
from ..index.schema import Link, ArchiveResult, ArchiveOutput
from ..system import chmod_file, run
from ..util import enforce_types, domain
from ..util import (
enforce_types,
domain,
dedupe,
)
from ..config import (
TIMEOUT,
SAVE_FAVICON,
FAVICON_PROVIDER,
CURL_BINARY,
CURL_ARGS,
CURL_EXTRA_ARGS,
CURL_VERSION,
CHECK_SSL_VALIDITY,
CURL_USER_AGENT,
@ -34,13 +39,18 @@ def save_favicon(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT)
out_dir = out_dir or link.link_dir
output: ArchiveOutput = 'favicon.ico'
cmd = [
CURL_BINARY,
# later options take precedence
options = [
*CURL_ARGS,
*CURL_EXTRA_ARGS,
'--max-time', str(timeout),
'--output', str(output),
*(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []),
*([] if CHECK_SSL_VALIDITY else ['--insecure']),
]
cmd = [
CURL_BINARY,
*dedupe(options),
FAVICON_PROVIDER.format(domain(link.url)),
]
status = 'failed'

View file

@ -9,11 +9,13 @@ from ..system import atomic_write
from ..util import (
enforce_types,
get_headers,
dedupe,
)
from ..config import (
TIMEOUT,
CURL_BINARY,
CURL_ARGS,
CURL_EXTRA_ARGS,
CURL_USER_AGENT,
CURL_VERSION,
CHECK_SSL_VALIDITY,
@ -40,14 +42,18 @@ def save_headers(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT)
status = 'succeeded'
timer = TimedProgress(timeout, prefix=' ')
cmd = [
CURL_BINARY,
# later options take precedence
options = [
*CURL_ARGS,
*CURL_EXTRA_ARGS,
'--head',
'--max-time', str(timeout),
*(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []),
*([] if CHECK_SSL_VALIDITY else ['--insecure']),
]
cmd = [
CURL_BINARY,
*dedupe(options),
link.url,
]
try:

View file

@ -8,11 +8,13 @@ from ..system import run, chmod_file
from ..util import (
enforce_types,
is_static_file,
dedupe,
)
from ..config import (
MEDIA_TIMEOUT,
SAVE_MEDIA,
YOUTUBEDL_ARGS,
YOUTUBEDL_EXTRA_ARGS,
YOUTUBEDL_BINARY,
YOUTUBEDL_VERSION,
CHECK_SSL_VALIDITY
@ -39,11 +41,16 @@ def save_media(link: Link, out_dir: Optional[Path]=None, timeout: int=MEDIA_TIME
output: ArchiveOutput = 'media'
output_path = out_dir / output
output_path.mkdir(exist_ok=True)
cmd = [
YOUTUBEDL_BINARY,
# later options take precedence
options = [
*YOUTUBEDL_ARGS,
*YOUTUBEDL_EXTRA_ARGS,
*([] if CHECK_SSL_VALIDITY else ['--no-check-certificate']),
# TODO: add --cookies-from-browser={CHROME_USER_DATA_DIR}
]
cmd = [
YOUTUBEDL_BINARY,
*dedupe(options),
link.url,
]
status = 'succeeded'

View file

@ -11,13 +11,15 @@ from ..system import run, atomic_write
from ..util import (
enforce_types,
is_static_file,
dedupe,
)
from ..config import (
TIMEOUT,
SAVE_MERCURY,
DEPENDENCIES,
MERCURY_VERSION,
MERCURY_ARGS,
MERCURY_EXTRA_ARGS,
)
from ..logging_util import TimedProgress
@ -60,12 +62,16 @@ def save_mercury(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT)
timer = TimedProgress(timeout, prefix=' ')
try:
output_folder.mkdir(exist_ok=True)
# Get plain text version of article
# later options take precedence
options = [
*MERCURY_ARGS,
*MERCURY_EXTRA_ARGS,
]
# By default, get plain text version of article
cmd = [
DEPENDENCIES['MERCURY_BINARY']['path'],
link.url,
"--format=text"
*dedupe(options)
]
result = run(cmd, cwd=out_dir, timeout=timeout)
try:

View file

@ -11,6 +11,7 @@ from ..util import (
enforce_types,
is_static_file,
chrome_args,
dedupe,
)
from ..config import (
TIMEOUT,
@ -18,6 +19,7 @@ from ..config import (
DEPENDENCIES,
SINGLEFILE_VERSION,
SINGLEFILE_ARGS,
SINGLEFILE_EXTRA_ARGS,
CHROME_BINARY,
)
from ..logging_util import TimedProgress
@ -46,31 +48,16 @@ def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO
# SingleFile CLI Docs: https://github.com/gildas-lormeau/SingleFile/tree/master/cli
browser_args = '--browser-args={}'.format(json.dumps(browser_args[1:]))
# later options take precedence
options = [
*SINGLEFILE_ARGS,
'--browser-executable-path={}'.format(CHROME_BINARY),
*SINGLEFILE_EXTRA_ARGS,
browser_args,
'--browser-executable-path={}'.format(CHROME_BINARY),
]
# Deduplicate options (single-file doesn't like when you use the same option two times)
#
# NOTE: Options names that come first clobber conflicting names that come later
# My logic is SINGLEFILE_ARGS is the option that affects the singlefile command with most
# specificity, therefore the user sets it with a lot intent, therefore it should take precedence
# kind of like the ergonomic principle of lexical scope in programming languages.
seen_option_names = []
def test_seen(argument):
option_name = argument.split("=")[0]
if option_name in seen_option_names:
return False
else:
seen_option_names.append(option_name)
return True
deduped_options = list(filter(test_seen, options))
cmd = [
DEPENDENCIES['SINGLEFILE_BINARY']['path'],
*deduped_options,
*dedupe(options),
link.url,
output,
]

View file

@ -10,6 +10,7 @@ from ..util import (
enforce_types,
download_url,
htmldecode,
dedupe,
)
from ..config import (
TIMEOUT,
@ -17,6 +18,7 @@ from ..config import (
SAVE_TITLE,
CURL_BINARY,
CURL_ARGS,
CURL_EXTRA_ARGS,
CURL_VERSION,
CURL_USER_AGENT,
)
@ -102,12 +104,17 @@ def save_title(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -
from core.models import Snapshot
output: ArchiveOutput = None
cmd = [
CURL_BINARY,
# later options take precedence
options = [
*CURL_ARGS,
*CURL_EXTRA_ARGS,
'--max-time', str(timeout),
*(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []),
*([] if CHECK_SSL_VALIDITY else ['--insecure']),
]
cmd = [
CURL_BINARY,
*dedupe(options),
link.url,
]
status = 'succeeded'

View file

@ -15,9 +15,11 @@ from ..util import (
path,
domain,
urldecode,
dedupe,
)
from ..config import (
WGET_ARGS,
WGET_EXTRA_ARGS,
TIMEOUT,
SAVE_WGET,
SAVE_WARC,
@ -55,10 +57,10 @@ def save_wget(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) ->
# WGET CLI Docs: https://www.gnu.org/software/wget/manual/wget.html
output: ArchiveOutput = None
cmd = [
WGET_BINARY,
# '--server-response', # print headers for better error parsing
# later options take precedence
options = [
*WGET_ARGS,
*WGET_EXTRA_ARGS,
'--timeout={}'.format(timeout),
*(['--restrict-file-names={}'.format(RESTRICT_FILE_NAMES)] if RESTRICT_FILE_NAMES else []),
*(['--warc-file={}'.format(str(warc_path))] if SAVE_WARC else []),
@ -68,6 +70,11 @@ def save_wget(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) ->
*(['--compression=auto'] if WGET_AUTO_COMPRESSION else []),
*([] if SAVE_WARC else ['--timestamping']),
*([] if CHECK_SSL_VALIDITY else ['--no-check-certificate', '--no-hsts']),
# '--server-response', # print headers for better error parsing
]
cmd = [
WGET_BINARY,
*dedupe(options),
link.url,
]

View file

@ -44,6 +44,7 @@ from . import medium_rss
from . import netscape_html
from . import generic_rss
from . import generic_json
from . import generic_jsonl
from . import generic_html
from . import generic_txt
from . import url_list
@ -63,6 +64,7 @@ PARSERS = {
netscape_html.KEY: (netscape_html.NAME, netscape_html.PARSER),
generic_rss.KEY: (generic_rss.NAME, generic_rss.PARSER),
generic_json.KEY: (generic_json.NAME, generic_json.PARSER),
generic_jsonl.KEY: (generic_jsonl.NAME, generic_jsonl.PARSER),
generic_html.KEY: (generic_html.NAME, generic_html.PARSER),
# Catchall fallback parser

View file

@ -11,6 +11,60 @@ from ..util import (
enforce_types,
)
# This gets used by generic_jsonl, too
def jsonObjectToLink(link: dict, source: str):
json_date = lambda s: datetime.strptime(s, '%Y-%m-%dT%H:%M:%S%z')
# example line
# {"href":"http:\/\/www.reddit.com\/r\/example","description":"title here","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"reddit android"}]
# Parse URL
url = link.get('href') or link.get('url') or link.get('URL')
if not url:
raise Exception('JSON must contain URL in each entry [{"url": "http://...", ...}, ...]')
# Parse the timestamp
ts_str = str(datetime.now(timezone.utc).timestamp())
if link.get('timestamp'):
# chrome/ff histories use a very precise timestamp
ts_str = str(link['timestamp'] / 10000000)
elif link.get('time'):
ts_str = str(json_date(link['time'].split(',', 1)[0]).timestamp())
elif link.get('created_at'):
ts_str = str(json_date(link['created_at']).timestamp())
elif link.get('created'):
ts_str = str(json_date(link['created']).timestamp())
elif link.get('date'):
ts_str = str(json_date(link['date']).timestamp())
elif link.get('bookmarked'):
ts_str = str(json_date(link['bookmarked']).timestamp())
elif link.get('saved'):
ts_str = str(json_date(link['saved']).timestamp())
# Parse the title
title = None
if link.get('title'):
title = link['title'].strip()
elif link.get('description'):
title = link['description'].replace(' — Readability', '').strip()
elif link.get('name'):
title = link['name'].strip()
# if we have a list, join it with commas
tags = link.get('tags')
if type(tags) == list:
tags = ','.join(tags)
elif type(tags) == str:
# if there's no comma, assume it was space-separated
if ',' not in tags:
tags = tags.replace(' ', ',')
return Link(
url=htmldecode(url),
timestamp=ts_str,
title=htmldecode(title) or None,
tags=htmldecode(tags),
sources=[source],
)
@enforce_types
def parse_generic_json_export(json_file: IO[str], **_kwargs) -> Iterable[Link]:
@ -18,55 +72,21 @@ def parse_generic_json_export(json_file: IO[str], **_kwargs) -> Iterable[Link]:
json_file.seek(0)
# sometimes the first line is a comment or filepath, so we get everything after the first {
json_file_json_str = '{' + json_file.read().split('{', 1)[-1]
links = json.loads(json_file_json_str)
json_date = lambda s: datetime.strptime(s, '%Y-%m-%dT%H:%M:%S%z')
try:
links = json.load(json_file)
if type(links) != list:
raise Exception('JSON parser expects list of objects, maybe this is JSONL?')
except json.decoder.JSONDecodeError:
# sometimes the first line is a comment or other junk, so try without
json_file.seek(0)
first_line = json_file.readline()
#print(' > Trying JSON parser without first line: "', first_line.strip(), '"', sep= '')
links = json.load(json_file)
# we may fail again, which means we really don't know what to do
for link in links:
# example line
# {"href":"http:\/\/www.reddit.com\/r\/example","description":"title here","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"reddit android"}]
if link:
# Parse URL
url = link.get('href') or link.get('url') or link.get('URL')
if not url:
raise Exception('JSON must contain URL in each entry [{"url": "http://...", ...}, ...]')
# Parse the timestamp
ts_str = str(datetime.now(timezone.utc).timestamp())
if link.get('timestamp'):
# chrome/ff histories use a very precise timestamp
ts_str = str(link['timestamp'] / 10000000)
elif link.get('time'):
ts_str = str(json_date(link['time'].split(',', 1)[0]).timestamp())
elif link.get('created_at'):
ts_str = str(json_date(link['created_at']).timestamp())
elif link.get('created'):
ts_str = str(json_date(link['created']).timestamp())
elif link.get('date'):
ts_str = str(json_date(link['date']).timestamp())
elif link.get('bookmarked'):
ts_str = str(json_date(link['bookmarked']).timestamp())
elif link.get('saved'):
ts_str = str(json_date(link['saved']).timestamp())
# Parse the title
title = None
if link.get('title'):
title = link['title'].strip()
elif link.get('description'):
title = link['description'].replace(' — Readability', '').strip()
elif link.get('name'):
title = link['name'].strip()
yield Link(
url=htmldecode(url),
timestamp=ts_str,
title=htmldecode(title) or None,
tags=htmldecode(link.get('tags')) or '',
sources=[json_file.name],
)
yield jsonObjectToLink(link,json_file.name)
KEY = 'json'
NAME = 'Generic JSON'
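
A short usage sketch (not part of the diff) of the shared jsonObjectToLink() helper above, fed the Pinboard-style example line from its comment ('bookmarks.json' is a hypothetical source name):

import json
from archivebox.parsers.generic_json import jsonObjectToLink

entry = json.loads(
    '{"href": "http://www.reddit.com/r/example", "description": "title here", '
    '"time": "2014-06-14T15:51:42Z", "tags": "reddit android"}'
)
link = jsonObjectToLink(entry, 'bookmarks.json')
# link.url       == 'http://www.reddit.com/r/example'
# link.title     == 'title here'       (falls back to the "description" field)
# link.tags      == 'reddit,android'   (space-separated tags become comma-separated)
# link.timestamp == epoch-seconds string parsed from the "time" field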

View file

@ -0,0 +1,34 @@
__package__ = 'archivebox.parsers'
import json
from typing import IO, Iterable
from datetime import datetime, timezone
from ..index.schema import Link
from ..util import (
htmldecode,
enforce_types,
)
from .generic_json import jsonObjectToLink
def parse_line(line: str):
if line.strip() != "":
return json.loads(line)
@enforce_types
def parse_generic_jsonl_export(json_file: IO[str], **_kwargs) -> Iterable[Link]:
"""Parse JSONL format bookmarks export files"""
json_file.seek(0)
links = [ parse_line(line) for line in json_file ]
for link in links:
if link:
yield jsonObjectToLink(link,json_file.name)
KEY = 'jsonl'
NAME = 'Generic JSONL'
PARSER = parse_generic_jsonl_export
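
And a minimal usage sketch for the new JSONL parser (hypothetical file path); blank lines are skipped because parse_line() returns None for them:

from archivebox.parsers.generic_jsonl import parse_generic_jsonl_export

with open('bookmarks.jsonl', 'r', encoding='utf-8') as f:   # one JSON object per line
    for link in parse_generic_jsonl_export(f):
        print(link.url, link.timestamp, link.tags)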

View file

@ -2,13 +2,13 @@ __package__ = 'archivebox.parsers'
from typing import IO, Iterable
from datetime import datetime
from time import mktime
from feedparser import parse as feedparser
from ..index.schema import Link
from ..util import (
htmldecode,
enforce_types,
str_between,
enforce_types
)
@enforce_types
@ -16,35 +16,27 @@ def parse_generic_rss_export(rss_file: IO[str], **_kwargs) -> Iterable[Link]:
"""Parse RSS XML-format files into links"""
rss_file.seek(0)
items = rss_file.read().split('<item>')
items = items[1:] if items else []
for item in items:
# example item:
# <item>
# <title><![CDATA[How JavaScript works: inside the V8 engine]]></title>
# <category>Unread</category>
# <link>https://blog.sessionstack.com/how-javascript-works-inside</link>
# <guid>https://blog.sessionstack.com/how-javascript-works-inside</guid>
# <pubDate>Mon, 21 Aug 2017 14:21:58 -0500</pubDate>
# </item>
feed = feedparser(rss_file.read())
for item in feed.entries:
url = item.link
title = item.title
time = mktime(item.updated_parsed)
trailing_removed = item.split('</item>', 1)[0]
leading_removed = trailing_removed.split('<item>', 1)[-1].strip()
rows = leading_removed.split('\n')
try:
tags = ','.join(map(lambda tag: tag.term, item.tags))
except AttributeError:
tags = ''
def get_row(key):
return [r for r in rows if r.strip().startswith('<{}>'.format(key))][0]
url = str_between(get_row('link'), '<link>', '</link>')
ts_str = str_between(get_row('pubDate'), '<pubDate>', '</pubDate>')
time = datetime.strptime(ts_str, "%a, %d %b %Y %H:%M:%S %z")
title = str_between(get_row('title'), '<![CDATA[', ']]').strip()
if url is None:
# Yielding a Link with no URL will
# crash on a URL validation assertion
continue
yield Link(
url=htmldecode(url),
timestamp=str(time.timestamp()),
timestamp=str(time),
title=htmldecode(title) or None,
tags=None,
tags=tags,
sources=[rss_file.name],
)
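
For reference, a hedged sketch (not part of the diff) of the feedparser attributes this rewrite relies on, using a tiny Atom feed like the new example.atom fixture:

from time import mktime
from feedparser import parse as feedparser

feed = feedparser(
    '<?xml version="1.0" encoding="utf-8"?>'
    '<feed xmlns="http://www.w3.org/2005/Atom"><entry>'
    '<title>Example</title>'
    '<link rel="alternate" href="http://127.0.0.1:8080/static/example.com.html"/>'
    '<updated>2024-02-26T03:18:26Z</updated>'
    '<category term="Tag1"/><category term="Tag2"/>'
    '</entry></feed>'
)
entry = feed.entries[0]
entry.link                            # 'http://127.0.0.1:8080/static/example.com.html'
entry.title                           # 'Example'
mktime(entry.updated_parsed)          # float timestamp (what gets stored as Link.timestamp)
','.join(t.term for t in entry.tags)  # 'Tag1,Tag2'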

View file

@ -2,50 +2,41 @@ __package__ = 'archivebox.parsers'
from typing import IO, Iterable
from datetime import datetime, timezone
from xml.etree import ElementTree
from time import mktime
from feedparser import parse as feedparser
from ..index.schema import Link
from ..util import (
htmldecode,
enforce_types,
enforce_types
)
@enforce_types
def parse_pinboard_rss_export(rss_file: IO[str], **_kwargs) -> Iterable[Link]:
"""Parse Pinboard RSS feed files into links"""
rss_file.seek(0)
root = ElementTree.parse(rss_file).getroot()
items = root.findall("{http://purl.org/rss/1.0/}item")
for item in items:
find = lambda p: item.find(p).text.strip() if item.find(p) is not None else None # type: ignore
feed = feedparser(rss_file.read())
for item in feed.entries:
url = item.link
# title will start with "[priv] " if pin was marked private. useful?
title = item.title
time = mktime(item.updated_parsed)
url = find("{http://purl.org/rss/1.0/}link")
tags = find("{http://purl.org/dc/elements/1.1/}subject")
title = find("{http://purl.org/rss/1.0/}title")
ts_str = find("{http://purl.org/dc/elements/1.1/}date")
# all tags are in one entry.tags with spaces in it. annoying!
try:
tags = item.tags[0].term.replace(' ', ',')
except AttributeError:
tags = ''
if url is None:
# Yielding a Link with no URL will
# crash on a URL validation assertion
continue
# Pinboard includes a colon in its date stamp timezone offsets, which
# Python can't parse. Remove it:
if ts_str and ts_str[-3:-2] == ":":
ts_str = ts_str[:-3]+ts_str[-2:]
if ts_str:
time = datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%S%z")
else:
time = datetime.now(timezone.utc)
yield Link(
url=htmldecode(url),
timestamp=str(time.timestamp()),
timestamp=str(time),
title=htmldecode(title) or None,
tags=htmldecode(tags) or None,
sources=[rss_file.name],

View file

@ -3,6 +3,7 @@ __package__ = 'archivebox'
import re
import requests
import json as pyjson
import http.cookiejar
from typing import List, Optional, Any
from pathlib import Path
@ -164,9 +165,22 @@ def parse_date(date: Any) -> Optional[datetime]:
@enforce_types
def download_url(url: str, timeout: int=None) -> str:
"""Download the contents of a remote url and return the text"""
from .config import TIMEOUT, CHECK_SSL_VALIDITY, WGET_USER_AGENT
from .config import (
TIMEOUT,
CHECK_SSL_VALIDITY,
WGET_USER_AGENT,
COOKIES_FILE,
)
timeout = timeout or TIMEOUT
response = requests.get(
session = requests.Session()
if COOKIES_FILE and Path(COOKIES_FILE).is_file():
cookie_jar = http.cookiejar.MozillaCookieJar(COOKIES_FILE)
cookie_jar.load(ignore_discard=True, ignore_expires=True)
for cookie in cookie_jar:
session.cookies.set(cookie.name, cookie.value, domain=cookie.domain, path=cookie.path)
response = session.get(
url,
headers={'User-Agent': WGET_USER_AGENT},
verify=CHECK_SSL_VALIDITY,
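
As a hedged aside (not part of the diff): COOKIES_FILE is expected to be in the Netscape/Mozilla cookies.txt format that http.cookiejar.MozillaCookieJar understands, e.g. as exported by a browser extension. A minimal self-check with a hypothetical file:

import http.cookiejar
from pathlib import Path

Path('cookies.txt').write_text(
    '# Netscape HTTP Cookie File\n'                                   # required magic header
    '.example.com\tTRUE\t/\tFALSE\t2145916800\tsessionid\tabc123\n'   # tab-separated fields
)
jar = http.cookiejar.MozillaCookieJar('cookies.txt')
jar.load(ignore_discard=True, ignore_expires=True)
assert [c.name for c in jar] == ['sessionid']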
@ -231,7 +245,11 @@ def chrome_args(**options) -> List[str]:
# Chrome CLI flag documentation: https://peter.sh/experiments/chromium-command-line-switches/
from .config import CHROME_OPTIONS, CHROME_VERSION
from .config import (
CHROME_OPTIONS,
CHROME_VERSION,
CHROME_EXTRA_ARGS,
)
options = {**CHROME_OPTIONS, **options}
@ -240,6 +258,8 @@ def chrome_args(**options) -> List[str]:
cmd_args = [options['CHROME_BINARY']]
cmd_args += CHROME_EXTRA_ARGS
if options['CHROME_HEADLESS']:
chrome_major_version = int(re.search(r'\s(\d+)\.\d', CHROME_VERSION)[1])
if chrome_major_version >= 111:
@ -283,8 +303,9 @@ def chrome_args(**options) -> List[str]:
if options['CHROME_USER_DATA_DIR']:
cmd_args.append('--user-data-dir={}'.format(options['CHROME_USER_DATA_DIR']))
return cmd_args
return dedupe(cmd_args)
def chrome_cleanup():
"""
@ -321,6 +342,20 @@ def ansi_to_html(text):
return COLOR_REGEX.sub(single_sub, text)
@enforce_types
def dedupe(options: List[str]) -> List[str]:
"""
Deduplicates the given options. Options that come later clobber earlier
conflicting options.
"""
deduped = {}
for option in options:
deduped[option.split('=')[0]] = option
return list(deduped.values())
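
A small usage sketch of the new helper (not part of the diff): dedupe() keys each option on the text before the first '=', so later '--name=value' options clobber earlier ones with the same name, while value tokens passed separately are keyed on their own:

from archivebox.util import dedupe

dedupe(['--headless=new', '--window-size=1440,900', '--window-size=1920,1080'])
# -> ['--headless=new', '--window-size=1920,1080']

dedupe(['--silent', '--max-time', '60', '--silent'])
# -> ['--silent', '--max-time', '60']   (repeated bare flags collapse too)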
class AttributeDict(dict):
"""Helper to allow accessing dict values via Example.key or Example['key']"""

View file

@ -14,4 +14,4 @@ DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && cd .. && pwd )"
source "$DIR/.venv/bin/activate"
pytest -s --basetemp=tests/out --ignore=archivebox/vendor --ignore=deb_dist --ignore=pip_dist --ignore=brew_dist
pytest -s --basetemp=tests/out "$@"

View file

@ -20,7 +20,7 @@ dependencies = [
"django-extensions>=3.0.3",
"django-solo>=2.0.0",
"w3lib>=1.22.0",
"yt-dlp>=2023.10.13",
"yt-dlp>=2024.3.10",
# don't add playwright because packages without sdists cause trouble on many build systems that refuse to install wheel-only packages
# "playwright>=1.39.0; platform_machine != 'armv7l'",
"mypy-extensions>=0.4.3",
@ -128,6 +128,8 @@ lint = "./bin/lint.sh"
test = "./bin/test.sh"
# all = {composite = ["lint mypackage/", "test -v tests/"]}
[tool.pytest.ini_options]
testpaths = [ "tests" ]
[project.scripts]
archivebox = "archivebox.cli:main"

View file

@ -50,4 +50,4 @@ def redirect_to_static(filename):
def start():
run(host='localhost', port=8080)
run(host='localhost', port=8080, quiet=True)

View file

@ -0,0 +1 @@
{"href":"http://127.0.0.1:8080/static/example.com.html","description":"Example","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"Tag1 Tag2","trap":"http://www.example.com/should-not-exist"}

View file

@ -0,0 +1,24 @@
<?xml version="1.0" encoding="utf-8"?>
<feed
xml:lang="en"
xmlns="http://www.w3.org/2005/Atom"
>
<id>http://www.example.com/</id>
<title>Example of an Atom feed</title>
<link rel="self" type="application/atom+xml" href="http://www.example.com/index.atom" />
<link rel="alternate" type="text/html" href="http://www.example.com/" />
<author>
<name>Jim Winstead</name>
</author>
<updated>2024-02-26T03:18:26Z</updated>
<entry>
<title>Example</title>
<link rel="alternate" type="text/html" href="http://127.0.0.1:8080/static/example.com.html" />
<id>tag:example.com,2024-02-25:3319</id>
<updated>2024-02-26T03:18:26Z</updated>
<published>2024-02-25T19:18:25-08:00</published>
<category term="Tag1" scheme="http://example.com/archive" />
<category term="Tag2" scheme="http://example.com/archive" />
<content type="html">This is some &lt;b&gt;content&lt;/b&gt;</content>
</entry>
</feed>

View file

@ -0,0 +1,6 @@
[
{"href":"http://127.0.0.1:8080/static/example.com.html","description":"Example","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"Tag1 Tag2","trap":"http://www.example.com/should-not-exist"},
{"href":"http://127.0.0.1:8080/static/iana.org.html","description":"Example 2","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:43Z","shared":"no","toread":"no","tags":"Tag3,Tag4 with Space"},
{"href":"http://127.0.0.1:8080/static/shift_jis.html","description":"Example 2","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:44Z","shared":"no","toread":"no","tags":["Tag5","Tag6 with Space"]},
{"href":"http://127.0.0.1:8080/static/title_og_with_html","description":"Example 2","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:45Z","shared":"no","toread":"no"}
]

View file

@ -0,0 +1,2 @@
this line would cause problems but --parser=json will actually skip it
[{"href":"http://127.0.0.1:8080/static/example.com.html","description":"Example","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"Tag1 Tag2","trap":"http://www.example.com/should-not-exist"}]

View file

@ -0,0 +1,4 @@
{"href":"http://127.0.0.1:8080/static/example.com.html","description":"Example","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"Tag1 Tag2","trap":"http://www.example.com/should-not-exist"}
{"href":"http://127.0.0.1:8080/static/iana.org.html","description":"Example 2","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:43Z","shared":"no","toread":"no","tags":"Tag3,Tag4 with Space"}
{"href":"http://127.0.0.1:8080/static/shift_jis.html","description":"Example 2","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:44Z","shared":"no","toread":"no","tags":["Tag5","Tag6 with Space"]}
{"href":"http://127.0.0.1:8080/static/title_og_with_html","description":"Example 2","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:45Z","shared":"no","toread":"no"}

View file

@ -0,0 +1,32 @@
<?xml version="1.0" encoding="utf-8"?>
<rss version="2.0"
xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:admin="http://webns.net/mvcb/"
xmlns:content="http://purl.org/rss/1.0/modules/content/"
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
<channel>
<title>Sample Feed</title>
<link>http://example.org/</link>
<description>For documentation only</description>
<dc:language>en-us</dc:language>
<dc:creator>Nobody (nobody@example.org)</dc:creator>
<dc:rights>Public domain</dc:rights>
<dc:date>2024-02-26T17:28:12-08:00</dc:date>
<admin:generatorAgent rdf:resource="http://www.example.org/"/>
<admin:errorReportsTo rdf:resource="mailto:nobody@example.org"/>
<item>
<title>First!</title>
<link>http://127.0.0.1:8080/static/example.com.html</link>
<guid isPermaLink="false">just-an@example.org</guid>
<description>
This has a description.
</description>
<dc:subject>Tag1 Tag2</dc:subject>
<dc:date>2024-02-26T17:28:12-08:00</dc:date>
<content:encoded><![CDATA[
This has a <b>description</b>.]]>
</content:encoded>
</item>
</channel>
</rss>

View file

@ -91,3 +91,198 @@ def test_extract_input_uses_only_passed_extractors(tmp_path, process):
assert (archived_item_path / "warc").exists()
assert not (archived_item_path / "singlefile.html").exists()
def test_json(tmp_path, process, disable_extractors_dict):
with open('../../mock_server/templates/example.json', 'r', encoding='utf-8') as f:
arg_process = subprocess.run(
["archivebox", "add", "--index-only", "--parser=json"],
stdin=f,
capture_output=True,
env=disable_extractors_dict,
)
conn = sqlite3.connect("index.sqlite3")
c = conn.cursor()
urls = c.execute("SELECT url from core_snapshot").fetchall()
tags = c.execute("SELECT name from core_tag").fetchall()
conn.commit()
conn.close()
urls = list(map(lambda x: x[0], urls))
assert "http://127.0.0.1:8080/static/example.com.html" in urls
assert "http://127.0.0.1:8080/static/iana.org.html" in urls
assert "http://127.0.0.1:8080/static/shift_jis.html" in urls
assert "http://127.0.0.1:8080/static/title_og_with_html" in urls
# if the following URL appears, we must have fallen back to another parser
assert not "http://www.example.com/should-not-exist" in urls
tags = list(map(lambda x: x[0], tags))
assert "Tag1" in tags
assert "Tag2" in tags
assert "Tag3" in tags
assert "Tag4 with Space" in tags
assert "Tag5" in tags
assert "Tag6 with Space" in tags
def test_json_with_leading_garbage(tmp_path, process, disable_extractors_dict):
with open('../../mock_server/templates/example.json.bad', 'r', encoding='utf-8') as f:
arg_process = subprocess.run(
["archivebox", "add", "--index-only", "--parser=json"],
stdin=f,
capture_output=True,
env=disable_extractors_dict,
)
conn = sqlite3.connect("index.sqlite3")
c = conn.cursor()
urls = c.execute("SELECT url from core_snapshot").fetchall()
tags = c.execute("SELECT name from core_tag").fetchall()
conn.commit()
conn.close()
urls = list(map(lambda x: x[0], urls))
assert "http://127.0.0.1:8080/static/example.com.html" in urls
# if the following URL appears, we must have fallen back to another parser
assert not "http://www.example.com/should-not-exist" in urls
tags = list(map(lambda x: x[0], tags))
assert "Tag1" in tags
assert "Tag2" in tags
def test_generic_rss(tmp_path, process, disable_extractors_dict):
with open('../../mock_server/templates/example.rss', 'r', encoding='utf-8') as f:
arg_process = subprocess.run(
["archivebox", "add", "--index-only", "--parser=rss"],
stdin=f,
capture_output=True,
env=disable_extractors_dict,
)
conn = sqlite3.connect("index.sqlite3")
c = conn.cursor()
urls = c.execute("SELECT url from core_snapshot").fetchall()
tags = c.execute("SELECT name from core_tag").fetchall()
conn.commit()
conn.close()
urls = list(map(lambda x: x[0], urls))
assert "http://127.0.0.1:8080/static/example.com.html" in urls
# if the following URL appears, we must have fallen back to another parser
assert not "http://purl.org/dc/elements/1.1/" in urls
tags = list(map(lambda x: x[0], tags))
assert "Tag1 Tag2" in tags
def test_pinboard_rss(tmp_path, process, disable_extractors_dict):
with open('../../mock_server/templates/example.rss', 'r', encoding='utf-8') as f:
arg_process = subprocess.run(
["archivebox", "add", "--index-only", "--parser=pinboard_rss"],
stdin=f,
capture_output=True,
env=disable_extractors_dict,
)
conn = sqlite3.connect("index.sqlite3")
c = conn.cursor()
tags = c.execute("SELECT name from core_tag").fetchall()
conn.commit()
conn.close()
tags = list(map(lambda x: x[0], tags))
assert "Tag1" in tags
assert "Tag2" in tags
def test_atom(tmp_path, process, disable_extractors_dict):
with open('../../mock_server/templates/example.atom', 'r', encoding='utf-8') as f:
arg_process = subprocess.run(
["archivebox", "add", "--index-only", "--parser=rss"],
stdin=f,
capture_output=True,
env=disable_extractors_dict,
)
conn = sqlite3.connect("index.sqlite3")
c = conn.cursor()
urls = c.execute("SELECT url from core_snapshot").fetchall()
tags = c.execute("SELECT name from core_tag").fetchall()
conn.commit()
conn.close()
urls = list(map(lambda x: x[0], urls))
assert "http://127.0.0.1:8080/static/example.com.html" in urls
# if the following URL appears, we must have fallen back to another parser
assert not "http://www.w3.org/2005/Atom" in urls
tags = list(map(lambda x: x[0], tags))
assert "Tag1" in tags
assert "Tag2" in tags
def test_jsonl(tmp_path, process, disable_extractors_dict):
with open('../../mock_server/templates/example.jsonl', 'r', encoding='utf-8') as f:
arg_process = subprocess.run(
["archivebox", "add", "--index-only", "--parser=jsonl"],
stdin=f,
capture_output=True,
env=disable_extractors_dict,
)
conn = sqlite3.connect("index.sqlite3")
c = conn.cursor()
urls = c.execute("SELECT url from core_snapshot").fetchall()
tags = c.execute("SELECT name from core_tag").fetchall()
conn.commit()
conn.close()
urls = list(map(lambda x: x[0], urls))
assert "http://127.0.0.1:8080/static/example.com.html" in urls
assert "http://127.0.0.1:8080/static/iana.org.html" in urls
assert "http://127.0.0.1:8080/static/shift_jis.html" in urls
assert "http://127.0.0.1:8080/static/title_og_with_html" in urls
# if the following URL appears, we must have fallen back to another parser
assert not "http://www.example.com/should-not-exist" in urls
tags = list(map(lambda x: x[0], tags))
assert "Tag1" in tags
assert "Tag2" in tags
assert "Tag3" in tags
assert "Tag4 with Space" in tags
assert "Tag5" in tags
assert "Tag6 with Space" in tags
def test_jsonl_single(tmp_path, process, disable_extractors_dict):
with open('../../mock_server/templates/example-single.jsonl', 'r', encoding='utf-8') as f:
arg_process = subprocess.run(
["archivebox", "add", "--index-only", "--parser=jsonl"],
stdin=f,
capture_output=True,
env=disable_extractors_dict,
)
conn = sqlite3.connect("index.sqlite3")
c = conn.cursor()
urls = c.execute("SELECT url from core_snapshot").fetchall()
tags = c.execute("SELECT name from core_tag").fetchall()
conn.commit()
conn.close()
urls = list(map(lambda x: x[0], urls))
assert "http://127.0.0.1:8080/static/example.com.html" in urls
# if the following URL appears, we must have fallen back to another parser
assert not "http://www.example.com/should-not-exist" in urls
tags = list(map(lambda x: x[0], tags))
assert "Tag1" in tags
assert "Tag2" in tags
# make sure that JSON parser rejects a single line of JSONL which is valid
# JSON but not our expected format
def test_json_single(tmp_path, process, disable_extractors_dict):
with open('../../mock_server/templates/example-single.jsonl', 'r', encoding='utf-8') as f:
arg_process = subprocess.run(
["archivebox", "add", "--index-only", "--parser=json"],
stdin=f,
capture_output=True,
env=disable_extractors_dict,
)
assert 'expects list of objects' in arg_process.stderr.decode("utf-8")