diff --git a/archivebox/config.py b/archivebox/config.py
index 3da38b3f..9c3a2ccc 100644
--- a/archivebox/config.py
+++ b/archivebox/config.py
@@ -152,6 +152,8 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
     'CHROME_TIMEOUT': {'type': int, 'default': 0},
     'CHROME_HEADLESS': {'type': bool, 'default': True},
     'CHROME_SANDBOX': {'type': bool, 'default': lambda c: not c['IN_DOCKER']},
+    'CHROME_EXTRA_ARGS': {'type': list, 'default': None},
+
     'YOUTUBEDL_ARGS': {'type': list, 'default': lambda c: [
         '--restrict-filenames',
         '--trim-filenames', '128',
@@ -176,6 +178,7 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
         '--add-metadata',
         '--format=(bv*+ba/b)[filesize<={}][filesize_approx<=?{}]/(bv*+ba/b)'.format(c['MEDIA_MAX_SIZE'], c['MEDIA_MAX_SIZE']),
     ]},
+    'YOUTUBEDL_EXTRA_ARGS': {'type': list, 'default': None},

     'WGET_ARGS': {'type': list, 'default': ['--no-verbose',
@@ -187,12 +190,17 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
         '--no-parent',
         '-e', 'robots=off',
     ]},
+    'WGET_EXTRA_ARGS': {'type': list, 'default': None},
     'CURL_ARGS': {'type': list, 'default': ['--silent',
         '--location',
         '--compressed'
     ]},
+    'CURL_EXTRA_ARGS': {'type': list, 'default': None},
     'GIT_ARGS': {'type': list, 'default': ['--recursive']},
-    'SINGLEFILE_ARGS': {'type': list, 'default' : None},
+    'SINGLEFILE_ARGS': {'type': list, 'default': None},
+    'SINGLEFILE_EXTRA_ARGS': {'type': list, 'default': None},
+    'MERCURY_ARGS': {'type': list, 'default': ['--format=text']},
+    'MERCURY_EXTRA_ARGS': {'type': list, 'default': None},

     'FAVICON_PROVIDER': {'type': str, 'default': 'https://www.google.com/s2/favicons?domain={}'},
 },
@@ -500,7 +508,7 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
     'LOGS_DIR': {'default': lambda c: c['OUTPUT_DIR'] / LOGS_DIR_NAME},
     'CONFIG_FILE': {'default': lambda c: Path(c['CONFIG_FILE']).resolve() if c['CONFIG_FILE'] else c['OUTPUT_DIR'] / CONFIG_FILENAME},
     'COOKIES_FILE': {'default': lambda c: c['COOKIES_FILE'] and Path(c['COOKIES_FILE']).resolve()},
-    'CHROME_USER_DATA_DIR': {'default': lambda c: find_chrome_data_dir() if c['CHROME_USER_DATA_DIR'] is None else (Path(c['CHROME_USER_DATA_DIR']).resolve() if c['CHROME_USER_DATA_DIR'] else None)},   # None means unset, so we autodetect it with find_chrome_Data_dir(), but emptystring '' means user manually set it to '', and we should store it as None
+    'CHROME_USER_DATA_DIR': {'default': lambda c: Path(c['CHROME_USER_DATA_DIR']).resolve() if c['CHROME_USER_DATA_DIR'] else None},
     'URL_DENYLIST_PTN': {'default': lambda c: c['URL_DENYLIST'] and re.compile(c['URL_DENYLIST'] or '', ALLOWDENYLIST_REGEX_FLAGS)},
     'URL_ALLOWLIST_PTN': {'default': lambda c: c['URL_ALLOWLIST'] and re.compile(c['URL_ALLOWLIST'] or '', ALLOWDENYLIST_REGEX_FLAGS)},
     'DIR_OUTPUT_PERMISSIONS': {'default': lambda c: c['OUTPUT_PERMISSIONS'].replace('6', '7').replace('4', '5')},   # exec is always needed to list directories
@@ -530,6 +538,7 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
     'CURL_VERSION': {'default': lambda c: bin_version(c['CURL_BINARY']) if c['USE_CURL'] else None},
     'CURL_USER_AGENT': {'default': lambda c: c['CURL_USER_AGENT'].format(**c)},
     'CURL_ARGS': {'default': lambda c: c['CURL_ARGS'] or []},
+    'CURL_EXTRA_ARGS': {'default': lambda c: c['CURL_EXTRA_ARGS'] or []},

     'SAVE_FAVICON': {'default': lambda c: c['USE_CURL'] and c['SAVE_FAVICON']},
     'SAVE_ARCHIVE_DOT_ORG': {'default': lambda c: c['USE_CURL'] and c['SAVE_ARCHIVE_DOT_ORG']},
@@ -540,18 +549,22 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
     'SAVE_WGET': {'default': lambda c: c['USE_WGET'] and c['SAVE_WGET']},
'SAVE_WARC': {'default': lambda c: c['USE_WGET'] and c['SAVE_WARC']}, 'WGET_ARGS': {'default': lambda c: c['WGET_ARGS'] or []}, + 'WGET_EXTRA_ARGS': {'default': lambda c: c['WGET_EXTRA_ARGS'] or []}, 'RIPGREP_VERSION': {'default': lambda c: bin_version(c['RIPGREP_BINARY']) if c['USE_RIPGREP'] else None}, 'USE_SINGLEFILE': {'default': lambda c: c['USE_SINGLEFILE'] and c['SAVE_SINGLEFILE']}, 'SINGLEFILE_VERSION': {'default': lambda c: bin_version(c['SINGLEFILE_BINARY']) if c['USE_SINGLEFILE'] else None}, 'SINGLEFILE_ARGS': {'default': lambda c: c['SINGLEFILE_ARGS'] or []}, + 'SINGLEFILE_EXTRA_ARGS': {'default': lambda c: c['SINGLEFILE_EXTRA_ARGS'] or []}, 'USE_READABILITY': {'default': lambda c: c['USE_READABILITY'] and c['SAVE_READABILITY']}, 'READABILITY_VERSION': {'default': lambda c: bin_version(c['READABILITY_BINARY']) if c['USE_READABILITY'] else None}, 'USE_MERCURY': {'default': lambda c: c['USE_MERCURY'] and c['SAVE_MERCURY']}, 'MERCURY_VERSION': {'default': lambda c: '1.0.0' if shutil.which(str(bin_path(c['MERCURY_BINARY']))) else None}, # mercury doesnt expose version info until this is merged https://github.com/postlight/parser/pull/750 + 'MERCURY_ARGS': {'default': lambda c: c['MERCURY_ARGS'] or []}, + 'MERCURY_EXTRA_ARGS': {'default': lambda c: c['MERCURY_EXTRA_ARGS'] or []}, 'USE_GIT': {'default': lambda c: c['USE_GIT'] and c['SAVE_GIT']}, 'GIT_VERSION': {'default': lambda c: bin_version(c['GIT_BINARY']) if c['USE_GIT'] else None}, @@ -561,6 +574,7 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = { 'YOUTUBEDL_VERSION': {'default': lambda c: bin_version(c['YOUTUBEDL_BINARY']) if c['USE_YOUTUBEDL'] else None}, 'SAVE_MEDIA': {'default': lambda c: c['USE_YOUTUBEDL'] and c['SAVE_MEDIA']}, 'YOUTUBEDL_ARGS': {'default': lambda c: c['YOUTUBEDL_ARGS'] or []}, + 'YOUTUBEDL_EXTRA_ARGS': {'default': lambda c: c['YOUTUBEDL_EXTRA_ARGS'] or []}, 'CHROME_BINARY': {'default': lambda c: c['CHROME_BINARY'] or find_chrome_binary()}, 'USE_CHROME': {'default': lambda c: c['USE_CHROME'] and c['CHROME_BINARY'] and (c['SAVE_PDF'] or c['SAVE_SCREENSHOT'] or c['SAVE_DOM'] or c['SAVE_SINGLEFILE'])}, @@ -582,6 +596,7 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = { 'EXTERNAL_LOCATIONS': {'default': lambda c: get_external_locations(c)}, 'DATA_LOCATIONS': {'default': lambda c: get_data_locations(c)}, 'CHROME_OPTIONS': {'default': lambda c: get_chrome_info(c)}, + 'CHROME_EXTRA_ARGS': {'default': lambda c: c['CHROME_EXTRA_ARGS'] or []}, 'SAVE_ALLOWLIST_PTN': {'default': lambda c: c['SAVE_ALLOWLIST'] and {re.compile(k, ALLOWDENYLIST_REGEX_FLAGS): v for k, v in c['SAVE_ALLOWLIST'].items()}}, 'SAVE_DENYLIST_PTN': {'default': lambda c: c['SAVE_DENYLIST'] and {re.compile(k, ALLOWDENYLIST_REGEX_FLAGS): v for k, v in c['SAVE_DENYLIST'].items()}}, } @@ -920,27 +935,36 @@ def find_chrome_binary() -> Optional[str]: def find_chrome_data_dir() -> Optional[str]: """find any installed chrome user data directories in the default locations""" - # Precedence: Chromium, Chrome, Beta, Canary, Unstable, Dev - # make sure data dir finding precedence order always matches binary finding order - default_profile_paths = ( - '~/.config/chromium', - '~/Library/Application Support/Chromium', - '~/AppData/Local/Chromium/User Data', - '~/.config/chrome', - '~/.config/google-chrome', - '~/Library/Application Support/Google/Chrome', - '~/AppData/Local/Google/Chrome/User Data', - '~/.config/google-chrome-stable', - '~/.config/google-chrome-beta', - '~/Library/Application Support/Google/Chrome Canary', - '~/AppData/Local/Google/Chrome SxS/User 
Data', - '~/.config/google-chrome-unstable', - '~/.config/google-chrome-dev', - ) - for path in default_profile_paths: - full_path = Path(path).resolve() - if full_path.exists(): - return full_path + # deprecated because this is DANGEROUS, do not re-implement/uncomment this behavior. + + # Going forward we want to discourage people from using their main chrome profile for archiving. + # Session tokens, personal data, and cookies are often returned in server responses, + # when they get archived, they are essentially burned as anyone who can view the archive + # can use that data to masquerade as the logged-in user that did the archiving. + # For this reason users should always create dedicated burner profiles for archiving and not use + # their daily driver main accounts. + + # # Precedence: Chromium, Chrome, Beta, Canary, Unstable, Dev + # # make sure data dir finding precedence order always matches binary finding order + # default_profile_paths = ( + # '~/.config/chromium', + # '~/Library/Application Support/Chromium', + # '~/AppData/Local/Chromium/User Data', + # '~/.config/chrome', + # '~/.config/google-chrome', + # '~/Library/Application Support/Google/Chrome', + # '~/AppData/Local/Google/Chrome/User Data', + # '~/.config/google-chrome-stable', + # '~/.config/google-chrome-beta', + # '~/Library/Application Support/Google/Chrome Canary', + # '~/AppData/Local/Google/Chrome SxS/User Data', + # '~/.config/google-chrome-unstable', + # '~/.config/google-chrome-dev', + # ) + # for path in default_profile_paths: + # full_path = Path(path).resolve() + # if full_path.exists(): + # return full_path return None def wget_supports_compression(config): diff --git a/archivebox/extractors/archive_org.py b/archivebox/extractors/archive_org.py index a0883113..245315f1 100644 --- a/archivebox/extractors/archive_org.py +++ b/archivebox/extractors/archive_org.py @@ -10,10 +10,12 @@ from ..system import run, chmod_file from ..util import ( enforce_types, is_static_file, + dedupe, ) from ..config import ( TIMEOUT, CURL_ARGS, + CURL_EXTRA_ARGS, CHECK_SSL_VALIDITY, SAVE_ARCHIVE_DOT_ORG, CURL_BINARY, @@ -44,13 +46,18 @@ def save_archive_dot_org(link: Link, out_dir: Optional[Path]=None, timeout: int= output: ArchiveOutput = 'archive.org.txt' archive_org_url = None submit_url = 'https://web.archive.org/save/{}'.format(link.url) - cmd = [ - CURL_BINARY, + # later options take precedence + options = [ *CURL_ARGS, + *CURL_EXTRA_ARGS, '--head', '--max-time', str(timeout), *(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []), *([] if CHECK_SSL_VALIDITY else ['--insecure']), + ] + cmd = [ + CURL_BINARY, + *dedupe(options), submit_url, ] status = 'succeeded' diff --git a/archivebox/extractors/favicon.py b/archivebox/extractors/favicon.py index 5baafc17..f793f8df 100644 --- a/archivebox/extractors/favicon.py +++ b/archivebox/extractors/favicon.py @@ -6,13 +6,18 @@ from typing import Optional from ..index.schema import Link, ArchiveResult, ArchiveOutput from ..system import chmod_file, run -from ..util import enforce_types, domain +from ..util import ( + enforce_types, + domain, + dedupe, +) from ..config import ( TIMEOUT, SAVE_FAVICON, FAVICON_PROVIDER, CURL_BINARY, CURL_ARGS, + CURL_EXTRA_ARGS, CURL_VERSION, CHECK_SSL_VALIDITY, CURL_USER_AGENT, @@ -34,13 +39,18 @@ def save_favicon(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) out_dir = out_dir or link.link_dir output: ArchiveOutput = 'favicon.ico' - cmd = [ - CURL_BINARY, + # later options take precedence + options = [ 
*CURL_ARGS, + *CURL_EXTRA_ARGS, '--max-time', str(timeout), '--output', str(output), *(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []), *([] if CHECK_SSL_VALIDITY else ['--insecure']), + ] + cmd = [ + CURL_BINARY, + *dedupe(options), FAVICON_PROVIDER.format(domain(link.url)), ] status = 'failed' diff --git a/archivebox/extractors/headers.py b/archivebox/extractors/headers.py index 91dcb8e3..975787ad 100644 --- a/archivebox/extractors/headers.py +++ b/archivebox/extractors/headers.py @@ -9,11 +9,13 @@ from ..system import atomic_write from ..util import ( enforce_types, get_headers, + dedupe, ) from ..config import ( TIMEOUT, CURL_BINARY, CURL_ARGS, + CURL_EXTRA_ARGS, CURL_USER_AGENT, CURL_VERSION, CHECK_SSL_VALIDITY, @@ -40,14 +42,18 @@ def save_headers(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) status = 'succeeded' timer = TimedProgress(timeout, prefix=' ') - - cmd = [ - CURL_BINARY, + # later options take precedence + options = [ *CURL_ARGS, + *CURL_EXTRA_ARGS, '--head', '--max-time', str(timeout), *(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []), *([] if CHECK_SSL_VALIDITY else ['--insecure']), + ] + cmd = [ + CURL_BINARY, + *dedupe(options), link.url, ] try: diff --git a/archivebox/extractors/media.py b/archivebox/extractors/media.py index 7d73024f..ad4c9c4b 100644 --- a/archivebox/extractors/media.py +++ b/archivebox/extractors/media.py @@ -8,11 +8,13 @@ from ..system import run, chmod_file from ..util import ( enforce_types, is_static_file, + dedupe, ) from ..config import ( MEDIA_TIMEOUT, SAVE_MEDIA, YOUTUBEDL_ARGS, + YOUTUBEDL_EXTRA_ARGS, YOUTUBEDL_BINARY, YOUTUBEDL_VERSION, CHECK_SSL_VALIDITY @@ -39,11 +41,16 @@ def save_media(link: Link, out_dir: Optional[Path]=None, timeout: int=MEDIA_TIME output: ArchiveOutput = 'media' output_path = out_dir / output output_path.mkdir(exist_ok=True) - cmd = [ - YOUTUBEDL_BINARY, + # later options take precedence + options = [ *YOUTUBEDL_ARGS, + *YOUTUBEDL_EXTRA_ARGS, *([] if CHECK_SSL_VALIDITY else ['--no-check-certificate']), # TODO: add --cookies-from-browser={CHROME_USER_DATA_DIR} + ] + cmd = [ + YOUTUBEDL_BINARY, + *dedupe(options), link.url, ] status = 'succeeded' diff --git a/archivebox/extractors/mercury.py b/archivebox/extractors/mercury.py index e7d20362..a0f38434 100644 --- a/archivebox/extractors/mercury.py +++ b/archivebox/extractors/mercury.py @@ -11,13 +11,15 @@ from ..system import run, atomic_write from ..util import ( enforce_types, is_static_file, - + dedupe, ) from ..config import ( TIMEOUT, SAVE_MERCURY, DEPENDENCIES, MERCURY_VERSION, + MERCURY_ARGS, + MERCURY_EXTRA_ARGS, ) from ..logging_util import TimedProgress @@ -60,12 +62,16 @@ def save_mercury(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) timer = TimedProgress(timeout, prefix=' ') try: output_folder.mkdir(exist_ok=True) - - # Get plain text version of article + # later options take precedence + options = [ + *MERCURY_ARGS, + *MERCURY_EXTRA_ARGS, + ] + # By default, get plain text version of article cmd = [ DEPENDENCIES['MERCURY_BINARY']['path'], link.url, - "--format=text" + *dedupe(options) ] result = run(cmd, cwd=out_dir, timeout=timeout) try: diff --git a/archivebox/extractors/singlefile.py b/archivebox/extractors/singlefile.py index e50b3932..553c9f8d 100644 --- a/archivebox/extractors/singlefile.py +++ b/archivebox/extractors/singlefile.py @@ -11,6 +11,7 @@ from ..util import ( enforce_types, is_static_file, chrome_args, + dedupe, ) from ..config import ( TIMEOUT, @@ 
-18,6 +19,7 @@ from ..config import ( DEPENDENCIES, SINGLEFILE_VERSION, SINGLEFILE_ARGS, + SINGLEFILE_EXTRA_ARGS, CHROME_BINARY, ) from ..logging_util import TimedProgress @@ -46,31 +48,16 @@ def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO # SingleFile CLI Docs: https://github.com/gildas-lormeau/SingleFile/tree/master/cli browser_args = '--browser-args={}'.format(json.dumps(browser_args[1:])) + # later options take precedence options = [ *SINGLEFILE_ARGS, - '--browser-executable-path={}'.format(CHROME_BINARY), + *SINGLEFILE_EXTRA_ARGS, browser_args, + '--browser-executable-path={}'.format(CHROME_BINARY), ] - - # Deduplicate options (single-file doesn't like when you use the same option two times) - # - # NOTE: Options names that come first clobber conflicting names that come later - # My logic is SINGLEFILE_ARGS is the option that affects the singlefile command with most - # specificity, therefore the user sets it with a lot intent, therefore it should take precedence - # kind of like the ergonomic principle of lexical scope in programming languages. - seen_option_names = [] - def test_seen(argument): - option_name = argument.split("=")[0] - if option_name in seen_option_names: - return False - else: - seen_option_names.append(option_name) - return True - deduped_options = list(filter(test_seen, options)) - cmd = [ DEPENDENCIES['SINGLEFILE_BINARY']['path'], - *deduped_options, + *dedupe(options), link.url, output, ] diff --git a/archivebox/extractors/title.py b/archivebox/extractors/title.py index 6b0e37f6..5decc52c 100644 --- a/archivebox/extractors/title.py +++ b/archivebox/extractors/title.py @@ -10,6 +10,7 @@ from ..util import ( enforce_types, download_url, htmldecode, + dedupe, ) from ..config import ( TIMEOUT, @@ -17,6 +18,7 @@ from ..config import ( SAVE_TITLE, CURL_BINARY, CURL_ARGS, + CURL_EXTRA_ARGS, CURL_VERSION, CURL_USER_AGENT, ) @@ -102,12 +104,17 @@ def save_title(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) - from core.models import Snapshot output: ArchiveOutput = None - cmd = [ - CURL_BINARY, + # later options take precedence + options = [ *CURL_ARGS, + *CURL_EXTRA_ARGS, '--max-time', str(timeout), *(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []), *([] if CHECK_SSL_VALIDITY else ['--insecure']), + ] + cmd = [ + CURL_BINARY, + *dedupe(options), link.url, ] status = 'succeeded' diff --git a/archivebox/extractors/wget.py b/archivebox/extractors/wget.py index f3057271..07471e29 100644 --- a/archivebox/extractors/wget.py +++ b/archivebox/extractors/wget.py @@ -15,9 +15,11 @@ from ..util import ( path, domain, urldecode, + dedupe, ) from ..config import ( WGET_ARGS, + WGET_EXTRA_ARGS, TIMEOUT, SAVE_WGET, SAVE_WARC, @@ -55,10 +57,10 @@ def save_wget(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> # WGET CLI Docs: https://www.gnu.org/software/wget/manual/wget.html output: ArchiveOutput = None - cmd = [ - WGET_BINARY, - # '--server-response', # print headers for better error parsing + # later options take precedence + options = [ *WGET_ARGS, + *WGET_EXTRA_ARGS, '--timeout={}'.format(timeout), *(['--restrict-file-names={}'.format(RESTRICT_FILE_NAMES)] if RESTRICT_FILE_NAMES else []), *(['--warc-file={}'.format(str(warc_path))] if SAVE_WARC else []), @@ -68,6 +70,11 @@ def save_wget(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> *(['--compression=auto'] if WGET_AUTO_COMPRESSION else []), *([] if SAVE_WARC else ['--timestamping']), *([] if CHECK_SSL_VALIDITY else 
['--no-check-certificate', '--no-hsts']), + # '--server-response', # print headers for better error parsing + ] + cmd = [ + WGET_BINARY, + *dedupe(options), link.url, ] diff --git a/archivebox/parsers/__init__.py b/archivebox/parsers/__init__.py index c6f2f382..0cd39d8a 100644 --- a/archivebox/parsers/__init__.py +++ b/archivebox/parsers/__init__.py @@ -44,6 +44,7 @@ from . import medium_rss from . import netscape_html from . import generic_rss from . import generic_json +from . import generic_jsonl from . import generic_html from . import generic_txt from . import url_list @@ -63,6 +64,7 @@ PARSERS = { netscape_html.KEY: (netscape_html.NAME, netscape_html.PARSER), generic_rss.KEY: (generic_rss.NAME, generic_rss.PARSER), generic_json.KEY: (generic_json.NAME, generic_json.PARSER), + generic_jsonl.KEY: (generic_jsonl.NAME, generic_jsonl.PARSER), generic_html.KEY: (generic_html.NAME, generic_html.PARSER), # Catchall fallback parser diff --git a/archivebox/parsers/generic_json.py b/archivebox/parsers/generic_json.py index daebb7c4..8b64f55e 100644 --- a/archivebox/parsers/generic_json.py +++ b/archivebox/parsers/generic_json.py @@ -11,6 +11,60 @@ from ..util import ( enforce_types, ) +# This gets used by generic_jsonl, too +def jsonObjectToLink(link: str, source: str): + json_date = lambda s: datetime.strptime(s, '%Y-%m-%dT%H:%M:%S%z') + + # example line + # {"href":"http:\/\/www.reddit.com\/r\/example","description":"title here","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"reddit android"}] + # Parse URL + url = link.get('href') or link.get('url') or link.get('URL') + if not url: + raise Exception('JSON must contain URL in each entry [{"url": "http://...", ...}, ...]') + + # Parse the timestamp + ts_str = str(datetime.now(timezone.utc).timestamp()) + if link.get('timestamp'): + # chrome/ff histories use a very precise timestamp + ts_str = str(link['timestamp'] / 10000000) + elif link.get('time'): + ts_str = str(json_date(link['time'].split(',', 1)[0]).timestamp()) + elif link.get('created_at'): + ts_str = str(json_date(link['created_at']).timestamp()) + elif link.get('created'): + ts_str = str(json_date(link['created']).timestamp()) + elif link.get('date'): + ts_str = str(json_date(link['date']).timestamp()) + elif link.get('bookmarked'): + ts_str = str(json_date(link['bookmarked']).timestamp()) + elif link.get('saved'): + ts_str = str(json_date(link['saved']).timestamp()) + + # Parse the title + title = None + if link.get('title'): + title = link['title'].strip() + elif link.get('description'): + title = link['description'].replace(' — Readability', '').strip() + elif link.get('name'): + title = link['name'].strip() + + # if we have a list, join it with commas + tags = link.get('tags') + if type(tags) == list: + tags = ','.join(tags) + elif type(tags) == str: + # if there's no comma, assume it was space-separated + if ',' not in tags: + tags = tags.replace(' ', ',') + + return Link( + url=htmldecode(url), + timestamp=ts_str, + title=htmldecode(title) or None, + tags=htmldecode(tags), + sources=[source], + ) @enforce_types def parse_generic_json_export(json_file: IO[str], **_kwargs) -> Iterable[Link]: @@ -18,55 +72,21 @@ def parse_generic_json_export(json_file: IO[str], **_kwargs) -> Iterable[Link]: json_file.seek(0) - # sometimes the first line is a comment or filepath, so we get everything after the first { - json_file_json_str = '{' + json_file.read().split('{', 1)[-1] - 
links = json.loads(json_file_json_str) - json_date = lambda s: datetime.strptime(s, '%Y-%m-%dT%H:%M:%S%z') + try: + links = json.load(json_file) + if type(links) != list: + raise Exception('JSON parser expects list of objects, maybe this is JSONL?') + except json.decoder.JSONDecodeError: + # sometimes the first line is a comment or other junk, so try without + json_file.seek(0) + first_line = json_file.readline() + #print(' > Trying JSON parser without first line: "', first_line.strip(), '"', sep= '') + links = json.load(json_file) + # we may fail again, which means we really don't know what to do for link in links: - # example line - # {"href":"http:\/\/www.reddit.com\/r\/example","description":"title here","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"reddit android"}] if link: - # Parse URL - url = link.get('href') or link.get('url') or link.get('URL') - if not url: - raise Exception('JSON must contain URL in each entry [{"url": "http://...", ...}, ...]') - - # Parse the timestamp - ts_str = str(datetime.now(timezone.utc).timestamp()) - if link.get('timestamp'): - # chrome/ff histories use a very precise timestamp - ts_str = str(link['timestamp'] / 10000000) - elif link.get('time'): - ts_str = str(json_date(link['time'].split(',', 1)[0]).timestamp()) - elif link.get('created_at'): - ts_str = str(json_date(link['created_at']).timestamp()) - elif link.get('created'): - ts_str = str(json_date(link['created']).timestamp()) - elif link.get('date'): - ts_str = str(json_date(link['date']).timestamp()) - elif link.get('bookmarked'): - ts_str = str(json_date(link['bookmarked']).timestamp()) - elif link.get('saved'): - ts_str = str(json_date(link['saved']).timestamp()) - - # Parse the title - title = None - if link.get('title'): - title = link['title'].strip() - elif link.get('description'): - title = link['description'].replace(' — Readability', '').strip() - elif link.get('name'): - title = link['name'].strip() - - yield Link( - url=htmldecode(url), - timestamp=ts_str, - title=htmldecode(title) or None, - tags=htmldecode(link.get('tags')) or '', - sources=[json_file.name], - ) - + yield jsonObjectToLink(link,json_file.name) KEY = 'json' NAME = 'Generic JSON' diff --git a/archivebox/parsers/generic_jsonl.py b/archivebox/parsers/generic_jsonl.py new file mode 100644 index 00000000..8ee94b28 --- /dev/null +++ b/archivebox/parsers/generic_jsonl.py @@ -0,0 +1,34 @@ +__package__ = 'archivebox.parsers' + +import json + +from typing import IO, Iterable +from datetime import datetime, timezone + +from ..index.schema import Link +from ..util import ( + htmldecode, + enforce_types, +) + +from .generic_json import jsonObjectToLink + +def parse_line(line: str): + if line.strip() != "": + return json.loads(line) + +@enforce_types +def parse_generic_jsonl_export(json_file: IO[str], **_kwargs) -> Iterable[Link]: + """Parse JSONL format bookmarks export files""" + + json_file.seek(0) + + links = [ parse_line(line) for line in json_file ] + + for link in links: + if link: + yield jsonObjectToLink(link,json_file.name) + +KEY = 'jsonl' +NAME = 'Generic JSONL' +PARSER = parse_generic_jsonl_export diff --git a/archivebox/parsers/generic_rss.py b/archivebox/parsers/generic_rss.py index 4bd04967..005da688 100644 --- a/archivebox/parsers/generic_rss.py +++ b/archivebox/parsers/generic_rss.py @@ -2,13 +2,13 @@ __package__ = 'archivebox.parsers' from typing import IO, Iterable -from datetime import 
datetime +from time import mktime +from feedparser import parse as feedparser from ..index.schema import Link from ..util import ( htmldecode, - enforce_types, - str_between, + enforce_types ) @enforce_types @@ -16,35 +16,27 @@ def parse_generic_rss_export(rss_file: IO[str], **_kwargs) -> Iterable[Link]: """Parse RSS XML-format files into links""" rss_file.seek(0) - items = rss_file.read().split('') - items = items[1:] if items else [] - for item in items: - # example item: - # - # <![CDATA[How JavaScript works: inside the V8 engine]]> - # Unread - # https://blog.sessionstack.com/how-javascript-works-inside - # https://blog.sessionstack.com/how-javascript-works-inside - # Mon, 21 Aug 2017 14:21:58 -0500 - # + feed = feedparser(rss_file.read()) + for item in feed.entries: + url = item.link + title = item.title + time = mktime(item.updated_parsed) - trailing_removed = item.split('', 1)[0] - leading_removed = trailing_removed.split('', 1)[-1].strip() - rows = leading_removed.split('\n') + try: + tags = ','.join(map(lambda tag: tag.term, item.tags)) + except AttributeError: + tags = '' - def get_row(key): - return [r for r in rows if r.strip().startswith('<{}>'.format(key))][0] - - url = str_between(get_row('link'), '', '') - ts_str = str_between(get_row('pubDate'), '', '') - time = datetime.strptime(ts_str, "%a, %d %b %Y %H:%M:%S %z") - title = str_between(get_row('title'), ' Iterable[Link]: """Parse Pinboard RSS feed files into links""" rss_file.seek(0) - root = ElementTree.parse(rss_file).getroot() - items = root.findall("{http://purl.org/rss/1.0/}item") - for item in items: - find = lambda p: item.find(p).text.strip() if item.find(p) is not None else None # type: ignore + feed = feedparser(rss_file.read()) + for item in feed.entries: + url = item.link + # title will start with "[priv] " if pin was marked private. useful? + title = item.title + time = mktime(item.updated_parsed) - url = find("{http://purl.org/rss/1.0/}link") - tags = find("{http://purl.org/dc/elements/1.1/}subject") - title = find("{http://purl.org/rss/1.0/}title") - ts_str = find("{http://purl.org/dc/elements/1.1/}date") + # all tags are in one entry.tags with spaces in it. annoying! + try: + tags = item.tags[0].term.replace(' ', ',') + except AttributeError: + tags = '' if url is None: # Yielding a Link with no URL will # crash on a URL validation assertion continue - # Pinboard includes a colon in its date stamp timezone offsets, which - # Python can't parse. 
Remove it: - if ts_str and ts_str[-3:-2] == ":": - ts_str = ts_str[:-3]+ts_str[-2:] - - if ts_str: - time = datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%S%z") - else: - time = datetime.now(timezone.utc) - yield Link( url=htmldecode(url), - timestamp=str(time.timestamp()), + timestamp=str(time), title=htmldecode(title) or None, tags=htmldecode(tags) or None, sources=[rss_file.name], diff --git a/archivebox/util.py b/archivebox/util.py index f2f75ae3..37b44824 100644 --- a/archivebox/util.py +++ b/archivebox/util.py @@ -3,6 +3,7 @@ __package__ = 'archivebox' import re import requests import json as pyjson +import http.cookiejar from typing import List, Optional, Any from pathlib import Path @@ -164,9 +165,22 @@ def parse_date(date: Any) -> Optional[datetime]: @enforce_types def download_url(url: str, timeout: int=None) -> str: """Download the contents of a remote url and return the text""" - from .config import TIMEOUT, CHECK_SSL_VALIDITY, WGET_USER_AGENT + from .config import ( + TIMEOUT, + CHECK_SSL_VALIDITY, + WGET_USER_AGENT, + COOKIES_FILE, + ) timeout = timeout or TIMEOUT - response = requests.get( + session = requests.Session() + + if COOKIES_FILE and Path(COOKIES_FILE).is_file(): + cookie_jar = http.cookiejar.MozillaCookieJar(COOKIES_FILE) + cookie_jar.load(ignore_discard=True, ignore_expires=True) + for cookie in cookie_jar: + session.cookies.set(cookie.name, cookie.value, domain=cookie.domain, path=cookie.path) + + response = session.get( url, headers={'User-Agent': WGET_USER_AGENT}, verify=CHECK_SSL_VALIDITY, @@ -231,7 +245,11 @@ def chrome_args(**options) -> List[str]: # Chrome CLI flag documentation: https://peter.sh/experiments/chromium-command-line-switches/ - from .config import CHROME_OPTIONS, CHROME_VERSION + from .config import ( + CHROME_OPTIONS, + CHROME_VERSION, + CHROME_EXTRA_ARGS, + ) options = {**CHROME_OPTIONS, **options} @@ -240,6 +258,8 @@ def chrome_args(**options) -> List[str]: cmd_args = [options['CHROME_BINARY']] + cmd_args += CHROME_EXTRA_ARGS + if options['CHROME_HEADLESS']: chrome_major_version = int(re.search(r'\s(\d+)\.\d', CHROME_VERSION)[1]) if chrome_major_version >= 111: @@ -283,8 +303,9 @@ def chrome_args(**options) -> List[str]: if options['CHROME_USER_DATA_DIR']: cmd_args.append('--user-data-dir={}'.format(options['CHROME_USER_DATA_DIR'])) - - return cmd_args + + + return dedupe(cmd_args) def chrome_cleanup(): """ @@ -321,6 +342,20 @@ def ansi_to_html(text): return COLOR_REGEX.sub(single_sub, text) +@enforce_types +def dedupe(options: List[str]) -> List[str]: + """ + Deduplicates the given options. Options that come later clobber earlier + conflicting options. + """ + deduped = {} + + for option in options: + deduped[option.split('=')[0]] = option + + return list(deduped.values()) + + class AttributeDict(dict): """Helper to allow accessing dict values via Example.key or Example['key']""" diff --git a/bin/test.sh b/bin/test.sh index f9ea3575..515806bb 100755 --- a/bin/test.sh +++ b/bin/test.sh @@ -14,4 +14,4 @@ DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && cd .. 
&& pwd )" source "$DIR/.venv/bin/activate" -pytest -s --basetemp=tests/out --ignore=archivebox/vendor --ignore=deb_dist --ignore=pip_dist --ignore=brew_dist +pytest -s --basetemp=tests/out "$@" diff --git a/pyproject.toml b/pyproject.toml index 3fbdfc93..a5887570 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,7 +20,7 @@ dependencies = [ "django-extensions>=3.0.3", "django-solo>=2.0.0", "w3lib>=1.22.0", - "yt-dlp>=2023.10.13", + "yt-dlp>=2024.3.10", # dont add playwright becuase packages without sdists cause trouble on many build systems that refuse to install wheel-only packages # "playwright>=1.39.0; platform_machine != 'armv7l'", "mypy-extensions>=0.4.3", @@ -128,6 +128,8 @@ lint = "./bin/lint.sh" test = "./bin/test.sh" # all = {composite = ["lint mypackage/", "test -v tests/"]} +[tool.pytest.ini_options] +testpaths = [ "tests" ] [project.scripts] archivebox = "archivebox.cli:main" diff --git a/tests/mock_server/server.py b/tests/mock_server/server.py index 4283574f..39abd80c 100644 --- a/tests/mock_server/server.py +++ b/tests/mock_server/server.py @@ -50,4 +50,4 @@ def redirect_to_static(filename): def start(): - run(host='localhost', port=8080) \ No newline at end of file + run(host='localhost', port=8080, quiet=True) diff --git a/tests/mock_server/templates/example-single.jsonl b/tests/mock_server/templates/example-single.jsonl new file mode 100644 index 00000000..492c906d --- /dev/null +++ b/tests/mock_server/templates/example-single.jsonl @@ -0,0 +1 @@ +{"href":"http://127.0.0.1:8080/static/example.com.html","description":"Example","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"Tag1 Tag2","trap":"http://www.example.com/should-not-exist"} diff --git a/tests/mock_server/templates/example.atom b/tests/mock_server/templates/example.atom new file mode 100644 index 00000000..9d71abb1 --- /dev/null +++ b/tests/mock_server/templates/example.atom @@ -0,0 +1,24 @@ + + + http://www.example.com/ + Example of an Atom feed + + + + Jim Winstead + + 2024-02-26T03:18:26Z + + Example + + tag:example.com,2024-02-25:3319 + 2024-02-26T03:18:26Z + 2024-02-25T19:18:25-08:00 + + + This is some <b>content</b> + + diff --git a/tests/mock_server/templates/example.json b/tests/mock_server/templates/example.json new file mode 100644 index 00000000..6ee15597 --- /dev/null +++ b/tests/mock_server/templates/example.json @@ -0,0 +1,6 @@ +[ +{"href":"http://127.0.0.1:8080/static/example.com.html","description":"Example","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"Tag1 Tag2","trap":"http://www.example.com/should-not-exist"}, +{"href":"http://127.0.0.1:8080/static/iana.org.html","description":"Example 2","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:43Z","shared":"no","toread":"no","tags":"Tag3,Tag4 with Space"}, +{"href":"http://127.0.0.1:8080/static/shift_jis.html","description":"Example 2","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:44Z","shared":"no","toread":"no","tags":["Tag5","Tag6 with Space"]}, +{"href":"http://127.0.0.1:8080/static/title_og_with_html","description":"Example 
2","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:45Z","shared":"no","toread":"no"} +] diff --git a/tests/mock_server/templates/example.json.bad b/tests/mock_server/templates/example.json.bad new file mode 100644 index 00000000..88d77757 --- /dev/null +++ b/tests/mock_server/templates/example.json.bad @@ -0,0 +1,2 @@ +this line would cause problems but --parser=json will actually skip it +[{"href":"http://127.0.0.1:8080/static/example.com.html","description":"Example","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"Tag1 Tag2","trap":"http://www.example.com/should-not-exist"}] diff --git a/tests/mock_server/templates/example.jsonl b/tests/mock_server/templates/example.jsonl new file mode 100644 index 00000000..de0b3b5c --- /dev/null +++ b/tests/mock_server/templates/example.jsonl @@ -0,0 +1,4 @@ +{"href":"http://127.0.0.1:8080/static/example.com.html","description":"Example","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"Tag1 Tag2","trap":"http://www.example.com/should-not-exist"} +{"href":"http://127.0.0.1:8080/static/iana.org.html","description":"Example 2","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:43Z","shared":"no","toread":"no","tags":"Tag3,Tag4 with Space"} +{"href":"http://127.0.0.1:8080/static/shift_jis.html","description":"Example 2","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:44Z","shared":"no","toread":"no","tags":["Tag5","Tag6 with Space"]} +{"href":"http://127.0.0.1:8080/static/title_og_with_html","description":"Example 2","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:45Z","shared":"no","toread":"no"} diff --git a/tests/mock_server/templates/example.rss b/tests/mock_server/templates/example.rss new file mode 100644 index 00000000..d47a5a38 --- /dev/null +++ b/tests/mock_server/templates/example.rss @@ -0,0 +1,32 @@ + + + + Sample Feed + http://example.org/ + For documentation only + en-us + Nobody (nobody@example.org) + Public domain + 2024-02-26T17:28:12-08:00 + + + + + First! + http://127.0.0.1:8080/static/example.com.html + just-an@example.org + + This has a description. 
+ + Tag1 Tag2 + 2024-02-26T17:28:12-08:00 + description.]]> + + + + diff --git a/tests/test_add.py b/tests/test_add.py index 331178fe..c899b320 100644 --- a/tests/test_add.py +++ b/tests/test_add.py @@ -91,3 +91,198 @@ def test_extract_input_uses_only_passed_extractors(tmp_path, process): assert (archived_item_path / "warc").exists() assert not (archived_item_path / "singlefile.html").exists() + +def test_json(tmp_path, process, disable_extractors_dict): + with open('../../mock_server/templates/example.json', 'r', encoding='utf-8') as f: + arg_process = subprocess.run( + ["archivebox", "add", "--index-only", "--parser=json"], + stdin=f, + capture_output=True, + env=disable_extractors_dict, + ) + + conn = sqlite3.connect("index.sqlite3") + c = conn.cursor() + urls = c.execute("SELECT url from core_snapshot").fetchall() + tags = c.execute("SELECT name from core_tag").fetchall() + conn.commit() + conn.close() + + urls = list(map(lambda x: x[0], urls)) + assert "http://127.0.0.1:8080/static/example.com.html" in urls + assert "http://127.0.0.1:8080/static/iana.org.html" in urls + assert "http://127.0.0.1:8080/static/shift_jis.html" in urls + assert "http://127.0.0.1:8080/static/title_og_with_html" in urls + # if the following URL appears, we must have fallen back to another parser + assert not "http://www.example.com/should-not-exist" in urls + + tags = list(map(lambda x: x[0], tags)) + assert "Tag1" in tags + assert "Tag2" in tags + assert "Tag3" in tags + assert "Tag4 with Space" in tags + assert "Tag5" in tags + assert "Tag6 with Space" in tags + +def test_json_with_leading_garbage(tmp_path, process, disable_extractors_dict): + with open('../../mock_server/templates/example.json.bad', 'r', encoding='utf-8') as f: + arg_process = subprocess.run( + ["archivebox", "add", "--index-only", "--parser=json"], + stdin=f, + capture_output=True, + env=disable_extractors_dict, + ) + + conn = sqlite3.connect("index.sqlite3") + c = conn.cursor() + urls = c.execute("SELECT url from core_snapshot").fetchall() + tags = c.execute("SELECT name from core_tag").fetchall() + conn.commit() + conn.close() + + urls = list(map(lambda x: x[0], urls)) + assert "http://127.0.0.1:8080/static/example.com.html" in urls + # if the following URL appears, we must have fallen back to another parser + assert not "http://www.example.com/should-not-exist" in urls + + tags = list(map(lambda x: x[0], tags)) + assert "Tag1" in tags + assert "Tag2" in tags + +def test_generic_rss(tmp_path, process, disable_extractors_dict): + with open('../../mock_server/templates/example.rss', 'r', encoding='utf-8') as f: + arg_process = subprocess.run( + ["archivebox", "add", "--index-only", "--parser=rss"], + stdin=f, + capture_output=True, + env=disable_extractors_dict, + ) + + conn = sqlite3.connect("index.sqlite3") + c = conn.cursor() + urls = c.execute("SELECT url from core_snapshot").fetchall() + tags = c.execute("SELECT name from core_tag").fetchall() + conn.commit() + conn.close() + + urls = list(map(lambda x: x[0], urls)) + assert "http://127.0.0.1:8080/static/example.com.html" in urls + # if the following URL appears, we must have fallen back to another parser + assert not "http://purl.org/dc/elements/1.1/" in urls + + tags = list(map(lambda x: x[0], tags)) + assert "Tag1 Tag2" in tags + +def test_pinboard_rss(tmp_path, process, disable_extractors_dict): + with open('../../mock_server/templates/example.rss', 'r', encoding='utf-8') as f: + arg_process = subprocess.run( + ["archivebox", "add", "--index-only", "--parser=pinboard_rss"], + 
stdin=f, + capture_output=True, + env=disable_extractors_dict, + ) + + conn = sqlite3.connect("index.sqlite3") + c = conn.cursor() + tags = c.execute("SELECT name from core_tag").fetchall() + conn.commit() + conn.close() + + tags = list(map(lambda x: x[0], tags)) + assert "Tag1" in tags + assert "Tag2" in tags + +def test_atom(tmp_path, process, disable_extractors_dict): + with open('../../mock_server/templates/example.atom', 'r', encoding='utf-8') as f: + arg_process = subprocess.run( + ["archivebox", "add", "--index-only", "--parser=rss"], + stdin=f, + capture_output=True, + env=disable_extractors_dict, + ) + + conn = sqlite3.connect("index.sqlite3") + c = conn.cursor() + urls = c.execute("SELECT url from core_snapshot").fetchall() + tags = c.execute("SELECT name from core_tag").fetchall() + conn.commit() + conn.close() + + urls = list(map(lambda x: x[0], urls)) + assert "http://127.0.0.1:8080/static/example.com.html" in urls + # if the following URL appears, we must have fallen back to another parser + assert not "http://www.w3.org/2005/Atom" in urls + + tags = list(map(lambda x: x[0], tags)) + assert "Tag1" in tags + assert "Tag2" in tags + +def test_jsonl(tmp_path, process, disable_extractors_dict): + with open('../../mock_server/templates/example.jsonl', 'r', encoding='utf-8') as f: + arg_process = subprocess.run( + ["archivebox", "add", "--index-only", "--parser=jsonl"], + stdin=f, + capture_output=True, + env=disable_extractors_dict, + ) + + conn = sqlite3.connect("index.sqlite3") + c = conn.cursor() + urls = c.execute("SELECT url from core_snapshot").fetchall() + tags = c.execute("SELECT name from core_tag").fetchall() + conn.commit() + conn.close() + + urls = list(map(lambda x: x[0], urls)) + assert "http://127.0.0.1:8080/static/example.com.html" in urls + assert "http://127.0.0.1:8080/static/iana.org.html" in urls + assert "http://127.0.0.1:8080/static/shift_jis.html" in urls + assert "http://127.0.0.1:8080/static/title_og_with_html" in urls + # if the following URL appears, we must have fallen back to another parser + assert not "http://www.example.com/should-not-exist" in urls + + tags = list(map(lambda x: x[0], tags)) + assert "Tag1" in tags + assert "Tag2" in tags + assert "Tag3" in tags + assert "Tag4 with Space" in tags + assert "Tag5" in tags + assert "Tag6 with Space" in tags + +def test_jsonl_single(tmp_path, process, disable_extractors_dict): + with open('../../mock_server/templates/example-single.jsonl', 'r', encoding='utf-8') as f: + arg_process = subprocess.run( + ["archivebox", "add", "--index-only", "--parser=jsonl"], + stdin=f, + capture_output=True, + env=disable_extractors_dict, + ) + + conn = sqlite3.connect("index.sqlite3") + c = conn.cursor() + urls = c.execute("SELECT url from core_snapshot").fetchall() + tags = c.execute("SELECT name from core_tag").fetchall() + conn.commit() + conn.close() + + urls = list(map(lambda x: x[0], urls)) + assert "http://127.0.0.1:8080/static/example.com.html" in urls + # if the following URL appears, we must have fallen back to another parser + assert not "http://www.example.com/should-not-exist" in urls + + tags = list(map(lambda x: x[0], tags)) + assert "Tag1" in tags + assert "Tag2" in tags + +# make sure that JSON parser rejects a single line of JSONL which is valid +# JSON but not our expected format +def test_json_single(tmp_path, process, disable_extractors_dict): + with open('../../mock_server/templates/example-single.jsonl', 'r', encoding='utf-8') as f: + arg_process = subprocess.run( + ["archivebox", "add", 
"--index-only", "--parser=json"], + stdin=f, + capture_output=True, + env=disable_extractors_dict, + ) + + assert 'expects list of objects' in arg_process.stderr.decode("utf-8")