1
0
Fork 0
mirror of synced 2024-05-12 16:33:53 +12:00

Merge branch 'dev' into plugins-browsertrix

This commit is contained in:
Nick Sweeting 2024-03-18 14:43:06 -07:00
commit c22df0b63a
4 changed files with 38 additions and 5 deletions

View file

@ -267,7 +267,13 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$T
# Setup ArchiveBox runtime config
WORKDIR "$DATA_DIR"
ENV IN_DOCKER=True \
CUSTOM_TEMPLATES_DIR=/data/templates
DISPLAY=novnc:0.0 \
CUSTOM_TEMPLATES_DIR=/data/templates \
CHROME_USER_DATA_DIR=/data/personas/Default/chromium \
GOOGLE_API_KEY=no \
GOOGLE_DEFAULT_CLIENT_ID=no \
GOOGLE_DEFAULT_CLIENT_SECRET=no \
ALLOWED_HOSTS=*
## No need to set explicitly, these values will be autodetected by archivebox in docker:
# CHROME_SANDBOX=False \
# WGET_BINARY="wget" \

View file

@ -142,9 +142,10 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
'CHECK_SSL_VALIDITY': {'type': bool, 'default': True},
'MEDIA_MAX_SIZE': {'type': str, 'default': '750m'},
'CURL_USER_AGENT': {'type': str, 'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/) curl/{CURL_VERSION}'},
'WGET_USER_AGENT': {'type': str, 'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/) wget/{WGET_VERSION}'},
'CHROME_USER_AGENT': {'type': str, 'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/)'},
'USER_AGENT': {'type': str, 'default': None},
'CURL_USER_AGENT': {'type': str, 'default': lambda c: c['USER_AGENT'] or 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/) curl/{CURL_VERSION}'},
'WGET_USER_AGENT': {'type': str, 'default': lambda c: c['USER_AGENT'] or 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/) wget/{WGET_VERSION}'},
'CHROME_USER_AGENT': {'type': str, 'default': lambda c: c['USER_AGENT'] or 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/)'},
'COOKIES_FILE': {'type': str, 'default': None},
'CHROME_USER_DATA_DIR': {'type': str, 'default': None},
@ -280,6 +281,7 @@ TEMPLATES_DIR_NAME = 'templates'
ARCHIVE_DIR_NAME = 'archive'
SOURCES_DIR_NAME = 'sources'
LOGS_DIR_NAME = 'logs'
PERSONAS_DIR_NAME = 'personas'
SQL_INDEX_FILENAME = 'index.sqlite3'
JSON_INDEX_FILENAME = 'index.json'
HTML_INDEX_FILENAME = 'index.html'
@ -356,6 +358,7 @@ ALLOWED_IN_OUTPUT_DIR = {
ARCHIVE_DIR_NAME,
SOURCES_DIR_NAME,
LOGS_DIR_NAME,
PERSONAS_DIR_NAME,
SQL_INDEX_FILENAME,
f'{SQL_INDEX_FILENAME}-wal',
f'{SQL_INDEX_FILENAME}-shm',
@ -506,6 +509,7 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
'ARCHIVE_DIR': {'default': lambda c: c['OUTPUT_DIR'] / ARCHIVE_DIR_NAME},
'SOURCES_DIR': {'default': lambda c: c['OUTPUT_DIR'] / SOURCES_DIR_NAME},
'LOGS_DIR': {'default': lambda c: c['OUTPUT_DIR'] / LOGS_DIR_NAME},
'PERSONAS_DIR': {'default': lambda c: c['OUTPUT_DIR'] / PERSONAS_DIR_NAME},
'CONFIG_FILE': {'default': lambda c: Path(c['CONFIG_FILE']).resolve() if c['CONFIG_FILE'] else c['OUTPUT_DIR'] / CONFIG_FILENAME},
'COOKIES_FILE': {'default': lambda c: c['COOKIES_FILE'] and Path(c['COOKIES_FILE']).resolve()},
'CHROME_USER_DATA_DIR': {'default': lambda c: Path(c['CHROME_USER_DATA_DIR']).resolve() if c['CHROME_USER_DATA_DIR'] else None},
@ -1035,6 +1039,11 @@ def get_data_locations(config: ConfigDict) -> ConfigValue:
'enabled': True,
'is_valid': config['LOGS_DIR'].exists(),
},
'PERSONAS': {
'path': config['PERSONAS'].resolve(),
'enabled': True,
'is_valid': config['PERSONAS'].exists(),
},
'ARCHIVE_DIR': {
'path': config['ARCHIVE_DIR'].resolve(),
'enabled': True,
@ -1382,6 +1391,8 @@ def check_migrations(out_dir: Union[str, Path, None]=None, config: ConfigDict=CO
(Path(output_dir) / SOURCES_DIR_NAME).mkdir(exist_ok=True)
(Path(output_dir) / LOGS_DIR_NAME).mkdir(exist_ok=True)
(Path(output_dir) / PERSONAS_DIR_NAME).mkdir(exist_ok=True)
(Path(output_dir) / PERSONAS_DIR_NAME / 'Default').mkdir(exist_ok=True)

View file

@ -303,10 +303,11 @@ def chrome_args(**options) -> List[str]:
if options['CHROME_USER_DATA_DIR']:
cmd_args.append('--user-data-dir={}'.format(options['CHROME_USER_DATA_DIR']))
cmd_args.append('--profile-directory=Default')
return dedupe(cmd_args)
def chrome_cleanup():
"""
Cleans up any state or runtime files that chrome leaves behind when killed by

View file

@ -135,6 +135,21 @@ services:
# - ./data:/var/www
### Example: Watch the ArchiveBox browser in realtime as it archives things,
# or remote control it to set up logins and credentials for sites you want to archive.
# https://github.com/ArchiveBox/ArchiveBox/wiki/Chromium-Install#setting-up-a-chromium-user-profile
novnc:
image: theasp/novnc:latest
environment:
- DISPLAY_WIDTH=1920
- DISPLAY_HEIGHT=1080
- RUN_XTERM=no
ports:
# to view/control ArchiveBox's browser, visit: http://localhost:8080/vnc.html
- "8080:8080"
### Example: run all your ArchiveBox traffic through a WireGuard VPN tunnel
# wireguard: