From 95cf85f8cf4a036ebf2f332fe29d0735219cf715 Mon Sep 17 00:00:00 2001 From: Igor Rzegocki Date: Thu, 30 Sep 2021 17:40:13 +0200 Subject: [PATCH 1/3] Support for Reverse Proxy authentication backends (like authelia) --- archivebox/config.py | 64 ++++++++++++++++++----------------- archivebox/core/middleware.py | 25 +++++++++++++- archivebox/core/settings.py | 2 ++ 3 files changed, 59 insertions(+), 32 deletions(-) diff --git a/archivebox/config.py b/archivebox/config.py index ba68e2a3..96e0b9bb 100644 --- a/archivebox/config.py +++ b/archivebox/config.py @@ -82,17 +82,19 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = { }, 'SERVER_CONFIG': { - 'SECRET_KEY': {'type': str, 'default': None}, - 'BIND_ADDR': {'type': str, 'default': lambda c: ['127.0.0.1:8000', '0.0.0.0:8000'][c['IN_DOCKER']]}, - 'ALLOWED_HOSTS': {'type': str, 'default': '*'}, - 'DEBUG': {'type': bool, 'default': False}, - 'PUBLIC_INDEX': {'type': bool, 'default': True}, - 'PUBLIC_SNAPSHOTS': {'type': bool, 'default': True}, - 'PUBLIC_ADD_VIEW': {'type': bool, 'default': False}, - 'FOOTER_INFO': {'type': str, 'default': 'Content is hosted for personal archiving purposes only. Contact server owner for any takedown requests.'}, - 'SNAPSHOTS_PER_PAGE': {'type': int, 'default': 40}, - 'CUSTOM_TEMPLATES_DIR': {'type': str, 'default': None}, - 'TIME_ZONE': {'type': str, 'default': 'UTC'}, + 'SECRET_KEY': {'type': str, 'default': None}, + 'BIND_ADDR': {'type': str, 'default': lambda c: ['127.0.0.1:8000', '0.0.0.0:8000'][c['IN_DOCKER']]}, + 'ALLOWED_HOSTS': {'type': str, 'default': '*'}, + 'DEBUG': {'type': bool, 'default': False}, + 'PUBLIC_INDEX': {'type': bool, 'default': True}, + 'PUBLIC_SNAPSHOTS': {'type': bool, 'default': True}, + 'PUBLIC_ADD_VIEW': {'type': bool, 'default': False}, + 'FOOTER_INFO': {'type': str, 'default': 'Content is hosted for personal archiving purposes only. 
Contact server owner for any takedown requests.'}, + 'SNAPSHOTS_PER_PAGE': {'type': int, 'default': 40}, + 'CUSTOM_TEMPLATES_DIR': {'type': str, 'default': None}, + 'TIME_ZONE': {'type': str, 'default': 'UTC'}, + 'REVERSE_PROXY_USER_HEADER': {'type': str, 'default': 'Remote-User'}, + 'REVERSE_PROXY_WHITELIST': {'type': str, 'default': ''}, }, 'ARCHIVE_METHOD_TOGGLES': { @@ -145,7 +147,7 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = { '--add-metadata', '--max-filesize={}'.format(c['MEDIA_MAX_SIZE']), ]}, - + 'WGET_ARGS': {'type': list, 'default': ['--no-verbose', '--adjust-extension', @@ -187,7 +189,7 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = { 'USE_NODE': {'type': bool, 'default': True}, 'USE_YOUTUBEDL': {'type': bool, 'default': True}, 'USE_RIPGREP': {'type': bool, 'default': True}, - + 'CURL_BINARY': {'type': str, 'default': 'curl'}, 'GIT_BINARY': {'type': str, 'default': 'git'}, 'WGET_BINARY': {'type': str, 'default': 'wget'}, @@ -268,7 +270,7 @@ STATICFILE_EXTENSIONS = { # that can be downloaded as-is, not html pages that need to be rendered 'gif', 'jpeg', 'jpg', 'png', 'tif', 'tiff', 'wbmp', 'ico', 'jng', 'bmp', 'svg', 'svgz', 'webp', 'ps', 'eps', 'ai', - 'mp3', 'mp4', 'm4a', 'mpeg', 'mpg', 'mkv', 'mov', 'webm', 'm4v', + 'mp3', 'mp4', 'm4a', 'mpeg', 'mpg', 'mkv', 'mov', 'webm', 'm4v', 'flv', 'wmv', 'avi', 'ogg', 'ts', 'm3u8', 'pdf', 'txt', 'rtf', 'rtfd', 'doc', 'docx', 'ppt', 'pptx', 'xls', 'xlsx', 'atom', 'rss', 'css', 'js', 'json', @@ -277,7 +279,7 @@ STATICFILE_EXTENSIONS = { # Less common extensions to consider adding later # jar, swf, bin, com, exe, dll, deb - # ear, hqx, eot, wmlc, kml, kmz, cco, jardiff, jnlp, run, msi, msp, msm, + # ear, hqx, eot, wmlc, kml, kmz, cco, jardiff, jnlp, run, msi, msp, msm, # pl pm, prc pdb, rar, rpm, sea, sit, tcl tk, der, pem, crt, xpi, xspf, # ra, mng, asx, asf, 3gpp, 3gp, mid, midi, kar, jad, wml, htc, mml @@ -389,14 +391,14 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = { 'CHROME_BINARY': {'default': lambda c: 
c['CHROME_BINARY'] or find_chrome_binary()}, 'USE_CHROME': {'default': lambda c: c['USE_CHROME'] and c['CHROME_BINARY'] and (c['SAVE_PDF'] or c['SAVE_SCREENSHOT'] or c['SAVE_DOM'] or c['SAVE_SINGLEFILE'])}, 'CHROME_VERSION': {'default': lambda c: bin_version(c['CHROME_BINARY']) if c['USE_CHROME'] else None}, - + 'SAVE_PDF': {'default': lambda c: c['USE_CHROME'] and c['SAVE_PDF']}, 'SAVE_SCREENSHOT': {'default': lambda c: c['USE_CHROME'] and c['SAVE_SCREENSHOT']}, 'SAVE_DOM': {'default': lambda c: c['USE_CHROME'] and c['SAVE_DOM']}, 'SAVE_SINGLEFILE': {'default': lambda c: c['USE_CHROME'] and c['SAVE_SINGLEFILE'] and c['USE_NODE']}, 'SAVE_READABILITY': {'default': lambda c: c['USE_READABILITY'] and c['USE_NODE']}, 'SAVE_MERCURY': {'default': lambda c: c['USE_MERCURY'] and c['USE_NODE']}, - + 'USE_NODE': {'default': lambda c: c['USE_NODE'] and (c['SAVE_READABILITY'] or c['SAVE_SINGLEFILE'] or c['SAVE_MERCURY'])}, 'NODE_VERSION': {'default': lambda c: bin_version(c['NODE_BINARY']) if c['USE_NODE'] else None}, @@ -446,7 +448,7 @@ def load_config_val(key: str, elif val.lower() in ('false', 'no', '0'): return False else: - raise ValueError(f'Invalid configuration option {key}={val} (expected a boolean: True/False)') + raise ValueError(f'Invalid configuration option {key}={val} (expected a boolean: True/False)') elif type is str: if val.lower() in ('true', 'false', 'yes', 'no', '1', '0'): @@ -471,7 +473,7 @@ def load_config_file(out_dir: str=None) -> Optional[Dict[str, str]]: config_path = Path(out_dir) / CONFIG_FILENAME if config_path.exists(): config_file = ConfigParser() - config_file.optionxform = str + config_file.optionxform = str config_file.read(config_path) # flatten into one namespace config_file_vars = { @@ -495,7 +497,7 @@ def write_config_file(config: Dict[str, str], out_dir: str=None) -> ConfigDict: # # You can add options here manually in INI format, or automatically by running: # archivebox config --set KEY=VALUE - # + # # If you modify this file manually, 
make sure to update your archive after by running: # archivebox init # @@ -506,7 +508,7 @@ def write_config_file(config: Dict[str, str], out_dir: str=None) -> ConfigDict: out_dir = out_dir or Path(os.getenv('OUTPUT_DIR', '.')).resolve() config_path = Path(out_dir) / CONFIG_FILENAME - + if not config_path.exists(): atomic_write(config_path, CONFIG_HEADER) @@ -544,7 +546,7 @@ def write_config_file(config: Dict[str, str], out_dir: str=None) -> ConfigDict: with open(config_path, 'w+', encoding='utf-8') as new: config_file.write(new) - + try: # validate the config by attempting to re-parse it CONFIG = load_all_config() @@ -557,20 +559,20 @@ def write_config_file(config: Dict[str, str], out_dir: str=None) -> ConfigDict: if Path(f'{config_path}.bak').exists(): os.remove(f'{config_path}.bak') - + return { key.upper(): CONFIG.get(key.upper()) for key in config.keys() } - + def load_config(defaults: ConfigDefaultDict, config: Optional[ConfigDict]=None, out_dir: Optional[str]=None, env_vars: Optional[os._Environ]=None, config_file_vars: Optional[Dict[str, str]]=None) -> ConfigDict: - + env_vars = env_vars or os.environ config_file_vars = config_file_vars or load_config_file(out_dir=out_dir) @@ -600,7 +602,7 @@ def load_config(defaults: ConfigDefaultDict, stderr() # raise raise SystemExit(2) - + return extended_config # def write_config(config: ConfigDict): @@ -683,7 +685,7 @@ def bin_hash(binary: Optional[str]) -> Optional[str]: with io.open(abs_path, mode='rb') as f: for chunk in iter(lambda: f.read(io.DEFAULT_BUFFER_SIZE), b''): file_hash.update(chunk) - + return f'md5:{file_hash.hexdigest()}' def find_chrome_binary() -> Optional[str]: @@ -708,7 +710,7 @@ def find_chrome_binary() -> Optional[str]: full_path_exists = shutil.which(name) if full_path_exists: return name - + return None def find_chrome_data_dir() -> Optional[str]: @@ -1078,7 +1080,7 @@ def check_dependencies(config: ConfigDict=CONFIG, show_help: bool=True) -> None: stderr(' If you want to disable media 
archiving entirely, set SAVE_MEDIA=False instead:') stderr(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#save_media') stderr() - + def check_data_folder(out_dir: Union[str, Path, None]=None, config: ConfigDict=CONFIG) -> None: output_dir = out_dir or config['OUTPUT_DIR'] assert isinstance(output_dir, (str, Path)) @@ -1117,7 +1119,7 @@ def check_migrations(out_dir: Union[str, Path, None]=None, config: ConfigDict=CO def setup_django(out_dir: Path=None, check_db=False, config: ConfigDict=CONFIG, in_memory_db=False) -> None: check_system_config() - + output_dir = out_dir or Path(config['OUTPUT_DIR']) assert isinstance(output_dir, Path) and isinstance(config['PACKAGE_DIR'], Path) @@ -1152,7 +1154,7 @@ def setup_django(out_dir: Path=None, check_db=False, config: ConfigDict=CONFIG, # Otherwise use default sqlite3 file-based database and initialize django # without running migrations automatically (user runs them manually by calling init) django.setup() - + from django.conf import settings diff --git a/archivebox/core/middleware.py b/archivebox/core/middleware.py index 3b5787c4..cf7ab991 100644 --- a/archivebox/core/middleware.py +++ b/archivebox/core/middleware.py @@ -1,8 +1,11 @@ __package__ = 'archivebox.core' +import ipaddress from django.utils import timezone +from django.contrib.auth.middleware import RemoteUserMiddleware +from django.core.exceptions import ImproperlyConfigured -from ..config import PUBLIC_SNAPSHOTS +from ..config import PUBLIC_SNAPSHOTS, REVERSE_PROXY_USER_HEADER, REVERSE_PROXY_WHITELIST def detect_timezone(request, activate: bool=True): @@ -35,3 +38,48 @@ def CacheControlMiddleware(get_response): return response return middleware
+
+class ReverseProxyAuthMiddleware(RemoteUserMiddleware):
+    """Authenticate users from a header set by a whitelisted reverse proxy.
+
+    Subclasses Django's RemoteUserMiddleware but only delegates to it when
+    the request's REMOTE_ADDR lies inside one of the networks listed in
+    REVERSE_PROXY_WHITELIST (a comma-separated list of IPv4/IPv6 CIDRs).
+    With an empty whitelist the header is never consulted.
+    """
+
+    # request.META key for the configured header, e.g. 'Remote-User' -> 'HTTP_REMOTE_USER'
+    header = 'HTTP_{normalized}'.format(normalized=REVERSE_PROXY_USER_HEADER.replace('-', '_').upper())
+
+    def process_request(self, request):
+        """Run RemoteUserMiddleware only for requests from whitelisted addresses.
+
+        Returns None without delegating when the whitelist is empty, when
+        REMOTE_ADDR is missing or not an IP address, or when no network matches.
+
+        Raises:
+            ImproperlyConfigured: a REVERSE_PROXY_WHITELIST entry is not a valid CIDR.
+        """
+        if not REVERSE_PROXY_WHITELIST:
+            return
+
+        try:
+            # parse once, outside the loop; a missing/garbage REMOTE_ADDR is untrusted, not a crash
+            client_ip = ipaddress.ip_address(request.META.get('REMOTE_ADDR', ''))
+        except ValueError:
+            return
+
+        for entry in REVERSE_PROXY_WHITELIST.split(','):
+            cidr = entry.strip()  # allow spaces after commas: "10.0.0.0/8, fd00::/8"
+            if not cidr:
+                continue  # tolerate trailing or doubled commas
+
+            try:
+                network = ipaddress.ip_network(cidr)
+            except ValueError as err:
+                raise ImproperlyConfigured(
+                    "The REVERSE_PROXY_WHITELIST config parameter is in invalid format, or "
+                    "contains invalid CIDR. Correct format is a comma-separated list of IPv4/IPv6 CIDRs.") from err
+
+            if client_ip in network:
+                return super().process_request(request)
diff --git a/archivebox/core/settings.py b/archivebox/core/settings.py index 550c6077..3627d247 100644 --- a/archivebox/core/settings.py +++ b/archivebox/core/settings.py @@ -61,11 +61,13 @@ MIDDLEWARE = [ 'django.middleware.common.CommonMiddleware', 'django.middleware.csrf.CsrfViewMiddleware', 'django.contrib.auth.middleware.AuthenticationMiddleware', + 'core.middleware.ReverseProxyAuthMiddleware', 'django.contrib.messages.middleware.MessageMiddleware', 'core.middleware.CacheControlMiddleware', ] AUTHENTICATION_BACKENDS = [ + 'django.contrib.auth.backends.RemoteUserBackend', 'django.contrib.auth.backends.ModelBackend', ] From d4f534e61273dd42f1b9447c8ced3dfe4f8872bf Mon Sep 17 00:00:00 2001 From: Igor Rzegocki Date: Thu, 31 Mar 2022 21:40:14 +0200 Subject: [PATCH 2/3] add `LOGOUT_REDIRECT_URL` --- archivebox/config.py | 1 + archivebox/core/settings.py | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/archivebox/config.py b/archivebox/config.py index 96e0b9bb..3c88adbb 100644 --- a/archivebox/config.py +++ b/archivebox/config.py @@ -95,6 +95,7 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = { 'TIME_ZONE': {'type': str, 'default': 'UTC'}, 'REVERSE_PROXY_USER_HEADER': {'type': str, 'default': 'Remote-User'}, 'REVERSE_PROXY_WHITELIST': {'type': str, 'default': ''}, + 'LOGOUT_REDIRECT_URL': {'type': str, 'default': '/'}, }, 'ARCHIVE_METHOD_TOGGLES': { diff --git a/archivebox/core/settings.py b/archivebox/core/settings.py index 3627d247..70cffa85 100644 --- a/archivebox/core/settings.py +++ b/archivebox/core/settings.py @@ -34,7 +34,8 @@ WSGI_APPLICATION =
'core.wsgi.application' ROOT_URLCONF = 'core.urls' LOGIN_URL = '/accounts/login/' -LOGOUT_REDIRECT_URL = '/' +LOGOUT_REDIRECT_URL = os.environ.get('LOGOUT_REDIRECT_URL', '/') + PASSWORD_RESET_URL = '/accounts/password_reset/' APPEND_SLASH = True From dca69933ebe3bd172aaed8a040dce744dc9a6dea Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 9 Jan 2023 18:22:01 -0800 Subject: [PATCH 3/3] Update archivebox/config.py Co-authored-by: dugite-code --- archivebox/config.py | 1 + 1 file changed, 1 insertion(+) diff --git a/archivebox/config.py b/archivebox/config.py index f20303ac..dbfb1a4f 100644 --- a/archivebox/config.py +++ b/archivebox/config.py @@ -105,6 +105,7 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = { 'REVERSE_PROXY_WHITELIST': {'type': str, 'default': ''}, 'LOGOUT_REDIRECT_URL': {'type': str, 'default': '/'}, 'PREVIEW_ORIGINALS': {'type': bool, 'default': True}, + 'LOGOUT_REDIRECT_URL': {'type': str, 'default': '/'}, }, 'ARCHIVE_METHOD_TOGGLES': {