diff --git a/archivebox/config/__init__.py b/archivebox/config/__init__.py index e838e167..6955bc88 100644 --- a/archivebox/config/__init__.py +++ b/archivebox/config/__init__.py @@ -222,17 +222,17 @@ DERIVED_CONFIG_DEFAULTS: ConfigDefaultDict = { 'USER': {'default': lambda c: getpass.getuser() or os.getlogin()}, 'ANSI': {'default': lambda c: DEFAULT_CLI_COLORS if c['USE_COLOR'] else {k: '' for k in DEFAULT_CLI_COLORS.keys()}}, - 'REPO_DIR': {'default': lambda c: Path(os.path.abspath(os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', '..')))}, - 'PYTHON_DIR': {'default': lambda c: Path.joinpath(Path(c['REPO_DIR']), PYTHON_DIR_NAME)}, - 'TEMPLATES_DIR': {'default': lambda c: Path.joinpath(c['PYTHON_DIR'], TEMPLATES_DIR_NAME, 'legacy')}, + 'REPO_DIR': {'default': lambda c: Path(__file__).resolve().parent.parent.parent}, + 'PYTHON_DIR': {'default': lambda c: c['REPO_DIR'] / PYTHON_DIR_NAME}, + 'TEMPLATES_DIR': {'default': lambda c: c['PYTHON_DIR'] / TEMPLATES_DIR_NAME / 'legacy'}, - 'OUTPUT_DIR': {'default': lambda c: Path(os.path.abspath(os.path.expanduser(c['OUTPUT_DIR'])) if c['OUTPUT_DIR'] else os.path.abspath(os.curdir))}, - 'ARCHIVE_DIR': {'default': lambda c: Path.joinpath(c['OUTPUT_DIR'], ARCHIVE_DIR_NAME)}, - 'SOURCES_DIR': {'default': lambda c: Path.joinpath(c['OUTPUT_DIR'], SOURCES_DIR_NAME)}, - 'LOGS_DIR': {'default': lambda c: Path.joinpath(c['OUTPUT_DIR'], LOGS_DIR_NAME)}, - 'CONFIG_FILE': {'default': lambda c: os.path.abspath(os.path.expanduser(c['CONFIG_FILE'])) if c['CONFIG_FILE'] else Path.joinpath(c['OUTPUT_DIR'], CONFIG_FILENAME)}, - 'COOKIES_FILE': {'default': lambda c: c['COOKIES_FILE'] and os.path.abspath(os.path.expanduser(c['COOKIES_FILE']))}, - 'CHROME_USER_DATA_DIR': {'default': lambda c: find_chrome_data_dir() if c['CHROME_USER_DATA_DIR'] is None else (os.path.abspath(os.path.expanduser(c['CHROME_USER_DATA_DIR'])) or None)}, + 'OUTPUT_DIR': {'default': lambda c: Path(c['OUTPUT_DIR']).expanduser().resolve() if c['OUTPUT_DIR'] else 
Path(os.curdir).resolve()}, + 'ARCHIVE_DIR': {'default': lambda c: c['OUTPUT_DIR'] / ARCHIVE_DIR_NAME}, + 'SOURCES_DIR': {'default': lambda c: c['OUTPUT_DIR'] / SOURCES_DIR_NAME}, + 'LOGS_DIR': {'default': lambda c: c['OUTPUT_DIR'] / LOGS_DIR_NAME}, + 'CONFIG_FILE': {'default': lambda c: Path(c['CONFIG_FILE']).expanduser().resolve() if c['CONFIG_FILE'] else c['OUTPUT_DIR'] / CONFIG_FILENAME}, + 'COOKIES_FILE': {'default': lambda c: c['COOKIES_FILE'] and Path(c['COOKIES_FILE']).expanduser().resolve()}, + 'CHROME_USER_DATA_DIR': {'default': lambda c: find_chrome_data_dir() if c['CHROME_USER_DATA_DIR'] is None else (Path(c['CHROME_USER_DATA_DIR']).expanduser().resolve() or None)}, 'URL_BLACKLIST_PTN': {'default': lambda c: c['URL_BLACKLIST'] and re.compile(c['URL_BLACKLIST'] or '', re.IGNORECASE | re.UNICODE | re.MULTILINE)}, 'ARCHIVEBOX_BINARY': {'default': lambda c: sys.argv[0]}, @@ -347,9 +347,9 @@ def load_config_val(key: str, def load_config_file(out_dir: str=None) -> Optional[Dict[str, str]]: """load the ini-formatted config file from OUTPUT_DIR/Archivebox.conf""" - out_dir = out_dir or os.path.abspath(os.getenv('OUTPUT_DIR', '.')) - config_path = Path.joinpath(Path(out_dir), CONFIG_FILENAME) - if os.path.exists(config_path): + out_dir = out_dir or Path(os.getenv('OUTPUT_DIR', '.')).resolve() + config_path = Path(out_dir) / CONFIG_FILENAME + if config_path.exists(): config_file = ConfigParser() config_file.optionxform = str config_file.read(config_path) @@ -370,10 +370,10 @@ def write_config_file(config: Dict[str, str], out_dir: str=None) -> ConfigDict: from ..system import atomic_write - out_dir = out_dir or os.path.abspath(os.getenv('OUTPUT_DIR', '.')) - config_path = Path.joinpath(out_dir, CONFIG_FILENAME) + out_dir = out_dir or Path(os.getenv('OUTPUT_DIR', '.')).resolve() + config_path = Path(out_dir) / CONFIG_FILENAME - if not os.path.exists(config_path): + if not config_path.exists(): atomic_write(config_path, CONFIG_HEADER) config_file = ConfigParser() @@ -593,8 +593,8 @@ def find_chrome_data_dir() -> 
Optional[str]: '~/.config/google-chrome-dev', ) for path in default_profile_paths: - full_path = os.path.expanduser(path) - if os.path.exists(full_path): + full_path = Path(path).expanduser() + if full_path.exists(): return full_path return None @@ -609,19 +609,19 @@ def wget_supports_compression(config): def get_code_locations(config: ConfigDict) -> SimpleConfigValueDict: return { 'REPO_DIR': { - 'path': os.path.abspath(config['REPO_DIR']), + 'path': config['REPO_DIR'].resolve(), 'enabled': True, - 'is_valid': os.path.exists(Path.joinpath(config['REPO_DIR'], 'archivebox')), + 'is_valid': (config['REPO_DIR'] / 'archivebox').exists(), }, 'PYTHON_DIR': { - 'path': os.path.abspath(config['PYTHON_DIR']), + 'path': (config['PYTHON_DIR']).resolve(), 'enabled': True, - 'is_valid': os.path.exists(Path.joinpath(config['PYTHON_DIR'], '__main__.py')), + 'is_valid': (config['PYTHON_DIR'] / '__main__.py').exists(), }, 'TEMPLATES_DIR': { - 'path': os.path.abspath(config['TEMPLATES_DIR']), + 'path': (config['TEMPLATES_DIR']).resolve(), 'enabled': True, - 'is_valid': os.path.exists(Path.joinpath(config['TEMPLATES_DIR'], 'static')), + 'is_valid': (config['TEMPLATES_DIR'] / 'static').exists(), }, } @@ -643,44 +643,44 @@ def get_external_locations(config: ConfigDict) -> ConfigValue: def get_data_locations(config: ConfigDict) -> ConfigValue: return { 'OUTPUT_DIR': { - 'path': os.path.abspath(config['OUTPUT_DIR']), + 'path': config['OUTPUT_DIR'].resolve(), 'enabled': True, - 'is_valid': os.path.exists(Path.joinpath(config['OUTPUT_DIR'], JSON_INDEX_FILENAME)), + 'is_valid': (config['OUTPUT_DIR'] / JSON_INDEX_FILENAME).exists(), }, 'SOURCES_DIR': { - 'path': os.path.abspath(config['SOURCES_DIR']), + 'path': config['SOURCES_DIR'].resolve(), 'enabled': True, - 'is_valid': os.path.exists(config['SOURCES_DIR']), + 'is_valid': config['SOURCES_DIR'].exists(), }, 'LOGS_DIR': { - 'path': os.path.abspath(config['LOGS_DIR']), + 'path': config['LOGS_DIR'].resolve(), 'enabled': True, - 'is_valid': 
os.path.exists(config['LOGS_DIR']), + 'is_valid': config['LOGS_DIR'].exists(), }, 'ARCHIVE_DIR': { - 'path': os.path.abspath(config['ARCHIVE_DIR']), + 'path': config['ARCHIVE_DIR'].resolve(), 'enabled': True, - 'is_valid': os.path.exists(config['ARCHIVE_DIR']), + 'is_valid': config['ARCHIVE_DIR'].exists(), }, 'CONFIG_FILE': { - 'path': os.path.abspath(config['CONFIG_FILE']), + 'path': config['CONFIG_FILE'].resolve(), 'enabled': True, - 'is_valid': os.path.exists(config['CONFIG_FILE']), + 'is_valid': config['CONFIG_FILE'].exists(), }, 'SQL_INDEX': { - 'path': os.path.abspath(Path.joinpath(config['OUTPUT_DIR'], SQL_INDEX_FILENAME)), + 'path': (config['OUTPUT_DIR'] / SQL_INDEX_FILENAME).resolve(), 'enabled': True, - 'is_valid': os.path.exists(Path.joinpath(config['OUTPUT_DIR'], SQL_INDEX_FILENAME)), + 'is_valid': (config['OUTPUT_DIR'] / SQL_INDEX_FILENAME).exists(), }, 'JSON_INDEX': { - 'path': os.path.abspath(Path.joinpath(config['OUTPUT_DIR'], JSON_INDEX_FILENAME)), + 'path': (config['OUTPUT_DIR'] / JSON_INDEX_FILENAME).resolve(), 'enabled': True, - 'is_valid': os.path.exists(Path.joinpath(config['OUTPUT_DIR'], JSON_INDEX_FILENAME)), + 'is_valid': (config['OUTPUT_DIR'] / JSON_INDEX_FILENAME).exists(), }, 'HTML_INDEX': { - 'path': os.path.abspath(Path.joinpath(config['OUTPUT_DIR'], HTML_INDEX_FILENAME)), + 'path': (config['OUTPUT_DIR'] / HTML_INDEX_FILENAME).resolve(), 'enabled': True, - 'is_valid': os.path.exists(Path.joinpath(config['OUTPUT_DIR'], HTML_INDEX_FILENAME)), + 'is_valid': (config['OUTPUT_DIR'] / HTML_INDEX_FILENAME).exists(), }, } @@ -909,9 +909,9 @@ def check_data_folder(out_dir: Optional[str]=None, config: ConfigDict=CONFIG) -> stderr(' archivebox init') raise SystemExit(3) - sources_dir = Path.joinpath(output_dir, SOURCES_DIR_NAME) - if not os.path.exists(sources_dir): - os.makedirs(sources_dir) + sources_dir = Path(output_dir) / SOURCES_DIR_NAME + if not sources_dir.exists(): + sources_dir.mkdir() @@ -930,8 +930,8 @@ def setup_django(out_dir: 
str=None, check_db=False, config: ConfigDict=CONFIG) - django.setup() if check_db: - sql_index_path = Path.joinpath(output_dir, SQL_INDEX_FILENAME) - assert os.path.exists(sql_index_path), ( + sql_index_path = Path(output_dir) / SQL_INDEX_FILENAME + assert sql_index_path.exists(), ( f'No database file {SQL_INDEX_FILENAME} found in OUTPUT_DIR: {config["OUTPUT_DIR"]}') except KeyboardInterrupt: raise SystemExit(2) diff --git a/archivebox/index/__init__.py b/archivebox/index/__init__.py index 536b5819..f14c1aa4 100644 --- a/archivebox/index/__init__.py +++ b/archivebox/index/__init__.py @@ -212,7 +212,7 @@ def lowest_uniq_timestamp(used_timestamps: OrderedDict, timestamp: str) -> str: @contextmanager @enforce_types -def timed_index_update(out_path: str): +def timed_index_update(out_path: Path): log_indexing_started(out_path) timer = TimedProgress(TIMEOUT * 2, prefix=' ') try: @@ -220,7 +220,7 @@ def timed_index_update(out_path: str): finally: timer.end() - assert os.path.exists(out_path), f'Failed to write index file: {out_path}' + assert out_path.exists(), f'Failed to write index file: {out_path}' log_indexing_finished(out_path) @@ -231,27 +231,27 @@ def write_main_index(links: List[Link], out_dir: Path=OUTPUT_DIR, finished: bool log_indexing_process_started(len(links)) try: - with timed_index_update(os.path.join(out_dir, SQL_INDEX_FILENAME)): + with timed_index_update(out_dir / SQL_INDEX_FILENAME): write_sql_main_index(links, out_dir=out_dir) - os.chmod(os.path.join(out_dir, SQL_INDEX_FILENAME), int(OUTPUT_PERMISSIONS, base=8)) # set here because we don't write it with atomic writes + os.chmod(out_dir / SQL_INDEX_FILENAME, int(OUTPUT_PERMISSIONS, base=8)) # set here because we don't write it with atomic writes if finished: write_static_index(links, out_dir=out_dir) except (KeyboardInterrupt, SystemExit): stderr('[!] 
Warning: Still writing index to disk...', color='lightyellow') stderr(' Run archivebox init to fix any inconsisntencies from an ungraceful exit.') - with timed_index_update(os.path.join(out_dir, SQL_INDEX_FILENAME)): + with timed_index_update(out_dir / SQL_INDEX_FILENAME): write_sql_main_index(links, out_dir=out_dir) - os.chmod(os.path.join(out_dir, SQL_INDEX_FILENAME), int(OUTPUT_PERMISSIONS, base=8)) # set here because we don't write it with atomic writes + os.chmod(out_dir / SQL_INDEX_FILENAME, int(OUTPUT_PERMISSIONS, base=8)) # set here because we don't write it with atomic writes raise SystemExit(0) log_indexing_process_finished() @enforce_types -def write_static_index(links: List[Link], out_dir: str=OUTPUT_DIR) -> None: - with timed_index_update(os.path.join(out_dir, JSON_INDEX_FILENAME)): +def write_static_index(links: List[Link], out_dir: Path=OUTPUT_DIR) -> None: + with timed_index_update(out_dir / JSON_INDEX_FILENAME): write_json_main_index(links) - with timed_index_update(os.path.join(out_dir, HTML_INDEX_FILENAME)): + with timed_index_update(out_dir / HTML_INDEX_FILENAME): write_html_main_index(links, out_dir=out_dir, finished=True) @enforce_types @@ -273,8 +273,8 @@ def load_main_index(out_dir: Path=OUTPUT_DIR, warn: bool=True) -> List[Link]: @enforce_types def load_main_index_meta(out_dir: Path=OUTPUT_DIR) -> Optional[dict]: - index_path = os.path.join(out_dir, JSON_INDEX_FILENAME) - if os.path.exists(index_path): + index_path = out_dir / JSON_INDEX_FILENAME + if index_path.exists(): with open(index_path, 'r', encoding='utf-8') as f: meta_dict = pyjson.load(f) meta_dict.pop('links') @@ -422,7 +422,7 @@ def get_present_folders(links, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[L all_folders = {} - for entry in (Path(out_dir) / ARCHIVE_DIR_NAME).iterdir(): + for entry in (out_dir / ARCHIVE_DIR_NAME).iterdir(): if entry.is_dir(): link = None try: @@ -584,9 +584,9 @@ def is_unarchived(link: Link) -> bool: def 
fix_invalid_folder_locations(out_dir: Path=OUTPUT_DIR) -> Tuple[List[str], List[str]]: fixed = [] cant_fix = [] - for entry in os.scandir(os.path.join(out_dir, ARCHIVE_DIR_NAME)): + for entry in os.scandir(out_dir / ARCHIVE_DIR_NAME): if entry.is_dir(follow_symlinks=True): - if os.path.exists(os.path.join(entry.path, 'index.json')): + if (Path(entry.path) / 'index.json').exists(): try: link = parse_json_link_details(entry.path) except KeyError: @@ -595,8 +595,8 @@ def fix_invalid_folder_locations(out_dir: Path=OUTPUT_DIR) -> Tuple[List[str], L continue if not entry.path.endswith(f'/{link.timestamp}'): - dest = os.path.join(out_dir, ARCHIVE_DIR_NAME, link.timestamp) - if os.path.exists(dest): + dest = out_dir / ARCHIVE_DIR_NAME / link.timestamp + if dest.exists(): cant_fix.append(entry.path) else: shutil.move(entry.path, dest) diff --git a/archivebox/logging_util.py b/archivebox/logging_util.py index 36903867..212f3892 100644 --- a/archivebox/logging_util.py +++ b/archivebox/logging_util.py @@ -6,6 +6,7 @@ import sys import time import argparse from multiprocessing import Process +from pathlib import Path from datetime import datetime from dataclasses import dataclass @@ -442,11 +443,11 @@ def log_shell_welcome_msg(): ### Helpers @enforce_types -def pretty_path(path: str) -> str: +def pretty_path(path: Union[Path, str]) -> str: """convert paths like .../ArchiveBox/archivebox/../output/abc into output/abc""" pwd = os.path.abspath('.') # parent = os.path.abspath(os.path.join(pwd, os.path.pardir)) - return path.replace(pwd + '/', './') + return str(path).replace(pwd + '/', './') @enforce_types