
pathlib with / syntax for config, index

apkallum 2020-09-07 18:49:14 -04:00 committed by Cristian Vargas
parent 594d9e49ce
commit b99784b919
3 changed files with 64 additions and 63 deletions
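The change is mechanical throughout: string paths built with os.path.join() / Path.joinpath() become pathlib.Path objects combined with the / operator, and os.path.exists() / os.makedirs() calls become .exists() / .mkdir() methods. A minimal before/after sketch of the pattern (illustrative placeholder values, not code from this commit):

    import os
    from pathlib import Path

    # before: string manipulation via os.path
    out_dir = os.path.abspath(os.getenv('OUTPUT_DIR', '.'))
    config_path = os.path.join(out_dir, 'ArchiveBox.conf')
    if not os.path.exists(config_path):
        os.makedirs(out_dir, exist_ok=True)

    # after: Path objects joined with the '/' operator
    out_dir = Path(os.getenv('OUTPUT_DIR', '.')).resolve()
    config_path = out_dir / 'ArchiveBox.conf'
    if not config_path.exists():
        out_dir.mkdir(exist_ok=True)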

View file

@@ -222,17 +222,17 @@ DERIVED_CONFIG_DEFAULTS: ConfigDefaultDict = {
     'USER': {'default': lambda c: getpass.getuser() or os.getlogin()},
     'ANSI': {'default': lambda c: DEFAULT_CLI_COLORS if c['USE_COLOR'] else {k: '' for k in DEFAULT_CLI_COLORS.keys()}},
-    'REPO_DIR': {'default': lambda c: Path(os.path.abspath(os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', '..')))},
-    'PYTHON_DIR': {'default': lambda c: Path.joinpath(Path(c['REPO_DIR']), PYTHON_DIR_NAME)},
-    'TEMPLATES_DIR': {'default': lambda c: Path.joinpath(c['PYTHON_DIR'], TEMPLATES_DIR_NAME, 'legacy')},
-    'OUTPUT_DIR': {'default': lambda c: Path(os.path.abspath(os.path.expanduser(c['OUTPUT_DIR'])) if c['OUTPUT_DIR'] else os.path.abspath(os.curdir))},
-    'ARCHIVE_DIR': {'default': lambda c: Path.joinpath(c['OUTPUT_DIR'], ARCHIVE_DIR_NAME)},
-    'SOURCES_DIR': {'default': lambda c: Path.joinpath(c['OUTPUT_DIR'], SOURCES_DIR_NAME)},
-    'LOGS_DIR': {'default': lambda c: Path.joinpath(c['OUTPUT_DIR'], LOGS_DIR_NAME)},
-    'CONFIG_FILE': {'default': lambda c: os.path.abspath(os.path.expanduser(c['CONFIG_FILE'])) if c['CONFIG_FILE'] else Path.joinpath(c['OUTPUT_DIR'], CONFIG_FILENAME)},
-    'COOKIES_FILE': {'default': lambda c: c['COOKIES_FILE'] and os.path.abspath(os.path.expanduser(c['COOKIES_FILE']))},
-    'CHROME_USER_DATA_DIR': {'default': lambda c: find_chrome_data_dir() if c['CHROME_USER_DATA_DIR'] is None else (os.path.abspath(os.path.expanduser(c['CHROME_USER_DATA_DIR'])) or None)},
+    'REPO_DIR': {'default': lambda c: Path(__file__).resolve().parent.parent.parent},
+    'PYTHON_DIR': {'default': lambda c: c['REPO_DIR'] / PYTHON_DIR_NAME},
+    'TEMPLATES_DIR': {'default': lambda c: c['PYTHON_DIR'] / TEMPLATES_DIR_NAME / 'legacy'},
+    'OUTPUT_DIR': {'default': lambda c: Path.home() / c['OUTPUT_DIR'] if c['OUTPUT_DIR'] else Path(os.curdir).resolve()},
+    'ARCHIVE_DIR': {'default': lambda c: c['OUTPUT_DIR'] / ARCHIVE_DIR_NAME},
+    'SOURCES_DIR': {'default': lambda c: c['OUTPUT_DIR'] / SOURCES_DIR_NAME},
+    'LOGS_DIR': {'default': lambda c: c['OUTPUT_DIR'] / LOGS_DIR_NAME},
+    'CONFIG_FILE': {'default': lambda c: Path.home() / c['CONFIG_FILE'] if c['CONFIG_FILE'] else c['OUTPUT_DIR'] / CONFIG_FILENAME},
+    'COOKIES_FILE': {'default': lambda c: c['COOKIES_FILE'] and Path.home() / c['COOKIES_FILE']},
+    'CHROME_USER_DATA_DIR': {'default': lambda c: find_chrome_data_dir() if c['CHROME_USER_DATA_DIR'] is None else Path.home() / c['CHROME_USER_DATA_DIR'] or None},
     'URL_BLACKLIST_PTN': {'default': lambda c: c['URL_BLACKLIST'] and re.compile(c['URL_BLACKLIST'] or '', re.IGNORECASE | re.UNICODE | re.MULTILINE)},
     'ARCHIVEBOX_BINARY': {'default': lambda c: sys.argv[0]},
@@ -347,9 +347,9 @@ def load_config_val(key: str,
 def load_config_file(out_dir: str=None) -> Optional[Dict[str, str]]:
     """load the ini-formatted config file from OUTPUT_DIR/Archivebox.conf"""
-    out_dir = out_dir or os.path.abspath(os.getenv('OUTPUT_DIR', '.'))
-    config_path = Path.joinpath(Path(out_dir), CONFIG_FILENAME)
-    if os.path.exists(config_path):
+    out_dir = out_dir or Path(os.getenv('OUTPUT_DIR', '.')).resolve()
+    config_path = Path(out_dir) / CONFIG_FILENAME
+    if config_path.exists():
         config_file = ConfigParser()
         config_file.optionxform = str
         config_file.read(config_path)
@@ -370,10 +370,10 @@ def write_config_file(config: Dict[str, str], out_dir: str=None) -> ConfigDict:
     from ..system import atomic_write
-    out_dir = out_dir or os.path.abspath(os.getenv('OUTPUT_DIR', '.'))
-    config_path = Path.joinpath(out_dir, CONFIG_FILENAME)
-    if not os.path.exists(config_path):
+    out_dir = out_dir or Path(os.getenv('OUTPUT_DIR', '.')).resolve()
+    config_path = Path(out_dir) / CONFIG_FILENAME
+    if not config_path.exists():
         atomic_write(config_path, CONFIG_HEADER)
     config_file = ConfigParser()
@@ -593,8 +593,8 @@ def find_chrome_data_dir() -> Optional[str]:
         '~/.config/google-chrome-dev',
     )
     for path in default_profile_paths:
-        full_path = os.path.expanduser(path)
-        if os.path.exists(full_path):
+        full_path = Path.home() / path
+        if full_path.exists():
             return full_path
     return None
@@ -609,19 +609,19 @@ def wget_supports_compression(config):
 def get_code_locations(config: ConfigDict) -> SimpleConfigValueDict:
     return {
         'REPO_DIR': {
-            'path': os.path.abspath(config['REPO_DIR']),
+            'path': config['REPO_DIR'].resolve(),
             'enabled': True,
-            'is_valid': os.path.exists(Path.joinpath(config['REPO_DIR'], 'archivebox')),
+            'is_valid': (config['REPO_DIR'] / 'archivebox').exists(),
         },
         'PYTHON_DIR': {
-            'path': os.path.abspath(config['PYTHON_DIR']),
+            'path': (config['PYTHON_DIR']).resolve(),
             'enabled': True,
-            'is_valid': os.path.exists(Path.joinpath(config['PYTHON_DIR'], '__main__.py')),
+            'is_valid': (config['PYTHON_DIR'] / '__main__.py').exists(),
         },
         'TEMPLATES_DIR': {
-            'path': os.path.abspath(config['TEMPLATES_DIR']),
+            'path': (config['TEMPLATES_DIR']).resolve(),
             'enabled': True,
-            'is_valid': os.path.exists(Path.joinpath(config['TEMPLATES_DIR'], 'static')),
+            'is_valid': (config['TEMPLATES_DIR'] / 'static').exists(),
         },
     }
@@ -643,44 +643,44 @@ def get_external_locations(config: ConfigDict) -> ConfigValue:
 def get_data_locations(config: ConfigDict) -> ConfigValue:
     return {
         'OUTPUT_DIR': {
-            'path': os.path.abspath(config['OUTPUT_DIR']),
+            'path': config['OUTPUT_DIR'].resolve(),
             'enabled': True,
-            'is_valid': os.path.exists(Path.joinpath(config['OUTPUT_DIR'], JSON_INDEX_FILENAME)),
+            'is_valid': (config['OUTPUT_DIR'] / JSON_INDEX_FILENAME).exists(),
         },
         'SOURCES_DIR': {
-            'path': os.path.abspath(config['SOURCES_DIR']),
+            'path': config['SOURCES_DIR'].resolve(),
             'enabled': True,
-            'is_valid': os.path.exists(config['SOURCES_DIR']),
+            'is_valid': config['SOURCES_DIR'].exists(),
         },
         'LOGS_DIR': {
-            'path': os.path.abspath(config['LOGS_DIR']),
+            'path': config['LOGS_DIR'].resolve(),
             'enabled': True,
-            'is_valid': os.path.exists(config['LOGS_DIR']),
+            'is_valid': config['LOGS_DIR'].exists(),
         },
         'ARCHIVE_DIR': {
-            'path': os.path.abspath(config['ARCHIVE_DIR']),
+            'path': config['ARCHIVE_DIR'].resolve(),
             'enabled': True,
-            'is_valid': os.path.exists(config['ARCHIVE_DIR']),
+            'is_valid': config['ARCHIVE_DIR'].exists(),
         },
         'CONFIG_FILE': {
-            'path': os.path.abspath(config['CONFIG_FILE']),
+            'path': config['CONFIG_FILE'].resolve(),
             'enabled': True,
-            'is_valid': os.path.exists(config['CONFIG_FILE']),
+            'is_valid': config['CONFIG_FILE'].exists(),
         },
         'SQL_INDEX': {
-            'path': os.path.abspath(Path.joinpath(config['OUTPUT_DIR'], SQL_INDEX_FILENAME)),
+            'path': (config['OUTPUT_DIR'] / SQL_INDEX_FILENAME).resolve(),
             'enabled': True,
-            'is_valid': os.path.exists(Path.joinpath(config['OUTPUT_DIR'], SQL_INDEX_FILENAME)),
+            'is_valid': (config['OUTPUT_DIR'] / SQL_INDEX_FILENAME).exists(),
         },
         'JSON_INDEX': {
-            'path': os.path.abspath(Path.joinpath(config['OUTPUT_DIR'], JSON_INDEX_FILENAME)),
+            'path': (config['OUTPUT_DIR'] / JSON_INDEX_FILENAME).resolve(),
             'enabled': True,
-            'is_valid': os.path.exists(Path.joinpath(config['OUTPUT_DIR'], JSON_INDEX_FILENAME)),
+            'is_valid': (config['OUTPUT_DIR'] / JSON_INDEX_FILENAME).exists(),
         },
         'HTML_INDEX': {
-            'path': os.path.abspath(Path.joinpath(config['OUTPUT_DIR'], HTML_INDEX_FILENAME)),
+            'path': (config['OUTPUT_DIR'] / HTML_INDEX_FILENAME).resolve(),
             'enabled': True,
-            'is_valid': os.path.exists(Path.joinpath(config['OUTPUT_DIR'], HTML_INDEX_FILENAME)),
+            'is_valid': (config['OUTPUT_DIR'] / HTML_INDEX_FILENAME).exists(),
         },
     }
@@ -909,9 +909,9 @@ def check_data_folder(out_dir: Optional[str]=None, config: ConfigDict=CONFIG) ->
         stderr(' archivebox init')
         raise SystemExit(3)
-    sources_dir = Path.joinpath(output_dir, SOURCES_DIR_NAME)
-    if not os.path.exists(sources_dir):
-        os.makedirs(sources_dir)
+    sources_dir = Path(output_dir) / SOURCES_DIR_NAME
+    if not sources_dir.exists():
+        sources_dir.mkdir()
@@ -930,8 +930,8 @@ def setup_django(out_dir: str=None, check_db=False, config: ConfigDict=CONFIG) -
         django.setup()
         if check_db:
-            sql_index_path = Path.joinpath(output_dir, SQL_INDEX_FILENAME)
-            assert os.path.exists(sql_index_path), (
+            sql_index_path = Path(output_dir) / SQL_INDEX_FILENAME
+            assert sql_index_path.exists(), (
                 f'No database file {SQL_INDEX_FILENAME} found in OUTPUT_DIR: {config["OUTPUT_DIR"]}')
     except KeyboardInterrupt:
         raise SystemExit(2)
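One subtlety worth flagging in the OUTPUT_DIR, CONFIG_FILE, COOKIES_FILE, and CHROME_USER_DATA_DIR hunks above: Path.home() / p is not a strict equivalent of os.path.expanduser(p). The two agree when p is absolute (an absolute right-hand operand of / discards the left side), but diverge for '~'-prefixed and bare relative values. A quick illustration, assuming a POSIX home directory of /home/user:

    import os
    from pathlib import Path

    home = Path.home()                # PosixPath('/home/user')

    os.path.expanduser('/data/out')   # '/data/out'
    home / '/data/out'                # PosixPath('/data/out') -- absolute rhs wins

    os.path.expanduser('~/out')       # '/home/user/out'
    home / '~/out'                    # PosixPath('/home/user/~/out') -- literal '~' kept

    os.path.expanduser('out')         # 'out' (left relative)
    home / 'out'                      # PosixPath('/home/user/out') -- anchored to home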

View file

@@ -212,7 +212,7 @@ def lowest_uniq_timestamp(used_timestamps: OrderedDict, timestamp: str) -> str:
 @contextmanager
 @enforce_types
-def timed_index_update(out_path: str):
+def timed_index_update(out_path: Path):
     log_indexing_started(out_path)
     timer = TimedProgress(TIMEOUT * 2, prefix=' ')
     try:
@@ -220,7 +220,7 @@ def timed_index_update(out_path: str):
     finally:
         timer.end()
-    assert os.path.exists(out_path), f'Failed to write index file: {out_path}'
+    assert out_path.exists(), f'Failed to write index file: {out_path}'
     log_indexing_finished(out_path)
@@ -231,27 +231,27 @@ def write_main_index(links: List[Link], out_dir: Path=OUTPUT_DIR, finished: bool
     log_indexing_process_started(len(links))
     try:
-        with timed_index_update(os.path.join(out_dir, SQL_INDEX_FILENAME)):
+        with timed_index_update(out_dir / SQL_INDEX_FILENAME):
             write_sql_main_index(links, out_dir=out_dir)
-            os.chmod(os.path.join(out_dir, SQL_INDEX_FILENAME), int(OUTPUT_PERMISSIONS, base=8)) # set here because we don't write it with atomic writes
+            os.chmod(out_dir / SQL_INDEX_FILENAME, int(OUTPUT_PERMISSIONS, base=8)) # set here because we don't write it with atomic writes
         if finished:
             write_static_index(links, out_dir=out_dir)
     except (KeyboardInterrupt, SystemExit):
         stderr('[!] Warning: Still writing index to disk...', color='lightyellow')
         stderr(' Run archivebox init to fix any inconsistencies from an ungraceful exit.')
-        with timed_index_update(os.path.join(out_dir, SQL_INDEX_FILENAME)):
+        with timed_index_update(out_dir / SQL_INDEX_FILENAME):
             write_sql_main_index(links, out_dir=out_dir)
-            os.chmod(os.path.join(out_dir, SQL_INDEX_FILENAME), int(OUTPUT_PERMISSIONS, base=8)) # set here because we don't write it with atomic writes
+            os.chmod(out_dir / SQL_INDEX_FILENAME, int(OUTPUT_PERMISSIONS, base=8)) # set here because we don't write it with atomic writes
         raise SystemExit(0)
     log_indexing_process_finished()

 @enforce_types
-def write_static_index(links: List[Link], out_dir: str=OUTPUT_DIR) -> None:
-    with timed_index_update(os.path.join(out_dir, JSON_INDEX_FILENAME)):
+def write_static_index(links: List[Link], out_dir: Path=OUTPUT_DIR) -> None:
+    with timed_index_update(str(out_dir / JSON_INDEX_FILENAME)):
         write_json_main_index(links)
-    with timed_index_update(os.path.join(out_dir, HTML_INDEX_FILENAME)):
+    with timed_index_update(str(out_dir / HTML_INDEX_FILENAME)):
         write_html_main_index(links, out_dir=out_dir, finished=True)

 @enforce_types
@@ -273,8 +273,8 @@ def load_main_index(out_dir: Path=OUTPUT_DIR, warn: bool=True) -> List[Link]:
 @enforce_types
 def load_main_index_meta(out_dir: Path=OUTPUT_DIR) -> Optional[dict]:
-    index_path = os.path.join(out_dir, JSON_INDEX_FILENAME)
-    if os.path.exists(index_path):
+    index_path = out_dir / JSON_INDEX_FILENAME
+    if index_path.exists():
         with open(index_path, 'r', encoding='utf-8') as f:
             meta_dict = pyjson.load(f)
             meta_dict.pop('links')
@@ -422,7 +422,7 @@ def get_present_folders(links, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[L
     all_folders = {}
-    for entry in (Path(out_dir) / ARCHIVE_DIR_NAME).iterdir():
+    for entry in (out_dir / ARCHIVE_DIR_NAME).iterdir():
         if entry.is_dir():
             link = None
             try:
@@ -584,9 +584,9 @@ def is_unarchived(link: Link) -> bool:
 def fix_invalid_folder_locations(out_dir: Path=OUTPUT_DIR) -> Tuple[List[str], List[str]]:
     fixed = []
     cant_fix = []
-    for entry in os.scandir(os.path.join(out_dir, ARCHIVE_DIR_NAME)):
+    for entry in os.scandir(out_dir / ARCHIVE_DIR_NAME):
         if entry.is_dir(follow_symlinks=True):
-            if os.path.exists(os.path.join(entry.path, 'index.json')):
+            if (Path(entry.path) / 'index.json').exists():
                 try:
                     link = parse_json_link_details(entry.path)
                 except KeyError:
@@ -595,8 +595,8 @@ def fix_invalid_folder_locations(out_dir: Path=OUTPUT_DIR) -> Tuple[List[str], L
                     continue
                 if not entry.path.endswith(f'/{link.timestamp}'):
-                    dest = os.path.join(out_dir, ARCHIVE_DIR_NAME, link.timestamp)
-                    if os.path.exists(dest):
+                    dest = out_dir / ARCHIVE_DIR_NAME / link.timestamp
+                    if dest.exists():
                         cant_fix.append(entry.path)
                     else:
                         shutil.move(entry.path, dest)
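The mixed style in this file (os.scandir() and os.chmod() called with Path arguments) works because the os module accepts any os.PathLike object since Python 3.6 (PEP 519); note that os.DirEntry.path is still a plain str, which is why the hunks above wrap it back in Path(entry.path). A small sketch, using a throwaway temp directory rather than a real archive:

    import os
    import tempfile
    from pathlib import Path

    out_dir = Path(tempfile.mkdtemp())
    (out_dir / 'archive').mkdir()
    (out_dir / 'archive' / '1599510554').mkdir()    # a timestamp-named snapshot folder

    for entry in os.scandir(out_dir / 'archive'):   # Path accepted via os.fspath()
        print(Path(entry.path))                     # entry.path is a str, so re-wrap it
    os.chmod(out_dir / 'archive', 0o755)            # Path accepted here too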

View file

@@ -6,6 +6,7 @@ import sys
 import time
 import argparse
 from multiprocessing import Process
+from pathlib import Path
 from datetime import datetime
 from dataclasses import dataclass
@@ -442,11 +443,11 @@ def log_shell_welcome_msg():
 ### Helpers

 @enforce_types
-def pretty_path(path: str) -> str:
+def pretty_path(path: Union[Path, str]) -> str:
     """convert paths like .../ArchiveBox/archivebox/../output/abc into output/abc"""
     pwd = os.path.abspath('.')
     # parent = os.path.abspath(os.path.join(pwd, os.path.pardir))
-    return path.replace(pwd + '/', './')
+    return str(path).replace(pwd + '/', './')

 @enforce_types
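With the str() coercion in place, pretty_path() accepts either a str or a Path and behaves the same for both; for example, assuming the working directory is /home/user/output:

    >>> pretty_path('/home/user/output/archive/abc')
    './archive/abc'
    >>> pretty_path(Path('/home/user/output/archive/abc'))
    './archive/abc'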