
new methods for detecting valid/invalid data dirs on init

Nick Sweeting 2019-04-24 11:40:10 -04:00
parent ae782a1a0c
commit 56d0b2c088
2 changed files with 355 additions and 54 deletions

View file

@@ -7,11 +7,21 @@ __description__ = 'List all the URLs currently in the archive.'
import sys
import argparse

-from ..legacy.util import reject_stdin, to_json, to_csv
-from ..legacy.config import check_data_folder
-from ..legacy.main import list_archive_data
+from ..legacy.util import SmartFormatter, reject_stdin, to_json, to_csv
+from ..legacy.config import check_data_folder, OUTPUT_DIR
+from ..legacy.main import (
+    list_archive_data,
+    get_indexed_folders,
+    get_archived_folders,
+    get_unarchived_folders,
+    get_present_folders,
+    get_valid_folders,
+    get_invalid_folders,
+    get_duplicate_folders,
+    get_orphaned_folders,
+    get_corrupted_folders,
+    get_unrecognized_folders,
+)


def main(args=None):
    check_data_folder()
@@ -22,6 +32,7 @@ def main(args=None):
        prog=__command__,
        description=__description__,
        add_help=True,
+        formatter_class=SmartFormatter,
    )
    group = parser.add_mutually_exclusive_group()
    group.add_argument(
@ -44,15 +55,36 @@ def main(args=None):
parser.add_argument( parser.add_argument(
'--before', #'-b', '--before', #'-b',
type=float, type=float,
help="List only URLs bookmarked before the given timestamp.", help="List only links bookmarked before the given timestamp.",
default=None, default=None,
) )
parser.add_argument( parser.add_argument(
'--after', #'-a', '--after', #'-a',
type=float, type=float,
help="List only URLs bookmarked after the given timestamp.", help="List only links bookmarked after the given timestamp.",
default=None, default=None,
) )
parser.add_argument(
'--status',
type=str,
choices=('indexed', 'archived', 'unarchived', 'present', 'valid', 'invalid', 'duplicate', 'orphaned', 'corrupted', 'unrecognized'),
default='indexed',
help=(
'List only links or data directories that have the given status\n'
f' indexed {get_indexed_folders.__doc__} (the default)\n'
f' archived {get_archived_folders.__doc__}\n'
f' unarchived {get_unarchived_folders.__doc__}\n'
'\n'
f' present {get_present_folders.__doc__}\n'
f' valid {get_valid_folders.__doc__}\n'
f' invalid {get_invalid_folders.__doc__}\n'
'\n'
f' duplicate {get_duplicate_folders.__doc__}\n'
f' orphaned {get_orphaned_folders.__doc__}\n'
f' corrupted {get_corrupted_folders.__doc__}\n'
f' unrecognized {get_unrecognized_folders.__doc__}\n'
)
)
parser.add_argument( parser.add_argument(
'--filter-type', '--filter-type',
type=str, type=str,
@@ -76,17 +108,40 @@ def main(args=None):
        before=command.before,
        after=command.after,
    )

    if command.sort:
        links = sorted(links, key=lambda link: getattr(link, command.sort))

+    if command.status == 'indexed':
+        folders = get_indexed_folders(links, out_dir=OUTPUT_DIR)
+    elif command.status == 'archived':
+        folders = get_archived_folders(links, out_dir=OUTPUT_DIR)
+    elif command.status == 'unarchived':
+        folders = get_unarchived_folders(links, out_dir=OUTPUT_DIR)
+    elif command.status == 'present':
+        folders = get_present_folders(links, out_dir=OUTPUT_DIR)
+    elif command.status == 'valid':
+        folders = get_valid_folders(links, out_dir=OUTPUT_DIR)
+    elif command.status == 'invalid':
+        folders = get_invalid_folders(links, out_dir=OUTPUT_DIR)
+    elif command.status == 'duplicate':
+        folders = get_duplicate_folders(links, out_dir=OUTPUT_DIR)
+    elif command.status == 'orphaned':
+        folders = get_orphaned_folders(links, out_dir=OUTPUT_DIR)
+    elif command.status == 'corrupted':
+        folders = get_corrupted_folders(links, out_dir=OUTPUT_DIR)
+    elif command.status == 'unrecognized':
+        folders = get_unrecognized_folders(links, out_dir=OUTPUT_DIR)
+
    if command.csv:
-        print(to_csv(links, csv_cols=command.csv.split(','), header=True))
+        print(to_csv(folders.values(), csv_cols=command.csv.split(','), header=True))
    elif command.json:
-        print(to_json(list(links), indent=4, sort_keys=True))
+        print(to_json(folders.values(), indent=4, sort_keys=True))
    else:
-        print('\n'.join(link.url for link in links))
+        print('\n'.join(f'{folder} {link}' for folder, link in folders.items()))
+    raise SystemExit(not folders)


if __name__ == '__main__':
    main()
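The if/elif chain above simply maps each --status choice to the helper of the same name imported at the top of the file. A minimal sketch of an equivalent table-driven dispatch, assuming the same get_*_folders helpers and OUTPUT_DIR are in scope as in this diff (illustrative only, not part of this commit):

# Sketch only, not part of this commit: maps each --status value to the
# matching helper so main() could avoid the long if/elif chain.
STATUS_HANDLERS = {
    'indexed': get_indexed_folders,
    'archived': get_archived_folders,
    'unarchived': get_unarchived_folders,
    'present': get_present_folders,
    'valid': get_valid_folders,
    'invalid': get_invalid_folders,
    'duplicate': get_duplicate_folders,
    'orphaned': get_orphaned_folders,
    'corrupted': get_corrupted_folders,
    'unrecognized': get_unrecognized_folders,
}

def folders_for_status(status: str, links, out_dir: str=OUTPUT_DIR):
    # Equivalent to the if/elif dispatch in main() above.
    return STATUS_HANDLERS[status](links, out_dir=out_dir)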

View file

@@ -2,7 +2,8 @@ import os
import re
import shutil

-from typing import List, Optional, Iterable
+from typing import Dict, List, Optional, Iterable
+from itertools import chain

from .schema import Link
from .util import (
@@ -17,8 +18,13 @@ from .index import (
    import_new_links,
    write_main_index,
)
-from .storage.json import parse_json_main_index, parse_json_links_details
-from .storage.sql import parse_sql_main_index
+from .storage.json import (
+    parse_json_main_index,
+    parse_json_link_details,
+    parse_json_links_details,
+)
+from .storage.sql import parse_sql_main_index, get_admins
+from .storage.html import parse_html_main_index

from .archive_methods import archive_link
from .config import (
    stderr,
@@ -164,11 +170,39 @@ def init():
    orphaned_data_dir_links = {
        link.url: link
        for link in parse_json_links_details(OUTPUT_DIR)
-        if link.url not in all_links
    }
-    if orphaned_data_dir_links:
-        all_links.update(orphaned_data_dir_links)
-        print('    {lightyellow}√ Added {} orphaned links from existing archive directories...{reset}'.format(len(orphaned_data_dir_links), **ANSI))
+    orphan_new_links = {
+        url: link
+        for url, link in orphaned_data_dir_links.items()
+        if url not in all_links
+    }
+    orphan_duplicates = {
+        url: link
+        for url, link in orphaned_data_dir_links.items()
+        if url in all_links
+    }
+    if orphan_new_links:
+        all_links.update(orphan_new_links)
+        print('    {lightyellow}√ Added {} orphaned links from existing archive directories...{reset}'.format(len(orphan_new_links), **ANSI))
+    if orphan_duplicates:
+        print('    {lightyellow}! Skipped adding {} orphaned link data directories that would have overwritten existing data.{reset}'.format(len(orphan_duplicates), **ANSI))
+
+    orphaned_data_dirs = {folder for folder in orphan_duplicates.keys()}
+    invalid_folders = {
+        folder: link
+        for folder, link in get_invalid_folders(all_links.values(), out_dir=OUTPUT_DIR).items()
+        if folder not in orphaned_data_dirs
+    }
+    if invalid_folders:
+        print('    {lightyellow}! Skipped adding {} corrupted/unrecognized link data directories that could not be read.{reset}'.format(len(invalid_folders), **ANSI))
+
+    if orphan_duplicates or invalid_folders:
+        print('        For more information about the link data directories that were skipped, run:')
+        print('            archivebox info')
+        print('            archivebox list --status=invalid')
+        print('            archivebox list --status=orphaned')
+        print('            archivebox list --status=duplicate')

    write_main_index(list(all_links.values()), out_dir=OUTPUT_DIR)
@@ -190,55 +224,88 @@ def init():

@enforce_types
def info():
-    all_links = load_main_index(out_dir=OUTPUT_DIR)

-    print('{green}[*] Scanning archive collection main index with {} links:{reset}'.format(len(all_links), **ANSI))
-    print(f'    {OUTPUT_DIR}')
-    num_bytes, num_dirs, num_files = get_dir_size(OUTPUT_DIR, recursive=False)
+    print('{green}[*] Scanning archive collection main index...{reset}'.format(**ANSI))
+    print(f'    {OUTPUT_DIR}/*')
+    num_bytes, num_dirs, num_files = get_dir_size(OUTPUT_DIR, recursive=False, pattern='index.')
    size = human_readable_size(num_bytes)
-    print(f'    > Index Size: {size} across {num_files} files')
+    print(f'    Size: {size} across {num_files} files')
    print()

-    setup_django()
-    from django.contrib.auth.models import User
-    from core.models import Page
-
-    users = User.objects.all()
-    num_pages = Page.objects.count()
-
-    print(f'    > {len(users)} admin users:', ', '.join(u.username for u in users))
-    print(f'    > {num_pages} pages in SQL database {SQL_INDEX_FILENAME}')
-    print(f'    > {len(all_links)} pages in JSON database {JSON_INDEX_FILENAME}')
-    print()
+    links = load_main_index(out_dir=OUTPUT_DIR)
+    num_json_links = len(links)
+    num_sql_links = sum(1 for link in parse_sql_main_index(out_dir=OUTPUT_DIR))
+    num_html_links = sum(1 for url in parse_html_main_index(out_dir=OUTPUT_DIR))
+    num_link_details = sum(1 for link in parse_json_links_details(out_dir=OUTPUT_DIR))
+    users = get_admins().values_list('username', flat=True)
+    print(f'    > JSON Main Index: {num_json_links} links'.ljust(36), f'(found in {JSON_INDEX_FILENAME})')
+    print(f'    > SQL Main Index: {num_sql_links} links'.ljust(36), f'(found in {SQL_INDEX_FILENAME})')
+    print(f'    > HTML Main Index: {num_html_links} links'.ljust(36), f'(found in {HTML_INDEX_FILENAME})')
+    print(f'    > JSON Link Details: {num_link_details} links'.ljust(36), f'(found in {ARCHIVE_DIR_NAME}/*/index.json)')
+    print(f'    > Admin: {len(users)} users {", ".join(users)}'.ljust(36), f'(found in {SQL_INDEX_FILENAME})')
+
+    if num_html_links != len(links) or num_sql_links != len(links):
+        print()
+        print('    {lightred}Hint:{reset} You can fix index count differences automatically by running:'.format(**ANSI))
+        print('        archivebox init')
+
+    if not users:
+        print()
+        print('    {lightred}Hint:{reset} You can create an admin user by running:'.format(**ANSI))
+        print('        archivebox manage createsuperuser')

-    print('{green}[*] Scanning archive collection data directory with {} entries:{reset}'.format(len(all_links), **ANSI))
-    print(f'    {ARCHIVE_DIR}')
+    print()
+    print('{green}[*] Scanning archive collection link data directories...{reset}'.format(**ANSI))
+    print(f'    {ARCHIVE_DIR}/*')

    num_bytes, num_dirs, num_files = get_dir_size(ARCHIVE_DIR)
    size = human_readable_size(num_bytes)
-    print(f'    > Total Size: {size} across {num_files} files in {num_dirs} directories')
+    print(f'    Size: {size} across {num_files} files in {num_dirs} directories')
    print()

-    link_data_dirs = {link.link_dir for link in all_links}
-    valid_archive_dirs = set()
-    num_invalid = 0
-    for entry in os.scandir(ARCHIVE_DIR):
-        if entry.is_dir(follow_symlinks=True):
-            if os.path.exists(os.path.join(entry.path, 'index.json')):
-                valid_archive_dirs.add(entry.path)
-            else:
-                num_invalid += 1
-
-    print(f'    > {len(valid_archive_dirs)} valid archive data directories (valid directories matched to links in the index)')
-
-    num_unarchived = sum(1 for link in all_links if link.link_dir not in valid_archive_dirs)
-    print(f'    > {num_unarchived} missing data directories (directories missing for links in the index)')
-
-    print(f'    > {num_invalid} invalid data directories (directories present that don\'t contain an index file)')
-
-    num_orphaned = sum(1 for data_dir in valid_archive_dirs if data_dir not in link_data_dirs)
-    print(f'    > {num_orphaned} orphaned data directories (directories present for links that don\'t exist in the index)')
+    num_indexed = len(get_indexed_folders(links, out_dir=OUTPUT_DIR))
+    num_archived = len(get_archived_folders(links, out_dir=OUTPUT_DIR))
+    num_unarchived = len(get_unarchived_folders(links, out_dir=OUTPUT_DIR))
+    print(f'    > indexed: {num_indexed}'.ljust(36), f'({get_indexed_folders.__doc__})')
+    print(f'    > archived: {num_archived}'.ljust(36), f'({get_archived_folders.__doc__})')
+    print(f'    > unarchived: {num_unarchived}'.ljust(36), f'({get_unarchived_folders.__doc__})')
+
+    num_present = len(get_present_folders(links, out_dir=OUTPUT_DIR))
+    num_valid = len(get_valid_folders(links, out_dir=OUTPUT_DIR))
+    print()
+    print(f'    > present: {num_present}'.ljust(36), f'({get_present_folders.__doc__})')
+    print(f'    > valid: {num_valid}'.ljust(36), f'({get_valid_folders.__doc__})')
+
+    duplicate = get_duplicate_folders(links, out_dir=OUTPUT_DIR)
+    orphaned = get_orphaned_folders(links, out_dir=OUTPUT_DIR)
+    corrupted = get_corrupted_folders(links, out_dir=OUTPUT_DIR)
+    unrecognized = get_unrecognized_folders(links, out_dir=OUTPUT_DIR)
+    num_invalid = len({**duplicate, **orphaned, **corrupted, **unrecognized})
+    print(f'    > invalid: {num_invalid}'.ljust(36), f'({get_invalid_folders.__doc__})')
+    print(f'    > duplicate: {len(duplicate)}'.ljust(36), f'({get_duplicate_folders.__doc__})')
+    print(f'    > orphaned: {len(orphaned)}'.ljust(36), f'({get_orphaned_folders.__doc__})')
+    print(f'    > corrupted: {len(corrupted)}'.ljust(36), f'({get_corrupted_folders.__doc__})')
+    print(f'    > unrecognized: {len(unrecognized)}'.ljust(36), f'({get_unrecognized_folders.__doc__})')
+
+    if num_indexed:
+        print()
+        print('    {lightred}Hint:{reset} You can list link data directories by status like so:'.format(**ANSI))
+        print('        archivebox list --status=<status>  (e.g. indexed, corrupted, archived, etc.)')
+
+    if orphaned:
+        print()
+        print('    {lightred}Hint:{reset} To automatically import orphaned data directories into the main index, run:'.format(**ANSI))
+        print('        archivebox init')
+
+    if num_invalid:
+        print()
+        print('    {lightred}Hint:{reset} You may need to manually remove or fix some invalid data directories, afterwards make sure to run:'.format(**ANSI))
+        print('        archivebox init')
+
+    print()


@enforce_types
def update_archive_data(import_path: Optional[str]=None, resume: Optional[float]=None, only_new: bool=False) -> List[Link]:
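One subtlety in the info() output above: the duplicate/orphaned/corrupted/unrecognized counts can add up to more than the 'invalid' total, because 'invalid' is a merged dict keyed by folder path, so a folder that falls into several categories is only counted once. A minimal sketch of that relationship, assuming the get_*_folders helpers behave as their docstrings below describe (illustrative only, not part of this commit):

# Sketch only, not part of this commit: mirrors how get_invalid_folders()
# is defined below. Later keys overwrite earlier ones in the dict merge,
# so shared folder paths are deduplicated in the 'invalid' total.
def count_invalid(links, out_dir: str=OUTPUT_DIR) -> int:
    duplicate = get_duplicate_folders(links, out_dir=out_dir)
    orphaned = get_orphaned_folders(links, out_dir=out_dir)
    corrupted = get_corrupted_folders(links, out_dir=out_dir)
    unrecognized = get_unrecognized_folders(links, out_dir=out_dir)
    return len({**duplicate, **orphaned, **corrupted, **unrecognized})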
@@ -367,3 +434,182 @@ def remove_archive_links(filter_patterns: List[str], filter_type: str='exact',
    log_removal_finished(len(all_links), len(to_keep))

    return to_keep
+
+def get_indexed_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+    """indexed links without checking archive status or data directory validity"""
+    return {
+        link.link_dir: link
+        for link in links
+    }
+
+def get_archived_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+    """indexed links that are archived with a valid data directory"""
+    return {
+        link.link_dir: link
+        for link in filter(is_archived, links)
+    }
+
+def get_unarchived_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+    """indexed links that are unarchived with no data directory or an empty data directory"""
+    return {
+        link.link_dir: link
+        for link in filter(is_unarchived, links)
+    }
+
+def get_present_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+    """dirs that are expected to exist based on the main index"""
+    all_folders = {}
+    for entry in os.scandir(os.path.join(out_dir, ARCHIVE_DIR_NAME)):
+        if entry.is_dir(follow_symlinks=True):
+            link = None
+            try:
+                link = parse_json_link_details(entry.path)
+            except Exception:
+                pass
+            all_folders[entry.path] = link
+    return all_folders
+
+def get_valid_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+    """dirs with a valid index matched to the main index and archived content"""
+    return {
+        link.link_dir: link
+        for link in filter(is_valid, links)
+    }
+
+def get_invalid_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+    """dirs that are invalid for any reason: corrupted/duplicate/orphaned/unrecognized"""
+    duplicate = get_duplicate_folders(links, out_dir=out_dir)
+    orphaned = get_orphaned_folders(links, out_dir=out_dir)
+    corrupted = get_corrupted_folders(links, out_dir=out_dir)
+    unrecognized = get_unrecognized_folders(links, out_dir=out_dir)
+    return {**duplicate, **orphaned, **corrupted, **unrecognized}
+
+def get_duplicate_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+    """dirs that conflict with other directories that have the same link URL or timestamp"""
+    links = list(links)
+    by_url = {link.url: 0 for link in links}
+    by_timestamp = {link.timestamp: 0 for link in links}
+    duplicate_folders = {}
+
+    indexed_folders = {link.link_dir for link in links}
+    data_folders = (
+        entry.path
+        for entry in os.scandir(os.path.join(out_dir, ARCHIVE_DIR_NAME))
+        if entry.is_dir(follow_symlinks=True) and entry.path not in indexed_folders
+    )
+
+    for path in chain(sorted(indexed_folders), sorted(data_folders)):
+        link = None
+        try:
+            link = parse_json_link_details(path)
+        except Exception:
+            pass
+
+        if link:
+            # link folder has same timestamp as different link folder
+            by_timestamp[link.timestamp] = by_timestamp.get(link.timestamp, 0) + 1
+            if by_timestamp[link.timestamp] > 1:
+                duplicate_folders[path] = link
+
+            # link folder has same url as different link folder
+            by_url[link.url] = by_url.get(link.url, 0) + 1
+            if by_url[link.url] > 1:
+                duplicate_folders[path] = link
+
+    return duplicate_folders
+
+def get_orphaned_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+    """dirs that contain a valid index but aren't listed in the main index"""
+    links = list(links)
+    indexed_folders = {link.link_dir: link for link in links}
+    orphaned_folders = {}
+
+    for entry in os.scandir(os.path.join(out_dir, ARCHIVE_DIR_NAME)):
+        if entry.is_dir(follow_symlinks=True):
+            index_exists = os.path.exists(os.path.join(entry.path, 'index.json'))
+            link = None
+            try:
+                link = parse_json_link_details(entry.path)
+            except Exception:
+                pass
+
+            if index_exists and entry.path not in indexed_folders:
+                # folder is a valid link data dir with index details, but it's not in the main index
+                orphaned_folders[entry.path] = link
+
+    return orphaned_folders
+
+def get_corrupted_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+    """dirs that don't contain a valid index and aren't listed in the main index"""
+    return {
+        link.link_dir: link
+        for link in filter(is_corrupt, links)
+    }
+
+def get_unrecognized_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+    """dirs that don't contain recognizable archive data and aren't listed in the main index"""
+    by_timestamp = {link.timestamp: 0 for link in links}
+    unrecognized_folders: Dict[str, Optional[Link]] = {}
+
+    for entry in os.scandir(os.path.join(out_dir, ARCHIVE_DIR_NAME)):
+        if entry.is_dir(follow_symlinks=True):
+            index_exists = os.path.exists(os.path.join(entry.path, 'index.json'))
+            link = None
+            try:
+                link = parse_json_link_details(entry.path)
+            except Exception:
+                pass
+
+            if index_exists and link is None:
+                # index exists but it's corrupted or unparseable
+                unrecognized_folders[entry.path] = link
+            elif not index_exists:
+                # link details index doesn't exist and the folder isn't in the main index
+                timestamp = entry.path.rsplit('/', 1)[-1]
+                if timestamp not in by_timestamp:
+                    unrecognized_folders[entry.path] = link
+
+    return unrecognized_folders
+
+def is_valid(link: Link) -> bool:
+    dir_exists = os.path.exists(link.link_dir)
+    index_exists = os.path.exists(os.path.join(link.link_dir, 'index.json'))
+    if not dir_exists:
+        # unarchived links are not included in the valid list
+        return False
+    if dir_exists and not index_exists:
+        return False
+    if dir_exists and index_exists:
+        try:
+            parsed_link = parse_json_link_details(link.link_dir)
+            return link.url == parsed_link.url
+        except Exception:
+            pass
+    return False
+
+def is_corrupt(link: Link) -> bool:
+    if not os.path.exists(link.link_dir):
+        # unarchived links are not considered corrupt
+        return False
+    if is_valid(link):
+        return False
+    return True
+
+def is_archived(link: Link) -> bool:
+    return is_valid(link) and link.is_archived
+
+def is_unarchived(link: Link) -> bool:
+    if not os.path.exists(link.link_dir):
+        return True
+    return not link.is_archived
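
Taken together, the four predicates above partition every indexed link by the state of its data directory. A small sketch of how they might be combined to label a single link, using only the functions defined above (illustrative only, not part of this commit; the label names just mirror the --status choices):

# Sketch only, not part of this commit: labels mirror the --status values
# handled by `archivebox list`.
def classify_link(link: Link) -> str:
    if is_archived(link):
        return 'archived'      # valid data dir and link.is_archived is True
    if is_valid(link):
        return 'valid'         # data dir and index.json match the link, but not archived yet
    if is_corrupt(link):
        return 'corrupted'     # data dir exists but its index is missing or mismatched
    return 'unarchived'        # no data dir on disk yet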