
move everything out of legacy folder

Nick Sweeting 2019-04-27 17:26:24 -04:00
parent 553f312125
commit 1b8abc0961
74 changed files with 3162 additions and 2629 deletions

View file

@ -1,3 +1,6 @@
__package__ = 'archivebox'
from . import core
from . import cli
from .main import *

View file

@ -2,9 +2,14 @@
__package__ = 'archivebox'
from .cli.archivebox import main
import sys
from .cli import archivebox
def main():
archivebox.main(args=sys.argv[1:], stdin=sys.stdin)
if __name__ == '__main__':
main()
archivebox.main(args=sys.argv[1:], stdin=sys.stdin)
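Note: with this entrypoint change, running the package as a module routes everything through the CLI dispatcher. A minimal hedged sketch of the equivalence, assuming the package is importable (the subcommand below is illustrative):

    # running `python -m archivebox version` is roughly the same as:
    from archivebox.cli import archivebox

    archivebox.main(args=['version'], stdin=None)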

View file

@ -2,13 +2,17 @@ __package__ = 'archivebox.cli'
import os
from typing import Dict
from typing import Dict, List, Optional, IO
from importlib import import_module
CLI_DIR = os.path.dirname(os.path.abspath(__file__))
# these common commands will appear sorted before any others for ease-of-use
display_first = ('help', 'version', 'init', 'info', 'config', 'list', 'update', 'add', 'remove')
meta_cmds = ('help', 'version')
main_cmds = ('init', 'info', 'config')
archive_cmds = ('add', 'remove', 'update', 'list')
display_first = (*meta_cmds, *main_cmds, *archive_cmds)
# every imported command module must have these properties in order to be valid
required_attrs = ('__package__', '__command__', 'main')
@ -42,11 +46,14 @@ def list_subcommands() -> Dict[str, str]:
return dict(sorted(COMMANDS, key=display_order))
def run_subcommand(subcommand: str, args=None) -> None:
def run_subcommand(subcommand: str,
subcommand_args: List[str]=None,
stdin: Optional[IO]=None,
pwd: Optional[str]=None) -> None:
"""run a given ArchiveBox subcommand with the given list of args"""
module = import_module('.archivebox_{}'.format(subcommand), __package__)
module.main(args) # type: ignore
module.main(args=subcommand_args, stdin=stdin, pwd=pwd) # type: ignore
SUBCOMMANDS = list_subcommands()
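Note: run_subcommand() now takes keyword arguments and dynamically imports the matching archivebox_<subcommand> module, which must expose the required_attrs listed above. A hedged sketch of a dispatch call (the pwd value is illustrative):

    from archivebox.cli import run_subcommand

    run_subcommand(
        subcommand='help',
        subcommand_args=[],
        stdin=None,
        pwd='/path/to/archive',   # normally OUTPUT_DIR
    )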

View file

@ -5,19 +5,17 @@ __package__ = 'archivebox.cli'
__command__ = 'archivebox'
__description__ = 'ArchiveBox: The self-hosted internet archive.'
import os
import sys
import argparse
from typing import Optional, List, IO
from . import list_subcommands, run_subcommand
from ..legacy.config import OUTPUT_DIR
from ..config import OUTPUT_DIR
def parse_args(args=None):
args = sys.argv[1:] if args is None else args
def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
subcommands = list_subcommands()
parser = argparse.ArgumentParser(
prog=__command__,
description=__description__,
@ -43,54 +41,24 @@ def parse_args(args=None):
default=None,
)
parser.add_argument(
"args",
"subcommand_args",
help="Arguments for the subcommand",
nargs=argparse.REMAINDER,
)
command = parser.parse_args(args)
command = parser.parse_args(args or ())
if command.help:
if command.help or command.subcommand is None:
command.subcommand = 'help'
if command.version:
command.subcommand = 'version'
# print('--------------------------------------------')
# print('Command: ', sys.argv[0])
# print('Subcommand: ', command.subcommand)
# print('Args to pass:', args[1:])
# print('--------------------------------------------')
run_subcommand(
subcommand=command.subcommand,
subcommand_args=command.subcommand_args,
stdin=stdin,
pwd=pwd or OUTPUT_DIR,
)
return command.subcommand, command.args
def print_import_tutorial():
print('Welcome to ArchiveBox!')
print()
print('To import an existing archive (from a previous version of ArchiveBox):')
print(' 1. cd into your data dir OUTPUT_DIR (usually ArchiveBox/output) and run:')
print(' 2. archivebox init')
print()
print('To start a new archive:')
print(' 1. Create an empty directory, then cd into it and run:')
print(' 2. archivebox init')
print()
print('For more information, see the migration docs here:')
print(' https://github.com/pirate/ArchiveBox/wiki/Migration')
def main(args=None):
subcommand, subcommand_args = parse_args(args)
existing_index = os.path.exists(os.path.join(OUTPUT_DIR, 'index.json'))
if subcommand is None:
if existing_index:
run_subcommand('help', subcommand_args)
else:
print_import_tutorial()
raise SystemExit(0)
run_subcommand(subcommand, subcommand_args)
if __name__ == '__main__':
main()
main(args=sys.argv[1:], stdin=sys.stdin)

View file

@ -7,90 +7,75 @@ __description__ = 'Add a new URL or list of URLs to your archive'
import sys
import argparse
from typing import List, Optional
from typing import List, Optional, IO
from ..legacy.config import stderr, check_dependencies, check_data_folder
from ..legacy.util import (
handle_stdin_import,
handle_file_import,
)
from ..legacy.main import update_archive_data
from ..main import add
from ..util import SmartFormatter, accept_stdin
from ..config import OUTPUT_DIR, ONLY_NEW
def main(args: List[str]=None, stdin: Optional[str]=None) -> None:
check_data_folder()
args = sys.argv[1:] if args is None else args
def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
parser = argparse.ArgumentParser(
prog=__command__,
description=__description__,
add_help=True,
formatter_class=SmartFormatter,
)
# parser.add_argument(
# '--depth', #'-d',
# type=int,
# help='Recursively archive all linked pages up to this many hops away',
# default=0,
# )
parser.add_argument(
'--only-new', #'-n',
'--update-all', #'-n',
action='store_true',
help="Don't attempt to retry previously skipped/failed links when updating",
default=not ONLY_NEW,
help="Also retry previously skipped/failed links when adding new links",
)
parser.add_argument(
'--index-only', #'-o',
action='store_true',
help="Add the links to the main index without archiving them",
)
# parser.add_argument(
# '--mirror', #'-m',
# action='store_true',
# help='Archive an entire site (finding all linked pages below it on the same domain)',
# )
# parser.add_argument(
# '--crawler', #'-r',
# choices=('depth_first', 'breadth_first'),
# help='Controls which crawler to use in order to find outlinks in a given page',
# default=None,
# )
parser.add_argument(
'url',
'import_path',
nargs='?',
type=str,
default=None,
help='URL of page to archive (or path to local file)'
help=(
'URL or path to local file containing a list of links to import. e.g.:\n'
' https://getpocket.com/users/USERNAME/feed/all\n'
' https://example.com/some/rss/feed.xml\n'
' ~/Downloads/firefox_bookmarks_export.html\n'
' ~/Desktop/sites_list.csv\n'
)
)
command = parser.parse_args(args)
check_dependencies()
### Handle ingesting urls piped in through stdin
# (.e.g if user does cat example_urls.txt | archivebox add)
import_path = None
if stdin or not sys.stdin.isatty():
stdin_raw_text = stdin or sys.stdin.read()
if stdin_raw_text and command.url:
stderr(
'[X] You should pass either a path as an argument, '
'or pass a list of links via stdin, but not both.\n'
)
raise SystemExit(1)
import_path = handle_stdin_import(stdin_raw_text)
### Handle ingesting url from a remote file/feed
# (e.g. if an RSS feed URL is used as the import path)
elif command.url:
import_path = handle_file_import(command.url)
update_archive_data(
import_path=import_path,
resume=None,
only_new=command.only_new,
command = parser.parse_args(args or ())
import_str = accept_stdin(stdin)
add(
import_str=import_str,
import_path=command.import_path,
update_all=command.update_all,
index_only=command.index_only,
out_dir=pwd or OUTPUT_DIR,
)
if __name__ == '__main__':
main()
main(args=sys.argv[1:], stdin=sys.stdin)
# TODO: Implement these
#
# parser.add_argument(
# '--depth', #'-d',
# type=int,
# help='Recursively archive all linked pages up to this many hops away',
# default=0,
# )
# parser.add_argument(
# '--mirror', #'-m',
# action='store_true',
# help='Archive an entire site (finding all linked pages below it on the same domain)',
# )
# parser.add_argument(
# '--crawler', #'-r',
# choices=('depth_first', 'breadth_first'),
# help='Controls which crawler to use in order to find outlinks in a given page',
# default=None,
# )
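Note: the add subcommand is now a thin wrapper around archivebox.main.add(). A hedged sketch of the equivalent direct call, mirroring the keyword arguments the wrapper passes (URL and directory are illustrative):

    from archivebox.main import add

    add(
        import_str=None,                             # text that would have arrived via stdin
        import_path='https://example.com/feed.xml',  # or a local bookmarks/CSV export
        update_all=False,
        index_only=False,
        out_dir='/path/to/archive',
    )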

View file

@ -7,28 +7,14 @@ __description__ = 'Get and set your ArchiveBox project configuration values'
import sys
import argparse
from typing import Optional, List
from typing import Optional, List, IO
from ..legacy.util import SmartFormatter
from ..legacy.config import (
check_data_folder,
OUTPUT_DIR,
load_all_config,
write_config_file,
CONFIG,
CONFIG_FILE,
USER_CONFIG,
ConfigDict,
stderr,
get_real_name,
)
from ..main import config
from ..util import SmartFormatter, accept_stdin
from ..config import OUTPUT_DIR
def main(args: List[str]=None, stdin: Optional[str]=None) -> None:
check_data_folder()
args = sys.argv[1:] if args is None else args
def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
parser = argparse.ArgumentParser(
prog=__command__,
description=__description__,
@ -57,102 +43,18 @@ def main(args: List[str]=None, stdin: Optional[str]=None) -> None:
type=str,
help='KEY or KEY=VALUE formatted config values to get or set',
)
command = parser.parse_args(args)
command = parser.parse_args(args or ())
config_options_str = accept_stdin(stdin)
if stdin or not sys.stdin.isatty():
stdin_raw_text = stdin or sys.stdin.read()
if stdin_raw_text and command.config_options:
stderr(
'[X] You should either pass config values as an arguments '
'or via stdin, but not both.\n',
color='red',
)
raise SystemExit(1)
config_options = stdin_raw_text.split('\n')
else:
config_options = command.config_options
no_args = not (command.get or command.set or command.reset or command.config_options)
matching_config: ConfigDict = {}
if command.get or no_args:
if config_options:
config_options = [get_real_name(key) for key in config_options]
matching_config = {key: CONFIG[key] for key in config_options if key in CONFIG}
failed_config = [key for key in config_options if key not in CONFIG]
if failed_config:
stderr()
stderr('[X] These options failed to get', color='red')
stderr(' {}'.format('\n '.join(config_options)))
raise SystemExit(1)
else:
matching_config = CONFIG
print(printable_config(matching_config))
raise SystemExit(not matching_config)
elif command.set:
new_config = {}
failed_options = []
for line in config_options:
if line.startswith('#') or not line.strip():
continue
if '=' not in line:
stderr('[X] Config KEY=VALUE must have an = sign in it', color='red')
stderr(f' {line}')
raise SystemExit(2)
raw_key, val = line.split('=')
raw_key = raw_key.upper().strip()
key = get_real_name(raw_key)
if key != raw_key:
stderr(f'[i] Note: The config option {raw_key} has been renamed to {key}, please use the new name going forwards.', color='lightyellow')
if key in CONFIG:
new_config[key] = val.strip()
else:
failed_options.append(line)
if new_config:
before = CONFIG
matching_config = write_config_file(new_config, out_dir=OUTPUT_DIR)
after = load_all_config()
print(printable_config(matching_config))
side_effect_changes: ConfigDict = {}
for key, val in after.items():
if key in USER_CONFIG and (before[key] != after[key]) and (key not in matching_config):
side_effect_changes[key] = after[key]
if side_effect_changes:
stderr()
stderr('[i] Note: This change also affected these other options that depended on it:', color='lightyellow')
print(' {}'.format(printable_config(side_effect_changes, prefix=' ')))
if failed_options:
stderr()
stderr('[X] These options failed to set:', color='red')
stderr(' {}'.format('\n '.join(failed_options)))
raise SystemExit(bool(failed_options))
elif command.reset:
stderr('[X] This command is not implemented yet.', color='red')
stderr(' Please manually remove the relevant lines from your config file:')
stderr(f' {CONFIG_FILE}')
raise SystemExit(2)
else:
stderr('[X] You must pass either --get or --set, or no arguments to get the whole config.', color='red')
stderr(' archivebox config')
stderr(' archivebox config --get SOME_KEY')
stderr(' archivebox config --set SOME_KEY=SOME_VALUE')
raise SystemExit(2)
def printable_config(config: ConfigDict, prefix: str='') -> str:
return f'\n{prefix}'.join(
f'{key}={val}'
for key, val in config.items()
if not (isinstance(val, dict) or callable(val))
config(
config_options_str=config_options_str,
config_options=command.config_options,
get=command.get,
set=command.set,
reset=command.reset,
out_dir=pwd or OUTPUT_DIR,
)
if __name__ == '__main__':
main()
main(args=sys.argv[1:], stdin=sys.stdin)
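Note: the config subcommand follows the same thin-wrapper pattern around archivebox.main.config(). A hedged sketch of getting and setting a key directly (key and values are illustrative; omitted keywords are assumed to default to None/False):

    from archivebox.main import config

    # like `archivebox config --get TIMEOUT`
    config(config_options=['TIMEOUT'], get=True, out_dir='/path/to/archive')

    # like `archivebox config --set TIMEOUT=120`
    config(config_options=['TIMEOUT=120'], set=True, out_dir='/path/to/archive')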

View file

@ -7,52 +7,24 @@ __description__ = 'Print the ArchiveBox help message and usage'
import sys
import argparse
from ..legacy.util import reject_stdin
from ..legacy.config import ANSI
from . import list_subcommands
from typing import Optional, List, IO
from ..main import help
from ..util import reject_stdin
from ..config import OUTPUT_DIR
def main(args=None):
args = sys.argv[1:] if args is None else args
def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
parser = argparse.ArgumentParser(
prog=__command__,
description=__description__,
add_help=True,
)
parser.parse_args(args)
reject_stdin(__command__)
parser.parse_args(args or ())
reject_stdin(__command__, stdin)
COMMANDS_HELP_TEXT = '\n '.join(
f'{cmd.ljust(20)} {summary}'
for cmd, summary in list_subcommands().items()
)
print('''{green}ArchiveBox: The self-hosted internet archive.{reset}
{lightblue}Usage:{reset}
archivebox [command] [--help] [--version] [...args]
{lightblue}Commands:{reset}
{}
{lightblue}Example Use:{reset}
mkdir my-archive; cd my-archive/
archivebox init
archivebox info
archivebox add https://example.com/some/page
archivebox add --depth=1 ~/Downloads/bookmarks_export.html
archivebox list --sort=timestamp --csv=timestamp,url,is_archived
archivebox schedule --every=week https://example.com/some/feed.rss
archivebox update --resume=15109948213.123
{lightblue}Documentation:{reset}
https://github.com/pirate/ArchiveBox/wiki
'''.format(COMMANDS_HELP_TEXT, **ANSI))
help(out_dir=pwd or OUTPUT_DIR)
if __name__ == '__main__':
main()
main(args=sys.argv[1:], stdin=sys.stdin)

View file

@ -7,25 +7,24 @@ __description__ = 'Print out some info and statistics about the archive collection'
import sys
import argparse
from ..legacy.config import check_data_folder
from ..legacy.util import reject_stdin
from ..legacy.main import info
from typing import Optional, List, IO
from ..main import info
from ..config import OUTPUT_DIR
from ..util import reject_stdin
def main(args=None):
check_data_folder()
args = sys.argv[1:] if args is None else args
def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
parser = argparse.ArgumentParser(
prog=__command__,
description=__description__,
add_help=True,
)
parser.parse_args(args)
reject_stdin(__command__)
parser.parse_args(args or ())
reject_stdin(__command__, stdin)
info(out_dir=pwd or OUTPUT_DIR)
info()
if __name__ == '__main__':
main()
main(args=sys.argv[1:], stdin=sys.stdin)

View file

@ -7,23 +7,24 @@ __description__ = 'Initialize a new ArchiveBox collection in the current directo
import sys
import argparse
from ..legacy.util import reject_stdin
from ..legacy.main import init
from typing import Optional, List, IO
from ..main import init
from ..util import reject_stdin
from ..config import OUTPUT_DIR
def main(args=None):
args = sys.argv[1:] if args is None else args
def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
parser = argparse.ArgumentParser(
prog=__command__,
description=__description__,
add_help=True,
)
parser.parse_args(args)
reject_stdin(__command__)
parser.parse_args(args or ())
reject_stdin(__command__, stdin)
init()
init(out_dir=pwd or OUTPUT_DIR)
if __name__ == '__main__':
main()
main(args=sys.argv[1:], stdin=sys.stdin)

View file

@ -2,15 +2,17 @@
__package__ = 'archivebox.cli'
__command__ = 'archivebox list'
__description__ = 'List all the URLs currently in the archive.'
__description__ = 'List, filter, and export information about archive entries'
import sys
import argparse
from ..legacy.util import SmartFormatter, reject_stdin, to_json, to_csv
from ..legacy.config import check_data_folder, OUTPUT_DIR
from ..legacy.main import (
list_archive_data,
from typing import Optional, List, IO
from ..main import list_all
from ..util import SmartFormatter, accept_stdin
from ..config import OUTPUT_DIR
from ..index import (
get_indexed_folders,
get_archived_folders,
get_unarchived_folders,
@ -23,11 +25,7 @@ from ..legacy.main import (
get_unrecognized_folders,
)
def main(args=None):
check_data_folder()
args = sys.argv[1:] if args is None else args
def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
parser = argparse.ArgumentParser(
prog=__command__,
description=__description__,
@ -93,57 +91,27 @@ def main(args=None):
help='Type of pattern matching to use when filtering URLs',
)
parser.add_argument(
'patterns',
'filter_patterns',
nargs='*',
type=str,
default=None,
help='List only URLs matching these filter patterns.'
)
command = parser.parse_args(args)
reject_stdin(__command__)
command = parser.parse_args(args or ())
filter_patterns_str = accept_stdin(stdin)
links = list_archive_data(
filter_patterns=command.patterns,
list_all(
filter_patterns_str=filter_patterns_str,
filter_patterns=command.filter_patterns,
filter_type=command.filter_type,
before=command.before,
status=command.status,
after=command.after,
before=command.before,
sort=command.sort,
csv=command.csv,
json=command.json,
out_dir=pwd or OUTPUT_DIR,
)
if command.sort:
links = sorted(links, key=lambda link: getattr(link, command.sort))
links = list(links)
if command.status == 'indexed':
folders = get_indexed_folders(links, out_dir=OUTPUT_DIR)
elif command.status == 'archived':
folders = get_archived_folders(links, out_dir=OUTPUT_DIR)
elif command.status == 'unarchived':
folders = get_unarchived_folders(links, out_dir=OUTPUT_DIR)
elif command.status == 'present':
folders = get_present_folders(links, out_dir=OUTPUT_DIR)
elif command.status == 'valid':
folders = get_valid_folders(links, out_dir=OUTPUT_DIR)
elif command.status == 'invalid':
folders = get_invalid_folders(links, out_dir=OUTPUT_DIR)
elif command.status == 'duplicate':
folders = get_duplicate_folders(links, out_dir=OUTPUT_DIR)
elif command.status == 'orphaned':
folders = get_orphaned_folders(links, out_dir=OUTPUT_DIR)
elif command.status == 'corrupted':
folders = get_corrupted_folders(links, out_dir=OUTPUT_DIR)
elif command.status == 'unrecognized':
folders = get_unrecognized_folders(links, out_dir=OUTPUT_DIR)
if command.csv:
print(to_csv(folders.values(), csv_cols=command.csv.split(','), header=True))
elif command.json:
print(to_json(folders.values(), indent=4, sort_keys=True))
else:
print('\n'.join(f'{folder} {link}' for folder, link in folders.items()))
raise SystemExit(not folders)
if __name__ == '__main__':
main()
main(args=sys.argv[1:], stdin=sys.stdin)
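Note: list_all() now owns the filtering, sorting, and CSV/JSON export logic that used to live in this file. A hedged sketch of the equivalent direct call (pattern and directory are illustrative; omitted keywords are assumed to default to None):

    from archivebox.main import list_all

    list_all(
        filter_patterns=['example.com'],
        filter_type='substring',
        status='archived',
        after=None,
        before=None,
        sort='timestamp',
        csv='timestamp,url,is_archived',
        json=None,
        out_dir='/path/to/archive',
    )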

View file

@ -6,24 +6,18 @@ __description__ = 'Run an ArchiveBox Django management command'
import sys
from ..legacy.config import OUTPUT_DIR, setup_django, check_data_folder
from typing import Optional, List, IO
from ..main import manage
from ..config import OUTPUT_DIR
def main(args=None):
check_data_folder()
setup_django(OUTPUT_DIR)
from django.core.management import execute_from_command_line
args = sys.argv if args is None else ['archivebox', *args]
args[0] = f'{sys.argv[0]} manage'
if args[1:] == []:
args.append('help')
execute_from_command_line(args)
def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
manage(
args=args,
out_dir=pwd or OUTPUT_DIR,
)
if __name__ == '__main__':
main()
main(args=sys.argv[1:], stdin=sys.stdin)

View file

@ -7,17 +7,14 @@ __description__ = 'Remove the specified URLs from the archive.'
import sys
import argparse
from typing import Optional, List, IO
from ..legacy.config import check_data_folder
from ..legacy.util import reject_stdin
from ..legacy.main import remove_archive_links
from ..main import remove
from ..util import accept_stdin
from ..config import OUTPUT_DIR
def main(args=None):
check_data_folder()
args = sys.argv[1:] if args is None else args
def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
parser = argparse.ArgumentParser(
prog=__command__,
description=__description__,
@ -56,33 +53,25 @@ def main(args=None):
help='Type of pattern matching to use when filtering URLs',
)
parser.add_argument(
'pattern',
'filter_patterns',
nargs='*',
type=str,
default=None,
help='URLs matching this filter pattern will be removed from the index.'
)
command = parser.parse_args(args)
command = parser.parse_args(args or ())
filter_str = accept_stdin(stdin)
if not sys.stdin.isatty():
stdin_raw_text = sys.stdin.read()
if stdin_raw_text and command.url:
print(
'[X] You should pass either a pattern as an argument, '
'or pass a list of patterns via stdin, but not both.\n'
)
raise SystemExit(1)
patterns = [pattern.strip() for pattern in stdin_raw_text.split('\n')]
else:
patterns = command.pattern
remove_archive_links(
filter_patterns=patterns, filter_type=command.filter_type,
before=command.before, after=command.after,
yes=command.yes, delete=command.delete,
remove(
filter_str=filter_str,
filter_patterns=command.filter_patterns,
filter_type=command.filter_type,
before=command.before,
after=command.after,
yes=command.yes,
delete=command.delete,
out_dir=pwd or OUTPUT_DIR,
)
if __name__ == '__main__':
main()
main(args=sys.argv[1:], stdin=sys.stdin)

View file

@ -4,34 +4,17 @@ __package__ = 'archivebox.cli'
__command__ = 'archivebox schedule'
__description__ = 'Set ArchiveBox to regularly import URLs at specific times using cron'
import os
import sys
import argparse
from datetime import datetime
from crontab import CronTab, CronSlices
from typing import Optional, List, IO
from ..main import schedule
from ..util import reject_stdin
from ..config import OUTPUT_DIR
from ..legacy.util import reject_stdin
from ..legacy.config import (
OUTPUT_DIR,
LOGS_DIR,
ARCHIVEBOX_BINARY,
USER,
ANSI,
stderr,
check_data_folder,
)
CRON_COMMENT = 'archivebox_schedule'
def main(args=None):
check_data_folder()
args = sys.argv[1:] if args is None else args
def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
parser = argparse.ArgumentParser(
prog=__command__,
description=__description__,
@ -57,7 +40,7 @@ def main(args=None):
group.add_argument(
'--clear', # '-c'
action='store_true',
help=("Stop all ArchiveBox scheduled runs, clear it completely from cron"),
help=("Stop all ArchiveBox scheduled runs (remove cron jobs)"),
)
group.add_argument(
'--show', # '-s'
@ -67,13 +50,14 @@ def main(args=None):
group.add_argument(
'--foreground', '-f',
action='store_true',
help=("Launch ArchiveBox as a long-running foreground task "
help=("Launch ArchiveBox scheduler as a long-running foreground task "
"instead of using cron."),
)
group.add_argument(
'--run-all', # '-a',
action='store_true',
help='Run all the scheduled jobs once immediately, independent of their configured schedules',
help=("Run all the scheduled jobs once immediately, independent of "
"their configured schedules, can be used together with --foreground"),
)
parser.add_argument(
'import_path',
@ -83,115 +67,21 @@ def main(args=None):
help=("Check this path and import any new links on every run "
"(can be either local file or remote URL)"),
)
command = parser.parse_args(args)
reject_stdin(__command__)
command = parser.parse_args(args or ())
reject_stdin(__command__, stdin)
os.makedirs(LOGS_DIR, exist_ok=True)
cron = CronTab(user=True)
cron = dedupe_jobs(cron)
existing_jobs = list(cron.find_comment(CRON_COMMENT))
if command.foreground or command.run_all:
if command.import_path or (not existing_jobs):
stderr('{red}[X] You must schedule some jobs first before running in foreground mode.{reset}'.format(**ANSI))
stderr(' archivebox schedule --every=hour https://example.com/some/rss/feed.xml')
raise SystemExit(1)
print('{green}[*] Running {} ArchiveBox jobs in foreground task scheduler...{reset}'.format(len(existing_jobs), **ANSI))
if command.run_all:
try:
for job in existing_jobs:
sys.stdout.write(f' > {job.command}')
sys.stdout.flush()
job.run()
sys.stdout.write(f'\r{job.command}\n')
except KeyboardInterrupt:
print('\n{green}[√] Stopped.{reset}'.format(**ANSI))
raise SystemExit(1)
if command.foreground:
try:
for result in cron.run_scheduler():
print(result)
except KeyboardInterrupt:
print('\n{green}[√] Stopped.{reset}'.format(**ANSI))
raise SystemExit(1)
elif command.show:
if existing_jobs:
print('\n'.join(str(cmd) for cmd in existing_jobs))
else:
stderr('{red}[X] There are no ArchiveBox cron jobs scheduled for your user ({}).{reset}'.format(USER, **ANSI))
stderr(' To schedule a new job, run:')
stderr(' archivebox schedule --every=[timeperiod] https://example.com/some/rss/feed.xml')
raise SystemExit(0)
elif command.clear:
print(cron.remove_all(comment=CRON_COMMENT))
cron.write()
raise SystemExit(0)
elif command.every:
quoted = lambda s: f'"{s}"' if s and ' ' in s else s
cmd = [
'cd',
quoted(OUTPUT_DIR),
'&&',
quoted(ARCHIVEBOX_BINARY),
*(('add', f'"{command.import_path}"',) if command.import_path else ('update',)),
'2>&1',
'>',
quoted(os.path.join(LOGS_DIR, 'archivebox.log')),
]
new_job = cron.new(command=' '.join(cmd), comment=CRON_COMMENT)
if command.every in ('minute', 'hour', 'day', 'week', 'month', 'year'):
set_every = getattr(new_job.every(), command.every)
set_every()
elif CronSlices.is_valid(command.every):
new_job.setall(command.every)
else:
stderr('{red}[X] Got invalid timeperiod for cron task.{reset}'.format(**ANSI))
stderr(' It must be one of minute/hour/day/week/month')
stderr(' or a quoted cron-format schedule like:')
stderr(' archivebox init --every=day https://example.com/some/rss/feed.xml')
stderr(' archivebox init --every="0/5 * * * *" https://example.com/some/rss/feed.xml')
raise SystemExit(1)
cron = dedupe_jobs(cron)
cron.write()
total_runs = sum(j.frequency_per_year() for j in cron)
existing_jobs = list(cron.find_comment(CRON_COMMENT))
print()
print('{green}[√] Scheduled new ArchiveBox cron job for user: {} ({} jobs are active).{reset}'.format(USER, len(existing_jobs), **ANSI))
print('\n'.join(f' > {cmd}' if str(cmd) == str(new_job) else f' {cmd}' for cmd in existing_jobs))
if total_runs > 60 and not command.quiet:
stderr()
stderr('{lightyellow}[!] With the current cron config, ArchiveBox is estimated to run >{} times per year.{reset}'.format(total_runs, **ANSI))
stderr(f' Congrats on being an enthusiastic internet archiver! 👌')
stderr()
stderr(' Make sure you have enough storage space available to hold all the data.')
stderr(' Using a compressed/deduped filesystem like ZFS is recommended if you plan on archiving a lot.')
raise SystemExit(0)
def dedupe_jobs(cron: CronTab) -> CronTab:
deduped = set()
for job in list(cron):
unique_tuple = (str(job.slices), job.command)
if unique_tuple not in deduped:
deduped.add(unique_tuple)
cron.remove(job)
for schedule, command in deduped:
job = cron.new(command=command, comment=CRON_COMMENT)
job.setall(schedule)
job.enable()
return cron
schedule(
add=command.add,
show=command.show,
clear=command.clear,
foreground=command.foreground,
run_all=command.run_all,
quiet=command.quiet,
every=command.every,
import_path=command.import_path,
out_dir=pwd or OUTPUT_DIR,
)
if __name__ == '__main__':
main()
main(args=sys.argv[1:], stdin=sys.stdin)
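Note: the crontab handling above moved into archivebox.main.schedule(), but the shape of the generated job is still visible in the removed cmd list. For reference, a daily job would have produced a crontab line roughly like the following (paths are illustrative, and the exact schedule field depends on python-crontab's rendering):

    0 0 * * * cd "/path/to/archive" && /usr/local/bin/archivebox add "https://example.com/rss.xml" 2>&1 > /path/to/archive/logs/archivebox.log # archivebox_schedule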

View file

@ -7,15 +7,14 @@ __description__ = 'Run the ArchiveBox HTTP server'
import sys
import argparse
from ..legacy.config import setup_django, IS_TTY, OUTPUT_DIR, ANSI, check_data_folder
from ..legacy.util import reject_stdin
from typing import Optional, List, IO
from ..main import server
from ..util import reject_stdin
from ..config import OUTPUT_DIR
def main(args=None):
check_data_folder()
args = sys.argv[1:] if args is None else args
def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
parser = argparse.ArgumentParser(
prog=__command__,
description=__description__,
@ -33,26 +32,15 @@ def main(args=None):
action='store_true',
help='Enable auto-reloading when code or templates change',
)
command = parser.parse_args(args)
reject_stdin(__command__)
command = parser.parse_args(args or ())
reject_stdin(__command__, stdin)
setup_django(OUTPUT_DIR)
from django.core.management import call_command
from django.contrib.auth.models import User
if IS_TTY and not User.objects.filter(is_superuser=True).exists():
print('{lightyellow}[!] No admin users exist yet, you will not be able to edit links in the UI.{reset}'.format(**ANSI))
print()
print(' To create an admin user, run:')
print(' archivebox manage createsuperuser')
print()
print('{green}[+] Starting ArchiveBox webserver...{reset}'.format(**ANSI))
if not command.reload:
command.runserver_args.append('--noreload')
call_command("runserver", *command.runserver_args)
server(
runserver_args=command.runserver_args,
reload=command.reload,
out_dir=pwd or OUTPUT_DIR,
)
if __name__ == '__main__':
main()
main(args=sys.argv[1:], stdin=sys.stdin)

View file

@ -7,27 +7,26 @@ __description__ = 'Enter an interactive ArchiveBox Django shell'
import sys
import argparse
from ..legacy.config import setup_django, OUTPUT_DIR, check_data_folder
from ..legacy.util import reject_stdin
from typing import Optional, List, IO
from ..main import shell
from ..config import OUTPUT_DIR
from ..util import reject_stdin
def main(args=None):
check_data_folder()
args = sys.argv[1:] if args is None else args
def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
parser = argparse.ArgumentParser(
prog=__command__,
description=__description__,
add_help=True,
)
parser.parse_args(args)
reject_stdin(__command__)
parser.parse_args(args or ())
reject_stdin(__command__, stdin)
shell(
out_dir=pwd or OUTPUT_DIR,
)
setup_django(OUTPUT_DIR)
from django.core.management import call_command
call_command("shell_plus")
if __name__ == '__main__':
main()
main(args=sys.argv[1:], stdin=sys.stdin)

View file

@ -2,27 +2,36 @@
__package__ = 'archivebox.cli'
__command__ = 'archivebox update'
__description__ = 'Import any new links from subscriptions and retry any previously failed/skipped links.'
__description__ = 'Import any new links from subscriptions and retry any previously failed/skipped links'
import sys
import argparse
from typing import List
from typing import List, Optional, IO
from ..legacy.config import check_data_folder
from ..legacy.util import reject_stdin
from ..legacy.main import update_archive_data
from ..main import update
from ..util import SmartFormatter, accept_stdin
from ..config import OUTPUT_DIR
from ..index import (
get_indexed_folders,
get_archived_folders,
get_unarchived_folders,
get_present_folders,
get_valid_folders,
get_invalid_folders,
get_duplicate_folders,
get_orphaned_folders,
get_corrupted_folders,
get_unrecognized_folders,
)
def main(args: List[str]=None):
check_data_folder()
args = sys.argv[1:] if args is None else args
def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
parser = argparse.ArgumentParser(
prog=__command__,
description=__description__,
add_help=True,
formatter_class=SmartFormatter,
)
parser.add_argument(
'--only-new', #'-n',
@ -40,16 +49,75 @@ def main(args: List[str]=None):
help='Resume the update process from a given timestamp',
default=None,
)
parser.add_argument(
'--overwrite', #'-x',
action='store_true',
help='Ignore existing archived content and overwrite with new versions (DANGEROUS)',
)
parser.add_argument(
'--before', #'-b',
type=float,
help="Update only links bookmarked before the given timestamp.",
default=None,
)
parser.add_argument(
'--after', #'-a',
type=float,
help="Update only links bookmarked after the given timestamp.",
default=None,
)
parser.add_argument(
'--status',
type=str,
choices=('indexed', 'archived', 'unarchived', 'present', 'valid', 'invalid', 'duplicate', 'orphaned', 'corrupted', 'unrecognized'),
default='indexed',
help=(
'Update only links or data directories that have the given status\n'
f' indexed {get_indexed_folders.__doc__} (the default)\n'
f' archived {get_archived_folders.__doc__}\n'
f' unarchived {get_unarchived_folders.__doc__}\n'
'\n'
f' present {get_present_folders.__doc__}\n'
f' valid {get_valid_folders.__doc__}\n'
f' invalid {get_invalid_folders.__doc__}\n'
'\n'
f' duplicate {get_duplicate_folders.__doc__}\n'
f' orphaned {get_orphaned_folders.__doc__}\n'
f' corrupted {get_corrupted_folders.__doc__}\n'
f' unrecognized {get_unrecognized_folders.__doc__}\n'
)
)
parser.add_argument(
'--filter-type',
type=str,
choices=('exact', 'substring', 'domain', 'regex'),
default='exact',
help='Type of pattern matching to use when filtering URLs',
)
parser.add_argument(
'filter_patterns',
nargs='*',
type=str,
default=None,
help='List only URLs matching these filter patterns.'
)
command = parser.parse_args(args)
reject_stdin(__command__)
filter_patterns_str = accept_stdin(stdin)
update_archive_data(
import_path=None,
update(
resume=command.resume,
only_new=command.only_new,
index_only=command.index_only,
overwrite=command.overwrite,
filter_patterns_str=filter_patterns_str,
filter_patterns=command.filter_patterns,
filter_type=command.filter_type,
status=command.status,
after=command.after,
before=command.before,
out_dir=pwd or OUTPUT_DIR,
)
if __name__ == '__main__':
main()
main(args=sys.argv[1:], stdin=sys.stdin)

View file

@ -4,26 +4,17 @@ __package__ = 'archivebox.cli'
__command__ = 'archivebox version'
__description__ = 'Print the ArchiveBox version and dependency information'
import os
import re
import sys
import argparse
from ..legacy.util import reject_stdin, human_readable_size
from ..legacy.config import (
ANSI,
VERSION,
CODE_LOCATIONS,
CONFIG_LOCATIONS,
DATA_LOCATIONS,
DEPENDENCIES,
check_dependencies,
)
from typing import Optional, List, IO
from ..main import version
from ..util import reject_stdin
from ..config import OUTPUT_DIR
def main(args=None):
args = sys.argv[1:] if args is None else args
def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
parser = argparse.ArgumentParser(
prog=__command__,
description=__description__,
@ -34,92 +25,14 @@ def main(args=None):
action='store_true',
help='Only print ArchiveBox version number and nothing else.',
)
command = parser.parse_args(args)
reject_stdin(__command__)
command = parser.parse_args(args or ())
reject_stdin(__command__, stdin)
if command.quiet:
print(VERSION)
else:
print('ArchiveBox v{}'.format(VERSION))
print()
print('{white}[i] Dependency versions:{reset}'.format(**ANSI))
for name, dependency in DEPENDENCIES.items():
print_dependency_version(name, dependency)
print()
print('{white}[i] Code locations:{reset}'.format(**ANSI))
for name, folder in CODE_LOCATIONS.items():
print_folder_status(name, folder)
print()
print('{white}[i] Config locations:{reset}'.format(**ANSI))
for name, folder in CONFIG_LOCATIONS.items():
print_folder_status(name, folder)
print()
print('{white}[i] Data locations:{reset}'.format(**ANSI))
for name, folder in DATA_LOCATIONS.items():
print_folder_status(name, folder)
print()
check_dependencies()
def print_folder_status(name, folder):
if folder['enabled']:
if folder['is_valid']:
color, symbol, note = 'green', '', 'valid'
else:
color, symbol, note, num_files = 'red', 'X', 'invalid', '?'
else:
color, symbol, note, num_files = 'lightyellow', '-', 'disabled', '-'
if folder['path']:
if os.path.exists(folder['path']):
num_files = (
f'{len(os.listdir(folder["path"]))} files'
if os.path.isdir(folder['path']) else
human_readable_size(os.path.getsize(folder['path']))
)
else:
num_files = 'missing'
print(
ANSI[color],
symbol,
ANSI['reset'],
name.ljust(24),
(folder["path"] or '').ljust(70),
num_files.ljust(14),
ANSI[color],
note,
ANSI['reset'],
)
def print_dependency_version(name, dependency):
if dependency['enabled']:
if dependency['is_valid']:
color, symbol, note = 'green', '', 'valid'
version = 'v' + re.search(r'[\d\.]+', dependency['version'])[0]
else:
color, symbol, note, version = 'red', 'X', 'invalid', '?'
else:
color, symbol, note, version = 'lightyellow', '-', 'disabled', '-'
print(
ANSI[color],
symbol,
ANSI['reset'],
name.ljust(24),
(dependency["path"] or '').ljust(70),
version.ljust(14),
ANSI[color],
note,
ANSI['reset'],
version(
quiet=command.quiet,
out_dir=pwd or OUTPUT_DIR,
)
if __name__ == '__main__':
main()
main(args=sys.argv[1:], stdin=sys.stdin)

View file

@ -1,3 +1,5 @@
__package__ = 'archivebox.cli'
import os
import sys
@ -5,8 +7,8 @@ from datetime import datetime
from dataclasses import dataclass
from typing import Optional, List
from .schema import Link, ArchiveResult
from .config import ANSI, OUTPUT_DIR, IS_TTY
from ..index.schema import Link, ArchiveResult
from ..config import ANSI, OUTPUT_DIR, IS_TTY
@dataclass
@ -80,7 +82,7 @@ def log_indexing_finished(out_path: str):
### Archiving Stage
def log_archiving_started(num_links: int, resume: Optional[float]):
def log_archiving_started(num_links: int, resume: Optional[float]=None):
start_ts = datetime.now()
_LAST_RUN_STATS.archiving_start_ts = start_ts
print()
@ -92,7 +94,7 @@ def log_archiving_started(num_links: int, resume: Optional[float]):
**ANSI,
))
else:
print('{green}[▶] [{}] Updating content for {} pages in archive...{reset}'.format(
print('{green}[▶] [{}] Updating content for {} matching pages in archive...{reset}'.format(
start_ts.strftime('%Y-%m-%d %H:%M:%S'),
num_links,
**ANSI,
@ -213,18 +215,18 @@ def log_archive_method_finished(result: ArchiveResult):
print()
def log_list_started(filter_patterns: List[str], filter_type: str):
def log_list_started(filter_patterns: Optional[List[str]], filter_type: str):
print('{green}[*] Finding links in the archive index matching these {} patterns:{reset}'.format(
filter_type,
**ANSI,
))
print(' {}'.format(' '.join(filter_patterns)))
print(' {}'.format(' '.join(filter_patterns or ())))
def log_list_finished(links):
from .util import to_csv
from ..util import links_to_csv
print()
print('---------------------------------------------------------------------------------------------------')
print(to_csv(links, csv_cols=['timestamp', 'is_archived', 'num_outputs', 'url'], header=True, ljust=16, separator=' | '))
print(links_to_csv(links, csv_cols=['timestamp', 'is_archived', 'num_outputs', 'url'], header=True, ljust=16, separator=' | '))
print('---------------------------------------------------------------------------------------------------')
print()

View file

@ -1,6 +1,6 @@
#!/usr/bin/env python3
__package__ = 'archivebox'
__package__ = 'archivebox.cli'
import os
@ -29,15 +29,15 @@ TEST_CONFIG = {
OUTPUT_DIR = 'data.tests'
os.environ.update(TEST_CONFIG)
from .legacy.main import init
from .legacy.index import load_main_index
from .legacy.config import (
from ..main import init
from ..index import load_main_index
from ..config import (
SQL_INDEX_FILENAME,
JSON_INDEX_FILENAME,
HTML_INDEX_FILENAME,
)
from .cli import (
from . import (
archivebox_init,
archivebox_add,
archivebox_remove,

View file

@ -1,4 +1,4 @@
__package__ = 'archivebox.legacy'
__package__ = 'archivebox.config'
import os
import io
@ -13,7 +13,7 @@ from typing import Optional, Type, Tuple, Dict
from subprocess import run, PIPE, DEVNULL
from configparser import ConfigParser
from .config_stubs import (
from .stubs import (
SimpleConfigValueDict,
ConfigValue,
ConfigDict,
@ -40,7 +40,7 @@ CONFIG_DEFAULTS: Dict[str, ConfigDefaultDict] = {
'GENERAL_CONFIG': {
'OUTPUT_DIR': {'type': str, 'default': None},
'CONFIG_FILE': {'type': str, 'default': None},
'ONLY_NEW': {'type': bool, 'default': False},
'ONLY_NEW': {'type': bool, 'default': True},
'TIMEOUT': {'type': int, 'default': 60},
'MEDIA_TIMEOUT': {'type': int, 'default': 3600},
'OUTPUT_PERMISSIONS': {'type': str, 'default': '755'},
@ -122,8 +122,7 @@ ANSI = {k: '' for k in DEFAULT_CLI_COLORS.keys()}
VERSION_FILENAME = 'VERSION'
PYTHON_DIR_NAME = 'archivebox'
LEGACY_DIR_NAME = 'legacy'
TEMPLATES_DIR_NAME = 'templates'
TEMPLATES_DIR_NAME = 'themes'
ARCHIVE_DIR_NAME = 'archive'
SOURCES_DIR_NAME = 'sources'
@ -158,8 +157,7 @@ DERIVED_CONFIG_DEFAULTS: ConfigDefaultDict = {
'REPO_DIR': {'default': lambda c: os.path.abspath(os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', '..'))},
'PYTHON_DIR': {'default': lambda c: os.path.join(c['REPO_DIR'], PYTHON_DIR_NAME)},
'LEGACY_DIR': {'default': lambda c: os.path.join(c['PYTHON_DIR'], LEGACY_DIR_NAME)},
'TEMPLATES_DIR': {'default': lambda c: os.path.join(c['LEGACY_DIR'], TEMPLATES_DIR_NAME)},
'TEMPLATES_DIR': {'default': lambda c: os.path.join(c['PYTHON_DIR'], TEMPLATES_DIR_NAME, 'legacy')},
'OUTPUT_DIR': {'default': lambda c: os.path.abspath(os.path.expanduser(c['OUTPUT_DIR'])) if c['OUTPUT_DIR'] else os.path.abspath(os.curdir)},
'ARCHIVE_DIR': {'default': lambda c: os.path.join(c['OUTPUT_DIR'], ARCHIVE_DIR_NAME)},
@ -210,7 +208,7 @@ DERIVED_CONFIG_DEFAULTS: ConfigDefaultDict = {
'DEPENDENCIES': {'default': lambda c: get_dependency_info(c)},
'CODE_LOCATIONS': {'default': lambda c: get_code_locations(c)},
'CONFIG_LOCATIONS': {'default': lambda c: get_config_locations(c)},
'EXTERNAL_LOCATIONS': {'default': lambda c: get_external_locations(c)},
'DATA_LOCATIONS': {'default': lambda c: get_data_locations(c)},
'CHROME_OPTIONS': {'default': lambda c: get_chrome_info(c)},
}
@ -370,6 +368,7 @@ def load_config(defaults: ConfigDefaultDict,
stderr(' For config documentation and examples see:')
stderr(' https://github.com/pirate/ArchiveBox/wiki/Configuration')
stderr()
raise
raise SystemExit(2)
return extended_config
@ -492,18 +491,13 @@ def get_code_locations(config: ConfigDict) -> SimpleConfigValueDict:
'REPO_DIR': {
'path': os.path.abspath(config['REPO_DIR']),
'enabled': True,
'is_valid': os.path.exists(os.path.join(config['REPO_DIR'], '.github')),
'is_valid': os.path.exists(os.path.join(config['REPO_DIR'], 'archivebox')),
},
'PYTHON_DIR': {
'path': os.path.abspath(config['PYTHON_DIR']),
'enabled': True,
'is_valid': os.path.exists(os.path.join(config['PYTHON_DIR'], '__main__.py')),
},
'LEGACY_DIR': {
'path': os.path.abspath(config['LEGACY_DIR']),
'enabled': True,
'is_valid': os.path.exists(os.path.join(config['LEGACY_DIR'], 'util.py')),
},
'TEMPLATES_DIR': {
'path': os.path.abspath(config['TEMPLATES_DIR']),
'enabled': True,
@ -511,14 +505,9 @@ def get_code_locations(config: ConfigDict) -> SimpleConfigValueDict:
},
}
def get_config_locations(config: ConfigDict) -> ConfigValue:
def get_external_locations(config: ConfigDict) -> ConfigValue:
abspath = lambda path: None if path is None else os.path.abspath(path)
return {
'CONFIG_FILE': {
'path': abspath(config['CHROME_USER_DATA_DIR']),
'enabled': config['USE_CHROME'] and config['CHROME_USER_DATA_DIR'],
'is_valid': False if config['CHROME_USER_DATA_DIR'] is None else os.path.exists(os.path.join(config['CHROME_USER_DATA_DIR'], 'Default')),
},
'CHROME_USER_DATA_DIR': {
'path': abspath(config['CHROME_USER_DATA_DIR']),
'enabled': config['USE_CHROME'] and config['CHROME_USER_DATA_DIR'],
@ -553,11 +542,26 @@ def get_data_locations(config: ConfigDict) -> ConfigValue:
'enabled': True,
'is_valid': os.path.exists(config['ARCHIVE_DIR']),
},
'CONFIG_FILE': {
'path': os.path.abspath(config['CONFIG_FILE']),
'enabled': True,
'is_valid': os.path.exists(config['CONFIG_FILE']),
},
'SQL_INDEX': {
'path': os.path.abspath(os.path.join(config['OUTPUT_DIR'], SQL_INDEX_FILENAME)),
'enabled': True,
'is_valid': os.path.exists(os.path.join(config['OUTPUT_DIR'], SQL_INDEX_FILENAME)),
},
'JSON_INDEX': {
'path': os.path.abspath(os.path.join(config['OUTPUT_DIR'], JSON_INDEX_FILENAME)),
'enabled': True,
'is_valid': os.path.exists(os.path.join(config['OUTPUT_DIR'], JSON_INDEX_FILENAME)),
},
'HTML_INDEX': {
'path': os.path.abspath(os.path.join(config['OUTPUT_DIR'], HTML_INDEX_FILENAME)),
'enabled': True,
'is_valid': os.path.exists(os.path.join(config['OUTPUT_DIR'], HTML_INDEX_FILENAME)),
},
}
def get_dependency_info(config: ConfigDict) -> ConfigValue:
@ -731,7 +735,7 @@ def check_data_folder(out_dir: Optional[str]=None, config: ConfigDict=CONFIG) ->
json_index_exists = os.path.exists(os.path.join(output_dir, JSON_INDEX_FILENAME))
if not json_index_exists:
stderr('[X] No archive index was found in current directory.', color='red')
stderr('[X] No archive main index was found in current directory.', color='red')
stderr(f' {output_dir}')
stderr()
stderr(' Are you running archivebox in the right folder?')
@ -743,7 +747,7 @@ def check_data_folder(out_dir: Optional[str]=None, config: ConfigDict=CONFIG) ->
raise SystemExit(2)
sql_index_exists = os.path.exists(os.path.join(output_dir, SQL_INDEX_FILENAME))
from .storage.sql import list_migrations
from ..index.sql import list_migrations
pending_migrations = [name for status, name in list_migrations() if not status]
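Note: the new CONFIG_FILE/SQL_INDEX/JSON_INDEX/HTML_INDEX entries follow the same {path, enabled, is_valid} shape as the other location dicts. A hedged sketch of consuming them, assuming DATA_LOCATIONS is importable from the config module like the other derived values:

    from archivebox.config import DATA_LOCATIONS

    for name, location in DATA_LOCATIONS.items():
        if location['enabled'] and not location['is_valid']:
            print(f'[!] {name} is missing or invalid: {location["path"]}')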

View file

@ -17,6 +17,7 @@ class ConfigDict(BaseConfig, total=False):
SHOW_PROGRESS: bool
OUTPUT_DIR: str
CONFIG_FILE: str
ONLY_NEW: bool
TIMEOUT: int
MEDIA_TIMEOUT: int
@ -63,7 +64,6 @@ class ConfigDict(BaseConfig, total=False):
ANSI: Dict[str, str]
REPO_DIR: str
PYTHON_DIR: str
LEGACY_DIR: str
TEMPLATES_DIR: str
ARCHIVE_DIR: str
SOURCES_DIR: str

View file

@ -1,9 +1,7 @@
from datetime import datetime
from django.contrib import admin
from .models import Page
from core.models import Page
class PageAdmin(admin.ModelAdmin):
list_display = ('timestamp', 'short_url', 'title', 'is_archived', 'num_outputs', 'added', 'updated', 'url_hash')

View file

@ -4,8 +4,8 @@ import uuid
from django.db import models
from legacy.schema import Link
from legacy.util import parse_date
from ..util import parse_date
from ..index.schema import Link
class Page(models.Model):

View file

@ -2,8 +2,8 @@ from django.shortcuts import render
from django.views import View
from legacy.config import OUTPUT_DIR
from legacy.index import load_main_index, load_main_index_meta
from .index import load_main_index, load_main_index_meta
from .config import OUTPUT_DIR
class MainIndex(View):
@ -34,7 +34,7 @@ class AddLinks(View):
def post(self, request):
import_path = request.POST['url']
# TODO: add the links to the index here using archivebox.legacy.main.update_archive_data
# TODO: add the links to the index here using archivebox.main.add
print(f'Adding URL: {import_path}')
return render(template_name=self.template, request=request, context={})

View file

@ -1,4 +1,17 @@
print()
print('[i] Welcome to the ArchiveBox Shell! Example usage:')
print(' Page.objects.all()')
print(' User.objects.all()')
from cli import list_subcommands
from .config import ANSI
if __name__ == '__main__':
print('{green}# ArchiveBox Imports{reset}'.format(**ANSI))
# print('from archivebox.core.models import Page, User')
print('{green}from archivebox.cli import\narchivebox_{}{reset}'.format("\narchivebox_".join(list_subcommands().keys()), **ANSI))
print()
print('[i] Welcome to the ArchiveBox Shell! Example use:')
print(' print(Page.objects.filter(is_archived=True).count())')
print(' Page.objects.get(url="https://example.com").as_json()')
print(' Page.objects.get(url="https://example.com").as_json()')
print(' from archivebox.main import get_invalid_folders')

View file

@ -0,0 +1,105 @@
__package__ = 'archivebox.extractors'
import os
from typing import Optional
from datetime import datetime
from ..index.schema import Link
from ..index import (
load_link_details,
write_link_details,
patch_main_index,
)
from ..util import enforce_types
from ..cli.logging import (
log_link_archiving_started,
log_link_archiving_finished,
log_archive_method_started,
log_archive_method_finished,
)
from .title import should_save_title, save_title
from .favicon import should_save_favicon, save_favicon
from .wget import should_save_wget, save_wget
from .pdf import should_save_pdf, save_pdf
from .screenshot import should_save_screenshot, save_screenshot
from .dom import should_save_dom, save_dom
from .git import should_save_git, save_git
from .media import should_save_media, save_media
from .archive_org import should_save_archive_dot_org, save_archive_dot_org
@enforce_types
def archive_link(link: Link, overwrite: bool=False, out_dir: Optional[str]=None) -> Link:
"""download the DOM, PDF, and a screenshot into a folder named after the link's timestamp"""
ARCHIVE_METHODS = (
('title', should_save_title, save_title),
('favicon', should_save_favicon, save_favicon),
('wget', should_save_wget, save_wget),
('pdf', should_save_pdf, save_pdf),
('screenshot', should_save_screenshot, save_screenshot),
('dom', should_save_dom, save_dom),
('git', should_save_git, save_git),
('media', should_save_media, save_media),
('archive_org', should_save_archive_dot_org, save_archive_dot_org),
)
out_dir = out_dir or link.link_dir
try:
is_new = not os.path.exists(out_dir)
if is_new:
os.makedirs(out_dir)
link = load_link_details(link, out_dir=out_dir)
log_link_archiving_started(link, out_dir, is_new)
link = link.overwrite(updated=datetime.now())
stats = {'skipped': 0, 'succeeded': 0, 'failed': 0}
for method_name, should_run, method_function in ARCHIVE_METHODS:
try:
if method_name not in link.history:
link.history[method_name] = []
if should_run(link, out_dir) or overwrite:
log_archive_method_started(method_name)
result = method_function(link=link, out_dir=out_dir)
link.history[method_name].append(result)
stats[result.status] += 1
log_archive_method_finished(result)
else:
stats['skipped'] += 1
except Exception as e:
raise Exception('Exception in archive_methods.save_{}(Link(url={}))'.format(
method_name,
link.url,
)) from e
# print(' ', stats)
write_link_details(link, out_dir=link.link_dir)
patch_main_index(link)
# # If any changes were made, update the main links index json and html
# was_changed = stats['succeeded'] or stats['failed']
# if was_changed:
# patch_main_index(link)
log_link_archiving_finished(link, link.link_dir, is_new, stats)
except KeyboardInterrupt:
try:
write_link_details(link, out_dir=link.link_dir)
except:
pass
raise
except Exception as err:
print(' ! Failed to archive link: {}: {}'.format(err.__class__.__name__, err))
raise
return link
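Note: archive_link() runs each (should_save_*, save_*) pair in ARCHIVE_METHODS and appends an ArchiveResult to link.history per method. A hedged usage sketch, assuming load_main_index() yields Link objects as the tests in this commit suggest (directory is illustrative):

    from archivebox.index import load_main_index
    from archivebox.extractors import archive_link

    for link in load_main_index(out_dir='/path/to/archive'):
        archive_link(link, overwrite=False)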

View file

@ -0,0 +1,115 @@
__package__ = 'archivebox.extractors'
import os
from typing import Optional, List, Dict, Tuple
from collections import defaultdict
from ..index.schema import Link, ArchiveResult, ArchiveOutput
from ..util import (
enforce_types,
TimedProgress,
run,
PIPE,
DEVNULL,
is_static_file,
ArchiveError,
chmod_file,
)
from ..config import (
VERSION,
TIMEOUT,
SAVE_ARCHIVE_DOT_ORG,
CURL_BINARY,
CURL_VERSION,
CHECK_SSL_VALIDITY
)
@enforce_types
def should_save_archive_dot_org(link: Link, out_dir: Optional[str]=None) -> bool:
out_dir = out_dir or link.link_dir
if is_static_file(link.url):
return False
if os.path.exists(os.path.join(out_dir, 'archive.org.txt')):
# if open(path, 'r').read().strip() != 'None':
return False
return SAVE_ARCHIVE_DOT_ORG
@enforce_types
def save_archive_dot_org(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
"""submit site to archive.org for archiving via their service, save returned archive url"""
out_dir = out_dir or link.link_dir
output: ArchiveOutput = 'archive.org.txt'
archive_org_url = None
submit_url = 'https://web.archive.org/save/{}'.format(link.url)
cmd = [
CURL_BINARY,
'--location',
'--head',
'--user-agent', 'ArchiveBox/{} (+https://github.com/pirate/ArchiveBox/)'.format(VERSION), # be nice to the Archive.org people and show them where all this ArchiveBox traffic is coming from
'--max-time', str(timeout),
*([] if CHECK_SSL_VALIDITY else ['--insecure']),
submit_url,
]
status = 'succeeded'
timer = TimedProgress(timeout, prefix=' ')
try:
result = run(cmd, stdout=PIPE, stderr=DEVNULL, cwd=out_dir, timeout=timeout)
content_location, errors = parse_archive_dot_org_response(result.stdout)
if content_location:
archive_org_url = 'https://web.archive.org{}'.format(content_location[0])
elif len(errors) == 1 and 'RobotAccessControlException' in errors[0]:
archive_org_url = None
# raise ArchiveError('Archive.org denied by {}/robots.txt'.format(domain(link.url)))
elif errors:
raise ArchiveError(', '.join(errors))
else:
raise ArchiveError('Failed to find "content-location" URL header in Archive.org response.')
except Exception as err:
status = 'failed'
output = err
finally:
timer.end()
if output and not isinstance(output, Exception):
# instead of writing None when archive.org rejects the url write the
# url to resubmit it to archive.org. This is so when the user visits
# the URL in person, it will attempt to re-archive it, and it'll show the
# nicer error message explaining why the url was rejected if it fails.
archive_org_url = archive_org_url or submit_url
with open(os.path.join(out_dir, str(output)), 'w', encoding='utf-8') as f:
f.write(archive_org_url)
chmod_file('archive.org.txt', cwd=out_dir)
output = archive_org_url
return ArchiveResult(
cmd=cmd,
pwd=out_dir,
cmd_version=CURL_VERSION,
output=output,
status=status,
**timer.stats,
)
@enforce_types
def parse_archive_dot_org_response(response: bytes) -> Tuple[List[str], List[str]]:
# Parse archive.org response headers
headers: Dict[str, List[str]] = defaultdict(list)
# lowercase all the header names and store in dict
for header in response.splitlines():
if b':' not in header or not header.strip():
continue
name, val = header.decode().split(':', 1)
headers[name.lower().strip()].append(val.strip())
# Get successful archive url in "content-location" header or any errors
content_location = headers['content-location']
errors = headers['x-archive-wayback-runtime-error']
return content_location, errors
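Note: parse_archive_dot_org_response() just buckets the curl --head output by lowercased header name. A small hedged example with made-up response bytes (not a real archive.org capture):

    from archivebox.extractors.archive_org import parse_archive_dot_org_response

    sample = (
        b'HTTP/2 302\r\n'
        b'Content-Location: /web/20190427000000/https://example.com/\r\n'
    )
    content_location, errors = parse_archive_dot_org_response(sample)
    # content_location == ['/web/20190427000000/https://example.com/'], errors == []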

View file

@ -0,0 +1,73 @@
__package__ = 'archivebox.extractors'
import os
from typing import Optional
from ..index.schema import Link, ArchiveResult, ArchiveOutput
from ..util import (
enforce_types,
TimedProgress,
run,
PIPE,
is_static_file,
ArchiveError,
chrome_args,
chmod_file,
)
from ..config import (
TIMEOUT,
SAVE_DOM,
CHROME_VERSION,
)
@enforce_types
def should_save_dom(link: Link, out_dir: Optional[str]=None) -> bool:
out_dir = out_dir or link.link_dir
if is_static_file(link.url):
return False
if os.path.exists(os.path.join(out_dir, 'output.html')):
return False
return SAVE_DOM
@enforce_types
def save_dom(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
"""print HTML of site to file using chrome --dump-html"""
out_dir = out_dir or link.link_dir
output: ArchiveOutput = 'output.html'
output_path = os.path.join(out_dir, str(output))
cmd = [
*chrome_args(TIMEOUT=timeout),
'--dump-dom',
link.url
]
status = 'succeeded'
timer = TimedProgress(timeout, prefix=' ')
try:
with open(output_path, 'w+') as f:
result = run(cmd, stdout=f, stderr=PIPE, cwd=out_dir, timeout=timeout)
if result.returncode:
hints = result.stderr.decode()
raise ArchiveError('Failed to save DOM', hints)
chmod_file(output, cwd=out_dir)
except Exception as err:
status = 'failed'
output = err
finally:
timer.end()
return ArchiveResult(
cmd=cmd,
pwd=out_dir,
cmd_version=CHROME_VERSION,
output=output,
status=status,
**timer.stats,
)

View file

@ -0,0 +1,65 @@
__package__ = 'archivebox.extractors'
import os
from typing import Optional
from ..index.schema import Link, ArchiveResult, ArchiveOutput
from ..util import (
enforce_types,
TimedProgress,
domain,
run,
PIPE,
chmod_file,
)
from ..config import (
TIMEOUT,
SAVE_FAVICON,
CURL_BINARY,
CURL_VERSION,
CHECK_SSL_VALIDITY,
)
@enforce_types
def should_save_favicon(link: Link, out_dir: Optional[str]=None) -> bool:
out_dir = out_dir or link.link_dir
if os.path.exists(os.path.join(out_dir, 'favicon.ico')):
return False
return SAVE_FAVICON
@enforce_types
def save_favicon(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
"""download site favicon from google's favicon api"""
out_dir = out_dir or link.link_dir
output: ArchiveOutput = 'favicon.ico'
cmd = [
CURL_BINARY,
'--max-time', str(timeout),
'--location',
'--output', str(output),
*([] if CHECK_SSL_VALIDITY else ['--insecure']),
'https://www.google.com/s2/favicons?domain={}'.format(domain(link.url)),
]
status = 'succeeded'
timer = TimedProgress(timeout, prefix=' ')
try:
run(cmd, stdout=PIPE, stderr=PIPE, cwd=out_dir, timeout=timeout)
chmod_file(output, cwd=out_dir)
except Exception as err:
status = 'failed'
output = err
finally:
timer.end()
return ArchiveResult(
cmd=cmd,
pwd=out_dir,
cmd_version=CURL_VERSION,
output=output,
status=status,
**timer.stats,
)

View file

@ -0,0 +1,94 @@
__package__ = 'archivebox.extractors'
import os
from typing import Optional
from ..index.schema import Link, ArchiveResult, ArchiveOutput
from ..util import (
enforce_types,
TimedProgress,
run,
PIPE,
is_static_file,
ArchiveError,
chmod_file,
domain,
extension,
without_query,
without_fragment,
)
from ..config import (
TIMEOUT,
SAVE_GIT,
GIT_BINARY,
GIT_VERSION,
GIT_DOMAINS,
CHECK_SSL_VALIDITY
)
@enforce_types
def should_save_git(link: Link, out_dir: Optional[str]=None) -> bool:
out_dir = out_dir or link.link_dir
if is_static_file(link.url):
return False
if os.path.exists(os.path.join(out_dir, 'git')):
return False
is_clonable_url = (
(domain(link.url) in GIT_DOMAINS)
or (extension(link.url) == 'git')
)
if not is_clonable_url:
return False
return SAVE_GIT
@enforce_types
def save_git(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
"""download full site using git"""
out_dir = out_dir or link.link_dir
output: ArchiveOutput = 'git'
output_path = os.path.join(out_dir, str(output))
os.makedirs(output_path, exist_ok=True)
cmd = [
GIT_BINARY,
'clone',
'--mirror',
'--recursive',
*([] if CHECK_SSL_VALIDITY else ['-c', 'http.sslVerify=false']),
without_query(without_fragment(link.url)),
]
status = 'succeeded'
timer = TimedProgress(timeout, prefix=' ')
try:
result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=output_path, timeout=timeout + 1)
if result.returncode == 128:
# ignore failed re-download when the folder already exists
pass
elif result.returncode > 0:
hints = 'Got git response code: {}.'.format(result.returncode)
raise ArchiveError('Failed to save git clone', hints)
chmod_file(output, cwd=out_dir)
except Exception as err:
status = 'failed'
output = err
finally:
timer.end()
return ArchiveResult(
cmd=cmd,
pwd=out_dir,
cmd_version=GIT_VERSION,
output=output,
status=status,
**timer.stats,
)
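
The clone target is the link URL with its query string and fragment stripped, so tracking parameters and page anchors don't leak into the git remote. A small sketch of that cleanup, assuming without_query() and without_fragment() behave as their names suggest:

# sketch: URL cleanup applied before `git clone --mirror`
from archivebox.util import without_query, without_fragment   # assumed import path

url = 'https://github.com/user/repo.git?tab=readme#install'
print(without_query(without_fragment(url)))
# expected: https://github.com/user/repo.git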

View file

@ -0,0 +1,100 @@
__package__ = 'archivebox.extractors'
import os
from typing import Optional
from ..index.schema import Link, ArchiveResult, ArchiveOutput
from ..util import (
enforce_types,
TimedProgress,
run,
PIPE,
is_static_file,
ArchiveError,
chmod_file,
)
from ..config import (
MEDIA_TIMEOUT,
SAVE_MEDIA,
YOUTUBEDL_BINARY,
YOUTUBEDL_VERSION,
CHECK_SSL_VALIDITY
)
@enforce_types
def should_save_media(link: Link, out_dir: Optional[str]=None) -> bool:
out_dir = out_dir or link.link_dir
if is_static_file(link.url):
return False
if os.path.exists(os.path.join(out_dir, 'media')):
return False
return SAVE_MEDIA
@enforce_types
def save_media(link: Link, out_dir: Optional[str]=None, timeout: int=MEDIA_TIMEOUT) -> ArchiveResult:
"""Download playlists or individual video, audio, and subtitles using youtube-dl"""
out_dir = out_dir or link.link_dir
output: ArchiveOutput = 'media'
output_path = os.path.join(out_dir, str(output))
os.makedirs(output_path, exist_ok=True)
cmd = [
YOUTUBEDL_BINARY,
'--write-description',
'--write-info-json',
'--write-annotations',
'--yes-playlist',
'--write-thumbnail',
'--no-call-home',
'--no-check-certificate',
'--all-subs',
'--extract-audio',
'--keep-video',
'--ignore-errors',
'--geo-bypass',
'--audio-format', 'mp3',
'--audio-quality', '320K',
'--embed-thumbnail',
'--add-metadata',
*([] if CHECK_SSL_VALIDITY else ['--no-check-certificate']),
link.url,
]
status = 'succeeded'
timer = TimedProgress(timeout, prefix=' ')
try:
result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=output_path, timeout=timeout + 1)
chmod_file(output, cwd=out_dir)
if result.returncode:
if (b'ERROR: Unsupported URL' in result.stderr
or b'HTTP Error 404' in result.stderr
or b'HTTP Error 403' in result.stderr
or b'URL could be a direct video link' in result.stderr
or b'Unable to extract container ID' in result.stderr):
# These happen too frequently on non-media pages to warrant printing to console
pass
else:
hints = (
'Got youtube-dl response code: {}.'.format(result.returncode),
*result.stderr.decode().split('\n'),
)
raise ArchiveError('Failed to save media', hints)
except Exception as err:
status = 'failed'
output = err
finally:
timer.end()
return ArchiveResult(
cmd=cmd,
pwd=out_dir,
cmd_version=YOUTUBEDL_VERSION,
output=output,
status=status,
**timer.stats,
)
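
The stderr checks above exist because youtube-dl gets pointed at every non-static URL, so "no media here" failures are routine and not worth logging. The same whitelist written out as a standalone helper, purely for illustration (not code from this change):

# sketch: the benign-error check from save_media(), factored into a helper
BENIGN_YOUTUBEDL_ERRORS = (
    b'ERROR: Unsupported URL',
    b'HTTP Error 404',
    b'HTTP Error 403',
    b'URL could be a direct video link',
    b'Unable to extract container ID',
)

def is_benign_youtubedl_error(stderr: bytes) -> bool:
    # True for failures that just mean the page had no downloadable media
    return any(marker in stderr for marker in BENIGN_YOUTUBEDL_ERRORS)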

View file

@ -0,0 +1,72 @@
__package__ = 'archivebox.extractors'
import os
from typing import Optional
from ..index.schema import Link, ArchiveResult, ArchiveOutput
from ..util import (
enforce_types,
TimedProgress,
run,
PIPE,
is_static_file,
ArchiveError,
chrome_args,
chmod_file,
)
from ..config import (
TIMEOUT,
SAVE_PDF,
CHROME_VERSION,
)
@enforce_types
def should_save_pdf(link: Link, out_dir: Optional[str]=None) -> bool:
out_dir = out_dir or link.link_dir
if is_static_file(link.url):
return False
if os.path.exists(os.path.join(out_dir, 'output.pdf')):
return False
return SAVE_PDF
@enforce_types
def save_pdf(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
"""print PDF of site to file using chrome --headless"""
out_dir = out_dir or link.link_dir
output: ArchiveOutput = 'output.pdf'
cmd = [
*chrome_args(TIMEOUT=timeout),
'--print-to-pdf',
link.url,
]
status = 'succeeded'
timer = TimedProgress(timeout, prefix=' ')
try:
result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=out_dir, timeout=timeout)
if result.returncode:
hints = (result.stderr or result.stdout).decode()
raise ArchiveError('Failed to save PDF', hints)
chmod_file('output.pdf', cwd=out_dir)
except Exception as err:
status = 'failed'
output = err
finally:
timer.end()
return ArchiveResult(
cmd=cmd,
pwd=out_dir,
cmd_version=CHROME_VERSION,
output=output,
status=status,
**timer.stats,
)

View file

@ -0,0 +1,71 @@
__package__ = 'archivebox.extractors'
import os
from typing import Optional
from ..index.schema import Link, ArchiveResult, ArchiveOutput
from ..util import (
enforce_types,
TimedProgress,
run,
PIPE,
is_static_file,
ArchiveError,
chrome_args,
chmod_file,
)
from ..config import (
TIMEOUT,
SAVE_SCREENSHOT,
CHROME_VERSION,
)
@enforce_types
def should_save_screenshot(link: Link, out_dir: Optional[str]=None) -> bool:
out_dir = out_dir or link.link_dir
if is_static_file(link.url):
return False
if os.path.exists(os.path.join(out_dir, 'screenshot.png')):
return False
return SAVE_SCREENSHOT
@enforce_types
def save_screenshot(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
"""take screenshot of site using chrome --headless"""
out_dir = out_dir or link.link_dir
output: ArchiveOutput = 'screenshot.png'
cmd = [
*chrome_args(TIMEOUT=timeout),
'--screenshot',
link.url,
]
status = 'succeeded'
timer = TimedProgress(timeout, prefix=' ')
try:
result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=out_dir, timeout=timeout)
if result.returncode:
hints = (result.stderr or result.stdout).decode()
raise ArchiveError('Failed to save screenshot', hints)
chmod_file(output, cwd=out_dir)
except Exception as err:
status = 'failed'
output = err
finally:
timer.end()
return ArchiveResult(
cmd=cmd,
pwd=out_dir,
cmd_version=CHROME_VERSION,
output=output,
status=status,
**timer.stats,
)

View file

@ -0,0 +1,63 @@
__package__ = 'archivebox.extractors'
from typing import Optional
from ..index.schema import Link, ArchiveResult, ArchiveOutput
from ..util import (
enforce_types,
TimedProgress,
is_static_file,
ArchiveError,
fetch_page_title,
)
from ..config import (
TIMEOUT,
SAVE_TITLE,
CURL_BINARY,
CURL_VERSION,
)
@enforce_types
def should_save_title(link: Link, out_dir: Optional[str]=None) -> bool:
# if link already has valid title, skip it
if link.title and not link.title.lower().startswith('http'):
return False
if is_static_file(link.url):
return False
return SAVE_TITLE
@enforce_types
def save_title(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
"""try to guess the page's title from its content"""
output: ArchiveOutput = None
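# note: cmd below is only recorded in the ArchiveResult for reference;
# the title itself is fetched in-process by fetch_page_title() further down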
cmd = [
CURL_BINARY,
link.url,
'|',
'grep',
'<title',
]
status = 'succeeded'
timer = TimedProgress(timeout, prefix=' ')
try:
output = fetch_page_title(link.url, timeout=timeout, progress=False)
if not output:
raise ArchiveError('Unable to detect page title')
except Exception as err:
status = 'failed'
output = err
finally:
timer.end()
return ArchiveResult(
cmd=cmd,
pwd=out_dir,
cmd_version=CURL_VERSION,
output=output,
status=status,
**timer.stats,
)
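
should_save_title() treats a title that starts with 'http' as a placeholder (typically the raw URL copied in at import time) and lets it be re-fetched. A tiny sketch of that heuristic with hypothetical titles:

# sketch: which stored titles get refreshed by the title extractor
def looks_like_placeholder(title):
    return not title or title.lower().startswith('http')

print(looks_like_placeholder('https://example.com/post'))   # True  -> re-fetch a real title
print(looks_like_placeholder('A Readable Page Title'))      # False -> keep the existing one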

View file

@ -0,0 +1,123 @@
__package__ = 'archivebox.extractors'
import os
from typing import Optional
from datetime import datetime
from ..index.schema import Link, ArchiveResult, ArchiveOutput
from ..util import (
enforce_types,
TimedProgress,
run,
PIPE,
wget_output_path,
ArchiveError,
)
from ..config import (
TIMEOUT,
SAVE_WGET,
SAVE_WARC,
WGET_BINARY,
WGET_VERSION,
CHECK_SSL_VALIDITY,
SAVE_WGET_REQUISITES,
WGET_AUTO_COMPRESSION,
WGET_USER_AGENT,
COOKIES_FILE,
)
@enforce_types
def should_save_wget(link: Link, out_dir: Optional[str]=None) -> bool:
output_path = wget_output_path(link)
out_dir = out_dir or link.link_dir
if output_path and os.path.exists(os.path.join(out_dir, output_path)):
return False
return SAVE_WGET
@enforce_types
def save_wget(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
"""download full site using wget"""
out_dir = out_dir or link.link_dir
if SAVE_WARC:
warc_dir = os.path.join(out_dir, 'warc')
os.makedirs(warc_dir, exist_ok=True)
warc_path = os.path.join('warc', str(int(datetime.now().timestamp())))
# WGET CLI Docs: https://www.gnu.org/software/wget/manual/wget.html
output: ArchiveOutput = None
cmd = [
WGET_BINARY,
# '--server-response', # print headers for better error parsing
'--no-verbose',
'--adjust-extension',
'--convert-links',
'--force-directories',
'--backup-converted',
'--span-hosts',
'--no-parent',
'-e', 'robots=off',
'--restrict-file-names=windows',
'--timeout={}'.format(timeout),
*([] if SAVE_WARC else ['--timestamping']),
*(['--warc-file={}'.format(warc_path)] if SAVE_WARC else []),
*(['--page-requisites'] if SAVE_WGET_REQUISITES else []),
*(['--user-agent={}'.format(WGET_USER_AGENT)] if WGET_USER_AGENT else []),
*(['--load-cookies', COOKIES_FILE] if COOKIES_FILE else []),
*(['--compression=auto'] if WGET_AUTO_COMPRESSION else []),
*([] if CHECK_SSL_VALIDITY else ['--no-check-certificate', '--no-hsts']),
link.url,
]
status = 'succeeded'
timer = TimedProgress(timeout, prefix=' ')
try:
result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=out_dir, timeout=timeout)
output = wget_output_path(link)
# parse out number of files downloaded from last line of stderr:
# "Downloaded: 76 files, 4.0M in 1.6s (2.52 MB/s)"
output_tail = [
line.strip()
for line in (result.stdout + result.stderr).decode().rsplit('\n', 3)[-3:]
if line.strip()
]
files_downloaded = (
int(output_tail[-1].strip().split(' ', 2)[1] or 0)
if 'Downloaded:' in output_tail[-1]
else 0
)
# Check for common failure cases
if result.returncode > 0 and files_downloaded < 1:
hints = (
'Got wget response code: {}.'.format(result.returncode),
*output_tail,
)
if b'403: Forbidden' in result.stderr:
raise ArchiveError('403 Forbidden (try changing WGET_USER_AGENT)', hints)
if b'404: Not Found' in result.stderr:
raise ArchiveError('404 Not Found', hints)
if b'ERROR 500: Internal Server Error' in result.stderr:
raise ArchiveError('500 Internal Server Error', hints)
raise ArchiveError('Got an error from the server', hints)
# chmod_file(output, cwd=out_dir)
except Exception as err:
status = 'failed'
output = err
finally:
timer.end()
return ArchiveResult(
cmd=cmd,
pwd=out_dir,
cmd_version=WGET_VERSION,
output=output,
status=status,
**timer.stats,
)
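
The files_downloaded parse leans on the format of wget's closing summary line. A quick sketch of what it extracts from the sample quoted in the comment above:

# sketch: pulling the file count out of wget's summary line
tail_line = 'Downloaded: 76 files, 4.0M in 1.6s (2.52 MB/s)'
files_downloaded = (
    int(tail_line.strip().split(' ', 2)[1] or 0)
    if 'Downloaded:' in tail_line
    else 0
)
print(files_downloaded)   # 76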

View file

@ -1,14 +1,25 @@
__package__ = 'archivebox.legacy'
__package__ = 'archivebox.index'
import re
import os
import json
import shutil
import json as pyjson
from typing import List, Tuple, Optional, Iterable
from itertools import chain
from typing import List, Tuple, Dict, Optional, Iterable
from collections import OrderedDict
from contextlib import contextmanager
from .schema import Link, ArchiveResult
from .config import (
from ..parsers import parse_links
from ..util import (
scheme,
enforce_types,
TimedProgress,
atomic_write,
ExtendedEncoder,
)
from ..config import (
ARCHIVE_DIR_NAME,
SQL_INDEX_FILENAME,
JSON_INDEX_FILENAME,
HTML_INDEX_FILENAME,
@ -18,26 +29,7 @@ from .config import (
ANSI,
stderr,
)
from .storage.html import write_html_main_index, write_html_link_details
from .storage.json import (
parse_json_main_index,
write_json_main_index,
parse_json_link_details,
write_json_link_details,
)
from .storage.sql import (
write_sql_main_index,
parse_sql_main_index,
)
from .util import (
scheme,
enforce_types,
TimedProgress,
atomic_write,
ExtendedEncoder,
)
from .parse import parse_links
from .logs import (
from ..cli.logging import (
log_indexing_process_started,
log_indexing_process_finished,
log_indexing_started,
@ -46,6 +38,22 @@ from .logs import (
log_parsing_finished,
)
from .schema import Link, ArchiveResult
from .html import (
write_html_main_index,
write_html_link_details,
)
from .json import (
parse_json_main_index,
write_json_main_index,
parse_json_link_details,
write_json_link_details,
)
from .sql import (
write_sql_main_index,
parse_sql_main_index,
)
### Link filtering and checking
@enforce_types
@ -95,11 +103,11 @@ def merge_links(a: Link, b: Link) -> Link:
}
for method in all_methods:
deduped_jsons = {
json.dumps(result, sort_keys=True, cls=ExtendedEncoder)
pyjson.dumps(result, sort_keys=True, cls=ExtendedEncoder)
for result in history[method]
}
history[method] = list(reversed(sorted(
(ArchiveResult.from_json(json.loads(result)) for result in deduped_jsons),
(ArchiveResult.from_json(pyjson.loads(result)) for result in deduped_jsons),
key=lambda result: result.start_ts,
)))
@ -114,7 +122,7 @@ def merge_links(a: Link, b: Link) -> Link:
@enforce_types
def validate_links(links: Iterable[Link]) -> Iterable[Link]:
def validate_links(links: Iterable[Link]) -> List[Link]:
links = archivable_links(links) # remove chrome://, about:, mailto: etc.
links = sorted_links(links)      # deterministically sort the links based on timestamp, url
links = uniquefied_links(links) # merge/dedupe duplicate timestamps & urls
@ -128,7 +136,7 @@ def validate_links(links: Iterable[Link]) -> Iterable[Link]:
stderr(' archivebox help')
raise SystemExit(1)
return links
return list(links)
@enforce_types
@ -259,23 +267,32 @@ def load_main_index_meta(out_dir: str=OUTPUT_DIR) -> Optional[dict]:
index_path = os.path.join(out_dir, JSON_INDEX_FILENAME)
if os.path.exists(index_path):
with open(index_path, 'r', encoding='utf-8') as f:
meta_dict = json.load(f)
meta_dict = pyjson.load(f)
meta_dict.pop('links')
return meta_dict
return None
@enforce_types
def import_new_links(existing_links: List[Link], import_path: str) -> Tuple[List[Link], List[Link]]:
def import_new_links(existing_links: List[Link],
import_path: str,
out_dir: str=OUTPUT_DIR) -> Tuple[List[Link], List[Link]]:
new_links: List[Link] = []
# parse and validate the import file
log_parsing_started(import_path)
raw_links, parser_name = parse_links(import_path)
new_links = list(validate_links(raw_links))
new_links = validate_links(raw_links)
# merge existing links in out_dir and new links
all_links = list(validate_links(existing_links + new_links))
all_links = validate_links(existing_links + new_links)
all_link_urls = {link.url for link in existing_links}
new_links = [
link for link in new_links
if link.url not in all_link_urls
]
if parser_name:
num_parsed = len(raw_links)
@ -345,3 +362,231 @@ def load_link_details(link: Link, out_dir: Optional[str]=None) -> Link:
return merge_links(existing_link, link)
return link
LINK_FILTERS = {
'exact': lambda link, pattern: (link.url == pattern) or (link.base_url == pattern),
'substring': lambda link, pattern: pattern in link.url,
'regex': lambda link, pattern: bool(re.match(pattern, link.url)),
'domain': lambda link, pattern: link.domain == pattern,
}
@enforce_types
def link_matches_filter(link: Link, filter_patterns: List[str], filter_type: str='exact') -> bool:
for pattern in filter_patterns:
try:
if LINK_FILTERS[filter_type](link, pattern):
return True
except Exception:
stderr()
stderr(
f'[X] Got invalid pattern for --filter-type={filter_type}:',
color='red',
)
stderr(f' {pattern}')
raise SystemExit(2)
return False
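
A short sketch of how these filters are typically applied when selecting links from the index (some_links and the pattern are hypothetical):

# sketch: selecting indexed links by domain
matching = [
    link for link in some_links
    if link_matches_filter(link, ['example.com'], filter_type='domain')
]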
def get_indexed_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
"""indexed links without checking archive status or data directory validity"""
return {
link.link_dir: link
for link in links
}
def get_archived_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
"""indexed links that are archived with a valid data directory"""
return {
link.link_dir: link
for link in filter(is_archived, links)
}
def get_unarchived_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
"""indexed links that are unarchived with no data directory or an empty data directory"""
return {
link.link_dir: link
for link in filter(is_unarchived, links)
}
def get_present_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
"""dirs that are expected to exist based on the main index"""
all_folders = {}
for entry in os.scandir(os.path.join(out_dir, ARCHIVE_DIR_NAME)):
if entry.is_dir(follow_symlinks=True):
link = None
try:
link = parse_json_link_details(entry.path)
except Exception:
pass
all_folders[entry.path] = link
return all_folders
def get_valid_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
"""dirs with a valid index matched to the main index and archived content"""
return {
link.link_dir: link
for link in filter(is_valid, links)
}
def get_invalid_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
"""dirs that are invalid for any reason: corrupted/duplicate/orphaned/unrecognized"""
duplicate = get_duplicate_folders(links, out_dir=OUTPUT_DIR)
orphaned = get_orphaned_folders(links, out_dir=OUTPUT_DIR)
corrupted = get_corrupted_folders(links, out_dir=OUTPUT_DIR)
unrecognized = get_unrecognized_folders(links, out_dir=OUTPUT_DIR)
return {**duplicate, **orphaned, **corrupted, **unrecognized}
def get_duplicate_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
"""dirs that conflict with other directories that have the same link URL or timestamp"""
links = list(links)
by_url = {link.url: 0 for link in links}
by_timestamp = {link.timestamp: 0 for link in links}
duplicate_folders = {}
indexed_folders = {link.link_dir for link in links}
data_folders = (
entry.path
for entry in os.scandir(os.path.join(out_dir, ARCHIVE_DIR_NAME))
if entry.is_dir(follow_symlinks=True) and entry.path not in indexed_folders
)
for path in chain(sorted(indexed_folders), sorted(data_folders)):
link = None
try:
link = parse_json_link_details(path)
except Exception:
pass
if link:
# link folder has same timestamp as different link folder
by_timestamp[link.timestamp] = by_timestamp.get(link.timestamp, 0) + 1
if by_timestamp[link.timestamp] > 1:
duplicate_folders[path] = link
# link folder has same url as different link folder
by_url[link.url] = by_url.get(link.url, 0) + 1
if by_url[link.url] > 1:
duplicate_folders[path] = link
return duplicate_folders
def get_orphaned_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
"""dirs that contain a valid index but aren't listed in the main index"""
links = list(links)
indexed_folders = {link.link_dir: link for link in links}
orphaned_folders = {}
for entry in os.scandir(os.path.join(out_dir, ARCHIVE_DIR_NAME)):
if entry.is_dir(follow_symlinks=True):
link = None
try:
link = parse_json_link_details(entry.path)
except Exception:
pass
if link and entry.path not in indexed_folders:
# folder is a valid link data dir with index details, but it's not in the main index
orphaned_folders[entry.path] = link
return orphaned_folders
def get_corrupted_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
"""dirs that don't contain a valid index and aren't listed in the main index"""
return {
link.link_dir: link
for link in filter(is_corrupt, links)
}
def get_unrecognized_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
"""dirs that don't contain recognizable archive data and aren't listed in the main index"""
by_timestamp = {link.timestamp: 0 for link in links}
unrecognized_folders: Dict[str, Optional[Link]] = {}
for entry in os.scandir(os.path.join(out_dir, ARCHIVE_DIR_NAME)):
if entry.is_dir(follow_symlinks=True):
index_exists = os.path.exists(os.path.join(entry.path, 'index.json'))
link = None
try:
link = parse_json_link_details(entry.path)
except Exception:
pass
if index_exists and link is None:
# index exists but it's corrupted or unparseable
unrecognized_folders[entry.path] = link
elif not index_exists:
# link details index doesn't exist and the folder isn't in the main index
timestamp = entry.path.rsplit('/', 1)[-1]
if timestamp not in by_timestamp:
unrecognized_folders[entry.path] = link
return unrecognized_folders
def is_valid(link: Link) -> bool:
dir_exists = os.path.exists(link.link_dir)
index_exists = os.path.exists(os.path.join(link.link_dir, 'index.json'))
if not dir_exists:
# unarchived links are not included in the valid list
return False
if dir_exists and not index_exists:
return False
if dir_exists and index_exists:
try:
parsed_link = parse_json_link_details(link.link_dir)
return link.url == parsed_link.url
except Exception:
pass
return False
def is_corrupt(link: Link) -> bool:
if not os.path.exists(link.link_dir):
# unarchived links are not considered corrupt
return False
if is_valid(link):
return False
return True
def is_archived(link: Link) -> bool:
return is_valid(link) and link.is_archived
def is_unarchived(link: Link) -> bool:
if not os.path.exists(link.link_dir):
return True
return not link.is_archived
def fix_invalid_folder_locations(out_dir: str=OUTPUT_DIR) -> Tuple[List[str], List[str]]:
fixed = []
cant_fix = []
for entry in os.scandir(os.path.join(out_dir, ARCHIVE_DIR_NAME)):
if entry.is_dir(follow_symlinks=True):
if os.path.exists(os.path.join(entry.path, 'index.json')):
link = parse_json_link_details(entry.path)
if not link:
continue
if not entry.path.endswith(f'/{link.timestamp}'):
dest = os.path.join(out_dir, ARCHIVE_DIR_NAME, link.timestamp)
if os.path.exists(dest):
cant_fix.append(entry.path)
else:
shutil.move(entry.path, dest)
fixed.append(dest)
if link.link_dir != entry.path:
link = link.overwrite(link_dir=entry.path)
write_json_link_details(link, out_dir=entry.path)
return fixed, cant_fix
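
Taken together these helpers let callers bucket every data directory by health. A minimal sketch of that kind of report, assuming load_main_index() from this package and the config's OUTPUT_DIR:

# sketch: summarizing data-directory health for a loaded index
links = list(load_main_index(out_dir=OUTPUT_DIR))
for name, finder in [
    ('valid', get_valid_folders),
    ('duplicate', get_duplicate_folders),
    ('orphaned', get_orphaned_folders),
    ('corrupted', get_corrupted_folders),
    ('invalid', get_invalid_folders),
]:
    print(name.ljust(14), len(finder(links, out_dir=OUTPUT_DIR)))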

View file

@ -1,11 +1,22 @@
__package__ = 'archivebox.legacy.storage'
__package__ = 'archivebox.index'
import os
from datetime import datetime
from typing import List, Optional, Iterator
from ..schema import Link
from .schema import Link
from ..util import (
enforce_types,
ts_to_date,
urlencode,
htmlencode,
urldecode,
wget_output_path,
render_template,
atomic_write,
copy_and_overwrite,
)
from ..config import (
OUTPUT_DIR,
TEMPLATES_DIR,
@ -18,17 +29,6 @@ from ..config import (
ROBOTS_TXT_FILENAME,
FAVICON_FILENAME,
)
from ..util import (
enforce_types,
ts_to_date,
urlencode,
htmlencode,
urldecode,
wget_output_path,
render_template,
atomic_write,
copy_and_overwrite,
)
join = lambda *paths: os.path.join(*paths)
MAIN_INDEX_TEMPLATE = join(TEMPLATES_DIR, 'main_index.html')

View file

@ -1,4 +1,4 @@
__package__ = 'archivebox.legacy.storage'
__package__ = 'archivebox.index'
import os
import sys
@ -7,7 +7,8 @@ import json
from datetime import datetime
from typing import List, Optional, Iterator
from ..schema import Link, ArchiveResult
from .schema import Link, ArchiveResult
from ..util import enforce_types, atomic_write
from ..config import (
VERSION,
OUTPUT_DIR,
@ -17,14 +18,11 @@ from ..config import (
JSON_INDEX_FILENAME,
ARCHIVE_DIR_NAME,
)
from ..util import (
enforce_types,
atomic_write,
)
MAIN_INDEX_HEADER = {
'info': 'This is an index of site data archived by ArchiveBox: The self-hosted web archive.',
'schema': 'archivebox.legacy.storage.json',
'schema': 'archivebox.index.json',
'copyright_info': FOOTER_INFO,
'meta': {
'project': 'ArchiveBox',
@ -43,7 +41,7 @@ MAIN_INDEX_HEADER = {
@enforce_types
def parse_json_main_index(out_dir: str=OUTPUT_DIR) -> Iterator[Link]:
"""parse a archive index json file and return the list of links"""
"""parse an archive index json file and return the list of links"""
index_path = os.path.join(out_dir, JSON_INDEX_FILENAME)
if os.path.exists(index_path):
@ -110,4 +108,6 @@ def parse_json_links_details(out_dir: str) -> Iterator[Link]:
for entry in os.scandir(os.path.join(out_dir, ARCHIVE_DIR_NAME)):
if entry.is_dir(follow_symlinks=True):
if os.path.exists(os.path.join(entry.path, 'index.json')):
yield parse_json_link_details(entry.path)
link = parse_json_link_details(entry.path)
if link:
yield link

View file

@ -1,3 +1,5 @@
__package__ = 'archivebox.index'
import os
from datetime import datetime
@ -48,7 +50,7 @@ class ArchiveResult:
@classmethod
def from_json(cls, json_info):
from .util import parse_date
from ..util import parse_date
info = {
key: val
@ -60,12 +62,12 @@ class ArchiveResult:
return cls(**info)
def to_json(self, indent=4, sort_keys=True):
from .util import to_json
from ..util import to_json
return to_json(self, indent=indent, sort_keys=sort_keys)
def to_csv(self, cols=None, ljust: int=0, separator: str=','):
from .util import to_json
from ..util import to_json
cols = cols or self.field_names()
return separator.join(
@ -115,7 +117,7 @@ class Link:
return float(self.timestamp) > float(other.timestamp)
def typecheck(self) -> None:
from .config import stderr, ANSI
from ..config import stderr, ANSI
try:
assert self.schema == self.__class__.__name__
assert isinstance(self.timestamp, str) and self.timestamp
@ -176,7 +178,7 @@ class Link:
@classmethod
def from_json(cls, json_info):
from .util import parse_date
from ..util import parse_date
info = {
key: val
@ -200,12 +202,12 @@ class Link:
return cls(**info)
def to_json(self, indent=4, sort_keys=True):
from .util import to_json
from ..util import to_json
return to_json(self, indent=indent, sort_keys=sort_keys)
def to_csv(self, csv_cols: List[str], ljust: int=0, separator: str=','):
from .util import to_json
from ..util import to_json
return separator.join(
to_json(getattr(self, col), indent=None).ljust(ljust)
@ -218,60 +220,60 @@ class Link:
@property
def link_dir(self) -> str:
from .config import CONFIG
from ..config import CONFIG
return os.path.join(CONFIG['ARCHIVE_DIR'], self.timestamp)
@property
def archive_path(self) -> str:
from .config import ARCHIVE_DIR_NAME
from ..config import ARCHIVE_DIR_NAME
return '{}/{}'.format(ARCHIVE_DIR_NAME, self.timestamp)
### URL Helpers
@property
def url_hash(self):
from .util import hashurl
from ..util import hashurl
return hashurl(self.url)
@property
def scheme(self) -> str:
from .util import scheme
from ..util import scheme
return scheme(self.url)
@property
def extension(self) -> str:
from .util import extension
from ..util import extension
return extension(self.url)
@property
def domain(self) -> str:
from .util import domain
from ..util import domain
return domain(self.url)
@property
def path(self) -> str:
from .util import path
from ..util import path
return path(self.url)
@property
def basename(self) -> str:
from .util import basename
from ..util import basename
return basename(self.url)
@property
def base_url(self) -> str:
from .util import base_url
from ..util import base_url
return base_url(self.url)
### Pretty Printing Helpers
@property
def bookmarked_date(self) -> Optional[str]:
from .util import ts_to_date
from ..util import ts_to_date
return ts_to_date(self.timestamp) if self.timestamp else None
@property
def updated_date(self) -> Optional[str]:
from .util import ts_to_date
from ..util import ts_to_date
return ts_to_date(self.updated) if self.updated else None
@property
@ -304,13 +306,13 @@ class Link:
@property
def is_static(self) -> bool:
from .util import is_static_file
from ..util import is_static_file
return is_static_file(self.url)
@property
def is_archived(self) -> bool:
from .config import ARCHIVE_DIR
from .util import domain
from ..config import ARCHIVE_DIR
from ..util import domain
output_paths = (
domain(self.url),
@ -352,7 +354,7 @@ class Link:
def canonical_outputs(self) -> Dict[str, Optional[str]]:
"""predict the expected output paths that should be present after archiving"""
from .util import wget_output_path
from ..util import wget_output_path
canonical = {
'index_path': 'index.html',
'favicon_path': 'favicon.ico',

View file

@ -1,9 +1,9 @@
__package__ = 'archivebox.legacy.storage'
__package__ = 'archivebox.index'
from io import StringIO
from typing import List, Tuple, Iterator
from ..schema import Link
from .schema import Link
from ..util import enforce_types
from ..config import setup_django, OUTPUT_DIR
@ -25,9 +25,19 @@ def write_sql_main_index(links: List[Link], out_dir: str=OUTPUT_DIR) -> None:
setup_django(out_dir, check_db=True)
from core.models import Page
for link in links:
all_urls = {link.url: link for link in links}
for page in Page.objects.all():
if page.url in all_urls:
info = {k: v for k, v in all_urls.pop(page.url)._asdict().items() if k in Page.keys}
Page.objects.filter(url=page.url).update(**info)
else:
page.delete()
for url, link in all_urls.items():
info = {k: v for k, v in link._asdict().items() if k in Page.keys}
Page.objects.update_or_create(url=link.url, defaults=info)
Page.objects.update_or_create(url=url, defaults=info)
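
In plain terms the rewritten sync is a three-way reconcile: update Page rows whose URL is still in the index, delete rows whose URL is gone, and create rows for anything new. A minimal usage sketch (module path assumed; links is a hypothetical list of Link objects):

# sketch: refreshing the SQL index after the JSON/HTML indexes are written
from archivebox.index.sql import write_sql_main_index   # assumed module path

write_sql_main_index(links, out_dir='.')   # reconciles core.models.Page rows against `links`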
@enforce_types

View file

@ -1,58 +0,0 @@
# This is the example default configuration file for ArchiveBox.
#
# Copy example config from here into your project's ArchiveBox.conf file,
# DO NOT EDIT THIS FILE DIRECTLY!
#
# See the list of all the possible options, documentation, and examples here:
# https://github.com/pirate/ArchiveBox/wiki/Configuration
[GENERAL_CONFIG]
OUTPUT_PERMISSIONS = 755
ONLY_NEW = False
TIMEOUT = 60
MEDIA_TIMEOUT = 3600
ACTIVE_THEME = default
FOOTER_INFO = Content is hosted for personal archiving purposes only. Contact server owner for any takedown requests.
URL_BLACKLIST = (://(.*\.)?facebook\.com)|(://(.*\.)?ebay\.com)|(.*\.exe$)
[ARCHIVE_METHOD_TOGGLES]
SAVE_TITLE = True
SAVE_FAVICON = True
SAVE_WGET = True
SAVE_WGET_REQUISITES = True
SAVE_WARC = True
SAVE_PDF = True
SAVE_SCREENSHOT = True
SAVE_DOM = True
SAVE_GIT = True
SAVE_MEDIA = False
SAVE_ARCHIVE_DOT_ORG = True
[ARCHIVE_METHOD_OPTIONS]
CHECK_SSL_VALIDITY = True
RESOLUTION = 1440,900
GIT_DOMAINS = github.com,bitbucket.org,gitlab.com
CHROME_HEADLESS = True
CHROME_SANDBOX = True
COOKIES_FILE = path/to/cookies.txt
CHROME_USER_DATA_DIR = ~/.config/google-chrome/Default
WGET_USER_AGENT = Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36
CHROME_USER_AGENT = Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36
[DEPENDENCY_CONFIG]
USE_CURL = True
USE_WGET = True
USE_CHROME = True
USE_YOUTUBEDL = True
USE_GIT = True
CURL_BINARY = curl
GIT_BINARY = git
WGET_BINARY = wget
YOUTUBEDL_BINARY = youtube-dl
CHROME_BINARY = chromium

View file

@ -1 +0,0 @@
__package__ = 'archivebox.legacy'

View file

@ -1,694 +0,0 @@
import os
from typing import Dict, List, Tuple, Optional
from collections import defaultdict
from datetime import datetime
from .schema import Link, ArchiveResult, ArchiveOutput
from .index import (
load_link_details,
write_link_details,
patch_main_index,
)
from .config import (
CURL_BINARY,
GIT_BINARY,
WGET_BINARY,
YOUTUBEDL_BINARY,
SAVE_FAVICON,
SAVE_TITLE,
SAVE_WGET,
SAVE_WGET_REQUISITES,
SAVE_PDF,
SAVE_SCREENSHOT,
SAVE_DOM,
SAVE_WARC,
SAVE_GIT,
SAVE_MEDIA,
SAVE_ARCHIVE_DOT_ORG,
TIMEOUT,
MEDIA_TIMEOUT,
GIT_DOMAINS,
VERSION,
WGET_USER_AGENT,
CHECK_SSL_VALIDITY,
COOKIES_FILE,
CURL_VERSION,
WGET_VERSION,
CHROME_VERSION,
GIT_VERSION,
YOUTUBEDL_VERSION,
WGET_AUTO_COMPRESSION,
)
from .util import (
enforce_types,
domain,
extension,
without_query,
without_fragment,
fetch_page_title,
is_static_file,
TimedProgress,
chmod_file,
wget_output_path,
chrome_args,
run, PIPE, DEVNULL,
)
from .logs import (
log_link_archiving_started,
log_link_archiving_finished,
log_archive_method_started,
log_archive_method_finished,
)
class ArchiveError(Exception):
def __init__(self, message, hints=None):
super().__init__(message)
self.hints = hints
@enforce_types
def archive_link(link: Link, out_dir: Optional[str]=None) -> Link:
"""download the DOM, PDF, and a screenshot into a folder named after the link's timestamp"""
ARCHIVE_METHODS = (
('title', should_save_title, save_title),
('favicon', should_save_favicon, save_favicon),
('wget', should_save_wget, save_wget),
('pdf', should_save_pdf, save_pdf),
('screenshot', should_save_screenshot, save_screenshot),
('dom', should_save_dom, save_dom),
('git', should_save_git, save_git),
('media', should_save_media, save_media),
('archive_org', should_save_archive_dot_org, save_archive_dot_org),
)
out_dir = out_dir or link.link_dir
try:
is_new = not os.path.exists(out_dir)
if is_new:
os.makedirs(out_dir)
link = load_link_details(link, out_dir=out_dir)
log_link_archiving_started(link, out_dir, is_new)
link = link.overwrite(updated=datetime.now())
stats = {'skipped': 0, 'succeeded': 0, 'failed': 0}
for method_name, should_run, method_function in ARCHIVE_METHODS:
try:
if method_name not in link.history:
link.history[method_name] = []
if should_run(link, out_dir):
log_archive_method_started(method_name)
result = method_function(link=link, out_dir=out_dir)
link.history[method_name].append(result)
stats[result.status] += 1
log_archive_method_finished(result)
else:
stats['skipped'] += 1
except Exception as e:
raise Exception('Exception in archive_methods.save_{}(Link(url={}))'.format(
method_name,
link.url,
)) from e
# print(' ', stats)
write_link_details(link, out_dir=link.link_dir)
patch_main_index(link)
# # If any changes were made, update the main links index json and html
# was_changed = stats['succeeded'] or stats['failed']
# if was_changed:
# patch_main_index(link)
log_link_archiving_finished(link, link.link_dir, is_new, stats)
except KeyboardInterrupt:
try:
write_link_details(link, out_dir=link.link_dir)
except:
pass
raise
except Exception as err:
print(' ! Failed to archive link: {}: {}'.format(err.__class__.__name__, err))
raise
return link
### Archive Method Functions
@enforce_types
def should_save_title(link: Link, out_dir: Optional[str]=None) -> bool:
# if link already has valid title, skip it
if link.title and not link.title.lower().startswith('http'):
return False
if is_static_file(link.url):
return False
return SAVE_TITLE
@enforce_types
def save_title(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
"""try to guess the page's title from its content"""
output: ArchiveOutput = None
cmd = [
CURL_BINARY,
link.url,
'|',
'grep',
'<title',
]
status = 'succeeded'
timer = TimedProgress(timeout, prefix=' ')
try:
output = fetch_page_title(link.url, timeout=timeout, progress=False)
if not output:
raise ArchiveError('Unable to detect page title')
except Exception as err:
status = 'failed'
output = err
finally:
timer.end()
return ArchiveResult(
cmd=cmd,
pwd=out_dir,
cmd_version=CURL_VERSION,
output=output,
status=status,
**timer.stats,
)
@enforce_types
def should_save_favicon(link: Link, out_dir: Optional[str]=None) -> bool:
out_dir = out_dir or link.link_dir
if os.path.exists(os.path.join(out_dir, 'favicon.ico')):
return False
return SAVE_FAVICON
@enforce_types
def save_favicon(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
"""download site favicon from google's favicon api"""
out_dir = out_dir or link.link_dir
output: ArchiveOutput = 'favicon.ico'
cmd = [
CURL_BINARY,
'--max-time', str(timeout),
'--location',
'--output', str(output),
*([] if CHECK_SSL_VALIDITY else ['--insecure']),
'https://www.google.com/s2/favicons?domain={}'.format(domain(link.url)),
]
status = 'succeeded'
timer = TimedProgress(timeout, prefix=' ')
try:
run(cmd, stdout=PIPE, stderr=PIPE, cwd=out_dir, timeout=timeout)
chmod_file(output, cwd=out_dir)
except Exception as err:
status = 'failed'
output = err
finally:
timer.end()
return ArchiveResult(
cmd=cmd,
pwd=out_dir,
cmd_version=CURL_VERSION,
output=output,
status=status,
**timer.stats,
)
@enforce_types
def should_save_wget(link: Link, out_dir: Optional[str]=None) -> bool:
output_path = wget_output_path(link)
out_dir = out_dir or link.link_dir
if output_path and os.path.exists(os.path.join(out_dir, output_path)):
return False
return SAVE_WGET
@enforce_types
def save_wget(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
"""download full site using wget"""
out_dir = out_dir or link.link_dir
if SAVE_WARC:
warc_dir = os.path.join(out_dir, 'warc')
os.makedirs(warc_dir, exist_ok=True)
warc_path = os.path.join('warc', str(int(datetime.now().timestamp())))
# WGET CLI Docs: https://www.gnu.org/software/wget/manual/wget.html
output: ArchiveOutput = None
cmd = [
WGET_BINARY,
# '--server-response', # print headers for better error parsing
'--no-verbose',
'--adjust-extension',
'--convert-links',
'--force-directories',
'--backup-converted',
'--span-hosts',
'--no-parent',
'-e', 'robots=off',
'--restrict-file-names=windows',
'--timeout={}'.format(timeout),
*([] if SAVE_WARC else ['--timestamping']),
*(['--warc-file={}'.format(warc_path)] if SAVE_WARC else []),
*(['--page-requisites'] if SAVE_WGET_REQUISITES else []),
*(['--user-agent={}'.format(WGET_USER_AGENT)] if WGET_USER_AGENT else []),
*(['--load-cookies', COOKIES_FILE] if COOKIES_FILE else []),
*(['--compression=auto'] if WGET_AUTO_COMPRESSION else []),
*([] if CHECK_SSL_VALIDITY else ['--no-check-certificate', '--no-hsts']),
link.url,
]
status = 'succeeded'
timer = TimedProgress(timeout, prefix=' ')
try:
result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=out_dir, timeout=timeout)
output = wget_output_path(link)
# parse out number of files downloaded from last line of stderr:
# "Downloaded: 76 files, 4.0M in 1.6s (2.52 MB/s)"
output_tail = [
line.strip()
for line in (result.stdout + result.stderr).decode().rsplit('\n', 3)[-3:]
if line.strip()
]
files_downloaded = (
int(output_tail[-1].strip().split(' ', 2)[1] or 0)
if 'Downloaded:' in output_tail[-1]
else 0
)
# Check for common failure cases
if result.returncode > 0 and files_downloaded < 1:
hints = (
'Got wget response code: {}.'.format(result.returncode),
*output_tail,
)
if b'403: Forbidden' in result.stderr:
raise ArchiveError('403 Forbidden (try changing WGET_USER_AGENT)', hints)
if b'404: Not Found' in result.stderr:
raise ArchiveError('404 Not Found', hints)
if b'ERROR 500: Internal Server Error' in result.stderr:
raise ArchiveError('500 Internal Server Error', hints)
raise ArchiveError('Got an error from the server', hints)
except Exception as err:
status = 'failed'
output = err
finally:
timer.end()
return ArchiveResult(
cmd=cmd,
pwd=out_dir,
cmd_version=WGET_VERSION,
output=output,
status=status,
**timer.stats,
)
@enforce_types
def should_save_pdf(link: Link, out_dir: Optional[str]=None) -> bool:
out_dir = out_dir or link.link_dir
if is_static_file(link.url):
return False
if os.path.exists(os.path.join(out_dir, 'output.pdf')):
return False
return SAVE_PDF
@enforce_types
def save_pdf(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
"""print PDF of site to file using chrome --headless"""
out_dir = out_dir or link.link_dir
output: ArchiveOutput = 'output.pdf'
cmd = [
*chrome_args(TIMEOUT=timeout),
'--print-to-pdf',
link.url,
]
status = 'succeeded'
timer = TimedProgress(timeout, prefix=' ')
try:
result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=out_dir, timeout=timeout)
if result.returncode:
hints = (result.stderr or result.stdout).decode()
raise ArchiveError('Failed to save PDF', hints)
chmod_file('output.pdf', cwd=out_dir)
except Exception as err:
status = 'failed'
output = err
finally:
timer.end()
return ArchiveResult(
cmd=cmd,
pwd=out_dir,
cmd_version=CHROME_VERSION,
output=output,
status=status,
**timer.stats,
)
@enforce_types
def should_save_screenshot(link: Link, out_dir: Optional[str]=None) -> bool:
out_dir = out_dir or link.link_dir
if is_static_file(link.url):
return False
if os.path.exists(os.path.join(out_dir, 'screenshot.png')):
return False
return SAVE_SCREENSHOT
@enforce_types
def save_screenshot(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
"""take screenshot of site using chrome --headless"""
out_dir = out_dir or link.link_dir
output: ArchiveOutput = 'screenshot.png'
cmd = [
*chrome_args(TIMEOUT=timeout),
'--screenshot',
link.url,
]
status = 'succeeded'
timer = TimedProgress(timeout, prefix=' ')
try:
result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=out_dir, timeout=timeout)
if result.returncode:
hints = (result.stderr or result.stdout).decode()
raise ArchiveError('Failed to save screenshot', hints)
chmod_file(output, cwd=out_dir)
except Exception as err:
status = 'failed'
output = err
finally:
timer.end()
return ArchiveResult(
cmd=cmd,
pwd=out_dir,
cmd_version=CHROME_VERSION,
output=output,
status=status,
**timer.stats,
)
@enforce_types
def should_save_dom(link: Link, out_dir: Optional[str]=None) -> bool:
out_dir = out_dir or link.link_dir
if is_static_file(link.url):
return False
if os.path.exists(os.path.join(out_dir, 'output.html')):
return False
return SAVE_DOM
@enforce_types
def save_dom(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
"""print HTML of site to file using chrome --dump-html"""
out_dir = out_dir or link.link_dir
output: ArchiveOutput = 'output.html'
output_path = os.path.join(out_dir, str(output))
cmd = [
*chrome_args(TIMEOUT=timeout),
'--dump-dom',
link.url
]
status = 'succeeded'
timer = TimedProgress(timeout, prefix=' ')
try:
with open(output_path, 'w+') as f:
result = run(cmd, stdout=f, stderr=PIPE, cwd=out_dir, timeout=timeout)
if result.returncode:
hints = result.stderr.decode()
raise ArchiveError('Failed to save DOM', hints)
chmod_file(output, cwd=out_dir)
except Exception as err:
status = 'failed'
output = err
finally:
timer.end()
return ArchiveResult(
cmd=cmd,
pwd=out_dir,
cmd_version=CHROME_VERSION,
output=output,
status=status,
**timer.stats,
)
@enforce_types
def should_save_git(link: Link, out_dir: Optional[str]=None) -> bool:
out_dir = out_dir or link.link_dir
if is_static_file(link.url):
return False
if os.path.exists(os.path.join(out_dir, 'git')):
return False
is_clonable_url = (
(domain(link.url) in GIT_DOMAINS)
or (extension(link.url) == 'git')
)
if not is_clonable_url:
return False
return SAVE_GIT
@enforce_types
def save_git(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
"""download full site using git"""
out_dir = out_dir or link.link_dir
output: ArchiveOutput = 'git'
output_path = os.path.join(out_dir, str(output))
os.makedirs(output_path, exist_ok=True)
cmd = [
GIT_BINARY,
'clone',
'--mirror',
'--recursive',
*([] if CHECK_SSL_VALIDITY else ['-c', 'http.sslVerify=false']),
without_query(without_fragment(link.url)),
]
status = 'succeeded'
timer = TimedProgress(timeout, prefix=' ')
try:
result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=output_path, timeout=timeout + 1)
if result.returncode == 128:
# ignore failed re-download when the folder already exists
pass
elif result.returncode > 0:
hints = 'Got git response code: {}.'.format(result.returncode)
raise ArchiveError('Failed to save git clone', hints)
except Exception as err:
status = 'failed'
output = err
finally:
timer.end()
return ArchiveResult(
cmd=cmd,
pwd=out_dir,
cmd_version=GIT_VERSION,
output=output,
status=status,
**timer.stats,
)
@enforce_types
def should_save_media(link: Link, out_dir: Optional[str]=None) -> bool:
out_dir = out_dir or link.link_dir
if is_static_file(link.url):
return False
if os.path.exists(os.path.join(out_dir, 'media')):
return False
return SAVE_MEDIA
@enforce_types
def save_media(link: Link, out_dir: Optional[str]=None, timeout: int=MEDIA_TIMEOUT) -> ArchiveResult:
"""Download playlists or individual video, audio, and subtitles using youtube-dl"""
out_dir = out_dir or link.link_dir
output: ArchiveOutput = 'media'
output_path = os.path.join(out_dir, str(output))
os.makedirs(output_path, exist_ok=True)
cmd = [
YOUTUBEDL_BINARY,
'--write-description',
'--write-info-json',
'--write-annotations',
'--yes-playlist',
'--write-thumbnail',
'--no-call-home',
'--no-check-certificate',
'--user-agent',
'--all-subs',
'--extract-audio',
'--keep-video',
'--ignore-errors',
'--geo-bypass',
'--audio-format', 'mp3',
'--audio-quality', '320K',
'--embed-thumbnail',
'--add-metadata',
*([] if CHECK_SSL_VALIDITY else ['--no-check-certificate']),
link.url,
]
status = 'succeeded'
timer = TimedProgress(timeout, prefix=' ')
try:
result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=output_path, timeout=timeout + 1)
chmod_file(output, cwd=out_dir)
if result.returncode:
if (b'ERROR: Unsupported URL' in result.stderr
or b'HTTP Error 404' in result.stderr
or b'HTTP Error 403' in result.stderr
or b'URL could be a direct video link' in result.stderr
or b'Unable to extract container ID' in result.stderr):
# These happen too frequently on non-media pages to warrant printing to console
pass
else:
hints = (
'Got youtube-dl response code: {}.'.format(result.returncode),
*result.stderr.decode().split('\n'),
)
raise ArchiveError('Failed to save media', hints)
except Exception as err:
status = 'failed'
output = err
finally:
timer.end()
return ArchiveResult(
cmd=cmd,
pwd=out_dir,
cmd_version=YOUTUBEDL_VERSION,
output=output,
status=status,
**timer.stats,
)
@enforce_types
def should_save_archive_dot_org(link: Link, out_dir: Optional[str]=None) -> bool:
out_dir = out_dir or link.link_dir
if is_static_file(link.url):
return False
if os.path.exists(os.path.join(out_dir, 'archive.org.txt')):
# if open(path, 'r').read().strip() != 'None':
return False
return SAVE_ARCHIVE_DOT_ORG
@enforce_types
def save_archive_dot_org(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
"""submit site to archive.org for archiving via their service, save returned archive url"""
out_dir = out_dir or link.link_dir
output: ArchiveOutput = 'archive.org.txt'
archive_org_url = None
submit_url = 'https://web.archive.org/save/{}'.format(link.url)
cmd = [
CURL_BINARY,
'--location',
'--head',
'--user-agent', 'ArchiveBox/{} (+https://github.com/pirate/ArchiveBox/)'.format(VERSION), # be nice to the Archive.org people and show them where all this ArchiveBox traffic is coming from
'--max-time', str(timeout),
*([] if CHECK_SSL_VALIDITY else ['--insecure']),
submit_url,
]
status = 'succeeded'
timer = TimedProgress(timeout, prefix=' ')
try:
result = run(cmd, stdout=PIPE, stderr=DEVNULL, cwd=out_dir, timeout=timeout)
content_location, errors = parse_archive_dot_org_response(result.stdout)
if content_location:
archive_org_url = 'https://web.archive.org{}'.format(content_location[0])
elif len(errors) == 1 and 'RobotAccessControlException' in errors[0]:
archive_org_url = None
# raise ArchiveError('Archive.org denied by {}/robots.txt'.format(domain(link.url)))
elif errors:
raise ArchiveError(', '.join(errors))
else:
raise ArchiveError('Failed to find "content-location" URL header in Archive.org response.')
except Exception as err:
status = 'failed'
output = err
finally:
timer.end()
if output and not isinstance(output, Exception):
# instead of writing None when archive.org rejects the url write the
# url to resubmit it to archive.org. This is so when the user visits
# the URL in person, it will attempt to re-archive it, and it'll show the
# nicer error message explaining why the url was rejected if it fails.
archive_org_url = archive_org_url or submit_url
with open(os.path.join(out_dir, str(output)), 'w', encoding='utf-8') as f:
f.write(archive_org_url)
chmod_file('archive.org.txt', cwd=out_dir)
output = archive_org_url
return ArchiveResult(
cmd=cmd,
pwd=out_dir,
cmd_version=CURL_VERSION,
output=output,
status=status,
**timer.stats,
)
@enforce_types
def parse_archive_dot_org_response(response: bytes) -> Tuple[List[str], List[str]]:
# Parse archive.org response headers
headers: Dict[str, List[str]] = defaultdict(list)
# lowercase all the header names and store in dict
for header in response.splitlines():
if b':' not in header or not header.strip():
continue
name, val = header.decode().split(':', 1)
headers[name.lower().strip()].append(val.strip())
# Get successful archive url in "content-location" header or any errors
content_location = headers['content-location']
errors = headers['x-archive-wayback-runtime-error']
return content_location, errors

View file

@ -1,626 +0,0 @@
import os
import re
import shutil
from typing import Dict, List, Optional, Iterable
from itertools import chain
from .schema import Link
from .util import (
enforce_types,
TimedProgress,
get_dir_size,
human_readable_size,
)
from .index import (
links_after_timestamp,
load_main_index,
import_new_links,
write_main_index,
)
from .storage.json import (
parse_json_main_index,
parse_json_link_details,
parse_json_links_details,
)
from .storage.sql import parse_sql_main_index, get_admins
from .storage.html import parse_html_main_index
from .archive_methods import archive_link
from .config import (
stderr,
ANSI,
ONLY_NEW,
OUTPUT_DIR,
SOURCES_DIR,
ARCHIVE_DIR,
LOGS_DIR,
CONFIG_FILE,
ARCHIVE_DIR_NAME,
SOURCES_DIR_NAME,
LOGS_DIR_NAME,
STATIC_DIR_NAME,
JSON_INDEX_FILENAME,
HTML_INDEX_FILENAME,
SQL_INDEX_FILENAME,
ROBOTS_TXT_FILENAME,
FAVICON_FILENAME,
check_dependencies,
check_data_folder,
setup_django,
write_config_file,
)
from .logs import (
log_archiving_started,
log_archiving_paused,
log_archiving_finished,
log_removal_started,
log_removal_finished,
log_list_started,
log_list_finished,
)
ALLOWED_IN_OUTPUT_DIR = {
'.DS_Store',
'.venv',
'venv',
'virtualenv',
'.virtualenv',
ARCHIVE_DIR_NAME,
SOURCES_DIR_NAME,
LOGS_DIR_NAME,
STATIC_DIR_NAME,
SQL_INDEX_FILENAME,
JSON_INDEX_FILENAME,
HTML_INDEX_FILENAME,
ROBOTS_TXT_FILENAME,
FAVICON_FILENAME,
}
@enforce_types
def init():
os.makedirs(OUTPUT_DIR, exist_ok=True)
is_empty = not len(set(os.listdir(OUTPUT_DIR)) - ALLOWED_IN_OUTPUT_DIR)
existing_index = os.path.exists(os.path.join(OUTPUT_DIR, JSON_INDEX_FILENAME))
if is_empty and not existing_index:
print('{green}[+] Initializing a new ArchiveBox collection in this folder...{reset}'.format(**ANSI))
print(f' {OUTPUT_DIR}')
print('{green}------------------------------------------------------------------{reset}'.format(**ANSI))
elif existing_index:
print('{green}[*] Updating existing ArchiveBox collection in this folder...{reset}'.format(**ANSI))
print(f' {OUTPUT_DIR}')
print('{green}------------------------------------------------------------------{reset}'.format(**ANSI))
else:
stderr(
("{red}[X] This folder appears to already have files in it, but no index.json is present.{reset}\n\n"
" You must run init in a completely empty directory, or an existing data folder.\n\n"
" {lightred}Hint:{reset} To import an existing data folder make sure to cd into the folder first, \n"
" then run and run 'archivebox init' to pick up where you left off.\n\n"
" (Always make sure your data folder is backed up first before updating ArchiveBox)"
).format(OUTPUT_DIR, **ANSI)
)
raise SystemExit(1)
if existing_index:
print('\n{green}[*] Verifying archive folder structure...{reset}'.format(**ANSI))
else:
print('\n{green}[+] Building archive folder structure...{reset}'.format(**ANSI))
os.makedirs(SOURCES_DIR, exist_ok=True)
print(f'{SOURCES_DIR}')
os.makedirs(ARCHIVE_DIR, exist_ok=True)
print(f'{ARCHIVE_DIR}')
os.makedirs(LOGS_DIR, exist_ok=True)
print(f'{LOGS_DIR}')
write_config_file({}, out_dir=OUTPUT_DIR)
print(f'{CONFIG_FILE}')
if os.path.exists(os.path.join(OUTPUT_DIR, SQL_INDEX_FILENAME)):
print('\n{green}[*] Verifying main SQL index and running migrations...{reset}'.format(**ANSI))
else:
print('\n{green}[+] Building main SQL index and running migrations...{reset}'.format(**ANSI))
setup_django(OUTPUT_DIR, check_db=False)
from django.conf import settings
assert settings.DATABASE_FILE == os.path.join(OUTPUT_DIR, SQL_INDEX_FILENAME)
print(f'{settings.DATABASE_FILE}')
print()
from .storage.sql import apply_migrations
for migration_line in apply_migrations(OUTPUT_DIR):
print(f' {migration_line}')
assert os.path.exists(settings.DATABASE_FILE)
# from django.contrib.auth.models import User
# if IS_TTY and not User.objects.filter(is_superuser=True).exists():
# print('{green}[+] Creating admin user account...{reset}'.format(**ANSI))
# call_command("createsuperuser", interactive=True)
print()
print('{green}[*] Collecting links from any existing index or archive folders...{reset}'.format(**ANSI))
all_links = {}
if existing_index:
all_links = {
link.url: link
for link in load_main_index(out_dir=OUTPUT_DIR, warn=False)
}
print(' √ Loaded {} links from existing main index...'.format(len(all_links)))
orphaned_json_links = {
link.url: link
for link in parse_json_main_index(OUTPUT_DIR)
if link.url not in all_links
}
if orphaned_json_links:
all_links.update(orphaned_json_links)
print(' {lightyellow}√ Added {} orphaned links from existing JSON index...{reset}'.format(len(orphaned_json_links), **ANSI))
orphaned_sql_links = {
link.url: link
for link in parse_sql_main_index(OUTPUT_DIR)
if link.url not in all_links
}
if orphaned_sql_links:
all_links.update(orphaned_sql_links)
print(' {lightyellow}√ Added {} orphaned links from existing SQL index...{reset}'.format(len(orphaned_sql_links), **ANSI))
orphaned_data_dir_links = {
link.url: link
for link in parse_json_links_details(OUTPUT_DIR)
}
orphan_new_links = {
url: link
for url, link in orphaned_data_dir_links.items()
if url not in all_links
}
orphan_duplicates = {
url: link
for url, link in orphaned_data_dir_links.items()
if url in all_links
}
if orphan_new_links:
all_links.update(orphan_new_links)
print(' {lightyellow}√ Added {} orphaned links from existing archive directories...{reset}'.format(len(orphan_new_links), **ANSI))
if orphan_duplicates:
print(' {lightyellow}! Skipped adding {} invalid link data directories that would have overwritten or corrupted existing data.{reset}'.format(len(orphan_duplicates), **ANSI))
orphaned_data_dirs = {folder for folder in orphan_duplicates.keys()}
invalid_folders = {
folder: link
for folder, link in get_invalid_folders(all_links.values(), out_dir=OUTPUT_DIR).items()
if folder not in orphaned_data_dirs
}
if invalid_folders:
print(' {lightyellow}! Skipped adding {} corrupted/unrecognized link data directories that could not be read.{reset}'.format(len(invalid_folders), **ANSI))
if orphan_duplicates or invalid_folders:
print(' For more information about the link data directories that were skipped, run:')
print(' archivebox info')
print(' archivebox list --status=invalid')
print(' archivebox list --status=orphaned')
print(' archivebox list --status=duplicate')
write_main_index(list(all_links.values()), out_dir=OUTPUT_DIR)
print('\n{green}------------------------------------------------------------------{reset}'.format(**ANSI))
if existing_index:
print('{green}[√] Done. Verified and updated the existing ArchiveBox collection.{reset}'.format(**ANSI))
else:
print('{green}[√] Done. A new ArchiveBox collection was initialized ({} links).{reset}'.format(len(all_links), **ANSI))
print()
print(' To view your archive index, open:')
print(' {}'.format(os.path.join(OUTPUT_DIR, HTML_INDEX_FILENAME)))
print()
print(' To add new links, you can run:')
print(" archivebox add 'https://example.com'")
print()
print(' For more usage and examples, run:')
print(' archivebox help')
@enforce_types
def info():
print('{green}[*] Scanning archive collection main index...{reset}'.format(**ANSI))
print(f' {OUTPUT_DIR}/*')
num_bytes, num_dirs, num_files = get_dir_size(OUTPUT_DIR, recursive=False, pattern='index.')
size = human_readable_size(num_bytes)
print(f' Size: {size} across {num_files} files')
print()
links = list(load_main_index(out_dir=OUTPUT_DIR))
num_json_links = len(links)
num_sql_links = sum(1 for link in parse_sql_main_index(out_dir=OUTPUT_DIR))
num_html_links = sum(1 for url in parse_html_main_index(out_dir=OUTPUT_DIR))
num_link_details = sum(1 for link in parse_json_links_details(out_dir=OUTPUT_DIR))
users = get_admins().values_list('username', flat=True)
print(f' > JSON Main Index: {num_json_links} links'.ljust(36), f'(found in {JSON_INDEX_FILENAME})')
print(f' > SQL Main Index: {num_sql_links} links'.ljust(36), f'(found in {SQL_INDEX_FILENAME})')
print(f' > HTML Main Index: {num_html_links} links'.ljust(36), f'(found in {HTML_INDEX_FILENAME})')
print(f' > JSON Link Details: {num_link_details} links'.ljust(36), f'(found in {ARCHIVE_DIR_NAME}/*/index.json)')
print(f' > Admin: {len(users)} users {", ".join(users)}'.ljust(36), f'(found in {SQL_INDEX_FILENAME})')
if num_html_links != len(links) or num_sql_links != len(links):
print()
print(' {lightred}Hint:{reset} You can fix index count differences automatically by running:'.format(**ANSI))
print(' archivebox init')
if not users:
print()
print(' {lightred}Hint:{reset} You can create an admin user by running:'.format(**ANSI))
print(' archivebox manage createsuperuser')
print()
print('{green}[*] Scanning archive collection link data directories...{reset}'.format(**ANSI))
print(f' {ARCHIVE_DIR}/*')
num_bytes, num_dirs, num_files = get_dir_size(ARCHIVE_DIR)
size = human_readable_size(num_bytes)
print(f' Size: {size} across {num_files} files in {num_dirs} directories')
print()
num_indexed = len(get_indexed_folders(links, out_dir=OUTPUT_DIR))
num_archived = len(get_archived_folders(links, out_dir=OUTPUT_DIR))
num_unarchived = len(get_unarchived_folders(links, out_dir=OUTPUT_DIR))
print(f' > indexed: {num_indexed}'.ljust(36), f'({get_indexed_folders.__doc__})')
print(f' > archived: {num_archived}'.ljust(36), f'({get_archived_folders.__doc__})')
print(f' > unarchived: {num_unarchived}'.ljust(36), f'({get_unarchived_folders.__doc__})')
num_present = len(get_present_folders(links, out_dir=OUTPUT_DIR))
num_valid = len(get_valid_folders(links, out_dir=OUTPUT_DIR))
print()
print(f' > present: {num_present}'.ljust(36), f'({get_present_folders.__doc__})')
print(f' > valid: {num_valid}'.ljust(36), f'({get_valid_folders.__doc__})')
duplicate = get_duplicate_folders(links, out_dir=OUTPUT_DIR)
orphaned = get_orphaned_folders(links, out_dir=OUTPUT_DIR)
corrupted = get_corrupted_folders(links, out_dir=OUTPUT_DIR)
unrecognized = get_unrecognized_folders(links, out_dir=OUTPUT_DIR)
num_invalid = len({**duplicate, **orphaned, **corrupted, **unrecognized})
print(f' > invalid: {num_invalid}'.ljust(36), f'({get_invalid_folders.__doc__})')
print(f' > duplicate: {len(duplicate)}'.ljust(36), f'({get_duplicate_folders.__doc__})')
print(f' > orphaned: {len(orphaned)}'.ljust(36), f'({get_orphaned_folders.__doc__})')
print(f' > corrupted: {len(corrupted)}'.ljust(36), f'({get_corrupted_folders.__doc__})')
print(f' > unrecognized: {len(unrecognized)}'.ljust(36), f'({get_unrecognized_folders.__doc__})')
if num_indexed:
print()
print(' {lightred}Hint:{reset} You can list link data directories by status like so:'.format(**ANSI))
print(' archivebox list --status=<status> (e.g. indexed, corrupted, archived, etc.)')
if orphaned:
print()
print(' {lightred}Hint:{reset} To automatically import orphaned data directories into the main index, run:'.format(**ANSI))
print(' archivebox init')
if num_invalid:
print()
print(' {lightred}Hint:{reset} You may need to manually remove or fix some invalid data directories; afterwards, make sure to run:'.format(**ANSI))
print(' archivebox init')
print()
@enforce_types
def update_archive_data(import_path: Optional[str]=None,
resume: Optional[float]=None,
only_new: bool=False,
index_only: bool=False) -> List[Link]:
"""The main ArchiveBox entrancepoint. Everything starts here."""
check_dependencies()
check_data_folder()
# Step 1: Load list of links from the existing index
# merge in and dedupe new links from import_path
all_links: List[Link] = []
new_links: List[Link] = []
all_links = load_main_index(out_dir=OUTPUT_DIR)
if import_path:
all_links, new_links = import_new_links(all_links, import_path)
# Step 2: Write updated index with deduped old and new links back to disk
write_main_index(links=list(all_links), out_dir=OUTPUT_DIR)
if index_only:
return all_links
# Step 3: Run the archive methods for each link
links = new_links if (only_new or ONLY_NEW) else all_links  # honor the only_new arg as well as the config default
log_archiving_started(len(links), resume)
idx: int = 0
link: Link = None # type: ignore
try:
for idx, link in enumerate(links_after_timestamp(links, resume)):
archive_link(link, out_dir=link.link_dir)
except KeyboardInterrupt:
log_archiving_paused(len(links), idx, link.timestamp if link else '0')
raise SystemExit(0)
except:
print()
raise
log_archiving_finished(len(links))
# Step 4: Re-write links index with updated titles, icons, and resources
all_links = load_main_index(out_dir=OUTPUT_DIR)
write_main_index(links=list(all_links), out_dir=OUTPUT_DIR, finished=True)
return all_links
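# Illustrative usage sketch (not part of the diff; the import path below is hypothetical):
#
#     all_links = update_archive_data(
#         import_path='output/sources/bookmarks_export.html',
#         only_new=True,       # only archive links that weren't already in the index
#     )
#
# Passing index_only=True writes the merged index to disk but skips the archiving step.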
LINK_FILTERS = {
'exact': lambda link, pattern: (link.url == pattern) or (link.base_url == pattern),
'substring': lambda link, pattern: pattern in link.url,
'regex': lambda link, pattern: bool(re.match(pattern, link.url)),
'domain': lambda link, pattern: link.domain == pattern,
}
@enforce_types
def link_matches_filter(link: Link, filter_patterns: List[str], filter_type: str='exact') -> bool:
for pattern in filter_patterns:
if LINK_FILTERS[filter_type](link, pattern):
return True
return False
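# A minimal, self-contained sketch of how the four filter types behave. The
# SimpleNamespace below is a hypothetical stand-in for a real Link, which exposes
# the same url/base_url/domain attributes that LINK_FILTERS above relies on.
import re
from types import SimpleNamespace

_example = SimpleNamespace(
    url='https://example.com/post/123?utm=abc',
    base_url='example.com/post/123?utm=abc',
    domain='example.com',
)
assert LINK_FILTERS['substring'](_example, 'example.com/post')      # pattern appears anywhere in the URL
assert LINK_FILTERS['regex'](_example, r'https://example\.com/.+')  # pattern must match from the start of the URL
assert LINK_FILTERS['domain'](_example, 'example.com')              # domain must match exactly
assert not LINK_FILTERS['exact'](_example, 'https://example.com')   # exact needs the full url or base_url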
@enforce_types
def list_archive_data(filter_patterns: Optional[List[str]]=None, filter_type: str='exact',
after: Optional[float]=None, before: Optional[float]=None) -> Iterable[Link]:
all_links = load_main_index(out_dir=OUTPUT_DIR)
for link in all_links:
if after is not None and float(link.timestamp) < after:
continue
if before is not None and float(link.timestamp) > before:
continue
if filter_patterns:
if link_matches_filter(link, filter_patterns, filter_type):
yield link
else:
yield link
@enforce_types
def remove_archive_links(filter_patterns: List[str], filter_type: str='exact',
after: Optional[float]=None, before: Optional[float]=None,
yes: bool=False, delete: bool=False) -> List[Link]:
check_dependencies()
check_data_folder()
log_list_started(filter_patterns, filter_type)
timer = TimedProgress(360, prefix=' ')
try:
links = list(list_archive_data(
filter_patterns=filter_patterns,
filter_type=filter_type,
after=after,
before=before,
))
finally:
timer.end()
if not len(links):
log_removal_finished(0, 0)
raise SystemExit(1)
log_list_finished(links)
log_removal_started(links, yes=yes, delete=delete)
timer = TimedProgress(360, prefix=' ')
try:
to_keep = []
all_links = load_main_index(out_dir=OUTPUT_DIR)
for link in all_links:
should_remove = (
(after is not None and float(link.timestamp) < after)
or (before is not None and float(link.timestamp) > before)
or link_matches_filter(link, filter_patterns, filter_type)
)
if not should_remove:
to_keep.append(link)
elif should_remove and delete:
shutil.rmtree(link.link_dir)
finally:
timer.end()
write_main_index(links=to_keep, out_dir=OUTPUT_DIR, finished=True)
log_removal_finished(len(all_links), len(to_keep))
return to_keep
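# Illustrative usage sketch (not part of the diff; example.com is a hypothetical target):
#
#     remaining = remove_archive_links(
#         filter_patterns=['example.com'],
#         filter_type='domain',
#         yes=True,        # skip the interactive confirmation prompt
#         delete=True,     # also rmtree each matching link's data directory
#     )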
def get_indexed_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
"""indexed links without checking archive status or data directory validity"""
return {
link.link_dir: link
for link in links
}
def get_archived_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
"""indexed links that are archived with a valid data directory"""
return {
link.link_dir: link
for link in filter(is_archived, links)
}
def get_unarchived_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
"""indexed links that are unarchived with no data directory or an empty data directory"""
return {
link.link_dir: link
for link in filter(is_unarchived, links)
}
def get_present_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
"""dirs that are expected to exist based on the main index"""
all_folders = {}
for entry in os.scandir(os.path.join(out_dir, ARCHIVE_DIR_NAME)):
if entry.is_dir(follow_symlinks=True):
link = None
try:
link = parse_json_link_details(entry.path)
except Exception:
pass
all_folders[entry.path] = link
return all_folders
def get_valid_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
"""dirs with a valid index matched to the main index and archived content"""
return {
link.link_dir: link
for link in filter(is_valid, links)
}
def get_invalid_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
"""dirs that are invalid for any reason: corrupted/duplicate/orphaned/unrecognized"""
duplicate = get_duplicate_folders(links, out_dir=out_dir)
orphaned = get_orphaned_folders(links, out_dir=out_dir)
corrupted = get_corrupted_folders(links, out_dir=out_dir)
unrecognized = get_unrecognized_folders(links, out_dir=out_dir)
return {**duplicate, **orphaned, **corrupted, **unrecognized}
def get_duplicate_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
"""dirs that conflict with other directories that have the same link URL or timestamp"""
links = list(links)
by_url = {link.url: 0 for link in links}
by_timestamp = {link.timestamp: 0 for link in links}
duplicate_folders = {}
indexed_folders = {link.link_dir for link in links}
data_folders = (
entry.path
for entry in os.scandir(os.path.join(out_dir, ARCHIVE_DIR_NAME))
if entry.is_dir(follow_symlinks=True) and entry.path not in indexed_folders
)
for path in chain(sorted(indexed_folders), sorted(data_folders)):
link = None
try:
link = parse_json_link_details(path)
except Exception:
pass
if link:
# link folder has same timestamp as different link folder
by_timestamp[link.timestamp] = by_timestamp.get(link.timestamp, 0) + 1
if by_timestamp[link.timestamp] > 1:
duplicate_folders[path] = link
# link folder has same url as different link folder
by_url[link.url] = by_url.get(link.url, 0) + 1
if by_url[link.url] > 1:
duplicate_folders[path] = link
return duplicate_folders
def get_orphaned_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
"""dirs that contain a valid index but aren't listed in the main index"""
links = list(links)
indexed_folders = {link.link_dir: link for link in links}
orphaned_folders = {}
for entry in os.scandir(os.path.join(out_dir, ARCHIVE_DIR_NAME)):
if entry.is_dir(follow_symlinks=True):
index_exists = os.path.exists(os.path.join(entry.path, 'index.json'))
link = None
try:
link = parse_json_link_details(entry.path)
except Exception:
pass
if index_exists and entry.path not in indexed_folders:
# folder is a valid link data dir with index details, but it's not in the main index
orphaned_folders[entry.path] = link
return orphaned_folders
def get_corrupted_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
"""dirs that don't contain a valid index and aren't listed in the main index"""
return {
link.link_dir: link
for link in filter(is_corrupt, links)
}
def get_unrecognized_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
"""dirs that don't contain recognizable archive data and aren't listed in the main index"""
by_timestamp = {link.timestamp: 0 for link in links}
unrecognized_folders: Dict[str, Optional[Link]] = {}
for entry in os.scandir(os.path.join(out_dir, ARCHIVE_DIR_NAME)):
if entry.is_dir(follow_symlinks=True):
index_exists = os.path.exists(os.path.join(entry.path, 'index.json'))
link = None
try:
link = parse_json_link_details(entry.path)
except Exception:
pass
if index_exists and link is None:
# index exists but it's corrupted or unparseable
unrecognized_folders[entry.path] = link
elif not index_exists:
# link details index doesn't exist and the folder isn't in the main index
timestamp = entry.path.rsplit('/', 1)[-1]
if timestamp not in by_timestamp:
unrecognized_folders[entry.path] = link
return unrecognized_folders
def is_valid(link: Link) -> bool:
dir_exists = os.path.exists(link.link_dir)
index_exists = os.path.exists(os.path.join(link.link_dir, 'index.json'))
if not dir_exists:
# unarchived links are not included in the valid list
return False
if dir_exists and not index_exists:
return False
if dir_exists and index_exists:
try:
parsed_link = parse_json_link_details(link.link_dir)
return link.url == parsed_link.url
except Exception:
pass
return False
def is_corrupt(link: Link) -> bool:
if not os.path.exists(link.link_dir):
# unarchived links are not considered corrupt
return False
if is_valid(link):
return False
return True
def is_archived(link: Link) -> bool:
return is_valid(link) and link.is_archived
def is_unarchived(link: Link) -> bool:
if not os.path.exists(link.link_dir):
return True
return not link.is_archived
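# Summary of how the predicates above relate (descriptive only):
#   is_valid(link)      -> data dir and index.json both exist, and the parsed url matches link.url
#   is_corrupt(link)    -> data dir exists but the link is not valid
#   is_archived(link)   -> valid and link.is_archived is True
#   is_unarchived(link) -> no data dir at all, or link.is_archived is False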

View file

@@ -1,10 +0,0 @@
[mypy_django_plugin]
# specify settings module to use for django.conf.settings, this setting
# could also be specified with DJANGO_SETTINGS_MODULE environment variable
# (it also takes priority over config file)
django_settings = core.settings
# if True, all unknown settings in django.conf.settings will fallback to Any,
# specify it if your settings are loaded dynamically to avoid false positives
ignore_missing_settings = True

View file

@@ -1,331 +0,0 @@
"""
Everything related to parsing links from input sources.
For a list of supported services, see the README.md.
For examples of supported import formats see tests/.
Link: {
'url': 'https://example.com/example/?abc=123&xyc=345#lmnop',
'timestamp': '1544212312.4234',
'title': 'Example.com Page Title',
'tags': 'abc,def',
'sources': [
'output/sources/ril_export.html',
'output/sources/getpocket.com-1523422111.txt',
'output/sources/stdin-234234112312.txt'
]
}
"""
import re
import json
from typing import Tuple, List, IO, Iterable
from datetime import datetime
import xml.etree.ElementTree as etree
from .config import TIMEOUT
from .util import (
htmldecode,
str_between,
URL_REGEX,
check_url_parsing_invariants,
TimedProgress,
Link,
enforce_types,
)
@enforce_types
def parse_links(source_file: str) -> Tuple[List[Link], str]:
"""parse a list of URLs with their metadata from an
RSS feed, bookmarks export, or text file
"""
check_url_parsing_invariants()
PARSERS = (
# Specialized parsers
('Pocket HTML', parse_pocket_html_export),
('Pinboard RSS', parse_pinboard_rss_export),
('Shaarli RSS', parse_shaarli_rss_export),
('Medium RSS', parse_medium_rss_export),
# General parsers
('Netscape HTML', parse_netscape_html_export),
('Generic RSS', parse_rss_export),
('Generic JSON', parse_json_export),
# Fallback parser
('Plain Text', parse_plain_text_export),
)
timer = TimedProgress(TIMEOUT * 4)
with open(source_file, 'r', encoding='utf-8') as file:
for parser_name, parser_func in PARSERS:
try:
links = list(parser_func(file))
if links:
timer.end()
return links, parser_name
except Exception as err: # noqa
# Parsers are tried one by one down the list, and the first one
# that succeeds is used. To see why a certain parser was not used
# due to error or format incompatibility, uncomment this line:
# print('[!] Parser {} failed: {} {}'.format(parser_name, err.__class__.__name__, err))
pass
timer.end()
return [], 'Failed to parse'
### Import Parser Functions
@enforce_types
def parse_pocket_html_export(html_file: IO[str]) -> Iterable[Link]:
"""Parse Pocket-format bookmarks export files (produced by getpocket.com/export/)"""
html_file.seek(0)
pattern = re.compile("^\\s*<li><a href=\"(.+)\" time_added=\"(\\d+)\" tags=\"(.*)\">(.+)</a></li>", re.UNICODE)
for line in html_file:
# example line
# <li><a href="http://example.com/" time_added="1478739709" tags="tag1,tag2">example title</a></li>
match = pattern.search(line)
if match:
url = match.group(1).replace('http://www.readability.com/read?url=', '') # remove old readability prefixes to get original url
time = datetime.fromtimestamp(float(match.group(2)))
tags = match.group(3)
title = match.group(4).replace(' — Readability', '').replace('http://www.readability.com/read?url=', '')
yield Link(
url=htmldecode(url),
timestamp=str(time.timestamp()),
title=htmldecode(title) or None,
tags=tags or '',
sources=[html_file.name],
)
@enforce_types
def parse_json_export(json_file: IO[str]) -> Iterable[Link]:
"""Parse JSON-format bookmarks export files (produced by pinboard.in/export/, or wallabag)"""
json_file.seek(0)
links = json.load(json_file)
json_date = lambda s: datetime.strptime(s, '%Y-%m-%dT%H:%M:%S%z')
for link in links:
# example line
# {"href":"http:\/\/www.reddit.com\/r\/example","description":"title here","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"reddit android"}]
if link:
# Parse URL
url = link.get('href') or link.get('url') or link.get('URL')
if not url:
raise Exception('JSON must contain URL in each entry [{"url": "http://...", ...}, ...]')
# Parse the timestamp
ts_str = str(datetime.now().timestamp())
if link.get('timestamp'):
# chrome/ff histories use a very precise timestamp
ts_str = str(link['timestamp'] / 10000000)
elif link.get('time'):
ts_str = str(json_date(link['time'].split(',', 1)[0]).timestamp())
elif link.get('created_at'):
ts_str = str(json_date(link['created_at']).timestamp())
elif link.get('created'):
ts_str = str(json_date(link['created']).timestamp())
elif link.get('date'):
ts_str = str(json_date(link['date']).timestamp())
elif link.get('bookmarked'):
ts_str = str(json_date(link['bookmarked']).timestamp())
elif link.get('saved'):
ts_str = str(json_date(link['saved']).timestamp())
# Parse the title
title = None
if link.get('title'):
title = link['title'].strip()
elif link.get('description'):
title = link['description'].replace(' — Readability', '').strip()
elif link.get('name'):
title = link['name'].strip()
yield Link(
url=htmldecode(url),
timestamp=ts_str,
title=htmldecode(title) or None,
tags=htmldecode(link.get('tags')) or '',
sources=[json_file.name],
)
@enforce_types
def parse_rss_export(rss_file: IO[str]) -> Iterable[Link]:
"""Parse RSS XML-format files into links"""
rss_file.seek(0)
items = rss_file.read().split('<item>')
items = items[1:] if items else []
for item in items:
# example item:
# <item>
# <title><![CDATA[How JavaScript works: inside the V8 engine]]></title>
# <category>Unread</category>
# <link>https://blog.sessionstack.com/how-javascript-works-inside</link>
# <guid>https://blog.sessionstack.com/how-javascript-works-inside</guid>
# <pubDate>Mon, 21 Aug 2017 14:21:58 -0500</pubDate>
# </item>
trailing_removed = item.split('</item>', 1)[0]
leading_removed = trailing_removed.split('<item>', 1)[-1].strip()
rows = leading_removed.split('\n')
def get_row(key):
return [r for r in rows if r.strip().startswith('<{}>'.format(key))][0]
url = str_between(get_row('link'), '<link>', '</link>')
ts_str = str_between(get_row('pubDate'), '<pubDate>', '</pubDate>')
time = datetime.strptime(ts_str, "%a, %d %b %Y %H:%M:%S %z")
title = str_between(get_row('title'), '<![CDATA[', ']]').strip()
yield Link(
url=htmldecode(url),
timestamp=str(time.timestamp()),
title=htmldecode(title) or None,
tags=None,
sources=[rss_file.name],
)
@enforce_types
def parse_shaarli_rss_export(rss_file: IO[str]) -> Iterable[Link]:
"""Parse Shaarli-specific RSS XML-format files into links"""
rss_file.seek(0)
entries = rss_file.read().split('<entry>')[1:]
for entry in entries:
# example entry:
# <entry>
# <title>Aktuelle Trojaner-Welle: Emotet lauert in gefälschten Rechnungsmails | heise online</title>
# <link href="https://www.heise.de/security/meldung/Aktuelle-Trojaner-Welle-Emotet-lauert-in-gefaelschten-Rechnungsmails-4291268.html" />
# <id>https://demo.shaarli.org/?cEV4vw</id>
# <published>2019-01-30T06:06:01+00:00</published>
# <updated>2019-01-30T06:06:01+00:00</updated>
# <content type="html" xml:lang="en"><![CDATA[<div class="markdown"><p>&#8212; <a href="https://demo.shaarli.org/?cEV4vw">Permalink</a></p></div>]]></content>
# </entry>
trailing_removed = entry.split('</entry>', 1)[0]
leading_removed = trailing_removed.strip()
rows = leading_removed.split('\n')
def get_row(key):
return [r.strip() for r in rows if r.strip().startswith('<{}'.format(key))][0]
title = str_between(get_row('title'), '<title>', '</title>').strip()
url = str_between(get_row('link'), '<link href="', '" />')
ts_str = str_between(get_row('published'), '<published>', '</published>')
time = datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%S%z")
yield Link(
url=htmldecode(url),
timestamp=str(time.timestamp()),
title=htmldecode(title) or None,
tags=None,
sources=[rss_file.name],
)
@enforce_types
def parse_netscape_html_export(html_file: IO[str]) -> Iterable[Link]:
"""Parse netscape-format bookmarks export files (produced by all browsers)"""
html_file.seek(0)
pattern = re.compile("<a href=\"(.+?)\" add_date=\"(\\d+)\"[^>]*>(.+)</a>", re.UNICODE | re.IGNORECASE)
for line in html_file:
# example line
# <DT><A HREF="https://example.com/?q=1+2" ADD_DATE="1497562974" LAST_MODIFIED="1497562974" ICON_URI="https://example.com/favicon.ico" ICON="data:image/png;base64,...">example bookmark title</A>
match = pattern.search(line)
if match:
url = match.group(1)
time = datetime.fromtimestamp(float(match.group(2)))
title = match.group(3).strip()
yield Link(
url=htmldecode(url),
timestamp=str(time.timestamp()),
title=htmldecode(title) or None,
tags=None,
sources=[html_file.name],
)
@enforce_types
def parse_pinboard_rss_export(rss_file: IO[str]) -> Iterable[Link]:
"""Parse Pinboard RSS feed files into links"""
rss_file.seek(0)
root = etree.parse(rss_file).getroot()
items = root.findall("{http://purl.org/rss/1.0/}item")
for item in items:
find = lambda p: item.find(p).text.strip() if item.find(p) is not None else None # type: ignore
url = find("{http://purl.org/rss/1.0/}link")
tags = find("{http://purl.org/dc/elements/1.1/}subject")
title = find("{http://purl.org/rss/1.0/}title")
ts_str = find("{http://purl.org/dc/elements/1.1/}date")
# Pinboard includes a colon in its date stamp timezone offsets, which
# Python can't parse. Remove it:
if ts_str and ts_str[-3:-2] == ":":
ts_str = ts_str[:-3]+ts_str[-2:]
if ts_str:
time = datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%S%z")
else:
time = datetime.now()
yield Link(
url=htmldecode(url),
timestamp=str(time.timestamp()),
title=htmldecode(title) or None,
tags=htmldecode(tags) or None,
sources=[rss_file.name],
)
@enforce_types
def parse_medium_rss_export(rss_file: IO[str]) -> Iterable[Link]:
"""Parse Medium RSS feed files into links"""
rss_file.seek(0)
root = etree.parse(rss_file).getroot()
items = root.find("channel").findall("item") # type: ignore
for item in items:
url = item.find("link").text # type: ignore
title = item.find("title").text.strip() # type: ignore
ts_str = item.find("pubDate").text # type: ignore
time = datetime.strptime(ts_str, "%a, %d %b %Y %H:%M:%S %Z") # type: ignore
yield Link(
url=htmldecode(url),
timestamp=str(time.timestamp()),
title=htmldecode(title) or None,
tags=None,
sources=[rss_file.name],
)
@enforce_types
def parse_plain_text_export(text_file: IO[str]) -> Iterable[Link]:
"""Parse raw links from each line in a text file"""
text_file.seek(0)
for line in text_file.readlines():
urls = re.findall(URL_REGEX, line) if line.strip() else ()
for url in urls: # type: ignore
yield Link(
url=htmldecode(url),
timestamp=str(datetime.now().timestamp()),
title=None,
tags=None,
sources=[text_file.name],
)

View file

@@ -1,89 +0,0 @@
#!/usr/bin/env python3
import re
from argparse import ArgumentParser
from os.path import exists, join
from shutil import rmtree
from typing import List
from .config import ARCHIVE_DIR, OUTPUT_DIR
from .index import (
parse_json_links_index,
write_html_links_index,
write_json_links_index,
)
def cleanup_index(regexes: List[str], proceed: bool, delete: bool) -> None:
if not exists(join(OUTPUT_DIR, 'index.json')):
exit('index.json is missing; nothing to do')
compiled = [re.compile(r) for r in regexes]
links = parse_json_links_index(OUTPUT_DIR)
filtered = []
remaining = []
for link in links:
url = link.url
for r in compiled:
if r.search(url):
filtered.append((link, r))
break
else:
remaining.append(link)
if not filtered:
exit('Search did not match any entries.')
print('Filtered out {}/{} urls:'.format(len(filtered), len(links)))
for link, regex in filtered:
url = link.url
print(' {url} via {regex}'.format(url=url, regex=regex.pattern))
if not proceed:
answer = input('Remove {} entries from index? [y/n] '.format(
len(filtered)))
proceed = answer.strip().lower() in ('y', 'yes')
if not proceed:
exit('Aborted')
write_json_links_index(OUTPUT_DIR, remaining)
write_html_links_index(OUTPUT_DIR, remaining)
if delete:
for link, _ in filtered:
data_dir = join(ARCHIVE_DIR, link['timestamp'])
if exists(data_dir):
rmtree(data_dir)
if __name__ == '__main__':
p = ArgumentParser('Index purging tool')
p.add_argument(
'--regex',
'-r',
action='append',
help='Regular expression matching URLs to purge',
)
p.add_argument(
'--delete',
'-d',
action='store_true',
default=False,
help='Delete webpage files from archive',
)
p.add_argument(
'--yes',
'-y',
action='store_true',
default=False,
help='Do not prompt for confirmation',
)
args = p.parse_args()
if args.regex:
cleanup_index(args.regex, proceed=args.yes, delete=args.delete)
else:
p.print_help()

View file

@@ -1 +0,0 @@
__package__ = 'archivebox.legacy.storage'

archivebox/main.py (new file, 1086 lines)

File diff suppressed because it is too large

View file

@@ -0,0 +1,68 @@
"""
Everything related to parsing links from input sources.
For a list of supported services, see the README.md.
For examples of supported import formats see tests/.
"""
__package__ = 'archivebox.parsers'
from typing import Tuple, List
from ..config import TIMEOUT
from ..util import (
check_url_parsing_invariants,
TimedProgress,
Link,
enforce_types,
)
from .pocket_html import parse_pocket_html_export
from .pinboard_rss import parse_pinboard_rss_export
from .shaarli_rss import parse_shaarli_rss_export
from .medium_rss import parse_medium_rss_export
from .netscape_html import parse_netscape_html_export
from .generic_rss import parse_generic_rss_export
from .generic_json import parse_generic_json_export
from .generic_txt import parse_generic_txt_export
@enforce_types
def parse_links(source_file: str) -> Tuple[List[Link], str]:
"""parse a list of URLs with their metadata from an
RSS feed, bookmarks export, or text file
"""
check_url_parsing_invariants()
PARSERS = (
# Specialized parsers
('Pocket HTML', parse_pocket_html_export),
('Pinboard RSS', parse_pinboard_rss_export),
('Shaarli RSS', parse_shaarli_rss_export),
('Medium RSS', parse_medium_rss_export),
# General parsers
('Netscape HTML', parse_netscape_html_export),
('Generic RSS', parse_generic_rss_export),
('Generic JSON', parse_generic_json_export),
# Fallback parser
('Plain Text', parse_generic_txt_export),
)
timer = TimedProgress(TIMEOUT * 4)
with open(source_file, 'r', encoding='utf-8') as file:
for parser_name, parser_func in PARSERS:
try:
links = list(parser_func(file))
if links:
timer.end()
return links, parser_name
except Exception as err: # noqa
# Parsers are tried one by one down the list, and the first one
# that succeeds is used. To see why a certain parser was not used
# due to error or format incompatibility, uncomment this line:
# print('[!] Parser {} failed: {} {}'.format(parser_name, err.__class__.__name__, err))
pass
timer.end()
return [], 'Failed to parse'
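# Illustrative usage sketch (not part of the diff; the source path is hypothetical).
# Parsers are tried in PARSERS order and the first one that yields any links wins:
#
#     links, parser_name = parse_links('output/sources/stdin-1556400000.txt')
#     print('Parsed {} links using the {} parser'.format(len(links), parser_name))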

View file

@@ -0,0 +1,65 @@
__package__ = 'archivebox.parsers'
import json
from typing import IO, Iterable
from datetime import datetime
from ..index.schema import Link
from ..util import (
htmldecode,
enforce_types,
)
@enforce_types
def parse_generic_json_export(json_file: IO[str]) -> Iterable[Link]:
"""Parse JSON-format bookmarks export files (produced by pinboard.in/export/, or wallabag)"""
json_file.seek(0)
links = json.load(json_file)
json_date = lambda s: datetime.strptime(s, '%Y-%m-%dT%H:%M:%S%z')
for link in links:
# example line
# {"href":"http:\/\/www.reddit.com\/r\/example","description":"title here","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"reddit android"}]
if link:
# Parse URL
url = link.get('href') or link.get('url') or link.get('URL')
if not url:
raise Exception('JSON must contain URL in each entry [{"url": "http://...", ...}, ...]')
# Parse the timestamp
ts_str = str(datetime.now().timestamp())
if link.get('timestamp'):
# chrome/ff histories use a very precise timestamp
ts_str = str(link['timestamp'] / 10000000)
elif link.get('time'):
ts_str = str(json_date(link['time'].split(',', 1)[0]).timestamp())
elif link.get('created_at'):
ts_str = str(json_date(link['created_at']).timestamp())
elif link.get('created'):
ts_str = str(json_date(link['created']).timestamp())
elif link.get('date'):
ts_str = str(json_date(link['date']).timestamp())
elif link.get('bookmarked'):
ts_str = str(json_date(link['bookmarked']).timestamp())
elif link.get('saved'):
ts_str = str(json_date(link['saved']).timestamp())
# Parse the title
title = None
if link.get('title'):
title = link['title'].strip()
elif link.get('description'):
title = link['description'].replace(' — Readability', '').strip()
elif link.get('name'):
title = link['name'].strip()
yield Link(
url=htmldecode(url),
timestamp=ts_str,
title=htmldecode(title) or None,
tags=htmldecode(link.get('tags')) or '',
sources=[json_file.name],
)
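# Illustrative example of an accepted input file (not part of the diff). Each entry
# needs some URL key (href/url/URL); title/description/name and tags are optional,
# and the timestamp falls back to datetime.now() when no time-like key is present:
#
#     [
#         {"href": "https://example.com", "description": "Example title", "tags": "example demo"}
#     ]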

View file

@@ -0,0 +1,49 @@
__package__ = 'archivebox.parsers'
from typing import IO, Iterable
from datetime import datetime
from ..index.schema import Link
from ..util import (
htmldecode,
enforce_types,
str_between,
)
@enforce_types
def parse_generic_rss_export(rss_file: IO[str]) -> Iterable[Link]:
"""Parse RSS XML-format files into links"""
rss_file.seek(0)
items = rss_file.read().split('<item>')
items = items[1:] if items else []
for item in items:
# example item:
# <item>
# <title><![CDATA[How JavaScript works: inside the V8 engine]]></title>
# <category>Unread</category>
# <link>https://blog.sessionstack.com/how-javascript-works-inside</link>
# <guid>https://blog.sessionstack.com/how-javascript-works-inside</guid>
# <pubDate>Mon, 21 Aug 2017 14:21:58 -0500</pubDate>
# </item>
trailing_removed = item.split('</item>', 1)[0]
leading_removed = trailing_removed.split('<item>', 1)[-1].strip()
rows = leading_removed.split('\n')
def get_row(key):
return [r for r in rows if r.strip().startswith('<{}>'.format(key))][0]
url = str_between(get_row('link'), '<link>', '</link>')
ts_str = str_between(get_row('pubDate'), '<pubDate>', '</pubDate>')
time = datetime.strptime(ts_str, "%a, %d %b %Y %H:%M:%S %z")
title = str_between(get_row('title'), '<![CDATA[', ']]').strip()
yield Link(
url=htmldecode(url),
timestamp=str(time.timestamp()),
title=htmldecode(title) or None,
tags=None,
sources=[rss_file.name],
)

View file

@@ -0,0 +1,30 @@
__package__ = 'archivebox.parsers'
__description__ = 'Plain Text'
import re
from typing import IO, Iterable
from datetime import datetime
from ..index.schema import Link
from ..util import (
htmldecode,
enforce_types,
URL_REGEX
)
@enforce_types
def parse_generic_txt_export(text_file: IO[str]) -> Iterable[Link]:
"""Parse raw links from each line in a text file"""
text_file.seek(0)
for line in text_file.readlines():
urls = re.findall(URL_REGEX, line) if line.strip() else ()
for url in urls: # type: ignore
yield Link(
url=htmldecode(url),
timestamp=str(datetime.now().timestamp()),
title=None,
tags=None,
sources=[text_file.name],
)

View file

@@ -0,0 +1,35 @@
__package__ = 'archivebox.parsers'
from typing import IO, Iterable
from datetime import datetime
from xml.etree import ElementTree
from ..index.schema import Link
from ..util import (
htmldecode,
enforce_types,
)
@enforce_types
def parse_medium_rss_export(rss_file: IO[str]) -> Iterable[Link]:
"""Parse Medium RSS feed files into links"""
rss_file.seek(0)
root = ElementTree.parse(rss_file).getroot()
items = root.find("channel").findall("item") # type: ignore
for item in items:
url = item.find("link").text # type: ignore
title = item.find("title").text.strip() # type: ignore
ts_str = item.find("pubDate").text # type: ignore
time = datetime.strptime(ts_str, "%a, %d %b %Y %H:%M:%S %Z") # type: ignore
yield Link(
url=htmldecode(url),
timestamp=str(time.timestamp()),
title=htmldecode(title) or None,
tags=None,
sources=[rss_file.name],
)

View file

@@ -0,0 +1,39 @@
__package__ = 'archivebox.parsers'
import re
from typing import IO, Iterable
from datetime import datetime
from ..index.schema import Link
from ..util import (
htmldecode,
enforce_types,
)
@enforce_types
def parse_netscape_html_export(html_file: IO[str]) -> Iterable[Link]:
"""Parse netscape-format bookmarks export files (produced by all browsers)"""
html_file.seek(0)
pattern = re.compile("<a href=\"(.+?)\" add_date=\"(\\d+)\"[^>]*>(.+)</a>", re.UNICODE | re.IGNORECASE)
for line in html_file:
# example line
# <DT><A HREF="https://example.com/?q=1+2" ADD_DATE="1497562974" LAST_MODIFIED="1497562974" ICON_URI="https://example.com/favicon.ico" ICON="data:image/png;base64,...">example bookmark title</A>
match = pattern.search(line)
if match:
url = match.group(1)
time = datetime.fromtimestamp(float(match.group(2)))
title = match.group(3).strip()
yield Link(
url=htmldecode(url),
timestamp=str(time.timestamp()),
title=htmldecode(title) or None,
tags=None,
sources=[html_file.name],
)

View file

@@ -0,0 +1,47 @@
__package__ = 'archivebox.parsers'
from typing import IO, Iterable
from datetime import datetime
from xml.etree import ElementTree
from ..index.schema import Link
from ..util import (
htmldecode,
enforce_types,
)
@enforce_types
def parse_pinboard_rss_export(rss_file: IO[str]) -> Iterable[Link]:
"""Parse Pinboard RSS feed files into links"""
rss_file.seek(0)
root = ElementTree.parse(rss_file).getroot()
items = root.findall("{http://purl.org/rss/1.0/}item")
for item in items:
find = lambda p: item.find(p).text.strip() if item.find(p) is not None else None # type: ignore
url = find("{http://purl.org/rss/1.0/}link")
tags = find("{http://purl.org/dc/elements/1.1/}subject")
title = find("{http://purl.org/rss/1.0/}title")
ts_str = find("{http://purl.org/dc/elements/1.1/}date")
# Pinboard includes a colon in its date stamp timezone offsets, which
# Python can't parse. Remove it:
if ts_str and ts_str[-3:-2] == ":":
ts_str = ts_str[:-3]+ts_str[-2:]
if ts_str:
time = datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%S%z")
else:
time = datetime.now()
yield Link(
url=htmldecode(url),
timestamp=str(time.timestamp()),
title=htmldecode(title) or None,
tags=htmldecode(tags) or None,
sources=[rss_file.name],
)

View file

@@ -0,0 +1,38 @@
__package__ = 'archivebox.parsers'
import re
from typing import IO, Iterable
from datetime import datetime
from ..index.schema import Link
from ..util import (
htmldecode,
enforce_types,
)
@enforce_types
def parse_pocket_html_export(html_file: IO[str]) -> Iterable[Link]:
"""Parse Pocket-format bookmarks export files (produced by getpocket.com/export/)"""
html_file.seek(0)
pattern = re.compile("^\\s*<li><a href=\"(.+)\" time_added=\"(\\d+)\" tags=\"(.*)\">(.+)</a></li>", re.UNICODE)
for line in html_file:
# example line
# <li><a href="http://example.com/" time_added="1478739709" tags="tag1,tag2">example title</a></li>
match = pattern.search(line)
if match:
url = match.group(1).replace('http://www.readability.com/read?url=', '') # remove old readability prefixes to get original url
time = datetime.fromtimestamp(float(match.group(2)))
tags = match.group(3)
title = match.group(4).replace(' — Readability', '').replace('http://www.readability.com/read?url=', '')
yield Link(
url=htmldecode(url),
timestamp=str(time.timestamp()),
title=htmldecode(title) or None,
tags=tags or '',
sources=[html_file.name],
)

View file

@@ -0,0 +1,50 @@
__package__ = 'archivebox.parsers'
from typing import IO, Iterable
from datetime import datetime
from ..index.schema import Link
from ..util import (
htmldecode,
enforce_types,
str_between,
)
@enforce_types
def parse_shaarli_rss_export(rss_file: IO[str]) -> Iterable[Link]:
"""Parse Shaarli-specific RSS XML-format files into links"""
rss_file.seek(0)
entries = rss_file.read().split('<entry>')[1:]
for entry in entries:
# example entry:
# <entry>
# <title>Aktuelle Trojaner-Welle: Emotet lauert in gefälschten Rechnungsmails | heise online</title>
# <link href="https://www.heise.de/security/meldung/Aktuelle-Trojaner-Welle-Emotet-lauert-in-gefaelschten-Rechnungsmails-4291268.html" />
# <id>https://demo.shaarli.org/?cEV4vw</id>
# <published>2019-01-30T06:06:01+00:00</published>
# <updated>2019-01-30T06:06:01+00:00</updated>
# <content type="html" xml:lang="en"><![CDATA[<div class="markdown"><p>&#8212; <a href="https://demo.shaarli.org/?cEV4vw">Permalink</a></p></div>]]></content>
# </entry>
trailing_removed = entry.split('</entry>', 1)[0]
leading_removed = trailing_removed.strip()
rows = leading_removed.split('\n')
def get_row(key):
return [r.strip() for r in rows if r.strip().startswith('<{}'.format(key))][0]
title = str_between(get_row('title'), '<title>', '</title>').strip()
url = str_between(get_row('link'), '<link href="', '" />')
ts_str = str_between(get_row('published'), '<published>', '</published>')
time = datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%S%z")
yield Link(
url=htmldecode(url),
timestamp=str(time.timestamp()),
title=htmldecode(title) or None,
tags=None,
sources=[rss_file.name],
)

View file

7 binary image assets moved unchanged (before/after sizes identical: 15 KiB, 17 KiB, 1.6 KiB, 158 B, 201 B, 157 B, 11 KiB)

View file

@@ -1,6 +1,7 @@
import os
import re
import sys
import ssl
import json
import time
import shutil
@@ -8,7 +9,7 @@ import argparse
from string import Template
from json import JSONEncoder
from typing import List, Optional, Any, Union, IO, Mapping, Tuple
from typing import List, Dict, Optional, Any, Union, IO, Mapping, Tuple
from inspect import signature
from functools import wraps
from hashlib import sha256
@@ -28,11 +29,12 @@ from subprocess import (
from base32_crockford import encode as base32_encode # type: ignore
from .schema import Link
from .index.schema import Link
from .config import (
ANSI,
TERM_WIDTH,
SOURCES_DIR,
OUTPUT_DIR,
SOURCES_DIR_NAME,
OUTPUT_PERMISSIONS,
TIMEOUT,
SHOW_PROGRESS,
@@ -40,8 +42,9 @@ from .config import (
CHECK_SSL_VALIDITY,
WGET_USER_AGENT,
CHROME_OPTIONS,
check_data_folder,
)
from .logs import pretty_path
from .cli.logging import pretty_path
### Parsing Helpers
@@ -187,31 +190,36 @@ def check_url_parsing_invariants() -> None:
### Random Helpers
@enforce_types
def handle_stdin_import(raw_text: str) -> str:
if not os.path.exists(SOURCES_DIR):
os.makedirs(SOURCES_DIR)
def save_stdin_to_sources(raw_text: str, out_dir: str=OUTPUT_DIR) -> str:
check_data_folder(out_dir=out_dir)
sources_dir = os.path.join(out_dir, SOURCES_DIR_NAME)
if not os.path.exists(sources_dir):
os.makedirs(sources_dir)
ts = str(datetime.now().timestamp()).split('.', 1)[0]
source_path = os.path.join(SOURCES_DIR, '{}-{}.txt'.format('stdin', ts))
source_path = os.path.join(sources_dir, '{}-{}.txt'.format('stdin', ts))
atomic_write(raw_text, source_path)
return source_path
@enforce_types
def handle_file_import(path: str, timeout: int=TIMEOUT) -> str:
def save_file_to_sources(path: str, timeout: int=TIMEOUT, out_dir: str=OUTPUT_DIR) -> str:
"""download a given url's content into output/sources/domain-<timestamp>.txt"""
check_data_folder(out_dir=out_dir)
if not os.path.exists(SOURCES_DIR):
os.makedirs(SOURCES_DIR)
sources_dir = os.path.join(out_dir, SOURCES_DIR_NAME)
if not os.path.exists(sources_dir):
os.makedirs(sources_dir)
ts = str(datetime.now().timestamp()).split('.', 1)[0]
source_path = os.path.join(SOURCES_DIR, '{}-{}.txt'.format(basename(path), ts))
source_path = os.path.join(sources_dir, '{}-{}.txt'.format(basename(path), ts))
if any(path.startswith(s) for s in ('http://', 'https://', 'ftp://')):
source_path = os.path.join(SOURCES_DIR, '{}-{}.txt'.format(domain(path), ts))
source_path = os.path.join(sources_dir, '{}-{}.txt'.format(domain(path), ts))
print('{}[*] [{}] Downloading {}{}'.format(
ANSI['green'],
datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
@@ -532,7 +540,6 @@ def download_url(url: str, timeout: int=TIMEOUT) -> str:
if CHECK_SSL_VALIDITY:
resp = urlopen(req, timeout=timeout)
else:
import ssl
insecure = ssl._create_unverified_context()
resp = urlopen(req, timeout=timeout, context=insecure)
@@ -662,7 +669,7 @@ def to_json(obj: Any, file: IO=None, indent: Optional[int]=4, sort_keys: bool=Tr
return json.dumps(obj, indent=indent, sort_keys=sort_keys, cls=ExtendedEncoder)
def to_csv(links: List[Link], csv_cols: Optional[List[str]]=None,
def links_to_csv(links: List[Link], csv_cols: Optional[List[str]]=None,
header: bool=True, ljust: int=0, separator: str=',') -> str:
csv_cols = csv_cols or ['timestamp', 'is_archived', 'url']
@@ -677,6 +684,8 @@ def to_csv(links: List[Link], csv_cols: Optional[List[str]]=None,
return '\n'.join((header_str, *row_strs))
def folders_to_str(folders: Dict[str, Optional[Link]]) -> str:
return '\n'.join(f'{folder} {link}' for folder, link in folders.items())
@enforce_types
def render_template(template_path: str, context: Mapping[str, str]) -> str:
@@ -713,11 +722,11 @@ def atomic_write(contents: Union[dict, str, bytes], path: str) -> None:
os.remove(tmp_file)
def reject_stdin(caller: str) -> None:
def reject_stdin(caller: str, stdin: Optional[IO]=sys.stdin) -> None:
"""Tell the user they passed stdin to a command that doesn't accept it"""
if not sys.stdin.isatty():
stdin_raw_text = sys.stdin.read().strip()
if stdin and not stdin.isatty():
stdin_raw_text = stdin.read().strip()
if stdin_raw_text:
print(
'{red}[X] The "{}" command does not accept stdin.{reset}\n'.format(
@@ -731,9 +740,30 @@ def reject_stdin(caller: str) -> None:
print()
raise SystemExit(1)
def accept_stdin(stdin: Optional[IO]=sys.stdin) -> Optional[str]:
if stdin and not stdin.isatty():
return stdin.read()
return None
def set_docstring(text: str):
def decorator(func):
@wraps(func)
def wrapper_with_docstring(*args, **kwargs):
return func(*args, **kwargs)
wrapper_with_docstring.__doc__ = text
return wrapper_with_docstring
return decorator
class SmartFormatter(argparse.HelpFormatter):
def _split_lines(self, text, width):
if '\n' in text:
return text.splitlines()
return argparse.HelpFormatter._split_lines(self, text, width)
class ArchiveError(Exception):
def __init__(self, message, hints=None):
super().__init__(message)
self.hints = hints