ArchiveBox/archivebox/cli/archivebox_schedule.py

#!/usr/bin/env python3

__package__ = 'archivebox.cli'
__command__ = 'archivebox schedule'
__description__ = 'Set ArchiveBox to regularly import URLs at specific times using cron'

import os
import shlex
import sys
import argparse

from datetime import datetime
from crontab import CronTab, CronSlices


from ..legacy.util import reject_stdin
from ..legacy.config import (
    OUTPUT_DIR,
    LOGS_DIR,
    ARCHIVEBOX_BINARY,
    USER,
    ANSI,
    stderr,
    check_data_folder,
)


CRON_COMMENT = 'archivebox_schedule'


def _build_parser() -> argparse.ArgumentParser:
    """Build the argument parser for the `archivebox schedule` subcommand."""
    parser = argparse.ArgumentParser(
        prog=__command__,
        description=__description__,
        add_help=True,
    )
    parser.add_argument(
        '--quiet', '-q',
        action='store_true',
        help=("Don't warn about storage space."),
    )
    # The action flags below are mutually exclusive; --every, --quiet and
    # import_path are deliberately kept outside the group.
    group = parser.add_mutually_exclusive_group()
    group.add_argument(
        '--add', # '-a',
        action='store_true',
        help='Add a new scheduled ArchiveBox update job to cron',
    )
    parser.add_argument(
        '--every', # '-e',
        type=str,
        default='day',
        help='Run ArchiveBox once every [timeperiod] (hour/day/week/month/year or cron format e.g. "0 0 * * *")',
    )
    group.add_argument(
        '--clear', # '-c'
        action='store_true',
        help=("Stop all ArchiveBox scheduled runs, clear it completely from cron"),
    )
    group.add_argument(
        '--show', # '-s'
        action='store_true',
        help=("Print a list of currently active ArchiveBox cron jobs"),
    )
    group.add_argument(
        '--foreground', '-f',
        action='store_true',
        help=("Launch ArchiveBox as a long-running foreground task "
              "instead of using cron."),
    )
    group.add_argument(
        '--run-all', # '-a',
        action='store_true',
        help='Run all the scheduled jobs once immediately, independent of their configured schedules',
    )
    parser.add_argument(
        'import_path',
        nargs='?',
        type=str,
        default=None,
        help=("Check this path and import any new links on every run "
              "(can be either local file or remote URL)"),
    )
    return parser


def _build_job_command(import_path=None) -> str:
    """Return the shell command line that a scheduled cron job should execute.

    The job changes into OUTPUT_DIR and runs `archivebox add <import_path>`
    when an import path is given, or `archivebox update` otherwise, with both
    stdout and stderr redirected into LOGS_DIR/archivebox.log.
    """
    action = ('add', shlex.quote(import_path)) if import_path else ('update',)
    log_file = os.path.join(LOGS_DIR, 'archivebox.log')
    # NOTE(review): cron itself treats an unescaped '%' in a command as a
    # newline; URLs containing '%' may need extra escaping -- confirm how
    # python-crontab renders them.
    return ' '.join([
        'cd',
        shlex.quote(OUTPUT_DIR),
        '&&',
        shlex.quote(ARCHIVEBOX_BINARY),
        *action,
        # The file redirect has to come before `2>&1`; the reverse order
        # points stderr at the original stdout instead of the log file.
        '>',
        shlex.quote(log_file),
        '2>&1',
    ])


def main(args=None):
    """Entry point for `archivebox schedule`.

    Depending on the parsed flags this either runs the already-scheduled jobs
    (--foreground / --run-all), lists them (--show), removes them (--clear),
    or adds a new cron job that runs on the --every schedule (the default,
    because --every defaults to 'day').

    Args:
        args: list of command-line arguments; defaults to sys.argv[1:].

    Raises:
        SystemExit: with code 0 after --show, --clear or a successful add,
            and with code 1 on invalid usage or when a foreground run is
            interrupted with Ctrl-C.
    """
    check_data_folder()

    args = sys.argv[1:] if args is None else args
    command = _build_parser().parse_args(args)
    reject_stdin(__command__)

    os.makedirs(LOGS_DIR, exist_ok=True)

    cron = CronTab(user=True)
    cron = dedupe_jobs(cron)

    existing_jobs = list(cron.find_comment(CRON_COMMENT))
    if command.foreground or command.run_all:
        # Foreground modes only execute jobs that were scheduled earlier, so
        # passing an import_path here (or having no jobs at all) is an error.
        if command.import_path or (not existing_jobs):
            stderr('{red}[X] You must schedule some jobs first before running in foreground mode.{reset}'.format(**ANSI))
            stderr('    archivebox schedule --every=hour https://example.com/some/rss/feed.xml')
            raise SystemExit(1)
        print('{green}[*] Running {} ArchiveBox jobs in foreground task scheduler...{reset}'.format(len(existing_jobs), **ANSI))
        if command.run_all:
            try:
                for job in existing_jobs:
                    sys.stdout.write(f'  > {job.command}')
                    sys.stdout.flush()
                    job.run()
                    # '\r' rewrites the pending line with a check mark.
                    sys.stdout.write(f'\r  √ {job.command}\n')
            except KeyboardInterrupt:
                print('\n{green}[√] Stopped.{reset}'.format(**ANSI))
                raise SystemExit(1)
        if command.foreground:
            try:
                for result in cron.run_scheduler():
                    print(result)
            except KeyboardInterrupt:
                print('\n{green}[√] Stopped.{reset}'.format(**ANSI))
                raise SystemExit(1)

    elif command.show:
        if existing_jobs:
            print('\n'.join(str(cmd) for cmd in existing_jobs))
        else:
            stderr('{red}[X] There are no ArchiveBox cron jobs scheduled for your user ({}).{reset}'.format(USER, **ANSI))
            stderr('    To schedule a new job, run:')
            stderr('        archivebox schedule --every=[timeperiod] https://example.com/some/rss/feed.xml')
        raise SystemExit(0)

    elif command.clear:
        print(cron.remove_all(comment=CRON_COMMENT))
        cron.write()
        raise SystemExit(0)

    elif command.every:
        new_job = cron.new(
            command=_build_job_command(command.import_path),
            comment=CRON_COMMENT,
        )

        if command.every == 'week':
            # Expressed with the cron special instead of job.every().week,
            # which python-crontab's every() helper does not appear to offer.
            new_job.setall('@weekly')
        elif command.every in ('minute', 'hour', 'day', 'month', 'year'):
            set_every = getattr(new_job.every(), command.every)
            set_every()
        elif CronSlices.is_valid(command.every):
            new_job.setall(command.every)
        else:
            # Nothing has been written to the crontab yet, so bailing out
            # here leaves the user's cron config untouched.
            stderr('{red}[X] Got invalid timeperiod for cron task.{reset}'.format(**ANSI))
            stderr('    It must be one of minute/hour/day/week/month/year')
            stderr('    or a quoted cron-format schedule like:')
            stderr('        archivebox schedule --every=day https://example.com/some/rss/feed.xml')
            stderr('        archivebox schedule --every="*/5 * * * *" https://example.com/some/rss/feed.xml')
            raise SystemExit(1)

        cron = dedupe_jobs(cron)
        cron.write()

        existing_jobs = list(cron.find_comment(CRON_COMMENT))
        # Only ArchiveBox's own jobs count towards the estimated number of
        # ArchiveBox runs, not whatever else lives in the user's crontab.
        total_runs = sum(job.frequency_per_year() for job in existing_jobs)

        print()
        print('{green}[√] Scheduled new ArchiveBox cron job for user: {} ({} jobs are active).{reset}'.format(USER, len(existing_jobs), **ANSI))
        print('\n'.join(f'  > {cmd}' if str(cmd) == str(new_job) else f'    {cmd}' for cmd in existing_jobs))
        if total_runs > 60 and not command.quiet:
            stderr()
            stderr('{lightyellow}[!] With the current cron config, ArchiveBox is estimated to run >{} times per year.{reset}'.format(total_runs, **ANSI))
            stderr(f'    Congrats on being an enthusiastic internet archiver! 👌')
            stderr()
            stderr('    Make sure you have enough storage space available to hold all the data.')
            stderr('    Using a compressed/deduped filesystem like ZFS is recommended if you plan on archiving a lot.')
        raise SystemExit(0)


def dedupe_jobs(cron: CronTab) -> CronTab:
    """Remove duplicate ArchiveBox jobs from the given crontab, in place.

    Two jobs are duplicates when they share the same schedule and the same
    command. Only jobs tagged with CRON_COMMENT are examined: any other
    entries in the user's crontab are left exactly as they are, so they are
    never relabelled as ArchiveBox jobs (which would make `--clear` delete
    them) and never re-enabled behind the user's back.

    Args:
        cron: the crontab to clean up; it is modified but not written to disk.

    Returns:
        The same CronTab object, for call-chaining convenience.
    """
    seen = set()
    # Snapshot the matches first: jobs are removed while walking them.
    for job in list(cron.find_comment(CRON_COMMENT)):
        unique_tuple = (str(job.slices), job.command)
        if unique_tuple in seen:
            # A later copy of a job that was already kept -- drop it.
            cron.remove(job)
            continue
        seen.add(unique_tuple)
        # Surviving ArchiveBox jobs are (re-)enabled, as they always were.
        job.enable()

    return cron


if __name__ == '__main__':
    main()
add pipenv, schedule cmd, logs dir, and lots more 2019-04-19 13:09:54 +12:00			`#!/usr/bin/env python3`

			`__package__ = 'archivebox.cli'`
			`__command__ = 'archivebox schedule'`
update docstrings and comments 2019-04-23 06:42:04 +12:00			`__description__ = 'Set ArchiveBox to regularly import URLs at specific times using cron'`
add pipenv, schedule cmd, logs dir, and lots more 2019-04-19 13:09:54 +12:00
			`import os`
			`import sys`
			`import argparse`

			`from datetime import datetime`
			`from crontab import CronTab, CronSlices`


			`from ..legacy.util import reject_stdin`
			`from ..legacy.config import (`
			`OUTPUT_DIR,`
			`LOGS_DIR,`
			`ARCHIVEBOX_BINARY,`
			`USER,`
			`ANSI,`
			`stderr,`
check for data folder when running most subcommands 2019-04-23 11:06:48 +12:00			`check_data_folder,`
add pipenv, schedule cmd, logs dir, and lots more 2019-04-19 13:09:54 +12:00			`)`


			`CRON_COMMENT = 'archivebox_schedule'`


			`def main(args=None):`
check for data folder when running most subcommands 2019-04-23 11:06:48 +12:00			`check_data_folder()`

add pipenv, schedule cmd, logs dir, and lots more 2019-04-19 13:09:54 +12:00			`args = sys.argv[1:] if args is None else args`

			`parser = argparse.ArgumentParser(`
			`prog=__command__,`
			`description=__description__,`
			`add_help=True,`
			`)`
			`parser.add_argument(`
			`'--quiet', '-q',`
			`action='store_true',`
			`help=("Don't warn about storage space."),`
			`)`
			`group = parser.add_mutually_exclusive_group()`
			`group.add_argument(`
			`'--add', # '-a',`
			`action='store_true',`
			`help='Add a new scheduled ArchiveBox update job to cron',`
			`)`
			`parser.add_argument(`
			`'--every', # '-e',`
			`type=str,`
fix bad default in scheduler 2019-04-23 05:21:08 +12:00			`default='day',`
add pipenv, schedule cmd, logs dir, and lots more 2019-04-19 13:09:54 +12:00			`help='Run ArchiveBox once every [timeperiod] (hour/day/week/month/year or cron format e.g. "0 0 * * *")',`
			`)`
			`group.add_argument(`
			`'--clear', # '-c'`
			`action='store_true',`
			`help=("Stop all ArchiveBox scheduled runs, clear it completely from cron"),`
			`)`
			`group.add_argument(`
			`'--show', # '-s'`
			`action='store_true',`
			`help=("Print a list of currently active ArchiveBox cron jobs"),`
			`)`
			`group.add_argument(`
			`'--foreground', '-f',`
			`action='store_true',`
			`help=("Launch ArchiveBox as a long-running foreground task "`
			`"instead of using cron."),`
			`)`
			`group.add_argument(`
			`'--run-all', # '-a',`
			`action='store_true',`
			`help='Run all the scheduled jobs once immediately, independent of their configured schedules',`
			`)`
			`parser.add_argument(`
			`'import_path',`
			`nargs='?',`
			`type=str,`
			`default=None,`
			`help=("Check this path and import any new links on every run "`
			`"(can be either local file or remote URL)"),`
			`)`
			`command = parser.parse_args(args)`
			`reject_stdin(__command__)`

			`os.makedirs(LOGS_DIR, exist_ok=True)`

			`cron = CronTab(user=True)`
			`cron = dedupe_jobs(cron)`

			`existing_jobs = list(cron.find_comment(CRON_COMMENT))`
			`if command.foreground or command.run_all:`
			`if command.import_path or (not existing_jobs):`
			`stderr('{red}[X] You must schedule some jobs first before running in foreground mode.{reset}'.format(**ANSI))`
			`stderr(' archivebox schedule --every=hour https://example.com/some/rss/feed.xml')`
			`raise SystemExit(1)`
			`print('{green}[] Running {} ArchiveBox jobs in foreground task scheduler...{reset}'.format(len(existing_jobs), *ANSI))`
			`if command.run_all:`
			`try:`
			`for job in existing_jobs:`
			`sys.stdout.write(f' > {job.command}')`
			`sys.stdout.flush()`
			`job.run()`
			`sys.stdout.write(f'\r √ {job.command}\n')`
			`except KeyboardInterrupt:`
			`print('\n{green}[√] Stopped.{reset}'.format(**ANSI))`
			`raise SystemExit(1)`
			`if command.foreground:`
			`try:`
			`for result in cron.run_scheduler():`
			`print(result)`
			`except KeyboardInterrupt:`
			`print('\n{green}[√] Stopped.{reset}'.format(**ANSI))`
			`raise SystemExit(1)`

			`elif command.show:`
			`if existing_jobs:`
			`print('\n'.join(str(cmd) for cmd in existing_jobs))`
			`else:`
			`stderr('{red}[X] There are no ArchiveBox cron jobs scheduled for your user ({}).{reset}'.format(USER, **ANSI))`
			`stderr(' To schedule a new job, run:')`
			`stderr(' archivebox schedule --every=[timeperiod] https://example.com/some/rss/feed.xml')`
			`raise SystemExit(0)`

			`elif command.clear:`
			`print(cron.remove_all(comment=CRON_COMMENT))`
			`cron.write()`
			`raise SystemExit(0)`

			`elif command.every:`
			`quoted = lambda s: f'"{s}"' if s and ' ' in s else s`
			`cmd = [`
			`'cd',`
			`quoted(OUTPUT_DIR),`
			`'&&',`
			`quoted(ARCHIVEBOX_BINARY),`
			`*(('add', f'"{command.import_path}"',) if command.import_path else ('update',)),`
			`'2>&1',`
			`'>',`
			`quoted(os.path.join(LOGS_DIR, 'archivebox.log')),`

			`]`
			`new_job = cron.new(command=' '.join(cmd), comment=CRON_COMMENT)`

			`if command.every in ('minute', 'hour', 'day', 'week', 'month', 'year'):`
			`set_every = getattr(new_job.every(), command.every)`
			`set_every()`
			`elif CronSlices.is_valid(command.every):`
			`new_job.setall(command.every)`
			`else:`
			`stderr('{red}[X] Got invalid timeperiod for cron task.{reset}'.format(**ANSI))`
			`stderr(' It must be one of minute/hour/day/week/month')`
			`stderr(' or a quoted cron-format schedule like:')`
			`stderr(' archivebox init --every=day https://example.com/some/rss/feed.xml')`
			`stderr(' archivebox init --every="0/5 * * * *" https://example.com/some/rss/feed.xml')`
			`raise SystemExit(1)`

			`cron = dedupe_jobs(cron)`
			`cron.write()`

			`total_runs = sum(j.frequency_per_year() for j in cron)`
			`existing_jobs = list(cron.find_comment(CRON_COMMENT))`

			`print()`
			`print('{green}[√] Scheduled new ArchiveBox cron job for user: {} ({} jobs are active).{reset}'.format(USER, len(existing_jobs), **ANSI))`
			`print('\n'.join(f' > {cmd}' if str(cmd) == str(new_job) else f' {cmd}' for cmd in existing_jobs))`
			`if total_runs > 60 and not command.quiet:`
			`stderr()`
			`stderr('{lightyellow}[!] With the current cron config, ArchiveBox is estimated to run >{} times per year.{reset}'.format(total_runs, **ANSI))`
			`stderr(f' Congrats on being an enthusiastic internet archiver! 👌')`
			`stderr()`
			`stderr(' Make sure you have enough storage space available to hold all the data.')`
			`stderr(' Using a compressed/deduped filesystem like ZFS is recommended if you plan on archiving a lot.')`
			`raise SystemExit(0)`


			`def dedupe_jobs(cron: CronTab) -> CronTab:`
			`deduped = set()`
			`for job in list(cron):`
			`unique_tuple = (str(job.slices), job.command)`
			`if unique_tuple not in deduped:`
			`deduped.add(unique_tuple)`
			`cron.remove(job)`

			`for schedule, command in deduped:`
			`job = cron.new(command=command, comment=CRON_COMMENT)`
			`job.setall(schedule)`
			`job.enable()`

			`return cron`


			`if __name__ == '__main__':`
			`main()`