#!/usr/bin/env python3
"""
ArchiveBox command line application.

./archive and ./bin/archivebox both point to this file,
but you can also run it directly using `python3 archive.py`

Usage & Documentation:
    https://github.com/pirate/ArchiveBox/Wiki
"""
2019-03-28 08:13:47 +13:00
__package__ = ' archivebox '
2019-02-22 11:45:28 +13:00
2019-03-21 18:28:12 +13:00
import os
import sys
2019-03-28 09:44:00 +13:00
import shutil
2019-03-28 08:35:13 +13:00
2019-03-27 12:21:34 +13:00
from typing import List , Optional
2019-03-26 22:33:34 +13:00
2019-03-28 08:35:13 +13:00
from . schema import Link
from . links import links_after_timestamp
from . index import write_links_index , load_links_index
from . archive_methods import archive_link
from . config import (
2018-10-20 08:28:38 +13:00
ONLY_NEW ,
2018-06-11 12:52:15 +12:00
OUTPUT_DIR ,
2019-03-28 08:35:13 +13:00
VERSION ,
2019-03-28 09:44:00 +13:00
ANSI ,
CURL_VERSION ,
GIT_VERSION ,
WGET_VERSION ,
YOUTUBEDL_VERSION ,
CHROME_VERSION ,
USE_CURL ,
USE_WGET ,
USE_CHROME ,
CURL_BINARY ,
GIT_BINARY ,
WGET_BINARY ,
YOUTUBEDL_BINARY ,
CHROME_BINARY ,
FETCH_GIT ,
FETCH_MEDIA ,
2017-10-19 11:38:17 +13:00
)
2019-03-28 08:35:13 +13:00
from . util import (
2019-03-27 16:25:07 +13:00
enforce_types ,
2019-02-27 22:48:38 +13:00
save_remote_source ,
2019-02-22 11:54:44 +13:00
save_stdin_source ,
2019-03-21 18:28:12 +13:00
)
2019-03-28 08:35:13 +13:00
from . logs import (
2019-03-21 18:28:12 +13:00
log_archiving_started ,
log_archiving_paused ,
log_archiving_finished ,
2017-07-04 22:38:07 +12:00
)
2017-06-16 10:33:01 +12:00
2018-06-11 14:02:33 +12:00
# Package metadata exposed as module-level dunder constants.
__AUTHOR__ = 'Nick Sweeting <git@nicksweeting.com>'
__VERSION__ = VERSION  # single source of truth lives in .config
__DESCRIPTION__ = 'ArchiveBox: The self-hosted internet archive.'
__DOCUMENTATION__ = 'https://github.com/pirate/ArchiveBox/wiki'


def print_help():
    """Print CLI usage examples and documentation pointers to stdout."""
    usage_lines = (
        'ArchiveBox: The self-hosted internet archive.\n',
        "Documentation:",
        "    https://github.com/pirate/ArchiveBox/wiki\n",
        "UI Usage:",
        "    Open output/index.html to view your archive.\n",
        "CLI Usage:",
        "    mkdir data; cd data/",
        "    archivebox init\n",
        "    echo 'https://example.com/some/page' | archivebox add",
        "    archivebox add https://example.com/some/other/page",
        "    archivebox add --depth=1 ~/Downloads/bookmarks_export.html",
        "    archivebox add --depth=1 https://example.com/feed.rss",
        "    archivebox update --resume=15109948213.123",
    )
    for line in usage_lines:
        print(line)


def print_version():
    """Print the ArchiveBox version plus the enabled-state, resolved binary
    path, and detected version of each archiving dependency."""
    print('ArchiveBox v{}'.format(__VERSION__))
    print()

    # (label, enabled-flag, binary name, detected version) for each dependency.
    dependencies = (
        ('CURL', USE_CURL, CURL_BINARY, CURL_VERSION),
        ('GIT', FETCH_GIT, GIT_BINARY, GIT_VERSION),
        ('WGET', USE_WGET, WGET_BINARY, WGET_VERSION),
        ('YOUTUBEDL', FETCH_MEDIA, YOUTUBEDL_BINARY, YOUTUBEDL_VERSION),
        ('CHROME', USE_CHROME, CHROME_BINARY, CHROME_VERSION),
    )
    for label, enabled, binary, version in dependencies:
        print(
            '[{}] {}:'.format('√' if enabled else 'X', label).ljust(14),
            '{} --version\n'.format(shutil.which(binary)),
            ' ' * 13, version, '\n',
        )
def main(args=None) -> None:
    """CLI entrypoint: dispatch help/version, set up OUTPUT_DIR, ingest the
    link source (file path argument, remote URL, or piped stdin), then run
    the main archive update process.

    Args:
        args: argv-style list (defaults to sys.argv). At most one positional
              argument is accepted: either an import path or a numeric
              timestamp to resume archiving from.

    Raises:
        SystemExit: 0 after help/version output, 1 on usage or setup errors.
    """
    if args is None:
        args = sys.argv

    # Anything beyond a single positional argument is a usage error.
    if set(args).intersection(('-h', '--help', 'help')) or len(args) > 2:
        print_help()
        raise SystemExit(0)

    if set(args).intersection(('--version', 'version')):
        print_version()
        raise SystemExit(0)

    ### Handle CLI arguments
    #     ./archive bookmarks.html
    #     ./archive 1523422111.234
    import_path, resume = None, None
    if len(args) == 2:
        # if the argument is a string, it's a import_path file to import
        # if it's a number, it's a timestamp to resume archiving from
        if args[1].replace('.', '').isdigit():
            import_path, resume = None, args[1]
        else:
            import_path, resume = args[1], None

    ### Set up output folder
    if not os.path.exists(OUTPUT_DIR):
        print('{green}[+] Created a new archive directory: {}{reset}'.format(OUTPUT_DIR, **ANSI))
        os.makedirs(OUTPUT_DIR)
    else:
        # Refuse to treat a non-empty folder with no index.json as an archive,
        # so we never scatter output into an unrelated directory.
        not_empty = len(set(os.listdir(OUTPUT_DIR)) - {'.DS_Store'})
        index_exists = os.path.exists(os.path.join(OUTPUT_DIR, 'index.json'))
        if not_empty and not index_exists:
            print(
                ("{red}[X] Could not find index.json in the OUTPUT_DIR: {reset}{}\n\n"
                 "    If you're trying to update an existing archive, you must set OUTPUT_DIR to the archive folder, or run archivebox from inside it.\n"
                 "    If you're trying to create a new archive, you must run archivebox inside a completely empty directory."
                 "\n\n"
                 "    {lightred}Hint:{reset} To import a data folder created by an older version of ArchiveBox,\n"
                 "    just cd into the folder and run the archivebox command to pick up where you left off.\n\n"
                 "    (Always make sure your data folder is backed up first before updating ArchiveBox)"
                ).format(OUTPUT_DIR, **ANSI)
            )
            raise SystemExit(1)

    ### Handle ingesting urls piped in through stdin
    # (e.g. if user does cat example_urls.txt | ./archive)
    if not sys.stdin.isatty():
        stdin_raw_text = sys.stdin.read()
        if stdin_raw_text and import_path:
            print(
                '[X] You should pass either a path as an argument, '
                'or pass a list of links via stdin, but not both.\n'
            )
            print_help()
            raise SystemExit(1)

        # Only persist stdin when it actually contained something: an empty
        # pipe used to create (and then import) an empty sources file.
        if stdin_raw_text:
            import_path = save_stdin_source(stdin_raw_text)

    ### Handle ingesting urls from a remote file/feed
    # (e.g. if an RSS feed URL is used as the import path)
    if import_path and any(import_path.startswith(s) for s in ('http://', 'https://', 'ftp://')):
        import_path = save_remote_source(import_path)

    ### Run the main archive update process
    update_archive_data(import_path=import_path, resume=resume)


@enforce_types
def update_archive_data(import_path: Optional[str]=None, resume: Optional[float]=None) -> List[Link]:
    """The main ArchiveBox entrypoint. Everything starts here.

    Load the existing index, merge in any new links from import_path, write
    the combined index to disk, archive each link, then re-write the index
    with whatever titles/icons/resources the archiving pass produced.

    Args:
        import_path: optional path to a file of new links to merge in.
        resume: optional timestamp to skip ahead to before archiving.

    Returns:
        The final deduped list of all links in the index.

    Raises:
        SystemExit: 0 when the user interrupts archiving with Ctrl-C.
    """

    # Step 1: Load list of links from the existing index
    #         merge in and dedupe new links from import_path
    all_links, new_links = load_links_index(out_dir=OUTPUT_DIR, import_path=import_path)

    # Step 2: Write updated index with deduped old and new links back to disk
    write_links_index(links=list(all_links), out_dir=OUTPUT_DIR)

    # Step 3: Run the archive methods for each link
    links = new_links if ONLY_NEW else all_links
    log_archiving_started(len(links), resume)
    # Track progress outside the loop so the except blocks can report how far
    # archiving got before the interruption.
    idx: int = 0
    link: Optional[Link] = None
    try:
        for idx, link in enumerate(links_after_timestamp(links, resume)):
            archive_link(link, link_dir=link.link_dir)

    except KeyboardInterrupt:
        log_archiving_paused(len(links), idx, link.timestamp if link else '0')
        raise SystemExit(0)

    except BaseException:
        # Was a bare `except:`; made explicit but behavior is unchanged --
        # print a newline to break out of any in-progress status line, then
        # let the original exception propagate.
        print()
        raise

    log_archiving_finished(len(links))

    # Step 4: Re-write links index with updated titles, icons, and resources
    all_links, _ = load_links_index(out_dir=OUTPUT_DIR)
    write_links_index(links=list(all_links), out_dir=OUTPUT_DIR, finished=True)
    return all_links


if __name__ == '__main__':
    # Invoked directly (./archive or ./bin/archivebox) rather than imported.
    main(sys.argv)