
move archive.py to archive

Nick Sweeting 2018-04-17 07:00:06 -04:00
parent 9ea61bf364
commit c4c8da3deb
2 changed files with 59 additions and 37 deletions

README.md

@@ -48,10 +48,10 @@ Follow the links here to find instructions for exporting bookmarks from each ser
 git clone https://github.com/pirate/bookmark-archiver
 cd bookmark-archiver/
 ./setup.sh   # install all dependencies
-./archive.py ~/Downloads/bookmark_export.html   # replace with the path to your export file from step 1
+./archive ~/Downloads/bookmark_export.html   # replace with the path to your export file from step 1
 # OR
-./archive.py https://getpocket.com/users/yourusername/feed/all   # url to an RSS, html, or json links file
+./archive https://getpocket.com/users/yourusername/feed/all   # url to an RSS, html, or json links file
 ```
 **3. Done!**
@@ -108,10 +108,10 @@ Those numbers are from running it single-threaded on my i5 machine with 50mbps d
 You can run it in parallel by using the `resume` feature, or by manually splitting export.html into multiple files:
 ```bash
-./archive.py export.html 1498800000 &  # second argument is timestamp to resume downloading from
-./archive.py export.html 1498810000 &
-./archive.py export.html 1498820000 &
-./archive.py export.html 1498830000 &
+./archive export.html 1498800000 &  # second argument is timestamp to resume downloading from
+./archive export.html 1498810000 &
+./archive export.html 1498820000 &
+./archive export.html 1498830000 &
 ```
 Users have reported running it with 50k+ bookmarks with success (though it will take more RAM while running).
@@ -119,7 +119,7 @@ Users have reported running it with 50k+ bookmarks with success (though it will
 You can tweak parameters via environment variables, or by editing `config.py` directly:
 ```bash
-env CHROME_BINARY=google-chrome-stable RESOLUTION=1440,900 FETCH_PDF=False ./archive.py ~/Downloads/bookmarks_export.html
+env CHROME_BINARY=google-chrome-stable RESOLUTION=1440,900 FETCH_PDF=False ./archive ~/Downloads/bookmarks_export.html
 ```
 **Shell Options:**
@@ -158,7 +158,7 @@ The chrome/chromium dependency is _optional_ and only required for screenshots a
 ## Publishing Your Archive
-The archive produced by `./archive.py` is suitable for serving on any provider that can host static html (e.g. github pages!).
+The archive produced by `./archive` is suitable for serving on any provider that can host static html (e.g. github pages!).
 You can also serve it from a home server or VPS by uploading the outputted `html` folder to your web directory, e.g. `/var/www/bookmark-archiver` and configuring your webserver.
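For example, one way to get the generated folder onto a VPS is to sync it into the web root over SSH (a sketch; the hostname and destination path are placeholders, not from the README):

```bash
# upload the static archive output to a remote web directory (illustrative paths)
rsync -av ./html/ user@your-server.example:/var/www/bookmark-archiver/
```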
@@ -236,7 +236,7 @@ Follow the instruction links above in the "Quickstart" section to download your
 1. Clone this repo `git clone https://github.com/pirate/bookmark-archiver`
 3. `cd bookmark-archiver/`
-4. `./archive.py ~/Downloads/bookmarks_export.html`
+4. `./archive ~/Downloads/bookmarks_export.html`
 You may optionally specify a second argument to `archive.py export.html 153242424324` to resume the archive update at a specific timestamp.
@@ -269,7 +269,7 @@ apt update; apt install google-chrome-beta python3 wget
 2. Set the environment variable `CHROME_BINARY` to `google-chrome` before running:
 ```bash
-env CHROME_BINARY=google-chrome ./archive.py ~/Downloads/bookmarks_export.html
+env CHROME_BINARY=google-chrome ./archive ~/Downloads/bookmarks_export.html
 ```
 If you're having any trouble trying to set up Google Chrome or Chromium, see the Troubleshooting section below.
@@ -292,7 +292,7 @@ If you still need help, [the official Python docs](https://docs.python.org/3.6/u
 defaults to `chromium-browser` but can be manually specified with the environment variable `CHROME_BINARY`:
 ```bash
-env CHROME_BINARY=/usr/local/bin/chromium-browser ./archive.py ~/Downloads/bookmarks_export.html
+env CHROME_BINARY=/usr/local/bin/chromium-browser ./archive ~/Downloads/bookmarks_export.html
 ```
 1. Test to make sure you have Chrome on your `$PATH` with:
@@ -320,7 +320,7 @@ brew cask upgrade chromium-browser
 4. If a version is displayed and it's `>=59`, make sure `archive.py` is running the right one:
 ```bash
-env CHROME_BINARY=/path/from/step/1/chromium-browser ./archive.py bookmarks_export.html   # replace the path with the one you got from step 1
+env CHROME_BINARY=/path/from/step/1/chromium-browser ./archive bookmarks_export.html   # replace the path with the one you got from step 1
 ```

archive.py → archive

@@ -25,7 +25,6 @@ from config import (
 )
 from util import (
     download_url,
-    check_dependencies,
     progress,
     cleanup_archive,
 )
@ -40,26 +39,36 @@ def print_help():
print(" ./archive.py ~/Downloads/bookmarks_export.html\n")
def get_links(new_links_file_path, archive_path=HTML_FOLDER):
def merge_links(archive_path=HTML_FOLDER, import_path=None):
"""get new links from file and optionally append them to links in existing archive"""
# parse and validate the new_links_file
raw_links = parse_links(new_links_file_path)
valid_links = validate_links(raw_links)
all_links = []
if import_path:
# parse and validate the import file
raw_links = parse_links(import_path)
all_links = validate_links(raw_links)
# merge existing links in archive_path and new links
existing_links = []
if archive_path:
existing_links = parse_json_links_index(archive_path)
valid_links = validate_links(existing_links + valid_links)
all_links = validate_links(existing_links + all_links)
num_new_links = len(valid_links) - len(existing_links)
print('[*] [{}] Adding {} new links from {} to index'.format(
datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
num_new_links,
new_links_file_path,
))
num_new_links = len(all_links) - len(existing_links)
if import_path:
print('[*] [{}] Adding {} new links from {} to index'.format(
datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
num_new_links,
import_path,
))
else:
print('[*] [{}] Running on existing index with {}{}{} links.'.format(
datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
ANSI['green'],
len(all_links),
ANSI['reset'],
))
return valid_links
return all_links
def update_archive(archive_path, links, source=None, resume=None, append=True):
"""update or create index.html+json given a path to an export file containing new links"""
@@ -91,34 +100,47 @@ def update_archive(archive_path, links, source=None, resume=None, append=True):
 if __name__ == '__main__':
     argc = len(sys.argv)
 
-    if argc < 2 or set(sys.argv).intersection('-h', '--help', 'help'):
+    if set(sys.argv).intersection(('-h', '--help', 'help')):
         print_help()
         raise SystemExit(0)
 
-    source = sys.argv[1]                        # path to export file
+    source = sys.argv[1] if argc > 1 else None  # path of links file to import
     resume = sys.argv[2] if argc > 2 else None  # timestamp to resume downloading from
 
+    if argc == 1:
+        source, resume = None, None
+    elif argc == 2:
+        if all(d.isdigit() for d in sys.argv[1].split('.')):
+            # argv[1] is a resume timestamp
+            source, resume = None, sys.argv[1]
+        else:
+            # argv[1] is a path to a file to import
+            source, resume = sys.argv[1].strip(), None
+    elif argc == 3:
+        source, resume = sys.argv[1].strip(), sys.argv[2]
+    else:
+        print_help()
+        raise SystemExit(1)
 
     # See if archive folder already exists
-    for out_folder in (HTML_FOLDER, 'bookmarks', 'pocket', 'pinboard', 'html'):
-        if os.path.exists(out_folder):
+    for out_dir in (HTML_FOLDER, 'bookmarks', 'pocket', 'pinboard', 'html'):
+        if os.path.exists(out_dir):
             break
     else:
-        out_folder = HTML_FOLDER
-    archive_path = os.path.join(out_folder, 'archive')
+        out_dir = HTML_FOLDER
 
     # Step 0: Download url to local file (only happens if a URL is specified instead of local path)
-    if any(source.startswith(s) for s in ('http://', 'https://', 'ftp://')):
+    if source and any(source.startswith(s) for s in ('http://', 'https://', 'ftp://')):
         source = download_url(source)
 
     # Step 1: Parse the links and dedupe them with existing archive
-    links = get_links(source, archive_path=archive_path)
+    links = merge_links(archive_path=out_dir, import_path=source)
 
     # Step 2: Write new index
-    write_links_index(archive_path, links)
+    write_links_index(out_dir=out_dir, links=links)
 
     # Step 3: Verify folder structure is 1:1 with index
-    # cleanup_archive(archive_path, links)
+    # cleanup_archive(out_dir, links)
 
     # Step 4: Run the archive methods for each link
-    update_archive(archive_path, links, source=source, resume=resume, append=True)
+    update_archive(out_dir, links, source=source, resume=resume, append=True)
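Read together, the new argument handling gives `./archive` four invocation patterns; a usage sketch inferred from the branches above (the timestamp is an example value):

```bash
./archive                           # no arguments: re-run on the existing index
./archive 1498800000                # a numeric argument is treated as a resume timestamp
./archive export.html               # a path or URL is treated as a links file to import
./archive export.html 1498800000    # import a file, then resume from the timestamp
```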