From 5758cc2a78306d1fd085e2afda96ae3be06a8beb Mon Sep 17 00:00:00 2001
From: Nick Sweeting
Date: Mon, 30 Oct 2017 04:01:59 -0500
Subject: [PATCH] add user-data-dir support for chrome headless

---
 README.md          | 13 ++++++++++---
 archive_methods.py | 17 ++++++++++++++---
 config.py          |  1 +
 3 files changed, 25 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index 71359ad4..06e10779 100644
--- a/README.md
+++ b/README.md
@@ -39,6 +39,9 @@ git clone https://github.com/pirate/bookmark-archiver
 cd bookmark-archiver/
 ./setup.sh   # install ALL dependencies
 ./archive.py ~/Downloads/bookmark_export.html   # replace with the path to your export file from step 1
+
+# OR
+./archive.py https://getpocket.com/users/yourusername/feed/all   # url to an RSS, html, or json links file
 ```
 
 **3. Done!**
@@ -47,6 +50,7 @@ You can open `service/index.html` to view your archive. (favicons will appear n
 
 If you want to host your archive somewhere to share it with other people, see the [Publishing Your Archive](#publishing-your-archive) section below.
 
+If you want to run this as a regular script that pulls in new URLs, stick it in `cron` with the URL of your RSS feed as the argument.
 If you have any trouble, see the [Troubleshooting](#troubleshooting) section at the bottom.
 
 If you'd like to customize options, see the [Configuration](#configuration) section.
@@ -66,11 +70,11 @@ For each site it saves:
  - `screenshot.png` 1440x900 screenshot of site using headless chrome
  - `output.pdf` Printed PDF of site using headless chrome
  - `archive.org.txt` A link to the saved site on archive.org
- - `link.json` A json file containing link info and archive status
  - `audio/` and `video/` for sites like youtube, soundcloud, etc. (using youtube-dl) (WIP)
+ - `index.json` JSON index containing link info and archive details
+ - `index.html` HTML index containing link info and archive details
 
-Wget and Chrome [don't work](https://bugs.chromium.org/p/chromium/issues/detail?id=617931) on sites you need to be logged into (yet).
-`chrome --headless` essentially runs in an incognito mode session, until they add support for `--user-data-dir=`.
+Wget doesn't work on sites that require you to be logged in, but headless Chrome does; see the [Configuration](#configuration) section for `CHROME_USER_DATA_DIR`.
 
 **Large Exports & Estimated Runtime:**
@@ -113,10 +117,13 @@ env CHROME_BINARY=google-chrome-stable RESOLUTION=1440,900 FETCH_PDF=False ./arc
  - submit the page to archive.org: `SUBMIT_ARCHIVE_DOT_ORG`
  - screenshot: `RESOLUTION` values: [`1440,900`]/`1024,768`/`...`
  - user agent: `WGET_USER_AGENT` values: [`Wget/1.19.1`]/`"Mozilla/5.0 ..."`/`...`
+ - chrome profile: `CHROME_USER_DATA_DIR` values: `~/Library/Application\ Support/Google/Chrome/Default`/`/tmp/chrome-profile`/`...`
+   To capture sites that require a user to be logged in, you must specify a path to a chrome profile (which loads the cookies needed for the user to be logged in). If you don't have an existing chrome profile, create one with `chromium-browser --disable-gpu --user-data-dir=/tmp/chrome-profile` and log into the sites you need. Then set `CHROME_USER_DATA_DIR=/tmp/chrome-profile` to make Bookmark Archiver use that profile.
 
 **Index Options:**
  - html index template: `INDEX_TEMPLATE` value: `templates/index.html`/`...`
  - html index row template: `INDEX_ROW_TEMPLATE` value: `templates/index_row.html`/`...`
+ - html link index template: `LINK_INDEX_TEMPLATE` value: `templates/link_index_fancy.html`/`templates/link_index.html`/`...`
 
 (See defaults & more at the top of `config.py`)
 
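The README change above hinges on one chrome behavior: a headless instance launched with `--user-data-dir` loads that profile's cookies, so login-gated pages render as the logged-in user. A minimal sketch of the kind of command the patch builds, assuming Python 3 and a `chromium-browser` binary on the PATH; the profile path and URL below are placeholders, not part of the patch:

```python
import os
import subprocess

# Rough equivalent of the headless-chrome PDF command assembled in
# fetch_pdf() below; CHROME_USER_DATA_DIR comes from the environment,
# mirroring config.py.
chrome = os.getenv('CHROME_BINARY', 'chromium-browser')
profile = os.getenv('CHROME_USER_DATA_DIR')  # e.g. /tmp/chrome-profile

cmd = [chrome, '--headless', '--disable-gpu', '--print-to-pdf']
if profile:
    # With a profile attached, chrome sends its cookies, so pages that
    # normally require a login render as the logged-in user.
    cmd.append('--user-data-dir={}'.format(profile))
cmd.append('https://example.com')  # placeholder URL

subprocess.run(cmd, timeout=60)  # writes ./output.pdf on success
```

Running this once against a members-only page is a quick way to confirm that a profile created with `chromium-browser --disable-gpu --user-data-dir=/tmp/chrome-profile` actually carries the login, before archiving a whole export with it.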
diff --git a/archive_methods.py b/archive_methods.py
index 3af8b226..06f2c58b 100644
--- a/archive_methods.py
+++ b/archive_methods.py
@@ -19,6 +19,7 @@ from config import (
     FETCH_VIDEO,
     FETCH_FAVICON,
     WGET_USER_AGENT,
+    CHROME_USER_DATA_DIR,
     TIMEOUT,
     ANSI,
 )
@@ -35,7 +36,6 @@ _RESULTS_TOTALS = {  # globals are bad, mmkay
     'failed': 0,
 }
 
-
 def archive_links(out_dir, links, export_path, resume=None):
     check_dependencies()
 
@@ -198,7 +198,7 @@ def fetch_wget(out_dir, link, requisites=FETCH_WGET_REQUISITES, timeout=TIMEOUT)
 
 
 @attach_result_to_link('pdf')
-def fetch_pdf(out_dir, link, timeout=TIMEOUT):
+def fetch_pdf(out_dir, link, timeout=TIMEOUT, user_data_dir=CHROME_USER_DATA_DIR):
     """print PDF of site to file using chrome --headless"""
 
     if link['type'] in ('PDF', 'image'):
@@ -210,6 +210,7 @@ def fetch_pdf(out_dir, link, timeout=TIMEOUT):
     CMD = [
         CHROME_BINARY,
         *'--headless --disable-gpu --print-to-pdf'.split(' '),
+        *chrome_data_dir_args(user_data_dir),
         link['url']
     ]
     end = progress(timeout, prefix='      ')
@@ -233,7 +234,7 @@ def fetch_pdf(out_dir, link, timeout=TIMEOUT):
 
 
 @attach_result_to_link('screenshot')
-def fetch_screenshot(out_dir, link, timeout=TIMEOUT, resolution=RESOLUTION):
+def fetch_screenshot(out_dir, link, timeout=TIMEOUT, user_data_dir=CHROME_USER_DATA_DIR, resolution=RESOLUTION):
     """take screenshot of site using chrome --headless"""
 
     if link['type'] in ('PDF', 'image'):
@@ -245,6 +246,7 @@ def fetch_screenshot(out_dir, link, timeout=TIMEOUT, resolution=RESOLUTION):
     CMD = [
         CHROME_BINARY,
         *'--headless --disable-gpu --screenshot'.split(' '),
+        *chrome_data_dir_args(user_data_dir),
         '--window-size={}'.format(resolution),
         link['url']
     ]
@@ -414,3 +416,12 @@ def fetch_favicon(out_dir, link, timeout=TIMEOUT):
 #             raise
 #         else:
 #             print('    √ Skipping video download')
+
+
+def chrome_data_dir_args(user_data_dir=CHROME_USER_DATA_DIR):
+    default = os.path.expanduser('~/Library/Application Support/Google/Chrome/Default')
+    if user_data_dir:
+        return ('--user-data-dir={}'.format(user_data_dir),)
+    elif os.path.exists(default):
+        return ('--user-data-dir={}'.format(default),)
+    return ()
diff --git a/config.py b/config.py
index 5b65506f..3d95e164 100644
--- a/config.py
+++ b/config.py
@@ -27,6 +27,7 @@ ARCHIVE_DIR = os.getenv('ARCHIVE_DIR', '')
 CHROME_BINARY = os.getenv('CHROME_BINARY', 'chromium-browser')  # change to google-chrome browser if using google-chrome
 WGET_BINARY = os.getenv('WGET_BINARY', 'wget')
 WGET_USER_AGENT = os.getenv('WGET_USER_AGENT', None)
+CHROME_USER_DATA_DIR = os.getenv('CHROME_USER_DATA_DIR', None)
 TIMEOUT = int(os.getenv('TIMEOUT', '60'))
 LINK_INDEX_TEMPLATE = os.getenv('LINK_INDEX_TEMPLATE', 'templates/link_index_fancy.html')
 INDEX_TEMPLATE = os.getenv('INDEX_TEMPLATE', 'templates/index.html')
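To make the fallback order of the new `chrome_data_dir_args()` helper concrete, a small usage sketch; it assumes the helper is importable from `archive_methods`, and the return values in the comments are what the implementation above produces:

```python
from archive_methods import chrome_data_dir_args

# An explicitly passed (or CHROME_USER_DATA_DIR-configured) profile wins:
print(chrome_data_dir_args('/tmp/chrome-profile'))
# -> ('--user-data-dir=/tmp/chrome-profile',)

# With nothing configured, the helper falls back to the stock macOS Chrome
# profile if that directory exists, and otherwise returns an empty tuple,
# leaving chrome to run with a fresh, incognito-like session:
print(chrome_data_dir_args(None))
# -> ('--user-data-dir=/Users/<you>/Library/Application Support/Google/Chrome/Default',)
#    on a Mac with Chrome installed, or () anywhere else
```

Returning a tuple lets the callers splat the result straight into their `CMD` lists with `*chrome_data_dir_args(user_data_dir)`, adding either one flag or nothing. Worth noting: the fallback path is macOS-specific, so Linux users only get logged-in archiving if they set `CHROME_USER_DATA_DIR` explicitly.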