1
0
Fork 0
mirror of synced 2024-06-25 01:20:30 +12:00

add user-data-dir support for chrome headless

This commit is contained in:
Nick Sweeting 2017-10-30 04:01:59 -05:00
parent 81ab050cd2
commit 5758cc2a78
3 changed files with 25 additions and 6 deletions

View file

@ -39,6 +39,9 @@ git clone https://github.com/pirate/bookmark-archiver
cd bookmark-archiver/
./setup.sh #install ALL dependencies
./archive.py ~/Downloads/bookmark_export.html # replace with the path to your export file from step 1
# OR
./archive.py https://getpocket.com/users/yourusername/feed/all # url to an RSS, html, or json links file
```
**3. Done!**
@ -47,6 +50,7 @@ You can open `service/index.html` to view your archive. (favicons will appear n
If you want to host your archive somewhere to share it with other people, see the [Publishing Your Archive](#publishing-your-archive) section below.
If you want to run this as a regular script that pulls new URLs, stick it in `cron` with the second parameter as the URL to your RSS feed.
If you have any trouble, see the [Troubleshooting](#troubleshooting) section at the bottom.
If you'd like to customize options, see the [Configuration](#configuration) section.
@ -66,11 +70,11 @@ For each site it saves:
- `screenshot.png` 1440x900 screenshot of site using headless chrome
- `output.pdf` Printed PDF of site using headless chrome
- `archive.org.txt` A link to the saved site on archive.org
- `link.json` A json file containing link info and archive status
- `audio/` and `video/` for sites like youtube, soundcloud, etc. (using youtube-dl) (WIP)
- `index.json` JSON index containing link info and archive details
- `index.html` HTML index containing link info and archive details
Wget and Chrome [don't work](https://bugs.chromium.org/p/chromium/issues/detail?id=617931) on sites you need to be logged into (yet).
`chrome --headless` essentially runs in an incognito mode session, until they add support for `--user-data-dir=`.
Wget doesn't work on sites you need to be logged into, but chrome headless does, see the [Configuration](#configuration) section for `CHROME_USER_DATA_DIR`.
**Large Exports & Estimated Runtime:**
@ -113,10 +117,13 @@ env CHROME_BINARY=google-chrome-stable RESOLUTION=1440,900 FETCH_PDF=False ./arc
- submit the page to archive.org: `SUBMIT_ARCHIVE_DOT_ORG`
- screenshot: `RESOLUTION` values: [`1440,900`]/`1024,768`/`...`
- user agent: `WGET_USER_AGENT` values: [`Wget/1.19.1`]/`"Mozilla/5.0 ..."`/`...`
- chrome profile: `CHROME_USER_DATA_DIR` values: `~/Library/Application\ Support/Google/Chrome/Default`/`/tmp/chrome-profile`/`...`
To capture sites that require a user to be logged in, you must specify a path to a chrome profile (which loads the cookies needed for the user to be logged in). If you don't have an existing chrome profile, create one with `chromium-browser --disable-gpu --user-data-dir=/tmp/chrome-profile`, and log into the sites you need. Then set `CHROME_USER_DATA_DIR=/tmp/chrome-profile` to make Bookmark Archiver use that profile.
**Index Options:**
- html index template: `INDEX_TEMPLATE` value: `templates/index.html`/`...`
- html index row template: `INDEX_ROW_TEMPLATE` value: `templates/index_row.html`/`...`
- html link index template: `LINK_INDEX_TEMPLATE` value: `templates/link_index_fancy.html`/`templates/link_index.html`/`...`
(See defaults & more at the top of `config.py`)

View file

@ -19,6 +19,7 @@ from config import (
FETCH_VIDEO,
FETCH_FAVICON,
WGET_USER_AGENT,
CHROME_USER_DATA_DIR,
TIMEOUT,
ANSI,
)
@ -35,7 +36,6 @@ _RESULTS_TOTALS = { # globals are bad, mmkay
'failed': 0,
}
def archive_links(out_dir, links, export_path, resume=None):
check_dependencies()
@ -198,7 +198,7 @@ def fetch_wget(out_dir, link, requisites=FETCH_WGET_REQUISITES, timeout=TIMEOUT)
@attach_result_to_link('pdf')
def fetch_pdf(out_dir, link, timeout=TIMEOUT):
def fetch_pdf(out_dir, link, timeout=TIMEOUT, user_data_dir=CHROME_USER_DATA_DIR):
"""print PDF of site to file using chrome --headless"""
if link['type'] in ('PDF', 'image'):
@ -210,6 +210,7 @@ def fetch_pdf(out_dir, link, timeout=TIMEOUT):
CMD = [
CHROME_BINARY,
*'--headless --disable-gpu --print-to-pdf'.split(' '),
*chrome_data_dir_args(user_data_dir),
link['url']
]
end = progress(timeout, prefix=' ')
@ -233,7 +234,7 @@ def fetch_pdf(out_dir, link, timeout=TIMEOUT):
@attach_result_to_link('screenshot')
def fetch_screenshot(out_dir, link, timeout=TIMEOUT, resolution=RESOLUTION):
def fetch_screenshot(out_dir, link, timeout=TIMEOUT, user_data_dir=CHROME_USER_DATA_DIR, resolution=RESOLUTION):
"""take screenshot of site using chrome --headless"""
if link['type'] in ('PDF', 'image'):
@ -245,6 +246,7 @@ def fetch_screenshot(out_dir, link, timeout=TIMEOUT, resolution=RESOLUTION):
CMD = [
CHROME_BINARY,
*'--headless --disable-gpu --screenshot'.split(' '),
*chrome_data_dir_args(user_data_dir),
'--window-size={}'.format(resolution),
link['url']
]
@ -414,3 +416,12 @@ def fetch_favicon(out_dir, link, timeout=TIMEOUT):
# raise
# else:
# print(' √ Skipping video download')
def chrome_data_dir_args(user_data_dir=CHROME_USER_DATA_DIR):
    """Build the chrome CLI args selecting a user profile directory.

    Returns a one-element tuple ('--user-data-dir=...',) when a profile dir
    is configured or the macOS default Chrome profile exists on disk, else
    an empty tuple so callers can splat it into a command list unchanged.
    """
    if user_data_dir:
        return ('--user-data-dir={}'.format(user_data_dir),)
    # Fall back to the stock macOS Chrome profile location if present.
    macos_default = os.path.expanduser('~/Library/Application Support/Google/Chrome/Default')
    if os.path.exists(macos_default):
        return ('--user-data-dir={}'.format(macos_default),)
    return ()

View file

@ -27,6 +27,7 @@ ARCHIVE_DIR = os.getenv('ARCHIVE_DIR', '')
CHROME_BINARY = os.getenv('CHROME_BINARY', 'chromium-browser' ) # change to google-chrome browser if using google-chrome
WGET_BINARY = os.getenv('WGET_BINARY', 'wget' )
WGET_USER_AGENT = os.getenv('WGET_USER_AGENT', None)
CHROME_USER_DATA_DIR = os.getenv('CHROME_USER_DATA_DIR', None)
TIMEOUT = int(os.getenv('TIMEOUT', '60'))
LINK_INDEX_TEMPLATE = os.getenv('LINK_INDEX_TEMPLATE', 'templates/link_index_fancy.html')
INDEX_TEMPLATE = os.getenv('INDEX_TEMPLATE', 'templates/index.html')