From 127c72bd79349f9fad805dbd7f5434b77736961d Mon Sep 17 00:00:00 2001 From: noncetonic Date: Tue, 19 Mar 2019 05:30:06 -0700 Subject: [PATCH 1/5] Adds HEADLESS_USER_AGENT variable Allows setting Headless Chrome's User-Agent to bypass rudimentary anti-scraper/anti-bot checks by sites. https://intoli.com/blog/making-chrome-headless-undetectable/ has more detections if there is desire to get serious about anti-detection --- etc/ArchiveBox.conf.default | 1 + 1 file changed, 1 insertion(+) diff --git a/etc/ArchiveBox.conf.default b/etc/ArchiveBox.conf.default index ce7b1cda..67bc6b2d 100644 --- a/etc/ArchiveBox.conf.default +++ b/etc/ArchiveBox.conf.default @@ -41,6 +41,7 @@ #FETCH_WGET_REQUISITES=True #RESOLUTION="1440,900" #WGET_USER_AGENT="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36" +#HEADLESS_USER_AGENT="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36" #GIT_DOMAINS="github.com,bitbucket.org,gitlab.com" #COOKIES_FILE="path/to/cookies.txt" #CHROME_USER_DATA_DIR="~/.config/google-chrome/Default" From a13f22d15afab25f2065cd6ee17374fc43ef02e5 Mon Sep 17 00:00:00 2001 From: noncetonic Date: Tue, 19 Mar 2019 05:32:48 -0700 Subject: [PATCH 2/5] Adds support for HEADLESS_USER_AGENT for Chrome --- archivebox/archive_methods.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/archivebox/archive_methods.py b/archivebox/archive_methods.py index 39903d8c..17c7a5ff 100644 --- a/archivebox/archive_methods.py +++ b/archivebox/archive_methods.py @@ -30,6 +30,7 @@ from config import ( SUBMIT_ARCHIVE_DOT_ORG, COOKIES_FILE, WGET_USER_AGENT, + HEADLESS_USER_AGENT, CHROME_USER_DATA_DIR, CHROME_HEADLESS, CHROME_SANDBOX, @@ -266,6 +267,7 @@ def fetch_pdf(link_dir, link, timeout=TIMEOUT, user_data_dir=CHROME_USER_DATA_DI '--hide-scrollbars', '--timeout={}'.format((timeout) * 1000), *(() if CHECK_SSL_VALIDITY else ('--disable-web-security', '--ignore-certificate-errors')), + *(('--user-agent={}'.format(HEADLESS_USER_AGENT),) if HEADLESS_USER_AGENT else ()), link['url'] ] end = progress(timeout, prefix=' ') @@ -304,6 +306,7 @@ def fetch_screenshot(link_dir, link, timeout=TIMEOUT, user_data_dir=CHROME_USER_ '--hide-scrollbars', '--timeout={}'.format((timeout) * 1000), *(() if CHECK_SSL_VALIDITY else ('--disable-web-security', '--ignore-certificate-errors')), + *(('--user-agent={}'.format(HEADLESS_USER_AGENT),) if HEADLESS_USER_AGENT else ()), # '--full-page', # TODO: make this actually work using ./bin/screenshot fullPage: true link['url'], ] @@ -342,6 +345,7 @@ def fetch_dom(link_dir, link, timeout=TIMEOUT, user_data_dir=CHROME_USER_DATA_DI *chrome_headless(user_data_dir=user_data_dir), '--dump-dom', '--timeout={}'.format((timeout) * 1000), + *(('--user-agent={}'.format(HEADLESS_USER_AGENT),) if HEADLESS_USER_AGENT else ()), link['url'] ] end = progress(timeout, prefix=' ') From e230e279298b634c32c555cceccc130258e79d22 Mon Sep 17 00:00:00 2001 From: noncetonic Date: Tue, 19 Mar 2019 08:13:27 -0700 Subject: [PATCH 3/5] Changes HEADLESS_USER_AGENT to CHROME_USER_AGENT --- archivebox/archive_methods.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/archivebox/archive_methods.py b/archivebox/archive_methods.py index 17c7a5ff..20e35c28 100644 --- a/archivebox/archive_methods.py +++ b/archivebox/archive_methods.py @@ -30,7 +30,7 @@ from config import ( SUBMIT_ARCHIVE_DOT_ORG, COOKIES_FILE, WGET_USER_AGENT, - HEADLESS_USER_AGENT, + CHROME_USER_AGENT, CHROME_USER_DATA_DIR, CHROME_HEADLESS, CHROME_SANDBOX, @@ -267,7 +267,7 @@ def fetch_pdf(link_dir, link, timeout=TIMEOUT, user_data_dir=CHROME_USER_DATA_DI '--hide-scrollbars', '--timeout={}'.format((timeout) * 1000), *(() if CHECK_SSL_VALIDITY else ('--disable-web-security', '--ignore-certificate-errors')), - *(('--user-agent={}'.format(HEADLESS_USER_AGENT),) if HEADLESS_USER_AGENT else ()), + *(('--user-agent={}'.format(CHROME_USER_AGENT),) if CHROME_USER_AGENT else ()), link['url'] ] end = progress(timeout, prefix=' ') @@ -306,7 +306,7 @@ def fetch_screenshot(link_dir, link, timeout=TIMEOUT, user_data_dir=CHROME_USER_ '--hide-scrollbars', '--timeout={}'.format((timeout) * 1000), *(() if CHECK_SSL_VALIDITY else ('--disable-web-security', '--ignore-certificate-errors')), - *(('--user-agent={}'.format(HEADLESS_USER_AGENT),) if HEADLESS_USER_AGENT else ()), + *(('--user-agent={}'.format(CHROME_USER_AGENT),) if CHROME_USER_AGENT else ()), # '--full-page', # TODO: make this actually work using ./bin/screenshot fullPage: true link['url'], ] @@ -345,7 +345,7 @@ def fetch_dom(link_dir, link, timeout=TIMEOUT, user_data_dir=CHROME_USER_DATA_DI *chrome_headless(user_data_dir=user_data_dir), '--dump-dom', '--timeout={}'.format((timeout) * 1000), - *(('--user-agent={}'.format(HEADLESS_USER_AGENT),) if HEADLESS_USER_AGENT else ()), + *(('--user-agent={}'.format(CHROME_USER_AGENT),) if CHROME_USER_AGENT else ()), link['url'] ] end = progress(timeout, prefix=' ') From 8b648696fc03f871dde325f43f5abf7c4b12aefb Mon Sep 17 00:00:00 2001 From: noncetonic Date: Tue, 19 Mar 2019 08:14:06 -0700 Subject: [PATCH 4/5] Changes HEADLESS_USER_AGENT to CHROME_USER_AGENT --- etc/ArchiveBox.conf.default | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/etc/ArchiveBox.conf.default b/etc/ArchiveBox.conf.default index 67bc6b2d..7abbea6d 100644 --- a/etc/ArchiveBox.conf.default +++ b/etc/ArchiveBox.conf.default @@ -41,7 +41,7 @@ #FETCH_WGET_REQUISITES=True #RESOLUTION="1440,900" #WGET_USER_AGENT="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36" -#HEADLESS_USER_AGENT="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36" +#CHROME_USER_AGENT="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36" #GIT_DOMAINS="github.com,bitbucket.org,gitlab.com" #COOKIES_FILE="path/to/cookies.txt" #CHROME_USER_DATA_DIR="~/.config/google-chrome/Default" From 49c428cb6c95c996efb572415537b0d92053fbbd Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Tue, 19 Mar 2019 11:16:00 -0400 Subject: [PATCH 5/5] make both user agents equal --- etc/ArchiveBox.conf.default | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/etc/ArchiveBox.conf.default b/etc/ArchiveBox.conf.default index 7abbea6d..dcb8aeac 100644 --- a/etc/ArchiveBox.conf.default +++ b/etc/ArchiveBox.conf.default @@ -40,8 +40,8 @@ #CHECK_SSL_VALIDITY=True #FETCH_WGET_REQUISITES=True #RESOLUTION="1440,900" -#WGET_USER_AGENT="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36" -#CHROME_USER_AGENT="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36" +#WGET_USER_AGENT="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36" +#CHROME_USER_AGENT="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36" #GIT_DOMAINS="github.com,bitbucket.org,gitlab.com" #COOKIES_FILE="path/to/cookies.txt" #CHROME_USER_DATA_DIR="~/.config/google-chrome/Default"