From 68326a60ee20e2a8831ae86e9867b352e0f74ca6 Mon Sep 17 00:00:00 2001 From: Ben Muthalaly Date: Tue, 27 Feb 2024 15:30:31 -0600 Subject: [PATCH 1/3] Add cookies file to http request in `download_url` --- archivebox/util.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/archivebox/util.py b/archivebox/util.py index 5321081c..2e1e4907 100644 --- a/archivebox/util.py +++ b/archivebox/util.py @@ -3,6 +3,7 @@ __package__ = 'archivebox' import re import requests import json as pyjson +import http.cookiejar from typing import List, Optional, Any from pathlib import Path @@ -164,13 +165,26 @@ def parse_date(date: Any) -> Optional[datetime]: @enforce_types def download_url(url: str, timeout: int=None) -> str: """Download the contents of a remote url and return the text""" - from .config import TIMEOUT, CHECK_SSL_VALIDITY, WGET_USER_AGENT + from .config import ( + TIMEOUT, + CHECK_SSL_VALIDITY, + WGET_USER_AGENT, + COOKIES_FILE, + ) timeout = timeout or TIMEOUT + + cookie_jar = http.cookiejar.MozillaCookieJar() + if COOKIES_FILE is not None: + cookie_jar.load(COOKIES_FILE, ignore_discard=True, ignore_expires=True) + else: + cookie_jar = None + response = requests.get( url, headers={'User-Agent': WGET_USER_AGENT}, verify=CHECK_SSL_VALIDITY, timeout=timeout, + cookies=cookie_jar, ) content_type = response.headers.get('Content-Type', '') From fe11e1c2f47487b419497bac38aafbd433ed689a Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Wed, 28 Feb 2024 18:19:44 -0800 Subject: [PATCH 2/3] check if COOKIE_FILE is file --- archivebox/util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/archivebox/util.py b/archivebox/util.py index 2e1e4907..9b570ec9 100644 --- a/archivebox/util.py +++ b/archivebox/util.py @@ -174,7 +174,7 @@ def download_url(url: str, timeout: int=None) -> str: timeout = timeout or TIMEOUT cookie_jar = http.cookiejar.MozillaCookieJar() - if COOKIES_FILE is not None: + if COOKIES_FILE and Path(COOKIES_FILE).is_file(): cookie_jar.load(COOKIES_FILE, ignore_discard=True, ignore_expires=True) else: cookie_jar = None From 4686da91e6b11661c0e57397fe86886416d965d5 Mon Sep 17 00:00:00 2001 From: Ben Muthalaly Date: Tue, 5 Mar 2024 01:48:35 -0600 Subject: [PATCH 3/3] Fix cookies being set incorrectly --- archivebox/util.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/archivebox/util.py b/archivebox/util.py index 2e1e4907..461141c3 100644 --- a/archivebox/util.py +++ b/archivebox/util.py @@ -166,25 +166,25 @@ def parse_date(date: Any) -> Optional[datetime]: def download_url(url: str, timeout: int=None) -> str: """Download the contents of a remote url and return the text""" from .config import ( - TIMEOUT, - CHECK_SSL_VALIDITY, - WGET_USER_AGENT, - COOKIES_FILE, + TIMEOUT, + CHECK_SSL_VALIDITY, + WGET_USER_AGENT, + COOKIES_FILE, ) timeout = timeout or TIMEOUT + session = requests.Session() - cookie_jar = http.cookiejar.MozillaCookieJar() - if COOKIES_FILE is not None: - cookie_jar.load(COOKIES_FILE, ignore_discard=True, ignore_expires=True) - else: - cookie_jar = None + if COOKIES_FILE and Path(COOKIES_FILE).is_file(): + cookie_jar = http.cookiejar.MozillaCookieJar(COOKIES_FILE) + cookie_jar.load(ignore_discard=True, ignore_expires=True) + for cookie in cookie_jar: + session.cookies.set(cookie.name, cookie.value, domain=cookie.domain, path=cookie.path) - response = requests.get( + response = session.get( url, headers={'User-Agent': WGET_USER_AGENT}, verify=CHECK_SSL_VALIDITY, timeout=timeout, - cookies=cookie_jar, ) content_type = response.headers.get('Content-Type', '')