Use COOKIES_FILE to fetch page titles (#1364)

2024-04-30 02:23:09 +12:00 · 2024-03-14 01:52:44 -07:00 · 2024-03-14 01:52:44 -07:00 · 48f4b12ae2
parent 099f7d00fe 5082d61613
commit 48f4b12ae2
1 changed files with 16 additions and 2 deletions
--- a/archivebox/util.py
+++ b/archivebox/util.py
@@ -3,6 +3,7 @@ __package__ = 'archivebox'
 import re
 import requests
 import json as pyjson
+import http.cookiejar

 from typing import List, Optional, Any
 from pathlib import Path
@@ -164,9 +165,22 @@ def parse_date(date: Any) -> Optional[datetime]:
 @enforce_types
 def download_url(url: str, timeout: int=None) -> str:
    """Download the contents of a remote url and return the text"""
-    from .config import TIMEOUT, CHECK_SSL_VALIDITY, WGET_USER_AGENT
+    from .config import (
+        TIMEOUT,
+        CHECK_SSL_VALIDITY,
+        WGET_USER_AGENT,
+        COOKIES_FILE,
+    )
    timeout = timeout or TIMEOUT
-    response = requests.get(
+    session = requests.Session()
+
+    if COOKIES_FILE and Path(COOKIES_FILE).is_file():
+        cookie_jar = http.cookiejar.MozillaCookieJar(COOKIES_FILE)
+        cookie_jar.load(ignore_discard=True, ignore_expires=True)
+        for cookie in cookie_jar:
+            session.cookies.set(cookie.name, cookie.value, domain=cookie.domain, path=cookie.path)
+
+    response = session.get(
        url,
        headers={'User-Agent': WGET_USER_AGENT},
        verify=CHECK_SSL_VALIDITY,