1
0
Fork 0
mirror of synced 2024-05-17 02:43:16 +12:00

Use COOKIES_FILE to fetch page titles (#1364)

This commit is contained in:
Nick Sweeting 2024-03-14 01:52:44 -07:00 committed by GitHub
commit 48f4b12ae2
No known key found for this signature in database
GPG key ID: B5690EEEBB952194

View file

@ -3,6 +3,7 @@ __package__ = 'archivebox'
import re import re
import requests import requests
import json as pyjson import json as pyjson
import http.cookiejar
from typing import List, Optional, Any from typing import List, Optional, Any
from pathlib import Path from pathlib import Path
@ -164,9 +165,22 @@ def parse_date(date: Any) -> Optional[datetime]:
@enforce_types @enforce_types
def download_url(url: str, timeout: int=None) -> str: def download_url(url: str, timeout: int=None) -> str:
"""Download the contents of a remote url and return the text""" """Download the contents of a remote url and return the text"""
from .config import TIMEOUT, CHECK_SSL_VALIDITY, WGET_USER_AGENT from .config import (
TIMEOUT,
CHECK_SSL_VALIDITY,
WGET_USER_AGENT,
COOKIES_FILE,
)
timeout = timeout or TIMEOUT timeout = timeout or TIMEOUT
response = requests.get( session = requests.Session()
if COOKIES_FILE and Path(COOKIES_FILE).is_file():
cookie_jar = http.cookiejar.MozillaCookieJar(COOKIES_FILE)
cookie_jar.load(ignore_discard=True, ignore_expires=True)
for cookie in cookie_jar:
session.cookies.set(cookie.name, cookie.value, domain=cookie.domain, path=cookie.path)
response = session.get(
url, url,
headers={'User-Agent': WGET_USER_AGENT}, headers={'User-Agent': WGET_USER_AGENT},
verify=CHECK_SSL_VALIDITY, verify=CHECK_SSL_VALIDITY,