1
0
Fork 0
mirror of synced 2024-05-19 11:42:40 +12:00
bulk-downloader-for-reddit/bdfr/resource.py

88 lines
3.3 KiB
Python
Raw Normal View History

2021-02-11 12:09:49 +13:00
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
2021-02-11 12:09:49 +13:00
import hashlib
2021-03-11 16:20:39 +13:00
import logging
2021-02-11 12:09:49 +13:00
import re
2021-02-26 21:56:05 +13:00
import time
2021-04-18 23:24:11 +12:00
import urllib.parse
2023-01-26 16:23:59 +13:00
from collections.abc import Callable
from typing import Optional
2021-02-11 12:09:49 +13:00
2021-02-26 21:56:05 +13:00
import _hashlib
import requests
2021-02-11 12:09:49 +13:00
from praw.models import Submission
2021-04-12 19:58:32 +12:00
from bdfr.exceptions import BulkDownloaderException
2021-02-26 21:56:05 +13:00
2021-03-11 16:20:39 +13:00
# Module-level logger, named after this module via ``__name__``.
logger = logging.getLogger(__name__)
2021-02-11 12:09:49 +13:00
class Resource:
    """A single downloadable file that belongs to a submission.

    Stores the URL, the callable used to fetch it, the downloaded bytes once
    available, and an MD5 hash object computed over those bytes.
    """

    # A trailing "." followed by three to five characters at the end of the
    # URL path. Compiled once here instead of on every call.
    _EXTENSION_PATTERN = re.compile(r".*(\..{3,5})$")

    def __init__(
        self,
        source_submission: Submission,
        url: str,
        download_function: Callable,
        extension: Optional[str] = None,
    ):
        """Create a resource; nothing is downloaded until ``download`` is called.

        Args:
            source_submission: The submission this resource was found in.
            url: Location of the resource.
            download_function: Callable invoked with the download-parameters
                dict; its return value is stored as the content.
            extension: File extension to use. When empty or ``None`` it is
                derived from the path component of ``url``.
        """
        self.source_submission = source_submission
        self.content: Optional[bytes] = None
        self.url = url
        self.hash: Optional[_hashlib.HASH] = None
        self.extension = extension
        self.download_function = download_function
        if not self.extension:
            self.extension = self._determine_extension()

    @staticmethod
    def retry_download(url: str) -> Callable:
        """Return a callable that fetches ``url`` via ``http_download``.

        The returned callable takes the download-parameters dict as its only
        argument, which is the shape ``download`` calls ``download_function`` with.
        """
        return lambda global_params: Resource.http_download(url, global_params)

    def download(self, download_parameters: Optional[dict] = None):
        """Fetch the content (if not already held) and make sure it is hashed.

        Args:
            download_parameters: Passed through to ``download_function``;
                an empty dict is used when omitted.

        Raises:
            BulkDownloaderException: If the download function raised one, or
                raised a connection / chunked-encoding error.
        """
        if download_parameters is None:
            download_parameters = {}
        if not self.content:
            try:
                content = self.download_function(download_parameters)
            except (requests.exceptions.ConnectionError, requests.exceptions.ChunkedEncodingError) as e:
                # http_download re-raises both of these once it gives up, so
                # both are converted here; BulkDownloaderException raised by
                # the download function simply propagates.
                raise BulkDownloaderException(f"Could not download resource: {e}") from e
            if content:
                self.content = content
        if not self.hash and self.content:
            self.create_hash()

    def create_hash(self):
        """Store an MD5 hash object of ``self.content`` in ``self.hash``."""
        self.hash = hashlib.md5(self.content)

    def _determine_extension(self) -> Optional[str]:
        """Return the extension found at the end of the URL path, or ``None``.

        Only the path component is examined, so query strings and fragments
        do not take part in the match.
        """
        stripped_url = urllib.parse.urlsplit(self.url).path
        match = self._EXTENSION_PATTERN.search(stripped_url)
        if match:
            return match.group(1)
        return None

    @staticmethod
    def http_download(url: str, download_parameters: dict) -> Optional[bytes]:
        """GET ``url`` and return the response body, retrying with back-off.

        Connection errors, chunked-encoding errors and HTTP 408/429 responses
        are retried. The wait before a retry starts at 60 seconds and grows
        by 60 after each failure; once the wait has reached ``max_wait_time``
        the last error is re-raised instead of waiting again.

        Args:
            url: Address to request.
            download_parameters: May contain ``"headers"`` (passed to
                ``requests.get``) and ``"max_wait_time"`` in seconds
                (defaults to 300).

        Returns:
            The non-empty body of a 2xx response.

        Raises:
            BulkDownloaderException: For any response that is neither a 2xx
                with content nor a 408/429.
            requests.exceptions.ConnectionError: When retries are exhausted.
            requests.exceptions.ChunkedEncodingError: When retries are exhausted.
        """
        headers = download_parameters.get("headers")
        current_wait_time = 60
        max_wait_time = download_parameters.get("max_wait_time", 300)
        while True:
            try:
                response = requests.get(url, headers=headers)
                if 200 <= response.status_code < 300 and response.content:
                    return response.content
                elif response.status_code in (408, 429):
                    # Treated like a connection failure so the handler below retries.
                    raise requests.exceptions.ConnectionError(f"Response code {response.status_code}")
                else:
                    raise BulkDownloaderException(
                        f"Unrecoverable error requesting resource: HTTP Code {response.status_code}"
                    )
            except (requests.exceptions.ConnectionError, requests.exceptions.ChunkedEncodingError) as e:
                # Check the limit before sleeping: there is no point waiting
                # out a back-off interval that is not followed by a retry.
                if current_wait_time >= max_wait_time:
                    logger.error(f"Max wait time exceeded for resource at url {url}")
                    raise
                logger.warning(f"Error occured downloading from {url}, waiting {current_wait_time} seconds: {e}")
                time.sleep(current_wait_time)
                current_wait_time += 60