bulk-downloader-for-reddit/bdfr/resource.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import hashlib
import logging
import re
import time
import urllib.parse
from collections.abc import Callable
from typing import Optional

import _hashlib
import requests
from praw.models import Submission

from bdfr.exceptions import BulkDownloaderException

# Module-level logger, named after this module so log records can be traced back to it.
logger = logging.getLogger(__name__)


class Resource:
    """A single downloadable file associated with a Reddit submission.

    A resource stores the URL it came from, the callable used to fetch it, and,
    once downloaded, the raw bytes together with an MD5 hash object of them.
    """

    def __init__(
        self,
        source_submission: "Submission",
        url: str,
        download_function: Callable,
        extension: Optional[str] = None,
    ):
        """Create a resource that has not been downloaded yet.

        Args:
            source_submission: The submission this resource was found in.
            url: Location of the resource.
            download_function: Callable taking a parameter dict and returning the
                resource content as bytes (or a falsy value if nothing was fetched).
            extension: File extension including the leading dot. When omitted or
                empty, it is derived from the path component of ``url``.
        """
        self.source_submission = source_submission
        self.content: Optional[bytes] = None
        self.url = url
        self.hash: Optional[_hashlib.HASH] = None
        self.extension = extension
        self.download_function = download_function
        if not self.extension:
            self.extension = self._determine_extension()

    @staticmethod
    def retry_download(url: str) -> Callable:
        """Return a download function that fetches ``url`` via :meth:`http_download`."""
        return lambda global_params: Resource.http_download(url, global_params)

    def download(self, download_parameters: Optional[dict] = None):
        """Fetch the content (if not already present) and hash it.

        Args:
            download_parameters: Options handed to the download function; an empty
                dict is used when omitted.

        Raises:
            BulkDownloaderException: If the download function raises it, or if it
                fails with a connection or chunked-encoding error.
        """
        if download_parameters is None:
            download_parameters = {}
        if not self.content:
            try:
                content = self.download_function(download_parameters)
            except (requests.exceptions.ConnectionError, requests.exceptions.ChunkedEncodingError) as e:
                # http_download re-raises either of these once it gives up retrying;
                # wrap both so callers only need to handle the project exception.
                raise BulkDownloaderException(f"Could not download resource: {e}") from e
            if content:
                self.content = content
        if not self.hash and self.content:
            self.create_hash()

    def create_hash(self):
        """Store an MD5 hash object of the current content in ``self.hash``."""
        self.hash = hashlib.md5(self.content)

    def _determine_extension(self) -> Optional[str]:
        """Return the extension (with leading dot) found at the end of the URL path.

        Only the path component of the URL is inspected, so query strings and
        fragments are ignored. Returns None when the path does not end in a dot
        followed by 3 to 5 characters.
        """
        # "/" is excluded from the extension so that a dot in an earlier path
        # segment (e.g. "/v1.0/file") is not mistaken for a file extension.
        extension_pattern = re.compile(r".*(\.[^/]{3,5})$")
        stripped_url = urllib.parse.urlsplit(self.url).path
        match = extension_pattern.search(stripped_url)
        return match.group(1) if match else None

    @staticmethod
    def http_download(url: str, download_parameters: dict) -> Optional[bytes]:
        """Download ``url`` over HTTP, retrying transient failures with growing waits.

        Args:
            url: Location to fetch.
            download_parameters: May contain ``"headers"`` (passed to the request)
                and ``"max_wait_time"`` (seconds; defaults to 300).

        Returns:
            The response body when the server answers with a 2xx status and a
            non-empty body.

        Raises:
            BulkDownloaderException: For any response that is neither a non-empty
                2xx nor a 408/429.
            requests.exceptions.ConnectionError: When connection errors (including
                408/429 responses) persist until the wait budget is used up.
            requests.exceptions.ChunkedEncodingError: When chunked-encoding errors
                persist until the wait budget is used up.
        """
        headers = download_parameters.get("headers")
        max_wait_time = download_parameters.get("max_wait_time", 300)
        current_wait_time = 60
        while True:
            try:
                response = requests.get(url, headers=headers)
                if 200 <= response.status_code < 300 and response.content:
                    return response.content
                elif response.status_code in (408, 429):
                    # Timeouts and rate limiting are treated as transient: raise the
                    # same error type as a dropped connection so the retry path runs.
                    raise requests.exceptions.ConnectionError(f"Response code {response.status_code}")
                else:
                    raise BulkDownloaderException(
                        f"Unrecoverable error requesting resource: HTTP Code {response.status_code}"
                    )
            except (requests.exceptions.ConnectionError, requests.exceptions.ChunkedEncodingError) as e:
                if current_wait_time >= max_wait_time:
                    # No further attempt will be made, so give up right away rather
                    # than sleeping for the full wait time first.
                    logger.error(f"Max wait time exceeded for resource at url {url}")
                    raise
                logger.warning(f"Error occured downloading from {url}, waiting {current_wait_time} seconds: {e}")
                time.sleep(current_wait_time)
                current_wait_time += 60