1
0
Fork 0
mirror of synced 2024-06-02 18:34:37 +12:00
bulk-downloader-for-reddit/bulkredditdownloader/resource.py

66 lines
2.1 KiB
Python
Raw Normal View History

2021-02-11 12:09:49 +13:00
#!/usr/bin/env python3
# coding=utf-8
import hashlib
2021-03-11 16:20:39 +13:00
import logging
2021-02-11 12:09:49 +13:00
import re
2021-02-26 21:56:05 +13:00
import time
from typing import Optional
2021-02-11 12:09:49 +13:00
2021-02-26 21:56:05 +13:00
import _hashlib
import requests
2021-02-11 12:09:49 +13:00
from praw.models import Submission
2021-03-05 16:32:24 +13:00
from bulkredditdownloader.exceptions import BulkDownloaderException
2021-02-26 21:56:05 +13:00
2021-03-11 16:20:39 +13:00
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
2021-02-11 12:09:49 +13:00
class Resource:
    """A single downloadable file associated with a praw submission.

    Stores the source URL, the downloaded bytes (populated by ``download``),
    an MD5 hash object of those bytes and the file extension.
    """

    # Back-off schedule for transient failures, in seconds.
    _MAX_WAIT_TIME = 300
    _WAIT_TIME_STEP = 60
    # (connect, read) timeouts so a stalled server cannot block a download forever.
    _REQUEST_TIMEOUT = (10, 60)
    # Status codes for which retrying is pointless.
    _UNRECOVERABLE_STATUS_CODES = (301, 401, 403, 404)

    def __init__(self, source_submission: 'Submission', url: str, extension: Optional[str] = None):
        """Create a resource; nothing is downloaded until ``download`` is called.

        Args:
            source_submission: The submission this resource was found in.
            url: Location of the file to download.
            extension: File extension including the leading dot. When omitted
                (or empty) it is derived from ``url``.
        """
        self.source_submission = source_submission
        self.content: Optional[bytes] = None
        self.url = url
        self.hash: Optional[_hashlib.HASH] = None
        self.extension = extension
        if not self.extension:
            self.extension = self._determine_extension()

    @staticmethod
    def retry_download(url: str, wait_time: int) -> Optional[bytes]:
        """Fetch ``url``, retrying transient failures with a growing delay.

        After each transient failure the function sleeps ``wait_time`` seconds
        and retries with the delay increased by 60 seconds. Once ``wait_time``
        has reached 300 seconds a further failure ends the attempt immediately.

        Args:
            url: Location of the file to download.
            wait_time: Delay in seconds to apply after the first failure.

        Returns:
            The response body on HTTP 200, or ``None`` if the server answered
            with an unrecoverable status code or the maximum wait was exceeded.
        """
        transient_errors = (
            requests.exceptions.ConnectionError,
            requests.exceptions.Timeout,
            requests.exceptions.ChunkedEncodingError,
        )
        while True:
            try:
                response = requests.get(url, timeout=Resource._REQUEST_TIMEOUT)
            except transient_errors:
                response = None

            if response is not None:
                if response.status_code == 200:
                    return response.content
                if response.status_code in Resource._UNRECOVERABLE_STATUS_CODES:
                    logger.error(f'Unrecoverable error requesting resource: HTTP Code {response.status_code}')
                    return None
                # Any other status code is treated as transient and retried below.

            # Give up *before* sleeping: waiting when no retry follows is wasted time.
            if wait_time >= Resource._MAX_WAIT_TIME:
                logger.error(f'Max wait time exceeded for resource at url {url}')
                return None

            logger.log(9, f'Error occured downloading resource, waiting {wait_time} seconds')
            time.sleep(wait_time)
            wait_time += Resource._WAIT_TIME_STEP

    def download(self):
        """Download the resource into ``self.content`` and hash it.

        Does nothing when content is already present.

        Raises:
            BulkDownloaderException: If no content could be downloaded.
        """
        if self.content:
            return
        content = self.retry_download(self.url, 0)
        if not content:
            raise BulkDownloaderException('Could not download resource')
        self.content = content
        self.create_hash()

    def create_hash(self):
        """Store an MD5 hash object of ``self.content`` in ``self.hash``."""
        self.hash = hashlib.md5(self.content)

    def _determine_extension(self) -> Optional[str]:
        """Return the file extension (with leading dot) found in ``self.url``.

        Only the path component of the URL is inspected, so query strings and
        fragments can neither hide the extension nor be mistaken for one.

        Returns:
            A 3-5 character extension such as ``'.jpg'``, or ``None`` when the
            last path segment does not end in one.
        """
        from urllib.parse import urlsplit

        path = urlsplit(self.url).path
        # Exclude '.' and '/' so the match stays within the final path segment.
        match = re.search(r'(\.[^./]{3,5})$', path)
        if match:
            return match.group(1)
        return None