1
0
Fork 0
mirror of synced 2024-06-25 17:40:17 +12:00
bulk-downloader-for-reddit/bulkredditdownloader/site_downloaders/base_downloader.py

106 lines
3.4 KiB
Python
Raw Normal View History

2021-02-07 14:33:19 +13:00
#!/usr/bin/env python3
# coding=utf-8

import hashlib
import logging
import re
from abc import ABC, abstractmethod
from pathlib import Path

import requests

from bulkredditdownloader.errors import DomainInSkip, FailedToDownload, FileAlreadyExistsError, TypeInSkip
from bulkredditdownloader.utils import GLOBAL

logger = logging.getLogger(__name__)
class BaseDownloader(ABC):
    """Abstract base class for site-specific downloaders.

    Concrete subclasses implement download(); the helpers here handle
    skip-list filtering, duplicate detection and the actual HTTP fetch.
    """

    def __init__(self, directory: Path, post: dict):
        self.directory = directory  # destination root for this post's files
        self.post = post  # raw post data; schema depends on the caller

    @abstractmethod
    def download(self):
        """Download the resource(s) described by self.post into self.directory."""
        raise NotImplementedError

    @staticmethod
    def _create_hash(content: bytes) -> str:
        """Return the hex MD5 digest of *content* (used only for dupe detection)."""
        return hashlib.md5(content).hexdigest()

    @staticmethod
    def _download_resource(filename: Path, folder_dir: Path, image_url: str, indent: int = 0, silent: bool = False):
        """Fetch *image_url* and write it to ``folder_dir / filename``.

        Args:
            filename: target file name; its extension drives skip filtering
            folder_dir: destination directory (created if missing)
            image_url: URL to fetch
            indent: leading spaces for progress log lines
            silent: suppress progress logging when True

        Raises:
            TypeInSkip: the file's extension is in a skipped format group
            DomainInSkip: the URL matches a skipped domain
            FileAlreadyExistsError: target file exists, or hash was already downloaded
            FailedToDownload: every retry attempt failed
        """
        formats = {
            "videos": [".mp4", ".webm"],
            "images": [".jpg", ".jpeg", ".png", ".bmp"],
            "gifs": [".gif"],
            "self": []
        }
        # BUG FIX: `extension in filename` raised TypeError because Path does
        # not support the `in` operator; compare against the string form.
        for file_type in GLOBAL.arguments.skip:
            for extension in formats[file_type]:
                if extension in str(filename):
                    raise TypeInSkip

        if any(domain in image_url for domain in GLOBAL.arguments.skip_domain):
            raise DomainInSkip

        # BUG FIX: requests documents `headers` as a mapping; build a dict
        # instead of a list of (name, value) tuples.
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                          "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 "
                          "Safari/537.36 OPR/54.0.2952.64",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
            "Accept-Charset": "ISO-8859-1,utf-8;q=0.7,*;q=0.3",
            "Accept-Encoding": "none",
            "Accept-Language": "en-US,en;q=0.8",
            "Connection": "keep-alive"
        }
        folder_dir.mkdir(exist_ok=True)

        # Imgur is fetched without the spoofed browser headers.
        addheaders = headers if "imgur" not in image_url else None

        if not silent:
            # BUG FIX: logging.info() does not accept print()'s `sep` kwarg
            # (it raised TypeError); emit both lines in one lazily-formatted
            # message instead.
            logger.info("%s\n%s", " " * indent + str(folder_dir), " " * indent + str(filename))

        file_path = Path(folder_dir) / filename
        if file_path.is_file():
            raise FileAlreadyExistsError

        # Attempt the download up to 3 times before giving up.
        for _attempt in range(3):
            try:
                download_content = requests.get(image_url, headers=addheaders).content
            except (ConnectionResetError, requests.exceptions.RequestException):
                # BUG FIX: this previously raised FailedToDownload on the very
                # first error, so the "3 attempts" loop never retried. requests
                # wraps low-level socket errors in its own exception hierarchy,
                # so catch that as well and try again.
                continue

            file_hash = BaseDownloader._create_hash(download_content)
            if GLOBAL.arguments.no_dupes:
                # NOTE(review): downloadedPosts is *called* here but .add() is
                # used below — presumably a callable returning the set of seen
                # hashes; confirm against GLOBAL's definition.
                if file_hash in GLOBAL.downloadedPosts():
                    raise FileAlreadyExistsError
            GLOBAL.downloadedPosts.add(file_hash)

            with open(file_path, 'wb') as file:
                file.write(download_content)
            if not silent:
                logger.info(" " * indent + "Downloaded" + " " * 10)
            return

        raise FailedToDownload

    @staticmethod
    def _get_extension(url: str) -> str:
        """Guess a file extension from *url*; default to .jpg (.mp4 for v.redd.it)."""
        pattern = re.compile(r'(\.(jpg|jpeg|png|mp4|webm|gif))')
        if results := re.search(pattern, url):
            if len(results.groups()) > 1:
                return results[0]
        # No recognisable extension in the URL: assume a static image, except
        # for reddit-hosted video, which is always mp4.
        if "v.redd.it" not in url:
            return '.jpg'
        else:
            return '.mp4'