bulk-downloader-for-reddit/bulkredditdownloader/site_downloaders/base_downloader.py

#!/usr/bin/env python3
# coding=utf-8

import hashlib
import logging
import re
from abc import ABC, abstractmethod
from pathlib import Path

import requests

from bulkredditdownloader.errors import DomainInSkip, FailedToDownload, FileAlreadyExistsError, TypeInSkip
from bulkredditdownloader.utils import GLOBAL

logger = logging.getLogger(__name__)


class BaseDownloader(ABC):
    def __init__(self, directory: Path, post: dict):
        self.directory = directory
        self.post = post

    @abstractmethod
    def download(self):
        raise NotImplementedError

    @staticmethod
    def _create_hash(content: bytes) -> str:
        hash_md5 = hashlib.md5(content)
        return hash_md5.hexdigest()

    @staticmethod
    def _download_resource(filename: Path, folder_dir: Path, image_url: str, indent: int = 0, silent: bool = False):
        formats = {
            "videos": [".mp4", ".webm"],
            "images": [".jpg", ".jpeg", ".png", ".bmp"],
            "gifs": [".gif"],
            "self": []
        }

        for file_type in GLOBAL.arguments.skip:
            for extension in formats[file_type]:
                if extension in filename:
                    raise TypeInSkip

        if any(domain in image_url for domain in GLOBAL.arguments.skip_domain):
            raise DomainInSkip

        headers = [
            ("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                           "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 "
                           "Safari/537.36 OPR/54.0.2952.64"),
            ("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8"),
            ("Accept-Charset", "ISO-8859-1,utf-8;q=0.7,*;q=0.3"),
            ("Accept-Encoding", "none"),
            ("Accept-Language", "en-US,en;q=0.8"),
            ("Connection", "keep-alive")
        ]

        folder_dir.mkdir(exist_ok=True)

        if "imgur" not in image_url:
            addheaders = headers
        else:
            addheaders = None

        if not silent:
            logger.info(" " * indent + str(folder_dir), " " * indent + str(filename), sep="\n")

        # Loop to attempt download 3 times
        for i in range(3):
            file_path = Path(folder_dir) / filename

            if file_path.is_file():
                raise FileAlreadyExistsError
            else:
                try:
                    download_content = requests.get(image_url, headers=addheaders).content
                except ConnectionResetError:
                    raise FailedToDownload

                file_hash = BaseDownloader._create_hash(download_content)
                if GLOBAL.arguments.no_dupes:
                    if file_hash in GLOBAL.downloadedPosts():
                        raise FileAlreadyExistsError
                GLOBAL.downloadedPosts.add(file_hash)

                with open(file_path, 'wb') as file:
                    file.write(download_content)
                if not silent:
                    logger.info(" " * indent + "Downloaded" + " " * 10)
                return

        raise FailedToDownload

    @staticmethod
    def _get_extension(url: str) -> str:
        pattern = re.compile(r'(\.(jpg|jpeg|png|mp4|webm|gif))')
        if results := re.search(pattern, url):
            if len(results.groups()) > 1:
                return results[0]
        if "v.redd.it" not in url:
            return '.jpg'
        else:
            return '.mp4'