1
0
Fork 0
mirror of synced 2024-06-25 17:40:17 +12:00
bulk-downloader-for-reddit/bulkredditdownloader/site_downloaders/base_downloader.py

106 lines
3.4 KiB
Python
Raw Normal View History

2021-02-07 14:33:19 +13:00
#!/usr/bin/env python3
# coding=utf-8

import hashlib
import logging
import re
from abc import ABC, abstractmethod
from pathlib import Path

import requests

from bulkredditdownloader.errors import DomainInSkip, FailedToDownload, FileAlreadyExistsError, TypeInSkip
from bulkredditdownloader.utils import GLOBAL

logger = logging.getLogger(__name__)
class BaseDownloader(ABC):
    """Abstract base class for site-specific downloaders.

    Concrete subclasses implement download(); the helpers here handle
    skip-list filtering, duplicate detection and the actual HTTP fetch.
    """

    def __init__(self, directory: Path, post: dict):
        self.directory = directory  # destination root for this post's files
        self.post = post  # raw post data; schema depends on the caller

    @abstractmethod
    def download(self):
        """Download the resource(s) described by self.post into self.directory."""
        raise NotImplementedError

    @staticmethod
    def _create_hash(content: bytes) -> str:
        """Return the hex MD5 digest of *content* (used only for dupe detection)."""
        return hashlib.md5(content).hexdigest()

    @staticmethod
    def _download_resource(filename: Path, folder_dir: Path, image_url: str, indent: int = 0, silent: bool = False):
        """Fetch *image_url* and write it to ``folder_dir / filename``.

        Args:
            filename: target file name; its extension drives skip filtering
            folder_dir: destination directory (created if missing)
            image_url: URL to fetch
            indent: leading spaces for progress log lines
            silent: suppress progress logging when True

        Raises:
            TypeInSkip: the file's extension is in a skipped format group
            DomainInSkip: the URL matches a skipped domain
            FileAlreadyExistsError: target file exists, or hash was already downloaded
            FailedToDownload: every retry attempt failed
        """
        formats = {
            "videos": [".mp4", ".webm"],
            "images": [".jpg", ".jpeg", ".png", ".bmp"],
            "gifs": [".gif"],
            "self": []
        }
        # BUG FIX: `extension in filename` raised TypeError because Path does
        # not support the `in` operator; compare against the string form.
        for file_type in GLOBAL.arguments.skip:
            for extension in formats[file_type]:
                if extension in str(filename):
                    raise TypeInSkip

        if any(domain in image_url for domain in GLOBAL.arguments.skip_domain):
            raise DomainInSkip

        # BUG FIX: requests documents `headers` as a mapping; build a dict
        # instead of a list of (name, value) tuples.
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                          "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 "
                          "Safari/537.36 OPR/54.0.2952.64",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
            "Accept-Charset": "ISO-8859-1,utf-8;q=0.7,*;q=0.3",
            "Accept-Encoding": "none",
            "Accept-Language": "en-US,en;q=0.8",
            "Connection": "keep-alive"
        }
        folder_dir.mkdir(exist_ok=True)

        # Imgur is fetched without the spoofed browser headers.
        addheaders = headers if "imgur" not in image_url else None

        if not silent:
            # BUG FIX: logging.info() does not accept print()'s `sep` kwarg
            # (it raised TypeError); emit both lines in one lazily-formatted
            # message instead.
            logger.info("%s\n%s", " " * indent + str(folder_dir), " " * indent + str(filename))

        file_path = Path(folder_dir) / filename
        if file_path.is_file():
            raise FileAlreadyExistsError

        # Attempt the download up to 3 times before giving up.
        for _attempt in range(3):
            try:
                download_content = requests.get(image_url, headers=addheaders).content
            except (ConnectionResetError, requests.exceptions.RequestException):
                # BUG FIX: this previously raised FailedToDownload on the very
                # first error, so the "3 attempts" loop never retried. requests
                # wraps low-level socket errors in its own exception hierarchy,
                # so catch that as well and try again.
                continue

            file_hash = BaseDownloader._create_hash(download_content)
            if GLOBAL.arguments.no_dupes:
                # NOTE(review): downloadedPosts is *called* here but .add() is
                # used below — presumably a callable returning the set of seen
                # hashes; confirm against GLOBAL's definition.
                if file_hash in GLOBAL.downloadedPosts():
                    raise FileAlreadyExistsError
            GLOBAL.downloadedPosts.add(file_hash)

            with open(file_path, 'wb') as file:
                file.write(download_content)
            if not silent:
                logger.info(" " * indent + "Downloaded" + " " * 10)
            return

        raise FailedToDownload

    @staticmethod
    def _get_extension(url: str) -> str:
        """Guess a file extension from *url*; default to .jpg (.mp4 for v.redd.it)."""
        pattern = re.compile(r'(\.(jpg|jpeg|png|mp4|webm|gif))')
        if results := re.search(pattern, url):
            if len(results.groups()) > 1:
                return results[0]
        # No recognisable extension in the URL: assume a static image, except
        # for reddit-hosted video, which is always mp4.
        if "v.redd.it" not in url:
            return '.jpg'
        else:
            return '.mp4'