From 0d839329e5fe94411fd84b9b5731698da7f3b575 Mon Sep 17 00:00:00 2001
From: Serene-Arc
Date: Sun, 7 Feb 2021 11:33:19 +1000
Subject: [PATCH] Remove utils module for downloaders

---
 .../downloaders/base_downloader.py            | 121 ++++++++++++++++++
 bulkredditdownloader/downloaders/direct.py    |   9 +-
 .../downloaders/downloader_utils.py           | 109 ----------------
 bulkredditdownloader/downloaders/erome.py     |  13 +-
 bulkredditdownloader/downloaders/gallery.py   |  16 +--
 bulkredditdownloader/downloaders/gfycat.py    |  10 +-
 .../downloaders/gif_delivery_network.py       |   9 +-
 bulkredditdownloader/downloaders/imgur.py     |  16 +--
 bulkredditdownloader/downloaders/redgifs.py   |   9 +-
 bulkredditdownloader/downloaders/self_post.py |   4 +-
 bulkredditdownloader/downloaders/vreddit.py   |  11 +-
 bulkredditdownloader/downloaders/youtube.py   |   8 +-
 12 files changed, 175 insertions(+), 160 deletions(-)
 create mode 100644 bulkredditdownloader/downloaders/base_downloader.py
 delete mode 100644 bulkredditdownloader/downloaders/downloader_utils.py

diff --git a/bulkredditdownloader/downloaders/base_downloader.py b/bulkredditdownloader/downloaders/base_downloader.py
new file mode 100644
index 0000000..297e31c
--- /dev/null
+++ b/bulkredditdownloader/downloaders/base_downloader.py
@@ -0,0 +1,121 @@
+#!/usr/bin/env python3
+# coding=utf-8
+import hashlib
+import os
+import sys
+import urllib.request
+from abc import ABC
+from pathlib import Path
+
+from bulkredditdownloader.errors import DomainInSkip, FailedToDownload, FileAlreadyExistsError, TypeInSkip
+from bulkredditdownloader.utils import GLOBAL
+from bulkredditdownloader.utils import printToFile as print
+
+
+class BaseDownloader(ABC):
+    def __init__(self, directory: Path, post: dict):
+        self.directory = directory
+        self.post = post
+
+    @staticmethod
+    def createHash(filename: str) -> str:
+        hash_md5 = hashlib.md5()
+        with open(filename, "rb") as f:
+            for chunk in iter(lambda: f.read(4096), b""):
+                hash_md5.update(chunk)
+        return hash_md5.hexdigest()
+
+    @staticmethod
+    def dlProgress(count: int, block_size: int, total_size: int):
+        """Write download progress to the console."""
+        download_mbs = int(count * block_size * (10 ** (-6)))
+        file_size = int(total_size * (10 ** (-6)))
+        sys.stdout.write("{}Mb/{}Mb\r".format(download_mbs, file_size))
+        sys.stdout.flush()
+
+    @staticmethod
+    def getFile(
+            filename: str,
+            short_filename: str,
+            folder_dir: Path,
+            image_url: str,
+            indent: int = 0,
+            silent: bool = False):
+        formats = {
+            "videos": [".mp4", ".webm"],
+            "images": [".jpg", ".jpeg", ".png", ".bmp"],
+            "gifs": [".gif"],
+            "self": []
+        }
+
+        for file_type in GLOBAL.arguments.skip:
+            for extension in formats[file_type]:
+                if extension in filename:
+                    raise TypeInSkip
+
+        if any(domain in image_url for domain in GLOBAL.arguments.skip_domain):
+            raise DomainInSkip
+
+        headers = [
+            ("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+                           "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 "
+                           "Safari/537.36 OPR/54.0.2952.64"),
+            ("Accept", "text/html,application/xhtml+xml,application/xml;"
+                       "q=0.9,image/webp,image/apng,*/*;q=0.8"),
+            ("Accept-Charset", "ISO-8859-1,utf-8;q=0.7,*;q=0.3"),
+            ("Accept-Encoding", "none"),
+            ("Accept-Language", "en-US,en;q=0.8"),
+            ("Connection", "keep-alive")
+        ]
+
+        if not os.path.exists(folder_dir):
+            os.makedirs(folder_dir)
+
+        opener = urllib.request.build_opener()
+        if "imgur" not in image_url:
+            opener.addheaders = headers
+        urllib.request.install_opener(opener)
+
+        if not silent:
+            print(" " * indent + str(folder_dir), " " * indent + str(filename), sep="\n")
+
+        for i in range(3):
+            file_dir = Path(folder_dir) / filename
+            temp_dir = Path(folder_dir) / (filename + ".tmp")
+
+            if not (os.path.isfile(file_dir)):
+                try:
+                    urllib.request.urlretrieve(image_url, temp_dir, reporthook=BaseDownloader.dlProgress)
+
+                    file_hash = BaseDownloader.createHash(temp_dir)
+                    if GLOBAL.arguments.no_dupes:
+                        if file_hash in GLOBAL.downloadedPosts():
+                            os.remove(temp_dir)
+                            raise FileAlreadyExistsError
+                    GLOBAL.downloadedPosts.add(file_hash)
+
+                    os.rename(temp_dir, file_dir)
+                    if not silent:
+                        print(" " * indent + "Downloaded" + " " * 10)
+                    return None
+                except ConnectionResetError:
+                    raise FailedToDownload
+                except FileNotFoundError:
+                    filename = short_filename
+            else:
+                raise FileAlreadyExistsError
+        raise FailedToDownload
+
+    @staticmethod
+    def getExtension(link: str):
+        """Extract the file extension from a media link; if none is found, default to '.jpg'."""
+        image_types = ['jpg', 'png', 'mp4', 'webm', 'gif']
+        parsed = link.split('.')
+        for fileType in image_types:
+            if fileType in parsed:
+                return "." + parsed[-1]
+        else:
+            if "v.redd.it" not in link:
+                return '.jpg'
+            else:
+                return '.mp4'
diff --git a/bulkredditdownloader/downloaders/direct.py b/bulkredditdownloader/downloaders/direct.py
index 9dd2c67..23a5c2f 100644
--- a/bulkredditdownloader/downloaders/direct.py
+++ b/bulkredditdownloader/downloaders/direct.py
@@ -1,17 +1,18 @@
 import os
 import pathlib
 
-from bulkredditdownloader.downloaders.downloader_utils import getExtension, getFile
+from bulkredditdownloader.downloaders.base_downloader import BaseDownloader
 from bulkredditdownloader.utils import GLOBAL
 
 
-class Direct:
+class Direct(BaseDownloader):
     def __init__(self, directory: pathlib.Path, post: dict):
-        post['EXTENSION'] = getExtension(post['CONTENTURL'])
+        super().__init__(directory, post)
+        post['EXTENSION'] = self.getExtension(post['CONTENTURL'])
 
         if not os.path.exists(directory):
             os.makedirs(directory)
 
         filename = GLOBAL.config['filename'].format(**post) + post["EXTENSION"]
         short_filename = post['POSTID'] + post['EXTENSION']
-        getFile(filename, short_filename, directory, post['CONTENTURL'])
+        self.getFile(filename, short_filename, directory, post['CONTENTURL'])
diff --git a/bulkredditdownloader/downloaders/downloader_utils.py b/bulkredditdownloader/downloaders/downloader_utils.py
deleted file mode 100644
index e5d1043..0000000
--- a/bulkredditdownloader/downloaders/downloader_utils.py
+++ /dev/null
@@ -1,109 +0,0 @@
-import hashlib
-import os
-import sys
-import urllib.request
-from pathlib import Path
-
-from bulkredditdownloader.errors import DomainInSkip, FailedToDownload, FileAlreadyExistsError, TypeInSkip
-from bulkredditdownloader.utils import GLOBAL
-from bulkredditdownloader.utils import printToFile as print
-
-
-def dlProgress(count: int, block_size: int, total_size: int):
-    """Function for writing download progress to console
-    """
-    download_mbs = int(count * block_size * (10 ** (-6)))
-    file_size = int(total_size * (10 ** (-6)))
-    sys.stdout.write("{}Mb/{}Mb\r".format(download_mbs, file_size))
-    sys.stdout.flush()
-
-
-def getExtension(link: str):
-    """Extract file extension from image link.
-    If didn't find any, return '.jpg'
-    """
-    image_types = ['jpg', 'png', 'mp4', 'webm', 'gif']
-    parsed = link.split('.')
-    for fileType in image_types:
-        if fileType in parsed:
-            return "." + parsed[-1]
-    else:
-        if "v.redd.it" not in link:
-            return '.jpg'
-        else:
-            return '.mp4'
-
-
-def getFile(filename: str, short_filename: str, folder_dir: Path, image_url: str, indent: int = 0, silent: bool = False):
-    formats = {
-        "videos": [".mp4", ".webm"],
-        "images": [".jpg", ".jpeg", ".png", ".bmp"],
-        "gifs": [".gif"],
-        "self": []
-    }
-
-    for file_type in GLOBAL.arguments.skip:
-        for extension in formats[file_type]:
-            if extension in filename:
-                raise TypeInSkip
-
-    if any(domain in image_url for domain in GLOBAL.arguments.skip_domain):
-        raise DomainInSkip
-
-    headers = [
-        ("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
-         "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 "
-         "Safari/537.36 OPR/54.0.2952.64"),
-        ("Accept", "text/html,application/xhtml+xml,application/xml;"
-         "q=0.9,image/webp,image/apng,*/*;q=0.8"),
-        ("Accept-Charset", "ISO-8859-1,utf-8;q=0.7,*;q=0.3"),
-        ("Accept-Encoding", "none"),
-        ("Accept-Language", "en-US,en;q=0.8"),
-        ("Connection", "keep-alive")
-    ]
-
-    if not os.path.exists(folder_dir):
-        os.makedirs(folder_dir)
-
-    opener = urllib.request.build_opener()
-    if "imgur" not in image_url:
-        opener.addheaders = headers
-    urllib.request.install_opener(opener)
-
-    if not silent:
-        print(" " * indent + str(folder_dir), " " * indent + str(filename), sep="\n")
-
-    for i in range(3):
-        file_dir = Path(folder_dir) / filename
-        temp_dir = Path(folder_dir) / (filename + ".tmp")
-
-        if not (os.path.isfile(file_dir)):
-            try:
-                urllib.request.urlretrieve(image_url, temp_dir, reporthook=dlProgress)
-
-                file_hash = createHash(temp_dir)
-                if GLOBAL.arguments.no_dupes:
-                    if file_hash in GLOBAL.downloadedPosts():
-                        os.remove(temp_dir)
-                        raise FileAlreadyExistsError
-                GLOBAL.downloadedPosts.add(file_hash)
-
-                os.rename(temp_dir, file_dir)
-                if not silent:
-                    print(" " * indent + "Downloaded" + " " * 10)
-                return None
-            except ConnectionResetError:
-                raise FailedToDownload
-            except FileNotFoundError:
-                filename = short_filename
-        else:
-            raise FileAlreadyExistsError
-    raise FailedToDownload
-
-
-def createHash(filename: str) -> str:
-    hash_md5 = hashlib.md5()
-    with open(filename, "rb") as f:
-        for chunk in iter(lambda: f.read(4096), b""):
-            hash_md5.update(chunk)
-    return hash_md5.hexdigest()
diff --git a/bulkredditdownloader/downloaders/erome.py b/bulkredditdownloader/downloaders/erome.py
index 2710453..f54e6f5 100644
--- a/bulkredditdownloader/downloaders/erome.py
+++ b/bulkredditdownloader/downloaders/erome.py
@@ -4,14 +4,15 @@ import urllib.error
 import urllib.request
 from html.parser import HTMLParser
 
-from bulkredditdownloader.downloaders.downloader_utils import getExtension, getFile
+from bulkredditdownloader.downloaders.base_downloader import BaseDownloader
 from bulkredditdownloader.errors import AlbumNotDownloadedCompletely, FileAlreadyExistsError, NotADownloadableLinkError
 from bulkredditdownloader.utils import GLOBAL
 from bulkredditdownloader.utils import printToFile as print
 
 
-class Erome:
+class Erome(BaseDownloader):
     def __init__(self, directory: pathlib.Path, post: dict):
+        super().__init__(directory, post)
         try:
             images = self.getLinks(post['CONTENTURL'])
         except urllib.error.HTTPError:
@@ -22,7 +23,7 @@ class Erome:
         duplicates = 0
 
         if images_length == 1:
-            extension = getExtension(images[0])
+            extension = self.getExtension(images[0])
 
             """Filenames are declared here"""
             filename = GLOBAL.config['filename'].format(**post) + post["EXTENSION"]
@@ -32,7 +33,7 @@ class Erome:
             if 'https://' not in image_url or 'http://' not in image_url:
                 image_url = "https://" + image_url
"https://" + image_url - getFile(filename, short_filename, directory, image_url) + self.getFile(filename, short_filename, directory, image_url) else: filename = GLOBAL.config['filename'].format(**post) @@ -48,7 +49,7 @@ class Erome: os.makedirs(folder_dir) for i in range(images_length): - extension = getExtension(images[i]) + extension = self.getExtension(images[i]) filename = str(i + 1) + extension image_url = images[i] @@ -59,7 +60,7 @@ class Erome: print(" {}".format(filename)) try: - getFile(filename, filename, folder_dir, image_url, indent=2) + self.getFile(filename, filename, folder_dir, image_url, indent=2) print() except FileAlreadyExistsError: print(" The file already exists" + " " * 10, end="\n\n") diff --git a/bulkredditdownloader/downloaders/gallery.py b/bulkredditdownloader/downloaders/gallery.py index e3ec461..597d653 100644 --- a/bulkredditdownloader/downloaders/gallery.py +++ b/bulkredditdownloader/downloaders/gallery.py @@ -1,25 +1,23 @@ import json import os +import pathlib import urllib import requests -import pathlib -from bulkredditdownloader.downloaders.downloader_utils import getFile -from bulkredditdownloader.errors import (AlbumNotDownloadedCompletely, FileAlreadyExistsError, ImageNotFound, NotADownloadableLinkError, - TypeInSkip) +from bulkredditdownloader.downloaders.base_downloader import BaseDownloader +from bulkredditdownloader.errors import (AlbumNotDownloadedCompletely, FileAlreadyExistsError, ImageNotFound, + NotADownloadableLinkError, TypeInSkip) from bulkredditdownloader.utils import GLOBAL from bulkredditdownloader.utils import printToFile as print -class Gallery: +class Gallery(BaseDownloader): def __init__(self, directory: pathlib.Path, post): + super().__init__(directory, post) link = post['CONTENTURL'] self.raw_data = self.getData(link) - self.directory = directory - self.post = post - images = {} count = 0 for model in self.raw_data['posts']['models']: @@ -86,7 +84,7 @@ class Gallery: print("\n ({}/{})".format(i + 1, count)) try: - getFile(filename, short_filename, folder_dir, images[i]['url'], indent=2) + self.getFile(filename, short_filename, folder_dir, images[i]['url'], indent=2) how_many_downloaded += 1 print() diff --git a/bulkredditdownloader/downloaders/gfycat.py b/bulkredditdownloader/downloaders/gfycat.py index 7ab93b4..1bedeb4 100644 --- a/bulkredditdownloader/downloaders/gfycat.py +++ b/bulkredditdownloader/downloaders/gfycat.py @@ -4,22 +4,22 @@ import urllib.request from bs4 import BeautifulSoup -from bulkredditdownloader.downloaders.downloader_utils import getExtension, getFile +from bulkredditdownloader.downloaders.base_downloader import BaseDownloader from bulkredditdownloader.downloaders.gif_delivery_network import GifDeliveryNetwork from bulkredditdownloader.errors import NotADownloadableLinkError from bulkredditdownloader.utils import GLOBAL import pathlib - -class Gfycat: +class Gfycat(BaseDownloader): def __init__(self, directory: pathlib.Path, post: dict): + super().__init__(directory, post) try: post['MEDIAURL'] = self.getLink(post['CONTENTURL']) except IndexError: raise NotADownloadableLinkError("Could not read the page source") - post['EXTENSION'] = getExtension(post['MEDIAURL']) + post['EXTENSION'] = self.getExtension(post['MEDIAURL']) if not os.path.exists(directory): os.makedirs(directory) @@ -27,7 +27,7 @@ class Gfycat: filename = GLOBAL.config['filename'].format(**post) + post["EXTENSION"] short_filename = post['POSTID'] + post['EXTENSION'] - getFile(filename, short_filename, directory, post['MEDIAURL']) + 
 
     @staticmethod
     def getLink(url: str) -> str:
diff --git a/bulkredditdownloader/downloaders/gif_delivery_network.py b/bulkredditdownloader/downloaders/gif_delivery_network.py
index 486bc9a..2c66e1b 100644
--- a/bulkredditdownloader/downloaders/gif_delivery_network.py
+++ b/bulkredditdownloader/downloaders/gif_delivery_network.py
@@ -4,19 +4,20 @@ import urllib.request
 
 from bs4 import BeautifulSoup
 
-from bulkredditdownloader.downloaders.downloader_utils import getExtension, getFile
+from bulkredditdownloader.downloaders.base_downloader import BaseDownloader
 from bulkredditdownloader.errors import NotADownloadableLinkError
 from bulkredditdownloader.utils import GLOBAL
 
 
-class GifDeliveryNetwork:
+class GifDeliveryNetwork(BaseDownloader):
     def __init__(self, directory: pathlib.Path, post: dict):
+        super().__init__(directory, post)
         try:
             post['MEDIAURL'] = self.getLink(post['CONTENTURL'])
         except IndexError:
             raise NotADownloadableLinkError("Could not read the page source")
 
-        post['EXTENSION'] = getExtension(post['MEDIAURL'])
+        post['EXTENSION'] = self.getExtension(post['MEDIAURL'])
 
         if not os.path.exists(directory):
             os.makedirs(directory)
@@ -24,7 +25,7 @@
         filename = GLOBAL.config['filename'].format(**post) + post["EXTENSION"]
         short_filename = post['POSTID'] + post['EXTENSION']
 
-        getFile(filename, short_filename, directory, post['MEDIAURL'])
+        self.getFile(filename, short_filename, directory, post['MEDIAURL'])
 
     @staticmethod
     def getLink(url: str) -> str:
diff --git a/bulkredditdownloader/downloaders/imgur.py b/bulkredditdownloader/downloaders/imgur.py
index 9b444d0..3b816ff 100644
--- a/bulkredditdownloader/downloaders/imgur.py
+++ b/bulkredditdownloader/downloaders/imgur.py
@@ -4,19 +4,20 @@
 import pathlib
 
 import requests
 
+from bulkredditdownloader.downloaders.base_downloader import BaseDownloader
 from bulkredditdownloader.downloaders.direct import Direct
-from bulkredditdownloader.downloaders.downloader_utils import getFile
-from bulkredditdownloader.errors import (AlbumNotDownloadedCompletely, ExtensionError, FileAlreadyExistsError, ImageNotFound,
-                                         NotADownloadableLinkError, TypeInSkip)
+from bulkredditdownloader.errors import (AlbumNotDownloadedCompletely, ExtensionError, FileAlreadyExistsError,
+                                         ImageNotFound, NotADownloadableLinkError, TypeInSkip)
 from bulkredditdownloader.utils import GLOBAL, nameCorrector
 from bulkredditdownloader.utils import printToFile as print
 
 
-class Imgur:
+class Imgur(BaseDownloader):
     imgur_image_domain = "https://i.imgur.com/"
 
     def __init__(self, directory: pathlib.Path, post: dict):
+        super().__init__(directory, post)
         link = post['CONTENTURL']
 
         if link.endswith(".gifv"):
@@ -26,9 +27,6 @@
 
         self.raw_data = self.getData(link)
 
-        self.directory = directory
-        self.post = post
-
         if self.isAlbum:
             if self.raw_data["album_images"]["count"] != 1:
                 self.downloadAlbum(self.raw_data["album_images"])
@@ -65,7 +63,7 @@
             print("\n ({}/{})".format(i + 1, images_length))
 
             try:
-                getFile(filename, short_filename, folder_dir, image_url, indent=2)
+                self.getFile(filename, short_filename, folder_dir, image_url, indent=2)
                 how_many_downloaded += 1
                 print()
 
@@ -101,7 +99,7 @@
         filename = GLOBAL.config['filename'].format(**self.post) + extension
         short_filename = self.post['POSTID'] + extension
 
-        getFile(filename, short_filename, self.directory, image_url)
+        self.getFile(filename, short_filename, self.directory, image_url)
 
     @property
     def isAlbum(self) -> bool:
diff --git a/bulkredditdownloader/downloaders/redgifs.py b/bulkredditdownloader/downloaders/redgifs.py
index 6d504c5..ff63780 100644
--- a/bulkredditdownloader/downloaders/redgifs.py
+++ b/bulkredditdownloader/downloaders/redgifs.py
@@ -5,19 +5,20 @@ import urllib.request
 
 from bs4 import BeautifulSoup
 
-from bulkredditdownloader.downloaders.downloader_utils import getExtension, getFile
+from bulkredditdownloader.downloaders.base_downloader import BaseDownloader
 from bulkredditdownloader.errors import NotADownloadableLinkError
 from bulkredditdownloader.utils import GLOBAL
 
 
-class Redgifs:
+class Redgifs(BaseDownloader):
    def __init__(self, directory: pathlib.Path, post: dict):
+        super().__init__(directory, post)
        try:
            post['MEDIAURL'] = self.getLink(post['CONTENTURL'])
        except IndexError:
            raise NotADownloadableLinkError("Could not read the page source")
 
-        post['EXTENSION'] = getExtension(post['MEDIAURL'])
+        post['EXTENSION'] = self.getExtension(post['MEDIAURL'])
 
        if not os.path.exists(directory):
            os.makedirs(directory)
@@ -25,7 +26,7 @@
        filename = GLOBAL.config['filename'].format(**post) + post["EXTENSION"]
        short_filename = post['POSTID'] + post['EXTENSION']
 
-        getFile(filename, short_filename, directory, post['MEDIAURL'])
+        self.getFile(filename, short_filename, directory, post['MEDIAURL'])
 
    @staticmethod
    def getLink(url: str) -> str:
diff --git a/bulkredditdownloader/downloaders/self_post.py b/bulkredditdownloader/downloaders/self_post.py
index fa03e7b..05a7249 100644
--- a/bulkredditdownloader/downloaders/self_post.py
+++ b/bulkredditdownloader/downloaders/self_post.py
@@ -4,6 +4,7 @@ import os
 import pathlib
 from pathlib import Path
 
+from bulkredditdownloader.downloaders.base_downloader import BaseDownloader
 from bulkredditdownloader.errors import FileAlreadyExistsError, TypeInSkip
 from bulkredditdownloader.utils import GLOBAL
 from bulkredditdownloader.utils import printToFile as print
@@ -11,8 +12,9 @@ from bulkredditdownloader.utils import printToFile as print
 VanillaPrint = print
 
 
-class SelfPost:
+class SelfPost(BaseDownloader):
     def __init__(self, directory: pathlib.Path, post: dict):
+        super().__init__(directory, post)
         if "self" in GLOBAL.arguments.skip:
             raise TypeInSkip
 
diff --git a/bulkredditdownloader/downloaders/vreddit.py b/bulkredditdownloader/downloaders/vreddit.py
index 74b776c..b530d0c 100644
--- a/bulkredditdownloader/downloaders/vreddit.py
+++ b/bulkredditdownloader/downloaders/vreddit.py
@@ -2,13 +2,14 @@ import os
 import pathlib
 import subprocess
 
-from bulkredditdownloader.downloaders.downloader_utils import getFile
+from bulkredditdownloader.downloaders.base_downloader import BaseDownloader
 from bulkredditdownloader.utils import GLOBAL
 from bulkredditdownloader.utils import printToFile as print
 
 
-class VReddit:
+class VReddit(BaseDownloader):
     def __init__(self, directory: pathlib.Path, post: dict):
+        super().__init__(directory, post)
         extension = ".mp4"
         if not os.path.exists(directory):
             os.makedirs(directory)
@@ -20,7 +21,7 @@
             fnull = open(os.devnull, 'w')
             subprocess.call("ffmpeg", stdout=fnull, stderr=subprocess.STDOUT)
         except Exception:
-            getFile(filename, short_filename, directory, post['CONTENTURL'])
+            self.getFile(filename, short_filename, directory, post['CONTENTURL'])
             print("FFMPEG library not found, skipping merging video and audio")
         else:
             video_name = post['POSTID'] + "_video"
@@ -30,8 +31,8 @@
 
             print(directory, filename, sep="\n")
 
-            getFile(video_name, video_name, directory, video_url, silent=True)
-            getFile(audio_name, audio_name, directory, audio_url, silent=True)
+            self.getFile(video_name, video_name, directory, video_url, silent=True)
+            self.getFile(audio_name, audio_name, directory, audio_url, silent=True)
             try:
                 self._mergeAudio(video_name, audio_name, filename, short_filename, directory)
             except KeyboardInterrupt:
diff --git a/bulkredditdownloader/downloaders/youtube.py b/bulkredditdownloader/downloaders/youtube.py
index c5abc81..bcc0c2f 100644
--- a/bulkredditdownloader/downloaders/youtube.py
+++ b/bulkredditdownloader/downloaders/youtube.py
@@ -4,15 +4,15 @@
 import sys
 
 import youtube_dl
 
-from bulkredditdownloader.downloaders.downloader_utils import createHash
+from bulkredditdownloader.downloaders.base_downloader import BaseDownloader
 from bulkredditdownloader.errors import FileAlreadyExistsError
 from bulkredditdownloader.utils import GLOBAL
 from bulkredditdownloader.utils import printToFile as print
 
-
-class Youtube:
+class Youtube(BaseDownloader):
     def __init__(self, directory: pathlib.Path, post: dict):
+        super().__init__(directory, post)
         if not os.path.exists(directory):
             os.makedirs(directory)
 
@@ -37,7 +37,7 @@
 
         if GLOBAL.arguments.no_dupes:
             try:
-                file_hash = createHash(str(location))
+                file_hash = self.createHash(str(location))
             except FileNotFoundError:
                 return None
             if file_hash in GLOBAL.downloadedPosts():
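
Usage sketch (not part of the patch): the intended consumption pattern for the new base class, mirroring direct.py above. The ExampleSite name is hypothetical; getExtension/getFile and the GLOBAL config keys come from the diff itself.

    # Illustrative only -- a hypothetical site-specific downloader built on
    # the BaseDownloader introduced by this patch.
    import os
    import pathlib

    from bulkredditdownloader.downloaders.base_downloader import BaseDownloader
    from bulkredditdownloader.utils import GLOBAL


    class ExampleSite(BaseDownloader):  # hypothetical subclass
        def __init__(self, directory: pathlib.Path, post: dict):
            super().__init__(directory, post)
            # Resolve the extension, then delegate the download (retry loop,
            # skip lists, duplicate hashing) to the inherited getFile().
            post['EXTENSION'] = self.getExtension(post['CONTENTURL'])

            if not os.path.exists(directory):
                os.makedirs(directory)

            filename = GLOBAL.config['filename'].format(**post) + post['EXTENSION']
            short_filename = post['POSTID'] + post['EXTENSION']
            self.getFile(filename, short_filename, directory, post['CONTENTURL'])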