1
0
Fork 0
mirror of synced 2024-05-24 06:00:02 +12:00

Remove utils module for downloaders

This commit is contained in:
Serene-Arc 2021-02-07 11:33:19 +10:00 committed by Ali Parlakci
parent be613949fe
commit 0d839329e5
12 changed files with 175 additions and 160 deletions

View file

@ -0,0 +1,121 @@
#!/usr/bin/env python3
# coding=utf-8
import hashlib
import os
import sys
import urllib.request
from abc import ABC
from pathlib import Path
from bulkredditdownloader.errors import DomainInSkip, FailedToDownload, FileAlreadyExistsError, TypeInSkip
from bulkredditdownloader.utils import GLOBAL
from bulkredditdownloader.utils import printToFile as print
class BaseDownloader(ABC):
    """Common base class for the site-specific downloaders.

    Stores the target directory and the post being processed, and exposes the
    static helpers (hashing, progress output, file retrieval and extension
    detection) that the subclasses call.
    """

    def __init__(self, directory: Path, post: dict):
        """Remember where to save and which post is being handled."""
        self.directory = directory
        self.post = post

    @staticmethod
    def createHash(filename: str) -> str:
        """Return the hexadecimal MD5 digest of the file at *filename*.

        The file is read in 4096-byte chunks so it never has to fit in memory.
        """
        hash_md5 = hashlib.md5()
        with open(filename, "rb") as f:
            for chunk in iter(lambda: f.read(4096), b""):
                hash_md5.update(chunk)
        return hash_md5.hexdigest()

    @staticmethod
    def dlProgress(count: int, block_size: int, total_size: int):
        """Write download progress to the console.

        Has the signature of a ``urllib.request.urlretrieve`` report hook:
        ``count * block_size`` bytes have been transferred so far out of
        ``total_size``. Both figures are scaled by 10**-6 and truncated to
        integers before being printed on a single, carriage-returned line.
        """
        download_mbs = int(count * block_size * (10 ** (-6)))
        file_size = int(total_size * (10 ** (-6)))
        sys.stdout.write("{}Mb/{}Mb\r".format(download_mbs, file_size))
        sys.stdout.flush()

    @staticmethod
    def getFile(
            filename: str,
            short_filename: str,
            folder_dir: Path,
            image_url: str,
            indent: int = 0,
            silent: bool = False):
        """Download *image_url* into *folder_dir* under *filename*.

        The data is first written to ``<filename>.tmp`` and only renamed to its
        final name once the download (and the optional duplicate check) has
        succeeded.

        Args:
            filename: Preferred name of the resulting file.
            short_filename: Name used for the next attempt when an attempt
                fails with ``FileNotFoundError``.
            folder_dir: Directory to save into; created when missing.
            image_url: Address of the resource to download.
            indent: Number of spaces prepended to the console messages.
            silent: When true, no console messages are printed.

        Raises:
            TypeInSkip: the filename contains an extension of a skipped type.
            DomainInSkip: the URL contains a skipped domain.
            FileAlreadyExistsError: the target file exists already, or
                ``no_dupes`` is set and the content hash was seen before.
            FailedToDownload: the connection was reset or all three attempts
                were used up.
        """
        formats = {
            "videos": [".mp4", ".webm"],
            "images": [".jpg", ".jpeg", ".png", ".bmp"],
            "gifs": [".gif"],
            "self": []
        }

        # NOTE(review): this is a substring test, so a name that merely
        # contains e.g. ".gif" somewhere also triggers the skip - confirm
        # whether that is intended before tightening it.
        for file_type in GLOBAL.arguments.skip:
            for extension in formats[file_type]:
                if extension in filename:
                    raise TypeInSkip

        if any(domain in image_url for domain in GLOBAL.arguments.skip_domain):
            raise DomainInSkip

        headers = [
            ("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                           "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 "
                           "Safari/537.36 OPR/54.0.2952.64"),
            ("Accept", "text/html,application/xhtml+xml,application/xml;"
                       "q=0.9,image/webp,image/apng,*/*;q=0.8"),
            ("Accept-Charset", "ISO-8859-1,utf-8;q=0.7,*;q=0.3"),
            ("Accept-Encoding", "none"),
            ("Accept-Language", "en-US,en;q=0.8"),
            ("Connection", "keep-alive")
        ]

        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(folder_dir, exist_ok=True)

        # The browser-like headers are applied to every URL except those
        # containing "imgur", which use the opener's defaults.
        opener = urllib.request.build_opener()
        if "imgur" not in image_url:
            opener.addheaders = headers
        urllib.request.install_opener(opener)

        if not silent:
            print(" " * indent + str(folder_dir), " " * indent + str(filename), sep="\n")

        for _ in range(3):
            file_dir = Path(folder_dir) / filename
            temp_dir = Path(folder_dir) / (filename + ".tmp")

            if os.path.isfile(file_dir):
                raise FileAlreadyExistsError

            try:
                urllib.request.urlretrieve(image_url, temp_dir, reporthook=BaseDownloader.dlProgress)

                file_hash = BaseDownloader.createHash(temp_dir)
                if GLOBAL.arguments.no_dupes and file_hash in GLOBAL.downloadedPosts():
                    os.remove(temp_dir)
                    raise FileAlreadyExistsError
                GLOBAL.downloadedPosts.add(file_hash)

                os.rename(temp_dir, file_dir)
                if not silent:
                    print(" " * indent + "Downloaded" + " " * 10)
                return None
            except ConnectionResetError as err:
                # Do not leave a truncated temporary file behind.
                if os.path.isfile(temp_dir):
                    os.remove(temp_dir)
                raise FailedToDownload from err
            except FileNotFoundError:
                # Retry with the fallback name (presumably the full name was
                # not usable as a path - confirm against callers).
                filename = short_filename

        raise FailedToDownload

    @staticmethod
    def getExtension(link: str):
        """Return the file extension (with leading dot) for a media link.

        The text after the last dot is used when it is one of the known types.
        Otherwise ``'.mp4'`` is returned for v.redd.it links and ``'.jpg'`` for
        everything else. Only the final segment is trusted so that dots in the
        host name or path can never leak into the returned extension.
        """
        image_types = ('jpg', 'png', 'mp4', 'webm', 'gif')
        extension = link.rsplit('.', 1)[-1]
        if extension in image_types:
            return "." + extension

        if "v.redd.it" not in link:
            return '.jpg'
        return '.mp4'

View file

@ -1,17 +1,18 @@
import os
import pathlib
from bulkredditdownloader.downloaders.downloader_utils import getExtension, getFile
from bulkredditdownloader.downloaders.base_downloader import BaseDownloader
from bulkredditdownloader.utils import GLOBAL
class Direct:
class Direct(BaseDownloader):
def __init__(self, directory: pathlib.Path, post: dict):
post['EXTENSION'] = getExtension(post['CONTENTURL'])
super().__init__(directory, post)
post['EXTENSION'] = self.getExtension(post['CONTENTURL'])
if not os.path.exists(directory):
os.makedirs(directory)
filename = GLOBAL.config['filename'].format(**post) + post["EXTENSION"]
short_filename = post['POSTID'] + post['EXTENSION']
getFile(filename, short_filename, directory, post['CONTENTURL'])
self.getFile(filename, short_filename, directory, post['CONTENTURL'])

View file

@ -1,109 +0,0 @@
import hashlib
import os
import sys
import urllib.request
from pathlib import Path
from bulkredditdownloader.errors import DomainInSkip, FailedToDownload, FileAlreadyExistsError, TypeInSkip
from bulkredditdownloader.utils import GLOBAL
from bulkredditdownloader.utils import printToFile as print
def dlProgress(count: int, block_size: int, total_size: int):
    """Report download progress on the console.

    ``count * block_size`` is the amount transferred so far and ``total_size``
    the expected total; both are scaled by 10**-6, truncated to integers and
    written to stdout on one carriage-returned line.
    """
    scale = 10 ** (-6)
    transferred = int(count * block_size * scale)
    expected = int(total_size * scale)

    progress_line = "{}Mb/{}Mb\r".format(transferred, expected)
    sys.stdout.write(progress_line)
    sys.stdout.flush()
def getExtension(link: str):
    """Return the file extension (with leading dot) for a media link.

    The text after the last dot is used when it is one of the known types.
    Otherwise ``'.mp4'`` is returned for v.redd.it links and ``'.jpg'`` for
    everything else. Only the final segment is trusted so that dots in the
    host name or path can never leak into the returned extension.
    """
    image_types = ('jpg', 'png', 'mp4', 'webm', 'gif')
    extension = link.rsplit('.', 1)[-1]
    if extension in image_types:
        return "." + extension

    if "v.redd.it" not in link:
        return '.jpg'
    return '.mp4'
def getFile(filename: str, short_filename: str, folder_dir: Path, image_url: str, indent: int = 0, silent: bool = False):
    """Download *image_url* into *folder_dir* under *filename*.

    The data is first written to ``<filename>.tmp`` and only renamed to its
    final name once the download (and the optional duplicate check) has
    succeeded.

    Args:
        filename: Preferred name of the resulting file.
        short_filename: Name used for the next attempt when an attempt fails
            with ``FileNotFoundError``.
        folder_dir: Directory to save into; created when missing.
        image_url: Address of the resource to download.
        indent: Number of spaces prepended to the console messages.
        silent: When true, no console messages are printed.

    Raises:
        TypeInSkip: the filename contains an extension of a skipped type.
        DomainInSkip: the URL contains a skipped domain.
        FileAlreadyExistsError: the target file exists already, or
            ``no_dupes`` is set and the content hash was seen before.
        FailedToDownload: the connection was reset or all three attempts were
            used up.
    """
    formats = {
        "videos": [".mp4", ".webm"],
        "images": [".jpg", ".jpeg", ".png", ".bmp"],
        "gifs": [".gif"],
        "self": []
    }

    # NOTE(review): this is a substring test, so a name that merely contains
    # e.g. ".gif" somewhere also triggers the skip - confirm whether that is
    # intended before tightening it.
    for file_type in GLOBAL.arguments.skip:
        for extension in formats[file_type]:
            if extension in filename:
                raise TypeInSkip

    if any(domain in image_url for domain in GLOBAL.arguments.skip_domain):
        raise DomainInSkip

    headers = [
        ("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                       "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 "
                       "Safari/537.36 OPR/54.0.2952.64"),
        ("Accept", "text/html,application/xhtml+xml,application/xml;"
                   "q=0.9,image/webp,image/apng,*/*;q=0.8"),
        ("Accept-Charset", "ISO-8859-1,utf-8;q=0.7,*;q=0.3"),
        ("Accept-Encoding", "none"),
        ("Accept-Language", "en-US,en;q=0.8"),
        ("Connection", "keep-alive")
    ]

    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(folder_dir, exist_ok=True)

    # The browser-like headers are applied to every URL except those
    # containing "imgur", which use the opener's defaults.
    opener = urllib.request.build_opener()
    if "imgur" not in image_url:
        opener.addheaders = headers
    urllib.request.install_opener(opener)

    if not silent:
        print(" " * indent + str(folder_dir), " " * indent + str(filename), sep="\n")

    for _ in range(3):
        file_dir = Path(folder_dir) / filename
        temp_dir = Path(folder_dir) / (filename + ".tmp")

        if os.path.isfile(file_dir):
            raise FileAlreadyExistsError

        try:
            urllib.request.urlretrieve(image_url, temp_dir, reporthook=dlProgress)

            file_hash = createHash(temp_dir)
            if GLOBAL.arguments.no_dupes and file_hash in GLOBAL.downloadedPosts():
                os.remove(temp_dir)
                raise FileAlreadyExistsError
            GLOBAL.downloadedPosts.add(file_hash)

            os.rename(temp_dir, file_dir)
            if not silent:
                print(" " * indent + "Downloaded" + " " * 10)
            return None
        except ConnectionResetError as err:
            # Do not leave a truncated temporary file behind.
            if os.path.isfile(temp_dir):
                os.remove(temp_dir)
            raise FailedToDownload from err
        except FileNotFoundError:
            # Retry with the fallback name (presumably the full name was not
            # usable as a path - confirm against callers).
            filename = short_filename

    raise FailedToDownload
def createHash(filename: str) -> str:
    """Return the hexadecimal MD5 digest of the file at *filename*.

    The file is consumed in 4096-byte blocks so its size does not matter.
    """
    digest = hashlib.md5()
    with open(filename, "rb") as stream:
        while True:
            block = stream.read(4096)
            if not block:
                break
            digest.update(block)
    return digest.hexdigest()

View file

@ -4,14 +4,15 @@ import urllib.error
import urllib.request
from html.parser import HTMLParser
from bulkredditdownloader.downloaders.downloader_utils import getExtension, getFile
from bulkredditdownloader.downloaders.base_downloader import BaseDownloader
from bulkredditdownloader.errors import AlbumNotDownloadedCompletely, FileAlreadyExistsError, NotADownloadableLinkError
from bulkredditdownloader.utils import GLOBAL
from bulkredditdownloader.utils import printToFile as print
class Erome:
class Erome(BaseDownloader):
def __init__(self, directory: pathlib.Path, post: dict):
super().__init__(directory, post)
try:
images = self.getLinks(post['CONTENTURL'])
except urllib.error.HTTPError:
@ -22,7 +23,7 @@ class Erome:
duplicates = 0
if images_length == 1:
extension = getExtension(images[0])
extension = self.getExtension(images[0])
"""Filenames are declared here"""
filename = GLOBAL.config['filename'].format(**post) + post["EXTENSION"]
@ -32,7 +33,7 @@ class Erome:
if 'https://' not in image_url or 'http://' not in image_url:
image_url = "https://" + image_url
getFile(filename, short_filename, directory, image_url)
self.getFile(filename, short_filename, directory, image_url)
else:
filename = GLOBAL.config['filename'].format(**post)
@ -48,7 +49,7 @@ class Erome:
os.makedirs(folder_dir)
for i in range(images_length):
extension = getExtension(images[i])
extension = self.getExtension(images[i])
filename = str(i + 1) + extension
image_url = images[i]
@ -59,7 +60,7 @@ class Erome:
print(" {}".format(filename))
try:
getFile(filename, filename, folder_dir, image_url, indent=2)
self.getFile(filename, filename, folder_dir, image_url, indent=2)
print()
except FileAlreadyExistsError:
print(" The file already exists" + " " * 10, end="\n\n")

View file

@ -1,25 +1,23 @@
import json
import os
import pathlib
import urllib
import requests
import pathlib
from bulkredditdownloader.downloaders.downloader_utils import getFile
from bulkredditdownloader.errors import (AlbumNotDownloadedCompletely, FileAlreadyExistsError, ImageNotFound, NotADownloadableLinkError,
TypeInSkip)
from bulkredditdownloader.downloaders.base_downloader import BaseDownloader
from bulkredditdownloader.errors import (AlbumNotDownloadedCompletely, FileAlreadyExistsError, ImageNotFound,
NotADownloadableLinkError, TypeInSkip)
from bulkredditdownloader.utils import GLOBAL
from bulkredditdownloader.utils import printToFile as print
class Gallery:
class Gallery(BaseDownloader):
def __init__(self, directory: pathlib.Path, post):
super().__init__(directory, post)
link = post['CONTENTURL']
self.raw_data = self.getData(link)
self.directory = directory
self.post = post
images = {}
count = 0
for model in self.raw_data['posts']['models']:
@ -86,7 +84,7 @@ class Gallery:
print("\n ({}/{})".format(i + 1, count))
try:
getFile(filename, short_filename, folder_dir, images[i]['url'], indent=2)
self.getFile(filename, short_filename, folder_dir, images[i]['url'], indent=2)
how_many_downloaded += 1
print()

View file

@ -4,22 +4,22 @@ import urllib.request
from bs4 import BeautifulSoup
from bulkredditdownloader.downloaders.downloader_utils import getExtension, getFile
from bulkredditdownloader.downloaders.base_downloader import BaseDownloader
from bulkredditdownloader.downloaders.gif_delivery_network import GifDeliveryNetwork
from bulkredditdownloader.errors import NotADownloadableLinkError
from bulkredditdownloader.utils import GLOBAL
import pathlib
class Gfycat:
class Gfycat(BaseDownloader):
def __init__(self, directory: pathlib.Path, post: dict):
super().__init__(directory, post)
try:
post['MEDIAURL'] = self.getLink(post['CONTENTURL'])
except IndexError:
raise NotADownloadableLinkError("Could not read the page source")
post['EXTENSION'] = getExtension(post['MEDIAURL'])
post['EXTENSION'] = self.getExtension(post['MEDIAURL'])
if not os.path.exists(directory):
os.makedirs(directory)
@ -27,7 +27,7 @@ class Gfycat:
filename = GLOBAL.config['filename'].format(**post) + post["EXTENSION"]
short_filename = post['POSTID'] + post['EXTENSION']
getFile(filename, short_filename, directory, post['MEDIAURL'])
self.getFile(filename, short_filename, directory, post['MEDIAURL'])
@staticmethod
def getLink(url: str) -> str:

View file

@ -4,19 +4,20 @@ import urllib.request
from bs4 import BeautifulSoup
from bulkredditdownloader.downloaders.downloader_utils import getExtension, getFile
from bulkredditdownloader.downloaders.base_downloader import BaseDownloader
from bulkredditdownloader.errors import NotADownloadableLinkError
from bulkredditdownloader.utils import GLOBAL
class GifDeliveryNetwork:
class GifDeliveryNetwork(BaseDownloader):
def __init__(self, directory: pathlib.Path, post: dict):
super().__init__(directory, post)
try:
post['MEDIAURL'] = self.getLink(post['CONTENTURL'])
except IndexError:
raise NotADownloadableLinkError("Could not read the page source")
post['EXTENSION'] = getExtension(post['MEDIAURL'])
post['EXTENSION'] = self.getExtension(post['MEDIAURL'])
if not os.path.exists(directory):
os.makedirs(directory)
@ -24,7 +25,7 @@ class GifDeliveryNetwork:
filename = GLOBAL.config['filename'].format(**post) + post["EXTENSION"]
short_filename = post['POSTID'] + post['EXTENSION']
getFile(filename, short_filename, directory, post['MEDIAURL'])
self.getFile(filename, short_filename, directory, post['MEDIAURL'])
@staticmethod
def getLink(url: str) -> str:

View file

@ -4,19 +4,20 @@ import pathlib
import requests
from bulkredditdownloader.downloaders.base_downloader import BaseDownloader
from bulkredditdownloader.downloaders.direct import Direct
from bulkredditdownloader.downloaders.downloader_utils import getFile
from bulkredditdownloader.errors import (AlbumNotDownloadedCompletely, ExtensionError, FileAlreadyExistsError, ImageNotFound,
NotADownloadableLinkError, TypeInSkip)
from bulkredditdownloader.errors import (AlbumNotDownloadedCompletely, ExtensionError, FileAlreadyExistsError,
ImageNotFound, NotADownloadableLinkError, TypeInSkip)
from bulkredditdownloader.utils import GLOBAL, nameCorrector
from bulkredditdownloader.utils import printToFile as print
class Imgur:
class Imgur(BaseDownloader):
imgur_image_domain = "https://i.imgur.com/"
def __init__(self, directory: pathlib.Path, post: dict):
super().__init__(directory, post)
link = post['CONTENTURL']
if link.endswith(".gifv"):
@ -26,9 +27,6 @@ class Imgur:
self.raw_data = self.getData(link)
self.directory = directory
self.post = post
if self.isAlbum:
if self.raw_data["album_images"]["count"] != 1:
self.downloadAlbum(self.raw_data["album_images"])
@ -65,7 +63,7 @@ class Imgur:
print("\n ({}/{})".format(i + 1, images_length))
try:
getFile(filename, short_filename, folder_dir, image_url, indent=2)
self.getFile(filename, short_filename, folder_dir, image_url, indent=2)
how_many_downloaded += 1
print()
@ -101,7 +99,7 @@ class Imgur:
filename = GLOBAL.config['filename'].format(**self.post) + extension
short_filename = self.post['POSTID'] + extension
getFile(filename, short_filename, self.directory, image_url)
self.getFile(filename, short_filename, self.directory, image_url)
@property
def isAlbum(self) -> bool:

View file

@ -5,19 +5,20 @@ import urllib.request
from bs4 import BeautifulSoup
from bulkredditdownloader.downloaders.downloader_utils import getExtension, getFile
from bulkredditdownloader.downloaders.base_downloader import BaseDownloader
from bulkredditdownloader.errors import NotADownloadableLinkError
from bulkredditdownloader.utils import GLOBAL
class Redgifs:
class Redgifs(BaseDownloader):
def __init__(self, directory: pathlib.Path, post: dict):
super().__init__(directory, post)
try:
post['MEDIAURL'] = self.getLink(post['CONTENTURL'])
except IndexError:
raise NotADownloadableLinkError("Could not read the page source")
post['EXTENSION'] = getExtension(post['MEDIAURL'])
post['EXTENSION'] = self.getExtension(post['MEDIAURL'])
if not os.path.exists(directory):
os.makedirs(directory)
@ -25,7 +26,7 @@ class Redgifs:
filename = GLOBAL.config['filename'].format(**post) + post["EXTENSION"]
short_filename = post['POSTID'] + post['EXTENSION']
getFile(filename, short_filename, directory, post['MEDIAURL'])
self.getFile(filename, short_filename, directory, post['MEDIAURL'])
@staticmethod
def getLink(url: str) -> str:

View file

@ -4,6 +4,7 @@ import os
import pathlib
from pathlib import Path
from bulkredditdownloader.downloaders.base_downloader import BaseDownloader
from bulkredditdownloader.errors import FileAlreadyExistsError, TypeInSkip
from bulkredditdownloader.utils import GLOBAL
from bulkredditdownloader.utils import printToFile as print
@ -11,8 +12,9 @@ from bulkredditdownloader.utils import printToFile as print
VanillaPrint = print
class SelfPost:
class SelfPost(BaseDownloader):
def __init__(self, directory: pathlib.Path, post: dict):
super().__init__(directory, post)
if "self" in GLOBAL.arguments.skip:
raise TypeInSkip

View file

@ -2,13 +2,14 @@ import os
import pathlib
import subprocess
from bulkredditdownloader.downloaders.downloader_utils import getFile
from bulkredditdownloader.downloaders.base_downloader import BaseDownloader
from bulkredditdownloader.utils import GLOBAL
from bulkredditdownloader.utils import printToFile as print
class VReddit:
class VReddit(BaseDownloader):
def __init__(self, directory: pathlib.Path, post: dict):
super().__init__(directory, post)
extension = ".mp4"
if not os.path.exists(directory):
os.makedirs(directory)
@ -20,7 +21,7 @@ class VReddit:
fnull = open(os.devnull, 'w')
subprocess.call("ffmpeg", stdout=fnull, stderr=subprocess.STDOUT)
except Exception:
getFile(filename, short_filename, directory, post['CONTENTURL'])
self.getFile(filename, short_filename, directory, post['CONTENTURL'])
print("FFMPEG library not found, skipping merging video and audio")
else:
video_name = post['POSTID'] + "_video"
@ -30,8 +31,8 @@ class VReddit:
print(directory, filename, sep="\n")
getFile(video_name, video_name, directory, video_url, silent=True)
getFile(audio_name, audio_name, directory, audio_url, silent=True)
self.getFile(video_name, video_name, directory, video_url, silent=True)
self.getFile(audio_name, audio_name, directory, audio_url, silent=True)
try:
self._mergeAudio(video_name, audio_name, filename, short_filename, directory)
except KeyboardInterrupt:

View file

@ -4,15 +4,15 @@ import sys
import youtube_dl
from bulkredditdownloader.downloaders.downloader_utils import createHash
from bulkredditdownloader.downloaders.base_downloader import BaseDownloader
from bulkredditdownloader.errors import FileAlreadyExistsError
from bulkredditdownloader.utils import GLOBAL
from bulkredditdownloader.utils import printToFile as print
class Youtube:
class Youtube(BaseDownloader):
def __init__(self, directory: pathlib.Path, post: dict):
super().__init__(directory, post)
if not os.path.exists(directory):
os.makedirs(directory)
@ -37,7 +37,7 @@ class Youtube:
if GLOBAL.arguments.no_dupes:
try:
file_hash = createHash(str(location))
file_hash = self.createHash(str(location))
except FileNotFoundError:
return None
if file_hash in GLOBAL.downloadedPosts():