diff --git a/.gitignore b/.gitignore index d37883b..85d3ab1 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,4 @@ __pycache__/ src/__pycache__/ config.json env/ +.vscode/ diff --git a/script.py b/script.py index 51250e4..9a45022 100644 --- a/script.py +++ b/script.py @@ -14,11 +14,17 @@ import webbrowser from io import StringIO from pathlib import Path, PurePath -from src.downloader import Direct, Erome, Gfycat, Imgur, Self +from src.downloaders.Direct import Direct +from src.downloaders.Erome import Erome +from src.downloaders.Gfycat import Gfycat +from src.downloaders.Imgur import Imgur +from src.downloaders.redgifs import Redgifs +from src.downloaders.selfPost import SelfPost +from src.downloaders.gifDeliveryNetwork import GifDeliveryNetwork from src.errors import * from src.parser import LinkDesigner from src.searcher import getPosts -from src.tools import (GLOBAL, createLogFile, jsonFile, nameCorrector, +from src.utils import (GLOBAL, createLogFile, jsonFile, nameCorrector, printToFile) __author__ = "Ali Parlakci" @@ -496,7 +502,8 @@ def downloadPost(SUBMISSION): global lastRequestTime downloaders = { - "imgur":Imgur,"gfycat":Gfycat,"erome":Erome,"direct":Direct,"self":Self + "imgur":Imgur,"gfycat":Gfycat,"erome":Erome,"direct":Direct,"self":SelfPost, + "redgifs":Redgifs, "gifdeliverynetwork": GifDeliveryNetwork } print() @@ -532,7 +539,7 @@ def downloadPost(SUBMISSION): if not (credit['UserRemaining'] == 0 or \ credit['ClientRemaining'] == 0): - """This block of code is needed + """This block of code is needed for API workaround """ while int(time.time() - lastRequestTime) <= 2: pass @@ -568,7 +575,7 @@ def download(submissions): FAILED_FILE = createLogFile("FAILED") for i in range(subsLenght): - print(f"\n({i+1}/{subsLenght}) – r/{submissions[i]['postSubreddit']}", + print(f"\n({i+1}/{subsLenght}) – {submissions[i]['postId']} – r/{submissions[i]['postSubreddit']}", end="") print(f" – {submissions[i]['postType'].upper()}",end="",noPrint=True) diff --git a/src/downloader.py b/src/downloader.py deleted file mode 100644 index a938808..0000000 --- a/src/downloader.py +++ /dev/null @@ -1,536 +0,0 @@ -import io -import json -import os -import sys -import urllib.request -from html.parser import HTMLParser -from multiprocessing import Queue -from pathlib import Path -from urllib.error import HTTPError - -import imgurpython -from bs4 import BeautifulSoup - -from src.errors import (AlbumNotDownloadedCompletely, FileAlreadyExistsError, - FileNameTooLong, ImgurLoginError, - NotADownloadableLinkError) -from src.tools import GLOBAL, nameCorrector, printToFile - -VanillaPrint = print -print = printToFile - -def dlProgress(count, blockSize, totalSize): - """Function for writing download progress to console - """ - - downloadedMbs = int(count*blockSize*(10**(-6))) - fileSize = int(totalSize*(10**(-6))) - sys.stdout.write("{}Mb/{}Mb\r".format(downloadedMbs,fileSize)) - sys.stdout.flush() - -def getExtension(link): - """Extract file extension from image link. - If didn't find any, return '.jpg' - """ - - imageTypes = ['jpg','png','mp4','webm','gif'] - parsed = link.split('.') - for TYPE in imageTypes: - if TYPE in parsed: - return "."+parsed[-1] - else: - if not "v.redd.it" in link: - return '.jpg' - else: - return '.mp4' - -def getFile(fileDir,tempDir,imageURL,indent=0): - """Downloads given file to given directory. - - fileDir -- Full file directory - tempDir -- Full file directory with the extension of '.tmp' - imageURL -- URL to the file to be downloaded - - redditID -- Post's reddit id if renaming the file is necessary. - As too long file names seem not working. - """ - - headers = [ - ("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " \ - "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 "\ - "Safari/537.36 OPR/54.0.2952.64"), - ("Accept", "text/html,application/xhtml+xml,application/xml;" \ - "q=0.9,image/webp,image/apng,*/*;q=0.8"), - ("Accept-Charset", "ISO-8859-1,utf-8;q=0.7,*;q=0.3"), - ("Accept-Encoding", "none"), - ("Accept-Language", "en-US,en;q=0.8"), - ("Connection", "keep-alive") - ] - - opener = urllib.request.build_opener() - if not "imgur" in imageURL: - opener.addheaders = headers - urllib.request.install_opener(opener) - - if not (os.path.isfile(fileDir)): - for i in range(3): - try: - urllib.request.urlretrieve(imageURL, - tempDir, - reporthook=dlProgress) - os.rename(tempDir,fileDir) - except ConnectionResetError as exception: - print(" "*indent + str(exception)) - print(" "*indent + "Trying again\n") - except FileNotFoundError: - raise FileNameTooLong - else: - print(" "*indent+"Downloaded"+" "*10) - break - else: - raise FileAlreadyExistsError - -class Erome: - def __init__(self,directory,post): - try: - IMAGES = self.getLinks(post['postURL']) - except urllib.error.HTTPError: - raise NotADownloadableLinkError("Not a downloadable link") - - imagesLenght = len(IMAGES) - howManyDownloaded = imagesLenght - duplicates = 0 - - if imagesLenght == 1: - - extension = getExtension(IMAGES[0]) - - """Filenames are declared here""" - - title = nameCorrector(post['postTitle']) - print(post["postSubmitter"]+"_"+title+"_"+post['postId']+extension) - - fileDir = directory / ( - post["postSubmitter"]+"_"+title+"_"+post['postId']+extension - ) - tempDir = directory / ( - post["postSubmitter"]+"_"+title+"_"+post['postId']+".tmp" - ) - - imageURL = IMAGES[0] - if 'https://' not in imageURL and 'http://' not in imageURL: - imageURL = "https://" + imageURL - - try: - getFile(fileDir,tempDir,imageURL) - except FileNameTooLong: - fileDir = directory / (post['postId'] + extension) - tempDir = directory / (post['postId'] + '.tmp') - getFile(fileDir,tempDir,imageURL) - - else: - title = nameCorrector(post['postTitle']) - print(post["postSubmitter"]+"_"+title+"_"+post['postId'],end="\n\n") - - folderDir = directory / ( - post["postSubmitter"] + "_" + title + "_" + post['postId'] - ) - - try: - if not os.path.exists(folderDir): - os.makedirs(folderDir) - except FileNotFoundError: - folderDir = directory / post['postId'] - os.makedirs(folderDir) - - for i in range(imagesLenght): - - extension = getExtension(IMAGES[i]) - - fileName = str(i+1) - imageURL = IMAGES[i] - if 'https://' not in imageURL and 'http://' not in imageURL: - imageURL = "https://" + imageURL - - fileDir = folderDir / (fileName + extension) - tempDir = folderDir / (fileName + ".tmp") - - print(" ({}/{})".format(i+1,imagesLenght)) - print(" {}".format(fileName+extension)) - - try: - getFile(fileDir,tempDir,imageURL,indent=2) - print() - except FileAlreadyExistsError: - print(" The file already exists" + " "*10,end="\n\n") - duplicates += 1 - howManyDownloaded -= 1 - - except Exception as exception: - # raise exception - print("\n Could not get the file") - print( - " " - + "{class_name}: {info}".format( - class_name=exception.__class__.__name__, - info=str(exception) - ) - + "\n" - ) - exceptionType = exception - howManyDownloaded -= 1 - - if duplicates == imagesLenght: - raise FileAlreadyExistsError - elif howManyDownloaded + duplicates < imagesLenght: - raise AlbumNotDownloadedCompletely( - "Album Not Downloaded Completely" - ) - - def getLinks(self,url,lineNumber=129): - - content = [] - lineNumber = None - - class EromeParser(HTMLParser): - tag = None - def handle_starttag(self, tag, attrs): - self.tag = {tag:{attr[0]: attr[1] for attr in attrs}} - - pageSource = (urllib.request.urlopen(url).read().decode().split('\n')) - - """ FIND WHERE ALBUM STARTS IN ORDER NOT TO GET WRONG LINKS""" - for i in range(len(pageSource)): - obj = EromeParser() - obj.feed(pageSource[i]) - tag = obj.tag - - if tag is not None: - if "div" in tag: - if "id" in tag["div"]: - if tag["div"]["id"] == "album": - lineNumber = i - break - - for line in pageSource[lineNumber:]: - obj = EromeParser() - obj.feed(line) - tag = obj.tag - if tag is not None: - if "img" in tag: - if "class" in tag["img"]: - if tag["img"]["class"]=="img-front": - content.append(tag["img"]["src"]) - elif "source" in tag: - content.append(tag["source"]["src"]) - - return [ - link for link in content \ - if link.endswith("_480p.mp4") or not link.endswith(".mp4") - ] - -class Imgur: - def __init__(self,directory,post): - self.imgurClient = self.initImgur() - - imgurID = self.getId(post['postURL']) - content = self.getLink(imgurID) - - if not os.path.exists(directory): os.makedirs(directory) - - if content['type'] == 'image': - - try: - post['mediaURL'] = content['object'].mp4 - except AttributeError: - post['mediaURL'] = content['object'].link - - post['postExt'] = getExtension(post['mediaURL']) - - title = nameCorrector(post['postTitle']) - - """Filenames are declared here""" - - print(post["postSubmitter"]+"_"+title+"_"+post['postId']+post['postExt']) - - fileDir = directory / ( - post["postSubmitter"] - + "_" + title - + "_" + post['postId'] - + post['postExt'] - ) - - tempDir = directory / ( - post["postSubmitter"] - + "_" + title - + "_" + post['postId'] - + ".tmp" - ) - - try: - getFile(fileDir,tempDir,post['mediaURL']) - except FileNameTooLong: - fileDir = directory / post['postId'] + post['postExt'] - tempDir = directory / post['postId'] + '.tmp' - getFile(fileDir,tempDir,post['mediaURL']) - - elif content['type'] == 'album': - exceptionType = "" - images = content['object'].images - imagesLenght = len(images) - howManyDownloaded = imagesLenght - duplicates = 0 - - title = nameCorrector(post['postTitle']) - print(post["postSubmitter"]+"_"+title+"_"+post['postId'],end="\n\n") - - folderDir = directory / ( - post["postSubmitter"] + "_" + title + "_" + post['postId'] - ) - - try: - if not os.path.exists(folderDir): - os.makedirs(folderDir) - except FileNotFoundError: - folderDir = directory / post['postId'] - os.makedirs(folderDir) - - for i in range(imagesLenght): - try: - imageURL = images[i]['mp4'] - except KeyError: - imageURL = images[i]['link'] - - images[i]['Ext'] = getExtension(imageURL) - - fileName = (str(i+1) - + "_" - + nameCorrector(str(images[i]['title'])) - + "_" - + images[i]['id']) - - """Filenames are declared here""" - - fileDir = folderDir / (fileName + images[i]['Ext']) - tempDir = folderDir / (fileName + ".tmp") - - print(" ({}/{})".format(i+1,imagesLenght)) - print(" {}".format(fileName+images[i]['Ext'])) - - try: - getFile(fileDir,tempDir,imageURL,indent=2) - print() - except FileAlreadyExistsError: - print(" The file already exists" + " "*10,end="\n\n") - duplicates += 1 - howManyDownloaded -= 1 - - # IF FILE NAME IS TOO LONG, IT WONT REGISTER - except FileNameTooLong: - fileName = (str(i+1) + "_" + images[i]['id']) - fileDir = folderDir / (fileName + images[i]['Ext']) - tempDir = folderDir / (fileName + ".tmp") - try: - getFile(fileDir,tempDir,imageURL,indent=2) - # IF STILL TOO LONG - except FileNameTooLong: - fileName = str(i+1) - fileDir = folderDir / (fileName + images[i]['Ext']) - tempDir = folderDir / (fileName + ".tmp") - getFile(fileDir,tempDir,imageURL,indent=2) - - except Exception as exception: - print("\n Could not get the file") - print( - " " - + "{class_name}: {info}".format( - class_name=exception.__class__.__name__, - info=str(exception) - ) - + "\n" - ) - exceptionType = exception - howManyDownloaded -= 1 - - if duplicates == imagesLenght: - raise FileAlreadyExistsError - elif howManyDownloaded + duplicates < imagesLenght: - raise AlbumNotDownloadedCompletely( - "Album Not Downloaded Completely" - ) - - @staticmethod - def initImgur(): - """Initialize imgur api""" - - config = GLOBAL.config - return imgurpython.ImgurClient( - config['imgur_client_id'], - config['imgur_client_secret'] - ) - def getId(self,submissionURL): - """Extract imgur post id - and determine if its a single image or album - """ - - domainLenght = len("imgur.com/") - if submissionURL[-1] == "/": - submissionURL = submissionURL[:-1] - - if "a/" in submissionURL or "gallery/" in submissionURL: - albumId = submissionURL.split("/")[-1] - return {'id':albumId, 'type':'album'} - - else: - url = submissionURL.replace('.','/').split('/') - imageId = url[url.index('com')+1] - return {'id':imageId, 'type':'image'} - - def getLink(self,identity): - """Request imgur object from imgur api - """ - - if identity['type'] == 'image': - return {'object':self.imgurClient.get_image(identity['id']), - 'type':'image'} - elif identity['type'] == 'album': - return {'object':self.imgurClient.get_album(identity['id']), - 'type':'album'} - @staticmethod - def get_credits(): - return Imgur.initImgur().get_credits() - -class Gfycat: - def __init__(self,directory,POST): - try: - POST['mediaURL'] = self.getLink(POST['postURL']) - except IndexError: - raise NotADownloadableLinkError("Could not read the page source") - except Exception as exception: - #debug - raise exception - raise NotADownloadableLinkError("Could not read the page source") - - POST['postExt'] = getExtension(POST['mediaURL']) - - if not os.path.exists(directory): os.makedirs(directory) - title = nameCorrector(POST['postTitle']) - - """Filenames are declared here""" - - print(POST["postSubmitter"]+"_"+title+"_"+POST['postId']+POST['postExt']) - - fileDir = directory / ( - POST["postSubmitter"]+"_"+title+"_"+POST['postId']+POST['postExt'] - ) - tempDir = directory / ( - POST["postSubmitter"]+"_"+title+"_"+POST['postId']+".tmp" - ) - - try: - getFile(fileDir,tempDir,POST['mediaURL']) - except FileNameTooLong: - fileDir = directory / (POST['postId']+POST['postExt']) - tempDir = directory / (POST['postId']+".tmp") - - getFile(fileDir,tempDir,POST['mediaURL']) - - def getLink(self, url, query='" \ "alert(\"You can go back to terminal window now.\");" \ "" @@ -299,10 +299,14 @@ def redditSearcher(posts,SINGLE_POST=False): orderCount = 0 global gfycatCount gfycatCount = 0 + global redgifsCount + redgifsCount = 0 global imgurCount imgurCount = 0 global eromeCount eromeCount = 0 + global gifDeliveryNetworkCount + gifDeliveryNetworkCount = 0 global directCount directCount = 0 global selfCount @@ -394,9 +398,11 @@ def redditSearcher(posts,SINGLE_POST=False): def checkIfMatching(submission): global gfycatCount + global redgifsCount global imgurCount global eromeCount global directCount + global gifDeliveryNetworkCount global selfCount try: @@ -424,6 +430,16 @@ def checkIfMatching(submission): eromeCount += 1 return details + elif 'redgifs' in submission.domain: + details['postType'] = 'redgifs' + redgifsCount += 1 + return details + + elif 'gifdeliverynetwork' in submission.domain: + details['postType'] = 'gifdeliverynetwork' + gifDeliveryNetworkCount += 1 + return details + elif submission.is_self: details['postType'] = 'self' details['postContent'] = submission.selftext @@ -495,7 +511,7 @@ def isDirectLink(URL): return False for extension in imageTypes: - if extension in URL: + if extension in URL.split("/")[-1]: return URL else: return False diff --git a/src/tools.py b/src/utils.py similarity index 98% rename from src/tools.py rename to src/utils.py index 7299f0f..ba6e71e 100644 --- a/src/tools.py +++ b/src/utils.py @@ -11,7 +11,7 @@ class GLOBAL: """Declare global variables""" RUN_TIME = 0 - config = None + config = {'imgur_client_id':None, 'imgur_client_secret': None} arguments = None directory = None defaultConfigDirectory = Path.home() / "Bulk Downloader for Reddit"