1
0
Fork 0
mirror of synced 2024-06-29 11:30:30 +12:00
bulk-downloader-for-reddit/src/downloader.py

464 lines
16 KiB
Python
Raw Normal View History

2018-07-10 10:30:50 +12:00
import io
2018-07-10 07:58:11 +12:00
import os
import sys
import urllib.request
2018-07-24 08:16:56 +12:00
from html.parser import HTMLParser
2018-07-10 07:58:11 +12:00
from pathlib import Path
2018-07-24 08:33:11 +12:00
from urllib.error import HTTPError
2018-07-10 07:58:11 +12:00
import imgurpython
2018-07-12 09:09:20 +12:00
from multiprocessing import Queue
2018-07-10 07:58:11 +12:00
from src.errors import (AlbumNotDownloadedCompletely, FileAlreadyExistsError,
FileNameTooLong, ImgurLoginError,
NotADownloadableLinkError)
from src.tools import GLOBAL, nameCorrector, printToFile
2018-07-10 10:30:50 +12:00
# Keep a handle on the built-in print before the name `print` is rebound
# in this module; Self.writeToFile uses it to write into a file object.
VanillaPrint = print
2018-07-10 07:58:11 +12:00
# From here on, every print() call in this module goes through
# printToFile (imported from src.tools) instead of the built-in.
print = printToFile
def dlProgress(count, blockSize, totalSize):
    """Report hook that echoes download progress to the console.

    Writes "<downloaded>Mb/<total>Mb" to stdout, then the same number of
    backspaces so that the next message overwrites it.

    count -- number of blocks transferred so far
    blockSize -- size of one block in bytes
    totalSize -- total size of the file in bytes
    """

    toMegabytes = 10**(-6)
    doneMbs = int(count*blockSize*toMegabytes)
    totalMbs = int(totalSize*toMegabytes)

    progress = "\r{}Mb/{}Mb".format(doneMbs, totalMbs)
    sys.stdout.write(progress)
    # Move the cursor back over what was just written
    sys.stdout.write("\b"*len(progress))
    sys.stdout.flush()
def getExtension(link):
    """Extract file extension from a media link.

    Only the path component of the link is inspected, so a query string
    or fragment ("video.mp4?source=fallback") and dots in the host name
    do not confuse the result. The comparison is case-insensitive and
    the returned extension is lower-case.

    link -- URL (or URL-like string) of the media file

    Returns the extension with a leading dot when it is one of the known
    types. Otherwise returns '.mp4' for v.redd.it links and '.jpg' for
    everything else.
    """

    # Local import so the block does not depend on a new file-level import
    from urllib.parse import urlsplit

    imageTypes = ('jpg', 'png', 'mp4', 'webm', 'gif')

    path = urlsplit(link).path
    extension = os.path.splitext(path)[1].lstrip('.').lower()

    if extension in imageTypes:
        return "." + extension

    if "v.redd.it" in link:
        return '.mp4'
    return '.jpg'
2018-07-10 07:58:11 +12:00
def getFile(fileDir,tempDir,imageURL,indent=0):
    """Downloads given file to given directory.

    The file is first retrieved to tempDir and renamed to fileDir once
    the transfer has finished. A transfer interrupted by a connection
    reset is retried, up to three attempts in total.

    fileDir -- Full path of the final file
    tempDir -- Full path of the temporary file (the '.tmp' one)
    imageURL -- URL to the file to be downloaded
    indent -- Number of spaces put in front of the console messages

    Raises FileAlreadyExistsError if fileDir already exists,
    FileNameTooLong if the download or the rename fails with
    FileNotFoundError (too long file names seem not working), and
    ConnectionResetError if the last attempt is reset as well.
    """

    if os.path.isfile(fileDir):
        raise FileAlreadyExistsError

    attempts = 3
    for attempt in range(1, attempts + 1):
        try:
            urllib.request.urlretrieve(imageURL,
                                       tempDir,
                                       reporthook=dlProgress)
            os.rename(tempDir,fileDir)
        except ConnectionResetError as exception:
            print(" "*indent + str(exception))
            if attempt == attempts:
                # Out of retries: do not return as if the file was saved
                raise
            print(" "*indent + "Trying again\n")
        except FileNotFoundError:
            raise FileNameTooLong
        else:
            print(" "*indent+"Downloaded"+" "*10)
            return
2018-07-24 08:16:56 +12:00
class Erome:
    """Downloader for erome posts.

    Creating an instance downloads the post: a post with a single media
    link is saved as one file inside `directory`, a post with several
    links is saved as numbered files inside a folder of its own.
    """

    def __init__(self,directory,post):
        """Download every media file linked from the post's page.

        directory -- pathlib.Path of the folder to download into
        post -- dict with the keys 'postURL', 'postTitle' and 'postId'

        Raises NotADownloadableLinkError if the page cannot be fetched
        or has no media links, FileAlreadyExistsError if everything is
        already on disk, and AlbumNotDownloadedCompletely if at least
        one file of an album could not be downloaded.
        """

        try:
            IMAGES = self.getLinks(post['postURL'])
        except urllib.error.HTTPError:
            raise NotADownloadableLinkError("Not a downloadable link")

        # Without this an empty page would end up as FileAlreadyExistsError
        if not IMAGES:
            raise NotADownloadableLinkError("Not a downloadable link")

        if len(IMAGES) == 1:
            self._downloadSingle(directory, post, IMAGES[0])
        else:
            self._downloadAlbum(directory, post, IMAGES)

    @staticmethod
    def _toURL(link):
        """Return link as a full URL, adding 'https:' when it has no scheme."""

        if link.startswith(("http://", "https://")):
            return link
        return "https:" + link

    def _downloadSingle(self, directory, post, link):
        """Save the only media file of a post directly inside directory."""

        extension = getExtension(link)
        title = nameCorrector(post['postTitle'])
        print(title+"_" +post['postId']+extension)

        # The other downloaders create the target folder; do the same here
        os.makedirs(directory, exist_ok=True)

        fileDir = directory / (title + "_" + post['postId'] + extension)
        tempDir = directory / (title + "_" + post['postId'] + '.tmp')
        imageURL = self._toURL(link)

        try:
            getFile(fileDir,tempDir,imageURL)
        except FileNameTooLong:
            # Retry with a name made of the post id only
            fileDir = directory / (post['postId'] + extension)
            tempDir = directory / (post['postId'] + '.tmp')
            getFile(fileDir,tempDir,imageURL)

    def _downloadAlbum(self, directory, post, links):
        """Save every link as a numbered file inside the post's own folder."""

        title = nameCorrector(post['postTitle'])
        print(title+"_"+post['postId'],end="\n\n")

        folderDir = directory / (title+"_"+post['postId'])
        try:
            os.makedirs(folderDir, exist_ok=True)
        except FileNotFoundError:
            # Fall back to a folder named after the post id only
            folderDir = directory / post['postId']
            os.makedirs(folderDir, exist_ok=True)

        imagesLenght = len(links)
        duplicates = 0
        failed = 0

        for number, link in enumerate(links, start=1):
            extension = getExtension(link)
            fileName = str(number)
            fileDir = folderDir / (fileName + extension)
            tempDir = folderDir / (fileName + ".tmp")

            print(" ({}/{})".format(number,imagesLenght))
            print(" {}".format(fileName+extension))

            try:
                getFile(fileDir,tempDir,self._toURL(link),indent=2)
                print()
            except FileAlreadyExistsError:
                print(" The file already exists" + " "*10,end="\n\n")
                duplicates += 1
            except Exception as exception:
                # One broken file must not abort the rest of the album
                print("\n Could not get the file")
                print(" " + str(exception) + "\n")
                failed += 1

        if duplicates == imagesLenght:
            raise FileAlreadyExistsError
        elif failed:
            raise AlbumNotDownloadedCompletely(
                "Album Not Downloaded Completely"
            )

    def getLinks(self,url,lineNumber=129):
        """Fetch the page at url and return the media links found in it.

        url -- address of the post's page
        lineNumber -- unused, kept so that existing calls keep working
        """

        with urllib.request.urlopen(url) as response:
            pageSource = response.read().decode().split('\n')

        return self._extractLinks(pageSource)

    @staticmethod
    def _extractLinks(pageSource):
        """Return the media links found in the lines of a page.

        pageSource -- list of the lines of the page's HTML

        Each line is parsed on its own and only the last start tag of a
        line is looked at. Scanning starts at the first line holding a
        <div id="album"> in order not to get wrong links; when no such
        line exists the whole page is scanned. Collected are the src of
        <img class="img-front"> and of <source> tags. Of the '.mp4'
        links only those ending with '_480p.mp4' are kept.
        """

        class LastTagParser(HTMLParser):
            tag = None
            def handle_starttag(self, tag, attrs):
                self.tag = {tag: dict(attrs)}

        def lastTag(line):
            parser = LastTagParser()
            parser.feed(line)
            return parser.tag or {}

        albumStart = None
        for index, line in enumerate(pageSource):
            if lastTag(line).get("div", {}).get("id") == "album":
                albumStart = index
                break

        content = []
        for line in pageSource[albumStart:]:
            tag = lastTag(line)
            if "img" in tag:
                if tag["img"].get("class") != "img-front":
                    continue
                source = tag["img"].get("src")
            elif "source" in tag:
                source = tag["source"].get("src")
            else:
                continue
            # A tag without a usable src cannot be downloaded; skip it
            if source:
                content.append(source)

        return [
            link for link in content
            if link.endswith("_480p.mp4") or not link.endswith(".mp4")
        ]
2018-07-10 07:58:11 +12:00
class Imgur:
    """Downloader for imgur posts.

    Creating an instance downloads the post: a single image is saved as
    one file inside `directory`, an album is saved as numbered files
    inside a folder of its own.
    """

    def __init__(self,directory,post):
        """Download the image or album the post links to.

        directory -- pathlib.Path of the folder to download into
        post -- dict with the keys 'postURL', 'postTitle' and 'postId';
                for a single image 'mediaURL' and 'postExt' are added

        Raises FileAlreadyExistsError if everything is already on disk,
        NotADownloadableLinkError if an album has no images, and
        AlbumNotDownloadedCompletely if at least one file of an album
        could not be downloaded.
        """

        self.imgurClient = self.initImgur()

        imgurID = self.getId(post['postURL'])
        content = self.getLink(imgurID)

        os.makedirs(directory, exist_ok=True)

        if content['type'] == 'image':
            self._downloadImage(directory, post, content['object'])
        elif content['type'] == 'album':
            self._downloadAlbum(directory, post, content['object'].images)

    def _downloadImage(self, directory, post, image):
        """Save a single imgur image, preferring its mp4 version."""

        try:
            post['mediaURL'] = image.mp4
        except AttributeError:
            post['mediaURL'] = image.link

        post['postExt'] = getExtension(post['mediaURL'])

        title = nameCorrector(post['postTitle'])
        print(title+"_" +post['postId']+post['postExt'])

        fileDir = directory / (title + "_" + post['postId'] + post['postExt'])
        tempDir = directory / (title + "_" + post['postId'] + '.tmp')

        try:
            getFile(fileDir,tempDir,post['mediaURL'])
        except FileNameTooLong:
            # Parentheses matter: join the name first, then apply "/"
            fileDir = directory / (post['postId'] + post['postExt'])
            tempDir = directory / (post['postId'] + '.tmp')
            getFile(fileDir,tempDir,post['mediaURL'])

    def _downloadAlbum(self, directory, post, images):
        """Save every image of an album inside the post's own folder."""

        imagesLenght = len(images)
        if imagesLenght == 0:
            raise NotADownloadableLinkError("Not a downloadable link")

        title = nameCorrector(post['postTitle'])
        print(title+"_"+post['postId'],end="\n\n")

        folderDir = directory / (title+"_"+post['postId'])
        try:
            os.makedirs(folderDir, exist_ok=True)
        except FileNotFoundError:
            # Fall back to a folder named after the post id only
            folderDir = directory / post['postId']
            os.makedirs(folderDir, exist_ok=True)

        duplicates = 0
        failed = 0

        for number, image in enumerate(images, start=1):
            try:
                imageURL = image['mp4']
            except KeyError:
                imageURL = image['link']

            image['Ext'] = getExtension(imageURL)

            # Candidate names, from the most descriptive to the shortest
            fileNames = [
                (str(number)
                 + "_"
                 + nameCorrector(str(image['title']))
                 + "_"
                 + image['id']),
                str(number) + "_" + image['id'],
                str(number),
            ]

            print(" ({}/{})".format(number,imagesLenght))
            print(" {}".format(fileNames[0]+image['Ext']))

            try:
                self._getWithShorterNames(
                    folderDir, fileNames, image['Ext'], imageURL
                )
                print()
            except FileAlreadyExistsError:
                print(" The file already exists" + " "*10,end="\n\n")
                duplicates += 1
            except Exception as exception:
                print("\n Could not get the file")
                print(" " + str(exception) + "\n")
                failed += 1

        if duplicates == imagesLenght:
            raise FileAlreadyExistsError
        elif failed:
            raise AlbumNotDownloadedCompletely(
                "Album Not Downloaded Completely"
            )

    @staticmethod
    def _getWithShorterNames(folderDir, fileNames, extension, imageURL):
        """Download imageURL, moving on to the next name if one is too long.

        Any error other than FileNameTooLong is raised right away.
        FileNameTooLong is raised only when the last name fails too.
        """

        for position, fileName in enumerate(fileNames):
            fileDir = folderDir / (fileName + extension)
            tempDir = folderDir / (fileName + ".tmp")
            try:
                getFile(fileDir,tempDir,imageURL,indent=2)
                return
            except FileNameTooLong:
                if position == len(fileNames) - 1:
                    raise

    @staticmethod
    def initImgur():
        """Initialize imgur api"""

        config = GLOBAL.config
        return imgurpython.ImgurClient(
            config['imgur_client_id'],
            config['imgur_client_secret']
        )

    def getId(self,submissionURL):
        """Extract imgur post id
        and determine if its a single image or album

        Only the path of the URL is looked at, so a query string or a
        fragment never ends up in the id. The link is an album when one
        of the path segments before the last one is 'a' or 'gallery'.

        Returns a dict with the keys 'id' and 'type', where 'type' is
        either 'album' or 'image'. Raises NotADownloadableLinkError if
        the URL has no path to take an id from.
        """

        # Local import so the block does not depend on a new file-level import
        from urllib.parse import urlsplit

        path = urlsplit(submissionURL).path
        segments = [segment for segment in path.split("/") if segment]

        if not segments:
            raise NotADownloadableLinkError("Not a downloadable link")

        lastSegment = segments[-1]
        parents = segments[:-1]

        if "a" in parents or "gallery" in parents:
            return {'id':lastSegment, 'type':'album'}

        # Direct links look like <id>.<extension>; keep the id part only
        return {'id':lastSegment.split(".")[0], 'type':'image'}

    def getLink(self,identity):
        """Request imgur object from imgur api

        identity -- dict with the keys 'id' and 'type' as returned by getId

        Returns a dict with the requested object under 'object' and its
        kind under 'type'.
        """

        if identity['type'] == 'image':
            return {'object':self.imgurClient.get_image(identity['id']),
                    'type':'image'}
        elif identity['type'] == 'album':
            return {'object':self.imgurClient.get_album(identity['id']),
                    'type':'album'}

    @staticmethod
    def get_credits():
        """Return what get_credits() of a freshly initialized client returns"""

        return Imgur.initImgur().get_credits()
2018-07-10 07:58:11 +12:00
class Gfycat:
    """Downloader for gfycat posts.

    Creating an instance resolves the direct link of the video and
    saves it as one file inside `directory`.
    """

    def __init__(self,directory,POST):
        """Download the video the post links to.

        directory -- pathlib.Path of the folder to download into
        POST -- dict with the keys 'postURL', 'postTitle' and 'postId';
                'mediaURL' and 'postExt' are added to it

        Raises NotADownloadableLinkError if the direct link cannot be
        obtained for any reason.
        """

        try:
            POST['mediaURL'] = self.getLink(POST['postURL'])
        except Exception as exception:
            raise NotADownloadableLinkError(
                "Could not read the page source"
            ) from exception

        POST['postExt'] = getExtension(POST['mediaURL'])

        os.makedirs(directory, exist_ok=True)

        title = nameCorrector(POST['postTitle'])
        print(title+"_"+POST['postId']+POST['postExt'])

        fileDir = directory / (title+"_"+POST['postId']+POST['postExt'])
        tempDir = directory / (title+"_"+POST['postId']+".tmp")

        try:
            getFile(fileDir,tempDir,POST['mediaURL'])
        except FileNameTooLong:
            # Retry with a name made of the post id only
            fileDir = directory / (POST['postId']+POST['postExt'])
            tempDir = directory / (POST['postId']+".tmp")
            getFile(fileDir,tempDir,POST['mediaURL'])

    @staticmethod
    def _extractLink(text, query):
        """Return the quoted value that follows query in text.

        The character right after query is taken to be the opening quote
        and the value runs up to the next '"'. Returns an empty string
        if query or the closing quote is missing.
        """

        start = text.find(query)
        if start == -1:
            return ""

        start += len(query) + 1
        end = text.find('"', start)
        if end == -1:
            return ""

        return text[start:end]

    def getLink(self, url, query='<source id="mp4Source" src=', lineNumber=105):
        """Extract direct link to the video from page's source
        and return it

        url -- link of the post; returned as it is if it contains
               '.webm', '.mp4' or '.gif'
        query -- text that comes right before the quoted direct link
        lineNumber -- index of the line of the page that is searched
                      first; the whole page is searched if the link is
                      not there or the page has no such line

        Raises NotADownloadableLinkError if the link is not in the page.
        """

        if '.webm' in url or '.mp4' in url or '.gif' in url:
            return url

        if url[-1:] == '/':
            url = url[:-1]

        url = "https://gfycat.com/" + url.split('/')[-1]

        with urllib.request.urlopen(url) as response:
            pageSource = response.read().decode().split('\n')

        try:
            theLine = pageSource[lineNumber]
        except IndexError:
            theLine = ""

        link = self._extractLink(theLine, query)
        if not link:
            # The page layout may have changed; look at every line
            link = self._extractLink("\n".join(pageSource), query)

        if not link:
            raise NotADownloadableLinkError("Could not read the page source")

        return link
class Direct:
    """Downloader for posts that link straight to a media file."""

    def __init__(self,directory,POST):
        """Download the file at POST['postURL'] into directory.

        directory -- pathlib.Path of the folder to download into
        POST -- dict with the keys 'postURL', 'postTitle' and 'postId';
                'postExt' is added to it

        The file is named "<title>_<id><extension>". If getFile raises
        FileNameTooLong the download is repeated with the name
        "<id><extension>".
        """

        extension = getExtension(POST['postURL'])
        POST['postExt'] = extension

        if not os.path.exists(directory):
            os.makedirs(directory)

        baseName = nameCorrector(POST['postTitle'])+"_"+POST['postId']
        print(baseName+extension)

        target = directory / (baseName+extension)
        temporary = directory / (baseName+".tmp")

        try:
            getFile(target,temporary,POST['postURL'])
        except FileNameTooLong:
            shortName = POST['postId']
            target = directory / (shortName+extension)
            temporary = directory / (shortName+".tmp")
            getFile(target,temporary,POST['postURL'])
class Self:
    """Downloader for text posts; saves them as markdown files."""

    def __init__(self,directory,post):
        """Write the post to "<title>_<id>.md" inside directory.

        directory -- pathlib.Path of the folder to write into
        post -- dict with the keys 'postTitle', 'postId', 'postURL',
                'postContent' and 'postSubmitter'

        Raises FileAlreadyExistsError if that file is already there. If
        writing fails with FileNotFoundError the post is written to
        "<id>.md" instead.
        """

        if not os.path.exists(directory):
            os.makedirs(directory)

        postId = post['postId']
        longName = nameCorrector(post['postTitle'])+"_"+postId+".md"
        print(longName)

        target = directory / longName
        if target.is_file():
            raise FileAlreadyExistsError

        try:
            self.writeToFile(target,post)
        except FileNotFoundError:
            target = directory / (postId+".md")
            self.writeToFile(target,post)

    @staticmethod
    def writeToFile(directory,post):
        """Write the post as markdown to the file at path `directory`.

        The file holds the title as a heading linking to the post, the
        post's content, a horizontal rule and a link to the submitter.
        """

        submitter = post["postSubmitter"]

        heading = "## [" + post["postTitle"] + "](" + post["postURL"] + ")\n"
        body = post["postContent"] + "\n\n---\n\n"
        footer = ("submitted by [u/"
                  + submitter
                  + "](https://www.reddit.com/user/"
                  + submitter
                  + ")")

        content = heading + body + footer

        with io.open(directory,"w",encoding="utf-8") as FILE:
            # print is rebound in this module, hence the saved built-in
            VanillaPrint(content,file=FILE)

        print("Downloaded")