1
0
Fork 0
mirror of synced 2024-06-26 10:00:20 +12:00
bulk-downloader-for-reddit/src/downloader.py

533 lines
18 KiB
Python
Raw Normal View History

2018-07-10 10:30:50 +12:00
import io
import json
2018-07-10 07:58:11 +12:00
import os
import sys
import urllib.request
2018-07-24 08:16:56 +12:00
from html.parser import HTMLParser
from multiprocessing import Queue
2018-07-10 07:58:11 +12:00
from pathlib import Path
2018-07-24 08:33:11 +12:00
from urllib.error import HTTPError
2018-07-10 07:58:11 +12:00
import imgurpython
from bs4 import BeautifulSoup
2018-07-10 07:58:11 +12:00
from src.errors import (AlbumNotDownloadedCompletely, FileAlreadyExistsError,
FileNameTooLong, ImgurLoginError,
NotADownloadableLinkError)
from src.tools import GLOBAL, nameCorrector, printToFile
2018-07-10 10:30:50 +12:00
# Keep a reference to the builtin print before it is shadowed below;
# used where raw, unlogged output is needed (e.g. Self.writeToFile).
VanillaPrint = print
# Shadow builtin print for this module with the project's printToFile
# helper so console output is also recorded to the log file.
print = printToFile
def dlProgress(count, blockSize, totalSize):
    """Write download progress to the console on a single line.

    Passed as the ``reporthook`` of ``urllib.request.urlretrieve``:
    *count* blocks of *blockSize* bytes transferred out of *totalSize*.
    """
    megabytesSoFar = int(count * blockSize * (10 ** (-6)))
    megabytesTotal = int(totalSize * (10 ** (-6)))

    # Carriage return (no newline) keeps rewriting the same line.
    sys.stdout.write("{}Mb/{}Mb\r".format(megabytesSoFar, megabytesTotal))
    sys.stdout.flush()
def getExtension(link):
    """Extract the file extension from a media link.

    Returns the extension (with a leading dot) when the link's LAST
    dot-separated segment is a known media type.  Otherwise falls back
    to '.mp4' for v.redd.it links and '.jpg' for everything else.

    BUGFIX: the old version matched a known type appearing in ANY
    dot-separated segment but still returned the last segment, so a
    link like "archive.gif.tmp" produced ".tmp".  Only the final
    segment is consulted now.
    """
    imageTypes = ['jpg', 'png', 'mp4', 'webm', 'gif']
    lastSegment = link.split('.')[-1]

    if lastSegment in imageTypes:
        return "." + lastSegment

    # No recognizable extension: v.redd.it posts are videos, assume
    # anything else is a plain image.
    if "v.redd.it" in link:
        return '.mp4'
    return '.jpg'
2018-07-10 07:58:11 +12:00
def getFile(fileDir,tempDir,imageURL,indent=0):
    """Download *imageURL*, retrying up to three times on connection resets.

    fileDir  -- full path of the final file
    tempDir  -- same path with a '.tmp' extension; the download is written
                there first and renamed to *fileDir* on success
    imageURL -- URL of the file to be downloaded
    indent   -- number of spaces console messages are indented with

    Raises FileAlreadyExistsError when *fileDir* already exists,
    FileNameTooLong when the OS rejects the path, and re-raises the last
    ConnectionResetError when all three attempts fail.
    """
    headers = [
        ("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                       "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 "
                       "Safari/537.36 OPR/54.0.2952.64"),
        ("Accept", "text/html,application/xhtml+xml,application/xml;"
                   "q=0.9,image/webp,image/apng,*/*;q=0.8"),
        ("Accept-Charset", "ISO-8859-1,utf-8;q=0.7,*;q=0.3"),
        ("Accept-Encoding", "none"),
        ("Accept-Language", "en-US,en;q=0.8"),
        ("Connection", "keep-alive")
    ]

    opener = urllib.request.build_opener()
    # Browser-like headers are sent for every host EXCEPT imgur.
    # NOTE(review): presumably imgur rejects these headers — confirm.
    if "imgur" not in imageURL:
        opener.addheaders = headers
    urllib.request.install_opener(opener)

    if os.path.isfile(fileDir):
        raise FileAlreadyExistsError

    lastException = None
    for _ in range(3):
        try:
            urllib.request.urlretrieve(imageURL,
                                       tempDir,
                                       reporthook=dlProgress)
            os.rename(tempDir,fileDir)
        except ConnectionResetError as exception:
            lastException = exception
            print(" "*indent + str(exception))
            print(" "*indent + "Trying again\n")
        except FileNotFoundError:
            # The OS refused to create the file: the path (most likely
            # the file name) is too long.
            raise FileNameTooLong
        else:
            print(" "*indent+"Downloaded"+" "*10)
            break
    else:
        # BUGFIX: previously three failed attempts fell through silently
        # and the caller assumed the download had succeeded.
        raise lastException
2018-07-24 08:16:56 +12:00
class Erome:
    """Download an Erome post: either a single file or a whole album."""

    def __init__(self,directory,post):
        """Scrape the post page and download every media file found.

        directory -- pathlib.Path of the folder to download into
        post      -- dict describing the reddit submission (postURL,
                     postTitle, postId, postSubmitter, ...)

        Raises NotADownloadableLinkError, FileNameTooLong,
        FileAlreadyExistsError and AlbumNotDownloadedCompletely.
        """
        try:
            IMAGES = self.getLinks(post['postURL'])
        except urllib.error.HTTPError:
            raise NotADownloadableLinkError("Not a downloadable link")

        # NOTE(review): "Lenght" is a long-standing typo in this codebase.
        imagesLenght = len(IMAGES)
        howManyDownloaded = imagesLenght
        duplicates = 0

        if imagesLenght == 1:
            # Single file: store it directly in *directory*.
            extension = getExtension(IMAGES[0])

            """Filenames are declared here"""
            title = nameCorrector(post['postTitle'])
            print(post["postSubmitter"]+"_"+title+"_"+post['postId']+extension)

            fileDir = directory / (
                post["postSubmitter"]+"_"+title+"_"+post['postId']+extension
            )
            tempDir = directory / (
                post["postSubmitter"]+"_"+title+"_"+post['postId']+".tmp"
            )

            # Scraped links appear to be protocol-relative ("//...") —
            # the scheme is prefixed here.
            imageURL = "https:" + IMAGES[0]

            try:
                getFile(fileDir,tempDir,imageURL)
            except FileNameTooLong:
                # Retry with the short, id-only filename.
                fileDir = directory / (post['postId'] + extension)
                tempDir = directory / (post['postId'] + '.tmp')
                getFile(fileDir,tempDir,imageURL)
        else:
            # Album: create a sub-folder and number the files inside it.
            title = nameCorrector(post['postTitle'])
            print(post["postSubmitter"]+"_"+title+"_"+post['postId'],end="\n\n")

            folderDir = directory / (
                post["postSubmitter"] + "_" + title + "_" + post['postId']
            )

            try:
                if not os.path.exists(folderDir):
                    os.makedirs(folderDir)
            except FileNotFoundError:
                # Folder name rejected by the filesystem; fall back to id.
                folderDir = directory / post['postId']
                os.makedirs(folderDir)

            for i in range(imagesLenght):
                extension = getExtension(IMAGES[i])
                fileName = str(i+1)
                imageURL = "https:" + IMAGES[i]

                fileDir = folderDir / (fileName + extension)
                tempDir = folderDir / (fileName + ".tmp")

                print(" ({}/{})".format(i+1,imagesLenght))
                print(" {}".format(fileName+extension))

                try:
                    getFile(fileDir,tempDir,imageURL,indent=2)
                    print()
                except FileAlreadyExistsError:
                    print(" The file already exists" + " "*10,end="\n\n")
                    duplicates += 1
                    howManyDownloaded -= 1
                except Exception as exception:
                    # raise exception
                    print("\n Could not get the file")
                    print(
                        " "
                        + "{class_name}: {info}".format(
                            class_name=exception.__class__.__name__,
                            info=str(exception)
                        )
                        + "\n"
                    )
                    # NOTE(review): assigned but never read afterwards.
                    exceptionType = exception
                    howManyDownloaded -= 1

            # Summarize the loop's outcome for the caller: all duplicates
            # vs. some non-duplicate failures.
            if duplicates == imagesLenght:
                raise FileAlreadyExistsError
            elif howManyDownloaded + duplicates < imagesLenght:
                raise AlbumNotDownloadedCompletely(
                    "Album Not Downloaded Completely"
                )

    def getLinks(self,url,lineNumber=129):
        """Parse the album page at *url* and return its media links.

        NOTE(review): the *lineNumber* parameter is unused — it is
        overwritten immediately below; kept for signature compatibility.
        """
        content = []
        lineNumber = None

        class EromeParser(HTMLParser):
            # Records the most recent start tag as {tag: {attr: value}}.
            tag = None
            def handle_starttag(self, tag, attrs):
                self.tag = {tag:{attr[0]: attr[1] for attr in attrs}}

        pageSource = (urllib.request.urlopen(url).read().decode().split('\n'))

        """ FIND WHERE ALBUM STARTS IN ORDER NOT TO GET WRONG LINKS"""
        for i in range(len(pageSource)):
            obj = EromeParser()
            obj.feed(pageSource[i])
            tag = obj.tag

            if tag is not None:
                if "div" in tag:
                    if "id" in tag["div"]:
                        if tag["div"]["id"] == "album":
                            lineNumber = i
                            break

        # Collect image and video sources from the album part of the page.
        for line in pageSource[lineNumber:]:
            obj = EromeParser()
            obj.feed(line)
            tag = obj.tag
            if tag is not None:
                if "img" in tag:
                    if "class" in tag["img"]:
                        if tag["img"]["class"]=="img-front":
                            content.append(tag["img"]["src"])
                elif "source" in tag:
                    content.append(tag["source"]["src"])

        # Keep 480p video renditions, drop other .mp4s; keep all images.
        return [
            link for link in content \
            if link.endswith("_480p.mp4") or not link.endswith(".mp4")
        ]
2018-07-10 07:58:11 +12:00
class Imgur:
    """Download imgur posts (single images and albums) via the imgur API."""

    def __init__(self,directory,post):
        """Resolve the imgur link in *post* and download its content.

        directory -- pathlib.Path of the folder to download into
        post      -- dict describing the reddit submission; mediaURL and
                     postExt are filled in here

        Raises FileNameTooLong, FileAlreadyExistsError and
        AlbumNotDownloadedCompletely.
        """
        self.imgurClient = self.initImgur()

        imgurID = self.getId(post['postURL'])
        content = self.getLink(imgurID)

        if not os.path.exists(directory): os.makedirs(directory)

        if content['type'] == 'image':
            # Prefer the .mp4 rendition when the API object has one
            # (animated posts); fall back to the plain link.
            try:
                post['mediaURL'] = content['object'].mp4
            except AttributeError:
                post['mediaURL'] = content['object'].link

            post['postExt'] = getExtension(post['mediaURL'])

            title = nameCorrector(post['postTitle'])

            # Filenames are declared here
            print(post["postSubmitter"]+"_"+title+"_"+post['postId']+post['postExt'])

            fileDir = directory / (
                post["postSubmitter"]
                + "_" + title
                + "_" + post['postId']
                + post['postExt']
            )
            tempDir = directory / (
                post["postSubmitter"]
                + "_" + title
                + "_" + post['postId']
                + ".tmp"
            )
            try:
                getFile(fileDir,tempDir,post['mediaURL'])
            except FileNameTooLong:
                # BUGFIX: the concatenation must be parenthesized —
                # ``directory / post['postId'] + post['postExt']`` binds as
                # ``(directory / post['postId']) + ...`` and raised TypeError
                # because pathlib.Path does not support ``+``.
                fileDir = directory / (post['postId'] + post['postExt'])
                tempDir = directory / (post['postId'] + '.tmp')
                getFile(fileDir,tempDir,post['mediaURL'])

        elif content['type'] == 'album':
            images = content['object'].images
            imagesLenght = len(images)
            howManyDownloaded = imagesLenght
            duplicates = 0

            title = nameCorrector(post['postTitle'])
            print(post["postSubmitter"]+"_"+title+"_"+post['postId'],end="\n\n")

            folderDir = directory / (
                post["postSubmitter"] + "_" + title + "_" + post['postId']
            )

            try:
                if not os.path.exists(folderDir):
                    os.makedirs(folderDir)
            except FileNotFoundError:
                # Folder name rejected by the filesystem; use the id only.
                folderDir = directory / post['postId']
                os.makedirs(folderDir)

            for i in range(imagesLenght):
                try:
                    imageURL = images[i]['mp4']
                except KeyError:
                    imageURL = images[i]['link']

                images[i]['Ext'] = getExtension(imageURL)

                # Filenames are declared here
                fileName = (str(i+1)
                            + "_"
                            + nameCorrector(str(images[i]['title']))
                            + "_"
                            + images[i]['id'])

                fileDir = folderDir / (fileName + images[i]['Ext'])
                tempDir = folderDir / (fileName + ".tmp")

                print(" ({}/{})".format(i+1,imagesLenght))
                print(" {}".format(fileName+images[i]['Ext']))

                try:
                    getFile(fileDir,tempDir,imageURL,indent=2)
                    print()
                except FileAlreadyExistsError:
                    print(" The file already exists" + " "*10,end="\n\n")
                    duplicates += 1
                    howManyDownloaded -= 1

                # IF FILE NAME IS TOO LONG, IT WONT REGISTER
                except FileNameTooLong:
                    fileName = (str(i+1) + "_" + images[i]['id'])
                    fileDir = folderDir / (fileName + images[i]['Ext'])
                    tempDir = folderDir / (fileName + ".tmp")
                    try:
                        getFile(fileDir,tempDir,imageURL,indent=2)
                    # IF STILL TOO LONG
                    except FileNameTooLong:
                        fileName = str(i+1)
                        fileDir = folderDir / (fileName + images[i]['Ext'])
                        tempDir = folderDir / (fileName + ".tmp")
                        getFile(fileDir,tempDir,imageURL,indent=2)

                except Exception as exception:
                    print("\n Could not get the file")
                    print(
                        " "
                        + "{class_name}: {info}".format(
                            class_name=exception.__class__.__name__,
                            info=str(exception)
                        )
                        + "\n"
                    )
                    howManyDownloaded -= 1

            # All duplicates vs. some non-duplicate failures.
            if duplicates == imagesLenght:
                raise FileAlreadyExistsError
            elif howManyDownloaded + duplicates < imagesLenght:
                raise AlbumNotDownloadedCompletely(
                    "Album Not Downloaded Completely"
                )

    @staticmethod
    def initImgur():
        """Initialize an imgur api client from the global config."""
        config = GLOBAL.config
        return imgurpython.ImgurClient(
            config['imgur_client_id'],
            config['imgur_client_secret']
        )

    def getId(self,submissionURL):
        """Extract the imgur post id from *submissionURL* and report
        whether it is a single image or an album.
        """
        if submissionURL[-1] == "/":
            submissionURL = submissionURL[:-1]

        if "a/" in submissionURL or "gallery/" in submissionURL:
            albumId = submissionURL.split("/")[-1]
            return {'id':albumId, 'type':'album'}
        else:
            # Take the path segment right after the domain, e.g.
            # "https://imgur.com/<id>" -> "<id>".
            url = submissionURL.replace('.','/').split('/')
            imageId = url[url.index('com')+1]
            return {'id':imageId, 'type':'image'}

    def getLink(self,identity):
        """Request the imgur object described by *identity* from the api."""
        if identity['type'] == 'image':
            return {'object':self.imgurClient.get_image(identity['id']),
                    'type':'image'}
        elif identity['type'] == 'album':
            return {'object':self.imgurClient.get_album(identity['id']),
                    'type':'album'}

    @staticmethod
    def get_credits():
        """Return the result of the api client's ``get_credits`` call."""
        return Imgur.initImgur().get_credits()
2018-07-10 07:58:11 +12:00
class Gfycat:
    """Download a gfycat video post."""

    def __init__(self,directory,POST):
        """Resolve the direct video link for *POST* and download it.

        directory -- pathlib.Path of the folder to download into
        POST      -- dict describing the reddit submission; mediaURL and
                     postExt are filled in here

        Raises NotADownloadableLinkError, FileNameTooLong and
        FileAlreadyExistsError.
        """
        try:
            POST['mediaURL'] = self.getLink(POST['postURL'])
        except IndexError:
            raise NotADownloadableLinkError("Could not read the page source")
        except Exception:
            # BUGFIX: a leftover debug ``raise exception`` made this
            # fallback unreachable; any parsing failure now surfaces as
            # NotADownloadableLinkError as originally intended.
            raise NotADownloadableLinkError("Could not read the page source")

        POST['postExt'] = getExtension(POST['mediaURL'])

        if not os.path.exists(directory): os.makedirs(directory)

        title = nameCorrector(POST['postTitle'])

        # Filenames are declared here
        print(POST["postSubmitter"]+"_"+title+"_"+POST['postId']+POST['postExt'])

        fileDir = directory / (
            POST["postSubmitter"]+"_"+title+"_"+POST['postId']+POST['postExt']
        )
        tempDir = directory / (
            POST["postSubmitter"]+"_"+title+"_"+POST['postId']+".tmp"
        )

        try:
            getFile(fileDir,tempDir,POST['mediaURL'])
        except FileNameTooLong:
            # Retry with the short, id-only filename.
            fileDir = directory / (POST['postId']+POST['postExt'])
            tempDir = directory / (POST['postId']+".tmp")
            getFile(fileDir,tempDir,POST['mediaURL'])

    def getLink(self, url, query='<source id="mp4Source" src=', lineNumber=105):
        """Extract the direct link to the video from the page source.

        Links that are already direct (.webm/.mp4/.gif) are returned
        unchanged.  NOTE(review): *query* and *lineNumber* are unused
        leftovers; kept only for signature compatibility.
        """
        if '.webm' in url or '.mp4' in url or '.gif' in url:
            return url

        if url[-1:] == '/':
            url = url[:-1]

        url = "https://gfycat.com/" + url.split('/')[-1]

        pageSource = (urllib.request.urlopen(url).read().decode())

        soup = BeautifulSoup(pageSource, "html.parser")
        attributes = {"data-react-helmet":"true","type":"application/ld+json"}
        content = soup.find("script",attrs=attributes)

        if content is None:
            raise NotADownloadableLinkError("Could not read the page source")

        # The ld+json metadata script carries the direct content URL.
        return json.loads(content.text)["video"]["contentUrl"]
2018-07-10 07:58:11 +12:00
class Direct:
    """Download a post whose URL already points straight at the media file."""

    def __init__(self, directory, POST):
        """Download POST['postURL'] into *directory*.

        directory -- pathlib.Path of the folder to download into
        POST      -- dict describing the reddit submission; postExt is
                     filled in here
        """
        POST['postExt'] = getExtension(POST['postURL'])

        if not os.path.exists(directory):
            os.makedirs(directory)

        title = nameCorrector(POST['postTitle'])

        # Filenames are declared here
        baseName = POST["postSubmitter"] + "_" + title + "_" + POST['postId']
        print(baseName + POST['postExt'])

        fileDir = directory / (baseName + POST['postExt'])
        tempDir = directory / (baseName + ".tmp")

        try:
            getFile(fileDir, tempDir, POST['postURL'])
        except FileNameTooLong:
            # Fall back to the bare post id when the full name is too long.
            fileDir = directory / (POST['postId'] + POST['postExt'])
            tempDir = directory / (POST['postId'] + ".tmp")
            getFile(fileDir, tempDir, POST['postURL'])
class Self:
    """Save a text (self) post as a markdown file."""

    def __init__(self, directory, post):
        """Write *post* to '<submitter>_<title>_<id>.md' in *directory*.

        Raises FileAlreadyExistsError when the target file exists.
        """
        if not os.path.exists(directory):
            os.makedirs(directory)

        title = nameCorrector(post['postTitle'])

        # Filenames are declared here
        fileName = post["postSubmitter"] + "_" + title + "_" + post['postId'] + ".md"
        print(fileName)

        fileDir = directory / fileName

        if Path.is_file(fileDir):
            raise FileAlreadyExistsError

        try:
            self.writeToFile(fileDir, post)
        except FileNotFoundError:
            # Full name rejected by the filesystem; retry with id only.
            self.writeToFile(directory / (post['postId'] + ".md"), post)

    @staticmethod
    def writeToFile(directory, post):
        """Self posts are formatted as markdown and written here."""
        content = (
            "## [{title}]({url})\n"
            "{body}\n\n---\n\n"
            "submitted to [r/{sub}](https://www.reddit.com/r/{sub})"
            " by [u/{author}](https://www.reddit.com/user/{author})"
        ).format(
            title=post["postTitle"],
            url=post["postURL"],
            body=post["postContent"],
            sub=post["postSubreddit"],
            author=post["postSubmitter"]
        )

        with io.open(directory,"w",encoding="utf-8") as FILE:
            VanillaPrint(content,file=FILE)

        print("Downloaded")