1
0
Fork 0
mirror of synced 2024-07-03 13:30:45 +12:00
bulk-downloader-for-reddit/script.py

674 lines
22 KiB
Python
Raw Normal View History

2018-07-10 07:58:11 +12:00
#!/usr/bin/env python
"""
This program downloads imgur, gfycat and direct image and video links of
saved posts from a reddit account. It is written in Python 3.
"""
import argparse
import logging
2018-07-10 07:58:11 +12:00
import os
import sys
import time
from io import StringIO
2018-07-10 07:58:11 +12:00
from pathlib import Path, PurePath
2018-07-10 10:30:50 +12:00
from src.downloader import Direct, Gfycat, Imgur, Self
from src.errors import *
2018-07-10 07:58:11 +12:00
from src.parser import LinkDesigner
from src.searcher import getPosts
from src.tools import (GLOBAL, createLogFile, jsonFile, nameCorrector,
printToFile)
# Module metadata: authorship, license and release version of this script.
__author__ = "Ali Parlakci"
__license__ = "GPL"
__version__ = "1.1.1"
__maintainer__ = "Ali Parlakci"
__email__ = "parlakciali@gmail.com"
def getConfig(configFileName):
    """Load the imgur credentials from the given json config file.

    Any credential that is missing or empty is asked for on the console
    and stored in the file. When the file does not exist yet, every
    credential is asked for and the file is populated from the answers.

    Returns whatever ``jsonFile.read()`` yields for the config file after
    it has been completed.
    """

    requiredKeys = ['imgur_client_id',
                    'imgur_client_secret']

    # The existence check must run before a jsonFile object is built for
    # the path, exactly as in the two original branches.
    if not os.path.exists(configFileName):
        freshFile = jsonFile(configFileName)
        answers = {}
        for name in requiredKeys:
            answers[name] = input(name + ": ")
        freshFile.add(answers)
        return freshFile.read()

    storedFile = jsonFile(configFileName)
    stored = storedFile.read()

    # An empty refresh token is removed from the file.
    hasToken = "reddit_refresh_token" in stored
    if hasToken and stored["reddit_refresh_token"] == "":
        storedFile.delete("reddit_refresh_token")

    for name in requiredKeys:
        if name not in stored or stored[name] == "":
            print(name,": ")
            storedFile.add({name:input()})

    # Read the file again so that freshly added values are included.
    return jsonFile(configFileName).read()
def parseArguments(arguments=None):
    """Initialize argparse, register every option and parse the arguments.

    arguments -- optional list of command-line tokens. When it is omitted
                 (or an empty list) the tokens are taken from sys.argv.

    Returns the argparse.Namespace holding the parsed options. argparse
    itself terminates the program (SystemExit) on invalid input.
    """

    if arguments is None or arguments == []:
        argv = sys.argv[1:]
    else:
        argv = arguments

    parser = argparse.ArgumentParser(allow_abbrev=False,
                                     description="This program downloads " \
                                                 "media from reddit " \
                                                 "posts")

    parser.add_argument("--directory",
                        help="Specifies the directory where posts will be " \
                             "downloaded to",
                        metavar="DIRECTORY")

    parser.add_argument("--link","-l",
                        help="Get posts from link",
                        metavar="link")

    parser.add_argument("--saved",
                        action="store_true",
                        help="Triggers saved mode")

    parser.add_argument("--submitted",
                        action="store_true",
                        help="Gets posts of --user")

    parser.add_argument("--upvoted",
                        action="store_true",
                        help="Gets upvoted posts of --user")

    parser.add_argument("--log",
                        help="Takes a log file which created by itself " \
                             "(json files), reads posts and tries downloadin" \
                             "g them again.",
                        metavar="LOG FILE")

    parser.add_argument("--subreddit",
                        nargs="+",
                        help="Triggers subreddit mode and takes subreddit's " \
                             "name without r/. use \"frontpage\" for frontpage",
                        metavar="SUBREDDIT",
                        type=str)

    parser.add_argument("--multireddit",
                        help="Triggers multireddit mode and takes "\
                             "multireddit's name without m/",
                        metavar="MULTIREDDIT",
                        type=str)

    # --user becomes mandatory for the modes that cannot work without a
    # redditor. The decision is based on the tokens that are actually
    # being parsed, not unconditionally on sys.argv.
    parser.add_argument("--user",
                        help="reddit username if needed. use \"me\" for " \
                             "current user",
                        required="--multireddit" in argv or \
                                 "--submitted" in argv,
                        metavar="redditor",
                        type=str)

    parser.add_argument("--search",
                        help="Searches for given query in given subreddits",
                        metavar="query",
                        type=str)

    parser.add_argument("--sort",
                        help="Either hot, top, new, controversial, rising " \
                             "or relevance default: hot",
                        choices=[
                            "hot","top","new","controversial","rising",
                            "relevance"
                        ],
                        metavar="SORT TYPE",
                        type=str)

    parser.add_argument("--limit",
                        help="default: unlimited",
                        metavar="Limit",
                        type=int)

    parser.add_argument("--time",
                        help="Either hour, day, week, month, year or all." \
                             " default: all",
                        choices=["all","hour","day","week","month","year"],
                        metavar="TIME_LIMIT",
                        type=str)

    parser.add_argument("--NoDownload",
                        help="Just gets the posts and store them in a file" \
                             " for downloading later",
                        action="store_true",
                        default=False)

    return parser.parse_args(argv)
def checkConflicts():
    """Check if command-line arguments are given correctly,
    if not, raise errors

    Reads GLOBAL.arguments. Exactly one of the program modes (saved,
    subreddit, submitted, log, link, upvoted) has to be selected. A search
    query is a modifier of a mode rather than a mode of its own, so it may
    accompany a subreddit or a link but not saved, submitted or upvoted
    posts.

    Raises:
        ProgramModeError  -- no mode or more than one mode was selected
        SearchModeError   -- search combined with saved/submitted/upvoted
        RedditorNameError -- submitted/upvoted mode without a --user
    """

    arguments = GLOBAL.arguments

    def isGiven(name):
        # Unset options are None, unset flags are False.
        value = getattr(arguments, name)
        return not (value is None or value is False)

    userGiven = arguments.user is not None
    searching = isGiven("search")

    modes = ["saved","subreddit","submitted","log","link","upvoted"]
    selected = {mode: isGiven(mode) for mode in modes}

    if sum(selected.values()) != 1:
        raise ProgramModeError("Invalid program mode")

    if searching and selected["saved"]:
        raise SearchModeError("You cannot search in your saved posts")

    if searching and selected["submitted"]:
        raise SearchModeError("You cannot search in submitted posts")

    if searching and selected["upvoted"]:
        raise SearchModeError("You cannot search in upvoted posts")

    if (selected["upvoted"] or selected["submitted"]) and not userGiven:
        raise RedditorNameError("No redditor name given")
2018-07-10 07:58:11 +12:00
2018-07-12 03:59:17 +12:00
class PromptUser:
    """Ask for the program mode and its options on the console.

    Creating an instance runs the whole dialogue and stores the answers
    on GLOBAL.arguments. The instance itself keeps no state.
    """

    # Time filters offered whenever a sort type is combined with one.
    _TIME_FILTERS = ["hour","day","week","month","year","all"]

    @staticmethod
    def chooseFrom(choices):
        """Print a numbered menu of choices and return the selected one.

        The answer may be the 1-based number of an entry or the entry's
        name (case-insensitive). "0" or "exit" terminates the program
        through sys.exit(). Invalid answers are asked for again.
        """

        print()
        choicesByIndex = [str(x) for x in range(len(choices)+1)]
        for order, mode in enumerate(choices, start=1):
            print("{indent}[{order}] {mode}".format(
                indent=" "*4,order=order,mode=mode
            ))
        print(" "*4+"[0] exit\n")

        accepted = list(choices)+choicesByIndex+["exit"]

        # Normalise once so that validation and the returned value agree.
        choice = input("> ").strip().lower()
        while choice not in accepted:
            print("Invalid input\n")
            # The retry has to replace the answer that is being validated,
            # otherwise the loop can never end.
            choice = input("> ").strip().lower()

        if choice == "0" or choice == "exit":
            sys.exit()
        elif choice in choicesByIndex:
            return choices[int(choice)-1]
        else:
            return choice

    @staticmethod
    def _chooseSortAndTime(sortTypes, sortsWithTime):
        """Ask for a sort type and, when that sort takes one, a time filter.

        Stores the answers in GLOBAL.arguments.sort and
        GLOBAL.arguments.time. Sort types outside sortsWithTime get the
        time filter "all" without asking.
        """

        print("\nselect sort type:")
        sortType = PromptUser.chooseFrom(sortTypes)
        GLOBAL.arguments.sort = sortType

        if sortType in sortsWithTime:
            print("\nselect time filter:")
            timeFilter = PromptUser.chooseFrom(PromptUser._TIME_FILTERS)
            GLOBAL.arguments.time = timeFilter
        else:
            GLOBAL.arguments.time = "all"

    def __init__(self):
        print("select program mode:")
        programModes = [
            "search","subreddit","multireddit",
            "submitted","upvoted","saved","log"
        ]
        programMode = self.chooseFrom(programModes)

        if programMode == "search":
            GLOBAL.arguments.search = input("\nquery: ")
            GLOBAL.arguments.subreddit = input("\nsubreddit: ")

            # Every search sort type is followed by a time filter prompt.
            searchSorts = ["relevance","top","new"]
            self._chooseSortAndTime(searchSorts, searchSorts)

        elif programMode == "subreddit":
            # Collect names until an empty line is entered. Several names
            # may also be given on one line, separated by whitespace.
            subreddits = []
            while True:
                subredditInput = input("subreddit: ")
                if subredditInput == "":
                    break
                subreddits.extend(subredditInput.split())
            GLOBAL.arguments.subreddit = "+".join(subreddits)

            print(GLOBAL.arguments.subreddit)

            self._chooseSortAndTime(
                ["hot","top","new","rising","controversial"],
                ["top","controversial"]
            )

        elif programMode == "multireddit":
            GLOBAL.arguments.user = input("\nredditor: ")
            # NOTE(review): the multireddit name is stored in the subreddit
            # argument, not in the multireddit one - confirm that this is
            # what the searcher expects.
            GLOBAL.arguments.subreddit = input("\nmultireddit: ")

            self._chooseSortAndTime(
                ["hot","top","new","rising","controversial"],
                ["top","controversial"]
            )

        elif programMode == "submitted":
            GLOBAL.arguments.submitted = True
            GLOBAL.arguments.user = input("\nredditor: ")

            self._chooseSortAndTime(
                ["hot","top","new","controversial"],
                ["top"]
            )

        elif programMode == "upvoted":
            GLOBAL.arguments.upvoted = True
            GLOBAL.arguments.user = input("\nredditor: ")

        elif programMode == "saved":
            GLOBAL.arguments.saved = True

        elif programMode == "log":
            # Keep asking until the answer names an existing file.
            while True:
                GLOBAL.arguments.log = input("\nlog file directory:")
                if Path(GLOBAL.arguments.log ).is_file():
                    break

        # Asked in every mode: repeat until the answer is an integer.
        while True:
            try:
                GLOBAL.arguments.limit = int(input("\nlimit: "))
                break
            except ValueError:
                pass
2018-07-12 03:40:40 +12:00
2018-07-12 03:56:39 +12:00
def prepareAttributes():
    """Turn GLOBAL.arguments into the attribute dictionary for getPosts.

    The result always carries "sort", "time" and "limit" (unless a link
    replaces the dictionary, see below) plus the keys of the selected
    mode: "subreddit", "saved", "upvoted" or "submitted", and "user" /
    "search" when given.

    Side effects: GLOBAL.arguments.sort is replaced by "relevance" when a
    search is combined with hot, controversial or rising;
    GLOBAL.arguments.link loses surrounding double quotes;
    a list of subreddits in GLOBAL.arguments.subreddit is joined with "+".

    Raises:
        InvalidRedditLink  -- propagated from LinkDesigner
        InvalidSortingType -- "rising" requested for submitted posts
    """

    arguments = GLOBAL.arguments
    ATTRIBUTES = {}

    if arguments.user is not None:
        ATTRIBUTES["user"] = arguments.user

    if arguments.search is not None:
        ATTRIBUTES["search"] = arguments.search
        if arguments.sort in ("hot","controversial","rising"):
            arguments.sort = "relevance"

    if arguments.sort is not None:
        ATTRIBUTES["sort"] = arguments.sort
    elif arguments.submitted:
        ATTRIBUTES["sort"] = "new"
    else:
        ATTRIBUTES["sort"] = "hot"

    if arguments.time is not None:
        ATTRIBUTES["time"] = arguments.time
    else:
        ATTRIBUTES["time"] = "all"

    if arguments.link is not None:
        arguments.link = arguments.link.strip("\"")

        # The attributes derived from the link replace everything that was
        # collected so far. Explicitly given options are applied on top.
        # InvalidRedditLink is left to propagate unchanged to the caller.
        ATTRIBUTES = LinkDesigner(arguments.link)

        if arguments.search is not None:
            ATTRIBUTES["search"] = arguments.search

        if arguments.sort is not None:
            ATTRIBUTES["sort"] = arguments.sort

        if arguments.time is not None:
            ATTRIBUTES["time"] = arguments.time

    elif arguments.subreddit is not None:
        if isinstance(arguments.subreddit, list):
            arguments.subreddit = "+".join(arguments.subreddit)

        ATTRIBUTES["subreddit"] = arguments.subreddit

    elif arguments.saved is True:
        ATTRIBUTES["saved"] = True

    elif arguments.upvoted is True:
        ATTRIBUTES["upvoted"] = True

    # --submitted is a flag (False when absent), so it is tested for
    # truth; a comparison against None would always succeed.
    elif arguments.submitted:
        ATTRIBUTES["submitted"] = True

        if arguments.sort == "rising":
            raise InvalidSortingType

    ATTRIBUTES["limit"] = arguments.limit

    return ATTRIBUTES
def postFromLog(fileName):
    """Read a log file and collect the submissions stored in it.

    Prints "File not found" and exits when fileName is not a file. The
    "HEADER" entry is ignored. For each remaining entry the last element
    is taken, and it is kept only when its 'postType' is not None.

    Returns the kept elements as a list.
    """

    logPath = Path(fileName)
    if not logPath.is_file():
        print("File not found")
        sys.exit()

    entries = jsonFile(fileName).read()
    entries.pop("HEADER", None)

    return [
        records[-1]
        for records in entries.values()
        if records[-1]['postType'] is not None
    ]
2018-07-10 07:58:11 +12:00
def postExists(POST):
    """Figure out a file's name and checks if the file already exists

    A post counts as existing when the subreddit folder below
    GLOBAL.directory holds a file named either "<title>_<postId>" or just
    "<postId>" with any of the known extensions.

    POST -- dictionary with the keys 'postTitle', 'postId' and
            'postSubreddit'

    Returns True when such a file exists, False otherwise.
    """

    title = nameCorrector(POST['postTitle'])
    PATH = GLOBAL.directory / POST["postSubreddit"]

    possibleExtensions = [".jpg",".png",".mp4",".gif",".webm",".md"]

    # Long name first, then the short (id only) name. Every extension is
    # tried for both of them.
    candidateNames = [title + "_" + POST['postId'], POST['postId']]

    for FILENAME in candidateNames:
        for extension in possibleExtensions:
            FILE_PATH = PATH / (FILENAME+extension)
            if FILE_PATH.exists():
                return True

    return False
def download(submissions):
    """Analyze list of submissions and call the right function
    to download each one, catch errors, update the log files

    submissions -- list of dictionaries with at least the keys
                   'postType', 'postSubreddit', 'postId' (and whatever
                   postExists and the downloader classes read)

    Posts that already exist are counted as duplicates. Failures are
    recorded in the "FAILED" log file together with their 1-based
    position. A summary is printed at the end. The program exits when
    the imgur login fails.
    """

    total = len(submissions)
    lastRequestTime = 0
    downloadedCount = total
    duplicates = 0

    FAILED_FILE = createLogFile("FAILED")

    for order, submission in enumerate(submissions, start=1):
        print("\n({}/{})".format(order,total))
        print(
            "https://reddit.com/r/{subreddit}/comments/{id}".format(
                subreddit=submission['postSubreddit'],
                id=submission['postId']
            )
        )

        if postExists(submission):
            print(submission['postType'].upper())
            print("It already exists")
            duplicates += 1
            downloadedCount -= 1
            continue

        directory = GLOBAL.directory / submission['postSubreddit']
        postType = submission['postType']

        if postType == 'imgur':
            print("IMGUR",end="")

            # Keep at least three seconds between two imgur downloads.
            # Sleeping replaces spinning on the clock.
            remaining = 3 - (time.time() - lastRequestTime)
            if remaining > 0:
                time.sleep(remaining)

            credit = Imgur.get_credits()

            IMGUR_RESET_TIME = credit['UserReset']-time.time()
            USER_RESET = ("after " \
                          + str(int(IMGUR_RESET_TIME/60)) \
                          + " Minutes " \
                          + str(int(IMGUR_RESET_TIME%60)) \
                          + " Seconds")
            print(
                " => Client: {} - User: {} - Reset {}".format(
                    credit['ClientRemaining'],
                    credit['UserRemaining'],
                    USER_RESET
                )
            )

            if not (credit['UserRemaining'] == 0 or \
                    credit['ClientRemaining'] == 0):

                lastRequestTime = time.time()

                try:
                    Imgur(directory,submission)

                except FileAlreadyExistsError:
                    print("It already exists")
                    duplicates += 1
                    downloadedCount -= 1

                except ImgurLoginError:
                    print(
                        "Imgur login failed. Quitting the program "\
                        "as unexpected errors might occur."
                    )
                    sys.exit()

                except Exception as exception:
                    print(exception)
                    FAILED_FILE.add({order:[str(exception),submission]})
                    downloadedCount -= 1

            else:
                # At least one of the two quotas is used up. The user
                # quota is reported first when both are.
                if credit['UserRemaining'] == 0:
                    KEYWORD = "user"
                else:
                    KEYWORD = "client"

                message = '{} LIMIT EXCEEDED\n'.format(KEYWORD.upper())
                print(message)
                FAILED_FILE.add({order:[message,submission]})
                downloadedCount -= 1

        elif postType == 'gfycat':
            print("GFYCAT")
            try:
                Gfycat(directory,submission)

            except FileAlreadyExistsError:
                print("It already exists")
                duplicates += 1
                downloadedCount -= 1

            except NotADownloadableLinkError as exception:
                print("Could not read the page source")
                FAILED_FILE.add({order:[str(exception),submission]})
                downloadedCount -= 1

            except Exception as exception:
                print(exception)
                FAILED_FILE.add({order:[str(exception),submission]})
                downloadedCount -= 1

        elif postType == 'direct':
            print("DIRECT")
            try:
                Direct(directory,submission)

            except FileAlreadyExistsError:
                print("It already exists")
                downloadedCount -= 1
                duplicates += 1

            except Exception as exception:
                print(exception)
                FAILED_FILE.add({order:[str(exception),submission]})
                downloadedCount -= 1

        elif postType == 'self':
            print("SELF")
            try:
                Self(directory,submission)

            except FileAlreadyExistsError:
                print("It already exists")
                downloadedCount -= 1
                duplicates += 1

            except Exception as exception:
                print(exception)
                FAILED_FILE.add({order:[str(exception),submission]})
                downloadedCount -= 1

        else:
            print("No match found, skipping...")
            downloadedCount -= 1

    if duplicates:
        print("\n There was {} duplicates".format(duplicates))

    if downloadedCount == 0:
        print(" Nothing downloaded :(")
    else:
        print(" Total of {} links downloaded!".format(downloadedCount))
def main():
    """Run the program: parse the options, resolve the download directory,
    load the config and download the requested posts.

    Falls back to the interactive prompt when no valid program mode was
    given. Every failure that is handled here prints a message and ends
    the program through sys.exit().
    """

    GLOBAL.arguments = parseArguments()

    chosenDirectory = GLOBAL.arguments.directory
    if chosenDirectory is None:
        chosenDirectory = input("download directory: ")
    GLOBAL.directory = Path(chosenDirectory)

    print("\n"," ".join(sys.argv),"\n")

    try:
        checkConflicts()
    except ProgramModeError:
        # No usable mode on the command line: ask for it interactively.
        PromptUser()
    except Exception as problem:
        print(problem)
        sys.exit()

    configFolder = Path(GLOBAL.configDirectory)
    if not configFolder.is_dir():
        os.makedirs(GLOBAL.configDirectory)
    GLOBAL.config = getConfig(GLOBAL.configDirectory / "config.json")

    # Log mode downloads the posts of an earlier run and stops there.
    if GLOBAL.arguments.log is not None:
        download(postFromLog(Path(GLOBAL.arguments.log)))
        sys.exit()

    try:
        POSTS = getPosts(prepareAttributes())
    except InsufficientPermission:
        print("You do not have permission to do that")
        sys.exit()
    except NoMatchingSubmissionFound:
        print("No matching submission was found")
        sys.exit()
    except NoRedditSupoort:
        print("Reddit does not support that")
        sys.exit()
    except NoPrawSupport:
        print("PRAW does not support that")
        sys.exit()
    except MultiredditNotFound:
        print("Multireddit not found")
        sys.exit()
    except InvalidSortingType:
        print("Invalid sorting type has given")
        sys.exit()
    except InvalidRedditLink:
        print("Invalid reddit link")
        sys.exit()

    if POSTS is None:
        print("I could not find any posts in that URL")
        sys.exit()

    if GLOBAL.arguments.NoDownload:
        sys.exit()

    download(POSTS)
if __name__ == "__main__":

    # Log records are collected in memory so that they can be printed
    # in one piece when the run fails.
    log_stream = StringIO()
    logging.basicConfig(stream=log_stream, level=logging.INFO)

    try:
        # From here on every print call in this module goes through
        # printToFile; the builtin stays reachable as VanillaPrint.
        VanillaPrint = print
        print = printToFile
        GLOBAL.RUN_TIME = time.time()
        main()

    except KeyboardInterrupt:
        # The interrupt may arrive before a download directory was
        # chosen. Fall back to the current directory, spelled portably
        # (".\\" is a literal file name outside of Windows).
        # NOTE(review): presumably printToFile needs GLOBAL.directory -
        # confirm against src.tools.
        if GLOBAL.directory is None:
            GLOBAL.directory = Path(".")
        print("\nQUITTING...")

    except Exception:
        logging.error("Runtime error!", exc_info=full_exc_info(sys.exc_info()))
        print(log_stream.getvalue())

    input("Press enter to quit\n")