# bulk-downloader-for-reddit/src/searcher.py


import os
import sys
import random
import socket
import webbrowser
import praw
from prawcore.exceptions import NotFound, ResponseException, Forbidden
from src.tools import GLOBAL, createLogFile, jsonFile, printToFile
from src.errors import (NoMatchingSubmissionFound, NoPrawSupport,
NoRedditSupport, MultiredditNotFound,
InvalidSortingType, RedditLoginFailed,
InsufficientPermission)
print = printToFile
def beginPraw(config,user_agent = str(socket.gethostname())):
class GetAuth:
def __init__(self,redditInstance,port):
self.redditInstance = redditInstance
self.PORT = int(port)
def receive_connection(self):
"""Wait for and then return a connected socket.
Opens a TCP socket on localhost at the configured port and waits
for a single client connection.
"""
server = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
server.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
server.bind(('localhost', self.PORT))
server.listen(1)
client = server.accept()[0]
server.close()
return client
def send_message(self, message):
"""Send message to client and close the connection."""
self.client.send(
'HTTP/1.1 200 OK\r\n\r\n{}'.format(message).encode('utf-8')
)
self.client.close()
def getRefreshToken(self,*scopes):
state = str(random.randint(0, 65000))
url = self.redditInstance.auth.url(scopes, state, 'permanent')
print("Go to this URL and log in to reddit:\n\n",url)
webbrowser.open(url,new=2)
self.client = self.receive_connection()
data = self.client.recv(1024).decode('utf-8')
param_tokens = data.split(' ', 2)[1].split('?', 1)[1].split('&')
params = {
key: value for (key, value) in [token.split('=') \
for token in param_tokens]
}
if state != params['state']:
self.send_message(
'State mismatch. Expected: {} Received: {}'
.format(state, params['state'])
)
raise RedditLoginFailed
elif 'error' in params:
self.send_message(params['error'])
raise RedditLoginFailed
refresh_token = self.redditInstance.auth.authorize(params['code'])
self.send_message(
"<script>" \
"alert(\"You can go back to terminal window now.\");" \
"</script>"
)
return (self.redditInstance,refresh_token)
"""Start reddit instance"""
scopes = ['identity','history','read']
port = "1337"
arguments = {
"client_id":GLOBAL.reddit_client_id,
"client_secret":GLOBAL.reddit_client_secret,
"user_agent":user_agent
}
if "reddit_refresh_token" in GLOBAL.config:
arguments["refresh_token"] = GLOBAL.config["reddit_refresh_token"]
reddit = praw.Reddit(**arguments)
try:
reddit.auth.scopes()
except ResponseException:
arguments["redirect_uri"] = "http://localhost:" + str(port)
reddit = praw.Reddit(**arguments)
authorizedInstance = GetAuth(reddit,port).getRefreshToken(*scopes)
reddit = authorizedInstance[0]
refresh_token = authorizedInstance[1]
jsonFile(GLOBAL.configDirectory / "config.json").add({
"reddit_username":str(reddit.user.me()),
"reddit_refresh_token":refresh_token
})
else:
arguments["redirect_uri"] = "http://localhost:" + str(port)
reddit = praw.Reddit(**arguments)
authorizedInstance = GetAuth(reddit,port).getRefreshToken(*scopes)
reddit = authorizedInstance[0]
refresh_token = authorizedInstance[1]
jsonFile(GLOBAL.configDirectory / "config.json").add({
"reddit_username":str(reddit.user.me()),
"reddit_refresh_token":refresh_token
})
return reddit
def getPosts(args):
"""Call PRAW according to the given arguments and pass the result to
redditSearcher. Return whatever redditSearcher returns.
"""
config = GLOBAL.config
reddit = beginPraw(config)
if args["sort"] == "best":
raise NoPrawSupport("PRAW does not support the 'best' sort type")
if "subreddit" in args:
if "search" in args:
if args["subreddit"] == "frontpage":
args["subreddit"] = "all"
if "user" in args:
if args["user"] == "me":
args["user"] = str(reddit.user.me())
if "search" not in args:
if args["sort"] == "top" or args["sort"] == "controversial":
keyword_params = {
"time_filter":args["time"],
"limit":args["limit"]
}
# OTHER SORT TYPES DON'T TAKE TIME_FILTER
else:
keyword_params = {
"limit":args["limit"]
}
else:
keyword_params = {
"time_filter":args["time"],
"limit":args["limit"]
}
if "search" in args:
if GLOBAL.arguments.sort in ["hot","rising","controversial"]:
raise InvalidSortingType("Invalid sort type was given")
if "subreddit" in args:
print (
"search for \"{search}\" in\n" \
"subreddit: {subreddit}\nsort: {sort}\n" \
"time: {time}\nlimit: {limit}\n".format(
search=args["search"],
limit=args["limit"],
sort=args["sort"],
subreddit=args["subreddit"],
time=args["time"]
).upper(),noPrint=True
)
return redditSearcher(
reddit.subreddit(args["subreddit"]).search(
args["search"],
limit=args["limit"],
sort=args["sort"],
time_filter=args["time"]
)
)
elif "multireddit" in args:
raise NoPrawSupport("PRAW does not support searching within multireddits")
elif "user" in args:
raise NoPrawSupport("PRAW does not support searching a user's posts")
elif "saved" in args:
raise NoRedditSupport("Reddit does not support searching saved posts")
if args["sort"] == "relevance":
raise InvalidSortingType("Invalid sort type was given")
if "saved" in args:
print(
"saved posts\nuser:{username}\nlimit={limit}\n".format(
username=reddit.user.me(),
limit=args["limit"]
).upper(),noPrint=True
)
return redditSearcher(reddit.user.me().saved(limit=args["limit"]))
if "subreddit" in args:
if args["subreddit"] == "frontpage":
print (
"subreddit: {subreddit}\nsort: {sort}\n" \
"time: {time}\nlimit: {limit}\n".format(
limit=args["limit"],
sort=args["sort"],
subreddit=args["subreddit"],
time=args["time"]
).upper(),noPrint=True
)
return redditSearcher(
getattr(reddit.front,args["sort"]) (**keyword_params)
)
else:
print (
"subreddit: {subreddit}\nsort: {sort}\n" \
"time: {time}\nlimit: {limit}\n".format(
limit=args["limit"],
sort=args["sort"],
subreddit=args["subreddit"],
time=args["time"]
).upper(),noPrint=True
)
return redditSearcher(
getattr(
reddit.subreddit(args["subreddit"]),args["sort"]
) (**keyword_params)
)
elif "multireddit" in args:
print (
"user: {user}\n" \
"multireddit: {multireddit}\nsort: {sort}\n" \
"time: {time}\nlimit: {limit}\n".format(
user=args["user"],
limit=args["limit"],
sort=args["sort"],
multireddit=args["multireddit"],
time=args["time"]
).upper(),noPrint=True
)
try:
return redditSearcher(
getattr(
reddit.multireddit(
args["user"], args["multireddit"]
),args["sort"]
) (**keyword_params)
)
except NotFound:
raise MultiredditNotFound("Multireddit not found")
elif "submitted" in args:
print (
"submitted posts of {user}\nsort: {sort}\n" \
"time: {time}\nlimit: {limit}\n".format(
limit=args["limit"],
sort=args["sort"],
user=args["user"],
time=args["time"]
).upper(),noPrint=True
)
return redditSearcher(
getattr(
reddit.redditor(args["user"]).submissions,args["sort"]
) (**keyword_params)
)
elif "upvoted" in args:
print (
"upvoted posts of {user}\nlimit: {limit}\n".format(
user=args["user"],
limit=args["limit"]
).upper(),noPrint=True
)
try:
return redditSearcher(
reddit.redditor(args["user"]).upvoted(limit=args["limit"])
)
except Forbidden:
raise InsufficientPermission("You do not have permission to see that user's upvoted posts")
elif "post" in args:
print("post: {post}\n".format(post=args["post"]).upper(),noPrint=True)
return redditSearcher(
reddit.submission(url=args["post"]),SINGLE_POST=True
)
def redditSearcher(posts,SINGLE_POST=False):
"""Check each post and decide whether it can be downloaded.
If so, create a dictionary with the post's details and append it to a list.
Write all posts to a log file and return the list.
"""
subList = []
global subCount
subCount = 0
global orderCount
orderCount = 0
global gfycatCount
gfycatCount = 0
global imgurCount
imgurCount = 0
global eromeCount
eromeCount = 0
global directCount
directCount = 0
global selfCount
selfCount = 0
allPosts = {}
print("\nGETTING POSTS")
if GLOBAL.arguments.verbose: print("\n")
postsFile = createLogFile("POSTS")
if SINGLE_POST:
submission = posts
subCount += 1
try:
details = {'postId':submission.id,
'postTitle':submission.title,
'postSubmitter':str(submission.author),
'postType':None,
'postURL':submission.url,
'postSubreddit':submission.subreddit.display_name}
except AttributeError:
pass
result = checkIfMatching(submission)
if result is not None:
details = result
orderCount += 1
if GLOBAL.arguments.verbose:
printSubmission(submission,subCount,orderCount)
subList.append(details)
postsFile.add({subCount:[details]})
else:
try:
for submission in posts:
subCount += 1
if subCount % 100 == 0 and not GLOBAL.arguments.verbose:
sys.stdout.write("•")  # progress marker, printed every 100 submissions
sys.stdout.flush()
if subCount % 1000 == 0:
sys.stdout.write("\n"+" "*14)
sys.stdout.flush()
try:
details = {'postId':submission.id,
'postTitle':submission.title,
'postSubmitter':str(submission.author),
'postType':None,
'postURL':submission.url,
'postSubreddit':submission.subreddit.display_name}
except AttributeError:
continue
result = checkIfMatching(submission)
if result is not None:
details = result
orderCount += 1
if GLOBAL.arguments.verbose:
printSubmission(submission,subCount,orderCount)
subList.append(details)
allPosts[subCount] = [details]
except KeyboardInterrupt:
print("\nKeyboardInterrupt",noPrint=True)
postsFile.add(allPosts)
if len(subList) != 0:
if GLOBAL.arguments.NoDownload or GLOBAL.arguments.verbose:
print(
f"\n\nTotal of {len(subList)} submissions found!"
)
print(
f"{gfycatCount} GFYCATs, {imgurCount} IMGURs, " \
f"{eromeCount} EROMEs, {directCount} DIRECTs " \
f"and {selfCount} SELF POSTS",noPrint=True
)
else:
print()
2018-07-10 07:58:11 +12:00
return subList
else:
raise NoMatchingSubmissionFound("No matching submission was found")
def checkIfMatching(submission):
global gfycatCount
global imgurCount
global eromeCount
global directCount
global selfCount
try:
details = {'postId':submission.id,
'postTitle':submission.title,
'postSubmitter':str(submission.author),
'postType':None,
'postURL':submission.url,
'postSubreddit':submission.subreddit.display_name}
except AttributeError:
return None
if 'gfycat' in submission.domain:
details['postType'] = 'gfycat'
gfycatCount += 1
return details
elif 'imgur' in submission.domain:
details['postType'] = 'imgur'
imgurCount += 1
return details
elif 'erome' in submission.domain:
details['postType'] = 'erome'
eromeCount += 1
return details
elif isDirectLink(submission.url) is not False:
details['postType'] = 'direct'
details['postURL'] = isDirectLink(submission.url)
directCount += 1
return details
elif submission.is_self:
details['postType'] = 'self'
details['postContent'] = submission.selftext
selfCount += 1
return details
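# Note: if none of the branches above match and the post is not a self post,
# the function falls through and implicitly returns None, which redditSearcher
# checks for before counting the post as a match.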
def printSubmission(SUB,validNumber,totalNumber):
"""Print post's link, title and media link to screen"""
print(validNumber,end=") ")
print(totalNumber,end=" ")
print(
"https://www.reddit.com/"
+"r/"
+SUB.subreddit.display_name
+"/comments/"
+SUB.id
)
print(" "*(len(str(validNumber))
+(len(str(totalNumber)))+3),end="")
try:
print(SUB.title)
except Exception:
SUB.title = "unnamed"
print("SUBMISSION NAME COULD NOT BE READ")
print(" "*(len(str(validNumber))+(len(str(totalNumber)))+3),end="")
print(SUB.url,end="\n\n")
def isDirectLink(URL):
"""Check whether the link is a direct media link.
If so, return the URL;
if not, return False.
"""
imageTypes = ['.jpg','.png','.mp4','.webm','.gif']
if URL[-1] == "/":
URL = URL[:-1]
if "i.reddituploads.com" in URL:
return URL
elif "v.redd.it" in URL:
return URL+"/DASH_600_K"
for extension in imageTypes:
if extension in URL:
return URL
else:
return False