1
0
Fork 0
mirror of synced 2024-07-01 04:20:54 +12:00
bulk-downloader-for-reddit/bulkredditdownloader/site_downloaders/erome.py
2021-04-18 16:43:55 +03:00

85 lines
2.7 KiB
Python

#!/usr/bin/env python3
import logging
import re
import urllib.error
import urllib.request
from html.parser import HTMLParser
from typing import Optional
from praw.models import Submission
from bulkredditdownloader.authenticator import Authenticator
from bulkredditdownloader.errors import NotADownloadableLinkError
from bulkredditdownloader.resource import Resource
from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
class Erome(BaseDownloader):
    """Site downloader that collects media links from an Erome album page.

    The page referenced by the post URL is fetched and scanned for front
    images (``<img class="img-front">``) and video ``<source>`` tags; every
    link found is wrapped in a ``Resource``.
    """

    def __init__(self, post: Submission):
        super().__init__(post)

    def find_resources(self, authenticator: Optional[Authenticator] = None) -> list[Resource]:
        """Return one ``Resource`` per media link found on the post's page.

        Args:
            authenticator: Not used by this downloader.

        Returns:
            The resources in page order; an empty list when no link is found.

        Raises:
            NotADownloadableLinkError: If fetching the page fails with an
                HTTP error.
        """
        try:
            links = self._get_links(self.post.url)
        except urllib.error.HTTPError as err:
            raise NotADownloadableLinkError("Not a downloadable link") from err
        return [Resource(self.post, self._ensure_scheme(link)) for link in links]

    @staticmethod
    def _ensure_scheme(link: str) -> str:
        """Return *link* with an ``https://`` scheme if it has no http(s) scheme."""
        if re.match(r'https?://', link):
            return link
        # Strip leading slashes so protocol-relative links ("//host/path")
        # become "https://host/path" rather than "https:////host/path".
        return "https://" + link.lstrip("/")

    @staticmethod
    def _get_links(url: str) -> list[str]:
        """Fetch *url* and return the media links found in its HTML.

        Raises:
            urllib.error.HTTPError: Propagated from ``urlopen`` on HTTP errors.
        """
        # TODO: move to bs4 and requests
        # The context manager closes the HTTP response once it has been read.
        with urllib.request.urlopen(url) as response:
            # Links are plain ASCII, so replacing undecodable bytes elsewhere
            # in the page is harmless and avoids a UnicodeDecodeError.
            page_source = response.read().decode("utf-8", errors="replace")
        return Erome._parse_links(page_source)

    @staticmethod
    def _parse_links(page_source: str) -> list[str]:
        """Extract media links from the HTML text of an album page.

        Links are taken from ``<img class="img-front">`` and ``<source>``
        tags that appear after the first ``<div id="album">``. When the page
        has no such div, the whole page is scanned. Tags without a usable
        ``src`` attribute are skipped.

        Returns:
            Links in document order. A link ending in ``.mp4`` is kept only
            if it ends in ``_480p.mp4``; all other links are kept.
        """

        class AlbumParser(HTMLParser):
            """Collects candidate links while the whole document is parsed."""

            def __init__(self):
                super().__init__()
                self.links = []
                self.album_seen = False

            def handle_starttag(self, tag, attrs):
                attributes = dict(attrs)
                if tag == "div":
                    if not self.album_seen and attributes.get("id") == "album":
                        # Only links after the album start are wanted, so
                        # drop whatever was collected before it.
                        self.album_seen = True
                        self.links.clear()
                    return
                if tag == "img":
                    wanted = attributes.get("class") == "img-front"
                else:
                    wanted = tag == "source"
                src = attributes.get("src")
                # A missing or valueless src attribute yields None; skip it.
                if wanted and src:
                    self.links.append(src)

        parser = AlbumParser()
        parser.feed(page_source)
        parser.close()  # flush any text still buffered by the parser
        return [
            link for link in parser.links
            if link.endswith("_480p.mp4") or not link.endswith(".mp4")
        ]