1
0
Fork 0
mirror of synced 2024-06-28 19:10:41 +12:00

Refactor Erome downloader

This commit is contained in:
Serene-Arc 2021-03-17 16:23:00 +10:00 committed by Ali Parlakci
parent 29441e7244
commit bc7ccc0964

View file

@ -2,16 +2,15 @@
import logging import logging
import re import re
import urllib.error
import urllib.request
from html.parser import HTMLParser
from typing import Optional from typing import Optional
import bs4
import requests
from praw.models import Submission from praw.models import Submission
from bulkredditdownloader.site_authenticator import SiteAuthenticator
from bulkredditdownloader.exceptions import NotADownloadableLinkError from bulkredditdownloader.exceptions import NotADownloadableLinkError
from bulkredditdownloader.resource import Resource from bulkredditdownloader.resource import Resource
from bulkredditdownloader.site_authenticator import SiteAuthenticator
from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@ -22,63 +21,36 @@ class Erome(BaseDownloader):
super().__init__(post) super().__init__(post)
def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]:
    """Collect the Erome post's media links as downloadable Resources.

    Args:
        authenticator: unused here; accepted to match the BaseDownloader
            interface so all downloaders share one call signature.

    Returns:
        One Resource per unique media link scraped from the post's page.

    Raises:
        NotADownloadableLinkError: when the page yields no media links.
    """
    links = self._get_links(self.post.url)
    if not links:
        raise NotADownloadableLinkError('Erome parser could not find any links')
    # The original branched on len(links) == 1 vs many, but both branches
    # did the same work per link; a single comprehension covers both, and
    # the unused enumerate index is gone. _validate_url prepends the
    # scheme for protocol-relative links.
    return [Resource(self.post, self._validate_url(link)) for link in links]
@staticmethod @staticmethod
def _get_links(url: str) -> list[str]: def _validate_url(image):
content = [] if not re.match(r'https?://.*', image):
line_number = None image = "https://" + image
return image
@staticmethod
def _get_links(url: str) -> set[str]:
    """Scrape an Erome page for direct image and video source URLs.

    Args:
        url: address of the Erome album/post page.

    Returns:
        Deduplicated set of ``src`` attributes from front images
        (``img.img-front``) and ``<source>`` (video) tags; may be
        empty when the page contains no recognizable media.
    """
    page = requests.get(url)
    # Name the parser explicitly: with no argument BeautifulSoup guesses
    # from whatever is installed, which varies between environments and
    # raises a GuessedAtParserWarning.
    soup = bs4.BeautifulSoup(page.text, 'html.parser')
    front_images = soup.find_all('img', attrs={'class': 'img-front'})
    out = [im.get('src') for im in front_images]
    videos = soup.find_all('source')
    out.extend([vid.get('src') for vid in videos])
    return set(out)