From bc7ccc0964ac9cac770d7f787e08f070f2037534 Mon Sep 17 00:00:00 2001
From: Serene-Arc
Date: Wed, 17 Mar 2021 16:23:00 +1000
Subject: [PATCH] Refactor Erome downloader

---
 .../site_downloaders/erome.py                 | 70 ++++++-------------
 1 file changed, 21 insertions(+), 49 deletions(-)

diff --git a/bulkredditdownloader/site_downloaders/erome.py b/bulkredditdownloader/site_downloaders/erome.py
index 762e8f9..8dab973 100644
--- a/bulkredditdownloader/site_downloaders/erome.py
+++ b/bulkredditdownloader/site_downloaders/erome.py
@@ -2,16 +2,15 @@
 
 import logging
 import re
-import urllib.error
-import urllib.request
-from html.parser import HTMLParser
 from typing import Optional
 
+import bs4
+import requests
 from praw.models import Submission
 
-from bulkredditdownloader.site_authenticator import SiteAuthenticator
 from bulkredditdownloader.exceptions import NotADownloadableLinkError
 from bulkredditdownloader.resource import Resource
+from bulkredditdownloader.site_authenticator import SiteAuthenticator
 from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader
 
 logger = logging.getLogger(__name__)
@@ -22,63 +21,36 @@ class Erome(BaseDownloader):
         super().__init__(post)
 
     def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]:
-        try:
-            images = set(self._get_links(self.post.url))
-        except urllib.error.HTTPError:
-            raise NotADownloadableLinkError("Not a downloadable link")
+        images = self._get_links(self.post.url)
+        if not images:
+            raise NotADownloadableLinkError('Erome parser could not find any links')
 
         if len(images) == 1:
-            image = images.pop()
-            if not re.match(r'https?://.*', image):
-                image = "https://" + image
+            image = self._validate_url(images.pop())
             return [Resource(self.post, image)]
 
         else:
             out = []
             for i, image in enumerate(images):
-                if not re.match(r'https?://.*', image):
-                    image = "https://" + image
+                image = self._validate_url(image)
                 out.append(Resource(self.post, image))
             return out
 
     @staticmethod
-    def _get_links(url: str) -> list[str]:
-        content = []
-        line_number = None
+    def _validate_url(image: str) -> str:
+        if not re.match(r'https?://.*', image):
+            image = "https://" + image
+        return image
 
-        # TODO: move to bs4 and requests
-        class EromeParser(HTMLParser):
-            tag = None
+    @staticmethod
+    def _get_links(url: str) -> set[str]:
+        page = requests.get(url)
+        soup = bs4.BeautifulSoup(page.text, 'html.parser')
+        front_images = soup.find_all('img', attrs={'class': 'img-front'})
+        out = [im.get('src') for im in front_images]
 
-            def handle_starttag(self, tag, attrs):
-                self.tag = {tag: {attr[0]: attr[1] for attr in attrs}}
+        videos = soup.find_all('source')
+        out.extend([vid.get('src') for vid in videos])
 
-        page_source = (urllib.request.urlopen(url).read().decode().split('\n'))
-
-        """ FIND WHERE ALBUM STARTS IN ORDER NOT TO GET WRONG LINKS"""
-        for i in range(len(page_source)):
-            obj = EromeParser()
-            obj.feed(page_source[i])
-            tag = obj.tag
-
-            if tag is not None:
-                if "div" in tag:
-                    if "id" in tag["div"]:
-                        if tag["div"]["id"] == "album":
-                            line_number = i
-                            break
-
-        for line in page_source[line_number:]:
-            obj = EromeParser()
-            obj.feed(line)
-            tag = obj.tag
-            if tag is not None:
-                if "img" in tag:
-                    if "class" in tag["img"]:
-                        if tag["img"]["class"] == "img-front":
-                            content.append(tag["img"]["src"])
-                elif "source" in tag:
-                    content.append(tag["source"]["src"])
-
-        return [link for link in content if link.endswith("_480p.mp4") or not link.endswith(".mp4")]
+        return set(out)
 