1
0
Fork 0
mirror of synced 2024-06-28 19:10:41 +12:00

Refactor Erome downloader

This commit is contained in:
Serene-Arc 2021-03-17 16:23:00 +10:00 committed by Ali Parlakci
parent 29441e7244
commit bc7ccc0964

View file

@ -2,16 +2,15 @@
import logging import logging
import re import re
import urllib.error
import urllib.request
from html.parser import HTMLParser
from typing import Optional from typing import Optional
import bs4
import requests
from praw.models import Submission from praw.models import Submission
from bulkredditdownloader.site_authenticator import SiteAuthenticator
from bulkredditdownloader.exceptions import NotADownloadableLinkError from bulkredditdownloader.exceptions import NotADownloadableLinkError
from bulkredditdownloader.resource import Resource from bulkredditdownloader.resource import Resource
from bulkredditdownloader.site_authenticator import SiteAuthenticator
from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@ -22,63 +21,36 @@ class Erome(BaseDownloader):
super().__init__(post) super().__init__(post)
def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]:
    """Collect the Erome post's media links as downloadable Resources.

    Args:
        authenticator: unused here; accepted to match the BaseDownloader
            interface so all downloaders share one call signature.

    Returns:
        One Resource per unique media link scraped from the post's page.

    Raises:
        NotADownloadableLinkError: when the page yields no media links.
    """
    links = self._get_links(self.post.url)
    if not links:
        raise NotADownloadableLinkError('Erome parser could not find any links')
    # The original branched on len(links) == 1 vs many, but both branches
    # did the same work per link; a single comprehension covers both, and
    # the unused enumerate index is gone. _validate_url prepends the
    # scheme for protocol-relative links.
    return [Resource(self.post, self._validate_url(link)) for link in links]
@staticmethod @staticmethod
def _get_links(url: str) -> list[str]: def _validate_url(image):
content = [] if not re.match(r'https?://.*', image):
line_number = None image = "https://" + image
return image
@staticmethod
def _get_links(url: str) -> set[str]:
    """Scrape an Erome page for direct image and video source URLs.

    Args:
        url: address of the Erome album/post page.

    Returns:
        Deduplicated set of ``src`` attributes from front images
        (``img.img-front``) and ``<source>`` (video) tags; may be
        empty when the page contains no recognizable media.
    """
    page = requests.get(url)
    # Name the parser explicitly: with no argument BeautifulSoup guesses
    # from whatever is installed, which varies between environments and
    # raises a GuessedAtParserWarning.
    soup = bs4.BeautifulSoup(page.text, 'html.parser')
    front_images = soup.find_all('img', attrs={'class': 'img-front'})
    out = [im.get('src') for im in front_images]
    videos = soup.find_all('source')
    out.extend([vid.get('src') for vid in videos])
    return set(out)