Refactor Erome downloader

2024-06-25 09:30:36 +12:00 · 2021-03-17 16:23:00 +10:00 · 2021-03-17 16:23:00 +10:00 · bc7ccc0964
parent 29441e7244
commit bc7ccc0964
1 changed file with 21 additions and 49 deletions
--- a/bulkredditdownloader/site_downloaders/erome.py
+++ b/bulkredditdownloader/site_downloaders/erome.py
@@ -2,16 +2,15 @@

 import logging
 import re
-import urllib.error
-import urllib.request
-from html.parser import HTMLParser
 from typing import Optional

+import bs4
+import requests
 from praw.models import Submission

-from bulkredditdownloader.site_authenticator import SiteAuthenticator
 from bulkredditdownloader.exceptions import NotADownloadableLinkError
 from bulkredditdownloader.resource import Resource
+from bulkredditdownloader.site_authenticator import SiteAuthenticator
 from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader

 logger = logging.getLogger(__name__)
@@ -22,63 +21,36 @@ class Erome(BaseDownloader):
        super().__init__(post)

    def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]:
-        try:
-            images = set(self._get_links(self.post.url))
-        except urllib.error.HTTPError:
-            raise NotADownloadableLinkError("Not a downloadable link")
+        images = self._get_links(self.post.url)
+        if not images:
+            raise NotADownloadableLinkError('Erome parser could not find any links')

        if len(images) == 1:
-
            image = images.pop()
-            if not re.match(r'https?://.*', image):
-                image = "https://" + image
+            image = self._validate_url(image)
            return [Resource(self.post, image)]

        else:
            out = []
            for i, image in enumerate(images):
-                if not re.match(r'https?://.*', image):
-                    image = "https://" + image
+                image = self._validate_url(image)
                out.append(Resource(self.post, image))
            return out

    @staticmethod
-    def _get_links(url: str) -> list[str]:
-        content = []
-        line_number = None
+    def _validate_url(image):
+        if not re.match(r'https?://.*', image):
+            image = "https://" + image
+        return image

-        # TODO: move to bs4 and requests
-        class EromeParser(HTMLParser):
-            tag = None
+    @staticmethod
+    def _get_links(url: str) -> set[str]:
+        page = requests.get(url)
+        soup = bs4.BeautifulSoup(page.text)
+        front_images = soup.find_all('img', attrs={'class': 'img-front'})
+        out = [im.get('src') for im in front_images]

-            def handle_starttag(self, tag, attrs):
-                self.tag = {tag: {attr[0]: attr[1] for attr in attrs}}
+        videos = soup.find_all('source')
+        out.extend([vid.get('src') for vid in videos])

-        page_source = (urllib.request.urlopen(url).read().decode().split('\n'))
-
-        """ FIND WHERE ALBUM STARTS IN ORDER NOT TO GET WRONG LINKS"""
-        for i in range(len(page_source)):
-            obj = EromeParser()
-            obj.feed(page_source[i])
-            tag = obj.tag
-
-            if tag is not None:
-                if "div" in tag:
-                    if "id" in tag["div"]:
-                        if tag["div"]["id"] == "album":
-                            line_number = i
-                            break
-
-        for line in page_source[line_number:]:
-            obj = EromeParser()
-            obj.feed(line)
-            tag = obj.tag
-            if tag is not None:
-                if "img" in tag:
-                    if "class" in tag["img"]:
-                        if tag["img"]["class"] == "img-front":
-                            content.append(tag["img"]["src"])
-                elif "source" in tag:
-                    content.append(tag["source"]["src"])
-
-        return [link for link in content if link.endswith("_480p.mp4") or not link.endswith(".mp4")]
+        return set(out)