2021-02-07 17:46:20 +13:00
|
|
|
#!/usr/bin/env python3
|
|
|
|
|
|
|
|
import logging
|
2021-02-07 01:29:13 +13:00
|
|
|
import pathlib
|
2021-02-07 17:46:20 +13:00
|
|
|
import re
|
2021-02-06 21:35:50 +13:00
|
|
|
import urllib.error
|
2020-05-29 06:42:11 +12:00
|
|
|
import urllib.request
|
|
|
|
from html.parser import HTMLParser
|
|
|
|
|
2021-02-07 20:08:24 +13:00
|
|
|
from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader
|
2021-02-07 14:05:18 +13:00
|
|
|
from bulkredditdownloader.errors import AlbumNotDownloadedCompletely, FileAlreadyExistsError, NotADownloadableLinkError
|
|
|
|
from bulkredditdownloader.utils import GLOBAL
|
2021-02-07 17:46:20 +13:00
|
|
|
|
|
|
|
# Module-level logger named after this module, per the logging-cookbook convention.
logger = logging.getLogger(__name__)
|
2020-05-29 06:42:11 +12:00
|
|
|
|
2021-02-25 22:32:06 +13:00
|
|
|
|
2021-02-07 14:33:19 +13:00
|
|
|
class Erome(BaseDownloader):
    """Downloader for erome.com albums.

    Scrapes the album page linked by the Reddit post for image/video URLs
    and downloads each one via the ``BaseDownloader`` helpers. A single-item
    album is saved directly into ``directory``; a multi-item album gets its
    own sub-folder with files numbered ``1..N``.
    """

    def __init__(self, directory: pathlib.Path, post: dict):
        super().__init__(directory, post)
        # Download eagerly on construction, matching the other site downloaders.
        self.download()

    def download(self):
        """Download every media file in the album referenced by the post.

        Raises:
            NotADownloadableLinkError: the album page could not be fetched.
            FileAlreadyExistsError: every file in the album already exists
                locally (single-file albums propagate it straight from
                ``_download_resource``).
            AlbumNotDownloadedCompletely: at least one file failed for a
                reason other than already existing.
        """
        try:
            images = self._get_links(self.post['CONTENTURL'])
        except urllib.error.HTTPError:
            raise NotADownloadableLinkError("Not a downloadable link")

        images_length = len(images)
        how_many_downloaded = len(images)  # decremented on each failure/duplicate
        duplicates = 0

        if images_length == 1:
            # Single file: name it after the post itself.
            filename = GLOBAL.config['filename'].format(**self.post) + self.post["EXTENSION"]

            image = images[0]
            # Scraped links are sometimes protocol-relative; force HTTPS.
            if not re.match(r'https?://.*', image):
                image = "https://" + image

            self._download_resource(filename, self.directory, image)

        else:
            # Multiple files: create a sub-folder named after the post.
            filename = GLOBAL.config['filename'].format(**self.post)
            logger.info(filename)

            folder_dir = self.directory / filename
            folder_dir.mkdir(exist_ok=True)

            for i, image in enumerate(images):
                extension = self._get_extension(image)
                # Files inside the album folder are simply numbered 1..N.
                filename = str(i + 1) + extension

                if not re.match(r'https?://.*', image):
                    image = "https://" + image

                logger.info(" ({}/{})".format(i + 1, images_length))
                logger.info(" {}".format(filename))

                try:
                    self._download_resource(pathlib.Path(filename), folder_dir, image, indent=2)
                except FileAlreadyExistsError:
                    # BUG FIX: the original passed end="\n\n" to logger.info(),
                    # which is print()'s signature, not logging's — it raised
                    # TypeError and broke the duplicate-handling path entirely.
                    logger.info(" The file already exists" + " " * 10)
                    duplicates += 1
                    how_many_downloaded -= 1
                except Exception as exception:
                    # Best-effort album download: log the failure and carry on
                    # with the remaining files instead of aborting the album.
                    logger.error("\n Could not get the file")
                    logger.error(
                        " "
                        + "{class_name}: {info}".format(class_name=exception.__class__.__name__, info=str(exception))
                        + "\n"
                    )
                    how_many_downloaded -= 1

            if duplicates == images_length:
                raise FileAlreadyExistsError
            elif how_many_downloaded + duplicates < images_length:
                raise AlbumNotDownloadedCompletely("Album Not Downloaded Completely")

    @staticmethod
    def _get_links(url: str) -> list[str]:
        """Scrape an erome album page and return its media URLs.

        Parses the raw HTML line by line, skipping everything before the
        ``<div id="album">`` marker so links from unrelated page chrome are
        not collected. Keeps ``img`` tags with ``class="img-front"`` and all
        ``source`` tags, then filters videos down to the 480p renditions.

        Raises:
            urllib.error.HTTPError: the page could not be fetched.
        """
        content = []
        line_number = None

        # TODO: move to bs4 and requests
        class EromeParser(HTMLParser):
            # Holds the last start tag seen, as {tag_name: {attr: value}}.
            tag = None

            def handle_starttag(self, tag, attrs):
                self.tag = {tag: {attr[0]: attr[1] for attr in attrs}}

        page_source = urllib.request.urlopen(url).read().decode().split('\n')

        # Find where the album starts so we don't pick up wrong links.
        for i, line in enumerate(page_source):
            parser = EromeParser()
            parser.feed(line)
            tag = parser.tag
            if tag is not None and "div" in tag and tag["div"].get("id") == "album":
                line_number = i
                break

        # If the marker was never found, line_number stays None and the
        # slice below degrades to scanning the whole page (lst[None:] == lst).
        for line in page_source[line_number:]:
            parser = EromeParser()
            parser.feed(line)
            tag = parser.tag
            if tag is None:
                continue
            if "img" in tag:
                if tag["img"].get("class") == "img-front":
                    content.append(tag["img"]["src"])
            elif "source" in tag:
                content.append(tag["source"]["src"])

        # Keep non-video links as-is; for videos keep only the 480p rendition.
        return [link for link in content if link.endswith("_480p.mp4") or not link.endswith(".mp4")]
|