1
0
Fork 0
mirror of synced 2024-05-21 12:42:44 +12:00
bulk-downloader-for-reddit/bdfr/site_downloaders/imgur.py
OMEGARAZER 2bafb1b99b
Consolidate flake8 settings
Consolidates sane flake8 settings to pyproject with the Flake8-pyproject plugin.

Does not change the logic of the test workflow, but allows base settings to live in pyproject for anyone using flake8 as an external linter (e.g. VS Code).

Also fixes some flake8 errors that were not being picked up by current testing, mostly unused imports.
2022-12-28 10:00:43 -05:00

88 lines
3.4 KiB
Python

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import json
import re
from typing import Optional
import bs4
from praw.models import Submission
from bdfr.exceptions import SiteDownloaderError
from bdfr.resource import Resource
from bdfr.site_authenticator import SiteAuthenticator
from bdfr.site_downloaders.base_downloader import BaseDownloader
class Imgur(BaseDownloader):
    """Site downloader that resolves an Imgur submission URL into resources.

    The page behind the submission URL is fetched and the image metadata
    embedded in one of its inline scripts is parsed; each image described
    there is turned into a direct ``https://i.imgur.com/`` URL.
    """

    def __init__(self, post: Submission):
        """Initialise the downloader for ``post`` with empty cached metadata."""
        super().__init__(post)
        # Filled by find_resources() with the dict returned from _get_data().
        self.raw_data = {}

    def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]:
        """Return one Resource per image found for the post's URL.

        ``authenticator`` is accepted for interface compatibility and is not
        used by this method.

        Raises:
            SiteDownloaderError: propagated from ``_get_data`` or
                ``_validate_extension``.
        """
        self.raw_data = self._get_data(self.post.url)

        # Albums carry their images under "album_images" -> "images";
        # otherwise the metadata dict itself describes a single image.
        if "album_images" in self.raw_data:
            return [self._compute_image_url(image) for image in self.raw_data["album_images"]["images"]]
        return [self._compute_image_url(self.raw_data)]

    def _compute_image_url(self, image: dict) -> Resource:
        """Build the Resource for a single image metadata dict.

        Reads ``image["ext"]`` and ``image["hash"]``, and the optional
        ``image["prefer_video"]`` flag.

        Raises:
            SiteDownloaderError: if the extension is not a recognised one.
        """
        # Validate first so an unrecognised extension is rejected even when
        # the extension is subsequently overridden by the video preference.
        ext = self._validate_extension(image["ext"])
        if image.get("prefer_video", False):
            ext = ".mp4"

        image_url = "https://i.imgur.com/" + image["hash"] + ext
        return Resource(self.post, image_url, Resource.retry_download(image_url))

    @staticmethod
    def _get_data(link: str) -> dict:
        """Fetch the Imgur page for ``link`` and return its embedded image metadata.

        The link is first rewritten to ``https://imgur.com/[a/]<id>``; the page
        is then retrieved and the JSON following ``image:`` inside the
        ``widgetFactory.mergeConfig('gallery', ...)`` script is decoded.

        Raises:
            SiteDownloaderError: if no ID can be extracted from ``link``, if
                exactly one matching script cannot be found, if the image
                dictionary cannot be located, or if it is not valid JSON.
        """
        try:
            # Last path segment, minus an optional "_d" suffix and extension.
            imgur_id = re.match(r".*/(.*?)(_d)?(\..{0,})?$", link).group(1)
            gallery = "a/" if re.search(r".*/(.*?)(gallery/|a/)", link) else ""
            link = f"https://imgur.com/{gallery}{imgur_id}"
        except AttributeError as e:
            # re.match returned None, so .group() failed; link is still the original here.
            raise SiteDownloaderError(f"Could not extract Imgur ID from {link}") from e

        res = Imgur.retrieve_url(link, cookies={"over18": "1", "postpagebeta": "0"})

        soup = bs4.BeautifulSoup(res.text, "html.parser")
        scripts = soup.find_all("script", attrs={"type": "text/javascript"})
        # Flatten each inline script onto one line so the regexes below can span it.
        scripts = [script.string.replace("\n", "") for script in scripts if script.string]

        script_regex = re.compile(r"\s*\(function\(widgetFactory\)\s*{\s*widgetFactory\.mergeConfig\(\'gallery\'")
        chosen_script = [s for s in scripts if script_regex.search(s)]
        if len(chosen_script) != 1:
            raise SiteDownloaderError(f"Could not read page source from {link}")

        chosen_script = chosen_script[0]

        outer_regex = re.compile(r"widgetFactory\.mergeConfig\(\'gallery\', ({.*})\);")
        inner_regex = re.compile(r"image\s*:(.*),\s*group")
        try:
            image_dict = outer_regex.search(chosen_script).group(1)
            image_dict = inner_regex.search(image_dict).group(1)
        except AttributeError as e:
            # One of the searches returned None.
            raise SiteDownloaderError("Could not find image dictionary in page source") from e

        try:
            image_dict = json.loads(image_dict)
        except json.JSONDecodeError as e:
            raise SiteDownloaderError(f"Could not parse received dict as JSON: {e}") from e

        return image_dict

    @staticmethod
    def _validate_extension(extension_suffix: str) -> str:
        """Return ``extension_suffix`` without any query string, if recognised.

        Raises:
            SiteDownloaderError: if the cleaned suffix is not one of
                ``.jpg``, ``.png``, ``.mp4`` or ``.gif``.
        """
        # Drop everything from the first "?" onwards.
        extension_suffix = re.sub(r"\?.*", "", extension_suffix)
        possible_extensions = (".jpg", ".png", ".mp4", ".gif")
        if extension_suffix in possible_extensions:
            return extension_suffix
        raise SiteDownloaderError(f'"{extension_suffix}" is not recognized as a valid extension for Imgur')