2bafb1b99b
Consolidates sane flake8 settings into pyproject using the Flake8-pyproject plugin. Does not change the logic of the test workflow, but allows the base settings to live in pyproject for anyone using flake8 as an external linter (e.g. vscode). Also fixes some flake8 errors that were not being picked up by the current testing, mostly unused imports.
88 lines
3.4 KiB
Python
88 lines
3.4 KiB
Python
#!/usr/bin/env python3
|
|
# -*- coding: utf-8 -*-
|
|
|
|
import json
|
|
import re
|
|
from typing import Optional
|
|
|
|
import bs4
|
|
from praw.models import Submission
|
|
|
|
from bdfr.exceptions import SiteDownloaderError
|
|
from bdfr.resource import Resource
|
|
from bdfr.site_authenticator import SiteAuthenticator
|
|
from bdfr.site_downloaders.base_downloader import BaseDownloader
|
|
|
|
|
|
class Imgur(BaseDownloader):
    """Site downloader that turns an Imgur post URL into direct ``i.imgur.com`` links.

    Image metadata is scraped from the JSON object embedded in the page's
    ``widgetFactory.mergeConfig('gallery', ...)`` script and converted into
    one ``Resource`` per image.
    """

    def __init__(self, post: Submission):
        super().__init__(post)
        # Metadata dictionary produced by the most recent find_resources() call.
        self.raw_data = {}

    def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]:
        """Build the list of downloadable resources for ``self.post.url``.

        Args:
            authenticator: Accepted for interface compatibility; not used here.

        Returns:
            One ``Resource`` per image: every entry of
            ``raw_data["album_images"]["images"]`` when the scraped metadata
            contains an ``"album_images"`` key, otherwise a single resource
            built from the metadata itself.

        Raises:
            SiteDownloaderError: If the page cannot be parsed or an image has
                an unrecognised extension.
        """
        self.raw_data = self._get_data(self.post.url)

        if "album_images" in self.raw_data:
            images = self.raw_data["album_images"]["images"]
        else:
            # No album wrapper: the metadata describes one image directly.
            images = [self.raw_data]
        return [self._compute_image_url(image) for image in images]

    def _compute_image_url(self, image: dict) -> Resource:
        """Create the ``Resource`` for a single image metadata dictionary.

        Args:
            image: Metadata with at least ``"hash"`` and ``"ext"`` keys and an
                optional ``"prefer_video"`` flag.

        Returns:
            A ``Resource`` pointing at ``https://i.imgur.com/<hash><ext>``.

        Raises:
            SiteDownloaderError: If ``image["ext"]`` is not a recognised
                extension. The check runs even when ``prefer_video`` later
                overrides the extension.
        """
        ext = self._validate_extension(image["ext"])
        if image.get("prefer_video", False):
            ext = ".mp4"

        image_url = "https://i.imgur.com/" + image["hash"] + ext
        return Resource(self.post, image_url, Resource.retry_download(image_url))

    @staticmethod
    def _get_data(link: str) -> dict:
        """Fetch an Imgur page and extract its embedded image metadata.

        Args:
            link: Any Imgur URL. It is normalised to
                ``https://imgur.com/[a/]<id>`` before being requested.

        Returns:
            The object decoded from the ``image: ...`` entry of the gallery
            configuration script.

        Raises:
            SiteDownloaderError: If no ID can be extracted from ``link``, the
                page does not contain exactly one gallery configuration
                script, the image dictionary cannot be located in it, or the
                dictionary is not valid JSON.
        """
        # Group 1 is the last path segment without an optional "_d" suffix
        # and without anything from the first following "." onwards.
        id_match = re.match(r".*/(.*?)(_d)?(\..{0,})?$", link)
        if id_match is None:
            raise SiteDownloaderError(f"Could not extract Imgur ID from {link}")
        imgur_id = id_match.group(1)
        # Gallery and album links are both requested through the "a/" path.
        gallery = "a/" if re.search(r".*/(.*?)(gallery/|a/)", link) else ""
        link = f"https://imgur.com/{gallery}{imgur_id}"

        # NOTE(review): these cookies presumably skip the age gate and select
        # the legacy page layout that still embeds the script - confirm.
        res = Imgur.retrieve_url(link, cookies={"over18": "1", "postpagebeta": "0"})

        soup = bs4.BeautifulSoup(res.text, "html.parser")
        script_tags = soup.find_all("script", attrs={"type": "text/javascript"})
        # Newlines are stripped so that "." in the regexes below can span the
        # whole script.
        scripts = [tag.string.replace("\n", "") for tag in script_tags if tag.string]

        script_regex = re.compile(r"\s*\(function\(widgetFactory\)\s*{\s*widgetFactory\.mergeConfig\(\'gallery\'")
        candidates = [script for script in scripts if script_regex.search(script)]
        if len(candidates) != 1:
            raise SiteDownloaderError(f"Could not read page source from {link}")
        chosen_script = candidates[0]

        outer_regex = re.compile(r"widgetFactory\.mergeConfig\(\'gallery\', ({.*})\);")
        inner_regex = re.compile(r"image\s*:(.*),\s*group")
        # Explicit None checks instead of catching AttributeError: the failure
        # is reported without an unrelated "NoneType has no attribute 'group'"
        # context attached to the traceback.
        outer_match = outer_regex.search(chosen_script)
        inner_match = inner_regex.search(outer_match.group(1)) if outer_match else None
        if inner_match is None:
            raise SiteDownloaderError("Could not find image dictionary in page source")

        try:
            return json.loads(inner_match.group(1))
        except json.JSONDecodeError as e:
            raise SiteDownloaderError(f"Could not parse received dict as JSON: {e}") from e

    @staticmethod
    def _validate_extension(extension_suffix: str) -> str:
        """Return ``extension_suffix`` without its query string if Imgur supports it.

        Args:
            extension_suffix: A file extension such as ``".jpg"``, optionally
                followed by a ``?query`` part, which is discarded.

        Returns:
            The cleaned extension: one of ``.jpg``, ``.png``, ``.mp4`` or ``.gif``.

        Raises:
            SiteDownloaderError: If the cleaned extension is not one of the
                supported values.
        """
        extension_suffix = re.sub(r"\?.*", "", extension_suffix)
        possible_extensions = (".jpg", ".png", ".mp4", ".gif")
        if extension_suffix in possible_extensions:
            return extension_suffix
        raise SiteDownloaderError(f'"{extension_suffix}" is not recognized as a valid extension for Imgur')
|