2021-02-07 17:46:20 +13:00
|
|
|
#!/usr/bin/env python3
|
2022-12-20 12:32:37 +13:00
|
|
|
# -*- coding: utf-8 -*-
|
2021-02-07 17:46:20 +13:00
|
|
|
|
2020-06-04 03:10:25 +12:00
|
|
|
import json
|
2021-03-21 14:10:06 +13:00
|
|
|
import re
|
2021-02-25 23:40:08 +13:00
|
|
|
from typing import Optional
|
2021-02-07 01:29:13 +13:00
|
|
|
|
2021-03-21 14:10:06 +13:00
|
|
|
import bs4
|
2021-02-11 12:10:40 +13:00
|
|
|
from praw.models import Submission
|
2020-05-29 06:42:11 +12:00
|
|
|
|
2021-04-23 23:06:16 +12:00
|
|
|
from bdfr.exceptions import SiteDownloaderError
|
2021-04-12 19:58:32 +12:00
|
|
|
from bdfr.resource import Resource
|
|
|
|
from bdfr.site_authenticator import SiteAuthenticator
|
|
|
|
from bdfr.site_downloaders.base_downloader import BaseDownloader
|
2020-05-29 06:42:11 +12:00
|
|
|
|
2021-02-25 22:32:06 +13:00
|
|
|
|
2021-02-07 14:33:19 +13:00
|
|
|
class Imgur(BaseDownloader):
    """Downloader for imgur.com images and albums.

    Scrapes the gallery configuration JSON embedded in Imgur's page source
    (the ``widgetFactory.mergeConfig('gallery', ...)`` script) rather than
    calling the official API, so no API key is required.
    """

    def __init__(self, post: Submission):
        super().__init__(post)
        # Parsed gallery/image JSON for this post; populated by find_resources().
        self.raw_data = {}

    def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]:
        """Return one Resource per image in the linked Imgur post or album."""
        self.raw_data = self._get_data(self.post.url)

        out = []
        if "album_images" in self.raw_data:
            # Album: one entry per contained image.
            for image in self.raw_data["album_images"]["images"]:
                out.append(self._compute_image_url(image))
        else:
            # Single image: the top-level dict describes the image itself.
            out.append(self._compute_image_url(self.raw_data))
        return out

    def _compute_image_url(self, image: dict) -> Resource:
        """Build a direct i.imgur.com Resource from an image metadata dict.

        Requires the dict to carry ``"ext"`` and ``"hash"`` keys, as produced
        by :meth:`_get_data`.
        """
        ext = self._validate_extension(image["ext"])
        if image.get("prefer_video", False):
            # Animated posts expose an mp4 variant; prefer it over the gif.
            ext = ".mp4"

        image_url = "https://i.imgur.com/" + image["hash"] + ext
        return Resource(self.post, image_url, Resource.retry_download(image_url))

    @staticmethod
    def _get_data(link: str) -> dict:
        """Fetch an Imgur page and extract its embedded gallery/image JSON.

        Normalises *link* to a canonical ``https://imgur.com/...`` URL first,
        then scrapes the inline gallery-config script from the page source.

        Raises:
            SiteDownloaderError: if the Imgur ID cannot be parsed from the
                link, the page source cannot be read, or the extracted image
                dictionary is not valid JSON.
        """
        try:
            # True for album/gallery links (imgur.com/a/... or /gallery/...).
            # Computed once; the original URL is needed for both branches below.
            is_album = bool(re.search(r".*/(.*?)(gallery/|a/)", link))
            if is_album:
                imgur_id = re.match(r".*/(?:gallery/|a/)(.*?)(?:/.*)?$", link).group(1)
            else:
                # Strip an optional "_d" thumbnail marker and any file extension.
                imgur_id = re.match(r".*/(.*?)(?:_d)?(?:\..{0,})?$", link).group(1)
            gallery = "a/" if is_album else ""
            if len(imgur_id) > 7:
                # IDs longer than 7 chars end in a one-letter thumbnail-size
                # suffix (s/b/t/m/l/h); drop it to get the canonical ID.
                if imgur_id.endswith(("s", "b", "t", "m", "l", "h")):
                    imgur_id = imgur_id[:7]
                else:
                    raise SiteDownloaderError(f"Imgur ID error in link {link}")

            link = f"https://imgur.com/{gallery}{imgur_id}"
        except AttributeError:
            # One of the re.match calls returned None and .group() failed.
            raise SiteDownloaderError(f"Could not extract Imgur ID from {link}")

        res = Imgur.retrieve_url(link, cookies={"over18": "1", "postpagebeta": "0"})

        soup = bs4.BeautifulSoup(res.text, "html.parser")
        scripts = soup.find_all("script", attrs={"type": "text/javascript"})
        scripts = [script.string.replace("\n", "") for script in scripts if script.string]

        # The gallery config lives in exactly one inline script; anything else
        # means the page layout changed or the fetch failed.
        script_regex = re.compile(r"\s*\(function\(widgetFactory\)\s*{\s*widgetFactory\.mergeConfig\(\'gallery\'")
        chosen_script = [script for script in scripts if re.search(script_regex, script)]
        if len(chosen_script) != 1:
            raise SiteDownloaderError(f"Could not read page source from {link}")

        chosen_script = chosen_script[0]

        # Outer: the whole mergeConfig JSON object; inner: its "image" member.
        outer_regex = re.compile(r"widgetFactory\.mergeConfig\(\'gallery\', ({.*})\);")
        inner_regex = re.compile(r"image\s*:(.*),\s*group")
        try:
            image_dict = re.search(outer_regex, chosen_script).group(1)
            image_dict = re.search(inner_regex, image_dict).group(1)
        except AttributeError:
            raise SiteDownloaderError("Could not find image dictionary in page source")

        try:
            image_dict = json.loads(image_dict)
        except json.JSONDecodeError as e:
            raise SiteDownloaderError(f"Could not parse received dict as JSON: {e}")

        return image_dict

    @staticmethod
    def _validate_extension(extension_suffix: str) -> str:
        """Return the extension stripped of any query string, if recognised.

        Raises:
            SiteDownloaderError: for any extension other than jpg/png/mp4/gif.
        """
        # Imgur sometimes appends query parameters (e.g. ".jpg?1"); drop them.
        extension_suffix = re.sub(r"\?.*", "", extension_suffix)
        possible_extensions = (".jpg", ".png", ".mp4", ".gif")
        if extension_suffix in possible_extensions:
            return extension_suffix
        raise SiteDownloaderError(f'"{extension_suffix}" is not recognized as a valid extension for Imgur')
|