#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import re
import urllib.parse

from bdfr.exceptions import NotADownloadableLinkError
from bdfr.site_downloaders.base_downloader import BaseDownloader
from bdfr.site_downloaders.delay_for_reddit import DelayForReddit
from bdfr.site_downloaders.direct import Direct
from bdfr.site_downloaders.erome import Erome
from bdfr.site_downloaders.fallback_downloaders.ytdlp_fallback import YtdlpFallback
from bdfr.site_downloaders.gallery import Gallery
from bdfr.site_downloaders.gfycat import Gfycat
from bdfr.site_downloaders.imgur import Imgur
from bdfr.site_downloaders.pornhub import PornHub
from bdfr.site_downloaders.redgifs import Redgifs
from bdfr.site_downloaders.self_post import SelfPost
from bdfr.site_downloaders.vidble import Vidble
from bdfr.site_downloaders.vreddit import VReddit
from bdfr.site_downloaders.youtube import Youtube


class DownloadFactory:
    @staticmethod
    def pull_lever(url: str) -> type[BaseDownloader]:
        # Match the sanitised URL against site-specific patterns and return the downloader class that handles it
        sanitised_url = DownloadFactory.sanitise_url(url).lower()
        if re.match(r"(i\.|m\.|o\.)?imgur", sanitised_url):
            return Imgur
        elif re.match(r"(i\.|thumbs\d\.|v\d\.)?(redgifs|gifdeliverynetwork)", sanitised_url):
            return Redgifs
        elif re.match(r"(thumbs\.|giant\.)?gfycat\.", sanitised_url):
            return Gfycat
        elif re.match(r".*/.*\.[a-zA-Z34]{3,4}(\?[\w;&=]*)?$", sanitised_url) and not DownloadFactory.is_web_resource(
            sanitised_url
        ):
            return Direct
        elif re.match(r"erome\.com.*", sanitised_url):
            return Erome
        elif re.match(r"delayforreddit\.com", sanitised_url):
            return DelayForReddit
        elif re.match(r"reddit\.com/gallery/.*", sanitised_url):
            return Gallery
        elif re.match(r"patreon\.com.*", sanitised_url):
            return Gallery
        elif re.match(r"reddit\.com/r/", sanitised_url):
            return SelfPost
        elif re.match(r"(m\.)?youtu\.?be", sanitised_url):
            return Youtube
        elif re.match(r"i\.redd\.it.*", sanitised_url):
            return Direct
        elif re.match(r"v\.redd\.it.*", sanitised_url):
            return VReddit
        elif re.match(r"pornhub\.com.*", sanitised_url):
            return PornHub
        elif re.match(r"vidble\.com", sanitised_url):
            return Vidble
        elif YtdlpFallback.can_handle_link(sanitised_url):
            return YtdlpFallback
        else:
            raise NotADownloadableLinkError(f"No downloader module exists for url {url}")

    @staticmethod
    def sanitise_url(url: str) -> str:
        # Reduce the URL to netloc + path, dropping the scheme and any leading "www."
        beginning_regex = re.compile(r"\s*(www\.?)?")
        split_url = urllib.parse.urlsplit(url)
        split_url = split_url.netloc + split_url.path
        split_url = re.sub(beginning_regex, "", split_url)
        return split_url

    @staticmethod
    def is_web_resource(url: str) -> bool:
        # True when the URL's extension is a web page format rather than a downloadable media file
        web_extensions = (
            "asp",
            "aspx",
            "cfm",
            "cfml",
            "css",
            "htm",
            "html",
            "js",
            "php",
            "php3",
            "xhtml",
        )
        if re.match(rf'(?i).*/.*\.({"|".join(web_extensions)})$', url):
            return True
        else:
            return False
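

# A minimal usage sketch, not part of the upstream module: it assumes the bdfr
# package is importable when this file is run directly, and the URLs below are
# hypothetical examples chosen to exercise different branches of pull_lever().
if __name__ == "__main__":
    example_links = [
        "https://i.redd.it/abcdef123.jpg",  # direct media link -> Direct
        "https://www.reddit.com/gallery/abc123",  # gallery post -> Gallery
        "https://v.redd.it/abcdef123",  # reddit-hosted video -> VReddit
    ]
    for link in example_links:
        try:
            downloader_class = DownloadFactory.pull_lever(link)
            print(f"{link} -> {downloader_class.__name__}")
        except NotADownloadableLinkError as e:
            print(f"{link} -> no downloader: {e}")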