bulk-downloader-for-reddit/bdfr/site_downloaders/download_factory.py

#!/usr/bin/env python3

import re
import urllib.parse

from bdfr.exceptions import NotADownloadableLinkError
from bdfr.site_downloaders.base_downloader import BaseDownloader
from bdfr.site_downloaders.delay_for_reddit import DelayForReddit
from bdfr.site_downloaders.direct import Direct
from bdfr.site_downloaders.erome import Erome
from bdfr.site_downloaders.fallback_downloaders.ytdlp_fallback import YtdlpFallback
from bdfr.site_downloaders.gallery import Gallery
from bdfr.site_downloaders.gfycat import Gfycat
from bdfr.site_downloaders.imgur import Imgur
from bdfr.site_downloaders.pornhub import PornHub
from bdfr.site_downloaders.redgifs import Redgifs
from bdfr.site_downloaders.self_post import SelfPost
from bdfr.site_downloaders.vidble import Vidble
from bdfr.site_downloaders.vreddit import VReddit
from bdfr.site_downloaders.youtube import Youtube


class DownloadFactory:
    """Choose the site downloader class that should handle a given URL."""

    @staticmethod
    def pull_lever(url: str) -> type[BaseDownloader]:
        """Return the downloader class matching ``url``.

        The URL is first reduced with :meth:`sanitise_url` and then tested against
        the patterns below in order. The first match wins, so the order of the
        checks is significant (for example, imgur and redgifs links are claimed
        before the generic file-extension check).

        Args:
            url: The link to classify.

        Returns:
            The downloader class (not an instance) selected for the link.

        Raises:
            NotADownloadableLinkError: If no pattern matches and
                ``YtdlpFallback.can_handle_link`` returns a falsy value.
        """
        sanitised_url = DownloadFactory.sanitise_url(url)
        if re.match(r"(i\.|m\.)?imgur", sanitised_url):
            return Imgur
        if re.match(r"(i\.|thumbs\d\.|v\d\.)?(redgifs|gifdeliverynetwork)", sanitised_url):
            return Redgifs
        # Generic "looks like a file" check: the last path segment ends in a 3-4
        # character extension drawn from letters and the digits 3/4. Web page and
        # script extensions are excluded via is_web_resource.
        if re.match(r".*/.*\.[a-zA-Z34]{3,4}(\?[\w;&=]*)?$", sanitised_url) and not DownloadFactory.is_web_resource(
            sanitised_url
        ):
            return Direct
        if re.match(r"erome\.com.*", sanitised_url):
            return Erome
        if re.match(r"delayforreddit\.com", sanitised_url):
            return DelayForReddit
        if re.match(r"reddit\.com/gallery/.*", sanitised_url):
            return Gallery
        if re.match(r"patreon\.com.*", sanitised_url):
            return Gallery
        if re.match(r"gfycat\.", sanitised_url):
            return Gfycat
        if re.match(r"reddit\.com/r/", sanitised_url):
            return SelfPost
        if re.match(r"(m\.)?youtu\.?be", sanitised_url):
            return Youtube
        if re.match(r"i\.redd\.it.*", sanitised_url):
            return Direct
        if re.match(r"v\.redd\.it.*", sanitised_url):
            return VReddit
        if re.match(r"pornhub\.com.*", sanitised_url):
            return PornHub
        if re.match(r"vidble\.com", sanitised_url):
            return Vidble
        # Last resort: let the yt-dlp based fallback decide whether it can handle it.
        if YtdlpFallback.can_handle_link(sanitised_url):
            return YtdlpFallback
        raise NotADownloadableLinkError(f"No downloader module exists for url {url}")

    @staticmethod
    def sanitise_url(url: str) -> str:
        """Reduce ``url`` to ``netloc + path`` without a leading ``www.`` prefix.

        Only the network location and path components reported by
        ``urllib.parse.urlsplit`` are kept; the scheme, query string and fragment
        are discarded.

        Args:
            url: The link to normalise.

        Returns:
            The host and path, with leading whitespace and an initial ``www`` /
            ``www.`` removed.
        """
        parts = urllib.parse.urlsplit(url)
        host_and_path = parts.netloc + parts.path
        # The pattern must be anchored: every part of it is optional, so an
        # unanchored re.sub would also delete whitespace and "www" occurring in the
        # middle of the host or path (e.g. "/wwwroot/" would become "/root/").
        return re.sub(r"^\s*(www\.?)?", "", host_and_path)

    @staticmethod
    def is_web_resource(url: str) -> bool:
        """Return whether ``url`` ends in a web page or script extension.

        The check is case-insensitive and requires at least one ``/`` before the
        file name, so a bare file name without a path separator never matches.

        Args:
            url: The (sanitised) link to inspect.

        Returns:
            ``True`` if the final extension is one of the listed web extensions,
            ``False`` otherwise.
        """
        web_extensions = (
            "asp",
            "aspx",
            "cfm",
            "cfml",
            "css",
            "htm",
            "html",
            "js",
            "php",
            "php3",
            "xhtml",
        )
        pattern = rf'(?i).*/.*\.({"|".join(web_extensions)})$'
        return re.match(pattern, url) is not None
Add downloader factory 2021-02-11 12:09:37 +13:00			`#!/usr/bin/env python3`

			`import re`
Fix time filters (#279) 2021-04-18 23:24:11 +12:00			`import urllib.parse`
Add downloader factory 2021-02-11 12:09:37 +13:00
Rename module 2021-04-12 19:58:32 +12:00			`from bdfr.exceptions import NotADownloadableLinkError`
			`from bdfr.site_downloaders.base_downloader import BaseDownloader`
Add Delay for Reddit support Adds support for delayforreddit.com non-direct links. 2022-11-06 03:51:33 +13:00			`from bdfr.site_downloaders.delay_for_reddit import DelayForReddit`
Rename module 2021-04-12 19:58:32 +12:00			`from bdfr.site_downloaders.direct import Direct`
			`from bdfr.site_downloaders.erome import Erome`
Rename module to reflect backend change 2021-11-24 13:40:18 +13:00			`from bdfr.site_downloaders.fallback_downloaders.ytdlp_fallback import YtdlpFallback`
Rename module 2021-04-12 19:58:32 +12:00			`from bdfr.site_downloaders.gallery import Gallery`
			`from bdfr.site_downloaders.gfycat import Gfycat`
			`from bdfr.site_downloaders.imgur import Imgur`
Add PornHub module 2021-06-25 19:47:49 +12:00			`from bdfr.site_downloaders.pornhub import PornHub`
Rename module 2021-04-12 19:58:32 +12:00			`from bdfr.site_downloaders.redgifs import Redgifs`
			`from bdfr.site_downloaders.self_post import SelfPost`
Add Vidble to download factory 2021-09-11 14:15:35 +12:00			`from bdfr.site_downloaders.vidble import Vidble`
Rename class 2022-07-15 17:05:07 +12:00			`from bdfr.site_downloaders.vreddit import VReddit`
Rename module 2021-04-12 19:58:32 +12:00			`from bdfr.site_downloaders.youtube import Youtube`
Add downloader factory 2021-02-11 12:09:37 +13:00

			`class DownloadFactory:`
			`@staticmethod`
pep585 and pathlib updates 2023-01-26 16:23:59 +13:00			`def pull_lever(url: str) -> type[BaseDownloader]:`
Rename function 2021-05-25 20:51:24 +12:00			`sanitised_url = DownloadFactory.sanitise_url(url)`
Format according to the black standard 2022-12-03 18:11:17 +13:00			`if re.match(r"(i\.|m\.)?imgur", sanitised_url):`
download_factory.py: check if url has ext first 2021-04-04 04:44:53 +12:00			`return Imgur`
Add new Redgifs subdomain Seems there's a v3 subdomain now (looks like it's mostly for mobile) 2023-01-07 05:56:54 +13:00			`elif re.match(r"(i\.|thumbs\d\.|v\d\.)?(redgifs|gifdeliverynetwork)", sanitised_url):`
Redgifs fixed? If this doesn't work then I give up... 2022-09-17 12:41:17 +12:00			`return Redgifs`
Update download_factory.py Attempt to fix #724 Narrows down characters available to extensions in the regex. Outside of 3 and 4, the only extensions that I can think of this doesn't hit are bz2 and 7z (which wasn't caught before). 2022-12-20 16:02:16 +13:00			`elif re.match(r".*/.*\.[a-zA-Z34]{3,4}(\?[\w;&=]*)?$", sanitised_url) and not DownloadFactory.is_web_resource(`
Format according to the black standard 2022-12-03 18:11:17 +13:00			`sanitised_url`
			`):`
download_factory.py: check if url has ext first 2021-04-04 04:44:53 +12:00			`return Direct`
Format according to the black standard 2022-12-03 18:11:17 +13:00			`elif re.match(r"erome\.com.*", sanitised_url):`
Add downloader factory 2021-02-11 12:09:37 +13:00			`return Erome`
Format according to the black standard 2022-12-03 18:11:17 +13:00			`elif re.match(r"delayforreddit\.com", sanitised_url):`
Add Delay for Reddit support Adds support for delayforreddit.com non-direct links. 2022-11-06 03:51:33 +13:00			`return DelayForReddit`
Format according to the black standard 2022-12-03 18:11:17 +13:00			`elif re.match(r"reddit\.com/gallery/.*", sanitised_url):`
Add some tests for DownloadFactory 2021-03-01 12:51:44 +13:00			`return Gallery`
Format according to the black standard 2022-12-03 18:11:17 +13:00			`elif re.match(r"patreon\.com.*", sanitised_url):`
Add Patreon image support 2021-12-19 16:44:24 +13:00			`return Gallery`
Format according to the black standard 2022-12-03 18:11:17 +13:00			`elif re.match(r"gfycat\.", sanitised_url):`
Add some tests for DownloadFactory 2021-03-01 12:51:44 +13:00			`return Gfycat`
Format according to the black standard 2022-12-03 18:11:17 +13:00			`elif re.match(r"reddit\.com/r/", sanitised_url):`
Add some tests for DownloadFactory 2021-03-01 12:51:44 +13:00			`return SelfPost`
Format according to the black standard 2022-12-03 18:11:17 +13:00			`elif re.match(r"(m\.)?youtu\.?be", sanitised_url):`
Add logic to handle mobile youtube (m.youtube) links. 2021-04-02 18:56:31 +13:00			`return Youtube`
Format according to the black standard 2022-12-03 18:11:17 +13:00			`elif re.match(r"i\.redd\.it.*", sanitised_url):`
Split regex for download factory 2021-03-28 13:10:46 +13:00			`return Direct`
Format according to the black standard 2022-12-03 18:11:17 +13:00			`elif re.match(r"v\.redd\.it.*", sanitised_url):`
Rename class 2022-07-15 17:05:07 +12:00			`return VReddit`
Format according to the black standard 2022-12-03 18:11:17 +13:00			`elif re.match(r"pornhub\.com.*", sanitised_url):`
Add PornHub module 2021-06-25 19:47:49 +12:00			`return PornHub`
Format according to the black standard 2022-12-03 18:11:17 +13:00			`elif re.match(r"vidble\.com", sanitised_url):`
Add Vidble to download factory 2021-09-11 14:15:35 +12:00			`return Vidble`
Rename module to reflect backend change 2021-11-24 13:40:18 +13:00			`elif YtdlpFallback.can_handle_link(sanitised_url):`
			`return YtdlpFallback`
Add downloader factory 2021-02-11 12:09:37 +13:00			`else:`
Format according to the black standard 2022-12-03 18:11:17 +13:00			`raise NotADownloadableLinkError(f"No downloader module exists for url {url}")`
Fix time filters (#279) 2021-04-18 23:24:11 +12:00
			`@staticmethod`
Rename function 2021-05-25 20:51:24 +12:00			`def sanitise_url(url: str) -> str:`
Format according to the black standard 2022-12-03 18:11:17 +13:00			`beginning_regex = re.compile(r"\s*(www\.?)?")`
Fix time filters (#279) 2021-04-18 23:24:11 +12:00			`split_url = urllib.parse.urlsplit(url)`
			`split_url = split_url.netloc + split_url.path`
Format according to the black standard 2022-12-03 18:11:17 +13:00			`split_url = re.sub(beginning_regex, "", split_url)`
Fix time filters (#279) 2021-04-18 23:24:11 +12:00			`return split_url`
Add blacklist for web filetypes 2021-05-25 20:59:32 +12:00
			`@staticmethod`
			`def is_web_resource(url: str) -> bool:`
			`web_extensions = (`
Format according to the black standard 2022-12-03 18:11:17 +13:00			`"asp",`
			`"aspx",`
			`"cfm",`
			`"cfml",`
			`"css",`
			`"htm",`
			`"html",`
			`"js",`
			`"php",`
			`"php3",`
			`"xhtml",`
Add blacklist for web filetypes 2021-05-25 20:59:32 +12:00			`)`
			`if re.match(rf'(?i).*/.*\.({"|".join(web_extensions)})$', url):`
			`return True`
			`else:`
			`return False`