1
0
Fork 0
mirror of synced 2024-05-14 01:02:42 +12:00
bulk-downloader-for-reddit/bdfr/site_downloaders/download_factory.py

87 lines
3 KiB
Python
Raw Normal View History

2021-02-11 12:09:37 +13:00
#!/usr/bin/env python3
# coding=utf-8
import re
2021-04-18 23:24:11 +12:00
import urllib.parse
2021-02-11 12:09:37 +13:00
from typing import Type
2021-04-12 19:58:32 +12:00
from bdfr.exceptions import NotADownloadableLinkError
from bdfr.site_downloaders.base_downloader import BaseDownloader
from bdfr.site_downloaders.direct import Direct
from bdfr.site_downloaders.erome import Erome
from bdfr.site_downloaders.fallback_downloaders.ytdlp_fallback import YtdlpFallback
2021-04-12 19:58:32 +12:00
from bdfr.site_downloaders.gallery import Gallery
from bdfr.site_downloaders.gfycat import Gfycat
from bdfr.site_downloaders.imgur import Imgur
2021-06-25 19:47:49 +12:00
from bdfr.site_downloaders.pornhub import PornHub
2021-04-12 19:58:32 +12:00
from bdfr.site_downloaders.redgifs import Redgifs
from bdfr.site_downloaders.self_post import SelfPost
2021-09-11 14:15:35 +12:00
from bdfr.site_downloaders.vidble import Vidble
2022-07-15 17:05:07 +12:00
from bdfr.site_downloaders.vreddit import VReddit
2021-04-12 19:58:32 +12:00
from bdfr.site_downloaders.youtube import Youtube
2021-02-11 12:09:37 +13:00
class DownloadFactory:
    """Select the downloader class responsible for a given URL."""

    @staticmethod
    def pull_lever(url: str) -> Type[BaseDownloader]:
        """Return the downloader class that should handle ``url``.

        The URL is first reduced with :meth:`sanitise_url` and then tested
        against an ordered list of rules; the first rule that matches wins.

        Args:
            url: The URL to classify.

        Returns:
            The downloader class (not an instance) chosen for the URL.

        Raises:
            NotADownloadableLinkError: If no rule matches and the yt-dlp
                fallback reports that it cannot handle the link either.
        """
        sanitised_url = DownloadFactory.sanitise_url(url)

        # These hosts are tested before the generic "looks like a file" rule
        # so that e.g. i.imgur.com/abc.jpg is routed to Imgur, not Direct.
        if re.match(r'(i\.|m\.)?imgur', sanitised_url):
            return Imgur
        if re.match(r'(i\.)?(redgifs|gifdeliverynetwork)', sanitised_url):
            return Redgifs

        # Anything ending in a 3-4 character extension is treated as a direct
        # file link, unless that extension denotes a web page or script.
        if re.match(r'.*/.*\.\w{3,4}(\?[\w;&=]*)?$', sanitised_url) and \
                not DownloadFactory.is_web_resource(sanitised_url):
            return Direct

        # Remaining site-specific rules, evaluated strictly in this order.
        site_rules = (
            (r'erome\.com.*', Erome),
            (r'reddit\.com/gallery/.*', Gallery),
            (r'patreon\.com.*', Gallery),
            (r'gfycat\.', Gfycat),
            (r'reddit\.com/r/', SelfPost),
            (r'(m\.)?youtu\.?be', Youtube),
            (r'i\.redd\.it.*', Direct),
            (r'v\.redd\.it.*', VReddit),
            (r'pornhub\.com.*', PornHub),
            (r'vidble\.com', Vidble),
        )
        for pattern, downloader in site_rules:
            if re.match(pattern, sanitised_url):
                return downloader

        # Last resort: ask the yt-dlp fallback whether it recognises the link.
        if YtdlpFallback.can_handle_link(sanitised_url):
            return YtdlpFallback
        raise NotADownloadableLinkError(f'No downloader module exists for url {url}')

    @staticmethod
    def sanitise_url(url: str) -> str:
        """Reduce ``url`` to ``host + path`` without a leading ``www.``.

        The scheme, query string and fragment are dropped because only the
        network location and path are kept. Leading whitespace and a leading
        ``www``/``www.`` prefix are then removed.

        Args:
            url: The URL to normalise.

        Returns:
            The normalised ``host/path`` string used for rule matching.
        """
        # Anchored with ^ so that only the prefix is stripped; an unanchored
        # pattern would also delete whitespace and "www" occurring later in
        # the host or path (e.g. ".../www.image.png").
        beginning_regex = re.compile(r'^\s*(www\.?)?')
        split_url = urllib.parse.urlsplit(url)
        host_and_path = split_url.netloc + split_url.path
        return beginning_regex.sub('', host_and_path)

    @staticmethod
    def is_web_resource(url: str) -> bool:
        """Tell whether ``url`` ends in a web page or script file extension.

        The check is case-insensitive and requires at least one ``/`` before
        the file name.

        Args:
            url: The URL (typically already sanitised) to inspect.

        Returns:
            ``True`` if the URL ends in one of the known web extensions,
            ``False`` otherwise.
        """
        web_extensions = (
            'asp',
            'aspx',
            'cfm',
            'cfml',
            'css',
            'htm',
            'html',
            'js',
            'php',
            'php3',
            'xhtml',
        )
        return bool(re.match(rf'(?i).*/.*\.({"|".join(web_extensions)})$', url))