
Refactor method to base class

commit 2384c03170 (parent 500cee4bae)
Serene-Arc, 2021-04-05 17:21:04 +10:00; committed by Ali Parlakci
7 changed files with 28 additions and 40 deletions

bulkredditdownloader/site_downloaders/base_downloader.py

@@ -5,10 +5,12 @@ import logging
 from abc import ABC, abstractmethod
 from typing import Optional
 
+import requests
 from praw.models import Submission
 
-from bulkredditdownloader.site_authenticator import SiteAuthenticator
+from bulkredditdownloader.exceptions import ResourceNotFound
 from bulkredditdownloader.resource import Resource
+from bulkredditdownloader.site_authenticator import SiteAuthenticator
 
 logger = logging.getLogger(__name__)
@@ -22,3 +24,10 @@ class BaseDownloader(ABC):
     def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]:
         """Return list of all un-downloaded Resources from submission"""
         raise NotImplementedError
+
+    @staticmethod
+    def get_link(url: str, cookies: dict = None, headers: dict = None) -> requests.Response:
+        res = requests.get(url, cookies=cookies, headers=headers)
+        if res.status_code != 200:
+            raise ResourceNotFound(f'Server responded with {res.status_code} to {url}')
+        return res
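The new helper is a `@staticmethod`, so call sites can invoke it on the class without instantiating anything. A minimal usage sketch (the URL and User-Agent value are hypothetical; everything else comes from the hunk above):

```python
import requests

from bulkredditdownloader.exceptions import ResourceNotFound
from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader

try:
    # cookies and headers are optional and forwarded verbatim to requests.get
    page: requests.Response = BaseDownloader.get_link(
        'https://example.com/some/page',         # hypothetical URL
        headers={'User-Agent': 'bdfr-example'},  # hypothetical header value
    )
    print(page.text[:80])
except ResourceNotFound as err:
    # get_link raises on any non-200 status instead of returning a bad response
    print(err)
```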

bulkredditdownloader/site_downloaders/erome.py

@@ -5,7 +5,6 @@ import re
 from typing import Optional
 
 import bs4
-import requests
 from praw.models import Submission
 
 from bulkredditdownloader.exceptions import NotADownloadableLinkError
@@ -34,7 +33,7 @@ class Erome(BaseDownloader):
 
     @staticmethod
     def _get_links(url: str) -> set[str]:
-        page = requests.get(url)
+        page = Erome.get_link(url)
         soup = bs4.BeautifulSoup(page.text, 'html.parser')
         front_images = soup.find_all('img', attrs={'class': 'lasyload'})
         out = [im.get('data-src') for im in front_images]

bulkredditdownloader/site_downloaders/gallery.py

@@ -5,7 +5,6 @@ import re
 from typing import Optional
 
 import bs4
-import requests
 from praw.models import Submission
 
 from bulkredditdownloader.exceptions import ResourceNotFound
@@ -28,12 +27,12 @@ class Gallery(BaseDownloader):
 
     @staticmethod
     def _get_links(url: str) -> list[str]:
-        page = requests.get(url, headers={
-            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)"
-                          " Chrome/67.0.3396.87 Safari/537.36 OPR/54.0.2952.64",
-            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
-        }
-        )
+        resource_headers = {
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
+                          ' Chrome/67.0.3396.87 Safari/537.36 OPR/54.0.2952.64',
+            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
+        }
+        page = Gallery.get_link(url, headers=resource_headers)
         soup = bs4.BeautifulSoup(page.text, 'html.parser')
         links = soup.findAll('a', attrs={'target': '_blank', 'href': re.compile(r'https://preview\.redd\.it.*')})
 

bulkredditdownloader/site_downloaders/gfycat.py

@@ -4,7 +4,6 @@ import json
 import re
 from typing import Optional
 
-import requests
 from bs4 import BeautifulSoup
 from praw.models import Submission
 
@@ -22,19 +21,14 @@ class Gfycat(GifDeliveryNetwork):
 
     @staticmethod
     def _get_link(url: str) -> str:
-        if re.match(r'\.(webm|mp4|gif)$', url):
-            return url
-
         gfycat_id = re.match(r'.*/(.*?)/?$', url).group(1)
         url = 'https://gfycat.com/' + gfycat_id
 
-        response = requests.get(url)
-        page_source = response.text
-
+        response = Gfycat.get_link(url)
         if 'gifdeliverynetwork' in response.url:
             return GifDeliveryNetwork._get_link(url)
 
-        soup = BeautifulSoup(page_source, 'html.parser')
+        soup = BeautifulSoup(response.text, 'html.parser')
         content = soup.find('script', attrs={'data-react-helmet': 'true', 'type': 'application/ld+json'})
 
         out = json.loads(content.contents[0]).get('video').get('contentUrl')

bulkredditdownloader/site_downloaders/gif_delivery_network.py

@@ -1,9 +1,7 @@
 #!/usr/bin/env python3
 
-import re
 from typing import Optional
 
-import requests
 from bs4 import BeautifulSoup
 from praw.models import Submission
 
@@ -23,12 +21,9 @@ class GifDeliveryNetwork(BaseDownloader):
 
     @staticmethod
     def _get_link(url: str) -> str:
-        if re.match(r'https://.*\.(mp4|webm|gif)(\?.*)?$', url):
-            return url
-
-        page_source = requests.get(url).text
+        page = GifDeliveryNetwork.get_link(url)
 
-        soup = BeautifulSoup(page_source, 'html.parser')
+        soup = BeautifulSoup(page.text, 'html.parser')
         content = soup.find('source', attrs={'id': 'mp4Source', 'type': 'video/mp4'})
 
         if content is None or content.get('src') is None:

bulkredditdownloader/site_downloaders/imgur.py

@@ -5,10 +5,9 @@ import re
 from typing import Optional
 
 import bs4
-import requests
 from praw.models import Submission
 
-from bulkredditdownloader.exceptions import NotADownloadableLinkError, ResourceNotFound, SiteDownloaderError
+from bulkredditdownloader.exceptions import NotADownloadableLinkError, SiteDownloaderError
 from bulkredditdownloader.resource import Resource
 from bulkredditdownloader.site_authenticator import SiteAuthenticator
 from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader
@@ -42,10 +41,7 @@ class Imgur(BaseDownloader):
         link = link.replace('i.imgur', 'imgur')
         link = link.rstrip('.gifv')
 
-        res = requests.get(link, cookies={'over18': '1', 'postpagebeta': '0'})
-        if res.status_code != 200:
-            raise ResourceNotFound(f'Server responded with {res.status_code} to {link}')
-
+        res = Imgur.get_link(link, cookies={'over18': '1', 'postpagebeta': '0'})
         soup = bs4.BeautifulSoup(res.text, 'html.parser')
         scripts = soup.find_all('script', attrs={'type': 'text/javascript'})
 
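This hunk is the clearest payoff of the refactor: the inline status check and `ResourceNotFound` raise disappear because `get_link` now performs them, while Imgur's site-specific cookies still pass straight through. A sketch of the resulting call-site behaviour, assuming the module path mirrors the class name and using a made-up gallery link:

```python
from bulkredditdownloader.exceptions import ResourceNotFound
from bulkredditdownloader.site_downloaders.imgur import Imgur  # path assumed

try:
    res = Imgur.get_link(
        'https://imgur.com/gallery/abc123',            # hypothetical link
        cookies={'over18': '1', 'postpagebeta': '0'},  # forwarded to requests.get
    )
except ResourceNotFound as err:
    # formerly raised inline in Imgur; now raised by the base class
    print(f'Imgur fetch failed: {err}')
```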

bulkredditdownloader/site_downloaders/redgifs.py

@@ -4,7 +4,6 @@ import json
 import re
 from typing import Optional
 
-import requests
 from bs4 import BeautifulSoup
 from praw.models import Submission
 
@@ -23,20 +22,17 @@ class Redgifs(GifDeliveryNetwork):
 
     @staticmethod
     def _get_link(url: str) -> str:
-        if re.match(r'https://.*\.(mp4|webm|gif)(\?.*)?$', url):
-            return url
-
         redgif_id = re.match(r'.*/(.*?)/?$', url).group(1)
         url = 'https://redgifs.com/watch/' + redgif_id
 
-        headers = {'User-Agent':
-                   'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
-                   ' Chrome/67.0.3396.87 Safari/537.36 OPR/54.0.2952.64'
-                   }
+        headers = {
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
+                          ' Chrome/67.0.3396.87 Safari/537.36 OPR/54.0.2952.64',
+        }
 
-        page_source = requests.get(url, headers=headers).text
-        soup = BeautifulSoup(page_source, 'html.parser')
+        page = Redgifs.get_link(url, headers=headers)
+        soup = BeautifulSoup(page.text, 'html.parser')
         content = soup.find('script', attrs={'data-react-helmet': 'true', 'type': 'application/ld+json'})
 
         if content is None:
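After the refactor every `_get_link` follows the same shape: canonicalise the URL, fetch it through the inherited helper, scrape out the direct media URL. A usage sketch, again assuming the module path mirrors the class name and using a made-up watch-page ID:

```python
from bulkredditdownloader.site_downloaders.redgifs import Redgifs  # path assumed

# Rebuilds https://redgifs.com/watch/<id> from whatever redgifs URL it is
# given, fetches the page via the inherited get_link, and scrapes the direct
# media URL out of the embedded ld+json script block shown in the hunk above.
media_url = Redgifs._get_link('https://redgifs.com/watch/hypotheticalid')
print(media_url)
```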