From 2384c03170c6eb8b6da7221e936c21b93093fe25 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Mon, 5 Apr 2021 17:21:04 +1000 Subject: [PATCH] Refactor method to base class --- .../site_downloaders/base_downloader.py | 11 ++++++++++- bulkredditdownloader/site_downloaders/erome.py | 3 +-- bulkredditdownloader/site_downloaders/gallery.py | 11 +++++------ bulkredditdownloader/site_downloaders/gfycat.py | 10 ++-------- .../site_downloaders/gif_delivery_network.py | 9 ++------- bulkredditdownloader/site_downloaders/imgur.py | 8 ++------ bulkredditdownloader/site_downloaders/redgifs.py | 16 ++++++---------- 7 files changed, 28 insertions(+), 40 deletions(-) diff --git a/bulkredditdownloader/site_downloaders/base_downloader.py b/bulkredditdownloader/site_downloaders/base_downloader.py index 458f3bc..9c44de6 100644 --- a/bulkredditdownloader/site_downloaders/base_downloader.py +++ b/bulkredditdownloader/site_downloaders/base_downloader.py @@ -5,10 +5,12 @@ import logging from abc import ABC, abstractmethod from typing import Optional +import requests from praw.models import Submission -from bulkredditdownloader.site_authenticator import SiteAuthenticator +from bulkredditdownloader.exceptions import ResourceNotFound from bulkredditdownloader.resource import Resource +from bulkredditdownloader.site_authenticator import SiteAuthenticator logger = logging.getLogger(__name__) @@ -22,3 +24,10 @@ class BaseDownloader(ABC): def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]: """Return list of all un-downloaded Resources from submission""" raise NotImplementedError + + @staticmethod + def get_link(url: str, cookies: dict = None, headers: dict = None) -> requests.Response: + res = requests.get(url, cookies=cookies, headers=headers) + if res.status_code != 200: + raise ResourceNotFound(f'Server responded with {res.status_code} to {url}') + return res diff --git a/bulkredditdownloader/site_downloaders/erome.py b/bulkredditdownloader/site_downloaders/erome.py index d9b48a3..c452175 100644 --- a/bulkredditdownloader/site_downloaders/erome.py +++ b/bulkredditdownloader/site_downloaders/erome.py @@ -5,7 +5,6 @@ import re from typing import Optional import bs4 -import requests from praw.models import Submission from bulkredditdownloader.exceptions import NotADownloadableLinkError @@ -34,7 +33,7 @@ class Erome(BaseDownloader): @staticmethod def _get_links(url: str) -> set[str]: - page = requests.get(url) + page = Erome.get_link(url) soup = bs4.BeautifulSoup(page.text, 'html.parser') front_images = soup.find_all('img', attrs={'class': 'lasyload'}) out = [im.get('data-src') for im in front_images] diff --git a/bulkredditdownloader/site_downloaders/gallery.py b/bulkredditdownloader/site_downloaders/gallery.py index 22afc76..bc9390f 100644 --- a/bulkredditdownloader/site_downloaders/gallery.py +++ b/bulkredditdownloader/site_downloaders/gallery.py @@ -5,7 +5,6 @@ import re from typing import Optional import bs4 -import requests from praw.models import Submission from bulkredditdownloader.exceptions import ResourceNotFound @@ -28,12 +27,12 @@ class Gallery(BaseDownloader): @staticmethod def _get_links(url: str) -> list[str]: - page = requests.get(url, headers={ - "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)" - " Chrome/67.0.3396.87 Safari/537.36 OPR/54.0.2952.64", - "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8", + resource_headers = { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)' + ' Chrome/67.0.3396.87 Safari/537.36 OPR/54.0.2952.64', + 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', } - ) + page = Gallery.get_link(url, headers=resource_headers) soup = bs4.BeautifulSoup(page.text, 'html.parser') links = soup.findAll('a', attrs={'target': '_blank', 'href': re.compile(r'https://preview\.redd\.it.*')}) diff --git a/bulkredditdownloader/site_downloaders/gfycat.py b/bulkredditdownloader/site_downloaders/gfycat.py index a5051ca..d54fcf6 100644 --- a/bulkredditdownloader/site_downloaders/gfycat.py +++ b/bulkredditdownloader/site_downloaders/gfycat.py @@ -4,7 +4,6 @@ import json import re from typing import Optional -import requests from bs4 import BeautifulSoup from praw.models import Submission @@ -22,19 +21,14 @@ class Gfycat(GifDeliveryNetwork): @staticmethod def _get_link(url: str) -> str: - if re.match(r'\.(webm|mp4|gif)$', url): - return url - gfycat_id = re.match(r'.*/(.*?)/?$', url).group(1) url = 'https://gfycat.com/' + gfycat_id - response = requests.get(url) - page_source = response.text - + response = Gfycat.get_link(url) if 'gifdeliverynetwork' in response.url: return GifDeliveryNetwork._get_link(url) - soup = BeautifulSoup(page_source, 'html.parser') + soup = BeautifulSoup(response.text, 'html.parser') content = soup.find('script', attrs={'data-react-helmet': 'true', 'type': 'application/ld+json'}) out = json.loads(content.contents[0]).get('video').get('contentUrl') diff --git a/bulkredditdownloader/site_downloaders/gif_delivery_network.py b/bulkredditdownloader/site_downloaders/gif_delivery_network.py index 15ee76f..878dcb6 100644 --- a/bulkredditdownloader/site_downloaders/gif_delivery_network.py +++ b/bulkredditdownloader/site_downloaders/gif_delivery_network.py @@ -1,9 +1,7 @@ #!/usr/bin/env python3 -import re from typing import Optional -import requests from bs4 import BeautifulSoup from praw.models import Submission @@ -23,12 +21,9 @@ class GifDeliveryNetwork(BaseDownloader): @staticmethod def _get_link(url: str) -> str: - if re.match(r'https://.*\.(mp4|webm|gif)(\?.*)?$', url): - return url + page = GifDeliveryNetwork.get_link(url) - page_source = requests.get(url).text - - soup = BeautifulSoup(page_source, 'html.parser') + soup = BeautifulSoup(page.text, 'html.parser') content = soup.find('source', attrs={'id': 'mp4Source', 'type': 'video/mp4'}) if content is None or content.get('src') is None: diff --git a/bulkredditdownloader/site_downloaders/imgur.py b/bulkredditdownloader/site_downloaders/imgur.py index 9e311d6..4314db3 100644 --- a/bulkredditdownloader/site_downloaders/imgur.py +++ b/bulkredditdownloader/site_downloaders/imgur.py @@ -5,10 +5,9 @@ import re from typing import Optional import bs4 -import requests from praw.models import Submission -from bulkredditdownloader.exceptions import NotADownloadableLinkError, ResourceNotFound, SiteDownloaderError +from bulkredditdownloader.exceptions import NotADownloadableLinkError, SiteDownloaderError from bulkredditdownloader.resource import Resource from bulkredditdownloader.site_authenticator import SiteAuthenticator from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader @@ -42,10 +41,7 @@ class Imgur(BaseDownloader): link = link.replace('i.imgur', 'imgur') link = link.rstrip('.gifv') - res = requests.get(link, cookies={'over18': '1', 'postpagebeta': '0'}) - - if res.status_code != 200: - raise ResourceNotFound(f'Server responded with {res.status_code} to {link}') + res = Imgur.get_link(link, cookies={'over18': '1', 'postpagebeta': '0'}) soup = bs4.BeautifulSoup(res.text, 'html.parser') scripts = soup.find_all('script', attrs={'type': 'text/javascript'}) diff --git a/bulkredditdownloader/site_downloaders/redgifs.py b/bulkredditdownloader/site_downloaders/redgifs.py index e4ee567..46adb8d 100644 --- a/bulkredditdownloader/site_downloaders/redgifs.py +++ b/bulkredditdownloader/site_downloaders/redgifs.py @@ -4,7 +4,6 @@ import json import re from typing import Optional -import requests from bs4 import BeautifulSoup from praw.models import Submission @@ -23,20 +22,17 @@ class Redgifs(GifDeliveryNetwork): @staticmethod def _get_link(url: str) -> str: - if re.match(r'https://.*\.(mp4|webm|gif)(\?.*)?$', url): - return url - redgif_id = re.match(r'.*/(.*?)/?$', url).group(1) url = 'https://redgifs.com/watch/' + redgif_id - headers = {'User-Agent': - 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)' - ' Chrome/67.0.3396.87 Safari/537.36 OPR/54.0.2952.64' - } + headers = { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)' + ' Chrome/67.0.3396.87 Safari/537.36 OPR/54.0.2952.64', + } - page_source = requests.get(url, headers=headers).text + page = Redgifs.get_link(url, headers=headers) - soup = BeautifulSoup(page_source, 'html.parser') + soup = BeautifulSoup(page.text, 'html.parser') content = soup.find('script', attrs={'data-react-helmet': 'true', 'type': 'application/ld+json'}) if content is None: