1
0
Fork 0
mirror of synced 2024-06-29 11:30:30 +12:00

Refactor Gallery downloader

This commit is contained in:
Serene-Arc 2021-03-17 16:58:29 +10:00 committed by Ali Parlakci
parent 3e9a846e2e
commit ed26907e0d
2 changed files with 18 additions and 43 deletions

View file

@ -1,15 +1,16 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
import json
import logging import logging
import re
from typing import Optional from typing import Optional
import bs4
import requests import requests
from praw.models import Submission from praw.models import Submission
from bulkredditdownloader.site_authenticator import SiteAuthenticator from bulkredditdownloader.exceptions import ResourceNotFound
from bulkredditdownloader.exceptions import NotADownloadableLinkError, ResourceNotFound
from bulkredditdownloader.resource import Resource from bulkredditdownloader.resource import Resource
from bulkredditdownloader.site_authenticator import SiteAuthenticator
from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@ -18,51 +19,25 @@ logger = logging.getLogger(__name__)
class Gallery(BaseDownloader): class Gallery(BaseDownloader):
def __init__(self, post: Submission): def __init__(self, post: Submission):
super().__init__(post) super().__init__(post)
link = self.post.url
self.raw_data = self._get_data(link)
def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]: def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]:
images = {} image_urls = self._get_links(self.post.url)
count = 0 if not image_urls:
for model in self.raw_data['posts']['models']: raise ResourceNotFound('No images found in Reddit gallery')
try: return [Resource(self.post, url) for url in image_urls]
for item in self.raw_data['posts']['models'][model]['media']['gallery']['items']:
try:
images[count] = {'id': item['mediaId'], 'url': self.raw_data['posts']
['models'][model]['media']['mediaMetadata'][item['mediaId']]['s']['u']}
count += 1
except KeyError:
continue
except KeyError:
continue
return self._download_album(images)
@staticmethod @staticmethod
def _get_data(link: str) -> dict: def _get_links(url: str) -> list[str]:
headers = { page = requests.get(url, headers={
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)" "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)"
" Chrome/67.0.3396.87 Safari/537.36 OPR/54.0.2952.64", " Chrome/67.0.3396.87 Safari/537.36 OPR/54.0.2952.64",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8", "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
} }
res = requests.get(link, headers=headers) )
if res.status_code != 200: soup = bs4.BeautifulSoup(page.text)
raise ResourceNotFound(f"Server responded with {res.status_code} to {link}")
page_source = res.text
starting_string = "_r = {" links = soup.findAll('a', attrs={'target': '_blank', 'href': re.compile(r'https://preview\.redd\.it.*')})
ending_string = "</script>" links = [link.get('href') for link in links]
pattern = re.compile(r'(.*?)\?.*$')
starting_string_lenght = len(starting_string) links = [re.search(pattern, link).group(1) for link in links]
try: return links
start_index = page_source.index(starting_string) + starting_string_lenght
end_index = page_source.index(ending_string, start_index)
except ValueError:
raise NotADownloadableLinkError(f"Could not read the page source on {link}")
data = json.loads(page_source[start_index - 1:end_index + 1].strip()[:-1])
return data
def _download_album(self, images: dict):
out = [Resource(self.post, images[image_key]['url']) for image_key in images.keys()]
return out

View file

@ -11,8 +11,8 @@ from bulkredditdownloader.site_downloaders.gallery import Gallery
@pytest.mark.online @pytest.mark.online
@pytest.mark.reddit @pytest.mark.reddit
@pytest.mark.parametrize(('test_submission_id', 'expected_len'), ( @pytest.mark.parametrize(('test_submission_id', 'expected_len'), (
('ljyy27', 4),
('m6lvrh', 4), ('m6lvrh', 4),
('ljyy27', 4),
)) ))
def test_gallery(test_submission_id: str, expected_len: int, reddit_instance: praw.Reddit): def test_gallery(test_submission_id: str, expected_len: int, reddit_instance: praw.Reddit):
test_submission = reddit_instance.submission(id=test_submission_id) test_submission = reddit_instance.submission(id=test_submission_id)