From 33312687acce22c46c864c14904397e4655eecdd Mon Sep 17 00:00:00 2001 From: Eli Lipsitz Date: Sun, 12 Sep 2021 16:50:31 -0500 Subject: [PATCH 1/3] imgur: download videos as mp4 instead of gif Some imgur URLs have the extension ".gifv" and show up as a gif, even though they're actually supposed to be mp4 videos. Imgur serves all videos/gifs as both .gif and .mp4. The image dict has a key "prefer_video" to distinguish the two. This commit overrides the .gif extension if "prefer_video" is true to ensure we download the submission as originally intended. --- bdfr/site_downloaders/imgur.py | 6 +++++- tests/site_downloaders/test_imgur.py | 10 +++++++++- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/bdfr/site_downloaders/imgur.py b/bdfr/site_downloaders/imgur.py index f0b7012..a3e3135 100644 --- a/bdfr/site_downloaders/imgur.py +++ b/bdfr/site_downloaders/imgur.py @@ -32,7 +32,11 @@ class Imgur(BaseDownloader): return out def _compute_image_url(self, image: dict) -> Resource: - image_url = 'https://i.imgur.com/' + image['hash'] + self._validate_extension(image['ext']) + ext = self._validate_extension(image['ext']) + if image.get('prefer_video', False): + ext = '.mp4' + + image_url = 'https://i.imgur.com/' + image['hash'] + ext return Resource(self.post, image_url, Resource.retry_download(image_url)) @staticmethod diff --git a/tests/site_downloaders/test_imgur.py b/tests/site_downloaders/test_imgur.py index bfb7405..4c754ec 100644 --- a/tests/site_downloaders/test_imgur.py +++ b/tests/site_downloaders/test_imgur.py @@ -111,7 +111,7 @@ def test_imgur_extension_validation_bad(test_extension: str): ), ( 'https://imgur.com/gallery/IjJJdlC', - ('7227d4312a9779b74302724a0cfa9081',), + ('740b006cf9ec9d6f734b6e8f5130bdab',), ), ( 'https://imgur.com/a/dcc84Gt', @@ -142,6 +142,14 @@ def test_imgur_extension_validation_bad(test_extension: str): 'https://imgur.com/ubYwpbk.GIFV', ('d4a774aac1667783f9ed3a1bd02fac0c',), ), + ( + 'https://i.imgur.com/j1CNCZY.gifv', 
('58e7e6d972058c18b7ecde910ca147e3',), + ), + ( + 'https://i.imgur.com/uTvtQsw.gifv', + ('46c86533aa60fc0e09f2a758513e3ac2',), + ), )) def test_find_resources(test_url: str, expected_hashes: list[str]): mock_download = Mock() From 80baab8de7e64e731eb300ade7afbc4474126976 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Tue, 14 Sep 2021 13:47:46 +1000 Subject: [PATCH 2/3] Fix bug with different Vidble links --- bdfr/site_downloaders/vidble.py | 8 +++++- tests/site_downloaders/test_vidble.py | 36 ++++++++++++++++----------- 2 files changed, 28 insertions(+), 16 deletions(-) diff --git a/bdfr/site_downloaders/vidble.py b/bdfr/site_downloaders/vidble.py index 2f8f4f4..5cea0cb 100644 --- a/bdfr/site_downloaders/vidble.py +++ b/bdfr/site_downloaders/vidble.py @@ -22,7 +22,10 @@ class Vidble(BaseDownloader): super().__init__(post) def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]: - res = self.get_links(self.post.url) + try: + res = self.get_links(self.post.url) + except AttributeError: + raise SiteDownloaderError(f'Could not read page at {self.post.url}') if not res: raise SiteDownloaderError(rf'No resources found at {self.post.url}') res = [Resource(self.post, r, Resource.retry_download(r)) for r in res] @@ -30,6 +33,9 @@ class Vidble(BaseDownloader): @staticmethod def get_links(url: str) -> set[str]: + if not re.search(r'vidble.com/(show/|album/|watch\?v)', url): + url = re.sub(r'/(\w*?)$', r'/show/\1', url) + page = requests.get(url) soup = bs4.BeautifulSoup(page.text, 'html.parser') content_div = soup.find('div', attrs={'id': 'ContentPlaceHolder1_divContent'}) diff --git a/tests/site_downloaders/test_vidble.py b/tests/site_downloaders/test_vidble.py index 1617bf1..0c5ebb2 100644 --- a/tests/site_downloaders/test_vidble.py +++ b/tests/site_downloaders/test_vidble.py @@ -33,6 +33,9 @@ def test_change_med_url(test_url: str, expected: str): ('https://vidble.com/watch?v=0q4nWakqM6kzQWxlePD8N62Dsflev0N9', { 
'https://www.vidble.com/0q4nWakqM6kzQWxlePD8N62Dsflev0N9.mp4', }), + ('https://www.vidble.com/pHuwWkOcEb', { + 'https://www.vidble.com/pHuwWkOcEb.jpg', + }), )) def test_get_links(test_url: str, expected: set[str]): results = Vidble.get_links(test_url) @@ -40,21 +43,24 @@ def test_get_links(test_url: str, expected: set[str]): @pytest.mark.parametrize(('test_url', 'expected_hashes'), ( - ('https://www.vidble.com/show/UxsvAssYe5', { - '0ef2f8e0e0b45936d2fb3e6fbdf67e28', - }), - ('https://vidble.com/show/RDFbznUvcN', { - 'c2dd30a71e32369c50eed86f86efff58', - }), - ('https://vidble.com/album/h0jTLs6B', { - '3b3cba02e01c91f9858a95240b942c71', - 'dd6ecf5fc9e936f9fb614eb6a0537f99', - 'b31a942cd8cdda218ed547bbc04c3a27', - '6f77c570b451eef4222804bd52267481', - }), - ('https://vidble.com/watch?v=0q4nWakqM6kzQWxlePD8N62Dsflev0N9', { - 'cebe9d5f24dba3b0443e5097f160ca83', - }), + ('https://www.vidble.com/show/UxsvAssYe5', { + '0ef2f8e0e0b45936d2fb3e6fbdf67e28', + }), + ('https://vidble.com/show/RDFbznUvcN', { + 'c2dd30a71e32369c50eed86f86efff58', + }), + ('https://vidble.com/album/h0jTLs6B', { + '3b3cba02e01c91f9858a95240b942c71', + 'dd6ecf5fc9e936f9fb614eb6a0537f99', + 'b31a942cd8cdda218ed547bbc04c3a27', + '6f77c570b451eef4222804bd52267481', + }), + ('https://vidble.com/watch?v=0q4nWakqM6kzQWxlePD8N62Dsflev0N9', { + 'cebe9d5f24dba3b0443e5097f160ca83', + }), + ('https://www.vidble.com/pHuwWkOcEb', { + '585f486dd0b2f23a57bddbd5bf185bc7', + }), )) def test_find_resources(test_url: str, expected_hashes: set[str]): mock_download = Mock() From 01923fda0e18b58e8667fac9502f9fab3aa1d9fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ali=20Parlak=C3=A7=C4=B1?= Date: Tue, 14 Sep 2021 21:01:21 +0300 Subject: [PATCH 3/3] Bump version 2.4.1 --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index 196bd9e..5792355 100644 --- a/setup.cfg +++ b/setup.cfg @@ -4,7 +4,7 @@ description_file = README.md description_content_type = text/markdown home_page 
= https://github.com/aliparlakci/bulk-downloader-for-reddit keywords = reddit, download, archive -version = 2.4.0 +version = 2.4.1 author = Ali Parlakci author_email = parlakciali@gmail.com maintainer = Serene Arc