From 0e28c7ed7cf4c582fcca84c80149c7b55c3e0f50 Mon Sep 17 00:00:00 2001
From: OMEGARAZER <869111+OMEGARAZER@users.noreply.github.com>
Date: Sat, 4 Feb 2023 13:24:38 -0500
Subject: [PATCH] Gfycat API

Moves Gfycat to use the API via a site access key.

Adds cachetools as a dependency to reuse API keys for Gfycat/Redgifs at 95%
of their TTL. Includes tests to verify the caching.

Updates versions of requests/yt-dlp/black/isort/pytest.

Adds a default timeout to requests calls.

Adds validate-pyproject and blacken-docs to pre-commit and updates hook
versions.
---
 .pre-commit-config.yaml                  | 15 +++++++--
 bdfr/__main__.py                         |  2 +-
 bdfr/downloader.py                       |  2 +-
 bdfr/oauth2.py                           |  4 ++-
 bdfr/resource.py                         |  4 +--
 bdfr/site_downloaders/base_downloader.py | 13 ++++++-
 bdfr/site_downloaders/gallery.py         |  2 +-
 bdfr/site_downloaders/gfycat.py          | 43 ++++++++++++++++++++----
 bdfr/site_downloaders/redgifs.py         | 15 +++++----
 bdfr/site_downloaders/vidble.py          |  2 +-
 pyproject.toml                           | 13 +++----
 tests/site_downloaders/test_direct.py    |  5 +--
 tests/site_downloaders/test_gfycat.py    |  7 ++++
 tests/site_downloaders/test_redgifs.py   |  7 ++++
 14 files changed, 101 insertions(+), 33 deletions(-)
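
Review note (kept below the ---, where git am ignores it): the two cache TTLs
introduced by this patch, 3420 for Gfycat and 82080 for Redgifs, line up with
95% of a one-hour and a one-day token lifetime (3600 * 0.95 = 3420;
86400 * 0.95 = 82080), so a cached token is always dropped slightly before the
service would expire it. A minimal runnable sketch of the pattern; the
lifetime constant and the counter standing in for the real HTTP token request
are illustrative only, not part of this patch:

    import itertools

    from cachetools import TTLCache, cached

    TOKEN_LIFETIME = 3600  # assumed service-side lifetime; 3600 * 0.95 = 3420

    _fake_fetch = itertools.count()  # stand-in for the real token request

    @cached(cache=TTLCache(maxsize=5, ttl=TOKEN_LIFETIME * 0.95))
    def get_token() -> str:
        # The body runs only on a cache miss; within the ttl window every
        # caller gets the token fetched by the first call.
        return f"token-{next(_fake_fetch)}"

    assert get_token() == get_token()  # one fetch, reused; mirrors the new tests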
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 28bd140..0537e57 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -2,13 +2,18 @@
 # See https://pre-commit.com/hooks.html for more hooks
 repos:
+  - repo: https://github.com/abravalheri/validate-pyproject
+    rev: v0.12.1
+    hooks:
+      - id: validate-pyproject
+
   - repo: https://github.com/psf/black
-    rev: 22.12.0
+    rev: 23.1.0
     hooks:
       - id: black
 
   - repo: https://github.com/pycqa/isort
-    rev: 5.11.4
+    rev: 5.12.0
     hooks:
       - id: isort
         name: isort (python)
 
@@ -23,3 +28,9 @@ repos:
     rev: v0.12.0
     hooks:
       - id: markdownlint
+
+  - repo: https://github.com/adamchainz/blacken-docs
+    rev: 1.13.0
+    hooks:
+      - id: blacken-docs
+        additional_dependencies: [black>=23.1.0]
diff --git a/bdfr/__main__.py b/bdfr/__main__.py
index dadba51..8ae2b5b 100644
--- a/bdfr/__main__.py
+++ b/bdfr/__main__.py
@@ -82,7 +82,7 @@ def _check_version(context, param, value):
     if not value or context.resilient_parsing:
         return
     current = __version__
-    latest = requests.get("https://pypi.org/pypi/bdfr/json").json()["info"]["version"]
+    latest = requests.get("https://pypi.org/pypi/bdfr/json", timeout=10).json()["info"]["version"]
     print(f"You are currently using v{current} the latest is v{latest}")
     context.exit()
 
diff --git a/bdfr/downloader.py b/bdfr/downloader.py
index 84cae37..7ed724c 100644
--- a/bdfr/downloader.py
+++ b/bdfr/downloader.py
@@ -26,7 +26,7 @@ logger = logging.getLogger(__name__)
 
 def _calc_hash(existing_file: Path):
     chunk_size = 1024 * 1024
-    md5_hash = hashlib.md5()
+    md5_hash = hashlib.md5(usedforsecurity=False)
     with existing_file.open("rb") as file:
         chunk = file.read(chunk_size)
         while chunk:
diff --git a/bdfr/oauth2.py b/bdfr/oauth2.py
index ead0553..e9ca904 100644
--- a/bdfr/oauth2.py
+++ b/bdfr/oauth2.py
@@ -26,7 +26,9 @@ class OAuth2Authenticator:
     @staticmethod
     def _check_scopes(wanted_scopes: set[str]):
         response = requests.get(
-            "https://www.reddit.com/api/v1/scopes.json", headers={"User-Agent": "fetch-scopes test"}
+            "https://www.reddit.com/api/v1/scopes.json",
+            headers={"User-Agent": "fetch-scopes test"},
+            timeout=10,
         )
         known_scopes = [scope for scope, data in response.json().items()]
         known_scopes.append("*")
diff --git a/bdfr/resource.py b/bdfr/resource.py
index 37fc521..23e2da1 100644
--- a/bdfr/resource.py
+++ b/bdfr/resource.py
@@ -49,7 +49,7 @@ class Resource:
             self.create_hash()
 
     def create_hash(self):
-        self.hash = hashlib.md5(self.content)
+        self.hash = hashlib.md5(self.content, usedforsecurity=False)
 
     def _determine_extension(self) -> Optional[str]:
         extension_pattern = re.compile(r".*(\..{3,5})$")
@@ -68,7 +68,7 @@ class Resource:
         max_wait_time = 300
         while True:
             try:
-                response = requests.get(url, headers=headers)
+                response = requests.get(url, headers=headers, timeout=10)
                 if re.match(r"^2\d{2}", str(response.status_code)) and response.content:
                     return response.content
                 elif response.status_code in (408, 429):
diff --git a/bdfr/site_downloaders/base_downloader.py b/bdfr/site_downloaders/base_downloader.py
index e4ac111..dafa90b 100644
--- a/bdfr/site_downloaders/base_downloader.py
+++ b/bdfr/site_downloaders/base_downloader.py
@@ -28,10 +28,21 @@ class BaseDownloader(ABC):
     @staticmethod
     def retrieve_url(url: str, cookies: dict = None, headers: dict = None) -> requests.Response:
         try:
-            res = requests.get(url, cookies=cookies, headers=headers)
+            res = requests.get(url, cookies=cookies, headers=headers, timeout=10)
         except requests.exceptions.RequestException as e:
             logger.exception(e)
             raise SiteDownloaderError(f"Failed to get page {url}")
         if res.status_code != 200:
             raise ResourceNotFound(f"Server responded with {res.status_code} to {url}")
         return res
+
+    @staticmethod
+    def post_url(url: str, cookies: dict = None, headers: dict = None, payload: dict = None) -> requests.Response:
+        try:
+            res = requests.post(url, cookies=cookies, headers=headers, json=payload, timeout=10)
+        except requests.exceptions.RequestException as e:
+            logger.exception(e)
+            raise SiteDownloaderError(f"Failed to post to {url}")
+        if res.status_code != 200:
+            raise ResourceNotFound(f"Server responded with {res.status_code} to {url}")
+        return res
diff --git a/bdfr/site_downloaders/gallery.py b/bdfr/site_downloaders/gallery.py
index 6f00410..2e7002f 100644
--- a/bdfr/site_downloaders/gallery.py
+++ b/bdfr/site_downloaders/gallery.py
@@ -42,7 +42,7 @@ class Gallery(BaseDownloader):
         possible_extensions = (".jpg", ".png", ".gif", ".gifv", ".jpeg")
         for extension in possible_extensions:
             test_url = f"https://i.redd.it/{image_id}{extension}"
-            response = requests.head(test_url)
+            response = requests.head(test_url, timeout=10)
             if response.status_code == 200:
                 out.append(test_url)
                 break
diff --git a/bdfr/site_downloaders/gfycat.py b/bdfr/site_downloaders/gfycat.py
index 4524689..57194cf 100644
--- a/bdfr/site_downloaders/gfycat.py
+++ b/bdfr/site_downloaders/gfycat.py
@@ -5,7 +5,7 @@ import json
 import re
 from typing import Optional
 
-from bs4 import BeautifulSoup
+from cachetools import TTLCache, cached
 from praw.models import Submission
 
 from bdfr.exceptions import SiteDownloaderError
@@ -21,6 +21,20 @@ class Gfycat(Redgifs):
     def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]:
         return super().find_resources(authenticator)
 
+    @staticmethod
+    @cached(cache=TTLCache(maxsize=5, ttl=3420))
+    def _get_auth_token() -> str:
+        headers = {
+            "content-type": "text/plain;charset=UTF-8",
+            "host": "weblogin.gfycat.com",
+            "origin": "https://gfycat.com",
+        }
+        payload = {"access_key": "Anr96uuqt9EdamSCwK4txKPjMsf2M95Rfa5FLLhPFucu8H5HTzeutyAa"}
+        token = json.loads(
+            Gfycat.post_url("https://weblogin.gfycat.com/oauth/webtoken", headers=headers, payload=payload).text
+        )["access_token"]
+        return token
+
     @staticmethod
     def _get_link(url: str) -> set[str]:
         gfycat_id = re.match(r".*/(.*?)(?:/?|-.*|\..{3-4})$", url).group(1)
@@ -28,18 +42,33 @@ class Gfycat(Redgifs):
 
         response = Gfycat.retrieve_url(url)
         if re.search(r"(redgifs|gifdeliverynetwork)", response.url):
-            url = url.lower()  # Fixes error with old gfycat/redgifs links
+            url = url.lower()
             return Redgifs._get_link(url)
 
-        soup = BeautifulSoup(response.text, "html.parser")
-        content = soup.find("script", attrs={"data-react-helmet": "true", "type": "application/ld+json"})
+        auth_token = Gfycat._get_auth_token()
+        if not auth_token:
+            raise SiteDownloaderError("Unable to retrieve Gfycat API token")
+
+        headers = {
+            "referer": "https://gfycat.com/",
+            "origin": "https://gfycat.com",
+            "content-type": "application/json",
+            "Authorization": f"Bearer {auth_token}",
+        }
+        content = Gfycat.retrieve_url(f"https://api.gfycat.com/v1/gfycats/{gfycat_id}", headers=headers)
+
+        if content is None:
+            raise SiteDownloaderError("Could not read the API source")
 
         try:
-            out = json.loads(content.contents[0])["video"]["contentUrl"]
+            response_json = json.loads(content.text)
+        except json.JSONDecodeError as e:
+            raise SiteDownloaderError(f"Received data was not valid JSON: {e}")
+
+        try:
+            out = response_json["gfyItem"]["mp4Url"]
         except (IndexError, KeyError, AttributeError) as e:
             raise SiteDownloaderError(f"Failed to download Gfycat link {url}: {e}")
-        except json.JSONDecodeError as e:
-            raise SiteDownloaderError(f"Did not receive valid JSON data: {e}")
         return {
             out,
         }
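
Review note: a nice simplification, the scrape of the ld+json script tag is
gone and _get_link now asks the v1 gfycats endpoint for gfyItem.mp4Url
directly, with the bearer token cached by the decorator. For anyone kicking
the tires on this branch, a sketch of how the new path is exercised; both
gfycat links are made-up placeholders, substitute any live ones:

    from bdfr.site_downloaders.gfycat import Gfycat

    # The first call POSTs to weblogin.gfycat.com and caches the bearer
    # token; the second resolves with the cached token, no new auth trip.
    print(Gfycat._get_link("https://gfycat.com/someplaceholderid"))
    print(Gfycat._get_link("https://gfycat.com/anotherplaceholderid"))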
diff --git a/bdfr/site_downloaders/redgifs.py b/bdfr/site_downloaders/redgifs.py
index 9c469bc..2942716 100644
--- a/bdfr/site_downloaders/redgifs.py
+++ b/bdfr/site_downloaders/redgifs.py
@@ -6,6 +6,7 @@ import re
 from typing import Optional
 
 import requests
+from cachetools import TTLCache, cached
 from praw.models import Submission
 
 from bdfr.exceptions import SiteDownloaderError
@@ -22,6 +23,12 @@ class Redgifs(BaseDownloader):
         media_urls = self._get_link(self.post.url)
         return [Resource(self.post, m, Resource.retry_download(m), None) for m in media_urls]
 
+    @staticmethod
+    @cached(cache=TTLCache(maxsize=5, ttl=82080))
+    def _get_auth_token() -> str:
+        token = json.loads(Redgifs.retrieve_url("https://api.redgifs.com/v2/auth/temporary").text)["token"]
+        return token
+
     @staticmethod
     def _get_id(url: str) -> str:
         try:
@@ -38,7 +45,7 @@
     def _get_link(url: str) -> set[str]:
         redgif_id = Redgifs._get_id(url)
 
-        auth_token = json.loads(Redgifs.retrieve_url("https://api.redgifs.com/v2/auth/temporary").text)["token"]
+        auth_token = Redgifs._get_auth_token()
         if not auth_token:
             raise SiteDownloaderError("Unable to retrieve Redgifs API token")
 
@@ -48,7 +55,6 @@
             "content-type": "application/json",
             "Authorization": f"Bearer {auth_token}",
         }
-
         content = Redgifs.retrieve_url(f"https://api.redgifs.com/v2/gifs/{redgif_id}", headers=headers)
 
         if content is None:
@@ -62,7 +68,7 @@
         out = set()
         try:
             if response_json["gif"]["type"] == 1:  # type 1 is a video
-                if requests.get(response_json["gif"]["urls"]["hd"], headers=headers).ok:
+                if requests.head(response_json["gif"]["urls"]["hd"], headers=headers, timeout=10).ok:
                     out.add(response_json["gif"]["urls"]["hd"])
                 else:
                     out.add(response_json["gif"]["urls"]["sd"])
@@ -80,7 +86,4 @@
         except (KeyError, AttributeError):
             raise SiteDownloaderError("Failed to find JSON data in page")
 
-        # Update subdomain if old one is returned
-        out = {re.sub("thumbs2", "thumbs3", link) for link in out}
-        out = {re.sub("thumbs3", "thumbs4", link) for link in out}
         return out
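
Review note: switching the HD availability probe from requests.get to
requests.head is a good catch, since a 2xx to HEAD confirms the HD rendition
exists without transferring the video body that the old GET pulled down just
to read .ok. The probe in isolation, as a sketch; pick_rendition is a
hypothetical helper, and any pair of URLs that answer HEAD will do:

    import requests

    def pick_rendition(hd_url: str, sd_url: str, headers: dict) -> str:
        # HEAD returns only the response headers, so a 200 proves the HD
        # file exists without downloading it.
        if requests.head(hd_url, headers=headers, timeout=10).ok:
            return hd_url
        return sd_url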
diff --git a/bdfr/site_downloaders/vidble.py b/bdfr/site_downloaders/vidble.py
index aa1e949..9ded201 100644
--- a/bdfr/site_downloaders/vidble.py
+++ b/bdfr/site_downloaders/vidble.py
@@ -37,7 +37,7 @@ class Vidble(BaseDownloader):
         if not re.search(r"vidble.com/(show/|album/|watch\?v)", url):
             url = re.sub(r"/(\w*?)$", r"/show/\1", url)
 
-        page = requests.get(url)
+        page = requests.get(url, timeout=10)
         soup = bs4.BeautifulSoup(page.text, "html.parser")
         content_div = soup.find("div", attrs={"id": "ContentPlaceHolder1_divContent"})
         images = content_div.find_all("img")
diff --git a/pyproject.toml b/pyproject.toml
index dc265b5..690b58a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -25,12 +25,13 @@ classifiers = [
 dependencies = [
     "appdirs>=1.4.4",
     "beautifulsoup4>=4.10.0",
+    "cachetools>=5.3.0",
     "click>=8.0.0",
     "dict2xml>=1.7.0",
     "praw>=7.2.0",
     "pyyaml>=5.4.1",
-    "requests>=2.25.1",
-    "yt-dlp>=2022.11.11",
+    "requests>=2.28.2",
+    "yt-dlp>=2023.1.6",
 ]
 
 dynamic = ["version"]
@@ -41,11 +42,11 @@ data-files = {"config" = ["bdfr/default_config.cfg",]}
 
 [project.optional-dependencies]
 dev = [
-    "black>=22.12.0",
+    "black>=23.1.0",
     "Flake8-pyproject>=1.2.2",
-    "isort>=5.11.4",
-    "pre-commit>=2.20.0",
-    "pytest>=7.1.0",
+    "isort>=5.12.0",
+    "pre-commit>=3.0.4",
+    "pytest>=7.2.1",
     "tox>=3.27.1",
 ]
 
diff --git a/tests/site_downloaders/test_direct.py b/tests/site_downloaders/test_direct.py
index 42e1623..ada5ef1 100644
--- a/tests/site_downloaders/test_direct.py
+++ b/tests/site_downloaders/test_direct.py
@@ -14,10 +14,7 @@ from bdfr.site_downloaders.direct import Direct
     ("test_url", "expected_hash"),
     (
         ("https://i.redd.it/q6ebualjxzea1.jpg", "6ec154859c777cb401132bb991cb3635"),
-        (
-            "https://file-examples.com/wp-content/uploads/2017/11/file_example_MP3_700KB.mp3",
-            "35257826e20227a8a57d0e5a410e03c7",
-        ),
+        ("https://filesamples.com/samples/audio/mp3/sample3.mp3", "d30a2308f188cbb11d74cf20c357891c"),
     ),
 )
 def test_download_resource(test_url: str, expected_hash: str):
diff --git a/tests/site_downloaders/test_gfycat.py b/tests/site_downloaders/test_gfycat.py
index 2821a7e..545d273 100644
--- a/tests/site_downloaders/test_gfycat.py
+++ b/tests/site_downloaders/test_gfycat.py
@@ -9,6 +9,13 @@ from bdfr.resource import Resource
 from bdfr.site_downloaders.gfycat import Gfycat
 
 
+@pytest.mark.online
+def test_auth_cache():
+    auth1 = Gfycat._get_auth_token()
+    auth2 = Gfycat._get_auth_token()
+    assert auth1 == auth2
+
+
 @pytest.mark.online
 @pytest.mark.parametrize(
     ("test_url", "expected_url"),
diff --git a/tests/site_downloaders/test_redgifs.py b/tests/site_downloaders/test_redgifs.py
index 9d1a7f5..5589999 100644
--- a/tests/site_downloaders/test_redgifs.py
+++ b/tests/site_downloaders/test_redgifs.py
@@ -10,6 +10,13 @@ from bdfr.resource import Resource
 from bdfr.site_downloaders.redgifs import Redgifs
 
 
+@pytest.mark.online
+def test_auth_cache():
+    auth1 = Redgifs._get_auth_token()
+    auth2 = Redgifs._get_auth_token()
+    assert auth1 == auth2
+
+
 @pytest.mark.parametrize(
     ("test_url", "expected"),
     (