Merge pull request #769 from OMEGARAZER/gfycat-api
Commit 0051877e01
14 changed files with 101 additions and 33 deletions
@@ -2,13 +2,18 @@
 # See https://pre-commit.com/hooks.html for more hooks
 
 repos:
+  - repo: https://github.com/abravalheri/validate-pyproject
+    rev: v0.12.1
+    hooks:
+      - id: validate-pyproject
+
   - repo: https://github.com/psf/black
-    rev: 22.12.0
+    rev: 23.1.0
     hooks:
       - id: black
 
   - repo: https://github.com/pycqa/isort
-    rev: 5.11.4
+    rev: 5.12.0
     hooks:
       - id: isort
         name: isort (python)
@@ -23,3 +28,9 @@ repos:
     rev: v0.12.0
     hooks:
       - id: markdownlint
+
+  - repo: https://github.com/adamchainz/blacken-docs
+    rev: 1.13.0
+    hooks:
+      - id: blacken-docs
+        additional_dependencies: [black>=23.1.0]
@@ -81,7 +81,7 @@ def _check_version(context, param, value):
     if not value or context.resilient_parsing:
         return
     current = __version__
-    latest = requests.get("https://pypi.org/pypi/bdfr/json").json()["info"]["version"]
+    latest = requests.get("https://pypi.org/pypi/bdfr/json", timeout=10).json()["info"]["version"]
     print(f"You are currently using v{current} the latest is v{latest}")
     context.exit()
 
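The hunk above is representative of the timeout fixes made throughout this PR: requests has no default timeout, so a stalled connection can hang the process indefinitely. A minimal sketch of the pattern, reusing the URL from the hunk (a scalar timeout bounds the connect and read phases at 10 seconds each):

import requests

try:
    latest = requests.get("https://pypi.org/pypi/bdfr/json", timeout=10).json()["info"]["version"]
    print(f"Latest published version: v{latest}")
except requests.exceptions.Timeout:
    print("PyPI did not respond in time; skipping the version check")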
@@ -25,7 +25,7 @@ logger = logging.getLogger(__name__)
 
 def _calc_hash(existing_file: Path):
     chunk_size = 1024 * 1024
-    md5_hash = hashlib.md5()
+    md5_hash = hashlib.md5(usedforsecurity=False)
     with existing_file.open("rb") as file:
         chunk = file.read(chunk_size)
         while chunk:
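hashlib's usedforsecurity flag (Python 3.9+) declares a digest non-cryptographic, which lets MD5 run on FIPS-restricted builds where security-grade MD5 is disabled. A standalone sketch of the same chunked-hash pattern (the function name here is illustrative):

import hashlib
from pathlib import Path

def file_md5(existing_file: Path) -> str:
    # MD5 serves only as a duplicate-detection fingerprint, not for security
    md5_hash = hashlib.md5(usedforsecurity=False)
    with existing_file.open("rb") as file:
        for chunk in iter(lambda: file.read(1024 * 1024), b""):
            md5_hash.update(chunk)
    return md5_hash.hexdigest()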
@@ -25,7 +25,9 @@ class OAuth2Authenticator:
     @staticmethod
     def _check_scopes(wanted_scopes: set[str]):
         response = requests.get(
-            "https://www.reddit.com/api/v1/scopes.json", headers={"User-Agent": "fetch-scopes test"}
+            "https://www.reddit.com/api/v1/scopes.json",
+            headers={"User-Agent": "fetch-scopes test"},
+            timeout=10,
         )
         known_scopes = [scope for scope, data in response.json().items()]
         known_scopes.append("*")
@@ -48,7 +48,7 @@ class Resource:
         self.create_hash()
 
     def create_hash(self):
-        self.hash = hashlib.md5(self.content)
+        self.hash = hashlib.md5(self.content, usedforsecurity=False)
 
     def _determine_extension(self) -> Optional[str]:
         extension_pattern = re.compile(r".*(\..{3,5})$")
@@ -67,7 +67,7 @@ class Resource:
         max_wait_time = 300
         while True:
             try:
-                response = requests.get(url, headers=headers)
+                response = requests.get(url, headers=headers, timeout=10)
                 if re.match(r"^2\d{2}", str(response.status_code)) and response.content:
                     return response.content
                 elif response.status_code in (408, 429):
@@ -27,10 +27,21 @@ class BaseDownloader(ABC):
     @staticmethod
     def retrieve_url(url: str, cookies: dict = None, headers: dict = None) -> requests.Response:
         try:
-            res = requests.get(url, cookies=cookies, headers=headers)
+            res = requests.get(url, cookies=cookies, headers=headers, timeout=10)
         except requests.exceptions.RequestException as e:
             logger.exception(e)
             raise SiteDownloaderError(f"Failed to get page {url}")
         if res.status_code != 200:
             raise ResourceNotFound(f"Server responded with {res.status_code} to {url}")
         return res
+
+    @staticmethod
+    def post_url(url: str, cookies: dict = None, headers: dict = None, payload: dict = None) -> requests.Response:
+        try:
+            res = requests.post(url, cookies=cookies, headers=headers, json=payload, timeout=10)
+        except requests.exceptions.RequestException as e:
+            logger.exception(e)
+            raise SiteDownloaderError(f"Failed to post to {url}")
+        if res.status_code != 200:
+            raise ResourceNotFound(f"Server responded with {res.status_code} to {url}")
+        return res
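The new post_url mirrors retrieve_url but sends a JSON body: passing json=payload makes requests serialize the dict and set the Content-Type header itself. A hedged standalone equivalent (the helper name and raise_for_status shortcut are illustrative, not bdfr's own error handling):

import requests

def post_json(url: str, payload: dict, headers: dict = None, timeout: int = 10) -> requests.Response:
    # json=payload serializes the dict and sets Content-Type: application/json
    res = requests.post(url, headers=headers, json=payload, timeout=timeout)
    res.raise_for_status()  # stand-in for the SiteDownloaderError/ResourceNotFound checks above
    return res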
@@ -41,7 +41,7 @@ class Gallery(BaseDownloader):
         possible_extensions = (".jpg", ".png", ".gif", ".gifv", ".jpeg")
         for extension in possible_extensions:
             test_url = f"https://i.redd.it/{image_id}{extension}"
-            response = requests.head(test_url)
+            response = requests.head(test_url, timeout=10)
             if response.status_code == 200:
                 out.append(test_url)
                 break
@@ -4,7 +4,7 @@ import json
 import re
 from typing import Optional
 
-from bs4 import BeautifulSoup
+from cachetools import TTLCache, cached
 from praw.models import Submission
 
 from bdfr.exceptions import SiteDownloaderError
@@ -20,6 +20,20 @@ class Gfycat(Redgifs):
     def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]:
         return super().find_resources(authenticator)
 
+    @staticmethod
+    @cached(cache=TTLCache(maxsize=5, ttl=3420))
+    def _get_auth_token() -> str:
+        headers = {
+            "content-type": "text/plain;charset=UTF-8",
+            "host": "weblogin.gfycat.com",
+            "origin": "https://gfycat.com",
+        }
+        payload = {"access_key": "Anr96uuqt9EdamSCwK4txKPjMsf2M95Rfa5FLLhPFucu8H5HTzeutyAa"}
+        token = json.loads(
+            Gfycat.post_url("https://weblogin.gfycat.com/oauth/webtoken", headers=headers, payload=payload).text
+        )["access_token"]
+        return token
+
     @staticmethod
     def _get_link(url: str) -> set[str]:
         gfycat_id = re.match(r".*/(.*?)(?:/?|-.*|\..{3,4})$", url).group(1)
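The @cached(cache=TTLCache(...)) decorator memoizes the token fetch: every call within the TTL window returns the cached token, and the entry expires silently afterwards. The 3420-second TTL is 57 minutes, presumably chosen to sit just inside a one-hour token lifetime. A minimal sketch of the mechanism (the token value is a placeholder):

from cachetools import TTLCache, cached

@cached(cache=TTLCache(maxsize=5, ttl=3420))
def get_token() -> str:
    print("fetching a fresh token")  # executes at most once per 57-minute window
    return "example-token"

assert get_token() == get_token()  # the second call is served from the cache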
@@ -27,18 +41,33 @@ class Gfycat(Redgifs):
 
         response = Gfycat.retrieve_url(url)
         if re.search(r"(redgifs|gifdeliverynetwork)", response.url):
-            url = url.lower()  # Fixes error with old gfycat/redgifs links
+            url = url.lower()
             return Redgifs._get_link(url)
 
-        soup = BeautifulSoup(response.text, "html.parser")
-        content = soup.find("script", attrs={"data-react-helmet": "true", "type": "application/ld+json"})
+        auth_token = Gfycat._get_auth_token()
+        if not auth_token:
+            raise SiteDownloaderError("Unable to retrieve Gfycat API token")
+
+        headers = {
+            "referer": "https://gfycat.com/",
+            "origin": "https://gfycat.com",
+            "content-type": "application/json",
+            "Authorization": f"Bearer {auth_token}",
+        }
+        content = Gfycat.retrieve_url(f"https://api.gfycat.com/v1/gfycats/{gfycat_id}", headers=headers)
+
+        if content is None:
+            raise SiteDownloaderError("Could not read the API source")
 
         try:
-            out = json.loads(content.contents[0])["video"]["contentUrl"]
+            response_json = json.loads(content.text)
+        except json.JSONDecodeError as e:
+            raise SiteDownloaderError(f"Received data was not valid JSON: {e}")
+
+        try:
+            out = response_json["gfyItem"]["mp4Url"]
         except (IndexError, KeyError, AttributeError) as e:
             raise SiteDownloaderError(f"Failed to download Gfycat link {url}: {e}")
-        except json.JSONDecodeError as e:
-            raise SiteDownloaderError(f"Did not receive valid JSON data: {e}")
         return {
             out,
         }
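Besides swapping HTML scraping for the JSON API, this hunk splits the old single try into two, which separates the failure modes: a malformed response body now raises a decode error, while a well-formed response missing the expected keys raises a lookup error. A self-contained sketch of that separation (the function name and exception type are illustrative):

import json

def extract_mp4_url(raw: str) -> str:
    try:
        response_json = json.loads(raw)
    except json.JSONDecodeError as e:
        raise ValueError(f"Received data was not valid JSON: {e}")
    try:
        return response_json["gfyItem"]["mp4Url"]
    except KeyError as e:
        raise ValueError(f"Response JSON lacked an expected key: {e}")

print(extract_mp4_url('{"gfyItem": {"mp4Url": "https://example.com/video.mp4"}}'))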
@@ -5,6 +5,7 @@ import re
 from typing import Optional
 
 import requests
+from cachetools import TTLCache, cached
 from praw.models import Submission
 
 from bdfr.exceptions import SiteDownloaderError
@@ -21,6 +22,12 @@ class Redgifs(BaseDownloader):
         media_urls = self._get_link(self.post.url)
         return [Resource(self.post, m, Resource.retry_download(m), None) for m in media_urls]
 
+    @staticmethod
+    @cached(cache=TTLCache(maxsize=5, ttl=82080))
+    def _get_auth_token() -> str:
+        token = json.loads(Redgifs.retrieve_url("https://api.redgifs.com/v2/auth/temporary").text)["token"]
+        return token
+
     @staticmethod
     def _get_id(url: str) -> str:
         try:
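The Redgifs TTL of 82080 seconds is roughly 22.8 hours, presumably sitting just inside a 24-hour lifetime for the temporary-token endpoint: the same just-under-expiry pattern as the Gfycat cache above, scaled to a longer-lived token.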
@@ -37,7 +44,7 @@ class Redgifs(BaseDownloader):
     def _get_link(url: str) -> set[str]:
         redgif_id = Redgifs._get_id(url)
 
-        auth_token = json.loads(Redgifs.retrieve_url("https://api.redgifs.com/v2/auth/temporary").text)["token"]
+        auth_token = Redgifs._get_auth_token()
         if not auth_token:
             raise SiteDownloaderError("Unable to retrieve Redgifs API token")
 
@@ -47,7 +54,6 @@ class Redgifs(BaseDownloader):
             "content-type": "application/json",
             "Authorization": f"Bearer {auth_token}",
         }
-
         content = Redgifs.retrieve_url(f"https://api.redgifs.com/v2/gifs/{redgif_id}", headers=headers)
 
         if content is None:
@@ -61,7 +67,7 @@ class Redgifs(BaseDownloader):
         out = set()
         try:
             if response_json["gif"]["type"] == 1:  # type 1 is a video
-                if requests.get(response_json["gif"]["urls"]["hd"], headers=headers).ok:
+                if requests.head(response_json["gif"]["urls"]["hd"], headers=headers, timeout=10).ok:
                     out.add(response_json["gif"]["urls"]["hd"])
                 else:
                     out.add(response_json["gif"]["urls"]["sd"])
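Switching the availability probe from GET to HEAD avoids downloading the HD video body just to read a status code: HEAD returns only the status line and headers. A tiny sketch (the URL is an illustrative stand-in):

import requests

probe = requests.head("https://httpbin.org/status/200", timeout=10)
if probe.ok:  # .ok is True for any status code below 400
    print("resource exists; fetch the body for real only when needed")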
@@ -79,7 +85,4 @@ class Redgifs(BaseDownloader):
         except (KeyError, AttributeError):
             raise SiteDownloaderError("Failed to find JSON data in page")
 
-        # Update subdomain if old one is returned
-        out = {re.sub("thumbs2", "thumbs3", link) for link in out}
-        out = {re.sub("thumbs3", "thumbs4", link) for link in out}
         return out
@@ -36,7 +36,7 @@ class Vidble(BaseDownloader):
         if not re.search(r"vidble.com/(show/|album/|watch\?v)", url):
             url = re.sub(r"/(\w*?)$", r"/show/\1", url)
 
-        page = requests.get(url)
+        page = requests.get(url, timeout=10)
         soup = bs4.BeautifulSoup(page.text, "html.parser")
         content_div = soup.find("div", attrs={"id": "ContentPlaceHolder1_divContent"})
         images = content_div.find_all("img")
@@ -25,12 +25,13 @@ classifiers = [
 dependencies = [
     "appdirs>=1.4.4",
     "beautifulsoup4>=4.10.0",
+    "cachetools>=5.3.0",
     "click>=8.0.0",
     "dict2xml>=1.7.0",
     "praw>=7.2.0",
     "pyyaml>=5.4.1",
-    "requests>=2.25.1",
-    "yt-dlp>=2022.11.11",
+    "requests>=2.28.2",
+    "yt-dlp>=2023.1.6",
 ]
 dynamic = ["version"]
@@ -41,11 +42,11 @@ data-files = {"config" = ["bdfr/default_config.cfg",]}
 
 [project.optional-dependencies]
 dev = [
-    "black>=22.12.0",
+    "black>=23.1.0",
     "Flake8-pyproject>=1.2.2",
-    "isort>=5.11.4",
-    "pre-commit>=2.20.0",
-    "pytest>=7.1.0",
+    "isort>=5.12.0",
+    "pre-commit>=3.0.4",
+    "pytest>=7.2.1",
     "tox>=3.27.1",
 ]
@@ -13,10 +13,7 @@ from bdfr.site_downloaders.direct import Direct
     ("test_url", "expected_hash"),
     (
-        (
-            "https://file-examples.com/wp-content/uploads/2017/11/file_example_MP3_700KB.mp3",
-            "35257826e20227a8a57d0e5a410e03c7",
-        ),
+        ("https://i.redd.it/q6ebualjxzea1.jpg", "6ec154859c777cb401132bb991cb3635"),
         ("https://filesamples.com/samples/audio/mp3/sample3.mp3", "d30a2308f188cbb11d74cf20c357891c"),
     ),
 )
 def test_download_resource(test_url: str, expected_hash: str):
@@ -8,6 +8,13 @@ from bdfr.resource import Resource
 from bdfr.site_downloaders.gfycat import Gfycat
 
 
 @pytest.mark.online
+def test_auth_cache():
+    auth1 = Gfycat._get_auth_token()
+    auth2 = Gfycat._get_auth_token()
+    assert auth1 == auth2
+
+
+@pytest.mark.online
 @pytest.mark.parametrize(
     ("test_url", "expected_url"),
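Because _get_auth_token is memoized, two calls inside the TTL window must return the identical token; that equality is exactly what test_auth_cache asserts, both here and in the matching Redgifs test below.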
@@ -9,6 +9,13 @@ from bdfr.resource import Resource
 from bdfr.site_downloaders.redgifs import Redgifs
 
 
+@pytest.mark.online
+def test_auth_cache():
+    auth1 = Redgifs._get_auth_token()
+    auth2 = Redgifs._get_auth_token()
+    assert auth1 == auth2
+
+
 @pytest.mark.parametrize(
     ("test_url", "expected"),
     (