import hashlib
from typing import Union
from urllib.parse import urlparse


def uri_hash(uri: Union[str, bytes]) -> str:
    """Return the uppercase hex SHA-256 digest identifying *uri*.

    When *uri* looks like a URL (contains ``://``) and a hostname can be
    parsed out of it, only that hostname is hashed, so every URL on the
    same domain yields the same digest. Any other input (plain strings,
    URLs without a host such as ``file:///...``, or malformed URLs) is
    hashed in full.

    Args:
        uri: The URI or arbitrary identifier, as text or UTF-8 bytes.

    Returns:
        A 64-character uppercase hexadecimal string, e.g.
        'E4A5CCD9AF4ED2A6E0954DF19FD274E9CDDB4853051F033FD518BFC90AA1AC25'.

    Raises:
        UnicodeDecodeError: If *uri* is bytes that are not valid UTF-8.
    """
    uri_str = uri.decode('utf-8') if isinstance(uri, bytes) else uri

    # Only hash the domain part of URLs.
    if '://' in uri_str:
        try:
            # ``hostname`` (not ``host``) is the parsed-result attribute; it
            # excludes any port/credentials and is lower-cased.
            domain = urlparse(uri_str).hostname
        except ValueError:
            # Malformed netloc (e.g. an unterminated IPv6 bracket): fall
            # back to hashing the whole string rather than crashing.
            domain = None
        if domain:
            uri_str = domain

    return hashlib.sha256(uri_str.encode('utf-8')).hexdigest().upper()