diff --git a/archivebox/core/models.py b/archivebox/core/models.py index b51f9a59..0e35249d 100644 --- a/archivebox/core/models.py +++ b/archivebox/core/models.py @@ -2,10 +2,13 @@ __package__ = 'archivebox.core' import uuid +import ulid import json +import hashlib +from typeid import TypeID from pathlib import Path -from typing import Optional, List +from typing import Optional, List, NamedTuple from importlib import import_module from django.db import models @@ -37,6 +40,13 @@ except AttributeError: JSONField = jsonfield.JSONField +class ULIDParts(NamedTuple): + timestamp: str + url: str + subtype: str + randomness: str + + class Tag(models.Model): """ Based on django-taggit model @@ -99,6 +109,38 @@ class Snapshot(models.Model): keys = ('url', 'timestamp', 'title', 'tags', 'updated') + @property + def ulid_from_timestamp(self): + return str(ulid.from_timestamp(self.added))[:10] + + @property + def ulid_from_urlhash(self): + return str(ulid.from_randomness(self.url_hash))[10:18] + + @property + def ulid_from_type(self): + return '00' + + @property + def ulid_from_randomness(self): + return str(ulid.from_uuid(self.id))[20:] + + @property + def ulid_tuple(self) -> ULIDParts: + return ULIDParts(self.ulid_from_timestamp, self.ulid_from_urlhash, self.ulid_from_type, self.ulid_from_randomness) + + @property + def ulid(self): + return ulid.parse(''.join(self.ulid_tuple)) + + @property + def uuid(self): + return self.ulid.uuid + + @property + def typeid(self): + return TypeID.from_uuid(prefix='snapshot', suffix=self.ulid.uuid) + def __repr__(self) -> str: title = self.title or '-' return f'[{self.timestamp}] {self.url[:64]} ({title[:64]})' @@ -163,7 +205,10 @@ class Snapshot(models.Model): @cached_property def url_hash(self): - return hashurl(self.url) + # return hashurl(self.url) + url_hash = hashlib.new('sha256') + url_hash.update(self.url.encode('utf-8')) + return url_hash.hexdigest()[:16] @cached_property def base_url(self): @@ -271,7 +316,7 @@ class ArchiveResult(models.Model): EXTRACTOR_CHOICES = EXTRACTOR_CHOICES id = models.AutoField(primary_key=True, serialize=False, verbose_name='ID') - uuid = models.UUIDField(default=uuid.uuid4, editable=False) + uuid = models.UUIDField(default=uuid.uuid4, editable=True) snapshot = models.ForeignKey(Snapshot, on_delete=models.CASCADE) extractor = models.CharField(choices=EXTRACTOR_CHOICES, max_length=32) @@ -292,6 +337,40 @@ class ArchiveResult(models.Model): def snapshot_dir(self): return Path(self.snapshot.link_dir) + @property + def ulid_from_timestamp(self): + return self.snapshot.ulid_from_timestamp + + @property + def ulid_from_urlhash(self): + return self.snapshot.ulid_from_urlhash + + @property + def ulid_from_snapshot(self): + return str(self.snapshot.ulid)[:18] + + @property + def ulid_from_type(self): + return hashlib.sha256(self.extractor.encode('utf-8')).hexdigest()[:2] + + @property + def ulid_from_randomness(self): + return str(ulid.from_uuid(self.uuid))[20:] + + @property + def ulid_tuple(self) -> ULIDParts: + return ULIDParts(self.ulid_from_timestamp, self.ulid_from_urlhash, self.ulid_from_type, self.ulid_from_randomness) + + @property + def ulid(self): + final_ulid = ulid.parse(''.join(self.ulid_tuple)) + # TODO: migrate self.uuid to match this new uuid + # self.uuid = final_ulid.uuid + return final_ulid + + @property + def typeid(self): + return TypeID.from_uuid(prefix='result', suffix=self.ulid.uuid) @property def extractor_module(self): diff --git a/archivebox/core/settings.py b/archivebox/core/settings.py index 0c1efbd4..20835e3b 100644 --- a/archivebox/core/settings.py +++ b/archivebox/core/settings.py @@ -263,7 +263,7 @@ CACHES = { 'default': {'BACKEND': 'django.core.cache.backends.db.DatabaseCache', 'LOCATION': 'cache'}, 'dummy': {'BACKEND': 'django.core.cache.backends.dummy.DummyCache'}, 'locmem': {'BACKEND': 'django.core.cache.backends.locmem.LocMemCache'}, - # 'filebased': {"BACKEND": "django.core.cache.backends.filebased.FileBasedCache", "LOCATION": CACHE_DIR / 'cache_filebased'}, + 'filebased': {"BACKEND": "django.core.cache.backends.filebased.FileBasedCache", "LOCATION": CACHE_DIR / 'cache_filebased'}, } EMAIL_BACKEND = 'django.core.mail.backends.console.EmailBackend' diff --git a/pyproject.toml b/pyproject.toml index e3544a80..30c924fe 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,6 +37,8 @@ dependencies = [ # - See Github issues for more... "django-signal-webhooks>=0.3.0", "django-admin-data-views>=0.3.1", + "ulid-py>=1.1.0", + "typeid-python>=0.3.0", ] homepage = "https://github.com/ArchiveBox/ArchiveBox"