diff --git a/.gitignore b/.gitignore index 030849c5..7e3fbe26 100644 --- a/.gitignore +++ b/.gitignore @@ -29,6 +29,7 @@ dist/ data/ data*/ output/ +index.sqlite3 # vim *.sw? diff --git a/archivebox/__init__.py b/archivebox/__init__.py index 52f40d83..0924fd32 100755 --- a/archivebox/__init__.py +++ b/archivebox/__init__.py @@ -1,7 +1,4 @@ __package__ = 'archivebox' -# monkey patch django timezone to add back utc (it was removed in Django 5.0) -import datetime -from django.utils import timezone -timezone.utc = datetime.timezone.utc +from .monkey_patches import * diff --git a/archivebox/abid_utils/__init__.py b/archivebox/abid_utils/__init__.py new file mode 100644 index 00000000..12c2f475 --- /dev/null +++ b/archivebox/abid_utils/__init__.py @@ -0,0 +1 @@ +__package__ = 'abid_utils' diff --git a/archivebox/abid_utils/abid.py b/archivebox/abid_utils/abid.py new file mode 100644 index 00000000..48597813 --- /dev/null +++ b/archivebox/abid_utils/abid.py @@ -0,0 +1,191 @@ +from typing import NamedTuple, Any, Union, Optional + +import ulid +import uuid6 +import hashlib +from urllib.parse import urlparse + +from uuid import UUID +from typeid import TypeID # type: ignore[import-untyped] +from datetime import datetime + + + +ABID_PREFIX_LEN = 4 +ABID_SUFFIX_LEN = 26 +ABID_LEN = 30 +ABID_TS_LEN = 10 +ABID_URI_LEN = 8 +ABID_SUBTYPE_LEN = 2 +ABID_RAND_LEN = 6 + +DEFAULT_ABID_PREFIX = 'obj_' + + +class ABID(NamedTuple): + """ + e.g. ABID('obj_01HX9FPYTRE4A5CCD901ZYEBQE') + """ + prefix: str # e.g. obj_ + ts: str # e.g. 01HX9FPYTR + uri: str # e.g. E4A5CCD9 + subtype: str # e.g. 01 + rand: str # e.g. ZYEBQE + + def __getattr__(self, attr: str) -> Any: + return getattr(self.ulid, attr) + + def __eq__(self, other: Any) -> bool: + try: + return self.ulid == other.ulid + except AttributeError: + return NotImplemented + + def __str__(self) -> str: + return self.prefix + self.suffix + + def __len__(self) -> int: + return len(self.prefix + self.suffix) + + @classmethod + def parse(cls, buffer: Union[str, UUID, ulid.ULID, TypeID, 'ABID'], prefix=DEFAULT_ABID_PREFIX) -> 'ABID': + assert buffer, f'Attempted to create ABID from null value {buffer}' + + buffer = str(buffer) + if '_' in buffer: + prefix, suffix = buffer.split('_') + else: + prefix, suffix = prefix.strip('_'), buffer + + assert len(prefix) == ABID_PREFIX_LEN - 1 # length without trailing _ + assert len(suffix) == ABID_SUFFIX_LEN, f'Suffix {suffix} from {buffer} was not {ABID_SUFFIX_LEN} chars long' + + return cls( + prefix=abid_part_from_prefix(prefix), + ts=suffix[0:10].upper(), + uri=suffix[10:18].upper(), + subtype=suffix[18:20].upper(), + rand=suffix[20:26].upper(), + ) + + @property + def suffix(self): + return ''.join((self.ts, self.uri, self.subtype, self.rand)) + + @property + def ulid(self) -> ulid.ULID: + return ulid.parse(self.suffix) + + @property + def uuid(self) -> UUID: + return self.ulid.uuid + + @property + def uuid6(self) -> uuid6.UUID: + return uuid6.UUID(hex=self.uuid.hex) + + @property + def typeid(self) -> TypeID: + return TypeID.from_uuid(prefix=self.prefix.strip('_'), suffix=self.uuid6) + + @property + def datetime(self) -> datetime: + return self.ulid.timestamp().datetime + + + +#################################################### + + +def uri_hash(uri: Union[str, bytes]) -> str: + """ + 'E4A5CCD9AF4ED2A6E0954DF19FD274E9CDDB4853051F033FD518BFC90AA1AC25' + """ + if isinstance(uri, bytes): + uri_str: str = uri.decode() + else: + uri_str = uri + + # only hash the domain part of URLs + if '://' in uri_str: + try: + domain = 
urlparse(uri_str).netloc
+            if domain:
+                uri_str = domain
+        except AttributeError:
+            pass
+
+    uri_bytes = uri_str.encode('utf-8')
+
+    return hashlib.sha256(uri_bytes).hexdigest().upper()
+
+def abid_part_from_prefix(prefix: Optional[str]) -> str:
+    """
+    'snp_'
+    """
+    if prefix is None:
+        return 'obj_'
+
+    prefix = prefix.strip('_').lower()
+    assert len(prefix) == 3
+    return prefix + '_'
+
+def abid_part_from_uri(uri: str) -> str:
+    """
+    'E4A5CCD9' # takes first 8 characters of sha256(url)
+    """
+    uri = str(uri)
+    return uri_hash(uri)[:ABID_URI_LEN]
+
+def abid_part_from_ts(ts: Optional[datetime]) -> str:
+    """
+    '01HX9FPYTR' # produces 10 character Timestamp section of ulid based on added date
+    """
+    return str(ulid.from_timestamp(ts) if ts else ulid.new())[:ABID_TS_LEN]
+
+def abid_part_from_subtype(subtype: str) -> str:
+    """
+    Snapshots have 01 type, other objects have other subtypes like wget/media/etc.
+    Also allows us to change the ulid spec later by putting special sigil values here.
+    """
+    subtype = str(subtype)
+    if len(subtype) == ABID_SUBTYPE_LEN:
+        return subtype
+
+    return hashlib.sha256(subtype.encode('utf-8')).hexdigest()[:ABID_SUBTYPE_LEN].upper()
+
+def abid_part_from_rand(rand: Union[str, UUID, None, int]) -> str:
+    """
+    'ZYEBQE' # takes last 6 characters of randomness from existing legacy uuid db field
+    """
+    if rand is None:
+        # if it's None we generate a new random 6 character string from a fresh ULID
+        return str(ulid.new())[-ABID_RAND_LEN:]
+    elif isinstance(rand, UUID):
+        # if it's a uuid we take the last 6 characters of the ULID representation of it
+        return str(ulid.from_uuid(rand))[-ABID_RAND_LEN:]
+    elif isinstance(rand, int):
+        # if it's a BigAutoInteger field we convert it from an int to a 0-padded string
+        rand_str = str(rand)[-ABID_RAND_LEN:]
+        padding_needed = ABID_RAND_LEN - len(rand_str)
+        rand_str = ('0'*padding_needed) + rand_str
+        return rand_str
+
+    # otherwise treat it as a string, take the last 6 characters of it verbatim
+    return str(rand)[-ABID_RAND_LEN:].upper()
+
+
+def abid_from_values(prefix, ts, uri, subtype, rand) -> ABID:
+    """
+    Return a freshly derived ABID (assembled from attrs defined in ABIDModel.abid_*_src).
+    """
+
+    abid = ABID(
+        prefix=abid_part_from_prefix(prefix),
+        ts=abid_part_from_ts(ts),
+        uri=abid_part_from_uri(uri),
+        subtype=abid_part_from_subtype(subtype),
+        rand=abid_part_from_rand(rand),
+    )
+    assert abid.ulid and abid.uuid and abid.typeid, f'Failed to calculate {prefix}_ABID for ts={ts} uri={uri} subtype={subtype} rand={rand}'
+    return abid
diff --git a/archivebox/abid_utils/apps.py b/archivebox/abid_utils/apps.py
new file mode 100644
index 00000000..4f2fa465
--- /dev/null
+++ b/archivebox/abid_utils/apps.py
@@ -0,0 +1,7 @@
+from django.apps import AppConfig
+
+
+class AbidUtilsConfig(AppConfig):
+    default_auto_field = 'django.db.models.BigAutoField'
+
+    name = 'abid_utils'
diff --git a/archivebox/index.sqlite3 b/archivebox/abid_utils/migrations/__init__.py
similarity index 100%
rename from archivebox/index.sqlite3
rename to archivebox/abid_utils/migrations/__init__.py
diff --git a/archivebox/abid_utils/models.py b/archivebox/abid_utils/models.py
new file mode 100644
index 00000000..de8b3c87
--- /dev/null
+++ b/archivebox/abid_utils/models.py
@@ -0,0 +1,314 @@
+"""
+This file provides the Django ABIDField and ABIDModel base model to inherit from.
+
+It implements the ArchiveBox ID (ABID) interfaces including abid_values, get_abid, .abid, .uuid, .id.
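+
+A minimal illustrative sketch of how a model opts in to ABIDs (the class and prefix here are
+hypothetical; see APIToken, Snapshot, Tag, and ArchiveResult later in this changeset for real uses,
+and note the referenced attributes must exist on the model):
+
+    class Example(ABIDModel):
+        abid_prefix = 'exm_'            # 3-char prefix + trailing underscore (hypothetical)
+        abid_ts_src = 'self.created'    # timestamp source -> ABID.ts
+        abid_uri_src = 'self.url'       # URI source -> ABID.uri (first 8 chars of sha256)
+        abid_subtype_src = '"01"'       # subtype source -> ABID.subtype
+        abid_rand_src = 'self.id'       # randomness source -> ABID.rand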
+""" + +from typing import Any, Dict, Union, List, Set, NamedTuple, cast + +from ulid import ULID +from uuid import uuid4, UUID +from typeid import TypeID # type: ignore[import-untyped] +from datetime import datetime +from functools import partial +from charidfield import CharIDField # type: ignore[import-untyped] + +from django.conf import settings +from django.db import models +from django.db.utils import OperationalError +from django.contrib.auth import get_user_model + +from django_stubs_ext.db.models import TypedModelMeta + +from .abid import ( + ABID, + ABID_LEN, + ABID_RAND_LEN, + ABID_SUFFIX_LEN, + DEFAULT_ABID_PREFIX, + abid_part_from_prefix, + abid_from_values +) + +#################################################### + + +# Database Field for typeid/ulid style IDs with a prefix, e.g. snp_01BJQMF54D093DXEAWZ6JYRPAQ +ABIDField = partial( + CharIDField, + max_length=ABID_LEN, + help_text="ABID-format identifier for this entity (e.g. snp_01BJQMF54D093DXEAWZ6JYRPAQ)", + default=None, + null=True, + blank=True, + db_index=True, + unique=True, +) + +def get_or_create_system_user_pk(username='system'): + """Get or create a system user with is_superuser=True to be the default owner for new DB rows""" + + User = get_user_model() + + # if only one user exists total, return that user + if User.objects.filter(is_superuser=True).count() == 1: + return User.objects.filter(is_superuser=True).values_list('pk', flat=True)[0] + + # otherwise, create a dedicated "system" user + user, created = User.objects.get_or_create(username=username, is_staff=True, is_superuser=True, defaults={'email': '', 'password': ''}) + return user.pk + + +class ABIDModel(models.Model): + """ + Abstract Base Model for other models to depend on. Provides ArchiveBox ID (ABID) interface. + """ + abid_prefix: str = DEFAULT_ABID_PREFIX # e.g. 'tag_' + abid_ts_src = 'None' # e.g. 'self.created' + abid_uri_src = 'None' # e.g. 'self.uri' + abid_subtype_src = 'None' # e.g. 'self.extractor' + abid_rand_src = 'None' # e.g. 'self.uuid' or 'self.id' + + id = models.UUIDField(primary_key=True, default=uuid4, editable=True) + uuid = models.UUIDField(blank=True, null=True, editable=True, unique=True) + abid = ABIDField(prefix=abid_prefix) + + created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk) + created = models.DateTimeField(auto_now_add=True) + modified = models.DateTimeField(auto_now=True) + + class Meta(TypedModelMeta): + abstract = True + + def save(self, *args: Any, **kwargs: Any) -> None: + if hasattr(self, 'abid'): + # self.abid = ABID.parse(self.abid) if self.abid else self.get_abid() + self.abid = self.get_abid() + else: + print(f'[!] WARNING: {self.__class__.__name__}.abid is not a DB field so ABID will not be persisted!') + self.abid = self.get_abid() + + super().save(*args, **kwargs) + + @property + def abid_values(self) -> Dict[str, Any]: + return { + 'prefix': self.abid_prefix, + 'ts': eval(self.abid_ts_src), + 'uri': eval(self.abid_uri_src), + 'subtype': eval(self.abid_subtype_src), + 'rand': eval(self.abid_rand_src), + } + + def get_abid(self) -> ABID: + """ + Return a freshly derived ABID (assembled from attrs defined in ABIDModel.abid_*_src). 
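+
+        For example (illustrative values only, matching the sample ABID shown in abid.py),
+        a Snapshot might yield:
+
+            ABID(prefix='snp_', ts='01HX9FPYTR', uri='E4A5CCD9', subtype='01', rand='ZYEBQE')
+            str(abid) == 'snp_01HX9FPYTRE4A5CCD901ZYEBQE'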
+ """ + prefix, ts, uri, subtype, rand = self.abid_values.values() + + if (not prefix) or prefix == DEFAULT_ABID_PREFIX: + suggested_abid = self.__class__.__name__[:3].lower() + raise Exception(f'{self.__class__.__name__}.abid_prefix must be defined to calculate ABIDs (suggested: {suggested_abid})') + + if not ts: + ts = datetime.utcfromtimestamp(0) + print(f'[!] WARNING: Generating ABID with ts=0000000000 placeholder because {self.__class__.__name__}.abid_ts_src={self.abid_ts_src} is unset!', ts.isoformat()) + + if not uri: + uri = str(self) + print(f'[!] WARNING: Generating ABID with uri=str(self) placeholder because {self.__class__.__name__}.abid_uri_src={self.abid_uri_src} is unset!', uri) + + if not subtype: + subtype = self.__class__.__name__ + print(f'[!] WARNING: Generating ABID with subtype={subtype} placeholder because {self.__class__.__name__}.abid_subtype_src={self.abid_subtype_src} is unset!', subtype) + + if not rand: + rand = getattr(self, 'uuid', None) or getattr(self, 'id', None) or getattr(self, 'pk') + print(f'[!] WARNING: Generating ABID with rand=self.id placeholder because {self.__class__.__name__}.abid_rand_src={self.abid_rand_src} is unset!', rand) + + abid = abid_from_values( + prefix=prefix, + ts=ts, + uri=uri, + subtype=subtype, + rand=rand, + ) + assert abid.ulid and abid.uuid and abid.typeid, f'Failed to calculate {prefix}_ABID for {self.__class__.__name__}' + return abid + + @property + def ABID(self) -> ABID: + """ + ULIDParts(timestamp='01HX9FPYTR', url='E4A5CCD9', subtype='00', randomness='ZYEBQE') + """ + return ABID.parse(self.abid) if getattr(self, 'abid', None) else self.get_abid() + + @property + def ULID(self) -> ULID: + """ + Get a ulid.ULID representation of the object's ABID. + """ + return self.ABID.ulid + + @property + def UUID(self) -> UUID: + """ + Get a uuid.UUID (v4) representation of the object's ABID. + """ + return self.ABID.uuid + + @property + def TypeID(self) -> TypeID: + """ + Get a typeid.TypeID (stripe-style) representation of the object's ABID. + """ + return self.ABID.typeid + + + +#################################################### + +# Django helpers +def find_all_abid_prefixes() -> Dict[str, type[models.Model]]: + """ + Return the mapping of all ABID prefixes to their models. + e.g. {'tag_': core.models.Tag, 'snp_': core.models.Snapshot, ...} + """ + import django.apps + prefix_map = {} + + for model in django.apps.apps.get_models(): + abid_prefix = getattr(model, 'abid_prefix', None) + if abid_prefix: + prefix_map[abid_prefix] = model + return prefix_map + +def find_prefix_for_abid(abid: ABID) -> str: + """ + Find the correct prefix for a given ABID that may have be missing a prefix (slow). + e.g. ABID('obj_01BJQMF54D093DXEAWZ6JYRPAQ') -> 'snp_' + """ + # if existing abid prefix is correct, lookup is easy + model = find_model_from_abid(abid) + if model: + assert issubclass(model, ABIDModel) + return model.abid_prefix + + # prefix might be obj_ or missing, fuzzy-search to find any object that matches + return find_obj_from_abid_rand(abid)[0].abid_prefix + +def find_model_from_abid_prefix(prefix: str) -> type[ABIDModel] | None: + """ + Return the Django Model that corresponds to a given ABID prefix. + e.g. 
'tag_' -> core.models.Tag + """ + prefix = abid_part_from_prefix(prefix) + + import django.apps + + for model in django.apps.apps.get_models(): + if not issubclass(model, ABIDModel): continue # skip non-ABID-enabled models + if not hasattr(model, 'objects'): continue # skip abstract models + + if (model.abid_prefix == prefix): + return model + + return None + +def find_model_from_abid(abid: ABID) -> type[models.Model] | None: + """ + Shortcut for find_model_from_abid_prefix(abid.prefix) + """ + return find_model_from_abid_prefix(abid.prefix) + +def find_obj_from_abid_rand(rand: Union[ABID, str], model=None) -> List[ABIDModel]: + """ + Find an object corresponding to an ABID by exhaustively searching using its random suffix (slow). + e.g. 'obj_....................JYRPAQ' -> Snapshot('snp_01BJQMF54D093DXEAWZ6JYRPAQ') + """ + + # convert str to ABID if necessary + if isinstance(rand, ABID): + abid: ABID = rand + else: + rand = str(rand) + if len(rand) < ABID_SUFFIX_LEN: + padding_needed = ABID_SUFFIX_LEN - len(rand) + rand = ('0'*padding_needed) + rand + abid = ABID.parse(rand) + + import django.apps + + partial_matches: List[ABIDModel] = [] + + models_to_try = cast(Set[type[models.Model]], set(filter(bool, ( + model, + find_model_from_abid(abid), + *django.apps.apps.get_models(), + )))) + # print(abid, abid.rand, abid.uuid, models_to_try) + + for model in models_to_try: + if not issubclass(model, ABIDModel): continue # skip Models that arent ABID-enabled + if not hasattr(model, 'objects'): continue # skip abstract Models + assert hasattr(model, 'objects') # force-fix for type hint nit about missing manager https://github.com/typeddjango/django-stubs/issues/1684 + + # continue on to try fuzzy searching by randomness portion derived from uuid field + try: + qs = [] + if hasattr(model, 'abid'): + qs = model.objects.filter(abid__endswith=abid.rand) + elif hasattr(model, 'uuid'): + qs = model.objects.filter(uuid__endswith=str(abid.uuid)[-ABID_RAND_LEN:]) + elif hasattr(model, 'id'): + # NOTE: this only works on SQLite where every column is a string + # other DB backends like postgres dont let you do __endswith if this is a BigAutoInteger field + + # try to search for uuid=...-2354352 + # try to search for id=...2354352 + # try to search for id=2354352 + qs = model.objects.filter( + models.Q(id__endswith=str(abid.uuid)[-ABID_RAND_LEN:]) + | models.Q(id__endswith=abid.rand) + | models.Q(id__startswith=str(int(abid.rand)) if abid.rand.isdigit() else abid.rand) + ) + + for obj in qs: + if obj.get_abid() == abid: + # found exact match, no need to keep iterating + return [obj] + partial_matches.append(obj) + except OperationalError as err: + print(f'[!] WARNING: Got error while trying to iterate through QuerySet for {model}:', err, '\n') + + return partial_matches + +def find_obj_from_abid(abid: ABID, model=None, fuzzy=False) -> Any: + """ + Find an object with a given ABID by filtering possible models for a matching abid/uuid/id (fast). + e.g. 
'snp_01BJQMF54D093DXEAWZ6JYRPAQ' -> Snapshot('snp_01BJQMF54D093DXEAWZ6JYRPAQ') + """ + + model = model or find_model_from_abid(abid) + assert model, f'Could not find model that could match this ABID type: {abid}' + + try: + if hasattr(model, 'abid'): + return model.objects.get(abid__endswith=abid.suffix) + if hasattr(model, 'uuid'): + return model.objects.get(uuid=abid.uuid) + return model.objects.get(id=abid.uuid) + except model.DoesNotExist: + # if the model has an abid field then it shouldve matched, pointless to fuzzy search in that case + if hasattr(model, 'abid') or (not fuzzy): + raise + + # continue on to try fuzzy searching by randomness portion derived from uuid field + match_by_rand = find_obj_from_abid_rand(abid, model=model) + if match_by_rand: + if match_by_rand[0].abid_prefix != abid.prefix: + print(f'[!] WARNING: fetched object {match_by_rand} even though prefix {abid.prefix} doesnt match!', abid, '\n') + return match_by_rand + + raise model.DoesNotExist + diff --git a/archivebox/abid_utils/tests.py b/archivebox/abid_utils/tests.py new file mode 100644 index 00000000..7ce503c2 --- /dev/null +++ b/archivebox/abid_utils/tests.py @@ -0,0 +1,3 @@ +from django.test import TestCase + +# Create your tests here. diff --git a/archivebox/api/apps.py b/archivebox/api/apps.py index e64d943a..d7b8b0d9 100644 --- a/archivebox/api/apps.py +++ b/archivebox/api/apps.py @@ -3,5 +3,9 @@ __package__ = 'archivebox.api' from django.apps import AppConfig + class APIConfig(AppConfig): name = 'api' + + def ready(self): + pass diff --git a/archivebox/api/migrations/0003_outboundwebhook_apitoken_abid_apitoken_uuid_and_more.py b/archivebox/api/migrations/0003_outboundwebhook_apitoken_abid_apitoken_uuid_and_more.py new file mode 100644 index 00000000..5674406a --- /dev/null +++ b/archivebox/api/migrations/0003_outboundwebhook_apitoken_abid_apitoken_uuid_and_more.py @@ -0,0 +1,60 @@ +# Generated by Django 5.0.6 on 2024-05-13 10:58 + +import charidfield.fields +import signal_webhooks.fields +import signal_webhooks.utils +import uuid +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('api', '0002_alter_apitoken_options'), + ] + + operations = [ + migrations.CreateModel( + name='OutboundWebhook', + fields=[ + ('name', models.CharField(db_index=True, help_text='Give your webhook a descriptive name (e.g. Notify ACME Slack channel of any new ArchiveResults).', max_length=255, unique=True, verbose_name='name')), + ('signal', models.CharField(choices=[('CREATE', 'Create'), ('UPDATE', 'Update'), ('DELETE', 'Delete'), ('M2M', 'M2M changed'), ('CREATE_OR_UPDATE', 'Create or Update'), ('CREATE_OR_DELETE', 'Create or Delete'), ('CREATE_OR_M2M', 'Create or M2M changed'), ('UPDATE_OR_DELETE', 'Update or Delete'), ('UPDATE_OR_M2M', 'Update or M2M changed'), ('DELETE_OR_M2M', 'Delete or M2M changed'), ('CREATE_UPDATE_OR_DELETE', 'Create, Update or Delete'), ('CREATE_UPDATE_OR_M2M', 'Create, Update or M2M changed'), ('CREATE_DELETE_OR_M2M', 'Create, Delete or M2M changed'), ('UPDATE_DELETE_OR_M2M', 'Update, Delete or M2M changed'), ('CREATE_UPDATE_DELETE_OR_M2M', 'Create, Update or Delete, or M2M changed')], help_text='The type of event the webhook should fire for (e.g. Create, Update, Delete).', max_length=255, verbose_name='signal')), + ('ref', models.CharField(db_index=True, help_text='Dot import notation of the model the webhook should fire for (e.g. 
core.models.Snapshot or core.models.ArchiveResult).', max_length=1023, validators=[signal_webhooks.utils.model_from_reference], verbose_name='referenced model')), + ('endpoint', models.URLField(help_text='External URL to POST the webhook notification to (e.g. https://someapp.example.com/webhook/some-webhook-receiver).', max_length=2047, verbose_name='endpoint')), + ('headers', models.JSONField(blank=True, default=dict, help_text='Headers to send with the webhook request.', validators=[signal_webhooks.utils.is_dict], verbose_name='headers')), + ('auth_token', signal_webhooks.fields.TokenField(blank=True, default='', help_text='Authentication token to use in an Authorization header.', max_length=8000, validators=[signal_webhooks.utils.decode_cipher_key], verbose_name='authentication token')), + ('enabled', models.BooleanField(default=True, help_text='Is this webhook enabled?', verbose_name='enabled')), + ('keep_last_response', models.BooleanField(default=False, help_text='Should the webhook keep a log of the latest response it got?', verbose_name='keep last response')), + ('created', models.DateTimeField(auto_now_add=True, help_text='When the webhook was created.', verbose_name='created')), + ('updated', models.DateTimeField(auto_now=True, help_text='When the webhook was last updated.', verbose_name='updated')), + ('last_response', models.CharField(blank=True, default='', help_text='Latest response to this webhook.', max_length=8000, verbose_name='last response')), + ('last_success', models.DateTimeField(default=None, help_text='When the webhook last succeeded.', null=True, verbose_name='last success')), + ('last_failure', models.DateTimeField(default=None, help_text='When the webhook last failed.', null=True, verbose_name='last failure')), + ('uuid', models.UUIDField(default=uuid.uuid4, primary_key=True, serialize=False)), + ('abid', charidfield.fields.CharIDField(blank=True, db_index=True, default=None, help_text='ABID-format identifier for this entity (e.g. snp_01BJQMF54D093DXEAWZ6JYRPAQ)', max_length=30, null=True, prefix='whk', unique=True)), + ], + options={ + 'verbose_name': 'API Outbound Webhook', + 'abstract': False, + }, + ), + migrations.AddField( + model_name='apitoken', + name='abid', + field=charidfield.fields.CharIDField(blank=True, db_index=True, default=None, help_text='ABID-format identifier for this entity (e.g. 
snp_01BJQMF54D093DXEAWZ6JYRPAQ)', max_length=30, null=True, prefix='apt', unique=True), + ), + migrations.AddField( + model_name='apitoken', + name='uuid', + field=models.UUIDField(blank=True, null=True, unique=True), + ), + migrations.AlterField( + model_name='apitoken', + name='id', + field=models.UUIDField(default=uuid.uuid4, primary_key=True, serialize=False), + ), + migrations.AddConstraint( + model_name='outboundwebhook', + constraint=models.UniqueConstraint(fields=('ref', 'endpoint'), name='prevent_duplicate_hooks_api_outboundwebhook'), + ), + ] diff --git a/archivebox/api/migrations/0004_rename_user_apitoken_created_by_apitoken_modified_and_more.py b/archivebox/api/migrations/0004_rename_user_apitoken_created_by_apitoken_modified_and_more.py new file mode 100644 index 00000000..3c44f3fd --- /dev/null +++ b/archivebox/api/migrations/0004_rename_user_apitoken_created_by_apitoken_modified_and_more.py @@ -0,0 +1,58 @@ +# Generated by Django 5.0.6 on 2024-05-13 14:36 + +import abid_utils.models +import charidfield.fields +import django.db.models.deletion +from django.conf import settings +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('api', '0003_outboundwebhook_apitoken_abid_apitoken_uuid_and_more'), + migrations.swappable_dependency(settings.AUTH_USER_MODEL), + ] + + operations = [ + migrations.RenameField( + model_name='apitoken', + old_name='user', + new_name='created_by', + ), + migrations.AddField( + model_name='apitoken', + name='modified', + field=models.DateTimeField(auto_now=True), + ), + migrations.AddField( + model_name='outboundwebhook', + name='created_by', + field=models.ForeignKey(default=abid_utils.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL), + ), + migrations.AddField( + model_name='outboundwebhook', + name='id', + field=models.UUIDField(blank=True, null=True, unique=True), + ), + migrations.AddField( + model_name='outboundwebhook', + name='modified', + field=models.DateTimeField(auto_now=True), + ), + migrations.AlterField( + model_name='apitoken', + name='abid', + field=charidfield.fields.CharIDField(blank=True, db_index=True, default=None, help_text='ABID-format identifier for this entity (e.g. snp_01BJQMF54D093DXEAWZ6JYRPAQ)', max_length=30, null=True, prefix='apt_', unique=True), + ), + migrations.AlterField( + model_name='outboundwebhook', + name='abid', + field=charidfield.fields.CharIDField(blank=True, db_index=True, default=None, help_text='ABID-format identifier for this entity (e.g. 
snp_01BJQMF54D093DXEAWZ6JYRPAQ)', max_length=30, null=True, prefix='whk_', unique=True), + ), + migrations.AlterField( + model_name='outboundwebhook', + name='created', + field=models.DateTimeField(auto_now_add=True), + ), + ] diff --git a/archivebox/api/models.py b/archivebox/api/models.py index aefbc47c..177b275f 100644 --- a/archivebox/api/models.py +++ b/archivebox/api/models.py @@ -8,22 +8,39 @@ from django.conf import settings from django.db import models from django.utils import timezone +from signal_webhooks.models import WebhookBase + from django_stubs_ext.db.models import TypedModelMeta +from abid_utils.models import ABIDModel, ABIDField + def generate_secret_token() -> str: # returns cryptographically secure string with len() == 32 return secrets.token_hex(16) -class APIToken(models.Model): - id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False) +class APIToken(ABIDModel): + """ + A secret key generated by a User that's used to authenticate REST API requests to ArchiveBox. + """ + # ABID: apt____ + abid_prefix = 'apt_' + abid_ts_src = 'self.created' + abid_uri_src = 'self.token' + abid_subtype_src = 'self.user_id' + abid_rand_src = 'self.id' - user = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE) + id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=True) + uuid = models.UUIDField(blank=True, null=True, editable=True, unique=True) + abid = ABIDField(prefix=abid_prefix) + + created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE) token = models.CharField(max_length=32, default=generate_secret_token, unique=True) created = models.DateTimeField(auto_now_add=True) expires = models.DateTimeField(null=True, blank=True) + class Meta(TypedModelMeta): verbose_name = "API Key" @@ -38,7 +55,8 @@ class APIToken(models.Model): def __json__(self) -> dict: return { "TYPE": "APIToken", - "id": str(self.id), + "uuid": str(self.id), + "abid": str(self.get_abid()), "user_id": str(self.user.id), "user_username": self.user.username, "token": self.token, @@ -61,3 +79,37 @@ class APIToken(models.Model): return True + + + + + +# monkey patch django-signals-webhooks to change how it shows up in Admin UI + +class OutboundWebhook(ABIDModel, WebhookBase): + """ + Model used in place of (extending) signals_webhooks.models.WebhookModel. Swapped using: + settings.SIGNAL_WEBHOOKS_CUSTOM_MODEL = 'api.models.OutboundWebhook' + """ + abid_prefix = 'whk_' + abid_ts_src = 'self.created' + abid_uri_src = 'self.endpoint' + abid_subtype_src = 'self.ref' + abid_rand_src = 'self.id' + + id = models.UUIDField(blank=True, null=True, unique=True, editable=True) + uuid = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=True) + abid = ABIDField(prefix=abid_prefix) + + WebhookBase._meta.get_field('name').help_text = ( + 'Give your webhook a descriptive name (e.g. Notify ACME Slack channel of any new ArchiveResults).') + WebhookBase._meta.get_field('signal').help_text = ( + 'The type of event the webhook should fire for (e.g. Create, Update, Delete).') + WebhookBase._meta.get_field('ref').help_text = ( + 'Dot import notation of the model the webhook should fire for (e.g. core.models.Snapshot or core.models.ArchiveResult).') + WebhookBase._meta.get_field('endpoint').help_text = ( + 'External URL to POST the webhook notification to (e.g. 
https://someapp.example.com/webhook/some-webhook-receiver).') + + class Meta(WebhookBase.Meta): + verbose_name = 'API Outbound Webhook' + diff --git a/archivebox/api/v1_auth.py b/archivebox/api/v1_auth.py index 4cc0f4fa..070aa359 100644 --- a/archivebox/api/v1_auth.py +++ b/archivebox/api/v1_auth.py @@ -47,6 +47,6 @@ def check_api_token(request, token_data: TokenAuthSchema): request=request, ) if user: - return {"success": True, "user_id": str(user.id)} + return {"success": True, "user_id": str(user.pk)} return {"success": False, "user_id": None} diff --git a/archivebox/api/v1_core.py b/archivebox/api/v1_core.py index f6144ace..9046c361 100644 --- a/archivebox/api/v1_core.py +++ b/archivebox/api/v1_core.py @@ -4,13 +4,14 @@ from uuid import UUID from typing import List, Optional from datetime import datetime +from django.db.models import Q from django.shortcuts import get_object_or_404 from ninja import Router, Schema, FilterSchema, Field, Query from ninja.pagination import paginate from core.models import Snapshot, ArchiveResult, Tag - +from abid_utils.abid import ABID router = Router(tags=['Core Models']) @@ -20,24 +21,39 @@ router = Router(tags=['Core Models']) ### ArchiveResult ######################################################################### class ArchiveResultSchema(Schema): - id: UUID + abid: str + uuid: UUID + pk: str + modified: datetime + created: datetime + created_by_id: str - snapshot_id: UUID + snapshot_abid: str snapshot_url: str snapshot_tags: str extractor: str + cmd_version: str cmd: List[str] pwd: str - cmd_version: str - output: str status: str - - created: datetime + output: str @staticmethod - def resolve_id(obj): - return obj.uuid + def resolve_created_by_id(obj): + return str(obj.created_by_id) + + @staticmethod + def resolve_pk(obj): + return str(obj.pk) + + @staticmethod + def resolve_uuid(obj): + return str(obj.uuid) + + @staticmethod + def resolve_abid(obj): + return str(obj.ABID) @staticmethod def resolve_created(obj): @@ -47,18 +63,23 @@ class ArchiveResultSchema(Schema): def resolve_snapshot_url(obj): return obj.snapshot.url + @staticmethod + def resolve_snapshot_abid(obj): + return str(obj.snapshot.ABID) + @staticmethod def resolve_snapshot_tags(obj): return obj.snapshot.tags_str() class ArchiveResultFilterSchema(FilterSchema): - id: Optional[UUID] = Field(None, q='uuid') + uuid: Optional[UUID] = Field(None, q='uuid') + # abid: Optional[str] = Field(None, q='abid') search: Optional[str] = Field(None, q=['snapshot__url__icontains', 'snapshot__title__icontains', 'snapshot__tags__name__icontains', 'extractor', 'output__icontains']) - snapshot_id: Optional[UUID] = Field(None, q='snapshot_id') - snapshot_url: Optional[str] = Field(None, q='snapshot__url') - snapshot_tag: Optional[str] = Field(None, q='snapshot__tags__name') + snapshot_uuid: Optional[UUID] = Field(None, q='snapshot_uuid__icontains') + snapshot_url: Optional[str] = Field(None, q='snapshot__url__icontains') + snapshot_tag: Optional[str] = Field(None, q='snapshot__tags__name__icontains') status: Optional[str] = Field(None, q='status') output: Optional[str] = Field(None, q='output__icontains') @@ -75,6 +96,7 @@ class ArchiveResultFilterSchema(FilterSchema): @router.get("/archiveresults", response=List[ArchiveResultSchema]) @paginate def list_archiveresults(request, filters: ArchiveResultFilterSchema = Query(...)): + """List all ArchiveResult entries matching these filters.""" qs = ArchiveResult.objects.all() results = filters.filter(qs) return results @@ -82,8 +104,8 @@ def 
list_archiveresults(request, filters: ArchiveResultFilterSchema = Query(...) @router.get("/archiveresult/{archiveresult_id}", response=ArchiveResultSchema) def get_archiveresult(request, archiveresult_id: str): - archiveresult = get_object_or_404(ArchiveResult, id=archiveresult_id) - return archiveresult + """Get a specific ArchiveResult by abid, uuid, or pk.""" + return ArchiveResult.objects.get(Q(pk__icontains=archiveresult_id) | Q(abid__icontains=archiveresult_id) | Q(uuid__icontains=archiveresult_id)) # @router.post("/archiveresult", response=ArchiveResultSchema) @@ -115,27 +137,50 @@ def get_archiveresult(request, archiveresult_id: str): class SnapshotSchema(Schema): - id: UUID + abid: str + uuid: UUID + pk: str + modified: datetime + created: datetime + created_by_id: str url: str tags: str title: Optional[str] timestamp: str - bookmarked: datetime - added: datetime - updated: datetime archive_path: str + bookmarked: datetime + added: datetime + updated: Optional[datetime] + + num_archiveresults: int archiveresults: List[ArchiveResultSchema] - # @staticmethod - # def resolve_id(obj): - # return str(obj.id) + @staticmethod + def resolve_created_by_id(obj): + return str(obj.created_by_id) + + @staticmethod + def resolve_pk(obj): + return str(obj.pk) + + @staticmethod + def resolve_uuid(obj): + return str(obj.uuid) + + @staticmethod + def resolve_abid(obj): + return str(obj.ABID) @staticmethod def resolve_tags(obj): return obj.tags_str() + @staticmethod + def resolve_num_archiveresults(obj, context): + return obj.archiveresult_set.all().distinct().count() + @staticmethod def resolve_archiveresults(obj, context): if context['request'].with_archiveresults: @@ -144,23 +189,32 @@ class SnapshotSchema(Schema): class SnapshotFilterSchema(FilterSchema): - id: Optional[UUID] = Field(None, q='id') + abid: Optional[str] = Field(None, q='abid__icontains') + uuid: Optional[str] = Field(None, q='uuid__icontains') + pk: Optional[str] = Field(None, q='pk__icontains') + created_by_id: str = Field(None, q='created_by_id__icontains') + created__gte: datetime = Field(None, q='created__gte') + created__lt: datetime = Field(None, q='created__lt') + created: datetime = Field(None, q='created') + modified: datetime = Field(None, q='modified') + modified__gte: datetime = Field(None, q='modified__gte') + modified__lt: datetime = Field(None, q='modified__lt') - search: Optional[str] = Field(None, q=['url__icontains', 'title__icontains', 'tags__name__icontains']) + search: Optional[str] = Field(None, q=['url__icontains', 'title__icontains', 'tags__name__icontains', 'abid__icontains', 'uuid__icontains']) url: Optional[str] = Field(None, q='url') tag: Optional[str] = Field(None, q='tags__name') title: Optional[str] = Field(None, q='title__icontains') - timestamp: Optional[str] = Field(None, q='timestamp__startswith') - added: Optional[datetime] = Field(None, q='added') added__gte: Optional[datetime] = Field(None, q='added__gte') added__lt: Optional[datetime] = Field(None, q='added__lt') + @router.get("/snapshots", response=List[SnapshotSchema]) @paginate def list_snapshots(request, filters: SnapshotFilterSchema = Query(...), with_archiveresults: bool=True): + """List all Snapshot entries matching these filters.""" request.with_archiveresults = with_archiveresults qs = Snapshot.objects.all() @@ -169,8 +223,24 @@ def list_snapshots(request, filters: SnapshotFilterSchema = Query(...), with_arc @router.get("/snapshot/{snapshot_id}", response=SnapshotSchema) def get_snapshot(request, snapshot_id: str, 
with_archiveresults: bool=True): + """Get a specific Snapshot by abid, uuid, or pk.""" request.with_archiveresults = with_archiveresults - snapshot = get_object_or_404(Snapshot, id=snapshot_id) + snapshot = None + try: + snapshot = Snapshot.objects.get(Q(uuid__startswith=snapshot_id) | Q(abid__startswith=snapshot_id)| Q(pk__startswith=snapshot_id)) + except Snapshot.DoesNotExist: + pass + + try: + snapshot = snapshot or Snapshot.objects.get() + except Snapshot.DoesNotExist: + pass + + try: + snapshot = snapshot or Snapshot.objects.get(Q(uuid__icontains=snapshot_id) | Q(abid__icontains=snapshot_id)) + except Snapshot.DoesNotExist: + pass + return snapshot @@ -179,9 +249,9 @@ def get_snapshot(request, snapshot_id: str, with_archiveresults: bool=True): # snapshot = Snapshot.objects.create(**payload.dict()) # return snapshot # -# @router.put("/snapshot/{snapshot_id}", response=SnapshotSchema) -# def update_snapshot(request, snapshot_id: str, payload: SnapshotSchema): -# snapshot = get_object_or_404(Snapshot, id=snapshot_id) +# @router.put("/snapshot/{snapshot_uuid}", response=SnapshotSchema) +# def update_snapshot(request, snapshot_uuid: str, payload: SnapshotSchema): +# snapshot = get_object_or_404(Snapshot, uuid=snapshot_uuid) # # for attr, value in payload.dict().items(): # setattr(snapshot, attr, value) @@ -189,9 +259,9 @@ def get_snapshot(request, snapshot_id: str, with_archiveresults: bool=True): # # return snapshot # -# @router.delete("/snapshot/{snapshot_id}") -# def delete_snapshot(request, snapshot_id: str): -# snapshot = get_object_or_404(Snapshot, id=snapshot_id) +# @router.delete("/snapshot/{snapshot_uuid}") +# def delete_snapshot(request, snapshot_uuid: str): +# snapshot = get_object_or_404(Snapshot, uuid=snapshot_uuid) # snapshot.delete() # return {"success": True} @@ -201,10 +271,21 @@ def get_snapshot(request, snapshot_id: str, with_archiveresults: bool=True): class TagSchema(Schema): + abid: Optional[UUID] = Field(None, q='abid') + uuid: Optional[UUID] = Field(None, q='uuid') + pk: Optional[UUID] = Field(None, q='pk') + modified: datetime + created: datetime + created_by_id: str + name: str slug: str + @staticmethod + def resolve_created_by_id(obj): + return str(obj.created_by_id) + @router.get("/tags", response=List[TagSchema]) def list_tags(request): return Tag.objects.all() diff --git a/archivebox/cli/__init__.py b/archivebox/cli/__init__.py index 9622c98f..204267d7 100644 --- a/archivebox/cli/__init__.py +++ b/archivebox/cli/__init__.py @@ -4,14 +4,18 @@ __command__ = 'archivebox' import os import sys import argparse +import threading +from time import sleep -from typing import Optional, Dict, List, IO, Union +from typing import Optional, Dict, List, IO, Union, Iterable from pathlib import Path -from ..config import OUTPUT_DIR, check_data_folder, check_migrations +from ..config import OUTPUT_DIR, check_data_folder, check_migrations, stderr from importlib import import_module +BUILTIN_LIST = list + CLI_DIR = Path(__file__).resolve().parent # these common commands will appear sorted before any others for ease-of-use @@ -33,6 +37,40 @@ is_valid_cli_module = lambda module, subcommand: ( ) +IGNORED_BG_THREADS = ('MainThread', 'ThreadPoolExecutor', 'IPythonHistorySavingThread') # threads we dont have to wait for before exiting + + +def wait_for_bg_threads_to_exit(thread_names: Iterable[str]=(), ignore_names: Iterable[str]=IGNORED_BG_THREADS, timeout: int=60) -> int: + """ + Block until the specified threads exit. e.g. 
pass thread_names=('default_hook_handler',) to wait for webhooks.
+    Useful for waiting for signal handlers, webhooks, etc. to finish running after a mgmt command completes.
+    """
+
+    wait_for_all: bool = thread_names == ()
+
+    thread_matches = lambda thread, ptns: any(ptn in repr(thread) for ptn in ptns)
+
+    should_wait = lambda thread: (
+        not thread_matches(thread, ignore_names)
+        and (wait_for_all or thread_matches(thread, thread_names)))
+
+    for tries in range(timeout):
+        all_threads = [*threading.enumerate()]
+        blocking_threads = [*filter(should_wait, all_threads)]
+        threads_summary = ', '.join(repr(t) for t in blocking_threads)
+        if blocking_threads:
+            sleep(1)
+            if tries == 5:  # only show stderr message if we need to wait more than 5s
+                stderr(
+                    f'[…] Waiting up to {timeout}s for background jobs (e.g. webhooks) to finish...',
+                    threads_summary,
+                )
+        else:
+            return tries
+
+    raise Exception(f'Background threads failed to exit after {tries}s: {threads_summary}')
+
+
 def list_subcommands() -> Dict[str, str]:
     """find and import all valid archivebox_.py files in CLI_DIR"""
@@ -79,6 +117,9 @@ def run_subcommand(subcommand: str,
     module = import_module('.archivebox_{}'.format(subcommand), __package__)
     module.main(args=subcommand_args, stdin=stdin, pwd=pwd)    # type: ignore
 
+    # wait for webhooks, signals, and other background jobs to finish before exit
+    wait_for_bg_threads_to_exit(timeout=60)
+
 
 SUBCOMMANDS = list_subcommands()
diff --git a/archivebox/config.py b/archivebox/config.py
index 939e1554..1637023b 100644
--- a/archivebox/config.py
+++ b/archivebox/config.py
@@ -37,7 +37,7 @@ from sqlite3 import dbapi2 as sqlite3
 from hashlib import md5
 from pathlib import Path
 from datetime import datetime, timezone
-from typing import Optional, Type, Tuple, Dict, Union, List
+from typing import Optional, Type, Tuple, Dict, Union, List, Any
 from subprocess import run, PIPE, DEVNULL
 from configparser import ConfigParser
 from collections import defaultdict
@@ -281,6 +281,7 @@ TEMPLATES_DIR_NAME = 'templates'
 ARCHIVE_DIR_NAME = 'archive'
 SOURCES_DIR_NAME = 'sources'
 LOGS_DIR_NAME = 'logs'
+CACHE_DIR_NAME = 'cache'
 PERSONAS_DIR_NAME = 'personas'
 CRONTABS_DIR_NAME = 'crontabs'
 SQL_INDEX_FILENAME = 'index.sqlite3'
@@ -360,6 +361,7 @@ ALLOWED_IN_OUTPUT_DIR = {
     ARCHIVE_DIR_NAME,
     SOURCES_DIR_NAME,
     LOGS_DIR_NAME,
+    CACHE_DIR_NAME,
     PERSONAS_DIR_NAME,
     SQL_INDEX_FILENAME,
     f'{SQL_INDEX_FILENAME}-wal',
@@ -511,6 +513,7 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
     'ARCHIVE_DIR': {'default': lambda c: c['OUTPUT_DIR'] / ARCHIVE_DIR_NAME},
     'SOURCES_DIR': {'default': lambda c: c['OUTPUT_DIR'] / SOURCES_DIR_NAME},
     'LOGS_DIR': {'default': lambda c: c['OUTPUT_DIR'] / LOGS_DIR_NAME},
+    'CACHE_DIR': {'default': lambda c: c['OUTPUT_DIR'] / CACHE_DIR_NAME},
     'PERSONAS_DIR': {'default': lambda c: c['OUTPUT_DIR'] / PERSONAS_DIR_NAME},
     'CONFIG_FILE': {'default': lambda c: Path(c['CONFIG_FILE']).resolve() if c['CONFIG_FILE'] else c['OUTPUT_DIR'] / CONFIG_FILENAME},
     'COOKIES_FILE': {'default': lambda c: c['COOKIES_FILE'] and Path(c['COOKIES_FILE']).resolve()},
@@ -1038,6 +1041,11 @@ def get_data_locations(config: ConfigDict) -> ConfigValue:
             'enabled': True,
             'is_valid': config['LOGS_DIR'].exists(),
         },
+        'CACHE_DIR': {
+            'path': config['CACHE_DIR'].resolve(),
+            'enabled': True,
+            'is_valid': config['CACHE_DIR'].exists(),
+        },
         'CUSTOM_TEMPLATES_DIR': {
             'path': config['CUSTOM_TEMPLATES_DIR'] and Path(config['CUSTOM_TEMPLATES_DIR']).resolve(),
             'enabled': bool(config['CUSTOM_TEMPLATES_DIR']),
@@ -1299,7 +1307,10 @@ def
check_system_config(config: ConfigDict=CONFIG) -> None: stderr() stderr(' Try removing /Default from the end e.g.:') stderr(' CHROME_USER_DATA_DIR="{}"'.format(config['CHROME_USER_DATA_DIR'].split('/Default')[0])) - raise SystemExit(2) + + # hard error is too annoying here, instead just set it to nothing + # raise SystemExit(2) + config['CHROME_USER_DATA_DIR'] = None def check_dependencies(config: ConfigDict=CONFIG, show_help: bool=True) -> None: @@ -1385,6 +1396,7 @@ def check_migrations(out_dir: Union[str, Path, None]=None, config: ConfigDict=CO (Path(output_dir) / SOURCES_DIR_NAME).mkdir(exist_ok=True) (Path(output_dir) / LOGS_DIR_NAME).mkdir(exist_ok=True) + (Path(output_dir) / CACHE_DIR_NAME).mkdir(exist_ok=True) (Path(output_dir) / PERSONAS_DIR_NAME).mkdir(exist_ok=True) (Path(output_dir) / PERSONAS_DIR_NAME / 'Default').mkdir(exist_ok=True) diff --git a/archivebox/core/admin.py b/archivebox/core/admin.py index 41e2db68..4bcbc222 100644 --- a/archivebox/core/admin.py +++ b/archivebox/core/admin.py @@ -15,8 +15,8 @@ from django.contrib.auth import get_user_model from django import forms -from signal_webhooks.apps import DjangoSignalWebhooksConfig -from signal_webhooks.admin import WebhookAdmin, WebhookModel +from signal_webhooks.admin import WebhookAdmin, get_webhook_model +# from plugantic.admin import CustomPlugin from ..util import htmldecode, urldecode, ansi_to_html @@ -38,6 +38,7 @@ from config import ( CAN_UPGRADE ) + GLOBAL_CONTEXT = {'VERSION': VERSION, 'VERSIONS_AVAILABLE': VERSIONS_AVAILABLE, 'CAN_UPGRADE': CAN_UPGRADE} # Admin URLs @@ -104,23 +105,16 @@ class ArchiveBoxAdmin(admin.AdminSite): return render(template_name='add.html', request=request, context=context) -# monkey patch django-signals-webhooks to change how it shows up in Admin UI -DjangoSignalWebhooksConfig.verbose_name = 'API' -WebhookModel._meta.get_field('name').help_text = 'Give your webhook a descriptive name (e.g. Notify ACME Slack channel of any new ArchiveResults).' -WebhookModel._meta.get_field('signal').help_text = 'The type of event the webhook should fire for (e.g. Create, Update, Delete).' -WebhookModel._meta.get_field('ref').help_text = 'Dot import notation of the model the webhook should fire for (e.g. core.models.Snapshot or core.models.ArchiveResult).' -WebhookModel._meta.get_field('endpoint').help_text = 'External URL to POST the webhook notification to (e.g. https://someapp.example.com/webhook/some-webhook-receiver).' -WebhookModel._meta.app_label = 'api' - - archivebox_admin = ArchiveBoxAdmin() archivebox_admin.register(get_user_model()) archivebox_admin.register(APIToken) -archivebox_admin.register(WebhookModel, WebhookAdmin) +archivebox_admin.register(get_webhook_model(), WebhookAdmin) archivebox_admin.disable_action('delete_selected') +# archivebox_admin.register(CustomPlugin) -# patch admin with methods to add data views +# patch admin with methods to add data views (implemented by admin_data_views package) +############### Additional sections are defined in settings.ADMIN_DATA_VIEWS ######### from admin_data_views.admin import get_app_list, admin_data_index_view, get_admin_data_urls, get_urls archivebox_admin.get_app_list = get_app_list.__get__(archivebox_admin, ArchiveBoxAdmin) @@ -170,14 +164,41 @@ class SnapshotActionForm(ActionForm): # ) +def get_abid_info(self, obj): + return format_html( + # URL Hash: {}
+ ''' +     ABID:  {}
+     TS:                  {} ({})
+     URI:                 {} ({})
+     SUBTYPE:       {} ({})
+     RAND:              {} ({})

+     ABID AS UUID:  {}    

+ +     .uuid:                 {}    
+     .id:                      {}    
+     .pk:                     {}    

+ ''', + obj.abid, + obj.ABID.ts, obj.abid_values['ts'].isoformat() if isinstance(obj.abid_values['ts'], datetime) else obj.abid_values['ts'], + obj.ABID.uri, str(obj.abid_values['uri']), + obj.ABID.subtype, str(obj.abid_values['subtype']), + obj.ABID.rand, str(obj.abid_values['rand'])[-7:], + obj.ABID.uuid, + obj.uuid, + obj.id, + obj.pk, + ) + + @admin.register(Snapshot, site=archivebox_admin) class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin): list_display = ('added', 'title_str', 'files', 'size', 'url_str') sort_fields = ('title_str', 'url_str', 'added', 'files') - readonly_fields = ('info', 'bookmarked', 'added', 'updated') - search_fields = ('id', 'url', 'timestamp', 'title', 'tags__name') - fields = ('timestamp', 'url', 'title', 'tags', *readonly_fields) - list_filter = ('added', 'updated', 'tags', 'archiveresult__status') + readonly_fields = ('admin_actions', 'status_info', 'bookmarked', 'added', 'updated', 'created', 'modified', 'identifiers') + search_fields = ('id', 'url', 'abid', 'uuid', 'timestamp', 'title', 'tags__name') + fields = ('url', 'timestamp', 'created_by', 'tags', 'title', *readonly_fields) + list_filter = ('added', 'updated', 'tags', 'archiveresult__status', 'created_by') ordering = ['-added'] actions = ['add_tags', 'remove_tags', 'update_titles', 'update_snapshots', 'resnapshot_snapshot', 'overwrite_snapshots', 'delete_snapshots'] autocomplete_fields = ['tags'] @@ -223,40 +244,46 @@ class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin): # # ''', # csrf.get_token(self.request), - # obj.id, + # obj.pk, # ) - def info(self, obj): + def admin_actions(self, obj): return format_html( + # URL Hash: {}
+ ''' + Summary page ➡️     + Result files 📑     + Admin actions ⚙️ + ''', + obj.timestamp, + obj.timestamp, + obj.pk, + ) + + def status_info(self, obj): + return format_html( + # URL Hash: {}
''' - UUID: {}     - Timestamp: {}     - URL Hash: {}
Archived: {} ({} files {})     Favicon:     - Status code: {}     + Status code: {}    
Server: {}     Content type: {}     Extension: {}     -

- View Snapshot index ➡️     - View actions ⚙️ ''', - obj.id, - obj.timestamp, - obj.url_hash, '✅' if obj.is_archived else '❌', obj.num_outputs, - self.size(obj), + self.size(obj) or '0kb', f'/archive/{obj.timestamp}/favicon.ico', - obj.status_code or '?', - obj.headers and obj.headers.get('Server') or '?', - obj.headers and obj.headers.get('Content-Type') or '?', - obj.extension or '?', - obj.timestamp, - obj.id, + obj.status_code or '-', + obj.headers and obj.headers.get('Server') or '-', + obj.headers and obj.headers.get('Content-Type') or '-', + obj.extension or '-', ) + def identifiers(self, obj): + return get_abid_info(self, obj) + @admin.display( description='Title', ordering='title', @@ -316,7 +343,7 @@ class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin): return format_html( '{}', obj.url, - obj.url, + obj.url[:128], ) def grid_view(self, request, extra_context=None): @@ -419,42 +446,45 @@ class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin): @admin.register(Tag, site=archivebox_admin) class TagAdmin(admin.ModelAdmin): - list_display = ('slug', 'name', 'num_snapshots', 'snapshots', 'id') - sort_fields = ('id', 'name', 'slug') - readonly_fields = ('id', 'num_snapshots', 'snapshots') - search_fields = ('id', 'name', 'slug') - fields = (*readonly_fields, 'name', 'slug') + list_display = ('slug', 'name', 'num_snapshots', 'snapshots', 'abid') + sort_fields = ('id', 'name', 'slug', 'abid') + readonly_fields = ('created', 'modified', 'identifiers', 'num_snapshots', 'snapshots') + search_fields = ('id', 'abid', 'uuid', 'name', 'slug') + fields = ('name', 'slug', 'created_by', *readonly_fields, ) actions = ['delete_selected'] ordering = ['-id'] - def num_snapshots(self, obj): + def identifiers(self, obj): + return get_abid_info(self, obj) + + def num_snapshots(self, tag): return format_html( '{} total', - obj.id, - obj.snapshot_set.count(), + tag.id, + tag.snapshot_set.count(), ) - def snapshots(self, obj): - total_count = obj.snapshot_set.count() + def snapshots(self, tag): + total_count = tag.snapshot_set.count() return mark_safe('
'.join( format_html( '{} [{}] {}', snap.updated.strftime('%Y-%m-%d %H:%M') if snap.updated else 'pending...', - snap.id, - snap.timestamp, + snap.pk, + snap.abid, snap.url, ) - for snap in obj.snapshot_set.order_by('-updated')[:10] - ) + (f'
and {total_count-10} more...' if obj.snapshot_set.count() > 10 else '')) + for snap in tag.snapshot_set.order_by('-updated')[:10] + ) + (f'
and {total_count-10} more...' if tag.snapshot_set.count() > 10 else '')) @admin.register(ArchiveResult, site=archivebox_admin) class ArchiveResultAdmin(admin.ModelAdmin): - list_display = ('id', 'start_ts', 'extractor', 'snapshot_str', 'tags_str', 'cmd_str', 'status', 'output_str') + list_display = ('start_ts', 'snapshot_info', 'tags_str', 'extractor', 'cmd_str', 'status', 'output_str') sort_fields = ('start_ts', 'extractor', 'status') - readonly_fields = ('id', 'uuid', 'snapshot_str', 'tags_str') - search_fields = ('id', 'uuid', 'snapshot__url', 'extractor', 'output', 'cmd_version', 'cmd', 'snapshot__timestamp') - fields = (*readonly_fields, 'snapshot', 'extractor', 'status', 'start_ts', 'end_ts', 'output', 'pwd', 'cmd', 'cmd_version') + readonly_fields = ('snapshot_info', 'tags_str', 'created_by', 'created', 'modified', 'identifiers') + search_fields = ('id', 'uuid', 'abid', 'snapshot__url', 'extractor', 'output', 'cmd_version', 'cmd', 'snapshot__timestamp') + fields = ('snapshot', 'extractor', 'status', 'output', 'pwd', 'cmd', 'start_ts', 'end_ts', 'cmd_version', *readonly_fields) autocomplete_fields = ['snapshot'] list_filter = ('status', 'extractor', 'start_ts', 'cmd_version') @@ -462,33 +492,36 @@ class ArchiveResultAdmin(admin.ModelAdmin): list_per_page = SNAPSHOTS_PER_PAGE @admin.display( - description='snapshot' + description='Snapshot Info' ) - def snapshot_str(self, obj): + def snapshot_info(self, result): return format_html( - '[{}]
' - '{}', - obj.snapshot.timestamp, - obj.snapshot.timestamp, - obj.snapshot.url[:128], + '[{}]   {}   {}
', + result.snapshot.timestamp, + result.snapshot.abid, + result.snapshot.added.strftime('%Y-%m-%d %H:%M'), + result.snapshot.url[:128], ) + def identifiers(self, obj): + return get_abid_info(self, obj) + @admin.display( - description='tags' + description='Snapshot Tags' ) - def tags_str(self, obj): - return obj.snapshot.tags_str() + def tags_str(self, result): + return result.snapshot.tags_str() - def cmd_str(self, obj): + def cmd_str(self, result): return format_html( '
{}
', - ' '.join(obj.cmd) if isinstance(obj.cmd, list) else str(obj.cmd), + ' '.join(result.cmd) if isinstance(result.cmd, list) else str(result.cmd), ) - def output_str(self, obj): + def output_str(self, result): return format_html( '↗️
{}
', - obj.snapshot.timestamp, - obj.output if (obj.status == 'succeeded') and obj.extractor not in ('title', 'archive_org') else 'index.html', - obj.output, + result.snapshot.timestamp, + result.output if (result.status == 'succeeded') and result.extractor not in ('title', 'archive_org') else 'index.html', + result.output, ) diff --git a/archivebox/core/migrations/0007_archiveresult.py b/archivebox/core/migrations/0007_archiveresult.py index 29b269f6..3da3b93c 100644 --- a/archivebox/core/migrations/0007_archiveresult.py +++ b/archivebox/core/migrations/0007_archiveresult.py @@ -17,8 +17,6 @@ except AttributeError: def forwards_func(apps, schema_editor): - from core.models import EXTRACTORS - Snapshot = apps.get_model("core", "Snapshot") ArchiveResult = apps.get_model("core", "ArchiveResult") diff --git a/archivebox/core/migrations/0023_alter_archiveresult_options_archiveresult_abid_and_more.py b/archivebox/core/migrations/0023_alter_archiveresult_options_archiveresult_abid_and_more.py new file mode 100644 index 00000000..39d3d570 --- /dev/null +++ b/archivebox/core/migrations/0023_alter_archiveresult_options_archiveresult_abid_and_more.py @@ -0,0 +1,43 @@ +# Generated by Django 5.0.6 on 2024-05-13 10:56 + +import charidfield.fields +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('core', '0022_auto_20231023_2008'), + ] + + operations = [ + migrations.AlterModelOptions( + name='archiveresult', + options={'verbose_name': 'Result'}, + ), + migrations.AddField( + model_name='archiveresult', + name='abid', + field=charidfield.fields.CharIDField(blank=True, db_index=True, default=None, help_text='ABID-format identifier for this entity (e.g. snp_01BJQMF54D093DXEAWZ6JYRPAQ)', max_length=30, null=True, prefix='res_', unique=True), + ), + migrations.AddField( + model_name='snapshot', + name='abid', + field=charidfield.fields.CharIDField(blank=True, db_index=True, default=None, help_text='ABID-format identifier for this entity (e.g. snp_01BJQMF54D093DXEAWZ6JYRPAQ)', max_length=30, null=True, prefix='snp_', unique=True), + ), + migrations.AddField( + model_name='snapshot', + name='uuid', + field=models.UUIDField(blank=True, null=True, unique=True), + ), + migrations.AddField( + model_name='tag', + name='abid', + field=charidfield.fields.CharIDField(blank=True, db_index=True, default=None, help_text='ABID-format identifier for this entity (e.g. snp_01BJQMF54D093DXEAWZ6JYRPAQ)', max_length=30, null=True, prefix='tag_', unique=True), + ), + migrations.AlterField( + model_name='archiveresult', + name='extractor', + field=models.CharField(choices=[('htmltotext', 'htmltotext'), ('git', 'git'), ('singlefile', 'singlefile'), ('media', 'media'), ('archive_org', 'archive_org'), ('readability', 'readability'), ('mercury', 'mercury'), ('favicon', 'favicon'), ('pdf', 'pdf'), ('headers', 'headers'), ('screenshot', 'screenshot'), ('dom', 'dom'), ('title', 'title'), ('wget', 'wget')], max_length=32), + ), + ] diff --git a/archivebox/core/migrations/0024_auto_20240513_1143.py b/archivebox/core/migrations/0024_auto_20240513_1143.py new file mode 100644 index 00000000..31f1e773 --- /dev/null +++ b/archivebox/core/migrations/0024_auto_20240513_1143.py @@ -0,0 +1,95 @@ +# Generated by Django 5.0.6 on 2024-05-13 11:43 + +from django.db import migrations +from datetime import datetime +from abid_utils.abid import abid_from_values + + +def calculate_abid(self): + """ + Return a freshly derived ABID (assembled from attrs defined in ABIDModel.abid_*_src). 
+ """ + prefix = self.abid_prefix + ts = eval(self.abid_ts_src) + uri = eval(self.abid_uri_src) + subtype = eval(self.abid_subtype_src) + rand = eval(self.abid_rand_src) + + if (not prefix) or prefix == 'obj_': + suggested_abid = self.__class__.__name__[:3].lower() + raise Exception(f'{self.__class__.__name__}.abid_prefix must be defined to calculate ABIDs (suggested: {suggested_abid})') + + if not ts: + ts = datetime.utcfromtimestamp(0) + print(f'[!] WARNING: Generating ABID with ts=0000000000 placeholder because {self.__class__.__name__}.abid_ts_src={self.abid_ts_src} is unset!', ts.isoformat()) + + if not uri: + uri = str(self) + print(f'[!] WARNING: Generating ABID with uri=str(self) placeholder because {self.__class__.__name__}.abid_uri_src={self.abid_uri_src} is unset!', uri) + + if not subtype: + subtype = self.__class__.__name__ + print(f'[!] WARNING: Generating ABID with subtype={subtype} placeholder because {self.__class__.__name__}.abid_subtype_src={self.abid_subtype_src} is unset!', subtype) + + if not rand: + rand = getattr(self, 'uuid', None) or getattr(self, 'id', None) or getattr(self, 'pk') + print(f'[!] WARNING: Generating ABID with rand=self.id placeholder because {self.__class__.__name__}.abid_rand_src={self.abid_rand_src} is unset!', rand) + + abid = abid_from_values( + prefix=prefix, + ts=ts, + uri=uri, + subtype=subtype, + rand=rand, + ) + assert abid.ulid and abid.uuid and abid.typeid, f'Failed to calculate {prefix}_ABID for {self.__class__.__name__}' + return abid + + +def copy_snapshot_uuids(apps, schema_editor): + Snapshot = apps.get_model("core", "Snapshot") + for snapshot in Snapshot.objects.all(): + snapshot.uuid = snapshot.id + snapshot.save(update_fields=["uuid"]) + +def generate_snapshot_abids(apps, schema_editor): + Snapshot = apps.get_model("core", "Snapshot") + for snapshot in Snapshot.objects.all(): + snapshot.abid_prefix = 'snp_' + snapshot.abid_ts_src = 'self.added' + snapshot.abid_uri_src = 'self.url' + snapshot.abid_subtype_src = '"01"' + snapshot.abid_rand_src = 'self.uuid' + + snapshot.abid = calculate_abid(snapshot) + snapshot.save(update_fields=["abid"]) + +def generate_archiveresult_abids(apps, schema_editor): + ArchiveResult = apps.get_model("core", "ArchiveResult") + Snapshot = apps.get_model("core", "Snapshot") + for result in ArchiveResult.objects.all(): + result.abid_prefix = 'res_' + result.snapshot = Snapshot.objects.get(pk=result.snapshot_id) + result.snapshot_added = result.snapshot.added + result.snapshot_url = result.snapshot.url + result.abid_ts_src = 'self.snapshot_added' + result.abid_uri_src = 'self.snapshot_url' + result.abid_subtype_src = 'self.extractor' + result.abid_rand_src = 'self.id' + + result.abid = calculate_abid(result) + result.uuid = result.abid.uuid + result.save(update_fields=["abid", "uuid"]) + + +class Migration(migrations.Migration): + + dependencies = [ + ('core', '0023_alter_archiveresult_options_archiveresult_abid_and_more'), + ] + + operations = [ + migrations.RunPython(copy_snapshot_uuids, reverse_code=migrations.RunPython.noop), + migrations.RunPython(generate_snapshot_abids, reverse_code=migrations.RunPython.noop), + migrations.RunPython(generate_archiveresult_abids, reverse_code=migrations.RunPython.noop), + ] diff --git a/archivebox/core/migrations/0025_alter_archiveresult_uuid.py b/archivebox/core/migrations/0025_alter_archiveresult_uuid.py new file mode 100644 index 00000000..b60d424b --- /dev/null +++ b/archivebox/core/migrations/0025_alter_archiveresult_uuid.py @@ -0,0 +1,19 @@ +# Generated by 
Django 5.0.6 on 2024-05-13 12:08 + +import uuid +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('core', '0024_auto_20240513_1143'), + ] + + operations = [ + migrations.AlterField( + model_name='archiveresult', + name='uuid', + field=models.UUIDField(default=uuid.uuid4, editable=False, unique=True), + ), + ] diff --git a/archivebox/core/migrations/0026_archiveresult_created_archiveresult_created_by_and_more.py b/archivebox/core/migrations/0026_archiveresult_created_archiveresult_created_by_and_more.py new file mode 100644 index 00000000..bacdecd9 --- /dev/null +++ b/archivebox/core/migrations/0026_archiveresult_created_archiveresult_created_by_and_more.py @@ -0,0 +1,76 @@ +# Generated by Django 5.0.6 on 2024-05-13 13:01 + +import abid_utils.models +import django.db.models.deletion +import django.utils.timezone +from django.conf import settings +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('core', '0025_alter_archiveresult_uuid'), + migrations.swappable_dependency(settings.AUTH_USER_MODEL), + ] + + operations = [ + migrations.AddField( + model_name='archiveresult', + name='created', + field=models.DateTimeField(auto_now_add=True, default=django.utils.timezone.now), + preserve_default=False, + ), + migrations.AddField( + model_name='archiveresult', + name='created_by', + field=models.ForeignKey(default=abid_utils.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL), + ), + migrations.AddField( + model_name='archiveresult', + name='modified', + field=models.DateTimeField(auto_now=True), + ), + migrations.AddField( + model_name='snapshot', + name='created', + field=models.DateTimeField(auto_now_add=True, default=django.utils.timezone.now), + preserve_default=False, + ), + migrations.AddField( + model_name='snapshot', + name='created_by', + field=models.ForeignKey(default=abid_utils.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL), + ), + migrations.AddField( + model_name='snapshot', + name='modified', + field=models.DateTimeField(auto_now=True), + ), + migrations.AddField( + model_name='tag', + name='created', + field=models.DateTimeField(auto_now_add=True, default=django.utils.timezone.now), + preserve_default=False, + ), + migrations.AddField( + model_name='tag', + name='created_by', + field=models.ForeignKey(default=abid_utils.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL), + ), + migrations.AddField( + model_name='tag', + name='modified', + field=models.DateTimeField(auto_now=True), + ), + migrations.AddField( + model_name='tag', + name='uuid', + field=models.UUIDField(blank=True, null=True, unique=True), + ), + migrations.AlterField( + model_name='archiveresult', + name='uuid', + field=models.UUIDField(blank=True, null=True, unique=True), + ), + ] diff --git a/archivebox/core/models.py b/archivebox/core/models.py index 0c9733d0..1b896217 100644 --- a/archivebox/core/models.py +++ b/archivebox/core/models.py @@ -1,11 +1,14 @@ __package__ = 'archivebox.core' -import uuid +from typing import Optional, List, Dict +from django_stubs_ext.db.models import TypedModelMeta + import json +import uuid +from uuid import uuid4 from pathlib import Path -from typing import Optional, List from django.db import models from django.utils.functional import cached_property @@ -15,40 +18,58 @@ from 
django.urls import reverse from django.db.models import Case, When, Value, IntegerField from django.contrib.auth.models import User # noqa +from abid_utils.models import ABIDModel, ABIDField + from ..config import ARCHIVE_DIR, ARCHIVE_DIR_NAME from ..system import get_dir_size -from ..util import parse_date, base_url, hashurl +from ..util import parse_date, base_url from ..index.schema import Link from ..index.html import snapshot_icons -from ..extractors import get_default_archive_methods, ARCHIVE_METHODS_INDEXING_PRECEDENCE +from ..extractors import ARCHIVE_METHODS_INDEXING_PRECEDENCE, EXTRACTORS -EXTRACTORS = [(extractor[0], extractor[0]) for extractor in get_default_archive_methods()] + +EXTRACTOR_CHOICES = [(extractor_name, extractor_name) for extractor_name in EXTRACTORS.keys()] STATUS_CHOICES = [ ("succeeded", "succeeded"), ("failed", "failed"), ("skipped", "skipped") ] -try: - JSONField = models.JSONField -except AttributeError: - import jsonfield - JSONField = jsonfield.JSONField -class Tag(models.Model): +# class BaseModel(models.Model): +# # TODO: migrate all models to a shared base class with all our standard fields and helpers: +# # ulid/created/modified/owner/is_deleted/as_json/from_json/etc. +# # +# # id = models.AutoField(primary_key=True, serialize=False, verbose_name='ID') +# # ulid = models.CharField(max_length=26, null=True, blank=True, db_index=True, unique=True) + +# class Meta(TypedModelMeta): +# abstract = True + + +class Tag(ABIDModel): """ - Based on django-taggit model + Based on django-taggit model + ABID base. """ + abid_prefix = 'tag_' + abid_ts_src = 'self.created' # TODO: add created/modified time + abid_uri_src = 'self.name' + abid_subtype_src = '"03"' + abid_rand_src = 'self.id' + + # id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=True) id = models.AutoField(primary_key=True, serialize=False, verbose_name='ID') + uuid = models.UUIDField(blank=True, null=True, editable=True, unique=True) + abid = ABIDField(prefix=abid_prefix) + name = models.CharField(unique=True, blank=False, max_length=100) - - # slug is autoset on save from name, never set it manually slug = models.SlugField(unique=True, blank=True, max_length=100) + # slug is autoset on save from name, never set it manually - class Meta: + class Meta(TypedModelMeta): verbose_name = "Tag" verbose_name_plural = "Tags" @@ -84,8 +105,16 @@ class Tag(models.Model): return super().save(*args, **kwargs) -class Snapshot(models.Model): - id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False) +class Snapshot(ABIDModel): + abid_prefix = 'snp_' + abid_ts_src = 'self.added' + abid_uri_src = 'self.url' + abid_subtype_src = '"01"' + abid_rand_src = 'self.id' + + id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False) # legacy pk + uuid = models.UUIDField(blank=True, null=True, editable=True, unique=True) + abid = ABIDField(prefix=abid_prefix) url = models.URLField(unique=True, db_index=True) timestamp = models.CharField(max_length=32, unique=True, db_index=True) @@ -98,6 +127,7 @@ class Snapshot(models.Model): keys = ('url', 'timestamp', 'title', 'tags', 'updated') + def __repr__(self) -> str: title = self.title or '-' return f'[{self.timestamp}] {self.url[:64]} ({title[:64]})' @@ -126,8 +156,8 @@ class Snapshot(models.Model): from ..index import load_link_details return load_link_details(self.as_link()) - def tags_str(self, nocache=True) -> str: - cache_key = f'{self.id}-{(self.updated or self.added).timestamp()}-tags' + def tags_str(self, nocache=True) 
-> str | None: + cache_key = f'{self.pk}-{(self.updated or self.added).timestamp()}-tags' calc_tags_str = lambda: ','.join(self.tags.order_by('name').values_list('name', flat=True)) if nocache: tags_str = calc_tags_str() @@ -157,13 +187,9 @@ class Snapshot(models.Model): return self.as_link().is_archived @cached_property - def num_outputs(self): + def num_outputs(self) -> int: return self.archiveresult_set.filter(status='succeeded').count() - @cached_property - def url_hash(self): - return hashurl(self.url) - @cached_property def base_url(self): return base_url(self.url) @@ -178,7 +204,7 @@ class Snapshot(models.Model): @cached_property def archive_size(self): - cache_key = f'{str(self.id)[:12]}-{(self.updated or self.added).timestamp()}-size' + cache_key = f'{str(self.pk)[:12]}-{(self.updated or self.added).timestamp()}-size' def calc_dir_size(): try: @@ -199,7 +225,7 @@ class Snapshot(models.Model): return None @cached_property - def headers(self) -> Optional[dict]: + def headers(self) -> Optional[Dict[str, str]]: try: return json.loads((Path(self.link_dir) / 'headers.json').read_text(encoding='utf-8').strip()) except Exception: @@ -250,11 +276,37 @@ class Snapshot(models.Model): tags_id = [] for tag in tags: if tag.strip(): - tags_id.append(Tag.objects.get_or_create(name=tag)[0].id) + tags_id.append(Tag.objects.get_or_create(name=tag)[0].pk) self.tags.clear() self.tags.add(*tags_id) + # def get_storage_dir(self, create=True, symlink=True) -> Path: + # date_str = self.added.strftime('%Y%m%d') + # domain_str = domain(self.url) + # abs_storage_dir = Path(ARCHIVE_DIR) / 'snapshots' / date_str / domain_str / str(self.ulid) + + # if create and not abs_storage_dir.is_dir(): + # abs_storage_dir.mkdir(parents=True, exist_ok=True) + + # if symlink: + # LINK_PATHS = [ + # Path(ARCHIVE_DIR).parent / 'index' / 'all_by_id' / str(self.ulid), + # # Path(ARCHIVE_DIR).parent / 'index' / 'snapshots_by_id' / str(self.ulid), + # Path(ARCHIVE_DIR).parent / 'index' / 'snapshots_by_date' / date_str / domain_str / str(self.ulid), + # Path(ARCHIVE_DIR).parent / 'index' / 'snapshots_by_domain' / domain_str / date_str / str(self.ulid), + # ] + # for link_path in LINK_PATHS: + # link_path.parent.mkdir(parents=True, exist_ok=True) + # try: + # link_path.symlink_to(abs_storage_dir) + # except FileExistsError: + # link_path.unlink() + # link_path.symlink_to(abs_storage_dir) + + # return abs_storage_dir + + class ArchiveResultManager(models.Manager): def indexable(self, sorted: bool = True): INDEXABLE_METHODS = [ r[0] for r in ARCHIVE_METHODS_INDEXING_PRECEDENCE ] @@ -266,13 +318,22 @@ class ArchiveResultManager(models.Manager): return qs -class ArchiveResult(models.Model): - id = models.AutoField(primary_key=True, serialize=False, verbose_name='ID') - uuid = models.UUIDField(default=uuid.uuid4, editable=False) +class ArchiveResult(ABIDModel): + abid_prefix = 'res_' + abid_ts_src = 'self.snapshot.added' + abid_uri_src = 'self.snapshot.url' + abid_subtype_src = 'self.extractor' + abid_rand_src = 'self.uuid' + EXTRACTOR_CHOICES = EXTRACTOR_CHOICES + + # id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False) + id = models.AutoField(primary_key=True, serialize=False, verbose_name='ID') # legacy pk + uuid = models.UUIDField(blank=True, null=True, editable=True, unique=True) + abid = ABIDField(prefix=abid_prefix) snapshot = models.ForeignKey(Snapshot, on_delete=models.CASCADE) - extractor = models.CharField(choices=EXTRACTORS, max_length=32) - cmd = JSONField() + extractor = 
models.CharField(choices=EXTRACTOR_CHOICES, max_length=32) + cmd = models.JSONField() pwd = models.CharField(max_length=256) cmd_version = models.CharField(max_length=128, default=None, null=True, blank=True) output = models.CharField(max_length=1024) @@ -282,5 +343,69 @@ class ArchiveResult(models.Model): objects = ArchiveResultManager() + class Meta(TypedModelMeta): + verbose_name = 'Result' + def __str__(self): return self.extractor + + @cached_property + def snapshot_dir(self): + return Path(self.snapshot.link_dir) + + + @property + def extractor_module(self): + return EXTRACTORS[self.extractor] + + def output_path(self) -> str: + """return the canonical output filename or directory name within the snapshot dir""" + return self.extractor_module.get_output_path() + + def embed_path(self) -> str: + """ + return the actual runtime-calculated path to the file on-disk that + should be used for user-facing iframe embeds of this result + """ + + if hasattr(self.extractor_module, 'get_embed_path'): + return self.extractor_module.get_embed_path(self) + + return self.extractor_module.get_output_path() + + def legacy_output_path(self): + link = self.snapshot.as_link() + return link.canonical_outputs().get(f'{self.extractor}_path') + + def output_exists(self) -> bool: + return Path(self.output_path()).exists() + + + # def get_storage_dir(self, create=True, symlink=True): + # date_str = self.snapshot.added.strftime('%Y%m%d') + # domain_str = domain(self.snapshot.url) + # abs_storage_dir = Path(ARCHIVE_DIR) / 'results' / date_str / domain_str / self.extractor / str(self.ulid) + + # if create and not abs_storage_dir.is_dir(): + # abs_storage_dir.mkdir(parents=True, exist_ok=True) + + # if symlink: + # LINK_PATHS = [ + # Path(ARCHIVE_DIR).parent / 'index' / 'all_by_id' / str(self.ulid), + # # Path(ARCHIVE_DIR).parent / 'index' / 'results_by_id' / str(self.ulid), + # # Path(ARCHIVE_DIR).parent / 'index' / 'results_by_date' / date_str / domain_str / self.extractor / str(self.ulid), + # Path(ARCHIVE_DIR).parent / 'index' / 'results_by_domain' / domain_str / date_str / self.extractor / str(self.ulid), + # Path(ARCHIVE_DIR).parent / 'index' / 'results_by_type' / self.extractor / date_str / domain_str / str(self.ulid), + # ] + # for link_path in LINK_PATHS: + # link_path.parent.mkdir(parents=True, exist_ok=True) + # try: + # link_path.symlink_to(abs_storage_dir) + # except FileExistsError: + # link_path.unlink() + # link_path.symlink_to(abs_storage_dir) + + # return abs_storage_dir + + # def symlink_index(self, create=True): + # abs_result_dir = self.get_storage_dir(create=create) diff --git a/archivebox/core/settings.py b/archivebox/core/settings.py index d322f711..870c5681 100644 --- a/archivebox/core/settings.py +++ b/archivebox/core/settings.py @@ -10,6 +10,7 @@ from pathlib import Path from django.utils.crypto import get_random_string from ..config import ( + CONFIG, DEBUG, SECRET_KEY, ALLOWED_HOSTS, @@ -20,6 +21,7 @@ from ..config import ( OUTPUT_DIR, ARCHIVE_DIR, LOGS_DIR, + CACHE_DIR, TIMEZONE, LDAP, @@ -53,6 +55,26 @@ APPEND_SLASH = True DEBUG = DEBUG or ('--debug' in sys.argv) + +# add plugins folders to system path, and load plugins in installed_apps +BUILTIN_PLUGINS_DIR = PACKAGE_DIR / 'plugins' +USER_PLUGINS_DIR = OUTPUT_DIR / 'plugins' +sys.path.insert(0, str(BUILTIN_PLUGINS_DIR)) +sys.path.insert(0, str(USER_PLUGINS_DIR)) + +def find_plugins(plugins_dir): + return { + # plugin_entrypoint.parent.name: import_module(plugin_entrypoint.parent.name).METADATA + plugin_entrypoint.parent.name: 
plugin_entrypoint.parent + for plugin_entrypoint in plugins_dir.glob('*/apps.py') + } + +INSTALLED_PLUGINS = { + **find_plugins(BUILTIN_PLUGINS_DIR), + **find_plugins(USER_PLUGINS_DIR), +} + + INSTALLED_APPS = [ 'django.contrib.auth', 'django.contrib.contenttypes', @@ -60,13 +82,18 @@ INSTALLED_APPS = [ 'django.contrib.messages', 'django.contrib.staticfiles', 'django.contrib.admin', + 'django_jsonform', + 'signal_webhooks', + 'abid_utils', + 'plugantic', 'core', 'api', + *INSTALLED_PLUGINS.keys(), + 'admin_data_views', - 'signal_webhooks', 'django_extensions', ] @@ -227,6 +254,11 @@ TEMPLATES = [ ### External Service Settings ################################################################################ + +CACHE_DB_FILENAME = 'cache.sqlite3' +CACHE_DB_PATH = CACHE_DIR / CACHE_DB_FILENAME +CACHE_DB_TABLE = 'django_cache' + DATABASE_FILE = Path(OUTPUT_DIR) / SQL_INDEX_FILENAME DATABASE_NAME = os.environ.get("ARCHIVEBOX_DATABASE_NAME", str(DATABASE_FILE)) @@ -240,18 +272,28 @@ DATABASES = { }, 'TIME_ZONE': TIMEZONE, # DB setup is sometimes modified at runtime by setup_django() in config.py - } + }, + # 'cache': { + # 'ENGINE': 'django.db.backends.sqlite3', + # 'NAME': CACHE_DB_PATH, + # 'OPTIONS': { + # 'timeout': 60, + # 'check_same_thread': False, + # }, + # 'TIME_ZONE': TIMEZONE, + # }, } +MIGRATION_MODULES = {'signal_webhooks': None} + +# as much as I'd love this to be a UUID or ULID field, it's not supported yet as of Django 5.0 +DEFAULT_AUTO_FIELD = 'django.db.models.BigAutoField' -CACHE_BACKEND = 'django.core.cache.backends.locmem.LocMemCache' -# CACHE_BACKEND = 'django.core.cache.backends.db.DatabaseCache' -# CACHE_BACKEND = 'django.core.cache.backends.dummy.DummyCache' CACHES = { - 'default': { - 'BACKEND': CACHE_BACKEND, - 'LOCATION': 'django_cache_default', - } + 'default': {'BACKEND': 'django.core.cache.backends.locmem.LocMemCache'}, + # 'sqlite': {'BACKEND': 'django.core.cache.backends.db.DatabaseCache', 'LOCATION': 'cache'}, + # 'dummy': {'BACKEND': 'django.core.cache.backends.dummy.DummyCache'}, + # 'filebased': {"BACKEND": "django.core.cache.backends.filebased.FileBasedCache", "LOCATION": CACHE_DIR / 'cache_filebased'}, } EMAIL_BACKEND = 'django.core.mail.backends.console.EmailBackend' @@ -409,9 +451,11 @@ LOGGING = { # Add default webhook configuration to the User model +SIGNAL_WEBHOOKS_CUSTOM_MODEL = 'api.models.OutboundWebhook' SIGNAL_WEBHOOKS = { "HOOKS": { - "django.contrib.auth.models.User": ..., # ... is a special value that means "use the default autogenerated hooks" + # ... 
is a special sigil value that means "use the default autogenerated hooks" + "django.contrib.auth.models.User": ..., "core.models.Snapshot": ..., "core.models.ArchiveResult": ..., "core.models.Tag": ..., @@ -421,16 +465,36 @@ SIGNAL_WEBHOOKS = { ADMIN_DATA_VIEWS = { - "NAME": "configuration", + "NAME": "Environment", "URLS": [ { - "route": "live/", + "route": "config/", "view": "core.views.live_config_list_view", - "name": "live", + "name": "Configuration", "items": { "route": "/", "view": "core.views.live_config_value_view", - "name": "live_config_value", + "name": "config_val", + }, + }, + { + "route": "binaries/", + "view": "plugantic.views.binaries_list_view", + "name": "Binaries", + "items": { + "route": "/", + "view": "plugantic.views.binary_detail_view", + "name": "binary", + }, + }, + { + "route": "plugins/", + "view": "plugantic.views.plugins_list_view", + "name": "Plugins", + "items": { + "route": "/", + "view": "plugantic.views.plugin_detail_view", + "name": "plugin", }, }, ], diff --git a/archivebox/core/views.py b/archivebox/core/views.py index e0a58ed7..0a6e4f11 100644 --- a/archivebox/core/views.py +++ b/archivebox/core/views.py @@ -3,6 +3,7 @@ __package__ = 'archivebox.core' from typing import Callable from io import StringIO +from pathlib import Path from contextlib import redirect_stdout from django.shortcuts import render, redirect @@ -36,10 +37,14 @@ from ..config import ( CONFIG_SCHEMA, DYNAMIC_CONFIG_SCHEMA, USER_CONFIG, + SAVE_ARCHIVE_DOT_ORG, + PREVIEW_ORIGINALS, ) +from ..logging_util import printable_filesize from ..main import add -from ..util import base_url, ansi_to_html +from ..util import base_url, ansi_to_html, htmlencode, urldecode, urlencode, ts_to_date_str from ..search import query_search_index +from ..extractors.wget import wget_output_path class HomepageView(View): @@ -56,10 +61,80 @@ class HomepageView(View): class SnapshotView(View): # render static html index from filesystem archive//index.html + @staticmethod + def render_live_index(request, snapshot): + TITLE_LOADING_MSG = 'Not yet archived...' 
+ HIDDEN_RESULTS = ('favicon', 'headers', 'title', 'htmltotext', 'warc', 'archive_org') + + archiveresults = {} + + results = snapshot.archiveresult_set.all() + + for result in results: + embed_path = result.embed_path() + abs_path = result.snapshot_dir / (embed_path or 'None') + + if (result.status == 'succeeded' + and (result.extractor not in HIDDEN_RESULTS) + and embed_path + and abs_path.exists()): + if abs_path.is_dir() and not any(abs_path.glob('*.*')): + continue + + result_info = { + 'name': result.extractor, + 'path': embed_path, + 'ts': ts_to_date_str(result.end_ts), + } + archiveresults[result.extractor] = result_info + + preferred_types = ('singlefile', 'wget', 'screenshot', 'dom', 'media', 'pdf', 'readability', 'mercury') + all_types = preferred_types + tuple(result_type for result_type in archiveresults.keys() if result_type not in preferred_types) + + best_result = {'path': 'None'} + for result_type in preferred_types: + if result_type in archiveresults: + best_result = archiveresults[result_type] + break + + link = snapshot.as_link() + + link_info = link._asdict(extended=True) + + try: + warc_path = 'warc/' + list(Path(snapshot.link_dir).glob('warc/*.warc.*'))[0].name + except IndexError: + warc_path = 'warc/' + + context = { + **link_info, + **link_info['canonical'], + 'title': htmlencode( + link.title + or (link.base_url if link.is_archived else TITLE_LOADING_MSG) + ), + 'extension': link.extension or 'html', + 'tags': link.tags or 'untagged', + 'size': printable_filesize(link.archive_size) if link.archive_size else 'pending', + 'status': 'archived' if link.is_archived else 'not yet archived', + 'status_color': 'success' if link.is_archived else 'danger', + 'oldest_archive_date': ts_to_date_str(link.oldest_archive_date), + 'warc_path': warc_path, + 'SAVE_ARCHIVE_DOT_ORG': SAVE_ARCHIVE_DOT_ORG, + 'PREVIEW_ORIGINALS': PREVIEW_ORIGINALS, + 'archiveresults': sorted(archiveresults.values(), key=lambda r: all_types.index(r['name'])), + 'best_result': best_result, + # 'tags_str': 'somealskejrewlkrjwer,werlmwrwlekrjewlkrjwer324m532l,4m32,23m324234', + } + return render(template_name='core/snapshot_live.html', request=request, context=context) + + def get(self, request, path): if not request.user.is_authenticated and not PUBLIC_SNAPSHOTS: return redirect(f'/admin/login/?next={request.path}') + snapshot = None + try: slug, archivefile = path.split('/', 1) except (IndexError, ValueError): @@ -75,7 +150,11 @@ class SnapshotView(View): try: try: snapshot = Snapshot.objects.get(Q(timestamp=slug) | Q(id__startswith=slug)) - response = static.serve(request, archivefile, document_root=snapshot.link_dir, show_indexes=True) + if archivefile == 'index.html': + # if they requested snapshot index, serve live rendered template instead of static html + response = self.render_live_index(request, snapshot) + else: + response = static.serve(request, archivefile, document_root=snapshot.link_dir, show_indexes=True) response["Link"] = f'<{snapshot.url}>; rel="canonical"' return response except Snapshot.DoesNotExist: @@ -127,26 +206,33 @@ class SnapshotView(View): status=404, ) except Http404: + assert snapshot # (Snapshot.DoesNotExist is already handled above) + # Snapshot dir exists but file within does not e.g. 124235.324234/screenshot.png return HttpResponse( format_html( ( '



'
-            f'Snapshot [{snapshot.timestamp}] exists in DB, but resource {snapshot.timestamp}/'
+            f'Snapshot [{snapshot.timestamp}]: {snapshot.url} '
+            f'was queued on {str(snapshot.added).split(".")[0]}, '
+            f'but no files have been saved yet in: {snapshot.timestamp}/'
             '{}'
-            f' does not exist in the snapshot dir yet. '
-            'It\'s possible that this resource type is not available for the Snapshot, or that the archiving process has not completed yet. '
-            f'# if interrupted, run this cmd to finish archiving this Snapshot archivebox update -t timestamp {snapshot.timestamp} '
+            f' '
+            'It\'s possible {} '
+            f'during the last capture on {str(snapshot.added).split(".")[0]}, or that the archiving process has not completed yet. '
+            f'# run this cmd to finish/retry archiving this Snapshot '
+            f'archivebox update -t timestamp {snapshot.timestamp} '
             ' '
             'Next steps: '
             f'- list all the Snapshot files .* '
             f'- view the Snapshot ./index.html '
-            f'- go to the Snapshot admin to edit '
-            f'- go to the Snapshot actions to re-archive '
+            f'- go to the Snapshot admin to edit '
+            f'- go to the Snapshot actions to re-archive '
             '- or return to the main index... '
             '
' ), - archivefile, + archivefile if str(archivefile) != 'None' else '', + f'the {archivefile} resource could not be fetched' if str(archivefile) != 'None' else 'the original site was not available', ), content_type="text/html", status=404, @@ -369,21 +455,21 @@ def live_config_list_view(request: HttpRequest, **kwargs) -> TableContext: for section in CONFIG_SCHEMA.keys(): for key in CONFIG_SCHEMA[section].keys(): - rows['Section'].append(section.replace('_', ' ').title().replace(' Config', '')) + rows['Section'].append(section) # section.replace('_', ' ').title().replace(' Config', '') rows['Key'].append(ItemLink(key, key=key)) rows['Type'].append(mark_safe(f'{find_config_type(key)}')) rows['Value'].append(mark_safe(f'{CONFIG[key]}') if key_is_safe(key) else '******** (redacted)') - rows['Default'].append(mark_safe(f'{find_config_default(key) or 'See here...'}')) + rows['Default'].append(mark_safe(f'{find_config_default(key) or "See here..."}')) # rows['Documentation'].append(mark_safe(f'Wiki: {key}')) rows['Aliases'].append(', '.join(CONFIG_SCHEMA[section][key].get('aliases', []))) section = 'DYNAMIC' for key in DYNAMIC_CONFIG_SCHEMA.keys(): - rows['Section'].append(section.replace('_', ' ').title().replace(' Config', '')) + rows['Section'].append(section) # section.replace('_', ' ').title().replace(' Config', '') rows['Key'].append(ItemLink(key, key=key)) rows['Type'].append(mark_safe(f'{find_config_type(key)}')) rows['Value'].append(mark_safe(f'{CONFIG[key]}') if key_is_safe(key) else '******** (redacted)') - rows['Default'].append(mark_safe(f'{find_config_default(key) or 'See here...'}')) + rows['Default'].append(mark_safe(f'{find_config_default(key) or "See here..."}')) # rows['Documentation'].append(mark_safe(f'Wiki: {key}')) rows['Aliases'].append(ItemLink(key, key=key) if key in USER_CONFIG else '') diff --git a/archivebox/extractors/__init__.py b/archivebox/extractors/__init__.py index cb1c6841..a262bba6 100644 --- a/archivebox/extractors/__init__.py +++ b/archivebox/extractors/__init__.py @@ -1,11 +1,13 @@ __package__ = 'archivebox.extractors' +from typing import Callable, Optional, Dict, List, Iterable, Union, Protocol, cast + import os import sys from pathlib import Path - -from typing import Callable, Optional, List, Iterable, Union +from importlib import import_module from datetime import datetime, timezone + from django.db.models import QuerySet from ..config import ( @@ -158,7 +160,7 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s # bump the updated time on the main Snapshot here, this is critical # to be able to cache summaries of the ArchiveResults for a given # snapshot without having to load all the results from the DB each time. 
- # (we use {Snapshot.id}-{Snapshot.updated} as the cache key and assume + # (we use {Snapshot.pk}-{Snapshot.updated} as the cache key and assume # ArchiveResults are unchanged as long as the updated timestamp is unchanged) snapshot.save() else: @@ -240,3 +242,37 @@ def archive_links(all_links: Union[Iterable[Link], QuerySet], overwrite: bool=Fa log_archiving_finished(num_links) return all_links + + + +EXTRACTORS_DIR = Path(__file__).parent + +class ExtractorModuleProtocol(Protocol): + """Type interface for an Extractor Module (WIP)""" + + get_output_path: Callable + + # TODO: + # get_embed_path: Callable | None + # should_extract(Snapshot) + # extract(Snapshot) + + +def get_extractors(dir: Path=EXTRACTORS_DIR) -> Dict[str, ExtractorModuleProtocol]: + """iterate through archivebox/extractors/*.py and load extractor modules""" + EXTRACTORS = {} + + for filename in EXTRACTORS_DIR.glob('*.py'): + if filename.name.startswith('__'): + continue + + extractor_name = filename.name.replace('.py', '') + + extractor_module = cast(ExtractorModuleProtocol, import_module(f'.{extractor_name}', package=__package__)) + + assert getattr(extractor_module, 'get_output_path') + EXTRACTORS[extractor_name] = extractor_module + + return EXTRACTORS + +EXTRACTORS = get_extractors(EXTRACTORS_DIR) diff --git a/archivebox/extractors/archive_org.py b/archivebox/extractors/archive_org.py index 245315f1..5aa66fa7 100644 --- a/archivebox/extractors/archive_org.py +++ b/archivebox/extractors/archive_org.py @@ -24,6 +24,8 @@ from ..config import ( ) from ..logging_util import TimedProgress +def get_output_path(): + return 'archive.org.txt' @enforce_types @@ -32,7 +34,7 @@ def should_save_archive_dot_org(link: Link, out_dir: Optional[Path]=None, overwr return False out_dir = out_dir or Path(link.link_dir) - if not overwrite and (out_dir / 'archive.org.txt').exists(): + if not overwrite and (out_dir / get_output_path()).exists(): # if open(path, 'r', encoding='utf-8').read().strip() != 'None': return False @@ -43,7 +45,7 @@ def save_archive_dot_org(link: Link, out_dir: Optional[Path]=None, timeout: int= """submit site to archive.org for archiving via their service, save returned archive url""" out_dir = out_dir or Path(link.link_dir) - output: ArchiveOutput = 'archive.org.txt' + output: ArchiveOutput = get_output_path() archive_org_url = None submit_url = 'https://web.archive.org/save/{}'.format(link.url) # later options take precedence @@ -88,7 +90,7 @@ def save_archive_dot_org(link: Link, out_dir: Optional[Path]=None, timeout: int= archive_org_url = archive_org_url or submit_url with open(str(out_dir / output), 'w', encoding='utf-8') as f: f.write(archive_org_url) - chmod_file('archive.org.txt', cwd=str(out_dir)) + chmod_file(str(out_dir / output), cwd=str(out_dir)) output = archive_org_url return ArchiveResult( diff --git a/archivebox/extractors/dom.py b/archivebox/extractors/dom.py index 8a86026f..0035ec87 100644 --- a/archivebox/extractors/dom.py +++ b/archivebox/extractors/dom.py @@ -19,6 +19,9 @@ from ..config import ( from ..logging_util import TimedProgress +def get_output_path(): + return 'output.html' + @enforce_types def should_save_dom(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool: @@ -26,8 +29,8 @@ def should_save_dom(link: Link, out_dir: Optional[Path]=None, overwrite: Optiona return False out_dir = out_dir or Path(link.link_dir) - if not overwrite and (out_dir / 'output.html').exists(): - if (out_dir / 'output.html').stat().st_size > 1: + if not overwrite and (out_dir / 
get_output_path()).exists(): + if (out_dir / get_output_path()).stat().st_size > 1: return False return SAVE_DOM @@ -37,7 +40,7 @@ def save_dom(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> """print HTML of site to file using chrome --dump-html""" out_dir = out_dir or Path(link.link_dir) - output: ArchiveOutput = 'output.html' + output: ArchiveOutput = get_output_path() output_path = out_dir / output cmd = [ *chrome_args(), diff --git a/archivebox/extractors/favicon.py b/archivebox/extractors/favicon.py index f793f8df..31473b1a 100644 --- a/archivebox/extractors/favicon.py +++ b/archivebox/extractors/favicon.py @@ -8,8 +8,8 @@ from ..index.schema import Link, ArchiveResult, ArchiveOutput from ..system import chmod_file, run from ..util import ( enforce_types, - domain, - dedupe, + domain, + dedupe, ) from ..config import ( TIMEOUT, @@ -33,6 +33,11 @@ def should_save_favicon(link: Link, out_dir: Optional[str]=None, overwrite: Opti return SAVE_FAVICON +@enforce_types +def get_output_path(): + return 'favicon.ico' + + @enforce_types def save_favicon(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult: """download site favicon from google's favicon api""" diff --git a/archivebox/extractors/git.py b/archivebox/extractors/git.py index efef37c2..029e8022 100644 --- a/archivebox/extractors/git.py +++ b/archivebox/extractors/git.py @@ -26,6 +26,19 @@ from ..config import ( from ..logging_util import TimedProgress +def get_output_path(): + return 'git/' + +def get_embed_path(archiveresult=None): + if not archiveresult: + return get_output_path() + + try: + return get_output_path() + list((archiveresult.snapshot_dir / get_output_path()).glob('*'))[0].name + '/' + except IndexError: + pass + + return get_output_path() @enforce_types def should_save_git(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool: @@ -33,7 +46,7 @@ def should_save_git(link: Link, out_dir: Optional[Path]=None, overwrite: Optiona return False out_dir = out_dir or Path(link.link_dir) - if not overwrite and (out_dir / 'git').exists(): + if not overwrite and (out_dir / get_output_path()).exists(): return False is_clonable_url = ( @@ -51,7 +64,7 @@ def save_git(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> """download full site using git""" out_dir = out_dir or Path(link.link_dir) - output: ArchiveOutput = 'git' + output: ArchiveOutput = get_output_path() output_path = out_dir / output output_path.mkdir(exist_ok=True) cmd = [ diff --git a/archivebox/extractors/headers.py b/archivebox/extractors/headers.py index 975787ad..9fd48469 100644 --- a/archivebox/extractors/headers.py +++ b/archivebox/extractors/headers.py @@ -23,10 +23,14 @@ from ..config import ( ) from ..logging_util import TimedProgress +def get_output_path(): + return 'headers.json' + + @enforce_types def should_save_headers(link: Link, out_dir: Optional[str]=None, overwrite: Optional[bool]=False) -> bool: out_dir = out_dir or Path(link.link_dir) - if not overwrite and (out_dir / 'headers.json').exists(): + if not overwrite and (out_dir / get_output_path()).exists(): return False return SAVE_HEADERS @@ -38,7 +42,7 @@ def save_headers(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) out_dir = Path(out_dir or link.link_dir) output_folder = out_dir.absolute() - output: ArchiveOutput = 'headers.json' + output: ArchiveOutput = get_output_path() status = 'succeeded' timer = TimedProgress(timeout, prefix=' ') @@ -59,7 +63,7 @@ def save_headers(link: Link, out_dir: 
Optional[str]=None, timeout: int=TIMEOUT) try: json_headers = get_headers(link.url, timeout=timeout) output_folder.mkdir(exist_ok=True) - atomic_write(str(output_folder / "headers.json"), json_headers) + atomic_write(str(output_folder / get_output_path()), json_headers) except (Exception, OSError) as err: status = 'failed' output = err diff --git a/archivebox/extractors/htmltotext.py b/archivebox/extractors/htmltotext.py index 0686f76e..1957579a 100644 --- a/archivebox/extractors/htmltotext.py +++ b/archivebox/extractors/htmltotext.py @@ -19,6 +19,12 @@ from ..util import ( ) from .title import get_html + +def get_output_path(): + return "htmltotext.txt" + + + class HTMLTextExtractor(HTMLParser): TEXT_ATTRS = [ "alt", "cite", "href", "label", @@ -109,7 +115,7 @@ def should_save_htmltotext(link: Link, out_dir: Optional[Path]=None, overwrite: return False out_dir = out_dir or Path(link.link_dir) - if not overwrite and (out_dir / 'htmltotext.txt').exists(): + if not overwrite and (out_dir / get_output_path()).exists(): return False return SAVE_HTMLTOTEXT @@ -120,7 +126,7 @@ def save_htmltotext(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO """extract search-indexing-friendly text from an HTML document""" out_dir = Path(out_dir or link.link_dir) - output = "htmltotext.txt" + output = get_output_path() cmd = ['(internal) archivebox.extractors.htmltotext', './{singlefile,dom}.html'] timer = TimedProgress(timeout, prefix=' ') diff --git a/archivebox/extractors/media.py b/archivebox/extractors/media.py index ad4c9c4b..8c33e92d 100644 --- a/archivebox/extractors/media.py +++ b/archivebox/extractors/media.py @@ -22,13 +22,27 @@ from ..config import ( from ..logging_util import TimedProgress +def get_output_path(): + return 'media/' + +def get_embed_path(archiveresult=None): + if not archiveresult: + return get_output_path() + + out_dir = archiveresult.snapshot_dir / get_output_path() + try: + return get_output_path() + list(out_dir.glob('*.mp4'))[0].name + except IndexError: + return get_output_path() + + @enforce_types def should_save_media(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool: if is_static_file(link.url): return False out_dir = out_dir or Path(link.link_dir) - if not overwrite and (out_dir / 'media').exists(): + if not overwrite and (out_dir / get_output_path()).exists(): return False return SAVE_MEDIA @@ -38,7 +52,7 @@ def save_media(link: Link, out_dir: Optional[Path]=None, timeout: int=MEDIA_TIME """Download playlists or individual video, audio, and subtitles using youtube-dl or yt-dlp""" out_dir = out_dir or Path(link.link_dir) - output: ArchiveOutput = 'media' + output: ArchiveOutput = get_output_path() output_path = out_dir / output output_path.mkdir(exist_ok=True) # later options take precedence diff --git a/archivebox/extractors/mercury.py b/archivebox/extractors/mercury.py index a0f38434..71af1329 100644 --- a/archivebox/extractors/mercury.py +++ b/archivebox/extractors/mercury.py @@ -24,6 +24,12 @@ from ..config import ( from ..logging_util import TimedProgress +def get_output_path(): + return 'mercury/' + +def get_embed_path(archiveresult=None): + return get_output_path() + 'content.html' + @enforce_types def ShellError(cmd: List[str], result: CompletedProcess, lines: int=20) -> ArchiveError: @@ -44,7 +50,7 @@ def should_save_mercury(link: Link, out_dir: Optional[str]=None, overwrite: Opti return False out_dir = out_dir or Path(link.link_dir) - if not overwrite and (out_dir / 'mercury').exists(): + if not overwrite and (out_dir 
/ get_output_path()).exists(): return False return SAVE_MERCURY @@ -55,8 +61,8 @@ def save_mercury(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) """download reader friendly version using @postlight/mercury-parser""" out_dir = Path(out_dir or link.link_dir) - output_folder = out_dir.absolute() / "mercury" - output = "mercury" + output_folder = out_dir.absolute() / get_output_path() + output = get_output_path() status = 'succeeded' timer = TimedProgress(timeout, prefix=' ') diff --git a/archivebox/extractors/pdf.py b/archivebox/extractors/pdf.py index a6b51948..17bdd47f 100644 --- a/archivebox/extractors/pdf.py +++ b/archivebox/extractors/pdf.py @@ -19,13 +19,17 @@ from ..config import ( from ..logging_util import TimedProgress +def get_output_path(): + return 'output.pdf' + + @enforce_types def should_save_pdf(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool: if is_static_file(link.url): return False out_dir = out_dir or Path(link.link_dir) - if not overwrite and (out_dir / 'output.pdf').exists(): + if not overwrite and (out_dir / get_output_path()).exists(): return False return SAVE_PDF @@ -36,7 +40,7 @@ def save_pdf(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> """print PDF of site to file using chrome --headless""" out_dir = out_dir or Path(link.link_dir) - output: ArchiveOutput = 'output.pdf' + output: ArchiveOutput = get_output_path() cmd = [ *chrome_args(), '--print-to-pdf', @@ -51,7 +55,7 @@ def save_pdf(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> hints = (result.stderr or result.stdout).decode() raise ArchiveError('Failed to save PDF', hints) - chmod_file('output.pdf', cwd=str(out_dir)) + chmod_file(get_output_path(), cwd=str(out_dir)) except Exception as err: status = 'failed' output = err diff --git a/archivebox/extractors/readability.py b/archivebox/extractors/readability.py index dc2a06b9..155438d3 100644 --- a/archivebox/extractors/readability.py +++ b/archivebox/extractors/readability.py @@ -22,6 +22,12 @@ from ..config import ( from ..logging_util import TimedProgress from .title import get_html +def get_output_path(): + return 'readability/' + +def get_embed_path(archiveresult=None): + return get_output_path() + 'content.html' + @enforce_types def should_save_readability(link: Link, out_dir: Optional[str]=None, overwrite: Optional[bool]=False) -> bool: @@ -29,7 +35,7 @@ def should_save_readability(link: Link, out_dir: Optional[str]=None, overwrite: return False out_dir = out_dir or Path(link.link_dir) - if not overwrite and (out_dir / 'readability').exists(): + if not overwrite and (out_dir / get_output_path()).exists(): return False return SAVE_READABILITY @@ -40,8 +46,8 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO """download reader friendly version using @mozilla/readability""" out_dir = Path(out_dir or link.link_dir) - output_folder = out_dir.absolute() / "readability" - output = "readability" + output_folder = out_dir.absolute() / get_output_path() + output = get_output_path() # Readability Docs: https://github.com/mozilla/readability diff --git a/archivebox/extractors/screenshot.py b/archivebox/extractors/screenshot.py index 7ed8dd9d..ae380e6f 100644 --- a/archivebox/extractors/screenshot.py +++ b/archivebox/extractors/screenshot.py @@ -19,6 +19,9 @@ from ..config import ( from ..logging_util import TimedProgress +def get_output_path(): + return 'screenshot.png' + @enforce_types def should_save_screenshot(link: Link, out_dir: 
Optional[Path]=None, overwrite: Optional[bool]=False) -> bool: @@ -26,7 +29,7 @@ def should_save_screenshot(link: Link, out_dir: Optional[Path]=None, overwrite: return False out_dir = out_dir or Path(link.link_dir) - if not overwrite and (out_dir / 'screenshot.png').exists(): + if not overwrite and (out_dir / get_output_path()).exists(): return False return SAVE_SCREENSHOT @@ -36,7 +39,7 @@ def save_screenshot(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO """take screenshot of site using chrome --headless""" out_dir = out_dir or Path(link.link_dir) - output: ArchiveOutput = 'screenshot.png' + output: ArchiveOutput = get_output_path() cmd = [ *chrome_args(), '--screenshot', diff --git a/archivebox/extractors/singlefile.py b/archivebox/extractors/singlefile.py index 1d5275dd..950ccd9c 100644 --- a/archivebox/extractors/singlefile.py +++ b/archivebox/extractors/singlefile.py @@ -26,13 +26,17 @@ from ..config import ( from ..logging_util import TimedProgress +def get_output_path(): + return 'singlefile.html' + + @enforce_types def should_save_singlefile(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool: if is_static_file(link.url): return False out_dir = out_dir or Path(link.link_dir) - if not overwrite and (out_dir / 'singlefile.html').exists(): + if not overwrite and (out_dir / get_output_path()).exists(): return False return SAVE_SINGLEFILE @@ -43,7 +47,7 @@ def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO """download full site using single-file""" out_dir = out_dir or Path(link.link_dir) - output = "singlefile.html" + output = get_output_path() browser_args = chrome_args(CHROME_TIMEOUT=0) @@ -90,7 +94,8 @@ def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO status = 'failed' # TODO: Make this prettier. This is necessary to run the command (escape JSON internal quotes). 
cmd[2] = browser_args.replace('"', "\\\"") - err.hints = (result.stdout + result.stderr).decode().split('\n') + if result: + err.hints = (result.stdout + result.stderr).decode().split('\n') output = err finally: timer.end() diff --git a/archivebox/extractors/title.py b/archivebox/extractors/title.py index 5decc52c..a1cb769f 100644 --- a/archivebox/extractors/title.py +++ b/archivebox/extractors/title.py @@ -60,6 +60,7 @@ class TitleParser(HTMLParser): if tag.lower() == "title": self.inside_title_tag = False + @enforce_types def get_html(link: Link, path: Path, timeout: int=TIMEOUT) -> str: """ @@ -84,6 +85,13 @@ def get_html(link: Link, path: Path, timeout: int=TIMEOUT) -> str: else: return document + +def get_output_path(): + # TODO: actually save title to this file + # (currently only saved in ArchiveResult.output as charfield value, not saved to filesystem) + return 'title.json' + + @enforce_types def should_save_title(link: Link, out_dir: Optional[str]=None, overwrite: Optional[bool]=False) -> bool: # if link already has valid title, skip it diff --git a/archivebox/extractors/wget.py b/archivebox/extractors/wget.py index 86dba0ac..cd72be4e 100644 --- a/archivebox/extractors/wget.py +++ b/archivebox/extractors/wget.py @@ -35,6 +35,18 @@ from ..config import ( from ..logging_util import TimedProgress +def get_output_path(): + # TODO: actually save output into this folder, instead of do {domain}/**/index.html + return 'wget/' + +def get_embed_path(archiveresult=None): + if not archiveresult: + return get_output_path() + + link = archiveresult.snapshot.as_link() + return wget_output_path(link) + + @enforce_types def should_save_wget(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool: output_path = wget_output_path(link) diff --git a/archivebox/index/html.py b/archivebox/index/html.py index 6b914446..2a891d7d 100644 --- a/archivebox/index/html.py +++ b/archivebox/index/html.py @@ -118,10 +118,10 @@ def render_django_template(template: str, context: Mapping[str, str]) -> str: def snapshot_icons(snapshot) -> str: - cache_key = f'{snapshot.id}-{(snapshot.updated or snapshot.added).timestamp()}-snapshot-icons' + cache_key = f'{snapshot.pk}-{(snapshot.updated or snapshot.added).timestamp()}-snapshot-icons' def calc_snapshot_icons(): - from core.models import EXTRACTORS + from core.models import EXTRACTOR_CHOICES # start = datetime.now(timezone.utc) archive_results = snapshot.archiveresult_set.filter(status="succeeded", output__isnull=False) @@ -147,12 +147,12 @@ def snapshot_icons(snapshot) -> str: # Missing specific entry for WARC extractor_outputs = defaultdict(lambda: None) - for extractor, _ in EXTRACTORS: + for extractor, _ in EXTRACTOR_CHOICES: for result in archive_results: if result.extractor == extractor and result: extractor_outputs[extractor] = result - for extractor, _ in EXTRACTORS: + for extractor, _ in EXTRACTOR_CHOICES: if extractor not in exclude: existing = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output # Check filesystsem to see if anything is actually present (too slow, needs optimization/caching) diff --git a/archivebox/index/schema.py b/archivebox/index/schema.py index 8aa4e1c3..c2644eb2 100644 --- a/archivebox/index/schema.py +++ b/archivebox/index/schema.py @@ -192,6 +192,9 @@ class Link: if extended: info.update({ 'snapshot_id': self.snapshot_id, + 'snapshot_uuid': self.snapshot_uuid, + 'snapshot_abid': self.snapshot_abid, + 'link_dir': self.link_dir, 
'archive_path': self.archive_path, @@ -261,9 +264,21 @@ class Link: return to_csv(self, cols=cols or self.field_names(), separator=separator, ljust=ljust) @cached_property - def snapshot_id(self): + def snapshot(self): from core.models import Snapshot - return str(Snapshot.objects.only('id').get(url=self.url).id) + return Snapshot.objects.only('uuid').get(url=self.url) + + @cached_property + def snapshot_id(self): + return str(self.snapshot.pk) + + @cached_property + def snapshot_uuid(self): + return str(self.snapshot.uuid) + + @cached_property + def snapshot_abid(self): + return str(self.snapshot.ABID) @classmethod def field_names(cls): diff --git a/archivebox/index/sql.py b/archivebox/index/sql.py index 5081c275..3c4c2a96 100644 --- a/archivebox/index/sql.py +++ b/archivebox/index/sql.py @@ -45,7 +45,8 @@ def write_link_to_sql_index(link: Link): info.pop('tags') try: - info["timestamp"] = Snapshot.objects.get(url=link.url).timestamp + snapshot = Snapshot.objects.get(url=link.url) + info["timestamp"] = snapshot.timestamp except Snapshot.DoesNotExist: while Snapshot.objects.filter(timestamp=info["timestamp"]).exists(): info["timestamp"] = str(float(info["timestamp"]) + 1.0) @@ -57,7 +58,7 @@ def write_link_to_sql_index(link: Link): for entry in entries: if isinstance(entry, dict): result, _ = ArchiveResult.objects.get_or_create( - snapshot_id=snapshot.id, + snapshot_id=snapshot.pk, extractor=extractor, start_ts=parse_date(entry['start_ts']), defaults={ @@ -71,7 +72,7 @@ def write_link_to_sql_index(link: Link): ) else: result, _ = ArchiveResult.objects.update_or_create( - snapshot_id=snapshot.id, + snapshot_id=snapshot.pk, extractor=extractor, start_ts=parse_date(entry.start_ts), defaults={ diff --git a/archivebox/monkey_patches.py b/archivebox/monkey_patches.py new file mode 100644 index 00000000..0dcfa082 --- /dev/null +++ b/archivebox/monkey_patches.py @@ -0,0 +1,16 @@ +__package__ = 'archivebox' + +import django_stubs_ext + +django_stubs_ext.monkeypatch() + + +# monkey patch django timezone to add back utc (it was removed in Django 5.0) +import datetime +from django.utils import timezone +timezone.utc = datetime.timezone.utc + + +# monkey patch django-signals-webhooks to change how it shows up in Admin UI +# from signal_webhooks.apps import DjangoSignalWebhooksConfig +# DjangoSignalWebhooksConfig.verbose_name = 'API' diff --git a/archivebox/plugantic/__init__.py b/archivebox/plugantic/__init__.py new file mode 100644 index 00000000..244d084f --- /dev/null +++ b/archivebox/plugantic/__init__.py @@ -0,0 +1,17 @@ +__package__ = 'archivebox.plugantic' + +from .binproviders import BinProvider +from .binaries import Binary +from .extractors import Extractor +from .replayers import Replayer +from .configs import ConfigSet +from .plugins import Plugin + +# __all__ = [ +# 'BinProvider', +# 'Binary', +# 'Extractor', +# 'Replayer', +# 'ConfigSet', +# 'Plugin', +# ] diff --git a/archivebox/plugantic/admin.py b/archivebox/plugantic/admin.py new file mode 100644 index 00000000..832a820d --- /dev/null +++ b/archivebox/plugantic/admin.py @@ -0,0 +1,26 @@ +# from django.contrib import admin +# from django import forms + +# from django_jsonform.widgets import JSONFormWidget + +# from django_pydantic_field.v2.fields import PydanticSchemaField + +# from .models import CustomPlugin + + +# class PluginForm(forms.ModelForm): +# class Meta: +# model = CustomPlugin +# fields = '__all__' +# widgets = { +# 'items': JSONFormWidget(schema=PluginSchema), +# } + + +# class PluginAdmin(admin.ModelAdmin): +# 
formfield_overrides = { +# PydanticSchemaField: {"widget": JSONFormWidget}, +# } +# form = PluginForm + + diff --git a/archivebox/plugantic/apps.py b/archivebox/plugantic/apps.py new file mode 100644 index 00000000..c0f1ce71 --- /dev/null +++ b/archivebox/plugantic/apps.py @@ -0,0 +1,6 @@ +from django.apps import AppConfig + + +class PluganticConfig(AppConfig): + default_auto_field = 'django.db.models.BigAutoField' + name = 'plugantic' diff --git a/archivebox/plugantic/binaries.py b/archivebox/plugantic/binaries.py new file mode 100644 index 00000000..4788c361 --- /dev/null +++ b/archivebox/plugantic/binaries.py @@ -0,0 +1,323 @@ +__package__ = 'archivebox.plugantic' + +import sys +import inspect +import importlib +from pathlib import Path + + +from typing import Any, Optional, Dict, List +from typing_extensions import Self +from subprocess import run, PIPE + + +from pydantic_core import ValidationError + +from pydantic import BaseModel, Field, model_validator, computed_field, field_validator, validate_call, field_serializer + +from .binproviders import ( + SemVer, + BinName, + BinProviderName, + HostBinPath, + BinProvider, + EnvProvider, + AptProvider, + BrewProvider, + PipProvider, + ProviderLookupDict, + bin_name, + bin_abspath, + path_is_script, + path_is_executable, +) + + +class Binary(BaseModel): + name: BinName + description: str = Field(default='') + + providers_supported: List[BinProvider] = Field(default=[EnvProvider()], alias='providers') + provider_overrides: Dict[BinProviderName, ProviderLookupDict] = Field(default={}, alias='overrides') + + loaded_provider: Optional[BinProviderName] = Field(default=None, alias='provider') + loaded_abspath: Optional[HostBinPath] = Field(default=None, alias='abspath') + loaded_version: Optional[SemVer] = Field(default=None, alias='version') + + # bin_filename: see below + # is_executable: see below + # is_script + # is_valid: see below + + + @model_validator(mode='after') + def validate(self): + self.loaded_abspath = bin_abspath(self.name) or self.name + self.description = self.description or self.name + + assert self.providers_supported, f'No providers were given for package {self.name}' + + # pull in any overrides from the binproviders + for provider in self.providers_supported: + overrides_by_provider = provider.get_providers_for_bin(self.name) + if overrides_by_provider: + self.provider_overrides[provider.name] = { + **overrides_by_provider, + **self.provider_overrides.get(provider.name, {}), + } + return self + + @field_validator('loaded_abspath', mode='before') + def parse_abspath(cls, value: Any): + return bin_abspath(value) + + @field_validator('loaded_version', mode='before') + def parse_version(cls, value: Any): + return value and SemVer(value) + + @field_serializer('provider_overrides', when_used='json') + def serialize_overrides(self, provider_overrides: Dict[BinProviderName, ProviderLookupDict]) -> Dict[BinProviderName, Dict[str, str]]: + return { + provider_name: { + key: str(val) + for key, val in overrides.items() + } + for provider_name, overrides in provider_overrides.items() + } + + @computed_field # type: ignore[misc] # see mypy issue #1362 + @property + def bin_filename(self) -> BinName: + if self.is_script: + # e.g. '.../Python.framework/Versions/3.11/lib/python3.11/sqlite3/__init__.py' -> sqlite + name = self.name + elif self.loaded_abspath: + # e.g. '/opt/homebrew/bin/wget' -> wget + name = bin_name(self.loaded_abspath) + else: + # e.g. 
'ytdlp' -> 'yt-dlp' + name = bin_name(self.name) + return name + + @computed_field # type: ignore[misc] # see mypy issue #1362 + @property + def is_executable(self) -> bool: + try: + assert self.loaded_abspath and path_is_executable(self.loaded_abspath) + return True + except (ValidationError, AssertionError): + return False + + @computed_field # type: ignore[misc] # see mypy issue #1362 + @property + def is_script(self) -> bool: + try: + assert self.loaded_abspath and path_is_script(self.loaded_abspath) + return True + except (ValidationError, AssertionError): + return False + + @computed_field # type: ignore[misc] # see mypy issue #1362 + @property + def is_valid(self) -> bool: + return bool( + self.name + and self.loaded_abspath + and self.loaded_version + and (self.is_executable or self.is_script) + ) + + @validate_call + def install(self) -> Self: + if not self.providers_supported: + return self + + exc = Exception('No providers were able to install binary', self.name, self.providers_supported) + for provider in self.providers_supported: + try: + installed_bin = provider.install(self.name, overrides=self.provider_overrides.get(provider.name)) + if installed_bin: + # print('INSTALLED', self.name, installed_bin) + return self.model_copy(update={ + 'loaded_provider': provider.name, + 'loaded_abspath': installed_bin.abspath, + 'loaded_version': installed_bin.version, + }) + except Exception as err: + print(err) + exc = err + raise exc + + @validate_call + def load(self, cache=True) -> Self: + if self.is_valid: + return self + + if not self.providers_supported: + return self + + exc = Exception('No providers were able to install binary', self.name, self.providers_supported) + for provider in self.providers_supported: + try: + installed_bin = provider.load(self.name, cache=cache, overrides=self.provider_overrides.get(provider.name)) + if installed_bin: + # print('LOADED', provider, self.name, installed_bin) + return self.model_copy(update={ + 'loaded_provider': provider.name, + 'loaded_abspath': installed_bin.abspath, + 'loaded_version': installed_bin.version, + }) + except Exception as err: + print(err) + exc = err + raise exc + + @validate_call + def load_or_install(self, cache=True) -> Self: + if self.is_valid: + return self + + if not self.providers_supported: + return self + + exc = Exception('No providers were able to install binary', self.name, self.providers_supported) + for provider in self.providers_supported: + try: + installed_bin = provider.load_or_install(self.name, overrides=self.provider_overrides.get(provider.name), cache=cache) + if installed_bin: + # print('LOADED_OR_INSTALLED', self.name, installed_bin) + return self.model_copy(update={ + 'loaded_provider': provider.name, + 'loaded_abspath': installed_bin.abspath, + 'loaded_version': installed_bin.version, + }) + except Exception as err: + print(err) + exc = err + raise exc + + @validate_call + def exec(self, args=(), pwd='.'): + assert self.loaded_abspath + assert self.loaded_version + return run([self.loaded_abspath, *args], stdout=PIPE, stderr=PIPE, pwd=pwd) + + + + +class SystemPythonHelpers: + @staticmethod + def get_subdeps() -> str: + return 'python3 python3-minimal python3-pip python3-virtualenv' + + @staticmethod + def get_abspath() -> str: + return sys.executable + + @staticmethod + def get_version() -> str: + return '{}.{}.{}'.format(*sys.version_info[:3]) + + +class SqliteHelpers: + @staticmethod + def get_abspath() -> Path: + import sqlite3 + importlib.reload(sqlite3) + return Path(inspect.getfile(sqlite3)) 
+ + @staticmethod + def get_version() -> SemVer: + import sqlite3 + importlib.reload(sqlite3) + version = sqlite3.version + assert version + return SemVer(version) + +class DjangoHelpers: + @staticmethod + def get_django_abspath() -> str: + import django + return inspect.getfile(django) + + + @staticmethod + def get_django_version() -> str: + import django + return '{}.{}.{} {} ({})'.format(*django.VERSION) + +class YtdlpHelpers: + @staticmethod + def get_ytdlp_subdeps() -> str: + return 'yt-dlp ffmpeg' + + @staticmethod + def get_ytdlp_version() -> str: + import yt_dlp + importlib.reload(yt_dlp) + + version = yt_dlp.version.__version__ + assert version + return version + +class PythonBinary(Binary): + name: BinName = 'python' + + providers_supported: List[BinProvider] = [ + EnvProvider( + subdeps_provider={'python': 'plugantic.binaries.SystemPythonHelpers.get_subdeps'}, + abspath_provider={'python': 'plugantic.binaries.SystemPythonHelpers.get_abspath'}, + version_provider={'python': 'plugantic.binaries.SystemPythonHelpers.get_version'}, + ), + ] + +class SqliteBinary(Binary): + name: BinName = 'sqlite' + providers_supported: List[BinProvider] = [ + EnvProvider( + version_provider={'sqlite': 'plugantic.binaries.SqliteHelpers.get_version'}, + abspath_provider={'sqlite': 'plugantic.binaries.SqliteHelpers.get_abspath'}, + ), + ] + +class DjangoBinary(Binary): + name: BinName = 'django' + providers_supported: List[BinProvider] = [ + EnvProvider( + abspath_provider={'django': 'plugantic.binaries.DjangoHelpers.get_django_abspath'}, + version_provider={'django': 'plugantic.binaries.DjangoHelpers.get_django_version'}, + ), + ] + + + + + +class YtdlpBinary(Binary): + name: BinName = 'yt-dlp' + providers_supported: List[BinProvider] = [ + # EnvProvider(), + PipProvider(version_provider={'yt-dlp': 'plugantic.binaries.YtdlpHelpers.get_ytdlp_version'}), + BrewProvider(subdeps_provider={'yt-dlp': 'plugantic.binaries.YtdlpHelpers.get_ytdlp_subdeps'}), + # AptProvider(subdeps_provider={'yt-dlp': lambda: 'yt-dlp ffmpeg'}), + ] + + +class WgetBinary(Binary): + name: BinName = 'wget' + providers_supported: List[BinProvider] = [EnvProvider(), AptProvider()] + + +# if __name__ == '__main__': +# PYTHON_BINARY = PythonBinary() +# SQLITE_BINARY = SqliteBinary() +# DJANGO_BINARY = DjangoBinary() +# WGET_BINARY = WgetBinary() +# YTDLP_BINARY = YtdlpPBinary() + +# print('-------------------------------------DEFINING BINARIES---------------------------------') +# print(PYTHON_BINARY) +# print(SQLITE_BINARY) +# print(DJANGO_BINARY) +# print(WGET_BINARY) +# print(YTDLP_BINARY) diff --git a/archivebox/plugantic/binproviders.py b/archivebox/plugantic/binproviders.py new file mode 100644 index 00000000..1c9933ea --- /dev/null +++ b/archivebox/plugantic/binproviders.py @@ -0,0 +1,561 @@ +__package__ = 'archivebox.plugantic' + +import os +import shutil +import operator + +from typing import Callable, Any, Optional, Type, Dict, Annotated, ClassVar, Literal, cast, TYPE_CHECKING +from typing_extensions import Self +from abc import ABC, abstractmethod +from collections import namedtuple +from pathlib import Path +from subprocess import run, PIPE + +from pydantic_core import core_schema, ValidationError +from pydantic import BaseModel, Field, TypeAdapter, AfterValidator, validate_call, GetCoreSchemaHandler + + + +def func_takes_args_or_kwargs(lambda_func: Callable[..., Any]) -> bool: + """returns True if a lambda func takes args/kwargs of any kind, otherwise false if it's pure/argless""" + code = lambda_func.__code__ + has_args 
= code.co_argcount > 0 + has_varargs = code.co_flags & 0x04 != 0 + has_varkw = code.co_flags & 0x08 != 0 + return has_args or has_varargs or has_varkw + + +def is_semver_str(semver: Any) -> bool: + if isinstance(semver, str): + return (semver.count('.') == 2 and semver.replace('.', '').isdigit()) + return False + +def semver_to_str(semver: tuple[int, int, int] | str) -> str: + if isinstance(semver, (list, tuple)): + return '.'.join(str(chunk) for chunk in semver) + if is_semver_str(semver): + return semver + raise ValidationError('Tried to convert invalid SemVer: {}'.format(semver)) + + +SemVerTuple = namedtuple('SemVerTuple', ('major', 'minor', 'patch'), defaults=(0, 0, 0)) +SemVerParsableTypes = str | tuple[str | int, ...] | list[str | int] + +class SemVer(SemVerTuple): + major: int + minor: int = 0 + patch: int = 0 + + if TYPE_CHECKING: + full_text: str | None = '' + + def __new__(cls, *args, full_text=None, **kwargs): + # '1.1.1' + if len(args) == 1 and is_semver_str(args[0]): + result = SemVer.parse(args[0]) + + # ('1', '2', '3') + elif len(args) == 1 and isinstance(args[0], (tuple, list)): + result = SemVer.parse(args[0]) + + # (1, '2', None) + elif not all(isinstance(arg, (int, type(None))) for arg in args): + result = SemVer.parse(args) + + # (None) + elif all(chunk in ('', 0, None) for chunk in (*args, *kwargs.values())): + result = None + + # 1, 2, 3 + else: + result = SemVerTuple.__new__(cls, *args, **kwargs) + + if result is not None: + # add first line as extra hidden metadata so it can be logged without having to re-run version cmd + result.full_text = full_text or str(result) + return result + + @classmethod + def parse(cls, version_stdout: SemVerParsableTypes) -> Self | None: + """ + parses a version tag string formatted like into (major, minor, patch) ints + 'Google Chrome 124.0.6367.208' -> (124, 0, 6367) + 'GNU Wget 1.24.5 built on darwin23.2.0.' -> (1, 24, 5) + 'curl 8.4.0 (x86_64-apple-darwin23.0) ...' -> (8, 4, 0) + '2024.04.09' -> (2024, 4, 9) + + """ + # print('INITIAL_VALUE', type(version_stdout).__name__, version_stdout) + + if isinstance(version_stdout, (tuple, list)): + version_stdout = '.'.join(str(chunk) for chunk in version_stdout) + elif isinstance(version_stdout, bytes): + version_stdout = version_stdout.decode() + elif not isinstance(version_stdout, str): + version_stdout = str(version_stdout) + + # no text to work with, return None immediately + if not version_stdout.strip(): + # raise Exception('Tried to parse semver from empty version output (is binary installed and available?)') + return None + + just_numbers = lambda col: col.lower().strip('v').split('+')[0].split('-')[0].split('_')[0] + contains_semver = lambda col: ( + col.count('.') in (1, 2, 3) + and all(chunk.isdigit() for chunk in col.split('.')[:3]) # first 3 chunks can only be nums + ) + + full_text = version_stdout.split('\n')[0].strip() + first_line_columns = full_text.split()[:4] + version_columns = list(filter(contains_semver, map(just_numbers, first_line_columns))) + + # could not find any column of first line that looks like a version number, despite there being some text + if not version_columns: + # raise Exception('Failed to parse semver from version command output: {}'.format(' '.join(first_line_columns))) + return None + + # take first col containing a semver, and truncate it to 3 chunks (e.g. 
2024.04.09.91) -> (2024, 04, 09) + first_version_tuple = version_columns[0].split('.', 3)[:3] + + # print('FINAL_VALUE', first_version_tuple) + + return cls(*(int(chunk) for chunk in first_version_tuple), full_text=full_text) + + def __str__(self): + return '.'.join(str(chunk) for chunk in self) + + # @classmethod + # def __get_pydantic_core_schema__(cls, source: Type[Any], handler: GetCoreSchemaHandler) -> core_schema.CoreSchema: + # default_schema = handler(source) + # return core_schema.no_info_after_validator_function( + # cls.parse, + # default_schema, + # serialization=core_schema.plain_serializer_function_ser_schema( + # lambda semver: str(semver), + # info_arg=False, + # return_schema=core_schema.str_schema(), + # ), + # ) + +assert SemVer(None) == None +assert SemVer('') == None +assert SemVer.parse('') == None +assert SemVer(1) == (1, 0, 0) +assert SemVer(1, 2) == (1, 2, 0) +assert SemVer('1.2+234234') == (1, 2, 0) +assert SemVer((1, 2, 3)) == (1, 2, 3) +assert getattr(SemVer((1, 2, 3)), 'full_text') == '1.2.3' +assert SemVer(('1', '2', '3')) == (1, 2, 3) +assert SemVer.parse('5.6.7') == (5, 6, 7) +assert SemVer.parse('124.0.6367.208') == (124, 0, 6367) +assert SemVer.parse('Google Chrome 124.1+234.234') == (124, 1, 0) +assert SemVer.parse('Google Ch1rome 124.0.6367.208') == (124, 0, 6367) +assert SemVer.parse('Google Chrome 124.0.6367.208+beta_234. 234.234.123\n123.456.324') == (124, 0, 6367) +assert getattr(SemVer.parse('Google Chrome 124.0.6367.208+beta_234. 234.234.123\n123.456.324'), 'full_text') == 'Google Chrome 124.0.6367.208+beta_234. 234.234.123' +assert SemVer.parse('Google Chrome') == None + +@validate_call +def bin_name(bin_path_or_name: str | Path) -> str: + name = Path(bin_path_or_name).name + assert len(name) > 1 + assert name.replace('-', '').replace('_', '').replace('.', '').isalnum(), ( + f'Binary name can only contain a-Z0-9-_.: {name}') + return name + +BinName = Annotated[str, AfterValidator(bin_name)] + +@validate_call +def path_is_file(path: Path | str) -> Path: + path = Path(path) if isinstance(path, str) else path + assert path.is_file(), f'Path is not a file: {path}' + return path + +HostExistsPath = Annotated[Path, AfterValidator(path_is_file)] + +@validate_call +def path_is_executable(path: HostExistsPath) -> HostExistsPath: + assert os.access(path, os.X_OK), f'Path is not executable (fix by running chmod +x {path})' + return path + +@validate_call +def path_is_script(path: HostExistsPath) -> HostExistsPath: + SCRIPT_EXTENSIONS = ('.py', '.js', '.sh') + assert path.suffix.lower() in SCRIPT_EXTENSIONS, 'Path is not a script (does not end in {})'.format(', '.join(SCRIPT_EXTENSIONS)) + return path + +HostExecutablePath = Annotated[HostExistsPath, AfterValidator(path_is_executable)] + +@validate_call +def path_is_abspath(path: Path) -> Path: + return path.resolve() + +HostAbsPath = Annotated[HostExistsPath, AfterValidator(path_is_abspath)] +HostBinPath = Annotated[Path, AfterValidator(path_is_abspath), AfterValidator(path_is_file)] + + +@validate_call +def bin_abspath(bin_path_or_name: BinName | Path) -> HostBinPath | None: + assert bin_path_or_name + + if str(bin_path_or_name).startswith('/'): + # already a path, get its absolute form + abspath = Path(bin_path_or_name).resolve() + else: + # not a path yet, get path using os.which + binpath = shutil.which(bin_path_or_name) + if not binpath: + return None + abspath = Path(binpath).resolve() + + try: + return TypeAdapter(HostBinPath).validate_python(abspath) + except ValidationError: + return None + + 
+@validate_call +def bin_version(bin_path: HostBinPath, args=('--version',)) -> SemVer | None: + return SemVer(run([bin_path, *args], stdout=PIPE).stdout.strip().decode()) + + +class InstalledBin(BaseModel): + abspath: HostBinPath + version: SemVer + + +def is_valid_install_string(pkgs_str: str) -> str: + """Make sure a string is a valid install string for a package manager, e.g. 'yt-dlp ffmpeg'""" + assert pkgs_str + assert all(len(pkg) > 1 for pkg in pkgs_str.split(' ')) + return pkgs_str + +def is_valid_python_dotted_import(import_str: str) -> str: + assert import_str and import_str.replace('.', '').replace('_', '').isalnum() + return import_str + +InstallStr = Annotated[str, AfterValidator(is_valid_install_string)] + +LazyImportStr = Annotated[str, AfterValidator(is_valid_python_dotted_import)] + +ProviderHandler = Callable[..., Any] | Callable[[], Any] # must take no args [], or [bin_name: str, **kwargs] +#ProviderHandlerStr = Annotated[str, AfterValidator(lambda s: s.startswith('self.'))] +ProviderHandlerRef = LazyImportStr | ProviderHandler +ProviderLookupDict = Dict[str, LazyImportStr] +ProviderType = Literal['abspath', 'version', 'subdeps', 'install'] + + +# class Host(BaseModel): +# machine: str +# system: str +# platform: str +# in_docker: bool +# in_qemu: bool +# python: str + +BinProviderName = Literal['env', 'pip', 'apt', 'brew', 'npm', 'vendor'] + + +class BinProvider(ABC, BaseModel): + name: BinProviderName + + abspath_provider: ProviderLookupDict = Field(default={'*': 'self.on_get_abspath'}, exclude=True) + version_provider: ProviderLookupDict = Field(default={'*': 'self.on_get_version'}, exclude=True) + subdeps_provider: ProviderLookupDict = Field(default={'*': 'self.on_get_subdeps'}, exclude=True) + install_provider: ProviderLookupDict = Field(default={'*': 'self.on_install'}, exclude=True) + + _abspath_cache: ClassVar = {} + _version_cache: ClassVar = {} + _install_cache: ClassVar = {} + + # def provider_version(self) -> SemVer | None: + # """Version of the actual underlying package manager (e.g. 
pip v20.4.1)""" + # if self.name in ('env', 'vendor'): + # return SemVer('0.0.0') + # installer_binpath = Path(shutil.which(self.name)).resolve() + # return bin_version(installer_binpath) + + # def provider_host(self) -> Host: + # """Information about the host env, archictecture, and OS needed to select & build packages""" + # p = platform.uname() + # return Host( + # machine=p.machine, + # system=p.system, + # platform=platform.platform(), + # python=sys.implementation.name, + # in_docker=os.environ.get('IN_DOCKER', '').lower() == 'true', + # in_qemu=os.environ.get('IN_QEMU', '').lower() == 'true', + # ) + + def get_default_providers(self): + return self.get_providers_for_bin('*') + + def resolve_provider_func(self, provider_func: ProviderHandlerRef | None) -> ProviderHandler | None: + if provider_func is None: + return None + + # if provider_func is a dotted path to a function on self, swap it for the actual function + if isinstance(provider_func, str) and provider_func.startswith('self.'): + provider_func = getattr(self, provider_func.split('self.', 1)[-1]) + + # if provider_func is a dot-formatted import string, import the function + if isinstance(provider_func, str): + from django.utils.module_loading import import_string + + package_name, module_name, classname, path = provider_func.split('.', 3) # -> abc, def, ghi.jkl + + # get .ghi.jkl nested attr present on module abc.def + imported_module = import_string(f'{package_name}.{module_name}.{classname}') + provider_func = operator.attrgetter(path)(imported_module) + + # # abc.def.ghi.jkl -> 1, 2, 3 + # for idx in range(1, len(path)): + # parent_path = '.'.join(path[:-idx]) # abc.def.ghi + # try: + # parent_module = import_string(parent_path) + # provider_func = getattr(parent_module, path[-idx]) + # except AttributeError, ImportError: + # continue + + assert TypeAdapter(ProviderHandler).validate_python(provider_func), ( + f'{self.__class__.__name__} provider func for {bin_name} was not a function or dotted-import path: {provider_func}') + + return provider_func + + @validate_call + def get_providers_for_bin(self, bin_name: str) -> ProviderLookupDict: + providers_for_bin = { + 'abspath': self.abspath_provider.get(bin_name), + 'version': self.version_provider.get(bin_name), + 'subdeps': self.subdeps_provider.get(bin_name), + 'install': self.install_provider.get(bin_name), + } + only_set_providers_for_bin = {k: v for k, v in providers_for_bin.items() if v is not None} + + return only_set_providers_for_bin + + @validate_call + def get_provider_for_action(self, bin_name: BinName, provider_type: ProviderType, default_provider: Optional[ProviderHandlerRef]=None, overrides: Optional[ProviderLookupDict]=None) -> ProviderHandler: + """ + Get the provider func for a given key + Dict of provider callbacks + fallback default provider. + e.g. get_provider_for_action(bin_name='yt-dlp', 'install', default_provider=self.on_install, ...) -> Callable + """ + + provider_func_ref = ( + (overrides or {}).get(provider_type) + or self.get_providers_for_bin(bin_name).get(provider_type) + or self.get_default_providers().get(provider_type) + or default_provider + ) + # print('getting provider for action', bin_name, provider_type, provider_func) + + provider_func = self.resolve_provider_func(provider_func_ref) + + assert provider_func, f'No {self.name} provider func was found for {bin_name} in: {self.__class__.__name__}.' 
+ + return provider_func + + @validate_call + def call_provider_for_action(self, bin_name: BinName, provider_type: ProviderType, default_provider: Optional[ProviderHandlerRef]=None, overrides: Optional[ProviderLookupDict]=None, **kwargs) -> Any: + provider_func: ProviderHandler = self.get_provider_for_action( + bin_name=bin_name, + provider_type=provider_type, + default_provider=default_provider, + overrides=overrides, + ) + if not func_takes_args_or_kwargs(provider_func): + # if it's a pure argless lambdas, dont pass bin_path and other **kwargs + provider_func_without_args = cast(Callable[[], Any], provider_func) + return provider_func_without_args() + + provider_func = cast(Callable[..., Any], provider_func) + return provider_func(bin_name, **kwargs) + + + + def on_get_abspath(self, bin_name: BinName, **_) -> HostBinPath | None: + print(f'[*] {self.__class__.__name__}: Getting abspath for {bin_name}...') + try: + return bin_abspath(bin_name) + except ValidationError: + return None + + def on_get_version(self, bin_name: BinName, abspath: Optional[HostBinPath]=None, **_) -> SemVer | None: + abspath = abspath or self._abspath_cache.get(bin_name) or self.get_abspath(bin_name) + if not abspath: return None + + print(f'[*] {self.__class__.__name__}: Getting version for {bin_name}...') + try: + return bin_version(abspath) + except ValidationError: + return None + + def on_get_subdeps(self, bin_name: BinName, **_) -> InstallStr: + print(f'[*] {self.__class__.__name__}: Getting subdependencies for {bin_name}') + # ... subdependency calculation logic here + return TypeAdapter(InstallStr).validate_python(bin_name) + + @abstractmethod + def on_install(self, bin_name: BinName, subdeps: Optional[InstallStr]=None, **_): + subdeps = subdeps or self.get_subdeps(bin_name) + print(f'[*] {self.__class__.__name__}: Installing subdependencies for {bin_name} ({subdeps})') + # ... 
install logic here + assert True + + + @validate_call + def get_abspath(self, bin_name: BinName, overrides: Optional[ProviderLookupDict]=None) -> HostBinPath | None: + abspath = self.call_provider_for_action( + bin_name=bin_name, + provider_type='abspath', + default_provider=self.on_get_abspath, + overrides=overrides, + ) + if not abspath: + return None + result = TypeAdapter(HostBinPath).validate_python(abspath) + self._abspath_cache[bin_name] = result + return result + + @validate_call + def get_version(self, bin_name: BinName, abspath: Optional[HostBinPath]=None, overrides: Optional[ProviderLookupDict]=None) -> SemVer | None: + version = self.call_provider_for_action( + bin_name=bin_name, + provider_type='version', + default_provider=self.on_get_version, + overrides=overrides, + abspath=abspath, + ) + if not version: + return None + result = SemVer(version) + self._version_cache[bin_name] = result + return result + + @validate_call + def get_subdeps(self, bin_name: BinName, overrides: Optional[ProviderLookupDict]=None) -> InstallStr: + subdeps = self.call_provider_for_action( + bin_name=bin_name, + provider_type='subdeps', + default_provider=self.on_get_subdeps, + overrides=overrides, + ) + if not subdeps: + subdeps = bin_name + result = TypeAdapter(InstallStr).validate_python(subdeps) + return result + + @validate_call + def install(self, bin_name: BinName, overrides: Optional[ProviderLookupDict]=None) -> InstalledBin | None: + subdeps = self.get_subdeps(bin_name, overrides=overrides) + + self.call_provider_for_action( + bin_name=bin_name, + provider_type='install', + default_provider=self.on_install, + overrides=overrides, + subdeps=subdeps, + ) + + installed_abspath = self.get_abspath(bin_name) + assert installed_abspath, f'Unable to find {bin_name} abspath after installing with {self.name}' + + installed_version = self.get_version(bin_name, abspath=installed_abspath) + assert installed_version, f'Unable to find {bin_name} version after installing with {self.name}' + + result = InstalledBin(abspath=installed_abspath, version=installed_version) + self._install_cache[bin_name] = result + return result + + @validate_call + def load(self, bin_name: BinName, overrides: Optional[ProviderLookupDict]=None, cache: bool=False) -> InstalledBin | None: + installed_abspath = None + installed_version = None + + if cache: + installed_bin = self._install_cache.get(bin_name) + if installed_bin: + return installed_bin + installed_abspath = self._abspath_cache.get(bin_name) + installed_version = self._version_cache.get(bin_name) + + + installed_abspath = installed_abspath or self.get_abspath(bin_name, overrides=overrides) + if not installed_abspath: + return None + + installed_version = installed_version or self.get_version(bin_name, abspath=installed_abspath, overrides=overrides) + if not installed_version: + return None + + return InstalledBin(abspath=installed_abspath, version=installed_version) + + @validate_call + def load_or_install(self, bin_name: BinName, overrides: Optional[ProviderLookupDict]=None, cache: bool=True) -> InstalledBin | None: + installed = self.load(bin_name, overrides=overrides, cache=cache) + if not installed: + installed = self.install(bin_name, overrides=overrides) + return installed + + +class PipProvider(BinProvider): + name: BinProviderName = 'pip' + + def on_install(self, bin_name: str, subdeps: Optional[InstallStr]=None, **_): + subdeps = subdeps or self.on_get_subdeps(bin_name) + print(f'[*] {self.__class__.__name__}: Installing subdependencies for {bin_name} 
({subdeps})') + + proc = run(['pip', 'install', '--upgrade', *subdeps.split(' ')], stdout=PIPE, stderr=PIPE) + + if proc.returncode != 0: + print(proc.stdout.strip().decode()) + print(proc.stderr.strip().decode()) + raise Exception(f'{self.__class__.__name__}: install got returncode {proc.returncode} while installing {subdeps}: {subdeps}') + + +class AptProvider(BinProvider): + name: BinProviderName = 'apt' + + subdeps_provider: ProviderLookupDict = { + 'yt-dlp': lambda: 'yt-dlp ffmpeg', + } + + def on_install(self, bin_name: BinName, subdeps: Optional[InstallStr]=None, **_): + subdeps = subdeps or self.on_get_subdeps(bin_name) + print(f'[*] {self.__class__.__name__}: Installing subdependencies for {bin_name} ({subdeps})') + + run(['apt-get', 'update', '-qq']) + proc = run(['apt-get', 'install', '-y', *subdeps.split(' ')], stdout=PIPE, stderr=PIPE) + + if proc.returncode != 0: + print(proc.stdout.strip().decode()) + print(proc.stderr.strip().decode()) + raise Exception(f'{self.__class__.__name__} install got returncode {proc.returncode} while installing {subdeps}: {subdeps}') + +class BrewProvider(BinProvider): + name: BinProviderName = 'brew' + + def on_install(self, bin_name: str, subdeps: Optional[InstallStr]=None, **_): + subdeps = subdeps or self.on_get_subdeps(bin_name) + print(f'[*] {self.__class__.__name__}: Installing subdependencies for {bin_name} ({subdeps})') + + proc = run(['brew', 'install', *subdeps.split(' ')], stdout=PIPE, stderr=PIPE) + + if proc.returncode != 0: + print(proc.stdout.strip().decode()) + print(proc.stderr.strip().decode()) + raise Exception(f'{self.__class__.__name__} install got returncode {proc.returncode} while installing {subdeps}: {subdeps}') + + +class EnvProvider(BinProvider): + name: BinProviderName = 'env' + + abspath_provider: ProviderLookupDict = { + # 'python': lambda: Path('/opt/homebrew/Cellar/python@3.10/3.10.14/Frameworks/Python.framework/Versions/3.10/bin/python3.10'), + } + version_provider: ProviderLookupDict = { + # 'python': lambda: '{}.{}.{}'.format(*sys.version_info[:3]), + } + + def on_install(self, bin_name: BinName, subdeps: Optional[InstallStr]=None, **_): + """The env provider is ready-only and does not install any packages, so this is a no-op""" + pass diff --git a/archivebox/plugantic/configs.py b/archivebox/plugantic/configs.py new file mode 100644 index 00000000..671f105c --- /dev/null +++ b/archivebox/plugantic/configs.py @@ -0,0 +1,53 @@ +__package__ = 'archivebox.plugantic' + + +from typing import Optional, List, Literal +from pathlib import Path +from pydantic import BaseModel, Field + + +ConfigSectionName = Literal['GENERAL_CONFIG', 'ARCHIVE_METHOD_TOGGLES', 'ARCHIVE_METHOD_OPTIONS', 'DEPENDENCY_CONFIG'] + + +class ConfigSet(BaseModel): + section: ConfigSectionName = 'GENERAL_CONFIG' + +class WgetToggleConfig(ConfigSet): + section: ConfigSectionName = 'ARCHIVE_METHOD_TOGGLES' + + SAVE_WGET: bool = True + SAVE_WARC: bool = True + +class WgetDependencyConfig(ConfigSet): + section: ConfigSectionName = 'DEPENDENCY_CONFIG' + + WGET_BINARY: str = Field(default='wget') + WGET_ARGS: Optional[List[str]] = Field(default=None) + WGET_EXTRA_ARGS: List[str] = [] + WGET_DEFAULT_ARGS: List[str] = ['--timeout={TIMEOUT-10}'] + +class WgetOptionsConfig(ConfigSet): + section: ConfigSectionName = 'ARCHIVE_METHOD_OPTIONS' + + # loaded from shared config + WGET_AUTO_COMPRESSION: bool = Field(default=True) + SAVE_WGET_REQUISITES: bool = Field(default=True) + WGET_USER_AGENT: str = Field(default='', alias='USER_AGENT') + WGET_TIMEOUT: int = 
Field(default=60, alias='TIMEOUT') + WGET_CHECK_SSL_VALIDITY: bool = Field(default=True, alias='CHECK_SSL_VALIDITY') + WGET_RESTRICT_FILE_NAMES: str = Field(default='windows', alias='RESTRICT_FILE_NAMES') + WGET_COOKIES_FILE: Optional[Path] = Field(default=None, alias='COOKIES_FILE') + + +CONFIG = { + 'CHECK_SSL_VALIDITY': False, + 'SAVE_WARC': False, + 'TIMEOUT': 999, +} + + +WGET_CONFIG = [ + WgetToggleConfig(**CONFIG), + WgetDependencyConfig(**CONFIG), + WgetOptionsConfig(**CONFIG), +] diff --git a/archivebox/plugantic/extractors.py b/archivebox/plugantic/extractors.py new file mode 100644 index 00000000..3befa5b5 --- /dev/null +++ b/archivebox/plugantic/extractors.py @@ -0,0 +1,118 @@ +__package__ = 'archivebox.plugantic' + +from typing import Optional, List, Literal, Annotated, Dict, Any +from typing_extensions import Self + +from abc import ABC +from pathlib import Path + +from pydantic import BaseModel, model_validator, field_serializer, AfterValidator + +from .binaries import ( + Binary, + YtdlpBinary, + WgetBinary, +) + + +# stubs +class Snapshot: + pass + +class ArchiveResult: + pass + +def get_wget_output_path(*args, **kwargs) -> Path: + return Path('.').resolve() + + + +def no_empty_args(args: List[str]) -> List[str]: + assert all(len(arg) for arg in args) + return args + +ExtractorName = Literal['wget', 'warc', 'media'] + +HandlerFuncStr = Annotated[str, AfterValidator(lambda s: s.startswith('self.'))] +CmdArgsList = Annotated[List[str], AfterValidator(no_empty_args)] + + +class Extractor(ABC, BaseModel): + name: ExtractorName + binary: Binary + + output_path_func: HandlerFuncStr = 'self.get_output_path' + should_extract_func: HandlerFuncStr = 'self.should_extract' + extract_func: HandlerFuncStr = 'self.extract' + exec_func: HandlerFuncStr = 'self.exec' + + default_args: CmdArgsList = [] + extra_args: CmdArgsList = [] + args: Optional[CmdArgsList] = None + + @model_validator(mode='after') + def validate_model(self) -> Self: + if self.args is None: + self.args = [*self.default_args, *self.extra_args] + return self + + @field_serializer('binary', when_used='json') + def dump_binary(binary) -> str: + return binary.name + + def get_output_path(self, snapshot) -> Path: + return Path(self.name) + + def should_extract(self, snapshot) -> bool: + output_dir = self.get_output_path(snapshot) + if output_dir.glob('*.*'): + return False + return True + + + def extract(self, url: str, **kwargs) -> Dict[str, Any]: + output_dir = self.get_output_path(url, **kwargs) + + cmd = [url, *self.args] if self.args is not None else [url, *self.default_args, *self.extra_args] + proc = self.exec(cmd, pwd=output_dir) + + return { + 'status': 'succeeded' if proc.returncode == 0 else 'failed', + 'output': proc.stdout.decode().strip().split('\n')[-1], + 'output_files': list(output_dir.glob('*.*')), + + 'stdout': proc.stdout.decode().strip(), + 'stderr': proc.stderr.decode().strip(), + 'returncode': proc.returncode, + } + + def exec(self, args: CmdArgsList, pwd: Optional[Path]=None): + pwd = pwd or Path('.') + assert self.binary.loaded_provider + return self.binary.exec(args, pwd=pwd) + + +class YtdlpExtractor(Extractor): + name: ExtractorName = 'media' + binary: Binary = YtdlpBinary() + + def get_output_path(self, snapshot) -> Path: + return Path(self.name) + + +class WgetExtractor(Extractor): + name: ExtractorName = 'wget' + binary: Binary = WgetBinary() + + def get_output_path(self, snapshot) -> Path: + return get_wget_output_path(snapshot) + + +class WarcExtractor(Extractor): + name: ExtractorName = 
'warc' + binary: Binary = WgetBinary() + + def get_output_path(self, snapshot) -> Path: + return get_wget_output_path(snapshot) + + diff --git a/archivebox/plugantic/ini_to_toml.py b/archivebox/plugantic/ini_to_toml.py new file mode 100644 index 00000000..eec21f74 --- /dev/null +++ b/archivebox/plugantic/ini_to_toml.py @@ -0,0 +1,396 @@ +from typing import Dict, Any, List + +import configparser +import json +import ast + +JSONValue = str | bool | int | None | List['JSONValue'] + +def load_ini_value(val: str) -> JSONValue: + """Convert lax INI values into strict TOML-compliant (JSON) values""" + if val.lower() in ('true', 'yes', '1'): + return True + if val.lower() in ('false', 'no', '0'): + return False + if val.isdigit(): + return int(val) + + try: + return ast.literal_eval(val) + except Exception: + pass + + try: + return json.loads(val) + except Exception as err: + pass + + return val + + +def convert(ini_str: str) -> str: + """Convert a string of INI config into its TOML equivalent (warning: strips comments)""" + + config = configparser.ConfigParser() + config.optionxform = str # capitalize key names + config.read_string(ini_str) + + # Initialize an empty dictionary to store the TOML representation + toml_dict = {} + + # Iterate over each section in the INI configuration + for section in config.sections(): + toml_dict[section] = {} + + # Iterate over each key-value pair in the section + for key, value in config.items(section): + parsed_value = load_ini_value(value) + + # Convert the parsed value to its TOML-compatible JSON representation + toml_dict[section.upper()][key.upper()] = json.dumps(parsed_value) + + # Build the TOML string + toml_str = "" + for section, items in toml_dict.items(): + toml_str += f"[{section}]\n" + for key, value in items.items(): + toml_str += f"{key} = {value}\n" + toml_str += "\n" + + return toml_str.strip() + + + +### Basic Assertions + +test_input = """ +[SERVER_CONFIG] +IS_TTY=False +USE_COLOR=False +SHOW_PROGRESS=False +IN_DOCKER=False +IN_QEMU=False +PUID=501 +PGID=20 +OUTPUT_DIR=/opt/archivebox/data +CONFIG_FILE=/opt/archivebox/data/ArchiveBox.conf +ONLY_NEW=True +TIMEOUT=60 +MEDIA_TIMEOUT=3600 +OUTPUT_PERMISSIONS=644 +RESTRICT_FILE_NAMES=windows +URL_DENYLIST=\.(css|js|otf|ttf|woff|woff2|gstatic\.com|googleapis\.com/css)(\?.*)?$ +URL_ALLOWLIST=None +ADMIN_USERNAME=None +ADMIN_PASSWORD=None +ENFORCE_ATOMIC_WRITES=True +TAG_SEPARATOR_PATTERN=[,] +SECRET_KEY=xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx +BIND_ADDR=127.0.0.1:8000 +ALLOWED_HOSTS=* +DEBUG=False +PUBLIC_INDEX=True +PUBLIC_SNAPSHOTS=True +PUBLIC_ADD_VIEW=False +FOOTER_INFO=Content is hosted for personal archiving purposes only. Contact server owner for any takedown requests. 
+SNAPSHOTS_PER_PAGE=40 +CUSTOM_TEMPLATES_DIR=None +TIME_ZONE=UTC +TIMEZONE=UTC +REVERSE_PROXY_USER_HEADER=Remote-User +REVERSE_PROXY_WHITELIST= +LOGOUT_REDIRECT_URL=/ +PREVIEW_ORIGINALS=True +LDAP=False +LDAP_SERVER_URI=None +LDAP_BIND_DN=None +LDAP_BIND_PASSWORD=None +LDAP_USER_BASE=None +LDAP_USER_FILTER=None +LDAP_USERNAME_ATTR=None +LDAP_FIRSTNAME_ATTR=None +LDAP_LASTNAME_ATTR=None +LDAP_EMAIL_ATTR=None +LDAP_CREATE_SUPERUSER=False +SAVE_TITLE=True +SAVE_FAVICON=True +SAVE_WGET=True +SAVE_WGET_REQUISITES=True +SAVE_SINGLEFILE=True +SAVE_READABILITY=True +SAVE_MERCURY=True +SAVE_HTMLTOTEXT=True +SAVE_PDF=True +SAVE_SCREENSHOT=True +SAVE_DOM=True +SAVE_HEADERS=True +SAVE_WARC=True +SAVE_GIT=True +SAVE_MEDIA=True +SAVE_ARCHIVE_DOT_ORG=True +RESOLUTION=1440,2000 +GIT_DOMAINS=github.com,bitbucket.org,gitlab.com,gist.github.com,codeberg.org,gitea.com,git.sr.ht +CHECK_SSL_VALIDITY=True +MEDIA_MAX_SIZE=750m +USER_AGENT=None +CURL_USER_AGENT=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 ArchiveBox/0.8.0 (+https://github.com/ArchiveBox/ArchiveBox/) curl/curl 8.4.0 (x86_64-apple-darwin23.0) +WGET_USER_AGENT=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 ArchiveBox/0.8.0 (+https://github.com/ArchiveBox/ArchiveBox/) wget/GNU Wget 1.24.5 +CHROME_USER_AGENT=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 ArchiveBox/0.8.0 (+https://github.com/ArchiveBox/ArchiveBox/) +COOKIES_FILE=None +CHROME_USER_DATA_DIR=None +CHROME_TIMEOUT=0 +CHROME_HEADLESS=True +CHROME_SANDBOX=True +CHROME_EXTRA_ARGS=[] +YOUTUBEDL_ARGS=['--restrict-filenames', '--trim-filenames', '128', '--write-description', '--write-info-json', '--write-annotations', '--write-thumbnail', '--no-call-home', '--write-sub', '--write-auto-subs', '--convert-subs=srt', '--yes-playlist', '--continue', '--no-abort-on-error', '--ignore-errors', '--geo-bypass', '--add-metadata', '--format=(bv*+ba/b)[filesize<=750m][filesize_approx<=?750m]/(bv*+ba/b)'] +YOUTUBEDL_EXTRA_ARGS=[] +WGET_ARGS=['--no-verbose', '--adjust-extension', '--convert-links', '--force-directories', '--backup-converted', '--span-hosts', '--no-parent', '-e', 'robots=off'] +WGET_EXTRA_ARGS=[] +CURL_ARGS=['--silent', '--location', '--compressed'] +CURL_EXTRA_ARGS=[] +GIT_ARGS=['--recursive'] +SINGLEFILE_ARGS=[] +SINGLEFILE_EXTRA_ARGS=[] +MERCURY_ARGS=['--format=text'] +MERCURY_EXTRA_ARGS=[] +FAVICON_PROVIDER=https://www.google.com/s2/favicons?domain={} +USE_INDEXING_BACKEND=True +USE_SEARCHING_BACKEND=True +SEARCH_BACKEND_ENGINE=ripgrep +SEARCH_BACKEND_HOST_NAME=localhost +SEARCH_BACKEND_PORT=1491 +SEARCH_BACKEND_PASSWORD=SecretPassword +SEARCH_PROCESS_HTML=True +SONIC_COLLECTION=archivebox +SONIC_BUCKET=snapshots +SEARCH_BACKEND_TIMEOUT=90 +FTS_SEPARATE_DATABASE=True +FTS_TOKENIZERS=porter unicode61 remove_diacritics 2 +FTS_SQLITE_MAX_LENGTH=1000000000 +USE_CURL=True +USE_WGET=True +USE_SINGLEFILE=True +USE_READABILITY=True +USE_MERCURY=True +USE_GIT=True +USE_CHROME=True +USE_NODE=True +USE_YOUTUBEDL=True +USE_RIPGREP=True +CURL_BINARY=curl +GIT_BINARY=git +WGET_BINARY=wget +SINGLEFILE_BINARY=single-file +READABILITY_BINARY=readability-extractor +MERCURY_BINARY=postlight-parser +YOUTUBEDL_BINARY=yt-dlp +NODE_BINARY=node +RIPGREP_BINARY=rg +CHROME_BINARY=chrome +POCKET_CONSUMER_KEY=None +USER=squash +PACKAGE_DIR=/opt/archivebox/archivebox 
+TEMPLATES_DIR=/opt/archivebox/archivebox/templates +ARCHIVE_DIR=/opt/archivebox/data/archive +SOURCES_DIR=/opt/archivebox/data/sources +LOGS_DIR=/opt/archivebox/data/logs +PERSONAS_DIR=/opt/archivebox/data/personas +URL_DENYLIST_PTN=re.compile('\\.(css|js|otf|ttf|woff|woff2|gstatic\\.com|googleapis\\.com/css)(\\?.*)?$', re.IGNORECASE|re.MULTILINE) +URL_ALLOWLIST_PTN=None +DIR_OUTPUT_PERMISSIONS=755 +ARCHIVEBOX_BINARY=/opt/archivebox/.venv/bin/archivebox +VERSION=0.8.0 +COMMIT_HASH=102e87578c6036bb0132dd1ebd17f8f05ffc880f +BUILD_TIME=2024-05-15 03:28:05 1715768885 +VERSIONS_AVAILABLE=None +CAN_UPGRADE=False +PYTHON_BINARY=/opt/archivebox/.venv/bin/python3.10 +PYTHON_ENCODING=UTF-8 +PYTHON_VERSION=3.10.14 +DJANGO_BINARY=/opt/archivebox/.venv/lib/python3.10/site-packages/django/__init__.py +DJANGO_VERSION=5.0.6 final (0) +SQLITE_BINARY=/opt/homebrew/Cellar/python@3.10/3.10.14/Frameworks/Python.framework/Versions/3.10/lib/python3.10/sqlite3/dbapi2.py +SQLITE_VERSION=2.6.0 +CURL_VERSION=curl 8.4.0 (x86_64-apple-darwin23.0) +WGET_VERSION=GNU Wget 1.24.5 +WGET_AUTO_COMPRESSION=True +RIPGREP_VERSION=ripgrep 14.1.0 +SINGLEFILE_VERSION=None +READABILITY_VERSION=None +MERCURY_VERSION=None +GIT_VERSION=git version 2.44.0 +YOUTUBEDL_VERSION=2024.04.09 +CHROME_VERSION=Google Chrome 124.0.6367.207 +NODE_VERSION=v21.7.3 +""" + + +expected_output = '''[SERVER_CONFIG] +IS_TTY = false +USE_COLOR = false +SHOW_PROGRESS = false +IN_DOCKER = false +IN_QEMU = false +PUID = 501 +PGID = 20 +OUTPUT_DIR = "/opt/archivebox/data" +CONFIG_FILE = "/opt/archivebox/data/ArchiveBox.conf" +ONLY_NEW = true +TIMEOUT = 60 +MEDIA_TIMEOUT = 3600 +OUTPUT_PERMISSIONS = 644 +RESTRICT_FILE_NAMES = "windows" +URL_DENYLIST = "\\\\.(css|js|otf|ttf|woff|woff2|gstatic\\\\.com|googleapis\\\\.com/css)(\\\\?.*)?$" +URL_ALLOWLIST = null +ADMIN_USERNAME = null +ADMIN_PASSWORD = null +ENFORCE_ATOMIC_WRITES = true +TAG_SEPARATOR_PATTERN = "[,]" +SECRET_KEY = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx" +BIND_ADDR = "127.0.0.1:8000" +ALLOWED_HOSTS = "*" +DEBUG = false +PUBLIC_INDEX = true +PUBLIC_SNAPSHOTS = true +PUBLIC_ADD_VIEW = false +FOOTER_INFO = "Content is hosted for personal archiving purposes only. Contact server owner for any takedown requests." 
+SNAPSHOTS_PER_PAGE = 40 +CUSTOM_TEMPLATES_DIR = null +TIME_ZONE = "UTC" +TIMEZONE = "UTC" +REVERSE_PROXY_USER_HEADER = "Remote-User" +REVERSE_PROXY_WHITELIST = "" +LOGOUT_REDIRECT_URL = "/" +PREVIEW_ORIGINALS = true +LDAP = false +LDAP_SERVER_URI = null +LDAP_BIND_DN = null +LDAP_BIND_PASSWORD = null +LDAP_USER_BASE = null +LDAP_USER_FILTER = null +LDAP_USERNAME_ATTR = null +LDAP_FIRSTNAME_ATTR = null +LDAP_LASTNAME_ATTR = null +LDAP_EMAIL_ATTR = null +LDAP_CREATE_SUPERUSER = false +SAVE_TITLE = true +SAVE_FAVICON = true +SAVE_WGET = true +SAVE_WGET_REQUISITES = true +SAVE_SINGLEFILE = true +SAVE_READABILITY = true +SAVE_MERCURY = true +SAVE_HTMLTOTEXT = true +SAVE_PDF = true +SAVE_SCREENSHOT = true +SAVE_DOM = true +SAVE_HEADERS = true +SAVE_WARC = true +SAVE_GIT = true +SAVE_MEDIA = true +SAVE_ARCHIVE_DOT_ORG = true +RESOLUTION = [1440, 2000] +GIT_DOMAINS = "github.com,bitbucket.org,gitlab.com,gist.github.com,codeberg.org,gitea.com,git.sr.ht" +CHECK_SSL_VALIDITY = true +MEDIA_MAX_SIZE = "750m" +USER_AGENT = null +CURL_USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 ArchiveBox/0.8.0 (+https://github.com/ArchiveBox/ArchiveBox/) curl/curl 8.4.0 (x86_64-apple-darwin23.0)" +WGET_USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 ArchiveBox/0.8.0 (+https://github.com/ArchiveBox/ArchiveBox/) wget/GNU Wget 1.24.5" +CHROME_USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 ArchiveBox/0.8.0 (+https://github.com/ArchiveBox/ArchiveBox/)" +COOKIES_FILE = null +CHROME_USER_DATA_DIR = null +CHROME_TIMEOUT = false +CHROME_HEADLESS = true +CHROME_SANDBOX = true +CHROME_EXTRA_ARGS = [] +YOUTUBEDL_ARGS = ["--restrict-filenames", "--trim-filenames", "128", "--write-description", "--write-info-json", "--write-annotations", "--write-thumbnail", "--no-call-home", "--write-sub", "--write-auto-subs", "--convert-subs=srt", "--yes-playlist", "--continue", "--no-abort-on-error", "--ignore-errors", "--geo-bypass", "--add-metadata", "--format=(bv*+ba/b)[filesize<=750m][filesize_approx<=?750m]/(bv*+ba/b)"] +YOUTUBEDL_EXTRA_ARGS = [] +WGET_ARGS = ["--no-verbose", "--adjust-extension", "--convert-links", "--force-directories", "--backup-converted", "--span-hosts", "--no-parent", "-e", "robots=off"] +WGET_EXTRA_ARGS = [] +CURL_ARGS = ["--silent", "--location", "--compressed"] +CURL_EXTRA_ARGS = [] +GIT_ARGS = ["--recursive"] +SINGLEFILE_ARGS = [] +SINGLEFILE_EXTRA_ARGS = [] +MERCURY_ARGS = ["--format=text"] +MERCURY_EXTRA_ARGS = [] +FAVICON_PROVIDER = "https://www.google.com/s2/favicons?domain={}" +USE_INDEXING_BACKEND = true +USE_SEARCHING_BACKEND = true +SEARCH_BACKEND_ENGINE = "ripgrep" +SEARCH_BACKEND_HOST_NAME = "localhost" +SEARCH_BACKEND_PORT = 1491 +SEARCH_BACKEND_PASSWORD = "SecretPassword" +SEARCH_PROCESS_HTML = true +SONIC_COLLECTION = "archivebox" +SONIC_BUCKET = "snapshots" +SEARCH_BACKEND_TIMEOUT = 90 +FTS_SEPARATE_DATABASE = true +FTS_TOKENIZERS = "porter unicode61 remove_diacritics 2" +FTS_SQLITE_MAX_LENGTH = 1000000000 +USE_CURL = true +USE_WGET = true +USE_SINGLEFILE = true +USE_READABILITY = true +USE_MERCURY = true +USE_GIT = true +USE_CHROME = true +USE_NODE = true +USE_YOUTUBEDL = true +USE_RIPGREP = true +CURL_BINARY = "curl" +GIT_BINARY = "git" +WGET_BINARY = "wget" +SINGLEFILE_BINARY = "single-file" +READABILITY_BINARY = "readability-extractor" 
+MERCURY_BINARY = "postlight-parser" +YOUTUBEDL_BINARY = "yt-dlp" +NODE_BINARY = "node" +RIPGREP_BINARY = "rg" +CHROME_BINARY = "chrome" +POCKET_CONSUMER_KEY = null +USER = "squash" +PACKAGE_DIR = "/opt/archivebox/archivebox" +TEMPLATES_DIR = "/opt/archivebox/archivebox/templates" +ARCHIVE_DIR = "/opt/archivebox/data/archive" +SOURCES_DIR = "/opt/archivebox/data/sources" +LOGS_DIR = "/opt/archivebox/data/logs" +PERSONAS_DIR = "/opt/archivebox/data/personas" +URL_DENYLIST_PTN = "re.compile(\'\\\\.(css|js|otf|ttf|woff|woff2|gstatic\\\\.com|googleapis\\\\.com/css)(\\\\?.*)?$\', re.IGNORECASE|re.MULTILINE)" +URL_ALLOWLIST_PTN = null +DIR_OUTPUT_PERMISSIONS = 755 +ARCHIVEBOX_BINARY = "/opt/archivebox/.venv/bin/archivebox" +VERSION = "0.8.0" +COMMIT_HASH = "102e87578c6036bb0132dd1ebd17f8f05ffc880f" +BUILD_TIME = "2024-05-15 03:28:05 1715768885" +VERSIONS_AVAILABLE = null +CAN_UPGRADE = false +PYTHON_BINARY = "/opt/archivebox/.venv/bin/python3.10" +PYTHON_ENCODING = "UTF-8" +PYTHON_VERSION = "3.10.14" +DJANGO_BINARY = "/opt/archivebox/.venv/lib/python3.10/site-packages/django/__init__.py" +DJANGO_VERSION = "5.0.6 final (0)" +SQLITE_BINARY = "/opt/homebrew/Cellar/python@3.10/3.10.14/Frameworks/Python.framework/Versions/3.10/lib/python3.10/sqlite3/dbapi2.py" +SQLITE_VERSION = "2.6.0" +CURL_VERSION = "curl 8.4.0 (x86_64-apple-darwin23.0)" +WGET_VERSION = "GNU Wget 1.24.5" +WGET_AUTO_COMPRESSION = true +RIPGREP_VERSION = "ripgrep 14.1.0" +SINGLEFILE_VERSION = null +READABILITY_VERSION = null +MERCURY_VERSION = null +GIT_VERSION = "git version 2.44.0" +YOUTUBEDL_VERSION = "2024.04.09" +CHROME_VERSION = "Google Chrome 124.0.6367.207" +NODE_VERSION = "v21.7.3"''' + + +first_output = convert(test_input) # make sure ini -> toml parses correctly +second_output = convert(first_output) # make sure toml -> toml parses/dumps consistently +assert first_output == second_output == expected_output # make sure parsing is indempotent + +# # DEBUGGING +# import sys +# import difflib +# sys.stdout.writelines(difflib.context_diff(first_output, second_output, fromfile='first', tofile='second')) +# print(repr(second_output)) diff --git a/archivebox/plugantic/migrations/0001_initial.py b/archivebox/plugantic/migrations/0001_initial.py new file mode 100644 index 00000000..7e209f59 --- /dev/null +++ b/archivebox/plugantic/migrations/0001_initial.py @@ -0,0 +1,38 @@ +# Generated by Django 5.0.6 on 2024-05-18 00:16 + +import abid_utils.models +import archivebox.plugantic.plugins +import charidfield.fields +import django.core.serializers.json +import django.db.models.deletion +import django_pydantic_field.fields +import uuid +from django.conf import settings +from django.db import migrations, models + + +class Migration(migrations.Migration): + + initial = True + + dependencies = [ + migrations.swappable_dependency(settings.AUTH_USER_MODEL), + ] + + operations = [ + migrations.CreateModel( + name='Plugin', + fields=[ + ('created', models.DateTimeField(auto_now_add=True)), + ('modified', models.DateTimeField(auto_now=True)), + ('id', models.UUIDField(default=uuid.uuid4, editable=False, primary_key=True, serialize=False)), + ('uuid', models.UUIDField(blank=True, null=True, unique=True)), + ('abid', charidfield.fields.CharIDField(blank=True, db_index=True, default=None, help_text='ABID-format identifier for this entity (e.g. 
snp_01BJQMF54D093DXEAWZ6JYRPAQ)', max_length=30, null=True, prefix='plg_', unique=True)), + ('schema', django_pydantic_field.fields.PydanticSchemaField(config=None, encoder=django.core.serializers.json.DjangoJSONEncoder, schema=archivebox.plugantic.plugins.Plugin)), + ('created_by', models.ForeignKey(default=abid_utils.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL)), + ], + options={ + 'abstract': False, + }, + ), + ] diff --git a/archivebox/plugantic/migrations/0002_alter_plugin_schema.py b/archivebox/plugantic/migrations/0002_alter_plugin_schema.py new file mode 100644 index 00000000..152e2eb3 --- /dev/null +++ b/archivebox/plugantic/migrations/0002_alter_plugin_schema.py @@ -0,0 +1,21 @@ +# Generated by Django 5.0.6 on 2024-05-18 01:16 + +import archivebox.plugantic.plugins +import django.core.serializers.json +import django_pydantic_field.fields +from django.db import migrations + + +class Migration(migrations.Migration): + + dependencies = [ + ('plugantic', '0001_initial'), + ] + + operations = [ + migrations.AlterField( + model_name='plugin', + name='schema', + field=django_pydantic_field.fields.PydanticSchemaField(config=None, default=None, encoder=django.core.serializers.json.DjangoJSONEncoder, schema=archivebox.plugantic.plugins.Plugin), + ), + ] diff --git a/archivebox/plugantic/migrations/0003_alter_plugin_schema.py b/archivebox/plugantic/migrations/0003_alter_plugin_schema.py new file mode 100644 index 00000000..754ec3b0 --- /dev/null +++ b/archivebox/plugantic/migrations/0003_alter_plugin_schema.py @@ -0,0 +1,21 @@ +# Generated by Django 5.0.6 on 2024-05-18 01:25 + +import archivebox.plugantic.replayers +import django.core.serializers.json +import django_pydantic_field.fields +from django.db import migrations + + +class Migration(migrations.Migration): + + dependencies = [ + ('plugantic', '0002_alter_plugin_schema'), + ] + + operations = [ + migrations.AlterField( + model_name='plugin', + name='schema', + field=django_pydantic_field.fields.PydanticSchemaField(config=None, default={'embed_template': 'plugins/generic_replayer/templates/embed.html', 'fullpage_template': 'plugins/generic_replayer/templates/fullpage.html', 'name': 'GenericReplayer', 'row_template': 'plugins/generic_replayer/templates/row.html', 'url_pattern': '*'}, encoder=django.core.serializers.json.DjangoJSONEncoder, schema=archivebox.plugantic.replayers.Replayer), + ), + ] diff --git a/archivebox/plugantic/migrations/0004_remove_plugin_schema_plugin_configs_plugin_name.py b/archivebox/plugantic/migrations/0004_remove_plugin_schema_plugin_configs_plugin_name.py new file mode 100644 index 00000000..fce99723 --- /dev/null +++ b/archivebox/plugantic/migrations/0004_remove_plugin_schema_plugin_configs_plugin_name.py @@ -0,0 +1,32 @@ +# Generated by Django 5.0.6 on 2024-05-18 01:28 + +import archivebox.plugantic.configs +import django.core.serializers.json +import django_pydantic_field.compat.django +import django_pydantic_field.fields +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('plugantic', '0003_alter_plugin_schema'), + ] + + operations = [ + migrations.RemoveField( + model_name='plugin', + name='schema', + ), + migrations.AddField( + model_name='plugin', + name='configs', + field=django_pydantic_field.fields.PydanticSchemaField(config=None, default=[], encoder=django.core.serializers.json.DjangoJSONEncoder, schema=django_pydantic_field.compat.django.GenericContainer(list, 
(archivebox.plugantic.configs.ConfigSet,))), + ), + migrations.AddField( + model_name='plugin', + name='name', + field=models.CharField(default='name', max_length=64, unique=True), + preserve_default=False, + ), + ] diff --git a/archivebox/plugantic/migrations/0005_customplugin_delete_plugin.py b/archivebox/plugantic/migrations/0005_customplugin_delete_plugin.py new file mode 100644 index 00000000..31ac4a94 --- /dev/null +++ b/archivebox/plugantic/migrations/0005_customplugin_delete_plugin.py @@ -0,0 +1,39 @@ +# Generated by Django 5.0.6 on 2024-05-18 01:42 + +import abid_utils.models +import charidfield.fields +import django.db.models.deletion +import pathlib +import uuid +from django.conf import settings +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('plugantic', '0004_remove_plugin_schema_plugin_configs_plugin_name'), + migrations.swappable_dependency(settings.AUTH_USER_MODEL), + ] + + operations = [ + migrations.CreateModel( + name='CustomPlugin', + fields=[ + ('created', models.DateTimeField(auto_now_add=True)), + ('modified', models.DateTimeField(auto_now=True)), + ('id', models.UUIDField(default=uuid.uuid4, editable=False, primary_key=True, serialize=False)), + ('uuid', models.UUIDField(blank=True, null=True, unique=True)), + ('abid', charidfield.fields.CharIDField(blank=True, db_index=True, default=None, help_text='ABID-format identifier for this entity (e.g. snp_01BJQMF54D093DXEAWZ6JYRPAQ)', max_length=30, null=True, prefix='plg_', unique=True)), + ('name', models.CharField(max_length=64, unique=True)), + ('path', models.FilePathField(path=pathlib.PurePosixPath('/Volumes/NVME/Users/squash/Local/Code/archiveboxes/ArchiveBox/archivebox/plugins'))), + ('created_by', models.ForeignKey(default=abid_utils.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL)), + ], + options={ + 'abstract': False, + }, + ), + migrations.DeleteModel( + name='Plugin', + ), + ] diff --git a/archivebox/plugantic/migrations/0006_alter_customplugin_path.py b/archivebox/plugantic/migrations/0006_alter_customplugin_path.py new file mode 100644 index 00000000..facf6604 --- /dev/null +++ b/archivebox/plugantic/migrations/0006_alter_customplugin_path.py @@ -0,0 +1,19 @@ +# Generated by Django 5.0.6 on 2024-05-18 01:45 + +import pathlib +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('plugantic', '0005_customplugin_delete_plugin'), + ] + + operations = [ + migrations.AlterField( + model_name='customplugin', + name='path', + field=models.FilePathField(allow_files=False, allow_folders=True, path=pathlib.PurePosixPath('/Volumes/NVME/Users/squash/Local/Code/archiveboxes/ArchiveBox/archivebox/plugins'), recursive=True), + ), + ] diff --git a/archivebox/plugantic/migrations/0007_alter_customplugin_path.py b/archivebox/plugantic/migrations/0007_alter_customplugin_path.py new file mode 100644 index 00000000..0c78fad8 --- /dev/null +++ b/archivebox/plugantic/migrations/0007_alter_customplugin_path.py @@ -0,0 +1,19 @@ +# Generated by Django 5.0.6 on 2024-05-18 01:46 + +import pathlib +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('plugantic', '0006_alter_customplugin_path'), + ] + + operations = [ + migrations.AlterField( + model_name='customplugin', + name='path', + field=models.FilePathField(allow_files=False, allow_folders=True, 
path=pathlib.PurePosixPath('/Volumes/NVME/Users/squash/Local/Code/archiveboxes/ArchiveBox/data/plugins'), recursive=True), + ), + ] diff --git a/archivebox/plugantic/migrations/0008_alter_customplugin_path.py b/archivebox/plugantic/migrations/0008_alter_customplugin_path.py new file mode 100644 index 00000000..087fe0fc --- /dev/null +++ b/archivebox/plugantic/migrations/0008_alter_customplugin_path.py @@ -0,0 +1,19 @@ +# Generated by Django 5.0.6 on 2024-05-18 01:47 + +import pathlib +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('plugantic', '0007_alter_customplugin_path'), + ] + + operations = [ + migrations.AlterField( + model_name='customplugin', + name='path', + field=models.FilePathField(allow_files=False, allow_folders=True, path=pathlib.PurePosixPath('/Volumes/NVME/Users/squash/Local/Code/archiveboxes/ArchiveBox/data'), recursive=True), + ), + ] diff --git a/archivebox/plugantic/migrations/0009_alter_customplugin_path.py b/archivebox/plugantic/migrations/0009_alter_customplugin_path.py new file mode 100644 index 00000000..57ab3e79 --- /dev/null +++ b/archivebox/plugantic/migrations/0009_alter_customplugin_path.py @@ -0,0 +1,18 @@ +# Generated by Django 5.0.6 on 2024-05-18 01:48 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('plugantic', '0008_alter_customplugin_path'), + ] + + operations = [ + migrations.AlterField( + model_name='customplugin', + name='path', + field=models.FilePathField(allow_files=False, allow_folders=True, path='/Volumes/NVME/Users/squash/Local/Code/archiveboxes/ArchiveBox/data', recursive=True), + ), + ] diff --git a/archivebox/plugantic/migrations/0010_alter_customplugin_path.py b/archivebox/plugantic/migrations/0010_alter_customplugin_path.py new file mode 100644 index 00000000..4a8fbd88 --- /dev/null +++ b/archivebox/plugantic/migrations/0010_alter_customplugin_path.py @@ -0,0 +1,18 @@ +# Generated by Django 5.0.6 on 2024-05-18 01:48 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('plugantic', '0009_alter_customplugin_path'), + ] + + operations = [ + migrations.AlterField( + model_name='customplugin', + name='path', + field=models.FilePathField(allow_files=False, allow_folders=True, match='/plugins/*', path='/Volumes/NVME/Users/squash/Local/Code/archiveboxes/ArchiveBox/data', recursive=True), + ), + ] diff --git a/archivebox/plugantic/migrations/0011_alter_customplugin_path.py b/archivebox/plugantic/migrations/0011_alter_customplugin_path.py new file mode 100644 index 00000000..e89b7137 --- /dev/null +++ b/archivebox/plugantic/migrations/0011_alter_customplugin_path.py @@ -0,0 +1,18 @@ +# Generated by Django 5.0.6 on 2024-05-18 01:48 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('plugantic', '0010_alter_customplugin_path'), + ] + + operations = [ + migrations.AlterField( + model_name='customplugin', + name='path', + field=models.FilePathField(allow_files=False, allow_folders=True, match='plugins/*', path='/Volumes/NVME/Users/squash/Local/Code/archiveboxes/ArchiveBox/data', recursive=True), + ), + ] diff --git a/archivebox/plugantic/migrations/0012_alter_customplugin_path.py b/archivebox/plugantic/migrations/0012_alter_customplugin_path.py new file mode 100644 index 00000000..0e3fe5a5 --- /dev/null +++ b/archivebox/plugantic/migrations/0012_alter_customplugin_path.py @@ -0,0 +1,18 @@ +# Generated by Django 
5.0.6 on 2024-05-18 01:49 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('plugantic', '0011_alter_customplugin_path'), + ] + + operations = [ + migrations.AlterField( + model_name='customplugin', + name='path', + field=models.FilePathField(allow_files=False, allow_folders=True, default='example_plugin', match='plugins/*', path='/Volumes/NVME/Users/squash/Local/Code/archiveboxes/ArchiveBox/data', recursive=True), + ), + ] diff --git a/archivebox/plugantic/migrations/0013_alter_customplugin_path.py b/archivebox/plugantic/migrations/0013_alter_customplugin_path.py new file mode 100644 index 00000000..4c4069ed --- /dev/null +++ b/archivebox/plugantic/migrations/0013_alter_customplugin_path.py @@ -0,0 +1,18 @@ +# Generated by Django 5.0.6 on 2024-05-18 01:49 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('plugantic', '0012_alter_customplugin_path'), + ] + + operations = [ + migrations.AlterField( + model_name='customplugin', + name='path', + field=models.FilePathField(allow_files=False, allow_folders=True, default='/plugins/example_plugin', match='plugins/*', path='/Volumes/NVME/Users/squash/Local/Code/archiveboxes/ArchiveBox/data', recursive=True), + ), + ] diff --git a/archivebox/plugantic/migrations/0014_alter_customplugin_path.py b/archivebox/plugantic/migrations/0014_alter_customplugin_path.py new file mode 100644 index 00000000..f3424dc6 --- /dev/null +++ b/archivebox/plugantic/migrations/0014_alter_customplugin_path.py @@ -0,0 +1,18 @@ +# Generated by Django 5.0.6 on 2024-05-18 01:50 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('plugantic', '0013_alter_customplugin_path'), + ] + + operations = [ + migrations.AlterField( + model_name='customplugin', + name='path', + field=models.FilePathField(allow_files=False, allow_folders=True, default='/plugins/example_plugin', match='*', path='/Volumes/NVME/Users/squash/Local/Code/archiveboxes/ArchiveBox/data/plugins', recursive=True), + ), + ] diff --git a/archivebox/plugantic/migrations/0015_alter_customplugin_path.py b/archivebox/plugantic/migrations/0015_alter_customplugin_path.py new file mode 100644 index 00000000..a6c9a270 --- /dev/null +++ b/archivebox/plugantic/migrations/0015_alter_customplugin_path.py @@ -0,0 +1,18 @@ +# Generated by Django 5.0.6 on 2024-05-18 01:51 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('plugantic', '0014_alter_customplugin_path'), + ] + + operations = [ + migrations.AlterField( + model_name='customplugin', + name='path', + field=models.FilePathField(allow_files=False, allow_folders=True, match='*', path='/Volumes/NVME/Users/squash/Local/Code/archiveboxes/ArchiveBox/data/plugins', recursive=True), + ), + ] diff --git a/archivebox/plugantic/migrations/0016_delete_customplugin.py b/archivebox/plugantic/migrations/0016_delete_customplugin.py new file mode 100644 index 00000000..2d06d6c5 --- /dev/null +++ b/archivebox/plugantic/migrations/0016_delete_customplugin.py @@ -0,0 +1,16 @@ +# Generated by Django 5.0.6 on 2024-05-18 01:57 + +from django.db import migrations + + +class Migration(migrations.Migration): + + dependencies = [ + ('plugantic', '0015_alter_customplugin_path'), + ] + + operations = [ + migrations.DeleteModel( + name='CustomPlugin', + ), + ] diff --git a/archivebox/plugantic/migrations/__init__.py b/archivebox/plugantic/migrations/__init__.py 
new file mode 100644 index 00000000..e69de29b diff --git a/archivebox/plugantic/models.py b/archivebox/plugantic/models.py new file mode 100644 index 00000000..7ef226ec --- /dev/null +++ b/archivebox/plugantic/models.py @@ -0,0 +1,50 @@ +__package__ = 'archivebox.plugantic' + + +# import uuid +# from django.db import models +# from typing_extensions import Self + +# from django_pydantic_field import SchemaField +# from django.conf import settings + +# from abid_utils.models import ABIDModel, ABIDField + +# # from .plugins import Plugin as PluginSchema, CORE_PLUGIN +# from .binproviders import BinProvider +# from .binaries import Binary +# from .configs import WgetOptionsConfig +# from .extractors import Extractor +# from .replayers import Replayer + + +# PLUGINS_ROOT = settings.CONFIG['OUTPUT_DIR'] / 'plugins' +# PLUGINS_ROOT.mkdir(exist_ok=True) + + +# class CustomPlugin(ABIDModel): +# abid_prefix = 'plg_' +# abid_ts_src = 'self.added' +# abid_uri_src = 'self.name' +# abid_subtype_src = '"09"' +# abid_rand_src = 'self.id' + +# id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False) # legacy pk +# uuid = models.UUIDField(blank=True, null=True, editable=True, unique=True) +# abid = ABIDField(prefix=abid_prefix) + +# name = models.CharField(max_length=64, blank=False, unique=True) + +# path = models.FilePathField(path=str(PLUGINS_ROOT), match='*', recursive=True, allow_folders=True, allow_files=False) + +# # replayers: list[Replayer] = SchemaField() +# # binaries: list[Replayer] = SchemaField() +# # extractors: list[Replayer] = SchemaField() + + +# # @classmethod +# # def from_loaded_plugin(cls, plugin: PluginSchema) -> Self: +# # new_obj = cls( +# # schema=plugin, +# # ) +# # return new_obj diff --git a/archivebox/plugantic/plugins.py b/archivebox/plugantic/plugins.py new file mode 100644 index 00000000..c34c4703 --- /dev/null +++ b/archivebox/plugantic/plugins.py @@ -0,0 +1,134 @@ +__package__ = 'archivebox.plugantic' + +from typing import List +from typing_extensions import Self + +from pydantic import ( + BaseModel, + ConfigDict, + Field, + model_validator, + validate_call, + SerializeAsAny, +) + +from .binaries import ( + Binary, + PythonBinary, + SqliteBinary, + DjangoBinary, + WgetBinary, + YtdlpBinary, +) +from .extractors import ( + Extractor, + YtdlpExtractor, + WgetExtractor, + WarcExtractor, +) +from .replayers import ( + Replayer, + GENERIC_REPLAYER, + MEDIA_REPLAYER, +) +from .configs import ( + ConfigSet, + WGET_CONFIG, +) + + +class Plugin(BaseModel): + model_config = ConfigDict(arbitrary_types_allowed=True, extra='ignore', populate_by_name=True) + + name: str = Field(default='baseplugin') # e.g. media + description: str = Field(default='') # e.g. get media using yt-dlp + + configs: List[SerializeAsAny[ConfigSet]] = Field(default=[]) + binaries: List[SerializeAsAny[Binary]] = Field(default=[]) # e.g. 
diff --git a/archivebox/plugantic/plugins.py b/archivebox/plugantic/plugins.py
new file mode 100644
index 00000000..c34c4703
--- /dev/null
+++ b/archivebox/plugantic/plugins.py
@@ -0,0 +1,134 @@
+__package__ = 'archivebox.plugantic'
+
+from typing import List
+from typing_extensions import Self
+
+from pydantic import (
+    BaseModel,
+    ConfigDict,
+    Field,
+    model_validator,
+    validate_call,
+    SerializeAsAny,
+)
+
+from .binaries import (
+    Binary,
+    PythonBinary,
+    SqliteBinary,
+    DjangoBinary,
+    WgetBinary,
+    YtdlpBinary,
+)
+from .extractors import (
+    Extractor,
+    YtdlpExtractor,
+    WgetExtractor,
+    WarcExtractor,
+)
+from .replayers import (
+    Replayer,
+    GENERIC_REPLAYER,
+    MEDIA_REPLAYER,
+)
+from .configs import (
+    ConfigSet,
+    WGET_CONFIG,
+)
+
+
+class Plugin(BaseModel):
+    model_config = ConfigDict(arbitrary_types_allowed=True, extra='ignore', populate_by_name=True)
+
+    name: str = Field(default='baseplugin')                     # e.g. media
+    description: str = Field(default='')                        # e.g. get media using yt-dlp
+
+    configs: List[SerializeAsAny[ConfigSet]] = Field(default=[])
+    binaries: List[SerializeAsAny[Binary]] = Field(default=[])  # e.g. [Binary(name='yt-dlp')]
+    extractors: List[SerializeAsAny[Extractor]] = Field(default=[])
+    replayers: List[SerializeAsAny[Replayer]] = Field(default=[])
+
+    @model_validator(mode='after')
+    def validate(self):
+        self.description = self.description or self.name
+        return self  # mode='after' model validators must return the model instance
+
+    @validate_call
+    def install(self) -> Self:
+        new_binaries = []
+        for idx, binary in enumerate(self.binaries):
+            new_binaries.append(binary.install() or binary)
+        return self.model_copy(update={
+            'binaries': new_binaries,
+        })
+
+    @validate_call
+    def load(self, cache=True) -> Self:
+        new_binaries = []
+        for idx, binary in enumerate(self.binaries):
+            new_binaries.append(binary.load(cache=cache) or binary)
+        return self.model_copy(update={
+            'binaries': new_binaries,
+        })
+
+    @validate_call
+    def load_or_install(self, cache=True) -> Self:
+        new_binaries = []
+        for idx, binary in enumerate(self.binaries):
+            new_binaries.append(binary.load_or_install(cache=cache) or binary)
+        return self.model_copy(update={
+            'binaries': new_binaries,
+        })
+
+
+class CorePlugin(Plugin):
+    name: str = 'core'
+    configs: List[SerializeAsAny[ConfigSet]] = []
+    binaries: List[SerializeAsAny[Binary]] = [PythonBinary(), SqliteBinary(), DjangoBinary()]
+    extractors: List[SerializeAsAny[Extractor]] = []
+    replayers: List[SerializeAsAny[Replayer]] = [GENERIC_REPLAYER]
+
+class YtdlpPlugin(Plugin):
+    name: str = 'ytdlp'
+    configs: List[SerializeAsAny[ConfigSet]] = []
+    binaries: List[SerializeAsAny[Binary]] = [YtdlpBinary()]
+    extractors: List[SerializeAsAny[Extractor]] = [YtdlpExtractor()]
+    replayers: List[SerializeAsAny[Replayer]] = [MEDIA_REPLAYER]
+
+class WgetPlugin(Plugin):
+    name: str = 'wget'
+    configs: List[SerializeAsAny[ConfigSet]] = [*WGET_CONFIG]
+    binaries: List[SerializeAsAny[Binary]] = [WgetBinary()]
+    extractors: List[SerializeAsAny[Extractor]] = [WgetExtractor(), WarcExtractor()]
+
+
+CORE_PLUGIN = CorePlugin()
+YTDLP_PLUGIN = YtdlpPlugin()
+WGET_PLUGIN = WgetPlugin()
+PLUGINS = [
+    CORE_PLUGIN,
+    YTDLP_PLUGIN,
+    WGET_PLUGIN,
+]
+LOADED_PLUGINS = PLUGINS
+
+
+import json
+
+for plugin in PLUGINS:
+    try:
+        json.dumps(plugin.model_json_schema(), indent=4)
+        # print(json.dumps(plugin.model_json_schema(), indent=4))
+    except Exception as err:
+        print(f'Failed to generate JSON schema for {plugin.name}')
+        raise
+
+# print('-------------------------------------BEFORE INSTALL---------------------------------')
+# for plugin in PLUGINS:
+#     print(plugin.model_dump_json(indent=4))
+# print('-------------------------------------DURING LOAD/INSTALL---------------------------------')
+# for plugin in PLUGINS:
+#     # LOADED_PLUGINS.append(plugin.install())
+# print('-------------------------------------AFTER INSTALL---------------------------------')
+# for plugin in LOADED_PLUGINS:
+#     # print(plugin.model_dump_json(indent=4))
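
A Plugin here is a declarative pydantic bundle rather than an active object: install()/load()/load_or_install() return copies with their binaries resolved, and the whole plugin serializes through the normal pydantic machinery. A rough usage sketch based only on the classes defined above (the loaded_* attribute names are taken from how views.py below reads them):

    # Sketch of expected usage, assuming the Binary class behaves as views.py below implies:
    plugin = WgetPlugin()                        # purely declarative, nothing resolved yet
    loaded = plugin.load_or_install()            # copy of the plugin with each Binary loaded or installed
    for binary in loaded.binaries:
        print(binary.name, binary.loaded_abspath, binary.loaded_version)
    print(loaded.model_dump_json(indent=4))      # plugins serialize like any other pydantic model
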
diff --git a/archivebox/plugantic/replayers.py b/archivebox/plugantic/replayers.py
new file mode 100644
index 00000000..12ade623
--- /dev/null
+++ b/archivebox/plugantic/replayers.py
@@ -0,0 +1,26 @@
+__package__ = 'archivebox.plugantic'
+
+
+from pydantic import BaseModel
+
+# from .binproviders import LazyImportStr
+
+
+class Replayer(BaseModel):
+    """Describes how to render an ArchiveResult in several contexts"""
+    name: str = 'GenericReplayer'
+    url_pattern: str = '*'
+
+    row_template: str = 'plugins/generic_replayer/templates/row.html'
+    embed_template: str = 'plugins/generic_replayer/templates/embed.html'
+    fullpage_template: str = 'plugins/generic_replayer/templates/fullpage.html'
+
+    # row_view: LazyImportStr = 'plugins.generic_replayer.views.row_view'
+    # embed_view: LazyImportStr = 'plugins.generic_replayer.views.embed_view'
+    # fullpage_view: LazyImportStr = 'plugins.generic_replayer.views.fullpage_view'
+    # icon_view: LazyImportStr = 'plugins.generic_replayer.views.get_icon'
+    # thumbnail_view: LazyImportStr = 'plugins.generic_replayer.views.get_icon'
+
+
+GENERIC_REPLAYER = Replayer(name='generic')
+MEDIA_REPLAYER = Replayer(name='media')
diff --git a/archivebox/plugantic/tests.py b/archivebox/plugantic/tests.py
new file mode 100644
index 00000000..7ce503c2
--- /dev/null
+++ b/archivebox/plugantic/tests.py
@@ -0,0 +1,3 @@
+from django.test import TestCase
+
+# Create your tests here.
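
A Replayer is just a small pydantic record that maps a URL pattern to the row/embed/fullpage templates used to render an ArchiveResult. A hypothetical format-specific replayer would only override those fields; the template paths below are placeholders for illustration and do not exist in this diff:

    # Hypothetical example of a more specific replayer (paths are illustrative only):
    PDF_REPLAYER = Replayer(
        name='pdf',
        url_pattern='*.pdf',
        row_template='plugins/pdf_replayer/templates/row.html',
        embed_template='plugins/pdf_replayer/templates/embed.html',
        fullpage_template='plugins/pdf_replayer/templates/fullpage.html',
    )
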
diff --git a/archivebox/plugantic/views.py b/archivebox/plugantic/views.py
new file mode 100644
index 00000000..b29a8cf5
--- /dev/null
+++ b/archivebox/plugantic/views.py
@@ -0,0 +1,169 @@
+__package__ = 'archivebox.plugantic'
+
+from django.http import HttpRequest
+from django.utils.html import format_html, mark_safe
+
+from admin_data_views.typing import TableContext, ItemContext
+from admin_data_views.utils import render_with_table_view, render_with_item_view, ItemLink
+
+
+from plugantic.plugins import LOADED_PLUGINS
+from django.conf import settings
+
+
+@render_with_table_view
+def binaries_list_view(request: HttpRequest, **kwargs) -> TableContext:
+
+    assert request.user.is_superuser, 'Must be a superuser to view configuration settings.'
+
+    rows = {
+        "Binary": [],
+        "From Plugin": [],
+        "Found Version": [],
+        "Provided By": [],
+        "Found Abspath": [],
+        "Related Configuration": [],
+        "Overrides": [],
+        "Description": [],
+    }
+
+    relevant_configs = {
+        key: val
+        for key, val in settings.CONFIG.items()
+        if '_BINARY' in key or '_VERSION' in key
+    }
+
+    for plugin in LOADED_PLUGINS:
+        for binary in plugin.binaries:
+            binary = binary.load_or_install()
+
+            rows['Binary'].append(ItemLink(binary.name, key=binary.name))
+            rows['From Plugin'].append(plugin.name)
+            rows['Found Version'].append(binary.loaded_version)
+            rows['Provided By'].append(binary.loaded_provider)
+            rows['Found Abspath'].append(binary.loaded_abspath)
+            rows['Related Configuration'].append(mark_safe(', '.join(
+                f'{config_key}'
+                for config_key, config_value in relevant_configs.items()
+                if binary.name.lower().replace('-', '').replace('_', '').replace('ytdlp', 'youtubedl') in config_key.lower()
+                # or binary.name.lower().replace('-', '').replace('_', '') in str(config_value).lower()
+            )))
+            rows['Overrides'].append(str(binary.provider_overrides))
+            rows['Description'].append(binary.description)
+
+    return TableContext(
+        title="Binaries",
+        table=rows,
+    )
+
+@render_with_item_view
+def binary_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
+
+    assert request.user.is_superuser, 'Must be a superuser to view configuration settings.'
+
+    binary = None
+    plugin = None
+    for loaded_plugin in LOADED_PLUGINS:
+        for loaded_binary in loaded_plugin.binaries:
+            if loaded_binary.name == key:
+                binary = loaded_binary
+                plugin = loaded_plugin
+
+    assert plugin and binary, f'Could not find a binary matching the specified name: {key}'
+
+    binary = binary.load_or_install()
+
+    return ItemContext(
+        slug=key,
+        title=key,
+        data=[
+            {
+                "name": binary.name,
+                "description": binary.description,
+                "fields": {
+                    'plugin': plugin.name,
+                    'binprovider': binary.loaded_provider,
+                    'abspath': binary.loaded_abspath,
+                    'version': binary.loaded_version,
+                    'overrides': str(binary.provider_overrides),
+                    'providers': str(binary.providers_supported),
+                },
+                "help_texts": {
+                    # TODO
+                },
+            },
+        ],
+    )
+
+
+@render_with_table_view
+def plugins_list_view(request: HttpRequest, **kwargs) -> TableContext:
+
+    assert request.user.is_superuser, 'Must be a superuser to view configuration settings.'
+
+    rows = {
+        "Name": [],
+        "binaries": [],
+        "extractors": [],
+        "replayers": [],
+        "configs": [],
+        "description": [],
+    }
+
+
+    for plugin in LOADED_PLUGINS:
+        plugin = plugin.load_or_install()
+
+        rows['Name'].append(ItemLink(plugin.name, key=plugin.name))
+        rows['binaries'].append(mark_safe(', '.join(
+            f'{binary.name}'
+            for binary in plugin.binaries
+        )))
+        rows['extractors'].append(', '.join(extractor.name for extractor in plugin.extractors))
+        rows['replayers'].append(', '.join(replayer.name for replayer in plugin.replayers))
+        rows['configs'].append(mark_safe(', '.join(
+            f'{config_key}'
+            for configset in plugin.configs
+            for config_key in configset.__fields__.keys()
+            if config_key != 'section' and config_key in settings.CONFIG
+        )))
+        rows['description'].append(str(plugin.description))
+
+    return TableContext(
+        title="Installed plugins",
+        table=rows,
+    )
+
+@render_with_item_view
+def plugin_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
+
+    assert request.user.is_superuser, 'Must be a superuser to view configuration settings.'
+
+    plugin = None
+    for loaded_plugin in LOADED_PLUGINS:
+        if loaded_plugin.name == key:
+            plugin = loaded_plugin
+
+    assert plugin, f'Could not find a plugin matching the specified name: {key}'
+
+    plugin = plugin.load_or_install()
+
+    return ItemContext(
+        slug=key,
+        title=key,
+        data=[
+            {
+                "name": plugin.name,
+                "description": plugin.description,
+                "fields": {
+                    'configs': plugin.configs,
+                    'binaries': plugin.binaries,
+                    'extractors': plugin.extractors,
+                    'replayers': plugin.replayers,
+                },
+                "help_texts": {
+                    # TODO
+                },
+            },
+        ],
+    )
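
These views only become reachable once they are registered with the admin_data_views package; that wiring lives in settings.py and is not part of this diff. Roughly, and with the routes and dotted import paths below being assumptions rather than anything this change defines, the registration looks something like:

    # Assumed settings.py wiring for admin_data_views (not included in this diff):
    ADMIN_DATA_VIEWS = {
        "NAME": "configuration",
        "URLS": [
            {
                "route": "binaries/",
                "view": "plugantic.views.binaries_list_view",
                "name": "binaries",
                "items": {
                    "route": "<str:key>/",
                    "view": "plugantic.views.binary_detail_view",
                    "name": "binary",
                },
            },
            {
                "route": "plugins/",
                "view": "plugantic.views.plugins_list_view",
                "name": "plugins",
                "items": {
                    "route": "<str:key>/",
                    "view": "plugantic.views.plugin_detail_view",
                    "name": "plugin",
                },
            },
        ],
    }
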
diff --git a/archivebox/search/__init__.py b/archivebox/search/__init__.py
index 6191ede9..c5a9b13c 100644
--- a/archivebox/search/__init__.py
+++ b/archivebox/search/__init__.py
@@ -39,7 +39,7 @@ def write_search_index(link: Link, texts: Union[List[str], None]=None, out_dir:
         backend = import_backend()
         if snap:
             try:
-                backend.index(snapshot_id=str(snap.id), texts=texts)
+                backend.index(snapshot_id=str(snap.pk), texts=texts)
             except Exception as err:
                 stderr()
                 stderr(
@@ -54,7 +54,7 @@ def query_search_index(query: str, out_dir: Path=OUTPUT_DIR) -> QuerySet:
     if search_backend_enabled():
         backend = import_backend()
         try:
-            snapshot_ids = backend.search(query)
+            snapshot_pks = backend.search(query)
         except Exception as err:
             stderr()
             stderr(
@@ -64,7 +64,7 @@ def query_search_index(query: str, out_dir: Path=OUTPUT_DIR) -> QuerySet:
             raise
         else:
             # TODO preserve ordering from backend
-            qsearch = Snapshot.objects.filter(pk__in=snapshot_ids)
+            qsearch = Snapshot.objects.filter(pk__in=snapshot_pks)
             return qsearch
 
     return Snapshot.objects.none()
@@ -74,9 +74,9 @@ def flush_search_index(snapshots: QuerySet):
     if not indexing_enabled() or not snapshots:
         return
     backend = import_backend()
-    snapshot_ids=(str(pk) for pk in snapshots.values_list('pk',flat=True))
+    snapshot_pks = (str(pk) for pk in snapshots.values_list('pk', flat=True))
     try:
-        backend.flush(snapshot_ids)
+        backend.flush(snapshot_pks)
     except Exception as err:
         stderr()
         stderr(
diff --git a/archivebox/templates/admin/base.html b/archivebox/templates/admin/base.html
index 5d4d4cc5..897a26d5 100644
--- a/archivebox/templates/admin/base.html
+++ b/archivebox/templates/admin/base.html
@@ -277,10 +277,22 @@
                 $(this).parents('.card').removeClass('selected-card')
             })
         };
+        function selectSnapshotIfHotlinked() {
+            // if we arrive at the index with a url like ?id__startswith=...
+            // we were hotlinked here with the intention of making it easy for the user to perform some
+            // actions on the given snapshot. therefore we should preselect the snapshot to save them a click
+            if (window.location.search.startsWith('?id__startswith=') || window.location.search.startsWith('?id__exact=')) {
+                const result_checkboxes = [...document.querySelectorAll('#result_list .action-checkbox input[type=checkbox]')]
+                if (result_checkboxes.length === 1) {
+                    result_checkboxes[0].click()
+                }
+            }
+        }
         $(document).ready(function() {
             fix_actions()
             setupSnapshotGridListToggle()
             setTimeOffset()
+            selectSnapshotIfHotlinked()
         })
diff --git a/archivebox/templates/admin/snapshots_grid.html b/archivebox/templates/admin/snapshots_grid.html
index d76e2597..a500b07b 100644
--- a/archivebox/templates/admin/snapshots_grid.html
+++ b/archivebox/templates/admin/snapshots_grid.html
@@ -147,7 +147,7 @@
             {% for obj in results %}
- + {{obj.added}}
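
The search/__init__.py hunks above are essentially a rename (snapshot ids are now consistently referred to as pks), but they also spell out the interface every search backend has to provide: index() receives a Snapshot pk plus its extracted texts, search() returns matching pks, and flush() drops pks from the index. A sketch of that implied contract, assuming nothing beyond the calls visible in the hunks (not a real backend):

    # Sketch of the backend interface implied by the calls in search/__init__.py:
    from typing import Iterable, List

    class ExampleSearchBackend:
        def index(self, snapshot_id: str, texts: List[str]) -> None:
            """Store the extracted texts under the given Snapshot pk."""
            ...

        def search(self, query: str) -> List[str]:
            """Return the Snapshot pks whose indexed texts match the query."""
            ...

        def flush(self, snapshot_pks: Iterable[str]) -> None:
            """Remove the given Snapshot pks from the index."""
            ...
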