1
0
Fork 0
mirror of synced 2024-06-26 18:10:24 +12:00

Merge pull request #479 from afreydev/tags

Tags migration
This commit is contained in:
Cristian Vargas 2020-10-20 08:24:16 -05:00 committed by GitHub
commit 71e111e13f
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
11 changed files with 392 additions and 18 deletions

View file

@ -3,6 +3,7 @@ on: [push]
env:
MAX_LINE_LENGTH: 110
DOCKER_IMAGE: archivebox-ci
jobs:
lint:
@ -118,12 +119,12 @@ jobs:
- name: Build image
run: |
docker build . -t archivebox
docker build . -t "$DOCKER_IMAGE"
- name: Init data dir
run: |
mkdir data
docker run -v "$PWD"/data:/data archivebox init
docker run -v "$PWD"/data:/data "$DOCKER_IMAGE" init
- name: Run test server
run: |
@ -132,7 +133,7 @@ jobs:
- name: Add link
run: |
docker run -v "$PWD"/data:/data --network host archivebox add http://www.test-nginx-1.local
docker run -v "$PWD"/data:/data --network host "$DOCKER_IMAGE" add http://www.test-nginx-1.local
- name: Add stdin link
run: |
@ -140,8 +141,8 @@ jobs:
- name: List links
run: |
docker run -v "$PWD"/data:/data archivebox list | grep -q "www.test-nginx-1.local" || { echo "The site 1 isn't in the list"; exit 1; }
docker run -v "$PWD"/data:/data archivebox list | grep -q "www.test-nginx-2.local" || { echo "The site 2 isn't in the list"; exit 1; }
docker run -v "$PWD"/data:/data "$DOCKER_IMAGE" list | grep -q "www.test-nginx-1.local" || { echo "The site 1 isn't in the list"; exit 1; }
docker run -v "$PWD"/data:/data "$DOCKER_IMAGE" list | grep -q "www.test-nginx-2.local" || { echo "The site 2 isn't in the list"; exit 1; }
- name: Start docker-compose stack
run: |

View file

@ -9,9 +9,10 @@ from django.utils.html import format_html
from django.utils.safestring import mark_safe
from django.shortcuts import render, redirect
from django.contrib.auth import get_user_model
from django import forms
from core.models import Snapshot
from core.forms import AddLinkForm
from core.forms import AddLinkForm, TagField
from core.utils import get_icons
from util import htmldecode, urldecode, ansi_to_html
@ -55,6 +56,32 @@ def delete_snapshots(modeladmin, request, queryset):
delete_snapshots.short_description = "Delete"
class SnapshotAdminForm(forms.ModelForm):
tags = TagField(required=False)
class Meta:
model = Snapshot
fields = "__all__"
def save(self, commit=True):
# Based on: https://stackoverflow.com/a/49933068/3509554
# Get the unsave instance
instance = forms.ModelForm.save(self, False)
tags = self.cleaned_data.pop("tags")
#update save_m2m
def new_save_m2m():
instance.save_tags(tags)
# Do we need to save all changes now?
self.save_m2m = new_save_m2m
if commit:
instance.save()
return instance
class SnapshotAdmin(admin.ModelAdmin):
list_display = ('added', 'title_str', 'url_str', 'files', 'size')
sort_fields = ('title_str', 'url_str', 'added')
@ -65,6 +92,13 @@ class SnapshotAdmin(admin.ModelAdmin):
ordering = ['-added']
actions = [delete_snapshots, overwrite_snapshots, update_snapshots, update_titles, verify_snapshots]
actions_template = 'admin/actions_as_select.html'
form = SnapshotAdminForm
def get_queryset(self, request):
return super().get_queryset(request).prefetch_related('tags')
def tag_list(self, obj):
return ', '.join(obj.tags.values_list('name', flat=True))
def id_str(self, obj):
return format_html(
@ -75,9 +109,9 @@ class SnapshotAdmin(admin.ModelAdmin):
def title_str(self, obj):
canon = obj.as_link().canonical_outputs()
tags = ''.join(
format_html('<span>{}</span>', tag.strip())
for tag in obj.tags.split(',')
) if obj.tags else ''
format_html(' <a href="/admin/core/snapshot/?tags__id__exact={}"><span class="tag">{}</span></a> ', tag.id, tag)
for tag in obj.tags.all()
)
return format_html(
'<a href="/{}">'
'<img src="/{}/{}" class="favicon" onerror="this.remove()">'

View file

@ -3,6 +3,7 @@ __package__ = 'archivebox.core'
from django import forms
from ..util import URL_REGEX
from .utils_taggit import edit_string_for_tags, parse_tags
CHOICES = (
('0', 'depth = 0 (archive just these URLs)'),
@ -12,3 +13,44 @@ CHOICES = (
class AddLinkForm(forms.Form):
url = forms.RegexField(label="URLs (one per line)", regex=URL_REGEX, min_length='6', strip=True, widget=forms.Textarea, required=True)
depth = forms.ChoiceField(label="Archive depth", choices=CHOICES, widget=forms.RadioSelect, initial='0')
class TagWidgetMixin:
def format_value(self, value):
if value is not None and not isinstance(value, str):
value = edit_string_for_tags(value)
return super().format_value(value)
class TagWidget(TagWidgetMixin, forms.TextInput):
pass
class TagField(forms.CharField):
widget = TagWidget
def clean(self, value):
value = super().clean(value)
try:
return parse_tags(value)
except ValueError:
raise forms.ValidationError(
"Please provide a comma-separated list of tags."
)
def has_changed(self, initial_value, data_value):
# Always return False if the field is disabled since self.bound_data
# always uses the initial value in this case.
if self.disabled:
return False
try:
data_value = self.clean(data_value)
except forms.ValidationError:
pass
if initial_value is None:
initial_value = []
initial_value = [tag.name for tag in initial_value]
initial_value.sort()
return initial_value != data_value

View file

@ -0,0 +1,70 @@
# Generated by Django 3.0.8 on 2020-10-12 15:20
from django.db import migrations, models
from django.utils.text import slugify
def forwards_func(apps, schema_editor):
SnapshotModel = apps.get_model("core", "Snapshot")
TagModel = apps.get_model("core", "Tag")
db_alias = schema_editor.connection.alias
snapshots = SnapshotModel.objects.all()
for snapshot in snapshots:
tags = snapshot.tags
tag_set = (
set(tag.strip() for tag in (snapshot.tags_old or '').split(','))
)
tag_set.discard("")
for tag in tag_set:
to_add, _ = TagModel.objects.get_or_create(name=tag, slug=slugify(tag))
snapshot.tags.add(to_add)
def reverse_func(apps, schema_editor):
SnapshotModel = apps.get_model("core", "Snapshot")
TagModel = apps.get_model("core", "Tag")
db_alias = schema_editor.connection.alias
snapshots = SnapshotModel.objects.all()
for snapshot in snapshots:
tags = snapshot.tags.values_list("name", flat=True)
snapshot.tags_old = ",".join([tag for tag in tags])
snapshot.save()
class Migration(migrations.Migration):
dependencies = [
('core', '0005_auto_20200728_0326'),
]
operations = [
migrations.RenameField(
model_name='snapshot',
old_name='tags',
new_name='tags_old',
),
migrations.CreateModel(
name='Tag',
fields=[
('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
('name', models.CharField(max_length=100, unique=True, verbose_name='name')),
('slug', models.SlugField(max_length=100, unique=True, verbose_name='slug')),
],
options={
'verbose_name': 'Tag',
'verbose_name_plural': 'Tags',
},
),
migrations.AddField(
model_name='snapshot',
name='tags',
field=models.ManyToManyField(to='core.Tag'),
),
migrations.RunPython(forwards_func, reverse_func),
migrations.RemoveField(
model_name='snapshot',
name='tags_old',
),
]

View file

@ -2,13 +2,55 @@ __package__ = 'archivebox.core'
import uuid
from django.db import models
from django.db import models, transaction
from django.utils.functional import cached_property
from django.utils.text import slugify
from ..util import parse_date
from ..index.schema import Link
class Tag(models.Model):
"""
Based on django-taggit model
"""
name = models.CharField(verbose_name="name", unique=True, blank=False, max_length=100)
slug = models.SlugField(verbose_name="slug", unique=True, max_length=100)
class Meta:
verbose_name = "Tag"
verbose_name_plural = "Tags"
def __str__(self):
return self.name
def slugify(self, tag, i=None):
slug = slugify(tag)
if i is not None:
slug += "_%d" % i
return slug
def save(self, *args, **kwargs):
if self._state.adding and not self.slug:
self.slug = self.slugify(self.name)
with transaction.atomic():
slugs = set(
type(self)
._default_manager.filter(slug__startswith=self.slug)
.values_list("slug", flat=True)
)
i = None
while True:
slug = self.slugify(self.name, i)
if slug not in slugs:
self.slug = slug
return super().save(*args, **kwargs)
i = 1 if i is None else i+1
else:
return super().save(*args, **kwargs)
class Snapshot(models.Model):
id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False)
@ -16,11 +58,10 @@ class Snapshot(models.Model):
timestamp = models.CharField(max_length=32, unique=True, db_index=True)
title = models.CharField(max_length=128, null=True, blank=True, db_index=True)
tags = models.CharField(max_length=256, null=True, blank=True, db_index=True)
added = models.DateTimeField(auto_now_add=True, db_index=True)
updated = models.DateTimeField(null=True, blank=True, db_index=True)
# bookmarked = models.DateTimeField()
tags = models.ManyToManyField(Tag)
keys = ('url', 'timestamp', 'title', 'tags', 'updated')
@ -41,7 +82,8 @@ class Snapshot(models.Model):
args = args or self.keys
return {
key: getattr(self, key)
for key in args
if key != 'tags' else self.get_tags_str()
for key in args
}
def as_link(self) -> Link:
@ -50,6 +92,13 @@ class Snapshot(models.Model):
def as_link_with_details(self) -> Link:
from ..index import load_link_details
return load_link_details(self.as_link())
def get_tags_str(self) -> str:
tags = ','.join(
tag.name
for tag in self.tags.all()
) if self.tags.all() else ''
return tags
@cached_property
def bookmarked(self):
@ -96,3 +145,10 @@ class Snapshot(models.Model):
and self.history['title'][-1].output.strip()):
return self.history['title'][-1].output.strip()
return None
def save_tags(self, tags=[]):
tags_id = []
for tag in tags:
tags_id.append(Tag.objects.get_or_create(name=tag)[0].id)
self.tags.clear()
self.tags.add(*tags_id)

View file

@ -0,0 +1,113 @@
# Taken from https://github.com/jazzband/django-taggit/blob/3b56adb637ab95aca5036c37a358402c825a367c/taggit/utils.py
def parse_tags(tagstring):
"""
Parses tag input, with multiple word input being activated and
delineated by commas and double quotes. Quotes take precedence, so
they may contain commas.
Returns a sorted list of unique tag names.
Ported from Jonathan Buchanan's `django-tagging
<http://django-tagging.googlecode.com/>`_
"""
if not tagstring:
return []
# Special case - if there are no commas or double quotes in the
# input, we don't *do* a recall... I mean, we know we only need to
# split on spaces.
if "," not in tagstring and '"' not in tagstring:
words = list(set(split_strip(tagstring, " ")))
words.sort()
return words
words = []
buffer = []
# Defer splitting of non-quoted sections until we know if there are
# any unquoted commas.
to_be_split = []
saw_loose_comma = False
open_quote = False
i = iter(tagstring)
try:
while True:
c = next(i)
if c == '"':
if buffer:
to_be_split.append("".join(buffer))
buffer = []
# Find the matching quote
open_quote = True
c = next(i)
while c != '"':
buffer.append(c)
c = next(i)
if buffer:
word = "".join(buffer).strip()
if word:
words.append(word)
buffer = []
open_quote = False
else:
if not saw_loose_comma and c == ",":
saw_loose_comma = True
buffer.append(c)
except StopIteration:
# If we were parsing an open quote which was never closed treat
# the buffer as unquoted.
if buffer:
if open_quote and "," in buffer:
saw_loose_comma = True
to_be_split.append("".join(buffer))
if to_be_split:
if saw_loose_comma:
delimiter = ","
else:
delimiter = " "
for chunk in to_be_split:
words.extend(split_strip(chunk, delimiter))
words = list(set(words))
words.sort()
return words
def split_strip(string, delimiter=","):
"""
Splits ``string`` on ``delimiter``, stripping each resulting string
and returning a list of non-empty strings.
Ported from Jonathan Buchanan's `django-tagging
<http://django-tagging.googlecode.com/>`_
"""
if not string:
return []
words = [w.strip() for w in string.split(delimiter)]
return [w for w in words if w]
def edit_string_for_tags(tags):
"""
Given list of ``Tag`` instances, creates a string representation of
the list suitable for editing by the user, such that submitting the
given string representation back without changing it will give the
same list of tags.
Tag names which contain commas will be double quoted.
If any tag name which isn't being quoted contains whitespace, the
resulting string of tag names will be comma-delimited, otherwise
it will be space-delimited.
Ported from Jonathan Buchanan's `django-tagging
<http://django-tagging.googlecode.com/>`_
"""
names = []
for tag in tags:
name = tag.name
if "," in name or " " in name:
names.append('"%s"' % name)
else:
names.append(name)
return ", ".join(sorted(names))

View file

@ -34,13 +34,19 @@ def remove_from_sql_main_index(snapshots: QuerySet, out_dir: Path=OUTPUT_DIR) ->
def write_link_to_sql_index(link: Link):
from core.models import Snapshot
info = {k: v for k, v in link._asdict().items() if k in Snapshot.keys}
tags = info.pop("tags")
if tags is None:
tags = []
try:
info["timestamp"] = Snapshot.objects.get(url=link.url).timestamp
except Snapshot.DoesNotExist:
while Snapshot.objects.filter(timestamp=info["timestamp"]).exists():
info["timestamp"] = str(float(info["timestamp"]) + 1.0)
return Snapshot.objects.update_or_create(url=link.url, defaults=info)[0]
snapshot, _ = Snapshot.objects.update_or_create(url=link.url, defaults=info)
snapshot.save_tags(tags)
return snapshot
@enforce_types
@ -65,8 +71,14 @@ def write_sql_link_details(link: Link, out_dir: Path=OUTPUT_DIR) -> None:
except Snapshot.DoesNotExist:
snap = write_link_to_sql_index(link)
snap.title = link.title
snap.tags = link.tags
tag_set = (
set(tag.strip() for tag in (link.tags or '').split(','))
)
tag_list = list(tag_set) or []
snap.save()
snap.save_tags(tag_list)

View file

@ -222,3 +222,11 @@ body.model-snapshot.change-list #content .object-tools {
0% { transform: rotate(0deg); }
100% { transform: rotate(360deg); }
}
.tags > a > .tag {
border: 1px solid;
border-radius: 10px;
background-color: #f3f3f3;
padding: 3px;
}

View file

@ -12,7 +12,7 @@ version: '3.7'
services:
archivebox:
# build: .
image: nikisweeting/archivebox:latest
image: ${DOCKER_IMAGE:-nikisweeting/archivebox:latest}
command: server 0.0.0.0:8000
stdin_open: true
tty: true

Binary file not shown.

View file

@ -4,7 +4,7 @@
import os
import subprocess
from pathlib import Path
import json
import json, shutil
import sqlite3
from archivebox.config import OUTPUT_PERMISSIONS
@ -131,4 +131,42 @@ def test_unrecognized_folders(tmp_path, process, disable_extractors_dict):
init_process = subprocess.run(['archivebox', 'init'], capture_output=True, env=disable_extractors_dict)
assert "Skipped adding 1 invalid link data directories" in init_process.stdout.decode("utf-8")
assert init_process.returncode == 0
assert init_process.returncode == 0
def test_tags_migration(tmp_path, disable_extractors_dict):
base_sqlite_path = Path(__file__).parent / 'tags_migration'
if os.path.exists(tmp_path):
shutil.rmtree(tmp_path)
shutil.copytree(str(base_sqlite_path), tmp_path)
os.chdir(tmp_path)
conn = sqlite3.connect("index.sqlite3")
conn.row_factory = sqlite3.Row
c = conn.cursor()
c.execute("SELECT id, tags from core_snapshot")
snapshots = c.fetchall()
snapshots_dict = { sn['id']: sn['tags'] for sn in snapshots}
conn.commit()
conn.close()
init_process = subprocess.run(['archivebox', 'init'], capture_output=True, env=disable_extractors_dict)
conn = sqlite3.connect("index.sqlite3")
conn.row_factory = sqlite3.Row
c = conn.cursor()
c.execute("""
SELECT core_snapshot.id, core_tag.name from core_snapshot
JOIN core_snapshot_tags on core_snapshot_tags.snapshot_id=core_snapshot.id
JOIN core_tag on core_tag.id=core_snapshot_tags.tag_id
""")
tags = c.fetchall()
conn.commit()
conn.close()
for tag in tags:
snapshot_id = tag["id"]
tag_name = tag["name"]
# Check each tag migrated is in the previous field
assert tag_name in snapshots_dict[snapshot_id]