diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml
index 7c4b7d429..24ec48af4 100644
--- a/.github/workflows/run_tests.yml
+++ b/.github/workflows/run_tests.yml
@@ -74,7 +74,7 @@ jobs:
- name: run tests
run: |
- coverage run -m pytest --create-db
+ coverage run -m pytest --create-db -x
coverage xml -o _shtrove_coverage.xml
env:
DATABASE_PASSWORD: postgres
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 124135277..117a8cdcb 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,16 @@
# Change Log
+# [25.4.0] - 2025-06-24
+- delete `RawDatum` model
+ - `trove.digestive_tract.extract` now must succeed before `/trove/ingest` responds
+- rename `IndexcardRdf` (and kids) to `ResourceDescription`
+- move most django models to their own files
+- stop storing `CeleryTaskResult`s forever
+ - new environment variables: `CELERY_RESULT_EXPIRES`, `FAILED_CELERY_RESULT_EXPIRES`
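+  - both are integer seconds; defaults: 3 days for successful tasks, 11 days for failed/other tasks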
+- fix: `/api/v2/` error generating rss/atom feed links
+- fix: pagination at `/api/v2/sourceconfigs`
+- fix: correct osfmap IRIs (`dcat:accessURL`, `osf:verifiedLink`)
+
# [25.3.3] - 2025-06-17
- smaller `osfmap_json` derived representation (thx bodintsov)
- prepare for next release dropping `RawDatum` model/table:
diff --git a/api/base/views.py b/api/base/views.py
index 6471c8d7d..48755fd56 100644
--- a/api/base/views.py
+++ b/api/base/views.py
@@ -45,12 +45,11 @@ def initial(self, request, *args, **kwargs):
class RootView(views.APIView):
def get(self, request):
links = {
- 'rawdata': 'api:rawdatum-list',
'sources': 'api:source-list',
'users': 'api:user-list',
'status': 'api:status',
- 'rss': 'api:rss',
- 'atom': 'api:atom',
+ 'rss': 'api:feeds.rss',
+ 'atom': 'api:feeds.atom',
}
ret = {k: request.build_absolute_uri(reverse(v)) for k, v in links.items()}
return Response(ret)
diff --git a/api/rawdata/serializers.py b/api/rawdata/serializers.py
deleted file mode 100644
index b76a3d58c..000000000
--- a/api/rawdata/serializers.py
+++ /dev/null
@@ -1,10 +0,0 @@
-from share import models
-
-from api.base import ShareSerializer
-
-
-class RawDatumSerializer(ShareSerializer):
-
- class Meta:
- model = models.RawDatum
- fields = ('id', 'suid', 'datum', 'sha256', 'date_modified', 'date_created')
diff --git a/api/rawdata/urls.py b/api/rawdata/urls.py
deleted file mode 100644
index fe491c80d..000000000
--- a/api/rawdata/urls.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from rest_framework.routers import SimpleRouter
-from api.rawdata import views
-
-
-router = SimpleRouter()
-router.register(r'rawdata', views.RawDataViewSet, basename='rawdatum')
-urlpatterns = router.urls
diff --git a/api/rawdata/views.py b/api/rawdata/views.py
deleted file mode 100644
index 8293402c1..000000000
--- a/api/rawdata/views.py
+++ /dev/null
@@ -1,31 +0,0 @@
-from rest_framework import viewsets
-
-from share import models
-
-from api.base.views import ShareViewSet
-from api.pagination import CursorPagination
-from api.rawdata.serializers import RawDatumSerializer
-
-
-class RawDataViewSet(ShareViewSet, viewsets.ReadOnlyModelViewSet):
- """
- Raw data, exactly as harvested from the data source.
-
- ## Query by object
- To get all the raw data corresponding to a Share object, use the query
- parameters `object_id=<@id>` and `object_type=<@type>`
- """
-
- ordering = ('-id', )
- pagination_class = CursorPagination
- serializer_class = RawDatumSerializer
-
- def get_queryset(self):
- object_id = self.request.query_params.get('object_id', None)
- object_type = self.request.query_params.get('object_type', None)
- if object_id and object_type:
- return models.RawDatum.objects.filter(
- normalizeddata__changeset__changes__target_id=object_id,
- normalizeddata__changeset__changes__target_type__model=object_type
- ).distinct('id').select_related('suid')
- return models.RawDatum.objects.all().select_related('suid')
diff --git a/api/sourceconfigs/views.py b/api/sourceconfigs/views.py
index 2c8647ceb..62e471c16 100644
--- a/api/sourceconfigs/views.py
+++ b/api/sourceconfigs/views.py
@@ -2,12 +2,14 @@
from api.sourceconfigs.serializers import SourceConfigSerializer
from api.base import ShareViewSet
+from api.pagination import CursorPagination
from share.models import SourceConfig
class SourceConfigViewSet(ShareViewSet, viewsets.ReadOnlyModelViewSet):
serializer_class = SourceConfigSerializer
+ pagination_class = CursorPagination
ordering = ('id', )
diff --git a/api/urls.py b/api/urls.py
index ef02ffa73..a672c0c60 100644
--- a/api/urls.py
+++ b/api/urls.py
@@ -9,7 +9,6 @@
urlpatterns = [
url('^$', RootView.as_view()),
url('^', include('api.banners.urls')),
- url('^', include('api.rawdata.urls')),
url('^', include('api.sourceconfigs.urls')),
url('^', include('api.sources.urls')),
url('^', include('api.suids.urls')),
diff --git a/project/settings.py b/project/settings.py
index 8979e67c1..19d5b02c1 100644
--- a/project/settings.py
+++ b/project/settings.py
@@ -341,8 +341,17 @@ def split(string, delim):
},
}
-CELERY_RESULT_EXPIRES = 60 * 60 * 24 * 3 # 4 days
CELERY_RESULT_BACKEND = 'share.celery:CeleryDatabaseBackend'
+CELERY_RESULT_EXPIRES = int(os.environ.get(
+ 'CELERY_RESULT_EXPIRES',
+ 60 * 60 * 24 * 3, # 3 days
+))
+# only successful tasks get the default expiration (above)
+# -- failed tasks kept longer (see `share.celery`)
+FAILED_CELERY_RESULT_EXPIRES = int(os.environ.get(
+ 'FAILED_CELERY_RESULT_EXPIRES',
+ 60 * 60 * 24 * 11, # 11 days
+))
# Don't reject tasks that were present on a worker when it was killed
CELERY_TASK_REJECT_ON_WORKER_LOST = False
@@ -358,7 +367,7 @@ def split(string, delim):
CELERY_TASK_DEFAULT_ROUTING_KEY = 'share_default'
URGENT_TASK_QUEUES = {
- 'trove.digestive_tract.task__extract_and_derive': 'digestive_tract.urgent',
+ 'trove.digestive_tract.task__derive': 'digestive_tract.urgent',
}
@@ -440,6 +449,10 @@ def route_urgent_task(name, args, kwargs, options, task=None, **kw):
SHARE_WEB_URL = os.environ.get('SHARE_WEB_URL', 'http://localhost:8003').rstrip('/') + '/'
SHARE_USER_AGENT = os.environ.get('SHARE_USER_AGENT', 'SHAREbot/{} (+{})'.format(VERSION, SHARE_WEB_URL))
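+# used by migration 0061_ensure_auto_users: an initial admin user is created only when a password is configured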
+SHARE_ADMIN_USERNAME = os.environ.get('SHARE_ADMIN_USERNAME', 'admin')
+SHARE_ADMIN_PASSWORD = os.environ.get('SHARE_ADMIN_PASSWORD')
+if DEBUG and (SHARE_ADMIN_PASSWORD is None):
+ SHARE_ADMIN_PASSWORD = 'password'
# Skip some of the more intensive operations on works that surpass these limits
SHARE_LIMITS = {
diff --git a/share/admin/__init__.py b/share/admin/__init__.py
index 7d1328756..6c5fa04c8 100644
--- a/share/admin/__init__.py
+++ b/share/admin/__init__.py
@@ -1,10 +1,6 @@
from django.apps import apps
-from django.urls import re_path as url
from django.contrib import admin
-from django.http import HttpResponseRedirect
-from django.template.response import TemplateResponse
-from django.urls import path, reverse
-from django.utils.html import format_html
+from django.urls import path
from oauth2_provider.models import AccessToken
@@ -15,7 +11,6 @@
CeleryTaskResult,
FeatureFlag,
IndexBackfill,
- RawDatum,
ShareUser,
SiteBanner,
Source,
@@ -51,26 +46,6 @@ class ShareUserAdmin(admin.ModelAdmin):
search_fields = ['username']
-@linked_fk('suid')
-class RawDatumAdmin(admin.ModelAdmin):
- show_full_result_count = False
- list_select_related = ('suid__source_config', )
- list_display = ('id', 'identifier', 'source_config_label', 'datestamp', 'date_created', 'date_modified', )
- readonly_fields = ('datum__pre', 'sha256')
- exclude = ('datum',)
- paginator = TimeLimitedPaginator
-
- def identifier(self, obj):
- return obj.suid.identifier
-
- def source_config_label(self, obj):
- return obj.suid.source_config.label
-
- def datum__pre(self, instance):
-        return format_html('<pre>{}</pre>', instance.datum)
- datum__pre.short_description = 'datum' # type: ignore[attr-defined]
-
-
class AccessTokenAdmin(admin.ModelAdmin):
raw_id_fields = ('user',)
list_display = ('token', 'user', 'scope')
@@ -91,11 +66,10 @@ def save_model(self, request, obj, form, change):
@linked_fk('source')
class SourceConfigAdmin(admin.ModelAdmin):
- list_display = ('label', 'source_', 'version', 'enabled', 'button_actions')
+ list_display = ('label', 'source_', 'version', 'enabled',)
list_select_related = ('source',)
- readonly_fields = ('button_actions',)
search_fields = ['label', 'source__name', 'source__long_title']
- actions = ['schedule_full_ingest']
+ actions = ['schedule_derive']
def source_(self, obj):
return obj.source.long_title
@@ -104,42 +78,10 @@ def enabled(self, obj):
return not obj.disabled
enabled.boolean = True # type: ignore[attr-defined]
- @admin.action(description='schedule re-ingest of all raw data for each source config')
- def schedule_full_ingest(self, request, queryset):
+ @admin.action(description='schedule re-derive of all cards for each selected source config')
+ def schedule_derive(self, request, queryset):
for _id in queryset.values_list('id', flat=True):
- digestive_tract.task__schedule_extract_and_derive_for_source_config.delay(_id)
-
- def get_urls(self):
- return [
- url(
-                r'^(?P<config_id>.+)/ingest/$',
- self.admin_site.admin_view(self.start_ingest),
- name='source-config-ingest'
- )
- ] + super().get_urls()
-
- def button_actions(self, obj):
- return format_html(
- ' '.join((
-                ('<a class="button" href="{ingest_href}">Ingest</a>' if not obj.disabled else ''),
- )),
- ingest_href=reverse('admin:source-config-ingest', args=[obj.pk]),
- )
- button_actions.short_description = 'Buttons' # type: ignore[attr-defined]
-
- def start_ingest(self, request, config_id):
- config = self.get_object(request, config_id)
- if request.method == 'POST':
- digestive_tract.task__schedule_extract_and_derive_for_source_config.delay(config.pk)
- url = reverse(
- 'admin:share_sourceconfig_changelist',
- current_app=self.admin_site.name,
- )
- return HttpResponseRedirect(url)
- else:
- context = self.admin_site.each_context(request)
- context['source_config'] = config
- return TemplateResponse(request, 'admin/start-ingest.html', context)
+ digestive_tract.task__schedule_derive_for_source_config.delay(_id)
@linked_fk('user')
@@ -157,26 +99,16 @@ def access_token(self, obj):
@linked_fk('source_config')
@linked_fk('focus_identifier')
@linked_many('formattedmetadatarecord_set', defer=('formatted_metadata',))
-@linked_many('raw_data', defer=('datum',))
@linked_many('indexcard_set')
class SourceUniqueIdentifierAdmin(admin.ModelAdmin):
readonly_fields = ('identifier',)
paginator = TimeLimitedPaginator
- actions = ('reingest', 'delete_cards_for_suid')
+ actions = ('delete_cards_for_suid',)
list_filter = (SourceConfigFilter,)
list_select_related = ('source_config',)
show_full_result_count = False
search_fields = ('identifier',)
- def reingest(self, request, queryset):
- _raw_id_queryset = (
- RawDatum.objects
- .latest_by_suid_queryset(queryset)
- .values_list('id', flat=True)
- )
- for _raw_id in _raw_id_queryset:
- digestive_tract.task__extract_and_derive.delay(raw_id=_raw_id)
-
def delete_cards_for_suid(self, request, queryset):
for suid in queryset:
digestive_tract.expel_suid(suid)
@@ -220,7 +152,6 @@ class FeatureFlagAdmin(admin.ModelAdmin):
admin_site.register(CeleryTaskResult, CeleryTaskResultAdmin)
admin_site.register(FeatureFlag, FeatureFlagAdmin)
admin_site.register(IndexBackfill, IndexBackfillAdmin)
-admin_site.register(RawDatum, RawDatumAdmin)
admin_site.register(ShareUser, ShareUserAdmin)
admin_site.register(SiteBanner, SiteBannerAdmin)
admin_site.register(Source, SourceAdmin)
diff --git a/share/celery.py b/share/celery.py
index a097cbc41..ff0f626c5 100644
--- a/share/celery.py
+++ b/share/celery.py
@@ -1,13 +1,15 @@
+import datetime
import functools
import logging
-
from celery import states
from celery.app.task import Context
from celery.backends.base import BaseDictBackend
from celery.utils.time import maybe_timedelta
+from django.conf import settings
from django.db import transaction
+from django.db.models import Q
from django.utils import timezone
import sentry_sdk
@@ -90,7 +92,10 @@ def _store_result(self, task_id, result, status, traceback=None, request=None, *
@die_on_unhandled
def cleanup(self, expires=None):
- TaskResultCleaner(expires or self.expires).clean()
+ TaskResultCleaner(
+ success_ttl=(expires or self.expires),
+ nonsuccess_ttl=settings.FAILED_CELERY_RESULT_EXPIRES,
+ ).clean()
@die_on_unhandled
def _get_task_meta_for(self, task_id):
@@ -111,20 +116,19 @@ class TaskResultCleaner:
TaskModel = CeleryTaskResult
- TASK_TTLS = {
- }
-
- NO_ARCHIVE = {
- }
-
- def __init__(self, expires, bucket=None, delete=True, chunk_size=5000):
- self.bucket = bucket
+ def __init__(self, success_ttl, nonsuccess_ttl=None, delete=True, chunk_size=5000):
self.chunk_size = chunk_size
self.delete = delete
- self.expires = expires
+ self.success_ttl = success_ttl
+ self.nonsuccess_ttl = nonsuccess_ttl or success_ttl
- def get_ttl(self, task_name):
- return timezone.now() - maybe_timedelta(self.TASK_TTLS.get(task_name, self.expires))
+ @property
+ def success_cutoff(self) -> datetime.datetime:
+ return timezone.now() - maybe_timedelta(self.success_ttl)
+
+ @property
+ def nonsuccess_cutoff(self) -> datetime.datetime:
+ return timezone.now() - maybe_timedelta(self.nonsuccess_ttl)
def get_task_names(self):
qs = self.TaskModel.objects.values('task_name').annotate(name=GroupBy('task_name'))
@@ -137,12 +141,15 @@ def get_task_names(self):
def clean(self):
for name in self.get_task_names():
- logger.debug('Looking for succeeded %s tasks modified before %s', name, self.get_ttl(name))
-
- queryset = self.TaskModel.objects.filter(
- task_name=name,
- status=states.SUCCESS,
- date_modified__lt=self.get_ttl(name)
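+            # successful results expire after success_ttl; all other statuses wait for the (longer) nonsuccess_ttl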
+ success_q = Q(status=states.SUCCESS, date_modified__lt=self.success_cutoff)
+ nonsuccess_q = (
+ ~Q(status=states.SUCCESS)
+ & Q(date_modified__lt=self.nonsuccess_cutoff)
+ )
+ queryset = (
+ self.TaskModel.objects
+ .filter(task_name=name)
+ .filter(success_q | nonsuccess_q)
)
if not queryset.exists():
diff --git a/share/exceptions.py b/share/exceptions.py
index 801aedc61..45a26679f 100644
--- a/share/exceptions.py
+++ b/share/exceptions.py
@@ -1,31 +1,3 @@
class ShareException(Exception):
pass
-
-
-class HarvestError(ShareException):
- pass
-
-
-class IngestError(ShareException):
- pass
-
-
-class TransformError(IngestError):
- pass
-
-
-class RegulateError(IngestError):
- pass
-
-
-class MergeRequired(IngestError):
- """A node disambiguated to multiple objects in the database.
- """
- pass
-
-
-class IngestConflict(IngestError):
- """Multiple data being ingested at the same time conflicted.
- """
- pass
diff --git a/share/management/commands/delete_pretrove_data.py b/share/management/commands/delete_pretrove_data.py
deleted file mode 100644
index 5b9c0249f..000000000
--- a/share/management/commands/delete_pretrove_data.py
+++ /dev/null
@@ -1,73 +0,0 @@
-from django.db.models import OuterRef, Exists
-from django.utils.translation import gettext as _
-
-from share.management.commands import BaseShareCommand
-from share import models as _db
-
-
-class Command(BaseShareCommand):
- def add_arguments(self, parser):
- parser.add_argument('--chunksize', type=int, default=1024, help='number of RawData per DELETE')
- parser.add_argument('--really-really', action='store_true', help='skip final confirmation prompt before really deleting')
-
- def handle(self, *args, chunksize: int, really_really: bool, **kwargs):
- # note: `share.transform` deleted; `transformer_key` always null for trove-ingested rdf
- _pretrove_configs = _db.SourceConfig.objects.filter(transformer_key__isnull=False)
- _pretrove_configs_with_rawdata = (
- _pretrove_configs
- .annotate(has_rawdata=Exists(
- _db.RawDatum.objects
- .filter(suid__source_config_id=OuterRef('pk'))
- ))
- .filter(has_rawdata=True)
- )
- if not _pretrove_configs_with_rawdata.exists():
- self.stdout.write(self.style.SUCCESS(_('nothing to delete')))
- return
- _sourceconfig_ids_and_labels = list(
- _pretrove_configs_with_rawdata.values_list('id', 'label'),
- )
- self.stdout.write(self.style.WARNING(_('pre-trove source-configs with deletable rawdata:')))
- for __, _sourceconfig_label in _sourceconfig_ids_and_labels:
- self.stdout.write(f'\t{_sourceconfig_label}')
- if really_really or self.input_confirm(self.style.WARNING(_('really DELETE ALL raw metadata records belonging to these source-configs? (y/n)'))):
- _total_deleted = 0
- for _sourceconfig_id, _sourceconfig_label in _sourceconfig_ids_and_labels:
- _total_deleted += self._do_delete_rawdata(_sourceconfig_id, _sourceconfig_label, chunksize)
- self.stdout.write(self.style.SUCCESS(_('deleted %(count)s items') % {'count': _total_deleted}))
- else:
- self.stdout.write(self.style.SUCCESS(_('deleted nothing')))
-
- def _do_delete_rawdata(self, sourceconfig_id, sourceconfig_label, chunksize) -> int:
- # note: `.delete()` cannot be called on sliced querysets, so chunking is more complicated
- # -- before deleting each chunk, query for its last pk to filter on as a sentinel value
- _prior_sentinel_pk = None
- _deleted_count = 0
- _rawdata_qs = (
- _db.RawDatum.objects
- .filter(suid__source_config_id=sourceconfig_id)
- .order_by('pk') # for consistent chunking
- )
- self.stdout.write(_('%(label)s: deleting all rawdata...') % {'label': sourceconfig_label})
- while True: # for each chunk:
- _pk_qs = _rawdata_qs.values_list('pk', flat=True)
- # get the last pk in the chunk
- _sentinel_pk = _pk_qs[chunksize - 1: chunksize].first() or _pk_qs.last()
- if _sentinel_pk is not None:
- if (_prior_sentinel_pk is not None) and (_sentinel_pk <= _prior_sentinel_pk):
- raise RuntimeError(f'sentinel pks not ascending?? got {_sentinel_pk} after {_prior_sentinel_pk}')
- _prior_sentinel_pk = _sentinel_pk
- _chunk_to_delete = _rawdata_qs.filter(pk__lte=_sentinel_pk)
- _chunk_deleted_count, _by_model = _chunk_to_delete.delete()
- if _by_model and set(_by_model.keys()) != {'share.RawDatum'}:
- raise RuntimeError(f'deleted models other than RawDatum?? {_by_model}')
- self.stdout.write(
- _('%(label)s: deleted %(count)s') % {'label': sourceconfig_label, 'count': _chunk_deleted_count},
- )
- _deleted_count += _chunk_deleted_count
- continue # next chunk
- # end
- self.stdout.write(self.style.SUCCESS(
- _('%(label)s: done; deleted %(count)s') % {'label': sourceconfig_label, 'count': _deleted_count},
- ))
- return _deleted_count
diff --git a/share/migrations/0001_squashed_0058_big_rend.py b/share/migrations/0001_squashed_0058_big_rend.py
index 501fe8044..64b388823 100644
--- a/share/migrations/0001_squashed_0058_big_rend.py
+++ b/share/migrations/0001_squashed_0058_big_rend.py
@@ -11,7 +11,9 @@
import django.utils.timezone
import share.models.core
import share.models.fields
-import share.models.ingest
+import share.models._old
+import share.models.source
+import share.models.source_config
import share.models.validators
import share.version
@@ -135,9 +137,6 @@ class Migration(migrations.Migration):
('date_created', models.DateTimeField(auto_now_add=True)),
('date_modified', models.DateTimeField(auto_now=True)),
],
- managers=[
- ('objects', share.models.ingest.NaturalKeyManager('key')),
- ],
),
migrations.CreateModel(
name='HarvestJob',
@@ -167,10 +166,10 @@ class Migration(migrations.Migration):
('name', models.TextField(unique=True)),
('long_title', models.TextField(unique=True)),
('home_page', models.URLField(null=True)),
- ('icon', models.ImageField(null=True, storage=share.models.ingest.SourceIconStorage(), upload_to=share.models.ingest.icon_name)),
+ ('icon', models.ImageField(null=True, storage=share.models._old.SourceIconStorage(), upload_to=share.models._old.icon_name)),
],
managers=[
- ('objects', share.models.ingest.NaturalKeyManager('name')),
+ ('objects', share.models.source.SourceManager()),
],
),
migrations.CreateModel(
@@ -190,7 +189,7 @@ class Migration(migrations.Migration):
('source', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='share.Source')),
],
managers=[
- ('objects', share.models.ingest.NaturalKeyManager('label')),
+ ('objects', share.models.source_config.SourceConfigManager()),
],
),
migrations.CreateModel(
@@ -217,9 +216,6 @@ class Migration(migrations.Migration):
('date_created', models.DateTimeField(auto_now_add=True)),
('date_modified', models.DateTimeField(auto_now=True)),
],
- managers=[
- ('objects', share.models.ingest.NaturalKeyManager('key')),
- ],
),
migrations.CreateModel(
name='RawDatumJob',
@@ -297,7 +293,7 @@ class Migration(migrations.Migration):
migrations.AlterField(
model_name='source',
name='icon',
- field=models.ImageField(blank=True, default='', storage=share.models.ingest.SourceIconStorage(), upload_to=share.models.ingest.icon_name),
+ field=models.ImageField(blank=True, default='', storage=share.models._old.SourceIconStorage(), upload_to=share.models._old.icon_name),
preserve_default=False,
),
migrations.AddField(
@@ -460,7 +456,7 @@ class Migration(migrations.Migration):
migrations.AlterModelManagers(
name='sourceconfig',
managers=[
- ('objects', share.models.ingest.SourceConfigManager('label')),
+ ('objects', share.models.source_config.SourceConfigManager()),
],
),
migrations.AddField(
diff --git a/share/migrations/0061_ensure_auto_users.py b/share/migrations/0061_ensure_auto_users.py
index e422e7eaf..4a20c30af 100644
--- a/share/migrations/0061_ensure_auto_users.py
+++ b/share/migrations/0061_ensure_auto_users.py
@@ -27,15 +27,15 @@ def ensure_share_system_user(apps, schema_editor):
def ensure_share_admin_user(apps, schema_editor):
- import os
ShareUser = apps.get_model('share', 'ShareUser')
-
- admin_username = 'admin'
- admin_user_exists = ShareUser.objects.filter(username=admin_username).exists()
- if not admin_user_exists:
+ if (
+ settings.SHARE_ADMIN_USERNAME
+ and settings.SHARE_ADMIN_PASSWORD
+ and not ShareUser.objects.filter(username=settings.SHARE_ADMIN_USERNAME).exists()
+ ):
ShareUser.objects.create_superuser(
- admin_username,
- os.environ.get('SHARE_ADMIN_PASSWORD', 'password')
+ settings.SHARE_ADMIN_USERNAME,
+ settings.SHARE_ADMIN_PASSWORD,
)
@@ -48,8 +48,10 @@ class Migration(migrations.Migration):
operations = [
migrations.RunPython(
code=ensure_share_system_user,
+ reverse_code=migrations.RunPython.noop,
),
migrations.RunPython(
code=ensure_share_admin_user,
+ reverse_code=migrations.RunPython.noop,
),
]
diff --git a/share/migrations/0078_delete_rawdatum.py b/share/migrations/0078_delete_rawdatum.py
new file mode 100644
index 000000000..7d1104d3f
--- /dev/null
+++ b/share/migrations/0078_delete_rawdatum.py
@@ -0,0 +1,17 @@
+# Generated by Django 3.2.25 on 2025-05-30 17:29
+
+from django.db import migrations
+
+
+class Migration(migrations.Migration):
+
+ dependencies = [
+ ('share', '0077_big_cleanup_2025'),
+ ('trove', '0009_no_raw_datum'),
+ ]
+
+ operations = [
+ migrations.DeleteModel(
+ name='RawDatum',
+ ),
+ ]
diff --git a/share/models/__init__.py b/share/models/__init__.py
index 338e34ecf..f53ac7b2a 100644
--- a/share/models/__init__.py
+++ b/share/models/__init__.py
@@ -1,21 +1,17 @@
-from share.models.source_unique_identifier import SourceUniqueIdentifier
-from share.models.index_backfill import IndexBackfill
-from share.models.feature_flag import FeatureFlag
-from share.models.core import ShareUser
-from share.models.ingest import (
- Source,
- SourceConfig,
- RawDatum,
-)
from share.models.banner import SiteBanner
from share.models.celery import CeleryTaskResult
+from share.models.core import ShareUser
+from share.models.feature_flag import FeatureFlag
from share.models.fields import DateTimeAwareJSONField
+from share.models.index_backfill import IndexBackfill
+from share.models.source import Source
+from share.models.source_config import SourceConfig
+from share.models.source_unique_identifier import SourceUniqueIdentifier
__all__ = (
'CeleryTaskResult',
'FeatureFlag',
'IndexBackfill',
- 'RawDatum',
'ShareUser',
'SiteBanner',
'Source',
diff --git a/share/models/_old.py b/share/models/_old.py
new file mode 100644
index 000000000..d42a7f8df
--- /dev/null
+++ b/share/models/_old.py
@@ -0,0 +1,9 @@
+# things kept temporarily, until the old migrations that reference them are squashed away
+
+
+def icon_name():
+ ... # removed; stub for past migrations only
+
+
+def SourceIconStorage():
+ ... # removed; stub for past migrations only
diff --git a/share/models/ingest.py b/share/models/ingest.py
deleted file mode 100644
index c5a662c01..000000000
--- a/share/models/ingest.py
+++ /dev/null
@@ -1,275 +0,0 @@
-import datetime
-import hashlib
-import logging
-
-from django.core import validators
-from django.db import connection
-from django.db import models
-from django.db.models.functions import Coalesce
-import sentry_sdk
-
-from share.models.core import ShareUser
-from share.models.fuzzycount import FuzzyCountManager
-from share.models.source_unique_identifier import SourceUniqueIdentifier
-from share.util import chunked, BaseJSONAPIMeta
-
-
-logger = logging.getLogger(__name__)
-__all__ = ('Source', 'SourceConfig', 'RawDatum', )
-
-
-def icon_name():
- ... # removed; stub for past migrations only
-
-
-def SourceIconStorage():
- ... # removed; stub for past migrations only
-
-
-class NaturalKeyManager(models.Manager):
- use_in_migrations = True
-
- def __init__(self, *key_fields):
- super(NaturalKeyManager, self).__init__()
- self.key_fields = key_fields
-
- def get_by_natural_key(self, key):
- return self.get(**dict(zip(self.key_fields, key)))
-
-
-class Source(models.Model):
- name = models.TextField(unique=True)
- long_title = models.TextField(unique=True)
- home_page = models.URLField(null=True, blank=True)
- is_deleted = models.BooleanField(default=False)
-
- # Whether or not this SourceConfig collects original content
- # If True changes made by this source cannot be overwritten
- # This should probably be on SourceConfig but placing it on Source
- # is much easier for the moment.
- # I also haven't seen a situation where a Source has two feeds that we harvest
- # where one provider unreliable metadata but the other does not.
- canonical = models.BooleanField(default=False, db_index=True)
-
- # TODO replace with object permissions, allow multiple sources per user (SHARE-996)
- user = models.OneToOneField('ShareUser', null=True, on_delete=models.CASCADE)
-
- objects = NaturalKeyManager('name')
-
- class JSONAPIMeta(BaseJSONAPIMeta):
- pass
-
- def natural_key(self):
- return (self.name,)
-
- def __repr__(self):
- return '<{}({}, {}, {})>'.format(self.__class__.__name__, self.pk, self.name, self.long_title)
-
- def __str__(self):
- return repr(self)
-
-
-class SourceConfigManager(NaturalKeyManager):
- def get_or_create_push_config(self, user, transformer_key=None):
- assert isinstance(user, ShareUser)
- _config_label = '.'.join((
- user.username,
- transformer_key or 'rdf', # TODO: something cleaner?
- ))
- try:
- _config = SourceConfig.objects.get(label=_config_label)
- except SourceConfig.DoesNotExist:
- _source, _ = Source.objects.get_or_create(
- user_id=user.id,
- defaults={
- 'name': user.username,
- 'long_title': user.username,
- }
- )
- _config, _ = SourceConfig.objects.get_or_create(
- label=_config_label,
- defaults={
- 'source': _source,
- 'transformer_key': transformer_key,
- }
- )
- assert _config.source.user_id == user.id
- assert _config.transformer_key == transformer_key
- return _config
-
-
-class SourceConfig(models.Model):
- # Previously known as the provider's app_label
- label = models.TextField(unique=True)
- version = models.PositiveIntegerField(default=1)
-
- source = models.ForeignKey('Source', on_delete=models.CASCADE, related_name='source_configs')
- base_url = models.URLField(null=True)
- transformer_key = models.TextField(null=True)
-
- disabled = models.BooleanField(default=False)
-
- objects = SourceConfigManager('label')
-
- class JSONAPIMeta(BaseJSONAPIMeta):
- pass
-
- def natural_key(self):
- return (self.label,)
-
- def __repr__(self):
- return '<{}({}, {})>'.format(self.__class__.__name__, self.pk, self.label)
-
- __str__ = __repr__
-
-
-class RawDatumManager(FuzzyCountManager):
-
- def link_to_job(self, job, datum_ids):
- if not datum_ids:
- return True
- logger.debug('Linking RawData to %r', job)
- with connection.cursor() as cursor:
- for chunk in chunked(datum_ids, size=500):
- if not chunk:
- break
- cursor.execute('''
- INSERT INTO "{table}"
- ("{rawdatum}", "{harvestjob}")
- VALUES
- {values}
- ON CONFLICT ("{rawdatum}", "{harvestjob}") DO NOTHING;
- '''.format(
- values=', '.join('%s' for _ in range(len(chunk))), # Nasty hack. Fix when psycopg2 2.7 is released with execute_values
- table=RawDatum.jobs.through._meta.db_table,
- rawdatum=RawDatum.jobs.through._meta.get_field('rawdatum').column,
- harvestjob=RawDatum.jobs.through._meta.get_field('harvestjob').column,
- ), [(raw_id, job.id) for raw_id in chunk])
- return True
-
- def store_datum_for_suid(
- self,
- *,
- suid,
- datum: str,
- mediatype: str,
- datestamp: datetime.datetime,
- expiration_date: datetime.date | None = None,
- ):
- _raw, _raw_created = self.get_or_create(
- suid=suid,
- sha256=hashlib.sha256(datum.encode()).hexdigest(),
- defaults={
- 'datum': datum,
- 'mediatype': mediatype,
- 'datestamp': datestamp,
- 'expiration_date': expiration_date,
- },
- )
- if not _raw_created:
- if _raw.datum != datum:
- _msg = f'hash collision!? {_raw.sha256}\n===\n{_raw.datum}\n===\n{datum}'
- logger.critical(_msg)
- sentry_sdk.capture_message(_msg)
- _raw.mediatype = mediatype
- _raw.expiration_date = expiration_date
- # keep the latest datestamp
- if (not _raw.datestamp) or (datestamp > _raw.datestamp):
- _raw.datestamp = datestamp
- _raw.save(update_fields=('mediatype', 'datestamp', 'expiration_date', 'date_modified'))
- return _raw
-
- def latest_by_suid_id(self, suid_id) -> models.QuerySet:
- return self.latest_by_suid_queryset(
- SourceUniqueIdentifier.objects.filter(id=suid_id),
- )
-
- def latest_by_suid_queryset(self, suid_queryset) -> models.QuerySet:
- return self.filter(id__in=(
- suid_queryset
- .annotate(latest_rawdatum_id=models.Subquery(
- RawDatum.objects
- .filter(suid_id=models.OuterRef('id'))
- .order_by(Coalesce('datestamp', 'date_created').desc(nulls_last=True))
- .values('id')
- [:1]
- ))
- .values('latest_rawdatum_id')
- ))
-
- def latest_for_each_suid(self) -> models.QuerySet:
- # only the latest datum for each described resource
- _latest_pk_subquery = models.Subquery(
- self.filter(suid_id=models.OuterRef('suid_id'))
- .order_by(Coalesce('datestamp', 'date_created').desc(nulls_last=True))
- .values('pk')
- [:1]
- )
- return self.annotate(
- latest_same_suid=_latest_pk_subquery,
- ).filter(pk=models.F('latest_same_suid'))
-
-
-class RawDatum(models.Model):
-
- datum = models.TextField()
- mediatype = models.TextField(null=True, blank=True)
-
- suid = models.ForeignKey(SourceUniqueIdentifier, on_delete=models.CASCADE, related_name='raw_data')
-
- # The sha256 of the datum
- sha256 = models.TextField(validators=[validators.MaxLengthValidator(64)])
-
- datestamp = models.DateTimeField(null=True, help_text=(
- 'The most relevant datetime that can be extracted from this RawDatum. '
- 'This may be, but is not limited to, a deletion, modification, publication, or creation datestamp. '
- 'Ideally, this datetime should be appropriate for determining the chronological order its data will be applied.'
- ))
- expiration_date = models.DateField(
- null=True,
- blank=True,
- help_text='An (optional) date after which this datum is no longer valid.',
- )
-
- date_modified = models.DateTimeField(auto_now=True, editable=False)
- date_created = models.DateTimeField(auto_now_add=True, editable=False)
-
- no_output = models.BooleanField(null=True, help_text=(
- 'Indicates that this RawDatum resulted in an empty graph when transformed. '
- 'This allows the RawDataJanitor to find records that have not been processed. '
- 'Records that result in an empty graph will not have an Indexcard associated with them, '
- 'which would otherwise look like data that has not yet been processed.'
- ))
-
- objects = RawDatumManager()
-
- def is_latest(self):
- return (
- RawDatum.objects
- .latest_by_suid_id(self.suid_id)
- .filter(id=self.id)
- .exists()
- )
-
- @property
- def is_expired(self) -> bool:
- return (
- self.expiration_date is not None
- and self.expiration_date <= datetime.date.today()
- )
-
- class Meta:
- unique_together = ('suid', 'sha256')
- verbose_name_plural = 'Raw Data'
- indexes = [
- models.Index(fields=['no_output'], name='share_rawda_no_outp_f0330f_idx'),
- models.Index(fields=['expiration_date'], name='share_rawdatum_expiration_idx'),
- ]
-
- class JSONAPIMeta(BaseJSONAPIMeta):
- resource_name = 'RawData'
-
- def __repr__(self):
- return '<{}({}, {}, {}...)>'.format(self.__class__.__name__, self.id, self.datestamp, self.sha256[:10])
-
- __str__ = __repr__
diff --git a/share/models/source.py b/share/models/source.py
new file mode 100644
index 000000000..34cef8b61
--- /dev/null
+++ b/share/models/source.py
@@ -0,0 +1,45 @@
+from django.db import models
+
+from share.util import BaseJSONAPIMeta
+
+
+__all__ = ('Source', 'SourceManager',)
+
+
+class SourceManager(models.Manager):
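+    # natural-key lookups by 'name'; use_in_migrations so historical migrations can still reference this manager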
+ use_in_migrations = True
+
+ def get_by_natural_key(self, key):
+ return self.get(name=key)
+
+
+class Source(models.Model):
+ name = models.TextField(unique=True)
+ long_title = models.TextField(unique=True)
+ home_page = models.URLField(null=True, blank=True)
+ is_deleted = models.BooleanField(default=False)
+
+ # Whether or not this SourceConfig collects original content
+ # If True changes made by this source cannot be overwritten
+ # This should probably be on SourceConfig but placing it on Source
+ # is much easier for the moment.
+ # I also haven't seen a situation where a Source has two feeds that we harvest
+    # where one provides unreliable metadata but the other does not.
+ canonical = models.BooleanField(default=False, db_index=True)
+
+ # TODO replace with object permissions, allow multiple sources per user (SHARE-996)
+ user = models.OneToOneField('ShareUser', null=True, on_delete=models.CASCADE)
+
+ objects = SourceManager()
+
+ class JSONAPIMeta(BaseJSONAPIMeta):
+ pass
+
+ def natural_key(self):
+ return (self.name,)
+
+ def __repr__(self):
+ return '<{}({}, {}, {})>'.format(self.__class__.__name__, self.pk, self.name, self.long_title)
+
+ def __str__(self):
+ return repr(self)
diff --git a/share/models/source_config.py b/share/models/source_config.py
new file mode 100644
index 000000000..a23dfcaf9
--- /dev/null
+++ b/share/models/source_config.py
@@ -0,0 +1,68 @@
+
+from django.db import models
+
+from share.models.core import ShareUser
+from share.models.source import Source
+from share.util import BaseJSONAPIMeta
+
+
+__all__ = ('SourceConfig',)
+
+
+class SourceConfigManager(models.Manager):
+ use_in_migrations = True
+
+ def get_by_natural_key(self, key):
+ return self.get(label=key)
+
+ def get_or_create_push_config(self, user, transformer_key=None):
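+        # one push-ingest SourceConfig per (user, transformer_key); creates a Source for the user if needed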
+ assert isinstance(user, ShareUser)
+ _config_label = '.'.join((
+ user.username,
+ transformer_key or 'rdf', # TODO: something cleaner?
+ ))
+ try:
+ _config = SourceConfig.objects.get(label=_config_label)
+ except SourceConfig.DoesNotExist:
+ _source, _ = Source.objects.get_or_create(
+ user_id=user.id,
+ defaults={
+ 'name': user.username,
+ 'long_title': user.username,
+ }
+ )
+ _config, _ = SourceConfig.objects.get_or_create(
+ label=_config_label,
+ defaults={
+ 'source': _source,
+ 'transformer_key': transformer_key,
+ }
+ )
+ assert _config.source.user_id == user.id
+ assert _config.transformer_key == transformer_key
+ return _config
+
+
+class SourceConfig(models.Model):
+ # Previously known as the provider's app_label
+ label = models.TextField(unique=True)
+ version = models.PositiveIntegerField(default=1)
+
+ source = models.ForeignKey('Source', on_delete=models.CASCADE, related_name='source_configs')
+ base_url = models.URLField(null=True)
+ transformer_key = models.TextField(null=True)
+
+ disabled = models.BooleanField(default=False)
+
+ objects = SourceConfigManager()
+
+ class JSONAPIMeta(BaseJSONAPIMeta):
+ pass
+
+ def natural_key(self):
+ return (self.label,)
+
+ def __repr__(self):
+ return '<{}({}, {})>'.format(self.__class__.__name__, self.pk, self.label)
+
+ __str__ = __repr__
diff --git a/share/models/source_unique_identifier.py b/share/models/source_unique_identifier.py
index bc3bbaf5e..05c6eb7d5 100644
--- a/share/models/source_unique_identifier.py
+++ b/share/models/source_unique_identifier.py
@@ -1,6 +1,3 @@
-import datetime
-from typing import Optional
-
from django.db import models
from share.util import BaseJSONAPIMeta
@@ -22,28 +19,6 @@ class JSONAPIMeta(BaseJSONAPIMeta):
class Meta:
unique_together = ('identifier', 'source_config')
- def most_recent_raw_datum(self):
- """fetch the most recent RawDatum for this suid
- """
- return self._most_recent_raw_datum_queryset().first()
-
- def most_recent_raw_datum_id(self):
- return self._most_recent_raw_datum_queryset().values_list('id', flat=True).first()
-
- def _most_recent_raw_datum_queryset(self):
- from share.models import RawDatum
- return RawDatum.objects.latest_by_suid_id(self.id)
-
- def get_date_first_seen(self) -> Optional[datetime.datetime]:
- """when the first RawDatum for this suid was added
- """
- return (
- self.raw_data
- .order_by('date_created')
- .values_list('date_created', flat=True)
- .first()
- )
-
def get_backcompat_sharev2_suid(self):
'''get an equivalent "v2_push" suid for this suid
diff --git a/share/oaipmh/indexcard_repository.py b/share/oaipmh/indexcard_repository.py
index 76de6255e..d9d855f75 100644
--- a/share/oaipmh/indexcard_repository.py
+++ b/share/oaipmh/indexcard_repository.py
@@ -106,7 +106,7 @@ def oai_identifier(self, indexcard):
def _do_identify(self, kwargs, renderer):
_earliest_date = (
- trove_db.LatestIndexcardRdf.objects
+ trove_db.LatestResourceDescription.objects
.order_by('modified')
.values_list('modified', flat=True)
.first()
@@ -213,7 +213,7 @@ def _get_indexcard_page_queryset(self, kwargs, catch=True, last_id=None):
self.errors.append(oai_errors.BadArgument('Invalid value for', 'from'))
else:
_indexcard_queryset = _indexcard_queryset.filter(
- trove_latestindexcardrdf_set__modified__gte=_from,
+ trove_latestresourcedescription_set__modified__gte=_from,
)
if 'until' in kwargs:
try:
@@ -224,7 +224,7 @@ def _get_indexcard_page_queryset(self, kwargs, catch=True, last_id=None):
self.errors.append(oai_errors.BadArgument('Invalid value for', 'until'))
else:
_indexcard_queryset = _indexcard_queryset.filter(
- trove_latestindexcardrdf_set__modified__lte=_until,
+ trove_latestresourcedescription_set__modified__lte=_until,
)
if 'set' in kwargs:
_sourceconfig_ids = tuple(
@@ -246,7 +246,7 @@ def _get_base_indexcard_queryset(self):
def _get_indexcard_queryset_with_annotations(self):
return self._get_base_indexcard_queryset().annotate(
oai_datestamp=Subquery(
- trove_db.LatestIndexcardRdf.objects
+ trove_db.LatestResourceDescription.objects
.filter(indexcard_id=OuterRef('id'))
.values_list('modified', flat=True)
[:1]
diff --git a/share/search/index_strategy/_trovesearch_util.py b/share/search/index_strategy/_trovesearch_util.py
index e38872712..d83f5b16d 100644
--- a/share/search/index_strategy/_trovesearch_util.py
+++ b/share/search/index_strategy/_trovesearch_util.py
@@ -46,9 +46,9 @@
###
# utilities
-def latest_rdf_for_indexcard_pks(indexcard_pks):
+def latest_resource_description_for_indexcard_pks(indexcard_pks):
return (
- trove_db.LatestIndexcardRdf.objects
+ trove_db.LatestResourceDescription.objects
.filter(indexcard_id__in=indexcard_pks)
.filter(Exists( # only index items that have an osfmap_json representation
trove_db.DerivedIndexcard.objects
@@ -61,7 +61,7 @@ def latest_rdf_for_indexcard_pks(indexcard_pks):
.exclude(indexcard__deleted__isnull=False)
.select_related('indexcard__source_record_suid__source_config')
.prefetch_related('indexcard__focus_identifier_set')
- .prefetch_related('indexcard__supplementary_rdf_set')
+ .prefetch_related('indexcard__supplementary_description_set')
)
diff --git a/share/search/index_strategy/trovesearch_denorm.py b/share/search/index_strategy/trovesearch_denorm.py
index a65eb776f..19cea8d80 100644
--- a/share/search/index_strategy/trovesearch_denorm.py
+++ b/share/search/index_strategy/trovesearch_denorm.py
@@ -224,12 +224,12 @@ def after_chunk(self, messages_chunk: messages.MessagesChunk, affected_indexname
# abstract method from Elastic8IndexStrategy
def build_elastic_actions(self, messages_chunk: messages.MessagesChunk):
- _indexcard_rdf_qs = ts.latest_rdf_for_indexcard_pks(messages_chunk.target_ids_chunk)
+ _resource_description_qs = ts.latest_resource_description_for_indexcard_pks(messages_chunk.target_ids_chunk)
_remaining_indexcard_pks = set(messages_chunk.target_ids_chunk)
- for _indexcard_rdf in _indexcard_rdf_qs:
- _docbuilder = self._SourcedocBuilder(_indexcard_rdf, messages_chunk.timestamp)
+ for _resource_description in _resource_description_qs:
+ _docbuilder = self._SourcedocBuilder(_resource_description, messages_chunk.timestamp)
if not _docbuilder.should_skip(): # if skipped, will be deleted
- _indexcard_pk = _indexcard_rdf.indexcard_id
+ _indexcard_pk = _resource_description.indexcard_id
_cardsearch_actions = (
self.build_index_action(_doc_id, _doc)
for _doc_id, _doc in _docbuilder.build_cardsearch_docs()
@@ -333,16 +333,16 @@ def pls_handle_valuesearch(self, valuesearch_params: ValuesearchParams) -> Value
class _SourcedocBuilder:
'''build elasticsearch sourcedocs for an rdf document
'''
- indexcard_rdf: trove_db.IndexcardRdf
+ resource_description: trove_db.ResourceDescription
chunk_timestamp: int
indexcard: trove_db.Indexcard = dataclasses.field(init=False)
focus_iri: str = dataclasses.field(init=False)
rdfdoc: rdf.RdfTripleDictionary = dataclasses.field(init=False)
def __post_init__(self) -> None:
- self.indexcard = self.indexcard_rdf.indexcard
- self.focus_iri = self.indexcard_rdf.focus_iri
- self.rdfdoc = self.indexcard_rdf.as_rdfdoc_with_supplements()
+ self.indexcard = self.resource_description.indexcard
+ self.focus_iri = self.resource_description.focus_iri
+ self.rdfdoc = self.resource_description.as_rdfdoc_with_supplements()
def should_skip(self) -> bool:
_suid = self.indexcard.source_record_suid
diff --git a/share/util/checksum_iri.py b/share/util/checksum_iri.py
index 012fdbab2..552aeb91c 100644
--- a/share/util/checksum_iri.py
+++ b/share/util/checksum_iri.py
@@ -12,7 +12,7 @@ def _ensure_bytes(bytes_or_something) -> bytes:
def _builtin_checksum(hash_constructor):
- def hexdigest_fn(salt, data) -> str:
+ def hexdigest_fn(salt: str | bytes, data: str | bytes) -> str:
hasher = hash_constructor()
hasher.update(_ensure_bytes(salt))
hasher.update(_ensure_bytes(data))
@@ -37,7 +37,7 @@ def __str__(self):
return f'urn:checksum:{self.checksumalgorithm_name}:{self.salt}:{self.hexdigest}'
@classmethod
- def digest(cls, checksumalgorithm_name, *, salt, raw_data):
+ def digest(cls, checksumalgorithm_name: str, *, salt: str, data: str):
try:
hexdigest_fn = CHECKSUM_ALGORITHMS[checksumalgorithm_name]
except KeyError:
@@ -48,7 +48,7 @@ def digest(cls, checksumalgorithm_name, *, salt, raw_data):
return cls(
checksumalgorithm_name=checksumalgorithm_name,
salt=salt,
- hexdigest=hexdigest_fn(salt, raw_data),
+ hexdigest=hexdigest_fn(salt, data),
)
@classmethod
@@ -56,7 +56,7 @@ def digest_json(cls, checksumalgorithm_name, *, salt, raw_json):
return cls.digest(
checksumalgorithm_name,
salt=salt,
- raw_data=json.dumps(raw_json, sort_keys=True),
+ data=json.dumps(raw_json, sort_keys=True),
)
@classmethod
diff --git a/share/version.py b/share/version.py
index b1fb40a82..191d57ff7 100644
--- a/share/version.py
+++ b/share/version.py
@@ -1,4 +1,4 @@
-__version__ = '25.3.3'
+__version__ = '25.4.0'
def get_share_version() -> str:
diff --git a/templates/admin/start-ingest.html b/templates/admin/start-ingest.html
deleted file mode 100644
index f0008471a..000000000
--- a/templates/admin/start-ingest.html
+++ /dev/null
@@ -1,26 +0,0 @@
-{% extends "admin/base_site.html" %}
-{% load i18n %}
-
-{% block extrastyle %}
-
-{% endblock %}
-
-{% block content %}
-{% trans "schedule full (re)ingest" %}
-
- for source config "{{ source_config.label }}"
-
-
-{% endblock %}
diff --git a/tests/api/test_generated_endpoints.py b/tests/api/test_generated_endpoints.py
index d0605f6c8..ab4c10902 100644
--- a/tests/api/test_generated_endpoints.py
+++ b/tests/api/test_generated_endpoints.py
@@ -5,57 +5,63 @@
# TODO these tests belong somewhere else
@pytest.mark.django_db
-@pytest.mark.parametrize('endpoint, factory', [
- ('rawdata', factories.RawDatumFactory),
+@pytest.mark.parametrize('endpoint, factory, autocreated_count', [
+ ('site_banners', factories.SiteBannerFactory, 0),
+ ('sourceconfigs', factories.SourceConfigFactory, 0),
+ ('sources', factories.SourceFactory, 1),
])
class TestPagination:
- def test_no_prev(self, client, endpoint, factory):
+ def test_no_prev(self, client, endpoint, factory, autocreated_count):
resp = client.get('/api/v2/{}/'.format(endpoint))
assert resp.status_code == 200
- assert resp.json()['data'] == []
- assert resp.json()['links']['prev'] is None
- assert resp.json()['links']['next'] is None
+ _json = resp.json()
+ assert len(_json['data']) == autocreated_count
+ _links = _json.get('links', {})
+ assert _links.get('prev') is None
+ assert _links.get('next') is None
- def test_one(self, client, endpoint, factory):
+ def test_one(self, client, endpoint, factory, autocreated_count):
factory()
resp = client.get('/api/v2/{}/'.format(endpoint))
assert resp.status_code == 200
- assert len(resp.json()['data']) == 1
- assert resp.json()['links']['prev'] is None
- assert resp.json()['links']['next'] is None
-
- def test_full_page(self, client, endpoint, factory):
- for _ in range(10):
- factory()
+ _json = resp.json()
+ assert len(_json['data']) == autocreated_count + 1
+ _links = _json.get('links', {})
+ assert _links.get('prev') is None
+ assert _links.get('next') is None
+ def test_full_page(self, client, endpoint, factory, autocreated_count):
+ factory.create_batch(10 - autocreated_count)
resp = client.get('/api/v2/{}/'.format(endpoint))
assert resp.status_code == 200
+ _json = resp.json()
+ assert len(_json['data']) == 10
+ _links = _json.get('links', {})
+ assert _links.get('prev') is None
+ assert _links.get('next') is None
- assert len(resp.json()['data']) == 10
- assert resp.json()['links']['prev'] is None
- assert resp.json()['links']['next'] is None
-
- def test_next_page(self, client, endpoint, factory):
- for _ in range(20):
- factory()
-
+ def test_next_page(self, client, endpoint, factory, autocreated_count):
+ factory.create_batch(20 - autocreated_count)
resp = client.get('/api/v2/{}/'.format(endpoint))
assert resp.status_code == 200
- assert len(resp.json()['data']) == 10
- assert resp.json()['links']['prev'] is None
- assert resp.json()['links']['next'] is not None
- assert 'page%5Bcursor%5D' in resp.json()['links']['next']
+ _json = resp.json()
+ assert len(_json['data']) == 10
+ _links = _json.get('links', {})
+ assert _links.get('prev') is None
+ assert _links.get('next') is not None
+ assert 'page%5Bcursor%5D' in _links['next']
- resp2 = client.get(resp.json()['links']['next'])
+ resp2 = client.get(_links['next'])
assert resp2.status_code == 200
- assert resp2.json()['links']['next'] is None
+ _json2 = resp2.json()
+ assert _json2['links'].get('next') is None
- assert set(x['id'] for x in resp.json()['data']) & set(x['id'] for x in resp2.json()['data']) == set()
+ assert set(x['id'] for x in _json['data']) & set(x['id'] for x in _json2['data']) == set()
- def test_bad_cursor(self, client, endpoint, factory):
+ def test_bad_cursor(self, client, endpoint, factory, autocreated_count):
resp = client.get(f'/api/v2/{endpoint}/', {'page[cursor]': 1})
assert resp.status_code == 404
assert resp.json() == {'errors': [{
diff --git a/tests/api/test_readonly_endpoints.py b/tests/api/test_readonly_endpoints.py
index 57fd600ee..614207ccd 100644
--- a/tests/api/test_readonly_endpoints.py
+++ b/tests/api/test_readonly_endpoints.py
@@ -19,22 +19,6 @@ def get_test_data(endpoint_type):
return test_data
-@pytest.mark.django_db
-class TestRawDataEndpoint:
- endpoint = '/api/v2/rawdata/'
-
- def test_status(self, client):
- assert client.get(self.endpoint).status_code == 200
-
- def test_post(self, client, trusted_user):
- assert client.post(
- self.endpoint,
- json.dumps(get_test_data('RawData')),
- content_type='application/vnd.api+json',
- HTTP_AUTHORIZATION='Bearer ' + trusted_user.oauth2_provider_accesstoken.first().token,
- ).status_code == 405
-
-
@pytest.mark.django_db
class TestSiteBannersEndpoint:
endpoint = '/api/v2/site_banners/'
diff --git a/tests/conftest.py b/tests/conftest.py
index 61eef76dd..6fe8424cd 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -12,7 +12,6 @@
from oauth2_provider.models import AccessToken, Application
-from share.models import RawDatum
from share.models import ShareUser
from share.models import SourceUniqueIdentifier
@@ -96,18 +95,6 @@ def suid(source_config):
return suid
-@pytest.fixture
-def raw_data(suid):
- raw_data = RawDatum(suid=suid, datum='{}')
- raw_data.save()
- return raw_data
-
-
-@pytest.fixture
-def raw_data_id(raw_data):
- return raw_data.id
-
-
@contextlib.contextmanager
def rolledback_transaction(loglabel):
class ExpectedRollback(Exception):
diff --git a/tests/factories/__init__.py b/tests/factories/__init__.py
index d23f4f316..84c3c8300 100644
--- a/tests/factories/__init__.py
+++ b/tests/factories/__init__.py
@@ -1,4 +1,3 @@
-import hashlib
import uuid
import factory
@@ -51,25 +50,15 @@ class Meta:
model = share_db.SourceUniqueIdentifier
-class RawDatumFactory(DjangoModelFactory):
- datum = factory.Sequence(lambda n: f'{n}{fake.text()}')
- suid = factory.SubFactory(SourceUniqueIdentifierFactory)
- sha256 = factory.LazyAttribute(lambda r: hashlib.sha256(r.datum.encode()).hexdigest())
+class SiteBannerFactory(DjangoModelFactory):
+ title = factory.Faker('word')
+ description = factory.Faker('sentence')
+ color = fuzzy.FuzzyChoice(list(share_db.SiteBanner.COLOR.keys()))
+ created_by = factory.SubFactory(ShareUserFactory)
+ last_modified_by = factory.SubFactory(ShareUserFactory)
class Meta:
- model = share_db.RawDatum
-
- @classmethod
- def _generate(cls, create, attrs):
- raw_datum = super()._generate(create, attrs)
-
- # HACK: allow overriding auto_now_add on date_created
- date_created = attrs.pop('date_created', None)
- if date_created is not None:
- raw_datum.date_created = date_created
- raw_datum.save()
-
- return raw_datum
+ model = share_db.SiteBanner
class CeleryTaskResultFactory(DjangoModelFactory):
@@ -99,15 +88,14 @@ class Meta:
model = trove_db.Indexcard
-class LatestIndexcardRdfFactory(DjangoModelFactory):
- from_raw_datum = factory.SubFactory(RawDatumFactory)
+class LatestResourceDescriptionFactory(DjangoModelFactory):
indexcard = factory.SubFactory(IndexcardFactory)
focus_iri = factory.Sequence(lambda x: f'http://test.example/{x}')
+    rdf_as_turtle = factory.Sequence(lambda x: f'<http://test.example/{x}> a <http://test.example/Thing>')
# turtle_checksum_iri =
class Meta:
- model = trove_db.LatestIndexcardRdf
+ model = trove_db.LatestResourceDescription
class DerivedIndexcardFactory(DjangoModelFactory):
diff --git a/tests/share/models/test_rawdata.py b/tests/share/models/test_rawdata.py
deleted file mode 100644
index 4c046f89d..000000000
--- a/tests/share/models/test_rawdata.py
+++ /dev/null
@@ -1,94 +0,0 @@
-import datetime
-import pytest
-import hashlib
-
-from django.core import exceptions
-from django.db.utils import IntegrityError
-
-from share.models import RawDatum
-
-
-def get_now():
- return datetime.datetime.now(tz=datetime.timezone.utc)
-
-
-@pytest.mark.django_db
-class TestRawDatum:
-
- def test_doesnt_mangle_data(self, suid):
- rd = RawDatum(suid=suid, datum='This is just some data')
- rd.save()
-
- assert RawDatum.objects.first().datum == 'This is just some data'
-
- def test_must_have_data(self, suid):
- rd = RawDatum(suid)
-
- with pytest.raises(exceptions.ValidationError) as e:
- rd.clean_fields()
- rd.save()
-
- assert 'This field cannot be blank.' == e.value.message_dict['datum'][0]
-
- def test_must_have_suid(self):
- rd = RawDatum(datum='SomeData')
-
- with pytest.raises(IntegrityError) as e:
- rd.save()
-
- assert 'null value in column "suid_id"' in e.value.args[0]
-
- def test_store_data_by_suid(self, suid):
- _now = get_now()
- rd = RawDatum.objects.store_datum_for_suid(
- suid=suid,
- datum='mydatums',
- mediatype='text/plain',
- datestamp=_now,
- )
-
- assert rd.date_modified is not None
- assert rd.date_created is not None
-
- assert rd.datum == 'mydatums'
- assert rd.datestamp == _now
- assert rd.suid_id == suid.id
- assert rd.sha256 == hashlib.sha256(b'mydatums').hexdigest()
-
- def test_store_data_dedups_simple(self, suid):
- rd1 = RawDatum.objects.store_datum_for_suid(
- suid=suid,
- datum='mydatums',
- mediatype='text/plain',
- datestamp=get_now(),
- )
- rd2 = RawDatum.objects.store_datum_for_suid(
- suid=suid,
- datum='mydatums',
- mediatype='text/plain',
- datestamp=get_now(),
- )
- rd3 = RawDatum.objects.store_datum_for_suid(
- suid=suid,
- datum='mydatums',
- mediatype='text/plain',
- datestamp=get_now(),
- )
-
- assert rd1.pk == rd2.pk == rd3.pk
- assert rd1.sha256 == rd2.sha256 == rd3.sha256
- assert rd1.datestamp < rd2.datestamp < rd3.datestamp < get_now()
- assert rd1.date_created == rd2.date_created == rd3.date_created
- assert rd1.date_modified < rd2.date_modified < rd3.date_modified
-
- def test_is_expired(self):
- rd = RawDatum()
- assert rd.expiration_date is None
- assert not rd.is_expired
- _today = datetime.date.today()
- rd.expiration_date = datetime.date(_today.year - 1, _today.month, _today.day)
- assert rd.is_expired
- rd.expiration_date = datetime.date(_today.year, _today.month, _today.day)
- assert rd.is_expired
- rd.expiration_date = datetime.date(_today.year + 1, _today.month, _today.day)
- assert not rd.is_expired
diff --git a/tests/share/models/test_suid.py b/tests/share/models/test_suid.py
deleted file mode 100644
index a6cdcc394..000000000
--- a/tests/share/models/test_suid.py
+++ /dev/null
@@ -1,55 +0,0 @@
-import pytest
-
-from tests.factories import (
- RawDatumFactory,
- SourceUniqueIdentifierFactory,
-)
-
-
-@pytest.mark.django_db
-class TestSourceUniqueIdentifier:
-
- def test_most_recent_raw_datum(self):
- suid = SourceUniqueIdentifierFactory()
-
- RawDatumFactory(suid=suid, datestamp=None, date_created='2021-01-01 00:00Z')
- expected = RawDatumFactory(suid=suid, datestamp='2021-01-04 00:00Z')
- RawDatumFactory(suid=suid, datestamp='2021-01-01 00:00Z')
- RawDatumFactory(suid=suid, datestamp='2021-01-02 00:00Z')
- RawDatumFactory(suid=suid, datestamp='2021-01-03 00:00Z')
-
- actual = suid.most_recent_raw_datum()
- assert expected == actual
-
- def test_most_recent_raw_datum__datestamp_wins(self):
- suid = SourceUniqueIdentifierFactory()
-
- RawDatumFactory(suid=suid, datestamp='2021-01-01 00:00Z', date_created='2021-01-02 00:00Z')
- expected = RawDatumFactory(suid=suid, datestamp='2021-01-02 00:00Z', date_created='2021-01-01 00:00Z')
-
- actual = suid.most_recent_raw_datum()
- assert expected == actual
-
- def test_most_recent_raw_datum_no_datestamps(self):
- suid = SourceUniqueIdentifierFactory()
-
- expected = RawDatumFactory(suid=suid, datestamp=None, date_created='2021-01-02 00:00Z')
- RawDatumFactory(suid=suid, datestamp=None, date_created='2021-01-01 00:00Z')
-
- actual = suid.most_recent_raw_datum()
- assert expected == actual
-
- def test_date_first_seen(self):
- suid = SourceUniqueIdentifierFactory()
-
- expected = RawDatumFactory(suid=suid).date_created
- for _ in range(7):
- RawDatumFactory(suid=suid)
-
- actual = suid.get_date_first_seen()
- assert expected == actual
-
- def test_date_first_seen_when_no_data(self):
- suid = SourceUniqueIdentifierFactory()
- actual = suid.get_date_first_seen()
- assert actual is None
diff --git a/tests/share/test_celery.py b/tests/share/test_celery.py
index 51b6d2721..af2d49b11 100644
--- a/tests/share/test_celery.py
+++ b/tests/share/test_celery.py
@@ -1,49 +1,72 @@
-import pytest
-import datetime
-
+import contextlib
+from datetime import timedelta
from unittest import mock
+import pytest
from django.utils import timezone
from share.celery import TaskResultCleaner, CeleryTaskResult
-
from tests import factories
-@pytest.mark.usefixtures('nested_django_db')
-class TestResultArchiver:
+@contextlib.contextmanager
+def long_now(new_now=None):
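+    # pin timezone.now() to one moment so model timestamps and cleaner cutoffs are deterministic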
+ _now = new_now or timezone.now()
+ with mock.patch.object(timezone, 'now', return_value=_now):
+ yield _now
- @pytest.fixture(scope='class', autouse=True)
- def task_result_data(self, class_scoped_django_db):
- return factories.CeleryTaskResultFactory.create_batch(100)
+
+@pytest.mark.django_db
+class TestResultCleaner:
def test_delete_false(self):
- trc = TaskResultCleaner(datetime.timedelta(weeks=520), delete=False)
+ factories.CeleryTaskResultFactory.create_batch(10)
+ trc = TaskResultCleaner(timedelta(weeks=520), delete=False)
assert trc.delete_queryset(CeleryTaskResult.objects.all()) == 0
- assert CeleryTaskResult.objects.count() != 0
+ assert CeleryTaskResult.objects.count() == 10
def test_delete_queryset(self):
- trc = TaskResultCleaner(datetime.timedelta(weeks=520))
- assert trc.delete_queryset(CeleryTaskResult.objects.all()) == 100
+ factories.CeleryTaskResultFactory.create_batch(10)
+ trc = TaskResultCleaner(timedelta(weeks=520))
+ assert trc.delete_queryset(CeleryTaskResult.objects.all()) == 10
assert CeleryTaskResult.objects.count() == 0
- def test_get_ttl_default(self):
- trc = TaskResultCleaner(datetime.timedelta(weeks=520))
- assert ((timezone.now() - datetime.timedelta(weeks=520)) - trc.get_ttl('non-existant-task')) < datetime.timedelta(seconds=2)
-
- def test_get_ttl(self):
- trc = TaskResultCleaner(datetime.timedelta(weeks=520))
- trc.TASK_TTLS['existant-task'] = datetime.timedelta(days=1)
- assert ((timezone.now() - datetime.timedelta(days=1)) - trc.get_ttl('existant-task')) < datetime.timedelta(seconds=2)
-
- def test_clean(self):
- trc = TaskResultCleaner(0, bucket=mock.Mock())
- factories.CeleryTaskResultFactory.create_batch(100, status='SUCCESS')
- trc.clean()
- assert CeleryTaskResult.objects.count() <= 100 # There's an autouse fixture that makes 100
-
- def test_clean_chunksize(self):
- trc = TaskResultCleaner(0, bucket=mock.Mock(), chunk_size=1)
- factories.CeleryTaskResultFactory.create_batch(100, status='SUCCESS')
- trc.clean()
- assert CeleryTaskResult.objects.count() <= 100 # There's an autouse fixture that makes 100
+ def test_success_cutoff(self, settings):
+ with long_now() as _now:
+ trc = TaskResultCleaner(timedelta(days=3).total_seconds())
+ _expected = _now - timedelta(days=3)
+ assert trc.success_cutoff == _expected
+
+ def test_nonsuccess_cutoff(self, settings):
+ with long_now() as _now:
+ trc = TaskResultCleaner(
+ success_ttl=timedelta(days=3),
+ nonsuccess_ttl=timedelta(days=5),
+ )
+ assert trc.success_cutoff == _now - timedelta(days=3)
+ assert trc.nonsuccess_cutoff == _now - timedelta(days=5)
+
+ @pytest.mark.parametrize('batch_size', [1, 1111])
+ def test_clean(self, batch_size):
+ with long_now() as _now:
+ with long_now(_now - timedelta(days=7)):
+ # all should be deleted:
+ factories.CeleryTaskResultFactory.create_batch(10, status='SUCCESS')
+ factories.CeleryTaskResultFactory.create_batch(7, status='FAILED')
+ with long_now(_now - timedelta(days=4)):
+ # successes should be deleted:
+ factories.CeleryTaskResultFactory.create_batch(10, status='SUCCESS')
+ factories.CeleryTaskResultFactory.create_batch(7, status='FAILED')
+ # none should be deleted:
+ factories.CeleryTaskResultFactory.create_batch(10, status='SUCCESS')
+ factories.CeleryTaskResultFactory.create_batch(7, status='FAILED')
+ # end setup
+ assert CeleryTaskResult.objects.count() == 51
+ trc = TaskResultCleaner(
+ success_ttl=timedelta(days=3),
+ nonsuccess_ttl=timedelta(days=5),
+ chunk_size=batch_size,
+ )
+ trc.clean()
+ assert CeleryTaskResult.objects.filter(status='SUCCESS').count() == 10
+ assert CeleryTaskResult.objects.exclude(status='SUCCESS').count() == 14
diff --git a/tests/share/test_oaipmh_trove.py b/tests/share/test_oaipmh_trove.py
index b8bed7421..0bdd7df1b 100644
--- a/tests/share/test_oaipmh_trove.py
+++ b/tests/share/test_oaipmh_trove.py
@@ -44,9 +44,9 @@ def oai_request(data, request_method, expect_errors=False):
class TestOAIVerbs:
@pytest.fixture(scope='class')
def oai_indexcard(self, class_scoped_django_db):
- _latest_indexcard_rdf = factories.LatestIndexcardRdfFactory()
+ _latest_resource_description = factories.LatestResourceDescriptionFactory()
return factories.DerivedIndexcardFactory(
- upriver_indexcard=_latest_indexcard_rdf.indexcard,
+ upriver_indexcard=_latest_resource_description.indexcard,
deriver_identifier=trove_db.ResourceIdentifier.objects.get_or_create_for_iri(str(OAI_DC)),
derived_text='',
)
@@ -165,17 +165,17 @@ def oai_indexcards(self, class_scoped_django_db):
trove_db.ResourceIdentifier.objects
.get_or_create_for_iri(str(OAI_DC))
)
- _latest_rdfs = [
- factories.LatestIndexcardRdfFactory()
+ _latest_resource_descriptions = [
+ factories.LatestResourceDescriptionFactory()
for i in range(17)
]
return [
factories.DerivedIndexcardFactory(
- upriver_indexcard=_latest_rdf.indexcard,
+ upriver_indexcard=_latest_resource_description.indexcard,
deriver_identifier=_deriver_identifier,
derived_text='',
)
- for _latest_rdf in _latest_rdfs
+ for _latest_resource_description in _latest_resource_descriptions
]
def test_lists(self, oai_indexcards, monkeypatch):
diff --git a/tests/trove/derive/_base.py b/tests/trove/derive/_base.py
index bf07e659f..da7cceff6 100644
--- a/tests/trove/derive/_base.py
+++ b/tests/trove/derive/_base.py
@@ -34,18 +34,17 @@ def run_input_output_test(self, given_input, expected_output):
def _get_deriver(self, input_doc: DeriverTestDoc):
_mock_suid = mock.Mock()
_mock_suid.id = '--suid_id--'
- _mock_suid.get_date_first_seen.return_value = datetime.datetime(2345, 1, 1)
_mock_suid.get_backcompat_sharev2_suid.return_value = _mock_suid
_mock_suid.identifier = '--sourceunique-id--'
_mock_suid.source_config.label = '--sourceconfig-label--'
_mock_suid.source_config.source.long_title = '--source-title--'
- _mock_indexcard_rdf = mock.Mock()
- _mock_indexcard_rdf.id = '--indexcardf-id--'
- _mock_indexcard_rdf.modified = datetime.datetime(2345, 2, 2)
- _mock_indexcard_rdf.as_rdfdoc_with_supplements.return_value = rdf.RdfGraph(input_doc.tripledict)
- _mock_indexcard_rdf.focus_iri = input_doc.focus_iri
- _mock_indexcard_rdf.from_raw_datum_id = '--rawdatum-id--'
- _mock_indexcard_rdf.indexcard.id = '--indexcard-id--'
- _mock_indexcard_rdf.indexcard.source_record_suid = _mock_suid
- return self.deriver_class(_mock_indexcard_rdf)
+ _mock_resource_description = mock.Mock()
+ _mock_resource_description.id = '--resdes-id--'
+ _mock_resource_description.modified = datetime.datetime(2345, 2, 2)
+ _mock_resource_description.as_rdfdoc_with_supplements.return_value = rdf.RdfGraph(input_doc.tripledict)
+ _mock_resource_description.focus_iri = input_doc.focus_iri
+ _mock_resource_description.indexcard.id = '--indexcard-id--'
+ _mock_resource_description.indexcard.source_record_suid = _mock_suid
+ _mock_resource_description.indexcard.created = datetime.datetime(2345, 1, 1)
+ return self.deriver_class(_mock_resource_description)
diff --git a/tests/trove/derive/test_osfmap_json_mini.py b/tests/trove/derive/test_osfmap_json_mini.py
index 7c7da6a5d..aa54e44ab 100644
--- a/tests/trove/derive/test_osfmap_json_mini.py
+++ b/tests/trove/derive/test_osfmap_json_mini.py
@@ -154,6 +154,12 @@ class TestOsfmapJsonMiniDeriver(BaseIndexcardDeriverTest):
},
'osfmap-registration': {
"@id": "https://osf.example/2c4st",
+ "accessService": [{
+ "@id": "https://osf.example",
+ "identifier": [{"@value": "https://osf.example"}],
+ "name": [{"@value": "OSF"}],
+ "resourceType": [{"@id": "Agent"}, {"@id": "Organization"}],
+ }],
"resourceType": [
{"@id": "Registration"}
],
@@ -455,6 +461,10 @@ class TestOsfmapJsonMiniDeriver(BaseIndexcardDeriverTest):
]
}
],
+ "qualifiedAttribution": [{
+ "agent": [{"@id": "https://osf.example/bhcjn"}],
+ "hadRole": [{"@id": "osf:admin-contributor"}],
+ }],
"archivedAt": [
{"@id": "https://archive.example/details/osf-registrations-2c4st-v1"}
],
diff --git a/tests/trove/derive/test_sharev2_elastic.py b/tests/trove/derive/test_sharev2_elastic.py
index dd0510d14..c7fa87fd0 100644
--- a/tests/trove/derive/test_sharev2_elastic.py
+++ b/tests/trove/derive/test_sharev2_elastic.py
@@ -46,7 +46,6 @@ def assert_outputs_equal(self, expected, actual):
"papers": False,
"supplements": False
},
- "rawdatum_id": "--rawdatum-id--",
"retracted": False,
"source_config": "--sourceconfig-label--",
"source_unique_id": "--sourceunique-id--",
@@ -123,7 +122,6 @@ def assert_outputs_equal(self, expected, actual):
"papers": False,
"supplements": False
},
- "rawdatum_id": "--rawdatum-id--",
"retracted": False,
"source_config": "--sourceconfig-label--",
"source_unique_id": "--sourceunique-id--",
@@ -197,7 +195,6 @@ def assert_outputs_equal(self, expected, actual):
"publishers": [
"OSF Registries"
],
- "rawdatum_id": "--rawdatum-id--",
"retracted": False,
"source_config": "--sourceconfig-label--",
"source_unique_id": "--sourceunique-id--",
diff --git a/tests/trove/digestive_tract/test_derive.py b/tests/trove/digestive_tract/test_derive.py
index f606ead1a..ec1f6b40a 100644
--- a/tests/trove/digestive_tract/test_derive.py
+++ b/tests/trove/digestive_tract/test_derive.py
@@ -14,12 +14,11 @@ class TestDigestiveTractDerive(TestCase):
def setUpTestData(cls):
cls.focus_iri = _BLARG.this
_focus_ident = trove_db.ResourceIdentifier.objects.get_or_create_for_iri(cls.focus_iri)
- _raw = factories.RawDatumFactory()
- cls.indexcard = trove_db.Indexcard.objects.create(source_record_suid=_raw.suid)
+ _suid = factories.SourceUniqueIdentifierFactory()
+ cls.indexcard = trove_db.Indexcard.objects.create(source_record_suid=_suid)
cls.indexcard.focus_identifier_set.add(_focus_ident)
- cls.latest_rdf = trove_db.LatestIndexcardRdf.objects.create(
+ cls.latest_resource_description = trove_db.LatestResourceDescription.objects.create(
indexcard=cls.indexcard,
- from_raw_datum=_raw,
focus_iri=cls.focus_iri,
rdf_as_turtle='''@prefix blarg: <http://blarg.example/vocab/> .
blarg:this
@@ -39,13 +38,10 @@ def test_derive(self):
})
def test_derive_with_supplementary(self):
- _supp_raw = factories.RawDatumFactory(
- suid=factories.SourceUniqueIdentifierFactory(is_supplementary=True),
- )
- trove_db.SupplementaryIndexcardRdf.objects.create(
+ _supp_suid = factories.SourceUniqueIdentifierFactory(is_supplementary=True)
+ trove_db.SupplementaryResourceDescription.objects.create(
indexcard=self.indexcard,
- from_raw_datum=_supp_raw,
- supplementary_suid=_supp_raw.suid,
+ supplementary_suid=_supp_suid,
focus_iri=self.focus_iri,
rdf_as_turtle='''@prefix blarg: <http://blarg.example/vocab/> .
blarg:this blarg:unlike blarg:nonthing .
diff --git a/tests/trove/digestive_tract/test_expel.py b/tests/trove/digestive_tract/test_expel.py
index 88a2d6f47..7f2345eb2 100644
--- a/tests/trove/digestive_tract/test_expel.py
+++ b/tests/trove/digestive_tract/test_expel.py
@@ -4,10 +4,16 @@
from django.test import TestCase
from share import models as share_db
-from tests import factories
+from tests.trove.factories import (
+ create_indexcard,
+ create_supplement,
+)
from trove import digestive_tract
from trove import models as trove_db
-from trove.vocab.namespaces import BLARG
+from trove.vocab.namespaces import (
+ BLARG,
+ TROVE,
+)
class TestDigestiveTractExpel(TestCase):
@@ -15,9 +21,12 @@ class TestDigestiveTractExpel(TestCase):
def setUpTestData(cls):
cls.focus_1 = BLARG.this1
cls.focus_2 = BLARG.this2
- cls.raw_1, cls.indexcard_1 = _setup_ingested(cls.focus_1)
- cls.raw_2, cls.indexcard_2 = _setup_ingested(cls.focus_2)
- cls.raw_supp = _setup_supplementary(cls.focus_1, cls.raw_1.suid, cls.indexcard_1)
+ cls.indexcard_1 = create_indexcard(cls.focus_1, deriver_iris=[TROVE['derive/osfmap_json']])
+ cls.indexcard_2 = create_indexcard(cls.focus_2, deriver_iris=[TROVE['derive/osfmap_json']])
+ cls.suid_1 = cls.indexcard_1.source_record_suid
+ cls.suid_2 = cls.indexcard_2.source_record_suid
+ cls.supp = create_supplement(cls.indexcard_1, cls.focus_1)
+ cls.supp_suid = cls.supp.supplementary_suid
def setUp(self):
super().setUp()
@@ -43,13 +52,12 @@ def test_setup(self):
self.assertIsNone(self.indexcard_1.deleted)
self.assertIsNone(self.indexcard_2.deleted)
self.assertEqual(share_db.SourceUniqueIdentifier.objects.count(), 3)
- self.assertEqual(share_db.RawDatum.objects.count(), 3)
- self.assertIsNotNone(self.indexcard_1.latest_rdf)
- self.assertIsNotNone(self.indexcard_2.latest_rdf)
- self.assertEqual(self.indexcard_1.archived_rdf_set.count(), 1)
- self.assertEqual(self.indexcard_2.archived_rdf_set.count(), 1)
- self.assertEqual(self.indexcard_1.supplementary_rdf_set.count(), 1)
- self.assertEqual(self.indexcard_2.supplementary_rdf_set.count(), 0)
+ self.assertIsNotNone(self.indexcard_1.latest_resource_description)
+ self.assertIsNotNone(self.indexcard_2.latest_resource_description)
+ self.assertEqual(self.indexcard_1.archived_description_set.count(), 1)
+ self.assertEqual(self.indexcard_2.archived_description_set.count(), 1)
+ self.assertEqual(self.indexcard_1.supplementary_description_set.count(), 1)
+ self.assertEqual(self.indexcard_2.supplementary_description_set.count(), 0)
self.assertEqual(self.indexcard_1.derived_indexcard_set.count(), 1)
self.assertEqual(self.indexcard_2.derived_indexcard_set.count(), 1)
# neither notified indexes nor enqueued re-derive
@@ -58,25 +66,24 @@ def test_setup(self):
def test_expel(self):
with mock.patch('trove.digestive_tract.expel_suid') as _mock_expel_suid:
- _user = self.raw_1.suid.source_config.source.user
- digestive_tract.expel(from_user=_user, record_identifier=self.raw_1.suid.identifier)
- _mock_expel_suid.assert_called_once_with(self.raw_1.suid)
+ _user = self.suid_1.source_config.source.user
+ digestive_tract.expel(from_user=_user, record_identifier=self.suid_1.identifier)
+ _mock_expel_suid.assert_called_once_with(self.suid_1)
def test_expel_suid(self):
- digestive_tract.expel_suid(self.raw_1.suid)
+ digestive_tract.expel_suid(self.suid_1)
self.indexcard_1.refresh_from_db()
self.indexcard_2.refresh_from_db()
self.assertIsNotNone(self.indexcard_1.deleted)
self.assertIsNone(self.indexcard_2.deleted)
self.assertEqual(share_db.SourceUniqueIdentifier.objects.count(), 3)
- self.assertEqual(share_db.RawDatum.objects.count(), 3)
- with self.assertRaises(trove_db.LatestIndexcardRdf.DoesNotExist):
- self.indexcard_1.latest_rdf # deleted
- self.assertIsNotNone(self.indexcard_2.latest_rdf)
- self.assertEqual(self.indexcard_1.archived_rdf_set.count(), 1) # not deleted
- self.assertEqual(self.indexcard_2.archived_rdf_set.count(), 1)
- self.assertEqual(self.indexcard_1.supplementary_rdf_set.count(), 1) # not deleted
- self.assertEqual(self.indexcard_2.supplementary_rdf_set.count(), 0)
+ with self.assertRaises(trove_db.LatestResourceDescription.DoesNotExist):
+ self.indexcard_1.latest_resource_description # deleted
+ self.assertIsNotNone(self.indexcard_2.latest_resource_description)
+ self.assertEqual(self.indexcard_1.archived_description_set.count(), 1) # not deleted
+ self.assertEqual(self.indexcard_2.archived_description_set.count(), 1)
+ self.assertEqual(self.indexcard_1.supplementary_description_set.count(), 1) # not deleted
+ self.assertEqual(self.indexcard_2.supplementary_description_set.count(), 0)
self.assertEqual(self.indexcard_1.derived_indexcard_set.count(), 0) # deleted
self.assertEqual(self.indexcard_2.derived_indexcard_set.count(), 1)
# notified indexes of update; did not enqueue re-derive
@@ -84,19 +91,18 @@ def test_expel_suid(self):
self.mock_derive_task.delay.assert_not_called()
def test_expel_supplementary_suid(self):
- digestive_tract.expel_suid(self.raw_supp.suid)
+ digestive_tract.expel_suid(self.supp_suid)
self.indexcard_1.refresh_from_db()
self.indexcard_2.refresh_from_db()
self.assertIsNone(self.indexcard_1.deleted)
self.assertIsNone(self.indexcard_2.deleted)
self.assertEqual(share_db.SourceUniqueIdentifier.objects.count(), 3)
- self.assertEqual(share_db.RawDatum.objects.count(), 3)
- self.assertIsNotNone(self.indexcard_1.latest_rdf)
- self.assertIsNotNone(self.indexcard_2.latest_rdf)
- self.assertEqual(self.indexcard_1.archived_rdf_set.count(), 1)
- self.assertEqual(self.indexcard_2.archived_rdf_set.count(), 1)
- self.assertEqual(self.indexcard_1.supplementary_rdf_set.count(), 0) # deleted
- self.assertEqual(self.indexcard_2.supplementary_rdf_set.count(), 0)
+ self.assertIsNotNone(self.indexcard_1.latest_resource_description)
+ self.assertIsNotNone(self.indexcard_2.latest_resource_description)
+ self.assertEqual(self.indexcard_1.archived_description_set.count(), 1)
+ self.assertEqual(self.indexcard_2.archived_description_set.count(), 1)
+ self.assertEqual(self.indexcard_1.supplementary_description_set.count(), 0) # deleted
+ self.assertEqual(self.indexcard_2.supplementary_description_set.count(), 0)
self.assertEqual(self.indexcard_1.derived_indexcard_set.count(), 1)
self.assertEqual(self.indexcard_2.derived_indexcard_set.count(), 1)
# did not notify indexes of update; did enqueue re-derive
@@ -110,22 +116,22 @@ def test_expel_expired_task(self):
def test_expel_expired(self):
_today = datetime.date.today()
- self.raw_2.expiration_date = _today
- self.raw_2.save()
+ _latest = self.indexcard_2.latest_resource_description
+ _latest.expiration_date = _today
+ _latest.save()
digestive_tract.expel_expired_data(_today)
self.indexcard_1.refresh_from_db()
self.indexcard_2.refresh_from_db()
self.assertIsNone(self.indexcard_1.deleted)
self.assertIsNotNone(self.indexcard_2.deleted) # marked deleted
self.assertEqual(share_db.SourceUniqueIdentifier.objects.count(), 3)
- self.assertEqual(share_db.RawDatum.objects.count(), 3)
- self.assertIsNotNone(self.indexcard_1.latest_rdf)
- with self.assertRaises(trove_db.LatestIndexcardRdf.DoesNotExist):
- self.indexcard_2.latest_rdf # deleted
- self.assertEqual(self.indexcard_1.archived_rdf_set.count(), 1)
- self.assertEqual(self.indexcard_2.archived_rdf_set.count(), 1) # not deleted
- self.assertEqual(self.indexcard_1.supplementary_rdf_set.count(), 1)
- self.assertEqual(self.indexcard_2.supplementary_rdf_set.count(), 0) # deleted
+ self.assertIsNotNone(self.indexcard_1.latest_resource_description)
+ with self.assertRaises(trove_db.LatestResourceDescription.DoesNotExist):
+ self.indexcard_2.latest_resource_description # deleted
+ self.assertEqual(self.indexcard_1.archived_description_set.count(), 1)
+ self.assertEqual(self.indexcard_2.archived_description_set.count(), 1) # not deleted
+ self.assertEqual(self.indexcard_1.supplementary_description_set.count(), 1)
+ self.assertEqual(self.indexcard_2.supplementary_description_set.count(), 0) # deleted
self.assertEqual(self.indexcard_1.derived_indexcard_set.count(), 1)
self.assertEqual(self.indexcard_2.derived_indexcard_set.count(), 0) # deleted
# notified indexes of update; did not enqueue re-derive
@@ -134,71 +140,22 @@ def test_expel_expired(self):
def test_expel_expired_supplement(self):
_today = datetime.date.today()
- self.raw_supp.expiration_date = _today
- self.raw_supp.save()
+ self.supp.expiration_date = _today
+ self.supp.save()
digestive_tract.expel_expired_data(_today)
self.indexcard_1.refresh_from_db()
self.indexcard_2.refresh_from_db()
self.assertIsNone(self.indexcard_1.deleted)
self.assertIsNone(self.indexcard_2.deleted)
self.assertEqual(share_db.SourceUniqueIdentifier.objects.count(), 3)
- self.assertEqual(share_db.RawDatum.objects.count(), 3)
- self.assertIsNotNone(self.indexcard_1.latest_rdf)
- self.assertIsNotNone(self.indexcard_2.latest_rdf)
- self.assertEqual(self.indexcard_1.archived_rdf_set.count(), 1)
- self.assertEqual(self.indexcard_2.archived_rdf_set.count(), 1)
- self.assertEqual(self.indexcard_1.supplementary_rdf_set.count(), 0) # deleted
- self.assertEqual(self.indexcard_2.supplementary_rdf_set.count(), 0)
+ self.assertIsNotNone(self.indexcard_1.latest_resource_description)
+ self.assertIsNotNone(self.indexcard_2.latest_resource_description)
+ self.assertEqual(self.indexcard_1.archived_description_set.count(), 1)
+ self.assertEqual(self.indexcard_2.archived_description_set.count(), 1)
+ self.assertEqual(self.indexcard_1.supplementary_description_set.count(), 0) # deleted
+ self.assertEqual(self.indexcard_2.supplementary_description_set.count(), 0)
self.assertEqual(self.indexcard_1.derived_indexcard_set.count(), 1)
self.assertEqual(self.indexcard_2.derived_indexcard_set.count(), 1)
# did not notify indexes of update; did enqueue re-derive
self.assertEqual(self.notified_indexcard_ids, set())
self.mock_derive_task.delay.assert_called_once_with(self.indexcard_1.id)
-
-
-def _setup_ingested(focus_iri: str):
- _focus_ident = trove_db.ResourceIdentifier.objects.get_or_create_for_iri(focus_iri)
- _suid = factories.SourceUniqueIdentifierFactory(
- focus_identifier=_focus_ident,
- )
- _raw = factories.RawDatumFactory(suid=_suid)
- _indexcard = trove_db.Indexcard.objects.create(source_record_suid=_raw.suid)
- _indexcard.focus_identifier_set.add(_focus_ident)
- _latest_rdf = trove_db.LatestIndexcardRdf.objects.create(
- indexcard=_indexcard,
- from_raw_datum=_raw,
- focus_iri=focus_iri,
- rdf_as_turtle='...',
- )
- trove_db.ArchivedIndexcardRdf.objects.create(
- indexcard=_indexcard,
- from_raw_datum=_raw,
- focus_iri=focus_iri,
- rdf_as_turtle=_latest_rdf.rdf_as_turtle,
- )
- _deriver_iri = BLARG.deriver
- _deriver_ident = trove_db.ResourceIdentifier.objects.get_or_create_for_iri(_deriver_iri)
- trove_db.DerivedIndexcard.objects.create(
- upriver_indexcard=_indexcard,
- deriver_identifier=_deriver_ident,
- derived_checksum_iri='...',
- derived_text='...',
- )
- return _raw, _indexcard
-
-
-def _setup_supplementary(focus_iri, main_suid, indexcard):
- _supp_suid = factories.SourceUniqueIdentifierFactory(
- focus_identifier=main_suid.focus_identifier,
- source_config=main_suid.source_config,
- is_supplementary=True,
- )
- _supp_raw = factories.RawDatumFactory(suid=_supp_suid)
- trove_db.SupplementaryIndexcardRdf.objects.create(
- indexcard=indexcard,
- from_raw_datum=_supp_raw,
- supplementary_suid=_supp_suid,
- focus_iri=focus_iri,
- rdf_as_turtle='...',
- )
- return _supp_raw
diff --git a/tests/trove/digestive_tract/test_extract.py b/tests/trove/digestive_tract/test_extract.py
index 57afd3ca0..393c857ae 100644
--- a/tests/trove/digestive_tract/test_extract.py
+++ b/tests/trove/digestive_tract/test_extract.py
@@ -6,44 +6,45 @@
from trove import digestive_tract
from trove import exceptions as trove_exceptions
from trove import models as trove_db
+from trove.vocab import mediatypes
from trove.vocab.namespaces import BLARG as _BLARG
class TestDigestiveTractExtract(TestCase):
@classmethod
def setUpTestData(cls):
- _focus_ident = trove_db.ResourceIdentifier.objects.get_or_create_for_iri(_BLARG.this)
- cls.raw = factories.RawDatumFactory(
- mediatype='text/turtle',
- datum='''@prefix blarg: <http://blarg.example/vocab/> .
+ cls.user = factories.ShareUserFactory()
+ cls.focus_iri = _BLARG.this
+ cls.suid = digestive_tract.sniff(from_user=cls.user, focus_iri=cls.focus_iri)
+ cls.raw_turtle = '''@prefix blarg: <http://blarg.example/vocab/> .
blarg:this
a blarg:Thing ;
blarg:like blarg:that .
-''',
- suid__focus_identifier=_focus_ident,
+'''
+ cls.supp_suid = digestive_tract.sniff(
+ from_user=cls.user,
+ focus_iri=cls.focus_iri,
+ record_identifier=f'supp:{cls.focus_iri}',
+ is_supplementary=True,
)
- cls.supplementary_raw = factories.RawDatumFactory(
- mediatype='text/turtle',
- datum='''@prefix blarg: <http://blarg.example/vocab/> .
+ cls.supp_raw_turtle = '''@prefix blarg: <http://blarg.example/vocab/> .
blarg:this blarg:like blarg:another ;
blarg:unlike blarg:nonthing .
-''',
- suid=factories.SourceUniqueIdentifierFactory(
- source_config=cls.raw.suid.source_config,
- focus_identifier=cls.raw.suid.focus_identifier,
- is_supplementary=True,
- ),
- )
+'''
def test_setup(self):
self.assertEqual(trove_db.Indexcard.objects.all().count(), 0)
- self.assertEqual(trove_db.LatestIndexcardRdf.objects.all().count(), 0)
- self.assertEqual(trove_db.ArchivedIndexcardRdf.objects.all().count(), 0)
- self.assertEqual(trove_db.SupplementaryIndexcardRdf.objects.all().count(), 0)
+ self.assertEqual(trove_db.LatestResourceDescription.objects.all().count(), 0)
+ self.assertEqual(trove_db.ArchivedResourceDescription.objects.all().count(), 0)
+ self.assertEqual(trove_db.SupplementaryResourceDescription.objects.all().count(), 0)
def test_extract(self):
- (_indexcard,) = digestive_tract.extract(self.raw)
- self.assertEqual(_indexcard.source_record_suid_id, self.raw.suid_id)
+ (_indexcard,) = digestive_tract.extract(
+ suid=self.suid,
+ record_mediatype=mediatypes.TURTLE,
+ raw_record=self.raw_turtle,
+ )
+ self.assertEqual(_indexcard.source_record_suid_id, self.suid.id)
_focus_idents = list(
_indexcard.focus_identifier_set.values_list('sufficiently_unique_iri', flat=True),
)
@@ -52,50 +53,114 @@ def test_extract(self):
_indexcard.focustype_identifier_set.values_list('sufficiently_unique_iri', flat=True),
)
self.assertEqual(_focustype_idents, ['://blarg.example/vocab/Thing'])
- self.assertEqual(list(_indexcard.supplementary_rdf_set.all()), [])
- _latest_rdf = _indexcard.latest_rdf
- self.assertEqual(_latest_rdf.from_raw_datum_id, self.raw.id)
- self.assertEqual(_latest_rdf.indexcard_id, _indexcard.id)
- self.assertEqual(_latest_rdf.focus_iri, _BLARG.this)
- self.assertEqual(_latest_rdf.as_rdf_tripledict(), {
+ self.assertEqual(list(_indexcard.supplementary_description_set.all()), [])
+ _latest_resource_description = _indexcard.latest_resource_description
+ self.assertEqual(_latest_resource_description.indexcard_id, _indexcard.id)
+ self.assertEqual(_latest_resource_description.focus_iri, _BLARG.this)
+ self.assertIsNone(_latest_resource_description.expiration_date)
+ self.assertEqual(_latest_resource_description.as_rdf_tripledict(), {
_BLARG.this: {
rdf.RDF.type: {_BLARG.Thing},
_BLARG.like: {_BLARG.that},
},
})
- self.assertEqual(_latest_rdf.as_rdfdoc_with_supplements().tripledict, {
+ self.assertEqual(_latest_resource_description.as_rdfdoc_with_supplements().tripledict, {
_BLARG.this: {
rdf.RDF.type: {_BLARG.Thing},
_BLARG.like: {_BLARG.that},
},
})
+ def test_extract_before_expiration(self):
+ _expir = datetime.date.today() + datetime.timedelta(days=3)
+ (_indexcard,) = digestive_tract.extract(
+ suid=self.suid,
+ record_mediatype=mediatypes.TURTLE,
+ raw_record=self.raw_turtle,
+ expiration_date=_expir,
+ )
+ self.assertEqual(_indexcard.source_record_suid_id, self.suid.id)
+ _focus_idents = list(
+ _indexcard.focus_identifier_set.values_list('sufficiently_unique_iri', flat=True),
+ )
+ self.assertEqual(_focus_idents, ['://blarg.example/vocab/this'])
+ _focustype_idents = list(
+ _indexcard.focustype_identifier_set.values_list('sufficiently_unique_iri', flat=True),
+ )
+ self.assertEqual(_focustype_idents, ['://blarg.example/vocab/Thing'])
+ self.assertEqual(list(_indexcard.supplementary_description_set.all()), [])
+ _latest_resource_description = _indexcard.latest_resource_description
+ self.assertEqual(_latest_resource_description.indexcard_id, _indexcard.id)
+ self.assertEqual(_latest_resource_description.focus_iri, _BLARG.this)
+ self.assertEqual(_latest_resource_description.expiration_date, _expir)
+ self.assertEqual(_latest_resource_description.as_rdf_tripledict(), {
+ _BLARG.this: {
+ rdf.RDF.type: {_BLARG.Thing},
+ _BLARG.like: {_BLARG.that},
+ },
+ })
+ self.assertEqual(_latest_resource_description.as_rdfdoc_with_supplements().tripledict, {
+ _BLARG.this: {
+ rdf.RDF.type: {_BLARG.Thing},
+ _BLARG.like: {_BLARG.that},
+ },
+ })
+
+ def test_extract_supplement_before_expiration(self):
+ (_indexcard,) = digestive_tract.extract(
+ suid=self.suid,
+ record_mediatype=mediatypes.TURTLE,
+ raw_record=self.raw_turtle,
+ )
+ _expir = datetime.date.today() + datetime.timedelta(days=5)
+ (_supped_indexcard,) = digestive_tract.extract(
+ suid=self.supp_suid,
+ record_mediatype=mediatypes.TURTLE,
+ raw_record=self.supp_raw_turtle,
+ expiration_date=_expir,
+ )
+ self.assertEqual(_indexcard, _supped_indexcard)
+ (_supp_rdf,) = _indexcard.supplementary_description_set.all()
+ self.assertEqual(_supp_rdf.expiration_date, _expir)
+
def test_extract_supplementary_without_prior(self):
- _cards = digestive_tract.extract(self.supplementary_raw)
+ _cards = digestive_tract.extract(
+ suid=self.supp_suid,
+ record_mediatype=mediatypes.TURTLE,
+ raw_record=self.supp_raw_turtle,
+ )
self.assertEqual(_cards, [])
self.assertEqual(trove_db.Indexcard.objects.all().count(), 0)
- self.assertEqual(trove_db.LatestIndexcardRdf.objects.all().count(), 0)
- self.assertEqual(trove_db.ArchivedIndexcardRdf.objects.all().count(), 0)
- self.assertEqual(trove_db.SupplementaryIndexcardRdf.objects.all().count(), 0)
+ self.assertEqual(trove_db.LatestResourceDescription.objects.all().count(), 0)
+ self.assertEqual(trove_db.ArchivedResourceDescription.objects.all().count(), 0)
+ self.assertEqual(trove_db.SupplementaryResourceDescription.objects.all().count(), 0)
def test_extract_supplementary(self):
- (_orig_indexcard,) = digestive_tract.extract(self.raw)
- _orig_timestamp = _orig_indexcard.latest_rdf.modified
- (_indexcard,) = digestive_tract.extract(self.supplementary_raw)
+ (_orig_indexcard,) = digestive_tract.extract(
+ suid=self.suid,
+ record_mediatype=mediatypes.TURTLE,
+ raw_record=self.raw_turtle,
+ )
+ _orig_timestamp = _orig_indexcard.latest_resource_description.modified
+ (_indexcard,) = digestive_tract.extract(
+ suid=self.supp_suid,
+ record_mediatype=mediatypes.TURTLE,
+ raw_record=self.supp_raw_turtle,
+ )
self.assertEqual(_orig_indexcard.id, _indexcard.id)
- self.assertEqual(_indexcard.source_record_suid_id, self.raw.suid_id)
- (_supp_rdf,) = _indexcard.supplementary_rdf_set.all()
- self.assertEqual(_supp_rdf.from_raw_datum_id, self.supplementary_raw.id)
+ self.assertEqual(_indexcard.source_record_suid_id, self.suid.id)
+ (_supp_rdf,) = _indexcard.supplementary_description_set.all()
self.assertEqual(_supp_rdf.indexcard_id, _indexcard.id)
self.assertEqual(_supp_rdf.focus_iri, _BLARG.this)
+ self.assertIsNone(_supp_rdf.expiration_date)
self.assertEqual(_supp_rdf.as_rdf_tripledict(), {
_BLARG.this: {
_BLARG.like: {_BLARG.another},
_BLARG.unlike: {_BLARG.nonthing},
},
})
- self.assertEqual(_indexcard.latest_rdf.modified, _orig_timestamp)
- self.assertEqual(_indexcard.latest_rdf.as_rdfdoc_with_supplements().tripledict, {
+ self.assertEqual(_indexcard.latest_resource_description.modified, _orig_timestamp)
+ self.assertEqual(_indexcard.latest_resource_description.as_rdfdoc_with_supplements().tripledict, {
_BLARG.this: {
rdf.RDF.type: {_BLARG.Thing},
_BLARG.like: {_BLARG.that, _BLARG.another},
@@ -104,50 +169,65 @@ def test_extract_supplementary(self):
})
def test_extract_empty_with_prior(self):
- (_prior_indexcard,) = digestive_tract.extract(self.raw)
- self.assertFalse(self.raw.no_output)
+ (_prior_indexcard,) = digestive_tract.extract(
+ suid=self.suid,
+ record_mediatype=mediatypes.TURTLE,
+ raw_record=self.raw_turtle,
+ )
self.assertIsNone(_prior_indexcard.deleted)
- # add a later raw
- _empty_raw = factories.RawDatumFactory(
- mediatype='text/turtle',
- datum=' ',
- suid=self.raw.suid,
- )
- (_indexcard,) = digestive_tract.extract(_empty_raw)
- self.assertTrue(_empty_raw.no_output)
+ # extract an empty
+ (_indexcard,) = digestive_tract.extract(
+ suid=self.suid,
+ record_mediatype=mediatypes.TURTLE,
+ raw_record=' ', # no data
+ )
self.assertEqual(_indexcard.id, _prior_indexcard.id)
self.assertIsNotNone(_indexcard.deleted)
- with self.assertRaises(trove_db.LatestIndexcardRdf.DoesNotExist):
- _indexcard.latest_rdf
+ with self.assertRaises(trove_db.LatestResourceDescription.DoesNotExist):
+ _indexcard.latest_resource_description
def test_extract_empty_without_prior(self):
- _empty_raw = factories.RawDatumFactory(
- mediatype='text/turtle',
- datum=' ',
+ _cards = digestive_tract.extract(
+ suid=self.suid,
+ record_mediatype=mediatypes.TURTLE,
+ raw_record=' ', # no data
)
- _cards = digestive_tract.extract(_empty_raw)
self.assertEqual(_cards, [])
- self.assertTrue(_empty_raw.no_output)
def test_extract_empty_supplementary(self):
- (_orig_indexcard,) = digestive_tract.extract(self.raw)
- digestive_tract.extract(self.supplementary_raw)
- self.assertTrue(_orig_indexcard.supplementary_rdf_set.exists())
- _empty_raw = factories.RawDatumFactory(
- mediatype='text/turtle',
- datum='',
- suid=self.supplementary_raw.suid,
- )
- (_indexcard,) = digestive_tract.extract(_empty_raw)
+ (_orig_indexcard,) = digestive_tract.extract(
+ suid=self.suid,
+ record_mediatype=mediatypes.TURTLE,
+ raw_record=self.raw_turtle,
+ )
+ digestive_tract.extract(
+ suid=self.supp_suid,
+ record_mediatype=mediatypes.TURTLE,
+ raw_record=self.supp_raw_turtle,
+ )
+ self.assertTrue(_orig_indexcard.supplementary_description_set.exists())
+ (_indexcard,) = digestive_tract.extract(
+ suid=self.supp_suid,
+ record_mediatype=mediatypes.TURTLE,
+ raw_record=' ', # no data
+ )
self.assertEqual(_indexcard.id, _orig_indexcard.id)
- self.assertFalse(_orig_indexcard.supplementary_rdf_set.exists())
+ self.assertFalse(_orig_indexcard.supplementary_description_set.exists())
- def test_extract_expired(self):
- self.raw.expiration_date = datetime.date.today()
+ def test_extract_after_expiration(self):
with self.assertRaises(trove_exceptions.CannotDigestExpiredDatum):
- digestive_tract.extract(self.raw)
+ digestive_tract.extract(
+ suid=self.suid,
+ record_mediatype=mediatypes.TURTLE,
+ raw_record=self.raw_turtle,
+ expiration_date=datetime.date.today(),
+ )
- def test_extract_expired_supplement(self):
- self.supplementary_raw.expiration_date = datetime.date.today()
+ def test_extract_supp_after_expiration(self):
with self.assertRaises(trove_exceptions.CannotDigestExpiredDatum):
- digestive_tract.extract(self.supplementary_raw)
+ digestive_tract.extract(
+ suid=self.supp_suid,
+ record_mediatype=mediatypes.TURTLE,
+ raw_record=self.supp_raw_turtle,
+ expiration_date=datetime.date.today(),
+ )
diff --git a/tests/trove/digestive_tract/test_sniff.py b/tests/trove/digestive_tract/test_sniff.py
new file mode 100644
index 000000000..c421ac381
--- /dev/null
+++ b/tests/trove/digestive_tract/test_sniff.py
@@ -0,0 +1,90 @@
+from django.test import TestCase
+
+from share import models as share_db
+from tests import factories
+from trove import digestive_tract
+from trove import exceptions as trove_exceptions
+from trove.vocab.namespaces import BLARG
+
+
+class TestDigestiveTractSniff(TestCase):
+ @classmethod
+ def setUpTestData(cls):
+ cls.user = factories.ShareUserFactory()
+
+ def test_setup(self):
+ self.assertEqual(share_db.SourceConfig.objects.all().count(), 0)
+ self.assertEqual(share_db.SourceUniqueIdentifier.objects.all().count(), 0)
+
+ def test_sniff(self):
+ digestive_tract.sniff(
+ from_user=self.user,
+ record_identifier='blarg',
+ focus_iri=BLARG.this,
+ )
+ (_suid,) = share_db.SourceUniqueIdentifier.objects.all()
+ self.assertEqual(_suid.identifier, 'blarg')
+ self.assertEqual(_suid.focus_identifier.sufficiently_unique_iri, '://blarg.example/vocab/this')
+ self.assertEqual(_suid.source_config.source.user_id, self.user.id)
+ self.assertFalse(_suid.is_supplementary)
+
+ def test_sniff_implicit_record_identifier(self):
+ digestive_tract.sniff(
+ from_user=self.user,
+ focus_iri=BLARG.this,
+ )
+ (_suid,) = share_db.SourceUniqueIdentifier.objects.all()
+ self.assertEqual(_suid.identifier, BLARG.this)
+ self.assertEqual(_suid.focus_identifier.sufficiently_unique_iri, '://blarg.example/vocab/this')
+ self.assertEqual(_suid.source_config.source.user_id, self.user.id)
+ self.assertFalse(_suid.is_supplementary)
+
+ def test_sniff_supplementary(self):
+ digestive_tract.sniff(
+ from_user=self.user,
+ record_identifier='blarg',
+ focus_iri=BLARG.this,
+ is_supplementary=True,
+ )
+ (_suid,) = share_db.SourceUniqueIdentifier.objects.all()
+ self.assertEqual(_suid.identifier, 'blarg')
+ self.assertEqual(_suid.focus_identifier.sufficiently_unique_iri, '://blarg.example/vocab/this')
+ self.assertEqual(_suid.source_config.source.user_id, self.user.id)
+ self.assertTrue(_suid.is_supplementary)
+
+ def test_error_focus_iri(self):
+ with self.assertRaises(trove_exceptions.DigestiveError):
+ digestive_tract.sniff(from_user=self.user, focus_iri='blam')
+ with self.assertRaises(trove_exceptions.DigestiveError):
+ digestive_tract.sniff(from_user=self.user, focus_iri='')
+
+ def test_error_missing_record_identifier(self):
+ with self.assertRaises(trove_exceptions.DigestiveError):
+ digestive_tract.sniff(from_user=self.user, focus_iri=BLARG.foo, is_supplementary=True)
+
+ def test_error_change_focus(self):
+ digestive_tract.sniff(
+ from_user=self.user,
+ record_identifier='foo',
+ focus_iri=BLARG.bar,
+ )
+ with self.assertRaises(trove_exceptions.DigestiveError):
+ digestive_tract.sniff(
+ from_user=self.user,
+ record_identifier='foo',
+ focus_iri=BLARG.different,
+ )
+
+ def test_error_change_supplementariness(self):
+ digestive_tract.sniff(
+ from_user=self.user,
+ focus_iri=BLARG.foo,
+ record_identifier='foo-supp',
+ is_supplementary=True,
+ )
+ with self.assertRaises(trove_exceptions.DigestiveError):
+ digestive_tract.sniff(
+ from_user=self.user,
+ focus_iri=BLARG.foo,
+ record_identifier='foo-supp',
+ )
diff --git a/tests/trove/digestive_tract/test_swallow.py b/tests/trove/digestive_tract/test_swallow.py
deleted file mode 100644
index 968b8d668..000000000
--- a/tests/trove/digestive_tract/test_swallow.py
+++ /dev/null
@@ -1,122 +0,0 @@
-import datetime
-from unittest import mock
-from django.test import TestCase
-
-from tests import factories
-from trove import digestive_tract
-from share import models as share_db
-
-
-class TestDigestiveTractSwallow(TestCase):
- @classmethod
- def setUpTestData(cls):
- cls.user = factories.ShareUserFactory()
- cls.turtle = '''
-@prefix blarg: <http://blarg.example/vocab/> .
-blarg:this
- a blarg:Thing ;
- blarg:like blarg:that .
-'''
-
- def test_setup(self):
- self.assertEqual(share_db.RawDatum.objects.all().count(), 0)
-
- def test_swallow(self):
- with mock.patch('trove.digestive_tract.task__extract_and_derive') as _mock_task:
- digestive_tract.swallow(
- from_user=self.user,
- record=self.turtle,
- record_identifier='blarg',
- record_mediatype='text/turtle',
- focus_iri='http://blarg.example/vocab/this',
- )
- (_raw,) = share_db.RawDatum.objects.all()
- self.assertEqual(_raw.datum, self.turtle)
- self.assertEqual(_raw.mediatype, 'text/turtle')
- self.assertIsNone(_raw.expiration_date)
- self.assertEqual(_raw.suid.identifier, 'blarg')
- self.assertEqual(_raw.suid.focus_identifier.sufficiently_unique_iri, '://blarg.example/vocab/this')
- self.assertEqual(_raw.suid.source_config.source.user_id, self.user.id)
- self.assertFalse(_raw.suid.is_supplementary)
- _mock_task.delay.assert_called_once_with(_raw.id, urgent=False)
-
- def test_swallow_urgent(self):
- with mock.patch('trove.digestive_tract.task__extract_and_derive') as _mock_task:
- digestive_tract.swallow(
- from_user=self.user,
- record=self.turtle,
- record_identifier='blarg',
- record_mediatype='text/turtle',
- focus_iri='http://blarg.example/vocab/this',
- urgent=True
- )
- (_raw,) = share_db.RawDatum.objects.all()
- self.assertEqual(_raw.datum, self.turtle)
- self.assertEqual(_raw.mediatype, 'text/turtle')
- self.assertIsNone(_raw.expiration_date)
- self.assertEqual(_raw.suid.identifier, 'blarg')
- self.assertEqual(_raw.suid.focus_identifier.sufficiently_unique_iri, '://blarg.example/vocab/this')
- self.assertEqual(_raw.suid.source_config.source.user_id, self.user.id)
- self.assertFalse(_raw.suid.is_supplementary)
- _mock_task.delay.assert_called_once_with(_raw.id, urgent=True)
-
- def test_swallow_supplementary(self):
- with mock.patch('trove.digestive_tract.task__extract_and_derive') as _mock_task:
- digestive_tract.swallow(
- from_user=self.user,
- record=self.turtle,
- record_identifier='blarg',
- record_mediatype='text/turtle',
- focus_iri='http://blarg.example/vocab/this',
- is_supplementary=True,
- )
- (_raw,) = share_db.RawDatum.objects.all()
- self.assertEqual(_raw.datum, self.turtle)
- self.assertEqual(_raw.mediatype, 'text/turtle')
- self.assertIsNone(_raw.expiration_date)
- self.assertEqual(_raw.suid.identifier, 'blarg')
- self.assertEqual(_raw.suid.focus_identifier.sufficiently_unique_iri, '://blarg.example/vocab/this')
- self.assertEqual(_raw.suid.source_config.source.user_id, self.user.id)
- self.assertTrue(_raw.suid.is_supplementary)
- _mock_task.delay.assert_called_once_with(_raw.id, urgent=False)
-
- def test_swallow_with_expiration(self):
- with mock.patch('trove.digestive_tract.task__extract_and_derive') as _mock_task:
- digestive_tract.swallow(
- from_user=self.user,
- record=self.turtle,
- record_identifier='blarg',
- record_mediatype='text/turtle',
- focus_iri='http://blarg.example/vocab/this',
- expiration_date=datetime.date(2048, 1, 3),
- )
- (_raw,) = share_db.RawDatum.objects.all()
- self.assertEqual(_raw.datum, self.turtle)
- self.assertEqual(_raw.mediatype, 'text/turtle')
- self.assertEqual(_raw.expiration_date, datetime.date(2048, 1, 3))
- self.assertEqual(_raw.suid.identifier, 'blarg')
- self.assertEqual(_raw.suid.focus_identifier.sufficiently_unique_iri, '://blarg.example/vocab/this')
- self.assertEqual(_raw.suid.source_config.source.user_id, self.user.id)
- self.assertFalse(_raw.suid.is_supplementary)
- _mock_task.delay.assert_called_once_with(_raw.id, urgent=False)
-
- def test_swallow_supplementary_with_expiration(self):
- with mock.patch('trove.digestive_tract.task__extract_and_derive') as _mock_task:
- digestive_tract.swallow(
- from_user=self.user,
- record=self.turtle,
- record_identifier='blarg',
- record_mediatype='text/turtle',
- focus_iri='http://blarg.example/vocab/this',
- is_supplementary=True,
- expiration_date=datetime.date(2047, 1, 3),
- )
- (_raw,) = share_db.RawDatum.objects.all()
- self.assertEqual(_raw.datum, self.turtle)
- self.assertEqual(_raw.mediatype, 'text/turtle')
- self.assertEqual(_raw.expiration_date, datetime.date(2047, 1, 3))
- self.assertEqual(_raw.suid.identifier, 'blarg')
- self.assertEqual(_raw.suid.focus_identifier.sufficiently_unique_iri, '://blarg.example/vocab/this')
- self.assertEqual(_raw.suid.source_config.source.user_id, self.user.id)
- self.assertTrue(_raw.suid.is_supplementary)
- _mock_task.delay.assert_called_once_with(_raw.id, urgent=False)
diff --git a/tests/trove/factories.py b/tests/trove/factories.py
index 1a7d4b31b..406fb5599 100644
--- a/tests/trove/factories.py
+++ b/tests/trove/factories.py
@@ -1,4 +1,6 @@
from collections.abc import Collection
+import time
+import uuid
from tests import factories
@@ -6,6 +8,7 @@
from trove import models as trove_db
from trove import digestive_tract
+from trove.vocab.namespaces import BLARG
__all__ = (
@@ -17,17 +20,21 @@
def create_indexcard(
- focus_iri: str,
+ focus_iri: str | None = None,
rdf_twopledict: rdf.RdfTwopleDictionary | None = None,
rdf_tripledict: rdf.RdfTripleDictionary | None = None,
deriver_iris: Collection[str] = (),
) -> trove_db.Indexcard:
- _suid = factories.SourceUniqueIdentifierFactory()
+ _focus_iri = focus_iri or BLARG[str(uuid.uuid4())]
+ _focus_ident = trove_db.ResourceIdentifier.objects.get_or_create_for_iri(_focus_iri)
+ _suid = factories.SourceUniqueIdentifierFactory(
+ focus_identifier=_focus_ident,
+ )
_indexcard = trove_db.Indexcard.objects.create(source_record_suid=_suid)
_indexcard.focus_identifier_set.add(
- trove_db.ResourceIdentifier.objects.get_or_create_for_iri(focus_iri),
+ trove_db.ResourceIdentifier.objects.get_or_create_for_iri(_focus_iri),
)
- update_indexcard_content(_indexcard, focus_iri, rdf_twopledict, rdf_tripledict)
+ update_indexcard_content(_indexcard, _focus_iri, rdf_twopledict, rdf_tripledict)
if deriver_iris:
digestive_tract.derive(_indexcard, deriver_iris)
return _indexcard
@@ -35,25 +42,13 @@ def create_indexcard(
def update_indexcard_content(
indexcard: trove_db.Indexcard,
- focus_iri: str,
+ focus_iri: str | None = None,
rdf_twopledict: rdf.RdfTwopleDictionary | None = None,
rdf_tripledict: rdf.RdfTripleDictionary | None = None,
) -> None:
- _card_content = _combined_tripledict(focus_iri, rdf_twopledict, rdf_tripledict)
- _card_content_turtle = rdf.turtle_from_tripledict(_card_content)
- _raw = factories.RawDatumFactory(suid=indexcard.source_record_suid, datum=_card_content_turtle)
- indexcard.focus_identifier_set.add(
- trove_db.ResourceIdentifier.objects.get_or_create_for_iri(focus_iri),
- )
- trove_db.LatestIndexcardRdf.objects.update_or_create(
- indexcard=indexcard,
- defaults={
- 'from_raw_datum': _raw,
- 'focus_iri': focus_iri,
- 'rdf_as_turtle': _card_content_turtle,
- 'turtle_checksum_iri': 'foo', # not enforced
- },
- )
+ _focus_iri = focus_iri or indexcard.latest_resource_description.focus_iri
+ _card_content = _combined_tripledict(_focus_iri, rdf_twopledict, rdf_tripledict)
+ indexcard.update_resource_description(_focus_iri, _card_content)
def create_supplement(
@@ -61,18 +56,17 @@ def create_supplement(
focus_iri: str,
rdf_twopledict: rdf.RdfTwopleDictionary | None = None,
rdf_tripledict: rdf.RdfTripleDictionary | None = None,
-) -> trove_db.SupplementaryIndexcardRdf:
- _supp_suid = factories.SourceUniqueIdentifierFactory()
- _supp_content = _combined_tripledict(focus_iri, rdf_twopledict, rdf_tripledict)
- _supp_content_turtle = rdf.turtle_from_tripledict(_supp_content)
- _supp_raw = factories.RawDatumFactory(suid=_supp_suid, datum=_supp_content_turtle)
- return trove_db.SupplementaryIndexcardRdf.objects.create(
- from_raw_datum=_supp_raw,
- indexcard=indexcard,
- supplementary_suid=_supp_suid,
- focus_iri=focus_iri,
- rdf_as_turtle=_supp_content_turtle,
- turtle_checksum_iri='sup', # not enforced
+) -> trove_db.SupplementaryResourceDescription:
+ _main_suid = indexcard.source_record_suid
+ _supp_suid = factories.SourceUniqueIdentifierFactory(
+ focus_identifier=_main_suid.focus_identifier,
+ source_config=_main_suid.source_config,
+ is_supplementary=True,
+ )
+ return indexcard.update_supplementary_description(
+ _supp_suid,
+ focus_iri,
+ _combined_tripledict(focus_iri, rdf_twopledict, rdf_tripledict),
)
@@ -99,4 +93,8 @@ def _combined_tripledict(
_graph.add_twopledict(focus_iri, rdf_twopledict)
if rdf_tripledict is not None:
_graph.add_tripledict(rdf_tripledict)
- return _graph.tripledict
+ return _graph.tripledict or {
+ focus_iri: {
+ BLARG.timeNonce: {rdf.literal(time.time_ns())},
+ },
+ }
diff --git a/tests/trove/views/test_ingest.py b/tests/trove/views/test_ingest.py
index fc3f5d464..18e8e4995 100644
--- a/tests/trove/views/test_ingest.py
+++ b/tests/trove/views/test_ingest.py
@@ -27,15 +27,16 @@ def test_post(self):
HTTP_AUTHORIZATION=self.user.authorization(),
)
self.assertEqual(_resp.status_code, HTTPStatus.CREATED)
- _mock_tract.swallow.assert_called_once_with(
+ _mock_tract.ingest.assert_called_once_with(
from_user=self.user,
- record='turtleturtleturtle',
+ raw_record='turtleturtleturtle',
record_identifier='blarg',
record_mediatype='text/turtle',
focus_iri='https://foo.example/blarg',
urgent=True,
is_supplementary=False,
expiration_date=None,
+ restore_deleted=True,
)
def test_post_nonurgent(self):
@@ -51,15 +52,16 @@ def test_post_nonurgent(self):
HTTP_AUTHORIZATION=self.user.authorization(),
)
self.assertEqual(_resp.status_code, HTTPStatus.CREATED)
- _mock_tract.swallow.assert_called_once_with(
+ _mock_tract.ingest.assert_called_once_with(
from_user=self.user,
- record='turtleturtleturtle',
+ raw_record='turtleturtleturtle',
record_identifier='blarg',
record_mediatype='text/turtle',
focus_iri='https://foo.example/blarg',
urgent=False,
is_supplementary=False,
expiration_date=None,
+ restore_deleted=True,
)
def test_post_supplementary(self):
@@ -75,15 +77,16 @@ def test_post_supplementary(self):
HTTP_AUTHORIZATION=self.user.authorization(),
)
self.assertEqual(_resp.status_code, HTTPStatus.CREATED)
- _mock_tract.swallow.assert_called_once_with(
+ _mock_tract.ingest.assert_called_once_with(
from_user=self.user,
- record='turtleturtleturtle',
+ raw_record='turtleturtleturtle',
record_identifier='blarg',
record_mediatype='text/turtle',
focus_iri='https://foo.example/blarg',
urgent=True,
is_supplementary=True,
expiration_date=None,
+ restore_deleted=True,
)
def test_post_with_expiration(self):
@@ -100,15 +103,16 @@ def test_post_with_expiration(self):
HTTP_AUTHORIZATION=self.user.authorization(),
)
self.assertEqual(_resp.status_code, HTTPStatus.CREATED)
- _mock_tract.swallow.assert_called_once_with(
+ _mock_tract.ingest.assert_called_once_with(
from_user=self.user,
- record='turtleturtleturtle',
+ raw_record='turtleturtleturtle',
record_identifier='blarg',
record_mediatype='text/turtle',
focus_iri='https://foo.example/blarg',
urgent=True,
is_supplementary=True,
expiration_date=datetime.date(2055, 5, 5),
+ restore_deleted=True,
)
def test_delete(self):
@@ -135,7 +139,7 @@ def test_anonymous_post(self):
data='turtleturtleturtle',
)
self.assertEqual(_resp.status_code, HTTPStatus.UNAUTHORIZED)
- self.assertFalse(_mock_tract.swallow.called)
+ self.assertFalse(_mock_tract.ingest.called)
def test_nontrusted_post(self):
with patch_feature_flag(FeatureFlag.FORBID_UNTRUSTED_FEED):
@@ -152,7 +156,7 @@ def test_nontrusted_post(self):
HTTP_AUTHORIZATION=_nontrusted_user.authorization(),
)
self.assertEqual(_resp.status_code, HTTPStatus.FORBIDDEN)
- self.assertFalse(_mock_tract.swallow.called)
+ self.assertFalse(_mock_tract.ingest.called)
def test_anonymous_delete(self):
with mock.patch('trove.views.ingest.digestive_tract') as _mock_tract:
@@ -185,4 +189,4 @@ def test_invalid_expiration_date(self):
HTTP_AUTHORIZATION=self.user.authorization(),
)
self.assertEqual(_resp.status_code, HTTPStatus.BAD_REQUEST)
- self.assertFalse(_mock_tract.swallow.called)
+ self.assertFalse(_mock_tract.ingest.called)
diff --git a/trove/admin.py b/trove/admin.py
index 8db71772e..5ef20eac3 100644
--- a/trove/admin.py
+++ b/trove/admin.py
@@ -5,12 +5,12 @@
from share.admin.util import TimeLimitedPaginator, linked_fk, linked_many
from share.search.index_messenger import IndexMessenger
from trove.models import (
- ArchivedIndexcardRdf,
+ ArchivedResourceDescription,
DerivedIndexcard,
Indexcard,
- LatestIndexcardRdf,
+ LatestResourceDescription,
ResourceIdentifier,
- SupplementaryIndexcardRdf,
+ SupplementaryResourceDescription,
)
@@ -30,10 +30,10 @@ class ResourceIdentifierAdmin(admin.ModelAdmin):
@admin.register(Indexcard, site=admin_site)
-@linked_many('archived_rdf_set', defer=('rdf_as_turtle',))
-@linked_many('supplementary_rdf_set', defer=('rdf_as_turtle',))
+@linked_many('archived_description_set', defer=('rdf_as_turtle',))
+@linked_many('supplementary_description_set', defer=('rdf_as_turtle',))
@linked_many('derived_indexcard_set', defer=('derived_text',))
-@linked_fk('latest_rdf')
+@linked_fk('latest_resource_description')
@linked_fk('source_record_suid')
@linked_many('focustype_identifier_set')
@linked_many('focus_identifier_set')
@@ -57,10 +57,9 @@ def _freshen_index(self, request, queryset):
_freshen_index.short_description = 'freshen indexcard in search index'
-@admin.register(LatestIndexcardRdf, site=admin_site)
-@linked_fk('from_raw_datum')
+@admin.register(LatestResourceDescription, site=admin_site)
@linked_fk('indexcard')
-class LatestIndexcardRdfAdmin(admin.ModelAdmin):
+class LatestResourceDescriptionAdmin(admin.ModelAdmin):
readonly_fields = (
'created',
'modified',
@@ -79,10 +78,9 @@ def rdf_as_turtle__pre(self, instance):
rdf_as_turtle__pre.short_description = 'rdf as turtle'
-@admin.register(ArchivedIndexcardRdf, site=admin_site)
-@linked_fk('from_raw_datum')
+@admin.register(ArchivedResourceDescription, site=admin_site)
@linked_fk('indexcard')
-class ArchivedIndexcardRdfAdmin(admin.ModelAdmin):
+class ArchivedResourceDescriptionAdmin(admin.ModelAdmin):
readonly_fields = (
'created',
'modified',
@@ -92,8 +90,8 @@ class ArchivedIndexcardRdfAdmin(admin.ModelAdmin):
)
exclude = ('rdf_as_turtle',)
paginator = TimeLimitedPaginator
- list_display = ('id', 'indexcard', 'from_raw_datum', 'created', 'modified')
- list_select_related = ('indexcard', 'from_raw_datum',)
+ list_display = ('id', 'indexcard', 'created', 'modified')
+ list_select_related = ('indexcard',)
show_full_result_count = False
def rdf_as_turtle__pre(self, instance):
@@ -101,11 +99,10 @@ def rdf_as_turtle__pre(self, instance):
rdf_as_turtle__pre.short_description = 'rdf as turtle'
-@admin.register(SupplementaryIndexcardRdf, site=admin_site)
-@linked_fk('from_raw_datum')
+@admin.register(SupplementaryResourceDescription, site=admin_site)
@linked_fk('indexcard')
@linked_fk('supplementary_suid')
-class SupplementaryIndexcardRdfAdmin(admin.ModelAdmin):
+class SupplementaryResourceDescriptionAdmin(admin.ModelAdmin):
readonly_fields = (
'created',
'modified',
@@ -115,8 +112,8 @@ class SupplementaryIndexcardRdfAdmin(admin.ModelAdmin):
)
exclude = ('rdf_as_turtle',)
paginator = TimeLimitedPaginator
- list_display = ('id', 'indexcard', 'from_raw_datum', 'created', 'modified')
- list_select_related = ('indexcard', 'from_raw_datum',)
+ list_display = ('id', 'indexcard', 'created', 'modified')
+ list_select_related = ('indexcard',)
show_full_result_count = False
def rdf_as_turtle__pre(self, instance):
diff --git a/trove/derive/_base.py b/trove/derive/_base.py
index 9909e8f19..bc8d8b583 100644
--- a/trove/derive/_base.py
+++ b/trove/derive/_base.py
@@ -2,18 +2,18 @@
from primitive_metadata import primitive_rdf
-from trove.models import IndexcardRdf
+from trove.models.resource_description import ResourceDescription
class IndexcardDeriver(abc.ABC):
- upriver_rdf: IndexcardRdf
+ upstream_description: ResourceDescription
focus_iri: str
data: primitive_rdf.RdfGraph
- def __init__(self, upriver_rdf: IndexcardRdf):
- self.upriver_rdf = upriver_rdf
- self.focus_iri = upriver_rdf.focus_iri
- self.data = upriver_rdf.as_rdfdoc_with_supplements()
+ def __init__(self, upstream_description: ResourceDescription):
+ self.upstream_description = upstream_description
+ self.focus_iri = upstream_description.focus_iri
+ self.data = upstream_description.as_rdfdoc_with_supplements()
def q(self, pathset):
# convenience for querying self.data on self.focus_iri
diff --git a/trove/derive/osfmap_json_mini.py b/trove/derive/osfmap_json_mini.py
index c4da33e08..cd4520f62 100644
--- a/trove/derive/osfmap_json_mini.py
+++ b/trove/derive/osfmap_json_mini.py
@@ -2,36 +2,8 @@
from trove.derive.osfmap_json import OsfmapJsonFullDeriver
from trove.vocab.namespaces import TROVE
-INCLUDED_PREDICATE_SET = frozenset({
- ns.RDF.type,
- ns.DCTERMS.title,
- ns.DCTERMS.creator,
- ns.DCTERMS.date,
- ns.DCTERMS.created,
- ns.FOAF.name,
- ns.OWL.sameAs,
- ns.DCTERMS.conformsTo,
- ns.DCTERMS.dateCopyrighted,
- ns.DCTERMS.description,
- ns.DCTERMS.hasPart,
- ns.DCTERMS.isVersionOf,
- ns.DCTERMS.modified,
- ns.DCTERMS.publisher,
- ns.DCTERMS.rights,
- ns.DCTERMS.subject,
- ns.DCTERMS.isPartOf,
- ns.DCTERMS.identifier,
- ns.SKOS.inScheme,
- ns.SKOS.prefLabel,
- ns.OSFMAP.affiliation,
- ns.OSFMAP.archivedAt,
- ns.DCTERMS.dateAccepted,
- ns.DCTERMS.dateModified,
- ns.OSFMAP.hostingInstitution,
- ns.OSFMAP.keyword,
- ns.OSFMAP.fileName,
- ns.OSFMAP.filePath,
- ns.OSFMAP.isContainedBy
+EXCLUDED_PREDICATE_SET = frozenset({
+ ns.OSFMAP.contains,
})
@@ -57,4 +29,4 @@ def convert_tripledict(self):
@staticmethod
def _should_keep_predicate(predicate: str) -> bool:
- return predicate in INCLUDED_PREDICATE_SET
+ return predicate not in EXCLUDED_PREDICATE_SET
diff --git a/trove/derive/sharev2_elastic.py b/trove/derive/sharev2_elastic.py
index c00f45925..27c7b3a06 100644
--- a/trove/derive/sharev2_elastic.py
+++ b/trove/derive/sharev2_elastic.py
@@ -146,7 +146,7 @@ def should_skip(self) -> bool:
# abstract method from IndexcardDeriver
def derive_card_as_text(self):
- _suid = self.upriver_rdf.indexcard.source_record_suid
+ _suid = self.upstream_description.indexcard.source_record_suid
try: # maintain doc id in the sharev2 index
_suid = _suid.get_backcompat_sharev2_suid()
except share_db.SourceUniqueIdentifier.DoesNotExist:
@@ -157,10 +157,9 @@ def derive_card_as_text(self):
###
# metadata about the record/indexcard in this system
'id': IDObfuscator.encode(_suid),
- 'indexcard_id': self.upriver_rdf.indexcard.id,
- 'rawdatum_id': self.upriver_rdf.from_raw_datum_id,
- 'date_created': _suid.get_date_first_seen().isoformat(),
- 'date_modified': self.upriver_rdf.modified.isoformat(),
+ 'indexcard_id': self.upstream_description.indexcard.id,
+ 'date_created': self.upstream_description.indexcard.created.isoformat(),
+ 'date_modified': self.upstream_description.modified.isoformat(),
'sources': [_source_name],
'source_config': _suid.source_config.label,
'source_unique_id': _suid.identifier,
diff --git a/trove/digestive_tract.py b/trove/digestive_tract.py
index e409eceb8..a91a9d633 100644
--- a/trove/digestive_tract.py
+++ b/trove/digestive_tract.py
@@ -2,11 +2,11 @@
leaning (perhaps too far) into "ingest" as metaphor
-swallow: store a given record by checksum; queue for extraction
+sniff: set up identifiers about a record
extract: gather rdf graph from a record; store as index card(s)
-derive: build other kinds of index cards from the extracted rdf
+derive: build other representations from latest card version(s)
'''
-__all__ = ('swallow', 'extract', 'derive')
+__all__ = ('sniff', 'extract', 'derive', 'expel', 'ingest')
import copy
import datetime
@@ -23,43 +23,79 @@
from trove.exceptions import (
CannotDigestExpiredDatum,
DigestiveError,
- MissingMediatype,
)
from trove.extract import get_rdf_extractor_class
from trove.derive import get_deriver_classes
+from trove.util.iris import smells_like_iri
from trove.vocab.namespaces import RDFS, RDF, OWL
logger = logging.getLogger(__name__)
-@transaction.atomic
-def swallow(
+def ingest(
*, # all keyword-args
from_user: share_db.ShareUser,
- record: str,
- record_identifier: str,
- record_mediatype: str,
focus_iri: str,
- datestamp: datetime.datetime | None = None, # default "now"
+ record_mediatype: str,
+ raw_record: str,
+ record_identifier: str = '', # default focus_iri
+ is_supplementary: bool = False,
expiration_date: datetime.date | None = None, # default "never"
+ restore_deleted: bool = False,
urgent: bool = False,
- is_supplementary: bool = False,
):
- '''swallow: store a given record by checksum; queue for extraction
+ '''ingest: shorthand for sniff + extract + (eventual) derive'''
+ _suid = sniff(
+ from_user=from_user,
+ record_identifier=record_identifier,
+ focus_iri=focus_iri,
+ is_supplementary=is_supplementary,
+ )
+ if _suid.source_config.disabled or _suid.source_config.source.is_deleted:
+ expel_suid(_suid)
+ else:
+ _extracted_cards = extract(
+ suid=_suid,
+ record_mediatype=record_mediatype,
+ raw_record=raw_record,
+ restore_deleted=restore_deleted,
+ expiration_date=expiration_date,
+ )
+ for _card in _extracted_cards:
+ task__derive.delay(_card.id, urgent=urgent)
+
+
+@transaction.atomic
+def sniff(
+ *, # all keyword-args
+ from_user: share_db.ShareUser,
+ focus_iri: str,
+ record_identifier: str = '',
+ is_supplementary: bool = False,
+) -> share_db.SourceUniqueIdentifier:
+ '''sniff: get a vague sense of a metadata record without touching the record itself
+
+ ensures in the database:
+ * `share.models.Source`/`SourceConfig` for given `from_user`, with...
+ * `share.models.SourceUniqueIdentifier` for given `record_identifier`, with...
+ * `trove.models.ResourceIdentifier` for given `focus_iri`
- will create (or update) one of each:
- Source (from whom/where is it?)
- SourceConfig (how did/do we get it?)
- SourceUniqueIdentifier (by what name do/would they know it?)
- RawDatum ("it", a metadata record)
+ returns the `SourceUniqueIdentifier`, as the center of that constellation
+
+ for a given `(from_user, record_identifier)` pair, `focus_iri` and `is_supplementary`
+ must not change -- raises `DigestiveError` if called again with different values
'''
- if not isinstance(record, str):
- raise DigestiveError('datum must be a string')
+ if not smells_like_iri(focus_iri):
+ raise DigestiveError(f'invalid focus_iri "{focus_iri}"')
+ if is_supplementary and not record_identifier:
+ raise DigestiveError(f'supplementary records must have non-empty record_identifier! focus_iri={focus_iri} from_user={from_user}')
+ if is_supplementary and (record_identifier == focus_iri):
+ raise DigestiveError(f'supplementary records must have record_identifier distinct from their focus! focus_iri={focus_iri} record_identifier={record_identifier} from_user={from_user}')
_source_config = share_db.SourceConfig.objects.get_or_create_push_config(from_user)
_suid, _suid_created = share_db.SourceUniqueIdentifier.objects.get_or_create(
source_config=_source_config,
- identifier=record_identifier,
+ identifier=record_identifier or focus_iri,
defaults={
'is_supplementary': is_supplementary,
},
@@ -73,45 +109,43 @@ def swallow(
else:
if _suid.focus_identifier_id != _focus_identifier.id:
raise DigestiveError(f'suid focus_identifier should not change! suid={_suid}, focus changed from {_suid.focus_identifier} to {_focus_identifier}')
- _raw = share_db.RawDatum.objects.store_datum_for_suid(
- suid=_suid,
- datum=record,
- mediatype=record_mediatype,
- datestamp=(datestamp or datetime.datetime.now(tz=datetime.timezone.utc)),
- expiration_date=expiration_date,
- )
- _task = task__extract_and_derive.delay(_raw.id, urgent=urgent)
- return _task.id
+ return _suid
-def extract(raw: share_db.RawDatum, *, undelete_indexcards=False) -> list[trove_db.Indexcard]:
+def extract(
+ suid: share_db.SourceUniqueIdentifier,
+ record_mediatype: str,
+ raw_record: str,
+ *,
+ expiration_date: datetime.date | None = None, # default "never"
+ restore_deleted: bool = False,
+) -> list[trove_db.Indexcard]:
'''extract: gather rdf graph from a record; store as index card(s)
may create (or update):
ResourceIdentifier (for each described resource and its types)
Indexcard (with identifiers and type-identifiers for each described resource)
- ArchivedIndexcardRdf (all extracted metadata, if non-supplementary)
- LatestIndexcardRdf (all extracted metadata, if latest raw and non-supplementary)
- SupplementaryIndexcardRdf (all extracted metadata, if supplementary)
+ ArchivedResourceDescription (all extracted metadata, if non-supplementary)
+ LatestResourceDescription (all extracted metadata, if latest raw and non-supplementary)
+ SupplementaryResourceDescription (all extracted metadata, if supplementary)
may delete:
- LatestIndexcardRdf (previously extracted from the record, but no longer present)
+ LatestResourceDescription (previously extracted from the record, but no longer present)
'''
- assert raw.mediatype is not None, 'raw datum has no mediatype -- did you mean to call extract_legacy?'
- if raw.is_expired:
- raise CannotDigestExpiredDatum(raw)
+ if (expiration_date is not None) and (expiration_date <= datetime.date.today()):
+ raise CannotDigestExpiredDatum(suid, expiration_date)
_tripledicts_by_focus_iri = {}
- _extractor = get_rdf_extractor_class(raw.mediatype)(raw.suid.source_config)
+ _extractor = get_rdf_extractor_class(record_mediatype)(suid.source_config)
# TODO normalize (or just validate) tripledict:
# - synonymous iris should be grouped (only one as subject-key, others under owl:sameAs)
# - focus should have rdf:type
# - no subject-key iris which collide by trove_db.ResourceIdentifier equivalence
# - connected graph (all subject-key iris reachable from focus, or reverse for vocab terms?)
- _extracted_tripledict: primitive_rdf.RdfTripleDictionary = _extractor.extract_rdf(raw.datum)
+ _extracted_tripledict: primitive_rdf.RdfTripleDictionary = _extractor.extract_rdf(raw_record)
if _extracted_tripledict:
try:
- _focus_iri = raw.suid.focus_identifier.find_equivalent_iri(_extracted_tripledict)
+ _focus_iri = suid.focus_identifier.find_equivalent_iri(_extracted_tripledict)
except ValueError:
- raise DigestiveError(f'could not find {raw.suid.focus_identifier} in {raw}')
+ raise DigestiveError(f'could not find {suid.focus_identifier} in """{raw_record}"""')
_tripledicts_by_focus_iri[_focus_iri] = _extracted_tripledict
# special case: if the record defines an ontology, create a
# card for each subject iri that starts with the focus iri
@@ -125,15 +159,17 @@ def extract(raw: share_db.RawDatum, *, undelete_indexcards=False) -> list[trove_
(_iri, RDFS.isDefinedBy, _focus_iri),
)
_tripledicts_by_focus_iri[_iri] = _term_tripledict
- if raw.suid.is_supplementary:
+ if suid.is_supplementary:
return trove_db.Indexcard.objects.supplement_indexcards_from_tripledicts(
- from_raw_datum=raw,
+ supplementary_suid=suid,
rdf_tripledicts_by_focus_iri=_tripledicts_by_focus_iri,
+ expiration_date=expiration_date,
)
return trove_db.Indexcard.objects.save_indexcards_from_tripledicts(
- from_raw_datum=raw,
+ suid=suid,
rdf_tripledicts_by_focus_iri=_tripledicts_by_focus_iri,
- undelete=undelete_indexcards,
+ restore_deleted=restore_deleted,
+ expiration_date=expiration_date,
)
@@ -146,12 +182,12 @@ def derive(indexcard: trove_db.Indexcard, deriver_iris=None):
if indexcard.deleted:
return []
try:
- _latest_rdf = indexcard.latest_rdf
- except trove_db.LatestIndexcardRdf.DoesNotExist:
+ _latest_resource_description = indexcard.latest_resource_description
+ except trove_db.LatestResourceDescription.DoesNotExist:
return []
_derived_list = []
for _deriver_class in get_deriver_classes(deriver_iris):
- _deriver = _deriver_class(upriver_rdf=_latest_rdf)
+ _deriver = _deriver_class(upstream_description=_latest_resource_description)
_deriver_identifier = trove_db.ResourceIdentifier.objects.get_or_create_for_iri(_deriver.deriver_iri())
if _deriver.should_skip():
trove_db.DerivedIndexcard.objects.filter(
@@ -160,7 +196,7 @@ def derive(indexcard: trove_db.Indexcard, deriver_iris=None):
).delete()
else:
_derived_text = _deriver.derive_card_as_text()
- _derived_checksum_iri = ChecksumIri.digest('sha-256', salt='', raw_data=_derived_text)
+ _derived_checksum_iri = ChecksumIri.digest('sha-256', salt='', data=_derived_text)
_derived, _ = trove_db.DerivedIndexcard.objects.update_or_create(
upriver_indexcard=indexcard,
deriver_identifier=_deriver_identifier,
@@ -185,65 +221,43 @@ def expel(from_user: share_db.ShareUser, record_identifier: str):
def expel_suid(suid: share_db.SourceUniqueIdentifier) -> None:
for _indexcard in trove_db.Indexcard.objects.filter(source_record_suid=suid):
_indexcard.pls_delete()
- _expel_supplementary_rdf(
- trove_db.SupplementaryIndexcardRdf.objects.filter(supplementary_suid=suid),
+ _expel_supplementary_descriptions(
+ trove_db.SupplementaryResourceDescription.objects.filter(supplementary_suid=suid),
)
def expel_expired_data(today: datetime.date) -> None:
# mark indexcards deleted if their latest update has now expired
for _indexcard in trove_db.Indexcard.objects.filter(
- trove_latestindexcardrdf_set__from_raw_datum__expiration_date__lte=today,
+ trove_latestresourcedescription_set__expiration_date__lte=today,
):
_indexcard.pls_delete()
# delete expired supplementary metadata
- _expel_supplementary_rdf(
- trove_db.SupplementaryIndexcardRdf.objects.filter(
- from_raw_datum__expiration_date__lte=today,
- ),
+ _expel_supplementary_descriptions(
+ trove_db.SupplementaryResourceDescription.objects.filter(expiration_date__lte=today),
)
-def _expel_supplementary_rdf(supplementary_rdf_queryset) -> None:
+def _expel_supplementary_descriptions(supplementary_rdf_queryset) -> None:
# delete expired supplementary metadata
_affected_indexcards = set()
- for _supplementary_rdf in supplementary_rdf_queryset.select_related('indexcard'):
- if not _supplementary_rdf.indexcard.deleted:
- _affected_indexcards.add(_supplementary_rdf.indexcard)
- _supplementary_rdf.delete()
+ for _supplement in supplementary_rdf_queryset.select_related('indexcard'):
+ if not _supplement.indexcard.deleted:
+ _affected_indexcards.add(_supplement.indexcard)
+ _supplement.delete()
for _indexcard in _affected_indexcards:
task__derive.delay(_indexcard.id)
### BEGIN celery tasks
-@celery.shared_task(acks_late=True, bind=True)
-def task__extract_and_derive(task: celery.Task, raw_id: int, urgent=False):
- _raw = (
- share_db.RawDatum.objects
- .select_related('suid__source_config__source')
- .get(id=raw_id)
- )
- _source_config = _raw.suid.source_config
- if _source_config.disabled or _source_config.source.is_deleted:
- expel_suid(_raw.suid)
- else:
- if not _raw.mediatype:
- raise MissingMediatype(_raw)
- _indexcards = extract(_raw, undelete_indexcards=urgent)
- if _raw.is_latest():
- _messenger = IndexMessenger(celery_app=task.app)
- for _indexcard in _indexcards:
- derive(_indexcard)
- _messenger.notify_indexcard_update(_indexcards, urgent=urgent)
-
-
@celery.shared_task(acks_late=True, bind=True)
def task__derive(
task: celery.Task,
indexcard_id: int,
deriver_iri: str | None = None,
notify_index=True,
+ urgent=False,
):
_indexcard = trove_db.Indexcard.objects.get(id=indexcard_id)
derive(
@@ -253,21 +267,18 @@ def task__derive(
# TODO: avoid unnecessary work; let IndexStrategy subscribe to a specific
# IndexcardDeriver (perhaps by deriver-specific MessageType?)
if notify_index:
- IndexMessenger(celery_app=task.app).notify_indexcard_update([_indexcard])
+ IndexMessenger(celery_app=task.app).notify_indexcard_update([_indexcard], urgent=urgent)
@celery.shared_task(acks_late=True)
-def task__schedule_extract_and_derive_for_source_config(source_config_id: int):
- _raw_id_qs = (
- share_db.RawDatum.objects
- .latest_by_suid_queryset(
- share_db.SourceUniqueIdentifier.objects
- .filter(source_config_id=source_config_id)
- )
+def task__schedule_derive_for_source_config(source_config_id: int, notify_index=False):
+ _indexcard_id_qs = (
+ trove_db.Indexcard.objects
+ .filter(source_record_suid__source_config_id=source_config_id)
.values_list('id', flat=True)
)
- for _raw_id in _raw_id_qs.iterator():
- task__extract_and_derive.delay(_raw_id)
+ for _indexcard_id in _indexcard_id_qs.iterator():
+ task__derive.delay(_indexcard_id, notify_index=notify_index)
@celery.shared_task(acks_late=True)
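
Taken together, the module now splits the old store-then-queue flow into a synchronous sniff + extract and an asynchronous derive. A hedged usage sketch of the new `ingest` entry point (the user and record text are placeholder variables; keyword names match the signature above):

from trove import digestive_tract
from trove.vocab import mediatypes

digestive_tract.ingest(
    from_user=push_user,                        # a share.models.ShareUser (placeholder)
    focus_iri='https://example.org/record/1',   # must smell like an iri, else DigestiveError
    record_mediatype=mediatypes.TURTLE,
    raw_record=turtle_text,                     # the metadata record itself (placeholder)
    # record_identifier omitted -- defaults to focus_iri
    urgent=True,
)
# ingest == sniff (ensure Source/SourceConfig/SourceUniqueIdentifier/ResourceIdentifier rows)
#        -> extract (store extracted rdf as ResourceDescription rows on index cards)
#        -> task__derive.delay(card.id, urgent=True) for each extracted card
# ...or expel_suid(suid) instead, when the source config is disabled or its source deleted
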
diff --git a/trove/management/commands/ingest_from_another_shtrove.py b/trove/management/commands/ingest_from_another_shtrove.py
deleted file mode 100644
index 09ab22aa6..000000000
--- a/trove/management/commands/ingest_from_another_shtrove.py
+++ /dev/null
@@ -1,68 +0,0 @@
-import functools
-from itertools import islice
-import re
-from urllib.parse import urlunsplit
-
-from django.conf import settings
-from django.core.management.base import BaseCommand
-import requests
-
-from share import models as share_db
-from trove import digestive_tract
-from trove.vocab import mediatypes
-
-
-class Command(BaseCommand):
- help = "ingest metadata from another SHARE/trove instance"
-
- def add_arguments(self, parser):
- parser.add_argument("host", help="host name of the shtrove instance (e.g. 'staging-share.osf.io')")
- parser.add_argument("--count", type=int, default=333)
-
- def handle(self, *args, host, count, **options):
- if not settings.DEBUG:
- raise Exception('this command not meant for non-debug use')
- _ingested_count = 0
- _skipped_count = 0
- for _datum in islice(self._iter_datums(host), count):
- if self._ingest(_datum):
- _ingested_count += 1
- else:
- _skipped_count += 1
- self.stdout.write(
- self.style.SUCCESS(f'ingested {_ingested_count} (skipped {_skipped_count}) from {host}')
- )
-
- def _iter_datums(self, host: str):
- _url = urlunsplit(('https', host, '/api/v2/rawdata/', '', ''))
- while _url:
- self.stdout.write('fetching a page...')
- _json = requests.get(_url, headers={'Accept': mediatypes.JSONAPI}).json()
- for _item in _json['data']:
- yield _item['attributes']['datum']
- _url = _json['links'].get('next')
-
- def _ingest(self, datum: str) -> bool:
- # HACK: get only turtle files by checking it starts with a prefix (unreliable, generally, but good enough for this)
- _smells_like_turtle = datum.startswith('@prefix ') or datum.startswith('PREFIX ')
- if _smells_like_turtle:
- _first_subject_match = re.search(
- r'^<([^>\s]+)>', # HACK: depends on specific serialization
- datum,
- re.MULTILINE,
- )
- if _first_subject_match:
- _subject_iri = _first_subject_match.group(1)
- digestive_tract.swallow(
- from_user=self._application_user,
- record=datum,
- record_identifier=_subject_iri,
- record_mediatype=mediatypes.TURTLE,
- focus_iri=_subject_iri,
- )
- return True
- return False
-
- @functools.cached_property
- def _application_user(self):
- return share_db.ShareUser.objects.get(username=settings.APPLICATION_USERNAME)
diff --git a/trove/migrations/0009_no_raw_datum.py b/trove/migrations/0009_no_raw_datum.py
new file mode 100644
index 000000000..47129a367
--- /dev/null
+++ b/trove/migrations/0009_no_raw_datum.py
@@ -0,0 +1,27 @@
+from django.db import migrations
+
+
+class Migration(migrations.Migration):
+
+ dependencies = [
+ ('trove', '0008_expiration_dates'),
+ ]
+
+ operations = [
+ migrations.RemoveConstraint(
+ model_name='archivedindexcardrdf',
+ name='trove_archivedindexcardrdf_uniq_archived_version',
+ ),
+ migrations.RemoveField(
+ model_name='archivedindexcardrdf',
+ name='from_raw_datum',
+ ),
+ migrations.RemoveField(
+ model_name='latestindexcardrdf',
+ name='from_raw_datum',
+ ),
+ migrations.RemoveField(
+ model_name='supplementaryindexcardrdf',
+ name='from_raw_datum',
+ ),
+ ]
diff --git a/trove/migrations/0010_resource_description_rename.py b/trove/migrations/0010_resource_description_rename.py
new file mode 100644
index 000000000..79cfd8d96
--- /dev/null
+++ b/trove/migrations/0010_resource_description_rename.py
@@ -0,0 +1,44 @@
+from django.db import migrations, models
+import django.db.models.deletion
+
+
+class Migration(migrations.Migration):
+
+ dependencies = [
+ ('trove', '0009_no_raw_datum'),
+ ]
+
+ operations = [
+ migrations.RenameModel(
+ old_name='ArchivedIndexcardRdf',
+ new_name='ArchivedResourceDescription',
+ ),
+ migrations.RenameModel(
+ old_name='LatestIndexcardRdf',
+ new_name='LatestResourceDescription',
+ ),
+ migrations.RenameModel(
+ old_name='SupplementaryIndexcardRdf',
+ new_name='SupplementaryResourceDescription',
+ ),
+ migrations.AlterField(
+ model_name='archivedresourcedescription',
+ name='indexcard',
+ field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='trove_archivedresourcedescription_set', to='trove.indexcard'),
+ ),
+ migrations.AlterField(
+ model_name='latestresourcedescription',
+ name='indexcard',
+ field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='trove_latestresourcedescription_set', to='trove.indexcard'),
+ ),
+ migrations.AlterField(
+ model_name='supplementaryresourcedescription',
+ name='indexcard',
+ field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='trove_supplementaryresourcedescription_set', to='trove.indexcard'),
+ ),
+ migrations.AlterField(
+ model_name='supplementaryresourcedescription',
+ name='supplementary_suid',
+ field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='supplementary_description_set', to='share.sourceuniqueidentifier'),
+ ),
+ ]
diff --git a/trove/models/__init__.py b/trove/models/__init__.py
index acaadd7c4..60318fbc6 100644
--- a/trove/models/__init__.py
+++ b/trove/models/__init__.py
@@ -1,18 +1,18 @@
__all__ = (
- 'ResourceIdentifier',
- 'Indexcard',
- 'IndexcardRdf',
- 'LatestIndexcardRdf',
- 'ArchivedIndexcardRdf',
- 'SupplementaryIndexcardRdf',
+ 'ArchivedResourceDescription',
'DerivedIndexcard',
+ 'Indexcard',
+ 'LatestResourceDescription',
+ 'ResourceDescription',
+ 'ResourceIdentifier',
+ 'SupplementaryResourceDescription',
)
-from .indexcard import (
- ArchivedIndexcardRdf,
- DerivedIndexcard,
- Indexcard,
- IndexcardRdf,
- LatestIndexcardRdf,
- SupplementaryIndexcardRdf,
+from .derived_indexcard import DerivedIndexcard
+from .indexcard import Indexcard
+from .resource_description import (
+ ArchivedResourceDescription,
+ LatestResourceDescription,
+ ResourceDescription,
+ SupplementaryResourceDescription,
)
from .resource_identifier import ResourceIdentifier
diff --git a/trove/models/derived_indexcard.py b/trove/models/derived_indexcard.py
new file mode 100644
index 000000000..52f0d3989
--- /dev/null
+++ b/trove/models/derived_indexcard.py
@@ -0,0 +1,50 @@
+from __future__ import annotations
+
+from django.db import models
+from primitive_metadata import primitive_rdf as rdf
+
+from trove.models.resource_identifier import ResourceIdentifier
+
+__all__ = ('DerivedIndexcard',)
+
+
+class DerivedIndexcard(models.Model):
+ # auto:
+ created = models.DateTimeField(auto_now_add=True)
+ modified = models.DateTimeField(auto_now=True)
+
+ # required:
+ upriver_indexcard = models.ForeignKey(
+ 'trove.Indexcard',
+ on_delete=models.CASCADE,
+ related_name='derived_indexcard_set',
+ )
+ deriver_identifier = models.ForeignKey(ResourceIdentifier, on_delete=models.PROTECT, related_name='+')
+ derived_checksum_iri = models.TextField()
+ derived_text = models.TextField() # TODO: store elsewhere by checksum
+
+ class Meta:
+ constraints = [
+ models.UniqueConstraint(
+ fields=('upriver_indexcard', 'deriver_identifier'),
+ name='%(app_label)s_%(class)s_upriverindexcard_deriveridentifier',
+ ),
+ ]
+
+ def __repr__(self):
+ return f'<{self.__class__.__qualname__}({self.id}, {self.upriver_indexcard.uuid}, "{self.deriver_identifier.sufficiently_unique_iri}")'
+
+ def __str__(self):
+ return repr(self)
+
+ @property
+ def deriver_cls(self):
+ from trove.derive import get_deriver_classes
+ (_deriver_cls,) = get_deriver_classes(self.deriver_identifier.raw_iri_list)
+ return _deriver_cls
+
+ def as_rdf_literal(self) -> rdf.Literal:
+ return rdf.literal(
+ self.derived_text,
+ datatype_iris=self.deriver_cls.derived_datatype_iris(),
+ )
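
For completeness, a small sketch of reading a derived card back out under the model above (the indexcard and deriver IRI are placeholders):

from trove import models as trove_db

_derived = trove_db.DerivedIndexcard.objects.get(
    upriver_indexcard=indexcard,    # placeholder trove_db.Indexcard
    deriver_identifier__in=trove_db.ResourceIdentifier.objects.queryset_for_iri(deriver_iri),
)
_literal = _derived.as_rdf_literal()  # derived_text, typed with the deriver's datatype iris
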
diff --git a/trove/models/indexcard.py b/trove/models/indexcard.py
index 6ae24b4b0..ba6de67d3 100644
--- a/trove/models/indexcard.py
+++ b/trove/models/indexcard.py
@@ -1,4 +1,5 @@
from __future__ import annotations
+import datetime
import uuid
from django.db import models
@@ -9,11 +10,21 @@
from share import models as share_db # TODO: break this dependency
from share.util.checksum_iri import ChecksumIri
from trove.exceptions import DigestiveError
+from trove.models.derived_indexcard import DerivedIndexcard
+from trove.models.resource_description import (
+ ArchivedResourceDescription,
+ ResourceDescription,
+ LatestResourceDescription,
+ SupplementaryResourceDescription,
+)
from trove.models.resource_identifier import ResourceIdentifier
from trove.vocab.namespaces import RDF
from trove.vocab.trove import trove_indexcard_iri, trove_indexcard_namespace
+__all__ = ('Indexcard',)
+
+
class IndexcardManager(models.Manager):
def get_for_iri(self, iri: str):
_uuid = rdf.iri_minus_namespace(iri, namespace=trove_indexcard_namespace())
@@ -22,21 +33,21 @@ def get_for_iri(self, iri: str):
@transaction.atomic
def save_indexcards_from_tripledicts(
self, *,
- from_raw_datum: share_db.RawDatum,
+ suid: share_db.SourceUniqueIdentifier,
rdf_tripledicts_by_focus_iri: dict[str, rdf.RdfTripleDictionary],
- undelete: bool = False,
+ restore_deleted: bool = False,
+ expiration_date: datetime.date | None = None,
) -> list['Indexcard']:
- assert not from_raw_datum.suid.is_supplementary
- from_raw_datum.no_output = (not rdf_tripledicts_by_focus_iri)
- from_raw_datum.save(update_fields=['no_output'])
+ assert not suid.is_supplementary
_indexcards = []
_seen_focus_identifier_ids: set[str] = set()
for _focus_iri, _tripledict in rdf_tripledicts_by_focus_iri.items():
_indexcard = self.save_indexcard_from_tripledict(
- from_raw_datum=from_raw_datum,
+ suid=suid,
rdf_tripledict=_tripledict,
focus_iri=_focus_iri,
- undelete=undelete,
+ restore_deleted=restore_deleted,
+ expiration_date=expiration_date,
)
_focus_identifier_ids = {_fid.id for _fid in _indexcard.focus_identifier_set.all()}
if not _seen_focus_identifier_ids.isdisjoint(_focus_identifier_ids):
@@ -50,7 +61,7 @@ def save_indexcards_from_tripledicts(
# cards seen previously on this suid (but not this time) treated as deleted
for _indexcard_to_delete in (
Indexcard.objects
- .filter(source_record_suid=from_raw_datum.suid)
+ .filter(source_record_suid=suid)
.exclude(id__in=[_card.id for _card in _indexcards])
):
_indexcard_to_delete.pls_delete()
@@ -60,28 +71,25 @@ def save_indexcards_from_tripledicts(
@transaction.atomic
def supplement_indexcards_from_tripledicts(
self, *,
- from_raw_datum: share_db.RawDatum,
+ supplementary_suid: share_db.SourceUniqueIdentifier,
rdf_tripledicts_by_focus_iri: dict[str, rdf.RdfTripleDictionary],
+ expiration_date: datetime.date | None = None,
) -> list[Indexcard]:
- assert from_raw_datum.suid.is_supplementary
- assert not from_raw_datum.is_expired
- from_raw_datum.no_output = (not rdf_tripledicts_by_focus_iri)
- from_raw_datum.save(update_fields=['no_output'])
- if not from_raw_datum.is_latest():
- return []
+ assert supplementary_suid.is_supplementary
_indexcards = []
for _focus_iri, _tripledict in rdf_tripledicts_by_focus_iri.items():
_indexcards.extend(self.supplement_indexcards(
- from_raw_datum=from_raw_datum,
+ supplementary_suid=supplementary_suid,
rdf_tripledict=_tripledict,
focus_iri=_focus_iri,
+ expiration_date=expiration_date,
))
_seen_indexcard_ids = {_card.id for _card in _indexcards}
# supplementary data seen previously on this suid (but not this time) should be deleted
for _supplement_to_delete in (
- SupplementaryIndexcardRdf.objects
- .filter(supplementary_suid=from_raw_datum.suid)
- .exclude(from_raw_datum=from_raw_datum)
+ SupplementaryResourceDescription.objects
+ .filter(supplementary_suid=supplementary_suid)
+ .exclude(indexcard__in=_indexcards)
):
if _supplement_to_delete.indexcard_id not in _seen_indexcard_ids:
_indexcards.append(_supplement_to_delete.indexcard)
@@ -91,13 +99,13 @@ def supplement_indexcards_from_tripledicts(
@transaction.atomic
def save_indexcard_from_tripledict(
self, *,
- from_raw_datum: share_db.RawDatum,
+ suid: share_db.SourceUniqueIdentifier,
rdf_tripledict: rdf.RdfTripleDictionary,
focus_iri: str,
- undelete: bool = False,
+ restore_deleted: bool = False,
+ expiration_date: datetime.date | None = None,
):
- assert not from_raw_datum.suid.is_supplementary
- assert not from_raw_datum.is_expired
+ assert not suid.is_supplementary
_focus_identifier_set = (
ResourceIdentifier.objects
.save_equivalent_identifier_set(rdf_tripledict, focus_iri)
@@ -107,42 +115,40 @@ def save_indexcard_from_tripledict(
for _iri in rdf_tripledict[focus_iri].get(RDF.type, ())
]
_indexcard = Indexcard.objects.filter(
- source_record_suid=from_raw_datum.suid,
+ source_record_suid=suid,
focus_identifier_set__in=_focus_identifier_set,
).first()
if _indexcard is None:
- _indexcard = Indexcard.objects.create(source_record_suid=from_raw_datum.suid)
- if undelete and _indexcard.deleted:
+ _indexcard = Indexcard.objects.create(source_record_suid=suid)
+ if restore_deleted and _indexcard.deleted:
_indexcard.deleted = None
_indexcard.save()
_indexcard.focus_identifier_set.set(_focus_identifier_set)
_indexcard.focustype_identifier_set.set(_focustype_identifier_set)
- _indexcard.update_rdf(
- from_raw_datum=from_raw_datum,
- rdf_tripledict=rdf_tripledict,
- focus_iri=focus_iri,
- )
+ _indexcard.update_resource_description(focus_iri, rdf_tripledict, expiration_date=expiration_date)
return _indexcard
@transaction.atomic
def supplement_indexcards(
self, *,
- from_raw_datum: share_db.RawDatum,
+ supplementary_suid: share_db.SourceUniqueIdentifier,
rdf_tripledict: rdf.RdfTripleDictionary,
focus_iri: str,
+ expiration_date: datetime.date | None = None,
) -> list[Indexcard]:
- assert from_raw_datum.suid.is_supplementary
+ assert supplementary_suid.is_supplementary
# supplement indexcards with the same focus from the same source_config
# (if none exist, fine, nothing gets supplemented)
_indexcards = list(Indexcard.objects.filter(
- source_record_suid__source_config_id=from_raw_datum.suid.source_config_id,
+ source_record_suid__source_config_id=supplementary_suid.source_config_id,
focus_identifier_set__in=ResourceIdentifier.objects.queryset_for_iri(focus_iri),
))
for _indexcard in _indexcards:
- _indexcard.update_supplementary_rdf(
- from_raw_datum=from_raw_datum,
+ _indexcard.update_supplementary_description(
+ supplementary_suid=supplementary_suid,
rdf_tripledict=rdf_tripledict,
focus_iri=focus_iri,
+ expiration_date=expiration_date,
)
return _indexcards
@@ -181,26 +187,26 @@ class Meta:
]
@property
- def latest_rdf(self) -> LatestIndexcardRdf:
- '''convenience for the "other side" of LatestIndexcardRdf.indexcard
+ def latest_resource_description(self) -> LatestResourceDescription:
+ '''convenience for the "other side" of LatestResourceDescription.indexcard
'''
- return self.trove_latestindexcardrdf_set.get() # may raise DoesNotExist
+ return self.trove_latestresourcedescription_set.get() # may raise DoesNotExist
@property
- def archived_rdf_set(self):
- '''convenience for the "other side" of ArchivedIndexcardRdf.indexcard
+ def archived_description_set(self):
+ '''convenience for the "other side" of ArchivedResourceDescription.indexcard
returns a RelatedManager
'''
- return self.trove_archivedindexcardrdf_set
+ return self.trove_archivedresourcedescription_set
@property
- def supplementary_rdf_set(self):
- '''convenience for the "other side" of SupplementaryIndexcardRdf.indexcard
+ def supplementary_description_set(self):
+ '''convenience for the "other side" of SupplementaryResourceDescription.indexcard
returns a RelatedManager
'''
- return self.trove_supplementaryindexcardrdf_set
+ return self.trove_supplementaryresourcedescription_set
def get_iri(self):
return trove_indexcard_iri(self.uuid)
@@ -210,8 +216,8 @@ def pls_delete(self, *, notify_indexes=True):
if self.deleted is None:
self.deleted = timezone.now()
self.save()
- ( # actually delete LatestIndexcardRdf:
- LatestIndexcardRdf.objects
+ ( # actually delete LatestResourceDescription:
+ LatestResourceDescription.objects
.filter(indexcard=self)
.delete()
)
@@ -232,205 +238,63 @@ def __str__(self):
return repr(self)
@transaction.atomic
- def update_rdf(
+ def update_resource_description(
self,
- from_raw_datum: share_db.RawDatum,
focus_iri: str,
rdf_tripledict: rdf.RdfTripleDictionary,
- ) -> 'IndexcardRdf':
+ expiration_date: datetime.date | None = None,
+ ) -> ResourceDescription:
if focus_iri not in rdf_tripledict:
raise DigestiveError(f'expected {focus_iri} in {set(rdf_tripledict.keys())}')
_rdf_as_turtle, _turtle_checksum_iri = _turtlify(rdf_tripledict)
- _archived, _archived_created = ArchivedIndexcardRdf.objects.get_or_create(
+ _archived, _archived_created = ArchivedResourceDescription.objects.get_or_create(
indexcard=self,
- from_raw_datum=from_raw_datum,
turtle_checksum_iri=_turtle_checksum_iri,
defaults={
'rdf_as_turtle': _rdf_as_turtle,
'focus_iri': focus_iri,
- 'expiration_date': from_raw_datum.expiration_date,
+ 'expiration_date': expiration_date,
},
)
if (not _archived_created) and (_archived.rdf_as_turtle != _rdf_as_turtle):
raise DigestiveError(f'hash collision? {_archived}\n===\n{_rdf_as_turtle}')
- if not self.deleted and from_raw_datum.is_latest():
- _latest_indexcard_rdf, _created = LatestIndexcardRdf.objects.update_or_create(
+ if not self.deleted:
+ _latest_resource_description, _created = LatestResourceDescription.objects.update_or_create(
indexcard=self,
defaults={
- 'from_raw_datum': from_raw_datum,
'turtle_checksum_iri': _turtle_checksum_iri,
'rdf_as_turtle': _rdf_as_turtle,
'focus_iri': focus_iri,
- 'expiration_date': from_raw_datum.expiration_date,
+ 'expiration_date': expiration_date,
},
)
- return _latest_indexcard_rdf
+ return _latest_resource_description
return _archived
- def update_supplementary_rdf(
+ def update_supplementary_description(
self,
- from_raw_datum: share_db.RawDatum,
+ supplementary_suid: share_db.SourceUniqueIdentifier,
focus_iri: str,
rdf_tripledict: rdf.RdfTripleDictionary,
- ) -> SupplementaryIndexcardRdf:
+ expiration_date: datetime.date | None = None,
+ ) -> SupplementaryResourceDescription:
+ assert supplementary_suid.is_supplementary
if focus_iri not in rdf_tripledict:
raise DigestiveError(f'expected {focus_iri} in {set(rdf_tripledict.keys())}')
_rdf_as_turtle, _turtle_checksum_iri = _turtlify(rdf_tripledict)
- _supplement_rdf, _ = SupplementaryIndexcardRdf.objects.update_or_create(
+ _supplement_rdf, _ = SupplementaryResourceDescription.objects.update_or_create(
indexcard=self,
- supplementary_suid=from_raw_datum.suid,
+ supplementary_suid=supplementary_suid,
defaults={
- 'from_raw_datum': from_raw_datum,
'turtle_checksum_iri': _turtle_checksum_iri,
'rdf_as_turtle': _rdf_as_turtle,
'focus_iri': focus_iri,
- 'expiration_date': from_raw_datum.expiration_date,
+ 'expiration_date': expiration_date,
},
)
return _supplement_rdf
-class IndexcardRdf(models.Model):
- # auto:
- created = models.DateTimeField(auto_now_add=True)
- modified = models.DateTimeField(auto_now=True)
-
- # required:
- from_raw_datum = models.ForeignKey(
- share_db.RawDatum,
- on_delete=models.DO_NOTHING, # allows faster bulk-deletion of unused RawDatum (but errors deleting used RawDatum)
- related_name='+',
- )
- indexcard = models.ForeignKey(
- Indexcard,
- on_delete=models.CASCADE,
- related_name='%(app_label)s_%(class)s_set',
- )
- turtle_checksum_iri = models.TextField(db_index=True)
- focus_iri = models.TextField() # exact iri used in rdf_as_turtle
- rdf_as_turtle = models.TextField() # TODO: store elsewhere by checksum
-
- # optional:
- expiration_date = models.DateField(
- null=True,
- blank=True,
- help_text='An (optional) date when this description will no longer be valid.',
- )
-
- def as_rdf_tripledict(self) -> rdf.RdfTripleDictionary:
- return rdf.tripledict_from_turtle(self.rdf_as_turtle)
-
- def as_quoted_graph(self) -> rdf.QuotedGraph:
- return rdf.QuotedGraph(
- self.as_rdf_tripledict(),
- focus_iri=self.focus_iri,
- )
-
- def as_rdfdoc_with_supplements(self) -> rdf.RdfGraph:
- '''build an rdf graph composed of this rdf and all current card supplements'''
- _rdfdoc = rdf.RdfGraph(self.as_rdf_tripledict())
- for _supplementary_rdf in self.indexcard.supplementary_rdf_set.all():
- _rdfdoc.add_tripledict(_supplementary_rdf.as_rdf_tripledict())
- return _rdfdoc
-
- class Meta:
- abstract = True
-
- def __repr__(self):
- return f'<{self.__class__.__qualname__}({self.id}, "{self.focus_iri}")'
-
- def __str__(self):
- return repr(self)
-
-
-class LatestIndexcardRdf(IndexcardRdf):
- # just the most recent version of this indexcard
- class Meta:
- constraints = [
- models.UniqueConstraint(
- fields=('indexcard',),
- name='%(app_label)s_%(class)s_uniq_indexcard',
- ),
- ]
- indexes = [
- models.Index(fields=('modified',)), # for OAI-PMH selective harvest
- models.Index(fields=['expiration_date']), # for expiring
- ]
-
-
-class ArchivedIndexcardRdf(IndexcardRdf):
- # all versions of an indexcard over time (including the latest)
- class Meta:
- constraints = [
- models.UniqueConstraint(
- fields=('indexcard', 'from_raw_datum', 'turtle_checksum_iri'),
- name='%(app_label)s_%(class)s_uniq_archived_version',
- ),
- ]
-
-
-class SupplementaryIndexcardRdf(IndexcardRdf):
- # supplementary (non-descriptive) metadata from the same source (just the most recent)
- supplementary_suid = models.ForeignKey(
- share_db.SourceUniqueIdentifier,
- on_delete=models.CASCADE,
- related_name='supplementary_rdf_set',
- )
-
- class Meta:
- constraints = [
- models.UniqueConstraint(
- fields=('indexcard', 'supplementary_suid'),
- name='%(app_label)s_%(class)s_uniq_supplement',
- ),
- ]
- indexes = [
- models.Index(fields=['expiration_date']), # for expiring
- ]
-
-
-class DerivedIndexcard(models.Model):
- # auto:
- created = models.DateTimeField(auto_now_add=True)
- modified = models.DateTimeField(auto_now=True)
-
- # required:
- upriver_indexcard = models.ForeignKey(
- Indexcard,
- on_delete=models.CASCADE,
- related_name='derived_indexcard_set',
- )
- deriver_identifier = models.ForeignKey(ResourceIdentifier, on_delete=models.PROTECT, related_name='+')
- derived_checksum_iri = models.TextField()
- derived_text = models.TextField() # TODO: store elsewhere by checksum
-
- class Meta:
- constraints = [
- models.UniqueConstraint(
- fields=('upriver_indexcard', 'deriver_identifier'),
- name='%(app_label)s_%(class)s_upriverindexcard_deriveridentifier',
- ),
- ]
-
- def __repr__(self):
- return f'<{self.__class__.__qualname__}({self.id}, {self.upriver_indexcard.uuid}, "{self.deriver_identifier.sufficiently_unique_iri}")'
-
- def __str__(self):
- return repr(self)
-
- @property
- def deriver_cls(self):
- from trove.derive import get_deriver_classes
- (_deriver_cls,) = get_deriver_classes(self.deriver_identifier.raw_iri_list)
- return _deriver_cls
-
- def as_rdf_literal(self) -> rdf.Literal:
- return rdf.literal(
- self.derived_text,
- datatype_iris=self.deriver_cls.derived_datatype_iris(),
- )
-
-
###
# local helpers
@@ -438,6 +302,6 @@ def _turtlify(rdf_tripledict: rdf.RdfTripleDictionary) -> tuple[str, str]:
'''return turtle serialization and checksum iri of that serialization'''
_rdf_as_turtle = rdf.turtle_from_tripledict(rdf_tripledict)
_turtle_checksum_iri = str(
- ChecksumIri.digest('sha-256', salt='', raw_data=_rdf_as_turtle),
+ ChecksumIri.digest('sha-256', salt='', data=_rdf_as_turtle),
)
return (_rdf_as_turtle, _turtle_checksum_iri)
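
In short, `update_resource_description` always records the serialization in the archive (one row per indexcard + turtle checksum, hash collisions rejected) and, unless the card is deleted, overwrites the card's single latest description in place. A rough sketch of that bookkeeping (the indexcard and tripledict are placeholders):

_description = indexcard.update_resource_description(
    focus_iri='https://example.org/record/1',
    rdf_tripledict=some_tripledict,   # placeholder rdf.RdfTripleDictionary containing the focus iri
    expiration_date=None,             # "never"
)
# one latest description per card, replaced on every update...
assert indexcard.latest_resource_description == _description
# ...while each distinct serialization stays archived by checksum
assert indexcard.archived_description_set.filter(
    turtle_checksum_iri=_description.turtle_checksum_iri,
).exists()
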
diff --git a/trove/models/resource_description.py b/trove/models/resource_description.py
new file mode 100644
index 000000000..d5b43ffc1
--- /dev/null
+++ b/trove/models/resource_description.py
@@ -0,0 +1,118 @@
+from __future__ import annotations
+import datetime
+
+from django.db import models
+from primitive_metadata import primitive_rdf as rdf
+
+__all__ = (
+ 'ArchivedResourceDescription',
+ 'ResourceDescription',
+ 'LatestResourceDescription',
+ 'SupplementaryResourceDescription',
+)
+
+
+class ResourceDescription(models.Model):
+ # auto:
+ created = models.DateTimeField(auto_now_add=True)
+ modified = models.DateTimeField(auto_now=True)
+
+ # required:
+ indexcard = models.ForeignKey(
+ 'trove.Indexcard',
+ on_delete=models.CASCADE,
+ related_name='%(app_label)s_%(class)s_set',
+ )
+ turtle_checksum_iri = models.TextField(db_index=True)
+ focus_iri = models.TextField() # exact iri used in rdf_as_turtle
+ rdf_as_turtle = models.TextField() # TODO: store elsewhere by checksum
+
+ # optional:
+ expiration_date = models.DateField(
+ null=True,
+ blank=True,
+ help_text='An (optional) date when this description will no longer be valid.',
+ )
+
+ class Meta:
+ abstract = True
+
+ @property
+ def is_expired(self) -> bool:
+ return (
+ self.expiration_date is not None
+ and self.expiration_date <= datetime.date.today()
+ )
+
+ def as_rdf_tripledict(self) -> rdf.RdfTripleDictionary:
+ return rdf.tripledict_from_turtle(self.rdf_as_turtle)
+
+ def as_quoted_graph(self) -> rdf.QuotedGraph:
+ return rdf.QuotedGraph(
+ self.as_rdf_tripledict(),
+ focus_iri=self.focus_iri,
+ )
+
+ def as_rdfdoc_with_supplements(self) -> rdf.RdfGraph:
+ '''build an rdf graph composed of this rdf and all current card supplements'''
+ _rdfdoc = rdf.RdfGraph(self.as_rdf_tripledict())
+ for _supplement in self.indexcard.supplementary_description_set.all():
+ _rdfdoc.add_tripledict(_supplement.as_rdf_tripledict())
+ return _rdfdoc
+
+ def __repr__(self):
+ return f'<{self.__class__.__qualname__}({self.id}, "{self.focus_iri}")'
+
+ def __str__(self):
+ return repr(self)
+
+
+class LatestResourceDescription(ResourceDescription):
+ # just the most recent version of this indexcard
+ class Meta:
+ constraints = [
+ models.UniqueConstraint(
+ fields=('indexcard',),
+ name='trove_latestindexcardrdf_uniq_indexcard',
+ # TODO when on django 5.2:
+ # name='%(app_label)s_%(class)s_uniq_indexcard',
+ # ...and add migration with `AlterConstraint` to rename
+ ),
+ ]
+ indexes = [
+ models.Index(
+ fields=('modified',), # for OAI-PMH selective harvest
+ name='trove_lates_modifie_c6b0b1_idx',
+ # TODO when on django 5.2:
+ # remove explicit name, add migration with `RenameIndex` to match
+ ),
+ models.Index(fields=['expiration_date']), # for expiring
+ ]
+
+
+class ArchivedResourceDescription(ResourceDescription):
+ # all versions of an indexcard over time (including the latest)
+ pass
+
+
+class SupplementaryResourceDescription(ResourceDescription):
+ # supplementary (non-descriptive) metadata from the same source (just the most recent)
+ supplementary_suid = models.ForeignKey(
+ 'share.SourceUniqueIdentifier',
+ on_delete=models.CASCADE,
+ related_name='supplementary_description_set',
+ )
+
+ class Meta:
+ constraints = [
+ models.UniqueConstraint(
+ fields=('indexcard', 'supplementary_suid'),
+ name='trove_supplementaryindexcardrdf_uniq_supplement',
+ # TODO when on django 5.2:
+ # name='%(app_label)s_%(class)s_uniq_supplement',
+ # ...and add migration with `AlterConstraint` to rename
+ ),
+ ]
+ indexes = [
+ models.Index(fields=['expiration_date']), # for expiring
+ ]
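
A brief sketch of reading a card's current description together with its supplements, using the helpers defined above (the indexcard is a placeholder):

from trove.models import LatestResourceDescription

try:
    _latest = indexcard.latest_resource_description
except LatestResourceDescription.DoesNotExist:
    _rdfdoc = None   # e.g. a deleted card has no latest description
else:
    _rdfdoc = (
        _latest.as_rdfdoc_with_supplements()   # latest rdf plus all current supplements
        if not _latest.is_expired
        else None
    )
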
diff --git a/trove/trovebrowse_gathering.py b/trove/trovebrowse_gathering.py
index 76903d158..3da36167a 100644
--- a/trove/trovebrowse_gathering.py
+++ b/trove/trovebrowse_gathering.py
@@ -36,8 +36,8 @@ def gather_cards_focused_on(focus, *, blend_cards: bool):
_identifier_qs = trove_db.ResourceIdentifier.objects.queryset_for_iris(focus.iris)
_indexcard_qs = trove_db.Indexcard.objects.filter(focus_identifier_set__in=_identifier_qs)
if blend_cards:
- for _latest_rdf in trove_db.LatestIndexcardRdf.objects.filter(indexcard__in=_indexcard_qs):
- yield from rdf.iter_tripleset(_latest_rdf.as_rdf_tripledict())
+ for _latest_resource_description in trove_db.LatestResourceDescription.objects.filter(indexcard__in=_indexcard_qs):
+ yield from rdf.iter_tripleset(_latest_resource_description.as_rdf_tripledict())
else:
for _indexcard in _indexcard_qs:
_card_iri = _indexcard.get_iri()
diff --git a/trove/trovesearch/trovesearch_gathering.py b/trove/trovesearch/trovesearch_gathering.py
index 0d2fcb719..4f548774d 100644
--- a/trove/trovesearch/trovesearch_gathering.py
+++ b/trove/trovesearch/trovesearch_gathering.py
@@ -370,8 +370,8 @@ def _load_cards_and_contents(*, card_iris=None, value_iris=None, deriver_iri) ->
def _load_cards_and_extracted_rdf_contents(card_iris=None, value_iris=None) -> dict[str, IndexcardFocus]:
_card_namespace = trove_indexcard_namespace()
- _indexcard_rdf_qs = (
- trove_db.LatestIndexcardRdf.objects
+ _resource_description_qs = (
+ trove_db.LatestResourceDescription.objects
.select_related('indexcard')
.prefetch_related('indexcard__focus_identifier_set')
)
@@ -380,19 +380,19 @@ def _load_cards_and_extracted_rdf_contents(card_iris=None, value_iris=None) -> d
iri_minus_namespace(_card_iri, namespace=_card_namespace)
for _card_iri in card_iris
}
- _indexcard_rdf_qs = _indexcard_rdf_qs.filter(indexcard__uuid__in=_indexcard_uuids)
+ _resource_description_qs = _resource_description_qs.filter(indexcard__uuid__in=_indexcard_uuids)
if value_iris is not None:
- _indexcard_rdf_qs = _indexcard_rdf_qs.filter(
+ _resource_description_qs = _resource_description_qs.filter(
indexcard__focus_identifier_set__in=(
trove_db.ResourceIdentifier.objects
.queryset_for_iris(value_iris)
),
)
_card_foci: dict[str, IndexcardFocus] = {}
- for _indexcard_rdf in _indexcard_rdf_qs:
- _card = _indexcard_rdf.indexcard
+ for _resource_description in _resource_description_qs:
+ _card = _resource_description.indexcard
_card_iri = _card.get_iri()
- _quoted_graph = _indexcard_rdf.as_quoted_graph()
+ _quoted_graph = _resource_description.as_quoted_graph()
_quoted_graph.add(
(_quoted_graph.focus_iri, FOAF.isPrimaryTopicOf, _card_iri),
)
diff --git a/trove/util/iris.py b/trove/util/iris.py
index 35d9123f4..736758a64 100644
--- a/trove/util/iris.py
+++ b/trove/util/iris.py
@@ -90,6 +90,8 @@ def get_sufficiently_unique_iri_and_scheme(iri: str) -> tuple[str, str]:
if _scheme_match:
_scheme = _scheme_match.group().lower()
_remainder = iri[_scheme_match.end():]
+ if not _remainder.startswith(COLON):
+ raise trove_exceptions.IriInvalid(f'does not look like an iri (got "{iri}")')
if not _remainder.startswith(COLON_SLASH_SLASH):
# for an iri without '://', assume nothing!
return (iri, _scheme)
@@ -179,3 +181,30 @@ def unquote_iri(iri: str) -> str:
break
_unquoted_iri = _next_unquoted_iri
return _unquoted_iri
+
+
+def smells_like_iri(maybe_iri: str) -> bool:
+ '''check a string starts like an IRI (does not fully validate)
+
+ >>> smells_like_iri('https://blarg.example/hello')
+ True
+ >>> smells_like_iri('foo:bar') # URN
+ True
+
+ >>> smells_like_iri('://blarg.example/hello')
+ False
+ >>> smells_like_iri('foo/bar')
+ False
+ >>> smells_like_iri('foo')
+ False
+ >>> smells_like_iri(7)
+ False
+ '''
+ try:
+ return (
+ isinstance(maybe_iri, str)
+ # nonempty suffuniq-iri and scheme
+ and all(get_sufficiently_unique_iri_and_scheme(maybe_iri))
+ )
+ except trove_exceptions.IriInvalid:
+ return False
diff --git a/trove/util/queryparams.py b/trove/util/queryparams.py
index 0a9bb5d75..bdf667f56 100644
--- a/trove/util/queryparams.py
+++ b/trove/util/queryparams.py
@@ -127,15 +127,23 @@ def get_bool_value(
if_absent: bool = False, # by default, param absence is falsy
if_empty: bool = True, # by default, presence (with empty value) is truthy
) -> bool:
- _value = get_single_value(queryparams, queryparam_name)
- if _value is None:
- return if_absent
- if _value == '':
- return if_empty
- return parse_booly_str(_value)
+ return parse_booly_str(
+ get_single_value(queryparams, queryparam_name),
+ if_absent=if_absent,
+ if_empty=if_empty,
+ )
-def parse_booly_str(value: str):
+def parse_booly_str(
+ value: str | None,
+ *,
+ if_absent: bool = False, # by default, param absence is falsy
+ if_empty: bool = True, # by default, presence (with empty value) is truthy
+) -> bool:
+ if value is None:
+ return if_absent
+ if value == '':
+ return if_empty
_lowered = value.lower()
if _lowered in TRUTHY_VALUES:
return True
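
Hoisting the absence/empty handling into `parse_booly_str` lets callers pass `request.GET.get(...)` straight through (as the reworked ingest view below does). Expected values given the defaults above (the last example assumes 'true' is among `TRUTHY_VALUES`):

>>> parse_booly_str(None)    # absent param
False
>>> parse_booly_str('')      # present, but empty
True
>>> parse_booly_str('true')
True
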
diff --git a/trove/util/trove_params.py b/trove/util/trove_params.py
index 920960679..c693de112 100644
--- a/trove/util/trove_params.py
+++ b/trove/util/trove_params.py
@@ -100,26 +100,19 @@ def _gather_attrpaths(cls, queryparams: _qp.QueryparamDict, shorthand: rdf.IriSh
if _fields_params:
_requested: dict[str, list[Propertypath]] = defaultdict(list)
for _param_name, _param_value in _fields_params:
- try:
- (_typenames,) = filter(bool, _param_name.bracketed_names)
- except (IndexError, ValueError):
- raise trove_exceptions.InvalidQueryParamName(
- f'expected "fields[TYPE]" (with exactly one non-empty bracketed segment)'
- f' (got "{_param_name}")'
- )
- else:
- for _type in _qp.split_queryparam_value(_typenames):
- _type_key = (
- GLOB_PATHSTEP
- if _type == GLOB_PATHSTEP
- else shorthand.expand_iri(_type)
- )
- _requested[_type_key].extend(
- (
- parse_propertypath(_path_value, shorthand)
- for _path_value in _qp.split_queryparam_value(_param_value)
- )
+ if _param_name.bracketed_names: # e.g. "fields[TYPE1,TYPE2,TYPE3]=..."
+ _typenames = _qp.split_queryparam_value(_param_name.bracketed_names[0])
+ else: # omitted brackets equivalent to "fields[*]" (apply to any type)
+ _typenames = [GLOB_PATHSTEP]
+ for _typename in _typenames:
+ if _typename != GLOB_PATHSTEP:
+ _typename = shorthand.expand_iri(_typename)
+ _requested[_typename].extend(
+ ( # list of field paths in query param value
+ parse_propertypath(_path_value, shorthand)
+ for _path_value in _qp.split_queryparam_value(_param_value)
)
+ )
_attrpaths = _attrpaths.with_new(freeze(_requested))
return _attrpaths
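
Illustrative query strings accepted by the reworked `fields[...]` parsing (the type names and field paths are placeholders for whatever the request's IRI shorthand defines):

    fields[Project]=title,description    -- field paths for one type
    fields[Project,Preprint]=title       -- several types sharing one list of paths
    fields=title                         -- brackets omitted: same as fields[*], applies to any type
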
diff --git a/trove/views/ingest.py b/trove/views/ingest.py
index 6d401c806..73d6cb021 100644
--- a/trove/views/ingest.py
+++ b/trove/views/ingest.py
@@ -5,9 +5,10 @@
from django import http
from django.views import View
-from share import exceptions
from share.models.feature_flag import FeatureFlag
from trove import digestive_tract
+from trove import exceptions as trove_exceptions
+from trove.util.queryparams import parse_booly_str
logger = logging.getLogger(__name__)
@@ -30,8 +31,6 @@ def post(self, request):
if not _focus_iri:
return http.HttpResponse('focus_iri queryparam required', status=HTTPStatus.BAD_REQUEST)
_record_identifier = request.GET.get('record_identifier')
- if not _record_identifier:
- return http.HttpResponse('record_identifier queryparam required', status=HTTPStatus.BAD_REQUEST)
_expiration_date_str = request.GET.get('expiration_date')
if _expiration_date_str is None:
_expiration_date = None
@@ -40,22 +39,24 @@ def post(self, request):
_expiration_date = datetime.date.fromisoformat(_expiration_date_str)
except ValueError:
return http.HttpResponse('expiration_date queryparam must be in ISO-8601 date format (YYYY-MM-DD)', status=HTTPStatus.BAD_REQUEST)
+ _nonurgent = parse_booly_str(request.GET.get('nonurgent'))
try:
- digestive_tract.swallow(
+ digestive_tract.ingest(
+ raw_record=request.body.decode(encoding='utf-8'),
+ record_mediatype=request.content_type,
from_user=request.user,
- record=request.body.decode(encoding='utf-8'),
record_identifier=_record_identifier,
- record_mediatype=request.content_type,
focus_iri=_focus_iri,
- urgent=(request.GET.get('nonurgent') is None),
is_supplementary=(request.GET.get('is_supplementary') is not None),
+ urgent=(not _nonurgent),
expiration_date=_expiration_date,
+ restore_deleted=True,
)
- except exceptions.IngestError as e:
+ except trove_exceptions.DigestiveError as e:
logger.exception(str(e))
return http.HttpResponse(str(e), status=HTTPStatus.BAD_REQUEST)
else:
- # TODO: include link to view status (return task id from `swallow`?)
+ # TODO: include (link to?) extracted card(s)
return http.HttpResponse(status=HTTPStatus.CREATED)
def delete(self, request):
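
A hedged sketch of pushing a record through the reworked view (the URL, auth, and record body are placeholders; only the queryparams and status codes follow from the view code above):

import requests

_resp = requests.post(
    'https://shtrove.example/trove/ingest',   # placeholder URL for wherever IngestView is routed
    params={
        'focus_iri': 'https://example.org/record/1',
        # 'record_identifier' may now be omitted -- it defaults to focus_iri
        'nonurgent': '',                      # mere presence is truthy, per parse_booly_str
        'expiration_date': '2026-01-01',      # optional ISO-8601 date
    },
    data=turtle_text.encode('utf-8'),         # placeholder record serialization
    headers={'Content-Type': 'text/turtle'},  # becomes record_mediatype
    # ...plus whatever authentication the deployment expects (not shown in this diff)
)
# extraction happens before the response: a DigestiveError returns 400 with the error text,
# success returns 201 CREATED
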
diff --git a/trove/vocab/osfmap.py b/trove/vocab/osfmap.py
index 731834ade..d67e545e8 100644
--- a/trove/vocab/osfmap.py
+++ b/trove/vocab/osfmap.py
@@ -356,13 +356,13 @@
literal('isSupplementedBy', language='en'),
},
},
- OSFMAP.verifiedLinks: {
+ OSFMAP.verifiedLink: {
RDF.type: {RDF.Property},
RDFS.label: {
literal('Verified Links', language='en'),
},
JSONAPI_MEMBERNAME: {
- literal('verifiedLinks', language='en'),
+ literal('verifiedLink', language='en'),
},
},
OSFMAP.archivedAt: {
@@ -572,10 +572,10 @@
literal('accessService', language='en'),
},
},
- DCAT.accessUrl: {
+ DCAT.accessURL: {
RDF.type: {RDF.Property},
JSONAPI_MEMBERNAME: {
- literal('accessUrl', language='en'),
+ literal('accessURL', language='en'),
},
},
OSFMAP.hostingInstitution: {