diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index 7c4b7d429..24ec48af4 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -74,7 +74,7 @@ jobs: - name: run tests run: | - coverage run -m pytest --create-db + coverage run -m pytest --create-db -x coverage xml -o _shtrove_coverage.xml env: DATABASE_PASSWORD: postgres diff --git a/CHANGELOG.md b/CHANGELOG.md index 124135277..117a8cdcb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,16 @@ # Change Log +# [25.4.0] - 2025-06-24 +- delete `RawDatum` model + - `trove.digestive_tract.extract` now must succeed before `/trove/ingest` responds +- rename `IndexcardRdf` (and kids) to `ResourceDescription` +- move most django models to their own files +- stop storing `CeleryTaskResult`s forever + - new environment variables: `CELERY_RESULT_EXPIRES`, `FAILED_CELERY_RESULT_EXPIRES` +- fix: `/api/v2/` error generating rss/atom feed links +- fix: pagination at `/api/v2/sourceconfigs` +- fix: correct osfmap IRIs (`dcat:accessURL`, `osf:verifiedLink`) + # [25.3.3] - 2025-06-17 - smaller `osfmap_json` derived representation (thx bodintsov) - prepare for next release dropping `RawDatum` model/table: diff --git a/api/base/views.py b/api/base/views.py index 6471c8d7d..48755fd56 100644 --- a/api/base/views.py +++ b/api/base/views.py @@ -45,12 +45,11 @@ def initial(self, request, *args, **kwargs): class RootView(views.APIView): def get(self, request): links = { - 'rawdata': 'api:rawdatum-list', 'sources': 'api:source-list', 'users': 'api:user-list', 'status': 'api:status', - 'rss': 'api:rss', - 'atom': 'api:atom', + 'rss': 'api:feeds.rss', + 'atom': 'api:feeds.atom', } ret = {k: request.build_absolute_uri(reverse(v)) for k, v in links.items()} return Response(ret) diff --git a/api/rawdata/serializers.py b/api/rawdata/serializers.py deleted file mode 100644 index b76a3d58c..000000000 --- a/api/rawdata/serializers.py +++ /dev/null @@ -1,10 +0,0 @@ -from share import models - -from api.base import ShareSerializer - - -class RawDatumSerializer(ShareSerializer): - - class Meta: - model = models.RawDatum - fields = ('id', 'suid', 'datum', 'sha256', 'date_modified', 'date_created') diff --git a/api/rawdata/urls.py b/api/rawdata/urls.py deleted file mode 100644 index fe491c80d..000000000 --- a/api/rawdata/urls.py +++ /dev/null @@ -1,7 +0,0 @@ -from rest_framework.routers import SimpleRouter -from api.rawdata import views - - -router = SimpleRouter() -router.register(r'rawdata', views.RawDataViewSet, basename='rawdatum') -urlpatterns = router.urls diff --git a/api/rawdata/views.py b/api/rawdata/views.py deleted file mode 100644 index 8293402c1..000000000 --- a/api/rawdata/views.py +++ /dev/null @@ -1,31 +0,0 @@ -from rest_framework import viewsets - -from share import models - -from api.base.views import ShareViewSet -from api.pagination import CursorPagination -from api.rawdata.serializers import RawDatumSerializer - - -class RawDataViewSet(ShareViewSet, viewsets.ReadOnlyModelViewSet): - """ - Raw data, exactly as harvested from the data source.
- - ## Query by object - To get all the raw data corresponding to a Share object, use the query - parameters `object_id=<@id>` and `object_type=<@type>` - """ - - ordering = ('-id', ) - pagination_class = CursorPagination - serializer_class = RawDatumSerializer - - def get_queryset(self): - object_id = self.request.query_params.get('object_id', None) - object_type = self.request.query_params.get('object_type', None) - if object_id and object_type: - return models.RawDatum.objects.filter( - normalizeddata__changeset__changes__target_id=object_id, - normalizeddata__changeset__changes__target_type__model=object_type - ).distinct('id').select_related('suid') - return models.RawDatum.objects.all().select_related('suid') diff --git a/api/sourceconfigs/views.py b/api/sourceconfigs/views.py index 2c8647ceb..62e471c16 100644 --- a/api/sourceconfigs/views.py +++ b/api/sourceconfigs/views.py @@ -2,12 +2,14 @@ from api.sourceconfigs.serializers import SourceConfigSerializer from api.base import ShareViewSet +from api.pagination import CursorPagination from share.models import SourceConfig class SourceConfigViewSet(ShareViewSet, viewsets.ReadOnlyModelViewSet): serializer_class = SourceConfigSerializer + pagination_class = CursorPagination ordering = ('id', ) diff --git a/api/urls.py b/api/urls.py index ef02ffa73..a672c0c60 100644 --- a/api/urls.py +++ b/api/urls.py @@ -9,7 +9,6 @@ urlpatterns = [ url('^$', RootView.as_view()), url('^', include('api.banners.urls')), - url('^', include('api.rawdata.urls')), url('^', include('api.sourceconfigs.urls')), url('^', include('api.sources.urls')), url('^', include('api.suids.urls')), diff --git a/project/settings.py b/project/settings.py index 8979e67c1..19d5b02c1 100644 --- a/project/settings.py +++ b/project/settings.py @@ -341,8 +341,17 @@ def split(string, delim): }, } -CELERY_RESULT_EXPIRES = 60 * 60 * 24 * 3 # 4 days CELERY_RESULT_BACKEND = 'share.celery:CeleryDatabaseBackend' +CELERY_RESULT_EXPIRES = int(os.environ.get( + 'CELERY_RESULT_EXPIRES', + 60 * 60 * 24 * 3, # 3 days +)) +# only successful tasks get the default expiration (above) +# -- failed tasks kept longer (see `share.celery`) +FAILED_CELERY_RESULT_EXPIRES = int(os.environ.get( + 'FAILED_CELERY_RESULT_EXPIRES', + 60 * 60 * 24 * 11, # 11 days +)) # Don't reject tasks that were present on a worker when it was killed CELERY_TASK_REJECT_ON_WORKER_LOST = False @@ -358,7 +367,7 @@ def split(string, delim): CELERY_TASK_DEFAULT_ROUTING_KEY = 'share_default' URGENT_TASK_QUEUES = { - 'trove.digestive_tract.task__extract_and_derive': 'digestive_tract.urgent', + 'trove.digestive_tract.task__derive': 'digestive_tract.urgent', } @@ -440,6 +449,10 @@ def route_urgent_task(name, args, kwargs, options, task=None, **kw): SHARE_WEB_URL = os.environ.get('SHARE_WEB_URL', 'http://localhost:8003').rstrip('/') + '/' SHARE_USER_AGENT = os.environ.get('SHARE_USER_AGENT', 'SHAREbot/{} (+{})'.format(VERSION, SHARE_WEB_URL)) +SHARE_ADMIN_USERNAME = os.environ.get('SHARE_ADMIN_USERNAME', 'admin') +SHARE_ADMIN_PASSWORD = os.environ.get('SHARE_ADMIN_PASSWORD') +if DEBUG and (SHARE_ADMIN_PASSWORD is None): + SHARE_ADMIN_PASSWORD = 'password' # Skip some of the more intensive operations on works that surpass these limits SHARE_LIMITS = { diff --git a/share/admin/__init__.py b/share/admin/__init__.py index 7d1328756..6c5fa04c8 100644 --- a/share/admin/__init__.py +++ b/share/admin/__init__.py @@ -1,10 +1,6 @@ from django.apps import apps -from django.urls import re_path as url from django.contrib import admin -from django.http
import HttpResponseRedirect -from django.template.response import TemplateResponse -from django.urls import path, reverse -from django.utils.html import format_html +from django.urls import path from oauth2_provider.models import AccessToken @@ -15,7 +11,6 @@ CeleryTaskResult, FeatureFlag, IndexBackfill, - RawDatum, ShareUser, SiteBanner, Source, @@ -51,26 +46,6 @@ class ShareUserAdmin(admin.ModelAdmin): search_fields = ['username'] -@linked_fk('suid') -class RawDatumAdmin(admin.ModelAdmin): - show_full_result_count = False - list_select_related = ('suid__source_config', ) - list_display = ('id', 'identifier', 'source_config_label', 'datestamp', 'date_created', 'date_modified', ) - readonly_fields = ('datum__pre', 'sha256') - exclude = ('datum',) - paginator = TimeLimitedPaginator - - def identifier(self, obj): - return obj.suid.identifier - - def source_config_label(self, obj): - return obj.suid.source_config.label - - def datum__pre(self, instance): - return format_html('<pre>{}</pre>', instance.datum) - datum__pre.short_description = 'datum' # type: ignore[attr-defined] - - class AccessTokenAdmin(admin.ModelAdmin): raw_id_fields = ('user',) list_display = ('token', 'user', 'scope') @@ -91,11 +66,10 @@ def save_model(self, request, obj, form, change): @linked_fk('source') class SourceConfigAdmin(admin.ModelAdmin): - list_display = ('label', 'source_', 'version', 'enabled', 'button_actions') + list_display = ('label', 'source_', 'version', 'enabled',) list_select_related = ('source',) - readonly_fields = ('button_actions',) search_fields = ['label', 'source__name', 'source__long_title'] - actions = ['schedule_full_ingest'] + actions = ['schedule_derive'] def source_(self, obj): return obj.source.long_title @@ -104,42 +78,10 @@ def enabled(self, obj): return not obj.disabled enabled.boolean = True # type: ignore[attr-defined] - @admin.action(description='schedule re-ingest of all raw data for each source config') - def schedule_full_ingest(self, request, queryset): + @admin.action(description='schedule re-derive of all cards for each selected source config') + def schedule_derive(self, request, queryset): for _id in queryset.values_list('id', flat=True): - digestive_tract.task__schedule_extract_and_derive_for_source_config.delay(_id) - - def get_urls(self): - return [ - url( - r'^(?P<config_id>.+)/ingest/$', - self.admin_site.admin_view(self.start_ingest), - name='source-config-ingest' - ) - ] + super().get_urls() - - def button_actions(self, obj): - return format_html( - ' '.join(( - ('<a href="{ingest_href}">Ingest</a>' if not obj.disabled else ''), - )), - ingest_href=reverse('admin:source-config-ingest', args=[obj.pk]), - ) - button_actions.short_description = 'Buttons' # type: ignore[attr-defined] - - def start_ingest(self, request, config_id): - config = self.get_object(request, config_id) - if request.method == 'POST': - digestive_tract.task__schedule_extract_and_derive_for_source_config.delay(config.pk) - url = reverse( - 'admin:share_sourceconfig_changelist', - current_app=self.admin_site.name, - ) - return HttpResponseRedirect(url) - else: - context = self.admin_site.each_context(request) - context['source_config'] = config - return TemplateResponse(request, 'admin/start-ingest.html', context) + digestive_tract.task__schedule_derive_for_source_config.delay(_id) @linked_fk('user') @@ -157,26 +99,16 @@ def access_token(self, obj): @linked_fk('source_config') @linked_fk('focus_identifier') @linked_many('formattedmetadatarecord_set', defer=('formatted_metadata',)) -@linked_many('raw_data', defer=('datum',)) @linked_many('indexcard_set') class SourceUniqueIdentifierAdmin(admin.ModelAdmin): readonly_fields = ('identifier',) paginator = TimeLimitedPaginator - actions = ('reingest', 'delete_cards_for_suid') + actions = ('delete_cards_for_suid',) list_filter = (SourceConfigFilter,) list_select_related = ('source_config',) show_full_result_count = False search_fields = ('identifier',) - def reingest(self, request, queryset): - _raw_id_queryset = ( - RawDatum.objects - .latest_by_suid_queryset(queryset) - .values_list('id', flat=True) - ) - for _raw_id in _raw_id_queryset: - digestive_tract.task__extract_and_derive.delay(raw_id=_raw_id) - def delete_cards_for_suid(self, request, queryset): for suid in queryset: digestive_tract.expel_suid(suid) @@ -220,7 +152,6 @@ class FeatureFlagAdmin(admin.ModelAdmin): admin_site.register(CeleryTaskResult, CeleryTaskResultAdmin) admin_site.register(FeatureFlag, FeatureFlagAdmin) admin_site.register(IndexBackfill, IndexBackfillAdmin) -admin_site.register(RawDatum,
RawDatumAdmin) admin_site.register(ShareUser, ShareUserAdmin) admin_site.register(SiteBanner, SiteBannerAdmin) admin_site.register(Source, SourceAdmin) diff --git a/share/celery.py b/share/celery.py index a097cbc41..ff0f626c5 100644 --- a/share/celery.py +++ b/share/celery.py @@ -1,13 +1,15 @@ +import datetime import functools import logging - from celery import states from celery.app.task import Context from celery.backends.base import BaseDictBackend from celery.utils.time import maybe_timedelta +from django.conf import settings from django.db import transaction +from django.db.models import Q from django.utils import timezone import sentry_sdk @@ -90,7 +92,10 @@ def _store_result(self, task_id, result, status, traceback=None, request=None, * @die_on_unhandled def cleanup(self, expires=None): - TaskResultCleaner(expires or self.expires).clean() + TaskResultCleaner( + success_ttl=(expires or self.expires), + nonsuccess_ttl=settings.FAILED_CELERY_RESULT_EXPIRES, + ).clean() @die_on_unhandled def _get_task_meta_for(self, task_id): @@ -111,20 +116,19 @@ class TaskResultCleaner: TaskModel = CeleryTaskResult - TASK_TTLS = { - } - - NO_ARCHIVE = { - } - - def __init__(self, expires, bucket=None, delete=True, chunk_size=5000): - self.bucket = bucket + def __init__(self, success_ttl, nonsuccess_ttl=None, delete=True, chunk_size=5000): self.chunk_size = chunk_size self.delete = delete - self.expires = expires + self.success_ttl = success_ttl + self.nonsuccess_ttl = nonsuccess_ttl or success_ttl - def get_ttl(self, task_name): - return timezone.now() - maybe_timedelta(self.TASK_TTLS.get(task_name, self.expires)) + @property + def success_cutoff(self) -> datetime.datetime: + return timezone.now() - maybe_timedelta(self.success_ttl) + + @property + def nonsuccess_cutoff(self) -> datetime.datetime: + return timezone.now() - maybe_timedelta(self.nonsuccess_ttl) def get_task_names(self): qs = self.TaskModel.objects.values('task_name').annotate(name=GroupBy('task_name')) @@ -137,12 +141,15 @@ def get_task_names(self): def clean(self): for name in self.get_task_names(): - logger.debug('Looking for succeeded %s tasks modified before %s', name, self.get_ttl(name)) - - queryset = self.TaskModel.objects.filter( - task_name=name, - status=states.SUCCESS, - date_modified__lt=self.get_ttl(name) + success_q = Q(status=states.SUCCESS, date_modified__lt=self.success_cutoff) + nonsuccess_q = ( + ~Q(status=states.SUCCESS) + & Q(date_modified__lt=self.nonsuccess_cutoff) + ) + queryset = ( + self.TaskModel.objects + .filter(task_name=name) + .filter(success_q | nonsuccess_q) ) if not queryset.exists(): diff --git a/share/exceptions.py b/share/exceptions.py index 801aedc61..45a26679f 100644 --- a/share/exceptions.py +++ b/share/exceptions.py @@ -1,31 +1,3 @@ class ShareException(Exception): pass - - -class HarvestError(ShareException): - pass - - -class IngestError(ShareException): - pass - - -class TransformError(IngestError): - pass - - -class RegulateError(IngestError): - pass - - -class MergeRequired(IngestError): - """A node disambiguated to multiple objects in the database. - """ - pass - - -class IngestConflict(IngestError): - """Multiple data being ingested at the same time conflicted.
- """ - pass diff --git a/share/management/commands/delete_pretrove_data.py b/share/management/commands/delete_pretrove_data.py deleted file mode 100644 index 5b9c0249f..000000000 --- a/share/management/commands/delete_pretrove_data.py +++ /dev/null @@ -1,73 +0,0 @@ -from django.db.models import OuterRef, Exists -from django.utils.translation import gettext as _ - -from share.management.commands import BaseShareCommand -from share import models as _db - - -class Command(BaseShareCommand): - def add_arguments(self, parser): - parser.add_argument('--chunksize', type=int, default=1024, help='number of RawData per DELETE') - parser.add_argument('--really-really', action='store_true', help='skip final confirmation prompt before really deleting') - - def handle(self, *args, chunksize: int, really_really: bool, **kwargs): - # note: `share.transform` deleted; `transformer_key` always null for trove-ingested rdf - _pretrove_configs = _db.SourceConfig.objects.filter(transformer_key__isnull=False) - _pretrove_configs_with_rawdata = ( - _pretrove_configs - .annotate(has_rawdata=Exists( - _db.RawDatum.objects - .filter(suid__source_config_id=OuterRef('pk')) - )) - .filter(has_rawdata=True) - ) - if not _pretrove_configs_with_rawdata.exists(): - self.stdout.write(self.style.SUCCESS(_('nothing to delete'))) - return - _sourceconfig_ids_and_labels = list( - _pretrove_configs_with_rawdata.values_list('id', 'label'), - ) - self.stdout.write(self.style.WARNING(_('pre-trove source-configs with deletable rawdata:'))) - for __, _sourceconfig_label in _sourceconfig_ids_and_labels: - self.stdout.write(f'\t{_sourceconfig_label}') - if really_really or self.input_confirm(self.style.WARNING(_('really DELETE ALL raw metadata records belonging to these source-configs? (y/n)'))): - _total_deleted = 0 - for _sourceconfig_id, _sourceconfig_label in _sourceconfig_ids_and_labels: - _total_deleted += self._do_delete_rawdata(_sourceconfig_id, _sourceconfig_label, chunksize) - self.stdout.write(self.style.SUCCESS(_('deleted %(count)s items') % {'count': _total_deleted})) - else: - self.stdout.write(self.style.SUCCESS(_('deleted nothing'))) - - def _do_delete_rawdata(self, sourceconfig_id, sourceconfig_label, chunksize) -> int: - # note: `.delete()` cannot be called on sliced querysets, so chunking is more complicated - # -- before deleting each chunk, query for its last pk to filter on as a sentinel value - _prior_sentinel_pk = None - _deleted_count = 0 - _rawdata_qs = ( - _db.RawDatum.objects - .filter(suid__source_config_id=sourceconfig_id) - .order_by('pk') # for consistent chunking - ) - self.stdout.write(_('%(label)s: deleting all rawdata...') % {'label': sourceconfig_label}) - while True: # for each chunk: - _pk_qs = _rawdata_qs.values_list('pk', flat=True) - # get the last pk in the chunk - _sentinel_pk = _pk_qs[chunksize - 1: chunksize].first() or _pk_qs.last() - if _sentinel_pk is not None: - if (_prior_sentinel_pk is not None) and (_sentinel_pk <= _prior_sentinel_pk): - raise RuntimeError(f'sentinel pks not ascending?? got {_sentinel_pk} after {_prior_sentinel_pk}') - _prior_sentinel_pk = _sentinel_pk - _chunk_to_delete = _rawdata_qs.filter(pk__lte=_sentinel_pk) - _chunk_deleted_count, _by_model = _chunk_to_delete.delete() - if _by_model and set(_by_model.keys()) != {'share.RawDatum'}: - raise RuntimeError(f'deleted models other than RawDatum?? 
{_by_model}') - self.stdout.write( - _('%(label)s: deleted %(count)s') % {'label': sourceconfig_label, 'count': _chunk_deleted_count}, - ) - _deleted_count += _chunk_deleted_count - continue # next chunk - # end - self.stdout.write(self.style.SUCCESS( - _('%(label)s: done; deleted %(count)s') % {'label': sourceconfig_label, 'count': _deleted_count}, - )) - return _deleted_count diff --git a/share/migrations/0001_squashed_0058_big_rend.py b/share/migrations/0001_squashed_0058_big_rend.py index 501fe8044..64b388823 100644 --- a/share/migrations/0001_squashed_0058_big_rend.py +++ b/share/migrations/0001_squashed_0058_big_rend.py @@ -11,7 +11,9 @@ import django.utils.timezone import share.models.core import share.models.fields -import share.models.ingest +import share.models._old +import share.models.source +import share.models.source_config import share.models.validators import share.version @@ -135,9 +137,6 @@ class Migration(migrations.Migration): ('date_created', models.DateTimeField(auto_now_add=True)), ('date_modified', models.DateTimeField(auto_now=True)), ], - managers=[ - ('objects', share.models.ingest.NaturalKeyManager('key')), - ], ), migrations.CreateModel( name='HarvestJob', @@ -167,10 +166,10 @@ class Migration(migrations.Migration): ('name', models.TextField(unique=True)), ('long_title', models.TextField(unique=True)), ('home_page', models.URLField(null=True)), - ('icon', models.ImageField(null=True, storage=share.models.ingest.SourceIconStorage(), upload_to=share.models.ingest.icon_name)), + ('icon', models.ImageField(null=True, storage=share.models._old.SourceIconStorage(), upload_to=share.models._old.icon_name)), ], managers=[ - ('objects', share.models.ingest.NaturalKeyManager('name')), + ('objects', share.models.source.SourceManager()), ], ), migrations.CreateModel( @@ -190,7 +189,7 @@ class Migration(migrations.Migration): ('source', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='share.Source')), ], managers=[ - ('objects', share.models.ingest.NaturalKeyManager('label')), + ('objects', share.models.source_config.SourceConfigManager()), ], ), migrations.CreateModel( @@ -217,9 +216,6 @@ class Migration(migrations.Migration): ('date_created', models.DateTimeField(auto_now_add=True)), ('date_modified', models.DateTimeField(auto_now=True)), ], - managers=[ - ('objects', share.models.ingest.NaturalKeyManager('key')), - ], ), migrations.CreateModel( name='RawDatumJob', @@ -297,7 +293,7 @@ class Migration(migrations.Migration): migrations.AlterField( model_name='source', name='icon', - field=models.ImageField(blank=True, default='', storage=share.models.ingest.SourceIconStorage(), upload_to=share.models.ingest.icon_name), + field=models.ImageField(blank=True, default='', storage=share.models._old.SourceIconStorage(), upload_to=share.models._old.icon_name), preserve_default=False, ), migrations.AddField( @@ -460,7 +456,7 @@ class Migration(migrations.Migration): migrations.AlterModelManagers( name='sourceconfig', managers=[ - ('objects', share.models.ingest.SourceConfigManager('label')), + ('objects', share.models.source_config.SourceConfigManager()), ], ), migrations.AddField( diff --git a/share/migrations/0061_ensure_auto_users.py b/share/migrations/0061_ensure_auto_users.py index e422e7eaf..4a20c30af 100644 --- a/share/migrations/0061_ensure_auto_users.py +++ b/share/migrations/0061_ensure_auto_users.py @@ -27,15 +27,15 @@ def ensure_share_system_user(apps, schema_editor): def ensure_share_admin_user(apps, schema_editor): - import os ShareUser =
apps.get_model('share', 'ShareUser') - - admin_username = 'admin' - admin_user_exists = ShareUser.objects.filter(username=admin_username).exists() - if not admin_user_exists: + if ( + settings.SHARE_ADMIN_USERNAME + and settings.SHARE_ADMIN_PASSWORD + and not ShareUser.objects.filter(username=settings.SHARE_ADMIN_USERNAME).exists() + ): ShareUser.objects.create_superuser( - admin_username, - os.environ.get('SHARE_ADMIN_PASSWORD', 'password') + settings.SHARE_ADMIN_USERNAME, + settings.SHARE_ADMIN_PASSWORD, ) @@ -48,8 +48,10 @@ class Migration(migrations.Migration): operations = [ migrations.RunPython( code=ensure_share_system_user, + reverse_code=migrations.RunPython.noop, ), migrations.RunPython( code=ensure_share_admin_user, + reverse_code=migrations.RunPython.noop, ), ] diff --git a/share/migrations/0078_delete_rawdatum.py b/share/migrations/0078_delete_rawdatum.py new file mode 100644 index 000000000..7d1104d3f --- /dev/null +++ b/share/migrations/0078_delete_rawdatum.py @@ -0,0 +1,17 @@ +# Generated by Django 3.2.25 on 2025-05-30 17:29 + +from django.db import migrations + + +class Migration(migrations.Migration): + + dependencies = [ + ('share', '0077_big_cleanup_2025'), + ('trove', '0009_no_raw_datum'), + ] + + operations = [ + migrations.DeleteModel( + name='RawDatum', + ), + ] diff --git a/share/models/__init__.py b/share/models/__init__.py index 338e34ecf..f53ac7b2a 100644 --- a/share/models/__init__.py +++ b/share/models/__init__.py @@ -1,21 +1,17 @@ -from share.models.source_unique_identifier import SourceUniqueIdentifier -from share.models.index_backfill import IndexBackfill -from share.models.feature_flag import FeatureFlag -from share.models.core import ShareUser -from share.models.ingest import ( - Source, - SourceConfig, - RawDatum, -) from share.models.banner import SiteBanner from share.models.celery import CeleryTaskResult +from share.models.core import ShareUser +from share.models.feature_flag import FeatureFlag from share.models.fields import DateTimeAwareJSONField +from share.models.index_backfill import IndexBackfill +from share.models.source import Source +from share.models.source_config import SourceConfig +from share.models.source_unique_identifier import SourceUniqueIdentifier __all__ = ( 'CeleryTaskResult', 'FeatureFlag', 'IndexBackfill', - 'RawDatum', 'ShareUser', 'SiteBanner', 'Source', diff --git a/share/models/_old.py b/share/models/_old.py new file mode 100644 index 000000000..d42a7f8df --- /dev/null +++ b/share/models/_old.py @@ -0,0 +1,9 @@ +# things kept temporarily, until the old migrations that reference them are squashed away + + +def icon_name(): + ... # removed; stub for past migrations only + + +def SourceIconStorage(): + ... # removed; stub for past migrations only diff --git a/share/models/ingest.py b/share/models/ingest.py deleted file mode 100644 index c5a662c01..000000000 --- a/share/models/ingest.py +++ /dev/null @@ -1,275 +0,0 @@ -import datetime -import hashlib -import logging - -from django.core import validators -from django.db import connection -from django.db import models -from django.db.models.functions import Coalesce -import sentry_sdk - -from share.models.core import ShareUser -from share.models.fuzzycount import FuzzyCountManager -from share.models.source_unique_identifier import SourceUniqueIdentifier -from share.util import chunked, BaseJSONAPIMeta - - -logger = logging.getLogger(__name__) -__all__ = ('Source', 'SourceConfig', 'RawDatum', ) - - -def icon_name(): - ...
# removed; stub for past migrations only - - -def SourceIconStorage(): - ... # removed; stub for past migrations only - - -class NaturalKeyManager(models.Manager): - use_in_migrations = True - - def __init__(self, *key_fields): - super(NaturalKeyManager, self).__init__() - self.key_fields = key_fields - - def get_by_natural_key(self, key): - return self.get(**dict(zip(self.key_fields, key))) - - -class Source(models.Model): - name = models.TextField(unique=True) - long_title = models.TextField(unique=True) - home_page = models.URLField(null=True, blank=True) - is_deleted = models.BooleanField(default=False) - - # Whether or not this SourceConfig collects original content - # If True changes made by this source cannot be overwritten - # This should probably be on SourceConfig but placing it on Source - # is much easier for the moment. - # I also haven't seen a situation where a Source has two feeds that we harvest - # where one provider unreliable metadata but the other does not. - canonical = models.BooleanField(default=False, db_index=True) - - # TODO replace with object permissions, allow multiple sources per user (SHARE-996) - user = models.OneToOneField('ShareUser', null=True, on_delete=models.CASCADE) - - objects = NaturalKeyManager('name') - - class JSONAPIMeta(BaseJSONAPIMeta): - pass - - def natural_key(self): - return (self.name,) - - def __repr__(self): - return '<{}({}, {}, {})>'.format(self.__class__.__name__, self.pk, self.name, self.long_title) - - def __str__(self): - return repr(self) - - -class SourceConfigManager(NaturalKeyManager): - def get_or_create_push_config(self, user, transformer_key=None): - assert isinstance(user, ShareUser) - _config_label = '.'.join(( - user.username, - transformer_key or 'rdf', # TODO: something cleaner? - )) - try: - _config = SourceConfig.objects.get(label=_config_label) - except SourceConfig.DoesNotExist: - _source, _ = Source.objects.get_or_create( - user_id=user.id, - defaults={ - 'name': user.username, - 'long_title': user.username, - } - ) - _config, _ = SourceConfig.objects.get_or_create( - label=_config_label, - defaults={ - 'source': _source, - 'transformer_key': transformer_key, - } - ) - assert _config.source.user_id == user.id - assert _config.transformer_key == transformer_key - return _config - - -class SourceConfig(models.Model): - # Previously known as the provider's app_label - label = models.TextField(unique=True) - version = models.PositiveIntegerField(default=1) - - source = models.ForeignKey('Source', on_delete=models.CASCADE, related_name='source_configs') - base_url = models.URLField(null=True) - transformer_key = models.TextField(null=True) - - disabled = models.BooleanField(default=False) - - objects = SourceConfigManager('label') - - class JSONAPIMeta(BaseJSONAPIMeta): - pass - - def natural_key(self): - return (self.label,) - - def __repr__(self): - return '<{}({}, {})>'.format(self.__class__.__name__, self.pk, self.label) - - __str__ = __repr__ - - -class RawDatumManager(FuzzyCountManager): - - def link_to_job(self, job, datum_ids): - if not datum_ids: - return True - logger.debug('Linking RawData to %r', job) - with connection.cursor() as cursor: - for chunk in chunked(datum_ids, size=500): - if not chunk: - break - cursor.execute(''' - INSERT INTO "{table}" - ("{rawdatum}", "{harvestjob}") - VALUES - {values} - ON CONFLICT ("{rawdatum}", "{harvestjob}") DO NOTHING; - '''.format( - values=', '.join('%s' for _ in range(len(chunk))), # Nasty hack.
Fix when psycopg2 2.7 is released with execute_values - table=RawDatum.jobs.through._meta.db_table, - rawdatum=RawDatum.jobs.through._meta.get_field('rawdatum').column, - harvestjob=RawDatum.jobs.through._meta.get_field('harvestjob').column, - ), [(raw_id, job.id) for raw_id in chunk]) - return True - - def store_datum_for_suid( - self, - *, - suid, - datum: str, - mediatype: str, - datestamp: datetime.datetime, - expiration_date: datetime.date | None = None, - ): - _raw, _raw_created = self.get_or_create( - suid=suid, - sha256=hashlib.sha256(datum.encode()).hexdigest(), - defaults={ - 'datum': datum, - 'mediatype': mediatype, - 'datestamp': datestamp, - 'expiration_date': expiration_date, - }, - ) - if not _raw_created: - if _raw.datum != datum: - _msg = f'hash collision!? {_raw.sha256}\n===\n{_raw.datum}\n===\n{datum}' - logger.critical(_msg) - sentry_sdk.capture_message(_msg) - _raw.mediatype = mediatype - _raw.expiration_date = expiration_date - # keep the latest datestamp - if (not _raw.datestamp) or (datestamp > _raw.datestamp): - _raw.datestamp = datestamp - _raw.save(update_fields=('mediatype', 'datestamp', 'expiration_date', 'date_modified')) - return _raw - - def latest_by_suid_id(self, suid_id) -> models.QuerySet: - return self.latest_by_suid_queryset( - SourceUniqueIdentifier.objects.filter(id=suid_id), - ) - - def latest_by_suid_queryset(self, suid_queryset) -> models.QuerySet: - return self.filter(id__in=( - suid_queryset - .annotate(latest_rawdatum_id=models.Subquery( - RawDatum.objects - .filter(suid_id=models.OuterRef('id')) - .order_by(Coalesce('datestamp', 'date_created').desc(nulls_last=True)) - .values('id') - [:1] - )) - .values('latest_rawdatum_id') - )) - - def latest_for_each_suid(self) -> models.QuerySet: - # only the latest datum for each described resource - _latest_pk_subquery = models.Subquery( - self.filter(suid_id=models.OuterRef('suid_id')) - .order_by(Coalesce('datestamp', 'date_created').desc(nulls_last=True)) - .values('pk') - [:1] - ) - return self.annotate( - latest_same_suid=_latest_pk_subquery, - ).filter(pk=models.F('latest_same_suid')) - - -class RawDatum(models.Model): - - datum = models.TextField() - mediatype = models.TextField(null=True, blank=True) - - suid = models.ForeignKey(SourceUniqueIdentifier, on_delete=models.CASCADE, related_name='raw_data') - - # The sha256 of the datum - sha256 = models.TextField(validators=[validators.MaxLengthValidator(64)]) - - datestamp = models.DateTimeField(null=True, help_text=( - 'The most relevant datetime that can be extracted from this RawDatum. ' - 'This may be, but is not limited to, a deletion, modification, publication, or creation datestamp. ' - 'Ideally, this datetime should be appropriate for determining the chronological order its data will be applied.' - )) - expiration_date = models.DateField( - null=True, - blank=True, - help_text='An (optional) date after which this datum is no longer valid.', - ) - - date_modified = models.DateTimeField(auto_now=True, editable=False) - date_created = models.DateTimeField(auto_now_add=True, editable=False) - - no_output = models.BooleanField(null=True, help_text=( - 'Indicates that this RawDatum resulted in an empty graph when transformed. ' - 'This allows the RawDataJanitor to find records that have not been processed. ' - 'Records that result in an empty graph will not have an Indexcard associated with them, ' - 'which would otherwise look like data that has not yet been processed.'
- )) - - objects = RawDatumManager() - - def is_latest(self): - return ( - RawDatum.objects - .latest_by_suid_id(self.suid_id) - .filter(id=self.id) - .exists() - ) - - @property - def is_expired(self) -> bool: - return ( - self.expiration_date is not None - and self.expiration_date <= datetime.date.today() - ) - - class Meta: - unique_together = ('suid', 'sha256') - verbose_name_plural = 'Raw Data' - indexes = [ - models.Index(fields=['no_output'], name='share_rawda_no_outp_f0330f_idx'), - models.Index(fields=['expiration_date'], name='share_rawdatum_expiration_idx'), - ] - - class JSONAPIMeta(BaseJSONAPIMeta): - resource_name = 'RawData' - - def __repr__(self): - return '<{}({}, {}, {}...)>'.format(self.__class__.__name__, self.id, self.datestamp, self.sha256[:10]) - - __str__ = __repr__ diff --git a/share/models/source.py b/share/models/source.py new file mode 100644 index 000000000..34cef8b61 --- /dev/null +++ b/share/models/source.py @@ -0,0 +1,45 @@ +from django.db import models + +from share.util import BaseJSONAPIMeta + + +__all__ = ('Source', 'SourceManager',) + + +class SourceManager(models.Manager): + use_in_migrations = True + + def get_by_natural_key(self, key): + return self.get(name=key) + + +class Source(models.Model): + name = models.TextField(unique=True) + long_title = models.TextField(unique=True) + home_page = models.URLField(null=True, blank=True) + is_deleted = models.BooleanField(default=False) + + # Whether or not this SourceConfig collects original content + # If True changes made by this source cannot be overwritten + # This should probably be on SourceConfig but placing it on Source + # is much easier for the moment. + # I also haven't seen a situation where a Source has two feeds that we harvest + # where one provider unreliable metadata but the other does not. + canonical = models.BooleanField(default=False, db_index=True) + + # TODO replace with object permissions, allow multiple sources per user (SHARE-996) + user = models.OneToOneField('ShareUser', null=True, on_delete=models.CASCADE) + + objects = SourceManager() + + class JSONAPIMeta(BaseJSONAPIMeta): + pass + + def natural_key(self): + return (self.name,) + + def __repr__(self): + return '<{}({}, {}, {})>'.format(self.__class__.__name__, self.pk, self.name, self.long_title) + + def __str__(self): + return repr(self) diff --git a/share/models/source_config.py b/share/models/source_config.py new file mode 100644 index 000000000..a23dfcaf9 --- /dev/null +++ b/share/models/source_config.py @@ -0,0 +1,68 @@ + +from django.db import models + +from share.models.core import ShareUser +from share.models.source import Source +from share.util import BaseJSONAPIMeta + + +__all__ = ('SourceConfig',) + + +class SourceConfigManager(models.Manager): + use_in_migrations = True + + def get_by_natural_key(self, key): + return self.get(label=key) + + def get_or_create_push_config(self, user, transformer_key=None): + assert isinstance(user, ShareUser) + _config_label = '.'.join(( + user.username, + transformer_key or 'rdf', # TODO: something cleaner? 
+ )) + try: + _config = SourceConfig.objects.get(label=_config_label) + except SourceConfig.DoesNotExist: + _source, _ = Source.objects.get_or_create( + user_id=user.id, + defaults={ + 'name': user.username, + 'long_title': user.username, + } + ) + _config, _ = SourceConfig.objects.get_or_create( + label=_config_label, + defaults={ + 'source': _source, + 'transformer_key': transformer_key, + } + ) + assert _config.source.user_id == user.id + assert _config.transformer_key == transformer_key + return _config + + +class SourceConfig(models.Model): + # Previously known as the provider's app_label + label = models.TextField(unique=True) + version = models.PositiveIntegerField(default=1) + + source = models.ForeignKey('Source', on_delete=models.CASCADE, related_name='source_configs') + base_url = models.URLField(null=True) + transformer_key = models.TextField(null=True) + + disabled = models.BooleanField(default=False) + + objects = SourceConfigManager() + + class JSONAPIMeta(BaseJSONAPIMeta): + pass + + def natural_key(self): + return (self.label,) + + def __repr__(self): + return '<{}({}, {})>'.format(self.__class__.__name__, self.pk, self.label) + + __str__ = __repr__ diff --git a/share/models/source_unique_identifier.py b/share/models/source_unique_identifier.py index bc3bbaf5e..05c6eb7d5 100644 --- a/share/models/source_unique_identifier.py +++ b/share/models/source_unique_identifier.py @@ -1,6 +1,3 @@ -import datetime -from typing import Optional - from django.db import models from share.util import BaseJSONAPIMeta @@ -22,28 +19,6 @@ class JSONAPIMeta(BaseJSONAPIMeta): class Meta: unique_together = ('identifier', 'source_config') - def most_recent_raw_datum(self): - """fetch the most recent RawDatum for this suid - """ - return self._most_recent_raw_datum_queryset().first() - - def most_recent_raw_datum_id(self): - return self._most_recent_raw_datum_queryset().values_list('id', flat=True).first() - - def _most_recent_raw_datum_queryset(self): - from share.models import RawDatum - return RawDatum.objects.latest_by_suid_id(self.id) - - def get_date_first_seen(self) -> Optional[datetime.datetime]: - """when the first RawDatum for this suid was added - """ - return ( - self.raw_data - .order_by('date_created') - .values_list('date_created', flat=True) - .first() - ) - def get_backcompat_sharev2_suid(self): '''get an equivalent "v2_push" suid for this suid diff --git a/share/oaipmh/indexcard_repository.py b/share/oaipmh/indexcard_repository.py index 76de6255e..d9d855f75 100644 --- a/share/oaipmh/indexcard_repository.py +++ b/share/oaipmh/indexcard_repository.py @@ -106,7 +106,7 @@ def oai_identifier(self, indexcard): def _do_identify(self, kwargs, renderer): _earliest_date = ( - trove_db.LatestIndexcardRdf.objects + trove_db.LatestResourceDescription.objects .order_by('modified') .values_list('modified', flat=True) .first() @@ -213,7 +213,7 @@ def _get_indexcard_page_queryset(self, kwargs, catch=True, last_id=None): self.errors.append(oai_errors.BadArgument('Invalid value for', 'from')) else: _indexcard_queryset = _indexcard_queryset.filter( - trove_latestindexcardrdf_set__modified__gte=_from, + trove_latestresourcedescription_set__modified__gte=_from, ) if 'until' in kwargs: try: @@ -224,7 +224,7 @@ def _get_indexcard_page_queryset(self, kwargs, catch=True, last_id=None): self.errors.append(oai_errors.BadArgument('Invalid value for', 'until')) else: _indexcard_queryset = _indexcard_queryset.filter( - trove_latestindexcardrdf_set__modified__lte=_until, + 
trove_latestresourcedescription_set__modified__lte=_until, ) if 'set' in kwargs: _sourceconfig_ids = tuple( @@ -246,7 +246,7 @@ def _get_base_indexcard_queryset(self): def _get_indexcard_queryset_with_annotations(self): return self._get_base_indexcard_queryset().annotate( oai_datestamp=Subquery( - trove_db.LatestIndexcardRdf.objects + trove_db.LatestResourceDescription.objects .filter(indexcard_id=OuterRef('id')) .values_list('modified', flat=True) [:1] diff --git a/share/search/index_strategy/_trovesearch_util.py b/share/search/index_strategy/_trovesearch_util.py index e38872712..d83f5b16d 100644 --- a/share/search/index_strategy/_trovesearch_util.py +++ b/share/search/index_strategy/_trovesearch_util.py @@ -46,9 +46,9 @@ ### # utilities -def latest_rdf_for_indexcard_pks(indexcard_pks): +def latest_resource_description_for_indexcard_pks(indexcard_pks): return ( - trove_db.LatestIndexcardRdf.objects + trove_db.LatestResourceDescription.objects .filter(indexcard_id__in=indexcard_pks) .filter(Exists( # only index items that have an osfmap_json representation trove_db.DerivedIndexcard.objects @@ -61,7 +61,7 @@ .exclude(indexcard__deleted__isnull=False) .select_related('indexcard__source_record_suid__source_config') .prefetch_related('indexcard__focus_identifier_set') - .prefetch_related('indexcard__supplementary_rdf_set') + .prefetch_related('indexcard__supplementary_description_set') ) diff --git a/share/search/index_strategy/trovesearch_denorm.py b/share/search/index_strategy/trovesearch_denorm.py index a65eb776f..19cea8d80 100644 --- a/share/search/index_strategy/trovesearch_denorm.py +++ b/share/search/index_strategy/trovesearch_denorm.py @@ -224,12 +224,12 @@ def after_chunk(self, messages_chunk: messages.MessagesChunk, affected_indexname # abstract method from Elastic8IndexStrategy def build_elastic_actions(self, messages_chunk: messages.MessagesChunk): - _indexcard_rdf_qs = ts.latest_rdf_for_indexcard_pks(messages_chunk.target_ids_chunk) + _resource_description_qs = ts.latest_resource_description_for_indexcard_pks(messages_chunk.target_ids_chunk) _remaining_indexcard_pks = set(messages_chunk.target_ids_chunk) - for _indexcard_rdf in _indexcard_rdf_qs: - _docbuilder = self._SourcedocBuilder(_indexcard_rdf, messages_chunk.timestamp) + for _resource_description in _resource_description_qs: + _docbuilder = self._SourcedocBuilder(_resource_description, messages_chunk.timestamp) if not _docbuilder.should_skip(): # if skipped, will be deleted - _indexcard_pk = _indexcard_rdf.indexcard_id + _indexcard_pk = _resource_description.indexcard_id _cardsearch_actions = ( self.build_index_action(_doc_id, _doc) for _doc_id, _doc in _docbuilder.build_cardsearch_docs() @@ -333,16 +333,16 @@ def pls_handle_valuesearch(self, valuesearch_params: ValuesearchParams) -> Value class _SourcedocBuilder: '''build elasticsearch sourcedocs for an rdf document ''' - indexcard_rdf: trove_db.IndexcardRdf + resource_description: trove_db.ResourceDescription chunk_timestamp: int indexcard: trove_db.Indexcard = dataclasses.field(init=False) focus_iri: str = dataclasses.field(init=False) rdfdoc: rdf.RdfTripleDictionary = dataclasses.field(init=False) def __post_init__(self) -> None: - self.indexcard = self.indexcard_rdf.indexcard - self.focus_iri = self.indexcard_rdf.focus_iri - self.rdfdoc = self.indexcard_rdf.as_rdfdoc_with_supplements() + self.indexcard = self.resource_description.indexcard + self.focus_iri = self.resource_description.focus_iri + self.rdfdoc =
self.resource_description.as_rdfdoc_with_supplements() def should_skip(self) -> bool: _suid = self.indexcard.source_record_suid diff --git a/share/util/checksum_iri.py b/share/util/checksum_iri.py index 012fdbab2..552aeb91c 100644 --- a/share/util/checksum_iri.py +++ b/share/util/checksum_iri.py @@ -12,7 +12,7 @@ def _ensure_bytes(bytes_or_something) -> bytes: def _builtin_checksum(hash_constructor): - def hexdigest_fn(salt, data) -> str: + def hexdigest_fn(salt: str | bytes, data: str | bytes) -> str: hasher = hash_constructor() hasher.update(_ensure_bytes(salt)) hasher.update(_ensure_bytes(data)) @@ -37,7 +37,7 @@ def __str__(self): return f'urn:checksum:{self.checksumalgorithm_name}:{self.salt}:{self.hexdigest}' @classmethod - def digest(cls, checksumalgorithm_name, *, salt, raw_data): + def digest(cls, checksumalgorithm_name: str, *, salt: str, data: str): try: hexdigest_fn = CHECKSUM_ALGORITHMS[checksumalgorithm_name] except KeyError: @@ -48,7 +48,7 @@ def digest(cls, checksumalgorithm_name, *, salt, raw_data): return cls( checksumalgorithm_name=checksumalgorithm_name, salt=salt, - hexdigest=hexdigest_fn(salt, raw_data), + hexdigest=hexdigest_fn(salt, data), ) @classmethod @@ -56,7 +56,7 @@ def digest_json(cls, checksumalgorithm_name, *, salt, raw_json): return cls.digest( checksumalgorithm_name, salt=salt, - raw_data=json.dumps(raw_json, sort_keys=True), + data=json.dumps(raw_json, sort_keys=True), ) @classmethod diff --git a/share/version.py b/share/version.py index b1fb40a82..191d57ff7 100644 --- a/share/version.py +++ b/share/version.py @@ -1,4 +1,4 @@ -__version__ = '25.3.3' +__version__ = '25.4.0' def get_share_version() -> str: diff --git a/templates/admin/start-ingest.html b/templates/admin/start-ingest.html deleted file mode 100644 index f0008471a..000000000 --- a/templates/admin/start-ingest.html +++ /dev/null @@ -1,26 +0,0 @@ -{% extends "admin/base_site.html" %} -{% load i18n %} - -{% block extrastyle %} - -{% endblock %} - -{% block content %}

{% trans "schedule full (re)ingest" %}

-
-

for source config "{{ source_config.label }}"

-
- {% csrf_token %} - - -
-
-{% endblock %} diff --git a/tests/api/test_generated_endpoints.py b/tests/api/test_generated_endpoints.py index d0605f6c8..ab4c10902 100644 --- a/tests/api/test_generated_endpoints.py +++ b/tests/api/test_generated_endpoints.py @@ -5,57 +5,63 @@ # TODO these tests belong somewhere else @pytest.mark.django_db -@pytest.mark.parametrize('endpoint, factory', [ - ('rawdata', factories.RawDatumFactory), +@pytest.mark.parametrize('endpoint, factory, autocreated_count', [ + ('site_banners', factories.SiteBannerFactory, 0), + ('sourceconfigs', factories.SourceConfigFactory, 0), + ('sources', factories.SourceFactory, 1), ]) class TestPagination: - def test_no_prev(self, client, endpoint, factory): + def test_no_prev(self, client, endpoint, factory, autocreated_count): resp = client.get('/api/v2/{}/'.format(endpoint)) assert resp.status_code == 200 - assert resp.json()['data'] == [] - assert resp.json()['links']['prev'] is None - assert resp.json()['links']['next'] is None + _json = resp.json() + assert len(_json['data']) == autocreated_count + _links = _json.get('links', {}) + assert _links.get('prev') is None + assert _links.get('next') is None - def test_one(self, client, endpoint, factory): + def test_one(self, client, endpoint, factory, autocreated_count): factory() resp = client.get('/api/v2/{}/'.format(endpoint)) assert resp.status_code == 200 - assert len(resp.json()['data']) == 1 - assert resp.json()['links']['prev'] is None - assert resp.json()['links']['next'] is None - - def test_full_page(self, client, endpoint, factory): - for _ in range(10): - factory() + _json = resp.json() + assert len(_json['data']) == autocreated_count + 1 + _links = _json.get('links', {}) + assert _links.get('prev') is None + assert _links.get('next') is None + def test_full_page(self, client, endpoint, factory, autocreated_count): + factory.create_batch(10 - autocreated_count) resp = client.get('/api/v2/{}/'.format(endpoint)) assert resp.status_code == 200 + _json = resp.json() + assert len(_json['data']) == 10 + _links = _json.get('links', {}) + assert _links.get('prev') is None + assert _links.get('next') is None - assert len(resp.json()['data']) == 10 - assert resp.json()['links']['prev'] is None - assert resp.json()['links']['next'] is None - - def test_next_page(self, client, endpoint, factory): - for _ in range(20): - factory() - + def test_next_page(self, client, endpoint, factory, autocreated_count): + factory.create_batch(20 - autocreated_count) resp = client.get('/api/v2/{}/'.format(endpoint)) assert resp.status_code == 200 - assert len(resp.json()['data']) == 10 - assert resp.json()['links']['prev'] is None - assert resp.json()['links']['next'] is not None - assert 'page%5Bcursor%5D' in resp.json()['links']['next'] + _json = resp.json() + assert len(_json['data']) == 10 + _links = _json.get('links', {}) + assert _links.get('prev') is None + assert _links.get('next') is not None + assert 'page%5Bcursor%5D' in _links['next'] - resp2 = client.get(resp.json()['links']['next']) + resp2 = client.get(_links['next']) assert resp2.status_code == 200 - assert resp2.json()['links']['next'] is None + _json2 = resp2.json() + assert _json2['links'].get('next') is None - assert set(x['id'] for x in resp.json()['data']) & set(x['id'] for x in resp2.json()['data']) == set() + assert set(x['id'] for x in _json['data']) & set(x['id'] for x in _json2['data']) == set() - def test_bad_cursor(self, client, endpoint, factory): + def test_bad_cursor(self, client, endpoint, factory, autocreated_count): resp =
client.get(f'/api/v2/{endpoint}/', {'page[cursor]': 1}) assert resp.status_code == 404 assert resp.json() == {'errors': [{ diff --git a/tests/api/test_readonly_endpoints.py b/tests/api/test_readonly_endpoints.py index 57fd600ee..614207ccd 100644 --- a/tests/api/test_readonly_endpoints.py +++ b/tests/api/test_readonly_endpoints.py @@ -19,22 +19,6 @@ def get_test_data(endpoint_type): return test_data -@pytest.mark.django_db -class TestRawDataEndpoint: - endpoint = '/api/v2/rawdata/' - - def test_status(self, client): - assert client.get(self.endpoint).status_code == 200 - - def test_post(self, client, trusted_user): - assert client.post( - self.endpoint, - json.dumps(get_test_data('RawData')), - content_type='application/vnd.api+json', - HTTP_AUTHORIZATION='Bearer ' + trusted_user.oauth2_provider_accesstoken.first().token, - ).status_code == 405 - - @pytest.mark.django_db class TestSiteBannersEndpoint: endpoint = '/api/v2/site_banners/' diff --git a/tests/conftest.py b/tests/conftest.py index 61eef76dd..6fe8424cd 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -12,7 +12,6 @@ from oauth2_provider.models import AccessToken, Application -from share.models import RawDatum from share.models import ShareUser from share.models import SourceUniqueIdentifier @@ -96,18 +95,6 @@ def suid(source_config): return suid -@pytest.fixture -def raw_data(suid): - raw_data = RawDatum(suid=suid, datum='{}') - raw_data.save() - return raw_data - - -@pytest.fixture -def raw_data_id(raw_data): - return raw_data.id - - @contextlib.contextmanager def rolledback_transaction(loglabel): class ExpectedRollback(Exception): diff --git a/tests/factories/__init__.py b/tests/factories/__init__.py index d23f4f316..84c3c8300 100644 --- a/tests/factories/__init__.py +++ b/tests/factories/__init__.py @@ -1,4 +1,3 @@ -import hashlib import uuid import factory @@ -51,25 +50,15 @@ class Meta: model = share_db.SourceUniqueIdentifier -class RawDatumFactory(DjangoModelFactory): - datum = factory.Sequence(lambda n: f'{n}{fake.text()}') - suid = factory.SubFactory(SourceUniqueIdentifierFactory) - sha256 = factory.LazyAttribute(lambda r: hashlib.sha256(r.datum.encode()).hexdigest()) +class SiteBannerFactory(DjangoModelFactory): + title = factory.Faker('word') + description = factory.Faker('sentence') + color = fuzzy.FuzzyChoice(list(share_db.SiteBanner.COLOR.keys())) + created_by = factory.SubFactory(ShareUserFactory) + last_modified_by = factory.SubFactory(ShareUserFactory) class Meta: - model = share_db.RawDatum - - @classmethod - def _generate(cls, create, attrs): - raw_datum = super()._generate(create, attrs) - - # HACK: allow overriding auto_now_add on date_created - date_created = attrs.pop('date_created', None) - if date_created is not None: - raw_datum.date_created = date_created - raw_datum.save() - - return raw_datum + model = share_db.SiteBanner class CeleryTaskResultFactory(DjangoModelFactory): @@ -99,15 +88,14 @@ class Meta: model = trove_db.Indexcard -class LatestIndexcardRdfFactory(DjangoModelFactory): - from_raw_datum = factory.SubFactory(RawDatumFactory) +class LatestResourceDescriptionFactory(DjangoModelFactory): indexcard = factory.SubFactory(IndexcardFactory) focus_iri = factory.Sequence(lambda x: f'http://test.example/{x}') rdf_as_turtle = factory.Sequence(lambda x: f'<http://test.example/{x}> a <http://test.example/Thing>') # turtle_checksum_iri = class Meta: - model = trove_db.LatestIndexcardRdf + model = trove_db.LatestResourceDescription class DerivedIndexcardFactory(DjangoModelFactory): diff --git a/tests/share/models/test_rawdata.py
b/tests/share/models/test_rawdata.py deleted file mode 100644 index 4c046f89d..000000000 --- a/tests/share/models/test_rawdata.py +++ /dev/null @@ -1,94 +0,0 @@ -import datetime -import pytest -import hashlib - -from django.core import exceptions -from django.db.utils import IntegrityError - -from share.models import RawDatum - - -def get_now(): - return datetime.datetime.now(tz=datetime.timezone.utc) - - -@pytest.mark.django_db -class TestRawDatum: - - def test_doesnt_mangle_data(self, suid): - rd = RawDatum(suid=suid, datum='This is just some data') - rd.save() - - assert RawDatum.objects.first().datum == 'This is just some data' - - def test_must_have_data(self, suid): - rd = RawDatum(suid) - - with pytest.raises(exceptions.ValidationError) as e: - rd.clean_fields() - rd.save() - - assert 'This field cannot be blank.' == e.value.message_dict['datum'][0] - - def test_must_have_suid(self): - rd = RawDatum(datum='SomeData') - - with pytest.raises(IntegrityError) as e: - rd.save() - - assert 'null value in column "suid_id"' in e.value.args[0] - - def test_store_data_by_suid(self, suid): - _now = get_now() - rd = RawDatum.objects.store_datum_for_suid( - suid=suid, - datum='mydatums', - mediatype='text/plain', - datestamp=_now, - ) - - assert rd.date_modified is not None - assert rd.date_created is not None - - assert rd.datum == 'mydatums' - assert rd.datestamp == _now - assert rd.suid_id == suid.id - assert rd.sha256 == hashlib.sha256(b'mydatums').hexdigest() - - def test_store_data_dedups_simple(self, suid): - rd1 = RawDatum.objects.store_datum_for_suid( - suid=suid, - datum='mydatums', - mediatype='text/plain', - datestamp=get_now(), - ) - rd2 = RawDatum.objects.store_datum_for_suid( - suid=suid, - datum='mydatums', - mediatype='text/plain', - datestamp=get_now(), - ) - rd3 = RawDatum.objects.store_datum_for_suid( - suid=suid, - datum='mydatums', - mediatype='text/plain', - datestamp=get_now(), - ) - - assert rd1.pk == rd2.pk == rd3.pk - assert rd1.sha256 == rd2.sha256 == rd3.sha256 - assert rd1.datestamp < rd2.datestamp < rd3.datestamp < get_now() - assert rd1.date_created == rd2.date_created == rd3.date_created - assert rd1.date_modified < rd2.date_modified < rd3.date_modified - - def test_is_expired(self): - rd = RawDatum() - assert rd.expiration_date is None - assert not rd.is_expired - _today = datetime.date.today() - rd.expiration_date = datetime.date(_today.year - 1, _today.month, _today.day) - assert rd.is_expired - rd.expiration_date = datetime.date(_today.year, _today.month, _today.day) - assert rd.is_expired - rd.expiration_date = datetime.date(_today.year + 1, _today.month, _today.day) - assert not rd.is_expired diff --git a/tests/share/models/test_suid.py b/tests/share/models/test_suid.py deleted file mode 100644 index a6cdcc394..000000000 --- a/tests/share/models/test_suid.py +++ /dev/null @@ -1,55 +0,0 @@ -import pytest - -from tests.factories import ( - RawDatumFactory, - SourceUniqueIdentifierFactory, -) - - -@pytest.mark.django_db -class TestSourceUniqueIdentifier: - - def test_most_recent_raw_datum(self): - suid = SourceUniqueIdentifierFactory() - - RawDatumFactory(suid=suid, datestamp=None, date_created='2021-01-01 00:00Z') - expected = RawDatumFactory(suid=suid, datestamp='2021-01-04 00:00Z') - RawDatumFactory(suid=suid, datestamp='2021-01-01 00:00Z') - RawDatumFactory(suid=suid, datestamp='2021-01-02 00:00Z') - RawDatumFactory(suid=suid, datestamp='2021-01-03 00:00Z') - - actual = suid.most_recent_raw_datum() - assert expected == actual - - def
test_most_recent_raw_datum__datestamp_wins(self): - suid = SourceUniqueIdentifierFactory() - - RawDatumFactory(suid=suid, datestamp='2021-01-01 00:00Z', date_created='2021-01-02 00:00Z') - expected = RawDatumFactory(suid=suid, datestamp='2021-01-02 00:00Z', date_created='2021-01-01 00:00Z') - - actual = suid.most_recent_raw_datum() - assert expected == actual - - def test_most_recent_raw_datum_no_datestamps(self): - suid = SourceUniqueIdentifierFactory() - - expected = RawDatumFactory(suid=suid, datestamp=None, date_created='2021-01-02 00:00Z') - RawDatumFactory(suid=suid, datestamp=None, date_created='2021-01-01 00:00Z') - - actual = suid.most_recent_raw_datum() - assert expected == actual - - def test_date_first_seen(self): - suid = SourceUniqueIdentifierFactory() - - expected = RawDatumFactory(suid=suid).date_created - for _ in range(7): - RawDatumFactory(suid=suid) - - actual = suid.get_date_first_seen() - assert expected == actual - - def test_date_first_seen_when_no_data(self): - suid = SourceUniqueIdentifierFactory() - actual = suid.get_date_first_seen() - assert actual is None diff --git a/tests/share/test_celery.py b/tests/share/test_celery.py index 51b6d2721..af2d49b11 100644 --- a/tests/share/test_celery.py +++ b/tests/share/test_celery.py @@ -1,49 +1,72 @@ -import pytest -import datetime - +import contextlib +from datetime import timedelta from unittest import mock +import pytest from django.utils import timezone from share.celery import TaskResultCleaner, CeleryTaskResult - from tests import factories -@pytest.mark.usefixtures('nested_django_db') -class TestResultArchiver: +@contextlib.contextmanager +def long_now(new_now=None): + _now = new_now or timezone.now() + with mock.patch.object(timezone, 'now', return_value=_now): + yield _now - @pytest.fixture(scope='class', autouse=True) - def task_result_data(self, class_scoped_django_db): - return factories.CeleryTaskResultFactory.create_batch(100) + +@pytest.mark.django_db +class TestResultCleaner: def test_delete_false(self): - trc = TaskResultCleaner(datetime.timedelta(weeks=520), delete=False) + factories.CeleryTaskResultFactory.create_batch(10) + trc = TaskResultCleaner(timedelta(weeks=520), delete=False) assert trc.delete_queryset(CeleryTaskResult.objects.all()) == 0 - assert CeleryTaskResult.objects.count() != 0 + assert CeleryTaskResult.objects.count() == 10 def test_delete_queryset(self): - trc = TaskResultCleaner(datetime.timedelta(weeks=520)) - assert trc.delete_queryset(CeleryTaskResult.objects.all()) == 100 + factories.CeleryTaskResultFactory.create_batch(10) + trc = TaskResultCleaner(timedelta(weeks=520)) + assert trc.delete_queryset(CeleryTaskResult.objects.all()) == 10 assert CeleryTaskResult.objects.count() == 0 - def test_get_ttl_default(self): - trc = TaskResultCleaner(datetime.timedelta(weeks=520)) - assert ((timezone.now() - datetime.timedelta(weeks=520)) - trc.get_ttl('non-existant-task')) < datetime.timedelta(seconds=2) - - def test_get_ttl(self): - trc = TaskResultCleaner(datetime.timedelta(weeks=520)) - trc.TASK_TTLS['existant-task'] = datetime.timedelta(days=1) - assert ((timezone.now() - datetime.timedelta(days=1)) - trc.get_ttl('existant-task')) < datetime.timedelta(seconds=2) - - def test_clean(self): - trc = TaskResultCleaner(0, bucket=mock.Mock()) - factories.CeleryTaskResultFactory.create_batch(100, status='SUCCESS') - trc.clean() - assert CeleryTaskResult.objects.count() <= 100 # There's an autouse fixture that makes 100 - - def test_clean_chunksize(self): - trc = TaskResultCleaner(0,
bucket=mock.Mock(), chunk_size=1) - factories.CeleryTaskResultFactory.create_batch(100, status='SUCCESS') - trc.clean() - assert CeleryTaskResult.objects.count() <= 100 # There's an autouse fixture that makes 100 + def test_success_cutoff(self, settings): + with long_now() as _now: + trc = TaskResultCleaner(timedelta(days=3).total_seconds()) + _expected = _now - timedelta(days=3) + assert trc.success_cutoff == _expected + + def test_nonsuccess_cutoff(self, settings): + with long_now() as _now: + trc = TaskResultCleaner( + success_ttl=timedelta(days=3), + nonsuccess_ttl=timedelta(days=5), + ) + assert trc.success_cutoff == _now - timedelta(days=3) + assert trc.nonsuccess_cutoff == _now - timedelta(days=5) + + @pytest.mark.parametrize('batch_size', [1, 1111]) + def test_clean(self, batch_size): + with long_now() as _now: + with long_now(_now - timedelta(days=7)): + # all should be deleted: + factories.CeleryTaskResultFactory.create_batch(10, status='SUCCESS') + factories.CeleryTaskResultFactory.create_batch(7, status='FAILED') + with long_now(_now - timedelta(days=4)): + # successes should be deleted: + factories.CeleryTaskResultFactory.create_batch(10, status='SUCCESS') + factories.CeleryTaskResultFactory.create_batch(7, status='FAILED') + # none should be deleted: + factories.CeleryTaskResultFactory.create_batch(10, status='SUCCESS') + factories.CeleryTaskResultFactory.create_batch(7, status='FAILED') + # end setup + assert CeleryTaskResult.objects.count() == 51 + trc = TaskResultCleaner( + success_ttl=timedelta(days=3), + nonsuccess_ttl=timedelta(days=5), + chunk_size=batch_size, + ) + trc.clean() + assert CeleryTaskResult.objects.filter(status='SUCCESS').count() == 10 + assert CeleryTaskResult.objects.exclude(status='SUCCESS').count() == 14 diff --git a/tests/share/test_oaipmh_trove.py b/tests/share/test_oaipmh_trove.py index b8bed7421..0bdd7df1b 100644 --- a/tests/share/test_oaipmh_trove.py +++ b/tests/share/test_oaipmh_trove.py @@ -44,9 +44,9 @@ def oai_request(data, request_method, expect_errors=False): class TestOAIVerbs: @pytest.fixture(scope='class') def oai_indexcard(self, class_scoped_django_db): - _latest_indexcard_rdf = factories.LatestIndexcardRdfFactory() + _latest_resource_description = factories.LatestResourceDescriptionFactory() return factories.DerivedIndexcardFactory( - upriver_indexcard=_latest_indexcard_rdf.indexcard, + upriver_indexcard=_latest_resource_description.indexcard, deriver_identifier=trove_db.ResourceIdentifier.objects.get_or_create_for_iri(str(OAI_DC)), derived_text='', ) @@ -165,17 +165,17 @@ def oai_indexcards(self, class_scoped_django_db): trove_db.ResourceIdentifier.objects .get_or_create_for_iri(str(OAI_DC)) ) - _latest_rdfs = [ - factories.LatestIndexcardRdfFactory() + _latest_resource_descriptions = [ + factories.LatestResourceDescriptionFactory() for i in range(17) ] return [ factories.DerivedIndexcardFactory( - upriver_indexcard=_latest_rdf.indexcard, + upriver_indexcard=_latest_resource_description.indexcard, deriver_identifier=_deriver_identifier, derived_text='', ) - for _latest_rdf in _latest_rdfs + for _latest_resource_description in _latest_resource_descriptions ] def test_lists(self, oai_indexcards, monkeypatch): diff --git a/tests/trove/derive/_base.py b/tests/trove/derive/_base.py index bf07e659f..da7cceff6 100644 --- a/tests/trove/derive/_base.py +++ b/tests/trove/derive/_base.py @@ -34,18 +34,17 @@ def run_input_output_test(self, given_input, expected_output): def _get_deriver(self, input_doc: DeriverTestDoc): _mock_suid = mock.Mock() 
_mock_suid.id = '--suid_id--' - _mock_suid.get_date_first_seen.return_value = datetime.datetime(2345, 1, 1) _mock_suid.get_backcompat_sharev2_suid.return_value = _mock_suid _mock_suid.identifier = '--sourceunique-id--' _mock_suid.source_config.label = '--sourceconfig-label--' _mock_suid.source_config.source.long_title = '--source-title--' - _mock_indexcard_rdf = mock.Mock() - _mock_indexcard_rdf.id = '--indexcardf-id--' - _mock_indexcard_rdf.modified = datetime.datetime(2345, 2, 2) - _mock_indexcard_rdf.as_rdfdoc_with_supplements.return_value = rdf.RdfGraph(input_doc.tripledict) - _mock_indexcard_rdf.focus_iri = input_doc.focus_iri - _mock_indexcard_rdf.from_raw_datum_id = '--rawdatum-id--' - _mock_indexcard_rdf.indexcard.id = '--indexcard-id--' - _mock_indexcard_rdf.indexcard.source_record_suid = _mock_suid - return self.deriver_class(_mock_indexcard_rdf) + _mock_resource_description = mock.Mock() + _mock_resource_description.id = '--resdes-id--' + _mock_resource_description.modified = datetime.datetime(2345, 2, 2) + _mock_resource_description.as_rdfdoc_with_supplements.return_value = rdf.RdfGraph(input_doc.tripledict) + _mock_resource_description.focus_iri = input_doc.focus_iri + _mock_resource_description.indexcard.id = '--indexcard-id--' + _mock_resource_description.indexcard.source_record_suid = _mock_suid + _mock_resource_description.indexcard.created = datetime.datetime(2345, 1, 1) + return self.deriver_class(_mock_resource_description) diff --git a/tests/trove/derive/test_osfmap_json_mini.py b/tests/trove/derive/test_osfmap_json_mini.py index 7c7da6a5d..aa54e44ab 100644 --- a/tests/trove/derive/test_osfmap_json_mini.py +++ b/tests/trove/derive/test_osfmap_json_mini.py @@ -154,6 +154,12 @@ class TestOsfmapJsonMiniDeriver(BaseIndexcardDeriverTest): }, 'osfmap-registration': { "@id": "https://osf.example/2c4st", + "accessService": [{ + "@id": "https://osf.example", + "identifier": [{"@value": "https://osf.example"}], + "name": [{"@value": "OSF"}], + "resourceType": [{"@id": "Agent"}, {"@id": "Organization"}], + }], "resourceType": [ {"@id": "Registration"} ], @@ -455,6 +461,10 @@ class TestOsfmapJsonMiniDeriver(BaseIndexcardDeriverTest): ] } ], + "qualifiedAttribution": [{ + "agent": [{"@id": "https://osf.example/bhcjn"}], + "hadRole": [{"@id": "osf:admin-contributor"}], + }], "archivedAt": [ {"@id": "https://archive.example/details/osf-registrations-2c4st-v1"} ], diff --git a/tests/trove/derive/test_sharev2_elastic.py b/tests/trove/derive/test_sharev2_elastic.py index dd0510d14..c7fa87fd0 100644 --- a/tests/trove/derive/test_sharev2_elastic.py +++ b/tests/trove/derive/test_sharev2_elastic.py @@ -46,7 +46,6 @@ def assert_outputs_equal(self, expected, actual): "papers": False, "supplements": False }, - "rawdatum_id": "--rawdatum-id--", "retracted": False, "source_config": "--sourceconfig-label--", "source_unique_id": "--sourceunique-id--", @@ -123,7 +122,6 @@ def assert_outputs_equal(self, expected, actual): "papers": False, "supplements": False }, - "rawdatum_id": "--rawdatum-id--", "retracted": False, "source_config": "--sourceconfig-label--", "source_unique_id": "--sourceunique-id--", @@ -197,7 +195,6 @@ def assert_outputs_equal(self, expected, actual): "publishers": [ "OSF Registries" ], - "rawdatum_id": "--rawdatum-id--", "retracted": False, "source_config": "--sourceconfig-label--", "source_unique_id": "--sourceunique-id--", diff --git a/tests/trove/digestive_tract/test_derive.py b/tests/trove/digestive_tract/test_derive.py index f606ead1a..ec1f6b40a 100644 --- 
a/tests/trove/digestive_tract/test_derive.py +++ b/tests/trove/digestive_tract/test_derive.py @@ -14,12 +14,11 @@ class TestDigestiveTractDerive(TestCase): def setUpTestData(cls): cls.focus_iri = _BLARG.this _focus_ident = trove_db.ResourceIdentifier.objects.get_or_create_for_iri(cls.focus_iri) - _raw = factories.RawDatumFactory() - cls.indexcard = trove_db.Indexcard.objects.create(source_record_suid=_raw.suid) + _suid = factories.SourceUniqueIdentifierFactory() + cls.indexcard = trove_db.Indexcard.objects.create(source_record_suid=_suid) cls.indexcard.focus_identifier_set.add(_focus_ident) - cls.latest_rdf = trove_db.LatestIndexcardRdf.objects.create( + cls.latest_resource_description = trove_db.LatestResourceDescription.objects.create( indexcard=cls.indexcard, - from_raw_datum=_raw, focus_iri=cls.focus_iri, rdf_as_turtle='''@prefix blarg: <http://blarg.example/vocab/> . blarg:this @@ -39,13 +38,10 @@ def test_derive(self): }) def test_derive_with_supplementary(self): - _supp_raw = factories.RawDatumFactory( - suid=factories.SourceUniqueIdentifierFactory(is_supplementary=True), - ) - trove_db.SupplementaryIndexcardRdf.objects.create( + _supp_suid = factories.SourceUniqueIdentifierFactory(is_supplementary=True) + trove_db.SupplementaryResourceDescription.objects.create( indexcard=self.indexcard, - from_raw_datum=_supp_raw, - supplementary_suid=_supp_raw.suid, + supplementary_suid=_supp_suid, focus_iri=self.focus_iri, rdf_as_turtle='''@prefix blarg: <http://blarg.example/vocab/> . blarg:this blarg:like blarg:another ; blarg:unlike blarg:nonthing . diff --git a/tests/trove/digestive_tract/test_expel.py index 88a2d6f47..7f2345eb2 100644 --- a/tests/trove/digestive_tract/test_expel.py +++ b/tests/trove/digestive_tract/test_expel.py @@ -4,10 +4,16 @@ from django.test import TestCase from share import models as share_db -from tests import factories +from tests.trove.factories import ( + create_indexcard, + create_supplement, +) from trove import digestive_tract from trove import models as trove_db -from trove.vocab.namespaces import BLARG +from trove.vocab.namespaces import ( + BLARG, + TROVE, +) class TestDigestiveTractExpel(TestCase): @@ -15,9 +21,12 @@ class TestDigestiveTractExpel(TestCase): def setUpTestData(cls): cls.focus_1 = BLARG.this1 cls.focus_2 = BLARG.this2 - cls.raw_1, cls.indexcard_1 = _setup_ingested(cls.focus_1) - cls.raw_2, cls.indexcard_2 = _setup_ingested(cls.focus_2) - cls.raw_supp = _setup_supplementary(cls.focus_1, cls.raw_1.suid, cls.indexcard_1) + cls.indexcard_1 = create_indexcard(cls.focus_1, deriver_iris=[TROVE['derive/osfmap_json']]) + cls.indexcard_2 = create_indexcard(cls.focus_2, deriver_iris=[TROVE['derive/osfmap_json']]) + cls.suid_1 = cls.indexcard_1.source_record_suid + cls.suid_2 = cls.indexcard_2.source_record_suid + cls.supp = create_supplement(cls.indexcard_1, cls.focus_1) + cls.supp_suid = cls.supp.supplementary_suid def setUp(self): super().setUp() @@ -43,13 +52,12 @@ def test_setup(self): self.assertIsNone(self.indexcard_1.deleted) self.assertIsNone(self.indexcard_2.deleted) self.assertEqual(share_db.SourceUniqueIdentifier.objects.count(), 3) - self.assertEqual(share_db.RawDatum.objects.count(), 3) - self.assertIsNotNone(self.indexcard_1.latest_rdf) - self.assertIsNotNone(self.indexcard_2.latest_rdf) - self.assertEqual(self.indexcard_1.archived_rdf_set.count(), 1) - self.assertEqual(self.indexcard_2.archived_rdf_set.count(), 1) - self.assertEqual(self.indexcard_1.supplementary_rdf_set.count(), 1) - self.assertEqual(self.indexcard_2.supplementary_rdf_set.count(), 0) +
self.assertIsNotNone(self.indexcard_1.latest_resource_description) + self.assertIsNotNone(self.indexcard_2.latest_resource_description) + self.assertEqual(self.indexcard_1.archived_description_set.count(), 1) + self.assertEqual(self.indexcard_2.archived_description_set.count(), 1) + self.assertEqual(self.indexcard_1.supplementary_description_set.count(), 1) + self.assertEqual(self.indexcard_2.supplementary_description_set.count(), 0) self.assertEqual(self.indexcard_1.derived_indexcard_set.count(), 1) self.assertEqual(self.indexcard_2.derived_indexcard_set.count(), 1) # neither notified indexes nor enqueued re-derive @@ -58,25 +66,24 @@ def test_setup(self): def test_expel(self): with mock.patch('trove.digestive_tract.expel_suid') as _mock_expel_suid: - _user = self.raw_1.suid.source_config.source.user - digestive_tract.expel(from_user=_user, record_identifier=self.raw_1.suid.identifier) - _mock_expel_suid.assert_called_once_with(self.raw_1.suid) + _user = self.suid_1.source_config.source.user + digestive_tract.expel(from_user=_user, record_identifier=self.suid_1.identifier) + _mock_expel_suid.assert_called_once_with(self.suid_1) def test_expel_suid(self): - digestive_tract.expel_suid(self.raw_1.suid) + digestive_tract.expel_suid(self.suid_1) self.indexcard_1.refresh_from_db() self.indexcard_2.refresh_from_db() self.assertIsNotNone(self.indexcard_1.deleted) self.assertIsNone(self.indexcard_2.deleted) self.assertEqual(share_db.SourceUniqueIdentifier.objects.count(), 3) - self.assertEqual(share_db.RawDatum.objects.count(), 3) - with self.assertRaises(trove_db.LatestIndexcardRdf.DoesNotExist): - self.indexcard_1.latest_rdf # deleted - self.assertIsNotNone(self.indexcard_2.latest_rdf) - self.assertEqual(self.indexcard_1.archived_rdf_set.count(), 1) # not deleted - self.assertEqual(self.indexcard_2.archived_rdf_set.count(), 1) - self.assertEqual(self.indexcard_1.supplementary_rdf_set.count(), 1) # not deleted - self.assertEqual(self.indexcard_2.supplementary_rdf_set.count(), 0) + with self.assertRaises(trove_db.LatestResourceDescription.DoesNotExist): + self.indexcard_1.latest_resource_description # deleted + self.assertIsNotNone(self.indexcard_2.latest_resource_description) + self.assertEqual(self.indexcard_1.archived_description_set.count(), 1) # not deleted + self.assertEqual(self.indexcard_2.archived_description_set.count(), 1) + self.assertEqual(self.indexcard_1.supplementary_description_set.count(), 1) # not deleted + self.assertEqual(self.indexcard_2.supplementary_description_set.count(), 0) self.assertEqual(self.indexcard_1.derived_indexcard_set.count(), 0) # deleted self.assertEqual(self.indexcard_2.derived_indexcard_set.count(), 1) # notified indexes of update; did not enqueue re-derive @@ -84,19 +91,18 @@ def test_expel_suid(self): self.mock_derive_task.delay.assert_not_called() def test_expel_supplementary_suid(self): - digestive_tract.expel_suid(self.raw_supp.suid) + digestive_tract.expel_suid(self.supp_suid) self.indexcard_1.refresh_from_db() self.indexcard_2.refresh_from_db() self.assertIsNone(self.indexcard_1.deleted) self.assertIsNone(self.indexcard_2.deleted) self.assertEqual(share_db.SourceUniqueIdentifier.objects.count(), 3) - self.assertEqual(share_db.RawDatum.objects.count(), 3) - self.assertIsNotNone(self.indexcard_1.latest_rdf) - self.assertIsNotNone(self.indexcard_2.latest_rdf) - self.assertEqual(self.indexcard_1.archived_rdf_set.count(), 1) - self.assertEqual(self.indexcard_2.archived_rdf_set.count(), 1) - self.assertEqual(self.indexcard_1.supplementary_rdf_set.count(), 
0) # deleted - self.assertEqual(self.indexcard_2.supplementary_rdf_set.count(), 0) + self.assertIsNotNone(self.indexcard_1.latest_resource_description) + self.assertIsNotNone(self.indexcard_2.latest_resource_description) + self.assertEqual(self.indexcard_1.archived_description_set.count(), 1) + self.assertEqual(self.indexcard_2.archived_description_set.count(), 1) + self.assertEqual(self.indexcard_1.supplementary_description_set.count(), 0) # deleted + self.assertEqual(self.indexcard_2.supplementary_description_set.count(), 0) self.assertEqual(self.indexcard_1.derived_indexcard_set.count(), 1) self.assertEqual(self.indexcard_2.derived_indexcard_set.count(), 1) # did not notify indexes of update; did enqueue re-derive @@ -110,22 +116,22 @@ def test_expel_expired_task(self): def test_expel_expired(self): _today = datetime.date.today() - self.raw_2.expiration_date = _today - self.raw_2.save() + _latest = self.indexcard_2.latest_resource_description + _latest.expiration_date = _today + _latest.save() digestive_tract.expel_expired_data(_today) self.indexcard_1.refresh_from_db() self.indexcard_2.refresh_from_db() self.assertIsNone(self.indexcard_1.deleted) self.assertIsNotNone(self.indexcard_2.deleted) # marked deleted self.assertEqual(share_db.SourceUniqueIdentifier.objects.count(), 3) - self.assertEqual(share_db.RawDatum.objects.count(), 3) - self.assertIsNotNone(self.indexcard_1.latest_rdf) - with self.assertRaises(trove_db.LatestIndexcardRdf.DoesNotExist): - self.indexcard_2.latest_rdf # deleted - self.assertEqual(self.indexcard_1.archived_rdf_set.count(), 1) - self.assertEqual(self.indexcard_2.archived_rdf_set.count(), 1) # not deleted - self.assertEqual(self.indexcard_1.supplementary_rdf_set.count(), 1) - self.assertEqual(self.indexcard_2.supplementary_rdf_set.count(), 0) # deleted + self.assertIsNotNone(self.indexcard_1.latest_resource_description) + with self.assertRaises(trove_db.LatestResourceDescription.DoesNotExist): + self.indexcard_2.latest_resource_description # deleted + self.assertEqual(self.indexcard_1.archived_description_set.count(), 1) + self.assertEqual(self.indexcard_2.archived_description_set.count(), 1) # not deleted + self.assertEqual(self.indexcard_1.supplementary_description_set.count(), 1) + self.assertEqual(self.indexcard_2.supplementary_description_set.count(), 0) # deleted self.assertEqual(self.indexcard_1.derived_indexcard_set.count(), 1) self.assertEqual(self.indexcard_2.derived_indexcard_set.count(), 0) # deleted # notified indexes of update; did not enqueue re-derive @@ -134,71 +140,22 @@ def test_expel_expired(self): def test_expel_expired_supplement(self): _today = datetime.date.today() - self.raw_supp.expiration_date = _today - self.raw_supp.save() + self.supp.expiration_date = _today + self.supp.save() digestive_tract.expel_expired_data(_today) self.indexcard_1.refresh_from_db() self.indexcard_2.refresh_from_db() self.assertIsNone(self.indexcard_1.deleted) self.assertIsNone(self.indexcard_2.deleted) self.assertEqual(share_db.SourceUniqueIdentifier.objects.count(), 3) - self.assertEqual(share_db.RawDatum.objects.count(), 3) - self.assertIsNotNone(self.indexcard_1.latest_rdf) - self.assertIsNotNone(self.indexcard_2.latest_rdf) - self.assertEqual(self.indexcard_1.archived_rdf_set.count(), 1) - self.assertEqual(self.indexcard_2.archived_rdf_set.count(), 1) - self.assertEqual(self.indexcard_1.supplementary_rdf_set.count(), 0) # deleted - self.assertEqual(self.indexcard_2.supplementary_rdf_set.count(), 0) + 
self.assertIsNotNone(self.indexcard_1.latest_resource_description) + self.assertIsNotNone(self.indexcard_2.latest_resource_description) + self.assertEqual(self.indexcard_1.archived_description_set.count(), 1) + self.assertEqual(self.indexcard_2.archived_description_set.count(), 1) + self.assertEqual(self.indexcard_1.supplementary_description_set.count(), 0) # deleted + self.assertEqual(self.indexcard_2.supplementary_description_set.count(), 0) self.assertEqual(self.indexcard_1.derived_indexcard_set.count(), 1) self.assertEqual(self.indexcard_2.derived_indexcard_set.count(), 1) # did not notify indexes of update; did enqueue re-derive self.assertEqual(self.notified_indexcard_ids, set()) self.mock_derive_task.delay.assert_called_once_with(self.indexcard_1.id) - - -def _setup_ingested(focus_iri: str): - _focus_ident = trove_db.ResourceIdentifier.objects.get_or_create_for_iri(focus_iri) - _suid = factories.SourceUniqueIdentifierFactory( - focus_identifier=_focus_ident, - ) - _raw = factories.RawDatumFactory(suid=_suid) - _indexcard = trove_db.Indexcard.objects.create(source_record_suid=_raw.suid) - _indexcard.focus_identifier_set.add(_focus_ident) - _latest_rdf = trove_db.LatestIndexcardRdf.objects.create( - indexcard=_indexcard, - from_raw_datum=_raw, - focus_iri=focus_iri, - rdf_as_turtle='...', - ) - trove_db.ArchivedIndexcardRdf.objects.create( - indexcard=_indexcard, - from_raw_datum=_raw, - focus_iri=focus_iri, - rdf_as_turtle=_latest_rdf.rdf_as_turtle, - ) - _deriver_iri = BLARG.deriver - _deriver_ident = trove_db.ResourceIdentifier.objects.get_or_create_for_iri(_deriver_iri) - trove_db.DerivedIndexcard.objects.create( - upriver_indexcard=_indexcard, - deriver_identifier=_deriver_ident, - derived_checksum_iri='...', - derived_text='...', - ) - return _raw, _indexcard - - -def _setup_supplementary(focus_iri, main_suid, indexcard): - _supp_suid = factories.SourceUniqueIdentifierFactory( - focus_identifier=main_suid.focus_identifier, - source_config=main_suid.source_config, - is_supplementary=True, - ) - _supp_raw = factories.RawDatumFactory(suid=_supp_suid) - trove_db.SupplementaryIndexcardRdf.objects.create( - indexcard=indexcard, - from_raw_datum=_supp_raw, - supplementary_suid=_supp_suid, - focus_iri=focus_iri, - rdf_as_turtle='...', - ) - return _supp_raw diff --git a/tests/trove/digestive_tract/test_extract.py index 57afd3ca0..393c857ae 100644 --- a/tests/trove/digestive_tract/test_extract.py +++ b/tests/trove/digestive_tract/test_extract.py @@ -6,44 +6,45 @@ from trove import digestive_tract from trove import exceptions as trove_exceptions from trove import models as trove_db +from trove.vocab import mediatypes from trove.vocab.namespaces import BLARG as _BLARG class TestDigestiveTractExtract(TestCase): @classmethod def setUpTestData(cls): - _focus_ident = trove_db.ResourceIdentifier.objects.get_or_create_for_iri(_BLARG.this) - cls.raw = factories.RawDatumFactory( - mediatype='text/turtle', - datum='''@prefix blarg: <http://blarg.example/vocab/> . + cls.user = factories.ShareUserFactory() + cls.focus_iri = _BLARG.this + cls.suid = digestive_tract.sniff(from_user=cls.user, focus_iri=cls.focus_iri) + cls.raw_turtle = '''@prefix blarg: <http://blarg.example/vocab/> . blarg:this a blarg:Thing ; blarg:like blarg:that .
-''', - suid__focus_identifier=_focus_ident, +''' + cls.supp_suid = digestive_tract.sniff( + from_user=cls.user, + focus_iri=cls.focus_iri, + record_identifier=f'supp:{cls.focus_iri}', + is_supplementary=True, + ) - cls.supplementary_raw = factories.RawDatumFactory( - mediatype='text/turtle', - datum='''@prefix blarg: <http://blarg.example/vocab/> . + cls.supp_raw_turtle = '''@prefix blarg: <http://blarg.example/vocab/> . blarg:this blarg:like blarg:another ; blarg:unlike blarg:nonthing . -''', - suid=factories.SourceUniqueIdentifierFactory( - source_config=cls.raw.suid.source_config, - focus_identifier=cls.raw.suid.focus_identifier, - is_supplementary=True, - ), - ) +''' def test_setup(self): self.assertEqual(trove_db.Indexcard.objects.all().count(), 0) - self.assertEqual(trove_db.LatestIndexcardRdf.objects.all().count(), 0) - self.assertEqual(trove_db.ArchivedIndexcardRdf.objects.all().count(), 0) - self.assertEqual(trove_db.SupplementaryIndexcardRdf.objects.all().count(), 0) + self.assertEqual(trove_db.LatestResourceDescription.objects.all().count(), 0) + self.assertEqual(trove_db.ArchivedResourceDescription.objects.all().count(), 0) + self.assertEqual(trove_db.SupplementaryResourceDescription.objects.all().count(), 0) def test_extract(self): - (_indexcard,) = digestive_tract.extract(self.raw) - self.assertEqual(_indexcard.source_record_suid_id, self.raw.suid_id) + (_indexcard,) = digestive_tract.extract( + suid=self.suid, + record_mediatype=mediatypes.TURTLE, + raw_record=self.raw_turtle, + ) + self.assertEqual(_indexcard.source_record_suid_id, self.suid.id) _focus_idents = list( _indexcard.focus_identifier_set.values_list('sufficiently_unique_iri', flat=True), ) @@ -52,50 +53,114 @@ def test_extract(self): _indexcard.focustype_identifier_set.values_list('sufficiently_unique_iri',
flat=True), + ) + self.assertEqual(_focustype_idents, ['://blarg.example/vocab/Thing']) + self.assertEqual(list(_indexcard.supplementary_description_set.all()), []) + _latest_resource_description = _indexcard.latest_resource_description + self.assertEqual(_latest_resource_description.indexcard_id, _indexcard.id) + self.assertEqual(_latest_resource_description.focus_iri, _BLARG.this) + self.assertEqual(_latest_resource_description.expiration_date, _expir) + self.assertEqual(_latest_resource_description.as_rdf_tripledict(), { + _BLARG.this: { + rdf.RDF.type: {_BLARG.Thing}, + _BLARG.like: {_BLARG.that}, + }, + }) + self.assertEqual(_latest_resource_description.as_rdfdoc_with_supplements().tripledict, { + _BLARG.this: { + rdf.RDF.type: {_BLARG.Thing}, + _BLARG.like: {_BLARG.that}, + }, + }) + + def test_extract_supplement_before_expiration(self): + (_indexcard,) = digestive_tract.extract( + suid=self.suid, + record_mediatype=mediatypes.TURTLE, + raw_record=self.raw_turtle, + ) + _expir = datetime.date.today() + datetime.timedelta(days=5) + (_supped_indexcard,) = digestive_tract.extract( + suid=self.supp_suid, + record_mediatype=mediatypes.TURTLE, + raw_record=self.supp_raw_turtle, + expiration_date=_expir, + ) + self.assertEqual(_indexcard, _supped_indexcard) + (_supp_rdf,) = _indexcard.supplementary_description_set.all() + self.assertEqual(_supp_rdf.expiration_date, _expir) + def test_extract_supplementary_without_prior(self): - _cards = digestive_tract.extract(self.supplementary_raw) + _cards = digestive_tract.extract( + suid=self.supp_suid, + record_mediatype=mediatypes.TURTLE, + raw_record=self.supp_raw_turtle, + ) self.assertEqual(_cards, []) self.assertEqual(trove_db.Indexcard.objects.all().count(), 0) - self.assertEqual(trove_db.LatestIndexcardRdf.objects.all().count(), 0) - self.assertEqual(trove_db.ArchivedIndexcardRdf.objects.all().count(), 0) - self.assertEqual(trove_db.SupplementaryIndexcardRdf.objects.all().count(), 0) + self.assertEqual(trove_db.LatestResourceDescription.objects.all().count(), 0) + self.assertEqual(trove_db.ArchivedResourceDescription.objects.all().count(), 0) + self.assertEqual(trove_db.SupplementaryResourceDescription.objects.all().count(), 0) def test_extract_supplementary(self): - (_orig_indexcard,) = digestive_tract.extract(self.raw) - _orig_timestamp = _orig_indexcard.latest_rdf.modified - (_indexcard,) = digestive_tract.extract(self.supplementary_raw) + (_orig_indexcard,) = digestive_tract.extract( + suid=self.suid, + record_mediatype=mediatypes.TURTLE, + raw_record=self.raw_turtle, + ) + _orig_timestamp = _orig_indexcard.latest_resource_description.modified + (_indexcard,) = digestive_tract.extract( + suid=self.supp_suid, + record_mediatype=mediatypes.TURTLE, + raw_record=self.supp_raw_turtle, + ) self.assertEqual(_orig_indexcard.id, _indexcard.id) - self.assertEqual(_indexcard.source_record_suid_id, self.raw.suid_id) - (_supp_rdf,) = _indexcard.supplementary_rdf_set.all() - self.assertEqual(_supp_rdf.from_raw_datum_id, self.supplementary_raw.id) + self.assertEqual(_indexcard.source_record_suid_id, self.suid.id) + (_supp_rdf,) = _indexcard.supplementary_description_set.all() self.assertEqual(_supp_rdf.indexcard_id, _indexcard.id) self.assertEqual(_supp_rdf.focus_iri, _BLARG.this) + self.assertIsNone(_supp_rdf.expiration_date) self.assertEqual(_supp_rdf.as_rdf_tripledict(), { _BLARG.this: { _BLARG.like: {_BLARG.another}, _BLARG.unlike: {_BLARG.nonthing}, }, }) - self.assertEqual(_indexcard.latest_rdf.modified, _orig_timestamp) - 
self.assertEqual(_indexcard.latest_rdf.as_rdfdoc_with_supplements().tripledict, { + self.assertEqual(_indexcard.latest_resource_description.modified, _orig_timestamp) + self.assertEqual(_indexcard.latest_resource_description.as_rdfdoc_with_supplements().tripledict, { _BLARG.this: { rdf.RDF.type: {_BLARG.Thing}, _BLARG.like: {_BLARG.that, _BLARG.another}, @@ -104,50 +169,65 @@ def test_extract_supplementary(self): }) def test_extract_empty_with_prior(self): - (_prior_indexcard,) = digestive_tract.extract(self.raw) - self.assertFalse(self.raw.no_output) + (_prior_indexcard,) = digestive_tract.extract( + suid=self.suid, + record_mediatype=mediatypes.TURTLE, + raw_record=self.raw_turtle, + ) self.assertIsNone(_prior_indexcard.deleted) - # add a later raw - _empty_raw = factories.RawDatumFactory( - mediatype='text/turtle', - datum=' ', - suid=self.raw.suid, - ) - (_indexcard,) = digestive_tract.extract(_empty_raw) - self.assertTrue(_empty_raw.no_output) + # extract an empty + (_indexcard,) = digestive_tract.extract( + suid=self.suid, + record_mediatype=mediatypes.TURTLE, + raw_record=' ', # no data + ) self.assertEqual(_indexcard.id, _prior_indexcard.id) self.assertIsNotNone(_indexcard.deleted) - with self.assertRaises(trove_db.LatestIndexcardRdf.DoesNotExist): - _indexcard.latest_rdf + with self.assertRaises(trove_db.LatestResourceDescription.DoesNotExist): + _indexcard.latest_resource_description def test_extract_empty_without_prior(self): - _empty_raw = factories.RawDatumFactory( - mediatype='text/turtle', - datum=' ', + _cards = digestive_tract.extract( + suid=self.suid, + record_mediatype=mediatypes.TURTLE, + raw_record=' ', # no data ) - _cards = digestive_tract.extract(_empty_raw) self.assertEqual(_cards, []) - self.assertTrue(_empty_raw.no_output) def test_extract_empty_supplementary(self): - (_orig_indexcard,) = digestive_tract.extract(self.raw) - digestive_tract.extract(self.supplementary_raw) - self.assertTrue(_orig_indexcard.supplementary_rdf_set.exists()) - _empty_raw = factories.RawDatumFactory( - mediatype='text/turtle', - datum='', - suid=self.supplementary_raw.suid, - ) - (_indexcard,) = digestive_tract.extract(_empty_raw) + (_orig_indexcard,) = digestive_tract.extract( + suid=self.suid, + record_mediatype=mediatypes.TURTLE, + raw_record=self.raw_turtle, + ) + digestive_tract.extract( + suid=self.supp_suid, + record_mediatype=mediatypes.TURTLE, + raw_record=self.supp_raw_turtle, + ) + self.assertTrue(_orig_indexcard.supplementary_description_set.exists()) + (_indexcard,) = digestive_tract.extract( + suid=self.supp_suid, + record_mediatype=mediatypes.TURTLE, + raw_record=' ', # no data + ) self.assertEqual(_indexcard.id, _orig_indexcard.id) - self.assertFalse(_orig_indexcard.supplementary_rdf_set.exists()) + self.assertFalse(_orig_indexcard.supplementary_description_set.exists()) - def test_extract_expired(self): - self.raw.expiration_date = datetime.date.today() + def test_extract_after_expiration(self): with self.assertRaises(trove_exceptions.CannotDigestExpiredDatum): - digestive_tract.extract(self.raw) + digestive_tract.extract( + suid=self.suid, + record_mediatype=mediatypes.TURTLE, + raw_record=self.raw_turtle, + expiration_date=datetime.date.today(), + ) - def test_extract_expired_supplement(self): - self.supplementary_raw.expiration_date = datetime.date.today() + def test_extract_supp_after_expiration(self): with self.assertRaises(trove_exceptions.CannotDigestExpiredDatum): - digestive_tract.extract(self.supplementary_raw) + digestive_tract.extract( + 
suid=self.supp_suid, + record_mediatype=mediatypes.TURTLE, + raw_record=self.supp_raw_turtle, + expiration_date=datetime.date.today(), + ) diff --git a/tests/trove/digestive_tract/test_sniff.py b/tests/trove/digestive_tract/test_sniff.py new file mode 100644 index 000000000..c421ac381 --- /dev/null +++ b/tests/trove/digestive_tract/test_sniff.py @@ -0,0 +1,90 @@ +from django.test import TestCase + +from share import models as share_db +from tests import factories +from trove import digestive_tract +from trove import exceptions as trove_exceptions +from trove.vocab.namespaces import BLARG + + +class TestDigestiveTractSniff(TestCase): + @classmethod + def setUpTestData(cls): + cls.user = factories.ShareUserFactory() + + def test_setup(self): + self.assertEqual(share_db.SourceConfig.objects.all().count(), 0) + self.assertEqual(share_db.SourceUniqueIdentifier.objects.all().count(), 0) + + def test_sniff(self): + digestive_tract.sniff( + from_user=self.user, + record_identifier='blarg', + focus_iri=BLARG.this, + ) + (_suid,) = share_db.SourceUniqueIdentifier.objects.all() + self.assertEqual(_suid.identifier, 'blarg') + self.assertEqual(_suid.focus_identifier.sufficiently_unique_iri, '://blarg.example/vocab/this') + self.assertEqual(_suid.source_config.source.user_id, self.user.id) + self.assertFalse(_suid.is_supplementary) + + def test_sniff_implicit_record_identifier(self): + digestive_tract.sniff( + from_user=self.user, + focus_iri=BLARG.this, + ) + (_suid,) = share_db.SourceUniqueIdentifier.objects.all() + self.assertEqual(_suid.identifier, BLARG.this) + self.assertEqual(_suid.focus_identifier.sufficiently_unique_iri, '://blarg.example/vocab/this') + self.assertEqual(_suid.source_config.source.user_id, self.user.id) + self.assertFalse(_suid.is_supplementary) + + def test_sniff_supplementary(self): + digestive_tract.sniff( + from_user=self.user, + record_identifier='blarg', + focus_iri=BLARG.this, + is_supplementary=True, + ) + (_suid,) = share_db.SourceUniqueIdentifier.objects.all() + self.assertEqual(_suid.identifier, 'blarg') + self.assertEqual(_suid.focus_identifier.sufficiently_unique_iri, '://blarg.example/vocab/this') + self.assertEqual(_suid.source_config.source.user_id, self.user.id) + self.assertTrue(_suid.is_supplementary) + + def test_error_focus_iri(self): + with self.assertRaises(trove_exceptions.DigestiveError): + digestive_tract.sniff(from_user=self.user, focus_iri='blam') + with self.assertRaises(trove_exceptions.DigestiveError): + digestive_tract.sniff(from_user=self.user, focus_iri='') + + def test_error_missing_record_identifier(self): + with self.assertRaises(trove_exceptions.DigestiveError): + digestive_tract.sniff(from_user=self.user, focus_iri=BLARG.foo, is_supplementary=True) + + def test_error_change_focus(self): + digestive_tract.sniff( + from_user=self.user, + record_identifier='foo', + focus_iri=BLARG.bar, + ) + with self.assertRaises(trove_exceptions.DigestiveError): + digestive_tract.sniff( + from_user=self.user, + record_identifier='foo', + focus_iri=BLARG.different, + ) + + def test_error_change_supplementariness(self): + digestive_tract.sniff( + from_user=self.user, + focus_iri=BLARG.foo, + record_identifier='foo-supp', + is_supplementary=True, + ) + with self.assertRaises(trove_exceptions.DigestiveError): + digestive_tract.sniff( + from_user=self.user, + focus_iri=BLARG.foo, + record_identifier='foo-supp', + ) diff --git a/tests/trove/digestive_tract/test_swallow.py b/tests/trove/digestive_tract/test_swallow.py deleted file mode 100644 index 
968b8d668..000000000 --- a/tests/trove/digestive_tract/test_swallow.py +++ /dev/null @@ -1,122 +0,0 @@ -import datetime -from unittest import mock -from django.test import TestCase - -from tests import factories -from trove import digestive_tract -from share import models as share_db - - -class TestDigestiveTractSwallow(TestCase): - @classmethod - def setUpTestData(cls): - cls.user = factories.ShareUserFactory() - cls.turtle = ''' -@prefix blarg: <http://blarg.example/vocab/> . -blarg:this - a blarg:Thing ; - blarg:like blarg:that . -''' - - def test_setup(self): - self.assertEqual(share_db.RawDatum.objects.all().count(), 0) - - def test_swallow(self): - with mock.patch('trove.digestive_tract.task__extract_and_derive') as _mock_task: - digestive_tract.swallow( - from_user=self.user, - record=self.turtle, - record_identifier='blarg', - record_mediatype='text/turtle', - focus_iri='http://blarg.example/vocab/this', - ) - (_raw,) = share_db.RawDatum.objects.all() - self.assertEqual(_raw.datum, self.turtle) - self.assertEqual(_raw.mediatype, 'text/turtle') - self.assertIsNone(_raw.expiration_date) - self.assertEqual(_raw.suid.identifier, 'blarg') - self.assertEqual(_raw.suid.focus_identifier.sufficiently_unique_iri, '://blarg.example/vocab/this') - self.assertEqual(_raw.suid.source_config.source.user_id, self.user.id) - self.assertFalse(_raw.suid.is_supplementary) - _mock_task.delay.assert_called_once_with(_raw.id, urgent=False) - - def test_swallow_urgent(self): - with mock.patch('trove.digestive_tract.task__extract_and_derive') as _mock_task: - digestive_tract.swallow( - from_user=self.user, - record=self.turtle, - record_identifier='blarg', - record_mediatype='text/turtle', - focus_iri='http://blarg.example/vocab/this', - urgent=True - ) - (_raw,) = share_db.RawDatum.objects.all() - self.assertEqual(_raw.datum, self.turtle) - self.assertEqual(_raw.mediatype, 'text/turtle') - self.assertIsNone(_raw.expiration_date) - self.assertEqual(_raw.suid.identifier, 'blarg') - self.assertEqual(_raw.suid.focus_identifier.sufficiently_unique_iri, '://blarg.example/vocab/this') - self.assertEqual(_raw.suid.source_config.source.user_id, self.user.id) - self.assertFalse(_raw.suid.is_supplementary) - _mock_task.delay.assert_called_once_with(_raw.id, urgent=True) - - def test_swallow_supplementary(self): - with mock.patch('trove.digestive_tract.task__extract_and_derive') as _mock_task: - digestive_tract.swallow( - from_user=self.user, - record=self.turtle, - record_identifier='blarg', - record_mediatype='text/turtle', - focus_iri='http://blarg.example/vocab/this', - is_supplementary=True, - ) - (_raw,) = share_db.RawDatum.objects.all() - self.assertEqual(_raw.datum, self.turtle) - self.assertEqual(_raw.mediatype, 'text/turtle') - self.assertIsNone(_raw.expiration_date) - self.assertEqual(_raw.suid.identifier, 'blarg') - self.assertEqual(_raw.suid.focus_identifier.sufficiently_unique_iri, '://blarg.example/vocab/this') - self.assertEqual(_raw.suid.source_config.source.user_id, self.user.id) - self.assertTrue(_raw.suid.is_supplementary) - _mock_task.delay.assert_called_once_with(_raw.id, urgent=False) - - def test_swallow_with_expiration(self): - with mock.patch('trove.digestive_tract.task__extract_and_derive') as _mock_task: - digestive_tract.swallow( - from_user=self.user, - record=self.turtle, - record_identifier='blarg', - record_mediatype='text/turtle', - focus_iri='http://blarg.example/vocab/this', - expiration_date=datetime.date(2048, 1, 3), - ) - (_raw,) = share_db.RawDatum.objects.all() - self.assertEqual(_raw.datum, self.turtle) -
self.assertEqual(_raw.mediatype, 'text/turtle') - self.assertEqual(_raw.expiration_date, datetime.date(2048, 1, 3)) - self.assertEqual(_raw.suid.identifier, 'blarg') - self.assertEqual(_raw.suid.focus_identifier.sufficiently_unique_iri, '://blarg.example/vocab/this') - self.assertEqual(_raw.suid.source_config.source.user_id, self.user.id) - self.assertFalse(_raw.suid.is_supplementary) - _mock_task.delay.assert_called_once_with(_raw.id, urgent=False) - - def test_swallow_supplementary_with_expiration(self): - with mock.patch('trove.digestive_tract.task__extract_and_derive') as _mock_task: - digestive_tract.swallow( - from_user=self.user, - record=self.turtle, - record_identifier='blarg', - record_mediatype='text/turtle', - focus_iri='http://blarg.example/vocab/this', - is_supplementary=True, - expiration_date=datetime.date(2047, 1, 3), - ) - (_raw,) = share_db.RawDatum.objects.all() - self.assertEqual(_raw.datum, self.turtle) - self.assertEqual(_raw.mediatype, 'text/turtle') - self.assertEqual(_raw.expiration_date, datetime.date(2047, 1, 3)) - self.assertEqual(_raw.suid.identifier, 'blarg') - self.assertEqual(_raw.suid.focus_identifier.sufficiently_unique_iri, '://blarg.example/vocab/this') - self.assertEqual(_raw.suid.source_config.source.user_id, self.user.id) - self.assertTrue(_raw.suid.is_supplementary) - _mock_task.delay.assert_called_once_with(_raw.id, urgent=False) diff --git a/tests/trove/factories.py b/tests/trove/factories.py index 1a7d4b31b..406fb5599 100644 --- a/tests/trove/factories.py +++ b/tests/trove/factories.py @@ -1,4 +1,6 @@ from collections.abc import Collection +import time +import uuid from tests import factories @@ -6,6 +8,7 @@ from trove import models as trove_db from trove import digestive_tract +from trove.vocab.namespaces import BLARG __all__ = ( @@ -17,17 +20,21 @@ def create_indexcard( - focus_iri: str, + focus_iri: str | None = None, rdf_twopledict: rdf.RdfTwopleDictionary | None = None, rdf_tripledict: rdf.RdfTripleDictionary | None = None, deriver_iris: Collection[str] = (), ) -> trove_db.Indexcard: - _suid = factories.SourceUniqueIdentifierFactory() + _focus_iri = focus_iri or BLARG[str(uuid.uuid4())] + _focus_ident = trove_db.ResourceIdentifier.objects.get_or_create_for_iri(_focus_iri) + _suid = factories.SourceUniqueIdentifierFactory( + focus_identifier=_focus_ident, + ) _indexcard = trove_db.Indexcard.objects.create(source_record_suid=_suid) _indexcard.focus_identifier_set.add( - trove_db.ResourceIdentifier.objects.get_or_create_for_iri(focus_iri), + trove_db.ResourceIdentifier.objects.get_or_create_for_iri(_focus_iri), ) - update_indexcard_content(_indexcard, focus_iri, rdf_twopledict, rdf_tripledict) + update_indexcard_content(_indexcard, _focus_iri, rdf_twopledict, rdf_tripledict) if deriver_iris: digestive_tract.derive(_indexcard, deriver_iris) return _indexcard @@ -35,25 +42,13 @@ def create_indexcard( def update_indexcard_content( indexcard: trove_db.Indexcard, - focus_iri: str, + focus_iri: str | None = None, rdf_twopledict: rdf.RdfTwopleDictionary | None = None, rdf_tripledict: rdf.RdfTripleDictionary | None = None, ) -> None: - _card_content = _combined_tripledict(focus_iri, rdf_twopledict, rdf_tripledict) - _card_content_turtle = rdf.turtle_from_tripledict(_card_content) - _raw = factories.RawDatumFactory(suid=indexcard.source_record_suid, datum=_card_content_turtle) - indexcard.focus_identifier_set.add( - trove_db.ResourceIdentifier.objects.get_or_create_for_iri(focus_iri), - ) - trove_db.LatestIndexcardRdf.objects.update_or_create( - 
indexcard=indexcard, - defaults={ - 'from_raw_datum': _raw, - 'focus_iri': focus_iri, - 'rdf_as_turtle': _card_content_turtle, - 'turtle_checksum_iri': 'foo', # not enforced - }, - ) + _focus_iri = focus_iri or indexcard.latest_resource_description.focus_iri + _card_content = _combined_tripledict(_focus_iri, rdf_twopledict, rdf_tripledict) + indexcard.update_resource_description(_focus_iri, _card_content) def create_supplement( @@ -61,18 +56,17 @@ def create_supplement( focus_iri: str, rdf_twopledict: rdf.RdfTwopleDictionary | None = None, rdf_tripledict: rdf.RdfTripleDictionary | None = None, -) -> trove_db.SupplementaryIndexcardRdf: - _supp_suid = factories.SourceUniqueIdentifierFactory() - _supp_content = _combined_tripledict(focus_iri, rdf_twopledict, rdf_tripledict) - _supp_content_turtle = rdf.turtle_from_tripledict(_supp_content) - _supp_raw = factories.RawDatumFactory(suid=_supp_suid, datum=_supp_content_turtle) - return trove_db.SupplementaryIndexcardRdf.objects.create( - from_raw_datum=_supp_raw, - indexcard=indexcard, - supplementary_suid=_supp_suid, - focus_iri=focus_iri, - rdf_as_turtle=_supp_content_turtle, - turtle_checksum_iri='sup', # not enforced +) -> trove_db.SupplementaryResourceDescription: + _main_suid = indexcard.source_record_suid + _supp_suid = factories.SourceUniqueIdentifierFactory( + focus_identifier=_main_suid.focus_identifier, + source_config=_main_suid.source_config, + is_supplementary=True, + ) + return indexcard.update_supplementary_description( + _supp_suid, + focus_iri, + _combined_tripledict(focus_iri, rdf_twopledict, rdf_tripledict), ) @@ -99,4 +93,8 @@ def _combined_tripledict( _graph.add_twopledict(focus_iri, rdf_twopledict) if rdf_tripledict is not None: _graph.add_tripledict(rdf_tripledict) - return _graph.tripledict + return _graph.tripledict or { + focus_iri: { + BLARG.timeNonce: {rdf.literal(time.time_ns())}, + }, + } diff --git a/tests/trove/views/test_ingest.py b/tests/trove/views/test_ingest.py index fc3f5d464..18e8e4995 100644 --- a/tests/trove/views/test_ingest.py +++ b/tests/trove/views/test_ingest.py @@ -27,15 +27,16 @@ def test_post(self): HTTP_AUTHORIZATION=self.user.authorization(), ) self.assertEqual(_resp.status_code, HTTPStatus.CREATED) - _mock_tract.swallow.assert_called_once_with( + _mock_tract.ingest.assert_called_once_with( from_user=self.user, - record='turtleturtleturtle', + raw_record='turtleturtleturtle', record_identifier='blarg', record_mediatype='text/turtle', focus_iri='https://foo.example/blarg', urgent=True, is_supplementary=False, expiration_date=None, + restore_deleted=True, ) def test_post_nonurgent(self): @@ -51,15 +52,16 @@ def test_post_nonurgent(self): HTTP_AUTHORIZATION=self.user.authorization(), ) self.assertEqual(_resp.status_code, HTTPStatus.CREATED) - _mock_tract.swallow.assert_called_once_with( + _mock_tract.ingest.assert_called_once_with( from_user=self.user, - record='turtleturtleturtle', + raw_record='turtleturtleturtle', record_identifier='blarg', record_mediatype='text/turtle', focus_iri='https://foo.example/blarg', urgent=False, is_supplementary=False, expiration_date=None, + restore_deleted=True, ) def test_post_supplementary(self): @@ -75,15 +77,16 @@ def test_post_supplementary(self): HTTP_AUTHORIZATION=self.user.authorization(), ) self.assertEqual(_resp.status_code, HTTPStatus.CREATED) - _mock_tract.swallow.assert_called_once_with( + _mock_tract.ingest.assert_called_once_with( from_user=self.user, - record='turtleturtleturtle', + raw_record='turtleturtleturtle', record_identifier='blarg', 
record_mediatype='text/turtle', focus_iri='https://foo.example/blarg', urgent=True, is_supplementary=True, expiration_date=None, + restore_deleted=True, ) def test_post_with_expiration(self): @@ -100,15 +103,16 @@ def test_post_with_expiration(self): HTTP_AUTHORIZATION=self.user.authorization(), ) self.assertEqual(_resp.status_code, HTTPStatus.CREATED) - _mock_tract.swallow.assert_called_once_with( + _mock_tract.ingest.assert_called_once_with( from_user=self.user, - record='turtleturtleturtle', + raw_record='turtleturtleturtle', record_identifier='blarg', record_mediatype='text/turtle', focus_iri='https://foo.example/blarg', urgent=True, is_supplementary=True, expiration_date=datetime.date(2055, 5, 5), + restore_deleted=True, ) def test_delete(self): @@ -135,7 +139,7 @@ def test_anonymous_post(self): data='turtleturtleturtle', ) self.assertEqual(_resp.status_code, HTTPStatus.UNAUTHORIZED) - self.assertFalse(_mock_tract.swallow.called) + self.assertFalse(_mock_tract.ingest.called) def test_nontrusted_post(self): with patch_feature_flag(FeatureFlag.FORBID_UNTRUSTED_FEED): @@ -152,7 +156,7 @@ def test_nontrusted_post(self): HTTP_AUTHORIZATION=_nontrusted_user.authorization(), ) self.assertEqual(_resp.status_code, HTTPStatus.FORBIDDEN) - self.assertFalse(_mock_tract.swallow.called) + self.assertFalse(_mock_tract.ingest.called) def test_anonymous_delete(self): with mock.patch('trove.views.ingest.digestive_tract') as _mock_tract: @@ -185,4 +189,4 @@ def test_invalid_expiration_date(self): HTTP_AUTHORIZATION=self.user.authorization(), ) self.assertEqual(_resp.status_code, HTTPStatus.BAD_REQUEST) - self.assertFalse(_mock_tract.swallow.called) + self.assertFalse(_mock_tract.ingest.called) diff --git a/trove/admin.py b/trove/admin.py index 8db71772e..5ef20eac3 100644 --- a/trove/admin.py +++ b/trove/admin.py @@ -5,12 +5,12 @@ from share.admin.util import TimeLimitedPaginator, linked_fk, linked_many from share.search.index_messenger import IndexMessenger from trove.models import ( - ArchivedIndexcardRdf, + ArchivedResourceDescription, DerivedIndexcard, Indexcard, - LatestIndexcardRdf, + LatestResourceDescription, ResourceIdentifier, - SupplementaryIndexcardRdf, + SupplementaryResourceDescription, ) @@ -30,10 +30,10 @@ class ResourceIdentifierAdmin(admin.ModelAdmin): @admin.register(Indexcard, site=admin_site) -@linked_many('archived_rdf_set', defer=('rdf_as_turtle',)) -@linked_many('supplementary_rdf_set', defer=('rdf_as_turtle',)) +@linked_many('archived_description_set', defer=('rdf_as_turtle',)) +@linked_many('supplementary_description_set', defer=('rdf_as_turtle',)) @linked_many('derived_indexcard_set', defer=('derived_text',)) -@linked_fk('latest_rdf') +@linked_fk('latest_resource_description') @linked_fk('source_record_suid') @linked_many('focustype_identifier_set') @linked_many('focus_identifier_set') @@ -57,10 +57,9 @@ def _freshen_index(self, request, queryset): _freshen_index.short_description = 'freshen indexcard in search index' -@admin.register(LatestIndexcardRdf, site=admin_site) -@linked_fk('from_raw_datum') +@admin.register(LatestResourceDescription, site=admin_site) @linked_fk('indexcard') -class LatestIndexcardRdfAdmin(admin.ModelAdmin): +class LatestResourceDescriptionAdmin(admin.ModelAdmin): readonly_fields = ( 'created', 'modified', @@ -79,10 +78,9 @@ def rdf_as_turtle__pre(self, instance): rdf_as_turtle__pre.short_description = 'rdf as turtle' -@admin.register(ArchivedIndexcardRdf, site=admin_site) -@linked_fk('from_raw_datum') +@admin.register(ArchivedResourceDescription, 
site=admin_site) @linked_fk('indexcard') -class ArchivedIndexcardRdfAdmin(admin.ModelAdmin): +class ArchivedResourceDescriptionAdmin(admin.ModelAdmin): readonly_fields = ( 'created', 'modified', @@ -92,8 +90,8 @@ class ArchivedIndexcardRdfAdmin(admin.ModelAdmin): ) exclude = ('rdf_as_turtle',) paginator = TimeLimitedPaginator - list_display = ('id', 'indexcard', 'from_raw_datum', 'created', 'modified') - list_select_related = ('indexcard', 'from_raw_datum',) + list_display = ('id', 'indexcard', 'created', 'modified') + list_select_related = ('indexcard',) show_full_result_count = False def rdf_as_turtle__pre(self, instance): @@ -101,11 +99,10 @@ def rdf_as_turtle__pre(self, instance): rdf_as_turtle__pre.short_description = 'rdf as turtle' -@admin.register(SupplementaryIndexcardRdf, site=admin_site) -@linked_fk('from_raw_datum') +@admin.register(SupplementaryResourceDescription, site=admin_site) @linked_fk('indexcard') @linked_fk('supplementary_suid') -class SupplementaryIndexcardRdfAdmin(admin.ModelAdmin): +class SupplementaryResourceDescriptionAdmin(admin.ModelAdmin): readonly_fields = ( 'created', 'modified', @@ -115,8 +112,8 @@ class SupplementaryIndexcardRdfAdmin(admin.ModelAdmin): ) exclude = ('rdf_as_turtle',) paginator = TimeLimitedPaginator - list_display = ('id', 'indexcard', 'from_raw_datum', 'created', 'modified') - list_select_related = ('indexcard', 'from_raw_datum',) + list_display = ('id', 'indexcard', 'created', 'modified') + list_select_related = ('indexcard',) show_full_result_count = False def rdf_as_turtle__pre(self, instance): diff --git a/trove/derive/_base.py b/trove/derive/_base.py index 9909e8f19..bc8d8b583 100644 --- a/trove/derive/_base.py +++ b/trove/derive/_base.py @@ -2,18 +2,18 @@ from primitive_metadata import primitive_rdf -from trove.models import IndexcardRdf +from trove.models.resource_description import ResourceDescription class IndexcardDeriver(abc.ABC): - upriver_rdf: IndexcardRdf + upstream_description: ResourceDescription focus_iri: str data: primitive_rdf.RdfGraph - def __init__(self, upriver_rdf: IndexcardRdf): - self.upriver_rdf = upriver_rdf - self.focus_iri = upriver_rdf.focus_iri - self.data = upriver_rdf.as_rdfdoc_with_supplements() + def __init__(self, upstream_description: ResourceDescription): + self.upstream_description = upstream_description + self.focus_iri = upstream_description.focus_iri + self.data = upstream_description.as_rdfdoc_with_supplements() def q(self, pathset): # convenience for querying self.data on self.focus_iri diff --git a/trove/derive/osfmap_json_mini.py b/trove/derive/osfmap_json_mini.py index c4da33e08..cd4520f62 100644 --- a/trove/derive/osfmap_json_mini.py +++ b/trove/derive/osfmap_json_mini.py @@ -2,36 +2,8 @@ from trove.derive.osfmap_json import OsfmapJsonFullDeriver from trove.vocab.namespaces import TROVE -INCLUDED_PREDICATE_SET = frozenset({ - ns.RDF.type, - ns.DCTERMS.title, - ns.DCTERMS.creator, - ns.DCTERMS.date, - ns.DCTERMS.created, - ns.FOAF.name, - ns.OWL.sameAs, - ns.DCTERMS.conformsTo, - ns.DCTERMS.dateCopyrighted, - ns.DCTERMS.description, - ns.DCTERMS.hasPart, - ns.DCTERMS.isVersionOf, - ns.DCTERMS.modified, - ns.DCTERMS.publisher, - ns.DCTERMS.rights, - ns.DCTERMS.subject, - ns.DCTERMS.isPartOf, - ns.DCTERMS.identifier, - ns.SKOS.inScheme, - ns.SKOS.prefLabel, - ns.OSFMAP.affiliation, - ns.OSFMAP.archivedAt, - ns.DCTERMS.dateAccepted, - ns.DCTERMS.dateModified, - ns.OSFMAP.hostingInstitution, - ns.OSFMAP.keyword, - ns.OSFMAP.fileName, - ns.OSFMAP.filePath, - ns.OSFMAP.isContainedBy 
+EXCLUDED_PREDICATE_SET = frozenset({ + ns.OSFMAP.contains, }) @@ -57,4 +29,4 @@ def convert_tripledict(self): @staticmethod def _should_keep_predicate(predicate: str) -> bool: - return predicate in INCLUDED_PREDICATE_SET + return predicate not in EXCLUDED_PREDICATE_SET diff --git a/trove/derive/sharev2_elastic.py b/trove/derive/sharev2_elastic.py index c00f45925..27c7b3a06 100644 --- a/trove/derive/sharev2_elastic.py +++ b/trove/derive/sharev2_elastic.py @@ -146,7 +146,7 @@ def should_skip(self) -> bool: # abstract method from IndexcardDeriver def derive_card_as_text(self): - _suid = self.upriver_rdf.indexcard.source_record_suid + _suid = self.upstream_description.indexcard.source_record_suid try: # maintain doc id in the sharev2 index _suid = _suid.get_backcompat_sharev2_suid() except share_db.SourceUniqueIdentifier.DoesNotExist: @@ -157,10 +157,9 @@ def derive_card_as_text(self): ### # metadata about the record/indexcard in this system 'id': IDObfuscator.encode(_suid), - 'indexcard_id': self.upriver_rdf.indexcard.id, - 'rawdatum_id': self.upriver_rdf.from_raw_datum_id, - 'date_created': _suid.get_date_first_seen().isoformat(), - 'date_modified': self.upriver_rdf.modified.isoformat(), + 'indexcard_id': self.upstream_description.indexcard.id, + 'date_created': self.upstream_description.indexcard.created.isoformat(), + 'date_modified': self.upstream_description.modified.isoformat(), 'sources': [_source_name], 'source_config': _suid.source_config.label, 'source_unique_id': _suid.identifier, diff --git a/trove/digestive_tract.py b/trove/digestive_tract.py index e409eceb8..a91a9d633 100644 --- a/trove/digestive_tract.py +++ b/trove/digestive_tract.py @@ -2,11 +2,11 @@ leaning (perhaps too far) into "ingest" as metaphor -swallow: store a given record by checksum; queue for extraction +sniff: set up identifiers about a record extract: gather rdf graph from a record; store as index card(s) -derive: build other kinds of index cards from the extracted rdf +derive: build other representations from latest card version(s) ''' -__all__ = ('swallow', 'extract', 'derive') +__all__ = ('sniff', 'extract', 'derive', 'expel', 'ingest') import copy import datetime @@ -23,43 +23,79 @@ from trove.exceptions import ( CannotDigestExpiredDatum, DigestiveError, - MissingMediatype, ) from trove.extract import get_rdf_extractor_class from trove.derive import get_deriver_classes +from trove.util.iris import smells_like_iri from trove.vocab.namespaces import RDFS, RDF, OWL logger = logging.getLogger(__name__) -@transaction.atomic -def swallow( +def ingest( *, # all keyword-args from_user: share_db.ShareUser, - record: str, - record_identifier: str, - record_mediatype: str, focus_iri: str, - datestamp: datetime.datetime | None = None, # default "now" + record_mediatype: str, + raw_record: str, + record_identifier: str = '', # default focus_iri + is_supplementary: bool = False, expiration_date: datetime.date | None = None, # default "never" + restore_deleted: bool = False, urgent: bool = False, - is_supplementary: bool = False, ): - '''swallow: store a given record by checksum; queue for extraction + '''ingest: shorthand for sniff + extract + (eventual) derive''' + _suid = sniff( + from_user=from_user, + record_identifier=record_identifier, + focus_iri=focus_iri, + is_supplementary=is_supplementary, + ) + if _suid.source_config.disabled or _suid.source_config.source.is_deleted: + expel_suid(_suid) + else: + _extracted_cards = extract( + suid=_suid, + record_mediatype=record_mediatype, + raw_record=raw_record, + 
restore_deleted=restore_deleted, + expiration_date=expiration_date, + ) + for _card in _extracted_cards: + task__derive.delay(_card.id, urgent=urgent) + + +@transaction.atomic +def sniff( + *, # all keyword-args + from_user: share_db.ShareUser, + focus_iri: str, + record_identifier: str = '', + is_supplementary: bool = False, +) -> share_db.SourceUniqueIdentifier: + '''sniff: get a vague sense of a metadata record without touching the record itself + + ensures in the database: + * `share.models.Source`/`SourceConfig` for given `from_user`, with... + * `share.models.SourceUniqueIdentifier` for given `record_identifier`, with... + * `trove.models.ResourceIdentifier` for given `focus_iri` - will create (or update) one of each: - Source (from whom/where is it?) - SourceConfig (how did/do we get it?) - SourceUniqueIdentifier (by what name do/would they know it?) - RawDatum ("it", a metadata record) + returns the `SourceUniqueIdentifier`, as the center of that constellation + + for a given `(from_user, record_identifier)` pair, `focus_iri` and `is_supplementary` + must not change -- raises `DigestiveError` if called again with different values ''' - if not isinstance(record, str): - raise DigestiveError('datum must be a string') + if not smells_like_iri(focus_iri): + raise DigestiveError(f'invalid focus_iri "{focus_iri}"') + if is_supplementary and not record_identifier: + raise DigestiveError(f'supplementary records must have non-empty record_identifier! focus_iri={focus_iri} from_user={from_user}') + if is_supplementary and (record_identifier == focus_iri): + raise DigestiveError(f'supplementary records must have record_identifier distinct from their focus! focus_iri={focus_iri} record_identifier={record_identifier} from_user={from_user}') _source_config = share_db.SourceConfig.objects.get_or_create_push_config(from_user) _suid, _suid_created = share_db.SourceUniqueIdentifier.objects.get_or_create( source_config=_source_config, - identifier=record_identifier, + identifier=record_identifier or focus_iri, defaults={ 'is_supplementary': is_supplementary, }, @@ -73,45 +109,43 @@ def swallow( else: if _suid.focus_identifier_id != _focus_identifier.id: raise DigestiveError(f'suid focus_identifier should not change! 
-    if not isinstance(record, str):
-        raise DigestiveError('datum must be a string')
+    if not smells_like_iri(focus_iri):
+        raise DigestiveError(f'invalid focus_iri "{focus_iri}"')
+    if is_supplementary and not record_identifier:
+        raise DigestiveError(f'supplementary records must have non-empty record_identifier! focus_iri={focus_iri} from_user={from_user}')
+    if is_supplementary and (record_identifier == focus_iri):
+        raise DigestiveError(f'supplementary records must have record_identifier distinct from their focus! focus_iri={focus_iri} record_identifier={record_identifier} from_user={from_user}')
     _source_config = share_db.SourceConfig.objects.get_or_create_push_config(from_user)
     _suid, _suid_created = share_db.SourceUniqueIdentifier.objects.get_or_create(
         source_config=_source_config,
-        identifier=record_identifier,
+        identifier=record_identifier or focus_iri,
         defaults={
             'is_supplementary': is_supplementary,
         },
@@ -73,45 +109,43 @@ def swallow(
     else:
         if _suid.focus_identifier_id != _focus_identifier.id:
             raise DigestiveError(f'suid focus_identifier should not change! suid={_suid}, focus changed from {_suid.focus_identifier} to {_focus_identifier}')
-    _raw = share_db.RawDatum.objects.store_datum_for_suid(
-        suid=_suid,
-        datum=record,
-        mediatype=record_mediatype,
-        datestamp=(datestamp or datetime.datetime.now(tz=datetime.timezone.utc)),
-        expiration_date=expiration_date,
-    )
-    _task = task__extract_and_derive.delay(_raw.id, urgent=urgent)
-    return _task.id
+    return _suid
 
 
-def extract(raw: share_db.RawDatum, *, undelete_indexcards=False) -> list[trove_db.Indexcard]:
+def extract(
+    suid: share_db.SourceUniqueIdentifier,
+    record_mediatype: str,
+    raw_record: str,
+    *,
+    expiration_date: datetime.date | None = None,  # default "never"
+    restore_deleted: bool = False,
+) -> list[trove_db.Indexcard]:
     '''extract: gather rdf graph from a record; store as index card(s)
 
     may create (or update):
         ResourceIdentifier (for each described resource and its types)
         Indexcard (with identifiers and type-identifiers for each described resource)
-        ArchivedIndexcardRdf (all extracted metadata, if non-supplementary)
-        LatestIndexcardRdf (all extracted metadata, if latest raw and non-supplementary)
-        SupplementaryIndexcardRdf (all extracted metadata, if supplementary)
+        ArchivedResourceDescription (all extracted metadata, if non-supplementary)
+        LatestResourceDescription (all extracted metadata, if non-supplementary and the indexcard is not deleted)
+        SupplementaryResourceDescription (all extracted metadata, if supplementary)
 
     may delete:
-        LatestIndexcardRdf (previously extracted from the record, but no longer present)
+        LatestResourceDescription (previously extracted from the record, but no longer present)
     '''
-    assert raw.mediatype is not None, 'raw datum has no mediatype -- did you mean to call extract_legacy?'
-    if raw.is_expired:
-        raise CannotDigestExpiredDatum(raw)
+    if (expiration_date is not None) and (expiration_date <= datetime.date.today()):
+        raise CannotDigestExpiredDatum(suid, expiration_date)
     _tripledicts_by_focus_iri = {}
-    _extractor = get_rdf_extractor_class(raw.mediatype)(raw.suid.source_config)
+    _extractor = get_rdf_extractor_class(record_mediatype)(suid.source_config)
     # TODO normalize (or just validate) tripledict:
     #   - synonymous iris should be grouped (only one as subject-key, others under owl:sameAs)
     #   - focus should have rdf:type
     #   - no subject-key iris which collide by trove_db.ResourceIdentifier equivalence
     #   - connected graph (all subject-key iris reachable from focus, or reverse for vocab terms?)
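# (illustrative aside, not part of this patch: the expiration guard above,
#  restated with only the stdlib -- None means "never expires", and any date
#  on or before "today" is already expired)
import datetime

def _sketch_is_expired(expiration_date: datetime.date | None) -> bool:
    return (expiration_date is not None) and (expiration_date <= datetime.date.today())

assert _sketch_is_expired(datetime.date.today())
assert not _sketch_is_expired(None)
assert not _sketch_is_expired(datetime.date.today() + datetime.timedelta(days=1))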
-    _extracted_tripledict: primitive_rdf.RdfTripleDictionary = _extractor.extract_rdf(raw.datum)
+    _extracted_tripledict: primitive_rdf.RdfTripleDictionary = _extractor.extract_rdf(raw_record)
     if _extracted_tripledict:
         try:
-            _focus_iri = raw.suid.focus_identifier.find_equivalent_iri(_extracted_tripledict)
+            _focus_iri = suid.focus_identifier.find_equivalent_iri(_extracted_tripledict)
         except ValueError:
-            raise DigestiveError(f'could not find {raw.suid.focus_identifier} in {raw}')
+            raise DigestiveError(f'could not find {suid.focus_identifier} in """{raw_record}"""')
         _tripledicts_by_focus_iri[_focus_iri] = _extracted_tripledict
         # special case: if the record defines an ontology, create a
         # card for each subject iri that starts with the focus iri
@@ -125,15 +159,17 @@
                     (_iri, RDFS.isDefinedBy, _focus_iri),
                 )
                 _tripledicts_by_focus_iri[_iri] = _term_tripledict
-    if raw.suid.is_supplementary:
+    if suid.is_supplementary:
         return trove_db.Indexcard.objects.supplement_indexcards_from_tripledicts(
-            from_raw_datum=raw,
+            supplementary_suid=suid,
             rdf_tripledicts_by_focus_iri=_tripledicts_by_focus_iri,
+            expiration_date=expiration_date,
         )
     return trove_db.Indexcard.objects.save_indexcards_from_tripledicts(
-        from_raw_datum=raw,
+        suid=suid,
         rdf_tripledicts_by_focus_iri=_tripledicts_by_focus_iri,
-        undelete=undelete_indexcards,
+        restore_deleted=restore_deleted,
+        expiration_date=expiration_date,
     )
 
 
@@ -146,12 +182,12 @@ def derive(indexcard: trove_db.Indexcard, deriver_iris=None):
     if indexcard.deleted:
         return []
     try:
-        _latest_rdf = indexcard.latest_rdf
-    except trove_db.LatestIndexcardRdf.DoesNotExist:
+        _latest_resource_description = indexcard.latest_resource_description
+    except trove_db.LatestResourceDescription.DoesNotExist:
         return []
     _derived_list = []
     for _deriver_class in get_deriver_classes(deriver_iris):
-        _deriver = _deriver_class(upriver_rdf=_latest_rdf)
+        _deriver = _deriver_class(upstream_description=_latest_resource_description)
         _deriver_identifier = trove_db.ResourceIdentifier.objects.get_or_create_for_iri(_deriver.deriver_iri())
         if _deriver.should_skip():
             trove_db.DerivedIndexcard.objects.filter(
@@ -160,7 +196,7 @@
             ).delete()
         else:
             _derived_text = _deriver.derive_card_as_text()
-            _derived_checksum_iri = ChecksumIri.digest('sha-256', salt='', raw_data=_derived_text)
+            _derived_checksum_iri = ChecksumIri.digest('sha-256', salt='', data=_derived_text)
             _derived, _ = trove_db.DerivedIndexcard.objects.update_or_create(
                 upriver_indexcard=indexcard,
                 deriver_identifier=_deriver_identifier,
@@ -185,65 +221,43 @@ def expel(from_user: share_db.ShareUser, record_identifier: str):
 
 def expel_suid(suid: share_db.SourceUniqueIdentifier) -> None:
     for _indexcard in trove_db.Indexcard.objects.filter(source_record_suid=suid):
         _indexcard.pls_delete()
-    _expel_supplementary_rdf(
-        trove_db.SupplementaryIndexcardRdf.objects.filter(supplementary_suid=suid),
+    _expel_supplementary_descriptions(
+        trove_db.SupplementaryResourceDescription.objects.filter(supplementary_suid=suid),
     )
 
 
 def expel_expired_data(today: datetime.date) -> None:
     # mark indexcards deleted if their latest update has now expired
     for _indexcard in trove_db.Indexcard.objects.filter(
-        trove_latestindexcardrdf_set__from_raw_datum__expiration_date__lte=today,
+        trove_latestresourcedescription_set__expiration_date__lte=today,
     ):
         _indexcard.pls_delete()
     # delete expired supplementary metadata
-    _expel_supplementary_rdf(
-        trove_db.SupplementaryIndexcardRdf.objects.filter(
-            from_raw_datum__expiration_date__lte=today,
-        ),
+    _expel_supplementary_descriptions(
+        trove_db.SupplementaryResourceDescription.objects.filter(expiration_date__lte=today),
     )
 
 
-def _expel_supplementary_rdf(supplementary_rdf_queryset) -> None:
+def _expel_supplementary_descriptions(supplementary_rdf_queryset) -> None:
     # delete expired supplementary metadata
     _affected_indexcards = set()
-    for _supplementary_rdf in supplementary_rdf_queryset.select_related('indexcard'):
-        if not _supplementary_rdf.indexcard.deleted:
-            _affected_indexcards.add(_supplementary_rdf.indexcard)
-        _supplementary_rdf.delete()
+    for _supplement in supplementary_rdf_queryset.select_related('indexcard'):
+        if not _supplement.indexcard.deleted:
+            _affected_indexcards.add(_supplement.indexcard)
+        _supplement.delete()
     for _indexcard in _affected_indexcards:
         task__derive.delay(_indexcard.id)
 
 
 ### BEGIN celery tasks
 
-@celery.shared_task(acks_late=True, bind=True)
-def task__extract_and_derive(task: celery.Task, raw_id: int, urgent=False):
-    _raw = (
-        share_db.RawDatum.objects
-        .select_related('suid__source_config__source')
-        .get(id=raw_id)
-    )
-    _source_config = _raw.suid.source_config
-    if _source_config.disabled or _source_config.source.is_deleted:
-        expel_suid(_raw.suid)
-    else:
-        if not _raw.mediatype:
-            raise MissingMediatype(_raw)
-        _indexcards = extract(_raw, undelete_indexcards=urgent)
-        if _raw.is_latest():
-            _messenger = IndexMessenger(celery_app=task.app)
-            for _indexcard in _indexcards:
-                derive(_indexcard)
-            _messenger.notify_indexcard_update(_indexcards, urgent=urgent)
-
-
 @celery.shared_task(acks_late=True, bind=True)
 def task__derive(
     task: celery.Task,
     indexcard_id: int,
     deriver_iri: str | None = None,
     notify_index=True,
+    urgent=False,
 ):
     _indexcard = trove_db.Indexcard.objects.get(id=indexcard_id)
     derive(
@@ -253,21 +267,18 @@ def task__derive(
     # TODO: avoid unnecessary work; let IndexStrategy subscribe to a specific
     # IndexcardDeriver (perhaps by deriver-specific MessageType?)
     if notify_index:
-        IndexMessenger(celery_app=task.app).notify_indexcard_update([_indexcard])
+        IndexMessenger(celery_app=task.app).notify_indexcard_update([_indexcard], urgent=urgent)
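# (illustrative usage, not part of this patch -- task and kwarg names per the
#  signature above; the card id is hypothetical)
#
#     task__derive.delay(_card_id)                      # derive + notify index
#     task__derive.delay(_card_id, urgent=True)         # index notification marked urgent
#     task__derive.delay(_card_id, notify_index=False)  # derive only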
 
 
 @celery.shared_task(acks_late=True)
-def task__schedule_extract_and_derive_for_source_config(source_config_id: int):
-    _raw_id_qs = (
-        share_db.RawDatum.objects
-        .latest_by_suid_queryset(
-            share_db.SourceUniqueIdentifier.objects
-            .filter(source_config_id=source_config_id)
-        )
+def task__schedule_derive_for_source_config(source_config_id: int, notify_index=False):
+    _indexcard_id_qs = (
+        trove_db.Indexcard.objects
+        .filter(source_record_suid__source_config_id=source_config_id)
         .values_list('id', flat=True)
     )
-    for _raw_id in _raw_id_qs.iterator():
-        task__extract_and_derive.delay(_raw_id)
+    for _indexcard_id in _indexcard_id_qs.iterator():
+        task__derive.delay(_indexcard_id, notify_index=notify_index)
 
 
 @celery.shared_task(acks_late=True)
diff --git a/trove/management/commands/ingest_from_another_shtrove.py b/trove/management/commands/ingest_from_another_shtrove.py
deleted file mode 100644
index 09ab22aa6..000000000
--- a/trove/management/commands/ingest_from_another_shtrove.py
+++ /dev/null
@@ -1,68 +0,0 @@
-import functools
-from itertools import islice
-import re
-from urllib.parse import urlunsplit
-
-from django.conf import settings
-from django.core.management.base import BaseCommand
-import requests
-
-from share import models as share_db
-from trove import digestive_tract
-from trove.vocab import mediatypes
-
-
-class Command(BaseCommand):
-    help = "ingest metadata from another SHARE/trove instance"
-
-    def add_arguments(self, parser):
-        parser.add_argument("host", help="host name of the shtrove instance (e.g. 'staging-share.osf.io')")
-        parser.add_argument("--count", type=int, default=333)
-
-    def handle(self, *args, host, count, **options):
-        if not settings.DEBUG:
-            raise Exception('this command not meant for non-debug use')
-        _ingested_count = 0
-        _skipped_count = 0
-        for _datum in islice(self._iter_datums(host), count):
-            if self._ingest(_datum):
-                _ingested_count += 1
-            else:
-                _skipped_count += 1
-        self.stdout.write(
-            self.style.SUCCESS(f'ingested {_ingested_count} (skipped {_skipped_count}) from {host}')
-        )
-
-    def _iter_datums(self, host: str):
-        _url = urlunsplit(('https', host, '/api/v2/rawdata/', '', ''))
-        while _url:
-            self.stdout.write('fetching a page...')
-            _json = requests.get(_url, headers={'Accept': mediatypes.JSONAPI}).json()
-            for _item in _json['data']:
-                yield _item['attributes']['datum']
-            _url = _json['links'].get('next')
-
-    def _ingest(self, datum: str) -> bool:
-        # HACK: get only turtle files by checking it starts with a prefix (unreliable, generally, but good enough for this)
-        _smells_like_turtle = datum.startswith('@prefix ') or datum.startswith('PREFIX ')
-        if _smells_like_turtle:
-            _first_subject_match = re.search(
-                r'^<([^>\s]+)>',  # HACK: depends on specific serialization
-                datum,
-                re.MULTILINE,
-            )
-            if _first_subject_match:
-                _subject_iri = _first_subject_match.group(1)
-                digestive_tract.swallow(
-                    from_user=self._application_user,
-                    record=datum,
-                    record_identifier=_subject_iri,
-                    record_mediatype=mediatypes.TURTLE,
-                    focus_iri=_subject_iri,
-                )
-                return True
-        return False
-
-    @functools.cached_property
-    def _application_user(self):
-        return share_db.ShareUser.objects.get(username=settings.APPLICATION_USERNAME)
diff --git a/trove/migrations/0009_no_raw_datum.py b/trove/migrations/0009_no_raw_datum.py
new file mode 100644
index 000000000..47129a367
--- /dev/null
+++ b/trove/migrations/0009_no_raw_datum.py
@@ -0,0 +1,27 @@
+from django.db import migrations
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('trove', '0008_expiration_dates'),
+    ]
+
+    operations = [
+        migrations.RemoveConstraint(
+            model_name='archivedindexcardrdf',
+            name='trove_archivedindexcardrdf_uniq_archived_version',
+        ),
+        migrations.RemoveField(
+            model_name='archivedindexcardrdf',
+            name='from_raw_datum',
+        ),
+        migrations.RemoveField(
+            model_name='latestindexcardrdf',
+            name='from_raw_datum',
+        ),
+        migrations.RemoveField(
+            model_name='supplementaryindexcardrdf',
+            name='from_raw_datum',
+        ),
+    ]
diff --git a/trove/migrations/0010_resource_description_rename.py b/trove/migrations/0010_resource_description_rename.py
new file mode 100644
index 000000000..79cfd8d96
--- /dev/null
+++ b/trove/migrations/0010_resource_description_rename.py
@@ -0,0 +1,44 @@
+from django.db import migrations, models
+import django.db.models.deletion
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('trove', '0009_no_raw_datum'),
+    ]
+
+    operations = [
+        migrations.RenameModel(
+            old_name='ArchivedIndexcardRdf',
+            new_name='ArchivedResourceDescription',
+        ),
+        migrations.RenameModel(
+            old_name='LatestIndexcardRdf',
+            new_name='LatestResourceDescription',
+        ),
+        migrations.RenameModel(
+            old_name='SupplementaryIndexcardRdf',
+            new_name='SupplementaryResourceDescription',
+        ),
+        migrations.AlterField(
+            model_name='archivedresourcedescription',
+            name='indexcard',
+            field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='trove_archivedresourcedescription_set', to='trove.indexcard'),
+        ),
+        migrations.AlterField(
+            model_name='latestresourcedescription',
+            name='indexcard',
+            field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='trove_latestresourcedescription_set', to='trove.indexcard'),
+        ),
+        migrations.AlterField(
+            model_name='supplementaryresourcedescription',
+            name='indexcard',
+            field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='trove_supplementaryresourcedescription_set', to='trove.indexcard'),
+        ),
+        migrations.AlterField(
+            model_name='supplementaryresourcedescription',
+            name='supplementary_suid',
+            field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='supplementary_description_set', to='share.sourceuniqueidentifier'),
+        ),
+    ]
diff --git a/trove/models/__init__.py b/trove/models/__init__.py
index acaadd7c4..60318fbc6 100644
--- a/trove/models/__init__.py
+++ b/trove/models/__init__.py
@@ -1,18 +1,18 @@
 __all__ = (
-    'ResourceIdentifier',
-    'Indexcard',
-    'IndexcardRdf',
-    'LatestIndexcardRdf',
-    'ArchivedIndexcardRdf',
-    'SupplementaryIndexcardRdf',
+    'ArchivedResourceDescription',
     'DerivedIndexcard',
+    'Indexcard',
+    'LatestResourceDescription',
+    'ResourceDescription',
+    'ResourceIdentifier',
+    'SupplementaryResourceDescription',
 )
-from .indexcard import (
-    ArchivedIndexcardRdf,
-    DerivedIndexcard,
-    Indexcard,
-    IndexcardRdf,
-    LatestIndexcardRdf,
-    SupplementaryIndexcardRdf,
+from .derived_indexcard import DerivedIndexcard
+from .indexcard import Indexcard
+from .resource_description import (
+    ArchivedResourceDescription,
+    LatestResourceDescription,
+    ResourceDescription,
+    SupplementaryResourceDescription,
 )
 from .resource_identifier import ResourceIdentifier
diff --git a/trove/models/derived_indexcard.py b/trove/models/derived_indexcard.py
new file mode 100644
index 000000000..52f0d3989
--- /dev/null
+++ b/trove/models/derived_indexcard.py
@@ -0,0 +1,50 @@
+from __future__ import annotations
+
+from django.db import models
+from primitive_metadata import primitive_rdf as rdf
+
+from trove.models.resource_identifier import ResourceIdentifier
+
+__all__ = ('DerivedIndexcard',)
+
+
+class DerivedIndexcard(models.Model):
+    # auto:
+    created = models.DateTimeField(auto_now_add=True)
+    modified = models.DateTimeField(auto_now=True)
+
+    # required:
+    upriver_indexcard = models.ForeignKey(
+        'trove.Indexcard',
+        on_delete=models.CASCADE,
+        related_name='derived_indexcard_set',
+    )
+    deriver_identifier = models.ForeignKey(ResourceIdentifier, on_delete=models.PROTECT, related_name='+')
+    derived_checksum_iri = models.TextField()
+    derived_text = models.TextField()  # TODO: store elsewhere by checksum
+
+    class Meta:
+        constraints = [
+            models.UniqueConstraint(
+                fields=('upriver_indexcard', 'deriver_identifier'),
+                name='%(app_label)s_%(class)s_upriverindexcard_deriveridentifier',
+            ),
+        ]
+
+    def __repr__(self):
+        return f'<{self.__class__.__qualname__}({self.id}, {self.upriver_indexcard.uuid}, "{self.deriver_identifier.sufficiently_unique_iri}")'
+
+    def __str__(self):
+        return repr(self)
+
+    @property
+    def deriver_cls(self):
+        from trove.derive import get_deriver_classes
+        (_deriver_cls,) = get_deriver_classes(self.deriver_identifier.raw_iri_list)
+        return _deriver_cls
+
+    def as_rdf_literal(self) -> rdf.Literal:
+        return rdf.literal(
+            self.derived_text,
+            datatype_iris=self.deriver_cls.derived_datatype_iris(),
+        )
diff --git a/trove/models/indexcard.py b/trove/models/indexcard.py
index 6ae24b4b0..ba6de67d3 100644
--- a/trove/models/indexcard.py
+++ b/trove/models/indexcard.py
@@ -1,4 +1,5 @@
 from __future__ import annotations
+import datetime
 import uuid
 
 from django.db import models
@@ -9,11 +10,21 @@
 from share import models as share_db  # TODO: break this dependency
 from share.util.checksum_iri import ChecksumIri
 from trove.exceptions import DigestiveError
+from trove.models.derived_indexcard import DerivedIndexcard
+from trove.models.resource_description import (
+    ArchivedResourceDescription,
+    ResourceDescription,
+    LatestResourceDescription,
+    SupplementaryResourceDescription,
+)
 from trove.models.resource_identifier import ResourceIdentifier
 from trove.vocab.namespaces import RDF
 from trove.vocab.trove import trove_indexcard_iri, trove_indexcard_namespace
 
 
+__all__ = ('Indexcard',)
+
+
 class IndexcardManager(models.Manager):
     def get_for_iri(self, iri: str):
         _uuid = rdf.iri_minus_namespace(iri, namespace=trove_indexcard_namespace())
@@ -22,21 +33,21 @@ def get_for_iri(self, iri: str):
     @transaction.atomic
     def save_indexcards_from_tripledicts(
         self, *,
-        from_raw_datum: share_db.RawDatum,
+        suid: share_db.SourceUniqueIdentifier,
         rdf_tripledicts_by_focus_iri: dict[str, rdf.RdfTripleDictionary],
-        undelete: bool = False,
+        restore_deleted: bool = False,
+        expiration_date: datetime.date | None = None,
    ) -> list['Indexcard']:
-        assert not from_raw_datum.suid.is_supplementary
-        from_raw_datum.no_output = (not rdf_tripledicts_by_focus_iri)
-        from_raw_datum.save(update_fields=['no_output'])
+        assert not suid.is_supplementary
         _indexcards = []
         _seen_focus_identifier_ids: set[str] = set()
         for _focus_iri, _tripledict in rdf_tripledicts_by_focus_iri.items():
             _indexcard = self.save_indexcard_from_tripledict(
-                from_raw_datum=from_raw_datum,
+                suid=suid,
                 rdf_tripledict=_tripledict,
                 focus_iri=_focus_iri,
-                undelete=undelete,
+                restore_deleted=restore_deleted,
+                expiration_date=expiration_date,
             )
             _focus_identifier_ids = {_fid.id for _fid in _indexcard.focus_identifier_set.all()}
             if not _seen_focus_identifier_ids.isdisjoint(_focus_identifier_ids):
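# (illustrative aside, not part of this patch: the disjointness guard above,
#  reduced to plain int sets -- one record describing two resources that share
#  a focus identifier is an error; ValueError is a stand-in here, where the
#  real check presumably raises DigestiveError)
_seen: set[int] = set()
for _card_identifier_ids in ({1, 2}, {3}, {2, 4}):
    if not _seen.isdisjoint(_card_identifier_ids):
        raise ValueError(f'duplicate focus identifiers: {_seen & _card_identifier_ids}')
    _seen.update(_card_identifier_ids)  # the third set re-uses id 2 and raises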
@@ -50,7 +61,7 @@ def save_indexcards_from_tripledicts(
         # cards seen previously on this suid (but not this time) treated as deleted
         for _indexcard_to_delete in (
             Indexcard.objects
-            .filter(source_record_suid=from_raw_datum.suid)
+            .filter(source_record_suid=suid)
             .exclude(id__in=[_card.id for _card in _indexcards])
         ):
             _indexcard_to_delete.pls_delete()
@@ -60,28 +71,25 @@ def save_indexcards_from_tripledicts(
     @transaction.atomic
     def supplement_indexcards_from_tripledicts(
         self, *,
-        from_raw_datum: share_db.RawDatum,
+        supplementary_suid: share_db.SourceUniqueIdentifier,
         rdf_tripledicts_by_focus_iri: dict[str, rdf.RdfTripleDictionary],
+        expiration_date: datetime.date | None = None,
     ) -> list[Indexcard]:
-        assert from_raw_datum.suid.is_supplementary
-        assert not from_raw_datum.is_expired
-        from_raw_datum.no_output = (not rdf_tripledicts_by_focus_iri)
-        from_raw_datum.save(update_fields=['no_output'])
-        if not from_raw_datum.is_latest():
-            return []
+        assert supplementary_suid.is_supplementary
         _indexcards = []
         for _focus_iri, _tripledict in rdf_tripledicts_by_focus_iri.items():
             _indexcards.extend(self.supplement_indexcards(
-                from_raw_datum=from_raw_datum,
+                supplementary_suid=supplementary_suid,
                 rdf_tripledict=_tripledict,
                 focus_iri=_focus_iri,
+                expiration_date=expiration_date,
             ))
         _seen_indexcard_ids = {_card.id for _card in _indexcards}
         # supplementary data seen previously on this suid (but not this time) should be deleted
         for _supplement_to_delete in (
-            SupplementaryIndexcardRdf.objects
-            .filter(supplementary_suid=from_raw_datum.suid)
-            .exclude(from_raw_datum=from_raw_datum)
+            SupplementaryResourceDescription.objects
+            .filter(supplementary_suid=supplementary_suid)
+            .exclude(indexcard__in=_indexcards)
         ):
             if _supplement_to_delete.indexcard_id not in _seen_indexcard_ids:
                 _indexcards.append(_supplement_to_delete.indexcard)
@@ -91,13 +99,13 @@ def supplement_indexcards_from_tripledicts(
     @transaction.atomic
     def save_indexcard_from_tripledict(
         self, *,
-        from_raw_datum: share_db.RawDatum,
+        suid: share_db.SourceUniqueIdentifier,
         rdf_tripledict: rdf.RdfTripleDictionary,
         focus_iri: str,
-        undelete: bool = False,
+        restore_deleted: bool = False,
+        expiration_date: datetime.date | None = None,
     ):
-        assert not from_raw_datum.suid.is_supplementary
-        assert not from_raw_datum.is_expired
+        assert not suid.is_supplementary
         _focus_identifier_set = (
             ResourceIdentifier.objects
             .save_equivalent_identifier_set(rdf_tripledict, focus_iri)
@@ -107,42 +115,40 @@ def save_indexcard_from_tripledict(
         )
         _focustype_identifier_set = [
             ResourceIdentifier.objects.get_or_create_for_iri(_iri)
             for _iri in rdf_tripledict[focus_iri].get(RDF.type, ())
         ]
         _indexcard = Indexcard.objects.filter(
-            source_record_suid=from_raw_datum.suid,
+            source_record_suid=suid,
             focus_identifier_set__in=_focus_identifier_set,
         ).first()
         if _indexcard is None:
-            _indexcard = Indexcard.objects.create(source_record_suid=from_raw_datum.suid)
-        if undelete and _indexcard.deleted:
+            _indexcard = Indexcard.objects.create(source_record_suid=suid)
+        if restore_deleted and _indexcard.deleted:
             _indexcard.deleted = None
             _indexcard.save()
         _indexcard.focus_identifier_set.set(_focus_identifier_set)
         _indexcard.focustype_identifier_set.set(_focustype_identifier_set)
-        _indexcard.update_rdf(
-            from_raw_datum=from_raw_datum,
-            rdf_tripledict=rdf_tripledict,
-            focus_iri=focus_iri,
-        )
+        _indexcard.update_resource_description(focus_iri, rdf_tripledict, expiration_date=expiration_date)
         return _indexcard
 
     @transaction.atomic
     def supplement_indexcards(
         self, *,
-        from_raw_datum: share_db.RawDatum,
+        supplementary_suid: share_db.SourceUniqueIdentifier,
         rdf_tripledict: rdf.RdfTripleDictionary,
         focus_iri: str,
+        expiration_date: datetime.date | None = None,
     ) -> list[Indexcard]:
-        assert from_raw_datum.suid.is_supplementary
+        assert supplementary_suid.is_supplementary
         # supplement indexcards with the same focus from the same source_config
         # (if none exist, fine, nothing gets supplemented)
         _indexcards = list(Indexcard.objects.filter(
-            source_record_suid__source_config_id=from_raw_datum.suid.source_config_id,
+            source_record_suid__source_config_id=supplementary_suid.source_config_id,
             focus_identifier_set__in=ResourceIdentifier.objects.queryset_for_iri(focus_iri),
         ))
         for _indexcard in _indexcards:
-            _indexcard.update_supplementary_rdf(
-                from_raw_datum=from_raw_datum,
+            _indexcard.update_supplementary_description(
+                supplementary_suid=supplementary_suid,
                 rdf_tripledict=rdf_tripledict,
                 focus_iri=focus_iri,
+                expiration_date=expiration_date,
             )
         return _indexcards
@@ -181,26 +187,26 @@ class Meta:
         ]
 
     @property
-    def latest_rdf(self) -> LatestIndexcardRdf:
-        '''convenience for the "other side" of LatestIndexcardRdf.indexcard
+    def latest_resource_description(self) -> LatestResourceDescription:
+        '''convenience for the "other side" of LatestResourceDescription.indexcard
         '''
-        return self.trove_latestindexcardrdf_set.get()  # may raise DoesNotExist
+        return self.trove_latestresourcedescription_set.get()  # may raise DoesNotExist
 
     @property
-    def archived_rdf_set(self):
-        '''convenience for the "other side" of ArchivedIndexcardRdf.indexcard
+    def archived_description_set(self):
+        '''convenience for the "other side" of ArchivedResourceDescription.indexcard
 
         returns a RelatedManager
         '''
-        return self.trove_archivedindexcardrdf_set
+        return self.trove_archivedresourcedescription_set
 
     @property
-    def supplementary_rdf_set(self):
-        '''convenience for the "other side" of SupplementaryIndexcardRdf.indexcard
+    def supplementary_description_set(self):
+        '''convenience for the "other side" of SupplementaryResourceDescription.indexcard
 
         returns a RelatedManager
         '''
-        return self.trove_supplementaryindexcardrdf_set
+        return self.trove_supplementaryresourcedescription_set
 
     def get_iri(self):
         return trove_indexcard_iri(self.uuid)
@@ -210,8 +216,8 @@ def pls_delete(self, *, notify_indexes=True):
         if self.deleted is None:
             self.deleted = timezone.now()
             self.save()
-        (  # actually delete LatestIndexcardRdf:
-            LatestIndexcardRdf.objects
+        (  # actually delete LatestResourceDescription:
+            LatestResourceDescription.objects
             .filter(indexcard=self)
             .delete()
         )
@@ -232,205 +238,63 @@ def __str__(self):
         return repr(self)
 
     @transaction.atomic
-    def update_rdf(
+    def update_resource_description(
         self,
-        from_raw_datum: share_db.RawDatum,
         focus_iri: str,
         rdf_tripledict: rdf.RdfTripleDictionary,
-    ) -> 'IndexcardRdf':
+        expiration_date: datetime.date | None = None,
+    ) -> ResourceDescription:
         if focus_iri not in rdf_tripledict:
             raise DigestiveError(f'expected {focus_iri} in {set(rdf_tripledict.keys())}')
         _rdf_as_turtle, _turtle_checksum_iri = _turtlify(rdf_tripledict)
-        _archived, _archived_created = ArchivedIndexcardRdf.objects.get_or_create(
+        _archived, _archived_created = ArchivedResourceDescription.objects.get_or_create(
             indexcard=self,
-            from_raw_datum=from_raw_datum,
             turtle_checksum_iri=_turtle_checksum_iri,
             defaults={
                 'rdf_as_turtle': _rdf_as_turtle,
                 'focus_iri': focus_iri,
-                'expiration_date': from_raw_datum.expiration_date,
+                'expiration_date': expiration_date,
            },
         )
         if (not _archived_created) and (_archived.rdf_as_turtle != _rdf_as_turtle):
             raise DigestiveError(f'hash collision? {_archived}\n===\n{_rdf_as_turtle}')
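# (illustrative aside, not part of this patch: why the get_or_create above
#  deduplicates -- equal turtle yields an equal checksum iri; the digest
#  format below is hypothetical, ChecksumIri.digest per this diff)
import hashlib

def _sketch_checksum_iri(turtle: str) -> str:
    return 'urn:checksum:sha-256::' + hashlib.sha256(turtle.encode()).hexdigest()

assert _sketch_checksum_iri('<a> <b> <c> .') == _sketch_checksum_iri('<a> <b> <c> .')
assert _sketch_checksum_iri('<a> <b> <c> .') != _sketch_checksum_iri('<a> <b> <d> .')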
-        if not self.deleted and from_raw_datum.is_latest():
-            _latest_indexcard_rdf, _created = LatestIndexcardRdf.objects.update_or_create(
+        if not self.deleted:
+            _latest_resource_description, _created = LatestResourceDescription.objects.update_or_create(
                 indexcard=self,
                 defaults={
-                    'from_raw_datum': from_raw_datum,
                     'turtle_checksum_iri': _turtle_checksum_iri,
                     'rdf_as_turtle': _rdf_as_turtle,
                     'focus_iri': focus_iri,
-                    'expiration_date': from_raw_datum.expiration_date,
+                    'expiration_date': expiration_date,
                 },
             )
-            return _latest_indexcard_rdf
+            return _latest_resource_description
         return _archived
 
-    def update_supplementary_rdf(
+    def update_supplementary_description(
         self,
-        from_raw_datum: share_db.RawDatum,
+        supplementary_suid: share_db.SourceUniqueIdentifier,
         focus_iri: str,
         rdf_tripledict: rdf.RdfTripleDictionary,
-    ) -> SupplementaryIndexcardRdf:
+        expiration_date: datetime.date | None = None,
+    ) -> SupplementaryResourceDescription:
+        assert supplementary_suid.is_supplementary
         if focus_iri not in rdf_tripledict:
             raise DigestiveError(f'expected {focus_iri} in {set(rdf_tripledict.keys())}')
         _rdf_as_turtle, _turtle_checksum_iri = _turtlify(rdf_tripledict)
-        _supplement_rdf, _ = SupplementaryIndexcardRdf.objects.update_or_create(
+        _supplement_rdf, _ = SupplementaryResourceDescription.objects.update_or_create(
             indexcard=self,
-            supplementary_suid=from_raw_datum.suid,
+            supplementary_suid=supplementary_suid,
             defaults={
-                'from_raw_datum': from_raw_datum,
                 'turtle_checksum_iri': _turtle_checksum_iri,
                 'rdf_as_turtle': _rdf_as_turtle,
                 'focus_iri': focus_iri,
-                'expiration_date': from_raw_datum.expiration_date,
+                'expiration_date': expiration_date,
             },
         )
         return _supplement_rdf
 
 
-class IndexcardRdf(models.Model):
-    # auto:
-    created = models.DateTimeField(auto_now_add=True)
-    modified = models.DateTimeField(auto_now=True)
-
-    # required:
-    from_raw_datum = models.ForeignKey(
-        share_db.RawDatum,
-        on_delete=models.DO_NOTHING,  # allows faster bulk-deletion of unused RawDatum (but errors deleting used RawDatum)
-        related_name='+',
-    )
-    indexcard = models.ForeignKey(
-        Indexcard,
-        on_delete=models.CASCADE,
-        related_name='%(app_label)s_%(class)s_set',
-    )
-    turtle_checksum_iri = models.TextField(db_index=True)
-    focus_iri = models.TextField()  # exact iri used in rdf_as_turtle
-    rdf_as_turtle = models.TextField()  # TODO: store elsewhere by checksum
-
-    # optional:
-    expiration_date = models.DateField(
-        null=True,
-        blank=True,
-        help_text='An (optional) date when this description will no longer be valid.',
-    )
-
-    def as_rdf_tripledict(self) -> rdf.RdfTripleDictionary:
-        return rdf.tripledict_from_turtle(self.rdf_as_turtle)
-
-    def as_quoted_graph(self) -> rdf.QuotedGraph:
-        return rdf.QuotedGraph(
-            self.as_rdf_tripledict(),
-            focus_iri=self.focus_iri,
-        )
-
-    def as_rdfdoc_with_supplements(self) -> rdf.RdfGraph:
-        '''build an rdf graph composed of this rdf and all current card supplements'''
-        _rdfdoc = rdf.RdfGraph(self.as_rdf_tripledict())
-        for _supplementary_rdf in self.indexcard.supplementary_rdf_set.all():
-            _rdfdoc.add_tripledict(_supplementary_rdf.as_rdf_tripledict())
-        return _rdfdoc
-
-    class Meta:
-        abstract = True
-
-    def __repr__(self):
-        return f'<{self.__class__.__qualname__}({self.id}, "{self.focus_iri}")'
-
-    def __str__(self):
-        return repr(self)
-
-
-class LatestIndexcardRdf(IndexcardRdf):
-    # just the most recent version of this indexcard
-    class Meta:
-        constraints = [
-            models.UniqueConstraint(
-                fields=('indexcard',),
-                name='%(app_label)s_%(class)s_uniq_indexcard',
-            ),
-        ]
-        indexes = [
-            models.Index(fields=('modified',)),  # for OAI-PMH selective harvest
-            models.Index(fields=['expiration_date']),  # for expiring
-        ]
-
-
-class ArchivedIndexcardRdf(IndexcardRdf):
-    # all versions of an indexcard over time (including the latest)
-    class Meta:
-        constraints = [
-            models.UniqueConstraint(
-                fields=('indexcard', 'from_raw_datum', 'turtle_checksum_iri'),
-                name='%(app_label)s_%(class)s_uniq_archived_version',
-            ),
-        ]
-
-
-class SupplementaryIndexcardRdf(IndexcardRdf):
-    # supplementary (non-descriptive) metadata from the same source (just the most recent)
-    supplementary_suid = models.ForeignKey(
-        share_db.SourceUniqueIdentifier,
-        on_delete=models.CASCADE,
-        related_name='supplementary_rdf_set',
-    )
-
-    class Meta:
-        constraints = [
-            models.UniqueConstraint(
-                fields=('indexcard', 'supplementary_suid'),
-                name='%(app_label)s_%(class)s_uniq_supplement',
-            ),
-        ]
-        indexes = [
-            models.Index(fields=['expiration_date']),  # for expiring
-        ]
-
-
-class DerivedIndexcard(models.Model):
-    # auto:
-    created = models.DateTimeField(auto_now_add=True)
-    modified = models.DateTimeField(auto_now=True)
-
-    # required:
-    upriver_indexcard = models.ForeignKey(
-        Indexcard,
-        on_delete=models.CASCADE,
-        related_name='derived_indexcard_set',
-    )
-    deriver_identifier = models.ForeignKey(ResourceIdentifier, on_delete=models.PROTECT, related_name='+')
-    derived_checksum_iri = models.TextField()
-    derived_text = models.TextField()  # TODO: store elsewhere by checksum
-
-    class Meta:
-        constraints = [
-            models.UniqueConstraint(
-                fields=('upriver_indexcard', 'deriver_identifier'),
-                name='%(app_label)s_%(class)s_upriverindexcard_deriveridentifier',
-            ),
-        ]
-
-    def __repr__(self):
-        return f'<{self.__class__.__qualname__}({self.id}, {self.upriver_indexcard.uuid}, "{self.deriver_identifier.sufficiently_unique_iri}")'
-
-    def __str__(self):
-        return repr(self)
-
-    @property
-    def deriver_cls(self):
-        from trove.derive import get_deriver_classes
-        (_deriver_cls,) = get_deriver_classes(self.deriver_identifier.raw_iri_list)
-        return _deriver_cls
-
-    def as_rdf_literal(self) -> rdf.Literal:
-        return rdf.literal(
-            self.derived_text,
-            datatype_iris=self.deriver_cls.derived_datatype_iris(),
-        )
-
-
 ###
 # local helpers
 
@@ -438,6 +302,6 @@ def _turtlify(rdf_tripledict: rdf.RdfTripleDictionary) -> tuple[str, str]:
     '''return turtle serialization and checksum iri of that serialization'''
     _rdf_as_turtle = rdf.turtle_from_tripledict(rdf_tripledict)
     _turtle_checksum_iri = str(
-        ChecksumIri.digest('sha-256', salt='', raw_data=_rdf_as_turtle),
+        ChecksumIri.digest('sha-256', salt='', data=_rdf_as_turtle),
     )
     return (_rdf_as_turtle, _turtle_checksum_iri)
diff --git a/trove/models/resource_description.py b/trove/models/resource_description.py
new file mode 100644
index 000000000..d5b43ffc1
--- /dev/null
+++ b/trove/models/resource_description.py
@@ -0,0 +1,118 @@
+from __future__ import annotations
+import datetime
+
+from django.db import models
+from primitive_metadata import primitive_rdf as rdf
+
+__all__ = (
+    'ArchivedResourceDescription',
+    'ResourceDescription',
+    'LatestResourceDescription',
+    'SupplementaryResourceDescription',
+)
+
+
+class ResourceDescription(models.Model):
+    # auto:
+    created = models.DateTimeField(auto_now_add=True)
+    modified = models.DateTimeField(auto_now=True)
+
+    # required:
+    indexcard = models.ForeignKey(
+        'trove.Indexcard',
+        on_delete=models.CASCADE,
+        related_name='%(app_label)s_%(class)s_set',
+    )
+    turtle_checksum_iri = models.TextField(db_index=True)
+    focus_iri = models.TextField()  # exact iri used in rdf_as_turtle
+    rdf_as_turtle = models.TextField()  # TODO: store elsewhere by checksum
+
+    # optional:
+    expiration_date = models.DateField(
+        null=True,
+        blank=True,
+        help_text='An (optional) date when this description will no longer be valid.',
+    )
+
+    class Meta:
+        abstract = True
+
+    @property
+    def is_expired(self) -> bool:
+        return (
+            self.expiration_date is not None
+            and self.expiration_date <= datetime.date.today()
+        )
+
+    def as_rdf_tripledict(self) -> rdf.RdfTripleDictionary:
+        return rdf.tripledict_from_turtle(self.rdf_as_turtle)
+
+    def as_quoted_graph(self) -> rdf.QuotedGraph:
+        return rdf.QuotedGraph(
+            self.as_rdf_tripledict(),
+            focus_iri=self.focus_iri,
+        )
+
+    def as_rdfdoc_with_supplements(self) -> rdf.RdfGraph:
+        '''build an rdf graph composed of this rdf and all current card supplements'''
+        _rdfdoc = rdf.RdfGraph(self.as_rdf_tripledict())
+        for _supplement in self.indexcard.supplementary_description_set.all():
+            _rdfdoc.add_tripledict(_supplement.as_rdf_tripledict())
+        return _rdfdoc
+
+    def __repr__(self):
+        return f'<{self.__class__.__qualname__}({self.id}, "{self.focus_iri}")'
+
+    def __str__(self):
+        return repr(self)
+
+
+class LatestResourceDescription(ResourceDescription):
+    # just the most recent version of this indexcard
+    class Meta:
+        constraints = [
+            models.UniqueConstraint(
+                fields=('indexcard',),
+                name='trove_latestindexcardrdf_uniq_indexcard',
+                # TODO when on django 5.2:
+                # name='%(app_label)s_%(class)s_uniq_indexcard',
+                # ...and add migration with `AlterConstraint` to rename
+            ),
+        ]
+        indexes = [
+            models.Index(
+                fields=('modified',),  # for OAI-PMH selective harvest
+                name='trove_lates_modifie_c6b0b1_idx',
+                # TODO when on django 5.2:
+                # remove explicit name, add migration with `RenameIndex` to match
+            ),
+            models.Index(fields=['expiration_date']),  # for expiring
+        ]
+
+
+class ArchivedResourceDescription(ResourceDescription):
+    # all versions of an indexcard over time (including the latest)
+    pass
+
+
+class SupplementaryResourceDescription(ResourceDescription):
+    # supplementary (non-descriptive) metadata from the same source (just the most recent)
+    supplementary_suid = models.ForeignKey(
+        'share.SourceUniqueIdentifier',
+        on_delete=models.CASCADE,
+        related_name='supplementary_description_set',
+    )
+
+    class Meta:
+        constraints = [
+            models.UniqueConstraint(
+                fields=('indexcard', 'supplementary_suid'),
+                name='trove_supplementaryindexcardrdf_uniq_supplement',
+                # TODO when on django 5.2:
+                # name='%(app_label)s_%(class)s_uniq_supplement',
+                # ...and add migration with `AlterConstraint` to rename
+            ),
+        ]
+        indexes = [
+            models.Index(fields=['expiration_date']),  # for expiring
+        ]
diff --git a/trove/trovebrowse_gathering.py b/trove/trovebrowse_gathering.py
index 76903d158..3da36167a 100644
--- a/trove/trovebrowse_gathering.py
+++ b/trove/trovebrowse_gathering.py
@@ -36,8 +36,8 @@ def gather_cards_focused_on(focus, *, blend_cards: bool):
     _identifier_qs = trove_db.ResourceIdentifier.objects.queryset_for_iris(focus.iris)
     _indexcard_qs = trove_db.Indexcard.objects.filter(focus_identifier_set__in=_identifier_qs)
     if blend_cards:
-        for _latest_rdf in trove_db.LatestIndexcardRdf.objects.filter(indexcard__in=_indexcard_qs):
-            yield from rdf.iter_tripleset(_latest_rdf.as_rdf_tripledict())
+        for _latest_resource_description in trove_db.LatestResourceDescription.objects.filter(indexcard__in=_indexcard_qs):
+            yield from rdf.iter_tripleset(_latest_resource_description.as_rdf_tripledict())
     else:
         for _indexcard in _indexcard_qs:
             _card_iri = _indexcard.get_iri()
diff --git a/trove/trovesearch/trovesearch_gathering.py b/trove/trovesearch/trovesearch_gathering.py
index 0d2fcb719..4f548774d 100644
--- a/trove/trovesearch/trovesearch_gathering.py
+++ b/trove/trovesearch/trovesearch_gathering.py
@@ -370,8 +370,8 @@ def _load_cards_and_contents(*, card_iris=None, value_iris=None, deriver_iri) ->
 
 def _load_cards_and_extracted_rdf_contents(card_iris=None, value_iris=None) -> dict[str, IndexcardFocus]:
     _card_namespace = trove_indexcard_namespace()
-    _indexcard_rdf_qs = (
-        trove_db.LatestIndexcardRdf.objects
+    _resource_description_qs = (
+        trove_db.LatestResourceDescription.objects
         .select_related('indexcard')
         .prefetch_related('indexcard__focus_identifier_set')
     )
@@ -380,19 +380,19 @@ def _load_cards_and_extracted_rdf_contents(card_iris=None, value_iris=None) -> d
         _indexcard_uuids = {
             iri_minus_namespace(_card_iri, namespace=_card_namespace)
             for _card_iri in card_iris
         }
-        _indexcard_rdf_qs = _indexcard_rdf_qs.filter(indexcard__uuid__in=_indexcard_uuids)
+        _resource_description_qs = _resource_description_qs.filter(indexcard__uuid__in=_indexcard_uuids)
     if value_iris is not None:
-        _indexcard_rdf_qs = _indexcard_rdf_qs.filter(
+        _resource_description_qs = _resource_description_qs.filter(
             indexcard__focus_identifier_set__in=(
                 trove_db.ResourceIdentifier.objects
                 .queryset_for_iris(value_iris)
             ),
         )
     _card_foci: dict[str, IndexcardFocus] = {}
-    for _indexcard_rdf in _indexcard_rdf_qs:
-        _card = _indexcard_rdf.indexcard
+    for _resource_description in _resource_description_qs:
+        _card = _resource_description.indexcard
         _card_iri = _card.get_iri()
-        _quoted_graph = _indexcard_rdf.as_quoted_graph()
+        _quoted_graph = _resource_description.as_quoted_graph()
         _quoted_graph.add(
             (_quoted_graph.focus_iri, FOAF.isPrimaryTopicOf, _card_iri),
         )
diff --git a/trove/util/iris.py b/trove/util/iris.py
index 35d9123f4..736758a64 100644
--- a/trove/util/iris.py
+++ b/trove/util/iris.py
@@ -90,6 +90,8 @@ def get_sufficiently_unique_iri_and_scheme(iri: str) -> tuple[str, str]:
     if _scheme_match:
         _scheme = _scheme_match.group().lower()
         _remainder = iri[_scheme_match.end():]
+        if not _remainder.startswith(COLON):
+            raise trove_exceptions.IriInvalid(f'does not look like an iri (got "{iri}")')
         if not _remainder.startswith(COLON_SLASH_SLASH):
             # for an iri without '://', assume nothing!
             return (iri, _scheme)
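# (illustrative aside, not part of this patch: behavior implied by the guard
#  added above --
#     get_sufficiently_unique_iri_and_scheme('urn:isbn:123')  # ('urn:isbn:123', 'urn'): no '://'
#     get_sufficiently_unique_iri_and_scheme('foo')           # raises trove_exceptions.IriInvalid
#  which is what lets `smells_like_iri` below answer False instead of mis-parsing)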
@@ -179,3 +181,30 @@ def unquote_iri(iri: str) -> str:
             break
         _unquoted_iri = _next_unquoted_iri
     return _unquoted_iri
+
+
+def smells_like_iri(maybe_iri: str) -> bool:
+    '''check a string starts like an IRI (does not fully validate)
+
+    >>> smells_like_iri('https://blarg.example/hello')
+    True
+    >>> smells_like_iri('foo:bar')  # URN
+    True
+
+    >>> smells_like_iri('://blarg.example/hello')
+    False
+    >>> smells_like_iri('foo/bar')
+    False
+    >>> smells_like_iri('foo')
+    False
+    >>> smells_like_iri(7)
+    False
+    '''
+    try:
+        return (
+            isinstance(maybe_iri, str)
+            # nonempty suffuniq-iri and scheme
+            and all(get_sufficiently_unique_iri_and_scheme(maybe_iri))
+        )
+    except trove_exceptions.IriInvalid:
+        return False
diff --git a/trove/util/queryparams.py b/trove/util/queryparams.py
index 0a9bb5d75..bdf667f56 100644
--- a/trove/util/queryparams.py
+++ b/trove/util/queryparams.py
@@ -127,15 +127,23 @@ def get_bool_value(
     if_absent: bool = False,  # by default, param absence is falsy
     if_empty: bool = True,  # by default, presence (with empty value) is truthy
 ) -> bool:
-    _value = get_single_value(queryparams, queryparam_name)
-    if _value is None:
-        return if_absent
-    if _value == '':
-        return if_empty
-    return parse_booly_str(_value)
+    return parse_booly_str(
+        get_single_value(queryparams, queryparam_name),
+        if_absent=if_absent,
+        if_empty=if_empty,
+    )
 
 
-def parse_booly_str(value: str):
+def parse_booly_str(
+    value: str | None,
+    *,
+    if_absent: bool = False,  # by default, param absence is falsy
+    if_empty: bool = True,  # by default, presence (with empty value) is truthy
+) -> bool:
+    if value is None:
+        return if_absent
+    if value == '':
+        return if_empty
     _lowered = value.lower()
     if _lowered in TRUTHY_VALUES:
         return True
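# (illustrative aside, not part of this patch: the absence/empty handling that
#  moved into `parse_booly_str` -- defaults per the signature above, and
#  assuming the module's TRUTHY_VALUES includes 'true')
#
#     parse_booly_str(None)    # -> False  (if_absent)
#     parse_booly_str('')      # -> True   (if_empty)
#     parse_booly_str('true')  # -> True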
diff --git a/trove/util/trove_params.py b/trove/util/trove_params.py
index 920960679..c693de112 100644
--- a/trove/util/trove_params.py
+++ b/trove/util/trove_params.py
@@ -100,26 +100,19 @@ def _gather_attrpaths(cls, queryparams: _qp.QueryparamDict, shorthand: rdf.IriShorthand):
         if _fields_params:
             _requested: dict[str, list[Propertypath]] = defaultdict(list)
             for _param_name, _param_value in _fields_params:
-                try:
-                    (_typenames,) = filter(bool, _param_name.bracketed_names)
-                except (IndexError, ValueError):
-                    raise trove_exceptions.InvalidQueryParamName(
-                        f'expected "fields[TYPE]" (with exactly one non-empty bracketed segment)'
-                        f' (got "{_param_name}")'
-                    )
-                else:
-                    for _type in _qp.split_queryparam_value(_typenames):
-                        _type_key = (
-                            GLOB_PATHSTEP
-                            if _type == GLOB_PATHSTEP
-                            else shorthand.expand_iri(_type)
-                        )
-                        _requested[_type_key].extend(
-                            (
-                                parse_propertypath(_path_value, shorthand)
-                                for _path_value in _qp.split_queryparam_value(_param_value)
-                            )
+                if _param_name.bracketed_names:  # e.g. "fields[TYPE1,TYPE2,TYPE3]=..."
+                    _typenames = _qp.split_queryparam_value(_param_name.bracketed_names[0])
+                else:  # omitted brackets equivalent to "fields[*]" (apply to any type)
+                    _typenames = [GLOB_PATHSTEP]
+                for _typename in _typenames:
+                    if _typename != GLOB_PATHSTEP:
+                        _typename = shorthand.expand_iri(_typename)
+                    _requested[_typename].extend(
+                        (  # list of field paths in query param value
+                            parse_propertypath(_path_value, shorthand)
+                            for _path_value in _qp.split_queryparam_value(_param_value)
                         )
+                    )
             _attrpaths = _attrpaths.with_new(freeze(_requested))
     return _attrpaths
diff --git a/trove/views/ingest.py b/trove/views/ingest.py
index 6d401c806..73d6cb021 100644
--- a/trove/views/ingest.py
+++ b/trove/views/ingest.py
@@ -5,9 +5,10 @@
 from django import http
 from django.views import View
 
-from share import exceptions
 from share.models.feature_flag import FeatureFlag
 from trove import digestive_tract
+from trove import exceptions as trove_exceptions
+from trove.util.queryparams import parse_booly_str
 
 
 logger = logging.getLogger(__name__)
@@ -30,8 +31,6 @@ def post(self, request):
         if not _focus_iri:
             return http.HttpResponse('focus_iri queryparam required', status=HTTPStatus.BAD_REQUEST)
         _record_identifier = request.GET.get('record_identifier')
-        if not _record_identifier:
-            return http.HttpResponse('record_identifier queryparam required', status=HTTPStatus.BAD_REQUEST)
         _expiration_date_str = request.GET.get('expiration_date')
         if _expiration_date_str is None:
             _expiration_date = None
@@ -40,22 +39,24 @@ def post(self, request):
             _expiration_date = datetime.date.fromisoformat(_expiration_date_str)
         except ValueError:
             return http.HttpResponse('expiration_date queryparam must be in ISO-8601 date format (YYYY-MM-DD)', status=HTTPStatus.BAD_REQUEST)
+        _nonurgent = parse_booly_str(request.GET.get('nonurgent'))
         try:
-            digestive_tract.swallow(
+            digestive_tract.ingest(
+                raw_record=request.body.decode(encoding='utf-8'),
+                record_mediatype=request.content_type,
                 from_user=request.user,
-                record=request.body.decode(encoding='utf-8'),
                 record_identifier=_record_identifier,
-                record_mediatype=request.content_type,
                 focus_iri=_focus_iri,
-                urgent=(request.GET.get('nonurgent') is None),
                 is_supplementary=(request.GET.get('is_supplementary') is not None),
+                urgent=(not _nonurgent),
                 expiration_date=_expiration_date,
+                restore_deleted=True,
             )
-        except exceptions.IngestError as e:
+        except trove_exceptions.DigestiveError as e:
             logger.exception(str(e))
             return http.HttpResponse(str(e), status=HTTPStatus.BAD_REQUEST)
         else:
-            # TODO: include link to view status (return task id from `swallow`?)
+            # TODO: include (link to?) extracted card(s)
             return http.HttpResponse(status=HTTPStatus.CREATED)
 
     def delete(self, request):
diff --git a/trove/vocab/osfmap.py b/trove/vocab/osfmap.py
index 731834ade..d67e545e8 100644
--- a/trove/vocab/osfmap.py
+++ b/trove/vocab/osfmap.py
@@ -356,13 +356,13 @@
             literal('isSupplementedBy', language='en'),
         },
     },
-    OSFMAP.verifiedLinks: {
+    OSFMAP.verifiedLink: {
         RDF.type: {RDF.Property},
         RDFS.label: {
             literal('Verified Links', language='en'),
         },
         JSONAPI_MEMBERNAME: {
-            literal('verifiedLinks', language='en'),
+            literal('verifiedLink', language='en'),
         },
     },
     OSFMAP.archivedAt: {
@@ -572,10 +572,10 @@
             literal('accessService', language='en'),
         },
     },
-    DCAT.accessUrl: {
+    DCAT.accessURL: {
         RDF.type: {RDF.Property},
         JSONAPI_MEMBERNAME: {
-            literal('accessUrl', language='en'),
+            literal('accessURL', language='en'),
         },
     },
     OSFMAP.hostingInstitution: {