CentreForDigitalHumanities
diff --git a/‎CITATION.cff‎
Lines changed: 2 additions & 2 deletions b/‎CITATION.cff‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎backend/addcorpus/constants.py‎
Lines changed: 2 additions & 6 deletions b/‎backend/addcorpus/constants.py‎
Lines changed: 2 additions & 6 deletions
diff --git a/‎backend/addcorpus/migrations/0032_corpusconfiguration_has_named_entities.py‎
Lines changed: 18 additions & 0 deletions b/‎backend/addcorpus/migrations/0032_corpusconfiguration_has_named_entities.py‎
Lines changed: 18 additions & 0 deletions
diff --git a/‎backend/addcorpus/models.py‎
Lines changed: 25 additions & 28 deletions b/‎backend/addcorpus/models.py‎
Lines changed: 25 additions & 28 deletions
diff --git a/‎backend/addcorpus/python_corpora/corpus.py‎
Lines changed: 0 additions & 14 deletions b/‎backend/addcorpus/python_corpora/corpus.py‎
Lines changed: 0 additions & 14 deletions
diff --git a/‎backend/addcorpus/python_corpora/save_corpus.py‎
Lines changed: 21 additions & 1 deletion b/‎backend/addcorpus/python_corpora/save_corpus.py‎
Lines changed: 21 additions & 1 deletion
diff --git a/‎backend/addcorpus/serializers.py‎
Lines changed: 1 addition & 2 deletions b/‎backend/addcorpus/serializers.py‎
Lines changed: 1 addition & 2 deletions
diff --git a/‎backend/corpora/dutchnewspapers/dutchnewspapers_public.py‎
Lines changed: 2 additions & 2 deletions b/‎backend/corpora/dutchnewspapers/dutchnewspapers_public.py‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎backend/corpora/parliament/utils/parlamint_v4.py‎
Lines changed: 2 additions & 2 deletions b/‎backend/corpora/parliament/utils/parlamint_v4.py‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎backend/corpora/traces_of_sound/__init__.py‎ b/‎backend/corpora/traces_of_sound/__init__.py‎
@@ -35,5 +35,5 @@ keywords:
   - elasticsearch
   - natural language processing
 license: MIT
-version: 5.20.0
-date-released: '2025-08-07'
+version: 5.22.0
+date-released: '2025-09-03'
@@ -46,21 +46,17 @@ class VisualizationType(Enum):
     'visualizedField',
     'normalize',
     'ngramSettings',
-    'scan',
-    'tab-scan'
     'p',
     'tags',
-    'context',
     'tab',
+    'document_link',
 ]
 '''
 Field names that cannot be used because they interfere with other functionality.
 
 This is usually because they are also query parameters in frontend routes, and using them
 would make routing ambiguous.
 
-`query` is also forbidden because it is a reserved column in CSV downloads. Likewise,
-`context` is forbidden because it's used in download requests.
+`query` and `document_link` are forbidden because they are reserved columns in CSV downloads.
 
-`scan` and `tab-scan` are added because they interfere with element IDs in the DOM.
 '''
@@ -0,0 +1,18 @@
+# Generated by Django 4.2.22 on 2025-08-21 12:28
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+    '''Add the boolean `has_named_entities` field to the `corpusconfiguration` model.'''
+    dependencies = [('addcorpus', '0031_corpus_owner')]
+
+    operations = [
+        migrations.AddField(
+            model_name='corpusconfiguration',
+            name='has_named_entities',
+            field=models.BooleanField(
+                default=False, help_text='whether this corpus has named entity annotations'
+            ),
+        ),
+    ]
@@ -254,6 +254,10 @@ class CorpusConfiguration(models.Model):
         default=False,
         help_text='whether this corpus has word models',
     )
+    has_named_entities = models.BooleanField(
+        default=False,
+        help_text='whether this corpus has named entity annotations',
+    )
     default_sort = models.JSONField(
         blank=True,
         validators=[validate_sort_configuration],
@@ -298,34 +302,27 @@ def clean(self):
                 ])
 
     @property
-    def has_named_entities(self):
-        from es.search import total_hits
+    def visible_fields(self) -> models.QuerySet['Field']:
+        '''Fields of this corpus, leaving out NER fields unless `has_named_entities` is set.'''
+        if self.has_named_entities:
+            return self.fields.all()
+        # drop every field whose name ends in `:ner-kw` or `:ner`
+        without_keywords = self.fields.all().exclude(name__endswith=':ner-kw')
+        return without_keywords.exclude(name__endswith=':ner')
+
+
+class FieldDisplayTypes(models.TextChoices):  # (stored value, label) pairs per display type
+    TEXT_CONTENT = 'text_content', 'text content'
+    TEXT = MappingType.TEXT.value, 'text'
+    KEYWORD = MappingType.KEYWORD.value, 'keyword'
+    DATE = MappingType.DATE.value, 'date'
+    DATE_RANGE = MappingType.DATE_RANGE.value, 'date_range'
+    INTEGER = MappingType.INTEGER.value, 'integer'
+    FLOAT = MappingType.FLOAT.value, 'float'
+    BOOLEAN = MappingType.BOOLEAN.value, 'boolean'
+    GEO_POINT = MappingType.GEO_POINT.value, 'geo_point'
+    URL = 'url', 'url'
 
-        client = elasticsearch(self.corpus.name)
-        try:
-            # we check if any fields exist for filtering named entities
-            ner_exists = client.search(
-                index=self.es_index, query={"exists": {"field": "*:ner-kw"}}, size=0
-            )
-            if total_hits(ner_exists):
-                return True
-        except:
-            return False
-        return False
-
-
-FIELD_DISPLAY_TYPES = [
-    ('text_content', 'text content'),
-    (MappingType.TEXT.value, 'text'),
-    (MappingType.KEYWORD.value, 'keyword'),
-    (MappingType.DATE.value, 'date'),
-    (MappingType.DATE_RANGE.value, 'date_range'),
-    (MappingType.INTEGER.value, 'integer'),
-    (MappingType.FLOAT.value, 'float'),
-    (MappingType.BOOLEAN.value, 'boolean'),
-    (MappingType.GEO_POINT.value, 'geo_point'),
-    ('url', 'url'),
-]
 
 FIELD_VISUALIZATIONS = [
     (VisualizationType.RESULTS_COUNT.value, 'Number of results'),
@@ -364,7 +361,7 @@ class Field(models.Model):
     )
     display_type = models.CharField(
         max_length=16,
-        choices=FIELD_DISPLAY_TYPES,
+        choices=FieldDisplayTypes.choices,
         help_text='as what type of data this field is rendered in the interface',
     )
     description = models.CharField(
 
@@ -187,20 +187,6 @@ def word_models_present(self):
         '''
         return self.word_model_path is not None and isdir(self.word_model_path)
 
-    @property
-    def new_highlight(self):
-        '''
-        if the corpus has been re-indexed using the top-level term vector 'with_positions_offsets'
-        for the main content field, needed for the updated highlighter
-        TODO: remove this property and its references when all corpora are reindexed using the
-        current definitions (with the top-level term vector for speech)
-        '''
-        try:
-            highlight_corpora = settings.NEW_HIGHLIGHT_CORPORA
-        except Exception:
-            return False
-        return self.title in highlight_corpora
-
     '''
     Allow the downloading of source images
     '''
 
@@ -1,9 +1,11 @@
 import os
 from django.db import transaction
 from django.core.files.images import ImageFile
-from datetime import date, datetime
+import warnings
 import sys
 
+from es.client import elasticsearch
+from es.search import total_hits
 from addcorpus.python_corpora.corpus import CorpusDefinition, FieldDefinition
 from addcorpus.models import Corpus, CorpusConfiguration, Field, CorpusDocumentationPage
 from addcorpus.python_corpora.load_corpus import load_all_corpus_definitions, corpus_dir
@@ -37,6 +39,7 @@ def _save_corpus_configuration(corpus: Corpus, corpus_definition: CorpusDefiniti
     _save_corpus_fields_in_database(corpus_definition, configuration)
     _save_corpus_image(corpus_definition, configuration)
     _save_corpus_documentation(corpus_definition, configuration)
+    _save_has_named_entities(configuration)
 
 def get_defined_attributes(object, attributes):
     get = lambda attr: object.__getattribute__(attr)
@@ -160,6 +163,23 @@ def _save_corpus_documentation(corpus_definition: CorpusDefinition, configuratio
                 pages.delete()
 
 
+def _save_has_named_entities(configuration: CorpusConfiguration):
+    '''Store whether the index of this corpus contains named entity (`*:ner-kw`) values.'''
+    has_named_entities = False
+    if any(field.name.endswith(':ner-kw') for field in configuration.fields.all()):
+        client = elasticsearch(configuration.corpus.name)
+        ner_query = {"exists": {"field": "*:ner-kw"}}
+        try:
+            ner_exists = client.search(index=configuration.es_index, query=ner_query, size=0)
+            has_named_entities = bool(total_hits(ner_exists))
+        except Exception as e:
+            # best effort: if the check itself fails, warn and leave the stored flag untouched
+            warnings.warn(Warning(f'Could not check named entities: {e}'))
+            return
+    configuration.has_named_entities = has_named_entities  # also clears a stale True
+    configuration.save()
+
+
 def _prepare_for_import(corpus: Corpus):
     corpus.has_python_definition = True
     corpus.active = False
 
@@ -78,11 +78,10 @@ def to_internal_value(self, data):
         return super().to_internal_value(value)
 
 class CorpusConfigurationSerializer(serializers.ModelSerializer):
-    fields = FieldSerializer(many=True)
+    fields = FieldSerializer(many=True, source='visible_fields')
     languages = serializers.ListField(child=LanguageField())
     category = PrettyChoiceField(choices=CATEGORIES)
     default_sort = NonEmptyJSONField()
-    has_named_entities = serializers.ReadOnlyField()
 
     class Meta:
         model = CorpusConfiguration
 
@@ -33,7 +33,7 @@ class DutchNewspapersPublic(XMLCorpusDefinition):
     description = "Collection of Dutch newspapers in the public domain, digitised by the Koninklijke Bibliotheek."
     min_date = datetime(year=1600, month=1, day=1)
     max_date = datetime(year=1876, month=12, day=31)
-    data_directory = settings.DUTCHNEWSPAPERS_DATA
+    data_directory = getattr(settings, 'DUTCHNEWSPAPERS_DATA', None)
     es_index = getattr(settings, 'DUTCHNEWSPAPERS_ES_INDEX', 'dutchnewspapers-public')
     image = 'dutchnewspapers.jpg'
     languages = ['nl']
@@ -110,7 +110,7 @@ def sources(self, start=min_date, end=max_date):
                         })
                         yield full_path, meta_dict
 
-    titlefile = join(corpus_dir('dutchnewspapers-public'), 'newspaper_titles.txt')
+    titlefile = join(os.path.dirname(__file__), 'newspaper_titles.txt')
     with open(titlefile, encoding='utf-8') as f:
         papers = f.readlines()
     paper_count = len(papers)
 
@@ -2,10 +2,10 @@
 from string import punctuation
 from typing import Iterable
 
-from ianalyzer_readers.extract import XML, Combined, Metadata
-from ianalyzer_readers.xml_tag import Tag
 from bs4.element import NavigableString, Tag as Node
 from bs4 import BeautifulSoup
+from ianalyzer_readers.extract import Combined, Metadata, XML
+from ianalyzer_readers.xml_tag import Tag
 
 from addcorpus.es_mappings import non_indexed_text_mapping, keyword_mapping
 from addcorpus.python_corpora.corpus import FieldDefinition