Skip to content

Commit 7825aaa

Browse files
Merge branch 'release/5.22.0'
2 parents 3e30959 + dee4a7b commit 7825aaa

File tree

93 files changed

+1468
-885
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

93 files changed

+1468
-885
lines changed

CITATION.cff

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,5 +35,5 @@ keywords:
3535
- elasticsearch
3636
- natural language processing
3737
license: MIT
38-
version: 5.20.0
39-
date-released: '2025-08-07'
38+
version: 5.22.0
39+
date-released: '2025-09-03'

backend/addcorpus/constants.py

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -46,21 +46,17 @@ class VisualizationType(Enum):
4646
'visualizedField',
4747
'normalize',
4848
'ngramSettings',
49-
'scan',
50-
'tab-scan'
5149
'p',
5250
'tags',
53-
'context',
5451
'tab',
52+
'document_link',
5553
]
5654
'''
5755
Field names that cannot be used because they interfere with other functionality.
5856
5957
This is usually because they are also query parameters in frontend routes, and using them
6058
would make routing ambiguous.
6159
62-
`query` is also forbidden because it is a reserved column in CSV downloads. Likewise,
63-
`context` is forbidden because it's used in download requests.
60+
`query` and `document_link` are forbidden because they are reserved columns in CSV downloads.
6461
65-
`scan` and `tab-scan` are added because they interfere with element IDs in the DOM.
6662
'''
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
# Generated by Django 4.2.22 on 2025-08-21 12:28
2+
3+
from django.db import migrations, models
4+
5+
6+
class Migration(migrations.Migration):
7+
8+
dependencies = [
9+
('addcorpus', '0031_corpus_owner'),
10+
]
11+
12+
operations = [
13+
migrations.AddField(
14+
model_name='corpusconfiguration',
15+
name='has_named_entities',
16+
field=models.BooleanField(default=False, help_text='whether this corpus has named entity annotations'),
17+
),
18+
]

backend/addcorpus/models.py

Lines changed: 25 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -254,6 +254,10 @@ class CorpusConfiguration(models.Model):
254254
default=False,
255255
help_text='whether this corpus has word models',
256256
)
257+
has_named_entities = models.BooleanField(
258+
default=False,
259+
help_text='whether this corpus has named entity annotations',
260+
)
257261
default_sort = models.JSONField(
258262
blank=True,
259263
validators=[validate_sort_configuration],
@@ -298,34 +302,27 @@ def clean(self):
298302
])
299303

300304
@property
301-
def has_named_entities(self):
302-
from es.search import total_hits
305+
def visible_fields(self) -> models.QuerySet['Field']:
306+
fields = self.fields.all()
307+
308+
if not self.has_named_entities:
309+
fields = fields.exclude(name__endswith=':ner-kw').exclude(name__endswith=':ner')
310+
311+
return fields
312+
313+
314+
class FieldDisplayTypes(models.TextChoices):
315+
TEXT_CONTENT = ('text_content', 'text content')
316+
TEXT = (MappingType.TEXT.value, 'text')
317+
KEYWORD = (MappingType.KEYWORD.value, 'keyword')
318+
DATE = (MappingType.DATE.value, 'date')
319+
DATE_RANGE = (MappingType.DATE_RANGE.value, 'date_range')
320+
INTEGER = (MappingType.INTEGER.value, 'integer')
321+
FLOAT = (MappingType.FLOAT.value, 'float')
322+
BOOLEAN = (MappingType.BOOLEAN.value, 'boolean')
323+
GEO_POINT = (MappingType.GEO_POINT.value, 'geo_point')
324+
URL = ('url', 'url')
303325

304-
client = elasticsearch(self.corpus.name)
305-
try:
306-
# we check if any fields exist for filtering named entities
307-
ner_exists = client.search(
308-
index=self.es_index, query={"exists": {"field": "*:ner-kw"}}, size=0
309-
)
310-
if total_hits(ner_exists):
311-
return True
312-
except:
313-
return False
314-
return False
315-
316-
317-
FIELD_DISPLAY_TYPES = [
318-
('text_content', 'text content'),
319-
(MappingType.TEXT.value, 'text'),
320-
(MappingType.KEYWORD.value, 'keyword'),
321-
(MappingType.DATE.value, 'date'),
322-
(MappingType.DATE_RANGE.value, 'date_range'),
323-
(MappingType.INTEGER.value, 'integer'),
324-
(MappingType.FLOAT.value, 'float'),
325-
(MappingType.BOOLEAN.value, 'boolean'),
326-
(MappingType.GEO_POINT.value, 'geo_point'),
327-
('url', 'url'),
328-
]
329326

330327
FIELD_VISUALIZATIONS = [
331328
(VisualizationType.RESULTS_COUNT.value, 'Number of results'),
@@ -364,7 +361,7 @@ class Field(models.Model):
364361
)
365362
display_type = models.CharField(
366363
max_length=16,
367-
choices=FIELD_DISPLAY_TYPES,
364+
choices=FieldDisplayTypes.choices,
368365
help_text='as what type of data this field is rendered in the interface',
369366
)
370367
description = models.CharField(

backend/addcorpus/python_corpora/corpus.py

Lines changed: 0 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -187,20 +187,6 @@ def word_models_present(self):
187187
'''
188188
return self.word_model_path is not None and isdir(self.word_model_path)
189189

190-
@property
191-
def new_highlight(self):
192-
'''
193-
if the corpus has been re-indexed using the top-level term vector 'with_positions_offsets'
194-
for the main content field, needed for the updated highlighter
195-
TODO: remove this property and its references when all corpora are reindexed using the
196-
current definitions (with the top-level term vector for speech)
197-
'''
198-
try:
199-
highlight_corpora = settings.NEW_HIGHLIGHT_CORPORA
200-
except Exception:
201-
return False
202-
return self.title in highlight_corpora
203-
204190
'''
205191
Allow the downloading of source images
206192
'''

backend/addcorpus/python_corpora/save_corpus.py

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,11 @@
11
import os
22
from django.db import transaction
33
from django.core.files.images import ImageFile
4-
from datetime import date, datetime
4+
import warnings
55
import sys
66

7+
from es.client import elasticsearch
8+
from es.search import total_hits
79
from addcorpus.python_corpora.corpus import CorpusDefinition, FieldDefinition
810
from addcorpus.models import Corpus, CorpusConfiguration, Field, CorpusDocumentationPage
911
from addcorpus.python_corpora.load_corpus import load_all_corpus_definitions, corpus_dir
@@ -37,6 +39,7 @@ def _save_corpus_configuration(corpus: Corpus, corpus_definition: CorpusDefiniti
3739
_save_corpus_fields_in_database(corpus_definition, configuration)
3840
_save_corpus_image(corpus_definition, configuration)
3941
_save_corpus_documentation(corpus_definition, configuration)
42+
_save_has_named_entities(configuration)
4043

4144
def get_defined_attributes(object, attributes):
4245
get = lambda attr: object.__getattribute__(attr)
@@ -160,6 +163,23 @@ def _save_corpus_documentation(corpus_definition: CorpusDefinition, configuratio
160163
pages.delete()
161164

162165

166+
def _save_has_named_entities(configuration: CorpusConfiguration):
167+
# we check if any fields exist for filtering named entities
168+
if any(field.name.endswith(':ner-kw') for field in configuration.fields.all()):
169+
client = elasticsearch(configuration.corpus.name)
170+
try:
171+
ner_exists = client.search(
172+
index=configuration.es_index,
173+
query={"exists": {"field": "*:ner-kw"}},
174+
size=0
175+
)
176+
if total_hits(ner_exists):
177+
configuration.has_named_entities = True
178+
configuration.save()
179+
except Exception as e:
180+
warnings.warn(Warning('Could not check named enities:', e))
181+
182+
163183
def _prepare_for_import(corpus: Corpus):
164184
corpus.has_python_definition = True
165185
corpus.active = False

backend/addcorpus/serializers.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -78,11 +78,10 @@ def to_internal_value(self, data):
7878
return super().to_internal_value(value)
7979

8080
class CorpusConfigurationSerializer(serializers.ModelSerializer):
81-
fields = FieldSerializer(many=True)
81+
fields = FieldSerializer(many=True, source='visible_fields')
8282
languages = serializers.ListField(child=LanguageField())
8383
category = PrettyChoiceField(choices=CATEGORIES)
8484
default_sort = NonEmptyJSONField()
85-
has_named_entities = serializers.ReadOnlyField()
8685

8786
class Meta:
8887
model = CorpusConfiguration

backend/corpora/dutchnewspapers/dutchnewspapers_public.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ class DutchNewspapersPublic(XMLCorpusDefinition):
3333
description = "Collection of Dutch newspapers in the public domain, digitised by the Koninklijke Bibliotheek."
3434
min_date = datetime(year=1600, month=1, day=1)
3535
max_date = datetime(year=1876, month=12, day=31)
36-
data_directory = settings.DUTCHNEWSPAPERS_DATA
36+
data_directory = getattr(settings, 'DUTCHNEWSPAPERS_DATA', None)
3737
es_index = getattr(settings, 'DUTCHNEWSPAPERS_ES_INDEX', 'dutchnewspapers-public')
3838
image = 'dutchnewspapers.jpg'
3939
languages = ['nl']
@@ -110,7 +110,7 @@ def sources(self, start=min_date, end=max_date):
110110
})
111111
yield full_path, meta_dict
112112

113-
titlefile = join(corpus_dir('dutchnewspapers-public'), 'newspaper_titles.txt')
113+
titlefile = join(os.path.dirname(__file__), 'newspaper_titles.txt')
114114
with open(titlefile, encoding='utf-8') as f:
115115
papers = f.readlines()
116116
paper_count = len(papers)

backend/corpora/parliament/utils/parlamint_v4.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,10 @@
22
from string import punctuation
33
from typing import Iterable
44

5-
from ianalyzer_readers.extract import XML, Combined, Metadata
6-
from ianalyzer_readers.xml_tag import Tag
75
from bs4.element import NavigableString, Tag as Node
86
from bs4 import BeautifulSoup
7+
from ianalyzer_readers.extract import Combined, Metadata, XML
8+
from ianalyzer_readers.xml_tag import Tag
99

1010
from addcorpus.es_mappings import non_indexed_text_mapping, keyword_mapping
1111
from addcorpus.python_corpora.corpus import FieldDefinition

backend/corpora/traces_of_sound/__init__.py

Whitespace-only changes.

0 commit comments

Comments
 (0)