
Commit 5af31c8

Merge branch 'release/5.28.0'

2 parents: 5ec1be3 + 4e501f6

62 files changed: +1037 additions, -441 deletions


CITATION.cff
Lines changed: 2 additions & 2 deletions

@@ -35,5 +35,5 @@ keywords:
 - elasticsearch
 - natural language processing
 license: MIT
-version: 5.27.1
-date-released: '2026-02-02'
+version: 5.28.0
+date-released: '2026-02-20'

backend/addcorpus/es_mappings.py
Lines changed: 10 additions & 4 deletions

@@ -1,5 +1,6 @@
 from typing import Dict
 from addcorpus.es_settings import add_language_string, stopwords_available, stemming_available
+from langcodes import standardize_tag
 
 def primary_mapping_type(es_mapping: Dict) -> str:
     return es_mapping.get('type', None)
@@ -26,16 +27,21 @@ def main_content_mapping(
                 "type": "token_count",
                 "analyzer": "standard"
             }
-        if stopword_analysis and stopwords_available(language):
+
+        if not language:
+            return mapping
+        tag = standardize_tag(language, macro=True)
+
+        if stopword_analysis and stopwords_available(tag):
             multifields['clean'] = {
                 "type": "text",
-                "analyzer": add_language_string('clean', language),
+                "analyzer": add_language_string('clean', tag),
                 "term_vector": "with_positions_offsets" # include character positions for highlighting
             }
-        if stemming_analysis and stemming_available(language):
+        if stemming_analysis and stemming_available(tag):
             multifields['stemmed'] = {
                 "type": "text",
-                "analyzer": add_language_string('stemmed', language),
+                "analyzer": add_language_string('stemmed', tag),
                 "term_vector": "with_positions_offsets",
             }
         mapping['fields'] = multifields
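The availability checks in `main_content_mapping` now run on a normalized tag rather than the raw language code from the corpus definition. A minimal sketch of what `langcodes.standardize_tag` does here, based on the library's documented behavior (the example tags are illustrative, not taken from the commit):

```python
from langcodes import standardize_tag

# standardize_tag canonicalizes a BCP 47 language tag; with macro=True it
# also collapses a specific language into its macrolanguage, so lookups in
# per-language stopword and stemmer tables match more input variants.
print(standardize_tag('en_US'))                 # -> 'en-US'
print(standardize_tag('eng'))                   # -> 'en'
print(standardize_tag('cmn-hans', macro=True))  # -> 'zh-Hans' (Mandarin -> Chinese)
```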

backend/addcorpus/es_settings.py
Lines changed: 59 additions & 24 deletions
@@ -1,5 +1,6 @@
 import os
 import warnings
+from typing import Dict
 
 from django.conf import settings
 from langcodes import Language, standardize_tag
@@ -85,27 +86,30 @@ def es_settings(languages=[], stopword_analysis=False, stemming_analysis=False):
     for language in languages:
         # do not attach language isocodes if there is just one language
 
+        tag = standardize_tag(language, macro=True)
+
         if stopword_analysis or stemming_analysis:
-            if not set_stopword_filter(settings, add_language_string(stopword_filter_name, language), language):
+            if not set_stopword_filter(settings, add_language_string(stopword_filter_name, tag), tag):
                 continue # skip languages for which we do not have a stopword list
 
             if stopword_analysis:
                 set_clean_analyzer(
                     settings,
-                    add_language_string(stopword_filter_name, language),
-                    add_language_string(clean_analyzer_name, language),
+                    tag,
+                    add_language_string(stopword_filter_name, tag),
+                    add_language_string(clean_analyzer_name, tag),
                 )
             if stemming_analysis:
-                if not stemming_available(language):
+                if not stemming_available(tag):
                     warnings.warn('You specified `stemming_analysis=True`, but \
                         there is no stemmer available for this language')
                     continue
                 set_stemmed_analyzer(
                     settings,
-                    add_language_string(stopword_filter_name, language),
-                    add_language_string(stemmer_filter_name, language),
-                    add_language_string(stemmed_analyzer_name, language),
-                    language
+                    tag,
+                    add_language_string(stopword_filter_name, tag),
+                    add_language_string(stemmer_filter_name, tag),
+                    add_language_string(stemmed_analyzer_name, tag),
                 )
 
     return settings
@@ -127,12 +131,30 @@ def make_stopword_filter(language):
     except:
         return None
 
-def make_clean_analyzer(stopword_filter_name):
-    return {
-        "tokenizer": "standard",
-        "char_filter": ["number_filter"],
-        "filter": ["lowercase", stopword_filter_name]
-    }
+def _standard_analyzer(language: str):
+    '''
+    Basic analyzer for a language.
+    '''
+    if language in ['zh', 'ja', 'ko']:
+        return {
+            'tokenizer': 'standard',
+            'filter': [
+                'cjk_width',
+                'lowercase',
+            ]
+        }
+    else:
+        return {
+            'tokenizer': 'standard',
+            'char_filter': ['number_filter'],
+            'filter': ['lowercase']
+        }
+
+def make_clean_analyzer(language: str, stopword_filter_name: str) -> Dict:
+    analyzer = _standard_analyzer(language)
+    analyzer['filter'].append(stopword_filter_name)
+    return analyzer
+
 
 def make_stemmer_filter(language):
     stemmer_language = get_language_key(language)
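To see what this rework changes: `make_clean_analyzer` now builds on a per-language base analyzer, so CJK languages get the `cjk_width` token filter instead of the `number_filter` char filter. A quick sketch of the dicts the new functions return, computed directly from the hunk above (the stopword filter names are made-up placeholders; the real ones come from `add_language_string`):

```python
from addcorpus.es_settings import make_clean_analyzer

# Non-CJK: standard base analyzer plus the stopword filter appended.
print(make_clean_analyzer('nl', 'stopwords_nl'))
# {'tokenizer': 'standard',
#  'char_filter': ['number_filter'],
#  'filter': ['lowercase', 'stopwords_nl']}

# CJK: cjk_width normalization, no number_filter char filter.
print(make_clean_analyzer('zh', 'stopwords_zh'))
# {'tokenizer': 'standard',
#  'filter': ['cjk_width', 'lowercase', 'stopwords_zh']}
```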
@@ -141,12 +163,13 @@ def make_stemmer_filter(language):
         "language": stemmer_language
     }
 
-def make_stemmed_analyzer(stopword_filter_name, stemmer_filter_name):
-    return {
-        "tokenizer": "standard",
-        "char_filter": ["number_filter"],
-        "filter": ["lowercase", stopword_filter_name, stemmer_filter_name]
-    }
+def make_stemmed_analyzer(
+    language: str, stopword_filter_name: str, stemmer_filter_name: str
+) -> Dict:
+    analyzer = make_clean_analyzer(language, stopword_filter_name)
+    analyzer['filter'].append(stemmer_filter_name)
+    return analyzer
+
 
 def get_stopwords_from_settings(es_settings, analyzer):
     try:
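`make_stemmed_analyzer` is now defined in terms of `make_clean_analyzer`, so a stemmed analyzer is always the language's clean analyzer with the stemmer filter appended last. A sketch with placeholder filter names (again, the real names come from `add_language_string`):

```python
from addcorpus.es_settings import make_stemmed_analyzer

# Placeholder filter names for illustration only.
analyzer = make_stemmed_analyzer('en', 'stopwords_en', 'stemmer_en')
print(analyzer)
# {'tokenizer': 'standard',
#  'char_filter': ['number_filter'],
#  'filter': ['lowercase', 'stopwords_en', 'stemmer_en']}
```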
@@ -158,12 +181,19 @@ def get_stopwords_from_settings(es_settings, analyzer):
     except:
         return []
 
-def set_stemmed_analyzer(settings, stopword_filter_name, stemmer_filter_name, stemmed_analyzer_name, language):
+def set_stemmed_analyzer(
+    settings: Dict,
+    language: str,
+    stopword_filter_name: str,
+    stemmer_filter_name: str,
+    stemmed_analyzer_name: str,
+) -> None:
     filters = settings['analysis'].get('filter', {})
     filters.update({stemmer_filter_name: make_stemmer_filter(language)})
     settings['analysis']['filter'] = filters
     analyzers = settings['analysis'].get('analyzer')
-    analyzers.update({stemmed_analyzer_name: make_stemmed_analyzer(stopword_filter_name, stemmer_filter_name)})
+    analyzers.update({stemmed_analyzer_name: make_stemmed_analyzer(
+        language, stopword_filter_name, stemmer_filter_name)})
     settings['analysis']['analyzer'] = analyzers
 
 def set_char_filter(settings):
@@ -182,8 +212,13 @@ def set_stopword_filter(settings, stopword_filter_name, language):
     settings['analysis']['filter'] = filters
     return True
 
-def set_clean_analyzer(settings, stopword_filter_name, clean_analyzer_name):
-    clean_analyzer = make_clean_analyzer(stopword_filter_name)
+def set_clean_analyzer(
+    settings: Dict,
+    language: str,
+    stopword_filter_name: str,
+    clean_analyzer_name: str,
+) -> None:
+    clean_analyzer = make_clean_analyzer(language, stopword_filter_name)
     analyzers = settings['analysis'].get('analyzer', {})
     analyzers.update({clean_analyzer_name: clean_analyzer})
     settings["analysis"]['analyzer'] = analyzers
