11import os
22import warnings
3+ from typing import Dict
34
45from django .conf import settings
56from langcodes import Language , standardize_tag
@@ -85,27 +86,30 @@ def es_settings(languages=[], stopword_analysis=False, stemming_analysis=False):
8586 for language in languages :
8687 # do not attach language isocodes if there is just one language
8788
89+ tag = standardize_tag (language , macro = True )
90+
8891 if stopword_analysis or stemming_analysis :
89- if not set_stopword_filter (settings , add_language_string (stopword_filter_name , language ), language ):
92+ if not set_stopword_filter (settings , add_language_string (stopword_filter_name , tag ), tag ):
9093 continue # skip languages for which we do not have a stopword list
9194
9295 if stopword_analysis :
9396 set_clean_analyzer (
9497 settings ,
95- add_language_string (stopword_filter_name , language ),
96- add_language_string (clean_analyzer_name , language ),
98+ tag ,
99+ add_language_string (stopword_filter_name , tag ),
100+ add_language_string (clean_analyzer_name , tag ),
97101 )
98102 if stemming_analysis :
99- if not stemming_available (language ):
103+ if not stemming_available (tag ):
100104 warnings .warn ('You specified `stemming_analysis=True`, but \
101105 there is no stemmer available for this language' )
102106 continue
103107 set_stemmed_analyzer (
104108 settings ,
105- add_language_string ( stopword_filter_name , language ) ,
106- add_language_string (stemmer_filter_name , language ),
107- add_language_string (stemmed_analyzer_name , language ),
108- language
109+ tag ,
110+ add_language_string (stopword_filter_name , tag ),
111+ add_language_string (stemmer_filter_name , tag ),
112+ add_language_string ( stemmed_analyzer_name , tag ),
109113 )
110114
111115 return settings
@@ -127,12 +131,30 @@ def make_stopword_filter(language):
127131 except :
128132 return None
129133
130- def make_clean_analyzer (stopword_filter_name ):
131- return {
132- "tokenizer" : "standard" ,
133- "char_filter" : ["number_filter" ],
134- "filter" : ["lowercase" , stopword_filter_name ]
135- }
134+ def _standard_analyzer (language : str ):
135+ '''
136+ Basic analyzer for a language.
137+ '''
138+ if language in ['zh' , 'ja' , 'ko' ]:
139+ return {
140+ 'tokenizer' : 'standard' ,
141+ 'filter' : [
142+ 'cjk_width' ,
143+ 'lowercase' ,
144+ ]
145+ }
146+ else :
147+ return {
148+ 'tokenizer' : 'standard' ,
149+ 'char_filter' : ['number_filter' ],
150+ 'filter' : ['lowercase' ]
151+ }
152+
def make_clean_analyzer(language: str, stopword_filter_name: str) -> Dict:
    '''
    Build the "clean" analyzer for *language*: the basic standard
    analyzer with the named stopword filter appended as the final
    token filter.
    '''
    cleaned = _standard_analyzer(language)
    cleaned['filter'] = cleaned['filter'] + [stopword_filter_name]
    return cleaned
136158
137159def make_stemmer_filter (language ):
138160 stemmer_language = get_language_key (language )
@@ -141,12 +163,13 @@ def make_stemmer_filter(language):
141163 "language" : stemmer_language
142164 }
143165
def make_stemmed_analyzer(
    language: str, stopword_filter_name: str, stemmer_filter_name: str
) -> Dict:
    '''
    Build the stemmed analyzer for *language*: the clean analyzer
    (basic + stopword filter) with the named stemmer filter appended
    as the last token filter, so stemming runs on stopword-free,
    lowercased tokens.
    '''
    stemmed = make_clean_analyzer(language, stopword_filter_name)
    stemmed['filter'] = stemmed['filter'] + [stemmer_filter_name]
    return stemmed
150173
151174def get_stopwords_from_settings (es_settings , analyzer ):
152175 try :
@@ -158,12 +181,19 @@ def get_stopwords_from_settings(es_settings, analyzer):
158181 except :
159182 return []
160183
def set_stemmed_analyzer(
    settings: Dict,
    language: str,
    stopword_filter_name: str,
    stemmer_filter_name: str,
    stemmed_analyzer_name: str,
) -> None:
    '''
    Register a stemmer token filter and the corresponding stemmed
    analyzer for *language* in the Elasticsearch *settings* dict.

    *settings* is mutated in place; existing entries under
    settings['analysis']['filter'] / ['analyzer'] are preserved.
    '''
    filters = settings['analysis'].get('filter', {})
    filters[stemmer_filter_name] = make_stemmer_filter(language)
    settings['analysis']['filter'] = filters
    # Fix: default to {} so a settings dict without an 'analyzer' section
    # does not yield None and crash on the update below — mirrors the
    # 'filter' lookup above and the same lookup in set_clean_analyzer.
    analyzers = settings['analysis'].get('analyzer', {})
    analyzers[stemmed_analyzer_name] = make_stemmed_analyzer(
        language, stopword_filter_name, stemmer_filter_name)
    settings['analysis']['analyzer'] = analyzers
168198
169199def set_char_filter (settings ):
@@ -182,8 +212,13 @@ def set_stopword_filter(settings, stopword_filter_name, language):
182212 settings ['analysis' ]['filter' ] = filters
183213 return True
184214
def set_clean_analyzer(
    settings: Dict,
    language: str,
    stopword_filter_name: str,
    clean_analyzer_name: str,
) -> None:
    '''
    Attach the "clean" (stopword-filtered) analyzer for *language* to
    the Elasticsearch *settings* dict under *clean_analyzer_name*.

    *settings* is mutated in place; other registered analyzers are kept.
    '''
    registered = settings['analysis'].get('analyzer', {})
    registered[clean_analyzer_name] = make_clean_analyzer(language, stopword_filter_name)
    settings['analysis']['analyzer'] = registered
0 commit comments