Commit d77ee3b

Allow proper definitions of multiplexer and conditional token filters
Including propagating the nested filters' definitions into the index settings. Fixes #1212
1 parent e1b4653 commit d77ee3b
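
For context, a sketch of the intended usage, modeled directly on the new tests below: token filters that wrap other filters (multiplexer, condition) can now be defined from nested token_filter objects, and the nested definitions are hoisted into the top-level filter section of the analysis settings.

    # Sketch modeled on test_multiplexer_with_custom_filter below.
    from elasticsearch_dsl import analysis

    en = analysis.token_filter('en', 'snowball', language='English')
    a = analysis.analyzer(
        'my_analyzer',
        tokenizer='keyword',
        filter=[
            analysis.token_filter('my_multi', 'multiplexer',
                                  filters=[[en], 'lowercase, stop'])
        ]
    )

    # The nested 'en' filter is propagated alongside 'my_multi':
    a.get_analysis_definition()['filter']
    # {'en': {'type': 'snowball', 'language': 'English'},
    #  'my_multi': {'filters': ['en', 'lowercase, stop'], 'type': 'multiplexer'}}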

3 files changed: +227 −3 lines


elasticsearch_dsl/analysis.py

Lines changed: 62 additions & 2 deletions
@@ -1,7 +1,7 @@
 import six
 
 from .connections import get_connection
-from .utils import AttrDict, DslBase
+from .utils import AttrDict, DslBase, merge
 
 __all__ = [
     'tokenizer', 'analyzer', 'char_filter', 'token_filter', 'normalizer'
@@ -18,7 +18,7 @@ def _type_shortcut(cls, name_or_instance, type=None, **kwargs):
         if not (type or kwargs):
             return cls.get_dsl_class('builtin')(name_or_instance)
 
-        return cls.get_dsl_class('custom')(name_or_instance, type or 'custom', **kwargs)
+        return cls.get_dsl_class(type, 'custom')(name_or_instance, type or 'custom', **kwargs)
 
 class CustomAnalysis(object):
     name = 'custom'
@@ -50,6 +50,13 @@ def get_analysis_definition(self):
         if filters:
             out['filter'] = filters
 
+        # any sub filter definitions like multiplexers etc?
+        for f in self.filter:
+            if hasattr(f, 'get_analysis_definition'):
+                d = f.get_analysis_definition()
+                if d:
+                    merge(out, d, True)
+
         char_filters = {f._name: f.get_definition()
                         for f in self.char_filter if hasattr(f, 'get_definition')}
         if char_filters:
@@ -154,6 +161,59 @@ class BuiltinTokenFilter(BuiltinAnalysis, TokenFilter):
 class CustomTokenFilter(CustomAnalysis, TokenFilter):
     pass
 
+class MultiplexerTokenFilter(CustomTokenFilter):
+    name = 'multiplexer'
+
+    def get_definition(self):
+        d = super(CustomTokenFilter, self).get_definition()
+
+        if 'filters' in d:
+            d['filters'] = [
+                # comma delimited string given by user
+                fs if isinstance(fs, six.string_types) else
+                # list of strings or TokenFilter objects
+                ', '.join(f.to_dict() if hasattr(f, 'to_dict') else f for f in fs)
+
+                for fs in self.filters
+            ]
+        return d
+
+    def get_analysis_definition(self):
+        if not hasattr(self, 'filters'):
+            return {}
+
+        fs = {}
+        d = {'filter': fs}
+        for filters in self.filters:
+            if isinstance(filters, six.string_types):
+                continue
+            fs.update({f._name: f.get_definition()
+                       for f in filters if hasattr(f, 'get_definition')})
+        return d
+
+class ConditionalTokenFilter(CustomTokenFilter):
+    name = 'condition'
+
+    def get_definition(self):
+        d = super(CustomTokenFilter, self).get_definition()
+        if 'filter' in d:
+            d['filter'] = [
+                f.to_dict() if hasattr(f, 'to_dict') else f
+                for f in self.filter
+            ]
+        return d
+
+    def get_analysis_definition(self):
+        if not hasattr(self, 'filter'):
+            return {}
+
+        return {
+            'filter': {
+                f._name: f.get_definition()
+                for f in self.filter if hasattr(f, 'get_definition')
+            }
+        }
+
 
 class CharFilter(AnalysisBase, DslBase):
     _type_name = 'char_filter'
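
The merge(out, d, True) call in get_analysis_definition is what propagates the nested definitions; the trailing True is, as far as this diff shows, a raise-on-conflict flag, so two different filters registered under the same name surface as a ValueError instead of one silently overwriting the other. A minimal sketch of that behavior, assuming merge has the signature merge(dst, src, raise_on_conflict=False) in utils.py:

    # Hedged sketch; assumes merge() deep-merges dicts and, with the flag
    # set, raises on differing leaf values.
    from elasticsearch_dsl.utils import merge

    out = {'filter': {'en': {'type': 'stemmer', 'language': 'english'}}}
    nested = {'filter': {'en': {'type': 'snowball', 'language': 'English'}}}

    try:
        merge(out, nested, True)
    except ValueError:
        # the two 'en' definitions conflict, mirroring
        # test_conflicting_nested_filters_cause_error below
        pass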

elasticsearch_dsl/utils.py

Lines changed: 3 additions & 1 deletion
@@ -214,10 +214,12 @@ class DslBase(object):
     _param_defs = {}
 
     @classmethod
-    def get_dsl_class(cls, name):
+    def get_dsl_class(cls, name, default=None):
         try:
             return cls._classes[name]
         except KeyError:
+            if default is not None:
+                return cls._classes[default]
             raise UnknownDslObject('DSL class `{}` does not exist in {}.'.format(name, cls._type_name))
 
     def __init__(self, _expand__to_dot=EXPAND__TO_DOT, **params):
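
With the default fallback in place, _type_shortcut in analysis.py can dispatch on the filter type: a type registered under its own name attribute (such as 'multiplexer' or 'condition') resolves to its dedicated subclass, while any other type still falls back to the generic custom class. Roughly, per the diffs above:

    # Illustration of the dispatch; class names are taken from analysis.py.
    from elasticsearch_dsl import analysis

    mf = analysis.token_filter('my_multi', 'multiplexer', filters=['lowercase'])
    type(mf).__name__   # 'MultiplexerTokenFilter' -- registered under name='multiplexer'

    sf = analysis.token_filter('my_snow', 'snowball', language='English')
    type(sf).__name__   # 'CustomTokenFilter' -- unregistered types use the default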

test_elasticsearch_dsl/test_analysis.py

Lines changed: 162 additions & 0 deletions
@@ -1,6 +1,8 @@
 # coding: utf-8
 from elasticsearch_dsl import analysis
 
+from pytest import raises
+
 def test_analyzer_serializes_as_name():
     a = analysis.analyzer('my_analyzer')
 
@@ -19,6 +21,166 @@ def test_analyzer_has_definition():
         'filter': ["lowercase"],
     } == a.get_definition()
 
+def test_simple_multiplexer_filter():
+    a = analysis.analyzer(
+        'my_analyzer',
+        tokenizer='keyword',
+        filter=[
+            analysis.token_filter(
+                'my_multi',
+                'multiplexer',
+                filters=['lowercase', 'lowercase, stop']
+            )
+        ]
+    )
+
+    assert {
+        "analyzer": {
+            "my_analyzer": {
+                "filter": ["my_multi"],
+                "tokenizer": "keyword",
+                "type": "custom"
+            }
+        },
+        "filter": {
+            "my_multi": {
+                "filters": ["lowercase", "lowercase, stop"],
+                "type": "multiplexer"
+            }
+        }
+    } == a.get_analysis_definition()
+
+def test_multiplexer_with_custom_filter():
+    a = analysis.analyzer(
+        'my_analyzer',
+        tokenizer='keyword',
+        filter=[
+            analysis.token_filter(
+                'my_multi',
+                'multiplexer',
+                filters=[
+                    [analysis.token_filter('en', 'snowball', language='English')],
+                    'lowercase, stop'
+                ]
+            )
+        ]
+    )
+
+    assert {
+        "analyzer": {
+            "my_analyzer": {
+                "filter": ["my_multi"],
+                "tokenizer": "keyword",
+                "type": "custom"
+            }
+        },
+        "filter": {
+            "en": {
+                "type": "snowball",
+                "language": "English"
+            },
+            "my_multi": {
+                "filters": ["en", "lowercase, stop"],
+                "type": "multiplexer"
+            }
+        }
+    } == a.get_analysis_definition()
+
+def test_conditional_token_filter():
+    a = analysis.analyzer(
+        'my_cond',
+        tokenizer=analysis.tokenizer('keyword'),
+        filter=[
+            analysis.token_filter(
+                'testing',
+                'condition',
+                script={'source': 'return true'},
+                filter=[
+                    'lowercase',
+                    analysis.token_filter('en', 'snowball', language='English')
+                ]
+            ),
+            'stop'
+        ]
+    )
+
+    assert {
+        "analyzer": {
+            "my_cond": {
+                "filter": ["testing", "stop"],
+                "tokenizer": "keyword",
+                "type": "custom"
+            }
+        },
+        "filter": {
+            "en": {
+                "language": "English",
+                "type": "snowball"
+            },
+            "testing": {
+                "script": {"source": "return true"},
+                "filter": ["lowercase", "en"],
+                "type": "condition"
+            }
+        }
+    } == a.get_analysis_definition()
+
+def test_conflicting_nested_filters_cause_error():
+    a = analysis.analyzer(
+        'my_cond',
+        tokenizer=analysis.tokenizer('keyword'),
+        filter=[
+            analysis.token_filter('en', 'stemmer', language='english'),
+            analysis.token_filter(
+                'testing',
+                'condition',
+                script={'source': 'return true'},
+                filter=[
+                    'lowercase',
+                    analysis.token_filter('en', 'snowball', language='English')
+                ]
+            )
+        ]
+    )
+
+    with raises(ValueError):
+        a.get_analysis_definition()
+
+
 def test_normalizer_serializes_as_name():
     n = analysis.normalizer('my_normalizer')
 
