+  /** Maximum subword character length. Longer subword tokens are excluded from the output. Defaults to `15`. */
   max_subword_size?: integer
+  /** Minimum subword character length. Shorter subword tokens are excluded from the output. Defaults to `2`. */
   min_subword_size?: integer
+  /** Minimum word character length. Shorter word tokens are excluded from the output. Defaults to `5`. */
   min_word_size?: integer
+  /** If `true`, only include the longest matching subword. Defaults to `false`. */
   only_longest_match?: boolean
+  /** A list of subwords to look for in the token stream. If found, the subword is included in the token output.
+   * Either this parameter or `word_list_path` must be specified. */
   word_list?: string[]
+  /** Path to a file that contains a list of subwords to find in the token stream. If found, the subword is included in the token output.
+   * This path must be absolute or relative to the `config` location, and the file must be UTF-8 encoded. Each token in the file must be separated by a line break.
+   * Either this parameter or `word_list` must be specified. */
   word_list_path?: string
 }
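
For orientation, here is a minimal sketch of how these compound-word parameters could be wired into index analysis settings; the filter name, analyzer name, and word list are illustrative assumptions, not part of this diff:

// Hypothetical index settings exercising the compound-word parameters above.
const compoundSettings = {
  analysis: {
    filter: {
      german_decompounder: {
        type: 'dictionary_decompounder',
        // Subwords to look for; word_list_path could be used instead.
        word_list: ['donau', 'dampf', 'schiff', 'fahrt'],
        min_word_size: 5,     // skip words shorter than 5 characters
        min_subword_size: 2,  // drop subwords shorter than 2 characters
        max_subword_size: 15, // drop subwords longer than 15 characters
        only_longest_match: false
      }
    },
    analyzer: {
      german_compound: {
        tokenizer: 'standard',
        filter: ['lowercase', 'german_decompounder']
      }
    }
  }
}
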
@@ -56,6 +64,13 @@ export class DictionaryDecompounderTokenFilter extends CompoundWordTokenFilterBase
+  /** String used in shingles as a replacement for empty positions that do not contain a token. This filler token is only used in shingles, not original unigrams. Defaults to an underscore (`_`). */
   filler_token?: string
-  max_shingle_size?: integer | string // TODO: should be only int
-  min_shingle_size?: integer | string // TODO: should be only int
+  /** Maximum number of tokens to concatenate when creating shingles. Defaults to `2`. */
+  max_shingle_size?: Stringified<integer>
+  /** Minimum number of tokens to concatenate when creating shingles. Defaults to `2`. */
+  min_shingle_size?: Stringified<integer>
+  /** If `true`, the output includes the original input tokens. If `false`, the output only includes shingles; the original input tokens are removed. Defaults to `true`. */
   output_unigrams?: boolean
+  /** If `true`, the output includes the original input tokens only if no shingles are produced; if shingles are produced, the output only includes shingles. Defaults to `false`. */
   output_unigrams_if_no_shingles?: boolean
+  /** Separator used to concatenate adjacent tokens to form a shingle. Defaults to a space (`" "`). */
   token_separator?: string
 }
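
As a usage sketch (the filter name is an illustrative assumption): with `min_shingle_size: 2`, `max_shingle_size: 3`, and `output_unigrams: true`, the input "quick brown fox" yields the three unigrams plus the shingles "quick brown", "brown fox", and "quick brown fox".

// Hypothetical shingle filter definition built from the fields above.
const shingleSettings = {
  analysis: {
    filter: {
      my_shingles: {
        type: 'shingle',
        min_shingle_size: 2,
        max_shingle_size: 3,
        output_unigrams: true,
        token_separator: ' ', // joins adjacent tokens into a shingle
        filler_token: '_'     // stands in for positions emptied upstream, e.g. by a stop filter
      }
    }
  }
}
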
 export class StopTokenFilter extends TokenFilterBase {
   type: 'stop'
+  /** If `true`, stop word matching is case insensitive. For example, if `true`, a stop word of `the` matches and removes `The`, `THE`, or `the`. Defaults to `false`. */
   ignore_case?: boolean
+  /** If `true`, the last token of a stream is removed if it’s a stop word. Defaults to `true`. */
   remove_trailing?: boolean
+  /** Language value, such as `_arabic_` or `_thai_`. Defaults to `_english_`. */
   stopwords?: StopWords
+  /** Path to a file that contains a list of stop words to remove.
+   * This path must be absolute or relative to the `config` location, and the file must be UTF-8 encoded. Each stop word in the file must be separated by a line break. */
   stopwords_path?: string
 }
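
A minimal sketch of a custom stop filter using these fields (the filter name is an illustrative assumption; `stopwords` also accepts an explicit word array):

// Hypothetical stop filter definition matching the spec above.
const stopSettings = {
  analysis: {
    filter: {
      my_stop: {
        type: 'stop',
        stopwords: '_english_', // a language value; an explicit list also works
        ignore_case: true,      // "The", "THE", and "the" all match
        remove_trailing: true   // also drop a trailing stop word
      }
    }
  }
}
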
+  /** Expands definitions for equivalent synonym rules. Defaults to `true`. */
   expand?: boolean
+  /** Sets the synonym rules format. */
   format?: SynonymFormat
+  /** If `true`, ignores errors while parsing the synonym rules. Note that only synonym rules that cannot be parsed are ignored. Defaults to the value of the `updateable` setting. */
   lenient?: boolean
+  /** Used to define inline synonyms. */
   synonyms?: string[]
+  /** Used to provide a synonym file. This path must be absolute or relative to the `config` location. */
   synonyms_path?: string
+  /** Provide a synonym set created via the Synonyms Management APIs. */
   synonyms_set?: string
+  /** Controls the tokenizers that will be used to tokenize the synonyms. This parameter is for backwards compatibility with indices created before 6.0.
+   * @deprecated 6.0.0 */
   tokenizer?: string
+  /** If `true`, allows reloading search analyzers to pick up changes to synonym files. Only to be used for search analyzers. Defaults to `false`. */
   updateable?: boolean
 }
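
A hedged sketch of an inline synonym filter built from these fields (filter name and rules are illustrative assumptions); `synonyms_path` or `synonyms_set` would replace `synonyms` for file-based or managed rule sets:

// Hypothetical synonym filter using inline rules.
const synonymSettings = {
  analysis: {
    filter: {
      my_synonyms: {
        type: 'synonym',
        expand: true,   // "ipod, i-pod, i pod" expands in every direction
        lenient: false, // fail on unparsable rules rather than skipping them
        synonyms: [
          'ipod, i-pod, i pod',
          'universe => cosmos' // explicit one-way mapping
        ]
      }
    }
  }
}
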
+  /** If `true`, the filter produces catenated tokens for chains of alphanumeric characters separated by non-alphabetic delimiters. Defaults to `false`. */
   catenate_all?: boolean
+  /** If `true`, the filter produces catenated tokens for chains of numeric characters separated by non-alphabetic delimiters. Defaults to `false`. */
   catenate_numbers?: boolean
+  /** If `true`, the filter produces catenated tokens for chains of alphabetical characters separated by non-alphabetic delimiters. Defaults to `false`. */
   catenate_words?: boolean
+  /** If `true`, the filter includes tokens consisting of only numeric characters in the output. If `false`, the filter excludes these tokens from the output. Defaults to `true`. */
   generate_number_parts?: boolean
+  /** If `true`, the filter includes tokens consisting of only alphabetical characters in the output. If `false`, the filter excludes these tokens from the output. Defaults to `true`. */
   generate_word_parts?: boolean
+  /** If `true`, the filter includes the original version of any split tokens in the output. This original version includes non-alphanumeric delimiters. Defaults to `false`. */
   preserve_original?: Stringified<boolean>
+  /** Array of tokens the filter won’t split. */
   protected_words?: string[]
+  /** Path to a file that contains a list of tokens the filter won’t split.
+   * This path must be absolute or relative to the `config` location, and the file must be UTF-8 encoded. Each token in the file must be separated by a line break. */
   protected_words_path?: string
+  /** If `true`, the filter splits tokens at letter case transitions. For example: camelCase -> [ camel, Case ]. Defaults to `true`. */
   split_on_case_change?: boolean
+  /** If `true`, the filter splits tokens at letter-number transitions. For example: j2se -> [ j, 2, se ]. Defaults to `true`. */
   split_on_numerics?: boolean
+  /** If `true`, the filter removes the English possessive (`'s`) from the end of each token. For example: O'Neil's -> [ O, Neil ]. Defaults to `true`. */
   stem_english_possessive?: boolean
+  /** Array of custom type mappings for characters. This allows you to map non-alphanumeric characters as numeric or alphanumeric to avoid splitting on those characters. */
   type_table?: string[]
+  /** Path to a file that contains custom type mappings for characters. This allows you to map non-alphanumeric characters as numeric or alphanumeric to avoid splitting on those characters. */
   type_table_path?: string
 }
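
To illustrate how these fields interact (filter name and inputs are assumptions, not from this diff): with the sketch below, `Wi-Fi-2000` splits into `Wi`, `Fi`, and `2000`; `catenate_words` additionally emits `WiFi` (the number breaks the chain of word parts); `preserve_original` keeps `Wi-Fi-2000` itself; and tokens listed in `protected_words` pass through unsplit.

// Hypothetical word_delimiter filter definition.
const wordDelimiterSettings = {
  analysis: {
    filter: {
      my_word_delimiter: {
        type: 'word_delimiter',
        generate_word_parts: true,
        generate_number_parts: true,
        catenate_words: true,       // "Wi-Fi" also emits "WiFi"
        split_on_case_change: true, // camelCase -> camel, Case
        split_on_numerics: true,    // j2se -> j, 2, se
        preserve_original: true,    // keep the unsplit token too
        protected_words: ['C++']    // never split these tokens
      }
    }
  }
}
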
+  /** If `true`, the filter adjusts the offsets of split or catenated tokens to better reflect their actual position in the token stream. Defaults to `true`. */
   adjust_offsets?: boolean
-  catenate_all?: boolean
-  catenate_numbers?: boolean
-  catenate_words?: boolean
-  generate_number_parts?: boolean
-  generate_word_parts?: boolean
+  /** If `true`, the filter skips tokens with a `keyword` attribute of `true`. Defaults to `false`. */
   ignore_keywords?: boolean
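
The removed `catenate_*`/`generate_*` lines suggest the graph variant now inherits those fields from a shared word-delimiter base. A hedged sketch of its use (filter name is an assumption; the graph filter is typically applied in search analyzers, since catenated tokens form multi-position token graphs):

// Hypothetical word_delimiter_graph filter definition.
const graphSettings = {
  analysis: {
    filter: {
      my_graph_delimiter: {
        type: 'word_delimiter_graph',
        adjust_offsets: true,   // realign offsets of split or catenated tokens
        ignore_keywords: false, // still split tokens flagged as keywords
        catenate_words: true    // inherited from the shared base fields above
      }
    }
  }
}
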