Skip to content

Commit f1101b0

Browse files
committed
Token filter updates
1 parent c8b4de5 commit f1101b0

File tree

5 files changed

+137
-15
lines changed

5 files changed

+137
-15
lines changed

specification/_types/analysis/StopWords.ts

Lines changed: 40 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,10 +17,49 @@
1717
* under the License.
1818
*/
1919

20+
export enum StopWord {
21+
_arabic_,
22+
_armenian_,
23+
_basque_,
24+
_bengali_,
25+
_brazilian_,
26+
_bulgarian_,
27+
_catalan_,
28+
_cjk_,
29+
_czech_,
30+
_danish_,
31+
_dutch_,
32+
_english_,
33+
_estonian_,
34+
_finnish_,
35+
_french_,
36+
_galician_,
37+
_german_,
38+
_greek_,
39+
_hindi_,
40+
_hungarian_,
41+
_indonesian_,
42+
_irish_,
43+
_italian_,
44+
_latvian_,
45+
_lithuanian_,
46+
_norwegian_,
47+
_persian_,
48+
_portuguese_,
49+
_romanian_,
50+
_russian_,
51+
_serbian_,
52+
_sorani_,
53+
_spanish_,
54+
_swedish_,
55+
_thai_,
56+
_turkish_
57+
}
58+
2059
/**
2160
* Language value, such as _arabic_ or _thai_. Defaults to _english_.
2261
* Each language value corresponds to a predefined list of stop words in Lucene. See "Stop words by language" for the supported language values and their stop words.
2362
* Also accepts an array of stop words.
2463
* @class_serializer: StopWordsFormatter
2564
*/
26-
export type StopWords = string | string[]
65+
export type StopWords = StopWord | StopWord[]

specification/_types/analysis/kuromoji-plugin.ts

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,13 +21,19 @@ import { integer } from '@_types/Numeric'
2121
import { CharFilterBase } from './char_filters'
2222
import { TokenizerBase } from './tokenizers'
2323
import { TokenFilterBase } from './token_filters'
24+
import { StopWords } from './StopWords'
2425

2526
export class KuromojiAnalyzer {
2627
type: 'kuromoji'
2728
mode: KuromojiTokenizationMode
2829
user_dictionary?: string
2930
}
3031

32+
export class JaStopTokenFilter extends TokenFilterBase {
33+
type: 'ja_stop'
34+
stopwords?: StopWords
35+
}
36+
3137
export class KuromojiIterationMarkCharFilter extends CharFilterBase {
3238
type: 'kuromoji_iteration_mark'
3339
normalize_kana: boolean

specification/_types/analysis/languages.ts

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,25 +18,30 @@
1818
*/
1919

2020
export enum SnowballLanguage {
21+
Arabic,
2122
Armenian,
2223
Basque,
2324
Catalan,
2425
Danish,
2526
Dutch,
2627
English,
28+
Estonian,
2729
Finnish,
2830
French,
2931
German,
3032
German2,
3133
Hungarian,
3234
Italian,
35+
Irish,
3336
Kp,
37+
Lithuanian,
3438
Lovins,
3539
Norwegian,
3640
Porter,
3741
Portuguese,
3842
Romanian,
3943
Russian,
44+
Serbian,
4045
Spanish,
4146
Swedish,
4247
Turkish

specification/_types/analysis/nori-plugin.ts

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
*/
1919

2020
import { TokenizerBase } from './tokenizers'
21+
import { TokenFilterBase } from './token_filters'
2122

2223
export enum NoriDecompoundMode {
2324
discard,
@@ -32,3 +33,10 @@ export class NoriTokenizer extends TokenizerBase {
3233
user_dictionary?: string
3334
user_dictionary_rules?: string[]
3435
}
36+
37+
export class NoriPartOfSpeechTokenFilter extends TokenFilterBase {
38+
type: 'nori_part_of_speech'
39+
/** An array of part-of-speech tags that should be removed. */
40+
stoptags?: string[]
41+
}
42+

specification/_types/analysis/token_filters.ts

Lines changed: 78 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -30,8 +30,10 @@ import {
3030
import {
3131
KuromojiPartOfSpeechTokenFilter,
3232
KuromojiReadingFormTokenFilter,
33-
KuromojiStemmerTokenFilter
33+
KuromojiStemmerTokenFilter,
34+
JaStopTokenFilter
3435
} from './kuromoji-plugin'
36+
import { NoriPartOfSpeechTokenFilter } from './nori-plugin'
3537
import { SnowballLanguage } from './languages'
3638
import { PhoneticTokenFilter } from './phonetic-plugin'
3739
import { StopWords } from './StopWords'
@@ -210,72 +212,112 @@ export class WordDelimiterGraphTokenFilter extends WordDelimiterTokenFilterBase
210212

211213
export class AsciiFoldingTokenFilter extends TokenFilterBase {
212214
type: 'asciifolding'
215+
/** If `true`, emit both original tokens and folded tokens. Defaults to `false`. */
213216
preserve_original?: Stringified<boolean>
214217
}
215218

216219
export class CommonGramsTokenFilter extends TokenFilterBase {
217220
type: 'common_grams'
221+
/** A list of tokens. The filter generates bigrams for these tokens.
222+
* Either this or the `common_words_path` parameter is required. */
218223
common_words?: string[]
224+
/** Path to a file containing a list of tokens. The filter generates bigrams for these tokens.
225+
* This path must be absolute or relative to the `config` location. The file must be UTF-8 encoded. Each token in the file must be separated by a line break.
226+
* Either this or the `common_words` parameter is required. */
219227
common_words_path?: string
228+
/** If `true`, matching of common words is case-insensitive. Defaults to `false`. */
220229
ignore_case?: boolean
230+
/** If `true`, the filter excludes the following tokens from the output:
231+
* - Unigrams for common words
232+
* - Unigrams for terms followed by common words
233+
* Defaults to `false`. We recommend enabling this parameter for search analyzers. */
221234
query_mode?: boolean
222235
}
223236

224237
export class ConditionTokenFilter extends TokenFilterBase {
225238
type: 'condition'
239+
/** Array of token filters. If a token matches the predicate script in the `script` parameter, these filters are applied to the token in the order provided. */
226240
filter: string[]
241+
/** Predicate script used to apply token filters. If a token matches this script, the filters in the `filter` parameter are applied to the token. */
227242
script: Script
228243
}
229244

230245
export class ElisionTokenFilter extends TokenFilterBase {
231246
type: 'elision'
247+
/** List of elisions to remove.
248+
* To be removed, the elision must be at the beginning of a token and be immediately followed by an apostrophe. Both the elision and apostrophe are removed.
249+
* For custom `elision` filters, either this parameter or `articles_path` must be specified. */
232250
articles?: string[]
251+
/** Path to a file that contains a list of elisions to remove.
252+
* This path must be absolute or relative to the `config` location, and the file must be UTF-8 encoded. Each elision in the file must be separated by a line break.
253+
* To be removed, the elision must be at the beginning of a token and be immediately followed by an apostrophe. Both the elision and apostrophe are removed.
254+
* For custom `elision` filters, either this parameter or `articles` must be specified. */
233255
articles_path?: string
256+
/** If `true`, elision matching is case insensitive. If `false`, elision matching is case sensitive. Defaults to `false`. */
234257
articles_case?: Stringified<boolean>
235258
}
236259

237260
export class FingerprintTokenFilter extends TokenFilterBase {
238261
type: 'fingerprint'
262+
/** Maximum character length, including whitespace, of the output token. Defaults to `255`. Concatenated tokens longer than this will result in no token output. */
239263
max_output_size?: integer
264+
/** Character to use to concatenate the token stream input. Defaults to a space. */
240265
separator?: string
241266
}
242267

243268
export class HunspellTokenFilter extends TokenFilterBase {
244269
type: 'hunspell'
270+
/** If `true`, duplicate tokens are removed from the filter’s output. Defaults to `true`. */
245271
dedup?: boolean
272+
/** One or more `.dic` files (e.g., `en_US.dic`, `my_custom.dic`) to use for the Hunspell dictionary.
273+
* By default, the `hunspell` filter uses all `.dic` files in the `<$ES_PATH_CONF>/hunspell/<locale>` directory specified using the `lang`, `language`, or `locale` parameter. */
246274
dictionary?: string
275+
/** Locale directory used to specify the `.aff` and `.dic` files for a Hunspell dictionary.
276+
* @aliases lang, language */
247277
locale: string
278+
/** If `true`, only the longest stemmed version of each token is included in the output. If `false`, all stemmed versions of the token are included. Defaults to `false`. */
248279
longest_only?: boolean
249280
}
250281

251-
export class JaStopTokenFilter extends TokenFilterBase {
252-
type: 'ja_stop'
253-
stopwords?: StopWords
254-
}
255-
256282
export enum KeepTypesMode {
257283
include,
258284
exclude
259285
}
260286

261287
export class KeepTypesTokenFilter extends TokenFilterBase {
262288
type: 'keep_types'
289+
/** Indicates whether to keep or remove the specified token types. */
263290
mode?: KeepTypesMode
264-
types?: string[]
291+
/** List of token types to keep or remove. */
292+
types: string[]
265293
}
266294

267295
export class KeepWordsTokenFilter extends TokenFilterBase {
268296
type: 'keep'
297+
/** List of words to keep. Only tokens that match words in this list are included in the output.
298+
* Either this parameter or `keep_words_path` must be specified. */
269299
keep_words?: string[]
300+
/** If `true`, lowercase all keep words. Defaults to `false`. */
270301
keep_words_case?: boolean
302+
/** Path to a file that contains a list of words to keep. Only tokens that match words in this list are included in the output.
303+
* This path must be absolute or relative to the `config` location, and the file must be UTF-8 encoded. Each word in the file must be separated by a line break.
304+
* Either this parameter or `keep_words` must be specified. */
271305
keep_words_path?: string
272306
}
273307

274308
export class KeywordMarkerTokenFilter extends TokenFilterBase {
275309
type: 'keyword_marker'
310+
/** If `true`, matching for the `keywords` and `keywords_path` parameters ignores letter case. Defaults to `false`. */
276311
ignore_case?: boolean
312+
/** Array of keywords. Tokens that match these keywords are not stemmed.
313+
* This parameter, `keywords_path`, or `keywords_pattern` must be specified. You cannot specify this parameter and `keywords_pattern`. */
277314
keywords?: string | string[]
315+
/** Path to a file that contains a list of keywords. Tokens that match these keywords are not stemmed.
316+
* This path must be absolute or relative to the `config` location, and the file must be UTF-8 encoded. Each word in the file must be separated by a line break.
317+
* This parameter, `keywords`, or `keywords_pattern` must be specified. You cannot specify this parameter and `keywords_pattern`. */
278318
keywords_path?: string
319+
/** Java regular expression used to match tokens. Tokens that match this expression are marked as keywords and not stemmed.
320+
* This parameter, `keywords`, or `keywords_path` must be specified. You cannot specify this parameter and `keywords` or `keywords_path`. */
279321
keywords_pattern?: string
280322
}
281323

@@ -285,50 +327,65 @@ export class KStemTokenFilter extends TokenFilterBase {
285327

286328
export class LengthTokenFilter extends TokenFilterBase {
287329
type: 'length'
330+
/** Maximum character length of a token. Longer tokens are excluded from the output. Defaults to `Integer.MAX_VALUE`, which is `2^31-1` or `2147483647`. */
288331
max?: integer
332+
/** Minimum character length of a token. Shorter tokens are excluded from the output. Defaults to `0`. */
289333
min?: integer
290334
}
291335

292336
export class LimitTokenCountTokenFilter extends TokenFilterBase {
293337
type: 'limit'
338+
/** If `true`, the limit filter exhausts the token stream, even if the `max_token_count` has already been reached. Defaults to `false`. */
294339
consume_all_tokens?: boolean
340+
/** Maximum number of tokens to keep. Once this limit is reached, any remaining tokens are excluded from the output. Defaults to `1`. */
295341
max_token_count?: Stringified<integer>
296342
}
297343

344+
export enum LowercaseTokenFilterLanguages {
345+
greek,
346+
irish,
347+
turkish
348+
}
349+
298350
export class LowercaseTokenFilter extends TokenFilterBase {
299351
type: 'lowercase'
300-
language?: string
352+
/** Language-specific lowercase token filter to use. */
353+
language?: LowercaseTokenFilterLanguages
301354
}
302355

303356
export class MultiplexerTokenFilter extends TokenFilterBase {
304357
type: 'multiplexer'
358+
/** A list of token filters to apply to incoming tokens. */
305359
filters: string[]
360+
/** If `true` (the default) then emit the original token in addition to the filtered tokens. */
306361
preserve_original?: Stringified<boolean>
307362
}
308363

309364
export class NGramTokenFilter extends TokenFilterBase {
310365
type: 'ngram'
366+
/** Maximum length of characters in a gram. Defaults to `2`. */
311367
max_gram?: integer
368+
/** Minimum length of characters in a gram. Defaults to `1`. */
312369
min_gram?: integer
370+
/** Emits original token when set to `true`. Defaults to `false`. */
313371
preserve_original?: Stringified<boolean>
314372
}
315373

316-
export class NoriPartOfSpeechTokenFilter extends TokenFilterBase {
317-
type: 'nori_part_of_speech'
318-
stoptags?: string[]
319-
}
320-
321374
export class PatternCaptureTokenFilter extends TokenFilterBase {
322375
type: 'pattern_capture'
376+
/** A list of regular expressions to match. */
323377
patterns: string[]
378+
/** If set to `true` (the default) it will emit the original token. */
324379
preserve_original?: Stringified<boolean>
325380
}
326381

327382
export class PatternReplaceTokenFilter extends TokenFilterBase {
328383
type: 'pattern_replace'
384+
/** If `true`, all substrings matching the pattern parameter’s regular expression are replaced. If `false`, the filter replaces only the first matching substring in each token. Defaults to `true`. */
329385
all?: boolean
330-
flags?: string
386+
/** Regular expression, written in Java’s regular expression syntax. The filter replaces token substrings matching this pattern with the substring in the `replacement` parameter. */
331387
pattern: string
388+
/** Replacement substring. Defaults to an empty substring (`""`). */
332389
replacement?: string
333390
}
334391

@@ -338,6 +395,7 @@ export class PorterStemTokenFilter extends TokenFilterBase {
338395

339396
export class PredicateTokenFilter extends TokenFilterBase {
340397
type: 'predicate_token_filter'
398+
/** Script containing a condition used to filter incoming tokens. Only tokens that match this script are included in the output. */
341399
script: Script
342400
}
343401

@@ -351,12 +409,15 @@ export class ReverseTokenFilter extends TokenFilterBase {
351409

352410
export class SnowballTokenFilter extends TokenFilterBase {
353411
type: 'snowball'
412+
/** Controls the language used by the stemmer. */
354413
language?: SnowballLanguage
355414
}
356415

357416
export class StemmerOverrideTokenFilter extends TokenFilterBase {
358417
type: 'stemmer_override'
418+
/** A list of mapping rules to use. */
359419
rules?: string[]
420+
/** A path (either relative to `config` location, or absolute) to a list of mappings. */
360421
rules_path?: string
361422
}
362423

@@ -372,11 +433,13 @@ export class TrimTokenFilter extends TokenFilterBase {
372433

373434
export class TruncateTokenFilter extends TokenFilterBase {
374435
type: 'truncate'
436+
/** Character limit for each token. Tokens exceeding this limit are truncated. Defaults to `10`. */
375437
length?: integer
376438
}
377439

378440
export class UniqueTokenFilter extends TokenFilterBase {
379441
type: 'unique'
442+
/** If `true`, only remove duplicate tokens in the same position. Defaults to `false`. */
380443
only_on_same_position?: boolean
381444
}
382445

@@ -539,6 +602,7 @@ export type TokenFilterDefinition =
539602
| UppercaseTokenFilter
540603
| WordDelimiterGraphTokenFilter
541604
| WordDelimiterTokenFilter
605+
| JaStopTokenFilter
542606
| KuromojiStemmerTokenFilter
543607
| KuromojiReadingFormTokenFilter
544608
| KuromojiPartOfSpeechTokenFilter

0 commit comments

Comments
 (0)