Skip to content

Commit f6e40b3

Browse files
committed
Add several missing token filter types
1 parent f22adb6 commit f6e40b3

File tree

1 file changed

+105
-1
lines changed

1 file changed

+105
-1
lines changed

specification/_types/analysis/token_filters.ts

Lines changed: 105 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -342,6 +342,94 @@ export class UppercaseTokenFilter extends TokenFilterBase {
342342
type: 'uppercase'
343343
}
344344

345+
export class ApostropheTokenFilter extends TokenFilterBase {
346+
type: 'apostrophe'
347+
}
348+
349+
export class ArabicNormalizationTokenFilter extends TokenFilterBase {
350+
type: 'arabic_normalization'
351+
}
352+
353+
export enum CjkBigramIgnoredScript {
354+
han,
355+
hangul,
356+
hiragana,
357+
katakana
358+
}
359+
360+
export class CjkBigramTokenFilter extends TokenFilterBase {
361+
type: 'cjk_bigram'
362+
/** Array of character scripts for which to disable bigrams. */
363+
ignored_scripts?: CjkBigramIgnoredScript[]
364+
/** If `true`, emit tokens in both bigram and unigram form. If `false`, a CJK character is output in unigram form when it has no adjacent characters. Defaults to `false`. */
365+
output_unigrams?: boolean
366+
}
367+
368+
export class CjkWidthTokenFilter extends TokenFilterBase {
369+
type: 'cjk_width'
370+
}
371+
372+
export class ClassicTokenFilter extends TokenFilterBase {
373+
type: 'classic'
374+
}
375+
376+
export class DecimalDigitTokenFilter extends TokenFilterBase {
377+
type: 'decimal_digit'
378+
}
379+
380+
export class FlattenGraphTokenFilter extends TokenFilterBase {
381+
type: 'flatten_graph'
382+
}
383+
384+
export class GermanNormalizationTokenFilter extends TokenFilterBase {
385+
type: 'german_normalization'
386+
}
387+
388+
export class HindiNormalizationTokenFilter extends TokenFilterBase {
389+
type: 'hindi_normalization'
390+
}
391+
392+
export class IndicNormalizationTokenFilter extends TokenFilterBase {
393+
type: 'indic_normalization'
394+
}
395+
396+
export class KeywordRepeatTokenFilter extends TokenFilterBase {
397+
type: 'keyword_repeat'
398+
}
399+
400+
export class MinHashTokenFilter extends TokenFilterBase {
401+
type: 'min_hash'
402+
/** Number of buckets to which hashes are assigned. Defaults to `512`. */
403+
bucket_count?: integer
404+
/** Number of ways to hash each token in the stream. Defaults to `1`. */
405+
hash_count?: integer
406+
/** Number of hashes to keep from each bucket. Defaults to `1`.
407+
* Hashes are retained by ascending size, starting with the bucket’s smallest hash. */
408+
hash_set_size?: integer
409+
/** If `true`, the filter fills empty buckets with the value of the first non-empty bucket to its circular right if the `hash_set_size` is `1`. If the `bucket_count` argument is greater than 1, this parameter defaults to `true`. Otherwise, this parameter defaults to `false`. */
410+
with_rotation?: boolean
411+
}
412+
413+
export class PersianNormalizationTokenFilter extends TokenFilterBase {
414+
type: 'persian_normalization'
415+
}
416+
417+
export class ScandinavianFoldingTokenFilter extends TokenFilterBase {
418+
type: 'scandinavian_folding'
419+
}
420+
421+
export class ScandinavianNormalizationTokenFilter extends TokenFilterBase {
422+
type: 'scandinavian_normalization'
423+
}
424+
425+
export class SerbianNormalizationTokenFilter extends TokenFilterBase {
426+
type: 'serbian_normalization'
427+
}
428+
429+
export class SoraniNormalizationTokenFilter extends TokenFilterBase {
430+
type: 'sorani_normalization'
431+
}
432+
345433
/**
346434
* @codegen_names name, definition
347435
* @ext_doc_id analysis-tokenfilters
@@ -354,34 +442,50 @@ export type TokenFilter = string | TokenFilterDefinition
354442
* @non_exhaustive
355443
*/
356444
export type TokenFilterDefinition =
445+
| ApostropheTokenFilter
446+
| ArabicNormalizationTokenFilter
357447
| AsciiFoldingTokenFilter
448+
| CjkBigramTokenFilter
449+
| CjkWidthTokenFilter
450+
| ClassicTokenFilter
358451
| CommonGramsTokenFilter
359452
| ConditionTokenFilter
453+
| DecimalDigitTokenFilter
360454
| DelimitedPayloadTokenFilter
361-
//DictionaryDecompounderTokenFilter |
362455
| EdgeNGramTokenFilter
363456
| ElisionTokenFilter
364457
| FingerprintTokenFilter
458+
| FlattenGraphTokenFilter
459+
| GermanNormalizationTokenFilter
460+
| HindiNormalizationTokenFilter
365461
| HunspellTokenFilter
366462
| HyphenationDecompounderTokenFilter
463+
| IndicNormalizationTokenFilter
367464
| KeepTypesTokenFilter
368465
| KeepWordsTokenFilter
369466
| KeywordMarkerTokenFilter
467+
| KeywordRepeatTokenFilter
370468
| KStemTokenFilter
371469
| LengthTokenFilter
372470
| LimitTokenCountTokenFilter
373471
| LowercaseTokenFilter
472+
| MinHashTokenFilter
374473
| MultiplexerTokenFilter
375474
| NGramTokenFilter
376475
| NoriPartOfSpeechTokenFilter
377476
| PatternCaptureTokenFilter
378477
| PatternReplaceTokenFilter
478+
| PersianNormalizationTokenFilter
379479
| PorterStemTokenFilter
380480
| PredicateTokenFilter
381481
| RemoveDuplicatesTokenFilter
382482
| ReverseTokenFilter
483+
| ScandinavianFoldingTokenFilter
484+
| ScandinavianNormalizationTokenFilter
485+
| SerbianNormalizationTokenFilter
383486
| ShingleTokenFilter
384487
| SnowballTokenFilter
488+
| SoraniNormalizationTokenFilter
385489
| StemmerOverrideTokenFilter
386490
| StemmerTokenFilter
387491
| StopTokenFilter

0 commit comments

Comments
 (0)