
Commit c8b4de5

Lots of docstrings and combining redundant definitions into parent classes
1 parent f6e40b3 commit c8b4de5

File tree

1 file changed (+72, -30 lines)

specification/_types/analysis/token_filters.ts

Lines changed: 72 additions & 30 deletions
@@ -41,12 +41,20 @@ export class TokenFilterBase {
 }
 
 export class CompoundWordTokenFilterBase extends TokenFilterBase {
-  hyphenation_patterns_path?: string
+  /** Maximum subword character length. Longer subword tokens are excluded from the output. Defaults to `15`. */
   max_subword_size?: integer
+  /** Minimum subword character length. Shorter subword tokens are excluded from the output. Defaults to `2`. */
   min_subword_size?: integer
+  /** Minimum word character length. Shorter word tokens are excluded from the output. Defaults to `5`. */
   min_word_size?: integer
+  /** If `true`, only include the longest matching subword. Defaults to `false`. */
   only_longest_match?: boolean
+  /** A list of subwords to look for in the token stream. If found, the subword is included in the token output.
+   * Either this parameter or `word_list_path` must be specified.*/
   word_list?: string[]
+  /** Path to a file that contains a list of subwords to find in the token stream. If found, the subword is included in the token output.
+   * This path must be absolute or relative to the config location, and the file must be UTF-8 encoded. Each token in the file must be separated by a line break.
+   * Either this parameter or `word_list` must be specified. */
   word_list_path?: string
 }
 
@@ -56,6 +64,13 @@ export class DictionaryDecompounderTokenFilter extends CompoundWordTokenFilterBase {
 
 export class HyphenationDecompounderTokenFilter extends CompoundWordTokenFilterBase {
   type: 'hyphenation_decompounder'
+  /** Path to an Apache FOP (Formatting Objects Processor) XML hyphenation pattern file.
+   * This path must be absolute or relative to the `config` location. Only FOP v1.2 compatible files are supported. */
+  hyphenation_patterns_path: string
+  /** If `true`, do not match sub tokens in tokens that are in the word list. Defaults to `false`. */
+  no_sub_matches?: boolean
+  /** If `true`, do not allow overlapping tokens. Defaults to `false`. */
+  no_overlapping_matches?: boolean
 }
 
 export enum DelimitedPayloadEncoding {
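
For orientation only (not part of the commit): a minimal sketch of a value conforming to the HyphenationDecompounderTokenFilter definition above, assuming the classes are importable from this token_filters.ts module. The pattern file path and word list are hypothetical.

import { HyphenationDecompounderTokenFilter } from './token_filters'

// Hypothetical German decompounder. `type` and `hyphenation_patterns_path` are the
// required fields; everything else is optional and inherited from CompoundWordTokenFilterBase.
const decompounder: HyphenationDecompounderTokenFilter = {
  type: 'hyphenation_decompounder',
  hyphenation_patterns_path: 'analysis/de_DR.xml', // hypothetical FOP v1.2 pattern file
  word_list: ['Kaffee', 'zucker', 'tasse'],        // hypothetical subword list
  only_longest_match: true,
  min_subword_size: 4
}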
@@ -66,7 +81,9 @@ export enum DelimitedPayloadEncoding {
 
 export class DelimitedPayloadTokenFilter extends TokenFilterBase {
   type: 'delimited_payload'
+  /** Character used to separate tokens from payloads. Defaults to `|`. */
   delimiter?: string
+  /** Data type for the stored payload. */
   encoding?: DelimitedPayloadEncoding
 }
 
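Similarly (illustration only, not in the diff), a DelimitedPayloadTokenFilter value using the two documented fields; the `+` delimiter is arbitrary and `float` is assumed to be a member of the DelimitedPayloadEncoding enum referenced above.

import { DelimitedPayloadTokenFilter, DelimitedPayloadEncoding } from './token_filters'

// Treats the text after '+' in a token like "the+0.5" as a float payload for "the".
const payloads: DelimitedPayloadTokenFilter = {
  type: 'delimited_payload',
  delimiter: '+',                          // overrides the default `|`
  encoding: DelimitedPayloadEncoding.float // assumed enum member
}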
@@ -77,27 +94,42 @@ export enum EdgeNGramSide {
 
 export class EdgeNGramTokenFilter extends TokenFilterBase {
   type: 'edge_ngram'
+  /** Maximum character length of a gram. For custom token filters, defaults to `2`. For the built-in edge_ngram filter, defaults to `1`. */
   max_gram?: integer
+  /** Minimum character length of a gram. Defaults to `1`. */
   min_gram?: integer
+  /** Indicates whether to truncate tokens from the `front` or `back`. Defaults to `front`. */
   side?: EdgeNGramSide
+  /** Emits original token when set to `true`. Defaults to `false`. */
   preserve_original?: Stringified<boolean>
 }
 
 export class ShingleTokenFilter extends TokenFilterBase {
   type: 'shingle'
+  /** String used in shingles as a replacement for empty positions that do not contain a token. This filler token is only used in shingles, not original unigrams. Defaults to an underscore (`_`). */
   filler_token?: string
-  max_shingle_size?: integer | string // TODO: should be only int
-  min_shingle_size?: integer | string // TODO: should be only int
+  /** Maximum number of tokens to concatenate when creating shingles. Defaults to `2`. */
+  max_shingle_size?: Stringified<integer>
+  /** Minimum number of tokens to concatenate when creating shingles. Defaults to `2`. */
+  min_shingle_size?: Stringified<integer>
+  /** If `true`, the output includes the original input tokens. If `false`, the output only includes shingles; the original input tokens are removed. Defaults to `true`. */
   output_unigrams?: boolean
+  /** If `true`, the output includes the original input tokens only if no shingles are produced; if shingles are produced, the output only includes shingles. Defaults to `false`. */
   output_unigrams_if_no_shingles?: boolean
+  /** Separator used to concatenate adjacent tokens to form a shingle. Defaults to a space (`" "`). */
   token_separator?: string
 }
 
 export class StopTokenFilter extends TokenFilterBase {
   type: 'stop'
+  /** If `true`, stop word matching is case insensitive. For example, if `true`, a stop word of the matches and removes `The`, `THE`, or `the`. Defaults to `false`. */
   ignore_case?: boolean
+  /** If `true`, the last token of a stream is removed if it’s a stop word. Defaults to `true`. */
   remove_trailing?: boolean
+  /** Language value, such as `_arabic_` or `_thai_`. Defaults to `_english_`. */
   stopwords?: StopWords
+  /** Path to a file that contains a list of stop words to remove.
+   * This path must be absolute or relative to the `config` location, and the file must be UTF-8 encoded. Each stop word in the file must be separated by a line break. */
   stopwords_path?: string
 }
 
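One reason the Stringified<integer> change above is useful (sketch only, not part of the commit, assuming Stringified<T> is the spec's usual T-or-string union): index settings in the wild spell these sizes both as numbers and as strings, and both forms stay representable.

import { ShingleTokenFilter } from './token_filters'

// Hypothetical shingle filter: numeric and string spellings of the sizes both type-check.
const shingles: ShingleTokenFilter = {
  type: 'shingle',
  min_shingle_size: 2,    // numeric form
  max_shingle_size: '3',  // string form, as it often appears in settings JSON
  output_unigrams: false,
  filler_token: '_'
}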
@@ -106,64 +138,74 @@ export enum SynonymFormat {
   wordnet
 }
 
-export class SynonymGraphTokenFilter extends TokenFilterBase {
-  type: 'synonym_graph'
+export class SynonymTokenFilterBase extends TokenFilterBase {
+  /** Expands definitions for equivalent synonym rules. Defaults to `true`. */
   expand?: boolean
+  /** Sets the synonym rules format. */
   format?: SynonymFormat
+  /** If `true` ignores errors while parsing the synonym rules. It is important to note that only those synonym rules which cannot get parsed are ignored. Defaults to the value of the `updateable` setting. */
   lenient?: boolean
+  /** Used to define inline synonyms. */
   synonyms?: string[]
+  /** Used to provide a synonym file. This path must be absolute or relative to the `config` location. */
   synonyms_path?: string
+  /** Provide a synonym set created via Synonyms Management APIs. */
   synonyms_set?: string
+  /** Controls the tokenizers that will be used to tokenize the synonym, this parameter is for backwards compatibility for indices that created before 6.0.
+   * @deprecated 6.0.0 */
   tokenizer?: string
+  /** If `true` allows reloading search analyzers to pick up changes to synonym files. Only to be used for search analyzers. Defaults to `false`. */
   updateable?: boolean
 }
 
-export class SynonymTokenFilter extends TokenFilterBase {
+export class SynonymGraphTokenFilter extends SynonymTokenFilterBase {
+  type: 'synonym_graph'
+}
+
+export class SynonymTokenFilter extends SynonymTokenFilterBase {
   type: 'synonym'
-  expand?: boolean
-  format?: SynonymFormat
-  lenient?: boolean
-  synonyms?: string[]
-  synonyms_path?: string
-  synonyms_set?: string
-  tokenizer?: string
-  updateable?: boolean
 }
 
-export class WordDelimiterTokenFilter extends TokenFilterBase {
-  type: 'word_delimiter'
+export class WordDelimiterTokenFilterBase extends TokenFilterBase {
+  /** If `true`, the filter produces catenated tokens for chains of alphanumeric characters separated by non-alphabetic delimiters. Defaults to `false`. */
   catenate_all?: boolean
+  /** If `true`, the filter produces catenated tokens for chains of numeric characters separated by non-alphabetic delimiters. Defaults to `false`. */
   catenate_numbers?: boolean
+  /** If `true`, the filter produces catenated tokens for chains of alphabetical characters separated by non-alphabetic delimiters. Defaults to `false`. */
   catenate_words?: boolean
+  /** If `true`, the filter includes tokens consisting of only numeric characters in the output. If `false`, the filter excludes these tokens from the output. Defaults to `true`. */
   generate_number_parts?: boolean
+  /** If `true`, the filter includes tokens consisting of only alphabetical characters in the output. If `false`, the filter excludes these tokens from the output. Defaults to `true`. */
   generate_word_parts?: boolean
+  /** If `true`, the filter includes the original version of any split tokens in the output. This original version includes non-alphanumeric delimiters. Defaults to `false`. */
   preserve_original?: Stringified<boolean>
+  /** Array of tokens the filter won’t split. */
   protected_words?: string[]
+  /** Path to a file that contains a list of tokens the filter won’t split.
+   * This path must be absolute or relative to the `config` location, and the file must be UTF-8 encoded. Each token in the file must be separated by a line break. */
   protected_words_path?: string
+  /** If `true`, the filter splits tokens at letter case transitions. For example: camelCase -> [ camel, Case ]. Defaults to `true`. */
   split_on_case_change?: boolean
+  /** If `true`, the filter splits tokens at letter-number transitions. For example: j2se -> [ j, 2, se ]. Defaults to `true`. */
   split_on_numerics?: boolean
+  /** If `true`, the filter removes the English possessive (`'s`) from the end of each token. For example: O'Neil's -> [ O, Neil ]. Defaults to `true`. */
   stem_english_possessive?: boolean
+  /** Array of custom type mappings for characters. This allows you to map non-alphanumeric characters as numeric or alphanumeric to avoid splitting on those characters. */
   type_table?: string[]
+  /** Path to a file that contains custom type mappings for characters. This allows you to map non-alphanumeric characters as numeric or alphanumeric to avoid splitting on those characters. */
   type_table_path?: string
 }
 
-export class WordDelimiterGraphTokenFilter extends TokenFilterBase {
+export class WordDelimiterTokenFilter extends WordDelimiterTokenFilterBase {
+  type: 'word_delimiter'
+}
+
+export class WordDelimiterGraphTokenFilter extends WordDelimiterTokenFilterBase {
   type: 'word_delimiter_graph'
+  /** If `true`, the filter adjusts the offsets of split or catenated tokens to better reflect their actual position in the token stream. Defaults to `true`. */
   adjust_offsets?: boolean
-  catenate_all?: boolean
-  catenate_numbers?: boolean
-  catenate_words?: boolean
-  generate_number_parts?: boolean
-  generate_word_parts?: boolean
+  /** If `true`, the filter skips tokens with a keyword attribute of true. Defaults to `false`. */
   ignore_keywords?: boolean
-  preserve_original?: Stringified<boolean>
-  protected_words?: string[]
-  protected_words_path?: string
-  split_on_case_change?: boolean
-  split_on_numerics?: boolean
-  stem_english_possessive?: boolean
-  type_table?: string[]
-  type_table_path?: string
 }
 
 export class AsciiFoldingTokenFilter extends TokenFilterBase {
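
To make the consolidation concrete (illustration only, not part of the commit): the two synonym filters now differ only in their `type` tag, and the same holds for the word-delimiter pair, so values like the following rely almost entirely on fields inherited from the new base classes. The synonyms set name is hypothetical.

import { SynonymGraphTokenFilter, WordDelimiterGraphTokenFilter } from './token_filters'

// Everything except `type` comes from SynonymTokenFilterBase.
const searchSynonyms: SynonymGraphTokenFilter = {
  type: 'synonym_graph',
  synonyms_set: 'my-synonyms-set', // hypothetical set created via the Synonyms Management APIs
  updateable: true                 // search analyzers only, per the docstring above
}

// `adjust_offsets` and `ignore_keywords` are graph-specific; the rest comes from WordDelimiterTokenFilterBase.
const delimiters: WordDelimiterGraphTokenFilter = {
  type: 'word_delimiter_graph',
  adjust_offsets: true,
  catenate_words: true,
  preserve_original: true
}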
