@@ -113,7 +113,130 @@ message GlobalVectorKMeansTreeIndex {
   KMeansTreeSettings vector_settings = 3;
 }
 
-// Represent secondary index
+message FulltextIndexSettings {
+  // Specifies the layout strategy for storing and updating the full-text index
+  enum Layout {
+    LAYOUT_UNSPECIFIED = 0;
+
+    // Uses a single flat inverted index table (indexImplTable)
+    // Example source table:
+    // ┌────┬───────────────────────┐
+    // │ id │ text                  │
+    // ├────┼───────────────────────┤
+    // │ 1  │ "The quick brown fox" │
+    // │ 2  │ "The quick blue hare" │
+    // └────┴───────────────────────┘
+    // Example inverted index table (indexImplTable):
+    // ┌─────────────┬────┐
+    // │ __ydb_token │ id │
+    // ├─────────────┼────┤
+    // │ "blue"      │ 2  │
+    // │ "brown"     │ 1  │
+    // │ "fox"       │ 1  │
+    // │ "hare"      │ 2  │
+    // │ "quick"     │ 1  │
+    // │ "quick"     │ 2  │
+    // │ "The"       │ 1  │
+    // │ "The"       │ 2  │
+    // └─────────────┴────┘
+    FLAT = 1;
+  }
+
+  // Specifies how text is tokenized during indexing
+  enum Tokenizer {
+    TOKENIZER_UNSPECIFIED = 0;
+
+    // Splits text only by whitespace
+    // Does not split on punctuation
+    // Example:
+    // Text: "foo-bar baz_lorem ipsum"
+    // Tokens: ["foo-bar", "baz_lorem", "ipsum"]
+    WHITESPACE = 1;
+
+    // Applies general language-aware tokenization
+    // Splits text on whitespace and punctuation
+    // Example:
+    // Text: "foo-bar baz_lorem ipsum"
+    // Tokens: ["foo", "bar", "baz", "lorem", "ipsum"]
+    STANDARD = 2;
+
+    // Treats the entire input as a single token
+    // No splitting is performed
+    // Example:
+    // Text: "Hello World!"
+    // Tokens: ["Hello World!"]
+    KEYWORD = 3;
+  }
+
+  // See Layout enum
+  Layout layout = 1;
+
+  // See Tokenizer enum
+  Tokenizer tokenizer = 2;
+
+  // Language used for language-sensitive operations such as stopword filtering
+  // Example: language = "english"
+  // If not specified (the default), no language-specific logic is applied
+  string language = 3;
+
+  // Whether to convert tokens to lowercase
+  // Example:
+  // Token: "Quick"
+  // Output: "quick"
+  bool use_filter_lowercase = 100;
+
+  // Whether to remove common stopwords like "the", "a", "is"
+  // Example: language = "english"
+  // Tokens: ["the", "quick", "brown"]
+  // Output: ["quick", "brown"]
+  bool use_filter_stopwords = 110;
+
+  // Whether to apply character n-gram indexing to each token
+  // Must be used with filter_ngram_min_length and filter_ngram_max_length
+  // Example: filter_ngram_min_length = 3, filter_ngram_max_length = 4
+  // Token: "search"
+  // Output: ["sea", "ear", "arc", "rch", "sear", "earc", "arch"]
+  bool use_filter_ngram = 120;
+
+  // Whether to apply edge n-gram indexing (prefix-based) to each token
+  // Used with filter_ngram_min_length and filter_ngram_max_length
+  // Example: filter_ngram_min_length = 3, filter_ngram_max_length = 4
+  // Token: "search"
+  // Output: ["sea", "sear"]
+  bool use_filter_edge_ngram = 121;
+
+  // Minimum length of n-grams to generate (inclusive)
+  // Must be used with use_filter_ngram or use_filter_edge_ngram
+  // Default value is 3
+  uint32 filter_ngram_min_length = 122;
+
+  // Maximum length of n-grams to generate (inclusive)
+  // Must be used with use_filter_ngram or use_filter_edge_ngram
+  // Default value is 4
+  uint32 filter_ngram_max_length = 123;
+
+  // Whether to filter tokens by their length
+  // Must be used with filter_length_min or filter_length_max
+  // Example: filter_length_min = 4, filter_length_max = 6
+  // Tokens: ["foo", "fooba", "foobar", "foobarbaz"]
+  // Output: ["fooba", "foobar"]
+  bool use_filter_length = 130;
+
+  // Minimum token length to keep (inclusive)
+  // Must be used with use_filter_length
+  uint32 filter_length_min = 131;
+
+  // Maximum token length to keep (inclusive)
+  // Must be used with use_filter_length
+  uint32 filter_length_max = 132;
+}
+
+message GlobalFulltextIndex {
+  GlobalIndexSettings settings = 1;
+  FulltextIndexSettings fulltext_settings = 2;
+}
+
+// Represent table index
 message TableIndex {
   // Name of index
   string name = 1;
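
The FulltextIndexSettings comments above describe the FLAT layout and the tokenizer/filter behavior only by example. Below is a minimal Python sketch of those semantics, not the YDB implementation: the helper names are made up, the STANDARD tokenizer is approximated by a regex, and stopword filtering is omitted.

```python
import re
from typing import Iterable

# Tokenizers mirroring the Tokenizer enum semantics described in the comments.
def tokenize_whitespace(text: str) -> list[str]:
    # WHITESPACE: split only on whitespace; punctuation stays inside tokens.
    return text.split()

def tokenize_standard(text: str) -> list[str]:
    # STANDARD: split on whitespace and punctuation (regex is an approximation).
    return [t for t in re.split(r"[^\w]+|_", text) if t]

def tokenize_keyword(text: str) -> list[str]:
    # KEYWORD: the whole input is a single token.
    return [text]

# Filters mirroring the use_filter_* fields.
def filter_lowercase(tokens: Iterable[str]) -> list[str]:
    return [t.lower() for t in tokens]

def filter_length(tokens: Iterable[str], min_len: int, max_len: int) -> list[str]:
    # Keep tokens whose length lies within [min_len, max_len], inclusive.
    return [t for t in tokens if min_len <= len(t) <= max_len]

def filter_ngram(tokens: Iterable[str], min_len: int = 3, max_len: int = 4) -> list[str]:
    # Character n-grams of every token, lengths min_len..max_len inclusive.
    out: list[str] = []
    for t in tokens:
        for n in range(min_len, max_len + 1):
            out.extend(t[i:i + n] for i in range(len(t) - n + 1))
    return out

def filter_edge_ngram(tokens: Iterable[str], min_len: int = 3, max_len: int = 4) -> list[str]:
    # Prefix (edge) n-grams of every token, lengths min_len..max_len inclusive.
    return [t[:n] for t in tokens for n in range(min_len, max_len + 1) if n <= len(t)]

def build_flat_index(rows: dict[int, str]) -> list[tuple[str, int]]:
    # FLAT layout: one (__ydb_token, id) row per distinct token of each source row,
    # sorted so all ids for the same token are adjacent.
    entries = {(tok, pk) for pk, text in rows.items() for tok in tokenize_standard(text)}
    return sorted(entries)

if __name__ == "__main__":
    rows = {1: "The quick brown fox", 2: "The quick blue hare"}
    print(build_flat_index(rows))
    # [('The', 1), ('The', 2), ('blue', 2), ('brown', 1), ('fox', 1),
    #  ('hare', 2), ('quick', 1), ('quick', 2)]
    print(filter_ngram(["search"]))       # ['sea', 'ear', 'arc', 'rch', 'sear', 'earc', 'arch']
    print(filter_edge_ngram(["search"]))  # ['sea', 'sear']
```
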
@@ -125,12 +248,13 @@ message TableIndex {
     GlobalAsyncIndex global_async_index = 4;
     GlobalUniqueIndex global_unique_index = 6;
     GlobalVectorKMeansTreeIndex global_vector_kmeans_tree_index = 7;
+    GlobalFulltextIndex global_fulltext_index = 8;
   }
   // list of columns content to be copied in to index table
   repeated string data_columns = 5;
 }
 
-// Represent secondary index with index state
+// Represent table index with index state
 message TableIndexDescription {
   enum Status {
     STATUS_UNSPECIFIED = 0;
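
With the new oneof member, a full-text index is declared like any other TableIndex. A sketch of what building such a message could look like from Python bindings generated from this proto; the module name `ydb_table_pb2`, the index name, and the column name are placeholders, not the actual YDB SDK surface.

```python
# Hypothetical import path for classes generated from the proto in this change.
from ydb_table_pb2 import (FulltextIndexSettings, GlobalFulltextIndex,
                           GlobalIndexSettings, TableIndex)

fulltext_index = TableIndex(
    name="articles_text_idx",           # hypothetical index name
    index_columns=["text"],             # column whose content is tokenized
    global_fulltext_index=GlobalFulltextIndex(
        settings=GlobalIndexSettings(),
        fulltext_settings=FulltextIndexSettings(
            layout=FulltextIndexSettings.FLAT,
            tokenizer=FulltextIndexSettings.STANDARD,
            language="english",
            use_filter_lowercase=True,
            use_filter_stopwords=True,
        ),
    ),
)
```
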
@@ -648,7 +772,7 @@ message CreateTableRequest {
   // Table profile
   TableProfile profile = 5;
   Ydb.Operations.OperationParams operation_params = 6;
-  // List of secondary indexes
+  // List of table indexes
   repeated TableIndex indexes = 7;
   // Table rows time to live settings
   TtlSettings ttl_settings = 8;
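
Since `indexes` is a repeated TableIndex, a full-text index can be supplied at table creation alongside ordinary ones. A short continuation of the previous sketch, under the same placeholder-module assumption; `session_id`, `columns`, and `primary_key` are elided.

```python
from ydb_table_pb2 import CreateTableRequest  # hypothetical module name

request = CreateTableRequest(
    path="/Root/articles",        # hypothetical table path
    indexes=[fulltext_index],     # TableIndex from the sketch above
    # session_id, columns, primary_key, etc. omitted for brevity
)
```
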
@@ -726,9 +850,9 @@ message AlterTableRequest {
     TtlSettings set_ttl_settings = 7;
     google.protobuf.Empty drop_ttl_settings = 8;
   }
-  // Add secondary indexes
+  // Add table indexes
   repeated TableIndex add_indexes = 9;
-  // Remove secondary indexes
+  // Remove table indexes
   repeated string drop_indexes = 10;
   // Change table storage settings
   StorageSettings alter_storage_settings = 11;
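
For an existing table, the same TableIndex message goes into `add_indexes`, and an index is dropped by name via `drop_indexes`. A sketch under the same assumptions as above (message construction only; whether the server accepts a given combination is out of scope here).

```python
from ydb_table_pb2 import AlterTableRequest  # hypothetical module name

alter = AlterTableRequest(
    path="/Root/articles",
    add_indexes=[fulltext_index],    # TableIndex from the sketch above
    drop_indexes=["old_text_idx"],   # hypothetical name of an index to remove
)
```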