@@ -113,7 +113,149 @@ message GlobalVectorKMeansTreeIndex {
113113 KMeansTreeSettings vector_settings = 3 ;
114114}
115115
116- // Represent secondary index
116+ message FulltextIndexSettings { // Settings of a full-text index: storage layout and per-column text analyzers
117+ // Specifies the layout strategy for storing and updating the full-text index
118+ enum Layout {
119+ LAYOUT_UNSPECIFIED = 0 ;
120+
121+ // Uses a single flat inverted index table (indexImplTable)
122+ // Example source table:
123+ // ┌────┬────────────────────────────┐
124+ // │ id │ text │
125+ // ├────┼────────────────────────────┤
126+ // │ 1 │ "The quick brown fox" │
127+ // │ 2 │ "The quick blue hare" │
128+ // └────┴────────────────────────────┘
129+ // Example inverted index table (indexImplTable), one row per (token, id) pair:
130+ // ┌──────────────┬────┐
131+ // │ __ydb_token │ id │
132+ // ├──────────────┼────┤
133+ // │ "blue" │ 2 │
134+ // │ "brown" │ 1 │
135+ // │ "fox" │ 1 │
136+ // │ "hare" │ 2 │
137+ // │ "quick" │ 1 │
138+ // │ "quick" │ 2 │
139+ // │ "The" │ 1 │
140+ // │ "The" │ 2 │
141+ // └──────────────┴────┘
142+ // Supports a single column only
143+ FLAT = 1 ;
144+ }
145+
146+ // Specifies how text is tokenized during indexing
147+ enum Tokenizer {
148+ TOKENIZER_UNSPECIFIED = 0 ;
149+
150+ // Splits text only by whitespace
151+ // Does not split on punctuation
152+ // Example:
153+ // Text: "foo-bar baz_lorem ipsum"
154+ // Tokens: ["foo-bar", "baz_lorem", "ipsum"]
155+ WHITESPACE = 1 ;
156+
157+ // Applies general language-aware tokenization
158+ // Splits text on whitespace and punctuation
159+ // Example:
160+ // Text: "foo-bar baz_lorem ipsum"
161+ // Tokens: ["foo", "bar", "baz", "lorem", "ipsum"]
162+ STANDARD = 2 ;
163+
164+ // Treats the entire input as a single token
165+ // No splitting is performed
166+ // Example:
167+ // Text: "Hello World!"
168+ // Tokens: ["Hello World!"]
169+ KEYWORD = 3 ;
170+ }
171+
172+ // Represents text analyzer settings: a tokenizer plus optional token filters
173+ message Analyzers {
174+ // See Tokenizer enum
175+ optional Tokenizer tokenizer = 1 ;
176+
177+ // Language used for language-sensitive operations like stopword filtering
178+ // Example: language = "english"
179+ // Not specified by default, in which case no language-specific logic is applied
180+ optional string language = 2 ;
181+
182+ // Whether to convert tokens to lowercase
183+ // Example:
184+ // Token: "Quick"
185+ // Output: "quick"
186+ optional bool use_filter_lowercase = 100 ;
187+
188+ // Whether to remove common stopwords like "the", "a", "is"
189+ // Example: language = "english"
190+ // Tokens: ["the", "quick", "brown"]
191+ // Output: ["quick", "brown"]
192+ optional bool use_filter_stopwords = 110 ;
193+
194+ // Whether to apply character n-gram indexing to each token
195+ // Must be used with filter_ngram_min_length and filter_ngram_max_length
196+ // Example: filter_ngram_min_length = 3, filter_ngram_max_length = 4
197+ // Token: "search"
198+ // Output: ["sea", "ear", "arc", "rch", "sear", "earc", "arch"]
199+ optional bool use_filter_ngram = 120 ;
200+
201+ // Whether to apply edge n-gram indexing (prefix-based) to each token
202+ // Used with filter_ngram_min_length and filter_ngram_max_length
203+ // Example: filter_ngram_min_length = 3, filter_ngram_max_length = 4
204+ // Token: "search"
205+ // Output: ["sea", "sear"]
206+ optional bool use_filter_edge_ngram = 121 ;
207+
208+ // Minimum length of n-grams to generate (inclusive)
209+ // Must be used with use_filter_ngram or use_filter_edge_ngram
210+ // Default value is 3
211+ optional int32 filter_ngram_min_length = 122 [(Ydb.value ) = ">= 0" ];
212+
213+ // Maximum length of n-grams to generate (inclusive)
214+ // Must be used with use_filter_ngram or use_filter_edge_ngram
215+ // Default value is 4
216+ optional int32 filter_ngram_max_length = 123 [(Ydb.value ) = ">= 0" ];
217+
218+ // Whether to filter tokens by their length
219+ // Must be used with filter_length_min or filter_length_max
220+ // Example: filter_length_min = 4, filter_length_max = 6
221+ // Tokens: ["foo", "fooba", "foobar", "foobarbaz"]
222+ // Output: ["fooba", "foobar"]
223+ optional bool use_filter_length = 130 ;
224+
225+ // Minimum token length to keep (inclusive)
226+ // Must be used with use_filter_length
227+ optional int32 filter_length_min = 131 [(Ydb.value ) = ">= 0" ];
228+
229+ // Maximum token length to keep (inclusive)
230+ // Must be used with use_filter_length
231+ optional int32 filter_length_max = 132 [(Ydb.value ) = ">= 0" ];
232+ }
233+
234+ // Represents text analyzer settings for a specific column
235+ message ColumnAnalyzers {
236+ // Name of the column to be indexed
237+ optional string column = 1 ;
238+
239+ // Analyzer settings specific to this column
240+ Analyzers analyzers = 2 ;
241+ }
242+
243+ // See Layout enum
244+ optional Layout layout = 1 ;
245+
246+ // List of columns and their full-text settings
247+ // Currently, this list should contain a single entry with specified analyzers
248+ // Later, some columns may not use analyzers and will be indexed as-is
249+ // This list must always match TableIndex.index_columns
250+ repeated ColumnAnalyzers columns = 2 ;
251+ }
252+
253+ message GlobalFulltextIndex { // Describes a global full-text index: GlobalIndexSettings plus FulltextIndexSettings
254+ GlobalIndexSettings settings = 1 ;
255+ FulltextIndexSettings fulltext_settings = 2 ; // Layout and per-column analyzers, see FulltextIndexSettings
256+ }
257+
258+ // Represents a table index
117259message TableIndex {
118260 // Name of index
119261 string name = 1 ;
@@ -125,12 +267,13 @@ message TableIndex {
125267 GlobalAsyncIndex global_async_index = 4 ;
126268 GlobalUniqueIndex global_unique_index = 6 ;
127269 GlobalVectorKMeansTreeIndex global_vector_kmeans_tree_index = 7 ;
270+ GlobalFulltextIndex global_fulltext_index = 8 ;
128271 }
130273 // List of columns whose content is copied into the index table
130273 repeated string data_columns = 5 ;
131274}
132275
133- // Represent secondary index with index state
276+ // Represents a table index together with its state
134277message TableIndexDescription {
135278 enum Status {
136279 STATUS_UNSPECIFIED = 0 ;
@@ -149,6 +292,7 @@ message TableIndexDescription {
149292 GlobalAsyncIndex global_async_index = 5 ;
150293 GlobalUniqueIndex global_unique_index = 8 ;
151294 GlobalVectorKMeansTreeIndex global_vector_kmeans_tree_index = 9 ;
295+ GlobalFulltextIndex global_fulltext_index = 10 ;
152296 }
153297 Status status = 4 ;
154298 // List of columns whose content is copied into the index table
@@ -648,7 +792,7 @@ message CreateTableRequest {
648792 // Table profile
649793 TableProfile profile = 5 ;
650794 Ydb.Operations.OperationParams operation_params = 6 ;
651- // List of secondary indexes
795+ // List of table indexes
652796 repeated TableIndex indexes = 7 ;
653797 // Table rows time to live settings
654798 TtlSettings ttl_settings = 8 ;
@@ -726,9 +870,9 @@ message AlterTableRequest {
726870 TtlSettings set_ttl_settings = 7 ;
727871 google.protobuf.Empty drop_ttl_settings = 8 ;
728872 }
729- // Add secondary indexes
873+ // Add table indexes
730874 repeated TableIndex add_indexes = 9 ;
731- // Remove secondary indexes
875+ // Remove table indexes
732876 repeated string drop_indexes = 10 ;
733877 // Change table storage settings
734878 StorageSettings alter_storage_settings = 11 ;
0 commit comments