Skip to content

Commit db34b46

Browse files
maximyurchuk, Gazizonoki
authored and committed
Fulltext index: Initial version (#23698)
1 parent d2d7b1f commit db34b46

File tree

2 files changed

+150
-6
lines changed

2 files changed

+150
-6
lines changed

.github/last_commit.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
a259bb7ecddc03e6802baf19019a34a051b2559d
1+
ccb159ff5e6a49d325f2342fea08f07e9bd30a83

src/api/protos/ydb_table.proto

Lines changed: 149 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -113,7 +113,149 @@ message GlobalVectorKMeansTreeIndex {
113113
KMeansTreeSettings vector_settings = 3;
114114
}
115115

116-
// Represent secondary index
116+
message FulltextIndexSettings {
117+
// Specifies the layout strategy for storing and updating the full-text index
118+
enum Layout {
119+
LAYOUT_UNSPECIFIED = 0;
120+
121+
// Uses a single flat inverted index table (indexImplTable)
122+
// Example source table:
123+
// ┌────┬────────────────────────────┐
124+
// │ id │ text │
125+
// ├────┼────────────────────────────┤
126+
// │ 1 │ "The quick brown fox" │
127+
// │ 2 │ "The quick blue hare" │
128+
// └────┴────────────────────────────┘
129+
// Example inverted index table (indexImplTable):
130+
// ┌──────────────┬────┐
131+
// │ __ydb_token │ id │
132+
// ├──────────────┼────┤
133+
// │ "blue" │ 2 │
134+
// │ "brown" │ 1 │
135+
// │ "fox" │ 1 │
136+
// │ "hare" │ 2 │
137+
// │ "quick" │ 1 │
138+
// │ "quick" │ 2 │
139+
// │ "The" │ 1 │
140+
// │ "The" │ 2 │
141+
// └──────────────┴────┘
142+
// Supports a single column only
143+
FLAT = 1;
144+
}
145+
146+
// Specifies how text is tokenized during indexing
147+
enum Tokenizer {
148+
TOKENIZER_UNSPECIFIED = 0;
149+
150+
// Splits text only by whitespace
151+
// Does not split on punctuation
152+
// Example:
153+
// Text: "foo-bar baz_lorem ipsum"
154+
// Tokens: ["foo-bar", "baz_lorem", "ipsum"]
155+
WHITESPACE = 1;
156+
157+
// Applies general language-aware tokenization
158+
// Splits text on whitespace and punctuation
159+
// Example:
160+
// Text: "foo-bar baz_lorem ipsum"
161+
// Tokens: ["foo", "bar", "baz", "lorem", "ipsum"]
162+
STANDARD = 2;
163+
164+
// Treats the entire input as a single token
165+
// No splitting is performed
166+
// Example:
167+
// Text: "Hello World!"
168+
// Tokens: ["Hello World!"]
169+
KEYWORD = 3;
170+
}
171+
172+
// Represents text analyzer settings
173+
message Analyzers {
174+
// See Tokenizer enum
175+
optional Tokenizer tokenizer = 1;
176+
177+
// Language used for language-sensitive operations like stopword filtering
178+
// Example: language = "english"
179+
// Not specified by default, in which case no language-specific logic is applied
180+
optional string language = 2;
181+
182+
// Whether to convert tokens to lowercase
183+
// Example:
184+
// Token: "Quick"
185+
// Output: "quick"
186+
optional bool use_filter_lowercase = 100;
187+
188+
// Whether to remove common stopwords like "the", "a", "is"
189+
// Example: language = "english"
190+
// Tokens: ["the", "quick", "brown"]
191+
// Output: ["quick", "brown"]
192+
optional bool use_filter_stopwords = 110;
193+
194+
// Whether to apply character n-gram indexing to each token
195+
// Must be used with filter_ngram_min_length and filter_ngram_max_length
196+
// Example: filter_ngram_min_length = 3, filter_ngram_max_length = 4
197+
// Token: "search"
198+
// Output: ["sea", "ear", "arc", "rch", "sear", "earc", "arch"]
199+
optional bool use_filter_ngram = 120;
200+
201+
// Whether to apply edge n-gram indexing (prefix-based) to each token
202+
// Used with filter_ngram_min_length and filter_ngram_max_length
203+
// Example: filter_ngram_min_length = 3, filter_ngram_max_length = 4
204+
// Token: "search"
205+
// Output: ["sea", "sear"]
206+
optional bool use_filter_edge_ngram = 121;
207+
208+
// Minimum length of n-grams to generate (inclusive)
209+
// Must be used with use_filter_ngram or use_filter_edge_ngram
210+
// Default value is 3
211+
optional int32 filter_ngram_min_length = 122 [(Ydb.value) = ">= 0"];
212+
213+
// Maximum length of n-grams to generate (inclusive)
214+
// Must be used with use_filter_ngram or use_filter_edge_ngram
215+
// Default value is 4
216+
optional int32 filter_ngram_max_length = 123 [(Ydb.value) = ">= 0"];
217+
218+
// Whether to filter tokens by their length
219+
// Must be used with filter_length_min or filter_length_max
220+
// Example: filter_length_min = 4, filter_length_max = 6
221+
// Tokens: ["foo", "fooba", "foobar", "foobarbaz"]
222+
// Output: ["fooba", "foobar"]
223+
optional bool use_filter_length = 130;
224+
225+
// Minimum token length to keep (inclusive)
226+
// Must be used with use_filter_length
227+
optional int32 filter_length_min = 131 [(Ydb.value) = ">= 0"];
228+
229+
// Maximum token length to keep (inclusive)
230+
// Must be used with use_filter_length
231+
optional int32 filter_length_max = 132 [(Ydb.value) = ">= 0"];
232+
}
233+
234+
// Represents text analyzer settings for a specific column
235+
message ColumnAnalyzers {
236+
// Name of the column to be indexed
237+
optional string column = 1;
238+
239+
// Analyzer settings specific to this column
240+
Analyzers analyzers = 2;
241+
}
242+
243+
// See Layout enum
244+
optional Layout layout = 1;
245+
246+
// List of columns and their fulltext settings
247+
// Currently, this list should contain a single entry with specified analyzers
248+
// Later, some columns may not use analyzers and will be indexed as-is
249+
// This list must always match TableIndex.index_columns
250+
repeated ColumnAnalyzers columns = 2;
251+
}
252+
253+
message GlobalFulltextIndex {
254+
GlobalIndexSettings settings = 1;
255+
FulltextIndexSettings fulltext_settings = 2;
256+
}
257+
258+
// Represents a table index
117259
message TableIndex {
118260
// Name of index
119261
string name = 1;
@@ -125,12 +267,13 @@ message TableIndex {
125267
GlobalAsyncIndex global_async_index = 4;
126268
GlobalUniqueIndex global_unique_index = 6;
127269
GlobalVectorKMeansTreeIndex global_vector_kmeans_tree_index = 7;
270+
GlobalFulltextIndex global_fulltext_index = 8;
128271
}
129272
// List of columns whose content is copied into the index table
130273
repeated string data_columns = 5;
131274
}
132275

133-
// Represent secondary index with index state
276+
// Represents a table index together with its state
134277
message TableIndexDescription {
135278
enum Status {
136279
STATUS_UNSPECIFIED = 0;
@@ -149,6 +292,7 @@ message TableIndexDescription {
149292
GlobalAsyncIndex global_async_index = 5;
150293
GlobalUniqueIndex global_unique_index = 8;
151294
GlobalVectorKMeansTreeIndex global_vector_kmeans_tree_index = 9;
295+
GlobalFulltextIndex global_fulltext_index = 10;
152296
}
153297
Status status = 4;
154298
// List of columns whose content is copied into the index table
@@ -648,7 +792,7 @@ message CreateTableRequest {
648792
// Table profile
649793
TableProfile profile = 5;
650794
Ydb.Operations.OperationParams operation_params = 6;
651-
// List of secondary indexes
795+
// List of table indexes
652796
repeated TableIndex indexes = 7;
653797
// Table rows time to live settings
654798
TtlSettings ttl_settings = 8;
@@ -726,9 +870,9 @@ message AlterTableRequest {
726870
TtlSettings set_ttl_settings = 7;
727871
google.protobuf.Empty drop_ttl_settings = 8;
728872
}
729-
// Add secondary indexes
873+
// Add table indexes
730874
repeated TableIndex add_indexes = 9;
731-
// Remove secondary indexes
875+
// Remove table indexes
732876
repeated string drop_indexes = 10;
733877
// Change table storage settings
734878
StorageSettings alter_storage_settings = 11;

0 commit comments

Comments
 (0)