Skip to content

Commit abfc289

Browse files
kunga and github-actions[bot]
authored and committed
Initial public api for fulltext index (#23381)
1 parent 9965a24 commit abfc289

File tree

2 files changed

+130
-6
lines changed

2 files changed

+130
-6
lines changed

.github/last_commit.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
3c40f4d3fdca895220ac3b08d4f7d54e8c3aee4c
1+
5a9d3b720ac59796906673c5bdb547d07b68243a

src/api/protos/ydb_table.proto

Lines changed: 129 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -113,7 +113,130 @@ message GlobalVectorKMeansTreeIndex {
113113
KMeansTreeSettings vector_settings = 3;
114114
}
115115

116-
// Represent secondary index
116+
message FulltextIndexSettings {
117+
// Specifies the layout strategy for storing and updating the full-text index
118+
enum Layout {
119+
LAYOUT_UNSPECIFIED = 0;
120+
121+
// Uses a single flat inverted index table (indexImplTable)
122+
// Example source table:
123+
// ┌────┬────────────────────────────┐
124+
// │ id │ text │
125+
// ├────┼────────────────────────────┤
126+
// │ 1 │ "The quick brown fox" │
127+
// │ 2 │ "The quick blue hare" │
128+
// └────┴────────────────────────────┘
129+
// Example inverted index table (indexImplTable):
130+
// ┌──────────────┬────┐
131+
// │ __ydb_token │ id │
132+
// ├──────────────┼────┤
133+
// │ "blue" │ 2 │
134+
// │ "brown" │ 1 │
135+
// │ "fox" │ 1 │
136+
// │ "hare" │ 2 │
137+
// │ "quick" │ 1 │
138+
// │ "quick" │ 2 │
139+
// │ "The" │ 1 │
140+
// │ "The" │ 2 │
141+
// └──────────────┴────┘
142+
FLAT = 1;
143+
}
144+
145+
// Specifies how text is tokenized during indexing
146+
enum Tokenizer {
147+
TOKENIZER_UNSPECIFIED = 0;
148+
149+
// Splits text only by whitespace
150+
// Does not split on punctuation
151+
// Example:
152+
// Text: "foo-bar baz_lorem ipsum"
153+
// Tokens: ["foo-bar", "baz_lorem", "ipsum"]
154+
WHITESPACE = 1;
155+
156+
// Applies general language-aware tokenization
157+
// Splits text on whitespace and punctuation
158+
// Example:
159+
// Text: "foo-bar baz_lorem ipsum"
160+
// Tokens: ["foo", "bar", "baz", "lorem", "ipsum"]
161+
STANDARD = 2;
162+
163+
// Treats the entire input as a single token
164+
// No splitting is performed
165+
// Example:
166+
// Text: "Hello World!"
167+
// Tokens: ["Hello World!"]
168+
KEYWORD = 3;
169+
}
170+
171+
// See Layout enum
172+
Layout layout = 1;
173+
174+
// See Tokenizer enum
175+
Tokenizer tokenizer = 2;
176+
177+
// Language used for language-sensitive operations like stopword filtering
178+
// Example: language = "english"
179+
// If not specified (the default), no language-specific logic is applied
180+
string language = 3;
181+
182+
// Whether to convert tokens to lowercase
183+
// Example:
184+
// Token: "Quick"
185+
// Output: "quick"
186+
bool use_filter_lowercase = 100;
187+
188+
// Whether to remove common stopwords like "the", "a", "is"
189+
// Example: language = "english"
190+
// Tokens: ["the", "quick", "brown"]
191+
// Output: ["quick", "brown"]
192+
bool use_filter_stopwords = 110;
193+
194+
// Whether to apply character n-gram indexing to each token
195+
// Must be used with filter_ngram_min_length and filter_ngram_max_length
196+
// Example: filter_ngram_min_length = 3, filter_ngram_max_length = 4
197+
// Token: "search"
198+
// Output: ["sea", "ear", "arc", "rch", "sear", "earc", "arch"]
199+
bool use_filter_ngram = 120;
200+
201+
// Whether to apply edge n-gram indexing (prefix-based) to each token
202+
// Used with filter_ngram_min_length and filter_ngram_max_length
203+
// Example: filter_ngram_min_length = 3, filter_ngram_max_length = 4
204+
// Token: "search"
205+
// Output: ["sea", "sear"]
206+
bool use_filter_edge_ngram = 121;
207+
208+
// Minimum length of n-grams to generate (inclusive)
209+
// Must be used with use_filter_ngram or use_filter_edge_ngram
210+
// Default value is 3
211+
uint32 filter_ngram_min_length = 122;
212+
213+
// Maximum length of n-grams to generate (inclusive)
214+
// Must be used with use_filter_ngram or use_filter_edge_ngram
215+
// Default value is 4
216+
uint32 filter_ngram_max_length = 123;
217+
218+
// Whether to filter tokens by their length
219+
// Must be used with filter_length_min or filter_length_max
220+
// Example: filter_length_min = 4, filter_length_max = 6
221+
// Tokens: ["foo", "fooba", "foobar", "foobarbaz"]
222+
// Output: ["fooba", "foobar"]
223+
bool use_filter_length = 130;
224+
225+
// Minimum token length to keep (inclusive)
226+
// Must be used with use_filter_length
227+
uint32 filter_length_min = 131;
228+
229+
// Maximum token length to keep (inclusive)
230+
// Must be used with use_filter_length
231+
uint32 filter_length_max = 132;
232+
}
233+
234+
message GlobalFulltextIndex {
235+
GlobalIndexSettings settings = 1;
236+
FulltextIndexSettings fulltext_settings = 2;
237+
}
238+
239+
// Represents a table index
117240
message TableIndex {
118241
// Name of index
119242
string name = 1;
@@ -125,12 +248,13 @@ message TableIndex {
125248
GlobalAsyncIndex global_async_index = 4;
126249
GlobalUniqueIndex global_unique_index = 6;
127250
GlobalVectorKMeansTreeIndex global_vector_kmeans_tree_index = 7;
251+
GlobalFulltextIndex global_fulltext_index = 8;
128252
}
129253
// List of columns whose content is copied into the index table
130254
repeated string data_columns = 5;
131255
}
132256

133-
// Represent secondary index with index state
257+
// Represents a table index together with its index state
134258
message TableIndexDescription {
135259
enum Status {
136260
STATUS_UNSPECIFIED = 0;
@@ -648,7 +772,7 @@ message CreateTableRequest {
648772
// Table profile
649773
TableProfile profile = 5;
650774
Ydb.Operations.OperationParams operation_params = 6;
651-
// List of secondary indexes
775+
// List of table indexes
652776
repeated TableIndex indexes = 7;
653777
// Table rows time to live settings
654778
TtlSettings ttl_settings = 8;
@@ -726,9 +850,9 @@ message AlterTableRequest {
726850
TtlSettings set_ttl_settings = 7;
727851
google.protobuf.Empty drop_ttl_settings = 8;
728852
}
729-
// Add secondary indexes
853+
// Add table indexes
730854
repeated TableIndex add_indexes = 9;
731-
// Remove secondary indexes
855+
// Remove table indexes
732856
repeated string drop_indexes = 10;
733857
// Change table storage settings
734858
StorageSettings alter_storage_settings = 11;

0 commit comments

Comments
 (0)