Skip to content

Commit 39c3696

Browse files
kunga authored and github-actions[bot] committed
Support multiple columns in fulltext index (#24439)
1 parent 2412ccc commit 39c3696

File tree

2 files changed

+77
-59
lines changed

2 files changed

+77
-59
lines changed

.github/last_commit.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
4e76cbcda6df56a31adbfc6506f7b0cbdf4a1240
1+
835e2f5f88c3c3e25f26ca50788d94f003a344dd

src/api/protos/ydb_table.proto

Lines changed: 76 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -139,6 +139,7 @@ message FulltextIndexSettings {
139139
// │ "The" │ 1 │
140140
// │ "The" │ 2 │
141141
// └──────────────┴────┘
142+
// Supports a single column only
142143
FLAT = 1;
143144
}
144145

@@ -168,67 +169,84 @@ message FulltextIndexSettings {
168169
KEYWORD = 3;
169170
}
170171

172+
// Represents text analyzers settings
173+
message Analyzers {
174+
// See Tokenizer enum
175+
Tokenizer tokenizer = 1;
176+
177+
// Language used for language-sensitive operations like stopword filtering
178+
// Example: language = "english"
179+
// By default is not specified and no language-specific logic is applied
180+
string language = 2;
181+
182+
// Whether to convert tokens to lowercase
183+
// Example:
184+
// Token: "Quick"
185+
// Output: "quick"
186+
bool use_filter_lowercase = 100;
187+
188+
// Whether to remove common stopwords like "the", "a", "is"
189+
// Example: language = "english"
190+
// Tokens: ["the", "quick", "brown"]
191+
// Output: ["quick", "brown"]
192+
bool use_filter_stopwords = 110;
193+
194+
// Whether to apply character n-gram indexing to each token
195+
// Must be used with filter_ngram_min_length and filter_ngram_max_length
196+
// Example: filter_ngram_min_length = 3, filter_ngram_max_length = 4
197+
// Token: "search"
198+
// Output: ["sea", "ear", "arc", "rch", "sear", "earc", "arch"]
199+
bool use_filter_ngram = 120;
200+
201+
// Whether to apply edge n-gram indexing (prefix-based) to each token
202+
// Used with filter_ngram_min_length and filter_ngram_max_length
203+
// Example: filter_ngram_min_length = 3, filter_ngram_max_length = 4
204+
// Token: "search"
205+
// Output: ["sea", "sear"]
206+
bool use_filter_edge_ngram = 121;
207+
208+
// Minimum length of n-grams to generate (inclusive)
209+
// Must be used with use_filter_ngram or use_filter_edge_ngram
210+
// Default value is 3
211+
int32 filter_ngram_min_length = 122 [(Ydb.value) = ">= 0"];
212+
213+
// Maximum length of n-grams to generate (inclusive)
214+
// Must be used with use_filter_ngram or use_filter_edge_ngram
215+
// Default value is 4
216+
int32 filter_ngram_max_length = 123 [(Ydb.value) = ">= 0"];
217+
218+
// Whether to filter tokens by their length
219+
// Must be used with filter_length_min or filter_length_max
220+
// Example: filter_length_min = 4, filter_length_max = 6
221+
// Tokens: ["foo", "fooba", "foobar", "foobarbaz"]
222+
// Output: ["fooba", "foobar"]
223+
bool use_filter_length = 130;
224+
225+
// Minimum token length to keep (inclusive)
226+
// Must be used with use_filter_length
227+
int32 filter_length_min = 131 [(Ydb.value) = ">= 0"];
228+
229+
// Maximum token length to keep (inclusive)
230+
// Must be used with use_filter_length
231+
int32 filter_length_max = 132 [(Ydb.value) = ">= 0"];
232+
}
233+
234+
// Represents text analyzers settings for a specific column
235+
message ColumnAnalyzers {
236+
// Name of the column to be indexed
237+
string column = 1;
238+
239+
// Analyzer settings specific to this column
240+
Analyzers analyzers = 2;
241+
}
242+
171243
// See Layout enum
172244
Layout layout = 1;
173245

174-
// See Tokenizer enum
175-
Tokenizer tokenizer = 2;
176-
177-
// Language used for language-sensitive operations like stopword filtering
178-
// Example: language = "english"
179-
// By default is not specified and no language-specific logic is applied
180-
string language = 3;
181-
182-
// Whether to convert tokens to lowercase
183-
// Example:
184-
// Token: "Quick"
185-
// Output: "quick"
186-
bool use_filter_lowercase = 100;
187-
188-
// Whether to remove common stopwords like "the", "a", "is"
189-
// Example: language = "english"
190-
// Tokens: ["the", "quick", "brown"]
191-
// Output: ["quick", "brown"]
192-
bool use_filter_stopwords = 110;
193-
194-
// Whether to apply character n-gram indexing to each token
195-
// Must be used with filter_ngram_min_length and filter_ngram_max_length
196-
// Example: filter_ngram_min_length = 3, filter_ngram_max_length = 4
197-
// Token: "search"
198-
// Output: ["sea", "ear", "arc", "rch", "sear", "earc", "arch"]
199-
bool use_filter_ngram = 120;
200-
201-
// Whether to apply edge n-gram indexing (prefix-based) to each token
202-
// Used with filter_ngram_min_length and filter_ngram_max_length
203-
// Example: filter_ngram_min_length = 3, filter_ngram_max_length = 4
204-
// Token: "search"
205-
// Output: ["sea", "sear"]
206-
bool use_filter_edge_ngram = 121;
207-
208-
// Minimum length of n-grams to generate (inclusive)
209-
// Must be used with use_filter_ngram or use_filter_edge_ngram
210-
// Default value is 3
211-
int32 filter_ngram_min_length = 122 [(Ydb.value) = ">= 0"];
212-
213-
// Maximum length of n-grams to generate (inclusive)
214-
// Must be used with use_filter_ngram or use_filter_edge_ngram
215-
// Default value is 4
216-
int32 filter_ngram_max_length = 123 [(Ydb.value) = ">= 0"];
217-
218-
// Whether to filter tokens by their length
219-
// Must be used with filter_length_min or filter_length_max
220-
// Example: filter_length_min = 4, filter_length_max = 6
221-
// Tokens: ["foo", "fooba", "foobar", "foobarbaz"]
222-
// Output: ["fooba", "foobar"]
223-
bool use_filter_length = 130;
224-
225-
// Minimum token length to keep (inclusive)
226-
// Must be used with use_filter_length
227-
int32 filter_length_min = 131 [(Ydb.value) = ">= 0"];
228-
229-
// Maximum token length to keep (inclusive)
230-
// Must be used with use_filter_length
231-
int32 filter_length_max = 132 [(Ydb.value) = ">= 0"];
246+
// List of columns and their fulltext settings
247+
// Currently, this list should contain a single entry
248+
// And provided column should be the only one in the TableIndex.index_columns list
249+
repeated ColumnAnalyzers columns = 2;
232250
}
233251

234252
message GlobalFulltextIndex {

0 commit comments

Comments (0)