Initial public api for fulltext index (#23381)

kunga · github-actions[bot] · commit abfc28913dbf · 2025-09-29T10:37:30.000Z
diff --git a/.github/last_commit.txt b/.github/last_commit.txt
@@ -1 +1 @@
-3c40f4d3fdca895220ac3b08d4f7d54e8c3aee4c
+5a9d3b720ac59796906673c5bdb547d07b68243a
diff --git a/src/api/protos/ydb_table.proto b/src/api/protos/ydb_table.proto
@@ -113,7 +113,130 @@ message GlobalVectorKMeansTreeIndex {
     KMeansTreeSettings vector_settings = 3;
 }
 
-// Represent secondary index
+message FulltextIndexSettings { // Settings of a full-text index: layout, tokenizer, language and token filters
+    // Selects the layout strategy used to store and update the full-text index
+    enum Layout {
+        LAYOUT_UNSPECIFIED = 0;
+
+        // Stores the index as a single flat inverted index table (indexImplTable)
+        // Example source table:
+        //     ┌────┬────────────────────────────┐
+        //     │ id │ text                       │
+        //     ├────┼────────────────────────────┤
+        //     │ 1  │ "The quick brown fox"      │
+        //     │ 2  │ "The quick blue hare"      │
+        //     └────┴────────────────────────────┘
+        // Example inverted index table (indexImplTable), one row per (token, id) pair:
+        //     ┌──────────────┬────┐
+        //     │ __ydb_token  │ id │
+        //     ├──────────────┼────┤
+        //     │ "blue"       │ 2  │
+        //     │ "brown"      │ 1  │
+        //     │ "fox"        │ 1  │
+        //     │ "hare"       │ 2  │
+        //     │ "quick"      │ 1  │
+        //     │ "quick"      │ 2  │
+        //     │ "The"        │ 1  │
+        //     │ "The"        │ 2  │
+        //     └──────────────┴────┘
+        FLAT = 1;
+    }
+
+    // Selects how text is split into tokens during indexing
+    enum Tokenizer {
+        TOKENIZER_UNSPECIFIED = 0;
+
+        // Splits text on whitespace only
+        // Punctuation is kept inside the tokens
+        // Example:
+        //   Text: "foo-bar baz_lorem ipsum"
+        //   Tokens: ["foo-bar", "baz_lorem", "ipsum"]
+        WHITESPACE = 1;
+
+        // Applies general language-aware tokenization
+        // Splits text on both whitespace and punctuation
+        // Example:
+        //   Text: "foo-bar baz_lorem ipsum"
+        //   Tokens: ["foo", "bar", "baz", "lorem", "ipsum"]
+        STANDARD = 2;
+
+        // Treats the entire input as a single token
+        // No splitting is performed
+        // Example:
+        //   Text: "Hello World!"
+        //   Tokens: ["Hello World!"]
+        KEYWORD = 3;
+    }
+
+    // Index storage layout; see the Layout enum
+    Layout layout = 1;
+
+    // Tokenization strategy; see the Tokenizer enum
+    Tokenizer tokenizer = 2;
+
+    // Language used for language-sensitive operations such as stopword filtering
+    // Example: language = "english"
+    // Not specified by default, in which case no language-specific logic is applied
+    string language = 3;
+
+    // Whether to convert tokens to lowercase
+    // Example:
+    //   Token: "Quick"
+    //   Output: "quick"
+    bool use_filter_lowercase = 100;
+
+    // Whether to remove common stopwords like "the", "a", "is"
+    // Example: language = "english"
+    //   Tokens: ["the", "quick", "brown"]
+    //   Output: ["quick", "brown"]
+    bool use_filter_stopwords = 110;
+
+    // Whether to apply character n-gram indexing to each token
+    // Must be used with filter_ngram_min_length and filter_ngram_max_length
+    // Example: filter_ngram_min_length = 3, filter_ngram_max_length = 4
+    //   Token: "search"
+    //   Output: ["sea", "ear", "arc", "rch", "sear", "earc", "arch"]
+    bool use_filter_ngram = 120;
+
+    // Whether to apply edge n-gram indexing (prefix-based) to each token
+    // Used with filter_ngram_min_length and filter_ngram_max_length
+    // Example: filter_ngram_min_length = 3, filter_ngram_max_length = 4
+    //   Token: "search"
+    //   Output: ["sea", "sear"]
+    bool use_filter_edge_ngram = 121;
+
+    // Minimum length of n-grams to generate (inclusive)
+    // Must be used with use_filter_ngram or use_filter_edge_ngram
+    // Default value is 3; NOTE(review): presumably 0/unset selects the default - confirm
+    uint32 filter_ngram_min_length = 122;
+
+    // Maximum length of n-grams to generate (inclusive)
+    // Must be used with use_filter_ngram or use_filter_edge_ngram
+    // Default value is 4; NOTE(review): presumably 0/unset selects the default - confirm
+    uint32 filter_ngram_max_length = 123;
+
+    // Whether to filter tokens by their length
+    // Must be used with filter_length_min or filter_length_max
+    // Example: filter_length_min = 4, filter_length_max = 6
+    //   Tokens: ["foo", "fooba", "foobar", "foobarbaz"]
+    //   Output: ["fooba", "foobar"]
+    bool use_filter_length = 130;
+
+    // Minimum token length to keep (inclusive); NOTE(review): unit (characters vs bytes) is not stated - confirm
+    // Must be used with use_filter_length
+    uint32 filter_length_min = 131;
+
+    // Maximum token length to keep (inclusive); NOTE(review): unit (characters vs bytes) is not stated - confirm
+    // Must be used with use_filter_length
+    uint32 filter_length_max = 132;
+}
+
+message GlobalFulltextIndex { // Global full-text index: general index settings plus full-text specific settings
+    GlobalIndexSettings settings = 1; // Type is declared outside this hunk; NOTE(review): presumably settings common to global indexes - confirm
+    FulltextIndexSettings fulltext_settings = 2; // Layout, tokenizer, language and token filter options
+}
+
+// Represents a table index
 message TableIndex {
     // Name of index
     string name = 1;
@@ -125,12 +248,13 @@ message TableIndex {
        GlobalAsyncIndex global_async_index = 4;
        GlobalUniqueIndex global_unique_index = 6;
        GlobalVectorKMeansTreeIndex global_vector_kmeans_tree_index = 7;
+       GlobalFulltextIndex global_fulltext_index = 8;
     }
     // list of columns content to be copied in to index table
     repeated string data_columns = 5;
 }
 
-// Represent secondary index with index state
+// Represents a table index together with its state
 message TableIndexDescription {
     enum Status {
         STATUS_UNSPECIFIED = 0;
@@ -648,7 +772,7 @@ message CreateTableRequest {
     // Table profile
     TableProfile profile = 5;
     Ydb.Operations.OperationParams operation_params = 6;
-    // List of secondary indexes
+    // List of table indexes
     repeated TableIndex indexes = 7;
     // Table rows time to live settings
     TtlSettings ttl_settings = 8;
@@ -726,9 +850,9 @@ message AlterTableRequest {
         TtlSettings set_ttl_settings = 7;
         google.protobuf.Empty drop_ttl_settings = 8;
     }
-    // Add secondary indexes
+    // Add table indexes
     repeated TableIndex add_indexes = 9;
-    // Remove secondary indexes
+    // Remove table indexes
     repeated string drop_indexes = 10;
     // Change table storage settings
     StorageSettings alter_storage_settings = 11;

Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-3c40f4d3fdca895220ac3b08d4f7d54e8c3aee4c`
	`1`	`+5a9d3b720ac59796906673c5bdb547d07b68243a`