elastic
diff --git a/‎docs/changelog/129089.yaml‎
Lines changed: 5 additions & 0 deletions b/‎docs/changelog/129089.yaml‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎docs/reference/elasticsearch/mapping-reference/sparse-vector.md‎
Lines changed: 59 additions & 0 deletions b/‎docs/reference/elasticsearch/mapping-reference/sparse-vector.md‎
Lines changed: 59 additions & 0 deletions
diff --git a/‎server/src/main/java/org/elasticsearch/TransportVersions.java‎
Lines changed: 2 additions & 1 deletion b/‎server/src/main/java/org/elasticsearch/TransportVersions.java‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎server/src/main/java/org/elasticsearch/index/IndexVersions.java‎
Lines changed: 2 additions & 0 deletions b/‎server/src/main/java/org/elasticsearch/index/IndexVersions.java‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎server/src/main/java/org/elasticsearch/index/mapper/MapperFeatures.java‎
Lines changed: 3 additions & 1 deletion b/‎server/src/main/java/org/elasticsearch/index/mapper/MapperFeatures.java‎
Lines changed: 3 additions & 1 deletion
@@ -0,0 +1,5 @@
+pr: 129089
+summary: Update `sparse_vector` field mapping to include default setting for token pruning
+area: Mapping
+type: enhancement
+issues: []
@@ -24,6 +24,33 @@ PUT my-index
 }
 ```
 
+## Token pruning
+```{applies_to}
+stack: preview 9.1
+```
+
+With any new indices created, token pruning will be turned on by default with appropriate defaults. You can control this behaviour using the optional `index_options` parameters for the field:
+
+```console
+PUT my-index
+{
+  "mappings": {
+    "properties": {
+      "text.tokens": {
+        "type": "sparse_vector",
+        "index_options": {
+          "prune": true,
+          "pruning_config": {
+            "tokens_freq_ratio_threshold": 5,
+            "tokens_weight_threshold": 0.4
+          }
+        }
+      }
+    }
+  }
+}
+```
+
 See [semantic search with ELSER](docs-content://solutions/search/semantic-search/semantic-search-elser-ingest-pipelines.md) for a complete example on adding documents to a `sparse_vector` mapped field using ELSER.
 
 ## Parameters for `sparse_vector` fields [sparse-vectors-params]
@@ -36,6 +63,38 @@ The following parameters are accepted by `sparse_vector` fields:
     * Exclude the field from [_source](/reference/elasticsearch/rest-apis/retrieve-selected-fields.md#source-filtering).
     * Use [synthetic `_source`](/reference/elasticsearch/mapping-reference/mapping-source-field.md#synthetic-source).
 
+index_options {applies_to}`stack: preview 9.1`
+:   (Optional, object) You can set index options for your  `sparse_vector` field to determine if you should prune tokens, and the parameter configurations for the token pruning. If pruning options are not set in your [`sparse_vector` query](/reference/query-languages/query-dsl/query-dsl-sparse-vector-query.md), Elasticsearch will use the default options configured for the field, if any.
+
+Parameters for `index_options` are:
+
+`prune` {applies_to}`stack: preview 9.1`
+:   (Optional, boolean) Whether to perform pruning, omitting the non-significant tokens from the query to improve query performance. If `prune` is true but the `pruning_config` is not specified, pruning will occur but default values will be used. Default: true.
+
+`pruning_config` {applies_to}`stack: preview 9.1`
+:   (Optional, object) Optional pruning configuration. If enabled, this will omit non-significant tokens from the query in order to improve query performance. This is only used if `prune` is set to `true`. If `prune` is set to `true` but `pruning_config` is not specified, default values will be used. If `prune` is set to false but `pruning_config` is specified, an exception will occur.
+
+    Parameters for `pruning_config` include:
+
+    `tokens_freq_ratio_threshold` {applies_to}`stack: preview 9.1`
+    :   (Optional, integer) Tokens whose frequency is more than `tokens_freq_ratio_threshold` times the average frequency of all tokens in the specified field are considered outliers and pruned. This value must between 1 and 100. Default: `5`.
+
+    `tokens_weight_threshold` {applies_to}`stack: preview 9.1`
+    :   (Optional, float) Tokens whose weight is less than `tokens_weight_threshold` are considered insignificant and pruned. This value must be between 0 and 1. Default: `0.4`.
+
+    ::::{note}
+    The default values for `tokens_freq_ratio_threshold` and `tokens_weight_threshold` were chosen based on tests using ELSERv2 that provided the most optimal results.
+    ::::
+
+When token pruning is applied, non-significant tokens will be pruned from the query.
+Non-significant tokens can be defined as tokens that meet both of the following criteria:
+* The token appears much more frequently than most tokens, indicating that it is a very common word and may not benefit the overall search results much.
+* The weight/score is so low that the token is likely not very relevant to the original term
+
+Both the token frequency threshold and weight threshold must show the token is non-significant in order for the token to be pruned.
+This ensures that:
+* The tokens that are kept are frequent enough and have significant scoring.
+* Very infrequent tokens that may not have as high of a score are removed.
 
 
 ## Multi-value sparse vectors [index-multi-value-sparse-vectors]
 
@@ -203,7 +203,7 @@ static TransportVersion def(int id) {
     public static final TransportVersion ML_INFERENCE_CUSTOM_SERVICE_INPUT_TYPE_8_19 = def(8_841_0_55);
     public static final TransportVersion RANDOM_SAMPLER_QUERY_BUILDER_8_19 = def(8_841_0_56);
     public static final TransportVersion ML_INFERENCE_SAGEMAKER_ELASTIC_8_19 = def(8_841_0_57);
-
+    public static final TransportVersion SPARSE_VECTOR_FIELD_PRUNING_OPTIONS_8_19 = def(8_841_0_58);
     public static final TransportVersion V_9_0_0 = def(9_000_0_09);
     public static final TransportVersion INITIAL_ELASTICSEARCH_9_0_1 = def(9_000_0_10);
     public static final TransportVersion INITIAL_ELASTICSEARCH_9_0_2 = def(9_000_0_11);
@@ -313,6 +313,7 @@ static TransportVersion def(int id) {
     public static final TransportVersion STREAMS_LOGS_SUPPORT = def(9_104_0_00);
     public static final TransportVersion ML_INFERENCE_CUSTOM_SERVICE_INPUT_TYPE = def(9_105_0_00);
     public static final TransportVersion ML_INFERENCE_SAGEMAKER_ELASTIC = def(9_106_0_00);
+    public static final TransportVersion SPARSE_VECTOR_FIELD_PRUNING_OPTIONS = def(9_107_0_00);
 
     /*
      * STOP! READ THIS FIRST! No, really,
 
@@ -144,6 +144,7 @@ private static Version parseUnchecked(String version) {
     public static final IndexVersion INDEX_INT_SORT_INT_TYPE_8_19 = def(8_532_0_00, Version.LUCENE_9_12_1);
     public static final IndexVersion MAPPER_TEXT_MATCH_ONLY_MULTI_FIELDS_DEFAULT_NOT_STORED_8_19 = def(8_533_0_00, Version.LUCENE_9_12_1);
     public static final IndexVersion UPGRADE_TO_LUCENE_9_12_2 = def(8_534_0_00, Version.LUCENE_9_12_2);
+    public static final IndexVersion SPARSE_VECTOR_PRUNING_INDEX_OPTIONS_SUPPORT_BACKPORT_8_X = def(8_535_0_00, Version.LUCENE_9_12_2);
     public static final IndexVersion UPGRADE_TO_LUCENE_10_0_0 = def(9_000_0_00, Version.LUCENE_10_0_0);
     public static final IndexVersion LOGSDB_DEFAULT_IGNORE_DYNAMIC_BEYOND_LIMIT = def(9_001_0_00, Version.LUCENE_10_0_0);
     public static final IndexVersion TIME_BASED_K_ORDERED_DOC_ID = def(9_002_0_00, Version.LUCENE_10_0_0);
@@ -175,6 +176,7 @@ private static Version parseUnchecked(String version) {
     public static final IndexVersion INDEX_INT_SORT_INT_TYPE = def(9_028_0_00, Version.LUCENE_10_2_1);
     public static final IndexVersion MAPPER_TEXT_MATCH_ONLY_MULTI_FIELDS_DEFAULT_NOT_STORED = def(9_029_0_00, Version.LUCENE_10_2_1);
     public static final IndexVersion UPGRADE_TO_LUCENE_10_2_2 = def(9_030_0_00, Version.LUCENE_10_2_2);
+    public static final IndexVersion SPARSE_VECTOR_PRUNING_INDEX_OPTIONS_SUPPORT = def(9_031_0_00, Version.LUCENE_10_2_2);
 
     /*
      * STOP! READ THIS FIRST! No, really,
 
@@ -17,6 +17,7 @@
 import static org.elasticsearch.index.mapper.vectors.DenseVectorFieldMapper.RESCORE_VECTOR_QUANTIZED_VECTOR_MAPPING;
 import static org.elasticsearch.index.mapper.vectors.DenseVectorFieldMapper.RESCORE_ZERO_VECTOR_QUANTIZED_VECTOR_MAPPING;
 import static org.elasticsearch.index.mapper.vectors.DenseVectorFieldMapper.USE_DEFAULT_OVERSAMPLE_VALUE_FOR_BBQ;
+import static org.elasticsearch.index.mapper.vectors.SparseVectorFieldMapper.SPARSE_VECTOR_INDEX_OPTIONS_FEATURE;
 
 /**
  * Spec for mapper-related features.
@@ -74,7 +75,8 @@ public Set<NodeFeature> getTestFeatures() {
             USE_DEFAULT_OVERSAMPLE_VALUE_FOR_BBQ,
             IVF_FORMAT_CLUSTER_FEATURE,
             IVF_NESTED_SUPPORT,
-            SEARCH_LOAD_PER_SHARD
+            SEARCH_LOAD_PER_SHARD,
+            SPARSE_VECTOR_INDEX_OPTIONS_FEATURE
         );
     }
 }