Commit 7efa697

feat: add /embed_sparse route (#191)
1 parent: 2b8ad5f

File tree

7 files changed: +724 −26 lines changed

README.md

Lines changed: 21 additions & 0 deletions
@@ -35,6 +35,7 @@ length of 512 tokens:
 - [Using a private or gated model](#using-a-private-or-gated-model)
 - [Using Re-rankers models](#using-re-rankers-models)
 - [Using Sequence Classification models](#using-sequence-classification-models)
+- [Using SPLADE pooling](#using-splade-pooling)
 - [Distributed Tracing](#distributed-tracing)
 - [gRPC](#grpc)
 - [Local Install](#local-install)
@@ -331,6 +332,26 @@ curl 127.0.0.1:8080/predict \
     -H 'Content-Type: application/json'
 ```
 
+### Using SPLADE pooling
+
+You can choose to activate SPLADE pooling for Bert and Distilbert MaskedLM architectures:
+
+```shell
+model=naver/efficient-splade-VI-BT-large-query
+volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
+
+docker run --gpus all -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:1.1 --model-id $model --pooling splade
+```
+
+Once you have deployed the model you can use the `/embed_sparse` endpoint to get the sparse embedding:
+
+```bash
+curl 127.0.0.1:8080/embed_sparse \
+    -X POST \
+    -d '{"inputs":"I like you."}' \
+    -H 'Content-Type: application/json'
+```
+
 ### Distributed Tracing
 
 `text-embeddings-inference` is instrumented with distributed tracing using OpenTelemetry. You can use this feature
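For callers outside the shell, the response follows the `EmbedSparseResponse` schema added to `docs/openapi.json` below: one array of `{index, value}` pairs per input. A minimal Rust sketch of calling the endpoint and deserializing that shape; the `reqwest`, `serde`, `serde_json`, and `tokio` dependencies are illustration-only assumptions, not part of this commit:

```rust
// Sketch of a client for /embed_sparse. Assumed Cargo deps (not in this
// commit): reqwest (json feature), serde (derive), serde_json, tokio (full).
use serde::Deserialize;

// Mirrors the `SparseValue` schema: a position and its float weight.
#[derive(Debug, Deserialize)]
struct SparseValue {
    index: u32,
    value: f32,
}

#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    // One Vec<SparseValue> per input string, per `EmbedSparseResponse`.
    let embeddings: Vec<Vec<SparseValue>> = reqwest::Client::new()
        .post("http://127.0.0.1:8080/embed_sparse")
        .json(&serde_json::json!({ "inputs": "I like you." }))
        .send()
        .await?
        .json()
        .await?;

    for sv in &embeddings[0] {
        println!("index {} -> weight {}", sv.index, sv.value);
    }
    Ok(())
}
```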

core/src/infer.rs

Lines changed: 48 additions & 0 deletions
@@ -144,6 +144,54 @@ impl Infer {
         Ok(response)
     }
 
+    #[instrument(skip(self, permit))]
+    pub async fn embed_sparse<I: Into<EncodingInput> + std::fmt::Debug>(
+        &self,
+        inputs: I,
+        truncate: bool,
+        permit: OwnedSemaphorePermit,
+    ) -> Result<PooledEmbeddingsInferResponse, TextEmbeddingsError> {
+        let start_time = Instant::now();
+
+        if !self.is_splade() {
+            metrics::increment_counter!("te_request_failure", "err" => "model_type");
+            let message = "Model is not an embedding model with SPLADE pooling".to_string();
+            tracing::error!("{message}");
+            return Err(TextEmbeddingsError::Backend(BackendError::Inference(
+                message,
+            )));
+        }
+
+        let results = self
+            .embed(inputs, truncate, true, &start_time, permit)
+            .await?;
+
+        let InferResult::PooledEmbedding(response) = results else {
+            panic!("unexpected enum variant")
+        };
+
+        // Timings
+        let total_time = start_time.elapsed();
+
+        // Metrics
+        metrics::increment_counter!("te_embed_success");
+        metrics::histogram!("te_embed_duration", total_time.as_secs_f64());
+        metrics::histogram!(
+            "te_embed_tokenization_duration",
+            response.metadata.tokenization.as_secs_f64()
+        );
+        metrics::histogram!(
+            "te_embed_queue_duration",
+            response.metadata.queue.as_secs_f64()
+        );
+        metrics::histogram!(
+            "te_embed_inference_duration",
+            response.metadata.inference.as_secs_f64()
+        );
+
+        Ok(response)
+    }
+
     #[instrument(skip(self, permit))]
     pub async fn embed_pooled<I: Into<EncodingInput> + std::fmt::Debug>(
         &self,
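The guard at the top of `embed_sparse` delegates to an `is_splade` helper that this diff does not include. A minimal sketch of what such a check could look like, assuming the backend exposes its model type and a SPLADE pooling variant; every name below is an assumption, not code from this commit:

```rust
// Illustrative sketch only: `is_splade` is not shown in this diff, so the
// `ModelType` / `Pool` shapes here are assumptions about the surrounding code.
impl Infer {
    pub fn is_splade(&self) -> bool {
        // Sparse embeddings are only defined for embedding models that were
        // launched with `--pooling splade`.
        matches!(self.backend.model_type, ModelType::Embedding(Pool::Splade))
    }
}
```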

docs/openapi.json

Lines changed: 256 additions & 0 deletions
@@ -100,6 +100,182 @@
         }
       }
     },
+    "/embed_all": {
+      "post": {
+        "tags": [
+          "Text Embeddings Inference"
+        ],
+        "summary": "Get all Embeddings without Pooling.",
+        "description": "Get all Embeddings without Pooling.\nReturns a 424 status code if the model is not an embedding model.",
+        "operationId": "embed_all",
+        "requestBody": {
+          "content": {
+            "application/json": {
+              "schema": {
+                "$ref": "#/components/schemas/EmbedAllRequest"
+              }
+            }
+          },
+          "required": true
+        },
+        "responses": {
+          "200": {
+            "description": "Embeddings",
+            "content": {
+              "application/json": {
+                "schema": {
+                  "$ref": "#/components/schemas/EmbedAllResponse"
+                }
+              }
+            }
+          },
+          "413": {
+            "description": "Batch size error",
+            "content": {
+              "application/json": {
+                "schema": {
+                  "$ref": "#/components/schemas/ErrorResponse"
+                },
+                "example": {
+                  "error": "Batch size error",
+                  "error_type": "validation"
+                }
+              }
+            }
+          },
+          "422": {
+            "description": "Tokenization error",
+            "content": {
+              "application/json": {
+                "schema": {
+                  "$ref": "#/components/schemas/ErrorResponse"
+                },
+                "example": {
+                  "error": "Tokenization error",
+                  "error_type": "tokenizer"
+                }
+              }
+            }
+          },
+          "424": {
+            "description": "Embedding Error",
+            "content": {
+              "application/json": {
+                "schema": {
+                  "$ref": "#/components/schemas/ErrorResponse"
+                },
+                "example": {
+                  "error": "Inference failed",
+                  "error_type": "backend"
+                }
+              }
+            }
+          },
+          "429": {
+            "description": "Model is overloaded",
+            "content": {
+              "application/json": {
+                "schema": {
+                  "$ref": "#/components/schemas/ErrorResponse"
+                },
+                "example": {
+                  "error": "Model is overloaded",
+                  "error_type": "overloaded"
+                }
+              }
+            }
+          }
+        }
+      }
+    },
+    "/embed_sparse": {
+      "post": {
+        "tags": [
+          "Text Embeddings Inference"
+        ],
+        "summary": "Get Sparse Embeddings. Returns a 424 status code if the model is not an embedding model with SPLADE pooling.",
+        "description": "Get Sparse Embeddings. Returns a 424 status code if the model is not an embedding model with SPLADE pooling.",
+        "operationId": "embed_sparse",
+        "requestBody": {
+          "content": {
+            "application/json": {
+              "schema": {
+                "$ref": "#/components/schemas/EmbedSparseRequest"
+              }
+            }
+          },
+          "required": true
+        },
+        "responses": {
+          "200": {
+            "description": "Embeddings",
+            "content": {
+              "application/json": {
+                "schema": {
+                  "$ref": "#/components/schemas/EmbedSparseResponse"
+                }
+              }
+            }
+          },
+          "413": {
+            "description": "Batch size error",
+            "content": {
+              "application/json": {
+                "schema": {
+                  "$ref": "#/components/schemas/ErrorResponse"
+                },
+                "example": {
+                  "error": "Batch size error",
+                  "error_type": "validation"
+                }
+              }
+            }
+          },
+          "422": {
+            "description": "Tokenization error",
+            "content": {
+              "application/json": {
+                "schema": {
+                  "$ref": "#/components/schemas/ErrorResponse"
+                },
+                "example": {
+                  "error": "Tokenization error",
+                  "error_type": "tokenizer"
+                }
+              }
+            }
+          },
+          "424": {
+            "description": "Embedding Error",
+            "content": {
+              "application/json": {
+                "schema": {
+                  "$ref": "#/components/schemas/ErrorResponse"
+                },
+                "example": {
+                  "error": "Inference failed",
+                  "error_type": "backend"
+                }
+              }
+            }
+          },
+          "429": {
+            "description": "Model is overloaded",
+            "content": {
+              "application/json": {
+                "schema": {
+                  "$ref": "#/components/schemas/ErrorResponse"
+                },
+                "example": {
+                  "error": "Model is overloaded",
+                  "error_type": "overloaded"
+                }
+              }
+            }
+          }
+        }
+      }
+    },
     "/embeddings": {
       "post": {
         "tags": [
@@ -514,6 +690,44 @@
           }
         }
       },
+      "EmbedAllRequest": {
+        "type": "object",
+        "required": [
+          "inputs"
+        ],
+        "properties": {
+          "inputs": {
+            "$ref": "#/components/schemas/Input"
+          },
+          "truncate": {
+            "type": "boolean",
+            "default": "false",
+            "example": "false"
+          }
+        }
+      },
+      "EmbedAllResponse": {
+        "type": "array",
+        "items": {
+          "type": "array",
+          "items": {
+            "type": "array",
+            "items": {
+              "type": "number",
+              "format": "float"
+            }
+          }
+        },
+        "example": [
+          [
+            [
+              0.0,
+              1.0,
+              2.0
+            ]
+          ]
+        ]
+      },
       "EmbedRequest": {
         "type": "object",
         "required": [
@@ -552,6 +766,31 @@
          ]
        ]
      },
+      "EmbedSparseRequest": {
+        "type": "object",
+        "required": [
+          "inputs"
+        ],
+        "properties": {
+          "inputs": {
+            "$ref": "#/components/schemas/Input"
+          },
+          "truncate": {
+            "type": "boolean",
+            "default": "false",
+            "example": "false"
+          }
+        }
+      },
+      "EmbedSparseResponse": {
+        "type": "array",
+        "items": {
+          "type": "array",
+          "items": {
+            "$ref": "#/components/schemas/SparseValue"
+          }
+        }
+      },
       "EmbeddingModel": {
         "type": "object",
         "required": [
@@ -1047,6 +1286,23 @@
           }
         }
       },
+      "SparseValue": {
+        "type": "object",
+        "required": [
+          "index",
+          "value"
+        ],
+        "properties": {
+          "index": {
+            "type": "integer",
+            "minimum": 0
+          },
+          "value": {
+            "type": "number",
+            "format": "float"
+          }
+        }
+      },
       "TokenizeRequest": {
         "type": "object",
         "required": [
