
Commit e3e34b8

Merge branch 'main' into add-bfloat16-support

2 parents e506c93 + cb9de7a

File tree

10 files changed: +154 -97 lines changed


README.md

Lines changed: 10 additions & 5 deletions

@@ -137,14 +137,15 @@ To see all options to serve your models:
 $ text-embeddings-router --help
 Text Embedding Webserver
 
-Usage: text-embeddings-router [OPTIONS]
+Usage: text-embeddings-router [OPTIONS] --model-id <MODEL_ID>
 
 Options:
       --model-id <MODEL_ID>
-          The name of the model to load. Can be a MODEL_ID as listed on <https://hf.co/models> like `BAAI/bge-large-en-v1.5`. Or it can be a local directory containing the necessary files as saved by `save_pretrained(...)` methods of transformers
+          The Hugging Face model ID, can be any model listed on <https://huggingface.co/models> with the `text-embeddings-inference` tag (meaning it's compatible with Text Embeddings Inference).
+
+          Alternatively, the specified ID can also be a path to a local directory containing the necessary model files saved by the `save_pretrained(...)` methods of either Transformers or Sentence Transformers.
 
           [env: MODEL_ID=]
-          [default: BAAI/bge-large-en-v1.5]
 
       --revision <REVISION>
           The actual revision of the model if you're referring to a model on the hub. You can use a specific commit id or a branch like `refs/pr/2`
@@ -162,6 +163,11 @@ Options:
           [env: DTYPE=]
           [possible values: float16, float32]
 
+      --served-model-name <SERVED_MODEL_NAME>
+          The name of the model that is being served. If not specified, defaults to `--model-id`. It is only used for the OpenAI-compatible endpoints via HTTP
+
+          [env: SERVED_MODEL_NAME=]
+
       --pooling <POOLING>
           Optionally control the pooling method for embedding models.
 
@@ -238,10 +244,9 @@ Options:
 
           Some embedding models require an extra `Dense` module which contains a single Linear layer and an activation function. By default, those `Dense` modules are stored under the `2_Dense` directory, but there might be cases where different `Dense` modules are provided, to convert the pooled embeddings into different dimensions, available as `2_Dense_<dims>` e.g. https://huggingface.co/NovaSearch/stella_en_400M_v5.
 
-          Note that this argument is optional, only required to be set if the path to the `Dense` module is other than `2_Dense`. And it also applies when leveraging the `candle` backend.
+          Note that this argument is optional, only required to be set if there is no `modules.json` file or when you want to override a single Dense module path, only when running with the `candle` backend.
 
           [env: DENSE_PATH=]
-          [default: 2_Dense]
 
       --hf-token <HF_TOKEN>
          Your Hugging Face Hub token
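
As a usage illustration for `--served-model-name` (a sketch, not part of this commit): assuming a router started with `--served-model-name my-embedder` listening on localhost:8080, and the `reqwest` (blocking + json features) and `serde_json` crates available, a client call to the OpenAI-compatible endpoint could look like this:

```rust
use serde_json::json;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let client = reqwest::blocking::Client::new();
    // `model` is optional; if present, it should match the served model name,
    // otherwise the router only logs a warning (see router/src/http/server.rs below).
    let response: serde_json::Value = client
        .post("http://localhost:8080/v1/embeddings")
        .json(&json!({ "model": "my-embedder", "input": "What is Deep Learning?" }))
        .send()?
        .json()?;
    // The response's `model` field echoes the served model name.
    println!("model = {}", response["model"]);
    Ok(())
}
```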

backends/candle/src/layers/linear.rs

Lines changed: 4 additions & 1 deletion

@@ -5,7 +5,10 @@ use serde::Deserialize;
 #[derive(Debug, Deserialize, PartialEq, Clone)]
 #[serde(rename_all = "lowercase")]
 pub enum HiddenAct {
-    #[serde(alias = "gelu_pytorch_tanh")]
+    // NOTE: `GeluErf` is excluded due to incompatibility with cuBLASLt, as only the GeLU + tanh
+    // approximation is implemented for efficiency, so GeLU is standardized to the tanh
+    // approximation, with slight numerical deviation from GeLU erf (negligible for inference quality)
+    #[serde(alias = "gelu_new", alias = "gelu_pytorch_tanh")]
     Gelu,
     Relu,
     Silu,
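
To make the "negligible deviation" claim in the NOTE concrete, here is a standalone sketch (not part of this commit) comparing exact GeLU (erf) with the tanh approximation that the backend standardizes on; it assumes the external `libm` crate for `erf`, which this repository does not use:

```rust
// Exact GeLU: 0.5 * x * (1 + erf(x / sqrt(2))), via `libm::erf` (assumed dependency).
fn gelu_erf(x: f64) -> f64 {
    0.5 * x * (1.0 + libm::erf(x / std::f64::consts::SQRT_2))
}

// Tanh approximation: 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3))).
fn gelu_tanh(x: f64) -> f64 {
    let c = (2.0 / std::f64::consts::PI).sqrt();
    0.5 * x * (1.0 + (c * (x + 0.044715 * x.powi(3))).tanh())
}

fn main() {
    // The two curves differ by at most ~1e-3, which is why the deviation is
    // treated as negligible for inference quality.
    for x in [-3.0_f64, -1.0, -0.1, 0.0, 0.5, 2.0] {
        let (e, t) = (gelu_erf(x), gelu_tanh(x));
        println!("x = {x:>4}: erf = {e:+.6}, tanh = {t:+.6}, diff = {:+.1e}", e - t);
    }
}
```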

backends/candle/src/lib.rs

Lines changed: 13 additions & 1 deletion

@@ -180,7 +180,19 @@ impl CandleBackend {
         let config: String = std::fs::read_to_string(model_path.join("config.json"))
             .context("Unable to read config file")
             .map_err(|err| BackendError::Start(format!("{err:?}")))?;
-        let config: Config = serde_json::from_str(&config)
+
+        let config_value: serde_json::Value = serde_json::from_str(&config)
+            .context("Unable to parse config.json")
+            .map_err(|err| BackendError::Start(format!("{err:?}")))?;
+
+        if let Some(hidden_act) = config_value.get("hidden_act").and_then(|v| v.as_str()) {
+            if hidden_act == "gelu" {
+                // NOTE: https://github.com/huggingface/text-embeddings-inference/pull/753
+                tracing::warn!("The `config.json` contains `hidden_act=gelu`, so the GeLU + tanh approximation will be used instead of exact GeLU (aka GeLU erf), which might lead to subtle differences with Transformers or Sentence Transformers outputs, which use exact GeLU when `hidden_act=gelu` unless specified otherwise. GeLU + tanh is more efficient and more consistent across devices (e.g., cuBLASLt comes with fused GeLU + tanh), and will have negligible impact on inference quality.");
+            }
+        }
+
+        let config: Config = serde_json::from_value(config_value)
             .context("Model is not supported")
             .map_err(|err| BackendError::Start(format!("{err:?}")))?;

backends/candle/tests/snapshots/test_bert__emotions_batch.snap

Lines changed: 82 additions & 82 deletions

@@ -2,87 +2,87 @@
 source: backends/candle/tests/test_bert.rs
 expression: predictions_batch
 ---
-- - -6.548559
-  - -6.302024
-  - -4.8671727
-  - -3.9600255
-  - -4.6329865
-  - -6.2816987
-  - -6.069644
-  - -5.7742686
-  - -6.9259467
-  - -6.1909447
-  - -5.67395
+- - -6.5485673
+  - -6.3020196
+  - -4.86717
+  - -3.9600184
+  - -4.632993
+  - -6.2817054
+  - -6.069636
+  - -5.7742705
+  - -6.925953
+  - -6.190939
+  - -5.6739373
   - -6.1698227
-  - -7.513461
-  - -6.865867
-  - -7.186479
-  - -7.128109
-  - -8.210709
-  - -7.0171394
-  - -7.1321163
-  - -8.533409
-  - -6.2294865
-  - -8.742306
-  - -5.7792044
-  - -8.657227
-  - -8.258305
-  - -6.64832
-  - -7.4060283
-  - 3.046496
-- - -5.8167515
-  - -6.6119466
-  - -5.2771955
-  - -2.6306503
-  - -4.6419163
-  - -5.579778
-  - -5.797174
-  - -6.0305815
-  - -5.8720746
-  - 0.45377323
-  - -3.0235887
-  - -5.3944407
-  - -5.186683
-  - -6.2649117
-  - -6.1962767
-  - -6.97937
-  - -5.5674877
-  - -5.521044
-  - -5.8899207
-  - -4.8699703
-  - -5.6259933
-  - -7.6109924
-  - -4.3881936
-  - -6.039008
-  - -4.934696
-  - -0.6715916
-  - -6.399376
-  - -2.4499295
-- - -6.548559
-  - -6.302024
-  - -4.8671727
-  - -3.9600255
-  - -4.6329865
-  - -6.2816987
-  - -6.069644
-  - -5.7742686
-  - -6.9259467
-  - -6.1909447
-  - -5.67395
+  - -7.5134573
+  - -6.8658743
+  - -7.1864815
+  - -7.128115
+  - -8.2107115
+  - -7.017146
+  - -7.132131
+  - -8.533407
+  - -6.229486
+  - -8.742311
+  - -5.7792006
+  - -8.65723
+  - -8.258308
+  - -6.648321
+  - -7.406026
+  - 3.0464942
+- - -5.816747
+  - -6.611947
+  - -5.2771983
+  - -2.6306484
+  - -4.6419153
+  - -5.5797825
+  - -5.7971735
+  - -6.030578
+  - -5.872076
+  - 0.45378062
+  - -3.0235896
+  - -5.3944383
+  - -5.18668
+  - -6.264913
+  - -6.196284
+  - -6.9793677
+  - -5.567489
+  - -5.5210495
+  - -5.889915
+  - -4.8699794
+  - -5.625993
+  - -7.6109934
+  - -4.388194
+  - -6.0390115
+  - -4.934693
+  - -0.6715966
+  - -6.3993735
+  - -2.4499245
+- - -6.5485673
+  - -6.3020196
+  - -4.86717
+  - -3.9600184
+  - -4.632993
+  - -6.2817054
+  - -6.069636
+  - -5.7742705
+  - -6.925953
+  - -6.190939
+  - -5.6739373
   - -6.1698227
-  - -7.513461
-  - -6.865867
-  - -7.186479
-  - -7.128109
-  - -8.210709
-  - -7.0171394
-  - -7.1321163
-  - -8.533409
-  - -6.2294865
-  - -8.742306
-  - -5.7792044
-  - -8.657227
-  - -8.258305
-  - -6.64832
-  - -7.4060283
-  - 3.046496
+  - -7.5134573
+  - -6.8658743
+  - -7.1864815
+  - -7.128115
+  - -8.2107115
+  - -7.017146
+  - -7.132131
+  - -8.533407
+  - -6.229486
+  - -8.742311
+  - -5.7792006
+  - -8.65723
+  - -8.258308
+  - -6.648321
+  - -7.406026
+  - 3.0464942

docs/openapi.json

Lines changed: 5 additions & 0 deletions

@@ -1215,6 +1215,7 @@
       "required": [
         "model_id",
         "model_dtype",
+        "served_model_name",
         "model_type",
         "max_concurrent_requests",
         "max_input_length",
@@ -1278,6 +1279,10 @@
       "model_type": {
         "$ref": "#/components/schemas/ModelType"
       },
+      "served_model_name": {
+        "type": "string",
+        "example": "thenlper/gte-base"
+      },
       "sha": {
         "type": "string",
         "example": "null",

docs/source/en/cli_arguments.md

Lines changed: 10 additions & 5 deletions

@@ -22,14 +22,15 @@ To see all options to serve your models, run the following:
 $ text-embeddings-router --help
 Text Embedding Webserver
 
-Usage: text-embeddings-router [OPTIONS]
+Usage: text-embeddings-router [OPTIONS] --model-id <MODEL_ID>
 
 Options:
       --model-id <MODEL_ID>
-          The name of the model to load. Can be a MODEL_ID as listed on <https://hf.co/models> like `BAAI/bge-large-en-v1.5`. Or it can be a local directory containing the necessary files as saved by `save_pretrained(...)` methods of transformers
+          The Hugging Face model ID, can be any model listed on <https://huggingface.co/models> with the `text-embeddings-inference` tag (meaning it's compatible with Text Embeddings Inference).
+
+          Alternatively, the specified ID can also be a path to a local directory containing the necessary model files saved by the `save_pretrained(...)` methods of either Transformers or Sentence Transformers.
 
           [env: MODEL_ID=]
-          [default: BAAI/bge-large-en-v1.5]
 
       --revision <REVISION>
           The actual revision of the model if you're referring to a model on the hub. You can use a specific commit id or a branch like `refs/pr/2`
@@ -47,6 +48,11 @@ Options:
           [env: DTYPE=]
           [possible values: float16, float32]
 
+      --served-model-name <SERVED_MODEL_NAME>
+          The name of the model that is being served. If not specified, defaults to `--model-id`. It is only used for the OpenAI-compatible endpoints via HTTP
+
+          [env: SERVED_MODEL_NAME=]
+
       --pooling <POOLING>
           Optionally control the pooling method for embedding models.
 
@@ -123,10 +129,9 @@ Options:
 
           Some embedding models require an extra `Dense` module which contains a single Linear layer and an activation function. By default, those `Dense` modules are stored under the `2_Dense` directory, but there might be cases where different `Dense` modules are provided, to convert the pooled embeddings into different dimensions, available as `2_Dense_<dims>` e.g. https://huggingface.co/NovaSearch/stella_en_400M_v5.
 
-          Note that this argument is optional, only required to be set if the path to the `Dense` module is other than `2_Dense`. And it also applies when leveraging the `candle` backend.
+          Note that this argument is optional, only required to be set if there is no `modules.json` file or when you want to override a single Dense module path, only when running with the `candle` backend.
 
           [env: DENSE_PATH=]
-          [default: 2_Dense]
 
       --hf-token <HF_TOKEN>
          Your Hugging Face Hub token
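
To make the `2_Dense` vs `2_Dense_<dims>` layout described above concrete, here is a toy sketch of the directory selection; it is an illustration under assumed semantics, not TEI's actual resolution logic:

```rust
use std::path::PathBuf;

// Hypothetical helper: default to `2_Dense`, or pick `2_Dense_<dims>` when a
// specific output dimension is requested (as in NovaSearch/stella_en_400M_v5).
fn dense_dir(model_root: &str, dims: Option<usize>) -> PathBuf {
    match dims {
        Some(d) => PathBuf::from(model_root).join(format!("2_Dense_{d}")),
        None => PathBuf::from(model_root).join("2_Dense"),
    }
}

fn main() {
    assert_eq!(dense_dir("/models/stella", None), PathBuf::from("/models/stella/2_Dense"));
    assert_eq!(dense_dir("/models/stella", Some(1024)), PathBuf::from("/models/stella/2_Dense_1024"));
    println!("ok");
}
```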

router/src/http/server.rs

Lines changed: 13 additions & 1 deletion

@@ -1153,6 +1153,18 @@ async fn openai_embed(
         span.set_parent(context);
     }
 
+    // NOTE: Validation of `model` won't fail for the time being, given that Text Embeddings
+    // Inference can only serve a single model at a time, so the `model` parameter is not needed
+    // to differentiate one model from another; but we at least raise a warning.
+    if let Some(requested_model) = &req.model {
+        if requested_model != &info.served_model_name {
+            tracing::warn!(
+                "The provided `model={}` has not been found, the `model` parameter should be provided either empty or with `model={}` instead.",
+                requested_model, info.served_model_name
+            );
+        }
+    }
+
     let start_time = Instant::now();
 
     let truncate = info.auto_truncate;
@@ -1308,7 +1320,7 @@ async fn openai_embed(
     let response = OpenAICompatResponse {
         object: "list",
         data: embeddings,
-        model: info.model_id.clone(),
+        model: info.served_model_name.clone(),
         usage: OpenAICompatUsage {
             prompt_tokens: compute_tokens,
             total_tokens: compute_tokens,
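
For clarity, the soft validation above in isolation: a mismatched `model` only produces a warning and never fails the request. A minimal sketch with a made-up harness (plain `eprintln!` standing in for `tracing::warn!`):

```rust
// Mirrors the check in `openai_embed`: warn on mismatch, never reject.
fn check_requested_model(requested: Option<&str>, served_model_name: &str) {
    if let Some(requested_model) = requested {
        if requested_model != served_model_name {
            eprintln!(
                "warning: the provided `model={requested_model}` has not been found, \
                 provide `model={served_model_name}` or omit the parameter instead."
            );
        }
    }
}

fn main() {
    check_requested_model(None, "my-embedder"); // no warning
    check_requested_model(Some("my-embedder"), "my-embedder"); // no warning
    check_requested_model(Some("gpt-4"), "my-embedder"); // warns
}
```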

router/src/lib.rs

Lines changed: 4 additions & 0 deletions

@@ -47,6 +47,7 @@ pub async fn run(
     revision: Option<String>,
     tokenization_workers: Option<usize>,
     dtype: Option<DType>,
+    served_model_name: String,
     pooling: Option<text_embeddings_backend::Pool>,
     max_concurrent_requests: usize,
     max_batch_tokens: usize,
@@ -332,6 +333,7 @@ pub async fn run(
         model_id,
         model_sha: revision,
         model_dtype: dtype.to_string(),
+        served_model_name,
         model_type,
         max_concurrent_requests,
         max_input_length,
@@ -550,6 +552,8 @@ pub struct Info {
     pub model_sha: Option<String>,
     #[cfg_attr(feature = "http", schema(example = "float16"))]
     pub model_dtype: String,
+    #[cfg_attr(feature = "http", schema(example = "thenlper/gte-base"))]
+    pub served_model_name: String,
     pub model_type: ModelType,
     /// Router Parameters
     #[cfg_attr(feature = "http", schema(example = "128"))]

router/src/main.rs

Lines changed: 11 additions & 1 deletion

@@ -14,7 +14,7 @@ static GLOBAL: mimalloc::MiMalloc = mimalloc::MiMalloc;
 struct Args {
     /// The Hugging Face model ID, can be any model listed on <https://huggingface.co/models> with
     /// the `text-embeddings-inference` tag (meaning it's compatible with Text Embeddings
-    /// Inference)
+    /// Inference).
     ///
     /// Alternatively, the specified ID can also be a path to a local directory containing the
     /// necessary model files saved by the `save_pretrained(...)` methods of either Transformers or
@@ -40,6 +40,11 @@ struct Args {
     #[clap(long, env, value_enum)]
     dtype: Option<DType>,
 
+    /// The name of the model that is being served. If not specified, defaults to `--model-id`. It
+    /// is only used for the OpenAI-compatible endpoints via HTTP.
+    #[clap(long, env)]
+    served_model_name: Option<String>,
+
     /// Optionally control the pooling method for embedding models.
     ///
     /// If `pooling` is not set, the pooling configuration will be parsed from the
@@ -227,11 +232,16 @@ async fn main() -> Result<()> {
     }
     let token = args.hf_token.or(args.hf_api_token);
 
+    let served_model_name = args
+        .served_model_name
+        .unwrap_or_else(|| args.model_id.clone());
+
     text_embeddings_router::run(
         args.model_id,
         args.revision,
         args.tokenization_workers,
         args.dtype,
+        served_model_name,
         args.pooling,
         args.max_concurrent_requests,
         args.max_batch_tokens,
