
Commit 5699247

Update version to 1.9.0 (#830)

1 parent c78895c

File tree

9 files changed: +52 −53 lines changed

Cargo.lock

Lines changed: 8 additions & 8 deletions
(Generated file; diff not rendered.)

Cargo.toml

Lines changed: 1 addition & 1 deletion
@@ -22,7 +22,7 @@ default-members = [
 resolver = "2"

 [workspace.package]
-version = "1.8.3"
+version = "1.9.0"
 edition = "2021"
 authors = ["Olivier Dehaene", "Nicolas Patry", "Alvaro Bartolome"]
 homepage = "https://github.com/huggingface/text-embeddings-inference"

README.md

Lines changed: 22 additions & 23 deletions
@@ -116,7 +116,7 @@ Below are some examples of the currently supported models:
 model=Qwen/Qwen3-Embedding-0.6B
 volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run

-docker run --gpus all -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:1.8 --model-id $model
+docker run --gpus all -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:cuda-1.9 --model-id $model
 ```

 And then you can make requests like
@@ -130,7 +130,7 @@ curl 127.0.0.1:8080/embed \

 **Note:** To use GPUs, you need to install
 the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html).
-NVIDIA drivers on your machine need to be compatible with CUDA version 12.6 or higher.
+NVIDIA drivers on your machine need to be compatible with CUDA version 12.2 or higher.

 To see all options to serve your models:
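The hunk header above shows only the first line of the `/embed` request; as a hedged reconstruction from TEI's documentation, the full call looks like:

```shell
curl 127.0.0.1:8080/embed \
    -X POST \
    -d '{"inputs":"What is Deep Learning?"}' \
    -H 'Content-Type: application/json'
```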
@@ -214,7 +214,7 @@ Options:
           [default: 32]

       --auto-truncate
-          Automatically truncate inputs that are longer than the maximum supported size
+          Control automatic truncation of inputs that exceed the model's maximum supported size. Defaults to `true` (truncation enabled). Set to `false` to disable truncation; when disabled and the model's maximum input length exceeds `--max-batch-tokens`, the server will refuse to start with an error instead of silently truncating sequences.

           Unused for gRPC servers
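Given the clarified default, a deployment that prefers hard errors over silent truncation can pass the flag explicitly. A minimal sketch, assuming the boolean-valued syntax the new help text implies:

```shell
model=Qwen/Qwen3-Embedding-0.6B
volume=$PWD/data

# Over-length inputs now fail with an error instead of being truncated
docker run --gpus all -p 8080:80 -v $volume:/data --pull always \
  ghcr.io/huggingface/text-embeddings-inference:cuda-1.9 \
  --model-id $model --auto-truncate false
```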
@@ -335,17 +335,17 @@ Options:

 Text Embeddings Inference ships with multiple Docker images that you can use to target a specific backend:

-| Architecture                            | Image                                                                          |
-|-----------------------------------------|--------------------------------------------------------------------------------|
-| CPU                                     | ghcr.io/huggingface/text-embeddings-inference:cpu-1.8                          |
-| Volta                                   | NOT SUPPORTED                                                                  |
-| Turing (T4, RTX 2000 series, ...)       | ghcr.io/huggingface/text-embeddings-inference:turing-1.8 (experimental)        |
-| Ampere 8.0 (A100, A30)                  | ghcr.io/huggingface/text-embeddings-inference:1.8                              |
-| Ampere 8.6 (A10, A40, ...)              | ghcr.io/huggingface/text-embeddings-inference:86-1.8                           |
-| Ada Lovelace (RTX 4000 series, ...)     | ghcr.io/huggingface/text-embeddings-inference:89-1.8                           |
-| Hopper (H100)                           | ghcr.io/huggingface/text-embeddings-inference:hopper-1.8                       |
-| Blackwell 10.0 (B200, GB200, ...)       | ghcr.io/huggingface/text-embeddings-inference:100-sha-ac69b50 (experimental)   |
-| Blackwell 12.0 (GeForce RTX 50X0, ...)  | ghcr.io/huggingface/text-embeddings-inference:120-sha-ac69b50 (experimental)   |
+| Architecture                            | Image                                                                     |
+|-----------------------------------------|---------------------------------------------------------------------------|
+| CPU                                     | ghcr.io/huggingface/text-embeddings-inference:cpu-1.9                     |
+| Volta                                   | NOT SUPPORTED                                                             |
+| Turing (T4, RTX 2000 series, ...)       | ghcr.io/huggingface/text-embeddings-inference:turing-1.9 (experimental)   |
+| Ampere 8.0 (A100, A30)                  | ghcr.io/huggingface/text-embeddings-inference:1.9                         |
+| Ampere 8.6 (A10, A40, ...)              | ghcr.io/huggingface/text-embeddings-inference:86-1.9                      |
+| Ada Lovelace (RTX 4000 series, ...)     | ghcr.io/huggingface/text-embeddings-inference:89-1.9                      |
+| Hopper (H100)                           | ghcr.io/huggingface/text-embeddings-inference:hopper-1.9                  |
+| Blackwell 10.0 (B200, GB200, ...)       | ghcr.io/huggingface/text-embeddings-inference:100-1.9 (experimental)      |
+| Blackwell 12.0 (GeForce RTX 50X0, ...)  | ghcr.io/huggingface/text-embeddings-inference:120-1.9 (experimental)      |

 **Warning**: Flash Attention is turned off by default for the Turing image as it suffers from precision issues.
 You can turn Flash Attention v1 ON by using the `USE_FLASH_ATTENTION=True` environment variable.
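To match a GPU to a row of this table before pulling, you can query its compute capability directly; the `compute_cap` field is assumed available in your driver's `nvidia-smi` (it is present in recent releases):

```shell
# Prints e.g. "NVIDIA A100-SXM4-80GB, 8.0" -> the Ampere 8.0 image
nvidia-smi --query-gpu=name,compute_cap --format=csv,noheader
```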
@@ -374,7 +374,7 @@ model=<your private model>
 volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
 token=<your CLI READ token>

-docker run --gpus all -e HF_TOKEN=$token -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:1.8 --model-id $model
+docker run --gpus all -e HF_TOKEN=$token -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:cuda-1.9 --model-id $model
 ```

 ### Air gapped deployment
@@ -397,7 +397,7 @@ git clone https://huggingface.co/Qwen/Qwen3-Embedding-0.6B
 volume=$PWD

 # Mount the models directory inside the container with a volume and set the model ID
-docker run --gpus all -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:1.8 --model-id /data/Qwen3-Embedding-0.6B
+docker run --gpus all -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:cuda-1.9 --model-id /data/Qwen3-Embedding-0.6B
 ```

 ### Using Re-rankers models
@@ -414,7 +414,7 @@ downstream performance.
 model=BAAI/bge-reranker-large
 volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run

-docker run --gpus all -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:1.8 --model-id $model
+docker run --gpus all -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:cuda-1.9 --model-id $model
 ```

 And then you can rank the similarity between a query and a list of texts with:
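The rerank request referenced by that context line falls outside the hunk; a hedged sketch of such a call, with the payload shape taken from TEI's documented `/rerank` endpoint:

```shell
curl 127.0.0.1:8080/rerank \
    -X POST \
    -d '{"query": "What is Deep Learning?", "texts": ["Deep Learning is not...", "Deep learning is..."]}' \
    -H 'Content-Type: application/json'
```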
@@ -434,7 +434,7 @@ You can also use classic Sequence Classification models like `SamLowe/roberta-ba
 model=SamLowe/roberta-base-go_emotions
 volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run

-docker run --gpus all -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:1.8 --model-id $model
+docker run --gpus all -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:cuda-1.9 --model-id $model
 ```

 Once you have deployed the model you can use the `predict` endpoint to get the emotions most associated with an input:
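As a hedged illustration of that `predict` call (request shape assumed from TEI's documented API):

```shell
curl 127.0.0.1:8080/predict \
    -X POST \
    -d '{"inputs":"I like you."}' \
    -H 'Content-Type: application/json'
```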
@@ -454,7 +454,7 @@ You can choose to activate SPLADE pooling for Bert and Distilbert MaskedLM archi
 model=naver/efficient-splade-VI-BT-large-query
 volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run

-docker run --gpus all -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:1.8 --model-id $model --pooling splade
+docker run --gpus all -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:cuda-1.9 --model-id $model --pooling splade
 ```

 Once you have deployed the model you can use the `/embed_sparse` endpoint to get the sparse embedding:
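And a hedged sketch of the `/embed_sparse` request that follows in the README (same request shape as `/embed`, assumed unchanged in 1.9):

```shell
curl 127.0.0.1:8080/embed_sparse \
    -X POST \
    -d '{"inputs":"I like you."}' \
    -H 'Content-Type: application/json'
```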
@@ -483,7 +483,7 @@ You can use the gRPC API by adding the `-grpc` tag to any TEI Docker image. For
 model=Qwen/Qwen3-Embedding-0.6B
 volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run

-docker run --gpus all -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:1.8-grpc --model-id $model
+docker run --gpus all -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:cuda-1.9-grpc --model-id $model
 ```

 ```shell
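The shell block that opens at the end of this hunk is truncated in the diff; in the README it demonstrates a gRPC request. A hedged sketch of such a call with grpcurl (service and method names assumed from TEI's proto definitions):

```shell
grpcurl -d '{"inputs": "What is Deep Learning"}' \
    -plaintext 0.0.0.0:8080 tei.v1.Embed/Embed
```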
@@ -532,7 +532,7 @@ sudo apt-get install libssl-dev gcc -y
 GPUs with CUDA compute capabilities < 7.5 are not supported (V100, Titan V, GTX 1000 series, ...).

 Make sure you have CUDA and the NVIDIA drivers installed. NVIDIA drivers on your device need to be compatible with CUDA
-version 12.6 or higher. You also need to add the NVIDIA binaries to your path:
+version 12.2 or higher. You also need to add the NVIDIA binaries to your path:

 ```shell
 export PATH=$PATH:/usr/local/cuda/bin
@@ -565,8 +565,7 @@ docker build -f Dockerfile .
 ```

 To build the CUDA containers, you need to know the compute cap of the GPU you will be using
-at runtime. If the compute capability is < 10.0 i.e., CUDA architecture is any of
-Turing, Ampere, Ada Lovelace, or Hopper; then run the following:
+at runtime, to build the image accordingly:

 ```shell
 # Get submodule dependencies
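The build block opened above is also truncated; in the repository the compute cap is passed as a Docker build argument. A minimal sketch, assuming the `Dockerfile-cuda` and `CUDA_COMPUTE_CAP` build arg from the TEI repository are unchanged in 1.9:

```shell
# Example: build for Ampere 8.0 (A100/A30); use 75, 86, 89, or 90 for other architectures
runtime_compute_cap=80
docker build . -f Dockerfile-cuda --build-arg CUDA_COMPUTE_CAP=$runtime_compute_cap
```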

docs/openapi.json

Lines changed: 1 addition & 1 deletion
@@ -10,7 +10,7 @@
       "name": "Apache 2.0",
       "url": "https://www.apache.org/licenses/LICENSE-2.0"
     },
-    "version": "1.8.3"
+    "version": "1.9.0"
   },
   "paths": {
     "/decode": {
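Once a 1.9 container is running, the bumped version should also be reported by the server itself; a quick hedged check against TEI's `/info` endpoint:

```shell
# Should agree with the "version" field in openapi.json above
curl -s 127.0.0.1:8080/info | grep '"version"'
```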

docs/source/en/cli_arguments.md

Lines changed: 1 addition & 1 deletion
@@ -98,7 +98,7 @@ Options:
           [default: 32]

       --auto-truncate
-          Automatically truncate inputs that are longer than the maximum supported size
+          Control automatic truncation of inputs that exceed the model's maximum supported size. Defaults to `true` (truncation enabled). Set to `false` to disable truncation; when disabled and the model's maximum input length exceeds `--max-batch-tokens`, the server will refuse to start with an error instead of silently truncating sequences.

           Unused for gRPC servers
docs/source/en/local_gpu.md

Lines changed: 1 addition & 1 deletion
@@ -21,7 +21,7 @@ To make sure that your hardware is supported, check out the [Supported models an

 ## Step 1: CUDA and NVIDIA drivers

-Make sure you have CUDA and the NVIDIA drivers installed - NVIDIA drivers on your device need to be compatible with CUDA version 12.6 or higher.
+Make sure you have CUDA and the NVIDIA drivers installed - NVIDIA drivers on your device need to be compatible with CUDA version 12.2 or higher.

 Add the NVIDIA binaries to your path:
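After the `PATH` update that the doc describes next (truncated in this diff), a quick sanity check that the toolkit is visible:

```shell
# Reports the installed CUDA toolkit release
nvcc --version
```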

docs/source/en/private_models.md

Lines changed: 1 addition & 1 deletion
@@ -37,5 +37,5 @@ model=<your private model>
 volume=$PWD/data
 token=<your cli Hugging Face Hub token>

-docker run --gpus all -e HF_TOKEN=$token -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:1.8 --model-id $model
+docker run --gpus all -e HF_TOKEN=$token -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:cuda-1.9 --model-id $model
 ```

docs/source/en/quick_tour.md

Lines changed: 5 additions & 5 deletions
@@ -24,7 +24,7 @@ The easiest way to get started with TEI is to use one of the official Docker con
 Hence one needs to install Docker following their [installation instructions](https://docs.docker.com/get-docker/).

 TEI supports inference both on GPU and CPU. If you plan on using a GPU, make sure to check that your hardware is supported by checking [this table](https://github.com/huggingface/text-embeddings-inference?tab=readme-ov-file#docker-images).
-Next, install the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html). NVIDIA drivers on your device need to be compatible with CUDA version 12.6 or higher.
+Next, install the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html). NVIDIA drivers on your device need to be compatible with CUDA version 12.2 or higher.

 ## Deploy

@@ -34,7 +34,7 @@ Next it's time to deploy your model. Let's say you want to use [`Qwen/Qwen3-Embe
 model=Qwen/Qwen3-Embedding-0.6B
 volume=$PWD/data

-docker run --gpus all -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:1.8 --model-id $model
+docker run --gpus all -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:cuda-1.9 --model-id $model
 ```

 <Tip>
@@ -120,7 +120,7 @@ Let's say you want to use [`BAAI/bge-reranker-large`](https://huggingface.co/BAA
 model=BAAI/bge-reranker-large
 volume=$PWD/data

-docker run --gpus all -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:1.8 --model-id $model
+docker run --gpus all -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:cuda-1.9 --model-id $model
 ```

 Once you have deployed a model, you can use the `rerank` endpoint to rank the similarity between a query and a list of texts. With `cURL` this can be done like so:
@@ -140,7 +140,7 @@ You can also use classic Sequence Classification models like [`SamLowe/roberta-b
 model=SamLowe/roberta-base-go_emotions
 volume=$PWD/data

-docker run --gpus all -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:1.8 --model-id $model
+docker run --gpus all -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:cuda-1.9 --model-id $model
 ```

 Once you have deployed the model you can use the `predict` endpoint to get the emotions most associated with an input:
@@ -192,5 +192,5 @@ git clone https://huggingface.co/Alibaba-NLP/gte-base-en-v1.5
 volume=$PWD

 # Mount the models directory inside the container with a volume and set the model ID
-docker run --gpus all -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:1.8 --model-id /data/gte-base-en-v1.5
+docker run --gpus all -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:cuda-1.9 --model-id /data/gte-base-en-v1.5
 ```

docs/source/en/supported_models.md

Lines changed: 12 additions & 12 deletions
@@ -74,21 +74,21 @@ The library does **not** support CUDA compute capabilities < 7.5, which means V1

 To leverage your GPUs, make sure to install the
 [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html), and use
-NVIDIA drivers with CUDA version 12.6 or higher.
+NVIDIA drivers with CUDA version 12.2 or higher.

 Find the appropriate Docker image for your hardware in the following table:

-| Architecture                            | Image                                                                          |
-|-----------------------------------------|--------------------------------------------------------------------------------|
-| CPU                                     | ghcr.io/huggingface/text-embeddings-inference:cpu-1.8                          |
-| Volta                                   | NOT SUPPORTED                                                                  |
-| Turing (T4, RTX 2000 series, ...)       | ghcr.io/huggingface/text-embeddings-inference:turing-1.8 (experimental)        |
-| Ampere 8.0 (A100, A30)                  | ghcr.io/huggingface/text-embeddings-inference:1.8                              |
-| Ampere 8.6 (A10, A40, ...)              | ghcr.io/huggingface/text-embeddings-inference:86-1.8                           |
-| Ada Lovelace (RTX 4000 series, ...)     | ghcr.io/huggingface/text-embeddings-inference:89-1.8                           |
-| Hopper (H100)                           | ghcr.io/huggingface/text-embeddings-inference:hopper-1.8                       |
-| Blackwell 10.0 (B200, GB200, ...)       | ghcr.io/huggingface/text-embeddings-inference:100-sha-ac69b50 (experimental)   |
-| Blackwell 12.0 (GeForce RTX 50X0, ...)  | ghcr.io/huggingface/text-embeddings-inference:120-sha-ac69b50 (experimental)   |
+| Architecture                            | Image                                                                     |
+|-----------------------------------------|---------------------------------------------------------------------------|
+| CPU                                     | ghcr.io/huggingface/text-embeddings-inference:cpu-1.9                     |
+| Volta                                   | NOT SUPPORTED                                                             |
+| Turing (T4, RTX 2000 series, ...)       | ghcr.io/huggingface/text-embeddings-inference:turing-1.9 (experimental)   |
+| Ampere 8.0 (A100, A30)                  | ghcr.io/huggingface/text-embeddings-inference:1.9                         |
+| Ampere 8.6 (A10, A40, ...)              | ghcr.io/huggingface/text-embeddings-inference:86-1.9                      |
+| Ada Lovelace (RTX 4000 series, ...)     | ghcr.io/huggingface/text-embeddings-inference:89-1.9                      |
+| Hopper (H100)                           | ghcr.io/huggingface/text-embeddings-inference:hopper-1.9                  |
+| Blackwell 10.0 (B200, GB200, ...)       | ghcr.io/huggingface/text-embeddings-inference:100-1.9 (experimental)      |
+| Blackwell 12.0 (GeForce RTX 50X0, ...)  | ghcr.io/huggingface/text-embeddings-inference:120-1.9 (experimental)      |

 **Warning**: Flash Attention is turned off by default for the Turing image as it suffers from precision issues.
 You can turn Flash Attention v1 ON by using the `USE_FLASH_ATTENTION=True` environment variable.
