From 06e0beaab5601e286f68e452b263aca22b20a40e Mon Sep 17 00:00:00 2001
From: Alvaro Bartolome <36760800+alvarobartt@users.noreply.github.com>
Date: Fri, 25 Jul 2025 17:01:57 +0200
Subject: [PATCH 1/2] Update `version` in `Cargo.toml`

---
 Cargo.lock | 16 ++++++++--------
 Cargo.toml |  2 +-
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 06e7c767..1ef02ed5 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -326,7 +326,7 @@ dependencies = [

 [[package]]
 name = "backend-grpc-client"
-version = "1.7.4"
+version = "1.8.0"
 dependencies = [
  "grpc-metadata",
  "prost 0.11.9",
@@ -4444,7 +4444,7 @@ dependencies = [

 [[package]]
 name = "text-embeddings-backend"
-version = "1.7.4"
+version = "1.8.0"
 dependencies = [
  "clap",
  "hf-hub",
@@ -4460,7 +4460,7 @@ dependencies = [

 [[package]]
 name = "text-embeddings-backend-candle"
-version = "1.7.4"
+version = "1.8.0"
 dependencies = [
  "accelerate-src",
  "anyhow",
@@ -4490,7 +4490,7 @@ dependencies = [

 [[package]]
 name = "text-embeddings-backend-core"
-version = "1.7.4"
+version = "1.8.0"
 dependencies = [
  "clap",
  "nohash-hasher",
@@ -4500,7 +4500,7 @@ dependencies = [

 [[package]]
 name = "text-embeddings-backend-ort"
-version = "1.7.4"
+version = "1.8.0"
 dependencies = [
  "anyhow",
  "ndarray",
@@ -4516,7 +4516,7 @@ dependencies = [

 [[package]]
 name = "text-embeddings-backend-python"
-version = "1.7.4"
+version = "1.8.0"
 dependencies = [
  "backend-grpc-client",
  "nohash-hasher",
@@ -4530,7 +4530,7 @@ dependencies = [

 [[package]]
 name = "text-embeddings-core"
-version = "1.7.4"
+version = "1.8.0"
 dependencies = [
  "async-channel",
  "hf-hub",
@@ -4545,7 +4545,7 @@ dependencies = [

 [[package]]
 name = "text-embeddings-router"
-version = "1.7.4"
+version = "1.8.0"
 dependencies = [
  "anyhow",
  "async-stream",
diff --git a/Cargo.toml b/Cargo.toml
index d1600eb7..5fc2822c 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -22,7 +22,7 @@ default-members = [
 resolver = "2"

 [workspace.package]
-version = "1.7.4"
+version = "1.8.0"
 edition = "2021"
 authors = ["Olivier Dehaene", "Nicolas Patry", "Alvaro Bartolome"]
 homepage = "https://github.com/huggingface/text-embeddings-inference"

From 4ce4c892e5c9497ff4c014f84002ec5825f98c01 Mon Sep 17 00:00:00 2001
From: Alvaro Bartolome <36760800+alvarobartt@users.noreply.github.com>
Date: Fri, 25 Jul 2025 17:02:08 +0200
Subject: [PATCH 2/2] Update image URIs to 1.8 instead

---
 README.md                          | 26 +++++++++++++-------------
 docs/source/en/private_models.md   |  2 +-
 docs/source/en/quick_tour.md       |  8 ++++----
 docs/source/en/supported_models.md | 12 ++++++------
 4 files changed, 24 insertions(+), 24 deletions(-)

diff --git a/README.md b/README.md
index 2e1476ea..f2508852 100644
--- a/README.md
+++ b/README.md
@@ -113,7 +113,7 @@ Below are some examples of the currently supported models:
 model=Qwen/Qwen3-Embedding-0.6B
 volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run

-docker run --gpus all -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:1.7 --model-id $model
+docker run --gpus all -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:1.8 --model-id $model
 ```

 And then you can make requests like
@@ -327,13 +327,13 @@ Text Embeddings Inference ships with multiple Docker images that you can use to

 | Architecture                        | Image                                                                     |
 |-------------------------------------|---------------------------------------------------------------------------|
-| CPU                                 | ghcr.io/huggingface/text-embeddings-inference:cpu-1.7                     |
+| CPU                                 | ghcr.io/huggingface/text-embeddings-inference:cpu-1.8                     |
 | Volta                               | NOT SUPPORTED                                                             |
-| Turing (T4, RTX 2000 series, ...)   | ghcr.io/huggingface/text-embeddings-inference:turing-1.7 (experimental)   |
-| Ampere 80 (A100, A30)               | ghcr.io/huggingface/text-embeddings-inference:1.7                         |
-| Ampere 86 (A10, A40, ...)           | ghcr.io/huggingface/text-embeddings-inference:86-1.7                      |
-| Ada Lovelace (RTX 4000 series, ...) | ghcr.io/huggingface/text-embeddings-inference:89-1.7                      |
-| Hopper (H100)                       | ghcr.io/huggingface/text-embeddings-inference:hopper-1.7 (experimental)   |
+| Turing (T4, RTX 2000 series, ...)   | ghcr.io/huggingface/text-embeddings-inference:turing-1.8 (experimental)   |
+| Ampere 80 (A100, A30)               | ghcr.io/huggingface/text-embeddings-inference:1.8                         |
+| Ampere 86 (A10, A40, ...)           | ghcr.io/huggingface/text-embeddings-inference:86-1.8                      |
+| Ada Lovelace (RTX 4000 series, ...) | ghcr.io/huggingface/text-embeddings-inference:89-1.8                      |
+| Hopper (H100)                       | ghcr.io/huggingface/text-embeddings-inference:hopper-1.8 (experimental)   |

 **Warning**: Flash Attention is turned off by default for the Turing image as it suffers from precision issues.
 You can turn Flash Attention v1 ON by using the `USE_FLASH_ATTENTION=True` environment variable.
@@ -362,7 +362,7 @@
 model=
 volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
 token=
-docker run --gpus all -e HF_TOKEN=$token -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:1.7 --model-id $model
+docker run --gpus all -e HF_TOKEN=$token -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:1.8 --model-id $model
 ```

 ### Air gapped deployment
@@ -385,7 +385,7 @@
 git clone https://huggingface.co/Qwen/Qwen3-Embedding-0.6B
 volume=$PWD
 # Mount the models directory inside the container with a volume and set the model ID
-docker run --gpus all -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:1.7 --model-id /data/Qwen3-Embedding-0.6B
+docker run --gpus all -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:1.8 --model-id /data/Qwen3-Embedding-0.6B
 ```

 ### Using Re-rankers models
@@ -402,7 +402,7 @@ downstream performance.
 model=BAAI/bge-reranker-large
 volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run

-docker run --gpus all -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:1.7 --model-id $model
+docker run --gpus all -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:1.8 --model-id $model
 ```

 And then you can rank the similarity between a query and a list of texts with:
@@ -422,7 +422,7 @@ You can also use classic Sequence Classification models like `SamLowe/roberta-ba
 model=SamLowe/roberta-base-go_emotions
 volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run

-docker run --gpus all -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:1.7 --model-id $model
+docker run --gpus all -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:1.8 --model-id $model
 ```

 Once you have deployed the model you can use the `predict` endpoint to get the emotions most associated with an input:
@@ -442,7 +442,7 @@ You can choose to activate SPLADE pooling for Bert and Distilbert MaskedLM archi
 model=naver/efficient-splade-VI-BT-large-query
 volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run

-docker run --gpus all -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:1.7 --model-id $model --pooling splade
+docker run --gpus all -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:1.8 --model-id $model --pooling splade
 ```

 Once you have deployed the model you can use the `/embed_sparse` endpoint to get the sparse embedding:
@@ -471,7 +471,7 @@ You can use the gRPC API by adding the `-grpc` tag to any TEI Docker image. For
 model=Qwen/Qwen3-Embedding-0.6B
 volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run

-docker run --gpus all -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:1.7-grpc --model-id $model
+docker run --gpus all -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:1.8-grpc --model-id $model
 ```

 ```shell
diff --git a/docs/source/en/private_models.md b/docs/source/en/private_models.md
index b2e8e266..e744adb2 100644
--- a/docs/source/en/private_models.md
+++ b/docs/source/en/private_models.md
@@ -37,5 +37,5 @@
 model=
 volume=$PWD/data
 token=
-docker run --gpus all -e HF_TOKEN=$token -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:1.7 --model-id $model
+docker run --gpus all -e HF_TOKEN=$token -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:1.8 --model-id $model
 ```
diff --git a/docs/source/en/quick_tour.md b/docs/source/en/quick_tour.md
index 56ee05a1..13aaec62 100644
--- a/docs/source/en/quick_tour.md
+++ b/docs/source/en/quick_tour.md
@@ -34,7 +34,7 @@ Next it's time to deploy your model. Let's say you want to use [`Qwen/Qwen3-Embe
 model=Qwen/Qwen3-Embedding-0.6B
 volume=$PWD/data

-docker run --gpus all -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:1.7 --model-id $model
+docker run --gpus all -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:1.8 --model-id $model
 ```

@@ -110,7 +110,7 @@ Let's say you want to use [`BAAI/bge-reranker-large`](https://huggingface.co/BAA
 model=BAAI/bge-reranker-large
 volume=$PWD/data

-docker run --gpus all -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:1.7 --model-id $model
+docker run --gpus all -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:1.8 --model-id $model
 ```

 Once you have deployed a model, you can use the `rerank` endpoint to rank the similarity between a query and a list of texts. With `cURL` this can be done like so:
@@ -130,7 +130,7 @@ You can also use classic Sequence Classification models like [`SamLowe/roberta-b
 model=SamLowe/roberta-base-go_emotions
 volume=$PWD/data

-docker run --gpus all -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:1.7 --model-id $model
+docker run --gpus all -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:1.8 --model-id $model
 ```

 Once you have deployed the model you can use the `predict` endpoint to get the emotions most associated with an input:
@@ -182,5 +182,5 @@
 git clone https://huggingface.co/Alibaba-NLP/gte-base-en-v1.5
 volume=$PWD
 # Mount the models directory inside the container with a volume and set the model ID
-docker run --gpus all -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:1.7 --model-id /data/gte-base-en-v1.5
+docker run --gpus all -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:1.8 --model-id /data/gte-base-en-v1.5
 ```
diff --git a/docs/source/en/supported_models.md b/docs/source/en/supported_models.md
index 3888c087..a4d82fd8 100644
--- a/docs/source/en/supported_models.md
+++ b/docs/source/en/supported_models.md
@@ -77,13 +77,13 @@ Find the appropriate Docker image for your hardware in the following table:

 | Architecture                        | Image                                                                      |
 |-------------------------------------|----------------------------------------------------------------------------|
-| CPU                                 | ghcr.io/huggingface/text-embeddings-inference:cpu-1.7                      |
+| CPU                                 | ghcr.io/huggingface/text-embeddings-inference:cpu-1.8                      |
 | Volta                               | NOT SUPPORTED                                                              |
-| Turing (T4, RTX 2000 series, ...)   | ghcr.io/huggingface/text-embeddings-inference:turing-1.7 (experimental)    |
-| Ampere 80 (A100, A30)               | ghcr.io/huggingface/text-embeddings-inference:1.7                          |
-| Ampere 86 (A10, A40, ...)           | ghcr.io/huggingface/text-embeddings-inference:86-1.7                       |
-| Ada Lovelace (RTX 4000 series, ...) | ghcr.io/huggingface/text-embeddings-inference:89-1.7                       |
-| Hopper (H100)                       | ghcr.io/huggingface/text-embeddings-inference:hopper-1.7 (experimental)    |
+| Turing (T4, RTX 2000 series, ...)   | ghcr.io/huggingface/text-embeddings-inference:turing-1.8 (experimental)    |
+| Ampere 80 (A100, A30)               | ghcr.io/huggingface/text-embeddings-inference:1.8                          |
+| Ampere 86 (A10, A40, ...)           | ghcr.io/huggingface/text-embeddings-inference:86-1.8                       |
+| Ada Lovelace (RTX 4000 series, ...) | ghcr.io/huggingface/text-embeddings-inference:89-1.8                       |
+| Hopper (H100)                       | ghcr.io/huggingface/text-embeddings-inference:hopper-1.8 (experimental)    |

 **Warning**: Flash Attention is turned off by default for the Turing image as it suffers from precision issues.
 You can turn Flash Attention v1 ON by using the `USE_FLASH_ATTENTION=True` environment variable.
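
As the warning in both tables notes, the Turing images ship with Flash Attention disabled. A minimal sketch of re-enabling Flash Attention v1 on the updated `turing-1.8` image via the `USE_FLASH_ATTENTION` environment variable (the model ID is illustrative; any supported model works):

```shell
model=Qwen/Qwen3-Embedding-0.6B
volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run

# USE_FLASH_ATTENTION=True turns Flash Attention v1 back on for the Turing image,
# where it is off by default because of the precision issues mentioned above
docker run --gpus all -e USE_FLASH_ATTENTION=True -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:turing-1.8 --model-id $model
```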