Skip to content

Commit 56b4795

Browse files
authored
model-conversion : add support for SentenceTransformers (#16387)
* model-conversion : add support for SentenceTransformers This commit adds support for models that use SentenceTransformer layers. The motivation for this is that if the converted model includes any of the numbered layers specified in the original model's repository then these changes enable these models to be used and verified. Currently the model-conversion only supports the base model output without any of the additional transformation layers. Usage: Convert the model that also includes the SentenceTransformer layers: ```console (venv) $ export EMBEDDING_MODEL_PATH="~/google/embeddinggemma-300M" (venv) make embedding-convert-model ``` Verify the produced embeddings from the converted model against the original model embeddings: ```console (venv) make embedding-verify-logits-st ``` The original model can be run using SentenceTransformer: ```console (venv) make embedding-run-original-model-st ``` Run the converted model using "SentenceTransformer" layers which enables pooling and normalization: ```console (venv) make embedding-run-converted-model-st ``` * add model-conversion example requirements * add support for -st flag in embedding model conversion This commit adds support for the -st flag in the embedding model conversion script. This will enable models to be converted using sentence transformers dense layers.
1 parent 2c0d875 commit 56b4795

File tree

9 files changed

+307
-150
lines changed

9 files changed

+307
-150
lines changed

examples/model-conversion/Makefile

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -116,20 +116,39 @@ embedding-convert-model:
116116
METADATA_OVERRIDE="$(METADATA_OVERRIDE)" \
117117
./scripts/embedding/convert-model.sh
118118

119+
embedding-convert-model-st:
120+
$(call validate_embedding_model_path,embedding-convert-model-st)
121+
@MODEL_NAME="$(MODEL_NAME)" OUTTYPE="$(OUTTYPE)" MODEL_PATH="$(EMBEDDING_MODEL_PATH)" \
122+
METADATA_OVERRIDE="$(METADATA_OVERRIDE)" \
123+
./scripts/embedding/convert-model.sh -st
124+
119125
embedding-run-original-model:
120126
$(call validate_embedding_model_path,embedding-run-original-model)
121127
@EMBEDDING_MODEL_PATH="$(EMBEDDING_MODEL_PATH)" \
128+
USE_SENTENCE_TRANSFORMERS="$(USE_SENTENCE_TRANSFORMERS)" \
122129
./scripts/embedding/run-original-model.py \
123-
$(if $(PROMPTS_FILE),--prompts-file "$(PROMPTS_FILE)")
130+
$(if $(PROMPTS_FILE),--prompts-file "$(PROMPTS_FILE)") \
131+
$(if $(USE_SENTENCE_TRANSFORMERS),--use-sentence-transformers)
132+
133+
embedding-run-original-model-st: USE_SENTENCE_TRANSFORMERS=1
134+
embedding-run-original-model-st: embedding-run-original-model
124135

125136
embedding-run-converted-model:
126137
@./scripts/embedding/run-converted-model.sh $(CONVERTED_EMBEDDING_MODEL) \
127-
$(if $(PROMPTS_FILE),--prompts-file "$(PROMPTS_FILE)")
138+
$(if $(PROMPTS_FILE),--prompts-file "$(PROMPTS_FILE)") \
139+
$(if $(USE_POOLING),--pooling)
140+
141+
embedding-run-converted-model-st: USE_POOLING=1
142+
embedding-run-converted-model-st: embedding-run-converted-model
128143

129144
embedding-verify-logits: embedding-run-original-model embedding-run-converted-model
130145
@./scripts/embedding/compare-embeddings-logits.sh \
131146
$(if $(PROMPTS_FILE),--prompts-file "$(PROMPTS_FILE)")
132147

148+
embedding-verify-logits-st: embedding-run-original-model-st embedding-run-converted-model-st
149+
@./scripts/embedding/compare-embeddings-logits.sh \
150+
$(if $(PROMPTS_FILE),--prompts-file "$(PROMPTS_FILE)")
151+
133152
embedding-inspect-original-model:
134153
$(call validate_embedding_model_path,embedding-inspect-original-model)
135154
@EMBEDDING_MODEL_PATH="$(EMBEDDING_MODEL_PATH)" ./scripts/utils/inspect-org-model.py -m ${EMBEDDING_MODEL_PATH}

examples/model-conversion/README.md

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -189,6 +189,23 @@ This command will save two files to the `data` directory, one is a binary
189189
file containing logits which will be used for comparison with the converted
190190
model, and the other is a text file which allows for manual visual inspection.
191191

192+
#### Using SentenceTransformer with numbered layers
193+
For models that have numbered SentenceTransformer layers (01_Pooling, 02_Dense,
194+
03_Dense, 04_Normalize), use the `-st` targets to apply all these layers:
195+
196+
```console
197+
# Run original model with SentenceTransformer (applies all numbered layers)
198+
(venv) $ make embedding-run-original-model-st
199+
200+
# Run converted model with pooling enabled
201+
(venv) $ make embedding-run-converted-model-st
202+
```
203+
204+
This will use the SentenceTransformer library to load and run the model, which
205+
automatically applies all the numbered layers in the correct order. This is
206+
particularly useful when comparing with models that should include these
207+
additional transformation layers beyond just the base model output.
208+
192209
### Model conversion
193210
After updates have been made to [gguf-py](../../gguf-py) to add support for the
194211
new model the model can be converted to GGUF format using the following command:
@@ -208,6 +225,13 @@ was done manually in the previous steps) and compare the logits:
208225
(venv) $ make embedding-verify-logits
209226
```
210227

228+
For models with SentenceTransformer layers, use the `-st` verification target:
229+
```console
230+
(venv) $ make embedding-verify-logits-st
231+
```
232+
This convenience target automatically runs both the original model with SentenceTransformer
233+
and the converted model with pooling enabled, then compares the results.
234+
211235
### llama-server verification
212236
To verify that the converted model works with llama-server, the following
213237
command can be used:

examples/model-conversion/logits.cpp

Lines changed: 53 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,7 @@
11
#include "llama.h"
2+
#include "common.h"
3+
4+
25
#include <cstdio>
36
#include <cstring>
47
#include <string>
@@ -8,7 +11,10 @@
811

912
static void print_usage(int, char ** argv) {
1013
printf("\nexample usage:\n");
11-
printf("\n %s -m model.gguf [-ngl n_gpu_layers] -embd-mode [prompt]\n", argv[0]);
14+
printf("\n %s -m model.gguf [-ngl n_gpu_layers] -embd-mode [-pooling] [-embd-norm <norm>] [prompt]\n", argv[0]);
15+
printf("\n");
16+
printf(" -embd-norm: normalization type for pooled embeddings (default: 2)\n");
17+
printf(" -1=none, 0=max absolute int16, 1=taxicab, 2=Euclidean/L2, >2=p-norm\n");
1218
printf("\n");
1319
}
1420

@@ -17,6 +23,8 @@ int main(int argc, char ** argv) {
1723
std::string prompt = "Hello, my name is";
1824
int ngl = 0;
1925
bool embedding_mode = false;
26+
bool pooling_enabled = false;
27+
int32_t embd_norm = 2; // (-1=none, 0=max absolute int16, 1=taxicab, 2=Euclidean/L2, >2=p-norm)
2028

2129
{
2230
int i = 1;
@@ -41,9 +49,13 @@ int main(int argc, char ** argv) {
4149
return 1;
4250
}
4351
} else if (strcmp(argv[i], "-embd-mode") == 0) {
52+
embedding_mode = true;
53+
} else if (strcmp(argv[i], "-pooling") == 0) {
54+
pooling_enabled = true;
55+
} else if (strcmp(argv[i], "-embd-norm") == 0) {
4456
if (i + 1 < argc) {
4557
try {
46-
embedding_mode = true;
58+
embd_norm = std::stoi(argv[++i]);
4759
} catch (...) {
4860
print_usage(argc, argv);
4961
return 1;
@@ -112,7 +124,7 @@ int main(int argc, char ** argv) {
112124
ctx_params.no_perf = false;
113125
if (embedding_mode) {
114126
ctx_params.embeddings = true;
115-
ctx_params.pooling_type = LLAMA_POOLING_TYPE_NONE;
127+
ctx_params.pooling_type = pooling_enabled ? LLAMA_POOLING_TYPE_MEAN : LLAMA_POOLING_TYPE_NONE;
116128
ctx_params.n_ubatch = ctx_params.n_batch;
117129
}
118130

@@ -143,17 +155,27 @@ int main(int argc, char ** argv) {
143155
return 1;
144156
}
145157

146-
float * logits;
147-
int n_logits;
158+
float * data_ptr;
159+
int data_size;
148160
const char * type;
161+
std::vector<float> embd_out;
149162

150163
if (embedding_mode) {
151-
logits = llama_get_embeddings(ctx);
152-
n_logits = llama_model_n_embd(model) * batch.n_tokens;
164+
const int n_embd = llama_model_n_embd(model);
165+
const int n_embd_count = pooling_enabled ? 1 : batch.n_tokens;
166+
const int n_embeddings = n_embd * n_embd_count;
167+
float * embeddings;
153168
type = "-embeddings";
154169

155-
const int n_embd = llama_model_n_embd(model);
156-
const int n_embd_count = batch.n_tokens;
170+
if (llama_pooling_type(ctx) != LLAMA_POOLING_TYPE_NONE) {
171+
embeddings = llama_get_embeddings_seq(ctx, 0);
172+
embd_out.resize(n_embeddings);
173+
printf("Normalizing embeddings using norm: %d\n", embd_norm);
174+
common_embd_normalize(embeddings, embd_out.data(), n_embeddings, embd_norm);
175+
embeddings = embd_out.data();
176+
} else {
177+
embeddings = llama_get_embeddings(ctx);
178+
}
157179

158180
printf("Embedding dimension: %d\n", n_embd);
159181
printf("\n");
@@ -164,43 +186,49 @@ int main(int argc, char ** argv) {
164186

165187
// Print first 3 values
166188
for (int i = 0; i < 3 && i < n_embd; i++) {
167-
printf("%9.6f ", logits[j * n_embd + i]);
189+
printf("%9.6f ", embeddings[j * n_embd + i]);
168190
}
169191

170192
printf(" ... ");
171193

172194
// Print last 3 values
173195
for (int i = n_embd - 3; i < n_embd; i++) {
174196
if (i >= 0) {
175-
printf("%9.6f ", logits[j * n_embd + i]);
197+
printf("%9.6f ", embeddings[j * n_embd + i]);
176198
}
177199
}
178200

179201
printf("\n");
180202
}
181203
printf("\n");
182204

183-
printf("Embeddings size: %d\n", n_logits);
205+
printf("Embeddings size: %d\n", n_embeddings);
206+
207+
data_ptr = embeddings;
208+
data_size = n_embeddings;
184209
} else {
185-
logits = llama_get_logits_ith(ctx, batch.n_tokens - 1);
186-
n_logits = llama_vocab_n_tokens(vocab);
210+
float * logits = llama_get_logits_ith(ctx, batch.n_tokens - 1);
211+
const int n_logits = llama_vocab_n_tokens(vocab);
187212
type = "";
188213
printf("Vocab size: %d\n", n_logits);
214+
215+
data_ptr = logits;
216+
data_size = n_logits;
189217
}
190218

191219
std::filesystem::create_directory("data");
192220

193-
// Save logits to binary file
221+
// Save data to binary file
194222
char bin_filename[512];
195223
snprintf(bin_filename, sizeof(bin_filename), "data/llamacpp-%s%s.bin", model_name, type);
196-
printf("Saving logits to %s\n", bin_filename);
224+
printf("Saving data to %s\n", bin_filename);
197225

198226
FILE * f = fopen(bin_filename, "wb");
199227
if (f == NULL) {
200228
fprintf(stderr, "%s: error: failed to open binary output file\n", __func__);
201229
return 1;
202230
}
203-
fwrite(logits, sizeof(float), n_logits, f);
231+
fwrite(data_ptr, sizeof(float), data_size, f);
204232
fclose(f);
205233

206234
// Also save as text for debugging
@@ -211,27 +239,27 @@ int main(int argc, char ** argv) {
211239
fprintf(stderr, "%s: error: failed to open text output file\n", __func__);
212240
return 1;
213241
}
214-
for (int i = 0; i < n_logits; i++) {
215-
fprintf(f, "%d: %.6f\n", i, logits[i]);
242+
for (int i = 0; i < data_size; i++) {
243+
fprintf(f, "%d: %.6f\n", i, data_ptr[i]);
216244
}
217245
fclose(f);
218246

219247
if (!embedding_mode) {
220248
printf("First 10 logits: ");
221-
for (int i = 0; i < 10 && i < n_logits; i++) {
222-
printf("%.6f ", logits[i]);
249+
for (int i = 0; i < 10 && i < data_size; i++) {
250+
printf("%.6f ", data_ptr[i]);
223251
}
224252
printf("\n");
225253

226254
printf("Last 10 logits: ");
227-
for (int i = n_logits - 10; i < n_logits; i++) {
228-
if (i >= 0) printf("%.6f ", logits[i]);
255+
for (int i = data_size - 10; i < data_size; i++) {
256+
if (i >= 0) printf("%.6f ", data_ptr[i]);
229257
}
230258
printf("\n\n");
231259
}
232260

233-
printf("Logits saved to %s\n", bin_filename);
234-
printf("Logits saved to %s\n", txt_filename);
261+
printf("Data saved to %s\n", bin_filename);
262+
printf("Data saved to %s\n", txt_filename);
235263

236264
llama_free(ctx);
237265
llama_model_free(model);

examples/model-conversion/requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,3 +4,4 @@ torchvision
44
transformers
55
huggingface-hub
66
accelerate
7+
sentence-transformers

examples/model-conversion/scripts/embedding/convert-model.sh

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,21 @@
22

33
set -e
44

5+
# Parse command line arguments
6+
SENTENCE_TRANSFORMERS=""
7+
while [[ $# -gt 0 ]]; do
8+
case $1 in
9+
-st|--sentence-transformers)
10+
SENTENCE_TRANSFORMERS="--sentence-transformers-dense-modules"
11+
shift
12+
;;
13+
*)
14+
echo "Unknown option: $1"
15+
exit 1
16+
;;
17+
esac
18+
done
19+
520
MODEL_NAME="${MODEL_NAME:-$(basename "$EMBEDDING_MODEL_PATH")}"
621
OUTPUT_DIR="${OUTPUT_DIR:-../../models}"
722
TYPE="${OUTTYPE:-f16}"
@@ -15,7 +30,8 @@ echo "Converted model path:: ${CONVERTED_MODEL}"
1530
python ../../convert_hf_to_gguf.py --verbose \
1631
${EMBEDDING_MODEL_PATH} \
1732
--outfile ${CONVERTED_MODEL} \
18-
--outtype ${TYPE}
33+
--outtype ${TYPE} \
34+
${SENTENCE_TRANSFORMERS}
1935

2036
echo ""
2137
echo "The environment variable CONVERTED_EMBEDDING MODEL can be set to this path using:"

examples/model-conversion/scripts/embedding/run-converted-model.sh

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,13 +5,18 @@ set -e
55
# Parse command line arguments
66
CONVERTED_MODEL=""
77
PROMPTS_FILE=""
8+
USE_POOLING=""
89

910
while [[ $# -gt 0 ]]; do
1011
case $1 in
1112
-p|--prompts-file)
1213
PROMPTS_FILE="$2"
1314
shift 2
1415
;;
16+
--pooling)
17+
USE_POOLING="1"
18+
shift
19+
;;
1520
*)
1621
if [ -z "$CONVERTED_MODEL" ]; then
1722
CONVERTED_MODEL="$1"
@@ -47,4 +52,8 @@ echo $CONVERTED_MODEL
4752

4853
cmake --build ../../build --target llama-logits -j8
4954
# TODO: update logits.cpp to accept a --file/-f option for the prompt
50-
../../build/bin/llama-logits -m "$CONVERTED_MODEL" -embd-mode "$PROMPT"
55+
if [ -n "$USE_POOLING" ]; then
56+
../../build/bin/llama-logits -m "$CONVERTED_MODEL" -embd-mode -pooling "$PROMPT"
57+
else
58+
../../build/bin/llama-logits -m "$CONVERTED_MODEL" -embd-mode "$PROMPT"
59+
fi

0 commit comments

Comments
 (0)