From a1b10181169648c918b00f0ac52ba79326246ef3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?=
Date: Sat, 31 May 2025 08:59:25 +0200
Subject: [PATCH 01/11] add multiple classifier outputs and labels support

---
 src/llama-context.cpp |  7 ++++---
 src/llama-model.cpp   | 46 +++++++++++++++++++++++++++++++++++++++----
 2 files changed, 46 insertions(+), 7 deletions(-)

diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index e153351af3809..6940440ecf380 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -808,16 +808,17 @@ int llama_context::encode(llama_batch & inp_batch) {
                 } break;
             case LLAMA_POOLING_TYPE_RANK:
                 {
-                    // extract the rerank score - a single float per sequence
+                    // extract the rerank score - n_cls_out floats per sequence
                     auto & embd_seq_out = embd_seq;
+                    const uint32_t n_cls_out = hparams.n_cls_out;
 
                     for (uint32_t s = 0; s < ubatch.n_seqs; ++s) {
                         const llama_seq_id seq_id = ubatch.seq_id[s][0];
                         if (embd_seq_out.find(seq_id) != embd_seq_out.end()) {
                             continue;
                         }
-                        embd_seq_out[seq_id].resize(1);
-                        ggml_backend_tensor_get_async(backend_embd, t_embd, embd_seq_out[seq_id].data(), (seq_id)*sizeof(float), sizeof(float));
+                        embd_seq_out[seq_id].resize(n_cls_out);
+                        ggml_backend_tensor_get_async(backend_embd, t_embd, embd_seq_out[seq_id].data(), (n_cls_out*seq_id)*sizeof(float), n_cls_out*sizeof(float));
                     }
                 } break;
             case LLAMA_POOLING_TYPE_UNSPECIFIED:
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 3f1f6c9bf3b06..08e178aa55f7c 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -417,18 +417,41 @@ void llama_model::load_arch(llama_model_loader & ml) {
     }
 }
 
+struct LLM_KV_MATCH_WITHOUT_ARCH {
+    const LLM_KV kv_arch = LLM_KV(LLM_ARCH_UNKNOWN);
+    const std::string kv_arch_prefix = llm_arch_name(LLM_ARCH_UNKNOWN);
+
+    bool operator()(const llm_kv & kv, const std::string & kv_name) const
+    {
+        std::string kv_match = kv_arch(kv);
+        auto kv_arch_pos = kv_match.find(kv_arch_prefix);
+
+        return kv_name.find(kv_match.substr(kv_arch_pos == std::string::npos ? 0 : kv_arch_pos + kv_arch_prefix.size())) != std::string::npos;
+    }
+};
+
 void llama_model::load_hparams(llama_model_loader & ml) {
     const gguf_context * ctx = ml.meta.get();
 
     // get metadata as string
     for (int i = 0; i < gguf_get_n_kv(ctx); i++) {
+        const char * name = gguf_get_key(ctx, i);
         gguf_type type = gguf_get_kv_type(ctx, i);
+
         if (type == GGUF_TYPE_ARRAY) {
-            continue;
+            if (LLM_KV_MATCH_WITHOUT_ARCH()(LLM_KV_CLASSIFIER_OUTPUT_LABELS, name)) {
+                const size_t n_items = gguf_get_arr_n(ctx, i);
+
+                for (size_t j = 0; j < n_items; j++) {
+                    const std::string name_i = format("%s.%zu", name, j);
+                    const std::string value = gguf_get_arr_str(ctx, i, j);
+                    gguf_kv.emplace(name_i, value);
+                }
+            }
+        } else {
+            const std::string value = gguf_kv_to_str(ctx, i);
+            gguf_kv.emplace(name, value);
         }
-        const char * name = gguf_get_key(ctx, i);
-        const std::string value = gguf_kv_to_str(ctx, i);
-        gguf_kv.emplace(name, value);
     }
 
     // get general kv
@@ -13593,6 +13616,21 @@ int32_t llama_model_n_head_kv(const llama_model * model) {
     return model->hparams.n_head_kv();
 }
 
+uint32_t llama_model_n_cls_out(const struct llama_model * model) {
+    return model->hparams.n_cls_out;
+}
+
+const char * llama_model_get_classifier_label_by_index(const struct llama_model * model, uint32_t i) {
+    const std::string key = format("%s.%u", LLM_KV(model->arch)(LLM_KV_CLASSIFIER_OUTPUT_LABELS).c_str(), i);
+    const auto & it = model->gguf_kv.find(key);
+
+    if (it != model->gguf_kv.end()) {
+        return it->second.c_str();
+    }
+
+    return nullptr;
+}
+
 // deprecated
 int32_t llama_n_ctx_train(const llama_model * model) {
     return llama_model_n_ctx_train(model);
}
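Note: with this change, RANK pooling writes a contiguous block of n_cls_out floats per sequence into the pooled output tensor, so sequence s owns elements [s*n_cls_out, (s+1)*n_cls_out). A minimal sketch of reading the scores back through the public API (hypothetical caller code, not part of the patch; assumes ctx/model were set up with LLAMA_POOLING_TYPE_RANK, n_seq sequences were just processed, and uses llama_model_n_cls_out() which is made public in patch 02):

```cpp
// Sketch: consume per-sequence classifier scores after llama_encode()/llama_decode().
const uint32_t n_cls_out = llama_model_n_cls_out(model);
for (llama_seq_id s = 0; s < n_seq; ++s) {
    const float * scores = llama_get_embeddings_seq(ctx, s); // n_cls_out floats per sequence
    for (uint32_t k = 0; k < n_cls_out; ++k) {
        printf("seq %d output %u: %8.3f\n", s, k, scores[k]);
    }
}
```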
From 6ef43bab4783c9ec84a6ba78a156f667712ecae1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?=
Date: Sat, 31 May 2025 09:01:37 +0200
Subject: [PATCH 02/11] make public

---
 include/llama.h | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/include/llama.h b/include/llama.h
index 01762bea2bf96..06ef86542453b 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -506,6 +506,12 @@ extern "C" {
     // Get the model's RoPE frequency scaling factor
     LLAMA_API float llama_model_rope_freq_scale_train(const struct llama_model * model);
 
+    // Returns the number of classifier outputs (only valid for classifier models)
+    LLAMA_API uint32_t llama_model_n_cls_out(const struct llama_model * model);
+
+    // Returns label of classifier output by index (<n_cls_out). Returns nullptr if no label provided
+    LLAMA_API const char * llama_model_get_classifier_label_by_index(const struct llama_model * model, uint32_t i);
+

From  Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?=
Date: Sat, 31 May 2025 09:04:02 +0200
Subject: [PATCH 03/11] show multiple rankings and associated labels

ggml-ci
---
 examples/embedding/embedding.cpp | 20 ++++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp
index 71f700877a3b9..518289435980a 100644
--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@@ -4,6 +4,7 @@
 #include "llama.h"
 
 #include <ctime>
+#include <cstring>
 #include <algorithm>
 
 #if defined(_MSC_VER)
@@ -236,9 +237,24 @@ int main(int argc, char ** argv) {
                 LOG("\n");
             }
         }
     } else if (pooling_type == LLAMA_POOLING_TYPE_RANK) {
+        const uint32_t n_cls_out = llama_model_n_cls_out(model);
+        std::vector<std::string> cls_out_labels;
+
+        for (uint32_t i = 0; i < n_cls_out; i++) {
+            const char * label = llama_model_get_classifier_label_by_index(model, i);
+            const std::string label_i = label == nullptr || strlen(label) == 0 ? std::to_string(i) : label;
+            cls_out_labels.emplace_back(label_i);
+        }
+
         for (int j = 0; j < n_embd_count; j++) {
-            // NOTE: if you change this log - update the tests in ci/run.sh
-            LOG("rerank score %d: %8.3f\n", j, emb[j * n_embd]);
+            for (uint32_t i = 0; i < n_cls_out; i++) {
+                // NOTE: if you change this log - update the tests in ci/run.sh
+                if (n_cls_out == 1) {
+                    LOG("rerank score %d: %8.3f\n", j, emb[j * n_embd]);
+                } else {
+                    LOG("rerank score %d: %8.3f [%s]\n", j, emb[j * n_embd + i], cls_out_labels[i].c_str());
+                }
+            }
         }
     } else {
         // print the first part of the embeddings or for a single prompt, the full embedding
From 38ece05b9a1631d93eb7afefdb8e7e548591fd41 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?=
Date: Sun, 1 Jun 2025 22:29:14 +0200
Subject: [PATCH 04/11] move labels to llama_model

---
 examples/embedding/embedding.cpp |  2 +-
 include/llama.h                  |  2 +-
 src/llama-model.cpp              | 27 +++++----------------------
 src/llama-model.h                |  3 +++
 4 files changed, 10 insertions(+), 24 deletions(-)

diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp
index 518289435980a..50bafabc8bd9f 100644
--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@@ -241,7 +241,7 @@ int main(int argc, char ** argv) {
         std::vector<std::string> cls_out_labels;
 
         for (uint32_t i = 0; i < n_cls_out; i++) {
-            const char * label = llama_model_get_classifier_label_by_index(model, i);
+            const char * label = llama_model_cls_label(model, i);
             const std::string label_i = label == nullptr || strlen(label) == 0 ? std::to_string(i) : label;
             cls_out_labels.emplace_back(label_i);
         }
diff --git a/include/llama.h b/include/llama.h
index 06ef86542453b..c46e1bb744591 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -510,7 +510,7 @@ extern "C" {
     LLAMA_API uint32_t llama_model_n_cls_out(const struct llama_model * model);
 
     // Returns label of classifier output by index (<n_cls_out). Returns nullptr if no label provided
-    LLAMA_API const char * llama_model_get_classifier_label_by_index(const struct llama_model * model, uint32_t i);
+    LLAMA_API const char * llama_model_cls_label(const struct llama_model * model, uint32_t i);
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 08e178aa55f7c..f620abb3773f9 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -417,39 +417,25 @@ void llama_model::load_arch(llama_model_loader & ml) {
     }
 }
 
-struct LLM_KV_MATCH_WITHOUT_ARCH {
-    const LLM_KV kv_arch = LLM_KV(LLM_ARCH_UNKNOWN);
-    const std::string kv_arch_prefix = llm_arch_name(LLM_ARCH_UNKNOWN);
-
-    bool operator()(const llm_kv & kv, const std::string & kv_name) const
-    {
-        std::string kv_match = kv_arch(kv);
-        auto kv_arch_pos = kv_match.find(kv_arch_prefix);
-
-        return kv_name.find(kv_match.substr(kv_arch_pos == std::string::npos ? 0 : kv_arch_pos + kv_arch_prefix.size())) != std::string::npos;
-    }
-};
-
 void llama_model::load_hparams(llama_model_loader & ml) {
     const gguf_context * ctx = ml.meta.get();
 
     // get metadata as string
     for (int i = 0; i < gguf_get_n_kv(ctx); i++) {
         const char * name = gguf_get_key(ctx, i);
         gguf_type type = gguf_get_kv_type(ctx, i);
 
         if (type == GGUF_TYPE_ARRAY) {
-            if (LLM_KV_MATCH_WITHOUT_ARCH()(LLM_KV_CLASSIFIER_OUTPUT_LABELS, name)) {
+            if (LLM_KV(arch)(LLM_KV_CLASSIFIER_OUTPUT_LABELS) == name) {
                 const size_t n_items = gguf_get_arr_n(ctx, i);
 
                 for (size_t j = 0; j < n_items; j++) {
-                    const std::string name_i = format("%s.%zu", name, j);
                     const std::string value = gguf_get_arr_str(ctx, i, j);
-                    gguf_kv.emplace(name_i, value);
+                    classifier_labels.emplace_back(value);
                 }
             }
         } else {
             const std::string value = gguf_kv_to_str(ctx, i);
             gguf_kv.emplace(name, value);
         }
     }
@@ -13619,12 +13605,9 @@ uint32_t llama_model_n_cls_out(const struct llama_model * model) {
     return model->hparams.n_cls_out;
 }
 
-const char * llama_model_get_classifier_label_by_index(const struct llama_model * model, uint32_t i) {
-    const std::string key = format("%s.%u", LLM_KV(model->arch)(LLM_KV_CLASSIFIER_OUTPUT_LABELS).c_str(), i);
-    const auto & it = model->gguf_kv.find(key);
-
-    if (it != model->gguf_kv.end()) {
-        return it->second.c_str();
+const char * llama_model_cls_label(const struct llama_model * model, uint32_t i) {
+    if (i < model->classifier_labels.size()) {
+        return model->classifier_labels[i].c_str();
     }
 
     return nullptr;
diff --git a/src/llama-model.h b/src/llama-model.h
index cbea2cb331b62..4f0b5c36fb9b9 100644
--- a/src/llama-model.h
+++ b/src/llama-model.h
@@ -363,6 +363,9 @@ struct llama_model {
     // for quantize-stats only
     std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;
 
+    // for classifier models
+    std::vector<std::string> classifier_labels;
+
     int64_t t_load_us = 0;
     int64_t t_start_us = 0;
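Note: with the labels cached on llama_model, llama_model_cls_label() becomes a constant-time vector lookup instead of a per-call gguf_kv key build. A hedged sketch of the intended caller pattern (mirrors what embedding.cpp does above; model setup omitted, variable names illustrative):

```cpp
// Sketch: map classifier outputs to display labels, falling back to the
// numeric index when no label (or an empty one) is stored in the GGUF.
std::vector<std::string> labels;
const uint32_t n_cls_out = llama_model_n_cls_out(model);
for (uint32_t i = 0; i < n_cls_out; i++) {
    const char * label = llama_model_cls_label(model, i); // nullptr if absent
    labels.emplace_back(label != nullptr && label[0] != '\0' ? std::string(label) : std::to_string(i));
}
```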
From 3a52f4c7155fa72c1641097c4dda9d3c74d99f29 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?=
Date: Mon, 2 Jun 2025 09:58:05 +0200
Subject: [PATCH 05/11] update n_cls_out for any arch with labels

---
 src/llama-model.cpp | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index f620abb3773f9..964dcd24753f5 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -552,6 +552,11 @@ void llama_model::load_hparams(llama_model_loader & ml) {
     uint32_t n_vocab = 0;
     ml.get_key(LLM_KV_VOCAB_SIZE, n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, n_vocab, false);
 
+    // for classifier models
+    if (!classifier_labels.empty()) {
+        hparams.n_cls_out = classifier_labels.size();
+    }
+
     // arch-specific KVs
     switch (arch) {
         case LLM_ARCH_LLAMA:
@@ -695,7 +700,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                 ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
                 ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
-                ml.get_arr_n(LLM_KV_CLASSIFIER_OUTPUT_LABELS, hparams.n_cls_out, false);
 
                 switch (hparams.n_layer) {
                     case 3:

From 41049e6c539253b0b9d9a2991185fb35e678a0d9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?=
Date: Mon, 2 Jun 2025 12:17:56 +0200
Subject: [PATCH 06/11] be more specific about behaviour

---
 include/llama.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/llama.h b/include/llama.h
index ba97619c1b882..6f6b19f3dfdc6 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -510,6 +510,7 @@ extern "C" {
     LLAMA_API float llama_model_rope_freq_scale_train(const struct llama_model * model);
 
     // Returns the number of classifier outputs (only valid for classifier models)
+    // Undefined behavior for non-classifier models
     LLAMA_API uint32_t llama_model_n_cls_out(const struct llama_model * model);
 
     // Returns label of classifier output by index (<n_cls_out). Returns nullptr if no label provided
From  Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?=
Date: Mon, 2 Jun 2025 23:04:31 +0200
Subject: [PATCH 07/11] move string array functionality into model-loader

---
 src/llama-model-loader.cpp | 59 +++++++++++++++++++++++++++-----------
 src/llama-model.cpp        | 18 ++++--------
 2 files changed, 47 insertions(+), 30 deletions(-)

diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
index ddb1b03675b28..5214727057bd0 100644
--- a/src/llama-model-loader.cpp
+++ b/src/llama-model-loader.cpp
@@ -288,9 +288,10 @@ namespace GGUFMeta {
 
     template<typename T>
     bool llama_model_loader::get_arr(const std::string & key, std::vector<T> & result, bool required) {
-        const int kid = gguf_find_key(meta.get(), key.c_str());
+        const gguf_context * ctx = meta.get();
+        const int kid = gguf_find_key(ctx, key.c_str());
 
-        if (kid < 0 || gguf_get_kv_type(meta.get(), kid) != GGUF_TYPE_ARRAY) {
+        if (kid < 0 || gguf_get_kv_type(ctx, kid) != GGUF_TYPE_ARRAY) {
             if (required) {
                 throw std::runtime_error(format("array key not found in model: %s", key.c_str()));
             }
@@ -298,28 +299,40 @@ namespace GGUFMeta {
         }
 
         struct GGUFMeta::ArrayInfo arr_info =
-            GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(meta.get(), kid);
+            GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(ctx, kid);
 
         switch (arr_info.gt) {
             case GGUF_TYPE_UINT32:
-            case GGUF_TYPE_INT32:   GGML_ASSERT((std::is_same<T,  int32_t>::value) ||
-                                                (std::is_same<T, uint32_t>::value)); break;
-            case GGUF_TYPE_FLOAT32: GGML_ASSERT((std::is_same<T, float>::value)); break;
+            case GGUF_TYPE_INT32:   GGML_ASSERT((std::is_same<T,  int32_t>::value) ||
+                                                (std::is_same<T, uint32_t>::value)); break;
+            case GGUF_TYPE_FLOAT32: GGML_ASSERT((std::is_same<T, float>::value)); break;
+            case GGUF_TYPE_STRING:  GGML_ASSERT((std::is_same<T, std::string>::value)); break;
             default:
-                throw std::runtime_error(format("%s is not a float32/uint32/int32 array", key.c_str()));
+                throw std::runtime_error(format("%s is not a string/float32/uint32/int32 array", key.c_str()));
         }
 
-        result.resize(arr_info.length);
-        result.assign((const T*)arr_info.data, (const T *)arr_info.data + arr_info.length);
+        if constexpr (std::is_same<T, std::string>::value) {
+            const size_t n_items = gguf_get_arr_n(ctx, kid);
+            result.clear();
+
+            for (size_t i = 0; i < n_items; i++) {
+                const T value = gguf_get_arr_str(ctx, kid, i);
+                result.emplace_back(value);
+            }
+        } else {
+            result.resize(arr_info.length);
+            result.assign((const T*)arr_info.data, (const T *)arr_info.data + arr_info.length);
+        }
 
         return true;
     }
 
     template<typename T, size_t N_MAX>
     bool llama_model_loader::get_arr(const std::string & key, std::array<T, N_MAX> & result, bool required) {
-        const int kid = gguf_find_key(meta.get(), key.c_str());
+        const gguf_context * ctx = meta.get();
+        const int kid = gguf_find_key(ctx, key.c_str());
 
-        if (kid < 0 || gguf_get_kv_type(meta.get(), kid) != GGUF_TYPE_ARRAY) {
+        if (kid < 0 || gguf_get_kv_type(ctx, kid) != GGUF_TYPE_ARRAY) {
             if (required) {
                 throw std::runtime_error(format("array key not found in model: %s", key.c_str()));
             }
@@ -327,22 +340,32 @@ namespace GGUFMeta {
         }
 
         struct GGUFMeta::ArrayInfo arr_info =
-            GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(meta.get(), kid);
+            GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(ctx, kid);
 
         switch (arr_info.gt) {
            case GGUF_TYPE_UINT32:
-            case GGUF_TYPE_INT32:   GGML_ASSERT((std::is_same<T,  int32_t>::value) ||
-                                                (std::is_same<T, uint32_t>::value)); break;
-            case GGUF_TYPE_FLOAT32: GGML_ASSERT((std::is_same<T, float>::value)); break;
+            case GGUF_TYPE_INT32:   GGML_ASSERT((std::is_same<T,  int32_t>::value) ||
+                                                (std::is_same<T, uint32_t>::value)); break;
+            case GGUF_TYPE_FLOAT32: GGML_ASSERT((std::is_same<T, float>::value)); break;
+            case GGUF_TYPE_STRING:  GGML_ASSERT((std::is_same<T, std::string>::value)); break;
             default:
-                throw std::runtime_error(format("%s is not a float32/uint32/int32 array", key.c_str()));
+                throw std::runtime_error(format("%s is not a string/float32/uint32/int32 array", key.c_str()));
         }
 
         if (arr_info.length > N_MAX) {
             throw std::runtime_error(format("array length %u for key %s exceeds max %u", (uint32_t) arr_info.length, key.c_str(), (uint32_t) N_MAX));
         }
 
-        std::copy((const T*)arr_info.data, (const T *)arr_info.data + arr_info.length, result.begin());
+        if constexpr (std::is_same<T, std::string>::value) {
+            const size_t n_items = gguf_get_arr_n(meta.get(), kid);
+
+            for (size_t i = 0; i < n_items; i++) {
+                const T value = gguf_get_arr_str(meta.get(), kid, i);
+                result[i] = value;
+            }
+        } else {
+            std::copy((const T*)arr_info.data, (const T *)arr_info.data + arr_info.length, result.begin());
+        }
 
         return true;
     }
@@ -352,6 +375,8 @@ namespace GGUFMeta {
         return get_arr(llm_kv(kid), result, required);
     }
 
+    template bool llama_model_loader::get_arr<std::vector<std::string>>(enum llm_kv kid, std::vector<std::string> & result, bool required);
+
     template<typename T>
     bool llama_model_loader::get_key(const std::string & key, T & result, bool required) {
         auto it = kv_overrides.find(key);
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 964dcd24753f5..35a39496bccfb 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -425,22 +425,13 @@ void llama_model::load_hparams(llama_model_loader & ml) {
 
     // get metadata as string
     for (int i = 0; i < gguf_get_n_kv(ctx); i++) {
-        const char * name = gguf_get_key(ctx, i);
         gguf_type type = gguf_get_kv_type(ctx, i);
-
         if (type == GGUF_TYPE_ARRAY) {
-            if (LLM_KV(arch)(LLM_KV_CLASSIFIER_OUTPUT_LABELS) == name) {
-                const size_t n_items = gguf_get_arr_n(ctx, i);
-
-                for (size_t j = 0; j < n_items; j++) {
-                    const std::string value = gguf_get_arr_str(ctx, i, j);
-                    classifier_labels.emplace_back(value);
-                }
-            }
-        } else {
-            const std::string value = gguf_kv_to_str(ctx, i);
-            gguf_kv.emplace(name, value);
+            continue;
         }
+        const char * name = gguf_get_key(ctx, i);
+        const std::string value = gguf_kv_to_str(ctx, i);
+        gguf_kv.emplace(name, value);
     }
 
     // get general kv
@@ -553,6 +544,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
     ml.get_key(LLM_KV_VOCAB_SIZE, n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, n_vocab, false);
 
     // for classifier models
+    ml.get_arr(LLM_KV_CLASSIFIER_OUTPUT_LABELS, classifier_labels, false);
     if (!classifier_labels.empty()) {
         hparams.n_cls_out = classifier_labels.size();
     }
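Note: the string case needs its own branch because GGUF stores string arrays as length-prefixed records rather than a flat T[], so elements must be fetched one at a time via gguf_get_arr_str() instead of bulk-copied from arr_info.data. The if constexpr dispatch keeps both strategies in one template without instantiating the invalid bulk copy for std::string. A self-contained sketch of the same pattern (illustrative only, not llama.cpp code; read_array and its raw-buffer layout are assumptions):

```cpp
#include <cstddef>
#include <string>
#include <type_traits>
#include <vector>

// Sketch: one template, two copy strategies, chosen at compile time.
// Numeric types are bulk-assigned from a raw buffer; strings are
// converted element by element, like gguf_get_arr_str() in the patch.
template <typename T>
std::vector<T> read_array(const void * data, size_t n) {
    std::vector<T> out;
    if constexpr (std::is_same<T, std::string>::value) {
        auto strs = static_cast<const char * const *>(data); // array of C strings
        for (size_t i = 0; i < n; i++) {
            out.emplace_back(strs[i]);
        }
    } else {
        out.assign(static_cast<const T *>(data), static_cast<const T *>(data) + n);
    }
    return out;
}
```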
From 9aa5d730e57a63cf1784195ec7c9a297108830ed Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?=
Date: Mon, 2 Jun 2025 23:27:09 +0200
Subject: [PATCH 08/11] forgotten variable replacements

---
 src/llama-model-loader.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
index 5214727057bd0..bd9e6da8832b7 100644
--- a/src/llama-model-loader.cpp
+++ b/src/llama-model-loader.cpp
@@ -357,10 +357,10 @@ namespace GGUFMeta {
         }
 
         if constexpr (std::is_same<T, std::string>::value) {
-            const size_t n_items = gguf_get_arr_n(meta.get(), kid);
+            const size_t n_items = gguf_get_arr_n(ctx, kid);
 
             for (size_t i = 0; i < n_items; i++) {
-                const T value = gguf_get_arr_str(meta.get(), kid, i);
+                const T value = gguf_get_arr_str(ctx, kid, i);
                 result[i] = value;
             }
         } else {

From 6bba8ed38f09cf16e0539f633b2d74806df22269 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?=
Date: Tue, 3 Jun 2025 09:54:27 +0200
Subject: [PATCH 09/11] improved comment

[no ci]
---
 include/llama.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/llama.h b/include/llama.h
index 6f6b19f3dfdc6..a4263841423da 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -923,7 +923,7 @@ extern "C" {
     // Get the embeddings for a sequence id
     // Returns NULL if pooling_type is LLAMA_POOLING_TYPE_NONE
-    // when pooling_type == LLAMA_POOLING_TYPE_RANK, returns float[i] with the rank(s) of the sequence
+    // when pooling_type == LLAMA_POOLING_TYPE_RANK, returns float[n_cls_out] with the rank(s) of the sequence
     // otherwise: float[n_embd] (1-dimensional)
     LLAMA_API float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id);

From 7443156a9141f255c451a313ece957d20ee53478 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?=
Date: Thu, 5 Jun 2025 17:16:04 +0200
Subject: [PATCH 10/11] logging and minor changes

---
 examples/embedding/embedding.cpp | 4 ++--
 src/llama-model.cpp              | 9 +++++++++
 src/llama-model.h                | 6 +++---
 3 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp
index 50bafabc8bd9f..f368716d519fe 100644
--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@@ -242,8 +242,8 @@ int main(int argc, char ** argv) {
 
         for (uint32_t i = 0; i < n_cls_out; i++) {
             const char * label = llama_model_cls_label(model, i);
-            const std::string label_i = label == nullptr || strlen(label) == 0 ? std::to_string(i) : label;
-            cls_out_labels.emplace_back(label_i);
+            const std::string label_i(label == nullptr ? "" : label);
+            cls_out_labels.emplace_back(label_i.empty() ? std::to_string(i) : label_i);
         }
 
         for (int j = 0; j < n_embd_count; j++) {
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 35a39496bccfb..eb97019a0f928 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -4361,6 +4361,15 @@ void llama_model::print_info() const {
         LLAMA_LOG_INFO("%s: ssm_d_state      = %u\n", __func__, hparams.ssm_d_state);
         LLAMA_LOG_INFO("%s: ssm_dt_rank      = %u\n", __func__, hparams.ssm_dt_rank);
         LLAMA_LOG_INFO("%s: ssm_dt_b_c_rms   = %d\n", __func__, hparams.ssm_dt_b_c_rms);
+
+        if (!classifier_labels.empty()) {
+            LLAMA_LOG_INFO("%s: n_cls_out        = %u\n", __func__, hparams.n_cls_out);
+
+            size_t i = 0;
+            for (auto label : classifier_labels) {
+                LLAMA_LOG_INFO("%s: cls_label[%2zu]    = %s\n", __func__, i++, label.c_str());
+            }
+        }
     }
 
     LLAMA_LOG_INFO("%s: model type       = %s\n", __func__, type_name().c_str());
diff --git a/src/llama-model.h b/src/llama-model.h
index 4f0b5c36fb9b9..18b714620bbcf 100644
--- a/src/llama-model.h
+++ b/src/llama-model.h
@@ -329,6 +329,9 @@ struct llama_model {
     llama_hparams hparams = {};
     llama_vocab   vocab;
 
+    // for classifier models
+    std::vector<std::string> classifier_labels;
+
     struct ggml_tensor * tok_embd  = nullptr;
     struct ggml_tensor * type_embd = nullptr;
     struct ggml_tensor * pos_embd  = nullptr;
@@ -363,9 +366,6 @@ struct llama_model {
     // for quantize-stats only
     std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;
 
-    // for classifier models
-    std::vector<std::string> classifier_labels;
-
     int64_t t_load_us = 0;
     int64_t t_start_us = 0;
From 67c4cd21d5a2de94c8f04b985d7e55c2999a32fd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?=
Date: Thu, 5 Jun 2025 17:23:05 +0200
Subject: [PATCH 11/11] forgot to remove cstring

---
 examples/embedding/embedding.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp
index f368716d519fe..8bef7f8f6ba25 100644
--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@@ -4,7 +4,6 @@
 #include "llama.h"
 
 #include <ctime>
-#include <cstring>
 #include <algorithm>
 
 #if defined(_MSC_VER)