
Commit c2f4dc7 (1 parent: c02f53d)

fix prompt

File tree: 5 files changed (+46, -47 lines)

common/common.cpp

Lines changed: 3 additions & 4 deletions
@@ -905,10 +905,9 @@ struct common_init_result common_init_from_params(common_params & params) {
             ok = false;
         }

-        bool has_eos = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL;
-        bool has_sep = llama_vocab_sep(vocab) != LLAMA_TOKEN_NULL;
-        bool has_rerank_prompt = llama_model_chat_template(model, "rerank_prefix") != NULL ||
-                                 llama_model_chat_template(model, "rerank_suffix") != NULL;
+        bool has_eos = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL;
+        bool has_sep = llama_vocab_sep(vocab) != LLAMA_TOKEN_NULL;
+        bool has_rerank_prompt = llama_model_chat_template(model, "rerank") != NULL;

         if (has_rerank_prompt) {
             // OK, do nothing

convert_hf_to_gguf.py

Lines changed: 4 additions & 5 deletions
@@ -3099,11 +3099,10 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_pooling_type(gguf.PoolingType.RANK)
         self.gguf_writer.add_classifier_output_labels(["yes", "no"])
         self.gguf_writer.add_chat_template([{
-            "name": "rerank_prefix",
-            "template": "<|im_start|>system\nJudge whether the Document meets the requirements based on the Query and the Instruct provided. Note that the answer can only be \"yes\" or \"no\".<|im_end|>\n<|im_start|>user\n",
-        }, {
-            "name": "rerank_suffix",
-            "template": "<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n",
+            "name": "rerank",
+            "template": "<|im_start|>system\nJudge whether the Document meets the requirements based on the Query and the Instruct provided. Note that the answer can only be \"yes\" or \"no\".<|im_end|>\n<|im_start|>user\n"
+                + "<Instruct>: Given a web search query, retrieve relevant passages that answer the query\n<Query>: {query}\n<Document>: {document}\n"
+                + "<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n"
         }])

     def _get_cls_out_tensor(self, data_torch: Tensor) -> Tensor:

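The two templates are merged into a single "rerank" template that carries literal {query} and {document} placeholders, which the server fills in per document (see tools/server/utils.hpp below). A minimal, self-contained sketch of that rendering step, using only the standard library; replace_all here is a hypothetical stand-in for the string_replace_all call used further down:

#include <string>

// hypothetical stand-in for string_replace_all() used in tools/server/utils.hpp
static void replace_all(std::string & s, const std::string & from, const std::string & to) {
    for (size_t pos = 0; (pos = s.find(from, pos)) != std::string::npos; pos += to.size()) {
        s.replace(pos, from.size(), to);
    }
}

// render the stored "rerank" template for one query/document pair
static std::string render_rerank_prompt(std::string tmpl, const std::string & query, const std::string & doc) {
    replace_all(tmpl, "{query}", query);
    replace_all(tmpl, "{document}", doc);
    return tmpl; // chat-formatted prompt, ready to tokenize
}
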
src/llama-graph.cpp

Lines changed: 1 addition & 1 deletion
@@ -1585,7 +1585,7 @@ void llm_graph_context::build_pooling(
             } else if (cls_out) {
                 if (arch == LLM_ARCH_QWEN3) {
                     cur = ggml_mul_mat(ctx0, cls_out, inp);
-                    cur = ggml_soft_max(ctx0, cur); // qwen3 uses softmax on the output
+                    cur = ggml_log(ctx0, ggml_soft_max(ctx0, cur)); // qwen3 uses log_softmax
                 } else {
                     // Single layer classification head (direct projection)
                     // https://github.com/huggingface/transformers/blob/f4fc42216cd56ab6b68270bf80d811614d8d59e4/src/transformers/models/bert/modeling_bert.py#L1476

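The change here only wraps the existing softmax in a log, so the Qwen3 classifier head now emits log-probabilities over the "yes"/"no" labels, i.e. the relevance score becomes log p(yes) rather than p(yes). A minimal sketch in plain C++ (standalone, not ggml) included only to spell out the equivalence log_softmax(x)_i = x_i - log(sum_j exp(x_j)):

#include <algorithm>
#include <cmath>
#include <vector>

// numerically stable log-softmax: subtract the max logit before exponentiating
static std::vector<float> log_softmax(const std::vector<float> & logits) {
    const float max_logit = *std::max_element(logits.begin(), logits.end());
    float sum = 0.0f;
    for (float v : logits) {
        sum += std::exp(v - max_logit);
    }
    std::vector<float> out(logits.size());
    for (size_t i = 0; i < logits.size(); ++i) {
        out[i] = logits[i] - max_logit - std::log(sum);
    }
    return out;
}
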
tools/server/server.cpp

Lines changed: 4 additions & 7 deletions
@@ -4704,22 +4704,19 @@ int main(int argc, char ** argv) {
                 return;
             }

-            llama_tokens tokenized_query = tokenize_input_prompts(ctx_server.vocab, query, /* add_special */ false, true)[0];
-
             // create and queue the task
             json responses = json::array();
             bool error = false;
             std::unordered_set<int> task_ids;
             {
                 std::vector<server_task> tasks;
-                auto tokenized_docs = tokenize_input_prompts(ctx_server.vocab, documents, /* add_special */ false, true);
-                tasks.reserve(tokenized_docs.size());
-                for (size_t i = 0; i < tokenized_docs.size(); i++) {
-                    auto tmp = format_rerank(ctx_server.model, tokenized_query, tokenized_docs[i]);
+                auto inputs = tokenize_rerank(ctx_server.model, query, documents);
+                tasks.reserve(documents.size());
+                for (size_t i = 0; i < inputs.size(); i++) {
                     server_task task = server_task(SERVER_TASK_TYPE_RERANK);
                     task.id = ctx_server.queue_tasks.get_new_id();
                     task.index = i;
-                    task.prompt_tokens = server_tokens(tmp, ctx_server.mctx != nullptr);
+                    task.prompt_tokens = server_tokens(inputs[i], ctx_server.mctx != nullptr);
                     tasks.push_back(std::move(task));
                 }

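For context, query and documents in this handler come straight from the JSON body of the rerank request, so the path above is exercised by a request of roughly the following shape. The endpoint path and any optional fields (such as top_n) are assumptions and not part of this diff; only the "query" and "documents" fields correspond to the handler variables shown above:

POST /v1/rerank
{
    "query": "what is panda?",
    "documents": [
        "The giant panda is a bear species endemic to China.",
        "Paris is the capital of France."
    ]
}
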
tools/server/utils.hpp

Lines changed: 34 additions & 30 deletions
@@ -260,43 +260,47 @@ static size_t validate_utf8(const std::string& text) {
 // template utils
 //

-// format rerank task:
+// format and tokenize rerank task:
 // - using SEP token: [BOS]query[EOS][SEP]doc[EOS]
 // - using prompt: <rerank_prefix>query<rerank_suffix>doc
-static llama_tokens format_rerank(const struct llama_model * model, const llama_tokens & query, const llama_tokens & doc) {
+static std::vector<llama_tokens> tokenize_rerank(const struct llama_model * model, const std::string & query, const std::vector<std::string> & documents) {
     const llama_vocab * vocab = llama_model_get_vocab(model);
-    llama_tokens result;
+    std::vector<llama_tokens> result;

-    if (llama_vocab_sep(vocab) != LLAMA_TOKEN_NULL) {
-        // Get EOS token - use SEP token as fallback if EOS is not available
-        llama_token eos_token = llama_vocab_eos(vocab);
-        if (eos_token == LLAMA_TOKEN_NULL) {
-            eos_token = llama_vocab_sep(vocab);
-        }
+    for (const auto & doc : documents) {
+        if (llama_vocab_sep(vocab) != LLAMA_TOKEN_NULL) {
+            // Get EOS token - use SEP token as fallback if EOS is not available
+            llama_tokens tok;
+            llama_tokens tok_query = common_tokenize(vocab, query, false, false);
+            llama_tokens tok_doc   = common_tokenize(vocab, doc, false, false);
+            llama_token eos_token = llama_vocab_eos(vocab);
+            if (eos_token == LLAMA_TOKEN_NULL) {
+                eos_token = llama_vocab_sep(vocab);
+            }

-        result.reserve(doc.size() + query.size() + 4);
-        result.push_back(llama_vocab_bos(vocab));
-        result.insert(result.end(), query.begin(), query.end());
-        result.push_back(eos_token);
-        result.push_back(llama_vocab_sep(vocab));
-        result.insert(result.end(), doc.begin(), doc.end());
-        result.push_back(eos_token);
-    } else {
-        // using prompt template
-        const char * prefix = llama_model_chat_template(model, "rerank_prefix");
-        const char * suffix = llama_model_chat_template(model, "rerank_suffix");
+            tok.reserve(doc.size() + query.size() + 4);
+            tok.push_back(llama_vocab_bos(vocab));
+            tok.insert(tok.end(), tok_query.begin(), tok_query.end());
+            tok.push_back(eos_token);
+            tok.push_back(llama_vocab_sep(vocab));
+            tok.insert(tok.end(), tok_doc.begin(), tok_doc.end());
+            tok.push_back(eos_token);

-        if (prefix == NULL && suffix == NULL) {
-            throw std::runtime_error("Rerank prompt template not found in the model\n");
-        }
+            result.push_back(std::move(tok));
+        } else {
+            // using prompt template
+            const char * tmpl = llama_model_chat_template(model, "rerank");
+            if (tmpl == nullptr) {
+                throw std::runtime_error("model does not have rerank template");
+            }

-        const llama_tokens prefix_tokens = prefix ? common_tokenize(vocab, prefix, true, false) : llama_tokens();
-        const llama_tokens suffix_tokens = suffix ? common_tokenize(vocab, suffix, false, false) : llama_tokens();
-        result.reserve(prefix_tokens.size() + query.size() + suffix_tokens.size() + doc.size());
-        result.insert(result.end(), prefix_tokens.begin(), prefix_tokens.end());
-        result.insert(result.end(), query.begin(), query.end());
-        result.insert(result.end(), suffix_tokens.begin(), suffix_tokens.end());
-        result.insert(result.end(), doc.begin(), doc.end());
+            std::string prompt = tmpl;
+            // TODO: may not be efficient to call string_replace_all twice
+            string_replace_all(prompt, "{query}", query);
+            string_replace_all(prompt, "{document}", doc);
+            llama_tokens tok = common_tokenize(vocab, prompt, true, false);
+            result.push_back(std::move(tok));
+        }
     }

     return result;

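A hypothetical usage sketch of the new helper, assuming model points at an already loaded reranking model (either a SEP-token style reranker or one converted with the "rerank" template above); the query and document strings are made up for illustration:

std::vector<std::string> documents = {
    "llama.cpp is a C/C++ LLM inference library.",
    "Bananas are rich in potassium.",
};
std::vector<llama_tokens> inputs = tokenize_rerank(model, "what is llama.cpp?", documents);
// one tokenized prompt per document, in the same order as documents:
// either [BOS]query[EOS][SEP]doc[EOS] or the "rerank" chat template with
// {query}/{document} filled in, depending on what the model provides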