
Commit 19536ca

Merge branch 'concedo_exp_b6031' into crokeso
2 parents: 9e77547 + 9e3cec2

File tree

8 files changed: +8196 -71 lines


common/chat.cpp

Lines changed: 3 additions & 1 deletion
@@ -1944,6 +1944,8 @@ common_chat_msg common_chat_parse(const std::string & input, bool is_partial, co
         }
     }
     auto msg = builder.result();
-    LOG_DBG("Parsed message: %s\n", common_chat_msgs_to_json_oaicompat<json>({msg}).at(0).dump().c_str());
+    if (!is_partial) {
+        LOG_DBG("Parsed message: %s\n", common_chat_msgs_to_json_oaicompat<json>({msg}).at(0).dump().c_str());
+    }
     return msg;
 }

docs/ops/CANN.csv

Lines changed: 8133 additions & 0 deletions
Large diffs are not rendered by default.

examples/embedding/embedding.cpp

Lines changed: 8 additions & 0 deletions
@@ -81,6 +81,14 @@ int main(int argc, char ** argv) {

     params.embedding = true;

+    // if the number of prompts that would be encoded is known in advance, it's more efficient to specify the
+    // --parallel argument accordingly. for convenience, if not specified, we fallback to unified KV cache
+    // in order to support any number of prompts
+    if (params.n_parallel == 1) {
+        LOG_INF("%s: n_parallel == 1 -> unified KV cache is enabled\n", __func__);
+        params.kv_unified = true;
+    }
+
     // utilize the full context
     if (params.n_batch < params.n_ctx) {
         LOG_WRN("%s: setting batch size to %d\n", __func__, params.n_ctx);
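The added block is self-contained: when the caller has not sized the KV cache for a known number of prompts via --parallel, the example falls back to a unified KV cache. A minimal sketch of that fallback in isolation, assuming a stripped-down params struct with only the two fields the diff touches (the real common_params carries many more):

    #include <cstdio>

    struct embd_params {
        int  n_parallel = 1;    // number of prompts encoded in parallel (--parallel)
        bool kv_unified = false;
    };

    // If no explicit parallelism was requested, enable the unified KV cache so
    // that any number of prompts can still be encoded, at some efficiency cost.
    static void apply_kv_fallback(embd_params & params) {
        if (params.n_parallel == 1) {
            printf("n_parallel == 1 -> unified KV cache is enabled\n");
            params.kv_unified = true;
        }
    }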

klite.embd

Lines changed: 19 additions & 29 deletions
@@ -11363,6 +11363,8 @@ Current version indicated by LITEVER below.
         }
     }

+    autofetch_attempt_dict[desired_oai_ep] = true;
+
     let dropdown = get_custom_ep_model_dropdown();
     fetch((desired_oai_ep + oai_models_endpoint), {
         method: 'GET',
@@ -11437,22 +11439,18 @@ Current version indicated by LITEVER below.
             }
         }

-        let openrouter_fetch_attempted = false;
-        let oai_custom_fetch_attempted = false;
+        let autofetch_attempt_dict = {};
         function try_fetch_oai_models_auto()
         {
+            let targetep = document.getElementById("custom_oai_endpoint").value;
             //only for apis that don't gate the model list
-            if (document.getElementById("custom_oai_endpoint").value!="" &&
-            document.getElementById("custom_oai_endpoint").value.toLowerCase().includes("featherless.ai"))
+            if (targetep!="")
             {
-                if(!oai_custom_fetch_attempted)
+                if(!autofetch_attempt_dict[targetep])
                 {
-                    oai_custom_fetch_attempted = true;
-                    let dropdown = document.getElementById("custom_oai_model");
-                    if(dropdown.options.length < 40)
-                    {
-                        oai_fetch_models(); //autofetch models
-                    }
+                    autofetch_attempt_dict[targetep] = true;
+                    let dropdown = get_custom_ep_model_dropdown();
+                    oai_fetch_models(); //autofetch models
                 }
             }
         }
@@ -11509,6 +11507,14 @@ Current version indicated by LITEVER below.
             } else {
                 document.getElementById("custom_oai_endpoint").value = (localsettings.saved_oai_addr ? localsettings.saved_oai_addr : default_oai_base);
             }
+        }
+        else if(epchoice==3) //openrouter supports autofetch
+        {
+            document.getElementById("openrouterdesc").classList.remove("hidden");
+            document.getElementById("custom_openrouter_model").classList.remove("hidden");
+            document.getElementById("openrouterproviderbox").classList.remove("hidden");
+            document.getElementById("custom_oai_endpoint").value = default_openrouter_base;
+            document.getElementById("custom_oai_key").value =(localsettings.saved_openrouter_key==dummy_api_key?"":localsettings.saved_openrouter_key);
             try_fetch_oai_models_auto();
         }
         else if(epchoice==7)
@@ -11540,23 +11546,7 @@ Current version indicated by LITEVER below.
             document.getElementById("custom_oai_key").value = dummy_api_key;
             document.getElementById("custom_oai_endpoint").value = pollinations_text_endpoint;
             document.getElementById("custom_oai_key").classList.add("hidden");
-        }
-        else //openrouter supports autofetch
-        {
-            document.getElementById("openrouterdesc").classList.remove("hidden");
-            document.getElementById("custom_openrouter_model").classList.remove("hidden");
-            document.getElementById("openrouterproviderbox").classList.remove("hidden");
-            document.getElementById("custom_oai_endpoint").value = default_openrouter_base;
-            document.getElementById("custom_oai_key").value =(localsettings.saved_openrouter_key==dummy_api_key?"":localsettings.saved_openrouter_key);
-            if(!openrouter_fetch_attempted)
-            {
-                openrouter_fetch_attempted = true;
-                let dropdown = document.getElementById("custom_openrouter_model");
-                if(dropdown.options.length < 10)
-                {
-                    oai_fetch_models(); //autofetch openrouter models
-                }
-            }
+            try_fetch_oai_models_auto();
         }
         oai_model_change(ep_should_always_use_chat_completions() || force_autotoggle_chatcompl);
         toggleoaichatcompl();
@@ -26911,7 +26901,7 @@ Current version indicated by LITEVER below.
                 <span class="color_green" style="font-weight: bold;">No Key Required.</span><br><br>
             </span>

-            <input class="form-control" type="text" id="custom_oai_endpoint" placeholder="OpenAI API URL" value="" onblur="try_fetch_oai_models_auto()">
+            <input class="form-control" type="text" id="custom_oai_endpoint" placeholder="OpenAI API URL" value="" onblur="">
             <input class="form-control" type="password" id="custom_oai_key" placeholder="API Key (Required)" value="" onfocus="focus_api_keys()" onblur="blur_api_keys()"><br>
             Model Choice:<br>
             <select title="OpenAI Model Selection" style="padding:4px;display:inline;width:calc(100% - 220px)" class="form-control" id="custom_oai_model" onchange="oai_model_change(true)">
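The two per-provider booleans are replaced by a single dictionary keyed on the endpoint URL, so the automatic model-list fetch is attempted at most once per endpoint rather than once per hard-coded provider. The pattern is language-agnostic; a small sketch in C++ for consistency with the other examples here (hypothetical helper, the original is JavaScript in klite.embd):

    #include <string>
    #include <unordered_set>

    // Remember which endpoints were already tried, so each endpoint triggers
    // at most one automatic model-list fetch.
    static std::unordered_set<std::string> autofetch_attempted;

    static bool should_autofetch(const std::string & endpoint) {
        if (endpoint.empty() || autofetch_attempted.count(endpoint) > 0) {
            return false;
        }
        autofetch_attempted.insert(endpoint);
        return true; // caller performs the fetch exactly once for this endpoint
    }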

koboldcpp.py

Lines changed: 1 addition & 1 deletion
@@ -3869,7 +3869,7 @@ def transform_genparams(genparams, api_format):
                         attachedaudid += 1
                         messages_string += f"\n(Attached Audio {attachedaudid})\n"
             # If last message, add any tools calls after message content and before message end token if any
-            if (message['role'] == "user" or message['role'] == "tool") and message_index == len(messages_array):
+            if message_index == len(messages_array):
                 used_tool_json = determine_tool_json_to_use(genparams, messages_string, assistant_message_start, (message['role'] == "tool"))

                 if used_tool_json:

src/llama-batch.cpp

Lines changed: 1 addition & 1 deletion
@@ -59,7 +59,7 @@ bool llama_batch_allocr::init(
     for (int32_t i = 0; i < batch.n_tokens; ++i) {
         for (int32_t s = 0; s < batch.n_seq_id[i]; ++s) {
             if (batch.seq_id && (batch.seq_id[i][s] < 0 || batch.seq_id[i][s] >= (llama_seq_id) n_seq_max)) {
-                LLAMA_LOG_ERROR("%s: invalid seq_id[%d][%d] = %d > %d\n", __func__, i, s, batch.seq_id[i][s], (llama_seq_id) n_seq_max);
+                LLAMA_LOG_ERROR("%s: invalid seq_id[%d][%d] = %d >= %d\n", __func__, i, s, batch.seq_id[i][s], (llama_seq_id) n_seq_max);
                 return false;
             }
         }
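The message now matches the check it reports on: a sequence id is only valid in the half-open range [0, n_seq_max). A minimal standalone sketch of the same validation (hypothetical helper, not the llama.cpp API):

    #include <cstdint>
    #include <cstdio>

    // A sequence id must lie in [0, n_seq_max); anything else is rejected,
    // and the diagnostic mirrors the ">=" comparison used in the check.
    static bool seq_id_valid(int32_t seq_id, int32_t n_seq_max) {
        if (seq_id < 0 || seq_id >= n_seq_max) {
            fprintf(stderr, "invalid seq_id = %d >= %d\n", seq_id, n_seq_max);
            return false;
        }
        return true;
    }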

src/llama-graph.cpp

Lines changed: 16 additions & 27 deletions
@@ -188,38 +188,23 @@ void llm_graph_input_mean::set_input(const llama_ubatch * ubatch) {

 void llm_graph_input_cls::set_input(const llama_ubatch * ubatch) {
     const int64_t n_tokens = ubatch->n_tokens;
-    const int64_t n_seq_tokens = ubatch->n_seq_tokens;
     const int64_t n_seqs_unq = ubatch->n_seqs_unq;

     if (cparams.embeddings && (
-            cparams.pooling_type == LLAMA_POOLING_TYPE_CLS ||
-            cparams.pooling_type == LLAMA_POOLING_TYPE_RANK
-            )) {
+                cparams.pooling_type == LLAMA_POOLING_TYPE_CLS ||
+                cparams.pooling_type == LLAMA_POOLING_TYPE_RANK ||
+                cparams.pooling_type == LLAMA_POOLING_TYPE_LAST
+            )) {
         GGML_ASSERT(cls);
         GGML_ASSERT(ggml_backend_buffer_is_host(cls->buffer));

         uint32_t * data = (uint32_t *) cls->data;
         memset(cls->data, 0, n_seqs_unq*ggml_element_size(cls));

-        for (int i = 0; i < n_tokens; i += n_seq_tokens) {
-            for (int s = 0; s < ubatch->n_seq_id[i]; ++s) {
-                const llama_seq_id seq_id = ubatch->seq_id[i][s];
-                const int32_t seq_idx = ubatch->seq_idx[seq_id];
-
-                data[seq_idx] = i;
-            }
-        }
-    }
-
-    if (cparams.embeddings && cparams.pooling_type == LLAMA_POOLING_TYPE_LAST) {
-        GGML_ASSERT(cls);
-        GGML_ASSERT(ggml_backend_buffer_is_host(cls->buffer));
-
-        uint32_t * data = (uint32_t *) cls->data;
-        memset(cls->data, 0, n_seqs_unq*ggml_element_size(cls));
+        std::vector<int> target_pos(n_seqs_unq, -1);
+        std::vector<int> target_row(n_seqs_unq, -1);

-        std::vector<int> last_pos(n_seqs_unq, -1);
-        std::vector<int> last_row(n_seqs_unq, -1);
+        bool last = cparams.pooling_type == LLAMA_POOLING_TYPE_LAST;

         for (int i = 0; i < n_tokens; ++i) {
             const llama_pos pos = ubatch->pos[i];
@@ -228,16 +213,20 @@ void llm_graph_input_cls::set_input(const llama_ubatch * ubatch) {
                 const llama_seq_id seq_id = ubatch->seq_id[i][s];
                 const int32_t seq_idx = ubatch->seq_idx[seq_id];

-                if (pos >= last_pos[seq_idx]) {
-                    last_pos[seq_idx] = pos;
-                    last_row[seq_idx] = i;
+                if (
+                    (target_pos[seq_idx] == -1) ||
+                    ( last && pos >= target_pos[seq_idx]) ||
+                    (!last && pos <  target_pos[seq_idx])
+                ) {
+                    target_pos[seq_idx] = pos;
+                    target_row[seq_idx] = i;
                 }
             }
         }

         for (int s = 0; s < n_seqs_unq; ++s) {
-            if (last_row[s] >= 0) {
-                data[s] = last_row[s];
+            if (target_row[s] >= 0) {
+                data[s] = target_row[s];
             }
         }
     }
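The refactor collapses the separate CLS/RANK and LAST code paths into one loop: for each sequence it records the row of either the lowest-position token (CLS/RANK) or the highest-position token (LAST). A standalone sketch of that selection, with plain vectors standing in for the llama_ubatch fields:

    #include <cstdint>
    #include <vector>

    // For each sequence, return the row index of its first token (last == false,
    // as used for CLS/RANK pooling) or its last token (last == true).
    static std::vector<int> select_target_rows(const std::vector<int32_t> & pos,
                                               const std::vector<int32_t> & seq_idx,
                                               int n_seqs, bool last) {
        std::vector<int32_t> target_pos(n_seqs, -1);
        std::vector<int>     target_row(n_seqs, -1);

        for (int i = 0; i < (int) pos.size(); ++i) {
            const int s = seq_idx[i];
            if (target_pos[s] == -1 ||
                ( last && pos[i] >= target_pos[s]) ||
                (!last && pos[i] <  target_pos[s])) {
                target_pos[s] = pos[i];
                target_row[s] = i;
            }
        }
        return target_row;
    }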

src/llama-graph.h

Lines changed: 15 additions & 12 deletions
@@ -144,7 +144,7 @@ class llm_graph_input_pos_bucket : public llm_graph_input_i {

     ggml_tensor * pos_bucket = nullptr; // I32 [n_batch, n_batch]

-    const llama_hparams & hparams;
+    const llama_hparams hparams;
 };

 class llm_graph_input_pos_bucket_kv : public llm_graph_input_i {
@@ -158,7 +158,7 @@ class llm_graph_input_pos_bucket_kv : public llm_graph_input_i {

     ggml_tensor * pos_bucket = nullptr; // I32 [n_kv, n_batch]

-    const llama_hparams & hparams;
+    const llama_hparams hparams;

     const llama_kv_cache_unified_context * mctx;
 };
@@ -177,8 +177,8 @@ class llm_graph_input_out_ids : public llm_graph_input_i {

     ggml_tensor * out_ids; // I32 [n_outputs]

-    const llama_hparams & hparams;
-    const llama_cparams & cparams;
+    const llama_hparams hparams;
+    const llama_cparams cparams;

     const uint32_t n_outputs;
 };
@@ -192,7 +192,7 @@ class llm_graph_input_mean : public llm_graph_input_i {

     ggml_tensor * mean; // F32 [n_batch, n_batch]

-    const llama_cparams & cparams;
+    const llama_cparams cparams;
 };

 class llm_graph_input_cls : public llm_graph_input_i {
@@ -204,7 +204,7 @@ class llm_graph_input_cls : public llm_graph_input_i {

     ggml_tensor * cls; // I32 [n_batch]

-    const llama_cparams & cparams;
+    const llama_cparams cparams;
 };

 class llm_graph_input_rs : public llm_graph_input_i {
@@ -247,8 +247,8 @@ class llm_graph_input_attn_no_cache : public llm_graph_input_i {
     ggml_tensor * kq_mask = nullptr;     // F32 [n_tokens, n_batch, 1, 1]
     ggml_tensor * kq_mask_cnv = nullptr; //     [n_tokens, n_batch, 1, 1]

-    const llama_hparams & hparams;
-    const llama_cparams & cparams;
+    const llama_hparams hparams;
+    const llama_cparams cparams;
 };

 class llm_graph_input_attn_kv_unified : public llm_graph_input_i {
@@ -278,8 +278,11 @@ class llm_graph_input_attn_kv_unified : public llm_graph_input_i {
     ggml_tensor * self_kq_mask = nullptr;     // F32 [n_kv, n_batch/n_stream, 1, n_stream]
     ggml_tensor * self_kq_mask_cnv = nullptr; //     [n_kv, n_batch/n_stream, 1, n_stream]

-    const llama_hparams & hparams;
-    const llama_cparams & cparams;
+    // note: these have to be copies because in order to be able to reuse a graph, its inputs
+    // need to carry these parameters with them. otherwise, they can point to freed
+    // llm_graph_params from a previous batch, causing stack-use-after-return
+    const llama_hparams hparams;
+    const llama_cparams cparams;

     const llama_kv_cache_unified_context * mctx;
 };
@@ -318,8 +321,8 @@ class llm_graph_input_attn_kv_unified_iswa : public llm_graph_input_i {
     ggml_tensor * self_kq_mask_swa = nullptr;     // F32 [n_kv, n_batch/n_stream, 1, n_stream]
     ggml_tensor * self_kq_mask_swa_cnv = nullptr; //     [n_kv, n_batch/n_stream, 1, n_stream]

-    const llama_hparams & hparams;
-    const llama_cparams & cparams;
+    const llama_hparams hparams;
+    const llama_cparams cparams;

     const llama_kv_cache_unified_iswa_context * mctx;
 };
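The note added in the second-to-last hunk is the reason for every "&" removed in this file: a graph input that stores references to hparams/cparams owned by a transient llm_graph_params can outlive them when the graph is reused, leaving dangling references. A compact illustration of that hazard, using toy types rather than the llama.cpp classes:

    // Toy types only: storing a reference ties the input's lifetime to the
    // params object it was built from; storing a copy does not.
    struct toy_params { int n_ctx; };

    struct input_by_ref  { const toy_params & p; }; // dangles if params was temporary
    struct input_by_copy { const toy_params   p; }; // safe: carries its own copy

    input_by_copy make_reusable_input() {
        toy_params local{4096};       // destroyed when this function returns
        return input_by_copy{local};  // the copy keeps n_ctx valid for later graph reuse
    }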
