
Commit d256aa0

tmp
1 parent 7e4cae5 commit d256aa0

4 files changed (+5 −5 lines changed)


ggml/src/ggml-cuda/ggml-cuda.cu

Lines changed: 1 addition & 1 deletion
@@ -3248,7 +3248,7 @@ static int64_t get_op_batch_size(const ggml_tensor * op) {
 }
 
 static bool ggml_backend_cuda_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
-    const int min_batch_size = 32;
+    const int min_batch_size = 9999999;
 
     return get_op_batch_size(op) >= min_batch_size;
 }
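In effect, raising min_batch_size from 32 to 9999999 makes the batch-size check unsatisfiable, so the device never approves offloading an op. A minimal standalone sketch of the patched predicate (illustrative only, not the real ggml backend code):

#include <cstdint>
#include <cstdio>

// The patched predicate in isolation: no realistic batch size reaches the threshold.
static bool offload_op(int64_t op_batch_size) {
    const int min_batch_size = 9999999; // patched value; upstream uses 32
    return op_batch_size >= min_batch_size;
}

int main() {
    std::printf("batch 32   -> %d\n", offload_op(32));   // 0 (upstream: 1)
    std::printf("batch 4096 -> %d\n", offload_op(4096)); // 0
}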

src/llama-kv-cache.cpp

Lines changed: 2 additions & 2 deletions
@@ -96,8 +96,8 @@ bool llama_kv_cache_init(
            return false;
        }
 
-        ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size);
-        ggml_tensor * v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*kv_size);
+        ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, 1);
+        ggml_tensor * v = ggml_new_tensor_1d(ctx, type_v, 1);
         ggml_format_name(k, "cache_k_l%d", i);
         ggml_format_name(v, "cache_v_l%d", i);
         cache.k_l.push_back(k);
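The patch shrinks each layer's K and V cache tensors from n_embd_{k,v}_gqa*kv_size elements to a single placeholder element. A rough per-layer sizing sketch of what the original line allocates (the dimension values below are assumptions for illustration, not taken from any particular model):

#include <cstdint>
#include <cstdio>

int main() {
    const int64_t n_embd_k_gqa = 1024; // assumed K embedding width per layer
    const int64_t kv_size      = 4096; // assumed cache length in tokens
    const int64_t elem_bytes   = 2;    // e.g. a 16-bit cache type

    std::printf("original: %lld bytes per layer\n",
                (long long)(n_embd_k_gqa * kv_size * elem_bytes)); // 8 MiB
    std::printf("patched : %lld bytes per layer\n",
                (long long)(1 * elem_bytes));                      // 2 bytes
}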

src/llama-quant.cpp

Lines changed: 1 addition & 1 deletion
@@ -776,7 +776,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
 
        // get more optimal quantization type based on the tensor shape, layer, etc.
        if (!params->pure && ggml_is_quantized(default_type)) {
-            new_type = llama_tensor_get_type(qs, new_type, tensor, ftype);
+            new_type = name.find("_exps") != std::string::npos ? name.find("ffn_down") != std::string::npos ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K : GGML_TYPE_BF16;
        }
        if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) {
            new_type = params->token_embedding_type;
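The new right-hand side is a nested ternary keyed on the tensor name: MoE expert tensors (name containing "_exps") get Q6_K for ffn_down projections and Q5_K otherwise, while every other quantized tensor is forced to BF16. A readable restatement of that mapping (sketch with stand-in enum values, not the GGML type enum; the tensor names in main are illustrative):

#include <cstdio>
#include <string>

enum sketch_type { Q5_K, Q6_K, BF16 }; // stand-ins for GGML_TYPE_*

static sketch_type pick_type(const std::string & name) {
    if (name.find("_exps") != std::string::npos) {
        // MoE expert tensors: down-projections get the larger Q6_K
        return name.find("ffn_down") != std::string::npos ? Q6_K : Q5_K;
    }
    return BF16; // everything else bypasses llama_tensor_get_type entirely
}

int main() {
    std::printf("%d %d %d\n",
                pick_type("blk.0.ffn_down_exps.weight"), // Q6_K
                pick_type("blk.0.ffn_gate_exps.weight"), // Q5_K
                pick_type("blk.0.attn_q.weight"));       // BF16
}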

src/llama.cpp

Lines changed: 1 addition & 1 deletion
@@ -6407,7 +6407,7 @@ struct llm_build_context {
 
        // whether to use n_tokens as the matrix dimension during multiplication or n_head
        // n_tokens is higher during prompt processing, this allows to optimize for this case
-        bool pp_opt = n_tokens > n_head;
+        bool pp_opt = true;
 
        for (int il = 0; il < n_layer; ++il) {
            struct ggml_tensor * inpSA = inpL;
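Upstream chooses pp_opt per batch: true while processing a prompt (n_tokens > n_head), false during single-token decoding. Pinning it to true keeps the prompt-processing matrix layout in both phases. A small sketch of the difference (the head count is an assumed example value):

#include <cstdio>

int main() {
    const int n_head = 32;                   // assumed head count
    const int token_counts[] = {1, 512};     // decode step vs. prompt batch

    for (int n_tokens : token_counts) {
        bool upstream = n_tokens > n_head;   // false for decode, true for prompt
        bool patched  = true;                // always the prompt-processing path
        std::printf("n_tokens=%3d upstream=%d patched=%d\n",
                    n_tokens, upstream, patched);
    }
}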
