@@ -2948,9 +2948,42 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
29482948
29492949 ggml_context * ctx = ctx_for_buft (buft);
29502950
2951- auto trans_wkv_b = ggml_cont (ctx, ggml_transpose (ctx, layer.wkv_b ));
2952- layer.wk_b = ggml_view_2d (ctx, trans_wkv_b, trans_wkv_b->ne [0 ], n_embd_head_qk_nope, n_head, 0 );
2953- layer.wv_b = ggml_view_2d (ctx, trans_wkv_b, trans_wkv_b->ne [0 ], n_embd_head_v, n_head, n_embd_head_qk_nope * n_head);
2951+ // Dequantize wkv_b into a temporary float buffer holding ne[0] * ne[1] values.
2952+ const auto * qtype = ggml_get_type_traits (layer.wkv_b ->type );
2953+ std::vector<float > dequantized_wkv_b (layer.wkv_b ->ne [0 ] * layer.wkv_b ->ne [1 ]);
2954+ qtype->to_float (layer.wkv_b ->data , dequantized_wkv_b.data (), layer.wkv_b ->ne [0 ] * layer.wkv_b ->ne [1 ]); // NOTE(review): assumes to_float is non-null for this type and that wkv_b->data is already populated - confirm
2955+
2956+ // Create the F32 wk_b and wv_b tensors that receive the two halves of wkv_b.
2957+ auto * wk_b = ggml_new_tensor_2d (ctx, GGML_TYPE_F32, n_embd_head_qk_nope, n_head * kv_lora_rank);
2958+ auto * wv_b = ggml_new_tensor_2d (ctx, GGML_TYPE_F32, kv_lora_rank, n_head * n_embd_head_v);
2959+
2960+ // Split the dequantized wkv_b data, head by head, to fill wk_b and wv_b.
2961+ for (int h = 0 ; h < n_head; ++h) {
2962+ int k_start = h * (n_embd_head_qk_nope + n_embd_head_v); // offset of head h's (qk_nope + v) slice within a source row
2963+
2964+ for (int row = 0 ; row < kv_lora_rank; ++row) {
2965+ for (int col = 0 ; col < n_embd_head_qk_nope; ++col) {
2966+ // Fill wk_b: the first n_embd_head_qk_nope entries of the head's slice go to row (h * kv_lora_rank + row).
2967+ int src_idx = row * layer.wkv_b ->ne [0 ] + k_start + col;
2968+ GGML_ASSERT (src_idx < dequantized_wkv_b.size ()); // upper-bound check only; compares a signed int against size_t
2969+ int dst_row = h * kv_lora_rank + row;
2970+ int dst_col = col;
2971+ ((float *)wk_b->data )[dst_row * n_embd_head_qk_nope + dst_col] = dequantized_wkv_b[src_idx]; // NOTE(review): writes straight into wk_b->data; presumably ctx allocates tensor data - confirm
2972+ }
2973+
2974+ for (int col = 0 ; col < n_embd_head_v; ++col) {
2975+ // Fill wv_b: the remaining n_embd_head_v entries of the head's slice go to column (h * n_embd_head_v + col).
2976+ int src_idx = row * layer.wkv_b ->ne [0 ] + k_start + n_embd_head_qk_nope + col;
2977+ GGML_ASSERT (src_idx < dequantized_wkv_b.size ()); // upper-bound check only; compares a signed int against size_t
2978+ int dst_row = row;
2979+ int dst_col = h * n_embd_head_v + col;
2980+ ((float *)wv_b->data )[dst_row * n_head * n_embd_head_v + dst_col] = dequantized_wkv_b[src_idx]; // NOTE(review): row stride here is n_head * n_embd_head_v, but wv_b was created with first dimension kv_lora_rank - confirm intended layout
2981+ }
2982+ }
2983+ }
2984+
2985+ layer.wk_b = ggml_cast (ctx, wk_b, layer.wkv_b ->type ); // convert back to wkv_b's original type; NOTE(review): confirm ggml_cast yields materialized data at load time
2986+ layer.wv_b = ggml_cast (ctx, wv_b, layer.wkv_b ->type ); // same conversion for wv_b
29542987 }
29552988 layer.wo = create_tensor (tn (LLM_TENSOR_ATTN_OUT, " weight" , i), { n_head * ( n_embd_head_v), n_embd}, 0 );
29562989
0 commit comments