@@ -2914,77 +2914,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
29142914
29152915 layer.wkv_a_mqa = create_tensor (tn (LLM_TENSOR_ATTN_KV_A_MQA, " weight" , i), {n_embd, kv_lora_rank + (n_embd_head_qk_rope)}, 0 );
29162916 layer.wkv_b = create_tensor (tn (LLM_TENSOR_ATTN_KV_B, " weight" , i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)}, 0 );
2917- layer.wk_b = create_tensor (tn (LLM_TENSOR_ATTN_K_B, " weight" , i), {n_embd_head_qk_nope, n_head * kv_lora_rank}, 0 );
2918- layer.wv_b = create_tensor (tn (LLM_TENSOR_ATTN_V_B, " weight" , i), {kv_lora_rank, n_head * n_embd_head_v}, 0 );
2919- if (!layer.wk_b || !layer.wv_b ) {
2920- if (!layer.wkv_b ) {
2921- throw std::runtime_error (" wkv_b must be defined without wk_b and wv_b" );
2922- }
2923-
2924- // select the buffer type for this tensor
2925- buft_list_t * buft_list = pimpl->dev_input .buft_list ;
2926-
2927- ggml_backend_buffer_type_t buft = nullptr ;
2928-
2929- // check overrides
2930- if (ml.tensor_buft_overrides ) {
2931- std::string tensor_name = " blk." + std::to_string (i) +" .attn_kv_b.weight" ;
2932- for (const auto * overrides = ml.tensor_buft_overrides ; overrides->pattern != nullptr ; ++overrides) {
2933- std::regex pattern (overrides->pattern );
2934- if (std::regex_search (tensor_name, pattern)) {
2935- LLAMA_LOG_DEBUG (" tensor %s buffer type overriden to %s\n " , tensor_name.c_str (), ggml_backend_buft_name (overrides->buft ));
2936- buft = overrides->buft ;
2937- break ;
2938- }
2939- }
2940- }
2941-
2942- // avoid using a host buffer when using mmap
2943- auto * buft_dev = ggml_backend_buft_get_device (buft);
2944- if (ml.use_mmap && buft_dev && buft == ggml_backend_dev_host_buffer_type (buft_dev)) {
2945- auto * cpu_dev = ggml_backend_dev_by_type (GGML_BACKEND_DEVICE_TYPE_CPU);
2946- buft = ggml_backend_dev_buffer_type (cpu_dev);
2947- }
2948-
2949- ggml_context * ctx = ctx_for_buft (buft);
2950-
2951- // 反量化 wkv_b
2952- const auto * qtype = ggml_get_type_traits (layer.wkv_b ->type );
2953- std::vector<float > dequantized_wkv_b (layer.wkv_b ->ne [0 ] * layer.wkv_b ->ne [1 ]);
2954- qtype->to_float (layer.wkv_b ->data , dequantized_wkv_b.data (), layer.wkv_b ->ne [0 ] * layer.wkv_b ->ne [1 ]);
2955-
2956- // 创建 wk_b 和 wv_b 张量
2957- auto * wk_b = ggml_new_tensor_2d (ctx, GGML_TYPE_F32, n_embd_head_qk_nope, n_head * kv_lora_rank);
2958- auto * wv_b = ggml_new_tensor_2d (ctx, GGML_TYPE_F32, kv_lora_rank, n_head * n_embd_head_v);
2959-
2960- // 分割 wkv_b 数据来生成 wk_b 和 wv_b
2961- for (int h = 0 ; h < n_head; ++h) {
2962- int k_start = h * (n_embd_head_qk_nope + n_embd_head_v);
2963-
2964- for (int row = 0 ; row < kv_lora_rank; ++row) {
2965- for (int col = 0 ; col < n_embd_head_qk_nope; ++col) {
2966- // 填充 wk_b
2967- int src_idx = row * layer.wkv_b ->ne [0 ] + k_start + col;
2968- GGML_ASSERT (src_idx < dequantized_wkv_b.size ());
2969- int dst_row = h * kv_lora_rank + row;
2970- int dst_col = col;
2971- ((float *)wk_b->data )[dst_row * n_embd_head_qk_nope + dst_col] = dequantized_wkv_b[src_idx];
2972- }
2973-
2974- for (int col = 0 ; col < n_embd_head_v; ++col) {
2975- // 填充 wv_b
2976- int src_idx = row * layer.wkv_b ->ne [0 ] + k_start + n_embd_head_qk_nope + col;
2977- GGML_ASSERT (src_idx < dequantized_wkv_b.size ());
2978- int dst_row = row;
2979- int dst_col = h * n_embd_head_v + col;
2980- ((float *)wv_b->data )[dst_row * n_head * n_embd_head_v + dst_col] = dequantized_wkv_b[src_idx];
2981- }
2982- }
2983- }
2984-
2985- layer.wk_b = ggml_cast (ctx, wk_b, layer.wkv_b ->type );
2986- layer.wv_b = ggml_cast (ctx, wv_b, layer.wkv_b ->type );
2987- }
29882917 layer.wo = create_tensor (tn (LLM_TENSOR_ATTN_OUT, " weight" , i), { n_head * ( n_embd_head_v), n_embd}, 0 );
29892918
29902919 layer.ffn_norm = create_tensor (tn (LLM_TENSOR_FFN_NORM, " weight" , i), {n_embd}, 0 );
0 commit comments