
Commit 751ff03

fix
1 parent ab9a13a commit 751ff03

File tree

1 file changed: +36 −3 lines changed


src/llama-model.cpp

Lines changed: 36 additions & 3 deletions
@@ -2948,9 +2948,42 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
 
                         ggml_context * ctx = ctx_for_buft(buft);
 
-                        auto trans_wkv_b = ggml_cont(ctx, ggml_transpose(ctx, layer.wkv_b));
-                        layer.wk_b = ggml_view_2d(ctx, trans_wkv_b, trans_wkv_b->ne[0], n_embd_head_qk_nope, n_head, 0);
-                        layer.wv_b = ggml_view_2d(ctx, trans_wkv_b, trans_wkv_b->ne[0], n_embd_head_v, n_head, n_embd_head_qk_nope * n_head);
+                        // dequantize wkv_b
+                        const auto * qtype = ggml_get_type_traits(layer.wkv_b->type);
+                        std::vector<float> dequantized_wkv_b(layer.wkv_b->ne[0] * layer.wkv_b->ne[1]);
+                        qtype->to_float(layer.wkv_b->data, dequantized_wkv_b.data(), layer.wkv_b->ne[0] * layer.wkv_b->ne[1]);
+
+                        // create the wk_b and wv_b tensors
+                        auto * wk_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd_head_qk_nope, n_head * kv_lora_rank);
+                        auto * wv_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, kv_lora_rank, n_head * n_embd_head_v);
+
+                        // split the wkv_b data to build wk_b and wv_b
+                        for (int h = 0; h < n_head; ++h) {
+                            int k_start = h * (n_embd_head_qk_nope + n_embd_head_v);
+
+                            for (int row = 0; row < kv_lora_rank; ++row) {
+                                for (int col = 0; col < n_embd_head_qk_nope; ++col) {
+                                    // fill wk_b
+                                    int src_idx = row * layer.wkv_b->ne[0] + k_start + col;
+                                    GGML_ASSERT(src_idx < dequantized_wkv_b.size());
+                                    int dst_row = h * kv_lora_rank + row;
+                                    int dst_col = col;
+                                    ((float*)wk_b->data)[dst_row * n_embd_head_qk_nope + dst_col] = dequantized_wkv_b[src_idx];
+                                }
+
+                                for (int col = 0; col < n_embd_head_v; ++col) {
+                                    // fill wv_b
+                                    int src_idx = row * layer.wkv_b->ne[0] + k_start + n_embd_head_qk_nope + col;
+                                    GGML_ASSERT(src_idx < dequantized_wkv_b.size());
+                                    int dst_row = row;
+                                    int dst_col = h * n_embd_head_v + col;
+                                    ((float*)wv_b->data)[dst_row * n_head * n_embd_head_v + dst_col] = dequantized_wkv_b[src_idx];
+                                }
+                            }
+                        }
+
+                        layer.wk_b = ggml_cast(ctx, wk_b, layer.wkv_b->type);
+                        layer.wv_b = ggml_cast(ctx, wv_b, layer.wkv_b->type);
                     }
 
                     layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_head * ( n_embd_head_v), n_embd}, 0);
 
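Note (not part of the commit): the added loop assumes that each low-rank row of the dequantized wkv_b matrix holds n_head consecutive blocks of n_embd_head_qk_nope K values followed by n_embd_head_v V values, i.e. that layer.wkv_b->ne[0] equals n_head * (n_embd_head_qk_nope + n_embd_head_v); casting the filled F32 tensors back to layer.wkv_b->type at the end keeps wk_b and wv_b in the source quantization. A minimal standalone C++ sketch of the same split on plain float buffers, using hypothetical toy dimensions and no ggml dependency, could look like this:

#include <cassert>
#include <cstddef>
#include <vector>

// Toy dimensions, chosen only for illustration.
constexpr int n_head              = 2;
constexpr int kv_lora_rank        = 3;
constexpr int n_embd_head_qk_nope = 4;
constexpr int n_embd_head_v       = 5;

int main() {
    // Assumed row layout of the dequantized wkv_b:
    // [head0: nope | v][head1: nope | v] ... per low-rank row.
    const int row_len = n_head * (n_embd_head_qk_nope + n_embd_head_v);

    // Fake dequantized wkv_b filled with recognizable values.
    std::vector<float> wkv_b(kv_lora_rank * row_len);
    for (std::size_t i = 0; i < wkv_b.size(); ++i) {
        wkv_b[i] = (float) i;
    }

    // Destination buffers with the same shapes the commit allocates:
    // wk_b: (n_embd_head_qk_nope, n_head * kv_lora_rank)
    // wv_b: (kv_lora_rank,        n_head * n_embd_head_v)
    std::vector<float> wk_b(n_embd_head_qk_nope * n_head * kv_lora_rank);
    std::vector<float> wv_b(kv_lora_rank * n_head * n_embd_head_v);

    for (int h = 0; h < n_head; ++h) {
        const int k_start = h * (n_embd_head_qk_nope + n_embd_head_v);

        for (int row = 0; row < kv_lora_rank; ++row) {
            // K part of this head's block goes into wk_b.
            for (int col = 0; col < n_embd_head_qk_nope; ++col) {
                const int src = row * row_len + k_start + col;
                wk_b[(h * kv_lora_rank + row) * n_embd_head_qk_nope + col] = wkv_b[src];
            }
            // V part of this head's block goes into wv_b.
            for (int col = 0; col < n_embd_head_v; ++col) {
                const int src = row * row_len + k_start + n_embd_head_qk_nope + col;
                wv_b[row * n_head * n_embd_head_v + h * n_embd_head_v + col] = wkv_b[src];
            }
        }
    }

    // Spot-check one element: head 1, low-rank row 2, K column 3.
    assert(wk_b[(1 * kv_lora_rank + 2) * n_embd_head_qk_nope + 3] ==
           wkv_b[2 * row_len + 1 * (n_embd_head_qk_nope + n_embd_head_v) + 3]);
    return 0;
}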