
Commit 69355a0

revert MLA
1 parent 46ac9f6 commit 69355a0

4 files changed: 0 additions, 81 deletions

4 files changed

+0
-81
lines changed

src/llama-arch.cpp

Lines changed: 0 additions & 6 deletions
@@ -999,8 +999,6 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
     { LLM_TENSOR_ATTN_Q_B,      "blk.%d.attn_q_b" },
     { LLM_TENSOR_ATTN_KV_A_MQA, "blk.%d.attn_kv_a_mqa" },
     { LLM_TENSOR_ATTN_KV_B,     "blk.%d.attn_kv_b" },
-    { LLM_TENSOR_ATTN_K_B,      "blk.%d.attn_k_b" },
-    { LLM_TENSOR_ATTN_V_B,      "blk.%d.attn_v_b" },
     { LLM_TENSOR_ATTN_OUT,      "blk.%d.attn_output" },
     { LLM_TENSOR_FFN_NORM,      "blk.%d.ffn_norm" },
     { LLM_TENSOR_FFN_GATE,      "blk.%d.ffn_gate" },
@@ -1335,8 +1333,6 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
     {LLM_TENSOR_ATTN_Q_B,      {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_ATTN_KV_A_MQA, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_ATTN_KV_B,     {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_ATTN_K_B,      {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_ATTN_V_B,      {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_DEC_ATTN_Q,    {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_DEC_ATTN_K,    {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_ATTN_Q,        {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
@@ -1354,8 +1350,6 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
     {LLM_TENSOR_ATTN_Q_B,      {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_ATTN_KV_A_MQA, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_ATTN_KV_B,     {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_ATTN_K_B,      {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_ATTN_V_B,      {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_DEC_ATTN_Q,    {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_DEC_ATTN_K,    {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_DEC_ATTN_V,    {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
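The removed entries above mapped the split MLA tensors (attn_k_b, attn_v_b) to per-layer name patterns; the surviving entries keep only the fused attn_kv_b. As a quick illustration of how such a "blk.%d.attn_kv_b" pattern expands to a concrete per-layer tensor name, here is a minimal standalone sketch; the layer_tensor_name helper is a hypothetical stand-in for illustration, not llama.cpp's actual tn() machinery.

#include <cstdio>
#include <string>

// Expand a LLM_TENSOR_NAMES-style pattern for one layer and append a suffix,
// e.g. ("blk.%d.attn_kv_b", 3, "weight") -> "blk.3.attn_kv_b.weight".
static std::string layer_tensor_name(const char * pattern, int layer, const char * suffix) {
    char base[128];
    std::snprintf(base, sizeof(base), pattern, layer);
    return std::string(base) + "." + suffix;
}

int main() {
    std::printf("%s\n", layer_tensor_name("blk.%d.attn_kv_b", 3, "weight").c_str());
    return 0;
}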

src/llama-arch.h

Lines changed: 0 additions & 2 deletions
@@ -277,8 +277,6 @@ enum llm_tensor {
     LLM_TENSOR_ATTN_Q_B,
     LLM_TENSOR_ATTN_KV_A_MQA,
     LLM_TENSOR_ATTN_KV_B,
-    LLM_TENSOR_ATTN_K_B,
-    LLM_TENSOR_ATTN_V_B,
     LLM_TENSOR_ATTN_Q_A_NORM,
     LLM_TENSOR_ATTN_KV_A_NORM,
     LLM_TENSOR_ATTN_SUB_NORM,

src/llama-model.cpp

Lines changed: 0 additions & 71 deletions
@@ -2914,77 +2914,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
 
     layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + (n_embd_head_qk_rope)}, 0);
     layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)}, 0);
-    layer.wk_b = create_tensor(tn(LLM_TENSOR_ATTN_K_B, "weight", i), {n_embd_head_qk_nope, n_head * kv_lora_rank}, 0);
-    layer.wv_b = create_tensor(tn(LLM_TENSOR_ATTN_V_B, "weight", i), {kv_lora_rank, n_head * n_embd_head_v}, 0);
-    if (!layer.wk_b || !layer.wv_b) {
-        if (!layer.wkv_b) {
-            throw std::runtime_error("wkv_b must be defined without wk_b and wv_b");
-        }
-
-        // select the buffer type for this tensor
-        buft_list_t * buft_list = pimpl->dev_input.buft_list;
-
-        ggml_backend_buffer_type_t buft = nullptr;
-
-        // check overrides
-        if (ml.tensor_buft_overrides) {
-            std::string tensor_name = "blk."+ std::to_string(i) +".attn_kv_b.weight";
-            for (const auto * overrides = ml.tensor_buft_overrides; overrides->pattern != nullptr; ++overrides) {
-                std::regex pattern(overrides->pattern);
-                if (std::regex_search(tensor_name, pattern)) {
-                    LLAMA_LOG_DEBUG("tensor %s buffer type overriden to %s\n", tensor_name.c_str(), ggml_backend_buft_name(overrides->buft));
-                    buft = overrides->buft;
-                    break;
-                }
-            }
-        }
-
-        // avoid using a host buffer when using mmap
-        auto * buft_dev = ggml_backend_buft_get_device(buft);
-        if (ml.use_mmap && buft_dev && buft == ggml_backend_dev_host_buffer_type(buft_dev)) {
-            auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
-            buft = ggml_backend_dev_buffer_type(cpu_dev);
-        }
-
-        ggml_context * ctx = ctx_for_buft(buft);
-
-        // dequantize wkv_b
-        const auto * qtype = ggml_get_type_traits(layer.wkv_b->type);
-        std::vector<float> dequantized_wkv_b(layer.wkv_b->ne[0] * layer.wkv_b->ne[1]);
-        qtype->to_float(layer.wkv_b->data, dequantized_wkv_b.data(), layer.wkv_b->ne[0] * layer.wkv_b->ne[1]);
-
-        // create the wk_b and wv_b tensors
-        auto * wk_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd_head_qk_nope, n_head * kv_lora_rank);
-        auto * wv_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, kv_lora_rank, n_head * n_embd_head_v);
-
-        // split the wkv_b data to produce wk_b and wv_b
-        for (int h = 0; h < n_head; ++h) {
-            int k_start = h * (n_embd_head_qk_nope + n_embd_head_v);
-
-            for (int row = 0; row < kv_lora_rank; ++row) {
-                for (int col = 0; col < n_embd_head_qk_nope; ++col) {
-                    // fill wk_b
-                    int src_idx = row * layer.wkv_b->ne[0] + k_start + col;
-                    GGML_ASSERT(src_idx < dequantized_wkv_b.size());
-                    int dst_row = h * kv_lora_rank + row;
-                    int dst_col = col;
-                    ((float*)wk_b->data)[dst_row * n_embd_head_qk_nope + dst_col] = dequantized_wkv_b[src_idx];
-                }
-
-                for (int col = 0; col < n_embd_head_v; ++col) {
-                    // fill wv_b
-                    int src_idx = row * layer.wkv_b->ne[0] + k_start + n_embd_head_qk_nope + col;
-                    GGML_ASSERT(src_idx < dequantized_wkv_b.size());
-                    int dst_row = row;
-                    int dst_col = h * n_embd_head_v + col;
-                    ((float*)wv_b->data)[dst_row * n_head * n_embd_head_v + dst_col] = dequantized_wkv_b[src_idx];
-                }
-            }
-        }
-
-        layer.wk_b = ggml_cast(ctx, wk_b, layer.wkv_b->type);
-        layer.wv_b = ggml_cast(ctx, wv_b, layer.wkv_b->type);
-    }
     layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_head * ( n_embd_head_v), n_embd}, 0);
 
     layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
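The reverted block above materialized the MLA decomposition at load time: when no separate wk_b/wv_b tensors exist in the GGUF, it dequantized wkv_b and sliced each head's block into a K part (wk_b, with head-major destination rows) and a V part (wv_b, with columns grouped per head). The following is a minimal self-contained sketch of that split over plain float buffers, mirroring the indexing in the diff; the flat std::vector layout, the split_wkv_b name, and the main() driver are illustrative assumptions, not llama.cpp code.

#include <cassert>
#include <cstdio>
#include <vector>

// Split a dequantized wkv_b matrix, stored row-major as
// [kv_lora_rank rows x n_head*(qk_nope + v) cols], into:
//   wk_b: [n_head*kv_lora_rank rows x qk_nope cols]
//   wv_b: [kv_lora_rank rows x n_head*v cols]
static void split_wkv_b(const std::vector<float> & wkv_b,
                        int n_head, int kv_lora_rank, int qk_nope, int v,
                        std::vector<float> & wk_b, std::vector<float> & wv_b) {
    const int row_len = n_head * (qk_nope + v);
    assert((int) wkv_b.size() == kv_lora_rank * row_len);

    wk_b.assign((size_t) n_head * kv_lora_rank * qk_nope, 0.0f);
    wv_b.assign((size_t) kv_lora_rank * n_head * v,       0.0f);

    for (int h = 0; h < n_head; ++h) {
        const int k_start = h * (qk_nope + v);        // start of head h inside a wkv_b row
        for (int row = 0; row < kv_lora_rank; ++row) {
            for (int col = 0; col < qk_nope; ++col) { // K part: head-major destination rows
                wk_b[(size_t) (h * kv_lora_rank + row) * qk_nope + col] =
                    wkv_b[(size_t) row * row_len + k_start + col];
            }
            for (int col = 0; col < v; ++col) {       // V part: columns grouped per head
                wv_b[(size_t) row * (n_head * v) + h * v + col] =
                    wkv_b[(size_t) row * row_len + k_start + qk_nope + col];
            }
        }
    }
}

int main() {
    // Tiny dimensions, just to exercise the split.
    const int n_head = 2, kv_lora_rank = 3, qk_nope = 4, v = 2;
    std::vector<float> wkv_b(kv_lora_rank * n_head * (qk_nope + v));
    for (size_t i = 0; i < wkv_b.size(); ++i) wkv_b[i] = (float) i;

    std::vector<float> wk_b, wv_b;
    split_wkv_b(wkv_b, n_head, kv_lora_rank, qk_nope, v, wk_b, wv_b);
    std::printf("wk_b: %zu values, wv_b: %zu values\n", wk_b.size(), wv_b.size());
    return 0;
}

After this revert, only the fused wkv_b (together with wkv_a_mqa) is loaded in this block, as the remaining context lines show.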

src/llama-model.h

Lines changed: 0 additions & 2 deletions
@@ -161,8 +161,6 @@ struct llama_layer {
     struct ggml_tensor * wq_b = nullptr;
     struct ggml_tensor * wkv_a_mqa = nullptr;
     struct ggml_tensor * wkv_b = nullptr;
-    struct ggml_tensor * wk_b = nullptr;
-    struct ggml_tensor * wv_b = nullptr;
     struct ggml_tensor * wq_cross = nullptr;
     struct ggml_tensor * wk_cross = nullptr;
     struct ggml_tensor * wv_cross = nullptr;
