
Commit a2a8109

rwkv: fix llama-parallel
Signed-off-by: Molly Sophia <[email protected]>
1 parent 1a9c263 commit a2a8109

File tree: 3 files changed, +15 -12 lines


convert_hf_to_gguf.py

Lines changed: 6 additions & 3 deletions
@@ -3555,20 +3555,23 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
             # ignore them all since they are not used
             return
 
+        wkv_has_gate = self.hparams.get("wkv_has_gate", True)
+        lerp_list = ["r", "w", "k", "v", "a", "g"] if wkv_has_gate else ["r", "w", "k", "v", "a"]
+
         if bid is not None and "attention.x_" in name:
             if "attention.x_x" in name:
                 # already concatenated
                 new_name = f"blk.{bid}.time_mix_lerp_fused.weight"
-                data = data_torch.reshape(6, 1, -1)
+                data = data_torch.reshape(len(lerp_list), 1, 1, -1)
                 yield (new_name, data)
             else:
                 try:
                     self.lerp_weights[bid][name] = data_torch
                 except KeyError:
                     self.lerp_weights[bid] = {name: data_torch}
-                if all(f"model.layers.{bid}.attention.x_{i}" in self.lerp_weights[bid].keys() for i in ["r", "w", "k", "v", "a", "g"]):
+                if all(f"model.layers.{bid}.attention.x_{i}" in self.lerp_weights[bid].keys() for i in lerp_list):
                     new_name = f"blk.{bid}.time_mix_lerp_fused.weight"
-                    data = torch.stack([self.lerp_weights[bid][f"model.layers.{bid}.attention.x_{i}"].squeeze(0) for i in ["r", "w", "k", "v", "a", "g"]], dim=0)
+                    data = torch.stack([self.lerp_weights[bid][f"model.layers.{bid}.attention.x_{i}"] for i in lerp_list], dim=0)
                     yield (new_name, data)
             return
         else:
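
For context, a minimal sketch of what the packing change means (assumed shapes, not the converter's actual code path): the per-channel token-shift coefficients x_r, x_w, x_k, x_v, x_a and, when present, x_g are now stacked with their unit dimensions kept, giving a (len(lerp_list), 1, 1, n_embd) tensor instead of the old (6, 1, n_embd) one, and gateless models contribute only five channels.

# Sketch of the fused-lerp packing, assuming per-channel weights of shape
# (1, 1, n_embd) as in RWKV7 checkpoints; names and sizes are illustrative.
import torch

n_embd = 8
wkv_has_gate = True
lerp_list = ["r", "w", "k", "v", "a", "g"] if wkv_has_gate else ["r", "w", "k", "v", "a"]

# stand-in for the model.layers.{bid}.attention.x_{i} tensors collected per layer
lerp_weights = {i: torch.randn(1, 1, n_embd) for i in lerp_list}

old = torch.stack([lerp_weights[i].squeeze(0) for i in lerp_list], dim=0)  # old layout
new = torch.stack([lerp_weights[i] for i in lerp_list], dim=0)             # new layout

print(old.shape)  # torch.Size([6, 1, 8])
print(new.shape)  # torch.Size([6, 1, 1, 8]) -> stored as {n_embd, 1, 1, 6} on the GGML side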

src/llama-model.cpp

Lines changed: 1 addition & 1 deletion
@@ -3356,7 +3356,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                         layer.time_mix_g1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_G1, "weight", i), {n_embd, n_lora_gate}, 0);
                         layer.time_mix_g2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_G2, "weight", i), {n_lora_gate, n_embd}, 0);
 
-                        layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 6}, 0);
+                        layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 6}, 0);
 
                         layer.time_mix_k_k = create_tensor(tn(LLM_TENSOR_TIME_MIX_K_K, "weight", i), {attn_hidden_size}, 0);
                         layer.time_mix_k_a = create_tensor(tn(LLM_TENSOR_TIME_MIX_K_A, "weight", i), {attn_hidden_size}, 0);

src/llama.cpp

Lines changed: 8 additions & 8 deletions
@@ -1060,7 +1060,7 @@ static struct ggml_tensor * llm_build_rwkv7_time_mix(
     bool has_gating = layer->time_mix_g1 && layer->time_mix_g2;
 
     struct ggml_tensor * sx = ggml_sub(ctx, x_prev, cur);
-    struct ggml_tensor * dummy = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd, n_tokens, layer->time_mix_lerp_fused->ne[2]);
+    struct ggml_tensor * dummy = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, n_embd, n_seq_tokens, n_seqs, has_gating ? 6 : 5);
     sx = ggml_repeat(ctx, sx, dummy);
 
     struct ggml_tensor * xxx = ggml_add(ctx, ggml_mul(ctx, sx, layer->time_mix_lerp_fused), cur);
@@ -1149,7 +1149,7 @@ static struct ggml_tensor * llm_build_rwkv7_time_mix(
     }
     cur = llm_build_lora_mm(lctx, ctx, layer->time_mix_output, cur);
 
-    return cur;
+    return ggml_reshape_3d(ctx, cur, n_embd, n_seq_tokens, n_seqs);
 }
 
 static struct ggml_tensor * llm_build_rwkv7_channel_mix(
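
Roughly what the two hunks above change, sketched in torch rather than ggml (shapes and names are illustrative): with parallel sequences the activations stay shaped as {n_embd, n_seq_tokens, n_seqs}, so the broadcast target for the token-shift delta becomes 4D with its outer dimension matching the 6 (or 5, without a gate) lerp channels, and the time-mix result is reshaped back to three dimensions before returning.

# Torch sketch of the token-shift broadcast; not the ggml implementation.
import torch

n_embd, n_seq_tokens, n_seqs = 8, 4, 3
n_lerp = 6  # 5 when the model has no gate (has_gating ? 6 : 5)

cur = torch.randn(n_seqs, n_seq_tokens, n_embd)      # ggml: {n_embd, n_seq_tokens, n_seqs}
x_prev = torch.randn(n_seqs, n_seq_tokens, n_embd)
lerp_fused = torch.randn(n_lerp, 1, 1, n_embd)       # ggml: {n_embd, 1, 1, n_lerp}

# counterpart of ggml_repeat onto the 4D dummy: one copy of the delta per channel
sx = (x_prev - cur).expand(n_lerp, n_seqs, n_seq_tokens, n_embd)
xxx = sx * lerp_fused + cur                          # broadcasts over tokens and sequences
print(xxx.shape)                                     # torch.Size([6, 3, 4, 8])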
@@ -7768,9 +7768,9 @@ struct llm_build_context {
             if (il == n_layer - 1) {
                 // skip computing output for unused tokens
                 struct ggml_tensor * inp_out_ids = build_inp_out_ids();
-                inp_ffn = ggml_get_rows(ctx0, x_norm_ffn, inp_out_ids);
-                x_prev = ggml_get_rows(ctx0, x_prev, inp_out_ids);
-                cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+                inp_ffn = ggml_get_rows(ctx0, ggml_reshape_2d(ctx0, x_norm_ffn, n_embd, n_tokens), inp_out_ids);
+                x_prev = ggml_get_rows(ctx0, ggml_reshape_2d(ctx0, x_prev, n_embd, n_tokens), inp_out_ids);
+                cur = ggml_get_rows(ctx0, ggml_reshape_2d(ctx0, cur, n_embd, n_tokens), inp_out_ids);
             }
 
             cur = ggml_add(ctx0, cur, llm_build_rwkv6_channel_mix(lctx, ctx0, layer, inp_ffn, x_prev));
@@ -8002,9 +8002,9 @@ struct llm_build_context {
             if (il == n_layer - 1) {
                 // skip computing output for unused tokens
                 struct ggml_tensor * inp_out_ids = build_inp_out_ids();
-                inp_ffn = ggml_get_rows(ctx0, x_norm_ffn, inp_out_ids);
-                x_prev = ggml_get_rows(ctx0, x_prev, inp_out_ids);
-                cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+                inp_ffn = ggml_get_rows(ctx0, ggml_reshape_2d(ctx0, x_norm_ffn, n_embd, n_tokens), inp_out_ids);
+                x_prev = ggml_get_rows(ctx0, ggml_reshape_2d(ctx0, x_prev, n_embd, n_tokens), inp_out_ids);
+                cur = ggml_get_rows(ctx0, ggml_reshape_2d(ctx0, cur, n_embd, n_tokens), inp_out_ids);
             }
 
             cur = ggml_add(ctx0, cur, llm_build_rwkv7_channel_mix(lctx, ctx0, layer, inp_ffn, x_prev));
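
And the reason for the ggml_reshape_2d wrappers in the last two hunks, again sketched in torch with made-up values: the per-layer tensors are now laid out per sequence, while inp_out_ids indexes flat token rows, so they are flattened back to {n_embd, n_tokens} before the row gather.

# Torch sketch of flattening before the row gather; not the ggml implementation,
# and the inp_out_ids values are made up for illustration.
import torch

n_embd, n_seq_tokens, n_seqs = 8, 4, 3
n_tokens = n_seq_tokens * n_seqs

cur = torch.randn(n_seqs, n_seq_tokens, n_embd)    # ggml: {n_embd, n_seq_tokens, n_seqs}
inp_out_ids = torch.tensor([3, 7, 11])             # e.g. the last token of each sequence

rows = cur.reshape(n_tokens, n_embd)[inp_out_ids]  # counterpart of ggml_reshape_2d + ggml_get_rows
print(rows.shape)                                  # torch.Size([3, 8])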
