
Commit 045658c

fix llama
1 parent f3a9e81 commit 045658c

7 files changed: 32 additions & 52 deletions


src/layers/qkv_linear.cpp

Lines changed: 1 addition & 1 deletion
@@ -10,9 +10,9 @@ QKVColumnParallelLinearImpl::QKVColumnParallelLinearImpl(
     int64_t n_heads,
     int64_t n_kv_heads,
     int64_t head_dim,
+    const std::vector<std::string>& prefixes,
     bool bias,
     bool gather_output,
-    const std::vector<std::string>& prefixes,
     const QuantArgs& quant_args,
     const ParallelArgs& parallel_args,
     const torch::TensorOptions& options) {

src/layers/qkv_linear.h

Lines changed: 1 addition & 1 deletion
@@ -20,9 +20,9 @@ class QKVColumnParallelLinearImpl : public Module {
       int64_t n_heads,
       int64_t n_kv_heads,
       int64_t head_dim,
+      const std::vector<std::string>& prefixes,
       bool bias,
       bool gather_output,
-      const std::vector<std::string>& prefixes,
       const QuantArgs& quant_args,
       const ParallelArgs& parallel_args,
       const torch::TensorOptions& options);
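
For orientation, a hypothetical call site with the reordered signature is sketched below: prefixes now sits between head_dim and bias. The leading hidden_size argument and the trailing quant_args / parallel_args / options objects follow the usage shown in the model headers later in this commit; the concrete sizes are illustrative only.

// Hypothetical call site (not part of this commit), showing the new argument
// order from qkv_linear.h: prefixes now precedes bias and gather_output.
// The numeric sizes are illustrative; quant_args, parallel_args, and options
// are assumed to be in scope with the types declared above.
QKVColumnParallelLinear qkv_proj(
    /*hidden_size=*/4096,
    /*n_heads=*/32,
    /*n_kv_heads=*/8,
    /*head_dim=*/128,
    /*prefixes=*/std::vector<std::string>{"q_proj.", "k_proj.", "v_proj."},
    /*bias=*/false,
    /*gather_output=*/false,
    quant_args,
    parallel_args,
    options);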

src/layers/qkv_linear_test.cpp

Lines changed: 1 addition & 1 deletion
@@ -56,9 +56,9 @@ TEST_P(QKVColumnParallelLinearTest, LoadFusedWeight) {
      n_heads,
      n_kv_heads,
      head_dim,
+     std::vector<std::string>{"query.", "key.", "value."},
      /*bias=*/false,
      /*gather_output=*/false,
-     std::vector<std::string>{"query.", "key.", "value."},
      quant_args,
      parallel_args,
      options);

src/models/gemma.h

Lines changed: 1 addition & 1 deletion
@@ -100,9 +100,9 @@ class GemmaAttentionImpl : public Module {
          n_heads,
          n_kv_heads,
          head_dim,
+         std::vector<std::string>{"q_proj.", "k_proj.", "v_proj."},
          /*bias=*/false,
          /*gather_output=*/false,
-         std::vector<std::string>{"q_proj.", "k_proj.", "v_proj."},
          quant_args,
          parallel_args,
          options),

src/models/gemma2.h

Lines changed: 1 addition & 1 deletion
@@ -100,9 +100,9 @@ class Gemma2AttentionImpl : public Module {
          n_heads,
          n_kv_heads,
          head_dim,
+         std::vector<std::string>{"q_proj.", "k_proj.", "v_proj."},
          args.attn_bias(),
          /*gather_output=*/false,
-         std::vector<std::string>{"q_proj.", "k_proj.", "v_proj."},
          quant_args,
          parallel_args,
          options),

src/models/llama.h

Lines changed: 26 additions & 46 deletions
@@ -36,16 +36,18 @@ class LlamaMLPImpl : public Module {
     const int64_t intermediate_size = args.intermediate_size();
 
     // register the weight parameter
-    // gate_up_proj_ = register_module(
-    //     "gate_up_proj",
-    //     FusedColumnParallelLinear(
-    //         hidden_size,
-    //         std::vector<int64_t>{intermediate_size, intermediate_size},
-    //         /*bias=*/false,
-    //         /*gather_output=*/false,
-    //         quant_args,
-    //         parallel_args,
-    //         options));
+    gate_up_proj_ = register_module(
+        "gate_up_proj",
+        FusedColumnParallelLinear(
+            hidden_size,
+            std::vector<int64_t>{intermediate_size, intermediate_size},
+            std::vector<std::string>{"gate_proj.", "up_proj."},
+            /*bias=*/false,
+            /*gather_output=*/false,
+            quant_args,
+            parallel_args,
+            options),
+        /*selector=*/nullptr);
 
     down_proj_ =
         register_module("down_proj",
@@ -63,18 +65,6 @@ class LlamaMLPImpl : public Module {
     return down_proj_(act_func_(gate_up[0]) * gate_up[1]);
   }
 
-  // // load the weight from the checkpoint
-  // void load_state_dict(const StateDict& state_dict) {
-  //   // call each submodule's load_state_dict function
-  //   gate_up_proj_->load_state_dict(state_dict, {"gate_proj.", "up_proj."});
-  //   down_proj_->load_state_dict(state_dict.select("down_proj."));
-  // }
-
-  // void verify_loaded_weights(const std::string& prefix) const {
-  //   gate_up_proj_->verify_loaded_weights(prefix + "[gate_proj,up_proj].");
-  //   down_proj_->verify_loaded_weights(prefix + "down_proj.");
-  // }
-
  private:
   // parameter members, must be registered
   FusedColumnParallelLinear gate_up_proj_{nullptr};
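
The forward path kept above, down_proj_(act_func_(gate_up[0]) * gate_up[1]), is the usual SwiGLU-style MLP computed from the fused gate/up projection. As a reference only, here is a minimal unfused sketch in plain libtorch, assuming the activation resolves to SiLU for llama-family configs:

// Reference-only sketch (not ScaleLLM code): the unfused equivalent of the
// fused gate_up_proj_ / down_proj_ path, with SiLU assumed as act_func_.
#include <torch/torch.h>

torch::Tensor llama_mlp_reference(const torch::Tensor& x,        // [n_tokens, hidden_size]
                                  const torch::Tensor& w_gate,   // [intermediate_size, hidden_size]
                                  const torch::Tensor& w_up,     // [intermediate_size, hidden_size]
                                  const torch::Tensor& w_down) { // [hidden_size, intermediate_size]
  // gate_up_proj_ fuses these two matmuls into one column-parallel GEMM and
  // returns the two halves as gate_up[0] (gate) and gate_up[1] (up).
  const auto gate = torch::linear(x, w_gate);
  const auto up = torch::linear(x, w_up);
  return torch::linear(torch::silu(gate) * up, w_down);
}
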
@@ -102,16 +92,20 @@ class LlamaAttentionImpl : public Module {
         std::max<int64_t>(1, n_kv_heads / world_size);
 
     // register submodules
-    // qkv_proj_ = register_module("qkv_proj",
-    //     QKVColumnParallelLinear(hidden_size,
-    //         n_heads,
-    //         n_kv_heads,
-    //         head_dim,
-    //         /*bias=*/false,
-    //         /*gather_output=*/false,
-    //         quant_args,
-    //         parallel_args,
-    //         options));
+    qkv_proj_ = register_module(
+        "qkv_proj",
+        QKVColumnParallelLinear(
+            hidden_size,
+            n_heads,
+            n_kv_heads,
+            head_dim,
+            std::vector<std::string>{"q_proj.", "k_proj.", "v_proj."},
+            /*bias=*/false,
+            /*gather_output=*/false,
+            quant_args,
+            parallel_args,
+            options),
+        /*selector=*/nullptr);
 
     o_proj_ = register_module("o_proj",
         RowParallelLinear(hidden_size,
@@ -141,20 +135,6 @@ class LlamaAttentionImpl : public Module {
     return o_proj_(output);
   }
 
-  // // load the weight from the checkpoint
-  // void load_state_dict(const StateDict& state_dict) {
-  //   // call each submodule's load_state_dict function
-  //   qkv_proj_->load_state_dict(
-  //       state_dict, {"q_proj.", "k_proj.", "v_proj."}, {"k_proj.",
-  //       "v_proj."});
-  //   o_proj_->load_state_dict(state_dict.select("o_proj."));
-  // }
-
-  // void verify_loaded_weights(const std::string& prefix) const {
-  //   qkv_proj_->verify_loaded_weights(prefix + "[q_proj,k_proj,v_proj].");
-  //   o_proj_->verify_loaded_weights(prefix + "o_proj.");
-  // }
-
  private:
   // parameter members, must be registered
   QKVColumnParallelLinear qkv_proj_{nullptr};
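
The qkv_proj_ registered above is sized for grouped-query attention: the query uses n_heads heads while key and value each use n_kv_heads heads. Purely as an illustration of that bookkeeping (not how ScaleLLM splits the tensor internally):

// Illustration only: splitting a fused QKV activation into query/key/value
// slices, using the head counts from LlamaAttentionImpl above.
#include <torch/torch.h>
#include <vector>

std::vector<torch::Tensor> split_qkv(const torch::Tensor& qkv,  // [n_tokens, (n_heads + 2 * n_kv_heads) * head_dim]
                                     int64_t n_heads,
                                     int64_t n_kv_heads,
                                     int64_t head_dim) {
  // Query slice first, then key and value with the (smaller) kv head count.
  return qkv.split_with_sizes({n_heads * head_dim,
                               n_kv_heads * head_dim,
                               n_kv_heads * head_dim},
                              /*dim=*/-1);
}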

src/models/models.h

Lines changed: 1 addition & 1 deletion
@@ -11,7 +11,7 @@
 // #include "gpt_j.h"  // IWYU pragma: keep
 // #include "gpt_neox.h"  // IWYU pragma: keep
 // #include "internlm.h"  // IWYU pragma: keep
-// #include "llama.h"  // IWYU pragma: keep
+#include "llama.h"  // IWYU pragma: keep
 // #include "mistral.h"  // IWYU pragma: keep
 // #include "mpt.h"  // IWYU pragma: keep
 // #include "phi.h"  // IWYU pragma: keep
