
Commit 4ffe8db

fix qwen and qwen2
1 parent 045658c commit 4ffe8db

5 files changed: +43 -175 lines changed


src/layers/linear_impl.cpp

Lines changed: 6 additions & 2 deletions
@@ -35,8 +35,12 @@ ColumnParallelLinearImpl::ColumnParallelLinearImpl(
       torch::empty({out_features_per_partition, in_features}, options));

   if (bias) {
-    bias_ = register_parameter(
-        "bias", torch::empty({out_features_per_partition}, options));
+    bias_ = register_sharded_parameter(
+        "bias",
+        /*dim=*/0,
+        rank,
+        world_size,
+        torch::empty({out_features_per_partition}, options));
   }
 }
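Note on the bias change above: the column-parallel layer shards its weight along the output dimension, so the bias has to be split the same way. Registering it with the split dimension, rank, and world size gives the loader enough information to take each rank's slice of the full checkpoint tensor. A minimal sketch of that slicing, using a hypothetical shard_for_rank helper rather than the repository's actual register_sharded_parameter API:

    #include <torch/torch.h>

    // Hypothetical helper (illustration only): the slice of a full checkpoint
    // tensor that a given tensor-parallel rank owns when the tensor is split
    // evenly along `dim`.
    torch::Tensor shard_for_rank(const torch::Tensor& full,
                                 int64_t dim,
                                 int64_t rank,
                                 int64_t world_size) {
      const int64_t shard_size = full.size(dim) / world_size;
      return full.narrow(dim, rank * shard_size, shard_size).contiguous();
    }

    // e.g. a full [out_features] bias split along dim 0 across 2 ranks:
    //   shard_for_rank(bias, /*dim=*/0, /*rank=*/1, /*world_size=*/2)
    // returns the second half, matching out_features_per_partition.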

src/models/llama.h

Lines changed: 0 additions & 1 deletion
@@ -10,7 +10,6 @@
 #include "layers/embedding.h"
 #include "layers/fused_linear.h"
 #include "layers/linear.h"
-#include "layers/linear_impl.h"
 #include "layers/normalization.h"
 #include "layers/qkv_linear.h"
 #include "memory/kv_cache.h"

src/models/models.h

Lines changed: 2 additions & 2 deletions
@@ -15,5 +15,5 @@
 // #include "mistral.h"  // IWYU pragma: keep
 // #include "mpt.h"  // IWYU pragma: keep
 // #include "phi.h"  // IWYU pragma: keep
-// #include "qwen.h"  // IWYU pragma: keep
-// #include "qwen2.h"  // IWYU pragma: keep
+#include "qwen.h"  // IWYU pragma: keep
+#include "qwen2.h"  // IWYU pragma: keep

src/models/qwen.h

Lines changed: 18 additions & 85 deletions
@@ -10,6 +10,7 @@
 #include "layers/attention/attention.h"
 #include "layers/attention/handler.h"
 #include "layers/embedding.h"
+#include "layers/fused_linear.h"
 #include "layers/linear.h"
 #include "layers/normalization.h"
 #include "memory/kv_cache.h"
@@ -20,7 +21,7 @@
 #include "module/module_holder.h"
 #include "module/module_list.h"
 // QWen model compatible with huggingface weights
-// adopted from https://huggingface.co/Qwen/Qwen-7B/blob/main/modeling_qwen.py
+// Adapted from https://huggingface.co/Qwen/Qwen-7B/blob/main/modeling_qwen.py
 namespace llm::hf {

 class QWenMLPImpl : public Module {
@@ -38,14 +39,18 @@ class QWenMLPImpl : public Module {
     const int64_t intermediate_size = args.intermediate_size() / 2;

     // register the weight parameter
-    w1_w2_proj_ = register_module("gate_up_proj",
-                                  ColumnParallelLinear(hidden_size,
-                                                       intermediate_size * 2,
-                                                       /*bias=*/false,
-                                                       /*gather_output=*/false,
-                                                       quant_args,
-                                                       parallel_args,
-                                                       options));
+    gate_up_proj_ = register_module(
+        "gate_up_proj",
+        FusedColumnParallelLinear(
+            hidden_size,
+            std::vector<int64_t>{intermediate_size, intermediate_size},
+            std::vector<std::string>{"w1.", "w2."},
+            /*bias=*/false,
+            /*gather_output=*/false,
+            quant_args,
+            parallel_args,
+            options),
+        /*selector=*/nullptr);
     c_proj_ = register_module("c_proj",
                               RowParallelLinear(intermediate_size,
                                                 hidden_size,
@@ -57,26 +62,13 @@ class QWenMLPImpl : public Module {
   }

   torch::Tensor forward(torch::Tensor x) {
-    auto gate_up_proj = w1_w2_proj_(x);
-    auto chunks = gate_up_proj.chunk(/*chunks=*/2, /*dim=*/-1);
-    return c_proj_(chunks[0] * act_(chunks[1]));
-  }
-
-  // load the weight from the checkpoint
-  void load_state_dict(const StateDict& state_dict) {
-    // call each submodule's load_state_dict function
-    w1_w2_proj_->load_state_dict(state_dict, {"w1.", "w2."});
-    c_proj_->load_state_dict(state_dict.select("c_proj."));
-  }
-
-  void verify_loaded_weights(const std::string& prefix) const {
-    w1_w2_proj_->verify_loaded_weights(prefix + "[w1,w2].");
-    c_proj_->verify_loaded_weights(prefix + "c_proj.");
+    const auto gate_up = gate_up_proj_(x);
+    return c_proj_(gate_up[0] * act_(gate_up[1]));
   }

  private:
   // parameter members, must be registered
-  ColumnParallelLinear w1_w2_proj_{nullptr};
+  FusedColumnParallelLinear gate_up_proj_{nullptr};
   RowParallelLinear c_proj_{nullptr};

   ActFunc act_{nullptr};
@@ -133,18 +125,6 @@ class QWenAttentionImpl : public Module {
     return c_proj_(output);
   }

-  // load the weight from the checkpoint
-  void load_state_dict(const StateDict& state_dict) {
-    // call each submodule's load_state_dict function
-    c_attn_->load_state_dict(state_dict.select("c_attn."));
-    c_proj_->load_state_dict(state_dict.select("c_proj."));
-  }
-
-  void verify_loaded_weights(const std::string& prefix) const {
-    c_attn_->verify_loaded_weights(prefix + "c_attn.");
-    c_proj_->verify_loaded_weights(prefix + "c_proj.");
-  }
-
  private:
   // parameter members, must be registered
   ColumnParallelLinear c_attn_{nullptr};
@@ -183,22 +163,6 @@ class QWenBlockImpl : public Module {
     return h + mlp_(ln_2_(h));
   }

-  // load the weight from the checkpoint
-  void load_state_dict(const StateDict& state_dict) {
-    // call each submodule's load_state_dict function
-    attn_->load_state_dict(state_dict.select("attn."));
-    mlp_->load_state_dict(state_dict.select("mlp."));
-    ln_1_->load_state_dict(state_dict.select("ln_1."));
-    ln_2_->load_state_dict(state_dict.select("ln_2."));
-  }
-
-  void verify_loaded_weights(const std::string& prefix) const {
-    attn_->verify_loaded_weights(prefix + "attn.");
-    mlp_->verify_loaded_weights(prefix + "mlp.");
-    ln_1_->verify_loaded_weights(prefix + "ln_1.");
-    ln_2_->verify_loaded_weights(prefix + "ln_2.");
-  }
-
  private:
   // parameter members, must be registered
   QWenAttention attn_{nullptr};
@@ -226,7 +190,7 @@ class QWenModelImpl : public Module {
     handler_ = AttentionHandler::create_handler_with_rope(
         args, /*interleaved=*/false, options);

-    blocks_ = register_module("layers", ModuleList());
+    blocks_ = register_module("h", ModuleList());
     layers_.reserve(args.n_layers());
     for (int32_t i = 0; i < args.n_layers(); i++) {
       auto block =
@@ -254,26 +218,6 @@ class QWenModelImpl : public Module {
     return ln_f_(h);
   }

-  // load the weight from the checkpoint
-  void load_state_dict(const StateDict& state_dict) {
-    wte_->load_state_dict(state_dict.select("wte."));
-    // call each layer's load_state_dict function
-    for (int i = 0; i < layers_.size(); i++) {
-      layers_[i]->load_state_dict(
-          state_dict.select("h." + std::to_string(i) + "."));
-    }
-    ln_f_->load_state_dict(state_dict.select("ln_f."));
-  }
-
-  void verify_loaded_weights(const std::string& prefix) const {
-    wte_->verify_loaded_weights(prefix + "wte.");
-    for (int i = 0; i < layers_.size(); i++) {
-      layers_[i]->verify_loaded_weights(prefix + "h." + std::to_string(i) +
-                                        ".");
-    }
-    ln_f_->verify_loaded_weights(prefix + "ln_f.");
-  }
-
  private:
   // parameter members, must be registered
   ParallelEmbedding wte_{nullptr};
@@ -331,17 +275,6 @@ class QWenForCausalLMImpl : public Module {
     return lm_head_(h);
   }

-  // load the weight from the checkpoint
-  void load_state_dict(const StateDict& state_dict) {
-    transformer_->load_state_dict(state_dict.select("transformer."));
-    lm_head_->load_state_dict(state_dict.select("lm_head."));
-  }
-
-  void verify_loaded_weights() const {
-    transformer_->verify_loaded_weights("transformer.");
-    lm_head_->verify_loaded_weights("lm_head.");
-  }
-
  private:
   // parameter members, must be registered
   QWenModel transformer_{nullptr};
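Note on the QWenMLP change: FusedColumnParallelLinear takes the per-projection sizes and the checkpoint prefixes ("w1.", "w2.") in its constructor and returns the two projections already split, so forward no longer chunks a concatenated output and the hand-written load_state_dict / verify_loaded_weights overrides go away. For a single rank the two formulations are numerically identical, as this self-contained sketch shows (silu stands in for whatever activation act_ resolves to; the tensors are placeholders, not the model's weights):

    #include <iostream>
    #include <torch/torch.h>

    int main() {
      // Stand-ins for the outputs of the two separate projections (w1 and w2).
      auto w1_out = torch::randn({4, 8});
      auto w2_out = torch::randn({4, 8});

      // Old path: one concatenated output, then chunk it back into two halves.
      auto fused = torch::cat({w1_out, w2_out}, /*dim=*/-1);
      auto chunks = fused.chunk(/*chunks=*/2, /*dim=*/-1);
      auto old_style = chunks[0] * torch::silu(chunks[1]);

      // New path: keep the two projection outputs separate from the start.
      auto new_style = w1_out * torch::silu(w2_out);

      std::cout << torch::allclose(old_style, new_style) << "\n";  // prints 1
      return 0;
    }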

src/models/qwen2.h

Lines changed: 17 additions & 85 deletions
@@ -43,11 +43,13 @@ class QWen2MLPImpl : public Module {
         FusedColumnParallelLinear(
             hidden_size,
             std::vector<int64_t>{intermediate_size, intermediate_size},
+            std::vector<std::string>{"gate_proj.", "up_proj."},
             /*bias=*/false,
             /*gather_output=*/false,
             quant_args,
             parallel_args,
-            options));
+            options),
+        /*selector=*/nullptr);
     down_proj_ =
         register_module("down_proj",
                         RowParallelLinear(intermediate_size,
@@ -64,18 +66,6 @@ class QWen2MLPImpl : public Module {
     return down_proj_(act_func_(gate_up[0]) * gate_up[1]);
   }

-  // load the weight from the checkpoint
-  void load_state_dict(const StateDict& state_dict) {
-    // call each submodule's load_state_dict function
-    gate_up_proj_->load_state_dict(state_dict, {"gate_proj.", "up_proj."});
-    down_proj_->load_state_dict(state_dict.select("down_proj."));
-  }
-
-  void verify_loaded_weights(const std::string& prefix) const {
-    gate_up_proj_->verify_loaded_weights(prefix + "[gate_proj,up_proj].");
-    down_proj_->verify_loaded_weights(prefix + "down_proj.");
-  }
-
  private:
   // parameter members, must be registered
   FusedColumnParallelLinear gate_up_proj_{nullptr};
@@ -104,16 +94,20 @@ class QWen2AttentionImpl : public Module {
         std::max<int64_t>(1, n_kv_heads / world_size);

     // register submodules
-    qkv_proj_ = register_module("qkv_proj",
-                                QKVColumnParallelLinear(hidden_size,
-                                                        n_heads,
-                                                        n_kv_heads,
-                                                        head_dim,
-                                                        /*bias=*/true,
-                                                        /*gather_output=*/false,
-                                                        quant_args,
-                                                        parallel_args,
-                                                        options));
+    qkv_proj_ = register_module(
+        "qkv_proj",
+        QKVColumnParallelLinear(
+            hidden_size,
+            n_heads,
+            n_kv_heads,
+            head_dim,
+            std::vector<std::string>{"q_proj.", "k_proj.", "v_proj."},
+            /*bias=*/true,
+            /*gather_output=*/false,
+            quant_args,
+            parallel_args,
+            options),
+        /*selector=*/nullptr);

     o_proj_ = register_module("o_proj",
                               RowParallelLinear(hidden_size,
@@ -146,19 +140,6 @@ class QWen2AttentionImpl : public Module {
     return o_proj_(output);
   }

-  // load the weight from the checkpoint
-  void load_state_dict(const StateDict& state_dict) {
-    // call each submodule's load_state_dict function
-    qkv_proj_->load_state_dict(
-        state_dict, {"q_proj.", "k_proj.", "v_proj."}, {"k_proj.", "v_proj."});
-    o_proj_->load_state_dict(state_dict.select("o_proj."));
-  }
-
-  void verify_loaded_weights(const std::string& prefix) const {
-    qkv_proj_->verify_loaded_weights(prefix + "[q_proj,k_proj,v_proj].");
-    o_proj_->verify_loaded_weights(prefix + "o_proj.");
-  }
-
  private:
   // parameter members, must be registered
   QKVColumnParallelLinear qkv_proj_{nullptr};
@@ -208,24 +189,6 @@ class QWen2DecoderLayerImpl : public Module {
     return hidden_states;
   }

-  // load the weight from the checkpoint
-  void load_state_dict(const StateDict& state_dict) {
-    // call each submodule's load_state_dict function
-    self_attn_->load_state_dict(state_dict.select("self_attn."));
-    mlp_->load_state_dict(state_dict.select("mlp."));
-    input_layernorm_->load_state_dict(state_dict.select("input_layernorm."));
-    post_attention_layernorm_->load_state_dict(
-        state_dict.select("post_attention_layernorm."));
-  }
-
-  void verify_loaded_weights(const std::string& prefix) const {
-    self_attn_->verify_loaded_weights(prefix + "self_attn.");
-    mlp_->verify_loaded_weights(prefix + "mlp.");
-    input_layernorm_->verify_loaded_weights(prefix + "input_layernorm.");
-    post_attention_layernorm_->verify_loaded_weights(
-        prefix + "post_attention_layernorm.");
-  }
-
  private:
   // parameter members, must be registered
   QWen2Attention self_attn_{nullptr};
@@ -291,26 +254,6 @@ class QWen2ModelImpl : public Module {
     return norm_(h, residual);
   }

-  // load the weight from the checkpoint
-  void load_state_dict(const StateDict& state_dict) {
-    embed_tokens_->load_state_dict(state_dict.select("embed_tokens."));
-    // call each layer's load_state_dict function
-    for (int i = 0; i < layers_.size(); i++) {
-      layers_[i]->load_state_dict(
-          state_dict.select("layers." + std::to_string(i) + "."));
-    }
-    norm_->load_state_dict(state_dict.select("norm."));
-  }
-
-  void verify_loaded_weights(const std::string& prefix) const {
-    embed_tokens_->verify_loaded_weights(prefix + "embed_tokens.");
-    for (int i = 0; i < layers_.size(); i++) {
-      layers_[i]->verify_loaded_weights(prefix + "layers." + std::to_string(i) +
-                                        ".");
-    }
-    norm_->verify_loaded_weights(prefix + "norm.");
-  }
-
  private:
   // parameter members, must be registered
   ParallelEmbedding embed_tokens_{nullptr};
@@ -368,17 +311,6 @@ class QWen2ForCausalLMImpl : public Module {
     return lm_head_(h);
   }

-  // load the weight from the checkpoint
-  void load_state_dict(const StateDict& state_dict) {
-    model_->load_state_dict(state_dict.select("model."));
-    lm_head_->load_state_dict(state_dict.select("lm_head."));
-  }
-
-  void verify_loaded_weights() const {
-    model_->verify_loaded_weights("model.");
-    lm_head_->verify_loaded_weights("lm_head.");
-  }
-
  private:
   // parameter members, must be registered
   QWen2Model model_{nullptr};
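qwen2.h follows the same pattern as qwen.h: the checkpoint prefixes ("gate_proj."/"up_proj.", "q_proj."/"k_proj."/"v_proj.") move into the fused and QKV linear constructors, and every hand-written load_state_dict / verify_loaded_weights override is deleted. That only works if registered module names line up with checkpoint keys, which is presumably also why qwen.h renames the block container from "layers" to "h" (the old loader selected "h." + i, matching QWen checkpoint keys such as transformer.h.0.attn.c_attn.weight). A rough sketch of the kind of generic, name-driven loading this enables, written against torch::nn::Module purely for illustration (the repository's own Module and StateDict types will differ):

    #include <map>
    #include <string>
    #include <torch/torch.h>

    // Illustrative only: once every submodule is registered under the name
    // used in the checkpoint, a single recursive walk over named parameters
    // can replace per-class load_state_dict overrides.
    void load_by_registered_names(
        torch::nn::Module& root,
        const std::map<std::string, torch::Tensor>& checkpoint) {
      torch::NoGradGuard no_grad;
      for (auto& item : root.named_parameters(/*recurse=*/true)) {
        // e.g. item.key() == "transformer.h.0.ln_1.weight"
        auto it = checkpoint.find(item.key());
        if (it != checkpoint.end()) {
          item.value().copy_(it->second);
        }
      }
    }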
