refactor: make NPU causal LM implementations inherit from LlmForCausalLMImplBase; verify tied lm_head weights under the embed_tokens prefix

yingxudeng · yingxudeng · commit bc12eb7ddba4 · 2026-01-05T21:32:33.000+08:00
diff --git a/xllm/models/llm/npu/deepseek_v2.h b/xllm/models/llm/npu/deepseek_v2.h
@@ -35,6 +35,7 @@ limitations under the License.
 #include "core/layers/npu/npu_rms_norm_impl.h"
 #include "core/layers/npu/npu_word_embedding_impl.h"
 #include "core/layers/npu/rotary_embedding.h"
+#include "llm_model_base.h"
 #include "models/model_registry.h"
 // DeepSeek v2 compatible with huggingface weights
 // ref to:
@@ -265,72 +266,25 @@ class DeepseekV2ModelImpl : public torch::nn::Module {
 };
 TORCH_MODULE(DeepseekV2Model);
 
-class DeepseekV2ForCausalLMImpl : public torch::nn::Module {
+class DeepseekV2ForCausalLMImpl
+    : public LlmForCausalLMImplBase<DeepseekV2Model> {
  public:
-  DeepseekV2ForCausalLMImpl(const ModelContext& context) {
-    model_ = register_module("model", DeepseekV2Model(context));
-    npu_lm_head_ = register_module("lm_head", layer::NpuLmHead(context));
-    first_k_dense_replace_ = context.get_model_args().first_k_dense_replace();
-  }
-
-  // tokens: [num_tokens]
-  // positions: [num_tokens] token pos in the sequence
-  // returns: [num_tokens, hidden_size]
-  torch::Tensor forward(const torch::Tensor& tokens,
-                        const torch::Tensor& positions,
-                        std::vector<KVCache>& kv_caches,
-                        const ModelInputParams& input_params) {
-    return model_(tokens, positions, kv_caches, input_params);
-  }
-
-  // hidden_states: [num_tokens, hidden_size]
-  // seleted_idxes: [num_tokens]
-  // returns: [num_tokens, vocab_size]
-  torch::Tensor logits(const torch::Tensor& hidden_states,
-                       const torch::Tensor& seleted_idxes) {
-    return npu_lm_head_(hidden_states, seleted_idxes, 0);
-  }
-
-  void load_model(std::unique_ptr<ModelLoader> loader) {
-    for (const auto& state_dict : loader->get_state_dicts()) {
-      model_->load_state_dict(state_dict->get_dict_with_prefix("model."));
-      npu_lm_head_->load_state_dict(
-          state_dict->get_dict_with_prefix("lm_head."));
-    }
-
-    // verify
-    model_->verify_loaded_weights("model.");
-    npu_lm_head_->verify_loaded_weights("lm_head.");
-
-    model_->merge_loaded_weights();
-    npu_lm_head_->merge_loaded_weights();
-  }
+  DeepseekV2ForCausalLMImpl(const ModelContext& context)
+      : LlmForCausalLMImplBase<DeepseekV2Model>(context),
+        first_k_dense_replace_(
+            context.get_model_args().first_k_dense_replace()) {}
 
   void prepare_expert_weight(int32_t layer_id,
-                             const std::vector<int32_t>& expert_ids) {
+                             const std::vector<int32_t>& expert_ids) override {
     model_->prepare_expert_weight(layer_id + first_k_dense_replace_,
                                   expert_ids);
   }
 
-  void update_expert_weight(int32_t layer_id) {
+  void update_expert_weight(int32_t layer_id) override {
     model_->update_expert_weight(layer_id + first_k_dense_replace_);
   }
 
-  layer::NpuLmHead get_npu_lm_head() { return npu_lm_head_; }
-
-  void set_npu_lm_head(layer::NpuLmHead& head) { npu_lm_head_ = head; }
-
-  layer::NpuWordEmbedding get_npu_word_embedding() {
-    return model_->get_npu_word_embedding();
-  }
-
-  void set_npu_word_embedding(layer::NpuWordEmbedding& npu_word_embedding) {
-    model_->set_npu_word_embedding(npu_word_embedding);
-  }
-
  private:
-  DeepseekV2Model model_{nullptr};
-  layer::NpuLmHead npu_lm_head_{nullptr};
   int32_t first_k_dense_replace_;
 };
 TORCH_MODULE(DeepseekV2ForCausalLM);
diff --git a/xllm/models/llm/npu/deepseek_v32.h b/xllm/models/llm/npu/deepseek_v32.h
@@ -15,24 +15,9 @@ limitations under the License.
 
 #pragma once
 
-#include <torch/torch.h>
-
-#include <string>
-#include <vector>
-
-#include "core/common/global_flags.h"
-#include "core/framework/kv_cache/kv_cache.h"
-#include "core/framework/model/model_input_params.h"
-#include "core/framework/model/npu_dp_ep_padding.h"
-#include "core/framework/model_context.h"
-#include "core/layers/common/attention_mask.h"
-#include "core/layers/common/rotary_embedding_util.h"
 #include "core/layers/npu/npu_deepseek_v32_decoder_layer_impl.h"
-#include "core/layers/npu/npu_lm_head_impl.h"
-#include "core/layers/npu/npu_pos_embedding_impl.h"
-#include "core/layers/npu/npu_rms_norm_impl.h"
-#include "core/layers/npu/npu_word_embedding_impl.h"
-#include "models/model_registry.h"
+#include "llm_model_base.h"
+
 // DeepSeek v32 compatible with huggingface weights
 // ref to:
 // https://github.com/vllm-project/vllm/blob/v0.6.6/vllm/model_executor/models/deepseek_v2.py
@@ -263,72 +248,25 @@ class DeepseekV32ModelImpl : public torch::nn::Module {
 };
 TORCH_MODULE(DeepseekV32Model);
 
-class DeepseekV32ForCausalLMImpl : public torch::nn::Module {
+class DeepseekV32ForCausalLMImpl
+    : public LlmForCausalLMImplBase<DeepseekV32Model> {
  public:
-  DeepseekV32ForCausalLMImpl(const ModelContext& context) {
-    model_ = register_module("model", DeepseekV32Model(context));
-    npu_lm_head_ = register_module("lm_head", layer::NpuLmHead(context));
-    first_k_dense_replace_ = context.get_model_args().first_k_dense_replace();
-  }
-
-  // tokens: [num_tokens]
-  // positions: [num_tokens] token pos in the sequence
-  // returns: [num_tokens, hidden_size]
-  torch::Tensor forward(const torch::Tensor& tokens,
-                        const torch::Tensor& positions,
-                        std::vector<KVCache>& kv_caches,
-                        const ModelInputParams& input_params) {
-    return model_(tokens, positions, kv_caches, input_params);
-  }
-
-  // hidden_states: [num_tokens, hidden_size]
-  // seleted_idxes: [num_tokens]
-  // returns: [num_tokens, vocab_size]
-  torch::Tensor logits(const torch::Tensor& hidden_states,
-                       const torch::Tensor& seleted_idxes) {
-    return npu_lm_head_(hidden_states, seleted_idxes, 0);
-  }
-
-  void load_model(std::unique_ptr<ModelLoader> loader) {
-    for (const auto& state_dict : loader->get_state_dicts()) {
-      model_->load_state_dict(state_dict->get_dict_with_prefix("model."));
-      npu_lm_head_->load_state_dict(
-          state_dict->get_dict_with_prefix("lm_head."));
-    }
-
-    // verify
-    model_->verify_loaded_weights("model.");
-    npu_lm_head_->verify_loaded_weights("lm_head.");
-
-    model_->merge_loaded_weights();
-    npu_lm_head_->merge_loaded_weights();
-  }
+  DeepseekV32ForCausalLMImpl(const ModelContext& context)
+      : LlmForCausalLMImplBase<DeepseekV32Model>(context),
+        first_k_dense_replace_(
+            context.get_model_args().first_k_dense_replace()) {}
 
   void prepare_expert_weight(int32_t layer_id,
-                             const std::vector<int32_t>& expert_ids) {
+                             const std::vector<int32_t>& expert_ids) override {
     model_->prepare_expert_weight(layer_id + first_k_dense_replace_,
                                   expert_ids);
   }
 
-  void update_expert_weight(int32_t layer_id) {
+  void update_expert_weight(int32_t layer_id) override {
     model_->update_expert_weight(layer_id + first_k_dense_replace_);
   }
 
-  layer::NpuLmHead get_npu_lm_head() { return npu_lm_head_; }
-
-  void set_npu_lm_head(layer::NpuLmHead& head) { npu_lm_head_ = head; }
-
-  layer::NpuWordEmbedding get_npu_word_embedding() {
-    return model_->get_npu_word_embedding();
-  }
-
-  void set_npu_word_embedding(layer::NpuWordEmbedding& npu_word_embedding) {
-    model_->set_npu_word_embedding(npu_word_embedding);
-  }
-
  private:
-  DeepseekV32Model model_{nullptr};
-  layer::NpuLmHead npu_lm_head_{nullptr};
   int32_t first_k_dense_replace_;
 };
 TORCH_MODULE(DeepseekV32ForCausalLM);
diff --git a/xllm/models/llm/npu/glm4_moe.h b/xllm/models/llm/npu/glm4_moe.h
@@ -286,74 +286,10 @@ class Glm4MoeModelImpl : public torch::nn::Module {
 };
 TORCH_MODULE(Glm4MoeModel);
 
-class Glm4MoeForCausalLMImpl : public torch::nn::Module {
+class Glm4MoeForCausalLMImpl : public LlmForCausalLMImplBase<Glm4MoeModel> {
  public:
-  Glm4MoeForCausalLMImpl(const ModelContext& context) {
-    model_ = register_module("model", Glm4MoeModel(context));
-    npu_lm_head_ = register_module("lm_head", layer::NpuLmHead(context));
-  }
-
-  torch::Tensor get_input_embeddings(torch::Tensor input_ids) {
-    return model_->get_input_embeddings(input_ids);
-  }
-
-  // tokens: [num_tokens]
-  // positions: [num_tokens] token pos in the sequence
-  // returns: [num_tokens, hidden_size]
-  torch::Tensor forward(const torch::Tensor& tokens,
-                        const torch::Tensor& positions,
-                        std::vector<KVCache>& kv_caches,
-                        const ModelInputParams& input_params) {
-    return model_(tokens, positions, kv_caches, input_params);
-  }
-
-  // hidden_states: [num_tokens, hidden_size]
-  // seleted_idxes: [num_tokens]
-  // returns: [num_tokens, vocab_size]
-  torch::Tensor logits(const torch::Tensor& hidden_states,
-                       const torch::Tensor& seleted_idxes) {
-    // select tokens if provided
-    auto h = hidden_states;
-    return npu_lm_head_(hidden_states, seleted_idxes, 0);
-  }
-
-  void load_model(std::unique_ptr<ModelLoader> loader,
-                  std::string prefix = "model." /*llm model weight prefix*/) {
-    for (const auto& state_dict : loader->get_state_dicts()) {
-      model_->load_state_dict(state_dict->get_dict_with_prefix(prefix));
-      npu_lm_head_->load_state_dict(
-          state_dict->get_dict_with_prefix("lm_head."));
-    }
-
-    // verify
-    model_->verify_loaded_weights(prefix);
-    npu_lm_head_->verify_loaded_weights("lm_head.");
-
-    model_->merge_loaded_weights();
-    npu_lm_head_->merge_loaded_weights();
-  }
-
-  virtual void prepare_expert_weight(int32_t layer_id,
-                                     const std::vector<int32_t>& expert_ids) {
-    return;
-  }
-  virtual void update_expert_weight(int32_t layer_id) { return; }
-
-  layer::NpuLmHead get_npu_lm_head() { return npu_lm_head_; }
-
-  void set_npu_lm_head(layer::NpuLmHead& head) { npu_lm_head_ = head; }
-
-  layer::NpuWordEmbedding get_npu_word_embedding() {
-    return model_->get_npu_word_embedding();
-  }
-
-  void set_npu_word_embedding(layer::NpuWordEmbedding& npu_word_embedding) {
-    model_->set_npu_word_embedding(npu_word_embedding);
-  }
-
- private:
-  Glm4MoeModel model_{nullptr};
-  layer::NpuLmHead npu_lm_head_{nullptr};
+  Glm4MoeForCausalLMImpl(const ModelContext& context)
+      : LlmForCausalLMImplBase<Glm4MoeModel>(context) {}
 };
 TORCH_MODULE(Glm4MoeForCausalLM);
 
diff --git a/xllm/models/llm/npu/llama.h b/xllm/models/llm/npu/llama.h
@@ -30,6 +30,7 @@ limitations under the License.
 #include "core/layers/npu/npu_llama_decoder_layer_impl.h"
 #include "core/layers/npu/npu_rms_norm_impl.h"
 #include "core/util/tensor_helper.h"
+#include "llm_model_base.h"
 #include "models/model_registry.h"
 #include "xllm_kernels/core/include/atb_speed/log.h"
 
@@ -234,72 +235,10 @@ class LlamaModelImpl : public torch::nn::Module {
 };
 TORCH_MODULE(LlamaModel);
 
-class LlamaForCausalLMImpl : public torch::nn::Module {
+class LlamaForCausalLMImpl : public LlmForCausalLMImplBase<LlamaModel> {
  public:
-  LlamaForCausalLMImpl(const ModelContext& context) {
-    auto options = context.get_tensor_options();
-
-    // register submodules
-    model_ = register_module("model", LlamaModel(context));
-    device_id_ = options.device().index();
-    npu_lm_head_ = register_module("lm_head", layer::NpuLmHead(context));
-  }
-  // tokens: [num_tokens]
-  // positions: [num_tokens] token pos in the sequence
-  // returns: [num_tokens, hidden_size]
-  torch::Tensor forward(const torch::Tensor& tokens,
-                        const torch::Tensor& positions,
-                        std::vector<KVCache>& kv_caches,
-                        const ModelInputParams& input_params) {
-    return model_(tokens, positions, kv_caches, input_params);
-  }
-
-  // hidden_states: [num_tokens, hidden_size]
-  // seleted_idxes: [num_tokens]
-  // returns: [num_tokens, vocab_size]
-  torch::Tensor logits(const torch::Tensor& hidden_states,
-                       const torch::Tensor& seleted_idxes) {
-    return npu_lm_head_(hidden_states, seleted_idxes, 0);
-  }
-
-  void load_model(std::unique_ptr<ModelLoader> loader) {
-    for (const auto& state_dict : loader->get_state_dicts()) {
-      model_->load_state_dict(state_dict->get_dict_with_prefix("model."));
-      npu_lm_head_->load_state_dict(
-          state_dict->get_dict_with_prefix("lm_head."));
-    }
-
-    // verify
-    model_->verify_loaded_weights("model.");
-    npu_lm_head_->verify_loaded_weights("lm_head.");
-
-    model_->merge_loaded_weights();
-    npu_lm_head_->merge_loaded_weights();
-  }
-
-  void prepare_expert_weight(int32_t layer_id,
-                             const std::vector<int32_t>& expert_ids) {
-    return;
-  }
-  void update_expert_weight(int32_t layer_id) { return; }
-
-  layer::NpuLmHead get_npu_lm_head() { return npu_lm_head_; }
-
-  void set_npu_lm_head(layer::NpuLmHead& head) { npu_lm_head_ = head; }
-
-  layer::NpuWordEmbedding get_npu_word_embedding() {
-    return model_->get_npu_word_embedding();
-  }
-
-  void set_npu_word_embedding(layer::NpuWordEmbedding& npu_word_embedding) {
-    model_->set_npu_word_embedding(npu_word_embedding);
-  }
-
- private:
-  // parameter members, must be registered
-  LlamaModel model_{nullptr};
-  int device_id_ = 0;
-  layer::NpuLmHead npu_lm_head_{nullptr};
+  LlamaForCausalLMImpl(const ModelContext& context)
+      : LlmForCausalLMImplBase<LlamaModel>(context) {}
 };
 TORCH_MODULE(LlamaForCausalLM);
 
diff --git a/xllm/models/llm/npu/llm_model_base.h b/xllm/models/llm/npu/llm_model_base.h
@@ -341,7 +341,11 @@ class LlmForCausalLMImplBase : public torch::nn::Module {
 
     // verify
     model_->verify_loaded_weights(prefix);
-    npu_lm_head_->verify_loaded_weights("lm_head.");
+    if (tie_word_embeddings) {
+      npu_lm_head_->verify_loaded_weights(prefix + "embed_tokens.");
+    } else {
+      npu_lm_head_->verify_loaded_weights("lm_head.");
+    }
 
     model_->merge_loaded_weights();
     // test
diff --git a/xllm/models/llm/npu/qwen3_moe.h b/xllm/models/llm/npu/qwen3_moe.h