Commit a9e34b1

DongheJin authored and liutongxuan committed
bugfix: fix the GLM4.5 compilation error.
1 parent 3dbb04a commit a9e34b1

File tree

10 files changed: +138 −138 lines

xllm/core/layers/attention_mask.cpp

Lines changed: 7 additions & 7 deletions
@@ -67,10 +67,10 @@ torch::Tensor AttentionMask::gen_free_mask(int32_t q_len,
   return mask_free;
 }
 
-torch::Tensor AttentionMaskImpl::gen_append_mask(int32_t q_len,
-                                                 int32_t kv_len,
-                                                 torch::Dtype dtype,
-                                                 torch::Device device) {
+torch::Tensor AttentionMask::gen_append_mask(int32_t q_len,
+                                             int32_t kv_len,
+                                             torch::Dtype dtype,
+                                             torch::Device device) {
   int diagonal = kv_len - q_len;
   auto options = torch::TensorOptions().dtype(torch::kBool).device(device);
   auto bias = torch::tril(torch::ones({q_len, kv_len}, options), diagonal);
@@ -82,9 +82,9 @@ torch::Tensor AttentionMaskImpl::gen_append_mask(int32_t q_len,
   return mask;
 }
 
-void AttentionMaskImpl::update_attn_cache(torch::Dtype dtype,
-                                          torch::Device device,
-                                          int64_t seqlen) {
+void AttentionMask::update_attn_cache(torch::Dtype dtype,
+                                      torch::Device device,
+                                      int64_t seqlen) {
   if (seqlen > seq_len_cached_ || atten_mask_cache_.dtype() != dtype) {
     seq_len_cached_ = seqlen;
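These definitions used a class name (AttentionMaskImpl) that does not match the declared AttentionMask, one source of the compilation error this commit fixes. For context, here is a minimal standalone sketch of the tril-based construction used by gen_append_mask, assuming only libtorch; the fill value and dtype handling are illustrative, not the exact xllm code:

#include <torch/torch.h>

// Sketch: query token i of the q_len appended tokens may attend to kv
// positions 0..(kv_len - q_len + i); torch::tril with a positive diagonal
// keeps exactly those entries.
torch::Tensor append_mask_sketch(int64_t q_len,
                                 int64_t kv_len,
                                 torch::Dtype dtype,
                                 torch::Device device) {
  int64_t diagonal = kv_len - q_len;
  auto bool_opts = torch::TensorOptions().dtype(torch::kBool).device(device);
  auto allowed =
      torch::tril(torch::ones({q_len, kv_len}, bool_opts), diagonal);
  // Blocked positions get a large negative bias; allowed positions get 0.
  auto mask = torch::zeros({q_len, kv_len},
                           torch::TensorOptions().dtype(dtype).device(device));
  return mask.masked_fill(~allowed, -10000.0);
}

When q_len == kv_len the diagonal is 0 and this reduces to the ordinary causal mask.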

xllm/core/layers/npu/npu_glm4_moe_decoder_layer.cpp

Lines changed: 99 additions & 93 deletions
Large diffs are not rendered by default.

xllm/core/layers/npu/npu_glm4_moe_decoder_layer.h

Lines changed: 8 additions & 11 deletions
@@ -21,12 +21,12 @@ limitations under the License.
 
 #include <nlohmann/json.hpp>
 
-#include "npu_base_layer.h"
 #include "framework/model/model_args.h"
 #include "framework/model/npu_dp_ep_padding.h"
 #include "framework/parallel_state.h"
 #include "framework/quant_args.h"
 #include "framework/state_dict/state_dict.h"
+#include "npu_base_layer.h"
 #include "xllm_kernels/models/glm/layer/moe_decoder_layer.h"
 
 namespace xllm {
@@ -35,7 +35,7 @@ namespace layer {
 class Glm4MoeDecoderImpl : public NpuBaseLayer {
  public:
   explicit Glm4MoeDecoderImpl(const ModelContext& context,
-                             const int32_t layer_id);
+                              const int32_t layer_id);
 
   ~Glm4MoeDecoderImpl() {};
 
@@ -82,21 +82,18 @@ class Glm4MoeDecoderImpl : public NpuBaseLayer {
                            const ParallelArgs& parallel_args,
                            bool is_prefill);
 
-  void initialize_attention_parameters(
-      atb_speed::moe::MoeLayerParam& param,
-      const ModelArgs& args,
-      const ParallelArgs& parallel_args);
+  void initialize_attention_parameters(atb_speed::moe::MoeLayerParam& param,
+                                       const ModelArgs& args,
+                                       const ParallelArgs& parallel_args);
 
   void initialize_mlp_parameters(atb_speed::moe::MoeLayerParam& param,
                                  const ModelArgs& args,
                                  const ParallelArgs& parallel_args);
 
-  void initialize_parallel_parameters(
-      atb_speed::moe::MoeLayerParam& param,
-      const ParallelArgs& parallel_args);
+  void initialize_parallel_parameters(atb_speed::moe::MoeLayerParam& param,
+                                      const ParallelArgs& parallel_args);
 
-  void initialize_quantization_parameters(
-      atb_speed::moe::MoeLayerParam& param);
+  void initialize_quantization_parameters(atb_speed::moe::MoeLayerParam& param);
 
   torch::Tensor get_sharded_tensor(const StateDict& state_dict,
                                    const std::string& name,

xllm/core/layers/npu/npu_qwen2_decoder_layer_impl.h

Lines changed: 1 addition & 1 deletion
@@ -165,7 +165,7 @@ class NpuQwen2DecoderLayerImpl : public NpuBaseLayer {
 
   int device_id_;
   int32_t layer_id_;
-
+
   std::vector<std::shared_ptr<at::Tensor>> prefill_tensor_storage_;
   std::vector<std::shared_ptr<at::Tensor>> decode_tensor_storage_;
   std::vector<std::shared_ptr<std::vector<int>>> prefill_vector_storage_;

xllm/core/layers/npu/npu_qwen3_moe_decoder_layer_impl.cpp

Lines changed: 5 additions & 5 deletions
@@ -663,11 +663,11 @@ void NpuQwen3MoeDecoderLayerImpl::merge_loaded_weights() {
       torch::zeros({1}, torch::kFloat16).to(device_);
 
   at_weight_tensors_[IN_QKV_BIAS_0] =
-      torch::cat({at_weight_tensors_[IN_QKV_BIAS_0],
-                  at_weight_tensors_[IN_QKV_BIAS_1],
-                  at_weight_tensors_[IN_QKV_BIAS_2]},
-                 0)
-          .contiguous();
+      torch::cat({at_weight_tensors_[IN_QKV_BIAS_0],
+                  at_weight_tensors_[IN_QKV_BIAS_1],
+                  at_weight_tensors_[IN_QKV_BIAS_2]},
+                 0)
+          .contiguous();
   at_weight_tensors_[IN_QKV_BIAS_1] =
       torch::zeros({1}, torch::kFloat16).to(device_);
   at_weight_tensors_[IN_QKV_BIAS_2] =
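The hunk above only re-indents the torch::cat call, but the pattern it touches is worth spelling out: the three per-projection biases are packed into one contiguous tensor for a fused QKV projection, and the vacated slots are overwritten with one-element placeholders. A hedged sketch of that packing, with illustrative shapes rather than the real head-count-dependent ones:

#include <torch/torch.h>
#include <iostream>

int main() {
  // Stand-ins for at_weight_tensors_[IN_QKV_BIAS_0..2]; real shapes depend
  // on the model's head counts and head dim, and live on the NPU device.
  auto q_bias = torch::randn({8});
  auto k_bias = torch::randn({4});
  auto v_bias = torch::randn({4});

  // One contiguous bias for the fused QKV projection, as in the diff above.
  auto qkv_bias = torch::cat({q_bias, k_bias, v_bias}, /*dim=*/0).contiguous();

  // The now-unused slots keep tiny placeholder tensors.
  k_bias = torch::zeros({1});
  v_bias = torch::zeros({1});

  std::cout << qkv_bias.sizes() << std::endl;  // [16]
}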

xllm/models/llm/glm4_moe.h

Lines changed: 7 additions & 7 deletions
@@ -84,9 +84,10 @@ class Glm4MoeModelImpl : public torch::nn::Module {
     device_ = options.device();
     dtype_ = options.dtype().toScalarType();
     num_speculative_tokens_ = model_args.num_speculative_tokens();
-    embed_tokens_ = register_module("embed_tokens", layer::WordEmbedding(context));
+    embed_tokens_ =
+        register_module("embed_tokens", layer::WordEmbedding(context));
 
-    atb_pos_emb_ = AtbRotaryEmbedding(context);
+    atb_pos_emb_ = layer::PosEmbedding(context);
     cos_sin_ = get_concat_rotary_embedding(64,
                                            model_args.max_position_embeddings(),
                                            model_args.rope_theta(),
@@ -127,7 +128,7 @@ class Glm4MoeModelImpl : public torch::nn::Module {
       positions = torch::tensor({0}).to(torch::kInt32).to(device_);
     }
   }
-
+
   auto h = embed_tokens_(tokens, 0);
   int64_t input_length = tokens.size(0);
   torch::Tensor expert_array = torch::arange(
@@ -162,10 +163,9 @@ class Glm4MoeModelImpl : public torch::nn::Module {
     std::vector<std::atomic<bool>*> event_flags(1, nullptr);
     if (input_params.layer_synchronizer != nullptr) {
       events[0] = input_params.layer_synchronizer->get_event(i);
-      event_flags[0] =
-          input_params.layer_synchronizer->get_event_flag(i);
+      event_flags[0] = input_params.layer_synchronizer->get_event_flag(i);
     }
-
+
     auto& layer = layers_[i];
     layer(h,
           cos_pos,
@@ -216,7 +216,7 @@ class Glm4MoeModelImpl : public torch::nn::Module {
   void set_word_embedding(std::vector<layer::WordEmbedding>& word_embedding) {
     embed_tokens_ = word_embedding[0];
   }
-
+
  private:
   torch::nn::ModuleList blocks_{nullptr};
   std::vector<Glm4MoeDecoderLayer> layers_;

xllm/models/llm/llm_model_base.h

Lines changed: 4 additions & 5 deletions
@@ -37,11 +37,10 @@ limitations under the License.
 
 namespace xllm {
 
-torch::Tensor get_concat_rotary_embedding(
-    int64_t dim,
-    int64_t seq_len,
-    double rope_theta,
-    const torch::TensorOptions& options) {
+torch::Tensor get_concat_rotary_embedding(int64_t dim,
+                                          int64_t seq_len,
+                                          double rope_theta,
+                                          const torch::TensorOptions& options) {
   auto options_new =
       torch::device(options.device()).dtype(at::ScalarType::Double);
   auto inv_freq =
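For reference, the reformatted signature opens a body that computes inverse frequencies in double precision (the options_new visible above). A minimal sketch of a concatenated cos/sin table consistent with that opening, assuming only libtorch; the exact table layout xllm consumes downstream may differ:

#include <torch/torch.h>

torch::Tensor concat_rotary_sketch(int64_t dim,
                                   int64_t seq_len,
                                   double rope_theta,
                                   const torch::TensorOptions& options) {
  // Compute in double for precision, mirroring options_new above.
  auto options_new =
      torch::device(options.device()).dtype(at::ScalarType::Double);
  // inv_freq[j] = 1 / theta^(2j / dim), the standard RoPE frequencies.
  auto inv_freq = torch::pow(rope_theta,
                             torch::arange(0, dim, 2, options_new) /
                                 static_cast<double>(dim))
                      .reciprocal();
  auto t = torch::arange(seq_len, options_new);
  auto freqs = torch::outer(t, inv_freq);  // [seq_len, dim / 2]
  // Concatenate cos and sin along the last dim -> [seq_len, dim].
  auto cos_sin = torch::cat({freqs.cos(), freqs.sin()}, /*dim=*/-1);
  return cos_sin.to(options.dtype().toScalarType());
}

The call sites in this commit pass dim = 64 (glm4_moe.h) and dim = 128 (qwen3.h), presumably the per-head rotary dimension.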

xllm/models/llm/qwen2.h

Lines changed: 1 addition & 2 deletions
@@ -35,8 +35,7 @@ TORCH_MODULE(QWen2DecoderLayer);
 class QWen2ModelImpl : public LlmModelImplBase<QWen2DecoderLayer> {
  public:
   QWen2ModelImpl(const ModelContext& context)
-      : LlmModelImplBase<QWen2DecoderLayer>("qwen2",
-                                            context.get_model_args()) {
+      : LlmModelImplBase<QWen2DecoderLayer>("qwen2", context.get_model_args()) {
     // register submodules
     auto model_args = context.get_model_args();
     auto options = context.get_tensor_options();

xllm/models/llm/qwen3.h

Lines changed: 4 additions & 5 deletions
@@ -31,8 +31,7 @@ TORCH_MODULE(QWen3DecoderLayer);
 class QWen3ModelImpl : public LlmModelImplBase<QWen3DecoderLayer> {
  public:
   QWen3ModelImpl(const ModelContext& context)
-      : LlmModelImplBase<QWen3DecoderLayer>("qwen3",
-                                            context.get_model_args()) {
+      : LlmModelImplBase<QWen3DecoderLayer>("qwen3", context.get_model_args()) {
     // register submodules
     auto model_args = context.get_model_args();
     auto options = context.get_tensor_options();
@@ -45,9 +44,9 @@ class QWen3ModelImpl : public LlmModelImplBase<QWen3DecoderLayer> {
       atb_pos_embeds_.push_back(layer::PosEmbedding(context));
     }
     cos_sin_ = get_concat_rotary_embedding(128,
-                                          model_args.max_position_embeddings(),
-                                          model_args.rope_theta(),
-                                          options);
+                                           model_args.max_position_embeddings(),
+                                           model_args.rope_theta(),
+                                           options);
     int32_t mask_value = FLAGS_enable_chunked_prefill ? -9984 : 1;
     // encode_attn_mask_ =
     //     layer::AttentionMask(options.device(),

xllm/models/models.h

Lines changed: 2 additions & 2 deletions
@@ -20,15 +20,15 @@ limitations under the License.
 #include "llm/deepseek_v2.h"      // IWYU pragma: keep
 #include "llm/deepseek_v2_mtp.h"  // IWYU pragma: keep
 #include "llm/deepseek_v3.h"      // IWYU pragma: keep
+#include "llm/glm4_moe.h"         // IWYU pragma: keep
 #include "llm/kimi_k2.h"          // IWYU pragma: keep
 #include "llm/llama.h"            // IWYU pragma: keep
 #include "llm/llama3.h"           // IWYU pragma: keep
+#include "llm/llm_model_base.h"   // IWYU pragma: keep
 #include "llm/qwen2.h"            // IWYU pragma: keep
 #include "llm/qwen3.h"            // IWYU pragma: keep
 #include "llm/qwen3_embedding.h"  // IWYU pragma: keep
 #include "llm/qwen3_moe.h"        // IWYU pragma: keep
-#include "llm/llm_model_base.h"   // IWYU pragma: keep
-#include "llm/glm4_moe.h"         // IWYU pragma: keep
 #include "vlm/minicpmv.h"         // IWYU pragma: keep
 #include "vlm/qwen2_5_vl.h"       // IWYU pragma: keep
 #endif
