Commit a9e34b1

DongheJin authored and liutongxuan committed
bugfix: fix the GLM4.5 compilation error.
1 parent 3dbb04a commit a9e34b1

File tree

10 files changed: +138 −138 lines

xllm/core/layers/attention_mask.cpp

Lines changed: 7 additions & 7 deletions
@@ -67,10 +67,10 @@ torch::Tensor AttentionMask::gen_free_mask(int32_t q_len,
   return mask_free;
 }
 
-torch::Tensor AttentionMaskImpl::gen_append_mask(int32_t q_len,
-                                                 int32_t kv_len,
-                                                 torch::Dtype dtype,
-                                                 torch::Device device) {
+torch::Tensor AttentionMask::gen_append_mask(int32_t q_len,
+                                             int32_t kv_len,
+                                             torch::Dtype dtype,
+                                             torch::Device device) {
   int diagonal = kv_len - q_len;
   auto options = torch::TensorOptions().dtype(torch::kBool).device(device);
   auto bias = torch::tril(torch::ones({q_len, kv_len}, options), diagonal);
@@ -82,9 +82,9 @@ torch::Tensor AttentionMaskImpl::gen_append_mask(int32_t q_len,
   return mask;
 }
 
-void AttentionMaskImpl::update_attn_cache(torch::Dtype dtype,
-                                          torch::Device device,
-                                          int64_t seqlen) {
+void AttentionMask::update_attn_cache(torch::Dtype dtype,
+                                      torch::Device device,
+                                      int64_t seqlen) {
   if (seqlen > seq_len_cached_ || atten_mask_cache_.dtype() != dtype) {
     seq_len_cached_ = seqlen;
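These definitions used a class name (AttentionMaskImpl) that does not match the declared AttentionMask, one source of the compilation error this commit fixes. For context, here is a minimal standalone sketch of the tril-based construction used by gen_append_mask, assuming only libtorch; the fill value and dtype handling are illustrative, not the exact xllm code:

#include <torch/torch.h>

// Sketch: query token i of the q_len appended tokens may attend to kv
// positions 0..(kv_len - q_len + i); torch::tril with a positive diagonal
// keeps exactly those entries.
torch::Tensor append_mask_sketch(int64_t q_len,
                                 int64_t kv_len,
                                 torch::Dtype dtype,
                                 torch::Device device) {
  int64_t diagonal = kv_len - q_len;
  auto bool_opts = torch::TensorOptions().dtype(torch::kBool).device(device);
  auto allowed =
      torch::tril(torch::ones({q_len, kv_len}, bool_opts), diagonal);
  // Blocked positions get a large negative bias; allowed positions get 0.
  auto mask = torch::zeros({q_len, kv_len},
                           torch::TensorOptions().dtype(dtype).device(device));
  return mask.masked_fill(~allowed, -10000.0);
}

When q_len == kv_len the diagonal is 0 and this reduces to the ordinary causal mask.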

xllm/core/layers/npu/npu_glm4_moe_decoder_layer.cpp

Lines changed: 99 additions & 93 deletions
Large diffs are not rendered by default.

xllm/core/layers/npu/npu_glm4_moe_decoder_layer.h

Lines changed: 8 additions & 11 deletions
@@ -21,12 +21,12 @@ limitations under the License.
 
 #include <nlohmann/json.hpp>
 
-#include "npu_base_layer.h"
 #include "framework/model/model_args.h"
 #include "framework/model/npu_dp_ep_padding.h"
 #include "framework/parallel_state.h"
 #include "framework/quant_args.h"
 #include "framework/state_dict/state_dict.h"
+#include "npu_base_layer.h"
 #include "xllm_kernels/models/glm/layer/moe_decoder_layer.h"
 
 namespace xllm {
@@ -35,7 +35,7 @@ namespace layer {
 class Glm4MoeDecoderImpl : public NpuBaseLayer {
  public:
   explicit Glm4MoeDecoderImpl(const ModelContext& context,
-                             const int32_t layer_id);
+                              const int32_t layer_id);
 
   ~Glm4MoeDecoderImpl() {};
 
@@ -82,21 +82,18 @@ class Glm4MoeDecoderImpl : public NpuBaseLayer {
                            const ParallelArgs& parallel_args,
                            bool is_prefill);
 
-  void initialize_attention_parameters(
-      atb_speed::moe::MoeLayerParam& param,
-      const ModelArgs& args,
-      const ParallelArgs& parallel_args);
+  void initialize_attention_parameters(atb_speed::moe::MoeLayerParam& param,
+                                       const ModelArgs& args,
+                                       const ParallelArgs& parallel_args);
 
   void initialize_mlp_parameters(atb_speed::moe::MoeLayerParam& param,
                                  const ModelArgs& args,
                                  const ParallelArgs& parallel_args);
 
-  void initialize_parallel_parameters(
-      atb_speed::moe::MoeLayerParam& param,
-      const ParallelArgs& parallel_args);
+  void initialize_parallel_parameters(atb_speed::moe::MoeLayerParam& param,
+                                      const ParallelArgs& parallel_args);
 
-  void initialize_quantization_parameters(
-      atb_speed::moe::MoeLayerParam& param);
+  void initialize_quantization_parameters(atb_speed::moe::MoeLayerParam& param);
 
   torch::Tensor get_sharded_tensor(const StateDict& state_dict,
                                    const std::string& name,

xllm/core/layers/npu/npu_qwen2_decoder_layer_impl.h

Lines changed: 1 addition & 1 deletion
@@ -165,7 +165,7 @@ class NpuQwen2DecoderLayerImpl : public NpuBaseLayer {
 
   int device_id_;
   int32_t layer_id_;
-
+
   std::vector<std::shared_ptr<at::Tensor>> prefill_tensor_storage_;
   std::vector<std::shared_ptr<at::Tensor>> decode_tensor_storage_;
   std::vector<std::shared_ptr<std::vector<int>>> prefill_vector_storage_;

xllm/core/layers/npu/npu_qwen3_moe_decoder_layer_impl.cpp

Lines changed: 5 additions & 5 deletions
@@ -663,11 +663,11 @@ void NpuQwen3MoeDecoderLayerImpl::merge_loaded_weights() {
       torch::zeros({1}, torch::kFloat16).to(device_);
 
   at_weight_tensors_[IN_QKV_BIAS_0] =
-      torch::cat({at_weight_tensors_[IN_QKV_BIAS_0],
-                  at_weight_tensors_[IN_QKV_BIAS_1],
-                  at_weight_tensors_[IN_QKV_BIAS_2]},
-                 0)
-          .contiguous();
+      torch::cat({at_weight_tensors_[IN_QKV_BIAS_0],
+                  at_weight_tensors_[IN_QKV_BIAS_1],
+                  at_weight_tensors_[IN_QKV_BIAS_2]},
+                 0)
+          .contiguous();
   at_weight_tensors_[IN_QKV_BIAS_1] =
       torch::zeros({1}, torch::kFloat16).to(device_);
   at_weight_tensors_[IN_QKV_BIAS_2] =
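The hunk above only re-indents the torch::cat call, but the pattern it touches is worth spelling out: the three per-projection biases are packed into one contiguous tensor for a fused QKV projection, and the vacated slots are overwritten with one-element placeholders. A hedged sketch of that packing, with illustrative shapes rather than the real head-count-dependent ones:

#include <torch/torch.h>
#include <iostream>

int main() {
  // Stand-ins for at_weight_tensors_[IN_QKV_BIAS_0..2]; real shapes depend
  // on the model's head counts and head dim, and live on the NPU device.
  auto q_bias = torch::randn({8});
  auto k_bias = torch::randn({4});
  auto v_bias = torch::randn({4});

  // One contiguous bias for the fused QKV projection, as in the diff above.
  auto qkv_bias = torch::cat({q_bias, k_bias, v_bias}, /*dim=*/0).contiguous();

  // The now-unused slots keep tiny placeholder tensors.
  k_bias = torch::zeros({1});
  v_bias = torch::zeros({1});

  std::cout << qkv_bias.sizes() << std::endl;  // [16]
}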

xllm/models/llm/glm4_moe.h

Lines changed: 7 additions & 7 deletions
@@ -84,9 +84,10 @@ class Glm4MoeModelImpl : public torch::nn::Module {
     device_ = options.device();
     dtype_ = options.dtype().toScalarType();
     num_speculative_tokens_ = model_args.num_speculative_tokens();
-    embed_tokens_ = register_module("embed_tokens", layer::WordEmbedding(context));
+    embed_tokens_ =
+        register_module("embed_tokens", layer::WordEmbedding(context));
 
-    atb_pos_emb_ = AtbRotaryEmbedding(context);
+    atb_pos_emb_ = layer::PosEmbedding(context);
     cos_sin_ = get_concat_rotary_embedding(64,
                                            model_args.max_position_embeddings(),
                                            model_args.rope_theta(),
@@ -127,7 +128,7 @@ class Glm4MoeModelImpl : public torch::nn::Module {
       positions = torch::tensor({0}).to(torch::kInt32).to(device_);
     }
   }
-
+
   auto h = embed_tokens_(tokens, 0);
   int64_t input_length = tokens.size(0);
   torch::Tensor expert_array = torch::arange(
@@ -162,10 +163,9 @@ class Glm4MoeModelImpl : public torch::nn::Module {
     std::vector<std::atomic<bool>*> event_flags(1, nullptr);
     if (input_params.layer_synchronizer != nullptr) {
       events[0] = input_params.layer_synchronizer->get_event(i);
-      event_flags[0] =
-          input_params.layer_synchronizer->get_event_flag(i);
+      event_flags[0] = input_params.layer_synchronizer->get_event_flag(i);
     }
-
+
     auto& layer = layers_[i];
     layer(h,
           cos_pos,
@@ -216,7 +216,7 @@ class Glm4MoeModelImpl : public torch::nn::Module {
   void set_word_embedding(std::vector<layer::WordEmbedding>& word_embedding) {
     embed_tokens_ = word_embedding[0];
   }
-
+
  private:
   torch::nn::ModuleList blocks_{nullptr};
   std::vector<Glm4MoeDecoderLayer> layers_;

xllm/models/llm/llm_model_base.h

Lines changed: 4 additions & 5 deletions
@@ -37,11 +37,10 @@ limitations under the License.
 
 namespace xllm {
 
-torch::Tensor get_concat_rotary_embedding(
-    int64_t dim,
-    int64_t seq_len,
-    double rope_theta,
-    const torch::TensorOptions& options) {
+torch::Tensor get_concat_rotary_embedding(int64_t dim,
+                                          int64_t seq_len,
+                                          double rope_theta,
+                                          const torch::TensorOptions& options) {
   auto options_new =
       torch::device(options.device()).dtype(at::ScalarType::Double);
   auto inv_freq =
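For reference, the reformatted signature opens a body that computes inverse frequencies in double precision (the options_new visible above). A minimal sketch of a concatenated cos/sin table consistent with that opening, assuming only libtorch; the exact table layout xllm consumes downstream may differ:

#include <torch/torch.h>

torch::Tensor concat_rotary_sketch(int64_t dim,
                                   int64_t seq_len,
                                   double rope_theta,
                                   const torch::TensorOptions& options) {
  // Compute in double for precision, mirroring options_new above.
  auto options_new =
      torch::device(options.device()).dtype(at::ScalarType::Double);
  // inv_freq[j] = 1 / theta^(2j / dim), the standard RoPE frequencies.
  auto inv_freq = torch::pow(rope_theta,
                             torch::arange(0, dim, 2, options_new) /
                                 static_cast<double>(dim))
                      .reciprocal();
  auto t = torch::arange(seq_len, options_new);
  auto freqs = torch::outer(t, inv_freq);  // [seq_len, dim / 2]
  // Concatenate cos and sin along the last dim -> [seq_len, dim].
  auto cos_sin = torch::cat({freqs.cos(), freqs.sin()}, /*dim=*/-1);
  return cos_sin.to(options.dtype().toScalarType());
}

The call sites in this commit pass dim = 64 (glm4_moe.h) and dim = 128 (qwen3.h), presumably the per-head rotary dimension.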

xllm/models/llm/qwen2.h

Lines changed: 1 addition & 2 deletions
@@ -35,8 +35,7 @@ TORCH_MODULE(QWen2DecoderLayer);
 class QWen2ModelImpl : public LlmModelImplBase<QWen2DecoderLayer> {
  public:
   QWen2ModelImpl(const ModelContext& context)
-      : LlmModelImplBase<QWen2DecoderLayer>("qwen2",
-                                            context.get_model_args()) {
+      : LlmModelImplBase<QWen2DecoderLayer>("qwen2", context.get_model_args()) {
     // register submodules
     auto model_args = context.get_model_args();
     auto options = context.get_tensor_options();

xllm/models/llm/qwen3.h

Lines changed: 4 additions & 5 deletions
@@ -31,8 +31,7 @@ TORCH_MODULE(QWen3DecoderLayer);
 class QWen3ModelImpl : public LlmModelImplBase<QWen3DecoderLayer> {
  public:
   QWen3ModelImpl(const ModelContext& context)
-      : LlmModelImplBase<QWen3DecoderLayer>("qwen3",
-                                            context.get_model_args()) {
+      : LlmModelImplBase<QWen3DecoderLayer>("qwen3", context.get_model_args()) {
     // register submodules
     auto model_args = context.get_model_args();
     auto options = context.get_tensor_options();
@@ -45,9 +44,9 @@ class QWen3ModelImpl : public LlmModelImplBase<QWen3DecoderLayer> {
       atb_pos_embeds_.push_back(layer::PosEmbedding(context));
     }
     cos_sin_ = get_concat_rotary_embedding(128,
-                                          model_args.max_position_embeddings(),
-                                          model_args.rope_theta(),
-                                          options);
+                                           model_args.max_position_embeddings(),
+                                           model_args.rope_theta(),
+                                           options);
     int32_t mask_value = FLAGS_enable_chunked_prefill ? -9984 : 1;
     // encode_attn_mask_ =
     //     layer::AttentionMask(options.device(),

xllm/models/models.h

Lines changed: 2 additions & 2 deletions
@@ -20,15 +20,15 @@ limitations under the License.
 #include "llm/deepseek_v2.h"      // IWYU pragma: keep
 #include "llm/deepseek_v2_mtp.h"  // IWYU pragma: keep
 #include "llm/deepseek_v3.h"      // IWYU pragma: keep
+#include "llm/glm4_moe.h"         // IWYU pragma: keep
 #include "llm/kimi_k2.h"          // IWYU pragma: keep
 #include "llm/llama.h"            // IWYU pragma: keep
 #include "llm/llama3.h"           // IWYU pragma: keep
+#include "llm/llm_model_base.h"   // IWYU pragma: keep
 #include "llm/qwen2.h"            // IWYU pragma: keep
 #include "llm/qwen3.h"            // IWYU pragma: keep
 #include "llm/qwen3_embedding.h"  // IWYU pragma: keep
 #include "llm/qwen3_moe.h"        // IWYU pragma: keep
-#include "llm/llm_model_base.h"   // IWYU pragma: keep
-#include "llm/glm4_moe.h"         // IWYU pragma: keep
 #include "vlm/minicpmv.h"         // IWYU pragma: keep
 #include "vlm/qwen2_5_vl.h"       // IWYU pragma: keep
 #endif
