Commit cd36cfc

feat: support qwen2_5_vl, qwen3_vl, qwen3_vl_moe on mlu device. (jd-opensource#450)
Co-authored-by: guoxueting <[email protected]>
1 parent 9f4715f commit cd36cfc

36 files changed: 812 additions, 83 deletions

xllm/core/framework/parallel_state/parallel_state.cpp

Lines changed: 39 additions & 8 deletions
@@ -77,12 +77,12 @@ torch::Tensor gather(const torch::Tensor& input,
   if (!process_group) {
     return input;
   }
-  const auto world_size = process_group->world_size();
+  const int32_t world_size = process_group->world_size();
   if (world_size == 1) {
     return input;
   }

-  const auto rank = process_group->rank();
+  const int32_t rank = process_group->rank();
   std::vector<torch::Tensor> tensors(world_size);
   for (int64_t i = 0; i < world_size; ++i) {
     tensors[i] = torch::empty_like(input);
@@ -98,8 +98,8 @@ torch::Tensor gather(const torch::Tensor& input,
   if (!process_group) {
     return input;
   }
-  const auto world_size = process_group->world_size();
-  const auto rank = process_group->rank();
+  const int32_t world_size = process_group->world_size();
+  const int32_t rank = process_group->rank();
   if (world_size == 1) {
     return input;
   }
@@ -131,11 +131,42 @@ torch::Tensor gather(const torch::Tensor& input,
       gathered_input, max_num_tokens, token_num_list);
 }

+torch::Tensor all_gather_interleaved(const torch::Tensor& input,
+                                     ProcessGroup* process_group) {
+  if (!process_group) {
+    return input;
+  }
+  const int32_t world_size = process_group->world_size();
+  const int32_t rank = process_group->rank();
+  if (world_size == 1) {
+    return input;
+  }
+
+  std::vector<torch::Tensor> gathered_tensors(world_size);
+  for (int64_t i = 0; i < world_size; ++i) {
+    gathered_tensors[i] = torch::empty_like(input);
+  }
+  process_group->allgather(input, gathered_tensors);
+
+  int32_t dim = -1;
+  size_t num_chunks = 3;
+  std::vector<torch::Tensor> ordered_tensors;
+  int64_t shard_size = input.size(dim) / num_chunks;
+  for (size_t i = 0; i < num_chunks; ++i) {
+    for (size_t j = 0; j < world_size; ++j) {
+      auto shard_tensor =
+          gathered_tensors[j].slice(dim, shard_size * i, shard_size * (i + 1));
+      ordered_tensors.push_back(shard_tensor);
+    }
+  }
+  return torch::cat(ordered_tensors, dim).contiguous();
+}
+
 torch::Tensor reduce(torch::Tensor& input, ProcessGroup* process_group) {
   if (!process_group) {
     return input;
   }
-  const auto world_size = process_group->world_size();
+  const int32_t world_size = process_group->world_size();
   if (world_size == 1) {
     return input;
   }
@@ -149,20 +180,20 @@ torch::Tensor scatter(torch::Tensor input,
   if (!process_group) {
     return input;
   }
-  const auto world_size = process_group->world_size();
+  const int32_t world_size = process_group->world_size();
   if (world_size == 1) {
     return input;
   }

   // get the size for last dimension
-  const auto dim_size = input.size(dim);
+  const int32_t dim_size = input.size(dim);
   CHECK(dim_size % world_size == 0)
       << "dim_size " << dim_size << " cannot be divided by world_size "
       << world_size;

   // torch::split does not create contiguous tensors by default.
   const auto tensor_list = input.split(dim_size / world_size, dim);
-  const auto rank = process_group->rank();
+  const int32_t rank = process_group->rank();
   return tensor_list[rank];
 }
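
Note on the new helper: a plain allgather of column-parallel shards returns rank-major data ([q0 k0 v0 | q1 k1 v1 | ...] for a fused QKV projection), while downstream consumers expect chunk-major order ([q0 q1 | k0 k1 | v0 v1 | ...]); the double loop above performs exactly that reordering, with num_chunks hard-coded to 3. A stand-alone CPU sketch of the reordering (hypothetical helper name; the allgather result is stubbed with precomputed per-rank tensors):

#include <torch/torch.h>
#include <iostream>
#include <vector>

// Reorders rank-major fused shards into chunk-major order, mirroring the
// slice/cat loop in all_gather_interleaved without a ProcessGroup.
torch::Tensor reorder_interleaved(const std::vector<torch::Tensor>& gathered,
                                  int64_t num_chunks = 3,
                                  int64_t dim = -1) {
  const int64_t world_size = gathered.size();
  const int64_t shard_size = gathered[0].size(dim) / num_chunks;
  std::vector<torch::Tensor> ordered;
  for (int64_t i = 0; i < num_chunks; ++i) {    // chunk index: q, k, v
    for (int64_t j = 0; j < world_size; ++j) {  // source rank
      ordered.push_back(
          gathered[j].slice(dim, shard_size * i, shard_size * (i + 1)));
    }
  }
  return torch::cat(ordered, dim).contiguous();
}

int main() {
  // Rank 0 holds [q0 k0 v0]; rank 1 holds [q1 k1 v1] along the last dim.
  auto r0 = torch::tensor({{0, 10, 20}}, torch::kLong);
  auto r1 = torch::tensor({{1, 11, 21}}, torch::kLong);
  // Prints [[0, 1, 10, 11, 20, 21]], i.e. q0 q1 | k0 k1 | v0 v1.
  std::cout << reorder_interleaved({r0, r1}) << std::endl;
}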

xllm/core/framework/parallel_state/parallel_state.h

Lines changed: 3 additions & 0 deletions
@@ -33,6 +33,9 @@ torch::Tensor gather(const torch::Tensor& input,
                      ProcessGroup* process_group,
                      const std::vector<int32_t>& token_num_list);

+torch::Tensor all_gather_interleaved(const torch::Tensor& input,
+                                     ProcessGroup* process_group);
+
 torch::Tensor reduce(torch::Tensor& input, ProcessGroup* process_group);

 torch::Tensor scatter(torch::Tensor input,

xllm/core/framework/state_dict/utils.cpp

Lines changed: 33 additions & 0 deletions
@@ -243,6 +243,39 @@ void load_moe_fused_weight(const StateDict& state_dict,
   }
 }

+void load_merged_weight(const StateDict& state_dict,
+                        const std::string& name,
+                        int64_t dim,
+                        int32_t rank,
+                        int32_t world_size,
+                        int32_t shard_tensor_count,
+                        int64_t shard_size,
+                        torch::Tensor& weight,
+                        bool& weight_is_loaded) {
+  if (weight_is_loaded) {
+    return;
+  }
+  const auto& tensor = state_dict.get_tensor(name);
+  if (!tensor.defined()) {
+    return;
+  }
+  CHECK_EQ(tensor.size(dim), shard_tensor_count * shard_size * world_size)
+      << name << "[" << dim << "] size mismatch for " << state_dict.prefix()
+      << name;
+  std::vector<torch::Tensor> shard_tensors;
+  for (size_t shard_id = 0; shard_id < shard_tensor_count; shard_id++) {
+    int64_t shard_offset =
+        shard_id * shard_size * world_size + rank * shard_size;
+    shard_tensors.push_back(
+        tensor.slice(dim, shard_offset, shard_offset + shard_size));
+  }
+  auto merged_weight = torch::cat(shard_tensors, dim);
+  CHECK_EQ(weight.sizes(), merged_weight.sizes())
+      << "weight size mismatch for " << state_dict.prefix() << name;
+  weight.copy_(merged_weight);
+  weight_is_loaded = true;
+}
+
 }  // namespace weight

 }  // namespace xllm
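
Worked note on the offset arithmetic: the checkpoint tensor is assumed to hold shard_tensor_count logical parts back to back, each part itself split evenly across world_size ranks, so rank r's slice of part s starts at s * shard_size * world_size + r * shard_size. A small sketch that just prints the slice ranges for hypothetical sizes (world_size = 4, shard_tensor_count = 2, shard_size = 8):

#include <cstdint>
#include <cstdio>

int main() {
  // Hypothetical layout: [part0 | part1], each part split across 4 ranks.
  const int32_t world_size = 4;
  const int32_t shard_tensor_count = 2;
  const int64_t shard_size = 8;  // elements per rank per part along `dim`
  for (int32_t rank = 0; rank < world_size; ++rank) {
    for (int32_t shard_id = 0; shard_id < shard_tensor_count; ++shard_id) {
      const int64_t offset =
          shard_id * shard_size * world_size + rank * shard_size;
      std::printf("rank %d, part %d -> [%lld, %lld)\n", rank, shard_id,
                  (long long)offset, (long long)(offset + shard_size));
    }
  }
  // Rank 0 takes [0, 8) from part0 and [32, 40) from part1; concatenating
  // the two slices along `dim` reproduces its local merged weight.
}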

xllm/core/framework/state_dict/utils.h

Lines changed: 21 additions & 0 deletions
@@ -93,6 +93,17 @@ void load_moe_fused_weight(const StateDict& state_dict,
                            bool& w1_is_loaded,
                            bool& w3_is_loaded,
                            bool& w13_is_loaded);
+
+void load_merged_weight(const StateDict& state_dict,
+                        const std::string& name,
+                        int64_t dim,
+                        int32_t rank,
+                        int32_t world_size,
+                        int32_t shard_tensor_count,
+                        int64_t shard_size,
+                        torch::Tensor& weight,
+                        bool& weight_is_loaded);
+
 }  // namespace weight

 // helper macros for defining and loading weights
@@ -173,4 +184,14 @@ void load_moe_fused_weight(const StateDict& state_dict,
                            w3##_is_loaded_,        \
                            w13##_is_loaded_);

+#define LOAD_MERGED_WEIGHT(name, dim)              \
+  weight::load_merged_weight(state_dict,           \
+                             #name,                \
+                             dim,                  \
+                             rank,                 \
+                             world_size,           \
+                             shard_tensor_count,   \
+                             shard_size,           \
+                             name##_,              \
+                             name##_is_loaded_);
 } // namespace xllm
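
A hypothetical call site for the new macro, following the pattern of the other loader macros in this header (the member name qkv_proj_weight and the in-scope variables are illustrative, not taken from this commit):

// Expands to weight::load_merged_weight(state_dict, "qkv_proj_weight", 0,
//     rank, world_size, shard_tensor_count, shard_size,
//     qkv_proj_weight_, qkv_proj_weight_is_loaded_);
// so state_dict, rank, world_size, shard_tensor_count and shard_size must
// all be in scope where the macro is used.
LOAD_MERGED_WEIGHT(qkv_proj_weight, /*dim=*/0);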

xllm/core/kernels/mlu/active.cpp

Lines changed: 9 additions & 2 deletions
@@ -25,13 +25,20 @@ void active(const torch::Tensor& input,
             bool is_gated,
             int64_t start_expert_id,
             int64_t expert_size) {
+  std::string hidden_act = act_mode;
+  // TODO: act_mode gelu_pytorch_tanh is not supported yet.
+  std::string gelu_approximate = "none";
+  if (act_mode == "gelu_pytorch_tanh") {
+    hidden_act = "gelu";
+    gelu_approximate = "tanh";
+  }
   tmo::torch_api::active(input,
                          output,
                          bias,
                          cusum_token_count,
-                         act_mode,
+                         hidden_act,
                          is_gated,
                          start_expert_id,
                          expert_size);
 }
-}  // namespace xllm::kernel::mlu
+}  // namespace xllm::kernel::mlu
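
For reference, gelu_pytorch_tanh is the Hugging Face activation name for GELU with the tanh approximation; the shim above maps it to hidden_act = "gelu" plus gelu_approximate = "tanh" (the latter is computed but not yet forwarded, per the TODO). A stand-alone sketch of the approximation itself, assuming the standard formula:

#include <cmath>
#include <cstdio>

// Tanh approximation of GELU, i.e. what "gelu_pytorch_tanh" denotes:
//   0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))
double gelu_tanh(double x) {
  const double kSqrt2OverPi = std::sqrt(2.0 / 3.14159265358979323846);
  return 0.5 * x *
         (1.0 + std::tanh(kSqrt2OverPi * (x + 0.044715 * x * x * x)));
}

int main() {
  for (double x : {-2.0, -1.0, 0.0, 1.0, 2.0}) {
    std::printf("gelu_tanh(%+.1f) = %+.6f\n", x, gelu_tanh(x));
  }
}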

xllm/core/kernels/mlu/mlu_ops_api.h

Lines changed: 1 addition & 1 deletion
@@ -31,7 +31,7 @@ void apply_rotary(torch::Tensor& q,
                   const torch::Tensor& sin,
                   const torch::Tensor& cos,
                   const std::optional<torch::Tensor>& position_ids,
-                  const torch::Tensor& cu_query_lens,
+                  const std::optional<torch::Tensor>& cu_query_lens,
                   bool interleaved,
                   bool discrete,
                   bool dynamic_ntk,

xllm/core/kernels/mlu/rope.cpp

Lines changed: 1 addition & 1 deletion
@@ -22,7 +22,7 @@ void apply_rotary(torch::Tensor& q,
                   const torch::Tensor& sin,
                   const torch::Tensor& cos,
                   const std::optional<torch::Tensor>& position_ids,
-                  const torch::Tensor& cu_query_lens,
+                  const std::optional<torch::Tensor>& cu_query_lens,
                   bool interleaved,
                   bool discrete,
                   bool dynamic_ntk,

xllm/core/kernels/param.h

Lines changed: 1 addition & 1 deletion
@@ -55,7 +55,7 @@ struct RotaryParams {
   // Required in pack mode (when q/k are 3D). Size should be [batch_size + 1].
   // Note: In current MLU implementation, this is always passed to underlying
   // API.
-  torch::Tensor cu_query_lens;
+  std::optional<torch::Tensor> cu_query_lens;
   // Whether to use interleaved rotary embedding pattern.
   bool interleaved;
   // Whether to use discrete position mode. If true, position_ids must be
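
With the field optional, pack-mode callers still supply the [batch_size + 1] prefix-sum tensor while other callers can now pass std::nullopt instead of a dummy tensor. A minimal sketch of the two shapes (trimmed stand-in struct; only the fields shown in this diff are reproduced):

#include <torch/torch.h>
#include <optional>

// Stand-in with just the fields visible in this hunk; the real RotaryParams
// has more members.
struct RotaryParamsSketch {
  std::optional<torch::Tensor> cu_query_lens;  // [batch_size + 1] in pack mode
  bool interleaved = false;
};

int main() {
  RotaryParamsSketch pack_mode, batch_mode;
  // Pack mode: q/k are 3D; cumulative query lengths for 2 sequences (5, 7).
  pack_mode.cu_query_lens = torch::tensor({0, 5, 12}, torch::kInt32);
  // Non-pack mode: no pack metadata required.
  batch_mode.cu_query_lens = std::nullopt;
}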

xllm/core/layers/common/CMakeLists.txt

Lines changed: 4 additions & 0 deletions
@@ -5,23 +5,27 @@ cc_library(
   common_layers
   HDRS
     qwen2_attention.h
+    qwen2_vision_attention.h
     fuse_norm.h
     rotary_embedding.h
     fused_moe.h
     dense_mlp.h
     qwen2_decoder_layer.h
+    qwen2_5_vision_layer.h
     qwen3_moe_decoder_layer.h
     linear.h
     word_embedding_impl.h
     layer_utils.h
     indexer.h
   SRCS
     qwen2_attention.cpp
+    qwen2_vision_attention.cpp
     fuse_norm.cpp
     rotary_embedding.cpp
     fused_moe.cpp
     dense_mlp.cpp
     qwen2_decoder_layer.cpp
+    qwen2_5_vision_layer.cpp
     qwen3_moe_decoder_layer.cpp
     linear.cpp
     word_embedding_impl.cpp

xllm/core/layers/common/dense_mlp.cpp

Lines changed: 16 additions & 1 deletion
@@ -57,10 +57,11 @@ DenseMLPImpl::DenseMLPImpl(int64_t hidden_size,
   }

   // 1. gate + up
+  int64_t out_feature = is_gated_ ? intermediate_size_ * 2 : intermediate_size_;
   gate_up_proj_ =
       register_module("gate_up_proj",
                       ColumnParallelLinear(hidden_size,
-                                           intermediate_size_ * 2,
+                                           out_feature,
                                            /*bias=*/has_bias,
                                            /*gather_output=*/false,
                                            quant_args,
@@ -111,5 +112,19 @@ void DenseMLPImpl::load_state_dict(const StateDict& state_dict) {
   down_proj_->load_state_dict(state_dict.get_dict_with_prefix("down_proj."));
 }

+void DenseMLPImpl::load_state_dict(const StateDict& state_dict,
+                                   const std::vector<std::string>& gate_up_name,
+                                   const std::string& down_name) {
+  if (is_gated_) {
+    CHECK_EQ(gate_up_name.size(), 2);
+    gate_up_proj_->load_state_dict(state_dict, gate_up_name);
+  } else {
+    CHECK_EQ(gate_up_name.size(), 1);
+    gate_up_proj_->load_state_dict(
+        state_dict.get_dict_with_prefix(gate_up_name[0]));
+  }
+  down_proj_->load_state_dict(state_dict.get_dict_with_prefix(down_name));
+}
+
 }  // namespace layer
 }  // namespace xllm
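
The sizing change follows from the forward math: a gated MLP multiplies act(gate_proj(x)) elementwise with up_proj(x), so the fused gate_up projection needs 2 * intermediate_size output features, while a non-gated MLP needs only intermediate_size. A functional sketch of the two paths in plain libtorch (illustrative names, independent of the ColumnParallelLinear wrapper; the activation choices are examples only):

#include <torch/torch.h>

// w_gate_up: [out_feature, hidden]; w_down: [hidden, intermediate].
torch::Tensor dense_mlp(const torch::Tensor& x,
                        const torch::Tensor& w_gate_up,
                        const torch::Tensor& w_down,
                        bool is_gated) {
  auto h = torch::nn::functional::linear(x, w_gate_up);
  if (is_gated) {
    // out_feature == 2 * intermediate: split into gate and up halves.
    auto parts = h.chunk(2, /*dim=*/-1);
    h = torch::silu(parts[0]) * parts[1];
  } else {
    // out_feature == intermediate: single activation over the projection.
    h = torch::gelu(h);
  }
  return torch::nn::functional::linear(h, w_down);
}

int main() {
  const int64_t hidden = 16, inter = 32;
  auto x = torch::randn({2, hidden});
  auto gated = dense_mlp(x, torch::randn({2 * inter, hidden}),
                         torch::randn({hidden, inter}), /*is_gated=*/true);
  auto plain = dense_mlp(x, torch::randn({inter, hidden}),
                         torch::randn({hidden, inter}), /*is_gated=*/false);
}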
