
Commit f2baa8d

refactor: add MTP template for future support. (#627)
1 parent 6493d8e · commit f2baa8d

17 files changed · +524 −678 lines

xllm/core/framework/model/model_input_params.h

Lines changed: 2 additions & 0 deletions
@@ -257,6 +257,7 @@ struct ModelInputParams {
     params.layer_synchronizer = layer_synchronizer;
 #endif
     params.expert_load_data = expert_load_data;
+    params.expert_array = expert_array;
 
     params.swap_blocks = std::move(swap_blocks);
 
@@ -401,6 +402,7 @@ struct ModelInputParams {
   DpEpPaddingData dp_ep_padding_data;
 
   torch::Tensor expert_load_data;
+  torch::Tensor expert_array;
 
   torch::Tensor kv_cache_tokens_nums;
   std::vector<int32_t> kv_cache_tokens_nums_host;
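
With this change, expert_array travels inside ModelInputParams alongside expert_load_data instead of being threaded through every decoder layer's forward() signature. A minimal sketch of the resulting calling pattern; build_expert_array() is a hypothetical stand-in for however the routing tensor is actually produced upstream:

  // Sketch only: populate the routing tensor once, next to the other
  // per-step inputs, rather than passing it as a separate argument.
  ModelInputParams params;
  params.expert_load_data = expert_load_data;
  params.expert_array = build_expert_array(batch);  // hypothetical helper

  // Downstream layers read input_params.expert_array themselves, so each
  // layer forward() signature shrinks by one parameter (see the diffs below).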

xllm/core/layers/npu/npu_glm4_moe_decoder_layer.cpp

Lines changed: 3 additions & 7 deletions
@@ -344,7 +344,6 @@ torch::Tensor NpuGlm4MoeDecoderImpl::forward(
     torch::Tensor& attn_mask,
     KVCache& kv_cache,
     const ModelInputParams& input_params,
-    torch::Tensor& expert_array,
     aclrtEvent* event,
     std::atomic<bool>* event_flag,
     int node_id) {
@@ -357,11 +356,10 @@ torch::Tensor NpuGlm4MoeDecoderImpl::forward(
                             attn_mask,
                             kv_cache,
                             input_params,
-                            expert_array,
                             true);
     st = execute_node(prefill_node_, node_id, event, event_flag);
     LOG_IF(FATAL, st != 0) << model_name_
-                           << "excute prefill layer fail, error code: " << st;
+                           << " excute prefill layer fail, error code: " << st;
   } else {
     build_node_variant_pack(decode_node_,
                             x,
@@ -370,11 +368,10 @@ torch::Tensor NpuGlm4MoeDecoderImpl::forward(
                             /*attn_mask*/ tensor_placeholder_,
                             kv_cache,
                             input_params,
-                            expert_array,
                             false);
     st = execute_node(decode_node_, node_id + 1000, event, event_flag);
     LOG_IF(FATAL, st != 0) << model_name_
-                           << "excute decode layer fail, error code: " << st;
+                           << " excute decode layer fail, error code: " << st;
   }
 
   return tensor_placeholder_;
@@ -388,7 +385,6 @@ void NpuGlm4MoeDecoderImpl::build_node_variant_pack(
     torch::Tensor& attn_mask,
     KVCache& kv_cache,
     const ModelInputParams& input_params,
-    torch::Tensor& expert_array,
     bool is_prefill) {
   internal_tensor_ = atb_speed::Utils::AtTensor2Tensor(x);
   auto& dp_ep_padding = input_params.dp_ep_padding_data;
@@ -421,7 +417,7 @@ void NpuGlm4MoeDecoderImpl::build_node_variant_pack(
       atb_speed::Utils::AtTensor2Tensor(input_params.new_cache_slots);
 
   node.variantPack.inTensors.at(WEIGHT_COUNT_PER_LAYER + 11) =
-      atb_speed::Utils::AtTensor2Tensor(expert_array);
+      atb_speed::Utils::AtTensor2Tensor(input_params.expert_array);
   node.variantPack.inTensors.at(WEIGHT_COUNT_PER_LAYER + 12) =
       atb_speed::Utils::AtTensor2Tensor(expert_group_);
   node.variantPack.inTensors.at(WEIGHT_COUNT_PER_LAYER + 13) =
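
The layer now reads the routing tensor from input_params inside build_node_variant_pack(), so call sites drop the extra argument. A hedged sketch of the new call shape (the leading parameters before attn_mask are not shown in this hunk, so their names here are illustrative):

  // Sketch: one fewer argument; expert_array arrives via input_params.
  layer->forward(x,
                 attn_mask,
                 kv_cache,
                 input_params,  // now carries expert_array
                 event,
                 event_flag,
                 node_id);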

xllm/core/layers/npu/npu_glm4_moe_decoder_layer.h

Lines changed: 0 additions & 2 deletions
@@ -49,7 +49,6 @@ class NpuGlm4MoeDecoderImpl : public BaseLayer {
       torch::Tensor& attn_mask,
       KVCache& kv_cache,
       const ModelInputParams& input_params,
-      torch::Tensor& expert_array,
       aclrtEvent* event = nullptr,
       std::atomic<bool>* event_flag = nullptr,
       int node_id = 0);
@@ -100,7 +99,6 @@ class NpuGlm4MoeDecoderImpl : public BaseLayer {
       torch::Tensor& attn_mask,
       KVCache& kv_cache,
       const ModelInputParams& input_params,
-      torch::Tensor& expert_array,
       bool is_prefill);
 
   std::string model_name_;

xllm/core/layers/npu/npu_qwen3_moe_decoder_layer_impl.cpp

Lines changed: 1 addition & 5 deletions
@@ -283,7 +283,6 @@ torch::Tensor NpuQwen3MoeDecoderLayerImpl::forward(
     torch::Tensor& attn_mask,
     KVCache& kv_cache,
     const ModelInputParams& input_params,
-    torch::Tensor& expert_array,
     aclrtEvent* event,
     std::atomic<bool>* event_flag,
     int node_id) {
@@ -296,7 +295,6 @@ torch::Tensor NpuQwen3MoeDecoderLayerImpl::forward(
                             attn_mask,
                             kv_cache,
                             input_params,
-                            expert_array,
                             true);
     st = execute_node(prefill_node_, node_id, event, event_flag);
     LOG_IF(FATAL, st != 0) << model_name_
@@ -309,7 +307,6 @@ torch::Tensor NpuQwen3MoeDecoderLayerImpl::forward(
                             /*attn_mask*/ tensor_placeholder_,
                             kv_cache,
                             input_params,
-                            expert_array,
                             false);
     st = execute_node(decode_node_, node_id + 1000, event, event_flag);
     LOG_IF(FATAL, st != 0) << model_name_
@@ -327,15 +324,14 @@ void NpuQwen3MoeDecoderLayerImpl::build_node_variant_pack(
     torch::Tensor& attn_mask,
     KVCache& kv_cache,
     const ModelInputParams& input_params,
-    torch::Tensor& expert_array,
     bool is_prefill) {
   internal_tensor_ = atb_speed::Utils::AtTensor2Tensor(x);
   int32_t input_idx = 0;
   auto& dp_ep_padding = input_params.dp_ep_padding_data;
 
   node.variantPack.inTensors.at(WEIGHT_COUNT_PER_LAYER) = internal_tensor_;
   node.variantPack.inTensors.at(WEIGHT_COUNT_PER_LAYER + 1) =
-      atb_speed::Utils::AtTensor2Tensor(expert_array);
+      atb_speed::Utils::AtTensor2Tensor(input_params.expert_array);
   node.variantPack.inTensors.at(WEIGHT_COUNT_PER_LAYER + 2) =
       atb_speed::Utils::AtTensor2Tensor(expert_group_);
   node.variantPack.inTensors.at(WEIGHT_COUNT_PER_LAYER + 3) =

xllm/core/layers/npu/npu_qwen3_moe_decoder_layer_impl.h

Lines changed: 0 additions & 2 deletions
@@ -54,7 +54,6 @@ class NpuQwen3MoeDecoderLayerImpl : public BaseLayer {
       torch::Tensor& attn_mask,
       KVCache& kv_cache,
       const ModelInputParams& input_params,
-      torch::Tensor& expert_array,
       aclrtEvent* event = nullptr,
       std::atomic<bool>* event_flag = nullptr,
       int node_id = 0);
@@ -104,7 +103,6 @@ class NpuQwen3MoeDecoderLayerImpl : public BaseLayer {
       torch::Tensor& attn_mask,
       KVCache& kv_cache,
       const ModelInputParams& input_params,
-      torch::Tensor& expert_array,
       bool is_prefill);
 
   torch::Tensor block_tables_placeholder_;

xllm/models/llm/npu/deepseek_mtp.h

Lines changed: 127 additions & 0 deletions
@@ -0,0 +1,127 @@ (new file; all lines added)
/* Copyright 2025 The xLLM Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    https://github.com/jd-opensource/xllm/blob/main/LICENSE

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#pragma once

#include "core/layers/common/rotary_embedding_util.h"
#include "deepseek_v2.h"
#include "mtp_model_base.h"

// DeepSeek v2 compatible with huggingface weights
// ref to:
// https://github.com/vllm-project/vllm/blob/v0.6.6/vllm/model_executor/models/deepseek_v2.py

namespace xllm {

class DeepseekMtpModelImpl : public MtpModelImplBase<DeepseekV2DecoderLayer> {
 public:
  DeepseekMtpModelImpl(const ModelContext& context)
      : MtpModelImplBase<DeepseekV2DecoderLayer>("deepseek_v3_mtp", context) {
    auto model_args = context.get_model_args();
    auto options = context.get_tensor_options();

    int32_t mask_value = model_args.dtype() == "bfloat16" ? 1 : -9984;
    attn_mask_ = layer::AttentionMask(options.device(),
                                      options.dtype().toScalarType(),
                                      /*mask_value=*/mask_value);

    cos_sin_ = layer::rotary::get_deepseek_rotary_embedding(
        model_args.qk_rope_head_dim(),
        model_args.qk_rope_head_dim(),
        model_args.max_position_embeddings(),
        model_args.rope_scaling_original_max_position_embeddings(),
        model_args.rope_theta(),
        /*interleaved*/ false,
        model_args.rope_scaling_factor(),
        model_args.rope_extrapolation_factor(),
        model_args.rope_scaling_attn_factor(),
        model_args.rope_scaling_beta_fast(),
        model_args.rope_scaling_beta_slow(),
        model_args.rope_scaling_mscale(),
        model_args.rope_scaling_mscale_all_dim(),
        options);
  }
};
TORCH_MODULE(DeepseekMtpModel);

class DeepseekMtpForCausalLMImpl
    : public MtpForCausalLMImplBase<DeepseekMtpModel> {
 public:
  DeepseekMtpForCausalLMImpl(const ModelContext& context)
      : MtpForCausalLMImplBase<DeepseekMtpModel>(context) {}
};
TORCH_MODULE(DeepseekMtpForCausalLM);

// register the causal model
REGISTER_CAUSAL_MODEL(deepseek_v3_mtp, DeepseekMtpForCausalLM);

// example config:
// https://huggingface.co/deepseek-ai/DeepSeek-V3/blob/main/config.json
REGISTER_MODEL_ARGS(deepseek_v3_mtp, [&] {
  LOAD_ARG_OR(model_type, "model_type", "deepseek_v3_mtp");
  LOAD_ARG_OR(dtype, "torch_dtype", "");
  LOAD_ARG_OR(vocab_size, "vocab_size", 129280);
  LOAD_ARG_OR(hidden_size, "hidden_size", 7168);
  LOAD_ARG_OR(n_layers, "num_hidden_layers", 61);
  LOAD_ARG_OR(n_heads, "num_attention_heads", 128);
  LOAD_ARG_OR(n_kv_heads, "num_key_value_heads", 128);
  LOAD_ARG_OR(intermediate_size, "intermediate_size", 18432);
  LOAD_ARG_OR(max_position_embeddings, "max_position_embeddings", 163840);
  LOAD_ARG_OR(rms_norm_eps, "rms_norm_eps", 1e-6);
  LOAD_ARG_OR(eos_token_id, "eos_token_id", 1);
  LOAD_ARG_OR(bos_token_id, "bos_token_id", 0);
  LOAD_ARG_OR(rope_theta, "rope_theta", 10000.0f);
  LOAD_ARG_OR(use_sliding_window, "use_sliding_window", false);
  LOAD_ARG_OR(sliding_window, "sliding_window", 4096);
  LOAD_ARG_OR(max_window_layers, "max_window_layers", 61);

  LOAD_ARG_OR(first_k_dense_replace, "first_k_dense_replace", 0);
  LOAD_ARG_OR(moe_layer_freq, "moe_layer_freq", 1);
  LOAD_ARG_OR(topk_method, "topk_method", "noaux_tc");
  LOAD_ARG_OR(n_routed_experts, "n_routed_experts", 256);
  LOAD_ARG_OR(n_shared_experts, "n_shared_experts", 1);
  LOAD_ARG_OR(num_experts_per_tok, "num_experts_per_tok", 8);
  LOAD_ARG_OR(moe_intermediate_size, "moe_intermediate_size", 2048);
  LOAD_ARG_OR(routed_scaling_factor, "routed_scaling_factor", 2.5f);
  LOAD_ARG_OR(norm_topk_prob, "norm_topk_prob", true);
  LOAD_ARG_OR(n_group, "n_group", 8);
  LOAD_ARG_OR(topk_group, "topk_group", 4);
  LOAD_ARG_OR(qk_nope_head_dim, "qk_nope_head_dim", 128);
  LOAD_ARG_OR(qk_rope_head_dim, "qk_rope_head_dim", 64);
  LOAD_ARG_OR(v_head_dim, "v_head_dim", 128);
  LOAD_ARG_OR(q_lora_rank, "q_lora_rank", 1536);
  LOAD_ARG_OR(kv_lora_rank, "kv_lora_rank", 512);

  LOAD_ARG_OR_FUNC(head_dim, "head_dim", [&] {
    return 256;  // args->qk_nope_head_dim() + args->qk_rope_head_dim();
  });
  LOAD_ARG_OR_FUNC(
      rotary_dim, "rotary_dim", [&] { return args->qk_rope_head_dim(); });

  SET_ARG(rope_scaling_rope_type, "deepseek_yarn");
  LOAD_ARG(rope_scaling_beta_fast, "rope_scaling.beta_fast");
  LOAD_ARG(rope_scaling_beta_slow, "rope_scaling.beta_slow");
  LOAD_ARG(rope_scaling_factor, "rope_scaling.factor");
  LOAD_ARG_OR(
      rope_extrapolation_factor, "rope_scaling.extrapolation_factor", 1.0f);
  LOAD_ARG(rope_scaling_mscale, "rope_scaling.mscale");
  LOAD_ARG(rope_scaling_mscale_all_dim, "rope_scaling.mscale_all_dim");
  LOAD_ARG(rope_scaling_original_max_position_embeddings,
           "rope_scaling.original_max_position_embeddings");
  LOAD_ARG_OR(rope_scaling_attn_factor, "rope_scaling.attn_factor", 1.0f);

  SET_ARG(stop_token_ids, std::unordered_set<int32_t>({1}));
});
}  // namespace xllm
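
The point of the refactor is that MtpModelImplBase is a reusable template: a future MTP head supplies only its decoder-layer type, model name, and model-specific mask/rotary setup. A hypothetical sketch of what a follow-up model might look like (Qwen3MtpModelImpl and its layer type are illustrative names, not part of this commit):

  class Qwen3MtpModelImpl : public MtpModelImplBase<Qwen3MoeDecoderLayer> {
   public:
    Qwen3MtpModelImpl(const ModelContext& context)
        : MtpModelImplBase<Qwen3MoeDecoderLayer>("qwen3_moe_mtp", context) {
      // Initialize model-specific attn_mask_ / cos_sin_ here, mirroring
      // DeepseekMtpModelImpl above.
    }
  };
  TORCH_MODULE(Qwen3MtpModel);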

xllm/models/llm/npu/deepseek_v2.h

Lines changed: 0 additions & 6 deletions
@@ -142,15 +142,10 @@ class DeepseekV2ModelImpl : public torch::nn::Module {
     norm_ = register_module("norm", layer::NpuRMSNorm(context));
 
     dp_size_ = parallel_args.dp_size();
-    std::vector<int64_t> indices;
     dp_local_tp_size_ = parallel_args.world_size() / dp_size_;
     dp_rank_ = parallel_args.rank() / dp_local_tp_size_;
     rank_ = parallel_args.rank();
-    mapping_data_ = parallel_args.mapping_data();
     num_experts_per_tok_ = model_args.num_experts_per_tok();
-    for (int i = 0; i < parallel_args.world_size(); i += dp_local_tp_size_) {
-      indices.push_back(i);
-    }
   }
 
   torch::Tensor forward(torch::Tensor tokens,
@@ -258,7 +253,6 @@ class DeepseekV2ModelImpl : public torch::nn::Module {
   int32_t rank_;
   int32_t dp_size_;
   int32_t dp_local_tp_size_;
-  nlohmann::json mapping_data_;
   int32_t num_experts_per_tok_;
   int32_t num_speculative_tokens_ = 0;
   at::Device device_;
