
Commit 4d7a998

feat: refactor layer module to support multiple platforms.
1 parent 808942b commit 4d7a998


52 files changed: +1465 -1133 lines changed

xllm/core/layers/CMakeLists.txt

Lines changed: 33 additions & 0 deletions
@@ -19,14 +19,47 @@ cc_library(
     torch
 )
 
+cc_library(
+  NAME
+    base_layer
+  HDRS
+    attention_mask.h
+    base_layer.h
+  SRCS
+    attention_mask.cpp
+    base_layer.cpp
+  DEPS
+    :state_dict
+    :block
+    :kv_cache
+    :prefix_cache
+    glog::glog
+    gflags::gflags
+    torch
+)
+
 cc_library(
   NAME
     layers
+  HDRS
+    column_parallel_linear.h
+    deepseek_v2_decoder_layer.h
+    llama_decoder_layer.h
+    multi_head_attention.h
+    qwen2_decoder_layer.h
+    qwen2dot5_vision_decode_layer.h
+    qwen3_decoder_layer.h
+    qwen3_moe_decoder_layer.h
+    rms_norm.h
+    siglip_encoder_layer.h
+  SRCS
+    multi_head_attention.cpp
   DEPS
     :state_dict
     :kv_cache
     :prefix_cache
     :block
+    :base_layer
     :rotary_embedding
     glog::glog
     gflags::gflags

xllm/core/layers/npu/attn_mask.cpp renamed to xllm/core/layers/attention_mask.cpp

Lines changed: 19 additions & 20 deletions
@@ -1,10 +1,10 @@
-#include "attn_mask.h"
+#include "attention_mask.h"
 
-namespace xllm::hf {
+namespace xllm::layer {
 
-AttentionMaskImpl::AttentionMaskImpl(at::Device device,
-                                     torch::Dtype dtype,
-                                     float mask_value) {
+AttentionMask::AttentionMask(at::Device device,
+                             torch::Dtype dtype,
+                             float mask_value) {
   int max_seq_len = 128;
   seq_len_cached_ = max_seq_len;
   auto bias_cache =
@@ -21,25 +21,24 @@ AttentionMaskImpl::AttentionMaskImpl(at::Device device,
       .to(device);
 }
 
-torch::Tensor AttentionMaskImpl::get_decode_attn_mask(
-    torch::Tensor input_lengths,
-    int64_t max_s,
-    torch::Dtype dtype,
-    torch::Device device) {
+torch::Tensor AttentionMask::get_decode_attn_mask(torch::Tensor input_lengths,
+                                                  int64_t max_s,
+                                                  torch::Dtype dtype,
+                                                  torch::Device device) {
   update_attn_cache(dtype, device, max_s);
   return atten_mask_cache_.index_select(0, input_lengths).view({-1, 1, max_s});
 }
 
-torch::Tensor AttentionMaskImpl::get_attn_mask(int64_t max_s,
-                                               torch::Dtype dtype,
-                                               torch::Device device) {
+torch::Tensor AttentionMask::get_attn_mask(int64_t max_s,
+                                           torch::Dtype dtype,
+                                           torch::Device device) {
   update_attn_cache(dtype, device, max_s);
   return atten_mask_cache_.slice(0, 0, max_s).slice(1, 0, max_s);
 }
 
-torch::Tensor AttentionMaskImpl::gen_free_mask(int32_t q_len,
-                                               torch::Dtype dtype,
-                                               torch::Device device) {
+torch::Tensor AttentionMask::gen_free_mask(int32_t q_len,
+                                           torch::Dtype dtype,
+                                           torch::Device device) {
   float pre_mask_factor = -10000.0f;
   if (dtype == torch::kBFloat16) {
     pre_mask_factor = 1.0f;
@@ -52,9 +51,9 @@ torch::Tensor AttentionMaskImpl::gen_free_mask(int32_t q_len,
   return mask_free;
 }
 
-void AttentionMaskImpl::update_attn_cache(torch::Dtype dtype,
-                                          torch::Device device,
-                                          int64_t seqlen) {
+void AttentionMask::update_attn_cache(torch::Dtype dtype,
+                                      torch::Device device,
+                                      int64_t seqlen) {
   if (seqlen > seq_len_cached_ || atten_mask_cache_.dtype() != dtype) {
     seq_len_cached_ = seqlen;
 
@@ -69,4 +68,4 @@ void AttentionMaskImpl::update_attn_cache(torch::Dtype dtype,
   }
 }
 
-}  // namespace xllm::hf
+}  // namespace xllm::layer
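
Note: the rename from AttentionMaskImpl (old xllm::hf namespace) to AttentionMask in xllm::layer keeps the same mask-building API shown above. A minimal usage sketch based on those signatures; the device, dtype, and length values below are illustrative only and not part of this commit:

#include <torch/torch.h>

#include "attention_mask.h"

// Illustrative caller: in real code the device, dtype, and sequence lengths
// come from the model context rather than being hard-coded.
void build_masks_example() {
  at::Device device(torch::kCPU);        // placeholder device for this sketch
  torch::Dtype dtype = torch::kFloat32;

  // Construct once; the cached mask grows lazily via update_attn_cache().
  xllm::layer::AttentionMask mask(device, dtype);

  // Square causal mask for a prefill step of up to 256 tokens.
  torch::Tensor prefill_mask = mask.get_attn_mask(/*max_s=*/256, dtype, device);

  // Per-request decode mask: one row per sequence, viewed as [-1, 1, max_s].
  torch::Tensor input_lengths = torch::tensor({17, 42, 96}, torch::kLong);
  torch::Tensor decode_mask =
      mask.get_decode_attn_mask(input_lengths, /*max_s=*/128, dtype, device);
}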

xllm/core/layers/npu/attn_mask.h renamed to xllm/core/layers/attention_mask.h

Lines changed: 7 additions & 9 deletions
@@ -1,18 +1,16 @@
 #pragma once
 #include <torch/torch.h>
 
-#include "atb/atb_infer.h"
-
 namespace xllm {
-namespace hf {
+namespace layer {
 
-class AttentionMaskImpl : public torch::nn::Module {
+class AttentionMask : public torch::nn::Module {
  public:
-  AttentionMaskImpl() = default;
+  AttentionMask() = default;
 
-  explicit AttentionMaskImpl(at::Device device,
-                             torch::Dtype dtype,
-                             float mask_value = -9984);
+  explicit AttentionMask(at::Device device,
+                         torch::Dtype dtype,
+                         float mask_value = -9984);
 
   torch::Tensor get_decode_attn_mask(torch::Tensor input_lengths,
                                      int64_t max_s,
@@ -37,5 +35,5 @@ class AttentionMaskImpl : public torch::nn::Module {
   at::Tensor atten_mask_cache_;
 };
 
-}  // namespace hf
+}  // namespace layer
 }  // namespace xllm
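
Note: besides the rename, the header no longer includes the ATB-specific atb/atb_infer.h, so it can be consumed from platform-neutral code. A sketch of what the migration looks like at a call site; the old include path shown in the comment and the surrounding function are illustrative, not taken from this commit:

#include "attention_mask.h"  // previously something like "npu/attn_mask.h"

// Before: xllm::hf::AttentionMaskImpl mask(device, dtype);
// After:
void make_mask_example(at::Device device, torch::Dtype dtype) {
  xllm::layer::AttentionMask mask(device, dtype);  // mask_value defaults to -9984
}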

xllm/core/layers/base_layer.cpp

Lines changed: 132 additions & 0 deletions
@@ -0,0 +1,132 @@
+#include "base_layer.h"
+
+namespace xllm {
+namespace layer {
+
+BaseLayer::BaseLayer(const Context& context)
+    : device_(context.get_tensor_options().device()),
+      name_(""),
+      parallel_args_(context.get_parallel_args()) {
+  auto quant_args = context.get_quant_args();
+  if (!quant_args.quantize_type().empty()) {
+    quantize_type_ = quant_args.quantize_type();
+  }
+
+  if (!quant_args.torch_dtype().empty()) {
+    torch_dtype_ = quant_args.torch_dtype();
+  }
+
+  dp_size_ = parallel_args_.dp_size();
+  dp_local_tp_size_ = parallel_args_.world_size() / dp_size_;
+  dp_rank_ = parallel_args_.rank() / dp_local_tp_size_;
+  CHECK_EQ(parallel_args_.world_size(), dp_size_ * dp_local_tp_size_);
+  dp_local_tp_rank_ = parallel_args_.rank() % dp_local_tp_size_;
+
+  run_task_func_ = std::bind(
+      &BaseLayer::run_task, this, std::placeholders::_1, std::placeholders::_2);
+}
+
+torch::Dtype BaseLayer::string2dtype(const std::string& dtype_str) {
+  if (dtype_str.compare("float16") == 0) {
+    return torch::kFloat16;
+  } else if (dtype_str.compare("bfloat16") == 0) {
+    return torch::kBFloat16;
+  } else if (dtype_str.compare("float32") == 0) {
+    return torch::kFloat32;
+  } else if (dtype_str.compare("float64") == 0) {
+    return torch::kFloat64;
+  } else if (dtype_str.compare("int8") == 0) {
+    return torch::kInt8;
+  } else if (dtype_str.compare("int16") == 0) {
+    return torch::kInt16;
+  } else if (dtype_str.compare("int32") == 0) {
+    return torch::kInt32;
+  } else if (dtype_str.compare("int64") == 0) {
+    return torch::kInt64;
+  } else if (dtype_str.compare("uint8") == 0) {
+    return torch::kUInt8;
+  } else if (dtype_str.compare("bool") == 0) {
+    return torch::kBool;
+  }
+
+  throw std::runtime_error("Unsupported dtype string");
+}
+
+void BaseLayer::correct_tensor_dtype(torch::Tensor& tensor,
+                                     const std::string& tensorName) {
+  if (absl::EndsWith(tensorName, "deq_scale") &&
+      (torch_dtype_.compare("bfloat16") == 0)) {
+    return;
+  }
+
+  if (tensor.dtype() != torch::kInt8 && tensor.dtype() != torch::kInt32 &&
+      tensor.dtype() != torch::kInt64) {
+    torch::Dtype dtype = string2dtype(torch_dtype_);
+    tensor = tensor.to(dtype);
+  }
+}
+
+void BaseLayer::set_weight(const StateDict& state_dict,
+                           const std::string& tensor_name,
+                           int weight_position) {
+  for (const auto& [name, tensor] : state_dict) {
+    if (absl::EndsWith(name, tensor_name)) {
+      at::Tensor mutable_tensor = tensor;
+      correct_tensor_dtype(mutable_tensor, tensor_name);
+      at_weight_tensors_[weight_position] = mutable_tensor.to(device_);
+    }
+  }
+}
+
+void BaseLayer::set_weight(const StateDict& state_dict,
+                           const std::string& tensor_name,
+                           int weight_position,
+                           int dim) {
+  for (const auto& [name, tensor] : state_dict) {
+    if (absl::EndsWith(name, tensor_name)) {
+      if (parallel_args_.world_size() <= 1) {
+        at::Tensor mutable_tensor = tensor;
+        correct_tensor_dtype(mutable_tensor, tensor_name);
+        at_weight_tensors_[weight_position] = mutable_tensor.to(device_);
+      } else {
+        at_weight_tensors_[weight_position] =
+            state_dict
+                .get_sharded_tensor(tensor_name,
+                                    /*dim=*/dim,
+                                    /*rank=*/parallel_args_.rank(),
+                                    /*world_size=*/parallel_args_.world_size())
+                .to(device_);
+        correct_tensor_dtype(at_weight_tensors_[weight_position], tensor_name);
+      }
+    }
+  }
+}
+
+void BaseLayer::set_weight(const StateDict& state_dict,
+                           const std::string& tensor_name,
+                           int weight_position,
+                           int dim,
+                           int rank,
+                           int world_size) {
+  for (const auto& [name, tensor] : state_dict) {
+    if (absl::EndsWith(name, tensor_name)) {
+      if (world_size <= 1) {
+        at::Tensor mutable_tensor = tensor;
+        correct_tensor_dtype(mutable_tensor, tensor_name);
+        at_weight_tensors_[weight_position] = mutable_tensor.to(device_);
+      } else {
+        at_weight_tensors_[weight_position] =
+            state_dict
+                .get_sharded_tensor(tensor_name,
+                                    /*dim=*/dim,
+                                    /*rank=*/rank,
+                                    /*world_size=*/world_size)
+                .to(device_);
+        correct_tensor_dtype(at_weight_tensors_[weight_position], tensor_name);
+      }
+    }
+  }
+}
+
+}  // namespace layer
+}  // namespace xllm
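
Note: BaseLayer centralizes the data-parallel/tensor-parallel bookkeeping (dp_size_, dp_rank_, local TP rank) and the weight loading that every platform-specific layer needs, so derived layers only declare which tensors go in which slot. A minimal sketch of how a derived layer might use the set_weight overloads; the class name, tensor names, method name, and the resize of at_weight_tensors_ are illustrative assumptions, not code from this commit:

#include "base_layer.h"

namespace xllm::layer {

// Hypothetical layer built on BaseLayer; the real decoder layers in this
// commit follow the same pattern with more weight slots.
class ExampleLinearLayer : public BaseLayer {
 public:
  explicit ExampleLinearLayer(const Context& context) : BaseLayer(context) {
    at_weight_tensors_.resize(2);  // assumes the member is a resizable container
  }

  void load_weights(const StateDict& state_dict) {
    // Column-parallel weight: sharded along dim 0 across the local TP group.
    set_weight(state_dict, "weight", /*weight_position=*/0, /*dim=*/0);
    // The bias is small, so every rank keeps a full copy.
    set_weight(state_dict, "bias", /*weight_position=*/1);
  }
};

}  // namespace xllm::layer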
