diff --git a/CMakeLists.txt b/CMakeLists.txt
index c7765ee7..7cb317f4 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -298,6 +298,7 @@ else()
 endif()
 
 if(USE_NPU)
+  # add_definitions(-DUSE_NPU_TORCH)
   add_definitions(-DUSE_NPU)
   add_definitions(-DBUILD_LIBTORCH)
   add_definitions(-DTORCH_SETCUSTOMHANDLER=ON)
@@ -309,6 +310,7 @@ if(USE_NPU)
     $ENV{PYTORCH_INSTALL_PATH}/include
     $ENV{PYTORCH_INSTALL_PATH}/include/torch/csrc/api/include
     $ENV{PYTORCH_NPU_INSTALL_PATH}/include
+    $ENV{PYTORCH_INSTALL_PATH}/include/torch/csrc/distributed
     $ENV{NPU_HOME_PATH}/include
     $ENV{ATB_HOME_PATH}/include
     $ENV{NPU_HOME_PATH}/opp/vendors/xllm/op_api/include/
diff --git a/cmake/cc_test.cmake b/cmake/cc_test.cmake
index ce5dd0dc..ccaf449c 100644
--- a/cmake/cc_test.cmake
+++ b/cmake/cc_test.cmake
@@ -69,6 +69,14 @@ function(cc_test)
     PRIVATE ${CC_TEST_LINKOPTS}
   )
 
+  if(USE_NPU)
+    set(COMMON_LIBS Python::Python torch_npu torch_python)
+  endif()
+
+  if(USE_NPU AND DEFINED COMMON_LIBS)
+    target_link_libraries(${CC_TEST_NAME} PRIVATE ${COMMON_LIBS})
+  endif()
+
   add_dependencies(all_tests ${CC_TEST_NAME})
 
   gtest_add_tests(
diff --git a/xllm/CMakeLists.txt b/xllm/CMakeLists.txt
index e95742df..7714647a 100644
--- a/xllm/CMakeLists.txt
+++ b/xllm/CMakeLists.txt
@@ -34,7 +34,7 @@ target_link_libraries(xllm PRIVATE glog::glog brpc leveldb::leveldb ZLIB::ZLIB p
 add_dependencies(xllm brpc-static)
 
 if(USE_NPU)
-  set(COMMON_LIBS Python::Python ascendcl hccl c_sec nnopbase ms_tools_ext)
+  set(COMMON_LIBS Python::Python ascendcl hccl c_sec nnopbase ms_tools_ext torch_npu torch_python)
 elseif(USE_MLU)
   set(COMMON_LIBS Python::Python)
 endif()
diff --git a/xllm/core/common/CMakeLists.txt b/xllm/core/common/CMakeLists.txt
index 3410b2e5..76877872 100644
--- a/xllm/core/common/CMakeLists.txt
+++ b/xllm/core/common/CMakeLists.txt
@@ -28,6 +28,7 @@ cc_library(
     absl::random_random
     absl::strings
     torch
+    $<$:torch_python>
     $<$:torch_npu>
     $<$:mspti>
     $<$:ms_tools_ext>
diff --git a/xllm/core/common/global_flags.cpp b/xllm/core/common/global_flags.cpp
index 30b9b4e3..0d8cd3cd 100644
--- a/xllm/core/common/global_flags.cpp
+++ b/xllm/core/common/global_flags.cpp
@@ -389,3 +389,5 @@ DEFINE_string(reasoning_parser,
 
 // --- qwen3 reranker config ---
 DEFINE_bool(enable_qwen3_reranker, false, "Whether to enable qwen3 reranker.");
+
+DEFINE_bool(enable_native_npu, true, "Whether to enable native NPU support.");
\ No newline at end of file
diff --git a/xllm/core/common/global_flags.h b/xllm/core/common/global_flags.h
index 5c79a7c3..49846b8d 100644
--- a/xllm/core/common/global_flags.h
+++ b/xllm/core/common/global_flags.h
@@ -202,3 +202,5 @@ DECLARE_bool(enable_qwen3_reranker);
 DECLARE_string(reasoning_parser);
 
 DECLARE_bool(enable_shm);
+
+DECLARE_bool(enable_native_npu);
\ No newline at end of file
diff --git a/xllm/core/distributed_runtime/spawn_worker_server/CMakeLists.txt b/xllm/core/distributed_runtime/spawn_worker_server/CMakeLists.txt
index 7fbae3e5..abc6d9f7 100644
--- a/xllm/core/distributed_runtime/spawn_worker_server/CMakeLists.txt
+++ b/xllm/core/distributed_runtime/spawn_worker_server/CMakeLists.txt
@@ -12,6 +12,7 @@ cc_binary(
     :models
     :model
     :distributed_runtime
+    :parallel_state
     absl::strings
     xllm_kernels
     ascendcl
diff --git a/xllm/core/distributed_runtime/worker_server.cpp b/xllm/core/distributed_runtime/worker_server.cpp
index 9b631587..99fb22a5 100644
--- a/xllm/core/distributed_runtime/worker_server.cpp
+++ b/xllm/core/distributed_runtime/worker_server.cpp
@@ -100,6 +100,12 @@ void WorkerServer::create_server(
   const ParallelArgs* parallel_args = comm.parallel_args();
 #if defined(USE_MLU) || defined(USE_CUDA)
   comm.create_process_groups(master_node_addr, device);
+#elif defined(USE_NPU)
+  // TODO: Refactor to use model_type or other appropriate enumeration for
+  // condition checking
+  if (FLAGS_enable_native_npu) {
+    comm.create_process_groups(master_node_addr, device);
+  }
 #endif
 
   WorkerType worker_type =
diff --git a/xllm/core/framework/model/CMakeLists.txt b/xllm/core/framework/model/CMakeLists.txt
index 9bdd452d..4bcc6ffb 100644
--- a/xllm/core/framework/model/CMakeLists.txt
+++ b/xllm/core/framework/model/CMakeLists.txt
@@ -17,10 +17,10 @@ set(BASE_DEPS
 if(USE_NPU)
   list(APPEND BASE_DEPS :npu_layers)
   list(APPEND BASE_DEPS :platform_npu)
-else()
-  list(APPEND BASE_DEPS :common_layers)
 endif()
 
+list(APPEND BASE_DEPS :common_layers)
+
 # Define the library
 cc_library(
diff --git a/xllm/core/framework/parallel_state/collective_communicator.cpp b/xllm/core/framework/parallel_state/collective_communicator.cpp
index c0066be0..8225cdf7 100644
--- a/xllm/core/framework/parallel_state/collective_communicator.cpp
+++ b/xllm/core/framework/parallel_state/collective_communicator.cpp
@@ -18,6 +18,9 @@ limitations under the License.
 #include "mapping_npu.h"
 
 #if defined(USE_NPU)
+#include 
+
+#include "npu_process_group.h"
 #include "xllm_kernels/core/include/atb_speed/base/external_comm_manager.h"
 #include "xllm_kernels/core/include/atb_speed/utils/singleton.h"
 #include "xllm_kernels/models/base/param/mapping.h"
@@ -30,23 +33,6 @@ limitations under the License.
 #include "parallel_args.h"
 #include "util/net.h"
 
-namespace {
-#if defined(USE_NPU)
-std::unique_ptr create_process_group(
-    int rank,
-    int world_size,
-    int rank_size,
-    int port,
-    bool trans,
-    const std::string& host,
-    const std::string& group_name,
-    const torch::Device& device) {
-  LOG(FATAL) << "Unsupported device type";
-  return nullptr;
-}
-#endif
-}  // namespace
-
 namespace xllm {
 
 CollectiveCommunicator::CollectiveCommunicator(int global_rank,
diff --git a/xllm/core/framework/parallel_state/npu_process_group.cpp b/xllm/core/framework/parallel_state/npu_process_group.cpp
index eff99922..b401c437 100644
--- a/xllm/core/framework/parallel_state/npu_process_group.cpp
+++ b/xllm/core/framework/parallel_state/npu_process_group.cpp
@@ -14,6 +14,16 @@ limitations under the License.
 ==============================================================================*/
 
 #include "npu_process_group.h"
+#ifdef TORCH_HIGHER_THAN_PTA6
+#include 
+#else
+#include 
+#include 
+#endif
+
+#include 
+#include 
+#include 
 
 namespace {
@@ -24,113 +34,65 @@ namespace {
     LOG(FATAL) << "Failed, HCCL error :" << HcclGetErrorString(r); \
   }                                                                \
 } while (0)
+}  // namespace
 
-inline bool is_npu(const at::Tensor& tensor) {
-  if (!tensor.defined()) {
-    return false;
-  }
-  return tensor.device().is_privateuseone();
-}
-
-inline bool is_npu(const at::TensorOptions& options) {
-  return options.device().is_privateuseone();
-}
+namespace xllm {
 
-inline bool is_npu(const at::Device& device) {
-  return device.is_privateuseone();
-}
+ProcessGroupHCCL::ProcessGroupHCCL(int global_rank,
+                                   int world_size,
+                                   int rank_size,
+                                   int port,
+                                   bool trans,
+                                   const std::string& host,
+                                   const std::string& group_name,
+                                   const torch::Device& device)
+    : ProcessGroup(device) {
+  c10::intrusive_ptr hccl_pg_options =
+      c10d_npu::ProcessGroupHCCL::Options::create();
+  // hccl_pg_options->group_name = group_name;
+  int rank = global_rank;
+  if (world_size != rank_size) {
+    auto [local_rank, group_ranks] =
+        get_group_rank(world_size, global_rank, rank_size, trans);
+    std::vector uint32_ranks;
+    for (auto rank : group_ranks) {
+      uint32_ranks.push_back(static_cast(rank));
+    }
+    hccl_pg_options->global_ranks_in_group = uint32_ranks;
+    rank = local_rank;
+  }
 
-at::Tensor flatten_for_scatter_gather(std::vector& tensors) {
-  auto& t = tensors[0];
-  std::vector sizes{static_cast(tensors.size())};
-  sizes.insert(sizes.end(), t.sizes().begin(), t.sizes().end());
-  return at::empty(sizes, t.options());
+  auto store = create_tcp_store(host, port, rank);
+  pg_ = std::make_unique(
+      store, rank, rank_size, hccl_pg_options);
 }
 
-HcclDataType to_hccl_data_type(const torch::Tensor& input) {
-  const auto type = input.scalar_type();
-  switch (type) {
-    case at::kFloat:
-      return HCCL_DATA_TYPE_FP32;
-    case at::kHalf:
-      return HCCL_DATA_TYPE_FP16;
-    case at::kDouble:
-      return HCCL_DATA_TYPE_FP64;
-    case at::kLong:
-      return HCCL_DATA_TYPE_INT64;
-    case at::kInt:
-      return HCCL_DATA_TYPE_INT32;
-    case at::kChar:
-      return HCCL_DATA_TYPE_INT8;
-    case at::kByte:
-      return HCCL_DATA_TYPE_UINT8;
-    case at::kBool:
-      return HCCL_DATA_TYPE_UINT8;
-    case at::kBFloat16:
-      return HCCL_DATA_TYPE_BFP16;
-    default:
-      TORCH_CHECK(false, "Unconvertible HCCL type ", type);
+// Destructor.
+ProcessGroupHCCL::~ProcessGroupHCCL() {
+  if (pg_) {
+    pg_->shutdown();
+  } else {
+    HCCLCHECK(HcclCommDestroy(comm_));
   }
 }
 
-void check_input(torch::Tensor input) {
-  CHECK(is_npu(input)) << "input should be npu tensor";
-  CHECK(input.is_contiguous()) << "input should be contiguous";
-  CHECK(!input.is_sparse()) << "input have to be npu dense tensor";
-}
-
-}  // namespace
-
-namespace xllm {
-
 ProcessGroupHCCL::ProcessGroupHCCL(int rank,
                                    int world_size,
                                    const torch::Device& device,
                                    HcclComm comm)
     : ProcessGroup(device), comm_(comm) {}
 
-// Destructor.
-ProcessGroupHCCL::~ProcessGroupHCCL() { HCCLCHECK(HcclCommDestroy(comm_)); }
-
-void ProcessGroupHCCL::allreduce(torch::Tensor& input) {
-  DCHECK(input.device() == device())
-      << "input should be on the same device as the process group";
-  check_input(input);
-  // inplace all reduce
-  // const auto count = input.numel();
-  // const auto data_type = to_hccl_data_type(input);
-  // auto stream = c10_npu::getCurrentNPUStream();
-  // torch::DeviceGuard device_guard(device());
-  // HCCLCHECK(HcclAllReduce(
-  //     /*sendbuff=*/input.data_ptr(),
-  //     /*recvbuff=*/input.data_ptr(),
-  //     /*count=*/count,
-  //     /*datatype=*/data_type,
-  //     /*op=*/HCCL_REDUCE_SUM,
-  //     /*comm=*/comm_,
-  //     /*stream=*/stream));
-}
-
-void ProcessGroupHCCL::allgather(const torch::Tensor& input,
-                                 std::vector& outputs) {
-  check_input(input);
-  // CHECK(outputs.size() == world_size())
-  //     << "outputs should have the same size as world_size";
-  // DCHECK(input.device() == device())
-  //     << "input should be on the same device as the process group";
-  // torch::DeviceGuard device_guard(device());
-  // torch::Tensor flattened_output = flatten_for_scatter_gather(outputs);
-  // const auto count = input.numel();
-  // const auto data_type = to_hccl_data_type(input);
-  // auto stream = c10_npu::getCurrentNPUStream();
-  // HCCLCHECK(HcclAllGather(
-  //     /*sendbuff=*/input.data_ptr(),
-  //     /*recvbuff=*/flattened_output.data_ptr(),
-  //     /*sendcount=*/count,
-  //     /*datatype=*/data_type,
-  //     /*comm=*/comm_,
-  //     /*stream=*/stream));
-  // // copy the flattened output tensors to the outputs.
-  // for (int i = 0; i < outputs.size(); ++i) {
-  //   outputs[i].copy_(flattened_output[i], /*non_blocking=*/true);
-  // }
+std::unique_ptr create_process_group(
+    int rank,
+    int world_size,
+    int rank_size,
+    int port,
+    bool trans,
+    const std::string& host,
+    const std::string& group_name,
+    const torch::Device& device) {
+  return std::make_unique(
+      rank, world_size, rank_size, port, trans, host, group_name, device);
 }
+
 }  // namespace xllm
\ No newline at end of file
diff --git a/xllm/core/framework/parallel_state/npu_process_group.h b/xllm/core/framework/parallel_state/npu_process_group.h
index 7ca7d23b..b0047cf4 100644
--- a/xllm/core/framework/parallel_state/npu_process_group.h
+++ b/xllm/core/framework/parallel_state/npu_process_group.h
@@ -18,6 +18,10 @@ limitations under the License.
 #include "hccl/hccl.h"
 #include "process_group.h"
 
+namespace c10d_npu {
+class ProcessGroupHCCL;
+}
+
 namespace xllm {
 
 class ProcessGroupHCCL : public ProcessGroup {
@@ -28,16 +32,30 @@ class ProcessGroupHCCL : public ProcessGroup {
                    const torch::Device& device,
                    HcclComm comm);
 
+  ProcessGroupHCCL(int rank,
+                   int world_size,
+                   int rank_size,
+                   int port,
+                   bool trans,
+                   const std::string& host,
+                   const std::string& group_name,
+                   const torch::Device& device);
+
   // Destructor.
   ~ProcessGroupHCCL() override;
 
-  void allreduce(torch::Tensor& input) override;
-
-  void allgather(const torch::Tensor& input,
-                 std::vector& outputs) override;
-
  private:
   HcclComm comm_ = nullptr;
 };
 
+std::unique_ptr create_process_group(
+    int rank,
+    int world_size,
+    int rank_size,
+    int port,
+    bool trans,
+    const std::string& host,
+    const std::string& group_name,
+    const torch::Device& device);
+
 }  // namespace xllm
\ No newline at end of file
diff --git a/xllm/core/framework/parallel_state/process_group.h b/xllm/core/framework/parallel_state/process_group.h
index ba1d67a9..85ca32bf 100644
--- a/xllm/core/framework/parallel_state/process_group.h
+++ b/xllm/core/framework/parallel_state/process_group.h
@@ -19,6 +19,11 @@ limitations under the License.
 #include 
 #include 
+
+#if defined(USE_NPU)
+#include 
+#endif
+
 namespace xllm {
 std::pair> get_group_rank(int world_size,
                                            int global_rank,
@@ -60,7 +65,11 @@ class ProcessGroup {
   torch::Device device_;
 
  protected:
+#if defined(USE_NPU)
+  std::unique_ptr pg_{nullptr};
+#else
   std::unique_ptr pg_{nullptr};
+#endif
 };
 
 }  // namespace xllm
\ No newline at end of file
diff --git a/xllm/core/kernels/npu/CMakeLists.txt b/xllm/core/kernels/npu/CMakeLists.txt
index 5553d8a0..1855db33 100644
--- a/xllm/core/kernels/npu/CMakeLists.txt
+++ b/xllm/core/kernels/npu/CMakeLists.txt
@@ -1,17 +1,29 @@
 include(cc_library)
 
-add_subdirectory(impl)
 add_subdirectory(xllm_ops)
 
+file(GLOB_RECURSE OPPLUGIN_UTILS_HEADER
+  "${CMAKE_CURRENT_LIST_DIR}/custom_functions_npu/*.h"
+  "${CMAKE_CURRENT_LIST_DIR}/ops_npu/*.h"
+  "${CMAKE_CURRENT_LIST_DIR}/*.h"
+)
+
+file(GLOB_RECURSE OPPLUGIN_UTILS_SRCS
+  "${CMAKE_CURRENT_LIST_DIR}/custom_functions_npu/*.cpp"
+  "${CMAKE_CURRENT_LIST_DIR}/ops_npu/*.cpp"
+  "${CMAKE_CURRENT_LIST_DIR}/*.cpp"
+)
+
 cc_library(
   NAME
     npu_kernels
   HDRS
-    linear.h
-    split.h
-    rms_norm.h
-    rope.h
+    ${OPPLUGIN_UTILS_HEADER}
+  SRCS
+    ${OPPLUGIN_UTILS_SRCS}
   DEPS
-    :npu_kernels_impl
-    # spdlog::spdlog
-)
\ No newline at end of file
+    :model_context
+    glog::glog
+    torch
+    torch_npu
+)
diff --git a/xllm/core/kernels/npu/rope.h b/xllm/core/kernels/npu/active.cpp
similarity index 63%
rename from xllm/core/kernels/npu/rope.h
rename to xllm/core/kernels/npu/active.cpp
index 7a075b0d..7ccfdc8d 100644
--- a/xllm/core/kernels/npu/rope.h
+++ b/xllm/core/kernels/npu/active.cpp
@@ -13,18 +13,14 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#pragma once
-#include "impl/npu_rope_impl.h"
+#include 
 
-namespace xllm::kernel {
+#include "npu_ops_api.h"
+#include "ops_npu/npu_ops.h"
 
-class Rope : public torch::nn::ModuleHolder {
- public:
-  using torch::nn::ModuleHolder::ModuleHolder;
-  using Impl __attribute__((__unused__)) = NpuRopeImpl;
+namespace xllm::kernel::npu {
 
-  Rope(const ModelContext& context)
-      : ModuleHolder(std::make_shared(context)) {}
-};
-
-}  // namespace xllm::kernel
+torch::Tensor active(const torch::Tensor& input) {
+  return at_npu::native::custom_ops::npu_swiglu(input);
+}
+}  // namespace xllm::kernel::npu
\ No newline at end of file
diff --git a/xllm/core/kernels/npu/attention.cpp b/xllm/core/kernels/npu/attention.cpp
new file mode 100644
index 00000000..bc7c64ac
--- /dev/null
+++ b/xllm/core/kernels/npu/attention.cpp
@@ -0,0 +1,61 @@
+/* Copyright 2025 The xLLM Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at + + https://github.com/jd-opensource/xllm/blob/main/LICENSE + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "npu_ops_api.h" +#include "ops_npu/npu_ops.h" +namespace xllm::kernel::npu { + +void reshape_paged_cache(torch::Tensor& key, + torch::Tensor& value, + torch::Tensor& k_cache, + torch::Tensor& v_cache, + const torch::Tensor& slot_mapping) { + atb::_npu_reshape_and_cache(key, value, k_cache, v_cache, slot_mapping); +} + +void batch_prefill(const torch::Tensor& query, + const torch::Tensor& key, + const torch::Tensor& value, + const torch::Tensor& mask, + const torch::Tensor& seq_len, + float scale, + int num_heads, + int num_kv_heads, + torch::Tensor& output) { + atb::_npu_flash_attention( + query, key, value, mask, seq_len, scale, num_heads, num_kv_heads, output); +} + +void batch_decode(const torch::Tensor& query, + const torch::Tensor& k_cache, + const torch::Tensor& v_cache, + int num_kv_heads, + int num_heads, + float scale, + const torch::Tensor& block_table, + const torch::Tensor& seq_lens, + torch::Tensor& output) { + atb::_npu_paged_attention(query, + k_cache, + v_cache, + num_kv_heads, + num_heads, + scale, + block_table, + seq_lens, + output); +} + +} // namespace xllm::kernel::npu \ No newline at end of file diff --git a/xllm/core/kernels/npu/custom_functions_npu/AtbCommon.cpp b/xllm/core/kernels/npu/custom_functions_npu/AtbCommon.cpp new file mode 100644 index 00000000..4429fcda --- /dev/null +++ b/xllm/core/kernels/npu/custom_functions_npu/AtbCommon.cpp @@ -0,0 +1,173 @@ +// Copyright (c) 2025 Huawei Technologies Co., Ltd +// All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "AtbCommon.h" + +namespace atb { +atb::Tensor AtTensor2AtbTensor(const at::Tensor at_tensor) { + static std::map dtype_map = { + {at::ScalarType::Bool, ACL_BOOL}, + {at::ScalarType::Byte, ACL_UINT8}, + {at::ScalarType::Char, ACL_INT8}, + {at::ScalarType::Half, ACL_FLOAT16}, + {at::ScalarType::Float, ACL_FLOAT}, + {at::ScalarType::Int, ACL_INT32}, + {at::ScalarType::Long, ACL_INT64}, + {at::ScalarType::BFloat16, ACL_BF16}, + {at::ScalarType::Double, ACL_DOUBLE}, + {at::ScalarType::Short, ACL_INT16}, + {at::ScalarType::ComplexHalf, ACL_COMPLEX32}, + {at::ScalarType::ComplexFloat, ACL_COMPLEX64}, + {at::ScalarType::ComplexDouble, ACL_COMPLEX128}, + }; + + TORCH_CHECK(at_tensor.is_contiguous(), "at_tensor is not contiguous"); + atb::Tensor tensor; + tensor.desc.format = atb::utils::GetFormatForAtb(at_tensor); + if (at_tensor.device().type() == at::kCPU) { + tensor.hostData = at_tensor.data_ptr(); + } else { + tensor.deviceData = at_tensor.data_ptr(); + } + + tensor.desc.shape.dimNum = at_tensor.sizes().size(); + for (uint64_t i = 0; i < at_tensor.sizes().size(); i++) { + tensor.desc.shape.dims[i] = at_tensor.sizes()[i]; + } + + auto dtype_iterator = dtype_map.find(at_tensor.scalar_type()); + TORCH_CHECK(dtype_iterator != dtype_map.end(), + "not support dtype: ", + at_tensor.scalar_type()); + tensor.desc.dtype = dtype_iterator->second; + + tensor.dataSize = atb::Utils::GetTensorSize(tensor); + + return tensor; +} + +void RunAtbCmdV1(atb::Operation* op, + const ParamSetter& paramsetter, + const std::string& name) { + aclrtStream stream = c10_npu::getCurrentNPUStream().stream(false); + auto context_ptr = atb::utils::GetContext(stream); + atb::VariantPack variant_pack = paramsetter.variant_pack_; + uint64_t workspace_size = OperationSetup(variant_pack, op, context_ptr); + at::Tensor workspace_tensor; + void* workspace_ptr = nullptr; + if (workspace_size != 0) { + at::TensorOptions options = at::TensorOptions(c10::DeviceType::PrivateUse1); + workspace_tensor = at::empty({workspace_size}, options.dtype(at::kByte)); + workspace_ptr = const_cast(workspace_tensor.storage().data()); + } + const c10::SmallVector& cpu_tensors = + paramsetter.tensor_maintainer_.cpu_tensors; + auto acl_call = [variant_pack, + workspace_ptr, + workspace_size, + context_ptr, + op, + cpu_tensors]() -> int { + auto st = op->Execute( + variant_pack, (uint8_t*)workspace_ptr, workspace_size, context_ptr); + DestroyOperation(op); + return st; + }; + at_npu::native::OpCommand::RunOpApiV2(name, acl_call); +} + +void RunAtbCmdV2(atb::Operation* op, + const ParamSetter& paramsetter, + const std::string& name) { + aclrtStream stream = c10_npu::getCurrentNPUStream().stream(false); + atb::VariantPack variant_pack = paramsetter.variant_pack_; + const c10::SmallVector& cpu_tensors = + paramsetter.tensor_maintainer_.cpu_tensors; + auto acl_call = [op, variant_pack, stream, cpu_tensors]() -> int { + auto context_ptr = atb::utils::GetContext(stream); + uint64_t workspace_size = OperationSetup(variant_pack, op, context_ptr); + at::Tensor workspace_tensor; + void* workspace_ptr = nullptr; + if (workspace_size != 0) { + workspace_tensor = + at_npu::native::allocate_workspace(workspace_size, stream); + workspace_ptr = const_cast(workspace_tensor.storage().data()); + } + auto st = op->Execute( + variant_pack, (uint8_t*)workspace_ptr, workspace_size, context_ptr); + return 0; + }; + at_npu::native::OpCommand::RunOpApiV2(name, acl_call); +} + +void RunAtbCmd(atb::Operation* op, + const ParamSetter& paramsetter, + const std::string& name) 
{ + const auto is_capturing = + static_cast(c10_npu::currentStreamCaptureStatusMayInitCtx()); + if (is_capturing) { + RunAtbCmdV1(op, paramsetter, name); + } else { + RunAtbCmdV2(op, paramsetter, name); + } +} + +ParamSetter& ParamSetter::Input(const at::Tensor& tensor, + const bool& format_trans) { + if (!tensor.defined()) { + variant_pack_.inTensors.push_back(atb::Tensor()); + return *this; + } + at::Tensor new_tensor = tensor.contiguous(); + if (format_trans) { + new_tensor = atb::utils::FormatTrans(new_tensor); + } + atb::Tensor atb_tensor; + if (new_tensor.device().type() == at::kCPU) { + auto tensor_clone = new_tensor.clone(); + atb_tensor = AtTensor2AtbTensor(tensor_clone); + tensor_maintainer_.cpu_tensors.emplace_back(std::move(tensor_clone)); + } else { + atb_tensor = AtTensor2AtbTensor(new_tensor); + tensor_maintainer_.contiguous_tensors.emplace_back(std::move(new_tensor)); + } + variant_pack_.inTensors.push_back(atb_tensor); + return *this; +} + +ParamSetter& ParamSetter::Input(const c10::optional& tensor, + const bool& format_trans) { + if (!tensor.has_value()) { + variant_pack_.inTensors.push_back(atb::Tensor()); + return *this; + } + return Input(tensor.value(), format_trans); +} + +ParamSetter& ParamSetter::Output(at::Tensor& output) { + auto atb_tensor = AtTensor2AtbTensor(output); + variant_pack_.outTensors.push_back(atb_tensor); + return *this; +} + +uint64_t OperationSetup(atb::VariantPack variant_pack, + atb::Operation* operation, + atb::Context* context_ptr) { + uint64_t workspace_size = 0; + atb::Status status = + operation->Setup(variant_pack, workspace_size, context_ptr); + TORCH_CHECK(status == 0, operation->GetName(), " setup failed!"); + return workspace_size; +} + +} // namespace atb \ No newline at end of file diff --git a/xllm/core/kernels/npu/custom_functions_npu/AtbCommon.h b/xllm/core/kernels/npu/custom_functions_npu/AtbCommon.h new file mode 100644 index 00000000..f4659eb9 --- /dev/null +++ b/xllm/core/kernels/npu/custom_functions_npu/AtbCommon.h @@ -0,0 +1,493 @@ +// Copyright (c) 2025 Huawei Technologies Co., Ltd +// All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef OPPLUGIN_UTILS_ATB_COMMON_H +#define OPPLUGIN_UTILS_ATB_COMMON_H +#include +#include +#include +#include +#include +#include + +#include "./OperationCreate.h" +#include "Utils.h" +#include "atb/atb_infer.h" + +namespace atb { + +using aclTensor = struct aclTensor; +constexpr int64_t MAX_DIM_NUM = 5; +// small vector max size +const int N = 32; + +using _aclCreateTensor = aclTensor* (*)(const int64_t* view_dims, + uint64_t view_dims_num, + aclDataType data_type, + const int64_t* stride, + int64_t offset, + aclFormat format, + const int64_t* storage_dims, + uint64_t storage_dims_num, + void* tensor_data); +using _aclDestroyTensor = int (*)(const aclTensor*); + +using AtbApiFunc = int (*)(void*, uint64_t, atb::Operation*, atb::Context*); + +#define GET_OP_API_FUNC(apiName) \ + reinterpret_cast<_##apiName>(GetApiFuncAddr(#apiName)) + +inline const char* GetAtbApiLibName(void) { return "libatb.so"; } + +inline const char* GetOpApiLibName(void) { return "libopapi.so"; } + +inline void* GetApiLibHandler(const char* libName) { + auto handler = dlopen(libName, RTLD_LAZY); + if (handler == nullptr) { + ASCEND_LOGW("dlopen %s failed, error:%s.", libName, dlerror()); + } + return handler; +} + +inline void* GetApiFuncAddrInLib(void* handler, + const char* libName, + const char* apiName) { + auto funcAddr = dlsym(handler, apiName); + if (funcAddr == nullptr) { + ASCEND_LOGW( + "dlsym %s from %s failed, error:%s.", apiName, libName, dlerror()); + } + return funcAddr; +} + +inline void* GetApiFuncAddr(const char* apiName) { + static auto atbApiHandler = GetApiLibHandler(GetAtbApiLibName()); + if (atbApiHandler != nullptr) { + auto funcAddr = + GetApiFuncAddrInLib(atbApiHandler, GetAtbApiLibName(), apiName); + if (funcAddr != nullptr) { + return funcAddr; + } + } + static auto opApiHandler = GetApiLibHandler(GetOpApiLibName()); + if (opApiHandler != nullptr) { + auto funcAddr = + GetApiFuncAddrInLib(opApiHandler, GetOpApiLibName(), apiName); + if (funcAddr != nullptr) { + return funcAddr; + } + TORCH_CHECK(false, "GetApiFuncAddr not found ", apiName); + } +} + +struct TensorMaintainer { + c10::SmallVector + contiguous_tensors; // npu tensor's life should maintain when + // uncontiguous to contiguous. + c10::SmallVector + cpu_tensors; // cpu tensor's life should maintain in taskqueue. +}; + +inline aclTensor* ConvertType(TensorMaintainer& maintainer, + const at::Tensor& tensor) { + static const auto aclCreateTensor = + reinterpret_cast<_aclCreateTensor>(GetApiFuncAddr("aclCreateTensor")); + if (aclCreateTensor == nullptr) { + return nullptr; + } + + if (!tensor.defined()) { + return nullptr; + } + at::Tensor at_tensor = tensor.contiguous(); + aclFormat format = atb::utils::GetFormatForAtb(at_tensor); + + at::ScalarType scalar_data_type = at_tensor.scalar_type(); + aclDataType acl_data_type = + atb::utils::ConvertToAclDataType(scalar_data_type); + c10::SmallVector storageDims; + // if acl_data_type is ACL_STRING, storageDims is empty. 
+ if (acl_data_type != ACL_STRING) { + TORCH_CHECK(at_tensor.itemsize() > 0, + "the itemsize of tensor must be greater than 0."); + storageDims.push_back(at_tensor.storage().nbytes() / at_tensor.itemsize()); + } + + const auto dimNum = at_tensor.sizes().size(); + auto acl_tensor = + aclCreateTensor(at_tensor.sizes().data(), + at_tensor.sizes().size(), + acl_data_type, + at_tensor.strides().data(), + at_tensor.storage_offset(), + format, + storageDims.data(), + storageDims.size(), + const_cast(at_tensor.storage().data())); + if (at_tensor.device().type() == at::kCPU) { + maintainer.cpu_tensors.emplace_back(std::move(at_tensor)); + } else { + maintainer.contiguous_tensors.emplace_back(std::move(at_tensor)); + } + return acl_tensor; +} + +inline aclTensor* ConvertType(TensorMaintainer& maintainer, + const c10::optional& opt_tensor) { + if (opt_tensor.has_value() && opt_tensor.value().defined()) { + return ConvertType(maintainer, opt_tensor.value()); + } + + return nullptr; +} + +template +T ConvertType(TensorMaintainer& maintainer, T value) { + return value; +} + +template +constexpr auto ConvertTypes(TensorMaintainer& maintainer, Ts&... args) { + return std::make_tuple(ConvertType(maintainer, args)...); +} + +struct TensorStruct { + void* data_ptr = nullptr; // at_tensor.storage().data() + at::ScalarType scalar_type; // at_tensor.scalar_type() + size_t nbytes; // at_tensor.storage().nbytes() + size_t itemsize; // at_tensor.itemsize() + int64_t storage_offset; // at_tensor.storage_offset() + std::vector sizes; // at_tensor.sizes() + std::vector strides; // at_tensor.strides() + aclFormat format; // at_tensor format + + TensorStruct(void* data_ptr_, + at::ScalarType scalar_type_, + size_t nbytes_, + size_t itemsize_, + int64_t storage_offset_, + at::IntArrayRef sizes_, + at::IntArrayRef strides_, + aclFormat format_) + : data_ptr(data_ptr_), + scalar_type(scalar_type_), + nbytes(nbytes_), + itemsize(itemsize_), + storage_offset(storage_offset_), + sizes(sizes_.vec()), + strides(strides_.vec()), + format(format_) {} +}; +using TensorStructPtr = std::shared_ptr; + +inline TensorStructPtr CopyTypeV2(TensorMaintainer& maintainer, + const at::Tensor& tensor) { + if (!tensor.defined()) { + return nullptr; + } + at::Tensor at_tensor = tensor.contiguous(); + aclFormat format = atb::utils::GetFormatForAtb(at_tensor); + std::shared_ptr tensor_structptr = + std::make_shared( + const_cast(at_tensor.storage().data()), + at_tensor.scalar_type(), + at_tensor.storage().nbytes(), + at_tensor.itemsize(), + at_tensor.storage_offset(), + at_tensor.sizes(), + at_tensor.strides(), + format); + if (at_tensor.device().type() == at::kCPU) { + maintainer.cpu_tensors.emplace_back(std::move(at_tensor)); + } else { + maintainer.contiguous_tensors.emplace_back(std::move(at_tensor)); + } + return tensor_structptr; +} + +inline TensorStructPtr CopyTypeV2(TensorMaintainer& maintainer, + const c10::optional& opt_tensor) { + if (opt_tensor.has_value() && opt_tensor.value().defined()) { + return CopyTypeV2(maintainer, opt_tensor.value()); + } + + return nullptr; +} + +template +T CopyTypeV2(TensorMaintainer& maintainer, T value) { + return value; +} + +inline aclTensor* ConvertTypeV2(TensorStructPtr at_tensor) { + static const auto aclCreateTensor = GET_OP_API_FUNC(aclCreateTensor); + if (aclCreateTensor == nullptr) { + return nullptr; + } + + if (at_tensor == nullptr) { + return nullptr; + } + at::ScalarType scalar_data_type = (*at_tensor).scalar_type; + aclDataType acl_data_type = + 
atb::utils::ConvertToAclDataType(scalar_data_type); + c10::SmallVector storageDims; + // if acl_data_type is ACL_STRING, storageDims is empty. + if (acl_data_type != ACL_STRING) { + TORCH_CHECK((*at_tensor).itemsize > 0, + "the itemsize of tensor must be greater than 0."); + storageDims.push_back((*at_tensor).nbytes / (*at_tensor).itemsize); + } + + const auto dimNum = (*at_tensor).sizes.size(); + + auto acl_tensor = aclCreateTensor((*at_tensor).sizes.data(), + (*at_tensor).sizes.size(), + acl_data_type, + (*at_tensor).strides.data(), + (*at_tensor).storage_offset, + (*at_tensor).format, + storageDims.data(), + storageDims.size(), + (*at_tensor).data_ptr); + return acl_tensor; +} + +template +T ConvertTypeV2(T value) { + return value; +} + +template +auto convert_types_impl_v2(const Tuple& t, std::index_sequence) { + return std::make_tuple(ConvertTypeV2(std::get(t))...); +} + +template +constexpr auto ConvertTypesV2(const std::tuple& args, + uint64_t* workspace_size_addr, + atb::Operation** op_addr, + atb::Context* context_ptr) { + auto convert_args = + convert_types_impl_v2(args, std::make_index_sequence{}); + auto appends = std::make_tuple(workspace_size_addr, op_addr, context_ptr); + return std::tuple_cat(convert_args, appends); +} + +template +constexpr auto CopyTypesV2(TensorMaintainer& maintainer, Ts&... args) { + return std::make_tuple(CopyTypeV2(maintainer, args)...); +} + +template +auto call(Function f, Tuple t, std::index_sequence) { + return f(std::get(t)...); +} + +template +auto call(Function f, Tuple t) { + static constexpr auto size = std::tuple_size::value; + return call(f, t, std::make_index_sequence{}); +} + +template +auto ConvertToOpApiFunc(const Tuple& params, + void* opApiAddr, + std::index_sequence) { + using OpApiFunc = + int (*)(typename std::decay(params))>::type...); + auto func = reinterpret_cast(opApiAddr); + return func; +} + +template +auto ConvertToOpApiFunc(const Tuple& params, void* opApiAddr) { + static constexpr auto size = std::tuple_size::value; + return ConvertToOpApiFunc( + params, opApiAddr, std::make_index_sequence{}); +} + +inline void Release(atb::Context* context) {} + +inline void Release(aclTensor* p) { + static const auto aclDestroyTensor = GET_OP_API_FUNC(aclDestroyTensor); + if (aclDestroyTensor == nullptr) { + return; + } + aclDestroyTensor(p); +} + +template +void Release(T value) { + (void)value; +} + +template +void CallRelease(Tuple t, std::index_sequence) { + (void)std::initializer_list{(Release(std::get(t)), 0)...}; +} + +template +void ReleaseConvertTypes(Tuple& t) { + static constexpr auto size = std::tuple_size::value; + CallRelease(t, std::make_index_sequence{}); +} + +#define EXEC_ATB_CMD_V1(atb_api, ...) 
\ + do { \ + static const auto getWorkspaceSizeFuncAddr = \ + GetApiFuncAddr(#atb_api "GetWorkspaceSize"); \ + static const auto atbApiFuncAddr = GetApiFuncAddr(#atb_api); \ + TORCH_CHECK( \ + getWorkspaceSizeFuncAddr != nullptr && atbApiFuncAddr != nullptr, \ + #atb_api, \ + " or ", \ + #atb_api "GetWorkspaceSize", \ + " not in ", \ + GetAtbApiLibName(), \ + ", or ", \ + GetAtbApiLibName(), \ + "not found."); \ + auto acl_stream = c10_npu::getCurrentNPUStream().stream(false); \ + auto context_ptr = atb::utils::GetContext(acl_stream); \ + uint64_t workspace_size = 0; \ + uint64_t* workspace_size_addr = &workspace_size; \ + atb::Operation* op = nullptr; \ + atb::Operation** op_addr = &op; \ + TensorMaintainer tensor_maintainer; \ + auto converted_params = ConvertTypes(tensor_maintainer, \ + __VA_ARGS__, \ + workspace_size_addr, \ + op_addr, \ + context_ptr); \ + static auto getWorkspaceSizeFunc = \ + ConvertToOpApiFunc(converted_params, getWorkspaceSizeFuncAddr); \ + auto workspace_status = call(getWorkspaceSizeFunc, converted_params); \ + TORCH_CHECK(workspace_status == 0, "call " #atb_api " failed, detail:"); \ + void* workspace_addr = nullptr; \ + at::Tensor workspace_tensor; \ + if (workspace_size != 0) { \ + at::TensorOptions options = \ + at::TensorOptions(c10::DeviceType::PrivateUse1); \ + workspace_tensor = \ + at::empty({workspace_size}, options.dtype(at::kByte)); \ + workspace_addr = const_cast(workspace_tensor.storage().data()); \ + } \ + const c10::SmallVector& cpu_tensors = \ + tensor_maintainer.cpu_tensors; \ + auto atb_call = [converted_params, \ + workspace_addr, \ + workspace_size, \ + context_ptr, \ + op, \ + cpu_tensors]() -> int { \ + AtbApiFunc atbApiFunc = reinterpret_cast(atbApiFuncAddr); \ + auto api_ret = \ + atbApiFunc(workspace_addr, workspace_size, op, context_ptr); \ + TORCH_CHECK(api_ret == 0, "call " #atb_api " failed, detail:"); \ + DestroyOperation(op); \ + ReleaseConvertTypes(converted_params); \ + return api_ret; \ + }; \ + at_npu::native::OpCommand::RunOpApiV2(#atb_api, atb_call); \ + } while (false) + +#define EXEC_ATB_CMD_V2(atb_api, ...) 
\ + do { \ + static const auto getWorkspaceSizeFuncAddr = \ + GetApiFuncAddr(#atb_api "GetWorkspaceSize"); \ + static const auto AtbApiFuncAddr = GetApiFuncAddr(#atb_api); \ + TORCH_CHECK( \ + getWorkspaceSizeFuncAddr != nullptr && AtbApiFuncAddr != nullptr, \ + #atb_api, \ + " or ", \ + #atb_api "GetWorkspaceSize", \ + " not in ", \ + GetAtbApiLibName(), \ + ", or ", \ + GetAtbApiLibName(), \ + "not found."); \ + auto acl_stream = c10_npu::getCurrentNPUStream().stream(false); \ + TensorMaintainer tensor_maintainer; \ + auto copied_params = CopyTypesV2(tensor_maintainer, __VA_ARGS__); \ + auto hash_id = computeHash(std::string(#atb_api), __VA_ARGS__); \ + const c10::SmallVector& cpu_tensors = \ + tensor_maintainer.cpu_tensors; \ + auto atb_call = \ + [copied_params, acl_stream, hash_id, cpu_tensors]() -> int { \ + auto context_ptr = atb::utils::GetContext(acl_stream); \ + uint64_t workspace_size = 0; \ + uint64_t* workspace_size_addr = &workspace_size; \ + OpParamCache& opParamCache = \ + OpParamCache::getInstance(); \ + atb::Operation* op = opParamCache.getOperation(hash_id); \ + atb::Operation** op_addr = &op; \ + int api_ret = 0; \ + auto converted_params = ConvertTypesV2( \ + copied_params, workspace_size_addr, op_addr, context_ptr); \ + auto getWorkspaceSizeFunc = \ + ConvertToOpApiFunc(converted_params, getWorkspaceSizeFuncAddr); \ + auto workspace_status = call(getWorkspaceSizeFunc, converted_params); \ + opParamCache.saveOperation(hash_id, op); \ + TORCH_CHECK(workspace_status == 0, \ + "call " #atb_api "GetWorkspaceSize failed"); \ + void* workspace_addr = nullptr; \ + at::Tensor workspace_tensor; \ + if (workspace_size != 0) { \ + workspace_tensor = \ + at_npu::native::allocate_workspace(workspace_size, acl_stream); \ + workspace_addr = const_cast(workspace_tensor.storage().data()); \ + } \ + AtbApiFunc atbApiFunc = reinterpret_cast(AtbApiFuncAddr); \ + api_ret = atbApiFunc(workspace_addr, workspace_size, op, context_ptr); \ + TORCH_CHECK(api_ret == 0, "call " #atb_api " failed"); \ + ReleaseConvertTypes(converted_params); \ + return api_ret; \ + }; \ + at_npu::native::OpCommand::RunOpApiV2(#atb_api, atb_call); \ + } while (false) + +#define EXEC_ATB_CMD(atb_api, ...) \ + do { \ + const auto is_capturing = \ + static_cast(c10_npu::currentStreamCaptureStatusMayInitCtx()); \ + if (is_capturing) { \ + EXEC_ATB_CMD_V1(atb_api, __VA_ARGS__); \ + } else { \ + EXEC_ATB_CMD_V2(atb_api, __VA_ARGS__); \ + } \ + } while (false) + +atb::Tensor AtTensor2AtbTensor(const at::Tensor atTensor); +atb::Context* GetContext(aclrtStream stream); +uint64_t OperationSetup(atb::VariantPack variant_pack, + atb::Operation* operation, + atb::Context* context_ptr); +class ParamSetter { + public: + ParamSetter& Input(const at::Tensor& tensor, + const bool& format_trans = false); + ParamSetter& Input(const c10::optional& tensor, + const bool& format_trans = false); + ParamSetter& Output(at::Tensor& tensor); + atb::VariantPack variant_pack_; + TensorMaintainer tensor_maintainer_; +}; + +void RunAtbCmd(atb::Operation* op, + const ParamSetter& paramsetter, + const std::string& name); + +} // namespace atb + +#endif diff --git a/xllm/core/kernels/npu/custom_functions_npu/OperationCacheCompute.cpp b/xllm/core/kernels/npu/custom_functions_npu/OperationCacheCompute.cpp new file mode 100644 index 00000000..b46abb96 --- /dev/null +++ b/xllm/core/kernels/npu/custom_functions_npu/OperationCacheCompute.cpp @@ -0,0 +1,201 @@ +// Copyright (c) 2025 Huawei Technologies Co., Ltd +// All rights reserved. 
+// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "OperationCacheCompute.h" + +namespace atb { + +thread_local char g_hash_buf[g_hash_buf_size]; +thread_local int g_hash_offset = 0; +constexpr int g_rShift33Bits = 33; +constexpr uint64_t MIX_STEP1 = 18397679294719823053LLU; +constexpr uint64_t MIX_STEP2 = 14181476777654086739LLU; + +void add_param_to_buf(const string& s) { + MEMCPY_TO_BUF(s.c_str(), static_cast(s.size())); +} + +void add_param_to_buf(const c10::optional& t) {} +void add_param_to_buf(const at::Tensor& t) {} + +void add_param_to_buf() {} + +inline uint64_t rotating_left(uint64_t x, uint8_t n) { + return (x << n) | (x >> (64 - n)); +} + +inline uint64_t mixture(uint64_t x) { + // constants step1(18397679294719823053) and step2(14181476777654086739) are + // used to allow hash values to be more evenly distributed after + // multiplication. + x ^= x >> g_rShift33Bits; + x *= MIX_STEP1; + x ^= x >> g_rShift33Bits; + x *= MIX_STEP2; + x ^= x >> g_rShift33Bits; + + return x; +} + +// MurmurHash3 was written by Austin Appleby, and is placed in the public +// domain. The author hereby disclaims copyright to this source code. +uint64_t gen_hash(const void* key, + const int len, + const uint32_t seed = 0xdeadb0d7) { + const uint8_t* data = static_cast(key); + // the length of each block is 16 bytes + const int block_num = len / 16; + // has and hax are literal appromix to hash, and hax is the return value of + // this function. 
+ uint64_t has = seed; + uint64_t hax = seed; + + // use 9782798678568883157 and 5545529020109919103 for blocking and + // obfuscation of input data + const uint64_t c1 = 9782798678568883157LLU; + const uint64_t c2 = 5545529020109919103LLU; + + const uint64_t* blocks = + static_cast(static_cast(data)); + + for (int i = 0; i < block_num; i++) { + int even_num = 2; + uint64_t tmp1 = blocks[i * even_num]; + uint64_t tmp2 = blocks[i * even_num + 1]; + + int8_t bits_31 = 31; + tmp1 *= c1; + tmp1 = rotating_left(tmp1, bits_31); + tmp1 *= c2; + has ^= tmp1; + + int8_t bits_27 = 27; + has = rotating_left(has, bits_27); + has += hax; + // increase randomness by mul by 5 and adding a constant + has = has * 5 + 1390208809; + + int8_t bits_33 = 33; + tmp2 *= c2; + tmp2 = rotating_left(tmp2, bits_33); + tmp2 *= c1; + hax ^= tmp2; + + hax = rotating_left(hax, bits_31); + hax += has; + // increase randomness by mul by 5 and adding a constant + hax = hax * 5 + 944331445; + } + + // the length of each block is 16 bytes + const uint8_t* tail = data + block_num * 16; + uint64_t t1 = 0; + uint64_t t2 = 0; + // because the size of a block is 16, different offsets are calculated for + // tail blocks for different sizes + switch (static_cast(len) & 15) { + case 15: + t2 ^= (static_cast(tail[14])) << 48; + [[fallthrough]]; + ; + case 14: + t2 ^= (static_cast(tail[13])) << 40; + [[fallthrough]]; + ; + case 13: + t2 ^= (static_cast(tail[12])) << 32; + [[fallthrough]]; + ; + case 12: + t2 ^= (static_cast(tail[11])) << 24; + [[fallthrough]]; + ; + case 11: + t2 ^= (static_cast(tail[10])) << 16; + [[fallthrough]]; + ; + case 10: + t2 ^= (static_cast(tail[9])) << 8; + [[fallthrough]]; + ; + case 9: + t2 ^= (static_cast(tail[8])) << 0; + t2 *= c2; + t2 = rotating_left(t2, 33); + t2 *= c1; + hax ^= t2; + [[fallthrough]]; + ; + case 8: + t1 ^= (static_cast(tail[7])) << 56; + [[fallthrough]]; + ; + case 7: + t1 ^= (static_cast(tail[6])) << 48; + [[fallthrough]]; + ; + case 6: + t1 ^= (static_cast(tail[5])) << 40; + [[fallthrough]]; + ; + case 5: + t1 ^= (static_cast(tail[4])) << 32; + [[fallthrough]]; + ; + case 4: + t1 ^= (static_cast(tail[3])) << 24; + [[fallthrough]]; + ; + case 3: + t1 ^= (static_cast(tail[2])) << 16; + [[fallthrough]]; + ; + case 2: + t1 ^= (static_cast(tail[1])) << 8; + [[fallthrough]]; + ; + case 1: + t1 ^= (static_cast(tail[0])) << 0; + t1 *= c1; + t1 = rotating_left(t1, 31); + t1 *= c2; + has ^= t1; + [[fallthrough]]; + ; + default: + break; + }; + + has ^= static_cast(len); + hax ^= static_cast(len); + + has += hax; + hax += has; + + has = mixture(has); + hax = mixture(hax); + + has += hax; + hax += has; + return hax; +} + +uint64_t calc_hash_id() { + if (g_hash_offset == g_hash_buf_max_size) { + return 0; + } + uint64_t hash_id = gen_hash(g_hash_buf, g_hash_offset); + return hash_id; +} + +} // namespace atb diff --git a/xllm/core/kernels/npu/custom_functions_npu/OperationCacheCompute.h b/xllm/core/kernels/npu/custom_functions_npu/OperationCacheCompute.h new file mode 100644 index 00000000..c9b293f7 --- /dev/null +++ b/xllm/core/kernels/npu/custom_functions_npu/OperationCacheCompute.h @@ -0,0 +1,161 @@ +// Copyright (c) 2025 Huawei Technologies Co., Ltd +// All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef OPPLUGIN_UTILS_ATB_PARAM_OPERATION_CACHE_COMPUTE_H +#define OPPLUGIN_UTILS_ATB_PARAM_OPERATION_CACHE_COMPUTE_H + +#include + +#include +#include +#include + +#include "atb/atb_infer.h" + +namespace atb { + +constexpr int g_hash_buf_size = 8192; +constexpr int g_hash_buf_max_size = g_hash_buf_size + 1024; +extern thread_local char g_hash_buf[g_hash_buf_size]; +extern thread_local int g_hash_offset; + +#define MEMCPY_TO_BUF(data_expression, size_expression) \ + if (g_hash_offset + (size_expression) > g_hash_buf_size) { \ + g_hash_offset = g_hash_buf_max_size; \ + return; \ + } \ + memcpy(g_hash_buf + g_hash_offset, data_expression, size_expression); \ + g_hash_offset += size_expression; + +uint64_t calc_hash_id(); + +template +void add_param_to_buf(const T& value) { + MEMCPY_TO_BUF(&value, sizeof(T)); +} + +void add_param_to_buf(const string& s); +void add_param_to_buf(const c10::optional& t); +void add_param_to_buf(const at::Tensor& t); +void add_param_to_buf(); + +template +void add_param_to_buf(const std::string& name, const T& value) { + add_param_to_buf(name); + add_param_to_buf(value); +} + +template +void add_param_to_buf(const T& arg, Args&... args) { + add_param_to_buf(arg); + add_param_to_buf(args...); +} + +template +struct HashOpParam { + void operator()(const T& param) const {}; +}; + +// Each operator implements its own hash function calculation. +// If the operator parameters do not change, implementation can be omitted. +// It is possible to hash only the attributes that may change in the parameters +// of the calculation. 
following example:: +// +// `template <>` +// `struct HashOpParam { //if XXXParam's transposeA and +// hasBias need hash` +// `void operator()(const atb::infer::XXXParam& param) const {` +// `add_param_to_buf("transposeA", param.transposeA);` +// `add_param_to_buf("hasBias", param.hasBias);` +// `}` +// `};` + +template <> +struct HashOpParam { + void operator()(const atb::infer::RmsNormParam& param) const { + add_param_to_buf("epsilon", param.normParam.epsilon); + add_param_to_buf("layerType", param.layerType); + add_param_to_buf("quantType", param.normParam.quantType); + } +}; + +template <> +struct HashOpParam { + void operator()(const atb::infer::GroupTopkParam& param) const { + add_param_to_buf("groupNum", param.groupNum); + add_param_to_buf("k", param.k); + add_param_to_buf("groupMultiFlag", param.groupMultiFlag); + add_param_to_buf("n", param.n); + } +}; + +template <> +struct HashOpParam { + void operator()(const atb::infer::PagedAttentionParam& param) const { + add_param_to_buf("num_kv_heads", param.kvHeadNum); + add_param_to_buf("num_heads", param.headNum); + add_param_to_buf("scale_value", param.qkScale); + add_param_to_buf("quant_type", param.quantType); + add_param_to_buf("outdata_type", param.outDataType); + add_param_to_buf("mla_vheadsize", param.mlaVHeadSize); + add_param_to_buf("maskType", param.maskType); + add_param_to_buf("calcType", param.calcType); + } +}; + +template <> +struct HashOpParam { + void operator()(const atb::infer::SelfAttentionParam& param) const { + add_param_to_buf("num_kv_heads", param.kvHeadNum); + add_param_to_buf("num_heads", param.headNum); + add_param_to_buf("scale_value", param.qkScale); + add_param_to_buf("calcType", param.calcType); + add_param_to_buf("kernelType", param.kernelType); + add_param_to_buf("maskType", param.maskType); + add_param_to_buf("quantType", param.quantType); + add_param_to_buf("isTriuMask", param.isTriuMask); + } +}; + +template <> +struct HashOpParam { + void operator()(const atb::infer::RopeParam& param) const { + add_param_to_buf("rotaryCoeff", param.rotaryCoeff); + } +}; + +template <> +struct HashOpParam { + void operator()(const atb::infer::ReshapeAndCacheParam& param) const { + add_param_to_buf("compressType", param.compressType); + add_param_to_buf("kvCacheCfg", param.kvCacheCfg); + } +}; + +template +uint64_t computeHash(const T& obj) { + g_hash_offset = 0; + HashOpParam{}(obj); + return calc_hash_id(); +} + +template +uint64_t computeHash(const std::string& name, Ts&... args) { + g_hash_offset = 0; + add_param_to_buf(name, args...); + return calc_hash_id(); +} + +} // namespace atb + +#endif // OPPLUGIN_UTILS_ATB_PARAM_OPERATION_CACHE_COMPUTE_H diff --git a/xllm/core/kernels/npu/custom_functions_npu/OperationCreate.h b/xllm/core/kernels/npu/custom_functions_npu/OperationCreate.h new file mode 100644 index 00000000..c08a1310 --- /dev/null +++ b/xllm/core/kernels/npu/custom_functions_npu/OperationCreate.h @@ -0,0 +1,127 @@ +// Copyright (c) 2025 Huawei Technologies Co., Ltd +// All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef OPPLUGIN_UTILS_ATB_OPERATION_CREATE_H +#define OPPLUGIN_UTILS_ATB_OPERATION_CREATE_H + +#include +#include + +#include +#include +#include + +#include "OperationCacheCompute.h" +#include "Utils.h" +#include "atb/atb_infer.h" + +namespace atb { + +template +class OpParamCache { + public: + static OpParamCache& getInstance(); + + atb::Operation* getOperation(const ParamType& param, const std::string& name); + atb::Operation* getOperation(uint64_t hash_id); + void saveOperation(uint64_t hash_id, atb::Operation* op); + + private: + OpParamCache(); + + OpParamCache(const OpParamCache&) = delete; + OpParamCache& operator=(const OpParamCache&) = delete; + + ~OpParamCache(); + + std::unordered_map op_map_; + mutable std::mutex mutex_; +}; + +template +atb::Operation* CreateAtbOperation(const ParamType& param, + const std::string& name) { + atb::Operation* op = nullptr; + atb::CreateOperation(param, &op); + TORCH_CHECK(op != nullptr, name, " CreateOperation failed!"); + return op; +} + +template +OpParamCache& OpParamCache::getInstance() { + static OpParamCache instance; + return instance; +} + +template +atb::Operation* OpParamCache::getOperation(const ParamType& param, + const std::string& name) { + const auto is_capturing = + static_cast(c10_npu::currentStreamCaptureStatusMayInitCtx()); + if (is_capturing) { + // The atb operator does not support operator reuse, when operator creation + // and execution in separate threads. + return CreateAtbOperation(param, name); + } else { + uint64_t hashValue = computeHash(param); + { + std::lock_guard lock(mutex_); + auto op_cache = op_map_.find(hashValue); + if (op_cache != op_map_.end()) { + return op_cache->second; + } + atb::Operation* op = CreateAtbOperation(param, name); + op_map_[hashValue] = op; + return op; + } + } +} + +template +atb::Operation* OpParamCache::getOperation(uint64_t hash_id) { + std::lock_guard lock(mutex_); + auto op_cache = op_map_.find(hash_id); + if (op_cache != op_map_.end()) { + return op_cache->second; + } + + atb::Operation* op = nullptr; + return op; +} + +template +void OpParamCache::saveOperation(uint64_t hash_id, + atb::Operation* op) { + std::lock_guard lock(mutex_); + op_map_[hash_id] = op; + return; +} + +template +OpParamCache::OpParamCache() { + // To satisfy the destructuring order, ContextManager should be instantiated + // before OpParamCache. + atb::utils::ContextManager::GetInstance(); +} + +template +OpParamCache::~OpParamCache() { + std::lock_guard lock(mutex_); + for (auto& op_item : op_map_) { + DestroyOperation(op_item.second); + } +} + +} // namespace atb + +#endif // OPPLUGIN_UTILS_ATB_OPERATION_CREATE_H diff --git a/xllm/core/kernels/npu/custom_functions_npu/Utils.cpp b/xllm/core/kernels/npu/custom_functions_npu/Utils.cpp new file mode 100644 index 00000000..882497d8 --- /dev/null +++ b/xllm/core/kernels/npu/custom_functions_npu/Utils.cpp @@ -0,0 +1,79 @@ +// Copyright (c) 2025 Huawei Technologies Co., Ltd +// All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "Utils.h" + +#include + +namespace atb { +namespace utils { + +ContextManager& ContextManager::GetInstance() { + static ContextManager instance; + return instance; +} + +ContextManager::ContextManager() : atb_context_(nullptr) {} + +ContextManager::~ContextManager() { + if (atb_context_) { + auto status = atb::DestroyContext(atb_context_); + TORCH_CHECK(status == 0, "Destroy context failed!"); + atb_context_ = nullptr; + } +} + +atb::Context* ContextManager::GetContext(aclrtStream stream) { + std::call_once(create_flag_, [this]() { + auto status = atb::CreateContext(&atb_context_); + TORCH_CHECK(status == 0, "Create context failed!"); + }); + + atb_context_->SetExecuteStream(stream); + return atb_context_; +} + +atb::Context* GetContext(aclrtStream stream) { + return ContextManager::GetInstance().GetContext(stream); +} + +aclDataType ConvertToAclDataType(const at::ScalarType& data_type) { + auto acl_dtype = + kATenScalarTypeToAclDataTypeTable[static_cast(data_type)]; + TORCH_CHECK(acl_dtype != ACL_DT_UNDEFINED, + std::string(c10::toString(data_type)) + " has not been supported") + return acl_dtype; +} + +at::Tensor FormatTrans(const at::Tensor& at_tensor) { + if (torch_npu::utils::is_npu(at_tensor)) { + return at_npu::native::npu_format_cast(at_tensor, ACL_FORMAT_ND); + } + return at_tensor; +} + +bool IsBaseFormat(aclFormat& format) { + return (format == ACL_FORMAT_NCHW) || (format == ACL_FORMAT_ND) || + (format == ACL_FORMAT_NHWC) || (format == ACL_FORMAT_NCDHW); +} + +aclFormat GetFormatForAtb(const at::Tensor& at_tensor) { + if (torch_npu::utils::is_npu(at_tensor)) { + aclFormat format = + static_cast(at_npu::native::get_npu_format(at_tensor)); + return IsBaseFormat(format) ? ACL_FORMAT_ND : format; + } + return ACL_FORMAT_ND; +} +} // namespace utils +} // namespace atb diff --git a/xllm/core/kernels/npu/custom_functions_npu/Utils.h b/xllm/core/kernels/npu/custom_functions_npu/Utils.h new file mode 100644 index 00000000..e42b4274 --- /dev/null +++ b/xllm/core/kernels/npu/custom_functions_npu/Utils.h @@ -0,0 +1,101 @@ +// Copyright (c) 2025 Huawei Technologies Co., Ltd +// All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef OPPLUGIN_UTILS_ATB_UTILS_H +#define OPPLUGIN_UTILS_ATB_UTILS_H + +#include +#include +#include + +#include "atb/atb_infer.h" + +namespace atb { +namespace utils { + +class ContextManager { + public: + static ContextManager& GetInstance(); + atb::Context* GetContext(aclrtStream stream); + ~ContextManager(); + + ContextManager(const ContextManager&) = delete; + ContextManager& operator=(const ContextManager&) = delete; + + private: + ContextManager(); + std::once_flag create_flag_; + atb::Context* atb_context_; +}; + +atb::Context* GetContext(aclrtStream stream); + +#define AT_ALL_SCALAR_TYPE_AND_ACL_DATATYPE_PAIR(_) \ + _(at::ScalarType::Byte, ACL_UINT8) \ + _(at::ScalarType::Char, ACL_INT8) \ + _(at::ScalarType::Short, ACL_INT16) \ + _(at::ScalarType::Int, ACL_INT32) \ + _(at::ScalarType::Long, ACL_INT64) \ + _(at::ScalarType::Half, ACL_FLOAT16) \ + _(at::ScalarType::Float, ACL_FLOAT) \ + _(at::ScalarType::Double, ACL_DOUBLE) \ + _(at::ScalarType::ComplexHalf, ACL_COMPLEX32) \ + _(at::ScalarType::ComplexFloat, ACL_COMPLEX64) \ + _(at::ScalarType::ComplexDouble, ACL_COMPLEX128) \ + _(at::ScalarType::Bool, ACL_BOOL) \ + _(at::ScalarType::QInt8, ACL_DT_UNDEFINED) \ + _(at::ScalarType::QUInt8, ACL_DT_UNDEFINED) \ + _(at::ScalarType::QInt32, ACL_DT_UNDEFINED) \ + _(at::ScalarType::BFloat16, ACL_BF16) \ + _(at::ScalarType::QUInt4x2, ACL_DT_UNDEFINED) \ + _(at::ScalarType::QUInt2x4, ACL_DT_UNDEFINED) \ + _(at::ScalarType::Bits1x8, ACL_DT_UNDEFINED) \ + _(at::ScalarType::Bits2x4, ACL_DT_UNDEFINED) \ + _(at::ScalarType::Bits4x2, ACL_DT_UNDEFINED) \ + _(at::ScalarType::Bits8, ACL_DT_UNDEFINED) \ + _(at::ScalarType::Bits16, ACL_DT_UNDEFINED) \ + _(at::ScalarType::Float8_e5m2, ACL_DT_UNDEFINED) \ + _(at::ScalarType::Float8_e4m3fn, ACL_DT_UNDEFINED) \ + _(at::ScalarType::Undefined, ACL_DT_UNDEFINED) \ + _(at::ScalarType::NumOptions, ACL_DT_UNDEFINED) + +constexpr aclDataType kATenScalarTypeToAclDataTypeTable + [static_cast(at::ScalarType::NumOptions) + 1] = { +#define DEFINE_ENUM(_1, n) n, + AT_ALL_SCALAR_TYPE_AND_ACL_DATATYPE_PAIR(DEFINE_ENUM) +#undef DEFINE_ENUM +}; + +aclDataType ConvertToAclDataType(const at::ScalarType& data_type); +at::Tensor FormatTrans(const at::Tensor& at_tensor); +aclFormat GetFormatForAtb(const at::Tensor& at_tensor); + +template +inline int get_op_mode(const MapType& mode_map, + c10::optional mode_opt, + c10::string_view default_mode, + const char* mode_name) { + c10::string_view mode_str = mode_opt.value_or(default_mode); + auto it = mode_map.find(mode_str); + TORCH_CHECK(it != mode_map.end(), + "Unsupported ", + mode_name, + " value: '", + mode_str, + "'"); + return it->second; +} +} // namespace utils +} // namespace atb + +#endif diff --git a/xllm/core/kernels/npu/rms_norm.h b/xllm/core/kernels/npu/fused_layernorm.cpp similarity index 56% rename from xllm/core/kernels/npu/rms_norm.h rename to xllm/core/kernels/npu/fused_layernorm.cpp index ed7f8d04..3e663523 100644 --- a/xllm/core/kernels/npu/rms_norm.h +++ b/xllm/core/kernels/npu/fused_layernorm.cpp @@ -12,21 +12,20 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ +#include -#pragma once -#include "impl/npu_rms_norm_impl.h" +#include "npu_ops_api.h" +#include "ops_npu/npu_ops.h" -namespace xllm { -namespace kernel { +namespace xllm::kernel::npu { -class RmsNorm : public torch::nn::ModuleHolder<NpuRmsNormImpl> { - public: - using torch::nn::ModuleHolder<NpuRmsNormImpl>::ModuleHolder; - using Impl __attribute__((__unused__)) = NpuRmsNormImpl; +torch::Tensor fused_layernorm(const torch::Tensor& input, + const torch::Tensor& weight, + double eps) { + std::tuple<at::Tensor, at::Tensor> result = + at_npu::native::custom_ops::npu_rms_norm(input, weight, eps); + auto normalized_input = std::get<0>(result); + return normalized_input; +} - RmsNorm(const ModelContext& context) - : ModuleHolder(std::make_shared<NpuRmsNormImpl>(context)) {} -}; - -} // namespace kernel -} // namespace xllm +} // namespace xllm::kernel::npu \ No newline at end of file diff --git a/xllm/core/kernels/npu/impl/CMakeLists.txt b/xllm/core/kernels/npu/impl/CMakeLists.txt deleted file mode 100644 index d8ec37ff..00000000 --- a/xllm/core/kernels/npu/impl/CMakeLists.txt +++ /dev/null @@ -1,104 +0,0 @@ -include(cc_library) -include(cc_test) - -include_directories( - ${CMAKE_SOURCE_DIR}/third_party/spdlog/include -) - - -cc_library( - NAME - npu_kernels_impl - HDRS - npu_split_impl.h - npu_linear_impl.h - npu_rms_norm_impl.h - npu_rope_impl.h - SRCS - npu_split_impl.cpp - npu_linear_impl.cpp - npu_rms_norm_impl.cpp - npu_rope_impl.cpp - DEPS - :npu_layers - :model_context - :state_dict - glog::glog - torch - torch_npu -) - -cc_test( - NAME - npu_rms_norm_test - SRCS - npu_rms_norm_test.cpp - DEPS - :npu_kernels_impl - GTest::gtest - GTest::gtest_main - xllm_kernels - c_sec - atb - spdlog::spdlog -) - -cc_test( - NAME - npu_linear_test - SRCS - npu_linear_test.cpp - DEPS - :npu_kernels_impl - GTest::gtest - GTest::gtest_main - xllm_kernels - c_sec - atb - spdlog::spdlog -) - -cc_test( - NAME - npu_split_test - SRCS - npu_split_test.cpp - DEPS - :npu_kernels_impl - GTest::gtest - GTest::gtest_main - xllm_kernels - c_sec - atb - spdlog::spdlog -) - -cc_test( - NAME - npu_rope_impl_test - SRCS - npu_rope_impl_test.cpp - DEPS - :npu_kernels_impl - GTest::gtest - GTest::gtest_main - xllm_kernels - c_sec - atb - spdlog::spdlog -) - -cc_test( - NAME - npu_sample_model_test - SRCS - npu_sample_model_test.cpp - DEPS - :npu_kernels_impl - GTest::gtest - GTest::gtest_main - xllm_kernels - c_sec - atb - spdlog::spdlog -) \ No newline at end of file diff --git a/xllm/core/kernels/npu/impl/npu_linear_impl.cpp b/xllm/core/kernels/npu/impl/npu_linear_impl.cpp deleted file mode 100644 index e233f1ca..00000000 --- a/xllm/core/kernels/npu/impl/npu_linear_impl.cpp +++ /dev/null @@ -1,126 +0,0 @@ -/* Copyright 2025 The xLLM Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - https://github.com/jd-opensource/xllm/blob/main/LICENSE - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License.
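// --- Illustrative usage (editor's sketch, not part of the patch) ---
// fused_layernorm() introduced above is a thin wrapper over torch_npu's
// npu_rms_norm: it keeps only the normalized output and discards the returned
// rstd. The shapes and eps value below are placeholders (1e-6 matches the
// rms_norm_eps used in the tests elsewhere in this diff); the tensors are
// assumed to already live on an NPU device and the declaration from
// npu_ops_api.h to be visible.
static torch::Tensor RmsNormExample(const torch::Tensor& hidden_states,
                                    const torch::Tensor& norm_weight) {
  constexpr double kEps = 1e-6;
  // hidden_states: [num_tokens, hidden_size], norm_weight: [hidden_size]
  return xllm::kernel::npu::fused_layernorm(hidden_states, norm_weight, kEps);
}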
-==============================================================================*/ - -#include "npu_linear_impl.h" - -#include - -namespace xllm::kernel { - -NpuLinearImpl::NpuLinearImpl(const ModelContext& context) - : NpuBaseLayer(context) { - at_weight_tensors_.resize(1); - atb_weight_tensors_.resize(1); - at_out_tensors_.resize(1); - dtype_ = c10::typeMetaToScalarType(context.get_tensor_options().dtype()); - at_weight_tensors_[0] = torch::zeros({1}).to(context.get_tensor_options()); - tensor_placeholder_ = torch::zeros({1}).to(context.get_tensor_options()); - - atb::Status status = init_node(linear_node_); - if (status != atb::NO_ERROR) { - LOG(ERROR) << "Failed to initialize node, status: " << status; - LOG(FATAL) << "NpuLinearImpl initialization failed with status: " << status; - } -} - -void NpuLinearImpl::verify_loaded_weights(const std::string weight_str) const { - CHECK(at_weight_tensors_[0].sizes() != std::vector({1})) - << "weight is not loaded for " << weight_str; -} - -void NpuLinearImpl::merge_loaded_weights() { - atb_weight_tensors_[0] = - atb_speed::Utils::AtTensor2Tensor(at_weight_tensors_[0]); -} - -void NpuLinearImpl::load_state_dict(const StateDict& state_dict) { - set_weight(state_dict, "weight", 0); -} - -int64_t NpuLinearImpl::init_node(atb_speed::Model::Node& node) { - name_ = "linear"; - model_name_ = "llm"; - run_task_func_ = std::bind(&NpuLinearImpl::run_task, - this, - std::placeholders::_1, - std::placeholders::_2); - - atb::Operation* operation = nullptr; - atb::infer::LinearParam linearParam; - linearParam.transposeB = true; - // linearParam.outDataType = ACL_BF16; - linearParam.hasBias = false; - atb::Status atbStatus = atb::CreateOperation(linearParam, &operation); - if (atbStatus != atb::NO_ERROR) { - return atbStatus; - } - - node.operation.reset(operation); - if (node.operation == nullptr) { - LOG(ERROR) << "node.operation is null"; - return -1; - } - if (node.operation->GetInputNum() < 1) { - LOG(ERROR) << "Get unexpected input num: " << node.operation->GetInputNum(); - return -1; - } - if (node.operation->GetOutputNum() < 1) { - LOG(ERROR) << "Get unexpected output num: " - << node.operation->GetOutputNum(); - return -1; - } - ATB_SPEED_LOG_DEBUG("AddLinear"); - - return atb::NO_ERROR; -} - -torch::Tensor NpuLinearImpl::forward(const torch::Tensor& input, int nodeId) { - atb::Status st; - - build_node_variant_pack(linear_node_, input); - - st = execute_node(linear_node_, nodeId); - - if (st != 0) { - LOG(FATAL) << model_name_ - << " inference failed with error code: " << std::to_string(st); - } - - return at_out_tensors_.at(0); -} - -void NpuLinearImpl::build_node_variant_pack(atb_speed::Model::Node& node, - const torch::Tensor& input) { - internal_input = atb_speed::Utils::AtTensor2Tensor(input); - - atb::SVector ins = {internal_input, atb_weight_tensors_[0]}; - node.variantPack.inTensors = ins; - - atb::SVector inTensorDescs; - inTensorDescs.resize(node.operation->GetInputNum()); - inTensorDescs.at(0) = internal_input.desc; - inTensorDescs.at(1) = atb_weight_tensors_[0].desc; - - atb::SVector outTensorDescs; - node.operation->InferShape(inTensorDescs, outTensorDescs); - - at::Tensor output = - atb_speed::Utils::CreateAtTensorFromTensorDesc(outTensorDescs.at(0)); - at_out_tensors_.at(0) = output; - - node.variantPack.outTensors = {atb_speed::Utils::AtTensor2Tensor(output)}; -} - -} // namespace xllm::kernel diff --git a/xllm/core/kernels/npu/impl/npu_linear_impl.h b/xllm/core/kernels/npu/impl/npu_linear_impl.h deleted file mode 100644 index 
b1fc3d26..00000000 --- a/xllm/core/kernels/npu/impl/npu_linear_impl.h +++ /dev/null @@ -1,70 +0,0 @@ -/* Copyright 2025 The xLLM Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - https://github.com/jd-opensource/xllm/blob/main/LICENSE - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#pragma once - -#ifdef TORCH_HIGHER_THAN_PTA6 -#include -#include -#else -#include -#include -#endif - -#include - -#include - -#include "atb/atb_infer.h" -#include "framework/model/model_input_params.h" -#include "framework/model_context.h" -#include "framework/state_dict/state_dict.h" -#include "layers/npu/npu_base_layer.h" -#include "nlohmann/json.hpp" -#include "pytorch/adapter/utils/utils.h" -#include "xllm_kernels/core/include/atb_speed/base/hosttensor_binder.h" -#include "xllm_kernels/core/include/atb_speed/base/model.h" -#include "xllm_kernels/core/include/atb_speed/log.h" -#include "xllm_kernels/core/include/atb_speed/utils/model_factory.h" - -namespace xllm::kernel { - -class NpuLinearImpl : public xllm::layer::NpuBaseLayer { - public: - explicit NpuLinearImpl(const ModelContext& context); - - ~NpuLinearImpl() {}; - - void load_state_dict(const StateDict& state_dict); - - void verify_loaded_weights(const std::string weight_str) const; - - void merge_loaded_weights(); - - torch::Tensor forward(const torch::Tensor& input, int nodeId); - - private: - int64_t init_node(atb_speed::Model::Node& node); - void build_node_variant_pack(atb_speed::Model::Node& node, - const torch::Tensor& input); - atb_speed::Model::Node linear_node_; - std::string model_name_; - - std::vector at_out_tensors_; - atb::Tensor internal_input; - torch::Tensor tensor_placeholder_; -}; - -} // namespace xllm::kernel diff --git a/xllm/core/kernels/npu/impl/npu_linear_test.cpp b/xllm/core/kernels/npu/impl/npu_linear_test.cpp deleted file mode 100644 index ec4607a0..00000000 --- a/xllm/core/kernels/npu/impl/npu_linear_test.cpp +++ /dev/null @@ -1,401 +0,0 @@ -/* Copyright 2025 The xLLM Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - https://github.com/jd-opensource/xllm/blob/main/LICENSE - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-==============================================================================*/ - -#include -#include - -#include "kernels/npu/linear.h" - -namespace xllm::kernel { - -class NpuLinearTest : public ::testing::Test { - protected: - NpuLinearTest() : parallel_args_(1, 1, nullptr) { - try { - torch::zeros({1}, torch::TensorOptions().device("npu:0")); - - tensor_options_ = - torch::TensorOptions().dtype(torch::kFloat16).device("npu:0"); - npu_available_ = true; - std::cout << "Using NPU device" << std::endl; - - } catch (...) { - tensor_options_ = - torch::TensorOptions().dtype(torch::kFloat16).device(torch::kCPU); - npu_available_ = false; - std::cout << "Using CPU device (NPU unavailable)" << std::endl; - } - } - - void SetUp() override { - torch::manual_seed(42); - - model_args_.hidden_size() = 4096; - model_args_.intermediate_size() = 11008; - model_args_.dtype() = "float16"; - - quant_args_.torch_dtype() = "float16"; - - context_ = std::make_unique( - parallel_args_, model_args_, quant_args_, tensor_options_); - } - - void TearDown() override { - context_.reset(); - - if (npu_available_) { - try { - c10_npu::npuSynchronizeDevice(); - c10_npu::NPUCachingAllocator::emptyCache(); - std::this_thread::sleep_for(std::chrono::milliseconds(500)); - } catch (...) { - // NPU cleanup failures are usually not critical in test teardown - } - } - } - - StateDict CreateStateDict(const torch::Tensor& weight_tensor) { - std::unordered_map tensor_map; - tensor_map["weight"] = weight_tensor; - return StateDict(tensor_map, ""); - } - - ModelArgs model_args_; - QuantArgs quant_args_; - ParallelArgs parallel_args_; - torch::TensorOptions tensor_options_; - std::unique_ptr context_; - bool npu_available_ = true; -}; - -// Test NpuLinearImpl construction -TEST_F(NpuLinearTest, ConstructorTest) { - if (!npu_available_) { - GTEST_SKIP() << "Skipping NPU test - NPU device not available"; - } - - ASSERT_NO_THROW({ - auto linear = std::make_shared(*context_); - EXPECT_NE(linear, nullptr); - }); -} - -// Test Linear wrapper construction -TEST_F(NpuLinearTest, NpuLinearWrapperTest) { - if (!npu_available_) { - GTEST_SKIP() << "Skipping NPU test - NPU device not available"; - } - - ASSERT_NO_THROW({ auto linear = Linear(*context_); }); -} - -// Test state dict loading with mock weights -TEST_F(NpuLinearTest, LoadStateDictTest) { - if (!npu_available_) { - GTEST_SKIP() << "Skipping NPU test - NPU device not available"; - } - - auto linear = std::make_shared(*context_); - - // Create weight tensor with shape [output_size, input_size] for linear layer - auto weight_tensor = - torch::randn({model_args_.intermediate_size(), model_args_.hidden_size()}, - tensor_options_); - auto state_dict = CreateStateDict(weight_tensor); - - ASSERT_NO_THROW({ linear->load_state_dict(state_dict); }); -} - -// Test weight verification (should fail with uninitialized weights) -TEST_F(NpuLinearTest, VerifyLoadedWeightsFailTest) { - auto linear = std::make_shared(*context_); - - EXPECT_DEATH({ linear->verify_loaded_weights("test_weight"); }, ".*"); -} - -// Test weight verification (should pass with loaded weights) -TEST_F(NpuLinearTest, VerifyLoadedWeightsPassTest) { - if (!npu_available_) { - GTEST_SKIP() << "Skipping NPU test - NPU device not available"; - } - - auto linear = std::make_shared(*context_); - - auto weight_tensor = - torch::randn({model_args_.intermediate_size(), model_args_.hidden_size()}, - tensor_options_); - auto state_dict = CreateStateDict(weight_tensor); - linear->load_state_dict(state_dict); - - ASSERT_NO_THROW({ 
linear->verify_loaded_weights("test_weight"); }); -} - -// Test merge loaded weights -TEST_F(NpuLinearTest, MergeLoadedWeightsTest) { - if (!npu_available_) { - GTEST_SKIP() << "Skipping NPU test - NPU device not available"; - } - - auto linear = std::make_shared(*context_); - - auto weight_tensor = - torch::randn({model_args_.intermediate_size(), model_args_.hidden_size()}, - tensor_options_); - auto state_dict = CreateStateDict(weight_tensor); - linear->load_state_dict(state_dict); - - ASSERT_NO_THROW({ linear->merge_loaded_weights(); }); -} - -// Test forward pass with mock input (may fail without proper NPU setup) -TEST_F(NpuLinearTest, ForwardPassBasicTest) { - if (!npu_available_) { - GTEST_SKIP() << "Skipping NPU test - NPU device not available"; - } - - auto linear = Linear(*context_); - - auto weight_tensor = - torch::randn({model_args_.intermediate_size(), model_args_.hidden_size()}, - tensor_options_); - auto state_dict = CreateStateDict(weight_tensor); - linear->load_state_dict(state_dict); - linear->merge_loaded_weights(); - - // Input tensor with shape [batch_size, seq_len, hidden_size] - auto input = - torch::randn({1, 10, model_args_.hidden_size()}, tensor_options_); - - try { - auto npu_stream = c10_npu::getCurrentNPUStream(0); - auto output = linear(input, 0); - aclrtSynchronizeStream(npu_stream.stream()); - std::cout << "Input tensor shape: " << input.sizes() << std::endl; - std::cout << "Output tensor shape: " << output.sizes() << std::endl; - - // Expected output shape: [batch_size, seq_len, intermediate_size] - std::vector expected_shape = { - 1, 10, model_args_.intermediate_size()}; - EXPECT_EQ(output.sizes(), expected_shape); - } catch (const std::exception& e) { - GTEST_SKIP() << "Skipping forward pass test - requires NPU environment: " - << e.what(); - } -} - -// Test tensor shape consistency -TEST_F(NpuLinearTest, TensorShapeTest) { - if (!npu_available_) { - GTEST_SKIP() << "Skipping NPU test - NPU device not available"; - } - - auto linear = std::make_shared(*context_); - - auto weight_tensor = - torch::randn({model_args_.intermediate_size(), model_args_.hidden_size()}, - tensor_options_); - auto state_dict = CreateStateDict(weight_tensor); - linear->load_state_dict(state_dict); - - EXPECT_EQ(weight_tensor.size(0), model_args_.intermediate_size()); - EXPECT_EQ(weight_tensor.size(1), model_args_.hidden_size()); - EXPECT_EQ(weight_tensor.dim(), 2); -} - -// Test different weight matrix dimensions -TEST_F(NpuLinearTest, DifferentWeightDimensionsTest) { - if (!npu_available_) { - GTEST_SKIP() << "Skipping NPU test - NPU device not available"; - } - - std::vector> dimensions = { - {768, 3072}, {1024, 4096}, {2048, 8192}, {4096, 11008}, {8192, 22016}}; - - for (auto [input_size, output_size] : dimensions) { - model_args_.hidden_size() = input_size; - model_args_.intermediate_size() = output_size; - - QuantArgs local_quant_args = quant_args_; - local_quant_args.torch_dtype() = "float16"; - - auto context = std::make_unique( - parallel_args_, model_args_, local_quant_args, tensor_options_); - - auto linear = std::make_shared(*context); - - auto weight_tensor = - torch::randn({output_size, input_size}, tensor_options_); - auto state_dict = CreateStateDict(weight_tensor); - - ASSERT_NO_THROW({ linear->load_state_dict(state_dict); }); - - EXPECT_EQ(weight_tensor.size(0), output_size); - EXPECT_EQ(weight_tensor.size(1), input_size); - } -} - -// Test linear transformation mathematical properties -TEST_F(NpuLinearTest, LinearTransformationPropertiesTest) { - if 
(!npu_available_) { - GTEST_SKIP() << "Skipping NPU test - NPU device not available"; - } - - auto linear = Linear(*context_); - - auto weight_tensor = torch::eye(model_args_.hidden_size(), - torch::TensorOptions() - .dtype(torch::kFloat16) - .device(tensor_options_.device())); - - if (model_args_.intermediate_size() != model_args_.hidden_size()) { - if (model_args_.intermediate_size() > model_args_.hidden_size()) { - auto padded_weight = torch::zeros( - {model_args_.intermediate_size(), model_args_.hidden_size()}, - tensor_options_); - padded_weight.narrow(0, 0, model_args_.hidden_size()) = weight_tensor; - weight_tensor = padded_weight; - } else { - weight_tensor = - weight_tensor.narrow(0, 0, model_args_.intermediate_size()); - } - } - - auto state_dict = CreateStateDict(weight_tensor); - linear->load_state_dict(state_dict); - linear->merge_loaded_weights(); - - auto input = torch::ones({1, 1, model_args_.hidden_size()}, tensor_options_); - - try { - auto npu_stream = c10_npu::getCurrentNPUStream(0); - auto output = linear(input, 0); - aclrtSynchronizeStream(npu_stream.stream()); - - EXPECT_EQ(output.dim(), 3); - EXPECT_EQ(output.size(0), 1); - EXPECT_EQ(output.size(1), 1); - EXPECT_EQ(output.size(2), - model_args_.intermediate_size()); // output features - - } catch (const std::exception& e) { - GTEST_SKIP() - << "Skipping mathematical properties test - requires NPU environment: " - << e.what(); - } -} - -// Test batch processing -TEST_F(NpuLinearTest, BatchProcessingTest) { - if (!npu_available_) { - GTEST_SKIP() << "Skipping NPU test - NPU device not available"; - } - - auto linear = Linear(*context_); - - auto weight_tensor = - torch::randn({model_args_.intermediate_size(), model_args_.hidden_size()}, - tensor_options_); - auto state_dict = CreateStateDict(weight_tensor); - linear->load_state_dict(state_dict); - linear->merge_loaded_weights(); - - std::vector> batch_shapes = { - {1, 5, model_args_.hidden_size()}, - {2, 10, model_args_.hidden_size()}, - {4, 20, model_args_.hidden_size()}, - {8, 15, model_args_.hidden_size()}}; - - for (const auto& shape : batch_shapes) { - auto input = torch::randn(shape, tensor_options_); - - try { - auto npu_stream = c10_npu::getCurrentNPUStream(0); - auto output = linear(input, 0); - aclrtSynchronizeStream(npu_stream.stream()); - - EXPECT_EQ(output.size(0), shape[0]); - EXPECT_EQ(output.size(1), shape[1]); - EXPECT_EQ(output.size(2), model_args_.intermediate_size()); - - } catch (const std::exception& e) { - GTEST_SKIP() << "Skipping batch processing test for shape [" << shape[0] - << ", " << shape[1] << ", " << shape[2] - << "] - requires NPU environment: " << e.what(); - break; - } - } -} - -// Test error handling with invalid inputs -TEST_F(NpuLinearTest, ErrorHandlingTest) { - if (!npu_available_) { - GTEST_SKIP() << "Skipping NPU test - NPU device not available"; - } - - auto linear = Linear(*context_); - - auto weight_tensor = - torch::randn({model_args_.intermediate_size(), model_args_.hidden_size()}, - tensor_options_); - auto state_dict = CreateStateDict(weight_tensor); - linear->load_state_dict(state_dict); - linear->merge_loaded_weights(); - - auto wrong_input = - torch::randn({1, 10, model_args_.hidden_size() + 100}, tensor_options_); - - try { - auto npu_stream = c10_npu::getCurrentNPUStream(0); - auto output = linear(wrong_input, 0); - aclrtSynchronizeStream(npu_stream.stream()); - FAIL() << "Expected exception for mismatched input dimensions"; - } catch (const std::exception& e) { - // Expected behavior - input dimension mismatch 
should cause error - std::cout << "Correctly caught expected error: " << e.what() << std::endl; - } -} - -} // namespace xllm::kernel - -int main(int argc, char** argv) { - struct rlimit core_limit; - core_limit.rlim_cur = 0; - core_limit.rlim_max = 0; - setrlimit(RLIMIT_CORE, &core_limit); - - FILE* null_stderr = freopen("/dev/null", "w", stderr); - if (null_stderr == nullptr) { - fclose(stderr); - } - - ::testing::InitGoogleTest(&argc, argv); - - bool npu_available = false; - try { - auto test_tensor = - torch::zeros({1}, torch::TensorOptions().device("npu:0")); - npu_available = true; - } catch (...) { - npu_available = false; - } - - if (!npu_available) { - std::cout << "NPU device not available, skipping all tests." << std::endl; - return 0; // Exit with success code, all tests skipped - } - - int result = RUN_ALL_TESTS(); - _exit(result); -} \ No newline at end of file diff --git a/xllm/core/kernels/npu/impl/npu_rms_norm_impl.cpp b/xllm/core/kernels/npu/impl/npu_rms_norm_impl.cpp deleted file mode 100644 index 1d16c8ba..00000000 --- a/xllm/core/kernels/npu/impl/npu_rms_norm_impl.cpp +++ /dev/null @@ -1,110 +0,0 @@ -/* Copyright 2025 The xLLM Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - https://github.com/jd-opensource/xllm/blob/main/LICENSE - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "npu_rms_norm_impl.h" - -#include - -namespace xllm::kernel { - -void NpuRmsNormImpl::param_from_args(atb::infer::RmsNormParam& param, - const ModelArgs& args) { - param.layerType = atb::infer::RmsNormParam::RmsNormType::RMS_NORM_NORM; - param.normParam.epsilon = args.rms_norm_eps(); -} - -int64_t NpuRmsNormImpl::init_node(atb_speed::Model::Node& node, - atb::infer::RmsNormParam& param) { - name_ = "rms_norm"; - model_name_ = "llm"; - run_task_func_ = std::bind(&NpuRmsNormImpl::run_task, - this, - std::placeholders::_1, - std::placeholders::_2); - - atb::Operation* operation = nullptr; - atb::Status atbStatus = atb::CreateOperation(param, &operation); - if (atbStatus != atb::NO_ERROR) { - return atbStatus; - } - - node.operation.reset(operation); - if (node.operation == nullptr) { - LOG(ERROR) << "node.operation is null"; - return -1; - } - if (node.operation->GetInputNum() < 1) { - LOG(ERROR) << "Can not resize number which is smaller than 1"; - return -1; - } - - return atb::NO_ERROR; -} - -NpuRmsNormImpl::NpuRmsNormImpl(const ModelContext& context) - : NpuBaseLayer(context) { - param_from_args(norm_param_, context.get_model_args()); - - at_weight_tensors_.resize(1); - atb_weight_tensors_.resize(1); - - auto options = context.get_tensor_options(); - dtype_ = c10::typeMetaToScalarType(options.dtype()); - at_weight_tensors_[0] = torch::zeros({1}).to(options); - - atb::Status status = init_node(norm_node_, norm_param_); - if (status != atb::NO_ERROR) { - LOG(ERROR) << "Failed to initialize node, status: " << status; - LOG(FATAL) << "NpuRmsNormImpl initialization failed with status: " - << std::to_string(status); - } -} - -void NpuRmsNormImpl::verify_loaded_weights(const std::string 
weight_str) const { - CHECK(at_weight_tensors_[0].sizes() != std::vector({1})) - << "final norm weight is not loaded for " << weight_str; -} - -void NpuRmsNormImpl::merge_loaded_weights() { - atb_weight_tensors_[0] = - atb_speed::Utils::AtTensor2Tensor(at_weight_tensors_[0]); -} - -void NpuRmsNormImpl::load_state_dict(const StateDict& state_dict) { - set_weight(state_dict, "weight", 0); - at_weight_tensors_[0] = at_weight_tensors_[0].to(dtype_); -} - -torch::Tensor NpuRmsNormImpl::forward(torch::Tensor& x, int nodeId) { - atb::Status st; - build_node_variant_pack(norm_node_, x); - st = execute_node(norm_node_, nodeId); - LOG_IF(FATAL, st != 0) << model_name_ - << "infer shape fail, error code: " << st; - return x; -} - -void NpuRmsNormImpl::build_node_variant_pack(atb_speed::Model::Node& node, - torch::Tensor& x) { - internal_tensors_ = atb_speed::Utils::AtTensor2Tensor(x); - - atb::SVector ins = {internal_tensors_, atb_weight_tensors_[0]}; - atb::SVector outs = {internal_tensors_}; - - node.variantPack.inTensors = ins; - node.variantPack.outTensors = outs; -} - -} // namespace xllm::kernel diff --git a/xllm/core/kernels/npu/impl/npu_rms_norm_impl.h b/xllm/core/kernels/npu/impl/npu_rms_norm_impl.h deleted file mode 100644 index dda02375..00000000 --- a/xllm/core/kernels/npu/impl/npu_rms_norm_impl.h +++ /dev/null @@ -1,71 +0,0 @@ -/* Copyright 2025 The xLLM Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - https://github.com/jd-opensource/xllm/blob/main/LICENSE - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-==============================================================================*/ - -#pragma once -#ifdef TORCH_HIGHER_THAN_PTA6 -#include -#include -#else -#include -#include -#endif - -#include - -#include - -#include "atb/atb_infer.h" -#include "framework/kv_cache/kv_cache.h" -#include "framework/model/model_input_params.h" -#include "framework/state_dict/state_dict.h" -#include "layers/npu/npu_base_layer.h" -#include "nlohmann/json.hpp" -#include "pytorch/adapter/utils/utils.h" -#include "xllm_kernels/core/include/atb_speed/base/hosttensor_binder.h" -#include "xllm_kernels/core/include/atb_speed/base/model.h" -#include "xllm_kernels/core/include/atb_speed/log.h" -#include "xllm_kernels/core/include/atb_speed/utils/model_factory.h" - -namespace xllm::kernel { - -class NpuRmsNormImpl : public xllm::layer::NpuBaseLayer { - public: - explicit NpuRmsNormImpl(const ModelContext& context); - - ~NpuRmsNormImpl() {}; - - void load_state_dict(const StateDict& state_dict); - - void verify_loaded_weights(const std::string weight_str) const; - - void merge_loaded_weights(); - - torch::Tensor forward(torch::Tensor& x, int nodeId); - - private: - int64_t init_node(atb_speed::Model::Node& node, - atb::infer::RmsNormParam& param); - - void build_node_variant_pack(atb_speed::Model::Node& node, torch::Tensor& x); - - void param_from_args(atb::infer::RmsNormParam& param, const ModelArgs& args); - - atb_speed::Model::Node norm_node_; - std::string model_name_; - atb::infer::RmsNormParam norm_param_; - atb::Tensor internal_tensors_; -}; - -} // namespace xllm::kernel diff --git a/xllm/core/kernels/npu/impl/npu_rms_norm_test.cpp b/xllm/core/kernels/npu/impl/npu_rms_norm_test.cpp deleted file mode 100644 index df4c0ce3..00000000 --- a/xllm/core/kernels/npu/impl/npu_rms_norm_test.cpp +++ /dev/null @@ -1,262 +0,0 @@ -/* Copyright 2025 The xLLM Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - https://github.com/jd-opensource/xllm/blob/main/LICENSE - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include -#include - -#include "kernels/npu/rms_norm.h" - -namespace xllm::kernel { - -class NpuRmsNormTest : public ::testing::Test { - protected: - NpuRmsNormTest() : parallel_args_(1, 1, nullptr) { - try { - torch::zeros({1}, torch::TensorOptions().device("npu:0")); - - tensor_options_ = - torch::TensorOptions().dtype(torch::kFloat16).device("npu:0"); - npu_available_ = true; - std::cout << "Using NPU device" << std::endl; - - } catch (...) 
{ - tensor_options_ = - torch::TensorOptions().dtype(torch::kFloat16).device(torch::kCPU); - npu_available_ = false; - std::cout << "Using CPU device (NPU unavailable)" << std::endl; - } - } - - void SetUp() override { - torch::manual_seed(42); - - model_args_.rms_norm_eps() = 1e-6f; - model_args_.hidden_size() = 4096; - model_args_.dtype() = "float16"; - - quant_args_.torch_dtype() = "float16"; - - context_ = std::make_unique( - parallel_args_, model_args_, quant_args_, tensor_options_); - } - - void TearDown() override { - context_.reset(); - - if (npu_available_) { - try { - c10_npu::npuSynchronizeDevice(); - c10_npu::NPUCachingAllocator::emptyCache(); - std::this_thread::sleep_for(std::chrono::milliseconds(500)); - } catch (...) { - // NPU cleanup failures are usually not critical in test teardown - } - } - } - - StateDict CreateStateDict(const torch::Tensor& weight_tensor) { - std::unordered_map tensor_map; - tensor_map["weight"] = weight_tensor; - return StateDict(tensor_map, ""); - } - - ModelArgs model_args_; - QuantArgs quant_args_; - ParallelArgs parallel_args_; - torch::TensorOptions tensor_options_; - std::unique_ptr context_; - bool npu_available_ = true; -}; - -// Test NpuRmsNormImpl construction -TEST_F(NpuRmsNormTest, ConstructorTest) { - if (!npu_available_) { - GTEST_SKIP() << "Skipping NPU test - NPU device not available"; - } - - ASSERT_NO_THROW({ - auto rms_norm = std::make_shared(*context_); - EXPECT_NE(rms_norm, nullptr); - }); -} - -// Test RmsNorm wrapper construction -TEST_F(NpuRmsNormTest, RmsNormWrapperTest) { - if (!npu_available_) { - GTEST_SKIP() << "Skipping NPU test - NPU device not available"; - } - - ASSERT_NO_THROW({ auto rms_norm = RmsNorm(*context_); }); -} - -// Test state dict loading with mock weights -TEST_F(NpuRmsNormTest, LoadStateDictTest) { - if (!npu_available_) { - GTEST_SKIP() << "Skipping NPU test - NPU device not available"; - } - - auto rms_norm = std::make_shared(*context_); - - auto weight_tensor = - torch::randn({model_args_.hidden_size()}, tensor_options_); - auto state_dict = CreateStateDict(weight_tensor); - - ASSERT_NO_THROW({ rms_norm->load_state_dict(state_dict); }); -} - -// Test weight verification (should fail with uninitialized weights) -TEST_F(NpuRmsNormTest, VerifyLoadedWeightsFailTest) { - auto rms_norm = std::make_shared(*context_); - - EXPECT_DEATH({ rms_norm->verify_loaded_weights("test_weight"); }, ".*"); -} - -// Test weight verification (should pass with loaded weights) -TEST_F(NpuRmsNormTest, VerifyLoadedWeightsPassTest) { - if (!npu_available_) { - GTEST_SKIP() << "Skipping NPU test - NPU device not available"; - } - - auto rms_norm = std::make_shared(*context_); - - auto weight_tensor = - torch::randn({model_args_.hidden_size()}, tensor_options_); - auto state_dict = CreateStateDict(weight_tensor); - rms_norm->load_state_dict(state_dict); - - ASSERT_NO_THROW({ rms_norm->verify_loaded_weights("test_weight"); }); -} - -// Test merge loaded weights -TEST_F(NpuRmsNormTest, MergeLoadedWeightsTest) { - if (!npu_available_) { - GTEST_SKIP() << "Skipping NPU test - NPU device not available"; - } - - auto rms_norm = std::make_shared(*context_); - - auto weight_tensor = - torch::randn({model_args_.hidden_size()}, tensor_options_); - auto state_dict = CreateStateDict(weight_tensor); - rms_norm->load_state_dict(state_dict); - - ASSERT_NO_THROW({ rms_norm->merge_loaded_weights(); }); -} - -// Test forward pass with mock input (may fail without proper NPU setup) -TEST_F(NpuRmsNormTest, ForwardPassBasicTest) { - if 
(!npu_available_) { - GTEST_SKIP() << "Skipping NPU test - NPU device not available"; - } - - auto rms_norm = RmsNorm(*context_); - - auto weight_tensor = - torch::randn({model_args_.hidden_size()}, tensor_options_); - auto state_dict = CreateStateDict(weight_tensor); - rms_norm->load_state_dict(state_dict); - rms_norm->merge_loaded_weights(); - - auto input = - torch::randn({1, 10, model_args_.hidden_size()}, tensor_options_); - - try { - auto npu_stream = c10_npu::getCurrentNPUStream(0); - auto output = rms_norm(input, 0); - aclrtSynchronizeStream(npu_stream.stream()); - std::cout << "Output tensor shape: " << output.sizes() << std::endl; - EXPECT_EQ(output.sizes(), input.sizes()); - } catch (const std::exception& e) { - GTEST_SKIP() << "Skipping forward pass test - requires NPU environment: " - << e.what(); - } -} - -// Test tensor shape consistency -TEST_F(NpuRmsNormTest, TensorShapeTest) { - if (!npu_available_) { - GTEST_SKIP() << "Skipping NPU test - NPU device not available"; - } - - auto rms_norm = std::make_shared(*context_); - - auto weight_tensor = - torch::randn({model_args_.hidden_size()}, tensor_options_); - auto state_dict = CreateStateDict(weight_tensor); - rms_norm->load_state_dict(state_dict); - - EXPECT_EQ(weight_tensor.size(0), model_args_.hidden_size()); - EXPECT_EQ(weight_tensor.dim(), 1); -} - -// Test with different hidden sizes -TEST_F(NpuRmsNormTest, DifferentHiddenSizesTest) { - if (!npu_available_) { - GTEST_SKIP() << "Skipping NPU test - NPU device not available"; - } - - std::vector hidden_sizes = {768, 1024, 2048, 4096, 8192}; - - for (int64_t hidden_size : hidden_sizes) { - model_args_.hidden_size() = hidden_size; - QuantArgs local_quant_args = quant_args_; - local_quant_args.torch_dtype() = "float16"; - - auto context = std::make_unique( - parallel_args_, model_args_, local_quant_args, tensor_options_); - - auto rms_norm = std::make_shared(*context); - - auto weight_tensor = torch::randn({hidden_size}, tensor_options_); - auto state_dict = CreateStateDict(weight_tensor); - - ASSERT_NO_THROW({ rms_norm->load_state_dict(state_dict); }); - - EXPECT_EQ(weight_tensor.size(0), hidden_size); - } -} - -} // namespace xllm::kernel - -int main(int argc, char** argv) { - struct rlimit core_limit; - core_limit.rlim_cur = 0; - core_limit.rlim_max = 0; - setrlimit(RLIMIT_CORE, &core_limit); - - FILE* null_stderr = freopen("/dev/null", "w", stderr); - if (null_stderr == nullptr) { - fclose(stderr); - } - - ::testing::InitGoogleTest(&argc, argv); - - bool npu_available = false; - try { - auto test_tensor = - torch::zeros({1}, torch::TensorOptions().device("npu:0")); - npu_available = true; - } catch (...) { - npu_available = false; - } - - if (!npu_available) { - std::cout << "NPU device not available, skipping all tests." << std::endl; - return 0; // Exit with success code, all tests skipped - } - - int result = RUN_ALL_TESTS(); - _exit(result); -} \ No newline at end of file diff --git a/xllm/core/kernels/npu/impl/npu_rope_impl.cpp b/xllm/core/kernels/npu/impl/npu_rope_impl.cpp deleted file mode 100644 index 805f4cda..00000000 --- a/xllm/core/kernels/npu/impl/npu_rope_impl.cpp +++ /dev/null @@ -1,142 +0,0 @@ -/* Copyright 2025 The xLLM Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - https://github.com/jd-opensource/xllm/blob/main/LICENSE - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "npu_rope_impl.h" - -#include - -namespace xllm::kernel { - -void NpuRopeImpl::param_from_args(atb::infer::RopeParam& param, - const ModelArgs& args) { - param.rotaryCoeff = 2; -} - -int64_t NpuRopeImpl::init_node(atb_speed::Model::Node& node, - atb::infer::RopeParam& param) { - name_ = "rope"; - model_name_ = "llm"; - run_task_func_ = std::bind(&NpuRopeImpl::run_task, - this, - std::placeholders::_1, - std::placeholders::_2); - - atb::Operation* operation = nullptr; - atb::Status atbStatus = atb::CreateOperation(param, &operation); - if (atbStatus != atb::NO_ERROR) { - return atbStatus; - } - - node.operation.reset(operation); - if (node.operation == nullptr) { - LOG(ERROR) << "node.operation is null"; - return -1; - } - if (node.operation->GetInputNum() < 1) { - LOG(ERROR) << "Can not resize number which is smaller than 1"; - return -1; - } - - return atb::NO_ERROR; -} - -NpuRopeImpl::NpuRopeImpl(const ModelContext& context) : NpuBaseLayer(context) { - param_from_args(rope_param_, context.get_model_args()); - - at_weight_tensors_.resize(1); - atb_weight_tensors_.resize(1); - - auto options = context.get_tensor_options(); - dtype_ = c10::typeMetaToScalarType(options.dtype()); - at_weight_tensors_[0] = torch::zeros({1}).to(options); - - atb::Status status = init_node(rope_node_, rope_param_); - if (status != atb::NO_ERROR) { - LOG(ERROR) << "Failed to initialize node, status: " << status; - LOG(FATAL) << "NpuRopeImpl initialization failed with status: " - << std::to_string(status); - } -} - -void NpuRopeImpl::verify_loaded_weights(const std::string weight_str) const { - // No operation needed for rope layer -} - -void NpuRopeImpl::merge_loaded_weights() { - // No operation needed for rope layer -} - -void NpuRopeImpl::load_state_dict(const StateDict& state_dict) { - // No operation needed for rope layer -} - -std::vector NpuRopeImpl::forward(const torch::Tensor& q, - const torch::Tensor& k, - const torch::Tensor& cos_embedding, - const torch::Tensor& sin_embedding, - const torch::Tensor& seq_len, - int nodeId) { - atb::Status st; - build_node_variant_pack( - rope_node_, q, k, cos_embedding, sin_embedding, seq_len); - st = execute_node(rope_node_, nodeId); - LOG_IF(FATAL, st != 0) << model_name_ - << "infer shape fail, error code: " << st; - return at_out_tensors_; -} - -void NpuRopeImpl::build_node_variant_pack(atb_speed::Model::Node& node, - const torch::Tensor& q, - const torch::Tensor& k, - const torch::Tensor& cos_embedding, - const torch::Tensor& sin_embedding, - const torch::Tensor& seq_len) { - internal_q = atb_speed::Utils::AtTensor2Tensor(q); - internal_k = atb_speed::Utils::AtTensor2Tensor(k); - internal_cos_embedding = atb_speed::Utils::AtTensor2Tensor(cos_embedding); - internal_sin_embedding = atb_speed::Utils::AtTensor2Tensor(sin_embedding); - internal_seq_len = atb_speed::Utils::AtTensor2Tensor(seq_len); - - atb::SVector ins = {internal_q, - internal_k, - internal_cos_embedding, - internal_sin_embedding, - internal_seq_len}; - node.variantPack.inTensors = ins; - - atb::SVector 
inTensorDescs; - inTensorDescs.resize(node.operation->GetInputNum()); - inTensorDescs.at(0) = internal_q.desc; - inTensorDescs.at(1) = internal_k.desc; - inTensorDescs.at(2) = internal_cos_embedding.desc; - inTensorDescs.at(3) = internal_sin_embedding.desc; - inTensorDescs.at(4) = internal_seq_len.desc; - - atb::SVector outTensorDescs; - node.operation->InferShape(inTensorDescs, outTensorDescs); - - at_out_tensors_.resize(outTensorDescs.size()); - at::Tensor output_0 = - atb_speed::Utils::CreateAtTensorFromTensorDesc(outTensorDescs.at(0)); - at_out_tensors_.at(0) = output_0; - at::Tensor output_1 = - atb_speed::Utils::CreateAtTensorFromTensorDesc(outTensorDescs.at(1)); - at_out_tensors_.at(1) = output_1; - - node.variantPack.outTensors = {atb_speed::Utils::AtTensor2Tensor(output_0), - atb_speed::Utils::AtTensor2Tensor(output_1)}; -} - -} // namespace xllm::kernel diff --git a/xllm/core/kernels/npu/impl/npu_rope_impl.h b/xllm/core/kernels/npu/impl/npu_rope_impl.h deleted file mode 100644 index 1f3ee107..00000000 --- a/xllm/core/kernels/npu/impl/npu_rope_impl.h +++ /dev/null @@ -1,85 +0,0 @@ -/* Copyright 2025 The xLLM Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - https://github.com/jd-opensource/xllm/blob/main/LICENSE - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#pragma once -#ifdef TORCH_HIGHER_THAN_PTA6 -#include -#include -#else -#include -#include -#endif - -#include - -#include - -#include "atb/atb_infer.h" -#include "framework/kv_cache/kv_cache.h" -#include "framework/model/model_input_params.h" -#include "framework/state_dict/state_dict.h" -#include "layers/npu/npu_base_layer.h" -#include "nlohmann/json.hpp" -#include "pytorch/adapter/utils/utils.h" -#include "xllm_kernels/core/include/atb_speed/base/hosttensor_binder.h" -#include "xllm_kernels/core/include/atb_speed/base/model.h" -#include "xllm_kernels/core/include/atb_speed/log.h" -#include "xllm_kernels/core/include/atb_speed/utils/model_factory.h" - -namespace xllm::kernel { - -class NpuRopeImpl : public xllm::layer::NpuBaseLayer { - public: - explicit NpuRopeImpl(const ModelContext& context); - - ~NpuRopeImpl() {}; - - void load_state_dict(const StateDict& state_dict); - - void verify_loaded_weights(const std::string weight_str) const; - - void merge_loaded_weights(); - - std::vector forward(const torch::Tensor& q, - const torch::Tensor& k, - const torch::Tensor& cos_embedding, - const torch::Tensor& sin_embedding, - const torch::Tensor& seq_len, - int nodeId); - - private: - int64_t init_node(atb_speed::Model::Node& node, atb::infer::RopeParam& param); - void build_node_variant_pack(atb_speed::Model::Node& node, - const torch::Tensor& q, - const torch::Tensor& k, - const torch::Tensor& cos_embedding, - const torch::Tensor& sin_embedding, - const torch::Tensor& seq_len); - void param_from_args(atb::infer::RopeParam& param, const ModelArgs& args); - - std::vector at_out_tensors_; - atb::Tensor internal_q; - atb::Tensor internal_k; - atb::Tensor internal_cos_embedding; - atb::Tensor internal_sin_embedding; - atb::Tensor 
internal_seq_len; - - atb_speed::Model::Node rope_node_; - std::string model_name_; - atb::infer::RopeParam rope_param_; - atb::Tensor internal_tensors_; -}; - -} // namespace xllm::kernel diff --git a/xllm/core/kernels/npu/impl/npu_rope_impl_test.cpp b/xllm/core/kernels/npu/impl/npu_rope_impl_test.cpp deleted file mode 100644 index 26a78bef..00000000 --- a/xllm/core/kernels/npu/impl/npu_rope_impl_test.cpp +++ /dev/null @@ -1,233 +0,0 @@ -/* Copyright 2025 The xLLM Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - https://github.com/jd-opensource/xllm/blob/main/LICENSE - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include -#include - -#include "kernels/npu/rope.h" - -namespace xllm::kernel { - -class NpuRopeTest : public ::testing::Test { - protected: - NpuRopeTest() : parallel_args_(1, 1, nullptr) { - try { - torch::zeros({1}, torch::TensorOptions().device("npu:0")); - - tensor_options_ = - torch::TensorOptions().dtype(torch::kFloat16).device("npu:0"); - npu_available_ = true; - std::cout << "Using NPU device" << std::endl; - - } catch (...) { - tensor_options_ = - torch::TensorOptions().dtype(torch::kFloat16).device(torch::kCPU); - npu_available_ = false; - std::cout << "Using CPU device (NPU unavailable)" << std::endl; - } - } - - void SetUp() override { - torch::manual_seed(42); - - model_args_.hidden_size() = 4096; - model_args_.num_attention_heads() = 32; - model_args_.head_dim() = 128; - model_args_.max_position_embeddings() = 2048; - model_args_.dtype() = "float16"; - - quant_args_.torch_dtype() = "float16"; - - context_ = std::make_unique( - parallel_args_, model_args_, quant_args_, tensor_options_); - } - - void TearDown() override { - context_.reset(); - - if (npu_available_) { - try { - c10_npu::npuSynchronizeDevice(); - c10_npu::NPUCachingAllocator::emptyCache(); - std::this_thread::sleep_for(std::chrono::milliseconds(500)); - } catch (...) 
{ - // NPU cleanup failures are usually not critical in test teardown - } - } - } - - StateDict CreateStateDict() { - std::unordered_map tensor_map; - // RoPE layer doesn't have trainable weights, so empty state dict - return StateDict(tensor_map, ""); - } - - ModelArgs model_args_; - QuantArgs quant_args_; - ParallelArgs parallel_args_; - torch::TensorOptions tensor_options_; - std::unique_ptr context_; - bool npu_available_ = true; -}; - -// Test NpuRopeImpl construction -TEST_F(NpuRopeTest, ConstructorTest) { - if (!npu_available_) { - GTEST_SKIP() << "Skipping NPU test - NPU device not available"; - } - - ASSERT_NO_THROW({ - auto rope = std::make_shared(*context_); - EXPECT_NE(rope, nullptr); - }); -} - -// Test Rope wrapper construction -TEST_F(NpuRopeTest, RopeWrapperTest) { - if (!npu_available_) { - GTEST_SKIP() << "Skipping NPU test - NPU device not available"; - } - - ASSERT_NO_THROW({ auto rope = Rope(*context_); }); -} - -// Test state dict loading (RoPE doesn't have weights) -TEST_F(NpuRopeTest, LoadStateDictTest) { - if (!npu_available_) { - GTEST_SKIP() << "Skipping NPU test - NPU device not available"; - } - - auto rope = std::make_shared(*context_); - auto state_dict = CreateStateDict(); - - ASSERT_NO_THROW({ rope->load_state_dict(state_dict); }); -} - -// Test weight verification (should pass as RoPE has no weights) -TEST_F(NpuRopeTest, VerifyLoadedWeightsTest) { - if (!npu_available_) { - GTEST_SKIP() << "Skipping NPU test - NPU device not available"; - } - - auto rope = std::make_shared(*context_); - - ASSERT_NO_THROW({ rope->verify_loaded_weights("test_weight"); }); -} - -// Test merge loaded weights -TEST_F(NpuRopeTest, MergeLoadedWeightsTest) { - if (!npu_available_) { - GTEST_SKIP() << "Skipping NPU test - NPU device not available"; - } - - auto rope = std::make_shared(*context_); - - ASSERT_NO_THROW({ rope->merge_loaded_weights(); }); -} - -// Test forward pass with mock input tensors following constraint specifications -TEST_F(NpuRopeTest, ForwardPassBasicTest) { - if (!npu_available_) { - GTEST_SKIP() << "Skipping NPU test - NPU device not available"; - } - - auto rope = std::make_shared(*context_); - - int64_t batch_size = 2; - std::vector seq_lengths = {8, 12}; - int64_t max_seq_len = - *std::max_element(seq_lengths.begin(), seq_lengths.end()); - int64_t ntokens = std::accumulate( - seq_lengths.begin(), seq_lengths.end(), 0); // ntokens = sum(seqlen[i]) - - int64_t head_num_q = model_args_.num_attention_heads(); // headNumQ - int64_t head_num_k = - model_args_.num_attention_heads(); // headNumK (can be <= headNumQ) - int64_t head_size = model_args_.head_dim(); - - // Ensure 32-byte alignment for hiddenSizeQ and hiddenSizeK - int64_t hidden_size_q = - head_num_q * head_size; // hiddenSizeQ = head_size * headNumQ - int64_t hidden_size_k = - head_num_k * head_size; // hiddenSizeK = head_size * headNumK - - // Validate 32-byte alignment constraint - ASSERT_EQ(hidden_size_q % 32, 0) << "hiddenSizeQ must be 32-byte aligned"; - ASSERT_EQ(hidden_size_k % 32, 0) << "hiddenSizeK must be 32-byte aligned"; - - // Create tensors with constraint-compliant dimensions - // Input format: [ntokens, hiddenSizeQ/K] for 2D tensors - auto q = torch::randn({ntokens, hidden_size_q}, tensor_options_); - auto k = torch::randn({ntokens, hidden_size_k}, tensor_options_); - - // cos/sin embeddings: [ntokens, head_size] for standard mode - auto cos_embedding = torch::randn({ntokens, head_size}, tensor_options_); - auto sin_embedding = torch::randn({ntokens, head_size}, tensor_options_); - - 
auto seq_len_tensor = - torch::tensor(seq_lengths, tensor_options_.dtype(torch::kInt32)); - - try { - auto npu_stream = c10_npu::getCurrentNPUStream(0); - auto outputs = - rope->forward(q, k, cos_embedding, sin_embedding, seq_len_tensor, 0); - aclrtSynchronizeStream(npu_stream.stream()); - - EXPECT_GE(outputs.size(), - 2); // Should return at least q_rotated and k_rotated - if (outputs.size() >= 2) { - std::cout << "Output Q tensor shape: " << outputs[0].sizes() << std::endl; - std::cout << "Output K tensor shape: " << outputs[1].sizes() << std::endl; - EXPECT_EQ(outputs[0].sizes(), q.sizes()); - EXPECT_EQ(outputs[1].sizes(), k.sizes()); - } - } catch (const std::exception& e) { - GTEST_SKIP() << "Skipping forward pass test - requires NPU environment: " - << e.what(); - } -} - -} // namespace xllm::kernel - -int main(int argc, char** argv) { - struct rlimit core_limit; - core_limit.rlim_cur = 0; - core_limit.rlim_max = 0; - setrlimit(RLIMIT_CORE, &core_limit); - - FILE* null_stderr = freopen("/dev/null", "w", stderr); - if (null_stderr == nullptr) { - fclose(stderr); - } - - ::testing::InitGoogleTest(&argc, argv); - - bool npu_available = false; - try { - auto test_tensor = - torch::zeros({1}, torch::TensorOptions().device("npu:0")); - npu_available = true; - } catch (...) { - npu_available = false; - } - - if (!npu_available) { - std::cout << "NPU device not available, skipping all tests." << std::endl; - return 0; - } - - int result = RUN_ALL_TESTS(); - _exit(result); -} \ No newline at end of file diff --git a/xllm/core/kernels/npu/impl/npu_sample_model_test.cpp b/xllm/core/kernels/npu/impl/npu_sample_model_test.cpp deleted file mode 100644 index c8cee2d4..00000000 --- a/xllm/core/kernels/npu/impl/npu_sample_model_test.cpp +++ /dev/null @@ -1,904 +0,0 @@ -/* Copyright 2025 The xLLM Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - https://github.com/jd-opensource/xllm/blob/main/LICENSE - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include -#include - -#include "kernels/npu/linear.h" -#include "kernels/npu/rms_norm.h" -#include "kernels/npu/rope.h" -#include "kernels/npu/split.h" - -namespace xllm::kernel { - -class SampleModelTest : public ::testing::Test { - protected: - SampleModelTest() : parallel_args_(1, 1, nullptr) { - try { - torch::zeros({1}, torch::TensorOptions().device("npu:0")); - - tensor_options_ = - torch::TensorOptions().dtype(torch::kFloat16).device("npu:0"); - npu_available_ = true; - std::cout << "Using NPU device" << std::endl; - - } catch (...) 
{ - tensor_options_ = - torch::TensorOptions().dtype(torch::kFloat16).device(torch::kCPU); - npu_available_ = false; - std::cout << "Using CPU device (NPU unavailable)" << std::endl; - } - } - - void SetUp() override { - torch::manual_seed(42); - - model_args_.hidden_size() = 4096; - model_args_.intermediate_size() = 11008; - model_args_.rms_norm_eps() = 1e-6f; - model_args_.dtype() = "float16"; - - q_size_ = model_args_.hidden_size(); - kv_size_ = model_args_.hidden_size(); - qkv_size_ = q_size_ + 2 * kv_size_; // q + k + v - - quant_args_.torch_dtype() = "float16"; - - context_ = std::make_unique( - parallel_args_, model_args_, quant_args_, tensor_options_); - } - - void TearDown() override { - context_.reset(); - - if (npu_available_) { - try { - c10_npu::npuSynchronizeDevice(); - c10_npu::NPUCachingAllocator::emptyCache(); - std::this_thread::sleep_for(std::chrono::milliseconds(500)); - } catch (...) { - // NPU cleanup failures are usually not critical in test teardown - } - } - } - - StateDict CreateRmsNormStateDict(const torch::Tensor& weight_tensor) { - std::unordered_map tensor_map; - tensor_map["weight"] = weight_tensor; - return StateDict(tensor_map, ""); - } - - StateDict CreateLinearStateDict(const torch::Tensor& weight_tensor) { - std::unordered_map tensor_map; - tensor_map["weight"] = weight_tensor; - return StateDict(tensor_map, ""); - } - - StateDict CreateEmptyStateDict() { - std::unordered_map tensor_map; - return StateDict(tensor_map, ""); - } - - // Helper method to create cos/sin embeddings for RoPE - std::pair CreateRopeEmbeddings( - int64_t seq_len, - int64_t head_dim) { - auto cos_embedding = torch::cos( - torch::arange(0, seq_len, tensor_options_).unsqueeze(1) * - torch::arange(0, head_dim / 2, tensor_options_).unsqueeze(0) * 0.01); - auto sin_embedding = torch::sin( - torch::arange(0, seq_len, tensor_options_).unsqueeze(1) * - torch::arange(0, head_dim / 2, tensor_options_).unsqueeze(0) * 0.01); - return std::make_pair(cos_embedding, sin_embedding); - } - - ModelArgs model_args_; - QuantArgs quant_args_; - ParallelArgs parallel_args_; - torch::TensorOptions tensor_options_; - std::unique_ptr context_; - bool npu_available_ = true; - - // QKV dimensions - int64_t q_size_; - int64_t kv_size_; - int64_t qkv_size_; - - // Attention parameters - int64_t num_heads_ = 32; - int64_t num_kv_heads_ = 32; - int64_t head_dim_ = 128; - bool attn_output_gate_ = false; -}; - -// Test RMS norm + Linear layer construction -TEST_F(SampleModelTest, ConstructorTest) { - if (!npu_available_) { - GTEST_SKIP() << "Skipping NPU test - NPU device not available"; - } - - ASSERT_NO_THROW({ - auto rms_norm = std::make_shared(*context_); - auto linear = std::make_shared(*context_); - EXPECT_NE(rms_norm, nullptr); - EXPECT_NE(linear, nullptr); - }); -} - -// Test combined RMS norm + Linear layer wrapper construction -TEST_F(SampleModelTest, WrapperConstructionTest) { - if (!npu_available_) { - GTEST_SKIP() << "Skipping NPU test - NPU device not available"; - } - - ASSERT_NO_THROW({ - auto rms_norm = RmsNorm(*context_); - auto linear = Linear(*context_); - }); -} - -// Test state dict loading for both layers -TEST_F(SampleModelTest, LoadStateDictTest) { - if (!npu_available_) { - GTEST_SKIP() << "Skipping NPU test - NPU device not available"; - } - - auto rms_norm = std::make_shared(*context_); - auto linear = std::make_shared(*context_); - - auto rms_norm_weight = - torch::randn({model_args_.hidden_size()}, tensor_options_); - auto rms_norm_state_dict = CreateRmsNormStateDict(rms_norm_weight); 
- - auto linear_weight = - torch::randn({model_args_.intermediate_size(), model_args_.hidden_size()}, - tensor_options_); - auto linear_state_dict = CreateLinearStateDict(linear_weight); - - ASSERT_NO_THROW({ - rms_norm->load_state_dict(rms_norm_state_dict); - linear->load_state_dict(linear_state_dict); - }); -} - -// Test weight verification for both layers -TEST_F(SampleModelTest, VerifyLoadedWeightsTest) { - if (!npu_available_) { - GTEST_SKIP() << "Skipping NPU test - NPU device not available"; - } - - auto rms_norm = std::make_shared(*context_); - auto linear = std::make_shared(*context_); - - auto rms_norm_weight = - torch::randn({model_args_.hidden_size()}, tensor_options_); - auto rms_norm_state_dict = CreateRmsNormStateDict(rms_norm_weight); - rms_norm->load_state_dict(rms_norm_state_dict); - - auto linear_weight = - torch::randn({model_args_.intermediate_size(), model_args_.hidden_size()}, - tensor_options_); - auto linear_state_dict = CreateLinearStateDict(linear_weight); - linear->load_state_dict(linear_state_dict); - - ASSERT_NO_THROW({ - rms_norm->verify_loaded_weights("rms_norm_weight"); - linear->verify_loaded_weights("linear_weight"); - }); -} - -// Test merge loaded weights for both layers -TEST_F(SampleModelTest, MergeLoadedWeightsTest) { - if (!npu_available_) { - GTEST_SKIP() << "Skipping NPU test - NPU device not available"; - } - - auto rms_norm = std::make_shared(*context_); - auto linear = std::make_shared(*context_); - - auto rms_norm_weight = - torch::randn({model_args_.hidden_size()}, tensor_options_); - auto rms_norm_state_dict = CreateRmsNormStateDict(rms_norm_weight); - rms_norm->load_state_dict(rms_norm_state_dict); - - auto linear_weight = - torch::randn({model_args_.intermediate_size(), model_args_.hidden_size()}, - tensor_options_); - auto linear_state_dict = CreateLinearStateDict(linear_weight); - linear->load_state_dict(linear_state_dict); - - ASSERT_NO_THROW({ - rms_norm->merge_loaded_weights(); - linear->merge_loaded_weights(); - }); -} - -// Test combined forward pass: RMS norm -> QKV projection -> Split (q, k, v) -TEST_F(SampleModelTest, CombinedForwardPassTest) { - if (!npu_available_) { - GTEST_SKIP() << "Skipping NPU test - NPU device not available"; - } - - auto rms_norm = RmsNorm(*context_); - auto qkv_proj = Linear(*context_); - auto split_layer = Split(*context_); - - auto rms_norm_weight = - torch::randn({model_args_.hidden_size()}, tensor_options_); - auto rms_norm_state_dict = CreateRmsNormStateDict(rms_norm_weight); - rms_norm->load_state_dict(rms_norm_state_dict); - rms_norm->merge_loaded_weights(); - - // Setup QKV projection weights: output size = q_size + k_size + v_size - auto qkv_weight = - torch::randn({qkv_size_, model_args_.hidden_size()}, tensor_options_); - auto qkv_state_dict = CreateLinearStateDict(qkv_weight); - qkv_proj->load_state_dict(qkv_state_dict); - qkv_proj->merge_loaded_weights(); - - // Setup split layer (no weights needed) - auto split_state_dict = CreateEmptyStateDict(); - split_layer->load_state_dict(split_state_dict); - split_layer->merge_loaded_weights(); - - // Input tensor with shape [batch_size, seq_len, hidden_size] - auto input = - torch::randn({1, 10, model_args_.hidden_size()}, tensor_options_); - - try { - std::cout << "Input tensor shape: " << input.sizes() << std::endl; - - auto npu_stream = c10_npu::getCurrentNPUStream(0); - - // Step 1: hidden_states = self.norm(hidden_states) - auto normalized_output = rms_norm(input, 0); - std::cout << "RMS norm output shape: " << normalized_output.sizes() - << 
std::endl; - - // Step 2: qkv, _ = self.qkv_proj(hidden_states) - auto qkv_output = qkv_proj(normalized_output, 0); - std::cout << "QKV projection output shape: " << qkv_output.sizes() - << std::endl; - - // Step 3: q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], - // dim=-1) - auto split_outputs = split_layer(qkv_output, 0); - - EXPECT_EQ(split_outputs.size(), 3) - << "Split should produce 3 tensors (q, k, v)"; - - std::cout << "Split outputs:" << std::endl; - for (size_t i = 0; i < split_outputs.size(); ++i) { - std::cout << " Tensor " << i << " shape: " << split_outputs[i].sizes() - << std::endl; - } - - EXPECT_EQ(normalized_output.sizes(), input.sizes()); - - // Expected QKV output shape: [batch_size, seq_len, qkv_size] - std::vector expected_qkv_shape = {1, 10, qkv_size_}; - EXPECT_EQ(qkv_output.sizes(), expected_qkv_shape); - - // Expected split output shapes - // q: [batch_size, seq_len, q_size] - // k: [batch_size, seq_len, kv_size] - // v: [batch_size, seq_len, kv_size] - std::vector expected_q_shape = {1, 10, q_size_}; - std::vector expected_kv_shape = {1, 10, kv_size_}; - - if (split_outputs.size() >= 3) { - EXPECT_EQ(split_outputs[0].sizes(), expected_q_shape) - << "Q tensor shape mismatch"; - EXPECT_EQ(split_outputs[1].sizes(), expected_kv_shape) - << "K tensor shape mismatch"; - EXPECT_EQ(split_outputs[2].sizes(), expected_kv_shape) - << "V tensor shape mismatch"; - } - - std::cout << "Combined forward pass test (norm -> qkv_proj -> split) " - "completed successfully!" - << std::endl; - aclrtSynchronizeStream(npu_stream.stream()); - } catch (const std::exception& e) { - GTEST_SKIP() - << "Skipping combined forward pass test - requires NPU environment: " - << e.what(); - } -} - -// Test combined forward pass with different batch sizes: norm -> qkv_proj -> -// split -TEST_F(SampleModelTest, CombinedForwardPassBatchTest) { - if (!npu_available_) { - GTEST_SKIP() << "Skipping NPU test - NPU device not available"; - } - - auto rms_norm = RmsNorm(*context_); - auto qkv_proj = Linear(*context_); - auto split_layer = Split(*context_); - - auto rms_norm_weight = - torch::randn({model_args_.hidden_size()}, tensor_options_); - auto rms_norm_state_dict = CreateRmsNormStateDict(rms_norm_weight); - rms_norm->load_state_dict(rms_norm_state_dict); - rms_norm->merge_loaded_weights(); - - auto qkv_weight = - torch::randn({qkv_size_, model_args_.hidden_size()}, tensor_options_); - auto qkv_state_dict = CreateLinearStateDict(qkv_weight); - qkv_proj->load_state_dict(qkv_state_dict); - qkv_proj->merge_loaded_weights(); - - auto split_state_dict = CreateEmptyStateDict(); - split_layer->load_state_dict(split_state_dict); - split_layer->merge_loaded_weights(); - - std::vector> batch_shapes = { - {1, 5, model_args_.hidden_size()}, - {2, 10, model_args_.hidden_size()}, - {4, 20, model_args_.hidden_size()}}; - - for (const auto& shape : batch_shapes) { - auto input = torch::randn(shape, tensor_options_); - - try { - auto npu_stream = c10_npu::getCurrentNPUStream(0); - - auto normalized_output = rms_norm(input, 0); - auto qkv_output = qkv_proj(normalized_output, 0); - auto split_outputs = split_layer(qkv_output, 0); - - EXPECT_EQ(normalized_output.size(0), shape[0]); - EXPECT_EQ(normalized_output.size(1), shape[1]); - EXPECT_EQ(normalized_output.size(2), shape[2]); - - EXPECT_EQ(qkv_output.size(0), shape[0]); - EXPECT_EQ(qkv_output.size(1), shape[1]); - EXPECT_EQ(qkv_output.size(2), qkv_size_); - - EXPECT_EQ(split_outputs.size(), 3); - if (split_outputs.size() >= 3) { - // Q tensor - 
EXPECT_EQ(split_outputs[0].size(0), shape[0]); - EXPECT_EQ(split_outputs[0].size(1), shape[1]); - EXPECT_EQ(split_outputs[0].size(2), q_size_); - - // K tensor - EXPECT_EQ(split_outputs[1].size(0), shape[0]); - EXPECT_EQ(split_outputs[1].size(1), shape[1]); - EXPECT_EQ(split_outputs[1].size(2), kv_size_); - - // V tensor - EXPECT_EQ(split_outputs[2].size(0), shape[0]); - EXPECT_EQ(split_outputs[2].size(1), shape[1]); - EXPECT_EQ(split_outputs[2].size(2), kv_size_); - } - aclrtSynchronizeStream(npu_stream.stream()); - } catch (const std::exception& e) { - GTEST_SKIP() << "Skipping batch processing test for shape [" << shape[0] - << ", " << shape[1] << ", " << shape[2] - << "] - requires NPU environment: " << e.what(); - break; - } - } -} - -// Test tensor data flow and numerical properties: norm -> qkv_proj -> split -TEST_F(SampleModelTest, DataFlowTest) { - if (!npu_available_) { - GTEST_SKIP() << "Skipping NPU test - NPU device not available"; - } - - auto rms_norm = RmsNorm(*context_); - auto qkv_proj = Linear(*context_); - auto split_layer = Split(*context_); - - auto rms_norm_weight = - torch::ones({model_args_.hidden_size()}, tensor_options_); - auto rms_norm_state_dict = CreateRmsNormStateDict(rms_norm_weight); - rms_norm->load_state_dict(rms_norm_state_dict); - rms_norm->merge_loaded_weights(); - - auto qkv_weight = - torch::ones({qkv_size_, model_args_.hidden_size()}, tensor_options_) * - 0.1f; - auto qkv_state_dict = CreateLinearStateDict(qkv_weight); - qkv_proj->load_state_dict(qkv_state_dict); - qkv_proj->merge_loaded_weights(); - - auto split_state_dict = CreateEmptyStateDict(); - split_layer->load_state_dict(split_state_dict); - split_layer->merge_loaded_weights(); - - auto input = torch::ones({1, 1, model_args_.hidden_size()}, tensor_options_); - - try { - auto npu_stream = c10_npu::getCurrentNPUStream(0); - - auto normalized_output = rms_norm(input, 0); - auto qkv_output = qkv_proj(normalized_output, 0); - auto split_outputs = split_layer(qkv_output, 0); - - EXPECT_FALSE(torch::isnan(normalized_output).any().item()) - << "NaN detected in normalized output"; - EXPECT_FALSE(torch::isinf(normalized_output).any().item()) - << "Inf detected in normalized output"; - - EXPECT_FALSE(torch::isnan(qkv_output).any().item()) - << "NaN detected in QKV projection output"; - EXPECT_FALSE(torch::isinf(qkv_output).any().item()) - << "Inf detected in QKV projection output"; - - EXPECT_EQ(split_outputs.size(), 3) << "Expected 3 split outputs (q, k, v)"; - - for (size_t i = 0; i < split_outputs.size(); ++i) { - EXPECT_FALSE(torch::isnan(split_outputs[i]).any().item()) - << "NaN detected in split output " << i; - EXPECT_FALSE(torch::isinf(split_outputs[i]).any().item()) - << "Inf detected in split output " << i; - } - - std::cout << "Data flow test completed - no NaN or Inf values detected in " - "pipeline!" 
- << std::endl; - aclrtSynchronizeStream(npu_stream.stream()); - } catch (const std::exception& e) { - GTEST_SKIP() << "Skipping data flow test - requires NPU environment: " - << e.what(); - } -} - -// Test QKV splitting with attention output gate functionality -TEST_F(SampleModelTest, QKVSplitWithAttentionGateTest) { - if (!npu_available_) { - GTEST_SKIP() << "Skipping NPU test - NPU device not available"; - } - - int64_t q_gate_size = q_size_ * 2; // q + gate - int64_t qkv_gate_size = q_gate_size + 2 * kv_size_; // (q + gate) + k + v - - auto rms_norm = RmsNorm(*context_); - auto qkv_proj = Linear(*context_); - auto split_layer = Split(*context_, 2, 3, {q_gate_size, kv_size_, kv_size_}); - - // Setup for attention output gate mode - attn_output_gate_ = true; - - auto rms_norm_weight = - torch::randn({model_args_.hidden_size()}, tensor_options_); - auto rms_norm_state_dict = CreateRmsNormStateDict(rms_norm_weight); - rms_norm->load_state_dict(rms_norm_state_dict); - rms_norm->merge_loaded_weights(); - - // QKV projection with gate: output size = (q_size * 2) + k_size + v_size - auto qkv_weight = - torch::randn({qkv_gate_size, model_args_.hidden_size()}, tensor_options_); - auto qkv_state_dict = CreateLinearStateDict(qkv_weight); - qkv_proj->load_state_dict(qkv_state_dict); - qkv_proj->merge_loaded_weights(); - - auto split_state_dict = CreateEmptyStateDict(); - split_layer->load_state_dict(split_state_dict); - split_layer->merge_loaded_weights(); - - auto input = - torch::randn({1, 10, model_args_.hidden_size()}, tensor_options_); - - try { - auto npu_stream = c10_npu::getCurrentNPUStream(0); - - auto normalized_output = rms_norm(input, 0); - - auto qkv_output = qkv_proj(normalized_output, 0); - std::cout << "QKV with gate output shape: " << qkv_output.sizes() - << std::endl; - - auto split_outputs = split_layer(qkv_output, 0); - EXPECT_EQ(split_outputs.size(), 3) - << "Split should produce 3 tensors (q_gate, k, v)"; - - if (split_outputs.size() >= 3) { - auto q_gate = split_outputs[0]; - auto k = split_outputs[1]; - auto v = split_outputs[2]; - - std::cout << "Q+Gate tensor shape: " << q_gate.sizes() << std::endl; - std::cout << "K tensor shape: " << k.sizes() << std::endl; - std::cout << "V tensor shape: " << v.sizes() << std::endl; - - std::vector expected_q_gate_shape = {1, 10, q_gate_size}; - std::vector expected_kv_shape = {1, 10, kv_size_}; - - EXPECT_EQ(q_gate.sizes(), expected_q_gate_shape) - << "Q+Gate tensor shape mismatch"; - EXPECT_EQ(k.sizes(), expected_kv_shape) << "K tensor shape mismatch"; - EXPECT_EQ(v.sizes(), expected_kv_shape) << "V tensor shape mismatch"; - - // q_gate = q_gate.view(*orig_shape, self.num_heads, -1) - auto orig_shape = q_gate.sizes(); - auto q_gate_reshaped = - q_gate.view({orig_shape[0], orig_shape[1], num_heads_, -1}); - - // q, gate = torch.chunk(q_gate, 2, dim=-1) - auto q_gate_chunks = torch::chunk(q_gate_reshaped, 2, -1); - EXPECT_EQ(q_gate_chunks.size(), 2) << "Should split q_gate into 2 chunks"; - - if (q_gate_chunks.size() >= 2) { - auto q = q_gate_chunks[0]; - auto gate = q_gate_chunks[1]; - - q = q.reshape({orig_shape[0], orig_shape[1], -1}); - gate = gate.reshape({orig_shape[0], orig_shape[1], -1}); - - std::cout << "Final Q shape: " << q.sizes() << std::endl; - std::cout << "Final Gate shape: " << gate.sizes() << std::endl; - - std::vector expected_final_q_shape = {1, 10, q_size_}; - EXPECT_EQ(q.sizes(), expected_final_q_shape) - << "Final Q tensor shape mismatch"; - EXPECT_EQ(gate.sizes(), expected_final_q_shape) - << "Gate tensor shape 
mismatch"; - } - } - - std::cout << "QKV split with attention gate test completed successfully!" - << std::endl; - aclrtSynchronizeStream(npu_stream.stream()); - } catch (const std::exception& e) { - GTEST_SKIP() << "Skipping attention gate test - requires NPU environment: " - << e.what(); - } -} - -// Test standard QKV splitting (without attention gate) -TEST_F(SampleModelTest, StandardQKVSplitTest) { - if (!npu_available_) { - GTEST_SKIP() << "Skipping NPU test - NPU device not available"; - } - - auto rms_norm = RmsNorm(*context_); - auto qkv_proj = Linear(*context_); - auto split_layer = Split(*context_); - - attn_output_gate_ = false; - - auto rms_norm_weight = - torch::randn({model_args_.hidden_size()}, tensor_options_); - auto rms_norm_state_dict = CreateRmsNormStateDict(rms_norm_weight); - rms_norm->load_state_dict(rms_norm_state_dict); - rms_norm->merge_loaded_weights(); - - // Standard QKV projection: output size = q_size + k_size + v_size - auto qkv_weight = - torch::randn({qkv_size_, model_args_.hidden_size()}, tensor_options_); - auto qkv_state_dict = CreateLinearStateDict(qkv_weight); - qkv_proj->load_state_dict(qkv_state_dict); - qkv_proj->merge_loaded_weights(); - - auto split_state_dict = CreateEmptyStateDict(); - split_layer->load_state_dict(split_state_dict); - split_layer->merge_loaded_weights(); - - auto input = - torch::randn({1, 10, model_args_.hidden_size()}, tensor_options_); - - try { - auto npu_stream = c10_npu::getCurrentNPUStream(0); - - auto normalized_output = rms_norm(input, 0); - - auto qkv_output = qkv_proj(normalized_output, 0); - std::cout << "Standard QKV output shape: " << qkv_output.sizes() - << std::endl; - - auto split_outputs = split_layer(qkv_output, 0); - EXPECT_EQ(split_outputs.size(), 3) - << "Split should produce 3 tensors (q, k, v)"; - - if (split_outputs.size() >= 3) { - auto q = split_outputs[0]; - auto k = split_outputs[1]; - auto v = split_outputs[2]; - - std::cout << "Q tensor shape: " << q.sizes() << std::endl; - std::cout << "K tensor shape: " << k.sizes() << std::endl; - std::cout << "V tensor shape: " << v.sizes() << std::endl; - - std::vector expected_q_shape = {1, 10, q_size_}; - std::vector expected_kv_shape = {1, 10, kv_size_}; - - EXPECT_EQ(q.sizes(), expected_q_shape) << "Q tensor shape mismatch"; - EXPECT_EQ(k.sizes(), expected_kv_shape) << "K tensor shape mismatch"; - EXPECT_EQ(v.sizes(), expected_kv_shape) << "V tensor shape mismatch"; - } - - std::cout << "Standard QKV split test completed successfully!" 
<< std::endl; - aclrtSynchronizeStream(npu_stream.stream()); - } catch (const std::exception& e) { - GTEST_SKIP() << "Skipping standard QKV test - requires NPU environment: " - << e.what(); - } -} - -// Test Q and K normalization functionality -TEST_F(SampleModelTest, QKNormalizationTest) { - if (!npu_available_) { - GTEST_SKIP() << "Skipping NPU test - NPU device not available"; - } - - auto rms_norm = RmsNorm(*context_); - auto qkv_proj = Linear(*context_); - auto split_layer = Split(*context_); - auto q_norm = RmsNorm(*context_); - auto k_norm = RmsNorm(*context_); - - auto rms_norm_weight = - torch::randn({model_args_.hidden_size()}, tensor_options_); - auto rms_norm_state_dict = CreateRmsNormStateDict(rms_norm_weight); - rms_norm->load_state_dict(rms_norm_state_dict); - rms_norm->merge_loaded_weights(); - - auto qkv_weight = - torch::randn({qkv_size_, model_args_.hidden_size()}, tensor_options_); - auto qkv_state_dict = CreateLinearStateDict(qkv_weight); - qkv_proj->load_state_dict(qkv_state_dict); - qkv_proj->merge_loaded_weights(); - - auto split_state_dict = CreateEmptyStateDict(); - split_layer->load_state_dict(split_state_dict); - split_layer->merge_loaded_weights(); - - auto q_norm_weight = torch::randn({head_dim_}, tensor_options_); - auto q_norm_state_dict = CreateRmsNormStateDict(q_norm_weight); - q_norm->load_state_dict(q_norm_state_dict); - q_norm->merge_loaded_weights(); - - auto k_norm_weight = torch::randn({head_dim_}, tensor_options_); - auto k_norm_state_dict = CreateRmsNormStateDict(k_norm_weight); - k_norm->load_state_dict(k_norm_state_dict); - k_norm->merge_loaded_weights(); - - auto input = - torch::randn({1, 10, model_args_.hidden_size()}, tensor_options_); - - try { - auto npu_stream = c10_npu::getCurrentNPUStream(0); - - // Forward pass: norm -> qkv_proj -> split -> q_norm/k_norm - auto normalized_output = rms_norm(input, 0); - auto qkv_output = qkv_proj(normalized_output, 0); - auto split_outputs = split_layer(qkv_output, 0); - - EXPECT_EQ(split_outputs.size(), 3) << "Expected 3 split outputs"; - - if (split_outputs.size() >= 3) { - auto q = split_outputs[0]; - auto k = split_outputs[1]; - auto v = split_outputs[2]; - - // Reshape Q and K for normalization: [batch, seq, num_heads, head_dim] - auto q_reshaped = q.view({-1, num_heads_, head_dim_}); - auto k_reshaped = k.view({-1, num_kv_heads_, head_dim_}); - - std::cout << "Q reshaped for norm: " << q_reshaped.sizes() << std::endl; - std::cout << "K reshaped for norm: " << k_reshaped.sizes() << std::endl; - - auto q_normalized = q_norm(q_reshaped, 0); - auto k_normalized = k_norm(k_reshaped, 0); - - q_normalized = q_normalized.view({1, -1, num_heads_ * head_dim_}); - k_normalized = k_normalized.view({1, -1, num_kv_heads_ * head_dim_}); - - std::cout << "Q after norm: " << q_normalized.sizes() << std::endl; - std::cout << "K after norm: " << k_normalized.sizes() << std::endl; - - EXPECT_FALSE(torch::isnan(q_normalized).any().item()) - << "NaN detected in normalized Q"; - EXPECT_FALSE(torch::isinf(q_normalized).any().item()) - << "Inf detected in normalized Q"; - EXPECT_FALSE(torch::isnan(k_normalized).any().item()) - << "NaN detected in normalized K"; - EXPECT_FALSE(torch::isinf(k_normalized).any().item()) - << "Inf detected in normalized K"; - - EXPECT_EQ(q_normalized.sizes(), q.sizes()) - << "Q shape changed after norm"; - EXPECT_EQ(k_normalized.sizes(), k.sizes()) - << "K shape changed after norm"; - } - - std::cout << "Q and K normalization test completed successfully!" 
- << std::endl; - aclrtSynchronizeStream(npu_stream.stream()); - } catch (const std::exception& e) { - GTEST_SKIP() << "Skipping Q/K norm test - requires NPU environment: " - << e.what(); - } -} - -// Comprehensive test: norm -> qkv_proj -> split -> q_norm/k_norm -> rope -TEST_F(SampleModelTest, CompleteAttentionPipelineTest) { - if (!npu_available_) { - GTEST_SKIP() << "Skipping NPU test - NPU device not available"; - } - - auto rms_norm = RmsNorm(*context_); - auto qkv_proj = Linear(*context_); - auto split_layer = Split(*context_); - auto q_norm = RmsNorm(*context_); - auto k_norm = RmsNorm(*context_); - auto rope_layer = Rope(*context_); - - auto rms_norm_weight = - torch::randn({model_args_.hidden_size()}, tensor_options_); - auto rms_norm_state_dict = CreateRmsNormStateDict(rms_norm_weight); - rms_norm->load_state_dict(rms_norm_state_dict); - rms_norm->merge_loaded_weights(); - - auto qkv_weight = - torch::randn({qkv_size_, model_args_.hidden_size()}, tensor_options_); - auto qkv_state_dict = CreateLinearStateDict(qkv_weight); - qkv_proj->load_state_dict(qkv_state_dict); - qkv_proj->merge_loaded_weights(); - - auto split_state_dict = CreateEmptyStateDict(); - split_layer->load_state_dict(split_state_dict); - split_layer->merge_loaded_weights(); - - auto q_norm_weight = torch::randn({head_dim_}, tensor_options_); - auto q_norm_state_dict = CreateRmsNormStateDict(q_norm_weight); - q_norm->load_state_dict(q_norm_state_dict); - q_norm->merge_loaded_weights(); - - auto k_norm_weight = torch::randn({head_dim_}, tensor_options_); - auto k_norm_state_dict = CreateRmsNormStateDict(k_norm_weight); - k_norm->load_state_dict(k_norm_state_dict); - k_norm->merge_loaded_weights(); - - auto rope_state_dict = CreateEmptyStateDict(); - rope_layer->load_state_dict(rope_state_dict); - rope_layer->merge_loaded_weights(); - - std::vector> test_shapes = { - {1, 5, model_args_.hidden_size()}, - {2, 10, model_args_.hidden_size()}, - {1, 20, model_args_.hidden_size()}}; - - for (const auto& shape : test_shapes) { - auto input = torch::randn(shape, tensor_options_); - int64_t seq_len = shape[1]; - - try { - auto npu_stream = c10_npu::getCurrentNPUStream(0); - - std::cout << "\nTesting complete pipeline with input shape: " << shape[0] - << "x" << shape[1] << "x" << shape[2] << std::endl; - - auto normalized_output = rms_norm(input, 0); - - auto qkv_output = qkv_proj(normalized_output, 0); - - auto split_outputs = split_layer(qkv_output, 0); - EXPECT_EQ(split_outputs.size(), 3) << "Expected 3 split outputs"; - - if (split_outputs.size() >= 3) { - auto q = split_outputs[0]; - auto k = split_outputs[1]; - auto v = split_outputs[2]; - - auto q_reshaped = q.view({-1, num_heads_, head_dim_}); - auto k_reshaped = k.view({-1, num_kv_heads_, head_dim_}); - - auto q_normalized = q_norm(q_reshaped, 0); - auto k_normalized = k_norm(k_reshaped, 0); - - q_normalized = q_normalized.view({-1, num_heads_ * head_dim_}); - k_normalized = k_normalized.view({-1, num_kv_heads_ * head_dim_}); - - auto rope_embeddings = CreateRopeEmbeddings(seq_len, head_dim_); - auto cos_embedding = rope_embeddings.first; - auto sin_embedding = rope_embeddings.second; - auto seq_len_tensor = - torch::tensor({seq_len}, tensor_options_.dtype(torch::kInt32)); - - auto rope_outputs = rope_layer->forward(q_normalized, - k_normalized, - cos_embedding, - sin_embedding, - seq_len_tensor, - 0); - - EXPECT_EQ(rope_outputs.size(), 2) << "Expected 2 RoPE outputs"; - - if (rope_outputs.size() >= 2) { - auto q_final = rope_outputs[0]; - auto k_final = 
rope_outputs[1]; - - std::cout << "Final Q shape: " << q_final.sizes() << std::endl; - std::cout << "Final K shape: " << k_final.sizes() << std::endl; - std::cout << "V shape: " << v.sizes() << std::endl; - - // Verify final shapes - // EXPECT_EQ(q_final.size(0), shape[0]) << "Batch size mismatch"; - // EXPECT_EQ(q_final.size(1), shape[1]) << "Sequence length mismatch"; - // EXPECT_EQ(k_final.size(0), shape[0]) << "Batch size mismatch"; - // EXPECT_EQ(k_final.size(1), shape[1]) << "Sequence length mismatch"; - - EXPECT_EQ(q_final.sizes(), q_normalized.sizes()); - EXPECT_EQ(k_final.sizes(), k_normalized.sizes()); - - EXPECT_FALSE(torch::isnan(q_final).any().item()) - << "NaN detected in final Q"; - EXPECT_FALSE(torch::isinf(q_final).any().item()) - << "Inf detected in final Q"; - EXPECT_FALSE(torch::isnan(k_final).any().item()) - << "NaN detected in final K"; - EXPECT_FALSE(torch::isinf(k_final).any().item()) - << "Inf detected in final K"; - EXPECT_FALSE(torch::isnan(v).any().item()) - << "NaN detected in V"; - EXPECT_FALSE(torch::isinf(v).any().item()) - << "Inf detected in V"; - - std::cout << "Complete pipeline test passed for shape [" << shape[0] - << ", " << shape[1] << ", " << shape[2] << "]" << std::endl; - } - } - - aclrtSynchronizeStream(npu_stream.stream()); - } catch (const std::exception& e) { - GTEST_SKIP() << "Skipping complete pipeline test for shape [" << shape[0] - << ", " << shape[1] << ", " << shape[2] - << "] - requires NPU environment: " << e.what(); - break; - } - } - - std::cout << "\nComplete attention pipeline test completed successfully!" - << std::endl; -} - -} // namespace xllm::kernel - -int main(int argc, char** argv) { - struct rlimit core_limit; - core_limit.rlim_cur = 0; - core_limit.rlim_max = 0; - setrlimit(RLIMIT_CORE, &core_limit); - - FILE* null_stderr = freopen("/dev/null", "w", stderr); - if (null_stderr == nullptr) { - fclose(stderr); - } - - ::testing::InitGoogleTest(&argc, argv); - - bool npu_available = false; - try { - auto test_tensor = - torch::zeros({1}, torch::TensorOptions().device("npu:0")); - npu_available = true; - } catch (...) { - npu_available = false; - } - - if (!npu_available) { - std::cout << "NPU device not available, skipping all tests." << std::endl; - return 0; - } - - int result = RUN_ALL_TESTS(); - _exit(result); -} \ No newline at end of file diff --git a/xllm/core/kernels/npu/impl/npu_split_impl.cpp b/xllm/core/kernels/npu/impl/npu_split_impl.cpp deleted file mode 100644 index a1346ec2..00000000 --- a/xllm/core/kernels/npu/impl/npu_split_impl.cpp +++ /dev/null @@ -1,135 +0,0 @@ -/* Copyright 2025 The xLLM Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - https://github.com/jd-opensource/xllm/blob/main/LICENSE - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-==============================================================================*/ - -#include "npu_split_impl.h" - -#include - -namespace xllm::kernel { - -void NpuSplitImpl::param_from_args(atb::infer::SplitParam& param, - const ModelArgs& args, - int32_t splitDim, - int32_t splitNum, - atb::SVector splitSizes) { - param.splitDim = splitDim; - param.splitNum = splitNum; - param.splitSizes = splitSizes; -} - -int64_t NpuSplitImpl::init_node(atb_speed::Model::Node& node, - atb::infer::SplitParam& param) { - name_ = "split"; - model_name_ = "llm"; - run_task_func_ = std::bind(&NpuSplitImpl::run_task, - this, - std::placeholders::_1, - std::placeholders::_2); - - atb::Operation* operation = nullptr; - atb::Status atbStatus = atb::CreateOperation(param, &operation); - if (atbStatus != atb::NO_ERROR) { - return atbStatus; - } - - node.operation.reset(operation); - if (node.operation == nullptr) { - LOG(ERROR) << "node.operation is null"; - return -1; - } - if (node.operation->GetInputNum() < 1) { - LOG(ERROR) << "Can not resize number which is smaller than 1"; - return -1; - } - - return atb::NO_ERROR; -} - -NpuSplitImpl::NpuSplitImpl(const ModelContext& context, - int32_t splitDim, - int32_t splitNum, - atb::SVector splitSizes) - : NpuBaseLayer(context) { - param_from_args( - split_param_, context.get_model_args(), splitDim, splitNum, splitSizes); - - at_weight_tensors_.resize(1); - atb_weight_tensors_.resize(1); - at_out_tensors_.resize(3); - - auto options = context.get_tensor_options(); - dtype_ = c10::typeMetaToScalarType(options.dtype()); - at_weight_tensors_[0] = torch::zeros({1}).to(options); - - atb::Status status = init_node(split_node_, split_param_); - if (status != atb::NO_ERROR) { - LOG(ERROR) << "Failed to initialize node, status: " << status; - LOG(FATAL) << "NpuSplitImpl initialization failed with status: " - << std::to_string(status); - } -} - -void NpuSplitImpl::verify_loaded_weights(const std::string weight_str) const { - // No operation needed for split layer -} - -void NpuSplitImpl::merge_loaded_weights() { - // No operation needed for split layer -} - -void NpuSplitImpl::load_state_dict(const StateDict& state_dict) { - // No operation needed for split layer -} - -std::vector NpuSplitImpl::forward(const torch::Tensor& input, - int nodeId) { - atb::Status st; - build_node_variant_pack(split_node_, input); - st = execute_node(split_node_, nodeId); - LOG_IF(FATAL, st != 0) << model_name_ - << "infer shape fail, error code: " << st; - return at_out_tensors_; -} - -void NpuSplitImpl::build_node_variant_pack(atb_speed::Model::Node& node, - const torch::Tensor& input) { - internal_input = atb_speed::Utils::AtTensor2Tensor(input); - - atb::SVector ins = {internal_input}; - node.variantPack.inTensors = ins; - - atb::SVector inTensorDescs; - inTensorDescs.resize(node.operation->GetInputNum()); - inTensorDescs.at(0) = internal_input.desc; - - atb::SVector outTensorDescs; - node.operation->InferShape(inTensorDescs, outTensorDescs); - - at::Tensor output_0 = - atb_speed::Utils::CreateAtTensorFromTensorDesc(outTensorDescs.at(0)); - at_out_tensors_.at(0) = output_0; - at::Tensor output_1 = - atb_speed::Utils::CreateAtTensorFromTensorDesc(outTensorDescs.at(1)); - at_out_tensors_.at(1) = output_1; - at::Tensor output_2 = - atb_speed::Utils::CreateAtTensorFromTensorDesc(outTensorDescs.at(2)); - at_out_tensors_.at(2) = output_2; - - node.variantPack.outTensors = {atb_speed::Utils::AtTensor2Tensor(output_0), - atb_speed::Utils::AtTensor2Tensor(output_1), - 
atb_speed::Utils::AtTensor2Tensor(output_2)}; -} - -} // namespace xllm::kernel diff --git a/xllm/core/kernels/npu/impl/npu_split_impl.h b/xllm/core/kernels/npu/impl/npu_split_impl.h deleted file mode 100644 index c8f85ae8..00000000 --- a/xllm/core/kernels/npu/impl/npu_split_impl.h +++ /dev/null @@ -1,80 +0,0 @@ -/* Copyright 2025 The xLLM Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - https://github.com/jd-opensource/xllm/blob/main/LICENSE - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#pragma once -#ifdef TORCH_HIGHER_THAN_PTA6 -#include -#include -#else -#include -#include -#endif - -#include - -#include - -#include "atb/atb_infer.h" -#include "framework/kv_cache/kv_cache.h" -#include "framework/model/model_input_params.h" -#include "framework/state_dict/state_dict.h" -#include "layers/npu/npu_base_layer.h" -#include "nlohmann/json.hpp" -#include "pytorch/adapter/utils/utils.h" -#include "xllm_kernels/core/include/atb_speed/base/hosttensor_binder.h" -#include "xllm_kernels/core/include/atb_speed/base/model.h" -#include "xllm_kernels/core/include/atb_speed/log.h" -#include "xllm_kernels/core/include/atb_speed/utils/model_factory.h" - -namespace xllm::kernel { - -class NpuSplitImpl : public xllm::layer::NpuBaseLayer { - public: - explicit NpuSplitImpl(const ModelContext& context, - int32_t splitDim = 2, - int32_t splitNum = 3, - atb::SVector splitSizes = {}); - - ~NpuSplitImpl() {}; - - void load_state_dict(const StateDict& state_dict); - - void verify_loaded_weights(const std::string weight_str) const; - - void merge_loaded_weights(); - - std::vector forward(const torch::Tensor& input, int nodeId); - - private: - int64_t init_node(atb_speed::Model::Node& node, - atb::infer::SplitParam& param); - void build_node_variant_pack(atb_speed::Model::Node& node, - const torch::Tensor& input); - void param_from_args(atb::infer::SplitParam& param, - const ModelArgs& args, - int32_t splitDim, - int32_t splitNum, - atb::SVector splitSizes); - - std::vector at_out_tensors_; - atb::Tensor internal_input; - - atb_speed::Model::Node split_node_; - std::string model_name_; - atb::infer::SplitParam split_param_; - atb::Tensor internal_tensors_; -}; - -} // namespace xllm::kernel diff --git a/xllm/core/kernels/npu/impl/npu_split_test.cpp b/xllm/core/kernels/npu/impl/npu_split_test.cpp deleted file mode 100644 index 2e28d26c..00000000 --- a/xllm/core/kernels/npu/impl/npu_split_test.cpp +++ /dev/null @@ -1,356 +0,0 @@ -/* Copyright 2025 The xLLM Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - https://github.com/jd-opensource/xllm/blob/main/LICENSE - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include -#include - -#include "kernels/npu/split.h" - -namespace xllm::kernel { - -class NpuSplitTest : public ::testing::Test { - protected: - NpuSplitTest() : parallel_args_(1, 1, nullptr) { - try { - torch::zeros({1}, torch::TensorOptions().device("npu:0")); - - tensor_options_ = - torch::TensorOptions().dtype(torch::kFloat16).device("npu:0"); - npu_available_ = true; - std::cout << "Using NPU device" << std::endl; - - } catch (...) { - tensor_options_ = - torch::TensorOptions().dtype(torch::kFloat16).device(torch::kCPU); - npu_available_ = false; - std::cout << "Using CPU device (NPU unavailable)" << std::endl; - } - } - - void SetUp() override { - torch::manual_seed(42); - - model_args_.hidden_size() = 4096 * 3; - model_args_.intermediate_size() = 11008; - model_args_.dtype() = "float16"; - - quant_args_.torch_dtype() = "float16"; - - context_ = std::make_unique( - parallel_args_, model_args_, quant_args_, tensor_options_); - } - - void TearDown() override { - context_.reset(); - - if (npu_available_) { - try { - c10_npu::npuSynchronizeDevice(); - c10_npu::NPUCachingAllocator::emptyCache(); - std::this_thread::sleep_for(std::chrono::milliseconds(500)); - } catch (...) { - // NPU cleanup failures are usually not critical in test teardown - } - } - } - - StateDict CreateEmptyStateDict() { - std::unordered_map tensor_map; - return StateDict(tensor_map, ""); - } - - ModelArgs model_args_; - QuantArgs quant_args_; - ParallelArgs parallel_args_; - torch::TensorOptions tensor_options_; - std::unique_ptr context_; - bool npu_available_ = true; -}; - -// Test NpuSplitImpl construction -TEST_F(NpuSplitTest, ConstructorTest) { - if (!npu_available_) { - GTEST_SKIP() << "Skipping NPU test - NPU device not available"; - } - - ASSERT_NO_THROW({ - auto split = std::make_shared(*context_); - EXPECT_NE(split, nullptr); - }); -} - -// Test Split wrapper construction -TEST_F(NpuSplitTest, SplitWrapperTest) { - if (!npu_available_) { - GTEST_SKIP() << "Skipping NPU test - NPU device not available"; - } - - ASSERT_NO_THROW({ auto split = Split(*context_); }); -} - -// Test state dict loading (should be no-op for split layer) -TEST_F(NpuSplitTest, LoadStateDictTest) { - if (!npu_available_) { - GTEST_SKIP() << "Skipping NPU test - NPU device not available"; - } - - auto split = std::make_shared(*context_); - auto state_dict = CreateEmptyStateDict(); - - ASSERT_NO_THROW({ split->load_state_dict(state_dict); }); -} - -// Test weight verification (should pass for split layer as it has no weights) -TEST_F(NpuSplitTest, VerifyLoadedWeightsTest) { - if (!npu_available_) { - GTEST_SKIP() << "Skipping NPU test - NPU device not available"; - } - - auto split = std::make_shared(*context_); - - ASSERT_NO_THROW({ split->verify_loaded_weights("test_weight"); }); -} - -// Test merge loaded weights (should be no-op for split layer) -TEST_F(NpuSplitTest, MergeLoadedWeightsTest) { - if (!npu_available_) { - GTEST_SKIP() << "Skipping NPU test - NPU device not available"; - } - - auto split = std::make_shared(*context_); - - ASSERT_NO_THROW({ split->merge_loaded_weights(); }); -} - -// Test forward pass with basic input -TEST_F(NpuSplitTest, ForwardPassBasicTest) { - if (!npu_available_) { - GTEST_SKIP() << "Skipping NPU test - NPU device not available"; - } - - auto split = Split(*context_); - - // Input tensor with shape [batch_size, 
seq_len, hidden_size] - auto input = - torch::randn({1, 10, model_args_.hidden_size() * 3}, tensor_options_); - - try { - auto npu_stream = c10_npu::getCurrentNPUStream(0); - auto outputs = split->forward(input, 0); - aclrtSynchronizeStream(npu_stream.stream()); - std::cout << "Input tensor shape: " << input.sizes() << std::endl; - std::cout << "Number of output tensors: " << outputs.size() << std::endl; - - EXPECT_EQ(outputs.size(), 3); - - for (size_t i = 0; i < outputs.size(); ++i) { - EXPECT_EQ(outputs[i].size(0), 1); // batch size - EXPECT_EQ(outputs[i].size(1), 10); // sequence length - std::cout << "Output " << i << " shape: " << outputs[i].sizes() - << std::endl; - } - - int64_t total_output_features = 0; - for (const auto& output : outputs) { - total_output_features += output.size(2); - } - EXPECT_EQ(total_output_features, model_args_.hidden_size() * 3); - - } catch (const std::exception& e) { - GTEST_SKIP() << "Skipping forward pass test - requires NPU environment: " - << e.what(); - } -} - -// Test split functionality with different input shapes -TEST_F(NpuSplitTest, SplitDifferentInputShapesTest) { - if (!npu_available_) { - GTEST_SKIP() << "Skipping NPU test - NPU device not available"; - } - - auto split = Split(*context_); - - std::vector> input_shapes = { - {1, 5, model_args_.hidden_size()}, - {2, 10, model_args_.hidden_size()}, - {4, 20, model_args_.hidden_size()}, - {1, 1, model_args_.hidden_size()}}; - - for (const auto& shape : input_shapes) { - auto input = torch::randn(shape, tensor_options_); - - try { - auto npu_stream = c10_npu::getCurrentNPUStream(0); - auto outputs = split->forward(input, 0); - aclrtSynchronizeStream(npu_stream.stream()); - - EXPECT_EQ(outputs.size(), 3); - - for (const auto& output : outputs) { - EXPECT_EQ(output.size(0), shape[0]); - EXPECT_EQ(output.size(1), shape[1]); - EXPECT_GT(output.size(2), 0); - } - - int64_t total_features = 0; - for (const auto& output : outputs) { - total_features += output.size(2); - } - EXPECT_EQ(total_features, shape[2]); - - } catch (const std::exception& e) { - GTEST_SKIP() << "Skipping shape test for [" << shape[0] << ", " - << shape[1] << ", " << shape[2] - << "] - requires NPU environment: " << e.what(); - break; - } - } -} - -// Test split with different hidden sizes -TEST_F(NpuSplitTest, SplitDifferentHiddenSizesTest) { - if (!npu_available_) { - GTEST_SKIP() << "Skipping NPU test - NPU device not available"; - } - - std::vector hidden_sizes = { - 768 * 3, 1024 * 3, 2048 * 3, 4096 * 3, 6144 * 3, 8192 * 3}; - auto npu_stream = c10_npu::getCurrentNPUStream(0); - for (auto hidden_size : hidden_sizes) { - model_args_.hidden_size() = hidden_size; - - QuantArgs local_quant_args = quant_args_; - local_quant_args.torch_dtype() = "float16"; - - auto context = std::make_unique( - parallel_args_, model_args_, local_quant_args, tensor_options_); - - try { - auto split = Split(*context); - - auto input = torch::randn({1, 10, hidden_size}, tensor_options_); - - auto npu_stream = c10_npu::getCurrentNPUStream(0); - auto outputs = split->forward(input, 0); - aclrtSynchronizeStream(npu_stream.stream()); - aclrtSynchronizeStream(npu_stream.stream()); - EXPECT_EQ(outputs.size(), 3); - - int64_t total_features = 0; - for (const auto& output : outputs) { - total_features += output.size(2); - } - EXPECT_EQ(total_features, hidden_size); - - } catch (const std::exception& e) { - GTEST_SKIP() << "Skipping hidden size test for " << hidden_size - << " - requires NPU environment: " << e.what(); - break; - } - } -} - -// Test error 
handling with invalid inputs -TEST_F(NpuSplitTest, ErrorHandlingTest) { - if (!npu_available_) { - GTEST_SKIP() << "Skipping NPU test - NPU device not available"; - } - - auto split = Split(*context_); - - try { - auto empty_input = torch::empty({0, 0, 0}, tensor_options_); - auto npu_stream = c10_npu::getCurrentNPUStream(0); - auto outputs = split->forward(empty_input, 0); - aclrtSynchronizeStream(npu_stream.stream()); - } catch (const std::exception& e) { - std::cout << "Correctly caught expected error for empty tensor: " - << e.what() << std::endl; - } - - try { - auto wrong_dim_input = - torch::randn({10, model_args_.hidden_size()}, tensor_options_); - auto npu_stream = c10_npu::getCurrentNPUStream(0); - auto outputs = split->forward(wrong_dim_input, 0); - aclrtSynchronizeStream(npu_stream.stream()); - } catch (const std::exception& e) { - std::cout << "Caught error for 2D input: " << e.what() << std::endl; - } -} - -// Test consistency of split operation -TEST_F(NpuSplitTest, SplitConsistencyTest) { - if (!npu_available_) { - GTEST_SKIP() << "Skipping NPU test - NPU device not available"; - } - - auto split = Split(*context_); - auto input1 = - torch::randn({2, 5, model_args_.hidden_size()}, tensor_options_); - - try { - auto npu_stream = c10_npu::getCurrentNPUStream(0); - - auto outputs1 = split->forward(input1, 0); - aclrtSynchronizeStream(npu_stream.stream()); - - auto outputs2 = split->forward(input1, 1); - aclrtSynchronizeStream(npu_stream.stream()); - - EXPECT_EQ(outputs1.size(), outputs2.size()); - - for (size_t i = 0; i < outputs1.size(); ++i) { - EXPECT_TRUE(outputs1[i].sizes().equals(outputs2[i].sizes())); - } - - } catch (const std::exception& e) { - GTEST_SKIP() << "Skipping consistency test - requires NPU environment: " - << e.what(); - } -} - -} // namespace xllm::kernel - -int main(int argc, char** argv) { - struct rlimit core_limit; - core_limit.rlim_cur = 0; - core_limit.rlim_max = 0; - setrlimit(RLIMIT_CORE, &core_limit); - - FILE* null_stderr = freopen("/dev/null", "w", stderr); - if (null_stderr == nullptr) { - fclose(stderr); - } - - ::testing::InitGoogleTest(&argc, argv); - - bool npu_available = false; - try { - auto test_tensor = - torch::zeros({1}, torch::TensorOptions().device("npu:0")); - npu_available = true; - } catch (...) { - npu_available = false; - } - - if (!npu_available) { - std::cout << "NPU device not available, skipping all tests." << std::endl; - return 0; - } - - int result = RUN_ALL_TESTS(); - _exit(result); -} \ No newline at end of file diff --git a/xllm/core/kernels/npu/linear.h b/xllm/core/kernels/npu/matmul.cpp similarity index 61% rename from xllm/core/kernels/npu/linear.h rename to xllm/core/kernels/npu/matmul.cpp index 0834c014..0b80c9dd 100644 --- a/xllm/core/kernels/npu/linear.h +++ b/xllm/core/kernels/npu/matmul.cpp @@ -13,18 +13,19 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#pragma once -#include "impl/npu_linear_impl.h" - -namespace xllm::kernel { - -class Linear : public torch::nn::ModuleHolder { - public: - using torch::nn::ModuleHolder::ModuleHolder; - using Impl __attribute__((__unused__)) = NpuLinearImpl; - - Linear(const ModelContext& context) - : ModuleHolder(std::make_shared(context)) {} -}; - -} // namespace xllm::kernel +#include "npu_ops_api.h" +#include "ops_npu/npu_ops.h" + +namespace xllm::kernel::npu { + +torch::Tensor matmul(const torch::Tensor& a, + const torch::Tensor& b, + const std::optional& bias) { + if (!bias.has_value()) { + return torch::nn::functional::linear(a, b); + } else { + return torch::nn::functional::linear(a, b, bias.value()); + } +} + +} // namespace xllm::kernel::npu diff --git a/xllm/core/kernels/npu/npu_ops_api.h b/xllm/core/kernels/npu/npu_ops_api.h new file mode 100644 index 00000000..e9c85b38 --- /dev/null +++ b/xllm/core/kernels/npu/npu_ops_api.h @@ -0,0 +1,65 @@ +/* Copyright 2025 The xLLM Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://github.com/jd-opensource/xllm/blob/main/LICENSE + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#pragma once +#include + +#include + +#include "./custom_functions_npu/AtbCommon.h" + +namespace xllm::kernel::npu { + +void reshape_paged_cache(torch::Tensor& key, + torch::Tensor& value, + torch::Tensor& k_cache, + torch::Tensor& v_cache, + const torch::Tensor& slot_mapping); + +void batch_prefill(const torch::Tensor& query, + const torch::Tensor& key, + const torch::Tensor& value, + const torch::Tensor& mask, + const torch::Tensor& seq_len, + float scale, + int num_heads, + int num_kv_heads, + torch::Tensor& output); + +void batch_decode(const torch::Tensor& query, + const torch::Tensor& k_cache, + const torch::Tensor& v_cache, + int num_kv_heads, + int num_heads, + float scale, + const torch::Tensor& block_table, + const torch::Tensor& seq_lens, + torch::Tensor& output); + +torch::Tensor matmul(const torch::Tensor& a, + const torch::Tensor& b, + const std::optional& bias); + +torch::Tensor active(const torch::Tensor& input); + +torch::Tensor fused_layernorm(const torch::Tensor& input, + const torch::Tensor& weight, + double eps); + +void apply_rotary(torch::Tensor& q, + torch::Tensor& k, + const torch::Tensor& cos_sin_cache, + const torch::Tensor& positions); +} // namespace xllm::kernel::npu diff --git a/xllm/core/kernels/npu/ops_npu/PagedAttentionAtb.cpp b/xllm/core/kernels/npu/ops_npu/PagedAttentionAtb.cpp new file mode 100644 index 00000000..a05ee76b --- /dev/null +++ b/xllm/core/kernels/npu/ops_npu/PagedAttentionAtb.cpp @@ -0,0 +1,61 @@ +// Copyright (c) 2025 Huawei Technologies Co., Ltd +// All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "../custom_functions_npu/AtbCommon.h" + +namespace atb { +using PagedAttentionParam = atb::infer::PagedAttentionParam; +void _npu_paged_attention(const at::Tensor& query, + const at::Tensor& key_cache, + const at::Tensor& value_cache, + int64_t num_kv_heads, + int64_t num_heads, + double scale_value, + const at::Tensor& block_table, + const at::Tensor& context_lens, + at::Tensor& out) { + const c10::OptionalDeviceGuard device_guard(device_of(query)); + OpParamCache& pagedAttentionParamCache = + OpParamCache::getInstance(); + PagedAttentionParam pagedparam; + pagedparam.headNum = num_heads; + pagedparam.qkScale = scale_value; + pagedparam.kvHeadNum = num_kv_heads; + pagedparam.maskType = PagedAttentionParam::UNDEFINED; + pagedparam.batchRunStatusEnable = false; + pagedparam.quantType = PagedAttentionParam::TYPE_QUANT_UNDEFINED; + pagedparam.outDataType = ACL_DT_UNDEFINED; + pagedparam.hasQuantOffset = false; + pagedparam.compressType = PagedAttentionParam::COMPRESS_TYPE_UNDEFINED; + pagedparam.calcType = PagedAttentionParam::CALC_TYPE_UNDEFINED; + pagedparam.scaleType = PagedAttentionParam::SCALE_TYPE_TOR; + pagedparam.inputLayout = atb::infer::TYPE_BSND; + pagedparam.mlaVHeadSize = 0; + + ParamSetter paramsetter; + paramsetter.Input(query, true) + .Input(key_cache) + .Input(value_cache) + .Input(block_table, true) + .Input(context_lens, true) + .Output(out); + auto opPaged = pagedAttentionParamCache.getOperation( + pagedparam, "PagedAttentionOperation"); + RunAtbCmd(opPaged, paramsetter, "PagedAttentionOperation"); + + return; +} + +} // namespace atb \ No newline at end of file diff --git a/xllm/core/kernels/npu/ops_npu/ReshapeAndCachAtb.cpp b/xllm/core/kernels/npu/ops_npu/ReshapeAndCachAtb.cpp new file mode 100644 index 00000000..cba05d19 --- /dev/null +++ b/xllm/core/kernels/npu/ops_npu/ReshapeAndCachAtb.cpp @@ -0,0 +1,58 @@ +// Copyright (c) 2025 Huawei Technologies Co., Ltd +// All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
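// ---------------------------------------------------------------------------
// Hedged glue sketch (the actual definition is not shown in this section of
// the diff): npu_ops_api.h declares xllm::kernel::npu::batch_decode with the
// argument list below, and one plausible implementation simply forwards to the
// ATB PagedAttention wrapper above. Treat this as an illustration, not the
// authoritative implementation.
#include <torch/torch.h>

#include "npu_ops_api.h"
#include "ops_npu/npu_ops.h"

namespace xllm::kernel::npu {

void batch_decode(const torch::Tensor& query,
                  const torch::Tensor& k_cache,
                  const torch::Tensor& v_cache,
                  int num_kv_heads,
                  int num_heads,
                  float scale,
                  const torch::Tensor& block_table,
                  const torch::Tensor& seq_lens,
                  torch::Tensor& output) {
  // Decode-phase attention over the paged KV cache: block_table maps each
  // sequence to its cache blocks, seq_lens gives the current context lengths.
  atb::_npu_paged_attention(query,
                            k_cache,
                            v_cache,
                            num_kv_heads,
                            num_heads,
                            scale,
                            block_table,
                            seq_lens,
                            output);
}

}  // namespace xllm::kernel::npu
// ---------------------------------------------------------------------------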
+ +#include + +#include "../custom_functions_npu/AtbCommon.h" + +using namespace std; +namespace atb { +using ReshapeAndCacheParam = atb::infer::ReshapeAndCacheParam; +void _npu_reshape_and_cache(const at::Tensor& key, + const at::Tensor& value, + at::Tensor& key_cache, + at::Tensor& value_cache, + const at::Tensor& slot_indices) { + const c10::OptionalDeviceGuard device_guard(device_of(key)); + OpParamCache& reshapeAndCacheParamCache = + OpParamCache::getInstance(); + ReshapeAndCacheParam reshapeparam; + reshapeparam.compressType = ReshapeAndCacheParam::COMPRESS_TYPE_UNDEFINED; + + auto key_cache_format = at_npu::native::get_npu_format(key_cache); + auto value_cache_format = at_npu::native::get_npu_format(value_cache); + bool is_key_cache_nz = (key_cache_format == ACL_FORMAT_FRACTAL_NZ); + bool is_value_cache_nz = (value_cache_format == ACL_FORMAT_FRACTAL_NZ); + + if (is_key_cache_nz && is_value_cache_nz) { + reshapeparam.kvCacheCfg = ReshapeAndCacheParam::K_CACHE_V_CACHE_NZ; + } else { + reshapeparam.kvCacheCfg = ReshapeAndCacheParam::K_CACHE_V_CACHE; + } + + ParamSetter parametter; + parametter.Input(key, true) + .Input(value, true) + .Input(key_cache) + .Input(value_cache) + .Input(slot_indices, true) + .Output(key_cache) + .Output(value_cache); + auto opReshape = reshapeAndCacheParamCache.getOperation( + reshapeparam, "ReshapeCacheOperation"); + RunAtbCmd(opReshape, parametter, "ReshapeCacheOperation"); + + return; +} + +} // namespace atb diff --git a/xllm/core/kernels/npu/ops_npu/SelfAttentionAtb.cpp b/xllm/core/kernels/npu/ops_npu/SelfAttentionAtb.cpp new file mode 100644 index 00000000..08f14497 --- /dev/null +++ b/xllm/core/kernels/npu/ops_npu/SelfAttentionAtb.cpp @@ -0,0 +1,71 @@ +// Copyright (c) 2025 Huawei Technologies Co., Ltd +// All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
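// ---------------------------------------------------------------------------
// Hedged usage sketch for the reshape-and-cache path declared in npu_ops_api.h
// and wrapped above. The tensor shapes in the comments are illustrative
// assumptions, not taken from this patch.
#include <torch/torch.h>

#include "npu_ops_api.h"

// Scatters this step's per-token K/V rows into the paged KV cache in place,
// using slot_mapping to select the destination slot of every token.
void cache_new_kv_example(torch::Tensor& key,      // e.g. [num_tokens, kv_heads, head_dim]
                          torch::Tensor& value,    // same shape as key
                          torch::Tensor& k_cache,  // e.g. [num_blocks, block_size, kv_heads, head_dim]
                          torch::Tensor& v_cache,  // same layout as k_cache
                          const torch::Tensor& slot_mapping) {  // [num_tokens] slot ids
  xllm::kernel::npu::reshape_paged_cache(key, value, k_cache, v_cache, slot_mapping);
}
// ---------------------------------------------------------------------------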
+ +#include + +#include "../custom_functions_npu/AtbCommon.h" + +using namespace std; +namespace atb { +using SelfAttentionParam = atb::infer::SelfAttentionParam; +void _npu_flash_attention(const at::Tensor& query, + const at::Tensor& key, + const at::Tensor& value, + const at::Tensor& mask, + const at::Tensor& seq_len, + const double scale_value, + const int64_t num_heads, + const int64_t num_kv_heads, + at::Tensor& out) { + const c10::OptionalDeviceGuard device_guard(device_of(query)); + OpParamCache& selfAttentionParamCache = + OpParamCache::getInstance(); + SelfAttentionParam selfattentionparam; + + selfattentionparam.calcType = SelfAttentionParam::PA_ENCODER; + selfattentionparam.kernelType = SelfAttentionParam::KERNELTYPE_DEFAULT; + selfattentionparam.clampType = SelfAttentionParam::CLAMP_TYPE_UNDEFINED; + selfattentionparam.maskType = SelfAttentionParam::MASK_TYPE_NORM; + selfattentionparam.kvcacheCfg = SelfAttentionParam::K_CACHE_V_CACHE; + selfattentionparam.scaleType = SelfAttentionParam::SCALE_TYPE_TOR; + selfattentionparam.quantType = SelfAttentionParam::TYPE_QUANT_UNDEFINED; + selfattentionparam.cacheType = SelfAttentionParam::CACHE_TYPE_NORM; + selfattentionparam.outDataType = ACL_DT_UNDEFINED; + selfattentionparam.headNum = num_heads; + selfattentionparam.kvHeadNum = num_kv_heads; + selfattentionparam.qScale = 1; + selfattentionparam.qkScale = scale_value; + selfattentionparam.batchRunStatusEnable = false; + selfattentionparam.isTriuMask = 0; + selfattentionparam.clampMin = 0; + selfattentionparam.clampMax = 0; + selfattentionparam.inputLayout = atb::infer::TYPE_BSND; + selfattentionparam.mlaVHeadSize = 0; + selfattentionparam.windowSize = 0; + + ParamSetter parametter; + parametter.Input(query, true) + .Input(key, true) + .Input(value, true) + .Input(mask) + .Input(seq_len, true) + .Output(out); + + auto opSelfattention = selfAttentionParamCache.getOperation( + selfattentionparam, "SelfAttentionOperation"); + RunAtbCmd(opSelfattention, parametter, "SelfAttentionOperation"); + + return; +} + +} // namespace atb \ No newline at end of file diff --git a/xllm/core/kernels/npu/ops_npu/npu_ops.h b/xllm/core/kernels/npu/ops_npu/npu_ops.h new file mode 100644 index 00000000..de6b6039 --- /dev/null +++ b/xllm/core/kernels/npu/ops_npu/npu_ops.h @@ -0,0 +1,55 @@ +// Copyright (c) 2025 Huawei Technologies Co., Ltd +// All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
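// ---------------------------------------------------------------------------
// Hedged sketch of a caller-side causal mask for the PA_ENCODER prefill
// wrapper above (maskType = MASK_TYPE_NORM). The exact mask convention ATB
// expects is not spelled out in this patch; the "large negative above the
// diagonal" form and the -10000 fill value below are only common assumptions.
#include <torch/torch.h>

torch::Tensor make_causal_mask_example(int64_t max_seq_len,
                                       const torch::Device& device) {
  auto opts = torch::TensorOptions().dtype(torch::kFloat16).device(device);
  // Zero on and below the diagonal, a large negative value strictly above it,
  // so future positions are suppressed in the attention softmax.
  auto mask = torch::full({max_seq_len, max_seq_len}, -10000.0, opts);
  return torch::triu(mask, /*diagonal=*/1);
}
// ---------------------------------------------------------------------------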
+ +#ifndef XLLM_NPU_OPS_H +#define XLLM_NPU_OPS_H + +#include "../custom_functions_npu/AtbCommon.h" + +using namespace std; + +namespace atb { + +using PagedAttentionParam = atb::infer::PagedAttentionParam; +using ReshapeAndCacheParam = atb::infer::ReshapeAndCacheParam; +using SelfAttentionParam = atb::infer::SelfAttentionParam; + +void _npu_paged_attention(const at::Tensor& query, + const at::Tensor& key_cache, + const at::Tensor& value_cache, + int64_t num_kv_heads, + int64_t num_heads, + double scale_value, + const at::Tensor& block_table, + const at::Tensor& context_lens, + at::Tensor& out); + +void _npu_reshape_and_cache(const at::Tensor& key, + const at::Tensor& value, + at::Tensor& key_cache, + at::Tensor& value_cache, + const at::Tensor& slot_indices); + +void _npu_flash_attention(const at::Tensor& query, + const at::Tensor& key, + const at::Tensor& value, + const at::Tensor& mask, + const at::Tensor& seq_len, + const double scale_value, + const int64_t num_heads, + const int64_t num_kv_heads, + at::Tensor& out); + +} // namespace atb + +#endif // XLLM_NPU_OPS_H \ No newline at end of file diff --git a/xllm/core/kernels/npu/rope.cpp b/xllm/core/kernels/npu/rope.cpp new file mode 100644 index 00000000..9e312f96 --- /dev/null +++ b/xllm/core/kernels/npu/rope.cpp @@ -0,0 +1,42 @@ +/* Copyright 2025 The xLLM Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://github.com/jd-opensource/xllm/blob/main/LICENSE + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include + +#include "npu_ops_api.h" +#include "ops_npu/npu_ops.h" + +namespace xllm::kernel::npu { + +void apply_rotary(torch::Tensor& q, + torch::Tensor& k, + const torch::Tensor& cos_sin_cache, + const torch::Tensor& positions) { + auto cos_sin = cos_sin_cache.index_select(0, positions); + auto last_dim = cos_sin.size(-1); + auto cos_sin_vec = cos_sin.view({-1, 2, last_dim / 2}) + .repeat({1, 1, 2}) + .chunk(2, /*dim=*/-2); + auto cos = cos_sin_vec[0].view({1, -1, 1, last_dim}); + auto sin = cos_sin_vec[1].view({1, -1, 1, last_dim}); + + const int64_t rotary_dim = sin.size(-1); + q = q.view({1, q.size(0), -1, rotary_dim}); + k = k.view({1, k.size(0), -1, rotary_dim}); + + at_npu::native::custom_ops::npu_apply_rotary_pos_emb(q, k, cos, sin); +} + +} // namespace xllm::kernel::npu \ No newline at end of file diff --git a/xllm/core/kernels/npu/split.h b/xllm/core/kernels/npu/split.h deleted file mode 100644 index cda39703..00000000 --- a/xllm/core/kernels/npu/split.h +++ /dev/null @@ -1,35 +0,0 @@ -/* Copyright 2025 The xLLM Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - https://github.com/jd-opensource/xllm/blob/main/LICENSE - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#pragma once -#include "impl/npu_split_impl.h" - -namespace xllm::kernel { -class Split : public torch::nn::ModuleHolder { - public: - using torch::nn::ModuleHolder::ModuleHolder; - using Impl __attribute__((__unused__)) = NpuSplitImpl; - - Split(const ModelContext& context, - int32_t splitDim = 2, - int32_t splitNum = 3, - atb::SVector splitSizes = {}) - : ModuleHolder(std::make_shared(context, - splitDim, - splitNum, - splitSizes)) {} -}; - -} // namespace xllm::kernel diff --git a/xllm/core/kernels/ops_api.cpp b/xllm/core/kernels/ops_api.cpp index 1d468c72..70bdcf1e 100644 --- a/xllm/core/kernels/ops_api.cpp +++ b/xllm/core/kernels/ops_api.cpp @@ -30,6 +30,8 @@ void apply_rotary(RotaryParams& params) { params.discrete, params.dynamic_ntk, params.max_query_len); +#elif defined(USE_NPU) + npu::apply_rotary(params.q, params.k, params.cos_sin, params.positions); #else throw std::runtime_error("apply_rotary not implemented"); #endif @@ -50,6 +52,14 @@ void active(ActivationParams& params) { #endif } +torch::Tensor active_tensor(ActivationParams& params) { +#if defined(USE_NPU) + return npu::active(params.input); +#else + throw std::runtime_error("active not implemented"); +#endif +} + void reshape_paged_cache(ReshapePagedCacheParams& params) { #if defined(USE_MLU) mlu::reshape_paged_cache(params.key, @@ -58,6 +68,12 @@ void reshape_paged_cache(ReshapePagedCacheParams& params) { params.v_cache, params.slot_mapping, params.direction); +#elif defined(USE_NPU) + npu::reshape_paged_cache(params.key, + params.value, + params.k_cache, + params.v_cache, + params.slot_mapping); #else throw std::runtime_error("reshape_paged_cache not implemented"); #endif @@ -87,6 +103,16 @@ void batch_prefill(AttentionParams& params) { params.window_size_right, params.compute_dtype, params.return_lse); +#elif defined(USE_NPU) + npu::batch_prefill(params.query, + params.key, + params.value, + params.attn_mask, + params.seq_lens, + params.scale, + params.num_heads, + params.num_kv_heads, + params.output); #else throw std::runtime_error("batch_prefill not implemented"); #endif @@ -114,6 +140,16 @@ void batch_decode(AttentionParams& params) { params.scale, params.return_lse, params.kv_cache_quant_bit_size); +#elif defined(USE_NPU) + npu::batch_decode(params.query, + params.k_cache, + params.v_cache, + params.num_kv_heads, + params.num_heads, + params.scale, + params.block_table.value(), + params.seq_lens, + params.output); #else throw std::runtime_error("batch_decode not implemented"); #endif @@ -141,10 +177,20 @@ void fused_layernorm(FusedLayerNormParams& params) { #endif } +torch::Tensor fused_layernorm_tensor(FusedLayerNormParams& params) { +#if defined(USE_NPU) + return npu::fused_layernorm(params.input, params.weight, params.eps); +#else + throw std::runtime_error("fused_layernorm not implemented"); +#endif +} + torch::Tensor matmul(MatmulParams& params) { #if defined(USE_MLU) return mlu::matmul( params.a, params.b, params.bias, params.c, params.alpha, params.beta); +#elif defined(USE_NPU) + return npu::matmul(params.a, params.b, params.bias); #else throw std::runtime_error("matmul not implemented"); #endif diff --git a/xllm/core/kernels/ops_api.h b/xllm/core/kernels/ops_api.h index 46bc74bd..6d41d6af 100644 --- a/xllm/core/kernels/ops_api.h +++ b/xllm/core/kernels/ops_api.h @@ -19,6 +19,8 @@ limitations under the License. 
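// ---------------------------------------------------------------------------
// Hedged usage sketch of the device-dispatched matmul entry point extended in
// ops_api.cpp above. The include paths and the std::optional type of the bias
// field are assumptions inferred from npu::matmul's signature; the remaining
// MatmulParams fields (c, alpha, beta) are left at their defaults.
#include <optional>

#include <torch/torch.h>

#include "kernels/ops_api.h"  // path is illustrative
#include "kernels/param.h"    // path is illustrative

torch::Tensor linear_via_ops_api(const torch::Tensor& x,       // [num_tokens, in_features]
                                 const torch::Tensor& weight,  // [out_features, in_features]
                                 const std::optional<torch::Tensor>& bias) {
  xllm::kernel::MatmulParams params;
  params.a = x;
  params.b = weight;
  params.bias = bias;
  // On USE_NPU builds this resolves to npu::matmul, which is a thin wrapper
  // around torch::nn::functional::linear.
  return xllm::kernel::matmul(params);
}
// ---------------------------------------------------------------------------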
#if defined(USE_MLU) #include "mlu/mlu_ops_api.h" +#elif defined(USE_NPU) +#include "npu/npu_ops_api.h" #endif namespace xllm { @@ -28,6 +30,8 @@ void apply_rotary(RotaryParams& params); void active(ActivationParams& params); +torch::Tensor active_tensor(ActivationParams& params); + void reshape_paged_cache(ReshapePagedCacheParams& params); void batch_prefill(AttentionParams& params); @@ -36,6 +40,8 @@ void batch_decode(AttentionParams& params); void fused_layernorm(FusedLayerNormParams& params); +torch::Tensor fused_layernorm_tensor(FusedLayerNormParams& params); + torch::Tensor matmul(MatmulParams& params); torch::Tensor fused_moe(FusedMoEParams& params); diff --git a/xllm/core/kernels/param.h b/xllm/core/kernels/param.h index ff0a3410..c4f776c4 100644 --- a/xllm/core/kernels/param.h +++ b/xllm/core/kernels/param.h @@ -39,6 +39,7 @@ struct RotaryParams { bool discrete; bool dynamic_ntk = false; int max_query_len; + torch::Tensor positions; }; // Activation parameters @@ -79,6 +80,11 @@ struct AttentionParams { int window_size_right = -1; float scale; bool return_lse = false; + // for npu + torch::Tensor seq_lens; + int num_heads; + int num_kv_heads; + torch::Tensor attn_mask; // for flashinfer torch::Tensor paged_kv_indptr; torch::Tensor paged_kv_indices; diff --git a/xllm/core/layers/CMakeLists.txt b/xllm/core/layers/CMakeLists.txt index 6ad3d0c7..dc3b63cf 100644 --- a/xllm/core/layers/CMakeLists.txt +++ b/xllm/core/layers/CMakeLists.txt @@ -79,6 +79,5 @@ cc_library( if(USE_NPU) add_subdirectory(npu) -else() - add_subdirectory(common) endif() +add_subdirectory(common) diff --git a/xllm/core/layers/column_parallel_linear.h b/xllm/core/layers/column_parallel_linear.h index aadeec1e..25e05d8a 100644 --- a/xllm/core/layers/column_parallel_linear.h +++ b/xllm/core/layers/column_parallel_linear.h @@ -23,13 +23,13 @@ namespace xllm { namespace layer { #if defined(USE_NPU) -class ColumnParallelLinear +class NpuColumnParallelLinear : public torch::nn::ModuleHolder { public: using torch::nn::ModuleHolder::ModuleHolder; using Impl __attribute__((__unused__)) = NpuColumnParallelLinearImpl; - ColumnParallelLinear(const ModelContext& context) + NpuColumnParallelLinear(const ModelContext& context) : ModuleHolder(std::make_shared(context)) {} }; #endif diff --git a/xllm/core/layers/common/CMakeLists.txt b/xllm/core/layers/common/CMakeLists.txt old mode 100755 new mode 100644 index 4600fafd..6c9f5bf8 --- a/xllm/core/layers/common/CMakeLists.txt +++ b/xllm/core/layers/common/CMakeLists.txt @@ -43,6 +43,7 @@ cc_library( torch ) +if(NOT USE_NPU) # Add test for DenseMLP cc_test( NAME @@ -76,3 +77,4 @@ cc_test( torch GTest::gtest_main ) +endif() \ No newline at end of file diff --git a/xllm/core/layers/common/attention.cpp b/xllm/core/layers/common/attention.cpp index adb8b911..3a2d8833 100644 --- a/xllm/core/layers/common/attention.cpp +++ b/xllm/core/layers/common/attention.cpp @@ -21,14 +21,29 @@ DECLARE_bool(enable_chunked_prefill); namespace xllm { namespace layer { +#if defined(USE_NPU) +AttentionMetadata AttentionMetadata::build(const ModelInputParams& params, + bool is_prefill, + const torch::Tensor& attn_mask) { + return AttentionMetadata::build(params, "float", is_prefill, attn_mask); +} +#else AttentionMetadata AttentionMetadata::build(const ModelInputParams& params, bool is_prefill) { return AttentionMetadata::build(params, "float", is_prefill); } +#endif +#if defined(USE_NPU) +AttentionMetadata AttentionMetadata::build(const ModelInputParams& params, + const std::string& compute_dtype, + bool 
is_prefill, + const torch::Tensor& attn_mask) { +#else AttentionMetadata AttentionMetadata::build(const ModelInputParams& params, const std::string& compute_dtype, bool is_prefill) { +#endif AttentionMetadata attn_metadata; attn_metadata.query_start_loc = params.q_seq_lens; attn_metadata.seq_start_loc = params.kv_seq_lens; @@ -37,6 +52,11 @@ AttentionMetadata AttentionMetadata::build(const ModelInputParams& params, attn_metadata.slot_mapping = params.new_cache_slots; attn_metadata.compute_dtype = compute_dtype; +#if defined(USE_NPU) + attn_metadata.attn_mask = attn_mask; + attn_metadata.seq_lens = params.kv_seq_lens.to(torch::kCPU); +#endif + bool is_start_loc_match = (params.q_seq_lens_vec == params.kv_seq_lens_vec); attn_metadata.is_chunked_prefill = is_prefill && !is_start_loc_match; attn_metadata.is_prefill = is_prefill && !attn_metadata.is_chunked_prefill; @@ -87,7 +107,6 @@ std::tuple> AttentionImpl::forward( reshape_paged_cache_params.v_cache = v_cache; reshape_paged_cache_params.slot_mapping = attn_metadata.slot_mapping; xllm::kernel::reshape_paged_cache(reshape_paged_cache_params); - xllm::kernel::AttentionParams attention_params; attention_params.query = query; attention_params.output = output; @@ -103,7 +122,12 @@ std::tuple> AttentionImpl::forward( attention_params.query_start_loc = attn_metadata.query_start_loc; attention_params.seq_start_loc = attn_metadata.seq_start_loc; attention_params.max_query_len = attn_metadata.max_query_len; - +#if defined(USE_NPU) + attention_params.num_heads = num_heads_; + attention_params.num_kv_heads = num_kv_heads_; + attention_params.attn_mask = attn_metadata.attn_mask; + attention_params.seq_lens = attn_metadata.seq_lens; +#endif xllm::kernel::batch_prefill(attention_params); } else if (attn_metadata.is_chunked_prefill) { attention_params.key = k_cache; @@ -115,8 +139,16 @@ std::tuple> AttentionImpl::forward( xllm::kernel::batch_prefill(attention_params); } else { +#if defined(USE_NPU) + query = query.view({-1, num_heads_, head_size_}); + output = output.view({-1, num_heads_, head_size_}); + attention_params.num_heads = num_heads_; + attention_params.num_kv_heads = num_kv_heads_; + attention_params.seq_lens = attn_metadata.seq_lens; +#else query = query.view({-1, 1, num_heads_, head_size_}); output = output.view({-1, 1, num_heads_, head_size_}); +#endif attention_params.query = query; attention_params.output = output; diff --git a/xllm/core/layers/common/attention.h b/xllm/core/layers/common/attention.h index 7e210001..a1cc9b9b 100644 --- a/xllm/core/layers/common/attention.h +++ b/xllm/core/layers/common/attention.h @@ -27,12 +27,25 @@ namespace layer { struct AttentionMetadata { public: +#if defined(USE_NPU) + static AttentionMetadata build(const ModelInputParams& params, + bool is_prefill, + const torch::Tensor& attn_mask); + + static AttentionMetadata build(const ModelInputParams& params, + const std::string& compute_dtype, + bool is_prefill, + const torch::Tensor& attn_mask); + torch::Tensor attn_mask; + torch::Tensor seq_lens; +#else static AttentionMetadata build(const ModelInputParams& params, bool is_prefill); static AttentionMetadata build(const ModelInputParams& params, const std::string& compute_dtype, bool is_prefill); +#endif torch::Tensor query_start_loc; torch::Tensor seq_start_loc; diff --git a/xllm/core/layers/common/dense_mlp.cpp b/xllm/core/layers/common/dense_mlp.cpp index b487b90f..cda35725 100644 --- a/xllm/core/layers/common/dense_mlp.cpp +++ b/xllm/core/layers/common/dense_mlp.cpp @@ -89,6 +89,11 @@ torch::Tensor 
DenseMLPImpl::forward(const torch::Tensor& hidden_states) { return down_proj_->forward(gate_up); } else { int64_t batch_size = gate_up.sizes()[0]; +#if defined(USE_NPU) + xllm::kernel::ActivationParams activation_params; + activation_params.input = gate_up; + auto output = xllm::kernel::active_tensor(activation_params); +#else auto output = torch::empty( {batch_size, intermediate_size_ / parallel_args_.tp_group_->world_size()}, @@ -100,6 +105,7 @@ torch::Tensor DenseMLPImpl::forward(const torch::Tensor& hidden_states) { activation_params.act_mode = hidden_act_; activation_params.is_gated = is_gated_; xllm::kernel::active(activation_params); +#endif return down_proj_->forward(output); } diff --git a/xllm/core/layers/common/fuse_norm.cpp b/xllm/core/layers/common/fuse_norm.cpp index 9b5dec01..b546258c 100644 --- a/xllm/core/layers/common/fuse_norm.cpp +++ b/xllm/core/layers/common/fuse_norm.cpp @@ -37,6 +37,13 @@ FusedRMSNormImpl::FusedRMSNormImpl(int64_t dim, torch::Tensor FusedRMSNormImpl::forward(torch::Tensor& input) { auto org_shape = input.sizes().vec(); input = input.reshape({-1, norm_dim_}); +#if defined(USE_NPU) + xllm::kernel::FusedLayerNormParams fused_layernorm_params; + fused_layernorm_params.input = input; + fused_layernorm_params.weight = weight_; + fused_layernorm_params.eps = eps_; + auto output = xllm::kernel::fused_layernorm_tensor(fused_layernorm_params); +#else auto output = torch::empty_like(input); xllm::kernel::FusedLayerNormParams fused_layernorm_params; @@ -47,6 +54,7 @@ torch::Tensor FusedRMSNormImpl::forward(torch::Tensor& input) { fused_layernorm_params.eps = eps_; xllm::kernel::fused_layernorm(fused_layernorm_params); +#endif output = output.view(org_shape); return output; diff --git a/xllm/core/layers/common/qwen3_attention.cpp b/xllm/core/layers/common/qwen3_attention.cpp index d3d08768..0c58bb86 100644 --- a/xllm/core/layers/common/qwen3_attention.cpp +++ b/xllm/core/layers/common/qwen3_attention.cpp @@ -77,7 +77,6 @@ Qwen3AttentionImpl::Qwen3AttentionImpl(const ModelArgs& args, k_norm_ = register_module( "k_norm", RmsNorm(args.head_dim(), args.rms_norm_eps(), options)); - // 4. 
Rotary embedding rotary_emb_ = register_module("rope", RotaryEmbedding(/*rotary_dim=*/head_dim_, diff --git a/xllm/core/layers/common/qwen3_attention.h b/xllm/core/layers/common/qwen3_attention.h index 9d5536ce..b7ba2e90 100644 --- a/xllm/core/layers/common/qwen3_attention.h +++ b/xllm/core/layers/common/qwen3_attention.h @@ -56,8 +56,10 @@ class Qwen3AttentionImpl : public torch::nn::Module { QKVParallelLinear qkv_proj_{nullptr}; RowParallelLinear o_proj_{nullptr}; + RmsNorm q_norm_{nullptr}; RmsNorm k_norm_{nullptr}; + Attention attn_{nullptr}; RotaryEmbedding rotary_emb_{nullptr}; }; diff --git a/xllm/core/layers/common/qwen3_decoder_layer.h b/xllm/core/layers/common/qwen3_decoder_layer.h index f5c8cc26..c9c0a278 100644 --- a/xllm/core/layers/common/qwen3_decoder_layer.h +++ b/xllm/core/layers/common/qwen3_decoder_layer.h @@ -51,6 +51,7 @@ class Qwen3DecoderImpl : public torch::nn::Module { private: Qwen3Attention attention_{nullptr}; DenseMLP mlp_{nullptr}; + RmsNorm input_norm_{nullptr}; RmsNorm post_norm_{nullptr}; diff --git a/xllm/core/layers/common/qwen3_moe_decoder_layer.h b/xllm/core/layers/common/qwen3_moe_decoder_layer.h index 44895629..07280ead 100644 --- a/xllm/core/layers/common/qwen3_moe_decoder_layer.h +++ b/xllm/core/layers/common/qwen3_moe_decoder_layer.h @@ -51,6 +51,7 @@ class Qwen3MoeDecoderImpl : public torch::nn::Module { Qwen3Attention attention_{nullptr}; DenseMLP mlp_{nullptr}; FusedMoE moe_mlp_{nullptr}; + RmsNorm input_norm_{nullptr}; RmsNorm post_norm_{nullptr}; }; diff --git a/xllm/core/layers/common/rotary_embedding.cpp b/xllm/core/layers/common/rotary_embedding.cpp index 1280e29c..b3ee6e5e 100644 --- a/xllm/core/layers/common/rotary_embedding.cpp +++ b/xllm/core/layers/common/rotary_embedding.cpp @@ -45,6 +45,12 @@ RotaryEmbeddingImpl::RotaryEmbeddingImpl(int rotary_dim, t = t.to(dev_options); const auto freqs = torch::einsum("i,j->ij", {t, inv_freq}); +#if defined(USE_NPU) + const auto cos_sin = + torch::cat({freqs.cos(), freqs.sin()}, /*dim=*/-1).contiguous(); + cos_sin_cache_ = register_buffer("cos_sin_cache", cos_sin.to(options)); + auto cos_sin_vec = cos_sin_cache_.chunk(2, /*dim=*/-1); +#else // Create cos and sin embeddings. torch::Tensor emd; if (interleaved) { @@ -61,6 +67,7 @@ RotaryEmbeddingImpl::RotaryEmbeddingImpl(int rotary_dim, auto cos_sin_vec = cos_sin_cache_.chunk(2, /*dim=*/-1); cos_ = cos_sin_vec[0].view({-1, rotary_dim}); sin_ = cos_sin_vec[1].view({-1, rotary_dim}); +#endif } void RotaryEmbeddingImpl::forward(torch::Tensor& q, @@ -82,8 +89,12 @@ void RotaryEmbeddingImpl::forward(torch::Tensor& q, xllm::kernel::RotaryParams rotary_params; rotary_params.q = q; rotary_params.k = k; +#if defined(USE_NPU) + rotary_params.positions = positions; +#else rotary_params.sin = sin_; rotary_params.cos = cos_; +#endif rotary_params.cos_sin = cos_sin_cache_; rotary_params.position_ids = position_ids; rotary_params.cu_query_lens = cu_query_lens; diff --git a/xllm/core/layers/linear.h b/xllm/core/layers/linear.h index 7870dbeb..63252c45 100644 --- a/xllm/core/layers/linear.h +++ b/xllm/core/layers/linear.h @@ -18,14 +18,10 @@ limitations under the License. 
#include #include -#if defined(USE_MLU) #include "common/linear_impl.h" -#endif - namespace xllm { namespace layer { -#if defined(USE_MLU) class ColumnParallelLinear : public torch::nn::ModuleHolder { public: @@ -123,7 +119,6 @@ class ReplicatedLinear : public torch::nn::ModuleHolder { quant_args, options)) {} }; -#endif } // namespace layer } // namespace xllm diff --git a/xllm/core/layers/lm_head.h b/xllm/core/layers/lm_head.h index 3b6fcd49..3b3210eb 100644 --- a/xllm/core/layers/lm_head.h +++ b/xllm/core/layers/lm_head.h @@ -17,9 +17,8 @@ limitations under the License. #if defined(USE_NPU) #include "npu/npu_lm_head_impl.h" -#else -#include "common/linear_impl.h" #endif +#include "common/linear_impl.h" namespace xllm { namespace layer { @@ -33,6 +32,33 @@ class LmHead : public torch::nn::ModuleHolder { LmHead(const ModelContext& context) : ModuleHolder(std::make_shared(context)) {} }; + +/** + * TODO: Rename the original LmHead definition to NpuLmHead, + * and define the current one as LmHead to unify NPU's LmHead + * related code with MLU and GPU + */ +class LmHeadNative : public torch::nn::ModuleHolder { + public: + using torch::nn::ModuleHolder::ModuleHolder; + using Impl __attribute__((__unused__)) = ColumnParallelLinearImpl; + + LmHeadNative(int64_t in_features, + int64_t out_features, + bool bias, + bool gather_output, + const QuantArgs& quant_args, + const ParallelArgs& parallel_args, + const torch::TensorOptions& options) + : ModuleHolder(std::make_shared(in_features, + out_features, + bias, + gather_output, + quant_args, + parallel_args, + options)) {} +}; + #else class LmHead : public torch::nn::ModuleHolder { public: diff --git a/xllm/core/layers/npu/npu_rms_norm_impl.h b/xllm/core/layers/npu/npu_rms_norm_impl.h index fa1af2c4..fb4f469b 100644 --- a/xllm/core/layers/npu/npu_rms_norm_impl.h +++ b/xllm/core/layers/npu/npu_rms_norm_impl.h @@ -54,7 +54,7 @@ class NpuRmsNormImpl : public NpuBaseLayer { void merge_loaded_weights() override; - torch::Tensor forward(torch::Tensor& x, int nodeId); + torch::Tensor forward(torch::Tensor& x, int nodeId = 0); private: int64_t init_layer() override; diff --git a/xllm/core/layers/npu/npu_word_embedding_impl.cpp b/xllm/core/layers/npu/npu_word_embedding_impl.cpp index eb4a09b9..5c4ea046 100644 --- a/xllm/core/layers/npu/npu_word_embedding_impl.cpp +++ b/xllm/core/layers/npu/npu_word_embedding_impl.cpp @@ -125,6 +125,11 @@ torch::Tensor NpuWordEmbeddingImpl::forward(const torch::Tensor& x, void NpuWordEmbeddingImpl::build_node_variant_pack(atb_speed::Model::Node& node, const torch::Tensor& x) { + if (!node.operation) { + throw std::runtime_error( + "node.operation is null in build_node_variant_pack"); + } + internalTensors = atb_speed::Utils::AtTensor2Tensor(x); // node.outTensors[0] = &internalTensors; @@ -133,6 +138,13 @@ void NpuWordEmbeddingImpl::build_node_variant_pack(atb_speed::Model::Node& node, inTensorDescs.resize(node.variantPack.inTensors.size()); atb::SVector outTensorDescs; + + auto output_num = node.operation->GetOutputNum(); + if (output_num <= 0) { + throw std::runtime_error("Invalid output number: " + + std::to_string(output_num)); + } + outTensorDescs.reserve(node.operation->GetOutputNum()); outTensorDescs.resize(node.operation->GetOutputNum()); diff --git a/xllm/core/layers/qwen3_decoder_layer.h b/xllm/core/layers/qwen3_decoder_layer.h index 324738d5..ec2c311e 100644 --- a/xllm/core/layers/qwen3_decoder_layer.h +++ b/xllm/core/layers/qwen3_decoder_layer.h @@ -17,14 +17,14 @@ limitations under the License. 
#if defined(USE_NPU) #include "npu/npu_qwen3_decoder_layer_impl.h" -#else -#include "common/qwen3_decoder_layer.h" #endif +#include "common/qwen3_decoder_layer.h" + namespace xllm { namespace layer { -#if defined(USE_NPU) +#if defined(USE_NPU) && !defined(USE_NPU_TORCH) class Qwen3DecoderLayer : public torch::nn::ModuleHolder { public: diff --git a/xllm/core/layers/rms_norm.h b/xllm/core/layers/rms_norm.h index d8920c68..3065810f 100644 --- a/xllm/core/layers/rms_norm.h +++ b/xllm/core/layers/rms_norm.h @@ -16,23 +16,23 @@ limitations under the License. #pragma once #if defined(USE_NPU) #include "npu/npu_rms_norm_impl.h" -#else -#include "common/fuse_norm.h" #endif +#include "common/fuse_norm.h" namespace xllm { namespace layer { #if defined(USE_NPU) -class RmsNorm : public torch::nn::ModuleHolder { +class NpuRmsNorm : public torch::nn::ModuleHolder { public: using torch::nn::ModuleHolder::ModuleHolder; using Impl __attribute__((__unused__)) = NpuRmsNormImpl; - RmsNorm(const ModelContext& context) + NpuRmsNorm(const ModelContext& context) : ModuleHolder(std::make_shared(context)) {} }; -#else + +#endif class RmsNorm : public torch::nn::ModuleHolder { public: using torch::nn::ModuleHolder::ModuleHolder; @@ -41,7 +41,6 @@ class RmsNorm : public torch::nn::ModuleHolder { RmsNorm(int64_t dim, double eps, const torch::TensorOptions& options) : ModuleHolder(std::make_shared(dim, eps, options)) {} }; -#endif } // namespace layer } // namespace xllm diff --git a/xllm/core/layers/word_embedding.h b/xllm/core/layers/word_embedding.h index c377dcc2..6df992b1 100644 --- a/xllm/core/layers/word_embedding.h +++ b/xllm/core/layers/word_embedding.h @@ -17,9 +17,8 @@ limitations under the License. #if defined(USE_NPU) #include "npu/npu_word_embedding_impl.h" -#else -#include "common/word_embedding_impl.h" #endif +#include "common/word_embedding_impl.h" namespace xllm { namespace layer { @@ -33,6 +32,26 @@ class WordEmbedding : public torch::nn::ModuleHolder { : ModuleHolder(std::make_shared(context)) {} }; +/** + * TODO: Rename the original WordEmbedding definition to NpuWordEmbedding, + * and define the current one as WordEmbedding to unify NPU's WordEmbedding + * related code with MLU and GPU + */ + +class WordEmbeddingNative : public torch::nn::ModuleHolder { + public: + using torch::nn::ModuleHolder::ModuleHolder; + using Impl __attribute__((__unused__)) = WordEmbeddingImpl; + WordEmbeddingNative(int64_t num_embeddings, + int64_t embedding_dim, + const ParallelArgs& parallel_args, + const torch::TensorOptions& options) + : ModuleHolder(std::make_shared(num_embeddings, + embedding_dim, + parallel_args, + options)) {} +}; + #else class WordEmbedding : public torch::nn::ModuleHolder { diff --git a/xllm/core/runtime/CMakeLists.txt b/xllm/core/runtime/CMakeLists.txt index 54b10152..594c017f 100644 --- a/xllm/core/runtime/CMakeLists.txt +++ b/xllm/core/runtime/CMakeLists.txt @@ -61,6 +61,7 @@ cc_library( :state_dict :dit_cache $<$:npu_layers> + $<$:common_layers> :model :models :sampler diff --git a/xllm/models/llm/deepseek_v2.h b/xllm/models/llm/deepseek_v2.h index 010993a4..eb094617 100644 --- a/xllm/models/llm/deepseek_v2.h +++ b/xllm/models/llm/deepseek_v2.h @@ -140,7 +140,7 @@ class DeepseekV2ModelImpl : public torch::nn::Module { blocks_->push_back(block); } - norm_ = register_module("norm", layer::RmsNorm(context)); + norm_ = register_module("norm", layer::NpuRmsNorm(context)); // dp_size_=4; dp_size_ = parallel_args.dp_size(); std::vector indices; @@ -289,7 +289,7 @@ class DeepseekV2ModelImpl : 
public torch::nn::Module { std::vector> pos_embs_; std::vector atb_pos_embs_; layer::AttentionMask attn_mask_; - layer::RmsNorm norm_{nullptr}; + layer::NpuRmsNorm norm_{nullptr}; }; TORCH_MODULE(DeepseekV2Model); diff --git a/xllm/models/llm/deepseek_v2_mtp.h b/xllm/models/llm/deepseek_v2_mtp.h index 7960711c..9cb10dd6 100644 --- a/xllm/models/llm/deepseek_v2_mtp.h +++ b/xllm/models/llm/deepseek_v2_mtp.h @@ -81,11 +81,11 @@ class DeepseekV2MtpModelImpl : public torch::nn::Module { sm_scale, options)); atb_pos_embs_.push_back(layer::PosEmbedding(context)); - eh_projs_.push_back(layer::ColumnParallelLinear(context)); + eh_projs_.push_back(layer::NpuColumnParallelLinear(context)); } - enorm_ = register_module("enorm", layer::RmsNorm(context)); - hnorm_ = register_module("hnorm", layer::RmsNorm(context)); - final_norm_ = register_module("final_norm", layer::RmsNorm(context)); + enorm_ = register_module("enorm", layer::NpuRmsNorm(context)); + hnorm_ = register_module("hnorm", layer::NpuRmsNorm(context)); + final_norm_ = register_module("final_norm", layer::NpuRmsNorm(context)); // dp_size_=4; dp_size_ = parallel_args.dp_size(); @@ -241,10 +241,10 @@ class DeepseekV2MtpModelImpl : public torch::nn::Module { std::vector> pos_embs_; std::vector atb_pos_embs_; layer::AttentionMask attn_mask_; - std::vector eh_projs_; - layer::RmsNorm enorm_{nullptr}; - layer::RmsNorm hnorm_{nullptr}; - layer::RmsNorm final_norm_{nullptr}; + std::vector eh_projs_; + layer::NpuRmsNorm enorm_{nullptr}; + layer::NpuRmsNorm hnorm_{nullptr}; + layer::NpuRmsNorm final_norm_{nullptr}; }; TORCH_MODULE(DeepseekV2MtpModel); diff --git a/xllm/models/llm/glm4_moe.h b/xllm/models/llm/glm4_moe.h index 79dbefd7..913de60a 100644 --- a/xllm/models/llm/glm4_moe.h +++ b/xllm/models/llm/glm4_moe.h @@ -104,7 +104,7 @@ class Glm4MoeModelImpl : public torch::nn::Module { blocks_->push_back(block); } - norm_ = register_module("norm", layer::RmsNorm(context)); + norm_ = register_module("norm", layer::NpuRmsNorm(context)); dp_size_ = parallel_args.dp_size(); std::vector indices; dp_local_tp_size_ = parallel_args.world_size() / dp_size_; @@ -244,7 +244,7 @@ class Glm4MoeModelImpl : public torch::nn::Module { torch::Dtype dtype_; layer::WordEmbedding embed_tokens_{nullptr}; layer::AttentionMask attn_mask_; - layer::RmsNorm norm_{nullptr}; + layer::NpuRmsNorm norm_{nullptr}; torch::Tensor cos_sin_; layer::PosEmbedding atb_pos_emb_{nullptr}; }; diff --git a/xllm/models/llm/glm4_moe_mtp.h b/xllm/models/llm/glm4_moe_mtp.h index 5c005a24..578051da 100644 --- a/xllm/models/llm/glm4_moe_mtp.h +++ b/xllm/models/llm/glm4_moe_mtp.h @@ -60,10 +60,11 @@ class Glm4MoeMtpModelImpl : public torch::nn::Module { blocks_->push_back(block); } - eh_proj_ = register_module("eh_proj", layer::ColumnParallelLinear(context)); - enorm_ = register_module("enorm", layer::RmsNorm(context)); - hnorm_ = register_module("hnorm", layer::RmsNorm(context)); - final_norm_ = register_module("final_norm", layer::RmsNorm(context)); + eh_proj_ = + register_module("eh_proj", layer::NpuColumnParallelLinear(context)); + enorm_ = register_module("enorm", layer::NpuRmsNorm(context)); + hnorm_ = register_module("hnorm", layer::NpuRmsNorm(context)); + final_norm_ = register_module("final_norm", layer::NpuRmsNorm(context)); dp_size_ = parallel_args.dp_size(); std::vector indices; @@ -229,10 +230,10 @@ class Glm4MoeMtpModelImpl : public torch::nn::Module { layer::AttentionMask attn_mask_; torch::Tensor cos_sin_; layer::PosEmbedding atb_pos_emb_{nullptr}; - layer::ColumnParallelLinear 
eh_proj_{nullptr}; - layer::RmsNorm enorm_{nullptr}; - layer::RmsNorm hnorm_{nullptr}; - layer::RmsNorm final_norm_{nullptr}; + layer::NpuColumnParallelLinear eh_proj_{nullptr}; + layer::NpuRmsNorm enorm_{nullptr}; + layer::NpuRmsNorm hnorm_{nullptr}; + layer::NpuRmsNorm final_norm_{nullptr}; }; TORCH_MODULE(Glm4MoeMtpModel); diff --git a/xllm/models/llm/llama.h b/xllm/models/llm/llama.h index e8516942..df3e76bd 100644 --- a/xllm/models/llm/llama.h +++ b/xllm/models/llm/llama.h @@ -115,7 +115,7 @@ class LlamaModelImpl : public torch::nn::Module { layers_.reserve(context.get_model_args().n_layers()); embed_tokens_ = register_module("embed_tokens", layer::WordEmbedding(context)); - norm_ = register_module("norm", layer::RmsNorm(context)); + norm_ = register_module("norm", layer::NpuRmsNorm(context)); std::tie(cos_pos_, sin_pos_) = get_llama_rotary_embedding(128, @@ -230,7 +230,7 @@ class LlamaModelImpl : public torch::nn::Module { int device_id_ = 0; layer::AttentionMask attn_mask_; layer::WordEmbedding embed_tokens_{nullptr}; - layer::RmsNorm norm_{nullptr}; + layer::NpuRmsNorm norm_{nullptr}; torch::nn::ModuleList blocks_{nullptr}; // hold same data but different type as blocks_ to avoid type cast diff --git a/xllm/models/llm/llm_model_base.h b/xllm/models/llm/llm_model_base.h index 7b4212be..826c8dc6 100644 --- a/xllm/models/llm/llm_model_base.h +++ b/xllm/models/llm/llm_model_base.h @@ -17,6 +17,8 @@ limitations under the License. #if defined(USE_NPU) #include +#include + #endif #include #include @@ -32,14 +34,14 @@ limitations under the License. #include "core/framework/model_context.h" #include "core/layers/attention_mask.h" #include "core/layers/block_copy.h" +#include "core/layers/common/attention.h" #include "core/layers/lm_head.h" #include "core/layers/pos_embedding.h" #include "core/layers/rms_norm.h" #include "models/model_registry.h" + #if defined(USE_NPU) #include "xllm_kernels/core/include/atb_speed/log.h" -#else -#include "core/layers/common/attention.h" #endif namespace xllm { @@ -81,12 +83,12 @@ class LlmDecoderLayerImplBase : public torch::nn::Module { LlmDecoderLayerImplBase(const ModelContext& context) { // register submodules decoder_layer_ = register_module("decoder_layer", DecoderType(context)); -#if defined(USE_NPU) +#if defined(USE_NPU) && !defined(USE_NPU_TORCH) block_copy_ = register_module("block_copy", layer::BlockCopy(context)); #endif } -#if defined(USE_NPU) +#if defined(USE_NPU) && !defined(USE_NPU_TORCH) virtual torch::Tensor forward(std::vector& x, std::vector& cos_pos, std::vector& sin_pos, @@ -96,7 +98,7 @@ class LlmDecoderLayerImplBase : public torch::nn::Module { int node_id, std::vector event, std::vector*> event_flag) { -#if defined(USE_NPU) +#if defined(USE_NPU) && !defined(USE_NPU_TORCH) auto micro_batch_num = x.size(); for (auto i = 0; i < micro_batch_num; ++i) { if (input_params[i].src_block_indices.numel() > 0) { @@ -125,11 +127,11 @@ class LlmDecoderLayerImplBase : public torch::nn::Module { } virtual void merge_loaded_weights() { decoder_layer_->merge_loaded_weights(); -#if defined(USE_NPU) +#if defined(USE_NPU) && !defined(USE_NPU_TORCH) block_copy_->merge_loaded_weights(); #endif } -#elif defined(USE_MLU) +#elif defined(USE_MLU) || defined(USE_NPU_TORCH) virtual torch::Tensor forward(torch::Tensor& x, torch::Tensor& positions, const layer::AttentionMetadata& attn_metadata, @@ -147,7 +149,7 @@ class LlmDecoderLayerImplBase : public torch::nn::Module { private: DecoderType decoder_layer_{nullptr}; -#if defined(USE_NPU) +#if defined(USE_NPU) && 
!defined(USE_NPU_TORCH) layer::BlockCopy block_copy_{nullptr}; #endif }; @@ -165,7 +167,11 @@ class LlmModelImplBase : public torch::nn::Module { torch::Tensor get_input_embeddings(torch::Tensor input_ids) { #if defined(USE_NPU) +#if defined(USE_NPU_TORCH) + return embed_tokens_native_[0](input_ids); +#else return embed_tokens_[0](input_ids, 0); +#endif #elif defined(USE_MLU) return embed_tokens_[0](input_ids); #endif @@ -203,7 +209,11 @@ class LlmModelImplBase : public torch::nn::Module { h = inputs_embeds; } else { #if defined(USE_NPU) +#if defined(USE_NPU_TORCH) + h = embed_tokens_native_[i](tokens[i]); +#else h = embed_tokens_[i](tokens[i], 0); +#endif #elif defined(USE_MLU) h = embed_tokens_[i](tokens[i]); #endif @@ -277,7 +287,7 @@ class LlmModelImplBase : public torch::nn::Module { attn_masks.push_back(std::move(attn_mask)); #endif } -#if defined(USE_NPU) +#if defined(USE_NPU) && !defined(USE_NPU_TORCH) for (size_t i = 0; i < layers_.size(); i++) { std::vector events(micro_batch_num, nullptr); std::vector*> event_flags(micro_batch_num, nullptr); @@ -307,10 +317,15 @@ class LlmModelImplBase : public torch::nn::Module { } auto cancated_h = torch::cat(hs, 0); return norm_(cancated_h, 0); -#elif defined(USE_MLU) +#elif defined(USE_MLU) || defined(USE_NPU_TORCH) bool is_prefill = input_params[0].q_max_seq_len > 1; +#if defined(USE_NPU_TORCH) + auto attn_metadata = layer::AttentionMetadata::build( + input_params[0], is_prefill, attn_masks[0]); +#else auto attn_metadata = layer::AttentionMetadata::build(input_params[0], is_prefill); +#endif torch::Tensor h; for (size_t i = 0; i < layers_.size(); i++) { @@ -325,8 +340,13 @@ class LlmModelImplBase : public torch::nn::Module { // load the weight from the checkpoint virtual void load_state_dict(const StateDict& state_dict) { for (auto i = 0; i < FLAGS_micro_batch_num; i++) { +#if defined(USE_NPU_TORCH) + embed_tokens_native_[i]->load_state_dict( + state_dict.get_dict_with_prefix("embed_tokens.")); +#else embed_tokens_[i]->load_state_dict( state_dict.get_dict_with_prefix("embed_tokens.")); +#endif } // call each layer's load_state_dict function for (int i = 0; i < layers_.size(); i++) { @@ -338,6 +358,7 @@ class LlmModelImplBase : public torch::nn::Module { #if defined(USE_NPU) virtual void verify_loaded_weights(const std::string& prefix) const { +#if !defined(USE_NPU_TORCH) for (auto i = 0; i < FLAGS_micro_batch_num; i++) { embed_tokens_[i]->verify_loaded_weights(prefix + "embed_tokens."); } @@ -346,9 +367,11 @@ class LlmModelImplBase : public torch::nn::Module { "."); } norm_->verify_loaded_weights(prefix + "norm."); +#endif } virtual void merge_loaded_weights() { +#if !defined(USE_NPU_TORCH) for (auto i = 0; i < FLAGS_micro_batch_num; i++) { embed_tokens_[i]->merge_loaded_weights(); } @@ -356,6 +379,7 @@ class LlmModelImplBase : public torch::nn::Module { layers_[i]->merge_loaded_weights(); } norm_->merge_loaded_weights(); +#endif } #endif @@ -385,7 +409,13 @@ class LlmModelImplBase : public torch::nn::Module { // test // ParallelEmbedding embed_tokens_{nullptr}; std::vector embed_tokens_; - layer::RmsNorm norm_{nullptr}; + +#if !defined(USE_NPU_TORCH) && defined(USE_NPU) + layer::NpuRmsNorm norm_{nullptr}; +#else + xllm::layer::RmsNorm norm_{nullptr}; + std::vector embed_tokens_native_; +#endif torch::nn::ModuleList blocks_{nullptr}; // hold same data but different type as blocks_ to avoid type cast @@ -406,7 +436,20 @@ class LlmForCausalLMImplBase : public torch::nn::Module { model_ = register_module("model", LlmModelType(context)); #if 
defined(USE_NPU) +#if defined(USE_NPU_TORCH) + lm_head_native_ = register_module( + "lm_head", + layer::LmHeadNative(context.get_model_args().hidden_size(), + context.get_model_args().vocab_size(), + /*bias=*/false, + /*gather_output=*/true, + QuantArgs{}, + context.get_parallel_args(), + context.get_tensor_options())); +#else lm_head_ = register_module("lm_head", layer::LmHead(context)); +#endif + #elif defined(USE_MLU) // lm_head_ is default to no quantization lm_head_ = @@ -445,7 +488,15 @@ class LlmForCausalLMImplBase : public torch::nn::Module { auto h = hidden_states; // test #if defined(USE_NPU) +#if defined(USE_NPU_TORCH) + if (seleted_idxes.defined()) { + h = h.index_select(/*dim=*/0, seleted_idxes); + } + return lm_head_native_(h); +#else return lm_head_(hidden_states, seleted_idxes, 0); +#endif + #elif defined(USE_MLU) if (seleted_idxes.defined()) { h = h.index_select(/*dim=*/0, seleted_idxes); @@ -459,6 +510,15 @@ class LlmForCausalLMImplBase : public torch::nn::Module { for (const auto& state_dict : loader->get_state_dicts()) { model_->load_state_dict( state_dict->get_dict_with_prefix(prefix + "model.")); +#if defined(USE_NPU_TORCH) + if (tie_word_embeddings) { + lm_head_native_->load_state_dict( + state_dict->get_dict_with_prefix(prefix + "model.embed_tokens.")); + } else { + lm_head_native_->load_state_dict( + state_dict->get_dict_with_prefix(prefix + "lm_head.")); + } +#else if (tie_word_embeddings) { lm_head_->load_state_dict( state_dict->get_dict_with_prefix(prefix + "model.embed_tokens.")); @@ -466,15 +526,18 @@ class LlmForCausalLMImplBase : public torch::nn::Module { lm_head_->load_state_dict( state_dict->get_dict_with_prefix(prefix + "lm_head.")); } +#endif } #if defined(USE_NPU) // verify model_->verify_loaded_weights(prefix + "model."); + model_->merge_loaded_weights(); +#if !defined(USE_NPU_TORCH) lm_head_->verify_loaded_weights(prefix + "lm_head."); - model_->merge_loaded_weights(); // test lm_head_->merge_loaded_weights(); +#endif #endif } @@ -504,6 +567,9 @@ class LlmForCausalLMImplBase : public torch::nn::Module { bool tie_word_embeddings{false}; // test layer::LmHead lm_head_{nullptr}; +#if defined(USE_NPU_TORCH) + layer::LmHeadNative lm_head_native_{nullptr}; +#endif }; } // namespace xllm diff --git a/xllm/models/llm/qwen2.h b/xllm/models/llm/qwen2.h index c510471c..d4223cae 100644 --- a/xllm/models/llm/qwen2.h +++ b/xllm/models/llm/qwen2.h @@ -42,7 +42,7 @@ class QWen2ModelImpl : public LlmModelImplBase { blocks_ = register_module("layers", torch::nn::ModuleList()); layers_.reserve(model_args.n_layers()); - norm_ = register_module("norm", layer::RmsNorm(context)); + norm_ = register_module("norm", layer::NpuRmsNorm(context)); for (auto i = 0; i < FLAGS_micro_batch_num; i++) { embed_tokens_.push_back(layer::WordEmbedding(context)); atb_pos_embeds_.push_back(layer::PosEmbedding(context)); diff --git a/xllm/models/llm/qwen3.h b/xllm/models/llm/qwen3.h index 8a104d9d..a05c11d5 100644 --- a/xllm/models/llm/qwen3.h +++ b/xllm/models/llm/qwen3.h @@ -39,9 +39,24 @@ class QWen3ModelImpl : public LlmModelImplBase { blocks_ = register_module("layers", torch::nn::ModuleList()); layers_.reserve(model_args.n_layers()); #if defined(USE_NPU) - norm_ = register_module("norm", layer::RmsNorm(context)); +#if defined(USE_NPU_TORCH) + norm_ = register_module( + "norm", + xllm::layer::RmsNorm( + model_args.hidden_size(), model_args.rms_norm_eps(), options)); +#else + norm_ = register_module("norm", layer::NpuRmsNorm(context)); +#endif for (auto i = 0; i < FLAGS_micro_batch_num; 
i++) { +#if defined(USE_NPU_TORCH) + embed_tokens_native_.push_back( + layer::WordEmbeddingNative(model_args.vocab_size(), + model_args.hidden_size(), + context.get_parallel_args(), + options)); +#else embed_tokens_.push_back(layer::WordEmbedding(context)); +#endif atb_pos_embeds_.push_back(layer::PosEmbedding(context)); } cos_sin_ = get_concat_rotary_embedding(128, diff --git a/xllm/models/llm/qwen3_moe.h b/xllm/models/llm/qwen3_moe.h index 16771fb9..9085a171 100644 --- a/xllm/models/llm/qwen3_moe.h +++ b/xllm/models/llm/qwen3_moe.h @@ -122,7 +122,7 @@ class Qwen3MoeModelImpl : public torch::nn::Module { attn_mask_ = layer::AttentionMask(options.device(), options.dtype().toScalarType(), /*mask_value=*/mask_value); - norm_ = register_module("norm", layer::RmsNorm(context)); + norm_ = register_module("norm", layer::NpuRmsNorm(context)); mapping_data_ = parallel_args.mapping_data(); #elif defined(USE_MLU) norm_ = register_module( @@ -274,10 +274,13 @@ class Qwen3MoeModelImpl : public torch::nn::Module { torch::Dtype dtype_; layer::WordEmbedding embed_tokens_{nullptr}; layer::AttentionMask attn_mask_; - layer::RmsNorm norm_{nullptr}; + #if defined(USE_NPU) torch::Tensor cos_sin_; layer::PosEmbedding atb_pos_emb_{nullptr}; + layer::NpuRmsNorm norm_{nullptr}; +#else + layer::RmsNorm norm_{nullptr}; #endif }; TORCH_MODULE(Qwen3MoeModel); diff --git a/xllm/models/models.h b/xllm/models/models.h index 5c77ce86..4427ad7b 100644 --- a/xllm/models/models.h +++ b/xllm/models/models.h @@ -15,7 +15,7 @@ limitations under the License. #pragma once -#if defined(USE_NPU) +#if defined(USE_NPU) && !defined(USE_NPU_TORCH) #include "dit/pipeline_flux.h" // IWYU pragma: keep #include "dit/pipeline_flux_fill.h" // IWYU pragma: keep #include "llm/deepseek_v2.h" // IWYU pragma: keep @@ -35,4 +35,6 @@ limitations under the License. #include "llm/llm_model_base.h" // IWYU pragma: keep #include "llm/qwen3.h" // IWYU pragma: keep -#include "llm/qwen3_moe.h" // IWYU pragma: keep +#if !defined(USE_NPU_TORCH) +#include "llm/qwen3_moe.h" // IWYU pragma: keep +#endif \ No newline at end of file diff --git a/xllm/models/vlm/qwen2_5_vl.h b/xllm/models/vlm/qwen2_5_vl.h index a05148c3..e7cef340 100644 --- a/xllm/models/vlm/qwen2_5_vl.h +++ b/xllm/models/vlm/qwen2_5_vl.h @@ -287,7 +287,7 @@ class Qwen2_5_VisionPatchMergerImpl : public torch::nn::Module { hidden_size_ = context_dim * static_cast(std::pow(spatial_merge_size, 2)); - ln_q_ = register_module("ln_q", layer::RmsNorm(context)); + ln_q_ = register_module("ln_q", layer::NpuRmsNorm(context)); auto cpl = torch::nn::Linear( torch::nn::LinearOptions(hidden_size_, hidden_size_).bias(true)); @@ -361,7 +361,7 @@ class Qwen2_5_VisionPatchMergerImpl : public torch::nn::Module { private: int64_t hidden_size_; - layer::RmsNorm ln_q_{nullptr}; + layer::NpuRmsNorm ln_q_{nullptr}; torch::nn::Sequential mlp_{nullptr}; std::tuple layers_ = { nullptr,
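
For reviewers, a minimal usage sketch (not part of the patch) of how the decode path is expected to drive the new USE_NPU branch of xllm::kernel::batch_decode(). The setup mirrors the decode branch of AttentionImpl::forward in xllm/core/layers/common/attention.cpp; any names not shown in this diff (query, output, k_cache, v_cache, block_table, attn_metadata, scale, num_heads, num_kv_heads, head_size) are assumed to be prepared by the caller.

// Sketch only, assuming the caller has already run reshape_paged_cache()
// and built AttentionMetadata through the NPU overload (attn_mask, seq_lens).
xllm::kernel::AttentionParams attention_params;
attention_params.query = query.view({-1, num_heads, head_size});   // NPU decode path uses 3-D views
attention_params.output = output.view({-1, num_heads, head_size});
attention_params.k_cache = k_cache;
attention_params.v_cache = v_cache;
attention_params.block_table = block_table;                        // KV-cache page table, assumed available to the caller
attention_params.scale = scale;
// Fields added in this patch, consumed only by npu::batch_decode:
attention_params.num_heads = num_heads;
attention_params.num_kv_heads = num_kv_heads;
attention_params.seq_lens = attn_metadata.seq_lens;                 // kv lengths moved to CPU in AttentionMetadata::build
xllm::kernel::batch_decode(attention_params);                       // dispatches to npu::batch_decode under USE_NPU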