diff --git a/CMakeLists.txt b/CMakeLists.txt
index c7765ee7..7cb317f4 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -298,6 +298,7 @@ else()
 endif()
 
 if(USE_NPU)
+  # add_definitions(-DUSE_NPU_TORCH)
   add_definitions(-DUSE_NPU)
   add_definitions(-DBUILD_LIBTORCH)
   add_definitions(-DTORCH_SETCUSTOMHANDLER=ON)
@@ -309,6 +310,7 @@ if(USE_NPU)
     $ENV{PYTORCH_INSTALL_PATH}/include
     $ENV{PYTORCH_INSTALL_PATH}/include/torch/csrc/api/include
     $ENV{PYTORCH_NPU_INSTALL_PATH}/include
+    $ENV{PYTORCH_INSTALL_PATH}/include/torch/csrc/distributed
     $ENV{NPU_HOME_PATH}/include
     $ENV{ATB_HOME_PATH}/include
     $ENV{NPU_HOME_PATH}/opp/vendors/xllm/op_api/include/
diff --git a/cmake/cc_test.cmake b/cmake/cc_test.cmake
index ce5dd0dc..ccaf449c 100644
--- a/cmake/cc_test.cmake
+++ b/cmake/cc_test.cmake
@@ -69,6 +69,14 @@ function(cc_test)
     PRIVATE ${CC_TEST_LINKOPTS}
   )
 
+  if(USE_NPU)
+    set(COMMON_LIBS Python::Python torch_npu torch_python)
+  endif()
+
+  if(USE_NPU AND DEFINED COMMON_LIBS)
+    target_link_libraries(${CC_TEST_NAME} PRIVATE ${COMMON_LIBS})
+  endif()
+
   add_dependencies(all_tests ${CC_TEST_NAME})
 
   gtest_add_tests(
diff --git a/xllm/CMakeLists.txt b/xllm/CMakeLists.txt
index e95742df..7714647a 100644
--- a/xllm/CMakeLists.txt
+++ b/xllm/CMakeLists.txt
@@ -34,7 +34,7 @@ target_link_libraries(xllm PRIVATE glog::glog brpc leveldb::leveldb ZLIB::ZLIB p
 add_dependencies(xllm brpc-static)
 
 if(USE_NPU)
-  set(COMMON_LIBS Python::Python ascendcl hccl c_sec nnopbase ms_tools_ext)
+  set(COMMON_LIBS Python::Python ascendcl hccl c_sec nnopbase ms_tools_ext torch_npu torch_python)
 elseif(USE_MLU)
   set(COMMON_LIBS Python::Python)
 endif()
diff --git a/xllm/core/common/CMakeLists.txt b/xllm/core/common/CMakeLists.txt
index 3410b2e5..76877872 100644
--- a/xllm/core/common/CMakeLists.txt
+++ b/xllm/core/common/CMakeLists.txt
@@ -28,6 +28,7 @@ cc_library(
     absl::random_random
     absl::strings
     torch
+    $<$:torch_python>
     $<$:torch_npu>
     $<$:mspti>
     $<$:ms_tools_ext>
diff --git a/xllm/core/common/global_flags.cpp b/xllm/core/common/global_flags.cpp
index 30b9b4e3..0d8cd3cd 100644
--- a/xllm/core/common/global_flags.cpp
+++ b/xllm/core/common/global_flags.cpp
@@ -389,3 +389,5 @@ DEFINE_string(reasoning_parser,
 
 // --- qwen3 reranker config ---
 DEFINE_bool(enable_qwen3_reranker, false, "Whether to enable qwen3 reranker.");
+
+DEFINE_bool(enable_native_npu, true, "Whether to enable native NPU support.");
\ No newline at end of file
diff --git a/xllm/core/common/global_flags.h b/xllm/core/common/global_flags.h
index 5c79a7c3..49846b8d 100644
--- a/xllm/core/common/global_flags.h
+++ b/xllm/core/common/global_flags.h
@@ -202,3 +202,5 @@ DECLARE_bool(enable_qwen3_reranker);
 DECLARE_string(reasoning_parser);
 
 DECLARE_bool(enable_shm);
+
+DECLARE_bool(enable_native_npu);
\ No newline at end of file
diff --git a/xllm/core/distributed_runtime/spawn_worker_server/CMakeLists.txt b/xllm/core/distributed_runtime/spawn_worker_server/CMakeLists.txt
index 7fbae3e5..abc6d9f7 100644
--- a/xllm/core/distributed_runtime/spawn_worker_server/CMakeLists.txt
+++ b/xllm/core/distributed_runtime/spawn_worker_server/CMakeLists.txt
@@ -12,6 +12,7 @@ cc_binary(
     :models
     :model
     :distributed_runtime
+    :parallel_state
     absl::strings
     xllm_kernels
     ascendcl
diff --git a/xllm/core/distributed_runtime/worker_server.cpp b/xllm/core/distributed_runtime/worker_server.cpp
index 9b631587..99fb22a5 100644
--- a/xllm/core/distributed_runtime/worker_server.cpp
+++ b/xllm/core/distributed_runtime/worker_server.cpp
@@ -100,6 +100,12 @@ void WorkerServer::create_server(
   const ParallelArgs* parallel_args = comm.parallel_args();
 #if defined(USE_MLU) || defined(USE_CUDA)
   comm.create_process_groups(master_node_addr, device);
+#elif defined(USE_NPU)
+  // TODO: Refactor to use model_type or other appropriate enumeration for
+  // condition checking
+  if (FLAGS_enable_native_npu) {
+    comm.create_process_groups(master_node_addr, device);
+  }
 #endif
 
   WorkerType worker_type =
diff --git a/xllm/core/framework/model/CMakeLists.txt b/xllm/core/framework/model/CMakeLists.txt
index 9bdd452d..4bcc6ffb 100644
--- a/xllm/core/framework/model/CMakeLists.txt
+++ b/xllm/core/framework/model/CMakeLists.txt
@@ -17,10 +17,10 @@ set(BASE_DEPS
 if(USE_NPU)
   list(APPEND BASE_DEPS :npu_layers)
   list(APPEND BASE_DEPS :platform_npu)
-else()
-  list(APPEND BASE_DEPS :common_layers)
 endif()
 
+list(APPEND BASE_DEPS :common_layers)
+
 # Define the library
 cc_library(
diff --git a/xllm/core/framework/parallel_state/collective_communicator.cpp b/xllm/core/framework/parallel_state/collective_communicator.cpp
index c0066be0..8225cdf7 100644
--- a/xllm/core/framework/parallel_state/collective_communicator.cpp
+++ b/xllm/core/framework/parallel_state/collective_communicator.cpp
@@ -18,6 +18,9 @@ limitations under the License.
 #include "mapping_npu.h"
 
 #if defined(USE_NPU)
+#include 
+
+#include "npu_process_group.h"
 #include "xllm_kernels/core/include/atb_speed/base/external_comm_manager.h"
 #include "xllm_kernels/core/include/atb_speed/utils/singleton.h"
 #include "xllm_kernels/models/base/param/mapping.h"
@@ -30,23 +33,6 @@ limitations under the License.
 #include "parallel_args.h"
 #include "util/net.h"
 
-namespace {
-#if defined(USE_NPU)
-std::unique_ptr create_process_group(
-    int rank,
-    int world_size,
-    int rank_size,
-    int port,
-    bool trans,
-    const std::string& host,
-    const std::string& group_name,
-    const torch::Device& device) {
-  LOG(FATAL) << "Unsupported device type";
-  return nullptr;
-}
-#endif
-}  // namespace
-
 namespace xllm {
 
 CollectiveCommunicator::CollectiveCommunicator(int global_rank,
diff --git a/xllm/core/framework/parallel_state/npu_process_group.cpp b/xllm/core/framework/parallel_state/npu_process_group.cpp
index eff99922..b401c437 100644
--- a/xllm/core/framework/parallel_state/npu_process_group.cpp
+++ b/xllm/core/framework/parallel_state/npu_process_group.cpp
@@ -14,6 +14,16 @@ limitations under the License.
 ==============================================================================*/
 
 #include "npu_process_group.h"
+#ifdef TORCH_HIGHER_THAN_PTA6
+#include 
+#else
+#include 
+#include 
+#endif
+
+#include 
+#include 
+#include 
 
 namespace {
@@ -24,113 +34,65 @@ namespace {
     LOG(FATAL) << "Failed, HCCL error :" << HcclGetErrorString(r); \
   }                                                                \
 } while (0)
+}  // namespace
 
-inline bool is_npu(const at::Tensor& tensor) {
-  if (!tensor.defined()) {
-    return false;
-  }
-  return tensor.device().is_privateuseone();
-}
-
-inline bool is_npu(const at::TensorOptions& options) {
-  return options.device().is_privateuseone();
-}
+namespace xllm {
 
-inline bool is_npu(const at::Device& device) {
-  return device.is_privateuseone();
-}
+ProcessGroupHCCL::ProcessGroupHCCL(int global_rank,
+                                   int world_size,
+                                   int rank_size,
+                                   int port,
+                                   bool trans,
+                                   const std::string& host,
+                                   const std::string& group_name,
+                                   const torch::Device& device)
+    : ProcessGroup(device) {
+  c10::intrusive_ptr hccl_pg_options =
+      c10d_npu::ProcessGroupHCCL::Options::create();
+  // hccl_pg_options->group_name = group_name;
+  int rank = global_rank;
+  if (world_size != rank_size) {
+    auto [local_rank, group_ranks] =
+        get_group_rank(world_size, global_rank, rank_size, trans);
+    std::vector uint32_ranks;
+    for (auto rank : group_ranks) {
+      uint32_ranks.push_back(static_cast(rank));
+    }
+    hccl_pg_options->global_ranks_in_group = uint32_ranks;
+    rank = local_rank;
+  }
 
-at::Tensor flatten_for_scatter_gather(std::vector& tensors) {
-  auto& t = tensors[0];
-  std::vector sizes{static_cast(tensors.size())};
-  sizes.insert(sizes.end(), t.sizes().begin(), t.sizes().end());
-  return at::empty(sizes, t.options());
+  auto store = create_tcp_store(host, port, rank);
+  pg_ = std::make_unique(
+      store, rank, rank_size, hccl_pg_options);
 }
 
-HcclDataType to_hccl_data_type(const torch::Tensor& input) {
-  const auto type = input.scalar_type();
-  switch (type) {
-    case at::kFloat:
-      return HCCL_DATA_TYPE_FP32;
-    case at::kHalf:
-      return HCCL_DATA_TYPE_FP16;
-    case at::kDouble:
-      return HCCL_DATA_TYPE_FP64;
-    case at::kLong:
-      return HCCL_DATA_TYPE_INT64;
-    case at::kInt:
-      return HCCL_DATA_TYPE_INT32;
-    case at::kChar:
-      return HCCL_DATA_TYPE_INT8;
-    case at::kByte:
-      return HCCL_DATA_TYPE_UINT8;
-    case at::kBool:
-      return HCCL_DATA_TYPE_UINT8;
-    case at::kBFloat16:
-      return HCCL_DATA_TYPE_BFP16;
-    default:
-      TORCH_CHECK(false, "Unconvertible HCCL type ", type);
+// Destructor.
+ProcessGroupHCCL::~ProcessGroupHCCL() {
+  if (pg_) {
+    pg_->shutdown();
+  } else {
+    HCCLCHECK(HcclCommDestroy(comm_));
   }
 }
 
-void check_input(torch::Tensor input) {
-  CHECK(is_npu(input)) << "input should be npu tensor";
-  CHECK(input.is_contiguous()) << "input should be contiguous";
-  CHECK(!input.is_sparse()) << "input have to be npu dense tensor";
-}
-
-}  // namespace
-
-namespace xllm {
-
 ProcessGroupHCCL::ProcessGroupHCCL(int rank,
                                    int world_size,
                                    const torch::Device& device,
                                    HcclComm comm)
     : ProcessGroup(device), comm_(comm) {}
 
-// Destructor.
-ProcessGroupHCCL::~ProcessGroupHCCL() { HCCLCHECK(HcclCommDestroy(comm_)); }
-
-void ProcessGroupHCCL::allreduce(torch::Tensor& input) {
-  DCHECK(input.device() == device())
-      << "input should be on the same device as the process group";
-  check_input(input);
-  // inplace all reduce
-  // const auto count = input.numel();
-  // const auto data_type = to_hccl_data_type(input);
-  // auto stream = c10_npu::getCurrentNPUStream();
-  // torch::DeviceGuard device_guard(device());
-  // HCCLCHECK(HcclAllReduce(
-  //     /*sendbuff=*/input.data_ptr(),
-  //     /*recvbuff=*/input.data_ptr(),
-  //     /*count=*/count,
-  //     /*datatype=*/data_type,
-  //     /*op=*/HCCL_REDUCE_SUM,
-  //     /*comm=*/comm_,
-  //     /*stream=*/stream));
-}
-
-void ProcessGroupHCCL::allgather(const torch::Tensor& input,
-                                 std::vector& outputs) {
-  check_input(input);
-  // CHECK(outputs.size() == world_size())
-  //     << "outputs should have the same size as world_size";
-  // DCHECK(input.device() == device())
-  //     << "input should be on the same device as the process group";
-  // torch::DeviceGuard device_guard(device());
-  // torch::Tensor flattened_output = flatten_for_scatter_gather(outputs);
-  // const auto count = input.numel();
-  // const auto data_type = to_hccl_data_type(input);
-  // auto stream = c10_npu::getCurrentNPUStream();
-  // HCCLCHECK(HcclAllGather(
-  //     /*sendbuff=*/input.data_ptr(),
-  //     /*recvbuff=*/flattened_output.data_ptr(),
-  //     /*sendcount=*/count,
-  //     /*datatype=*/data_type,
-  //     /*comm=*/comm_,
-  //     /*stream=*/stream));
-  // // copy the flattened output tensors to the outputs.
-  // for (int i = 0; i < outputs.size(); ++i) {
-  //   outputs[i].copy_(flattened_output[i], /*non_blocking=*/true);
-  // }
+std::unique_ptr create_process_group(
+    int rank,
+    int world_size,
+    int rank_size,
+    int port,
+    bool trans,
+    const std::string& host,
+    const std::string& group_name,
+    const torch::Device& device) {
+  return std::make_unique(
+      rank, world_size, rank_size, port, trans, host, group_name, device);
 }
+
 }  // namespace xllm
\ No newline at end of file
diff --git a/xllm/core/framework/parallel_state/npu_process_group.h b/xllm/core/framework/parallel_state/npu_process_group.h
index 7ca7d23b..b0047cf4 100644
--- a/xllm/core/framework/parallel_state/npu_process_group.h
+++ b/xllm/core/framework/parallel_state/npu_process_group.h
@@ -18,6 +18,10 @@ limitations under the License.
 #include "hccl/hccl.h"
 #include "process_group.h"
 
+namespace c10d_npu {
+class ProcessGroupHCCL;
+}
+
 namespace xllm {
 
 class ProcessGroupHCCL : public ProcessGroup {
@@ -28,16 +32,30 @@ class ProcessGroupHCCL : public ProcessGroup {
                    const torch::Device& device,
                    HcclComm comm);
 
+  ProcessGroupHCCL(int rank,
+                   int world_size,
+                   int rank_size,
+                   int port,
+                   bool trans,
+                   const std::string& host,
+                   const std::string& group_name,
+                   const torch::Device& device);
+
   // Destructor.
   ~ProcessGroupHCCL() override;
 
-  void allreduce(torch::Tensor& input) override;
-
-  void allgather(const torch::Tensor& input,
-                 std::vector& outputs) override;
-
  private:
   HcclComm comm_ = nullptr;
 };
 
+std::unique_ptr create_process_group(
+    int rank,
+    int world_size,
+    int rank_size,
+    int port,
+    bool trans,
+    const std::string& host,
+    const std::string& group_name,
+    const torch::Device& device);
+
 }  // namespace xllm
\ No newline at end of file
diff --git a/xllm/core/framework/parallel_state/process_group.h b/xllm/core/framework/parallel_state/process_group.h
index ba1d67a9..85ca32bf 100644
--- a/xllm/core/framework/parallel_state/process_group.h
+++ b/xllm/core/framework/parallel_state/process_group.h
@@ -19,6 +19,11 @@ limitations under the License.
 #include 
 #include 
+
+#if defined(USE_NPU)
+#include 
+#endif
+
 namespace xllm {
 std::pair> get_group_rank(int world_size,
                                            int global_rank,
@@ -60,7 +65,11 @@ class ProcessGroup {
   torch::Device device_;
 
  protected:
+#if defined(USE_NPU)
+  std::unique_ptr pg_{nullptr};
+#else
   std::unique_ptr pg_{nullptr};
+#endif
 };
 
 }  // namespace xllm
\ No newline at end of file
diff --git a/xllm/core/kernels/npu/CMakeLists.txt b/xllm/core/kernels/npu/CMakeLists.txt
index 5553d8a0..1855db33 100644
--- a/xllm/core/kernels/npu/CMakeLists.txt
+++ b/xllm/core/kernels/npu/CMakeLists.txt
@@ -1,17 +1,29 @@
 include(cc_library)
 
-add_subdirectory(impl)
 add_subdirectory(xllm_ops)
 
+file(GLOB_RECURSE OPPLUGIN_UTILS_HEADER
+  "${CMAKE_CURRENT_LIST_DIR}/custom_functions_npu/*.h"
+  "${CMAKE_CURRENT_LIST_DIR}/ops_npu/*.h"
+  "${CMAKE_CURRENT_LIST_DIR}/*.h"
+)
+
+file(GLOB_RECURSE OPPLUGIN_UTILS_SRCS
+  "${CMAKE_CURRENT_LIST_DIR}/custom_functions_npu/*.cpp"
+  "${CMAKE_CURRENT_LIST_DIR}/ops_npu/*.cpp"
+  "${CMAKE_CURRENT_LIST_DIR}/*.cpp"
+)
+
 cc_library(
   NAME
     npu_kernels
   HDRS
-    linear.h
-    split.h
-    rms_norm.h
-    rope.h
+    ${OPPLUGIN_UTILS_HEADER}
+  SRCS
+    ${OPPLUGIN_UTILS_SRCS}
   DEPS
-    :npu_kernels_impl
-    # spdlog::spdlog
-)
\ No newline at end of file
+    :model_context
+    glog::glog
+    torch
+    torch_npu
+)
diff --git a/xllm/core/kernels/npu/rope.h b/xllm/core/kernels/npu/active.cpp
similarity index 63%
rename from xllm/core/kernels/npu/rope.h
rename to xllm/core/kernels/npu/active.cpp
index 7a075b0d..7ccfdc8d 100644
--- a/xllm/core/kernels/npu/rope.h
+++ b/xllm/core/kernels/npu/active.cpp
@@ -13,18 +13,14 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#pragma once
-#include "impl/npu_rope_impl.h"
+#include 
 
-namespace xllm::kernel {
+#include "npu_ops_api.h"
+#include "ops_npu/npu_ops.h"
 
-class Rope : public torch::nn::ModuleHolder {
- public:
-  using torch::nn::ModuleHolder::ModuleHolder;
-  using Impl __attribute__((__unused__)) = NpuRopeImpl;
+namespace xllm::kernel::npu {
 
-  Rope(const ModelContext& context)
-      : ModuleHolder(std::make_shared(context)) {}
-};
-
-}  // namespace xllm::kernel
+torch::Tensor active(const torch::Tensor& input) {
+  return at_npu::native::custom_ops::npu_swiglu(input);
+}
+}  // namespace xllm::kernel::npu
\ No newline at end of file
diff --git a/xllm/core/kernels/npu/attention.cpp b/xllm/core/kernels/npu/attention.cpp
new file mode 100644
index 00000000..bc7c64ac
--- /dev/null
+++ b/xllm/core/kernels/npu/attention.cpp
@@ -0,0 +1,61 @@
+/* Copyright 2025 The xLLM Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at + + https://github.com/jd-opensource/xllm/blob/main/LICENSE + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "npu_ops_api.h" +#include "ops_npu/npu_ops.h" +namespace xllm::kernel::npu { + +void reshape_paged_cache(torch::Tensor& key, + torch::Tensor& value, + torch::Tensor& k_cache, + torch::Tensor& v_cache, + const torch::Tensor& slot_mapping) { + atb::_npu_reshape_and_cache(key, value, k_cache, v_cache, slot_mapping); +} + +void batch_prefill(const torch::Tensor& query, + const torch::Tensor& key, + const torch::Tensor& value, + const torch::Tensor& mask, + const torch::Tensor& seq_len, + float scale, + int num_heads, + int num_kv_heads, + torch::Tensor& output) { + atb::_npu_flash_attention( + query, key, value, mask, seq_len, scale, num_heads, num_kv_heads, output); +} + +void batch_decode(const torch::Tensor& query, + const torch::Tensor& k_cache, + const torch::Tensor& v_cache, + int num_kv_heads, + int num_heads, + float scale, + const torch::Tensor& block_table, + const torch::Tensor& seq_lens, + torch::Tensor& output) { + atb::_npu_paged_attention(query, + k_cache, + v_cache, + num_kv_heads, + num_heads, + scale, + block_table, + seq_lens, + output); +} + +} // namespace xllm::kernel::npu \ No newline at end of file diff --git a/xllm/core/kernels/npu/custom_functions_npu/AtbCommon.cpp b/xllm/core/kernels/npu/custom_functions_npu/AtbCommon.cpp new file mode 100644 index 00000000..4429fcda --- /dev/null +++ b/xllm/core/kernels/npu/custom_functions_npu/AtbCommon.cpp @@ -0,0 +1,173 @@ +// Copyright (c) 2025 Huawei Technologies Co., Ltd +// All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "AtbCommon.h" + +namespace atb { +atb::Tensor AtTensor2AtbTensor(const at::Tensor at_tensor) { + static std::map dtype_map = { + {at::ScalarType::Bool, ACL_BOOL}, + {at::ScalarType::Byte, ACL_UINT8}, + {at::ScalarType::Char, ACL_INT8}, + {at::ScalarType::Half, ACL_FLOAT16}, + {at::ScalarType::Float, ACL_FLOAT}, + {at::ScalarType::Int, ACL_INT32}, + {at::ScalarType::Long, ACL_INT64}, + {at::ScalarType::BFloat16, ACL_BF16}, + {at::ScalarType::Double, ACL_DOUBLE}, + {at::ScalarType::Short, ACL_INT16}, + {at::ScalarType::ComplexHalf, ACL_COMPLEX32}, + {at::ScalarType::ComplexFloat, ACL_COMPLEX64}, + {at::ScalarType::ComplexDouble, ACL_COMPLEX128}, + }; + + TORCH_CHECK(at_tensor.is_contiguous(), "at_tensor is not contiguous"); + atb::Tensor tensor; + tensor.desc.format = atb::utils::GetFormatForAtb(at_tensor); + if (at_tensor.device().type() == at::kCPU) { + tensor.hostData = at_tensor.data_ptr(); + } else { + tensor.deviceData = at_tensor.data_ptr(); + } + + tensor.desc.shape.dimNum = at_tensor.sizes().size(); + for (uint64_t i = 0; i < at_tensor.sizes().size(); i++) { + tensor.desc.shape.dims[i] = at_tensor.sizes()[i]; + } + + auto dtype_iterator = dtype_map.find(at_tensor.scalar_type()); + TORCH_CHECK(dtype_iterator != dtype_map.end(), + "not support dtype: ", + at_tensor.scalar_type()); + tensor.desc.dtype = dtype_iterator->second; + + tensor.dataSize = atb::Utils::GetTensorSize(tensor); + + return tensor; +} + +void RunAtbCmdV1(atb::Operation* op, + const ParamSetter& paramsetter, + const std::string& name) { + aclrtStream stream = c10_npu::getCurrentNPUStream().stream(false); + auto context_ptr = atb::utils::GetContext(stream); + atb::VariantPack variant_pack = paramsetter.variant_pack_; + uint64_t workspace_size = OperationSetup(variant_pack, op, context_ptr); + at::Tensor workspace_tensor; + void* workspace_ptr = nullptr; + if (workspace_size != 0) { + at::TensorOptions options = at::TensorOptions(c10::DeviceType::PrivateUse1); + workspace_tensor = at::empty({workspace_size}, options.dtype(at::kByte)); + workspace_ptr = const_cast(workspace_tensor.storage().data()); + } + const c10::SmallVector& cpu_tensors = + paramsetter.tensor_maintainer_.cpu_tensors; + auto acl_call = [variant_pack, + workspace_ptr, + workspace_size, + context_ptr, + op, + cpu_tensors]() -> int { + auto st = op->Execute( + variant_pack, (uint8_t*)workspace_ptr, workspace_size, context_ptr); + DestroyOperation(op); + return st; + }; + at_npu::native::OpCommand::RunOpApiV2(name, acl_call); +} + +void RunAtbCmdV2(atb::Operation* op, + const ParamSetter& paramsetter, + const std::string& name) { + aclrtStream stream = c10_npu::getCurrentNPUStream().stream(false); + atb::VariantPack variant_pack = paramsetter.variant_pack_; + const c10::SmallVector& cpu_tensors = + paramsetter.tensor_maintainer_.cpu_tensors; + auto acl_call = [op, variant_pack, stream, cpu_tensors]() -> int { + auto context_ptr = atb::utils::GetContext(stream); + uint64_t workspace_size = OperationSetup(variant_pack, op, context_ptr); + at::Tensor workspace_tensor; + void* workspace_ptr = nullptr; + if (workspace_size != 0) { + workspace_tensor = + at_npu::native::allocate_workspace(workspace_size, stream); + workspace_ptr = const_cast(workspace_tensor.storage().data()); + } + auto st = op->Execute( + variant_pack, (uint8_t*)workspace_ptr, workspace_size, context_ptr); + return 0; + }; + at_npu::native::OpCommand::RunOpApiV2(name, acl_call); +} + +void RunAtbCmd(atb::Operation* op, + const ParamSetter& paramsetter, + const std::string& name) 
{ + const auto is_capturing = + static_cast(c10_npu::currentStreamCaptureStatusMayInitCtx()); + if (is_capturing) { + RunAtbCmdV1(op, paramsetter, name); + } else { + RunAtbCmdV2(op, paramsetter, name); + } +} + +ParamSetter& ParamSetter::Input(const at::Tensor& tensor, + const bool& format_trans) { + if (!tensor.defined()) { + variant_pack_.inTensors.push_back(atb::Tensor()); + return *this; + } + at::Tensor new_tensor = tensor.contiguous(); + if (format_trans) { + new_tensor = atb::utils::FormatTrans(new_tensor); + } + atb::Tensor atb_tensor; + if (new_tensor.device().type() == at::kCPU) { + auto tensor_clone = new_tensor.clone(); + atb_tensor = AtTensor2AtbTensor(tensor_clone); + tensor_maintainer_.cpu_tensors.emplace_back(std::move(tensor_clone)); + } else { + atb_tensor = AtTensor2AtbTensor(new_tensor); + tensor_maintainer_.contiguous_tensors.emplace_back(std::move(new_tensor)); + } + variant_pack_.inTensors.push_back(atb_tensor); + return *this; +} + +ParamSetter& ParamSetter::Input(const c10::optional& tensor, + const bool& format_trans) { + if (!tensor.has_value()) { + variant_pack_.inTensors.push_back(atb::Tensor()); + return *this; + } + return Input(tensor.value(), format_trans); +} + +ParamSetter& ParamSetter::Output(at::Tensor& output) { + auto atb_tensor = AtTensor2AtbTensor(output); + variant_pack_.outTensors.push_back(atb_tensor); + return *this; +} + +uint64_t OperationSetup(atb::VariantPack variant_pack, + atb::Operation* operation, + atb::Context* context_ptr) { + uint64_t workspace_size = 0; + atb::Status status = + operation->Setup(variant_pack, workspace_size, context_ptr); + TORCH_CHECK(status == 0, operation->GetName(), " setup failed!"); + return workspace_size; +} + +} // namespace atb \ No newline at end of file diff --git a/xllm/core/kernels/npu/custom_functions_npu/AtbCommon.h b/xllm/core/kernels/npu/custom_functions_npu/AtbCommon.h new file mode 100644 index 00000000..f4659eb9 --- /dev/null +++ b/xllm/core/kernels/npu/custom_functions_npu/AtbCommon.h @@ -0,0 +1,493 @@ +// Copyright (c) 2025 Huawei Technologies Co., Ltd +// All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef OPPLUGIN_UTILS_ATB_COMMON_H +#define OPPLUGIN_UTILS_ATB_COMMON_H +#include +#include +#include +#include +#include +#include + +#include "./OperationCreate.h" +#include "Utils.h" +#include "atb/atb_infer.h" + +namespace atb { + +using aclTensor = struct aclTensor; +constexpr int64_t MAX_DIM_NUM = 5; +// small vector max size +const int N = 32; + +using _aclCreateTensor = aclTensor* (*)(const int64_t* view_dims, + uint64_t view_dims_num, + aclDataType data_type, + const int64_t* stride, + int64_t offset, + aclFormat format, + const int64_t* storage_dims, + uint64_t storage_dims_num, + void* tensor_data); +using _aclDestroyTensor = int (*)(const aclTensor*); + +using AtbApiFunc = int (*)(void*, uint64_t, atb::Operation*, atb::Context*); + +#define GET_OP_API_FUNC(apiName) \ + reinterpret_cast<_##apiName>(GetApiFuncAddr(#apiName)) + +inline const char* GetAtbApiLibName(void) { return "libatb.so"; } + +inline const char* GetOpApiLibName(void) { return "libopapi.so"; } + +inline void* GetApiLibHandler(const char* libName) { + auto handler = dlopen(libName, RTLD_LAZY); + if (handler == nullptr) { + ASCEND_LOGW("dlopen %s failed, error:%s.", libName, dlerror()); + } + return handler; +} + +inline void* GetApiFuncAddrInLib(void* handler, + const char* libName, + const char* apiName) { + auto funcAddr = dlsym(handler, apiName); + if (funcAddr == nullptr) { + ASCEND_LOGW( + "dlsym %s from %s failed, error:%s.", apiName, libName, dlerror()); + } + return funcAddr; +} + +inline void* GetApiFuncAddr(const char* apiName) { + static auto atbApiHandler = GetApiLibHandler(GetAtbApiLibName()); + if (atbApiHandler != nullptr) { + auto funcAddr = + GetApiFuncAddrInLib(atbApiHandler, GetAtbApiLibName(), apiName); + if (funcAddr != nullptr) { + return funcAddr; + } + } + static auto opApiHandler = GetApiLibHandler(GetOpApiLibName()); + if (opApiHandler != nullptr) { + auto funcAddr = + GetApiFuncAddrInLib(opApiHandler, GetOpApiLibName(), apiName); + if (funcAddr != nullptr) { + return funcAddr; + } + TORCH_CHECK(false, "GetApiFuncAddr not found ", apiName); + } +} + +struct TensorMaintainer { + c10::SmallVector + contiguous_tensors; // npu tensor's life should maintain when + // uncontiguous to contiguous. + c10::SmallVector + cpu_tensors; // cpu tensor's life should maintain in taskqueue. +}; + +inline aclTensor* ConvertType(TensorMaintainer& maintainer, + const at::Tensor& tensor) { + static const auto aclCreateTensor = + reinterpret_cast<_aclCreateTensor>(GetApiFuncAddr("aclCreateTensor")); + if (aclCreateTensor == nullptr) { + return nullptr; + } + + if (!tensor.defined()) { + return nullptr; + } + at::Tensor at_tensor = tensor.contiguous(); + aclFormat format = atb::utils::GetFormatForAtb(at_tensor); + + at::ScalarType scalar_data_type = at_tensor.scalar_type(); + aclDataType acl_data_type = + atb::utils::ConvertToAclDataType(scalar_data_type); + c10::SmallVector storageDims; + // if acl_data_type is ACL_STRING, storageDims is empty. 
+ if (acl_data_type != ACL_STRING) { + TORCH_CHECK(at_tensor.itemsize() > 0, + "the itemsize of tensor must be greater than 0."); + storageDims.push_back(at_tensor.storage().nbytes() / at_tensor.itemsize()); + } + + const auto dimNum = at_tensor.sizes().size(); + auto acl_tensor = + aclCreateTensor(at_tensor.sizes().data(), + at_tensor.sizes().size(), + acl_data_type, + at_tensor.strides().data(), + at_tensor.storage_offset(), + format, + storageDims.data(), + storageDims.size(), + const_cast(at_tensor.storage().data())); + if (at_tensor.device().type() == at::kCPU) { + maintainer.cpu_tensors.emplace_back(std::move(at_tensor)); + } else { + maintainer.contiguous_tensors.emplace_back(std::move(at_tensor)); + } + return acl_tensor; +} + +inline aclTensor* ConvertType(TensorMaintainer& maintainer, + const c10::optional& opt_tensor) { + if (opt_tensor.has_value() && opt_tensor.value().defined()) { + return ConvertType(maintainer, opt_tensor.value()); + } + + return nullptr; +} + +template +T ConvertType(TensorMaintainer& maintainer, T value) { + return value; +} + +template +constexpr auto ConvertTypes(TensorMaintainer& maintainer, Ts&... args) { + return std::make_tuple(ConvertType(maintainer, args)...); +} + +struct TensorStruct { + void* data_ptr = nullptr; // at_tensor.storage().data() + at::ScalarType scalar_type; // at_tensor.scalar_type() + size_t nbytes; // at_tensor.storage().nbytes() + size_t itemsize; // at_tensor.itemsize() + int64_t storage_offset; // at_tensor.storage_offset() + std::vector sizes; // at_tensor.sizes() + std::vector strides; // at_tensor.strides() + aclFormat format; // at_tensor format + + TensorStruct(void* data_ptr_, + at::ScalarType scalar_type_, + size_t nbytes_, + size_t itemsize_, + int64_t storage_offset_, + at::IntArrayRef sizes_, + at::IntArrayRef strides_, + aclFormat format_) + : data_ptr(data_ptr_), + scalar_type(scalar_type_), + nbytes(nbytes_), + itemsize(itemsize_), + storage_offset(storage_offset_), + sizes(sizes_.vec()), + strides(strides_.vec()), + format(format_) {} +}; +using TensorStructPtr = std::shared_ptr; + +inline TensorStructPtr CopyTypeV2(TensorMaintainer& maintainer, + const at::Tensor& tensor) { + if (!tensor.defined()) { + return nullptr; + } + at::Tensor at_tensor = tensor.contiguous(); + aclFormat format = atb::utils::GetFormatForAtb(at_tensor); + std::shared_ptr tensor_structptr = + std::make_shared( + const_cast(at_tensor.storage().data()), + at_tensor.scalar_type(), + at_tensor.storage().nbytes(), + at_tensor.itemsize(), + at_tensor.storage_offset(), + at_tensor.sizes(), + at_tensor.strides(), + format); + if (at_tensor.device().type() == at::kCPU) { + maintainer.cpu_tensors.emplace_back(std::move(at_tensor)); + } else { + maintainer.contiguous_tensors.emplace_back(std::move(at_tensor)); + } + return tensor_structptr; +} + +inline TensorStructPtr CopyTypeV2(TensorMaintainer& maintainer, + const c10::optional& opt_tensor) { + if (opt_tensor.has_value() && opt_tensor.value().defined()) { + return CopyTypeV2(maintainer, opt_tensor.value()); + } + + return nullptr; +} + +template +T CopyTypeV2(TensorMaintainer& maintainer, T value) { + return value; +} + +inline aclTensor* ConvertTypeV2(TensorStructPtr at_tensor) { + static const auto aclCreateTensor = GET_OP_API_FUNC(aclCreateTensor); + if (aclCreateTensor == nullptr) { + return nullptr; + } + + if (at_tensor == nullptr) { + return nullptr; + } + at::ScalarType scalar_data_type = (*at_tensor).scalar_type; + aclDataType acl_data_type = + 
atb::utils::ConvertToAclDataType(scalar_data_type); + c10::SmallVector storageDims; + // if acl_data_type is ACL_STRING, storageDims is empty. + if (acl_data_type != ACL_STRING) { + TORCH_CHECK((*at_tensor).itemsize > 0, + "the itemsize of tensor must be greater than 0."); + storageDims.push_back((*at_tensor).nbytes / (*at_tensor).itemsize); + } + + const auto dimNum = (*at_tensor).sizes.size(); + + auto acl_tensor = aclCreateTensor((*at_tensor).sizes.data(), + (*at_tensor).sizes.size(), + acl_data_type, + (*at_tensor).strides.data(), + (*at_tensor).storage_offset, + (*at_tensor).format, + storageDims.data(), + storageDims.size(), + (*at_tensor).data_ptr); + return acl_tensor; +} + +template +T ConvertTypeV2(T value) { + return value; +} + +template +auto convert_types_impl_v2(const Tuple& t, std::index_sequence) { + return std::make_tuple(ConvertTypeV2(std::get(t))...); +} + +template +constexpr auto ConvertTypesV2(const std::tuple& args, + uint64_t* workspace_size_addr, + atb::Operation** op_addr, + atb::Context* context_ptr) { + auto convert_args = + convert_types_impl_v2(args, std::make_index_sequence{}); + auto appends = std::make_tuple(workspace_size_addr, op_addr, context_ptr); + return std::tuple_cat(convert_args, appends); +} + +template +constexpr auto CopyTypesV2(TensorMaintainer& maintainer, Ts&... args) { + return std::make_tuple(CopyTypeV2(maintainer, args)...); +} + +template +auto call(Function f, Tuple t, std::index_sequence) { + return f(std::get(t)...); +} + +template +auto call(Function f, Tuple t) { + static constexpr auto size = std::tuple_size::value; + return call(f, t, std::make_index_sequence{}); +} + +template +auto ConvertToOpApiFunc(const Tuple& params, + void* opApiAddr, + std::index_sequence) { + using OpApiFunc = + int (*)(typename std::decay(params))>::type...); + auto func = reinterpret_cast(opApiAddr); + return func; +} + +template +auto ConvertToOpApiFunc(const Tuple& params, void* opApiAddr) { + static constexpr auto size = std::tuple_size::value; + return ConvertToOpApiFunc( + params, opApiAddr, std::make_index_sequence{}); +} + +inline void Release(atb::Context* context) {} + +inline void Release(aclTensor* p) { + static const auto aclDestroyTensor = GET_OP_API_FUNC(aclDestroyTensor); + if (aclDestroyTensor == nullptr) { + return; + } + aclDestroyTensor(p); +} + +template +void Release(T value) { + (void)value; +} + +template +void CallRelease(Tuple t, std::index_sequence) { + (void)std::initializer_list{(Release(std::get(t)), 0)...}; +} + +template +void ReleaseConvertTypes(Tuple& t) { + static constexpr auto size = std::tuple_size::value; + CallRelease(t, std::make_index_sequence{}); +} + +#define EXEC_ATB_CMD_V1(atb_api, ...) 
\ + do { \ + static const auto getWorkspaceSizeFuncAddr = \ + GetApiFuncAddr(#atb_api "GetWorkspaceSize"); \ + static const auto atbApiFuncAddr = GetApiFuncAddr(#atb_api); \ + TORCH_CHECK( \ + getWorkspaceSizeFuncAddr != nullptr && atbApiFuncAddr != nullptr, \ + #atb_api, \ + " or ", \ + #atb_api "GetWorkspaceSize", \ + " not in ", \ + GetAtbApiLibName(), \ + ", or ", \ + GetAtbApiLibName(), \ + "not found."); \ + auto acl_stream = c10_npu::getCurrentNPUStream().stream(false); \ + auto context_ptr = atb::utils::GetContext(acl_stream); \ + uint64_t workspace_size = 0; \ + uint64_t* workspace_size_addr = &workspace_size; \ + atb::Operation* op = nullptr; \ + atb::Operation** op_addr = &op; \ + TensorMaintainer tensor_maintainer; \ + auto converted_params = ConvertTypes(tensor_maintainer, \ + __VA_ARGS__, \ + workspace_size_addr, \ + op_addr, \ + context_ptr); \ + static auto getWorkspaceSizeFunc = \ + ConvertToOpApiFunc(converted_params, getWorkspaceSizeFuncAddr); \ + auto workspace_status = call(getWorkspaceSizeFunc, converted_params); \ + TORCH_CHECK(workspace_status == 0, "call " #atb_api " failed, detail:"); \ + void* workspace_addr = nullptr; \ + at::Tensor workspace_tensor; \ + if (workspace_size != 0) { \ + at::TensorOptions options = \ + at::TensorOptions(c10::DeviceType::PrivateUse1); \ + workspace_tensor = \ + at::empty({workspace_size}, options.dtype(at::kByte)); \ + workspace_addr = const_cast(workspace_tensor.storage().data()); \ + } \ + const c10::SmallVector& cpu_tensors = \ + tensor_maintainer.cpu_tensors; \ + auto atb_call = [converted_params, \ + workspace_addr, \ + workspace_size, \ + context_ptr, \ + op, \ + cpu_tensors]() -> int { \ + AtbApiFunc atbApiFunc = reinterpret_cast(atbApiFuncAddr); \ + auto api_ret = \ + atbApiFunc(workspace_addr, workspace_size, op, context_ptr); \ + TORCH_CHECK(api_ret == 0, "call " #atb_api " failed, detail:"); \ + DestroyOperation(op); \ + ReleaseConvertTypes(converted_params); \ + return api_ret; \ + }; \ + at_npu::native::OpCommand::RunOpApiV2(#atb_api, atb_call); \ + } while (false) + +#define EXEC_ATB_CMD_V2(atb_api, ...) 
\ + do { \ + static const auto getWorkspaceSizeFuncAddr = \ + GetApiFuncAddr(#atb_api "GetWorkspaceSize"); \ + static const auto AtbApiFuncAddr = GetApiFuncAddr(#atb_api); \ + TORCH_CHECK( \ + getWorkspaceSizeFuncAddr != nullptr && AtbApiFuncAddr != nullptr, \ + #atb_api, \ + " or ", \ + #atb_api "GetWorkspaceSize", \ + " not in ", \ + GetAtbApiLibName(), \ + ", or ", \ + GetAtbApiLibName(), \ + "not found."); \ + auto acl_stream = c10_npu::getCurrentNPUStream().stream(false); \ + TensorMaintainer tensor_maintainer; \ + auto copied_params = CopyTypesV2(tensor_maintainer, __VA_ARGS__); \ + auto hash_id = computeHash(std::string(#atb_api), __VA_ARGS__); \ + const c10::SmallVector& cpu_tensors = \ + tensor_maintainer.cpu_tensors; \ + auto atb_call = \ + [copied_params, acl_stream, hash_id, cpu_tensors]() -> int { \ + auto context_ptr = atb::utils::GetContext(acl_stream); \ + uint64_t workspace_size = 0; \ + uint64_t* workspace_size_addr = &workspace_size; \ + OpParamCache& opParamCache = \ + OpParamCache::getInstance(); \ + atb::Operation* op = opParamCache.getOperation(hash_id); \ + atb::Operation** op_addr = &op; \ + int api_ret = 0; \ + auto converted_params = ConvertTypesV2( \ + copied_params, workspace_size_addr, op_addr, context_ptr); \ + auto getWorkspaceSizeFunc = \ + ConvertToOpApiFunc(converted_params, getWorkspaceSizeFuncAddr); \ + auto workspace_status = call(getWorkspaceSizeFunc, converted_params); \ + opParamCache.saveOperation(hash_id, op); \ + TORCH_CHECK(workspace_status == 0, \ + "call " #atb_api "GetWorkspaceSize failed"); \ + void* workspace_addr = nullptr; \ + at::Tensor workspace_tensor; \ + if (workspace_size != 0) { \ + workspace_tensor = \ + at_npu::native::allocate_workspace(workspace_size, acl_stream); \ + workspace_addr = const_cast(workspace_tensor.storage().data()); \ + } \ + AtbApiFunc atbApiFunc = reinterpret_cast(AtbApiFuncAddr); \ + api_ret = atbApiFunc(workspace_addr, workspace_size, op, context_ptr); \ + TORCH_CHECK(api_ret == 0, "call " #atb_api " failed"); \ + ReleaseConvertTypes(converted_params); \ + return api_ret; \ + }; \ + at_npu::native::OpCommand::RunOpApiV2(#atb_api, atb_call); \ + } while (false) + +#define EXEC_ATB_CMD(atb_api, ...) \ + do { \ + const auto is_capturing = \ + static_cast(c10_npu::currentStreamCaptureStatusMayInitCtx()); \ + if (is_capturing) { \ + EXEC_ATB_CMD_V1(atb_api, __VA_ARGS__); \ + } else { \ + EXEC_ATB_CMD_V2(atb_api, __VA_ARGS__); \ + } \ + } while (false) + +atb::Tensor AtTensor2AtbTensor(const at::Tensor atTensor); +atb::Context* GetContext(aclrtStream stream); +uint64_t OperationSetup(atb::VariantPack variant_pack, + atb::Operation* operation, + atb::Context* context_ptr); +class ParamSetter { + public: + ParamSetter& Input(const at::Tensor& tensor, + const bool& format_trans = false); + ParamSetter& Input(const c10::optional& tensor, + const bool& format_trans = false); + ParamSetter& Output(at::Tensor& tensor); + atb::VariantPack variant_pack_; + TensorMaintainer tensor_maintainer_; +}; + +void RunAtbCmd(atb::Operation* op, + const ParamSetter& paramsetter, + const std::string& name); + +} // namespace atb + +#endif diff --git a/xllm/core/kernels/npu/custom_functions_npu/OperationCacheCompute.cpp b/xllm/core/kernels/npu/custom_functions_npu/OperationCacheCompute.cpp new file mode 100644 index 00000000..b46abb96 --- /dev/null +++ b/xllm/core/kernels/npu/custom_functions_npu/OperationCacheCompute.cpp @@ -0,0 +1,201 @@ +// Copyright (c) 2025 Huawei Technologies Co., Ltd +// All rights reserved. 
+// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "OperationCacheCompute.h" + +namespace atb { + +thread_local char g_hash_buf[g_hash_buf_size]; +thread_local int g_hash_offset = 0; +constexpr int g_rShift33Bits = 33; +constexpr uint64_t MIX_STEP1 = 18397679294719823053LLU; +constexpr uint64_t MIX_STEP2 = 14181476777654086739LLU; + +void add_param_to_buf(const string& s) { + MEMCPY_TO_BUF(s.c_str(), static_cast(s.size())); +} + +void add_param_to_buf(const c10::optional& t) {} +void add_param_to_buf(const at::Tensor& t) {} + +void add_param_to_buf() {} + +inline uint64_t rotating_left(uint64_t x, uint8_t n) { + return (x << n) | (x >> (64 - n)); +} + +inline uint64_t mixture(uint64_t x) { + // constants step1(18397679294719823053) and step2(14181476777654086739) are + // used to allow hash values to be more evenly distributed after + // multiplication. + x ^= x >> g_rShift33Bits; + x *= MIX_STEP1; + x ^= x >> g_rShift33Bits; + x *= MIX_STEP2; + x ^= x >> g_rShift33Bits; + + return x; +} + +// MurmurHash3 was written by Austin Appleby, and is placed in the public +// domain. The author hereby disclaims copyright to this source code. +uint64_t gen_hash(const void* key, + const int len, + const uint32_t seed = 0xdeadb0d7) { + const uint8_t* data = static_cast(key); + // the length of each block is 16 bytes + const int block_num = len / 16; + // has and hax are literal appromix to hash, and hax is the return value of + // this function. 
+ uint64_t has = seed; + uint64_t hax = seed; + + // use 9782798678568883157 and 5545529020109919103 for blocking and + // obfuscation of input data + const uint64_t c1 = 9782798678568883157LLU; + const uint64_t c2 = 5545529020109919103LLU; + + const uint64_t* blocks = + static_cast(static_cast(data)); + + for (int i = 0; i < block_num; i++) { + int even_num = 2; + uint64_t tmp1 = blocks[i * even_num]; + uint64_t tmp2 = blocks[i * even_num + 1]; + + int8_t bits_31 = 31; + tmp1 *= c1; + tmp1 = rotating_left(tmp1, bits_31); + tmp1 *= c2; + has ^= tmp1; + + int8_t bits_27 = 27; + has = rotating_left(has, bits_27); + has += hax; + // increase randomness by mul by 5 and adding a constant + has = has * 5 + 1390208809; + + int8_t bits_33 = 33; + tmp2 *= c2; + tmp2 = rotating_left(tmp2, bits_33); + tmp2 *= c1; + hax ^= tmp2; + + hax = rotating_left(hax, bits_31); + hax += has; + // increase randomness by mul by 5 and adding a constant + hax = hax * 5 + 944331445; + } + + // the length of each block is 16 bytes + const uint8_t* tail = data + block_num * 16; + uint64_t t1 = 0; + uint64_t t2 = 0; + // because the size of a block is 16, different offsets are calculated for + // tail blocks for different sizes + switch (static_cast(len) & 15) { + case 15: + t2 ^= (static_cast(tail[14])) << 48; + [[fallthrough]]; + ; + case 14: + t2 ^= (static_cast(tail[13])) << 40; + [[fallthrough]]; + ; + case 13: + t2 ^= (static_cast(tail[12])) << 32; + [[fallthrough]]; + ; + case 12: + t2 ^= (static_cast(tail[11])) << 24; + [[fallthrough]]; + ; + case 11: + t2 ^= (static_cast(tail[10])) << 16; + [[fallthrough]]; + ; + case 10: + t2 ^= (static_cast(tail[9])) << 8; + [[fallthrough]]; + ; + case 9: + t2 ^= (static_cast(tail[8])) << 0; + t2 *= c2; + t2 = rotating_left(t2, 33); + t2 *= c1; + hax ^= t2; + [[fallthrough]]; + ; + case 8: + t1 ^= (static_cast(tail[7])) << 56; + [[fallthrough]]; + ; + case 7: + t1 ^= (static_cast(tail[6])) << 48; + [[fallthrough]]; + ; + case 6: + t1 ^= (static_cast(tail[5])) << 40; + [[fallthrough]]; + ; + case 5: + t1 ^= (static_cast(tail[4])) << 32; + [[fallthrough]]; + ; + case 4: + t1 ^= (static_cast(tail[3])) << 24; + [[fallthrough]]; + ; + case 3: + t1 ^= (static_cast(tail[2])) << 16; + [[fallthrough]]; + ; + case 2: + t1 ^= (static_cast(tail[1])) << 8; + [[fallthrough]]; + ; + case 1: + t1 ^= (static_cast(tail[0])) << 0; + t1 *= c1; + t1 = rotating_left(t1, 31); + t1 *= c2; + has ^= t1; + [[fallthrough]]; + ; + default: + break; + }; + + has ^= static_cast(len); + hax ^= static_cast(len); + + has += hax; + hax += has; + + has = mixture(has); + hax = mixture(hax); + + has += hax; + hax += has; + return hax; +} + +uint64_t calc_hash_id() { + if (g_hash_offset == g_hash_buf_max_size) { + return 0; + } + uint64_t hash_id = gen_hash(g_hash_buf, g_hash_offset); + return hash_id; +} + +} // namespace atb diff --git a/xllm/core/kernels/npu/custom_functions_npu/OperationCacheCompute.h b/xllm/core/kernels/npu/custom_functions_npu/OperationCacheCompute.h new file mode 100644 index 00000000..c9b293f7 --- /dev/null +++ b/xllm/core/kernels/npu/custom_functions_npu/OperationCacheCompute.h @@ -0,0 +1,161 @@ +// Copyright (c) 2025 Huawei Technologies Co., Ltd +// All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef OPPLUGIN_UTILS_ATB_PARAM_OPERATION_CACHE_COMPUTE_H +#define OPPLUGIN_UTILS_ATB_PARAM_OPERATION_CACHE_COMPUTE_H + +#include + +#include +#include +#include + +#include "atb/atb_infer.h" + +namespace atb { + +constexpr int g_hash_buf_size = 8192; +constexpr int g_hash_buf_max_size = g_hash_buf_size + 1024; +extern thread_local char g_hash_buf[g_hash_buf_size]; +extern thread_local int g_hash_offset; + +#define MEMCPY_TO_BUF(data_expression, size_expression) \ + if (g_hash_offset + (size_expression) > g_hash_buf_size) { \ + g_hash_offset = g_hash_buf_max_size; \ + return; \ + } \ + memcpy(g_hash_buf + g_hash_offset, data_expression, size_expression); \ + g_hash_offset += size_expression; + +uint64_t calc_hash_id(); + +template +void add_param_to_buf(const T& value) { + MEMCPY_TO_BUF(&value, sizeof(T)); +} + +void add_param_to_buf(const string& s); +void add_param_to_buf(const c10::optional& t); +void add_param_to_buf(const at::Tensor& t); +void add_param_to_buf(); + +template +void add_param_to_buf(const std::string& name, const T& value) { + add_param_to_buf(name); + add_param_to_buf(value); +} + +template +void add_param_to_buf(const T& arg, Args&... args) { + add_param_to_buf(arg); + add_param_to_buf(args...); +} + +template +struct HashOpParam { + void operator()(const T& param) const {}; +}; + +// Each operator implements its own hash function calculation. +// If the operator parameters do not change, implementation can be omitted. +// It is possible to hash only the attributes that may change in the parameters +// of the calculation. 
following example:: +// +// `template <>` +// `struct HashOpParam { //if XXXParam's transposeA and +// hasBias need hash` +// `void operator()(const atb::infer::XXXParam& param) const {` +// `add_param_to_buf("transposeA", param.transposeA);` +// `add_param_to_buf("hasBias", param.hasBias);` +// `}` +// `};` + +template <> +struct HashOpParam { + void operator()(const atb::infer::RmsNormParam& param) const { + add_param_to_buf("epsilon", param.normParam.epsilon); + add_param_to_buf("layerType", param.layerType); + add_param_to_buf("quantType", param.normParam.quantType); + } +}; + +template <> +struct HashOpParam { + void operator()(const atb::infer::GroupTopkParam& param) const { + add_param_to_buf("groupNum", param.groupNum); + add_param_to_buf("k", param.k); + add_param_to_buf("groupMultiFlag", param.groupMultiFlag); + add_param_to_buf("n", param.n); + } +}; + +template <> +struct HashOpParam { + void operator()(const atb::infer::PagedAttentionParam& param) const { + add_param_to_buf("num_kv_heads", param.kvHeadNum); + add_param_to_buf("num_heads", param.headNum); + add_param_to_buf("scale_value", param.qkScale); + add_param_to_buf("quant_type", param.quantType); + add_param_to_buf("outdata_type", param.outDataType); + add_param_to_buf("mla_vheadsize", param.mlaVHeadSize); + add_param_to_buf("maskType", param.maskType); + add_param_to_buf("calcType", param.calcType); + } +}; + +template <> +struct HashOpParam { + void operator()(const atb::infer::SelfAttentionParam& param) const { + add_param_to_buf("num_kv_heads", param.kvHeadNum); + add_param_to_buf("num_heads", param.headNum); + add_param_to_buf("scale_value", param.qkScale); + add_param_to_buf("calcType", param.calcType); + add_param_to_buf("kernelType", param.kernelType); + add_param_to_buf("maskType", param.maskType); + add_param_to_buf("quantType", param.quantType); + add_param_to_buf("isTriuMask", param.isTriuMask); + } +}; + +template <> +struct HashOpParam { + void operator()(const atb::infer::RopeParam& param) const { + add_param_to_buf("rotaryCoeff", param.rotaryCoeff); + } +}; + +template <> +struct HashOpParam { + void operator()(const atb::infer::ReshapeAndCacheParam& param) const { + add_param_to_buf("compressType", param.compressType); + add_param_to_buf("kvCacheCfg", param.kvCacheCfg); + } +}; + +template +uint64_t computeHash(const T& obj) { + g_hash_offset = 0; + HashOpParam{}(obj); + return calc_hash_id(); +} + +template +uint64_t computeHash(const std::string& name, Ts&... args) { + g_hash_offset = 0; + add_param_to_buf(name, args...); + return calc_hash_id(); +} + +} // namespace atb + +#endif // OPPLUGIN_UTILS_ATB_PARAM_OPERATION_CACHE_COMPUTE_H diff --git a/xllm/core/kernels/npu/custom_functions_npu/OperationCreate.h b/xllm/core/kernels/npu/custom_functions_npu/OperationCreate.h new file mode 100644 index 00000000..c08a1310 --- /dev/null +++ b/xllm/core/kernels/npu/custom_functions_npu/OperationCreate.h @@ -0,0 +1,127 @@ +// Copyright (c) 2025 Huawei Technologies Co., Ltd +// All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef OPPLUGIN_UTILS_ATB_OPERATION_CREATE_H +#define OPPLUGIN_UTILS_ATB_OPERATION_CREATE_H + +#include +#include + +#include +#include +#include + +#include "OperationCacheCompute.h" +#include "Utils.h" +#include "atb/atb_infer.h" + +namespace atb { + +template +class OpParamCache { + public: + static OpParamCache& getInstance(); + + atb::Operation* getOperation(const ParamType& param, const std::string& name); + atb::Operation* getOperation(uint64_t hash_id); + void saveOperation(uint64_t hash_id, atb::Operation* op); + + private: + OpParamCache(); + + OpParamCache(const OpParamCache&) = delete; + OpParamCache& operator=(const OpParamCache&) = delete; + + ~OpParamCache(); + + std::unordered_map op_map_; + mutable std::mutex mutex_; +}; + +template +atb::Operation* CreateAtbOperation(const ParamType& param, + const std::string& name) { + atb::Operation* op = nullptr; + atb::CreateOperation(param, &op); + TORCH_CHECK(op != nullptr, name, " CreateOperation failed!"); + return op; +} + +template +OpParamCache& OpParamCache::getInstance() { + static OpParamCache instance; + return instance; +} + +template +atb::Operation* OpParamCache::getOperation(const ParamType& param, + const std::string& name) { + const auto is_capturing = + static_cast(c10_npu::currentStreamCaptureStatusMayInitCtx()); + if (is_capturing) { + // The atb operator does not support operator reuse, when operator creation + // and execution in separate threads. + return CreateAtbOperation(param, name); + } else { + uint64_t hashValue = computeHash(param); + { + std::lock_guard lock(mutex_); + auto op_cache = op_map_.find(hashValue); + if (op_cache != op_map_.end()) { + return op_cache->second; + } + atb::Operation* op = CreateAtbOperation(param, name); + op_map_[hashValue] = op; + return op; + } + } +} + +template +atb::Operation* OpParamCache::getOperation(uint64_t hash_id) { + std::lock_guard lock(mutex_); + auto op_cache = op_map_.find(hash_id); + if (op_cache != op_map_.end()) { + return op_cache->second; + } + + atb::Operation* op = nullptr; + return op; +} + +template +void OpParamCache::saveOperation(uint64_t hash_id, + atb::Operation* op) { + std::lock_guard lock(mutex_); + op_map_[hash_id] = op; + return; +} + +template +OpParamCache::OpParamCache() { + // To satisfy the destructuring order, ContextManager should be instantiated + // before OpParamCache. + atb::utils::ContextManager::GetInstance(); +} + +template +OpParamCache::~OpParamCache() { + std::lock_guard lock(mutex_); + for (auto& op_item : op_map_) { + DestroyOperation(op_item.second); + } +} + +} // namespace atb + +#endif // OPPLUGIN_UTILS_ATB_OPERATION_CREATE_H diff --git a/xllm/core/kernels/npu/custom_functions_npu/Utils.cpp b/xllm/core/kernels/npu/custom_functions_npu/Utils.cpp new file mode 100644 index 00000000..882497d8 --- /dev/null +++ b/xllm/core/kernels/npu/custom_functions_npu/Utils.cpp @@ -0,0 +1,79 @@ +// Copyright (c) 2025 Huawei Technologies Co., Ltd +// All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "Utils.h" + +#include + +namespace atb { +namespace utils { + +ContextManager& ContextManager::GetInstance() { + static ContextManager instance; + return instance; +} + +ContextManager::ContextManager() : atb_context_(nullptr) {} + +ContextManager::~ContextManager() { + if (atb_context_) { + auto status = atb::DestroyContext(atb_context_); + TORCH_CHECK(status == 0, "Destroy context failed!"); + atb_context_ = nullptr; + } +} + +atb::Context* ContextManager::GetContext(aclrtStream stream) { + std::call_once(create_flag_, [this]() { + auto status = atb::CreateContext(&atb_context_); + TORCH_CHECK(status == 0, "Create context failed!"); + }); + + atb_context_->SetExecuteStream(stream); + return atb_context_; +} + +atb::Context* GetContext(aclrtStream stream) { + return ContextManager::GetInstance().GetContext(stream); +} + +aclDataType ConvertToAclDataType(const at::ScalarType& data_type) { + auto acl_dtype = + kATenScalarTypeToAclDataTypeTable[static_cast(data_type)]; + TORCH_CHECK(acl_dtype != ACL_DT_UNDEFINED, + std::string(c10::toString(data_type)) + " has not been supported") + return acl_dtype; +} + +at::Tensor FormatTrans(const at::Tensor& at_tensor) { + if (torch_npu::utils::is_npu(at_tensor)) { + return at_npu::native::npu_format_cast(at_tensor, ACL_FORMAT_ND); + } + return at_tensor; +} + +bool IsBaseFormat(aclFormat& format) { + return (format == ACL_FORMAT_NCHW) || (format == ACL_FORMAT_ND) || + (format == ACL_FORMAT_NHWC) || (format == ACL_FORMAT_NCDHW); +} + +aclFormat GetFormatForAtb(const at::Tensor& at_tensor) { + if (torch_npu::utils::is_npu(at_tensor)) { + aclFormat format = + static_cast(at_npu::native::get_npu_format(at_tensor)); + return IsBaseFormat(format) ? ACL_FORMAT_ND : format; + } + return ACL_FORMAT_ND; +} +} // namespace utils +} // namespace atb diff --git a/xllm/core/kernels/npu/custom_functions_npu/Utils.h b/xllm/core/kernels/npu/custom_functions_npu/Utils.h new file mode 100644 index 00000000..e42b4274 --- /dev/null +++ b/xllm/core/kernels/npu/custom_functions_npu/Utils.h @@ -0,0 +1,101 @@ +// Copyright (c) 2025 Huawei Technologies Co., Ltd +// All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef OPPLUGIN_UTILS_ATB_UTILS_H +#define OPPLUGIN_UTILS_ATB_UTILS_H + +#include +#include +#include + +#include "atb/atb_infer.h" + +namespace atb { +namespace utils { + +class ContextManager { + public: + static ContextManager& GetInstance(); + atb::Context* GetContext(aclrtStream stream); + ~ContextManager(); + + ContextManager(const ContextManager&) = delete; + ContextManager& operator=(const ContextManager&) = delete; + + private: + ContextManager(); + std::once_flag create_flag_; + atb::Context* atb_context_; +}; + +atb::Context* GetContext(aclrtStream stream); + +#define AT_ALL_SCALAR_TYPE_AND_ACL_DATATYPE_PAIR(_) \ + _(at::ScalarType::Byte, ACL_UINT8) \ + _(at::ScalarType::Char, ACL_INT8) \ + _(at::ScalarType::Short, ACL_INT16) \ + _(at::ScalarType::Int, ACL_INT32) \ + _(at::ScalarType::Long, ACL_INT64) \ + _(at::ScalarType::Half, ACL_FLOAT16) \ + _(at::ScalarType::Float, ACL_FLOAT) \ + _(at::ScalarType::Double, ACL_DOUBLE) \ + _(at::ScalarType::ComplexHalf, ACL_COMPLEX32) \ + _(at::ScalarType::ComplexFloat, ACL_COMPLEX64) \ + _(at::ScalarType::ComplexDouble, ACL_COMPLEX128) \ + _(at::ScalarType::Bool, ACL_BOOL) \ + _(at::ScalarType::QInt8, ACL_DT_UNDEFINED) \ + _(at::ScalarType::QUInt8, ACL_DT_UNDEFINED) \ + _(at::ScalarType::QInt32, ACL_DT_UNDEFINED) \ + _(at::ScalarType::BFloat16, ACL_BF16) \ + _(at::ScalarType::QUInt4x2, ACL_DT_UNDEFINED) \ + _(at::ScalarType::QUInt2x4, ACL_DT_UNDEFINED) \ + _(at::ScalarType::Bits1x8, ACL_DT_UNDEFINED) \ + _(at::ScalarType::Bits2x4, ACL_DT_UNDEFINED) \ + _(at::ScalarType::Bits4x2, ACL_DT_UNDEFINED) \ + _(at::ScalarType::Bits8, ACL_DT_UNDEFINED) \ + _(at::ScalarType::Bits16, ACL_DT_UNDEFINED) \ + _(at::ScalarType::Float8_e5m2, ACL_DT_UNDEFINED) \ + _(at::ScalarType::Float8_e4m3fn, ACL_DT_UNDEFINED) \ + _(at::ScalarType::Undefined, ACL_DT_UNDEFINED) \ + _(at::ScalarType::NumOptions, ACL_DT_UNDEFINED) + +constexpr aclDataType kATenScalarTypeToAclDataTypeTable + [static_cast(at::ScalarType::NumOptions) + 1] = { +#define DEFINE_ENUM(_1, n) n, + AT_ALL_SCALAR_TYPE_AND_ACL_DATATYPE_PAIR(DEFINE_ENUM) +#undef DEFINE_ENUM +}; + +aclDataType ConvertToAclDataType(const at::ScalarType& data_type); +at::Tensor FormatTrans(const at::Tensor& at_tensor); +aclFormat GetFormatForAtb(const at::Tensor& at_tensor); + +template +inline int get_op_mode(const MapType& mode_map, + c10::optional mode_opt, + c10::string_view default_mode, + const char* mode_name) { + c10::string_view mode_str = mode_opt.value_or(default_mode); + auto it = mode_map.find(mode_str); + TORCH_CHECK(it != mode_map.end(), + "Unsupported ", + mode_name, + " value: '", + mode_str, + "'"); + return it->second; +} +} // namespace utils +} // namespace atb + +#endif diff --git a/xllm/core/kernels/npu/rms_norm.h b/xllm/core/kernels/npu/fused_layernorm.cpp similarity index 56% rename from xllm/core/kernels/npu/rms_norm.h rename to xllm/core/kernels/npu/fused_layernorm.cpp index ed7f8d04..3e663523 100644 --- a/xllm/core/kernels/npu/rms_norm.h +++ b/xllm/core/kernels/npu/fused_layernorm.cpp @@ -12,21 +12,20 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ +#include -#pragma once -#include "impl/npu_rms_norm_impl.h" +#include "npu_ops_api.h" +#include "ops_npu/npu_ops.h" -namespace xllm { -namespace kernel { +namespace xllm::kernel::npu { -class RmsNorm : public torch::nn::ModuleHolder<NpuRmsNormImpl> { - public: - using torch::nn::ModuleHolder<NpuRmsNormImpl>::ModuleHolder; - using Impl __attribute__((__unused__)) = NpuRmsNormImpl; +torch::Tensor fused_layernorm(const torch::Tensor& input, + const torch::Tensor& weight, + double eps) { + std::tuple<at::Tensor, at::Tensor> result = + at_npu::native::custom_ops::npu_rms_norm(input, weight, eps); + auto normalized_input = std::get<0>(result); + return normalized_input; +} - RmsNorm(const ModelContext& context) - : ModuleHolder(std::make_shared<NpuRmsNormImpl>(context)) {} -}; - -} // namespace kernel -} // namespace xllm +} // namespace xllm::kernel::npu \ No newline at end of file diff --git a/xllm/core/kernels/npu/impl/CMakeLists.txt b/xllm/core/kernels/npu/impl/CMakeLists.txt deleted file mode 100644 index d8ec37ff..00000000 --- a/xllm/core/kernels/npu/impl/CMakeLists.txt +++ /dev/null @@ -1,104 +0,0 @@ -include(cc_library) -include(cc_test) - -include_directories( - ${CMAKE_SOURCE_DIR}/third_party/spdlog/include -) - - -cc_library( - NAME - npu_kernels_impl - HDRS - npu_split_impl.h - npu_linear_impl.h - npu_rms_norm_impl.h - npu_rope_impl.h - SRCS - npu_split_impl.cpp - npu_linear_impl.cpp - npu_rms_norm_impl.cpp - npu_rope_impl.cpp - DEPS - :npu_layers - :model_context - :state_dict - glog::glog - torch - torch_npu -) - -cc_test( - NAME - npu_rms_norm_test - SRCS - npu_rms_norm_test.cpp - DEPS - :npu_kernels_impl - GTest::gtest - GTest::gtest_main - xllm_kernels - c_sec - atb - spdlog::spdlog -) - -cc_test( - NAME - npu_linear_test - SRCS - npu_linear_test.cpp - DEPS - :npu_kernels_impl - GTest::gtest - GTest::gtest_main - xllm_kernels - c_sec - atb - spdlog::spdlog -) - -cc_test( - NAME - npu_split_test - SRCS - npu_split_test.cpp - DEPS - :npu_kernels_impl - GTest::gtest - GTest::gtest_main - xllm_kernels - c_sec - atb - spdlog::spdlog -) - -cc_test( - NAME - npu_rope_impl_test - SRCS - npu_rope_impl_test.cpp - DEPS - :npu_kernels_impl - GTest::gtest - GTest::gtest_main - xllm_kernels - c_sec - atb - spdlog::spdlog -) - -cc_test( - NAME - npu_sample_model_test - SRCS - npu_sample_model_test.cpp - DEPS - :npu_kernels_impl - GTest::gtest - GTest::gtest_main - xllm_kernels - c_sec - atb - spdlog::spdlog -) \ No newline at end of file diff --git a/xllm/core/kernels/npu/impl/npu_linear_impl.cpp b/xllm/core/kernels/npu/impl/npu_linear_impl.cpp deleted file mode 100644 index e233f1ca..00000000 --- a/xllm/core/kernels/npu/impl/npu_linear_impl.cpp +++ /dev/null @@ -1,126 +0,0 @@ -/* Copyright 2025 The xLLM Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - https://github.com/jd-opensource/xllm/blob/main/LICENSE - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License.
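// --- Illustrative usage (editor's sketch, not part of the patch) ---
// fused_layernorm() introduced above is a thin wrapper over torch_npu's
// npu_rms_norm: it keeps only the normalized output and discards the returned
// rstd. The shapes and eps value below are placeholders (1e-6 matches the
// rms_norm_eps used in the tests elsewhere in this diff); the tensors are
// assumed to already live on an NPU device and the declaration from
// npu_ops_api.h to be visible.
static torch::Tensor RmsNormExample(const torch::Tensor& hidden_states,
                                    const torch::Tensor& norm_weight) {
  constexpr double kEps = 1e-6;
  // hidden_states: [num_tokens, hidden_size], norm_weight: [hidden_size]
  return xllm::kernel::npu::fused_layernorm(hidden_states, norm_weight, kEps);
}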
-==============================================================================*/ - -#include "npu_linear_impl.h" - -#include - -namespace xllm::kernel { - -NpuLinearImpl::NpuLinearImpl(const ModelContext& context) - : NpuBaseLayer(context) { - at_weight_tensors_.resize(1); - atb_weight_tensors_.resize(1); - at_out_tensors_.resize(1); - dtype_ = c10::typeMetaToScalarType(context.get_tensor_options().dtype()); - at_weight_tensors_[0] = torch::zeros({1}).to(context.get_tensor_options()); - tensor_placeholder_ = torch::zeros({1}).to(context.get_tensor_options()); - - atb::Status status = init_node(linear_node_); - if (status != atb::NO_ERROR) { - LOG(ERROR) << "Failed to initialize node, status: " << status; - LOG(FATAL) << "NpuLinearImpl initialization failed with status: " << status; - } -} - -void NpuLinearImpl::verify_loaded_weights(const std::string weight_str) const { - CHECK(at_weight_tensors_[0].sizes() != std::vector({1})) - << "weight is not loaded for " << weight_str; -} - -void NpuLinearImpl::merge_loaded_weights() { - atb_weight_tensors_[0] = - atb_speed::Utils::AtTensor2Tensor(at_weight_tensors_[0]); -} - -void NpuLinearImpl::load_state_dict(const StateDict& state_dict) { - set_weight(state_dict, "weight", 0); -} - -int64_t NpuLinearImpl::init_node(atb_speed::Model::Node& node) { - name_ = "linear"; - model_name_ = "llm"; - run_task_func_ = std::bind(&NpuLinearImpl::run_task, - this, - std::placeholders::_1, - std::placeholders::_2); - - atb::Operation* operation = nullptr; - atb::infer::LinearParam linearParam; - linearParam.transposeB = true; - // linearParam.outDataType = ACL_BF16; - linearParam.hasBias = false; - atb::Status atbStatus = atb::CreateOperation(linearParam, &operation); - if (atbStatus != atb::NO_ERROR) { - return atbStatus; - } - - node.operation.reset(operation); - if (node.operation == nullptr) { - LOG(ERROR) << "node.operation is null"; - return -1; - } - if (node.operation->GetInputNum() < 1) { - LOG(ERROR) << "Get unexpected input num: " << node.operation->GetInputNum(); - return -1; - } - if (node.operation->GetOutputNum() < 1) { - LOG(ERROR) << "Get unexpected output num: " - << node.operation->GetOutputNum(); - return -1; - } - ATB_SPEED_LOG_DEBUG("AddLinear"); - - return atb::NO_ERROR; -} - -torch::Tensor NpuLinearImpl::forward(const torch::Tensor& input, int nodeId) { - atb::Status st; - - build_node_variant_pack(linear_node_, input); - - st = execute_node(linear_node_, nodeId); - - if (st != 0) { - LOG(FATAL) << model_name_ - << " inference failed with error code: " << std::to_string(st); - } - - return at_out_tensors_.at(0); -} - -void NpuLinearImpl::build_node_variant_pack(atb_speed::Model::Node& node, - const torch::Tensor& input) { - internal_input = atb_speed::Utils::AtTensor2Tensor(input); - - atb::SVector ins = {internal_input, atb_weight_tensors_[0]}; - node.variantPack.inTensors = ins; - - atb::SVector inTensorDescs; - inTensorDescs.resize(node.operation->GetInputNum()); - inTensorDescs.at(0) = internal_input.desc; - inTensorDescs.at(1) = atb_weight_tensors_[0].desc; - - atb::SVector outTensorDescs; - node.operation->InferShape(inTensorDescs, outTensorDescs); - - at::Tensor output = - atb_speed::Utils::CreateAtTensorFromTensorDesc(outTensorDescs.at(0)); - at_out_tensors_.at(0) = output; - - node.variantPack.outTensors = {atb_speed::Utils::AtTensor2Tensor(output)}; -} - -} // namespace xllm::kernel diff --git a/xllm/core/kernels/npu/impl/npu_linear_impl.h b/xllm/core/kernels/npu/impl/npu_linear_impl.h deleted file mode 100644 index 
b1fc3d26..00000000 --- a/xllm/core/kernels/npu/impl/npu_linear_impl.h +++ /dev/null @@ -1,70 +0,0 @@ -/* Copyright 2025 The xLLM Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - https://github.com/jd-opensource/xllm/blob/main/LICENSE - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#pragma once - -#ifdef TORCH_HIGHER_THAN_PTA6 -#include -#include -#else -#include -#include -#endif - -#include - -#include - -#include "atb/atb_infer.h" -#include "framework/model/model_input_params.h" -#include "framework/model_context.h" -#include "framework/state_dict/state_dict.h" -#include "layers/npu/npu_base_layer.h" -#include "nlohmann/json.hpp" -#include "pytorch/adapter/utils/utils.h" -#include "xllm_kernels/core/include/atb_speed/base/hosttensor_binder.h" -#include "xllm_kernels/core/include/atb_speed/base/model.h" -#include "xllm_kernels/core/include/atb_speed/log.h" -#include "xllm_kernels/core/include/atb_speed/utils/model_factory.h" - -namespace xllm::kernel { - -class NpuLinearImpl : public xllm::layer::NpuBaseLayer { - public: - explicit NpuLinearImpl(const ModelContext& context); - - ~NpuLinearImpl() {}; - - void load_state_dict(const StateDict& state_dict); - - void verify_loaded_weights(const std::string weight_str) const; - - void merge_loaded_weights(); - - torch::Tensor forward(const torch::Tensor& input, int nodeId); - - private: - int64_t init_node(atb_speed::Model::Node& node); - void build_node_variant_pack(atb_speed::Model::Node& node, - const torch::Tensor& input); - atb_speed::Model::Node linear_node_; - std::string model_name_; - - std::vector at_out_tensors_; - atb::Tensor internal_input; - torch::Tensor tensor_placeholder_; -}; - -} // namespace xllm::kernel diff --git a/xllm/core/kernels/npu/impl/npu_linear_test.cpp b/xllm/core/kernels/npu/impl/npu_linear_test.cpp deleted file mode 100644 index ec4607a0..00000000 --- a/xllm/core/kernels/npu/impl/npu_linear_test.cpp +++ /dev/null @@ -1,401 +0,0 @@ -/* Copyright 2025 The xLLM Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - https://github.com/jd-opensource/xllm/blob/main/LICENSE - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-==============================================================================*/ - -#include -#include - -#include "kernels/npu/linear.h" - -namespace xllm::kernel { - -class NpuLinearTest : public ::testing::Test { - protected: - NpuLinearTest() : parallel_args_(1, 1, nullptr) { - try { - torch::zeros({1}, torch::TensorOptions().device("npu:0")); - - tensor_options_ = - torch::TensorOptions().dtype(torch::kFloat16).device("npu:0"); - npu_available_ = true; - std::cout << "Using NPU device" << std::endl; - - } catch (...) { - tensor_options_ = - torch::TensorOptions().dtype(torch::kFloat16).device(torch::kCPU); - npu_available_ = false; - std::cout << "Using CPU device (NPU unavailable)" << std::endl; - } - } - - void SetUp() override { - torch::manual_seed(42); - - model_args_.hidden_size() = 4096; - model_args_.intermediate_size() = 11008; - model_args_.dtype() = "float16"; - - quant_args_.torch_dtype() = "float16"; - - context_ = std::make_unique( - parallel_args_, model_args_, quant_args_, tensor_options_); - } - - void TearDown() override { - context_.reset(); - - if (npu_available_) { - try { - c10_npu::npuSynchronizeDevice(); - c10_npu::NPUCachingAllocator::emptyCache(); - std::this_thread::sleep_for(std::chrono::milliseconds(500)); - } catch (...) { - // NPU cleanup failures are usually not critical in test teardown - } - } - } - - StateDict CreateStateDict(const torch::Tensor& weight_tensor) { - std::unordered_map tensor_map; - tensor_map["weight"] = weight_tensor; - return StateDict(tensor_map, ""); - } - - ModelArgs model_args_; - QuantArgs quant_args_; - ParallelArgs parallel_args_; - torch::TensorOptions tensor_options_; - std::unique_ptr context_; - bool npu_available_ = true; -}; - -// Test NpuLinearImpl construction -TEST_F(NpuLinearTest, ConstructorTest) { - if (!npu_available_) { - GTEST_SKIP() << "Skipping NPU test - NPU device not available"; - } - - ASSERT_NO_THROW({ - auto linear = std::make_shared(*context_); - EXPECT_NE(linear, nullptr); - }); -} - -// Test Linear wrapper construction -TEST_F(NpuLinearTest, NpuLinearWrapperTest) { - if (!npu_available_) { - GTEST_SKIP() << "Skipping NPU test - NPU device not available"; - } - - ASSERT_NO_THROW({ auto linear = Linear(*context_); }); -} - -// Test state dict loading with mock weights -TEST_F(NpuLinearTest, LoadStateDictTest) { - if (!npu_available_) { - GTEST_SKIP() << "Skipping NPU test - NPU device not available"; - } - - auto linear = std::make_shared(*context_); - - // Create weight tensor with shape [output_size, input_size] for linear layer - auto weight_tensor = - torch::randn({model_args_.intermediate_size(), model_args_.hidden_size()}, - tensor_options_); - auto state_dict = CreateStateDict(weight_tensor); - - ASSERT_NO_THROW({ linear->load_state_dict(state_dict); }); -} - -// Test weight verification (should fail with uninitialized weights) -TEST_F(NpuLinearTest, VerifyLoadedWeightsFailTest) { - auto linear = std::make_shared(*context_); - - EXPECT_DEATH({ linear->verify_loaded_weights("test_weight"); }, ".*"); -} - -// Test weight verification (should pass with loaded weights) -TEST_F(NpuLinearTest, VerifyLoadedWeightsPassTest) { - if (!npu_available_) { - GTEST_SKIP() << "Skipping NPU test - NPU device not available"; - } - - auto linear = std::make_shared(*context_); - - auto weight_tensor = - torch::randn({model_args_.intermediate_size(), model_args_.hidden_size()}, - tensor_options_); - auto state_dict = CreateStateDict(weight_tensor); - linear->load_state_dict(state_dict); - - ASSERT_NO_THROW({ 
linear->verify_loaded_weights("test_weight"); }); -} - -// Test merge loaded weights -TEST_F(NpuLinearTest, MergeLoadedWeightsTest) { - if (!npu_available_) { - GTEST_SKIP() << "Skipping NPU test - NPU device not available"; - } - - auto linear = std::make_shared(*context_); - - auto weight_tensor = - torch::randn({model_args_.intermediate_size(), model_args_.hidden_size()}, - tensor_options_); - auto state_dict = CreateStateDict(weight_tensor); - linear->load_state_dict(state_dict); - - ASSERT_NO_THROW({ linear->merge_loaded_weights(); }); -} - -// Test forward pass with mock input (may fail without proper NPU setup) -TEST_F(NpuLinearTest, ForwardPassBasicTest) { - if (!npu_available_) { - GTEST_SKIP() << "Skipping NPU test - NPU device not available"; - } - - auto linear = Linear(*context_); - - auto weight_tensor = - torch::randn({model_args_.intermediate_size(), model_args_.hidden_size()}, - tensor_options_); - auto state_dict = CreateStateDict(weight_tensor); - linear->load_state_dict(state_dict); - linear->merge_loaded_weights(); - - // Input tensor with shape [batch_size, seq_len, hidden_size] - auto input = - torch::randn({1, 10, model_args_.hidden_size()}, tensor_options_); - - try { - auto npu_stream = c10_npu::getCurrentNPUStream(0); - auto output = linear(input, 0); - aclrtSynchronizeStream(npu_stream.stream()); - std::cout << "Input tensor shape: " << input.sizes() << std::endl; - std::cout << "Output tensor shape: " << output.sizes() << std::endl; - - // Expected output shape: [batch_size, seq_len, intermediate_size] - std::vector expected_shape = { - 1, 10, model_args_.intermediate_size()}; - EXPECT_EQ(output.sizes(), expected_shape); - } catch (const std::exception& e) { - GTEST_SKIP() << "Skipping forward pass test - requires NPU environment: " - << e.what(); - } -} - -// Test tensor shape consistency -TEST_F(NpuLinearTest, TensorShapeTest) { - if (!npu_available_) { - GTEST_SKIP() << "Skipping NPU test - NPU device not available"; - } - - auto linear = std::make_shared(*context_); - - auto weight_tensor = - torch::randn({model_args_.intermediate_size(), model_args_.hidden_size()}, - tensor_options_); - auto state_dict = CreateStateDict(weight_tensor); - linear->load_state_dict(state_dict); - - EXPECT_EQ(weight_tensor.size(0), model_args_.intermediate_size()); - EXPECT_EQ(weight_tensor.size(1), model_args_.hidden_size()); - EXPECT_EQ(weight_tensor.dim(), 2); -} - -// Test different weight matrix dimensions -TEST_F(NpuLinearTest, DifferentWeightDimensionsTest) { - if (!npu_available_) { - GTEST_SKIP() << "Skipping NPU test - NPU device not available"; - } - - std::vector> dimensions = { - {768, 3072}, {1024, 4096}, {2048, 8192}, {4096, 11008}, {8192, 22016}}; - - for (auto [input_size, output_size] : dimensions) { - model_args_.hidden_size() = input_size; - model_args_.intermediate_size() = output_size; - - QuantArgs local_quant_args = quant_args_; - local_quant_args.torch_dtype() = "float16"; - - auto context = std::make_unique( - parallel_args_, model_args_, local_quant_args, tensor_options_); - - auto linear = std::make_shared(*context); - - auto weight_tensor = - torch::randn({output_size, input_size}, tensor_options_); - auto state_dict = CreateStateDict(weight_tensor); - - ASSERT_NO_THROW({ linear->load_state_dict(state_dict); }); - - EXPECT_EQ(weight_tensor.size(0), output_size); - EXPECT_EQ(weight_tensor.size(1), input_size); - } -} - -// Test linear transformation mathematical properties -TEST_F(NpuLinearTest, LinearTransformationPropertiesTest) { - if 
(!npu_available_) { - GTEST_SKIP() << "Skipping NPU test - NPU device not available"; - } - - auto linear = Linear(*context_); - - auto weight_tensor = torch::eye(model_args_.hidden_size(), - torch::TensorOptions() - .dtype(torch::kFloat16) - .device(tensor_options_.device())); - - if (model_args_.intermediate_size() != model_args_.hidden_size()) { - if (model_args_.intermediate_size() > model_args_.hidden_size()) { - auto padded_weight = torch::zeros( - {model_args_.intermediate_size(), model_args_.hidden_size()}, - tensor_options_); - padded_weight.narrow(0, 0, model_args_.hidden_size()) = weight_tensor; - weight_tensor = padded_weight; - } else { - weight_tensor = - weight_tensor.narrow(0, 0, model_args_.intermediate_size()); - } - } - - auto state_dict = CreateStateDict(weight_tensor); - linear->load_state_dict(state_dict); - linear->merge_loaded_weights(); - - auto input = torch::ones({1, 1, model_args_.hidden_size()}, tensor_options_); - - try { - auto npu_stream = c10_npu::getCurrentNPUStream(0); - auto output = linear(input, 0); - aclrtSynchronizeStream(npu_stream.stream()); - - EXPECT_EQ(output.dim(), 3); - EXPECT_EQ(output.size(0), 1); - EXPECT_EQ(output.size(1), 1); - EXPECT_EQ(output.size(2), - model_args_.intermediate_size()); // output features - - } catch (const std::exception& e) { - GTEST_SKIP() - << "Skipping mathematical properties test - requires NPU environment: " - << e.what(); - } -} - -// Test batch processing -TEST_F(NpuLinearTest, BatchProcessingTest) { - if (!npu_available_) { - GTEST_SKIP() << "Skipping NPU test - NPU device not available"; - } - - auto linear = Linear(*context_); - - auto weight_tensor = - torch::randn({model_args_.intermediate_size(), model_args_.hidden_size()}, - tensor_options_); - auto state_dict = CreateStateDict(weight_tensor); - linear->load_state_dict(state_dict); - linear->merge_loaded_weights(); - - std::vector> batch_shapes = { - {1, 5, model_args_.hidden_size()}, - {2, 10, model_args_.hidden_size()}, - {4, 20, model_args_.hidden_size()}, - {8, 15, model_args_.hidden_size()}}; - - for (const auto& shape : batch_shapes) { - auto input = torch::randn(shape, tensor_options_); - - try { - auto npu_stream = c10_npu::getCurrentNPUStream(0); - auto output = linear(input, 0); - aclrtSynchronizeStream(npu_stream.stream()); - - EXPECT_EQ(output.size(0), shape[0]); - EXPECT_EQ(output.size(1), shape[1]); - EXPECT_EQ(output.size(2), model_args_.intermediate_size()); - - } catch (const std::exception& e) { - GTEST_SKIP() << "Skipping batch processing test for shape [" << shape[0] - << ", " << shape[1] << ", " << shape[2] - << "] - requires NPU environment: " << e.what(); - break; - } - } -} - -// Test error handling with invalid inputs -TEST_F(NpuLinearTest, ErrorHandlingTest) { - if (!npu_available_) { - GTEST_SKIP() << "Skipping NPU test - NPU device not available"; - } - - auto linear = Linear(*context_); - - auto weight_tensor = - torch::randn({model_args_.intermediate_size(), model_args_.hidden_size()}, - tensor_options_); - auto state_dict = CreateStateDict(weight_tensor); - linear->load_state_dict(state_dict); - linear->merge_loaded_weights(); - - auto wrong_input = - torch::randn({1, 10, model_args_.hidden_size() + 100}, tensor_options_); - - try { - auto npu_stream = c10_npu::getCurrentNPUStream(0); - auto output = linear(wrong_input, 0); - aclrtSynchronizeStream(npu_stream.stream()); - FAIL() << "Expected exception for mismatched input dimensions"; - } catch (const std::exception& e) { - // Expected behavior - input dimension mismatch 
should cause error - std::cout << "Correctly caught expected error: " << e.what() << std::endl; - } -} - -} // namespace xllm::kernel - -int main(int argc, char** argv) { - struct rlimit core_limit; - core_limit.rlim_cur = 0; - core_limit.rlim_max = 0; - setrlimit(RLIMIT_CORE, &core_limit); - - FILE* null_stderr = freopen("/dev/null", "w", stderr); - if (null_stderr == nullptr) { - fclose(stderr); - } - - ::testing::InitGoogleTest(&argc, argv); - - bool npu_available = false; - try { - auto test_tensor = - torch::zeros({1}, torch::TensorOptions().device("npu:0")); - npu_available = true; - } catch (...) { - npu_available = false; - } - - if (!npu_available) { - std::cout << "NPU device not available, skipping all tests." << std::endl; - return 0; // Exit with success code, all tests skipped - } - - int result = RUN_ALL_TESTS(); - _exit(result); -} \ No newline at end of file diff --git a/xllm/core/kernels/npu/impl/npu_rms_norm_impl.cpp b/xllm/core/kernels/npu/impl/npu_rms_norm_impl.cpp deleted file mode 100644 index 1d16c8ba..00000000 --- a/xllm/core/kernels/npu/impl/npu_rms_norm_impl.cpp +++ /dev/null @@ -1,110 +0,0 @@ -/* Copyright 2025 The xLLM Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - https://github.com/jd-opensource/xllm/blob/main/LICENSE - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "npu_rms_norm_impl.h" - -#include - -namespace xllm::kernel { - -void NpuRmsNormImpl::param_from_args(atb::infer::RmsNormParam& param, - const ModelArgs& args) { - param.layerType = atb::infer::RmsNormParam::RmsNormType::RMS_NORM_NORM; - param.normParam.epsilon = args.rms_norm_eps(); -} - -int64_t NpuRmsNormImpl::init_node(atb_speed::Model::Node& node, - atb::infer::RmsNormParam& param) { - name_ = "rms_norm"; - model_name_ = "llm"; - run_task_func_ = std::bind(&NpuRmsNormImpl::run_task, - this, - std::placeholders::_1, - std::placeholders::_2); - - atb::Operation* operation = nullptr; - atb::Status atbStatus = atb::CreateOperation(param, &operation); - if (atbStatus != atb::NO_ERROR) { - return atbStatus; - } - - node.operation.reset(operation); - if (node.operation == nullptr) { - LOG(ERROR) << "node.operation is null"; - return -1; - } - if (node.operation->GetInputNum() < 1) { - LOG(ERROR) << "Can not resize number which is smaller than 1"; - return -1; - } - - return atb::NO_ERROR; -} - -NpuRmsNormImpl::NpuRmsNormImpl(const ModelContext& context) - : NpuBaseLayer(context) { - param_from_args(norm_param_, context.get_model_args()); - - at_weight_tensors_.resize(1); - atb_weight_tensors_.resize(1); - - auto options = context.get_tensor_options(); - dtype_ = c10::typeMetaToScalarType(options.dtype()); - at_weight_tensors_[0] = torch::zeros({1}).to(options); - - atb::Status status = init_node(norm_node_, norm_param_); - if (status != atb::NO_ERROR) { - LOG(ERROR) << "Failed to initialize node, status: " << status; - LOG(FATAL) << "NpuRmsNormImpl initialization failed with status: " - << std::to_string(status); - } -} - -void NpuRmsNormImpl::verify_loaded_weights(const std::string 
weight_str) const { - CHECK(at_weight_tensors_[0].sizes() != std::vector({1})) - << "final norm weight is not loaded for " << weight_str; -} - -void NpuRmsNormImpl::merge_loaded_weights() { - atb_weight_tensors_[0] = - atb_speed::Utils::AtTensor2Tensor(at_weight_tensors_[0]); -} - -void NpuRmsNormImpl::load_state_dict(const StateDict& state_dict) { - set_weight(state_dict, "weight", 0); - at_weight_tensors_[0] = at_weight_tensors_[0].to(dtype_); -} - -torch::Tensor NpuRmsNormImpl::forward(torch::Tensor& x, int nodeId) { - atb::Status st; - build_node_variant_pack(norm_node_, x); - st = execute_node(norm_node_, nodeId); - LOG_IF(FATAL, st != 0) << model_name_ - << "infer shape fail, error code: " << st; - return x; -} - -void NpuRmsNormImpl::build_node_variant_pack(atb_speed::Model::Node& node, - torch::Tensor& x) { - internal_tensors_ = atb_speed::Utils::AtTensor2Tensor(x); - - atb::SVector ins = {internal_tensors_, atb_weight_tensors_[0]}; - atb::SVector outs = {internal_tensors_}; - - node.variantPack.inTensors = ins; - node.variantPack.outTensors = outs; -} - -} // namespace xllm::kernel diff --git a/xllm/core/kernels/npu/impl/npu_rms_norm_impl.h b/xllm/core/kernels/npu/impl/npu_rms_norm_impl.h deleted file mode 100644 index dda02375..00000000 --- a/xllm/core/kernels/npu/impl/npu_rms_norm_impl.h +++ /dev/null @@ -1,71 +0,0 @@ -/* Copyright 2025 The xLLM Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - https://github.com/jd-opensource/xllm/blob/main/LICENSE - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-==============================================================================*/ - -#pragma once -#ifdef TORCH_HIGHER_THAN_PTA6 -#include -#include -#else -#include -#include -#endif - -#include - -#include - -#include "atb/atb_infer.h" -#include "framework/kv_cache/kv_cache.h" -#include "framework/model/model_input_params.h" -#include "framework/state_dict/state_dict.h" -#include "layers/npu/npu_base_layer.h" -#include "nlohmann/json.hpp" -#include "pytorch/adapter/utils/utils.h" -#include "xllm_kernels/core/include/atb_speed/base/hosttensor_binder.h" -#include "xllm_kernels/core/include/atb_speed/base/model.h" -#include "xllm_kernels/core/include/atb_speed/log.h" -#include "xllm_kernels/core/include/atb_speed/utils/model_factory.h" - -namespace xllm::kernel { - -class NpuRmsNormImpl : public xllm::layer::NpuBaseLayer { - public: - explicit NpuRmsNormImpl(const ModelContext& context); - - ~NpuRmsNormImpl() {}; - - void load_state_dict(const StateDict& state_dict); - - void verify_loaded_weights(const std::string weight_str) const; - - void merge_loaded_weights(); - - torch::Tensor forward(torch::Tensor& x, int nodeId); - - private: - int64_t init_node(atb_speed::Model::Node& node, - atb::infer::RmsNormParam& param); - - void build_node_variant_pack(atb_speed::Model::Node& node, torch::Tensor& x); - - void param_from_args(atb::infer::RmsNormParam& param, const ModelArgs& args); - - atb_speed::Model::Node norm_node_; - std::string model_name_; - atb::infer::RmsNormParam norm_param_; - atb::Tensor internal_tensors_; -}; - -} // namespace xllm::kernel diff --git a/xllm/core/kernels/npu/impl/npu_rms_norm_test.cpp b/xllm/core/kernels/npu/impl/npu_rms_norm_test.cpp deleted file mode 100644 index df4c0ce3..00000000 --- a/xllm/core/kernels/npu/impl/npu_rms_norm_test.cpp +++ /dev/null @@ -1,262 +0,0 @@ -/* Copyright 2025 The xLLM Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - https://github.com/jd-opensource/xllm/blob/main/LICENSE - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include -#include - -#include "kernels/npu/rms_norm.h" - -namespace xllm::kernel { - -class NpuRmsNormTest : public ::testing::Test { - protected: - NpuRmsNormTest() : parallel_args_(1, 1, nullptr) { - try { - torch::zeros({1}, torch::TensorOptions().device("npu:0")); - - tensor_options_ = - torch::TensorOptions().dtype(torch::kFloat16).device("npu:0"); - npu_available_ = true; - std::cout << "Using NPU device" << std::endl; - - } catch (...) 
{ - tensor_options_ = - torch::TensorOptions().dtype(torch::kFloat16).device(torch::kCPU); - npu_available_ = false; - std::cout << "Using CPU device (NPU unavailable)" << std::endl; - } - } - - void SetUp() override { - torch::manual_seed(42); - - model_args_.rms_norm_eps() = 1e-6f; - model_args_.hidden_size() = 4096; - model_args_.dtype() = "float16"; - - quant_args_.torch_dtype() = "float16"; - - context_ = std::make_unique( - parallel_args_, model_args_, quant_args_, tensor_options_); - } - - void TearDown() override { - context_.reset(); - - if (npu_available_) { - try { - c10_npu::npuSynchronizeDevice(); - c10_npu::NPUCachingAllocator::emptyCache(); - std::this_thread::sleep_for(std::chrono::milliseconds(500)); - } catch (...) { - // NPU cleanup failures are usually not critical in test teardown - } - } - } - - StateDict CreateStateDict(const torch::Tensor& weight_tensor) { - std::unordered_map tensor_map; - tensor_map["weight"] = weight_tensor; - return StateDict(tensor_map, ""); - } - - ModelArgs model_args_; - QuantArgs quant_args_; - ParallelArgs parallel_args_; - torch::TensorOptions tensor_options_; - std::unique_ptr context_; - bool npu_available_ = true; -}; - -// Test NpuRmsNormImpl construction -TEST_F(NpuRmsNormTest, ConstructorTest) { - if (!npu_available_) { - GTEST_SKIP() << "Skipping NPU test - NPU device not available"; - } - - ASSERT_NO_THROW({ - auto rms_norm = std::make_shared(*context_); - EXPECT_NE(rms_norm, nullptr); - }); -} - -// Test RmsNorm wrapper construction -TEST_F(NpuRmsNormTest, RmsNormWrapperTest) { - if (!npu_available_) { - GTEST_SKIP() << "Skipping NPU test - NPU device not available"; - } - - ASSERT_NO_THROW({ auto rms_norm = RmsNorm(*context_); }); -} - -// Test state dict loading with mock weights -TEST_F(NpuRmsNormTest, LoadStateDictTest) { - if (!npu_available_) { - GTEST_SKIP() << "Skipping NPU test - NPU device not available"; - } - - auto rms_norm = std::make_shared(*context_); - - auto weight_tensor = - torch::randn({model_args_.hidden_size()}, tensor_options_); - auto state_dict = CreateStateDict(weight_tensor); - - ASSERT_NO_THROW({ rms_norm->load_state_dict(state_dict); }); -} - -// Test weight verification (should fail with uninitialized weights) -TEST_F(NpuRmsNormTest, VerifyLoadedWeightsFailTest) { - auto rms_norm = std::make_shared(*context_); - - EXPECT_DEATH({ rms_norm->verify_loaded_weights("test_weight"); }, ".*"); -} - -// Test weight verification (should pass with loaded weights) -TEST_F(NpuRmsNormTest, VerifyLoadedWeightsPassTest) { - if (!npu_available_) { - GTEST_SKIP() << "Skipping NPU test - NPU device not available"; - } - - auto rms_norm = std::make_shared(*context_); - - auto weight_tensor = - torch::randn({model_args_.hidden_size()}, tensor_options_); - auto state_dict = CreateStateDict(weight_tensor); - rms_norm->load_state_dict(state_dict); - - ASSERT_NO_THROW({ rms_norm->verify_loaded_weights("test_weight"); }); -} - -// Test merge loaded weights -TEST_F(NpuRmsNormTest, MergeLoadedWeightsTest) { - if (!npu_available_) { - GTEST_SKIP() << "Skipping NPU test - NPU device not available"; - } - - auto rms_norm = std::make_shared(*context_); - - auto weight_tensor = - torch::randn({model_args_.hidden_size()}, tensor_options_); - auto state_dict = CreateStateDict(weight_tensor); - rms_norm->load_state_dict(state_dict); - - ASSERT_NO_THROW({ rms_norm->merge_loaded_weights(); }); -} - -// Test forward pass with mock input (may fail without proper NPU setup) -TEST_F(NpuRmsNormTest, ForwardPassBasicTest) { - if 
(!npu_available_) { - GTEST_SKIP() << "Skipping NPU test - NPU device not available"; - } - - auto rms_norm = RmsNorm(*context_); - - auto weight_tensor = - torch::randn({model_args_.hidden_size()}, tensor_options_); - auto state_dict = CreateStateDict(weight_tensor); - rms_norm->load_state_dict(state_dict); - rms_norm->merge_loaded_weights(); - - auto input = - torch::randn({1, 10, model_args_.hidden_size()}, tensor_options_); - - try { - auto npu_stream = c10_npu::getCurrentNPUStream(0); - auto output = rms_norm(input, 0); - aclrtSynchronizeStream(npu_stream.stream()); - std::cout << "Output tensor shape: " << output.sizes() << std::endl; - EXPECT_EQ(output.sizes(), input.sizes()); - } catch (const std::exception& e) { - GTEST_SKIP() << "Skipping forward pass test - requires NPU environment: " - << e.what(); - } -} - -// Test tensor shape consistency -TEST_F(NpuRmsNormTest, TensorShapeTest) { - if (!npu_available_) { - GTEST_SKIP() << "Skipping NPU test - NPU device not available"; - } - - auto rms_norm = std::make_shared(*context_); - - auto weight_tensor = - torch::randn({model_args_.hidden_size()}, tensor_options_); - auto state_dict = CreateStateDict(weight_tensor); - rms_norm->load_state_dict(state_dict); - - EXPECT_EQ(weight_tensor.size(0), model_args_.hidden_size()); - EXPECT_EQ(weight_tensor.dim(), 1); -} - -// Test with different hidden sizes -TEST_F(NpuRmsNormTest, DifferentHiddenSizesTest) { - if (!npu_available_) { - GTEST_SKIP() << "Skipping NPU test - NPU device not available"; - } - - std::vector hidden_sizes = {768, 1024, 2048, 4096, 8192}; - - for (int64_t hidden_size : hidden_sizes) { - model_args_.hidden_size() = hidden_size; - QuantArgs local_quant_args = quant_args_; - local_quant_args.torch_dtype() = "float16"; - - auto context = std::make_unique( - parallel_args_, model_args_, local_quant_args, tensor_options_); - - auto rms_norm = std::make_shared(*context); - - auto weight_tensor = torch::randn({hidden_size}, tensor_options_); - auto state_dict = CreateStateDict(weight_tensor); - - ASSERT_NO_THROW({ rms_norm->load_state_dict(state_dict); }); - - EXPECT_EQ(weight_tensor.size(0), hidden_size); - } -} - -} // namespace xllm::kernel - -int main(int argc, char** argv) { - struct rlimit core_limit; - core_limit.rlim_cur = 0; - core_limit.rlim_max = 0; - setrlimit(RLIMIT_CORE, &core_limit); - - FILE* null_stderr = freopen("/dev/null", "w", stderr); - if (null_stderr == nullptr) { - fclose(stderr); - } - - ::testing::InitGoogleTest(&argc, argv); - - bool npu_available = false; - try { - auto test_tensor = - torch::zeros({1}, torch::TensorOptions().device("npu:0")); - npu_available = true; - } catch (...) { - npu_available = false; - } - - if (!npu_available) { - std::cout << "NPU device not available, skipping all tests." << std::endl; - return 0; // Exit with success code, all tests skipped - } - - int result = RUN_ALL_TESTS(); - _exit(result); -} \ No newline at end of file diff --git a/xllm/core/kernels/npu/impl/npu_rope_impl.cpp b/xllm/core/kernels/npu/impl/npu_rope_impl.cpp deleted file mode 100644 index 805f4cda..00000000 --- a/xllm/core/kernels/npu/impl/npu_rope_impl.cpp +++ /dev/null @@ -1,142 +0,0 @@ -/* Copyright 2025 The xLLM Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - https://github.com/jd-opensource/xllm/blob/main/LICENSE - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "npu_rope_impl.h" - -#include - -namespace xllm::kernel { - -void NpuRopeImpl::param_from_args(atb::infer::RopeParam& param, - const ModelArgs& args) { - param.rotaryCoeff = 2; -} - -int64_t NpuRopeImpl::init_node(atb_speed::Model::Node& node, - atb::infer::RopeParam& param) { - name_ = "rope"; - model_name_ = "llm"; - run_task_func_ = std::bind(&NpuRopeImpl::run_task, - this, - std::placeholders::_1, - std::placeholders::_2); - - atb::Operation* operation = nullptr; - atb::Status atbStatus = atb::CreateOperation(param, &operation); - if (atbStatus != atb::NO_ERROR) { - return atbStatus; - } - - node.operation.reset(operation); - if (node.operation == nullptr) { - LOG(ERROR) << "node.operation is null"; - return -1; - } - if (node.operation->GetInputNum() < 1) { - LOG(ERROR) << "Can not resize number which is smaller than 1"; - return -1; - } - - return atb::NO_ERROR; -} - -NpuRopeImpl::NpuRopeImpl(const ModelContext& context) : NpuBaseLayer(context) { - param_from_args(rope_param_, context.get_model_args()); - - at_weight_tensors_.resize(1); - atb_weight_tensors_.resize(1); - - auto options = context.get_tensor_options(); - dtype_ = c10::typeMetaToScalarType(options.dtype()); - at_weight_tensors_[0] = torch::zeros({1}).to(options); - - atb::Status status = init_node(rope_node_, rope_param_); - if (status != atb::NO_ERROR) { - LOG(ERROR) << "Failed to initialize node, status: " << status; - LOG(FATAL) << "NpuRopeImpl initialization failed with status: " - << std::to_string(status); - } -} - -void NpuRopeImpl::verify_loaded_weights(const std::string weight_str) const { - // No operation needed for rope layer -} - -void NpuRopeImpl::merge_loaded_weights() { - // No operation needed for rope layer -} - -void NpuRopeImpl::load_state_dict(const StateDict& state_dict) { - // No operation needed for rope layer -} - -std::vector NpuRopeImpl::forward(const torch::Tensor& q, - const torch::Tensor& k, - const torch::Tensor& cos_embedding, - const torch::Tensor& sin_embedding, - const torch::Tensor& seq_len, - int nodeId) { - atb::Status st; - build_node_variant_pack( - rope_node_, q, k, cos_embedding, sin_embedding, seq_len); - st = execute_node(rope_node_, nodeId); - LOG_IF(FATAL, st != 0) << model_name_ - << "infer shape fail, error code: " << st; - return at_out_tensors_; -} - -void NpuRopeImpl::build_node_variant_pack(atb_speed::Model::Node& node, - const torch::Tensor& q, - const torch::Tensor& k, - const torch::Tensor& cos_embedding, - const torch::Tensor& sin_embedding, - const torch::Tensor& seq_len) { - internal_q = atb_speed::Utils::AtTensor2Tensor(q); - internal_k = atb_speed::Utils::AtTensor2Tensor(k); - internal_cos_embedding = atb_speed::Utils::AtTensor2Tensor(cos_embedding); - internal_sin_embedding = atb_speed::Utils::AtTensor2Tensor(sin_embedding); - internal_seq_len = atb_speed::Utils::AtTensor2Tensor(seq_len); - - atb::SVector ins = {internal_q, - internal_k, - internal_cos_embedding, - internal_sin_embedding, - internal_seq_len}; - node.variantPack.inTensors = ins; - - atb::SVector 
inTensorDescs; - inTensorDescs.resize(node.operation->GetInputNum()); - inTensorDescs.at(0) = internal_q.desc; - inTensorDescs.at(1) = internal_k.desc; - inTensorDescs.at(2) = internal_cos_embedding.desc; - inTensorDescs.at(3) = internal_sin_embedding.desc; - inTensorDescs.at(4) = internal_seq_len.desc; - - atb::SVector outTensorDescs; - node.operation->InferShape(inTensorDescs, outTensorDescs); - - at_out_tensors_.resize(outTensorDescs.size()); - at::Tensor output_0 = - atb_speed::Utils::CreateAtTensorFromTensorDesc(outTensorDescs.at(0)); - at_out_tensors_.at(0) = output_0; - at::Tensor output_1 = - atb_speed::Utils::CreateAtTensorFromTensorDesc(outTensorDescs.at(1)); - at_out_tensors_.at(1) = output_1; - - node.variantPack.outTensors = {atb_speed::Utils::AtTensor2Tensor(output_0), - atb_speed::Utils::AtTensor2Tensor(output_1)}; -} - -} // namespace xllm::kernel diff --git a/xllm/core/kernels/npu/impl/npu_rope_impl.h b/xllm/core/kernels/npu/impl/npu_rope_impl.h deleted file mode 100644 index 1f3ee107..00000000 --- a/xllm/core/kernels/npu/impl/npu_rope_impl.h +++ /dev/null @@ -1,85 +0,0 @@ -/* Copyright 2025 The xLLM Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - https://github.com/jd-opensource/xllm/blob/main/LICENSE - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#pragma once -#ifdef TORCH_HIGHER_THAN_PTA6 -#include -#include -#else -#include -#include -#endif - -#include - -#include - -#include "atb/atb_infer.h" -#include "framework/kv_cache/kv_cache.h" -#include "framework/model/model_input_params.h" -#include "framework/state_dict/state_dict.h" -#include "layers/npu/npu_base_layer.h" -#include "nlohmann/json.hpp" -#include "pytorch/adapter/utils/utils.h" -#include "xllm_kernels/core/include/atb_speed/base/hosttensor_binder.h" -#include "xllm_kernels/core/include/atb_speed/base/model.h" -#include "xllm_kernels/core/include/atb_speed/log.h" -#include "xllm_kernels/core/include/atb_speed/utils/model_factory.h" - -namespace xllm::kernel { - -class NpuRopeImpl : public xllm::layer::NpuBaseLayer { - public: - explicit NpuRopeImpl(const ModelContext& context); - - ~NpuRopeImpl() {}; - - void load_state_dict(const StateDict& state_dict); - - void verify_loaded_weights(const std::string weight_str) const; - - void merge_loaded_weights(); - - std::vector forward(const torch::Tensor& q, - const torch::Tensor& k, - const torch::Tensor& cos_embedding, - const torch::Tensor& sin_embedding, - const torch::Tensor& seq_len, - int nodeId); - - private: - int64_t init_node(atb_speed::Model::Node& node, atb::infer::RopeParam& param); - void build_node_variant_pack(atb_speed::Model::Node& node, - const torch::Tensor& q, - const torch::Tensor& k, - const torch::Tensor& cos_embedding, - const torch::Tensor& sin_embedding, - const torch::Tensor& seq_len); - void param_from_args(atb::infer::RopeParam& param, const ModelArgs& args); - - std::vector at_out_tensors_; - atb::Tensor internal_q; - atb::Tensor internal_k; - atb::Tensor internal_cos_embedding; - atb::Tensor internal_sin_embedding; - atb::Tensor 
internal_seq_len; - - atb_speed::Model::Node rope_node_; - std::string model_name_; - atb::infer::RopeParam rope_param_; - atb::Tensor internal_tensors_; -}; - -} // namespace xllm::kernel diff --git a/xllm/core/kernels/npu/impl/npu_rope_impl_test.cpp b/xllm/core/kernels/npu/impl/npu_rope_impl_test.cpp deleted file mode 100644 index 26a78bef..00000000 --- a/xllm/core/kernels/npu/impl/npu_rope_impl_test.cpp +++ /dev/null @@ -1,233 +0,0 @@ -/* Copyright 2025 The xLLM Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - https://github.com/jd-opensource/xllm/blob/main/LICENSE - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include -#include - -#include "kernels/npu/rope.h" - -namespace xllm::kernel { - -class NpuRopeTest : public ::testing::Test { - protected: - NpuRopeTest() : parallel_args_(1, 1, nullptr) { - try { - torch::zeros({1}, torch::TensorOptions().device("npu:0")); - - tensor_options_ = - torch::TensorOptions().dtype(torch::kFloat16).device("npu:0"); - npu_available_ = true; - std::cout << "Using NPU device" << std::endl; - - } catch (...) { - tensor_options_ = - torch::TensorOptions().dtype(torch::kFloat16).device(torch::kCPU); - npu_available_ = false; - std::cout << "Using CPU device (NPU unavailable)" << std::endl; - } - } - - void SetUp() override { - torch::manual_seed(42); - - model_args_.hidden_size() = 4096; - model_args_.num_attention_heads() = 32; - model_args_.head_dim() = 128; - model_args_.max_position_embeddings() = 2048; - model_args_.dtype() = "float16"; - - quant_args_.torch_dtype() = "float16"; - - context_ = std::make_unique( - parallel_args_, model_args_, quant_args_, tensor_options_); - } - - void TearDown() override { - context_.reset(); - - if (npu_available_) { - try { - c10_npu::npuSynchronizeDevice(); - c10_npu::NPUCachingAllocator::emptyCache(); - std::this_thread::sleep_for(std::chrono::milliseconds(500)); - } catch (...) 
{ - // NPU cleanup failures are usually not critical in test teardown - } - } - } - - StateDict CreateStateDict() { - std::unordered_map tensor_map; - // RoPE layer doesn't have trainable weights, so empty state dict - return StateDict(tensor_map, ""); - } - - ModelArgs model_args_; - QuantArgs quant_args_; - ParallelArgs parallel_args_; - torch::TensorOptions tensor_options_; - std::unique_ptr context_; - bool npu_available_ = true; -}; - -// Test NpuRopeImpl construction -TEST_F(NpuRopeTest, ConstructorTest) { - if (!npu_available_) { - GTEST_SKIP() << "Skipping NPU test - NPU device not available"; - } - - ASSERT_NO_THROW({ - auto rope = std::make_shared(*context_); - EXPECT_NE(rope, nullptr); - }); -} - -// Test Rope wrapper construction -TEST_F(NpuRopeTest, RopeWrapperTest) { - if (!npu_available_) { - GTEST_SKIP() << "Skipping NPU test - NPU device not available"; - } - - ASSERT_NO_THROW({ auto rope = Rope(*context_); }); -} - -// Test state dict loading (RoPE doesn't have weights) -TEST_F(NpuRopeTest, LoadStateDictTest) { - if (!npu_available_) { - GTEST_SKIP() << "Skipping NPU test - NPU device not available"; - } - - auto rope = std::make_shared(*context_); - auto state_dict = CreateStateDict(); - - ASSERT_NO_THROW({ rope->load_state_dict(state_dict); }); -} - -// Test weight verification (should pass as RoPE has no weights) -TEST_F(NpuRopeTest, VerifyLoadedWeightsTest) { - if (!npu_available_) { - GTEST_SKIP() << "Skipping NPU test - NPU device not available"; - } - - auto rope = std::make_shared(*context_); - - ASSERT_NO_THROW({ rope->verify_loaded_weights("test_weight"); }); -} - -// Test merge loaded weights -TEST_F(NpuRopeTest, MergeLoadedWeightsTest) { - if (!npu_available_) { - GTEST_SKIP() << "Skipping NPU test - NPU device not available"; - } - - auto rope = std::make_shared(*context_); - - ASSERT_NO_THROW({ rope->merge_loaded_weights(); }); -} - -// Test forward pass with mock input tensors following constraint specifications -TEST_F(NpuRopeTest, ForwardPassBasicTest) { - if (!npu_available_) { - GTEST_SKIP() << "Skipping NPU test - NPU device not available"; - } - - auto rope = std::make_shared(*context_); - - int64_t batch_size = 2; - std::vector seq_lengths = {8, 12}; - int64_t max_seq_len = - *std::max_element(seq_lengths.begin(), seq_lengths.end()); - int64_t ntokens = std::accumulate( - seq_lengths.begin(), seq_lengths.end(), 0); // ntokens = sum(seqlen[i]) - - int64_t head_num_q = model_args_.num_attention_heads(); // headNumQ - int64_t head_num_k = - model_args_.num_attention_heads(); // headNumK (can be <= headNumQ) - int64_t head_size = model_args_.head_dim(); - - // Ensure 32-byte alignment for hiddenSizeQ and hiddenSizeK - int64_t hidden_size_q = - head_num_q * head_size; // hiddenSizeQ = head_size * headNumQ - int64_t hidden_size_k = - head_num_k * head_size; // hiddenSizeK = head_size * headNumK - - // Validate 32-byte alignment constraint - ASSERT_EQ(hidden_size_q % 32, 0) << "hiddenSizeQ must be 32-byte aligned"; - ASSERT_EQ(hidden_size_k % 32, 0) << "hiddenSizeK must be 32-byte aligned"; - - // Create tensors with constraint-compliant dimensions - // Input format: [ntokens, hiddenSizeQ/K] for 2D tensors - auto q = torch::randn({ntokens, hidden_size_q}, tensor_options_); - auto k = torch::randn({ntokens, hidden_size_k}, tensor_options_); - - // cos/sin embeddings: [ntokens, head_size] for standard mode - auto cos_embedding = torch::randn({ntokens, head_size}, tensor_options_); - auto sin_embedding = torch::randn({ntokens, head_size}, tensor_options_); - - 
auto seq_len_tensor = - torch::tensor(seq_lengths, tensor_options_.dtype(torch::kInt32)); - - try { - auto npu_stream = c10_npu::getCurrentNPUStream(0); - auto outputs = - rope->forward(q, k, cos_embedding, sin_embedding, seq_len_tensor, 0); - aclrtSynchronizeStream(npu_stream.stream()); - - EXPECT_GE(outputs.size(), - 2); // Should return at least q_rotated and k_rotated - if (outputs.size() >= 2) { - std::cout << "Output Q tensor shape: " << outputs[0].sizes() << std::endl; - std::cout << "Output K tensor shape: " << outputs[1].sizes() << std::endl; - EXPECT_EQ(outputs[0].sizes(), q.sizes()); - EXPECT_EQ(outputs[1].sizes(), k.sizes()); - } - } catch (const std::exception& e) { - GTEST_SKIP() << "Skipping forward pass test - requires NPU environment: " - << e.what(); - } -} - -} // namespace xllm::kernel - -int main(int argc, char** argv) { - struct rlimit core_limit; - core_limit.rlim_cur = 0; - core_limit.rlim_max = 0; - setrlimit(RLIMIT_CORE, &core_limit); - - FILE* null_stderr = freopen("/dev/null", "w", stderr); - if (null_stderr == nullptr) { - fclose(stderr); - } - - ::testing::InitGoogleTest(&argc, argv); - - bool npu_available = false; - try { - auto test_tensor = - torch::zeros({1}, torch::TensorOptions().device("npu:0")); - npu_available = true; - } catch (...) { - npu_available = false; - } - - if (!npu_available) { - std::cout << "NPU device not available, skipping all tests." << std::endl; - return 0; - } - - int result = RUN_ALL_TESTS(); - _exit(result); -} \ No newline at end of file diff --git a/xllm/core/kernels/npu/impl/npu_sample_model_test.cpp b/xllm/core/kernels/npu/impl/npu_sample_model_test.cpp deleted file mode 100644 index c8cee2d4..00000000 --- a/xllm/core/kernels/npu/impl/npu_sample_model_test.cpp +++ /dev/null @@ -1,904 +0,0 @@ -/* Copyright 2025 The xLLM Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - https://github.com/jd-opensource/xllm/blob/main/LICENSE - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include -#include - -#include "kernels/npu/linear.h" -#include "kernels/npu/rms_norm.h" -#include "kernels/npu/rope.h" -#include "kernels/npu/split.h" - -namespace xllm::kernel { - -class SampleModelTest : public ::testing::Test { - protected: - SampleModelTest() : parallel_args_(1, 1, nullptr) { - try { - torch::zeros({1}, torch::TensorOptions().device("npu:0")); - - tensor_options_ = - torch::TensorOptions().dtype(torch::kFloat16).device("npu:0"); - npu_available_ = true; - std::cout << "Using NPU device" << std::endl; - - } catch (...) 
{ - tensor_options_ = - torch::TensorOptions().dtype(torch::kFloat16).device(torch::kCPU); - npu_available_ = false; - std::cout << "Using CPU device (NPU unavailable)" << std::endl; - } - } - - void SetUp() override { - torch::manual_seed(42); - - model_args_.hidden_size() = 4096; - model_args_.intermediate_size() = 11008; - model_args_.rms_norm_eps() = 1e-6f; - model_args_.dtype() = "float16"; - - q_size_ = model_args_.hidden_size(); - kv_size_ = model_args_.hidden_size(); - qkv_size_ = q_size_ + 2 * kv_size_; // q + k + v - - quant_args_.torch_dtype() = "float16"; - - context_ = std::make_unique( - parallel_args_, model_args_, quant_args_, tensor_options_); - } - - void TearDown() override { - context_.reset(); - - if (npu_available_) { - try { - c10_npu::npuSynchronizeDevice(); - c10_npu::NPUCachingAllocator::emptyCache(); - std::this_thread::sleep_for(std::chrono::milliseconds(500)); - } catch (...) { - // NPU cleanup failures are usually not critical in test teardown - } - } - } - - StateDict CreateRmsNormStateDict(const torch::Tensor& weight_tensor) { - std::unordered_map tensor_map; - tensor_map["weight"] = weight_tensor; - return StateDict(tensor_map, ""); - } - - StateDict CreateLinearStateDict(const torch::Tensor& weight_tensor) { - std::unordered_map tensor_map; - tensor_map["weight"] = weight_tensor; - return StateDict(tensor_map, ""); - } - - StateDict CreateEmptyStateDict() { - std::unordered_map tensor_map; - return StateDict(tensor_map, ""); - } - - // Helper method to create cos/sin embeddings for RoPE - std::pair CreateRopeEmbeddings( - int64_t seq_len, - int64_t head_dim) { - auto cos_embedding = torch::cos( - torch::arange(0, seq_len, tensor_options_).unsqueeze(1) * - torch::arange(0, head_dim / 2, tensor_options_).unsqueeze(0) * 0.01); - auto sin_embedding = torch::sin( - torch::arange(0, seq_len, tensor_options_).unsqueeze(1) * - torch::arange(0, head_dim / 2, tensor_options_).unsqueeze(0) * 0.01); - return std::make_pair(cos_embedding, sin_embedding); - } - - ModelArgs model_args_; - QuantArgs quant_args_; - ParallelArgs parallel_args_; - torch::TensorOptions tensor_options_; - std::unique_ptr context_; - bool npu_available_ = true; - - // QKV dimensions - int64_t q_size_; - int64_t kv_size_; - int64_t qkv_size_; - - // Attention parameters - int64_t num_heads_ = 32; - int64_t num_kv_heads_ = 32; - int64_t head_dim_ = 128; - bool attn_output_gate_ = false; -}; - -// Test RMS norm + Linear layer construction -TEST_F(SampleModelTest, ConstructorTest) { - if (!npu_available_) { - GTEST_SKIP() << "Skipping NPU test - NPU device not available"; - } - - ASSERT_NO_THROW({ - auto rms_norm = std::make_shared(*context_); - auto linear = std::make_shared(*context_); - EXPECT_NE(rms_norm, nullptr); - EXPECT_NE(linear, nullptr); - }); -} - -// Test combined RMS norm + Linear layer wrapper construction -TEST_F(SampleModelTest, WrapperConstructionTest) { - if (!npu_available_) { - GTEST_SKIP() << "Skipping NPU test - NPU device not available"; - } - - ASSERT_NO_THROW({ - auto rms_norm = RmsNorm(*context_); - auto linear = Linear(*context_); - }); -} - -// Test state dict loading for both layers -TEST_F(SampleModelTest, LoadStateDictTest) { - if (!npu_available_) { - GTEST_SKIP() << "Skipping NPU test - NPU device not available"; - } - - auto rms_norm = std::make_shared(*context_); - auto linear = std::make_shared(*context_); - - auto rms_norm_weight = - torch::randn({model_args_.hidden_size()}, tensor_options_); - auto rms_norm_state_dict = CreateRmsNormStateDict(rms_norm_weight); 
- - auto linear_weight = - torch::randn({model_args_.intermediate_size(), model_args_.hidden_size()}, - tensor_options_); - auto linear_state_dict = CreateLinearStateDict(linear_weight); - - ASSERT_NO_THROW({ - rms_norm->load_state_dict(rms_norm_state_dict); - linear->load_state_dict(linear_state_dict); - }); -} - -// Test weight verification for both layers -TEST_F(SampleModelTest, VerifyLoadedWeightsTest) { - if (!npu_available_) { - GTEST_SKIP() << "Skipping NPU test - NPU device not available"; - } - - auto rms_norm = std::make_shared(*context_); - auto linear = std::make_shared(*context_); - - auto rms_norm_weight = - torch::randn({model_args_.hidden_size()}, tensor_options_); - auto rms_norm_state_dict = CreateRmsNormStateDict(rms_norm_weight); - rms_norm->load_state_dict(rms_norm_state_dict); - - auto linear_weight = - torch::randn({model_args_.intermediate_size(), model_args_.hidden_size()}, - tensor_options_); - auto linear_state_dict = CreateLinearStateDict(linear_weight); - linear->load_state_dict(linear_state_dict); - - ASSERT_NO_THROW({ - rms_norm->verify_loaded_weights("rms_norm_weight"); - linear->verify_loaded_weights("linear_weight"); - }); -} - -// Test merge loaded weights for both layers -TEST_F(SampleModelTest, MergeLoadedWeightsTest) { - if (!npu_available_) { - GTEST_SKIP() << "Skipping NPU test - NPU device not available"; - } - - auto rms_norm = std::make_shared(*context_); - auto linear = std::make_shared(*context_); - - auto rms_norm_weight = - torch::randn({model_args_.hidden_size()}, tensor_options_); - auto rms_norm_state_dict = CreateRmsNormStateDict(rms_norm_weight); - rms_norm->load_state_dict(rms_norm_state_dict); - - auto linear_weight = - torch::randn({model_args_.intermediate_size(), model_args_.hidden_size()}, - tensor_options_); - auto linear_state_dict = CreateLinearStateDict(linear_weight); - linear->load_state_dict(linear_state_dict); - - ASSERT_NO_THROW({ - rms_norm->merge_loaded_weights(); - linear->merge_loaded_weights(); - }); -} - -// Test combined forward pass: RMS norm -> QKV projection -> Split (q, k, v) -TEST_F(SampleModelTest, CombinedForwardPassTest) { - if (!npu_available_) { - GTEST_SKIP() << "Skipping NPU test - NPU device not available"; - } - - auto rms_norm = RmsNorm(*context_); - auto qkv_proj = Linear(*context_); - auto split_layer = Split(*context_); - - auto rms_norm_weight = - torch::randn({model_args_.hidden_size()}, tensor_options_); - auto rms_norm_state_dict = CreateRmsNormStateDict(rms_norm_weight); - rms_norm->load_state_dict(rms_norm_state_dict); - rms_norm->merge_loaded_weights(); - - // Setup QKV projection weights: output size = q_size + k_size + v_size - auto qkv_weight = - torch::randn({qkv_size_, model_args_.hidden_size()}, tensor_options_); - auto qkv_state_dict = CreateLinearStateDict(qkv_weight); - qkv_proj->load_state_dict(qkv_state_dict); - qkv_proj->merge_loaded_weights(); - - // Setup split layer (no weights needed) - auto split_state_dict = CreateEmptyStateDict(); - split_layer->load_state_dict(split_state_dict); - split_layer->merge_loaded_weights(); - - // Input tensor with shape [batch_size, seq_len, hidden_size] - auto input = - torch::randn({1, 10, model_args_.hidden_size()}, tensor_options_); - - try { - std::cout << "Input tensor shape: " << input.sizes() << std::endl; - - auto npu_stream = c10_npu::getCurrentNPUStream(0); - - // Step 1: hidden_states = self.norm(hidden_states) - auto normalized_output = rms_norm(input, 0); - std::cout << "RMS norm output shape: " << normalized_output.sizes() - << 
std::endl; - - // Step 2: qkv, _ = self.qkv_proj(hidden_states) - auto qkv_output = qkv_proj(normalized_output, 0); - std::cout << "QKV projection output shape: " << qkv_output.sizes() - << std::endl; - - // Step 3: q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], - // dim=-1) - auto split_outputs = split_layer(qkv_output, 0); - - EXPECT_EQ(split_outputs.size(), 3) - << "Split should produce 3 tensors (q, k, v)"; - - std::cout << "Split outputs:" << std::endl; - for (size_t i = 0; i < split_outputs.size(); ++i) { - std::cout << " Tensor " << i << " shape: " << split_outputs[i].sizes() - << std::endl; - } - - EXPECT_EQ(normalized_output.sizes(), input.sizes()); - - // Expected QKV output shape: [batch_size, seq_len, qkv_size] - std::vector expected_qkv_shape = {1, 10, qkv_size_}; - EXPECT_EQ(qkv_output.sizes(), expected_qkv_shape); - - // Expected split output shapes - // q: [batch_size, seq_len, q_size] - // k: [batch_size, seq_len, kv_size] - // v: [batch_size, seq_len, kv_size] - std::vector expected_q_shape = {1, 10, q_size_}; - std::vector expected_kv_shape = {1, 10, kv_size_}; - - if (split_outputs.size() >= 3) { - EXPECT_EQ(split_outputs[0].sizes(), expected_q_shape) - << "Q tensor shape mismatch"; - EXPECT_EQ(split_outputs[1].sizes(), expected_kv_shape) - << "K tensor shape mismatch"; - EXPECT_EQ(split_outputs[2].sizes(), expected_kv_shape) - << "V tensor shape mismatch"; - } - - std::cout << "Combined forward pass test (norm -> qkv_proj -> split) " - "completed successfully!" - << std::endl; - aclrtSynchronizeStream(npu_stream.stream()); - } catch (const std::exception& e) { - GTEST_SKIP() - << "Skipping combined forward pass test - requires NPU environment: " - << e.what(); - } -} - -// Test combined forward pass with different batch sizes: norm -> qkv_proj -> -// split -TEST_F(SampleModelTest, CombinedForwardPassBatchTest) { - if (!npu_available_) { - GTEST_SKIP() << "Skipping NPU test - NPU device not available"; - } - - auto rms_norm = RmsNorm(*context_); - auto qkv_proj = Linear(*context_); - auto split_layer = Split(*context_); - - auto rms_norm_weight = - torch::randn({model_args_.hidden_size()}, tensor_options_); - auto rms_norm_state_dict = CreateRmsNormStateDict(rms_norm_weight); - rms_norm->load_state_dict(rms_norm_state_dict); - rms_norm->merge_loaded_weights(); - - auto qkv_weight = - torch::randn({qkv_size_, model_args_.hidden_size()}, tensor_options_); - auto qkv_state_dict = CreateLinearStateDict(qkv_weight); - qkv_proj->load_state_dict(qkv_state_dict); - qkv_proj->merge_loaded_weights(); - - auto split_state_dict = CreateEmptyStateDict(); - split_layer->load_state_dict(split_state_dict); - split_layer->merge_loaded_weights(); - - std::vector> batch_shapes = { - {1, 5, model_args_.hidden_size()}, - {2, 10, model_args_.hidden_size()}, - {4, 20, model_args_.hidden_size()}}; - - for (const auto& shape : batch_shapes) { - auto input = torch::randn(shape, tensor_options_); - - try { - auto npu_stream = c10_npu::getCurrentNPUStream(0); - - auto normalized_output = rms_norm(input, 0); - auto qkv_output = qkv_proj(normalized_output, 0); - auto split_outputs = split_layer(qkv_output, 0); - - EXPECT_EQ(normalized_output.size(0), shape[0]); - EXPECT_EQ(normalized_output.size(1), shape[1]); - EXPECT_EQ(normalized_output.size(2), shape[2]); - - EXPECT_EQ(qkv_output.size(0), shape[0]); - EXPECT_EQ(qkv_output.size(1), shape[1]); - EXPECT_EQ(qkv_output.size(2), qkv_size_); - - EXPECT_EQ(split_outputs.size(), 3); - if (split_outputs.size() >= 3) { - // Q tensor - 
EXPECT_EQ(split_outputs[0].size(0), shape[0]); - EXPECT_EQ(split_outputs[0].size(1), shape[1]); - EXPECT_EQ(split_outputs[0].size(2), q_size_); - - // K tensor - EXPECT_EQ(split_outputs[1].size(0), shape[0]); - EXPECT_EQ(split_outputs[1].size(1), shape[1]); - EXPECT_EQ(split_outputs[1].size(2), kv_size_); - - // V tensor - EXPECT_EQ(split_outputs[2].size(0), shape[0]); - EXPECT_EQ(split_outputs[2].size(1), shape[1]); - EXPECT_EQ(split_outputs[2].size(2), kv_size_); - } - aclrtSynchronizeStream(npu_stream.stream()); - } catch (const std::exception& e) { - GTEST_SKIP() << "Skipping batch processing test for shape [" << shape[0] - << ", " << shape[1] << ", " << shape[2] - << "] - requires NPU environment: " << e.what(); - break; - } - } -} - -// Test tensor data flow and numerical properties: norm -> qkv_proj -> split -TEST_F(SampleModelTest, DataFlowTest) { - if (!npu_available_) { - GTEST_SKIP() << "Skipping NPU test - NPU device not available"; - } - - auto rms_norm = RmsNorm(*context_); - auto qkv_proj = Linear(*context_); - auto split_layer = Split(*context_); - - auto rms_norm_weight = - torch::ones({model_args_.hidden_size()}, tensor_options_); - auto rms_norm_state_dict = CreateRmsNormStateDict(rms_norm_weight); - rms_norm->load_state_dict(rms_norm_state_dict); - rms_norm->merge_loaded_weights(); - - auto qkv_weight = - torch::ones({qkv_size_, model_args_.hidden_size()}, tensor_options_) * - 0.1f; - auto qkv_state_dict = CreateLinearStateDict(qkv_weight); - qkv_proj->load_state_dict(qkv_state_dict); - qkv_proj->merge_loaded_weights(); - - auto split_state_dict = CreateEmptyStateDict(); - split_layer->load_state_dict(split_state_dict); - split_layer->merge_loaded_weights(); - - auto input = torch::ones({1, 1, model_args_.hidden_size()}, tensor_options_); - - try { - auto npu_stream = c10_npu::getCurrentNPUStream(0); - - auto normalized_output = rms_norm(input, 0); - auto qkv_output = qkv_proj(normalized_output, 0); - auto split_outputs = split_layer(qkv_output, 0); - - EXPECT_FALSE(torch::isnan(normalized_output).any().item()) - << "NaN detected in normalized output"; - EXPECT_FALSE(torch::isinf(normalized_output).any().item()) - << "Inf detected in normalized output"; - - EXPECT_FALSE(torch::isnan(qkv_output).any().item()) - << "NaN detected in QKV projection output"; - EXPECT_FALSE(torch::isinf(qkv_output).any().item()) - << "Inf detected in QKV projection output"; - - EXPECT_EQ(split_outputs.size(), 3) << "Expected 3 split outputs (q, k, v)"; - - for (size_t i = 0; i < split_outputs.size(); ++i) { - EXPECT_FALSE(torch::isnan(split_outputs[i]).any().item()) - << "NaN detected in split output " << i; - EXPECT_FALSE(torch::isinf(split_outputs[i]).any().item()) - << "Inf detected in split output " << i; - } - - std::cout << "Data flow test completed - no NaN or Inf values detected in " - "pipeline!" 
- << std::endl; - aclrtSynchronizeStream(npu_stream.stream()); - } catch (const std::exception& e) { - GTEST_SKIP() << "Skipping data flow test - requires NPU environment: " - << e.what(); - } -} - -// Test QKV splitting with attention output gate functionality -TEST_F(SampleModelTest, QKVSplitWithAttentionGateTest) { - if (!npu_available_) { - GTEST_SKIP() << "Skipping NPU test - NPU device not available"; - } - - int64_t q_gate_size = q_size_ * 2; // q + gate - int64_t qkv_gate_size = q_gate_size + 2 * kv_size_; // (q + gate) + k + v - - auto rms_norm = RmsNorm(*context_); - auto qkv_proj = Linear(*context_); - auto split_layer = Split(*context_, 2, 3, {q_gate_size, kv_size_, kv_size_}); - - // Setup for attention output gate mode - attn_output_gate_ = true; - - auto rms_norm_weight = - torch::randn({model_args_.hidden_size()}, tensor_options_); - auto rms_norm_state_dict = CreateRmsNormStateDict(rms_norm_weight); - rms_norm->load_state_dict(rms_norm_state_dict); - rms_norm->merge_loaded_weights(); - - // QKV projection with gate: output size = (q_size * 2) + k_size + v_size - auto qkv_weight = - torch::randn({qkv_gate_size, model_args_.hidden_size()}, tensor_options_); - auto qkv_state_dict = CreateLinearStateDict(qkv_weight); - qkv_proj->load_state_dict(qkv_state_dict); - qkv_proj->merge_loaded_weights(); - - auto split_state_dict = CreateEmptyStateDict(); - split_layer->load_state_dict(split_state_dict); - split_layer->merge_loaded_weights(); - - auto input = - torch::randn({1, 10, model_args_.hidden_size()}, tensor_options_); - - try { - auto npu_stream = c10_npu::getCurrentNPUStream(0); - - auto normalized_output = rms_norm(input, 0); - - auto qkv_output = qkv_proj(normalized_output, 0); - std::cout << "QKV with gate output shape: " << qkv_output.sizes() - << std::endl; - - auto split_outputs = split_layer(qkv_output, 0); - EXPECT_EQ(split_outputs.size(), 3) - << "Split should produce 3 tensors (q_gate, k, v)"; - - if (split_outputs.size() >= 3) { - auto q_gate = split_outputs[0]; - auto k = split_outputs[1]; - auto v = split_outputs[2]; - - std::cout << "Q+Gate tensor shape: " << q_gate.sizes() << std::endl; - std::cout << "K tensor shape: " << k.sizes() << std::endl; - std::cout << "V tensor shape: " << v.sizes() << std::endl; - - std::vector expected_q_gate_shape = {1, 10, q_gate_size}; - std::vector expected_kv_shape = {1, 10, kv_size_}; - - EXPECT_EQ(q_gate.sizes(), expected_q_gate_shape) - << "Q+Gate tensor shape mismatch"; - EXPECT_EQ(k.sizes(), expected_kv_shape) << "K tensor shape mismatch"; - EXPECT_EQ(v.sizes(), expected_kv_shape) << "V tensor shape mismatch"; - - // q_gate = q_gate.view(*orig_shape, self.num_heads, -1) - auto orig_shape = q_gate.sizes(); - auto q_gate_reshaped = - q_gate.view({orig_shape[0], orig_shape[1], num_heads_, -1}); - - // q, gate = torch.chunk(q_gate, 2, dim=-1) - auto q_gate_chunks = torch::chunk(q_gate_reshaped, 2, -1); - EXPECT_EQ(q_gate_chunks.size(), 2) << "Should split q_gate into 2 chunks"; - - if (q_gate_chunks.size() >= 2) { - auto q = q_gate_chunks[0]; - auto gate = q_gate_chunks[1]; - - q = q.reshape({orig_shape[0], orig_shape[1], -1}); - gate = gate.reshape({orig_shape[0], orig_shape[1], -1}); - - std::cout << "Final Q shape: " << q.sizes() << std::endl; - std::cout << "Final Gate shape: " << gate.sizes() << std::endl; - - std::vector expected_final_q_shape = {1, 10, q_size_}; - EXPECT_EQ(q.sizes(), expected_final_q_shape) - << "Final Q tensor shape mismatch"; - EXPECT_EQ(gate.sizes(), expected_final_q_shape) - << "Gate tensor shape 
mismatch"; - } - } - - std::cout << "QKV split with attention gate test completed successfully!" - << std::endl; - aclrtSynchronizeStream(npu_stream.stream()); - } catch (const std::exception& e) { - GTEST_SKIP() << "Skipping attention gate test - requires NPU environment: " - << e.what(); - } -} - -// Test standard QKV splitting (without attention gate) -TEST_F(SampleModelTest, StandardQKVSplitTest) { - if (!npu_available_) { - GTEST_SKIP() << "Skipping NPU test - NPU device not available"; - } - - auto rms_norm = RmsNorm(*context_); - auto qkv_proj = Linear(*context_); - auto split_layer = Split(*context_); - - attn_output_gate_ = false; - - auto rms_norm_weight = - torch::randn({model_args_.hidden_size()}, tensor_options_); - auto rms_norm_state_dict = CreateRmsNormStateDict(rms_norm_weight); - rms_norm->load_state_dict(rms_norm_state_dict); - rms_norm->merge_loaded_weights(); - - // Standard QKV projection: output size = q_size + k_size + v_size - auto qkv_weight = - torch::randn({qkv_size_, model_args_.hidden_size()}, tensor_options_); - auto qkv_state_dict = CreateLinearStateDict(qkv_weight); - qkv_proj->load_state_dict(qkv_state_dict); - qkv_proj->merge_loaded_weights(); - - auto split_state_dict = CreateEmptyStateDict(); - split_layer->load_state_dict(split_state_dict); - split_layer->merge_loaded_weights(); - - auto input = - torch::randn({1, 10, model_args_.hidden_size()}, tensor_options_); - - try { - auto npu_stream = c10_npu::getCurrentNPUStream(0); - - auto normalized_output = rms_norm(input, 0); - - auto qkv_output = qkv_proj(normalized_output, 0); - std::cout << "Standard QKV output shape: " << qkv_output.sizes() - << std::endl; - - auto split_outputs = split_layer(qkv_output, 0); - EXPECT_EQ(split_outputs.size(), 3) - << "Split should produce 3 tensors (q, k, v)"; - - if (split_outputs.size() >= 3) { - auto q = split_outputs[0]; - auto k = split_outputs[1]; - auto v = split_outputs[2]; - - std::cout << "Q tensor shape: " << q.sizes() << std::endl; - std::cout << "K tensor shape: " << k.sizes() << std::endl; - std::cout << "V tensor shape: " << v.sizes() << std::endl; - - std::vector expected_q_shape = {1, 10, q_size_}; - std::vector expected_kv_shape = {1, 10, kv_size_}; - - EXPECT_EQ(q.sizes(), expected_q_shape) << "Q tensor shape mismatch"; - EXPECT_EQ(k.sizes(), expected_kv_shape) << "K tensor shape mismatch"; - EXPECT_EQ(v.sizes(), expected_kv_shape) << "V tensor shape mismatch"; - } - - std::cout << "Standard QKV split test completed successfully!" 
<< std::endl; - aclrtSynchronizeStream(npu_stream.stream()); - } catch (const std::exception& e) { - GTEST_SKIP() << "Skipping standard QKV test - requires NPU environment: " - << e.what(); - } -} - -// Test Q and K normalization functionality -TEST_F(SampleModelTest, QKNormalizationTest) { - if (!npu_available_) { - GTEST_SKIP() << "Skipping NPU test - NPU device not available"; - } - - auto rms_norm = RmsNorm(*context_); - auto qkv_proj = Linear(*context_); - auto split_layer = Split(*context_); - auto q_norm = RmsNorm(*context_); - auto k_norm = RmsNorm(*context_); - - auto rms_norm_weight = - torch::randn({model_args_.hidden_size()}, tensor_options_); - auto rms_norm_state_dict = CreateRmsNormStateDict(rms_norm_weight); - rms_norm->load_state_dict(rms_norm_state_dict); - rms_norm->merge_loaded_weights(); - - auto qkv_weight = - torch::randn({qkv_size_, model_args_.hidden_size()}, tensor_options_); - auto qkv_state_dict = CreateLinearStateDict(qkv_weight); - qkv_proj->load_state_dict(qkv_state_dict); - qkv_proj->merge_loaded_weights(); - - auto split_state_dict = CreateEmptyStateDict(); - split_layer->load_state_dict(split_state_dict); - split_layer->merge_loaded_weights(); - - auto q_norm_weight = torch::randn({head_dim_}, tensor_options_); - auto q_norm_state_dict = CreateRmsNormStateDict(q_norm_weight); - q_norm->load_state_dict(q_norm_state_dict); - q_norm->merge_loaded_weights(); - - auto k_norm_weight = torch::randn({head_dim_}, tensor_options_); - auto k_norm_state_dict = CreateRmsNormStateDict(k_norm_weight); - k_norm->load_state_dict(k_norm_state_dict); - k_norm->merge_loaded_weights(); - - auto input = - torch::randn({1, 10, model_args_.hidden_size()}, tensor_options_); - - try { - auto npu_stream = c10_npu::getCurrentNPUStream(0); - - // Forward pass: norm -> qkv_proj -> split -> q_norm/k_norm - auto normalized_output = rms_norm(input, 0); - auto qkv_output = qkv_proj(normalized_output, 0); - auto split_outputs = split_layer(qkv_output, 0); - - EXPECT_EQ(split_outputs.size(), 3) << "Expected 3 split outputs"; - - if (split_outputs.size() >= 3) { - auto q = split_outputs[0]; - auto k = split_outputs[1]; - auto v = split_outputs[2]; - - // Reshape Q and K for normalization: [batch, seq, num_heads, head_dim] - auto q_reshaped = q.view({-1, num_heads_, head_dim_}); - auto k_reshaped = k.view({-1, num_kv_heads_, head_dim_}); - - std::cout << "Q reshaped for norm: " << q_reshaped.sizes() << std::endl; - std::cout << "K reshaped for norm: " << k_reshaped.sizes() << std::endl; - - auto q_normalized = q_norm(q_reshaped, 0); - auto k_normalized = k_norm(k_reshaped, 0); - - q_normalized = q_normalized.view({1, -1, num_heads_ * head_dim_}); - k_normalized = k_normalized.view({1, -1, num_kv_heads_ * head_dim_}); - - std::cout << "Q after norm: " << q_normalized.sizes() << std::endl; - std::cout << "K after norm: " << k_normalized.sizes() << std::endl; - - EXPECT_FALSE(torch::isnan(q_normalized).any().item()) - << "NaN detected in normalized Q"; - EXPECT_FALSE(torch::isinf(q_normalized).any().item()) - << "Inf detected in normalized Q"; - EXPECT_FALSE(torch::isnan(k_normalized).any().item()) - << "NaN detected in normalized K"; - EXPECT_FALSE(torch::isinf(k_normalized).any().item()) - << "Inf detected in normalized K"; - - EXPECT_EQ(q_normalized.sizes(), q.sizes()) - << "Q shape changed after norm"; - EXPECT_EQ(k_normalized.sizes(), k.sizes()) - << "K shape changed after norm"; - } - - std::cout << "Q and K normalization test completed successfully!" 
- << std::endl; - aclrtSynchronizeStream(npu_stream.stream()); - } catch (const std::exception& e) { - GTEST_SKIP() << "Skipping Q/K norm test - requires NPU environment: " - << e.what(); - } -} - -// Comprehensive test: norm -> qkv_proj -> split -> q_norm/k_norm -> rope -TEST_F(SampleModelTest, CompleteAttentionPipelineTest) { - if (!npu_available_) { - GTEST_SKIP() << "Skipping NPU test - NPU device not available"; - } - - auto rms_norm = RmsNorm(*context_); - auto qkv_proj = Linear(*context_); - auto split_layer = Split(*context_); - auto q_norm = RmsNorm(*context_); - auto k_norm = RmsNorm(*context_); - auto rope_layer = Rope(*context_); - - auto rms_norm_weight = - torch::randn({model_args_.hidden_size()}, tensor_options_); - auto rms_norm_state_dict = CreateRmsNormStateDict(rms_norm_weight); - rms_norm->load_state_dict(rms_norm_state_dict); - rms_norm->merge_loaded_weights(); - - auto qkv_weight = - torch::randn({qkv_size_, model_args_.hidden_size()}, tensor_options_); - auto qkv_state_dict = CreateLinearStateDict(qkv_weight); - qkv_proj->load_state_dict(qkv_state_dict); - qkv_proj->merge_loaded_weights(); - - auto split_state_dict = CreateEmptyStateDict(); - split_layer->load_state_dict(split_state_dict); - split_layer->merge_loaded_weights(); - - auto q_norm_weight = torch::randn({head_dim_}, tensor_options_); - auto q_norm_state_dict = CreateRmsNormStateDict(q_norm_weight); - q_norm->load_state_dict(q_norm_state_dict); - q_norm->merge_loaded_weights(); - - auto k_norm_weight = torch::randn({head_dim_}, tensor_options_); - auto k_norm_state_dict = CreateRmsNormStateDict(k_norm_weight); - k_norm->load_state_dict(k_norm_state_dict); - k_norm->merge_loaded_weights(); - - auto rope_state_dict = CreateEmptyStateDict(); - rope_layer->load_state_dict(rope_state_dict); - rope_layer->merge_loaded_weights(); - - std::vector> test_shapes = { - {1, 5, model_args_.hidden_size()}, - {2, 10, model_args_.hidden_size()}, - {1, 20, model_args_.hidden_size()}}; - - for (const auto& shape : test_shapes) { - auto input = torch::randn(shape, tensor_options_); - int64_t seq_len = shape[1]; - - try { - auto npu_stream = c10_npu::getCurrentNPUStream(0); - - std::cout << "\nTesting complete pipeline with input shape: " << shape[0] - << "x" << shape[1] << "x" << shape[2] << std::endl; - - auto normalized_output = rms_norm(input, 0); - - auto qkv_output = qkv_proj(normalized_output, 0); - - auto split_outputs = split_layer(qkv_output, 0); - EXPECT_EQ(split_outputs.size(), 3) << "Expected 3 split outputs"; - - if (split_outputs.size() >= 3) { - auto q = split_outputs[0]; - auto k = split_outputs[1]; - auto v = split_outputs[2]; - - auto q_reshaped = q.view({-1, num_heads_, head_dim_}); - auto k_reshaped = k.view({-1, num_kv_heads_, head_dim_}); - - auto q_normalized = q_norm(q_reshaped, 0); - auto k_normalized = k_norm(k_reshaped, 0); - - q_normalized = q_normalized.view({-1, num_heads_ * head_dim_}); - k_normalized = k_normalized.view({-1, num_kv_heads_ * head_dim_}); - - auto rope_embeddings = CreateRopeEmbeddings(seq_len, head_dim_); - auto cos_embedding = rope_embeddings.first; - auto sin_embedding = rope_embeddings.second; - auto seq_len_tensor = - torch::tensor({seq_len}, tensor_options_.dtype(torch::kInt32)); - - auto rope_outputs = rope_layer->forward(q_normalized, - k_normalized, - cos_embedding, - sin_embedding, - seq_len_tensor, - 0); - - EXPECT_EQ(rope_outputs.size(), 2) << "Expected 2 RoPE outputs"; - - if (rope_outputs.size() >= 2) { - auto q_final = rope_outputs[0]; - auto k_final = 
rope_outputs[1]; - - std::cout << "Final Q shape: " << q_final.sizes() << std::endl; - std::cout << "Final K shape: " << k_final.sizes() << std::endl; - std::cout << "V shape: " << v.sizes() << std::endl; - - // Verify final shapes - // EXPECT_EQ(q_final.size(0), shape[0]) << "Batch size mismatch"; - // EXPECT_EQ(q_final.size(1), shape[1]) << "Sequence length mismatch"; - // EXPECT_EQ(k_final.size(0), shape[0]) << "Batch size mismatch"; - // EXPECT_EQ(k_final.size(1), shape[1]) << "Sequence length mismatch"; - - EXPECT_EQ(q_final.sizes(), q_normalized.sizes()); - EXPECT_EQ(k_final.sizes(), k_normalized.sizes()); - - EXPECT_FALSE(torch::isnan(q_final).any().item()) - << "NaN detected in final Q"; - EXPECT_FALSE(torch::isinf(q_final).any().item()) - << "Inf detected in final Q"; - EXPECT_FALSE(torch::isnan(k_final).any().item()) - << "NaN detected in final K"; - EXPECT_FALSE(torch::isinf(k_final).any().item()) - << "Inf detected in final K"; - EXPECT_FALSE(torch::isnan(v).any().item()) - << "NaN detected in V"; - EXPECT_FALSE(torch::isinf(v).any().item()) - << "Inf detected in V"; - - std::cout << "Complete pipeline test passed for shape [" << shape[0] - << ", " << shape[1] << ", " << shape[2] << "]" << std::endl; - } - } - - aclrtSynchronizeStream(npu_stream.stream()); - } catch (const std::exception& e) { - GTEST_SKIP() << "Skipping complete pipeline test for shape [" << shape[0] - << ", " << shape[1] << ", " << shape[2] - << "] - requires NPU environment: " << e.what(); - break; - } - } - - std::cout << "\nComplete attention pipeline test completed successfully!" - << std::endl; -} - -} // namespace xllm::kernel - -int main(int argc, char** argv) { - struct rlimit core_limit; - core_limit.rlim_cur = 0; - core_limit.rlim_max = 0; - setrlimit(RLIMIT_CORE, &core_limit); - - FILE* null_stderr = freopen("/dev/null", "w", stderr); - if (null_stderr == nullptr) { - fclose(stderr); - } - - ::testing::InitGoogleTest(&argc, argv); - - bool npu_available = false; - try { - auto test_tensor = - torch::zeros({1}, torch::TensorOptions().device("npu:0")); - npu_available = true; - } catch (...) { - npu_available = false; - } - - if (!npu_available) { - std::cout << "NPU device not available, skipping all tests." << std::endl; - return 0; - } - - int result = RUN_ALL_TESTS(); - _exit(result); -} \ No newline at end of file diff --git a/xllm/core/kernels/npu/impl/npu_split_impl.cpp b/xllm/core/kernels/npu/impl/npu_split_impl.cpp deleted file mode 100644 index a1346ec2..00000000 --- a/xllm/core/kernels/npu/impl/npu_split_impl.cpp +++ /dev/null @@ -1,135 +0,0 @@ -/* Copyright 2025 The xLLM Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - https://github.com/jd-opensource/xllm/blob/main/LICENSE - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-==============================================================================*/ - -#include "npu_split_impl.h" - -#include - -namespace xllm::kernel { - -void NpuSplitImpl::param_from_args(atb::infer::SplitParam& param, - const ModelArgs& args, - int32_t splitDim, - int32_t splitNum, - atb::SVector splitSizes) { - param.splitDim = splitDim; - param.splitNum = splitNum; - param.splitSizes = splitSizes; -} - -int64_t NpuSplitImpl::init_node(atb_speed::Model::Node& node, - atb::infer::SplitParam& param) { - name_ = "split"; - model_name_ = "llm"; - run_task_func_ = std::bind(&NpuSplitImpl::run_task, - this, - std::placeholders::_1, - std::placeholders::_2); - - atb::Operation* operation = nullptr; - atb::Status atbStatus = atb::CreateOperation(param, &operation); - if (atbStatus != atb::NO_ERROR) { - return atbStatus; - } - - node.operation.reset(operation); - if (node.operation == nullptr) { - LOG(ERROR) << "node.operation is null"; - return -1; - } - if (node.operation->GetInputNum() < 1) { - LOG(ERROR) << "Can not resize number which is smaller than 1"; - return -1; - } - - return atb::NO_ERROR; -} - -NpuSplitImpl::NpuSplitImpl(const ModelContext& context, - int32_t splitDim, - int32_t splitNum, - atb::SVector splitSizes) - : NpuBaseLayer(context) { - param_from_args( - split_param_, context.get_model_args(), splitDim, splitNum, splitSizes); - - at_weight_tensors_.resize(1); - atb_weight_tensors_.resize(1); - at_out_tensors_.resize(3); - - auto options = context.get_tensor_options(); - dtype_ = c10::typeMetaToScalarType(options.dtype()); - at_weight_tensors_[0] = torch::zeros({1}).to(options); - - atb::Status status = init_node(split_node_, split_param_); - if (status != atb::NO_ERROR) { - LOG(ERROR) << "Failed to initialize node, status: " << status; - LOG(FATAL) << "NpuSplitImpl initialization failed with status: " - << std::to_string(status); - } -} - -void NpuSplitImpl::verify_loaded_weights(const std::string weight_str) const { - // No operation needed for split layer -} - -void NpuSplitImpl::merge_loaded_weights() { - // No operation needed for split layer -} - -void NpuSplitImpl::load_state_dict(const StateDict& state_dict) { - // No operation needed for split layer -} - -std::vector NpuSplitImpl::forward(const torch::Tensor& input, - int nodeId) { - atb::Status st; - build_node_variant_pack(split_node_, input); - st = execute_node(split_node_, nodeId); - LOG_IF(FATAL, st != 0) << model_name_ - << "infer shape fail, error code: " << st; - return at_out_tensors_; -} - -void NpuSplitImpl::build_node_variant_pack(atb_speed::Model::Node& node, - const torch::Tensor& input) { - internal_input = atb_speed::Utils::AtTensor2Tensor(input); - - atb::SVector ins = {internal_input}; - node.variantPack.inTensors = ins; - - atb::SVector inTensorDescs; - inTensorDescs.resize(node.operation->GetInputNum()); - inTensorDescs.at(0) = internal_input.desc; - - atb::SVector outTensorDescs; - node.operation->InferShape(inTensorDescs, outTensorDescs); - - at::Tensor output_0 = - atb_speed::Utils::CreateAtTensorFromTensorDesc(outTensorDescs.at(0)); - at_out_tensors_.at(0) = output_0; - at::Tensor output_1 = - atb_speed::Utils::CreateAtTensorFromTensorDesc(outTensorDescs.at(1)); - at_out_tensors_.at(1) = output_1; - at::Tensor output_2 = - atb_speed::Utils::CreateAtTensorFromTensorDesc(outTensorDescs.at(2)); - at_out_tensors_.at(2) = output_2; - - node.variantPack.outTensors = {atb_speed::Utils::AtTensor2Tensor(output_0), - atb_speed::Utils::AtTensor2Tensor(output_1), - 
atb_speed::Utils::AtTensor2Tensor(output_2)}; -} - -} // namespace xllm::kernel diff --git a/xllm/core/kernels/npu/impl/npu_split_impl.h b/xllm/core/kernels/npu/impl/npu_split_impl.h deleted file mode 100644 index c8f85ae8..00000000 --- a/xllm/core/kernels/npu/impl/npu_split_impl.h +++ /dev/null @@ -1,80 +0,0 @@ -/* Copyright 2025 The xLLM Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - https://github.com/jd-opensource/xllm/blob/main/LICENSE - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#pragma once -#ifdef TORCH_HIGHER_THAN_PTA6 -#include -#include -#else -#include -#include -#endif - -#include - -#include - -#include "atb/atb_infer.h" -#include "framework/kv_cache/kv_cache.h" -#include "framework/model/model_input_params.h" -#include "framework/state_dict/state_dict.h" -#include "layers/npu/npu_base_layer.h" -#include "nlohmann/json.hpp" -#include "pytorch/adapter/utils/utils.h" -#include "xllm_kernels/core/include/atb_speed/base/hosttensor_binder.h" -#include "xllm_kernels/core/include/atb_speed/base/model.h" -#include "xllm_kernels/core/include/atb_speed/log.h" -#include "xllm_kernels/core/include/atb_speed/utils/model_factory.h" - -namespace xllm::kernel { - -class NpuSplitImpl : public xllm::layer::NpuBaseLayer { - public: - explicit NpuSplitImpl(const ModelContext& context, - int32_t splitDim = 2, - int32_t splitNum = 3, - atb::SVector splitSizes = {}); - - ~NpuSplitImpl() {}; - - void load_state_dict(const StateDict& state_dict); - - void verify_loaded_weights(const std::string weight_str) const; - - void merge_loaded_weights(); - - std::vector forward(const torch::Tensor& input, int nodeId); - - private: - int64_t init_node(atb_speed::Model::Node& node, - atb::infer::SplitParam& param); - void build_node_variant_pack(atb_speed::Model::Node& node, - const torch::Tensor& input); - void param_from_args(atb::infer::SplitParam& param, - const ModelArgs& args, - int32_t splitDim, - int32_t splitNum, - atb::SVector splitSizes); - - std::vector at_out_tensors_; - atb::Tensor internal_input; - - atb_speed::Model::Node split_node_; - std::string model_name_; - atb::infer::SplitParam split_param_; - atb::Tensor internal_tensors_; -}; - -} // namespace xllm::kernel diff --git a/xllm/core/kernels/npu/impl/npu_split_test.cpp b/xllm/core/kernels/npu/impl/npu_split_test.cpp deleted file mode 100644 index 2e28d26c..00000000 --- a/xllm/core/kernels/npu/impl/npu_split_test.cpp +++ /dev/null @@ -1,356 +0,0 @@ -/* Copyright 2025 The xLLM Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - https://github.com/jd-opensource/xllm/blob/main/LICENSE - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include -#include - -#include "kernels/npu/split.h" - -namespace xllm::kernel { - -class NpuSplitTest : public ::testing::Test { - protected: - NpuSplitTest() : parallel_args_(1, 1, nullptr) { - try { - torch::zeros({1}, torch::TensorOptions().device("npu:0")); - - tensor_options_ = - torch::TensorOptions().dtype(torch::kFloat16).device("npu:0"); - npu_available_ = true; - std::cout << "Using NPU device" << std::endl; - - } catch (...) { - tensor_options_ = - torch::TensorOptions().dtype(torch::kFloat16).device(torch::kCPU); - npu_available_ = false; - std::cout << "Using CPU device (NPU unavailable)" << std::endl; - } - } - - void SetUp() override { - torch::manual_seed(42); - - model_args_.hidden_size() = 4096 * 3; - model_args_.intermediate_size() = 11008; - model_args_.dtype() = "float16"; - - quant_args_.torch_dtype() = "float16"; - - context_ = std::make_unique( - parallel_args_, model_args_, quant_args_, tensor_options_); - } - - void TearDown() override { - context_.reset(); - - if (npu_available_) { - try { - c10_npu::npuSynchronizeDevice(); - c10_npu::NPUCachingAllocator::emptyCache(); - std::this_thread::sleep_for(std::chrono::milliseconds(500)); - } catch (...) { - // NPU cleanup failures are usually not critical in test teardown - } - } - } - - StateDict CreateEmptyStateDict() { - std::unordered_map tensor_map; - return StateDict(tensor_map, ""); - } - - ModelArgs model_args_; - QuantArgs quant_args_; - ParallelArgs parallel_args_; - torch::TensorOptions tensor_options_; - std::unique_ptr context_; - bool npu_available_ = true; -}; - -// Test NpuSplitImpl construction -TEST_F(NpuSplitTest, ConstructorTest) { - if (!npu_available_) { - GTEST_SKIP() << "Skipping NPU test - NPU device not available"; - } - - ASSERT_NO_THROW({ - auto split = std::make_shared(*context_); - EXPECT_NE(split, nullptr); - }); -} - -// Test Split wrapper construction -TEST_F(NpuSplitTest, SplitWrapperTest) { - if (!npu_available_) { - GTEST_SKIP() << "Skipping NPU test - NPU device not available"; - } - - ASSERT_NO_THROW({ auto split = Split(*context_); }); -} - -// Test state dict loading (should be no-op for split layer) -TEST_F(NpuSplitTest, LoadStateDictTest) { - if (!npu_available_) { - GTEST_SKIP() << "Skipping NPU test - NPU device not available"; - } - - auto split = std::make_shared(*context_); - auto state_dict = CreateEmptyStateDict(); - - ASSERT_NO_THROW({ split->load_state_dict(state_dict); }); -} - -// Test weight verification (should pass for split layer as it has no weights) -TEST_F(NpuSplitTest, VerifyLoadedWeightsTest) { - if (!npu_available_) { - GTEST_SKIP() << "Skipping NPU test - NPU device not available"; - } - - auto split = std::make_shared(*context_); - - ASSERT_NO_THROW({ split->verify_loaded_weights("test_weight"); }); -} - -// Test merge loaded weights (should be no-op for split layer) -TEST_F(NpuSplitTest, MergeLoadedWeightsTest) { - if (!npu_available_) { - GTEST_SKIP() << "Skipping NPU test - NPU device not available"; - } - - auto split = std::make_shared(*context_); - - ASSERT_NO_THROW({ split->merge_loaded_weights(); }); -} - -// Test forward pass with basic input -TEST_F(NpuSplitTest, ForwardPassBasicTest) { - if (!npu_available_) { - GTEST_SKIP() << "Skipping NPU test - NPU device not available"; - } - - auto split = Split(*context_); - - // Input tensor with shape [batch_size, 
seq_len, hidden_size] - auto input = - torch::randn({1, 10, model_args_.hidden_size() * 3}, tensor_options_); - - try { - auto npu_stream = c10_npu::getCurrentNPUStream(0); - auto outputs = split->forward(input, 0); - aclrtSynchronizeStream(npu_stream.stream()); - std::cout << "Input tensor shape: " << input.sizes() << std::endl; - std::cout << "Number of output tensors: " << outputs.size() << std::endl; - - EXPECT_EQ(outputs.size(), 3); - - for (size_t i = 0; i < outputs.size(); ++i) { - EXPECT_EQ(outputs[i].size(0), 1); // batch size - EXPECT_EQ(outputs[i].size(1), 10); // sequence length - std::cout << "Output " << i << " shape: " << outputs[i].sizes() - << std::endl; - } - - int64_t total_output_features = 0; - for (const auto& output : outputs) { - total_output_features += output.size(2); - } - EXPECT_EQ(total_output_features, model_args_.hidden_size() * 3); - - } catch (const std::exception& e) { - GTEST_SKIP() << "Skipping forward pass test - requires NPU environment: " - << e.what(); - } -} - -// Test split functionality with different input shapes -TEST_F(NpuSplitTest, SplitDifferentInputShapesTest) { - if (!npu_available_) { - GTEST_SKIP() << "Skipping NPU test - NPU device not available"; - } - - auto split = Split(*context_); - - std::vector> input_shapes = { - {1, 5, model_args_.hidden_size()}, - {2, 10, model_args_.hidden_size()}, - {4, 20, model_args_.hidden_size()}, - {1, 1, model_args_.hidden_size()}}; - - for (const auto& shape : input_shapes) { - auto input = torch::randn(shape, tensor_options_); - - try { - auto npu_stream = c10_npu::getCurrentNPUStream(0); - auto outputs = split->forward(input, 0); - aclrtSynchronizeStream(npu_stream.stream()); - - EXPECT_EQ(outputs.size(), 3); - - for (const auto& output : outputs) { - EXPECT_EQ(output.size(0), shape[0]); - EXPECT_EQ(output.size(1), shape[1]); - EXPECT_GT(output.size(2), 0); - } - - int64_t total_features = 0; - for (const auto& output : outputs) { - total_features += output.size(2); - } - EXPECT_EQ(total_features, shape[2]); - - } catch (const std::exception& e) { - GTEST_SKIP() << "Skipping shape test for [" << shape[0] << ", " - << shape[1] << ", " << shape[2] - << "] - requires NPU environment: " << e.what(); - break; - } - } -} - -// Test split with different hidden sizes -TEST_F(NpuSplitTest, SplitDifferentHiddenSizesTest) { - if (!npu_available_) { - GTEST_SKIP() << "Skipping NPU test - NPU device not available"; - } - - std::vector hidden_sizes = { - 768 * 3, 1024 * 3, 2048 * 3, 4096 * 3, 6144 * 3, 8192 * 3}; - auto npu_stream = c10_npu::getCurrentNPUStream(0); - for (auto hidden_size : hidden_sizes) { - model_args_.hidden_size() = hidden_size; - - QuantArgs local_quant_args = quant_args_; - local_quant_args.torch_dtype() = "float16"; - - auto context = std::make_unique( - parallel_args_, model_args_, local_quant_args, tensor_options_); - - try { - auto split = Split(*context); - - auto input = torch::randn({1, 10, hidden_size}, tensor_options_); - - auto npu_stream = c10_npu::getCurrentNPUStream(0); - auto outputs = split->forward(input, 0); - aclrtSynchronizeStream(npu_stream.stream()); - aclrtSynchronizeStream(npu_stream.stream()); - EXPECT_EQ(outputs.size(), 3); - - int64_t total_features = 0; - for (const auto& output : outputs) { - total_features += output.size(2); - } - EXPECT_EQ(total_features, hidden_size); - - } catch (const std::exception& e) { - GTEST_SKIP() << "Skipping hidden size test for " << hidden_size - << " - requires NPU environment: " << e.what(); - break; - } - } -} - -// Test error 
handling with invalid inputs -TEST_F(NpuSplitTest, ErrorHandlingTest) { - if (!npu_available_) { - GTEST_SKIP() << "Skipping NPU test - NPU device not available"; - } - - auto split = Split(*context_); - - try { - auto empty_input = torch::empty({0, 0, 0}, tensor_options_); - auto npu_stream = c10_npu::getCurrentNPUStream(0); - auto outputs = split->forward(empty_input, 0); - aclrtSynchronizeStream(npu_stream.stream()); - } catch (const std::exception& e) { - std::cout << "Correctly caught expected error for empty tensor: " - << e.what() << std::endl; - } - - try { - auto wrong_dim_input = - torch::randn({10, model_args_.hidden_size()}, tensor_options_); - auto npu_stream = c10_npu::getCurrentNPUStream(0); - auto outputs = split->forward(wrong_dim_input, 0); - aclrtSynchronizeStream(npu_stream.stream()); - } catch (const std::exception& e) { - std::cout << "Caught error for 2D input: " << e.what() << std::endl; - } -} - -// Test consistency of split operation -TEST_F(NpuSplitTest, SplitConsistencyTest) { - if (!npu_available_) { - GTEST_SKIP() << "Skipping NPU test - NPU device not available"; - } - - auto split = Split(*context_); - auto input1 = - torch::randn({2, 5, model_args_.hidden_size()}, tensor_options_); - - try { - auto npu_stream = c10_npu::getCurrentNPUStream(0); - - auto outputs1 = split->forward(input1, 0); - aclrtSynchronizeStream(npu_stream.stream()); - - auto outputs2 = split->forward(input1, 1); - aclrtSynchronizeStream(npu_stream.stream()); - - EXPECT_EQ(outputs1.size(), outputs2.size()); - - for (size_t i = 0; i < outputs1.size(); ++i) { - EXPECT_TRUE(outputs1[i].sizes().equals(outputs2[i].sizes())); - } - - } catch (const std::exception& e) { - GTEST_SKIP() << "Skipping consistency test - requires NPU environment: " - << e.what(); - } -} - -} // namespace xllm::kernel - -int main(int argc, char** argv) { - struct rlimit core_limit; - core_limit.rlim_cur = 0; - core_limit.rlim_max = 0; - setrlimit(RLIMIT_CORE, &core_limit); - - FILE* null_stderr = freopen("/dev/null", "w", stderr); - if (null_stderr == nullptr) { - fclose(stderr); - } - - ::testing::InitGoogleTest(&argc, argv); - - bool npu_available = false; - try { - auto test_tensor = - torch::zeros({1}, torch::TensorOptions().device("npu:0")); - npu_available = true; - } catch (...) { - npu_available = false; - } - - if (!npu_available) { - std::cout << "NPU device not available, skipping all tests." << std::endl; - return 0; - } - - int result = RUN_ALL_TESTS(); - _exit(result); -} \ No newline at end of file diff --git a/xllm/core/kernels/npu/linear.h b/xllm/core/kernels/npu/matmul.cpp similarity index 61% rename from xllm/core/kernels/npu/linear.h rename to xllm/core/kernels/npu/matmul.cpp index 0834c014..0b80c9dd 100644 --- a/xllm/core/kernels/npu/linear.h +++ b/xllm/core/kernels/npu/matmul.cpp @@ -13,18 +13,19 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#pragma once -#include "impl/npu_linear_impl.h" - -namespace xllm::kernel { - -class Linear : public torch::nn::ModuleHolder { - public: - using torch::nn::ModuleHolder::ModuleHolder; - using Impl __attribute__((__unused__)) = NpuLinearImpl; - - Linear(const ModelContext& context) - : ModuleHolder(std::make_shared(context)) {} -}; - -} // namespace xllm::kernel +#include "npu_ops_api.h" +#include "ops_npu/npu_ops.h" + +namespace xllm::kernel::npu { + +torch::Tensor matmul(const torch::Tensor& a, + const torch::Tensor& b, + const std::optional& bias) { + if (!bias.has_value()) { + return torch::nn::functional::linear(a, b); + } else { + return torch::nn::functional::linear(a, b, bias.value()); + } +} + +} // namespace xllm::kernel::npu diff --git a/xllm/core/kernels/npu/npu_ops_api.h b/xllm/core/kernels/npu/npu_ops_api.h new file mode 100644 index 00000000..e9c85b38 --- /dev/null +++ b/xllm/core/kernels/npu/npu_ops_api.h @@ -0,0 +1,65 @@ +/* Copyright 2025 The xLLM Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://github.com/jd-opensource/xllm/blob/main/LICENSE + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#pragma once +#include + +#include + +#include "./custom_functions_npu/AtbCommon.h" + +namespace xllm::kernel::npu { + +void reshape_paged_cache(torch::Tensor& key, + torch::Tensor& value, + torch::Tensor& k_cache, + torch::Tensor& v_cache, + const torch::Tensor& slot_mapping); + +void batch_prefill(const torch::Tensor& query, + const torch::Tensor& key, + const torch::Tensor& value, + const torch::Tensor& mask, + const torch::Tensor& seq_len, + float scale, + int num_heads, + int num_kv_heads, + torch::Tensor& output); + +void batch_decode(const torch::Tensor& query, + const torch::Tensor& k_cache, + const torch::Tensor& v_cache, + int num_kv_heads, + int num_heads, + float scale, + const torch::Tensor& block_table, + const torch::Tensor& seq_lens, + torch::Tensor& output); + +torch::Tensor matmul(const torch::Tensor& a, + const torch::Tensor& b, + const std::optional& bias); + +torch::Tensor active(const torch::Tensor& input); + +torch::Tensor fused_layernorm(const torch::Tensor& input, + const torch::Tensor& weight, + double eps); + +void apply_rotary(torch::Tensor& q, + torch::Tensor& k, + const torch::Tensor& cos_sin_cache, + const torch::Tensor& positions); +} // namespace xllm::kernel::npu diff --git a/xllm/core/kernels/npu/ops_npu/PagedAttentionAtb.cpp b/xllm/core/kernels/npu/ops_npu/PagedAttentionAtb.cpp new file mode 100644 index 00000000..a05ee76b --- /dev/null +++ b/xllm/core/kernels/npu/ops_npu/PagedAttentionAtb.cpp @@ -0,0 +1,61 @@ +// Copyright (c) 2025 Huawei Technologies Co., Ltd +// All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "../custom_functions_npu/AtbCommon.h" + +namespace atb { +using PagedAttentionParam = atb::infer::PagedAttentionParam; +void _npu_paged_attention(const at::Tensor& query, + const at::Tensor& key_cache, + const at::Tensor& value_cache, + int64_t num_kv_heads, + int64_t num_heads, + double scale_value, + const at::Tensor& block_table, + const at::Tensor& context_lens, + at::Tensor& out) { + const c10::OptionalDeviceGuard device_guard(device_of(query)); + OpParamCache& pagedAttentionParamCache = + OpParamCache::getInstance(); + PagedAttentionParam pagedparam; + pagedparam.headNum = num_heads; + pagedparam.qkScale = scale_value; + pagedparam.kvHeadNum = num_kv_heads; + pagedparam.maskType = PagedAttentionParam::UNDEFINED; + pagedparam.batchRunStatusEnable = false; + pagedparam.quantType = PagedAttentionParam::TYPE_QUANT_UNDEFINED; + pagedparam.outDataType = ACL_DT_UNDEFINED; + pagedparam.hasQuantOffset = false; + pagedparam.compressType = PagedAttentionParam::COMPRESS_TYPE_UNDEFINED; + pagedparam.calcType = PagedAttentionParam::CALC_TYPE_UNDEFINED; + pagedparam.scaleType = PagedAttentionParam::SCALE_TYPE_TOR; + pagedparam.inputLayout = atb::infer::TYPE_BSND; + pagedparam.mlaVHeadSize = 0; + + ParamSetter paramsetter; + paramsetter.Input(query, true) + .Input(key_cache) + .Input(value_cache) + .Input(block_table, true) + .Input(context_lens, true) + .Output(out); + auto opPaged = pagedAttentionParamCache.getOperation( + pagedparam, "PagedAttentionOperation"); + RunAtbCmd(opPaged, paramsetter, "PagedAttentionOperation"); + + return; +} + +} // namespace atb \ No newline at end of file diff --git a/xllm/core/kernels/npu/ops_npu/ReshapeAndCachAtb.cpp b/xllm/core/kernels/npu/ops_npu/ReshapeAndCachAtb.cpp new file mode 100644 index 00000000..cba05d19 --- /dev/null +++ b/xllm/core/kernels/npu/ops_npu/ReshapeAndCachAtb.cpp @@ -0,0 +1,58 @@ +// Copyright (c) 2025 Huawei Technologies Co., Ltd +// All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
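// ---------------------------------------------------------------------------
// Hedged glue sketch (the actual definition is not shown in this section of
// the diff): npu_ops_api.h declares xllm::kernel::npu::batch_decode with the
// argument list below, and one plausible implementation simply forwards to the
// ATB PagedAttention wrapper above. Treat this as an illustration, not the
// authoritative implementation.
#include <torch/torch.h>

#include "npu_ops_api.h"
#include "ops_npu/npu_ops.h"

namespace xllm::kernel::npu {

void batch_decode(const torch::Tensor& query,
                  const torch::Tensor& k_cache,
                  const torch::Tensor& v_cache,
                  int num_kv_heads,
                  int num_heads,
                  float scale,
                  const torch::Tensor& block_table,
                  const torch::Tensor& seq_lens,
                  torch::Tensor& output) {
  // Decode-phase attention over the paged KV cache: block_table maps each
  // sequence to its cache blocks, seq_lens gives the current context lengths.
  atb::_npu_paged_attention(query,
                            k_cache,
                            v_cache,
                            num_kv_heads,
                            num_heads,
                            scale,
                            block_table,
                            seq_lens,
                            output);
}

}  // namespace xllm::kernel::npu
// ---------------------------------------------------------------------------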
+ +#include + +#include "../custom_functions_npu/AtbCommon.h" + +using namespace std; +namespace atb { +using ReshapeAndCacheParam = atb::infer::ReshapeAndCacheParam; +void _npu_reshape_and_cache(const at::Tensor& key, + const at::Tensor& value, + at::Tensor& key_cache, + at::Tensor& value_cache, + const at::Tensor& slot_indices) { + const c10::OptionalDeviceGuard device_guard(device_of(key)); + OpParamCache& reshapeAndCacheParamCache = + OpParamCache::getInstance(); + ReshapeAndCacheParam reshapeparam; + reshapeparam.compressType = ReshapeAndCacheParam::COMPRESS_TYPE_UNDEFINED; + + auto key_cache_format = at_npu::native::get_npu_format(key_cache); + auto value_cache_format = at_npu::native::get_npu_format(value_cache); + bool is_key_cache_nz = (key_cache_format == ACL_FORMAT_FRACTAL_NZ); + bool is_value_cache_nz = (value_cache_format == ACL_FORMAT_FRACTAL_NZ); + + if (is_key_cache_nz && is_value_cache_nz) { + reshapeparam.kvCacheCfg = ReshapeAndCacheParam::K_CACHE_V_CACHE_NZ; + } else { + reshapeparam.kvCacheCfg = ReshapeAndCacheParam::K_CACHE_V_CACHE; + } + + ParamSetter parametter; + parametter.Input(key, true) + .Input(value, true) + .Input(key_cache) + .Input(value_cache) + .Input(slot_indices, true) + .Output(key_cache) + .Output(value_cache); + auto opReshape = reshapeAndCacheParamCache.getOperation( + reshapeparam, "ReshapeCacheOperation"); + RunAtbCmd(opReshape, parametter, "ReshapeCacheOperation"); + + return; +} + +} // namespace atb diff --git a/xllm/core/kernels/npu/ops_npu/SelfAttentionAtb.cpp b/xllm/core/kernels/npu/ops_npu/SelfAttentionAtb.cpp new file mode 100644 index 00000000..08f14497 --- /dev/null +++ b/xllm/core/kernels/npu/ops_npu/SelfAttentionAtb.cpp @@ -0,0 +1,71 @@ +// Copyright (c) 2025 Huawei Technologies Co., Ltd +// All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
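// ---------------------------------------------------------------------------
// Hedged usage sketch for the reshape-and-cache path declared in npu_ops_api.h
// and wrapped above. The tensor shapes in the comments are illustrative
// assumptions, not taken from this patch.
#include <torch/torch.h>

#include "npu_ops_api.h"

// Scatters this step's per-token K/V rows into the paged KV cache in place,
// using slot_mapping to select the destination slot of every token.
void cache_new_kv_example(torch::Tensor& key,      // e.g. [num_tokens, kv_heads, head_dim]
                          torch::Tensor& value,    // same shape as key
                          torch::Tensor& k_cache,  // e.g. [num_blocks, block_size, kv_heads, head_dim]
                          torch::Tensor& v_cache,  // same layout as k_cache
                          const torch::Tensor& slot_mapping) {  // [num_tokens] slot ids
  xllm::kernel::npu::reshape_paged_cache(key, value, k_cache, v_cache, slot_mapping);
}
// ---------------------------------------------------------------------------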
+ +#include + +#include "../custom_functions_npu/AtbCommon.h" + +using namespace std; +namespace atb { +using SelfAttentionParam = atb::infer::SelfAttentionParam; +void _npu_flash_attention(const at::Tensor& query, + const at::Tensor& key, + const at::Tensor& value, + const at::Tensor& mask, + const at::Tensor& seq_len, + const double scale_value, + const int64_t num_heads, + const int64_t num_kv_heads, + at::Tensor& out) { + const c10::OptionalDeviceGuard device_guard(device_of(query)); + OpParamCache& selfAttentionParamCache = + OpParamCache::getInstance(); + SelfAttentionParam selfattentionparam; + + selfattentionparam.calcType = SelfAttentionParam::PA_ENCODER; + selfattentionparam.kernelType = SelfAttentionParam::KERNELTYPE_DEFAULT; + selfattentionparam.clampType = SelfAttentionParam::CLAMP_TYPE_UNDEFINED; + selfattentionparam.maskType = SelfAttentionParam::MASK_TYPE_NORM; + selfattentionparam.kvcacheCfg = SelfAttentionParam::K_CACHE_V_CACHE; + selfattentionparam.scaleType = SelfAttentionParam::SCALE_TYPE_TOR; + selfattentionparam.quantType = SelfAttentionParam::TYPE_QUANT_UNDEFINED; + selfattentionparam.cacheType = SelfAttentionParam::CACHE_TYPE_NORM; + selfattentionparam.outDataType = ACL_DT_UNDEFINED; + selfattentionparam.headNum = num_heads; + selfattentionparam.kvHeadNum = num_kv_heads; + selfattentionparam.qScale = 1; + selfattentionparam.qkScale = scale_value; + selfattentionparam.batchRunStatusEnable = false; + selfattentionparam.isTriuMask = 0; + selfattentionparam.clampMin = 0; + selfattentionparam.clampMax = 0; + selfattentionparam.inputLayout = atb::infer::TYPE_BSND; + selfattentionparam.mlaVHeadSize = 0; + selfattentionparam.windowSize = 0; + + ParamSetter parametter; + parametter.Input(query, true) + .Input(key, true) + .Input(value, true) + .Input(mask) + .Input(seq_len, true) + .Output(out); + + auto opSelfattention = selfAttentionParamCache.getOperation( + selfattentionparam, "SelfAttentionOperation"); + RunAtbCmd(opSelfattention, parametter, "SelfAttentionOperation"); + + return; +} + +} // namespace atb \ No newline at end of file diff --git a/xllm/core/kernels/npu/ops_npu/npu_ops.h b/xllm/core/kernels/npu/ops_npu/npu_ops.h new file mode 100644 index 00000000..de6b6039 --- /dev/null +++ b/xllm/core/kernels/npu/ops_npu/npu_ops.h @@ -0,0 +1,55 @@ +// Copyright (c) 2025 Huawei Technologies Co., Ltd +// All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
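// ---------------------------------------------------------------------------
// Hedged sketch of a caller-side causal mask for the PA_ENCODER prefill
// wrapper above (maskType = MASK_TYPE_NORM). The exact mask convention ATB
// expects is not spelled out in this patch; the "large negative above the
// diagonal" form and the -10000 fill value below are only common assumptions.
#include <torch/torch.h>

torch::Tensor make_causal_mask_example(int64_t max_seq_len,
                                       const torch::Device& device) {
  auto opts = torch::TensorOptions().dtype(torch::kFloat16).device(device);
  // Zero on and below the diagonal, a large negative value strictly above it,
  // so future positions are suppressed in the attention softmax.
  auto mask = torch::full({max_seq_len, max_seq_len}, -10000.0, opts);
  return torch::triu(mask, /*diagonal=*/1);
}
// ---------------------------------------------------------------------------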
+ +#ifndef XLLM_NPU_OPS_H +#define XLLM_NPU_OPS_H + +#include "../custom_functions_npu/AtbCommon.h" + +using namespace std; + +namespace atb { + +using PagedAttentionParam = atb::infer::PagedAttentionParam; +using ReshapeAndCacheParam = atb::infer::ReshapeAndCacheParam; +using SelfAttentionParam = atb::infer::SelfAttentionParam; + +void _npu_paged_attention(const at::Tensor& query, + const at::Tensor& key_cache, + const at::Tensor& value_cache, + int64_t num_kv_heads, + int64_t num_heads, + double scale_value, + const at::Tensor& block_table, + const at::Tensor& context_lens, + at::Tensor& out); + +void _npu_reshape_and_cache(const at::Tensor& key, + const at::Tensor& value, + at::Tensor& key_cache, + at::Tensor& value_cache, + const at::Tensor& slot_indices); + +void _npu_flash_attention(const at::Tensor& query, + const at::Tensor& key, + const at::Tensor& value, + const at::Tensor& mask, + const at::Tensor& seq_len, + const double scale_value, + const int64_t num_heads, + const int64_t num_kv_heads, + at::Tensor& out); + +} // namespace atb + +#endif // XLLM_NPU_OPS_H \ No newline at end of file diff --git a/xllm/core/kernels/npu/rope.cpp b/xllm/core/kernels/npu/rope.cpp new file mode 100644 index 00000000..9e312f96 --- /dev/null +++ b/xllm/core/kernels/npu/rope.cpp @@ -0,0 +1,42 @@ +/* Copyright 2025 The xLLM Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://github.com/jd-opensource/xllm/blob/main/LICENSE + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include + +#include "npu_ops_api.h" +#include "ops_npu/npu_ops.h" + +namespace xllm::kernel::npu { + +void apply_rotary(torch::Tensor& q, + torch::Tensor& k, + const torch::Tensor& cos_sin_cache, + const torch::Tensor& positions) { + auto cos_sin = cos_sin_cache.index_select(0, positions); + auto last_dim = cos_sin.size(-1); + auto cos_sin_vec = cos_sin.view({-1, 2, last_dim / 2}) + .repeat({1, 1, 2}) + .chunk(2, /*dim=*/-2); + auto cos = cos_sin_vec[0].view({1, -1, 1, last_dim}); + auto sin = cos_sin_vec[1].view({1, -1, 1, last_dim}); + + const int64_t rotary_dim = sin.size(-1); + q = q.view({1, q.size(0), -1, rotary_dim}); + k = k.view({1, k.size(0), -1, rotary_dim}); + + at_npu::native::custom_ops::npu_apply_rotary_pos_emb(q, k, cos, sin); +} + +} // namespace xllm::kernel::npu \ No newline at end of file diff --git a/xllm/core/kernels/npu/split.h b/xllm/core/kernels/npu/split.h deleted file mode 100644 index cda39703..00000000 --- a/xllm/core/kernels/npu/split.h +++ /dev/null @@ -1,35 +0,0 @@ -/* Copyright 2025 The xLLM Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - https://github.com/jd-opensource/xllm/blob/main/LICENSE - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#pragma once -#include "impl/npu_split_impl.h" - -namespace xllm::kernel { -class Split : public torch::nn::ModuleHolder { - public: - using torch::nn::ModuleHolder::ModuleHolder; - using Impl __attribute__((__unused__)) = NpuSplitImpl; - - Split(const ModelContext& context, - int32_t splitDim = 2, - int32_t splitNum = 3, - atb::SVector splitSizes = {}) - : ModuleHolder(std::make_shared(context, - splitDim, - splitNum, - splitSizes)) {} -}; - -} // namespace xllm::kernel diff --git a/xllm/core/kernels/ops_api.cpp b/xllm/core/kernels/ops_api.cpp index 1d468c72..70bdcf1e 100644 --- a/xllm/core/kernels/ops_api.cpp +++ b/xllm/core/kernels/ops_api.cpp @@ -30,6 +30,8 @@ void apply_rotary(RotaryParams& params) { params.discrete, params.dynamic_ntk, params.max_query_len); +#elif defined(USE_NPU) + npu::apply_rotary(params.q, params.k, params.cos_sin, params.positions); #else throw std::runtime_error("apply_rotary not implemented"); #endif @@ -50,6 +52,14 @@ void active(ActivationParams& params) { #endif } +torch::Tensor active_tensor(ActivationParams& params) { +#if defined(USE_NPU) + return npu::active(params.input); +#else + throw std::runtime_error("active not implemented"); +#endif +} + void reshape_paged_cache(ReshapePagedCacheParams& params) { #if defined(USE_MLU) mlu::reshape_paged_cache(params.key, @@ -58,6 +68,12 @@ void reshape_paged_cache(ReshapePagedCacheParams& params) { params.v_cache, params.slot_mapping, params.direction); +#elif defined(USE_NPU) + npu::reshape_paged_cache(params.key, + params.value, + params.k_cache, + params.v_cache, + params.slot_mapping); #else throw std::runtime_error("reshape_paged_cache not implemented"); #endif @@ -87,6 +103,16 @@ void batch_prefill(AttentionParams& params) { params.window_size_right, params.compute_dtype, params.return_lse); +#elif defined(USE_NPU) + npu::batch_prefill(params.query, + params.key, + params.value, + params.attn_mask, + params.seq_lens, + params.scale, + params.num_heads, + params.num_kv_heads, + params.output); #else throw std::runtime_error("batch_prefill not implemented"); #endif @@ -114,6 +140,16 @@ void batch_decode(AttentionParams& params) { params.scale, params.return_lse, params.kv_cache_quant_bit_size); +#elif defined(USE_NPU) + npu::batch_decode(params.query, + params.k_cache, + params.v_cache, + params.num_kv_heads, + params.num_heads, + params.scale, + params.block_table.value(), + params.seq_lens, + params.output); #else throw std::runtime_error("batch_decode not implemented"); #endif @@ -141,10 +177,20 @@ void fused_layernorm(FusedLayerNormParams& params) { #endif } +torch::Tensor fused_layernorm_tensor(FusedLayerNormParams& params) { +#if defined(USE_NPU) + return npu::fused_layernorm(params.input, params.weight, params.eps); +#else + throw std::runtime_error("fused_layernorm not implemented"); +#endif +} + torch::Tensor matmul(MatmulParams& params) { #if defined(USE_MLU) return mlu::matmul( params.a, params.b, params.bias, params.c, params.alpha, params.beta); +#elif defined(USE_NPU) + return npu::matmul(params.a, params.b, params.bias); #else throw std::runtime_error("matmul not implemented"); #endif diff --git a/xllm/core/kernels/ops_api.h b/xllm/core/kernels/ops_api.h index 46bc74bd..6d41d6af 100644 --- a/xllm/core/kernels/ops_api.h +++ b/xllm/core/kernels/ops_api.h @@ -19,6 +19,8 @@ limitations under the License. 
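// ---------------------------------------------------------------------------
// Hedged usage sketch of the device-dispatched matmul entry point extended in
// ops_api.cpp above. The include paths and the std::optional type of the bias
// field are assumptions inferred from npu::matmul's signature; the remaining
// MatmulParams fields (c, alpha, beta) are left at their defaults.
#include <optional>

#include <torch/torch.h>

#include "kernels/ops_api.h"  // path is illustrative
#include "kernels/param.h"    // path is illustrative

torch::Tensor linear_via_ops_api(const torch::Tensor& x,       // [num_tokens, in_features]
                                 const torch::Tensor& weight,  // [out_features, in_features]
                                 const std::optional<torch::Tensor>& bias) {
  xllm::kernel::MatmulParams params;
  params.a = x;
  params.b = weight;
  params.bias = bias;
  // On USE_NPU builds this resolves to npu::matmul, which is a thin wrapper
  // around torch::nn::functional::linear.
  return xllm::kernel::matmul(params);
}
// ---------------------------------------------------------------------------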
#if defined(USE_MLU) #include "mlu/mlu_ops_api.h" +#elif defined(USE_NPU) +#include "npu/npu_ops_api.h" #endif namespace xllm { @@ -28,6 +30,8 @@ void apply_rotary(RotaryParams& params); void active(ActivationParams& params); +torch::Tensor active_tensor(ActivationParams& params); + void reshape_paged_cache(ReshapePagedCacheParams& params); void batch_prefill(AttentionParams& params); @@ -36,6 +40,8 @@ void batch_decode(AttentionParams& params); void fused_layernorm(FusedLayerNormParams& params); +torch::Tensor fused_layernorm_tensor(FusedLayerNormParams& params); + torch::Tensor matmul(MatmulParams& params); torch::Tensor fused_moe(FusedMoEParams& params); diff --git a/xllm/core/kernels/param.h b/xllm/core/kernels/param.h index ff0a3410..c4f776c4 100644 --- a/xllm/core/kernels/param.h +++ b/xllm/core/kernels/param.h @@ -39,6 +39,7 @@ struct RotaryParams { bool discrete; bool dynamic_ntk = false; int max_query_len; + torch::Tensor positions; }; // Activation parameters @@ -79,6 +80,11 @@ struct AttentionParams { int window_size_right = -1; float scale; bool return_lse = false; + // for npu + torch::Tensor seq_lens; + int num_heads; + int num_kv_heads; + torch::Tensor attn_mask; // for flashinfer torch::Tensor paged_kv_indptr; torch::Tensor paged_kv_indices; diff --git a/xllm/core/layers/CMakeLists.txt b/xllm/core/layers/CMakeLists.txt index 6ad3d0c7..dc3b63cf 100644 --- a/xllm/core/layers/CMakeLists.txt +++ b/xllm/core/layers/CMakeLists.txt @@ -79,6 +79,5 @@ cc_library( if(USE_NPU) add_subdirectory(npu) -else() - add_subdirectory(common) endif() +add_subdirectory(common) diff --git a/xllm/core/layers/column_parallel_linear.h b/xllm/core/layers/column_parallel_linear.h index aadeec1e..25e05d8a 100644 --- a/xllm/core/layers/column_parallel_linear.h +++ b/xllm/core/layers/column_parallel_linear.h @@ -23,13 +23,13 @@ namespace xllm { namespace layer { #if defined(USE_NPU) -class ColumnParallelLinear +class NpuColumnParallelLinear : public torch::nn::ModuleHolder { public: using torch::nn::ModuleHolder::ModuleHolder; using Impl __attribute__((__unused__)) = NpuColumnParallelLinearImpl; - ColumnParallelLinear(const ModelContext& context) + NpuColumnParallelLinear(const ModelContext& context) : ModuleHolder(std::make_shared(context)) {} }; #endif diff --git a/xllm/core/layers/common/CMakeLists.txt b/xllm/core/layers/common/CMakeLists.txt old mode 100755 new mode 100644 index 4600fafd..6c9f5bf8 --- a/xllm/core/layers/common/CMakeLists.txt +++ b/xllm/core/layers/common/CMakeLists.txt @@ -43,6 +43,7 @@ cc_library( torch ) +if(NOT USE_NPU) # Add test for DenseMLP cc_test( NAME @@ -76,3 +77,4 @@ cc_test( torch GTest::gtest_main ) +endif() \ No newline at end of file diff --git a/xllm/core/layers/common/attention.cpp b/xllm/core/layers/common/attention.cpp index adb8b911..3a2d8833 100644 --- a/xllm/core/layers/common/attention.cpp +++ b/xllm/core/layers/common/attention.cpp @@ -21,14 +21,29 @@ DECLARE_bool(enable_chunked_prefill); namespace xllm { namespace layer { +#if defined(USE_NPU) +AttentionMetadata AttentionMetadata::build(const ModelInputParams& params, + bool is_prefill, + const torch::Tensor& attn_mask) { + return AttentionMetadata::build(params, "float", is_prefill, attn_mask); +} +#else AttentionMetadata AttentionMetadata::build(const ModelInputParams& params, bool is_prefill) { return AttentionMetadata::build(params, "float", is_prefill); } +#endif +#if defined(USE_NPU) +AttentionMetadata AttentionMetadata::build(const ModelInputParams& params, + const std::string& compute_dtype, + bool 
is_prefill, + const torch::Tensor& attn_mask) { +#else AttentionMetadata AttentionMetadata::build(const ModelInputParams& params, const std::string& compute_dtype, bool is_prefill) { +#endif AttentionMetadata attn_metadata; attn_metadata.query_start_loc = params.q_seq_lens; attn_metadata.seq_start_loc = params.kv_seq_lens; @@ -37,6 +52,11 @@ AttentionMetadata AttentionMetadata::build(const ModelInputParams& params, attn_metadata.slot_mapping = params.new_cache_slots; attn_metadata.compute_dtype = compute_dtype; +#if defined(USE_NPU) + attn_metadata.attn_mask = attn_mask; + attn_metadata.seq_lens = params.kv_seq_lens.to(torch::kCPU); +#endif + bool is_start_loc_match = (params.q_seq_lens_vec == params.kv_seq_lens_vec); attn_metadata.is_chunked_prefill = is_prefill && !is_start_loc_match; attn_metadata.is_prefill = is_prefill && !attn_metadata.is_chunked_prefill; @@ -87,7 +107,6 @@ std::tuple> AttentionImpl::forward( reshape_paged_cache_params.v_cache = v_cache; reshape_paged_cache_params.slot_mapping = attn_metadata.slot_mapping; xllm::kernel::reshape_paged_cache(reshape_paged_cache_params); - xllm::kernel::AttentionParams attention_params; attention_params.query = query; attention_params.output = output; @@ -103,7 +122,12 @@ std::tuple> AttentionImpl::forward( attention_params.query_start_loc = attn_metadata.query_start_loc; attention_params.seq_start_loc = attn_metadata.seq_start_loc; attention_params.max_query_len = attn_metadata.max_query_len; - +#if defined(USE_NPU) + attention_params.num_heads = num_heads_; + attention_params.num_kv_heads = num_kv_heads_; + attention_params.attn_mask = attn_metadata.attn_mask; + attention_params.seq_lens = attn_metadata.seq_lens; +#endif xllm::kernel::batch_prefill(attention_params); } else if (attn_metadata.is_chunked_prefill) { attention_params.key = k_cache; @@ -115,8 +139,16 @@ std::tuple> AttentionImpl::forward( xllm::kernel::batch_prefill(attention_params); } else { +#if defined(USE_NPU) + query = query.view({-1, num_heads_, head_size_}); + output = output.view({-1, num_heads_, head_size_}); + attention_params.num_heads = num_heads_; + attention_params.num_kv_heads = num_kv_heads_; + attention_params.seq_lens = attn_metadata.seq_lens; +#else query = query.view({-1, 1, num_heads_, head_size_}); output = output.view({-1, 1, num_heads_, head_size_}); +#endif attention_params.query = query; attention_params.output = output; diff --git a/xllm/core/layers/common/attention.h b/xllm/core/layers/common/attention.h index 7e210001..a1cc9b9b 100644 --- a/xllm/core/layers/common/attention.h +++ b/xllm/core/layers/common/attention.h @@ -27,12 +27,25 @@ namespace layer { struct AttentionMetadata { public: +#if defined(USE_NPU) + static AttentionMetadata build(const ModelInputParams& params, + bool is_prefill, + const torch::Tensor& attn_mask); + + static AttentionMetadata build(const ModelInputParams& params, + const std::string& compute_dtype, + bool is_prefill, + const torch::Tensor& attn_mask); + torch::Tensor attn_mask; + torch::Tensor seq_lens; +#else static AttentionMetadata build(const ModelInputParams& params, bool is_prefill); static AttentionMetadata build(const ModelInputParams& params, const std::string& compute_dtype, bool is_prefill); +#endif torch::Tensor query_start_loc; torch::Tensor seq_start_loc; diff --git a/xllm/core/layers/common/dense_mlp.cpp b/xllm/core/layers/common/dense_mlp.cpp index b487b90f..cda35725 100644 --- a/xllm/core/layers/common/dense_mlp.cpp +++ b/xllm/core/layers/common/dense_mlp.cpp @@ -89,6 +89,11 @@ torch::Tensor 
DenseMLPImpl::forward(const torch::Tensor& hidden_states) { return down_proj_->forward(gate_up); } else { int64_t batch_size = gate_up.sizes()[0]; +#if defined(USE_NPU) + xllm::kernel::ActivationParams activation_params; + activation_params.input = gate_up; + auto output = xllm::kernel::active_tensor(activation_params); +#else auto output = torch::empty( {batch_size, intermediate_size_ / parallel_args_.tp_group_->world_size()}, @@ -100,6 +105,7 @@ torch::Tensor DenseMLPImpl::forward(const torch::Tensor& hidden_states) { activation_params.act_mode = hidden_act_; activation_params.is_gated = is_gated_; xllm::kernel::active(activation_params); +#endif return down_proj_->forward(output); } diff --git a/xllm/core/layers/common/fuse_norm.cpp b/xllm/core/layers/common/fuse_norm.cpp index 9b5dec01..b546258c 100644 --- a/xllm/core/layers/common/fuse_norm.cpp +++ b/xllm/core/layers/common/fuse_norm.cpp @@ -37,6 +37,13 @@ FusedRMSNormImpl::FusedRMSNormImpl(int64_t dim, torch::Tensor FusedRMSNormImpl::forward(torch::Tensor& input) { auto org_shape = input.sizes().vec(); input = input.reshape({-1, norm_dim_}); +#if defined(USE_NPU) + xllm::kernel::FusedLayerNormParams fused_layernorm_params; + fused_layernorm_params.input = input; + fused_layernorm_params.weight = weight_; + fused_layernorm_params.eps = eps_; + auto output = xllm::kernel::fused_layernorm_tensor(fused_layernorm_params); +#else auto output = torch::empty_like(input); xllm::kernel::FusedLayerNormParams fused_layernorm_params; @@ -47,6 +54,7 @@ torch::Tensor FusedRMSNormImpl::forward(torch::Tensor& input) { fused_layernorm_params.eps = eps_; xllm::kernel::fused_layernorm(fused_layernorm_params); +#endif output = output.view(org_shape); return output; diff --git a/xllm/core/layers/common/qwen3_attention.cpp b/xllm/core/layers/common/qwen3_attention.cpp index d3d08768..0c58bb86 100644 --- a/xllm/core/layers/common/qwen3_attention.cpp +++ b/xllm/core/layers/common/qwen3_attention.cpp @@ -77,7 +77,6 @@ Qwen3AttentionImpl::Qwen3AttentionImpl(const ModelArgs& args, k_norm_ = register_module( "k_norm", RmsNorm(args.head_dim(), args.rms_norm_eps(), options)); - // 4. 
Rotary embedding rotary_emb_ = register_module("rope", RotaryEmbedding(/*rotary_dim=*/head_dim_, diff --git a/xllm/core/layers/common/qwen3_attention.h b/xllm/core/layers/common/qwen3_attention.h index 9d5536ce..b7ba2e90 100644 --- a/xllm/core/layers/common/qwen3_attention.h +++ b/xllm/core/layers/common/qwen3_attention.h @@ -56,8 +56,10 @@ class Qwen3AttentionImpl : public torch::nn::Module { QKVParallelLinear qkv_proj_{nullptr}; RowParallelLinear o_proj_{nullptr}; + RmsNorm q_norm_{nullptr}; RmsNorm k_norm_{nullptr}; + Attention attn_{nullptr}; RotaryEmbedding rotary_emb_{nullptr}; }; diff --git a/xllm/core/layers/common/qwen3_decoder_layer.h b/xllm/core/layers/common/qwen3_decoder_layer.h index f5c8cc26..c9c0a278 100644 --- a/xllm/core/layers/common/qwen3_decoder_layer.h +++ b/xllm/core/layers/common/qwen3_decoder_layer.h @@ -51,6 +51,7 @@ class Qwen3DecoderImpl : public torch::nn::Module { private: Qwen3Attention attention_{nullptr}; DenseMLP mlp_{nullptr}; + RmsNorm input_norm_{nullptr}; RmsNorm post_norm_{nullptr}; diff --git a/xllm/core/layers/common/qwen3_moe_decoder_layer.h b/xllm/core/layers/common/qwen3_moe_decoder_layer.h index 44895629..07280ead 100644 --- a/xllm/core/layers/common/qwen3_moe_decoder_layer.h +++ b/xllm/core/layers/common/qwen3_moe_decoder_layer.h @@ -51,6 +51,7 @@ class Qwen3MoeDecoderImpl : public torch::nn::Module { Qwen3Attention attention_{nullptr}; DenseMLP mlp_{nullptr}; FusedMoE moe_mlp_{nullptr}; + RmsNorm input_norm_{nullptr}; RmsNorm post_norm_{nullptr}; }; diff --git a/xllm/core/layers/common/rotary_embedding.cpp b/xllm/core/layers/common/rotary_embedding.cpp index 1280e29c..b3ee6e5e 100644 --- a/xllm/core/layers/common/rotary_embedding.cpp +++ b/xllm/core/layers/common/rotary_embedding.cpp @@ -45,6 +45,12 @@ RotaryEmbeddingImpl::RotaryEmbeddingImpl(int rotary_dim, t = t.to(dev_options); const auto freqs = torch::einsum("i,j->ij", {t, inv_freq}); +#if defined(USE_NPU) + const auto cos_sin = + torch::cat({freqs.cos(), freqs.sin()}, /*dim=*/-1).contiguous(); + cos_sin_cache_ = register_buffer("cos_sin_cache", cos_sin.to(options)); + auto cos_sin_vec = cos_sin_cache_.chunk(2, /*dim=*/-1); +#else // Create cos and sin embeddings. torch::Tensor emd; if (interleaved) { @@ -61,6 +67,7 @@ RotaryEmbeddingImpl::RotaryEmbeddingImpl(int rotary_dim, auto cos_sin_vec = cos_sin_cache_.chunk(2, /*dim=*/-1); cos_ = cos_sin_vec[0].view({-1, rotary_dim}); sin_ = cos_sin_vec[1].view({-1, rotary_dim}); +#endif } void RotaryEmbeddingImpl::forward(torch::Tensor& q, @@ -82,8 +89,12 @@ void RotaryEmbeddingImpl::forward(torch::Tensor& q, xllm::kernel::RotaryParams rotary_params; rotary_params.q = q; rotary_params.k = k; +#if defined(USE_NPU) + rotary_params.positions = positions; +#else rotary_params.sin = sin_; rotary_params.cos = cos_; +#endif rotary_params.cos_sin = cos_sin_cache_; rotary_params.position_ids = position_ids; rotary_params.cu_query_lens = cu_query_lens; diff --git a/xllm/core/layers/linear.h b/xllm/core/layers/linear.h index 7870dbeb..63252c45 100644 --- a/xllm/core/layers/linear.h +++ b/xllm/core/layers/linear.h @@ -18,14 +18,10 @@ limitations under the License. 
#include #include -#if defined(USE_MLU) #include "common/linear_impl.h" -#endif - namespace xllm { namespace layer { -#if defined(USE_MLU) class ColumnParallelLinear : public torch::nn::ModuleHolder { public: @@ -123,7 +119,6 @@ class ReplicatedLinear : public torch::nn::ModuleHolder { quant_args, options)) {} }; -#endif } // namespace layer } // namespace xllm diff --git a/xllm/core/layers/lm_head.h b/xllm/core/layers/lm_head.h index 3b6fcd49..3b3210eb 100644 --- a/xllm/core/layers/lm_head.h +++ b/xllm/core/layers/lm_head.h @@ -17,9 +17,8 @@ limitations under the License. #if defined(USE_NPU) #include "npu/npu_lm_head_impl.h" -#else -#include "common/linear_impl.h" #endif +#include "common/linear_impl.h" namespace xllm { namespace layer { @@ -33,6 +32,33 @@ class LmHead : public torch::nn::ModuleHolder { LmHead(const ModelContext& context) : ModuleHolder(std::make_shared(context)) {} }; + +/** + * TODO: Rename the original LmHead definition to NpuLmHead, + * and define the current one as LmHead to unify NPU's LmHead + * related code with MLU and GPU + */ +class LmHeadNative : public torch::nn::ModuleHolder { + public: + using torch::nn::ModuleHolder::ModuleHolder; + using Impl __attribute__((__unused__)) = ColumnParallelLinearImpl; + + LmHeadNative(int64_t in_features, + int64_t out_features, + bool bias, + bool gather_output, + const QuantArgs& quant_args, + const ParallelArgs& parallel_args, + const torch::TensorOptions& options) + : ModuleHolder(std::make_shared(in_features, + out_features, + bias, + gather_output, + quant_args, + parallel_args, + options)) {} +}; + #else class LmHead : public torch::nn::ModuleHolder { public: diff --git a/xllm/core/layers/npu/npu_rms_norm_impl.h b/xllm/core/layers/npu/npu_rms_norm_impl.h index fa1af2c4..fb4f469b 100644 --- a/xllm/core/layers/npu/npu_rms_norm_impl.h +++ b/xllm/core/layers/npu/npu_rms_norm_impl.h @@ -54,7 +54,7 @@ class NpuRmsNormImpl : public NpuBaseLayer { void merge_loaded_weights() override; - torch::Tensor forward(torch::Tensor& x, int nodeId); + torch::Tensor forward(torch::Tensor& x, int nodeId = 0); private: int64_t init_layer() override; diff --git a/xllm/core/layers/npu/npu_word_embedding_impl.cpp b/xllm/core/layers/npu/npu_word_embedding_impl.cpp index eb4a09b9..5c4ea046 100644 --- a/xllm/core/layers/npu/npu_word_embedding_impl.cpp +++ b/xllm/core/layers/npu/npu_word_embedding_impl.cpp @@ -125,6 +125,11 @@ torch::Tensor NpuWordEmbeddingImpl::forward(const torch::Tensor& x, void NpuWordEmbeddingImpl::build_node_variant_pack(atb_speed::Model::Node& node, const torch::Tensor& x) { + if (!node.operation) { + throw std::runtime_error( + "node.operation is null in build_node_variant_pack"); + } + internalTensors = atb_speed::Utils::AtTensor2Tensor(x); // node.outTensors[0] = &internalTensors; @@ -133,6 +138,13 @@ void NpuWordEmbeddingImpl::build_node_variant_pack(atb_speed::Model::Node& node, inTensorDescs.resize(node.variantPack.inTensors.size()); atb::SVector outTensorDescs; + + auto output_num = node.operation->GetOutputNum(); + if (output_num <= 0) { + throw std::runtime_error("Invalid output number: " + + std::to_string(output_num)); + } + outTensorDescs.reserve(node.operation->GetOutputNum()); outTensorDescs.resize(node.operation->GetOutputNum()); diff --git a/xllm/core/layers/qwen3_decoder_layer.h b/xllm/core/layers/qwen3_decoder_layer.h index 324738d5..ec2c311e 100644 --- a/xllm/core/layers/qwen3_decoder_layer.h +++ b/xllm/core/layers/qwen3_decoder_layer.h @@ -17,14 +17,14 @@ limitations under the License. 
#if defined(USE_NPU) #include "npu/npu_qwen3_decoder_layer_impl.h" -#else -#include "common/qwen3_decoder_layer.h" #endif +#include "common/qwen3_decoder_layer.h" + namespace xllm { namespace layer { -#if defined(USE_NPU) +#if defined(USE_NPU) && !defined(USE_NPU_TORCH) class Qwen3DecoderLayer : public torch::nn::ModuleHolder { public: diff --git a/xllm/core/layers/rms_norm.h b/xllm/core/layers/rms_norm.h index d8920c68..3065810f 100644 --- a/xllm/core/layers/rms_norm.h +++ b/xllm/core/layers/rms_norm.h @@ -16,23 +16,23 @@ limitations under the License. #pragma once #if defined(USE_NPU) #include "npu/npu_rms_norm_impl.h" -#else -#include "common/fuse_norm.h" #endif +#include "common/fuse_norm.h" namespace xllm { namespace layer { #if defined(USE_NPU) -class RmsNorm : public torch::nn::ModuleHolder { +class NpuRmsNorm : public torch::nn::ModuleHolder { public: using torch::nn::ModuleHolder::ModuleHolder; using Impl __attribute__((__unused__)) = NpuRmsNormImpl; - RmsNorm(const ModelContext& context) + NpuRmsNorm(const ModelContext& context) : ModuleHolder(std::make_shared(context)) {} }; -#else + +#endif class RmsNorm : public torch::nn::ModuleHolder { public: using torch::nn::ModuleHolder::ModuleHolder; @@ -41,7 +41,6 @@ class RmsNorm : public torch::nn::ModuleHolder { RmsNorm(int64_t dim, double eps, const torch::TensorOptions& options) : ModuleHolder(std::make_shared(dim, eps, options)) {} }; -#endif } // namespace layer } // namespace xllm diff --git a/xllm/core/layers/word_embedding.h b/xllm/core/layers/word_embedding.h index c377dcc2..6df992b1 100644 --- a/xllm/core/layers/word_embedding.h +++ b/xllm/core/layers/word_embedding.h @@ -17,9 +17,8 @@ limitations under the License. #if defined(USE_NPU) #include "npu/npu_word_embedding_impl.h" -#else -#include "common/word_embedding_impl.h" #endif +#include "common/word_embedding_impl.h" namespace xllm { namespace layer { @@ -33,6 +32,26 @@ class WordEmbedding : public torch::nn::ModuleHolder { : ModuleHolder(std::make_shared(context)) {} }; +/** + * TODO: Rename the original WordEmbedding definition to NpuWordEmbedding, + * and define the current one as WordEmbedding to unify NPU's WordEmbedding + * related code with MLU and GPU + */ + +class WordEmbeddingNative : public torch::nn::ModuleHolder { + public: + using torch::nn::ModuleHolder::ModuleHolder; + using Impl __attribute__((__unused__)) = WordEmbeddingImpl; + WordEmbeddingNative(int64_t num_embeddings, + int64_t embedding_dim, + const ParallelArgs& parallel_args, + const torch::TensorOptions& options) + : ModuleHolder(std::make_shared(num_embeddings, + embedding_dim, + parallel_args, + options)) {} +}; + #else class WordEmbedding : public torch::nn::ModuleHolder { diff --git a/xllm/core/runtime/CMakeLists.txt b/xllm/core/runtime/CMakeLists.txt index 54b10152..594c017f 100644 --- a/xllm/core/runtime/CMakeLists.txt +++ b/xllm/core/runtime/CMakeLists.txt @@ -61,6 +61,7 @@ cc_library( :state_dict :dit_cache $<$:npu_layers> + $<$:common_layers> :model :models :sampler diff --git a/xllm/models/llm/deepseek_v2.h b/xllm/models/llm/deepseek_v2.h index 010993a4..eb094617 100644 --- a/xllm/models/llm/deepseek_v2.h +++ b/xllm/models/llm/deepseek_v2.h @@ -140,7 +140,7 @@ class DeepseekV2ModelImpl : public torch::nn::Module { blocks_->push_back(block); } - norm_ = register_module("norm", layer::RmsNorm(context)); + norm_ = register_module("norm", layer::NpuRmsNorm(context)); // dp_size_=4; dp_size_ = parallel_args.dp_size(); std::vector indices; @@ -289,7 +289,7 @@ class DeepseekV2ModelImpl : 
public torch::nn::Module { std::vector> pos_embs_; std::vector atb_pos_embs_; layer::AttentionMask attn_mask_; - layer::RmsNorm norm_{nullptr}; + layer::NpuRmsNorm norm_{nullptr}; }; TORCH_MODULE(DeepseekV2Model); diff --git a/xllm/models/llm/deepseek_v2_mtp.h b/xllm/models/llm/deepseek_v2_mtp.h index 7960711c..9cb10dd6 100644 --- a/xllm/models/llm/deepseek_v2_mtp.h +++ b/xllm/models/llm/deepseek_v2_mtp.h @@ -81,11 +81,11 @@ class DeepseekV2MtpModelImpl : public torch::nn::Module { sm_scale, options)); atb_pos_embs_.push_back(layer::PosEmbedding(context)); - eh_projs_.push_back(layer::ColumnParallelLinear(context)); + eh_projs_.push_back(layer::NpuColumnParallelLinear(context)); } - enorm_ = register_module("enorm", layer::RmsNorm(context)); - hnorm_ = register_module("hnorm", layer::RmsNorm(context)); - final_norm_ = register_module("final_norm", layer::RmsNorm(context)); + enorm_ = register_module("enorm", layer::NpuRmsNorm(context)); + hnorm_ = register_module("hnorm", layer::NpuRmsNorm(context)); + final_norm_ = register_module("final_norm", layer::NpuRmsNorm(context)); // dp_size_=4; dp_size_ = parallel_args.dp_size(); @@ -241,10 +241,10 @@ class DeepseekV2MtpModelImpl : public torch::nn::Module { std::vector> pos_embs_; std::vector atb_pos_embs_; layer::AttentionMask attn_mask_; - std::vector eh_projs_; - layer::RmsNorm enorm_{nullptr}; - layer::RmsNorm hnorm_{nullptr}; - layer::RmsNorm final_norm_{nullptr}; + std::vector eh_projs_; + layer::NpuRmsNorm enorm_{nullptr}; + layer::NpuRmsNorm hnorm_{nullptr}; + layer::NpuRmsNorm final_norm_{nullptr}; }; TORCH_MODULE(DeepseekV2MtpModel); diff --git a/xllm/models/llm/glm4_moe.h b/xllm/models/llm/glm4_moe.h index 79dbefd7..913de60a 100644 --- a/xllm/models/llm/glm4_moe.h +++ b/xllm/models/llm/glm4_moe.h @@ -104,7 +104,7 @@ class Glm4MoeModelImpl : public torch::nn::Module { blocks_->push_back(block); } - norm_ = register_module("norm", layer::RmsNorm(context)); + norm_ = register_module("norm", layer::NpuRmsNorm(context)); dp_size_ = parallel_args.dp_size(); std::vector indices; dp_local_tp_size_ = parallel_args.world_size() / dp_size_; @@ -244,7 +244,7 @@ class Glm4MoeModelImpl : public torch::nn::Module { torch::Dtype dtype_; layer::WordEmbedding embed_tokens_{nullptr}; layer::AttentionMask attn_mask_; - layer::RmsNorm norm_{nullptr}; + layer::NpuRmsNorm norm_{nullptr}; torch::Tensor cos_sin_; layer::PosEmbedding atb_pos_emb_{nullptr}; }; diff --git a/xllm/models/llm/glm4_moe_mtp.h b/xllm/models/llm/glm4_moe_mtp.h index 5c005a24..578051da 100644 --- a/xllm/models/llm/glm4_moe_mtp.h +++ b/xllm/models/llm/glm4_moe_mtp.h @@ -60,10 +60,11 @@ class Glm4MoeMtpModelImpl : public torch::nn::Module { blocks_->push_back(block); } - eh_proj_ = register_module("eh_proj", layer::ColumnParallelLinear(context)); - enorm_ = register_module("enorm", layer::RmsNorm(context)); - hnorm_ = register_module("hnorm", layer::RmsNorm(context)); - final_norm_ = register_module("final_norm", layer::RmsNorm(context)); + eh_proj_ = + register_module("eh_proj", layer::NpuColumnParallelLinear(context)); + enorm_ = register_module("enorm", layer::NpuRmsNorm(context)); + hnorm_ = register_module("hnorm", layer::NpuRmsNorm(context)); + final_norm_ = register_module("final_norm", layer::NpuRmsNorm(context)); dp_size_ = parallel_args.dp_size(); std::vector indices; @@ -229,10 +230,10 @@ class Glm4MoeMtpModelImpl : public torch::nn::Module { layer::AttentionMask attn_mask_; torch::Tensor cos_sin_; layer::PosEmbedding atb_pos_emb_{nullptr}; - layer::ColumnParallelLinear 
eh_proj_{nullptr}; - layer::RmsNorm enorm_{nullptr}; - layer::RmsNorm hnorm_{nullptr}; - layer::RmsNorm final_norm_{nullptr}; + layer::NpuColumnParallelLinear eh_proj_{nullptr}; + layer::NpuRmsNorm enorm_{nullptr}; + layer::NpuRmsNorm hnorm_{nullptr}; + layer::NpuRmsNorm final_norm_{nullptr}; }; TORCH_MODULE(Glm4MoeMtpModel); diff --git a/xllm/models/llm/llama.h b/xllm/models/llm/llama.h index e8516942..df3e76bd 100644 --- a/xllm/models/llm/llama.h +++ b/xllm/models/llm/llama.h @@ -115,7 +115,7 @@ class LlamaModelImpl : public torch::nn::Module { layers_.reserve(context.get_model_args().n_layers()); embed_tokens_ = register_module("embed_tokens", layer::WordEmbedding(context)); - norm_ = register_module("norm", layer::RmsNorm(context)); + norm_ = register_module("norm", layer::NpuRmsNorm(context)); std::tie(cos_pos_, sin_pos_) = get_llama_rotary_embedding(128, @@ -230,7 +230,7 @@ class LlamaModelImpl : public torch::nn::Module { int device_id_ = 0; layer::AttentionMask attn_mask_; layer::WordEmbedding embed_tokens_{nullptr}; - layer::RmsNorm norm_{nullptr}; + layer::NpuRmsNorm norm_{nullptr}; torch::nn::ModuleList blocks_{nullptr}; // hold same data but different type as blocks_ to avoid type cast diff --git a/xllm/models/llm/llm_model_base.h b/xllm/models/llm/llm_model_base.h index 7b4212be..826c8dc6 100644 --- a/xllm/models/llm/llm_model_base.h +++ b/xllm/models/llm/llm_model_base.h @@ -17,6 +17,8 @@ limitations under the License. #if defined(USE_NPU) #include +#include + #endif #include #include @@ -32,14 +34,14 @@ limitations under the License. #include "core/framework/model_context.h" #include "core/layers/attention_mask.h" #include "core/layers/block_copy.h" +#include "core/layers/common/attention.h" #include "core/layers/lm_head.h" #include "core/layers/pos_embedding.h" #include "core/layers/rms_norm.h" #include "models/model_registry.h" + #if defined(USE_NPU) #include "xllm_kernels/core/include/atb_speed/log.h" -#else -#include "core/layers/common/attention.h" #endif namespace xllm { @@ -81,12 +83,12 @@ class LlmDecoderLayerImplBase : public torch::nn::Module { LlmDecoderLayerImplBase(const ModelContext& context) { // register submodules decoder_layer_ = register_module("decoder_layer", DecoderType(context)); -#if defined(USE_NPU) +#if defined(USE_NPU) && !defined(USE_NPU_TORCH) block_copy_ = register_module("block_copy", layer::BlockCopy(context)); #endif } -#if defined(USE_NPU) +#if defined(USE_NPU) && !defined(USE_NPU_TORCH) virtual torch::Tensor forward(std::vector& x, std::vector& cos_pos, std::vector& sin_pos, @@ -96,7 +98,7 @@ class LlmDecoderLayerImplBase : public torch::nn::Module { int node_id, std::vector event, std::vector*> event_flag) { -#if defined(USE_NPU) +#if defined(USE_NPU) && !defined(USE_NPU_TORCH) auto micro_batch_num = x.size(); for (auto i = 0; i < micro_batch_num; ++i) { if (input_params[i].src_block_indices.numel() > 0) { @@ -125,11 +127,11 @@ class LlmDecoderLayerImplBase : public torch::nn::Module { } virtual void merge_loaded_weights() { decoder_layer_->merge_loaded_weights(); -#if defined(USE_NPU) +#if defined(USE_NPU) && !defined(USE_NPU_TORCH) block_copy_->merge_loaded_weights(); #endif } -#elif defined(USE_MLU) +#elif defined(USE_MLU) || defined(USE_NPU_TORCH) virtual torch::Tensor forward(torch::Tensor& x, torch::Tensor& positions, const layer::AttentionMetadata& attn_metadata, @@ -147,7 +149,7 @@ class LlmDecoderLayerImplBase : public torch::nn::Module { private: DecoderType decoder_layer_{nullptr}; -#if defined(USE_NPU) +#if defined(USE_NPU) && 
!defined(USE_NPU_TORCH) layer::BlockCopy block_copy_{nullptr}; #endif }; @@ -165,7 +167,11 @@ class LlmModelImplBase : public torch::nn::Module { torch::Tensor get_input_embeddings(torch::Tensor input_ids) { #if defined(USE_NPU) +#if defined(USE_NPU_TORCH) + return embed_tokens_native_[0](input_ids); +#else return embed_tokens_[0](input_ids, 0); +#endif #elif defined(USE_MLU) return embed_tokens_[0](input_ids); #endif @@ -203,7 +209,11 @@ class LlmModelImplBase : public torch::nn::Module { h = inputs_embeds; } else { #if defined(USE_NPU) +#if defined(USE_NPU_TORCH) + h = embed_tokens_native_[i](tokens[i]); +#else h = embed_tokens_[i](tokens[i], 0); +#endif #elif defined(USE_MLU) h = embed_tokens_[i](tokens[i]); #endif @@ -277,7 +287,7 @@ class LlmModelImplBase : public torch::nn::Module { attn_masks.push_back(std::move(attn_mask)); #endif } -#if defined(USE_NPU) +#if defined(USE_NPU) && !defined(USE_NPU_TORCH) for (size_t i = 0; i < layers_.size(); i++) { std::vector events(micro_batch_num, nullptr); std::vector*> event_flags(micro_batch_num, nullptr); @@ -307,10 +317,15 @@ class LlmModelImplBase : public torch::nn::Module { } auto cancated_h = torch::cat(hs, 0); return norm_(cancated_h, 0); -#elif defined(USE_MLU) +#elif defined(USE_MLU) || defined(USE_NPU_TORCH) bool is_prefill = input_params[0].q_max_seq_len > 1; +#if defined(USE_NPU_TORCH) + auto attn_metadata = layer::AttentionMetadata::build( + input_params[0], is_prefill, attn_masks[0]); +#else auto attn_metadata = layer::AttentionMetadata::build(input_params[0], is_prefill); +#endif torch::Tensor h; for (size_t i = 0; i < layers_.size(); i++) { @@ -325,8 +340,13 @@ class LlmModelImplBase : public torch::nn::Module { // load the weight from the checkpoint virtual void load_state_dict(const StateDict& state_dict) { for (auto i = 0; i < FLAGS_micro_batch_num; i++) { +#if defined(USE_NPU_TORCH) + embed_tokens_native_[i]->load_state_dict( + state_dict.get_dict_with_prefix("embed_tokens.")); +#else embed_tokens_[i]->load_state_dict( state_dict.get_dict_with_prefix("embed_tokens.")); +#endif } // call each layer's load_state_dict function for (int i = 0; i < layers_.size(); i++) { @@ -338,6 +358,7 @@ class LlmModelImplBase : public torch::nn::Module { #if defined(USE_NPU) virtual void verify_loaded_weights(const std::string& prefix) const { +#if !defined(USE_NPU_TORCH) for (auto i = 0; i < FLAGS_micro_batch_num; i++) { embed_tokens_[i]->verify_loaded_weights(prefix + "embed_tokens."); } @@ -346,9 +367,11 @@ class LlmModelImplBase : public torch::nn::Module { "."); } norm_->verify_loaded_weights(prefix + "norm."); +#endif } virtual void merge_loaded_weights() { +#if !defined(USE_NPU_TORCH) for (auto i = 0; i < FLAGS_micro_batch_num; i++) { embed_tokens_[i]->merge_loaded_weights(); } @@ -356,6 +379,7 @@ class LlmModelImplBase : public torch::nn::Module { layers_[i]->merge_loaded_weights(); } norm_->merge_loaded_weights(); +#endif } #endif @@ -385,7 +409,13 @@ class LlmModelImplBase : public torch::nn::Module { // test // ParallelEmbedding embed_tokens_{nullptr}; std::vector embed_tokens_; - layer::RmsNorm norm_{nullptr}; + +#if !defined(USE_NPU_TORCH) && defined(USE_NPU) + layer::NpuRmsNorm norm_{nullptr}; +#else + xllm::layer::RmsNorm norm_{nullptr}; + std::vector embed_tokens_native_; +#endif torch::nn::ModuleList blocks_{nullptr}; // hold same data but different type as blocks_ to avoid type cast @@ -406,7 +436,20 @@ class LlmForCausalLMImplBase : public torch::nn::Module { model_ = register_module("model", LlmModelType(context)); #if 
defined(USE_NPU) +#if defined(USE_NPU_TORCH) + lm_head_native_ = register_module( + "lm_head", + layer::LmHeadNative(context.get_model_args().hidden_size(), + context.get_model_args().vocab_size(), + /*bias=*/false, + /*gather_output=*/true, + QuantArgs{}, + context.get_parallel_args(), + context.get_tensor_options())); +#else lm_head_ = register_module("lm_head", layer::LmHead(context)); +#endif + #elif defined(USE_MLU) // lm_head_ is default to no quantization lm_head_ = @@ -445,7 +488,15 @@ class LlmForCausalLMImplBase : public torch::nn::Module { auto h = hidden_states; // test #if defined(USE_NPU) +#if defined(USE_NPU_TORCH) + if (seleted_idxes.defined()) { + h = h.index_select(/*dim=*/0, seleted_idxes); + } + return lm_head_native_(h); +#else return lm_head_(hidden_states, seleted_idxes, 0); +#endif + #elif defined(USE_MLU) if (seleted_idxes.defined()) { h = h.index_select(/*dim=*/0, seleted_idxes); @@ -459,6 +510,15 @@ class LlmForCausalLMImplBase : public torch::nn::Module { for (const auto& state_dict : loader->get_state_dicts()) { model_->load_state_dict( state_dict->get_dict_with_prefix(prefix + "model.")); +#if defined(USE_NPU_TORCH) + if (tie_word_embeddings) { + lm_head_native_->load_state_dict( + state_dict->get_dict_with_prefix(prefix + "model.embed_tokens.")); + } else { + lm_head_native_->load_state_dict( + state_dict->get_dict_with_prefix(prefix + "lm_head.")); + } +#else if (tie_word_embeddings) { lm_head_->load_state_dict( state_dict->get_dict_with_prefix(prefix + "model.embed_tokens.")); @@ -466,15 +526,18 @@ class LlmForCausalLMImplBase : public torch::nn::Module { lm_head_->load_state_dict( state_dict->get_dict_with_prefix(prefix + "lm_head.")); } +#endif } #if defined(USE_NPU) // verify model_->verify_loaded_weights(prefix + "model."); + model_->merge_loaded_weights(); +#if !defined(USE_NPU_TORCH) lm_head_->verify_loaded_weights(prefix + "lm_head."); - model_->merge_loaded_weights(); // test lm_head_->merge_loaded_weights(); +#endif #endif } @@ -504,6 +567,9 @@ class LlmForCausalLMImplBase : public torch::nn::Module { bool tie_word_embeddings{false}; // test layer::LmHead lm_head_{nullptr}; +#if defined(USE_NPU_TORCH) + layer::LmHeadNative lm_head_native_{nullptr}; +#endif }; } // namespace xllm diff --git a/xllm/models/llm/qwen2.h b/xllm/models/llm/qwen2.h index c510471c..d4223cae 100644 --- a/xllm/models/llm/qwen2.h +++ b/xllm/models/llm/qwen2.h @@ -42,7 +42,7 @@ class QWen2ModelImpl : public LlmModelImplBase { blocks_ = register_module("layers", torch::nn::ModuleList()); layers_.reserve(model_args.n_layers()); - norm_ = register_module("norm", layer::RmsNorm(context)); + norm_ = register_module("norm", layer::NpuRmsNorm(context)); for (auto i = 0; i < FLAGS_micro_batch_num; i++) { embed_tokens_.push_back(layer::WordEmbedding(context)); atb_pos_embeds_.push_back(layer::PosEmbedding(context)); diff --git a/xllm/models/llm/qwen3.h b/xllm/models/llm/qwen3.h index 8a104d9d..a05c11d5 100644 --- a/xllm/models/llm/qwen3.h +++ b/xllm/models/llm/qwen3.h @@ -39,9 +39,24 @@ class QWen3ModelImpl : public LlmModelImplBase { blocks_ = register_module("layers", torch::nn::ModuleList()); layers_.reserve(model_args.n_layers()); #if defined(USE_NPU) - norm_ = register_module("norm", layer::RmsNorm(context)); +#if defined(USE_NPU_TORCH) + norm_ = register_module( + "norm", + xllm::layer::RmsNorm( + model_args.hidden_size(), model_args.rms_norm_eps(), options)); +#else + norm_ = register_module("norm", layer::NpuRmsNorm(context)); +#endif for (auto i = 0; i < FLAGS_micro_batch_num; 
i++) { +#if defined(USE_NPU_TORCH) + embed_tokens_native_.push_back( + layer::WordEmbeddingNative(model_args.vocab_size(), + model_args.hidden_size(), + context.get_parallel_args(), + options)); +#else embed_tokens_.push_back(layer::WordEmbedding(context)); +#endif atb_pos_embeds_.push_back(layer::PosEmbedding(context)); } cos_sin_ = get_concat_rotary_embedding(128, diff --git a/xllm/models/llm/qwen3_moe.h b/xllm/models/llm/qwen3_moe.h index 16771fb9..9085a171 100644 --- a/xllm/models/llm/qwen3_moe.h +++ b/xllm/models/llm/qwen3_moe.h @@ -122,7 +122,7 @@ class Qwen3MoeModelImpl : public torch::nn::Module { attn_mask_ = layer::AttentionMask(options.device(), options.dtype().toScalarType(), /*mask_value=*/mask_value); - norm_ = register_module("norm", layer::RmsNorm(context)); + norm_ = register_module("norm", layer::NpuRmsNorm(context)); mapping_data_ = parallel_args.mapping_data(); #elif defined(USE_MLU) norm_ = register_module( @@ -274,10 +274,13 @@ class Qwen3MoeModelImpl : public torch::nn::Module { torch::Dtype dtype_; layer::WordEmbedding embed_tokens_{nullptr}; layer::AttentionMask attn_mask_; - layer::RmsNorm norm_{nullptr}; + #if defined(USE_NPU) torch::Tensor cos_sin_; layer::PosEmbedding atb_pos_emb_{nullptr}; + layer::NpuRmsNorm norm_{nullptr}; +#else + layer::RmsNorm norm_{nullptr}; #endif }; TORCH_MODULE(Qwen3MoeModel); diff --git a/xllm/models/models.h b/xllm/models/models.h index 5c77ce86..4427ad7b 100644 --- a/xllm/models/models.h +++ b/xllm/models/models.h @@ -15,7 +15,7 @@ limitations under the License. #pragma once -#if defined(USE_NPU) +#if defined(USE_NPU) && !defined(USE_NPU_TORCH) #include "dit/pipeline_flux.h" // IWYU pragma: keep #include "dit/pipeline_flux_fill.h" // IWYU pragma: keep #include "llm/deepseek_v2.h" // IWYU pragma: keep @@ -35,4 +35,6 @@ limitations under the License. #include "llm/llm_model_base.h" // IWYU pragma: keep #include "llm/qwen3.h" // IWYU pragma: keep -#include "llm/qwen3_moe.h" // IWYU pragma: keep +#if !defined(USE_NPU_TORCH) +#include "llm/qwen3_moe.h" // IWYU pragma: keep +#endif \ No newline at end of file diff --git a/xllm/models/vlm/qwen2_5_vl.h b/xllm/models/vlm/qwen2_5_vl.h index a05148c3..e7cef340 100644 --- a/xllm/models/vlm/qwen2_5_vl.h +++ b/xllm/models/vlm/qwen2_5_vl.h @@ -287,7 +287,7 @@ class Qwen2_5_VisionPatchMergerImpl : public torch::nn::Module { hidden_size_ = context_dim * static_cast(std::pow(spatial_merge_size, 2)); - ln_q_ = register_module("ln_q", layer::RmsNorm(context)); + ln_q_ = register_module("ln_q", layer::NpuRmsNorm(context)); auto cpl = torch::nn::Linear( torch::nn::LinearOptions(hidden_size_, hidden_size_).bias(true)); @@ -361,7 +361,7 @@ class Qwen2_5_VisionPatchMergerImpl : public torch::nn::Module { private: int64_t hidden_size_; - layer::RmsNorm ln_q_{nullptr}; + layer::NpuRmsNorm ln_q_{nullptr}; torch::nn::Sequential mlp_{nullptr}; std::tuple layers_ = { nullptr,
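
For reviewers, a minimal usage sketch (not part of the patch) of how the decode path is expected to drive the new USE_NPU branch of xllm::kernel::batch_decode(). The setup mirrors the decode branch of AttentionImpl::forward in xllm/core/layers/common/attention.cpp; any names not shown in this diff (query, output, k_cache, v_cache, block_table, attn_metadata, scale, num_heads, num_kv_heads, head_size) are assumed to be prepared by the caller.

// Sketch only, assuming the caller has already run reshape_paged_cache()
// and built AttentionMetadata through the NPU overload (attn_mask, seq_lens).
xllm::kernel::AttentionParams attention_params;
attention_params.query = query.view({-1, num_heads, head_size});   // NPU decode path uses 3-D views
attention_params.output = output.view({-1, num_heads, head_size});
attention_params.k_cache = k_cache;
attention_params.v_cache = v_cache;
attention_params.block_table = block_table;                        // KV-cache page table, assumed available to the caller
attention_params.scale = scale;
// Fields added in this patch, consumed only by npu::batch_decode:
attention_params.num_heads = num_heads;
attention_params.num_kv_heads = num_kv_heads;
attention_params.seq_lens = attn_metadata.seq_lens;                 // kv lengths moved to CPU in AttentionMetadata::build
xllm::kernel::batch_decode(attention_params);                       // dispatches to npu::batch_decode under USE_NPU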