diff --git a/backends/npu/kernels/sigmoid_cross_entropy_with_logits_kernel.cc b/backends/npu/kernels/sigmoid_cross_entropy_with_logits_kernel.cc
index b7608f10128..fa28bfbbdcd 100644
--- a/backends/npu/kernels/sigmoid_cross_entropy_with_logits_kernel.cc
+++ b/backends/npu/kernels/sigmoid_cross_entropy_with_logits_kernel.cc
@@ -51,12 +51,14 @@ void SigmoidCrossEntropyWithLogitsKernel(
   phi::DenseTensor pos_weight_tensor;
   phi::DenseTensorMeta weight_tensor_meta = {phi::DataType::FLOAT32, x.dims()};
   weight_tensor.set_meta(weight_tensor_meta);
-  FillNpuTensorWithConstant<float>(&weight_tensor, dev_ctx, 1.0);
+  dev_ctx.template Alloc<float>(&weight_tensor);
+  EXEC_NPU_CMD(aclnnInplaceOne, dev_ctx, weight_tensor);
   weight_tensor.Resize(x.dims());
   if (pos_weight.get_ptr() == nullptr) {
     pos_weight_tensor.set_meta(weight_tensor_meta);
-    FillNpuTensorWithConstant<float>(&pos_weight_tensor, dev_ctx, 1.0);
+    dev_ctx.template Alloc<float>(&pos_weight_tensor);
+    EXEC_NPU_CMD(aclnnInplaceOne, dev_ctx, pos_weight_tensor);
     pos_weight_tensor.Resize(x.dims());
   } else {
     pos_weight_tensor = *pos_weight.get_ptr();
   }
@@ -89,12 +91,14 @@ void SigmoidCrossEntropyWithLogitsGradKernel(
   phi::DenseTensor pos_weight_tensor;
   phi::DenseTensorMeta weight_tensor_meta = {phi::DataType::FLOAT32, x.dims()};
   weight_tensor.set_meta(weight_tensor_meta);
-  FillNpuTensorWithConstant<float>(&weight_tensor, dev_ctx, 1.0);
+  dev_ctx.template Alloc<float>(&weight_tensor);
+  EXEC_NPU_CMD(aclnnInplaceOne, dev_ctx, weight_tensor);
   weight_tensor.Resize(x.dims());
   if (pos_weight.get_ptr() == nullptr) {
     pos_weight_tensor.set_meta(weight_tensor_meta);
-    FillNpuTensorWithConstant<float>(&pos_weight_tensor, dev_ctx, 1.0);
+    dev_ctx.template Alloc<float>(&pos_weight_tensor);
+    EXEC_NPU_CMD(aclnnInplaceOne, dev_ctx, pos_weight_tensor);
     pos_weight_tensor.Resize(x.dims());
   } else {
     pos_weight_tensor = *pos_weight.get_ptr();
   }
diff --git a/backends/npu/kernels/uniform_kernel.cc b/backends/npu/kernels/uniform_kernel.cc
index 2f49236916a..2b4ddffbb94 100644
--- a/backends/npu/kernels/uniform_kernel.cc
+++ b/backends/npu/kernels/uniform_kernel.cc
@@ -31,16 +31,16 @@ inline void UniformRealDistribution(T* data,
 }
 
 template <typename T, typename Context>
-void UniformRawKernel(const Context& dev_ctx,
-                      const phi::IntArray& shape,
-                      phi::DataType dtype,
-                      const phi::Scalar& min,
-                      const phi::Scalar& max,
-                      int seed,
-                      int diag_num,
-                      int diag_step,
-                      float diag_val,
-                      phi::DenseTensor* out) {
+void UniformRawKernelCPU(const Context& dev_ctx,
+                         const phi::IntArray& shape,
+                         phi::DataType dtype,
+                         const phi::Scalar& min,
+                         const phi::Scalar& max,
+                         int seed,
+                         int diag_num,
+                         int diag_step,
+                         float diag_val,
+                         phi::DenseTensor* out) {
   out->Resize(phi::make_ddim(shape.GetData()));
   VLOG(4) << out->dims();
   T* data = dev_ctx.template Alloc<T>(out);
@@ -83,6 +83,32 @@ void UniformRawKernel(const Context& dev_ctx,
   TensorCopy(dev_ctx, cpu_out, true, out);
 }
 
+template <typename T, typename Context>
+void UniformRawKernel(const Context& dev_ctx,
+                      const phi::IntArray& shape,
+                      phi::DataType dtype,
+                      const phi::Scalar& min,
+                      const phi::Scalar& max,
+                      int seed,
+                      int diag_num,
+                      int diag_step,
+                      float diag_val,
+                      phi::DenseTensor* out) {
+  DO_COMPATIBILITY(
+      aclnnInplaceUniform,
+      (custom_kernel::UniformRawKernelCPU<T, Context>(
+          dev_ctx, shape, dtype, min, max, seed, 0, 0, 0.0f, out)));
+
+  out->Resize(phi::make_ddim(shape.GetData()));
+  dev_ctx.template Alloc<T>(out);
+  double from = min.to<double>();
+  double to = max.to<double>();
+  uint64_t offset_int64 = 0;
+  uint64_t seed_int64 = static_cast<uint64_t>(seed);
+  EXEC_NPU_CMD(
+      aclnnInplaceUniform, dev_ctx, *out, from, to, seed_int64, offset_int64);
+}
+
 template <typename T, typename Context>
 void UniformKernel(const Context& dev_ctx,
                    const phi::IntArray& shape,
@@ -91,7 +117,7 @@ void UniformKernel(const Context& dev_ctx,
                    const phi::Scalar& max,
                    int seed,
                    phi::DenseTensor* out) {
-  custom_kernel::UniformRawKernel<T>(
+  custom_kernel::UniformRawKernel<T, Context>(
       dev_ctx, shape, dtype, min, max, seed, 0, 0, 0.0f, out);
 }