diff --git a/backends/npu/kernels/sigmoid_cross_entropy_with_logits_kernel.cc b/backends/npu/kernels/sigmoid_cross_entropy_with_logits_kernel.cc
index b7608f10128..fa28bfbbdcd 100644
--- a/backends/npu/kernels/sigmoid_cross_entropy_with_logits_kernel.cc
+++ b/backends/npu/kernels/sigmoid_cross_entropy_with_logits_kernel.cc
@@ -51,12 +51,14 @@ void SigmoidCrossEntropyWithLogitsKernel(
   phi::DenseTensor pos_weight_tensor;
   phi::DenseTensorMeta weight_tensor_meta = {phi::DataType::FLOAT32, x.dims()};
   weight_tensor.set_meta(weight_tensor_meta);
-  FillNpuTensorWithConstant<float>(&weight_tensor, dev_ctx, 1.0);
+  dev_ctx.template Alloc<float>(&weight_tensor);
+  EXEC_NPU_CMD(aclnnInplaceOne, dev_ctx, weight_tensor);
   weight_tensor.Resize(x.dims());
   if (pos_weight.get_ptr() == nullptr) {
     pos_weight_tensor.set_meta(weight_tensor_meta);
-    FillNpuTensorWithConstant<float>(&pos_weight_tensor, dev_ctx, 1.0);
+    dev_ctx.template Alloc<float>(&pos_weight_tensor);
+    EXEC_NPU_CMD(aclnnInplaceOne, dev_ctx, pos_weight_tensor);
     pos_weight_tensor.Resize(x.dims());
   } else {
     pos_weight_tensor = *pos_weight.get_ptr();
   }
@@ -89,12 +91,14 @@ void SigmoidCrossEntropyWithLogitsGradKernel(
   phi::DenseTensor pos_weight_tensor;
   phi::DenseTensorMeta weight_tensor_meta = {phi::DataType::FLOAT32, x.dims()};
   weight_tensor.set_meta(weight_tensor_meta);
-  FillNpuTensorWithConstant<float>(&weight_tensor, dev_ctx, 1.0);
+  dev_ctx.template Alloc<float>(&weight_tensor);
+  EXEC_NPU_CMD(aclnnInplaceOne, dev_ctx, weight_tensor);
   weight_tensor.Resize(x.dims());
   if (pos_weight.get_ptr() == nullptr) {
     pos_weight_tensor.set_meta(weight_tensor_meta);
-    FillNpuTensorWithConstant<float>(&pos_weight_tensor, dev_ctx, 1.0);
+    dev_ctx.template Alloc<float>(&pos_weight_tensor);
+    EXEC_NPU_CMD(aclnnInplaceOne, dev_ctx, pos_weight_tensor);
     pos_weight_tensor.Resize(x.dims());
   } else {
     pos_weight_tensor = *pos_weight.get_ptr();
   }
diff --git a/backends/npu/kernels/uniform_kernel.cc b/backends/npu/kernels/uniform_kernel.cc
index 2f49236916a..2b4ddffbb94 100644
--- a/backends/npu/kernels/uniform_kernel.cc
+++ b/backends/npu/kernels/uniform_kernel.cc
@@ -31,16 +31,16 @@ inline void UniformRealDistribution(T* data,
 }
 
 template <typename T, typename Context>
-void UniformRawKernel(const Context& dev_ctx,
-                      const phi::IntArray& shape,
-                      phi::DataType dtype,
-                      const phi::Scalar& min,
-                      const phi::Scalar& max,
-                      int seed,
-                      int diag_num,
-                      int diag_step,
-                      float diag_val,
-                      phi::DenseTensor* out) {
+void UniformRawKernelCPU(const Context& dev_ctx,
+                         const phi::IntArray& shape,
+                         phi::DataType dtype,
+                         const phi::Scalar& min,
+                         const phi::Scalar& max,
+                         int seed,
+                         int diag_num,
+                         int diag_step,
+                         float diag_val,
+                         phi::DenseTensor* out) {
   out->Resize(phi::make_ddim(shape.GetData()));
   VLOG(4) << out->dims();
   T* data = dev_ctx.template Alloc<T>(out);
@@ -83,6 +83,32 @@ void UniformRawKernel(const Context& dev_ctx,
   TensorCopy(dev_ctx, cpu_out, true, out);
 }
 
+template <typename T, typename Context>
+void UniformRawKernel(const Context& dev_ctx,
+                      const phi::IntArray& shape,
+                      phi::DataType dtype,
+                      const phi::Scalar& min,
+                      const phi::Scalar& max,
+                      int seed,
+                      int diag_num,
+                      int diag_step,
+                      float diag_val,
+                      phi::DenseTensor* out) {
+  DO_COMPATIBILITY(
+      aclnnInplaceUniform,
+      (custom_kernel::UniformRawKernelCPU<T, Context>(
+          dev_ctx, shape, dtype, min, max, seed, 0, 0, 0.0f, out)));
+
+  out->Resize(phi::make_ddim(shape.GetData()));
+  dev_ctx.template Alloc<T>(out);
+  double from = min.to<double>();
+  double to = max.to<double>();
+  uint64_t offset_int64 = 0;
+  uint64_t seed_int64 = static_cast<uint64_t>(seed);
+  EXEC_NPU_CMD(
+      aclnnInplaceUniform, dev_ctx, *out, from, to, seed_int64, offset_int64);
+}
+
 template <typename T, typename Context>
 void UniformKernel(const Context& dev_ctx,
                    const phi::IntArray& shape,
@@ -91,7 +117,7 @@ void UniformKernel(const Context& dev_ctx,
                    const phi::Scalar& max,
                    int seed,
                    phi::DenseTensor* out) {
-  custom_kernel::UniformRawKernel<T>(
+  custom_kernel::UniformRawKernel<T, Context>(
       dev_ctx, shape, dtype, min, max, seed, 0, 0, 0.0f, out);
 }