Commit de04d9e

[Custom Device] Solved several problems for CUDA custom device backend (#74411)

* Solve several problems for the CUDA custom device backend
* Fix CPU kernel compilation bug
1 parent 039bf8f commit de04d9e
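
Both fixes follow the same conditional-compilation pattern: CPU fallback code is fenced off so that a build with both PADDLE_WITH_CUDA and PADDLE_WITH_CUSTOM_DEVICE defined still compiles. As a minimal standalone sketch (not Paddle code; the macro toggles and printf strings are illustrative only), the include-guard form used in the headers below behaves like this:

// guard_demo.cc - standalone sketch of the include-guard form in this commit.
// Toggle the two #defines to mimic a build configuration.
#include <cstdio>

#define PADDLE_WITH_CUDA 1
#define PADDLE_WITH_CUSTOM_DEVICE 1

int main() {
  // True unless BOTH backends are enabled, i.e. the CPU fallback headers are
  // pulled in for every configuration except a combined CUDA + custom-device
  // build.
#if !defined(PADDLE_WITH_CUDA) || !defined(PADDLE_WITH_CUSTOM_DEVICE)
  std::printf("CPU fallback headers: included\n");
#else
  std::printf("CPU fallback headers: excluded\n");
#endif
  return 0;
}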

File tree

8 files changed: +82 -47 lines changed


paddle/phi/kernels/funcs/elementwise/elementwise_op_function.h

Lines changed: 2 additions & 0 deletions

@@ -24,8 +24,10 @@
 #include "paddle/phi/backends/gpu/gpu_info.h"
 #include "paddle/phi/common/transform.h"
 #include "paddle/phi/core/dense_tensor.h"
+#if !defined(PADDLE_WITH_CUDA) || !defined(PADDLE_WITH_CUSTOM_DEVICE)
 #include "paddle/phi/kernels/cpu/elementwise.h"
 #include "paddle/phi/kernels/cpu/elementwise_grad.h"
+#endif
 #include "paddle/phi/kernels/funcs/eigen/common.h"
 #include "paddle/phi/kernels/funcs/elementwise_functor.h"

paddle/phi/kernels/funcs/quant_dequant.h

Lines changed: 2 additions & 1 deletion

@@ -20,8 +20,9 @@ limitations under the License. */
 #include "paddle/phi/common/float16.h"
 #include "paddle/phi/common/transform.h"
 #include "paddle/phi/kernels/funcs/aligned_vector.h"
+#ifndef PADDLE_WITH_CUSTOM_DEVICE
 #include "paddle/phi/kernels/funcs/blas/blas.h"
-
+#endif
 namespace phi {

 using backends::gpu::GpuLaunchConfig;

paddle/phi/kernels/funcs/sparse/convolution.h

Lines changed: 4 additions & 42 deletions

@@ -18,7 +18,10 @@ limitations under the License. */
 #include "paddle/phi/core/kmap_cache.h"
 #include "paddle/phi/core/tensor_utils.h"
 #include "paddle/phi/kernels/empty_kernel.h"
-#include "paddle/phi/kernels/funcs/blas/blas.h"
+
+#if !defined(PADDLE_WITH_CUDA) || !defined(PADDLE_WITH_CUSTOM_DEVICE)
+#include "paddle/phi/kernels/funcs/sparse/convolution_blas.h"
+#endif

 namespace phi {
 namespace funcs {
@@ -154,47 +157,6 @@ inline void ResetSubmKernelSizeAndStrides(const DDim& kernel_dims,
   }
 }

-template <typename T, typename Context>
-inline void SubmPreProcess(const Context& dev_ctx,
-                           const SparseCooTensor& x,
-                           const DenseTensor& kernel,
-                           const DenseTensor& out_grad,
-                           const int in_channels,
-                           const int out_channels,
-                           const int half_kernel_size,
-                           DenseTensor* kernel_grad,
-                           DenseTensor* x_grad) {
-  auto blas = phi::funcs::GetBlas<Context, T>(dev_ctx);
-  const bool is_params_freezing = kernel_grad == nullptr;
-  if (!is_params_freezing) {
-    T* d_kernel_ptr = kernel_grad->data<T>();
-    blas.GEMM(CblasTrans,
-              CblasNoTrans,
-              x.non_zero_elements().dims()[1],
-              out_grad.dims()[1],
-              x.non_zero_elements().dims()[0],
-              static_cast<T>(1),
-              x.non_zero_elements().data<T>(),
-              out_grad.data<T>(),
-              static_cast<T>(0),
-              d_kernel_ptr + half_kernel_size * in_channels * out_channels);
-  }
-
-  // call gemm: d_x = out_grad * transpose(kernel)
-  // (n, out_channels) * (out_channels, in_channels)
-  T* x_grad_ptr = x_grad->data<T>();
-  blas.GEMM(CblasNoTrans,
-            CblasTrans,
-            out_grad.dims()[0],
-            in_channels,
-            out_grad.dims()[1],
-            static_cast<T>(1),
-            out_grad.data<T>(),
-            kernel.data<T>() + half_kernel_size * in_channels * out_channels,
-            static_cast<T>(0),
-            x_grad_ptr);
-}
-
 inline const std::vector<int> PoolResetKernel(
     const std::vector<int>& kernel_sizes,
     const int in_channels,
paddle/phi/kernels/funcs/sparse/convolution_blas.h (new file)

Lines changed: 70 additions & 0 deletions

@@ -0,0 +1,70 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/common/ddim.h"
+#include "paddle/phi/core/kmap_cache.h"
+#include "paddle/phi/core/tensor_utils.h"
+#include "paddle/phi/kernels/empty_kernel.h"
+#include "paddle/phi/kernels/funcs/blas/blas.h"
+
+namespace phi {
+namespace funcs {
+namespace sparse {
+
+template <typename T, typename Context>
+inline void SubmPreProcess(const Context& dev_ctx,
+                           const SparseCooTensor& x,
+                           const DenseTensor& kernel,
+                           const DenseTensor& out_grad,
+                           const int in_channels,
+                           const int out_channels,
+                           const int half_kernel_size,
+                           DenseTensor* kernel_grad,
+                           DenseTensor* x_grad) {
+  auto blas = phi::funcs::GetBlas<Context, T>(dev_ctx);
+  const bool is_params_freezing = kernel_grad == nullptr;
+  if (!is_params_freezing) {
+    T* d_kernel_ptr = kernel_grad->data<T>();
+    blas.GEMM(CblasTrans,
+              CblasNoTrans,
+              x.non_zero_elements().dims()[1],
+              out_grad.dims()[1],
+              x.non_zero_elements().dims()[0],
+              static_cast<T>(1),
+              x.non_zero_elements().data<T>(),
+              out_grad.data<T>(),
+              static_cast<T>(0),
+              d_kernel_ptr + half_kernel_size * in_channels * out_channels);
+  }
+
+  // call gemm: d_x = out_grad * transpose(kernel)
+  // (n, out_channels) * (out_channels, in_channels)
+  T* x_grad_ptr = x_grad->data<T>();
+  blas.GEMM(CblasNoTrans,
+            CblasTrans,
+            out_grad.dims()[0],
+            in_channels,
+            out_grad.dims()[1],
+            static_cast<T>(1),
+            out_grad.data<T>(),
+            kernel.data<T>() + half_kernel_size * in_channels * out_channels,
+            static_cast<T>(0),
+            x_grad_ptr);
+}
+
+}  // namespace sparse
+}  // namespace funcs
+}  // namespace phi
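
For reference, a shape sketch of the two GEMM calls in SubmPreProcess, derived from the arguments above (writing n for x.non_zero_elements().dims()[0], the number of non-zero input elements):

  kernel_grad slice: x_nnz^T  * out_grad    ->  (in_channels x n) * (n x out_channels) = (in_channels x out_channels)
  x_grad:            out_grad * kernel^T    ->  (n x out_channels) * (out_channels x in_channels) = (n x in_channels)

Both calls index the kernel data at offset half_kernel_size * in_channels * out_channels, i.e. the central (in_channels x out_channels) slice of the kernel.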

paddle/phi/kernels/stride/as_complex_kernel.cc

Lines changed: 1 addition & 1 deletion

@@ -91,7 +91,7 @@ PD_REGISTER_KERNEL(
 }
 #endif

-#ifdef PADDLE_WITH_CUSTOM_DEVICE
+#if defined(PADDLE_WITH_CUSTOM_DEVICE) && !defined(PADDLE_WITH_CUDA)
 PD_REGISTER_KERNEL(
     as_complex, Custom, STRIDED, phi::AsComplexStridedKernel, float, double) {
   kernel->OutputAt(0).SetDataType(phi::dtype::ToReal(kernel_key.dtype()));

paddle/phi/kernels/stride/as_real_kernel.cc

Lines changed: 1 addition & 1 deletion

@@ -71,7 +71,7 @@ PD_REGISTER_KERNEL(as_real,
 }
 #endif

-#ifdef PADDLE_WITH_CUSTOM_DEVICE
+#if defined(PADDLE_WITH_CUSTOM_DEVICE) && !defined(PADDLE_WITH_CUDA)
 PD_REGISTER_KERNEL(as_real,
                    Custom,
                    STRIDED,

paddle/phi/kernels/stride/complex_grad_kernel.cc

Lines changed: 1 addition & 1 deletion

@@ -126,7 +126,7 @@ PD_REGISTER_KERNEL(imag_grad,
 }
 #endif

-#ifdef PADDLE_WITH_CUSTOM_DEVICE
+#if defined(PADDLE_WITH_CUSTOM_DEVICE) && !defined(PADDLE_WITH_CUDA)
 PD_REGISTER_KERNEL(real_grad,
                    Custom,
                    STRIDED,

paddle/phi/kernels/stride/complex_kernel.cc

Lines changed: 1 addition & 1 deletion

@@ -119,7 +119,7 @@ PD_REGISTER_KERNEL(imag,
 }
 #endif

-#ifdef PADDLE_WITH_CUSTOM_DEVICE
+#if defined(PADDLE_WITH_CUSTOM_DEVICE) && !defined(PADDLE_WITH_CUDA)
 PD_REGISTER_KERNEL(real,
                    Custom,
                    STRIDED,
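
The four stride kernel files above all make the same change: #ifdef PADDLE_WITH_CUSTOM_DEVICE becomes #if defined(PADDLE_WITH_CUSTOM_DEVICE) && !defined(PADDLE_WITH_CUDA). How the two guards resolve per build configuration (the reading of the last row, skipping a second registration of the same kernels in a combined build, is inferred from the commit message rather than stated in the diff):

  Macros defined             old guard    new guard
  CUSTOM_DEVICE only         compiled     compiled
  CUDA only                  skipped      skipped
  CUDA + CUSTOM_DEVICE       compiled     skipped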
