fix_affine_grid_cuda13 (PaddlePaddle#76367)

zhengshengning · web-flow · commit c00237f3dee0 · 2025-11-13T11:13:14.000+08:00
diff --git a/paddle/phi/kernels/funcs/affine_grid_utils.cu b/paddle/phi/kernels/funcs/affine_grid_utils.cu
@@ -25,7 +25,7 @@ namespace phi {
 namespace funcs {
 
 template <typename T>
-__global__ void CreateBaseGridKernel_4D(
+__global__ void CreateBaseGridKernel_4D_Kernel(
     T* base_grid_data, int64_t n, int64_t h, int64_t w, bool align_corners) {
   int64_t total_elements = n * h * w;
   CUDA_KERNEL_LOOP(idx, total_elements) {
@@ -78,12 +78,12 @@ __global__ void CreateBaseGridKernel_4D(
 }
 
 template <typename T>
-__global__ void CreateBaseGridKernel_5D(T* base_grid_data,
-                                        int64_t n,
-                                        int64_t d,
-                                        int64_t h,
-                                        int64_t w,
-                                        bool align_corners) {
+__global__ void CreateBaseGridKernel_5D_Kernel(T* base_grid_data,
+                                               int64_t n,
+                                               int64_t d,
+                                               int64_t h,
+                                               int64_t w,
+                                               bool align_corners) {
   int64_t total_elements = n * d * h * w;
   CUDA_KERNEL_LOOP(idx, total_elements) {
     int64_t w_idx = idx % w;
@@ -155,14 +155,82 @@ __global__ void CreateBaseGridKernel_5D(T* base_grid_data,
   }
 }
 
-template __global__ void CreateBaseGridKernel_4D<float>(
-    float*, int64_t, int64_t, int64_t, bool);
-template __global__ void CreateBaseGridKernel_4D<double>(
-    double*, int64_t, int64_t, int64_t, bool);
+// Host-side launcher for CreateBaseGridKernel_4D_Kernel.
+//
+// Enqueues the kernel on dev_ctx.stream() using 512-thread blocks and
+// ceil(n * h * w / 512) blocks. The stream is not synchronized here, so the
+// call returns without waiting for the kernel to finish.
+//
+// dev_ctx:        device context that supplies the stream.
+// base_grid_data: device buffer the kernel writes; the affine_grid GPU kernels
+//                 pass one allocated with shape [n, h, w, 3].
+// n, h, w:        grid extents; their product is the element count.
+// align_corners:  forwarded unchanged to the kernel.
+// Does nothing when n * h * w <= 0.
+template <typename T, typename Context>
+void CreateBaseGridKernel_4D(const Context& dev_ctx,
+                             T* base_grid_data,
+                             int64_t n,
+                             int64_t h,
+                             int64_t w,
+                             bool align_corners) {
+  const int64_t total_elements = n * h * w;
+  // An empty grid would give grid_size == 0, which is not a valid launch
+  // configuration; there is nothing to fill, so skip the launch.
+  if (total_elements <= 0) return;
+  auto stream = dev_ctx.stream();
+  constexpr int64_t kBlockSize = 512;
+  // Round up so a partially filled final block is still launched.
+  const int64_t grid_size = (total_elements + kBlockSize - 1) / kBlockSize;
+  // NOTE(review): grid_size is narrowed to the 32-bit grid dimension at
+  // launch; confirm callers stay within the device limit.
+  CreateBaseGridKernel_4D_Kernel<T><<<grid_size, kBlockSize, 0, stream>>>(
+      base_grid_data, n, h, w, align_corners);
+}
+
+// Host-side launcher for CreateBaseGridKernel_5D_Kernel.
+//
+// Enqueues the kernel on dev_ctx.stream() using 512-thread blocks, with the
+// block count rounded up to cover all n * d * h * w elements. The stream is
+// not synchronized here, so the call returns without waiting for the kernel.
+//
+// dev_ctx:        device context that supplies the stream.
+// base_grid_data: device buffer the kernel writes; the affine_grid GPU kernels
+//                 pass one allocated with shape [n, d, h, w, 4].
+// n, d, h, w:     grid extents; their product is the element count.
+// align_corners:  forwarded unchanged to the kernel.
+// Does nothing when n * d * h * w <= 0.
+template <typename T, typename Context>
+void CreateBaseGridKernel_5D(const Context& dev_ctx,
+                             T* base_grid_data,
+                             int64_t n,
+                             int64_t d,
+                             int64_t h,
+                             int64_t w,
+                             bool align_corners) {
+  const int64_t total_elements = n * d * h * w;
+  // An empty grid would give grid_size == 0, which is not a valid launch
+  // configuration; there is nothing to fill, so skip the launch.
+  if (total_elements <= 0) return;
+  auto stream = dev_ctx.stream();
+  constexpr int64_t kBlockSize = 512;
+  // Round up so a partially filled final block is still launched.
+  const int64_t grid_size = (total_elements + kBlockSize - 1) / kBlockSize;
+  // NOTE(review): grid_size is narrowed to the 32-bit grid dimension at
+  // launch; confirm callers stay within the device limit.
+  CreateBaseGridKernel_5D_Kernel<T><<<grid_size, kBlockSize, 0, stream>>>(
+      base_grid_data, n, d, h, w, align_corners);
+}
+
+template void CreateBaseGridKernel_4D<float, phi::GPUContext>(
+    const phi::GPUContext&, float*, int64_t, int64_t, int64_t, bool);
+template void CreateBaseGridKernel_4D<double, phi::GPUContext>(
+    const phi::GPUContext&, double*, int64_t, int64_t, int64_t, bool);
+
+template void CreateBaseGridKernel_5D<float, phi::GPUContext>(
+    const phi::GPUContext&, float*, int64_t, int64_t, int64_t, int64_t, bool);
+template void CreateBaseGridKernel_5D<double, phi::GPUContext>(
+    const phi::GPUContext&, double*, int64_t, int64_t, int64_t, int64_t, bool);
 
-template __global__ void CreateBaseGridKernel_5D<float>(
-    float*, int64_t, int64_t, int64_t, int64_t, bool);
-template __global__ void CreateBaseGridKernel_5D<double>(
-    double*, int64_t, int64_t, int64_t, int64_t, bool);
 }  // namespace funcs
 }  // namespace phi
diff --git a/paddle/phi/kernels/funcs/affine_grid_utils.h b/paddle/phi/kernels/funcs/affine_grid_utils.h
@@ -15,6 +15,7 @@
 #pragma once
 
 #include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/core/device_context.h"
 #include "paddle/phi/kernels/funcs/blas/blas.h"
 #include "paddle/phi/kernels/funcs/eigen/common.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
@@ -184,16 +185,37 @@ inline void GetIdxMap5D(int n,
 
 namespace funcs {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-template <typename T>
-__global__ void CreateBaseGridKernel_4D(
-    T* base_grid_data, int64_t n, int64_t h, int64_t w, bool align_corners);
-template <typename T>
-__global__ void CreateBaseGridKernel_5D(T* base_grid_data,
-                                        int64_t n,
-                                        int64_t d,
-                                        int64_t h,
-                                        int64_t w,
-                                        bool align_corners);
+
+// Host-side launchers that fill a base grid on the GPU. Both are defined in
+// affine_grid_utils.cu, which explicitly instantiates them for float and
+// double with phi::GPUContext.
+
+// Launches the 4-D base-grid kernel on dev_ctx's stream without
+// synchronizing it.
+//   base_grid_data: device buffer to fill; the affine_grid GPU kernels pass
+//                   one allocated with shape [n, h, w, 3].
+//   align_corners:  forwarded unchanged to the kernel.
+template <typename T, typename Context>
+void CreateBaseGridKernel_4D(const Context& dev_ctx,
+                             T* base_grid_data,
+                             int64_t n,
+                             int64_t h,
+                             int64_t w,
+                             bool align_corners);
+
+// Launches the 5-D base-grid kernel on dev_ctx's stream without
+// synchronizing it.
+//   base_grid_data: device buffer to fill; the affine_grid GPU kernels pass
+//                   one allocated with shape [n, d, h, w, 4].
+//   align_corners:  forwarded unchanged to the kernel.
+template <typename T, typename Context>
+void CreateBaseGridKernel_5D(const Context& dev_ctx,
+                             T* base_grid_data,
+                             int64_t n,
+                             int64_t d,
+                             int64_t h,
+                             int64_t w,
+                             bool align_corners);
 #endif
 }  // namespace funcs
 
diff --git a/paddle/phi/kernels/gpu/affine_grid_grad_kernel.cu b/paddle/phi/kernels/gpu/affine_grid_grad_kernel.cu
@@ -60,13 +60,8 @@ void AffineGridGrad4DCUDAKernel(const Context& dev_ctx,
   base_grid.Resize(common::make_ddim({n, h, w, 3}));
   T* base_grid_data = dev_ctx.template Alloc<T>(&base_grid);
 
-  int64_t total_elements = n * h * w;
-  auto stream = dev_ctx.stream();
-  int64_t block_size = 512;
-  int64_t grid_size = (total_elements + block_size - 1) / block_size;
-
-  phi::funcs::CreateBaseGridKernel_4D<T><<<grid_size, block_size, 0, stream>>>(
-      base_grid_data, n, h, w, align_corners);
+  phi::funcs::CreateBaseGridKernel_4D<T, Context>(
+      dev_ctx, base_grid_data, n, h, w, align_corners);
 
   // 2. Reshaping base_grid to [N, H * W, 3]
   DenseTensor base_grid_reshaped;
@@ -127,13 +122,8 @@ void AffineGridGrad5DCUDAKernel(const Context& dev_ctx,
   base_grid.Resize(common::make_ddim({n, d, h, w, 4}));
   T* base_grid_data = dev_ctx.template Alloc<T>(&base_grid);
 
-  int64_t total_elements = n * d * h * w;
-  auto stream = dev_ctx.stream();
-  int64_t block_size = 512;
-  int64_t grid_size = (total_elements + block_size - 1) / block_size;
-
-  phi::funcs::CreateBaseGridKernel_5D<T><<<grid_size, block_size, 0, stream>>>(
-      base_grid_data, n, d, h, w, align_corners);
+  phi::funcs::CreateBaseGridKernel_5D<T, Context>(
+      dev_ctx, base_grid_data, n, d, h, w, align_corners);
 
   // 2. Reshaping base_grid to [N, D * H * W, 4]
   DenseTensor base_grid_reshaped;
diff --git a/paddle/phi/kernels/gpu/affine_grid_kernel.cu b/paddle/phi/kernels/gpu/affine_grid_kernel.cu
@@ -53,13 +53,8 @@ void AffineGrid4DCUDAKernel(const Context& dev_ctx,
   base_grid.Resize(common::make_ddim({n, h, w, 3}));
   T* base_grid_data = dev_ctx.template Alloc<T>(&base_grid);
 
-  int64_t total_elements = n * h * w;
-  auto stream = dev_ctx.stream();
-  int64_t block_size = 512;
-  int64_t grid_size = (total_elements + block_size - 1) / block_size;
-
-  phi::funcs::CreateBaseGridKernel_4D<T><<<grid_size, block_size, 0, stream>>>(
-      base_grid_data, n, h, w, align_corners);
+  phi::funcs::CreateBaseGridKernel_4D<T, Context>(
+      dev_ctx, base_grid_data, n, h, w, align_corners);
 
   // Apply affine transformation
   DenseTensor base_grid_new;
@@ -107,13 +102,8 @@ void AffineGrid5DCUDAKernel(const Context& dev_ctx,
   base_grid.Resize(common::make_ddim({n, d, h, w, 4}));
   T* base_grid_data = dev_ctx.template Alloc<T>(&base_grid);
 
-  int64_t total_elements = n * d * h * w;
-  auto stream = dev_ctx.stream();
-  int64_t block_size = 512;
-  int64_t grid_size = (total_elements + block_size - 1) / block_size;
-
-  phi::funcs::CreateBaseGridKernel_5D<T><<<grid_size, block_size, 0, stream>>>(
-      base_grid_data, n, d, h, w, align_corners);
+  phi::funcs::CreateBaseGridKernel_5D<T, Context>(
+      dev_ctx, base_grid_data, n, d, h, w, align_corners);
 
   // Apply affine transformation
   DenseTensor base_grid_new;