[geometric] Optimize graph sample speed (#47531) (#47548)

DesmonDay · web-flow · commit 7a1cf277543f · 2022-11-02T14:22:45.000+08:00
diff --git a/paddle/phi/kernels/gpu/graph_sample_neighbors_kernel.cu b/paddle/phi/kernels/gpu/graph_sample_neighbors_kernel.cu
@@ -58,7 +58,7 @@ struct MaxFunctor {
   }
 };
 
-template <typename T, int WARP_SIZE, int BLOCK_WARPS, int TILE_SIZE>
+template <typename T, int CTA_SIZE, int BLOCK_CTAS, int TILE_SIZE>
 __global__ void SampleKernel(const uint64_t rand_seed,
                              int k,
                              const int64_t num_nodes,
@@ -71,52 +71,51 @@ __global__ void SampleKernel(const uint64_t rand_seed,
                              T* output_eids,
                              int* output_ptr,
                              bool return_eids) {
-  assert(blockDim.x == WARP_SIZE);
-  assert(blockDim.y == BLOCK_WARPS);
+  assert(blockDim.x == CTA_SIZE);
 
   int64_t out_row = blockIdx.x * TILE_SIZE + threadIdx.y;
   const int64_t last_row =
       min(static_cast<int64_t>(blockIdx.x + 1) * TILE_SIZE, num_nodes);
 #ifdef PADDLE_WITH_HIP
   hiprandState rng;
   hiprand_init(rand_seed * gridDim.x + blockIdx.x,
-               threadIdx.y * WARP_SIZE + threadIdx.x,
+               threadIdx.y * CTA_SIZE + threadIdx.x,
                0,
                &rng);
 #else
-  curandState rng;
+  curandStatePhilox4_32_10_t rng;
   curand_init(rand_seed * gridDim.x + blockIdx.x,
-              threadIdx.y * WARP_SIZE + threadIdx.x,
+              threadIdx.y * CTA_SIZE + threadIdx.x,
               0,
               &rng);
 #endif
 
   while (out_row < last_row) {
     T node = nodes[out_row];
     if (node > len_col_ptr - 1) {
-      out_row += BLOCK_WARPS;
+      out_row += BLOCK_CTAS;
       continue;
     }
     T in_row_start = col_ptr[node];
     int deg = col_ptr[node + 1] - in_row_start;
     int out_row_start = output_ptr[out_row];
 
     if (deg <= k) {
-      for (int idx = threadIdx.x; idx < deg; idx += WARP_SIZE) {
+      for (int idx = threadIdx.x; idx < deg; idx += CTA_SIZE) {
         output[out_row_start + idx] = row[in_row_start + idx];
         if (return_eids) {
           output_eids[out_row_start + idx] = eids[in_row_start + idx];
         }
       }
     } else {
-      for (int idx = threadIdx.x; idx < k; idx += WARP_SIZE) {
+      for (int idx = threadIdx.x; idx < k; idx += CTA_SIZE) {
         output[out_row_start + idx] = idx;
       }
 #ifdef PADDLE_WITH_CUDA
-      __syncwarp();
+      __syncthreads();
 #endif
 
-      for (int idx = k + threadIdx.x; idx < deg; idx += WARP_SIZE) {
+      for (int idx = k + threadIdx.x; idx < deg; idx += CTA_SIZE) {
 #ifdef PADDLE_WITH_HIP
         const int num = hiprand(&rng) % (idx + 1);
 #else
@@ -129,10 +128,10 @@ __global__ void SampleKernel(const uint64_t rand_seed,
         }
       }
 #ifdef PADDLE_WITH_CUDA
-      __syncwarp();
+      __syncthreads();
 #endif
 
-      for (int idx = threadIdx.x; idx < k; idx += WARP_SIZE) {
+      for (int idx = threadIdx.x; idx < k; idx += CTA_SIZE) {
         T perm_idx = output[out_row_start + idx] + in_row_start;
         output[out_row_start + idx] = row[perm_idx];
         if (return_eids) {
@@ -141,7 +140,7 @@ __global__ void SampleKernel(const uint64_t rand_seed,
       }
     }
 
-    out_row += BLOCK_WARPS;
+    out_row += BLOCK_CTAS;
   }
 }
 
@@ -181,12 +180,12 @@ void SampleNeighbors(const Context& dev_ctx,
   thrust::exclusive_scan(
       output_count, output_count + bs, output_ptr.begin(), 0);
 
-  constexpr int WARP_SIZE = 32;
-  constexpr int BLOCK_WARPS = 128 / WARP_SIZE;
-  constexpr int TILE_SIZE = BLOCK_WARPS * 16;
-  const dim3 block(WARP_SIZE, BLOCK_WARPS);
+  constexpr int CTA_SIZE = 128;
+  constexpr int BLOCK_CTAS = 128 / CTA_SIZE;
+  constexpr int TILE_SIZE = BLOCK_CTAS;
+  const dim3 block(CTA_SIZE, BLOCK_CTAS);
   const dim3 grid((bs + TILE_SIZE - 1) / TILE_SIZE);
-  SampleKernel<T, WARP_SIZE, BLOCK_WARPS, TILE_SIZE>
+  SampleKernel<T, CTA_SIZE, BLOCK_CTAS, TILE_SIZE>
       <<<grid, block, 0, dev_ctx.stream()>>>(
           0,
           sample_size,
@@ -202,16 +201,15 @@ void SampleNeighbors(const Context& dev_ctx,
           return_eids);
 }
 
-template <typename T, int WARP_SIZE, int BLOCK_WARPS, int TILE_SIZE>
+template <typename T, int CTA_SIZE, int BLOCK_CTAS, int TILE_SIZE>
 __global__ void FisherYatesSampleKernel(const uint64_t rand_seed,
                                         int k,
                                         const int64_t num_rows,
                                         const int64_t len_col_ptr,
                                         const T* in_rows,
                                         T* src,
                                         const T* dst_count) {
-  assert(blockDim.x == WARP_SIZE);
-  assert(blockDim.y == BLOCK_WARPS);
+  assert(blockDim.x == CTA_SIZE);
 
   int64_t out_row = blockIdx.x * TILE_SIZE + threadIdx.y;
   const int64_t last_row =
@@ -221,15 +219,15 @@ __global__ void FisherYatesSampleKernel(const uint64_t rand_seed,
   hiprand_init(
       rand_seed * gridDim.x + blockIdx.x, threadIdx.y + threadIdx.x, 0, &rng);
 #else
-  curandState rng;
+  curandStatePhilox4_32_10_t rng;
   curand_init(
       rand_seed * gridDim.x + blockIdx.x, threadIdx.y + threadIdx.x, 0, &rng);
 #endif
 
   while (out_row < last_row) {
     const T row = in_rows[out_row];
     if (row > len_col_ptr - 1) {
-      out_row += BLOCK_WARPS;
+      out_row += BLOCK_CTAS;
       continue;
     }
     const T in_row_start = dst_count[row];
@@ -241,7 +239,7 @@ __global__ void FisherYatesSampleKernel(const uint64_t rand_seed,
       } else {
         split = deg - k;
       }
-      for (int idx = split + threadIdx.x; idx <= deg - 1; idx += WARP_SIZE) {
+      for (int idx = split + threadIdx.x; idx <= deg - 1; idx += CTA_SIZE) {
 #ifdef PADDLE_WITH_HIP
         const int num = hiprand(&rng) % (idx + 1);
 #else
@@ -254,14 +252,14 @@ __global__ void FisherYatesSampleKernel(const uint64_t rand_seed,
                            src[in_row_start + idx])));
       }
 #ifdef PADDLE_WITH_CUDA
-      __syncwarp();
+      __syncthreads();
 #endif
     }
-    out_row += BLOCK_WARPS;
+    out_row += BLOCK_CTAS;
   }
 }
 
-template <typename T, int WARP_SIZE, int BLOCK_WARPS, int TILE_SIZE>
+template <typename T, int CTA_SIZE, int BLOCK_CTAS, int TILE_SIZE>
 __global__ void GatherEdge(int k,
                            int64_t num_rows,
                            const T* in_rows,
@@ -273,8 +271,7 @@ __global__ void GatherEdge(int k,
                            int* output_ptr,
                            T* perm_data,
                            bool return_eids) {
-  assert(blockDim.x == WARP_SIZE);
-  assert(blockDim.y == BLOCK_WARPS);
+  assert(blockDim.x == CTA_SIZE);
 
   int64_t out_row = blockIdx.x * TILE_SIZE + threadIdx.y;
   const int64_t last_row =
@@ -287,7 +284,7 @@ __global__ void GatherEdge(int k,
     const T out_row_start = output_ptr[out_row];
 
     if (deg <= k) {
-      for (int idx = threadIdx.x; idx < deg; idx += WARP_SIZE) {
+      for (int idx = threadIdx.x; idx < deg; idx += CTA_SIZE) {
         outputs[out_row_start + idx] = src[in_row_start + idx];
         if (return_eids) {
           output_eids[out_row_start + idx] = eids[in_row_start + idx];
@@ -304,7 +301,7 @@ __global__ void GatherEdge(int k,
         end = deg;
       }
 
-      for (int idx = begin + threadIdx.x; idx < end; idx += WARP_SIZE) {
+      for (int idx = begin + threadIdx.x; idx < end; idx += CTA_SIZE) {
         outputs[out_row_start + idx - begin] =
             src[perm_data[in_row_start + idx]];
         if (return_eids) {
@@ -313,7 +310,7 @@ __global__ void GatherEdge(int k,
         }
       }
     }
-    out_row += BLOCK_WARPS;
+    out_row += BLOCK_CTAS;
   }
 }
 
@@ -337,13 +334,13 @@ void FisherYatesSampleNeighbors(const Context& dev_ctx,
   thrust::exclusive_scan(
       output_count, output_count + bs, output_ptr.begin(), 0);
 
-  constexpr int WARP_SIZE = 32;
-  constexpr int BLOCK_WARPS = 128 / WARP_SIZE;
-  constexpr int TILE_SIZE = BLOCK_WARPS * 16;
-  const dim3 block(WARP_SIZE, BLOCK_WARPS);
+  constexpr int CTA_SIZE = 128;
+  constexpr int BLOCK_CTAS = 128 / CTA_SIZE;
+  constexpr int TILE_SIZE = BLOCK_CTAS;
+  const dim3 block(CTA_SIZE, BLOCK_CTAS);
   const dim3 grid((bs + TILE_SIZE - 1) / TILE_SIZE);
 
-  FisherYatesSampleKernel<T, WARP_SIZE, BLOCK_WARPS, TILE_SIZE>
+  FisherYatesSampleKernel<T, CTA_SIZE, BLOCK_CTAS, TILE_SIZE>
       <<<grid, block, 0, dev_ctx.stream()>>>(0,
                                              sample_size,
                                              bs,
@@ -352,7 +349,7 @@ void FisherYatesSampleNeighbors(const Context& dev_ctx,
                                              perm_data,
                                              col_ptr);
 
-  GatherEdge<T, WARP_SIZE, BLOCK_WARPS, TILE_SIZE>
+  GatherEdge<T, CTA_SIZE, BLOCK_CTAS, TILE_SIZE>
       <<<grid, block, 0, dev_ctx.stream()>>>(
           sample_size,
           bs,