Fix dynamic 0size nan complex problem (#73516)

A-nnonymous · web-flow · commit 6a5494b440e9 · 2025-06-22T15:17:44.000+08:00
diff --git a/paddle/phi/kernels/gpu/moe_permute_kernel.cu b/paddle/phi/kernels/gpu/moe_permute_kernel.cu
@@ -58,6 +58,7 @@ __global__ void tokens_unzip_stable_kernel(
     local_expert_offsets[i] = expert_base_offset.data[i];
     local_cumsum[i] = 0;
   }
+  const int base_row_idx = blockIdx.x * CUMSUM_BLOCK_SIZE;
   __shared__ int shared_expert_rowmap[CUMSUM_BLOCK_SIZE][MAX_NUM_EXPERTS];
   __shared__ probs_T shared_expert_probmap[CUMSUM_BLOCK_SIZE][MAX_NUM_EXPERTS];
 
@@ -238,13 +239,7 @@ void MoePermuteKernel(const Context &dev_ctx,
           "value.",
           MAX_NUM_EXPERTS,
           num_experts));
-  if (X.numel() == 0) {
-    dev_ctx.template Alloc<float>(XScale_unzipped);
-    dev_ctx.template Alloc<int>(zipped_expertwise_rowmap);
-    dev_ctx.template Alloc<T>(X_unzipped);
-    dev_ctx.template Alloc<float>(token_prob_unzipped);
-    return;
-  }
+
   const int quanted_cols = (XScale) ? XScale.get_ptr()->dims()[1] : 0;
   expert_base_offset expert_offset;
   int tokens_cumulated = 0;
@@ -270,46 +265,26 @@ void MoePermuteKernel(const Context &dev_ctx,
   dev_ctx.template Alloc<float>(XScale_unzipped);
   dev_ctx.template Alloc<int>(zipped_expertwise_rowmap);
   dev_ctx.template Alloc<T>(X_unzipped);
+  dev_ctx.template Alloc<float>(token_prob_unzipped);
   auto X_unzipped_ptr = reinterpret_cast<void *>(X_unzipped->data<T>());
-  for (int i = 0; i < num_experts; i++) {
-    int next_expert_offset =
-        i < num_experts - 1 ? expert_offset.data[i + 1] : output_rows;
-    int invalid_rows =
-        next_expert_offset - expert_offset.data[i] - tokens_per_expert[i];
-    cudaMemsetAsync(X_unzipped_ptr + tokens_per_expert[i] * sizeof(T),
-                    0,
-                    sizeof(T) * invalid_rows * cols,
-                    dev_ctx.stream());
-  }
+  cudaMemsetAsync(
+      X_unzipped_ptr, 0, sizeof(T) * output_rows * cols, dev_ctx.stream());
   if (XScale) {
     auto XScale_unzipped_ptr =
         reinterpret_cast<void *>(XScale_unzipped->data<float>());
-    for (int i = 0; i < num_experts; i++) {
-      int next_expert_offset =
-          i < num_experts - 1 ? expert_offset.data[i + 1] : output_rows;
-      int invalid_rows =
-          next_expert_offset - expert_offset.data[i] - tokens_per_expert[i];
-      cudaMemsetAsync(
-          XScale_unzipped_ptr + tokens_per_expert[i] * sizeof(float),
-          0,
-          sizeof(float) * invalid_rows * quanted_cols,
-          dev_ctx.stream());
-    }
+    cudaMemsetAsync(XScale_unzipped_ptr,
+                    0,
+                    sizeof(float) * output_rows * quanted_cols,
+                    dev_ctx.stream());
   }
-  dev_ctx.template Alloc<float>(token_prob_unzipped);
+
   auto token_prob_unzipped_ptr =
       reinterpret_cast<void *>(token_prob_unzipped->data<float>());
-  for (int i = 0; i < num_experts; i++) {
-    int next_expert_offset =
-        i < num_experts - 1 ? expert_offset.data[i + 1] : output_rows;
-    int invalid_rows =
-        next_expert_offset - expert_offset.data[i] - tokens_per_expert[i];
-    cudaMemsetAsync(
-        token_prob_unzipped_ptr + tokens_per_expert[i] * sizeof(float),
-        0,
-        sizeof(float) * invalid_rows,
-        dev_ctx.stream());
-  }
+  cudaMemsetAsync(token_prob_unzipped_ptr,
+                  0,
+                  sizeof(float) * output_rows,
+                  dev_ctx.stream());
+  if (X.numel() == 0) return;
   const int cumsum_blocknum =
       (rows + CUMSUM_BLOCK_SIZE - 1) / CUMSUM_BLOCK_SIZE;
   DenseTensor global_expertwise_block_cumsum =