Skip to content

Commit 90aab05

Browse files
authored
[fix] Fix Llama4 guardwords failures (#4844)
Signed-off-by: Yilin Fan <206948969+nv-yilinf@users.noreply.github.com>
1 parent 13f6833 commit 90aab05

File tree

6 files changed

+29
-29
lines changed

6 files changed

+29
-29
lines changed

cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Bf16Bf16Gemm.cu

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ __global__ void llama4_bf16_bf16_gemm_kernel(int num_tokens,
4545
int const row = blockIdx.x % NUM_EXPERTS; // Matrix row / Output element index
4646
int const tid = threadIdx.x; // Thread ID within the block
4747

48-
// FDL prefetch all B data
48+
// PDL prefetch all B data
4949
aligned_bf16x4 b_vec[GEMM_K / BLOCK_SIZE / VEC_SIZE];
5050
#pragma unroll
5151
for (int chunk = 0; chunk < GEMM_K / BLOCK_SIZE / VEC_SIZE; chunk++)
@@ -113,7 +113,7 @@ void llama4_bf16_bf16_gemm_launcher(
113113
int const grid_size = NUM_EXPERTS * num_tokens;
114114

115115
void* args[] = {(void*) &num_tokens, (void*) &A, (void*) &B, (void*) &C};
116-
launch_kernel_fdl(dim3(grid_size), dim3(BLOCK_SIZE), stream, (void*) llama4_bf16_bf16_gemm_kernel, args, 4);
116+
launch_kernel_pdl(dim3(grid_size), dim3(BLOCK_SIZE), stream, (void*) llama4_bf16_bf16_gemm_kernel, args, 4);
117117
}
118118

119119
void llama4_bf16_bf16_gemm_op(int num_tokens, void const* A, void const* B, void* C, cudaStream_t stream)

cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Fp8Bf16Gemm.cu

Lines changed: 16 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -38,56 +38,56 @@ void llama4_fp8_bf16_gemm_launcher(void const* A, void const* B, void* C, void c
3838
// When num_tokens == 1, the best tiling size is tile_token == 1 and tile_out == 1.
3939
dim3 const grid_size = dim3(div_up(hidden_out, 1), div_up(num_tokens, 1), 1);
4040
void* kernel_func = get_per_block_func_ptr_aligned_true_5120_(1, 1);
41-
launch_kernel_fdl(dim3(grid_size), dim3(BLOCK_SIZE), stream, kernel_func, args, 7);
41+
launch_kernel_pdl(dim3(grid_size), dim3(BLOCK_SIZE), stream, kernel_func, args, 7);
4242
}
4343
else if (num_tokens == 2)
4444
{
4545
// When num_tokens == 2, the best tiling size is tile_token == 2 and tile_out == 1.
4646
dim3 const grid_size = dim3(div_up(hidden_out, 1), div_up(num_tokens, 2), 1);
4747
void* kernel_func = get_per_block_func_ptr_aligned_true_5120_(2, 1);
48-
launch_kernel_fdl(dim3(grid_size), dim3(BLOCK_SIZE), stream, kernel_func, args, 7);
48+
launch_kernel_pdl(dim3(grid_size), dim3(BLOCK_SIZE), stream, kernel_func, args, 7);
4949
}
5050
else if (num_tokens == 3)
5151
{
5252
// When num_tokens == 3, the best tiling size is tile_token == 1 and tile_out == 4.
5353
dim3 const grid_size = dim3(div_up(hidden_out, 4), div_up(num_tokens, 1), 1);
5454
void* kernel_func = get_per_block_func_ptr_aligned_true_5120_(1, 4);
55-
launch_kernel_fdl(dim3(grid_size), dim3(BLOCK_SIZE), stream, kernel_func, args, 7);
55+
launch_kernel_pdl(dim3(grid_size), dim3(BLOCK_SIZE), stream, kernel_func, args, 7);
5656
}
5757
else if (num_tokens == 4)
5858
{
5959
// When num_tokens == 4, the best tiling size is tile_token == 2 and tile_out == 2.
6060
dim3 const grid_size = dim3(div_up(hidden_out, 2), div_up(num_tokens, 2), 1);
6161
void* kernel_func = get_per_block_func_ptr_aligned_true_5120_(2, 2);
62-
launch_kernel_fdl(dim3(grid_size), dim3(BLOCK_SIZE), stream, kernel_func, args, 7);
62+
launch_kernel_pdl(dim3(grid_size), dim3(BLOCK_SIZE), stream, kernel_func, args, 7);
6363
}
6464
else if (num_tokens == 5)
6565
{
6666
// When num_tokens == 5, the best tiling size is tile_token == 1 and tile_out == 4.
6767
dim3 const grid_size = dim3(div_up(hidden_out, 4), div_up(num_tokens, 1), 1);
6868
void* kernel_func = get_per_block_func_ptr_aligned_true_5120_(1, 4);
69-
launch_kernel_fdl(dim3(grid_size), dim3(BLOCK_SIZE), stream, kernel_func, args, 7);
69+
launch_kernel_pdl(dim3(grid_size), dim3(BLOCK_SIZE), stream, kernel_func, args, 7);
7070
}
7171
else if (num_tokens == 6)
7272
{
7373
// When num_tokens == 6, the best tiling size is tile_token == 3 and tile_out == 4.
7474
dim3 const grid_size = dim3(div_up(hidden_out, 4), div_up(num_tokens, 3), 1);
7575
void* kernel_func = get_per_block_func_ptr_aligned_true_5120_(3, 4);
76-
launch_kernel_fdl(dim3(grid_size), dim3(BLOCK_SIZE), stream, kernel_func, args, 7);
76+
launch_kernel_pdl(dim3(grid_size), dim3(BLOCK_SIZE), stream, kernel_func, args, 7);
7777
}
7878
else if (num_tokens == 7)
7979
{
8080
// When num_tokens == 7, the best tiling size is tile_token == 1 and tile_out == 4.
8181
dim3 const grid_size = dim3(div_up(hidden_out, 4), div_up(num_tokens, 1), 1);
8282
void* kernel_func = get_per_block_func_ptr_aligned_true_5120_(1, 4);
83-
launch_kernel_fdl(dim3(grid_size), dim3(BLOCK_SIZE), stream, kernel_func, args, 7);
83+
launch_kernel_pdl(dim3(grid_size), dim3(BLOCK_SIZE), stream, kernel_func, args, 7);
8484
}
8585
else if (num_tokens == 8)
8686
{
8787
// When num_tokens == 8, the best tiling size is tile_token == 2 and tile_out == 4.
8888
dim3 const grid_size = dim3(div_up(hidden_out, 4), div_up(num_tokens, 2), 1);
8989
void* kernel_func = get_per_block_func_ptr_aligned_true_5120_(2, 4);
90-
launch_kernel_fdl(dim3(grid_size), dim3(BLOCK_SIZE), stream, kernel_func, args, 7);
90+
launch_kernel_pdl(dim3(grid_size), dim3(BLOCK_SIZE), stream, kernel_func, args, 7);
9191
}
9292
else
9393
{
@@ -115,56 +115,56 @@ void llama4_fp8_bf16_gemm_attn_scaling_launcher(void const* A, void const* B, vo
115115
// When num_tokens == 1, the best tiling size is tile_token == 1 and tile_out == 1.
116116
dim3 const grid_size = dim3(div_up(hidden_out, 1), div_up(num_tokens, 1), 1);
117117
void* kernel_func = get_kernel_func(1, 1, pos_ids_int64);
118-
launch_kernel_fdl(dim3(grid_size), dim3(BLOCK_SIZE), stream, kernel_func, args, 11);
118+
launch_kernel_pdl(dim3(grid_size), dim3(BLOCK_SIZE), stream, kernel_func, args, 11);
119119
}
120120
else if (num_tokens == 2)
121121
{
122122
// When num_tokens == 2, the best tiling size is tile_token == 2 and tile_out == 2.
123123
dim3 const grid_size = dim3(div_up(hidden_out, 2), div_up(num_tokens, 2), 1);
124124
void* kernel_func = get_kernel_func(2, 2, pos_ids_int64);
125-
launch_kernel_fdl(dim3(grid_size), dim3(BLOCK_SIZE), stream, kernel_func, args, 11);
125+
launch_kernel_pdl(dim3(grid_size), dim3(BLOCK_SIZE), stream, kernel_func, args, 11);
126126
}
127127
else if (num_tokens == 3)
128128
{
129129
// When num_tokens == 3, the best tiling size is tile_token == 1 and tile_out == 4.
130130
dim3 const grid_size = dim3(div_up(hidden_out, 4), div_up(num_tokens, 1), 1);
131131
void* kernel_func = get_kernel_func(1, 4, pos_ids_int64);
132-
launch_kernel_fdl(dim3(grid_size), dim3(BLOCK_SIZE), stream, kernel_func, args, 11);
132+
launch_kernel_pdl(dim3(grid_size), dim3(BLOCK_SIZE), stream, kernel_func, args, 11);
133133
}
134134
else if (num_tokens == 4)
135135
{
136136
// When num_tokens == 4, the best tiling size is tile_token == 2 and tile_out == 2.
137137
dim3 const grid_size = dim3(div_up(hidden_out, 2), div_up(num_tokens, 2), 1);
138138
void* kernel_func = get_kernel_func(2, 2, pos_ids_int64);
139-
launch_kernel_fdl(dim3(grid_size), dim3(BLOCK_SIZE), stream, kernel_func, args, 11);
139+
launch_kernel_pdl(dim3(grid_size), dim3(BLOCK_SIZE), stream, kernel_func, args, 11);
140140
}
141141
else if (num_tokens == 5)
142142
{
143143
// When num_tokens == 5, the best tiling size is tile_token == 1 and tile_out == 4.
144144
dim3 const grid_size = dim3(div_up(hidden_out, 4), div_up(num_tokens, 1), 1);
145145
void* kernel_func = get_kernel_func(1, 4, pos_ids_int64);
146-
launch_kernel_fdl(dim3(grid_size), dim3(BLOCK_SIZE), stream, kernel_func, args, 11);
146+
launch_kernel_pdl(dim3(grid_size), dim3(BLOCK_SIZE), stream, kernel_func, args, 11);
147147
}
148148
else if (num_tokens == 6)
149149
{
150150
// When num_tokens == 6, the best tiling size is tile_token == 2 and tile_out == 4.
151151
dim3 const grid_size = dim3(div_up(hidden_out, 4), div_up(num_tokens, 2), 1);
152152
void* kernel_func = get_kernel_func(2, 4, pos_ids_int64);
153-
launch_kernel_fdl(dim3(grid_size), dim3(BLOCK_SIZE), stream, kernel_func, args, 11);
153+
launch_kernel_pdl(dim3(grid_size), dim3(BLOCK_SIZE), stream, kernel_func, args, 11);
154154
}
155155
else if (num_tokens == 7)
156156
{
157157
// When num_tokens == 7, the best tiling size is tile_token == 1 and tile_out == 4.
158158
dim3 const grid_size = dim3(div_up(hidden_out, 4), div_up(num_tokens, 1), 1);
159159
void* kernel_func = get_kernel_func(1, 4, pos_ids_int64);
160-
launch_kernel_fdl(dim3(grid_size), dim3(BLOCK_SIZE), stream, kernel_func, args, 11);
160+
launch_kernel_pdl(dim3(grid_size), dim3(BLOCK_SIZE), stream, kernel_func, args, 11);
161161
}
162162
else if (num_tokens == 8)
163163
{
164164
// When num_tokens == 8, the best tiling size is tile_token == 2 and tile_out == 4.
165165
dim3 const grid_size = dim3(div_up(hidden_out, 4), div_up(num_tokens, 2), 1);
166166
void* kernel_func = get_kernel_func(2, 4, pos_ids_int64);
167-
launch_kernel_fdl(dim3(grid_size), dim3(BLOCK_SIZE), stream, kernel_func, args, 11);
167+
launch_kernel_pdl(dim3(grid_size), dim3(BLOCK_SIZE), stream, kernel_func, args, 11);
168168
}
169169
else
170170
{

cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Fp8Fp8GemmSwiGLU.cu

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,7 @@ void dispatch_llama4_fp8_fp8_gemm_swiglu_hidden_in(void const* __restrict__ A, v
8080

8181
void* args[] = {(void*) &A, (void*) &B, (void*) &C, (void*) &in_scale, (void*) &out_scale_inv, (void*) &num_tokens,
8282
(void*) &hidden_in, (void*) &hidden_out};
83-
launch_kernel_fdl(grid_size, dim3(BLOCK_SIZE), stream, func_ptr, args, 8);
83+
launch_kernel_pdl(grid_size, dim3(BLOCK_SIZE), stream, func_ptr, args, 8);
8484
}
8585

8686
template <int TILE_TOKEN>

cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4MinLatencyMoEOp.cu

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ namespace tensorrt_llm::kernels::llama4_min_latency::llama4_moe
3939
#define TOPK_VEC_SIZE 4
4040
static_assert(NUM_EXPERTS == TOPK_VEC_SIZE * WARP_SIZE, "NUM_EXPERTS must be equal to TOPK_VEC_SIZE * WARP_SIZE");
4141

42-
// This is the hand-optimized kernel by Po-Han.
42+
// This is the hand-optimized kernel.
4343
// The computation is:
4444
// C = silu(AxB_gated * in_scale * sigmoid(logit)) * (AxB_linear * in_scale * sigmoid(logit)) * out_scale_inv
4545
// The out_scale_inv cannot be fused with in_scale because silu() is non-linear.
@@ -213,10 +213,10 @@ void launch_llama4_moe_fc13_swiglu_fp8_kernel(int num_tokens, int num_experts,
213213

214214
void* args[] = {(void*) &num_tokens, (void*) &A, (void*) &B, (void*) &logits, (void*) &C, (void*) &exp_idx,
215215
(void*) &in_scales, (void*) &out_scale_inv};
216-
launch_kernel_fdl(dim3(grid_size), dim3(BLOCK_SIZE), stream, (void*) llama4_moe_fc13_swiglu_fp8_kernel, args, 8);
216+
launch_kernel_pdl(dim3(grid_size), dim3(BLOCK_SIZE), stream, (void*) llama4_moe_fc13_swiglu_fp8_kernel, args, 8);
217217
}
218218

219-
// This is the hand-optimized kernel by Po-Han.
219+
// This is the hand-optimized kernel.
220220
__global__ void llama4_moe_fc2_fp8_kernel(int num_tokens,
221221
__nv_fp8_e4m3 const* __restrict__ A, // Input tensor A [num_tokens][INTER_SIZE]
222222
__nv_fp8_e4m3 const* __restrict__ B, // Input tensor B [num_experts][HIDDEN_SIZE][INTER_SIZE]
@@ -329,7 +329,7 @@ void launch_llama4_moe_fc2_fp8_kernel(int num_tokens, int num_experts,
329329

330330
void* args[]
331331
= {(void*) &num_tokens, (void*) &A, (void*) &B, (void*) &exp_idx, (void*) &C, (void*) &scaling_factors};
332-
launch_kernel_fdl(dim3(grid_size), dim3(BLOCK_SIZE), stream, (void*) llama4_moe_fc2_fp8_kernel, args, 6);
332+
launch_kernel_pdl(dim3(grid_size), dim3(BLOCK_SIZE), stream, (void*) llama4_moe_fc2_fp8_kernel, args, 6);
333333
}
334334

335335
void run_moe_llama4_tp8ep1_min_latency(int num_tokens, int num_experts,

cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Utils.cuh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@ constexpr bool ENABLE_PREEXIT = 0;
6666

6767
} // namespace llama4_fp8_fp8_gemm_swiglu
6868

69-
inline void launch_kernel_fdl(
69+
inline void launch_kernel_pdl(
7070
dim3 grid_dim, dim3 block_dim, cudaStream_t stream, void* kernel_func, void* args[], int num_args)
7171
{
7272
cudaLaunchConfig_t config;

cpp/tensorrt_llm/kernels/trtllmGenKernels/blockScaleMoe/RoutingKernel.cu

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1774,7 +1774,7 @@ __global__ void __launch_bounds__(WarpSize) routingIndicesWarpKernel(KernelParam
17741774
params.mPtrNumNonExitingCtas[0] = numNonExitingCtas;
17751775
}
17761776

1777-
#if !defined(FDL_PROFILE) || FDL_PROFILE == 0
1777+
#if !defined(PDL_PROFILE) || PDL_PROFILE == 0
17781778
// we can trigger the next kernel at this point
17791779
if constexpr (KernelParams::UsePdl)
17801780
{
@@ -2059,7 +2059,7 @@ __global__ void __cluster_dims__(NumBlocksPerCluster, 1, 1) __launch_bounds__(Nu
20592059
// We can't do it earlier because FC1 depends on the mPtrCtaIdxXyToBatchIdx,
20602060
// mPtrCtaIdxXyToMnLimit, mPtrNumNonExitingCtas and mPtrTotalNumPaddedTokens
20612061
// TODO: this is not sufficient to ensure visibility in the next kernel!
2062-
#if !defined(FDL_PROFILE) || FDL_PROFILE == 0
2062+
#if !defined(PDL_PROFILE) || PDL_PROFILE == 0
20632063
if constexpr (KernelParams::UsePdl)
20642064
{
20652065
cudaTriggerProgrammaticLaunchCompletion();
@@ -2517,7 +2517,7 @@ __global__ void __launch_bounds__(NumThreadsHist) routingIndicesOffsetsKernel(Ke
25172517
// Trigger secondary kernel.
25182518
// Note: this does not guarantee the visibility of prior writes unless the consumer executes a
25192519
// dependency sync.
2520-
#if !defined(FDL_PROFILE) || FDL_PROFILE == 0
2520+
#if !defined(PDL_PROFILE) || PDL_PROFILE == 0
25212521
if constexpr (KernelParams::UsePdl)
25222522
{
25232523
cudaTriggerProgrammaticLaunchCompletion();
@@ -3183,7 +3183,7 @@ __global__ void __cluster_dims__(NumBlocksPerCluster, 1, 1) __launch_bounds__(Nu
31833183
// We can't do it earlier because FC1 depends on the mPtrCtaIdxXyToBatchIdx,
31843184
// mPtrCtaIdxXyToMnLimit, mPtrNumNonExitingCtas and mPtrTotalNumPaddedTokens
31853185
// TODO: this is not sufficient to ensure visibility in the next kernel!
3186-
#if !defined(FDL_PROFILE) || FDL_PROFILE == 0
3186+
#if !defined(PDL_PROFILE) || PDL_PROFILE == 0
31873187
if constexpr (KernelParams::UsePdl)
31883188
{
31893189
cudaTriggerProgrammaticLaunchCompletion();
@@ -3665,7 +3665,7 @@ __global__ void __launch_bounds__(NumThreadsHist) routingIndicesOffsetsKernel(Ke
36653665
// Trigger secondary kernel.
36663666
// Note: this does not guarantee the visibility of prior writes unless the consumer executes a
36673667
// dependency sync.
3668-
#if !defined(FDL_PROFILE) || FDL_PROFILE == 0
3668+
#if !defined(PDL_PROFILE) || PDL_PROFILE == 0
36693669
if constexpr (KernelParams::UsePdl)
36703670
{
36713671
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))

0 commit comments

Comments
 (0)