Commit 42c1856

unify PDL's ACKBLK and PREEXIT with CUDA API

Signed-off-by: Zhenhuan Chen <zhenhuanc@nvidia.com>
1 parent: b3d468f
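
For context: judging from the diffs below, ACKBLK and PREEXIT are this codebase's names for the two halves of Programmatic Dependent Launch (PDL). ACKBLK corresponds to the PTX instruction griddepcontrol.wait, which blocks until the grids this kernel depends on have made their memory operations visible, and PREEXIT corresponds to griddepcontrol.launch_dependents, which signals that dependent grids in the same stream may begin launching. CUDA 11.8 and later expose both as device runtime calls, and this commit replaces the inline PTX with those calls one-for-one. A minimal sketch of the pattern, using a hypothetical kernel rather than one from this commit:

#include <cuda_runtime.h>

// Hypothetical kernel showing the PDL pattern applied throughout this commit:
// synchronize on the upstream grid before reading its results, then signal
// dependent grids while the final stores are still in flight.
__global__ void pdlStyleKernel(float const* in, float* out, int64_t n)
{
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
    // Was: asm volatile("griddepcontrol.wait;");  (ACKBLK)
    cudaGridDependencySynchronize();
#endif
    for (int64_t i = threadIdx.x + blockIdx.x * blockDim.x; i < n; i += blockDim.x * gridDim.x)
    {
        out[i] = 2.0f * in[i];
    }
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
    // Was: asm volatile("griddepcontrol.launch_dependents;");  (PREEXIT)
    cudaTriggerProgrammaticLaunchCompletion();
#endif
}

The substitution is behavior-preserving: the device runtime calls map to the same griddepcontrol instructions, but avoid raw inline PTX.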

File tree: 12 files changed, +42 -42 lines

cpp/tensorrt_llm/common/cudaFp8Utils.cu
Lines changed: 2 additions & 2 deletions

@@ -42,7 +42,7 @@ template <QuantizeMode QUANTIZE_MODE, bool QUANTIZE, typename T_OUT, typename T_
 __global__ void scaleMatrix(T_OUT* output, T_S const* input_scale, T_IN const* input, int64_t numel, int64_t lda)
 {
 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-    asm volatile("griddepcontrol.wait;");
+    cudaGridDependencySynchronize();
 #endif

     for (int64_t i = threadIdx.x + blockIdx.x * blockDim.x; i < numel; i += blockDim.x * gridDim.x)
@@ -62,7 +62,7 @@ __global__ void scaleMatrix(T_OUT* output, T_S const* input_scale, T_IN const* i
         }
     }
 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-    asm volatile("griddepcontrol.launch_dependents;");
+    cudaTriggerProgrammaticLaunchCompletion();
 #endif
 }
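
A note on scope (not visible in this diff): the device-side calls above only take effect when the dependent kernel is launched with the programmatic stream serialization attribute; without it, launches in a stream serialize as usual and the calls are effectively no-ops. A hedged host-side sketch, reusing the hypothetical pdlStyleKernel from the sketch above:

#include <cuda_runtime.h>

// Launch the dependent kernel with PDL enabled so it may start executing
// (e.g. run its prologue) before the preceding kernel in the same stream has
// fully completed; cudaGridDependencySynchronize() inside the kernel then
// guards the actual data dependency.
void launchDependentWithPdl(float const* in, float* out, int64_t n, cudaStream_t stream)
{
    cudaLaunchConfig_t config = {};
    config.gridDim = dim3(128);
    config.blockDim = dim3(256);
    config.stream = stream;

    cudaLaunchAttribute attrs[1];
    attrs[0].id = cudaLaunchAttributeProgrammaticStreamSerialization;
    attrs[0].val.programmaticStreamSerializationAllowed = 1;
    config.attrs = attrs;
    config.numAttrs = 1;

    cudaLaunchKernelEx(&config, pdlStyleKernel, in, out, n);
}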

cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/arch/grid_dependency_control.h
Lines changed: 2 additions & 2 deletions

@@ -46,7 +46,7 @@ CUTLASS_DEVICE
 void launch_dependent_grids()
 {
 #if (defined(CUTLASS_GDC_ENABLED))
-    asm volatile("griddepcontrol.launch_dependents;");
+    cudaTriggerProgrammaticLaunchCompletion();
 #endif
 }

@@ -57,7 +57,7 @@ CUTLASS_DEVICE
 void wait_on_dependent_grids()
 {
 #if (defined(CUTLASS_GDC_ENABLED))
-    asm volatile("griddepcontrol.wait;");
+    cudaGridDependencySynchronize();
 #endif
 }
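
These wrappers let CUTLASS-extension kernels opt into grid dependency control without touching PTX or the CUDA API directly: when CUTLASS_GDC_ENABLED is not defined, both collapse to empty functions. A hedged usage sketch (kernel, workload, and include path are hypothetical; see grid_dependency_control.h for the real declarations):

// Sketch only; namespace qualification omitted.
#include "cutlass_extensions/arch/grid_dependency_control.h"

__global__ void gdcStyleKernel(float const* in, float* out, int n)
{
    // Wait for the grids this kernel depends on; wraps
    // cudaGridDependencySynchronize(), a no-op without CUTLASS_GDC_ENABLED.
    wait_on_dependent_grids();

    for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < n; i += blockDim.x * gridDim.x)
    {
        out[i] = in[i] + 1.0f;
    }

    // Allow grids that depend on this one to start launching; wraps
    // cudaTriggerProgrammaticLaunchCompletion().
    launch_dependent_grids();
}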

cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_kernels.cu
Lines changed: 22 additions & 22 deletions

@@ -161,7 +161,7 @@ __global__ void buildMinLatencyActiveExpertMapsKernel(int* num_active_experts_pe
     int const cluster_size, int const num_experts_smem)
 {
 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-    asm volatile("griddepcontrol.wait;");
+    cudaGridDependencySynchronize();
 #endif
     // Use one block to process the min latency case
     int tid = threadIdx.x;
@@ -271,7 +271,7 @@ __global__ void buildMinLatencyActiveExpertMapsKernel(int* num_active_experts_pe
         }
     }
 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-    asm volatile("griddepcontrol.launch_dependents;");
+    cudaTriggerProgrammaticLaunchCompletion();
 #endif
 }

@@ -330,7 +330,7 @@ __global__ void fusedBuildExpertMapsSortFirstTokenKernel(int const* const token_

     // Wait PDL before reading token_selected_experts
 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-    asm volatile("griddepcontrol.wait;");
+    cudaGridDependencySynchronize();
 #endif

     // build expert map
@@ -371,7 +371,7 @@ __global__ void fusedBuildExpertMapsSortFirstTokenKernel(int const* const token_

     // We are done with compute, launch the dependent kernels while the stores are in flight
 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-    asm volatile("griddepcontrol.launch_dependents;");
+    cudaTriggerProgrammaticLaunchCompletion();
 #endif

     // write to shared memory and global memory
@@ -576,7 +576,7 @@ __global__ void blockExpertPrefixSumKernel(int const* token_selected_experts, in
     int const token_id = block_id * kNumTokensPerBlock + threadIdx.x;

 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-    asm volatile("griddepcontrol.wait;");
+    cudaGridDependencySynchronize();
 #endif

     int expanded_token_id = -1;
@@ -609,7 +609,7 @@ __global__ void blockExpertPrefixSumKernel(int const* token_selected_experts, in
     }

 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-    asm volatile("griddepcontrol.launch_dependents;");
+    cudaTriggerProgrammaticLaunchCompletion();
 #endif
 }

@@ -669,7 +669,7 @@ __global__ void globalExpertPrefixSumLargeKernel(int const* blocked_expert_count
     int cnt = 0;

 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-    asm volatile("griddepcontrol.wait;");
+    cudaGridDependencySynchronize();
 #endif

     // Note: Because of limited registers, cannot store thread-level prefix sum or enable #pragma unroll
@@ -703,7 +703,7 @@ __global__ void globalExpertPrefixSumLargeKernel(int const* blocked_expert_count
     }

 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-    asm volatile("griddepcontrol.launch_dependents;");
+    cudaTriggerProgrammaticLaunchCompletion();
 #endif
 }

@@ -715,7 +715,7 @@ __global__ void globalExpertPrefixSumKernel(int const* blocked_expert_counts, in
     __shared__ typename BlockScan::TempStorage temp_storage;

 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-    asm volatile("griddepcontrol.wait;");
+    cudaGridDependencySynchronize();
 #endif

     int const cnt = threadIdx.x < num_experts_per_node * num_blocks_per_seq ? blocked_expert_counts[threadIdx.x] : 0;
@@ -736,7 +736,7 @@ __global__ void globalExpertPrefixSumKernel(int const* blocked_expert_counts, in
     }

 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-    asm volatile("griddepcontrol.launch_dependents;");
+    cudaTriggerProgrammaticLaunchCompletion();
 #endif
 }

@@ -807,7 +807,7 @@ __global__ void mergeExpertPrefixSumKernel(int const* blocked_expert_counts, int
     int const token_id = block_id * blockDim.x + threadIdx.x;

 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-    asm volatile("griddepcontrol.wait;");
+    cudaGridDependencySynchronize();
 #endif

     int const cnt = blocked_expert_counts[target_expert_id * num_blocks_per_seq + block_id];
@@ -822,7 +822,7 @@ __global__ void mergeExpertPrefixSumKernel(int const* blocked_expert_counts, int
     }

 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-    asm volatile("griddepcontrol.launch_dependents;");
+    cudaTriggerProgrammaticLaunchCompletion();
 #endif
 }

@@ -1256,7 +1256,7 @@ __global__ void computeStridesTmaWarpSpecializedKernel(int64_t const* expert_fir
     }

 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-    asm volatile("griddepcontrol.wait;");
+    cudaGridDependencySynchronize();
 #endif

     // Both gemms use the same token offset
@@ -1331,7 +1331,7 @@ __global__ void computeStridesTmaWarpSpecializedKernel(int64_t const* expert_fir
         bias2, gemm2_output, router_scales, permuted_row_to_unpermuted_row, expert);

 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-    asm volatile("griddepcontrol.launch_dependents;");
+    cudaTriggerProgrammaticLaunchCompletion();
 #endif
 }

@@ -1392,7 +1392,7 @@ __global__ void expandInputRowsKernel(InputActivationsType const* unpermuted_inp
         "Only NVFP4, MXFP8 and WINT4_AFP8 supports outputting a different format as part of the expansion");

 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-    asm volatile("griddepcontrol.wait;");
+    cudaGridDependencySynchronize();
 #endif

     constexpr int VecSize = is_nvfp4 ? TmaWarpSpecializedGroupedGemmInput::NVFP4BlockScaleVectorSize
@@ -1522,7 +1522,7 @@ __global__ void expandInputRowsKernel(InputActivationsType const* unpermuted_inp
     }

 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-    asm volatile("griddepcontrol.launch_dependents;");
+    cudaTriggerProgrammaticLaunchCompletion();
 #endif

     // Pad zeros in the extra SFs along the N dimension, we do this to ensure there are no nan values in the padded SF
@@ -1714,7 +1714,7 @@ __global__ void finalizeMoeRoutingKernel(GemmOutputType const* expanded_permuted
     auto* reduced_row_ptr_v = reinterpret_cast<OutputElem*>(reduced_row_ptr);

 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-    asm volatile("griddepcontrol.wait;");
+    cudaGridDependencySynchronize();
 #endif

 #pragma unroll
@@ -1754,7 +1754,7 @@ __global__ void finalizeMoeRoutingKernel(GemmOutputType const* expanded_permuted
         reduced_row_ptr_v[elem_index] = output_elem;
     }
 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-    asm volatile("griddepcontrol.launch_dependents;");
+    cudaTriggerProgrammaticLaunchCompletion();
 #endif
 }

@@ -1773,7 +1773,7 @@ __global__ void finalizeMoeRoutingNoFillingKernel(GemmOutputType const* expanded
     assert(unpadded_cols <= padded_cols);

 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-    asm volatile("griddepcontrol.wait;");
+    cudaGridDependencySynchronize();
 #endif

     int64_t const num_valid_tokens = expert_first_token_offset[num_experts_per_node];
@@ -1862,7 +1862,7 @@ __global__ void finalizeMoeRoutingNoFillingKernel(GemmOutputType const* expanded
         }
     }
 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-    asm volatile("griddepcontrol.launch_dependents;");
+    cudaTriggerProgrammaticLaunchCompletion();
 #endif
 }

@@ -2059,7 +2059,7 @@ __global__ void doActivationKernel(T* output, GemmOutputType const* gemm_result,
     int64_t const num_valid_tokens = expert_first_token_offset[num_experts_per_node];

 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-    asm volatile("griddepcontrol.wait;");
+    cudaGridDependencySynchronize();
 #endif
     for (int64_t token = blockIdx.x; token < num_valid_tokens; token += gridDim.x)
     {
@@ -2175,7 +2175,7 @@ __global__ void doActivationKernel(T* output, GemmOutputType const* gemm_result,
     }

 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-    asm volatile("griddepcontrol.launch_dependents;");
+    cudaTriggerProgrammaticLaunchCompletion();
 #endif

     // Pad zeros in the extra SFs along the N dimension, we do this to ensure there are no nan values in the padded SF

cpp/tensorrt_llm/kernels/groupRmsNormKernels/groupRmsNormKernels.cu
Lines changed: 4 additions & 4 deletions

@@ -108,7 +108,7 @@ __global__ void GroupRMSNormBaseKernel(GroupRMSParams<n> params, int rounds)
     PackedType const* __restrict__ weight_ptr = nullptr;

 #if (__CUDACC_VER_MAJOR__ >= 12 && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-    asm volatile("griddepcontrol.wait;");
+    cudaGridDependencySynchronize();
 #endif

     // Find which input current warp operates on
@@ -260,7 +260,7 @@ __global__ void GroupRMSNormBaseKernel(GroupRMSParams<n> params, int rounds)
     }

 #if (__CUDACC_VER_MAJOR__ >= 12 && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-    asm volatile("griddepcontrol.launch_dependents;");
+    cudaTriggerProgrammaticLaunchCompletion();
 #endif
 }

@@ -302,7 +302,7 @@ __global__ void GroupRMSNormKernelLargeBatch(
     bool process_input_1 = warp_idx < warp_size_1;

 #if (__CUDACC_VER_MAJOR__ >= 12 && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-    asm volatile("griddepcontrol.wait;");
+    cudaGridDependencySynchronize();
 #endif

     // Get input pointers
@@ -562,7 +562,7 @@ __global__ void GroupRMSNormKernelLargeBatch(
     }

 #if (__CUDACC_VER_MAJOR__ >= 12 && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-    asm volatile("griddepcontrol.launch_dependents;");
+    cudaTriggerProgrammaticLaunchCompletion();
 #endif
 }

cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Fp8Bf16GemmAttnScalingPerBlockTemplate.cuh
Lines changed: 1 addition & 1 deletion

@@ -247,7 +247,7 @@ __launch_bounds__(BLOCK_SIZE) __global__ void llama4_fp8_bf16_gemm_attn_scaling_
     }

 #if ENABLE_PREEXIT
-    asm volatile("griddepcontrol.launch_dependents;");
+    cudaTriggerProgrammaticLaunchCompletion();
 #endif
 #endif
 }

cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Fp8Bf16GemmPerBlockTemplate.cuh
Lines changed: 1 addition & 1 deletion

@@ -234,7 +234,7 @@ __launch_bounds__(BLOCK_SIZE) __global__ void llama4_fp8_bf16_gemm_per_block_ker
     }

 #if ENABLE_PREEXIT
-    asm volatile("griddepcontrol.launch_dependents;");
+    cudaTriggerProgrammaticLaunchCompletion();
 #endif
 #endif
 }

cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Fp8Bf16GemmPerWarpTemplate.cuh
Lines changed: 1 addition & 1 deletion

@@ -260,7 +260,7 @@ __launch_bounds__(BLOCK_SIZE) __global__ void llama4_fp8_bf16_gemm_per_warp_kern
     }

 #if ENABLE_PREEXIT
-    asm volatile("griddepcontrol.launch_dependents;");
+    cudaTriggerProgrammaticLaunchCompletion();
 #endif
 #endif
 }

cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Fp8Fp8GemmSwiGLUPerBlockTemplate.cuh
Lines changed: 1 addition & 1 deletion

@@ -273,7 +273,7 @@ __launch_bounds__(BLOCK_SIZE) __global__ void llama4_fp8_fp8_gemm_swiglu_per_blo
     }

 #if ENABLE_PREEXIT
-    asm volatile("griddepcontrol.launch_dependents;");
+    cudaTriggerProgrammaticLaunchCompletion();
 #endif
 #endif
 }

cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4MinLatencyMoEOp.cu
Lines changed: 2 additions & 2 deletions

@@ -188,7 +188,7 @@ __global__ void llama4_moe_fc13_swiglu_fp8_kernel(int num_tokens,
     }

 #if ENABLE_PREEXIT
-    asm volatile("griddepcontrol.launch_dependents;");
+    cudaTriggerProgrammaticLaunchCompletion();
 #endif
 #endif
 }
@@ -307,7 +307,7 @@ __global__ void llama4_moe_fc2_fp8_kernel(int num_tokens,
     }

 #if ENABLE_PREEXIT
-    asm volatile("griddepcontrol.launch_dependents;");
+    cudaTriggerProgrammaticLaunchCompletion();
 #endif
 #endif
 }

cpp/tensorrt_llm/kernels/mlaKernels.cu
Lines changed: 2 additions & 2 deletions

@@ -387,7 +387,7 @@ __global__ void applyMLARopeAndAssignQKVKernelGeneration(T* qkv_output, T* q_pe,
     // Block/Head idx.
     size_t const head_idx = blockIdx.y;
 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-    asm volatile("griddepcontrol.wait;");
+    cudaGridDependencySynchronize();
 #endif

     if (blockIdx.x == 0 && blockIdx.y == 0 && threadIdx.x == 0)
@@ -595,7 +595,7 @@ __global__ void applyMLARopeAndAssignQKVKernelGeneration(T* qkv_output, T* q_pe,
     }

 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-    asm volatile("griddepcontrol.launch_dependents;");
+    cudaTriggerProgrammaticLaunchCompletion();
 #endif

     // The implementation of the parallel scan in the thread block (see CUB for details).
