
Commit 8462cf6

[TRTLLM-9578][feat] make PDL enabled by default (#9695)
Signed-off-by: Zhenhuan Chen <[email protected]>
1 parent 97b38ac commit 8462cf6
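
Background for this change: PDL (Programmatic Dependent Launch) is a CUDA feature (compute capability 9.0 and newer, exposed since CUDA 11.8) that lets a dependent grid begin launching before the grid it depends on has fully drained. The commit does two things: it flips the `TRTLLM_ENABLE_PDL` default to on, and it replaces raw PTX with the equivalent CUDA device intrinsics. `cudaGridDependencySynchronize()` stands in for `asm volatile("griddepcontrol.wait;")` and blocks until the preceding grid's memory writes are visible; `cudaTriggerProgrammaticLaunchCompletion()` stands in for `asm volatile("griddepcontrol.launch_dependents;")` and signals that dependent grids may start. Below is a minimal sketch of the host/device pairing, with hypothetical kernel names, assuming compilation for sm_90; it is not code from this commit:

```cuda
#include <cuda_runtime.h>

// Hypothetical producer/consumer pair illustrating the PDL intrinsics.
__global__ void producerKernel(float* buf, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        buf[i] = static_cast<float>(i);
    // Compute is done; allow dependent grids to start while stores drain.
    cudaTriggerProgrammaticLaunchCompletion();
}

__global__ void consumerKernel(float const* buf, float* out, int n)
{
    // Block until the producer grid's writes are guaranteed visible.
    cudaGridDependencySynchronize();
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        out[i] = 2.0f * buf[i];
}

// The overlap only happens if the dependent launch opts in via a launch
// attribute; otherwise the stream serializes the two grids as usual.
void launchWithPDL(float* buf, float* out, int n, cudaStream_t stream)
{
    int const block = 256;
    int const grid = (n + block - 1) / block;
    producerKernel<<<grid, block, 0, stream>>>(buf, n);

    cudaLaunchConfig_t cfg{};
    cfg.gridDim = grid;
    cfg.blockDim = block;
    cfg.stream = stream;
    cudaLaunchAttribute attr[1];
    attr[0].id = cudaLaunchAttributeProgrammaticStreamSerialization;
    attr[0].val.programmaticStreamSerializationAllowed = 1;
    cfg.attrs = attr;
    cfg.numAttrs = 1;
    cudaLaunchKernelEx(&cfg, consumerKernel,
        static_cast<float const*>(buf), out, n);
}
```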

File tree

18 files changed: +68 -57 lines changed


cpp/tensorrt_llm/common/cudaFp8Utils.cu

Lines changed: 2 additions & 2 deletions

@@ -43,7 +43,7 @@ template <QuantizeMode QUANTIZE_MODE, bool QUANTIZE, typename T_OUT, typename T_
 __global__ void scaleMatrix(T_OUT* output, T_S const* input_scale, T_IN const* input, int64_t numel, int64_t lda)
 {
 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-    asm volatile("griddepcontrol.wait;");
+    cudaGridDependencySynchronize();
 #endif

     for (int64_t i = threadIdx.x + blockIdx.x * blockDim.x; i < numel; i += blockDim.x * gridDim.x)

@@ -63,7 +63,7 @@ __global__ void scaleMatrix(T_OUT* output, T_S const* input_scale, T_IN const* i
         }
     }
 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-    asm volatile("griddepcontrol.launch_dependents;");
+    cudaTriggerProgrammaticLaunchCompletion();
 #endif
 }

cpp/tensorrt_llm/common/envUtils.cpp

Lines changed: 13 additions & 2 deletions

@@ -249,15 +249,26 @@ bool getEnvUseTileSizeKv64ForTrtllmGen()
 bool getEnvEnablePDL()
 {
     static std::once_flag flag;
-    static bool enablePDL = false;
+    static bool enablePDL = true;

     std::call_once(flag,
         [&]()
         {
             if (getSMVersion() >= 90)
             {
                 // PDL will be enabled by setting the env variables `TRTLLM_ENABLE_PDL` to `1`
-                enablePDL = getBoolEnv("TRTLLM_ENABLE_PDL");
+                char const* env = std::getenv("TRTLLM_ENABLE_PDL");
+                if (env)
+                {
+                    if (env[0] == '1' && env[1] == '\0')
+                    {
+                        enablePDL = true;
+                    }
+                    else if (env[0] == '0' && env[1] == '\0')
+                    {
+                        enablePDL = false;
+                    }
+                };
             }
         });
     return enablePDL;
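
The parse above makes the variable tri-state: unset keeps the new default (enabled on SM 90+), the exact string `0` disables PDL, the exact string `1` enables it, and any other value falls through to the default. A standalone sketch of the same logic for illustration (hypothetical function name, not TensorRT-LLM code; it omits the SM-version gate and the `std::call_once` caching):

```cpp
#include <cstdio>
#include <cstdlib>

// Mirrors the tri-state parse in getEnvEnablePDL() above.
static bool resolveEnablePDL()
{
    bool enablePDL = true; // new default
    if (char const* env = std::getenv("TRTLLM_ENABLE_PDL"))
    {
        if (env[0] == '1' && env[1] == '\0')
            enablePDL = true;  // explicit opt-in (already the default)
        else if (env[0] == '0' && env[1] == '\0')
            enablePDL = false; // explicit opt-out
        // Any other value ("true", "01", "") leaves the default in place.
    }
    return enablePDL;
}

int main()
{
    std::printf("PDL enabled: %s\n", resolveEnablePDL() ? "yes" : "no");
    return 0;
}
```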

cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/arch/grid_dependency_control.h

Lines changed: 2 additions & 2 deletions

@@ -46,7 +46,7 @@ CUTLASS_DEVICE
 void launch_dependent_grids()
 {
 #if (defined(CUTLASS_GDC_ENABLED))
-    asm volatile("griddepcontrol.launch_dependents;");
+    cudaTriggerProgrammaticLaunchCompletion();
 #endif
 }

@@ -57,7 +57,7 @@ CUTLASS_DEVICE
 void wait_on_dependent_grids()
 {
 #if (defined(CUTLASS_GDC_ENABLED))
-    asm volatile("griddepcontrol.wait;");
+    cudaGridDependencySynchronize();
 #endif
 }
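
These CUTLASS-extension wrappers hide both the architecture gate and the intrinsics behind `CUTLASS_GDC_ENABLED`: when that macro is not defined, both functions compile to empty bodies, so call sites need no guards of their own. A hedged sketch of a kernel bracketed by the two helpers (the kernel, and the assumption that the helpers are visible unqualified, are illustrative only):

```cuda
// Hypothetical kernel using the wrappers from grid_dependency_control.h.
__global__ void gdcBracketedCopy(float const* in, float* out, int n)
{
    // Wait for the grids this kernel depends on; empty without CUTLASS_GDC_ENABLED.
    wait_on_dependent_grids();

    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        out[i] = in[i];

    // Signal that grids depending on this one may begin launching.
    launch_dependent_grids();
}
```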

cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_kernels.cu

Lines changed: 22 additions & 22 deletions

@@ -164,7 +164,7 @@ __global__ void buildMinLatencyActiveExpertMapsKernel(int* num_active_experts_pe
     int const cluster_size, int const num_experts_smem)
 {
 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-    asm volatile("griddepcontrol.wait;");
+    cudaGridDependencySynchronize();
 #endif
     // Use one block to process the min latency case
     int tid = threadIdx.x;

@@ -274,7 +274,7 @@ __global__ void buildMinLatencyActiveExpertMapsKernel(int* num_active_experts_pe
         }
     }
 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-    asm volatile("griddepcontrol.launch_dependents;");
+    cudaTriggerProgrammaticLaunchCompletion();
 #endif
 }

@@ -333,7 +333,7 @@ __global__ void fusedBuildExpertMapsSortFirstTokenKernel(int const* const token_

     // Wait PDL before reading token_selected_experts
 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-    asm volatile("griddepcontrol.wait;");
+    cudaGridDependencySynchronize();
 #endif

     // build expert map

@@ -374,7 +374,7 @@ __global__ void fusedBuildExpertMapsSortFirstTokenKernel(int const* const token_

     // We are done with compute, launch the dependent kernels while the stores are in flight
 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-    asm volatile("griddepcontrol.launch_dependents;");
+    cudaTriggerProgrammaticLaunchCompletion();
 #endif

     // write to shared memory and global memory

@@ -579,7 +579,7 @@ __global__ void blockExpertPrefixSumKernel(int const* token_selected_experts, in
     int const token_id = block_id * kNumTokensPerBlock + threadIdx.x;

 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-    asm volatile("griddepcontrol.wait;");
+    cudaGridDependencySynchronize();
 #endif

     int expanded_token_id = -1;

@@ -612,7 +612,7 @@ __global__ void blockExpertPrefixSumKernel(int const* token_selected_experts, in
     }

 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-    asm volatile("griddepcontrol.launch_dependents;");
+    cudaTriggerProgrammaticLaunchCompletion();
 #endif
 }

@@ -672,7 +672,7 @@ __global__ void globalExpertPrefixSumLargeKernel(int const* blocked_expert_count
     int cnt = 0;

 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-    asm volatile("griddepcontrol.wait;");
+    cudaGridDependencySynchronize();
 #endif

     // Note: Because of limited registers, cannot store thread-level prefix sum or enable #pragma unroll

@@ -706,7 +706,7 @@ __global__ void globalExpertPrefixSumLargeKernel(int const* blocked_expert_count
     }

 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-    asm volatile("griddepcontrol.launch_dependents;");
+    cudaTriggerProgrammaticLaunchCompletion();
 #endif
 }

@@ -718,7 +718,7 @@ __global__ void globalExpertPrefixSumKernel(int const* blocked_expert_counts, in
     __shared__ typename BlockScan::TempStorage temp_storage;

 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-    asm volatile("griddepcontrol.wait;");
+    cudaGridDependencySynchronize();
 #endif

     int const cnt = threadIdx.x < num_experts_per_node * num_blocks_per_seq ? blocked_expert_counts[threadIdx.x] : 0;

@@ -739,7 +739,7 @@ __global__ void globalExpertPrefixSumKernel(int const* blocked_expert_counts, in
     }

 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-    asm volatile("griddepcontrol.launch_dependents;");
+    cudaTriggerProgrammaticLaunchCompletion();
 #endif
 }

@@ -810,7 +810,7 @@ __global__ void mergeExpertPrefixSumKernel(int const* blocked_expert_counts, int
     int const token_id = block_id * blockDim.x + threadIdx.x;

 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-    asm volatile("griddepcontrol.wait;");
+    cudaGridDependencySynchronize();
 #endif

     int const cnt = blocked_expert_counts[target_expert_id * num_blocks_per_seq + block_id];

@@ -825,7 +825,7 @@ __global__ void mergeExpertPrefixSumKernel(int const* blocked_expert_counts, int
     }

 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-    asm volatile("griddepcontrol.launch_dependents;");
+    cudaTriggerProgrammaticLaunchCompletion();
 #endif
 }

@@ -1259,7 +1259,7 @@ __global__ void computeStridesTmaWarpSpecializedKernel(int64_t const* expert_fir
     }

 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-    asm volatile("griddepcontrol.wait;");
+    cudaGridDependencySynchronize();
 #endif

     // Both gemms use the same token offset

@@ -1334,7 +1334,7 @@ __global__ void computeStridesTmaWarpSpecializedKernel(int64_t const* expert_fir
         bias2, gemm2_output, router_scales, permuted_row_to_unpermuted_row, expert);

 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-    asm volatile("griddepcontrol.launch_dependents;");
+    cudaTriggerProgrammaticLaunchCompletion();
 #endif
 }

@@ -1395,7 +1395,7 @@ __global__ void expandInputRowsKernel(InputActivationsType const* unpermuted_inp
         "Only NVFP4, MXFP8 and WINT4_AFP8 supports outputting a different format as part of the expansion");

 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-    asm volatile("griddepcontrol.wait;");
+    cudaGridDependencySynchronize();
 #endif

     constexpr int VecSize = is_nvfp4 ? TmaWarpSpecializedGroupedGemmInput::NVFP4BlockScaleVectorSize

@@ -1525,7 +1525,7 @@ __global__ void expandInputRowsKernel(InputActivationsType const* unpermuted_inp
     }

 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-    asm volatile("griddepcontrol.launch_dependents;");
+    cudaTriggerProgrammaticLaunchCompletion();
 #endif

     // Pad zeros in the extra SFs along the N dimension, we do this to ensure there are no nan values in the padded SF

@@ -1717,7 +1717,7 @@ __global__ void finalizeMoeRoutingKernel(GemmOutputType const* expanded_permuted
     auto* reduced_row_ptr_v = reinterpret_cast<OutputElem*>(reduced_row_ptr);

 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-    asm volatile("griddepcontrol.wait;");
+    cudaGridDependencySynchronize();
 #endif

 #pragma unroll

@@ -1757,7 +1757,7 @@ __global__ void finalizeMoeRoutingKernel(GemmOutputType const* expanded_permuted
         reduced_row_ptr_v[elem_index] = output_elem;
     }
 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-    asm volatile("griddepcontrol.launch_dependents;");
+    cudaTriggerProgrammaticLaunchCompletion();
 #endif
 }

@@ -1776,7 +1776,7 @@ __global__ void finalizeMoeRoutingNoFillingKernel(GemmOutputType const* expanded
     assert(unpadded_cols <= padded_cols);

 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-    asm volatile("griddepcontrol.wait;");
+    cudaGridDependencySynchronize();
 #endif

     int64_t const num_valid_tokens = expert_first_token_offset[num_experts_per_node];

@@ -1865,7 +1865,7 @@ __global__ void finalizeMoeRoutingNoFillingKernel(GemmOutputType const* expanded
         }
     }
 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-    asm volatile("griddepcontrol.launch_dependents;");
+    cudaTriggerProgrammaticLaunchCompletion();
 #endif
 }

@@ -2062,7 +2062,7 @@ __global__ void doActivationKernel(T* output, GemmOutputType const* gemm_result,
     int64_t const num_valid_tokens = expert_first_token_offset[num_experts_per_node];

 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-    asm volatile("griddepcontrol.wait;");
+    cudaGridDependencySynchronize();
 #endif
     for (int64_t token = blockIdx.x; token < num_valid_tokens; token += gridDim.x)
     {

@@ -2178,7 +2178,7 @@ __global__ void doActivationKernel(T* output, GemmOutputType const* gemm_result,
     }

 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-    asm volatile("griddepcontrol.launch_dependents;");
+    cudaTriggerProgrammaticLaunchCompletion();
 #endif

     // Pad zeros in the extra SFs along the N dimension, we do this to ensure there are no nan values in the padded SF
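
All 22 changes in this file are the same mechanical substitution, and the surrounding code shows the placement discipline the MoE pipeline relies on: each kernel waits immediately before its first read of data produced by the upstream grid, so prologue work that touches only its own arguments still overlaps, and it triggers as soon as compute finishes, even while its own stores are in flight (see the "We are done with compute" comment above). A hedged sketch of that pattern with a hypothetical pipeline stage, not taken from this file:

```cuda
// Hypothetical stage illustrating "wait as late as possible, trigger as
// early as possible"; the real kernels are the MoE ones in this diff.
__global__ void pipelineStage(int const* upstream, int* downstream, int n)
{
    // Index math depends only on this kernel's own launch parameters,
    // so it can run while the previous grid is still finishing.
    int i = blockIdx.x * blockDim.x + threadIdx.x;
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
    cudaGridDependencySynchronize(); // just before the first dependent load
#endif
    int v = (i < n) ? upstream[i] : 0;
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
    // Compute done; a dependent grid's own synchronize still guarantees
    // it sees the store below before reading it.
    cudaTriggerProgrammaticLaunchCompletion();
#endif
    if (i < n)
        downstream[i] = v + 1;
}
```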

cpp/tensorrt_llm/kernels/fusedLayernormKernels/low_latency_layernorm.cuh

Lines changed: 2 additions & 2 deletions

@@ -178,8 +178,8 @@ struct LowLatencyLayerNorm
 #if (defined(__CUDA_ARCH__) && (__CUDACC_VER_MAJOR__ >= 12))
         if constexpr (arch::is_major_v<9> || arch::is_major_v<10>)
         {
-            asm volatile("griddepcontrol.wait;\n");
-            asm volatile("griddepcontrol.launch_dependents;\n");
+            cudaGridDependencySynchronize();
+            cudaTriggerProgrammaticLaunchCompletion();
         }
 #endif
         load_to_register(&param.input[work_id * param.n], data, param.n);

cpp/tensorrt_llm/kernels/fusedLayernormKernels/ws_layernorm.cuh

Lines changed: 2 additions & 2 deletions

@@ -211,7 +211,7 @@ struct WarpSpecializedLayerNorm

         if constexpr (FIRST_RUN)
         {
-            asm volatile("griddepcontrol.wait;\n");
+            cudaGridDependencySynchronize();
         }

         for (int i = 0; i < Traits::M_BLOCK; i++)

@@ -817,7 +817,7 @@ struct WarpSpecializedLayerNorm
         {
             scheduler(lane_id, gridDim.x * gridDim.y * gridDim.z, param, shared);
             // PRE-EXIT after all tiles have been scheduled.
-            asm volatile("griddepcontrol.launch_dependents;\n");
+            cudaTriggerProgrammaticLaunchCompletion();
         }
         else if (warp_id == 1)
         {
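
Worth noting: the warp-specialized layer norm keeps its original PDL placement, and only the intrinsic spelling changes. The wait is issued only on what appears to be the first pass of a persistent loop (`FIRST_RUN`), and the pre-exit trigger comes from the scheduler warp once all tiles have been handed out, rather than from every warp at kernel end.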

cpp/tensorrt_llm/kernels/groupRmsNormKernels/groupRmsNormKernels.cu

Lines changed: 4 additions & 4 deletions

@@ -111,7 +111,7 @@ __global__ void GroupRMSNormBaseKernel(GroupRMSParams<n> params, int rounds)
     PackedType const* __restrict__ weight_ptr = nullptr;

 #if (__CUDACC_VER_MAJOR__ >= 12 && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-    asm volatile("griddepcontrol.wait;");
+    cudaGridDependencySynchronize();
 #endif

     // Find which input current warp operates on

@@ -263,7 +263,7 @@ __global__ void GroupRMSNormBaseKernel(GroupRMSParams<n> params, int rounds)
     }

 #if (__CUDACC_VER_MAJOR__ >= 12 && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-    asm volatile("griddepcontrol.launch_dependents;");
+    cudaTriggerProgrammaticLaunchCompletion();
 #endif
 }

@@ -305,7 +305,7 @@ __global__ void GroupRMSNormKernelLargeBatch(
     bool process_input_1 = warp_idx < warp_size_1;

 #if (__CUDACC_VER_MAJOR__ >= 12 && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-    asm volatile("griddepcontrol.wait;");
+    cudaGridDependencySynchronize();
 #endif

     // Get input pointers

@@ -565,7 +565,7 @@ __global__ void GroupRMSNormKernelLargeBatch(
     }

 #if (__CUDACC_VER_MAJOR__ >= 12 && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-    asm volatile("griddepcontrol.launch_dependents;");
+    cudaTriggerProgrammaticLaunchCompletion();
 #endif
 }

cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Bf16Bf16Gemm.cu

Lines changed: 1 addition & 1 deletion

@@ -60,7 +60,7 @@ __global__ void llama4_bf16_bf16_gemm_kernel(int num_tokens,
         b_vec[chunk] = reinterpret_cast<aligned_bf16x4 const*>(B)[row * GEMM_K / VEC_SIZE + base_idx];
     }

-    asm volatile("griddepcontrol.wait;" ::: "memory");
+    cudaGridDependencySynchronize();

     // Process 5 chunks of 4 elements each
 #pragma unroll
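
One subtlety in these Llama4 kernels: the inline asm here carried an explicit `"memory"` clobber, which the intrinsic call does not spell out. The assumption behind the swap (consistent with the other files, where the PTX had no clobber at all) is that `cudaGridDependencySynchronize()` itself provides the necessary compiler and memory-ordering guarantees; that is a property of the CUDA intrinsic rather than anything this diff adds.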

cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Fp8Bf16GemmAttnScalingPerBlockTemplate.cuh

Lines changed: 2 additions & 2 deletions

@@ -100,7 +100,7 @@ __launch_bounds__(BLOCK_SIZE) __global__ void llama4_fp8_bf16_gemm_attn_scaling_
 #endif

 #if ENABLE_ACQBULK
-    asm volatile("griddepcontrol.wait;" ::: "memory");
+    cudaGridDependencySynchronize();
 #endif

     // Processing 8 elements each

@@ -250,7 +250,7 @@ __launch_bounds__(BLOCK_SIZE) __global__ void llama4_fp8_bf16_gemm_attn_scaling_
     }

 #if ENABLE_PREEXIT
-    asm volatile("griddepcontrol.launch_dependents;");
+    cudaTriggerProgrammaticLaunchCompletion();
 #endif
 #endif
 }

cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Fp8Bf16GemmPerBlockTemplate.cuh

Lines changed: 2 additions & 2 deletions

@@ -89,7 +89,7 @@ __launch_bounds__(BLOCK_SIZE) __global__ void llama4_fp8_bf16_gemm_per_block_ker
 #endif

 #if ENABLE_ACQBULK
-    asm volatile("griddepcontrol.wait;" ::: "memory");
+    cudaGridDependencySynchronize();
 #endif

     // Processing 8 elements each

@@ -237,7 +237,7 @@ __launch_bounds__(BLOCK_SIZE) __global__ void llama4_fp8_bf16_gemm_per_block_ker
     }

 #if ENABLE_PREEXIT
-    asm volatile("griddepcontrol.launch_dependents;");
+    cudaTriggerProgrammaticLaunchCompletion();
 #endif
 #endif
 }
