@@ -164,7 +164,7 @@ __global__ void buildMinLatencyActiveExpertMapsKernel(int* num_active_experts_pe
     int const cluster_size, int const num_experts_smem)
 {
 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-    asm volatile("griddepcontrol.wait;");
+    cudaGridDependencySynchronize();
 #endif
     // Use one block to process the min latency case
     int tid = threadIdx.x;
@@ -274,7 +274,7 @@ __global__ void buildMinLatencyActiveExpertMapsKernel(int* num_active_experts_pe
         }
     }
 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-    asm volatile("griddepcontrol.launch_dependents;");
+    cudaTriggerProgrammaticLaunchCompletion();
 #endif
 }
 
@@ -333,7 +333,7 @@ __global__ void fusedBuildExpertMapsSortFirstTokenKernel(int const* const token_
 
     // Wait PDL before reading token_selected_experts
 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-    asm volatile("griddepcontrol.wait;");
+    cudaGridDependencySynchronize();
 #endif
 
     // build expert map
@@ -374,7 +374,7 @@ __global__ void fusedBuildExpertMapsSortFirstTokenKernel(int const* const token_
 
     // We are done with compute, launch the dependent kernels while the stores are in flight
 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-    asm volatile("griddepcontrol.launch_dependents;");
+    cudaTriggerProgrammaticLaunchCompletion();
 #endif
 
     // write to shared memory and global memory
@@ -579,7 +579,7 @@ __global__ void blockExpertPrefixSumKernel(int const* token_selected_experts, in
     int const token_id = block_id * kNumTokensPerBlock + threadIdx.x;
 
 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-    asm volatile("griddepcontrol.wait;");
+    cudaGridDependencySynchronize();
 #endif
 
     int expanded_token_id = -1;
@@ -612,7 +612,7 @@ __global__ void blockExpertPrefixSumKernel(int const* token_selected_experts, in
     }
 
 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-    asm volatile("griddepcontrol.launch_dependents;");
+    cudaTriggerProgrammaticLaunchCompletion();
 #endif
 }
 
@@ -672,7 +672,7 @@ __global__ void globalExpertPrefixSumLargeKernel(int const* blocked_expert_count
     int cnt = 0;
 
 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-    asm volatile("griddepcontrol.wait;");
+    cudaGridDependencySynchronize();
 #endif
 
     // Note: Because of limited registers, cannot store thread-level prefix sum or enable #pragma unroll
@@ -706,7 +706,7 @@ __global__ void globalExpertPrefixSumLargeKernel(int const* blocked_expert_count
     }
 
 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-    asm volatile("griddepcontrol.launch_dependents;");
+    cudaTriggerProgrammaticLaunchCompletion();
 #endif
 }
 
@@ -718,7 +718,7 @@ __global__ void globalExpertPrefixSumKernel(int const* blocked_expert_counts, in
     __shared__ typename BlockScan::TempStorage temp_storage;
 
 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-    asm volatile("griddepcontrol.wait;");
+    cudaGridDependencySynchronize();
 #endif
 
     int const cnt = threadIdx.x < num_experts_per_node * num_blocks_per_seq ? blocked_expert_counts[threadIdx.x] : 0;
@@ -739,7 +739,7 @@ __global__ void globalExpertPrefixSumKernel(int const* blocked_expert_counts, in
     }
 
 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-    asm volatile("griddepcontrol.launch_dependents;");
+    cudaTriggerProgrammaticLaunchCompletion();
 #endif
 }
 
@@ -810,7 +810,7 @@ __global__ void mergeExpertPrefixSumKernel(int const* blocked_expert_counts, int
     int const token_id = block_id * blockDim.x + threadIdx.x;
 
 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-    asm volatile("griddepcontrol.wait;");
+    cudaGridDependencySynchronize();
 #endif
 
     int const cnt = blocked_expert_counts[target_expert_id * num_blocks_per_seq + block_id];
@@ -825,7 +825,7 @@ __global__ void mergeExpertPrefixSumKernel(int const* blocked_expert_counts, int
     }
 
 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-    asm volatile("griddepcontrol.launch_dependents;");
+    cudaTriggerProgrammaticLaunchCompletion();
 #endif
 }
 
@@ -1259,7 +1259,7 @@ __global__ void computeStridesTmaWarpSpecializedKernel(int64_t const* expert_fir
     }
 
 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-    asm volatile("griddepcontrol.wait;");
+    cudaGridDependencySynchronize();
 #endif
 
     // Both gemms use the same token offset
@@ -1334,7 +1334,7 @@ __global__ void computeStridesTmaWarpSpecializedKernel(int64_t const* expert_fir
         bias2, gemm2_output, router_scales, permuted_row_to_unpermuted_row, expert);
 
 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-    asm volatile("griddepcontrol.launch_dependents;");
+    cudaTriggerProgrammaticLaunchCompletion();
 #endif
 }
 
@@ -1395,7 +1395,7 @@ __global__ void expandInputRowsKernel(InputActivationsType const* unpermuted_inp
         "Only NVFP4, MXFP8 and WINT4_AFP8 supports outputting a different format as part of the expansion");
 
 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-    asm volatile("griddepcontrol.wait;");
+    cudaGridDependencySynchronize();
 #endif
 
     constexpr int VecSize = is_nvfp4 ? TmaWarpSpecializedGroupedGemmInput::NVFP4BlockScaleVectorSize
@@ -1525,7 +1525,7 @@ __global__ void expandInputRowsKernel(InputActivationsType const* unpermuted_inp
     }
 
 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-    asm volatile("griddepcontrol.launch_dependents;");
+    cudaTriggerProgrammaticLaunchCompletion();
 #endif
 
     // Pad zeros in the extra SFs along the N dimension, we do this to ensure there are no nan values in the padded SF
@@ -1717,7 +1717,7 @@ __global__ void finalizeMoeRoutingKernel(GemmOutputType const* expanded_permuted
     auto* reduced_row_ptr_v = reinterpret_cast<OutputElem*>(reduced_row_ptr);
 
 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-    asm volatile("griddepcontrol.wait;");
+    cudaGridDependencySynchronize();
 #endif
 
 #pragma unroll
@@ -1757,7 +1757,7 @@ __global__ void finalizeMoeRoutingKernel(GemmOutputType const* expanded_permuted
         reduced_row_ptr_v[elem_index] = output_elem;
     }
 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-    asm volatile("griddepcontrol.launch_dependents;");
+    cudaTriggerProgrammaticLaunchCompletion();
 #endif
 }
 
@@ -1776,7 +1776,7 @@ __global__ void finalizeMoeRoutingNoFillingKernel(GemmOutputType const* expanded
     assert(unpadded_cols <= padded_cols);
 
 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-    asm volatile("griddepcontrol.wait;");
+    cudaGridDependencySynchronize();
 #endif
 
     int64_t const num_valid_tokens = expert_first_token_offset[num_experts_per_node];
@@ -1865,7 +1865,7 @@ __global__ void finalizeMoeRoutingNoFillingKernel(GemmOutputType const* expanded
         }
     }
 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-    asm volatile("griddepcontrol.launch_dependents;");
+    cudaTriggerProgrammaticLaunchCompletion();
 #endif
 }
 
@@ -2062,7 +2062,7 @@ __global__ void doActivationKernel(T* output, GemmOutputType const* gemm_result,
     int64_t const num_valid_tokens = expert_first_token_offset[num_experts_per_node];
 
 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-    asm volatile("griddepcontrol.wait;");
+    cudaGridDependencySynchronize();
 #endif
     for (int64_t token = blockIdx.x; token < num_valid_tokens; token += gridDim.x)
     {
@@ -2178,7 +2178,7 @@ __global__ void doActivationKernel(T* output, GemmOutputType const* gemm_result,
     }
 
 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-    asm volatile("griddepcontrol.launch_dependents;");
+    cudaTriggerProgrammaticLaunchCompletion();
 #endif
 
     // Pad zeros in the extra SFs along the N dimension, we do this to ensure there are no nan values in the padded SF
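
Note on the change above: cudaGridDependencySynchronize() and cudaTriggerProgrammaticLaunchCompletion() are the CUDA device-runtime intrinsics for Programmatic Dependent Launch (PDL), corresponding to the griddepcontrol.wait and griddepcontrol.launch_dependents PTX instructions they replace. They only have an effect when the downstream kernel is launched with the programmatic stream serialization launch attribute. The sketch below shows that end-to-end pattern under those assumptions; the producer/consumer kernels, sizes, and launch helper are hypothetical illustrations, not code from this file.

// Minimal PDL sketch (hypothetical kernels; assumes sm_90+ and CUDA 11.8+).
#include <cuda_runtime.h>

__global__ void producerKernel(float* buf, int n)
{
    int const i = blockIdx.x * blockDim.x + threadIdx.x;
    float const v = static_cast<float>(i); // "compute" phase
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
    // Done with compute: allow dependent grids to start launching while the
    // stores below are still in flight (griddepcontrol.launch_dependents).
    cudaTriggerProgrammaticLaunchCompletion();
#endif
    if (i < n)
    {
        buf[i] = v;
    }
}

__global__ void consumerKernel(float* buf, float* out, int n)
{
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
    // Wait until the preceding grid's memory operations are visible before
    // reading buf (griddepcontrol.wait).
    cudaGridDependencySynchronize();
#endif
    int const i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
    {
        out[i] = buf[i] * 2.0f;
    }
}

void launchPair(float* buf, float* out, int n, cudaStream_t stream)
{
    int const block = 256;
    int const grid = (n + block - 1) / block;
    producerKernel<<<grid, block, 0, stream>>>(buf, n);

    // Opt the consumer into PDL so it may overlap with the producer's tail.
    cudaLaunchAttribute attr{};
    attr.id = cudaLaunchAttributeProgrammaticStreamSerialization;
    attr.val.programmaticStreamSerializationAllowed = 1;

    cudaLaunchConfig_t cfg{};
    cfg.gridDim = dim3(grid);
    cfg.blockDim = dim3(block);
    cfg.dynamicSmemBytes = 0;
    cfg.stream = stream;
    cfg.attrs = &attr;
    cfg.numAttrs = 1;
    cudaLaunchKernelEx(&cfg, consumerKernel, buf, out, n);
}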