@@ -161,7 +161,7 @@ __global__ void buildMinLatencyActiveExpertMapsKernel(int* num_active_experts_pe
161161 int const cluster_size, int const num_experts_smem)
162162{
163163#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
164- asm volatile ( " griddepcontrol.wait; " );
164+ cudaGridDependencySynchronize ( );
165165#endif
166166 // Use one block to process the min latency case
167167 int tid = threadIdx .x ;
@@ -271,7 +271,7 @@ __global__ void buildMinLatencyActiveExpertMapsKernel(int* num_active_experts_pe
271271 }
272272 }
273273#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
274- asm volatile ( " griddepcontrol.launch_dependents; " );
274+ cudaTriggerProgrammaticLaunchCompletion ( );
275275#endif
276276}
277277
@@ -330,7 +330,7 @@ __global__ void fusedBuildExpertMapsSortFirstTokenKernel(int const* const token_
330330
331331 // Wait for the programmatic dependent launch (PDL) dependency before reading token_selected_experts
332332#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
333- asm volatile ( " griddepcontrol.wait; " );
333+ cudaGridDependencySynchronize ( );
334334#endif
335335
336336// build expert map
@@ -371,7 +371,7 @@ __global__ void fusedBuildExpertMapsSortFirstTokenKernel(int const* const token_
371371
372372// We are done with compute, launch the dependent kernels while the stores are in flight
373373#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
374- asm volatile ( " griddepcontrol.launch_dependents; " );
374+ cudaTriggerProgrammaticLaunchCompletion ( );
375375#endif
376376
377377 // write to shared memory and global memory
@@ -576,7 +576,7 @@ __global__ void blockExpertPrefixSumKernel(int const* token_selected_experts, in
576576 int const token_id = block_id * kNumTokensPerBlock + threadIdx .x ;
577577
578578#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
579- asm volatile ( " griddepcontrol.wait; " );
579+ cudaGridDependencySynchronize ( );
580580#endif
581581
582582 int expanded_token_id = -1 ;
@@ -609,7 +609,7 @@ __global__ void blockExpertPrefixSumKernel(int const* token_selected_experts, in
609609 }
610610
611611#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
612- asm volatile ( " griddepcontrol.launch_dependents; " );
612+ cudaTriggerProgrammaticLaunchCompletion ( );
613613#endif
614614}
615615
@@ -669,7 +669,7 @@ __global__ void globalExpertPrefixSumLargeKernel(int const* blocked_expert_count
669669 int cnt = 0 ;
670670
671671#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
672- asm volatile ( " griddepcontrol.wait; " );
672+ cudaGridDependencySynchronize ( );
673673#endif
674674
675675 // Note: Because of limited registers, cannot store thread-level prefix sum or enable #pragma unroll
@@ -703,7 +703,7 @@ __global__ void globalExpertPrefixSumLargeKernel(int const* blocked_expert_count
703703 }
704704
705705#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
706- asm volatile ( " griddepcontrol.launch_dependents; " );
706+ cudaTriggerProgrammaticLaunchCompletion ( );
707707#endif
708708}
709709
@@ -715,7 +715,7 @@ __global__ void globalExpertPrefixSumKernel(int const* blocked_expert_counts, in
715715 __shared__ typename BlockScan::TempStorage temp_storage;
716716
717717#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
718- asm volatile ( " griddepcontrol.wait; " );
718+ cudaGridDependencySynchronize ( );
719719#endif
720720
721721 int const cnt = threadIdx .x < num_experts_per_node * num_blocks_per_seq ? blocked_expert_counts[threadIdx .x ] : 0 ;
@@ -736,7 +736,7 @@ __global__ void globalExpertPrefixSumKernel(int const* blocked_expert_counts, in
736736 }
737737
738738#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
739- asm volatile ( " griddepcontrol.launch_dependents; " );
739+ cudaTriggerProgrammaticLaunchCompletion ( );
740740#endif
741741}
742742
@@ -807,7 +807,7 @@ __global__ void mergeExpertPrefixSumKernel(int const* blocked_expert_counts, int
807807 int const token_id = block_id * blockDim .x + threadIdx .x ;
808808
809809#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
810- asm volatile ( " griddepcontrol.wait; " );
810+ cudaGridDependencySynchronize ( );
811811#endif
812812
813813 int const cnt = blocked_expert_counts[target_expert_id * num_blocks_per_seq + block_id];
@@ -822,7 +822,7 @@ __global__ void mergeExpertPrefixSumKernel(int const* blocked_expert_counts, int
822822 }
823823
824824#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
825- asm volatile ( " griddepcontrol.launch_dependents; " );
825+ cudaTriggerProgrammaticLaunchCompletion ( );
826826#endif
827827}
828828
@@ -1256,7 +1256,7 @@ __global__ void computeStridesTmaWarpSpecializedKernel(int64_t const* expert_fir
12561256 }
12571257
12581258#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
1259- asm volatile ( " griddepcontrol.wait; " );
1259+ cudaGridDependencySynchronize ( );
12601260#endif
12611261
12621262 // Both gemms use the same token offset
@@ -1331,7 +1331,7 @@ __global__ void computeStridesTmaWarpSpecializedKernel(int64_t const* expert_fir
13311331 bias2, gemm2_output, router_scales, permuted_row_to_unpermuted_row, expert);
13321332
13331333#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
1334- asm volatile ( " griddepcontrol.launch_dependents; " );
1334+ cudaTriggerProgrammaticLaunchCompletion ( );
13351335#endif
13361336}
13371337
@@ -1392,7 +1392,7 @@ __global__ void expandInputRowsKernel(InputActivationsType const* unpermuted_inp
13921392 " Only NVFP4, MXFP8 and WINT4_AFP8 supports outputting a different format as part of the expansion" );
13931393
13941394#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
1395- asm volatile ( " griddepcontrol.wait; " );
1395+ cudaGridDependencySynchronize ( );
13961396#endif
13971397
13981398 constexpr int VecSize = is_nvfp4 ? TmaWarpSpecializedGroupedGemmInput::NVFP4BlockScaleVectorSize
@@ -1522,7 +1522,7 @@ __global__ void expandInputRowsKernel(InputActivationsType const* unpermuted_inp
15221522 }
15231523
15241524#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
1525- asm volatile ( " griddepcontrol.launch_dependents; " );
1525+ cudaTriggerProgrammaticLaunchCompletion ( );
15261526#endif
15271527
15281528 // Pad zeros in the extra SFs along the N dimension; we do this to ensure there are no NaN values in the padded SF
@@ -1714,7 +1714,7 @@ __global__ void finalizeMoeRoutingKernel(GemmOutputType const* expanded_permuted
17141714 auto * reduced_row_ptr_v = reinterpret_cast <OutputElem*>(reduced_row_ptr);
17151715
17161716#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
1717- asm volatile ( " griddepcontrol.wait; " );
1717+ cudaGridDependencySynchronize ( );
17181718#endif
17191719
17201720#pragma unroll
@@ -1754,7 +1754,7 @@ __global__ void finalizeMoeRoutingKernel(GemmOutputType const* expanded_permuted
17541754 reduced_row_ptr_v[elem_index] = output_elem;
17551755 }
17561756#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
1757- asm volatile ( " griddepcontrol.launch_dependents; " );
1757+ cudaTriggerProgrammaticLaunchCompletion ( );
17581758#endif
17591759}
17601760
@@ -1773,7 +1773,7 @@ __global__ void finalizeMoeRoutingNoFillingKernel(GemmOutputType const* expanded
17731773 assert (unpadded_cols <= padded_cols);
17741774
17751775#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
1776- asm volatile ( " griddepcontrol.wait; " );
1776+ cudaGridDependencySynchronize ( );
17771777#endif
17781778
17791779 int64_t const num_valid_tokens = expert_first_token_offset[num_experts_per_node];
@@ -1862,7 +1862,7 @@ __global__ void finalizeMoeRoutingNoFillingKernel(GemmOutputType const* expanded
18621862 }
18631863 }
18641864#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
1865- asm volatile ( " griddepcontrol.launch_dependents; " );
1865+ cudaTriggerProgrammaticLaunchCompletion ( );
18661866#endif
18671867}
18681868
@@ -2059,7 +2059,7 @@ __global__ void doActivationKernel(T* output, GemmOutputType const* gemm_result,
20592059 int64_t const num_valid_tokens = expert_first_token_offset[num_experts_per_node];
20602060
20612061#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
2062- asm volatile ( " griddepcontrol.wait; " );
2062+ cudaGridDependencySynchronize ( );
20632063#endif
20642064 for (int64_t token = blockIdx .x ; token < num_valid_tokens; token += gridDim .x )
20652065 {
@@ -2175,7 +2175,7 @@ __global__ void doActivationKernel(T* output, GemmOutputType const* gemm_result,
21752175 }
21762176
21772177#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
2178- asm volatile ( " griddepcontrol.launch_dependents; " );
2178+ cudaTriggerProgrammaticLaunchCompletion ( );
21792179#endif
21802180
21812181 // Pad zeros in the extra SFs along the N dimension; we do this to ensure there are no NaN values in the padded SF
0 commit comments