Skip to content

Commit e0bd47c

Browse files
committed
replace more asm code
Signed-off-by: Zhenhuan Chen <chenzhh3671@gmail.com>
1 parent 0c00d40 commit e0bd47c

File tree

8 files changed

+11
-11
lines changed

8 files changed

+11
-11
lines changed

cpp/tensorrt_llm/kernels/fusedLayernormKernels/low_latency_layernorm.cuh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -178,8 +178,8 @@ struct LowLatencyLayerNorm
178178
#if (defined(__CUDA_ARCH__) && (__CUDACC_VER_MAJOR__ >= 12))
179179
if constexpr (arch::is_major_v<9> || arch::is_major_v<10>)
180180
{
181-
asm volatile("griddepcontrol.wait;\n");
182-
asm volatile("griddepcontrol.launch_dependents;\n");
181+
cudaGridDependencySynchronize();
182+
cudaTriggerProgrammaticLaunchCompletion();
183183
}
184184
#endif
185185
load_to_register(&param.input[work_id * param.n], data, param.n);

cpp/tensorrt_llm/kernels/fusedLayernormKernels/ws_layernorm.cuh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -211,7 +211,7 @@ struct WarpSpecializedLayerNorm
211211

212212
if constexpr (FIRST_RUN)
213213
{
214-
asm volatile("griddepcontrol.wait;\n");
214+
cudaGridDependencySynchronize();
215215
}
216216

217217
for (int i = 0; i < Traits::M_BLOCK; i++)
@@ -817,7 +817,7 @@ struct WarpSpecializedLayerNorm
817817
{
818818
scheduler(lane_id, gridDim.x * gridDim.y * gridDim.z, param, shared);
819819
// PRE-EXIT after all tiles have been scheduled.
820-
asm volatile("griddepcontrol.launch_dependents;\n");
820+
cudaTriggerProgrammaticLaunchCompletion();
821821
}
822822
else if (warp_id == 1)
823823
{

cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Bf16Bf16Gemm.cu

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ __global__ void llama4_bf16_bf16_gemm_kernel(int num_tokens,
6060
b_vec[chunk] = reinterpret_cast<aligned_bf16x4 const*>(B)[row * GEMM_K / VEC_SIZE + base_idx];
6161
}
6262

63-
asm volatile("griddepcontrol.wait;" ::: "memory");
63+
cudaGridDependencySynchronize();
6464

6565
// Process 5 chunks of 4 elements each
6666
#pragma unroll

cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Fp8Bf16GemmAttnScalingPerBlockTemplate.cuh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,7 @@ __launch_bounds__(BLOCK_SIZE) __global__ void llama4_fp8_bf16_gemm_attn_scaling_
100100
#endif
101101

102102
#if ENABLE_ACQBULK
103-
asm volatile("griddepcontrol.wait;" ::: "memory");
103+
cudaGridDependencySynchronize();
104104
#endif
105105

106106
// Processing 8 elements each

cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Fp8Bf16GemmPerBlockTemplate.cuh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,7 @@ __launch_bounds__(BLOCK_SIZE) __global__ void llama4_fp8_bf16_gemm_per_block_ker
8989
#endif
9090

9191
#if ENABLE_ACQBULK
92-
asm volatile("griddepcontrol.wait;" ::: "memory");
92+
cudaGridDependencySynchronize();
9393
#endif
9494

9595
// Processing 8 elements each

cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Fp8Bf16GemmPerWarpTemplate.cuh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,7 @@ __launch_bounds__(BLOCK_SIZE) __global__ void llama4_fp8_bf16_gemm_per_warp_kern
9191
#endif
9292

9393
#if ENABLE_ACQBULK
94-
asm volatile("griddepcontrol.wait;" ::: "memory");
94+
cudaGridDependencySynchronize();
9595
#endif
9696

9797
// Processing 8 elements each

cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Fp8Fp8GemmSwiGLUPerBlockTemplate.cuh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,7 @@ __launch_bounds__(BLOCK_SIZE) __global__ void llama4_fp8_fp8_gemm_swiglu_per_blo
9898
#endif
9999

100100
#if ENABLE_ACQBULK
101-
asm volatile("griddepcontrol.wait;" ::: "memory");
101+
cudaGridDependencySynchronize();
102102
#endif
103103

104104
// Processing 8 elements each

cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4MinLatencyMoEOp.cu

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,7 @@ __global__ void llama4_moe_fc13_swiglu_fp8_kernel(int num_tokens,
8282

8383
// Logits depends on the previous kernel, so we cannot prefetch anything.
8484
#if ENABLE_ACQBULK
85-
asm volatile("griddepcontrol.wait;" ::: "memory");
85+
cudaGridDependencySynchronize();
8686
#endif
8787

8888
// Perform top1 within the current thread, which processes 4 experts.
@@ -242,7 +242,7 @@ __global__ void llama4_moe_fc2_fp8_kernel(int num_tokens,
242242
scaling_factors_shared[tid] = scaling_factors[tid];
243243

244244
#if ENABLE_ACQBULK
245-
asm volatile("griddepcontrol.wait;" ::: "memory");
245+
cudaGridDependencySynchronize();
246246
#endif
247247

248248
// Select the corresponding expert weight.

0 commit comments

Comments (0)