Skip to content

Commit 8f87c59

Browse files
author
l30072083
committed
[kernel] Reduce recompilation triggered by Triton function parameter specialization
Signed-off-by: l30072083 <liuchengzhuo1@h-partners.com>
1 parent 9d1452c commit 8f87c59

File tree

5 files changed

+24
-42
lines changed

5 files changed

+24
-42
lines changed

vllm_ascend/ops/triton/fla/chunk_delta_h.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@
2626
"IS_VARLEN": lambda args: args["cu_seqlens"] is not None,
2727
}
2828
)
29-
@triton.jit(do_not_specialize=["T"])
29+
@triton.jit(do_not_specialize=["T","H", "Hg", "K", "V"])
3030
def chunk_gated_delta_rule_fwd_kernel_h_blockdim64(
3131
k,
3232
v,
@@ -40,10 +40,10 @@ def chunk_gated_delta_rule_fwd_kernel_h_blockdim64(
4040
chunk_offsets,
4141
h_update,
4242
T,
43-
H: tl.constexpr,
44-
Hg: tl.constexpr,
45-
K: tl.constexpr,
46-
V: tl.constexpr,
43+
H,
44+
Hg,
45+
K,
46+
V,
4747
BT: tl.constexpr,
4848
USE_G: tl.constexpr,
4949
USE_INITIAL_STATE: tl.constexpr,

vllm_ascend/ops/triton/fla/cumsum.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,6 @@ def chunk_local_cumsum_scalar_kernel(
2626
cu_seqlens,
2727
chunk_indices,
2828
T,
29-
B: tl.constexpr,
3029
H: tl.constexpr,
3130
BLOCK_T: tl.constexpr,
3231
REVERSE: tl.constexpr,
@@ -101,7 +100,6 @@ def chunk_local_cumsum_scalar(
101100
cu_seqlens=cu_seqlens,
102101
chunk_indices=block_indices,
103102
T=T,
104-
B=B,
105103
H=H,
106104
BLOCK_T=OPTIM_BLOCK_SIZE,
107105
CHUNK_SIZE=chunk_size,

vllm_ascend/ops/triton/fused_gdn_gating.py

Lines changed: 14 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,8 @@
1010
UNIFIED_BUFFER_SIZE = 1572864
1111

1212

13-
@triton.jit
13+
@triton.jit(do_not_specialize=["seq_len", "NUM_HEADS", "NUM_BATCHES", "beta",
14+
"threshold", "ROW_ITER"])
1415
def fused_gdn_gating_kernel(
1516
g,
1617
beta_output,
@@ -19,16 +20,17 @@ def fused_gdn_gating_kernel(
1920
b,
2021
dt_bias,
2122
seq_len,
22-
NUM_HEADS: tl.constexpr,
23-
NUM_BATCHES: tl.constexpr,
24-
beta: tl.constexpr,
25-
threshold: tl.constexpr,
23+
NUM_HEADS,
24+
NUM_BATCHES,
25+
beta,
26+
threshold,
2627
BLK_HEADS: tl.constexpr,
27-
COL_ITER: tl.constexpr,
2828
BLK_BATCHES: tl.constexpr,
29-
ROW_ITER: tl.constexpr,
29+
ROW_ITER,
3030
):
3131
i_b, i_s = tl.program_id(0), tl.program_id(1)
32+
COL_ITER = tl.cdiv(NUM_HEADS, BLK_HEADS)
33+
3234
for row_idx in range(0, ROW_ITER):
3335
batch_off = i_b * ROW_ITER * BLK_BATCHES + row_idx * BLK_BATCHES + tl.arange(0, BLK_BATCHES)
3436

@@ -69,23 +71,11 @@ def fused_gdn_gating_patch(
6971
num_cores = get_vectorcore_num()
7072

7173
BLK_HEADS = 8
72-
COL_ITER = triton.cdiv(num_heads, BLK_HEADS)
73-
74-
elem_size = a.element_size()
75-
max_ub_batches = int((UNIFIED_BUFFER_SIZE * 0.95) / (BLK_HEADS * elem_size))
76-
if batch <= num_cores:
77-
progs = batch
78-
BLK_BATCHES = 1
79-
ROW_ITER = 1
80-
else:
81-
progs = num_cores
82-
FACTOR = 8 * num_heads
83-
calc_blk_batches = (
84-
triton.next_power_of_2(triton.cdiv(int(UNIFIED_BUFFER_SIZE * 0.95), FACTOR * BLK_HEADS * elem_size)) // 2
85-
)
86-
BLK_BATCHES = max(1, min(calc_blk_batches, max_ub_batches, 64))
87-
row_per_core = triton.cdiv(batch, progs)
88-
ROW_ITER = triton.cdiv(row_per_core, BLK_BATCHES)
74+
75+
progs = num_cores
76+
row_per_core = triton.cdiv(batch, progs)
77+
BLK_BATCHES = 64
78+
ROW_ITER = triton.cdiv(row_per_core, BLK_BATCHES)
8979

9080
g = torch.empty(1, batch, num_heads, dtype=torch.float32, device=a.device)
9181
beta_output = torch.empty(1, batch, num_heads, dtype=b.dtype, device=b.device)
@@ -104,7 +94,6 @@ def fused_gdn_gating_patch(
10494
beta,
10595
threshold,
10696
BLK_HEADS=BLK_HEADS,
107-
COL_ITER=COL_ITER,
10897
BLK_BATCHES=BLK_BATCHES,
10998
ROW_ITER=ROW_ITER,
11099
)

vllm_ascend/ops/triton/reject_sample.py

Lines changed: 4 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -21,13 +21,8 @@
2121

2222

2323
def cal_grid_and_block_size(batch_size: int):
24-
vectorcore_num = get_vectorcore_num()
25-
if batch_size <= vectorcore_num:
26-
grid = batch_size
27-
block_size = 1
28-
else:
29-
grid = vectorcore_num
30-
block_size = triton.next_power_of_2(triton.cdiv(batch_size, grid))
24+
grid = batch_size
25+
block_size = 64
3126
return grid, block_size
3227

3328

@@ -82,7 +77,7 @@ def bonus_renew(
8277
tl.store(output_token_ids_ptr + position * (max_spec_len + 1) + num_tokens1, bonus_token_id)
8378

8479

85-
@triton.jit(do_not_specialize=["max_spec_len"])
80+
@triton.jit(do_not_specialize=["vec_len", "max_spec_len"])
8681
def rejection_greedy_sample_triton(
8782
output_token_ids_ptr, # [batch_size, max_spec_len + 1]
8883
cu_num_draft_tokens_ptr, # [batch_size]
@@ -196,7 +191,7 @@ def rejection_random_sample_kernel(
196191
)
197192

198193

199-
@triton.jit(do_not_specialize=["replace_from", "replace_to"])
194+
@triton.jit(do_not_specialize=["replace_from", "replace_to", "vec_len"])
200195
def expand_kernel(
201196
output_ptr, # [num_tokens]
202197
input_ptr, # [batch_size]

vllm_ascend/ops/triton/spec_decode/utils.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
from vllm.triton_utils import tl, triton
1919

2020

21-
@triton.jit
21+
@triton.jit(do_not_specialize=["num_reqs"])
2222
def prepare_inputs_padded_kernel(
2323
cu_num_draft_tokens_ptr, # [num_reqs]
2424
valid_sampled_tokens_count_ptr, # [num_reqs]

0 commit comments

Comments (0)