Commit a191c58

Determine the chunk size at the kernel entry (#619)
* Determine the chunk size at the kernel entry
* Fix split_size
1 parent 0d3e202 commit a191c58
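Across the files below the change is uniform: every inner wrapper previously re-derived its block size as `BT = min(chunk_size, max(16, triton.next_power_of_2(T)))`; now the wrappers take `chunk_size` at face value, so the effective chunk size is decided once, at the kernel entry point. A minimal sketch of the clamp being consolidated, assuming a hypothetical `resolve_chunk_size` helper (the repo inlines the expression rather than naming it):

```python
import triton


def resolve_chunk_size(chunk_size: int, T: int) -> int:
    # Hypothetical helper illustrating the clamp that used to live in
    # every wrapper: at most the requested chunk size, at least 16,
    # and bounded by the sequence length rounded up to a power of two.
    return min(chunk_size, max(16, triton.next_power_of_2(T)))


# With the default chunk_size=64:
print(resolve_chunk_size(64, 1000))  # 64  (next_power_of_2(1000) = 1024)
print(resolve_chunk_size(64, 10))    # 16  (short sequences clamp BT down)
```

Resolving the value once means every helper in a forward/backward pair sees the same chunk size, and it also lets `fla/ops/generalized_delta_rule/dplr/chunk.py` drop its `triton` import entirely, as that file's first hunk shows.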

19 files changed: +72 −83 lines changed

fla/ops/common/chunk_h.py

Lines changed: 4 additions & 4 deletions
```diff
@@ -283,8 +283,8 @@ def chunk_fwd_h(
     states_in_fp32: bool = False
 ) -> Tuple[torch.Tensor, torch.Tensor]:
     B, T, H, K, V = *k.shape, v.shape[-1]
-    BT = min(chunk_size, max(16, triton.next_power_of_2(T)))
-    BS = BT if split_size is None else min(split_size, max(16, triton.next_power_of_2(T)))
+    BT = chunk_size
+    BS = BT if split_size is None else split_size
     assert BS % BT == 0, f"The `split_size` (got {BS}) must be a multiple of `chunk_size` {BT}"
     # N: the actual number of sequences in the batch with either equal or variable lengths
     if cu_seqlens is None:
@@ -341,8 +341,8 @@ def chunk_bwd_dh(
 ) -> Tuple[torch.Tensor, torch.Tensor]:
     B, T, H, K, V = *k.shape, v.shape[-1]
     HQ = q.shape[2]
-    BT = min(chunk_size, max(16, triton.next_power_of_2(T)))
-    BS = BT if split_size is None else min(split_size, max(16, triton.next_power_of_2(T)))
+    BT = chunk_size
+    BS = BT if split_size is None else split_size
     assert BS % BT == 0, f"The `split_size` (got {BS}) must be a multiple of `chunk_size` {BT}"
     # N: the actual number of sequences in the batch with either equal or variable lengths
     # NG: number of groups in GQA
```
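Both hunks keep the assertion guarding the relationship between the two sizes: with `BT` and `BS` now taken verbatim, a split must cover a whole number of chunks. A standalone illustration of what the check accepts:

```python
# A split must cover a whole number of chunks.
chunk_size, split_size = 64, 256
assert split_size % chunk_size == 0  # passes: one split spans 4 chunks

# split_size = 96 would trip the assert with the message shown above:
# "The `split_size` (got 96) must be a multiple of `chunk_size` 64"
```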

fla/ops/common/chunk_h_parallel.py

Lines changed: 2 additions & 2 deletions
```diff
@@ -418,7 +418,7 @@ def chunk_fwd_h(
     chunk_size: int = 64
 ) -> Tuple[torch.Tensor, torch.Tensor]:
     B, T, H, K, V = *k.shape, v.shape[-1]
-    BT = min(chunk_size, max(16, triton.next_power_of_2(T)))
+    BT = chunk_size

     chunk_indices = prepare_chunk_indices(cu_seqlens, BT) if cu_seqlens is not None else None
     # N: the actual number of sequences in the batch with either equal or variable lengths
@@ -491,7 +491,7 @@ def chunk_bwd_dh(
 ) -> Tuple[torch.Tensor, torch.Tensor]:
     B, T, H, K, V = *k.shape, v.shape[-1]
     HQ = q.shape[2]
-    BT = min(chunk_size, max(16, triton.next_power_of_2(T)))
+    BT = chunk_size

     chunk_indices = prepare_chunk_indices(cu_seqlens, BT) if cu_seqlens is not None else None
     # N: the actual number of sequences in the batch with either equal or variable lengths
```

fla/ops/common/chunk_o.py

Lines changed: 4 additions & 4 deletions
```diff
@@ -494,7 +494,7 @@ def chunk_fwd_o(
     chunk_size: int = 64
 ) -> torch.Tensor:
     B, T, H, K, V = *q.shape, v.shape[-1]
-    BT = min(chunk_size, max(16, triton.next_power_of_2(T)))
+    BT = chunk_size
     chunk_indices = prepare_chunk_indices(cu_seqlens, BT) if cu_seqlens is not None else None
     NT = triton.cdiv(T, BT) if cu_seqlens is None else len(chunk_indices)
     if scale is None:
@@ -534,7 +534,7 @@ def chunk_bwd_dv(
     chunk_size: int = 64
 ) -> torch.Tensor:
     B, T, H, K, V = *k.shape, do.shape[-1]
-    BT = min(chunk_size, max(16, triton.next_power_of_2(T)))
+    BT = chunk_size
     chunk_indices = prepare_chunk_indices(cu_seqlens, BT) if cu_seqlens is not None else None
     # H100 can have larger block size
     if check_shared_mem('hopper', k.device.index):
@@ -585,7 +585,7 @@ def chunk_bwd_dv_local(
     chunk_size: int = 64
 ) -> torch.Tensor:
     B, T, H, K, V = *k.shape, do.shape[-1]
-    BT = min(chunk_size, max(16, triton.next_power_of_2(T)))
+    BT = chunk_size
     chunk_indices = prepare_chunk_indices(cu_seqlens, BT) if cu_seqlens is not None else None
     # H100 can have larger block size
     if check_shared_mem('hopper', k.device.index):
@@ -638,7 +638,7 @@ def chunk_bwd_dqkwg(
 ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:

     B, T, H, K, V = *k.shape, v.shape[-1]
-    BT = min(chunk_size, max(16, triton.next_power_of_2(T)))
+    BT = chunk_size
     chunk_indices = prepare_chunk_indices(cu_seqlens, BT) if cu_seqlens is not None else None
     NT = triton.cdiv(T, BT) if cu_seqlens is None else len(chunk_indices)

```
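The `check_shared_mem('hopper', k.device.index)` branches above pick larger tiles when the device has Hopper-class shared memory; `check_shared_mem` is the library's own helper and is not reproduced here. A rough sketch of that kind of capability gate, assuming a plain compute-capability check:

```python
import torch


def has_hopper_smem(device_index: int = 0) -> bool:
    # Illustrative stand-in, not fla's check_shared_mem: Hopper is
    # compute capability 9.x, with much more shared memory per SM.
    major, _ = torch.cuda.get_device_capability(device_index)
    return major >= 9


# e.g. allow a larger value-dimension tile on H100-class GPUs:
BV = 64 if torch.cuda.is_available() and has_hopper_smem() else 32
```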

fla/ops/gated_delta_product/chunk_deltaproduct_o.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -130,7 +130,7 @@ def chunk_gated_delta_product_fwd_o(
 ) -> torch.Tensor:
     assert q.shape[1] * num_householder == k.shape[1], "q.shape[1] * num_householder must be equal to k.shape[1]"
     B, T, H, K, V = *q.shape, v.shape[-1]
-    BT = min(chunk_size, max(16, triton.next_power_of_2(T)))
+    BT = chunk_size
     chunk_indices = prepare_chunk_indices(cu_seqlens, BT) if cu_seqlens is not None else None
     NT = triton.cdiv(T, BT) if cu_seqlens is None else len(chunk_indices)
     o = v.new_empty(B, T, H, V).fill_(-float('inf'))
```

fla/ops/generalized_delta_rule/dplr/chunk.py

Lines changed: 16 additions & 19 deletions
```diff
@@ -5,7 +5,6 @@
 from typing import Optional

 import torch
-import triton

 from fla.ops.generalized_delta_rule.dplr.chunk_A_bwd import chunk_dplr_bwd_dqk_intra
 from fla.ops.generalized_delta_rule.dplr.chunk_A_fwd import chunk_dplr_fwd_intra
@@ -32,9 +31,7 @@ def chunk_dplr_fwd(
     cu_seqlens: Optional[torch.LongTensor] = None,
     chunk_size: int = 64
 ):
-    T = q.shape[1]
-    BT = min(chunk_size, max(triton.next_power_of_2(T), 16))
-    gi, ge = chunk_rwkv6_fwd_cumsum(gk, BT, cu_seqlens=cu_seqlens)
+    gi, ge = chunk_rwkv6_fwd_cumsum(gk, chunk_size, cu_seqlens=cu_seqlens)

     A_ab, A_qk, A_ak, A_qb, qg, kg, ag, bg = chunk_dplr_fwd_intra(
         q=q,
@@ -45,7 +42,7 @@ def chunk_dplr_fwd(
         ge=ge,
         scale=scale,
         cu_seqlens=cu_seqlens,
-        chunk_size=BT,
+        chunk_size=chunk_size,
     )
     del ge

@@ -57,7 +54,7 @@ def chunk_dplr_fwd(
         A_ak=A_ak,
         v=v,
         cu_seqlens=cu_seqlens,
-        chunk_size=BT
+        chunk_size=chunk_size
     )
     del A_ab, A_ak
     h, v_new, final_state = chunk_dplr_fwd_h(
@@ -70,7 +67,7 @@ def chunk_dplr_fwd(
         initial_state=initial_state,
         output_final_state=output_final_state,
         cu_seqlens=cu_seqlens,
-        chunk_size=BT
+        chunk_size=chunk_size
     )
     del u, kg, bg, gi

@@ -82,7 +79,7 @@ def chunk_dplr_fwd(
         A_qb=A_qb,
         h=h,
         cu_seqlens=cu_seqlens,
-        chunk_size=BT
+        chunk_size=chunk_size
     )
     del v_new, h, A_qk, A_qb

@@ -136,12 +133,12 @@ def backward(
         dht: torch.Tensor
     ):
         q, k, v, a, b, gk, initial_state = ctx.saved_tensors
-        BT = ctx.chunk_size
+        chunk_size = ctx.chunk_size
         cu_seqlens = ctx.cu_seqlens
         scale = ctx.scale

         # ******* start recomputing everything, otherwise i believe the gpu memory will be exhausted *******
-        gi, ge = chunk_rwkv6_fwd_cumsum(gk, BT, cu_seqlens=cu_seqlens)
+        gi, ge = chunk_rwkv6_fwd_cumsum(gk, chunk_size, cu_seqlens=cu_seqlens)

         A_ab, A_qk, A_ak, A_qb, qg, kg, ag, bg = chunk_dplr_fwd_intra(
             q=q,
@@ -152,15 +149,15 @@ def backward(
             ge=ge,
             scale=scale,
             cu_seqlens=cu_seqlens,
-            chunk_size=BT,
+            chunk_size=chunk_size,
         )
         w, u, A_ab_inv = prepare_wy_repr_fwd(
             ag=ag,
             A_ab=A_ab,
             A_ak=A_ak,
             v=v,
             cu_seqlens=cu_seqlens,
-            chunk_size=BT
+            chunk_size=chunk_size
         )
         del A_ab
         h, v_new, _ = chunk_dplr_fwd_h(
@@ -172,7 +169,7 @@ def backward(
             gk=gi,
             initial_state=initial_state,
             cu_seqlens=cu_seqlens,
-            chunk_size=BT
+            chunk_size=chunk_size
         )
         del u
         # ******* end of recomputation *******
@@ -186,7 +183,7 @@ def backward(
             A_qb=A_qb,
             scale=scale,
             cu_seqlens=cu_seqlens,
-            chunk_size=BT
+            chunk_size=chunk_size
         )

         dh, dh0, dv_new = chunk_dplr_bwd_dhu(
@@ -199,7 +196,7 @@ def backward(
             do=do,
             dv=dv_new_intra,
             cu_seqlens=cu_seqlens,
-            chunk_size=BT
+            chunk_size=chunk_size
         )

         dv = chunk_dplr_bwd_dv(
@@ -208,7 +205,7 @@ def backward(
             do=do,
             dh=dh,
             cu_seqlens=cu_seqlens,
-            chunk_size=BT
+            chunk_size=chunk_size
         )
         del A_qk

@@ -224,7 +221,7 @@ def backward(
             w=w,
             gk=gi,
             cu_seqlens=cu_seqlens,
-            chunk_size=BT,
+            chunk_size=chunk_size,
             scale=scale,
         )
         del v_new
@@ -238,7 +235,7 @@ def backward(
             du=dv_new,
             dv0=dv,
             cu_seqlens=cu_seqlens,
-            chunk_size=BT
+            chunk_size=chunk_size
         )
         del A_ak

@@ -258,7 +255,7 @@ def backward(
             dkg=dkg,
             dag=dag,
             dbg=dbg,
-            chunk_size=BT,
+            chunk_size=chunk_size,
             scale=scale,
             cu_seqlens=cu_seqlens,
         )
```
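Note the section bracketed by the `start/end of recomputation` comments: `backward` rebuilds the forward intermediates from the tensors saved in `ctx` instead of keeping them alive across the autograd graph, trading extra FLOPs for memory, the same idea as activation checkpointing. A toy sketch of the pattern with a hypothetical op (not the dplr kernels themselves):

```python
import torch


class RecomputeSquare(torch.autograd.Function):
    # Toy recompute-in-backward pattern: save only the cheap inputs,
    # re-derive anything large inside backward().

    @staticmethod
    def forward(ctx, x):
        ctx.save_for_backward(x)
        return x * x

    @staticmethod
    def backward(ctx, grad_out):
        (x,) = ctx.saved_tensors
        # Recompute what the gradient needs from the saved input.
        return grad_out * 2 * x


x = torch.randn(4, requires_grad=True)
RecomputeSquare.apply(x).sum().backward()
print(torch.allclose(x.grad, 2 * x.detach()))  # True
```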

fla/ops/generalized_delta_rule/dplr/chunk_A_bwd.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -303,7 +303,7 @@ def chunk_dplr_bwd_dqk_intra(
     chunk_size: int = 64,
 ):
     B, T, H, K = q.shape
-    BT = min(chunk_size, max(16, triton.next_power_of_2(T)))
+    BT = chunk_size
     BK = min(64, triton.next_power_of_2(K)) if check_shared_mem() else min(32, triton.next_power_of_2(K))

     chunk_indices = prepare_chunk_indices(cu_seqlens, BT) if cu_seqlens is not None else None
```

fla/ops/generalized_delta_rule/dplr/chunk_A_fwd.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -153,7 +153,7 @@ def chunk_dplr_fwd_intra(
     cu_seqlens: Optional[torch.LongTensor] = None,
 ):
     B, T, H, K = k.shape
-    BT = min(chunk_size, max(16, triton.next_power_of_2(T)))
+    BT = chunk_size

     chunk_indices = prepare_chunk_indices(cu_seqlens, BT) if cu_seqlens is not None else None
     NT = triton.cdiv(T, BT) if cu_seqlens is None else len(chunk_indices)
```

fla/ops/generalized_delta_rule/dplr/chunk_h_bwd.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -121,7 +121,7 @@ def chunk_dplr_bwd_dhu(
     chunk_size: int = 64
 ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
     B, T, H, K, V = *qg.shape, do.shape[-1]
-    BT = min(chunk_size, max(triton.next_power_of_2(T), 16))
+    BT = chunk_size
     BK = max(triton.next_power_of_2(K), 16)
     assert BK <= 256, "current kernel does not support head dimension being larger than 256."
     # H100
```

fla/ops/generalized_delta_rule/dplr/chunk_h_fwd.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -120,7 +120,7 @@ def chunk_dplr_fwd_h(
     chunk_size: int = 64
 ) -> Tuple[torch.Tensor, torch.Tensor]:
     B, T, H, K, V = *kg.shape, u.shape[-1]
-    BT = min(chunk_size, max(triton.next_power_of_2(T), 16))
+    BT = chunk_size

     chunk_indices = prepare_chunk_indices(cu_seqlens, BT) if cu_seqlens is not None else None
     # N: the actual number of sequences in the batch with either equal or variable lengths
```

fla/ops/generalized_delta_rule/dplr/chunk_o_bwd.py

Lines changed: 3 additions & 3 deletions
```diff
@@ -301,7 +301,7 @@ def chunk_dplr_bwd_dv(
     chunk_size: int = 64
 ) -> torch.Tensor:
     B, T, H, K, V = *kg.shape, do.shape[-1]
-    BT = min(chunk_size, max(16, triton.next_power_of_2(T)))
+    BT = chunk_size

     chunk_indices = prepare_chunk_indices(cu_seqlens, BT) if cu_seqlens is not None else None
     NT = triton.cdiv(T, BT) if cu_seqlens is None else len(chunk_indices)
@@ -344,7 +344,7 @@ def chunk_dplr_bwd_o(

     B, T, H, K, V = *w.shape, v.shape[-1]

-    BT = min(chunk_size, max(16, triton.next_power_of_2(T)))
+    BT = chunk_size
     chunk_indices = prepare_chunk_indices(cu_seqlens, BT) if cu_seqlens is not None else None
     NT = triton.cdiv(T, BT) if cu_seqlens is None else len(chunk_indices)

@@ -398,7 +398,7 @@ def chunk_dplr_bwd_dAu(
     chunk_size: int = 64
 ) -> torch.Tensor:
     B, T, H, V = v.shape
-    BT = min(chunk_size, max(16, triton.next_power_of_2(T)))
+    BT = chunk_size
     chunk_indices = prepare_chunk_indices(cu_seqlens, BT) if cu_seqlens is not None else None
     NT = triton.cdiv(T, BT) if cu_seqlens is None else len(chunk_indices)
```
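A recurring pair of lines across these hunks sizes the launch grid: for fixed-length batches the number of chunks is a ceiling division, while for variable-length batches (`cu_seqlens` set) the library's `prepare_chunk_indices` enumerates chunks per sequence instead. The fixed-length half in isolation:

```python
import triton

T, BT = 1000, 64
NT = triton.cdiv(T, BT)  # ceil(1000 / 64) -> 16 chunks, the last one partial
print(NT)  # 16
```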
