@@ -104,13 +104,6 @@ def _get_bufidx_phase(accum_cnt, NUM_BUFFERS):
    return bufIdx, phase


-@triton.jit
-def _reinterpret(qk_buf, bufIdx_qk):
-    qk_view = tlx.local_view(qk_buf, bufIdx_qk)
-    p_view = tlx.local_reinterpret(qk_view, tl.float16)
-    return p_view
-
-
@triton.jit
def _load_tma(
    bufIdx, phase, empty_bars, full_bars, buffers, desc, offset_1, offset_0, num_bytes
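Note: the circular-buffer helper referenced by this hunk, _get_bufidx_phase, is not shown in full here. A minimal sketch consistent with the inline pattern used later in this diff (bufIdx = accum_cnt % NUM_BUFFERS; phase = (accum_cnt // NUM_BUFFERS) & 1) would look like the following; the name and constexpr annotation are illustrative, not the upstream definition.

import triton
import triton.language as tl

@triton.jit
def _get_bufidx_phase_sketch(accum_cnt, NUM_BUFFERS: tl.constexpr):
    # Map a monotonically increasing counter to a buffer slot plus a 0/1 parity
    # bit, i.e. the phase value passed to tlx.barrier_wait elsewhere in this kernel.
    bufIdx = accum_cnt % NUM_BUFFERS
    phase = (accum_cnt // NUM_BUFFERS) & 1
    return bufIdx, phase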
@@ -146,6 +139,69 @@ def _load_tma(
# qk0, qk1: producers
# p0, p1: sharing tmem spaces, and barriers with qk0, qk1 (consumers)
# o0, o1
+
+
+@triton.jit
+def _add_f32x2(a, b):
+    return tl.inline_asm_elementwise(
+        """
+        {
+            .reg .b64 ra, rb, rc;
+            mov.b64 ra, { $2, $3 };
+            mov.b64 rb, { $4, $5 };
+            add.f32x2 rc, ra, rb;
+            mov.b64 { $0, $1 }, rc;
+        }
+        """,
+        "=r,=r,r,r,r,r",
+        [a, b],
+        dtype=tl.float32,
+        is_pure=True,
+        pack=2,
+    )
+
+
+@triton.jit
+def _mul_f32x2(a, b):
+    return tl.inline_asm_elementwise(
+        """
+        {
+            .reg .b64 ra, rb, rc;
+            mov.b64 ra, { $2, $3 };
+            mov.b64 rb, { $4, $5 };
+            mul.f32x2 rc, ra, rb;
+            mov.b64 { $0, $1 }, rc;
+        }
+        """,
+        "=r,=r,r,r,r,r",
+        [a, b],
+        dtype=tl.float32,
+        is_pure=True,
+        pack=2,
+    )
+
+
+@triton.jit
+def _fma_f32x2(a, b, c):
+    return tl.inline_asm_elementwise(
+        """
+        {
+            .reg .b64 ra, rb, rc, rd;
+            mov.b64 ra, { $2, $3 };
+            mov.b64 rb, { $4, $5 };
+            mov.b64 rc, { $6, $7 };
+            fma.rn.f32x2 rd, ra, rb, rc;
+            mov.b64 { $0, $1 }, rd;
+        }
+        """,
+        "=r,=r,r,r,r,r,r,r",
+        [a, b, c],
+        dtype=tl.float32,
+        is_pure=True,
+        pack=2,
+    )
+
+
@triton.jit
def tanh_approx_fp32(x):
    output = tl.inline_asm_elementwise(
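These helpers pack two fp32 lanes into a 64-bit register pair so a single packed f32x2 PTX instruction handles both: with pack=2, each asm invocation consumes two elements per input operand ($2..$5) and writes two outputs ($0, $1), matching the "=r,=r,r,r,r,r" constraint string. A minimal standalone usage sketch, not part of this commit; it assumes a Blackwell-class GPU (add.f32x2 is a recent sm_100-era PTX instruction) and that _add_f32x2 from this diff is in scope:

import torch
import triton
import triton.language as tl

@triton.jit
def _packed_add_kernel(x_ptr, y_ptr, out_ptr, n, BLOCK: tl.constexpr):
    # Plain elementwise add, routed through the packed f32x2 helper.
    offs = tl.program_id(0) * BLOCK + tl.arange(0, BLOCK)
    mask = offs < n
    x = tl.load(x_ptr + offs, mask=mask)
    y = tl.load(y_ptr + offs, mask=mask)
    tl.store(out_ptr + offs, _add_f32x2(x, y), mask=mask)

x = torch.randn(4096, device="cuda", dtype=torch.float32)
y = torch.randn_like(x)
out = torch.empty_like(x)
_packed_add_kernel[(triton.cdiv(x.numel(), 1024),)](x, y, out, x.numel(), BLOCK=1024)
torch.testing.assert_close(out, x + y)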
@@ -164,7 +220,16 @@ def tanh_approx_fp32(x):
# typical configuration is 3/fast_gelu
@triton.jit
def fast_gelu(x):
-    return x * 0.5 * (1 + tanh_approx_fp32(0.7978845608 * x * (1.0 + 0.044715 * x * x)))
+    # following D80750725
+    # WAS: x * 0.5 * (1 + tanh_approx_fp32(0.7978845608 * x * (1.0 + 0.044715 * x * x))) * scaling
+    # NOW: x * tanh((c1 * x * x + c0)*x) + x
+    c1 = 0.0356774081
+    c0 = 0.7978845608
+    square = _mul_f32x2(x, x)
+    inner = _fma_f32x2(c1, square, c0)
+    inner = _mul_f32x2(inner, x)
+    out = _fma_f32x2(x, tanh_approx_fp32(inner), x)
+    return out


@triton.autotune(
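The rewrite folds the constant 0.7978845608 * 0.044715 ≈ 0.0356774081 into c1, so the tanh argument is unchanged, and x * tanh(u) + x equals x * 0.5 * (1 + tanh(u)) * 2, i.e. the WAS expression with scaling = 2. A quick NumPy sanity check of that algebra (a standalone sketch, not part of the commit, with np.tanh standing in for tanh_approx_fp32):

import numpy as np

x = np.linspace(-6.0, 6.0, 1001)
u = 0.7978845608 * x * (1.0 + 0.044715 * x * x)
was = x * 0.5 * (1.0 + np.tanh(u)) * 2.0  # old form with scaling = 2
now = x * np.tanh((0.0356774081 * x * x + 0.7978845608) * x) + x
print(np.abs(was - now).max())  # tiny; differs only by the rounding of c1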
@@ -255,7 +320,7 @@ def gdpa_kernel_tma_ws_blackwell(
    )

    if USE_ON_DEVICE_TMA:
-        dtype = V.dtype.element_ty  # v_dtype)
+        dtype = V.dtype.element_ty
    else:
        dtype = tlx.dtype_of(v_desc)

@@ -287,18 +352,14 @@ def gdpa_kernel_tma_ws_blackwell(
    consumer_release_q0 = tlx.alloc_barriers(num_barriers=NUM_BUFFERS_Q, arrive_count=1)
    consumer_release_q1 = tlx.alloc_barriers(num_barriers=NUM_BUFFERS_Q, arrive_count=1)
    consumer_kv = tlx.alloc_barriers(num_barriers=NUM_BUFFERS_KV, arrive_count=1)
-    # consumer_v = tlx.alloc_barriers(num_barriers=NUM_BUFFERS_KV, arrive_count=1)
    consumer_release_kv = tlx.alloc_barriers(
        num_barriers=NUM_BUFFERS_KV, arrive_count=1
    )
    tlx.barrier_arrive(consumer_release_kv[0], 1)
    tlx.barrier_arrive(consumer_release_kv[1], 1)
    tlx.barrier_arrive(consumer_release_kv[2], 1)
-    # consumer_release_v = tlx.alloc_barriers(num_barriers=NUM_BUFFERS_KV, arrive_count=1)

-    # producer_qk0 == consumer_release_qk0
    producer_qk0 = tlx.alloc_barriers(num_barriers=NUM_BUFFERS_QK, arrive_count=1)
-    # producer_commit_qk0 == consumer_qk0
    producer_commit_qk0 = tlx.alloc_barriers(
        num_barriers=NUM_BUFFERS_QK, arrive_count=1
    )
@@ -307,13 +368,9 @@ def gdpa_kernel_tma_ws_blackwell(
        num_barriers=NUM_BUFFERS_QK, arrive_count=1
    )

-    producer_o0 = tlx.alloc_barriers(
-        num_barriers=NUM_BUFFERS_O, arrive_count=1
-    )  # only acquire for the first iteration
+    producer_o0 = tlx.alloc_barriers(num_barriers=NUM_BUFFERS_O, arrive_count=1)
    producer_commit_o0 = tlx.alloc_barriers(num_barriers=NUM_BUFFERS_O, arrive_count=1)
-    producer_o1 = tlx.alloc_barriers(
-        num_barriers=NUM_BUFFERS_O, arrive_count=1
-    )  # only acquire for the first iteration
+    producer_o1 = tlx.alloc_barriers(num_barriers=NUM_BUFFERS_O, arrive_count=1)
    producer_commit_o1 = tlx.alloc_barriers(num_barriers=NUM_BUFFERS_O, arrive_count=1)

    with tlx.async_tasks():
@@ -343,27 +400,31 @@ def gdpa_kernel_tma_ws_blackwell(
            for start_n in range(lo, hi, BLOCK_N):
                start_n = tl.multiple_of(start_n, BLOCK_N)
                # tl.device_print("default start_n", start_n)
-                ## communication channel for qk0, p0
-                # qk in tmem, output p in tmem
                bufIdx = accum_cnt % NUM_BUFFERS_QK
                phase = (accum_cnt // NUM_BUFFERS_QK) & 1
                qk_view = tlx.local_view(qk0_buf, bufIdx)
                consumer_qk_view = tlx.local_view(producer_commit_qk0, bufIdx)
                # tl.device_print("default producer_commit_qk0", accum_cnt)
                # tl.device_print("default producer_commit_qk0_phase", phase)
                tlx.barrier_wait(consumer_qk_view, phase)
-                qk0 = tlx.local_load(qk_view)  # , tlx.storage_kind.tmem)
-                # ConsumerWait for qk, ProducerAcquire for p
-                # if activation_enum_int == 3:
+
+                # qk_view: BLOCK_M // 2, HEAD_DIM
+                qk_view_1st = tlx.subslice(qk_view, 0, HEAD_DIM // 2)
+                qk0 = tlx.local_load(qk_view_1st)
                p0 = fast_gelu(qk0)
-                p0 *= qk_scale
-                if USE_ON_DEVICE_TMA:
-                    p0 = p0.to(V.dtype.element_ty)  # v_dtype)
-                else:
-                    p0 = p0.to(tlx.dtype_of(v_desc))
-                qk_view = tlx.local_view(qk0_buf, bufIdx)
-                p0_view = tlx.local_reinterpret(qk_view, dtype)
-                tlx.local_store(p0_view, p0)  # , tlx.storage_kind.tmem)
+                p0 = p0.to(dtype)
+                p0_view = tlx.local_reinterpret(qk_view_1st, dtype)
+                tlx.local_store(p0_view, p0)
+
+                qk_view_2nd = tlx.subslice(
+                    qk_view, HEAD_DIM // 2, HEAD_DIM // 2
+                )
+                qk0 = tlx.local_load(qk_view_2nd)
+                p0 = fast_gelu(qk0)
+                p0 = p0.to(dtype)
+                p0_view = tlx.local_reinterpret(qk_view_2nd, dtype)
+                tlx.local_store(p0_view, p0)
+
                # p and qk reuse tmem space, single producer commit for p via consumer_release_qk
                consumer_release_qk_view = tlx.local_view(producer_qk0, bufIdx)
                tlx.barrier_arrive(consumer_release_qk_view, 1)
@@ -382,10 +443,8 @@ def gdpa_kernel_tma_ws_blackwell(
            bufIdx_o_outer, phase_o_outer = _get_bufidx_phase(
                accum_cnt_outer, NUM_BUFFERS_O
            )
-            o0_view = tlx.local_view(
-                o0_buf, bufIdx_o_outer
-            )  # FIXME: index for the last iteration
-            o0 = tlx.local_load(o0_view)  # , tlx.storage_kind.tmem)
+            o0_view = tlx.local_view(o0_buf, bufIdx_o_outer)
+            o0 = tlx.local_load(o0_view)
            # release o0 here
            consumer_release_o0_view = tlx.local_view(
                producer_o0, bufIdx_o_outer
@@ -437,24 +496,29 @@ def gdpa_kernel_tma_ws_blackwell(
            for start_n in range(lo, hi, BLOCK_N):
                start_n = tl.multiple_of(start_n, BLOCK_N)
                ## communication channel for qk1, p1
-                # qk in tmem, output p in tmem
                bufIdx = accum_cnt % NUM_BUFFERS_QK
                phase = (accum_cnt // NUM_BUFFERS_QK) & 1
                qk_view = tlx.local_view(qk1_buf, bufIdx)
                consumer_qk_view = tlx.local_view(producer_commit_qk1, bufIdx)
                tlx.barrier_wait(consumer_qk_view, phase)
-                qk1 = tlx.local_load(qk_view)  # , tlx.storage_kind.tmem)
-                # ConsumerWait for qk, ProducerAcquire for p
-                # if activation_enum_int == 3:
-                p1 = fast_gelu(qk1)
-                p1 *= qk_scale
-                if USE_ON_DEVICE_TMA:
-                    p1 = p1.to(V.dtype.element_ty)  # v_dtype)
-                else:
-                    p1 = p1.to(tlx.dtype_of(v_desc))
-                qk_view = tlx.local_view(qk1_buf, bufIdx)
-                p1_view = tlx.local_reinterpret(qk_view, dtype)
-                tlx.local_store(p1_view, p1)  # , tlx.storage_kind.tmem)
+
+                # qk_view: BLOCK_M // 2, HEAD_DIM
+                qk_view_1st = tlx.subslice(qk_view, 0, HEAD_DIM // 2)
+                qk0 = tlx.local_load(qk_view_1st)
+                p0 = fast_gelu(qk0)
+                p0 = p0.to(dtype)
+                p0_view = tlx.local_reinterpret(qk_view_1st, dtype)
+                tlx.local_store(p0_view, p0)
+
+                qk_view_2nd = tlx.subslice(
+                    qk_view, HEAD_DIM // 2, HEAD_DIM // 2
+                )
+                qk0 = tlx.local_load(qk_view_2nd)
+                p0 = fast_gelu(qk0)
+                p0 = p0.to(dtype)
+                p0_view = tlx.local_reinterpret(qk_view_2nd, dtype)
+                tlx.local_store(p0_view, p0)
+
                # p and qk reuse tmem space, single producer commit for p via consumer_release_qk
                consumer_release_qk_view = tlx.local_view(producer_qk1, bufIdx)
                tlx.barrier_arrive(consumer_release_qk_view, 1)
@@ -477,10 +541,8 @@ def gdpa_kernel_tma_ws_blackwell(
                    strides=[HEAD_DIM * H, 1],
                    block_shape=[BLOCK_M // 2, BLOCK_D],
                )
-                o1_view = tlx.local_view(
-                    o1_buf, bufIdx_o_outer
-                )  # FIXME: should be 0
-                o1 = tlx.local_load(o1_view)  # , tlx.storage_kind.tmem)
+                o1_view = tlx.local_view(o1_buf, bufIdx_o_outer)
+                o1 = tlx.local_load(o1_view)
                # release o1 here
                consumer_release_o1_view = tlx.local_view(
                    producer_o1, bufIdx_o_outer
@@ -620,7 +682,6 @@ def gdpa_kernel_tma_ws_blackwell(
                    consumer_p0_view, phase_p
                )  # consumer wait for p0 due to reuse of p0 and qk0
                # reinterpret qk0 as p0
-                # p0_view = _reinterpret(qk0_buf, bufIdx_p)
                qk_view = tlx.local_view(qk0_buf, bufIdx_p)
                p0_view = tlx.local_reinterpret(qk_view, dtype)
@@ -712,7 +773,6 @@ def gdpa_kernel_tma_ws_blackwell(
                    consumer_release_kv, bufIdx_v
                )
                # reinterpret as p1
-                # p1_view = _reinterpret(qk1_buf, bufIdx_qk1)
                qk_view = tlx.local_view(qk1_buf, bufIdx_qk1)
                p1_view = tlx.local_reinterpret(qk_view, dtype)
                tlx.async_dot(  # p1 . v from previous iteration
@@ -773,7 +833,6 @@ def gdpa_kernel_tma_ws_blackwell(
                    consumer_p0_view, phase_qk
                )  # consumer wait for p0 use producer_qk0 due to reuse
                # reinterpret as p0
-                # p0_view = _reinterpret(qk0_buf, bufIdx_qk)
                qk_view = tlx.local_view(qk0_buf, bufIdx_qk)
                p0_view = tlx.local_reinterpret(qk_view, dtype)
@@ -822,7 +881,6 @@ def gdpa_kernel_tma_ws_blackwell(
                tlx.barrier_wait(
                    consumer_p1_view, phase_qk1
                )  # consumer wait for p1 due to reuse of p1 and qk1
-                # p1_view = _reinterpret(qk1_buf, bufIdx_qk1)
                qk_view = tlx.local_view(qk1_buf, bufIdx_qk1)
                p1_view = tlx.local_reinterpret(qk_view, dtype)