
Commit ad6f0a7

[NPU] Frequencies fusion for Llama4_rope on NPU (#1053)
## Summary

This PR is a descendant of #1035. It removes `_prepare_freqs` and directly uses a single `freqs_complex_ptr` for llama4_rope frequencies inside the Triton kernel. Avoiding the extra preprocessing and the separate real/imag loads simplifies the code path and improves performance; benchmarks show better results than the previous implementation.

## Testing Done

Tested with `python -m pytest ./test/transformers/test_llama4_rope.py -v`
Verified on Atlas 800I A2

- [ ] run `make test` to ensure correctness
- [x] run `make checkstyle` to ensure code style
- [ ] run `make test-convergence` to ensure convergence
1 parent 60772c9 commit ad6f0a7
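
Note (not part of this commit): a minimal sketch of the frequency layout the kernel now consumes. When `freqs_cis` is complex with shape `(seq_len, head_dim // 2)`, `torch.view_as_real` exposes it as a real tensor of shape `(seq_len, head_dim // 2, 2)` whose last axis holds interleaved (real, imag) pairs; that single buffer is what the kernel receives as `freqs_complex_ptr`. The shapes below are assumptions for illustration.

```python
import torch

# Hypothetical shapes, for illustration only.
seq_len, head_dim = 8, 64

# Complex RoPE frequencies of shape (seq_len, head_dim // 2).
freqs_cis = torch.polar(
    torch.ones(seq_len, head_dim // 2),
    torch.randn(seq_len, head_dim // 2),
)

# What the new code path does before launching the kernel.
freqs = freqs_cis.reshape(-1, freqs_cis.shape[-1])[:seq_len]
freqs = torch.view_as_real(freqs)  # real-valued, shape (seq_len, head_dim // 2, 2)

# The last axis interleaves (real, imag) pairs.
assert torch.equal(freqs[..., 0], freqs_cis.real)
assert torch.equal(freqs[..., 1], freqs_cis.imag)
```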

File tree

2 files changed (+51, -132 lines)

src/liger_kernel/ops/backends/_ascend/ops/llama4_rope.py

Lines changed: 30 additions & 66 deletions
@@ -5,49 +5,7 @@
 from liger_kernel.ops.backends._ascend.ub_manager import compute_default_tiling_strategy
 
 
-def _prepare_freqs(freqs_cis: torch.Tensor, seq_len: int, head_dim_half: int):
-    """
-    Canonicalize freqs to (seq_len, head_dim_half) real/imag tensors.
-
-    Supports:
-    - complex freqs: (..., head_dim_half) complex -> real/imag
-    - packed freqs: (..., 2*head_dim_half) real -> split into real/imag
-    """
-    if freqs_cis.is_complex():
-        freqs_real = freqs_cis.real
-        freqs_imag = freqs_cis.imag
-    else:
-        if freqs_cis.shape[-1] == 2 * head_dim_half:
-            freqs_real = freqs_cis[..., :head_dim_half]
-            freqs_imag = freqs_cis[..., head_dim_half:]
-        else:
-            raise ValueError(
-                f"Unexpected freqs_cis shape for non-complex input: {freqs_cis.shape}, "
-                f"expected last dim = {2 * head_dim_half}"
-            )
-
-    if freqs_real.shape[-1] != head_dim_half:
-        raise ValueError(f"Unexpected last dim for freqs: {freqs_real.shape[-1]} (expected {head_dim_half})")
-
-    # Flatten leading dims -> (N, head_dim_half)
-    freqs_real = freqs_real.reshape(-1, head_dim_half)
-    freqs_imag = freqs_imag.reshape(-1, head_dim_half)
-
-    # Broadcast/slice to (seq_len, head_dim_half)
-    if freqs_real.shape[0] < seq_len:
-        if freqs_real.shape[0] == 1:
-            freqs_real = freqs_real.expand(seq_len, -1)
-            freqs_imag = freqs_imag.expand(seq_len, -1)
-        else:
-            raise ValueError(f"Insufficient rows in freqs: {freqs_real.shape[0]} < seq_len={seq_len}")
-    elif freqs_real.shape[0] > seq_len:
-        freqs_real = freqs_real[:seq_len]
-        freqs_imag = freqs_imag[:seq_len]
-
-    return freqs_real, freqs_imag
-
-
-def _cast_and_contiguous(q, k, freqs_real, freqs_imag):
+def _cast_and_contiguous(q, k, freqs_complex):
     # Align dtype: fp32 only when q is fp32; otherwise keep q dtype for perf
     compute_dtype = torch.float32 if q.dtype == torch.float32 else q.dtype
 
@@ -56,17 +14,15 @@ def _cast_and_contiguous(q, k, freqs_real, freqs_imag):
 
     q = q.to(compute_dtype).contiguous()
     k = k.to(compute_dtype).contiguous()
-    freqs_real = freqs_real.to(compute_dtype).contiguous()
-    freqs_imag = freqs_imag.to(compute_dtype).contiguous()
-    return q, k, freqs_real, freqs_imag, compute_dtype
+    freqs_complex = freqs_complex.contiguous()
+    return q, k, freqs_complex, compute_dtype
 
 
 @triton.jit
 def _triton_llama4_rope_npu(
     q_ptr,
     k_ptr,
-    freqs_real_ptr,
-    freqs_imag_ptr,
+    freqs_complex_ptr,
     q_row_stride,
     k_row_stride,
     q_head_stride,
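
For reference, the reworked helper as a standalone, runnable sketch: `_cast_and_contiguous` now only enforces contiguity on the frequency tensor instead of casting it, while q/k still share a compute dtype. The function body mirrors the NPU-side diff above; the tensors and dtypes in the small demo are assumptions.

```python
import torch


def _cast_and_contiguous(q, k, freqs_complex):
    # Align dtype: fp32 only when q is fp32; otherwise keep q dtype for perf
    compute_dtype = torch.float32 if q.dtype == torch.float32 else q.dtype

    if k.dtype != q.dtype:
        k = k.to(q.dtype)

    q = q.to(compute_dtype).contiguous()
    k = k.to(compute_dtype).contiguous()
    freqs_complex = freqs_complex.contiguous()
    return q, k, freqs_complex, compute_dtype


# Assumed inputs: half-precision q/k keep their dtype; freqs are only made contiguous.
q = torch.randn(1, 4, 8, 64, dtype=torch.bfloat16)
k = torch.randn(1, 4, 2, 64, dtype=torch.float16)
freqs = torch.randn(4, 32, 2)  # (sl, hd // 2, 2) real/imag pairs

q, k, freqs, compute_dtype = _cast_and_contiguous(q, k, freqs)
assert compute_dtype == torch.bfloat16 and k.dtype == torch.bfloat16
assert freqs.dtype == torch.float32  # untouched except for contiguity
```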
@@ -84,8 +40,7 @@ def _triton_llama4_rope_npu(
     """
     Llama4 RoPE on Ascend NPU for interleaved complex layout:
     - q/k shape: (bs, sl, n_heads, hd)
-    - last dim layout: [real0, imag0, real1, imag1, ...]
-    - freqs_real/imag: (sl, hd//2)
+    - freqs_complex_ptr: (sl, hd//2, 2)
     """
     pid = tl.program_id(0).to(tl.int64)
     batch_idx = pid // sl
@@ -101,11 +56,14 @@ def _triton_llama4_rope_npu(
     hd_idx = tl.arange(0, hd)
     hd_mask = hd_idx < (hd)
 
-    freq_idx = tl.arange(0, hd // 2)
-    freq_mask = freq_idx < (hd // 2)
+    freq_idx = tl.arange(0, hd)
+    freq_mask = freq_idx < (hd)
 
-    freqs_real = tl.load(freqs_real_ptr + freq_base + freq_idx, mask=freq_mask, other=0.0)
-    freqs_imag = tl.load(freqs_imag_ptr + freq_base + freq_idx, mask=freq_mask, other=0.0) * imag_sign
+    freqs_complex = tl.load(freqs_complex_ptr + freq_base + freq_idx, mask=freq_mask, other=0.0)
+
+    freqs_complex = freqs_complex.reshape(hd // 2, 2, can_reorder=True)
+    freqs_real, freqs_imag = tl.split(freqs_complex)
+    freqs_imag = freqs_imag * imag_sign
 
     # Q heads (chunked for UB)
     for qh_block in range(0, n_qh, BLOCK_Q):
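
The interleaved load above replaces two separate real/imag loads: the kernel reads `hd` contiguous values per row and then splits them into `hd // 2` (real, imag) pairs. A host-side PyTorch sketch of the same deinterleave (the Triton kernel uses `reshape` + `tl.split`; the values here are made up for illustration):

```python
import torch

hd = 8  # head dimension, assumed even

# One flattened row of the (sl, hd // 2, 2) frequency buffer:
# [real0, imag0, real1, imag1, ...]
freqs_flat = torch.arange(hd, dtype=torch.float32)

# Equivalent of reshape(hd // 2, 2) followed by tl.split in the kernel.
pairs = freqs_flat.reshape(hd // 2, 2)
freqs_real, freqs_imag = pairs.unbind(dim=-1)

print(freqs_real)  # tensor([0., 2., 4., 6.])
print(freqs_imag)  # tensor([1., 3., 5., 7.])
```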
@@ -166,10 +124,14 @@ def llama4_rope_forward(q, k, freqs_cis):
     _, _, n_kh, _ = k.shape
     if hd % 2 != 0:
         raise ValueError(f"head_dim must be even for interleaved complex layout, got {hd}")
-    hd_half = hd // 2
 
-    freqs_real, freqs_imag = _prepare_freqs(freqs_cis, sl, hd_half)
-    q, k, freqs_real, freqs_imag, compute_dtype = _cast_and_contiguous(q, k, freqs_real, freqs_imag)
+    if freqs_cis.is_complex():
+        freqs_cis = freqs_cis.reshape(-1, freqs_cis.shape[-1])
+        if freqs_cis.shape[0] > sl:
+            freqs_cis = freqs_cis[:sl]
+        freqs_cis = torch.view_as_real(freqs_cis)
+
+    q, k, freqs_cis, compute_dtype = _cast_and_contiguous(q, k, freqs_cis)
 
     # UB tiling strategy: tile heads dimension only
     dtype_size = q.element_size()
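
The inline preprocessing above is intended to cover the complex-input case the removed `_prepare_freqs` handled: flatten any leading dims, keep the first `sl` rows, and expose real/imag pairs via `torch.view_as_real`. A small equivalence sketch under assumed shapes (a leading batch dim, as the old helper accepted):

```python
import torch

# Assumed shapes for illustration.
bs, sl, hd_half = 2, 4, 8

# Complex freqs with a leading batch dim.
freqs_cis = torch.polar(torch.ones(bs, sl, hd_half), torch.randn(bs, sl, hd_half))

# Old helper behavior (removed in this commit): flatten leading dims, take first sl rows.
old_real = freqs_cis.real.reshape(-1, hd_half)[:sl]
old_imag = freqs_cis.imag.reshape(-1, hd_half)[:sl]

# New inline path: flatten, slice, then view the complex values as (real, imag) pairs.
fc = torch.view_as_real(freqs_cis.reshape(-1, hd_half)[:sl])  # (sl, hd_half, 2)

assert torch.equal(fc[..., 0], old_real)
assert torch.equal(fc[..., 1], old_imag)
```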
@@ -195,13 +157,12 @@ def llama4_rope_forward(q, k, freqs_cis):
     _triton_llama4_rope_npu[(n_row,)](
         q,
         k,
-        freqs_real,
-        freqs_imag,
+        freqs_cis,
         q.stride(1),
         k.stride(1),
         q.stride(2),
         k.stride(2),
-        freqs_real.stride(0),
+        freqs_cis.stride(0),
         sl,
         bs,
         n_qh,
@@ -231,10 +192,14 @@ def llama4_rope_backward(dq, dk, freqs_cis):
     _, _, n_kh, _ = dk.shape
     if hd % 2 != 0:
         raise ValueError(f"head_dim must be even for interleaved complex layout, got {hd}")
-    hd_half = hd // 2
 
-    freqs_real, freqs_imag = _prepare_freqs(freqs_cis, sl, hd_half)
-    dq, dk, freqs_real, freqs_imag, compute_dtype = _cast_and_contiguous(dq, dk, freqs_real, freqs_imag)
+    if freqs_cis.is_complex():
+        freqs_cis = freqs_cis.reshape(-1, freqs_cis.shape[-1])
+        if freqs_cis.shape[0] > sl:
+            freqs_cis = freqs_cis[:sl]
+        freqs_cis = torch.view_as_real(freqs_cis)
+
+    dq, dk, freqs_cis, compute_dtype = _cast_and_contiguous(dq, dk, freqs_cis)
 
     # UB tiling strategy: tile heads dimension only
     dtype_size = dq.element_size()
@@ -260,13 +225,12 @@ def llama4_rope_backward(dq, dk, freqs_cis):
     _triton_llama4_rope_npu[(n_row,)](
         dq,
         dk,
-        freqs_real,
-        freqs_imag,
+        freqs_cis,
         dq.stride(1),
         dk.stride(1),
         dq.stride(2),
         dk.stride(2),
-        freqs_real.stride(0),
+        freqs_cis.stride(0),
         sl,
         bs,
         n_qh,

src/liger_kernel/ops/llama4_rope.py

Lines changed: 21 additions & 66 deletions
@@ -3,72 +3,24 @@
 import triton.language as tl
 
 
-def _prepare_freqs(freqs_cis: torch.Tensor, seq_len: int, head_dim_half: int):
-    # Split or unpack complex frequencies into real and imag parts
-    if freqs_cis.is_complex():
-        freqs_real = freqs_cis.real
-        freqs_imag = freqs_cis.imag
-    else:
-        # Already split: last dim should be 2*head_dim_half
-        if freqs_cis.shape[-1] == 2 * head_dim_half:
-            freqs_real = freqs_cis[..., :head_dim_half]
-            freqs_imag = freqs_cis[..., head_dim_half:]
-        else:
-            raise ValueError(
-                f"Unexpected freqs_cis shape for non-complex input: {freqs_cis.shape}, expected last dim = {2 * head_dim_half}"
-            )
-
-    # Canonicalize to shape (seq_len, head_dim_half):
-    # 1) Ensure the last dimension is head_dim_half
-    if freqs_real.shape[-1] != head_dim_half:
-        raise ValueError(f"Unexpected last dim for freqs: {freqs_real.shape[-1]} (expected {head_dim_half})")
-    # 2) Flatten all leading dims to a single row dimension
-    freqs_real = freqs_real.reshape(-1, head_dim_half)
-    freqs_imag = freqs_imag.reshape(-1, head_dim_half)
-    # 3) If we have fewer rows than seq_len, allow broadcasting when single row
-    if freqs_real.shape[0] < seq_len:
-        if freqs_real.shape[0] == 1:
-            freqs_real = freqs_real.expand(seq_len, -1)
-            freqs_imag = freqs_imag.expand(seq_len, -1)
-        else:
-            raise ValueError(f"Insufficient rows in freqs: {freqs_real.shape[0]} < seq_len={seq_len}")
-    # 4) If we have more rows than seq_len (e.g., batch present), take the first seq_len rows
-    elif freqs_real.shape[0] > seq_len:
-        freqs_real = freqs_real[:seq_len]
-        freqs_imag = freqs_imag[:seq_len]
-
-    return freqs_real, freqs_imag
-
-
-def _maybe_to_dtype(t: torch.Tensor, dtype: torch.dtype) -> torch.Tensor:
-    return t if t.dtype == dtype else t.to(dtype)
-
-
-def _maybe_contiguous(t: torch.Tensor) -> torch.Tensor:
-    return t if t.is_contiguous() else t.contiguous()
-
-
-def _cast_and_contiguous(q, k, freqs_real, freqs_imag):
-    # Choose compute dtype: use fp32 only when inputs are fp32; otherwise keep input dtype for performance
+def _cast_and_contiguous(q, k, freqs_complex):
+    # Align dtype: fp32 only when q is fp32; otherwise keep q dtype for perf
     compute_dtype = torch.float32 if q.dtype == torch.float32 else q.dtype
 
-    # Make sure q/k share the same dtype before casting to compute dtype
     if k.dtype != q.dtype:
         k = k.to(q.dtype)
 
-    q = _maybe_contiguous(_maybe_to_dtype(q, compute_dtype))
-    k = _maybe_contiguous(_maybe_to_dtype(k, compute_dtype))
-    freqs_real = _maybe_contiguous(_maybe_to_dtype(freqs_real, compute_dtype))
-    freqs_imag = _maybe_contiguous(_maybe_to_dtype(freqs_imag, compute_dtype))
-    return q, k, freqs_real, freqs_imag
+    q = q.to(compute_dtype).contiguous()
+    k = k.to(compute_dtype).contiguous()
+    freqs_complex = freqs_complex.contiguous()
+    return q, k, freqs_complex
 
 
 @triton.jit
 def _llama4_rope_kernel(
     q_ptr,
     k_ptr,
-    freqs_real_ptr,
-    freqs_imag_ptr,
+    freqs_complex_ptr,
     q_row_stride,
     k_row_stride,
     q_head_stride,
@@ -101,16 +53,18 @@ def _llama4_rope_kernel(
     base_offset = batch_idx * seq_len + seq_idx
     q_base = q_ptr + base_offset * q_row_stride
     k_base = k_ptr + base_offset * k_row_stride
+    freq_base = seq_idx * freqs_row_stride
 
     # Tiling over dim/2
     for d_start in tl.static_range(0, head_dim_half, BLOCK_SIZE):
         d_indices = d_start + tl.arange(0, BLOCK_SIZE)
         mask_d = d_indices < head_dim_half
 
-        # Load frequencies once per tile (freqs layout: [seq_len, head_dim_half])
-        freq_idx = d_indices
-        freqs_real = tl.load(freqs_real_ptr + seq_idx * freqs_row_stride + freq_idx, mask=mask_d, other=0.0)
-        freqs_imag = tl.load(freqs_imag_ptr + seq_idx * freqs_row_stride + freq_idx, mask=mask_d, other=0.0)
+        # Compute offsets for the block
+        freq_offsets = d_indices[:, None] * 2 + tl.arange(0, 2)[None, :]
+        # Load the block
+        freqs_complex = tl.load(freqs_complex_ptr + freq_base + freq_offsets, mask=mask_d[:, None], other=0.0)
+        freqs_real, freqs_imag = tl.split(freqs_complex)
         freqs_imag = freqs_imag * imag_sign
 
         # Process one query head per program in pid_h
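
The 2D offsets introduced above address the interleaved (real, imag) pairs of one BLOCK_SIZE tile: pair index `d` maps to flat positions `2*d` and `2*d + 1` within the row. A small PyTorch sketch of the same indexing arithmetic (tile sizes and data are assumptions):

```python
import torch

BLOCK_SIZE, head_dim_half = 4, 16  # assumed tile parameters
d_start = 4
d_indices = d_start + torch.arange(BLOCK_SIZE)

# Same arithmetic as in the kernel: pair d -> flat positions 2*d and 2*d + 1.
freq_offsets = d_indices[:, None] * 2 + torch.arange(2)[None, :]
print(freq_offsets)
# tensor([[ 8,  9],
#         [10, 11],
#         [12, 13],
#         [14, 15]])

# Gathering with these offsets from a flat (real, imag, real, imag, ...) row
# yields the (BLOCK_SIZE, 2) block that tl.split separates into real and imag parts.
row = torch.arange(2 * head_dim_half, dtype=torch.float32)
block = row[freq_offsets]
freqs_real, freqs_imag = block.unbind(dim=-1)
```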
@@ -159,12 +113,14 @@ def llama4_rope_forward(q, k, freqs_cis, BLOCK_SIZE: int = None, imag_sign: floa
     batch_size, seq_len, n_q_heads, head_dim = q.shape
     _, _, n_k_heads, _ = k.shape
     head_dim_half = head_dim // 2
-
-    # Prepare frequencies
-    freqs_real, freqs_imag = _prepare_freqs(freqs_cis, seq_len, head_dim_half)
+    if freqs_cis.is_complex():
+        freqs_cis = freqs_cis.reshape(-1, freqs_cis.shape[-1])
+        if freqs_cis.shape[0] > seq_len:
+            freqs_cis = freqs_cis[:seq_len]
+        freqs_cis = torch.view_as_real(freqs_cis)
 
     # Cast to appropriate dtype and make contiguous only when needed
-    q, k, freqs_real, freqs_imag = _cast_and_contiguous(q, k, freqs_real, freqs_imag)
+    q, k, freqs_cis = _cast_and_contiguous(q, k, freqs_cis)
 
     # H100-optimized meta-params
     if BLOCK_SIZE is None:
@@ -181,13 +137,12 @@ def llama4_rope_forward(q, k, freqs_cis, BLOCK_SIZE: int = None, imag_sign: floa
     _llama4_rope_kernel[grid](
         q,
         k,
-        freqs_real,
-        freqs_imag,
+        freqs_cis,
         q.stride(1),
         k.stride(1),
         q.stride(2),
         k.stride(2),
-        freqs_real.stride(0),
+        freqs_cis.stride(0),
         seq_len,
         batch_size,
         imag_sign,
