Fix illegal memory access in Triton RMSNorm and RoPE (#804)

vvvdwbvvv · lancerts · web-flow · commit b27b4c5f2c31 · 2025-07-21T18:00:50.000+08:00
## Summary  When using very large tensors (e.g. seq_len=1e6, hidden_size=4096), offsets computed from Triton’s 32-bit `tl.program_id(0)` (e.g. `pid * row_stride`) can overflow 32-bit integer arithmetic, leading to out-of-bounds memory accesses. This change casts the program ID to 64-bit (`tl.int64`) so that all pointer arithmetic derived from it is performed in 64 bits and stays within the valid address range. Fix #803  ## Testing Done   - Hardware Type: <BLANK> - [ ] run `make test` to ensure correctness - [ ] run `make checkstyle` to ensure code style - [ ] run `make test-convergence` to ensure convergence --------- Co-authored-by: Shao Tang <tangshao28@gmail.com>
diff --git a/src/liger_kernel/ops/rms_norm.py b/src/liger_kernel/ops/rms_norm.py
@@ -63,7 +63,7 @@ def _rms_norm_forward_kernel(
     3. https://arxiv.org/pdf/1910.07467
     """
 
-    row_idx = tl.program_id(0)
+    row_idx = tl.program_id(0).to(tl.int64)
     col_offsets = tl.arange(0, BLOCK_SIZE)
     mask = col_offsets < n_cols
 
@@ -137,7 +137,7 @@ def _rms_norm_backward_kernel(
     dw = sum(dy * (x / RMS)). summation over BxT dimension
     """
 
-    row_block_id = tl.program_id(0)
+    row_block_id = tl.program_id(0).to(tl.int64)
     row_start = row_block_id * rows_per_program
     row_end = min((row_block_id + 1) * rows_per_program, n_rows)
     col_offsets = tl.arange(0, BLOCK_SIZE)
diff --git a/src/liger_kernel/ops/rope.py b/src/liger_kernel/ops/rope.py
@@ -32,7 +32,7 @@ def _triton_rope(
 
     # cos size: (1, seq_len, head_dim) or (bsz, seq_len, head_dim)
     # stride: (seq_len * head_dim, head_dim, 1)
-    pid = tl.program_id(0)
+    pid = tl.program_id(0).to(tl.int64)
 
     # locate start address
     q_ptr = q_ptr + pid * q_row_stride