implement persistent loop based rmsnorm kernel (#676)

xiaohuguo2023 · web-flow · commit 9cdcf1d546b7 · 2024-12-16T17:15:26.000Z
* implement persistent loop based rmsnorm kernel

* remove comment in autotune configs and always assume Triton kernel use GPU

* remove comments

* slightly improved perf

* add get_num_sms functions, and change triton_rmsnorn call interface

* num_stages =1 is the best

* update format
diff --git a/python/perf-kernels/rmsnorm.py b/python/perf-kernels/rmsnorm.py
@@ -16,6 +16,13 @@ def is_hip():
     return triton.runtime.driver.active.get_current_target().backend == "hip"
 
 
+def get_num_sms():
+    current_device_index = torch.cuda.current_device()
+    current_device = torch.cuda.get_device_properties(current_device_index)
+    num_sms = current_device.multi_processor_count
+    return num_sms
+
+
 def get_cuda_autotune_config():
     return [
         triton.Config({}, num_warps=4, num_stages=1),
@@ -25,9 +32,7 @@ def get_cuda_autotune_config():
 
 
 def get_hip_autotune_config():
-    return [
-        triton.Config({'waves_per_eu': we}, num_warps=nw, num_stages=2) for (we, nw) in product([0, 1, 2, 4], [8, 16])
-    ]
+    return [triton.Config({'waves_per_eu': we}, num_warps=nw) for (we, nw) in product([0, 1, 2, 4], [4, 8, 16])]
 
 
 def get_autotune_config():
@@ -37,102 +42,92 @@ def get_autotune_config():
         return get_hip_autotune_config()
 
 
-# accumulate sum of squares for a row in a blocked manner
-@triton.jit
-def accumulate_sum_squares(input_ptr, input_row_stride, n_cols, BLOCK_SIZE, row_idx):
-    col_offsets = tl.arange(0, BLOCK_SIZE)
-    sum_squares = tl.zeros([1], dtype=tl.float32)
-    row_input_ptr = input_ptr + row_idx * input_row_stride
-
-    n_cols_blks = tl.cdiv(n_cols, BLOCK_SIZE) - 1
-    for start in range(0, n_cols_blks * BLOCK_SIZE, BLOCK_SIZE):
-        cols = start + col_offsets
-        input_ptrs = row_input_ptr + cols
-        input_ptrs = tl.multiple_of(input_ptrs, (16, ))
-        x = tl.load(input_ptrs)
-        sum_squares += tl.sum(x * x, axis=0)
-
-    # loop peeling for mask
-    cols = n_cols_blks * BLOCK_SIZE + col_offsets
-    mask = cols < n_cols
-    input_ptrs = row_input_ptr + cols
-    input_ptrs = tl.multiple_of(input_ptrs, (16, ))
-    x = tl.load(input_ptrs, mask=mask, other=0.0, cache_modifier=".cg")
-    sum_squares += tl.sum(x * x, axis=0)
-
-    return sum_squares
-
-
-# apply normalization to each block of the row
-@triton.jit
-def apply_normalization(input_ptr, output_ptr, g_ptr, input_row_stride, output_row_stride, n_cols, norm_factor,
-                        BLOCK_SIZE, row_idx):
-    col_offsets = tl.arange(0, BLOCK_SIZE)
-    row_input_ptr = input_ptr + row_idx * input_row_stride
-    row_output_ptr = output_ptr + row_idx * output_row_stride
-
-    for start in range(0, n_cols, BLOCK_SIZE):
-        cols = start + col_offsets
-        mask = cols < n_cols
-        input_ptrs = row_input_ptr + cols
-        input_ptrs = tl.multiple_of(input_ptrs, (16, ))
-        g_ptrs = g_ptr + cols
-        output_ptrs = row_output_ptr + cols
-        x = tl.load(input_ptrs, mask=mask, other=0.0, cache_modifier=".cg")
-        g = tl.load(g_ptrs, mask=mask, other=0.0)
-        rms_norm = x * norm_factor * g
-        tl.store(output_ptrs, rms_norm, mask=mask)
-
-
-# Main kernel with both blocked and non-blocked versions based on BLOCK_SIZE
 @triton.autotune(configs=get_autotune_config(), key=['n_rows', 'n_cols'], use_cuda_graph=True)
 @triton.jit
 def rms_kernel(output_ptr, input_ptr, g_ptr, input_row_stride, output_row_stride, n_rows, n_cols, epsilon,
                BLOCK_SIZE: tl.constexpr, USE_BLOCKED: tl.constexpr, NUM_PRGMS: tl.constexpr):
-    row_idx = tl.program_id(0)  # Each program instance handles one row
+    row_start = tl.program_id(0)
     col_offsets = tl.arange(0, BLOCK_SIZE)
+    tl.assume(input_row_stride >= 0)
+    tl.assume(output_row_stride >= 0)
 
     if USE_BLOCKED:
-        # Blocked Approach: Accumulate sum of squares and normalize in chunks
-        sum_squares = accumulate_sum_squares(input_ptr, input_row_stride, n_cols, BLOCK_SIZE, row_idx)
-        mean_square = sum_squares / n_cols
-        norm_factor = tl.rsqrt(mean_square + epsilon)
 
-        # Apply normalization
-        apply_normalization(input_ptr, output_ptr, g_ptr, input_row_stride, output_row_stride, n_cols, norm_factor,
-                            BLOCK_SIZE, row_idx)
+        # Persistent loop for rows
+        for row_idx in tl.range(row_start, n_rows, NUM_PRGMS, num_stages=1):
+            row_input_ptr = input_ptr + row_idx * input_row_stride
+            row_output_ptr = output_ptr + row_idx * output_row_stride
+
+            # Accumulate sum of squares
+            sum_squares = tl.zeros([1], dtype=tl.float32)
+            n_cols_blks = tl.cdiv(n_cols, BLOCK_SIZE) - 1
+            for blk_idx in range(n_cols_blks, num_stages=1):
+                cols = blk_idx * BLOCK_SIZE + col_offsets
+                input_ptrs = row_input_ptr + cols
+                input_ptrs = tl.multiple_of(input_ptrs, (16, ))
+                x = tl.load(input_ptrs)
+                sum_squares += tl.sum(x * x, axis=0)
+
+            # Handle remainder
+            cols = n_cols_blks * BLOCK_SIZE + col_offsets
+            mask = cols < n_cols
+            input_ptrs = row_input_ptr + cols
+            input_ptrs = tl.multiple_of(input_ptrs, (16, ))
+            x = tl.load(input_ptrs, mask=mask, other=0.0, cache_modifier=".cg")
+            sum_squares += tl.sum(x * x, axis=0)
+
+            # Compute normalization factor
+            mean_square = sum_squares / n_cols
+            norm_factor = tl.rsqrt(mean_square + epsilon)
+
+            # Normalize and write output
+            for blk_idx in range(n_cols_blks, num_stages=1):
+                cols = blk_idx * BLOCK_SIZE + col_offsets
+                input_ptrs = row_input_ptr + cols
+                input_ptrs = tl.multiple_of(input_ptrs, (16, ))
+                x = tl.load(input_ptrs)
+                g_ptrs = g_ptr + cols
+                g = tl.load(g_ptrs)
+                rms_norm = x * norm_factor * g
+                output_ptrs = row_output_ptr + cols
+                tl.store(output_ptrs, rms_norm)
+
+            # Handle remainder
+            cols = n_cols_blks * BLOCK_SIZE + col_offsets
+            mask = cols < n_cols
+            input_ptrs = row_input_ptr + cols
+            x = tl.load(input_ptrs, mask=mask, other=0.0, cache_modifier=".cg")
+            g_ptrs = g_ptr + cols
+            g = tl.load(g_ptrs, mask=mask, other=0.0)
+            rms_norm = x * norm_factor * g
+            output_ptrs = row_output_ptr + cols
+            tl.store(output_ptrs, rms_norm, mask=mask)
 
     else:
         mask = col_offsets < n_cols
-        tl.assume(input_row_stride >= 0)
-        tl.assume(output_row_stride >= 0)
-        row_start_ptr = input_ptr + row_idx * input_row_stride
-        input_ptrs = row_start_ptr + col_offsets
-        input_ptrs = tl.multiple_of(input_ptrs, (16, ))
-        row = tl.load(input_ptrs, mask=mask, other=0.0, cache_modifier=".cg")
-        g = tl.load(g_ptr + col_offsets, mask=mask, other=0.0)
-        row_norm = row * row
-        row_norm = tl.sum(row_norm, axis=-1)
-        row_norm = row_norm / n_cols
-        row_norm = row_norm + epsilon
-        row_norm = tl.rsqrt(row_norm)
-        rms_norm = row * row_norm
-        rms_norm = rms_norm * g
-
-        output_row_start_ptr = output_ptr + row_idx * output_row_stride
-        output_ptrs = output_row_start_ptr + col_offsets
-        output_ptrs = tl.multiple_of(output_ptrs, (16, ))
-        tl.store(output_ptrs, rms_norm, mask=mask)
-
-
-def triton_rmsnorm(x, y, g, n_rows, n_cols, blk_size, epsilon=1e-6):
-    BLOCK_SIZE = blk_size
-    # Use blocked approach if BLOCK_SIZE larger than 65536 // x.element_size()
-    USE_BLOCKED = n_cols > BLOCK_SIZE
-
-    NUM_PRGMS = n_rows
+        for row_idx in tl.range(row_start, n_rows, NUM_PRGMS, num_stages=1):
+            row_start_ptr = input_ptr + row_idx * input_row_stride
+            input_ptrs = row_start_ptr + col_offsets
+            input_ptrs = tl.multiple_of(input_ptrs, (16, ))
+            row = tl.load(input_ptrs, mask=mask, other=0.0, cache_modifier=".cg")
+            g = tl.load(g_ptr + col_offsets, mask=mask, other=0.0)
+            row_norm = row * row
+            row_norm = tl.sum(row_norm, axis=-1)
+            row_norm = row_norm / n_cols
+            row_norm = row_norm + epsilon
+            row_norm = tl.rsqrt(row_norm)
+            rms_norm = row * row_norm
+            rms_norm = rms_norm * g
+
+            output_row_start_ptr = output_ptr + row_idx * output_row_stride
+            output_ptrs = output_row_start_ptr + col_offsets
+            output_ptrs = tl.multiple_of(output_ptrs, (16, ))
+            tl.store(output_ptrs, rms_norm, mask=mask)
+
+
+def triton_rmsnorm(x, y, g, n_rows, n_cols, blk_size, USE_BLOCKED, NUM_PRGMS, epsilon=1e-6):
     grid = lambda meta: (NUM_PRGMS, )
-    rms_kernel[grid](y, x, g, x.stride(0), y.stride(0), n_rows, n_cols, epsilon, BLOCK_SIZE, USE_BLOCKED, NUM_PRGMS)
+    rms_kernel[grid](y, x, g, x.stride(0), y.stride(0), n_rows, n_cols, epsilon, blk_size, USE_BLOCKED, NUM_PRGMS)
 
     return y
 
@@ -165,8 +160,10 @@ def test_rmsnorm(M, N):
     n_rows, n_cols = x.shape
     MAX_FUSED_SIZE = 65536 // x.element_size()
     blk_size = min(MAX_FUSED_SIZE, triton.next_power_of_2(n_cols))
+    USE_BLOCKED = n_cols > blk_size
+    NUM_PRGMS = min(n_rows, get_num_sms())
     g = torch.ones((1, N), device='cuda')
-    y_triton = triton_rmsnorm(x, y, g, n_rows, n_cols, blk_size)
+    y_triton = triton_rmsnorm(x, y, g, n_rows, n_cols, blk_size, USE_BLOCKED, NUM_PRGMS)
 
     y_torch = torch_rmsnorm(x, g)
 
@@ -219,16 +216,20 @@ def benchmark(M, N, provider):
         n_rows, n_cols = x.shape
         MAX_FUSED_SIZE = 65536 // x.element_size()
         blk_size = min(MAX_FUSED_SIZE, triton.next_power_of_2(n_cols))
+        USE_BLOCKED = n_cols > blk_size
+        NUM_PRGMS = min(n_rows, get_num_sms())
         stream = torch.cuda.Stream()
         torch.cuda.set_stream(stream)
         g = torch.ones((1, N), device='cuda')
         if provider == 'torch':
             ms = triton.testing.do_bench(lambda: torch_rmsnorm(x, g))
         if provider == 'triton':
-            ms = triton.testing.do_bench(lambda: triton_rmsnorm(x, y, g, n_rows, n_cols, blk_size))
+            ms = triton.testing.do_bench(
+                lambda: triton_rmsnorm(x, y, g, n_rows, n_cols, blk_size, USE_BLOCKED, NUM_PRGMS))
             global verbose
             if verbose:
                 print(f'SIZE: {N} Best tuning config: ({rms_kernel.best_config})')
+                print(f'time: {ms}')
         gbps = lambda ms: 2 * x.nelement() * x.element_size() * 1e-9 / (ms * 1e-3)
         return gbps(ms)
 
@@ -266,8 +267,10 @@ def main():
         n_rows, n_cols = x.shape
         MAX_FUSED_SIZE = 65536 // x.element_size()
         blk_size = min(MAX_FUSED_SIZE, triton.next_power_of_2(n_cols))
+        USE_BLOCKED = n_cols > blk_size
+        NUM_PRGMS = min(n_rows, get_num_sms())
         g = torch.ones((1, args.N_start), device='cuda')
-        triton_rmsnorm(x, y, g, n_rows, n_cols, blk_size)
+        triton_rmsnorm(x, y, g, n_rows, n_cols, blk_size, USE_BLOCKED, NUM_PRGMS)
     else:
         verbose = args.v
         run_benchmark(args)