Commit a5e9d8c

Double row matmul (#49)
* add: double row matmul kernel
1 parent 97e49c9 commit a5e9d8c

File tree

2 files changed: +334 -0
nki_samples/reference/double_row_matmul.py: 202 additions & 0 deletions
@@ -0,0 +1,202 @@
"""
Copyright (c) 2025, Amazon.com. All Rights Reserved

kernels - Builtin high performance NKI kernels.

"""
from neuronxcc import nki
import neuronxcc.nki.isa as nisa
import neuronxcc.nki.language as nl

@nki.jit(platform_target='trn2')
def quantized_double_row_matmul(
    lhs,
    rhs_quantized, rhs_scale,
    # Meta-parameters
    TILES_IN_BLOCK_M,
    TILES_IN_BLOCK_N,
    TILES_IN_BLOCK_K
):
  """NKI kernel to compute a matrix multiplication by blocking along all dimensions
  and performing fp8_e4m3 quantization on the lhs matrix.

  Args:
    lhs: an unquantized input tensor of shape [M,K], where K is a multiple of
      128 * TILES_IN_BLOCK_K and M is a multiple of 128 * TILES_IN_BLOCK_M. It is the
      left-hand-side argument of the matrix multiplication.
    rhs_quantized: a pre-quantized input tensor of dtype float8_e4m3 and of shape
      [K // 2, 2 * N] (reshaped from the original [K,N] rhs), where K is a multiple of
      128 * TILES_IN_BLOCK_K and N is a multiple of 512 * TILES_IN_BLOCK_N. It is the
      right-hand-side argument of the matrix multiplication. See test_double_row_matmul.py
      for the expected reshape to be performed on the original rhs matrix.
    rhs_scale: the column-wise quantization scale of rhs, of shape [128, N],
      pre-broadcast from [1, N].
    TILES_IN_BLOCK_*: meta-parameters to control the blocking dimensions.

  Returns:
    result: the resulting output tensor of shape [M,N]
  """

  assert rhs_quantized.dtype == nl.float8_e4m3, "rhs must be pre-quantized to dtype float8_e4m3"

  M, K = lhs.shape
  K_RESHAPED, N_RESHAPED = rhs_quantized.shape
  K_ = 2 * K_RESHAPED

  assert K == K_, "lhs and rhs must have the same contraction dimension"

  assert N_RESHAPED % 2 == 0, f"N_RESHAPED={N_RESHAPED} must be divisible by 2"
  N = N_RESHAPED // 2

  TILE_M = nl.tile_size.gemm_stationary_fmax  # 128
  TILE_K = nl.tile_size.pmax  # 128
  TILE_N = nl.tile_size.gemm_moving_fmax  # 512

  BLOCK_M = TILE_M * TILES_IN_BLOCK_M
  BLOCK_N = TILE_N * TILES_IN_BLOCK_N
  BLOCK_K = TILE_K * TILES_IN_BLOCK_K

  # Each dimension size has to be a multiple of its block size.
  assert M % BLOCK_M == 0
  assert N % BLOCK_N == 0
  assert K % BLOCK_K == 0

  NUM_BLOCK_M = M // BLOCK_M
  NUM_BLOCK_N = N // BLOCK_N
  NUM_BLOCK_K = K // BLOCK_K

  # dtype fp8_e4m3 can represent [-240, 240].
  FP8_RANGE = 240

  assert TILES_IN_BLOCK_K % 2 == 0, f"TILES_IN_BLOCK_K={TILES_IN_BLOCK_K} must be even to load 2 tiles at a time for double row matmul"

  result = nl.ndarray((M, N), dtype=lhs.dtype, buffer=nl.shared_hbm)

  # Blocking M dimension (lhs partition dimension).
  for m in nl.affine_range(NUM_BLOCK_M):
    result_tiles = nl.zeros((TILE_M, NUM_BLOCK_N * TILES_IN_BLOCK_M * TILES_IN_BLOCK_N * TILE_N),
                            dtype=lhs.dtype,
                            buffer=nl.sbuf)

    # Blocking K dimension (the contraction dimension).
    # Use `sequential_range` because we do not want the compiler to change this loop by,
    # for example, vectorizing it.
    for k in nl.sequential_range(NUM_BLOCK_K):
      lhsT_quantized_tiles = nl.ndarray((TILES_IN_BLOCK_M, nl.par_dim(TILE_M), BLOCK_K),
                                        dtype=nl.float8_e4m3,
                                        buffer=nl.sbuf)
      lhsT_scale_tiles = nl.ndarray((TILES_IN_BLOCK_M, nl.par_dim(TILE_M), 1),
                                    dtype=lhs.dtype,
                                    buffer=nl.sbuf)

      i_lhs = nl.mgrid[0:TILE_M, 0:BLOCK_K]
      for bm_l in nl.affine_range(TILES_IN_BLOCK_M):
        # Load and quantize tiles from lhs,
        # setting the load tile to [TILE_M, BLOCK_K] to optimize DMA performance.
        lhs_i_m = m * BLOCK_M + bm_l * TILE_M + i_lhs.p
        lhs_i_k = k * BLOCK_K + i_lhs.x

        tile_block = nl.load(lhs[lhs_i_m, lhs_i_k])

        # FIXME: use nisa.tensor_scalar_reduce to fuse nl.abs and nisa.tensor_reduce into
        # 1 operation.
        abs_tile_block = nl.abs(tile_block)
        lhsT_scale_tiles[bm_l] = nisa.tensor_reduce(nl.max,
                                                    abs_tile_block,
                                                    axis=[1])
        lhsT_scale_tiles[bm_l] = nl.divide(lhsT_scale_tiles[bm_l], FP8_RANGE)
        lhsT_quantized_tiles[bm_l] = nl.divide(tile_block, lhsT_scale_tiles[bm_l])

        # For each [TILE_M, TILE_K] tile, since TILE_K == TILE_M and the K dimension needs
        # to be along the partition dimension, transpose said tile in-place.
        for bk_l in nl.affine_range(TILES_IN_BLOCK_K):
          # FIXME: use dma_transpose instead of nc_transpose.
          lhsT_quantized_tiles[bm_l, :, TILE_M * bk_l:(bk_l + 1) * TILE_M] = nisa.nc_transpose(
              lhsT_quantized_tiles[bm_l, :, TILE_M * bk_l:(bk_l + 1) * TILE_M])

      # Each lhs block's matmul results need to be dequantized independently of the other
      # lhs blocks' results. scoped_result_tiles stores the not-yet-dequantized matmul
      # results scoped to the current `for m` and `for k` iterations.
      scoped_result_tiles = nl.zeros((TILE_M, NUM_BLOCK_N * TILES_IN_BLOCK_M * TILES_IN_BLOCK_N * TILE_N),
                                     dtype=lhs.dtype,
                                     buffer=nl.sbuf)

      for n in nl.affine_range(NUM_BLOCK_N):
        # Loading tiles from rhs,
        # setting the load tile to [TILE_K, 2 * BLOCK_N] to optimize DMA performance
        # (i.e. loading 2 rows of a rhs block at a time).
        i_rhs = nl.mgrid[0:TILE_K, 0:2 * BLOCK_N]

        rhs_quantized_tiles = nl.ndarray((TILES_IN_BLOCK_K // 2, nl.par_dim(TILE_K), 2 * BLOCK_N), dtype=rhs_quantized.dtype)
        for bk_r in nl.affine_range(TILES_IN_BLOCK_K // 2):
          rhs_quantized_i_k = (k * TILES_IN_BLOCK_K // 2 + bk_r) * TILE_K + i_rhs.p
          rhs_quantized_i_n = 2 * n * BLOCK_N + i_rhs.x
          rhs_quantized_tiles[bk_r] = nl.load(rhs_quantized[rhs_quantized_i_k, rhs_quantized_i_n])

        # Do matmuls with all tiles in the loaded lhs and rhs blocks.
        i_res_mm = nl.mgrid[0:TILE_M, 0:TILE_N]
        for bm in nl.affine_range(TILES_IN_BLOCK_M):
          for bn in nl.affine_range(TILES_IN_BLOCK_N):
            res_tile = nl.zeros((TILE_M, TILE_N), dtype=nl.float32, buffer=nl.psum)
            for bk in nl.affine_range(TILES_IN_BLOCK_K // 2):
              i_k, i_tile_m, i_m = nl.mgrid[0:TILE_K, 0:2, 0:TILE_M]
              lhsT_double_tile = lhsT_quantized_tiles[
                  bm,
                  i_k,
                  bk * (2 * TILE_M) + i_tile_m * TILE_M + i_m
              ]
              assert lhsT_double_tile.shape == (TILE_K, 2, TILE_M)

              i_k, i_tile_n, i_n = nl.mgrid[0:TILE_K, 0:2, 0:TILE_N]
              rhs_double_tile = rhs_quantized_tiles[
                  bk,
                  i_k,
                  2 * bn * TILE_N + i_tile_n * TILE_N + i_n
              ]
              assert rhs_double_tile.shape == (TILE_K, 2, TILE_N)

              res_tile[...] += nisa.nc_matmul(lhsT_double_tile,
                                              rhs_double_tile,
                                              perf_mode='double_row_gen3')

            i_scoped_result_tiles_k = i_res_mm.p
            i_scoped_result_tiles_n = bm * (NUM_BLOCK_N * BLOCK_N) + n * BLOCK_N + bn * TILE_N + i_res_mm.x
            scoped_result_tiles[i_scoped_result_tiles_k, i_scoped_result_tiles_n] += res_tile[...]

      # FIXME: dequantize with both the lhs and rhs scales using nisa.scalar_tensor_tensor when
      # accumulating from PSUM to SBUF.
      # Partially dequantize matmul results using the lhs block scale.
      i_scoped_result_tiles = nl.mgrid[0:TILE_K, 0:NUM_BLOCK_N * BLOCK_N]
      for bm in nl.affine_range(TILES_IN_BLOCK_M):
        result_tiles_i_k = i_scoped_result_tiles.p
        result_tiles_i_n = bm * NUM_BLOCK_N * BLOCK_N + i_scoped_result_tiles.x
        dequantized_tile_block = nisa.tensor_tensor(
            scoped_result_tiles[result_tiles_i_k, result_tiles_i_n],
            lhsT_scale_tiles[bm],
            nl.multiply
        )

        result_tiles[result_tiles_i_k, result_tiles_i_n] += dequantized_tile_block

    # Dequantize matmul results using the rhs scale and copy results from SBUF to HBM.
    rhs_scale_sbuf = nl.ndarray(rhs_scale.shape, buffer=nl.sbuf, dtype=rhs_scale.dtype)
    rhs_scale_sbuf = nl.load(rhs_scale)

    i_result = nl.mgrid[0:TILE_M, 0:N]
    for bm in nl.affine_range(TILES_IN_BLOCK_M):
      result_tiles_i_k = i_result.p
      result_tiles_i_n = bm * (NUM_BLOCK_N * BLOCK_N) + i_result.x

      result_i_m = m * BLOCK_M + bm * TILE_M + i_result.p
      result_i_n = i_result.x

      # FIXME: remove once dequantization is done with nisa.scalar_tensor_tensor above.
      dequantized = nisa.tensor_tensor(
          result_tiles[result_tiles_i_k, result_tiles_i_n],
          rhs_scale_sbuf,
          nl.multiply
      )

      nl.store(result[result_i_m, result_i_n], value=dequantized)

  return result
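
For intuition, the quantize / matmul / dequantize flow the kernel implements can be sketched in plain NumPy. This is an illustrative model only, not part of the commit: it collapses the per-(row, K-block) lhs scale the kernel computes into a single per-row scale, and it does not simulate fp8_e4m3 rounding, so the round-trip below is exact up to float error.

import numpy as np

FP8_RANGE = 240  # max magnitude of the fp8_e4m3 variant targeted by the kernel above

def reference_quantized_matmul(lhs, rhs):
  # Hypothetical NumPy model of the kernel's math; fp8 rounding is not simulated.
  # Row-wise lhs scale (the kernel derives one scale per row per K block; collapsed here).
  lhs_scale = np.max(np.abs(lhs), axis=1, keepdims=True) / FP8_RANGE   # [M, 1]
  lhs_q = lhs / lhs_scale

  # Column-wise rhs scale, matching column_wise_quantize in the test file below.
  rhs_scale = np.max(np.abs(rhs), axis=0, keepdims=True) / FP8_RANGE   # [1, N]
  rhs_q = rhs / rhs_scale

  # Matmul in the quantized domain, then dequantize with both scales.
  return (lhs_q @ rhs_q) * lhs_scale * rhs_scale

# Without fp8 rounding the dequantized product matches the unquantized matmul.
lhs = np.random.rand(64, 256)
rhs = np.random.rand(256, 32)
assert np.allclose(reference_quantized_matmul(lhs, rhs), lhs @ rhs)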
test_double_row_matmul.py: 132 additions & 0 deletions
@@ -0,0 +1,132 @@
"""
Copyright (c) 2025, Amazon.com. All Rights Reserved
"""
import pytest
from nki_samples.reference.double_row_matmul import quantized_double_row_matmul
from neuronxcc.nki import benchmark, baremetal, simulate_kernel
import neuronxcc.nki.language as nl
import numpy as np

xfail = pytest.mark.arch_specific_xfail


bench_func = benchmark(warmup=5, iters=10)(quantized_double_row_matmul)

def reshape(matrix):
  """
  Interleaves the [128,512] tiles of every 2 consecutive tile rows.

  A [K,N] matrix is reshaped into [K//2, 2*N] where K must be divisible by 2 * 128 and
  N must be divisible by 512.

  E.g. if Tij is the (i,j)-th tile and assuming a matrix with 4x4 [128,512] tiles,
  the reshaped matrix looks as follows

  # T11 T12 T13 T14
  # T21 T22 T23 T24  reshape   T11 T21 T12 T22 T13 T23 T14 T24
  # T31 T32 T33 T34 ---------> T31 T41 T32 T42 T33 T43 T34 T44
  # T41 T42 T43 T44
  """
  K, N = matrix.shape

  TILE_K = 128
  TILE_N = 512

  assert K % (2 * TILE_K) == 0
  assert N % TILE_N == 0

  result = np.zeros((K // 2, 2 * N))

  for k in range(0, K // TILE_K, 2):
    for n in range(N // TILE_N):
      # Get 2 tiles in the same tile column and consecutive tile rows.
      tile1 = matrix[k * TILE_K:(k + 1) * TILE_K, n * TILE_N:(n + 1) * TILE_N]
      tile2 = matrix[(k + 1) * TILE_K:(k + 2) * TILE_K, n * TILE_N:(n + 1) * TILE_N]

      # Place the 2 tiles in the same tile row side by side.
      result[(k // 2) * TILE_K:(k // 2 + 1) * TILE_K, n * TILE_N * 2:n * TILE_N * 2 + TILE_N] = tile1
      result[(k // 2) * TILE_K:(k // 2 + 1) * TILE_K, n * TILE_N * 2 + TILE_N:(n + 1) * TILE_N * 2] = tile2

  return result

def column_wise_quantize(matrix):
  """
  Quantizes a matrix.

  Returns a column-wise scale broadcasted to (128, matrix.shape[1]) and the quantized matrix.
  """
  FP8_RANGE = 240
  column_wise_max = np.max(np.abs(matrix), axis=0, keepdims=True)
  column_wise_scale = column_wise_max / FP8_RANGE

  matrix_quantized = matrix / column_wise_scale
  column_wise_scale = np.broadcast_to(column_wise_scale, (128, matrix.shape[1]))

  return column_wise_scale, matrix_quantized

class TestDoubleRowMatmul:

  @xfail(fail=['trn1'])
  @pytest.mark.parametrize("M, K, N, dtype, TILES_IN_BLOCK_M, TILES_IN_BLOCK_N, TILES_IN_BLOCK_K, max_p99_latency", [
    [512, 16 * 1024, 1024, nl.bfloat16, 2, 2, 16, 320],
  ])
  def test_double_row_matmul_perf(self, M, K, N, dtype, TILES_IN_BLOCK_M, TILES_IN_BLOCK_N, TILES_IN_BLOCK_K, max_p99_latency):
    # Initializing random inputs
    lhs = np.random.rand(M, K)
    rhs = np.random.rand(K, N)

    # Quantizing rhs
    rhs_scale, rhs_quantized = column_wise_quantize(rhs)
    rhs_quantized_reshaped = reshape(rhs_quantized)

    # Casting to the correct data types (rhs is pre-quantized, thus cast to FP8)
    lhs = nl.static_cast(lhs, dtype)
    rhs_scale = nl.static_cast(rhs_scale, dtype)
    rhs_quantized_reshaped = nl.static_cast(rhs_quantized_reshaped, nl.float8_e4m3)

    # Latency checks
    bench_func(lhs, rhs_quantized_reshaped, rhs_scale, TILES_IN_BLOCK_M, TILES_IN_BLOCK_N, TILES_IN_BLOCK_K)
    latency_res = bench_func.benchmark_result.nc_latency
    p99_latency = latency_res.get_latency_percentile(99)

    assert p99_latency <= max_p99_latency

  @xfail(fail=['trn1'])
  @pytest.mark.simulation
  @pytest.mark.parametrize("M, K, N, dtype, TILES_IN_BLOCK_M, TILES_IN_BLOCK_N, TILES_IN_BLOCK_K", [
    [512, 16 * 1024, 1024, nl.bfloat16, 2, 2, 16],
    [512, 16 * 1024, 1024, nl.bfloat16, 4, 1, 32],
    [512, 16 * 1024, 1024, nl.bfloat16, 4, 2, 128],
  ])
  def test_double_row_matmul_numerical(self, simulation_only, M, K, N, dtype, TILES_IN_BLOCK_M, TILES_IN_BLOCK_N, TILES_IN_BLOCK_K):
    # Initializing random inputs
    lhs = np.random.rand(M, K)
    rhs = np.random.rand(K, N)

    # Correct CPU results
    result_golden = np.matmul(lhs, rhs)

    # Quantizing rhs
    rhs_scale, rhs_quantized = column_wise_quantize(rhs)
    rhs_quantized_reshaped = reshape(rhs_quantized)

    # Casting to the correct data types (rhs is pre-quantized, thus cast to FP8)
    lhs = nl.static_cast(lhs, dtype)
    rhs_scale = nl.static_cast(rhs_scale, dtype)
    rhs_quantized_reshaped = nl.static_cast(rhs_quantized_reshaped, nl.float8_e4m3)

    # Numerical accuracy checks
    numeric_func = baremetal(quantized_double_row_matmul)

    if simulation_only:
      result_nki = simulate_kernel(numeric_func, lhs, rhs_quantized_reshaped, rhs_scale, TILES_IN_BLOCK_M, TILES_IN_BLOCK_N, TILES_IN_BLOCK_K)
    else:
      result_nki = numeric_func(lhs, rhs_quantized_reshaped, rhs_scale, TILES_IN_BLOCK_M, TILES_IN_BLOCK_N, TILES_IN_BLOCK_K)

    # Casting result_nki from dtype BF16 back to FP32 to compare the NumPy and NKI results
    result_nki = result_nki.astype(np.float32)

    assert np.allclose(result_golden, result_nki, rtol=2e-2)
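
As a quick illustration of the tile interleaving that `reshape` performs (and that the kernel's docstring refers to), the snippet below builds a small matrix of 4x2 constant-valued [128, 512] tiles and checks where each tile lands. It is a hypothetical usage sketch, not part of the test suite, and assumes the `reshape` helper defined above is in scope.

import numpy as np

TILE_K, TILE_N = 128, 512

# 4x2 grid of constant tiles: tile (i, j) is filled with the value 10 * i + j.
tile_ids = 10 * np.arange(4)[:, None] + np.arange(2)         # [[0, 1], [10, 11], [20, 21], [30, 31]]
matrix = np.kron(tile_ids, np.ones((TILE_K, TILE_N)))         # shape [512, 1024]

reshaped = reshape(matrix)                                    # shape [256, 2048]

# Tile rows 0 and 1 are interleaved into the first reshaped tile row: T00 T10 T01 T11.
assert (reshaped[:TILE_K, 0 * TILE_N:1 * TILE_N] == 0).all()   # T00
assert (reshaped[:TILE_K, 1 * TILE_N:2 * TILE_N] == 10).all()  # T10
assert (reshaped[:TILE_K, 2 * TILE_N:3 * TILE_N] == 1).all()   # T01
assert (reshaped[:TILE_K, 3 * TILE_N:4 * TILE_N] == 11).all()  # T11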
