# dma_complex_dims/aie2.py -*- Python -*-
#
# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
# (c) Copyright 2025 Advanced Micro Devices, Inc. or its affiliates

# REQUIRES: ryzen_ai, peano
#
# RUN: %python %S/aie2.py --m 8 --k 5 --K 20 --r 4 --s 5 > ./aie2.mlir
# RUN: %python aiecc.py --no-aiesim --no-xchesscc --aie-generate-cdo --aie-generate-npu --aie-generate-xclbin --no-compile-host --xclbin-name=final.xclbin --npu-insts-name=insts.txt ./aie2.mlir
# RUN: clang %S/test.cpp -o test.exe -std=c++17 -Wall %xrt_flags -lrt -lstdc++ %test_utils_flags
# RUN: %run_on_npu ./test.exe -x final.xclbin -i insts.txt -k MLIR_AIE --m 8 --k 5 --K 20 --r 4 --s 5
import argparse
import sys

import numpy as np

from aie.dialects.aie import *
from aie.dialects.aiex import *
from aie.extras.context import mlir_mod_ctx
from aie.helpers.dialects.ext.scf import _for as range_
from aie.helpers.taplib import TensorAccessPattern
| 24 | + |
| 25 | + |
# this resembles the buffer A data layout and transformations
def my_passthrough(m: int, k: int, K: int, r: int, s: int) -> None:
    """Build the pass-through design and print the generated MLIR module.

    The design declares a shim tile, a memory tile and a compute tile and
    moves data shim -> mem -> compute -> mem -> shim through four object
    fifos.  The compute core copies every acquired ``m x k`` input object
    element by element into an output object.

    Args:
        m: number of rows of both the memory tile and the compute tile.
        k: number of columns of the compute tile (small tile).
        K: number of columns of the memory tile (larger tile); must be a
            multiple of ``k``.
        r: block size that must divide ``m``; used in the DMA dimension
            lists of the mem-to-comp object fifo.
        s: block size that must divide ``k``; used in the DMA dimension
            lists of the mem-to-comp object fifo.

    Raises:
        AssertionError: if any dimension is not positive or one of the
            divisibility requirements does not hold.
    """
    # every value is used as a tile extent or as a divisor below, so reject
    # zero/negative input up front instead of failing with ZeroDivisionError
    assert min(m, k, K, r, s) > 0, "all dimensions must be positive"

    # large K must be divisible by small k
    assert K % k == 0, "K must be divisible by k"

    # assertions for m and k which should be divisible by the API sizes
    assert m % r == 0, "m must be divisible by r"
    assert k % s == 0, "k must be divisible by s"

    # compute tile is m x k (small tile)
    comp_tile_ty = np.ndarray[(m, k), np.dtype[np.int32]]

    # memory tile is m x K (larger tile)
    mem_tile_ty = np.ndarray[(m, K), np.dtype[np.int32]]

    with mlir_mod_ctx() as ctx:

        @device(AIEDevice.npu1_1col)
        def device_body():

            # Tile declarations
            ShimTile = tile(0, 0)
            MemTile = tile(0, 1)
            ComputeTile = tile(0, 2)

            # AIE-array data movement with object fifos

            # Input
            of_in_shim_to_mem = object_fifo(
                "shim_to_mem",
                ShimTile,
                MemTile,
                2,
                mem_tile_ty,
            )

            of_in_mem_to_comp = object_fifo(
                "mem_to_comp",
                MemTile,
                ComputeTile,
                2,
                comp_tile_ty,
                # 4D transformation in MemTile (MM2S)
                # Assumes that the "higher" MemTile size
                # defines the 4D transformation
                [
                    (K // k, m * k),
                    (k // s, s),
                    (m, k),
                    (s, 1),
                ],
                # 3D transformation in CompTile (S2MM)
                [
                    [
                        (k // s, r * s),
                        (m // r, r * k),
                        (r * s, 1),
                    ]
                ],
            )

            # links mem to comp
            object_fifo_link(of_in_shim_to_mem, of_in_mem_to_comp)

            # Output
            of_out_comp_to_mem = object_fifo(
                "comp_to_mem",
                ComputeTile,
                MemTile,
                2,
                comp_tile_ty,
            )

            of_out_mem_to_shim = object_fifo(
                "mem_to_shim", MemTile, ShimTile, 2, mem_tile_ty
            )

            # links comp to mem
            object_fifo_link(of_out_comp_to_mem, of_out_mem_to_shim)

            # Compute tile just passes, doesn't do any operation
            @core(ComputeTile)
            def core_body():
                for _ in range_(sys.maxsize):
                    # one m x K memory tile is consumed as K // k compute tiles
                    for _ in range_(K // k):
                        elem_in = of_in_mem_to_comp.acquire(ObjectFifoPort.Consume, 1)
                        elem_out = of_out_comp_to_mem.acquire(ObjectFifoPort.Produce, 1)
                        for i in range_(m):
                            for j in range_(k):
                                elem_out[i, j] = elem_in[i, j]

                        of_in_mem_to_comp.release(ObjectFifoPort.Consume, 1)
                        of_out_comp_to_mem.release(ObjectFifoPort.Produce, 1)

            # set the runtime type as 1D array
            runtime_ty = np.ndarray[(m * K,), np.dtype[np.int32]]

            # To/from AIE-array data movement
            # (B is part of the three-buffer signature but is not used here)
            @runtime_sequence(runtime_ty, runtime_ty, runtime_ty)
            def sequence(A, B, C):
                npu_dma_memcpy_nd(
                    metadata=of_in_shim_to_mem,
                    bd_id=1,
                    mem=A,
                    sizes=[1, 1, 1, m * K],
                )

                npu_dma_memcpy_nd(
                    metadata=of_out_mem_to_shim, bd_id=0, mem=C, sizes=[1, 1, 1, m * K]
                )
                # wait only on output since input will have completed before output
                dma_wait(of_out_mem_to_shim)

        print(ctx.module)
| 141 | + |
| 142 | + |
def parse_dims(argv=None):
    """Parse the five design dimensions from the command line.

    Two spellings are accepted so that both the positional form and the
    flag form used by this file's RUN lines work:

        aie2.py 8 5 20 4 5
        aie2.py --m 8 --k 5 --K 20 --r 4 --s 5

    A flag, when given, takes precedence over the positional value for the
    same dimension.

    Args:
        argv: argument list to parse; ``None`` means ``sys.argv[1:]``.

    Returns:
        A dict with the integer values for the keys ``m``, ``k``, ``K``,
        ``r`` and ``s``.

    Exits with status -1 after printing an error to stderr when the five
    dimensions are not all supplied.
    """
    names = ("m", "k", "K", "r", "s")

    p = argparse.ArgumentParser()
    p.add_argument("dims", help="m, k, K, r, s", type=int, nargs="*")
    for name in names:
        p.add_argument(
            f"--{name}",
            type=int,
            default=None,
            help=f"dimension {name} (alternative to the positional form)",
        )
    args = p.parse_args(argv)

    # positional values are all-or-nothing, exactly as before
    if args.dims and len(args.dims) != len(names):
        print("ERROR: Must provide all 5 dimensions", file=sys.stderr)
        sys.exit(-1)

    dims = dict(zip(names, args.dims))
    for name in names:
        value = getattr(args, name)
        if value is not None:
            dims[name] = value

    if len(dims) != len(names):
        print("ERROR: Must provide all 5 dimensions", file=sys.stderr)
        sys.exit(-1)

    return dims


if __name__ == "__main__":
    my_passthrough(**parse_dims())
0 commit comments