#
# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
# (c) Copyright 2024 AMD Inc.
import numpy as np
import sys

from aie.iron import (
    GlobalBuffer,
    Kernel,
    ObjectFifo,
    Program,
    Runtime,
    Worker,
    WorkerRuntimeBarrier,
)
from aie.iron.placers import SequentialPlacer
from aie.iron.device import NPU1Col1, NPU2Col1
from aie.iron.controlflow import range_


def conv2dk14(
    dev,
    width: int,
    height: int,
    in_channels: int,
    out_channels: int,
    kernel_size: int,
    trace_size: int,
):
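    """Build an IRON design for a 2D convolution applied to non-overlapping patches.

    The kernel_size x kernel_size kernel produces an output of
    width // kernel_size by height // kernel_size, i.e. one output pixel per patch.
    Activations are uint8, weights and outputs are int8, and each kernel call
    produces 16 output channels for 16 patches (sub_out_channels x sub_tiles).
    Returns the placed design as an MLIR module.
    """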
    enable_trace = 1 if trace_size > 0 else 0

    # Kernel processes 16 tiles and 16 output channels at a time
    sub_out_channels = 16
    sub_tiles = 16

    actIn = kernel_size * kernel_size * in_channels * sub_tiles
    weights = kernel_size * kernel_size * in_channels * sub_out_channels
    actOut = sub_tiles * sub_out_channels
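    # For the default 14x14 kernel with 4 input channels this gives
    # actIn = weights = 14 * 14 * 4 * 16 = 12544 bytes and actOut = 16 * 16 = 256 bytes.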

    out_channels_group = out_channels // sub_out_channels  # 72
    width_out = width // kernel_size
    height_out = height // kernel_size

    # The input is reloaded once per output-channel group
    # (out_channels // sub_out_channels = 72 times)
    tensorInSize = width * height * in_channels * out_channels_group
    # tensorInSize = width * height * in_channels * 2

    tensorWeightsSize = weights * out_channels_group
    tensorOutSize = width_out * height_out * sub_out_channels * out_channels_group

    N_in_bytes = tensorOutSize  # Number of bytes of output data (1 byte/elem)

    bufIn = kernel_size * width * in_channels
    bufOut = sub_out_channels * width_out * height_out
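    # bufIn stages one band of kernel_size full-width input rows (all channels) in L2;
    # bufOut holds one complete output plane for a 16-channel output group.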

    # Type definitions
    actIn_ty = np.ndarray[(actIn,), np.dtype[np.uint8]]
    bufIn_ty = np.ndarray[(bufIn,), np.dtype[np.uint8]]

    weights_ty = np.ndarray[(weights,), np.dtype[np.int8]]

    out_ty = np.ndarray[(actOut,), np.dtype[np.int8]]
    bufOut_ty = np.ndarray[(bufOut,), np.dtype[np.int8]]
    tensorIn_ty = np.ndarray[(tensorInSize,), np.dtype[np.uint8]]
    tensorWeights_ty = np.ndarray[(tensorWeightsSize,), np.dtype[np.int8]]
    tensorOut_ty = np.ndarray[(tensorOutSize,), np.dtype[np.int8]]

    # AIE Core Function declarations
    conv2dk14_i8_kernel = Kernel(
        "conv2dk14_i8",
        "conv2dk14.o",
        [
            actIn_ty,
            weights_ty,
            out_ty,
            np.int32,
            np.int32,
            np.int32,
            np.int32,
            np.int32,
        ],
    )
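    # The five scalar (int32) arguments match the call in core_fn below:
    # (x_dim, in_channels, out_channels, kernel_size, scale).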

    # AIE-array data movement with object fifos
    # Input
    of_inOF_act_L3L2 = ObjectFifo(
        bufIn_ty,
        name="inOF_act_L3L2",
        dims_from_stream_per_cons=[
            (kernel_size, kernel_size * in_channels),  # (14, 56)
            (64, kernel_size * kernel_size * in_channels),  # (64, 784)
            (kernel_size * in_channels, 1),  # (56, 1)
        ],
    )
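    # dims_from_stream_per_cons is a list of (size, stride) pairs; the consumer-side
    # DMA uses it to reorder the incoming row-major rows on the fly (inline comments
    # give the concrete values for the 14x14, 4-channel case). The hardcoded 64
    # appears to assume a fixed number of patches per row.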
    of_act_L2_02 = of_inOF_act_L3L2.cons().forward(
        obj_type=actIn_ty,
        name="act_L2_02",
        dims_to_stream=[
            (2, kernel_size * kernel_size * in_channels * 8),  # (2, 6272)
            (kernel_size * kernel_size // 2, 2 * in_channels),  # (98, 8)
            (8, kernel_size * kernel_size * in_channels),  # (8, 784)
            (2 * in_channels, 1),  # (8, 1)
        ],
    )
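    # These (size, stride) pairs cover 2 * 98 * 8 * 8 = 12544 elements, i.e. exactly
    # one actIn object (16 patches), reordered as it streams from L2 to the core.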

    # wts
    of_inOF_wts_0_L3L2 = ObjectFifo(weights_ty, depth=1, name="inOF_wts_0_L3L2")

    # Output
    of_out_02_L2 = ObjectFifo(out_ty, name="out_02_L2")
    of_outOFL2L3 = of_out_02_L2.cons().forward(
        obj_type=bufOut_ty,
        name="outOFL2L3",
        dims_to_stream=[(256, 256), (16, 8), (2, 128), (8, 1)],
    )
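    # dims_to_stream reorders each 16-channel output plane as it drains toward L3;
    # note these (size, stride) constants are hardcoded rather than derived from
    # width/height, so they appear to assume a fixed output geometry.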

    # Setup a global buffer to hold runtime parameters
    # rtp = GlobalBuffer(
    #     np.ndarray[(16,), np.dtype[np.int32]],
    #     name="rtp",
    #     use_write_rtp=True,
    # )

    # rtp_barrier = WorkerRuntimeBarrier()

    # Task for the core to perform
    # def core_fn(of_wts, of_act, of_out, my_rtp, conv2dk14_i8, barrier):
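    # Weights are acquired once and reused for every spatial block; each loop
    # iteration consumes one activation object (16 patches) and produces one
    # 16x16 output object.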
    def core_fn(of_wts, of_act, of_out, conv2dk14_i8):
        y_dim = height // kernel_size
        x_blocks = 4
        x_dim = width // x_blocks  # num pixels for 1/4 of a row
        ci = in_channels
        co = sub_out_channels

        # barrier.wait_for_value(1)
        # scale = my_rtp[0]
        scale = 14
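        # scale is passed straight through to the kernel; it is hardcoded here,
        # while the commented-out RTP/barrier path would let the host set it at runtime.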

        elemWts = of_wts.acquire(1)

        for _ in range_(y_dim):
            for _ in range_(x_blocks):
                elemIn = of_act.acquire(1)
                elemOut0 = of_out.acquire(1)

                conv2dk14_i8(
                    elemIn, elemWts, elemOut0, x_dim, ci, co, kernel_size, scale
                )
                of_act.release(1)
                of_out.release(1)
        of_wts.release(1)

    # Create a worker to perform the task
    worker = Worker(
        core_fn,
        [
            of_inOF_wts_0_L3L2.cons(),
            of_act_L2_02.cons(),
            of_out_02_L2.prod(),
            # rtp,
            conv2dk14_i8_kernel,
            # rtp_barrier,
        ],
        stack_size=0x600,
        trace=enable_trace,
    )

    # Runtime operations to move data to/from the AIE-array
    rt = Runtime()
    with rt.sequence(tensorIn_ty, tensorWeights_ty, tensorOut_ty) as (I, W, O):
        # Initialize the runtime parameter values
        def set_rtps(my_rtp):
            my_rtp[0] = 14

        # rt.inline_ops(set_rtps, [rtp])

        # rt.set_barrier(rtp_barrier, 1)

        rt.enable_trace(trace_size)

        # Start worker
        rt.start(worker)

        # Fill/drain input/output ObjectFifos
        rt.fill(of_inOF_act_L3L2.prod(), I)
        rt.fill(of_inOF_wts_0_L3L2.prod(), W)
        rt.drain(of_outOFL2L3.cons(), O, wait=True)
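        # wait=True blocks the runtime sequence until the output has fully drained
        # back to host memory.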

    # Place components (assign them resources on the device) and generate an MLIR module
    return Program(dev, rt).resolve_program(SequentialPlacer())


try:
    device_name = str(sys.argv[1])
    if device_name == "npu":
        dev = NPU1Col1()
    elif device_name == "npu2":
        dev = NPU2Col1()
    else:
        raise ValueError("[ERROR] Device name {} is unknown".format(sys.argv[1]))
    width = int(sys.argv[2])
    if width % 8 != 0 or width < 8:
        print("Width must be a multiple of 8 and at least 8")
        raise ValueError
    height = int(sys.argv[3])
    if height % 8 != 0 or height < 8:
        print("Height must be a multiple of 8 and at least 8")
        raise ValueError
    in_channels = int(sys.argv[4])
    if in_channels != 4:
        print("Input channels must be 4")
        raise ValueError
    out_channels = int(sys.argv[5])
    if out_channels != 1152:
        print("Output channels must be 1152")
        raise ValueError
    kernel_size = int(sys.argv[6])
    if kernel_size != 14:
        print("Kernel size must currently be 14")
        raise ValueError
    trace_size = 0 if (len(sys.argv) != 8) else int(sys.argv[7])
except ValueError:
    print("Argument has inappropriate value")
    sys.exit(1)
module = conv2dk14(
    dev, width, height, in_channels, out_channels, kernel_size, trace_size
)
print(module)
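
# Example invocation (illustrative only; the script name and the width/height values
# are assumptions chosen to satisfy the argument checks above):
#   python conv2dk14.py npu2 896 896 4 1152 14 > aie.mlir
# An optional trailing argument sets the trace buffer size and enables tracing:
#   python conv2dk14.py npu2 896 896 4 1152 14 8192 > aie.mlir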