Commit 7b9a467

Add unplaced IRON design for conv2d 14x14 (#2601)

1 parent 0aa65a6 commit 7b9a467

File tree: 10 files changed, +312 −49 lines changed

aie_kernels/aie2p/conv2dk14.cc

Lines changed: 5 additions & 6 deletions

@@ -60,16 +60,15 @@ void conv2dk14_i8_scalar(uint8_t *input, int8_t *kernels, int8_t *output,
   int wts_indx = 0;
   int out_indx = 0;
 
-  const int output_channels_div_8 = output_channels / 8;
-  const int tiles_div_8 = input_width / kernel_width / 8;
-  const int pixels_div_2 = kernel_width * kernel_width / 2;
+  const int output_channels_div_8 = output_channels / 8;    // 2
+  const int tiles_div_8 = input_width / kernel_width / 8;   // 2
+  const int pixels_div_2 = kernel_width * kernel_width / 2; // 98
 
   for (oc = 0; oc < output_channels_div_8; oc++) { // 16 out of 1152
     for (oc8 = 0; oc8 < 8; oc8++) {
       for (nt = 0; nt < tiles_div_8; nt++) { // 16 out of 64 tiles in row
         for (nt8 = 0; nt8 < 8; nt8++) {
           int sum = 0;
-          int sum_srs = 0;
           for (pix = 0; pix < pixels_div_2; pix++) { // 196 // 2 = 98
             for (p2 = 0; p2 < 2; p2++) {
               in_indx = ((nt * (pixels_div_2) * 8 * 2) + (pix * 8 * 2) +
@@ -83,7 +82,7 @@ void conv2dk14_i8_scalar(uint8_t *input, int8_t *kernels, int8_t *output,
                          input[in_indx + 3] * kernels[wts_indx + 24];
             }
           }
-          sum_srs = (sum + (1 << (scale - 1))) >> scale;
+          int sum_srs = (sum + (1 << (scale - 1))) >> scale;
           sum_srs = (sum_srs > SMAX)    ? SMAX
                     : (sum_srs < -SMIN) ? -SMIN
                                         : sum_srs;
@@ -154,7 +153,7 @@ void conv2dk14_i8_vector(uint8_t *input, int8_t *kernels, int8_t *output,
   int8_t *__restrict out_ptr = output;
 
   for (int k = 0; k < output_channels_div_8; k++) { // 2
-    for (int j = 0; j < tiles_div_16; j++) { // 2
+    for (int j = 0; j < tiles_div_16; j++) { // 1
      AIE_PREPARE_FOR_PIPELINING
      AIE_LOOP_MIN_ITERATION_COUNT(98)
      // AIE_LOOP_UNROLL_FULL
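Note: the `sum_srs` lines this diff touches implement a shift-round-saturate (SRS): add half of `2^scale` so the right shift rounds to nearest, then clamp to the int8 range. A minimal Python sketch of that arithmetic; the `SMAX`/`SMIN` values are assumed here (they are defined elsewhere in the kernel source, not in this diff):

```python
# Shift-round-saturate as in the scalar kernel; SMAX/SMIN values are assumed.
SMAX, SMIN = 127, 128
scale = 14

def srs(acc: int) -> int:
    sum_srs = (acc + (1 << (scale - 1))) >> scale  # add 2^(scale-1) to round
    return max(-SMIN, min(SMAX, sum_srs))          # saturate to [-128, 127]

assert srs(8192) == 1         # exactly half of 2^14 rounds up to 1
assert srs(3_000_000) == 127  # large accumulators saturate
```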

programming_examples/ml/conv2d_14x14/Makefile

Lines changed: 1 addition & 1 deletion

@@ -89,7 +89,7 @@ ifeq ($(CHESS), true)
 	cd ${@D} && aiecc.py --aie-generate-xclbin --aie-generate-npu-insts --no-compile-host \
 		--xclbin-name=${@F} --npu-insts-name=insts_trace.bin $(<:%=../%)
 else
-	cd ${@D} && aiecc.py --aie-generate-xclbin --aie-generate-npu-insts --no-compile-host \
+	cd ${@D} && aiecc.py -v --aie-generate-xclbin --aie-generate-npu-insts --no-compile-host --packet-sw-objFifos \
 		--no-xchesscc --no-xbridge --xclbin-name=${@F} --npu-insts-name=insts_trace.bin $(<:%=../%)
 endif

programming_examples/ml/conv2d_14x14/README.md

Lines changed: 14 additions & 5 deletions

@@ -65,15 +65,24 @@ To compile and run the design:
 make run_py
 ```
 
-
 To build and run the design while generating trace
 ```shell
-make trace_py
+make clean; make trace_py
+```
+
+To build and run the unplaced IRON version of the design (with or without generating trace), add the additional qualifiers:
+```shell
+make clean; make use_placed=0 num_act=72 run_py
+make clean; make use_placed=0 num_act=72 trace_py
 ```
 
 To build and run the 32-core design (trace not currently supported)
 ```shell
-make targetname=conv2dk14_32core num_act=1 run_py
+make clean; make targetname=conv2dk14_32core num_act=1 run_py
+```
+To build and run the 32-core design with the scalar kernel (trace not currently supported)
+```shell
+make clean; make vectorized=false targetname=conv2dk14_32core num_act=1 run_py
 ```
 
 ## Multi-core Design Example (32-cores)
@@ -86,8 +95,8 @@ While the design was designed to be somewhat configurable, this is mostly tested
 
 ## Limitation Notes
 At the moment, the following limitations exist:
-* The scalar kernel version of this design has an intermittent runtime issue (CMD_ABORT triggered) for the full output channel size. Reducing this to 256 channels from 1152 is a workaround for now, but further investigation is needed to fully resolve it.
-* The unplaced IRON version is in the works. At the moment, writing trace data to the 5th buffer, which is the default for unplaced IRON, seems to trigger a segfault. Further investigation is needed.
+* The scalar kernel version of this design does not run properly in single-core mode for the full data size because the total compute time exceeds the execution time limit of the NPU driver (~2 seconds). You can reduce the number of output channels (576 channels works) or run the scalar kernel with the 32-core design as noted above.
+* Unplaced IRON now works but needs additional qualifiers for the testbench. However, a trace_size of 32,768 bytes (rather than 16 kB or 8 kB) triggers a bug that causes the unplaced IRON trace to segfault. This is still under investigation, but choosing a smaller size is a good workaround.
 * Trace for the 32-core variant currently causes the compilation to hang. This is under investigation, but the non-trace run works without issue.
 * There is a behavior bug where the number of input/activation sets sent from the host to the AIE array needs to be a certain value for correct functionality. For the single-core design, `num_act=2` is sufficient for non-trace runs (`run_py`), but for trace runs (`trace_py`) it needs to be `num_act=8`. For the 32-core design, `num_act=1` is sufficient, but any value for trace runs currently causes a hang. This is under investigation.

programming_examples/ml/conv2d_14x14/conv2dk14.py (new file)

Lines changed: 230 additions & 0 deletions

@@ -0,0 +1,230 @@
+#
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# (c) Copyright 2024 AMD Inc.
+import numpy as np
+import sys
+
+from aie.iron import (
+    GlobalBuffer,
+    Kernel,
+    ObjectFifo,
+    Program,
+    Runtime,
+    Worker,
+    WorkerRuntimeBarrier,
+)
+from aie.iron.placers import SequentialPlacer
+from aie.iron.device import NPU1Col1, NPU2Col1
+from aie.iron.controlflow import range_
+
+
+def conv2dk14(
+    dev,
+    width: int,
+    height: int,
+    in_channels: int,
+    out_channels: int,
+    kernel_size: int,
+    trace_size: int,
+):
+    enable_trace = 1 if trace_size > 0 else 0
+
+    # Kernel processes 16 tiles and 16 output channels at a time
+    sub_out_channels = 16
+    sub_tiles = 16
+
+    actIn = kernel_size * kernel_size * in_channels * sub_tiles
+    weights = kernel_size * kernel_size * in_channels * sub_out_channels
+    actOut = sub_tiles * sub_out_channels
+
+    out_channels_group = out_channels // sub_out_channels  # 72
+    width_out = width // kernel_size
+    height_out = height // kernel_size
+
+    # we reload inputs 72 times (out_channels // sub_out_channels)
+    tensorInSize = width * height * in_channels * out_channels_group
+    # tensorInSize = width * height * in_channels * 2
+
+    tensorWeightsSize = weights * out_channels_group
+    tensorOutSize = width_out * height_out * sub_out_channels * out_channels_group
+
+    N_in_bytes = tensorOutSize  # Number of bytes of output data (1 byte/elem)
+
+    bufIn = kernel_size * width * in_channels
+    bufOut = sub_out_channels * width_out * height_out
+
+    # Type definitions
+    actIn_ty = np.ndarray[(actIn,), np.dtype[np.uint8]]
+    bufIn_ty = np.ndarray[(bufIn,), np.dtype[np.uint8]]
+
+    weights_ty = np.ndarray[(weights,), np.dtype[np.int8]]
+
+    out_ty = np.ndarray[(actOut,), np.dtype[np.int8]]
+    bufOut_ty = np.ndarray[(bufOut,), np.dtype[np.int8]]
+    tensorIn_ty = np.ndarray[(tensorInSize,), np.dtype[np.uint8]]
+    tensorWeights_ty = np.ndarray[(tensorWeightsSize,), np.dtype[np.int8]]
+    tensorOut_ty = np.ndarray[(tensorOutSize,), np.dtype[np.int8]]
+
+    # AIE Core Function declarations
+    conv2dk14_i8_kernel = Kernel(
+        "conv2dk14_i8",
+        "conv2dk14.o",
+        [
+            actIn_ty,
+            weights_ty,
+            out_ty,
+            np.int32,
+            np.int32,
+            np.int32,
+            np.int32,
+            np.int32,
+        ],
+    )
+
+    # AIE-array data movement with object fifos
+    # Input
+    of_inOF_act_L3L2 = ObjectFifo(
+        bufIn_ty,
+        name="inOF_act_L3L2",
+        dims_from_stream_per_cons=[
+            (kernel_size, kernel_size * in_channels),  # (14, 56)
+            (64, kernel_size * kernel_size * in_channels),  # (64, 784)
+            (kernel_size * in_channels, 1),  # (56, 1)
+        ],
+    )
+    of_act_L2_02 = of_inOF_act_L3L2.cons().forward(
+        obj_type=actIn_ty,
+        name="act_L2_02",
+        dims_to_stream=[
+            (2, kernel_size * kernel_size * in_channels * 8),  # (2, 6272)
+            (kernel_size * kernel_size // 2, 2 * in_channels),  # (98, 8)
+            (8, kernel_size * kernel_size * in_channels),  # (8, 784)
+            (2 * in_channels, 1),  # (8, 1)
+        ],
+    )
+
+    # wts
+    of_inOF_wts_0_L3L2 = ObjectFifo(weights_ty, depth=1, name="inOF_wts_0_L3L2")
+
+    # Output
+    of_out_02_L2 = ObjectFifo(out_ty, name="out_02_L2")
+    of_outOFL2L3 = of_out_02_L2.cons().forward(
+        obj_type=bufOut_ty,
+        name="outOFL2L3",
+        dims_to_stream=[(256, 256), (16, 8), (2, 128), (8, 1)],
+    )
+
+    # Setup a global buffer to hold runtime parameters
+    # rtp = GlobalBuffer(
+    #     np.ndarray[(16,), np.dtype[np.int32]],
+    #     name="rtp",
+    #     use_write_rtp=True,
+    # )
+
+    # rtp_barrier = WorkerRuntimeBarrier()
+
+    # Task for the core to perform
+    # def core_fn(of_wts, of_act, of_out, my_rtp, conv2dk14_i8, barrier):
+    def core_fn(of_wts, of_act, of_out, conv2dk14_i8):
+        y_dim = height // kernel_size
+        x_blocks = 4
+        x_dim = width // x_blocks  # num pixels for 1/4 of a row
+        ci = in_channels
+        co = sub_out_channels
+
+        # barrier.wait_for_value(1)
+        # scale = my_rtp[0]
+        scale = 14
+
+        elemWts = of_wts.acquire(1)
+
+        for _ in range_(y_dim):
+            for _ in range_(x_blocks):
+                elemIn = of_act.acquire(1)
+                elemOut0 = of_out.acquire(1)
+
+                conv2dk14_i8(
+                    elemIn, elemWts, elemOut0, x_dim, ci, co, kernel_size, scale
+                )
+                of_act.release(1)
+                of_out.release(1)
+        of_wts.release(1)
+
+    # Create a worker to perform the task
+    worker = Worker(
+        core_fn,
+        [
+            of_inOF_wts_0_L3L2.cons(),
+            of_act_L2_02.cons(),
+            of_out_02_L2.prod(),
+            # rtp,
+            conv2dk14_i8_kernel,
+            # rtp_barrier,
+        ],
+        stack_size=0x600,
+        trace=enable_trace,
+    )
+
+    # Runtime operations to move data to/from the AIE-array
+    rt = Runtime()
+    with rt.sequence(tensorIn_ty, tensorWeights_ty, tensorOut_ty) as (I, W, O):
+        # Initialize the runtime parameter values
+        def set_rtps(my_rtp):
+            my_rtp[0] = 14
+
+        # rt.inline_ops(set_rtps, [rtp])
+
+        # rt.set_barrier(rtp_barrier, 1)
+
+        rt.enable_trace(trace_size)
+
+        # Start worker
+        rt.start(worker)
+
+        # Fill/drain input/output ObjectFifos
+        rt.fill(of_inOF_act_L3L2.prod(), I)
+        rt.fill(of_inOF_wts_0_L3L2.prod(), W)
+        rt.drain(of_outOFL2L3.cons(), O, wait=True)
+
+    # Place components (assign them resources on the device) and generate an MLIR module
+    return Program(dev, rt).resolve_program(SequentialPlacer())
+
+
+try:
+    device_name = str(sys.argv[1])
+    if device_name == "npu":
+        dev = NPU1Col1()
+    elif device_name == "npu2":
+        dev = NPU2Col1()
+    else:
+        raise ValueError("[ERROR] Device name {} is unknown".format(sys.argv[1]))
+    width = int(sys.argv[2])
+    if width % 8 != 0 or width < 8:
+        print("Width size must be a multiple of 8 and greater than or equal to 8")
+        raise ValueError
+    height = int(sys.argv[3])
+    if height % 8 != 0 or height < 8:
+        print("Height size must be a multiple of 8 and greater than or equal to 8")
+        raise ValueError
+    in_channels = int(sys.argv[4])
+    if in_channels != 4:
+        print("Input channels size must be equal to 4")
+        raise ValueError
+    out_channels = int(sys.argv[5])
+    if out_channels != 1152:
+        print("Output channel size must be equal to 1152")
+        raise ValueError
+    kernel_size = int(sys.argv[6])
+    if kernel_size != 14:
+        print("Kernel size must be 14 right now.")
+        raise ValueError
+    trace_size = 0 if (len(sys.argv) != 8) else int(sys.argv[7])
+except ValueError:
+    print("Argument has inappropriate value")
+module = conv2dk14(
+    dev, width, height, in_channels, out_channels, kernel_size, trace_size
+)
+print(module)
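Note: the `dims_from_stream_per_cons` / `dims_to_stream` specs above describe on-the-fly data layout transforms as (size, stride) pairs. A sketch of the element ordering they produce, assuming the usual IRON convention that the first pair is the outermost loop of the copy nest:

```python
# Emulate a [(size, stride), ...] transform: nested loops, outermost first
# (assumed semantics; illustrative only).
def stream_order(dims):
    """Return the flat indices visited by the transform, in stream order."""
    idx = [0]
    for size, stride in dims:
        idx = [base + i * stride for base in idx for i in range(size)]
    return idx

# Input FIFO layout from the design above: 14 kernel rows x 64 tiles x 56
# bytes per tile row = 50176 elements, matching bufIn for a 896-wide input
# (64 tiles per row, as the kernel comments suggest).
order = stream_order([(14, 56), (64, 784), (56, 1)])
assert len(order) == 14 * 64 * 56
assert order[:3] == [0, 1, 2] and order[56] == 784  # next tile, same kernel row
```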

programming_examples/ml/conv2d_14x14/conv2dk14_32core_placed.py

Lines changed: 2 additions & 2 deletions

@@ -268,8 +268,8 @@ def sequence(I, W, O):
         tiles_to_trace=tiles_to_trace,
         shim=shim_tiles[0],
         trace_size=trace_size,
-        trace_offset=N_in_bytes,
-        ddr_id=2,
+        # trace_offset=N_in_bytes,
+        # ddr_id=2,
         coretile_events=[
             CoreEvent.INSTR_EVENT_0,
             CoreEvent.INSTR_EVENT_1,

programming_examples/ml/conv2d_14x14/conv2dk14_placed.py

Lines changed: 0 additions & 2 deletions

@@ -207,8 +207,6 @@ def sequence(I, W, O):
         tiles_to_trace=tiles_to_trace,
         shim=ShimTile,
         trace_size=trace_size,
-        trace_offset=N_in_bytes,
-        ddr_id=2,
         coretile_events=[
             CoreEvent.INSTR_EVENT_0,
             CoreEvent.INSTR_EVENT_1,

programming_examples/ml/conv2d_14x14/run_strix_makefile_placed.lit

Lines changed: 6 additions & 0 deletions

@@ -15,4 +15,10 @@
 // RUN: make -f %S/Makefile clean
 // RUN: env num_act=8 %run_on_npu2% make -f %S/Makefile trace_py devicename=npu2
 // RUN: make -f %S/Makefile clean
+// RUN: env num_act=72 %run_on_npu2% make -f %S/Makefile use_placed=0 run_py devicename=npu2
+// RUN: make -f %S/Makefile clean
+// RUN: env num_act=72 %run_on_npu2% make -f %S/Makefile use_placed=0 trace_py devicename=npu2
+// RUN: make -f %S/Makefile clean
 // RUN: env targetname=conv2dk14_32core num_act=1 %run_on_npu2% make -f %S/Makefile run_py devicename=npu2
+// RUN: make -f %S/Makefile clean
+// RUN: env targetname=conv2dk14_32core num_act=1 vectorized=false %run_on_npu2% make -f %S/Makefile run_py devicename=npu2

programming_examples/ml/conv2d_14x14/test.py

Lines changed: 10 additions & 7 deletions

@@ -121,7 +121,7 @@ def main(opts):
         dtype_out,
         enable_trace=enable_trace,
         trace_size=trace_size,
-        trace_after_output=True,
+        trace_after_output=False,
     )
 
     # ------------------------------------------------------
@@ -212,15 +212,18 @@ def forward(self, x):
     # ------------------------------------------------------
     for i in range(num_iter):
        start = time.time_ns()
-        # entire_buffer = execute(app, ifm_mem_fmt, total_wts)
-        entire_buffer = execute(app, ifm_mem_fmt_grp, total_wts)
+        if enable_trace:
+            data_buffer, trace_buffer = execute(
+                app, ifm_mem_fmt_grp, total_wts, enable_trace, False
+            )
+        else:
+            entire_buffer = execute(
+                app, ifm_mem_fmt_grp, total_wts, enable_trace, False
+            )
        stop = time.time_ns()
 
        if enable_trace:
-            # Separate data and trace
-            data_buffer, trace_buffer = extract_trace(
-                entire_buffer, shape_out, dtype_out, trace_size
-            )
+            trace_buffer = trace_buffer.view(np.uint32)
            # Scale the data
            scaled_data_buffer = data_buffer * int8_scale
            # Write out the trace
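Note: this host-side change matches the new trace plumbing. With `trace_after_output=False`, trace data is no longer appended to the output buffer and carved off with `extract_trace`; instead `execute` hands back separate data and trace buffers, and the trace bytes are simply reinterpreted as 32-bit trace words. A minimal sketch of that last step, with sizes assumed:

```python
import numpy as np

trace_size = 8192  # bytes; per the README, 8 kB / 16 kB work, 32 kB segfaults
trace_buffer = np.zeros(trace_size, dtype=np.uint8)  # stand-in for the XRT buffer
words = trace_buffer.view(np.uint32)  # reinterpret 4 bytes per trace word
assert words.size == trace_size // 4 and words.nbytes == trace_size
```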