Enhanced conv2d by making width, height and channels configurable via Makefile. (#2022)

jackl-xilinx · web-flow · commit 6a3b9a5ac1ef · 2025-01-24T17:36:20.000Z
diff --git a/programming_examples/ml/conv2d/Makefile b/programming_examples/ml/conv2d/Makefile
@@ -11,13 +11,21 @@ include ${srcdir}/../../makefile-common
 
 mlirFileName = aie
 
+# Modify these params to configure design
+width = 32
+height = 32
+in_channels = 64
+out_channels = 64
+vectorized ?= true
 trace_size = 16384
 
 VPATH := ${srcdir}/../../../aie_kernels/aie2
 
-aie_py_src=conv2d.py
-aie_py_trace_src=conv2d_alt.py
-use_alt?=0
+aie_py_src = conv2d.py
+aie_py_trace_src = conv2d_alt.py
+use_alt ?= 0
+
+device = npu
 
 ifeq (${use_alt}, 1)
 aie_py_src=conv2d_alt.py
@@ -27,15 +35,29 @@ all: build/conv2dk1_i8.o build/final.xclbin
 
 build/conv2dk1_i8.o: conv2dk1_i8.cc
 	mkdir -p ${@D}
+ifeq ($(device),npu)
+ifeq ($(vectorized), true)
 	cd ${@D} && ${PEANO_INSTALL_DIR}/bin/clang ${PEANOWRAP2_FLAGS} -DINT8_ACT -c $< -o ${@F}
+else
+	cd ${@D} && ${PEANO_INSTALL_DIR}/bin/clang ${PEANOWRAP2_FLAGS} -DSCALAR -DINT8_ACT -c $< -o ${@F}
+endif
+else ifeq ($(device),npu2)
+ifeq ($(vectorized), true)
+	cd ${@D} && ${PEANO_INSTALL_DIR}/bin/clang ${PEANOWRAP2P_FLAGS} -DINT8_ACT -c $< -o ${@F}
+else
+	cd ${@D} && ${PEANO_INSTALL_DIR}/bin/clang ${PEANOWRAP2P_FLAGS} -DSCALAR -DINT8_ACT -c $< -o ${@F}
+endif
+else
+	$(error Device type "$(device)" not supported; expected npu or npu2)
+endif
 
 build/${mlirFileName}.mlir: ${srcdir}/${aie_py_src}
 	mkdir -p ${@D}
-	python3 $< > $@
+	python3 $< ${device} ${width} ${height} ${in_channels} ${out_channels} 0 > $@
 
 build/${mlirFileName}_trace.mlir: ${srcdir}/${aie_py_trace_src}
 	mkdir -p ${@D}
-	python3 $< ${trace_size} > $@
+	python3 $< ${device} ${width} ${height} ${in_channels} ${out_channels} ${trace_size} > $@
 
 build/final.xclbin: build/${mlirFileName}.mlir build/conv2dk1_i8.o 
 	mkdir -p ${@D} 
@@ -50,13 +72,13 @@ build/final_trace.xclbin: build/${mlirFileName}_trace.mlir build/conv2dk1_i8.o
 		--xclbin-name=${@F} --npu-insts-name=insts_trace.txt $(<:%=../%)
 
 run_py: build/final.xclbin
-	${powershell} python3 ${srcdir}/test.py -x build/final.xclbin -i build/insts.txt -k MLIR_AIE
+	${powershell} python3 ${srcdir}/test.py -x build/final.xclbin -i build/insts.txt -k MLIR_AIE -wd ${width} -ht ${height} -ic ${in_channels} -oc ${out_channels}
 
 trace_py: build/final_trace.xclbin
-	${powershell} python3 ${srcdir}/test.py -x build/final_trace.xclbin -i build/insts_trace.txt -k MLIR_AIE -t ${trace_size}
+	${powershell} python3 ${srcdir}/test.py -x build/final_trace.xclbin -i build/insts_trace.txt -k MLIR_AIE -wd ${width} -ht ${height} -ic ${in_channels} -oc ${out_channels} -t ${trace_size}
 	${srcdir}/../../utils/parse_trace.py --filename log/trace_conv2d.txt --mlir build/aie_trace.mlir --colshift 1 > log/trace_conv2d.json
 
 clean:
 	rm -rf build *.elf* *.lst *.bif ${mlirFileName}.mlir.prj log* *.xclbin sim \
 		chess* *.o insts.txt \
-		*.log aie_partition.json *.bin BOOT.BIN _x test.exe
+		*.log aie_partition.json *.bin BOOT.BIN _x test.exe
diff --git a/programming_examples/ml/conv2d/README.md b/programming_examples/ml/conv2d/README.md
@@ -81,12 +81,15 @@ To compile the design:
 make
 ```
 
-To compile the design:
+To compile the design using the lower-level IRON version (currently needed for trace):
 ```shell
 env use_alt=1 make
 ```
 
 To run the design:
 ```shell
 make run_py
-```
+```
+
+## Configure design
+To configure the parameters of the convolution, such as data width, height, and the number of input and output channels, edit the variables at the top of the `Makefile`. The scalar or vectorized version of the kernel can likewise be selected in the `Makefile` by setting the `vectorized` variable.
diff --git a/programming_examples/ml/conv2d/conv2d.py b/programming_examples/ml/conv2d/conv2d.py
@@ -12,30 +12,24 @@
 from aie.iron.device import NPU1Col1
 from aie.iron.controlflow import range_
 
-width = 32
-height = 32
-in_channels = 64
-out_channels = 64
 
-if len(sys.argv) == 3:
-    width = int(sys.argv[1])
-    height = int(sys.argv[2])
+def conv2dk1(
+    dev, width: int, height: int, in_channels: int, out_channels: int, trace_size: int
+):
 
+    actIn = width * in_channels  # 32*64 = 2048
+    bufIn = actIn * 2  # double buffer
 
-actIn = width * in_channels  # 32*64 = 2048
-bufIn = actIn * 2  # double buffer
+    weights = in_channels * out_channels
 
-weights = in_channels * out_channels
+    actOut = width * out_channels  # 32*64 = 2048
+    bufOut = actOut * 2  # double buffer
 
-actOut = width * out_channels  # 32*64 = 2048
-bufOut = actOut * 2  # double buffer
+    tensorInSize = width * height * in_channels
+    tensorOutSize = width * height * out_channels
 
-tensorSize = width * height * in_channels
+    N_in_bytes = tensorOutSize  # Number of bytes of output data (1 byte/elem)
 
-N_in_bytes = tensorSize  # Number of bytes of output data (1 byte/elem)
-
-
-def conv2dk1(trace_size: int):
     # Type definitions
     actIn_ty = np.ndarray[(actIn,), np.dtype[np.int8]]
     bufIn_ty = np.ndarray[(bufIn,), np.dtype[np.int8]]
@@ -44,7 +38,8 @@ def conv2dk1(trace_size: int):
 
     out_ty = np.ndarray[(actOut,), np.dtype[np.int8]]
     bufOut_ty = np.ndarray[(bufOut,), np.dtype[np.int8]]
-    tensor_ty = np.ndarray[(tensorSize,), np.dtype[np.int8]]
+    tensorIn_ty = np.ndarray[(tensorInSize,), np.dtype[np.int8]]
+    tensorOut_ty = np.ndarray[(tensorOutSize,), np.dtype[np.int8]]
 
     # AIE Core Function declarations
     conv2dk1_i8_kernel = Kernel(
@@ -82,10 +77,10 @@ def conv2dk1(trace_size: int):
 
     # Task for the core to perform
     def core_fn(of_wts, of_act, of_out, my_rtp, conv2dk1_i8):
-        y_dim = 32
-        x_dim = 32
-        ci = 64
-        co = 64
+        y_dim = height
+        x_dim = width
+        ci = in_channels
+        co = out_channels
 
         elemWts = of_wts.acquire(1)
         scale = my_rtp[0]
@@ -114,7 +109,7 @@ def core_fn(of_wts, of_act, of_out, my_rtp, conv2dk1_i8):
 
     # Runtime operations to move data to/from the AIE-array
     rt = Runtime()
-    with rt.sequence(tensor_ty, weights_ty, tensor_ty) as (I, W, O):
+    with rt.sequence(tensorIn_ty, weights_ty, tensorOut_ty) as (I, W, O):
         # Initialize the runtime parameter values
         def set_rtps(my_rtp):
             my_rtp[0] = 10
@@ -130,9 +125,39 @@ def set_rtps(my_rtp):
         rt.drain(of_outOFL2L3.cons(), O, wait=True)
 
     # Place components (assign them resources on the device) and generate an MLIR module
-    return Program(NPU1Col1(), rt).resolve_program(SequentialPlacer())
-
-
-if __name__ == "__main__":
-    trace_size = 0 if (len(sys.argv) != 2) else int(sys.argv[1])
-    print(conv2dk1(trace_size=trace_size))
+    return Program(dev, rt).resolve_program(SequentialPlacer())
+
+
+try:  # CLI: <device> <width> <height> <in_channels> <out_channels> [trace_size]
+    device_name = str(sys.argv[1])
+    if device_name == "npu":
+        dev = NPU1Col1()
+    elif device_name == "npu2":
+        dev = NPU2()
+    else:
+        raise ValueError("[ERROR] Device name {} is unknown".format(sys.argv[1]))
+    width = int(sys.argv[2])
+    if width % 8 != 0 or width < 8:
+        print("Width size must be a multiple of 8 and greater than or equal to 8")
+        raise ValueError
+    height = int(sys.argv[3])
+    if height < 2:
+        print("Height needs to be > 1 at the moment (BUG)")
+        raise ValueError
+    in_channels = int(sys.argv[4])
+    if in_channels % 8 != 0 or in_channels < 8:
+        print(
+            "Input channels size must be a multiple of 8 and greater than or equal to 8"
+        )
+        raise ValueError
+    out_channels = int(sys.argv[5])
+    if out_channels % 8 != 0 or out_channels < 8:
+        print(
+            "Output channel size must be a multiple of 8 and greater than or equal to 8"
+        )
+        raise ValueError
+    trace_size = 0 if (len(sys.argv) != 7) else int(sys.argv[6])
+except ValueError:
+    sys.exit("Argument has inappropriate value")  # stderr + non-zero exit; do not continue with unbound names
+module = conv2dk1(dev, width, height, in_channels, out_channels, trace_size)
+print(module)
diff --git a/programming_examples/ml/conv2d/conv2d_alt.py b/programming_examples/ml/conv2d/conv2d_alt.py
@@ -13,31 +13,24 @@
 from aie.helpers.dialects.ext.scf import _for as range_
 import aie.utils.trace as trace_utils
 
-width = 32
-height = 32
-in_channels = 64
-out_channels = 64
 
-if len(sys.argv) == 3:
-    width = int(sys.argv[1])
-    height = int(sys.argv[2])
-
-
-actIn = width * in_channels  # 32*64 = 2048
-bufIn = actIn * 2  # double buffer
+def conv2dk1(
+    dev, width: int, height: int, in_channels: int, out_channels: int, trace_size: int
+):
+    with mlir_mod_ctx() as ctx:
 
-weights = in_channels * out_channels
+        actIn = width * in_channels  # 32*64 = 2048
+        bufIn = actIn * 2  # double buffer
 
-actOut = width * out_channels  # 32*64 = 2048
-bufOut = actOut * 2  # double buffer
+        weights = in_channels * out_channels
 
-tensorSize = width * height * in_channels
+        actOut = width * out_channels  # 32*64 = 2048
+        bufOut = actOut * 2  # double buffer
 
-N_in_bytes = tensorSize  # Number of bytes of output data (1 byte/elem)
+        tensorInSize = width * height * in_channels
+        tensorOutSize = width * height * out_channels
 
-
-def conv2dk1(trace_size: int):
-    with mlir_mod_ctx() as ctx:
+        N_in_bytes = tensorOutSize  # Number of bytes of output data (1 byte/elem)
 
         @device(AIEDevice.npu1_1col)
         def device_body():
@@ -49,7 +42,8 @@ def device_body():
 
             out_ty = np.ndarray[(actOut,), np.dtype[np.int8]]
             bufOut_ty = np.ndarray[(bufOut,), np.dtype[np.int8]]
-            tensor_ty = np.ndarray[(tensorSize,), np.dtype[np.int8]]
+            tensorIn_ty = np.ndarray[(tensorInSize,), np.dtype[np.int8]]
+            tensorOut_ty = np.ndarray[(tensorOutSize,), np.dtype[np.int8]]
 
             # AIE Core Function declarations
             conv2dk1_i8 = external_func(
@@ -105,10 +99,10 @@ def device_body():
             # Compute tile 2
             @core(ComputeTile2, "conv2dk1_i8.o")
             def core_body():
-                y_dim = 32
-                x_dim = 32
-                ci = 64
-                co = 64
+                y_dim = height
+                x_dim = width
+                ci = in_channels
+                co = out_channels
 
                 for _ in range_(0xFFFFFFFF):
                     elemWts = of_inOF_wts_0_L3L2.acquire(ObjectFifoPort.Consume, 1)
@@ -126,7 +120,7 @@ def core_body():
                     of_inOF_wts_0_L3L2.release(ObjectFifoPort.Consume, 1)
 
             # To/from AIE-array data movement
-            @runtime_sequence(tensor_ty, weights_ty, tensor_ty)
+            @runtime_sequence(tensorIn_ty, weights_ty, tensorOut_ty)
             def sequence(I, W, O):
 
                 if trace_size > 0:
@@ -139,7 +133,7 @@ def sequence(I, W, O):
                 in_act_task = shim_dma_single_bd_task(
                     of_inOF_act_L3L2,
                     I,
-                    sizes=[1, 1, 1, tensorSize],
+                    sizes=[1, 1, 1, tensorInSize],
                     issue_token=True,
                 )
                 in_wts_task = shim_dma_single_bd_task(
@@ -151,7 +145,7 @@ def sequence(I, W, O):
                 out_task = shim_dma_single_bd_task(
                     of_outOFL2L3,
                     O,
-                    sizes=[1, 1, 1, tensorSize],
+                    sizes=[1, 1, 1, tensorOutSize],
                     issue_token=True,
                 )
 
@@ -163,5 +157,37 @@ def sequence(I, W, O):
 
 
 if __name__ == "__main__":
-    trace_size = 0 if (len(sys.argv) != 2) else int(sys.argv[1])
-    conv2dk1(trace_size=trace_size)
+    try:  # CLI: <device> <width> <height> <in_channels> <out_channels> [trace_size]
+        device_name = str(sys.argv[1])
+        if device_name == "npu":
+            # dev = NPU1Col1()
+            dev = 0  # placeholders
+        elif device_name == "npu2":
+            # dev = NPU2()
+            dev = 1  # placeholders
+        else:
+            raise ValueError("[ERROR] Device name {} is unknown".format(sys.argv[1]))
+        width = int(sys.argv[2])
+        if width % 8 != 0 or width < 8:
+            print("Width size must be a multiple of 8 and greater than or equal to 8")
+            raise ValueError
+        height = int(sys.argv[3])
+        if height % 8 != 0 or height < 8:
+            print("Height size must be a multiple of 8 and greater than or equal to 8")
+            raise ValueError
+        in_channels = int(sys.argv[4])
+        if in_channels % 8 != 0 or in_channels < 8:
+            print(
+                "Input channels size must be a multiple of 8 and greater than or equal to 8"
+            )
+            raise ValueError
+        out_channels = int(sys.argv[5])
+        if out_channels % 8 != 0 or out_channels < 8:
+            print(
+                "Output channel size must be a multiple of 8 and greater than or equal to 8"
+            )
+            raise ValueError
+        trace_size = 0 if (len(sys.argv) != 7) else int(sys.argv[6])
+    except ValueError:
+        sys.exit("Argument has inappropriate value")  # stderr + non-zero exit; do not continue with unbound names
+    conv2dk1(dev, width, height, in_channels, out_channels, trace_size)
diff --git a/programming_examples/ml/conv2d/test.py b/programming_examples/ml/conv2d/test.py