Skip to content

Commit 7419211

Browse files
hunhoffe, jgmelber, and github-actions[bot]
authored
dma_task in programming examples (#1919)
Co-authored-by: Joseph Melber <[email protected]> Co-authored-by: Joseph Melber <[email protected]> Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
1 parent f85a4a9 commit 7419211

File tree

103 files changed

+6669
-136
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

103 files changed

+6669
-136
lines changed

include/aie/Dialect/AIE/IR/AIEAttrs.td

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -140,7 +140,7 @@ def BDDimLayoutAttr : AttrDef<AIE_Dialect, "BDDimLayout", []> {
140140
}];
141141

142142
let parameters = (ins
143-
"uint16_t" : $size,
143+
"uint32_t" : $size,
144144
"uint32_t" : $stride
145145
);
146146

include/aie/Dialect/AIE/IR/AIEOps.td

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -898,8 +898,8 @@ def AIE_DMABDOp: AIE_Op<"dma_bd", []> {
898898
int32_t getBufferElementTypeWidthInBytes() {
899899
return getBuffer().getType().getElementTypeBitWidth() / 8;
900900
}
901-
int32_t getLenInBytes() {
902-
if (std::optional<int32_t> len = getLen(); len.has_value())
901+
uint32_t getLenInBytes() {
902+
if (std::optional<uint32_t> len = getLen(); len.has_value())
903903
return len.value() * getBufferElementTypeWidthInBytes();
904904
else
905905
return getBuffer().getType().getNumElements() * getBufferElementTypeWidthInBytes();

lib/Dialect/AIEX/IR/AIEXDialect.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -443,9 +443,11 @@ LogicalResult AIEX::NpuPushQueueOp::verify() {
443443
LogicalResult AIEX::NpuWriteBdOp::verify() {
444444
const auto &targetModel = AIE::getTargetModel(*this);
445445
auto numBds = targetModel.getNumBDs(getColumn(), getRow());
446+
bool isLinearTransfer =
447+
(getD0Size() >= 1) && (getD1Size() == 1) && (getIterationSize() == 0);
446448
if (getBdId() > numBds)
447449
return emitOpError("BD ID exceeds the maximum ID.");
448-
if (getD0Size() > 0x3FF)
450+
if (!isLinearTransfer && getD0Size() > 0x3FF)
449451
return emitOpError("D0 Size exceeds the [0:1023] range.");
450452
if (getD0Stride() > 0xFFFFF)
451453
return emitOpError("D0 Stride exceeds the [0:1M-1] range.");

lib/Dialect/AIEX/Transforms/AIEDMATasksToNPU.cpp

Lines changed: 51 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -225,8 +225,8 @@ struct AIEDMATasksToNPUPass : AIEDMATasksToNPUBase<AIEDMATasksToNPUPass> {
225225

226226
uint32_t bd_id = bd_op.getBdId().value();
227227
int64_t offset = bd_op.getOffsetInBytes();
228-
uint32_t len = bd_op.getLenInBytes();
229-
uint32_t len_addr_granularity = len * 8 / addr_granularity;
228+
uint64_t len = bd_op.getLenInBytes();
229+
uint64_t len_addr_granularity = len * 8 / addr_granularity;
230230

231231
if (offset * 8 % addr_granularity != 0) {
232232
return bd_op->emitOpError("Offset must be aligned to ")
@@ -253,7 +253,15 @@ struct AIEDMATasksToNPUPass : AIEDMATasksToNPUBase<AIEDMATasksToNPUPass> {
253253
llvm::SmallVector<int64_t, 4>(4, 0);
254254
std::fill(padBefore.begin(), padBefore.end(), 0);
255255
std::fill(padAfter.begin(), padAfter.end(), 0);
256-
int d2size = 0;
256+
257+
auto d0size = 0;
258+
auto d0stride = 0;
259+
auto d1size = 0;
260+
auto d1stride = 0;
261+
auto d2size = 0;
262+
auto d2stride = 0;
263+
auto iteration_size = 0;
264+
auto iteration_stride = 0;
257265

258266
if (dims && dims->size() > 0) {
259267
llvm::SmallVector<int64_t, 4> input_sizes =
@@ -273,6 +281,12 @@ struct AIEDMATasksToNPUPass : AIEDMATasksToNPUBase<AIEDMATasksToNPUPass> {
273281
input_sizes[i] = (*dims)[j].getSize();
274282
input_strides[i] = (*dims)[j].getStride();
275283
}
284+
285+
// Do not check input_sizes[3] because a repeat can still be considered a
286+
// linear transfer
287+
bool isLinearTransfer = (input_sizes[0] >= 1) && (input_sizes[1] == 1) &&
288+
(input_sizes[2] == 1);
289+
276290
if (dims->size() > 2) {
277291
d2size = (target_model.isMemTile(tile.getCol(), tile.getRow()))
278292
? (*dims)[2].getSize()
@@ -302,16 +316,43 @@ struct AIEDMATasksToNPUPass : AIEDMATasksToNPUBase<AIEDMATasksToNPUPass> {
302316
}
303317
getHardwareStridesWraps(target_model, buffer_type, input_sizes,
304318
input_strides, sizes, strides);
319+
305320
if (failed(verifyStridesWraps(bd_op, buffer_type, tile.getCol(),
306321
tile.getRow(), input_sizes, input_strides,
307-
sizes, strides))) {
322+
sizes, strides, isLinearTransfer))) {
308323
return failure();
309324
}
325+
326+
iteration_size = sizes[3];
327+
iteration_stride = strides[3];
328+
329+
if (!isLinearTransfer) {
330+
// d0_size, d0_stride
331+
d0size = sizes[0];
332+
d0stride = strides[0];
333+
334+
// d1_size, d1_stride
335+
d1size = sizes[1];
336+
d1stride = strides[1];
337+
338+
// d2_stride
339+
d2stride = strides[2];
340+
// d2_size set elsewhere
341+
}
342+
if (input_sizes[3] > 1 && input_strides[3] == 0) {
343+
// We allow users to encode the repeat_count as a dimension 3 stride
344+
// of 0. This must lower to an iteration wrap of 0, so no stride is
345+
// ever added. We then repeat the BD using the repeat_count in
346+
// NpuPushQueueOp.
347+
iteration_size = 0;
348+
iteration_stride = 0;
349+
}
350+
310351
// Ensure the total transfer length and the length expressed in the lowest
311352
// three dimensions of strides/wraps agree. (Fourth dimension is
312353
// iteration/repeat count and repeats the whole BD, so should not be
313354
// incorporated in length of a single BD invocation.)
314-
uint32_t len_dims_addr_granularity = 1;
355+
uint64_t len_dims_addr_granularity = 1;
315356
for (size_t i = 0; i < 3; i++) {
316357
len_dims_addr_granularity *= sizes[i];
317358
}
@@ -352,11 +393,11 @@ struct AIEDMATasksToNPUPass : AIEDMATasksToNPUBase<AIEDMATasksToNPUPass> {
352393
bd_op.getLoc(), tile.getCol(), bd_id, len_addr_granularity, offset, 0,
353394
0, 0, 0,
354395
/* TODO: Strides/Wraps */
355-
/*d0_size=*/sizes[0], /*d0_stride=*/strides[0],
356-
/*d1_size=*/sizes[1], /*d1_stride=*/strides[1],
357-
/*d2_size=*/d2size, /*d2_stride=*/strides[2],
358-
/*iteration_current=*/0, /*iteration_size=*/sizes[3],
359-
/*iteration_stride=*/strides[3],
396+
/*d0_size=*/d0size, /*d0_stride=*/d0stride,
397+
/*d1_size=*/d1size, /*d1_stride=*/d1stride,
398+
/*d2_size=*/d2size, /*d2_stride=*/d2stride,
399+
/*iteration_current=*/0, /*iteration_size=*/iteration_size,
400+
/*iteration_stride=*/iteration_stride,
360401
/* TODO: Next BD */
361402
/*next_bd=*/next_bd_id,
362403
/*row=*/tile.getRow(),
@@ -368,7 +409,6 @@ struct AIEDMATasksToNPUPass : AIEDMATasksToNPUBase<AIEDMATasksToNPUPass> {
368409
/*d1_zero_before=*/padBefore[1], /*d2_zero_before=*/padBefore[2],
369410
/*d0_zero_after=*/padAfter[0], /*d1_zero_after=*/padAfter[1],
370411
/*d2_zero_after=*/padAfter[2]);
371-
372412
return setAddressForSingleBD(builder, bd_op, tile);
373413
}
374414

programming_examples/basic/dma_transpose/Makefile

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,14 @@ targetname = dmaTranspose
2020
M ?= 64
2121
K ?= 32
2222

23-
build/aie.mlir: ${srcdir}/aie2.py
23+
aie_py_src=aie2.py
24+
use_alt?=0
25+
26+
ifeq (${use_alt}, 1)
27+
aie_py_src=aie2_alt.py
28+
endif
29+
30+
build/aie.mlir: ${srcdir}/${aie_py_src}
2431
mkdir -p ${@D}
2532
python3 $< ${M} ${K} > $@
2633

Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
# dma_transpose/aie2_alt.py -*- Python -*-
2+
#
3+
# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
4+
# See https://llvm.org/LICENSE.txt for license information.
5+
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
#
7+
# (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates
8+
import argparse
9+
import numpy as np
10+
import sys
11+
12+
from aie.dialects.aie import *
13+
from aie.dialects.aiex import *
14+
from aie.extras.context import mlir_mod_ctx
15+
from aie.helpers.dialects.ext.scf import _for as range_
16+
from aie.helpers.tensortiler import TensorTile
17+
18+
19+
def my_passthrough(M, K, N, generate_access_map=False):
20+
tensor_ty = np.ndarray[(M, K), np.dtype[np.int32]]
21+
data_transform = TensorTile(
22+
(M, K), offset=0, sizes=[1, 1, K, M], strides=[1, 1, 1, K]
23+
)
24+
if generate_access_map:
25+
data_transform.visualize(
26+
show_arrows=True, plot_access_count=False, file_path="transpose_data.png"
27+
)
28+
return
29+
30+
with mlir_mod_ctx() as ctx:
31+
32+
@device(AIEDevice.npu1_1col)
33+
def device_body():
34+
# Tile declarations
35+
ShimTile = tile(0, 0)
36+
ComputeTile2 = tile(0, 2)
37+
38+
# AIE-array data movement with object fifos
39+
of_in = object_fifo("in", ShimTile, ComputeTile2, 2, tensor_ty)
40+
of_out = object_fifo("out", ComputeTile2, ShimTile, 2, tensor_ty)
41+
object_fifo_link(of_in, of_out)
42+
43+
# Set up compute tiles
44+
45+
# Compute tile 2
46+
@core(ComputeTile2)
47+
def core_body():
48+
for _ in range_(sys.maxsize):
49+
pass
50+
51+
# To/from AIE-array data movement
52+
@runtime_sequence(tensor_ty, tensor_ty, tensor_ty)
53+
def sequence(A, B, C):
54+
# The strides below are configured to read across all rows in the same column
55+
# Stride of K in dim/wrap 2 skips an entire row to read a full column
56+
in_task = shim_dma_single_bd_task(
57+
of_in, A, tensor_tile=data_transform, issue_token=True
58+
)
59+
out_task = shim_dma_single_bd_task(
60+
of_out, C, sizes=[1, 1, 1, N], issue_token=True
61+
)
62+
63+
dma_start_task(in_task, out_task)
64+
dma_await_task(in_task, out_task)
65+
66+
print(ctx.module)
67+
68+
69+
if __name__ == "__main__":
70+
p = argparse.ArgumentParser()
71+
p.add_argument("dims", help="M K", type=int, nargs="*", default=[64, 64])
72+
p.add_argument(
73+
"--generate-access-map",
74+
action="store_true",
75+
help="Produce a file showing data access order",
76+
)
77+
args = p.parse_args()
78+
79+
if len(args.dims) != 2:
80+
print(
81+
"ERROR: Must provide either no dimensions or both M and K", file=sys.stderr
82+
)
83+
exit(-1)
84+
my_passthrough(
85+
M=args.dims[0],
86+
K=args.dims[1],
87+
N=args.dims[0] * args.dims[1],
88+
generate_access_map=args.generate_access_map,
89+
)
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
// (c) Copyright 2024 Advanced Micro Devices, Inc.
2+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
3+
//
4+
// REQUIRES: ryzen_ai, peano
5+
//
6+
// RUN: mkdir -p test_alt
7+
// RUN: cd test_alt
8+
// RUN: make -f %S/Makefile clean
9+
// RUN: env use_alt=1 make -f %S/Makefile
10+
// RUN: %run_on_npu make -f %S/Makefile run | FileCheck %s
11+
// CHECK: PASS!
12+

programming_examples/basic/matrix_multiplication/cascade/Makefile

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,11 @@ n_aie_cols?=4
2222
kernels=mm_${m}x${k}x${n}
2323
aieargs+=-m $m -k $k -n $n --n-aie-cols ${n_aie_cols}
2424
target_suffix=${M}x${K}x${N}_${m}x${k}x${n}_${n_aie_cols}c
25+
use_alt?=0
26+
27+
ifeq (${use_alt}, 1)
28+
aie_py_src=aie2_alt.py
29+
endif
2530

2631
include ${srcdir}/../makefile-common
2732

0 commit comments

Comments
 (0)