Skip to content

Commit e4c1861

Browse files
[Reprogram][ControlCodeToTransactionBinary] Add support for DMA Start Op (#1331)
This commit adds support for converting DMA start ops to transaction binary. It also initializes the locks used within the control code before processing any other ops. This is being added to the AMDAIE dialect to make [DMA reprogramming](#1287) work. Signed-off-by: Abhishek Varma <[email protected]>
1 parent 342df56 commit e4c1861

File tree

7 files changed

+259
-21
lines changed

7 files changed

+259
-21
lines changed

compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AMDAIERT.cpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -205,20 +205,20 @@ LogicalResult configureLocksAndBd(Block &block, const TileLoc &tileLoc,
205205
return bufferOp.emitError("buffer must have address assigned");
206206
// Convert `xilinx::AIE::BDDimLayoutAttr` to
207207
// `mlir::iree_compiler::AMDAIE::BDDimLayout`.
208-
std::optional<std::vector<BDDimLayout>> maybeDims;
208+
std::optional<SmallVector<BDDimLayout>> maybeDims;
209209
if (std::optional<std::vector<BDDimLayoutAttr>> dims = bdOp.getDimensions()) {
210-
maybeDims = std::vector<BDDimLayout>{};
210+
maybeDims = SmallVector<BDDimLayout>{};
211211
for (const BDDimLayoutAttr &dim : (*dims)) {
212212
maybeDims->emplace_back(BDDimLayout{dim.getSize(), dim.getStride()});
213213
}
214214
}
215215

216216
// Convert `xilinx::AIE::BDPadLayoutAttr` to
217217
// `mlir::iree_compiler::AMDAIE::BDPadLayout`.
218-
std::optional<std::vector<BDPadLayout>> maybePadDims;
218+
std::optional<SmallVector<BDPadLayout>> maybePadDims;
219219
if (std::optional<std::vector<BDPadLayoutAttr>> dims =
220220
bdOp.getPadDimensions()) {
221-
maybePadDims = std::vector<BDPadLayout>{};
221+
maybePadDims = SmallVector<BDPadLayout>{};
222222
for (const BDPadLayoutAttr &dim : (*dims)) {
223223
maybePadDims->emplace_back(
224224
BDPadLayout{dim.getConstPadBefore(), dim.getConstPadAfter()});

compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEControlCodeToTransaction.cpp

Lines changed: 22 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -63,20 +63,35 @@ LogicalResult convertOp(AMDAIE::NpuWriteBdOp op, TransactionBuilder &builder) {
6363
return success();
6464
}
6565

66+
// Converts an `AMDAIE::DMAStartOp` by forwarding it to
// `TransactionBuilder::appendDmaStartOp` and returning that call's result.
LogicalResult convertOp(AMDAIE::DMAStartOp op, TransactionBuilder &builder) {
67+
return builder.appendDmaStartOp(op);
68+
}
69+
6670
LogicalResult controlCodeToTransaction(IRRewriter &rewriter,
6771
AMDAIE::ControlCodeOp controlCodeOp,
6872
TransactionBuilder &builder) {
6973
SmallVector<Operation *> toBeErased;
70-
WalkResult res = controlCodeOp->walk([&](Operation *op) {
74+
DenseSet<AMDAIE::LockOp> lockOps;
75+
// All locks used within control code are initialized before converting other
76+
// ops to transaction binary.
77+
WalkResult res = controlCodeOp->walk([&](AMDAIE::UseLockOp op) {
78+
auto lockOp = op.getLock().getDefiningOp<AMDAIE::LockOp>();
79+
if (lockOps.contains(lockOp)) return WalkResult::advance();
80+
if (failed(builder.appendLockOp(lockOp))) return WalkResult::interrupt();
81+
lockOps.insert(lockOp);
82+
return WalkResult::advance();
83+
});
84+
if (res.wasInterrupted()) return failure();
85+
res = controlCodeOp->walk([&](Operation *op) {
7186
LogicalResult switchResult =
7287
TypeSwitch<Operation *, LogicalResult>(op)
7388
.Case<AMDAIE::NpuAddressPatchOp, AMDAIE::NpuTctSyncOp,
74-
AMDAIE::NpuPushToQueueOp, AMDAIE::NpuWriteBdOp>(
75-
[&](auto npuOp) {
76-
if (failed(convertOp(npuOp, builder))) return failure();
77-
toBeErased.push_back(npuOp);
78-
return success();
79-
})
89+
AMDAIE::NpuPushToQueueOp, AMDAIE::NpuWriteBdOp,
90+
AMDAIE::DMAStartOp>([&](auto npuOp) {
91+
if (failed(convertOp(npuOp, builder))) return failure();
92+
toBeErased.push_back(npuOp);
93+
return success();
94+
})
8095
.Default([&](Operation *) { return success(); });
8196
if (failed(switchResult)) return WalkResult::interrupt();
8297
return WalkResult::advance();

compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIETransactionBuilder.cpp

Lines changed: 119 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,123 @@ LogicalResult TransactionBuilder::appendAddressPatch(uint32_t addr,
6767
return configureCustomTxnOp(deviceModel, opCode, data, size);
6868
}
6969

70+
// Initializes the lock described by `lockOp` through `initializeLock`.
// The lock's tile location is built from the constant column/row of the
// `AMDAIE::TileOp` that defines the lock's tile operand. Emits an op error on
// the tile op and fails if either coordinate is not a constant integer, and
// fails if `initializeLock` fails.
LogicalResult TransactionBuilder::appendLockOp(AMDAIE::LockOp lockOp) {
71+
// NOTE(review): `tile` is used below without checking that `getDefiningOp`
// actually found a `TileOp` - confirm the tile operand is always produced by
// one.
auto tile = lockOp.getTile().getDefiningOp<AMDAIE::TileOp>();
72+
std::optional<int64_t> maybeCol = getConstantIntValue(tile.getCol());
73+
std::optional<int64_t> maybeRow = getConstantIntValue(tile.getRow());
74+
if (!maybeCol || !maybeRow) {
75+
return tile->emitOpError()
76+
<< "expected column and row integer value/constant";
77+
}
78+
XAie_LocType tileLoc = XAie_TileLoc(*maybeCol, *maybeRow);
79+
// The lock id and init value are narrowed to 8 bits here.
// NOTE(review): `getInitValue()` is dereferenced without a has-value check -
// confirm every lock reaching this point carries an init value.
Lock lock{tileLoc, static_cast<uint8_t>(lockOp.getValue()),
80+
static_cast<int8_t>(*lockOp.getInitValue())};
81+
if (failed(initializeLock(deviceModel, lock))) return failure();
82+
return success();
83+
}
84+
85+
// Translates one `AMDAIE::DMAStartOp` into aie-rt configuration calls, in this
// order: `configureDMALocks`, `configureDMABD`, `configurePushToBdQueue`.
// Returns failure if any of those calls (or `initDMADesc`) fails, if the tile's
// column/row are not constant integers, if the op does not contain exactly one
// `DMABDOp`, or if the BD id, the defining buffer op, the buffer address or the
// BD dimensions are missing.
// NOTE(review): apart from the tile column/row check, every failure path below
// returns `failure()` without emitting a diagnostic.
LogicalResult TransactionBuilder::appendDmaStartOp(
86+
AMDAIE::DMAStartOp dmaStartOp) {
87+
// Resolve the tile location and create the DMA descriptor for that tile.
// NOTE(review): `tile` is used without checking that `getDefiningOp` found a
// `TileOp` - confirm the tile operand is always produced by one.
88+
auto tile = dmaStartOp.getTile().getDefiningOp<AMDAIE::TileOp>();
89+
std::optional<int64_t> maybeCol = getConstantIntValue(tile.getCol());
90+
std::optional<int64_t> maybeRow = getConstantIntValue(tile.getRow());
91+
if (!maybeCol || !maybeRow) {
92+
return tile->emitOpError()
93+
<< "expected column and row integer value/constant";
94+
}
95+
XAie_LocType tileLoc = XAie_TileLoc(*maybeCol, *maybeRow);
96+
FailureOr<XAie_DmaDesc> dmaDesc = initDMADesc(deviceModel, tileLoc);
97+
if (failed(dmaDesc)) return failure();
98+
99+
auto dmaBdOps = dmaStartOp.getOps<AMDAIE::DMABDOp>();
100+
// We currently expect only one DMABDOp within DMAStartOp.
101+
if (std::distance(dmaBdOps.begin(), dmaBdOps.end()) != 1) return failure();
102+
// Configure DMA locks from the `UseLockOp`s that share a block with the single
// DMABDOp. Every `UseLockOp` in that block is visited and later matches
// overwrite earlier ones, so the last acquire and the last release win.
103+
AMDAIE::DMABDOp dmaBdOp = *dmaBdOps.begin();
104+
Block *parentBlock = dmaBdOp->getBlock();
105+
std::optional<int> acqValue, relValue, acqLockId, relLockId;
106+
for (AMDAIE::UseLockOp useLockOp : parentBlock->getOps<AMDAIE::UseLockOp>()) {
107+
// NOTE(review): `lockOp` is used without a null check - confirm the lock
// operand is always defined by a `LockOp`.
auto lockOp = useLockOp.getLock().getDefiningOp<AMDAIE::LockOp>();
108+
if (useLockOp.getAction() == AMDAIE::LockAction::AcquireGreaterOrEqual ||
109+
useLockOp.getAction() == AMDAIE::LockAction::Acquire) {
110+
acqValue = useLockOp.getValue();
111+
// The acquire value is negated for `AcquireGreaterOrEqual`; presumably this
// is how `configureDMALocks` distinguishes it from `Acquire` - TODO confirm.
if (useLockOp.getAction() == AMDAIE::LockAction::AcquireGreaterOrEqual)
112+
acqValue.value() = -acqValue.value();
113+
acqLockId = lockOp.getValue();
114+
} else if (useLockOp.getAction() == AMDAIE::LockAction::Release) {
115+
relValue = useLockOp.getValue();
116+
relLockId = lockOp.getValue();
117+
}
118+
}
119+
// Fall back to lock id 0 / value 0 for an acquire or release that was not
// found in the block.
120+
if (!acqLockId) {
121+
acqLockId = 0;
122+
acqValue = 0;
123+
}
124+
if (!relLockId) {
125+
relLockId = 0;
126+
relValue = 0;
127+
}
128+
// NOTE(review): after the two defaulting blocks above all four optionals
// always hold a value (each value is assigned together with its lock id in
// the loop), so this assert cannot fire and its message does not describe an
// enforced requirement.
assert(acqValue && relValue && acqLockId && relLockId &&
129+
"expected both use_lock(acquire) and use_lock(release) with bd");
130+
// NOTE(review): `acqEn` is passed as `true` unconditionally, including when
// the acquire lock id/value were defaulted to 0 above - confirm this is the
// intended way to handle a BD without an acquire.
if (failed(configureDMALocks(deviceModel, dmaDesc.value(), tileLoc, *acqValue,
131+
*relValue, *acqLockId, *relLockId,
132+
/*acqEn=*/true))) {
133+
return failure();
134+
}
135+
// Pull metadata related to packet routing, bdId, buffer length, size,
136+
// stride to pass to aie-rt.
// Packet routing is left disabled here: `enablePacket` is false and the packet
// type/id optionals stay empty.
137+
std::optional<uint32_t> bdId = dmaBdOp.getBdId();
138+
if (!bdId) return failure();
139+
bool validBd = true;
140+
std::optional<uint8_t> packetType;
141+
std::optional<uint8_t> packetID;
142+
bool enablePacket = false;
143+
144+
// The BD's base address comes from the `BufferOp` defining the BD's buffer
// operand; both the op and its address must be present.
auto bufferOp = dmaBdOp.getBuffer().getDefiningOp<AMDAIE::BufferOp>();
145+
if (!bufferOp) return failure();
146+
std::optional<uint32_t> baseAddr = bufferOp.getAddress();
147+
if (!baseAddr) return failure();
148+
149+
// Dimensions are required; each attribute is converted to a `BDDimLayout`
// holding its size and stride.
std::optional<llvm::ArrayRef<BDDimLayoutAttr>> dims = dmaBdOp.getDimensions();
150+
if (!dims) return failure();
151+
std::optional<SmallVector<BDDimLayout>> maybeDims;
152+
maybeDims = llvm::map_to_vector(*dims, [](BDDimLayoutAttr attr) {
153+
return BDDimLayout{attr.getSize(), attr.getStride()};
154+
});
155+
// Pad dimensions are never populated here, so an empty optional is passed on.
std::optional<SmallVector<BDPadLayout>> maybePadDims;
156+
157+
// Chain to a next BD only when the op carries a next-BD id (narrowed to 8
// bits).
bool enableNextBd = dmaBdOp.getNextBdId().has_value();
158+
std::optional<uint8_t> nextBdId =
159+
enableNextBd
160+
? std::optional<uint8_t>{static_cast<uint8_t>(*dmaBdOp.getNextBdId())}
161+
: std::nullopt;
162+
// No iteration layout is passed for this BD.
std::optional<BDIterLayout> maybeIter = std::nullopt;
163+
if (failed(configureDMABD(deviceModel, dmaDesc.value(), tileLoc, validBd,
164+
static_cast<uint8_t>(*bdId), enableNextBd, nextBdId,
165+
enablePacket, packetType, packetID, *baseAddr,
166+
dmaBdOp.getLenInBytes(), dmaBdOp.getOffsetInBytes(),
167+
dmaBdOp.getBufferElementTypeWidthInBytes(),
168+
maybeDims, maybePadDims, maybeIter))) {
169+
return failure();
170+
}
171+
172+
// Configure push to BD queue.
173+
// TODO: Generalize it as this is currently hardcoded to only shim side for
174+
// now.
175+
int chNum = dmaStartOp.getChannelIndex();
176+
auto channelDir = static_cast<DMAChannelDir>(dmaStartOp.getChannelDir());
177+
// A token is requested only for MM2S channels on row 0 (presumably the shim
// row, per the TODO above - confirm).
bool issueToken = tileLoc.Row == 0 && channelDir == DMAChannelDir::MM2S;
178+
bool setChannelEnable = true;
179+
// `dmaBdOp.getBdId().value()` is the same BD id already checked via `bdId`
// above.
if (failed(configurePushToBdQueue(
180+
deviceModel, tileLoc, chNum, channelDir, dmaBdOp.getBdId().value(),
181+
dmaStartOp.getRepeatCount(), issueToken, setChannelEnable))) {
182+
return failure();
183+
}
184+
return success();
185+
}
186+
70187
LogicalResult TransactionBuilder::appendTCTSync(uint32_t col, uint32_t row,
71188
uint32_t direction,
72189
uint32_t rowNum,
@@ -126,11 +243,11 @@ LogicalResult TransactionBuilder::appendWriteBdOp(
126243
uint32_t minStrideBitWidth = deviceModel.getMinStrideBitWidth();
127244
uint32_t bufferElementTypeWidthInBytes = minStrideBitWidth / 8;
128245
uint32_t bufferLengthInBytes = bufferLength * bufferElementTypeWidthInBytes;
129-
std::vector<BDDimLayout> dims = {
246+
SmallVector<BDDimLayout> dims = {
130247
{static_cast<uint16_t>(sizes[0]), static_cast<uint32_t>(strides[0])},
131248
{static_cast<uint16_t>(sizes[1]), static_cast<uint32_t>(strides[1])},
132249
{static_cast<uint16_t>(sizes[2]), static_cast<uint32_t>(strides[2])}};
133-
std::optional<std::vector<BDPadLayout>> pads = std::nullopt;
250+
std::optional<SmallVector<BDPadLayout>> pads = std::nullopt;
134251
BDIterLayout iter = {iterationStride, static_cast<uint8_t>(iterationSize),
135252
static_cast<uint8_t>(iterationCurrent)};
136253
return configureDMABD(deviceModel, dmaTileBd.value(), tileLoc, validBd, bdId,

compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIETransactionBuilder.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
#ifndef IREE_AMD_AIE_TRANSFORMS_AMDAIETRANSACTIONBUILDER_H_
88
#define IREE_AMD_AIE_TRANSFORMS_AMDAIETRANSACTIONBUILDER_H_
99

10+
#include "iree-amd-aie/IR/AMDAIEOps.h"
1011
#include "iree-amd-aie/aie_runtime/AMDAIEEnums.h"
1112
#include "iree-amd-aie/aie_runtime/iree_aie_configure.h"
1213
#include "iree-amd-aie/aie_runtime/iree_aie_runtime.h"
@@ -28,6 +29,9 @@ class TransactionBuilder {
2829
LogicalResult appendAddressPatch(uint32_t addr, uint32_t argIdx,
2930
uint32_t offset);
3031

32+
LogicalResult appendLockOp(AMDAIE::LockOp lockOp);
33+
LogicalResult appendDmaStartOp(AMDAIE::DMAStartOp dmaStartOp);
34+
3135
LogicalResult appendTCTSync(uint32_t col, uint32_t row, uint32_t direction,
3236
uint32_t rowNum, uint32_t colNum,
3337
uint32_t channel);

compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/controlcode_to_transaction.mlir

Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -271,3 +271,105 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
271271
return
272272
}
273273
}
274+
275+
// -----
276+
277+
// CHECK: 0x06030100
278+
// CHECK: 0x00000104
279+
// CHECK: 0x00000008
280+
// CHECK: 0x00000100
281+
// CHECK: 0x00200100
282+
// CHECK: 0x00000000
283+
// CHECK: 0x001C0020
284+
// CHECK: 0x00000000
285+
// CHECK: 0x00000001
286+
// CHECK: 0x00000018
287+
// CHECK: 0x00300100
288+
// CHECK: 0x00000000
289+
// CHECK: 0x001C0030
290+
// CHECK: 0x00000000
291+
// CHECK: 0x00000000
292+
// CHECK: 0x00000018
293+
// CHECK: 0x00000101
294+
// CHECK: 0x00000000
295+
// CHECK: 0x001A0000
296+
// CHECK: 0x00000030
297+
// CHECK: 0x00000400
298+
// CHECK: 0x00024000
299+
// CHECK: 0x00400000
300+
// CHECK: 0x0040001F
301+
// CHECK: 0x00000000
302+
// CHECK: 0x00000000
303+
// CHECK: 0x00000000
304+
// CHECK: 0x8143FF42
305+
// CHECK: 0x00140100
306+
// CHECK: 0x00000000
307+
// CHECK: 0x001A0614
308+
// CHECK: 0x00000000
309+
// CHECK: 0x00000000
310+
// CHECK: 0x00000018
311+
// CHECK: 0x00100100
312+
// CHECK: 0x00000000
313+
// CHECK: 0x001A0610
314+
// CHECK: 0x00000000
315+
// CHECK: 0x00000001
316+
// CHECK: 0x00000018
317+
// CHECK: 0x00000101
318+
// CHECK: 0x00000000
319+
// CHECK: 0x001A0000
320+
// CHECK: 0x00000030
321+
// CHECK: 0x00000400
322+
// CHECK: 0x00024000
323+
// CHECK: 0x00400000
324+
// CHECK: 0x0040001F
325+
// CHECK: 0x00000000
326+
// CHECK: 0x00000000
327+
// CHECK: 0x00000000
328+
// CHECK: 0x8142FF43
329+
// CHECK: 0x00340100
330+
// CHECK: 0x00000000
331+
// CHECK: 0x001A0634
332+
// CHECK: 0x00000000
333+
// CHECK: 0x00000000
334+
// CHECK: 0x00000018
335+
// CHECK: 0x00300100
336+
// CHECK: 0x00000000
337+
// CHECK: 0x001A0630
338+
// CHECK: 0x00000000
339+
// CHECK: 0x00000001
340+
// CHECK: 0x00000018
341+
// CHECK-LABEL: @dma_start
342+
// CHECK: npu_instructions = dense_resource<npu_instructions> : tensor<64xui32>
343+
#executable_target_amdaie_pdi_fb = #hal.executable.target<"amd-aie", "amdaie-pdi-fb", {num_cols = 1 : i32, num_rows = 1 : i32, target_device = "npu1_4col", ukernels = "none"}>
344+
module attributes {hal.executable.target = #executable_target_amdaie_pdi_fb} {
345+
func.func @dma_start() {
346+
%c0 = arith.constant 0 : index
347+
%c1 = arith.constant 1 : index
348+
amdaie.workgroup {
349+
%tile_0_1 = amdaie.tile(%c0, %c1)
350+
%buffer = amdaie.buffer(%tile_0_1) {address = 65536 : i32, mem_bank = 1 : ui32, sym_name = "_anonymous1"} : memref<1024xi32, 1 : i32>
351+
%lock = amdaie.lock(%tile_0_1(2), 1)
352+
%lock_0 = amdaie.lock(%tile_0_1(3), 0)
353+
amdaie.controlcode {
354+
%0 = amdaie.dma_start(%tile_0_1, S2MM, 2) {
355+
amdaie.use_lock(%lock, AcquireGreaterOrEqual(1))
356+
amdaie.dma_bd(%buffer : memref<1024xi32, 1 : i32>) {bd_id = 0 : i32, dimensions = #amdaie<bd_dim_layout_array[<size = 32, stride = 32>, <size = 32, stride = 1>]>, len = 1024 : i32}
357+
amdaie.use_lock(%lock_0, Release(1))
358+
amdaie.next_bd ^bb1
359+
^bb1: // pred: ^bb0
360+
amdaie.end
361+
}
362+
%1 = amdaie.dma_start(%tile_0_1, MM2S, 0) {
363+
amdaie.use_lock(%lock_0, AcquireGreaterOrEqual(1))
364+
amdaie.dma_bd(%buffer : memref<1024xi32, 1 : i32>) {bd_id = 0 : i32, dimensions = #amdaie<bd_dim_layout_array[<size = 32, stride = 32>, <size = 32, stride = 1>]>, len = 1024 : i32}
365+
amdaie.use_lock(%lock, Release(1))
366+
amdaie.next_bd ^bb1
367+
^bb1: // pred: ^bb0
368+
amdaie.end
369+
}
370+
amdaie.end
371+
}
372+
}
373+
return
374+
}
375+
}

runtime/src/iree-amd-aie/aie_runtime/iree_aie_configure.cc

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -58,8 +58,8 @@ LogicalResult configureDMABD(
5858
std::optional<uint8_t> packetType, std::optional<uint8_t> packetId,
5959
uint64_t baseAddr, uint64_t lenInBytes, uint64_t offsetInBytes,
6060
uint32_t bufferElementTypeWidthInBytes,
61-
const std::optional<std::vector<BDDimLayout>> &maybeDims,
62-
const std::optional<std::vector<BDPadLayout>> &maybePadDims,
61+
const std::optional<SmallVector<BDDimLayout>> &maybeDims,
62+
const std::optional<SmallVector<BDPadLayout>> &maybePadDims,
6363
const std::optional<BDIterLayout> &maybeIter) {
6464
assert(dmaDesc.IsReady == XAIE_COMPONENT_IS_READY &&
6565
"XAie_DmaDescs need to be created using initDMADesc");
@@ -98,8 +98,8 @@ LogicalResult configureDMABD(
9898
// Pass down dimensions in reverse order; in the MLIR, this allows
9999
// us to specify step sizes/strides in the same order as we would for
100100
// RankedTensorType/MemRefType.
101-
uint16_t size = dims->at(i).size;
102-
uint32_t stride = dims->at(i).stride;
101+
uint16_t size = (*dims)[i].size;
102+
uint32_t stride = (*dims)[i].stride;
103103
size_t j = dims->size() - i - 1;
104104
if (j > 0) {
105105
if (stride * bufferElementTypeWidthInBytes % 4 != 0) {
@@ -139,8 +139,8 @@ LogicalResult configureDMABD(
139139
dmaPadTensor.NumDim = padDims->size();
140140
dmaPadTensor.PadDesc = new XAie_PadDesc[dmaPadTensor.NumDim];
141141
for (size_t i = 0; i < padDims->size(); i++) {
142-
uint8_t before = padDims->at(i).const_pad_before;
143-
uint8_t after = padDims->at(i).const_pad_after;
142+
uint8_t before = (*padDims)[i].const_pad_before;
143+
uint8_t after = (*padDims)[i].const_pad_after;
144144
size_t j = padDims->size() - i - 1;
145145
if (j == 0) {
146146
if (before * bufferElementTypeWidthInBytes % 4 != 0) {

runtime/src/iree-amd-aie/aie_runtime/iree_aie_configure.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -164,8 +164,8 @@ LogicalResult configureDMABD(
164164
std::optional<uint8_t> packetType, std::optional<uint8_t> packetId,
165165
uint64_t baseAddr, uint64_t lenInBytes, uint64_t offsetInBytes,
166166
uint32_t bufferElementTypeWidthInBytes,
167-
const std::optional<std::vector<BDDimLayout>> &maybeDims,
168-
const std::optional<std::vector<BDPadLayout>> &maybePadDims,
167+
const std::optional<SmallVector<BDDimLayout>> &maybeDims,
168+
const std::optional<SmallVector<BDPadLayout>> &maybePadDims,
169169
const std::optional<BDIterLayout> &maybeIter);
170170

171171
/// Configures/sets up locks associated with a dma (actually the bd...).

0 commit comments

Comments
 (0)