Skip to content

Commit c004041

Browse files
authored
Rename func.func @sequence to its own op (Xilinx#1640)
1 parent d9612b9 commit c004041

File tree

97 files changed

+259
-223
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

97 files changed

+259
-223
lines changed

include/aie/Dialect/AIEX/IR/AIEX.td

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ include "mlir/IR/EnumAttr.td"
2020
include "mlir/IR/SymbolInterfaces.td"
2121
include "mlir/Interfaces/CallInterfaces.td"
2222
include "mlir/Interfaces/SideEffectInterfaces.td"
23+
include "mlir/IR/CommonAttrConstraints.td"
2324

2425
def AIEX_Dialect : Dialect {
2526
let name = "aiex";
@@ -463,6 +464,25 @@ def AIE_SelectOp: AIEX_Op<"select", []>, Results<(outs Index)> {
463464
];
464465
}
465466

467+
// Op holding the runtime (re-)configuration instructions that execute on the
// AI Engine array's configuration co-processor. NoTerminator: the body region
// needs no explicit terminator op. HasParent: may only appear directly inside
// an aie.device op (also checked again in the C++ verifier).
def AIE_RuntimeSequenceOp : AIEX_Op<"runtime_sequence", [NoTerminator, HasParent<"AIE::DeviceOp">]> {
  let summary = "Program the configuration co-processor of the AI Engine array";
  let description = [{
    Instructions in this operation allow for runtime (re-)configuration of the AI Engine array, such as configuring data movement buffer descriptors.
    These instructions will execute on the configuration co-processor of the AI Engine array.

    Typically, these instructions include configuring the data transfers between host and AIE array on the shims.
    The input arguments are arguments passed in from the host at kernel invocation time. This may include buffers on the host.
  }];
  // Host-supplied kernel-invocation arguments (e.g. host buffers).
  let arguments = (ins Variadic<AnyType>:$args);
  // Single body region whose entry-block arguments mirror $args.
  let regions = (region AnyRegion:$body);
  // Assembly syntax and verification are implemented in C++
  // (RuntimeSequenceOp::parse/print/verify).
  let hasCustomAssemblyFormat = 1;
  let hasVerifier = 1;
}
485+
466486
def AIE_NpuDmaMemcpyNdOp: AIEX_Op<"npu.dma_memcpy_nd", [
467487
AttrSizedOperandSegments,
468488
MyOffsetSizeAndStrideOpInterface

lib/Dialect/AIEX/IR/AIEXDialect.cpp

Lines changed: 69 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -276,4 +276,72 @@ LogicalResult AIEX::NpuWriteBdOp::verify() {
276276
if (getIterationStride() > 0xFFFFF)
277277
return emitOpError("Iteration Stride exceeds the [0:1M-1] range.");
278278
return success();
279-
}
279+
}
280+
281+
//===----------------------------------------------------------------------===//
282+
// RuntimeSequenceOp
283+
//===----------------------------------------------------------------------===//
284+
285+
/// Parses the custom assembly form of aiex.runtime_sequence:
/// a parenthesized entry-argument list followed by the body region,
/// e.g. `aiex.runtime_sequence(%addr: memref<1xi32>) { ... }`.
ParseResult AIEX::RuntimeSequenceOp::parse(OpAsmParser &parser,
                                           OperationState &result) {
  SmallVector<OpAsmParser::Argument> entryArgs;

  // Each entry argument must carry a type (and may carry attributes),
  // hence both flags below are set.
  auto parseOneArgument = [&]() -> ParseResult {
    OpAsmParser::Argument arg;
    if (failed(parser.parseArgument(arg, /*allowType=*/true,
                                    /*allowAttrs=*/true)))
      return failure();
    entryArgs.push_back(arg);
    return success();
  };
  if (failed(parser.parseCommaSeparatedList(OpAsmParser::Delimiter::Paren,
                                            parseOneArgument)))
    return failure();

  // Parse the body region; its entry-block arguments are the ones
  // collected above. Name shadowing of enclosing SSA values is not allowed.
  Region *body = result.addRegion();
  if (failed(parser.parseRegion(*body, entryArgs,
                                /*enableNameShadowing=*/false)))
    return failure();

  return success();
}
312+
313+
/// Prints the custom assembly form of aiex.runtime_sequence: the entry-block
/// arguments of the body region in parentheses, then the region itself
/// (without repeating the entry-block arguments).
void AIEX::RuntimeSequenceOp::print(OpAsmPrinter &printer) {
  Region &body = getRegion();

  printer << '(';
  const unsigned numArgs = body.getNumArguments();
  for (unsigned idx = 0; idx != numArgs; ++idx) {
    if (idx != 0)
      printer << ", ";
    printer.printRegionArgument(body.getArgument(idx));
  }
  printer << ") ";

  // Entry-block arguments were already printed above.
  printer.printRegion(body, /*printEntryBlockArgs=*/false,
                      /*printBlockTerminators=*/true);
}
328+
329+
/// Verifies structural invariants of aiex.runtime_sequence:
///  - the op is nested inside an aie.device op;
///  - the device contains at most one runtime sequence.
LogicalResult AIEX::RuntimeSequenceOp::verify() {
  AIE::DeviceOp device = (*this)->getParentOfType<AIE::DeviceOp>();
  if (!device) {
    // This check is redundant with the HasParent trait, but can't hurt.
    // emitOpError returns an InFlightDiagnostic, which converts to failure().
    return (*this)->emitOpError() << "must be inside AIE device operation.";
  }
  auto seqOps = device.getOps<AIEX::RuntimeSequenceOp>();
  if (std::distance(seqOps.begin(), seqOps.end()) > 1) {
    // Report the error on the device and attach a note at every sequence
    // definition so the user can locate all offenders.
    auto err = device.emitOpError()
               << "Cannot have more than one runtime sequence per device.";
    for (AIEX::RuntimeSequenceOp seqOp : seqOps)
      err.attachNote(seqOp.getLoc()) << "Sequence operation definition here.";
    return failure();
  }
  return success();
}

lib/Dialect/AIEX/Transforms/AIEDmaToNpu.cpp

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,6 @@
1212
#include "aie/Dialect/AIEX/IR/AIEXDialect.h"
1313
#include "aie/Dialect/AIEX/Transforms/AIEXPasses.h"
1414

15-
#include "mlir/Dialect/Func/IR/FuncOps.h"
1615
#include "mlir/Pass/Pass.h"
1716
#include "mlir/Transforms/DialectConversion.h"
1817
#include "llvm/ADT/DenseMap.h"
@@ -227,7 +226,11 @@ struct DmaToNpuPattern : OpConversionPattern<NpuDmaMemcpyNdOp> {
227226
column = IntegerAttr::get(i32ty, col);
228227

229228
// arg_idx
230-
Block &entryBB = op->getParentOfType<func::FuncOp>().getBody().front();
229+
AIEX::RuntimeSequenceOp seq_op =
230+
op->getParentOfType<AIEX::RuntimeSequenceOp>();
231+
assert(seq_op && "NpuDmaMemcpyNdOp must be inside a RuntimeSequenceOp; "
232+
"verify() should have ensured this.");
233+
Block &entryBB = seq_op.getBody().front();
231234
int arg_idx = -1;
232235
for (int i = 0, e = entryBB.getNumArguments(); i < e; i++) {
233236
if (entryBB.getArgument(i) == memref) {
@@ -448,7 +451,8 @@ struct WriteBdToBlockWritePattern : OpConversionPattern<NpuWriteBdOp> {
448451
{
449452
OpBuilder::InsertionGuard guard(rewriter);
450453
std::string name = "blockwrite_data_";
451-
rewriter.setInsertionPoint(op->getParentOfType<func::FuncOp>());
454+
rewriter.setInsertionPoint(
455+
op->getParentOfType<AIEX::RuntimeSequenceOp>());
452456
int id = 0;
453457
while (dev.lookupSymbol(name + std::to_string(id)))
454458
id++;

lib/Targets/AIETargetHSA.cpp

Lines changed: 13 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -68,23 +68,15 @@ mlir::LogicalResult AIETranslateToHSA(ModuleOp module, raw_ostream &output) {
6868
// Putting the standard header
6969
output << hsa_cpp_file_header;
7070

71-
// Getting the func op which has the data movement
72-
if (targetOp.getOps<mlir::func::FuncOp>().empty()) {
73-
return success();
74-
}
75-
7671
// Getting the sequence function op which contains the instructions
77-
mlir::func::FuncOp funcOp = NULL;
78-
for (auto op : targetOp.getOps<mlir::func::FuncOp>()) {
79-
if (op.getName().str().compare("sequence") == 0) {
80-
funcOp = op;
81-
}
82-
}
83-
84-
// If no funcOp then just return
85-
if (funcOp == NULL) {
72+
auto sequenceOps = targetOp.getOps<AIEX::RuntimeSequenceOp>();
73+
if (sequenceOps.empty()) {
74+
// If no sequenceOp then just return
8675
return success();
76+
} else if (std::distance(sequenceOps.begin(), sequenceOps.end()) > 1) {
77+
return module.emitOpError("expected at most one sequence operation");
8778
}
79+
AIEX::RuntimeSequenceOp sequenceOp = *sequenceOps.begin();
8880

8981
collectTiles(targetOp, tiles);
9082
collectBuffers(targetOp, buffers);
@@ -95,10 +87,11 @@ mlir::LogicalResult AIETranslateToHSA(ModuleOp module, raw_ostream &output) {
9587
// Looping over every Memcpy operation so we take the correct number of
9688
// buffers
9789
int num_ops = 0;
98-
for (auto op : funcOp.getOps<NpuDmaMemcpyNdOp>()) {
90+
for (auto op : sequenceOp.getOps<NpuDmaMemcpyNdOp>()) {
9991
// Getting the IDs of the buffers
10092
auto memref = op.getMemref();
101-
Block &entryBB = op->getParentOfType<func::FuncOp>().getBody().front();
93+
Block &entryBB =
94+
op->getParentOfType<AIEX::RuntimeSequenceOp>().getBody().front();
10295
int arg_idx = -1;
10396
for (int i = 0, e = entryBB.getNumArguments(); i < e; i++) {
10497
if (entryBB.getArgument(i) == memref) {
@@ -117,8 +110,8 @@ mlir::LogicalResult AIETranslateToHSA(ModuleOp module, raw_ostream &output) {
117110
output << "\tuint64_t packet_id = 0;\n";
118111

119112
int op_count = 0;
120-
for (auto op : funcOp.getOps<NpuDmaMemcpyNdOp>()) {
121-
auto dev = funcOp->getParentOfType<AIE::DeviceOp>();
113+
for (auto op : sequenceOp.getOps<NpuDmaMemcpyNdOp>()) {
114+
auto dev = sequenceOp->getParentOfType<AIE::DeviceOp>();
122115
if (!dev) {
123116
op.emitOpError("couldn't get DeviceOp");
124117
return failure();
@@ -163,7 +156,8 @@ mlir::LogicalResult AIETranslateToHSA(ModuleOp module, raw_ostream &output) {
163156

164157
// Getting the ID of the buffer that we are using
165158
auto memref = op.getMemref();
166-
Block &entryBB = op->getParentOfType<func::FuncOp>().getBody().front();
159+
Block &entryBB =
160+
op->getParentOfType<AIEX::RuntimeSequenceOp>().getBody().front();
167161
int arg_idx = -1;
168162
for (int i = 0, e = entryBB.getNumArguments(); i < e; i++) {
169163
if (entryBB.getArgument(i) == memref) {

lib/Targets/AIETargetNPU.cpp

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -163,12 +163,10 @@ std::vector<uint32_t> xilinx::AIE::AIETranslateToNPU(ModuleOp module) {
163163
words[1] = 0x00000105;
164164

165165
DeviceOp deviceOp = *module.getOps<DeviceOp>().begin();
166-
auto funcOps = deviceOp.getOps<func::FuncOp>();
166+
auto sequenceOps = deviceOp.getOps<AIEX::RuntimeSequenceOp>();
167167
int count = 0;
168-
for (auto f : funcOps) {
169-
if (f.isDeclaration())
170-
continue;
171-
Block &entry = f.getRegion().front();
168+
for (auto f : sequenceOps) {
169+
Block &entry = f.getBody().front();
172170
for (auto &o : entry) {
173171
llvm::TypeSwitch<Operation *>(&o)
174172
.Case<NpuSyncOp>([&](auto op) {

programming_examples/basic/dma_transpose/aie2.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ def core_body():
5151
# To/from AIE-array data movement
5252
tensor_ty = T.memref(N, T.i32())
5353

54-
@FuncOp.from_py_func(tensor_ty, tensor_ty, tensor_ty)
54+
@runtime_sequence(tensor_ty, tensor_ty, tensor_ty)
5555
def sequence(A, B, C):
5656
npu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, N])
5757
# The strides below are configured to read across all rows in the same column

programming_examples/basic/matrix_multiplication/matrix_vector/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ In this design, one or multiple AI Engine compute cores (spread across hardware
1717
## Differences from the [Whole-Array Matrix-Matrix Multiplication Design](../whole_array/README.md)
1818

1919
- A specialized matrix-*vector* microkernel, named `matvec_vectorized` is used in this design, as opposed to the more general matrix-matrix microkernel (`matmul_vectorized`) used in the matrix-matrix-multiplication designs.
20-
- The data movement in this design varies as follows: An identical `32`-element chunk of the vector `B` is **broadcast** to the cores in all columns, whereas _distinct_ subsequent `32`&times;`32`-sized tiles of the `A` matrix are **distributed** to the cores. As such, each core is responsible for a distinct `32`-element chunk of the output vector `C`. These chunks are assembled (**joined**) at the shim tile level (in the `sequence()` function).
20+
- The data movement in this design varies as follows: An identical `32`-element chunk of the vector `B` is **broadcast** to the cores in all columns, whereas _distinct_ subsequent `32`&times;`32`-sized tiles of the `A` matrix are **distributed** to the cores. As such, each core is responsible for a distinct `32`-element chunk of the output vector `C`. These chunks are assembled (**joined**) at the shim tile level (in the `aiex.runtime_sequence()`).
2121
- This design does not use all available compute cores. Instead, it uses at most one core in each hardware column. The variable `n_cores` defines the number of columns to be used. It would however be possible to extend this design to use all cores.
2222

2323
## Building and Running the Design

programming_examples/basic/matrix_multiplication/matrix_vector/aie2.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -185,7 +185,7 @@ def core_body():
185185

186186
# To/from AIE-array data movement
187187

188-
@FuncOp.from_py_func(
188+
@runtime_sequence(
189189
T.memref(A_sz, dtype_in()),
190190
T.memref(B_sz, dtype_in()),
191191
T.memref(C_sz, dtype_out()),

programming_examples/basic/matrix_multiplication/single_core/aie2.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -231,7 +231,7 @@ def core_body():
231231

232232
# To/from AIE-array data movement
233233

234-
@FuncOp.from_py_func(
234+
@runtime_sequence(
235235
T.memref(A_sz, dtype_in()),
236236
T.memref(B_sz, dtype_in()),
237237
T.memref(C_sz, dtype_out()),

programming_examples/basic/matrix_multiplication/whole_array/README.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ At a high level, the code does the following (in order):
2222

2323
1. [**Defining Core Computations:**](#4-defining-core-computations) The `core_body()` function contains the code that will be loaded onto each AIE core. This code describes the matrix multiplication using the input submatrices `a` and `b` acquired through the ObjectFIFOs. The results are accumulated in the output submatrix `c`.
2424

25-
1. [**Defining External Data Transfer Sequences:**](#5-defining-external-data-transfer-sequences) The `sequence()` function sets up matrix data movement from the host into the AIE compute cores, and back to the host after computation. It initializes Data Movement Accelerator (DMA) transfers, sets memory access patterns, and performs synchronization.
25+
1. [**Defining External Data Transfer Sequences:**](#5-defining-external-data-transfer-sequences) The `aiex.runtime_sequence()` op sets up matrix data movement from the host into the AIE compute cores, and back to the host after computation. It initializes Data Movement Accelerator (DMA) transfers, sets memory access patterns, and performs synchronization.
2626

2727
1. **Generating the Design:** The `my_matmul()` function triggers the code generation process and represents the main entry point of the design. The final print statement outputs the MLIR representation of the AIE array configuration.
2828

@@ -72,7 +72,7 @@ The input and output matrix sizes are given by the user. We subdivide the input
7272

7373
1. **Tiling to Compute Core Submatrix Chunks:** The input and output matrices stream to/from the AIE compute cores in chunks of size of `m`&times;`k`, `k`&times;`n` and `n`&times;`m`. Tiling into these chunks allows each of the computation cores to concurrently work on distinct sub-sections of the input matrices in parallel, which improves performance. This also reduces on-chip memory requirements. The final result is re-assembled using the sub-matrix results of all cores.
7474

75-
> This tiling occurs in the `sequence()` function describing the host-to-memory-tile transfer.
75+
> This tiling occurs in the `aiex.runtime_sequence()` operation describing the host-to-memory-tile transfer.
7676
We describe it further below, in section *"5. Defining External Data Transfer Sequences"*.
7777

7878
1. **Tiling to Vector Intrinsic Size:** The AIE compute cores calculate the matrix multiplication using efficient "multiply-accumulate" vector intrinsic instructions (`MAC` instructions). These hardware instructions process very small blocks of the matrix: size `r`&times;`s` blocks of `A` and size `s`&times;`t` blocks of `B`, producing an output of size `r`&times;`t` (`C`).
@@ -198,7 +198,7 @@ We define a `core_body()` function for each compute core `i`, inside of which we
198198

199199
### 5. Defining External Data Transfer Sequences
200200

201-
The function signature of the `sequence()` function lists as its arguments all the external buffers from the host that we wish to read from or write to on the AI Engine's shim tiles. The body of this function describes how these buffers are transfered from and to the host, including tiling the input matrices into `m`&times;`k` and `k`&times;`n`-sized sub-matrices, and combining the `m`&times;`n`-sized output tiles into the larger output `M`&times;`N` matrix buffer.
201+
The signature of the `aiex.runtime_sequence()` operation lists as its arguments all the external buffers from the host that we wish to read from or write to on the AI Engine's shim tiles. The body of this operation describes how these buffers are transferred from and to the host, including tiling the input matrices into `m`&times;`k` and `k`&times;`n`-sized sub-matrices, and combining the `m`&times;`n`-sized output tiles into the larger output `M`&times;`N` matrix buffer.
202202

203203
* The `tile_row_block` variable segments the M (rows of A) into smaller chunks, each containing `rows_per_block` tile rows. This is done so the buffer descriptors (BDs) can be reused for efficient DMA transfers.
204204
* For each column `i`:

0 commit comments

Comments
 (0)