**programming_examples/basic/matrix_multiplication/matrix_vector/README.md** (1 addition, 1 deletion)
@@ -17,7 +17,7 @@ In this design, one or multiple AI Engine compute cores (spread across hardware
## Differences from the [Whole-Array Matrix-Matrix Multiplication Design](../whole_array/README.md)
- A specialized matrix-*vector* microkernel, named `matvec_vectorized`, is used in this design, as opposed to the more general matrix-matrix microkernel (`matmul_vectorized`) used in the matrix-matrix-multiplication designs.
- - The data movement in this design varies as follows: An identical `32`-element chunk of the vector `B` is **broadcast** to the cores in all columns, whereas _distinct_ subsequent `32`×`32`-sized tiles of the `A` matrix are **distributed** to the cores. As such, each core is responsible for a distinct `32`-element chunk of the output vector `C`. These chunks are assembled (**joined**) at the shim tile level (in the `sequence()` function).
+ - The data movement in this design varies as follows: An identical `32`-element chunk of the vector `B` is **broadcast** to the cores in all columns, whereas _distinct_ subsequent `32`×`32`-sized tiles of the `A` matrix are **distributed** to the cores. As such, each core is responsible for a distinct `32`-element chunk of the output vector `C`. These chunks are assembled (**joined**) at the shim tile level (in the `aiex.runtime_sequence()`).
- This design does not use all available compute cores. Instead, it uses at most one core in each hardware column. The variable `n_cores` defines the number of columns to be used. It would however be possible to extend this design to use all cores.
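The broadcast/distribute/join data movement described in this hunk can be sketched in plain NumPy. This is a minimal illustration, not the IRON API: the sizes, the `n_cores` variable, and the explicit loops are hypothetical stand-ins for what the design expresses with ObjectFIFOs and DMA transfers.

```python
import numpy as np

# Hypothetical sizes: 4 cores, each owning a distinct 32-row block of A.
M, K = 128, 64
n_cores, tile = 4, 32

A = np.arange(M * K, dtype=np.float64).reshape(M, K)
B = np.arange(K, dtype=np.float64)

chunks = []
for core in range(n_cores):
    acc = np.zeros(tile)
    for kk in range(0, K, tile):
        # Distinct 32x32 tile of A is *distributed* to this core...
        a_tile = A[core * tile:(core + 1) * tile, kk:kk + tile]
        # ...while the same 32-element chunk of B is *broadcast* to all cores.
        b_chunk = B[kk:kk + tile]
        acc += a_tile @ b_chunk  # stand-in for the matvec microkernel
    chunks.append(acc)

# Each core's distinct 32-element chunk of C is *joined* into the result.
C = np.concatenate(chunks)
```

Concatenating the per-core chunks reproduces the full matrix-vector product `A @ B`, mirroring the join performed at the shim tile level.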
**programming_examples/basic/matrix_multiplication/whole_array/README.md** (3 additions, 3 deletions)
@@ -22,7 +22,7 @@ At a high level, the code does the following (in order):
1. [**Defining Core Computations:**](#4-defining-core-computations) The `core_body()` function contains the code that will be loaded onto each AIE core. This code describes the matrix multiplication using the input submatrices `a` and `b` acquired through the ObjectFIFOs. The results are accumulated in the output submatrix `c`.
- 1. [**Defining External Data Transfer Sequences:**](#5-defining-external-data-transfer-sequences) The `sequence()` function sets up matrix data movement from the host into the AIE compute cores, and back to the host after computation. It initializes Data Movement Accelerator (DMA) transfers, sets memory access patterns, and performs synchronization.
+ 1. [**Defining External Data Transfer Sequences:**](#5-defining-external-data-transfer-sequences) The `aie.runtime_sequence()` op sets up matrix data movement from the host into the AIE compute cores, and back to the host after computation. It initializes Data Movement Accelerator (DMA) transfers, sets memory access patterns, and performs synchronization.
1. **Generating the Design:** The `my_matmul()` function triggers the code generation process and represents the main entry point of the design. The final print statement outputs the MLIR representation of the AIE array configuration.
@@ -72,7 +72,7 @@ The input and output matrix sizes are given by the user. We subdivide the input
1. **Tiling to Compute Core Submatrix Chunks:** The input and output matrices stream to/from the AIE compute cores in chunks of size `m`×`k`, `k`×`n` and `m`×`n`. Tiling into these chunks allows each of the compute cores to work concurrently on distinct sub-sections of the input matrices, which improves performance. It also reduces on-chip memory requirements. The final result is re-assembled from the sub-matrix results of all cores.
- > This tiling occurs in the `sequence()` function describing the host-to-memory-tile transfer.
+ > This tiling occurs in the `aie.runtime_sequence()` operation describing the host-to-memory-tile transfer.
We describe it further below, in section *"5. Defining External Data Transfer Sequences"*.
1. **Tiling to Vector Intrinsic Size:** The AIE compute cores calculate the matrix multiplication using efficient "multiply-accumulate" vector intrinsic instructions (`MAC` instructions). These hardware instructions process very small blocks of the matrix: size `r`×`s` blocks of `A` and size `s`×`t` blocks of `B`, producing an output of size `r`×`t` (`C`).
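The two tiling levels described in this hunk can be illustrated in plain NumPy. The block sizes below are hypothetical, and the nested loops are a software stand-in for what the hardware performs with `MAC` vector intrinsics; this is a sketch of the blocking scheme, not the actual microkernel.

```python
import numpy as np

# One m x k by k x n submatrix product, computed as an accumulation of
# r x s by s x t micro-blocks, mimicking the MAC intrinsic's blocking.
m, k, n = 8, 8, 8     # hypothetical submatrix chunk sizes
r, s, t = 4, 4, 4     # hypothetical intrinsic block sizes

a = np.arange(m * k, dtype=np.float64).reshape(m, k)
b = np.arange(k * n, dtype=np.float64).reshape(k, n)
c = np.zeros((m, n))

for i in range(0, m, r):
    for j in range(0, n, t):
        for p in range(0, k, s):
            # One "MAC" step: multiply an r x s block of A by an s x t
            # block of B, accumulating into the r x t output block of C.
            c[i:i + r, j:j + t] += a[i:i + r, p:p + s] @ b[p:p + s, j:j + t]
```

After the loops, `c` equals the full product `a @ b`: accumulating over the `p` (reduction) dimension block by block is exactly how the vector intrinsic builds up each `r`×`t` output block.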
@@ -198,7 +198,7 @@ We define a `core_body()` function for each compute core `i`, inside of which we
### 5. Defining External Data Transfer Sequences
- The function signature of the `sequence()` function lists as its arguments all the external buffers from the host that we wish to read from or write to on the AI Engine's shim tiles. The body of this function describes how these buffers are transferred from and to the host, including tiling the input matrices into `m`×`k` and `k`×`n`-sized sub-matrices, and combining the `m`×`n`-sized output tiles into the larger output `M`×`N` matrix buffer.
+ The signature of the `aie.runtime_sequence()` operation lists as its arguments all the external buffers from the host that we wish to read from or write to on the AI Engine's shim tiles. The body of this operation describes how these buffers are transferred from and to the host, including tiling the input matrices into `m`×`k` and `k`×`n`-sized sub-matrices, and combining the `m`×`n`-sized output tiles into the larger output `M`×`N` matrix buffer.
* The `tile_row_block` variable segments the M (rows of A) into smaller chunks, each containing `rows_per_block` tile rows. This is done so the buffer descriptors (BDs) can be reused for efficient DMA transfers.
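The `tile_row_block` segmentation can be sketched as follows. The numbers are hypothetical and the list-building loop is purely illustrative: the real design reuses a fixed set of buffer descriptors per block rather than materializing index lists.

```python
# Hypothetical sizes: M rows of A, tiled into tile rows of height m.
M, m = 256, 32
n_tile_rows = M // m          # 8 tile rows in this example
rows_per_block = 4            # tile rows handled per reusable BD setup

blocks = []
for tile_row_block in range(0, n_tile_rows, rows_per_block):
    # Tile rows covered by this block; the same BDs are reprogrammed
    # (reused) once per block of DMA transfers.
    block = list(range(tile_row_block,
                       min(tile_row_block + rows_per_block, n_tile_rows)))
    blocks.append(block)
```

With these numbers the 8 tile rows split into 2 blocks of 4, and every tile row is covered exactly once, which is the property the BD-reuse scheme relies on.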