Skip to content

Commit e9fc525

Browse files
committed
Initial commit to examples
1 parent 8f0bd4f commit e9fc525

23 files changed

+3815
-0
lines changed

examples/autotune-matmul/matmul.py

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
# Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved.
2+
# SPDX-License-Identifier: MIT
3+
4+
# This benchmark multiplies square matrices using the maximum block size
# to check the performance of the tl.dot operation.
6+
7+
import torch
8+
import triton
9+
import triton.language as tl
10+
import sys, os
11+
12+
sys.path.append(os.path.abspath(".."))
13+
import benchmark
14+
15+
# Autotune candidates: only BLOCK_SIZE_M is swept here; BLOCK_SIZE_N and
# BLOCK_SIZE_K are fixed at launch time (see bench_matmul below).
configs = [triton.Config(kwargs={'BLOCK_SIZE_M': 256}), triton.Config(kwargs={'BLOCK_SIZE_M': 128})]
16+
17+
@triton.autotune(configs=configs, key=["M"])
@triton.jit
def bare_matmul(
    A,
    B,
    C,
    M: tl.constexpr,
    N: tl.constexpr,
    K: tl.constexpr,
    stride_am: tl.constexpr,
    stride_ak: tl.constexpr,
    stride_bk: tl.constexpr,
    stride_bn: tl.constexpr,
    stride_cm: tl.constexpr,
    stride_cn: tl.constexpr,
    BLOCK_SIZE_M: tl.constexpr,
    BLOCK_SIZE_N: tl.constexpr,
    BLOCK_SIZE_K: tl.constexpr,
):
    """Single-shot tile matmul.

    Each program multiplies one (BLOCK_SIZE_M x BLOCK_SIZE_K) tile of A by
    one (BLOCK_SIZE_K x BLOCK_SIZE_N) tile of B and stores the resulting
    tile of C.  There is no loop over K, so the launch must set
    BLOCK_SIZE_K == K (the caller in this file passes BLOCK_SIZE_K=K).
    """
    block_row = tl.program_id(0)  # tile row of C owned by this program
    block_col = tl.program_id(1)  # tile column of C owned by this program

    rows = block_row * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
    cols = block_col * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
    ks = tl.arange(0, BLOCK_SIZE_K)

    # Build 2D pointer grids from the row/column offsets and the strides.
    a_ptrs = A + rows[:, None] * stride_am + ks[None, :] * stride_ak
    b_ptrs = B + ks[:, None] * stride_bk + cols[None, :] * stride_bn
    tile = tl.dot(tl.load(a_ptrs), tl.load(b_ptrs))

    c_ptrs = C + rows[:, None] * stride_cm + cols[None, :] * stride_cn
    tl.store(c_ptrs, tile)
49+
50+
51+
# @benchmark.measure()
def bench_matmul(M, N, K, provider):
    """Run and/or verify the bare_matmul kernel on an MxK @ KxN problem.

    Parameters:
    - M, N, K: matrix dimensions (A is MxK, B is KxN, C is MxN).
    - provider: "torch" (reference matmul only), "triton" (kernel only),
      or "test" (run both and compare results).

    Raises:
    - ValueError: if provider is not one of the recognized values.
    """
    if provider not in ("torch", "triton", "test"):
        raise ValueError(f"unknown provider: {provider!r}")

    device = "cpu"
    dtype_in = torch.bfloat16
    dtype_out = torch.float32
    a = torch.randn((M, K), device=device, dtype=dtype_in)
    b = torch.randn((K, N), device=device, dtype=dtype_in)
    c = torch.empty((M, N), device=device, dtype=dtype_out)

    if provider in ("torch", "test"):
        c_ref = torch.matmul(a, b).to(dtype_out)
    if provider in ("triton", "test"):
        # 2D launch grid: one program per (BLOCK_SIZE_M, BLOCK_SIZE_N)
        # tile of C.  BLOCK_SIZE_M comes from the autotuner config.
        grid = lambda META: (
            triton.cdiv(M, META["BLOCK_SIZE_M"]),
            triton.cdiv(N, META["BLOCK_SIZE_N"]),
        )
        compiled_kernel = bare_matmul[grid](
            a,
            b,
            c,
            M,
            N,
            K,
            a.stride(0),
            a.stride(1),
            b.stride(0),
            b.stride(1),
            c.stride(0),
            c.stride(1),
            # BLOCK_SIZE_M is supplied by the autotuner (see `configs`).
            BLOCK_SIZE_N=256,
            # The kernel has no K loop, so the K block must cover all of K.
            BLOCK_SIZE_K=K,
        )
        # Dump the lowered shared-dialect IR for offline inspection.
        with open("tt.shared.mlir", "w") as f:
            f.write(str(compiled_kernel.asm["ttsharedir"]))
    if provider == "test":
        torch.testing.assert_close(c, c_ref, atol=1e-2, rtol=1e-2)
88+
89+
90+
if __name__ == "__main__":
    # Route execution to the NPU backend, then run a correctness check
    # on a 256x256x256 problem.
    benchmark.select_npu_backend()
    bench_matmul(256, 256, 256, "test")
Lines changed: 197 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,197 @@
1+
// Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved.
2+
// SPDX-License-Identifier: MIT
3+
4+
////////////////////////////////////////////////////////////////////////////////
5+
// Transform Script for Matmul (Triton Ver3, Vectorized): Step-by-Step Annotated
6+
// This script transforms a matmul IR into a tiled, packed, bufferized, and
7+
// hardware-friendly form suitable for AIE execution. Each step is annotated
8+
// with its purpose, assumptions, and relation to the IR.
9+
////////////////////////////////////////////////////////////////////////////////
10+
11+
transform.with_pdl_patterns {
^bb0(%arg0: !pdl.operation):

  // Main transformation sequence begins.
  transform.sequence %arg0 : !pdl.operation failures(propagate) {
  ^bb1(%arg1: !pdl.operation):

    // Step 1: Match the fill and matmul ops.
    // Assumption: The IR contains linalg.fill and linalg.matmul ops representing initialization and main computation.
    %fill = transform.structured.match ops{["linalg.fill"]} in %arg1 : (!pdl.operation) -> !pdl.operation
    %matmul = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!pdl.operation) -> !pdl.operation

    // Step 2: Bufferize fill result to shared (L2) memory allocation.
    // Purpose: Allocates the result buffer in memory space 1 (shared/L2), required for AIR/AIE memory hierarchy.
    // Assumption: The result of the fill op will be written to L2/shared memory.
    %buffer_res_shared, %new_fill = transform.structured.bufferize_to_allocation %fill
      {memory_space = 1, bufferize_destination_only, emit_dealloc} : !pdl.operation

    // Step 2.5: Tile memory copy operations using for loops.
    // Purpose: Tiling the memcpy using for loops provides hints on how big the L2 memory footprint shall be,
    // establishing the memory access patterns and tile sizes that guide subsequent L2 bufferization decisions.
    // Assumption: The tile sizes [0, 256] and [256, 0] are chosen to optimize L2 memory usage patterns.
    // NOTE(review): split_handle assumes exactly two linalg.copy ops exist after the conversion — confirm.
    %func_1 = transform.structured.match ops{["func.func"]} in %arg1 : (!pdl.operation) -> !pdl.operation
    transform.air.convert_memref_copy_to_linalg_copy %func_1
    %copies = transform.structured.match ops{["linalg.copy"]} in %arg1 : (!pdl.operation) -> !pdl.operation
    %copy_1, %copy_2 = transform.split_handle %copies : (!pdl.operation<"linalg.copy">) -> (!pdl.operation<"linalg.copy">, !pdl.operation<"linalg.copy">)
    %tiled_copy_1, %tiled_copy_for_loop_1 =
      transform.structured.tile_using_for %copy_1 tile_sizes [0, 256]
      : (!pdl.operation) -> (!pdl.operation, !transform.op<"scf.for">)
    %tiled_copy_2, %tiled_copy_for_loop_2 =
      transform.structured.tile_using_for %copy_2 tile_sizes [256, 0]
      : (!pdl.operation) -> (!pdl.operation, !transform.op<"scf.for">)

    // Step 3: Tile matmul using scf.forall with tile size [64, 64].
    // Purpose: Introduces parallelism and prepares for mapping to AIE columns.
    // Assumption: The problem size is a multiple of 64, or padding will be handled later.
    %matmul_1 = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!pdl.operation) -> !pdl.operation
    %tiled_matmul_1, %forall_1 =
      transform.structured.tile_using_forall %matmul_1 tile_sizes [64, 64] : (!pdl.operation) -> (!pdl.operation, !pdl.operation)

    // Step 4: Run canonicalization and CSE.
    // Purpose: Cleans up the IR after tiling, merges redundant ops, and prepares for further transforms.
    // Assumption: Canonicalization will simplify the IR and remove dead code.
    %func_2 = transform.structured.match ops{["func.func"]} in %arg1 : (!pdl.operation) -> !pdl.operation
    transform.apply_patterns to %func_2 {
      transform.apply_patterns.linalg.tiling_canonicalization
      transform.apply_patterns.scf.for_loop_canonicalization
      transform.apply_patterns.canonicalization
    } : !pdl.operation
    transform.apply_cse to %func_2 : !pdl.operation

    // Step 5: Fuse fill operation into the forall loop.
    // Purpose: Ensures initialization is fused with computation for efficiency.
    // Assumption: The fill op is a direct consumer in the loop.
    %fused_fill_1 = transform.structured.match ops{["linalg.fill"]} in %arg1 : (!pdl.operation) -> !pdl.operation
    %fill_consumer = transform.get_consumers_of_result %fused_fill_1[0] : (!pdl.operation) -> (!pdl.operation)
    %fused_fill_2, %fused_loop_2 = transform.structured.fuse_into_containing_op %fused_fill_1 into %fill_consumer : (!pdl.operation, !pdl.operation) -> (!pdl.operation, !pdl.operation)

    // Step 6: Pack by applying data tiling; linalg.matmul becomes linalg.generic.
    // Purpose: Prepares data for vectorized computation and memory layout optimization.
    // Assumption: Packing sizes [4, 4, 8] are chosen for hardware efficiency.
    %packed = transform.structured.pack %tiled_matmul_1 packed_sizes = [4, 4, 8]
      : (!pdl.operation) -> (!pdl.operation)

    // Step 7: Transpose A matrix for packed layout.
    // Purpose: Ensures correct memory layout for A operand.
    // Assumption: Outer permutation [1, 0] is correct for hardware mapping.
    %pack_producer_a = transform.get_producer_of_operand %packed[0]
      : (!pdl.operation) -> (!pdl.operation)
    %packed_a, %pack_a, %empty_unpack_a =
      transform.structured.pack_transpose %pack_producer_a with_compute_op(%packed)
      outer_perm = [1, 0] : (!pdl.operation, !pdl.operation)
      -> (!pdl.operation, !pdl.operation, !pdl.operation)

    // Step 8: Transpose B matrix for packed layout.
    // Purpose: Ensures correct memory layout for B operand.
    // Assumption: Outer and inner permutations [1, 0] are correct for hardware mapping.
    %pack_producer_b = transform.get_producer_of_operand %packed_a[1]
      : (!pdl.operation) -> (!pdl.operation)
    %packed_b, %pack_b, %empty_unpack_b =
      transform.structured.pack_transpose %pack_producer_b with_compute_op(%packed_a)
      outer_perm = [1, 0] inner_perm = [1, 0] : (!pdl.operation, !pdl.operation)
      -> (!pdl.operation, !pdl.operation, !pdl.operation)

    // Step 9: Transpose C matrix for packed layout.
    // Purpose: Ensures correct memory layout for C operand.
    // Assumption: Outer permutation [1, 0] is correct for hardware mapping.
    %unpack = transform.get_consumers_of_result %packed_b[0]
      : (!pdl.operation) -> (!pdl.operation)
    %packed_c, %pack_c, %unpack_c =
      transform.structured.pack_transpose %unpack with_compute_op(%packed_b)
      outer_perm = [1, 0] : (!pdl.operation, !pdl.operation)
      -> (!pdl.operation, !pdl.operation, !pdl.operation)

    // Step 10: Bufferize result to local memory allocation (AIE local, memory_space=2).
    // Purpose: Moves result buffer to fast local memory for efficient AIE execution.
    // Assumption: The result fits in local memory and can be promoted.
    %buffer_c, %new_c = transform.structured.bufferize_to_allocation %pack_c
      {memory_space = 2, bufferize_destination_only, emit_dealloc} : !pdl.operation

    // Step 11: Tile the reduction loop.
    // Purpose: Enables vectorized reduction and efficient computation.
    // Assumption: Tile size [0, 0, 4] is chosen for hardware efficiency.
    %tiled_reduction, %for_loop =
      transform.structured.tile_using_for %packed_c tile_sizes [0, 0, 4]
      : (!pdl.operation) -> (!pdl.operation, !pdl.operation)

    // Step 12: Fuse pack ops into the for loop.
    // Purpose: Ensures packed data is available within the reduction loop.
    // Assumption: Packing ops are direct consumers in the loop.
    %fused_pack_a, %e1 = transform.structured.fuse_into_containing_op %pack_a into %for_loop
      : (!pdl.operation, !pdl.operation) -> (!pdl.operation, !pdl.operation)
    %fused_pack_b, %e2 = transform.structured.fuse_into_containing_op %pack_b into %for_loop
      : (!pdl.operation, !pdl.operation) -> (!pdl.operation, !pdl.operation)

    // Step 13: Promote the inputs to local memory (AIE local, memory_space=2).
    // Purpose: Moves input operands to fast local memory for efficient AIE execution.
    // Assumption: The operands are suitable for promotion and local memory is available.
    %buffer_a, %new_a = transform.structured.bufferize_to_allocation %fused_pack_a
      {memory_space = 2, bufferize_destination_only, emit_dealloc} : !pdl.operation
    %buffer_b, %new_b = transform.structured.bufferize_to_allocation %fused_pack_b
      {memory_space = 2, bufferize_destination_only, emit_dealloc} : !pdl.operation

    // Step 14: Run canonicalization and CSE again.
    // Purpose: Cleans up after bufferization and promotion, merges redundant allocs/copies.
    // Assumption: Canonicalization will further simplify the IR.
    %func_3 = transform.structured.match ops{["func.func"]} in %arg1 : (!pdl.operation) -> !pdl.operation
    transform.apply_patterns to %func_3 {
      transform.apply_patterns.linalg.tiling_canonicalization
      transform.apply_patterns.scf.for_loop_canonicalization
      transform.apply_patterns.canonicalization
    } : !pdl.operation
    transform.apply_cse to %func_3 : !pdl.operation

    // Step 15: One-shot bufferization of the function.
    // Purpose: Converts all tensors to memrefs, finalizes bufferization for AIR/AIE lowering.
    // Assumption: The function is now in DPS form and ready for bufferization.
    %func_op = transform.structured.match ops{["func.func"]} in %arg1 : (!pdl.operation) -> !pdl.operation
    %func_bufferized = transform.bufferization.one_shot_bufferize %func_op : (!pdl.operation) -> !pdl.operation

    // Step 16: Final canonicalization and AIR-specific cleanup.
    // Purpose: Removes redundant memcpy ops, eliminates cascade memcpy patterns, and canonicalizes.
    // Assumption: AIR passes will further optimize memory ops for hardware.
    %func6 = transform.structured.match ops{["func.func"]} in %arg1 : (!pdl.operation) -> !pdl.operation
    transform.apply_patterns to %func6 {
      transform.apply_patterns.linalg.tiling_canonicalization
      transform.apply_patterns.scf.for_loop_canonicalization
      transform.apply_patterns.canonicalization
    } : !pdl.operation
    transform.apply_cse to %func6 : !pdl.operation
    // A second canonicalization round after CSE to clean up what CSE exposed.
    transform.apply_patterns to %func6 {
      transform.apply_patterns.canonicalization
    } : !pdl.operation
    %func_op_updated = transform.air.remove_uninitialized_copy %func6
    %func_op_updated_1 = transform.air.eliminate_cascade_memcpy %func_op_updated

    // Step 17: Tile linalg.generics for vectorization.
    // Purpose: Final tiling to enable vectorized execution on AIE hardware.
    // Assumption: Tile sizes [1, 1, 1, 0, 0, 0] are chosen for hardware vectorization.
    %linalg_generics = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!pdl.operation) -> !pdl.operation
    %inner_most_generics, %vec_loops:3 =
      transform.structured.tile_using_for %linalg_generics tile_sizes [1, 1, 1, 0, 0, 0]
      : (!pdl.operation) -> (!pdl.operation, !pdl.operation, !pdl.operation, !pdl.operation)

    // Step 18: Tile linalg.fills for vectorized write.
    // Purpose: Enables vectorized write for initialization.
    // Assumption: Tile sizes [1, 1] are chosen for hardware vectorization.
    %linalg_fills = transform.structured.match ops{["linalg.fill"]} in %arg1 : (!pdl.operation) -> !pdl.operation
    %inner_most_fills, %vec_fill_loops:2 =
      transform.structured.tile_using_for %linalg_fills tile_sizes [1, 1]
      : (!pdl.operation) -> (!pdl.operation, !pdl.operation, !pdl.operation)

    // Step 19: AIR Constructs Mapping
    // Purpose: Convert high-level parallel constructs to AIE-specific operations for hardware execution.
    // Convert parallel loops to AIE herd operations for multi-core execution
    %forall_as_herd = transform.structured.match ops{["scf.forall"]} in %arg1 : (!pdl.operation) -> !pdl.operation
    %parallel = transform.loop.forall_to_parallel %forall_as_herd : (!pdl.operation) -> !pdl.operation
    %herd = transform.air.par_to_herd %parallel

    // Convert memory copies to DMA operations for efficient data movement
    %copies_in_herd = transform.structured.match ops{["memref.copy", "linalg.copy"]} in %herd : (!pdl.operation) -> !pdl.operation
    %dmas_from_copies = transform.air.copy_to_dma %copies_in_herd

    // Apply vectorization to optimize for AIE vector units
    %vectorized_herd = transform.air.herd_vectorize %herd
  }
}

examples/benchmark.py

Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
# Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved.
2+
# SPDX-License-Identifier: MIT
3+
4+
import time
5+
import numpy as np
6+
from functools import wraps
7+
import triton
8+
from triton.backends.amd_triton_npu.driver import NPUDriver
9+
from triton.backends.triton_shared.driver import CPUDriver
10+
11+
12+
def select_npu_backend():
    """Make the AMD NPU driver the active Triton runtime backend."""
    triton.runtime.driver.set_active(NPUDriver())
14+
15+
16+
def select_cpu_backend():
    """Make the triton-shared CPU driver the active Triton runtime backend."""
    triton.runtime.driver.set_active(CPUDriver())
18+
19+
def select_gpu_backend():
    """Reset the active Triton runtime driver to the default backend.

    NOTE(review): the name suggests the default is the GPU backend —
    confirm against the Triton runtime in use.
    """
    triton.runtime.driver.reset_active()
21+
22+
# Unfortunately, we can't use triton.testing.perf_report and triton.testing.do_bench for the NPU backend
# because they are very specific to CUDA.
24+
25+
26+
def measure(
    repeats=20,
    percentiles=(),
    timers=None,
):
    """
    Decorator to benchmark a function.

    Parameters:
    - repeats (int): The number of times the function should be executed for each set of parameters.
    - percentiles (tuple): The percentiles to compute on the execution times (e.g., (50, 90, 99)).
    - timers (dict | None): A dictionary where keys are timer names (e.g., 'Wall', 'NPU') and values
      are zero-argument timer functions that measure elapsed time. By default:
        * 'Wall': Uses time.perf_counter for high-resolution wall-clock time.
        * 'NPU': Uses time.process_time (process CPU time, used here as a proxy for NPU time).

    Returns:
    - A decorated function that prints, per timer:
        * Average execution time.
        * Standard deviation time.
        * Minimum and maximum times.
        * Computed percentiles.
      and returns the result of the last invocation of the wrapped function.
    """
    # A mutable default argument would be one shared dict across all calls;
    # build the default timer table inside the call instead.
    if timers is None:
        timers = {"Wall": time.perf_counter, "NPU": time.process_time}

    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            print(
                f"{func.__name__}{args} {kwargs}, {repeats} times, all results in seconds"
            )
            # One list of samples per timer.
            times = {name: [] for name in timers}

            for _ in range(repeats):
                # Read every timer before the call, then again after it.
                starts = {name: clock() for name, clock in timers.items()}

                result = func(*args, **kwargs)

                for name, clock in timers.items():
                    times[name].append(clock() - starts[name])

            for name in timers:
                samples = times[name]
                average_time = np.mean(samples)
                min_time = np.min(samples)
                max_time = np.max(samples)
                computed_percentiles = np.percentile(samples, percentiles)
                std_dev_time = np.std(samples)

                print(
                    f"{name}: Avg={average_time:.6f}, min={min_time:.6f}, std={std_dev_time:.6f},",
                    end=" ",
                )
                for p, value in zip(percentiles, computed_percentiles):
                    print(f"{p}pp={value:.6f},", end=" ")
                print(f"max={max_time:.6f}")

            # Result of the last repeat; `repeats` is assumed to be >= 1.
            return result

        return wrapper

    return decorator

0 commit comments

Comments
 (0)