Skip to content

Commit e9fc525

Browse files
committed
Initial commit to examples
1 parent 8f0bd4f commit e9fc525

23 files changed

+3815
-0
lines changed

examples/autotune-matmul/matmul.py

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
# Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved.
2+
# SPDX-License-Identifier: MIT
3+
4+
# This benchmark multiplies square matrices using the maximum block size
# to check the performance of the tl.dot operation.
6+
7+
import torch
8+
import triton
9+
import triton.language as tl
10+
import sys, os
11+
12+
sys.path.append(os.path.abspath(".."))
13+
import benchmark
14+
15+
# Autotune candidates: only BLOCK_SIZE_M is swept here; BLOCK_SIZE_N and
# BLOCK_SIZE_K are fixed at launch time (see bench_matmul below).
configs = [triton.Config(kwargs={'BLOCK_SIZE_M': 256}), triton.Config(kwargs={'BLOCK_SIZE_M': 128})]
16+
17+
@triton.autotune(configs=configs, key=["M"])
@triton.jit
def bare_matmul(
    A,
    B,
    C,
    M: tl.constexpr,
    N: tl.constexpr,
    K: tl.constexpr,
    stride_am: tl.constexpr,
    stride_ak: tl.constexpr,
    stride_bk: tl.constexpr,
    stride_bn: tl.constexpr,
    stride_cm: tl.constexpr,
    stride_cn: tl.constexpr,
    BLOCK_SIZE_M: tl.constexpr,
    BLOCK_SIZE_N: tl.constexpr,
    BLOCK_SIZE_K: tl.constexpr,
):
    """Single-shot tile matmul.

    Each program multiplies one (BLOCK_SIZE_M x BLOCK_SIZE_K) tile of A by
    one (BLOCK_SIZE_K x BLOCK_SIZE_N) tile of B and stores the resulting
    tile of C.  There is no loop over K, so the launch must set
    BLOCK_SIZE_K == K (the caller in this file passes BLOCK_SIZE_K=K).
    """
    block_row = tl.program_id(0)  # tile row of C owned by this program
    block_col = tl.program_id(1)  # tile column of C owned by this program

    rows = block_row * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
    cols = block_col * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
    ks = tl.arange(0, BLOCK_SIZE_K)

    # Build 2D pointer grids from the row/column offsets and the strides.
    a_ptrs = A + rows[:, None] * stride_am + ks[None, :] * stride_ak
    b_ptrs = B + ks[:, None] * stride_bk + cols[None, :] * stride_bn
    tile = tl.dot(tl.load(a_ptrs), tl.load(b_ptrs))

    c_ptrs = C + rows[:, None] * stride_cm + cols[None, :] * stride_cn
    tl.store(c_ptrs, tile)
49+
50+
51+
# @benchmark.measure()
def bench_matmul(M, N, K, provider):
    """Run and/or verify the bare_matmul kernel on an MxK @ KxN problem.

    Parameters:
    - M, N, K: matrix dimensions (A is MxK, B is KxN, C is MxN).
    - provider: "torch" (reference matmul only), "triton" (kernel only),
      or "test" (run both and compare results).

    Raises:
    - ValueError: if provider is not one of the recognized values.
    """
    if provider not in ("torch", "triton", "test"):
        raise ValueError(f"unknown provider: {provider!r}")

    device = "cpu"
    dtype_in = torch.bfloat16
    dtype_out = torch.float32
    a = torch.randn((M, K), device=device, dtype=dtype_in)
    b = torch.randn((K, N), device=device, dtype=dtype_in)
    c = torch.empty((M, N), device=device, dtype=dtype_out)

    if provider in ("torch", "test"):
        c_ref = torch.matmul(a, b).to(dtype_out)
    if provider in ("triton", "test"):
        # 2D launch grid: one program per (BLOCK_SIZE_M, BLOCK_SIZE_N)
        # tile of C.  BLOCK_SIZE_M comes from the autotuner config.
        grid = lambda META: (
            triton.cdiv(M, META["BLOCK_SIZE_M"]),
            triton.cdiv(N, META["BLOCK_SIZE_N"]),
        )
        compiled_kernel = bare_matmul[grid](
            a,
            b,
            c,
            M,
            N,
            K,
            a.stride(0),
            a.stride(1),
            b.stride(0),
            b.stride(1),
            c.stride(0),
            c.stride(1),
            # BLOCK_SIZE_M is supplied by the autotuner (see `configs`).
            BLOCK_SIZE_N=256,
            # The kernel has no K loop, so the K block must cover all of K.
            BLOCK_SIZE_K=K,
        )
        # Dump the lowered shared-dialect IR for offline inspection.
        with open("tt.shared.mlir", "w") as f:
            f.write(str(compiled_kernel.asm["ttsharedir"]))
    if provider == "test":
        torch.testing.assert_close(c, c_ref, atol=1e-2, rtol=1e-2)
88+
89+
90+
if __name__ == "__main__":
    # Route execution to the NPU backend, then run a correctness check
    # on a 256x256x256 problem.
    benchmark.select_npu_backend()
    bench_matmul(256, 256, 256, "test")
Lines changed: 197 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,197 @@
1+
// Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved.
2+
// SPDX-License-Identifier: MIT
3+
4+
////////////////////////////////////////////////////////////////////////////////
5+
// Transform Script for Matmul (Triton Ver3, Vectorized): Step-by-Step Annotated
6+
// This script transforms a matmul IR into a tiled, packed, bufferized, and
7+
// hardware-friendly form suitable for AIE execution. Each step is annotated
8+
// with its purpose, assumptions, and relation to the IR.
9+
////////////////////////////////////////////////////////////////////////////////
10+
11+
transform.with_pdl_patterns {
^bb0(%arg0: !pdl.operation):

  // Main transformation sequence begins.
  transform.sequence %arg0 : !pdl.operation failures(propagate) {
  ^bb1(%arg1: !pdl.operation):

    // Step 1: Match the fill and matmul ops.
    // Assumption: The IR contains linalg.fill and linalg.matmul ops representing initialization and main computation.
    %fill = transform.structured.match ops{["linalg.fill"]} in %arg1 : (!pdl.operation) -> !pdl.operation
    %matmul = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!pdl.operation) -> !pdl.operation

    // Step 2: Bufferize fill result to shared (L2) memory allocation.
    // Purpose: Allocates the result buffer in memory space 1 (shared/L2), required for AIR/AIE memory hierarchy.
    // Assumption: The result of the fill op will be written to L2/shared memory.
    %buffer_res_shared, %new_fill = transform.structured.bufferize_to_allocation %fill
      {memory_space = 1, bufferize_destination_only, emit_dealloc} : !pdl.operation

    // Step 2.5: Tile memory copy operations using for loops.
    // Purpose: Tiling the memcpy using for loops provides hints on how big the L2 memory footprint shall be,
    // establishing the memory access patterns and tile sizes that guide subsequent L2 bufferization decisions.
    // Assumption: The tile sizes [0, 256] and [256, 0] are chosen to optimize L2 memory usage patterns.
    // NOTE(review): split_handle assumes exactly two linalg.copy ops exist after the conversion — confirm.
    %func_1 = transform.structured.match ops{["func.func"]} in %arg1 : (!pdl.operation) -> !pdl.operation
    transform.air.convert_memref_copy_to_linalg_copy %func_1
    %copies = transform.structured.match ops{["linalg.copy"]} in %arg1 : (!pdl.operation) -> !pdl.operation
    %copy_1, %copy_2 = transform.split_handle %copies : (!pdl.operation<"linalg.copy">) -> (!pdl.operation<"linalg.copy">, !pdl.operation<"linalg.copy">)
    %tiled_copy_1, %tiled_copy_for_loop_1 =
      transform.structured.tile_using_for %copy_1 tile_sizes [0, 256]
      : (!pdl.operation) -> (!pdl.operation, !transform.op<"scf.for">)
    %tiled_copy_2, %tiled_copy_for_loop_2 =
      transform.structured.tile_using_for %copy_2 tile_sizes [256, 0]
      : (!pdl.operation) -> (!pdl.operation, !transform.op<"scf.for">)

    // Step 3: Tile matmul using scf.forall with tile size [64, 64].
    // Purpose: Introduces parallelism and prepares for mapping to AIE columns.
    // Assumption: The problem size is a multiple of 64, or padding will be handled later.
    %matmul_1 = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!pdl.operation) -> !pdl.operation
    %tiled_matmul_1, %forall_1 =
      transform.structured.tile_using_forall %matmul_1 tile_sizes [64, 64] : (!pdl.operation) -> (!pdl.operation, !pdl.operation)

    // Step 4: Run canonicalization and CSE.
    // Purpose: Cleans up the IR after tiling, merges redundant ops, and prepares for further transforms.
    // Assumption: Canonicalization will simplify the IR and remove dead code.
    %func_2 = transform.structured.match ops{["func.func"]} in %arg1 : (!pdl.operation) -> !pdl.operation
    transform.apply_patterns to %func_2 {
      transform.apply_patterns.linalg.tiling_canonicalization
      transform.apply_patterns.scf.for_loop_canonicalization
      transform.apply_patterns.canonicalization
    } : !pdl.operation
    transform.apply_cse to %func_2 : !pdl.operation

    // Step 5: Fuse fill operation into the forall loop.
    // Purpose: Ensures initialization is fused with computation for efficiency.
    // Assumption: The fill op is a direct consumer in the loop.
    %fused_fill_1 = transform.structured.match ops{["linalg.fill"]} in %arg1 : (!pdl.operation) -> !pdl.operation
    %fill_consumer = transform.get_consumers_of_result %fused_fill_1[0] : (!pdl.operation) -> (!pdl.operation)
    %fused_fill_2, %fused_loop_2 = transform.structured.fuse_into_containing_op %fused_fill_1 into %fill_consumer : (!pdl.operation, !pdl.operation) -> (!pdl.operation, !pdl.operation)

    // Step 6: Pack by applying data tiling; linalg.matmul becomes linalg.generic.
    // Purpose: Prepares data for vectorized computation and memory layout optimization.
    // Assumption: Packing sizes [4, 4, 8] are chosen for hardware efficiency.
    %packed = transform.structured.pack %tiled_matmul_1 packed_sizes = [4, 4, 8]
      : (!pdl.operation) -> (!pdl.operation)

    // Step 7: Transpose A matrix for packed layout.
    // Purpose: Ensures correct memory layout for A operand.
    // Assumption: Outer permutation [1, 0] is correct for hardware mapping.
    %pack_producer_a = transform.get_producer_of_operand %packed[0]
      : (!pdl.operation) -> (!pdl.operation)
    %packed_a, %pack_a, %empty_unpack_a =
      transform.structured.pack_transpose %pack_producer_a with_compute_op(%packed)
      outer_perm = [1, 0] : (!pdl.operation, !pdl.operation)
      -> (!pdl.operation, !pdl.operation, !pdl.operation)

    // Step 8: Transpose B matrix for packed layout.
    // Purpose: Ensures correct memory layout for B operand.
    // Assumption: Outer and inner permutations [1, 0] are correct for hardware mapping.
    %pack_producer_b = transform.get_producer_of_operand %packed_a[1]
      : (!pdl.operation) -> (!pdl.operation)
    %packed_b, %pack_b, %empty_unpack_b =
      transform.structured.pack_transpose %pack_producer_b with_compute_op(%packed_a)
      outer_perm = [1, 0] inner_perm = [1, 0] : (!pdl.operation, !pdl.operation)
      -> (!pdl.operation, !pdl.operation, !pdl.operation)

    // Step 9: Transpose C matrix for packed layout.
    // Purpose: Ensures correct memory layout for C operand.
    // Assumption: Outer permutation [1, 0] is correct for hardware mapping.
    %unpack = transform.get_consumers_of_result %packed_b[0]
      : (!pdl.operation) -> (!pdl.operation)
    %packed_c, %pack_c, %unpack_c =
      transform.structured.pack_transpose %unpack with_compute_op(%packed_b)
      outer_perm = [1, 0] : (!pdl.operation, !pdl.operation)
      -> (!pdl.operation, !pdl.operation, !pdl.operation)

    // Step 10: Bufferize result to local memory allocation (AIE local, memory_space=2).
    // Purpose: Moves result buffer to fast local memory for efficient AIE execution.
    // Assumption: The result fits in local memory and can be promoted.
    %buffer_c, %new_c = transform.structured.bufferize_to_allocation %pack_c
      {memory_space = 2, bufferize_destination_only, emit_dealloc} : !pdl.operation

    // Step 11: Tile the reduction loop.
    // Purpose: Enables vectorized reduction and efficient computation.
    // Assumption: Tile size [0, 0, 4] is chosen for hardware efficiency.
    %tiled_reduction, %for_loop =
      transform.structured.tile_using_for %packed_c tile_sizes [0, 0, 4]
      : (!pdl.operation) -> (!pdl.operation, !pdl.operation)

    // Step 12: Fuse pack ops into the for loop.
    // Purpose: Ensures packed data is available within the reduction loop.
    // Assumption: Packing ops are direct consumers in the loop.
    %fused_pack_a, %e1 = transform.structured.fuse_into_containing_op %pack_a into %for_loop
      : (!pdl.operation, !pdl.operation) -> (!pdl.operation, !pdl.operation)
    %fused_pack_b, %e2 = transform.structured.fuse_into_containing_op %pack_b into %for_loop
      : (!pdl.operation, !pdl.operation) -> (!pdl.operation, !pdl.operation)

    // Step 13: Promote the inputs to local memory (AIE local, memory_space=2).
    // Purpose: Moves input operands to fast local memory for efficient AIE execution.
    // Assumption: The operands are suitable for promotion and local memory is available.
    %buffer_a, %new_a = transform.structured.bufferize_to_allocation %fused_pack_a
      {memory_space = 2, bufferize_destination_only, emit_dealloc} : !pdl.operation
    %buffer_b, %new_b = transform.structured.bufferize_to_allocation %fused_pack_b
      {memory_space = 2, bufferize_destination_only, emit_dealloc} : !pdl.operation

    // Step 14: Run canonicalization and CSE again.
    // Purpose: Cleans up after bufferization and promotion, merges redundant allocs/copies.
    // Assumption: Canonicalization will further simplify the IR.
    %func_3 = transform.structured.match ops{["func.func"]} in %arg1 : (!pdl.operation) -> !pdl.operation
    transform.apply_patterns to %func_3 {
      transform.apply_patterns.linalg.tiling_canonicalization
      transform.apply_patterns.scf.for_loop_canonicalization
      transform.apply_patterns.canonicalization
    } : !pdl.operation
    transform.apply_cse to %func_3 : !pdl.operation

    // Step 15: One-shot bufferization of the function.
    // Purpose: Converts all tensors to memrefs, finalizes bufferization for AIR/AIE lowering.
    // Assumption: The function is now in DPS form and ready for bufferization.
    %func_op = transform.structured.match ops{["func.func"]} in %arg1 : (!pdl.operation) -> !pdl.operation
    %func_bufferized = transform.bufferization.one_shot_bufferize %func_op : (!pdl.operation) -> !pdl.operation

    // Step 16: Final canonicalization and AIR-specific cleanup.
    // Purpose: Removes redundant memcpy ops, eliminates cascade memcpy patterns, and canonicalizes.
    // Assumption: AIR passes will further optimize memory ops for hardware.
    %func6 = transform.structured.match ops{["func.func"]} in %arg1 : (!pdl.operation) -> !pdl.operation
    transform.apply_patterns to %func6 {
      transform.apply_patterns.linalg.tiling_canonicalization
      transform.apply_patterns.scf.for_loop_canonicalization
      transform.apply_patterns.canonicalization
    } : !pdl.operation
    transform.apply_cse to %func6 : !pdl.operation
    // A second canonicalization round after CSE to clean up what CSE exposed.
    transform.apply_patterns to %func6 {
      transform.apply_patterns.canonicalization
    } : !pdl.operation
    %func_op_updated = transform.air.remove_uninitialized_copy %func6
    %func_op_updated_1 = transform.air.eliminate_cascade_memcpy %func_op_updated

    // Step 17: Tile linalg.generics for vectorization.
    // Purpose: Final tiling to enable vectorized execution on AIE hardware.
    // Assumption: Tile sizes [1, 1, 1, 0, 0, 0] are chosen for hardware vectorization.
    %linalg_generics = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!pdl.operation) -> !pdl.operation
    %inner_most_generics, %vec_loops:3 =
      transform.structured.tile_using_for %linalg_generics tile_sizes [1, 1, 1, 0, 0, 0]
      : (!pdl.operation) -> (!pdl.operation, !pdl.operation, !pdl.operation, !pdl.operation)

    // Step 18: Tile linalg.fills for vectorized write.
    // Purpose: Enables vectorized write for initialization.
    // Assumption: Tile sizes [1, 1] are chosen for hardware vectorization.
    %linalg_fills = transform.structured.match ops{["linalg.fill"]} in %arg1 : (!pdl.operation) -> !pdl.operation
    %inner_most_fills, %vec_fill_loops:2 =
      transform.structured.tile_using_for %linalg_fills tile_sizes [1, 1]
      : (!pdl.operation) -> (!pdl.operation, !pdl.operation, !pdl.operation)

    // Step 19: AIR Constructs Mapping
    // Purpose: Convert high-level parallel constructs to AIE-specific operations for hardware execution.
    // Convert parallel loops to AIE herd operations for multi-core execution
    %forall_as_herd = transform.structured.match ops{["scf.forall"]} in %arg1 : (!pdl.operation) -> !pdl.operation
    %parallel = transform.loop.forall_to_parallel %forall_as_herd : (!pdl.operation) -> !pdl.operation
    %herd = transform.air.par_to_herd %parallel

    // Convert memory copies to DMA operations for efficient data movement
    %copies_in_herd = transform.structured.match ops{["memref.copy", "linalg.copy"]} in %herd : (!pdl.operation) -> !pdl.operation
    %dmas_from_copies = transform.air.copy_to_dma %copies_in_herd

    // Apply vectorization to optimize for AIE vector units
    %vectorized_herd = transform.air.herd_vectorize %herd
  }
}

examples/benchmark.py

Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
# Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved.
2+
# SPDX-License-Identifier: MIT
3+
4+
import time
5+
import numpy as np
6+
from functools import wraps
7+
import triton
8+
from triton.backends.amd_triton_npu.driver import NPUDriver
9+
from triton.backends.triton_shared.driver import CPUDriver
10+
11+
12+
def select_npu_backend():
    """Make the AMD NPU driver the active Triton runtime backend."""
    triton.runtime.driver.set_active(NPUDriver())
14+
15+
16+
def select_cpu_backend():
    """Make the triton-shared CPU driver the active Triton runtime backend."""
    triton.runtime.driver.set_active(CPUDriver())
18+
19+
def select_gpu_backend():
    """Reset the active Triton runtime driver to the default backend.

    NOTE(review): the name suggests the default is the GPU backend —
    confirm against the Triton runtime in use.
    """
    triton.runtime.driver.reset_active()
21+
22+
# Unfortunately, we can't use triton.testing.perf_report and triton.testing.do_bench for the NPU backend
# because they are very specific to CUDA.
24+
25+
26+
def measure(
    repeats=20,
    percentiles=(),
    timers=None,
):
    """
    Decorator to benchmark a function.

    Parameters:
    - repeats (int): The number of times the function should be executed for each set of parameters.
    - percentiles (tuple): The percentiles to compute on the execution times (e.g., (50, 90, 99)).
    - timers (dict | None): A dictionary where keys are timer names (e.g., 'Wall', 'NPU') and values
      are zero-argument timer functions that measure elapsed time. By default:
        * 'Wall': Uses time.perf_counter for high-resolution wall-clock time.
        * 'NPU': Uses time.process_time (process CPU time, used here as a proxy for NPU time).

    Returns:
    - A decorated function that prints, per timer:
        * Average execution time.
        * Standard deviation time.
        * Minimum and maximum times.
        * Computed percentiles.
      and returns the result of the last invocation of the wrapped function.
    """
    # A mutable default argument would be one shared dict across all calls;
    # build the default timer table inside the call instead.
    if timers is None:
        timers = {"Wall": time.perf_counter, "NPU": time.process_time}

    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            print(
                f"{func.__name__}{args} {kwargs}, {repeats} times, all results in seconds"
            )
            # One list of samples per timer.
            times = {name: [] for name in timers}

            for _ in range(repeats):
                # Read every timer before the call, then again after it.
                starts = {name: clock() for name, clock in timers.items()}

                result = func(*args, **kwargs)

                for name, clock in timers.items():
                    times[name].append(clock() - starts[name])

            for name in timers:
                samples = times[name]
                average_time = np.mean(samples)
                min_time = np.min(samples)
                max_time = np.max(samples)
                computed_percentiles = np.percentile(samples, percentiles)
                std_dev_time = np.std(samples)

                print(
                    f"{name}: Avg={average_time:.6f}, min={min_time:.6f}, std={std_dev_time:.6f},",
                    end=" ",
                )
                for p, value in zip(percentiles, computed_percentiles):
                    print(f"{p}pp={value:.6f},", end=" ")
                print(f"max={max_time:.6f}")

            # Result of the last repeat; `repeats` is assumed to be >= 1.
            return result

        return wrapper

    return decorator

0 commit comments

Comments
 (0)