// Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved.
// SPDX-License-Identifier: MIT

////////////////////////////////////////////////////////////////////////////////
// Transform Script for Matmul (Triton Ver3, Vectorized): Step-by-Step Annotated
// This script transforms a matmul IR into a tiled, packed, bufferized, and
// hardware-friendly form suitable for AIE execution. Each step is annotated
// with its purpose, assumptions, and relation to the IR.
////////////////////////////////////////////////////////////////////////////////

transform.with_pdl_patterns {
^bb0(%arg0: !pdl.operation):

  // Main transformation sequence begins.
  transform.sequence %arg0 : !pdl.operation failures(propagate) {
  ^bb1(%arg1: !pdl.operation):

    // Step 1: Match the fill and matmul ops.
    // Assumption: The IR contains linalg.fill and linalg.matmul ops representing initialization and the main computation.
    %fill = transform.structured.match ops{["linalg.fill"]} in %arg1 : (!pdl.operation) -> !pdl.operation
    %matmul = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!pdl.operation) -> !pdl.operation

    // Step 2: Bufferize the fill result to a shared (L2) memory allocation.
    // Purpose: Allocates the result buffer in memory space 1 (shared/L2), as required by the AIR/AIE memory hierarchy.
    // Assumption: The result of the fill op will be written to L2/shared memory.
    %buffer_res_shared, %new_fill = transform.structured.bufferize_to_allocation %fill
      {memory_space = 1, bufferize_destination_only, emit_dealloc} : !pdl.operation
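
    // Illustrative effect (a sketch, not an actual IR dump; the 256x256 shape,
    // i32 element type, and %c0 constant are assumed for illustration only):
    //   %alloc = memref.alloc() : memref<256x256xi32, 1>
    //   %t = bufferization.to_tensor %alloc restrict writable : memref<256x256xi32, 1>
    //   %filled = linalg.fill ins(%c0 : i32) outs(%t : tensor<256x256xi32>) -> tensor<256x256xi32>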

    // Step 2.5: Tile memory copy operations using for loops.
    // Purpose: Tiling the memcpy with for loops bounds the L2 memory footprint,
    // establishing the memory access patterns and tile sizes that guide subsequent L2 bufferization decisions.
    // Assumption: The tile sizes [0, 256] and [256, 0] are chosen to optimize L2 memory usage patterns.
    %func_1 = transform.structured.match ops{["func.func"]} in %arg1 : (!pdl.operation) -> !pdl.operation
    transform.air.convert_memref_copy_to_linalg_copy %func_1
    %copies = transform.structured.match ops{["linalg.copy"]} in %arg1 : (!pdl.operation) -> !pdl.operation
    %copy_1, %copy_2 = transform.split_handle %copies : (!pdl.operation) -> (!pdl.operation, !pdl.operation)
    %tiled_copy_1, %tiled_copy_for_loop_1 =
      transform.structured.tile_using_for %copy_1 tile_sizes [0, 256]
      : (!pdl.operation) -> (!pdl.operation, !transform.op<"scf.for">)
    %tiled_copy_2, %tiled_copy_for_loop_2 =
      transform.structured.tile_using_for %copy_2 tile_sizes [256, 0]
      : (!pdl.operation) -> (!pdl.operation, !transform.op<"scf.for">)
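
    // Illustrative shape of one copy after tiling (a sketch; operand names and
    // extents are assumed for illustration):
    //   scf.for %iv = %c0 to %dim step %c256 {
    //     %s = memref.subview %src[0, %iv] [..., 256] [1, 1] : ...
    //     %d = memref.subview %dst[0, %iv] [..., 256] [1, 1] : ...
    //     linalg.copy ins(%s : ...) outs(%d : ...)
    //   }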

    // Step 3: Tile matmul using scf.forall with tile size [64, 64].
    // Purpose: Introduces parallelism and prepares for mapping to AIE columns.
    // Assumption: The problem size is a multiple of 64, or padding will be handled later.
    %matmul_1 = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!pdl.operation) -> !pdl.operation
    %tiled_matmul_1, %forall_1 =
      transform.structured.tile_using_forall %matmul_1 tile_sizes [64, 64] : (!pdl.operation) -> (!pdl.operation, !pdl.operation)
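
    // Illustrative result (a sketch assuming a 256x256 matmul, which yields a
    // 4x4 grid of 64x64 tiles; names and types are for illustration only):
    //   scf.forall (%i, %j) in (4, 4) shared_outs(%out = %C) -> (tensor<256x256xi32>) {
    //     %a = tensor.extract_slice ... : ... to tensor<64x256xi32>
    //     %b = tensor.extract_slice ... : ... to tensor<256x64xi32>
    //     %c = tensor.extract_slice %out ... : ... to tensor<64x64xi32>
    //     %r = linalg.matmul ins(%a, %b : ...) outs(%c : tensor<64x64xi32>) -> tensor<64x64xi32>
    //     scf.forall.in_parallel { tensor.parallel_insert_slice %r into %out ... }
    //   }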

    // Step 4: Run canonicalization and CSE.
    // Purpose: Cleans up the IR after tiling, merges redundant ops, and prepares for further transforms.
    // Assumption: Canonicalization will simplify the IR and remove dead code.
    %func_2 = transform.structured.match ops{["func.func"]} in %arg1 : (!pdl.operation) -> !pdl.operation
    transform.apply_patterns to %func_2 {
      transform.apply_patterns.linalg.tiling_canonicalization
      transform.apply_patterns.scf.for_loop_canonicalization
      transform.apply_patterns.canonicalization
    } : !pdl.operation
    transform.apply_cse to %func_2 : !pdl.operation

    // Step 5: Fuse the fill operation into the forall loop.
    // Purpose: Ensures initialization is fused with the computation for efficiency.
    // Assumption: The fill's result is consumed inside the forall loop, so the fill can be cloned into it.
    %fused_fill_1 = transform.structured.match ops{["linalg.fill"]} in %arg1 : (!pdl.operation) -> !pdl.operation
    %fill_consumer = transform.get_consumers_of_result %fused_fill_1[0] : (!pdl.operation) -> (!pdl.operation)
    %fused_fill_2, %fused_loop_2 = transform.structured.fuse_into_containing_op %fused_fill_1 into %fill_consumer : (!pdl.operation, !pdl.operation) -> (!pdl.operation, !pdl.operation)
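
    // Illustrative effect (a sketch): the fill now initializes only the 64x64
    // tile owned by each forall iteration instead of the whole result:
    //   scf.forall (%i, %j) in (4, 4) ... {
    //     %c = tensor.extract_slice %out ... : ... to tensor<64x64xi32>
    //     %z = linalg.fill ins(%c0 : i32) outs(%c : tensor<64x64xi32>) -> tensor<64x64xi32>
    //     %r = linalg.matmul ins(%a, %b : ...) outs(%z : ...) -> tensor<64x64xi32>
    //     ...
    //   }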

    // Step 6: Pack by applying data tiling; linalg.matmul becomes linalg.generic.
    // Purpose: Prepares data for vectorized computation and memory layout optimization.
    // Assumption: Packing sizes are chosen for hardware efficiency.
    %packed = transform.structured.pack %tiled_matmul_1 packed_sizes = [4, 4, 8]
      : (!pdl.operation) -> (!pdl.operation)
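
    // Illustrative effect (a sketch; shapes assume the 64x64 tile with K = 256
    // from above): packed_sizes = [4, 4, 8] tile (m, n, k) by (4, 4, 8), so A is
    // packed with inner tiles [4, 8], B with [8, 4], and C with [4, 4], e.g.:
    //   %pa = tensor.pack %a inner_dims_pos = [0, 1] inner_tiles = [4, 8]
    //         into %pa_init : tensor<64x256xi32> -> tensor<16x32x4x8xi32>
    // and the matmul itself is rewritten as a linalg.generic over the packed layout.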

    // Step 7: Transpose A matrix for packed layout.
    // Purpose: Ensures correct memory layout for A operand.
    // Assumption: Outer permutation [1, 0] is correct for hardware mapping.
    %pack_producer_a = transform.get_producer_of_operand %packed[0]
      : (!pdl.operation) -> (!pdl.operation)
    %packed_a, %pack_a, %empty_unpack_a =
      transform.structured.pack_transpose %pack_producer_a with_compute_op(%packed)
      outer_perm = [1, 0] : (!pdl.operation, !pdl.operation)
      -> (!pdl.operation, !pdl.operation, !pdl.operation)

    // Step 8: Transpose B matrix for packed layout.
    // Purpose: Ensures correct memory layout for B operand.
    // Assumption: Outer and inner permutations [1, 0] are correct for hardware mapping.
    %pack_producer_b = transform.get_producer_of_operand %packed_a[1]
      : (!pdl.operation) -> (!pdl.operation)
    %packed_b, %pack_b, %empty_unpack_b =
      transform.structured.pack_transpose %pack_producer_b with_compute_op(%packed_a)
      outer_perm = [1, 0] inner_perm = [1, 0] : (!pdl.operation, !pdl.operation)
      -> (!pdl.operation, !pdl.operation, !pdl.operation)

    // Step 9: Transpose C matrix for packed layout.
    // Purpose: Ensures correct memory layout for C operand.
    // Assumption: Outer permutation [1, 0] is correct for hardware mapping.
    %unpack = transform.get_consumers_of_result %packed_b[0]
      : (!pdl.operation) -> (!pdl.operation)
    %packed_c, %pack_c, %unpack_c =
      transform.structured.pack_transpose %unpack with_compute_op(%packed_b)
      outer_perm = [1, 0] : (!pdl.operation, !pdl.operation)
      -> (!pdl.operation, !pdl.operation, !pdl.operation)
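
    // Illustrative effect of the three pack_transpose ops (a sketch; shapes
    // assumed from the packing above): outer_perm = [1, 0] swaps the outer tile
    // dimensions, e.g. for A:
    //   tensor<16x32x4x8xi32> -> tensor<32x16x4x8xi32>
    // and for B the additional inner_perm = [1, 0] also swaps the 8x4 inner
    // tile to 4x8, per the hardware-mapping assumptions stated above.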

    // Step 10: Bufferize result to local memory allocation (AIE local, memory_space=2).
    // Purpose: Moves result buffer to fast local memory for efficient AIE execution.
    // Assumption: The result fits in local memory and can be promoted.
    %buffer_c, %new_c = transform.structured.bufferize_to_allocation %pack_c
      {memory_space = 2, bufferize_destination_only, emit_dealloc} : !pdl.operation
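
    // Illustrative effect (a sketch; the shape follows from the [4, 4] packing
    // and outer transpose of the 64x64 C tile):
    //   %alloc_c = memref.alloc() : memref<16x16x4x4xi32, 2>
    // i.e. the packed C tile now lives in AIE-local memory (space 2).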

    // Step 11: Tile the reduction loop.
    // Purpose: Enables vectorized reduction and efficient computation.
    // Assumption: Tile size [0, 0, 4] is chosen for hardware efficiency.
    %tiled_reduction, %for_loop =
      transform.structured.tile_using_for %packed_c tile_sizes [0, 0, 4]
      : (!pdl.operation) -> (!pdl.operation, !pdl.operation)
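
    // Illustrative result (a sketch; with K = 256 and k packed by 8 there are
    // 32 outer reduction tiles, iterated 4 at a time):
    //   scf.for %k = %c0 to %c32 step %c4 iter_args(%acc = %init) -> (tensor<...>) {
    //     %r = linalg.generic ... // accumulates 4 reduction tiles per iteration
    //     scf.yield %r : tensor<...>
    //   }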

    // Step 12: Fuse the pack ops into the for loop.
    // Purpose: Ensures packed data is produced within the reduction loop, where it is consumed.
    // Assumption: The pack ops' results are consumed inside the loop, so they can be fused into it.
    %fused_pack_a, %e1 = transform.structured.fuse_into_containing_op %pack_a into %for_loop
      : (!pdl.operation, !pdl.operation) -> (!pdl.operation, !pdl.operation)
    %fused_pack_b, %e2 = transform.structured.fuse_into_containing_op %pack_b into %for_loop
      : (!pdl.operation, !pdl.operation) -> (!pdl.operation, !pdl.operation)

    // Step 13: Promote the inputs to local memory (AIE local, memory_space=2).
    // Purpose: Moves input operands to fast local memory for efficient AIE execution.
    // Assumption: The operands are suitable for promotion and local memory is available.
    %buffer_a, %new_a = transform.structured.bufferize_to_allocation %fused_pack_a
      {memory_space = 2, bufferize_destination_only, emit_dealloc} : !pdl.operation
    %buffer_b, %new_b = transform.structured.bufferize_to_allocation %fused_pack_b
      {memory_space = 2, bufferize_destination_only, emit_dealloc} : !pdl.operation

    // Step 14: Run canonicalization and CSE again.
    // Purpose: Cleans up after bufferization and promotion, merges redundant allocs/copies.
    // Assumption: Canonicalization will further simplify the IR.
    %func_3 = transform.structured.match ops{["func.func"]} in %arg1 : (!pdl.operation) -> !pdl.operation
    transform.apply_patterns to %func_3 {
      transform.apply_patterns.linalg.tiling_canonicalization
      transform.apply_patterns.scf.for_loop_canonicalization
      transform.apply_patterns.canonicalization
    } : !pdl.operation
    transform.apply_cse to %func_3 : !pdl.operation

    // Step 15: One-shot bufferization of the function.
    // Purpose: Converts all tensors to memrefs, finalizing bufferization for AIR/AIE lowering.
    // Assumption: The function is now in destination-passing style (DPS) and ready for bufferization.
    %func_op = transform.structured.match ops{["func.func"]} in %arg1 : (!pdl.operation) -> !pdl.operation
    %func_bufferized = transform.bufferization.one_shot_bufferize %func_op : (!pdl.operation) -> !pdl.operation
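
    // Illustrative effect (a sketch): remaining tensor SSA values become memref
    // operations, e.g.
    //   %r = linalg.generic ... outs(%t : tensor<...>) -> tensor<...>
    // becomes
    //   linalg.generic ... outs(%m : memref<...>)
    // with memref.copy ops inserted where buffers must be materialized.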

    // Step 16: Final canonicalization and AIR-specific cleanup.
    // Purpose: Removes redundant memcpy ops, eliminates cascade memcpy patterns, and canonicalizes.
    // Assumption: AIR passes will further optimize memory ops for hardware.
    %func6 = transform.structured.match ops{["func.func"]} in %arg1 : (!pdl.operation) -> !pdl.operation
    transform.apply_patterns to %func6 {
      transform.apply_patterns.linalg.tiling_canonicalization
      transform.apply_patterns.scf.for_loop_canonicalization
      transform.apply_patterns.canonicalization
    } : !pdl.operation
    transform.apply_cse to %func6 : !pdl.operation
    transform.apply_patterns to %func6 {
      transform.apply_patterns.canonicalization
    } : !pdl.operation
    %func_op_updated = transform.air.remove_uninitialized_copy %func6
    %func_op_updated_1 = transform.air.eliminate_cascade_memcpy %func_op_updated

    // Step 17: Tile linalg.generics for vectorization.
    // Purpose: Final tiling to enable vectorized execution on AIE hardware.
    // Assumption: Tile sizes [1, 1, 1, 0, 0, 0] are chosen for hardware vectorization.
    %linalg_generics = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!pdl.operation) -> !pdl.operation
    %inner_most_generics, %vec_loops:3 =
      transform.structured.tile_using_for %linalg_generics tile_sizes [1, 1, 1, 0, 0, 0]
      : (!pdl.operation) -> (!pdl.operation, !pdl.operation, !pdl.operation, !pdl.operation)
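
    // Illustrative result (a sketch): the three outer tile loops become
    // unit-step scf.for loops, leaving an innermost generic over a single
    // packed tile (4x8 for A, 4x8 for the transposed B, 4x4 for C) that the
    // vectorizer can map onto the AIE vector unit:
    //   scf.for %m ... { scf.for %n ... { scf.for %k ... {
    //     linalg.generic ... // one packed tile per iteration
    //   } } }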

    // Step 18: Tile linalg.fills for vectorized write.
    // Purpose: Enables vectorized writes for initialization.
    // Assumption: Tile sizes [1, 1] are chosen for hardware vectorization.
    %linalg_fills = transform.structured.match ops{["linalg.fill"]} in %arg1 : (!pdl.operation) -> !pdl.operation
    %inner_most_fills, %vec_fill_loops:2 =
      transform.structured.tile_using_for %linalg_fills tile_sizes [1, 1]
      : (!pdl.operation) -> (!pdl.operation, !pdl.operation, !pdl.operation)

    // Step 19: Map to AIR constructs.
    // Purpose: Converts high-level parallel constructs to AIE-specific operations for hardware execution.
    // Convert the parallel loop to an AIE herd operation for multi-core execution.
    %forall_as_herd = transform.structured.match ops{["scf.forall"]} in %arg1 : (!pdl.operation) -> !pdl.operation
    %parallel = transform.loop.forall_to_parallel %forall_as_herd : (!pdl.operation) -> !pdl.operation
    %herd = transform.air.par_to_herd %parallel

    // Convert memory copies to DMA operations for efficient data movement.
    %copies_in_herd = transform.structured.match ops{["memref.copy", "linalg.copy"]} in %herd : (!pdl.operation) -> !pdl.operation
    %dmas_from_copies = transform.air.copy_to_dma %copies_in_herd
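
    // Illustrative result (a sketch; op spellings follow mlir-air, and the
    // operand lists are elided): the parallel loop becomes an air.herd whose
    // body runs on a grid of AIE cores, and each copy becomes a DMA transfer:
    //   air.herd tile (%tx, %ty) in (%sx = %c4, %sy = %c4) args(...) {
    //     air.dma_memcpy_nd (%local[...], %shared[...]) ...
    //     ...
    //   }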

    // Apply vectorization to optimize for AIE vector units.
    %vectorized_herd = transform.air.herd_vectorize %herd
  }
}