//===- GPUTransformOps.td - GPU transform ops --------------*- tablegen -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef GPU_TRANSFORM_OPS
#define GPU_TRANSFORM_OPS

include "mlir/Dialect/Transform/IR/TransformDialect.td"
include "mlir/Dialect/Transform/IR/TransformEffects.td"
include "mlir/Dialect/Transform/IR/TransformInterfaces.td"
include "mlir/Dialect/PDL/IR/PDLTypes.td"
include "mlir/Interfaces/SideEffectInterfaces.td"
include "mlir/IR/OpBase.td"

def MapNestedForeachToThreads :
  Op<Transform_Dialect, "gpu.map_nested_foreach_to_threads",
    [FunctionalStyleTransformOpTrait,
     MemoryEffectsOpInterface,
     TransformEachOpTrait,
     TransformOpInterface]> {
  let description = [{
    Target the `gpu.launch` op and rewrite all `scf.foreach_thread` ops
    nested in it to distributed `gpu.thread_id` attributes.

    The operation searches for `scf.foreach_thread` ops nested under `target`
    and maps each such op to GPU threads. Mapping is one-to-one and the
    induction variables of `scf.foreach_thread` are rewritten to
    `gpu.thread_id` according to the `thread_dim_mapping` attribute.

    Sibling `scf.foreach_thread` ops are supported, in which case the union
    of the number of threads is computed and may result in predication.

    Multiple `scf.foreach_thread` ops are supported per `gpu.launch`, in
    which case the max of all the threads is computed and taken as the
    global `gpu.thread_id` range. If necessary, `scf.foreach_thread` ops
    that do not use the whole thread range result in predicated
    computations.

    Dynamic `scf.foreach_thread` trip counts are currently not supported.
    Dynamic block dim sizes are currently not supported.

    Only **bufferized** `scf.foreach_thread` ops are currently supported.
    Only `scf.foreach_thread` ops distributed to **at most 3 dimensions**
    are currently supported.

    Barriers are inserted after each `scf.foreach_thread` op for now.

    The operation alters the block size of the given `gpu.launch` using the
    `blockDim` argument.

    #### Return modes:

    This operation ignores non-`gpu.launch` ops and drops them in the
    return.

    If any `scf.foreach_thread` with tensors is found, the transform
    definitely fails.

    If all the `scf.foreach_thread` operations contained within the
    `LaunchOp` referred to by the `target` `PDLOperation` lower to GPU
    properly, the transform succeeds. Otherwise the transform definitely
    fails.

    The returned handle points to the same `LaunchOp` operand, consuming it
    and producing a new SSA value to satisfy chaining and linearity of the
    IR properties.

    #### Example:

    ```
    gpu.launch blocks(%bx, %by, %bz) in (%x = %0, %y = %1, %z = %2)
               threads(%tx, %ty, %tz) in (%tx = %3, %ty = %4, %tz = %5) {
      scf.foreach_thread (%i, %j) in (7, 9) {
        ... // body 1
      } {thread_dim_mapping = [1, 0, 2]}
      scf.foreach_thread (%i) in (12) {
        ... // body 2
      }
      gpu.terminator
    }
    ```

    is translated to:

    ```
    %bdimX = arith.constant 12 : index
    %bdimY = arith.constant 9 : index
    gpu.launch blocks(%bx, %by, %bz) in (%x = %0, %y = %1, %z = %2)
               threads(%tx, %ty, %tz) in (%tx = %bdimX, %ty = %bdimY, %tz = %5) {
      if (threadIdx.x < 9 && threadIdx.y < 7) {
        ... // body 1
      }
      gpu.barrier
      if (threadIdx.y < 1) {
        ... // body 2
      }
      gpu.barrier
      gpu.terminator
    }
    ```
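
    For illustration, this op can be driven from a transform script roughly
    as follows. This is a minimal sketch: the enclosing sequence, the
    `transform.structured.match` step, the handle names and the `blockDim`
    values are assumptions made for the example, not part of this op's
    definition.

    ```
    transform.sequence failures(propagate) {
    ^bb1(%arg1: !pdl.operation):
      // Match the gpu.launch op to distribute (assumed to exist in the
      // payload IR).
      %launch = transform.structured.match ops{["gpu.launch"]} in %arg1
      // Map nested scf.foreach_thread ops to threads, fixing the block size.
      %new_launch = transform.gpu.map_nested_foreach_to_threads %launch
          {blockDim = [12, 9, 1]}
    }
    ```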
  }];

  let arguments = (ins PDL_Operation:$target,
                   DefaultValuedAttr<I64ArrayAttr, "{}">:$blockDim,
                   DefaultValuedAttr<BoolAttr, "true">:$syncAfterDistribute);
  let results = (outs PDL_Operation:$result);

  let assemblyFormat = "$target attr-dict";
  let extraClassDeclaration = [{
    ::mlir::DiagnosedSilenceableFailure applyToOne(
        ::mlir::Operation *target,
        ::llvm::SmallVectorImpl<::mlir::Operation *> &results,
        ::mlir::transform::TransformState &state);
  }];
}

def MapForeachToBlocks :
  Op<Transform_Dialect, "gpu.map_foreach_to_blocks",
    [FunctionalStyleTransformOpTrait,
     MemoryEffectsOpInterface,
     TransformOpInterface,
     TransformEachOpTrait]> {
  let description = [{
    Target the `gpu.launch` op and rewrite the top-level `scf.foreach_thread`
    to a distributed `gpu.block_id` attribute. If the `generate_gpu_launch`
    attribute is set, the transform first generates a `gpu.launch` and moves
    the top-level `scf.foreach_thread` inside it.

    The operation searches for top-level `scf.foreach_thread` ops under the
    `gpu.launch` and maps each such op to GPU blocks. Mapping is
    one-to-one and the induction variables of `scf.foreach_thread` are
    rewritten to `gpu.block_id` according to the `thread_dim_mapping`
    attribute.

    Dynamic `scf.foreach_thread` trip counts are currently not supported.
    Dynamic block dim sizes are currently not supported.

    Only **bufferized** `scf.foreach_thread` ops are currently supported.
    Only `scf.foreach_thread` ops distributed to **at most 3 dimensions**
    are currently supported.

    The operation alters the grid size of the given `gpu.launch` using the
    `gridDim` argument.

    #### Return modes:

    This operation ignores non-`gpu.launch` ops and drops them in the
    return.

    If any `scf.foreach_thread` with tensors is found, the transform
    definitely fails.

    If all the `scf.foreach_thread` operations contained within the
    `LaunchOp` referred to by the `target` `PDLOperation` lower to GPU
    properly, the transform succeeds. Otherwise the transform definitely
    fails.

    The returned handle points to the same `LaunchOp` operand, consuming it
    and producing a new SSA value to satisfy chaining and linearity of the
    IR properties.
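
    #### Example:

    For illustration, a hypothetical invocation from a transform script.
    This is a minimal sketch: matching a `func.func` payload op, the handle
    names and the `gridDim` values are assumptions made for the example,
    not mandated by this op.

    ```
    // Match a function whose body contains a top-level scf.foreach_thread
    // (assumed), then generate a gpu.launch around it and map it to blocks.
    %func = transform.structured.match ops{["func.func"]} in %arg1
    %gpu_launch = transform.gpu.map_foreach_to_blocks %func
        {generate_gpu_launch, gridDim = [128, 1, 1]}
    ```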
  }];

  let arguments = (ins PDL_Operation:$target,
                   DefaultValuedAttr<I64ArrayAttr, "{}">:$gridDim,
                   UnitAttr:$generate_gpu_launch);
  let results = (outs PDL_Operation:$result);

  let assemblyFormat = "$target attr-dict";
  let extraClassDeclaration = [{
    ::mlir::DiagnosedSilenceableFailure applyToOne(
        ::mlir::Operation *target,
        ::llvm::SmallVectorImpl<::mlir::Operation *> &results,
        ::mlir::transform::TransformState &state);
  }];
}

#endif // GPU_TRANSFORM_OPS