|
| 1 | +#include "TaskIdPropagation.h" |
| 2 | +#include "mlir/Analysis/DataFlow/SparseAnalysis.h" |
| 3 | +#include "mlir/Analysis/DataFlowFramework.h" |
| 4 | +#include "mlir/Dialect/SCF/IR/SCF.h" |
| 5 | +#include "mlir/Support/LLVM.h" |
| 6 | +#include "nvidia/hopper/lib/Transforms/WarpSpecialization/Utility.h" |
| 7 | +#include "triton/Dialect/TritonGPU/Transforms/Utility.h" |
| 8 | +#include "llvm/ADT/STLExtras.h" |
| 9 | +#include "llvm/Support/ErrorHandling.h" |
| 10 | +#include "llvm/Support/raw_ostream.h" |
| 11 | + |
| 12 | +#define DEBUG_TYPE "task-id-propagation" |
| 13 | +#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ") |
| 14 | +#define LDBG(X) LLVM_DEBUG(DBGS() << X << "\n") |
| 15 | + |
| 16 | +using namespace mlir; |
| 17 | +using namespace mlir::dataflow; |
| 18 | + |
| 19 | +namespace mlir::triton::gpu { |
| 20 | + |
| 21 | +//===----------------------------------------------------------------------===// |
| 22 | +// TaskId |
| 23 | +//===----------------------------------------------------------------------===// |
| 24 | + |
| 25 | +void TaskId::print(raw_ostream &os) const { |
| 26 | + if (isUninitialized()) { |
| 27 | + os << "<UNINITIALIZED>"; |
| 28 | + return; |
| 29 | + } |
| 30 | + if (isUnknown()) { |
| 31 | + os << "<UNKNOWN>"; |
| 32 | + return; |
| 33 | + } |
| 34 | + return getTaskIds().print(os); |
| 35 | +} |
| 36 | + |
| 37 | +TaskId TaskId::join(const TaskId &lhs, const TaskId &rhs) { |
| 38 | + return TaskId::getUnknownTaskId(); |
| 39 | +} |
| 40 | + |
| 41 | +TaskId TaskId::meet(const TaskId &lhs, const TaskId &rhs) { |
| 42 | + if (lhs.isUnknown() || rhs.isUnknown()) |
| 43 | + return TaskId::getUnknownTaskId(); |
| 44 | + if (lhs.isUninitialized()) |
| 45 | + return rhs; |
| 46 | + if (rhs.isUninitialized()) |
| 47 | + return lhs; |
| 48 | + if (lhs == rhs) |
| 49 | + return lhs; |
| 50 | + |
| 51 | + auto context = lhs.getTaskIds().getContext(); |
| 52 | + auto lhsTasks = lhs.getTaskIds().asArrayRef(); |
| 53 | + auto rhsTasks = rhs.getTaskIds().asArrayRef(); |
| 54 | + // Meet the task ids by merging and deduplicating them |
| 55 | + SmallVector<AsyncTaskId> result(lhsTasks.begin(), lhsTasks.end()); |
| 56 | + result.insert(result.end(), rhsTasks.begin(), rhsTasks.end()); |
| 57 | + std::sort(result.begin(), result.end()); |
| 58 | + result.erase(std::unique(result.begin(), result.end()), result.end()); |
| 59 | + auto mergedAndDedupedTaskIds = |
| 60 | + TaskId(DenseI32ArrayAttr::get(context, ArrayRef<AsyncTaskId>(result))); |
| 61 | + return mergedAndDedupedTaskIds; |
| 62 | +} |
| 63 | + |
| 64 | +//===----------------------------------------------------------------------===// |
| 65 | +// TaskIdBackwardPropagation |
| 66 | +//===----------------------------------------------------------------------===// |
| 67 | + |
| 68 | +void TaskIdBackwardPropagation::propagateToYield( |
| 69 | + scf::YieldOp yieldOp, SmallVector<TaskId> &lattices) { |
| 70 | + for (auto [lattice, yieldOperand] : |
| 71 | + llvm::zip_equal(lattices, yieldOp->getOperands())) { |
| 72 | + auto yieldLattice = getLatticeElement(yieldOperand); |
| 73 | + ChangeResult changed = yieldLattice->meet(lattice); |
| 74 | + propagateIfChanged(yieldLattice, changed); |
| 75 | + } |
| 76 | +} |
| 77 | + |
| 78 | +void TaskIdBackwardPropagation::propagateToParent(Operation *op, |
| 79 | + const TaskId &taskId) { |
| 80 | + auto parentOp = op->getParentOp(); |
| 81 | + while (parentOp && !isa<triton::FuncOp>(parentOp)) { |
| 82 | + if (auto forOp = dyn_cast<scf::ForOp>(parentOp)) { |
| 83 | + // Propagate to the control operands of the for op. |
| 84 | + for (auto controlOperand : |
| 85 | + forOp.getOperands().take_front(forOp.getNumControlOperands())) { |
| 86 | + auto controlLattice = getLatticeElement(controlOperand); |
| 87 | + ChangeResult changed = controlLattice->meet(taskId); |
| 88 | + propagateIfChanged(controlLattice, changed); |
| 89 | + } |
| 90 | + } else if (auto ifOp = dyn_cast<scf::IfOp>(parentOp)) { |
| 91 | + auto cond = ifOp.getCondition(); |
| 92 | + auto condLattice = getLatticeElement(cond); |
| 93 | + ChangeResult changed = condLattice->meet(taskId); |
| 94 | + propagateIfChanged(condLattice, changed); |
| 95 | + } else { |
| 96 | + if (!isa<triton::FuncOp>(parentOp)) |
| 97 | + llvm_unreachable("Other parent ops are not supported."); |
| 98 | + } |
| 99 | + parentOp = parentOp->getParentOp(); |
| 100 | + } |
| 101 | +} |
| 102 | + |
| 103 | +LogicalResult TaskIdBackwardPropagation::visitOperation( |
| 104 | + Operation *op, ArrayRef<TaskIdLattice *> operands, |
| 105 | + ArrayRef<const TaskIdLattice *> results) { |
| 106 | + // Already annotated |
| 107 | + // TODO(Arda): Replace the following with getAsyncTaskIds when we no longer |
| 108 | + // need to dump the task ids into the IR. |
| 109 | + auto taskIdAttr = op->getAttrOfType<DenseI32ArrayAttr>("async_task_id"); |
| 110 | + if (taskIdAttr) { |
| 111 | + const auto annotated = TaskId(taskIdAttr); |
| 112 | + for (auto operandLattice : operands) { |
| 113 | + ChangeResult changed = operandLattice->meet(annotated); |
| 114 | + propagateIfChanged(operandLattice, changed); |
| 115 | + } |
| 116 | + // Propagate to the parent ops such as control flows |
| 117 | + propagateToParent(op, annotated); |
| 118 | + return success(); |
| 119 | + } |
| 120 | + // If it is not annotated by the user, propagate from results to the |
| 121 | + // operands |
| 122 | + for (const auto resultLattice : results) { |
| 123 | + for (auto operandLattice : operands) { |
| 124 | + ChangeResult changed = operandLattice->meet(resultLattice->getValue()); |
| 125 | + propagateIfChanged(operandLattice, changed); |
| 126 | + } |
| 127 | + } |
| 128 | + |
| 129 | + for (const auto resultLattice : results) |
| 130 | + propagateToParent(op, resultLattice->getValue()); |
| 131 | + |
| 132 | + return success(); |
| 133 | +} |
| 134 | + |
| 135 | +void TaskIdBackwardPropagation::visitBranchOperand(OpOperand &operand) { |
| 136 | + auto defOp = operand.getOwner(); |
| 137 | + assert(isa<scf::IfOp>(defOp) || isa<scf::ForOp>(defOp)); |
| 138 | + |
| 139 | + SmallVector<TaskId> lattices(defOp->getNumResults(), |
| 140 | + TaskId::getUninitialized()); |
| 141 | + for (auto [i, result] : llvm::enumerate(defOp->getResults())) { |
| 142 | + auto resultLattice = getLatticeElement(result); |
| 143 | + // Wait for all the results to be initialized. |
| 144 | + if (resultLattice->getValue().isUninitialized()) |
| 145 | + return; |
| 146 | + lattices[i] = |
| 147 | + resultLattice->getValue().meet(lattices[i], resultLattice->getValue()); |
| 148 | + } |
| 149 | + |
| 150 | + // Propagate to the yield ops |
| 151 | + if (auto forOp = dyn_cast<scf::ForOp>(defOp)) { |
| 152 | + auto yieldOp = cast<scf::YieldOp>(forOp.getBody()->getTerminator()); |
| 153 | + propagateToYield(yieldOp, lattices); |
| 154 | + } else if (auto ifOp = dyn_cast<scf::IfOp>(defOp)) { |
| 155 | + propagateToYield(ifOp.thenYield(), lattices); |
| 156 | + if (!ifOp.getElseRegion().empty()) |
| 157 | + propagateToYield(ifOp.elseYield(), lattices); |
| 158 | + } else { |
| 159 | + llvm_unreachable("Unknown branch operation"); |
| 160 | + } |
| 161 | + return; |
| 162 | + |
| 163 | + // TODO(Arda): Address what happens when loop is annotated |
| 164 | +} |
| 165 | + |
| 166 | +void TaskIdBackwardPropagation::visitCallOperand(OpOperand &operand) { |
| 167 | + llvm_unreachable( |
| 168 | + "Should not have any call operands in the IR after inlining."); |
| 169 | +} |
| 170 | + |
| 171 | +void TaskIdBackwardPropagation::setToExitState(TaskIdLattice *lattice) {} |
| 172 | + |
| 173 | +} // namespace mlir::triton::gpu |
0 commit comments