29 changes: 29 additions & 0 deletions test/TritonGPU/amd/amd-reorder-instructions.mlir
@@ -922,3 +922,32 @@ module attributes {"triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-war
tt.return
}
}

// Check that reordering preserves def-before-use for values that are used inside control-flow regions.
// For example, %12 must not be moved below the scf.if that defines %22, since %12 is used inside that op's nested regions.
// CHECK: %{{.+}} = tt.make_range
// CHECK: %{{.+}} = scf.if %{{.+}}
module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, triton_gpu.target = "hip:gfx942", "triton_gpu.threads-per-warp" = 64 : i32} {
tt.func public @reorder_across_nested(%arg1: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg6: i32, %arg9: i64, %arg10: i64) attributes {noinline = false} {
%12 = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32, #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [64], warpsPerCTA = [4], order = [0]}>>
%21 = arith.cmpi slt, %arg9, %arg10 : i64
%22 = scf.if %21 -> (tensor<512x!tt.ptr<f32>, #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [64], warpsPerCTA = [4], order = [0]}>>) {
%30 = tt.splat %arg1 : !tt.ptr<f32> -> tensor<512x!tt.ptr<f32>, #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [64], warpsPerCTA = [4], order = [0]}>>
%100 = scf.if %21 -> (tensor<512x!tt.ptr<f32>, #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [64], warpsPerCTA = [4], order = [0]}>>) {
%31 = tt.addptr %30, %12 : tensor<512x!tt.ptr<f32>, #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [64], warpsPerCTA = [4], order = [0]}>>, tensor<512xi32, #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [64], warpsPerCTA = [4], order = [0]}>>
scf.yield %31 : tensor<512x!tt.ptr<f32>, #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [64], warpsPerCTA = [4], order = [0]}>>
} else {
%31 = tt.addptr %30, %12 : tensor<512x!tt.ptr<f32>, #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [64], warpsPerCTA = [4], order = [0]}>>, tensor<512xi32, #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [64], warpsPerCTA = [4], order = [0]}>>
scf.yield %31 : tensor<512x!tt.ptr<f32>, #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [64], warpsPerCTA = [4], order = [0]}>>
}
scf.yield %100 : tensor<512x!tt.ptr<f32>, #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [64], warpsPerCTA = [4], order = [0]}>>
} else {
%32 = tt.splat %arg1 : !tt.ptr<f32> -> tensor<512x!tt.ptr<f32>, #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [64], warpsPerCTA = [4], order = [0]}>>
scf.yield %32 : tensor<512x!tt.ptr<f32>, #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [64], warpsPerCTA = [4], order = [0]}>>
}
%23 = tt.splat %arg6 : i32 -> tensor<512xi32, #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [64], warpsPerCTA = [4], order = [0]}>>
%24 = arith.cmpi slt, %12, %23 : tensor<512xi32, #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [64], warpsPerCTA = [4], order = [0]}>>
%25 = tt.load %22, %24 : tensor<512x!tt.ptr<f32>, #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [64], warpsPerCTA = [4], order = [0]}>>
tt.return
}
}
@@ -5,6 +5,7 @@
#include "mlir/IR/Verifier.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Pass/PassManager.h"
#include "mlir/Transforms/RegionUtils.h"
#include "triton/Dialect/TritonGPU/IR/Dialect.h"
#include "triton/Dialect/TritonGPU/Transforms/Utility.h"
#include <deque>
@@ -24,6 +25,73 @@ static bool isLocalLoadOrDotLayoutConversion(Operation *op) {
return false;
}

// Copy of mlir::getBackwardSlice, extended to also follow values that are
// defined in `op`'s block but only used inside `op`'s nested regions (the
// region walk below). This is a temporary local fix until these changes are
// upstreamed to MLIR.
static void getDeepBackwardSlice(Operation *op,
Collaborator: Consider making this fix to the MLIR code base, perhaps through BackwardSliceOptions?

Collaborator: Yeah BackwardSliceOptions will be a good place to handle this variant.

Collaborator: Yeah we should consider landing such general changes to upstream mlir. Then we can update llvm submodule to bring it in.

(A hedged sketch of such an upstream-based call site appears after getDeepBackwardSlice below.)

SetVector<Operation *> *backwardSlice,
const BackwardSliceOptions &options) {
if (!op || op->hasTrait<OpTrait::IsIsolatedFromAbove>())
return;

// Evaluate whether we should keep this def.
// This is useful in particular to implement scoping; i.e. return the
// transitive backwardSlice in the current scope.
if (options.filter && !options.filter(op))
return;

SetVector<Value> usedValues;
Block *opBlock = op->getBlock();
auto f = [&](OpOperand *nestedValue) {
// Filter out values that are not defined in the block
// that contains 'op'. This is to avoid including values
// that are defined in the nested regions of 'op'.
if (auto *nestedOp = nestedValue->get().getDefiningOp()) {
if (opBlock == nestedOp->getBlock()) {
usedValues.insert(nestedValue->get());
}
}
};

// collect all the values used in the nested regions of this op
// SetVector<Region*> nestedRegions;
Collaborator: Should this line be removed? Also it will be good to mark which part is different from mlir::getBackwardSlice.

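// The region walk below is the part that differs from mlir::getBackwardSlice:
// it also collects values that are defined in `op`'s block but only used
// inside `op`'s nested regions.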
for (auto &region : op->getRegions()) {
region.walk([&](Region *nestedRegion) {
mlir::visitUsedValuesDefinedAbove(*nestedRegion, *nestedRegion, f);
});
}

// collect all the values used in the op
for (const auto &en : llvm::enumerate(op->getOperands())) {
usedValues.insert(en.value());
}

for (const auto &en : llvm::enumerate(usedValues)) {
auto operand = en.value();
if (auto *definingOp = operand.getDefiningOp()) {
if (backwardSlice->count(definingOp) == 0)
getDeepBackwardSlice(definingOp, backwardSlice, options);
} else if (auto blockArg = dyn_cast<BlockArgument>(operand)) {
if (options.omitBlockArguments)
continue;

Block *block = blockArg.getOwner();
Operation *parentOp = block->getParentOp();
// TODO: determine whether we want to recurse backward into the other
// blocks of parentOp, which are not technically backward unless they flow
// into us. For now, just bail.
if (parentOp && backwardSlice->count(parentOp) == 0) {
assert(parentOp->getNumRegions() == 1 &&
parentOp->getRegion(0).getBlocks().size() == 1);
getDeepBackwardSlice(parentOp, backwardSlice, options);
}
} else {
llvm_unreachable("No definingOp and not a block argument.");
}
}

backwardSlice->insert(op);
}
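
Picking up the review suggestion above, the following is a hedged sketch (not part of this PR) of what the call site in the pass could look like if this behavior came from upstream MLIR instead of the local copy. It assumes the vendored LLVM/MLIR revision exposes an omitUsesFromAbove flag on BackwardSliceOptions with the semantics described in the comments; the helper name collectBackwardSliceViaUpstream is made up for illustration, while op, block, and backwardSet mirror the existing pass code later in this diff.

// Hedged sketch only (not part of this PR). Assumes the vendored MLIR's
// BackwardSliceOptions provides an omitUsesFromAbove flag; if it does, the
// local getDeepBackwardSlice copy above could likely be replaced by a plain
// getBackwardSlice call along these lines.
#include "mlir/Analysis/SliceAnalysis.h" // getBackwardSlice, BackwardSliceOptions
#include "mlir/IR/Operation.h"

using namespace mlir;

// Illustrative helper; the real pass would do this inline at the call site.
static void collectBackwardSliceViaUpstream(Operation *op, Block *block,
                                            SetVector<Operation *> &backwardSet) {
  BackwardSliceOptions options;
  // Only move ops residing in the same block (mirrors the pass's existing filter).
  options.filter = [&](Operation *defOp) { return defOp->getBlock() == block; };
  // Assumption: clearing this flag also pulls in values that are defined in
  // `op`'s block but only used inside `op`'s nested regions.
  options.omitUsesFromAbove = false;
  // Include `op` itself in the slice, replacing the manual backwardSet.insert(op).
  options.inclusive = true;
  getBackwardSlice(op, &backwardSet, options);
}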

// Search through block to find earliest insertion point for move op. This can
// be either an atomic op or last usage of source pointer. Search ends when move
// op is encountered.
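
The body of the search the comment above describes is collapsed in this diff view. As a rough, hypothetical illustration only (names, op types, and details are assumptions, not the pass's actual code), the search might look something like this:

// Hypothetical sketch of the described search; not the pass's real implementation.
#include "llvm/ADT/STLExtras.h"               // llvm::is_contained
#include "mlir/IR/Operation.h"
#include "triton/Dialect/Triton/IR/Dialect.h" // triton::AtomicRMWOp, triton::AtomicCASOp

static mlir::Operation *findEarliestInsertionPointSketch(mlir::Block *block,
                                                         mlir::Operation *moveOp,
                                                         mlir::Value srcPtr) {
  mlir::Operation *insertPoint = nullptr;
  for (mlir::Operation &op : *block) {
    // Search ends when the op to be moved is encountered.
    if (&op == moveOp)
      break;
    // An atomic op imposes an ordering constraint we must not hoist across.
    if (llvm::isa<triton::AtomicRMWOp, triton::AtomicCASOp>(&op))
      insertPoint = &op;
    // The last use of the source pointer also bounds how early we can move.
    if (srcPtr && llvm::is_contained(op.getOperands(), srcPtr))
      insertPoint = &op;
  }
  return insertPoint;
}
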
@@ -221,8 +289,7 @@ class TritonAMDGPUReorderInstructionsPass
// Only move ops residing in the same block.
return defBlock == block;
};
mlir::getBackwardSlice(op, &backwardSet, options);
backwardSet.insert(op);
getDeepBackwardSlice(op, &backwardSet, options);

// Don't move a local_store if its source is a load from
// the same iteration.