
Commit 307d809

[AMD] getBackwardSlice variant with handling for op regions
mlir::getBackwardSlice does not handle op regions: values used only inside an op's nested regions are not followed back to their defining ops. The resulting backward slice may not be in topological order, which can let the reordering pass move a value's use before its def. This is a temporary local fix until these changes are upstreamed to MLIR.
1 parent d31ccfe commit 307d809
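For context, here is a minimal sketch (not part of the commit) of the gap being worked around: upstream mlir::getBackwardSlice only follows an op's direct operands, so a value that is captured from the enclosing block and used only inside the op's regions (like %12 inside the scf.if ops in the test below) never pulls its defining op into the slice. The helper name collectMissedRegionDeps is purely illustrative; mlir::getBackwardSlice and mlir::visitUsedValuesDefinedAbove are the same MLIR entry points the patch uses, with the getBackwardSlice signature assumed to match the call visible in this diff.

// Illustrative only: list the defining ops that the upstream slice misses for
// an op with regions, e.g. an scf.if that captures values from its enclosing block.
#include "mlir/Analysis/SliceAnalysis.h"
#include "mlir/IR/Operation.h"
#include "mlir/Transforms/RegionUtils.h"
#include "llvm/ADT/SetVector.h"

static llvm::SetVector<mlir::Operation *>
collectMissedRegionDeps(mlir::Operation *op,
                        const mlir::BackwardSliceOptions &options) {
  // What upstream computes: it transitively walks op->getOperands() and never
  // looks inside op's own regions.
  llvm::SetVector<mlir::Operation *> upstreamSlice;
  mlir::getBackwardSlice(op, &upstreamSlice, options);

  // Values used anywhere inside op's regions but defined above them are the
  // ones upstream can miss; collect their defining ops.
  llvm::SetVector<mlir::Operation *> missed;
  for (mlir::Region &region : op->getRegions()) {
    mlir::visitUsedValuesDefinedAbove(
        region, region, [&](mlir::OpOperand *operand) {
          if (mlir::Operation *def = operand->get().getDefiningOp())
            if (!upstreamSlice.contains(def))
              missed.insert(def);
        });
  }
  return missed;
}

If the reordering pass moves an op using only the upstream slice, a defining op reported by this sketch can be reordered relative to its region-level use, which is exactly the def-before-use violation the new test guards against.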

2 files changed: +98 −2 lines changed

test/TritonGPU/amd/amd-reorder-instructions.mlir

Lines changed: 29 additions & 0 deletions
@@ -922,3 +922,32 @@ module attributes {"triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-war
     tt.return
   }
 }
+
+// Check that reordering preserves def-before-use for values used inside control flow regions
+// For example, %12 should not be moved below the scf.if op %22
+// CHECK: %{{.+}} = tt.make_range
+// CHECK: %{{.+}} = scf.if %{{.+}}
+module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, triton_gpu.target = "hip:gfx942", "triton_gpu.threads-per-warp" = 64 : i32} {
+  tt.func public @reoder_across_nested(%arg1: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg6: i32, %arg9: i64, %arg10: i64) attributes {noinline = false} {
+    %12 = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32, #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [64], warpsPerCTA = [4], order = [0]}>>
+    %21 = arith.cmpi slt, %arg9, %arg10 : i64
+    %22 = scf.if %21 -> (tensor<512x!tt.ptr<f32>, #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [64], warpsPerCTA = [4], order = [0]}>>) {
+      %30 = tt.splat %arg1 : !tt.ptr<f32> -> tensor<512x!tt.ptr<f32>, #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [64], warpsPerCTA = [4], order = [0]}>>
+      %100 = scf.if %21 -> (tensor<512x!tt.ptr<f32>, #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [64], warpsPerCTA = [4], order = [0]}>>) {
+        %31 = tt.addptr %30, %12 : tensor<512x!tt.ptr<f32>, #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [64], warpsPerCTA = [4], order = [0]}>>, tensor<512xi32, #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [64], warpsPerCTA = [4], order = [0]}>>
+        scf.yield %31 : tensor<512x!tt.ptr<f32>, #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [64], warpsPerCTA = [4], order = [0]}>>
+      } else {
+        %31 = tt.addptr %30, %12 : tensor<512x!tt.ptr<f32>, #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [64], warpsPerCTA = [4], order = [0]}>>, tensor<512xi32, #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [64], warpsPerCTA = [4], order = [0]}>>
+        scf.yield %31 : tensor<512x!tt.ptr<f32>, #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [64], warpsPerCTA = [4], order = [0]}>>
+      }
+      scf.yield %100 : tensor<512x!tt.ptr<f32>, #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [64], warpsPerCTA = [4], order = [0]}>>
+    } else {
+      %32 = tt.splat %arg1 : !tt.ptr<f32> -> tensor<512x!tt.ptr<f32>, #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [64], warpsPerCTA = [4], order = [0]}>>
+      scf.yield %32 : tensor<512x!tt.ptr<f32>, #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [64], warpsPerCTA = [4], order = [0]}>>
+    }
+    %23 = tt.splat %arg6 : i32 -> tensor<512xi32, #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [64], warpsPerCTA = [4], order = [0]}>>
+    %24 = arith.cmpi slt, %12, %23 : tensor<512xi32, #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [64], warpsPerCTA = [4], order = [0]}>>
+    %25 = tt.load %22, %24 : tensor<512x!tt.ptr<f32>, #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [64], warpsPerCTA = [4], order = [0]}>>
+    tt.return
+  }
+}

third_party/amd/lib/TritonAMDGPUTransforms/ReorderInstructions.cpp

Lines changed: 69 additions & 2 deletions
@@ -5,6 +5,7 @@
 #include "mlir/IR/Verifier.h"
 #include "mlir/Pass/Pass.h"
 #include "mlir/Pass/PassManager.h"
+#include "mlir/Transforms/RegionUtils.h"
 #include "triton/Dialect/TritonGPU/IR/Dialect.h"
 #include "triton/Dialect/TritonGPU/Transforms/Utility.h"
 #include <deque>
@@ -24,6 +25,73 @@ static bool isLocalLoadOrDotLayoutConversion(Operation *op) {
   return false;
 }

+// Copy of mlir::getBackwardSlice with changes to handle nested regions.
+// This is a temporary local fix until these changes are upstreamed to mlir.
+static void getDeepBackwardSlice(Operation *op,
+                                 SetVector<Operation *> *backwardSlice,
+                                 const BackwardSliceOptions &options) {
+  if (!op || op->hasTrait<OpTrait::IsIsolatedFromAbove>())
+    return;
+
+  // Evaluate whether we should keep this def.
+  // This is useful in particular to implement scoping; i.e. return the
+  // transitive backwardSlice in the current scope.
+  if (options.filter && !options.filter(op))
+    return;
+
+  SetVector<Value> usedValues;
+  Block *opBlock = op->getBlock();
+  auto f = [&](OpOperand *nestedValue) {
+    // Filter out values that are not defined in the block
+    // that contains 'op'. This is to avoid including values
+    // that are defined in the nested regions of 'op'.
+    if (auto *nestedOp = nestedValue->get().getDefiningOp()) {
+      if (opBlock == nestedOp->getBlock()) {
+        usedValues.insert(nestedValue->get());
+      }
+    }
+  };
+
+  // collect all the values used in the nested regions of this op
+  // SetVector<Region*> nestedRegions;
+  for (auto &region : op->getRegions()) {
+    region.walk([&](Region *nestedRegion) {
+      mlir::visitUsedValuesDefinedAbove(*nestedRegion, *nestedRegion, f);
+    });
+  }
+
+  // collect all the values used in the op
+  for (const auto &en : llvm::enumerate(op->getOperands())) {
+    usedValues.insert(en.value());
+  }
+
+  for (const auto &en : llvm::enumerate(usedValues)) {
+    auto operand = en.value();
+    if (auto *definingOp = operand.getDefiningOp()) {
+      if (backwardSlice->count(definingOp) == 0)
+        getDeepBackwardSlice(definingOp, backwardSlice, options);
+    } else if (auto blockArg = dyn_cast<BlockArgument>(operand)) {
+      if (options.omitBlockArguments)
+        continue;
+
+      Block *block = blockArg.getOwner();
+      Operation *parentOp = block->getParentOp();
+      // TODO: determine whether we want to recurse backward into the other
+      // blocks of parentOp, which are not technically backward unless they flow
+      // into us. For now, just bail.
+      if (parentOp && backwardSlice->count(parentOp) == 0) {
+        assert(parentOp->getNumRegions() == 1 &&
+               parentOp->getRegion(0).getBlocks().size() == 1);
+        getDeepBackwardSlice(parentOp, backwardSlice, options);
+      }
+    } else {
+      llvm_unreachable("No definingOp and not a block argument.");
+    }
+  }
+
+  backwardSlice->insert(op);
+}
+
 // Search through block to find earliest insertion point for move op. This can
 // be either an atomic op or last usage of source pointer. Search ends when move
 // op is encountered.
@@ -221,8 +289,7 @@ class TritonAMDGPUReorderInstructionsPass
       // Only move ops residing in the same block.
      return defBlock == block;
     };
-    mlir::getBackwardSlice(op, &backwardSet, options);
-    backwardSet.insert(op);
+    getDeepBackwardSlice(op, &backwardSet, options);

     // Don't move a local_store if its source is a load from
     // the same iteration.
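For orientation, here is a sketch of how the updated call site reads with the surrounding lines filled in. The wrapper name buildBackwardSet and the variable names op and block are assumptions inferred from the visible diff context, not verbatim pass code; the visible change itself is that one getDeepBackwardSlice call replaces the old getBackwardSlice-plus-insert pair, since getDeepBackwardSlice inserts op itself before returning.

// Sketch only, assumed names: how the new helper slots into the pass.
// getDeepBackwardSlice is the file-local helper added earlier in this diff.
#include "mlir/Analysis/SliceAnalysis.h"
#include "llvm/ADT/SetVector.h"

static llvm::SetVector<mlir::Operation *>
buildBackwardSet(mlir::Operation *op, mlir::Block *block) {
  llvm::SetVector<mlir::Operation *> backwardSet;
  mlir::BackwardSliceOptions options;
  options.filter = [&](mlir::Operation *defOp) {
    mlir::Block *defBlock = defOp->getBlock();
    // Only move ops residing in the same block (from the visible context above).
    return defBlock == block;
  };
  // Previously: mlir::getBackwardSlice(op, &backwardSet, options);
  //             backwardSet.insert(op);
  // The new helper already inserts `op`, so a single call is enough.
  getDeepBackwardSlice(op, &backwardSet, options);
  return backwardSet;
}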
