feat: transpose scatter to scatter transpose

avik-pal · avik-pal · commit c2dfd55473b1 · 2026-01-05T17:17:34.000-06:00
diff --git a/src/enzyme_ad/jax/Passes/EnzymeHLOOpt.cpp b/src/enzyme_ad/jax/Passes/EnzymeHLOOpt.cpp
@@ -56,6 +56,7 @@
 #include <cstddef>
 #include <iterator>
 #include <mlir/IR/Value.h>
+#include <mlir/IR/ValueRange.h>
 #include <numeric>
 #define DEBUG_TYPE "enzymehloopt"
 
@@ -22032,15 +22033,12 @@ struct TransposeReverse final
     if (!reverseOp->getResult(0).hasOneUse())
       return failure();
 
-    auto invPerm = getInversePermutation(op.getPermutation());
-    SmallVector<int64_t> newReverseDims(reverseOp.getDimensions().size());
-    for (auto [i, dim] : llvm::enumerate(reverseOp.getDimensions()))
-      newReverseDims[i] = invPerm[dim];
-
     auto newTranspose = stablehlo::TransposeOp::create(
         rewriter, op.getLoc(), reverseOp.getOperand(), op.getPermutation());
-    rewriter.replaceOpWithNewOp<stablehlo::ReverseOp>(op, newTranspose,
-                                                      newReverseDims);
+    rewriter.replaceOpWithNewOp<stablehlo::ReverseOp>(
+        op, newTranspose,
+        applyInversePermutationToDims(op.getPermutation(),
+                                      reverseOp.getDimensions()));
     return success();
   }
 };
@@ -28543,6 +28541,56 @@ struct FuseReshapeCollapseOrExpandDimsIntoReduce final
   }
 };
 
+/// Moves a transpose that consumes a scatter result onto the scatter operand:
+///
+///   transpose(scatter(input, indices, updates), perm)
+///     -> scatter(transpose(input, perm), indices, updates')
+///
+/// Result dimension `i` of the transpose is operand dimension `perm[i]` of the
+/// scatter, so every dimension-number field that names an operand dimension
+/// (inserted_window_dims, input_batching_dims, scatter_dims_to_operand_dims)
+/// is remapped through the inverse permutation. The update window dimensions
+/// are matched to the operand window dimensions in increasing operand order;
+/// when the permutation reorders the operand window dimensions, `updates` is
+/// transposed so that this positional matching still holds.
+///
+/// The pattern only fires when the scatter has a single input and the
+/// transpose is its only user.
+struct TransposeScatter final
+    : CheckedOpRewritePattern<stablehlo::TransposeOp, TransposeScatter> {
+  using CheckedOpRewritePattern::CheckedOpRewritePattern;
+
+  LogicalResult matchAndRewriteImpl(stablehlo::TransposeOp op,
+                                    PatternRewriter &rewriter) {
+    auto scatterOp = op.getOperand().getDefiningOp<stablehlo::ScatterOp>();
+    if (!scatterOp) {
+      return rewriter.notifyMatchFailure(op,
+                                         "TransposeOp with non-scatter input");
+    }
+
+    if (!isOnlyUsedInOperation(scatterOp, op)) {
+      return failure();
+    }
+
+    // The rewrite builds a single-result scatter. A variadic scatter would
+    // need one result type per input, so it is left untouched.
+    if (scatterOp.getInputs().size() != 1 ||
+        scatterOp.getUpdates().size() != 1) {
+      return rewriter.notifyMatchFailure(
+          op, "scatter with multiple inputs is not supported");
+    }
+
+    Value updates = scatterOp.getUpdates()[0];
+    auto updatesType = dyn_cast<RankedTensorType>(updates.getType());
+    if (!updatesType) {
+      return rewriter.notifyMatchFailure(op, "scatter updates are not ranked");
+    }
+
+    ArrayRef<int64_t> perm = op.getPermutation();
+    SmallVector<int64_t> invPerm = getInversePermutation(perm);
+
+    auto scatterDims = scatterOp.getScatterDimensionNumbers();
+    ArrayRef<int64_t> updateWindowDims = scatterDims.getUpdateWindowDims();
+    ArrayRef<int64_t> insertedWindowDims = scatterDims.getInsertedWindowDims();
+    ArrayRef<int64_t> inputBatchingDims = scatterDims.getInputBatchingDims();
+    ArrayRef<int64_t> indicesBatchingDims =
+        scatterDims.getScatterIndicesBatchingDims();
+    ArrayRef<int64_t> scatterDimsToOperandDims =
+        scatterDims.getScatterDimsToOperandDims();
+
+    if (inputBatchingDims.size() != indicesBatchingDims.size()) {
+      return rewriter.notifyMatchFailure(op, "mismatched batching dims");
+    }
+
+    // windowSlot[d] is the position of operand dimension `d` among the
+    // operand window dimensions (those that are neither inserted nor
+    // batching), or -1 when `d` is not a window dimension. Slot `k` is fed by
+    // update dimension `updateWindowDims[k]`.
+    SmallVector<int64_t> windowSlot(perm.size(), 0);
+    for (int64_t dim : insertedWindowDims)
+      windowSlot[dim] = -1;
+    for (int64_t dim : inputBatchingDims)
+      windowSlot[dim] = -1;
+    int64_t numWindowDims = 0;
+    for (int64_t &slot : windowSlot) {
+      if (slot == 0)
+        slot = numWindowDims++;
+    }
+    if (numWindowDims != static_cast<int64_t>(updateWindowDims.size())) {
+      return rewriter.notifyMatchFailure(
+          op, "update window dims do not match operand window dims");
+    }
+
+    // Walk the transposed operand in dimension order. The j-th window
+    // dimension found there must be fed by the update dimension that fed the
+    // corresponding window dimension of the original operand.
+    SmallVector<int64_t> updatesPerm(updatesType.getRank());
+    std::iota(updatesPerm.begin(), updatesPerm.end(), 0);
+    bool transposeUpdates = false;
+    int64_t newSlot = 0;
+    for (int64_t oldDim : perm) {
+      int64_t oldSlot = windowSlot[oldDim];
+      if (oldSlot < 0)
+        continue;
+      updatesPerm[updateWindowDims[newSlot]] = updateWindowDims[oldSlot];
+      transposeUpdates |= (oldSlot != newSlot);
+      ++newSlot;
+    }
+
+    // inserted_window_dims names operand dimensions and must stay sorted.
+    SmallVector<int64_t> newInsertedWindowDims =
+        applyPermutationToDims(invPerm, insertedWindowDims);
+    llvm::sort(newInsertedWindowDims);
+
+    // input_batching_dims must stay sorted, but entry `i` is paired with
+    // scatter_indices_batching_dims[i]; reorder both lists together.
+    SmallVector<int64_t> mappedBatchingDims =
+        applyPermutationToDims(invPerm, inputBatchingDims);
+    SmallVector<int64_t> batchOrder(mappedBatchingDims.size());
+    std::iota(batchOrder.begin(), batchOrder.end(), 0);
+    llvm::sort(batchOrder, [&](int64_t lhs, int64_t rhs) {
+      return mappedBatchingDims[lhs] < mappedBatchingDims[rhs];
+    });
+    SmallVector<int64_t> newInputBatchingDims;
+    SmallVector<int64_t> newIndicesBatchingDims;
+    for (int64_t idx : batchOrder) {
+      newInputBatchingDims.push_back(mappedBatchingDims[idx]);
+      newIndicesBatchingDims.push_back(indicesBatchingDims[idx]);
+    }
+
+    SmallVector<int64_t> newScatterDimsToOperandDims =
+        applyPermutationToDims(invPerm, scatterDimsToOperandDims);
+
+    // indices_are_sorted is stated relative to the operand dimensions the
+    // index components address. If the permutation changes the relative
+    // order of those dimensions the guarantee may no longer hold, so drop it.
+    bool indexOrderPreserved = true;
+    for (size_t i = 0, e = scatterDimsToOperandDims.size(); i < e; ++i) {
+      for (size_t j = i + 1; j < e; ++j) {
+        bool before = scatterDimsToOperandDims[i] < scatterDimsToOperandDims[j];
+        bool after =
+            newScatterDimsToOperandDims[i] < newScatterDimsToOperandDims[j];
+        if (before != after)
+          indexOrderPreserved = false;
+      }
+    }
+    BoolAttr indicesAreSorted = scatterOp.getIndicesAreSortedAttr();
+    if (!indexOrderPreserved && scatterOp.getIndicesAreSorted())
+      indicesAreSorted = rewriter.getBoolAttr(false);
+
+    // All checks passed; only now start creating IR.
+    SmallVector<Value> transposedInputs;
+    auto transposedInput = stablehlo::TransposeOp::create(
+        rewriter, op.getLoc(), scatterOp.getInputs()[0], perm);
+    transposedInputs.push_back(transposedInput.getResult());
+
+    SmallVector<Value> newUpdates;
+    if (transposeUpdates) {
+      auto transposedUpdates = stablehlo::TransposeOp::create(
+          rewriter, op.getLoc(), updates, updatesPerm);
+      newUpdates.push_back(transposedUpdates.getResult());
+    } else {
+      newUpdates.push_back(updates);
+    }
+
+    // update_window_dims keeps its positions: only the data in those
+    // positions was (possibly) permuted above.
+    auto newScatterDims = stablehlo::ScatterDimensionNumbersAttr::get(
+        rewriter.getContext(), updateWindowDims, newInsertedWindowDims,
+        newInputBatchingDims, newIndicesBatchingDims,
+        newScatterDimsToOperandDims, scatterDims.getIndexVectorDim());
+
+    auto newScatterOp = stablehlo::ScatterOp::create(
+        rewriter, op.getLoc(), TypeRange(op.getType()), transposedInputs,
+        scatterOp.getScatterIndices(), newUpdates, newScatterDims,
+        indicesAreSorted, scatterOp.getUniqueIndicesAttr());
+    // The update computation works on scalars and is independent of the
+    // operand layout, so it is moved over unchanged.
+    newScatterOp.getUpdateComputation().takeBody(
+        scatterOp.getUpdateComputation());
+    rewriter.replaceOp(op, newScatterOp->getResults());
+    return success();
+  }
+};
+
 ///////////////  End Imported from stablehlo
 
 // clang-format off
@@ -29065,13 +29113,14 @@ struct EnzymeHLOOptPass
     }
 
     if (passses & (2048 * 32)) {
-      patterns.add<TransposeWhile, TransposeSliceBase<stablehlo::SliceOp>,
-                   TransposeConcat, TransposeDUS, TransposeIota,
-                   TransposeReduceWindow, TransposeReduce, TransposeSelect,
-                   TransposeSliceBase<stablehlo::DynamicSliceOp>,
-                   TransposeReverse, TransposeBatchNormTraining,
-                   TransposeBatchNormInference, TransposeBatchNormGrad,
-                   TransposeIf, TransposeFFT, TransposeReshape>(context);
+      patterns
+          .add<TransposeWhile, TransposeSliceBase<stablehlo::SliceOp>,
+               TransposeConcat, TransposeDUS, TransposeIota,
+               TransposeReduceWindow, TransposeReduce, TransposeSelect,
+               TransposeSliceBase<stablehlo::DynamicSliceOp>, TransposeReverse,
+               TransposeBatchNormTraining, TransposeBatchNormInference,
+               TransposeBatchNormGrad, TransposeIf, TransposeFFT,
+               TransposeReshape, TransposeScatter>(context);
       patterns.add<TransposeElementwise>(true, context);
     }
 
diff --git a/src/enzyme_ad/jax/TransformOps/TransformOps.td b/src/enzyme_ad/jax/TransformOps/TransformOps.td
@@ -2694,3 +2694,8 @@ def ApplyWhileElementwiseReductionToReducePatterns : EnzymeHLOPatternOp<
     "while_elementwise_reduction_to_reduce"> {
   let patterns = ["WhileElementwiseReductionToReduce"];
 }
+
+// Pattern op with the name "transpose_scatter"; its pattern list contains the
+// single C++ rewrite pattern `TransposeScatter`.
+def ApplyTransposeScatterPatterns : EnzymeHLOPatternOp<
+    "transpose_scatter"> {
+  let patterns = ["TransposeScatter"];
+}
diff --git a/src/enzyme_ad/jax/Utils.cpp b/src/enzyme_ad/jax/Utils.cpp
@@ -2384,6 +2384,35 @@ SmallVector<int64_t> getInversePermutation(ArrayRef<int64_t> perm) {
   return res;
 }
 
+/// Maps every entry of `dims` through `perm`: element `i` of the result is
+/// `perm[dims[i]]`. The result has the same length and order as `dims`.
+SmallVector<int64_t> applyPermutationToDims(ArrayRef<int64_t> perm,
+                                            ArrayRef<int64_t> dims) {
+  SmallVector<int64_t> mapped;
+  mapped.reserve(dims.size());
+  for (int64_t dim : dims)
+    mapped.push_back(perm[dim]);
+  return mapped;
+}
+
+/// Gathers `values` through `perm`: element `i` of the result is
+/// `values[perm[i]]`, so the result has `perm.size()` elements.
+template <typename T>
+SmallVector<T> applyPermutation(ArrayRef<int64_t> perm, ArrayRef<T> values) {
+  SmallVector<T> res;
+  res.reserve(perm.size());
+  for (int64_t p : perm)
+    res.push_back(values[p]);
+  return res;
+}
+
+// The header only declares this template while the definition lives in this
+// translation unit. Instantiate it explicitly for the element types used by
+// the helpers in this file so that other translation units including the
+// header can link against it for those types.
+template SmallVector<int64_t> applyPermutation<int64_t>(ArrayRef<int64_t>,
+                                                        ArrayRef<int64_t>);
+template SmallVector<Value> applyPermutation<Value>(ArrayRef<int64_t>,
+                                                    ArrayRef<Value>);
+
+/// Maps every entry of `dims` through the inverse of `perm`, i.e. calls
+/// `applyPermutationToDims` with `getInversePermutation(perm)`.
+SmallVector<int64_t> applyInversePermutationToDims(ArrayRef<int64_t> perm,
+                                                   ArrayRef<int64_t> dims) {
+  const SmallVector<int64_t> inverse = getInversePermutation(perm);
+  return applyPermutationToDims(inverse, dims);
+}
+
+/// Gathers `values` through the inverse of `perm`, i.e. calls
+/// `applyPermutation` with `getInversePermutation(perm)`.
+template <typename T>
+SmallVector<T> applyInversePermutation(ArrayRef<int64_t> perm,
+                                       ArrayRef<T> values) {
+  const SmallVector<int64_t> inverse = getInversePermutation(perm);
+  return applyPermutation<T>(inverse, values);
+}
+
+// The header only declares this template while the definition lives in this
+// translation unit. Instantiate it explicitly for the element types used by
+// the helpers in this file so that other translation units including the
+// header can link against it for those types.
+template SmallVector<int64_t>
+applyInversePermutation<int64_t>(ArrayRef<int64_t>, ArrayRef<int64_t>);
+template SmallVector<Value> applyInversePermutation<Value>(ArrayRef<int64_t>,
+                                                           ArrayRef<Value>);
+
 Value transposeSliceHelper(stablehlo::TransposeOp transpose,
                            PatternRewriter &rewriter, stablehlo::SliceOp op) {
   return transposeSliceHelper(transpose, rewriter, op.getStartIndices(),
@@ -2449,10 +2478,9 @@ Value sliceTransposeHelper(stablehlo::TransposeOp transpose,
   auto newUpdate =
       TransposeOpCreate(rewriter, transpose->getLoc(), op.getUpdate(),
                         transpose.getPermutation());
-  SmallVector<Value> starts;
-  for (auto ind : getInversePermutation(transpose.getPermutation())) {
-    starts.push_back(op.getStartIndices()[ind]);
-  }
+  SmallVector<Value> startIndices = llvm::to_vector(op.getStartIndices());
+  auto starts = applyInversePermutation(transpose.getPermutation(),
+                                        ArrayRef<Value>(startIndices));
   return stablehlo::DynamicUpdateSliceOp::create(
       rewriter, transpose->getLoc(), transpose.getOperand(), newUpdate, starts);
 }
@@ -2461,12 +2489,10 @@ Value sliceTransposeHelper(stablehlo::TransposeOp transpose,
                            PatternRewriter &rewriter, ArrayRef<int64_t> starts,
                            ArrayRef<int64_t> limits,
                            ArrayRef<int64_t> strides) {
-  SmallVector<int64_t> start, end, step;
-  for (auto ind : getInversePermutation(transpose.getPermutation())) {
-    start.push_back(starts[ind]);
-    end.push_back(limits[ind]);
-    step.push_back(strides[ind]);
-  }
+  auto invPerm = getInversePermutation(transpose.getPermutation());
+  auto start = applyPermutation(invPerm, starts);
+  auto end = applyPermutation(invPerm, limits);
+  auto step = applyPermutation(invPerm, strides);
   return SliceOpCreate(rewriter, transpose.getLoc(), transpose.getOperand(),
                        start, end, step);
 }
@@ -2475,12 +2501,9 @@ Value sliceTransposeHelper(stablehlo::TransposeOp transpose,
                            PatternRewriter &rewriter,
                            ArrayRef<Value> sliceStarts,
                            ArrayRef<int64_t> sliceSizes) {
-  SmallVector<int64_t> sizes;
-  SmallVector<Value> starts;
-  for (auto ind : getInversePermutation(transpose.getPermutation())) {
-    sizes.push_back(sliceSizes[ind]);
-    starts.push_back(sliceStarts[ind]);
-  }
+  auto invPerm = getInversePermutation(transpose.getPermutation());
+  auto sizes = applyPermutation(invPerm, sliceSizes);
+  auto starts = applyPermutation(invPerm, sliceStarts);
   return DynamicSliceOpCreate(rewriter, transpose.getLoc(),
                               transpose.getOperand(), starts, sizes);
 }
@@ -2549,6 +2572,15 @@ bool isFusible(Operation *op, stablehlo::BroadcastInDimOp bcast) {
       .Default([](auto other) { return matchPattern(other, m_Constant()); });
 }
 
+/// Returns true when `op` is a transpose or broadcast_in_dim, a reshape for
+/// which `reshapeIsTranspose` holds, or matches `m_Constant()`.
+/// The `transpose` argument is not inspected; it only selects this overload.
+bool isFusible(Operation *op, stablehlo::TransposeOp transpose) {
+  if (isa<stablehlo::TransposeOp, stablehlo::BroadcastInDimOp>(op))
+    return true;
+  if (auto reshape = dyn_cast<stablehlo::ReshapeOp>(op))
+    return reshapeIsTranspose(reshape);
+  return matchPattern(op, m_Constant());
+}
+
 bool IsTensorFilled(Value input) {
   // Use a worklist-based approach to traverse the SSA def-use chain
   // and determine if the value is known to be a dense (fully-populated) matrix.
diff --git a/src/enzyme_ad/jax/Utils.h b/src/enzyme_ad/jax/Utils.h
@@ -1221,6 +1221,20 @@ bool canFuseIntoReduce(Operation *op);
 
 llvm::SmallVector<int64_t> getInversePermutation(ArrayRef<int64_t> perm);
 
+/// Maps dimension indices through a permutation: element `i` of the result is
+/// `perm[dims[i]]`.
+llvm::SmallVector<int64_t> applyPermutationToDims(ArrayRef<int64_t> perm,
+                                                  ArrayRef<int64_t> dims);
+
+/// Same as `applyPermutationToDims`, but uses `getInversePermutation(perm)`
+/// as the permutation.
+llvm::SmallVector<int64_t>
+applyInversePermutationToDims(ArrayRef<int64_t> perm, ArrayRef<int64_t> dims);
+
+/// Gathers values through a permutation: element `i` of the result is
+/// `values[perm[i]]`.
+/// NOTE(review): only declared here; the definition lives in Utils.cpp, so a
+/// caller in another translation unit depends on an instantiation for its `T`
+/// being emitted there.
+template <typename T>
+llvm::SmallVector<T> applyPermutation(ArrayRef<int64_t> perm,
+                                      ArrayRef<T> values);
+
+/// Same as `applyPermutation`, but uses `getInversePermutation(perm)` as the
+/// permutation.
+/// NOTE(review): only declared here; the definition lives in Utils.cpp, so a
+/// caller in another translation unit depends on an instantiation for its `T`
+/// being emitted there.
+template <typename T>
+llvm::SmallVector<T> applyInversePermutation(ArrayRef<int64_t> perm,
+                                             ArrayRef<T> values);
+
 Value transposeSliceHelper(stablehlo::TransposeOp transpose,
                            PatternRewriter &rewriter, stablehlo::SliceOp op);
 Value transposeSliceHelper(stablehlo::TransposeOp transpose,
@@ -1258,6 +1272,7 @@ Value sliceTransposeHelper(stablehlo::TransposeOp transpose,
 bool isFusible(stablehlo::TransposeOp transpose, Operation *op);
 bool isFusible(Operation *op, stablehlo::BroadcastInDimOp bcast);
 bool isFusible(Operation *op, stablehlo::ReshapeOp reshape);
+bool isFusible(Operation *op, stablehlo::TransposeOp transpose);
 
 template <typename OpTy>
 Value getIdentityValueForOp(OpBuilder &builder, Location loc, Type elemType);
diff --git a/src/enzyme_ad/jax/primitives.py b/src/enzyme_ad/jax/primitives.py
@@ -570,6 +570,7 @@ def optimization_passes(
             "transpose_if",
             "transpose_fft",
             "transpose_reshape",
+            "transpose_scatter",
         ]
     elif transpose_propagate == "down":
         transform_passes_list += [
diff --git a/test/lit_tests/transposescatter.mlir b/test/lit_tests/transposescatter.mlir
@@ -0,0 +1,96 @@
+// RUN: enzymexlamlir-opt %s --pass-pipeline='builtin.module(enzyme-hlo-opt{passses=65536},enzyme-hlo-opt)' | FileCheck %s
+
+// main1: a scatter with a multiply update on transposed operand and updates,
+// followed by a transpose of the result. The expected output scatters directly
+// into %arg0 and keeps only the transpose of the updates.
+// NOTE(review): the expected attributes move scatter_dims_to_operand_dims from
+// [1] to [0] but keep inserted_window_dims = [1], which also names an operand
+// dimension. Confirm the expected scatter writes the same elements as the
+// input program.
+func.func @main1(%arg0: tensor<5x2xf32>, %arg1: tensor<4x3x2xf32>) -> tensor<5x2xf32> {
+    %c = stablehlo.constant dense<[[[0, 1, 2, 3], [3, 1, 0, 2], [2, 4, 4, 2]]]> : tensor<1x3x4xi64>
+    %0 = stablehlo.transpose %arg0, dims = [1, 0] : (tensor<5x2xf32>) -> tensor<2x5xf32>
+    %1 = stablehlo.transpose %arg1, dims = [2, 1, 0] : (tensor<4x3x2xf32>) -> tensor<2x3x4xf32>
+    %2 = "stablehlo.scatter"(%0, %c, %1) <{scatter_dimension_numbers = #stablehlo.scatter<update_window_dims = [0], inserted_window_dims = [1], scatter_dims_to_operand_dims = [1]>}> ({
+    ^bb0(%arg2: tensor<f32>, %arg3: tensor<f32>):
+      %4 = stablehlo.multiply %arg2, %arg3 : tensor<f32>
+      stablehlo.return %4 : tensor<f32>
+    }) : (tensor<2x5xf32>, tensor<1x3x4xi64>, tensor<2x3x4xf32>) -> tensor<2x5xf32>
+    %3 = stablehlo.transpose %2, dims = [1, 0] : (tensor<2x5xf32>) -> tensor<5x2xf32>
+    return %3 : tensor<5x2xf32>
+}
+
+// CHECK: func.func @main1(%arg0: tensor<5x2xf32>, %arg1: tensor<4x3x2xf32>) -> tensor<5x2xf32> {
+// CHECK-NEXT{LITERAL}:     %c = stablehlo.constant dense<[[[0, 1, 2, 3], [3, 1, 0, 2], [2, 4, 4, 2]]]> : tensor<1x3x4xi64>
+// CHECK-NEXT:     %0 = stablehlo.transpose %arg1, dims = [2, 1, 0] : (tensor<4x3x2xf32>) -> tensor<2x3x4xf32>
+// CHECK-NEXT:     %1 = "stablehlo.scatter"(%arg0, %c, %0) <{scatter_dimension_numbers = #stablehlo.scatter<update_window_dims = [0], inserted_window_dims = [1], scatter_dims_to_operand_dims = [0]>}> ({
+// CHECK-NEXT:     ^bb0(%arg2: tensor<f32>, %arg3: tensor<f32>):
+// CHECK-NEXT:       %2 = stablehlo.multiply %arg2, %arg3 : tensor<f32>
+// CHECK-NEXT:       stablehlo.return %2 : tensor<f32>
+// CHECK-NEXT:     }) : (tensor<5x2xf32>, tensor<1x3x4xi64>, tensor<2x3x4xf32>) -> tensor<5x2xf32>
+// CHECK-NEXT:     return %1 : tensor<5x2xf32>
+// CHECK-NEXT: }
+
+// main2: same program shape as main1, but the update computation is an add.
+// NOTE(review): the expected attributes move scatter_dims_to_operand_dims from
+// [1] to [0] but keep inserted_window_dims = [1], which also names an operand
+// dimension. Confirm the expected scatter writes the same elements as the
+// input program.
+func.func @main2(%arg0: tensor<5x2xf32>, %arg1: tensor<4x3x2xf32>) -> tensor<5x2xf32> {
+    %c = stablehlo.constant dense<[[[0, 1, 2, 3], [3, 1, 0, 2], [2, 4, 4, 2]]]> : tensor<1x3x4xi64>
+    %0 = stablehlo.transpose %arg0, dims = [1, 0] : (tensor<5x2xf32>) -> tensor<2x5xf32>
+    %1 = stablehlo.transpose %arg1, dims = [2, 1, 0] : (tensor<4x3x2xf32>) -> tensor<2x3x4xf32>
+    %2 = "stablehlo.scatter"(%0, %c, %1) <{scatter_dimension_numbers = #stablehlo.scatter<update_window_dims = [0], inserted_window_dims = [1], scatter_dims_to_operand_dims = [1]>}> ({
+    ^bb0(%arg2: tensor<f32>, %arg3: tensor<f32>):
+      %4 = stablehlo.add %arg2, %arg3 : tensor<f32>
+      stablehlo.return %4 : tensor<f32>
+    }) : (tensor<2x5xf32>, tensor<1x3x4xi64>, tensor<2x3x4xf32>) -> tensor<2x5xf32>
+    %3 = stablehlo.transpose %2, dims = [1, 0] : (tensor<2x5xf32>) -> tensor<5x2xf32>
+    return %3 : tensor<5x2xf32>
+}
+
+// CHECK: func.func @main2(%arg0: tensor<5x2xf32>, %arg1: tensor<4x3x2xf32>) -> tensor<5x2xf32> {
+// CHECK-NEXT{LITERAL}:     %c = stablehlo.constant dense<[[[0, 1, 2, 3], [3, 1, 0, 2], [2, 4, 4, 2]]]> : tensor<1x3x4xi64>
+// CHECK-NEXT:     %0 = stablehlo.transpose %arg1, dims = [2, 1, 0] : (tensor<4x3x2xf32>) -> tensor<2x3x4xf32>
+// CHECK-NEXT:     %1 = "stablehlo.scatter"(%arg0, %c, %0) <{scatter_dimension_numbers = #stablehlo.scatter<update_window_dims = [0], inserted_window_dims = [1], scatter_dims_to_operand_dims = [0]>}> ({
+// CHECK-NEXT:     ^bb0(%arg2: tensor<f32>, %arg3: tensor<f32>):
+// CHECK-NEXT:       %2 = stablehlo.add %arg2, %arg3 : tensor<f32>
+// CHECK-NEXT:       stablehlo.return %2 : tensor<f32>
+// CHECK-NEXT:     }) : (tensor<5x2xf32>, tensor<1x3x4xi64>, tensor<2x3x4xf32>) -> tensor<5x2xf32>
+// CHECK-NEXT:     return %1 : tensor<5x2xf32>
+// CHECK-NEXT: }
+
+// main3: transpose of a scatter into a constant 32x32 operand where both
+// operand dimensions are inserted window dims. The expected output drops the
+// transpose and swaps scatter_dims_to_operand_dims from [0, 1] to [1, 0]; the
+// enzymexla.symmetric_matrix attribute on the scatter is absent from the
+// expected output.
+func.func @main3(%arg0: tensor<32x32xf32>, %arg1: tensor<32xf32>) -> tensor<32x32xf32> {
+    %c = stablehlo.constant dense<[[0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5], [6, 6], [7, 7], [8, 8], [9, 9], [10, 10], [11, 11], [12, 12], [13, 13], [14, 14], [15, 15], [16, 16], [17, 17], [18, 18], [19, 19], [20, 20], [21, 21], [22, 22], [23, 23], [24, 24], [25, 25], [26, 26], [27, 27], [28, 28], [29, 29], [30, 30], [31, 31]]> : tensor<32x2xi64>
+    %cst = stablehlo.constant dense<0.000000e+00> : tensor<32x32xf32>
+    %0 = "stablehlo.scatter"(%cst, %c, %arg1) <{scatter_dimension_numbers = #stablehlo.scatter<inserted_window_dims = [0, 1], scatter_dims_to_operand_dims = [0, 1], index_vector_dim = 1>, unique_indices = true}> ({
+    ^bb0(%arg2: tensor<f32>, %arg3: tensor<f32>):
+      stablehlo.return %arg3 : tensor<f32>
+    }) {enzymexla.symmetric_matrix = [#enzymexla<guaranteed NOTGUARANTEED>]} : (tensor<32x32xf32>, tensor<32x2xi64>, tensor<32xf32>) -> tensor<32x32xf32>
+    %1 = stablehlo.transpose %0, dims = [1, 0] : (tensor<32x32xf32>) -> tensor<32x32xf32>
+    %2 = stablehlo.add %arg0, %1 {enzymexla.symmetric_matrix = [#enzymexla<guaranteed NOTGUARANTEED>]} : tensor<32x32xf32>
+    return %2 : tensor<32x32xf32>
+}
+
+// CHECK: func.func @main3(%arg0: tensor<32x32xf32>, %arg1: tensor<32xf32>) -> tensor<32x32xf32> {
+// CHECK-NEXT{LITERAL}:     %c = stablehlo.constant dense<[[0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5], [6, 6], [7, 7], [8, 8], [9, 9], [10, 10], [11, 11], [12, 12], [13, 13], [14, 14], [15, 15], [16, 16], [17, 17], [18, 18], [19, 19], [20, 20], [21, 21], [22, 22], [23, 23], [24, 24], [25, 25], [26, 26], [27, 27], [28, 28], [29, 29], [30, 30], [31, 31]]> : tensor<32x2xi64>
+// CHECK-NEXT:     %cst = stablehlo.constant dense<0.000000e+00> : tensor<32x32xf32>
+// CHECK-NEXT:     %0 = "stablehlo.scatter"(%cst, %c, %arg1) <{scatter_dimension_numbers = #stablehlo.scatter<inserted_window_dims = [0, 1], scatter_dims_to_operand_dims = [1, 0], index_vector_dim = 1>, unique_indices = true}> ({
+// CHECK-NEXT:     ^bb0(%arg2: tensor<f32>, %arg3: tensor<f32>):
+// CHECK-NEXT:       stablehlo.return %arg3 : tensor<f32>
+// CHECK-NEXT:     }) : (tensor<32x32xf32>, tensor<32x2xi64>, tensor<32xf32>) -> tensor<32x32xf32>
+// CHECK-NEXT:     %1 = stablehlo.add %arg0, %0 {enzymexla.symmetric_matrix = [#enzymexla<guaranteed NOTGUARANTEED>]} : tensor<32x32xf32>
+// CHECK-NEXT:     return %1 : tensor<32x32xf32>
+// CHECK-NEXT: }
+
+// main4: rank-3 case with permutation [2, 1, 0]; the scatter has one inserted
+// operand dimension and a two-dimensional update window, and its update
+// computation returns a constant.
+// NOTE(review): the expected attributes move scatter_dims_to_operand_dims from
+// [0] to [2] but keep inserted_window_dims = [0] and pass the 3x4x3 updates
+// unchanged into a 3x4x4 operand. Confirm the expected scatter writes the same
+// elements as the input program.
+func.func @main4(%arg0: tensor<3x4x4xf64>) -> tensor<3x4x4xf64> {
+  %cst = stablehlo.constant dense<2.000000e+00> : tensor<f64>
+  %cst_0 = stablehlo.constant dense<2.000000e+00> : tensor<3x4x3xf64>
+  %c = stablehlo.constant dense<[[0], [2], [1]]> : tensor<3x1xi64>
+  %0 = stablehlo.transpose %arg0, dims = [2, 1, 0] : (tensor<3x4x4xf64>) -> tensor<4x4x3xf64>
+  %1 = "stablehlo.scatter"(%0, %c, %cst_0) <{indices_are_sorted = false, scatter_dimension_numbers = #stablehlo.scatter<update_window_dims = [1, 2], inserted_window_dims = [0], scatter_dims_to_operand_dims = [0], index_vector_dim = 1>, unique_indices = true}> ({
+  ^bb0(%arg1: tensor<f64>, %arg2: tensor<f64>):
+    stablehlo.return %cst : tensor<f64>
+  }) : (tensor<4x4x3xf64>, tensor<3x1xi64>, tensor<3x4x3xf64>) -> tensor<4x4x3xf64>
+  %2 = stablehlo.transpose %1, dims = [2, 1, 0] : (tensor<4x4x3xf64>) -> tensor<3x4x4xf64>
+  return %2 : tensor<3x4x4xf64>
+}
+
+// CHECK: func.func @main4(%arg0: tensor<3x4x4xf64>) -> tensor<3x4x4xf64> {
+// CHECK-NEXT:     %cst = stablehlo.constant dense<2.000000e+00> : tensor<f64>
+// CHECK-NEXT:     %cst_0 = stablehlo.constant dense<2.000000e+00> : tensor<3x4x3xf64>
+// CHECK-NEXT{LITERAL}:     %c = stablehlo.constant dense<[[0], [2], [1]]> : tensor<3x1xi64>
+// CHECK-NEXT:     %0 = "stablehlo.scatter"(%arg0, %c, %cst_0) <{indices_are_sorted = false, scatter_dimension_numbers = #stablehlo.scatter<update_window_dims = [1, 2], inserted_window_dims = [0], scatter_dims_to_operand_dims = [2], index_vector_dim = 1>, unique_indices = true}> ({
+// CHECK-NEXT:     ^bb0(%arg1: tensor<f64>, %arg2: tensor<f64>):
+// CHECK-NEXT:       stablehlo.return %cst : tensor<f64>
+// CHECK-NEXT:     }) : (tensor<3x4x4xf64>, tensor<3x1xi64>, tensor<3x4x3xf64>) -> tensor<3x4x4xf64>
+// CHECK-NEXT:     return %0 : tensor<3x4x4xf64>
+// CHECK-NEXT: }

Original file line number	Diff line number	Diff line change
`@@ -2694,3 +2694,8 @@ def ApplyWhileElementwiseReductionToReducePatterns : EnzymeHLOPatternOp<`
`2694`	`2694`	`"while_elementwise_reduction_to_reduce"> {`
`2695`	`2695`	`let patterns = ["WhileElementwiseReductionToReduce"];`
`2696`	`2696`	`}`
	`2697`	`+`
	`2698`	`+def ApplyTransposeScatterPatterns : EnzymeHLOPatternOp<`
	`2699`	`+ "transpose_scatter"> {`
	`2700`	`+ let patterns = ["TransposeScatter"];`
	`2701`	`+}`
Original file line number	Diff line number	Diff line change
`@@ -570,6 +570,7 @@ def optimization_passes(`
`570`	`570`	`"transpose_if",`
`571`	`571`	`"transpose_fft",`
`572`	`572`	`"transpose_reshape",`
	`573`	`+ "transpose_scatter",`
`573`	`574`	`]`
`574`	`575`	`elif transpose_propagate == "down":`
`575`	`576`	`transform_passes_list += [`