Commit c051c60

feat: enable raising more operations from loops (#1567)

* feat: enable raising more operations from loops
* test: add test cases

Parent: bca1e1c
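
In short: the hard-coded allowlist of liftable ops in GreedyWhileLoopBatchFission is replaced by a check for BatchOpInterface, so any op implementing that interface (or carrying the Elementwise trait) can now be raised out of a while loop. ConcatInsertDimToBatch and SliceToBatch instantiations, together with the corresponding transform-op pattern definitions, are added for stablehlo.concatenate, stablehlo.get_dimension_size, and stablehlo.reverse, and three lit tests cover the new raisings.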

File tree

5 files changed: +128 -5 lines changed


src/enzyme_ad/jax/Passes/AutoBatching.cpp

Lines changed: 8 additions & 5 deletions

@@ -867,11 +867,8 @@ LogicalResult GreedyWhileLoopBatchFission::matchAndRewriteImpl(
     if (!validReshapes)
       continue;

-    // TODO: add scatter here once batch interface is
-    if (isa<stablehlo::DotGeneralOp, stablehlo::GatherOp, stablehlo::ReduceOp,
-            stablehlo::SortOp, stablehlo::TransposeOp,
-            stablehlo::BroadcastInDimOp, stablehlo::ReduceWindowOp>(op) ||
-        op->hasTrait<OpTrait::Elementwise>()) {
+    auto batchInterface = dyn_cast<BatchOpInterface>(op);
+    if (batchInterface || op->hasTrait<OpTrait::Elementwise>()) {
       if (liftOperationByBatching(rewriter, whileOp, slices, op, info,
                                   intermediateReshape)) {
         anyOpRewritten = true;
@@ -1247,6 +1244,9 @@ struct AutoBatchingPass
         // op interface is implemented
         ConcatInsertDimToBatch<stablehlo::SortOp>,
         ConcatInsertDimToBatch<stablehlo::ReduceWindowOp>,
+        ConcatInsertDimToBatch<stablehlo::ConcatenateOp>,
+        ConcatInsertDimToBatch<stablehlo::GetDimensionSizeOp>,
+        ConcatInsertDimToBatch<stablehlo::ReverseOp>,
         ConcatInsertDimElementwiseToBatch>(context);
   }

@@ -1258,6 +1258,9 @@ struct AutoBatchingPass
         SliceToBatch<stablehlo::TransposeOp>,
         SliceToBatch<stablehlo::BroadcastInDimOp>,
         SliceToBatch<stablehlo::ReduceWindowOp>,
+        SliceToBatch<stablehlo::ConcatenateOp>,
+        SliceToBatch<stablehlo::GetDimensionSizeOp>,
+        SliceToBatch<stablehlo::ReverseOp>,
         // SliceToBatchReshape,
         SliceToBatchElementwise>(context);
   }
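
The practical effect of the interface check: any op implementing BatchOpInterface whose operands come from per-iteration slices can now be lifted out of the loop as one batched op, not just the ops on the old allowlist. A minimal sketch of the kind of rewrite this enables (a simplified, hypothetical variant of the tests below; the function name and shapes are invented for illustration):

// Before raising: each row is sliced out, reversed, and re-concatenated.
func.func @example(%arg0: tensor<2x4xf32>) -> tensor<2x4xf32> {
  %0 = stablehlo.slice %arg0 [0:1, 0:4] : (tensor<2x4xf32>) -> tensor<1x4xf32>
  %1 = stablehlo.reverse %0, dims = [1] : tensor<1x4xf32>
  %2 = stablehlo.slice %arg0 [1:2, 0:4] : (tensor<2x4xf32>) -> tensor<1x4xf32>
  %3 = stablehlo.reverse %2, dims = [1] : tensor<1x4xf32>
  %4 = stablehlo.concatenate %1, %3, dim = 0 : (tensor<1x4xf32>, tensor<1x4xf32>) -> tensor<2x4xf32>
  return %4 : tensor<2x4xf32>
}

// After --auto-batching and cleanup, the per-slice reverses should collapse
// into (roughly) a single batched op:
//   %0 = stablehlo.reverse %arg0, dims = [1] : tensor<2x4xf32>
//   return %0 : tensor<2x4xf32>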

src/enzyme_ad/jax/TransformOps/TransformOps.td

Lines changed: 24 additions & 0 deletions

@@ -2418,6 +2418,18 @@ def ApplyConcatInsertDimSortPatterns : EnzymeHLOPatternOp<
     "concat_insert_dim_sort"> {
   let patterns = ["ConcatInsertDimToBatch<stablehlo::SortOp>"];
 }
+def ApplyConcatInsertDimConcatenatePatterns : EnzymeHLOPatternOp<
+    "concat_insert_dim_concatenate"> {
+  let patterns = ["ConcatInsertDimToBatch<stablehlo::ConcatenateOp>"];
+}
+def ApplyConcatInsertDimGetDimensionSizePatterns : EnzymeHLOPatternOp<
+    "concat_insert_dim_get_dimension_size"> {
+  let patterns = ["ConcatInsertDimToBatch<stablehlo::GetDimensionSizeOp>"];
+}
+def ApplyConcatInsertDimReversePatterns : EnzymeHLOPatternOp<
+    "concat_insert_dim_reverse"> {
+  let patterns = ["ConcatInsertDimToBatch<stablehlo::ReverseOp>"];
+}
 def ApplyConcatInsertDimReduceWindowPatterns : EnzymeHLOPatternOp<
     "concat_insert_dim_reduce_window"> {
   let patterns = ["ConcatInsertDimToBatch<stablehlo::ReduceWindowOp>"];
@@ -2456,6 +2468,18 @@ def ApplyBroadcastInDimSliceToBatchPatterns : EnzymeHLOPatternOp<
     "broadcastindim_slice_to_batch"> {
   let patterns = ["SliceToBatch<stablehlo::BroadcastInDimOp>"];
 }
+def ApplyConcatenateSliceToBatchPatterns : EnzymeHLOPatternOp<
+    "concatenate_slice_to_batch"> {
+  let patterns = ["SliceToBatch<stablehlo::ConcatenateOp>"];
+}
+def ApplyGetDimensionSizeSliceToBatchPatterns : EnzymeHLOPatternOp<
+    "get_dimension_size_slice_to_batch"> {
+  let patterns = ["SliceToBatch<stablehlo::GetDimensionSizeOp>"];
+}
+def ApplyReverseSliceToBatchPatterns : EnzymeHLOPatternOp<
+    "reverse_slice_to_batch"> {
+  let patterns = ["SliceToBatch<stablehlo::ReverseOp>"];
+}
 def ApplyReduceWindowSliceToBatchPatterns : EnzymeHLOPatternOp<
     "reducewindow_slice_to_batch"> {
   let patterns = ["SliceToBatch<stablehlo::ReduceWindowOp>"];
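
The string argument of each definition is the mnemonic under which the pattern is exposed. A hedged usage sketch (the generate-td driver and its flags below follow how existing EnzymeHLOPatternOp mnemonics are exercised elsewhere in the repository, and are an assumption here, not part of this commit):

// RUN: enzymexlamlir-opt %s --enzyme-hlo-generate-td="patterns=reverse_slice_to_batch;concatenate_slice_to_batch" --transform-interpreter --enzyme-hlo-remove-transform

The mnemonics added by this commit are concat_insert_dim_concatenate, concat_insert_dim_get_dimension_size, concat_insert_dim_reverse, concatenate_slice_to_batch, get_dimension_size_slice_to_batch, and reverse_slice_to_batch.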
Lines changed: 24 additions & 0 deletions

@@ -0,0 +1,24 @@
+// RUN: enzymexlamlir-opt --auto-batching --enzyme-hlo-opt %s | FileCheck %s
+
+module @reactant_loop1 attributes {mhlo.num_partitions = 1 : i64, mhlo.num_replicas = 1 : i64} {
+  func.func @main(%arg0: tensor<4x2x3xf32> {enzymexla.memory_effects = []}) -> tensor<4x2x3xf32> attributes {enzymexla.memory_effects = []} {
+    %0 = stablehlo.transpose %arg0, dims = [2, 1, 0] : (tensor<4x2x3xf32>) -> tensor<3x2x4xf32>
+    %1 = stablehlo.slice %0 [0:3, 0:1, 0:4] : (tensor<3x2x4xf32>) -> tensor<3x1x4xf32>
+    %2 = stablehlo.reshape %1 : (tensor<3x1x4xf32>) -> tensor<3x4xf32>
+    %3 = stablehlo.reverse %2, dims = [0, 1] : tensor<3x4xf32>
+    %4 = stablehlo.broadcast_in_dim %3, dims = [2, 0] : (tensor<3x4xf32>) -> tensor<4x1x3xf32>
+    %5 = stablehlo.slice %0 [0:3, 1:2, 0:4] : (tensor<3x2x4xf32>) -> tensor<3x1x4xf32>
+    %6 = stablehlo.reshape %5 : (tensor<3x1x4xf32>) -> tensor<3x4xf32>
+    %7 = stablehlo.reverse %6, dims = [0, 1] : tensor<3x4xf32>
+    %8 = stablehlo.broadcast_in_dim %7, dims = [2, 0] : (tensor<3x4xf32>) -> tensor<4x1x3xf32>
+    %9 = stablehlo.concatenate %4, %8, dim = 1 : (tensor<4x1x3xf32>, tensor<4x1x3xf32>) -> tensor<4x2x3xf32>
+    return %9 : tensor<4x2x3xf32>
+  }
+}
+
+// CHECK: func.func @main(%arg0: tensor<4x2x3xf32> {enzymexla.memory_effects = []}) -> tensor<4x2x3xf32> attributes {enzymexla.memory_effects = []} {
+// CHECK-NEXT:    %0 = stablehlo.transpose %arg0, dims = [1, 2, 0] : (tensor<4x2x3xf32>) -> tensor<2x3x4xf32>
+// CHECK-NEXT:    %1 = stablehlo.reverse %0, dims = [1, 2] : tensor<2x3x4xf32>
+// CHECK-NEXT:    %2 = stablehlo.transpose %1, dims = [2, 0, 1] : (tensor<2x3x4xf32>) -> tensor<4x2x3xf32>
+// CHECK-NEXT:    return %2 : tensor<4x2x3xf32>
+// CHECK-NEXT:  }
Lines changed: 36 additions & 0 deletions

@@ -0,0 +1,36 @@
+// RUN: enzymexlamlir-opt --auto-batching --enzyme-hlo-opt %s | FileCheck %s
+
+module {
+  func.func @main(%arg0: tensor<4x7x3xf32> {enzymexla.memory_effects = []}, %arg1: tensor<4x7x3xf32> {enzymexla.memory_effects = []}) -> tensor<8x7x3xf32> attributes {enzymexla.memory_effects = []} {
+    %c = stablehlo.constant dense<0> : tensor<i32>
+    %c_0 = stablehlo.constant dense<1> : tensor<i32>
+    %c_1 = stablehlo.constant dense<0> : tensor<i64>
+    %c_2 = stablehlo.constant dense<1> : tensor<i64>
+    %cst = stablehlo.constant dense<0.000000e+00> : tensor<8x7x3xf32>
+    %c_3 = stablehlo.constant dense<7> : tensor<i64>
+    %0:2 = stablehlo.while(%iterArg = %c_1, %iterArg_4 = %cst) : tensor<i64>, tensor<8x7x3xf32> attributes {enzyme.disable_mincut}
+    cond {
+      %1 = stablehlo.compare LT, %iterArg, %c_3 : (tensor<i64>, tensor<i64>) -> tensor<i1>
+      stablehlo.return %1 : tensor<i1>
+    } do {
+      %1 = stablehlo.add %c_2, %iterArg : tensor<i64>
+      %2 = stablehlo.convert %1 : (tensor<i64>) -> tensor<i32>
+      %3 = stablehlo.subtract %2, %c_0 : tensor<i32>
+      %4 = stablehlo.dynamic_slice %arg0, %c, %3, %c, sizes = [4, 1, 3] : (tensor<4x7x3xf32>, tensor<i32>, tensor<i32>, tensor<i32>) -> tensor<4x1x3xf32>
+      %5 = stablehlo.dynamic_slice %arg1, %c, %3, %c, sizes = [4, 1, 3] : (tensor<4x7x3xf32>, tensor<i32>, tensor<i32>, tensor<i32>) -> tensor<4x1x3xf32>
+      %6 = stablehlo.concatenate %4, %5, dim = 0 : (tensor<4x1x3xf32>, tensor<4x1x3xf32>) -> tensor<8x1x3xf32>
+      %7 = stablehlo.dynamic_update_slice %iterArg_4, %6, %c, %3, %c : (tensor<8x7x3xf32>, tensor<8x1x3xf32>, tensor<i32>, tensor<i32>, tensor<i32>) -> tensor<8x7x3xf32>
+      stablehlo.return %1, %7 : tensor<i64>, tensor<8x7x3xf32>
+    }
+    return %0#1 : tensor<8x7x3xf32>
+  }
+}
+
+// CHECK: func.func @main(%arg0: tensor<4x7x3xf32> {enzymexla.memory_effects = []}, %arg1: tensor<4x7x3xf32> {enzymexla.memory_effects = []}) -> tensor<8x7x3xf32> attributes {enzymexla.memory_effects = []} {
+// CHECK-NEXT:    %0 = stablehlo.broadcast_in_dim %arg0, dims = [1, 0, 3] : (tensor<4x7x3xf32>) -> tensor<7x4x1x3xf32>
+// CHECK-NEXT:    %1 = stablehlo.broadcast_in_dim %arg1, dims = [1, 0, 3] : (tensor<4x7x3xf32>) -> tensor<7x4x1x3xf32>
+// CHECK-NEXT:    %2 = stablehlo.concatenate %0, %1, dim = 1 : (tensor<7x4x1x3xf32>, tensor<7x4x1x3xf32>) -> tensor<7x8x1x3xf32>
+// CHECK-NEXT:    %3 = stablehlo.reshape %2 : (tensor<7x8x1x3xf32>) -> tensor<7x8x3xf32>
+// CHECK-NEXT:    %4 = stablehlo.transpose %3, dims = [1, 0, 2] : (tensor<7x8x3xf32>) -> tensor<8x7x3xf32>
+// CHECK-NEXT:    return %4 : tensor<8x7x3xf32>
+// CHECK-NEXT:  }
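
Reading the CHECK lines above: each 4x7x3 input gets the loop's iteration dimension inserted in front via broadcast_in_dim (yielding 7x4x1x3, where dim 0 indexes the former loop counter), the per-iteration concatenate along dim 0 becomes a single concatenate along the batched dim 1, and the reshape plus transpose restore the 8x7x3 result the loop previously built one dynamic_update_slice at a time.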
Lines changed: 36 additions & 0 deletions

@@ -0,0 +1,36 @@
+// RUN: enzymexlamlir-opt --auto-batching --enzyme-hlo-opt %s | FileCheck %s
+
+module {
+  func.func @main(%arg0: tensor<4x10x3xf32>) -> tensor<4x10x3xf32> {
+    %c = stablehlo.constant dense<0> : tensor<i32>
+    %cst = stablehlo.constant dense<0.000000e+00> : tensor<4x10x3xf32>
+    %c_0 = stablehlo.constant dense<1> : tensor<i32>
+    %c_1 = stablehlo.constant dense<0> : tensor<i64>
+    %c_2 = stablehlo.constant dense<10> : tensor<i64>
+    %c_3 = stablehlo.constant dense<1> : tensor<i64>
+    %0 = stablehlo.transpose %arg0, dims = [2, 1, 0] : (tensor<4x10x3xf32>) -> tensor<3x10x4xf32>
+    %1:2 = stablehlo.while(%iterArg = %c_1, %iterArg_4 = %cst) : tensor<i64>, tensor<4x10x3xf32> attributes {enzyme.disable_mincut}
+    cond {
+      %2 = stablehlo.compare LT, %iterArg, %c_2 : (tensor<i64>, tensor<i64>) -> tensor<i1>
+      stablehlo.return %2 : tensor<i1>
+    } do {
+      %2 = stablehlo.add %c_3, %iterArg : tensor<i64>
+      %3 = stablehlo.convert %2 : (tensor<i64>) -> tensor<i32>
+      %4 = stablehlo.subtract %3, %c_0 : tensor<i32>
+      %5 = stablehlo.dynamic_slice %0, %c, %4, %c, sizes = [3, 1, 4] : (tensor<3x10x4xf32>, tensor<i32>, tensor<i32>, tensor<i32>) -> tensor<3x1x4xf32>
+      %6 = stablehlo.reshape %5 : (tensor<3x1x4xf32>) -> tensor<3x4xf32>
+      %7 = stablehlo.reverse %6, dims = [0, 1] : tensor<3x4xf32>
+      %8 = stablehlo.broadcast_in_dim %7, dims = [2, 0] : (tensor<3x4xf32>) -> tensor<4x1x3xf32>
+      %9 = stablehlo.dynamic_update_slice %iterArg_4, %8, %c, %4, %c : (tensor<4x10x3xf32>, tensor<4x1x3xf32>, tensor<i32>, tensor<i32>, tensor<i32>) -> tensor<4x10x3xf32>
+      stablehlo.return %2, %9 : tensor<i64>, tensor<4x10x3xf32>
+    }
+    return %1#1 : tensor<4x10x3xf32>
+  }
+}
+
+// CHECK: func.func @main(%arg0: tensor<4x10x3xf32>) -> tensor<4x10x3xf32> {
+// CHECK-NEXT:    %0 = stablehlo.transpose %arg0, dims = [1, 2, 0] : (tensor<4x10x3xf32>) -> tensor<10x3x4xf32>
+// CHECK-NEXT:    %1 = stablehlo.reverse %0, dims = [1, 2] : tensor<10x3x4xf32>
+// CHECK-NEXT:    %2 = stablehlo.transpose %1, dims = [2, 0, 1] : (tensor<10x3x4xf32>) -> tensor<4x10x3xf32>
+// CHECK-NEXT:    return %2 : tensor<4x10x3xf32>
+// CHECK-NEXT:  }
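
The first and third tests describe the same computation (reverse a reshaped slice, broadcast it back, and recombine) in two forms: the first as straight-line code whose slices are concatenated directly, the third with the slices produced by dynamic_slice inside a stablehlo.while and written back through dynamic_update_slice. Both raise to the same transpose/reverse/transpose sequence, which is presumably why the new registrations appear in both the ConcatInsertDimToBatch and SliceToBatch pattern lists.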
