Commit a6af9b3

feat: remove more intermediate reshape operations (#1870)
Parent: 2680f71

24 files changed (+758, -257 lines)

src/enzyme_ad/jax/Passes/EnzymeHLOOpt.cpp

Lines changed: 308 additions & 42 deletions
(Large diff not rendered.)

src/enzyme_ad/jax/TransformOps/TransformOps.td

Lines changed: 8 additions & 0 deletions
@@ -301,6 +301,10 @@ def ReshapeEmptyBroadcastPatterns : EnzymeHLOPatternOp<
     "reshape_empty_broadcast"> {
   let patterns = ["ReshapeEmptyBroadcast"];
 }
+def ReshapeBroadcastPatterns : EnzymeHLOPatternOp<
+    "reshape_broadcast"> {
+  let patterns = ["ReshapeBroadcast"];
+}
 def ApplySliceReshapePadPatterns : EnzymeHLOPatternOp<
     "slice_reshape_pad"> {
   let patterns = ["SliceReshapePad"];
@@ -2561,6 +2565,10 @@ def ApplyReduceMulToDotGeneralPatterns : EnzymeHLOPatternOp<
     "reduce_mul_to_dot_general"> {
   let patterns = ["ReduceMulToDotGeneral"];
 }
+def ApplySplitReduceAddMulToAddDotGeneralPatterns : EnzymeHLOPatternOp<
+    "split_reduce_add_mul_to_add_dot_general"> {
+  let patterns = ["SplitReduceAddMulToAddDotGeneral"];
+}
 
 def ApplyDotGeneralOnlyDiagonalAccessPatterns : EnzymeHLOPatternOp<
     "dot_general_only_diagonal_access"> {

src/enzyme_ad/jax/Utils.cpp

Lines changed: 11 additions & 8 deletions
@@ -1020,14 +1020,16 @@ SmallVector<int64_t> findReshapeInsertionDims(ArrayRef<int64_t> inputShape,
 
 bool isInsertDimOp(stablehlo::ReshapeOp reshapeOp) {
   RankedTensorType inputTy = reshapeOp.getOperand().getType();
-  auto inputShape = inputTy.getShape();
   RankedTensorType outputTy = reshapeOp.getType();
-  auto outputShape = outputTy.getShape();
-  auto insertDims = findReshapeInsertionDims(inputShape, outputShape);
-  if (insertDims.empty()) {
-    return false;
-  }
-  return true;
+  auto insertDims = findReshapeInsertionDims(inputTy, outputTy);
+  return !insertDims.empty();
+}
+
+bool isDeleteDimOp(stablehlo::ReshapeOp reshapeOp) {
+  RankedTensorType inputTy = reshapeOp.getOperand().getType();
+  RankedTensorType outputTy = reshapeOp.getType();
+  auto deleteDims = findReshapeInsertionDims(outputTy, inputTy);
+  return !deleteDims.empty();
 }
 
 void getSingletonInsertionDims(stablehlo::BroadcastInDimOp bcastOp,
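
For intuition, findReshapeInsertionDims answers "at which output positions did this reshape insert size-1 dimensions?", returning an empty result when the reshape is not a pure unit-dim insertion; the new isDeleteDimOp reuses it with the two types swapped, treating a unit-dim-deleting reshape as the inverse case. A stand-alone sketch of that assumed semantics (not the project's implementation, which now also accepts the tensor types directly):

#include <cstdint>
#include <vector>

// Returns the output positions where size-1 dims were inserted, or {} if
// the output is not simply the input shape with unit dims spliced in.
std::vector<int64_t> findInsertionDimsSketch(const std::vector<int64_t> &in,
                                             const std::vector<int64_t> &out) {
  std::vector<int64_t> inserted;
  size_t i = 0;
  for (size_t o = 0; o < out.size(); ++o) {
    if (i < in.size() && in[i] == out[o])
      ++i;                    // dimension carried over unchanged
    else if (out[o] == 1)
      inserted.push_back(o);  // unit dim inserted by the reshape
    else
      return {};              // a genuine reshape, not a pure insertion
  }
  return i == in.size() ? inserted : std::vector<int64_t>{};
}

For example, tensor<10xf32> -> tensor<10x1xf32> yields {1}, while tensor<2x3xf32> -> tensor<6xf32> yields {} because a non-unit dimension changes.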
@@ -2427,7 +2429,6 @@ bool isFusible(stablehlo::TransposeOp transpose, Operation *op) {
     return false;
 }
 
-// TODO: implement more conditions especially for fusions with transpose
 bool isFusible(Operation *op, stablehlo::ReshapeOp reshape) {
   return TypeSwitch<Operation *, bool>(op)
       .Case<stablehlo::ReshapeOp>([](auto prevOp) { return true; })
@@ -2450,6 +2451,8 @@ bool isFusible(Operation *op, stablehlo::ReshapeOp reshape) {
         }
         return false;
       })
+      .Case<stablehlo::ReduceOp>(
+          [&](auto redOp) { return isDeleteDimOp(reshape); })
       .Default([](auto other) { return matchPattern(other, m_Constant()); });
 }
 
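
The new ReduceOp case above makes a reshape fusible with a reduce only when the reshape does nothing but delete size-1 dimensions (isDeleteDimOp); since a reduce already drops its reduced dimensions, absorbing a unit-dim deletion requires no extra data movement. A self-contained illustration of that shape condition (assumed semantics, written independently of the project's helpers):

#include <cassert>
#include <cstdint>
#include <vector>

// True iff `out` is `in` with one or more size-1 dims removed.
static bool onlyDeletesUnitDims(const std::vector<int64_t> &in,
                                const std::vector<int64_t> &out) {
  size_t o = 0;
  for (int64_t d : in) {
    if (o < out.size() && d == out[o])
      ++o;          // kept dimension
    else if (d != 1)
      return false; // a non-unit dim was dropped: not a pure deletion
  }
  return o == out.size() && in.size() > out.size();
}

int main() {
  assert(onlyDeletesUnitDims({1, 100, 100, 1, 1}, {100, 100})); // fusible
  assert(!onlyDeletesUnitDims({200, 100}, {100, 200}));         // not a deletion
  assert(!onlyDeletesUnitDims({100, 100}, {100, 100}));         // identity
  return 0;
}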

src/enzyme_ad/jax/Utils.h

Lines changed: 1 addition & 0 deletions
@@ -905,6 +905,7 @@ SmallVector<int64_t> findReshapeInsertionDims(ArrayRef<int64_t> inputShape,
                                               ArrayRef<int64_t> outputShape);
 
 bool isInsertDimOp(stablehlo::ReshapeOp reshapeOp);
+bool isDeleteDimOp(stablehlo::ReshapeOp reshapeOp);
 
 void getSingletonInsertionDims(stablehlo::BroadcastInDimOp bcastOp,
                                SmallVectorImpl<int64_t> &insertionDims);

src/enzyme_ad/jax/primitives.py

Lines changed: 2 additions & 0 deletions
@@ -202,6 +202,7 @@ def optimization_passes(
         "dot_general_simplify<16>",
         "transpose_simplify<16>",
         "reshape_empty_broadcast<1>",
+        "reshape_broadcast<1>",
         "broadcast_reshape<1>",
         "transpose_dot_reorder<1>",
         "dot_transpose<1>",
@@ -306,6 +307,7 @@ def optimization_passes(
         "trivial_reduce_window_to_reduce_op",
         "case_to_if",
         "reduce_mul_to_dot_general",
+        "split_reduce_add_mul_to_add_dot_general",
         "dot_general_add_distributive_simplify",
         "dot_general_subtract_distributive_simplify",
         "remove_no_ops_from_while_loop",

test/lit_tests/autobatching/elementwise_loop.mlir

Lines changed: 6 additions & 6 deletions
@@ -123,19 +123,19 @@ module {
 // CHECK-NEXT: %c_3 = stablehlo.constant dense<2> : tensor<i32>
 // CHECK-NEXT: %0 = stablehlo.dynamic_slice %arg0, %c_3, sizes = [10] : (tensor<10xf64>, tensor<i32>) -> tensor<10xf64>
 // CHECK-NEXT: %1 = stablehlo.slice %0 [0:10:3] : (tensor<10xf64>) -> tensor<4xf64>
-// CHECK-NEXT: %2 = stablehlo.dynamic_slice %arg0, %c_3, sizes = [10] : (tensor<10xf64>, tensor<i32>) -> tensor<10xf64>
-// CHECK-NEXT: %3 = stablehlo.slice %2 [0:10:3] : (tensor<10xf64>) -> tensor<4xf64>
-// CHECK-NEXT: %4 = stablehlo.sine %3 : tensor<4xf64>
-// CHECK-NEXT: %5 = stablehlo.cosine %1 : tensor<4xf64>
+// CHECK-NEXT: %2 = stablehlo.cosine %1 : tensor<4xf64>
+// CHECK-NEXT: %3 = stablehlo.dynamic_slice %arg0, %c_3, sizes = [10] : (tensor<10xf64>, tensor<i32>) -> tensor<10xf64>
+// CHECK-NEXT: %4 = stablehlo.slice %3 [0:10:3] : (tensor<10xf64>) -> tensor<4xf64>
+// CHECK-NEXT: %5 = stablehlo.sine %4 : tensor<4xf64>
 // CHECK-NEXT: %6:2 = stablehlo.while(%iterArg = %c_0, %iterArg_4 = %cst) : tensor<i64>, tensor<10xf64>
 // CHECK-NEXT: cond {
 // CHECK-NEXT: %7 = stablehlo.compare LT, %iterArg, %c_1 : (tensor<i64>, tensor<i64>) -> tensor<i1>
 // CHECK-NEXT: stablehlo.return %7 : tensor<i1>
 // CHECK-NEXT: } do {
 // CHECK-NEXT: %7 = stablehlo.add %c_2, %iterArg {enzymexla.bounds = {{.*}}} : tensor<i64>
 // CHECK-NEXT: %8 = stablehlo.divide %iterArg, %c_2 {enzymexla.bounds = {{.*}}} : tensor<i64>
-// CHECK-NEXT: %9 = stablehlo.dynamic_slice %4, %8, sizes = [1] : (tensor<4xf64>, tensor<i64>) -> tensor<1xf64>
-// CHECK-NEXT: %10 = stablehlo.dynamic_slice %5, %8, sizes = [1] : (tensor<4xf64>, tensor<i64>) -> tensor<1xf64>
+// CHECK-NEXT: %9 = stablehlo.dynamic_slice %5, %8, sizes = [1] : (tensor<4xf64>, tensor<i64>) -> tensor<1xf64>
+// CHECK-NEXT: %10 = stablehlo.dynamic_slice %2, %8, sizes = [1] : (tensor<4xf64>, tensor<i64>) -> tensor<1xf64>
 // CHECK-NEXT: %11 = stablehlo.subtract %10, %9 : tensor<1xf64>
 // CHECK-NEXT: %12 = stablehlo.convert %7 {enzymexla.bounds = {{.*}}} : (tensor<i64>) -> tensor<i32>
 // CHECK-NEXT: %13 = stablehlo.subtract %12, %c {enzymexla.bounds = {{.*}}} : tensor<i32>

test/lit_tests/autobatching/elementwise_loop_affine.mlir

Lines changed: 14 additions & 18 deletions
@@ -35,16 +35,14 @@ func.func @main1(%arg0: tensor<25xf32>) -> tensor<13xf32> {
 
 // CHECK: func.func @main1(%arg0: tensor<25xf32>) -> tensor<13xf32> {
 // CHECK-NEXT: %cst = stablehlo.constant dense<0.000000e+00> : tensor<f32>
-// CHECK-NEXT: %cst_0 = stablehlo.constant dense<1.000000e+00> : tensor<10x1xf32>
-// CHECK-NEXT: %cst_1 = stablehlo.constant dense<3.000000e+00> : tensor<10x1xf32>
+// CHECK-NEXT: %cst_0 = stablehlo.constant dense<1.000000e+00> : tensor<10xf32>
+// CHECK-NEXT: %cst_1 = stablehlo.constant dense<3.000000e+00> : tensor<10xf32>
 // CHECK-NEXT: %0 = stablehlo.slice %arg0 [6:25:2] : (tensor<25xf32>) -> tensor<10xf32>
-// CHECK-NEXT: %1 = stablehlo.reshape %0 : (tensor<10xf32>) -> tensor<10x1xf32>
-// CHECK-NEXT: %2 = stablehlo.multiply %1, %cst_1 : tensor<10x1xf32>
-// CHECK-NEXT: %3 = stablehlo.subtract %2, %cst_0 : tensor<10x1xf32>
-// CHECK-NEXT: %4 = stablehlo.sine %3 : tensor<10x1xf32>
-// CHECK-NEXT: %5 = stablehlo.reshape %4 : (tensor<10x1xf32>) -> tensor<10xf32>
-// CHECK-NEXT: %6 = stablehlo.pad %5, %cst, low = [2], high = [1], interior = [0] : (tensor<10xf32>, tensor<f32>) -> tensor<13xf32>
-// CHECK-NEXT: return %6 : tensor<13xf32>
+// CHECK-NEXT: %1 = stablehlo.multiply %0, %cst_1 : tensor<10xf32>
+// CHECK-NEXT: %2 = stablehlo.subtract %1, %cst_0 : tensor<10xf32>
+// CHECK-NEXT: %3 = stablehlo.sine %2 : tensor<10xf32>
+// CHECK-NEXT: %4 = stablehlo.pad %3, %cst, low = [2], high = [1], interior = [0] : (tensor<10xf32>, tensor<f32>) -> tensor<13xf32>
+// CHECK-NEXT: return %4 : tensor<13xf32>
 // CHECK-NEXT: }
 
 func.func @main2(%arg0: tensor<25xf32>) -> tensor<13xf32> {
@@ -82,14 +80,12 @@ func.func @main2(%arg0: tensor<25xf32>) -> tensor<13xf32> {
 
 // CHECK: func.func @main2(%arg0: tensor<25xf32>) -> tensor<13xf32> {
 // CHECK-NEXT: %cst = stablehlo.constant dense<0.000000e+00> : tensor<f32>
-// CHECK-NEXT: %cst_0 = stablehlo.constant dense<1.000000e+00> : tensor<10x1xf32>
-// CHECK-NEXT: %cst_1 = stablehlo.constant dense<3.000000e+00> : tensor<10x1xf32>
+// CHECK-NEXT: %cst_0 = stablehlo.constant dense<1.000000e+00> : tensor<10xf32>
+// CHECK-NEXT: %cst_1 = stablehlo.constant dense<3.000000e+00> : tensor<10xf32>
 // CHECK-NEXT: %0 = stablehlo.slice %arg0 [6:25:2] : (tensor<25xf32>) -> tensor<10xf32>
-// CHECK-NEXT: %1 = stablehlo.reshape %0 : (tensor<10xf32>) -> tensor<10x1xf32>
-// CHECK-NEXT: %2 = stablehlo.multiply %1, %cst_1 : tensor<10x1xf32>
-// CHECK-NEXT: %3 = stablehlo.subtract %2, %cst_0 : tensor<10x1xf32>
-// CHECK-NEXT: %4 = stablehlo.sine %3 : tensor<10x1xf32>
-// CHECK-NEXT: %5 = stablehlo.reshape %4 : (tensor<10x1xf32>) -> tensor<10xf32>
-// CHECK-NEXT: %6 = stablehlo.pad %5, %cst, low = [2], high = [1], interior = [0] : (tensor<10xf32>, tensor<f32>) -> tensor<13xf32>
-// CHECK-NEXT: return %6 : tensor<13xf32>
+// CHECK-NEXT: %1 = stablehlo.multiply %0, %cst_1 : tensor<10xf32>
+// CHECK-NEXT: %2 = stablehlo.subtract %1, %cst_0 : tensor<10xf32>
+// CHECK-NEXT: %3 = stablehlo.sine %2 : tensor<10xf32>
+// CHECK-NEXT: %4 = stablehlo.pad %3, %cst, low = [2], high = [1], interior = [0] : (tensor<10xf32>, tensor<f32>) -> tensor<13xf32>
+// CHECK-NEXT: return %4 : tensor<13xf32>
 // CHECK-NEXT: }

test/lit_tests/autobatching/indirect_iota_indexing.mlir

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -32,9 +32,9 @@ module {
3232
}
3333

3434
// CHECK: func.func @main(%arg0: tensor<10xf64>, %arg1: tensor<10xf64>) -> tensor<10xf32> {
35-
// CHECK-NEXT: %0 = stablehlo.add %arg0, %arg1 : tensor<10xf64>
36-
// CHECK-NEXT: %1 = stablehlo.maximum %arg0, %arg1 : tensor<10xf64>
37-
// CHECK-NEXT: %2 = stablehlo.add %0, %1 : tensor<10xf64>
35+
// CHECK-NEXT: %0 = stablehlo.maximum %arg0, %arg1 : tensor<10xf64>
36+
// CHECK-NEXT: %1 = stablehlo.add %arg0, %arg1 : tensor<10xf64>
37+
// CHECK-NEXT: %2 = stablehlo.add %1, %0 : tensor<10xf64>
3838
// CHECK-NEXT: %3 = stablehlo.convert %2 : (tensor<10xf64>) -> tensor<10xf32>
3939
// CHECK-NEXT: return %3 : tensor<10xf32>
4040
// CHECK-NEXT: }

test/lit_tests/autobatching/indirect_iota_indexing2.mlir

Lines changed: 3 additions & 3 deletions
@@ -33,8 +33,8 @@ module {
 // CHECK: func.func @main(%arg0: tensor<10xf64>, %arg1: tensor<10xf64>) -> tensor<6xf64> {
 // CHECK-NEXT: %0 = stablehlo.slice %arg0 [2:8] : (tensor<10xf64>) -> tensor<6xf64>
 // CHECK-NEXT: %1 = stablehlo.slice %arg1 [2:8] : (tensor<10xf64>) -> tensor<6xf64>
-// CHECK-NEXT: %2 = stablehlo.add %0, %1 : tensor<6xf64>
-// CHECK-NEXT: %3 = stablehlo.maximum %0, %1 : tensor<6xf64>
-// CHECK-NEXT: %4 = stablehlo.add %2, %3 : tensor<6xf64>
+// CHECK-NEXT: %2 = stablehlo.maximum %0, %1 : tensor<6xf64>
+// CHECK-NEXT: %3 = stablehlo.add %0, %1 : tensor<6xf64>
+// CHECK-NEXT: %4 = stablehlo.add %3, %2 : tensor<6xf64>
 // CHECK-NEXT: return %4 : tensor<6xf64>
 // CHECK-NEXT: }

test/lit_tests/autobatching/nbody.mlir

Lines changed: 39 additions & 39 deletions
@@ -73,48 +73,48 @@ module {
 // CHECK-NEXT: func.func @main(%arg0: tensor<100x3xf32>, %arg1: tensor<100xf32>) -> tensor<100x100x3xf32> {
 // CHECK-NEXT: %cst = stablehlo.constant dense<1.000000e+00> : tensor<100x100xf32>
 // CHECK-NEXT: %c = stablehlo.constant dense<1> : tensor<100x100xi64>
-// CHECK-NEXT: %0 = stablehlo.slice %arg0 [0:100, 2:3] : (tensor<100x3xf32>) -> tensor<100x1xf32>
-// CHECK-NEXT: %1 = stablehlo.broadcast_in_dim %0, dims = [0, 2] : (tensor<100x1xf32>) -> tensor<100x100x1x1xf32>
-// CHECK-NEXT: %2 = stablehlo.slice %arg0 [0:100, 0:2] : (tensor<100x3xf32>) -> tensor<100x2xf32>
-// CHECK-NEXT: %3 = stablehlo.broadcast_in_dim %2, dims = [1, 0] : (tensor<100x2xf32>) -> tensor<2x100x100x1x1xf32>
-// CHECK-NEXT: %4 = stablehlo.iota dim = 1 : tensor<100x100xi64>
-// CHECK-NEXT: %5 = stablehlo.add %c, %4 : tensor<100x100xi64>
-// CHECK-NEXT: %6 = stablehlo.broadcast_in_dim %2, dims = [2, 0] : (tensor<100x2xf32>) -> tensor<2x100x100x1x1xf32>
-// CHECK-NEXT: %7 = stablehlo.slice %6 [0:1, 0:100, 0:100, 0:1, 0:1] : (tensor<2x100x100x1x1xf32>) -> tensor<1x100x100x1x1xf32>
-// CHECK-NEXT: %8 = stablehlo.reshape %7 : (tensor<1x100x100x1x1xf32>) -> tensor<100x100x1x1xf32>
-// CHECK-NEXT: %9 = stablehlo.slice %6 [1:2, 0:100, 0:100, 0:1, 0:1] : (tensor<2x100x100x1x1xf32>) -> tensor<1x100x100x1x1xf32>
-// CHECK-NEXT: %10 = stablehlo.reshape %9 : (tensor<1x100x100x1x1xf32>) -> tensor<100x100x1x1xf32>
-// CHECK-NEXT: %11 = stablehlo.concatenate %8, %10, dim = 0 : (tensor<100x100x1x1xf32>, tensor<100x100x1x1xf32>) -> tensor<200x100x1x1xf32>
-// CHECK-NEXT: %12 = stablehlo.reshape %11 : (tensor<200x100x1x1xf32>) -> tensor<2x100x100x1x1xf32>
-// CHECK-NEXT: %13 = stablehlo.subtract %3, %12 : tensor<2x100x100x1x1xf32>
-// CHECK-NEXT: %14 = stablehlo.slice %13 [0:1, 0:100, 0:100, 0:1, 0:1] : (tensor<2x100x100x1x1xf32>) -> tensor<1x100x100x1x1xf32>
-// CHECK-NEXT: %15 = stablehlo.slice %13 [1:2, 0:100, 0:100, 0:1, 0:1] : (tensor<2x100x100x1x1xf32>) -> tensor<1x100x100x1x1xf32>
-// CHECK-NEXT: %16 = stablehlo.iota dim = 0 : tensor<100x100xi64>
-// CHECK-NEXT: %17 = stablehlo.add %c, %16 : tensor<100x100xi64>
-// CHECK-NEXT: %18 = stablehlo.compare EQ, %5, %17 : (tensor<100x100xi64>, tensor<100x100xi64>) -> tensor<100x100xi1>
-// CHECK-NEXT: %19 = stablehlo.broadcast_in_dim %0, dims = [1, 2] : (tensor<100x1xf32>) -> tensor<100x100x1x1xf32>
-// CHECK-NEXT: %20 = stablehlo.subtract %1, %19 : tensor<100x100x1x1xf32>
-// CHECK-NEXT: %21 = stablehlo.reshape %20 : (tensor<100x100x1x1xf32>) -> tensor<100x100x1x1x1xf32>
-// CHECK-NEXT: %22 = stablehlo.transpose %13, dims = [1, 2, 3, 4, 0] : (tensor<2x100x100x1x1xf32>) -> tensor<100x100x1x1x2xf32>
-// CHECK-NEXT: %23 = stablehlo.concatenate %22, %21, dim = 4 : (tensor<100x100x1x1x2xf32>, tensor<100x100x1x1x1xf32>) -> tensor<100x100x1x1x3xf32>
-// CHECK-NEXT: %24 = stablehlo.multiply %20, %20 : tensor<100x100x1x1xf32>
-// CHECK-NEXT: %25 = stablehlo.reshape %14 : (tensor<1x100x100x1x1xf32>) -> tensor<100x100xf32>
-// CHECK-NEXT: %26 = stablehlo.multiply %25, %25 : tensor<100x100xf32>
-// CHECK-NEXT: %27 = stablehlo.reshape %15 : (tensor<1x100x100x1x1xf32>) -> tensor<100x100xf32>
-// CHECK-NEXT: %28 = stablehlo.multiply %27, %27 : tensor<100x100xf32>
-// CHECK-NEXT: %29 = stablehlo.add %26, %28 : tensor<100x100xf32>
-// CHECK-NEXT: %30 = stablehlo.reshape %24 : (tensor<100x100x1x1xf32>) -> tensor<100x100xf32>
-// CHECK-NEXT: %31 = stablehlo.add %29, %30 : tensor<100x100xf32>
-// CHECK-NEXT: %32 = stablehlo.divide %cst, %31 : tensor<100x100xf32>
-// CHECK-NEXT: %33 = stablehlo.select %18, %25, %32 : tensor<100x100xi1>, tensor<100x100xf32>
-// CHECK-NEXT: %34 = stablehlo.broadcast_in_dim %arg1, dims = [0] : (tensor<100xf32>) -> tensor<100x100xf32>
-// CHECK-NEXT: %35 = stablehlo.broadcast_in_dim %arg1, dims = [1] : (tensor<100xf32>) -> tensor<100x100xf32>
-// CHECK-NEXT: %36 = stablehlo.multiply %34, %35 : tensor<100x100xf32>
-// CHECK-NEXT: %37 = stablehlo.multiply %36, %33 : tensor<100x100xf32>
+// CHECK-NEXT: %0 = stablehlo.broadcast_in_dim %arg1, dims = [0] : (tensor<100xf32>) -> tensor<100x100xf32>
+// CHECK-NEXT: %1 = stablehlo.slice %arg0 [0:100, 2:3] : (tensor<100x3xf32>) -> tensor<100x1xf32>
+// CHECK-NEXT: %2 = stablehlo.broadcast_in_dim %1, dims = [0, 2] : (tensor<100x1xf32>) -> tensor<100x100x1x1xf32>
+// CHECK-NEXT: %3 = stablehlo.slice %arg0 [0:100, 0:2] : (tensor<100x3xf32>) -> tensor<100x2xf32>
+// CHECK-NEXT: %4 = stablehlo.broadcast_in_dim %3, dims = [1, 0] : (tensor<100x2xf32>) -> tensor<2x100x100x1x1xf32>
+// CHECK-NEXT: %5 = stablehlo.iota dim = 1 : tensor<100x100xi64>
+// CHECK-NEXT: %6 = stablehlo.add %c, %5 : tensor<100x100xi64>
+// CHECK-NEXT: %7 = stablehlo.broadcast_in_dim %3, dims = [2, 0] : (tensor<100x2xf32>) -> tensor<2x100x100x1x1xf32>
+// CHECK-NEXT: %8 = stablehlo.slice %7 [0:1, 0:100, 0:100, 0:1, 0:1] : (tensor<2x100x100x1x1xf32>) -> tensor<1x100x100x1x1xf32>
+// CHECK-NEXT: %9 = stablehlo.reshape %8 : (tensor<1x100x100x1x1xf32>) -> tensor<100x100x1x1xf32>
+// CHECK-NEXT: %10 = stablehlo.slice %7 [1:2, 0:100, 0:100, 0:1, 0:1] : (tensor<2x100x100x1x1xf32>) -> tensor<1x100x100x1x1xf32>
+// CHECK-NEXT: %11 = stablehlo.reshape %10 : (tensor<1x100x100x1x1xf32>) -> tensor<100x100x1x1xf32>
+// CHECK-NEXT: %12 = stablehlo.concatenate %9, %11, dim = 0 : (tensor<100x100x1x1xf32>, tensor<100x100x1x1xf32>) -> tensor<200x100x1x1xf32>
+// CHECK-NEXT: %13 = stablehlo.reshape %12 : (tensor<200x100x1x1xf32>) -> tensor<2x100x100x1x1xf32>
+// CHECK-NEXT: %14 = stablehlo.subtract %4, %13 : tensor<2x100x100x1x1xf32>
+// CHECK-NEXT: %15 = stablehlo.slice %14 [0:1, 0:100, 0:100, 0:1, 0:1] : (tensor<2x100x100x1x1xf32>) -> tensor<1x100x100x1x1xf32>
+// CHECK-NEXT: %16 = stablehlo.slice %14 [1:2, 0:100, 0:100, 0:1, 0:1] : (tensor<2x100x100x1x1xf32>) -> tensor<1x100x100x1x1xf32>
+// CHECK-NEXT: %17 = stablehlo.reshape %16 : (tensor<1x100x100x1x1xf32>) -> tensor<100x100xf32>
+// CHECK-NEXT: %18 = stablehlo.iota dim = 0 : tensor<100x100xi64>
+// CHECK-NEXT: %19 = stablehlo.add %c, %18 : tensor<100x100xi64>
+// CHECK-NEXT: %20 = stablehlo.compare EQ, %6, %19 : (tensor<100x100xi64>, tensor<100x100xi64>) -> tensor<100x100xi1>
+// CHECK-NEXT: %21 = stablehlo.broadcast_in_dim %1, dims = [1, 2] : (tensor<100x1xf32>) -> tensor<100x100x1x1xf32>
+// CHECK-NEXT: %22 = stablehlo.subtract %2, %21 : tensor<100x100x1x1xf32>
+// CHECK-NEXT: %23 = stablehlo.broadcast_in_dim %arg1, dims = [1] : (tensor<100xf32>) -> tensor<100x100xf32>
+// CHECK-NEXT: %24 = stablehlo.multiply %0, %23 : tensor<100x100xf32>
+// CHECK-NEXT: %25 = stablehlo.reshape %22 : (tensor<100x100x1x1xf32>) -> tensor<100x100x1x1x1xf32>
+// CHECK-NEXT: %26 = stablehlo.transpose %14, dims = [1, 2, 3, 4, 0] : (tensor<2x100x100x1x1xf32>) -> tensor<100x100x1x1x2xf32>
+// CHECK-NEXT: %27 = stablehlo.concatenate %26, %25, dim = 4 : (tensor<100x100x1x1x2xf32>, tensor<100x100x1x1x1xf32>) -> tensor<100x100x1x1x3xf32>
+// CHECK-NEXT: %28 = stablehlo.multiply %22, %22 : tensor<100x100x1x1xf32>
+// CHECK-NEXT: %29 = stablehlo.multiply %17, %17 : tensor<100x100xf32>
+// CHECK-NEXT: %30 = stablehlo.reshape %15 : (tensor<1x100x100x1x1xf32>) -> tensor<100x100xf32>
+// CHECK-NEXT: %31 = stablehlo.multiply %30, %30 : tensor<100x100xf32>
+// CHECK-NEXT: %32 = stablehlo.add %31, %29 : tensor<100x100xf32>
+// CHECK-NEXT: %33 = stablehlo.reshape %28 : (tensor<100x100x1x1xf32>) -> tensor<100x100xf32>
+// CHECK-NEXT: %34 = stablehlo.add %32, %33 : tensor<100x100xf32>
+// CHECK-NEXT: %35 = stablehlo.divide %cst, %34 : tensor<100x100xf32>
+// CHECK-NEXT: %36 = stablehlo.select %20, %30, %35 : tensor<100x100xi1>, tensor<100x100xf32>
+// CHECK-NEXT: %37 = stablehlo.multiply %24, %36 : tensor<100x100xf32>
 // CHECK-NEXT: %38 = stablehlo.broadcast_in_dim %37, dims = [0, 1] : (tensor<100x100xf32>) -> tensor<100x100x1x1x2xf32>
 // CHECK-NEXT: %39 = stablehlo.reshape %37 : (tensor<100x100xf32>) -> tensor<100x100x1x1x1xf32>
 // CHECK-NEXT: %40 = stablehlo.concatenate %38, %39, dim = 4 : (tensor<100x100x1x1x2xf32>, tensor<100x100x1x1x1xf32>) -> tensor<100x100x1x1x3xf32>
-// CHECK-NEXT: %41 = stablehlo.multiply %40, %23 : tensor<100x100x1x1x3xf32>
+// CHECK-NEXT: %41 = stablehlo.multiply %40, %27 : tensor<100x100x1x1x3xf32>
 // CHECK-NEXT: %42 = stablehlo.reshape %41 : (tensor<100x100x1x1x3xf32>) -> tensor<100x100x3xf32>
 // CHECK-NEXT: %43 = stablehlo.transpose %42, dims = [1, 0, 2] : (tensor<100x100x3xf32>) -> tensor<100x100x3xf32>
 // CHECK-NEXT: return %43 : tensor<100x100x3xf32>
