feat: iota tensor detection + indirect iota indexing simplification (#1542)

avik-pal · web-flow · commit b3d6fde1c3d1 · 2025-11-02T11:06:12.000-05:00
* feat: iota tensor detection

* chore: run fmt

* feat: rewrite iota ops

* test: indirect indexing

* feat: support more iota like ops for scatter detection
diff --git a/src/enzyme_ad/jax/Passes/AutoBatching.cpp b/src/enzyme_ad/jax/Passes/AutoBatching.cpp
@@ -10,6 +10,7 @@
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
 #include "src/enzyme_ad/jax/Implementations/WhileLoopInfo.h"
 #include "src/enzyme_ad/jax/Passes/Passes.h"
+#include "src/enzyme_ad/jax/Passes/StructuredTensors.h"
 #include "src/enzyme_ad/jax/Utils.h"
 #include "stablehlo/dialect/StablehloOps.h"
 #include "llvm/ADT/DenseMap.h"
@@ -796,6 +797,44 @@ LogicalResult GreedyWhileLoopBatchFission::matchAndRewriteImpl(
   if (candidateSlices.empty())
     return rewriter.notifyMatchFailure(whileOp, "no candidate slices found");
 
+  bool anyOpRewritten = false;
+
+  // iota [idx] where iota starts at 0 and iter var also starts at 0
+  // replace this with idx
+  // If we do a successful rewrite here, we remove the DynamicSliceInfo from
+  // the candidateSlices vector (a later invocation will handle the rest)
+  SmallVector<DynamicSliceInfo> retainedSlices;
+  for (auto [i, slice] : llvm::enumerate(candidateSlices)) {
+    auto iotaDetection = detectIotaLikeTensor(slice.sliceOp.getOperand());
+    if (iotaDetection &&
+        slice.inductionVarDimension == iotaDetection.value().dimension &&
+        iotaDetection.value().start == 0 &&
+        iotaDetection.value().limit == limit) {
+      anyOpRewritten = true;
+
+      OpBuilder::InsertionGuard guard(rewriter);
+      rewriter.setInsertionPoint(slice.sliceOp);
+      Value newOperand = info.getInductionVariable();
+      auto sliceType =
+          cast<RankedTensorType>(slice.sliceOp.getResult().getType());
+      auto outElemType = sliceType.getElementType();
+      if (cast<TensorType>(newOperand.getType()).getElementType() !=
+          outElemType) {
+        newOperand = rewriter
+                         .create<stablehlo::ConvertOp>(
+                             slice.sliceOp.getLoc(),
+                             RankedTensorType::get({}, outElemType), newOperand)
+                         .getResult();
+      }
+      rewriter.replaceOpWithNewOp<stablehlo::BroadcastInDimOp>(
+          slice.sliceOp, sliceType, newOperand,
+          rewriter.getDenseI64ArrayAttr({}));
+    } else {
+      retainedSlices.push_back(slice);
+    }
+  }
+  candidateSlices = std::move(retainedSlices);
+
   // Create a map of user operations to their corresponding dynamic slices
   DenseMap<Operation *, SmallVector<DynamicSliceInfo>> userOpToSlicesMap;
   for (auto ds : candidateSlices) {
@@ -819,9 +858,8 @@ LogicalResult GreedyWhileLoopBatchFission::matchAndRewriteImpl(
   }
 
   if (userOpToSlicesMap.empty())
-    return failure();
+    return anyOpRewritten ? success() : failure();
 
-  bool wasLifted = false;
   for (auto &[op, slices] : userOpToSlicesMap) {
     SmallVector<bool> allIntermediateReshapes(slices.size());
     for (auto [i, slice] : llvm::enumerate(slices))
@@ -839,17 +877,17 @@ LogicalResult GreedyWhileLoopBatchFission::matchAndRewriteImpl(
         op->hasTrait<OpTrait::Elementwise>()) {
       if (liftOperationByBatching(rewriter, whileOp, slices, op, info,
                                   intermediateReshape)) {
-        wasLifted = true;
+        anyOpRewritten = true;
       }
     } else if (!intermediateReshape && isa<stablehlo::ReshapeOp>(op)) {
       if (liftSpecialReshapeOp(rewriter, whileOp, slices,
                                dyn_cast<stablehlo::ReshapeOp>(op), info)) {
-        wasLifted = true;
+        anyOpRewritten = true;
       }
     }
   }
 
-  return wasLifted ? success() : failure();
+  return anyOpRewritten ? success() : failure();
 };
 
 bool GreedyWhileLoopBatchFission::liftSpecialReshapeOp(
diff --git a/src/enzyme_ad/jax/Passes/StructuredTensors.cpp b/src/enzyme_ad/jax/Passes/StructuredTensors.cpp
@@ -75,8 +75,10 @@ absl::Status detectDiagonalTensor(stablehlo::ScatterOp scatterOp,
     return absl::InvalidArgumentError(
         "Scatter dimension numbers are not valid for a diagonal tensor.");
 
-  if (auto iotaOp = dyn_cast<stablehlo::IotaOp>(indices.getDefiningOp())) {
-    if (iotaOp.getIotaDimension() == 0) {
+  auto isIotaLikeTensor = detectIotaLikeTensor(indices);
+  if (isIotaLikeTensor) {
+    auto iotaLikeTensor = isIotaLikeTensor.value();
+    if (iotaLikeTensor.dimension == 0 && iotaLikeTensor.start == 0) {
       *outUpdates = updates;
       return absl::OkStatus();
     }
@@ -85,5 +87,174 @@ absl::Status detectDiagonalTensor(stablehlo::ScatterOp scatterOp,
   return absl::InvalidArgumentError("Not a diagonal tensor.");
 }
 
+/// Detects whether `tensor` holds consecutive integers along one dimension.
+///
+/// Walks the defining ops through transpose, integer-to-integer convert and
+/// add/subtract-by-constant until an iota or a non-splat dense constant is
+/// reached, then replays the walked ops so that the result describes `tensor`
+/// itself. Returns std::nullopt when no such chain is found.
+///
+/// In the result, `start` is the value at coordinate 0 along `dimension`,
+/// `limit` is `start` plus the extent of the base iota/constant along its
+/// iota dimension, and `tensorType` is the type of `tensor`.
+std::optional<IotaLikeTensor> detectIotaLikeTensor(mlir::Value tensor) {
+  if (!tensor)
+    return std::nullopt;
+
+  // Unranked or non-integer tensors are never reported as iota-like.
+  auto tensorType = dyn_cast<mlir::RankedTensorType>(tensor.getType());
+  if (!tensorType || !isa<mlir::IntegerType>(tensorType.getElementType()))
+    return std::nullopt;
+
+  struct ChainItem {
+    mlir::Operation *op;
+    int64_t offset; // only populated for AddOp/SubtractOp
+  };
+
+  // Ops visited on the way down: front is the op defining `tensor`, back is
+  // the base op once one has been reached.
+  SmallVector<ChainItem> chain;
+  llvm::DenseSet<mlir::Operation *> visited;
+  mlir::Operation *currentOp = tensor.getDefiningOp();
+
+  while (currentOp && !visited.contains(currentOp)) {
+    visited.insert(currentOp);
+
+    // Base case: an iota or a constant terminates the walk.
+    if (isa<stablehlo::IotaOp, stablehlo::ConstantOp>(currentOp)) {
+      chain.push_back({currentOp, 0});
+      break;
+    }
+
+    // Otherwise step to the producer of the non-constant operand. Any
+    // unsupported intermediate op aborts the detection.
+    Operation *nextOp = nullptr;
+
+    // TODO: we might want to support broadcast_in_dim / insert_dims / drop_dims
+    // as well
+    if (isa<stablehlo::TransposeOp>(currentOp)) {
+      chain.push_back({currentOp, 0});
+      nextOp = currentOp->getOperand(0).getDefiningOp();
+    } else if (auto convertOp = dyn_cast<stablehlo::ConvertOp>(currentOp)) {
+      // Only integer-to-integer conversions are looked through.
+      if (!isa<mlir::IntegerType>(
+              cast<TensorType>(convertOp.getOperand().getType())
+                  .getElementType()))
+        return std::nullopt;
+      chain.push_back({currentOp, 0});
+      nextOp = convertOp.getOperand().getDefiningOp();
+    } else if (auto addOp = dyn_cast<stablehlo::AddOp>(currentOp)) {
+      // Addition commutes, so the constant may sit on either side.
+      APInt offsetVal;
+      if (matchPattern(addOp.getRhs(), m_ConstantInt(&offsetVal))) {
+        chain.push_back({currentOp, offsetVal.getSExtValue()});
+        nextOp = addOp.getLhs().getDefiningOp();
+      } else if (matchPattern(addOp.getLhs(), m_ConstantInt(&offsetVal))) {
+        chain.push_back({currentOp, offsetVal.getSExtValue()});
+        nextOp = addOp.getRhs().getDefiningOp();
+      } else {
+        return std::nullopt;
+      }
+    } else if (auto subOp = dyn_cast<stablehlo::SubtractOp>(currentOp)) {
+      // Only `x - c` is an offset; `c - x` would reverse the sequence.
+      APInt offsetVal;
+      if (!matchPattern(subOp.getRhs(), m_ConstantInt(&offsetVal)))
+        return std::nullopt;
+      chain.push_back({currentOp, -offsetVal.getSExtValue()});
+      nextOp = subOp.getLhs().getDefiningOp();
+    } else { // unsupported op
+      return std::nullopt;
+    }
+
+    currentOp = nextOp;
+  }
+
+  if (chain.empty())
+    return std::nullopt;
+
+  // Describe the base op. If the walk stopped on anything other than an
+  // iota/constant (e.g. it ran into a block argument), detection fails.
+  IotaLikeTensor result;
+  if (auto iotaOp = dyn_cast<stablehlo::IotaOp>(chain.back().op)) {
+    auto iotaType = cast<RankedTensorType>(iotaOp.getResult().getType());
+    auto iotaDim = static_cast<int64_t>(iotaOp.getIotaDimension());
+    result = IotaLikeTensor{0, iotaType.getShape()[iotaDim], iotaDim, iotaType};
+  } else if (auto constantOp =
+                 dyn_cast<stablehlo::ConstantOp>(chain.back().op)) {
+    auto constType = cast<RankedTensorType>(constantOp.getResult().getType());
+    auto shape = constType.getShape();
+    const int64_t rank = constType.getRank();
+    const int64_t numElements = constType.getNumElements();
+
+    // Splat and empty constants have no increasing dimension; attributes
+    // that are not dense elements (e.g. resources) are not inspected.
+    auto denseAttr = dyn_cast<DenseElementsAttr>(constantOp.getValue());
+    if (!denseAttr || denseAttr.isSplat() || numElements == 0)
+      return std::nullopt;
+
+    // Row-major strides, used to recover a coordinate from a linear index.
+    SmallVector<int64_t> strides(rank, 1);
+    for (int64_t i = rank - 2; i >= 0; --i)
+      strides[i] = strides[i + 1] * shape[i + 1];
+
+    // The first element fixes the start; accept the first dimension along
+    // which every element equals `start + coordinate`.
+    auto values = denseAttr.getValues<APInt>();
+    const int64_t first = (*values.begin()).getSExtValue();
+    std::optional<int64_t> iotaDim;
+    for (int64_t dim = 0; dim < rank && !iotaDim; ++dim) {
+      bool matches = true;
+      for (int64_t idx = 0; idx < numElements && matches; ++idx) {
+        const int64_t coord = (idx / strides[dim]) % shape[dim];
+        matches = values[idx].getSExtValue() == first + coord;
+      }
+      if (matches)
+        iotaDim = dim;
+    }
+    if (!iotaDim)
+      return std::nullopt;
+
+    result = IotaLikeTensor{first, first + shape[*iotaDim], *iotaDim,
+                            constType};
+  } else {
+    return std::nullopt;
+  }
+
+  // Replay the walked ops from just above the base up to `tensor`.
+  for (int64_t i = static_cast<int64_t>(chain.size()) - 2; i >= 0; --i) {
+    const ChainItem &item = chain[i];
+
+    if (isa<stablehlo::ConvertOp>(item.op)) {
+      // Treated as value-preserving: nothing to update.
+      // NOTE(review): a narrowing convert could wrap large values; confirm
+      // this is acceptable for callers.
+      continue;
+    }
+    if (auto transposeOp = dyn_cast<stablehlo::TransposeOp>(item.op)) {
+      // Find the result dimension that the iota dimension was moved to.
+      auto permutation = transposeOp.getPermutation();
+      for (int64_t idx = 0, e = permutation.size(); idx < e; ++idx) {
+        if (permutation[idx] == result.dimension) {
+          result.dimension = idx;
+          break;
+        }
+      }
+      continue;
+    }
+    if (isa<stablehlo::AddOp, stablehlo::SubtractOp>(item.op)) {
+      // A constant offset shifts the whole range, so move both bounds;
+      // leaving `limit` behind would make it disagree with `start`.
+      result.start += item.offset;
+      result.limit += item.offset;
+      continue;
+    }
+
+    assert(false && "reached unreachable case...");
+  }
+
+  result.tensorType = tensorType;
+  return result;
+}
+
 } // namespace enzyme
 } // namespace mlir
diff --git a/src/enzyme_ad/jax/Passes/StructuredTensors.h b/src/enzyme_ad/jax/Passes/StructuredTensors.h
@@ -5,6 +5,8 @@
 #include "src/enzyme_ad/jax/Utils.h"
 #include "stablehlo/dialect/StablehloOps.h"
 
+#include <optional>
+
 namespace mlir {
 namespace enzyme {
 
@@ -16,5 +18,14 @@ absl::Status detectConstantSetindexScatterOp(stablehlo::ScatterOp scatterOp,
 absl::Status detectDiagonalTensor(stablehlo::ScatterOp scatterOp,
                                   mlir::Value *outUpdates);
 
+struct IotaLikeTensor {
+  int64_t start;     // value at coordinate 0 along `dimension`
+  int64_t limit;     // for a plain iota: the extent along `dimension`
+  int64_t dimension; // dimension along which the values increase by one
+  mlir::RankedTensorType tensorType; // type of the queried tensor
+};
+
+std::optional<IotaLikeTensor> detectIotaLikeTensor(mlir::Value tensor);
+
 } // namespace enzyme
 } // namespace mlir
diff --git a/test/lit_tests/autobatching/indirect_iota_indexing.mlir b/test/lit_tests/autobatching/indirect_iota_indexing.mlir
@@ -0,0 +1,40 @@
+// RUN: enzymexlamlir-opt --enzyme-hlo-generate-td="patterns=reshape_dynamic_slice(1);while_is_copy_simplify;greedy_while_loop_batch_fission;broadcast_to_reshape;merge_consecutive_reshapes;reshape_licm(0)" --transform-interpreter --enzyme-hlo-remove-transform --inline --enzyme-hlo-opt --enzyme-hlo-generate-td="patterns=reshape_dynamic_slice(1);while_is_copy_simplify;greedy_while_loop_batch_fission;broadcast_to_reshape;merge_consecutive_reshapes;reshape_licm(0);reshape_elementwise(0)" --transform-interpreter --enzyme-hlo-remove-transform %s | FileCheck %s
+
+module {
+  func.func @main(%arg0: tensor<10xf64>, %arg1: tensor<10xf64>) -> tensor<10xf32> {
+    %c = stablehlo.constant dense<1> : tensor<i32>
+    %c_0 = stablehlo.constant dense<1> : tensor<i64>
+    %c_1 = stablehlo.constant dense<10> : tensor<i64>
+    %c_2 = stablehlo.constant dense<0> : tensor<i64>
+    %cst = stablehlo.constant dense<0.000000e+00> : tensor<10xf32>
+    %c_3 = stablehlo.constant dense<[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]> : tensor<10xi32> // iota-like: element k holds k
+    %0:2 = stablehlo.while(%iterArg = %c_2, %iterArg_4 = %cst) : tensor<i64>, tensor<10xf32>
+    cond {
+      %1 = stablehlo.compare  LT, %iterArg, %c_1 : (tensor<i64>, tensor<i64>) -> tensor<i1>
+      stablehlo.return %1 : tensor<i1>
+    } do {
+      %1 = stablehlo.add %iterArg, %c_0 : tensor<i64>
+      %2 = stablehlo.dynamic_slice %c_3, %iterArg, sizes = [1] : (tensor<10xi32>, tensor<i64>) -> tensor<1xi32> // reads %c_3[%iterArg], whose value equals %iterArg
+      %3 = stablehlo.reshape %2 : (tensor<1xi32>) -> tensor<i32>
+      %4 = stablehlo.dynamic_slice %arg0, %3, sizes = [1] : (tensor<10xf64>, tensor<i32>) -> tensor<1xf64> // indirect index into %arg0 through %c_3
+      %5 = stablehlo.dynamic_slice %arg1, %3, sizes = [1] : (tensor<10xf64>, tensor<i32>) -> tensor<1xf64> // indirect index into %arg1 through %c_3
+      %6 = stablehlo.add %4, %5 : tensor<1xf64>
+      %7 = stablehlo.maximum %4, %5 : tensor<1xf64>
+      %8 = stablehlo.add %6, %7 : tensor<1xf64>
+      %9 = stablehlo.convert %8 : (tensor<1xf64>) -> tensor<1xf32>
+      %10 = stablehlo.convert %1 : (tensor<i64>) -> tensor<i32>
+      %11 = stablehlo.subtract %10, %c : tensor<i32> // (%iterArg + 1) - 1: the current iteration index as i32
+      %12 = stablehlo.dynamic_update_slice %iterArg_4, %9, %11 : (tensor<10xf32>, tensor<1xf32>, tensor<i32>) -> tensor<10xf32>
+      stablehlo.return %1, %12 : tensor<i64>, tensor<10xf32>
+    }
+    return %0#1 : tensor<10xf32>
+  }
+}
+
+// CHECK: func.func @main(%arg0: tensor<10xf64>, %arg1: tensor<10xf64>) -> tensor<10xf32> {
+// CHECK-NEXT:     %0 = stablehlo.add %arg0, %arg1 : tensor<10xf64>
+// CHECK-NEXT:     %1 = stablehlo.maximum %arg0, %arg1 : tensor<10xf64>
+// CHECK-NEXT:     %2 = stablehlo.add %0, %1 : tensor<10xf64>
+// CHECK-NEXT:     %3 = stablehlo.convert %2 : (tensor<10xf64>) -> tensor<10xf32>
+// CHECK-NEXT:     return %3 : tensor<10xf32>
+// CHECK-NEXT: }