Skip to content

Commit c801fb3

Browse files
jumerckx and wsmoses authored
Only lower multislice to custom call if efficient lowering is possible (#2187)
* Add detectCrossShardPattern * cleanup * move detection to multislice lowering instead * update test * remove previous lowering test * relax shard size constraint * include backend_config in lit tests * _SPMDEnzymeInternalOp_MultiSlice --> _SPMDOp_MultiSlice * get rid of getNumShardsAlongDim * fixup * fix * fix * fix * fix api * Update override_commit to new SHA value * Add 'mspp' to reactant_commit in workflow * Comment out linux-x86-ct6e-180-4tpu in workflow Comment out one of the OS options in the workflow matrix. * Update workspace.bzl * Update workspace.bzl * create pre-slice operation if a sliced dimension lives on one device * test * rename variable * fix * fmt * use proper upstream commit * fix * fix? * add printer * more wrap print * tostring * fix * Update test-gb-25.yml * Change gb25_commit branch from 'main' to 'wsmoses-patch-6' * Update XLA_FLAGS to include HLO pass regex * now shardy patch * fix * change shardy * change xla * fix * fix * fix * fix * ms --------- Co-authored-by: William S. Moses <gh@wsmoses.com> Co-authored-by: William Moses <wmoses@google.com>
1 parent e5480b4 commit c801fb3

File tree

13 files changed

+226
-82
lines changed

13 files changed

+226
-82
lines changed

.github/workflows/test-gb-25.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -201,7 +201,7 @@ jobs:
201201
ALL_TO_ALL_THRESHOLD=0
202202
ALL_GATHER_THRESHOLD=0
203203
ALL_REDUCE_THRESHOLD=0
204-
COLLECTIVE_PERMUTE_THRESHOLD=339
204+
COLLECTIVE_PERMUTE_THRESHOLD=345
205205
elif [[ '${{ contains(matrix.os, 'tpu') }}' == 'true' ]]; then
206206
ALL_TO_ALL_THRESHOLD=0
207207
ALL_GATHER_THRESHOLD=0

patches/xla_spmd.patch

Lines changed: 0 additions & 15 deletions
This file was deleted.

src/enzyme_ad/jax/Dialect/Ops.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1551,7 +1551,7 @@ LogicalResult fixupGetFunc(LLVM::CallOp op, OpBuilder &rewriter,
15511551
struct NoopResource : public SideEffects::Resource::Base<NoopResource> {
15521552
MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(NoopResource)
15531553

1554-
StringRef getName() final { return "<NoopResource>"; }
1554+
StringRef getName() const final { return "<NoopResource>"; }
15551555
};
15561556

15571557
void NoopOp::build(OpBuilder &builder, OperationState &result,

src/enzyme_ad/jax/Implementations/TritonDerivatives.td

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,6 @@ def : TritonInactiveOp<"MakeRangeOp">;
2525
def : TritonInactiveOp<"PrintOp">;
2626

2727
def : ReadOnlyIdentityOp<"triton", "AddPtrOp", [0]>;
28-
def : ReadOnlyIdentityOp<"triton", "AdvanceOp", [0]>;
2928
def : ReadOnlyIdentityOp<"triton", "LoadOp", [0]>;
3029
def : ReadOnlyIdentityOp<"triton", "SplatOp", [0]>;
3130
def : MemoryIdentityOp<"triton", "StoreOp", [1], [0]>;

src/enzyme_ad/jax/Passes/OptimizeCommunication.cpp

Lines changed: 155 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -2261,6 +2261,62 @@ struct MultiRotateCustomCallOptimize
22612261
}
22622262
};
22632263

2264+
/// Detect whether this MultiSliceOp matches the cross-shard pattern:
2265+
/// 1. All strides are 1.
2266+
/// 2. For every sharded dimension except the multi-slice dimension,
2267+
/// start/limit span the full tensor extent.
2268+
/// 3. Along the multi-slice dimension, every slice's start falls within
2269+
/// one shard and its end falls within a different shard.
2270+
bool detectCrossShardPattern(Value operand, Operation *op,
2271+
ArrayRef<int64_t> startIndices,
2272+
ArrayRef<int64_t> limitIndices,
2273+
ArrayRef<int64_t> strides, int32_t dim,
2274+
int32_t amount, bool &needsSlice) {
2275+
// --- Condition 1: unit strides everywhere ---
2276+
if (!llvm::all_of(strides, [](int64_t s) { return s == 1; }))
2277+
return false;
2278+
2279+
auto operandType = cast<RankedTensorType>(operand.getType());
2280+
auto operandSharding = mlir::sdy::getSharding(operand);
2281+
if (!operandSharding) {
2282+
return false;
2283+
}
2284+
ArrayRef<int64_t> shape = operandType.getShape();
2285+
int64_t rank = shape.size();
2286+
2287+
if (dim < 0 || dim >= rank)
2288+
return false;
2289+
2290+
// --- Condition 2: full span on every sharded dim except `dim` ---
2291+
for (int64_t d = 0; d < rank; ++d) {
2292+
if (d == dim)
2293+
continue;
2294+
int64_t numShards = getNumDevicesAlongDimension(operandSharding, d, op);
2295+
if (startIndices[d] != 0 || limitIndices[d] != shape[d]) {
2296+
needsSlice = true;
2297+
if (numShards > 1) {
2298+
return false;
2299+
}
2300+
}
2301+
}
2302+
2303+
// --- Condition 3: cross-shard slicing along `dim` ---
2304+
int64_t numShards = getNumDevicesAlongDimension(operandSharding, dim, op);
2305+
if (numShards <= 1)
2306+
return false; // Not sharded along the slice dimension.
2307+
2308+
int64_t dimSize = shape[dim];
2309+
int64_t shardSize = (dimSize + numShards - 1) / numShards;
2310+
2311+
if (startIndices[dim] > shardSize) {
2312+
return false;
2313+
}
2314+
if (shape[dim] - limitIndices[dim] > shardSize) {
2315+
return false;
2316+
}
2317+
return true;
2318+
}
2319+
22642320
struct MultiSliceCustomCallOptimize
22652321
: public OpRewritePattern<enzymexla::MultiSliceOp> {
22662322

@@ -2283,40 +2339,123 @@ struct MultiSliceCustomCallOptimize
22832339
if (slice->getParentOfType<sdy::ManualComputationOp>())
22842340
return failure();
22852341

2286-
auto rotateDimension = slice.getDimension();
2342+
auto sliceDimension = slice.getDimension();
22872343
auto shardings = mlir::sdy::getShardingPerValue(slice);
22882344
if (!shardings)
22892345
return rewriter.notifyMatchFailure(slice, "No sharding found.");
2290-
auto rotateSharding = shardings.getSharding(0);
2346+
auto sliceSharding = shardings.getSharding(0);
2347+
for (int64_t i = 1; i < slice.getNumResults(); ++i) {
2348+
if (shardings.getSharding(i) != sliceSharding)
2349+
return rewriter.notifyMatchFailure(
2350+
slice, "Not all results have the same sharding");
2351+
}
22912352

22922353
int64_t numDevicesAlongDimension =
2293-
getNumDevicesAlongDimension(rotateSharding, rotateDimension, slice);
2354+
getNumDevicesAlongDimension(sliceSharding, sliceDimension, slice);
22942355

22952356
if (numDevicesAlongDimension == 1) {
22962357
return rewriter.notifyMatchFailure(
22972358
slice,
22982359
"numDevicesAlongDimension == 1. Communication is already optimized.");
22992360
}
23002361

2301-
std::string start_indices =
2302-
serializeDenseI64ArrayAttr(slice.getStartIndices());
2303-
std::string limit_indices =
2304-
serializeDenseI64ArrayAttr(slice.getLimitIndices());
2305-
std::string strides = serializeDenseI64ArrayAttr(slice.getStrides());
2362+
Value customCallOperand = slice.getOperand();
2363+
auto operandSharding = mlir::sdy::getSharding(customCallOperand);
2364+
if (!operandSharding) {
2365+
return rewriter.notifyMatchFailure(slice, "No operand shardings");
2366+
}
2367+
if (sliceSharding != operandSharding) {
2368+
return rewriter.notifyMatchFailure(slice,
2369+
"Mismatched input/output sharding");
2370+
}
23062371

2307-
std::string opaque = "dimension=" + std::to_string(rotateDimension) +
2372+
// Only lower to custom call if the cross-shard pattern is detected.
2373+
auto startIndices = SmallVector<int64_t>(slice.getStartIndices());
2374+
auto limitIndices = SmallVector<int64_t>(slice.getLimitIndices());
2375+
auto strideVals = SmallVector<int64_t>(slice.getStrides());
2376+
bool needs_slice = false;
2377+
if (!detectCrossShardPattern(customCallOperand, slice, startIndices,
2378+
limitIndices, strideVals, sliceDimension,
2379+
slice.getAmount(), needs_slice))
2380+
return rewriter.notifyMatchFailure(
2381+
slice, "MultiSlice does not match cross-shard pattern.");
2382+
2383+
// --- Replace the needs_slice bail-out and custom-call emission with this:
2384+
// ---
2385+
2386+
SmallVector<int64_t> finalStartIndices(startIndices);
2387+
SmallVector<int64_t> finalLimitIndices(limitIndices);
2388+
SmallVector<int64_t> finalStrides(strideVals);
2389+
2390+
if (needs_slice) {
2391+
// Emit a preliminary stablehlo::SliceOp that trims replicated
2392+
// (unsharded) dimensions down to the requested range, so that
2393+
// the MultiSlice custom call afterwards spans the full axis on
2394+
// every dimension except `dim`.
2395+
auto operandType = cast<RankedTensorType>(customCallOperand.getType());
2396+
ArrayRef<int64_t> shape = operandType.getShape();
2397+
int64_t rank = shape.size();
2398+
2399+
auto operandSharding = sdy::getSharding(slice.getOperand());
2400+
2401+
SmallVector<int64_t> preStart(rank);
2402+
SmallVector<int64_t> preLimit(rank);
2403+
SmallVector<int64_t> preStrides(rank, 1);
2404+
2405+
for (int64_t d = 0; d < rank; ++d) {
2406+
if (d == sliceDimension) {
2407+
// Keep the full extent along the multi-slice dimension;
2408+
// the custom call handles cross-shard slicing there.
2409+
preStart[d] = 0;
2410+
preLimit[d] = shape[d];
2411+
} else {
2412+
int64_t numShards =
2413+
getNumDevicesAlongDimension(operandSharding, d, slice);
2414+
if (numShards <= 1 &&
2415+
(startIndices[d] != 0 || limitIndices[d] != shape[d])) {
2416+
// Replicated dim that doesn't span the full tensor —
2417+
// slice it now so the custom call can assume full extent.
2418+
preStart[d] = startIndices[d];
2419+
preLimit[d] = limitIndices[d];
2420+
// After pre-slicing, the custom call sees [0, newSize).
2421+
finalStartIndices[d] = 0;
2422+
finalLimitIndices[d] = limitIndices[d] - startIndices[d];
2423+
} else {
2424+
preStart[d] = 0;
2425+
preLimit[d] = shape[d];
2426+
}
2427+
}
2428+
}
2429+
2430+
auto preSliceOp = rewriter.create<stablehlo::SliceOp>(
2431+
slice.getLoc(), customCallOperand, preStart, preLimit, preStrides);
2432+
2433+
SmallVector<TensorShardingAttr> opShardings(1, sliceSharding);
2434+
sdy::setShardings(preSliceOp, TensorShardingPerValueAttr::get(
2435+
rewriter.getContext(), opShardings));
2436+
2437+
customCallOperand = preSliceOp.getResult();
2438+
}
2439+
2440+
std::string start_indices_str =
2441+
serializeDenseI64ArrayAttr(finalStartIndices);
2442+
std::string limit_indices_str =
2443+
serializeDenseI64ArrayAttr(finalLimitIndices);
2444+
std::string strides_str = serializeDenseI64ArrayAttr(finalStrides);
2445+
2446+
std::string opaque = "dimension=" + std::to_string(sliceDimension) +
23082447
",amount=" + std::to_string(slice.getAmount()) +
2309-
",start_indices=" + start_indices +
2310-
",limit_indices=" + limit_indices +
2311-
",strides=" + strides;
2448+
",start_indices=" + start_indices_str +
2449+
",limit_indices=" + limit_indices_str +
2450+
",strides=" + strides_str;
23122451

2313-
auto fnSym = rewriter.getStringAttr("_SPMDEnzymeInternalOp_MultiSlice");
2452+
auto fnSym = rewriter.getStringAttr("_SPMDInternalOp_MultiSlice");
23142453

2315-
SmallVector<TensorShardingAttr> opShardings(slice.getNumResults(),
2316-
rotateSharding);
2454+
SmallVector<TensorShardingAttr> opShardings(slice.getAmount() + 1,
2455+
sliceSharding);
23172456

23182457
auto ccall = rewriter.replaceOpWithNewOp<stablehlo::CustomCallOp>(
2319-
slice, slice->getResultTypes(), slice->getOperands(), fnSym,
2458+
slice, slice->getResultTypes(), ValueRange{customCallOperand}, fnSym,
23202459
/*has_side_effect=*/rewriter.getBoolAttr(false),
23212460
/*backend_config=*/rewriter.getStringAttr(opaque),
23222461
/*api_version=*/nullptr,

src/enzyme_ad/jax/clang_compile.cc

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -505,7 +505,6 @@ struct tensor<T, n0, N...>
505505

506506
DiagsBuffer->FlushDiagnostics(Clang->getDiagnostics());
507507
if (!Success) {
508-
Clang->getDiagnosticClient().finish();
509508
llvm::errs() << " failed diag\n";
510509
return {};
511510
}
Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
// RUN: enzymexlamlir-opt %s --optimize-communication="multislice_custom_call=1" | FileCheck %s
2+
3+
module {
4+
sdy.mesh @mesh = <["a"=2]>
5+
func.func public @main(%arg0: tensor<10xf32> {sdy.sharding = #sdy.sharding<@mesh, [{"a", ?}]>}) -> (tensor<7xf32>, tensor<7xf32>, tensor<7xf32>) {
6+
%1, %2, %3 = "enzymexla.multi_slice"(%arg0) {
7+
dimension = 0 : i32,
8+
amount = 2 : i32,
9+
start_indices = array<i64: 0>,
10+
limit_indices = array<i64: 7>,
11+
strides = array<i64: 1>,
12+
sdy.sharding = #sdy.sharding_per_value<[<@mesh, [{"a", ?}]>, <@mesh, [{"a", ?}]>, <@mesh, [{"a", ?}]>]>
13+
} : (tensor<10xf32>) -> (tensor<7xf32>, tensor<7xf32>, tensor<7xf32>)
14+
return %1, %2, %3 : tensor<7xf32>, tensor<7xf32>, tensor<7xf32>
15+
}
16+
}
17+
18+
// CHECK: func.func public @main(%arg0: tensor<10xf32> {sdy.sharding = #sdy.sharding<@mesh, [{"a", ?}]>}) -> (tensor<7xf32>, tensor<7xf32>, tensor<7xf32>) {
19+
// CHECK-NEXT: %0:3 = stablehlo.custom_call @_SPMDInternalOp_MultiSlice(%arg0) {backend_config = "dimension=0,amount=2,start_indices=[0],limit_indices=[7],strides=[1]", sdy.sharding = #sdy.sharding_per_value<[<@mesh, [{"a", ?}]>, <@mesh, [{"a", ?}]>, <@mesh, [{"a", ?}]>]>} : (tensor<10xf32>) -> (tensor<7xf32>, tensor<7xf32>, tensor<7xf32>)
20+
// CHECK-NEXT: return %0#0, %0#1, %0#2 : tensor<7xf32>, tensor<7xf32>, tensor<7xf32>
21+
// CHECK-NEXT: }
22+
23+
24+
module {
25+
sdy.mesh @mesh = <["x"=2, "y"=2]>
26+
func.func public @main(%arg0: tensor<20x1536x3056xf64> {sdy.sharding = #sdy.sharding<@mesh, [{}, {"y"}, {"x"}]>}) -> (tensor<4x1520x3056xf64>, tensor<4x1520x3056xf64>) {
27+
%0:2 = "enzymexla.multi_slice"(%arg0) {
28+
amount = 1 : i32,
29+
dimension = 1 : i32,
30+
limit_indices = array<i64: 12, 1529, 3056>,
31+
start_indices = array<i64: 8, 9, 0>,
32+
strides = array<i64: 1, 1, 1>,
33+
sdy.sharding = #sdy.sharding_per_value<[<@mesh, [{}, {"y"}, {"x"}]>, <@mesh, [{}, {"y"}, {"x"}]>]>
34+
} : (tensor<20x1536x3056xf64>) -> (tensor<4x1520x3056xf64>, tensor<4x1520x3056xf64>)
35+
return %0#0, %0#1 : tensor<4x1520x3056xf64>, tensor<4x1520x3056xf64>
36+
}
37+
}
38+
39+
// CHECK: func.func public @main(%arg0: tensor<20x1536x3056xf64> {sdy.sharding = #sdy.sharding<@mesh, [{}, {"y"}, {"x"}]>}) -> (tensor<4x1520x3056xf64>, tensor<4x1520x3056xf64>) {
40+
// CHECK-NEXT: %[[SLICE:.*]] = stablehlo.slice %arg0 [8:12, 0:1536, 0:3056] {sdy.sharding = #sdy.sharding_per_value<[<@mesh, [{}, {"y"}, {"x"}]>]>} : (tensor<20x1536x3056xf64>) -> tensor<4x1536x3056xf64>
41+
// CHECK-NEXT: %[[CC:.*]]:2 = stablehlo.custom_call @_SPMDInternalOp_MultiSlice(%[[SLICE]]) {backend_config = "dimension=1,amount=1,start_indices=[0, 9, 0],limit_indices=[4, 1529, 3056],strides=[1, 1, 1]", sdy.sharding = #sdy.sharding_per_value<[<@mesh, [{}, {"y"}, {"x"}]>, <@mesh, [{}, {"y"}, {"x"}]>]>} : (tensor<4x1536x3056xf64>) -> (tensor<4x1520x3056xf64>, tensor<4x1520x3056xf64>)
42+
// CHECK-NEXT: return %[[CC]]#0, %[[CC]]#1 : tensor<4x1520x3056xf64>, tensor<4x1520x3056xf64>
43+
// CHECK-NEXT: }

test/lit_tests/lower_multislice_custom_call.mlir

Lines changed: 0 additions & 18 deletions
This file was deleted.

test/lit_tests/parallel-lower-inline.mlir

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -23,14 +23,14 @@ module {
2323
// CHECK: scf.parallel
2424
// CHECK: memref.alloca_scope {
2525
// CHECK: scf.execute_region {
26-
// CHECK-DAG: %[[a1:.*]] = llvm.alloca %0 x !llvm.struct<(i8)> {alignment = 1 : i64} : (i64) -> !llvm.ptr
2726
// CHECK-DAG: %[[a2:.*]] = llvm.alloca %0 x !llvm.struct<(i8)> {alignment = 1 : i64} : (i64) -> !llvm.ptr
2827
// CHECK: llvm.store %[[ld]], %[[a2]] : !llvm.struct<(i8)>, !llvm.ptr
2928
// CHECK: memref.alloca_scope {
29+
// CHECK-DAG: %[[a1:.*]] = llvm.alloca %0 x !llvm.struct<(i8)> {alignment = 1 : i64} : (i64) -> !llvm.ptr
3030
// CHECK: scf.execute_region {
31-
// CHECK: "llvm.intr.memcpy"(%[[a1]], %[[a2]], %0) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i64) -> ()
32-
// CHECK: %[[rld:.*]] = llvm.load %4 : !llvm.ptr -> !llvm.struct<(i8)>
33-
// CHECK: llvm.store %6, %arg5 : !llvm.struct<(i8)>, !llvm.ptr
31+
// CHECK: "llvm.intr.memcpy"(%[[a1]], %[[a2]], %0) <{arg_attrs = [{llvm.align = 1 : i64}], isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i64) -> ()
32+
// CHECK: %[[rld:.*]] = llvm.load %[[a1]] : !llvm.ptr -> !llvm.struct<(i8)>
33+
// CHECK: llvm.store %[[rld]], %arg5 : !llvm.struct<(i8)>, !llvm.ptr
3434
// CHECK: scf.yield
3535
// CHECK: }
3636
// CHECK: }

0 commit comments

Comments (0)