intel
diff --git a/include/triton/Conversion/TritonGPUToLLVM/Utility.h b/include/triton/Conversion/TritonGPUToLLVM/Utility.h
Lines changed: 24 additions & 0 deletions
diff --git a/include/triton/Dialect/Triton/IR/TritonOps.td b/include/triton/Dialect/Triton/IR/TritonOps.td
Lines changed: 10 additions & 1 deletion
diff --git a/include/triton/Dialect/TritonGPU/IR/Dialect.h b/include/triton/Dialect/TritonGPU/IR/Dialect.h
Lines changed: 6 additions & 0 deletions
diff --git a/include/triton/Dialect/TritonGPU/Transforms/Passes.td b/include/triton/Dialect/TritonGPU/Transforms/Passes.td
Lines changed: 14 additions & 5 deletions
diff --git a/include/triton/Dialect/TritonGPU/Transforms/Utility.h b/include/triton/Dialect/TritonGPU/Transforms/Utility.h
Lines changed: 1 addition & 2 deletions
@@ -335,12 +335,18 @@ SmallVector<Value> delinearize(RewriterBase &rewriter, Location loc,
 SmallVector<Value> delinearize(RewriterBase &rewriter, Location loc,
                                Value linear, ArrayRef<unsigned> shape);
 
+SmallVector<unsigned> delinearize(unsigned linear, ArrayRef<unsigned> shape,
+                                  ArrayRef<unsigned> order);
+
 Value linearize(RewriterBase &rewriter, Location loc, ArrayRef<Value> multiDim,
                 ArrayRef<unsigned> shape, ArrayRef<unsigned> order);
 
 Value linearize(RewriterBase &rewriter, Location loc, ArrayRef<Value> multiDim,
                 ArrayRef<unsigned> shape);
 
+size_t linearize(ArrayRef<unsigned> multiDim, ArrayRef<unsigned> shape,
+                 ArrayRef<unsigned> order);
+
 Value addStringToModule(Location loc, RewriterBase &rewriter, StringRef key,
                         StringRef content);
 
@@ -495,6 +501,24 @@ inline Value dot(RewriterBase &rewriter, Location loc, ArrayRef<Value> offsets,
   return ret;
 }
 
+/// Extend a 2D shared memory object to 3D.
+///
+/// If the tensor has 3 dimensions, returns the original shared object.
+/// If the tensor shape is [M, N], returns a shared object describing [1, M, N].
+///
+/// This function is used to simplify processing of 2D and 3D dot operands,
+/// particularly in the conversion of the local_load operation.
+///
+/// \param rewriter conversion pattern rewriter
+/// \param loc source location
+/// \param smemObj shared memory object to extend
+/// \param shape shape of the tensor represented by smemObj
+/// \returns shared object describing a 3D tensor
+SharedMemoryObject
+getExpandedSharedMemoryObject(ConversionPatternRewriter &rewriter, Location loc,
+                              SharedMemoryObject smemObj,
+                              ArrayRef<int64_t> shape);
+
 // -----------------------------------------------------------------------
 // Blocked layout indices
 // -----------------------------------------------------------------------
 
@@ -882,9 +882,18 @@ def TT_GatherOp : TT_Op<"gather", [Pure,
     dimension, and each dimension of the indices tensor that is not the gather
     dimension cannot be greater than the corresponding dimension in the input
     tensor.
+
+    The `efficient_layout` attribute is set when the compiler has determined an
+    optimized layout for the operation, indicating that it should not be
+    changed.
   }];
 
-  let arguments = (ins TT_Tensor:$src, TT_IntTensor:$indices, I32Attr:$axis);
+  let arguments = (ins
+    TT_Tensor:$src,
+    TT_IntTensor:$indices,
+    I32Attr:$axis,
+    UnitAttr:$efficient_layout
+  );
   let results = (outs TT_Tensor:$result);
 
   let assemblyFormat = [{
 
@@ -234,6 +234,12 @@ void dumpHWLayout(RankedTensorType tensorType);
 // Return a string representation of the layout of the tensor.
 std::string getLayoutStr(RankedTensorType tensorType, bool useHWPointOfView);
 
+template <typename T>
+llvm::SmallVector<T> expandMatrixShapeWithBatch(llvm::ArrayRef<T> s);
+
+llvm::SmallVector<unsigned>
+expandMatrixOrderWithBatch(llvm::ArrayRef<unsigned> o);
+
 } // namespace gpu
 } // namespace triton
 } // namespace mlir
 
@@ -158,11 +158,20 @@ def TritonGPUOptimizeThreadLocality : Pass<"tritongpu-optimize-thread-locality",
   let summary = "Reduce the cost of synchronization between threads in an SM";
 
   let description = [{
-    The aim of this pass is to reduce cross-thread communication for reduction
-    operations, by adjusting the reduction size (or layout) to avoid splitting
-    the reduction operation between multiple threads. Currently, this pass only
-    optimizes reduction yielded by loop to be thread-local until
-    after the loop completes.
+    The aim of this pass is to reduce cross-thread communication for certain
+    operations, like reductions, reshapes, and gathers.
+
+    For reduction operations, this pass attempts to adjust the reduction size
+    (or layout) to avoid splitting the reduction operation between multiple
+    threads. Currently, this pass only optimizes reduction yielded by loop to be
+    thread-local until after the loop completes.
+
+    For gathers, this pass will attempt to pick an optimized layout for gather
+    operations in the module. This is determined based on the shapes of the
+    gather operands as well as their existing layouts. The pass applies
+    heuristics to determine when it is appropriate to assign specific layouts
+    and trigger their respective codegen paths. For now, the pass only attempts
+    to apply layouts that result in warp-synchronous gathers.
   }];
 
   let dependentDialects = ["mlir::triton::gpu::TritonGPUDialect",
 
@@ -163,8 +163,7 @@ Operation *cloneWithInferType(mlir::OpBuilder &rewriter, Operation *op,
 LogicalResult getConvertBackwardSlice(
     Value root, SetVector<Value> &slice, Attribute rootEncoding,
     DenseMap<Value, Attribute> &layout,
-    std::function<bool(Operation *)> stopPropagation = nullptr,
-    std::function<Value(Value, Attribute)> getExistingConversion = nullptr);
+    std::function<bool(Operation *)> stopPropagation = nullptr);
 
 // Populate pattern to remove dead cycles in ForOp.
 void populateForOpDeadArgumentElimination(RewritePatternSet &patterns);