
Commit 2350d5a

Merge commit '61daa335a8b21102fae790ed286c7ddf71383af5'
2 parents: 0c70ca3 + 61daa33


37 files changed: +790, -46 lines changed

include/triton/Analysis/Utility.h

Lines changed: 13 additions & 0 deletions
@@ -153,6 +153,19 @@ class ScanLoweringHelper {
   SmallVector<Type> srcElementTypes;
 };
 
+// Helper class for lowering `tt.gather` operations. This class shares lowering
+// logic between shared memory allocation and LLVM codegen.
+class GatherLoweringHelper {
+public:
+  GatherLoweringHelper(triton::GatherOp gatherOp);
+
+  // Get the shared memory scratch size required by this op.
+  unsigned getScratchSizeInBytes();
+
+private:
+  triton::GatherOp gatherOp;
+};
+
 // Decomposes a reshape into simpler pieces.
 //
 // As an example, suppose we have a reshape from [4,4,4] to [2,2,8,2].

include/triton/Conversion/TritonGPUToLLVM/PatternTritonGPUOpToLLVM.h

Lines changed: 4 additions & 0 deletions
@@ -92,6 +92,10 @@ void populateScanOpToLLVMPatterns(LLVMTypeConverter &typeConverter,
                                   RewritePatternSet &patterns,
                                   const TargetInfoBase &targetInfo,
                                   PatternBenefit benefit);
+void populateGatherOpToLLVMPatterns(LLVMTypeConverter &typeConverter,
+                                    RewritePatternSet &patterns,
+                                    const TargetInfoBase &targetInfo,
+                                    PatternBenefit benefit);
 
 void populateConvertLayoutOpToLLVMPatterns(LLVMTypeConverter &typeConverter,
                                            const TargetInfoBase &targetInfo,

include/triton/Conversion/TritonGPUToLLVM/Utility.h

Lines changed: 5 additions & 0 deletions
@@ -1125,6 +1125,11 @@ emitBaseIndexForLayout(Location loc, RewriterBase &rewriter,
 
 // Emit indices calculation within each ConversionPattern, and returns a
 // [elemsPerThread X rank] index matrix.
+//
+// For example, if a thread owns `elemsPerThread` elements of a tensor with
+// type `type` and layout `layout`, the result will contain `elemsPerThread`
+// vectors. Each vector contains the SSA values of the indices required to
+// access the corresponding element, starting from the inner dimension.
 SmallVector<SmallVector<Value>>
 emitIndices(Location loc, RewriterBase &rewriter, const TargetInfoBase &target,
             Attribute layout, RankedTensorType type, bool withCTAOffset);
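
To make the shape of the returned index matrix concrete, here is a hypothetical call-site fragment. It relies only on the signature declared above; the names `loc`, `rewriter`, `targetInfo`, and `tensorTy` are assumptions standing in for values a ConversionPattern would already have in scope.

// Hypothetical call-site fragment (not standalone): `loc`, `rewriter`,
// `targetInfo`, and `tensorTy` are assumed to be available in the pattern.
SmallVector<SmallVector<Value>> indices =
    emitIndices(loc, rewriter, targetInfo, tensorTy.getEncoding(), tensorTy,
                /*withCTAOffset=*/false);
// indices.size() == elemsPerThread and indices[i].size() == rank: each inner
// vector holds one SSA index value per tensor dimension for the i-th element
// owned by the current thread.
for (const SmallVector<Value> &multiDimIdx : indices) {
  // Use multiDimIdx to address the corresponding element, e.g. when
  // computing a shared-memory offset.
  (void)multiDimIdx;
}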

include/triton/Dialect/Triton/IR/TritonOps.td

Lines changed: 26 additions & 0 deletions
@@ -869,6 +869,32 @@ def TT_HistogramOp : TT_Op<"histogram", [Pure]> {
   }];
 }
 
+//
+// Gather Op
+//
+def TT_GatherOp : TT_Op<"gather", [Pure,
+                                   DeclareOpInterfaceMethods<InferTypeOpInterface>]> {
+  let summary = "local gather operation";
+  let description = [{
+    Gather elements from the input tensor using the indices tensor along a
+    single specified axis. The output tensor has the same shape as the indices
+    tensor. The input and indices tensors must have the same number of
+    dimensions, and each dimension of the indices tensor that is not the gather
+    dimension cannot be greater than the corresponding dimension in the input
+    tensor.
+  }];
+
+  let arguments = (ins TT_Tensor:$src, TT_IntTensor:$indices, I32Attr:$axis);
+  let results = (outs TT_Tensor:$result);
+
+  let assemblyFormat = [{
+    $src `[` $indices `]` attr-dict `:`
+    functional-type(operands, results)
+  }];
+
+  let hasVerifier = 1;
+}
+
 //
 // Print Op
 //
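
As a rough illustration of the semantics described in the op definition above, the following standalone sketch models a 2-D gather. The helper name, element types, and shapes are illustrative assumptions, not part of the op.

#include <cassert>
#include <cstdint>
#include <vector>

using Matrix = std::vector<std::vector<float>>;
using IndexMatrix = std::vector<std::vector<int32_t>>;

// Reference-semantics sketch: the output has the shape of `indices`, and each
// element is read from `src` with the gather dimension replaced by the value
// stored in `indices` at that position.
Matrix referenceGather(const Matrix &src, const IndexMatrix &indices, int axis) {
  assert(axis == 0 || axis == 1);
  Matrix out(indices.size());
  for (size_t i = 0; i < indices.size(); ++i) {
    out[i].resize(indices[i].size());
    for (size_t j = 0; j < indices[i].size(); ++j) {
      int32_t idx = indices[i][j];
      out[i][j] = (axis == 0) ? src[idx][j] : src[i][idx];
    }
  }
  return out;
}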

include/triton/Dialect/TritonGPU/Transforms/Passes.td

Lines changed: 12 additions & 0 deletions
@@ -192,4 +192,16 @@ def TritonGPULoopScheduling: Pass<"tritongpu-loop-scheduling", "mlir::ModuleOp">
                  "number of pipeline stages">
   ];
 }
+
+def TritonGPUCoalesceAsyncCopy: Pass<"tritongpu-coalesce-async-copy", "mlir::ModuleOp"> {
+  let summary = "Improve coalescing for async global to local copies";
+
+  let description = "For AsyncCopyGlobalToLocal ops where the shared encoding's vec is less than "
+                    "the blocked encoding's sizePerThread, this pass improves coalescing by clipping the "
+                    "sizePerThread value";
+
+  let dependentDialects = ["mlir::triton::gpu::TritonGPUDialect",
+                           "mlir::triton::TritonDialect"];
+}
+
 #endif
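
The clipping rule the pass description mentions can be pictured with a small standalone sketch. The function name, the plain-vector representation, and the choice of clipping only the contiguous dimension are assumptions of this sketch, not details taken from the pass implementation.

#include <algorithm>
#include <vector>

// Hedged sketch: if the shared encoding only accepts `sharedVec` contiguous
// elements per access, writing more than that per thread does not help, so
// the blocked encoding's sizePerThread is reduced along its contiguous
// dimension (`contigDim` is an assumption here).
std::vector<unsigned> clipSizePerThread(std::vector<unsigned> sizePerThread,
                                        unsigned contigDim, unsigned sharedVec) {
  sizePerThread[contigDim] = std::min(sizePerThread[contigDim], sharedVec);
  return sizePerThread;
}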

include/triton/Dialect/TritonGPU/Transforms/Utility.h

Lines changed: 5 additions & 0 deletions
@@ -202,6 +202,11 @@ enum class MMALoadType {
   // pipelining
 };
 MMALoadType getMMALoadType(Operation *loadOp);
+
+// Returns composed LinearLayout for register to shared copy
+std::optional<triton::LinearLayout>
+getRegToSharedLayout(MLIRContext *ctx, ArrayRef<int64_t> shape,
+                     Attribute srcEnc, Attribute dstEnc, int elemBitWidth);
 } // namespace mlir
 
 #endif // TRITON_DIALECT_TRITONGPU_TRANSFORMS_UTILITY_H_
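
A hypothetical call-site fragment for the declaration above, using only the declared signature; the surrounding names (`ctx`, `shape`, `srcEnc`, `dstEnc`, `elemBitWidth`) are assumptions standing in for values taken from the copy's source and destination descriptors.

// Hypothetical call-site fragment (not standalone).
std::optional<triton::LinearLayout> regToShared =
    getRegToSharedLayout(ctx, shape, srcEnc, dstEnc, elemBitWidth);
if (!regToShared) {
  // The register and shared encodings could not be composed into a single
  // linear layout; a caller would fall back to a more general path.
}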

lib/Analysis/Allocation.cpp

Lines changed: 4 additions & 0 deletions
@@ -125,6 +125,10 @@ unsigned defaultAllocationAnalysisScratchSizeFn(Operation *op) {
     ScanLoweringHelper helper(scanOp);
     return helper.getScratchSizeInBytes();
   }
+  if (auto gatherOp = dyn_cast<GatherOp>(op)) {
+    GatherLoweringHelper helper(gatherOp);
+    return helper.getScratchSizeInBytes();
+  }
   if (auto histogram = dyn_cast<HistogramOp>(op)) {
     auto dstTy = histogram.getType();
     int threadsPerWarp = gpu::TritonGPUDialect::getThreadsPerWarp(

lib/Analysis/Utility.cpp

Lines changed: 11 additions & 0 deletions
@@ -415,6 +415,17 @@ unsigned ScanLoweringHelper::getAxisBlockStride() {
   llvm_unreachable("Axis not found in order");
 }
 
+GatherLoweringHelper::GatherLoweringHelper(triton::GatherOp gatherOp)
+    : gatherOp(gatherOp) {}
+
+unsigned GatherLoweringHelper::getScratchSizeInBytes() {
+  // For now, lower the gather op by writing the source tensor to shared memory.
+  // TODO(jeff): Leverage locality to avoid using scratch space when possible.
+  RankedTensorType srcType = gatherOp.getSrc().getType();
+  return product(srcType.getShape()) *
+         ceil<unsigned>(srcType.getElementTypeBitWidth(), 8);
+}
+
 unsigned getNumScratchElements(ArrayRef<unsigned> shape) {
   if (shape.empty())
     return 0;
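
To make the scratch-size formula above concrete, here is a standalone sketch that reproduces the same arithmetic for a hypothetical 128x64 f16 source tensor; the shape and element type are illustrative only.

#include <cstdint>
#include <iostream>

int main() {
  const int64_t shape[] = {128, 64}; // hypothetical source tensor shape
  const unsigned elemBitWidth = 16;  // e.g. f16
  int64_t numElements = 1;
  for (int64_t dim : shape)
    numElements *= dim;
  const unsigned bytesPerElement = (elemBitWidth + 7) / 8; // ceil(bits / 8)
  // 128 * 64 * 2 = 16384 bytes of shared-memory scratch space.
  std::cout << numElements * bytesPerElement << " bytes\n";
  return 0;
}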

lib/Conversion/TritonGPUToLLVM/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -13,6 +13,7 @@ add_triton_library(TritonGPUToLLVM
   AllocateSharedMemory.cpp
   ReduceOpToLLVM.cpp
   ScanOpToLLVM.cpp
+  GatherOpToLLVM.cpp
   ConvertLayoutOpToLLVM.cpp
   ControlFlowOpToLLVM.cpp
   FuncOpToLLVM.cpp

lib/Conversion/TritonGPUToLLVM/ElementwiseOpToLLVM.cpp

Lines changed: 2 additions & 3 deletions
@@ -325,13 +325,12 @@ struct ElementwiseInlineAsmOpConversion
     // asmResults is a flat struct; pack its values into
     // [return_value][op.getPackedElement()].
     SmallVector<SmallVector<Value>> ret(op->getNumResults());
+    int structIdx = 0;
     for (int i = 0; i < op->getNumResults(); i++) {
-      int structIdx = 0;
       for (int j = 0; j < op.getPackedElement(); j++) {
         Value val;
         if (asmRetTypes.size() > 1) {
-          val =
-              extract_val(asmResults, i * op.getPackedElement() + structIdx++);
+          val = extract_val(asmResults, structIdx++);
         } else {
           val = asmResults;
         }
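
The hoisted counter visits the same flat-struct positions as the previous `i * op.getPackedElement() + structIdx++` expression, since that inner counter was reset to zero for each result. The following standalone sketch checks the equivalence for hypothetical result and packing counts.

#include <cassert>

int main() {
  const int numResults = 2, packedElement = 4; // hypothetical counts
  int structIdx = 0;
  for (int i = 0; i < numResults; ++i)
    for (int j = 0; j < packedElement; ++j)
      // The running counter equals the index the old expression computed.
      assert(structIdx++ == i * packedElement + j);
  return 0;
}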
