Commit 2c0b791

[Triton] Add tl.gather with a naive codegen implementation (#5262)
This PR adds a `tl.gather` builtin that implements a local gather along a single axis, with semantics matching `torch.gather`. `tl.gather` generates a `tt.gather` op, which is piped through the compiler mostly untouched for now, since the codegen is very naive. `tt.gather` is lowered by writing the source tensor into shared memory and then gathering out of shared memory, so it requires scratch space to be allocated. In a follow-up, I will implement an optimized layout rule for the op that ensures the gather axis fits into a single warp, allowing the gather to be implemented using warp shuffles. There are other avenues for optimization as well: a `tt.gather(tt.load)` where the load has only one use can be lowered into a DMA from global memory to shared memory, followed by a gather directly out of shared memory.
1 parent b8a4b87 commit 2c0b791
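For context, Python-side usage should look roughly like the sketch below; the exact `tl.gather` call signature (argument order, keyword names) is an assumption inferred from the `torch.gather` semantics described above, not something this diff pins down.

    import triton
    import triton.language as tl

    @triton.jit
    def gather_rows_kernel(src_ptr, idx_ptr, out_ptr,
                           M: tl.constexpr, K: tl.constexpr, N: tl.constexpr):
        # An M x N source block and a K x N index block: same rank, and only
        # the gather axis (axis 0 here) may differ in size.
        rm = tl.arange(0, M)[:, None]
        rk = tl.arange(0, K)[:, None]
        rn = tl.arange(0, N)[None, :]
        src = tl.load(src_ptr + rm * N + rn)
        idx = tl.load(idx_ptr + rk * N + rn)  # idx_ptr assumed to hold int32
        # out[k, n] = src[idx[k, n], n], as in torch.gather(src, 0, idx)
        out = tl.gather(src, idx, 0)  # assumed signature: (src, indices, axis)
        tl.store(out_ptr + rk * N + rn, out)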

File tree: 26 files changed, +518 -20 lines

include/triton/Analysis/Utility.h

Lines changed: 13 additions & 0 deletions
@@ -153,6 +153,19 @@ class ScanLoweringHelper {
   SmallVector<Type> srcElementTypes;
 };
 
+// Helper class for lowering `tt.gather` operations. This class shares lowering
+// logic between shared memory allocation and LLVM codegen.
+class GatherLoweringHelper {
+public:
+  GatherLoweringHelper(triton::GatherOp gatherOp);
+
+  // Get the shared memory scratch size required by this op.
+  unsigned getScratchSizeInBytes();
+
+private:
+  triton::GatherOp gatherOp;
+};
+
 // Decomposes a reshape into simpler pieces.
 //
 // As an example, suppose we have a reshape from [4,4,4] to [2,2,8,2].

include/triton/Conversion/TritonGPUToLLVM/PatternTritonGPUOpToLLVM.h

Lines changed: 4 additions & 0 deletions
@@ -92,6 +92,10 @@ void populateScanOpToLLVMPatterns(LLVMTypeConverter &typeConverter,
                                   RewritePatternSet &patterns,
                                   const TargetInfoBase &targetInfo,
                                   PatternBenefit benefit);
+void populateGatherOpToLLVMPatterns(LLVMTypeConverter &typeConverter,
+                                    RewritePatternSet &patterns,
+                                    const TargetInfoBase &targetInfo,
+                                    PatternBenefit benefit);
 
 void populateConvertLayoutOpToLLVMPatterns(LLVMTypeConverter &typeConverter,
                                            const TargetInfoBase &targetInfo,

include/triton/Conversion/TritonGPUToLLVM/Utility.h

Lines changed: 5 additions & 0 deletions
@@ -1125,6 +1125,11 @@ emitBaseIndexForLayout(Location loc, RewriterBase &rewriter,
 
 // Emit indices calculation within each ConversionPattern, and returns a
 // [elemsPerThread X rank] index matrix.
+//
+// For example, if a thread owns `elemsPerThread` elements of a tensor with
+// type `type` and layout `layout`, the result will contain `elemsPerThread`
+// vectors. Each vector contains the SSA values of the indices required to
+// access the corresponding element, starting from the inner dimension.
 SmallVector<SmallVector<Value>>
 emitIndices(Location loc, RewriterBase &rewriter, const TargetInfoBase &target,
             Attribute layout, RankedTensorType type, bool withCTAOffset);

include/triton/Dialect/Triton/IR/TritonOps.td

Lines changed: 26 additions & 0 deletions
@@ -869,6 +869,32 @@ def TT_HistogramOp : TT_Op<"histogram", [Pure]> {
   }];
 }
 
+//
+// Gather Op
+//
+def TT_GatherOp : TT_Op<"gather", [Pure,
+                                   DeclareOpInterfaceMethods<InferTypeOpInterface>]> {
+  let summary = "local gather operation";
+  let description = [{
+    Gather elements from the input tensor using the indices tensor along a
+    single specified axis. The output tensor has the same shape as the indices
+    tensor. The input and indices tensors must have the same number of
+    dimensions, and each dimension of the indices tensor that is not the gather
+    dimension cannot be greater than the corresponding dimension in the input
+    tensor.
+  }];
+
+  let arguments = (ins TT_Tensor:$src, TT_IntTensor:$indices, I32Attr:$axis);
+  let results = (outs TT_Tensor:$result);
+
+  let assemblyFormat = [{
+    $src `[` $indices `]` attr-dict `:`
+    functional-type(operands, results)
+  }];
+
+  let hasVerifier = 1;
+}
+
 //
 // Print Op
 //
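Given this assemblyFormat, the op's textual form should look along the lines of `%y = tt.gather %x[%i] {axis = 0 : i32} : (tensor<4x3xf32>, tensor<2x3xi32>) -> tensor<2x3xf32>`; this is a hand-written illustration inferred from the declaration, not a snippet taken from the commit.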

lib/Analysis/Allocation.cpp

Lines changed: 4 additions & 0 deletions
@@ -125,6 +125,10 @@ unsigned defaultAllocationAnalysisScratchSizeFn(Operation *op) {
     ScanLoweringHelper helper(scanOp);
     return helper.getScratchSizeInBytes();
   }
+  if (auto gatherOp = dyn_cast<GatherOp>(op)) {
+    GatherLoweringHelper helper(gatherOp);
+    return helper.getScratchSizeInBytes();
+  }
   if (auto histogram = dyn_cast<HistogramOp>(op)) {
     auto dstTy = histogram.getType();
     int threadsPerWarp = gpu::TritonGPUDialect::getThreadsPerWarp(

lib/Analysis/Utility.cpp

Lines changed: 11 additions & 0 deletions
@@ -408,6 +408,17 @@ unsigned ScanLoweringHelper::getAxisBlockStride() {
   llvm_unreachable("Axis not found in order");
 }
 
+GatherLoweringHelper::GatherLoweringHelper(triton::GatherOp gatherOp)
+    : gatherOp(gatherOp) {}
+
+unsigned GatherLoweringHelper::getScratchSizeInBytes() {
+  // For now, lower the gather op by writing the source tensor to shared memory.
+  // TODO(jeff): Leverage locality to avoid using scratch space when possible.
+  RankedTensorType srcType = gatherOp.getSrc().getType();
+  return product(srcType.getShape()) *
+         ceil<unsigned>(srcType.getElementTypeBitWidth(), 8);
+}
+
 unsigned getNumScratchElements(ArrayRef<unsigned> shape) {
   if (shape.empty())
     return 0;
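The scratch requirement is simply the byte footprint of the full source tensor, since the naive lowering stages every element in shared memory. A small sketch of the same arithmetic (hypothetical helper, for illustration only):

    from math import ceil, prod

    def gather_scratch_bytes(src_shape, elem_bitwidth):
        # Mirrors GatherLoweringHelper::getScratchSizeInBytes: every source
        # element is staged in shared memory exactly once.
        return prod(src_shape) * ceil(elem_bitwidth / 8)

    assert gather_scratch_bytes((128, 16), 16) == 4096  # 128*16 fp16 elements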

lib/Conversion/TritonGPUToLLVM/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -13,6 +13,7 @@ add_triton_library(TritonGPUToLLVM
   AllocateSharedMemory.cpp
   ReduceOpToLLVM.cpp
   ScanOpToLLVM.cpp
+  GatherOpToLLVM.cpp
   ConvertLayoutOpToLLVM.cpp
   ControlFlowOpToLLVM.cpp
   FuncOpToLLVM.cpp
lib/Conversion/TritonGPUToLLVM/GatherOpToLLVM.cpp

Lines changed: 112 additions & 0 deletions
@@ -0,0 +1,112 @@
+#include "triton/Conversion/TritonGPUToLLVM/PatternTritonGPUOpToLLVM.h"
+#include "triton/Conversion/TritonGPUToLLVM/Utility.h"
+
+using namespace mlir;
+using namespace mlir::triton;
+
+namespace {
+class GatherOpConversion : public ConvertOpToLLVMPattern<GatherOp> {
+public:
+  GatherOpConversion(LLVMTypeConverter &typeConverter,
+                     const TargetInfoBase &targetInfo, PatternBenefit benefit)
+      : ConvertOpToLLVMPattern(typeConverter, benefit), targetInfo(targetInfo) {
+  }
+
+  LogicalResult
+  matchAndRewrite(GatherOp op, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override;
+
+private:
+  const TargetInfoBase &targetInfo;
+};
+
+LogicalResult
+GatherOpConversion::matchAndRewrite(GatherOp op, OpAdaptor adaptor,
+                                    ConversionPatternRewriter &rewriter) const {
+  Location loc = op.getLoc();
+  RankedTensorType srcType = op.getSrc().getType();
+
+  // Compute the src subtensor shape owned by this CTA.
+  SmallVector<unsigned> srcShapePerCTA =
+      convertType<unsigned>(triton::gpu::getShapePerCTA(srcType));
+
+  // Grab the src values in this thread.
+  SmallVector<Value> srcValues =
+      unpackLLElements(loc, adaptor.getSrc(), rewriter);
+
+  // Emit the indices of the src values owned by this thread.
+  SmallVector<SmallVector<Value>> srcIndices =
+      emitIndices(loc, rewriter, targetInfo, srcType.getEncoding(),
+                  op.getSrc().getType(), /*withCTAOffset=*/true);
+
+  // Store the src values owned by the thread into their respective location in
+  // the scratch memory.
+  assert(srcValues.size() == srcIndices.size());
+
+  // Get the base pointer to the scratch memory.
+  Value smemBase = LLVM::getSharedMemoryBase(loc, rewriter, targetInfo, op);
+
+  // For each src element owned by the thread, index into the scratch memory and
+  // then store it.
+  Type elemType = getTypeConverter()->convertType(srcType.getElementType());
+  for (auto [value, indices] : llvm::zip(srcValues, srcIndices)) {
+    // Convert the index at each dim into a single offset given the shape of the
+    // tensor.
+    Value offset = LLVM::linearize(rewriter, loc, indices, srcShapePerCTA);
+    // Emit the offset into the shared memory and then store the value.
+    Value ptr = gep(smemBase.getType(), elemType, smemBase, offset);
+    store(value, ptr);
+  }
+
+  // Synchronize the whole CTA.
+  barrier();
+
+  // Grab the index values owned by this thread.
+  SmallVector<Value> idxValues =
+      unpackLLElements(loc, adaptor.getIndices(), rewriter);
+
+  // Apply the layout of the destination tensor to obtain the indices of the
+  // column to gather along, then for each column, replace the index along the
+  // gather axis with the appropriate index value.
+  //
+  //   I = LL(pid)
+  //   idx = indices[I]
+  //   I_gather = [I[d] if d != axis else idx for d in range(len(I))]
+  //   out[I] = src[I_gather]
+  RankedTensorType dstType = op.getType();
+  SmallVector<SmallVector<Value>> dstIndices =
+      emitIndices(loc, rewriter, targetInfo, dstType.getEncoding(), dstType,
+                  /*withCTAOffset=*/true);
+
+  unsigned idxWidth = op.getIndices().getType().getElementTypeBitWidth();
+  unsigned axis = op.getAxis();
+  SmallVector<Value> results(dstIndices.size());
+  for (auto [i, idx, indices] : llvm::enumerate(idxValues, dstIndices)) {
+    // The LL index computations are performed with 32 bit integers. If the
+    // indices are something else, cast them to i32.
+    if (idxWidth > 32) {
+      idx = trunc(i32_ty, idx);
+    } else if (idxWidth < 32) {
+      // Negative indices don't make sense, so zero-extend.
+      idx = zext(i32_ty, idx);
+    }
+    indices[axis] = idx;
+    Value offset = LLVM::linearize(rewriter, loc, indices, srcShapePerCTA);
+    Value ptr = gep(smemBase.getType(), elemType, smemBase, offset);
+    results[i] = load(elemType, ptr);
+  }
+
+  Value packed =
+      packLLElements(loc, getTypeConverter(), results, rewriter, dstType);
+  rewriter.replaceOp(op, packed);
+  return success();
+}
+
+} // namespace
+
+void triton::populateGatherOpToLLVMPatterns(LLVMTypeConverter &typeConverter,
+                                            RewritePatternSet &patterns,
+                                            const TargetInfoBase &targetInfo,
+                                            PatternBenefit benefit) {
+  patterns.insert<GatherOpConversion>(typeConverter, targetInfo, benefit);
+}
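The pseudocode in the comment above (`out[I] = src[I_gather]`) can be stated as an executable reference model. This NumPy sketch illustrates the intended semantics and is not code from the commit:

    import numpy as np

    def gather_reference(src, indices, axis):
        # For every output coordinate I, replace the coordinate along `axis`
        # with indices[I] and read the source element at that position.
        out = np.empty(indices.shape, dtype=src.dtype)
        for I in np.ndindex(*indices.shape):
            I_gather = list(I)
            I_gather[axis] = indices[I]
            out[I] = src[tuple(I_gather)]
        return out

    src = np.arange(12).reshape(4, 3)
    idx = np.array([[3, 0, 1], [2, 2, 0]])
    # Matches torch.gather(src, 0, idx) on the same data.
    print(gather_reference(src, idx, axis=0))  # [[9 1 5] [6 7 2]]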

lib/Conversion/TritonToTritonGPU/TritonToTritonGPUPass.cpp

Lines changed: 1 addition & 0 deletions
@@ -537,6 +537,7 @@ void populateTritonPatterns(TritonGPUTypeConverter &typeConverter,
       GenericOpPattern<triton::MakeRangeOp>, TritonExpandDimsPattern,
       TritonTransPattern, TritonDotPattern, GenericOpPattern<triton::LoadOp>,
       GenericOpPattern<triton::StoreOp>, GenericOpPattern<triton::HistogramOp>,
+      GenericOpPattern<triton::GatherOp>,
       GenericOpPattern<triton::ExternElementwiseOp>,
       GenericOpPattern<triton::PrintOp>, GenericOpPattern<triton::AssertOp>,
       GenericOpPattern<triton::AtomicCASOp>,

lib/Dialect/Triton/IR/Ops.cpp

Lines changed: 48 additions & 0 deletions
@@ -1073,6 +1073,54 @@ Speculation::Speculatability ExternElementwiseOp::getSpeculatability() {
   return Speculation::NotSpeculatable;
 }
 
+// -- GatherOp --
+LogicalResult GatherOp::verify() {
+  RankedTensorType indicesTy = getIndices().getType();
+  RankedTensorType srcTy = getSrc().getType();
+  RankedTensorType resTy = getResult().getType();
+
+  if (indicesTy.getShape() != resTy.getShape()) {
+    return emitOpError("indices and output shapes must match");
+  }
+  if (indicesTy.getEncoding() != resTy.getEncoding()) {
+    return emitOpError("indices and output encodings must match");
+  }
+  if (srcTy.getElementType() != resTy.getElementType()) {
+    return emitOpError("input and output element types must match");
+  }
+  if (srcTy.getRank() != indicesTy.getRank()) {
+    return emitOpError("input and indices ranks must match");
+  }
+  if (getAxis() >= srcTy.getRank()) {
+    return emitOpError("gather dimension must be less than the input rank");
+  }
+  for (int dim = 0; dim < indicesTy.getRank(); ++dim) {
+    if (dim == getAxis())
+      continue;
+    if (indicesTy.getShape()[dim] != srcTy.getShape()[dim]) {
+      return emitOpError("indices dimension ")
+             << dim << " must match the corresponding input dimension";
+    }
+  }
+
+  return success();
+}
+
+LogicalResult GatherOp::inferReturnTypes(
+    MLIRContext *context, std::optional<Location> location, ValueRange operands,
+    DictionaryAttr attributes, OpaqueProperties properties, RegionRange regions,
+    SmallVectorImpl<Type> &inferredReturnTypes) {
+  GatherOpAdaptor adaptor(operands, attributes, properties, regions);
+  auto indicesType = cast<RankedTensorType>(adaptor.getIndices().getType());
+  auto srcType = cast<RankedTensorType>(adaptor.getSrc().getType());
+
+  // Shape and encoding of the indices with the element type of the src.
+  inferredReturnTypes.push_back(
+      RankedTensorType::get(indicesType.getShape(), srcType.getElementType(),
+                            indicesType.getEncoding()));
+  return success();
+}
+
 // -- ExperimentalTensormapCreateOp --
 LogicalResult ExperimentalTensormapCreateOp::verify() {
   auto rank = getBoxDim().size();
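The verifier's shape rules condense to a handful of checks; note that it requires exact equality on non-gather dimensions, which is slightly stricter than the "cannot be greater than" wording in the op's description. A sketch of the same rules (hypothetical helper, for illustration):

    def verify_gather(src_shape, idx_shape, axis):
        # Mirrors GatherOp::verify: equal ranks, axis in range, and every
        # non-gather dimension of the indices equal to the source's.
        if len(src_shape) != len(idx_shape):
            raise ValueError("input and indices ranks must match")
        if axis >= len(src_shape):
            raise ValueError("gather dimension must be less than the input rank")
        for d, (s, i) in enumerate(zip(src_shape, idx_shape)):
            if d != axis and i != s:
                raise ValueError(f"indices dimension {d} must match the "
                                 "corresponding input dimension")

    verify_gather((4, 3), (2, 3), axis=0)  # ok: only the gather axis differs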
