intel
diff --git a/‎include/triton/Analysis/Utility.h‎
Lines changed: 9 additions & 19 deletions b/‎include/triton/Analysis/Utility.h‎
Lines changed: 9 additions & 19 deletions
diff --git a/‎lib/Analysis/Allocation.cpp‎
Lines changed: 9 additions & 24 deletions b/‎lib/Analysis/Allocation.cpp‎
Lines changed: 9 additions & 24 deletions
diff --git a/‎lib/Analysis/AxisInfo.cpp‎
Lines changed: 9 additions & 13 deletions b/‎lib/Analysis/AxisInfo.cpp‎
Lines changed: 9 additions & 13 deletions
diff --git a/‎lib/Analysis/Utility.cpp‎
Lines changed: 61 additions & 33 deletions b/‎lib/Analysis/Utility.cpp‎
Lines changed: 61 additions & 33 deletions
diff --git a/‎lib/Conversion/TritonGPUToLLVM/ConvertLayoutOpToLLVM/SharedToDotOperandFMA.cpp‎
Lines changed: 1 addition & 0 deletions b/‎lib/Conversion/TritonGPUToLLVM/ConvertLayoutOpToLLVM/SharedToDotOperandFMA.cpp‎
Lines changed: 1 addition & 0 deletions
@@ -89,26 +89,14 @@ class ScanLoweringHelper {
   explicit ScanLoweringHelper(triton::ScanOp op) : scanOp(op) {
     auto firstTy = cast<RankedTensorType>(op.getOperands()[0].getType());
     srcShape = firstTy.getShape();
-    legacyEncoding = firstTy.getEncoding();
-    srcEncoding = triton::gpu::toLinearEncoding(legacyEncoding, srcShape);
+    srcEncoding = firstTy.getEncoding();
     srcElementTypes = op.getElementTypes();
-    // The codegen does not support different element/thread/warp order so
-    // we choose one a priori. We choose that of the blocked encoding.
-    // When we generalise this code to other layouts we'll probably need to
-    // get rid of all this logic and the *Stride auxiliary methods
-    // and replace them by transposes and reshapes on the LinearLayout
-    if (auto blockedEncoding =
-            dyn_cast<triton::gpu::BlockedEncodingAttr>(legacyEncoding)) {
-      order = llvm::to_vector(blockedEncoding.getOrder());
-    } else {
-      order = srcEncoding.getOrder();
-    }
 
     for (const auto &t : op.getInputTypes()) {
       if (t.getShape() != srcShape) {
         op.emitError() << "shape mismatch";
       }
-      if (t.getEncoding() != legacyEncoding) {
+      if (t.getEncoding() != srcEncoding) {
         op.emitError() << "encoding mismatch";
       }
     }
@@ -123,8 +111,12 @@ class ScanLoweringHelper {
   unsigned getNonAxisNumThreadsPerWarp();
   // Return the flat numbers of threads computing independent scan results.
   unsigned getNonAxisNumThreadsPerCTA();
+  // Return the number of warps per CTA along axis dim.
+  unsigned getAxisNumWarps();
   // Return the number of warps per CTA along axis dim with unique data.
   unsigned getAxisNumWarpsWithUniqueData();
+  // Return the number of threads per warp along axis dim.
+  unsigned getAxisNumThreadsPerWarp();
   // Return the number of threads per warp along axis dim with unique data.
   unsigned getAxisNumThreadsPerWarpWithUniqueData();
   // Return the number of blocks along axis dim.
@@ -147,20 +139,18 @@ class ScanLoweringHelper {
   Location getLoc() { return scanOp.getLoc(); }
   unsigned getAxis() { return scanOp.getAxis(); }
   bool getReverse() { return scanOp.getReverse(); }
-  triton::gpu::LinearEncodingAttr getEncoding() { return srcEncoding; }
+  triton::gpu::BlockedEncodingAttr getEncoding();
   llvm::ArrayRef<int64_t> getShape() { return srcShape; }
   unsigned getNumOperands() { return scanOp.getNumOperands(); }
   SmallVector<Type> getElementTypes() { return srcElementTypes; }
-  SmallVector<unsigned> getOrder() { return order; }
+  Attribute getSrcLayout() { return srcEncoding; }
   Region &getCombineOp();
 
 private:
   triton::ScanOp scanOp;
-  triton::gpu::LinearEncodingAttr srcEncoding;
-  Attribute legacyEncoding;
+  Attribute srcEncoding;
   llvm::ArrayRef<int64_t> srcShape;
   SmallVector<Type> srcElementTypes;
-  SmallVector<unsigned> order;
 };
 
 // Helper class for lowering `tt.gather` operations. This class shares lowering
 
@@ -28,8 +28,6 @@ namespace triton {
 
 // Bitwidth of pointers
 constexpr int kPtrBitWidth = 64;
-// Max shmem LDS/STS instruction in bits
-constexpr int kMaxShmemVecBitLength = 128;
 
 static SmallVector<unsigned> getRepShapeForCvt(RankedTensorType srcTy,
                                                RankedTensorType dstTy) {
@@ -81,17 +79,15 @@ std::pair<unsigned, unsigned>
 getScratchCvtInOutVecLengths(RankedTensorType srcTy, RankedTensorType dstTy) {
   Attribute srcLayout = srcTy.getEncoding();
   Attribute dstLayout = dstTy.getEncoding();
-
-  auto srcLinAttr = gpu::toLinearEncoding(srcLayout, srcTy.getShape());
-  auto dstLinAttr = gpu::toLinearEncoding(dstLayout, dstTy.getShape());
-  auto inOrd = srcLinAttr.getOrder();
-  auto outOrd = dstLinAttr.getOrder();
-
+  const auto &inOrd = gpu::getOrder(srcLayout);
+  const auto &outOrd = gpu::getOrder(dstLayout);
   unsigned rank = srcTy.getRank();
 
-  unsigned srcContigPerThread = srcLinAttr.getContigPerThread()[inOrd[0]];
-  unsigned dstContigPerThread = dstLinAttr.getContigPerThread()[outOrd[0]];
-  // TODO: Fix the legacy issue that outOrd[0] == 0 always means
+  unsigned srcContigPerThread =
+      gpu::getUniqueContigPerThread(srcLayout, srcTy.getShape())[inOrd[0]];
+  unsigned dstContigPerThread =
+      gpu::getUniqueContigPerThread(dstLayout, dstTy.getShape())[outOrd[0]];
+  // TODO: Fix the legacy issue that outOrd[0] == 0 always means
   //       that we cannot do vectorization.
   unsigned innerDim = rank - 1;
   unsigned inVec = outOrd[0] != innerDim  ? 1
@@ -121,7 +117,8 @@ ScratchConfig getScratchConfigForCvt(RankedTensorType srcTy,
   Attribute dstLayout = dstTy.getEncoding();
 
   assert(cvtNeedsSharedMemory(srcTy, dstTy));
-  auto outOrd = gpu::toLinearEncoding(dstLayout, dstTy.getShape()).getOrder();
+
+  const auto &outOrd = gpu::getOrder(dstLayout);
   scratchConfig.order = outOrd;
 
   std::tie(scratchConfig.inVec, scratchConfig.outVec) =
@@ -132,18 +129,6 @@ ScratchConfig getScratchConfigForCvt(RankedTensorType srcTy,
   unsigned contiguousShapeDim = scratchConfig.repShape[scratchConfig.order[0]];
   scratchConfig.inVec = std::min(scratchConfig.inVec, contiguousShapeDim);
   scratchConfig.outVec = std::min(scratchConfig.outVec, contiguousShapeDim);
-  // Clamp the vector length to kMaxShmemVecBitLength / element bitwidth as this
-  // is the max vectorisation
-  auto inBitWidth = isa<PointerType>(srcTy.getElementType())
-                        ? kPtrBitWidth
-                        : srcTy.getElementTypeBitWidth();
-  auto outBitWidth = isa<PointerType>(dstTy.getElementType())
-                         ? kPtrBitWidth
-                         : dstTy.getElementTypeBitWidth();
-  scratchConfig.inVec =
-      std::min(scratchConfig.inVec, kMaxShmemVecBitLength / inBitWidth);
-  scratchConfig.outVec =
-      std::min(scratchConfig.outVec, kMaxShmemVecBitLength / outBitWidth);
 
   // No padding is required if the tensor is 1-D, or if all dimensions except
   // the first accessed dimension have a size of 1.
 
@@ -1222,16 +1222,15 @@ unsigned ModuleAxisInfoAnalysis::getPtrContiguity(Value ptr) {
   auto tensorTy = dyn_cast<RankedTensorType>(ptr.getType());
   if (!tensorTy)
     return 1;
+  auto layout = tensorTy.getEncoding();
 
-  // FIXME: This is not as good as it could be, as we don't need to restrict
-  // the analysis to one dimension. We should determine contiguity on the
-  // flattenOuts() layout
-  auto linAttr =
-      gpu::toLinearEncoding(tensorTy.getEncoding(), tensorTy.getShape());
-  auto order = linAttr.getOrder();
+  // `order` is expected to list dimensions from most to least contiguous, so
+  // its first entry should be the dimension with the largest contiguity.
+  auto order = triton::gpu::getOrder(layout);
   unsigned align = getPtrAlignment(ptr);
 
-  auto uniqueContigPerThread = linAttr.getContigPerThread();
+  auto uniqueContigPerThread =
+      triton::gpu::getUniqueContigPerThread(layout, tensorTy.getShape());
   assert(order[0] < uniqueContigPerThread.size() &&
          "Unexpected uniqueContigPerThread size");
   unsigned contiguity = uniqueContigPerThread[order[0]];
@@ -1248,9 +1247,8 @@ unsigned ModuleAxisInfoAnalysis::getPtrAlignment(Value ptr) {
   auto *axisInfo = getAxisInfo(ptr);
   if (!axisInfo)
     return 1;
-  auto linAttr =
-      gpu::toLinearEncoding(tensorTy.getEncoding(), tensorTy.getShape());
-  auto order = linAttr.getOrder();
+  auto layout = tensorTy.getEncoding();
+  auto order = triton::gpu::getOrder(layout);
   auto maxMultipleBytes = axisInfo->getDivisibility(order[0]);
   auto maxContig = axisInfo->getContiguity(order[0]);
   auto elemNumBits = triton::getPointeeBitWidth(tensorTy);
@@ -1277,9 +1275,7 @@ unsigned ModuleAxisInfoAnalysis::getMaskAlignment(Value mask) {
   auto *axisInfo = getAxisInfo(mask);
   if (!axisInfo)
     return 1;
-  auto linAttr =
-      gpu::toLinearEncoding(tensorTy.getEncoding(), tensorTy.getShape());
-  auto maskOrder = linAttr.getOrder();
+  auto maskOrder = triton::gpu::getOrder(tensorTy.getEncoding());
   auto alignment = std::max<unsigned>(axisInfo->getConstancy(maskOrder[0]), 1);
   LDBG("getMaskAlignment maskOrder[0] " << maskOrder[0] << " alignment "
                                         << alignment);
 
@@ -23,18 +23,37 @@
 #include "triton/Tools/Sys/GetEnv.hpp"
 
 namespace mlir {
+namespace {
 
 using namespace triton;
 using namespace triton::gpu;
 
+int getParentAxis(Attribute layout, int axis) {
+  if (auto sliceEncoding = dyn_cast<SliceEncodingAttr>(layout)) {
+    axis = axis < sliceEncoding.getDim() ? axis : axis + 1;
+    return getParentAxis(sliceEncoding.getParent(), axis);
+  }
+  return axis;
+}
+
+SmallVector<unsigned> getParentOrder(Attribute layout) {
+  if (auto sliceEncoding = mlir::dyn_cast<SliceEncodingAttr>(layout)) {
+    return getParentOrder(sliceEncoding.getParent());
+  }
+  return getThreadOrder(layout);
+}
+
+} // namespace
+
 // TODO(jlebar): Move this class into namespace triton.
 bool ReduceOpHelper::isReductionOnLayoutFastAxis() {
-  auto linearEncoding = toLinearEncoding(getSrcLayout(), getSrcShape());
-  return linearEncoding.getOrder()[0] == axis;
+  return getParentAxis(getSrcLayout(), axis) ==
+         getParentOrder(getSrcLayout())[0];
 }
 
 SmallVector<unsigned> ReduceOpHelper::getOrderWithAxisAtBeginning() {
-  auto order = toLinearEncoding(getSrcLayout(), getSrcShape()).getOrder();
+  auto srcLayout = getSrcLayout();
+  auto order = getOrder(srcLayout);
   auto it = std::find(order.begin(), order.end(), axis);
   // delete the axis from order
   order.erase(it);
@@ -206,59 +225,69 @@ bool ReduceOpHelper::isSupportedLayout() {
 }
 
 unsigned ScanLoweringHelper::getAxisNumElementsPerThread() {
-  return getEncoding().getContigPerThread()[getAxis()];
+  return getEncoding().getSizePerThread()[getAxis()];
 }
 
 unsigned ScanLoweringHelper::getNonAxisNumElementsPerThread() {
-  auto contigPerThread = getEncoding().getContigPerThread();
-  contigPerThread[getAxis()] = 1;
-  return product<unsigned>(contigPerThread);
+  SmallVector<unsigned> sizePerThreads = getContigPerThread(getEncoding());
+  sizePerThreads[getAxis()] = 1;
+  return product<unsigned>(sizePerThreads);
 }
 
 Region &ScanLoweringHelper::getCombineOp() { return scanOp.getCombineOp(); }
 
+unsigned ScanLoweringHelper::getAxisNumThreadsPerWarp() {
+  return getThreadsPerWarp(getEncoding())[getAxis()];
+}
+
 unsigned ScanLoweringHelper::getAxisNumThreadsPerWarpWithUniqueData() {
-  return getEncoding().getThreadsPerWarp()[getAxis()];
+  return getThreadsPerWarpWithUniqueData(getEncoding(), getShape())[getAxis()];
 }
 
 unsigned ScanLoweringHelper::getNonAxisNumThreadsPerWarp() {
-  auto nThreads = product(getEncoding().getThreadsPerWarp());
-  return nThreads / getAxisNumThreadsPerWarpWithUniqueData();
+  auto threadsPerWarp = getThreadsPerWarp(getEncoding());
+  threadsPerWarp[getAxis()] = 1;
+  return product<unsigned>(threadsPerWarp);
 }
 
 // Return the flat numbers of threads computing independent scan results.
 unsigned ScanLoweringHelper::getNonAxisNumThreadsPerCTA() {
-  auto nWarps = product(getEncoding().getWarpsPerCTA());
-  return (nWarps / getAxisNumWarpsWithUniqueData()) *
-         getNonAxisNumThreadsPerWarp();
+  unsigned numParallelThreadsPerWarp = getNonAxisNumThreadsPerWarp();
+  auto warpsPerCTA = getWarpsPerCTA(getEncoding());
+  warpsPerCTA[getAxis()] = 1;
+  unsigned numParallelWarpsPerCTA = product<unsigned>(warpsPerCTA);
+  return numParallelThreadsPerWarp * numParallelWarpsPerCTA;
+}
+
+unsigned ScanLoweringHelper::getAxisNumWarps() {
+  return getWarpsPerCTA(getEncoding())[getAxis()];
 }
 
 unsigned ScanLoweringHelper::getAxisNumWarpsWithUniqueData() {
-  return getEncoding().getWarpsPerCTA()[getAxis()];
+  return getWarpsPerCTAWithUniqueData(getEncoding(), getShape())[getAxis()];
 }
 
 unsigned ScanLoweringHelper::getAxisNumBlocks() {
-  auto contigPerThread = getEncoding().getContigPerThread();
+  auto sizePerThreads = getSizePerThread(getEncoding());
   auto threadsPerWarp = getThreadsPerWarp(getEncoding());
   auto warpsPerCTA = getWarpsPerCTA(getEncoding());
   unsigned axis = getAxis();
   return ceil<unsigned>(
       getShape()[axis],
-      (contigPerThread[axis] * threadsPerWarp[axis] * warpsPerCTA[axis]));
+      (sizePerThreads[axis] * threadsPerWarp[axis] * warpsPerCTA[axis]));
 }
 
 unsigned ScanLoweringHelper::getNonAxisNumBlocks() {
-  auto contigPerThread = getEncoding().getContigPerThread();
+  auto sizePerThreads = getSizePerThread(getEncoding());
   auto threadsPerWarp = getThreadsPerWarp(getEncoding());
   auto warpsPerCTA = getWarpsPerCTA(getEncoding());
-  auto rank = contigPerThread.size();
   unsigned axis = getAxis();
   unsigned numBlocks = 1;
-  for (unsigned i = 0; i < rank; i++) {
+  for (unsigned i = 0; i < sizePerThreads.size(); i++) {
     if (i == axis)
       continue;
     numBlocks *=
-        ceil<unsigned>(getShape()[i], (contigPerThread[i] * threadsPerWarp[i] *
+        ceil<unsigned>(getShape()[i], (sizePerThreads[i] * threadsPerWarp[i] *
                                        warpsPerCTA[i]));
   }
   return numBlocks;
@@ -267,7 +296,7 @@ unsigned ScanLoweringHelper::getNonAxisNumBlocks() {
 bool ScanLoweringHelper::isSupported() {
   // TODO: Support the following cases:
   // 1. Scan on non-blocked encodings
-  if (!isa<BlockedEncodingAttr>(legacyEncoding))
+  if (!isa<BlockedEncodingAttr>(srcEncoding))
     return false;
   return true;
 }
@@ -555,43 +584,42 @@ getReshapeDecomposition(ArrayRef<int64_t> srcShape,
   return ret;
 }
 
+BlockedEncodingAttr ScanLoweringHelper::getEncoding() {
+  return cast<BlockedEncodingAttr>(srcEncoding);
+}
+
 unsigned ScanLoweringHelper::getAxisElementStride() {
-  auto order = getOrder();
+  auto order = getOrder(getEncoding());
   unsigned stride = 1;
   for (unsigned dim : order) {
     if (dim == getAxis())
       return stride;
-    stride *= getEncoding().getContigPerThread()[dim];
+    stride *= getContigPerThread(getEncoding())[dim];
   }
   llvm_unreachable("Axis not found in order");
 }
 
 unsigned ScanLoweringHelper::getAxisThreadStride() {
-  auto encoding = getEncoding();
-  auto kThread = StringAttr::get(encoding.getContext(), "lane");
-  // OOOGHHH This is nasty. We should implement this lowering via LLs natively
-  // to avoid this
-  auto threadsPerWarp = encoding.basesPerDim(kThread, /*skipBroadcast=*/false);
-  auto order = getOrder();
+  auto order = getOrder(getEncoding());
   unsigned stride = 1;
   for (unsigned dim : order) {
     if (dim == getAxis())
       return stride;
-    stride *= threadsPerWarp[dim];
+    stride *= getEncoding().getThreadsPerWarp()[dim];
   }
   llvm_unreachable("Axis not found in order");
 }
 
 unsigned ScanLoweringHelper::getAxisBlockStride() {
-  auto order = getOrder();
+  auto order = getOrder(getEncoding());
   unsigned stride = 1;
-  auto contigPerThread = getEncoding().getContigPerThread();
+  auto sizePerThreads = getSizePerThread(getEncoding());
   auto threadsPerWarp = getThreadsPerWarp(getEncoding());
   auto warpsPerCTA = getWarpsPerCTA(getEncoding());
   for (unsigned dim : order) {
     if (dim == getAxis())
       return stride;
-    stride *= ceil<unsigned int>(getShape()[dim], contigPerThread[dim] *
+    stride *= ceil<unsigned int>(getShape()[dim], sizePerThreads[dim] *
                                                       threadsPerWarp[dim] *
                                                       warpsPerCTA[dim]);
   }
 
@@ -9,6 +9,7 @@ using ::mlir::LLVM::linearize;
 using ::mlir::triton::gpu::DotOperandEncodingAttr;
 using ::mlir::triton::gpu::expandMatrixOrderWithBatch;
 using ::mlir::triton::gpu::expandMatrixShapeWithBatch;
+using ::mlir::triton::gpu::getContigPerThread;
 using ::mlir::triton::gpu::getOrder;
 using ::mlir::triton::gpu::getShapePerCTA;
 using ::mlir::triton::gpu::getSizePerThread;