Skip to content

Commit 8f30fe0

Browse files
Revert "[LAYOUTS] Kill getWarpsPerCTA(Attribute) and prefer LinearLayout-based impl (#6252)"
This reverts commit 593a1b5.
1 parent 1e98c47 commit 8f30fe0

File tree

11 files changed

+264
-43
lines changed

11 files changed

+264
-43
lines changed

include/triton/Dialect/TritonGPU/IR/Dialect.h

Lines changed: 16 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -99,16 +99,14 @@ unsigned getTotalElemsPerThread(Attribute layout, ArrayRef<int64_t> shape);
9999

100100
SmallVector<unsigned> getElemsPerThread(Type type);
101101

102-
// Returns the number of warps per CTA that have access to non-replicated
103-
// elements of the tensor. E.g. for a blocked layout with sizePerThread = [1,
104-
// 1], threadsPerWarp = [2, 16], warpsPerCTA = [1, 4] and tensor shape = [2, 2],
105-
// returns [1, 1], since the first warp has access to the full tensor, whereas
106-
// the other warps have access to replicated elements.
107-
SmallVector<unsigned> getWarpsPerCTA(Attribute layout,
108-
ArrayRef<int64_t> tensorShape);
109-
inline SmallVector<unsigned> getWarpsPerCTA(RankedTensorType type) {
110-
return getWarpsPerCTA(type.getEncoding(), type.getShape());
111-
}
102+
// Returns the number of threads per warp that may have access to replicated
103+
// elements. If you want non-replicated threads, use
104+
// getThreadsPerWarpWithUniqueData.
105+
SmallVector<unsigned> getThreadsPerWarp(Attribute layout);
106+
107+
// Returns the number of warps per CTA that may have access to replicated
108+
// elements. If you want non-replicated warps, use getWarpsPerCTAWithUniqueData.
109+
SmallVector<unsigned> getWarpsPerCTA(Attribute layout);
112110

113111
// Returns the number of contiguous elements of the logical tensor that each
114112
// thread has access to, on each dimension of the tensor. For a blocked layout
@@ -127,6 +125,14 @@ SmallVector<unsigned>
127125
getThreadsPerWarpWithUniqueData(Attribute layout,
128126
ArrayRef<int64_t> tensorShape);
129127

128+
// Returns the number of warps per CTA that have access to non-replicated
129+
// elements of the tensor. E.g. for a blocked layout with sizePerThread = [1,
130+
// 1], threadsPerWarp = [2, 16], warpsPerCTA = [1, 4] and tensor shape = [2, 2],
131+
// returns [1, 1], since the first warp has access to the full tensor, whereas
132+
// the other warps have access to replicated elements.
133+
SmallVector<unsigned>
134+
getWarpsPerCTAWithUniqueData(Attribute layout, ArrayRef<int64_t> tensorShape);
135+
130136
// Returns the dimensions of the tensor from minor (fast-varying) to
131137
// major (slow-varying). For distributed layouts, this represents
132138
// the order of the elements within a thread.

include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td

Lines changed: 16 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -597,6 +597,15 @@ We call each individual tile "rep".
597597
/*defaultImplementation=*/[{
598598
return toLinearEncoding($_self, shape).getElemsPerThread(shape);
599599
}]>,
600+
// Interface for the meta information about the multiple thread hierarchy.
601+
InterfaceMethod<"Get the shape of the warps per CTA.",
602+
"SmallVector<unsigned>",
603+
"getWarpsPerCTA">,
604+
605+
606+
InterfaceMethod<"Get the shape of the threads per warp",
607+
"SmallVector<unsigned>",
608+
"getThreadsPerWarp">,
600609
InterfaceMethod<"Convert to LinearLayout.",
601610
"LinearLayout",
602611
"toLinearLayout",
@@ -662,6 +671,8 @@ L(T) = [ {0,8} , {1,9} , {2,10}, {3,11}, {0,8} , {1, 9} , {2, 10}, {3, 11},
662671
SmallVector<unsigned> getCTAsPerCGA() const;
663672
SmallVector<unsigned> getCTAOrder() const;
664673
SmallVector<unsigned> getCTASplitNum() const;
674+
SmallVector<unsigned> getWarpsPerCTA() const;
675+
SmallVector<unsigned> getThreadsPerWarp() const;
665676

666677
LinearLayout toLinearLayout(ArrayRef<int64_t> shape) const;
667678

@@ -714,8 +725,6 @@ def LinearEncodingAttr : DistributedEncoding<"LinearEncoding", "linear_encoding"
714725
// If skipBroadcast is false, we count a base zero
715726
SmallVector<unsigned> basesPerDim(StringAttr dimName,
716727
bool skipBroadcast = true) const;
717-
SmallVector<unsigned> getThreadsPerWarp() const;
718-
SmallVector<unsigned> getWarpsPerCTA() const;
719728

720729
// [FIXME LL] Supports legacy behaviour. We should remove these functions
721730
SmallVector<unsigned> getShapePerCTATile() const;
@@ -825,8 +834,8 @@ for
825834
let parameters = (
826835
ins
827836
ArrayRefParameter<"unsigned">:$sizePerThread,
828-
ArrayRefParameter<"unsigned">:$threadsPerWarp,
829-
ArrayRefParameter<"unsigned">:$warpsPerCTA,
837+
ArrayRefParameter<"unsigned">:$threadsPerWarp__,
838+
ArrayRefParameter<"unsigned">:$warpsPerCTA__,
830839
ArrayRefParameter<"unsigned">:$order, // the fastest-changing axis first
831840

832841
// CTALayout is optional in the textual IR. If omitted, we infer it to be a
@@ -1030,7 +1039,7 @@ V [ 0,4,8...60 1,5...61 2,6...62 3,7...63 ] [ 128,132...188 129,
10301039
ins
10311040
"unsigned": $versionMajor,
10321041
"unsigned": $versionMinor,
1033-
ArrayRefParameter<"unsigned">:$warpsPerCTA,
1042+
ArrayRefParameter<"unsigned">:$warpsPerCTA__,
10341043
"unsigned":$MDim,
10351044
"unsigned":$NDim,
10361045
"bool":$isTransposed,
@@ -1151,7 +1160,7 @@ Row |
11511160
ins
11521161
"unsigned": $version,
11531162
"bool":$isTransposed,
1154-
ArrayRefParameter<"unsigned">:$warpsPerCTA,
1163+
ArrayRefParameter<"unsigned">:$warpsPerCTA__,
11551164
"CTALayoutAttr":$CTALayout
11561165
);
11571166

@@ -1257,7 +1266,7 @@ For example, the matrix L corresponding to blockTileSize=[32,16] is:
12571266
ins
12581267
"unsigned":$versionMajor,
12591268
"unsigned":$versionMinor,
1260-
ArrayRefParameter<"unsigned">:$warpsPerCTA,
1269+
ArrayRefParameter<"unsigned">:$warpsPerCTA__,
12611270
"CTALayoutAttr":$CTALayout,
12621271
ArrayRefParameter<"unsigned">:$instrShape
12631272
);

lib/Analysis/Utility.cpp

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,7 @@ bool shouldUseDistSmem(Attribute srcLayout, Attribute dstLayout) {
100100
}
101101

102102
unsigned ReduceOpHelper::getInterWarpSizeWithUniqueData() {
103-
return getWarpsPerCTA(srcEncoding, srcShape)[axis];
103+
return getWarpsPerCTAWithUniqueData(srcEncoding, srcShape)[axis];
104104
}
105105

106106
unsigned ReduceOpHelper::getIntraWarpSizeWithUniqueData() {
@@ -113,7 +113,7 @@ bool ReduceOpHelper::isWarpSynchronous() {
113113
// in order to remove this change.
114114
if (!srcEncoding)
115115
return true;
116-
return getWarpsPerCTA(srcEncoding, srcShape)[axis] == 1;
116+
return getWarpsPerCTAWithUniqueData(srcEncoding, srcShape)[axis] == 1;
117117
}
118118

119119
SmallVector<unsigned> ReduceOpHelper::getScratchRepShape() {
@@ -180,8 +180,8 @@ unsigned ScanLoweringHelper::getAxisNumWarpsWithUniqueData() {
180180

181181
unsigned ScanLoweringHelper::getAxisNumBlocks() {
182182
auto contigPerThread = getEncoding().getContigPerThread();
183-
auto threadsPerWarp = getEncoding().getThreadsPerWarp();
184-
auto warpsPerCTA = getEncoding().getWarpsPerCTA();
183+
auto threadsPerWarp = getThreadsPerWarp(getEncoding());
184+
auto warpsPerCTA = getWarpsPerCTA(getEncoding());
185185
unsigned axis = getAxis();
186186
return ceil<unsigned>(
187187
getShape()[axis],
@@ -190,8 +190,8 @@ unsigned ScanLoweringHelper::getAxisNumBlocks() {
190190

191191
unsigned ScanLoweringHelper::getNonAxisNumBlocks() {
192192
auto contigPerThread = getEncoding().getContigPerThread();
193-
auto threadsPerWarp = getEncoding().getThreadsPerWarp();
194-
auto warpsPerCTA = getEncoding().getWarpsPerCTA();
193+
auto threadsPerWarp = getThreadsPerWarp(getEncoding());
194+
auto warpsPerCTA = getWarpsPerCTA(getEncoding());
195195
auto rank = contigPerThread.size();
196196
unsigned axis = getAxis();
197197
unsigned numBlocks = 1;
@@ -527,8 +527,8 @@ unsigned ScanLoweringHelper::getAxisBlockStride() {
527527
auto order = getOrder();
528528
unsigned stride = 1;
529529
auto contigPerThread = getEncoding().getContigPerThread();
530-
auto threadsPerWarp = getEncoding().getThreadsPerWarp();
531-
auto warpsPerCTA = getEncoding().getWarpsPerCTA();
530+
auto threadsPerWarp = getThreadsPerWarp(getEncoding());
531+
auto warpsPerCTA = getWarpsPerCTA(getEncoding());
532532
for (unsigned dim : order) {
533533
if (dim == getAxis())
534534
return stride;

lib/Conversion/TritonGPUToLLVM/HistogramOpToLLVM.cpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -88,8 +88,9 @@ static SmallVector<Value> computeCrossWarpHistogram(
8888
Value threadId, int numWarps) {
8989
auto b = TritonLLVMOpBuilder(loc, rewriter);
9090
SmallVector<Value> histogramValues;
91-
unsigned numWarpsWithUniqueData = mlir::triton::gpu::getWarpsPerCTA(
92-
srcType.getEncoding(), srcType.getShape())[0];
91+
unsigned numWarpsWithUniqueData =
92+
mlir::triton::gpu::getWarpsPerCTAWithUniqueData(srcType.getEncoding(),
93+
srcType.getShape())[0];
9394
Value laneId = b.and_(threadId, b.i32_val(numThreadPerWarp - 1));
9495
// Initialize the shared memory with zeros.
9596
int64_t numElementPerThread =

lib/Conversion/TritonGPUToLLVM/ScanOpToLLVM.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -392,8 +392,8 @@ ScanOpConversion::getDelinearizedIds(ConversionPatternRewriter &rewriter,
392392
unsigned axis = helper.getAxis();
393393
auto srcEncoding = helper.getEncoding();
394394

395-
auto threadsPerWarp = srcEncoding.getThreadsPerWarp();
396-
auto warpsPerCTA = srcEncoding.getWarpsPerCTA();
395+
auto threadsPerWarp = triton::gpu::getThreadsPerWarp(srcEncoding);
396+
auto warpsPerCTA = triton::gpu::getWarpsPerCTA(srcEncoding);
397397
auto [multiDimLaneId, isRepresentativeLane] =
398398
getMultiDimLaneId(rewriter, helper, laneId);
399399
auto [multiDimWarpId, isRepresentativeWarp] =

lib/Conversion/TritonToTritonGPU/TritonToTritonGPUPass.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -155,7 +155,7 @@ struct TritonExpandDimsPattern
155155
retSizePerThread.insert(retSizePerThread.begin() + op.getAxis(), 1);
156156
auto retThreadsPerWarp = argEncoding.getThreadsPerWarp();
157157
retThreadsPerWarp.insert(retThreadsPerWarp.begin() + op.getAxis(), 1);
158-
auto retWarpsPerCTA = to_vector(argEncoding.getWarpsPerCTA());
158+
auto retWarpsPerCTA = argEncoding.getWarpsPerCTA();
159159
retWarpsPerCTA.insert(retWarpsPerCTA.begin() + op.getAxis(), 1);
160160
SmallVector<unsigned, 4> retOrder(retShape.size());
161161
std::iota(retOrder.begin(), retOrder.end(), 0);

0 commit comments

Comments (0)