Skip to content

Commit 9ca8bd3

Browse files
authored
[LAYOUTS] Get warp number and thread number from Module (#6068)
As per title
1 parent c2fd8e1 commit 9ca8bd3

File tree

13 files changed

+48
-71
lines changed

13 files changed

+48
-71
lines changed

include/triton/Dialect/TritonGPU/IR/Dialect.h

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,10 @@ int lookupNumWarps(Operation *op);
5151
// verifiers.
5252
std::optional<int> maybeLookupNumWarps(Operation *op);
5353

54+
// FIXME: Make this API and that of maybeLookupNumWarps consistent!
55+
// Utility to find the number of threads per warp
56+
int lookupThreadsPerWarp(OpBuilder &rewriter);
57+
5458
class LinearLayoutCache {
5559
public:
5660
std::optional<LinearLayout> get(const CacheKey &key) {
@@ -97,8 +101,6 @@ SmallVector<unsigned> getElemsPerThread(Type type);
97101
// getThreadsPerWarpWithUniqueData.
98102
SmallVector<unsigned> getThreadsPerWarp(Attribute layout);
99103

100-
unsigned getWarpSize(Attribute layout);
101-
102104
// Returns the number of warps per CTA that may have access to replicated
103105
// elements. If you want non-replicated warps, use getWarpsPerCTAWithUniqueData.
104106
SmallVector<unsigned> getWarpsPerCTA(Attribute layout);
@@ -196,8 +198,6 @@ SmallVector<int64_t> getAllocationShapePerCTA(Attribute layout,
196198
ArrayRef<int64_t> shape);
197199
SmallVector<int64_t> getAllocationShapePerCTA(Type type);
198200

199-
unsigned getNumWarpsPerCTA(Attribute layout);
200-
201201
unsigned getNumCTAs(Attribute layout);
202202

203203
// Return the order that represents that the batch is in row-major or

lib/Conversion/TritonGPUToLLVM/Utility.cpp

Lines changed: 2 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -194,26 +194,17 @@ Value getThreadId(OpBuilder &rewriter, Location loc) {
194194
return tid;
195195
}
196196

197-
static int lookupThreadsPerWarp(OpBuilder &rewriter) {
198-
assert(rewriter.getInsertionBlock() && "expected an insertion point");
199-
Operation *op = rewriter.getInsertionBlock()->getParentOp();
200-
while (op && !isa<ModuleOp>(op))
201-
op = op->getParentOp();
202-
assert(op && "cannot create thread ID outside of module");
203-
return triton::gpu::TritonGPUDialect::getThreadsPerWarp(cast<ModuleOp>(op));
204-
}
205-
206197
Value getLaneId(OpBuilder &rewriter, Location loc) {
207198
TritonLLVMOpBuilder b(loc, rewriter);
208199
Value tid = getThreadId(rewriter, loc);
209-
int threadsPerWarp = lookupThreadsPerWarp(rewriter);
200+
int threadsPerWarp = triton::gpu::lookupThreadsPerWarp(rewriter);
210201
return b.urem(tid, b.i32_val(threadsPerWarp));
211202
}
212203

213204
std::pair<Value, Value> getLaneAndWarpId(OpBuilder &rewriter, Location loc) {
214205
TritonLLVMOpBuilder b(loc, rewriter);
215206
Value tid = getThreadId(rewriter, loc);
216-
int threadsPerWarp = lookupThreadsPerWarp(rewriter);
207+
int threadsPerWarp = triton::gpu::lookupThreadsPerWarp(rewriter);
217208
Value warpSizeVal = b.i32_val(threadsPerWarp);
218209

219210
Value laneId = b.urem(tid, warpSizeVal);

lib/Dialect/TritonGPU/IR/Dialect.cpp

Lines changed: 9 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -75,15 +75,6 @@ SmallVector<unsigned> getThreadsPerWarp(Attribute layout) {
7575
}
7676
}
7777

78-
unsigned getWarpSize(Attribute layout) {
79-
unsigned size = 1;
80-
auto threadsPerWarp = getThreadsPerWarp(layout);
81-
for (auto e : threadsPerWarp) {
82-
size *= e;
83-
}
84-
return size;
85-
}
86-
8778
SmallVector<unsigned>
8879
getThreadsPerWarpWithUniqueData(Attribute layout,
8980
ArrayRef<int64_t> tensorShape) {
@@ -377,28 +368,6 @@ SmallVector<int64_t> getAllocationShapePerCTA(Type type) {
377368
tensorType.getShape());
378369
}
379370

380-
unsigned getNumWarpsPerCTA(Attribute layout) {
381-
SmallVector<unsigned> warpsPerCTA;
382-
if (auto blockedLayout = dyn_cast<BlockedEncodingAttr>(layout))
383-
warpsPerCTA = blockedLayout.getWarpsPerCTA();
384-
else if (auto sliceLayout = dyn_cast<SliceEncodingAttr>(layout))
385-
return getNumWarpsPerCTA(sliceLayout.getParent());
386-
else if (auto mmaLayout = dyn_cast<MmaEncodingTrait>(layout)) {
387-
// Use the distributed layout interface to get the number of warps per
388-
// CTA.
389-
auto distributedLayout = cast<DistributedEncodingTrait>(layout);
390-
warpsPerCTA = distributedLayout.getWarpsPerCTA();
391-
} else if (auto mfmaLayout = dyn_cast<AMDMfmaEncodingAttr>(layout))
392-
warpsPerCTA = mfmaLayout.getWarpsPerCTA();
393-
else if (auto wmmaLayout = dyn_cast<AMDWmmaEncodingAttr>(layout))
394-
warpsPerCTA = wmmaLayout.getWarpsPerCTA();
395-
else if (auto dotLayout = dyn_cast<DotOperandEncodingAttr>(layout))
396-
warpsPerCTA = dotLayout.getWarpsPerCTA();
397-
else
398-
llvm::report_fatal_error("Unimplemented usage of getNumWarpsPerCTA");
399-
return product<unsigned>(warpsPerCTA);
400-
}
401-
402371
unsigned getNumCTAs(Attribute layout) {
403372
return product<unsigned>(getCTAsPerCGA(layout));
404373
}
@@ -3496,3 +3465,12 @@ int triton::gpu::lookupNumWarps(Operation *op) {
34963465
}
34973466
return *numWarps;
34983467
}
3468+
3469+
int triton::gpu::lookupThreadsPerWarp(OpBuilder &rewriter) {
3470+
assert(rewriter.getInsertionBlock() && "expected an insertion point");
3471+
Operation *op = rewriter.getInsertionBlock()->getParentOp();
3472+
while (op && !isa<ModuleOp>(op))
3473+
op = op->getParentOp();
3474+
assert(op && "cannot create thread ID outside of module");
3475+
return triton::gpu::TritonGPUDialect::getThreadsPerWarp(cast<ModuleOp>(op));
3476+
}

lib/Dialect/TritonNvidiaGPU/Transforms/PlanCTA.cpp

Lines changed: 18 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -59,18 +59,18 @@ Type replaceLayout(const Type &type, const Attribute &newLayout) {
5959

6060
ttg::DistributedEncodingTrait
6161
replaceCTALayout(ttg::DistributedEncodingTrait layout,
62-
llvm::ArrayRef<int64_t> shape,
62+
llvm::ArrayRef<int64_t> shape, int numWarps,
6363
const ttg::CTALayoutAttr &newCTALayout) {
6464
if (auto blockedLayout = mlir::dyn_cast<ttg::BlockedEncodingAttr>(layout)) {
6565
return ttg::BlockedEncodingAttr::get(
6666
layout.getContext(), shape, blockedLayout.getSizePerThread(),
67-
blockedLayout.getOrder(), ttg::getNumWarpsPerCTA(layout), 32,
68-
newCTALayout);
67+
blockedLayout.getOrder(), numWarps, 32, newCTALayout);
6968
} else if (auto sliceLayout =
7069
mlir::dyn_cast<ttg::SliceEncodingAttr>(layout)) {
7170
return ttg::SliceEncodingAttr::get(
7271
layout.getContext(), sliceLayout.getDim(),
73-
replaceCTALayout(sliceLayout.getParent(), shape, newCTALayout));
72+
replaceCTALayout(sliceLayout.getParent(), shape, numWarps,
73+
newCTALayout));
7474
} else {
7575
// Other layouts are generated by passes after PlanCTAPass
7676
llvm::report_fatal_error("replaceCTALayout not implemented");
@@ -293,11 +293,15 @@ bool CTAPlanner::processDot(triton::FuncOp &funcOp) {
293293
// FIXME: Should consider IR with more than one DotOps
294294
setTiling({splitM, splitN, 1});
295295

296+
OpBuilder builder(dot);
297+
auto numThreads = ttg::lookupThreadsPerWarp(builder);
298+
auto numWarps = ttg::lookupNumWarps(dot);
299+
296300
auto newCTALayout = ttg::CTALayoutAttr::get(ctx, {splitM, splitN},
297301
{splitM, splitN}, {1, 0});
298302
auto newDLayout = ttg::BlockedEncodingAttr::get(
299303
ctx, dTy.getShape(), dLayout.getSizePerThread(), dLayout.getOrder(),
300-
ttg::getNumWarpsPerCTA(dLayout), 32, newCTALayout);
304+
numWarps, numThreads, newCTALayout);
301305
auto newALayout = ttg::DotOperandEncodingAttr::get(ctx, aLayout.getOpIdx(),
302306
newDLayout, 0);
303307
auto newBLayout = ttg::DotOperandEncodingAttr::get(ctx, bLayout.getOpIdx(),
@@ -359,12 +363,14 @@ bool CTAPlanner::processReduce(triton::FuncOp &funcOp) {
359363
if (remainingCTAs > 0)
360364
CTAsPerCGA[order[rank - 1]] *= remainingCTAs;
361365

366+
auto numWarps = ttg::lookupNumWarps(reduce);
362367
auto CTALayout =
363368
ttg::CTALayoutAttr::get(context, CTAsPerCGA, CTASplitNum, CTAOrder);
364369
if (!tiled)
365370
setTiling(CTALayout.getCTAsPerCGA());
366-
auto newSrcLayout = replaceCTALayout(
367-
cast<ttg::DistributedEncodingTrait>(srcLayout), srcShape, CTALayout);
371+
auto newSrcLayout =
372+
replaceCTALayout(cast<ttg::DistributedEncodingTrait>(srcLayout),
373+
srcShape, numWarps, CTALayout);
368374
auto newResultLayout =
369375
ttg::SliceEncodingAttr::get(context, axis, newSrcLayout);
370376
unsigned numOperands = reduce.getNumOperands();
@@ -386,6 +392,7 @@ void CTAPlanner::processStoreLikeOps(triton::FuncOp &funcOp) {
386392
stores.push_back(op);
387393
});
388394
assert(stores.size() > 0 && "Cannot find store-like ops");
395+
auto numWarps = ttg::lookupNumWarps(funcOp);
389396

390397
ttg::CTALayoutAttr CTALayout;
391398
for (Operation *store : stores) {
@@ -398,7 +405,7 @@ void CTAPlanner::processStoreLikeOps(triton::FuncOp &funcOp) {
398405
}
399406
auto newLayout = replaceCTALayout(
400407
cast<ttg::DistributedEncodingTrait>(tensorTy.getEncoding()),
401-
tensorTy.getShape(), CTALayout);
408+
tensorTy.getShape(), numWarps, CTALayout);
402409
processElementwise(store, newLayout);
403410
}
404411
}
@@ -624,6 +631,7 @@ bool CTAPlanner::processLoadStore(Operation *op, Attribute layout) {
624631
}
625632

626633
auto CTALayout = ttg::getCTALayout(layout);
634+
auto numWarps = ttg::lookupNumWarps(op);
627635

628636
llvm::SmallVector<Attribute> newOperandLayouts;
629637
for (unsigned i = 0; i < op->getNumOperands(); ++i) {
@@ -634,7 +642,7 @@ bool CTAPlanner::processLoadStore(Operation *op, Attribute layout) {
634642
auto oldLayout =
635643
cast<ttg::DistributedEncodingTrait>(tensorTy.getEncoding());
636644
auto newLayout =
637-
replaceCTALayout(oldLayout, tensorTy.getShape(), CTALayout);
645+
replaceCTALayout(oldLayout, tensorTy.getShape(), numWarps, CTALayout);
638646
newOperandLayouts.push_back(newLayout);
639647
}
640648

@@ -647,7 +655,7 @@ bool CTAPlanner::processLoadStore(Operation *op, Attribute layout) {
647655
auto oldLayout =
648656
cast<ttg::DistributedEncodingTrait>(tensorTy.getEncoding());
649657
auto newLayout =
650-
replaceCTALayout(oldLayout, tensorTy.getShape(), CTALayout);
658+
replaceCTALayout(oldLayout, tensorTy.getShape(), numWarps, CTALayout);
651659
newResultLayouts.push_back(newLayout);
652660
}
653661

test/TritonGPU/amd/mfma-double-rate.mlir

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
// CHECK-LABEL:mfma_16x16x32_f16
44

55
#mma = #ttg.amd_mfma<{versionMajor = 4, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 16], isTransposed = false}>
6-
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "tttg.threads-per-warp" = 64 : i32} {
6+
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 64 : i32} {
77
tt.func public @mfma_16x16x32_f16(%arg0: tensor<16x32xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 8}>>,
88
%arg1: tensor<32x16xf16, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 8}>>) {
99
%cst = arith.constant dense<0.000000e+00> : tensor<16x16xf32, #mma>
@@ -18,7 +18,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "tttg.th
1818
// CHECK-LABEL:mfma_16x16x32_bf16
1919

2020
#mma = #ttg.amd_mfma<{versionMajor = 4, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 16], isTransposed = false}>
21-
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "tttg.threads-per-warp" = 64 : i32} {
21+
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 64 : i32} {
2222
tt.func public @mfma_16x16x32_bf16(%arg0: tensor<16x32xbf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 8}>>,
2323
%arg1: tensor<32x16xbf16, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 8}>>) {
2424
%cst = arith.constant dense<0.000000e+00> : tensor<16x16xf32, #mma>
@@ -33,7 +33,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "tttg.th
3333
// CHECK-LABEL:mfma_32x32x16_f16
3434

3535
#mma = #ttg.amd_mfma<{versionMajor = 4, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [32, 32], isTransposed = false}>
36-
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "tttg.threads-per-warp" = 64 : i32} {
36+
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 64 : i32} {
3737
tt.func public @mfma_32x32x16_f16(%arg0: tensor<32x16xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 8}>>,
3838
%arg1: tensor<16x32xf16, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 8}>>) {
3939
%cst = arith.constant dense<0.000000e+00> : tensor<32x32xf32, #mma>
@@ -49,7 +49,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "tttg.th
4949
// CHECK-LABEL:mfma_32x32x16_bf16
5050

5151
#mma = #ttg.amd_mfma<{versionMajor = 4, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [32, 32], isTransposed = false}>
52-
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "tttg.threads-per-warp" = 64 : i32} {
52+
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 64 : i32} {
5353
tt.func public @mfma_32x32x16_bf16(%arg0: tensor<32x16xbf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 8}>>,
5454
%arg1: tensor<16x32xbf16, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 8}>>) {
5555
%cst = arith.constant dense<0.000000e+00> : tensor<32x32xf32, #mma>

third_party/amd/lib/TritonAMDGPUToLLVM/ConvertLayoutOpToLLVM.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@ struct ConvertLayoutOpMFMAToDotOpConversion
5757
auto mfmaLayout = dyn_cast<AMDMfmaEncodingAttr>(srcType.getEncoding());
5858
assert((mfmaLayout.getMDim() == 16 || mfmaLayout.getMDim() == 32) &&
5959
"Expected MFMA size 16 or 32");
60-
assert(triton::gpu::getWarpSize(mfmaLayout) == 64 &&
60+
assert(triton::gpu::lookupThreadsPerWarp(rewriter) == 64 &&
6161
"Expected warp size 64 for MFMA");
6262

6363
auto elemTy = int_ty(8);

third_party/amd/lib/TritonAMDGPUToLLVM/ConvertLayoutOpToLLVM/SharedToDotOperandMFMA.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -252,7 +252,7 @@ Value convertLayout(int opIdx, ConversionPatternRewriter &rewriter,
252252
numRepK = numReps[kDimIdx + 1];
253253
}
254254

255-
unsigned iWarpSize = triton::gpu::getWarpSize(mfmaLayout);
255+
unsigned iWarpSize = triton::gpu::lookupThreadsPerWarp(rewriter);
256256
assert(iWarpSize == 64);
257257
Value warpSize = tb.i32_val(iWarpSize);
258258
Value linearWarpId = tb.udiv(thread, warpSize);

third_party/amd/lib/TritonAMDGPUToLLVM/ConvertLayoutOpToLLVM/SharedToDotOperandWMMA.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -171,7 +171,7 @@ Value convertLayout(int opIdx, ConversionPatternRewriter &rewriter,
171171
auto numRepK = numReps[opIdx == 0 ? 2 : 1];
172172
auto repB = numReps[0];
173173

174-
unsigned iWaveSize = triton::gpu::getWarpSize(wmmaLayout);
174+
unsigned iWaveSize = triton::gpu::lookupThreadsPerWarp(rewriter);
175175
assert(iWaveSize == 32);
176176
Value waveSize = tb.i32_val(iWaveSize);
177177
Value linearWaveId = tb.udiv(thread, waveSize);

third_party/amd/lib/TritonAMDGPUToLLVM/DecomposeUnsupportedConversions.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,7 @@ struct DecomposeUnsupportedAMDConversions
8282
return;
8383
}
8484

85-
unsigned numWarps = triton::gpu::getNumWarpsPerCTA(srcEnc);
85+
unsigned numWarps = lookupNumWarps(cvtOp);
8686

8787
// Find all possible shapes of WarpsPerCTA by finding all possible
8888
// factorizations of numWarps. Pick shape for which both conversions in

third_party/amd/lib/TritonAMDGPUToLLVM/DotOpToLLVM/MFMA.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -313,7 +313,7 @@ struct DotOpMFMAConversionHelper {
313313
auto dstElemTy = dTensorTy.getElementType();
314314
auto fc = unpackLLElements(loc, loadedC, rewriter);
315315

316-
unsigned warpSize = triton::gpu::getWarpSize(mfmaLayout);
316+
unsigned warpSize = triton::gpu::lookupThreadsPerWarp(rewriter);
317317
// compute number of output elements that each thread holds for one MFMA
318318
// instruction.
319319
const int subBlocks =
@@ -640,7 +640,7 @@ struct ScaledDotOpMFMAConversionHelper : DotOpMFMAConversionHelper {
640640
auto dstElemTy = dTensorTy.getElementType();
641641
auto fc = unpackLLElements(loc, loadedC, rewriter);
642642

643-
unsigned warpSize = triton::gpu::getWarpSize(mfmaLayout);
643+
unsigned warpSize = triton::gpu::lookupThreadsPerWarp(rewriter);
644644
// compute number of output elements that each thread holds for one MFMA
645645
// instruction. subBlocks
646646
const int subBlocks =

0 commit comments

Comments (0)