[Intel] Use 'CTAEncodingAttr' after '49b7472'

anmyachev · anmyachev · commit 8dc24ec00c24 · 2025-11-28T16:17:21.000Z
Signed-off-by: Anatoly Myachev <anatoly.myachev@intel.com>
diff --git a/test/Conversion/intel/tritongpu_to_gen.mlir b/test/Conversion/intel/tritongpu_to_gen.mlir
@@ -1389,7 +1389,7 @@ tt.func @test_get_program_id(%a: tensor<32x!tt.ptr<i32>, #blocked0>) {
 
 // -----
 
-#blocked0 = #ttg.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [4], order = [0], CTAsPerCGA = [4], CTASplitNum = [1], CTAOrder = [0]}>
+#blocked0 = #ttg.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [4], order = [0], CGALayout = [[0], [0]]}>
 module attributes {"ttg.num-ctas" = 4 : i32, "ttg.num-warps" = 4 : i32} {
 // CHECK-LABEL: test_get_program_id
 tt.func @test_get_program_id(%a: tensor<32x!tt.ptr<i32>, #blocked0>) {
diff --git a/test/TritonIntelGPU/tritongpu_reduce_op_lowering.mlir b/test/TritonIntelGPU/tritongpu_reduce_op_lowering.mlir
@@ -2,7 +2,7 @@
 
 // COM: Tests reduction when threads_per_warp < num_warps.
 
-#blocked = #ttg.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [64], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
+#blocked = #ttg.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [64], order = [0]}>
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 64 : i32, "ttg.threads-per-warp" = 32 : i32} {
   // CHECK-LABEL: reduce_problem_size_64_threads_per_warp_32
   tt.func @reduce_problem_size_64_threads_per_warp_32(%f : tensor<2048xi32, #blocked>) {
diff --git a/third_party/intel/include/Dialect/TritonIntelGPU/IR/TritonIntelGPUAttrDefs.td b/third_party/intel/include/Dialect/TritonIntelGPU/IR/TritonIntelGPUAttrDefs.td
@@ -310,7 +310,7 @@ def Subgroup2DBlockEncodingAttr : DistributedEncoding<"Subgroup2DBlockEncoding",
   let parameters = (
     ins
     ArrayRefParameter<"unsigned">:$warpsPerCTA,
-    "CTALayoutAttr":$CTALayout,
+    "CTAEncodingAttr":$CTALayout,
     ArrayRefParameter<"unsigned">:$instrShape,
     "unsigned":$numBlocks,
     ArrayRefParameter<"unsigned">:$order,
diff --git a/third_party/intel/lib/Dialect/TritonIntelGPU/IR/Dialect.cpp b/third_party/intel/lib/Dialect/TritonIntelGPU/IR/Dialect.cpp
@@ -146,22 +146,12 @@ DpasEncodingAttr::getRepOrderForOperand(OpIdx opIdx) const {
   return getOrderForDotOperand(unsigned(opIdx), rank, /*kMajor*/ true);
 }
 
-SmallVector<unsigned> DpasEncodingAttr::getCTASplitNum() const {
+CTAEncodingAttr DpasEncodingAttr::getCTALayout() const {
   size_t rank = getWarpsPerCTA().size();
-  SmallVector<unsigned> res(rank, 1);
-  return res;
-}
-
-SmallVector<unsigned> DpasEncodingAttr::getCTAOrder() const {
-  size_t rank = getWarpsPerCTA().size();
-  auto res = llvm::to_vector(llvm::reverse(llvm::seq<unsigned>(rank)));
-  return res;
-}
-
-SmallVector<unsigned> DpasEncodingAttr::getCTAsPerCGA() const {
-  size_t rank = getWarpsPerCTA().size();
-  SmallVector<unsigned> res(rank, 1);
-  return res;
+  SmallVector<unsigned> CTAsPerCGA(rank, 1);
+  auto CTAOrder = llvm::to_vector(llvm::reverse(llvm::seq<unsigned>(rank)));
+  return CTAEncodingAttr::fromSplitParams(getContext(), CTAsPerCGA, CTAsPerCGA,
+                                          CTAOrder);
 }
 
 SmallVector<int64_t>
@@ -441,16 +431,8 @@ LinearLayout WarpEncodingAttr::toLinearLayout(ArrayRef<int64_t> shape) const {
   llvm::report_fatal_error("NYI. WarpEncodingAttr::toLinearLayout");
 }
 
-SmallVector<unsigned> WarpEncodingAttr::getCTAsPerCGA() const {
-  llvm::report_fatal_error("NYI. WarpEncodingAttr::getCTAsPerCGA");
-}
-
-SmallVector<unsigned> WarpEncodingAttr::getCTAOrder() const {
-  llvm::report_fatal_error("NYI. WarpEncodingAttr::getCTAOrder");
-}
-
-SmallVector<unsigned> WarpEncodingAttr::getCTASplitNum() const {
-  llvm::report_fatal_error("NYI. WarpEncodingAttr::getCTASplitNum");
+CTAEncodingAttr WarpEncodingAttr::getCTALayout() const {
+  llvm::report_fatal_error("NYI. WarpEncodingAttr::getCTALayout");
 }
 
 Attribute WarpEncodingAttr::parse(AsmParser &parser, Type type) {
@@ -506,16 +488,16 @@ void WarpEncodingAttr::print(mlir::AsmPrinter &printer) const {
 //===----------------------------------------------------------------------===//
 
 namespace {
-std::optional<CTALayoutAttr> getCTALayoutOrError(
+std::optional<CTAEncodingAttr> getCTALayoutOrError(
     AsmParser &parser, std::optional<SmallVector<unsigned>> CTAsPerCGA,
     std::optional<SmallVector<unsigned>> CTASplitNum,
     std::optional<SmallVector<unsigned>> CTAOrder, unsigned rank) {
   if (CTAsPerCGA && CTASplitNum && CTAOrder) {
-    return CTALayoutAttr::get(parser.getContext(), *CTAsPerCGA, *CTASplitNum,
-                              *CTAOrder);
+    return CTAEncodingAttr::fromSplitParams(parser.getContext(), *CTAsPerCGA,
+                                            *CTASplitNum, *CTAOrder);
   }
   if (!CTAsPerCGA && !CTASplitNum && !CTAOrder) {
-    return CTALayoutAttr::getDefault(parser.getContext(), rank);
+    return CTAEncodingAttr::getDefault(parser.getContext(), rank);
   }
   parser.emitError(parser.getNameLoc(), "CTAsPerCGA, CTASplitNum, and CTAOrder "
                                         "must all be present or all be absent");
@@ -524,8 +506,8 @@ std::optional<CTALayoutAttr> getCTALayoutOrError(
 
 // Print the CTALayout if it's not equal to the default.
 void maybePrintCTALayout(mlir::MLIRContext *context, mlir::AsmPrinter &printer,
-                         CTALayoutAttr layout, unsigned rank) {
-  if (layout != CTALayoutAttr::getDefault(context, rank)) {
+                         CTAEncodingAttr layout, unsigned rank) {
+  if (layout != CTAEncodingAttr::getDefault(context, rank)) {
     printer << ", CTAsPerCGA = [" << ArrayRef(layout.getCTAsPerCGA()) << "]"
             << ", CTASplitNum = [" << ArrayRef(layout.getCTASplitNum()) << "]"
             << ", CTAOrder = [" << ArrayRef(layout.getCTAOrder()) << "]";
@@ -536,7 +518,7 @@ void maybePrintCTALayout(mlir::MLIRContext *context, mlir::AsmPrinter &printer,
 
 LogicalResult Subgroup2DBlockEncodingAttr::verify(
     function_ref<InFlightDiagnostic()> emitError,
-    ArrayRef<unsigned> warpsPerCTA, CTALayoutAttr CTALayout,
+    ArrayRef<unsigned> warpsPerCTA, CTAEncodingAttr CTALayout,
     ArrayRef<unsigned> instrShape, unsigned numBlocks, ArrayRef<unsigned> order,
     unsigned kWidth, unsigned threadsPerWarp) {
   if (instrShape.size() != 2) {
@@ -621,7 +603,7 @@ Attribute Subgroup2DBlockEncodingAttr::parse(AsmParser &parser, Type type) {
     }
   }
 
-  std::optional<CTALayoutAttr> CTALayout = getCTALayoutOrError(
+  std::optional<CTAEncodingAttr> CTALayout = getCTALayoutOrError(
       parser, CTAsPerCGA, CTASplitNum, CTAOrder, /*rank=*/warpsPerCTA.size());
   if (!CTALayout.has_value())
     return {};
@@ -898,8 +880,10 @@ struct TritonIntelGPUInferLayoutInterface
     // Cowardly refuse to handle encodings with multiple CTAs.  CTAsPerCGA
     // should be like the other fields in blocked encoding, but I'm not sure how
     // to handle CTASplitNum.
-    if (!all_of(src.getCTAsPerCGA(), [](int32_t x) { return x == 1; }) ||
-        !all_of(src.getCTASplitNum(), [](int32_t x) { return x == 1; })) {
+    if (!all_of(src.getCTALayout().getCTAsPerCGA(),
+                [](int32_t x) { return x == 1; }) ||
+        !all_of(src.getCTALayout().getCTASplitNum(),
+                [](int32_t x) { return x == 1; })) {
       return failure();
     }
 
@@ -1074,7 +1058,7 @@ struct TritonIntelGPUInferLayoutInterface
     auto dstOrder = inversePermutation(dstInvOrder);
 
     // CTALayout can be all 1's because we bailed on multi-CTA layouts above.
-    auto CTALayout = CTALayoutAttr::get(
+    auto CTALayout = CTAEncodingAttr::fromSplitParams(
         src.getContext(),
         /*CTAsPerCGA=*/SmallVector<unsigned>(dstShape.size(), 1),
         /*CTASplitNum=*/SmallVector<unsigned>(dstShape.size(), 1),
diff --git a/third_party/intel/lib/Dialect/TritonIntelGPU/IR/LinearLayoutConversions.cpp b/third_party/intel/lib/Dialect/TritonIntelGPU/IR/LinearLayoutConversions.cpp
@@ -24,10 +24,10 @@ namespace {
 //    for register layouts, and input dims [offset] for shared layouts.
 //  - cgaLayout: Arrangement of multiple blocks, i.e. input dims [block].
 //
-// Note that this is inconsistent with the type name CTALayoutAttr.  That type
+// Note that this is inconsistent with the type name CTAEncodingAttr.  That type
 // is equivalent to our cgaLayout.
 //
-// IMO the name CTALayoutAttr is wrong.  If we tried to be consistent anyway,
+// IMO the name CTAEncodingAttr is wrong.  If we tried to be consistent anyway,
 // then we'd have to rename ctaLayout to "warpLayout".  I think that's more
 // confusing than being inconsistent about "cgaLayout", especially when we have
 // to consider the size of the warpLayout (surely that's not the "warpSize").
@@ -57,8 +57,8 @@ LinearLayout identityND(StringAttr inDimName, ArrayRef<unsigned> shape,
 // the CTAsPerCGA CTAs (i.e. blocks) in the CGA (i.e. groups).
 //
 // See the nomenclature note at the top of the file for an explanation of why
-// this is called makeCgaLayout when it accepts a CTALayoutAttr.
-LinearLayout makeCgaLayout(CTALayoutAttr layout) {
+// this is called makeCgaLayout when it accepts a CTAEncodingAttr.
+LinearLayout makeCgaLayout(CTAEncodingAttr layout) {
   MLIRContext *ctx = layout.getContext();
   StringAttr kBlock = S("block");
 
@@ -464,7 +464,7 @@ LinearLayout DPAStoLinearLayout(ArrayRef<int64_t> shape, Attribute layout,
         LinearLayout::identity1D(numReps[0], kRegister, outDimNames[0]);
 
   return combineCtaCgaWithShape(std::move(tileLayout),
-                                CTALayoutAttr::getDefault(ctx, rank), shape);
+                                CTAEncodingAttr::getDefault(ctx, rank), shape);
 }
 
 LinearLayout dotOperandDpasToLinearLayout(DotOperandEncodingAttr dotDpasLayout,
diff --git a/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp b/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp
@@ -1076,7 +1076,7 @@ struct PrefetchOpConversion
                              identityStandardND(S("warp"), warpsPerCTA, order);
 
     return combineCtaCgaWithShape(std::move(ctaLayout),
-                                  CTALayoutAttr::getDefault(ctx, rank),
+                                  CTAEncodingAttr::getDefault(ctx, rank),
                                   tensorShape);
   }
 };
diff --git a/third_party/intel/lib/TritonIntelGPUToLLVM/TritonGPUToLLVMBase.h b/third_party/intel/lib/TritonIntelGPUToLLVM/TritonGPUToLLVMBase.h
@@ -21,7 +21,6 @@ using namespace mlir::triton;
 using ::mlir::LLVM::delinearize;
 using ::mlir::LLVM::SharedMemoryObject;
 using ::mlir::triton::gpu::BlockedEncodingAttr;
-using ::mlir::triton::gpu::CTALayoutAttr;
 using ::mlir::triton::gpu::DotOperandEncodingAttr;
 using ::mlir::triton::gpu::SliceEncodingAttr;
 using ::mlir::triton::gpu::intel::DpasEncodingAttr;
diff --git a/third_party/intel/lib/TritonIntelGPUTransforms/OptimizeDotOperands.cpp b/third_party/intel/lib/TritonIntelGPUTransforms/OptimizeDotOperands.cpp
@@ -16,7 +16,6 @@
 #include "triton/Dialect/Triton/IR/Dialect.h"
 #include "triton/Dialect/Triton/IR/Types.h"
 #include "triton/Dialect/TritonGPU/IR/Attributes.h"
-#include "triton/Dialect/TritonGPU/IR/LayoutUtility.h"
 #include "triton/Dialect/TritonGPU/Transforms/Utility.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SetVector.h"
diff --git a/third_party/intel/lib/TritonIntelGPUTransforms/OptimizeReductionLocality.cpp b/third_party/intel/lib/TritonIntelGPUTransforms/OptimizeReductionLocality.cpp
@@ -349,7 +349,7 @@ struct DpasOperandPattern final : OpRewritePattern<ReduceOp> {
                                            1,
                                            dpasEncoding.getWarpsPerCTA()[0]};
     constexpr std::array<unsigned, rank> order{0, 1, 2, 3, 4, 5, 6};
-    CTALayoutAttr ctaLayout = CTALayoutAttr::getDefault(getContext(), rank);
+    CTAEncodingAttr ctaLayout = CTAEncodingAttr::getDefault(getContext(), rank);
 
     auto encoding = rewriter.getAttr<BlockedEncodingAttr>(
         sizePerThread, threadsPerWarp, warpsPerCTA, order, ctaLayout);
@@ -407,7 +407,7 @@ struct DpasOperandPattern final : OpRewritePattern<ReduceOp> {
                                            dpasEncoding.getWarpsPerCTA()[1],
                                            dpasEncoding.getWarpsPerCTA()[0]};
     constexpr std::array<unsigned, rank> order{0, 1, 2, 3, 4};
-    CTALayoutAttr ctaLayout = CTALayoutAttr::getDefault(getContext(), rank);
+    CTAEncodingAttr ctaLayout = CTAEncodingAttr::getDefault(getContext(), rank);
 
     auto encoding = rewriter.getAttr<BlockedEncodingAttr>(
         sizePerThread, threadsPerWarp, warpsPerCTA, order, ctaLayout);
@@ -440,7 +440,7 @@ struct DpasOperandPattern final : OpRewritePattern<ReduceOp> {
                                            dpasEncoding.getWarpsPerCTA()[1],
                                            dpasEncoding.getWarpsPerCTA()[0]};
     constexpr std::array<unsigned, rank> order{0, 1, 2, 3};
-    CTALayoutAttr ctaLayout = CTALayoutAttr::getDefault(getContext(), rank);
+    CTAEncodingAttr ctaLayout = CTAEncodingAttr::getDefault(getContext(), rank);
 
     auto encoding = rewriter.getAttr<BlockedEncodingAttr>(
         sizePerThread, threadsPerWarp, warpsPerCTA, order, ctaLayout);
@@ -483,7 +483,7 @@ struct DpasOperandPattern final : OpRewritePattern<ReduceOp> {
     std::array<unsigned, rank> warpsPerCTA{dpasEncoding.getWarpsPerCTA()[1],
                                            dpasEncoding.getWarpsPerCTA()[0]};
     constexpr std::array<unsigned, rank> order{0, 1};
-    CTALayoutAttr ctaLayout = CTALayoutAttr::getDefault(getContext(), rank);
+    CTAEncodingAttr ctaLayout = CTAEncodingAttr::getDefault(getContext(), rank);
 
     auto parentEncoding = rewriter.getAttr<BlockedEncodingAttr>(
         sizePerThread, threadsPerWarp, warpsPerCTA, order, ctaLayout);
diff --git a/third_party/intel/unittest/Dialect/TritonIntelGPU/LinearLayoutConversionsTest.cpp b/third_party/intel/unittest/Dialect/TritonIntelGPU/LinearLayoutConversionsTest.cpp
@@ -31,11 +31,7 @@ class LinearLayoutConversionsTest : public ::testing::Test {
 
     // TODO: could put the getOrderForDotOperand in the builder?
     auto layout = Subgroup2DBlockEncodingAttr::get(
-        &ctx, warpsPerCTA,
-        CTALayoutAttr::get(
-            &ctx, dpasLayout.getCTAsPerCGA(), // TODO: add to DpasLayout?
-            dpasLayout.getCTASplitNum(), dpasLayout.getCTAOrder()),
-        instrShape, numBlocks,
+        &ctx, warpsPerCTA, dpasLayout.getCTALayout(), instrShape, numBlocks,
         getOrderForDotOperand(opIdx, /*rank*/ 2, /*kContig*/ true), kWidth,
         dpasLayout.getThreadsPerWarp());
     return layout;

Original file line number	Diff line number	Diff line change
`@@ -1076,7 +1076,7 @@ struct PrefetchOpConversion`
`1076`	`1076`	`identityStandardND(S("warp"), warpsPerCTA, order);`
`1077`	`1077`
`1078`	`1078`	`return combineCtaCgaWithShape(std::move(ctaLayout),`
`1079`		`- CTALayoutAttr::getDefault(ctx, rank),`
	`1079`	`+ CTAEncodingAttr::getDefault(ctx, rank),`
`1080`	`1080`	`tensorShape);`
`1081`	`1081`	`}`
`1082`	`1082`	`};`