Fix build and test failures from 593a1b5

whitneywhtsang · whitneywhtsang · commit eeed3f5ceb0a · 2025-03-28T21:10:38.000-04:00
Signed-off-by: Whitney Tsang <whitney.tsang@intel.com>
diff --git a/third_party/intel/include/Dialect/TritonIntelGPU/IR/TritonIntelGPUAttrDefs.td b/third_party/intel/include/Dialect/TritonIntelGPU/IR/TritonIntelGPUAttrDefs.td
@@ -184,7 +184,7 @@ The semantic of this `tt.dot` includes GEMM tiling configuration as:
     "unsigned":$systolicDepth,
     "unsigned":$executionSize,
     "unsigned":$opsPerChannel,
-    ArrayRefParameter<"unsigned">:$warpsPerCTA__,
+    ArrayRefParameter<"unsigned">:$warpsPerCTA,
     ArrayRefParameter<"unsigned">:$repCluster,
     "unsigned":$threadsPerWarp
   );
diff --git a/third_party/intel/lib/Dialect/TritonIntelGPU/IR/Dialect.cpp b/third_party/intel/lib/Dialect/TritonIntelGPU/IR/Dialect.cpp
@@ -169,7 +169,7 @@ DpasEncodingAttr::getDPASRepetitions(ArrayRef<int64_t> shape,
                                      OpIdx opIdx) const {
   // Always return a 3D shape repetitions for the ease of value handling, same
   // to mma.
-  SmallVector<unsigned> warpsPerCTA = getWarpsPerCTA();
+  auto warpsPerCTA = getWarpsPerCTA();
   size_t rank = shape.size();
   SmallVector<int64_t> rep(3, 1);
   switch (opIdx) {
@@ -239,11 +239,6 @@ unsigned DpasEncodingAttr::getTotalElemsPerThreadForOperand(
   llvm_unreachable("unexpected opIdx");
 }
 
-SmallVector<unsigned> DpasEncodingAttr::getWarpsPerCTA() const {
-  return SmallVector<unsigned>(getWarpsPerCTA__().begin(),
-                               getWarpsPerCTA__().end());
-}
-
 SmallVector<unsigned> DpasEncodingAttr::getContigPerThread() const {
   size_t rank = getWarpsPerCTA().size();
   assert(rank == 2 || rank == 3);
@@ -295,7 +290,7 @@ unsigned DpasEncodingAttr::getOpsPerChannel(Type elemType) {
 LogicalResult DpasEncodingAttr::verify(
     ::llvm::function_ref<::mlir::InFlightDiagnostic()> emitError,
     unsigned repeatCount, unsigned systolicDepth, unsigned executionSize,
-    unsigned opsPerChan, ::llvm::ArrayRef<unsigned> warpsPerCTA__,
+    unsigned opsPerChan, ::llvm::ArrayRef<unsigned> warpsPerCTA,
     ::llvm::ArrayRef<unsigned> repCluster, unsigned sugGroupSize) {
   if (repeatCount > 8 || repeatCount < 1) {
     return emitError() << "repeatCount must be in the range [1, 8], but was:"
@@ -378,7 +373,7 @@ void DpasEncodingAttr::print(AsmPrinter &printer) const {
   ArrayRef<unsigned> rB = shapeB;
   SmallVector<unsigned> shapeC = getShapeC();
   ArrayRef<unsigned> rC = shapeC;
-  SmallVector<unsigned> warpsPerCTA = getWarpsPerCTA();
+  auto warpsPerCTA = getWarpsPerCTA();
   ArrayRef<unsigned> repCluster = getRepCluster();
   printer << "<{"
           << "repeatCount = " << getRepeatCount() << ", "
@@ -436,10 +431,6 @@ SmallVector<unsigned> WarpEncodingAttr::getRepOrder() const {
   llvm::report_fatal_error("NYI. WarpEncodingAttr::getRepOrder");
 }
 
-SmallVector<unsigned> WarpEncodingAttr::getWarpsPerCTA() const {
-  llvm::report_fatal_error("NYI. WarpEncodingAttr::getWarpsPerCTA");
-}
-
 LinearLayout WarpEncodingAttr::toLinearLayout(ArrayRef<int64_t> shape) const {
   llvm::report_fatal_error("NYI. WarpEncodingAttr::toLinearLayout");
 }
diff --git a/third_party/intel/lib/Dialect/TritonIntelGPU/IR/LinearLayoutConversions.cpp b/third_party/intel/lib/Dialect/TritonIntelGPU/IR/LinearLayoutConversions.cpp
@@ -418,7 +418,7 @@ LinearLayout DPAStoLinearLayout(ArrayRef<int64_t> shape, Attribute layout,
   StringAttr kLane = S("lane");
   StringAttr kWarp = S("warp");
 
-  const SmallVector<unsigned> warpsPerCTA = dpas.getWarpsPerCTA();
+  auto warpsPerCTA = dpas.getWarpsPerCTA();
   int threadsPerWarp = product<unsigned>(dpas.getThreadsPerWarp());
   unsigned opsPerChannel = dpas.getOpsPerChannel();
   auto repCluster = dpas.getRepCluster();
diff --git a/third_party/intel/lib/TritonIntelGPUToLLVM/HistogramOpToLLVM.cpp b/third_party/intel/lib/TritonIntelGPUToLLVM/HistogramOpToLLVM.cpp
@@ -85,9 +85,8 @@ static SmallVector<Value> computeCrossWarpHistogram(
     Value threadId, int numWarps) {
   auto b = TritonLLVMOpBuilder(loc, rewriter);
   SmallVector<Value> histogramValues;
-  unsigned numWarpsWithUniqueData =
-      mlir::triton::gpu::getWarpsPerCTAWithUniqueData(srcType.getEncoding(),
-                                                      srcType.getShape())[0];
+  unsigned numWarpsWithUniqueData = mlir::triton::gpu::getWarpsPerCTA(
+      srcType.getEncoding(), srcType.getShape())[0];
   Value laneId = b.and_(threadId, b.i32_val(numThreadPerWarp - 1));
   // Initialize the shared memory with zeros.
   int64_t numElementPerThread =
diff --git a/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp b/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp
@@ -555,7 +555,7 @@ struct LoadOpToBlockIOConversion
     unsigned numElems = getTotalElemsPerThread(resultType);
     SmallVector<int64_t> numReps =
         dpasLayout.getDPASRepetitions(tensorShape, opIdx);
-    const SmallVector<unsigned> warpsPerCTA = dpasLayout.getWarpsPerCTA();
+    auto warpsPerCTA = dpasLayout.getWarpsPerCTA();
     SmallVector<unsigned> dpasWarpsOrder =
         getMatrixOrder(warpsPerCTA.size(), /*rowMajor*/ true);
     unsigned threadsPerWarp =
@@ -1046,7 +1046,7 @@ struct LoadOpConversion
     unsigned numElems = getTotalElemsPerThread(resultType);
     SmallVector<int64_t> numReps =
         dpasLayout.getDPASRepetitions(tensorShape, opIdx);
-    const SmallVector<unsigned> warpsPerCTA = dpasLayout.getWarpsPerCTA();
+    auto warpsPerCTA = dpasLayout.getWarpsPerCTA();
     SmallVector<unsigned> dpasWarpsOrder =
         getMatrixOrder(warpsPerCTA.size(), /*rowMajor*/ true);
     unsigned threadsPerWarp =
@@ -1678,7 +1678,7 @@ struct StoreOpConversion
     size_t rank = tensorShape.size();
     unsigned numElems = getTotalElemsPerThread(tensorType);
     SmallVector<unsigned> elemsPerInstr = dpasLayout.getDPASInstShapeC();
-    const SmallVector<unsigned> warpsPerCTA = dpasLayout.getWarpsPerCTA();
+    auto warpsPerCTA = dpasLayout.getWarpsPerCTA();
     SmallVector<int64_t> numReps =
         dpasLayout.getDPASRepetitions(tensorShape, 2);
     SmallVector<unsigned> dpasWarpsOrder =
diff --git a/third_party/intel/lib/TritonIntelGPUTransforms/DistributeToWarps.cpp b/third_party/intel/lib/TritonIntelGPUTransforms/DistributeToWarps.cpp
@@ -120,7 +120,7 @@ SmallVector<Value> distributeOffset(const SmallVector<Value> &oldOffsets,
   Attribute layout = tensorType.getEncoding();
   if (auto dotEncoding = dyn_cast<ttg::DotOperandEncodingAttr>(layout))
     layout = dotEncoding.getParent();
-  const SmallVector<unsigned> &warpsPerCTA = ttg::getWarpsPerCTA(layout);
+  auto warpsPerCTA = cast<ttg::BlockedEncodingAttr>(layout).getWarpsPerCTA();
   size_t dims = warpsPerCTA.size();
   assert(dims <= 2 && "no more than 2D shape");
 
@@ -211,7 +211,8 @@ void distributeMakeRangeOp(tt::MakeRangeOp op, Value warpId) {
   auto sliceLayout = dyn_cast<ttg::SliceEncodingAttr>(tensorTy.getEncoding());
   assert(sliceLayout && "Expected slice layout");
 
-  auto parentWarpsPerCTA = ttg::getWarpsPerCTA(sliceLayout.getParent());
+  auto parentWarpsPerCTA =
+      cast<ttg::BlockedEncodingAttr>(sliceLayout.getParent()).getWarpsPerCTA();
   assert(parentWarpsPerCTA.size() == 2 && "Only slice of 2D layout supported");
   assert(parentWarpsPerCTA.back() == 1 &&
          "Warp distribution on second dimensions unsupported");