[NFC] Kill isBlockedToDotShortcut (#6003)

lezcano · web-flow · commit 74b4714d1a9f · 2025-02-24T14:10:30.000Z
Linear layouts (LLs) already cover this case and more. I noticed that the pass had not been removed yet; I remove it in triton-lang/triton#6006.
diff --git a/include/triton/Analysis/Utility.h b/include/triton/Analysis/Utility.h
@@ -242,8 +242,6 @@ bool cvtNeedsSharedMemory(RankedTensorType srcTy, RankedTensorType dstTy);
 
 bool atomicNeedsSharedMemory(Value result);
 
-bool isBlockedToDotShortcut(RankedTensorType srcTy, RankedTensorType dstTy);
-
 // Return true if the src and dst layout match.
 bool matchMmaV3AndDotOperandLayout(RankedTensorType srcTy,
                                    RankedTensorType dstTy);
diff --git a/lib/Analysis/Utility.cpp b/lib/Analysis/Utility.cpp
@@ -699,75 +699,6 @@ bool supportMMA(Value value, int version) {
          (elemTy.isInteger(8) && version >= 2);
 }
 
-bool isBlockedToDotShortcut(RankedTensorType srcTy, RankedTensorType dstTy) {
-  auto blockedLayout = dyn_cast<BlockedEncodingAttr>(srcTy.getEncoding());
-  auto dotOperandLayout = dyn_cast<DotOperandEncodingAttr>(dstTy.getEncoding());
-  if (blockedLayout == nullptr || dotOperandLayout == nullptr)
-    return false;
-  auto parentLayout =
-      dyn_cast<BlockedEncodingAttr>(dotOperandLayout.getParent());
-  if (parentLayout == nullptr)
-    return false;
-  auto opShape = srcTy.getShape();
-  auto rank = opShape.size();
-
-  int kDim = dotOperandLayout.getOpIdx() == 0 ? rank - 1 : rank - 2;
-  int nonKDim = dotOperandLayout.getOpIdx() == 0 ? rank - 2 : rank - 1;
-  auto ctaLayout = blockedLayout.getCTALayout();
-
-  // The following logic checks that a source blocked layout matches a
-  // destination dot operand layout. This means that given tensor in source
-  // layout could be converted into destination layout without any data movement
-  // between registers or threads.
-  //
-  // It is considered a match if
-  // 1) Each thread in source layout holds a whole copy of all elements along
-  //    the K dimension of a tensor
-  // 2) Distribution of data along all other non-K dimensions(Batch/M/N)
-  //    matches between source and destination parent layouts.
-  //
-  // First condition comes from the property of dot operand layout with Blocked
-  // parent: size per threads along K dimension equals size of the tensor along
-  // K. Second condition comes from other property: dot operand layout
-  // inherits non-K dimensions from it's parent layout.
-  //
-  // clang-format off
-  //
-  // For example, following conversion is a no op:
-  //   tensor<128x32xf16,                          #blocked<{sizePerThread = [2, 32], threadsPerWarp = [32, 1]}>>
-  //     ->
-  //   tensor<128x32xf16, #dot_op<{opIdx=0, parent=#blocked<{sizePerThread = [2, 8], threadsPerWarp = [32, 1]}>>>
-  //
-  // clang-format on
-  bool ctaLayoutCompatible =
-      ctaLayout.getCTASplitNum()[kDim] == 1 &&
-      blockedLayout.getCTALayout() == parentLayout.getCTALayout();
-  bool threadHoldsWholeKDim =
-      blockedLayout.getSizePerThread()[kDim] == opShape[kDim];
-  bool nonKDimCompatible =
-      blockedLayout.getOrder() == parentLayout.getOrder() &&
-      blockedLayout.getSizePerThread()[nonKDim] ==
-          parentLayout.getSizePerThread()[nonKDim] &&
-      blockedLayout.getThreadsPerWarp()[nonKDim] ==
-          parentLayout.getThreadsPerWarp()[nonKDim] &&
-      blockedLayout.getWarpsPerCTA()[nonKDim] ==
-          parentLayout.getWarpsPerCTA()[nonKDim];
-  bool matrixDimsCompatible =
-      ctaLayoutCompatible && threadHoldsWholeKDim && nonKDimCompatible;
-  if (rank == 2)
-    return matrixDimsCompatible;
-
-  // additional check for batch dimension if it is present
-  assert(rank == 3);
-  bool bDimCompatible =
-      blockedLayout.getSizePerThread()[0] ==
-          parentLayout.getSizePerThread()[0] &&
-      blockedLayout.getThreadsPerWarp()[0] ==
-          parentLayout.getThreadsPerWarp()[0] &&
-      blockedLayout.getWarpsPerCTA()[0] == parentLayout.getWarpsPerCTA()[0];
-  return matrixDimsCompatible && bDimCompatible;
-}
-
 // For MMAV3 dotOperand layout matches mma operand for f16 and bf16 cases.
 bool matchMmaV3AndDotOperandLayout(RankedTensorType srcTy,
                                    RankedTensorType dstTy) {
@@ -854,13 +785,11 @@ bool cvtNeedsWarpShuffle(RankedTensorType srcTy, RankedTensorType dstTy) {
 }
 
 bool cvtNeedsSharedMemory(RankedTensorType srcTy, RankedTensorType dstTy) {
-  // TODO(jlebar): Remove these special cases (`isBlockedToDotShortcut` and
-  // `isMfmaToDotShortcut`) once they're fully subsumed by the linear-layout
-  // checks.
+  // TODO(jlebar): Remove the `isMfmaToDotShortcut` special case once it is
+  // fully subsumed by the linear-layout checks.
   return !cvtReordersRegisters(srcTy, dstTy) &&
          !(cvtNeedsWarpShuffle(srcTy, dstTy) &&
            getWarpLayoutConvertDecomposition(srcTy, dstTy)) &&
-         !isBlockedToDotShortcut(srcTy, dstTy) &&
          !matchMmaV3AndDotOperandLayout(srcTy, dstTy) &&
          // to be removed when generalized warp shuffle conversions
          // are ready: