[BACKEND] Revert smem layout heuristic added in PR#5924 (#5983)

ThomasRaoux · web-flow · commit bca378d1c631 · 2025-02-21T09:28:41.000-08:00
Partial revert of triton-lang/triton#5924. The changed heuristic for picking the swizzling causes performance regressions in some cases, so it is being reverted for now. cc: @ggengnv
diff --git a/lib/Dialect/TritonGPU/Transforms/Utility.cpp b/lib/Dialect/TritonGPU/Transforms/Utility.cpp
@@ -1030,46 +1030,6 @@ StringRef getAMDArch(Operation *module) {
   return ref.drop_front(4); // drop the "hip:"
 }
 
-// Rough utility for obtaining a SharedEnc for a LinearEncoding,
-// as we've replaced DotOpEnc with Linear in some cases
-// (specifically, fp4ToFp and similar unpack-upcast thru join)
-std::optional<ttg::SwizzledSharedEncodingAttr>
-getSharedForLinear(ttg::LinearEncodingAttr enc,
-                   ArrayRef<unsigned int> globalOrder, ArrayRef<int64_t> shape,
-                   unsigned elemBitWidth, ttg::CTALayoutAttr ctaLayout) {
-  auto ctx = enc.getContext();
-  auto ll = enc.getLinearLayout();
-  auto rank = shape.size();
-
-  if (rank != 2)
-    return std::nullopt;
-
-  auto order = enc.getOrder();
-  assert(globalOrder.size() == rank);
-  // TODO add memdesc_trans support for dot(trans(cvt(src) #linear) #dot_op)
-  if (order != globalOrder)
-    return std::nullopt;
-
-  auto innerDim = order[0];
-  auto outerDim = order[1];
-  auto contigPerWarp = enc.getContigPerWarp();
-
-  constexpr unsigned BANK_SIZE{128};
-  auto elemBytes = elemBitWidth / 8;
-
-  auto vec = contigPerWarp[innerDim];
-  auto rowSize = elemBytes * (unsigned)shape[innerDim];
-  auto perPhase = std::max(BANK_SIZE / rowSize, 1u);
-  auto maxPhase = std::max(contigPerWarp[outerDim] / perPhase, 1u);
-
-  // cp.async does not support transfer size < 4B
-  if (vec * elemBytes < 4 && perPhase < maxPhase)
-    return std::nullopt;
-
-  return ttg::SwizzledSharedEncodingAttr::get(ctx, vec, perPhase, maxPhase,
-                                              order, ctaLayout);
-}
-
 // If all the transitive uses of the given value have are used by a convert to
 // the same dot operand encoding, return the shared encoding that needs to be
 // used to be compatible with users' layouts. If there are incompatible shared
@@ -1096,28 +1056,18 @@ getSharedEncIfAllUsersAreDotEnc(Value val, bool &incompatible) {
     } else {
       if (!isa<ttg::LocalLoadOp, ttg::ConvertLayoutOp>(user))
         return std::nullopt;
-      auto enc =
+      auto dotOpEnc = dyn_cast<ttg::DotOperandEncodingAttr>(
           cast<triton::gpu::TensorOrMemDesc>(user->getResult(0).getType())
-              .getEncoding();
+              .getEncoding());
+      if (!dotOpEnc)
+        return std::nullopt;
       auto srcTy = cast<triton::gpu::TensorOrMemDesc>(val.getType());
-      auto ctaLayout = ttg::getCTALayout(srcTy.getEncoding());
+      auto CTALayout = ttg::getCTALayout(srcTy.getEncoding());
       auto order = ttg::getOrder(srcTy.getEncoding());
       unsigned bitWidth = srcTy.getElementType().getIntOrFloatBitWidth();
-
-      if (auto dotOpEnc = dyn_cast<ttg::DotOperandEncodingAttr>(enc)) {
-        tempAttr = ttg::SwizzledSharedEncodingAttr::get(
-            val.getContext(), dotOpEnc, srcTy.getShape(), order, ctaLayout,
-            bitWidth, /*needTrans=*/false);
-      } else if (auto linearEnc = dyn_cast<ttg::LinearEncodingAttr>(enc)) {
-
-        auto attrOpt = getSharedForLinear(linearEnc, order, srcTy.getShape(),
-                                          bitWidth, ctaLayout);
-        if (!attrOpt)
-          return std::nullopt;
-        tempAttr = *attrOpt;
-      } else {
-        return std::nullopt;
-      }
+      tempAttr = ttg::SwizzledSharedEncodingAttr::get(
+          val.getContext(), dotOpEnc, srcTy.getShape(), order, CTALayout,
+          bitWidth, /*needTrans=*/false);
     }
     // Check that the shared encodings needed by the users are compatible.
     if (attr != nullptr && attr != tempAttr) {