From d31ccfe5846be60b262adeace8e69ee92e82e808 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Fri, 25 Oct 2024 07:04:07 +0200 Subject: [PATCH 1/5] [NFC] Use `get_config_var('EXT_SUFFIX')` instead of using `so` directly (#4958) Change to improve platform independence. How does it work? On Windows: ```python >>> import sysconfig >>> sysconfig.get_config_var("EXT_SUFFIX") '.cp310-win_amd64.pyd' >>> sysconfig.get_config_var("EXT_SUFFIX").split(".")[-1] 'pyd' ``` On Linux: ```python >>> import sysconfig >>> sysconfig.get_config_var("EXT_SUFFIX") '.cpython-310-x86_64-linux-gnu.so' >>> sysconfig.get_config_var("EXT_SUFFIX").split(".")[-1] 'so' ``` --------- Signed-off-by: Anatoly Myachev --- python/triton/compiler/compiler.py | 4 +++- third_party/nvidia/backend/driver.py | 6 ++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/python/triton/compiler/compiler.py b/python/triton/compiler/compiler.py index 8ca1f8b326..304b406974 100644 --- a/python/triton/compiler/compiler.py +++ b/python/triton/compiler/compiler.py @@ -15,6 +15,7 @@ import re import functools import os +import sysconfig # - ^\s*tt\.func\s+ : match the start of the string, any leading whitespace, the keyword func, # and any following whitespace @@ -151,7 +152,8 @@ def triton_key(): # backend libtriton_hash = hashlib.sha256() - with open(os.path.join(TRITON_PATH, "_C/libtriton.so"), "rb") as f: + ext = sysconfig.get_config_var("EXT_SUFFIX").split(".")[-1] + with open(os.path.join(TRITON_PATH, f"_C/libtriton.{ext}"), "rb") as f: while True: chunk = f.read(1024**2) if not chunk: diff --git a/third_party/nvidia/backend/driver.py b/third_party/nvidia/backend/driver.py index 38ce62b0c2..fa832f68ef 100644 --- a/third_party/nvidia/backend/driver.py +++ b/third_party/nvidia/backend/driver.py @@ -1,5 +1,6 @@ import functools import os +import sysconfig import hashlib import subprocess import tempfile @@ -48,7 +49,8 @@ def library_dirs(): def compile_module_from_src(src, name): key = hashlib.sha256(src.encode("utf-8")).hexdigest() cache = get_cache_manager(key) - cache_path = cache.get_file(f"{name}.so") + ext = sysconfig.get_config_var("EXT_SUFFIX").split(".")[-1] + cache_path = cache.get_file(f"{name}.{ext}") if cache_path is None: with tempfile.TemporaryDirectory() as tmpdir: src_path = os.path.join(tmpdir, "main.c") @@ -56,7 +58,7 @@ def compile_module_from_src(src, name): f.write(src) so = _build(name, src_path, tmpdir, library_dirs(), include_dir, libraries) with open(so, "rb") as f: - cache_path = cache.put(f.read(), f"{name}.so", binary=True) + cache_path = cache.put(f.read(), f"{name}.{ext}", binary=True) import importlib.util spec = importlib.util.spec_from_file_location(name, cache_path) mod = importlib.util.module_from_spec(spec) From 15c5e5519ff129104956ef0a726656f9e5cb4359 Mon Sep 17 00:00:00 2001 From: Keren Zhou Date: Fri, 25 Oct 2024 09:12:23 -0700 Subject: [PATCH 2/5] [BACKEND] Improve detection of register to register conversion (#4991) Specifically, it fixes problems when `srcLayout` and `dstLayout` have different numbers of registers but the same number of non-free registers. We solve the problem by padding free registers onto either `srcLayout` or `dstLayout`; this could be improved further by fixing the `invertAndCompose` function itself.
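As a rough illustration of the padding (a standalone Python sketch with made-up names, not the Triton API; the actual change is the C++ `LinearLayout::resize` added below), growing an input dimension just appends zero basis vectors, i.e. free variables that map to output 0:

```python
# Toy model of a linear layout: a dict mapping each input dimension to its
# list of basis vectors. "Resizing" an input dimension appends zero vectors
# (free variables), which does not change where any existing input maps.
import math

def resize(bases, in_dim, new_size):
    assert new_size & (new_size - 1) == 0, "new size must be a power of 2"
    assert new_size >= 2 ** len(bases[in_dim]), "resize must not shrink the dim"
    num_out_dims = 2  # this toy example uses two output dimensions
    resized = {dim: list(vecs) for dim, vecs in bases.items()}
    while len(resized[in_dim]) < int(math.log2(new_size)):
        resized[in_dim].append([0] * num_out_dims)  # free variable -> output 0
    return resized

# srcLayout covers 2 registers (1 basis vector), dstLayout covers 4 (2 vectors).
src = {"register": [[0, 1]], "lane": [[1, 0], [2, 0]]}
dst = {"register": [[0, 1], [0, 2]], "lane": [[1, 0], [2, 0]]}

num_regs = max(2 ** len(src["register"]), 2 ** len(dst["register"]))
print(resize(src, "register", num_regs)["register"])  # [[0, 1], [0, 0]]
print(resize(dst, "register", num_regs)["register"])  # [[0, 1], [0, 2]] (unchanged)
```

With both layouts padded to `max(numSrcRegs, numDstRegs)`, the matrices produced by `invertAndCompose` have matching row counts, which is what `minimalCvtLayout` now arranges before composing them.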
--- include/triton/Analysis/Utility.h | 2 +- include/triton/Tools/LinearLayout.h | 7 ++ lib/Analysis/Utility.cpp | 42 ++++++++++- .../TritonGPUToLLVM/ConvertLayoutOpToLLVM.cpp | 53 +++++++------ lib/Tools/LinearLayout.cpp | 15 ++++ test/Conversion/tritongpu_to_llvm.mlir | 74 +++++++++++++++++++ unittest/Tools/LinearLayoutTest.cpp | 33 +++++++++ 7 files changed, 202 insertions(+), 24 deletions(-) diff --git a/include/triton/Analysis/Utility.h b/include/triton/Analysis/Utility.h index cb3e3d292e..4f6aff739c 100644 --- a/include/triton/Analysis/Utility.h +++ b/include/triton/Analysis/Utility.h @@ -212,7 +212,7 @@ bool cvtNeedsSharedMemory(RankedTensorType srcTy, RankedTensorType dstTy); bool atomicNeedsSharedMemory(Value result); -bool isBlockedToDotShortcut(RankedTensorType &srcTy, RankedTensorType &dstT); +bool isBlockedToDotShortcut(RankedTensorType srcTy, RankedTensorType dstTy); bool isMfmaToDotShortcut(RankedTensorType srcTy, RankedTensorType dstTy); diff --git a/include/triton/Tools/LinearLayout.h b/include/triton/Tools/LinearLayout.h index c728cfbb32..47e3fca79b 100644 --- a/include/triton/Tools/LinearLayout.h +++ b/include/triton/Tools/LinearLayout.h @@ -679,6 +679,13 @@ class LinearLayout { // (i.e. every input bit affects the output). llvm::MapVector getFreeVariableMasks() const; + // Increase an input dimension without affecting the output dimension. The + // added free variables are mapped to 0, ensuring that the new input + // dimensions correspond directly to the existing output space. The function + // errors out if `newInDimSize` is less than the current size or the new size + // is not a power of 2. + LinearLayout resize(StringAttr inDim, int32_t newInDimSize) const; + std::string toString() const; friend bool operator==(LinearLayout lhs, LinearLayout rhs); diff --git a/lib/Analysis/Utility.cpp b/lib/Analysis/Utility.cpp index 4915d7b1ac..9782be48d7 100644 --- a/lib/Analysis/Utility.cpp +++ b/lib/Analysis/Utility.cpp @@ -536,7 +536,7 @@ bool supportMMA(Value value, int version) { (elemTy.isInteger(8) && version >= 2); } -bool isBlockedToDotShortcut(RankedTensorType &srcTy, RankedTensorType &dstTy) { +bool isBlockedToDotShortcut(RankedTensorType srcTy, RankedTensorType dstTy) { auto blockedLayout = dyn_cast(srcTy.getEncoding()); auto dotOperandLayout = dyn_cast(dstTy.getEncoding()); if (blockedLayout == nullptr || dotOperandLayout == nullptr) @@ -655,8 +655,46 @@ std::optional minimalCvtLayout(RankedTensorType srcTy, toLinearLayout(dstTy.getShape(), dstTy.getEncoding()); if (!(srcLayout.has_value() && dstLayout.has_value())) return std::nullopt; + StringAttr kRegister = StringAttr::get(ctx, "register"); + StringAttr kLane = StringAttr::get(ctx, "lane"); + StringAttr kWarp = StringAttr::get(ctx, "warp"); + StringAttr kBlock = StringAttr::get(ctx, "block"); + auto numSrcRegs = srcLayout->getInDimSize(kRegister); + auto numDstRegs = dstLayout->getInDimSize(kRegister); + // The `invertAndCompose` function will generate a layout that is injective + // by assigning new output dimensions to free variables. For instance, + // consider a scenario where `srcLayout` has a free variable in the lane + // dimension, while `dstLayout` has two free variables in the lane + // dimension and also a larger number of registers. + // The injective form of `srcLayout` will add only a single additional row + // to the transformation matrix, whereas the injective form of `dstLayout` + // will add two additional rows. 
This discrepancy causes misleading results + // because the matrices end up with a different number of rows. + // + // Take `dstLayout ⋅ srcLayout^-1` as an example: + // + // - `injective(dstLayout)`: [n, m] → [n + 2, m] + // - `injective(srcLayout)`: [n, m] → [n + 1, m] + // - `injective(srcLayout)^-1`: [n + 1, m] → [m, n + 1] + // - `injective(dstLayout) ⋅ injective(srcLayout)^-1`: [n + 2, m] ⋅ [m, n + + // 1] → [n + 2, n + 1] + // + // Here, the `(n + 1)`-th row added by `dstLayout` represents the free + // variable in registers, and the `(n + 2)`-th row represents the free + // variable in lanes. However, the `(n + 1)`-th row added by `srcLayout` + // represents the free variable in lanes. As a result, the `(n + 1)`-th row + // in two layouts do not correspond to the same free variable. + // + // To address this issue, we pad the free variables in `srcLayout` and + // `dstLayout` to ensure they have the same number of registers. This + // guarantees that the resulting matrices have the same number of rows, + // ensuring consistency in the composition process. + auto numRegs = std::max(numSrcRegs, numDstRegs); + auto srcLayoutWithFreeRegs = srcLayout->resize(kRegister, numRegs); + auto dstLayoutWithFreeRegs = dstLayout->resize(kRegister, numRegs); // comp describes the layout function to create dst from src. - LinearLayout comp = dstLayout->invertAndCompose(*srcLayout); + LinearLayout comp = + dstLayoutWithFreeRegs.invertAndCompose(srcLayoutWithFreeRegs); // We try to quotient by the largest subspace first auto dims = SmallVector{"block", "warp", "lane", "register"}; for (auto dim : dims) { diff --git a/lib/Conversion/TritonGPUToLLVM/ConvertLayoutOpToLLVM.cpp b/lib/Conversion/TritonGPUToLLVM/ConvertLayoutOpToLLVM.cpp index a18b2cbc30..ea9091f4e1 100644 --- a/lib/Conversion/TritonGPUToLLVM/ConvertLayoutOpToLLVM.cpp +++ b/lib/Conversion/TritonGPUToLLVM/ConvertLayoutOpToLLVM.cpp @@ -288,49 +288,54 @@ struct ConvertLayoutOpUsingLinearLayoutsConversion return rewriter.notifyMatchFailure( op, "NYI. srcTy and/or dstTy don't implement LLs yet"); } + LinearLayout srcLayout = + *toLinearLayout(srcTy.getShape(), srcTy.getEncoding()); + LinearLayout dstLayout = + *toLinearLayout(dstTy.getShape(), dstTy.getEncoding()); + + StringAttr kBlock = str_attr("block"); + StringAttr kWarp = str_attr("warp"); + StringAttr kLane = str_attr("lane"); + StringAttr kRegister = str_attr("register"); assert(to_vector(conversion->getInDimNames()) == to_vector(conversion->getOutDimNames())); auto dims = conversion->getInDimNames(); - if (llvm::is_contained(dims, str_attr("block"))) { + if (llvm::is_contained(dims, kBlock)) { // Case 1: Transfer between values in different CTAs. // This requires moving values through distributed shared memory. return rewriter.notifyMatchFailure( op, "NYI: Transfer between different CTAs"); - } else if (llvm::is_contained(dims, str_attr("warp"))) { + } else if (llvm::is_contained(dims, kWarp)) { // Case 2: Transfer between values in the same CTA, in which case we move // values through shared memory. - LinearLayout srcLayout = - *toLinearLayout(srcTy.getShape(), srcTy.getEncoding()); - LinearLayout dstLayout = - *toLinearLayout(dstTy.getShape(), dstTy.getEncoding()); return transferWithinBlock(op, srcLayout, dstLayout, adaptor, rewriter); - } else if (llvm::is_contained(dims, str_attr("lane"))) { + } else if (llvm::is_contained(dims, kLane)) { // Case 3. 
Transfer between values in the same warp, in which case we try // to move values using warp shuffles, though if the pattern is // complicated enough we may fall back to using shared memory // TODO(Keren): implement warp shuffle instead of using the general // approach that uses shared memory - LinearLayout srcLayout = - *toLinearLayout(srcTy.getShape(), srcTy.getEncoding()); - LinearLayout dstLayout = - *toLinearLayout(dstTy.getShape(), dstTy.getEncoding()); return transferWithinBlock(op, srcLayout, dstLayout, adaptor, rewriter); - } else if (llvm::is_contained(dims, str_attr("register"))) { + } else if (llvm::is_contained(dims, kRegister) || + dstLayout.getInDimSize(kRegister) != + srcLayout.getInDimSize(kRegister)) { // Case 4. Transfer between values in the same thread, in which case we // simply reorder the elements of adaptor.getSrc(). - return transferWithinThread(op, *conversion, adaptor, rewriter); + return transferWithinThread( + op, dstLayout.getFreeVariableMasks()[kRegister], + dstLayout.getInDimSize(kRegister), *conversion, adaptor, rewriter); } else { - // The two layouts are equivalent. We should probably remove these in - // RemoveLayoutConversion. + // Cast 5. The two layouts are equivalent. We should probably remove + // these in RemoveLayoutConversion. rewriter.replaceOp(op, adaptor.getSrc()); return success(); } } LogicalResult - transferWithinThread(ConvertLayoutOp op, const LinearLayout &conversion, - OpAdaptor adaptor, + transferWithinThread(ConvertLayoutOp op, int32_t regMasks, int32_t numRegs, + const LinearLayout &conversion, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const { MLIRContext *ctx = op.getContext(); auto loc = op.getLoc(); @@ -338,10 +343,16 @@ struct ConvertLayoutOpUsingLinearLayoutsConversion assert(!cvtNeedsSharedMemory(op.getSrc().getType(), op.getType())); auto inVals = unpackLLElements(loc, adaptor.getSrc(), rewriter); - SmallVector outVals; - outVals.resize(conversion.getInDimSize(kRegister)); - for (int i = 0; i < conversion.getInDimSize(kRegister); i++) { - auto srcIdx = conversion.apply({{kRegister, i}}).begin()->second; + SmallVector outVals(numRegs); + for (int i = 0; i < outVals.size(); i++) { + // Remove free masks from the register index + // For example, if idx = 0b00111, and masks = 0b00100, then we get + // 0b00011. It means that register 7 (0b111) has the same value as + // register 3 (0b011). + auto idx = i & (~regMasks); + auto srcIdx = conversion.hasInDim(kRegister) + ? 
conversion.apply({{kRegister, idx}}).begin()->second + : idx; outVals[i] = inVals[srcIdx]; } Value result = packLLElements(loc, getTypeConverter(), outVals, rewriter, diff --git a/lib/Tools/LinearLayout.cpp b/lib/Tools/LinearLayout.cpp index bf017f8c64..4319d1f086 100644 --- a/lib/Tools/LinearLayout.cpp +++ b/lib/Tools/LinearLayout.cpp @@ -1016,6 +1016,21 @@ bool LinearLayout::equalIgnoringOutDimSizes(const LinearLayout &other) const { return true; } +LinearLayout LinearLayout::resize(StringAttr inDim, + int32_t newInDimSize) const { + BasesT bases = getBases(); + assert(bases.contains(inDim) && "inDim not in layout"); + assert(llvm::isPowerOf2_32(newInDimSize) && + "newInDimSize must be a power of 2"); + assert(newInDimSize >= getInDimSize(inDim) && + "newInDimSize must be >= old size"); + auto numFreeVariables = llvm::Log2_32(newInDimSize) - getInDimSizeLog2(inDim); + for (int i = 0; i < numFreeVariables; i++) { + bases[inDim].push_back(std::vector(getNumOutDims(), 0)); + } + return LinearLayout(std::move(bases), llvm::to_vector(getOutDimNames())); +} + std::string LinearLayout::toString() const { // Start with a newline because we print out a bulleted list; it doesn't // make sense for the first line of this list to be on the same line as diff --git a/test/Conversion/tritongpu_to_llvm.mlir b/test/Conversion/tritongpu_to_llvm.mlir index e1a2ec68bd..4a61ee4bc1 100644 --- a/test/Conversion/tritongpu_to_llvm.mlir +++ b/test/Conversion/tritongpu_to_llvm.mlir @@ -847,6 +847,80 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : // ----- +#mma = #triton_gpu.nvidia_mma<{versionMajor = 2, warpsPerCTA = [1, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [0, 1], instrShape = [16, 8]}> +#dot1 = #triton_gpu.dot_op<{opIdx=0, parent=#mma, kWidth=2}> +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 : i32} { + // CHECK-LABEL: convert_layout_mmav2_dot_reg + tt.func @convert_layout_mmav2_dot_reg(%arg0: tensor<16x16xf16, #mma>) { + // CHECK-NOT: st.shared + // CHECK-NOT: llvm.load + %0 = triton_gpu.convert_layout %arg0 : tensor<16x16xf16, #mma> -> tensor<16x16xf16, #dot1> + tt.return + } +} + +// ----- + +#mma0 = #triton_gpu.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 64, 16]}> +#mma1 = #triton_gpu.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 128, 16]}> + +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32} { + // CHECK-LABEL: convert_layout_mmav3_mmav3_0 + tt.func @convert_layout_mmav3_mmav3_0(%arg0: tensor<64x64xf16, #mma0>) { + // CHECK-NOT: st.shared + // CHECK-NOT: llvm.load + %0 = triton_gpu.convert_layout %arg0 : tensor<64x64xf16, #mma0> -> tensor<64x64xf16, #mma1> + tt.return + } +} + +// ----- + +#mma0 = #triton_gpu.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 64, 16]}> +#mma1 = #triton_gpu.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 128, 16]}> + +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32} { + // CHECK-LABEL: convert_layout_mmav3_mmav3_1 + tt.func @convert_layout_mmav3_mmav3_1(%arg0: tensor<64x64xf16, #mma1>) { + // CHECK-NOT: st.shared + // CHECK-NOT: llvm.load + %0 = triton_gpu.convert_layout %arg0 : tensor<64x64xf16, #mma1> -> tensor<64x64xf16, #mma0> + tt.return + } +} + +// ----- + +#mma0 = #triton_gpu.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [4, 1], 
instrShape = [16, 64, 16]}> +#mma1 = #triton_gpu.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 128, 16]}> + +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32} { + // CHECK-LABEL: convert_layout_mmav3_mmav3_2 + tt.func @convert_layout_mmav3_mmav3_2(%arg0: tensor<16x16xf16, #mma1>) { + // CHECK-NOT: st.shared + // CHECK-NOT: llvm.load + %0 = triton_gpu.convert_layout %arg0 : tensor<16x16xf16, #mma1> -> tensor<16x16xf16, #mma0> + tt.return + } +} + +// ----- + +#mma0 = #triton_gpu.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 64, 16]}> +#mma1 = #triton_gpu.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 128, 16]}> + +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32} { + // CHECK-LABEL: convert_layout_mmav3_mmav3_3 + tt.func @convert_layout_mmav3_mmav3_3(%arg0: tensor<1x64xf16, #mma1>) { + // CHECK-NOT: st.shared + // CHECK-NOT: llvm.load + %0 = triton_gpu.convert_layout %arg0 : tensor<1x64xf16, #mma1> -> tensor<1x64xf16, #mma0> + tt.return + } +} + +// ----- + #blocked = #triton_gpu.blocked<{sizePerThread = [16, 1], threadsPerWarp = [8, 4], warpsPerCTA = [1, 8], order = [0, 1]}> #mma = #triton_gpu.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [8, 1], instrShape = [16, 256, 32]}> module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32} { diff --git a/unittest/Tools/LinearLayoutTest.cpp b/unittest/Tools/LinearLayoutTest.cpp index f006447002..897172fd6d 100644 --- a/unittest/Tools/LinearLayoutTest.cpp +++ b/unittest/Tools/LinearLayoutTest.cpp @@ -747,6 +747,39 @@ TEST_F(LinearLayoutTest, QuotientIdentityMultipleDimensions) { ASSERT_TRUE(quotientLayout->quotient({S("dim2")}).has_value()); } +TEST_F(LinearLayoutTest, Resize) { + auto init = LinearLayout( + { + {S("in0"), {{0, 1}, {0, 2}}}, + {S("in1"), {{1, 0}, {2, 0}}}, + {S("in2"), {}}, + }, + {S("dim0"), S("dim1")}); + EXPECT_EQ(init.resize(S("in0"), 8), + LinearLayout( + { + {S("in0"), {{0, 1}, {0, 2}, {0, 0}}}, + {S("in1"), {{1, 0}, {2, 0}}}, + {S("in2"), {}}, + }, + {S("dim0"), S("dim1")})); + EXPECT_EQ(init.resize(S("in0"), 4), LinearLayout( + { + {S("in0"), {{0, 1}, {0, 2}}}, + {S("in1"), {{1, 0}, {2, 0}}}, + {S("in2"), {}}, + }, + {S("dim0"), S("dim1")})); + EXPECT_EQ(init.resize(S("in1"), 8), + LinearLayout( + { + {S("in0"), {{0, 1}, {0, 2}}}, + {S("in1"), {{1, 0}, {2, 0}, {0, 0}}}, + {S("in2"), {}}, + }, + {S("dim0"), S("dim1")})); +} + } // anonymous namespace } // namespace mlir::triton From 1918084cd4ce22c21476fd0366a40370f4b757ec Mon Sep 17 00:00:00 2001 From: David Berard Date: Fri, 25 Oct 2024 13:13:24 -0700 Subject: [PATCH 3/5] [TEST] float16 test for test_tensor_atomic_rmw (#4981) This adds float16 to the list of dtypes tested in test_tensor_atomic_rmw. Note that the numerics were previously bad for this test when run in float16; this PR "fixes" the numerics by internally doing the sum in float32 (upcast, sum, downcast). Since the purpose is to test the atomic_rmw, and the numerical issues of doing sums in low-precision dtypes are generally known, I think this strategy should be fine for this test.
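To see the numerical issue outside of Triton, here is a small NumPy-only sketch (not part of the patch) comparing a float16 accumulation with the upcast-sum-downcast strategy the test now uses on both the kernel side and the reference side:

```python
# Standalone sketch of why the sum is done in float32 for float16 inputs.
import numpy as np

rs = np.random.RandomState(17)
x = rs.uniform(-1, 1, size=(64, 64)).astype(np.float16)

sum_fp16 = x.sum(axis=0, dtype=np.float16)                      # accumulate in float16
sum_fp32 = x.astype(np.float32).sum(axis=0).astype(np.float16)  # upcast, sum, downcast

# The two reductions typically disagree in the low bits; doing the sum in
# float32 in both the kernel and the NumPy reference keeps them consistent
# enough for the rtol=1e-4 comparison in the test.
print(np.max(np.abs(sum_fp16.astype(np.float32) - sum_fp32.astype(np.float32))))
```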
--- python/test/unit/language/test_core.py | 30 ++++++++++++++++++++++++++---- 1 file changed, 26 insertions(+), 4 deletions(-) diff --git a/python/test/unit/language/test_core.py b/python/test/unit/language/test_core.py index e044198046..182445836e 100644 --- a/python/test/unit/language/test_core.py +++ b/python/test/unit/language/test_core.py @@ -1453,17 +1453,29 @@ def kernel(X): for shape in [(2, 2), (2, 8), (8, 2), (8, 8), (32, 32), (64, 64)] for axis in [0, 1] for num_ctas in num_ctas_list - for dtype_x_str in ['float32', 'uint64', 'int64', 'float64']]) + for dtype_x_str in ['float16', 'float32', 'uint64', 'int64', 'float64']]) def test_tensor_atomic_rmw(shape, axis, num_ctas, dtype_x_str, device): + if is_interpreter() and dtype_x_str == 'float16': + pytest.skip('float16 atomic_add does not work in the interpreter mode') shape0, shape1 = shape # triton kernel @triton.jit - def kernel(Z, X, OLD, AXIS: tl.constexpr, SHAPE0: tl.constexpr, SHAPE1: tl.constexpr): + def kernel(Z, X, OLD, AXIS: tl.constexpr, SHAPE0: tl.constexpr, SHAPE1: tl.constexpr, DTYPE: tl.constexpr): off0 = tl.arange(0, SHAPE0) off1 = tl.arange(0, SHAPE1) x = tl.load(X + off0[:, None] * SHAPE1 + off1[None, :]) + + if DTYPE == tl.float16: + # sum can have bad numerics when accumulating in float16. + # if we're dealing with float16, do the sum in float32. + x = x.to(tl.float32) + z = tl.sum(x, axis=AXIS) + + if DTYPE == tl.float16: + z = z.to(DTYPE) + if AXIS == 1: old = tl.atomic_add(Z + off0, z) tl.store(OLD + off0, old) @@ -1477,13 +1489,23 @@ def kernel(Z, X, OLD, AXIS: tl.constexpr, SHAPE0: tl.constexpr, SHAPE1: tl.const z = numpy_random(z_shape, dtype_str=dtype_x_str, rs=rs) old = np.zeros(z_shape, dtype=getattr(np, dtype_x_str)) # reference results - z_ref = z + np.sum(x, axis=axis, keepdims=False) + if x.dtype == np.float16: + # do the sum in float32 to reduce numerical variation + z_ref = z + np.sum(x.astype(np.float32), axis=axis, keepdims=False).astype(x.dtype) + else: + z_ref = z + np.sum(x, axis=axis, keepdims=False) old_ref = np.copy(z) # triton result x_tri = to_triton(x, device=device) z_tri = to_triton(z, device=device) old_tri = to_triton(old, device=device) - kernel[(1, )](z_tri, x_tri, old_tri, axis, shape0, shape1, num_ctas=num_ctas) + + def torch_to_triton_dtype(t): + if t == torch.float16: + return tl.float16 + return None + + kernel[(1, )](z_tri, x_tri, old_tri, axis, shape0, shape1, torch_to_triton_dtype(x_tri.dtype), num_ctas=num_ctas) np.testing.assert_allclose(z_ref, to_numpy(z_tri), rtol=1e-4) np.testing.assert_equal(old_ref, to_numpy(old_tri)) From 78c8054298a81f578dcd8c79b519981c57dfb665 Mon Sep 17 00:00:00 2001 From: Ilya V <152324710+joviliast@users.noreply.github.com> Date: Sun, 27 Oct 2024 02:57:52 +0200 Subject: [PATCH 4/5] [AMD] Emit vectorized 16-bit float LLVM atomic ops (#4925) In the case of 16-bit float operands for tt::AtomicRMWOp, construct only one LLVM::AtomicRMWOp but use a vector of elements. This approach allows generating packed intrinsics and processing two elements at once. Added a lit test for the f16 vectorized case.
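For reference, the Triton-level pattern this lowering targets looks roughly like the hypothetical kernel below (illustrative only, not taken from the patch): with a blocked layout that gives each thread two contiguous f16 elements, the masked `tl.atomic_add` is the kind of op that can now be emitted as a single `llvm.atomicrmw fadd` on a `vector<2xf16>` instead of two scalar atomics.

```python
# Hypothetical example kernel; the lit tests added below exercise the same
# lowering directly at the TritonGPU-to-LLVM level.
import triton
import triton.language as tl

@triton.jit
def atomic_add_f16_kernel(out_ptr, x_ptr, n, BLOCK: tl.constexpr):
    offs = tl.program_id(0) * BLOCK + tl.arange(0, BLOCK)
    mask = offs < n
    x = tl.load(x_ptr + offs, mask=mask)
    # With two contiguous f16 values per thread, this atomic add is a candidate
    # for the packed vector<2xf16> atomicrmw on AMD hardware.
    tl.atomic_add(out_ptr + offs, x, mask=mask)
```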
--- test/Conversion/amd/tritongpu_to_llvm.mlir | 32 +++++++++++++++ .../TritonAMDGPUToLLVM/LoadStoreOpToLLVM.cpp | 41 +++++++++---------- 2 files changed, 52 insertions(+), 21 deletions(-) diff --git a/test/Conversion/amd/tritongpu_to_llvm.mlir b/test/Conversion/amd/tritongpu_to_llvm.mlir index ef67338457..de0eb140e2 100644 --- a/test/Conversion/amd/tritongpu_to_llvm.mlir +++ b/test/Conversion/amd/tritongpu_to_llvm.mlir @@ -62,3 +62,35 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : tt.return } } + +// ----- + +#blocked1 = #triton_gpu.blocked<{sizePerThread = [2], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}> +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32} { + // CHECK-LABEL: atomic_add_f16 + tt.func @atomic_add_f16(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1 : tensor<256xi1, #blocked1>, %arg2 : tensor<256xf16, #blocked1>) { + %range = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #blocked1> + %base_ptr = tt.splat %arg0 : !tt.ptr -> tensor<256x!tt.ptr, #blocked1> + %ptr = tt.addptr %base_ptr, %range : tensor<256x!tt.ptr, #blocked1>, tensor<256xi32, #blocked1> + // CHECK: llvm.cond_br + // CHECK: llvm.atomicrmw fadd {{.*}} vector<2xf16> + %0 = tt.atomic_rmw fadd, relaxed, gpu, %ptr, %arg2, %arg1 : (tensor<256x!tt.ptr, #blocked1>, tensor<256xf16, #blocked1>, tensor<256xi1, #blocked1>) -> tensor<256xf16, #blocked1> + tt.return + } +} + +// ----- + +#blocked2 = #triton_gpu.blocked<{sizePerThread = [2], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}> +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32} { + // CHECK-LABEL: atomic_add_bf16 + tt.func @atomic_add_bf16(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1 : tensor<256xi1, #blocked2>, %arg2 : tensor<256xbf16, #blocked2>) { + %range = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #blocked2> + %base_ptr = tt.splat %arg0 : !tt.ptr -> tensor<256x!tt.ptr, #blocked2> + %ptr = tt.addptr %base_ptr, %range : tensor<256x!tt.ptr, #blocked2>, tensor<256xi32, #blocked2> + // CHECK: llvm.cond_br + // CHECK: llvm.atomicrmw fadd {{.*}} vector<2xbf16> + %0 = tt.atomic_rmw fadd, relaxed, gpu, %ptr, %arg2, %arg1 : (tensor<256x!tt.ptr, #blocked2>, tensor<256xbf16, #blocked2>, tensor<256xi1, #blocked2>) -> tensor<256xbf16, #blocked2> + tt.return + } +} diff --git a/third_party/amd/lib/TritonAMDGPUToLLVM/LoadStoreOpToLLVM.cpp b/third_party/amd/lib/TritonAMDGPUToLLVM/LoadStoreOpToLLVM.cpp index a45efd4a79..5265f631ad 100644 --- a/third_party/amd/lib/TritonAMDGPUToLLVM/LoadStoreOpToLLVM.cpp +++ b/third_party/amd/lib/TritonAMDGPUToLLVM/LoadStoreOpToLLVM.cpp @@ -768,7 +768,11 @@ struct AtomicRMWOpConversion // tensor if (tensorTy) { auto valTy = cast(val.getType()); - vec = std::min(vec, valTy.getElementType().isF16() ? 2 : 1); + Type elTy = valTy.getElementType(); + vec = std::min(vec, llvm::isa(elTy) && + elTy.getIntOrFloatBitWidth() == 16 + ? 2 + : 1); // mask numElems = tensorTy.getNumElements(); } @@ -783,13 +787,22 @@ struct AtomicRMWOpConversion auto vecTy = vec_ty(valueElemTy, vec); auto retType = vec == 1 ? valueElemTy : vecTy; SmallVector resultVals(elemsPerThread); - const bool f16v2 = vec == 2 && valueElemTy.isF16(); for (size_t i = 0; i < elemsPerThread; i += vec) { Value rmwPtr = ptrElements[i]; // TODO: in case llMask is zero we can create only one branch for all // elemsPerThread. Value rmwMask = llMask ? 
and_(mask, maskElements[i]) : mask; + Value operand; + if (vec == 1) { + operand = valElements[i]; + } else { + operand = undef(vecTy); + for (size_t ii = 0; ii < vec; ++ii) + operand = + insert_element(vecTy, operand, valElements[i + ii], i32_val(ii)); + } + Value undefVal = undef(retType); // Build blocks to bypass the atomic instruction for ~rmwMask. auto *curBlock = rewriter.getInsertionBlock(); @@ -806,25 +819,11 @@ struct AtomicRMWOpConversion auto maybeKind = matchAtomicOp(atomicRmwAttr); // TODO: use rocdl.raw.buffer.atomic from ROCDL dialect to use efficient // atomics for MI-* series of AMD GPU. - Value atom = rewriter - .create( - loc, *maybeKind, rmwPtr, valElements[i], - atomicMemOrdering, StringRef("agent")) - .getResult(); - - // NV for the f16v2 case generates one packed instruction. We have to - // create two separate instructions since LLVM::AtomicRMWOp doesn't - // support this. Can be optimized out with rocdl.raw.buffer.atomic. - if (f16v2) { - Value atom2 = - rewriter - .create( - loc, *maybeKind, ptrElements[i + 1], valElements[i + 1], - atomicMemOrdering, StringRef("agent")) - .getResult(); - auto tmp = insert_element(vecTy, undef(vecTy), atom, i32_val(0)); - atom = insert_element(vecTy, tmp, atom2, i32_val(1)).getResult(); - } + Value atom = + rewriter + .create(loc, *maybeKind, rmwPtr, operand, + atomicMemOrdering, StringRef("agent")) + .getResult(); if (!tensorTy) { if (atomicNeedsSharedMemory(op.getResult())) { Value atomPtr = From 77f98f00658988d59626b50bf946159b3a3ad5f3 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Thu, 31 Oct 2024 14:00:25 +0000 Subject: [PATCH 5/5] Revert "[BACKEND] Improve detection of register to register conversion (#4991)" This reverts commit 15c5e5519ff129104956ef0a726656f9e5cb4359. --- include/triton/Analysis/Utility.h | 2 +- include/triton/Tools/LinearLayout.h | 7 -- lib/Analysis/Utility.cpp | 42 +---------- .../TritonGPUToLLVM/ConvertLayoutOpToLLVM.cpp | 53 ++++++------- lib/Tools/LinearLayout.cpp | 15 ---- test/Conversion/tritongpu_to_llvm.mlir | 74 ------------------- unittest/Tools/LinearLayoutTest.cpp | 33 --------- 7 files changed, 24 insertions(+), 202 deletions(-) diff --git a/include/triton/Analysis/Utility.h b/include/triton/Analysis/Utility.h index 4f6aff739c..cb3e3d292e 100644 --- a/include/triton/Analysis/Utility.h +++ b/include/triton/Analysis/Utility.h @@ -212,7 +212,7 @@ bool cvtNeedsSharedMemory(RankedTensorType srcTy, RankedTensorType dstTy); bool atomicNeedsSharedMemory(Value result); -bool isBlockedToDotShortcut(RankedTensorType srcTy, RankedTensorType dstTy); +bool isBlockedToDotShortcut(RankedTensorType &srcTy, RankedTensorType &dstT); bool isMfmaToDotShortcut(RankedTensorType srcTy, RankedTensorType dstTy); diff --git a/include/triton/Tools/LinearLayout.h b/include/triton/Tools/LinearLayout.h index 47e3fca79b..c728cfbb32 100644 --- a/include/triton/Tools/LinearLayout.h +++ b/include/triton/Tools/LinearLayout.h @@ -679,13 +679,6 @@ class LinearLayout { // (i.e. every input bit affects the output). llvm::MapVector getFreeVariableMasks() const; - // Increase an input dimension without affecting the output dimension. The - // added free variables are mapped to 0, ensuring that the new input - // dimensions correspond directly to the existing output space. The function - // errors out if `newInDimSize` is less than the current size or the new size - // is not a power of 2. 
- LinearLayout resize(StringAttr inDim, int32_t newInDimSize) const; - std::string toString() const; friend bool operator==(LinearLayout lhs, LinearLayout rhs); diff --git a/lib/Analysis/Utility.cpp b/lib/Analysis/Utility.cpp index 339377f3ca..f51b3c1657 100644 --- a/lib/Analysis/Utility.cpp +++ b/lib/Analysis/Utility.cpp @@ -543,7 +543,7 @@ bool supportMMA(Value value, int version) { (elemTy.isInteger(8) && version >= 2); } -bool isBlockedToDotShortcut(RankedTensorType srcTy, RankedTensorType dstTy) { +bool isBlockedToDotShortcut(RankedTensorType &srcTy, RankedTensorType &dstTy) { auto blockedLayout = dyn_cast(srcTy.getEncoding()); auto dotOperandLayout = dyn_cast(dstTy.getEncoding()); if (blockedLayout == nullptr || dotOperandLayout == nullptr) @@ -662,46 +662,8 @@ std::optional minimalCvtLayout(RankedTensorType srcTy, toLinearLayout(dstTy.getShape(), dstTy.getEncoding()); if (!(srcLayout.has_value() && dstLayout.has_value())) return std::nullopt; - StringAttr kRegister = StringAttr::get(ctx, "register"); - StringAttr kLane = StringAttr::get(ctx, "lane"); - StringAttr kWarp = StringAttr::get(ctx, "warp"); - StringAttr kBlock = StringAttr::get(ctx, "block"); - auto numSrcRegs = srcLayout->getInDimSize(kRegister); - auto numDstRegs = dstLayout->getInDimSize(kRegister); - // The `invertAndCompose` function will generate a layout that is injective - // by assigning new output dimensions to free variables. For instance, - // consider a scenario where `srcLayout` has a free variable in the lane - // dimension, while `dstLayout` has two free variables in the lane - // dimension and also a larger number of registers. - // The injective form of `srcLayout` will add only a single additional row - // to the transformation matrix, whereas the injective form of `dstLayout` - // will add two additional rows. This discrepancy causes misleading results - // because the matrices end up with a different number of rows. - // - // Take `dstLayout ⋅ srcLayout^-1` as an example: - // - // - `injective(dstLayout)`: [n, m] → [n + 2, m] - // - `injective(srcLayout)`: [n, m] → [n + 1, m] - // - `injective(srcLayout)^-1`: [n + 1, m] → [m, n + 1] - // - `injective(dstLayout) ⋅ injective(srcLayout)^-1`: [n + 2, m] ⋅ [m, n + - // 1] → [n + 2, n + 1] - // - // Here, the `(n + 1)`-th row added by `dstLayout` represents the free - // variable in registers, and the `(n + 2)`-th row represents the free - // variable in lanes. However, the `(n + 1)`-th row added by `srcLayout` - // represents the free variable in lanes. As a result, the `(n + 1)`-th row - // in two layouts do not correspond to the same free variable. - // - // To address this issue, we pad the free variables in `srcLayout` and - // `dstLayout` to ensure they have the same number of registers. This - // guarantees that the resulting matrices have the same number of rows, - // ensuring consistency in the composition process. - auto numRegs = std::max(numSrcRegs, numDstRegs); - auto srcLayoutWithFreeRegs = srcLayout->resize(kRegister, numRegs); - auto dstLayoutWithFreeRegs = dstLayout->resize(kRegister, numRegs); // comp describes the layout function to create dst from src. 
- LinearLayout comp = - dstLayoutWithFreeRegs.invertAndCompose(srcLayoutWithFreeRegs); + LinearLayout comp = dstLayout->invertAndCompose(*srcLayout); // We try to quotient by the largest subspace first auto dims = SmallVector{"block", "warp", "lane", "register"}; for (auto dim : dims) { diff --git a/lib/Conversion/TritonGPUToLLVM/ConvertLayoutOpToLLVM.cpp b/lib/Conversion/TritonGPUToLLVM/ConvertLayoutOpToLLVM.cpp index e19bc696d2..43c7095b21 100644 --- a/lib/Conversion/TritonGPUToLLVM/ConvertLayoutOpToLLVM.cpp +++ b/lib/Conversion/TritonGPUToLLVM/ConvertLayoutOpToLLVM.cpp @@ -288,54 +288,49 @@ struct ConvertLayoutOpUsingLinearLayoutsConversion return rewriter.notifyMatchFailure( op, "NYI. srcTy and/or dstTy don't implement LLs yet"); } - LinearLayout srcLayout = - *toLinearLayout(srcTy.getShape(), srcTy.getEncoding()); - LinearLayout dstLayout = - *toLinearLayout(dstTy.getShape(), dstTy.getEncoding()); - - StringAttr kBlock = str_attr("block"); - StringAttr kWarp = str_attr("warp"); - StringAttr kLane = str_attr("lane"); - StringAttr kRegister = str_attr("register"); assert(to_vector(conversion->getInDimNames()) == to_vector(conversion->getOutDimNames())); auto dims = conversion->getInDimNames(); - if (llvm::is_contained(dims, kBlock)) { + if (llvm::is_contained(dims, str_attr("block"))) { // Case 1: Transfer between values in different CTAs. // This requires moving values through distributed shared memory. return rewriter.notifyMatchFailure( op, "NYI: Transfer between different CTAs"); - } else if (llvm::is_contained(dims, kWarp)) { + } else if (llvm::is_contained(dims, str_attr("warp"))) { // Case 2: Transfer between values in the same CTA, in which case we move // values through shared memory. + LinearLayout srcLayout = + *toLinearLayout(srcTy.getShape(), srcTy.getEncoding()); + LinearLayout dstLayout = + *toLinearLayout(dstTy.getShape(), dstTy.getEncoding()); return transferWithinBlock(op, srcLayout, dstLayout, adaptor, rewriter); - } else if (llvm::is_contained(dims, kLane)) { + } else if (llvm::is_contained(dims, str_attr("lane"))) { // Case 3. Transfer between values in the same warp, in which case we try // to move values using warp shuffles, though if the pattern is // complicated enough we may fall back to using shared memory // TODO(Keren): implement warp shuffle instead of using the general // approach that uses shared memory + LinearLayout srcLayout = + *toLinearLayout(srcTy.getShape(), srcTy.getEncoding()); + LinearLayout dstLayout = + *toLinearLayout(dstTy.getShape(), dstTy.getEncoding()); return transferWithinBlock(op, srcLayout, dstLayout, adaptor, rewriter); - } else if (llvm::is_contained(dims, kRegister) || - dstLayout.getInDimSize(kRegister) != - srcLayout.getInDimSize(kRegister)) { + } else if (llvm::is_contained(dims, str_attr("register"))) { // Case 4. Transfer between values in the same thread, in which case we // simply reorder the elements of adaptor.getSrc(). - return transferWithinThread( - op, dstLayout.getFreeVariableMasks()[kRegister], - dstLayout.getInDimSize(kRegister), *conversion, adaptor, rewriter); + return transferWithinThread(op, *conversion, adaptor, rewriter); } else { - // Cast 5. The two layouts are equivalent. We should probably remove - // these in RemoveLayoutConversion. + // The two layouts are equivalent. We should probably remove these in + // RemoveLayoutConversion. 
rewriter.replaceOp(op, adaptor.getSrc()); return success(); } } LogicalResult - transferWithinThread(ConvertLayoutOp op, int32_t regMasks, int32_t numRegs, - const LinearLayout &conversion, OpAdaptor adaptor, + transferWithinThread(ConvertLayoutOp op, const LinearLayout &conversion, + OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const { MLIRContext *ctx = op.getContext(); auto loc = op.getLoc(); @@ -343,16 +338,10 @@ struct ConvertLayoutOpUsingLinearLayoutsConversion assert(!cvtNeedsSharedMemory(op.getSrc().getType(), op.getType())); auto inVals = unpackLLElements(loc, adaptor.getSrc(), rewriter); - SmallVector outVals(numRegs); - for (int i = 0; i < outVals.size(); i++) { - // Remove free masks from the register index - // For example, if idx = 0b00111, and masks = 0b00100, then we get - // 0b00011. It means that register 7 (0b111) has the same value as - // register 3 (0b011). - auto idx = i & (~regMasks); - auto srcIdx = conversion.hasInDim(kRegister) - ? conversion.apply({{kRegister, idx}}).begin()->second - : idx; + SmallVector outVals; + outVals.resize(conversion.getInDimSize(kRegister)); + for (int i = 0; i < conversion.getInDimSize(kRegister); i++) { + auto srcIdx = conversion.apply({{kRegister, i}}).begin()->second; outVals[i] = inVals[srcIdx]; } Value result = packLLElements(loc, getTypeConverter(), outVals, rewriter, diff --git a/lib/Tools/LinearLayout.cpp b/lib/Tools/LinearLayout.cpp index 4319d1f086..bf017f8c64 100644 --- a/lib/Tools/LinearLayout.cpp +++ b/lib/Tools/LinearLayout.cpp @@ -1016,21 +1016,6 @@ bool LinearLayout::equalIgnoringOutDimSizes(const LinearLayout &other) const { return true; } -LinearLayout LinearLayout::resize(StringAttr inDim, - int32_t newInDimSize) const { - BasesT bases = getBases(); - assert(bases.contains(inDim) && "inDim not in layout"); - assert(llvm::isPowerOf2_32(newInDimSize) && - "newInDimSize must be a power of 2"); - assert(newInDimSize >= getInDimSize(inDim) && - "newInDimSize must be >= old size"); - auto numFreeVariables = llvm::Log2_32(newInDimSize) - getInDimSizeLog2(inDim); - for (int i = 0; i < numFreeVariables; i++) { - bases[inDim].push_back(std::vector(getNumOutDims(), 0)); - } - return LinearLayout(std::move(bases), llvm::to_vector(getOutDimNames())); -} - std::string LinearLayout::toString() const { // Start with a newline because we print out a bulleted list; it doesn't // make sense for the first line of this list to be on the same line as diff --git a/test/Conversion/tritongpu_to_llvm.mlir b/test/Conversion/tritongpu_to_llvm.mlir index 4a61ee4bc1..e1a2ec68bd 100644 --- a/test/Conversion/tritongpu_to_llvm.mlir +++ b/test/Conversion/tritongpu_to_llvm.mlir @@ -847,80 +847,6 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : // ----- -#mma = #triton_gpu.nvidia_mma<{versionMajor = 2, warpsPerCTA = [1, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [0, 1], instrShape = [16, 8]}> -#dot1 = #triton_gpu.dot_op<{opIdx=0, parent=#mma, kWidth=2}> -module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 : i32} { - // CHECK-LABEL: convert_layout_mmav2_dot_reg - tt.func @convert_layout_mmav2_dot_reg(%arg0: tensor<16x16xf16, #mma>) { - // CHECK-NOT: st.shared - // CHECK-NOT: llvm.load - %0 = triton_gpu.convert_layout %arg0 : tensor<16x16xf16, #mma> -> tensor<16x16xf16, #dot1> - tt.return - } -} - -// ----- - -#mma0 = #triton_gpu.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 64, 16]}> -#mma1 = 
#triton_gpu.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 128, 16]}> - -module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32} { - // CHECK-LABEL: convert_layout_mmav3_mmav3_0 - tt.func @convert_layout_mmav3_mmav3_0(%arg0: tensor<64x64xf16, #mma0>) { - // CHECK-NOT: st.shared - // CHECK-NOT: llvm.load - %0 = triton_gpu.convert_layout %arg0 : tensor<64x64xf16, #mma0> -> tensor<64x64xf16, #mma1> - tt.return - } -} - -// ----- - -#mma0 = #triton_gpu.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 64, 16]}> -#mma1 = #triton_gpu.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 128, 16]}> - -module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32} { - // CHECK-LABEL: convert_layout_mmav3_mmav3_1 - tt.func @convert_layout_mmav3_mmav3_1(%arg0: tensor<64x64xf16, #mma1>) { - // CHECK-NOT: st.shared - // CHECK-NOT: llvm.load - %0 = triton_gpu.convert_layout %arg0 : tensor<64x64xf16, #mma1> -> tensor<64x64xf16, #mma0> - tt.return - } -} - -// ----- - -#mma0 = #triton_gpu.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 64, 16]}> -#mma1 = #triton_gpu.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 128, 16]}> - -module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32} { - // CHECK-LABEL: convert_layout_mmav3_mmav3_2 - tt.func @convert_layout_mmav3_mmav3_2(%arg0: tensor<16x16xf16, #mma1>) { - // CHECK-NOT: st.shared - // CHECK-NOT: llvm.load - %0 = triton_gpu.convert_layout %arg0 : tensor<16x16xf16, #mma1> -> tensor<16x16xf16, #mma0> - tt.return - } -} - -// ----- - -#mma0 = #triton_gpu.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 64, 16]}> -#mma1 = #triton_gpu.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 128, 16]}> - -module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32} { - // CHECK-LABEL: convert_layout_mmav3_mmav3_3 - tt.func @convert_layout_mmav3_mmav3_3(%arg0: tensor<1x64xf16, #mma1>) { - // CHECK-NOT: st.shared - // CHECK-NOT: llvm.load - %0 = triton_gpu.convert_layout %arg0 : tensor<1x64xf16, #mma1> -> tensor<1x64xf16, #mma0> - tt.return - } -} - -// ----- - #blocked = #triton_gpu.blocked<{sizePerThread = [16, 1], threadsPerWarp = [8, 4], warpsPerCTA = [1, 8], order = [0, 1]}> #mma = #triton_gpu.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [8, 1], instrShape = [16, 256, 32]}> module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32} { diff --git a/unittest/Tools/LinearLayoutTest.cpp b/unittest/Tools/LinearLayoutTest.cpp index 897172fd6d..f006447002 100644 --- a/unittest/Tools/LinearLayoutTest.cpp +++ b/unittest/Tools/LinearLayoutTest.cpp @@ -747,39 +747,6 @@ TEST_F(LinearLayoutTest, QuotientIdentityMultipleDimensions) { ASSERT_TRUE(quotientLayout->quotient({S("dim2")}).has_value()); } -TEST_F(LinearLayoutTest, Resize) { - auto init = LinearLayout( - { - {S("in0"), {{0, 1}, {0, 2}}}, - {S("in1"), {{1, 0}, {2, 0}}}, - {S("in2"), {}}, - }, - {S("dim0"), S("dim1")}); - EXPECT_EQ(init.resize(S("in0"), 8), - LinearLayout( - { - {S("in0"), {{0, 1}, {0, 2}, {0, 0}}}, - {S("in1"), {{1, 0}, {2, 0}}}, - {S("in2"), {}}, - }, - {S("dim0"), S("dim1")})); - EXPECT_EQ(init.resize(S("in0"), 4), LinearLayout( - { - {S("in0"), {{0, 1}, {0, 2}}}, - {S("in1"), {{1, 0}, {2, 
0}}}, - {S("in2"), {}}, - }, - {S("dim0"), S("dim1")})); - EXPECT_EQ(init.resize(S("in1"), 8), - LinearLayout( - { - {S("in0"), {{0, 1}, {0, 2}}}, - {S("in1"), {{1, 0}, {2, 0}, {0, 0}}}, - {S("in2"), {}}, - }, - {S("dim0"), S("dim1")})); -} - } // anonymous namespace } // namespace mlir::triton