From 26cde91ee56770d023253c7faf2e2055cd5be9e6 Mon Sep 17 00:00:00 2001
From: Shucai Xiao
Date: Mon, 24 Nov 2025 11:30:37 -0600
Subject: [PATCH 1/3] refine the lds order for transposed input tensor

---
 .../Transforms/ReduceDataDuplication.cpp |  7 +++++++
 test/Conversion/amd/mfma-shortcut.mlir   | 16 ++++++++++++++++
 2 files changed, 23 insertions(+)

diff --git a/lib/Dialect/TritonGPU/Transforms/ReduceDataDuplication.cpp b/lib/Dialect/TritonGPU/Transforms/ReduceDataDuplication.cpp
index deec43f1161c..7640fa61d762 100644
--- a/lib/Dialect/TritonGPU/Transforms/ReduceDataDuplication.cpp
+++ b/lib/Dialect/TritonGPU/Transforms/ReduceDataDuplication.cpp
@@ -45,6 +45,13 @@ class TritonGPUReduceDataDuplicationPass
       if (!cvtNeedsSharedMemory(srcType, dstType))
         return;
       auto order = getOrderForMemory(srcType);
+      auto inputOp = cvtOp.getSrc().getDefiningOp();
+      // if input of convert_layout is transOp, actual order is the order of the transOp input.
+      // By setting lds order to be the same as input, ds_write is more efficient
+      if (auto transOp = dyn_cast<triton::TransOp>(inputOp)) {
+        order = getOrderForMemory(cast<RankedTensorType>(transOp.getSrc().getType()));
+      }
+
       auto sharedMemorySpace =
           triton::gpu::SharedMemorySpaceAttr::get(srcType.getContext());
       auto tmpType = triton::gpu::MemDescType::get(
diff --git a/test/Conversion/amd/mfma-shortcut.mlir b/test/Conversion/amd/mfma-shortcut.mlir
index 4a496459d0cb..24b5f13a85e7 100644
--- a/test/Conversion/amd/mfma-shortcut.mlir
+++ b/test/Conversion/amd/mfma-shortcut.mlir
@@ -108,3 +108,19 @@ module attributes {"ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 64 : i32}
     tt.return
   }
 }
+
+
+// -----
+
+#mma = #ttg.amd_mfma<{version = 4, warpsPerCTA = [8, 1], instrShape = [16, 16], isTransposed = true}>
+#linear = #ttg.linear<{register = [[1, 0], [2, 0], [16, 0]], lane = [[0, 1], [0, 2], [0, 4], [0, 8], [4, 0], [8, 0]], warp = [[0, 16], [0, 32], [0, 64]], block = []}>
+module attributes {"ttg.num-warps" = 8 : i32, "ttg.threads-per-warp" = 64 : i32} {
+  // GFX950-LABEL: mfma_dotop_lds_layout_order
+  tt.func public @mfma_dotop_lds_layout_order(%arg0: tensor<128x32xbf16, #mma>) {
+    %1 = tt.trans %arg0 {order = array<i32: 1, 0>} : tensor<128x32xbf16, #mma> -> tensor<32x128xbf16, #linear>
+    // GFX950-COUNT-2: llvm.store
+    // GFX950-COUNT-8: lvm.call_intrinsic "rocdl.ds.read.tr16.b64"
+    %2 = ttg.convert_layout %1 : tensor<32x128xbf16, #linear> -> tensor<32x128xbf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 8}>>
+    tt.return
+  }
+}

From 5ff2dd01da784ef62a54b1fa066113c57a2ff4d9 Mon Sep 17 00:00:00 2001
From: Shucai Xiao
Date: Mon, 24 Nov 2025 11:31:54 -0600
Subject: [PATCH 2/3] clang format

---
 .../TritonGPU/Transforms/ReduceDataDuplication.cpp | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/lib/Dialect/TritonGPU/Transforms/ReduceDataDuplication.cpp b/lib/Dialect/TritonGPU/Transforms/ReduceDataDuplication.cpp
index 7640fa61d762..88e01a2f9829 100644
--- a/lib/Dialect/TritonGPU/Transforms/ReduceDataDuplication.cpp
+++ b/lib/Dialect/TritonGPU/Transforms/ReduceDataDuplication.cpp
@@ -46,10 +46,12 @@ class TritonGPUReduceDataDuplicationPass
         return;
       auto order = getOrderForMemory(srcType);
       auto inputOp = cvtOp.getSrc().getDefiningOp();
-      // if input of convert_layout is transOp, actual order is the order of the transOp input.
-      // By setting lds order to be the same as input, ds_write is more efficient
+      // if input of convert_layout is transOp, actual order is the order of
+      // the transOp input. By setting lds order to be the same as input,
+      // ds_write is more efficient
       if (auto transOp = dyn_cast<triton::TransOp>(inputOp)) {
-        order = getOrderForMemory(cast<RankedTensorType>(transOp.getSrc().getType()));
+        order = getOrderForMemory(
+            cast<RankedTensorType>(transOp.getSrc().getType()));
       }
 
       auto sharedMemorySpace =

From 526b9d73f0e5f66a1f0fea0d939eb615e516e152 Mon Sep 17 00:00:00 2001
From: Shucai Xiao
Date: Mon, 24 Nov 2025 14:12:38 -0600
Subject: [PATCH 3/3] fix lit tests and clang format

---
 .../TritonGPU/Transforms/ReduceDataDuplication.cpp | 8 +++++---
 test/Conversion/amd/mfma-shortcut.mlir             | 2 +-
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/lib/Dialect/TritonGPU/Transforms/ReduceDataDuplication.cpp b/lib/Dialect/TritonGPU/Transforms/ReduceDataDuplication.cpp
index 88e01a2f9829..e49e0b7bed2a 100644
--- a/lib/Dialect/TritonGPU/Transforms/ReduceDataDuplication.cpp
+++ b/lib/Dialect/TritonGPU/Transforms/ReduceDataDuplication.cpp
@@ -49,9 +49,11 @@ class TritonGPUReduceDataDuplicationPass
       // if input of convert_layout is transOp, actual order is the order of
       // the transOp input. By setting lds order to be the same as input,
       // ds_write is more efficient
-      if (auto transOp = dyn_cast<triton::TransOp>(inputOp)) {
-        order = getOrderForMemory(
-            cast<RankedTensorType>(transOp.getSrc().getType()));
+      if (inputOp) {
+        if (auto transOp = dyn_cast<triton::TransOp>(inputOp)) {
+          order = getOrderForMemory(
+              cast<RankedTensorType>(transOp.getSrc().getType()));
+        }
       }
 
       auto sharedMemorySpace =
diff --git a/test/Conversion/amd/mfma-shortcut.mlir b/test/Conversion/amd/mfma-shortcut.mlir
index 24b5f13a85e7..0252722f3c25 100644
--- a/test/Conversion/amd/mfma-shortcut.mlir
+++ b/test/Conversion/amd/mfma-shortcut.mlir
@@ -119,7 +119,7 @@ module attributes {"ttg.num-warps" = 8 : i32, "ttg.threads-per-warp" = 64 : i32}
   tt.func public @mfma_dotop_lds_layout_order(%arg0: tensor<128x32xbf16, #mma>) {
     %1 = tt.trans %arg0 {order = array<i32: 1, 0>} : tensor<128x32xbf16, #mma> -> tensor<32x128xbf16, #linear>
     // GFX950-COUNT-2: llvm.store
-    // GFX950-COUNT-8: lvm.call_intrinsic "rocdl.ds.read.tr16.b64"
+    // GFX950-COUNT-8: rocdl.ds.read.tr16.b64
     %2 = ttg.convert_layout %1 : tensor<32x128xbf16, #linear> -> tensor<32x128xbf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 8}>>
     tt.return
   }
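
---

Why the order flip helps: tt.trans moves no data, so the registers feeding the convert_layout are still laid out in the order of the transpose's input; allocating the LDS temp with that order lets the ds_write side store contiguously (the two llvm.store vectorized stores in the test) and leaves the transposed access to the rocdl.ds.read.tr16.b64 reads. The patch gets that order by calling getOrderForMemory on the transpose's input directly; the order bookkeeping it relies on can be checked in isolation with a standalone C++ sketch, where preTransposeOrder is a hypothetical helper for illustration only, not a Triton API:

#include <cstdio>
#include <vector>

// Hypothetical helper (illustration only, not a Triton API): translate a
// fastest-to-slowest memory order expressed in the transposed tensor's
// dimensions into the dimensions of the transpose's input. tt.trans with
// permutation `perm` makes result dimension d an alias of source dimension
// perm[d], so an order translates element-wise through perm.
static std::vector<unsigned>
preTransposeOrder(const std::vector<unsigned> &resultOrder,
                  const std::vector<unsigned> &perm) {
  std::vector<unsigned> srcOrder(resultOrder.size());
  for (size_t i = 0; i < resultOrder.size(); ++i)
    srcOrder[i] = perm[resultOrder[i]];
  return srcOrder;
}

int main() {
  // Mirrors the lit test: tensor<128x32> --tt.trans {order=[1,0]}--> tensor<32x128>.
  std::vector<unsigned> perm = {1, 0};
  std::vector<unsigned> resultOrder = {0, 1}; // transposed view, dim 0 fastest
  std::vector<unsigned> ldsOrder = preTransposeOrder(resultOrder, perm);
  std::printf("LDS order: [%u, %u]\n", ldsOrder[0], ldsOrder[1]); // prints [1, 0]
  return 0;
}

For the test case above, an order of [0, 1] on the 32x128 transposed view maps back to [1, 0] on the 128x32 input, which is exactly the order the patch hands to the LDS allocation instead of the order derived from the convert_layout source.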