From cd237fff931358b4514c0a7a31eb00b187915f2f Mon Sep 17 00:00:00 2001
From: "Tiotto, Ettore"
Date: Wed, 13 Aug 2025 19:11:03 +0000
Subject: [PATCH 1/3] [RemoveLayoutConversions]: Update unit tests

Signed-off-by: Tiotto, Ettore
---
 .../backward_combine_dpas_dot_layout.mlir | 75 +++++++++++--------
 1 file changed, 45 insertions(+), 30 deletions(-)

diff --git a/test/TritonIntelGPU/backward_combine_dpas_dot_layout.mlir b/test/TritonIntelGPU/backward_combine_dpas_dot_layout.mlir
index bd9b67d9c5..1b4e8c2d54 100644
--- a/test/TritonIntelGPU/backward_combine_dpas_dot_layout.mlir
+++ b/test/TritonIntelGPU/backward_combine_dpas_dot_layout.mlir
@@ -75,16 +75,16 @@ module attributes {"ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 16 : i32,
 
 // -----
 
-// COM: Case 2:
-// COM: Checks that DPAS encoding has been forwarded to the store op
-// COM: and the ttg.convert_layout operation has been removed
+// COM: Case 2: Similar to Case 1 but the loads do not have the blockIO "row_major" attribute.
+// COM: Checks that DPAS encoding has been forwarded from the dot op to the store op via the loop return values
+// COM: and that the ttg.convert_layout operation has been removed.
 // CHECK: #[[DPAS:.+]] = #ttig.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [1, 4], repCluster = [1, 1], A = [8, 16], B = [16, 16], C = [8, 16]}>
 #blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 16], warpsPerCTA = [2, 2], order = [1, 0]}>
 #blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 16], warpsPerCTA = [1, 4], order = [1, 0]}>
 #dpas = #ttig.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [1, 4], repCluster = [1, 1], A = [8, 16], B = [16, 16], C = [8, 16]}>
 #dot0 = #ttg.dot_op<{opIdx = 0, parent = #dpas, kWidth=1}>
 #dot1 = #ttg.dot_op<{opIdx = 1, parent = #dpas, kWidth=2}>
-module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 16 : i32, "ttig.support_sg_2d_block"} {
+module attributes {"ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 16 : i32, "ttig.support_sg_2d_block"} {
   tt.func public @matmul_kernel_with_block_pointers(%arg0: !tt.ptr, %arg1: !tt.ptr, %arg2: !tt.ptr, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32) {
     %c8_i32 = arith.constant 8 : i32
     %c64_i32 = arith.constant 64 : i32
@@ -128,12 +128,14 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.thr
       %34 = tt.advance %arg12, [%c32_i32, %c0_i32] : >
       scf.yield %32, %33, %34 : tensor<64x256xf32, #dpas>, !tt.ptr>, !tt.ptr>
     }
+    // CHECK: arith.truncf {{.*}} : tensor<64x256xf32, #[[DPAS]]> to tensor<64x256xf16, #[[DPAS]]>
+    // CHECK-NOT: ttg.convert_layout
+    // CHECK: tt.make_tensor_ptr {{.*}}, {{\[}}{{.*}}, {{.*}}], {{\[}}{{.*}}, {{.*}}], {{\[}}{{.*}}, {{.*}}] {order = array} : >
+    // CHECK: tt.store {{.*}}, {{.*}} {boundaryCheck = array} : !tt.ptr>
     %24 = arith.truncf %23#0 : tensor<64x256xf32, #dpas> to tensor<64x256xf16, #dpas>
     %25 = ttg.convert_layout %24 : tensor<64x256xf16, #dpas> -> tensor<64x256xf16, #blocked1>
     %26 = arith.extsi %arg8 : i32 to i64
-    // CHECK: tt.make_tensor_ptr {{.*}}, {{\[}}{{.*}}, {{.*}}], {{\[}}{{.*}}, {{.*}}], {{\[}}{{.*}}, {{.*}}] {order = array} : >
     %27 = tt.make_tensor_ptr %arg2, [%15, %20], [%26, %c1_i64], [%14, %19] {order = array} : >
-    // CHECK: tt.store {{.*}}, {{.*}} {boundaryCheck = array} : !tt.ptr>
     tt.store %27, %25 {boundaryCheck = array} : !tt.ptr>
     tt.return
   }
 }
@@ -141,8 +143,8 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.thr
 
 // -----
 
-// COM: Case 3:
-// COM: Checks that DPAS encoding has been forwarded to the store op
+// COM: Case 3: Similar to Case 1 but with an additional store after the loop
+// COM: Checks that DPAS encoding has been forwarded from the dot op to the store op via the loop return values
 // COM: The `tt.make_tensor_ptr` has multiple users (the storeOp + another OP)
 // COM: The initial `tt.make_tensor_ptr` with non-DPAS encoding must be kept.
 // CHECK: #[[BLOCKED:.+]] = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 16], warpsPerCTA = [1, 4], order = [1, 0]}>
@@ -187,6 +189,10 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.thr
     %21 = arith.extsi %arg7 : i32 to i64
     %22 = tt.make_tensor_ptr %arg1, [%16, %20], [%21, %c1_i64], [%c0_i32, %19] {order = array} : >
     %23:3 = scf.for %arg9 = %c0_i32 to %arg5 step %c32_i32 iter_args(%arg10 = %cst, %arg11 = %18, %arg12 = %22) -> (tensor<64x256xf32, #dpas>, !tt.ptr>, !tt.ptr>) : i32 {
+      // COM: Layout conversions in the loop should be removed.
+      // CHECK: scf.for
+      // CHECK-NOT: ttg.convert_layout
+      // CHECK: scf.yield
       %28 = tt.load %arg11 {boundaryCheck = array, ttig.block_io = "row_major"} : !tt.ptr>
       %29 = tt.load %arg12 {boundaryCheck = array, ttig.block_io = "row_major"} : !tt.ptr>
       %30 = ttg.convert_layout %28 : tensor<64x32xf16, #blocked> -> tensor<64x32xf16, #dot0>
@@ -196,35 +202,40 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.thr
       %34 = tt.advance %arg12, [%c32_i32, %c0_i32] : >
       scf.yield %32, %33, %34 : tensor<64x256xf32, #dpas>, !tt.ptr>, !tt.ptr>
     }
+    // CHECK: arith.truncf
+    // CHECK-NOT: ttg.convert_layout
+    // CHECK-DAG: [[PTR1:%.*]] = tt.make_tensor_ptr {{.*}}, {{\[}}{{.*}}, {{.*}}], {{\[}}{{.*}}, {{.*}}], {{\[}}{{.*}}, {{.*}}] {order = array} : >
+    // CHECK-DAG: [[PTR2:%.*]] = tt.make_tensor_ptr {{.*}}, {{\[}}{{.*}}, {{.*}}], {{\[}}{{.*}}, {{.*}}], {{\[}}{{.*}}, {{.*}}] {order = array} : >
+    // CHECK-NEXT: tt.store [[PTR1]], {{.*}} {boundaryCheck = array} : !tt.ptr>
+    // CHECK-NEXT: tt.load [[PTR2]] {boundaryCheck = array} : !tt.ptr>
     %24 = arith.truncf %23#0 : tensor<64x256xf32, #dpas> to tensor<64x256xf16, #dpas>
     %25 = ttg.convert_layout %24 : tensor<64x256xf16, #dpas> -> tensor<64x256xf16, #blocked1>
     %26 = arith.extsi %arg8 : i32 to i64
-    // CHECK: tt.make_tensor_ptr {{.*}}, {{\[}}{{.*}}, {{.*}}], {{\[}}{{.*}}, {{.*}}], {{\[}}{{.*}}, {{.*}}] {order = array} : >
-    // CHECK: tt.make_tensor_ptr {{.*}}, {{\[}}{{.*}}, {{.*}}], {{\[}}{{.*}}, {{.*}}], {{\[}}{{.*}}, {{.*}}] {order = array} : >
     %27 = tt.make_tensor_ptr %arg2, [%15, %20], [%26, %c1_i64], [%14, %19] {order = array} : >
-    // CHECK: tt.store {{.*}}, {{.*}} {boundaryCheck = array} : !tt.ptr>
     tt.store %27, %25 {boundaryCheck = array} : !tt.ptr>
     %35 = tt.load %27 {boundaryCheck = array} : !tt.ptr>
+    // CHECK-COUNT-2: ttg.convert_layout
     %36 = tt.make_tensor_ptr %arg13, [%15, %16], [%17, %c1_i64], [%14, %c0_i32] {order = array} : >
     %37 = tt.load %36 {boundaryCheck = array} : !tt.ptr>
     %38 = ttg.convert_layout %37 : tensor<64x64xf16, #blocked> -> tensor<64x64xf16, #dot0>
     %39 = ttg.convert_layout %35 : tensor<64x256xf16, #blocked1> -> tensor<64x256xf16, #dot1>
     %40 = tt.dot %38, %39, %cst, inputPrecision = tf32 : tensor<64x64xf16, #dot0> * tensor<64x256xf16, #dot1> -> tensor<64x256xf32, #dpas>
+    // CHECK: tt.dot
+    // CHECK-NOT: ttg.convert_layout
+    // CHECK: [[PTR3:%.*]] = tt.make_tensor_ptr {{.*}}, {{\[}}{{.*}}, {{.*}}], {{\[}}{{.*}}, {{.*}}], {{\[}}{{.*}}, {{.*}}] {order = array} : >
+    // CHECK: tt.store [[PTR3]], {{.*}} {boundaryCheck = array} : !tt.ptr>
     %41 = ttg.convert_layout %40 : tensor<64x256xf32, #dpas> -> tensor<64x256xf32, #blocked1>
-    // CHECK: tt.make_tensor_ptr {{.*}}, {{\[}}{{.*}}, {{.*}}], {{\[}}{{.*}}, {{.*}}], {{\[}}{{.*}}, {{.*}}] {order = array} : >
     %42 = tt.make_tensor_ptr %arg14, [%15, %20], [%26, %c1_i64], [%14, %19] {order = array} : >
-    // CHECK: tt.store {{.*}}, {{.*}} {boundaryCheck = array} : !tt.ptr>
     tt.store %42, %41 {boundaryCheck = array} : !tt.ptr>
     tt.return
   }
 }
-
 // -----
 
-// COM: Case 4:
-// COM: Checks that DPAS encoding has been forwarded to the store op
-// COM: and the ttg.convert_layout operation in the loop has been removed
+// COM: Case 4: Similar to Case 1 but with a convert layout on the dot op return value in the loop
+// COM: Checks that DPAS encoding has been forwarded from the dot op to the store op through the loop results
+// COM: and the ttg.convert_layout operations in the loop have been removed
 // CHECK: #[[DPAS:.+]] = #ttig.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [1, 4], repCluster = [1, 1], A = [8, 16], B = [16, 16], C = [8, 16]}>
 #blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 16], warpsPerCTA = [2, 2], order = [1, 0]}>
 #blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 16], warpsPerCTA = [1, 4], order = [1, 0]}>
@@ -232,7 +243,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.thr
 #dot0 = #ttg.dot_op<{opIdx = 0, parent = #dpas, kWidth=1}>
 #dot1 = #ttg.dot_op<{opIdx = 1, parent = #dpas, kWidth=2}>
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 16 : i32, "ttig.support_sg_2d_block"} {
-  tt.func public @matmul_kernel_with_block_pointers(%arg0: !tt.ptr, %arg1: !tt.ptr, %arg2: !tt.ptr, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32) {
+  tt.func public @matmul_kernel_with_block_pointers(%arg0: !tt.ptr, %arg1: !tt.ptr, %arg2: !tt.ptr, %arg5: i32) {
     %c1_i64 = arith.constant 1 : i64
     %c0_i32 = arith.constant 0 : i32
     %c0_i64 = arith.constant 0 : i64
@@ -241,6 +252,9 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.thr
     %18 = tt.make_tensor_ptr %arg0, [%c0_i64, %c0_i64], [%c0_i64, %c1_i64], [%c0_i32, %c0_i32] {order = array} : >
     %22 = tt.make_tensor_ptr %arg1, [%c0_i64, %c0_i64], [%c0_i64, %c1_i64], [%c0_i32, %c0_i32] {order = array} : >
     %23:3 = scf.for %arg9 = %c0_i32 to %arg5 step %c32_i32 iter_args(%arg10 = %cst, %arg11 = %18, %arg12 = %22) -> (tensor<64x256xf32, #blocked1>, !tt.ptr>, !tt.ptr>) : i32 {
+      // CHECK: scf.for
+      // CHECK-NOT: ttg.convert_layout
+      // CHECK: scf.yield
       %28 = tt.load %arg11 {boundaryCheck = array, ttig.block_io = "row_major" } : !tt.ptr>
       %29 = tt.load %arg12 {boundaryCheck = array, ttig.block_io = "row_major"} : !tt.ptr>
       %36 = ttg.convert_layout %arg10 : tensor<64x256xf32, #blocked1> -> tensor<64x256xf32, #dpas>
@@ -249,14 +263,15 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.thr
       %32 = tt.dot %30, %31, %36, inputPrecision = tf32 : tensor<64x32xf16, #dot0> * tensor<32x256xf16, #dot1> -> tensor<64x256xf32, #dpas>
       %33 = tt.advance %arg11, [%c0_i32, %c32_i32] : >
       %34 = tt.advance %arg12, [%c32_i32, %c0_i32] : >
-      // CHECK-NOT: ttg.convert_layout
       %35 = ttg.convert_layout %32 : tensor<64x256xf32, #dpas> -> tensor<64x256xf32, #blocked1>
       scf.yield %35, %33, %34 : tensor<64x256xf32, #blocked1>, !tt.ptr>, !tt.ptr>
     }
-    %24 = arith.truncf %23#0 : tensor<64x256xf32, #blocked1> to tensor<64x256xf16, #blocked1>
+    // CHECK: arith.truncf
+    // CHECK-NOT: ttg.convert_layout
     // CHECK: tt.make_tensor_ptr {{.*}}, {{\[}}{{.*}}, {{.*}}], {{\[}}{{.*}}, {{.*}}], {{\[}}{{.*}}, {{.*}}] {order = array} : >
-    %27 = tt.make_tensor_ptr %arg2, [%c0_i64, %c0_i64], [%c0_i64, %c1_i64], [%c0_i32, %c0_i32] {order = array} : >
     // CHECK: tt.store {{.*}}, {{.*}} {boundaryCheck = array} : !tt.ptr>
+    %24 = arith.truncf %23#0 : tensor<64x256xf32, #blocked1> to tensor<64x256xf16, #blocked1>
+    %27 = tt.make_tensor_ptr %arg2, [%c0_i64, %c0_i64], [%c0_i64, %c1_i64], [%c0_i32, %c0_i32] {order = array} : >
     tt.store %27, %24 {boundaryCheck = array} : !tt.ptr>
     tt.return
   }
 }
@@ -270,8 +285,8 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.thr
 // CHECK: #[[BLOCKED:.+]] = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 16], warpsPerCTA = [2, 2], order = [1, 0]}>
 #blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 16], warpsPerCTA = [2, 2], order = [1, 0]}>
 #blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 16], warpsPerCTA = [1, 4], order = [1, 0]}>
-module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 16 : i32, "ttig.support_sg_2d_block"} {
-  tt.func public @matmul_kernel_with_block_pointers(%arg0: !tt.ptr) {
+module attributes {"ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 16 : i32, "ttig.support_sg_2d_block"} {
+  tt.func public @store_with_block_pointers(%arg0: !tt.ptr) {
     %c8_i32 = arith.constant 8 : i32
     %c64_i64 = arith.constant 64 : i64
     %c1_i64 = arith.constant 1 : i64
@@ -297,8 +312,8 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.thr
 #blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [4, 1], order = [1, 0]}>
 #blocked2 = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}>
 #blocked3 = #ttg.blocked<{sizePerThread = [2, 2], threadsPerWarp = [2, 16], warpsPerCTA = [4, 1], order = [1, 0]}>
-module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 32 : i32, ttig.support_dpas, ttig.support_sg_2d_block} {
-  tt.func public @test_4866(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: i64) {
+module attributes {"ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 32 : i32} {
+  tt.func public @test_4866(%arg0: !tt.ptr, %arg1: !tt.ptr, %arg2: i64) {
     %c1_i32 = arith.constant 1 : i32
     %cst = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #blocked>
     %cst_0 = arith.constant dense<5.000000e-01> : tensor<16x32xf32, #blocked1>
@@ -311,11 +326,11 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.thr
     %1 = tt.make_tensor_ptr %arg1, [%arg2, %c64_i64], [%c64_i64, %c1_i64], [%c0_i32, %c32_i32] {order = array} : >
     %2:2 = scf.for %arg3 = %c0_i32 to %c16_i32 step %c1_i32 iter_args(%arg4 = %0, %arg5 = %1) -> (!tt.ptr>, !tt.ptr>) : i32 {
       // CHECK: scf.for {{.*}}
-      // CHECK: [[LOAD_RES:%.*]] = tt.load {{.*}} : !tt.ptr>
-      // CHECK: [[CONV1:%.*]] = ttg.convert_layout [[LOAD_RES]] : tensor<16x32xf16, #[[BLOCKED1]]> -> tensor<16x32xf16, #ttg.dot_op<{opIdx = 1, parent = #[[BLOCKED]]}>>
-      // CHECK: [[DOT_RES:%.*]] = tt.dot %cst_0, [[CONV1]], %cst : tensor<16x16xf16, #ttg.dot_op<{opIdx = 0, parent = #[[BLOCKED]]}>> * tensor<16x32xf16, #ttg.dot_op<{opIdx = 1, parent = #[[BLOCKED]]}>> -> tensor<16x32xf32, #[[BLOCKED]]>
-      // CHECK: [[CONV2:%.*]] = ttg.convert_layout [[DOT_RES]] : tensor<16x32xf32, #[[BLOCKED]]> -> tensor<16x32xf32, #[[BLOCKED1]]>
-      // CHECK: tt.store {{.*}}, [[CONV2]] : !tt.ptr>
+      // CHECK-NEXT: [[LOAD_RES:%.*]] = tt.load {{.*}} : !tt.ptr>
+      // CHECK-NEXT: [[CONV1:%.*]] = ttg.convert_layout [[LOAD_RES]] : tensor<16x32xf16, #[[BLOCKED1]]> -> tensor<16x32xf16, #ttg.dot_op<{opIdx = 1, parent = #[[BLOCKED]]}>>
+      // CHECK-NEXT: [[DOT_RES:%.*]] = tt.dot %cst_0, [[CONV1]], %cst : tensor<16x16xf16, #ttg.dot_op<{opIdx = 0, parent = #[[BLOCKED]]}>> * tensor<16x32xf16, #ttg.dot_op<{opIdx = 1, parent = #[[BLOCKED]]}>> -> tensor<16x32xf32, #[[BLOCKED]]>
+      // CHECK-NEXT: [[CONV2:%.*]] = ttg.convert_layout [[DOT_RES]] : tensor<16x32xf32, #[[BLOCKED]]> -> tensor<16x32xf32, #[[BLOCKED1]]>
+      // CHECK-NEXT: tt.store {{.*}}, [[CONV2]] : !tt.ptr>
       %3 = tt.load %arg4 : !tt.ptr>
       %4 = ttg.convert_layout %3 : tensor<16x32xf16, #blocked2> -> tensor<16x32xf16, #blocked1>
       %5 = ttg.convert_layout %cst : tensor<16x16xf16, #blocked> -> tensor<16x16xf16, #ttg.dot_op<{opIdx = 0, parent = #blocked3}>>

From cad2f23cd5385581bf3542334fc4e566b88419b1 Mon Sep 17 00:00:00 2001
From: "Tiotto, Ettore"
Date: Mon, 18 Aug 2025 22:40:34 +0000
Subject: [PATCH 2/3] [RemoveLayoutConversions]: Reduce loop carried values

Avoid keeping a loop carried value alive when an equivalent loop yielded
value already exists: rewrite the loads and stores that use the original
loop result to use the rematerialized result plus a layout conversion, so
the original carried value becomes dead.

Signed-off-by: Tiotto, Ettore
---
 .../backward_combine_dpas_dot_layout.mlir | 19 +++--
 .../RemoveLayoutConversions.cpp           | 72 +++++++++++++++++++
 2 files changed, 85 insertions(+), 6 deletions(-)

diff --git a/test/TritonIntelGPU/backward_combine_dpas_dot_layout.mlir b/test/TritonIntelGPU/backward_combine_dpas_dot_layout.mlir
index 72d7f6d730..b27480f2f1 100644
--- a/test/TritonIntelGPU/backward_combine_dpas_dot_layout.mlir
+++ b/test/TritonIntelGPU/backward_combine_dpas_dot_layout.mlir
@@ -348,7 +348,8 @@ module attributes {"ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 32 : i32}
 
 // -----
 
-// COM: Reduce loop carried values.
+// CHECK: #[[BLOCKED:.+]] = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 16], warpsPerCTA = [2, 2], order = [1, 0]}>
+// CHECK: #[[DPAS:.+]] = #ttig.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [1, 4], repCluster = [1, 1], A = [8, 16], B = [16, 16], C = [8, 16]}>
 #blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 16], warpsPerCTA = [2, 2], order = [1, 0]}>
 #blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 16], warpsPerCTA = [1, 4], order = [1, 0]}>
 #dpas = #ttig.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [1, 4], repCluster = [1, 1], A = [8, 16], B = [16, 16], C = [8, 16]}>
@@ -364,14 +365,15 @@ module attributes {"ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 16 : i32,
     %18 = tt.make_tensor_ptr %arg0, [%c0_i64, %c0_i64], [%c0_i64, %c1_i64], [%c0_i32, %c0_i32] {order = array} : >
     %22 = tt.make_tensor_ptr %arg1, [%c0_i64, %c0_i64], [%c0_i64, %c1_i64], [%c0_i32, %c0_i32] {order = array} : >
     %23:3 = scf.for %arg9 = %c0_i32 to %arg5 step %c32_i32 iter_args(%arg10 = %cst, %arg11 = %18, %arg12 = %22) -> (tensor<64x256xf32, #dpas>, !tt.ptr>, !tt.ptr>) : i32 {
-      // CHECK: scf.for
+      // COM: Ensure there are only 3 loop results and no layout conversions in the loop.
+      // CHECK: [[LOOP_RES:%.*]]:3 = scf.for
       // CHECK-NOT: ttg.convert_layout
-      // CHECK: scf.yield
+      // CHECK: scf.yield
       %28 = tt.load %arg11 {boundaryCheck = array, ttig.block_io = "row_major" } : !tt.ptr>
       %29 = tt.load %arg12 {boundaryCheck = array, ttig.block_io = "row_major"} : !tt.ptr>
       %30 = ttg.convert_layout %28 : tensor<64x32xf16, #blocked> -> tensor<64x32xf16, #dot0>
       %31 = ttg.convert_layout %29 : tensor<32x256xf16, #blocked1> -> tensor<32x256xf16, #dot1>
-      %32 = tt.dot %30, %31, %arg10, inputPrecision = tf32 : tensor<64x32xf16, #dot0> * tensor<32x256xf16, #dot1> -> tensor<64x256xf32, #dpas> 
+      %32 = tt.dot %30, %31, %arg10, inputPrecision = tf32 : tensor<64x32xf16, #dot0> * tensor<32x256xf16, #dot1> -> tensor<64x256xf32, #dpas>
       %33 = tt.advance %arg11, [%c0_i32, %c32_i32] : >
       %34 = tt.advance %arg12, [%c32_i32, %c0_i32] : >
       scf.yield %32, %33, %34 : tensor<64x256xf32, #dpas>, !tt.ptr>, !tt.ptr>
@@ -380,15 +382,20 @@ module attributes {"ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 16 : i32,
     %27 = tt.make_tensor_ptr %arg2, [%c0_i64, %c0_i64], [%c0_i64, %c1_i64], [%c0_i32, %c0_i32] {order = array} : >
     tt.store %27, %24 {boundaryCheck = array} : !tt.ptr>
-    // Reuse %23#1 (which will have dot layout after backward rematerialization), by injecting a convert layout op to
-    // convert that value to a blocked layout.
+    // CHECK: [[LOAD1:%.*]] = tt.load [[LOOP_RES]]#1 {boundaryCheck = array, ttig.block_io = "row_major"} : !tt.ptr>>
+    // CHECK: [[CONV1:%.*]] = ttg.convert_layout [[LOAD1]] : tensor<64x32xf16, #ttg.dot_op<{opIdx = 0, parent = #[[DPAS]], kWidth = 1}>> -> tensor<64x32xf16, #[[BLOCKED]]>
+    // CHECK: [[PTR:%.*]] = tt.make_tensor_ptr {{.*}}, {{\[}}{{.*}}, {{.*}}], {{\[}}{{.*}}, {{.*}}], {{\[}}{{.*}}, {{.*}}] {order = array} : >
+    // CHECK: tt.store [[PTR]], [[CONV1]] {boundaryCheck = array} : !tt.ptr>
     %28 = tt.load %23#1 {boundaryCheck = array, ttig.block_io = "row_major" } : !tt.ptr>
     %29 = tt.make_tensor_ptr %arg2, [%c0_i64, %c0_i64], [%c0_i64, %c1_i64], [%c0_i32, %c0_i32] {order = array} : >
     tt.store %29, %28 {boundaryCheck = array} : !tt.ptr>
+    // CHECK: [[LOAD2:%.*]] = tt.load [[PTR]] {boundaryCheck = array, ttig.block_io = "row_major"} : !tt.ptr>
+    // CHECK: [[CONV2:%.*]] = ttg.convert_layout [[LOAD2]] : tensor<64x32xf16, #[[BLOCKED]]> -> tensor<64x32xf16, #ttg.dot_op<{opIdx = 0, parent = #[[DPAS]], kWidth = 1}>>
+    // CHECK: tt.store [[LOOP_RES]]#1, [[CONV2]] {boundaryCheck = array} : !tt.ptr>>
     %30 = tt.load %29 {boundaryCheck = array, ttig.block_io = "row_major" } : !tt.ptr>
     tt.store %23#1, %30 {boundaryCheck = array} : !tt.ptr>
     tt.return
   }
 }
-
diff --git a/third_party/intel/lib/TritonIntelGPUTransforms/RemoveLayoutConversions.cpp b/third_party/intel/lib/TritonIntelGPUTransforms/RemoveLayoutConversions.cpp
index 5dde69dcc9..55d994b52d 100644
--- a/third_party/intel/lib/TritonIntelGPUTransforms/RemoveLayoutConversions.cpp
+++ b/third_party/intel/lib/TritonIntelGPUTransforms/RemoveLayoutConversions.cpp
@@ -25,6 +25,7 @@
 #include "triton/Dialect/TritonGPU/IR/Dialect.h"
 #include "triton/Dialect/TritonGPU/Transforms/TritonGPUConversion.h"
 #include "triton/Dialect/TritonGPU/Transforms/Utility.h"
+#include "llvm/ADT/TypeSwitch.h"
 #include
 
 namespace mlir::triton::gpu::intel {
@@ -1267,6 +1268,77 @@ void LayoutRematerialization::backwardRematerialization() {
                     convertOp.getResult());
     }
   }
+
+  // Reduce loop carried values if the value can be removed by using another
+  // loop yielded value plus a convert layout operation.
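+  //
+  // Sketch of the rewrite (types abbreviated; assuming the loop yields the
+  // same tensor pointer both with its original layout, %res#i, and with the
+  // rematerialized layout, %res#j): a user such as
+  //   %v = tt.load %res#i : !tt.ptr<tensor<..., #blocked>>
+  // is rewritten to
+  //   %l = tt.load %res#j : !tt.ptr<tensor<..., #dot0>>
+  //   %v = ttg.convert_layout %l : #dot0 -> #blocked
+  // so %res#i becomes dead and the corresponding carried value can be
+  // removed.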
+  for (auto [pair, val] : rematMapping) {
+    if (!isa<BlockArgument>(pair.first))
+      continue;
+
+    auto arg = cast<BlockArgument>(pair.first);
+    if (!isTensorPointerType(arg.getType()))
+      continue;
+
+    if (auto loopOp =
+            dyn_cast<LoopLikeOpInterface>(arg.getOwner()->getParentOp())) {
+      // Loop arguments that correspond to an unused loop result are not
+      // interesting.
+      OpResult loopRes = loopOp.getTiedLoopResult(arg);
+      if (loopRes.getNumUses() == 0)
+        continue;
+
+      // Rewrite the users of the loop result tied to the argument so that
+      // they use the equivalent rematerialized loop result instead.
+      auto rematArg = cast<BlockArgument>(val);
+      OpResult rematRes = loopOp.getTiedLoopResult(rematArg);
+
+      for (OpOperand &use : loopRes.getUses()) {
+        Operation *user = use.getOwner();
+        Location loc = user->getLoc();
+        OpBuilder rewriter(user);
+
+        TypeSwitch<Operation *>(user)
+            .Case<LoadOp>([&](auto loadOp) {
+              // Load through the rematerialized pointer and convert the
+              // loaded value back to the layout the original users expect.
+              auto newLoadOp =
+                  rewriter.create<LoadOp>(loc, rematRes, loadOp->getAttrs());
+              auto convOp = rewriter.create<ConvertLayoutOp>(
+                  loc, loadOp.getType(), newLoadOp.getResult());
+              loadOp->replaceAllUsesWith(convOp);
+              LLVM_DEBUG({
+                DBGS() << "Replaced:\n\t" << *loadOp << "\n";
+                DBGS() << "with:\n\t" << *newLoadOp << "\n"
+                       << "\t" << *convOp << "\n";
+              });
+            })
+            .Case<StoreOp>([&](auto storeOp) {
+              // Convert the stored value to the layout of the rematerialized
+              // pointer and store through that pointer.
+              Value data = storeOp.getOperand(1);
+              auto dataType = cast<RankedTensorType>(data.getType());
+              auto newPtrType = cast<PointerType>(rematRes.getType());
+              Attribute encoding =
+                  cast<RankedTensorType>(newPtrType.getPointeeType())
+                      .getEncoding();
+              RankedTensorType newDataType =
+                  dataType.cloneWithEncoding(encoding);
+              auto convOp =
+                  rewriter.create<ConvertLayoutOp>(loc, newDataType, data);
+              auto newStoreOp = rewriter.create<StoreOp>(
+                  loc, rematRes, convOp, storeOp.getBoundaryCheck(),
+                  storeOp.getCache(), storeOp.getEvict());
+              opToDelete.insert(storeOp);
+              LLVM_DEBUG({
+                DBGS() << "Replaced:\n\t" << *storeOp << "\n";
+                DBGS() << "with:\n\t" << *convOp << "\n"
+                       << "\t" << *newStoreOp << "\n";
+              });
+            })
+            .Default([](auto op) {
+              llvm::report_fatal_error(
+                  llvm::Twine("Unsupported operation: '" +
+                              op->getName().getStringRef() + "'"));
+            });
+      }
+    }
+  }
 }
 
 void LayoutRematerialization::hoistConvertOnTopOfExtOrBroadcast() {

From 155d5fc027a053bbc2d3134ebfc0639a8cb93f9c Mon Sep 17 00:00:00 2001
From: "Tiotto, Ettore"
Date: Tue, 19 Aug 2025 14:10:28 +0000
Subject: [PATCH 3/3] Address code review comment

Signed-off-by: Tiotto, Ettore
---
 .../TritonIntelGPUTransforms/RemoveLayoutConversions.cpp | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/third_party/intel/lib/TritonIntelGPUTransforms/RemoveLayoutConversions.cpp b/third_party/intel/lib/TritonIntelGPUTransforms/RemoveLayoutConversions.cpp
index 55d994b52d..6e4bc21960 100644
--- a/third_party/intel/lib/TritonIntelGPUTransforms/RemoveLayoutConversions.cpp
+++ b/third_party/intel/lib/TritonIntelGPUTransforms/RemoveLayoutConversions.cpp
@@ -1304,6 +1304,7 @@ void LayoutRematerialization::backwardRematerialization() {
               auto convOp = rewriter.create<ConvertLayoutOp>(
                   loc, loadOp.getType(), newLoadOp.getResult());
               loadOp->replaceAllUsesWith(convOp);
+              opToDelete.insert(loadOp);
               LLVM_DEBUG({
                 DBGS() << "Replaced:\n\t" << *loadOp << "\n";
                 DBGS() << "with:\n\t" << *newLoadOp << "\n"
@@ -1332,9 +1333,9 @@ void LayoutRematerialization::backwardRematerialization() {
              });
             })
             .Default([](auto op) {
-              llvm::report_fatal_error(
-                  llvm::Twine("Unsupported operation: '" +
-                              op->getName().getStringRef() + "'"));
+              llvm::report_fatal_error(llvm::Twine(
+                  "Unsupported operation in backward rematerialization: '" +
+                  op->getName().getStringRef() + "'"));
             });
       }
     }