// CHECK-LABEL: alloc_convert_load
// CHECK-32KLIMIT-LABEL: alloc_convert_load
// CHECK: %0 = ttg.local_alloc %arg0 : {{.*}}#blocked{{.*}}#shared
- // CHECK: %1 = ttg.convert_layout %arg1 : {{.*}}#blocked{{.*}}#blocked1
- // CHECK: %2 = ttg.convert_layout %1 : {{.*}}#blocked1{{.*}}#mma
+ // CHECK: %1 = ttg.convert_layout %arg1 {{.*}} : {{.*}}#blocked{{.*}}#blocked1
+ // CHECK: %2 = ttg.convert_layout %1 {{.*}} : {{.*}}#blocked1{{.*}}#mma
// CHECK: %3 = ttg.local_load %0 : {{.*}}#shared{{.*}}#ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 4}>>
#blocked = #ttg.blocked<{sizePerThread = [8, 1], threadsPerWarp = [16, 4], warpsPerCTA = [1, 8], order = [0, 1]}>
#mma = #ttg.amd_mfma<{version = 2, warpsPerCTA = [1, 8], instrShape = [32, 32], isTransposed = false}>
@@ -28,8 +28,8 @@ module attributes {"ttg.num-warps" = 8 : i32, "ttg.threads-per-warp" = 64 : i32}
// CHECK-LABEL: alloc_convert_small_load
// CHECK-32KLIMIT-LABEL: alloc_convert_small_load
// CHECK: %0 = ttg.local_alloc %arg0 : {{.*}}#blocked{{.*}}#shared
- // CHECK: %1 = ttg.convert_layout %arg1 : {{.*}}#blocked{{.*}}#blocked1
- // CHECK: %2 = ttg.convert_layout %1 : {{.*}}#blocked1{{.*}}#mma
+ // CHECK: %1 = ttg.convert_layout %arg1 {{.*}} : {{.*}}#blocked{{.*}}#blocked1
+ // CHECK: %2 = ttg.convert_layout %1 {{.*}} : {{.*}}#blocked1{{.*}}#mma
// CHECK: %3 = ttg.local_load %0 : {{.*}}#shared{{.*}}#ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 4}>>
#blocked = #ttg.blocked<{sizePerThread = [8, 1], threadsPerWarp = [16, 4], warpsPerCTA = [1, 8], order = [0, 1]}>
#mma = #ttg.amd_mfma<{version = 2, warpsPerCTA = [1, 8], instrShape = [32, 32], isTransposed = false}>
@@ -55,7 +55,7 @@ module attributes {"ttg.num-warps" = 8 : i32, "ttg.threads-per-warp" = 64 : i32}
// CHECK-32KLIMIT-LABEL: alloc_convert_3d_load
// CHECK: [[V0:%.*]] = ttg.local_alloc {{.*}}[[$BLOCKED1]]{{.*}}
// CHECK: [[V1:%.*]] = ttg.convert_layout {{.*}}[[$BLOCKED1]]{{.*}}[[$BLOCKED2]]
- // CHECK: [[V2:%.*]] = ttg.convert_layout [[V1]] : {{.*}}[[$BLOCKED2]]{{.*}}[[$MMA]]
+ // CHECK: [[V2:%.*]] = ttg.convert_layout [[V1]] {{.*}} : {{.*}}[[$BLOCKED2]]{{.*}}[[$MMA]]
// CHECK: [[V3:%.*]] = ttg.local_load [[V0]] : {{.*}}#ttg.dot_op<{opIdx = 0, parent = [[$MMA]], kWidth = 4}>>
#blocked = #ttg.blocked<{sizePerThread = [1, 8, 1], threadsPerWarp = [1, 16, 4], warpsPerCTA = [1, 1, 8], order = [0, 1, 2]}>
#mma = #ttg.amd_mfma<{version = 2, warpsPerCTA = [1, 1, 8], instrShape = [32, 32], isTransposed = false}>
@@ -75,12 +75,12 @@ module attributes {"ttg.num-warps" = 8 : i32, "ttg.threads-per-warp" = 64 : i32}
// Check that the optimization triggers with a custom LDS limit and does not trigger with the default one
// CHECK-LABEL: alloc_convert_32k_limit
// CHECK: %0 = ttg.local_alloc %arg0 : {{.*}}#blocked{{.*}}#shared
- // CHECK: %1 = ttg.convert_layout %arg1 : {{.*}}#blocked{{.*}}#mma
+ // CHECK: %1 = ttg.convert_layout %arg1 {{.*}} : {{.*}}#blocked{{.*}}#mma
// CHECK: %2 = ttg.local_load %0 : {{.*}}#shared{{.*}}#ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 4}>>
// CHECK-32KLIMIT-LABEL: alloc_convert_32k_limit
// CHECK-32KLIMIT: %0 = ttg.local_alloc %arg0 : {{.*}}#blocked{{.*}}#shared
- // CHECK-32KLIMIT: %1 = ttg.convert_layout %arg1 : {{.*}}#blocked{{.*}}#blocked1
- // CHECK-32KLIMIT: %2 = ttg.convert_layout %1 : {{.*}}#blocked1{{.*}}#mma
+ // CHECK-32KLIMIT: %1 = ttg.convert_layout %arg1 {{.*}} : {{.*}}#blocked{{.*}}#blocked1
+ // CHECK-32KLIMIT: %2 = ttg.convert_layout %1 {{.*}} : {{.*}}#blocked1{{.*}}#mma
// CHECK-32KLIMIT: %3 = ttg.local_load %0 : {{.*}}#shared{{.*}}#ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 4}>>
#blocked = #ttg.blocked<{sizePerThread = [4, 1], threadsPerWarp = [16, 4], warpsPerCTA = [1, 8], order = [0, 1]}>
#mma = #ttg.amd_mfma<{version = 2, warpsPerCTA = [1, 8], instrShape = [32, 32], isTransposed = false}>
@@ -106,9 +106,9 @@ module attributes {"ttg.num-warps" = 8 : i32, "ttg.threads-per-warp" = 64 : i32}

// CHECK: tt.func public @mfma_dot_shortcut([[ARG_0:%[a-z0-9]*]]: {{.*}}, [[ARG_1:%[a-z0-9]*]]: {{.*}}, [[ARG_2:%[a-z0-9]*]]: {{.*}})
// CHECK: [[ALLOC:%[0-9]+]] = ttg.local_alloc [[ARG_0]] : (tensor<128x128xf16, [[BLOCKED_1]]>) -> !ttg.memdesc<128x128xf16, [[SHARED]], #smem>
- // CHECK: [[INTERMEDIATE_CONV:%[0-9]+]] = ttg.convert_layout [[ARG_1]] : tensor<128x128xf32, [[BLOCKED_1]]> -> tensor<128x128xf32, [[BLOCKED_2]]>
- // CHECK: [[CONVERT_1:%[0-9]+]] = ttg.convert_layout [[INTERMEDIATE_CONV]] : tensor<128x128xf32, [[BLOCKED_2]]> -> tensor<128x128xf32, [[MMA_2]]>
- // CHECK: [[CONVERT_2:%[0-9]+]] = ttg.convert_layout [[ARG_2]] : tensor<256x128xf16, [[MMA_1]]> -> tensor<256x128xf16, #ttg.dot_op<{opIdx = 0, parent = [[MMA_1]], kWidth = 4}>>
+ // CHECK: [[INTERMEDIATE_CONV:%[0-9]+]] = ttg.convert_layout [[ARG_1]] {{.*}} : tensor<128x128xf32, [[BLOCKED_1]]> -> tensor<128x128xf32, [[BLOCKED_2]]>
+ // CHECK: [[CONVERT_1:%[0-9]+]] = ttg.convert_layout [[INTERMEDIATE_CONV]] {{.*}} : tensor<128x128xf32, [[BLOCKED_2]]> -> tensor<128x128xf32, [[MMA_2]]>
+ // CHECK: [[CONVERT_2:%[0-9]+]] = ttg.convert_layout [[ARG_2]] {{.*}} : tensor<256x128xf16, [[MMA_1]]> -> tensor<256x128xf16, #ttg.dot_op<{opIdx = 0, parent = [[MMA_1]], kWidth = 4}>>
// CHECK: [[LOAD:%[0-9]+]] = ttg.local_load [[ALLOC]] : !ttg.memdesc<128x128xf16, [[SHARED]], #smem> -> tensor<128x128xf16, #ttg.dot_op<{opIdx = 0, parent = [[MMA_2]], kWidth = 4}>>
#blocked = #ttg.blocked<{sizePerThread = [4, 1], threadsPerWarp = [16, 4], warpsPerCTA = [1, 8], order = [0, 1]}>
#mma1 = #ttg.amd_mfma<{version = 2, warpsPerCTA = [1, 8], instrShape = [32, 32], isTransposed = false}>