@@ -62,22 +62,40 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.thr
62
62
// Lit test function: converts the f32, f16 and bf16 arguments (each a tensor
// in a #ttg.dot_op layout with opIdx = 0 and parent = #blocked2) to the two
// f8 element types spelled "f8 E5 M2" and "f8 E4 M3 FN" below, using
// tt.fp_to_fp with "rounding = rtne". That gives six conversions in total.
// Under the GFX950 check prefix, each conversion is expected to produce four
// rocdl.cvt.scalef32.pk.* ops (bf8 variants for the E5 M2 results, fp8
// variants for the E4 M3 FN results).
// In this diff view, lines marked "-" are the previous count-based checks and
// lines marked "+" are their replacements, which spell out each of the four
// ops and require the trailing bracketed flag to alternate false, true,
// false, true.
// NOTE(review): the bracketed flag presumably selects which half of the
// packed destination word is written; confirm against the ROCDL op
// definition.
tt.func @downcast_to_f8 (%arg0: tensor <8 x8 xf32 , #ttg.dot_op <{opIdx = 0 , parent = #blocked2 }>>,
63
63
%arg1: tensor <8 x8 xf16 , #ttg.dot_op <{opIdx = 0 , parent = #blocked2 }>>,
64
64
%arg2: tensor <8 x8 xbf16 , #ttg.dot_op <{opIdx = 0 , parent = #blocked2 }>>) {
65
- // GFX950-COUNT-4: rocdl.cvt.scalef32.pk.bf8.f32
65
+ // GFX950: rocdl.cvt.scalef32.pk.bf8.f32 %{{.*}}, %{{.*}}, %{{.*}} -> %{{.*}}[false]
66
+ // GFX950: rocdl.cvt.scalef32.pk.bf8.f32 %{{.*}}, %{{.*}}, %{{.*}} -> %{{.*}}[true]
67
+ // GFX950: rocdl.cvt.scalef32.pk.bf8.f32 %{{.*}}, %{{.*}}, %{{.*}} -> %{{.*}}[false]
68
+ // GFX950: rocdl.cvt.scalef32.pk.bf8.f32 %{{.*}}, %{{.*}}, %{{.*}} -> %{{.*}}[true]
66
69
%0 = tt.fp_to_fp %arg0 , rounding = rtne : tensor <8 x8 xf32 , #ttg.dot_op <{opIdx = 0 , parent = #blocked2 }>> -> tensor <8 x8 xf8 E5 M2 , #ttg.dot_op <{opIdx = 0 , parent = #blocked2 }>>
67
70
68
- // GFX950-COUNT-4: rocdl.cvt.scalef32.pk.bf8.f16
71
+ // GFX950: rocdl.cvt.scalef32.pk.bf8.f16 %{{.*}}, %{{.*}} -> %{{.*}}[false]
72
+ // GFX950: rocdl.cvt.scalef32.pk.bf8.f16 %{{.*}}, %{{.*}} -> %{{.*}}[true]
73
+ // GFX950: rocdl.cvt.scalef32.pk.bf8.f16 %{{.*}}, %{{.*}} -> %{{.*}}[false]
74
+ // GFX950: rocdl.cvt.scalef32.pk.bf8.f16 %{{.*}}, %{{.*}} -> %{{.*}}[true]
69
75
%1 = tt.fp_to_fp %arg1 , rounding = rtne : tensor <8 x8 xf16 , #ttg.dot_op <{opIdx = 0 , parent = #blocked2 }>> -> tensor <8 x8 xf8 E5 M2 , #ttg.dot_op <{opIdx = 0 , parent = #blocked2 }>>
70
76
71
- // GFX950-COUNT-4: rocdl.cvt.scalef32.pk.bf8.bf16
77
+ // GFX950: rocdl.cvt.scalef32.pk.bf8.bf16 %{{.*}}, %{{.*}} -> %{{.*}}[false]
78
+ // GFX950: rocdl.cvt.scalef32.pk.bf8.bf16 %{{.*}}, %{{.*}} -> %{{.*}}[true]
79
+ // GFX950: rocdl.cvt.scalef32.pk.bf8.bf16 %{{.*}}, %{{.*}} -> %{{.*}}[false]
80
+ // GFX950: rocdl.cvt.scalef32.pk.bf8.bf16 %{{.*}}, %{{.*}} -> %{{.*}}[true]
72
81
%2 = tt.fp_to_fp %arg2 , rounding = rtne : tensor <8 x8 xbf16 , #ttg.dot_op <{opIdx = 0 , parent = #blocked2 }>> -> tensor <8 x8 xf8 E5 M2 , #ttg.dot_op <{opIdx = 0 , parent = #blocked2 }>>
73
82
74
- // GFX950-COUNT-4: rocdl.cvt.scalef32.pk.fp8.f32
83
+ // GFX950: rocdl.cvt.scalef32.pk.fp8.f32 %{{.*}}, %{{.*}}, %{{.*}} -> %{{.*}}[false]
84
+ // GFX950: rocdl.cvt.scalef32.pk.fp8.f32 %{{.*}}, %{{.*}}, %{{.*}} -> %{{.*}}[true]
85
+ // GFX950: rocdl.cvt.scalef32.pk.fp8.f32 %{{.*}}, %{{.*}}, %{{.*}} -> %{{.*}}[false]
86
+ // GFX950: rocdl.cvt.scalef32.pk.fp8.f32 %{{.*}}, %{{.*}}, %{{.*}} -> %{{.*}}[true]
75
87
%3 = tt.fp_to_fp %arg0 , rounding = rtne : tensor <8 x8 xf32 , #ttg.dot_op <{opIdx = 0 , parent = #blocked2 }>> -> tensor <8 x8 xf8 E4 M3 FN, #ttg.dot_op <{opIdx = 0 , parent = #blocked2 }>>
76
88
77
- // GFX950-COUNT-4: rocdl.cvt.scalef32.pk.fp8.f16
89
+ // GFX950: rocdl.cvt.scalef32.pk.fp8.f16 %{{.*}}, %{{.*}} -> %{{.*}}[false]
90
+ // GFX950: rocdl.cvt.scalef32.pk.fp8.f16 %{{.*}}, %{{.*}} -> %{{.*}}[true]
91
+ // GFX950: rocdl.cvt.scalef32.pk.fp8.f16 %{{.*}}, %{{.*}} -> %{{.*}}[false]
92
+ // GFX950: rocdl.cvt.scalef32.pk.fp8.f16 %{{.*}}, %{{.*}} -> %{{.*}}[true]
78
93
%4 = tt.fp_to_fp %arg1 , rounding = rtne : tensor <8 x8 xf16 , #ttg.dot_op <{opIdx = 0 , parent = #blocked2 }>> -> tensor <8 x8 xf8 E4 M3 FN, #ttg.dot_op <{opIdx = 0 , parent = #blocked2 }>>
79
94
80
- // GFX950-COUNT-4: rocdl.cvt.scalef32.pk.fp8.bf16
95
+ // GFX950: rocdl.cvt.scalef32.pk.fp8.bf16 %{{.*}}, %{{.*}} -> %{{.*}}[false]
96
+ // GFX950: rocdl.cvt.scalef32.pk.fp8.bf16 %{{.*}}, %{{.*}} -> %{{.*}}[true]
97
+ // GFX950: rocdl.cvt.scalef32.pk.fp8.bf16 %{{.*}}, %{{.*}} -> %{{.*}}[false]
98
+ // GFX950: rocdl.cvt.scalef32.pk.fp8.bf16 %{{.*}}, %{{.*}} -> %{{.*}}[true]
81
99
%5 = tt.fp_to_fp %arg2 , rounding = rtne : tensor <8 x8 xbf16 , #ttg.dot_op <{opIdx = 0 , parent = #blocked2 }>> -> tensor <8 x8 xf8 E4 M3 FN, #ttg.dot_op <{opIdx = 0 , parent = #blocked2 }>>
82
100
tt.return
83
101
}
@@ -89,7 +107,10 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.thr
89
107
#blocked2 = #ttg.blocked <{sizePerThread = [1 , 8 ], threadsPerWarp = [4 , 8 ], warpsPerCTA = [4 , 1 ], order = [1 , 0 ]}>
90
108
module attributes {" ttg.num-ctas" = 1 : i32 , " ttg.num-warps" = 4 : i32 , " ttg.threads-per-warp" = 32 : i32 } {
91
109
tt.func @downcast_to_bf8 (%arg0: tensor <8 x8 xf32 , #ttg.dot_op <{opIdx = 0 , parent = #blocked2 }>>) {
92
- // GFX942-COUNT-4: rocdl.cvt.pk.bf8.f32
110
+ // GFX942: rocdl.cvt.pk.bf8.f32 %{{.*}}, %{{.*}} -> %{{.*}}[false]
111
+ // GFX942: rocdl.cvt.pk.bf8.f32 %{{.*}}, %{{.*}} -> %{{.*}}[true]
112
+ // GFX942: rocdl.cvt.pk.bf8.f32 %{{.*}}, %{{.*}} -> %{{.*}}[false]
113
+ // GFX942: rocdl.cvt.pk.bf8.f32 %{{.*}}, %{{.*}} -> %{{.*}}[true]
93
114
// GFX950-COUNT-16: llvm.trunc %{{.+}} : i32 to i8
94
115
%6 = tt.fp_to_fp %arg0 , rounding = rtne : tensor <8 x8 xf32 , #ttg.dot_op <{opIdx = 0 , parent = #blocked2 }>> -> tensor <8 x8 xf8 E5 M2 FNUZ, #ttg.dot_op <{opIdx = 0 , parent = #blocked2 }>>
95
116
tt.return
@@ -102,7 +123,10 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.thr
102
123
#blocked2 = #ttg.blocked <{sizePerThread = [1 , 8 ], threadsPerWarp = [4 , 8 ], warpsPerCTA = [4 , 1 ], order = [1 , 0 ]}>
103
124
module attributes {" ttg.num-ctas" = 1 : i32 , " ttg.num-warps" = 4 : i32 , " ttg.threads-per-warp" = 32 : i32 } {
104
125
tt.func @f32_to_f8 (%arg0: tensor <8 x8 xf32 , #ttg.dot_op <{opIdx = 0 , parent = #blocked2 }>>) {
105
- // GFX942-COUNT-4: rocdl.cvt.pk.fp8.f32
126
+ // GFX942: rocdl.cvt.pk.fp8.f32 %{{.*}}, %{{.*}} -> %{{.*}}[false]
127
+ // GFX942: rocdl.cvt.pk.fp8.f32 %{{.*}}, %{{.*}} -> %{{.*}}[true]
128
+ // GFX942: rocdl.cvt.pk.fp8.f32 %{{.*}}, %{{.*}} -> %{{.*}}[false]
129
+ // GFX942: rocdl.cvt.pk.fp8.f32 %{{.*}}, %{{.*}} -> %{{.*}}[true]
106
130
// GFX950-COUNT-16: llvm.trunc %{{.+}} : i32 to i8
107
131
%7 = tt.fp_to_fp %arg0 , rounding = rtne : tensor <8 x8 xf32 , #ttg.dot_op <{opIdx = 0 , parent = #blocked2 }>> -> tensor <8 x8 xf8 E4 M3 FNUZ, #ttg.dot_op <{opIdx = 0 , parent = #blocked2 }>>
108
132
tt.return
@@ -118,28 +142,52 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.thr
118
142
%arg1: tensor <8 x8 xf8 E4 M3 FN, #ttg.dot_op <{opIdx = 0 , parent = #blocked2 }>>,
119
143
%arg2: tensor <8 x8 xf8 E5 M2 FNUZ, #ttg.dot_op <{opIdx = 0 , parent = #blocked2 }>>,
120
144
%arg3: tensor <8 x8 xf8 E4 M3 FNUZ, #ttg.dot_op <{opIdx = 0 , parent = #blocked2 }>>) {
121
- // GFX950-COUNT-4: rocdl.cvt.scalef32.pk.f32.bf8
145
+ // GFX950: rocdl.cvt.scalef32.pk.f32.bf8 %[[VR1:.*]][false]
146
+ // GFX950: rocdl.cvt.scalef32.pk.f32.bf8 %[[VR1]][true]
147
+ // GFX950: rocdl.cvt.scalef32.pk.f32.bf8 %[[VR2:.*]][false]
148
+ // GFX950: rocdl.cvt.scalef32.pk.f32.bf8 %[[VR2]][true]
122
149
%0 = tt.fp_to_fp %arg0 : tensor <8 x8 xf8 E5 M2 , #ttg.dot_op <{opIdx = 0 , parent = #blocked2 }>> -> tensor <8 x8 xf32 , #ttg.dot_op <{opIdx = 0 , parent = #blocked2 }>>
123
150
124
- // GFX950-COUNT-4: rocdl.cvt.scalef32.pk.f16.bf8
151
+ // GFX950: rocdl.cvt.scalef32.pk.f16.bf8 %[[VR3:.*]][false]
152
+ // GFX950: rocdl.cvt.scalef32.pk.f16.bf8 %[[VR3]][true]
153
+ // GFX950: rocdl.cvt.scalef32.pk.f16.bf8 %[[VR4:.*]][false]
154
+ // GFX950: rocdl.cvt.scalef32.pk.f16.bf8 %[[VR4]][true]
125
155
%1 = tt.fp_to_fp %arg0 : tensor <8 x8 xf8 E5 M2 , #ttg.dot_op <{opIdx = 0 , parent = #blocked2 }>> -> tensor <8 x8 xf16 , #ttg.dot_op <{opIdx = 0 , parent = #blocked2 }>>
126
156
127
- // GFX950-COUNT-4: rocdl.cvt.scalef32.pk.bf16.bf8
157
+ // GFX950: rocdl.cvt.scalef32.pk.bf16.bf8 %[[VR5:.*]][false]
158
+ // GFX950: rocdl.cvt.scalef32.pk.bf16.bf8 %[[VR5]][true]
159
+ // GFX950: rocdl.cvt.scalef32.pk.bf16.bf8 %[[VR6:.*]][false]
160
+ // GFX950: rocdl.cvt.scalef32.pk.bf16.bf8 %[[VR6]][true]
128
161
%2 = tt.fp_to_fp %arg0 : tensor <8 x8 xf8 E5 M2 , #ttg.dot_op <{opIdx = 0 , parent = #blocked2 }>> -> tensor <8 x8 xbf16 , #ttg.dot_op <{opIdx = 0 , parent = #blocked2 }>>
129
162
130
- // GFX950-COUNT-4: rocdl.cvt.scalef32.pk.f32.fp8
163
+ // GFX950: rocdl.cvt.scalef32.pk.f32.fp8 %[[VR7:.*]][false]
164
+ // GFX950: rocdl.cvt.scalef32.pk.f32.fp8 %[[VR7]][true]
165
+ // GFX950: rocdl.cvt.scalef32.pk.f32.fp8 %[[VR8:.*]][false]
166
+ // GFX950: rocdl.cvt.scalef32.pk.f32.fp8 %[[VR8]][true]
131
167
%3 = tt.fp_to_fp %arg1 : tensor <8 x8 xf8 E4 M3 FN, #ttg.dot_op <{opIdx = 0 , parent = #blocked2 }>> -> tensor <8 x8 xf32 , #ttg.dot_op <{opIdx = 0 , parent = #blocked2 }>>
132
168
133
- // GFX950-COUNT-4: rocdl.cvt.scalef32.pk.f16.fp8
169
+ // GFX950: rocdl.cvt.scalef32.pk.f16.fp8 %[[VR9:.*]][false]
170
+ // GFX950: rocdl.cvt.scalef32.pk.f16.fp8 %[[VR9]][true]
171
+ // GFX950: rocdl.cvt.scalef32.pk.f16.fp8 %[[VR10:.*]][false]
172
+ // GFX950: rocdl.cvt.scalef32.pk.f16.fp8 %[[VR10]][true]
134
173
%4 = tt.fp_to_fp %arg1 : tensor <8 x8 xf8 E4 M3 FN, #ttg.dot_op <{opIdx = 0 , parent = #blocked2 }>> -> tensor <8 x8 xf16 , #ttg.dot_op <{opIdx = 0 , parent = #blocked2 }>>
135
174
136
- // GFX950-COUNT-4: rocdl.cvt.scalef32.pk.bf16.fp8
175
+ // GFX950: rocdl.cvt.scalef32.pk.bf16.fp8 %[[VR11:.*]][false]
176
+ // GFX950: rocdl.cvt.scalef32.pk.bf16.fp8 %[[VR11]][true]
177
+ // GFX950: rocdl.cvt.scalef32.pk.bf16.fp8 %[[VR12:.*]][false]
178
+ // GFX950: rocdl.cvt.scalef32.pk.bf16.fp8 %[[VR12]][true]
137
179
%5 = tt.fp_to_fp %arg1 : tensor <8 x8 xf8 E4 M3 FN, #ttg.dot_op <{opIdx = 0 , parent = #blocked2 }>> -> tensor <8 x8 xbf16 , #ttg.dot_op <{opIdx = 0 , parent = #blocked2 }>>
138
180
139
- // GFX942-COUNT-4: rocdl.cvt.pk.f32.bf8
181
+ // GFX942: rocdl.cvt.pk.f32.bf8 %[[VR13:.*]][false]
182
+ // GFX942: rocdl.cvt.pk.f32.bf8 %[[VR13]][true]
183
+ // GFX942: rocdl.cvt.pk.f32.bf8 %[[VR14:.*]][false]
184
+ // GFX942: rocdl.cvt.pk.f32.bf8 %[[VR14]][true]
140
185
%6 = tt.fp_to_fp %arg2 : tensor <8 x8 xf8 E5 M2 FNUZ, #ttg.dot_op <{opIdx = 0 , parent = #blocked2 }>> -> tensor <8 x8 xf32 , #ttg.dot_op <{opIdx = 0 , parent = #blocked2 }>>
141
186
142
- // GFX942-COUNT-4: rocdl.cvt.pk.f32.fp8
187
+ // GFX942: rocdl.cvt.pk.f32.fp8 %[[VR15:.*]][false]
188
+ // GFX942: rocdl.cvt.pk.f32.fp8 %[[VR15]][true]
189
+ // GFX942: rocdl.cvt.pk.f32.fp8 %[[VR16:.*]][false]
190
+ // GFX942: rocdl.cvt.pk.f32.fp8 %[[VR16]][true]
143
191
%7 = tt.fp_to_fp %arg3 : tensor <8 x8 xf8 E4 M3 FNUZ, #ttg.dot_op <{opIdx = 0 , parent = #blocked2 }>> -> tensor <8 x8 xf32 , #ttg.dot_op <{opIdx = 0 , parent = #blocked2 }>>
144
192
tt.return
145
193
}
0 commit comments