Skip to content

Commit 556f7ff

Browse files
committed
Implement vop3p complex pattern optimization for gisel
1 parent 5b1c281 commit 556f7ff

File tree

12 files changed

+400
-113
lines changed

12 files changed

+400
-113
lines changed

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp

Lines changed: 353 additions & 28 deletions
Large diffs are not rendered by default.

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -187,8 +187,8 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
187187

188188
ComplexRendererFns selectVOP3NoMods(MachineOperand &Root) const;
189189

190-
std::pair<Register, unsigned>
191-
selectVOP3PModsImpl(Register Src, const MachineRegisterInfo &MRI,
190+
std::pair<MachineOperand *, unsigned>
191+
selectVOP3PModsImpl(MachineOperand *Op, const MachineRegisterInfo &MRI,
192192
bool IsDOT = false) const;
193193

194194
InstructionSelector::ComplexRendererFns

llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.fdot2.ll

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -68,8 +68,7 @@ define float @v_fdot2_neg_c(<2 x half> %a, <2 x half> %b, float %c) {
6868
; GFX906-LABEL: v_fdot2_neg_c:
6969
; GFX906: ; %bb.0:
7070
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
71-
; GFX906-NEXT: v_xor_b32_e32 v2, 0x80000000, v2
72-
; GFX906-NEXT: v_dot2_f32_f16 v0, v0, v1, v2
71+
; GFX906-NEXT: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
7372
; GFX906-NEXT: s_setpc_b64 s[30:31]
7473
%neg.c = fneg float %c
7574
%r = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %neg.c, i1 false)

llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll

Lines changed: 8 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -248,8 +248,7 @@ define i32 @v_sdot2_fnegf32_c(<2 x i16> %a, <2 x i16> %b, float %c) {
248248
; GFX906-LABEL: v_sdot2_fnegf32_c:
249249
; GFX906: ; %bb.0:
250250
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
251-
; GFX906-NEXT: v_xor_b32_e32 v2, 0x80000000, v2
252-
; GFX906-NEXT: v_dot2_i32_i16 v0, v0, v1, v2
251+
; GFX906-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
253252
; GFX906-NEXT: s_setpc_b64 s[30:31]
254253
;
255254
; GFX908-LABEL: v_sdot2_fnegf32_c:
@@ -263,8 +262,7 @@ define i32 @v_sdot2_fnegf32_c(<2 x i16> %a, <2 x i16> %b, float %c) {
263262
; GFX10-LABEL: v_sdot2_fnegf32_c:
264263
; GFX10: ; %bb.0:
265264
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
266-
; GFX10-NEXT: v_xor_b32_e32 v2, 0x80000000, v2
267-
; GFX10-NEXT: v_dot2_i32_i16 v0, v0, v1, v2
265+
; GFX10-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
268266
; GFX10-NEXT: s_setpc_b64 s[30:31]
269267
%neg.c = fneg float %c
270268
%cast.neg.c = bitcast float %neg.c to i32
@@ -276,8 +274,7 @@ define i32 @v_sdot2_fnegv2f16_c(<2 x i16> %a, <2 x i16> %b, <2 x half> %c) {
276274
; GFX906-LABEL: v_sdot2_fnegv2f16_c:
277275
; GFX906: ; %bb.0:
278276
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
279-
; GFX906-NEXT: v_xor_b32_e32 v2, 0x80008000, v2
280-
; GFX906-NEXT: v_dot2_i32_i16 v0, v0, v1, v2
277+
; GFX906-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
281278
; GFX906-NEXT: s_setpc_b64 s[30:31]
282279
;
283280
; GFX908-LABEL: v_sdot2_fnegv2f16_c:
@@ -291,8 +288,7 @@ define i32 @v_sdot2_fnegv2f16_c(<2 x i16> %a, <2 x i16> %b, <2 x half> %c) {
291288
; GFX10-LABEL: v_sdot2_fnegv2f16_c:
292289
; GFX10: ; %bb.0:
293290
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
294-
; GFX10-NEXT: v_xor_b32_e32 v2, 0x80008000, v2
295-
; GFX10-NEXT: v_dot2_i32_i16 v0, v0, v1, v2
291+
; GFX10-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
296292
; GFX10-NEXT: s_setpc_b64 s[30:31]
297293
%neg.c = fneg <2 x half> %c
298294
%cast.neg.c = bitcast <2 x half> %neg.c to i32
@@ -304,8 +300,7 @@ define i32 @v_sdot2_shuffle10_a(<2 x i16> %a, <2 x i16> %b, i32 %c) {
304300
; GFX906-LABEL: v_sdot2_shuffle10_a:
305301
; GFX906: ; %bb.0:
306302
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
307-
; GFX906-NEXT: v_alignbit_b32 v0, v0, v0, 16
308-
; GFX906-NEXT: v_dot2_i32_i16 v0, v0, v1, v2
303+
; GFX906-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 op_sel:[1,0,0] op_sel_hi:[0,1,1]
309304
; GFX906-NEXT: s_setpc_b64 s[30:31]
310305
;
311306
; GFX908-LABEL: v_sdot2_shuffle10_a:
@@ -319,8 +314,7 @@ define i32 @v_sdot2_shuffle10_a(<2 x i16> %a, <2 x i16> %b, i32 %c) {
319314
; GFX10-LABEL: v_sdot2_shuffle10_a:
320315
; GFX10: ; %bb.0:
321316
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
322-
; GFX10-NEXT: v_alignbit_b32 v0, v0, v0, 16
323-
; GFX10-NEXT: v_dot2_i32_i16 v0, v0, v1, v2
317+
; GFX10-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 op_sel:[1,0,0] op_sel_hi:[0,1,1]
324318
; GFX10-NEXT: s_setpc_b64 s[30:31]
325319
%shuf.a = shufflevector <2 x i16> %a, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
326320
%r = call i32 @llvm.amdgcn.sdot2(<2 x i16> %shuf.a, <2 x i16> %b, i32 %c, i1 false)
@@ -331,8 +325,7 @@ define i32 @v_sdot2_shuffle10_b(<2 x i16> %a, <2 x i16> %b, i32 %c) {
331325
; GFX906-LABEL: v_sdot2_shuffle10_b:
332326
; GFX906: ; %bb.0:
333327
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
334-
; GFX906-NEXT: v_alignbit_b32 v1, v1, v1, 16
335-
; GFX906-NEXT: v_dot2_i32_i16 v0, v0, v1, v2
328+
; GFX906-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 op_sel:[0,1,0] op_sel_hi:[1,0,1]
336329
; GFX906-NEXT: s_setpc_b64 s[30:31]
337330
;
338331
; GFX908-LABEL: v_sdot2_shuffle10_b:
@@ -346,8 +339,7 @@ define i32 @v_sdot2_shuffle10_b(<2 x i16> %a, <2 x i16> %b, i32 %c) {
346339
; GFX10-LABEL: v_sdot2_shuffle10_b:
347340
; GFX10: ; %bb.0:
348341
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
349-
; GFX10-NEXT: v_alignbit_b32 v1, v1, v1, 16
350-
; GFX10-NEXT: v_dot2_i32_i16 v0, v0, v1, v2
342+
; GFX10-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 op_sel:[0,1,0] op_sel_hi:[1,0,1]
351343
; GFX10-NEXT: s_setpc_b64 s[30:31]
352344
%shuf.b = shufflevector <2 x i16> %b, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
353345
%r = call i32 @llvm.amdgcn.sdot2(<2 x i16> %a, <2 x i16> %shuf.b, i32 %c, i1 false)

llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -91,8 +91,7 @@ define i32 @v_sdot4_fnegf32_a(float %a, i32 %b, i32 %c) {
9191
; GFX906-LABEL: v_sdot4_fnegf32_a:
9292
; GFX906: ; %bb.0:
9393
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
94-
; GFX906-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
95-
; GFX906-NEXT: v_dot4_i32_i8 v0, v0, v1, v2
94+
; GFX906-NEXT: v_dot4_i32_i8 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
9695
; GFX906-NEXT: s_setpc_b64 s[30:31]
9796
;
9897
; GFX10-LABEL: v_sdot4_fnegf32_a:
@@ -112,8 +111,7 @@ define i32 @v_sdot4_fnegv2f16_a(<2 x half> %a, i32 %b, i32 %c) {
112111
; GFX906-LABEL: v_sdot4_fnegv2f16_a:
113112
; GFX906: ; %bb.0:
114113
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
115-
; GFX906-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
116-
; GFX906-NEXT: v_dot4_i32_i8 v0, v0, v1, v2
114+
; GFX906-NEXT: v_dot4_i32_i8 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
117115
; GFX906-NEXT: s_setpc_b64 s[30:31]
118116
;
119117
; GFX10-LABEL: v_sdot4_fnegv2f16_a:

llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot8.ll

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -47,15 +47,13 @@ define i32 @v_sdot8_fnegf32_a(float %a, i32 %b, i32 %c) {
4747
; GFX906-LABEL: v_sdot8_fnegf32_a:
4848
; GFX906: ; %bb.0:
4949
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
50-
; GFX906-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
51-
; GFX906-NEXT: v_dot8_i32_i4 v0, v0, v1, v2
50+
; GFX906-NEXT: v_dot8_i32_i4 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
5251
; GFX906-NEXT: s_setpc_b64 s[30:31]
5352
;
5453
; GFX10-LABEL: v_sdot8_fnegf32_a:
5554
; GFX10: ; %bb.0:
5655
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
57-
; GFX10-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
58-
; GFX10-NEXT: v_dot8_i32_i4 v0, v0, v1, v2
56+
; GFX10-NEXT: v_dot8_i32_i4 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
5957
; GFX10-NEXT: s_setpc_b64 s[30:31]
6058
%neg.a = fneg float %a
6159
%cast.neg.a = bitcast float %neg.a to i32
@@ -67,15 +65,13 @@ define i32 @v_sdot8_fnegv2f16_a(<2 x half> %a, i32 %b, i32 %c) {
6765
; GFX906-LABEL: v_sdot8_fnegv2f16_a:
6866
; GFX906: ; %bb.0:
6967
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
70-
; GFX906-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
71-
; GFX906-NEXT: v_dot8_i32_i4 v0, v0, v1, v2
68+
; GFX906-NEXT: v_dot8_i32_i4 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
7269
; GFX906-NEXT: s_setpc_b64 s[30:31]
7370
;
7471
; GFX10-LABEL: v_sdot8_fnegv2f16_a:
7572
; GFX10: ; %bb.0:
7673
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
77-
; GFX10-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
78-
; GFX10-NEXT: v_dot8_i32_i4 v0, v0, v1, v2
74+
; GFX10-NEXT: v_dot8_i32_i4 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
7975
; GFX10-NEXT: s_setpc_b64 s[30:31]
8076
%neg.a = fneg <2 x half> %a
8177
%cast.neg.a = bitcast <2 x half> %neg.a to i32

llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot2.ll

Lines changed: 12 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -235,22 +235,19 @@ define i32 @v_udot2_fnegf32_c(<2 x i16> %a, <2 x i16> %b, float %c) {
235235
; GFX906-LABEL: v_udot2_fnegf32_c:
236236
; GFX906: ; %bb.0:
237237
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
238-
; GFX906-NEXT: v_xor_b32_e32 v2, 0x80000000, v2
239-
; GFX906-NEXT: v_dot2_u32_u16 v0, v0, v1, v2
238+
; GFX906-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
240239
; GFX906-NEXT: s_setpc_b64 s[30:31]
241240
;
242241
; GFX908-LABEL: v_udot2_fnegf32_c:
243242
; GFX908: ; %bb.0:
244243
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
245-
; GFX908-NEXT: v_xor_b32_e32 v2, 0x80000000, v2
246-
; GFX908-NEXT: v_dot2_u32_u16 v0, v0, v1, v2
244+
; GFX908-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
247245
; GFX908-NEXT: s_setpc_b64 s[30:31]
248246
;
249247
; GFX10-LABEL: v_udot2_fnegf32_c:
250248
; GFX10: ; %bb.0:
251249
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
252-
; GFX10-NEXT: v_xor_b32_e32 v2, 0x80000000, v2
253-
; GFX10-NEXT: v_dot2_u32_u16 v0, v0, v1, v2
250+
; GFX10-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
254251
; GFX10-NEXT: s_setpc_b64 s[30:31]
255252
%neg.c = fneg float %c
256253
%cast.neg.c = bitcast float %neg.c to i32
@@ -262,22 +259,19 @@ define i32 @v_udot2_fnegv2f16_c(<2 x i16> %a, <2 x i16> %b, <2 x half> %c) {
262259
; GFX906-LABEL: v_udot2_fnegv2f16_c:
263260
; GFX906: ; %bb.0:
264261
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
265-
; GFX906-NEXT: v_xor_b32_e32 v2, 0x80008000, v2
266-
; GFX906-NEXT: v_dot2_u32_u16 v0, v0, v1, v2
262+
; GFX906-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
267263
; GFX906-NEXT: s_setpc_b64 s[30:31]
268264
;
269265
; GFX908-LABEL: v_udot2_fnegv2f16_c:
270266
; GFX908: ; %bb.0:
271267
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
272-
; GFX908-NEXT: v_xor_b32_e32 v2, 0x80008000, v2
273-
; GFX908-NEXT: v_dot2_u32_u16 v0, v0, v1, v2
268+
; GFX908-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
274269
; GFX908-NEXT: s_setpc_b64 s[30:31]
275270
;
276271
; GFX10-LABEL: v_udot2_fnegv2f16_c:
277272
; GFX10: ; %bb.0:
278273
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
279-
; GFX10-NEXT: v_xor_b32_e32 v2, 0x80008000, v2
280-
; GFX10-NEXT: v_dot2_u32_u16 v0, v0, v1, v2
274+
; GFX10-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
281275
; GFX10-NEXT: s_setpc_b64 s[30:31]
282276
%neg.c = fneg <2 x half> %c
283277
%cast.neg.c = bitcast <2 x half> %neg.c to i32
@@ -289,22 +283,19 @@ define i32 @v_udot2_shuffle10_a(<2 x i16> %a, <2 x i16> %b, i32 %c) {
289283
; GFX906-LABEL: v_udot2_shuffle10_a:
290284
; GFX906: ; %bb.0:
291285
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
292-
; GFX906-NEXT: v_alignbit_b32 v0, v0, v0, 16
293-
; GFX906-NEXT: v_dot2_u32_u16 v0, v0, v1, v2
286+
; GFX906-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 op_sel:[1,0,0] op_sel_hi:[0,1,1]
294287
; GFX906-NEXT: s_setpc_b64 s[30:31]
295288
;
296289
; GFX908-LABEL: v_udot2_shuffle10_a:
297290
; GFX908: ; %bb.0:
298291
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
299-
; GFX908-NEXT: v_alignbit_b32 v0, v0, v0, 16
300-
; GFX908-NEXT: v_dot2_u32_u16 v0, v0, v1, v2
292+
; GFX908-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 op_sel:[1,0,0] op_sel_hi:[0,1,1]
301293
; GFX908-NEXT: s_setpc_b64 s[30:31]
302294
;
303295
; GFX10-LABEL: v_udot2_shuffle10_a:
304296
; GFX10: ; %bb.0:
305297
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
306-
; GFX10-NEXT: v_alignbit_b32 v0, v0, v0, 16
307-
; GFX10-NEXT: v_dot2_u32_u16 v0, v0, v1, v2
298+
; GFX10-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 op_sel:[1,0,0] op_sel_hi:[0,1,1]
308299
; GFX10-NEXT: s_setpc_b64 s[30:31]
309300
%shuf.a = shufflevector <2 x i16> %a, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
310301
%r = call i32 @llvm.amdgcn.udot2(<2 x i16> %shuf.a, <2 x i16> %b, i32 %c, i1 false)
@@ -315,22 +306,19 @@ define i32 @v_udot2_shuffle10_b(<2 x i16> %a, <2 x i16> %b, i32 %c) {
315306
; GFX906-LABEL: v_udot2_shuffle10_b:
316307
; GFX906: ; %bb.0:
317308
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
318-
; GFX906-NEXT: v_alignbit_b32 v1, v1, v1, 16
319-
; GFX906-NEXT: v_dot2_u32_u16 v0, v0, v1, v2
309+
; GFX906-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 op_sel:[0,1,0] op_sel_hi:[1,0,1]
320310
; GFX906-NEXT: s_setpc_b64 s[30:31]
321311
;
322312
; GFX908-LABEL: v_udot2_shuffle10_b:
323313
; GFX908: ; %bb.0:
324314
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
325-
; GFX908-NEXT: v_alignbit_b32 v1, v1, v1, 16
326-
; GFX908-NEXT: v_dot2_u32_u16 v0, v0, v1, v2
315+
; GFX908-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 op_sel:[0,1,0] op_sel_hi:[1,0,1]
327316
; GFX908-NEXT: s_setpc_b64 s[30:31]
328317
;
329318
; GFX10-LABEL: v_udot2_shuffle10_b:
330319
; GFX10: ; %bb.0:
331320
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
332-
; GFX10-NEXT: v_alignbit_b32 v1, v1, v1, 16
333-
; GFX10-NEXT: v_dot2_u32_u16 v0, v0, v1, v2
321+
; GFX10-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 op_sel:[0,1,0] op_sel_hi:[1,0,1]
334322
; GFX10-NEXT: s_setpc_b64 s[30:31]
335323
%shuf.b = shufflevector <2 x i16> %b, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
336324
%r = call i32 @llvm.amdgcn.udot2(<2 x i16> %a, <2 x i16> %shuf.b, i32 %c, i1 false)

llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -112,15 +112,13 @@ define i32 @v_udot4_fnegf32_a(float %a, i32 %b, i32 %c) {
112112
; GFX906-LABEL: v_udot4_fnegf32_a:
113113
; GFX906: ; %bb.0:
114114
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
115-
; GFX906-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
116-
; GFX906-NEXT: v_dot4_u32_u8 v0, v0, v1, v2
115+
; GFX906-NEXT: v_dot4_u32_u8 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
117116
; GFX906-NEXT: s_setpc_b64 s[30:31]
118117
;
119118
; GFX10PLUS-LABEL: v_udot4_fnegf32_a:
120119
; GFX10PLUS: ; %bb.0:
121120
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
122-
; GFX10PLUS-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
123-
; GFX10PLUS-NEXT: v_dot4_u32_u8 v0, v0, v1, v2
121+
; GFX10PLUS-NEXT: v_dot4_u32_u8 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
124122
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
125123
%neg.a = fneg float %a
126124
%cast.neg.a = bitcast float %neg.a to i32
@@ -132,15 +130,13 @@ define i32 @v_udot4_fnegv2f16_a(<2 x half> %a, i32 %b, i32 %c) {
132130
; GFX906-LABEL: v_udot4_fnegv2f16_a:
133131
; GFX906: ; %bb.0:
134132
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
135-
; GFX906-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
136-
; GFX906-NEXT: v_dot4_u32_u8 v0, v0, v1, v2
133+
; GFX906-NEXT: v_dot4_u32_u8 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
137134
; GFX906-NEXT: s_setpc_b64 s[30:31]
138135
;
139136
; GFX10PLUS-LABEL: v_udot4_fnegv2f16_a:
140137
; GFX10PLUS: ; %bb.0:
141138
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
142-
; GFX10PLUS-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
143-
; GFX10PLUS-NEXT: v_dot4_u32_u8 v0, v0, v1, v2
139+
; GFX10PLUS-NEXT: v_dot4_u32_u8 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
144140
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
145141
%neg.a = fneg <2 x half> %a
146142
%cast.neg.a = bitcast <2 x half> %neg.a to i32

llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot8.ll

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -48,15 +48,13 @@ define i32 @v_udot8_fnegf32_a(float %a, i32 %b, i32 %c) {
4848
; GFX906-LABEL: v_udot8_fnegf32_a:
4949
; GFX906: ; %bb.0:
5050
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
51-
; GFX906-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
52-
; GFX906-NEXT: v_dot8_u32_u4 v0, v0, v1, v2
51+
; GFX906-NEXT: v_dot8_u32_u4 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
5352
; GFX906-NEXT: s_setpc_b64 s[30:31]
5453
;
5554
; GFX10PLUS-LABEL: v_udot8_fnegf32_a:
5655
; GFX10PLUS: ; %bb.0:
5756
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
58-
; GFX10PLUS-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
59-
; GFX10PLUS-NEXT: v_dot8_u32_u4 v0, v0, v1, v2
57+
; GFX10PLUS-NEXT: v_dot8_u32_u4 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
6058
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
6159
%neg.a = fneg float %a
6260
%cast.neg.a = bitcast float %neg.a to i32
@@ -68,15 +66,13 @@ define i32 @v_udot8_fnegv2f16_a(<2 x half> %a, i32 %b, i32 %c) {
6866
; GFX906-LABEL: v_udot8_fnegv2f16_a:
6967
; GFX906: ; %bb.0:
7068
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
71-
; GFX906-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
72-
; GFX906-NEXT: v_dot8_u32_u4 v0, v0, v1, v2
69+
; GFX906-NEXT: v_dot8_u32_u4 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
7370
; GFX906-NEXT: s_setpc_b64 s[30:31]
7471
;
7572
; GFX10PLUS-LABEL: v_udot8_fnegv2f16_a:
7673
; GFX10PLUS: ; %bb.0:
7774
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
78-
; GFX10PLUS-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
79-
; GFX10PLUS-NEXT: v_dot8_u32_u4 v0, v0, v1, v2
75+
; GFX10PLUS-NEXT: v_dot8_u32_u4 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
8076
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
8177
%neg.a = fneg <2 x half> %a
8278
%cast.neg.a = bitcast <2 x half> %neg.a to i32

llvm/test/CodeGen/AMDGPU/packed-fp32.ll

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,7 @@ define amdgpu_kernel void @fadd_v2_v_v_splat(ptr addrspace(1) %a) {
8787
; GCN-LABEL: {{^}}fadd_v2_v_lit_splat:
8888
; GFX900-COUNT-2: v_add_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
8989
; PACKED-SDAG: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 1.0 op_sel_hi:[1,0]{{$}}
90-
; PACKED-GISEL: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 1.0{{$}}
90+
; PACKED-GISEL: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 1.0 op_sel_hi:[1,0]{{$}}
9191
define amdgpu_kernel void @fadd_v2_v_lit_splat(ptr addrspace(1) %a) {
9292
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
9393
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
@@ -308,7 +308,7 @@ define amdgpu_kernel void @fmul_v2_v_v_splat(ptr addrspace(1) %a) {
308308
; GCN-LABEL: {{^}}fmul_v2_v_lit_splat:
309309
; GFX900-COUNT-2: v_mul_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}
310310
; PACKED-SDAG: v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 4.0 op_sel_hi:[1,0]{{$}}
311-
; PACKED-GISEL: v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 4.0{{$}}
311+
; PACKED-GISEL: v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 4.0 op_sel_hi:[1,0]{{$}}
312312
define amdgpu_kernel void @fmul_v2_v_lit_splat(ptr addrspace(1) %a) {
313313
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
314314
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
@@ -432,7 +432,7 @@ define amdgpu_kernel void @fma_v2_v_v_splat(ptr addrspace(1) %a) {
432432
; GCN-LABEL: {{^}}fma_v2_v_lit_splat:
433433
; GFX900-COUNT-2: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, 4.0, 1.0
434434
; PACKED-SDAG: v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 4.0, 1.0 op_sel_hi:[1,0,0]{{$}}
435-
; PACKED-GISEL: v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 4.0, 1.0{{$}}
435+
; PACKED-GISEL: v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 4.0, 1.0 op_sel_hi:[1,0,0]{{$}}
436436
define amdgpu_kernel void @fma_v2_v_lit_splat(ptr addrspace(1) %a) {
437437
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
438438
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
@@ -556,8 +556,8 @@ bb:
556556
; PACKED-SDAG: v_add_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, 0
557557
; PACKED-SDAG: v_add_f32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
558558

559-
; PACKED-GISEL: v_pk_add_f32 v[{{[0-9:]+}}], s[{{[0-9:]+}}], 0{{$}}
560-
; PACKED-GISEL: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 0{{$}}
559+
; PACKED-GISEL: v_pk_add_f32 v[{{[0-9:]+}}], s[{{[0-9:]+}}], 0 op_sel_hi:[1,0]{{$}}
560+
; PACKED-GISEL: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 0 op_sel_hi:[1,0]{{$}}
561561
define amdgpu_kernel void @fadd_fadd_fsub_0(<2 x float> %arg) {
562562
bb:
563563
%i12 = fadd <2 x float> zeroinitializer, %arg

0 commit comments

Comments
 (0)