Skip to content

Commit 8e6ca11

Browse files
frederik-h authored and bcahoon committed
[AMDGPU] SIPeepholeSDWA: Disable on existing SDWA instructions (llvm#124131)
This PR reapplies the changes from PR llvm#123942 which had to be reverted because of a test failure. The test has been adjusted. (cherry picked from commit bfd9bc2)
1 parent bab7691 commit 8e6ca11

19 files changed

+388
-106
lines changed

llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp

Lines changed: 5 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -957,8 +957,11 @@ bool isConvertibleToSDWA(MachineInstr &MI,
957957
const SIInstrInfo* TII) {
958958
// Check if this is already an SDWA instruction
959959
unsigned Opc = MI.getOpcode();
960-
if (TII->isSDWA(Opc))
961-
return true;
960+
if (TII->isSDWA(Opc)) {
961+
// FIXME: Reenable after fixing selection handling.
962+
// Cf. llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr-combine-sel.ll
963+
return false;
964+
}
962965

963966
// Check if this instruction has opcode that supports SDWA
964967
if (AMDGPU::getSDWAOp(Opc) == -1)

llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll

Lines changed: 10 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -280,8 +280,9 @@ define i16 @v_saddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
280280
; GFX8-NEXT: v_min_i16_e32 v1, v2, v1
281281
; GFX8-NEXT: v_add_u16_e32 v1, v3, v1
282282
; GFX8-NEXT: v_mov_b32_e32 v2, 0xff
283+
; GFX8-NEXT: v_and_b32_sdwa v1, sext(v1), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
283284
; GFX8-NEXT: v_and_b32_sdwa v0, sext(v0), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
284-
; GFX8-NEXT: v_and_b32_sdwa v1, sext(v1), v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
285+
; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1
285286
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
286287
; GFX8-NEXT: s_setpc_b64 s[30:31]
287288
;
@@ -299,7 +300,8 @@ define i16 @v_saddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
299300
; GFX9-NEXT: v_pk_add_i16 v0, v0, v1 clamp
300301
; GFX9-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
301302
; GFX9-NEXT: v_mov_b32_e32 v1, 0xff
302-
; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
303+
; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
304+
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
303305
; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
304306
; GFX9-NEXT: s_setpc_b64 s[30:31]
305307
;
@@ -439,7 +441,8 @@ define amdgpu_ps i16 @s_saddsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
439441
; GFX9-NEXT: v_pk_add_i16 v0, s0, v0 clamp
440442
; GFX9-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
441443
; GFX9-NEXT: v_mov_b32_e32 v1, 0xff
442-
; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
444+
; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
445+
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
443446
; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
444447
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
445448
; GFX9-NEXT: ; return to shader part epilog
@@ -609,9 +612,11 @@ define i32 @v_saddsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
609612
; GFX8-NEXT: v_and_b32_sdwa v0, sext(v0), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
610613
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 8, v1
611614
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
612-
; GFX8-NEXT: v_and_b32_sdwa v1, sext(v2), v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
615+
; GFX8-NEXT: v_and_b32_sdwa v1, sext(v2), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
616+
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
613617
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
614-
; GFX8-NEXT: v_and_b32_sdwa v1, sext(v3), v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
618+
; GFX8-NEXT: v_and_b32_sdwa v1, sext(v3), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
619+
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v1
615620
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
616621
; GFX8-NEXT: s_setpc_b64 s[30:31]
617622
;

llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll

Lines changed: 10 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -280,8 +280,9 @@ define i16 @v_ssubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
280280
; GFX8-NEXT: v_min_i16_e32 v1, v1, v4
281281
; GFX8-NEXT: v_sub_u16_e32 v1, v3, v1
282282
; GFX8-NEXT: v_mov_b32_e32 v2, 0xff
283+
; GFX8-NEXT: v_and_b32_sdwa v1, sext(v1), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
283284
; GFX8-NEXT: v_and_b32_sdwa v0, sext(v0), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
284-
; GFX8-NEXT: v_and_b32_sdwa v1, sext(v1), v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
285+
; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1
285286
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
286287
; GFX8-NEXT: s_setpc_b64 s[30:31]
287288
;
@@ -299,7 +300,8 @@ define i16 @v_ssubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
299300
; GFX9-NEXT: v_pk_sub_i16 v0, v0, v1 clamp
300301
; GFX9-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
301302
; GFX9-NEXT: v_mov_b32_e32 v1, 0xff
302-
; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
303+
; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
304+
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
303305
; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
304306
; GFX9-NEXT: s_setpc_b64 s[30:31]
305307
;
@@ -439,7 +441,8 @@ define amdgpu_ps i16 @s_ssubsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
439441
; GFX9-NEXT: v_pk_sub_i16 v0, s0, v0 clamp
440442
; GFX9-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
441443
; GFX9-NEXT: v_mov_b32_e32 v1, 0xff
442-
; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
444+
; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
445+
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
443446
; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
444447
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
445448
; GFX9-NEXT: ; return to shader part epilog
@@ -609,9 +612,11 @@ define i32 @v_ssubsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
609612
; GFX8-NEXT: v_and_b32_sdwa v0, sext(v0), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
610613
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 8, v1
611614
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
612-
; GFX8-NEXT: v_and_b32_sdwa v1, sext(v2), v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
615+
; GFX8-NEXT: v_and_b32_sdwa v1, sext(v2), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
616+
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
613617
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
614-
; GFX8-NEXT: v_and_b32_sdwa v1, sext(v3), v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
618+
; GFX8-NEXT: v_and_b32_sdwa v1, sext(v3), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
619+
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v1
615620
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
616621
; GFX8-NEXT: s_setpc_b64 s[30:31]
617622
;

llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll

Lines changed: 16 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -224,7 +224,8 @@ define i16 @v_uaddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
224224
; GFX9-NEXT: v_pk_add_u16 v0, v0, v1 clamp
225225
; GFX9-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
226226
; GFX9-NEXT: v_mov_b32_e32 v1, 0xff
227-
; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
227+
; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
228+
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
228229
; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
229230
; GFX9-NEXT: s_setpc_b64 s[30:31]
230231
;
@@ -329,7 +330,8 @@ define amdgpu_ps i16 @s_uaddsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
329330
; GFX9-NEXT: v_pk_add_u16 v0, s0, v0 clamp
330331
; GFX9-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
331332
; GFX9-NEXT: v_mov_b32_e32 v1, 0xff
332-
; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
333+
; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
334+
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
333335
; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
334336
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
335337
; GFX9-NEXT: ; return to shader part epilog
@@ -451,9 +453,11 @@ define i32 @v_uaddsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
451453
; GFX8-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
452454
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 8, v1
453455
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
454-
; GFX8-NEXT: v_and_b32_sdwa v1, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
456+
; GFX8-NEXT: v_and_b32_sdwa v1, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
457+
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
455458
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
456-
; GFX8-NEXT: v_and_b32_sdwa v1, v3, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
459+
; GFX8-NEXT: v_and_b32_sdwa v1, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
460+
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v1
457461
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
458462
; GFX8-NEXT: s_setpc_b64 s[30:31]
459463
;
@@ -618,18 +622,20 @@ define amdgpu_ps i32 @s_uaddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
618622
; GFX8-NEXT: v_mov_b32_e32 v4, 0xff
619623
; GFX8-NEXT: s_lshl_b32 s0, s3, 8
620624
; GFX8-NEXT: v_mov_b32_e32 v2, s1
621-
; GFX8-NEXT: s_lshl_b32 s1, s7, 8
622625
; GFX8-NEXT: v_and_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
623626
; GFX8-NEXT: v_add_u16_e64 v2, s0, v2 clamp
624-
; GFX8-NEXT: s_lshl_b32 s0, s4, 8
625-
; GFX8-NEXT: v_mov_b32_e32 v3, s1
627+
; GFX8-NEXT: s_lshl_b32 s1, s7, 8
626628
; GFX8-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
627629
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 8, v1
628-
; GFX8-NEXT: v_add_u16_e64 v3, s0, v3 clamp
630+
; GFX8-NEXT: s_lshl_b32 s0, s4, 8
631+
; GFX8-NEXT: v_mov_b32_e32 v3, s1
629632
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
630-
; GFX8-NEXT: v_and_b32_sdwa v1, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
633+
; GFX8-NEXT: v_and_b32_sdwa v1, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
634+
; GFX8-NEXT: v_add_u16_e64 v3, s0, v3 clamp
635+
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
631636
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
632-
; GFX8-NEXT: v_and_b32_sdwa v1, v3, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
637+
; GFX8-NEXT: v_and_b32_sdwa v1, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
638+
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v1
633639
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
634640
; GFX8-NEXT: v_readfirstlane_b32 s0, v0
635641
; GFX8-NEXT: ; return to shader part epilog

llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll

Lines changed: 16 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -218,7 +218,8 @@ define i16 @v_usubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
218218
; GFX9-NEXT: v_pk_sub_u16 v0, v0, v1 clamp
219219
; GFX9-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
220220
; GFX9-NEXT: v_mov_b32_e32 v1, 0xff
221-
; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
221+
; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
222+
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
222223
; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
223224
; GFX9-NEXT: s_setpc_b64 s[30:31]
224225
;
@@ -321,7 +322,8 @@ define amdgpu_ps i16 @s_usubsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
321322
; GFX9-NEXT: v_pk_sub_u16 v0, s0, v0 clamp
322323
; GFX9-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
323324
; GFX9-NEXT: v_mov_b32_e32 v1, 0xff
324-
; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
325+
; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
326+
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
325327
; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
326328
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
327329
; GFX9-NEXT: ; return to shader part epilog
@@ -439,9 +441,11 @@ define i32 @v_usubsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
439441
; GFX8-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
440442
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 8, v1
441443
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
442-
; GFX8-NEXT: v_and_b32_sdwa v1, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
444+
; GFX8-NEXT: v_and_b32_sdwa v1, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
445+
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
443446
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
444-
; GFX8-NEXT: v_and_b32_sdwa v1, v3, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
447+
; GFX8-NEXT: v_and_b32_sdwa v1, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
448+
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v1
445449
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
446450
; GFX8-NEXT: s_setpc_b64 s[30:31]
447451
;
@@ -602,18 +606,20 @@ define amdgpu_ps i32 @s_usubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
602606
; GFX8-NEXT: v_mov_b32_e32 v4, 0xff
603607
; GFX8-NEXT: s_lshl_b32 s0, s3, 8
604608
; GFX8-NEXT: v_mov_b32_e32 v2, s1
605-
; GFX8-NEXT: s_lshl_b32 s1, s7, 8
606609
; GFX8-NEXT: v_and_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
607610
; GFX8-NEXT: v_sub_u16_e64 v2, s0, v2 clamp
608-
; GFX8-NEXT: s_lshl_b32 s0, s4, 8
609-
; GFX8-NEXT: v_mov_b32_e32 v3, s1
611+
; GFX8-NEXT: s_lshl_b32 s1, s7, 8
610612
; GFX8-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
611613
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 8, v1
612-
; GFX8-NEXT: v_sub_u16_e64 v3, s0, v3 clamp
614+
; GFX8-NEXT: s_lshl_b32 s0, s4, 8
615+
; GFX8-NEXT: v_mov_b32_e32 v3, s1
613616
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
614-
; GFX8-NEXT: v_and_b32_sdwa v1, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
617+
; GFX8-NEXT: v_and_b32_sdwa v1, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
618+
; GFX8-NEXT: v_sub_u16_e64 v3, s0, v3 clamp
619+
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
615620
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
616-
; GFX8-NEXT: v_and_b32_sdwa v1, v3, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
621+
; GFX8-NEXT: v_and_b32_sdwa v1, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
622+
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v1
617623
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
618624
; GFX8-NEXT: v_readfirstlane_b32 s0, v0
619625
; GFX8-NEXT: ; return to shader part epilog

llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll

Lines changed: 9 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -4749,8 +4749,10 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp
47494749
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
47504750
; GFX8-NEXT: s_waitcnt vmcnt(0)
47514751
; GFX8-NEXT: v_mov_b32_e32 v5, v0
4752-
; GFX8-NEXT: v_add_f16_sdwa v0, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
4752+
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v5
4753+
; GFX8-NEXT: v_add_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
47534754
; GFX8-NEXT: v_add_f16_e32 v1, v5, v2
4755+
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
47544756
; GFX8-NEXT: v_or_b32_e32 v4, v1, v0
47554757
; GFX8-NEXT: v_mov_b32_e32 v0, v4
47564758
; GFX8-NEXT: v_mov_b32_e32 v1, v5
@@ -4977,8 +4979,10 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace(
49774979
; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start
49784980
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
49794981
; GFX8-NEXT: s_waitcnt vmcnt(0)
4980-
; GFX8-NEXT: v_add_f16_sdwa v1, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
4982+
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v2
4983+
; GFX8-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
49814984
; GFX8-NEXT: v_add_f16_e32 v4, v2, v0
4985+
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
49824986
; GFX8-NEXT: v_or_b32_e32 v1, v4, v1
49834987
; GFX8-NEXT: v_mov_b32_e32 v5, v2
49844988
; GFX8-NEXT: v_mov_b32_e32 v4, v1
@@ -5393,7 +5397,9 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall
53935397
; GFX8-NEXT: ; =>This Loop Header: Depth=1
53945398
; GFX8-NEXT: ; Child Loop BB14_4 Depth 2
53955399
; GFX8-NEXT: s_waitcnt vmcnt(0)
5396-
; GFX8-NEXT: v_add_f16_sdwa v4, v8, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
5400+
; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v8
5401+
; GFX8-NEXT: v_add_f16_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
5402+
; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4
53975403
; GFX8-NEXT: v_add_f16_e32 v6, v8, v5
53985404
; GFX8-NEXT: v_or_b32_e32 v7, v6, v4
53995405
; GFX8-NEXT: v_mov_b32_e32 v6, v7

0 commit comments

Comments
 (0)