Skip to content

Commit a46e108

Browse files
committed
Replace the previous instruction sequence with one based on REG_SEQUENCE and S_LSHR_B64
1 parent adf0f19 commit a46e108

File tree

2 files changed

+117
-131
lines changed

2 files changed

+117
-131
lines changed

llvm/lib/Target/AMDGPU/SIInstructions.td

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2764,10 +2764,20 @@ def : GCNPat<(fshr i32:$src0, i32:$src1, i32:$src2),
27642764
$src2, /* clamp */ 0, /* op_sel */ 0)
27652765
>;
27662766

2767+
// The commented-out code has been left in intentionally to aid the review process, if needed.
2768+
// It will be deleted before landing.
2769+
//def : GCNPat<(UniformTernaryFrag<fshr> i32:$src0, i32:$src1, i32:$src2),
2770+
// (S_OR_B32 (S_LSHR_B32 $src1, $src2), (S_LSHL_B32 $src0, (S_SUB_I32 (i32 32), $src2)))
2771+
//>;
2772+
27672773
def : GCNPat<(UniformTernaryFrag<fshr> i32:$src0, i32:$src1, i32:$src2),
2768-
(S_OR_B32 (S_LSHR_B32 $src1, $src2), (S_LSHL_B32 $src0, (S_SUB_I32 (i32 32), $src2)))
2774+
(i32 (EXTRACT_SUBREG (S_LSHR_B64 (REG_SEQUENCE SReg_64, $src1, sub0, $src0, sub1), (S_AND_B32 $src2, (i32 31))), sub0))
27692775
>;
27702776

2777+
//def : GCNPat<(UniformTernaryFrag<fshr> i32:$src0, i32:$src1, (i32 ShiftAmt32Imm:$src2)),
2778+
// (i32 (EXTRACT_SUBREG (S_LSHR_B64 (REG_SEQUENCE SReg_64, $src1, sub0, $src0, sub1), $src2), sub0))
2779+
//>;
2780+
27712781
} // end True16Predicate = UseFakeTrue16Insts
27722782

27732783
/********** ====================== **********/

llvm/test/CodeGen/AMDGPU/fshr.ll

Lines changed: 106 additions & 130 deletions
Original file line numberDiff line numberDiff line change
@@ -103,11 +103,11 @@ define amdgpu_kernel void @fshr_i32(ptr addrspace(1) %in, i32 %x, i32 %y, i32 %z
103103
; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
104104
; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
105105
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
106-
; GFX11-FAKE16-NEXT: s_sub_i32 s3, 32, s2
107-
; GFX11-FAKE16-NEXT: s_lshr_b32 s1, s1, s2
108-
; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, s3
106+
; GFX11-FAKE16-NEXT: s_mov_b32 s6, s1
107+
; GFX11-FAKE16-NEXT: s_mov_b32 s7, s0
108+
; GFX11-FAKE16-NEXT: s_and_b32 s0, s2, 31
109109
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
110-
; GFX11-FAKE16-NEXT: s_or_b32 s0, s1, s0
110+
; GFX11-FAKE16-NEXT: s_lshr_b64 s[0:1], s[6:7], s0
111111
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
112112
; GFX11-FAKE16-NEXT: global_store_b32 v0, v1, s[4:5]
113113
; GFX11-FAKE16-NEXT: s_endpgm
@@ -131,11 +131,11 @@ define amdgpu_kernel void @fshr_i32(ptr addrspace(1) %in, i32 %x, i32 %y, i32 %z
131131
; GFX12-FAKE16-NEXT: s_load_b96 s[0:2], s[4:5], 0x2c
132132
; GFX12-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
133133
; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
134-
; GFX12-FAKE16-NEXT: s_sub_co_i32 s3, 32, s2
135-
; GFX12-FAKE16-NEXT: s_lshr_b32 s1, s1, s2
136-
; GFX12-FAKE16-NEXT: s_lshl_b32 s0, s0, s3
134+
; GFX12-FAKE16-NEXT: s_mov_b32 s6, s1
135+
; GFX12-FAKE16-NEXT: s_mov_b32 s7, s0
136+
; GFX12-FAKE16-NEXT: s_and_b32 s0, s2, 31
137137
; GFX12-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
138-
; GFX12-FAKE16-NEXT: s_or_b32 s0, s1, s0
138+
; GFX12-FAKE16-NEXT: s_lshr_b64 s[0:1], s[6:7], s0
139139
; GFX12-FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
140140
; GFX12-FAKE16-NEXT: global_store_b32 v0, v1, s[4:5]
141141
; GFX12-FAKE16-NEXT: s_endpgm
@@ -213,12 +213,11 @@ define amdgpu_kernel void @fshr_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) {
213213
; GFX11-FAKE16-LABEL: fshr_i32_imm:
214214
; GFX11-FAKE16: ; %bb.0: ; %entry
215215
; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
216-
; GFX11-FAKE16-NEXT: s_sub_i32 s4, 32, 7
217216
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
218-
; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, s4
219-
; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s3, 7
217+
; GFX11-FAKE16-NEXT: s_mov_b32 s4, s3
218+
; GFX11-FAKE16-NEXT: s_mov_b32 s5, s2
220219
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
221-
; GFX11-FAKE16-NEXT: s_or_b32 s2, s3, s2
220+
; GFX11-FAKE16-NEXT: s_lshr_b64 s[2:3], s[4:5], 7
222221
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
223222
; GFX11-FAKE16-NEXT: global_store_b32 v0, v1, s[0:1]
224223
; GFX11-FAKE16-NEXT: s_endpgm
@@ -235,12 +234,11 @@ define amdgpu_kernel void @fshr_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) {
235234
; GFX12-FAKE16-LABEL: fshr_i32_imm:
236235
; GFX12-FAKE16: ; %bb.0: ; %entry
237236
; GFX12-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
238-
; GFX12-FAKE16-NEXT: s_sub_co_i32 s4, 32, 7
239237
; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
240-
; GFX12-FAKE16-NEXT: s_lshl_b32 s2, s2, s4
241-
; GFX12-FAKE16-NEXT: s_lshr_b32 s3, s3, 7
238+
; GFX12-FAKE16-NEXT: s_mov_b32 s4, s3
239+
; GFX12-FAKE16-NEXT: s_mov_b32 s5, s2
242240
; GFX12-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
243-
; GFX12-FAKE16-NEXT: s_or_b32 s2, s3, s2
241+
; GFX12-FAKE16-NEXT: s_lshr_b64 s[2:3], s[4:5], 7
244242
; GFX12-FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
245243
; GFX12-FAKE16-NEXT: global_store_b32 v0, v1, s[0:1]
246244
; GFX12-FAKE16-NEXT: s_endpgm
@@ -349,20 +347,19 @@ define amdgpu_kernel void @fshr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i
349347
; GFX11-FAKE16-LABEL: fshr_v2i32:
350348
; GFX11-FAKE16: ; %bb.0: ; %entry
351349
; GFX11-FAKE16-NEXT: s_clause 0x2
352-
; GFX11-FAKE16-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c
353350
; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
351+
; GFX11-FAKE16-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c
354352
; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
355353
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
356-
; GFX11-FAKE16-NEXT: s_sub_i32 s8, 32, s6
357-
; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s3, s7
358-
; GFX11-FAKE16-NEXT: s_sub_i32 s7, 32, s7
359-
; GFX11-FAKE16-NEXT: s_lshr_b32 s2, s2, s6
360-
; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, s8
361-
; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, s7
362-
; GFX11-FAKE16-NEXT: s_or_b32 s0, s2, s0
363-
; GFX11-FAKE16-NEXT: s_or_b32 s1, s3, s1
354+
; GFX11-FAKE16-NEXT: s_mov_b32 s8, s3
355+
; GFX11-FAKE16-NEXT: s_mov_b32 s9, s1
356+
; GFX11-FAKE16-NEXT: s_mov_b32 s3, s0
357+
; GFX11-FAKE16-NEXT: s_and_b32 s0, s6, 31
358+
; GFX11-FAKE16-NEXT: s_and_b32 s6, s7, 31
359+
; GFX11-FAKE16-NEXT: s_lshr_b64 s[0:1], s[2:3], s0
360+
; GFX11-FAKE16-NEXT: s_lshr_b64 s[2:3], s[8:9], s6
364361
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
365-
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
362+
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
366363
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s0
367364
; GFX11-FAKE16-NEXT: global_store_b64 v2, v[0:1], s[4:5]
368365
; GFX11-FAKE16-NEXT: s_endpgm
@@ -386,20 +383,19 @@ define amdgpu_kernel void @fshr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i
386383
; GFX12-FAKE16-LABEL: fshr_v2i32:
387384
; GFX12-FAKE16: ; %bb.0: ; %entry
388385
; GFX12-FAKE16-NEXT: s_clause 0x2
389-
; GFX12-FAKE16-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c
390386
; GFX12-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
387+
; GFX12-FAKE16-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c
391388
; GFX12-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
392389
; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
393-
; GFX12-FAKE16-NEXT: s_sub_co_i32 s8, 32, s6
394-
; GFX12-FAKE16-NEXT: s_lshr_b32 s3, s3, s7
395-
; GFX12-FAKE16-NEXT: s_sub_co_i32 s7, 32, s7
396-
; GFX12-FAKE16-NEXT: s_lshr_b32 s2, s2, s6
397-
; GFX12-FAKE16-NEXT: s_lshl_b32 s0, s0, s8
398-
; GFX12-FAKE16-NEXT: s_lshl_b32 s1, s1, s7
399-
; GFX12-FAKE16-NEXT: s_or_b32 s0, s2, s0
400-
; GFX12-FAKE16-NEXT: s_or_b32 s1, s3, s1
390+
; GFX12-FAKE16-NEXT: s_mov_b32 s8, s3
391+
; GFX12-FAKE16-NEXT: s_mov_b32 s9, s1
392+
; GFX12-FAKE16-NEXT: s_mov_b32 s3, s0
393+
; GFX12-FAKE16-NEXT: s_and_b32 s0, s6, 31
394+
; GFX12-FAKE16-NEXT: s_and_b32 s6, s7, 31
395+
; GFX12-FAKE16-NEXT: s_lshr_b64 s[0:1], s[2:3], s0
396+
; GFX12-FAKE16-NEXT: s_lshr_b64 s[2:3], s[8:9], s6
401397
; GFX12-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
402-
; GFX12-FAKE16-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
398+
; GFX12-FAKE16-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
403399
; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, s0
404400
; GFX12-FAKE16-NEXT: global_store_b64 v2, v[0:1], s[4:5]
405401
; GFX12-FAKE16-NEXT: s_endpgm
@@ -494,17 +490,14 @@ define amdgpu_kernel void @fshr_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2
494490
; GFX11-FAKE16-NEXT: s_clause 0x1
495491
; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
496492
; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
497-
; GFX11-FAKE16-NEXT: s_sub_i32 s6, 32, 9
498-
; GFX11-FAKE16-NEXT: s_sub_i32 s7, 32, 7
499493
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
500-
; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, s6
501-
; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, s7
502-
; GFX11-FAKE16-NEXT: s_lshr_b32 s2, s2, 7
503-
; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s3, 9
504-
; GFX11-FAKE16-NEXT: s_or_b32 s0, s2, s0
505-
; GFX11-FAKE16-NEXT: s_or_b32 s1, s3, s1
494+
; GFX11-FAKE16-NEXT: s_mov_b32 s6, s3
495+
; GFX11-FAKE16-NEXT: s_mov_b32 s3, s0
496+
; GFX11-FAKE16-NEXT: s_mov_b32 s7, s1
497+
; GFX11-FAKE16-NEXT: s_lshr_b64 s[0:1], s[2:3], 7
498+
; GFX11-FAKE16-NEXT: s_lshr_b64 s[2:3], s[6:7], 9
506499
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
507-
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
500+
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
508501
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s0
509502
; GFX11-FAKE16-NEXT: global_store_b64 v2, v[0:1], s[4:5]
510503
; GFX11-FAKE16-NEXT: s_endpgm
@@ -526,17 +519,14 @@ define amdgpu_kernel void @fshr_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2
526519
; GFX12-FAKE16-NEXT: s_clause 0x1
527520
; GFX12-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
528521
; GFX12-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
529-
; GFX12-FAKE16-NEXT: s_sub_co_i32 s6, 32, 9
530-
; GFX12-FAKE16-NEXT: s_sub_co_i32 s7, 32, 7
531522
; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
532-
; GFX12-FAKE16-NEXT: s_lshl_b32 s1, s1, s6
533-
; GFX12-FAKE16-NEXT: s_lshl_b32 s0, s0, s7
534-
; GFX12-FAKE16-NEXT: s_lshr_b32 s2, s2, 7
535-
; GFX12-FAKE16-NEXT: s_lshr_b32 s3, s3, 9
536-
; GFX12-FAKE16-NEXT: s_or_b32 s0, s2, s0
537-
; GFX12-FAKE16-NEXT: s_or_b32 s1, s3, s1
523+
; GFX12-FAKE16-NEXT: s_mov_b32 s6, s3
524+
; GFX12-FAKE16-NEXT: s_mov_b32 s3, s0
525+
; GFX12-FAKE16-NEXT: s_mov_b32 s7, s1
526+
; GFX12-FAKE16-NEXT: s_lshr_b64 s[0:1], s[2:3], 7
527+
; GFX12-FAKE16-NEXT: s_lshr_b64 s[2:3], s[6:7], 9
538528
; GFX12-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
539-
; GFX12-FAKE16-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
529+
; GFX12-FAKE16-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
540530
; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, s0
541531
; GFX12-FAKE16-NEXT: global_store_b64 v2, v[0:1], s[4:5]
542532
; GFX12-FAKE16-NEXT: s_endpgm
@@ -676,30 +666,28 @@ define amdgpu_kernel void @fshr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i
676666
; GFX11-FAKE16-LABEL: fshr_v4i32:
677667
; GFX11-FAKE16: ; %bb.0: ; %entry
678668
; GFX11-FAKE16-NEXT: s_clause 0x2
679-
; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x54
680669
; GFX11-FAKE16-NEXT: s_load_b256 s[8:15], s[4:5], 0x34
670+
; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x54
681671
; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
682672
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
683-
; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s15, s3
684-
; GFX11-FAKE16-NEXT: s_sub_i32 s3, 32, s3
685-
; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s14, s2
686-
; GFX11-FAKE16-NEXT: s_sub_i32 s2, 32, s2
687-
; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s13, s1
688-
; GFX11-FAKE16-NEXT: s_sub_i32 s1, 32, s1
689-
; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s12, s0
690-
; GFX11-FAKE16-NEXT: s_sub_i32 s0, 32, s0
691-
; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s11, s3
692-
; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s10, s2
693-
; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s9, s1
694-
; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s8, s0
695-
; GFX11-FAKE16-NEXT: s_or_b32 s3, s6, s3
696-
; GFX11-FAKE16-NEXT: s_or_b32 s2, s7, s2
697-
; GFX11-FAKE16-NEXT: s_or_b32 s0, s12, s0
698-
; GFX11-FAKE16-NEXT: s_or_b32 s1, s13, s1
673+
; GFX11-FAKE16-NEXT: s_mov_b32 s6, s15
674+
; GFX11-FAKE16-NEXT: s_mov_b32 s7, s11
675+
; GFX11-FAKE16-NEXT: s_and_b32 s11, s3, 31
676+
; GFX11-FAKE16-NEXT: s_mov_b32 s15, s10
677+
; GFX11-FAKE16-NEXT: s_and_b32 s10, s2, 31
678+
; GFX11-FAKE16-NEXT: s_mov_b32 s2, s13
679+
; GFX11-FAKE16-NEXT: s_mov_b32 s3, s9
680+
; GFX11-FAKE16-NEXT: s_and_b32 s16, s1, 31
681+
; GFX11-FAKE16-NEXT: s_mov_b32 s13, s8
682+
; GFX11-FAKE16-NEXT: s_and_b32 s8, s0, 31
683+
; GFX11-FAKE16-NEXT: s_lshr_b64 s[0:1], s[6:7], s11
684+
; GFX11-FAKE16-NEXT: s_lshr_b64 s[6:7], s[14:15], s10
685+
; GFX11-FAKE16-NEXT: s_lshr_b64 s[8:9], s[12:13], s8
686+
; GFX11-FAKE16-NEXT: s_lshr_b64 s[2:3], s[2:3], s16
699687
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
700-
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s1
701-
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
702-
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s2
688+
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s2
689+
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v3, s0
690+
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s6
703691
; GFX11-FAKE16-NEXT: global_store_b128 v4, v[0:3], s[4:5]
704692
; GFX11-FAKE16-NEXT: s_endpgm
705693
;
@@ -727,30 +715,28 @@ define amdgpu_kernel void @fshr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i
727715
; GFX12-FAKE16-LABEL: fshr_v4i32:
728716
; GFX12-FAKE16: ; %bb.0: ; %entry
729717
; GFX12-FAKE16-NEXT: s_clause 0x2
730-
; GFX12-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x54
731718
; GFX12-FAKE16-NEXT: s_load_b256 s[8:15], s[4:5], 0x34
719+
; GFX12-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x54
732720
; GFX12-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
733721
; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
734-
; GFX12-FAKE16-NEXT: s_lshr_b32 s6, s15, s3
735-
; GFX12-FAKE16-NEXT: s_sub_co_i32 s3, 32, s3
736-
; GFX12-FAKE16-NEXT: s_lshr_b32 s7, s14, s2
737-
; GFX12-FAKE16-NEXT: s_sub_co_i32 s2, 32, s2
738-
; GFX12-FAKE16-NEXT: s_lshr_b32 s13, s13, s1
739-
; GFX12-FAKE16-NEXT: s_sub_co_i32 s1, 32, s1
740-
; GFX12-FAKE16-NEXT: s_lshr_b32 s12, s12, s0
741-
; GFX12-FAKE16-NEXT: s_sub_co_i32 s0, 32, s0
742-
; GFX12-FAKE16-NEXT: s_lshl_b32 s3, s11, s3
743-
; GFX12-FAKE16-NEXT: s_lshl_b32 s2, s10, s2
744-
; GFX12-FAKE16-NEXT: s_lshl_b32 s1, s9, s1
745-
; GFX12-FAKE16-NEXT: s_lshl_b32 s0, s8, s0
746-
; GFX12-FAKE16-NEXT: s_or_b32 s3, s6, s3
747-
; GFX12-FAKE16-NEXT: s_or_b32 s2, s7, s2
748-
; GFX12-FAKE16-NEXT: s_or_b32 s0, s12, s0
749-
; GFX12-FAKE16-NEXT: s_or_b32 s1, s13, s1
722+
; GFX12-FAKE16-NEXT: s_mov_b32 s6, s15
723+
; GFX12-FAKE16-NEXT: s_mov_b32 s7, s11
724+
; GFX12-FAKE16-NEXT: s_and_b32 s11, s3, 31
725+
; GFX12-FAKE16-NEXT: s_mov_b32 s15, s10
726+
; GFX12-FAKE16-NEXT: s_and_b32 s10, s2, 31
727+
; GFX12-FAKE16-NEXT: s_mov_b32 s2, s13
728+
; GFX12-FAKE16-NEXT: s_mov_b32 s3, s9
729+
; GFX12-FAKE16-NEXT: s_and_b32 s16, s1, 31
730+
; GFX12-FAKE16-NEXT: s_mov_b32 s13, s8
731+
; GFX12-FAKE16-NEXT: s_and_b32 s8, s0, 31
732+
; GFX12-FAKE16-NEXT: s_lshr_b64 s[0:1], s[6:7], s11
733+
; GFX12-FAKE16-NEXT: s_lshr_b64 s[6:7], s[14:15], s10
734+
; GFX12-FAKE16-NEXT: s_lshr_b64 s[8:9], s[12:13], s8
735+
; GFX12-FAKE16-NEXT: s_lshr_b64 s[2:3], s[2:3], s16
750736
; GFX12-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
751-
; GFX12-FAKE16-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s1
752-
; GFX12-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
753-
; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, s2
737+
; GFX12-FAKE16-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s2
738+
; GFX12-FAKE16-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v3, s0
739+
; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, s6
754740
; GFX12-FAKE16-NEXT: global_store_b128 v4, v[0:3], s[4:5]
755741
; GFX12-FAKE16-NEXT: s_endpgm
756742
entry:
@@ -862,26 +848,21 @@ define amdgpu_kernel void @fshr_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4
862848
; GFX11-FAKE16-NEXT: s_clause 0x1
863849
; GFX11-FAKE16-NEXT: s_load_b256 s[8:15], s[4:5], 0x34
864850
; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
865-
; GFX11-FAKE16-NEXT: s_sub_i32 s2, 32, 1
866-
; GFX11-FAKE16-NEXT: s_sub_i32 s3, 32, 9
867-
; GFX11-FAKE16-NEXT: s_sub_i32 s6, 32, 7
868851
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
869-
; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s11, s2
870-
; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s15, 1
871-
; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s10, s3
872-
; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s14, 9
873-
; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s9, s6
874-
; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s13, 7
875-
; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s8, s2
876-
; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s12, 1
877-
; GFX11-FAKE16-NEXT: s_or_b32 s4, s5, s4
878-
; GFX11-FAKE16-NEXT: s_or_b32 s3, s7, s3
879-
; GFX11-FAKE16-NEXT: s_or_b32 s2, s8, s2
880-
; GFX11-FAKE16-NEXT: s_or_b32 s5, s9, s6
852+
; GFX11-FAKE16-NEXT: s_mov_b32 s2, s15
853+
; GFX11-FAKE16-NEXT: s_mov_b32 s3, s11
854+
; GFX11-FAKE16-NEXT: s_mov_b32 s15, s10
855+
; GFX11-FAKE16-NEXT: s_mov_b32 s4, s13
856+
; GFX11-FAKE16-NEXT: s_mov_b32 s5, s9
857+
; GFX11-FAKE16-NEXT: s_mov_b32 s13, s8
858+
; GFX11-FAKE16-NEXT: s_lshr_b64 s[2:3], s[2:3], 1
859+
; GFX11-FAKE16-NEXT: s_lshr_b64 s[6:7], s[14:15], 9
860+
; GFX11-FAKE16-NEXT: s_lshr_b64 s[8:9], s[12:13], 1
861+
; GFX11-FAKE16-NEXT: s_lshr_b64 s[4:5], s[4:5], 7
881862
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
882-
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s5
883-
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, s4
884-
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s3
863+
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s4
864+
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v3, s2
865+
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s6
885866
; GFX11-FAKE16-NEXT: global_store_b128 v4, v[0:3], s[0:1]
886867
; GFX11-FAKE16-NEXT: s_endpgm
887868
;
@@ -904,26 +885,21 @@ define amdgpu_kernel void @fshr_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4
904885
; GFX12-FAKE16-NEXT: s_clause 0x1
905886
; GFX12-FAKE16-NEXT: s_load_b256 s[8:15], s[4:5], 0x34
906887
; GFX12-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
907-
; GFX12-FAKE16-NEXT: s_sub_co_i32 s2, 32, 1
908-
; GFX12-FAKE16-NEXT: s_sub_co_i32 s3, 32, 9
909-
; GFX12-FAKE16-NEXT: s_sub_co_i32 s6, 32, 7
910888
; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
911-
; GFX12-FAKE16-NEXT: s_lshl_b32 s4, s11, s2
912-
; GFX12-FAKE16-NEXT: s_lshr_b32 s5, s15, 1
913-
; GFX12-FAKE16-NEXT: s_lshl_b32 s3, s10, s3
914-
; GFX12-FAKE16-NEXT: s_lshr_b32 s7, s14, 9
915-
; GFX12-FAKE16-NEXT: s_lshl_b32 s6, s9, s6
916-
; GFX12-FAKE16-NEXT: s_lshr_b32 s9, s13, 7
917-
; GFX12-FAKE16-NEXT: s_lshl_b32 s2, s8, s2
918-
; GFX12-FAKE16-NEXT: s_lshr_b32 s8, s12, 1
919-
; GFX12-FAKE16-NEXT: s_or_b32 s4, s5, s4
920-
; GFX12-FAKE16-NEXT: s_or_b32 s3, s7, s3
921-
; GFX12-FAKE16-NEXT: s_or_b32 s2, s8, s2
922-
; GFX12-FAKE16-NEXT: s_or_b32 s5, s9, s6
889+
; GFX12-FAKE16-NEXT: s_mov_b32 s2, s15
890+
; GFX12-FAKE16-NEXT: s_mov_b32 s3, s11
891+
; GFX12-FAKE16-NEXT: s_mov_b32 s15, s10
892+
; GFX12-FAKE16-NEXT: s_mov_b32 s4, s13
893+
; GFX12-FAKE16-NEXT: s_mov_b32 s5, s9
894+
; GFX12-FAKE16-NEXT: s_mov_b32 s13, s8
895+
; GFX12-FAKE16-NEXT: s_lshr_b64 s[2:3], s[2:3], 1
896+
; GFX12-FAKE16-NEXT: s_lshr_b64 s[6:7], s[14:15], 9
897+
; GFX12-FAKE16-NEXT: s_lshr_b64 s[8:9], s[12:13], 1
898+
; GFX12-FAKE16-NEXT: s_lshr_b64 s[4:5], s[4:5], 7
923899
; GFX12-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
924-
; GFX12-FAKE16-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s5
925-
; GFX12-FAKE16-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, s4
926-
; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, s3
900+
; GFX12-FAKE16-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s4
901+
; GFX12-FAKE16-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v3, s2
902+
; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, s6
927903
; GFX12-FAKE16-NEXT: global_store_b128 v4, v[0:3], s[0:1]
928904
; GFX12-FAKE16-NEXT: s_endpgm
929905
entry:

0 commit comments

Comments
 (0)