@@ -103,11 +103,11 @@ define amdgpu_kernel void @fshr_i32(ptr addrspace(1) %in, i32 %x, i32 %y, i32 %z
103103; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
104104; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
105105; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
106- ; GFX11-FAKE16-NEXT: s_sub_i32 s3, 32, s2
107- ; GFX11-FAKE16-NEXT: s_lshr_b32 s1, s1, s2
108- ; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, s3
106+ ; GFX11-FAKE16-NEXT: s_mov_b32 s6, s1
107+ ; GFX11-FAKE16-NEXT: s_mov_b32 s7, s0
108+ ; GFX11-FAKE16-NEXT: s_and_b32 s0, s2, 31
109109; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
110- ; GFX11-FAKE16-NEXT: s_or_b32 s0, s1, s0
110+ ; GFX11-FAKE16-NEXT: s_lshr_b64 s[0:1], s[6:7], s0
111111; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
112112; GFX11-FAKE16-NEXT: global_store_b32 v0, v1, s[4:5]
113113; GFX11-FAKE16-NEXT: s_endpgm
@@ -131,11 +131,11 @@ define amdgpu_kernel void @fshr_i32(ptr addrspace(1) %in, i32 %x, i32 %y, i32 %z
131131; GFX12-FAKE16-NEXT: s_load_b96 s[0:2], s[4:5], 0x2c
132132; GFX12-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
133133; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
134- ; GFX12-FAKE16-NEXT: s_sub_co_i32 s3, 32, s2
135- ; GFX12-FAKE16-NEXT: s_lshr_b32 s1, s1, s2
136- ; GFX12-FAKE16-NEXT: s_lshl_b32 s0, s0, s3
134+ ; GFX12-FAKE16-NEXT: s_mov_b32 s6, s1
135+ ; GFX12-FAKE16-NEXT: s_mov_b32 s7, s0
136+ ; GFX12-FAKE16-NEXT: s_and_b32 s0, s2, 31
137137; GFX12-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
138- ; GFX12-FAKE16-NEXT: s_or_b32 s0, s1, s0
138+ ; GFX12-FAKE16-NEXT: s_lshr_b64 s[0:1], s[6:7], s0
139139; GFX12-FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
140140; GFX12-FAKE16-NEXT: global_store_b32 v0, v1, s[4:5]
141141; GFX12-FAKE16-NEXT: s_endpgm
@@ -213,12 +213,11 @@ define amdgpu_kernel void @fshr_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) {
213213; GFX11-FAKE16-LABEL: fshr_i32_imm:
214214; GFX11-FAKE16: ; %bb.0: ; %entry
215215; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
216- ; GFX11-FAKE16-NEXT: s_sub_i32 s4, 32, 7
217216; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
218- ; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, s4
219- ; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s3, 7
217+ ; GFX11-FAKE16-NEXT: s_mov_b32 s4, s3
218+ ; GFX11-FAKE16-NEXT: s_mov_b32 s5, s2
220219; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
221- ; GFX11-FAKE16-NEXT: s_or_b32 s2, s3, s2
220+ ; GFX11-FAKE16-NEXT: s_lshr_b64 s[2:3], s[4:5], 7
222221; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
223222; GFX11-FAKE16-NEXT: global_store_b32 v0, v1, s[0:1]
224223; GFX11-FAKE16-NEXT: s_endpgm
@@ -235,12 +234,11 @@ define amdgpu_kernel void @fshr_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) {
235234; GFX12-FAKE16-LABEL: fshr_i32_imm:
236235; GFX12-FAKE16: ; %bb.0: ; %entry
237236; GFX12-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
238- ; GFX12-FAKE16-NEXT: s_sub_co_i32 s4, 32, 7
239237; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
240- ; GFX12-FAKE16-NEXT: s_lshl_b32 s2, s2, s4
241- ; GFX12-FAKE16-NEXT: s_lshr_b32 s3, s3, 7
238+ ; GFX12-FAKE16-NEXT: s_mov_b32 s4, s3
239+ ; GFX12-FAKE16-NEXT: s_mov_b32 s5, s2
242240; GFX12-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
243- ; GFX12-FAKE16-NEXT: s_or_b32 s2, s3, s2
241+ ; GFX12-FAKE16-NEXT: s_lshr_b64 s[2:3], s[4:5], 7
244242; GFX12-FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
245243; GFX12-FAKE16-NEXT: global_store_b32 v0, v1, s[0:1]
246244; GFX12-FAKE16-NEXT: s_endpgm
@@ -349,20 +347,19 @@ define amdgpu_kernel void @fshr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i
349347; GFX11-FAKE16-LABEL: fshr_v2i32:
350348; GFX11-FAKE16: ; %bb.0: ; %entry
351349; GFX11-FAKE16-NEXT: s_clause 0x2
352- ; GFX11-FAKE16-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c
353350; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
351+ ; GFX11-FAKE16-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c
354352; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
355353; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
356- ; GFX11-FAKE16-NEXT: s_sub_i32 s8, 32, s6
357- ; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s3, s7
358- ; GFX11-FAKE16-NEXT: s_sub_i32 s7, 32, s7
359- ; GFX11-FAKE16-NEXT: s_lshr_b32 s2, s2, s6
360- ; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, s8
361- ; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, s7
362- ; GFX11-FAKE16-NEXT: s_or_b32 s0, s2, s0
363- ; GFX11-FAKE16-NEXT: s_or_b32 s1, s3, s1
354+ ; GFX11-FAKE16-NEXT: s_mov_b32 s8, s3
355+ ; GFX11-FAKE16-NEXT: s_mov_b32 s9, s1
356+ ; GFX11-FAKE16-NEXT: s_mov_b32 s3, s0
357+ ; GFX11-FAKE16-NEXT: s_and_b32 s0, s6, 31
358+ ; GFX11-FAKE16-NEXT: s_and_b32 s6, s7, 31
359+ ; GFX11-FAKE16-NEXT: s_lshr_b64 s[0:1], s[2:3], s0
360+ ; GFX11-FAKE16-NEXT: s_lshr_b64 s[2:3], s[8:9], s6
364361; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
365- ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
362+ ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
366363; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s0
367364; GFX11-FAKE16-NEXT: global_store_b64 v2, v[0:1], s[4:5]
368365; GFX11-FAKE16-NEXT: s_endpgm
@@ -386,20 +383,19 @@ define amdgpu_kernel void @fshr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i
386383; GFX12-FAKE16-LABEL: fshr_v2i32:
387384; GFX12-FAKE16: ; %bb.0: ; %entry
388385; GFX12-FAKE16-NEXT: s_clause 0x2
389- ; GFX12-FAKE16-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c
390386; GFX12-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
387+ ; GFX12-FAKE16-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c
391388; GFX12-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
392389; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
393- ; GFX12-FAKE16-NEXT: s_sub_co_i32 s8, 32, s6
394- ; GFX12-FAKE16-NEXT: s_lshr_b32 s3, s3, s7
395- ; GFX12-FAKE16-NEXT: s_sub_co_i32 s7, 32, s7
396- ; GFX12-FAKE16-NEXT: s_lshr_b32 s2, s2, s6
397- ; GFX12-FAKE16-NEXT: s_lshl_b32 s0, s0, s8
398- ; GFX12-FAKE16-NEXT: s_lshl_b32 s1, s1, s7
399- ; GFX12-FAKE16-NEXT: s_or_b32 s0, s2, s0
400- ; GFX12-FAKE16-NEXT: s_or_b32 s1, s3, s1
390+ ; GFX12-FAKE16-NEXT: s_mov_b32 s8, s3
391+ ; GFX12-FAKE16-NEXT: s_mov_b32 s9, s1
392+ ; GFX12-FAKE16-NEXT: s_mov_b32 s3, s0
393+ ; GFX12-FAKE16-NEXT: s_and_b32 s0, s6, 31
394+ ; GFX12-FAKE16-NEXT: s_and_b32 s6, s7, 31
395+ ; GFX12-FAKE16-NEXT: s_lshr_b64 s[0:1], s[2:3], s0
396+ ; GFX12-FAKE16-NEXT: s_lshr_b64 s[2:3], s[8:9], s6
401397; GFX12-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
402- ; GFX12-FAKE16-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
398+ ; GFX12-FAKE16-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
403399; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, s0
404400; GFX12-FAKE16-NEXT: global_store_b64 v2, v[0:1], s[4:5]
405401; GFX12-FAKE16-NEXT: s_endpgm
@@ -494,17 +490,14 @@ define amdgpu_kernel void @fshr_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2
494490; GFX11-FAKE16-NEXT: s_clause 0x1
495491; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
496492; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
497- ; GFX11-FAKE16-NEXT: s_sub_i32 s6, 32, 9
498- ; GFX11-FAKE16-NEXT: s_sub_i32 s7, 32, 7
499493; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
500- ; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, s6
501- ; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, s7
502- ; GFX11-FAKE16-NEXT: s_lshr_b32 s2, s2, 7
503- ; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s3, 9
504- ; GFX11-FAKE16-NEXT: s_or_b32 s0, s2, s0
505- ; GFX11-FAKE16-NEXT: s_or_b32 s1, s3, s1
494+ ; GFX11-FAKE16-NEXT: s_mov_b32 s6, s3
495+ ; GFX11-FAKE16-NEXT: s_mov_b32 s3, s0
496+ ; GFX11-FAKE16-NEXT: s_mov_b32 s7, s1
497+ ; GFX11-FAKE16-NEXT: s_lshr_b64 s[0:1], s[2:3], 7
498+ ; GFX11-FAKE16-NEXT: s_lshr_b64 s[2:3], s[6:7], 9
506499; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
507- ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
500+ ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
508501; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s0
509502; GFX11-FAKE16-NEXT: global_store_b64 v2, v[0:1], s[4:5]
510503; GFX11-FAKE16-NEXT: s_endpgm
@@ -526,17 +519,14 @@ define amdgpu_kernel void @fshr_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2
526519; GFX12-FAKE16-NEXT: s_clause 0x1
527520; GFX12-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
528521; GFX12-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
529- ; GFX12-FAKE16-NEXT: s_sub_co_i32 s6, 32, 9
530- ; GFX12-FAKE16-NEXT: s_sub_co_i32 s7, 32, 7
531522; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
532- ; GFX12-FAKE16-NEXT: s_lshl_b32 s1, s1, s6
533- ; GFX12-FAKE16-NEXT: s_lshl_b32 s0, s0, s7
534- ; GFX12-FAKE16-NEXT: s_lshr_b32 s2, s2, 7
535- ; GFX12-FAKE16-NEXT: s_lshr_b32 s3, s3, 9
536- ; GFX12-FAKE16-NEXT: s_or_b32 s0, s2, s0
537- ; GFX12-FAKE16-NEXT: s_or_b32 s1, s3, s1
523+ ; GFX12-FAKE16-NEXT: s_mov_b32 s6, s3
524+ ; GFX12-FAKE16-NEXT: s_mov_b32 s3, s0
525+ ; GFX12-FAKE16-NEXT: s_mov_b32 s7, s1
526+ ; GFX12-FAKE16-NEXT: s_lshr_b64 s[0:1], s[2:3], 7
527+ ; GFX12-FAKE16-NEXT: s_lshr_b64 s[2:3], s[6:7], 9
538528; GFX12-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
539- ; GFX12-FAKE16-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
529+ ; GFX12-FAKE16-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
540530; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, s0
541531; GFX12-FAKE16-NEXT: global_store_b64 v2, v[0:1], s[4:5]
542532; GFX12-FAKE16-NEXT: s_endpgm
@@ -676,30 +666,28 @@ define amdgpu_kernel void @fshr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i
676666; GFX11-FAKE16-LABEL: fshr_v4i32:
677667; GFX11-FAKE16: ; %bb.0: ; %entry
678668; GFX11-FAKE16-NEXT: s_clause 0x2
679- ; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x54
680669; GFX11-FAKE16-NEXT: s_load_b256 s[8:15], s[4:5], 0x34
670+ ; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x54
681671; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
682672; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
683- ; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s15, s3
684- ; GFX11-FAKE16-NEXT: s_sub_i32 s3, 32, s3
685- ; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s14, s2
686- ; GFX11-FAKE16-NEXT: s_sub_i32 s2, 32, s2
687- ; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s13, s1
688- ; GFX11-FAKE16-NEXT: s_sub_i32 s1, 32, s1
689- ; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s12, s0
690- ; GFX11-FAKE16-NEXT: s_sub_i32 s0, 32, s0
691- ; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s11, s3
692- ; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s10, s2
693- ; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s9, s1
694- ; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s8, s0
695- ; GFX11-FAKE16-NEXT: s_or_b32 s3, s6, s3
696- ; GFX11-FAKE16-NEXT: s_or_b32 s2, s7, s2
697- ; GFX11-FAKE16-NEXT: s_or_b32 s0, s12, s0
698- ; GFX11-FAKE16-NEXT: s_or_b32 s1, s13, s1
673+ ; GFX11-FAKE16-NEXT: s_mov_b32 s6, s15
674+ ; GFX11-FAKE16-NEXT: s_mov_b32 s7, s11
675+ ; GFX11-FAKE16-NEXT: s_and_b32 s11, s3, 31
676+ ; GFX11-FAKE16-NEXT: s_mov_b32 s15, s10
677+ ; GFX11-FAKE16-NEXT: s_and_b32 s10, s2, 31
678+ ; GFX11-FAKE16-NEXT: s_mov_b32 s2, s13
679+ ; GFX11-FAKE16-NEXT: s_mov_b32 s3, s9
680+ ; GFX11-FAKE16-NEXT: s_and_b32 s16, s1, 31
681+ ; GFX11-FAKE16-NEXT: s_mov_b32 s13, s8
682+ ; GFX11-FAKE16-NEXT: s_and_b32 s8, s0, 31
683+ ; GFX11-FAKE16-NEXT: s_lshr_b64 s[0:1], s[6:7], s11
684+ ; GFX11-FAKE16-NEXT: s_lshr_b64 s[6:7], s[14:15], s10
685+ ; GFX11-FAKE16-NEXT: s_lshr_b64 s[8:9], s[12:13], s8
686+ ; GFX11-FAKE16-NEXT: s_lshr_b64 s[2:3], s[2:3], s16
699687; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
700- ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s1
701- ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
702- ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s2
688+ ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s2
689+ ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v3, s0
690+ ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s6
703691; GFX11-FAKE16-NEXT: global_store_b128 v4, v[0:3], s[4:5]
704692; GFX11-FAKE16-NEXT: s_endpgm
705693;
@@ -727,30 +715,28 @@ define amdgpu_kernel void @fshr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i
727715; GFX12-FAKE16-LABEL: fshr_v4i32:
728716; GFX12-FAKE16: ; %bb.0: ; %entry
729717; GFX12-FAKE16-NEXT: s_clause 0x2
730- ; GFX12-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x54
731718; GFX12-FAKE16-NEXT: s_load_b256 s[8:15], s[4:5], 0x34
719+ ; GFX12-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x54
732720; GFX12-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
733721; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
734- ; GFX12-FAKE16-NEXT: s_lshr_b32 s6, s15, s3
735- ; GFX12-FAKE16-NEXT: s_sub_co_i32 s3, 32, s3
736- ; GFX12-FAKE16-NEXT: s_lshr_b32 s7, s14, s2
737- ; GFX12-FAKE16-NEXT: s_sub_co_i32 s2, 32, s2
738- ; GFX12-FAKE16-NEXT: s_lshr_b32 s13, s13, s1
739- ; GFX12-FAKE16-NEXT: s_sub_co_i32 s1, 32, s1
740- ; GFX12-FAKE16-NEXT: s_lshr_b32 s12, s12, s0
741- ; GFX12-FAKE16-NEXT: s_sub_co_i32 s0, 32, s0
742- ; GFX12-FAKE16-NEXT: s_lshl_b32 s3, s11, s3
743- ; GFX12-FAKE16-NEXT: s_lshl_b32 s2, s10, s2
744- ; GFX12-FAKE16-NEXT: s_lshl_b32 s1, s9, s1
745- ; GFX12-FAKE16-NEXT: s_lshl_b32 s0, s8, s0
746- ; GFX12-FAKE16-NEXT: s_or_b32 s3, s6, s3
747- ; GFX12-FAKE16-NEXT: s_or_b32 s2, s7, s2
748- ; GFX12-FAKE16-NEXT: s_or_b32 s0, s12, s0
749- ; GFX12-FAKE16-NEXT: s_or_b32 s1, s13, s1
722+ ; GFX12-FAKE16-NEXT: s_mov_b32 s6, s15
723+ ; GFX12-FAKE16-NEXT: s_mov_b32 s7, s11
724+ ; GFX12-FAKE16-NEXT: s_and_b32 s11, s3, 31
725+ ; GFX12-FAKE16-NEXT: s_mov_b32 s15, s10
726+ ; GFX12-FAKE16-NEXT: s_and_b32 s10, s2, 31
727+ ; GFX12-FAKE16-NEXT: s_mov_b32 s2, s13
728+ ; GFX12-FAKE16-NEXT: s_mov_b32 s3, s9
729+ ; GFX12-FAKE16-NEXT: s_and_b32 s16, s1, 31
730+ ; GFX12-FAKE16-NEXT: s_mov_b32 s13, s8
731+ ; GFX12-FAKE16-NEXT: s_and_b32 s8, s0, 31
732+ ; GFX12-FAKE16-NEXT: s_lshr_b64 s[0:1], s[6:7], s11
733+ ; GFX12-FAKE16-NEXT: s_lshr_b64 s[6:7], s[14:15], s10
734+ ; GFX12-FAKE16-NEXT: s_lshr_b64 s[8:9], s[12:13], s8
735+ ; GFX12-FAKE16-NEXT: s_lshr_b64 s[2:3], s[2:3], s16
750736; GFX12-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
751- ; GFX12-FAKE16-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s1
752- ; GFX12-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
753- ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, s2
737+ ; GFX12-FAKE16-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s2
738+ ; GFX12-FAKE16-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v3, s0
739+ ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, s6
754740; GFX12-FAKE16-NEXT: global_store_b128 v4, v[0:3], s[4:5]
755741; GFX12-FAKE16-NEXT: s_endpgm
756742entry:
@@ -862,26 +848,21 @@ define amdgpu_kernel void @fshr_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4
862848; GFX11-FAKE16-NEXT: s_clause 0x1
863849; GFX11-FAKE16-NEXT: s_load_b256 s[8:15], s[4:5], 0x34
864850; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
865- ; GFX11-FAKE16-NEXT: s_sub_i32 s2, 32, 1
866- ; GFX11-FAKE16-NEXT: s_sub_i32 s3, 32, 9
867- ; GFX11-FAKE16-NEXT: s_sub_i32 s6, 32, 7
868851; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
869- ; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s11, s2
870- ; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s15, 1
871- ; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s10, s3
872- ; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s14, 9
873- ; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s9, s6
874- ; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s13, 7
875- ; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s8, s2
876- ; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s12, 1
877- ; GFX11-FAKE16-NEXT: s_or_b32 s4, s5, s4
878- ; GFX11-FAKE16-NEXT: s_or_b32 s3, s7, s3
879- ; GFX11-FAKE16-NEXT: s_or_b32 s2, s8, s2
880- ; GFX11-FAKE16-NEXT: s_or_b32 s5, s9, s6
852+ ; GFX11-FAKE16-NEXT: s_mov_b32 s2, s15
853+ ; GFX11-FAKE16-NEXT: s_mov_b32 s3, s11
854+ ; GFX11-FAKE16-NEXT: s_mov_b32 s15, s10
855+ ; GFX11-FAKE16-NEXT: s_mov_b32 s4, s13
856+ ; GFX11-FAKE16-NEXT: s_mov_b32 s5, s9
857+ ; GFX11-FAKE16-NEXT: s_mov_b32 s13, s8
858+ ; GFX11-FAKE16-NEXT: s_lshr_b64 s[2:3], s[2:3], 1
859+ ; GFX11-FAKE16-NEXT: s_lshr_b64 s[6:7], s[14:15], 9
860+ ; GFX11-FAKE16-NEXT: s_lshr_b64 s[8:9], s[12:13], 1
861+ ; GFX11-FAKE16-NEXT: s_lshr_b64 s[4:5], s[4:5], 7
881862; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
882- ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s5
883- ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, s4
884- ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s3
863+ ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s4
864+ ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v3, s2
865+ ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s6
885866; GFX11-FAKE16-NEXT: global_store_b128 v4, v[0:3], s[0:1]
886867; GFX11-FAKE16-NEXT: s_endpgm
887868;
@@ -904,26 +885,21 @@ define amdgpu_kernel void @fshr_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4
904885; GFX12-FAKE16-NEXT: s_clause 0x1
905886; GFX12-FAKE16-NEXT: s_load_b256 s[8:15], s[4:5], 0x34
906887; GFX12-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
907- ; GFX12-FAKE16-NEXT: s_sub_co_i32 s2, 32, 1
908- ; GFX12-FAKE16-NEXT: s_sub_co_i32 s3, 32, 9
909- ; GFX12-FAKE16-NEXT: s_sub_co_i32 s6, 32, 7
910888; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
911- ; GFX12-FAKE16-NEXT: s_lshl_b32 s4, s11, s2
912- ; GFX12-FAKE16-NEXT: s_lshr_b32 s5, s15, 1
913- ; GFX12-FAKE16-NEXT: s_lshl_b32 s3, s10, s3
914- ; GFX12-FAKE16-NEXT: s_lshr_b32 s7, s14, 9
915- ; GFX12-FAKE16-NEXT: s_lshl_b32 s6, s9, s6
916- ; GFX12-FAKE16-NEXT: s_lshr_b32 s9, s13, 7
917- ; GFX12-FAKE16-NEXT: s_lshl_b32 s2, s8, s2
918- ; GFX12-FAKE16-NEXT: s_lshr_b32 s8, s12, 1
919- ; GFX12-FAKE16-NEXT: s_or_b32 s4, s5, s4
920- ; GFX12-FAKE16-NEXT: s_or_b32 s3, s7, s3
921- ; GFX12-FAKE16-NEXT: s_or_b32 s2, s8, s2
922- ; GFX12-FAKE16-NEXT: s_or_b32 s5, s9, s6
889+ ; GFX12-FAKE16-NEXT: s_mov_b32 s2, s15
890+ ; GFX12-FAKE16-NEXT: s_mov_b32 s3, s11
891+ ; GFX12-FAKE16-NEXT: s_mov_b32 s15, s10
892+ ; GFX12-FAKE16-NEXT: s_mov_b32 s4, s13
893+ ; GFX12-FAKE16-NEXT: s_mov_b32 s5, s9
894+ ; GFX12-FAKE16-NEXT: s_mov_b32 s13, s8
895+ ; GFX12-FAKE16-NEXT: s_lshr_b64 s[2:3], s[2:3], 1
896+ ; GFX12-FAKE16-NEXT: s_lshr_b64 s[6:7], s[14:15], 9
897+ ; GFX12-FAKE16-NEXT: s_lshr_b64 s[8:9], s[12:13], 1
898+ ; GFX12-FAKE16-NEXT: s_lshr_b64 s[4:5], s[4:5], 7
923899; GFX12-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
924- ; GFX12-FAKE16-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s5
925- ; GFX12-FAKE16-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, s4
926- ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, s3
900+ ; GFX12-FAKE16-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s4
901+ ; GFX12-FAKE16-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v3, s2
902+ ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, s6
927903; GFX12-FAKE16-NEXT: global_store_b128 v4, v[0:3], s[0:1]
928904; GFX12-FAKE16-NEXT: s_endpgm
929905entry:
0 commit comments