@@ -103,11 +103,11 @@ define amdgpu_kernel void @fshr_i32(ptr addrspace(1) %in, i32 %x, i32 %y, i32 %z
103103; GFX11-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x2c 
104104; GFX11-FAKE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x24 
105105; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0) 
106- ; GFX11-FAKE16-NEXT:    s_sub_i32 s3, 32, s2  
107- ; GFX11-FAKE16-NEXT:    s_lshr_b32 s1, s1, s2  
108- ; GFX11-FAKE16-NEXT:    s_lshl_b32  s0, s0, s3  
106+ ; GFX11-FAKE16-NEXT:    s_mov_b32 s6, s1  
107+ ; GFX11-FAKE16-NEXT:    s_mov_b32 s7, s0  
108+ ; GFX11-FAKE16-NEXT:    s_and_b32  s0, s2, 31  
109109; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) 
110- ; GFX11-FAKE16-NEXT:    s_or_b32 s0, s1 , s0 
110+ ; GFX11-FAKE16-NEXT:    s_lshr_b64 s[0:1], s[6:7] , s0 
111111; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 
112112; GFX11-FAKE16-NEXT:    global_store_b32 v0, v1, s[4:5] 
113113; GFX11-FAKE16-NEXT:    s_endpgm 
@@ -131,11 +131,11 @@ define amdgpu_kernel void @fshr_i32(ptr addrspace(1) %in, i32 %x, i32 %y, i32 %z
131131; GFX12-FAKE16-NEXT:    s_load_b96 s[0:2], s[4:5], 0x2c 
132132; GFX12-FAKE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x24 
133133; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0 
134- ; GFX12-FAKE16-NEXT:    s_sub_co_i32 s3, 32, s2  
135- ; GFX12-FAKE16-NEXT:    s_lshr_b32 s1, s1, s2  
136- ; GFX12-FAKE16-NEXT:    s_lshl_b32  s0, s0, s3  
134+ ; GFX12-FAKE16-NEXT:    s_mov_b32 s6, s1  
135+ ; GFX12-FAKE16-NEXT:    s_mov_b32 s7, s0  
136+ ; GFX12-FAKE16-NEXT:    s_and_b32  s0, s2, 31  
137137; GFX12-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) 
138- ; GFX12-FAKE16-NEXT:    s_or_b32 s0, s1 , s0 
138+ ; GFX12-FAKE16-NEXT:    s_lshr_b64 s[0:1], s[6:7] , s0 
139139; GFX12-FAKE16-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 
140140; GFX12-FAKE16-NEXT:    global_store_b32 v0, v1, s[4:5] 
141141; GFX12-FAKE16-NEXT:    s_endpgm 
@@ -213,12 +213,11 @@ define amdgpu_kernel void @fshr_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) {
213213; GFX11-FAKE16-LABEL: fshr_i32_imm: 
214214; GFX11-FAKE16:       ; %bb.0: ; %entry 
215215; GFX11-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24 
216- ; GFX11-FAKE16-NEXT:    s_sub_i32 s4, 32, 7 
217216; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0) 
218- ; GFX11-FAKE16-NEXT:    s_lshl_b32 s2, s2, s4  
219- ; GFX11-FAKE16-NEXT:    s_lshr_b32 s3, s3, 7  
217+ ; GFX11-FAKE16-NEXT:    s_mov_b32 s4, s3  
218+ ; GFX11-FAKE16-NEXT:    s_mov_b32 s5, s2  
220219; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) 
221- ; GFX11-FAKE16-NEXT:    s_or_b32 s2, s3, s2  
220+ ; GFX11-FAKE16-NEXT:    s_lshr_b64 s[2:3], s[4:5], 7  
222221; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 
223222; GFX11-FAKE16-NEXT:    global_store_b32 v0, v1, s[0:1] 
224223; GFX11-FAKE16-NEXT:    s_endpgm 
@@ -235,12 +234,11 @@ define amdgpu_kernel void @fshr_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) {
235234; GFX12-FAKE16-LABEL: fshr_i32_imm: 
236235; GFX12-FAKE16:       ; %bb.0: ; %entry 
237236; GFX12-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24 
238- ; GFX12-FAKE16-NEXT:    s_sub_co_i32 s4, 32, 7 
239237; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0 
240- ; GFX12-FAKE16-NEXT:    s_lshl_b32 s2, s2, s4  
241- ; GFX12-FAKE16-NEXT:    s_lshr_b32 s3, s3, 7  
238+ ; GFX12-FAKE16-NEXT:    s_mov_b32 s4, s3  
239+ ; GFX12-FAKE16-NEXT:    s_mov_b32 s5, s2  
242240; GFX12-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) 
243- ; GFX12-FAKE16-NEXT:    s_or_b32 s2, s3, s2  
241+ ; GFX12-FAKE16-NEXT:    s_lshr_b64 s[2:3], s[4:5], 7  
244242; GFX12-FAKE16-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 
245243; GFX12-FAKE16-NEXT:    global_store_b32 v0, v1, s[0:1] 
246244; GFX12-FAKE16-NEXT:    s_endpgm 
@@ -349,20 +347,19 @@ define amdgpu_kernel void @fshr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i
349347; GFX11-FAKE16-LABEL: fshr_v2i32: 
350348; GFX11-FAKE16:       ; %bb.0: ; %entry 
351349; GFX11-FAKE16-NEXT:    s_clause 0x2 
352- ; GFX11-FAKE16-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c 
353350; GFX11-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x2c 
351+ ; GFX11-FAKE16-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c 
354352; GFX11-FAKE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x24 
355353; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0) 
356- ; GFX11-FAKE16-NEXT:    s_sub_i32 s8, 32, s6 
357- ; GFX11-FAKE16-NEXT:    s_lshr_b32 s3, s3, s7 
358- ; GFX11-FAKE16-NEXT:    s_sub_i32 s7, 32, s7 
359- ; GFX11-FAKE16-NEXT:    s_lshr_b32 s2, s2, s6 
360- ; GFX11-FAKE16-NEXT:    s_lshl_b32 s0, s0, s8 
361- ; GFX11-FAKE16-NEXT:    s_lshl_b32 s1, s1, s7 
362- ; GFX11-FAKE16-NEXT:    s_or_b32 s0, s2, s0 
363- ; GFX11-FAKE16-NEXT:    s_or_b32 s1, s3, s1 
354+ ; GFX11-FAKE16-NEXT:    s_mov_b32 s8, s3 
355+ ; GFX11-FAKE16-NEXT:    s_mov_b32 s9, s1 
356+ ; GFX11-FAKE16-NEXT:    s_mov_b32 s3, s0 
357+ ; GFX11-FAKE16-NEXT:    s_and_b32 s0, s6, 31 
358+ ; GFX11-FAKE16-NEXT:    s_and_b32 s6, s7, 31 
359+ ; GFX11-FAKE16-NEXT:    s_lshr_b64 s[0:1], s[2:3], s0 
360+ ; GFX11-FAKE16-NEXT:    s_lshr_b64 s[2:3], s[8:9], s6 
364361; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) 
365- ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1  
362+ ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2  
366363; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, s0 
367364; GFX11-FAKE16-NEXT:    global_store_b64 v2, v[0:1], s[4:5] 
368365; GFX11-FAKE16-NEXT:    s_endpgm 
@@ -386,20 +383,19 @@ define amdgpu_kernel void @fshr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i
386383; GFX12-FAKE16-LABEL: fshr_v2i32: 
387384; GFX12-FAKE16:       ; %bb.0: ; %entry 
388385; GFX12-FAKE16-NEXT:    s_clause 0x2 
389- ; GFX12-FAKE16-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c 
390386; GFX12-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x2c 
387+ ; GFX12-FAKE16-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c 
391388; GFX12-FAKE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x24 
392389; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0 
393- ; GFX12-FAKE16-NEXT:    s_sub_co_i32 s8, 32, s6 
394- ; GFX12-FAKE16-NEXT:    s_lshr_b32 s3, s3, s7 
395- ; GFX12-FAKE16-NEXT:    s_sub_co_i32 s7, 32, s7 
396- ; GFX12-FAKE16-NEXT:    s_lshr_b32 s2, s2, s6 
397- ; GFX12-FAKE16-NEXT:    s_lshl_b32 s0, s0, s8 
398- ; GFX12-FAKE16-NEXT:    s_lshl_b32 s1, s1, s7 
399- ; GFX12-FAKE16-NEXT:    s_or_b32 s0, s2, s0 
400- ; GFX12-FAKE16-NEXT:    s_or_b32 s1, s3, s1 
390+ ; GFX12-FAKE16-NEXT:    s_mov_b32 s8, s3 
391+ ; GFX12-FAKE16-NEXT:    s_mov_b32 s9, s1 
392+ ; GFX12-FAKE16-NEXT:    s_mov_b32 s3, s0 
393+ ; GFX12-FAKE16-NEXT:    s_and_b32 s0, s6, 31 
394+ ; GFX12-FAKE16-NEXT:    s_and_b32 s6, s7, 31 
395+ ; GFX12-FAKE16-NEXT:    s_lshr_b64 s[0:1], s[2:3], s0 
396+ ; GFX12-FAKE16-NEXT:    s_lshr_b64 s[2:3], s[8:9], s6 
401397; GFX12-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) 
402- ; GFX12-FAKE16-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1  
398+ ; GFX12-FAKE16-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2  
403399; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v0, s0 
404400; GFX12-FAKE16-NEXT:    global_store_b64 v2, v[0:1], s[4:5] 
405401; GFX12-FAKE16-NEXT:    s_endpgm 
@@ -494,17 +490,14 @@ define amdgpu_kernel void @fshr_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2
494490; GFX11-FAKE16-NEXT:    s_clause 0x1 
495491; GFX11-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x2c 
496492; GFX11-FAKE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x24 
497- ; GFX11-FAKE16-NEXT:    s_sub_i32 s6, 32, 9 
498- ; GFX11-FAKE16-NEXT:    s_sub_i32 s7, 32, 7 
499493; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0) 
500- ; GFX11-FAKE16-NEXT:    s_lshl_b32 s1, s1, s6 
501- ; GFX11-FAKE16-NEXT:    s_lshl_b32 s0, s0, s7 
502- ; GFX11-FAKE16-NEXT:    s_lshr_b32 s2, s2, 7 
503- ; GFX11-FAKE16-NEXT:    s_lshr_b32 s3, s3, 9 
504- ; GFX11-FAKE16-NEXT:    s_or_b32 s0, s2, s0 
505- ; GFX11-FAKE16-NEXT:    s_or_b32 s1, s3, s1 
494+ ; GFX11-FAKE16-NEXT:    s_mov_b32 s6, s3 
495+ ; GFX11-FAKE16-NEXT:    s_mov_b32 s3, s0 
496+ ; GFX11-FAKE16-NEXT:    s_mov_b32 s7, s1 
497+ ; GFX11-FAKE16-NEXT:    s_lshr_b64 s[0:1], s[2:3], 7 
498+ ; GFX11-FAKE16-NEXT:    s_lshr_b64 s[2:3], s[6:7], 9 
506499; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) 
507- ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1  
500+ ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2  
508501; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, s0 
509502; GFX11-FAKE16-NEXT:    global_store_b64 v2, v[0:1], s[4:5] 
510503; GFX11-FAKE16-NEXT:    s_endpgm 
@@ -526,17 +519,14 @@ define amdgpu_kernel void @fshr_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2
526519; GFX12-FAKE16-NEXT:    s_clause 0x1 
527520; GFX12-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x2c 
528521; GFX12-FAKE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x24 
529- ; GFX12-FAKE16-NEXT:    s_sub_co_i32 s6, 32, 9 
530- ; GFX12-FAKE16-NEXT:    s_sub_co_i32 s7, 32, 7 
531522; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0 
532- ; GFX12-FAKE16-NEXT:    s_lshl_b32 s1, s1, s6 
533- ; GFX12-FAKE16-NEXT:    s_lshl_b32 s0, s0, s7 
534- ; GFX12-FAKE16-NEXT:    s_lshr_b32 s2, s2, 7 
535- ; GFX12-FAKE16-NEXT:    s_lshr_b32 s3, s3, 9 
536- ; GFX12-FAKE16-NEXT:    s_or_b32 s0, s2, s0 
537- ; GFX12-FAKE16-NEXT:    s_or_b32 s1, s3, s1 
523+ ; GFX12-FAKE16-NEXT:    s_mov_b32 s6, s3 
524+ ; GFX12-FAKE16-NEXT:    s_mov_b32 s3, s0 
525+ ; GFX12-FAKE16-NEXT:    s_mov_b32 s7, s1 
526+ ; GFX12-FAKE16-NEXT:    s_lshr_b64 s[0:1], s[2:3], 7 
527+ ; GFX12-FAKE16-NEXT:    s_lshr_b64 s[2:3], s[6:7], 9 
538528; GFX12-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) 
539- ; GFX12-FAKE16-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1  
529+ ; GFX12-FAKE16-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2  
540530; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v0, s0 
541531; GFX12-FAKE16-NEXT:    global_store_b64 v2, v[0:1], s[4:5] 
542532; GFX12-FAKE16-NEXT:    s_endpgm 
@@ -676,30 +666,28 @@ define amdgpu_kernel void @fshr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i
676666; GFX11-FAKE16-LABEL: fshr_v4i32: 
677667; GFX11-FAKE16:       ; %bb.0: ; %entry 
678668; GFX11-FAKE16-NEXT:    s_clause 0x2 
679- ; GFX11-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x54 
680669; GFX11-FAKE16-NEXT:    s_load_b256 s[8:15], s[4:5], 0x34 
670+ ; GFX11-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x54 
681671; GFX11-FAKE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x24 
682672; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0) 
683- ; GFX11-FAKE16-NEXT:    s_lshr_b32 s6, s15, s3 
684- ; GFX11-FAKE16-NEXT:    s_sub_i32 s3, 32, s3 
685- ; GFX11-FAKE16-NEXT:    s_lshr_b32 s7, s14, s2 
686- ; GFX11-FAKE16-NEXT:    s_sub_i32 s2, 32, s2 
687- ; GFX11-FAKE16-NEXT:    s_lshr_b32 s13, s13, s1 
688- ; GFX11-FAKE16-NEXT:    s_sub_i32 s1, 32, s1 
689- ; GFX11-FAKE16-NEXT:    s_lshr_b32 s12, s12, s0 
690- ; GFX11-FAKE16-NEXT:    s_sub_i32 s0, 32, s0 
691- ; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s11, s3 
692- ; GFX11-FAKE16-NEXT:    s_lshl_b32 s2, s10, s2 
693- ; GFX11-FAKE16-NEXT:    s_lshl_b32 s1, s9, s1 
694- ; GFX11-FAKE16-NEXT:    s_lshl_b32 s0, s8, s0 
695- ; GFX11-FAKE16-NEXT:    s_or_b32 s3, s6, s3 
696- ; GFX11-FAKE16-NEXT:    s_or_b32 s2, s7, s2 
697- ; GFX11-FAKE16-NEXT:    s_or_b32 s0, s12, s0 
698- ; GFX11-FAKE16-NEXT:    s_or_b32 s1, s13, s1 
673+ ; GFX11-FAKE16-NEXT:    s_mov_b32 s6, s15 
674+ ; GFX11-FAKE16-NEXT:    s_mov_b32 s7, s11 
675+ ; GFX11-FAKE16-NEXT:    s_and_b32 s11, s3, 31 
676+ ; GFX11-FAKE16-NEXT:    s_mov_b32 s15, s10 
677+ ; GFX11-FAKE16-NEXT:    s_and_b32 s10, s2, 31 
678+ ; GFX11-FAKE16-NEXT:    s_mov_b32 s2, s13 
679+ ; GFX11-FAKE16-NEXT:    s_mov_b32 s3, s9 
680+ ; GFX11-FAKE16-NEXT:    s_and_b32 s16, s1, 31 
681+ ; GFX11-FAKE16-NEXT:    s_mov_b32 s13, s8 
682+ ; GFX11-FAKE16-NEXT:    s_and_b32 s8, s0, 31 
683+ ; GFX11-FAKE16-NEXT:    s_lshr_b64 s[0:1], s[6:7], s11 
684+ ; GFX11-FAKE16-NEXT:    s_lshr_b64 s[6:7], s[14:15], s10 
685+ ; GFX11-FAKE16-NEXT:    s_lshr_b64 s[8:9], s[12:13], s8 
686+ ; GFX11-FAKE16-NEXT:    s_lshr_b64 s[2:3], s[2:3], s16 
699687; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) 
700- ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s1  
701- ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v0, s0  :: v_dual_mov_b32 v3, s3  
702- ; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v2, s2  
688+ ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s2  
689+ ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v0, s8  :: v_dual_mov_b32 v3, s0  
690+ ; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v2, s6  
703691; GFX11-FAKE16-NEXT:    global_store_b128 v4, v[0:3], s[4:5] 
704692; GFX11-FAKE16-NEXT:    s_endpgm 
705693; 
@@ -727,30 +715,28 @@ define amdgpu_kernel void @fshr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i
727715; GFX12-FAKE16-LABEL: fshr_v4i32: 
728716; GFX12-FAKE16:       ; %bb.0: ; %entry 
729717; GFX12-FAKE16-NEXT:    s_clause 0x2 
730- ; GFX12-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x54 
731718; GFX12-FAKE16-NEXT:    s_load_b256 s[8:15], s[4:5], 0x34 
719+ ; GFX12-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x54 
732720; GFX12-FAKE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x24 
733721; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0 
734- ; GFX12-FAKE16-NEXT:    s_lshr_b32 s6, s15, s3 
735- ; GFX12-FAKE16-NEXT:    s_sub_co_i32 s3, 32, s3 
736- ; GFX12-FAKE16-NEXT:    s_lshr_b32 s7, s14, s2 
737- ; GFX12-FAKE16-NEXT:    s_sub_co_i32 s2, 32, s2 
738- ; GFX12-FAKE16-NEXT:    s_lshr_b32 s13, s13, s1 
739- ; GFX12-FAKE16-NEXT:    s_sub_co_i32 s1, 32, s1 
740- ; GFX12-FAKE16-NEXT:    s_lshr_b32 s12, s12, s0 
741- ; GFX12-FAKE16-NEXT:    s_sub_co_i32 s0, 32, s0 
742- ; GFX12-FAKE16-NEXT:    s_lshl_b32 s3, s11, s3 
743- ; GFX12-FAKE16-NEXT:    s_lshl_b32 s2, s10, s2 
744- ; GFX12-FAKE16-NEXT:    s_lshl_b32 s1, s9, s1 
745- ; GFX12-FAKE16-NEXT:    s_lshl_b32 s0, s8, s0 
746- ; GFX12-FAKE16-NEXT:    s_or_b32 s3, s6, s3 
747- ; GFX12-FAKE16-NEXT:    s_or_b32 s2, s7, s2 
748- ; GFX12-FAKE16-NEXT:    s_or_b32 s0, s12, s0 
749- ; GFX12-FAKE16-NEXT:    s_or_b32 s1, s13, s1 
722+ ; GFX12-FAKE16-NEXT:    s_mov_b32 s6, s15 
723+ ; GFX12-FAKE16-NEXT:    s_mov_b32 s7, s11 
724+ ; GFX12-FAKE16-NEXT:    s_and_b32 s11, s3, 31 
725+ ; GFX12-FAKE16-NEXT:    s_mov_b32 s15, s10 
726+ ; GFX12-FAKE16-NEXT:    s_and_b32 s10, s2, 31 
727+ ; GFX12-FAKE16-NEXT:    s_mov_b32 s2, s13 
728+ ; GFX12-FAKE16-NEXT:    s_mov_b32 s3, s9 
729+ ; GFX12-FAKE16-NEXT:    s_and_b32 s16, s1, 31 
730+ ; GFX12-FAKE16-NEXT:    s_mov_b32 s13, s8 
731+ ; GFX12-FAKE16-NEXT:    s_and_b32 s8, s0, 31 
732+ ; GFX12-FAKE16-NEXT:    s_lshr_b64 s[0:1], s[6:7], s11 
733+ ; GFX12-FAKE16-NEXT:    s_lshr_b64 s[6:7], s[14:15], s10 
734+ ; GFX12-FAKE16-NEXT:    s_lshr_b64 s[8:9], s[12:13], s8 
735+ ; GFX12-FAKE16-NEXT:    s_lshr_b64 s[2:3], s[2:3], s16 
750736; GFX12-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) 
751- ; GFX12-FAKE16-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s1  
752- ; GFX12-FAKE16-NEXT:    v_dual_mov_b32 v0, s0  :: v_dual_mov_b32 v3, s3  
753- ; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v2, s2  
737+ ; GFX12-FAKE16-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s2  
738+ ; GFX12-FAKE16-NEXT:    v_dual_mov_b32 v0, s8  :: v_dual_mov_b32 v3, s0  
739+ ; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v2, s6  
754740; GFX12-FAKE16-NEXT:    global_store_b128 v4, v[0:3], s[4:5] 
755741; GFX12-FAKE16-NEXT:    s_endpgm 
756742entry:
@@ -862,26 +848,21 @@ define amdgpu_kernel void @fshr_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4
862848; GFX11-FAKE16-NEXT:    s_clause 0x1 
863849; GFX11-FAKE16-NEXT:    s_load_b256 s[8:15], s[4:5], 0x34 
864850; GFX11-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24 
865- ; GFX11-FAKE16-NEXT:    s_sub_i32 s2, 32, 1 
866- ; GFX11-FAKE16-NEXT:    s_sub_i32 s3, 32, 9 
867- ; GFX11-FAKE16-NEXT:    s_sub_i32 s6, 32, 7 
868851; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0) 
869- ; GFX11-FAKE16-NEXT:    s_lshl_b32 s4, s11, s2 
870- ; GFX11-FAKE16-NEXT:    s_lshr_b32 s5, s15, 1 
871- ; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s10, s3 
872- ; GFX11-FAKE16-NEXT:    s_lshr_b32 s7, s14, 9 
873- ; GFX11-FAKE16-NEXT:    s_lshl_b32 s6, s9, s6 
874- ; GFX11-FAKE16-NEXT:    s_lshr_b32 s9, s13, 7 
875- ; GFX11-FAKE16-NEXT:    s_lshl_b32 s2, s8, s2 
876- ; GFX11-FAKE16-NEXT:    s_lshr_b32 s8, s12, 1 
877- ; GFX11-FAKE16-NEXT:    s_or_b32 s4, s5, s4 
878- ; GFX11-FAKE16-NEXT:    s_or_b32 s3, s7, s3 
879- ; GFX11-FAKE16-NEXT:    s_or_b32 s2, s8, s2 
880- ; GFX11-FAKE16-NEXT:    s_or_b32 s5, s9, s6 
852+ ; GFX11-FAKE16-NEXT:    s_mov_b32 s2, s15 
853+ ; GFX11-FAKE16-NEXT:    s_mov_b32 s3, s11 
854+ ; GFX11-FAKE16-NEXT:    s_mov_b32 s15, s10 
855+ ; GFX11-FAKE16-NEXT:    s_mov_b32 s4, s13 
856+ ; GFX11-FAKE16-NEXT:    s_mov_b32 s5, s9 
857+ ; GFX11-FAKE16-NEXT:    s_mov_b32 s13, s8 
858+ ; GFX11-FAKE16-NEXT:    s_lshr_b64 s[2:3], s[2:3], 1 
859+ ; GFX11-FAKE16-NEXT:    s_lshr_b64 s[6:7], s[14:15], 9 
860+ ; GFX11-FAKE16-NEXT:    s_lshr_b64 s[8:9], s[12:13], 1 
861+ ; GFX11-FAKE16-NEXT:    s_lshr_b64 s[4:5], s[4:5], 7 
881862; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) 
882- ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s5  
883- ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v0, s2  :: v_dual_mov_b32 v3, s4  
884- ; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v2, s3  
863+ ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s4  
864+ ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v0, s8  :: v_dual_mov_b32 v3, s2  
865+ ; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v2, s6  
885866; GFX11-FAKE16-NEXT:    global_store_b128 v4, v[0:3], s[0:1] 
886867; GFX11-FAKE16-NEXT:    s_endpgm 
887868; 
@@ -904,26 +885,21 @@ define amdgpu_kernel void @fshr_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4
904885; GFX12-FAKE16-NEXT:    s_clause 0x1 
905886; GFX12-FAKE16-NEXT:    s_load_b256 s[8:15], s[4:5], 0x34 
906887; GFX12-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24 
907- ; GFX12-FAKE16-NEXT:    s_sub_co_i32 s2, 32, 1 
908- ; GFX12-FAKE16-NEXT:    s_sub_co_i32 s3, 32, 9 
909- ; GFX12-FAKE16-NEXT:    s_sub_co_i32 s6, 32, 7 
910888; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0 
911- ; GFX12-FAKE16-NEXT:    s_lshl_b32 s4, s11, s2 
912- ; GFX12-FAKE16-NEXT:    s_lshr_b32 s5, s15, 1 
913- ; GFX12-FAKE16-NEXT:    s_lshl_b32 s3, s10, s3 
914- ; GFX12-FAKE16-NEXT:    s_lshr_b32 s7, s14, 9 
915- ; GFX12-FAKE16-NEXT:    s_lshl_b32 s6, s9, s6 
916- ; GFX12-FAKE16-NEXT:    s_lshr_b32 s9, s13, 7 
917- ; GFX12-FAKE16-NEXT:    s_lshl_b32 s2, s8, s2 
918- ; GFX12-FAKE16-NEXT:    s_lshr_b32 s8, s12, 1 
919- ; GFX12-FAKE16-NEXT:    s_or_b32 s4, s5, s4 
920- ; GFX12-FAKE16-NEXT:    s_or_b32 s3, s7, s3 
921- ; GFX12-FAKE16-NEXT:    s_or_b32 s2, s8, s2 
922- ; GFX12-FAKE16-NEXT:    s_or_b32 s5, s9, s6 
889+ ; GFX12-FAKE16-NEXT:    s_mov_b32 s2, s15 
890+ ; GFX12-FAKE16-NEXT:    s_mov_b32 s3, s11 
891+ ; GFX12-FAKE16-NEXT:    s_mov_b32 s15, s10 
892+ ; GFX12-FAKE16-NEXT:    s_mov_b32 s4, s13 
893+ ; GFX12-FAKE16-NEXT:    s_mov_b32 s5, s9 
894+ ; GFX12-FAKE16-NEXT:    s_mov_b32 s13, s8 
895+ ; GFX12-FAKE16-NEXT:    s_lshr_b64 s[2:3], s[2:3], 1 
896+ ; GFX12-FAKE16-NEXT:    s_lshr_b64 s[6:7], s[14:15], 9 
897+ ; GFX12-FAKE16-NEXT:    s_lshr_b64 s[8:9], s[12:13], 1 
898+ ; GFX12-FAKE16-NEXT:    s_lshr_b64 s[4:5], s[4:5], 7 
923899; GFX12-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) 
924- ; GFX12-FAKE16-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s5  
925- ; GFX12-FAKE16-NEXT:    v_dual_mov_b32 v0, s2  :: v_dual_mov_b32 v3, s4  
926- ; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v2, s3  
900+ ; GFX12-FAKE16-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s4  
901+ ; GFX12-FAKE16-NEXT:    v_dual_mov_b32 v0, s8  :: v_dual_mov_b32 v3, s2  
902+ ; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v2, s6  
927903; GFX12-FAKE16-NEXT:    global_store_b128 v4, v[0:3], s[0:1] 
928904; GFX12-FAKE16-NEXT:    s_endpgm 
929905entry:
0 commit comments