@@ -103,13 +103,11 @@ define amdgpu_kernel void @fshr_i32(ptr addrspace(1) %in, i32 %x, i32 %y, i32 %z
103103; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
104104; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
105105; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
106- ; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, -1
107- ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
108106; GFX11-FAKE16-NEXT: s_sub_i32 s3, 32, s2
109107; GFX11-FAKE16-NEXT: s_lshr_b32 s1, s1, s2
110108; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, s3
109+ ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
111110; GFX11-FAKE16-NEXT: s_or_b32 s0, s1, s0
112- ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
113111; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
114112; GFX11-FAKE16-NEXT: global_store_b32 v0, v1, s[4:5]
115113; GFX11-FAKE16-NEXT: s_endpgm
@@ -133,13 +131,11 @@ define amdgpu_kernel void @fshr_i32(ptr addrspace(1) %in, i32 %x, i32 %y, i32 %z
133131; GFX12-FAKE16-NEXT: s_load_b96 s[0:2], s[4:5], 0x2c
134132; GFX12-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
135133; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
136- ; GFX12-FAKE16-NEXT: s_and_b32 s2, s2, -1
137- ; GFX12-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
138134; GFX12-FAKE16-NEXT: s_sub_co_i32 s3, 32, s2
139135; GFX12-FAKE16-NEXT: s_lshr_b32 s1, s1, s2
140136; GFX12-FAKE16-NEXT: s_lshl_b32 s0, s0, s3
137+ ; GFX12-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
141138; GFX12-FAKE16-NEXT: s_or_b32 s0, s1, s0
142- ; GFX12-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
143139; GFX12-FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
144140; GFX12-FAKE16-NEXT: global_store_b32 v0, v1, s[4:5]
145141; GFX12-FAKE16-NEXT: s_endpgm
@@ -357,11 +353,9 @@ define amdgpu_kernel void @fshr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i
357353; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
358354; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
359355; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
360- ; GFX11-FAKE16-NEXT: s_and_b32 s7, s7, -1
361- ; GFX11-FAKE16-NEXT: s_and_b32 s6, s6, -1
356+ ; GFX11-FAKE16-NEXT: s_sub_i32 s8, 32, s6
362357; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s3, s7
363358; GFX11-FAKE16-NEXT: s_sub_i32 s7, 32, s7
364- ; GFX11-FAKE16-NEXT: s_sub_i32 s8, 32, s6
365359; GFX11-FAKE16-NEXT: s_lshr_b32 s2, s2, s6
366360; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, s8
367361; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, s7
@@ -396,11 +390,9 @@ define amdgpu_kernel void @fshr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i
396390; GFX12-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
397391; GFX12-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
398392; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
399- ; GFX12-FAKE16-NEXT: s_and_b32 s7, s7, -1
400- ; GFX12-FAKE16-NEXT: s_and_b32 s6, s6, -1
393+ ; GFX12-FAKE16-NEXT: s_sub_co_i32 s8, 32, s6
401394; GFX12-FAKE16-NEXT: s_lshr_b32 s3, s3, s7
402395; GFX12-FAKE16-NEXT: s_sub_co_i32 s7, 32, s7
403- ; GFX12-FAKE16-NEXT: s_sub_co_i32 s8, 32, s6
404396; GFX12-FAKE16-NEXT: s_lshr_b32 s2, s2, s6
405397; GFX12-FAKE16-NEXT: s_lshl_b32 s0, s0, s8
406398; GFX12-FAKE16-NEXT: s_lshl_b32 s1, s1, s7
@@ -688,10 +680,6 @@ define amdgpu_kernel void @fshr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i
688680; GFX11-FAKE16-NEXT: s_load_b256 s[8:15], s[4:5], 0x34
689681; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
690682; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
691- ; GFX11-FAKE16-NEXT: s_and_b32 s3, s3, -1
692- ; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, -1
693- ; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, -1
694- ; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, -1
695683; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s15, s3
696684; GFX11-FAKE16-NEXT: s_sub_i32 s3, 32, s3
697685; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s14, s2
@@ -743,10 +731,6 @@ define amdgpu_kernel void @fshr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i
743731; GFX12-FAKE16-NEXT: s_load_b256 s[8:15], s[4:5], 0x34
744732; GFX12-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
745733; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
746- ; GFX12-FAKE16-NEXT: s_and_b32 s3, s3, -1
747- ; GFX12-FAKE16-NEXT: s_and_b32 s2, s2, -1
748- ; GFX12-FAKE16-NEXT: s_and_b32 s1, s1, -1
749- ; GFX12-FAKE16-NEXT: s_and_b32 s0, s0, -1
750734; GFX12-FAKE16-NEXT: s_lshr_b32 s6, s15, s3
751735; GFX12-FAKE16-NEXT: s_sub_co_i32 s3, 32, s3
752736; GFX12-FAKE16-NEXT: s_lshr_b32 s7, s14, s2
0 commit comments