@@ -17,9 +17,8 @@ define amdgpu_kernel void @v_ubfe_sub_i32(ptr addrspace(1) %out, ptr addrspace(1
1717; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc
1818; SI-NEXT: s_waitcnt vmcnt(0)
1919; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
20- ; SI-NEXT: v_sub_i32_e32 v3, vcc, 32, v3
21- ; SI-NEXT: v_lshlrev_b32_e32 v2, v3, v2
22- ; SI-NEXT: v_lshrrev_b32_e32 v2, v3, v2
20+ ; SI-NEXT: v_and_b32_e32 v3, 31, v3
21+ ; SI-NEXT: v_bfe_u32 v2, v2, 0, v3
2322; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
2423; SI-NEXT: s_endpgm
2524;
@@ -38,9 +37,8 @@ define amdgpu_kernel void @v_ubfe_sub_i32(ptr addrspace(1) %out, ptr addrspace(1
3837; VI-NEXT: v_mov_b32_e32 v1, s1
3938; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
4039; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
41- ; VI-NEXT: v_sub_u32_e32 v2, vcc, 32, v4
42- ; VI-NEXT: v_lshlrev_b32_e32 v3, v2, v3
43- ; VI-NEXT: v_lshrrev_b32_e32 v2, v2, v3
40+ ; VI-NEXT: v_and_b32_e32 v2, 31, v4
41+ ; VI-NEXT: v_bfe_u32 v2, v3, 0, v2
4442; VI-NEXT: flat_store_dword v[0:1], v2
4543; VI-NEXT: s_endpgm
4644 %id.x = tail call i32 @llvm.amdgcn.workitem.id.x ()
@@ -49,7 +47,8 @@ define amdgpu_kernel void @v_ubfe_sub_i32(ptr addrspace(1) %out, ptr addrspace(1
4947 %out.gep = getelementptr i32 , ptr addrspace (1 ) %out , i32 %id.x
5048 %src = load volatile i32 , ptr addrspace (1 ) %in0.gep
5149 %width = load volatile i32 , ptr addrspace (1 ) %in0.gep
52- %sub = sub i32 32 , %width
50+ %width5 = and i32 %width , 31
51+ %sub = sub i32 32 , %width5
5352 %shl = shl i32 %src , %sub
5453 %bfe = lshr i32 %shl , %sub
5554 store i32 %bfe , ptr addrspace (1 ) %out.gep
@@ -72,6 +71,7 @@ define amdgpu_kernel void @v_ubfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, p
7271; SI-NEXT: s_waitcnt vmcnt(0)
7372; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
7473; SI-NEXT: s_mov_b32 s6, -1
74+ ; SI-NEXT: v_and_b32_e32 v3, 31, v3
7575; SI-NEXT: v_sub_i32_e32 v3, vcc, 32, v3
7676; SI-NEXT: v_lshlrev_b32_e32 v2, v3, v2
7777; SI-NEXT: v_lshrrev_b32_e32 v3, v3, v2
@@ -95,7 +95,8 @@ define amdgpu_kernel void @v_ubfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, p
9595; VI-NEXT: v_mov_b32_e32 v1, s1
9696; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
9797; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
98- ; VI-NEXT: v_sub_u32_e32 v2, vcc, 32, v4
98+ ; VI-NEXT: v_and_b32_e32 v2, 31, v4
99+ ; VI-NEXT: v_sub_u32_e32 v2, vcc, 32, v2
99100; VI-NEXT: v_lshlrev_b32_e32 v3, v2, v3
100101; VI-NEXT: v_lshrrev_b32_e32 v2, v2, v3
101102; VI-NEXT: flat_store_dword v[0:1], v2
@@ -108,7 +109,8 @@ define amdgpu_kernel void @v_ubfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, p
108109 %out.gep = getelementptr i32 , ptr addrspace (1 ) %out , i32 %id.x
109110 %src = load volatile i32 , ptr addrspace (1 ) %in0.gep
110111 %width = load volatile i32 , ptr addrspace (1 ) %in0.gep
111- %sub = sub i32 32 , %width
112+ %width5 = and i32 %width , 31
113+ %sub = sub i32 32 , %width5
112114 %shl = shl i32 %src , %sub
113115 %bfe = lshr i32 %shl , %sub
114116 store i32 %bfe , ptr addrspace (1 ) %out.gep
@@ -219,9 +221,8 @@ define amdgpu_kernel void @v_sbfe_sub_i32(ptr addrspace(1) %out, ptr addrspace(1
219221; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc
220222; SI-NEXT: s_waitcnt vmcnt(0)
221223; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
222- ; SI-NEXT: v_sub_i32_e32 v3, vcc, 32, v3
223- ; SI-NEXT: v_lshlrev_b32_e32 v2, v3, v2
224- ; SI-NEXT: v_ashrrev_i32_e32 v2, v3, v2
224+ ; SI-NEXT: v_and_b32_e32 v3, 31, v3
225+ ; SI-NEXT: v_bfe_i32 v2, v2, 0, v3
225226; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
226227; SI-NEXT: s_endpgm
227228;
@@ -240,9 +241,8 @@ define amdgpu_kernel void @v_sbfe_sub_i32(ptr addrspace(1) %out, ptr addrspace(1
240241; VI-NEXT: v_mov_b32_e32 v1, s1
241242; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
242243; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
243- ; VI-NEXT: v_sub_u32_e32 v2, vcc, 32, v4
244- ; VI-NEXT: v_lshlrev_b32_e32 v3, v2, v3
245- ; VI-NEXT: v_ashrrev_i32_e32 v2, v2, v3
244+ ; VI-NEXT: v_and_b32_e32 v2, 31, v4
245+ ; VI-NEXT: v_bfe_i32 v2, v3, 0, v2
246246; VI-NEXT: flat_store_dword v[0:1], v2
247247; VI-NEXT: s_endpgm
248248 %id.x = tail call i32 @llvm.amdgcn.workitem.id.x ()
@@ -251,7 +251,8 @@ define amdgpu_kernel void @v_sbfe_sub_i32(ptr addrspace(1) %out, ptr addrspace(1
251251 %out.gep = getelementptr i32 , ptr addrspace (1 ) %out , i32 %id.x
252252 %src = load volatile i32 , ptr addrspace (1 ) %in0.gep
253253 %width = load volatile i32 , ptr addrspace (1 ) %in0.gep
254- %sub = sub i32 32 , %width
254+ %width5 = and i32 %width , 31
255+ %sub = sub i32 32 , %width5
255256 %shl = shl i32 %src , %sub
256257 %bfe = ashr i32 %shl , %sub
257258 store i32 %bfe , ptr addrspace (1 ) %out.gep
0 commit comments