@@ -135,9 +135,9 @@ define amdgpu_kernel void @s_bfi_sha256_ch(ptr addrspace(1) %out, i32 %x, i32 %y
135135; GFX7-NEXT: s_mov_b32 s7, 0xf000
136136; GFX7-NEXT: s_mov_b32 s6, -1
137137; GFX7-NEXT: s_waitcnt lgkmcnt(0)
138- ; GFX7-NEXT: s_xor_b32 s1, s1, s2
139- ; GFX7-NEXT: s_and_b32 s0, s0, s1
140- ; GFX7-NEXT: s_xor_b32 s0, s2, s0
138+ ; GFX7-NEXT: s_andn2_b32 s2, s2, s0
139+ ; GFX7-NEXT: s_and_b32 s0, s1, s0
140+ ; GFX7-NEXT: s_or_b32 s0, s0, s2
141141; GFX7-NEXT: v_mov_b32_e32 v0, s0
142142; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0
143143; GFX7-NEXT: s_endpgm
@@ -147,9 +147,9 @@ define amdgpu_kernel void @s_bfi_sha256_ch(ptr addrspace(1) %out, i32 %x, i32 %y
147147; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
148148; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
149149; GFX8-NEXT: s_waitcnt lgkmcnt(0)
150- ; GFX8-NEXT: s_xor_b32 s1, s1, s2
151- ; GFX8-NEXT: s_and_b32 s0, s0, s1
152- ; GFX8-NEXT: s_xor_b32 s0, s2, s0
150+ ; GFX8-NEXT: s_andn2_b32 s2, s2, s0
151+ ; GFX8-NEXT: s_and_b32 s0, s1, s0
152+ ; GFX8-NEXT: s_or_b32 s0, s0, s2
153153; GFX8-NEXT: v_mov_b32_e32 v0, s4
154154; GFX8-NEXT: v_mov_b32_e32 v1, s5
155155; GFX8-NEXT: v_mov_b32_e32 v2, s0
@@ -163,9 +163,9 @@ define amdgpu_kernel void @s_bfi_sha256_ch(ptr addrspace(1) %out, i32 %x, i32 %y
163163; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
164164; GFX10-NEXT: v_mov_b32_e32 v0, 0
165165; GFX10-NEXT: s_waitcnt lgkmcnt(0)
166- ; GFX10-NEXT: s_xor_b32 s1, s1, s2
167- ; GFX10-NEXT: s_and_b32 s0, s0, s1
168- ; GFX10-NEXT: s_xor_b32 s0, s2, s0
166+ ; GFX10-NEXT: s_andn2_b32 s2, s2, s0
167+ ; GFX10-NEXT: s_and_b32 s0, s1, s0
168+ ; GFX10-NEXT: s_or_b32 s0, s0, s2
169169; GFX10-NEXT: v_mov_b32_e32 v1, s0
170170; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
171171; GFX10-NEXT: s_endpgm
@@ -317,19 +317,26 @@ entry:
317317define amdgpu_ps float @s_s_v_bfi_sha256_ch (i32 inreg %x , i32 inreg %y , i32 %z ) {
318318; GFX7-LABEL: s_s_v_bfi_sha256_ch:
319319; GFX7: ; %bb.0: ; %entry
320- ; GFX7-NEXT: v_mov_b32_e32 v1, s0
321- ; GFX7-NEXT: v_bfi_b32 v0, v1, s1, v0
320+ ; GFX7-NEXT: s_not_b32 s1, s1
321+ ; GFX7-NEXT: v_or_b32_e32 v0, s0, v0
322+ ; GFX7-NEXT: s_nand_b32 s0, s1, s0
323+ ; GFX7-NEXT: v_and_b32_e32 v0, s0, v0
322324; GFX7-NEXT: ; return to shader part epilog
323325;
324326; GFX8-LABEL: s_s_v_bfi_sha256_ch:
325327; GFX8: ; %bb.0: ; %entry
326- ; GFX8-NEXT: v_mov_b32_e32 v1, s0
327- ; GFX8-NEXT: v_bfi_b32 v0, v1, s1, v0
328+ ; GFX8-NEXT: s_not_b32 s1, s1
329+ ; GFX8-NEXT: v_or_b32_e32 v0, s0, v0
330+ ; GFX8-NEXT: s_nand_b32 s0, s1, s0
331+ ; GFX8-NEXT: v_and_b32_e32 v0, s0, v0
328332; GFX8-NEXT: ; return to shader part epilog
329333;
330334; GFX10-LABEL: s_s_v_bfi_sha256_ch:
331335; GFX10: ; %bb.0: ; %entry
332- ; GFX10-NEXT: v_bfi_b32 v0, s0, s1, v0
336+ ; GFX10-NEXT: v_or_b32_e32 v0, s0, v0
337+ ; GFX10-NEXT: s_not_b32 s1, s1
338+ ; GFX10-NEXT: s_nand_b32 s0, s1, s0
339+ ; GFX10-NEXT: v_and_b32_e32 v0, s0, v0
333340; GFX10-NEXT: ; return to shader part epilog
334341;
335342; GFX8-GISEL-LABEL: s_s_v_bfi_sha256_ch:
@@ -350,30 +357,40 @@ entry:
350357 ret float %cast
351358}
352359
353- define amdgpu_ps float @s_v_v_bfi_sha256_ch (i32 inreg %x , i32 %y , i32 %z ) {
360+ define amdgpu_ps float @s_v_v_bfi_sha256_ch (i32 inreg %x , i32 inreg %y , i32 %z ) {
354361; GFX7-LABEL: s_v_v_bfi_sha256_ch:
355362; GFX7: ; %bb.0: ; %entry
356- ; GFX7-NEXT: v_bfi_b32 v0, s0, v0, v1
363+ ; GFX7-NEXT: s_not_b32 s1, s1
364+ ; GFX7-NEXT: v_or_b32_e32 v0, s0, v0
365+ ; GFX7-NEXT: s_nand_b32 s0, s1, s0
366+ ; GFX7-NEXT: v_and_b32_e32 v0, s0, v0
357367; GFX7-NEXT: ; return to shader part epilog
358368;
359369; GFX8-LABEL: s_v_v_bfi_sha256_ch:
360370; GFX8: ; %bb.0: ; %entry
361- ; GFX8-NEXT: v_bfi_b32 v0, s0, v0, v1
371+ ; GFX8-NEXT: s_not_b32 s1, s1
372+ ; GFX8-NEXT: v_or_b32_e32 v0, s0, v0
373+ ; GFX8-NEXT: s_nand_b32 s0, s1, s0
374+ ; GFX8-NEXT: v_and_b32_e32 v0, s0, v0
362375; GFX8-NEXT: ; return to shader part epilog
363376;
364377; GFX10-LABEL: s_v_v_bfi_sha256_ch:
365378; GFX10: ; %bb.0: ; %entry
366- ; GFX10-NEXT: v_bfi_b32 v0, s0, v0, v1
379+ ; GFX10-NEXT: v_or_b32_e32 v0, s0, v0
380+ ; GFX10-NEXT: s_not_b32 s1, s1
381+ ; GFX10-NEXT: s_nand_b32 s0, s1, s0
382+ ; GFX10-NEXT: v_and_b32_e32 v0, s0, v0
367383; GFX10-NEXT: ; return to shader part epilog
368384;
369385; GFX8-GISEL-LABEL: s_v_v_bfi_sha256_ch:
370386; GFX8-GISEL: ; %bb.0: ; %entry
371- ; GFX8-GISEL-NEXT: v_bfi_b32 v0, s0, v0, v1
387+ ; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, s0
388+ ; GFX8-GISEL-NEXT: v_bfi_b32 v0, v1, s1, v0
372389; GFX8-GISEL-NEXT: ; return to shader part epilog
373390;
374391; GFX10-GISEL-LABEL: s_v_v_bfi_sha256_ch:
375392; GFX10-GISEL: ; %bb.0: ; %entry
376- ; GFX10-GISEL-NEXT: v_bfi_b32 v0, s0, v0, v1
393+ ; GFX10-GISEL-NEXT: v_bfi_b32 v0, s0, s1, v0
377394; GFX10-GISEL-NEXT: ; return to shader part epilog
378395entry:
379396 %xor0 = xor i32 %y , %z
@@ -1008,24 +1025,32 @@ define amdgpu_ps <2 x float> @v_s_s_bitselect_i64_pat_1(i64 %a, i64 inreg %b, i6
10081025define amdgpu_ps <2 x float > @s_s_v_bitselect_i64_pat_1 (i64 inreg %a , i64 inreg %b , i64 %mask ) {
10091026; GFX7-LABEL: s_s_v_bitselect_i64_pat_1:
10101027; GFX7: ; %bb.0:
1011- ; GFX7-NEXT: v_mov_b32_e32 v2, s1
1012- ; GFX7-NEXT: v_bfi_b32 v1, s3, v2, v1
1013- ; GFX7-NEXT: v_mov_b32_e32 v2, s0
1014- ; GFX7-NEXT: v_bfi_b32 v0, s2, v2, v0
1028+ ; GFX7-NEXT: s_not_b64 s[0:1], s[0:1]
1029+ ; GFX7-NEXT: v_or_b32_e32 v1, s3, v1
1030+ ; GFX7-NEXT: v_or_b32_e32 v0, s2, v0
1031+ ; GFX7-NEXT: s_nand_b64 s[0:1], s[0:1], s[2:3]
1032+ ; GFX7-NEXT: v_and_b32_e32 v1, s1, v1
1033+ ; GFX7-NEXT: v_and_b32_e32 v0, s0, v0
10151034; GFX7-NEXT: ; return to shader part epilog
10161035;
10171036; GFX8-LABEL: s_s_v_bitselect_i64_pat_1:
10181037; GFX8: ; %bb.0:
1019- ; GFX8-NEXT: v_mov_b32_e32 v2, s1
1020- ; GFX8-NEXT: v_bfi_b32 v1, s3, v2, v1
1021- ; GFX8-NEXT: v_mov_b32_e32 v2, s0
1022- ; GFX8-NEXT: v_bfi_b32 v0, s2, v2, v0
1038+ ; GFX8-NEXT: s_not_b64 s[0:1], s[0:1]
1039+ ; GFX8-NEXT: v_or_b32_e32 v1, s3, v1
1040+ ; GFX8-NEXT: v_or_b32_e32 v0, s2, v0
1041+ ; GFX8-NEXT: s_nand_b64 s[0:1], s[0:1], s[2:3]
1042+ ; GFX8-NEXT: v_and_b32_e32 v1, s1, v1
1043+ ; GFX8-NEXT: v_and_b32_e32 v0, s0, v0
10231044; GFX8-NEXT: ; return to shader part epilog
10241045;
10251046; GFX10-LABEL: s_s_v_bitselect_i64_pat_1:
10261047; GFX10: ; %bb.0:
1027- ; GFX10-NEXT: v_bfi_b32 v0, s2, s0, v0
1028- ; GFX10-NEXT: v_bfi_b32 v1, s3, s1, v1
1048+ ; GFX10-NEXT: v_or_b32_e32 v1, s3, v1
1049+ ; GFX10-NEXT: v_or_b32_e32 v0, s2, v0
1050+ ; GFX10-NEXT: s_not_b64 s[0:1], s[0:1]
1051+ ; GFX10-NEXT: s_nand_b64 s[0:1], s[0:1], s[2:3]
1052+ ; GFX10-NEXT: v_and_b32_e32 v0, s0, v0
1053+ ; GFX10-NEXT: v_and_b32_e32 v1, s1, v1
10291054; GFX10-NEXT: ; return to shader part epilog
10301055;
10311056; GFX8-GISEL-LABEL: s_s_v_bitselect_i64_pat_1:
@@ -1495,9 +1520,9 @@ define amdgpu_kernel void @s_bitselect_i64_pat_1(i64 %a, i64 %b, i64 %mask) {
14951520; GFX7-NEXT: s_mov_b32 s7, 0xf000
14961521; GFX7-NEXT: s_mov_b32 s6, -1
14971522; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1498- ; GFX7-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5]
14991523; GFX7-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
1500- ; GFX7-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5]
1524+ ; GFX7-NEXT: s_andn2_b64 s[4:5], s[4:5], s[2:3]
1525+ ; GFX7-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
15011526; GFX7-NEXT: s_add_u32 s0, s0, 10
15021527; GFX7-NEXT: s_addc_u32 s1, s1, 0
15031528; GFX7-NEXT: v_mov_b32_e32 v0, s0
@@ -1510,9 +1535,9 @@ define amdgpu_kernel void @s_bitselect_i64_pat_1(i64 %a, i64 %b, i64 %mask) {
15101535; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
15111536; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
15121537; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1513- ; GFX8-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5]
15141538; GFX8-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
1515- ; GFX8-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5]
1539+ ; GFX8-NEXT: s_andn2_b64 s[4:5], s[4:5], s[2:3]
1540+ ; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
15161541; GFX8-NEXT: s_add_u32 s0, s0, 10
15171542; GFX8-NEXT: s_addc_u32 s1, s1, 0
15181543; GFX8-NEXT: v_mov_b32_e32 v0, s0
@@ -1526,9 +1551,9 @@ define amdgpu_kernel void @s_bitselect_i64_pat_1(i64 %a, i64 %b, i64 %mask) {
15261551; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
15271552; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
15281553; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1529- ; GFX10-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5]
15301554; GFX10-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
1531- ; GFX10-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5]
1555+ ; GFX10-NEXT: s_andn2_b64 s[4:5], s[4:5], s[2:3]
1556+ ; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
15321557; GFX10-NEXT: s_add_u32 s0, s0, 10
15331558; GFX10-NEXT: s_addc_u32 s1, s1, 0
15341559; GFX10-NEXT: v_mov_b32_e32 v0, s0
@@ -1583,9 +1608,9 @@ define amdgpu_kernel void @s_bitselect_i64_pat_2(i64 %a, i64 %b, i64 %mask) {
15831608; GFX7-NEXT: s_mov_b32 s7, 0xf000
15841609; GFX7-NEXT: s_mov_b32 s6, -1
15851610; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1586- ; GFX7-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5]
15871611; GFX7-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
1588- ; GFX7-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5]
1612+ ; GFX7-NEXT: s_andn2_b64 s[4:5], s[4:5], s[2:3]
1613+ ; GFX7-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
15891614; GFX7-NEXT: s_add_u32 s0, s0, 10
15901615; GFX7-NEXT: s_addc_u32 s1, s1, 0
15911616; GFX7-NEXT: v_mov_b32_e32 v0, s0
@@ -1598,9 +1623,9 @@ define amdgpu_kernel void @s_bitselect_i64_pat_2(i64 %a, i64 %b, i64 %mask) {
15981623; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
15991624; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
16001625; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1601- ; GFX8-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5]
16021626; GFX8-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
1603- ; GFX8-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5]
1627+ ; GFX8-NEXT: s_andn2_b64 s[4:5], s[4:5], s[2:3]
1628+ ; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
16041629; GFX8-NEXT: s_add_u32 s0, s0, 10
16051630; GFX8-NEXT: s_addc_u32 s1, s1, 0
16061631; GFX8-NEXT: v_mov_b32_e32 v0, s0
@@ -1614,9 +1639,9 @@ define amdgpu_kernel void @s_bitselect_i64_pat_2(i64 %a, i64 %b, i64 %mask) {
16141639; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
16151640; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
16161641; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1617- ; GFX10-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5]
16181642; GFX10-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
1619- ; GFX10-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5]
1643+ ; GFX10-NEXT: s_andn2_b64 s[4:5], s[4:5], s[2:3]
1644+ ; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
16201645; GFX10-NEXT: s_add_u32 s0, s0, 10
16211646; GFX10-NEXT: s_addc_u32 s1, s1, 0
16221647; GFX10-NEXT: v_mov_b32_e32 v0, s0
0 commit comments