@@ -40,9 +40,10 @@ define <2 x i16> @v_add_v2i16_fneg_lhs(<2 x half> %a, <2 x i16> %b) {
4040; GFX7-LABEL: v_add_v2i16_fneg_lhs:
4141; GFX7: ; %bb.0:
4242; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
43- ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16 , v1
43+ ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff , v1
4444; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
45- ; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
45+ ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
46+ ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
4647; GFX7-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
4748; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v0
4849; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2
@@ -52,7 +53,8 @@ define <2 x i16> @v_add_v2i16_fneg_lhs(<2 x half> %a, <2 x i16> %b) {
5253; GFX9-LABEL: v_add_v2i16_fneg_lhs:
5354; GFX9: ; %bb.0:
5455; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
55- ; GFX9-NEXT: v_pk_add_u16 v0, v0, v1 neg_lo:[1,0] neg_hi:[1,0]
56+ ; GFX9-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
57+ ; GFX9-NEXT: v_pk_add_u16 v0, v0, v1
5658; GFX9-NEXT: s_setpc_b64 s[30:31]
5759;
5860; GFX8-LABEL: v_add_v2i16_fneg_lhs:
@@ -67,7 +69,8 @@ define <2 x i16> @v_add_v2i16_fneg_lhs(<2 x half> %a, <2 x i16> %b) {
6769; GFX10-LABEL: v_add_v2i16_fneg_lhs:
6870; GFX10: ; %bb.0:
6971; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
70- ; GFX10-NEXT: v_pk_add_u16 v0, v0, v1 neg_lo:[1,0] neg_hi:[1,0]
72+ ; GFX10-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
73+ ; GFX10-NEXT: v_pk_add_u16 v0, v0, v1
7174; GFX10-NEXT: s_setpc_b64 s[30:31]
7275 %neg.a = fneg <2 x half > %a
7376 %cast.neg.a = bitcast <2 x half > %neg.a to <2 x i16 >
@@ -79,9 +82,10 @@ define <2 x i16> @v_add_v2i16_fneg_rhs(<2 x i16> %a, <2 x half> %b) {
7982; GFX7-LABEL: v_add_v2i16_fneg_rhs:
8083; GFX7: ; %bb.0:
8184; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
82- ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16 , v3
85+ ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff , v3
8386; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
84- ; GFX7-NEXT: v_or_b32_e32 v2, v3, v2
87+ ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
88+ ; GFX7-NEXT: v_or_b32_e32 v2, v2, v3
8589; GFX7-NEXT: v_xor_b32_e32 v2, 0x80008000, v2
8690; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2
8791; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2
@@ -91,7 +95,8 @@ define <2 x i16> @v_add_v2i16_fneg_rhs(<2 x i16> %a, <2 x half> %b) {
9195; GFX9-LABEL: v_add_v2i16_fneg_rhs:
9296; GFX9: ; %bb.0:
9397; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
94- ; GFX9-NEXT: v_pk_add_u16 v0, v0, v1 neg_lo:[0,1] neg_hi:[0,1]
98+ ; GFX9-NEXT: v_xor_b32_e32 v1, 0x80008000, v1
99+ ; GFX9-NEXT: v_pk_add_u16 v0, v0, v1
95100; GFX9-NEXT: s_setpc_b64 s[30:31]
96101;
97102; GFX8-LABEL: v_add_v2i16_fneg_rhs:
@@ -106,7 +111,8 @@ define <2 x i16> @v_add_v2i16_fneg_rhs(<2 x i16> %a, <2 x half> %b) {
106111; GFX10-LABEL: v_add_v2i16_fneg_rhs:
107112; GFX10: ; %bb.0:
108113; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
109- ; GFX10-NEXT: v_pk_add_u16 v0, v0, v1 neg_lo:[0,1] neg_hi:[0,1]
114+ ; GFX10-NEXT: v_xor_b32_e32 v1, 0x80008000, v1
115+ ; GFX10-NEXT: v_pk_add_u16 v0, v0, v1
110116; GFX10-NEXT: s_setpc_b64 s[30:31]
111117 %neg.b = fneg <2 x half > %b
112118 %cast.neg.b = bitcast <2 x half > %neg.b to <2 x i16 >
@@ -118,11 +124,13 @@ define <2 x i16> @v_add_v2i16_fneg_lhs_fneg_rhs(<2 x half> %a, <2 x half> %b) {
118124; GFX7-LABEL: v_add_v2i16_fneg_lhs_fneg_rhs:
119125; GFX7: ; %bb.0:
120126; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
121- ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16 , v1
127+ ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff , v1
122128; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
123- ; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
124- ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v3
125- ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
129+ ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
130+ ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
131+ ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v2
132+ ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v3
133+ ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
126134; GFX7-NEXT: v_or_b32_e32 v1, v1, v2
127135; GFX7-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
128136; GFX7-NEXT: v_xor_b32_e32 v1, 0x80008000, v1
@@ -135,7 +143,9 @@ define <2 x i16> @v_add_v2i16_fneg_lhs_fneg_rhs(<2 x half> %a, <2 x half> %b) {
135143; GFX9-LABEL: v_add_v2i16_fneg_lhs_fneg_rhs:
136144; GFX9: ; %bb.0:
137145; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
138- ; GFX9-NEXT: v_pk_add_u16 v0, v0, v1 neg_lo:[1,1] neg_hi:[1,1]
146+ ; GFX9-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
147+ ; GFX9-NEXT: v_xor_b32_e32 v1, 0x80008000, v1
148+ ; GFX9-NEXT: v_pk_add_u16 v0, v0, v1
139149; GFX9-NEXT: s_setpc_b64 s[30:31]
140150;
141151; GFX8-LABEL: v_add_v2i16_fneg_lhs_fneg_rhs:
@@ -151,7 +161,9 @@ define <2 x i16> @v_add_v2i16_fneg_lhs_fneg_rhs(<2 x half> %a, <2 x half> %b) {
151161; GFX10-LABEL: v_add_v2i16_fneg_lhs_fneg_rhs:
152162; GFX10: ; %bb.0:
153163; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
154- ; GFX10-NEXT: v_pk_add_u16 v0, v0, v1 neg_lo:[1,1] neg_hi:[1,1]
164+ ; GFX10-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
165+ ; GFX10-NEXT: v_xor_b32_e32 v1, 0x80008000, v1
166+ ; GFX10-NEXT: v_pk_add_u16 v0, v0, v1
155167; GFX10-NEXT: s_setpc_b64 s[30:31]
156168 %neg.a = fneg <2 x half > %a
157169 %neg.b = fneg <2 x half > %b
@@ -434,9 +446,10 @@ define amdgpu_ps i32 @s_add_v2i16(<2 x i16> inreg %a, <2 x i16> inreg %b) {
434446define amdgpu_ps i32 @s_add_v2i16_fneg_lhs (<2 x half > inreg %a , <2 x i16 > inreg %b ) {
435447; GFX7-LABEL: s_add_v2i16_fneg_lhs:
436448; GFX7: ; %bb.0:
449+ ; GFX7-NEXT: s_and_b32 s1, 0xffff, s1
450+ ; GFX7-NEXT: s_and_b32 s0, 0xffff, s0
437451; GFX7-NEXT: s_lshl_b32 s1, s1, 16
438- ; GFX7-NEXT: s_and_b32 s0, s0, 0xffff
439- ; GFX7-NEXT: s_or_b32 s0, s1, s0
452+ ; GFX7-NEXT: s_or_b32 s0, s0, s1
440453; GFX7-NEXT: s_xor_b32 s0, s0, 0x80008000
441454; GFX7-NEXT: s_lshr_b32 s1, s0, 16
442455; GFX7-NEXT: s_add_i32 s1, s1, s3
@@ -490,9 +503,10 @@ define amdgpu_ps i32 @s_add_v2i16_fneg_lhs(<2 x half> inreg %a, <2 x i16> inreg
490503define amdgpu_ps i32 @s_add_v2i16_fneg_rhs (<2 x i16 > inreg %a , <2 x half > inreg %b ) {
491504; GFX7-LABEL: s_add_v2i16_fneg_rhs:
492505; GFX7: ; %bb.0:
506+ ; GFX7-NEXT: s_and_b32 s3, 0xffff, s3
507+ ; GFX7-NEXT: s_and_b32 s2, 0xffff, s2
493508; GFX7-NEXT: s_lshl_b32 s3, s3, 16
494- ; GFX7-NEXT: s_and_b32 s2, s2, 0xffff
495- ; GFX7-NEXT: s_or_b32 s2, s3, s2
509+ ; GFX7-NEXT: s_or_b32 s2, s2, s3
496510; GFX7-NEXT: s_xor_b32 s2, s2, 0x80008000
497511; GFX7-NEXT: s_lshr_b32 s3, s2, 16
498512; GFX7-NEXT: s_add_i32 s1, s1, s3
@@ -546,11 +560,13 @@ define amdgpu_ps i32 @s_add_v2i16_fneg_rhs(<2 x i16> inreg %a, <2 x half> inreg
546560define amdgpu_ps i32 @s_add_v2i16_fneg_lhs_fneg_rhs (<2 x half > inreg %a , <2 x half > inreg %b ) {
547561; GFX7-LABEL: s_add_v2i16_fneg_lhs_fneg_rhs:
548562; GFX7: ; %bb.0:
563+ ; GFX7-NEXT: s_and_b32 s1, 0xffff, s1
564+ ; GFX7-NEXT: s_and_b32 s0, 0xffff, s0
549565; GFX7-NEXT: s_lshl_b32 s1, s1, 16
550- ; GFX7-NEXT: s_and_b32 s0, s0, 0xffff
551- ; GFX7-NEXT: s_or_b32 s0, s1, s0
552- ; GFX7-NEXT: s_lshl_b32 s1, s3, 16
553- ; GFX7-NEXT: s_and_b32 s2, s2, 0xffff
566+ ; GFX7-NEXT: s_or_b32 s0, s0, s1
567+ ; GFX7-NEXT: s_and_b32 s1, 0xffff, s2
568+ ; GFX7-NEXT: s_and_b32 s2, 0xffff, s3
569+ ; GFX7-NEXT: s_lshl_b32 s2, s2, 16
554570; GFX7-NEXT: s_or_b32 s1, s1, s2
555571; GFX7-NEXT: s_xor_b32 s0, s0, 0x80008000
556572; GFX7-NEXT: s_xor_b32 s1, s1, 0x80008000
0 commit comments