Skip to content

Commit 7f65dea

Browse files
committed
[DAGCombiner] Relax nsz constraint with fp->int->fp optimizations
1 parent da43b7e commit 7f65dea

File tree

7 files changed

+208
-205
lines changed

7 files changed

+208
-205
lines changed

llvm/include/llvm/CodeGen/SelectionDAG.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2322,6 +2322,10 @@ class SelectionDAG {
23222322
/// +nan are considered positive, -0.0, -inf and -nan are not.
23232323
LLVM_ABI bool cannotBeOrderedNegativeFP(SDValue Op) const;
23242324

2325+
/// Check if all uses of a floating-point value are insensitive to signed
2326+
/// zeros.
2327+
LLVM_ABI bool allUsesSignedZeroInsensitive(SDValue Op) const;
2328+
23252329
/// Test whether two SDValues are known to compare equal. This
23262330
/// is true if they are the same value, or if one is negative zero and the
23272331
/// other positive zero.

llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

Lines changed: 14 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -18891,12 +18891,13 @@ static SDValue foldFPToIntToFP(SDNode *N, const SDLoc &DL, SelectionDAG &DAG,
1889118891
bool IsSigned = N->getOpcode() == ISD::SINT_TO_FP;
1889218892
assert(IsSigned || IsUnsigned);
1889318893

18894-
bool IsSignedZeroSafe = DAG.getTarget().Options.NoSignedZerosFPMath;
18894+
bool IsSignedZeroSafe = DAG.getTarget().Options.NoSignedZerosFPMath ||
18895+
DAG.allUsesSignedZeroInsensitive(SDValue(N, 0));
1889518896
// For signed conversions: The optimization changes signed zero behavior.
1889618897
if (IsSigned && !IsSignedZeroSafe)
1889718898
return SDValue();
1889818899
// For unsigned conversions, we need FABS to canonicalize -0.0 to +0.0
18899-
// (unless NoSignedZerosFPMath is set).
18900+
// (unless outputting a signed zero is OK).
1890018901
if (IsUnsigned && !IsSignedZeroSafe && !TLI.isFAbsFree(VT))
1890118902
return SDValue();
1890218903

@@ -19375,10 +19376,17 @@ SDValue DAGCombiner::visitFNEG(SDNode *N) {
1937519376
// FIXME: This is duplicated in getNegatibleCost, but getNegatibleCost doesn't
1937619377
// know it was called from a context with a nsz flag if the input fsub does
1937719378
// not.
19378-
if (N0.getOpcode() == ISD::FSUB && N->getFlags().hasNoSignedZeros() &&
19379-
N0.hasOneUse()) {
19380-
return DAG.getNode(ISD::FSUB, SDLoc(N), VT, N0.getOperand(1),
19381-
N0.getOperand(0));
19379+
if (N0.getOpcode() == ISD::FSUB && N0.hasOneUse()) {
  SDValue X = N0.getOperand(0);
  SDValue Y = N0.getOperand(1);

  // -(X - Y) and (Y - X) can only disagree in the sign of a zero result:
  // when X == Y, X - Y is +0.0 and its negation is -0.0, whereas Y - X is
  // +0.0. Knowing that X or Y is individually non-zero does NOT rule out
  // X == Y (e.g. X = Y = 4.0), so operand non-zeroness cannot justify the
  // fold. It is only valid when the sign of a zero result is unobservable:
  // either the fneg carries nsz, or every user of the fneg ignores the sign
  // of zero.
  if (N->getFlags().hasNoSignedZeros() ||
      DAG.allUsesSignedZeroInsensitive(SDValue(N, 0)))
    return DAG.getNode(ISD::FSUB, SDLoc(N), VT, Y, X);
}
1938319391

1938419392
if (SimplifyDemandedBits(SDValue(N, 0)))

llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6075,6 +6075,35 @@ bool SelectionDAG::isKnownNeverZeroFloat(SDValue Op) const {
60756075
Op, [](ConstantFPSDNode *C) { return !C->isZero(); });
60766076
}
60776077

6078+
/// Return true if every user of \p Op computes the same result whether a zero
/// in \p Op is +0.0 or -0.0, i.e. a transform that may flip the sign of a zero
/// produced by \p Op cannot be observed through its current users.
bool SelectionDAG::allUsesSignedZeroInsensitive(SDValue Op) const {
  assert(Op.getValueType().isFloatingPoint() &&
         "signed zeros only exist for floating-point values");
  return all_of(Op->uses(), [&](SDUse &Use) {
    // SDNode::uses() enumerates the uses of every result of the node. A use
    // of a different result is not a use of Op and places no constraint on it.
    if (Use.getResNo() != Op.getResNo())
      return true;

    SDNode *User = Use.getUser();
    unsigned OperandNo = Use.getOperandNo();

    // Check if this use is insensitive to the sign of zero.
    switch (User->getOpcode()) {
    case ISD::SETCC: // Comparisons: IEEE-754 specifies +0.0 == -0.0.
    case ISD::FABS:  // fabs always produces +0.0.
      return true;
    case ISD::FCOPYSIGN:
      // copysign overwrites the sign bit of the first operand only; as the
      // second (sign) operand the sign of zero is exactly what is observed.
      return OperandNo == 0;
    case ISD::FADD:
    case ISD::FSUB: {
      // Adding or subtracting a value that is never zero yields the same
      // result for +0.0 and -0.0, so the sign of a zero in Op is lost.
      SDValue Other = User->getOperand(1 - OperandNo);
      return isKnownNeverZeroFloat(Other);
    }
    default:
      // Unknown user: conservatively assume it can observe the sign.
      return false;
    }
  });
}
6106+
60786107
bool SelectionDAG::isKnownNeverZero(SDValue Op, unsigned Depth) const {
60796108
if (Depth >= MaxRecursionDepth)
60806109
return false; // Limit search depth.

llvm/test/CodeGen/AArch64/fp-to-int-to-fp.ll

Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -134,7 +134,111 @@ entry:
134134
ret float %f
135135
}
136136

137+
; The only user of the fptosi/sitofp round trip is an fcmp, which cannot tell +0.0 from -0.0; both check prefixes expect the same single-frintz lowering.
define i1 @test_fcmp(float %x) {
138+
; CHECK-LABEL: test_fcmp:
139+
; CHECK: // %bb.0:
140+
; CHECK-NEXT: frintz s0, s0
141+
; CHECK-NEXT: fcmp s0, #0.0
142+
; CHECK-NEXT: cset w0, eq
143+
; CHECK-NEXT: ret
144+
;
145+
; NO-SIGNED-ZEROS-LABEL: test_fcmp:
146+
; NO-SIGNED-ZEROS: // %bb.0:
147+
; NO-SIGNED-ZEROS-NEXT: frintz s0, s0
148+
; NO-SIGNED-ZEROS-NEXT: fcmp s0, #0.0
149+
; NO-SIGNED-ZEROS-NEXT: cset w0, eq
150+
; NO-SIGNED-ZEROS-NEXT: ret
151+
%conv1 = fptosi float %x to i32
152+
%conv2 = sitofp i32 %conv1 to float
153+
%cmp = fcmp oeq float %conv2, 0.0
154+
ret i1 %cmp
155+
}
156+
157+
; The only user of the fptosi/sitofp round trip is fabs, which discards the sign; both check prefixes expect frintz followed by fabs.
define float @test_fabs(float %x) {
158+
; CHECK-LABEL: test_fabs:
159+
; CHECK: // %bb.0:
160+
; CHECK-NEXT: frintz s0, s0
161+
; CHECK-NEXT: fabs s0, s0
162+
; CHECK-NEXT: ret
163+
;
164+
; NO-SIGNED-ZEROS-LABEL: test_fabs:
165+
; NO-SIGNED-ZEROS: // %bb.0:
166+
; NO-SIGNED-ZEROS-NEXT: frintz s0, s0
167+
; NO-SIGNED-ZEROS-NEXT: fabs s0, s0
168+
; NO-SIGNED-ZEROS-NEXT: ret
169+
%conv1 = fptosi float %x to i32
170+
%conv2 = sitofp i32 %conv1 to float
171+
%abs = call float @llvm.fabs.f32(float %conv2)
172+
ret float %abs
173+
}
174+
175+
; The round-trip result is the first (magnitude) operand of copysign, whose sign is replaced by that of %y; both check prefixes expect the frintz lowering.
define float @test_copysign(float %x, float %y) {
176+
; CHECK-LABEL: test_copysign:
177+
; CHECK: // %bb.0:
178+
; CHECK-NEXT: frintz s0, s0
179+
; CHECK-NEXT: mvni v2.4s, #128, lsl #24
180+
; CHECK-NEXT: // kill: def $s1 killed $s1 def $q1
181+
; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b
182+
; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0
183+
; CHECK-NEXT: ret
184+
;
185+
; NO-SIGNED-ZEROS-LABEL: test_copysign:
186+
; NO-SIGNED-ZEROS: // %bb.0:
187+
; NO-SIGNED-ZEROS-NEXT: frintz s0, s0
188+
; NO-SIGNED-ZEROS-NEXT: mvni v2.4s, #128, lsl #24
189+
; NO-SIGNED-ZEROS-NEXT: // kill: def $s1 killed $s1 def $q1
190+
; NO-SIGNED-ZEROS-NEXT: bif v0.16b, v1.16b, v2.16b
191+
; NO-SIGNED-ZEROS-NEXT: // kill: def $s0 killed $s0 killed $q0
192+
; NO-SIGNED-ZEROS-NEXT: ret
193+
%conv1 = fptosi float %x to i32
194+
%conv2 = sitofp i32 %conv1 to float
195+
%combine = call float @llvm.copysign.f32(float %conv2, float %y)
196+
ret float %combine
197+
}
198+
199+
; The round-trip result is added to the non-zero constant 1.0, so +0.0 and -0.0 give the same sum; both check prefixes expect the frintz lowering.
define float @test_fadd(float %x) {
200+
; CHECK-LABEL: test_fadd:
201+
; CHECK: // %bb.0:
202+
; CHECK-NEXT: frintz s0, s0
203+
; CHECK-NEXT: fmov s1, #1.00000000
204+
; CHECK-NEXT: fadd s0, s0, s1
205+
; CHECK-NEXT: ret
206+
;
207+
; NO-SIGNED-ZEROS-LABEL: test_fadd:
208+
; NO-SIGNED-ZEROS: // %bb.0:
209+
; NO-SIGNED-ZEROS-NEXT: frintz s0, s0
210+
; NO-SIGNED-ZEROS-NEXT: fmov s1, #1.00000000
211+
; NO-SIGNED-ZEROS-NEXT: fadd s0, s0, s1
212+
; NO-SIGNED-ZEROS-NEXT: ret
213+
%conv1 = fptosi float %x to i32
214+
%conv2 = sitofp i32 %conv1 to float
215+
%add = fadd float %conv2, 1.0
216+
ret float %add
217+
}
218+
219+
; The non-zero constant 1.0 is subtracted from the round-trip result (emitted as an fadd of -1.0), so the sign of a zero input is lost; both check prefixes expect the frintz lowering.
define float @test_fsub(float %x) {
220+
; CHECK-LABEL: test_fsub:
221+
; CHECK: // %bb.0:
222+
; CHECK-NEXT: frintz s0, s0
223+
; CHECK-NEXT: fmov s1, #-1.00000000
224+
; CHECK-NEXT: fadd s0, s0, s1
225+
; CHECK-NEXT: ret
226+
;
227+
; NO-SIGNED-ZEROS-LABEL: test_fsub:
228+
; NO-SIGNED-ZEROS: // %bb.0:
229+
; NO-SIGNED-ZEROS-NEXT: frintz s0, s0
230+
; NO-SIGNED-ZEROS-NEXT: fmov s1, #-1.00000000
231+
; NO-SIGNED-ZEROS-NEXT: fadd s0, s0, s1
232+
; NO-SIGNED-ZEROS-NEXT: ret
233+
%conv1 = fptosi float %x to i32
234+
%conv2 = sitofp i32 %conv1 to float
235+
%sub = fsub float %conv2, 1.0
236+
ret float %sub
237+
}
238+
137239
declare i32 @llvm.smin.i32(i32, i32)
138240
declare i32 @llvm.smax.i32(i32, i32)
139241
declare i32 @llvm.umin.i32(i32, i32)
140242
declare i32 @llvm.umax.i32(i32, i32)
243+
declare float @llvm.fabs.f32(float)
244+
declare float @llvm.copysign.f32(float, float)

llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.f16.ll

Lines changed: 21 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -2615,65 +2615,43 @@ define half @select_fneg_posk_src_add_f16(i32 %c, half %x, half %y) {
26152615
}
26162616

26172617
define half @select_fneg_posk_src_sub_f16(i32 %c, half %x) {
2618-
; CI-SAFE-LABEL: select_fneg_posk_src_sub_f16:
2619-
; CI-SAFE: ; %bb.0:
2620-
; CI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2621-
; CI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1
2622-
; CI-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
2623-
; CI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1
2624-
; CI-SAFE-NEXT: v_add_f32_e32 v1, -4.0, v1
2625-
; CI-SAFE-NEXT: v_cndmask_b32_e64 v0, 2.0, -v1, vcc
2626-
; CI-SAFE-NEXT: s_setpc_b64 s[30:31]
2618+
; CI-LABEL: select_fneg_posk_src_sub_f16:
2619+
; CI: ; %bb.0:
2620+
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2621+
; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
2622+
; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
2623+
; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
2624+
; CI-NEXT: v_sub_f32_e32 v1, 4.0, v1
2625+
; CI-NEXT: v_cndmask_b32_e32 v0, 2.0, v1, vcc
2626+
; CI-NEXT: s_setpc_b64 s[30:31]
26272627
;
2628-
; VI-SAFE-LABEL: select_fneg_posk_src_sub_f16:
2629-
; VI-SAFE: ; %bb.0:
2630-
; VI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2631-
; VI-SAFE-NEXT: v_add_f16_e32 v1, -4.0, v1
2632-
; VI-SAFE-NEXT: v_xor_b32_e32 v1, 0x8000, v1
2633-
; VI-SAFE-NEXT: v_mov_b32_e32 v2, 0x4000
2634-
; VI-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
2635-
; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
2636-
; VI-SAFE-NEXT: s_setpc_b64 s[30:31]
2628+
; VI-LABEL: select_fneg_posk_src_sub_f16:
2629+
; VI: ; %bb.0:
2630+
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2631+
; VI-NEXT: v_sub_f16_e32 v1, 4.0, v1
2632+
; VI-NEXT: v_mov_b32_e32 v2, 0x4000
2633+
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
2634+
; VI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
2635+
; VI-NEXT: s_setpc_b64 s[30:31]
26372636
;
26382637
; GFX11-SAFE-TRUE16-LABEL: select_fneg_posk_src_sub_f16:
26392638
; GFX11-SAFE-TRUE16: ; %bb.0:
26402639
; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2641-
; GFX11-SAFE-TRUE16-NEXT: v_add_f16_e32 v1.l, -4.0, v1.l
26422640
; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
2643-
; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
2644-
; GFX11-SAFE-TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v1.l
2641+
; GFX11-SAFE-TRUE16-NEXT: v_sub_f16_e32 v0.l, 4.0, v1.l
2642+
; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
26452643
; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x4000, v0.l, vcc_lo
26462644
; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31]
26472645
;
26482646
; GFX11-SAFE-FAKE16-LABEL: select_fneg_posk_src_sub_f16:
26492647
; GFX11-SAFE-FAKE16: ; %bb.0:
26502648
; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2651-
; GFX11-SAFE-FAKE16-NEXT: v_add_f16_e32 v1, -4.0, v1
2649+
; GFX11-SAFE-FAKE16-NEXT: v_sub_f16_e32 v1, 4.0, v1
26522650
; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
2653-
; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
2654-
; GFX11-SAFE-FAKE16-NEXT: v_xor_b32_e32 v1, 0x8000, v1
2651+
; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
26552652
; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x4000, v1, vcc_lo
26562653
; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31]
26572654
;
2658-
; CI-NSZ-LABEL: select_fneg_posk_src_sub_f16:
2659-
; CI-NSZ: ; %bb.0:
2660-
; CI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2661-
; CI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, v1
2662-
; CI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
2663-
; CI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1
2664-
; CI-NSZ-NEXT: v_sub_f32_e32 v1, 4.0, v1
2665-
; CI-NSZ-NEXT: v_cndmask_b32_e32 v0, 2.0, v1, vcc
2666-
; CI-NSZ-NEXT: s_setpc_b64 s[30:31]
2667-
;
2668-
; VI-NSZ-LABEL: select_fneg_posk_src_sub_f16:
2669-
; VI-NSZ: ; %bb.0:
2670-
; VI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2671-
; VI-NSZ-NEXT: v_sub_f16_e32 v1, 4.0, v1
2672-
; VI-NSZ-NEXT: v_mov_b32_e32 v2, 0x4000
2673-
; VI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
2674-
; VI-NSZ-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
2675-
; VI-NSZ-NEXT: s_setpc_b64 s[30:31]
2676-
;
26772655
; GFX11-NSZ-TRUE16-LABEL: select_fneg_posk_src_sub_f16:
26782656
; GFX11-NSZ-TRUE16: ; %bb.0:
26792657
; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)

llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll

Lines changed: 18 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -3277,48 +3277,37 @@ define <2 x half> @select_fneg_posk_src_sub_v2f16(<2 x i32> %c, <2 x half> %x) {
32773277
; CI-LABEL: select_fneg_posk_src_sub_v2f16:
32783278
; CI: ; %bb.0:
32793279
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3280-
; CI-NEXT: v_cvt_f16_f32_e32 v3, v3
32813280
; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
3282-
; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
3283-
; CI-NEXT: v_cvt_f32_f16_e32 v3, v3
3284-
; CI-NEXT: v_cvt_f32_f16_e32 v2, v2
3285-
; CI-NEXT: v_add_f32_e32 v3, -4.0, v3
3286-
; CI-NEXT: v_add_f32_e32 v2, -4.0, v2
32873281
; CI-NEXT: v_cvt_f16_f32_e32 v3, v3
3288-
; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
3289-
; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
3290-
; CI-NEXT: v_or_b32_e32 v2, v2, v3
3291-
; CI-NEXT: v_xor_b32_e32 v2, 0x80008000, v2
3292-
; CI-NEXT: v_cvt_f32_f16_e32 v3, v2
3293-
; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
3282+
; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
32943283
; CI-NEXT: v_cvt_f32_f16_e32 v2, v2
3295-
; CI-NEXT: v_cndmask_b32_e32 v0, 2.0, v3, vcc
3284+
; CI-NEXT: v_cvt_f32_f16_e32 v3, v3
3285+
; CI-NEXT: v_sub_f32_e32 v2, 4.0, v2
3286+
; CI-NEXT: v_sub_f32_e32 v3, 4.0, v3
3287+
; CI-NEXT: v_cndmask_b32_e32 v0, 2.0, v2, vcc
32963288
; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
3297-
; CI-NEXT: v_cndmask_b32_e32 v1, 2.0, v2, vcc
3289+
; CI-NEXT: v_cndmask_b32_e32 v1, 2.0, v3, vcc
32983290
; CI-NEXT: s_setpc_b64 s[30:31]
32993291
;
33003292
; VI-LABEL: select_fneg_posk_src_sub_v2f16:
33013293
; VI: ; %bb.0:
33023294
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33033295
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
3304-
; VI-NEXT: v_mov_b32_e32 v1, 0xc400
3305-
; VI-NEXT: v_add_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
3306-
; VI-NEXT: v_add_f16_e32 v2, -4.0, v2
3307-
; VI-NEXT: v_or_b32_e32 v1, v2, v1
3308-
; VI-NEXT: v_xor_b32_e32 v1, 0x80008000, v1
3309-
; VI-NEXT: v_mov_b32_e32 v2, 0x4000
3296+
; VI-NEXT: v_mov_b32_e32 v1, 0x4400
3297+
; VI-NEXT: v_sub_f16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
3298+
; VI-NEXT: v_sub_f16_e32 v2, 4.0, v2
3299+
; VI-NEXT: v_mov_b32_e32 v3, 0x4000
33103300
; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0
3311-
; VI-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[4:5]
3312-
; VI-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
3301+
; VI-NEXT: v_cndmask_b32_e64 v0, v3, v2, s[4:5]
3302+
; VI-NEXT: v_cndmask_b32_sdwa v1, v3, v1, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
33133303
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
33143304
; VI-NEXT: s_setpc_b64 s[30:31]
33153305
;
33163306
; GFX9-LABEL: select_fneg_posk_src_sub_v2f16:
33173307
; GFX9: ; %bb.0:
33183308
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33193309
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
3320-
; GFX9-NEXT: v_pk_add_f16 v1, v2, -4.0 op_sel_hi:[1,0]
3321-
; GFX9-NEXT: v_xor_b32_e32 v1, 0x80008000, v1
3310+
; GFX9-NEXT: v_pk_add_f16 v1, v2, 4.0 op_sel_hi:[1,0] neg_lo:[1,0] neg_hi:[1,0]
33223311
; GFX9-NEXT: v_mov_b32_e32 v2, 0x4000
33233312
; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0
33243313
; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[4:5]
@@ -3330,28 +3319,25 @@ define <2 x half> @select_fneg_posk_src_sub_v2f16(<2 x i32> %c, <2 x half> %x) {
33303319
; GFX11-TRUE16-LABEL: select_fneg_posk_src_sub_v2f16:
33313320
; GFX11-TRUE16: ; %bb.0:
33323321
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3333-
; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, v2, -4.0 op_sel_hi:[1,0]
33343322
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
3323+
; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, v2, 4.0 op_sel_hi:[1,0] neg_lo:[1,0] neg_hi:[1,0]
33353324
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 0, v1
3336-
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
3337-
; GFX11-TRUE16-NEXT: v_xor_b32_e32 v0, 0x80008000, v2
3325+
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
33383326
; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x4000, v0.l, vcc_lo
3339-
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
33403327
; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x4000, v0.h, s0
33413328
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
33423329
;
33433330
; GFX11-FAKE16-LABEL: select_fneg_posk_src_sub_v2f16:
33443331
; GFX11-FAKE16: ; %bb.0:
33453332
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3346-
; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, v2, -4.0 op_sel_hi:[1,0]
3333+
; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, v2, 4.0 op_sel_hi:[1,0] neg_lo:[1,0] neg_hi:[1,0]
33473334
; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
3348-
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
3349-
; GFX11-FAKE16-NEXT: v_xor_b32_e32 v2, 0x80008000, v2
3335+
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
33503336
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v2
33513337
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x4000, v2, vcc_lo
33523338
; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
3353-
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
33543339
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0x4000, v3, vcc_lo
3340+
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
33553341
; GFX11-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
33563342
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
33573343
%cmp = icmp eq <2 x i32> %c, zeroinitializer

0 commit comments

Comments
 (0)