Skip to content

Commit a3d00e1

Browse files
authored
DAG: Fold copysign with a known signmask to a disjoint or (#167266)
If the sign operand is a computed sign mask (i.e., we know it is either +0 or -0), turn the copysign into a disjoint or. This pattern appears in the pow implementations. For the or to be disjoint, we also need to know that the sign bit of the magnitude is 0. Unfortunately, the DAG's FP tracking is weak and there was no way to check whether the sign bit is known to be 0, so this adds SelectionDAG::SignBitIsZeroFP for that purpose. Ideally we would get a complete computeKnownFPClass implementation. This is intended to help avoid the regression which caused d3e7c4c to be reverted.
1 parent 315d705 commit a3d00e1

File tree

5 files changed

+67
-30
lines changed

5 files changed

+67
-30
lines changed

llvm/include/llvm/CodeGen/SelectionDAG.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2072,6 +2072,10 @@ class SelectionDAG {
20722072
/// We use this predicate to simplify operations downstream.
20732073
LLVM_ABI bool SignBitIsZero(SDValue Op, unsigned Depth = 0) const;
20742074

2075+
/// Return true if the sign bit of Op is known to be zero, for a
2076+
/// floating-point value. The answer is conservative: false means "not known".
/// Depth is the current recursion depth; the query gives up and returns false
/// once it reaches MaxRecursionDepth.
2077+
LLVM_ABI bool SignBitIsZeroFP(SDValue Op, unsigned Depth = 0) const;
2078+
20752079
/// Return true if 'Op & Mask' is known to be zero. We
20762080
/// use this predicate to simplify operations downstream. Op and Mask are
20772081
/// known to be the same type.

llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18863,6 +18863,26 @@ SDValue DAGCombiner::visitFCOPYSIGN(SDNode *N) {
1886318863
if (SimplifyDemandedBits(SDValue(N, 0)))
1886418864
return SDValue(N, 0);
1886518865

18866+
if (VT != N1.getValueType())
18867+
return SDValue();
18868+
18869+
// If this is equivalent to a disjoint or, replace it with one. This can
18870+
// happen if the sign operand is a sign mask (i.e., x << sign_bit_position).
18871+
if (DAG.SignBitIsZeroFP(N0) &&
18872+
DAG.computeKnownBits(N1).Zero.isMaxSignedValue()) {
18873+
// TODO: Just directly match the shift pattern. computeKnownBits is heavy
18874+
// for such a narrowly targeted case.
18875+
EVT IntVT = VT.changeTypeToInteger();
18876+
// TODO: It appears to be profitable in some situations to unconditionally
18877+
// emit a fabs(n0) to perform this combine.
18878+
SDValue CastSrc0 = DAG.getNode(ISD::BITCAST, DL, IntVT, N0);
18879+
SDValue CastSrc1 = DAG.getNode(ISD::BITCAST, DL, IntVT, N1);
18880+
18881+
SDValue SignOr = DAG.getNode(ISD::OR, DL, IntVT, CastSrc0, CastSrc1,
18882+
SDNodeFlags::Disjoint);
18883+
return DAG.getNode(ISD::BITCAST, DL, VT, SignOr);
18884+
}
18885+
1886618886
return SDValue();
1886718887
}
1886818888

llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2920,6 +2920,34 @@ bool SelectionDAG::SignBitIsZero(SDValue Op, unsigned Depth) const {
29202920
return MaskedValueIsZero(Op, APInt::getSignMask(BitWidth), Depth);
29212921
}
29222922

2923+
/// Return true if the sign bit of the floating-point value \p Op is known to
/// be zero.
///
/// This is a conservative, opcode-based query: it returns false whenever the
/// sign bit cannot be proven clear, including when \p Depth has reached
/// MaxRecursionDepth.
bool SelectionDAG::SignBitIsZeroFP(SDValue Op, unsigned Depth) const {
  if (Depth >= MaxRecursionDepth)
    return false; // Limit search depth.

  switch (Op.getOpcode()) {
  case ISD::FABS:
    // fabs clears the sign bit regardless of its input.
    return true;
  case ISD::AssertNoFPClass: {
    // Operand 1 is the constant mask of FP classes the value is asserted not
    // to belong to.
    const FPClassTest NoFPClass =
        static_cast<FPClassTest>(Op.getConstantOperandVal(1));

    // The sign bit is only known clear if both NaNs and every negative class
    // are excluded.
    const FPClassTest TestMask = fcNan | fcNegative;
    return (NoFPClass & TestMask) == TestMask;
  }
  case ISD::ARITH_FENCE:
    // Look through the fence to the value it wraps. Recursing on Op itself
    // would only re-enter this case until the depth limit is hit and then
    // report false.
    return SignBitIsZeroFP(Op.getOperand(0), Depth + 1);
  case ISD::FEXP:
  case ISD::FEXP2:
  case ISD::FEXP10:
    // An exponential is never negative, so only a NaN result could have the
    // sign bit set; the nnan flag rules that out.
    return Op->getFlags().hasNoNaNs();
  default:
    return false;
  }
}
2950+
29232951
/// MaskedValueIsZero - Return true if 'V & Mask' is known to be zero. We use
29242952
/// this predicate to simplify operations downstream. Mask is known to be zero
29252953
/// for bits that V cannot have.

llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll

Lines changed: 9 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -345,15 +345,13 @@ define float @test_copysign_pow_fast_f32__integral_y(float %x, i32 %y.i) {
345345
; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, s4, v3
346346
; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc
347347
; GFX9-NEXT: v_fma_f32 v2, v2, v1, v3
348-
; GFX9-NEXT: v_cvt_i32_f32_e32 v1, v1
349348
; GFX9-NEXT: v_exp_f32_e32 v2, v2
349+
; GFX9-NEXT: v_cvt_i32_f32_e32 v1, v1
350350
; GFX9-NEXT: v_not_b32_e32 v3, 63
351351
; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
352-
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 31, v1
353352
; GFX9-NEXT: v_ldexp_f32 v2, v2, v3
354-
; GFX9-NEXT: v_and_b32_e32 v0, v1, v0
355-
; GFX9-NEXT: s_brev_b32 s4, -2
356-
; GFX9-NEXT: v_bfi_b32 v0, s4, v2, v0
353+
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 31, v1
354+
; GFX9-NEXT: v_and_or_b32 v0, v1, v0, v2
357355
; GFX9-NEXT: s_setpc_b64 s[30:31]
358356
%y = sitofp i32 %y.i to float
359357
%y.fptosi = fptosi float %y to i32
@@ -379,7 +377,7 @@ define double @test_pow_fast_f64integral_y(double %x, i32 %y.i) #0 {
379377
; GFX9-NEXT: s_or_saveexec_b64 s[18:19], -1
380378
; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
381379
; GFX9-NEXT: s_mov_b64 exec, s[18:19]
382-
; GFX9-NEXT: v_writelane_b32 v43, s16, 15
380+
; GFX9-NEXT: v_writelane_b32 v43, s16, 14
383381
; GFX9-NEXT: v_writelane_b32 v43, s30, 0
384382
; GFX9-NEXT: v_writelane_b32 v43, s31, 1
385383
; GFX9-NEXT: v_writelane_b32 v43, s34, 2
@@ -391,19 +389,18 @@ define double @test_pow_fast_f64integral_y(double %x, i32 %y.i) #0 {
391389
; GFX9-NEXT: v_writelane_b32 v43, s48, 8
392390
; GFX9-NEXT: v_writelane_b32 v43, s49, 9
393391
; GFX9-NEXT: v_writelane_b32 v43, s50, 10
394-
; GFX9-NEXT: v_writelane_b32 v43, s51, 11
395392
; GFX9-NEXT: s_addk_i32 s32, 0x800
396393
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
397394
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
398395
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
399-
; GFX9-NEXT: v_writelane_b32 v43, s52, 12
396+
; GFX9-NEXT: v_writelane_b32 v43, s51, 11
400397
; GFX9-NEXT: v_mov_b32_e32 v42, v1
401-
; GFX9-NEXT: v_writelane_b32 v43, s53, 13
398+
; GFX9-NEXT: v_writelane_b32 v43, s52, 12
402399
; GFX9-NEXT: v_and_b32_e32 v1, 0x7fffffff, v42
403400
; GFX9-NEXT: s_getpc_b64 s[16:17]
404401
; GFX9-NEXT: s_add_u32 s16, s16, _Z4log2d@rel32@lo+4
405402
; GFX9-NEXT: s_addc_u32 s17, s17, _Z4log2d@rel32@hi+12
406-
; GFX9-NEXT: v_writelane_b32 v43, s54, 14
403+
; GFX9-NEXT: v_writelane_b32 v43, s53, 13
407404
; GFX9-NEXT: v_mov_b32_e32 v40, v31
408405
; GFX9-NEXT: v_mov_b32_e32 v41, v2
409406
; GFX9-NEXT: s_mov_b32 s50, s15
@@ -414,7 +411,6 @@ define double @test_pow_fast_f64integral_y(double %x, i32 %y.i) #0 {
414411
; GFX9-NEXT: s_mov_b64 s[36:37], s[8:9]
415412
; GFX9-NEXT: s_mov_b64 s[38:39], s[6:7]
416413
; GFX9-NEXT: s_mov_b64 s[48:49], s[4:5]
417-
; GFX9-NEXT: s_brev_b32 s54, -2
418414
; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
419415
; GFX9-NEXT: v_cvt_f64_i32_e32 v[2:3], v41
420416
; GFX9-NEXT: s_getpc_b64 s[16:17]
@@ -436,8 +432,7 @@ define double @test_pow_fast_f64integral_y(double %x, i32 %y.i) #0 {
436432
; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload
437433
; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
438434
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
439-
; GFX9-NEXT: v_bfi_b32 v1, s54, v1, v2
440-
; GFX9-NEXT: v_readlane_b32 s54, v43, 14
435+
; GFX9-NEXT: v_or_b32_e32 v1, v1, v2
441436
; GFX9-NEXT: v_readlane_b32 s53, v43, 13
442437
; GFX9-NEXT: v_readlane_b32 s52, v43, 12
443438
; GFX9-NEXT: v_readlane_b32 s51, v43, 11
@@ -453,7 +448,7 @@ define double @test_pow_fast_f64integral_y(double %x, i32 %y.i) #0 {
453448
; GFX9-NEXT: v_readlane_b32 s31, v43, 1
454449
; GFX9-NEXT: v_readlane_b32 s30, v43, 0
455450
; GFX9-NEXT: s_mov_b32 s32, s33
456-
; GFX9-NEXT: v_readlane_b32 s4, v43, 15
451+
; GFX9-NEXT: v_readlane_b32 s4, v43, 14
457452
; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
458453
; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
459454
; GFX9-NEXT: s_mov_b64 exec, s[6:7]

llvm/test/CodeGen/AMDGPU/copysign-to-disjoint-or-combine.ll

Lines changed: 6 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -81,8 +81,7 @@ define half @copysign_known_signmask_f16_known_positive_mag(half nofpclass(nan n
8181
; GFX9: ; %bb.0:
8282
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8383
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 15, v1
84-
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
85-
; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v1
84+
; GFX9-NEXT: v_or_b32_e32 v0, v0, v1
8685
; GFX9-NEXT: s_setpc_b64 s[30:31]
8786
%signmask = shl i16 %sign, 15
8887
%signmask.bitcast = bitcast i16 %signmask to half
@@ -94,9 +93,7 @@ define float @copysign_known_signmask_f32_known_positive_mag(float nofpclass(nan
9493
; GFX9-LABEL: copysign_known_signmask_f32_known_positive_mag:
9594
; GFX9: ; %bb.0:
9695
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
97-
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 31, v1
98-
; GFX9-NEXT: s_brev_b32 s4, -2
99-
; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v1
96+
; GFX9-NEXT: v_lshl_or_b32 v0, v1, 31, v0
10097
; GFX9-NEXT: s_setpc_b64 s[30:31]
10198
%signmask = shl i32 %sign, 31
10299
%signmask.bitcast = bitcast i32 %signmask to float
@@ -109,8 +106,7 @@ define double @copysign_known_signmask_f64_known_positive_mag(double nofpclass(n
109106
; GFX9: ; %bb.0:
110107
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
111108
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 31, v2
112-
; GFX9-NEXT: s_brev_b32 s4, -2
113-
; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v2
109+
; GFX9-NEXT: v_or_b32_e32 v1, v1, v2
114110
; GFX9-NEXT: s_setpc_b64 s[30:31]
115111
%signmask = shl i64 %sign, 63
116112
%signmask.bitcast = bitcast i64 %signmask to double
@@ -130,11 +126,9 @@ define float @copysign_known_signmask_f32_known_positive_mag__nnan_exp(float %x,
130126
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
131127
; GFX9-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
132128
; GFX9-NEXT: v_exp_f32_e32 v0, v0
133-
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 31, v1
134-
; GFX9-NEXT: s_brev_b32 s4, -2
135129
; GFX9-NEXT: v_mul_f32_e32 v2, 0x114b4ea4, v0
136130
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
137-
; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v1
131+
; GFX9-NEXT: v_lshl_or_b32 v0, v1, 31, v0
138132
; GFX9-NEXT: s_setpc_b64 s[30:31]
139133
%signbit.known.zero = call nnan afn float @llvm.exp.f32(float %x)
140134
%signmask = shl i32 %sign, 31
@@ -155,10 +149,8 @@ define float @copysign_known_signmask_f32_known_positive_mag__nnan_exp2(float %x
155149
; GFX9-NEXT: v_exp_f32_e32 v0, v0
156150
; GFX9-NEXT: v_not_b32_e32 v2, 63
157151
; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
158-
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 31, v1
159152
; GFX9-NEXT: v_ldexp_f32 v0, v0, v2
160-
; GFX9-NEXT: s_brev_b32 s4, -2
161-
; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v1
153+
; GFX9-NEXT: v_lshl_or_b32 v0, v1, 31, v0
162154
; GFX9-NEXT: s_setpc_b64 s[30:31]
163155
%signbit.known.zero = call nnan afn float @llvm.exp2.f32(float %x)
164156
%signmask = shl i32 %sign, 31
@@ -179,10 +171,8 @@ define float @copysign_known_signmask_f32_known_positive_mag__nnan_exp10(float %
179171
; GFX9-NEXT: v_exp_f32_e32 v0, v0
180172
; GFX9-NEXT: v_not_b32_e32 v2, 63
181173
; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
182-
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 31, v1
183174
; GFX9-NEXT: v_ldexp_f32 v0, v0, v2
184-
; GFX9-NEXT: s_brev_b32 s4, -2
185-
; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v1
175+
; GFX9-NEXT: v_lshl_or_b32 v0, v1, 31, v0
186176
; GFX9-NEXT: s_setpc_b64 s[30:31]
187177
%signbit.known.zero = call nnan afn float @llvm.exp2.f32(float %x)
188178
%signmask = shl i32 %sign, 31

0 commit comments

Comments
 (0)