Skip to content

Commit 82336e5

Browse files
committed
[SelectionDAG] Handle fneg/fabs/fcopysign in SimplifyDemandedBits
1 parent 937be17 commit 82336e5

File tree

9 files changed: 228 additions and 176 deletions.

llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

Lines changed: 6 additions & 23 deletions
Original file line number | Diff line number | Diff line change
@@ -18413,21 +18413,6 @@ SDValue DAGCombiner::visitFCOPYSIGN(SDNode *N) {
1841318413
}
1841418414
}
1841518415

18416-
// copysign(fabs(x), y) -> copysign(x, y)
18417-
// copysign(fneg(x), y) -> copysign(x, y)
18418-
// copysign(copysign(x,z), y) -> copysign(x, y)
18419-
if (N0.getOpcode() == ISD::FABS || N0.getOpcode() == ISD::FNEG ||
18420-
N0.getOpcode() == ISD::FCOPYSIGN)
18421-
return DAG.getNode(ISD::FCOPYSIGN, DL, VT, N0.getOperand(0), N1);
18422-
18423-
// copysign(x, abs(y)) -> abs(x)
18424-
if (N1.getOpcode() == ISD::FABS)
18425-
return DAG.getNode(ISD::FABS, DL, VT, N0);
18426-
18427-
// copysign(x, copysign(y,z)) -> copysign(x, z)
18428-
if (N1.getOpcode() == ISD::FCOPYSIGN)
18429-
return DAG.getNode(ISD::FCOPYSIGN, DL, VT, N0, N1.getOperand(1));
18430-
1843118416
// copysign(x, fp_extend(y)) -> copysign(x, y)
1843218417
// copysign(x, fp_round(y)) -> copysign(x, y)
1843318418
if (CanCombineFCOPYSIGN_EXTEND_ROUND(N))
@@ -18968,6 +18953,9 @@ SDValue DAGCombiner::visitFNEG(SDNode *N) {
1896818953
N0.getOperand(0));
1896918954
}
1897018955

18956+
if (SimplifyDemandedBits(N0, APInt::getAllOnes(VT.getScalarSizeInBits())))
18957+
return SDValue(N, 0);
18958+
1897118959
if (SDValue Cast = foldSignChangeInBitcast(N))
1897218960
return Cast;
1897318961

@@ -19041,14 +19029,9 @@ SDValue DAGCombiner::visitFABS(SDNode *N) {
1904119029
if (SDValue C = DAG.FoldConstantArithmetic(ISD::FABS, DL, VT, {N0}))
1904219030
return C;
1904319031

19044-
// fold (fabs (fabs x)) -> (fabs x)
19045-
if (N0.getOpcode() == ISD::FABS)
19046-
return N->getOperand(0);
19047-
19048-
// fold (fabs (fneg x)) -> (fabs x)
19049-
// fold (fabs (fcopysign x, y)) -> (fabs x)
19050-
if (N0.getOpcode() == ISD::FNEG || N0.getOpcode() == ISD::FCOPYSIGN)
19051-
return DAG.getNode(ISD::FABS, DL, VT, N0.getOperand(0));
19032+
if (SimplifyDemandedBits(N0,
19033+
APInt::getSignedMaxValue(VT.getScalarSizeInBits())))
19034+
return SDValue(N, 0);
1905219035

1905319036
if (SDValue Cast = foldSignChangeInBitcast(N))
1905419037
return Cast;

llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2966,6 +2966,77 @@ bool TargetLowering::SimplifyDemandedBits(
29662966
}
29672967
break;
29682968
}
2969+
case ISD::FABS: {
2970+
SDValue Op0 = Op.getOperand(0);
2971+
APInt SignMask = APInt::getSignMask(BitWidth);
2972+
2973+
if (!DemandedBits.intersects(SignMask))
2974+
return TLO.CombineTo(Op, Op0);
2975+
2976+
if (SimplifyDemandedBits(Op0, ~SignMask & DemandedBits, DemandedElts, Known,
2977+
TLO, Depth + 1))
2978+
return true;
2979+
2980+
if (Known.isNonNegative())
2981+
return TLO.CombineTo(Op, Op0);
2982+
if (Known.isNegative())
2983+
return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::FNEG, dl, VT, Op0));
2984+
2985+
Known.Zero |= SignMask;
2986+
Known.One &= ~SignMask;
2987+
2988+
break;
2989+
}
2990+
case ISD::FCOPYSIGN: {
2991+
SDValue Op0 = Op.getOperand(0);
2992+
SDValue Op1 = Op.getOperand(1);
2993+
APInt SignMask = APInt::getSignMask(BitWidth);
2994+
2995+
if (!DemandedBits.intersects(SignMask))
2996+
return TLO.CombineTo(Op, Op0);
2997+
2998+
if (SimplifyDemandedBits(Op0, ~SignMask & DemandedBits, DemandedElts, Known,
2999+
TLO, Depth + 1))
3000+
return true;
3001+
if (SimplifyDemandedBits(Op1, SignMask, DemandedElts, Known2, TLO,
3002+
Depth + 1))
3003+
return true;
3004+
3005+
if (ShrinkDemandedOp(Op, BitWidth, DemandedBits, TLO))
3006+
return true;
3007+
3008+
if ((Known.isNonNegative() && Known2.isNonNegative()) ||
3009+
(Known.isNegative() && Known2.isNegative()))
3010+
return TLO.CombineTo(Op, Op0);
3011+
3012+
if (Known2.isNonNegative())
3013+
return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::FABS, dl, VT, Op0));
3014+
3015+
if (Known2.isNegative()) {
3016+
Known.One |= SignMask;
3017+
Known.Zero &= ~SignMask;
3018+
}
3019+
3020+
break;
3021+
}
3022+
case ISD::FNEG: {
3023+
SDValue Op0 = Op.getOperand(0);
3024+
APInt SignMask = APInt::getSignMask(BitWidth);
3025+
3026+
if (!DemandedBits.intersects(SignMask))
3027+
return TLO.CombineTo(Op, Op0);
3028+
3029+
if (SimplifyDemandedBits(Op0, DemandedBits, DemandedElts, Known, TLO,
3030+
Depth + 1))
3031+
return true;
3032+
3033+
if (Known.isNonNegative() || Known.isNegative()) {
3034+
Known.Zero ^= SignMask;
3035+
Known.One ^= SignMask;
3036+
}
3037+
3038+
break;
3039+
}
29693040
default:
29703041
// We also ask the target about intrinsics (which could be specific to it).
29713042
if (Op.getOpcode() >= ISD::BUILTIN_OP_END ||

llvm/test/CodeGen/AArch64/extract-vector-elt.ll

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -425,10 +425,7 @@ entry:
425425
define float @extract_v4i32_copysign_build_vector_const(<4 x float> %a, <4 x float> %b, i32 %c) {
426426
; CHECK-SD-LABEL: extract_v4i32_copysign_build_vector_const:
427427
; CHECK-SD: // %bb.0: // %entry
428-
; CHECK-SD-NEXT: adrp x8, .LCPI17_0
429-
; CHECK-SD-NEXT: mvni v1.4s, #128, lsl #24
430-
; CHECK-SD-NEXT: ldr q2, [x8, :lo12:.LCPI17_0]
431-
; CHECK-SD-NEXT: bif v0.16b, v2.16b, v1.16b
428+
; CHECK-SD-NEXT: fabs v0.4s, v0.4s
432429
; CHECK-SD-NEXT: mov s0, v0.s[2]
433430
; CHECK-SD-NEXT: ret
434431
;

llvm/test/CodeGen/AMDGPU/bf16-conversions.ll

Lines changed: 21 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -427,16 +427,18 @@ entry:
427427
define amdgpu_ps void @fptrunc_f64_to_bf16_abs(double %a, ptr %out) {
428428
; GFX-942-LABEL: fptrunc_f64_to_bf16_abs:
429429
; GFX-942: ; %bb.0: ; %entry
430-
; GFX-942-NEXT: v_cvt_f32_f64_e64 v6, |v[0:1]|
431-
; GFX-942-NEXT: v_cvt_f64_f32_e32 v[4:5], v6
432-
; GFX-942-NEXT: v_and_b32_e32 v7, 1, v6
433-
; GFX-942-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, |v[4:5]|
434-
; GFX-942-NEXT: v_cmp_nlg_f64_e64 s[0:1], |v[0:1]|, v[4:5]
435-
; GFX-942-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7
430+
; GFX-942-NEXT: v_cvt_f32_f64_e64 v8, |v[0:1]|
431+
; GFX-942-NEXT: v_and_b32_e32 v5, 0x7fffffff, v1
432+
; GFX-942-NEXT: v_mov_b32_e32 v4, v0
433+
; GFX-942-NEXT: v_cvt_f64_f32_e32 v[6:7], v8
434+
; GFX-942-NEXT: v_and_b32_e32 v9, 1, v8
435+
; GFX-942-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[4:5]|, |v[6:7]|
436+
; GFX-942-NEXT: v_cmp_nlg_f64_e64 s[0:1], |v[0:1]|, v[6:7]
437+
; GFX-942-NEXT: v_cmp_eq_u32_e32 vcc, 1, v9
436438
; GFX-942-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[2:3]
437-
; GFX-942-NEXT: v_add_u32_e32 v4, v6, v4
439+
; GFX-942-NEXT: v_add_u32_e32 v4, v8, v4
438440
; GFX-942-NEXT: s_or_b64 vcc, s[0:1], vcc
439-
; GFX-942-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
441+
; GFX-942-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc
440442
; GFX-942-NEXT: v_bfe_u32 v5, v4, 16, 1
441443
; GFX-942-NEXT: s_movk_i32 s0, 0x7fff
442444
; GFX-942-NEXT: v_add3_u32 v5, v5, v4, s0
@@ -449,16 +451,18 @@ define amdgpu_ps void @fptrunc_f64_to_bf16_abs(double %a, ptr %out) {
449451
;
450452
; GFX-950-LABEL: fptrunc_f64_to_bf16_abs:
451453
; GFX-950: ; %bb.0: ; %entry
452-
; GFX-950-NEXT: v_cvt_f32_f64_e64 v6, |v[0:1]|
453-
; GFX-950-NEXT: v_cvt_f64_f32_e32 v[4:5], v6
454-
; GFX-950-NEXT: v_and_b32_e32 v7, 1, v6
455-
; GFX-950-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, |v[4:5]|
456-
; GFX-950-NEXT: v_cmp_nlg_f64_e64 s[0:1], |v[0:1]|, v[4:5]
457-
; GFX-950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7
458-
; GFX-950-NEXT: v_cndmask_b32_e64 v0, -1, 1, s[2:3]
459-
; GFX-950-NEXT: v_add_u32_e32 v0, v6, v0
454+
; GFX-950-NEXT: v_cvt_f32_f64_e64 v8, |v[0:1]|
455+
; GFX-950-NEXT: v_and_b32_e32 v5, 0x7fffffff, v1
456+
; GFX-950-NEXT: v_mov_b32_e32 v4, v0
457+
; GFX-950-NEXT: v_cvt_f64_f32_e32 v[6:7], v8
458+
; GFX-950-NEXT: v_cmp_nlg_f64_e64 s[0:1], |v[0:1]|, v[6:7]
459+
; GFX-950-NEXT: v_and_b32_e32 v0, 1, v8
460+
; GFX-950-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[4:5]|, |v[6:7]|
461+
; GFX-950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
460462
; GFX-950-NEXT: s_or_b64 vcc, s[0:1], vcc
461-
; GFX-950-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc
463+
; GFX-950-NEXT: v_cndmask_b32_e64 v0, -1, 1, s[2:3]
464+
; GFX-950-NEXT: v_add_u32_e32 v0, v8, v0
465+
; GFX-950-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc
462466
; GFX-950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
463467
; GFX-950-NEXT: flat_store_short v[2:3], v0
464468
; GFX-950-NEXT: s_endpgm

llvm/test/CodeGen/AMDGPU/bf16.ll

Lines changed: 18 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -18639,17 +18639,17 @@ define bfloat @v_fabs_bf16(bfloat %a) {
1863918639
; GCN: ; %bb.0:
1864018640
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1864118641
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
18642-
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
18643-
; GCN-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
18642+
; GCN-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
18643+
; GCN-NEXT: v_mul_f32_e64 v0, 1.0, |v0|
1864418644
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
1864518645
; GCN-NEXT: s_setpc_b64 s[30:31]
1864618646
;
1864718647
; GFX7-LABEL: v_fabs_bf16:
1864818648
; GFX7: ; %bb.0:
1864918649
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1865018650
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
18651-
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
18652-
; GFX7-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
18651+
; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
18652+
; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, |v0|
1865318653
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
1865418654
; GFX7-NEXT: s_setpc_b64 s[30:31]
1865518655
;
@@ -18832,8 +18832,8 @@ define bfloat @v_fneg_fabs_bf16(bfloat %a) {
1883218832
; GCN: ; %bb.0:
1883318833
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1883418834
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
18835-
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
18836-
; GCN-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
18835+
; GCN-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
18836+
; GCN-NEXT: v_mul_f32_e64 v0, 1.0, |v0|
1883718837
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
1883818838
; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
1883918839
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
@@ -18843,8 +18843,8 @@ define bfloat @v_fneg_fabs_bf16(bfloat %a) {
1884318843
; GFX7: ; %bb.0:
1884418844
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1884518845
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
18846-
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
18847-
; GFX7-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
18846+
; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
18847+
; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, |v0|
1884818848
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
1884918849
; GFX7-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
1885018850
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
@@ -18889,23 +18889,23 @@ define amdgpu_ps i32 @s_fneg_fabs_bf16(bfloat inreg %a) {
1888918889
; GCN-LABEL: s_fneg_fabs_bf16:
1889018890
; GCN: ; %bb.0:
1889118891
; GCN-NEXT: v_mul_f32_e64 v0, 1.0, s0
18892+
; GCN-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
18893+
; GCN-NEXT: v_mul_f32_e64 v0, 1.0, |v0|
18894+
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
18895+
; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
18896+
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
1889218897
; GCN-NEXT: v_readfirstlane_b32 s0, v0
18893-
; GCN-NEXT: s_and_b32 s0, s0, 0xffff0000
18894-
; GCN-NEXT: s_bitset0_b32 s0, 31
18895-
; GCN-NEXT: s_and_b32 s0, s0, 0xffff0000
18896-
; GCN-NEXT: s_xor_b32 s0, s0, 0x80000000
18897-
; GCN-NEXT: s_lshr_b32 s0, s0, 16
1889818898
; GCN-NEXT: ; return to shader part epilog
1889918899
;
1890018900
; GFX7-LABEL: s_fneg_fabs_bf16:
1890118901
; GFX7: ; %bb.0:
1890218902
; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, s0
18903+
; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
18904+
; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, |v0|
18905+
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
18906+
; GFX7-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
18907+
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
1890318908
; GFX7-NEXT: v_readfirstlane_b32 s0, v0
18904-
; GFX7-NEXT: s_and_b32 s0, s0, 0xffff0000
18905-
; GFX7-NEXT: s_bitset0_b32 s0, 31
18906-
; GFX7-NEXT: s_and_b32 s0, s0, 0xffff0000
18907-
; GFX7-NEXT: s_xor_b32 s0, s0, 0x80000000
18908-
; GFX7-NEXT: s_lshr_b32 s0, s0, 16
1890918909
; GFX7-NEXT: ; return to shader part epilog
1891018910
;
1891118911
; GFX8-LABEL: s_fneg_fabs_bf16:

0 commit comments

Comments
 (0)