Skip to content

Commit d6e880b

Browse files
committed
fix final known bits of fcopysign
1 parent d19201e commit d6e880b

File tree

10 files changed: 301 additions and 341 deletions.

10 files changed

+301
-341
lines changed

llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

Lines changed: 3 additions & 26 deletions
Original file line number | Diff line number | Diff line change
@@ -18399,34 +18399,12 @@ SDValue DAGCombiner::visitFCOPYSIGN(SDNode *N) {
1839918399
if (SDValue C = DAG.FoldConstantArithmetic(ISD::FCOPYSIGN, DL, VT, {N0, N1}))
1840018400
return C;
1840118401

18402-
if (ConstantFPSDNode *N1C = isConstOrConstSplatFP(N->getOperand(1))) {
18403-
const APFloat &V = N1C->getValueAPF();
18404-
// copysign(x, c1) -> fabs(x) iff ispos(c1)
18405-
// copysign(x, c1) -> fneg(fabs(x)) iff isneg(c1)
18406-
if (!V.isNegative()) {
18407-
if (!LegalOperations || TLI.isOperationLegal(ISD::FABS, VT))
18408-
return DAG.getNode(ISD::FABS, DL, VT, N0);
18409-
} else {
18410-
if (!LegalOperations || TLI.isOperationLegal(ISD::FNEG, VT))
18411-
return DAG.getNode(ISD::FNEG, DL, VT,
18412-
DAG.getNode(ISD::FABS, SDLoc(N0), VT, N0));
18413-
}
18414-
}
18415-
1841618402
// copysign(x, fp_extend(y)) -> copysign(x, y)
1841718403
// copysign(x, fp_round(y)) -> copysign(x, y)
1841818404
if (CanCombineFCOPYSIGN_EXTEND_ROUND(N))
1841918405
return DAG.getNode(ISD::FCOPYSIGN, DL, VT, N0, N1.getOperand(0));
1842018406

18421-
// We only take the sign bit from the sign operand.
18422-
EVT SignVT = N1.getValueType();
18423-
if (SimplifyDemandedBits(N1,
18424-
APInt::getSignMask(SignVT.getScalarSizeInBits())))
18425-
return SDValue(N, 0);
18426-
18427-
// We only take the non-sign bits from the value operand
18428-
if (SimplifyDemandedBits(N0,
18429-
APInt::getSignedMaxValue(VT.getScalarSizeInBits())))
18407+
if (SimplifyDemandedBits(SDValue(N, 0)))
1843018408
return SDValue(N, 0);
1843118409

1843218410
return SDValue();
@@ -18953,7 +18931,7 @@ SDValue DAGCombiner::visitFNEG(SDNode *N) {
1895318931
N0.getOperand(0));
1895418932
}
1895518933

18956-
if (SimplifyDemandedBits(N0, APInt::getAllOnes(VT.getScalarSizeInBits())))
18934+
if (SimplifyDemandedBits(SDValue(N, 0)))
1895718935
return SDValue(N, 0);
1895818936

1895918937
if (SDValue Cast = foldSignChangeInBitcast(N))
@@ -19033,8 +19011,7 @@ SDValue DAGCombiner::visitFABS(SDNode *N) {
1903319011
if (N0.getOpcode() == ISD::FABS)
1903419012
return N->getOperand(0);
1903519013

19036-
if (SimplifyDemandedBits(N0,
19037-
APInt::getSignedMaxValue(VT.getScalarSizeInBits())))
19014+
if (SimplifyDemandedBits(N0))
1903819015
return SDValue(N, 0);
1903919016

1904019017
if (SDValue Cast = foldSignChangeInBitcast(N))

llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp

Lines changed: 7 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -3000,9 +3000,7 @@ bool TargetLowering::SimplifyDemandedBits(
30003000
if (!DemandedBits.intersects(SignMask0))
30013001
return TLO.CombineTo(Op, Op0);
30023002

3003-
APInt ScalarDemandedBits = DemandedBits.trunc(BitWidth0);
3004-
3005-
if (SimplifyDemandedBits(Op0, ~SignMask0 & ScalarDemandedBits, DemandedElts,
3003+
if (SimplifyDemandedBits(Op0, ~SignMask0 & DemandedBits, DemandedElts,
30063004
Known, TLO, Depth + 1) ||
30073005
SimplifyDemandedBits(Op1, SignMask1, DemandedElts, Known2, TLO,
30083006
Depth + 1))
@@ -3016,11 +3014,13 @@ bool TargetLowering::SimplifyDemandedBits(
30163014
return TLO.CombineTo(
30173015
Op, TLO.DAG.getNode(ISD::FABS, dl, VT, Op0, Op->getFlags()));
30183016

3019-
if (Known2.isNegative()) {
3020-
Known.One |= SignMask0;
3021-
Known.Zero &= ~SignMask0;
3022-
}
3017+
if (Known2.isNegative())
3018+
return TLO.CombineTo(
3019+
Op, TLO.DAG.getNode(ISD::FNEG, dl, VT,
3020+
TLO.DAG.getNode(ISD::FABS, SDLoc(Op0), VT, Op0)));
30233021

3022+
Known.Zero &= ~SignMask0;
3023+
Known.One &= ~SignMask0;
30243024
break;
30253025
}
30263026
case ISD::FNEG: {

llvm/test/CodeGen/AArch64/extract-vector-elt.ll

Lines changed: 2 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -391,13 +391,10 @@ define float @extract_v4i32_copysign_build_vector(<4 x float> %a, <4 x float> %b
391391
; CHECK-SD: // %bb.0: // %entry
392392
; CHECK-SD-NEXT: sub sp, sp, #16
393393
; CHECK-SD-NEXT: .cfi_def_cfa_offset 16
394-
; CHECK-SD-NEXT: adrp x8, .LCPI16_0
395-
; CHECK-SD-NEXT: mvni v1.4s, #128, lsl #24
396-
; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0
397-
; CHECK-SD-NEXT: ldr q2, [x8, :lo12:.LCPI16_0]
394+
; CHECK-SD-NEXT: fabs v0.4s, v0.4s
398395
; CHECK-SD-NEXT: mov x8, sp
396+
; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0
399397
; CHECK-SD-NEXT: bfi x8, x0, #2, #2
400-
; CHECK-SD-NEXT: bif v0.16b, v2.16b, v1.16b
401398
; CHECK-SD-NEXT: str q0, [sp]
402399
; CHECK-SD-NEXT: ldr s0, [x8]
403400
; CHECK-SD-NEXT: add sp, sp, #16

llvm/test/CodeGen/AMDGPU/bf16.ll

Lines changed: 18 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -18639,17 +18639,17 @@ define bfloat @v_fabs_bf16(bfloat %a) {
1863918639
; GCN: ; %bb.0:
1864018640
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1864118641
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
18642-
; GCN-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
18643-
; GCN-NEXT: v_mul_f32_e64 v0, 1.0, |v0|
18642+
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
18643+
; GCN-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
1864418644
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
1864518645
; GCN-NEXT: s_setpc_b64 s[30:31]
1864618646
;
1864718647
; GFX7-LABEL: v_fabs_bf16:
1864818648
; GFX7: ; %bb.0:
1864918649
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1865018650
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
18651-
; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
18652-
; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, |v0|
18651+
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
18652+
; GFX7-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
1865318653
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
1865418654
; GFX7-NEXT: s_setpc_b64 s[30:31]
1865518655
;
@@ -18832,8 +18832,8 @@ define bfloat @v_fneg_fabs_bf16(bfloat %a) {
1883218832
; GCN: ; %bb.0:
1883318833
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1883418834
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
18835-
; GCN-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
18836-
; GCN-NEXT: v_mul_f32_e64 v0, 1.0, |v0|
18835+
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
18836+
; GCN-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
1883718837
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
1883818838
; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
1883918839
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
@@ -18843,8 +18843,8 @@ define bfloat @v_fneg_fabs_bf16(bfloat %a) {
1884318843
; GFX7: ; %bb.0:
1884418844
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1884518845
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
18846-
; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
18847-
; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, |v0|
18846+
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
18847+
; GFX7-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
1884818848
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
1884918849
; GFX7-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
1885018850
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
@@ -18889,23 +18889,23 @@ define amdgpu_ps i32 @s_fneg_fabs_bf16(bfloat inreg %a) {
1888918889
; GCN-LABEL: s_fneg_fabs_bf16:
1889018890
; GCN: ; %bb.0:
1889118891
; GCN-NEXT: v_mul_f32_e64 v0, 1.0, s0
18892-
; GCN-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
18893-
; GCN-NEXT: v_mul_f32_e64 v0, 1.0, |v0|
18894-
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
18895-
; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
18896-
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
1889718892
; GCN-NEXT: v_readfirstlane_b32 s0, v0
18893+
; GCN-NEXT: s_and_b32 s0, s0, 0xffff0000
18894+
; GCN-NEXT: s_bitset0_b32 s0, 31
18895+
; GCN-NEXT: s_and_b32 s0, s0, 0xffff0000
18896+
; GCN-NEXT: s_xor_b32 s0, s0, 0x80000000
18897+
; GCN-NEXT: s_lshr_b32 s0, s0, 16
1889818898
; GCN-NEXT: ; return to shader part epilog
1889918899
;
1890018900
; GFX7-LABEL: s_fneg_fabs_bf16:
1890118901
; GFX7: ; %bb.0:
1890218902
; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, s0
18903-
; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
18904-
; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, |v0|
18905-
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
18906-
; GFX7-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
18907-
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
1890818903
; GFX7-NEXT: v_readfirstlane_b32 s0, v0
18904+
; GFX7-NEXT: s_and_b32 s0, s0, 0xffff0000
18905+
; GFX7-NEXT: s_bitset0_b32 s0, 31
18906+
; GFX7-NEXT: s_and_b32 s0, s0, 0xffff0000
18907+
; GFX7-NEXT: s_xor_b32 s0, s0, 0x80000000
18908+
; GFX7-NEXT: s_lshr_b32 s0, s0, 16
1890918909
; GFX7-NEXT: ; return to shader part epilog
1891018910
;
1891118911
; GFX8-LABEL: s_fneg_fabs_bf16:

llvm/test/CodeGen/AMDGPU/fabs.bf16.ll

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -220,7 +220,7 @@ define amdgpu_kernel void @s_fabs_v4bf16(ptr addrspace(1) %out, <4 x bfloat> %in
220220
; CI-NEXT: s_waitcnt lgkmcnt(0)
221221
; CI-NEXT: s_and_b32 s4, s3, 0xffff0000
222222
; CI-NEXT: s_lshl_b32 s3, s3, 16
223-
; CI-NEXT: s_and_b32 s5, s2, 0x7fff0000
223+
; CI-NEXT: s_and_b32 s5, s2, 0xffff0000
224224
; CI-NEXT: v_mul_f32_e64 v0, 1.0, |s4|
225225
; CI-NEXT: v_mul_f32_e64 v1, 1.0, |s3|
226226
; CI-NEXT: v_mul_f32_e64 v2, 1.0, |s5|
@@ -944,7 +944,7 @@ define amdgpu_kernel void @v_extract_fabs_fold_v2bf16(ptr addrspace(1) %in) #0 {
944944
; CI-NEXT: flat_load_dword v0, v[0:1]
945945
; CI-NEXT: s_waitcnt vmcnt(0)
946946
; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v0
947-
; CI-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
947+
; CI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
948948
; CI-NEXT: v_mul_f32_e64 v1, 1.0, |v1|
949949
; CI-NEXT: v_mul_f32_e64 v0, 1.0, |v0|
950950
; CI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1

llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll

Lines changed: 24 additions & 30 deletions
Original file line number | Diff line number | Diff line change
@@ -4388,12 +4388,11 @@ define <2 x bfloat> @v_copysign_out_v2bf16_mag_v2bf16_sign_v2f32(<2 x bfloat> %m
43884388
; GFX8-LABEL: v_copysign_out_v2bf16_mag_v2bf16_sign_v2f32:
43894389
; GFX8: ; %bb.0:
43904390
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4391-
; GFX8-NEXT: v_bfe_u32 v4, v1, 16, 1
4392-
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v1
4393-
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
4394-
; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v1
4391+
; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
4392+
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
4393+
; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
43954394
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
4396-
; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
4395+
; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
43974396
; GFX8-NEXT: v_bfe_u32 v3, v2, 16, 1
43984397
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v2
43994398
; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
@@ -5267,13 +5266,12 @@ define amdgpu_ps i32 @s_copysign_out_v2bf16_mag_v2bf16_sign_v2f32(<2 x bfloat> i
52675266
;
52685267
; GFX8-LABEL: s_copysign_out_v2bf16_mag_v2bf16_sign_v2f32:
52695268
; GFX8: ; %bb.0:
5270-
; GFX8-NEXT: s_bfe_u32 s4, s1, 0x10010
5271-
; GFX8-NEXT: s_add_i32 s4, s4, s1
5272-
; GFX8-NEXT: s_or_b32 s3, s1, 0x400000
5273-
; GFX8-NEXT: s_add_i32 s6, s4, 0x7fff
5269+
; GFX8-NEXT: s_bfe_u32 s3, s1, 0x10010
5270+
; GFX8-NEXT: s_add_i32 s3, s3, s1
5271+
; GFX8-NEXT: s_addk_i32 s3, 0x7fff
52745272
; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], s1, s1
52755273
; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec
5276-
; GFX8-NEXT: s_cselect_b32 s1, s3, s6
5274+
; GFX8-NEXT: s_cselect_b32 s1, s1, s3
52775275
; GFX8-NEXT: s_bfe_u32 s3, s2, 0x10010
52785276
; GFX8-NEXT: s_add_i32 s3, s3, s2
52795277
; GFX8-NEXT: s_addk_i32 s3, 0x7fff
@@ -6340,18 +6338,16 @@ define <3 x bfloat> @v_copysign_out_v3bf16_mag_v3bf16_sign_v3f32(<3 x bfloat> %m
63406338
; GFX8-LABEL: v_copysign_out_v3bf16_mag_v3bf16_sign_v3f32:
63416339
; GFX8: ; %bb.0:
63426340
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6343-
; GFX8-NEXT: v_bfe_u32 v6, v2, 16, 1
6344-
; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v2
6345-
; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
6346-
; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v2
6347-
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
6348-
; GFX8-NEXT: v_cndmask_b32_e32 v2, v6, v5, vcc
63496341
; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 1
6350-
; GFX8-NEXT: s_movk_i32 s4, 0x7fff
63516342
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v4
6352-
; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
6343+
; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
63536344
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
63546345
; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc
6346+
; GFX8-NEXT: v_bfe_u32 v5, v2, 16, 1
6347+
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v2
6348+
; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
6349+
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
6350+
; GFX8-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc
63556351
; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1
63566352
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3
63576353
; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
@@ -7687,24 +7683,22 @@ define <4 x bfloat> @v_copysign_out_v4bf16_mag_v4bf16_sign_v4f32(<4 x bfloat> %m
76877683
; GFX8-LABEL: v_copysign_out_v4bf16_mag_v4bf16_sign_v4f32:
76887684
; GFX8: ; %bb.0:
76897685
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7690-
; GFX8-NEXT: v_bfe_u32 v7, v4, 16, 1
7691-
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v4
7692-
; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
7693-
; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v4
7686+
; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1
7687+
; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4
7688+
; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
76947689
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
7695-
; GFX8-NEXT: v_cndmask_b32_e32 v4, v7, v6, vcc
7696-
; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1
7697-
; GFX8-NEXT: s_movk_i32 s4, 0x7fff
7698-
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2
7699-
; GFX8-NEXT: v_add_u32_e32 v7, vcc, s4, v7
7700-
; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v2
7701-
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
7702-
; GFX8-NEXT: v_cndmask_b32_e32 v2, v7, v6, vcc
7690+
; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
77037691
; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1
7692+
; GFX8-NEXT: s_movk_i32 s4, 0x7fff
77047693
; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5
77057694
; GFX8-NEXT: v_add_u32_e32 v6, vcc, s4, v6
77067695
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
77077696
; GFX8-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc
7697+
; GFX8-NEXT: v_bfe_u32 v6, v2, 16, 1
7698+
; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v2
7699+
; GFX8-NEXT: v_add_u32_e32 v6, vcc, s4, v6
7700+
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
7701+
; GFX8-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
77087702
; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1
77097703
; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3
77107704
; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6

0 commit comments

Comments
 (0)