From 84d910d63bf4a22ebd051f717837fd6892450e7a Mon Sep 17 00:00:00 2001
From: Michael Marjieh
Date: Tue, 17 Sep 2024 08:49:00 +0300
Subject: [PATCH 1/3] [TargetLowering][SelectionDAG] Exploit nneg Flag in
 UINT_TO_FP

1. Propagate the nneg flag in WidenVecRes
2. Use SINT_TO_FP in expandUINT_TO_FP when possible.
---
 .../SelectionDAG/LegalizeVectorTypes.cpp     | 10 +++++-----
 .../CodeGen/SelectionDAG/TargetLowering.cpp  | 20 ++++++++++++++------
 llvm/test/CodeGen/VE/Scalar/cast.ll          | 10 ++++++++++
 3 files changed, 29 insertions(+), 11 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 9674de7738838..e0b47e1045b96 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -5208,7 +5208,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_Convert(SDNode *N) {
   if (N->getOpcode() == ISD::ZERO_EXTEND &&
       getTypeAction(InVT) == TargetLowering::TypePromoteInteger &&
       TLI.getTypeToTransformTo(Ctx, InVT).getScalarSizeInBits() !=
-      WidenVT.getScalarSizeInBits()) {
+          WidenVT.getScalarSizeInBits()) {
     InOp = ZExtPromotedInteger(InOp);
     InVT = InOp.getValueType();
     if (WidenVT.getScalarSizeInBits() < InVT.getScalarSizeInBits())
@@ -5225,7 +5225,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_Convert(SDNode *N) {
     InVTEC = InVT.getVectorElementCount();
     if (InVTEC == WidenEC) {
       if (N->getNumOperands() == 1)
-        return DAG.getNode(Opcode, DL, WidenVT, InOp);
+        return DAG.getNode(Opcode, DL, WidenVT, InOp, Flags);
       if (N->getNumOperands() == 3) {
         assert(N->isVPOpcode() && "Expected VP opcode");
         SDValue Mask =
@@ -5261,7 +5261,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_Convert(SDNode *N) {
         Ops[0] = InOp;
         SDValue InVec = DAG.getNode(ISD::CONCAT_VECTORS, DL, InWidenVT, Ops);
         if (N->getNumOperands() == 1)
-          return DAG.getNode(Opcode, DL, WidenVT, InVec);
+          return DAG.getNode(Opcode, DL, WidenVT, InVec, Flags);
         return DAG.getNode(Opcode, DL, WidenVT, InVec, N->getOperand(1), Flags);
       }
 
@@ -5270,7 +5270,7 @@
                                   DAG.getVectorIdxConstant(0, DL));
       // Extract the input and convert the shorten input vector.
       if (N->getNumOperands() == 1)
-        return DAG.getNode(Opcode, DL, WidenVT, InVal);
+        return DAG.getNode(Opcode, DL, WidenVT, InVal, Flags);
       return DAG.getNode(Opcode, DL, WidenVT, InVal, N->getOperand(1), Flags);
     }
   }
@@ -5285,7 +5285,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_Convert(SDNode *N) {
     SDValue Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, InEltVT, InOp,
                               DAG.getVectorIdxConstant(i, DL));
     if (N->getNumOperands() == 1)
-      Ops[i] = DAG.getNode(Opcode, DL, EltVT, Val);
+      Ops[i] = DAG.getNode(Opcode, DL, EltVT, Val, Flags);
     else
       Ops[i] = DAG.getNode(Opcode, DL, EltVT, Val, N->getOperand(1), Flags);
   }
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 793b8ff164c23..87abda7ca2f99 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -8364,18 +8364,26 @@ bool TargetLowering::expandFP_TO_UINT(SDNode *Node, SDValue &Result,
 }
 
 bool TargetLowering::expandUINT_TO_FP(SDNode *Node, SDValue &Result,
-                                      SDValue &Chain,
-                                      SelectionDAG &DAG) const {
+                                      SDValue &Chain, SelectionDAG &DAG) const {
+  SDValue Src = Node->getOperand(0);
+  EVT SrcVT = Src.getValueType();
+  EVT DstVT = Node->getValueType(0);
+
+  // If the input is known to be non-negative and SINT_TO_FP is legal then use
+  // it.
+ if (Node->getFlags().hasNonNeg() && + isOperationLegalOrCustom(ISD::SINT_TO_FP, DstVT)) { + Result = + DAG.getNode(ISD::SINT_TO_FP, SDLoc(Node), DstVT, Node->getOperand(0)); + return true; + } + // This transform is not correct for converting 0 when rounding mode is set // to round toward negative infinity which will produce -0.0. So disable under // strictfp. if (Node->isStrictFPOpcode()) return false; - SDValue Src = Node->getOperand(0); - EVT SrcVT = Src.getValueType(); - EVT DstVT = Node->getValueType(0); - if (SrcVT.getScalarType() != MVT::i64 || DstVT.getScalarType() != MVT::f64) return false; diff --git a/llvm/test/CodeGen/VE/Scalar/cast.ll b/llvm/test/CodeGen/VE/Scalar/cast.ll index 44782b342f4d0..9253b5591b351 100644 --- a/llvm/test/CodeGen/VE/Scalar/cast.ll +++ b/llvm/test/CodeGen/VE/Scalar/cast.ll @@ -568,6 +568,16 @@ define float @ull2f(i64 %x) { ret float %r } +define float @ull2f_nneg(i64 %x) { +; CHECK-LABEL: ull2f_nneg: +; CHECK: # %bb.0: +; CHECK-NEXT: cvt.d.l %s0, %s0 +; CHECK-NEXT: cvt.s.d %s0, %s0 +; CHECK-NEXT: b.l.t (, %s10) + %r = uitofp nneg i64 %x to float + ret float %r +} + define double @ull2d(i64 %x) { ; CHECK-LABEL: ull2d: ; CHECK: # %bb.0: From f822d0cd7380f982c60c731184be32232b00f4c1 Mon Sep 17 00:00:00 2001 From: Michael Marjieh Date: Wed, 25 Sep 2024 18:36:12 +0300 Subject: [PATCH 2/3] Fix One More Comment --- llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 87abda7ca2f99..102e4a9b5fd21 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -8372,7 +8372,7 @@ bool TargetLowering::expandUINT_TO_FP(SDNode *Node, SDValue &Result, // If the input is known to be non-negative and SINT_TO_FP is legal then use // it. if (Node->getFlags().hasNonNeg() && - isOperationLegalOrCustom(ISD::SINT_TO_FP, DstVT)) { + isOperationLegalOrCustom(ISD::SINT_TO_FP, SrcVT)) { Result = DAG.getNode(ISD::SINT_TO_FP, SDLoc(Node), DstVT, Node->getOperand(0)); return true; From 50c4ae6367027cda55d3945ea9e63a172e3440f2 Mon Sep 17 00:00:00 2001 From: Michael Marjieh Date: Mon, 14 Oct 2024 09:23:39 +0300 Subject: [PATCH 3/3] Add Test and Fix Strict Ops --- .../CodeGen/SelectionDAG/TargetLowering.cpp | 23 +++++---- llvm/test/CodeGen/VE/Scalar/cast.ll | 28 ++++++++++ llvm/test/CodeGen/X86/avx512-cvt.ll | 51 ++++++++++++++++--- 3 files changed, 83 insertions(+), 19 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 102e4a9b5fd21..40f030d7b936f 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -8365,6 +8365,12 @@ bool TargetLowering::expandFP_TO_UINT(SDNode *Node, SDValue &Result, bool TargetLowering::expandUINT_TO_FP(SDNode *Node, SDValue &Result, SDValue &Chain, SelectionDAG &DAG) const { + // This transform is not correct for converting 0 when rounding mode is set + // to round toward negative infinity which will produce -0.0. So disable + // under strictfp. 
+ if (Node->isStrictFPOpcode()) + return false; + SDValue Src = Node->getOperand(0); EVT SrcVT = Src.getValueType(); EVT DstVT = Node->getValueType(0); @@ -8378,16 +8384,11 @@ bool TargetLowering::expandUINT_TO_FP(SDNode *Node, SDValue &Result, return true; } - // This transform is not correct for converting 0 when rounding mode is set - // to round toward negative infinity which will produce -0.0. So disable under - // strictfp. - if (Node->isStrictFPOpcode()) - return false; - if (SrcVT.getScalarType() != MVT::i64 || DstVT.getScalarType() != MVT::f64) return false; - // Only expand vector types if we have the appropriate vector bit operations. + // Only expand vector types if we have the appropriate vector bit + // operations. if (SrcVT.isVector() && (!isOperationLegalOrCustom(ISD::SRL, SrcVT) || !isOperationLegalOrCustom(ISD::FADD, DstVT) || !isOperationLegalOrCustom(ISD::FSUB, DstVT) || @@ -8401,8 +8402,9 @@ bool TargetLowering::expandUINT_TO_FP(SDNode *Node, SDValue &Result, // Implementation of unsigned i64 to f64 following the algorithm in // __floatundidf in compiler_rt. This implementation performs rounding // correctly in all rounding modes with the exception of converting 0 - // when rounding toward negative infinity. In that case the fsub will produce - // -0.0. This will be added to +0.0 and produce -0.0 which is incorrect. + // when rounding toward negative infinity. In that case the fsub will + // produce -0.0. This will be added to +0.0 and produce -0.0 which is + // incorrect. SDValue TwoP52 = DAG.getConstant(UINT64_C(0x4330000000000000), dl, SrcVT); SDValue TwoP84PlusTwoP52 = DAG.getConstantFP( llvm::bit_cast(UINT64_C(0x4530000000100000)), dl, DstVT); @@ -8416,8 +8418,7 @@ bool TargetLowering::expandUINT_TO_FP(SDNode *Node, SDValue &Result, SDValue HiOr = DAG.getNode(ISD::OR, dl, SrcVT, Hi, TwoP84); SDValue LoFlt = DAG.getBitcast(DstVT, LoOr); SDValue HiFlt = DAG.getBitcast(DstVT, HiOr); - SDValue HiSub = - DAG.getNode(ISD::FSUB, dl, DstVT, HiFlt, TwoP84PlusTwoP52); + SDValue HiSub = DAG.getNode(ISD::FSUB, dl, DstVT, HiFlt, TwoP84PlusTwoP52); Result = DAG.getNode(ISD::FADD, dl, DstVT, LoFlt, HiSub); return true; } diff --git a/llvm/test/CodeGen/VE/Scalar/cast.ll b/llvm/test/CodeGen/VE/Scalar/cast.ll index 9253b5591b351..6f6c93a1e639f 100644 --- a/llvm/test/CodeGen/VE/Scalar/cast.ll +++ b/llvm/test/CodeGen/VE/Scalar/cast.ll @@ -578,6 +578,34 @@ define float @ull2f_nneg(i64 %x) { ret float %r } +define float @ull2f_strict(i32 %x) { +; CHECK-LABEL: ull2f_strict: +; CHECK: # %bb.0: +; CHECK-NEXT: adds.l %s11, -16, %s11 +; CHECK-NEXT: brge.l.t %s11, %s8, .LBB58_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: ld %s61, 24(, %s14) +; CHECK-NEXT: or %s62, 0, %s0 +; CHECK-NEXT: lea %s63, 315 +; CHECK-NEXT: shm.l %s63, (%s61) +; CHECK-NEXT: shm.l %s8, 8(%s61) +; CHECK-NEXT: shm.l %s11, 16(%s61) +; CHECK-NEXT: monc +; CHECK-NEXT: or %s0, 0, %s62 +; CHECK-NEXT: .LBB58_2: +; CHECK-NEXT: lea %s1, 1127219200 +; CHECK-NEXT: stl %s1, 12(, %s11) +; CHECK-NEXT: stl %s0, 8(, %s11) +; CHECK-NEXT: ld %s0, 8(, %s11) +; CHECK-NEXT: lea.sl %s1, 1127219200 +; CHECK-NEXT: fsub.d %s0, %s0, %s1 +; CHECK-NEXT: cvt.s.d %s0, %s0 +; CHECK-NEXT: adds.l %s11, 16, %s11 +; CHECK-NEXT: b.l.t (, %s10) + %val = call float @llvm.experimental.constrained.uitofp.f32.i32(i32 %x, metadata !"round.tonearest", metadata !"fpexcept.strict") + ret float %val +} + define double @ull2d(i64 %x) { ; CHECK-LABEL: ull2d: ; CHECK: # %bb.0: diff --git a/llvm/test/CodeGen/X86/avx512-cvt.ll b/llvm/test/CodeGen/X86/avx512-cvt.ll index 
ff7f7b39c6c8b..a78d97782e6a3 100644 --- a/llvm/test/CodeGen/X86/avx512-cvt.ll +++ b/llvm/test/CodeGen/X86/avx512-cvt.ll @@ -312,11 +312,46 @@ define <4 x float> @ulto4f32(<4 x i64> %a) { ret <4 x float> %b } +define <4 x float> @ulto4f32_nneg(<4 x i64> %a) { +; NODQ-LABEL: ulto4f32_nneg: +; NODQ: # %bb.0: +; NODQ-NEXT: vpextrq $1, %xmm0, %rax +; NODQ-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 +; NODQ-NEXT: vmovq %xmm0, %rax +; NODQ-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 +; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm0 +; NODQ-NEXT: vmovq %xmm0, %rax +; NODQ-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 +; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] +; NODQ-NEXT: vpextrq $1, %xmm0, %rax +; NODQ-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 +; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; NODQ-NEXT: vzeroupper +; NODQ-NEXT: retq +; +; VLDQ-LABEL: ulto4f32_nneg: +; VLDQ: # %bb.0: +; VLDQ-NEXT: vcvtqq2ps %ymm0, %xmm0 +; VLDQ-NEXT: vzeroupper +; VLDQ-NEXT: retq +; +; DQNOVL-LABEL: ulto4f32_nneg: +; DQNOVL: # %bb.0: +; DQNOVL-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; DQNOVL-NEXT: vcvtqq2ps %zmm0, %ymm0 +; DQNOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; DQNOVL-NEXT: vzeroupper +; DQNOVL-NEXT: retq + %b = uitofp nneg <4 x i64> %a to <4 x float> + ret <4 x float> %b +} + define <8 x double> @ulto8f64(<8 x i64> %a) { ; NODQ-LABEL: ulto8f64: ; NODQ: # %bb.0: ; NODQ-NEXT: vpbroadcastq {{.*#+}} zmm1 = [4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200] -; NODQ-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm1 +; NODQ-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 | (zmm0 & mem) ; NODQ-NEXT: vpsrlq $32, %zmm0, %zmm0 ; NODQ-NEXT: vporq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 ; NODQ-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 @@ -342,14 +377,14 @@ define <16 x double> @ulto16f64(<16 x i64> %a) { ; NODQ-NEXT: vpbroadcastq {{.*#+}} zmm2 = [4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295] ; NODQ-NEXT: vpbroadcastq {{.*#+}} zmm3 = [4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200] ; NODQ-NEXT: vmovdqa64 %zmm3, %zmm4 -; NODQ-NEXT: vpternlogq $248, %zmm2, %zmm0, %zmm4 +; NODQ-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 | (zmm0 & zmm2) ; NODQ-NEXT: vpsrlq $32, %zmm0, %zmm0 ; NODQ-NEXT: vpbroadcastq {{.*#+}} zmm5 = [4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072] ; NODQ-NEXT: vporq %zmm5, %zmm0, %zmm0 ; NODQ-NEXT: vbroadcastsd {{.*#+}} zmm6 = [1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25] ; NODQ-NEXT: vsubpd %zmm6, %zmm0, %zmm0 ; NODQ-NEXT: vaddpd %zmm0, %zmm4, %zmm0 -; NODQ-NEXT: vpternlogq $248, %zmm2, %zmm1, %zmm3 +; NODQ-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 | (zmm1 & zmm2) ; NODQ-NEXT: vpsrlq $32, %zmm1, %zmm1 ; NODQ-NEXT: vporq %zmm5, %zmm1, %zmm1 ; NODQ-NEXT: vsubpd %zmm6, %zmm1, %zmm1 @@ -1483,7 +1518,7 @@ define <16 x float> @sbto16f32(<16 x i32> %a) { ; NODQ: # %bb.0: ; NODQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; NODQ-NEXT: vpcmpgtd %zmm0, %zmm1, %k1 -; NODQ-NEXT: vpternlogd $255, %zmm0, 
%zmm0, %zmm0 {%k1} {z} +; NODQ-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; NODQ-NEXT: vcvtdq2ps %zmm0, %zmm0 ; NODQ-NEXT: retq ; @@ -1564,7 +1599,7 @@ define <16 x double> @sbto16f64(<16 x double> %a) { ; NODQ-NEXT: vcmpltpd %zmm0, %zmm2, %k0 ; NODQ-NEXT: vcmpltpd %zmm1, %zmm2, %k1 ; NODQ-NEXT: kunpckbw %k0, %k1, %k1 -; NODQ-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NODQ-NEXT: vpternlogd {{.*#+}} zmm1 {%k1} {z} = -1 ; NODQ-NEXT: vcvtdq2pd %ymm1, %zmm0 ; NODQ-NEXT: vextracti64x4 $1, %zmm1, %ymm1 ; NODQ-NEXT: vcvtdq2pd %ymm1, %zmm1 @@ -1603,7 +1638,7 @@ define <8 x double> @sbto8f64(<8 x double> %a) { ; NOVLDQ: # %bb.0: ; NOVLDQ-NEXT: vxorpd %xmm1, %xmm1, %xmm1 ; NOVLDQ-NEXT: vcmpltpd %zmm0, %zmm1, %k1 -; NOVLDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NOVLDQ-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; NOVLDQ-NEXT: vcvtdq2pd %ymm0, %zmm0 ; NOVLDQ-NEXT: retq ; @@ -1864,7 +1899,7 @@ define <16 x float> @ubto16f32(<16 x i32> %a) { ; NODQ: # %bb.0: ; NODQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; NODQ-NEXT: vpcmpgtd %zmm0, %zmm1, %k1 -; NODQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NODQ-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; NODQ-NEXT: vpsrld $31, %zmm0, %zmm0 ; NODQ-NEXT: vcvtdq2ps %zmm0, %zmm0 ; NODQ-NEXT: retq @@ -1894,7 +1929,7 @@ define <16 x double> @ubto16f64(<16 x i32> %a) { ; NODQ: # %bb.0: ; NODQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; NODQ-NEXT: vpcmpgtd %zmm0, %zmm1, %k1 -; NODQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NODQ-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; NODQ-NEXT: vpsrld $31, %zmm0, %zmm1 ; NODQ-NEXT: vcvtdq2pd %ymm1, %zmm0 ; NODQ-NEXT: vextracti64x4 $1, %zmm1, %ymm1