Skip to content

Commit dd828b4

Browse files
committed
DAG: Merge all sincos_stret emission code into legalizer
This avoids AArch64 legality rules depending on libcall availability. ARM, AArch64, and X86 all had custom lowering of fsincos whose only purpose was to emit calls to sincos_stret / sincosf_stret. This interferes with the cost heuristics around legality, because it is really an expand/libcall cost and not a favorable custom lowering. This is a bit ugly, because we're emitting code that tries to match the C ABI lowered IR type for the aggregate return type. This now also gives an easy way to support the currently unhandled x86_32 darwin case, since the ARM lowering already handled the return-as-sret case.
1 parent dd8ade3 commit dd828b4

File tree

6 files changed

+118
-174
lines changed

6 files changed

+118
-174
lines changed

llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp

Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -163,6 +163,8 @@ class SelectionDAGLegalize {
163163
RTLIB::Libcall CallI128);
164164
void ExpandDivRemLibCall(SDNode *Node, SmallVectorImpl<SDValue> &Results);
165165

166+
SDValue ExpandSincosStretLibCall(SDNode *Node) const;
167+
166168
SDValue EmitStackConvert(SDValue SrcOp, EVT SlotVT, EVT DestVT,
167169
const SDLoc &dl);
168170
SDValue EmitStackConvert(SDValue SrcOp, EVT SlotVT, EVT DestVT,
@@ -2423,6 +2425,101 @@ static bool useSinCos(SDNode *Node) {
24232425
return false;
24242426
}
24252427

2428+
SDValue SelectionDAGLegalize::ExpandSincosStretLibCall(SDNode *Node) const {
2429+
// Emit a call to the sincos_stret libcall (e.g. __sincos_stret) for Node's
2430+
// operand. Returns a node whose two results are consumed as sin and cos, or an empty SDValue if the libcall is unsupported.
2431+
SDLoc dl(Node);
2432+
SDValue Arg = Node->getOperand(0);
2433+
EVT ArgVT = Arg.getValueType();
2434+
RTLIB::Libcall LC = RTLIB::getSINCOS_STRET(ArgVT);
2435+
RTLIB::LibcallImpl SincosStret = TLI.getLibcallImpl(LC);
2436+
if (SincosStret == RTLIB::Unsupported)
2437+
return SDValue();
2438+
2439+
// There are 3 different ABI cases to handle:
2440+
// - Direct return of separate fields in registers (call result returned as-is)
2441+
// - Single return as vector elements (unpacked with EXTRACT_VECTOR_ELT below)
2442+
// - sret struct (results loaded back from a stack slot after the call)
2443+
2444+
const RTLIB::RuntimeLibcallsInfo &CallsInfo = TLI.getRuntimeLibcallsInfo();
2445+
2446+
const DataLayout &DL = DAG.getDataLayout();
2447+
2448+
auto [FuncTy, FuncAttrs] = CallsInfo.getFunctionTy(
2449+
*DAG.getContext(), TM.getTargetTriple(), DL, SincosStret);
2450+
2451+
Type *SincosStretRetTy = FuncTy->getReturnType();
2452+
CallingConv::ID CallConv = CallsInfo.getLibcallImplCallingConv(SincosStret);
2453+
StringRef LibcallImplName = CallsInfo.getLibcallImplName(SincosStret);
2454+
2455+
SDValue Callee = DAG.getExternalSymbol(LibcallImplName.data(),
2456+
TLI.getProgramPointerTy(DL));
2457+
2458+
TargetLowering::ArgListTy Args;
2459+
SDValue SRet;
2460+
2461+
int FrameIdx;
2462+
if (FuncTy->getParamType(0)->isPointerTy()) {
2463+
// Uses sret: the first parameter is a pointer to a stack slot created here to receive the result struct.
2464+
MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
2465+
2466+
AttributeSet PtrAttrs = FuncAttrs.getParamAttrs(0);
2467+
Type *StructTy = PtrAttrs.getStructRetType();
2468+
const uint64_t ByteSize = DL.getTypeAllocSize(StructTy);
2469+
const Align StackAlign = DL.getPrefTypeAlign(StructTy);
2470+
2471+
FrameIdx = MFI.CreateStackObject(ByteSize, StackAlign, false);
2472+
SRet = DAG.getFrameIndex(FrameIdx, TLI.getFrameIndexTy(DL));
2473+
2474+
TargetLowering::ArgListEntry Entry(SRet, FuncTy->getParamType(0));
2475+
Entry.IsSRet = true;
2476+
Entry.IndirectType = StructTy;
2477+
Entry.Alignment = StackAlign;
2478+
2479+
Args.push_back(Entry);
2480+
Args.emplace_back(Arg, FuncTy->getParamType(1));
2481+
} else {
2482+
Args.emplace_back(Arg, FuncTy->getParamType(0));
2483+
}
2484+
2485+
TargetLowering::CallLoweringInfo CLI(DAG);
2486+
CLI.setDebugLoc(dl)
2487+
.setChain(DAG.getEntryNode())
2488+
.setLibCallee(CallConv, SincosStretRetTy, Callee, std::move(Args))
2489+
.setIsPostTypeLegalization();
2490+
2491+
std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
2492+
2493+
if (SRet) {
2494+
MachinePointerInfo PtrInfo =
2495+
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx);
2496+
SDValue LoadSin = DAG.getLoad(ArgVT, dl, CallResult.second, SRet, PtrInfo);
2497+
2498+
TypeSize StoreSize = ArgVT.getStoreSize();
2499+
2500+
// Address of the cos field, which follows sin at an offset of ArgVT's store size.
2501+
SDValue Add = DAG.getObjectPtrOffset(dl, SRet, StoreSize);
2502+
SDValue LoadCos = DAG.getLoad(ArgVT, dl, LoadSin.getValue(1), Add,
2503+
PtrInfo.getWithOffset(StoreSize));
2504+
2505+
SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
2506+
return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, LoadSin.getValue(0),
2507+
LoadCos.getValue(0));
2508+
}
2509+
2510+
if (!CallResult.first.getValueType().isVector())
2511+
return CallResult.first;
2512+
2513+
SDValue SinVal =
2514+
DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, CallResult.first,
2515+
DAG.getVectorIdxConstant(0, dl));
2516+
SDValue CosVal =
2517+
DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, CallResult.first,
2518+
DAG.getVectorIdxConstant(1, dl));
2519+
SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
2520+
return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
2521+
}
2522+
24262523
SDValue SelectionDAGLegalize::expandLdexp(SDNode *Node) const {
24272524
SDLoc dl(Node);
24282525
EVT VT = Node->getValueType(0);
@@ -4730,6 +4827,18 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) {
47304827
case ISD::FSINCOS:
47314828
case ISD::FSINCOSPI: {
47324829
EVT VT = Node->getValueType(0);
4830+
4831+
if (Node->getOpcode() == ISD::FSINCOS) {
4832+
RTLIB::Libcall SincosStret = RTLIB::getSINCOS_STRET(VT);
4833+
if (SincosStret != RTLIB::UNKNOWN_LIBCALL) {
4834+
if (SDValue Expanded = ExpandSincosStretLibCall(Node)) {
4835+
Results.push_back(Expanded);
4836+
Results.push_back(Expanded.getValue(1));
4837+
break;
4838+
}
4839+
}
4840+
}
4841+
47334842
RTLIB::Libcall LC = Node->getOpcode() == ISD::FSINCOS
47344843
? RTLIB::getSINCOS(VT)
47354844
: RTLIB::getSINCOSPI(VT);

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 3 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -1052,15 +1052,9 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
10521052
// Lower READCYCLECOUNTER using an mrs from CNTVCT_EL0.
10531053
setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
10541054

1055-
if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
1056-
getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
1057-
// Issue __sincos_stret if available.
1058-
setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
1059-
setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
1060-
} else {
1061-
setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
1062-
setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
1063-
}
1055+
// FSINCOS is expanded; the legalizer issues __sincos_stret if available.
1056+
setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
1057+
setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
10641058

10651059
// Make floating-point constants legal for the large code model, so they don't
10661060
// become loads from the constant pool.
@@ -5346,35 +5340,6 @@ SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op,
53465340
return SDValue();
53475341
}
53485342

5349-
SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op,
5350-
SelectionDAG &DAG) const {
5351-
// For iOS, we want to call an alternative entry point: __sincos_stret,
5352-
// which returns the values in two S / D registers.
5353-
SDLoc DL(Op);
5354-
SDValue Arg = Op.getOperand(0);
5355-
EVT ArgVT = Arg.getValueType();
5356-
Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
5357-
5358-
ArgListTy Args;
5359-
Args.emplace_back(Arg, ArgTy);
5360-
5361-
RTLIB::Libcall LC = ArgVT == MVT::f64 ? RTLIB::SINCOS_STRET_F64
5362-
: RTLIB::SINCOS_STRET_F32;
5363-
const char *LibcallName = getLibcallName(LC);
5364-
SDValue Callee =
5365-
DAG.getExternalSymbol(LibcallName, getPointerTy(DAG.getDataLayout()));
5366-
5367-
StructType *RetTy = StructType::get(ArgTy, ArgTy);
5368-
TargetLowering::CallLoweringInfo CLI(DAG);
5369-
CallingConv::ID CC = getLibcallCallingConv(LC);
5370-
CLI.setDebugLoc(DL)
5371-
.setChain(DAG.getEntryNode())
5372-
.setLibCallee(CC, RetTy, Callee, std::move(Args));
5373-
5374-
std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
5375-
return CallResult.first;
5376-
}
5377-
53785343
static MVT getSVEContainerType(EVT ContentTy);
53795344

53805345
SDValue
@@ -7723,8 +7688,6 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
77237688
case ISD::FP_TO_SINT_SAT:
77247689
case ISD::FP_TO_UINT_SAT:
77257690
return LowerFP_TO_INT_SAT(Op, DAG);
7726-
case ISD::FSINCOS:
7727-
return LowerFSINCOS(Op, DAG);
77287691
case ISD::GET_ROUNDING:
77297692
return LowerGET_ROUNDING(Op, DAG);
77307693
case ISD::SET_ROUNDING:

llvm/lib/Target/AArch64/AArch64ISelLowering.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -745,7 +745,6 @@ class AArch64TargetLowering : public TargetLowering {
745745
SDValue LowerVectorOR(SDValue Op, SelectionDAG &DAG) const;
746746
SDValue LowerXOR(SDValue Op, SelectionDAG &DAG) const;
747747
SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const;
748-
SDValue LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const;
749748
SDValue LowerLOOP_DEPENDENCE_MASK(SDValue Op, SelectionDAG &DAG) const;
750749
SDValue LowerBITCAST(SDValue Op, SelectionDAG &DAG) const;
751750
SDValue LowerVSCALE(SDValue Op, SelectionDAG &DAG) const;

llvm/lib/Target/ARM/ARMISelLowering.cpp

Lines changed: 4 additions & 74 deletions
Original file line numberDiff line numberDiff line change
@@ -1312,8 +1312,8 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_,
13121312
setOperationAction(ISD::STRICT_FSETCCS, MVT::f64, Custom);
13131313
}
13141314

1315-
setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
1316-
setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
1315+
setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
1316+
setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
13171317

13181318
// FP-ARMv8 implements a lot of rounding-like FP operations.
13191319
if (Subtarget->hasFPARMv8Base()) {
@@ -9855,76 +9855,6 @@ static SDValue LowerUADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG) {
98559855
return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Result, Carry);
98569856
}
98579857

9858-
SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const {
9859-
// For iOS, we want to call an alternative entry point: __sincos_stret,
9860-
// return values are passed via sret.
9861-
SDLoc dl(Op);
9862-
SDValue Arg = Op.getOperand(0);
9863-
EVT ArgVT = Arg.getValueType();
9864-
RTLIB::Libcall LC = RTLIB::getSINCOS_STRET(ArgVT);
9865-
RTLIB::LibcallImpl SincosStret = getLibcallImpl(LC);
9866-
if (SincosStret == RTLIB::Unsupported)
9867-
return SDValue();
9868-
9869-
assert(Subtarget->isTargetDarwin());
9870-
9871-
Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
9872-
auto PtrVT = getPointerTy(DAG.getDataLayout());
9873-
9874-
MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
9875-
9876-
// Pair of floats / doubles used to pass the result.
9877-
Type *RetTy = StructType::get(ArgTy, ArgTy);
9878-
auto &DL = DAG.getDataLayout();
9879-
9880-
ArgListTy Args;
9881-
bool ShouldUseSRet = getTM().isAPCS_ABI();
9882-
SDValue SRet;
9883-
if (ShouldUseSRet) {
9884-
// Create stack object for sret.
9885-
const uint64_t ByteSize = DL.getTypeAllocSize(RetTy);
9886-
const Align StackAlign = DL.getPrefTypeAlign(RetTy);
9887-
int FrameIdx = MFI.CreateStackObject(ByteSize, StackAlign, false);
9888-
SRet = DAG.getFrameIndex(FrameIdx, getPointerTy(DL));
9889-
9890-
ArgListEntry Entry(SRet, PointerType::getUnqual(RetTy->getContext()));
9891-
Entry.IsSExt = false;
9892-
Entry.IsZExt = false;
9893-
Entry.IsSRet = true;
9894-
Args.push_back(Entry);
9895-
RetTy = Type::getVoidTy(*DAG.getContext());
9896-
}
9897-
9898-
Args.emplace_back(Arg, ArgTy);
9899-
9900-
StringRef LibcallName = getLibcallImplName(SincosStret);
9901-
CallingConv::ID CC = getLibcallImplCallingConv(SincosStret);
9902-
SDValue Callee = DAG.getExternalSymbol(LibcallName.data(), getPointerTy(DL));
9903-
9904-
TargetLowering::CallLoweringInfo CLI(DAG);
9905-
CLI.setDebugLoc(dl)
9906-
.setChain(DAG.getEntryNode())
9907-
.setCallee(CC, RetTy, Callee, std::move(Args))
9908-
.setDiscardResult(ShouldUseSRet);
9909-
std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
9910-
9911-
if (!ShouldUseSRet)
9912-
return CallResult.first;
9913-
9914-
SDValue LoadSin =
9915-
DAG.getLoad(ArgVT, dl, CallResult.second, SRet, MachinePointerInfo());
9916-
9917-
// Address of cos field.
9918-
SDValue Add = DAG.getNode(ISD::ADD, dl, PtrVT, SRet,
9919-
DAG.getIntPtrConstant(ArgVT.getStoreSize(), dl));
9920-
SDValue LoadCos =
9921-
DAG.getLoad(ArgVT, dl, LoadSin.getValue(1), Add, MachinePointerInfo());
9922-
9923-
SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
9924-
return DAG.getNode(ISD::MERGE_VALUES, dl, Tys,
9925-
LoadSin.getValue(0), LoadCos.getValue(0));
9926-
}
9927-
99289858
SDValue ARMTargetLowering::LowerWindowsDIVLibCall(SDValue Op, SelectionDAG &DAG,
99299859
bool Signed,
99309860
SDValue &Chain) const {
@@ -10726,8 +10656,8 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
1072610656
case ISD::VECREDUCE_SMAX:
1072710657
return LowerVecReduceMinMax(Op, DAG, Subtarget);
1072810658
case ISD::ATOMIC_LOAD:
10729-
case ISD::ATOMIC_STORE: return LowerAtomicLoadStore(Op, DAG);
10730-
case ISD::FSINCOS: return LowerFSINCOS(Op, DAG);
10659+
case ISD::ATOMIC_STORE:
10660+
return LowerAtomicLoadStore(Op, DAG);
1073110661
case ISD::SDIVREM:
1073210662
case ISD::UDIVREM: return LowerDivRem(Op, DAG);
1073310663
case ISD::DYNAMIC_STACKALLOC:

llvm/lib/Target/ARM/ARMISelLowering.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -901,7 +901,6 @@ class VectorType;
901901
SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
902902
const ARMSubtarget *ST) const;
903903
SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
904-
SDValue LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const;
905904
SDValue LowerDivRem(SDValue Op, SelectionDAG &DAG) const;
906905
SDValue LowerDIV_Windows(SDValue Op, SelectionDAG &DAG, bool Signed) const;
907906
void ExpandDIV_Windows(SDValue Op, SelectionDAG &DAG, bool Signed,

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 2 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -2572,8 +2572,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
25722572
}
25732573

25742574
// Combine sin / cos into _sincos_stret if it is available.
2575-
setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
2576-
setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
2575+
setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
2576+
setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
25772577

25782578
if (Subtarget.isTargetWin64()) {
25792579
setOperationAction(ISD::SDIV, MVT::i128, Custom);
@@ -33004,61 +33004,6 @@ static SDValue LowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG) {
3300433004
return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
3300533005
}
3300633006

33007-
static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
33008-
SelectionDAG &DAG) {
33009-
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
33010-
SDValue Arg = Op.getOperand(0);
33011-
EVT ArgVT = Arg.getValueType();
33012-
bool isF64 = ArgVT == MVT::f64;
33013-
33014-
RTLIB::Libcall LC = isF64 ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32;
33015-
const char *LibcallName = TLI.getLibcallName(LC);
33016-
if (!LibcallName)
33017-
return SDValue();
33018-
33019-
assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit());
33020-
33021-
// For MacOSX, we want to call an alternative entry point: __sincos_stret,
33022-
// which returns the values as { float, float } (in XMM0) or
33023-
// { double, double } (which is returned in XMM0, XMM1).
33024-
SDLoc dl(Op);
33025-
Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
33026-
33027-
TargetLowering::ArgListTy Args;
33028-
Args.emplace_back(Arg, ArgTy);
33029-
33030-
// Only optimize x86_64 for now. i386 is a bit messy. For f32,
33031-
// the small struct {f32, f32} is returned in (eax, edx). For f64,
33032-
// the results are returned via SRet in memory.
33033-
SDValue Callee =
33034-
DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));
33035-
33036-
Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy)
33037-
: (Type *)FixedVectorType::get(ArgTy, 2);
33038-
33039-
TargetLowering::CallLoweringInfo CLI(DAG);
33040-
CLI.setDebugLoc(dl)
33041-
.setChain(DAG.getEntryNode())
33042-
.setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args))
33043-
.setIsPostTypeLegalization();
33044-
33045-
std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
33046-
33047-
if (isF64)
33048-
// Returned in xmm0 and xmm1.
33049-
return CallResult.first;
33050-
33051-
// Returned in bits 0:31 and 32:64 xmm0.
33052-
SDValue SinVal =
33053-
DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, CallResult.first,
33054-
DAG.getVectorIdxConstant(0, dl));
33055-
SDValue CosVal =
33056-
DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, CallResult.first,
33057-
DAG.getVectorIdxConstant(1, dl));
33058-
SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
33059-
return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
33060-
}
33061-
3306233007
/// Widen a vector input to a vector of NVT. The
3306333008
/// input vector must have the same element type as NVT.
3306433009
static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,
@@ -33663,7 +33608,6 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
3366333608
case ISD::ABDS:
3366433609
case ISD::ABDU: return LowerABD(Op, Subtarget, DAG);
3366533610
case ISD::AVGCEILU: return LowerAVG(Op, Subtarget, DAG);
33666-
case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG);
3366733611
case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG);
3366833612
case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG);
3366933613
case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG);

0 commit comments

Comments
 (0)