Skip to content

Commit 831e79a

Browse files
authored
DAG: Merge all sincos_stret emission code into legalizer (#166295)
This avoids AArch64 legality rules depending on libcall availability. ARM, AArch64, and X86 each had a custom lowering of fsincos whose only purpose was to emit calls to sincos_stret / sincosf_stret. Marking the operation Custom for that purpose skews the cost heuristics around legality, because the real cost is that of an expand/libcall, not of a favorable custom lowering. The merged implementation is a bit ugly, because it emits code that tries to match the C-ABI-lowered IR type of the aggregate return type. It also provides an easy way to support the previously unhandled x86_32 Darwin case, since ARM already handled the return-via-sret case.
1 parent fe106b6 commit 831e79a

File tree

6 files changed

+118
-174
lines changed

6 files changed

+118
-174
lines changed

llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp

Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -163,6 +163,8 @@ class SelectionDAGLegalize {
163163
RTLIB::Libcall CallI128);
164164
void ExpandDivRemLibCall(SDNode *Node, SmallVectorImpl<SDValue> &Results);
165165

166+
SDValue ExpandSincosStretLibCall(SDNode *Node) const;
167+
166168
SDValue EmitStackConvert(SDValue SrcOp, EVT SlotVT, EVT DestVT,
167169
const SDLoc &dl);
168170
SDValue EmitStackConvert(SDValue SrcOp, EVT SlotVT, EVT DestVT,
@@ -2423,6 +2425,101 @@ static bool useSinCos(SDNode *Node) {
24232425
return false;
24242426
}
24252427

2428+
/// Try to lower the sin/cos node \p Node to a call to the sincos_stret libcall
/// selected for the type of its first operand.
///
/// Returns an empty SDValue when TLI reports no implementation for that
/// libcall (RTLIB::Unsupported). Otherwise emits the call and returns its
/// result: a MERGE_VALUES of (sin, cos) for the sret and vector-return cases,
/// or the call's own result value for the direct-return case (presumably a
/// two-result value, since the caller also reads getValue(1) -- confirm).
SDValue SelectionDAGLegalize::ExpandSincosStretLibCall(SDNode *Node) const {
2429+
// Look up the sincos_stret libcall for the operand type and bail out if the
2430+
// target has no implementation of it.
2431+
SDLoc dl(Node);
2432+
SDValue Arg = Node->getOperand(0);
2433+
EVT ArgVT = Arg.getValueType();
2434+
RTLIB::Libcall LC = RTLIB::getSINCOS_STRET(ArgVT);
2435+
RTLIB::LibcallImpl SincosStret = TLI.getLibcallImpl(LC);
2436+
if (SincosStret == RTLIB::Unsupported)
2437+
return SDValue();
2438+
2439+
// There are 3 different ABI cases to handle:
2440+
// - Direct return of separate fields in registers
2441+
// - Single return as vector elements
2442+
// - sret struct
// The case is chosen from the libcall's function type: a pointer-typed first
// parameter means sret; otherwise the type of the call result decides
// between the vector and the direct return.
2443+
2444+
const RTLIB::RuntimeLibcallsInfo &CallsInfo = TLI.getRuntimeLibcallsInfo();
2445+
2446+
const DataLayout &DL = DAG.getDataLayout();
2447+
// Function type and attributes describing the libcall for this target
// triple and data layout.
2448+
auto [FuncTy, FuncAttrs] = CallsInfo.getFunctionTy(
2449+
*DAG.getContext(), TM.getTargetTriple(), DL, SincosStret);
2450+
2451+
Type *SincosStretRetTy = FuncTy->getReturnType();
2452+
CallingConv::ID CallConv = CallsInfo.getLibcallImplCallingConv(SincosStret);
2453+
StringRef LibcallImplName = CallsInfo.getLibcallImplName(SincosStret);
2454+
2455+
SDValue Callee = DAG.getExternalSymbol(LibcallImplName.data(),
2456+
TLI.getProgramPointerTy(DL));
2457+
2458+
TargetLowering::ArgListTy Args;
2459+
SDValue SRet;
2460+
// FrameIdx is only assigned and read on the sret path (guarded by SRet).
2461+
int FrameIdx;
2462+
if (FuncTy->getParamType(0)->isPointerTy()) {
2463+
// Uses sret: create a stack object for the returned struct and pass its
// address as the first argument, followed by the operand itself.
2464+
MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
2465+
2466+
AttributeSet PtrAttrs = FuncAttrs.getParamAttrs(0);
2467+
Type *StructTy = PtrAttrs.getStructRetType();
2468+
const uint64_t ByteSize = DL.getTypeAllocSize(StructTy);
2469+
const Align StackAlign = DL.getPrefTypeAlign(StructTy);
2470+
2471+
FrameIdx = MFI.CreateStackObject(ByteSize, StackAlign, false);
2472+
SRet = DAG.getFrameIndex(FrameIdx, TLI.getFrameIndexTy(DL));
2473+
2474+
TargetLowering::ArgListEntry Entry(SRet, FuncTy->getParamType(0));
2475+
Entry.IsSRet = true;
2476+
Entry.IndirectType = StructTy;
2477+
Entry.Alignment = StackAlign;
2478+
2479+
Args.push_back(Entry);
2480+
Args.emplace_back(Arg, FuncTy->getParamType(1));
2481+
} else {
2482+
Args.emplace_back(Arg, FuncTy->getParamType(0));
2483+
}
2484+
// Emit the call, chained to the DAG entry node.
2485+
TargetLowering::CallLoweringInfo CLI(DAG);
2486+
CLI.setDebugLoc(dl)
2487+
.setChain(DAG.getEntryNode())
2488+
.setLibCallee(CallConv, SincosStretRetTy, Callee, std::move(Args))
2489+
.setIsPostTypeLegalization();
2490+
2491+
std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
2492+
// sret case: read both fields back from the stack object. The sin load is
// chained after the call and the cos load after the sin load.
2493+
if (SRet) {
2494+
MachinePointerInfo PtrInfo =
2495+
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx);
2496+
SDValue LoadSin = DAG.getLoad(ArgVT, dl, CallResult.second, SRet, PtrInfo);
2497+
2498+
TypeSize StoreSize = ArgVT.getStoreSize();
2499+
2500+
// Address of cos field: one ArgVT store size past the sin field.
2501+
SDValue Add = DAG.getObjectPtrOffset(dl, SRet, StoreSize);
2502+
SDValue LoadCos = DAG.getLoad(ArgVT, dl, LoadSin.getValue(1), Add,
2503+
PtrInfo.getWithOffset(StoreSize));
2504+
2505+
SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
2506+
return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, LoadSin.getValue(0),
2507+
LoadCos.getValue(0));
2508+
}
2509+
// Direct return: a non-vector call result is handed back unchanged.
2510+
if (!CallResult.first.getValueType().isVector())
2511+
return CallResult.first;
2512+
// Vector return: element 0 is sin and element 1 is cos.
2513+
SDValue SinVal =
2514+
DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, CallResult.first,
2515+
DAG.getVectorIdxConstant(0, dl));
2516+
SDValue CosVal =
2517+
DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, CallResult.first,
2518+
DAG.getVectorIdxConstant(1, dl));
2519+
SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
2520+
return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
2521+
}
2522+
24262523
SDValue SelectionDAGLegalize::expandLdexp(SDNode *Node) const {
24272524
SDLoc dl(Node);
24282525
EVT VT = Node->getValueType(0);
@@ -4730,6 +4827,18 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) {
47304827
case ISD::FSINCOS:
47314828
case ISD::FSINCOSPI: {
47324829
EVT VT = Node->getValueType(0);
4830+
4831+
if (Node->getOpcode() == ISD::FSINCOS) {
4832+
RTLIB::Libcall SincosStret = RTLIB::getSINCOS_STRET(VT);
4833+
if (SincosStret != RTLIB::UNKNOWN_LIBCALL) {
4834+
if (SDValue Expanded = ExpandSincosStretLibCall(Node)) {
4835+
Results.push_back(Expanded);
4836+
Results.push_back(Expanded.getValue(1));
4837+
break;
4838+
}
4839+
}
4840+
}
4841+
47334842
RTLIB::Libcall LC = Node->getOpcode() == ISD::FSINCOS
47344843
? RTLIB::getSINCOS(VT)
47354844
: RTLIB::getSINCOSPI(VT);

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 3 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -1052,15 +1052,9 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
10521052
// Lower READCYCLECOUNTER using an mrs from CNTVCT_EL0.
10531053
setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
10541054

1055-
if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
1056-
getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
1057-
// Issue __sincos_stret if available.
1058-
setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
1059-
setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
1060-
} else {
1061-
setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
1062-
setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
1063-
}
1055+
// Expand FSINCOS; the legalizer issues __sincos_stret if available.
1056+
setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
1057+
setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
10641058

10651059
// Make floating-point constants legal for the large code model, so they don't
10661060
// become loads from the constant pool.
@@ -5346,35 +5340,6 @@ SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op,
53465340
return SDValue();
53475341
}
53485342

5349-
SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op,
5350-
SelectionDAG &DAG) const {
5351-
// For iOS, we want to call an alternative entry point: __sincos_stret,
5352-
// which returns the values in two S / D registers.
5353-
SDLoc DL(Op);
5354-
SDValue Arg = Op.getOperand(0);
5355-
EVT ArgVT = Arg.getValueType();
5356-
Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
5357-
5358-
ArgListTy Args;
5359-
Args.emplace_back(Arg, ArgTy);
5360-
5361-
RTLIB::Libcall LC = ArgVT == MVT::f64 ? RTLIB::SINCOS_STRET_F64
5362-
: RTLIB::SINCOS_STRET_F32;
5363-
const char *LibcallName = getLibcallName(LC);
5364-
SDValue Callee =
5365-
DAG.getExternalSymbol(LibcallName, getPointerTy(DAG.getDataLayout()));
5366-
5367-
StructType *RetTy = StructType::get(ArgTy, ArgTy);
5368-
TargetLowering::CallLoweringInfo CLI(DAG);
5369-
CallingConv::ID CC = getLibcallCallingConv(LC);
5370-
CLI.setDebugLoc(DL)
5371-
.setChain(DAG.getEntryNode())
5372-
.setLibCallee(CC, RetTy, Callee, std::move(Args));
5373-
5374-
std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
5375-
return CallResult.first;
5376-
}
5377-
53785343
static MVT getSVEContainerType(EVT ContentTy);
53795344

53805345
SDValue
@@ -7723,8 +7688,6 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
77237688
case ISD::FP_TO_SINT_SAT:
77247689
case ISD::FP_TO_UINT_SAT:
77257690
return LowerFP_TO_INT_SAT(Op, DAG);
7726-
case ISD::FSINCOS:
7727-
return LowerFSINCOS(Op, DAG);
77287691
case ISD::GET_ROUNDING:
77297692
return LowerGET_ROUNDING(Op, DAG);
77307693
case ISD::SET_ROUNDING:

llvm/lib/Target/AArch64/AArch64ISelLowering.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -745,7 +745,6 @@ class AArch64TargetLowering : public TargetLowering {
745745
SDValue LowerVectorOR(SDValue Op, SelectionDAG &DAG) const;
746746
SDValue LowerXOR(SDValue Op, SelectionDAG &DAG) const;
747747
SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const;
748-
SDValue LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const;
749748
SDValue LowerLOOP_DEPENDENCE_MASK(SDValue Op, SelectionDAG &DAG) const;
750749
SDValue LowerBITCAST(SDValue Op, SelectionDAG &DAG) const;
751750
SDValue LowerVSCALE(SDValue Op, SelectionDAG &DAG) const;

llvm/lib/Target/ARM/ARMISelLowering.cpp

Lines changed: 4 additions & 74 deletions
Original file line numberDiff line numberDiff line change
@@ -1312,8 +1312,8 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_,
13121312
setOperationAction(ISD::STRICT_FSETCCS, MVT::f64, Custom);
13131313
}
13141314

1315-
setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
1316-
setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
1315+
setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
1316+
setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
13171317

13181318
// FP-ARMv8 implements a lot of rounding-like FP operations.
13191319
if (Subtarget->hasFPARMv8Base()) {
@@ -9855,76 +9855,6 @@ static SDValue LowerUADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG) {
98559855
return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Result, Carry);
98569856
}
98579857

9858-
SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const {
9859-
// For iOS, we want to call an alternative entry point: __sincos_stret,
9860-
// return values are passed via sret.
9861-
SDLoc dl(Op);
9862-
SDValue Arg = Op.getOperand(0);
9863-
EVT ArgVT = Arg.getValueType();
9864-
RTLIB::Libcall LC = RTLIB::getSINCOS_STRET(ArgVT);
9865-
RTLIB::LibcallImpl SincosStret = getLibcallImpl(LC);
9866-
if (SincosStret == RTLIB::Unsupported)
9867-
return SDValue();
9868-
9869-
assert(Subtarget->isTargetDarwin());
9870-
9871-
Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
9872-
auto PtrVT = getPointerTy(DAG.getDataLayout());
9873-
9874-
MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
9875-
9876-
// Pair of floats / doubles used to pass the result.
9877-
Type *RetTy = StructType::get(ArgTy, ArgTy);
9878-
auto &DL = DAG.getDataLayout();
9879-
9880-
ArgListTy Args;
9881-
bool ShouldUseSRet = getTM().isAPCS_ABI();
9882-
SDValue SRet;
9883-
if (ShouldUseSRet) {
9884-
// Create stack object for sret.
9885-
const uint64_t ByteSize = DL.getTypeAllocSize(RetTy);
9886-
const Align StackAlign = DL.getPrefTypeAlign(RetTy);
9887-
int FrameIdx = MFI.CreateStackObject(ByteSize, StackAlign, false);
9888-
SRet = DAG.getFrameIndex(FrameIdx, getPointerTy(DL));
9889-
9890-
ArgListEntry Entry(SRet, PointerType::getUnqual(RetTy->getContext()));
9891-
Entry.IsSExt = false;
9892-
Entry.IsZExt = false;
9893-
Entry.IsSRet = true;
9894-
Args.push_back(Entry);
9895-
RetTy = Type::getVoidTy(*DAG.getContext());
9896-
}
9897-
9898-
Args.emplace_back(Arg, ArgTy);
9899-
9900-
StringRef LibcallName = getLibcallImplName(SincosStret);
9901-
CallingConv::ID CC = getLibcallImplCallingConv(SincosStret);
9902-
SDValue Callee = DAG.getExternalSymbol(LibcallName.data(), getPointerTy(DL));
9903-
9904-
TargetLowering::CallLoweringInfo CLI(DAG);
9905-
CLI.setDebugLoc(dl)
9906-
.setChain(DAG.getEntryNode())
9907-
.setCallee(CC, RetTy, Callee, std::move(Args))
9908-
.setDiscardResult(ShouldUseSRet);
9909-
std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
9910-
9911-
if (!ShouldUseSRet)
9912-
return CallResult.first;
9913-
9914-
SDValue LoadSin =
9915-
DAG.getLoad(ArgVT, dl, CallResult.second, SRet, MachinePointerInfo());
9916-
9917-
// Address of cos field.
9918-
SDValue Add = DAG.getNode(ISD::ADD, dl, PtrVT, SRet,
9919-
DAG.getIntPtrConstant(ArgVT.getStoreSize(), dl));
9920-
SDValue LoadCos =
9921-
DAG.getLoad(ArgVT, dl, LoadSin.getValue(1), Add, MachinePointerInfo());
9922-
9923-
SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
9924-
return DAG.getNode(ISD::MERGE_VALUES, dl, Tys,
9925-
LoadSin.getValue(0), LoadCos.getValue(0));
9926-
}
9927-
99289858
SDValue ARMTargetLowering::LowerWindowsDIVLibCall(SDValue Op, SelectionDAG &DAG,
99299859
bool Signed,
99309860
SDValue &Chain) const {
@@ -10726,8 +10656,8 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
1072610656
case ISD::VECREDUCE_SMAX:
1072710657
return LowerVecReduceMinMax(Op, DAG, Subtarget);
1072810658
case ISD::ATOMIC_LOAD:
10729-
case ISD::ATOMIC_STORE: return LowerAtomicLoadStore(Op, DAG);
10730-
case ISD::FSINCOS: return LowerFSINCOS(Op, DAG);
10659+
case ISD::ATOMIC_STORE:
10660+
return LowerAtomicLoadStore(Op, DAG);
1073110661
case ISD::SDIVREM:
1073210662
case ISD::UDIVREM: return LowerDivRem(Op, DAG);
1073310663
case ISD::DYNAMIC_STACKALLOC:

llvm/lib/Target/ARM/ARMISelLowering.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -901,7 +901,6 @@ class VectorType;
901901
SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
902902
const ARMSubtarget *ST) const;
903903
SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
904-
SDValue LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const;
905904
SDValue LowerDivRem(SDValue Op, SelectionDAG &DAG) const;
906905
SDValue LowerDIV_Windows(SDValue Op, SelectionDAG &DAG, bool Signed) const;
907906
void ExpandDIV_Windows(SDValue Op, SelectionDAG &DAG, bool Signed,

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 2 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -2572,8 +2572,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
25722572
}
25732573

25742574
// Combine sin / cos into _sincos_stret if it is available.
2575-
setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
2576-
setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
2575+
setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
2576+
setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
25772577

25782578
if (Subtarget.isTargetWin64()) {
25792579
setOperationAction(ISD::SDIV, MVT::i128, Custom);
@@ -33004,61 +33004,6 @@ static SDValue LowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG) {
3300433004
return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
3300533005
}
3300633006

33007-
static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
33008-
SelectionDAG &DAG) {
33009-
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
33010-
SDValue Arg = Op.getOperand(0);
33011-
EVT ArgVT = Arg.getValueType();
33012-
bool isF64 = ArgVT == MVT::f64;
33013-
33014-
RTLIB::Libcall LC = isF64 ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32;
33015-
const char *LibcallName = TLI.getLibcallName(LC);
33016-
if (!LibcallName)
33017-
return SDValue();
33018-
33019-
assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit());
33020-
33021-
// For MacOSX, we want to call an alternative entry point: __sincos_stret,
33022-
// which returns the values as { float, float } (in XMM0) or
33023-
// { double, double } (which is returned in XMM0, XMM1).
33024-
SDLoc dl(Op);
33025-
Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
33026-
33027-
TargetLowering::ArgListTy Args;
33028-
Args.emplace_back(Arg, ArgTy);
33029-
33030-
// Only optimize x86_64 for now. i386 is a bit messy. For f32,
33031-
// the small struct {f32, f32} is returned in (eax, edx). For f64,
33032-
// the results are returned via SRet in memory.
33033-
SDValue Callee =
33034-
DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));
33035-
33036-
Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy)
33037-
: (Type *)FixedVectorType::get(ArgTy, 2);
33038-
33039-
TargetLowering::CallLoweringInfo CLI(DAG);
33040-
CLI.setDebugLoc(dl)
33041-
.setChain(DAG.getEntryNode())
33042-
.setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args))
33043-
.setIsPostTypeLegalization();
33044-
33045-
std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
33046-
33047-
if (isF64)
33048-
// Returned in xmm0 and xmm1.
33049-
return CallResult.first;
33050-
33051-
// Returned in bits 0:31 and 32:64 xmm0.
33052-
SDValue SinVal =
33053-
DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, CallResult.first,
33054-
DAG.getVectorIdxConstant(0, dl));
33055-
SDValue CosVal =
33056-
DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, CallResult.first,
33057-
DAG.getVectorIdxConstant(1, dl));
33058-
SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
33059-
return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
33060-
}
33061-
3306233007
/// Widen a vector input to a vector of NVT. The
3306333008
/// input vector must have the same element type as NVT.
3306433009
static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,
@@ -33663,7 +33608,6 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
3366333608
case ISD::ABDS:
3366433609
case ISD::ABDU: return LowerABD(Op, Subtarget, DAG);
3366533610
case ISD::AVGCEILU: return LowerAVG(Op, Subtarget, DAG);
33666-
case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG);
3366733611
case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG);
3366833612
case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG);
3366933613
case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG);

0 commit comments

Comments
 (0)