diff --git a/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td index c5d176596d8c6..616640152c8d3 100644 --- a/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td @@ -319,3 +319,19 @@ def : Pat<(bitconvert FPR64:$src), (MOVFR2GR_D FPR64:$src)>; let Predicates = [HasBasicD, IsLA64] in { def : PatFpr; } // Predicates = [HasBasicD, IsLA64] + +/// Pseudo-instructions needed for the soft-float ABI with LA32D + +let Predicates = [HasBasicD, IsLA32] in { +// Moves two GPRs to an FPR. +let usesCustomInserter = 1 in +def BuildPairF64Pseudo + : Pseudo<(outs FPR64:$dst), (ins GPR:$src1, GPR:$src2), + [(set FPR64:$dst, (loongarch_build_pair_f64 GPR:$src1, GPR:$src2))]>; + +// Moves an FPR to two GPRs. +let usesCustomInserter = 1 in +def SplitPairF64Pseudo + : Pseudo<(outs GPR:$dst1, GPR:$dst2), (ins FPR64:$src), + [(set GPR:$dst1, GPR:$dst2, (loongarch_split_pair_f64 FPR64:$src))]>; +} // Predicates = [HasBasicD, IsLA32] diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp index 99dae6ec3eb08..5e5f7d62ef509 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp @@ -169,6 +169,8 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, setOperationAction(ISD::INTRINSIC_VOID, MVT::i64, Custom); setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom); setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom); + if (Subtarget.hasBasicD()) + setOperationAction(ISD::BITCAST, MVT::i64, Custom); } setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom); @@ -2713,13 +2715,20 @@ SDValue LoongArchTargetLowering::lowerBITCAST(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); + EVT VT = Op.getValueType(); SDValue Op0 = Op.getOperand(0); + EVT Op0VT = Op0.getValueType(); - if (Op.getValueType() == MVT::f32 && Op0.getValueType() == MVT::i32 && + if (Op.getValueType() == MVT::f32 && Op0VT == MVT::i32 && Subtarget.is64Bit() && Subtarget.hasBasicF()) { SDValue NewOp0 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Op0); return DAG.getNode(LoongArchISD::MOVGR2FR_W_LA64, DL, MVT::f32, NewOp0); } + if (VT == MVT::f64 && Op0VT == MVT::i64 && !Subtarget.is64Bit()) { + SDValue Lo, Hi; + std::tie(Lo, Hi) = DAG.SplitScalar(Op0, DL, MVT::i32, MVT::i32); + return DAG.getNode(LoongArchISD::BUILD_PAIR_F64, DL, MVT::f64, Lo, Hi); + } return Op; } @@ -4006,6 +4015,12 @@ void LoongArchTargetLowering::ReplaceNodeResults( SDValue Dst = DAG.getNode(LoongArchISD::MOVFR2GR_S_LA64, DL, MVT::i64, Src); Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Dst)); + } else if (VT == MVT::i64 && SrcVT == MVT::f64 && !Subtarget.is64Bit()) { + SDValue NewReg = DAG.getNode(LoongArchISD::SPLIT_PAIR_F64, DL, + DAG.getVTList(MVT::i32, MVT::i32), Src); + SDValue RetReg = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, + NewReg.getValue(0), NewReg.getValue(1)); + Results.push_back(RetReg); } break; } @@ -5649,6 +5664,37 @@ static SDValue performVMSKLTZCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); } +static SDValue +performSPLIT_PAIR_F64Combine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const LoongArchSubtarget &Subtarget) { + SDValue Op0 = N->getOperand(0); + SDLoc DL(N); + + // If the input to SplitPairF64 is just BuildPairF64 then the operation is + // redundant. Instead, use BuildPairF64's operands directly. 
+  if (Op0->getOpcode() == LoongArchISD::BUILD_PAIR_F64)
+    return DCI.CombineTo(N, Op0.getOperand(0), Op0.getOperand(1));
+
+  if (Op0->isUndef()) {
+    SDValue Lo = DAG.getUNDEF(MVT::i32);
+    SDValue Hi = DAG.getUNDEF(MVT::i32);
+    return DCI.CombineTo(N, Lo, Hi);
+  }
+
+  // It's cheaper to materialise two 32-bit integers than to load a double
+  // from the constant pool and transfer it to integer registers through the
+  // stack.
+  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op0)) {
+    APInt V = C->getValueAPF().bitcastToAPInt();
+    SDValue Lo = DAG.getConstant(V.trunc(32), DL, MVT::i32);
+    SDValue Hi = DAG.getConstant(V.lshr(32).trunc(32), DL, MVT::i32);
+    return DCI.CombineTo(N, Lo, Hi);
+  }
+
+  return SDValue();
+}
+
 SDValue LoongArchTargetLowering::PerformDAGCombine(SDNode *N,
                                                    DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
@@ -5676,6 +5722,8 @@ SDValue LoongArchTargetLowering::PerformDAGCombine(SDNode *N,
   case LoongArchISD::VMSKLTZ:
   case LoongArchISD::XVMSKLTZ:
     return performVMSKLTZCombine(N, DAG, DCI, Subtarget);
+  case LoongArchISD::SPLIT_PAIR_F64:
+    return performSPLIT_PAIR_F64Combine(N, DAG, DCI, Subtarget);
   }
   return SDValue();
 }
@@ -6072,6 +6120,50 @@ emitPseudoVMSKCOND(MachineInstr &MI, MachineBasicBlock *BB,
   return BB;
 }
 
+static MachineBasicBlock *
+emitSplitPairF64Pseudo(MachineInstr &MI, MachineBasicBlock *BB,
+                       const LoongArchSubtarget &Subtarget) {
+  assert(MI.getOpcode() == LoongArch::SplitPairF64Pseudo &&
+         "Unexpected instruction");
+
+  MachineFunction &MF = *BB->getParent();
+  DebugLoc DL = MI.getDebugLoc();
+  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
+  Register LoReg = MI.getOperand(0).getReg();
+  Register HiReg = MI.getOperand(1).getReg();
+  Register SrcReg = MI.getOperand(2).getReg();
+
+  BuildMI(*BB, MI, DL, TII.get(LoongArch::MOVFR2GR_S_64), LoReg).addReg(SrcReg);
+  BuildMI(*BB, MI, DL, TII.get(LoongArch::MOVFRH2GR_S), HiReg)
+      .addReg(SrcReg, getKillRegState(MI.getOperand(2).isKill()));
+  MI.eraseFromParent(); // The pseudo instruction is gone now.
+  return BB;
+}
+
+static MachineBasicBlock *
+emitBuildPairF64Pseudo(MachineInstr &MI, MachineBasicBlock *BB,
+                       const LoongArchSubtarget &Subtarget) {
+  assert(MI.getOpcode() == LoongArch::BuildPairF64Pseudo &&
+         "Unexpected instruction");
+
+  MachineFunction &MF = *BB->getParent();
+  DebugLoc DL = MI.getDebugLoc();
+  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
+  MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+  Register TmpReg = MRI.createVirtualRegister(&LoongArch::FPR64RegClass);
+  Register DstReg = MI.getOperand(0).getReg();
+  Register LoReg = MI.getOperand(1).getReg();
+  Register HiReg = MI.getOperand(2).getReg();
+
+  BuildMI(*BB, MI, DL, TII.get(LoongArch::MOVGR2FR_W_64), TmpReg)
+      .addReg(LoReg, getKillRegState(MI.getOperand(1).isKill()));
+  BuildMI(*BB, MI, DL, TII.get(LoongArch::MOVGR2FRH_W), DstReg)
+      .addReg(TmpReg, RegState::Kill)
+      .addReg(HiReg, getKillRegState(MI.getOperand(2).isKill()));
+  MI.eraseFromParent(); // The pseudo instruction is gone now.
+ return BB; +} + static bool isSelectPseudo(MachineInstr &MI) { switch (MI.getOpcode()) { default: @@ -6252,6 +6344,10 @@ MachineBasicBlock *LoongArchTargetLowering::EmitInstrWithCustomInserter( } case LoongArch::Select_GPR_Using_CC_GPR: return emitSelectPseudo(MI, BB, Subtarget); + case LoongArch::BuildPairF64Pseudo: + return emitBuildPairF64Pseudo(MI, BB, Subtarget); + case LoongArch::SplitPairF64Pseudo: + return emitSplitPairF64Pseudo(MI, BB, Subtarget); case LoongArch::PseudoVBZ: case LoongArch::PseudoVBZ_B: case LoongArch::PseudoVBZ_H: @@ -6348,6 +6444,8 @@ const char *LoongArchTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(MOVGR2FR_W_LA64) NODE_NAME_CASE(MOVFR2GR_S_LA64) NODE_NAME_CASE(FTINT) + NODE_NAME_CASE(BUILD_PAIR_F64) + NODE_NAME_CASE(SPLIT_PAIR_F64) NODE_NAME_CASE(REVB_2H) NODE_NAME_CASE(REVB_2W) NODE_NAME_CASE(BITREV_4B) @@ -6527,21 +6625,6 @@ static bool CC_LoongArch(const DataLayout &DL, LoongArchABI::ABI ABI, break; } - // FPR32 and FPR64 alias each other. - if (State.getFirstUnallocated(ArgFPR32s) == std::size(ArgFPR32s)) - UseGPRForFloat = true; - - if (UseGPRForFloat && ValVT == MVT::f32) { - LocVT = GRLenVT; - LocInfo = CCValAssign::BCvt; - } else if (UseGPRForFloat && GRLen == 64 && ValVT == MVT::f64) { - LocVT = MVT::i64; - LocInfo = CCValAssign::BCvt; - } else if (UseGPRForFloat && GRLen == 32 && ValVT == MVT::f64) { - // TODO: Handle passing f64 on LA32 with D feature. - report_fatal_error("Passing f64 with GPR on LA32 is undefined"); - } - // If this is a variadic argument, the LoongArch calling convention requires // that it is assigned an 'even' or 'aligned' register if it has (2*GRLen)/8 // byte alignment. An aligned register should be used regardless of whether @@ -6564,6 +6647,45 @@ static bool CC_LoongArch(const DataLayout &DL, LoongArchABI::ABI ABI, assert(PendingLocs.size() == PendingArgFlags.size() && "PendingLocs and PendingArgFlags out of sync"); + // FPR32 and FPR64 alias each other. + if (State.getFirstUnallocated(ArgFPR32s) == std::size(ArgFPR32s)) + UseGPRForFloat = true; + + if (UseGPRForFloat && ValVT == MVT::f32) { + LocVT = GRLenVT; + LocInfo = CCValAssign::BCvt; + } else if (UseGPRForFloat && GRLen == 64 && ValVT == MVT::f64) { + LocVT = MVT::i64; + LocInfo = CCValAssign::BCvt; + } else if (UseGPRForFloat && GRLen == 32 && ValVT == MVT::f64) { + // Handle passing f64 on LA32D with a soft float ABI or when floating point + // registers are exhausted. + assert(PendingLocs.empty() && "Can't lower f64 if it is split"); + // Depending on available argument GPRS, f64 may be passed in a pair of + // GPRs, split between a GPR and the stack, or passed completely on the + // stack. LowerCall/LowerFormalArguments/LowerReturn must recognise these + // cases. + MCRegister Reg = State.AllocateReg(ArgGPRs); + if (!Reg) { + int64_t StackOffset = State.AllocateStack(8, Align(8)); + State.addLoc( + CCValAssign::getMem(ValNo, ValVT, StackOffset, LocVT, LocInfo)); + return false; + } + LocVT = MVT::i32; + State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo)); + MCRegister HiReg = State.AllocateReg(ArgGPRs); + if (HiReg) { + State.addLoc( + CCValAssign::getCustomReg(ValNo, ValVT, HiReg, LocVT, LocInfo)); + } else { + int64_t StackOffset = State.AllocateStack(4, Align(4)); + State.addLoc( + CCValAssign::getCustomMem(ValNo, ValVT, StackOffset, LocVT, LocInfo)); + } + return false; + } + // Split arguments might be passed indirectly, so keep track of the pending // values. 
if (ValVT.isScalarInteger() && (ArgFlags.isSplit() || !PendingLocs.empty())) { @@ -6764,6 +6886,38 @@ static SDValue unpackFromMemLoc(SelectionDAG &DAG, SDValue Chain, MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), ValVT); } +static SDValue unpackF64OnLA32DSoftABI(SelectionDAG &DAG, SDValue Chain, + const CCValAssign &VA, + const CCValAssign &HiVA, + const SDLoc &DL) { + assert(VA.getLocVT() == MVT::i32 && VA.getValVT() == MVT::f64 && + "Unexpected VA"); + MachineFunction &MF = DAG.getMachineFunction(); + MachineFrameInfo &MFI = MF.getFrameInfo(); + MachineRegisterInfo &RegInfo = MF.getRegInfo(); + + assert(VA.isRegLoc() && "Expected register VA assignment"); + + Register LoVReg = RegInfo.createVirtualRegister(&LoongArch::GPRRegClass); + RegInfo.addLiveIn(VA.getLocReg(), LoVReg); + SDValue Lo = DAG.getCopyFromReg(Chain, DL, LoVReg, MVT::i32); + SDValue Hi; + if (HiVA.isMemLoc()) { + // Second half of f64 is passed on the stack. + int FI = MFI.CreateFixedObject(4, HiVA.getLocMemOffset(), + /*IsImmutable=*/true); + SDValue FIN = DAG.getFrameIndex(FI, MVT::i32); + Hi = DAG.getLoad(MVT::i32, DL, Chain, FIN, + MachinePointerInfo::getFixedStack(MF, FI)); + } else { + // Second half of f64 is passed in another GPR. + Register HiVReg = RegInfo.createVirtualRegister(&LoongArch::GPRRegClass); + RegInfo.addLiveIn(HiVA.getLocReg(), HiVReg); + Hi = DAG.getCopyFromReg(Chain, DL, HiVReg, MVT::i32); + } + return DAG.getNode(LoongArchISD::BUILD_PAIR_F64, DL, MVT::f64, Lo, Hi); +} + static SDValue convertValVTToLocVT(SelectionDAG &DAG, SDValue Val, const CCValAssign &VA, const SDLoc &DL) { EVT LocVT = VA.getLocVT(); @@ -6861,11 +7015,16 @@ SDValue LoongArchTargetLowering::LowerFormalArguments( else analyzeInputArgs(MF, CCInfo, Ins, /*IsRet=*/false, CC_LoongArch); - for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { + for (unsigned i = 0, e = ArgLocs.size(), InsIdx = 0; i != e; ++i, ++InsIdx) { CCValAssign &VA = ArgLocs[i]; SDValue ArgValue; - if (VA.isRegLoc()) - ArgValue = unpackFromRegLoc(DAG, Chain, VA, DL, Ins[i], *this); + // Passing f64 on LA32D with a soft float ABI must be handled as a special + // case. + if (VA.getLocVT() == MVT::i32 && VA.getValVT() == MVT::f64) { + assert(VA.needsCustom()); + ArgValue = unpackF64OnLA32DSoftABI(DAG, Chain, VA, ArgLocs[++i], DL); + } else if (VA.isRegLoc()) + ArgValue = unpackFromRegLoc(DAG, Chain, VA, DL, Ins[InsIdx], *this); else ArgValue = unpackFromMemLoc(DAG, Chain, VA, DL); if (VA.getLocInfo() == CCValAssign::Indirect) { @@ -6873,17 +7032,18 @@ SDValue LoongArchTargetLowering::LowerFormalArguments( // load all parts of it here (using the same address). 
InVals.push_back(DAG.getLoad(VA.getValVT(), DL, Chain, ArgValue, MachinePointerInfo())); - unsigned ArgIndex = Ins[i].OrigArgIndex; - unsigned ArgPartOffset = Ins[i].PartOffset; + unsigned ArgIndex = Ins[InsIdx].OrigArgIndex; + unsigned ArgPartOffset = Ins[InsIdx].PartOffset; assert(ArgPartOffset == 0); - while (i + 1 != e && Ins[i + 1].OrigArgIndex == ArgIndex) { + while (i + 1 != e && Ins[InsIdx + 1].OrigArgIndex == ArgIndex) { CCValAssign &PartVA = ArgLocs[i + 1]; - unsigned PartOffset = Ins[i + 1].PartOffset - ArgPartOffset; + unsigned PartOffset = Ins[InsIdx + 1].PartOffset - ArgPartOffset; SDValue Offset = DAG.getIntPtrConstant(PartOffset, DL); SDValue Address = DAG.getNode(ISD::ADD, DL, PtrVT, ArgValue, Offset); InVals.push_back(DAG.getLoad(PartVA.getValVT(), DL, Chain, Address, MachinePointerInfo())); ++i; + ++InsIdx; } continue; } @@ -7112,31 +7272,67 @@ LoongArchTargetLowering::LowerCall(CallLoweringInfo &CLI, SmallVector> RegsToPass; SmallVector MemOpChains; SDValue StackPtr; - for (unsigned i = 0, j = 0, e = ArgLocs.size(); i != e; ++i) { + for (unsigned i = 0, j = 0, e = ArgLocs.size(), OutIdx = 0; i != e; + ++i, ++OutIdx) { CCValAssign &VA = ArgLocs[i]; - SDValue ArgValue = OutVals[i]; - ISD::ArgFlagsTy Flags = Outs[i].Flags; + SDValue ArgValue = OutVals[OutIdx]; + ISD::ArgFlagsTy Flags = Outs[OutIdx].Flags; + + // Handle passing f64 on LA32D with a soft float ABI as a special case. + if (VA.getLocVT() == MVT::i32 && VA.getValVT() == MVT::f64) { + assert(VA.isRegLoc() && "Expected register VA assignment"); + assert(VA.needsCustom()); + SDValue SplitF64 = + DAG.getNode(LoongArchISD::SPLIT_PAIR_F64, DL, + DAG.getVTList(MVT::i32, MVT::i32), ArgValue); + SDValue Lo = SplitF64.getValue(0); + SDValue Hi = SplitF64.getValue(1); + + Register RegLo = VA.getLocReg(); + RegsToPass.push_back(std::make_pair(RegLo, Lo)); + + // Get the CCValAssign for the Hi part. + CCValAssign &HiVA = ArgLocs[++i]; + + if (HiVA.isMemLoc()) { + // Second half of f64 is passed on the stack. + if (!StackPtr.getNode()) + StackPtr = DAG.getCopyFromReg(Chain, DL, LoongArch::R3, PtrVT); + SDValue Address = + DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, + DAG.getIntPtrConstant(HiVA.getLocMemOffset(), DL)); + // Emit the store. + MemOpChains.push_back(DAG.getStore( + Chain, DL, Hi, Address, + MachinePointerInfo::getStack(MF, HiVA.getLocMemOffset()))); + } else { + // Second half of f64 is passed in another GPR. + Register RegHigh = HiVA.getLocReg(); + RegsToPass.push_back(std::make_pair(RegHigh, Hi)); + } + continue; + } // Promote the value if needed. // For now, only handle fully promoted and indirect arguments. if (VA.getLocInfo() == CCValAssign::Indirect) { // Store the argument in a stack slot and pass its address. Align StackAlign = - std::max(getPrefTypeAlign(Outs[i].ArgVT, DAG), + std::max(getPrefTypeAlign(Outs[OutIdx].ArgVT, DAG), getPrefTypeAlign(ArgValue.getValueType(), DAG)); TypeSize StoredSize = ArgValue.getValueType().getStoreSize(); // If the original argument was split and passed by reference, we need to // store the required parts of it here (and pass just one address). - unsigned ArgIndex = Outs[i].OrigArgIndex; - unsigned ArgPartOffset = Outs[i].PartOffset; + unsigned ArgIndex = Outs[OutIdx].OrigArgIndex; + unsigned ArgPartOffset = Outs[OutIdx].PartOffset; assert(ArgPartOffset == 0); // Calculate the total size to store. We don't have access to what we're // actually storing other than performing the loop and collecting the // info. 
SmallVector> Parts; - while (i + 1 != e && Outs[i + 1].OrigArgIndex == ArgIndex) { - SDValue PartValue = OutVals[i + 1]; - unsigned PartOffset = Outs[i + 1].PartOffset - ArgPartOffset; + while (i + 1 != e && Outs[OutIdx + 1].OrigArgIndex == ArgIndex) { + SDValue PartValue = OutVals[OutIdx + 1]; + unsigned PartOffset = Outs[OutIdx + 1].PartOffset - ArgPartOffset; SDValue Offset = DAG.getIntPtrConstant(PartOffset, DL); EVT PartVT = PartValue.getValueType(); @@ -7144,6 +7340,7 @@ LoongArchTargetLowering::LowerCall(CallLoweringInfo &CLI, StackAlign = std::max(StackAlign, getPrefTypeAlign(PartVT, DAG)); Parts.push_back(std::make_pair(PartValue, Offset)); ++i; + ++OutIdx; } SDValue SpillSlot = DAG.CreateStackTemporary(StoredSize, StackAlign); int FI = cast(SpillSlot)->getIndex(); @@ -7279,7 +7476,8 @@ LoongArchTargetLowering::LowerCall(CallLoweringInfo &CLI, analyzeInputArgs(MF, RetCCInfo, Ins, /*IsRet=*/true, CC_LoongArch); // Copy all of the result registers out of their specified physreg. - for (auto &VA : RVLocs) { + for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) { + auto &VA = RVLocs[i]; // Copy the value out. SDValue RetValue = DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), Glue); @@ -7287,7 +7485,16 @@ LoongArchTargetLowering::LowerCall(CallLoweringInfo &CLI, Chain = RetValue.getValue(1); Glue = RetValue.getValue(2); - RetValue = convertLocVTToValVT(DAG, RetValue, VA, DL); + if (VA.getLocVT() == MVT::i32 && VA.getValVT() == MVT::f64) { + assert(VA.needsCustom()); + SDValue RetValue2 = DAG.getCopyFromReg(Chain, DL, RVLocs[++i].getLocReg(), + MVT::i32, Glue); + Chain = RetValue2.getValue(1); + Glue = RetValue2.getValue(2); + RetValue = DAG.getNode(LoongArchISD::BUILD_PAIR_F64, DL, MVT::f64, + RetValue, RetValue2); + } else + RetValue = convertLocVTToValVT(DAG, RetValue, VA, DL); InVals.push_back(RetValue); } @@ -7333,17 +7540,37 @@ SDValue LoongArchTargetLowering::LowerReturn( SmallVector RetOps(1, Chain); // Copy the result values into the output registers. - for (unsigned i = 0, e = RVLocs.size(); i < e; ++i) { + for (unsigned i = 0, e = RVLocs.size(), OutIdx = 0; i < e; ++i, ++OutIdx) { + SDValue Val = OutVals[OutIdx]; CCValAssign &VA = RVLocs[i]; assert(VA.isRegLoc() && "Can only return in registers!"); - // Handle a 'normal' return. - SDValue Val = convertValVTToLocVT(DAG, OutVals[i], VA, DL); - Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Val, Glue); + if (VA.getLocVT() == MVT::i32 && VA.getValVT() == MVT::f64) { + // Handle returning f64 on LA32D with a soft float ABI. + assert(VA.isRegLoc() && "Expected return via registers"); + assert(VA.needsCustom()); + SDValue SplitF64 = DAG.getNode(LoongArchISD::SPLIT_PAIR_F64, DL, + DAG.getVTList(MVT::i32, MVT::i32), Val); + SDValue Lo = SplitF64.getValue(0); + SDValue Hi = SplitF64.getValue(1); + Register RegLo = VA.getLocReg(); + Register RegHi = RVLocs[++i].getLocReg(); + + Chain = DAG.getCopyToReg(Chain, DL, RegLo, Lo, Glue); + Glue = Chain.getValue(1); + RetOps.push_back(DAG.getRegister(RegLo, MVT::i32)); + Chain = DAG.getCopyToReg(Chain, DL, RegHi, Hi, Glue); + Glue = Chain.getValue(1); + RetOps.push_back(DAG.getRegister(RegHi, MVT::i32)); + } else { + // Handle a 'normal' return. + Val = convertValVTToLocVT(DAG, Val, VA, DL); + Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Val, Glue); - // Guarantee that all emitted copies are stuck together. - Glue = Chain.getValue(1); - RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); + // Guarantee that all emitted copies are stuck together. 
+      Glue = Chain.getValue(1);
+      RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
+    }
   }
 
   RetOps[0] = Chain; // Update chain.
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
index 79aa89726191b..60dc2b385a75c 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
@@ -60,6 +60,10 @@ enum NodeType : unsigned {
 
   FTINT,
 
+  // Build and split F64 pair
+  BUILD_PAIR_F64,
+  SPLIT_PAIR_F64,
+
   // Bit counting operations
   CLZ_W,
   CTZ_W,
diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td
index b6552ed33f5b1..2b94e65cac0e5 100644
--- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td
@@ -58,6 +58,13 @@ def SDT_LoongArchMovgr2fcsr : SDTypeProfile<0, 2, [SDTCisVT<0, GRLenVT>,
 def SDT_LoongArchMovfcsr2gr : SDTypeProfile<1, 1, [SDTCisVT<0, GRLenVT>,
                                                    SDTCisSameAs<0, 1>]>;
 
+def SDT_LoongArchBuildPairF64 : SDTypeProfile<1, 2, [SDTCisVT<0, f64>,
+                                                     SDTCisVT<1, i32>,
+                                                     SDTCisSameAs<1, 2>]>;
+def SDT_LoongArchSplitPairF64 : SDTypeProfile<2, 1, [SDTCisVT<0, i32>,
+                                                     SDTCisVT<1, i32>,
+                                                     SDTCisVT<2, f64>]>;
+
 // TODO: Add LoongArch specific DAG Nodes
 // Target-independent nodes, but with target-specific formats.
 def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_CallSeqStart,
@@ -165,6 +172,11 @@ def loongarch_iocsrwr_d : SDNode<"LoongArchISD::IOCSRWR_D",
 
 def loongarch_cpucfg : SDNode<"LoongArchISD::CPUCFG", SDTUnaryOp,
                               [SDNPHasChain]>;
+def loongarch_build_pair_f64 : SDNode<"LoongArchISD::BUILD_PAIR_F64",
+                                      SDT_LoongArchBuildPairF64>;
+def loongarch_split_pair_f64 : SDNode<"LoongArchISD::SPLIT_PAIR_F64",
+                                      SDT_LoongArchSplitPairF64>;
+
 def to_fclass_mask: SDNodeXForm<timm, [{
   uint64_t Check = N->getZExtValue();
   unsigned Mask = 0;
diff --git a/llvm/test/CodeGen/LoongArch/calling-conv-ilp32d.ll b/llvm/test/CodeGen/LoongArch/calling-conv-ilp32d.ll
new file mode 100644
index 0000000000000..62c2cc999456c
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/calling-conv-ilp32d.ll
@@ -0,0 +1,193 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc --mtriple=loongarch32 --mattr=+d --target-abi=ilp32d < %s \
+; RUN:   | FileCheck %s
+
+;; This file contains specific tests for the ilp32d ABI.
+
+;; Check passing floating-point arguments with FPRs.
+ +define i32 @callee_float_in_fpr(i32 %a, float %b, double %c) nounwind { +; CHECK-LABEL: callee_float_in_fpr: +; CHECK: # %bb.0: +; CHECK-NEXT: ftintrz.w.s $fa0, $fa0 +; CHECK-NEXT: movfr2gr.s $a1, $fa0 +; CHECK-NEXT: ftintrz.w.d $fa0, $fa1 +; CHECK-NEXT: movfr2gr.s $a2, $fa0 +; CHECK-NEXT: add.w $a0, $a0, $a1 +; CHECK-NEXT: add.w $a0, $a0, $a2 +; CHECK-NEXT: ret + %b_fptosi = fptosi float %b to i32 + %c_fptosi = fptosi double %c to i32 + %1 = add i32 %a, %b_fptosi + %2 = add i32 %1, %c_fptosi + ret i32 %2 +} + +define i32 @caller_float_in_fpr() nounwind { +; CHECK-LABEL: caller_float_in_fpr: +; CHECK: # %bb.0: +; CHECK-NEXT: addi.w $sp, $sp, -16 +; CHECK-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; CHECK-NEXT: movgr2fr.w $fa1, $zero +; CHECK-NEXT: movgr2frh.w $fa1, $zero +; CHECK-NEXT: movgr2fr.w $fa0, $zero +; CHECK-NEXT: ori $a0, $zero, 1 +; CHECK-NEXT: bl callee_float_in_fpr +; CHECK-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; CHECK-NEXT: addi.w $sp, $sp, 16 +; CHECK-NEXT: ret + %1 = call i32 @callee_float_in_fpr(i32 1, float 0.0, double 0.0) + ret i32 %1 +} + +;; Check that the GPR is used once the FPRs are exhausted. + +;; Must keep define on a single line due to an update_llc_test_checks.py limitation. +define i32 @callee_double_in_gpr_exhausted_fprs(double %a, double %b, double %c, double %d, double %e, double %f, double %g, double %h, double %i) nounwind { +; CHECK-LABEL: callee_double_in_gpr_exhausted_fprs: +; CHECK: # %bb.0: +; CHECK-NEXT: movgr2fr.w $fa0, $a0 +; CHECK-NEXT: movgr2frh.w $fa0, $a1 +; CHECK-NEXT: ftintrz.w.d $fa1, $fa7 +; CHECK-NEXT: movfr2gr.s $a0, $fa1 +; CHECK-NEXT: ftintrz.w.d $fa0, $fa0 +; CHECK-NEXT: movfr2gr.s $a1, $fa0 +; CHECK-NEXT: add.w $a0, $a0, $a1 +; CHECK-NEXT: ret + %h_fptosi = fptosi double %h to i32 + %i_fptosi = fptosi double %i to i32 + %1 = add i32 %h_fptosi, %i_fptosi + ret i32 %1 +} + +define i32 @caller_double_in_gpr_exhausted_fprs() nounwind { +; CHECK-LABEL: caller_double_in_gpr_exhausted_fprs: +; CHECK: # %bb.0: +; CHECK-NEXT: addi.w $sp, $sp, -16 +; CHECK-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI3_0) +; CHECK-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI3_0) +; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI3_1) +; CHECK-NEXT: fld.d $fa2, $a0, %pc_lo12(.LCPI3_1) +; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI3_2) +; CHECK-NEXT: fld.d $fa3, $a0, %pc_lo12(.LCPI3_2) +; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI3_3) +; CHECK-NEXT: fld.d $fa4, $a0, %pc_lo12(.LCPI3_3) +; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI3_4) +; CHECK-NEXT: fld.d $fa5, $a0, %pc_lo12(.LCPI3_4) +; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI3_5) +; CHECK-NEXT: fld.d $fa6, $a0, %pc_lo12(.LCPI3_5) +; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI3_6) +; CHECK-NEXT: fld.d $fa7, $a0, %pc_lo12(.LCPI3_6) +; CHECK-NEXT: addi.w $a0, $zero, 1 +; CHECK-NEXT: movgr2fr.w $fa0, $a0 +; CHECK-NEXT: ffint.s.w $fa0, $fa0 +; CHECK-NEXT: fcvt.d.s $fa0, $fa0 +; CHECK-NEXT: lu12i.w $a1, 262688 +; CHECK-NEXT: move $a0, $zero +; CHECK-NEXT: bl callee_double_in_gpr_exhausted_fprs +; CHECK-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; CHECK-NEXT: addi.w $sp, $sp, 16 +; CHECK-NEXT: ret + %1 = call i32 @callee_double_in_gpr_exhausted_fprs( + double 1.0, double 2.0, double 3.0, double 4.0, double 5.0, double 6.0, + double 7.0, double 8.0, double 9.0) + ret i32 %1 +} + +;; Check that the stack is used once the FPRs and GPRs are both exhausted. + +;; Must keep define on a single line due to an update_llc_test_checks.py limitation. 
+define i32 @callee_double_on_stack_exhausted_fprs_gprs(double %a, double %b, double %c, double %d, double %e, double %f, double %g, double %h, double %i, double %j, double %k, double %l, double %m, double %n) nounwind { +; CHECK-LABEL: callee_double_on_stack_exhausted_fprs_gprs: +; CHECK: # %bb.0: +; CHECK-NEXT: fld.d $fa0, $sp, 0 +; CHECK-NEXT: fld.d $fa1, $sp, 8 +; CHECK-NEXT: ftintrz.w.d $fa0, $fa0 +; CHECK-NEXT: movfr2gr.s $a0, $fa0 +; CHECK-NEXT: ftintrz.w.d $fa0, $fa1 +; CHECK-NEXT: movfr2gr.s $a1, $fa0 +; CHECK-NEXT: add.w $a0, $a0, $a1 +; CHECK-NEXT: ret + %m_fptosi = fptosi double %m to i32 + %n_fptosi = fptosi double %n to i32 + %1 = add i32 %m_fptosi, %n_fptosi + ret i32 %1 +} + +define i32 @caller_double_on_stack_exhausted_fprs_gprs() nounwind { +; CHECK-LABEL: caller_double_on_stack_exhausted_fprs_gprs: +; CHECK: # %bb.0: +; CHECK-NEXT: addi.w $sp, $sp, -32 +; CHECK-NEXT: st.w $ra, $sp, 28 # 4-byte Folded Spill +; CHECK-NEXT: lu12i.w $a0, 262816 +; CHECK-NEXT: st.w $a0, $sp, 4 +; CHECK-NEXT: st.w $zero, $sp, 0 +; CHECK-NEXT: lu12i.w $a0, 262848 +; CHECK-NEXT: st.w $a0, $sp, 12 +; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI5_0) +; CHECK-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI5_0) +; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI5_1) +; CHECK-NEXT: fld.d $fa2, $a0, %pc_lo12(.LCPI5_1) +; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI5_2) +; CHECK-NEXT: fld.d $fa3, $a0, %pc_lo12(.LCPI5_2) +; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI5_3) +; CHECK-NEXT: fld.d $fa4, $a0, %pc_lo12(.LCPI5_3) +; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI5_4) +; CHECK-NEXT: fld.d $fa5, $a0, %pc_lo12(.LCPI5_4) +; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI5_5) +; CHECK-NEXT: fld.d $fa6, $a0, %pc_lo12(.LCPI5_5) +; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI5_6) +; CHECK-NEXT: fld.d $fa7, $a0, %pc_lo12(.LCPI5_6) +; CHECK-NEXT: addi.w $a0, $zero, 1 +; CHECK-NEXT: movgr2fr.w $fa0, $a0 +; CHECK-NEXT: ffint.s.w $fa0, $fa0 +; CHECK-NEXT: fcvt.d.s $fa0, $fa0 +; CHECK-NEXT: lu12i.w $a1, 262688 +; CHECK-NEXT: lu12i.w $a3, 262720 +; CHECK-NEXT: lu12i.w $a5, 262752 +; CHECK-NEXT: lu12i.w $a7, 262784 +; CHECK-NEXT: st.w $zero, $sp, 8 +; CHECK-NEXT: move $a0, $zero +; CHECK-NEXT: move $a2, $zero +; CHECK-NEXT: move $a4, $zero +; CHECK-NEXT: move $a6, $zero +; CHECK-NEXT: bl callee_double_on_stack_exhausted_fprs_gprs +; CHECK-NEXT: ld.w $ra, $sp, 28 # 4-byte Folded Reload +; CHECK-NEXT: addi.w $sp, $sp, 32 +; CHECK-NEXT: ret + %1 = call i32 @callee_double_on_stack_exhausted_fprs_gprs( + double 1.0, double 2.0, double 3.0, double 4.0, double 5.0, double 6.0, + double 7.0, double 8.0, double 9.0, double 10.0, double 11.0, double 12.0, + double 13.0, double 14.0) + ret i32 %1 +} + +;; Check returning doubles. 
+ +define double @callee_double_ret() nounwind { +; CHECK-LABEL: callee_double_ret: +; CHECK: # %bb.0: +; CHECK-NEXT: addi.w $a0, $zero, 1 +; CHECK-NEXT: movgr2fr.w $fa0, $a0 +; CHECK-NEXT: ffint.s.w $fa0, $fa0 +; CHECK-NEXT: fcvt.d.s $fa0, $fa0 +; CHECK-NEXT: ret + ret double 1.0 +} + +define i64 @caller_double_ret() nounwind { +; CHECK-LABEL: caller_double_ret: +; CHECK: # %bb.0: +; CHECK-NEXT: addi.w $sp, $sp, -16 +; CHECK-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; CHECK-NEXT: bl callee_double_ret +; CHECK-NEXT: movfr2gr.s $a0, $fa0 +; CHECK-NEXT: movfrh2gr.s $a1, $fa0 +; CHECK-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; CHECK-NEXT: addi.w $sp, $sp, 16 +; CHECK-NEXT: ret + %1 = call double @callee_double_ret() + %2 = bitcast double %1 to i64 + ret i64 %2 +} diff --git a/llvm/test/CodeGen/LoongArch/inline-asm-constraint-f.ll b/llvm/test/CodeGen/LoongArch/inline-asm-constraint-f.ll index be9ea29b54c33..c1d75ddd32803 100644 --- a/llvm/test/CodeGen/LoongArch/inline-asm-constraint-f.ll +++ b/llvm/test/CodeGen/LoongArch/inline-asm-constraint-f.ll @@ -32,18 +32,14 @@ define double @constraint_f_double(double %a) nounwind { define double @constraint_gpr(double %a) { ; LA32-LABEL: constraint_gpr: ; LA32: # %bb.0: -; LA32-NEXT: addi.w $sp, $sp, -16 -; LA32-NEXT: .cfi_def_cfa_offset 16 -; LA32-NEXT: fst.d $fa0, $sp, 8 -; LA32-NEXT: ld.w $a7, $sp, 8 -; LA32-NEXT: ld.w $t0, $sp, 12 +; LA32-NEXT: .cfi_def_cfa_offset 0 +; LA32-NEXT: movfr2gr.s $a7, $fa0 +; LA32-NEXT: movfrh2gr.s $t0, $fa0 ; LA32-NEXT: #APP ; LA32-NEXT: move $a6, $a7 ; LA32-NEXT: #NO_APP -; LA32-NEXT: st.w $a7, $sp, 4 -; LA32-NEXT: st.w $a6, $sp, 0 -; LA32-NEXT: fld.d $fa0, $sp, 0 -; LA32-NEXT: addi.w $sp, $sp, 16 +; LA32-NEXT: movgr2fr.w $fa0, $a6 +; LA32-NEXT: movgr2frh.w $fa0, $a7 ; LA32-NEXT: ret ; ; LA64-LABEL: constraint_gpr: diff --git a/llvm/test/CodeGen/LoongArch/ir-instruction/double-convert.ll b/llvm/test/CodeGen/LoongArch/ir-instruction/double-convert.ll index 2a51fd97feb62..0b82ea220d7fb 100644 --- a/llvm/test/CodeGen/LoongArch/ir-instruction/double-convert.ll +++ b/llvm/test/CodeGen/LoongArch/ir-instruction/double-convert.ll @@ -279,11 +279,8 @@ define double @convert_u64_to_double(i64 %a) nounwind { define double @bitcast_i64_to_double(i64 %a, i64 %b) nounwind { ; LA32-LABEL: bitcast_i64_to_double: ; LA32: # %bb.0: -; LA32-NEXT: addi.w $sp, $sp, -16 -; LA32-NEXT: st.w $a1, $sp, 12 -; LA32-NEXT: st.w $a0, $sp, 8 -; LA32-NEXT: fld.d $fa0, $sp, 8 -; LA32-NEXT: addi.w $sp, $sp, 16 +; LA32-NEXT: movgr2fr.w $fa0, $a0 +; LA32-NEXT: movgr2frh.w $fa0, $a1 ; LA32-NEXT: ret ; ; LA64-LABEL: bitcast_i64_to_double: @@ -297,11 +294,8 @@ define double @bitcast_i64_to_double(i64 %a, i64 %b) nounwind { define i64 @bitcast_double_to_i64(double %a) nounwind { ; LA32-LABEL: bitcast_double_to_i64: ; LA32: # %bb.0: -; LA32-NEXT: addi.w $sp, $sp, -16 -; LA32-NEXT: fst.d $fa0, $sp, 8 -; LA32-NEXT: ld.w $a0, $sp, 8 -; LA32-NEXT: ld.w $a1, $sp, 12 -; LA32-NEXT: addi.w $sp, $sp, 16 +; LA32-NEXT: movfr2gr.s $a0, $fa0 +; LA32-NEXT: movfrh2gr.s $a1, $fa0 ; LA32-NEXT: ret ; ; LA64-LABEL: bitcast_double_to_i64: diff --git a/llvm/test/CodeGen/LoongArch/ir-instruction/load-store-atomic.ll b/llvm/test/CodeGen/LoongArch/ir-instruction/load-store-atomic.ll index 78cabd37c0ad9..b6507e87f0886 100644 --- a/llvm/test/CodeGen/LoongArch/ir-instruction/load-store-atomic.ll +++ b/llvm/test/CodeGen/LoongArch/ir-instruction/load-store-atomic.ll @@ -115,9 +115,8 @@ define double @load_acquire_double(ptr %ptr) { ; LA32-NEXT: .cfi_offset 1, -4 ; LA32-NEXT: 
ori $a1, $zero, 2 ; LA32-NEXT: bl __atomic_load_8 -; LA32-NEXT: st.w $a1, $sp, 4 -; LA32-NEXT: st.w $a0, $sp, 0 -; LA32-NEXT: fld.d $fa0, $sp, 0 +; LA32-NEXT: movgr2fr.w $fa0, $a0 +; LA32-NEXT: movgr2frh.w $fa0, $a1 ; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload ; LA32-NEXT: addi.w $sp, $sp, 16 ; LA32-NEXT: ret @@ -234,9 +233,8 @@ define double @load_unordered_double(ptr %ptr) { ; LA32-NEXT: .cfi_offset 1, -4 ; LA32-NEXT: move $a1, $zero ; LA32-NEXT: bl __atomic_load_8 -; LA32-NEXT: st.w $a1, $sp, 4 -; LA32-NEXT: st.w $a0, $sp, 0 -; LA32-NEXT: fld.d $fa0, $sp, 0 +; LA32-NEXT: movgr2fr.w $fa0, $a0 +; LA32-NEXT: movgr2frh.w $fa0, $a1 ; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload ; LA32-NEXT: addi.w $sp, $sp, 16 ; LA32-NEXT: ret @@ -352,9 +350,8 @@ define double @load_monotonic_double(ptr %ptr) { ; LA32-NEXT: .cfi_offset 1, -4 ; LA32-NEXT: move $a1, $zero ; LA32-NEXT: bl __atomic_load_8 -; LA32-NEXT: st.w $a1, $sp, 4 -; LA32-NEXT: st.w $a0, $sp, 0 -; LA32-NEXT: fld.d $fa0, $sp, 0 +; LA32-NEXT: movgr2fr.w $fa0, $a0 +; LA32-NEXT: movgr2frh.w $fa0, $a1 ; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload ; LA32-NEXT: addi.w $sp, $sp, 16 ; LA32-NEXT: ret @@ -481,9 +478,8 @@ define double @load_seq_cst_double(ptr %ptr) { ; LA32-NEXT: .cfi_offset 1, -4 ; LA32-NEXT: ori $a1, $zero, 5 ; LA32-NEXT: bl __atomic_load_8 -; LA32-NEXT: st.w $a1, $sp, 4 -; LA32-NEXT: st.w $a0, $sp, 0 -; LA32-NEXT: fld.d $fa0, $sp, 0 +; LA32-NEXT: movgr2fr.w $fa0, $a0 +; LA32-NEXT: movgr2frh.w $fa0, $a1 ; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload ; LA32-NEXT: addi.w $sp, $sp, 16 ; LA32-NEXT: ret @@ -605,9 +601,8 @@ define void @store_release_double(ptr %ptr, double %v) { ; LA32-NEXT: .cfi_def_cfa_offset 16 ; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill ; LA32-NEXT: .cfi_offset 1, -4 -; LA32-NEXT: fst.d $fa0, $sp, 0 -; LA32-NEXT: ld.w $a1, $sp, 0 -; LA32-NEXT: ld.w $a2, $sp, 4 +; LA32-NEXT: movfr2gr.s $a1, $fa0 +; LA32-NEXT: movfrh2gr.s $a2, $fa0 ; LA32-NEXT: ori $a3, $zero, 3 ; LA32-NEXT: bl __atomic_store_8 ; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload @@ -723,9 +718,8 @@ define void @store_unordered_double(ptr %ptr, double %v) { ; LA32-NEXT: .cfi_def_cfa_offset 16 ; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill ; LA32-NEXT: .cfi_offset 1, -4 -; LA32-NEXT: fst.d $fa0, $sp, 0 -; LA32-NEXT: ld.w $a1, $sp, 0 -; LA32-NEXT: ld.w $a2, $sp, 4 +; LA32-NEXT: movfr2gr.s $a1, $fa0 +; LA32-NEXT: movfrh2gr.s $a2, $fa0 ; LA32-NEXT: move $a3, $zero ; LA32-NEXT: bl __atomic_store_8 ; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload @@ -841,9 +835,8 @@ define void @store_monotonic_double(ptr %ptr, double %v) { ; LA32-NEXT: .cfi_def_cfa_offset 16 ; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill ; LA32-NEXT: .cfi_offset 1, -4 -; LA32-NEXT: fst.d $fa0, $sp, 0 -; LA32-NEXT: ld.w $a1, $sp, 0 -; LA32-NEXT: ld.w $a2, $sp, 4 +; LA32-NEXT: movfr2gr.s $a1, $fa0 +; LA32-NEXT: movfrh2gr.s $a2, $fa0 ; LA32-NEXT: move $a3, $zero ; LA32-NEXT: bl __atomic_store_8 ; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload @@ -973,9 +966,8 @@ define void @store_seq_cst_double(ptr %ptr, double %v) { ; LA32-NEXT: .cfi_def_cfa_offset 16 ; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill ; LA32-NEXT: .cfi_offset 1, -4 -; LA32-NEXT: fst.d $fa0, $sp, 0 -; LA32-NEXT: ld.w $a1, $sp, 0 -; LA32-NEXT: ld.w $a2, $sp, 4 +; LA32-NEXT: movfr2gr.s $a1, $fa0 +; LA32-NEXT: movfrh2gr.s $a2, $fa0 ; LA32-NEXT: ori $a3, $zero, 5 ; LA32-NEXT: bl __atomic_store_8 ; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload
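
Addendum (illustrative, not part of the patch): a minimal stand-alone IR reproducer for the new i64 <-> f64 lowering, using the same llc flags as the new calling-conv-ilp32d.ll test; the function name is made up for illustration. With this change, LA32 with the D feature moves the two GPR halves straight into an FPR (movgr2fr.w + movgr2frh.w) instead of bouncing the value through a stack slot, as the updated double-convert.ll checks show.

; Compile with the same flags as the added test:
;   llc --mtriple=loongarch32 --mattr=+d --target-abi=ilp32d < %s
define double @sketch_bitcast_i64_to_double(i64 %a) nounwind {
  ;; The i64 argument arrives in a GPR pair; the bitcast now selects the
  ;; BuildPairF64Pseudo path rather than a store/load round-trip.
  %1 = bitcast i64 %a to double
  ret double %1
}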