@@ -26565,21 +26565,37 @@ AArch64TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
2656526565 : AtomicExpansionKind::LLSC;
2656626566}
2656726567
26568+ // Return true if the atomic operation expansion will lower to use a library
26569+ // call, and is thus ineligible to use an LLSC expansion.
26570+ static bool rmwOpMayLowerToLibcall(const AArch64Subtarget &Subtarget,
26571+ const AtomicRMWInst *RMW) {
26572+ if (!RMW->isFloatingPointOperation())
26573+ return false;
26574+ switch (RMW->getType()->getScalarType()->getTypeID()) {
26575+ case Type::FloatTyID:
26576+ case Type::DoubleTyID:
26577+ case Type::HalfTyID:
26578+ case Type::BFloatTyID:
26579+ // Will use soft float
26580+ return !Subtarget.hasFPARMv8();
26581+ default:
26582+ // fp128 will emit library calls.
26583+ return true;
26584+ }
26585+
26586+ llvm_unreachable("covered type switch");
26587+ }
26588+
2656826589// The "default" for integer RMW operations is to expand to an LL/SC loop.
2656926590// However, with the LSE instructions (or outline-atomics mode, which provides
2657026591// library routines in place of the LSE-instructions), we can directly emit many
2657126592// operations instead.
26572- //
26573- // Floating-point operations are always emitted to a cmpxchg loop, because they
26574- // may trigger a trap which aborts an LLSC sequence.
2657526593TargetLowering::AtomicExpansionKind
2657626594AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
26577- unsigned Size = AI->getType()->getPrimitiveSizeInBits();
26595+ Type *Ty = AI->getType();
26596+ unsigned Size = Ty->getPrimitiveSizeInBits();
2657826597 assert(Size <= 128 && "AtomicExpandPass should've handled larger sizes.");
2657926598
26580- if (AI->isFloatingPointOperation())
26581- return AtomicExpansionKind::CmpXChg;
26582-
2658326599 bool CanUseLSE128 = Subtarget->hasLSE128() && Size == 128 &&
2658426600 (AI->getOperation() == AtomicRMWInst::Xchg ||
2658526601 AI->getOperation() == AtomicRMWInst::Or ||
@@ -26589,7 +26605,8 @@ AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
2658926605
2659026606 // Nand is not supported in LSE.
2659126607 // Leave 128 bits to LLSC or CmpXChg.
26592- if (AI->getOperation() != AtomicRMWInst::Nand && Size < 128) {
26608+ if (AI->getOperation() != AtomicRMWInst::Nand && Size < 128 &&
26609+ !AI->isFloatingPointOperation()) {
2659326610 if (Subtarget->hasLSE())
2659426611 return AtomicExpansionKind::None;
2659526612 if (Subtarget->outlineAtomics()) {
@@ -26615,7 +26632,7 @@ AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
2661526632 // succeed. So at -O0 lower this operation to a CAS loop. Also worthwhile if
2661626633 // we have a single CAS instruction that can replace the loop.
2661726634 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None ||
26618- Subtarget->hasLSE())
26635+	      Subtarget->hasLSE() || rmwOpMayLowerToLibcall(*Subtarget, AI))
2661926636 return AtomicExpansionKind::CmpXChg;
2662026637
2662126638 return AtomicExpansionKind::LLSC;
@@ -26662,10 +26679,14 @@ Value *AArch64TargetLowering::emitLoadLinked(IRBuilderBase &Builder,
2666226679
2666326680 Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
2666426681 Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
26665- Lo = Builder.CreateZExt(Lo, ValueTy, "lo64");
26666- Hi = Builder.CreateZExt(Hi, ValueTy, "hi64");
26667- return Builder.CreateOr(
26668- Lo, Builder.CreateShl(Hi, ConstantInt::get(ValueTy, 64)), "val64");
26682+
26683+ auto *Int128Ty = Type::getInt128Ty(Builder.getContext());
26684+ Lo = Builder.CreateZExt(Lo, Int128Ty, "lo64");
26685+ Hi = Builder.CreateZExt(Hi, Int128Ty, "hi64");
26686+
26687+ Value *Or = Builder.CreateOr(
26688+ Lo, Builder.CreateShl(Hi, ConstantInt::get(Int128Ty, 64)), "val64");
26689+ return Builder.CreateBitCast(Or, ValueTy);
2666926690 }
2667026691
2667126692 Type *Tys[] = { Addr->getType() };
@@ -26676,8 +26697,8 @@ Value *AArch64TargetLowering::emitLoadLinked(IRBuilderBase &Builder,
2667626697 const DataLayout &DL = M->getDataLayout();
2667726698 IntegerType *IntEltTy = Builder.getIntNTy(DL.getTypeSizeInBits(ValueTy));
2667826699 CallInst *CI = Builder.CreateCall(Ldxr, Addr);
26679- CI->addParamAttr(
26680- 0, Attribute::get(Builder.getContext(), Attribute::ElementType, ValueTy ));
26700+ CI->addParamAttr(0, Attribute::get(Builder.getContext(),
26701+	                                       Attribute::ElementType, IntEltTy));
2668126702 Value *Trunc = Builder.CreateTrunc(CI, IntEltTy);
2668226703
2668326704 return Builder.CreateBitCast(Trunc, ValueTy);
@@ -26703,9 +26724,13 @@ Value *AArch64TargetLowering::emitStoreConditional(IRBuilderBase &Builder,
2670326724 IsRelease ? Intrinsic::aarch64_stlxp : Intrinsic::aarch64_stxp;
2670426725 Function *Stxr = Intrinsic::getDeclaration(M, Int);
2670526726 Type *Int64Ty = Type::getInt64Ty(M->getContext());
26727+ Type *Int128Ty = Type::getInt128Ty(M->getContext());
26728+
26729+ Value *CastVal = Builder.CreateBitCast(Val, Int128Ty);
2670626730
26707- Value *Lo = Builder.CreateTrunc(Val, Int64Ty, "lo");
26708- Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 64), Int64Ty, "hi");
26731+ Value *Lo = Builder.CreateTrunc(CastVal, Int64Ty, "lo");
26732+ Value *Hi =
26733+ Builder.CreateTrunc(Builder.CreateLShr(CastVal, 64), Int64Ty, "hi");
2670926734 return Builder.CreateCall(Stxr, {Lo, Hi, Addr});
2671026735 }
2671126736
0 commit comments