@@ -16037,26 +16037,15 @@ bool SITargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
1603716037 SNaN, Depth);
1603816038}
1603916039
16040- #if 0
16041- // FIXME: This should be checked before unsafe fp atomics are enabled
16042- // Global FP atomic instructions have a hardcoded FP mode and do not support
16043- // FP32 denormals, and only support v2f16 denormals.
16044- static bool fpModeMatchesGlobalFPAtomicMode(const AtomicRMWInst *RMW) {
16045- const fltSemantics &Flt = RMW->getType()->getScalarType()->getFltSemantics();
16046- auto DenormMode = RMW->getParent()->getParent()->getDenormalMode(Flt);
16047- if (&Flt == &APFloat::IEEEsingle())
16048- return DenormMode == DenormalMode::getPreserveSign();
16049- return DenormMode == DenormalMode::getIEEE();
16050- }
16051- #endif
16040+ // On older subtargets, global FP atomic instructions use a hardcoded FP mode:
16041+ // they do not support FP32 denormals and only support v2f16/f64 denormals.
16042+ static bool atomicIgnoresDenormalModeOrFPModeIsFTZ(const AtomicRMWInst *RMW) { // true if RMW carries "amdgpu.ignore.denormal.mode" metadata, or its function's denormal mode for the scalar FP type is preserve-sign
16043+ if (RMW->hasMetadata("amdgpu.ignore.denormal.mode")) // metadata on the instruction short-circuits the check; the FP mode is not consulted
16044+ return true;
1605216045
16053- // The amdgpu-unsafe-fp-atomics attribute enables generation of unsafe
16054- // floating point atomic instructions. May generate more efficient code,
16055- // but may not respect rounding and denormal modes, and may give incorrect
16056- // results for certain memory destinations.
16057- bool unsafeFPAtomicsDisabled(Function *F) {
16058- return F->getFnAttribute("amdgpu-unsafe-fp-atomics").getValueAsString() !=
16059- "true";
16046+ const fltSemantics &Flt = RMW->getType()->getScalarType()->getFltSemantics(); // FP semantics of the operation's scalar type
16047+ auto DenormMode = RMW->getFunction()->getDenormalMode(Flt); // denormal mode the enclosing function reports for that FP format
16048+ return DenormMode == DenormalMode::getPreserveSign(); // true only when the mode is exactly preserve-sign (the "FTZ" case in the function name)
1606016049}
1606116050
1606216051static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW) {
@@ -16185,75 +16174,74 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
1618516174 return AtomicExpansionKind::CmpXChg;
1618616175 }
1618716176
16188- if (!AMDGPU::isFlatGlobalAddrSpace(AS) &&
16189- AS != AMDGPUAS::BUFFER_FAT_POINTER)
16190- return AtomicExpansionKind::CmpXChg;
16191-
16192- if (Subtarget->hasGFX940Insts() && (Ty->isFloatTy() || Ty->isDoubleTy()))
16193- return AtomicExpansionKind::None;
16194-
16195- if (AS == AMDGPUAS::FLAT_ADDRESS) {
16196- // gfx940, gfx12
16197- // FIXME: Needs to account for no fine-grained memory
16198- if (Subtarget->hasAtomicFlatPkAdd16Insts() && isHalf2OrBFloat2(Ty))
16199- return AtomicExpansionKind::None;
16200- } else if (AMDGPU::isExtendedGlobalAddrSpace(AS)) {
16201- // gfx90a, gfx940, gfx12
16202- // FIXME: Needs to account for no fine-grained memory
16203- if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isHalf2(Ty))
16204- return AtomicExpansionKind::None;
16205-
16206- // gfx940, gfx12
16207- // FIXME: Needs to account for no fine-grained memory
16208- if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isBFloat2(Ty))
16209- return AtomicExpansionKind::None;
16210- } else if (AS == AMDGPUAS::BUFFER_FAT_POINTER) {
16211- // gfx90a, gfx940, gfx12
16212- // FIXME: Needs to account for no fine-grained memory
16213- if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isHalf2(Ty))
16214- return AtomicExpansionKind::None;
16215-
16216- // While gfx90a/gfx940 supports v2bf16 for global/flat, it does not for
16217- // buffer. gfx12 does have the buffer version.
16218- if (Subtarget->hasAtomicBufferPkAddBF16Inst() && isBFloat2(Ty))
16219- return AtomicExpansionKind::None;
16220- }
16221-
16222- if (unsafeFPAtomicsDisabled(RMW->getFunction()))
16223- return AtomicExpansionKind::CmpXChg;
16224-
16225- // Always expand system scope fp atomics.
16226- if (HasSystemScope)
16177+ // LDS atomics respect the denormal mode from the mode register.
16178+ //
16179+ // Traditionally f32 global/buffer memory atomics would unconditionally
16180+ // flush denormals, but newer targets do not flush. f64/f16/bf16 cases never
16181+ // flush.
16182+ //
16183+ // On targets with flat atomic fadd, denormals would flush depending on
16184+ // whether the target address resides in LDS or global memory. We consider
16185+ // this flat-maybe-flush as will-flush.
16186+ if (Ty->isFloatTy() &&
16187+ !Subtarget->hasMemoryAtomicFaddF32DenormalSupport() &&
16188+ !atomicIgnoresDenormalModeOrFPModeIsFTZ(RMW))
1622716189 return AtomicExpansionKind::CmpXChg;
1622816190
16229- // global and flat atomic fadd f64: gfx90a, gfx940.
16230- if (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() && Ty->isDoubleTy())
16231- return ReportUnsafeHWInst(AtomicExpansionKind::None);
16232-
16233- if (AS != AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) {
16234- // global/buffer atomic fadd f32 no-rtn: gfx908, gfx90a, gfx940, gfx11+.
16235- if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
16236- return ReportUnsafeHWInst(AtomicExpansionKind::None);
16237- // global/buffer atomic fadd f32 rtn: gfx90a, gfx940, gfx11+.
16238- if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
16239- return ReportUnsafeHWInst(AtomicExpansionKind::None);
16240- }
16191+ if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) {
16192+ if (AS == AMDGPUAS::FLAT_ADDRESS) {
16193+ // gfx940, gfx12
16194+ if (Subtarget->hasAtomicFlatPkAdd16Insts() && isHalf2OrBFloat2(Ty))
16195+ return AtomicExpansionKind::None;
16196+ } else if (AMDGPU::isExtendedGlobalAddrSpace(AS)) {
16197+ // gfx90a, gfx940, gfx12
16198+ if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isHalf2(Ty))
16199+ return AtomicExpansionKind::None;
16200+
16201+ // gfx940, gfx12
16202+ if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isBFloat2(Ty))
16203+ return AtomicExpansionKind::None;
16204+ } else if (AS == AMDGPUAS::BUFFER_FAT_POINTER) {
16205+ // gfx90a, gfx940, gfx12
16206+ if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isHalf2(Ty))
16207+ return AtomicExpansionKind::None;
16208+
16209+ // While gfx90a/gfx940 supports v2bf16 for global/flat, it does not for
16210+ // buffer. gfx12 does have the buffer version.
16211+ if (Subtarget->hasAtomicBufferPkAddBF16Inst() && isBFloat2(Ty))
16212+ return AtomicExpansionKind::None;
16213+ }
1624116214
16242- // flat atomic fadd f32: gfx940, gfx11+.
16243- if (AS == AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) {
16244- if (Subtarget->hasFlatAtomicFaddF32Inst())
16215+ // global and flat atomic fadd f64: gfx90a, gfx940.
16216+ if (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() && Ty->isDoubleTy())
1624516217 return ReportUnsafeHWInst(AtomicExpansionKind::None);
1624616218
16247- // If it is in flat address space, and the type is float, we will try to
16248- // expand it, if the target supports global and lds atomic fadd. The
16249- // reason we need that is, in the expansion, we emit the check of address
16250- // space. If it is in global address space, we emit the global atomic
16251- // fadd; if it is in shared address space, we emit the LDS atomic fadd.
16252- if (Subtarget->hasLDSFPAtomicAddF32()) {
16219+ if (AS != AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) {
16220+ // global/buffer atomic fadd f32 no-rtn: gfx908, gfx90a, gfx940, gfx11+.
1625316221 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
16254- return AtomicExpansionKind::Expand;
16222+ return ReportUnsafeHWInst(AtomicExpansionKind::None);
16223+ // global/buffer atomic fadd f32 rtn: gfx90a, gfx940, gfx11+.
1625516224 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
16256- return AtomicExpansionKind::Expand;
16225+ return ReportUnsafeHWInst(AtomicExpansionKind::None);
16226+ }
16227+
16228+ // flat atomic fadd f32: gfx940, gfx11+.
16229+ if (AS == AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) {
16230+ if (Subtarget->hasFlatAtomicFaddF32Inst())
16231+ return ReportUnsafeHWInst(AtomicExpansionKind::None);
16232+
16233+ // If it is in flat address space, and the type is float, we will try to
16234+ // expand it, if the target supports global and lds atomic fadd. The
16235+ // reason we need that is, in the expansion, we emit the check of
16236+ // address space. If it is in global address space, we emit the global
16237+ // atomic fadd; if it is in shared address space, we emit the LDS atomic
16238+ // fadd.
16239+ if (Subtarget->hasLDSFPAtomicAddF32()) {
16240+ if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
16241+ return AtomicExpansionKind::Expand;
16242+ if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
16243+ return AtomicExpansionKind::Expand;
16244+ }
1625716245 }
1625816246 }
1625916247
0 commit comments