@@ -16058,26 +16058,21 @@ bool SITargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
1605816058 SNaN, Depth);
1605916059}
1606016060
16061- #if 0
16062- // FIXME: This should be checked before unsafe fp atomics are enabled
16063- // Global FP atomic instructions have a hardcoded FP mode and do not support
16064- // FP32 denormals, and only support v2f16 denormals.
16065- static bool fpModeMatchesGlobalFPAtomicMode(const AtomicRMWInst *RMW) {
16061+ // On older subtargets, global FP atomic instructions have a hardcoded FP mode
16062+ // and do not support FP32 denormals, and only support v2f16/f64 denormals.
16063+ static bool atomicIgnoresDenormalModeOrFPModeIsFTZ(const AtomicRMWInst *RMW) {
16064+ if (RMW->hasMetadata("amdgpu.ignore.denormal.mode"))
16065+ return true;
16066+
1606616067 const fltSemantics &Flt = RMW->getType()->getScalarType()->getFltSemantics();
16067- auto DenormMode = RMW->getParent()->getParent()->getDenormalMode(Flt);
16068- if (&Flt == &APFloat::IEEEsingle())
16069- return DenormMode == DenormalMode::getPreserveSign();
16070- return DenormMode == DenormalMode::getIEEE();
16071- }
16072- #endif
16068+ auto DenormMode = RMW->getFunction()->getDenormalMode(Flt);
16069+ if (DenormMode == DenormalMode::getPreserveSign())
16070+ return true;
1607316071
16074- // The amdgpu-unsafe-fp-atomics attribute enables generation of unsafe
16075- // floating point atomic instructions. May generate more efficient code,
16076- // but may not respect rounding and denormal modes, and may give incorrect
16077- // results for certain memory destinations.
16078- bool unsafeFPAtomicsDisabled(Function *F) {
16079- return F->getFnAttribute("amdgpu-unsafe-fp-atomics").getValueAsString() !=
16080- "true";
16072+ // TODO: Remove this.
16073+ return RMW->getFunction()
16074+ ->getFnAttribute("amdgpu-unsafe-fp-atomics")
16075+ .getValueAsBool();
1608116076}
1608216077
1608316078static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW) {
@@ -16206,82 +16201,85 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
1620616201 return AtomicExpansionKind::CmpXChg;
1620716202 }
1620816203
16209- if (!AMDGPU::isFlatGlobalAddrSpace(AS) &&
16210- AS != AMDGPUAS::BUFFER_FAT_POINTER)
16211- return AtomicExpansionKind::CmpXChg;
16212-
16213- if (Subtarget->hasGFX940Insts() && (Ty->isFloatTy() || Ty->isDoubleTy()))
16214- return AtomicExpansionKind::None;
16215-
16216- if (AS == AMDGPUAS::FLAT_ADDRESS) {
16217- // gfx940, gfx12
16218- // FIXME: Needs to account for no fine-grained memory
16219- if (Subtarget->hasAtomicFlatPkAdd16Insts() && isHalf2OrBFloat2(Ty))
16220- return AtomicExpansionKind::None;
16221- } else if (AMDGPU::isExtendedGlobalAddrSpace(AS)) {
16222- // gfx90a, gfx940, gfx12
16223- // FIXME: Needs to account for no fine-grained memory
16224- if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isHalf2(Ty))
16225- return AtomicExpansionKind::None;
16226-
16227- // gfx940, gfx12
16228- // FIXME: Needs to account for no fine-grained memory
16229- if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isBFloat2(Ty))
16230- return AtomicExpansionKind::None;
16231- } else if (AS == AMDGPUAS::BUFFER_FAT_POINTER) {
16232- // gfx90a, gfx940, gfx12
16233- // FIXME: Needs to account for no fine-grained memory
16234- if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isHalf2(Ty))
16235- return AtomicExpansionKind::None;
16236-
16237- // While gfx90a/gfx940 supports v2bf16 for global/flat, it does not for
16238- // buffer. gfx12 does have the buffer version.
16239- if (Subtarget->hasAtomicBufferPkAddBF16Inst() && isBFloat2(Ty))
16240- return AtomicExpansionKind::None;
16241- }
16242-
16243- if (unsafeFPAtomicsDisabled(RMW->getFunction()))
16244- return AtomicExpansionKind::CmpXChg;
16245-
16246- // Always expand system scope fp atomics.
16247- if (HasSystemScope)
16204+ // LDS atomics respect the denormal mode from the mode register.
16205+ //
16206+ // Traditionally f32 global/buffer memory atomics would unconditionally
16207+ // flush denormals, but newer targets do not flush. f64/f16/bf16 cases never
16208+ // flush.
16209+ //
16210+ // On targets with flat atomic fadd, denormals would flush depending on
16211+ // whether the target address resides in LDS or global memory. We consider
16212+ // this flat-maybe-flush as will-flush.
16213+ if (Ty->isFloatTy() &&
16214+ !Subtarget->hasMemoryAtomicFaddF32DenormalSupport() &&
16215+ !atomicIgnoresDenormalModeOrFPModeIsFTZ(RMW))
1624816216 return AtomicExpansionKind::CmpXChg;
1624916217
16250- // global and flat atomic fadd f64: gfx90a, gfx940.
16251- if (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() && Ty->isDoubleTy())
16252- return ReportUnsafeHWInst(AtomicExpansionKind::None);
16218+ // FIXME: These ReportUnsafeHWInsts are imprecise. Some of these cases are
16219+ // safe. The message phrasing also should be better.
16220+ if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) {
16221+ if (AS == AMDGPUAS::FLAT_ADDRESS) {
16222+ // gfx940, gfx12
16223+ if (Subtarget->hasAtomicFlatPkAdd16Insts() && isHalf2OrBFloat2(Ty))
16224+ return ReportUnsafeHWInst(AtomicExpansionKind::None);
16225+ } else if (AMDGPU::isExtendedGlobalAddrSpace(AS)) {
16226+ // gfx90a, gfx940, gfx12
16227+ if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isHalf2(Ty))
16228+ return ReportUnsafeHWInst(AtomicExpansionKind::None);
1625316229
16254- if (AS != AMDGPUAS::FLAT_ADDRESS) {
16255- if (Ty->isFloatTy()) {
16256- // global/buffer atomic fadd f32 no-rtn: gfx908, gfx90a, gfx940, gfx11+.
16257- if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
16230+ // gfx940, gfx12
16231+ if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isBFloat2(Ty))
1625816232 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16259- // global/buffer atomic fadd f32 rtn: gfx90a, gfx940, gfx11+.
16260- if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
16233+ } else if (AS == AMDGPUAS::BUFFER_FAT_POINTER) {
16234+ // gfx90a, gfx940, gfx12
16235+ if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isHalf2(Ty))
1626116236 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16262- } else {
16263- // gfx908
16264- if (RMW->use_empty() &&
16265-        Subtarget->hasAtomicBufferGlobalPkAddF16NoRtnInsts() && isHalf2(Ty))
16237+
16238+ // While gfx90a/gfx940 supports v2bf16 for global/flat, it does not for
16239+ // buffer. gfx12 does have the buffer version.
16240-      if (Subtarget->hasAtomicBufferPkAddBF16Inst() && isBFloat2(Ty))
1626616241 return ReportUnsafeHWInst(AtomicExpansionKind::None);
1626716242 }
16268- }
1626916243
16270- // flat atomic fadd f32: gfx940, gfx11+.
16271- if (AS == AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) {
16272- if (Subtarget->hasFlatAtomicFaddF32Inst())
16244+ // global and flat atomic fadd f64: gfx90a, gfx940.
16245+ if (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() && Ty->isDoubleTy())
1627316246 return ReportUnsafeHWInst(AtomicExpansionKind::None);
1627416247
16275- // If it is in flat address space, and the type is float, we will try to
16276- // expand it, if the target supports global and lds atomic fadd. The
16277- // reason we need that is, in the expansion, we emit the check of address
16278- // space. If it is in global address space, we emit the global atomic
16279- // fadd; if it is in shared address space, we emit the LDS atomic fadd.
16280- if (Subtarget->hasLDSFPAtomicAddF32()) {
16281- if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
16282- return AtomicExpansionKind::Expand;
16283- if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
16284- return AtomicExpansionKind::Expand;
16248+ if (AS != AMDGPUAS::FLAT_ADDRESS) {
16249+ if (Ty->isFloatTy()) {
16250+ // global/buffer atomic fadd f32 no-rtn: gfx908, gfx90a, gfx940,
16251+ // gfx11+.
16252+ if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
16253+ return ReportUnsafeHWInst(AtomicExpansionKind::None);
16254+ // global/buffer atomic fadd f32 rtn: gfx90a, gfx940, gfx11+.
16255+ if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
16256+ return ReportUnsafeHWInst(AtomicExpansionKind::None);
16257+ } else {
16258+ // gfx908
16259+ if (RMW->use_empty() &&
16260+ Subtarget->hasAtomicBufferGlobalPkAddF16NoRtnInsts() &&
16261+ isHalf2(Ty))
16262+ return ReportUnsafeHWInst(AtomicExpansionKind::None);
16263+ }
16264+ }
16265+
16266+ // flat atomic fadd f32: gfx940, gfx11+.
16267+ if (AS == AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) {
16268+ if (Subtarget->hasFlatAtomicFaddF32Inst())
16269+ return ReportUnsafeHWInst(AtomicExpansionKind::None);
16270+
16271+ // If it is in flat address space, and the type is float, we will try to
16272+ // expand it, if the target supports global and lds atomic fadd. The
16273+ // reason we need that is, in the expansion, we emit the check of
16274+ // address space. If it is in global address space, we emit the global
16275+ // atomic fadd; if it is in shared address space, we emit the LDS atomic
16276+ // fadd.
16277+ if (Subtarget->hasLDSFPAtomicAddF32()) {
16278+ if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
16279+ return AtomicExpansionKind::Expand;
16280+ if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
16281+ return AtomicExpansionKind::Expand;
16282+ }
1628516283 }
1628616284 }
1628716285
0 commit comments