@@ -16067,26 +16067,21 @@ bool SITargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
                                                             SNaN, Depth);
 }
 
-#if 0
-// FIXME: This should be checked before unsafe fp atomics are enabled
-// Global FP atomic instructions have a hardcoded FP mode and do not support
-// FP32 denormals, and only support v2f16 denormals.
-static bool fpModeMatchesGlobalFPAtomicMode(const AtomicRMWInst *RMW) {
+// On older subtargets, global FP atomic instructions have a hardcoded FP mode
+// and do not support FP32 denormals, and only support v2f16/f64 denormals.
+static bool atomicIgnoresDenormalModeOrFPModeIsFTZ(const AtomicRMWInst *RMW) {
+  if (RMW->hasMetadata("amdgpu.ignore.denormal.mode"))
+    return true;
+
   const fltSemantics &Flt = RMW->getType()->getScalarType()->getFltSemantics();
-  auto DenormMode = RMW->getParent()->getParent()->getDenormalMode(Flt);
-  if (&Flt == &APFloat::IEEEsingle())
-    return DenormMode == DenormalMode::getPreserveSign();
-  return DenormMode == DenormalMode::getIEEE();
-}
-#endif
+  auto DenormMode = RMW->getFunction()->getDenormalMode(Flt);
+  if (DenormMode == DenormalMode::getPreserveSign())
+    return true;
 
-// The amdgpu-unsafe-fp-atomics attribute enables generation of unsafe
-// floating point atomic instructions. May generate more efficient code,
-// but may not respect rounding and denormal modes, and may give incorrect
-// results for certain memory destinations.
-bool unsafeFPAtomicsDisabled(Function *F) {
-  return F->getFnAttribute("amdgpu-unsafe-fp-atomics").getValueAsString() !=
-         "true";
+  // TODO: Remove this.
+  return RMW->getFunction()
+      ->getFnAttribute("amdgpu-unsafe-fp-atomics")
+      .getValueAsBool();
 }
 
 static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW) {
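
The helper above keys off two things that live in the IR rather than on the subtarget: an "amdgpu.ignore.denormal.mode" metadata kind attached to the atomicrmw itself, and the legacy "amdgpu-unsafe-fp-atomics" function attribute (kept only behind a TODO). As a rough, hypothetical sketch of how a front end or pass could set those inputs (the function names below are made up; only the metadata kind and the attribute string come from the patch):

#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Metadata.h"

using namespace llvm;

// Hypothetical helper: attach an empty !amdgpu.ignore.denormal.mode node.
// The target hook only tests for the presence of the metadata kind.
static void markAtomicIgnoresDenormalMode(AtomicRMWInst &RMW) {
  RMW.setMetadata("amdgpu.ignore.denormal.mode",
                  MDNode::get(RMW.getContext(), {}));
}

// Hypothetical helper: the legacy escape hatch, now read via getValueAsBool()
// and slated for removal per the TODO in the patch.
static void markFunctionUnsafeFPAtomics(Function &F) {
  F.addFnAttr("amdgpu-unsafe-fp-atomics", "true");
}
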
@@ -16215,75 +16210,76 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
       return AtomicExpansionKind::CmpXChg;
     }
 
-    if (!AMDGPU::isFlatGlobalAddrSpace(AS) &&
-        AS != AMDGPUAS::BUFFER_FAT_POINTER)
-      return AtomicExpansionKind::CmpXChg;
-
-    if (Subtarget->hasGFX940Insts() && (Ty->isFloatTy() || Ty->isDoubleTy()))
-      return AtomicExpansionKind::None;
-
-    if (AS == AMDGPUAS::FLAT_ADDRESS) {
-      // gfx940, gfx12
-      // FIXME: Needs to account for no fine-grained memory
-      if (Subtarget->hasAtomicFlatPkAdd16Insts() && isHalf2OrBFloat2(Ty))
-        return AtomicExpansionKind::None;
-    } else if (AMDGPU::isExtendedGlobalAddrSpace(AS)) {
-      // gfx90a, gfx940, gfx12
-      // FIXME: Needs to account for no fine-grained memory
-      if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isHalf2(Ty))
-        return AtomicExpansionKind::None;
-
-      // gfx940, gfx12
-      // FIXME: Needs to account for no fine-grained memory
-      if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isBFloat2(Ty))
-        return AtomicExpansionKind::None;
-    } else if (AS == AMDGPUAS::BUFFER_FAT_POINTER) {
-      // gfx90a, gfx940, gfx12
-      // FIXME: Needs to account for no fine-grained memory
-      if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isHalf2(Ty))
-        return AtomicExpansionKind::None;
-
-      // While gfx90a/gfx940 supports v2bf16 for global/flat, it does not for
-      // buffer. gfx12 does have the buffer version.
-      if (Subtarget->hasAtomicBufferPkAddBF16Inst() && isBFloat2(Ty))
-        return AtomicExpansionKind::None;
-    }
-
-    if (unsafeFPAtomicsDisabled(RMW->getFunction()))
+    // LDS atomics respect the denormal mode from the mode register.
+    //
+    // Traditionally f32 global/buffer memory atomics would unconditionally
+    // flush denormals, but newer targets do not flush. f64/f16/bf16 cases never
+    // flush.
+    //
+    // On targets with flat atomic fadd, denormals would flush depending on
+    // whether the target address resides in LDS or global memory. We consider
+    // this flat-maybe-flush as will-flush.
+    if (Ty->isFloatTy() &&
+        !Subtarget->hasMemoryAtomicFaddF32DenormalSupport() &&
+        !atomicIgnoresDenormalModeOrFPModeIsFTZ(RMW))
       return AtomicExpansionKind::CmpXChg;
 
-    // Always expand system scope fp atomics.
-    if (HasSystemScope)
-      return AtomicExpansionKind::CmpXChg;
+    // FIXME: These ReportUnsafeHWInsts are imprecise. Some of these cases are
+    // safe. The message phrasing also should be better.
+    if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) {
+      if (AS == AMDGPUAS::FLAT_ADDRESS) {
+        // gfx940, gfx12
+        if (Subtarget->hasAtomicFlatPkAdd16Insts() && isHalf2OrBFloat2(Ty))
+          return ReportUnsafeHWInst(AtomicExpansionKind::None);
+      } else if (AMDGPU::isExtendedGlobalAddrSpace(AS)) {
+        // gfx90a, gfx940, gfx12
+        if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isHalf2(Ty))
+          return ReportUnsafeHWInst(AtomicExpansionKind::None);
 
-    // global and flat atomic fadd f64: gfx90a, gfx940.
-    if (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() && Ty->isDoubleTy())
-      return ReportUnsafeHWInst(AtomicExpansionKind::None);
+        // gfx940, gfx12
+        if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isBFloat2(Ty))
+          return ReportUnsafeHWInst(AtomicExpansionKind::None);
+      } else if (AS == AMDGPUAS::BUFFER_FAT_POINTER) {
+        // gfx90a, gfx940, gfx12
+        if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isHalf2(Ty))
+          return ReportUnsafeHWInst(AtomicExpansionKind::None);
 
-    if (AS != AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) {
-      // global/buffer atomic fadd f32 no-rtn: gfx908, gfx90a, gfx940, gfx11+.
-      if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
-        return ReportUnsafeHWInst(AtomicExpansionKind::None);
-      // global/buffer atomic fadd f32 rtn: gfx90a, gfx940, gfx11+.
-      if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
-        return ReportUnsafeHWInst(AtomicExpansionKind::None);
-    }
+        // While gfx90a/gfx940 supports v2bf16 for global/flat, it does not for
+        // buffer. gfx12 does have the buffer version.
+        if (Subtarget->hasAtomicBufferPkAddBF16Inst() && isBFloat2(Ty))
+          return ReportUnsafeHWInst(AtomicExpansionKind::None);
+      }
 
-    // flat atomic fadd f32: gfx940, gfx11+.
-    if (AS == AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) {
-      if (Subtarget->hasFlatAtomicFaddF32Inst())
+      // global and flat atomic fadd f64: gfx90a, gfx940.
+      if (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() && Ty->isDoubleTy())
         return ReportUnsafeHWInst(AtomicExpansionKind::None);
 
-      // If it is in flat address space, and the type is float, we will try to
-      // expand it, if the target supports global and lds atomic fadd. The
-      // reason we need that is, in the expansion, we emit the check of address
-      // space. If it is in global address space, we emit the global atomic
-      // fadd; if it is in shared address space, we emit the LDS atomic fadd.
-      if (Subtarget->hasLDSFPAtomicAddF32()) {
+      if (AS != AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) {
+        // global/buffer atomic fadd f32 no-rtn: gfx908, gfx90a, gfx940, gfx11+.
         if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
-          return AtomicExpansionKind::Expand;
+          return ReportUnsafeHWInst(AtomicExpansionKind::None);
+        // global/buffer atomic fadd f32 rtn: gfx90a, gfx940, gfx11+.
         if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
-          return AtomicExpansionKind::Expand;
+          return ReportUnsafeHWInst(AtomicExpansionKind::None);
+      }
+
+      // flat atomic fadd f32: gfx940, gfx11+.
+      if (AS == AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) {
+        if (Subtarget->hasFlatAtomicFaddF32Inst())
+          return ReportUnsafeHWInst(AtomicExpansionKind::None);
+
+        // If it is in flat address space, and the type is float, we will try to
+        // expand it, if the target supports global and lds atomic fadd. The
+        // reason we need that is, in the expansion, we emit the check of
+        // address space. If it is in global address space, we emit the global
+        // atomic fadd; if it is in shared address space, we emit the LDS atomic
+        // fadd.
+        if (Subtarget->hasLDSFPAtomicAddF32()) {
+          if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
+            return AtomicExpansionKind::Expand;
+          if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
+            return AtomicExpansionKind::Expand;
+        }
       }
     }
 
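
The comment retained in the flat f32 case explains why AtomicExpansionKind::Expand is returned there: the IR-level expansion has to branch on the address space at run time, taking the LDS atomic when the flat pointer points into shared memory and the global atomic otherwise. The following is only a rough sketch of that dispatch idea, not the code the expansion pass actually emits; it assumes the usual AMDGPU numbering of global memory as address space 1 and LDS as address space 3, and a builder that already has an insertion point inside a function.

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"

using namespace llvm;

// Sketch only: split a flat-pointer f32 atomic fadd into an LDS path and a
// global path, as the comment in the patch describes.
static void emitFlatFAddDispatchSketch(IRBuilder<> &B, Value *FlatPtr,
                                       Value *Val) {
  Function *F = B.GetInsertBlock()->getParent();
  LLVMContext &Ctx = B.getContext();

  BasicBlock *SharedBB = BasicBlock::Create(Ctx, "atomic.lds", F);
  BasicBlock *GlobalBB = BasicBlock::Create(Ctx, "atomic.global", F);
  BasicBlock *DoneBB = BasicBlock::Create(Ctx, "atomic.done", F);

  // if (llvm.amdgcn.is.shared(p)) use the LDS atomic, else the global one.
  Value *IsShared =
      B.CreateIntrinsic(Intrinsic::amdgcn_is_shared, {}, {FlatPtr});
  B.CreateCondBr(IsShared, SharedBB, GlobalBB);

  B.SetInsertPoint(SharedBB);
  Value *LDSPtr = B.CreateAddrSpaceCast(FlatPtr, B.getPtrTy(3));
  B.CreateAtomicRMW(AtomicRMWInst::FAdd, LDSPtr, Val, Align(4),
                    AtomicOrdering::Monotonic);
  B.CreateBr(DoneBB);

  B.SetInsertPoint(GlobalBB);
  Value *GlobalPtr = B.CreateAddrSpaceCast(FlatPtr, B.getPtrTy(1));
  B.CreateAtomicRMW(AtomicRMWInst::FAdd, GlobalPtr, Val, Align(4),
                    AtomicOrdering::Monotonic);
  B.CreateBr(DoneBB);

  B.SetInsertPoint(DoneBB);
}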