@@ -16054,26 +16054,21 @@ bool SITargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
1605416054 SNaN, Depth);
1605516055}
1605616056
16057- #if 0
16058- // FIXME: This should be checked before unsafe fp atomics are enabled
16059- // Global FP atomic instructions have a hardcoded FP mode and do not support
16060- // FP32 denormals, and only support v2f16 denormals.
16061- static bool fpModeMatchesGlobalFPAtomicMode(const AtomicRMWInst *RMW) {
16057+ // On older subtargets, global FP atomic instructions have a hardcoded FP mode
16058+ // and do not support FP32 denormals, and only support v2f16/f64 denormals.
16059+ static bool atomicIgnoresDenormalModeOrFPModeIsFTZ(const AtomicRMWInst *RMW) {
16060+ if (RMW->hasMetadata("amdgpu.ignore.denormal.mode"))
16061+ return true;
16062+
1606216063 const fltSemantics &Flt = RMW->getType()->getScalarType()->getFltSemantics();
16063- auto DenormMode = RMW->getParent()->getParent()->getDenormalMode(Flt);
16064- if (&Flt == &APFloat::IEEEsingle())
16065- return DenormMode == DenormalMode::getPreserveSign();
16066- return DenormMode == DenormalMode::getIEEE();
16067- }
16068- #endif
16064+ auto DenormMode = RMW->getFunction()->getDenormalMode(Flt);
16065+ if (DenormMode == DenormalMode::getPreserveSign())
16066+ return true;
1606916067
16070- // The amdgpu-unsafe-fp-atomics attribute enables generation of unsafe
16071- // floating point atomic instructions. May generate more efficient code,
16072- // but may not respect rounding and denormal modes, and may give incorrect
16073- // results for certain memory destinations.
16074- bool unsafeFPAtomicsDisabled(Function *F) {
16075- return F->getFnAttribute("amdgpu-unsafe-fp-atomics").getValueAsString() !=
16076- "true";
16068+ // TODO: Remove this.
16069+ return RMW->getFunction()
16070+ ->getFnAttribute("amdgpu-unsafe-fp-atomics")
16071+ .getValueAsBool();
1607716072}
1607816073
1607916074static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW) {
@@ -16202,82 +16197,85 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
1620216197 return AtomicExpansionKind::CmpXChg;
1620316198 }
1620416199
16205- if (!AMDGPU::isFlatGlobalAddrSpace(AS) &&
16206- AS != AMDGPUAS::BUFFER_FAT_POINTER)
16207- return AtomicExpansionKind::CmpXChg;
16208-
16209- if (Subtarget->hasGFX940Insts() && (Ty->isFloatTy() || Ty->isDoubleTy()))
16210- return AtomicExpansionKind::None;
16211-
16212- if (AS == AMDGPUAS::FLAT_ADDRESS) {
16213- // gfx940, gfx12
16214- // FIXME: Needs to account for no fine-grained memory
16215- if (Subtarget->hasAtomicFlatPkAdd16Insts() && isHalf2OrBFloat2(Ty))
16216- return AtomicExpansionKind::None;
16217- } else if (AMDGPU::isExtendedGlobalAddrSpace(AS)) {
16218- // gfx90a, gfx940, gfx12
16219- // FIXME: Needs to account for no fine-grained memory
16220- if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isHalf2(Ty))
16221- return AtomicExpansionKind::None;
16222-
16223- // gfx940, gfx12
16224- // FIXME: Needs to account for no fine-grained memory
16225- if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isBFloat2(Ty))
16226- return AtomicExpansionKind::None;
16227- } else if (AS == AMDGPUAS::BUFFER_FAT_POINTER) {
16228- // gfx90a, gfx940, gfx12
16229- // FIXME: Needs to account for no fine-grained memory
16230- if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isHalf2(Ty))
16231- return AtomicExpansionKind::None;
16232-
16233- // While gfx90a/gfx940 supports v2bf16 for global/flat, it does not for
16234- // buffer. gfx12 does have the buffer version.
16235- if (Subtarget->hasAtomicBufferPkAddBF16Inst() && isBFloat2(Ty))
16236- return AtomicExpansionKind::None;
16237- }
16238-
16239- if (unsafeFPAtomicsDisabled(RMW->getFunction()))
16240- return AtomicExpansionKind::CmpXChg;
16241-
16242- // Always expand system scope fp atomics.
16243- if (HasSystemScope)
16200+ // LDS atomics respect the denormal mode from the mode register.
16201+ //
16202+ // Traditionally f32 global/buffer memory atomics would unconditionally
16203+ // flush denormals, but newer targets do not flush. f64/f16/bf16 cases never
16204+ // flush.
16205+ //
16206+ // On targets with flat atomic fadd, denormals would flush depending on
16207+ // whether the target address resides in LDS or global memory. We consider
16208+ // this flat-maybe-flush as will-flush.
16209+ if (Ty->isFloatTy() &&
16210+ !Subtarget->hasMemoryAtomicFaddF32DenormalSupport() &&
16211+ !atomicIgnoresDenormalModeOrFPModeIsFTZ(RMW))
1624416212 return AtomicExpansionKind::CmpXChg;
1624516213
16246- // global and flat atomic fadd f64: gfx90a, gfx940.
16247- if (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() && Ty->isDoubleTy())
16248- return ReportUnsafeHWInst(AtomicExpansionKind::None);
16214+ // FIXME: These ReportUnsafeHWInsts are imprecise. Some of these cases are
16215+ // safe. The message phrasing also should be better.
16216+ if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) {
16217+ if (AS == AMDGPUAS::FLAT_ADDRESS) {
16218+ // gfx940, gfx12
16219+ if (Subtarget->hasAtomicFlatPkAdd16Insts() && isHalf2OrBFloat2(Ty))
16220+ return ReportUnsafeHWInst(AtomicExpansionKind::None);
16221+ } else if (AMDGPU::isExtendedGlobalAddrSpace(AS)) {
16222+ // gfx90a, gfx940, gfx12
16223+ if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isHalf2(Ty))
16224+ return ReportUnsafeHWInst(AtomicExpansionKind::None);
1624916225
16250- if (AS != AMDGPUAS::FLAT_ADDRESS) {
16251- if (Ty->isFloatTy()) {
16252- // global/buffer atomic fadd f32 no-rtn: gfx908, gfx90a, gfx940, gfx11+.
16253- if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
16226+ // gfx940, gfx12
16227+ if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isBFloat2(Ty))
1625416228 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16255- // global/buffer atomic fadd f32 rtn: gfx90a, gfx940, gfx11+.
16256- if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
16229+ } else if (AS == AMDGPUAS::BUFFER_FAT_POINTER) {
16230+ // gfx90a, gfx940, gfx12
16231+ if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isHalf2(Ty))
1625716232 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16258- } else {
16259- // gfx908
16260- if (RMW->use_empty() &&
16261- Subtarget->hasAtomicBufferGlobalPkAddF16NoRtnInsts() && isHalf2(Ty))
16233+
16234+ // While gfx90a/gfx940 supports v2bf16 for global/flat, it does not for
16235+ // buffer. gfx12 does have the buffer version.
16236+ if (Subtarget->hasAtomicBufferPkAddBF16Inst() && isBFloat2(Ty))
1626216237 return ReportUnsafeHWInst(AtomicExpansionKind::None);
1626316238 }
16264- }
1626516239
16266- // flat atomic fadd f32: gfx940, gfx11+.
16267- if (AS == AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) {
16268- if (Subtarget->hasFlatAtomicFaddF32Inst())
16240+ // global and flat atomic fadd f64: gfx90a, gfx940.
16241+ if (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() && Ty->isDoubleTy())
1626916242 return ReportUnsafeHWInst(AtomicExpansionKind::None);
1627016243
16271- // If it is in flat address space, and the type is float, we will try to
16272- // expand it, if the target supports global and lds atomic fadd. The
16273- // reason we need that is, in the expansion, we emit the check of address
16274- // space. If it is in global address space, we emit the global atomic
16275- // fadd; if it is in shared address space, we emit the LDS atomic fadd.
16276- if (Subtarget->hasLDSFPAtomicAddF32()) {
16277- if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
16278- return AtomicExpansionKind::Expand;
16279- if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
16280- return AtomicExpansionKind::Expand;
16244+ if (AS != AMDGPUAS::FLAT_ADDRESS) {
16245+ if (Ty->isFloatTy()) {
16246+ // global/buffer atomic fadd f32 no-rtn: gfx908, gfx90a, gfx940,
16247+ // gfx11+.
16248+ if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
16249+ return ReportUnsafeHWInst(AtomicExpansionKind::None);
16250+ // global/buffer atomic fadd f32 rtn: gfx90a, gfx940, gfx11+.
16251+ if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
16252+ return ReportUnsafeHWInst(AtomicExpansionKind::None);
16253+ } else {
16254+ // gfx908
16255+ if (RMW->use_empty() &&
16256+ Subtarget->hasAtomicBufferGlobalPkAddF16NoRtnInsts() &&
16257+ isHalf2(Ty))
16258+ return ReportUnsafeHWInst(AtomicExpansionKind::None);
16259+ }
16260+ }
16261+
16262+ // flat atomic fadd f32: gfx940, gfx11+.
16263+ if (AS == AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) {
16264+ if (Subtarget->hasFlatAtomicFaddF32Inst())
16265+ return ReportUnsafeHWInst(AtomicExpansionKind::None);
16266+
16267+ // If it is in flat address space, and the type is float, we will try to
16268+ // expand it, if the target supports global and lds atomic fadd. The
16269+ // reason we need that is, in the expansion, we emit the check of
16270+ // address space. If it is in global address space, we emit the global
16271+ // atomic fadd; if it is in shared address space, we emit the LDS atomic
16272+ // fadd.
16273+ if (Subtarget->hasLDSFPAtomicAddF32()) {
16274+ if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
16275+ return AtomicExpansionKind::Expand;
16276+ if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
16277+ return AtomicExpansionKind::Expand;
16278+ }
1628116279 }
1628216280 }
1628316281