@@ -16062,26 +16062,21 @@ bool SITargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
1606216062 SNaN, Depth);
1606316063}
1606416064
16065- #if 0
16066- // FIXME: This should be checked before unsafe fp atomics are enabled
16067- // Global FP atomic instructions have a hardcoded FP mode and do not support
16068- // FP32 denormals, and only support v2f16 denormals.
16069- static bool fpModeMatchesGlobalFPAtomicMode(const AtomicRMWInst *RMW) {
16065+ // On older subtargets, global FP atomic instructions have a hardcoded FP mode:
16066+ // they do not support FP32 denormals and only support v2f16/f64 denormals.
16067+ static bool atomicIgnoresDenormalModeOrFPModeIsFTZ(const AtomicRMWInst *RMW) {
16068+ if (RMW->hasMetadata("amdgpu.ignore.denormal.mode"))
16069+ return true;
16070+
1607016071 const fltSemantics &Flt = RMW->getType()->getScalarType()->getFltSemantics();
16071- auto DenormMode = RMW->getParent()->getParent()->getDenormalMode(Flt);
16072- if (&Flt == &APFloat::IEEEsingle())
16073- return DenormMode == DenormalMode::getPreserveSign();
16074- return DenormMode == DenormalMode::getIEEE();
16075- }
16076- #endif
16072+ auto DenormMode = RMW->getFunction()->getDenormalMode(Flt);
16073+ if (DenormMode == DenormalMode::getPreserveSign())
16074+ return true;
1607716075
16078- // The amdgpu-unsafe-fp-atomics attribute enables generation of unsafe
16079- // floating point atomic instructions. May generate more efficient code,
16080- // but may not respect rounding and denormal modes, and may give incorrect
16081- // results for certain memory destinations.
16082- bool unsafeFPAtomicsDisabled(Function *F) {
16083- return F->getFnAttribute("amdgpu-unsafe-fp-atomics").getValueAsString() !=
16084- "true";
16076+ // TODO: Remove this amdgpu-unsafe-fp-atomics attribute fallback.
16077+ return RMW->getFunction()
16078+ ->getFnAttribute("amdgpu-unsafe-fp-atomics")
16079+ .getValueAsBool();
1608516080}
1608616081
1608716082static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW) {
@@ -16210,82 +16205,85 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
1621016205 return AtomicExpansionKind::CmpXChg;
1621116206 }
1621216207
16213- if (!AMDGPU::isFlatGlobalAddrSpace(AS) &&
16214- AS != AMDGPUAS::BUFFER_FAT_POINTER)
16215- return AtomicExpansionKind::CmpXChg;
16216-
16217- if (Subtarget->hasGFX940Insts() && (Ty->isFloatTy() || Ty->isDoubleTy()))
16218- return AtomicExpansionKind::None;
16219-
16220- if (AS == AMDGPUAS::FLAT_ADDRESS) {
16221- // gfx940, gfx12
16222- // FIXME: Needs to account for no fine-grained memory
16223- if (Subtarget->hasAtomicFlatPkAdd16Insts() && isHalf2OrBFloat2(Ty))
16224- return AtomicExpansionKind::None;
16225- } else if (AMDGPU::isExtendedGlobalAddrSpace(AS)) {
16226- // gfx90a, gfx940, gfx12
16227- // FIXME: Needs to account for no fine-grained memory
16228- if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isHalf2(Ty))
16229- return AtomicExpansionKind::None;
16230-
16231- // gfx940, gfx12
16232- // FIXME: Needs to account for no fine-grained memory
16233- if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isBFloat2(Ty))
16234- return AtomicExpansionKind::None;
16235- } else if (AS == AMDGPUAS::BUFFER_FAT_POINTER) {
16236- // gfx90a, gfx940, gfx12
16237- // FIXME: Needs to account for no fine-grained memory
16238- if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isHalf2(Ty))
16239- return AtomicExpansionKind::None;
16240-
16241- // While gfx90a/gfx940 supports v2bf16 for global/flat, it does not for
16242- // buffer. gfx12 does have the buffer version.
16243- if (Subtarget->hasAtomicBufferPkAddBF16Inst() && isBFloat2(Ty))
16244- return AtomicExpansionKind::None;
16245- }
16246-
16247- if (unsafeFPAtomicsDisabled(RMW->getFunction()))
16248- return AtomicExpansionKind::CmpXChg;
16249-
16250- // Always expand system scope fp atomics.
16251- if (HasSystemScope)
16208+ // LDS atomics respect the denormal mode from the mode register.
16209+ //
16210+ // Traditionally f32 global/buffer memory atomics would unconditionally
16211+ // flush denormals, but newer targets do not flush. f64/f16/bf16 cases never
16212+ // flush.
16213+ //
16214+ // On targets with flat atomic fadd, denormals would flush depending on
16215+ // whether the target address resides in LDS or global memory. We consider
16216+ // this flat-maybe-flush as will-flush.
16217+ if (Ty->isFloatTy() &&
16218+ !Subtarget->hasMemoryAtomicFaddF32DenormalSupport() &&
16219+ !atomicIgnoresDenormalModeOrFPModeIsFTZ(RMW))
1625216220 return AtomicExpansionKind::CmpXChg;
1625316221
16254- // global and flat atomic fadd f64: gfx90a, gfx940.
16255- if (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() && Ty->isDoubleTy())
16256- return ReportUnsafeHWInst(AtomicExpansionKind::None);
16222+ // FIXME: These ReportUnsafeHWInsts are imprecise. Some of these cases are
16223+ // safe. The message phrasing should also be better.
16224+ if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) {
16225+ if (AS == AMDGPUAS::FLAT_ADDRESS) {
16226+ // gfx940, gfx12
16227+ if (Subtarget->hasAtomicFlatPkAdd16Insts() && isHalf2OrBFloat2(Ty))
16228+ return ReportUnsafeHWInst(AtomicExpansionKind::None);
16229+ } else if (AMDGPU::isExtendedGlobalAddrSpace(AS)) {
16230+ // gfx90a, gfx940, gfx12
16231+ if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isHalf2(Ty))
16232+ return ReportUnsafeHWInst(AtomicExpansionKind::None);
1625716233
16258- if (AS != AMDGPUAS::FLAT_ADDRESS) {
16259- if (Ty->isFloatTy()) {
16260- // global/buffer atomic fadd f32 no-rtn: gfx908, gfx90a, gfx940, gfx11+.
16261- if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
16234+ // gfx940, gfx12
16235+ if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isBFloat2(Ty))
1626216236 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16263- // global/buffer atomic fadd f32 rtn: gfx90a, gfx940, gfx11+.
16264- if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
16237+ } else if (AS == AMDGPUAS::BUFFER_FAT_POINTER) {
16238+ // gfx90a, gfx940, gfx12
16239+ if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isHalf2(Ty))
1626516240 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16266- } else {
16267- // gfx908
16268- if (RMW->use_empty() &&
16269- Subtarget->hasAtomicBufferGlobalPkAddF16NoRtnInsts () && isHalf2 (Ty))
16241+
16242+ // While gfx90a/gfx940 supports v2bf16 for global/flat, it does not for
16243+ // buffer. gfx12 does have the buffer version.
16244+ if ( Subtarget->hasAtomicBufferPkAddBF16Inst () && isBFloat2 (Ty))
1627016245 return ReportUnsafeHWInst(AtomicExpansionKind::None);
1627116246 }
16272- }
1627316247
16274- // flat atomic fadd f32: gfx940, gfx11+.
16275- if (AS == AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) {
16276- if (Subtarget->hasFlatAtomicFaddF32Inst())
16248+ // global and flat atomic fadd f64: gfx90a, gfx940.
16249+ if (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() && Ty->isDoubleTy())
1627716250 return ReportUnsafeHWInst(AtomicExpansionKind::None);
1627816251
16279- // If it is in flat address space, and the type is float, we will try to
16280- // expand it, if the target supports global and lds atomic fadd. The
16281- // reason we need that is, in the expansion, we emit the check of address
16282- // space. If it is in global address space, we emit the global atomic
16283- // fadd; if it is in shared address space, we emit the LDS atomic fadd.
16284- if (Subtarget->hasLDSFPAtomicAddF32()) {
16285- if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
16286- return AtomicExpansionKind::Expand;
16287- if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
16288- return AtomicExpansionKind::Expand;
16252+ if (AS != AMDGPUAS::FLAT_ADDRESS) {
16253+ if (Ty->isFloatTy()) {
16254+ // global/buffer atomic fadd f32 no-rtn: gfx908, gfx90a, gfx940,
16255+ // gfx11+.
16256+ if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
16257+ return ReportUnsafeHWInst(AtomicExpansionKind::None);
16258+ // global/buffer atomic fadd f32 rtn: gfx90a, gfx940, gfx11+.
16259+ if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
16260+ return ReportUnsafeHWInst(AtomicExpansionKind::None);
16261+ } else {
16262+ // gfx908
16263+ if (RMW->use_empty() &&
16264+ Subtarget->hasAtomicBufferGlobalPkAddF16NoRtnInsts() &&
16265+ isHalf2(Ty))
16266+ return ReportUnsafeHWInst(AtomicExpansionKind::None);
16267+ }
16268+ }
16269+
16270+ // flat atomic fadd f32: gfx940, gfx11+.
16271+ if (AS == AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) {
16272+ if (Subtarget->hasFlatAtomicFaddF32Inst())
16273+ return ReportUnsafeHWInst(AtomicExpansionKind::None);
16274+
16275+ // If it is in flat address space, and the type is float, we will try to
16276+ // expand it, if the target supports global and lds atomic fadd. The
16277+ // reason we need that is, in the expansion, we emit the check of
16278+ // address space. If it is in global address space, we emit the global
16279+ // atomic fadd; if it is in shared address space, we emit the LDS atomic
16280+ // fadd.
16281+ if (Subtarget->hasLDSFPAtomicAddF32()) {
16282+ if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
16283+ return AtomicExpansionKind::Expand;
16284+ if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
16285+ return AtomicExpansionKind::Expand;
16286+ }
1628916287 }
1629016288 }
1629116289
0 commit comments