@@ -16075,56 +16075,50 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
1607516075 return AtomicExpansionKind::CmpXChg;
1607616076 }
1607716077
16078- if (!Ty->isFloatTy() && (!Subtarget->hasGFX90AInsts() || !Ty->isDoubleTy()))
16078+ if (!AMDGPU::isFlatGlobalAddrSpace(AS) &&
16079+ AS != AMDGPUAS::BUFFER_FAT_POINTER)
1607916080 return AtomicExpansionKind::CmpXChg;
1608016081
16081- if ((AMDGPU::isFlatGlobalAddrSpace(AS) ||
16082- AS == AMDGPUAS::BUFFER_FAT_POINTER) &&
16083- Subtarget->hasAtomicFaddNoRtnInsts()) {
16084- if (Subtarget->hasGFX940Insts())
16085- return AtomicExpansionKind::None;
16082+ // TODO: gfx940 supports v2f16 and v2bf16
16083+ if (Subtarget->hasGFX940Insts() && (Ty->isFloatTy() || Ty->isDoubleTy()))
16084+ return AtomicExpansionKind::None;
1608616085
16087- if (unsafeFPAtomicsDisabled(RMW->getFunction()))
16088- return AtomicExpansionKind::CmpXChg;
16086+ if (unsafeFPAtomicsDisabled(RMW->getFunction()))
16087+ return AtomicExpansionKind::CmpXChg;
1608916088
16090- // Always expand system scope fp atomics.
16091- if (HasSystemScope)
16092- return AtomicExpansionKind::CmpXChg;
16089+ // Always expand system scope fp atomics.
16090+ if (HasSystemScope)
16091+ return AtomicExpansionKind::CmpXChg;
1609316092
16094- if ((AMDGPU::isExtendedGlobalAddrSpace(AS) ||
16095- AS == AMDGPUAS::BUFFER_FAT_POINTER) &&
16096- Ty->isFloatTy()) {
16097- // global/buffer atomic fadd f32 no-rtn: gfx908, gfx90a, gfx940, gfx11+.
16098- if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
16099- return ReportUnsafeHWInst(AtomicExpansionKind::None);
16100- // global/buffer atomic fadd f32 rtn: gfx90a, gfx940, gfx11+.
16101- if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
16102- return ReportUnsafeHWInst(AtomicExpansionKind::None);
16103- }
16093+ // global and flat atomic fadd f64: gfx90a, gfx940.
16094+ if (Subtarget->hasGFX90AInsts() && Ty->isDoubleTy())
16095+ return ReportUnsafeHWInst(AtomicExpansionKind::None);
1610416096
16105- // flat atomic fadd f32: gfx940, gfx11+.
16106- if (AS == AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy() &&
16107- Subtarget->hasFlatAtomicFaddF32Inst())
16097+ if (AS != AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) {
16098+ // global/buffer atomic fadd f32 no-rtn: gfx908, gfx90a, gfx940, gfx11+.
16099+ if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
16100+ return ReportUnsafeHWInst(AtomicExpansionKind::None);
16101+ // global/buffer atomic fadd f32 rtn: gfx90a, gfx940, gfx11+.
16102+ if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
1610816103 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16104+ }
1610916105
16110- // global and flat atomic fadd f64: gfx90a, gfx940.
16111- if (Ty->isDoubleTy() && Subtarget->hasGFX90AInsts())
16106+ // flat atomic fadd f32: gfx940, gfx11+.
16107+ if (AS == AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) {
16108+ if (Subtarget->hasFlatAtomicFaddF32Inst())
1611216109 return ReportUnsafeHWInst(AtomicExpansionKind::None);
1611316110
1611416111 // If it is in flat address space, and the type is float, we will try to
1611516112 // expand it, if the target supports global and lds atomic fadd. The
1611616113 // reason we need that is, in the expansion, we emit the check of address
1611716114 // space. If it is in global address space, we emit the global atomic
1611816115 // fadd; if it is in shared address space, we emit the LDS atomic fadd.
16119- if (AS == AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy() &&
16120- Subtarget->hasLDSFPAtomicAddF32()) {
16116+ if (Subtarget->hasLDSFPAtomicAddF32()) {
1612116117 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
1612216118 return AtomicExpansionKind::Expand;
1612316119 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
1612416120 return AtomicExpansionKind::Expand;
1612516121 }
16126-
16127- return AtomicExpansionKind::CmpXChg;
1612816122 }
1612916123
1613016124 return AtomicExpansionKind::CmpXChg;
0 commit comments