5858#include "llvm/IR/MDBuilder.h"
5959#include "llvm/IR/MatrixBuilder.h"
6060#include "llvm/IR/MemoryModelRelaxationAnnotations.h"
61+ #include "llvm/Support/AMDGPUAddrSpace.h"
6162#include "llvm/Support/ConvertUTF.h"
6263#include "llvm/Support/MathExtras.h"
6364#include "llvm/Support/ScopedPrinter.h"
@@ -18743,8 +18744,6 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
1874318744 Function *F = CGM.getIntrinsic(Intrin, { Src0->getType() });
1874418745 return Builder.CreateCall(F, { Src0, Builder.getFalse() });
1874518746 }
18746- case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
18747- case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
1874818747 case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
1874918748 case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
1875018749 case AMDGPU::BI__builtin_amdgcn_global_atomic_fmax_f64:
@@ -18756,18 +18755,11 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
1875618755 Intrinsic::ID IID;
1875718756 llvm::Type *ArgTy = llvm::Type::getDoubleTy(getLLVMContext());
1875818757 switch (BuiltinID) {
18759- case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
18760- ArgTy = llvm::Type::getFloatTy(getLLVMContext());
18761- IID = Intrinsic::amdgcn_global_atomic_fadd;
18762- break;
1876318758 case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
1876418759 ArgTy = llvm::FixedVectorType::get(
1876518760 llvm::Type::getHalfTy(getLLVMContext()), 2);
1876618761 IID = Intrinsic::amdgcn_global_atomic_fadd;
1876718762 break;
18768- case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
18769- IID = Intrinsic::amdgcn_global_atomic_fadd;
18770- break;
1877118763 case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
1877218764 IID = Intrinsic::amdgcn_global_atomic_fmin;
1877318765 break;
@@ -19190,7 +19182,9 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
1919019182 case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16:
1919119183 case AMDGPU::BI__builtin_amdgcn_ds_faddf:
1919219184 case AMDGPU::BI__builtin_amdgcn_ds_fminf:
19193- case AMDGPU::BI__builtin_amdgcn_ds_fmaxf: {
19185+ case AMDGPU::BI__builtin_amdgcn_ds_fmaxf:
19186+ case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
19187+ case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64: {
1919419188 llvm::AtomicRMWInst::BinOp BinOp;
1919519189 switch (BuiltinID) {
1919619190 case AMDGPU::BI__builtin_amdgcn_atomic_inc32:
@@ -19206,6 +19200,8 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
1920619200 case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f32:
1920719201 case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2f16:
1920819202 case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16:
19203+ case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
19204+ case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
1920919205 BinOp = llvm::AtomicRMWInst::FAdd;
1921019206 break;
1921119207 case AMDGPU::BI__builtin_amdgcn_ds_fminf:
@@ -19240,8 +19236,13 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
1924019236 ProcessOrderScopeAMDGCN(EmitScalarExpr(E->getArg(2)),
1924119237 EmitScalarExpr(E->getArg(3)), AO, SSID);
1924219238 } else {
19243- // The ds_atomic_fadd_* builtins do not have syncscope/order arguments.
19244- SSID = llvm::SyncScope::System;
19239+ // Most of the builtins do not have syncscope/order arguments. For DS
19240+ // atomics the scope doesn't really matter, as they implicitly operate at
19241+ // workgroup scope.
19242+ //
19243+ // The global/flat cases need to use agent scope to consistently produce
19244+ // the native instruction instead of a cmpxchg expansion.
19245+ SSID = getLLVMContext().getOrInsertSyncScopeID("agent");
1924519246 AO = AtomicOrdering::SequentiallyConsistent;
1924619247
1924719248 // The v2bf16 builtin uses i16 instead of a natural bfloat type.
@@ -19256,6 +19257,20 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
1925619257 Builder.CreateAtomicRMW(BinOp, Ptr, Val, AO, SSID);
1925719258 if (Volatile)
1925819259 RMW->setVolatile(true);
19260+
19261+ unsigned AddrSpace = Ptr.getType()->getAddressSpace();
19262+ if (AddrSpace != llvm::AMDGPUAS::LOCAL_ADDRESS) {
19263+ // Most targets require "amdgpu.no.fine.grained.memory" to emit the native
19264+ // instruction for flat and global operations.
19265+ llvm::MDTuple *EmptyMD = MDNode::get(getLLVMContext(), {});
19266+ RMW->setMetadata("amdgpu.no.fine.grained.memory", EmptyMD);
19267+
19268+ // Most targets require "amdgpu.ignore.denormal.mode" to emit the native
19269+ // instruction, but this only matters for float fadd.
19270+ if (BinOp == llvm::AtomicRMWInst::FAdd && Val->getType()->isFloatTy())
19271+ RMW->setMetadata("amdgpu.ignore.denormal.mode", EmptyMD);
19272+ }
19273+
1925919274 return Builder.CreateBitCast(RMW, OrigTy);
1926019275 }
1926119276 case AMDGPU::BI__builtin_amdgcn_s_sendmsg_rtn:
0 commit comments