 #include "llvm/IR/MDBuilder.h"
 #include "llvm/IR/MatrixBuilder.h"
 #include "llvm/IR/MemoryModelRelaxationAnnotations.h"
+#include "llvm/Support/AMDGPUAddrSpace.h"
 #include "llvm/Support/ConvertUTF.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/ScopedPrinter.h"
@@ -18919,8 +18920,6 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
     Function *F = CGM.getIntrinsic(Intrin, { Src0->getType() });
     return Builder.CreateCall(F, { Src0, Builder.getFalse() });
   }
-  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
-  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
   case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
   case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
   case AMDGPU::BI__builtin_amdgcn_global_atomic_fmax_f64:
@@ -18932,18 +18931,11 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
     Intrinsic::ID IID;
     llvm::Type *ArgTy = llvm::Type::getDoubleTy(getLLVMContext());
     switch (BuiltinID) {
-    case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
-      ArgTy = llvm::Type::getFloatTy(getLLVMContext());
-      IID = Intrinsic::amdgcn_global_atomic_fadd;
-      break;
     case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
       ArgTy = llvm::FixedVectorType::get(
           llvm::Type::getHalfTy(getLLVMContext()), 2);
       IID = Intrinsic::amdgcn_global_atomic_fadd;
       break;
-    case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
-      IID = Intrinsic::amdgcn_global_atomic_fadd;
-      break;
     case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
       IID = Intrinsic::amdgcn_global_atomic_fmin;
       break;
@@ -19366,7 +19358,9 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
   case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16:
   case AMDGPU::BI__builtin_amdgcn_ds_faddf:
   case AMDGPU::BI__builtin_amdgcn_ds_fminf:
-  case AMDGPU::BI__builtin_amdgcn_ds_fmaxf: {
+  case AMDGPU::BI__builtin_amdgcn_ds_fmaxf:
+  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
+  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64: {
     llvm::AtomicRMWInst::BinOp BinOp;
     switch (BuiltinID) {
     case AMDGPU::BI__builtin_amdgcn_atomic_inc32:
@@ -19382,6 +19376,8 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
     case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f32:
     case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2f16:
     case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16:
+    case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
+    case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
       BinOp = llvm::AtomicRMWInst::FAdd;
       break;
     case AMDGPU::BI__builtin_amdgcn_ds_fminf:
@@ -19416,8 +19412,13 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
       ProcessOrderScopeAMDGCN(EmitScalarExpr(E->getArg(2)),
                               EmitScalarExpr(E->getArg(3)), AO, SSID);
     } else {
-      // The ds_atomic_fadd_* builtins do not have syncscope/order arguments.
-      SSID = llvm::SyncScope::System;
+      // Most of the builtins do not have syncscope/order arguments. For DS
+      // atomics the scope doesn't really matter, as they implicitly operate at
+      // workgroup scope.
+      //
+      // The global/flat cases need to use agent scope to consistently produce
+      // the native instruction instead of a cmpxchg expansion.
+      SSID = getLLVMContext().getOrInsertSyncScopeID("agent");
       AO = AtomicOrdering::SequentiallyConsistent;
 
       // The v2bf16 builtin uses i16 instead of a natural bfloat type.
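As a standalone illustration (not the patch itself) of what this else branch now builds, a minimal sketch against the raw llvm::IRBuilder API; note the raw CreateAtomicRMW overload takes an explicit alignment argument that clang's CGBuilder wrapper derives for you. Module and function names are invented:

```cpp
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

int main() {
  LLVMContext Ctx;
  Module M("sketch", Ctx);
  IRBuilder<> B(Ctx);

  // float @foo(ptr addrspace(1), float); addrspace(1) is the AMDGPU global
  // address space.
  FunctionType *FTy = FunctionType::get(
      B.getFloatTy(), {PointerType::get(Ctx, 1), B.getFloatTy()}, false);
  Function *F = Function::Create(FTy, Function::ExternalLinkage, "foo", M);
  B.SetInsertPoint(BasicBlock::Create(Ctx, "entry", F));

  // Agent scope + seq_cst: the defaults the patch picks for builtins that
  // carry no explicit syncscope/order arguments.
  SyncScope::ID Agent = Ctx.getOrInsertSyncScopeID("agent");
  AtomicRMWInst *RMW = B.CreateAtomicRMW(
      AtomicRMWInst::FAdd, F->getArg(0), F->getArg(1), MaybeAlign(),
      AtomicOrdering::SequentiallyConsistent, Agent);
  B.CreateRet(RMW);

  // Prints a function whose body is roughly:
  //   atomicrmw fadd ptr addrspace(1) %0, float %1 syncscope("agent") seq_cst
  M.print(outs(), nullptr);
  return 0;
}
```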
@@ -19432,6 +19433,20 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
         Builder.CreateAtomicRMW(BinOp, Ptr, Val, AO, SSID);
     if (Volatile)
       RMW->setVolatile(true);
+
+    unsigned AddrSpace = Ptr.getType()->getAddressSpace();
+    if (AddrSpace != llvm::AMDGPUAS::LOCAL_ADDRESS) {
+      // Most targets require "amdgpu.no.fine.grained.memory" to emit the native
+      // instruction for flat and global operations.
+      llvm::MDTuple *EmptyMD = MDNode::get(getLLVMContext(), {});
+      RMW->setMetadata("amdgpu.no.fine.grained.memory", EmptyMD);
+
+      // Most targets require "amdgpu.ignore.denormal.mode" to emit the native
+      // instruction, but this only matters for float fadd.
+      if (BinOp == llvm::AtomicRMWInst::FAdd && Val->getType()->isFloatTy())
+        RMW->setMetadata("amdgpu.ignore.denormal.mode", EmptyMD);
+    }
+
     return Builder.CreateBitCast(RMW, OrigTy);
   }
   case AMDGPU::BI__builtin_amdgcn_s_sendmsg_rtn:
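The metadata half of the change can be exercised the same way. Continuing the hypothetical IRBuilder sketch above (these lines would go just before B.CreateRet(RMW)), mirroring the guard the patch adds:

```cpp
  // Hypothetical continuation of the earlier sketch. An empty tuple is all
  // the metadata carries; its presence is what matters.
  MDNode *EmptyMD = MDNode::get(Ctx, {});
  RMW->setMetadata("amdgpu.no.fine.grained.memory", EmptyMD);

  // The denormal-mode marker is only attached for f32 fadd; an f64 fadd
  // would get only the fine-grained-memory marker.
  if (RMW->getOperation() == AtomicRMWInst::FAdd &&
      RMW->getValOperand()->getType()->isFloatTy())
    RMW->setMetadata("amdgpu.ignore.denormal.mode", EmptyMD);
```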