5858#include "llvm/IR/MDBuilder.h"
5959#include "llvm/IR/MatrixBuilder.h"
6060#include "llvm/IR/MemoryModelRelaxationAnnotations.h"
61+ #include "llvm/Support/AMDGPUAddrSpace.h"
6162#include "llvm/Support/ConvertUTF.h"
6263#include "llvm/Support/MathExtras.h"
6364#include "llvm/Support/ScopedPrinter.h"
@@ -18790,8 +18791,6 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
1879018791 Function *F = CGM.getIntrinsic(Intrin, { Src0->getType() });
1879118792 return Builder.CreateCall(F, { Src0, Builder.getFalse() });
1879218793 }
18793- case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
18794- case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
1879518794 case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
1879618795 case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
1879718796 case AMDGPU::BI__builtin_amdgcn_global_atomic_fmax_f64:
@@ -18803,18 +18802,11 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
1880318802 Intrinsic::ID IID;
1880418803 llvm::Type *ArgTy = llvm::Type::getDoubleTy(getLLVMContext());
1880518804 switch (BuiltinID) {
18806- case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
18807- ArgTy = llvm::Type::getFloatTy(getLLVMContext());
18808- IID = Intrinsic::amdgcn_global_atomic_fadd;
18809- break;
1881018805 case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
1881118806 ArgTy = llvm::FixedVectorType::get(
1881218807 llvm::Type::getHalfTy(getLLVMContext()), 2);
1881318808 IID = Intrinsic::amdgcn_global_atomic_fadd;
1881418809 break;
18815- case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
18816- IID = Intrinsic::amdgcn_global_atomic_fadd;
18817- break;
1881818810 case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
1881918811 IID = Intrinsic::amdgcn_global_atomic_fmin;
1882018812 break;
@@ -19237,7 +19229,9 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
1923719229 case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16:
1923819230 case AMDGPU::BI__builtin_amdgcn_ds_faddf:
1923919231 case AMDGPU::BI__builtin_amdgcn_ds_fminf:
19240- case AMDGPU::BI__builtin_amdgcn_ds_fmaxf: {
19232+ case AMDGPU::BI__builtin_amdgcn_ds_fmaxf:
19233+ case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
19234+ case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64: {
1924119235 llvm::AtomicRMWInst::BinOp BinOp;
1924219236 switch (BuiltinID) {
1924319237 case AMDGPU::BI__builtin_amdgcn_atomic_inc32:
@@ -19253,6 +19247,8 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
1925319247 case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f32:
1925419248 case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2f16:
1925519249 case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16:
19250+ case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
19251+ case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
1925619252 BinOp = llvm::AtomicRMWInst::FAdd;
1925719253 break;
1925819254 case AMDGPU::BI__builtin_amdgcn_ds_fminf:
@@ -19287,8 +19283,13 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
1928719283 ProcessOrderScopeAMDGCN(EmitScalarExpr(E->getArg(2)),
1928819284 EmitScalarExpr(E->getArg(3)), AO, SSID);
1928919285 } else {
19290- // The ds_atomic_fadd_* builtins do not have syncscope/order arguments.
19291- SSID = llvm::SyncScope::System;
19286+ // Most of the builtins do not have syncscope/order arguments. For DS
19287+ // atomics the scope doesn't really matter, as they implicitly operate at
19288+ // workgroup scope.
19289+ //
19290+ // The global/flat cases need to use agent scope to consistently produce
19291+ // the native instruction instead of a cmpxchg expansion.
19292+ SSID = getLLVMContext().getOrInsertSyncScopeID("agent");
1929219293 AO = AtomicOrdering::SequentiallyConsistent;
1929319294
1929419295 // The v2bf16 builtin uses i16 instead of a natural bfloat type.
@@ -19303,6 +19304,20 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
1930319304 Builder.CreateAtomicRMW(BinOp, Ptr, Val, AO, SSID);
1930419305 if (Volatile)
1930519306 RMW->setVolatile(true);
19307+
19308+ unsigned AddrSpace = Ptr.getType()->getAddressSpace();
19309+ if (AddrSpace != llvm::AMDGPUAS::LOCAL_ADDRESS) {
19310+ // Most targets require "amdgpu.no.fine.grained.memory" to emit the native
19311+ // instruction for flat and global operations.
19312+ llvm::MDTuple *EmptyMD = MDNode::get(getLLVMContext(), {});
19313+ RMW->setMetadata("amdgpu.no.fine.grained.memory", EmptyMD);
19314+
19315+ // Most targets require "amdgpu.ignore.denormal.mode" to emit the native
19316+ // instruction, but this only matters for float fadd.
19317+ if (BinOp == llvm::AtomicRMWInst::FAdd && Val->getType()->isFloatTy())
19318+ RMW->setMetadata("amdgpu.ignore.denormal.mode", EmptyMD);
19319+ }
19320+
1930619321 return Builder.CreateBitCast(RMW, OrigTy);
1930719322 }
1930819323 case AMDGPU::BI__builtin_amdgcn_s_sendmsg_rtn:
0 commit comments