@@ -58,6 +58,7 @@
 #include "llvm/IR/MDBuilder.h"
 #include "llvm/IR/MatrixBuilder.h"
 #include "llvm/IR/MemoryModelRelaxationAnnotations.h"
+#include "llvm/Support/AMDGPUAddrSpace.h"
 #include "llvm/Support/ConvertUTF.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/ScopedPrinter.h"
@@ -18776,8 +18777,6 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
     Function *F = CGM.getIntrinsic(Intrin, { Src0->getType() });
     return Builder.CreateCall(F, { Src0, Builder.getFalse() });
   }
-  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
-  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
   case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
   case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
   case AMDGPU::BI__builtin_amdgcn_global_atomic_fmax_f64:
@@ -18789,18 +18788,11 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
     Intrinsic::ID IID;
     llvm::Type *ArgTy = llvm::Type::getDoubleTy(getLLVMContext());
     switch (BuiltinID) {
-    case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
-      ArgTy = llvm::Type::getFloatTy(getLLVMContext());
-      IID = Intrinsic::amdgcn_global_atomic_fadd;
-      break;
     case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
       ArgTy = llvm::FixedVectorType::get(
           llvm::Type::getHalfTy(getLLVMContext()), 2);
       IID = Intrinsic::amdgcn_global_atomic_fadd;
       break;
-    case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
-      IID = Intrinsic::amdgcn_global_atomic_fadd;
-      break;
     case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
       IID = Intrinsic::amdgcn_global_atomic_fmin;
       break;
@@ -19223,7 +19215,9 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
   case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16:
   case AMDGPU::BI__builtin_amdgcn_ds_faddf:
   case AMDGPU::BI__builtin_amdgcn_ds_fminf:
-  case AMDGPU::BI__builtin_amdgcn_ds_fmaxf: {
+  case AMDGPU::BI__builtin_amdgcn_ds_fmaxf:
+  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
+  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64: {
     llvm::AtomicRMWInst::BinOp BinOp;
     switch (BuiltinID) {
     case AMDGPU::BI__builtin_amdgcn_atomic_inc32:
@@ -19239,6 +19233,8 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
     case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f32:
     case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2f16:
     case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16:
+    case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
+    case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
       BinOp = llvm::AtomicRMWInst::FAdd;
       break;
     case AMDGPU::BI__builtin_amdgcn_ds_fminf:
@@ -19273,8 +19269,13 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
       ProcessOrderScopeAMDGCN(EmitScalarExpr(E->getArg(2)),
                               EmitScalarExpr(E->getArg(3)), AO, SSID);
     } else {
-      // The ds_atomic_fadd_* builtins do not have syncscope/order arguments.
-      SSID = llvm::SyncScope::System;
+      // Most of the builtins do not have syncscope/order arguments. For DS
+      // atomics the scope doesn't really matter, as they implicitly operate at
+      // workgroup scope.
+      //
+      // The global/flat cases need to use agent scope to consistently produce
+      // the native instruction instead of a cmpxchg expansion.
+      SSID = getLLVMContext().getOrInsertSyncScopeID("agent");
       AO = AtomicOrdering::SequentiallyConsistent;

       // The v2bf16 builtin uses i16 instead of a natural bfloat type.
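
The new comment above carries the key reasoning: DS atomics implicitly act at workgroup scope, while the global/flat builtins must be agent-scoped so the backend selects the native instruction rather than a cmpxchg expansion loop. As a minimal standalone sketch of the same pattern outside of clang (the helper name buildAgentScopedFAdd and the bare-IRBuilder setup are illustrative assumptions, not part of this patch):

// Sketch: build an agent-scoped atomicrmw fadd with the plain IRBuilder API.
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"

using namespace llvm;

static AtomicRMWInst *buildAgentScopedFAdd(IRBuilder<> &Builder, Value *Ptr,
                                           Value *Val) {
  // Scopes narrower than the default SyncScope::System are interned by name
  // on the LLVMContext; "agent" (device scope) is what lets the AMDGPU
  // backend pick the hardware global/flat atomic instead of a CAS loop.
  SyncScope::ID AgentSSID =
      Builder.getContext().getOrInsertSyncScopeID("agent");
  return Builder.CreateAtomicRMW(AtomicRMWInst::FAdd, Ptr, Val, MaybeAlign(),
                                 AtomicOrdering::SequentiallyConsistent,
                                 AgentSSID);
}
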
@@ -19289,6 +19290,20 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
         Builder.CreateAtomicRMW(BinOp, Ptr, Val, AO, SSID);
     if (Volatile)
       RMW->setVolatile(true);
+
+    unsigned AddrSpace = Ptr.getType()->getAddressSpace();
+    if (AddrSpace != llvm::AMDGPUAS::LOCAL_ADDRESS) {
+      // Most targets require "amdgpu.no.fine.grained.memory" to emit the native
+      // instruction for flat and global operations.
+      llvm::MDTuple *EmptyMD = MDNode::get(getLLVMContext(), {});
+      RMW->setMetadata("amdgpu.no.fine.grained.memory", EmptyMD);
+
+      // Most targets require "amdgpu.ignore.denormal.mode" to emit the native
+      // instruction, but this only matters for float fadd.
+      if (BinOp == llvm::AtomicRMWInst::FAdd && Val->getType()->isFloatTy())
+        RMW->setMetadata("amdgpu.ignore.denormal.mode", EmptyMD);
+    }
+
     return Builder.CreateBitCast(RMW, OrigTy);
   }
   case AMDGPU::BI__builtin_amdgcn_s_sendmsg_rtn:
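
The metadata added in the last hunk acts as a pair of boolean markers: an empty MDNode whose presence the AMDGPU backend checks before it will emit the hardware atomic for non-LDS address spaces. A minimal sketch of the same marking in isolation (the helper name markForNativeSelection is hypothetical; the metadata kind strings match the patch above):

// Sketch: attach the two marker metadata nodes to an existing atomicrmw.
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Metadata.h"

using namespace llvm;

static void markForNativeSelection(AtomicRMWInst *RMW) {
  MDNode *EmptyMD = MDNode::get(RMW->getContext(), {});
  // Promises the memory is not a fine-grained host allocation that another
  // agent may access concurrently, making the native instruction legal.
  RMW->setMetadata("amdgpu.no.fine.grained.memory", EmptyMD);
  // Denormal flushing only changes the result for f32 fadd, so the second
  // marker is applied only in that case.
  if (RMW->getOperation() == AtomicRMWInst::FAdd &&
      RMW->getValOperand()->getType()->isFloatTy())
    RMW->setMetadata("amdgpu.ignore.denormal.mode", EmptyMD);
}
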