@@ -58,6 +58,7 @@
 #include "llvm/IR/MDBuilder.h"
 #include "llvm/IR/MatrixBuilder.h"
 #include "llvm/IR/MemoryModelRelaxationAnnotations.h"
+#include "llvm/Support/AMDGPUAddrSpace.h"
 #include "llvm/Support/ConvertUTF.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/ScopedPrinter.h"
@@ -18632,8 +18633,6 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
     Function *F = CGM.getIntrinsic(Intrin, { Src0->getType() });
     return Builder.CreateCall(F, { Src0, Builder.getFalse() });
   }
-  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
-  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
   case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
   case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
   case AMDGPU::BI__builtin_amdgcn_global_atomic_fmax_f64:
@@ -18645,18 +18644,11 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
     Intrinsic::ID IID;
     llvm::Type *ArgTy = llvm::Type::getDoubleTy(getLLVMContext());
     switch (BuiltinID) {
-    case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
-      ArgTy = llvm::Type::getFloatTy(getLLVMContext());
-      IID = Intrinsic::amdgcn_global_atomic_fadd;
-      break;
     case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
       ArgTy = llvm::FixedVectorType::get(
           llvm::Type::getHalfTy(getLLVMContext()), 2);
       IID = Intrinsic::amdgcn_global_atomic_fadd;
       break;
-    case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
-      IID = Intrinsic::amdgcn_global_atomic_fadd;
-      break;
     case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
       IID = Intrinsic::amdgcn_global_atomic_fmin;
       break;
@@ -19071,7 +19063,9 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
   case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16:
   case AMDGPU::BI__builtin_amdgcn_ds_faddf:
   case AMDGPU::BI__builtin_amdgcn_ds_fminf:
-  case AMDGPU::BI__builtin_amdgcn_ds_fmaxf: {
+  case AMDGPU::BI__builtin_amdgcn_ds_fmaxf:
+  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
+  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64: {
     llvm::AtomicRMWInst::BinOp BinOp;
     switch (BuiltinID) {
     case AMDGPU::BI__builtin_amdgcn_atomic_inc32:
@@ -19087,6 +19081,8 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
     case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f32:
     case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2f16:
     case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16:
+    case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
+    case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
       BinOp = llvm::AtomicRMWInst::FAdd;
       break;
     case AMDGPU::BI__builtin_amdgcn_ds_fminf:
@@ -19121,8 +19117,13 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
       ProcessOrderScopeAMDGCN(EmitScalarExpr(E->getArg(2)),
                               EmitScalarExpr(E->getArg(3)), AO, SSID);
     } else {
-      // The ds_atomic_fadd_* builtins do not have syncscope/order arguments.
-      SSID = llvm::SyncScope::System;
+      // Most of the builtins do not have syncscope/order arguments. For DS
+      // atomics the scope doesn't really matter, as they implicitly operate at
+      // workgroup scope.
+      //
+      // The global/flat cases need to use agent scope to consistently produce
+      // the native instruction instead of a cmpxchg expansion.
+      SSID = getLLVMContext().getOrInsertSyncScopeID("agent");
       AO = AtomicOrdering::SequentiallyConsistent;
 
       // The v2bf16 builtin uses i16 instead of a natural bfloat type.
@@ -19137,6 +19138,20 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
         Builder.CreateAtomicRMW(BinOp, Ptr, Val, AO, SSID);
     if (Volatile)
       RMW->setVolatile(true);
+
+    unsigned AddrSpace = Ptr.getType()->getAddressSpace();
+    if (AddrSpace != llvm::AMDGPUAS::LOCAL_ADDRESS) {
+      // Most targets require "amdgpu.no.fine.grained.memory" to emit the native
+      // instruction for flat and global operations.
+      llvm::MDTuple *EmptyMD = MDNode::get(getLLVMContext(), {});
+      RMW->setMetadata("amdgpu.no.fine.grained.memory", EmptyMD);
+
+      // Most targets require "amdgpu.ignore.denormal.mode" to emit the native
+      // instruction, but this only matters for float fadd.
+      if (BinOp == llvm::AtomicRMWInst::FAdd && Val->getType()->isFloatTy())
+        RMW->setMetadata("amdgpu.ignore.denormal.mode", EmptyMD);
+    }
+
     return Builder.CreateBitCast(RMW, OrigTy);
   }
   case AMDGPU::BI__builtin_amdgcn_s_sendmsg_rtn:
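
With this change, the f32 and f64 global fadd builtins no longer go through the amdgcn_global_atomic_fadd intrinsic; they are emitted as a plain atomicrmw at agent scope, carrying the two metadata annotations added above. A minimal sketch of the effect from OpenCL C follows; the function name add_one and the operand value are illustrative, not part of the patch, and the IR in the comment is an approximation rather than verbatim compiler output:

// Hypothetical caller; any pointer in the global address space works.
float add_one(__global float *p) {
  return __builtin_amdgcn_global_atomic_fadd_f32(p, 1.0f);
}

// Roughly the IR now expected for the call (sketch):
//   %ret = atomicrmw fadd ptr addrspace(1) %p, float 1.000000e+00
//          syncscope("agent") seq_cst, align 4,
//          !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0

Agent scope together with !amdgpu.no.fine.grained.memory is what lets the backend select the native instruction: as the comments in the patch note, at system scope or without the metadata most targets must assume fine-grained host memory may be involved and expand the operation into a cmpxchg loop.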