 #include "llvm/IR/MDBuilder.h"
 #include "llvm/IR/MatrixBuilder.h"
 #include "llvm/IR/MemoryModelRelaxationAnnotations.h"
+#include "llvm/Support/AMDGPUAddrSpace.h"
 #include "llvm/Support/ConvertUTF.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/ScopedPrinter.h"
@@ -18654,8 +18655,6 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
     Src0 = Builder.CreatePointerBitCastOrAddrSpaceCast(Src0, PTy);
     return Builder.CreateCall(F, { Src0, Src1, Src2, Src3, Src4 });
   }
-  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
-  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
   case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
   case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
   case AMDGPU::BI__builtin_amdgcn_global_atomic_fmax_f64:
@@ -18667,18 +18666,11 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
     Intrinsic::ID IID;
     llvm::Type *ArgTy = llvm::Type::getDoubleTy(getLLVMContext());
     switch (BuiltinID) {
-    case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
-      ArgTy = llvm::Type::getFloatTy(getLLVMContext());
-      IID = Intrinsic::amdgcn_global_atomic_fadd;
-      break;
     case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
       ArgTy = llvm::FixedVectorType::get(
           llvm::Type::getHalfTy(getLLVMContext()), 2);
       IID = Intrinsic::amdgcn_global_atomic_fadd;
       break;
-    case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
-      IID = Intrinsic::amdgcn_global_atomic_fadd;
-      break;
     case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
       IID = Intrinsic::amdgcn_global_atomic_fmin;
       break;
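Taken together with the hunk above, this removes the intrinsic-based lowering for the f32 and f64 global fadd builtins; they now fall through to the generic atomicrmw path extended in the hunks below. A rough sketch of the effect (HIP/OpenCL-style source; the function name is illustrative and the intrinsic name is shown unmangled):

// Sketch: the pointer must be to global (addrspace(1)) memory, and the
// builtin requires an AMDGPU target with the relevant atomics support.
float global_fadd(__attribute__((address_space(1))) float *p, float v) {
  // Before this patch, roughly:
  //   %old = call float @llvm.amdgcn.global.atomic.fadd(ptr addrspace(1) %p,
  //                                                     float %v)
  // After this patch, roughly:
  //   %old = atomicrmw fadd ptr addrspace(1) %p, float %v
  //          syncscope("agent") seq_cst, align 4,
  //          !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
  return __builtin_amdgcn_global_atomic_fadd_f32(p, v);
}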
@@ -19091,7 +19083,9 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
   case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f64:
   case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f32:
   case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2f16:
-  case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16: {
+  case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16:
+  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
+  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64: {
     llvm::AtomicRMWInst::BinOp BinOp;
     switch (BuiltinID) {
     case AMDGPU::BI__builtin_amdgcn_atomic_inc32:
@@ -19107,6 +19101,8 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
     case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f32:
     case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2f16:
     case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16:
+    case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
+    case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
       BinOp = llvm::AtomicRMWInst::FAdd;
       break;
     }
@@ -19133,8 +19129,13 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
       ProcessOrderScopeAMDGCN(EmitScalarExpr(E->getArg(2)),
                               EmitScalarExpr(E->getArg(3)), AO, SSID);
     } else {
-      // The ds_atomic_fadd_* builtins do not have syncscope/order arguments.
-      SSID = llvm::SyncScope::System;
+      // Most of the builtins do not have syncscope/order arguments. For DS
+      // atomics the scope doesn't really matter, as they implicitly operate at
+      // workgroup scope.
+      //
+      // The global/flat cases need to use agent scope to consistently produce
+      // the native instruction instead of a cmpxchg expansion.
+      SSID = getLLVMContext().getOrInsertSyncScopeID("agent");
       AO = AtomicOrdering::SequentiallyConsistent;

       // The v2bf16 builtin uses i16 instead of a natural bfloat type.
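For context on the new comment: when the backend cannot prove the native FP atomic is usable, the AtomicExpand pass rewrites the atomicrmw into a compare-exchange loop. A C++ sketch of that generic expansion pattern (illustrative, not code from this patch):

#include <atomic>
#include <cstdint>
#include <cstring>

// Bit-level memcpy is used because the exchange must compare integer bits,
// not float values (NaN and -0.0 would otherwise misbehave).
static float atomic_fadd_via_cmpxchg(std::atomic<std::uint32_t> *addr,
                                     float v) {
  std::uint32_t expected = addr->load(std::memory_order_relaxed);
  std::uint32_t desired;
  do {
    float old;
    std::memcpy(&old, &expected, sizeof old);
    float sum = old + v;
    std::memcpy(&desired, &sum, sizeof desired);
    // On failure, compare_exchange_weak reloads `expected` and we retry.
  } while (!addr->compare_exchange_weak(expected, desired));
  float old;
  std::memcpy(&old, &expected, sizeof old);
  return old; // Previous value, matching atomicrmw semantics.
}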
@@ -19149,6 +19150,20 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
         Builder.CreateAtomicRMW(BinOp, Ptr, Val, AO, SSID);
     if (Volatile)
       RMW->setVolatile(true);
+
+    unsigned AddrSpace = Ptr.getType()->getAddressSpace();
+    if (AddrSpace != llvm::AMDGPUAS::LOCAL_ADDRESS) {
+      // Most targets require "amdgpu.no.fine.grained.memory" to emit the
+      // native instruction for flat and global operations.
+      llvm::MDTuple *EmptyMD = MDNode::get(getLLVMContext(), {});
+      RMW->setMetadata("amdgpu.no.fine.grained.memory", EmptyMD);
+
+      // Most targets require "amdgpu.ignore.denormal.mode" to emit the
+      // native instruction, but this only matters for float fadd.
+      if (BinOp == llvm::AtomicRMWInst::FAdd && Val->getType()->isFloatTy())
+        RMW->setMetadata("amdgpu.ignore.denormal.mode", EmptyMD);
+    }
+
     return Builder.CreateBitCast(RMW, OrigTy);
   }
   case AMDGPU::BI__builtin_amdgcn_s_sendmsg_rtn:
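On the consumer side, the backend presumably keys native-instruction selection off the presence of the (empty) metadata node; a minimal sketch of such a check (the helper name is illustrative, not from this patch):

#include "llvm/IR/Instructions.h"

// True if the atomicrmw carries the annotation added above; only the
// presence of the node matters, not its contents.
static bool hasNoFineGrainedMemory(const llvm::AtomicRMWInst &RMW) {
  return RMW.hasMetadata("amdgpu.no.fine.grained.memory");
}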