ROCm
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
Lines changed: 8 additions & 11 deletions b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
Lines changed: 8 additions & 11 deletions
diff --git a/clang/test/OpenMP/amdgcn_target_fast_fp_apu.cpp
Lines changed: 3 additions & 3 deletions b/clang/test/OpenMP/amdgcn_target_fast_fp_apu.cpp
Lines changed: 3 additions & 3 deletions
diff --git a/clang/test/OpenMP/amdgcn_usm_atomics_hint.cpp
Lines changed: 3 additions & 3 deletions b/clang/test/OpenMP/amdgcn_usm_atomics_hint.cpp
Lines changed: 3 additions & 3 deletions
diff --git a/llvm/docs/ReleaseNotes.rst
Lines changed: 5 additions & 0 deletions b/llvm/docs/ReleaseNotes.rst
Lines changed: 5 additions & 0 deletions
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
Lines changed: 0 additions & 3 deletions b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
Lines changed: 0 additions & 3 deletions
diff --git a/llvm/lib/IR/AutoUpgrade.cpp
Lines changed: 2 additions & 2 deletions b/llvm/lib/IR/AutoUpgrade.cpp
Lines changed: 2 additions & 2 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
Lines changed: 0 additions & 5 deletions b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
Lines changed: 0 additions & 5 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
Lines changed: 0 additions & 2 deletions b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
Lines changed: 0 additions & 2 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
Lines changed: 0 additions & 2 deletions b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
Lines changed: 0 additions & 2 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
Lines changed: 0 additions & 2 deletions b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
Lines changed: 0 additions & 2 deletions
@@ -2890,23 +2890,21 @@ CGOpenMPRuntimeGPU::emitFastFPAtomicCall(CodeGenFunction &CGF, LValue X,
                                          RValue Update, BinaryOperatorKind BO,
                                          bool IsXBinopExpr) {
   CGBuilderTy &Bld = CGF.Builder;
-  unsigned int IID = -1;
+  llvm::AtomicRMWInst::BinOp Kind = llvm::AtomicRMWInst::FAdd;
   RValue UpdateFixed = Update;
   switch (BO) {
   case BO_Sub:
     UpdateFixed = RValue::get(Bld.CreateFNeg(Update.getScalarVal()));
-    IID = llvm::Intrinsic::amdgcn_flat_atomic_fadd;
+    Kind = llvm::AtomicRMWInst::FAdd;
     break;
   case BO_Add:
-    IID = llvm::Intrinsic::amdgcn_flat_atomic_fadd;
+    Kind = llvm::AtomicRMWInst::FAdd;
     break;
   case BO_LT:
-    IID = IsXBinopExpr ? llvm::Intrinsic::amdgcn_flat_atomic_fmax
-                       : llvm::Intrinsic::amdgcn_flat_atomic_fmin;
+    Kind = IsXBinopExpr ? llvm::AtomicRMWInst::FMax : llvm::AtomicRMWInst::FMin;
     break;
   case BO_GT:
-    IID = IsXBinopExpr ? llvm::Intrinsic::amdgcn_flat_atomic_fmin
-                       : llvm::Intrinsic::amdgcn_flat_atomic_fmax;
+    Kind = IsXBinopExpr ? llvm::AtomicRMWInst::FMin : llvm::AtomicRMWInst::FMax;
     break;
   default:
     // remaining operations are not supported yet
@@ -2930,10 +2928,9 @@ CGOpenMPRuntimeGPU::emitFastFPAtomicCall(CodeGenFunction &CGF, LValue X,
                                 CGM.getModule(), OMPRTL___kmpc_unsafeAtomicAdd),
                             FPAtomicArgs);
   } else {
-    llvm::Function *AtomicF = CGM.getIntrinsic(
-        IID, {FPAtomicArgs[1]->getType(), FPAtomicArgs[0]->getType(),
-              FPAtomicArgs[1]->getType()});
-    CallInst = CGF.EmitNounwindRuntimeCall(AtomicF, FPAtomicArgs);
+    CallInst =
+        Bld.CreateAtomicRMW(Kind, X.getAddress(), FPAtomicArgs[1],
+                            llvm::AtomicOrdering::SequentiallyConsistent);
   }
   return std::make_pair(true, RValue::get(CallInst));
 }
 
@@ -52,7 +52,7 @@ int main(){
 // CHECK-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
 // CHECK-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
 // CHECK:       user_code.entry:
-// CHECK-NEXT:    [[TMP2:%.*]] = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr [[TMP0]], float 1.000000e+00) #[[ATTR2:[0-9]+]]
+// CHECK-NEXT:    [[TMP2:%.*]] = atomicrmw fadd ptr [[TMP0]], float 1.000000e+00 seq_cst, align 4
 // CHECK-NEXT:    call void @__kmpc_target_deinit()
 // CHECK-NEXT:    ret void
 // CHECK:       worker.exit:
@@ -73,7 +73,7 @@ int main(){
 // CHECK-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
 // CHECK-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
 // CHECK:       user_code.entry:
-// CHECK-NEXT:    [[TMP2:%.*]] = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr [[TMP0]], float 1.000000e+00) #[[ATTR2]]
+// CHECK-NEXT:    [[TMP2:%.*]] = atomicrmw fadd ptr [[TMP0]], float 1.000000e+00 seq_cst, align 4
 // CHECK-NEXT:    call void @__kmpc_target_deinit()
 // CHECK-NEXT:    ret void
 // CHECK:       worker.exit:
@@ -94,7 +94,7 @@ int main(){
 // CHECK-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
 // CHECK-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
 // CHECK:       user_code.entry:
-// CHECK-NEXT:    [[TMP2:%.*]] = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr [[TMP0]], float 1.000000e+00) #[[ATTR2]]
+// CHECK-NEXT:    [[TMP2:%.*]] = atomicrmw fadd ptr [[TMP0]], float 1.000000e+00 seq_cst, align 4
 // CHECK-NEXT:    call void @__kmpc_target_deinit()
 // CHECK-NEXT:    ret void
 // CHECK:       worker.exit:
 
@@ -26,7 +26,7 @@ double test_amdgcn_target_atomic_hints() {
 
   #pragma omp target teams distribute parallel for map(tofrom:a,b)
   for (int i = 0; i < N; i++) {
-    // CHECK-HINTS: call {{.*}} @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64
+    // CHECK-HINTS: = atomicrmw fadd
     #pragma omp atomic hint(amd_fast_fp_atomics)
     a+=(double)i;
 
@@ -49,11 +49,11 @@ double test_amdgcn_target_atomic_unsafe_opt() {
 
   #pragma omp target teams distribute parallel for map(tofrom:a,b,c)
   for (int i = 0; i < N; i++) {
-    // CHECK-FLAG-UNSAFE: call {{.*}} @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64
+    // CHECK-FLAG-UNSAFE: = atomicrmw fadd
     #pragma omp atomic
     a+=(double)i;
 
-    // CHECK-FLAG-UNSAFE: call {{.*}} @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64
+    // CHECK-FLAG-UNSAFE: = atomicrmw fadd
     #pragma omp atomic hint(amd_fast_fp_atomics)
     b+=(double)i;
 
 
@@ -78,6 +78,11 @@ Changes to the AArch64 Backend
 Changes to the AMDGPU Backend
 -----------------------------
 
+* Removed ``llvm.amdgcn.flat.atomic.fadd`` and
+  ``llvm.amdgcn.global.atomic.fadd`` intrinsics. Users should use the
+  :ref:`atomicrmw <i_atomicrmw>` instruction with ``fadd`` and
+  addrspace(0) or addrspace(1) instead.
+
 Changes to the ARM Backend
 --------------------------
 
 
@@ -3027,8 +3027,6 @@ def int_amdgcn_dot4_f32_bf8_bf8 : AMDGPU8bitFloatDot4Intrinsic;
 // gfx908 intrinsics
 // ===----------------------------------------------------------------------===//
 
-def int_amdgcn_global_atomic_fadd : AMDGPUAtomicRtn<llvm_anyfloat_ty>;
-
 // llvm.amdgcn.mfma.*.* vdst, srcA, srcB, srcC, cbsz, abid, blgp
 class AMDGPUMfmaIntrinsic<LLVMType DestTy, LLVMType SrcABTy> :
   ClangBuiltin<!subst("int", "__builtin", NAME)>,
@@ -3067,7 +3065,6 @@ def int_amdgcn_mfma_f32_16x16x8bf16 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty,  llvm_v
 
 def int_amdgcn_global_atomic_fmin : AMDGPUAtomicRtn<llvm_anyfloat_ty>;
 def int_amdgcn_global_atomic_fmax : AMDGPUAtomicRtn<llvm_anyfloat_ty>;
-def int_amdgcn_flat_atomic_fadd   : AMDGPUAtomicRtn<llvm_anyfloat_ty>;
 def int_amdgcn_flat_atomic_fmin   : AMDGPUAtomicRtn<llvm_anyfloat_ty>;
 def int_amdgcn_flat_atomic_fmax   : AMDGPUAtomicRtn<llvm_anyfloat_ty>;
 
 
@@ -1035,8 +1035,8 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn,
 
       if (Name.starts_with("ds.fadd") || Name.starts_with("ds.fmin") ||
           Name.starts_with("ds.fmax") ||
-          Name.starts_with("global.atomic.fadd.v2bf16") ||
-          Name.starts_with("flat.atomic.fadd.v2bf16")) {
+          Name.starts_with("global.atomic.fadd") ||
+          Name.starts_with("flat.atomic.fadd")) {
         // Replaced with atomicrmw fadd/fmin/fmax, so there's no new
         // declaration.
         NewFn = nullptr;
 
@@ -618,16 +618,11 @@ multiclass local_addr_space_atomic_op {
     }
 }
 
-defm int_amdgcn_flat_atomic_fadd : noret_op;
-defm int_amdgcn_flat_atomic_fadd : flat_addr_space_atomic_op;
 defm int_amdgcn_flat_atomic_fmin : noret_op;
 defm int_amdgcn_flat_atomic_fmax : noret_op;
-defm int_amdgcn_global_atomic_fadd : global_addr_space_atomic_op;
-defm int_amdgcn_flat_atomic_fadd : global_addr_space_atomic_op;
 defm int_amdgcn_global_atomic_fmin : noret_op;
 defm int_amdgcn_global_atomic_fmax : noret_op;
 defm int_amdgcn_global_atomic_csub : noret_op;
-defm int_amdgcn_flat_atomic_fadd : local_addr_space_atomic_op;
 defm int_amdgcn_global_atomic_ordered_add_b64 : noret_op;
 defm int_amdgcn_flat_atomic_fmin_num : noret_op;
 defm int_amdgcn_flat_atomic_fmax_num : noret_op;
 
@@ -4914,13 +4914,11 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
       break;
     }
-    case Intrinsic::amdgcn_global_atomic_fadd:
     case Intrinsic::amdgcn_global_atomic_csub:
     case Intrinsic::amdgcn_global_atomic_fmin:
     case Intrinsic::amdgcn_global_atomic_fmax:
     case Intrinsic::amdgcn_global_atomic_fmin_num:
     case Intrinsic::amdgcn_global_atomic_fmax_num:
-    case Intrinsic::amdgcn_flat_atomic_fadd:
     case Intrinsic::amdgcn_flat_atomic_fmin:
     case Intrinsic::amdgcn_flat_atomic_fmax:
     case Intrinsic::amdgcn_flat_atomic_fmin_num:
 
@@ -239,13 +239,11 @@ def : SourceOfDivergence<int_r600_read_tidig_y>;
 def : SourceOfDivergence<int_r600_read_tidig_z>;
 def : SourceOfDivergence<int_amdgcn_atomic_cond_sub_u32>;
 def : SourceOfDivergence<int_amdgcn_global_atomic_csub>;
-def : SourceOfDivergence<int_amdgcn_global_atomic_fadd>;
 def : SourceOfDivergence<int_amdgcn_global_atomic_fmin>;
 def : SourceOfDivergence<int_amdgcn_global_atomic_fmax>;
 def : SourceOfDivergence<int_amdgcn_global_atomic_fmin_num>;
 def : SourceOfDivergence<int_amdgcn_global_atomic_fmax_num>;
 def : SourceOfDivergence<int_amdgcn_global_atomic_ordered_add_b64>;
-def : SourceOfDivergence<int_amdgcn_flat_atomic_fadd>;
 def : SourceOfDivergence<int_amdgcn_flat_atomic_fmin>;
 def : SourceOfDivergence<int_amdgcn_flat_atomic_fmax>;
 def : SourceOfDivergence<int_amdgcn_flat_atomic_fmin_num>;
 
@@ -1045,7 +1045,6 @@ bool GCNTTIImpl::collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
   switch (IID) {
   case Intrinsic::amdgcn_is_shared:
   case Intrinsic::amdgcn_is_private:
-  case Intrinsic::amdgcn_flat_atomic_fadd:
   case Intrinsic::amdgcn_flat_atomic_fmax:
   case Intrinsic::amdgcn_flat_atomic_fmin:
   case Intrinsic::amdgcn_flat_atomic_fmax_num:
@@ -1107,7 +1106,6 @@ Value *GCNTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
     return B.CreateIntrinsic(Intrinsic::ptrmask, {NewV->getType(), MaskTy},
                              {NewV, MaskOp});
   }
-  case Intrinsic::amdgcn_flat_atomic_fadd:
   case Intrinsic::amdgcn_flat_atomic_fmax:
   case Intrinsic::amdgcn_flat_atomic_fmin:
   case Intrinsic::amdgcn_flat_atomic_fmax_num: