ROCm
diff --git a/‎clang/docs/LanguageExtensions.rst‎
Lines changed: 46 additions & 0 deletions b/‎clang/docs/LanguageExtensions.rst‎
Lines changed: 46 additions & 0 deletions
diff --git a/‎clang/include/clang/Basic/BuiltinsAMDGPU.def‎
Lines changed: 3 additions & 0 deletions b/‎clang/include/clang/Basic/BuiltinsAMDGPU.def‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎clang/include/clang/Sema/SemaAMDGPU.h‎
Lines changed: 2 additions & 0 deletions b/‎clang/include/clang/Sema/SemaAMDGPU.h‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎clang/lib/CodeGen/CGBuiltin.cpp‎
Lines changed: 20 additions & 0 deletions b/‎clang/lib/CodeGen/CGBuiltin.cpp‎
Lines changed: 20 additions & 0 deletions
diff --git a/‎clang/lib/Sema/SemaAMDGPU.cpp‎
Lines changed: 16 additions & 0 deletions b/‎clang/lib/Sema/SemaAMDGPU.cpp‎
Lines changed: 16 additions & 0 deletions
diff --git a/‎clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-store.cl‎
Lines changed: 99 additions & 0 deletions b/‎clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-store.cl‎
Lines changed: 99 additions & 0 deletions
diff --git a/‎clang/test/SemaOpenCL/builtins-amdgcn-global-load-store-error.cl‎
Lines changed: 22 additions & 0 deletions b/‎clang/test/SemaOpenCL/builtins-amdgcn-global-load-store-error.cl‎
Lines changed: 22 additions & 0 deletions
diff --git a/‎clang/test/SemaOpenCL/builtins-amdgcn-global-load-store-target-error.cl‎
Lines changed: 30 additions & 0 deletions b/‎clang/test/SemaOpenCL/builtins-amdgcn-global-load-store-target-error.cl‎
Lines changed: 30 additions & 0 deletions
diff --git a/‎llvm/docs/AMDGPUUsage.rst‎
Lines changed: 79 additions & 0 deletions b/‎llvm/docs/AMDGPUUsage.rst‎
Lines changed: 79 additions & 0 deletions
diff --git a/‎llvm/include/llvm/CodeGen/GlobalISel/GIMatchTableExecutor.h‎
Lines changed: 6 additions & 0 deletions b/‎llvm/include/llvm/CodeGen/GlobalISel/GIMatchTableExecutor.h‎
Lines changed: 6 additions & 0 deletions
@@ -4901,6 +4901,52 @@ for a concrete target, and shall reflect the latter's identity and features.
 Thus, it is possible to author high-level code, in e.g. HIP, that is target
 adaptive in a dynamic fashion, contrary to macro based mechanisms.
 
+__builtin_amdgcn_global_load_b128 and __builtin_amdgcn_global_store_b128
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Signature:
+
+.. code-block:: c
+
+    typedef __attribute__((__vector_size__(4 * sizeof(unsigned int)))) unsigned int v4u;
+    typedef v4u __attribute__((address_space(1))) *global_ptr_to_v4u;
+
+    v4u __builtin_amdgcn_global_load_b128(
+       v4u __attribute__((address_space(1))) *src,
+       const char                            *scope);
+
+    void __builtin_amdgcn_global_store_b128(
+       v4u __attribute__((address_space(1))) *dst,
+       v4u                                    data,
+       const char                            *scope);
+
+Load or store a vector of 4 unsigned integers from or to global memory with
+cache behavior specified by ``scope``, which must be a string literal.
+
+Valid values for ``scope`` are:
+
+===================== ==========================================================
+scope                 architecture name
+===================== ==========================================================
+``"wavefront"``       wave
+
+``"workgroup"``       group
+
+``"agent"``           device
+
+``""`` (empty string) system
+===================== ==========================================================
+
+These builtins require the ``gfx940-insts`` target feature, which is available on gfx942 and gfx950 devices.
+
+For semantics on gfx942, see Tables 47 and 48 in section 9.1.10 "Memory Scope
+and Temporal Controls" of the "AMD Instinct MI300" Instruction Set Architecture
+Reference.
+
+For semantics on gfx950, see Tables 49 and 50 in section 9.1.10 "Memory Scope
+and Temporal Controls" of the CDNA4 Instruction Set Architecture Reference.
+
+
 ARM/AArch64 Language Extensions
 -------------------------------
 
 
@@ -165,6 +165,9 @@ BUILTIN(__builtin_amdgcn_raw_buffer_load_b128, "V4UiQbiiIi", "n")
 
 TARGET_BUILTIN(__builtin_amdgcn_raw_ptr_buffer_load_lds, "vQbv*3IUiiiIiIi", "t", "vmem-to-lds-load-insts")
 
+TARGET_BUILTIN(__builtin_amdgcn_global_load_b128, "V4UiV4Ui*1cC*", "n", "gfx940-insts")
+TARGET_BUILTIN(__builtin_amdgcn_global_store_b128, "vV4Ui*1V4UicC*", "n", "gfx940-insts")
+
 //===----------------------------------------------------------------------===//
 // Ballot builtins.
 //===----------------------------------------------------------------------===//
 
@@ -30,6 +30,8 @@ class SemaAMDGPU : public SemaBase {
 
   bool CheckAMDGCNBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall);
 
+  bool checkScopedMemAccessFunctionCall(CallExpr *TheCall);
+
   bool checkMovDPPFunctionCall(CallExpr *TheCall, unsigned NumArgs,
                                unsigned NumDataArgs);
 
 
@@ -20116,6 +20116,26 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
     llvm::Function *F = CGM.getIntrinsic(IID, {LoadTy});
     return Builder.CreateCall(F, {Addr});
   }
+  case AMDGPU::BI__builtin_amdgcn_global_load_b128:
+  case AMDGPU::BI__builtin_amdgcn_global_store_b128: {
+    const bool IsStore =
+        BuiltinID == AMDGPU::BI__builtin_amdgcn_global_store_b128;
+    LLVMContext &Ctx = CGM.getLLVMContext();
+    SmallVector<Value *, 5> Args = {EmitScalarExpr(E->getArg(0))}; // addr
+    if (IsStore)
+      Args.push_back(EmitScalarExpr(E->getArg(1))); // data
+    const unsigned ScopeIdx = E->getNumArgs() - 1;
+    StringRef ScopeLit =
+        cast<StringLiteral>(E->getArg(ScopeIdx)->IgnoreParenCasts())
+            ->getString();
+    llvm::MDNode *MD =
+        llvm::MDNode::get(Ctx, {llvm::MDString::get(Ctx, ScopeLit)});
+    Args.push_back(llvm::MetadataAsValue::get(Ctx, MD)); // scope
+    llvm::Function *F =
+        CGM.getIntrinsic(IsStore ? Intrinsic::amdgcn_global_store_b128
+                                 : Intrinsic::amdgcn_global_load_b128);
+    return Builder.CreateCall(F, Args);
+  }
   case AMDGPU::BI__builtin_amdgcn_get_fpenv: {
     Function *F = CGM.getIntrinsic(Intrinsic::get_fpenv,
                                    {llvm::Type::getInt64Ty(getLLVMContext())});
 
@@ -84,6 +84,9 @@ bool SemaAMDGPU::CheckAMDGCNBuiltinFunctionCall(unsigned BuiltinID,
   case AMDGPU::BI__builtin_amdgcn_update_dpp: {
     return checkMovDPPFunctionCall(TheCall, 6, 2);
   }
+  case AMDGPU::BI__builtin_amdgcn_global_load_b128:
+  case AMDGPU::BI__builtin_amdgcn_global_store_b128:
+    return checkScopedMemAccessFunctionCall(TheCall);
   default:
     return false;
   }
@@ -129,6 +132,19 @@ bool SemaAMDGPU::CheckAMDGCNBuiltinFunctionCall(unsigned BuiltinID,
   return false;
 }
 
+bool SemaAMDGPU::checkScopedMemAccessFunctionCall(CallExpr *TheCall) {
+  // The scope operand is always the trailing argument. It has to be spelled
+  // as a string literal; parentheses and casts around it are ignored.
+  Expr *ScopeArg = TheCall->getArg(TheCall->getNumArgs() - 1);
+  if (isa<StringLiteral>(ScopeArg->IgnoreParenCasts()))
+    return false;
+  // Anything else is diagnosed at the start of the call, with the offending
+  // argument's range attached, and reported as a failure.
+  Diag(TheCall->getBeginLoc(), diag::err_expr_not_string_literal)
+      << ScopeArg->getSourceRange();
+  return true;
+}
+
 bool SemaAMDGPU::checkMovDPPFunctionCall(CallExpr *TheCall, unsigned NumArgs,
                                          unsigned NumDataArgs) {
   assert(NumDataArgs <= 2);
 
@@ -0,0 +1,99 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-globals smart
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx950         -emit-llvm -o - %s | FileCheck %s -check-prefixes=GFX,GFX950
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx9-4-generic -emit-llvm -o - %s | FileCheck %s -check-prefixes=GFX,GFX9_4_GENERIC
+
+typedef __attribute__((__vector_size__(4 * sizeof(unsigned int)))) unsigned int v4u32;
+typedef v4u32 __global *global_ptr_to_v4u32;
+
+//------------------------------------------------------------------------------
+// Store
+//------------------------------------------------------------------------------
+// GFX-LABEL: @test_amdgcn_global_store_b128_00(
+// GFX-NEXT:  entry:
+// GFX-NEXT:    tail call void @llvm.amdgcn.global.store.b128(ptr addrspace(1) [[PTR:%.*]], <4 x i32> [[DATA:%.*]], metadata [[META4:![0-9]+]])
+// GFX-NEXT:    ret void
+//
+void test_amdgcn_global_store_b128_00(global_ptr_to_v4u32 ptr, v4u32 data) {
+  __builtin_amdgcn_global_store_b128(ptr, data, "wavefront");
+}
+
+// GFX-LABEL: @test_amdgcn_global_store_b128_01(
+// GFX-NEXT:  entry:
+// GFX-NEXT:    tail call void @llvm.amdgcn.global.store.b128(ptr addrspace(1) [[PTR:%.*]], <4 x i32> [[DATA:%.*]], metadata [[META5:![0-9]+]])
+// GFX-NEXT:    ret void
+//
+void test_amdgcn_global_store_b128_01(global_ptr_to_v4u32 ptr, v4u32 data) {
+  __builtin_amdgcn_global_store_b128(ptr, data, "workgroup");
+}
+
+// GFX-LABEL: @test_amdgcn_global_store_b128_10(
+// GFX-NEXT:  entry:
+// GFX-NEXT:    tail call void @llvm.amdgcn.global.store.b128(ptr addrspace(1) [[PTR:%.*]], <4 x i32> [[DATA:%.*]], metadata [[META6:![0-9]+]])
+// GFX-NEXT:    ret void
+//
+void test_amdgcn_global_store_b128_10(global_ptr_to_v4u32 ptr, v4u32 data) {
+  __builtin_amdgcn_global_store_b128(ptr, data, "agent");
+}
+
+// GFX-LABEL: @test_amdgcn_global_store_b128_11(
+// GFX-NEXT:  entry:
+// GFX-NEXT:    tail call void @llvm.amdgcn.global.store.b128(ptr addrspace(1) [[PTR:%.*]], <4 x i32> [[DATA:%.*]], metadata [[META7:![0-9]+]])
+// GFX-NEXT:    ret void
+//
+void test_amdgcn_global_store_b128_11(global_ptr_to_v4u32 ptr, v4u32 data) {
+  __builtin_amdgcn_global_store_b128(ptr, data, "");
+}
+
+//------------------------------------------------------------------------------
+// Load
+//------------------------------------------------------------------------------
+// GFX-LABEL: @test_amdgcn_global_load_b128_00(
+// GFX-NEXT:  entry:
+// GFX-NEXT:    [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.global.load.b128(ptr addrspace(1) [[PTR:%.*]], metadata [[META4]])
+// GFX-NEXT:    ret <4 x i32> [[TMP0]]
+//
+v4u32 test_amdgcn_global_load_b128_00(global_ptr_to_v4u32 ptr) {
+  return __builtin_amdgcn_global_load_b128(ptr, "wavefront");
+}
+
+// GFX-LABEL: @test_amdgcn_global_load_b128_01(
+// GFX-NEXT:  entry:
+// GFX-NEXT:    [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.global.load.b128(ptr addrspace(1) [[PTR:%.*]], metadata [[META5]])
+// GFX-NEXT:    ret <4 x i32> [[TMP0]]
+//
+v4u32 test_amdgcn_global_load_b128_01(global_ptr_to_v4u32 ptr) {
+  return __builtin_amdgcn_global_load_b128(ptr, "workgroup");
+}
+
+// GFX-LABEL: @test_amdgcn_global_load_b128_10(
+// GFX-NEXT:  entry:
+// GFX-NEXT:    [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.global.load.b128(ptr addrspace(1) [[PTR:%.*]], metadata [[META6]])
+// GFX-NEXT:    ret <4 x i32> [[TMP0]]
+//
+v4u32 test_amdgcn_global_load_b128_10(global_ptr_to_v4u32 ptr) {
+  return __builtin_amdgcn_global_load_b128(ptr, "agent");
+}
+
+// GFX-LABEL: @test_amdgcn_global_load_b128_11(
+// GFX-NEXT:  entry:
+// GFX-NEXT:    [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.global.load.b128(ptr addrspace(1) [[PTR:%.*]], metadata [[META7]])
+// GFX-NEXT:    ret <4 x i32> [[TMP0]]
+//
+v4u32 test_amdgcn_global_load_b128_11(global_ptr_to_v4u32 ptr) {
+  return __builtin_amdgcn_global_load_b128(ptr, "");
+}
+//.
+// GFX950: [[META4]] = !{!"wavefront"}
+// GFX950: [[META5]] = !{!"workgroup"}
+// GFX950: [[META6]] = !{!"agent"}
+// GFX950: [[META7]] = !{!""}
+//.
+// GFX9_4_GENERIC: [[META4]] = !{!"wavefront"}
+// GFX9_4_GENERIC: [[META5]] = !{!"workgroup"}
+// GFX9_4_GENERIC: [[META6]] = !{!"agent"}
+// GFX9_4_GENERIC: [[META7]] = !{!""}
+//.
+//// NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+// GFX950: {{.*}}
+// GFX9_4_GENERIC: {{.*}}
@@ -0,0 +1,22 @@
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx950         -S -verify -o - %s
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx9-4-generic -S -verify -o - %s
+// REQUIRES: amdgpu-registered-target
+
+typedef __attribute__((__vector_size__(4 * sizeof(unsigned int)))) unsigned int v4u32;
+typedef v4u32 __global *global_ptr_to_v4u32;
+
+void test_amdgcn_global_store_b128_00(v4u32 *ptr, v4u32 data, const char* scope) {
+  __builtin_amdgcn_global_store_b128(ptr, data, "");  //expected-error{{passing '__private v4u32 *__private' to parameter of type '__attribute__((__vector_size__(4 * sizeof(unsigned int)))) unsigned int __global *' changes address space of pointer}}
+}
+
+void test_amdgcn_global_store_b128_01(global_ptr_to_v4u32 ptr, v4u32 data, const char* scope) {
+  __builtin_amdgcn_global_store_b128(ptr, data, scope);  //expected-error{{expression is not a string literal}}
+}
+
+v4u32 test_amdgcn_global_load_b128_00(v4u32 *ptr, const char* scope) {
+  return __builtin_amdgcn_global_load_b128(ptr, "");  //expected-error{{passing '__private v4u32 *__private' to parameter of type '__attribute__((__vector_size__(4 * sizeof(unsigned int)))) unsigned int __global *' changes address space of pointer}}
+}
+
+v4u32 test_amdgcn_global_load_b128_01(global_ptr_to_v4u32 ptr, const char* scope) {
+  return __builtin_amdgcn_global_load_b128(ptr, scope);  //expected-error{{expression is not a string literal}}
+}
@@ -0,0 +1,30 @@
+// We test loads and stores separately because clang seems to stop after the
+// first 'target feature' error.
+
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx9-generic    -DTEST_LOAD  -S -verify -o - %s
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx10-1-generic -DTEST_LOAD  -S -verify -o - %s
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx10-3-generic -DTEST_LOAD  -S -verify -o - %s
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx11-generic   -DTEST_LOAD  -S -verify -o - %s
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx12-generic   -DTEST_LOAD  -S -verify -o - %s
+
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx9-generic    -DTEST_STORE -S -verify -o - %s
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx10-1-generic -DTEST_STORE -S -verify -o - %s
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx10-3-generic -DTEST_STORE -S -verify -o - %s
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx11-generic   -DTEST_STORE -S -verify -o - %s
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx12-generic   -DTEST_STORE -S -verify -o - %s
+// REQUIRES: amdgpu-registered-target
+
+typedef __attribute__((__vector_size__(4 * sizeof(unsigned int)))) unsigned int v4u32;
+typedef v4u32 __global *global_ptr_to_v4u32;
+
+#ifdef TEST_LOAD
+v4u32 test_amdgcn_global_load_b128_01(global_ptr_to_v4u32 ptr, const char* scope) {
+  return __builtin_amdgcn_global_load_b128(ptr, ""); // expected-error{{'__builtin_amdgcn_global_load_b128' needs target feature gfx940-insts}}
+}
+#endif
+
+#ifdef TEST_STORE
+void test_amdgcn_global_store_b128_01(global_ptr_to_v4u32 ptr, v4u32 data, const char* scope) {
+  __builtin_amdgcn_global_store_b128(ptr, data, ""); // expected-error{{'__builtin_amdgcn_global_store_b128' needs target feature gfx940-insts}}
+}
+#endif
@@ -1436,6 +1436,85 @@ The AMDGPU backend implements the following LLVM IR intrinsics.
                                                    - `v_mov_b32 <dest> <old>`
                                                    - `v_mov_b32 <dest> <src> <dpp_ctrl> <row_mask> <bank_mask> <bound_ctrl>`
 
+  llvm.amdgcn.global.load.b128                 This intrinsic is supported on gfx942, gfx950.
+  
+                                                   Signature:
+                                                   
+                                                   .. code-block:: llvm
+                                                      
+                                                      <4 x i32> @llvm.amdgcn.global.load.b128(
+                                                          ptr addrspace(1), ; source
+                                                          metadata)         ; scope    - e.g. '!0' where '!0 = !{!"workgroup"}'
+
+                                                   Reads the value from the source address with cache behavior
+                                                   specified by the scope.
+
+                                                   For gfx942 and gfx950 devices, this emits a
+                                                   ``global_load_dwordx4`` instruction with the appropriate
+                                                   ``SC0`` and ``SC1`` bits set.
+
+                                                   Valid values for scope are
+                                                   
+                                                   ===================== =============================================================
+                                                   scope                 architecture name
+                                                   ===================== =============================================================
+                                                   ``"wavefront"``       wave
+                                                   
+                                                   ``"workgroup"``       group
+                                                   
+                                                   ``"agent"``           device
+                                                   
+                                                   ``""`` (empty string) system
+                                                   ===================== =============================================================
+ 
+                                                   For semantics on gfx942, see Table 47 in section 9.1.10
+                                                   "Memory Scope and Temporal Controls" of the "AMD Instinct
+                                                   MI300" Instruction Set Architecture Reference.
+                                                   
+                                                   For semantics on gfx950, see Table 49 in section 9.1.10
+                                                   "Memory Scope and Temporal Controls" of the CDNA4
+                                                   Instruction Set Architecture Reference.
+                                                                                                      
+  llvm.amdgcn.global.store.b128                This intrinsic is supported on gfx942, gfx950.
+  
+                                                   Signature:
+                                                   
+                                                   .. code-block:: llvm
+                                                      
+                                                      void @llvm.amdgcn.global.store.b128(
+                                                          ptr addrspace(1), ; destination
+                                                          <4 x i32>,        ; value
+                                                          metadata)         ; scope    - e.g. '!0' where '!0 = !{!"workgroup"}'
+
+                                                   Writes the value to the destination address with cache
+                                                   behavior specified by the scope.
+
+                                                   For gfx942 and gfx950 devices, this emits a
+                                                   ``global_store_dwordx4`` instruction with the appropriate
+                                                   ``SC0`` and ``SC1`` bits set.
+
+                                                   Valid values for scope are
+                                                   
+                                                   ===================== =============================================================
+                                                   scope                 architecture name
+                                                   ===================== =============================================================
+                                                   ``"wavefront"``       wave
+                                                   
+                                                   ``"workgroup"``       group
+                                                   
+                                                   ``"agent"``           device
+                                                   
+                                                   ``""`` (empty string) system
+                                                   ===================== =============================================================
+ 
+                                                   For semantics on gfx942, see Table 48 in section 9.1.10
+                                                   "Memory Scope and Temporal Controls" of the "AMD Instinct
+                                                   MI300" Instruction Set Architecture Reference.
+                                                   
+                                                   For semantics on gfx950, see Table 50 in section 9.1.10
+                                                   "Memory Scope and Temporal Controls" of the CDNA4
+                                                   Instruction Set Architecture Reference.
+                                                                                                      
   ==============================================   ==========================================================
 
 .. TODO::
 
@@ -239,6 +239,12 @@ enum {
   /// - SizeInBits(ULEB128) - The size of the pointer value in bits.
   GIM_CheckPointerToAny,
 
+  /// Check the machine type of the specified operand
+  /// - InsnID(ULEB128) - Instruction ID
+  /// - OpIdx(ULEB128) - Operand index
+  /// - MachineOperandType(ULEB128) - Expected type
+  GIM_CheckMachineOperandType,
+
   /// Check the register bank for the specified operand
   /// - InsnID(ULEB128) - Instruction ID
   /// - OpIdx(ULEB128) - Operand index