ROCm
diff --git a/‎clang/include/clang/Basic/BuiltinsAMDGPU.def‎
Lines changed: 2 additions & 0 deletions b/‎clang/include/clang/Basic/BuiltinsAMDGPU.def‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎clang/lib/Basic/Targets/SPIR.h‎
Lines changed: 6 additions & 0 deletions b/‎clang/lib/Basic/Targets/SPIR.h‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎clang/lib/CodeGen/CGLoopInfo.cpp‎
Lines changed: 29 additions & 17 deletions b/‎clang/lib/CodeGen/CGLoopInfo.cpp‎
Lines changed: 29 additions & 17 deletions
diff --git a/‎clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp‎
Lines changed: 2 additions & 0 deletions b/‎clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎clang/test/CodeGen/PowerPC/builtins-ppc-fpconstrained.c‎
Lines changed: 1 addition & 1 deletion b/‎clang/test/CodeGen/PowerPC/builtins-ppc-fpconstrained.c‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎clang/test/CodeGen/new-pass-manager-opt-bisect.c‎
Lines changed: 1 addition & 1 deletion b/‎clang/test/CodeGen/new-pass-manager-opt-bisect.c‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎clang/test/CodeGenCXX/pragma-loop.cpp‎
Lines changed: 41 additions & 0 deletions b/‎clang/test/CodeGenCXX/pragma-loop.cpp‎
Lines changed: 41 additions & 0 deletions
diff --git a/‎clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl‎
Lines changed: 38 additions & 0 deletions b/‎clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl‎
Lines changed: 38 additions & 0 deletions
diff --git a/‎clang/test/CodeGenOpenCL/scoped-atomic.cl‎
Lines changed: 18 additions & 0 deletions b/‎clang/test/CodeGenOpenCL/scoped-atomic.cl‎
Lines changed: 18 additions & 0 deletions
diff --git a/‎clang/tools/libclang/libclang.map‎
Lines changed: 5 additions & 0 deletions b/‎clang/tools/libclang/libclang.map‎
Lines changed: 5 additions & 0 deletions
@@ -676,6 +676,8 @@ TARGET_BUILTIN(__builtin_amdgcn_s_wait_tensorcnt, "vIUs", "n", "gfx1250-insts")
 
 TARGET_BUILTIN(__builtin_amdgcn_tanh_bf16, "yy", "nc", "bf16-trans-insts")
 TARGET_BUILTIN(__builtin_amdgcn_rcp_bf16, "yy", "nc", "bf16-trans-insts")
+TARGET_BUILTIN(__builtin_amdgcn_rsq_bf16, "yy", "nc", "bf16-trans-insts")
+TARGET_BUILTIN(__builtin_amdgcn_log_bf16, "yy", "nc", "bf16-trans-insts")
 
 TARGET_BUILTIN(__builtin_amdgcn_cvt_f16_fp8, "hiIi", "nc", "gfx1250-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_f16_bf8, "hiIi", "nc", "gfx1250-insts")
 
@@ -264,6 +264,9 @@ class LLVM_LIBRARY_VISIBILITY SPIR32TargetInfo : public SPIRTargetInfo {
     PointerWidth = PointerAlign = 32;
     SizeType = TargetInfo::UnsignedInt;
     PtrDiffType = IntPtrType = TargetInfo::SignedInt;
+    // SPIR32 has support for atomic ops if atomic extension is enabled.
+    // Take the maximum because it's possible the Host supports wider types.
+    MaxAtomicInlineWidth = std::max<unsigned char>(MaxAtomicInlineWidth, 32);
     resetDataLayout("e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-"
                     "v96:128-v192:256-v256:256-v512:512-v1024:1024-G1");
   }
@@ -281,6 +284,9 @@ class LLVM_LIBRARY_VISIBILITY SPIR64TargetInfo : public SPIRTargetInfo {
     PointerWidth = PointerAlign = 64;
     SizeType = TargetInfo::UnsignedLong;
     PtrDiffType = IntPtrType = TargetInfo::SignedLong;
+    // SPIR64 has support for atomic ops if atomic extension is enabled.
+    // Take the maximum because it's possible the Host supports wider types.
+    MaxAtomicInlineWidth = std::max<unsigned char>(MaxAtomicInlineWidth, 64);
     resetDataLayout("e-i64:64-v16:16-v24:32-v32:32-v48:64-"
                     "v96:128-v192:256-v256:256-v512:512-v1024:1024-G1");
   }
 
@@ -221,18 +221,6 @@ LoopInfo::createLoopVectorizeMetadata(const LoopAttributes &Attrs,
     return createUnrollAndJamMetadata(Attrs, LoopProperties, HasUserTransforms);
   }
 
-  // Apply all loop properties to the vectorized loop.
-  SmallVector<Metadata *, 4> FollowupLoopProperties;
-  FollowupLoopProperties.append(LoopProperties.begin(), LoopProperties.end());
-
-  // Don't vectorize an already vectorized loop.
-  FollowupLoopProperties.push_back(
-      MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.isvectorized")));
-
-  bool FollowupHasTransforms = false;
-  SmallVector<Metadata *, 4> Followup = createUnrollAndJamMetadata(
-      Attrs, FollowupLoopProperties, FollowupHasTransforms);
-
   SmallVector<Metadata *, 4> Args;
   Args.append(LoopProperties.begin(), LoopProperties.end());
 
@@ -286,22 +274,46 @@ LoopInfo::createLoopVectorizeMetadata(const LoopAttributes &Attrs,
   // 5) it is implied when vectorize.width is unset (0) and the user
   //    explicitly requested fixed-width vectorization, i.e.
   //    vectorize.scalable.enable is false.
+  bool VectorizeEnabled = false;
   if (Attrs.VectorizeEnable != LoopAttributes::Unspecified ||
       (IsVectorPredicateEnabled && Attrs.VectorizeWidth != 1) ||
       Attrs.VectorizeWidth > 1 ||
       Attrs.VectorizeScalable == LoopAttributes::Enable ||
       (Attrs.VectorizeScalable == LoopAttributes::Disable &&
        Attrs.VectorizeWidth != 1)) {
-    bool AttrVal = Attrs.VectorizeEnable != LoopAttributes::Disable;
+    VectorizeEnabled = Attrs.VectorizeEnable != LoopAttributes::Disable;
     Args.push_back(
         MDNode::get(Ctx, {MDString::get(Ctx, "llvm.loop.vectorize.enable"),
                           ConstantAsMetadata::get(ConstantInt::get(
-                              llvm::Type::getInt1Ty(Ctx), AttrVal))}));
+                              llvm::Type::getInt1Ty(Ctx), VectorizeEnabled))}));
   }
 
-  if (FollowupHasTransforms)
-    Args.push_back(
-        createFollowupMetadata("llvm.loop.vectorize.followup_all", Followup));
+  // Apply all loop properties to the vectorized loop.
+  SmallVector<Metadata *, 4> FollowupLoopProperties;
+
+  // If vectorization is not explicitly enabled, the follow-up metadata will be
+  // directly appended to the list currently being created. In that case, adding
+  // LoopProperties to FollowupLoopProperties would result in duplication.
+  if (VectorizeEnabled)
+    FollowupLoopProperties.append(LoopProperties.begin(), LoopProperties.end());
+
+  // Don't vectorize an already vectorized loop.
+  FollowupLoopProperties.push_back(
+      MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.isvectorized")));
+
+  bool FollowupHasTransforms = false;
+  SmallVector<Metadata *, 4> Followup = createUnrollAndJamMetadata(
+      Attrs, FollowupLoopProperties, FollowupHasTransforms);
+
+  if (FollowupHasTransforms) {
+    // If vectorization is explicitly enabled, we create a follow-up metadata,
+    // otherwise directly add the contents of it to Args.
+    if (VectorizeEnabled)
+      Args.push_back(
+          createFollowupMetadata("llvm.loop.vectorize.followup_all", Followup));
+    else
+      Args.append(Followup.begin(), Followup.end());
+  }
 
   HasUserTransforms = true;
   return Args;
 
@@ -433,6 +433,7 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
   case AMDGPU::BI__builtin_amdgcn_rsq:
   case AMDGPU::BI__builtin_amdgcn_rsqf:
   case AMDGPU::BI__builtin_amdgcn_rsqh:
+  case AMDGPU::BI__builtin_amdgcn_rsq_bf16:
     return emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::amdgcn_rsq);
   case AMDGPU::BI__builtin_amdgcn_rsq_clamp:
   case AMDGPU::BI__builtin_amdgcn_rsq_clampf:
@@ -447,6 +448,7 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
   case AMDGPU::BI__builtin_amdgcn_dispatch_ptr:
     return EmitAMDGPUDispatchPtr(*this, E);
   case AMDGPU::BI__builtin_amdgcn_logf:
+  case AMDGPU::BI__builtin_amdgcn_log_bf16:
     return emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::amdgcn_log);
   case AMDGPU::BI__builtin_amdgcn_exp2f:
     return emitBuiltinWithOneOverloadedType<1>(*this, E,
 
@@ -11,7 +11,7 @@
 // RUN: -S -ffp-exception-behavior=strict \
 // RUN: -o - %s | FileCheck --check-prefix=CHECK-ASM \
 // RUN: --check-prefix=FIXME-CHECK  %s
-// RUN: %clang_cc1 -triple powerpcspe -ffp-exception-behavior=strict \
+// RUN: %clang_cc1 -triple powerpc -ffp-exception-behavior=strict \
 // RUN: -target-feature +vsx -fexperimental-strict-floating-point -emit-llvm \
 // RUN: %s -o - | FileCheck --check-prefix=CHECK-CONSTRAINED %s
 
 
@@ -7,6 +7,6 @@
 // CHECK: BISECT: running pass (1)
 // CHECK-NOT: BISECT: running pass (1)
 // Make sure that legacy pass manager is running
-// CHECK: Instruction Selection
+// CHECK: -isel
 
 int func(int a) { return a; }
@@ -203,6 +203,43 @@ void for_test_scalable_1(int *List, int Length) {
   }
 }
 
+// Verify for loop is not performing vectorization
+void for_test_width_1(int *List, int Length) {
+#pragma clang loop vectorize_width(1) interleave_count(4) unroll(disable) distribute(disable)
+  for (int i = 0; i < Length; i++) {
+    // CHECK: br label {{.*}}, !llvm.loop ![[LOOP_20:.*]]
+    List[i] = i * 2;
+  }
+}
+
+// Verify for loop is not performing vectorization
+void for_test_fixed_1(int *List, int Length) {
+#pragma clang loop vectorize_width(1, fixed) interleave_count(4) unroll(disable) distribute(disable)
+  for (int i = 0; i < Length; i++) {
+    // CHECK: br label {{.*}}, !llvm.loop ![[LOOP_21:.*]]
+    List[i] = i * 2;
+  }
+}
+
+
+// Verify unroll attributes are directly attached to the loop metadata
+void for_test_vectorize_disable_unroll(int *List, int Length) {
+#pragma clang loop vectorize(disable) unroll_count(8)
+  for (int i = 0; i < Length; i++) {
+    // CHECK: br label {{.*}}, !llvm.loop ![[LOOP_22:.*]]
+    List[i] = i * 2;
+  }
+}
+
+// Verify unroll attributes are directly attached to the loop metadata
+void for_test_interleave_vectorize_disable_unroll(int *List, int Length) {
+#pragma clang loop vectorize(disable) interleave_count(4) unroll_count(8)
+  for (int i = 0; i < Length; i++) {
+    // CHECK: br label {{.*}}, !llvm.loop ![[LOOP_23:.*]]
+    List[i] = i * 2;
+  }
+}
+
 // CHECK-DAG: ![[MP:[0-9]+]] = !{!"llvm.loop.mustprogress"}
 
 // CHECK-DAG: ![[UNROLL_DISABLE:[0-9]+]] = !{!"llvm.loop.unroll.disable"}
@@ -270,3 +307,7 @@ void for_test_scalable_1(int *List, int Length) {
 // CHECK-DAG: ![[LOOP_17]] = distinct !{![[LOOP_17]], ![[MP]], ![[UNROLL_DISABLE]], ![[DISTRIBUTE_DISABLE]], ![[FIXED_VEC]], ![[INTERLEAVE_4]], ![[VECTORIZE_ENABLE]]}
 // CHECK-DAG: ![[LOOP_18]] = distinct !{![[LOOP_18]], ![[MP]], ![[UNROLL_DISABLE]], ![[DISTRIBUTE_DISABLE]], ![[SCALABLE_VEC]], ![[INTERLEAVE_4]], ![[VECTORIZE_ENABLE]]}
 // CHECK-DAG: ![[LOOP_19]] = distinct !{![[LOOP_19]], ![[MP]], ![[UNROLL_DISABLE]], ![[DISTRIBUTE_DISABLE]], ![[WIDTH_1]], ![[SCALABLE_VEC]], ![[INTERLEAVE_4]], ![[VECTORIZE_ENABLE]]}
+// CHECK-DAG: ![[LOOP_20]] = distinct !{![[LOOP_20]], ![[MP]], ![[UNROLL_DISABLE]], ![[DISTRIBUTE_DISABLE]], ![[WIDTH_1]], ![[FIXED_VEC]], ![[INTERLEAVE_4]]}
+// CHECK-DAG: ![[LOOP_21]] = distinct !{![[LOOP_21]], ![[MP]], ![[UNROLL_DISABLE]], ![[DISTRIBUTE_DISABLE]], ![[WIDTH_1]], ![[FIXED_VEC]], ![[INTERLEAVE_4]]}
+// CHECK-DAG: ![[LOOP_22]] = distinct !{![[LOOP_22]], ![[MP]], ![[WIDTH_1]], ![[ISVECTORIZED]], ![[UNROLL_8]]}
+// CHECK-DAG: ![[LOOP_23]] = distinct !{![[LOOP_23]], ![[MP]], ![[WIDTH_1]], ![[INTERLEAVE_4]], ![[ISVECTORIZED]], ![[UNROLL_8]]}
@@ -80,6 +80,44 @@ void test_rcp_bf16(global __bf16* out, __bf16 a)
   *out = __builtin_amdgcn_rcp_bf16(a);
 }
 
+// CHECK-LABEL: @test_rsq_bf16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca bfloat, align 2, addrspace(5)
+// CHECK-NEXT:    [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
+// CHECK-NEXT:    [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
+// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    store bfloat [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 2
+// CHECK-NEXT:    [[TMP0:%.*]] = load bfloat, ptr [[A_ADDR_ASCAST]], align 2
+// CHECK-NEXT:    [[TMP1:%.*]] = call bfloat @llvm.amdgcn.rsq.bf16(bfloat [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    store bfloat [[TMP1]], ptr addrspace(1) [[TMP2]], align 2
+// CHECK-NEXT:    ret void
+//
+void test_rsq_bf16(global __bf16* out, __bf16 a)
+{
+  *out = __builtin_amdgcn_rsq_bf16(a);
+}
+
+// CHECK-LABEL: @test_log_bf16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca bfloat, align 2, addrspace(5)
+// CHECK-NEXT:    [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
+// CHECK-NEXT:    [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
+// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    store bfloat [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 2
+// CHECK-NEXT:    [[TMP0:%.*]] = load bfloat, ptr [[A_ADDR_ASCAST]], align 2
+// CHECK-NEXT:    [[TMP1:%.*]] = call bfloat @llvm.amdgcn.log.bf16(bfloat [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    store bfloat [[TMP1]], ptr addrspace(1) [[TMP2]], align 2
+// CHECK-NEXT:    ret void
+//
+void test_log_bf16(global __bf16* out, __bf16 a)
+{
+  *out = __builtin_amdgcn_log_bf16(a);
+}
+
 // CHECK-LABEL: @test_cvt_f16_fp8(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
 
@@ -0,0 +1,18 @@
+// RUN: %clang_cc1 %s -cl-std=CL2.0 -emit-llvm -o - -triple spir-unknown-unknown -verify
+// RUN: %clang_cc1 %s -cl-std=CL2.0 -emit-llvm -o - -triple spir64-unknown-unknown -verify
+
+// expected-no-diagnostics
+
+int fi1a(int *i) {
+  int v;
+  __scoped_atomic_load(i, &v, __ATOMIC_RELAXED, __MEMORY_SCOPE_DEVICE);
+  return v;
+}
+
+#ifdef __SPIR64__
+long fl1a(long *i) {
+  long v;
+  __scoped_atomic_load(i, &v, __ATOMIC_RELAXED, __MEMORY_SCOPE_DEVICE);
+  return v;
+}
+#endif
@@ -327,6 +327,8 @@ LLVM_13 {
     clang_getRange;
     clang_getRangeEnd;
     clang_getRangeStart;
+    clang_getRemappings;
+    clang_getRemappingsFromFileList;
     clang_getResultType;
     clang_getSkippedRanges;
     clang_getSpecializedCursorTemplate;
@@ -387,6 +389,9 @@ LLVM_13 {
     clang_parseTranslationUnit;
     clang_parseTranslationUnit2;
     clang_parseTranslationUnit2FullArgv;
+    clang_remap_dispose;
+    clang_remap_getFilenames;
+    clang_remap_getNumFiles;
     clang_reparseTranslationUnit;
     clang_saveTranslationUnit;
     clang_sortCodeCompletionResults;