ROCm
diff --git a/‎clang/docs/LanguageExtensions.rst‎
Lines changed: 9 additions & 0 deletions b/‎clang/docs/LanguageExtensions.rst‎
Lines changed: 9 additions & 0 deletions
diff --git a/‎clang/docs/OpenMPSupport.rst‎
Lines changed: 1 addition & 1 deletion b/‎clang/docs/OpenMPSupport.rst‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎clang/docs/ReleaseNotes.rst‎
Lines changed: 3 additions & 0 deletions b/‎clang/docs/ReleaseNotes.rst‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎clang/include/clang/Basic/Builtins.td‎
Lines changed: 12 additions & 0 deletions b/‎clang/include/clang/Basic/Builtins.td‎
Lines changed: 12 additions & 0 deletions
diff --git a/‎clang/include/clang/Basic/BuiltinsX86.td‎
Lines changed: 0 additions & 8 deletions b/‎clang/include/clang/Basic/BuiltinsX86.td‎
Lines changed: 0 additions & 8 deletions
diff --git a/‎clang/lib/CodeGen/CGBuiltin.cpp‎
Lines changed: 7 additions & 0 deletions b/‎clang/lib/CodeGen/CGBuiltin.cpp‎
Lines changed: 7 additions & 0 deletions
diff --git a/‎clang/lib/CodeGen/CGOpenMPRuntime.cpp‎
Lines changed: 26 additions & 1 deletion b/‎clang/lib/CodeGen/CGOpenMPRuntime.cpp‎
Lines changed: 26 additions & 1 deletion
diff --git a/‎clang/lib/CodeGen/TargetBuiltins/X86.cpp‎
Lines changed: 0 additions & 2 deletions b/‎clang/lib/CodeGen/TargetBuiltins/X86.cpp‎
Lines changed: 0 additions & 2 deletions
diff --git a/‎clang/lib/Headers/emmintrin.h‎
Lines changed: 8 additions & 4 deletions b/‎clang/lib/Headers/emmintrin.h‎
Lines changed: 8 additions & 4 deletions
diff --git a/‎clang/lib/Headers/f16cintrin.h‎
Lines changed: 8 additions & 5 deletions b/‎clang/lib/Headers/f16cintrin.h‎
Lines changed: 8 additions & 5 deletions
@@ -857,6 +857,15 @@ of different sizes and signs is forbidden in binary and ternary builtins.
                                                 semantics, see `LangRef
                                                 <http://llvm.org/docs/LangRef.html#llvm-min-intrinsics-comparation>`_
                                                 for the comparison.
+T __builtin_elementwise_fshl(T x, T y, T z)     perform a funnel shift left. Concatenate x and y (x is the most        integer types
+                                                significant bits of the wide value), shift the combined value left
+                                                by z, and extract the most significant bits to produce a result
+                                                that is the same size as the original arguments.
+
+T __builtin_elementwise_fshr(T x, T y, T z)     perform a funnel shift right. Concatenate x and y (x is the most       integer types
+                                                significant bits of the wide value), shift the combined value right
+                                                by z, and extract the least significant bits to produce a result
+                                                that is the same size as the original arguments.
 ============================================== ====================================================================== =========================================
 
 
 
@@ -191,7 +191,7 @@ implementation.
 +------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
 | device                       | teams construct on the host device                           | :good:`done`             | r371553                                                               |
 +------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-| device                       | support non-contiguous array sections for target update      | :good:`done`             |                                                                       |
+| device                       | support non-contiguous array sections for target update      | :good:`done`             | https://github.com/llvm/llvm-project/pull/144635                      |
 +------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
 | device                       | pointer attachment                                           | :good:`done`             |                                                                       |
 +------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
 
@@ -107,6 +107,8 @@ C23 Feature Support
 
 Non-comprehensive list of changes in this release
 -------------------------------------------------
+- Added ``__builtin_elementwise_fshl`` and ``__builtin_elementwise_fshr``.
+
 - Added ``__builtin_elementwise_minnumnum`` and ``__builtin_elementwise_maxnumnum``.
 
 - Trapping UBSan (e.g. ``-fsanitize-trap=undefined``) now emits a string describing the reason for 
@@ -563,6 +565,7 @@ OpenMP Support
 - Added parsing and semantic analysis support for the ``need_device_addr``
   modifier in the ``adjust_args`` clause.
 - Allow array length to be omitted in array section subscript expression.
+- Fixed non-contiguous strided update in the ``omp target update`` directive with the ``from`` clause.
 
 Improvements
 ^^^^^^^^^^^^
 
@@ -1514,6 +1514,18 @@ def ElementwiseSubSat : Builtin {
   let Prototype = "void(...)";
 }
 
+def ElementwiseFshl : Builtin {
+  let Spellings = ["__builtin_elementwise_fshl"];
+  let Attributes = [NoThrow, Const, CustomTypeChecking];
+  let Prototype = "void(...)";
+}
+
+def ElementwiseFshr : Builtin {
+  let Spellings = ["__builtin_elementwise_fshr"];
+  let Attributes = [NoThrow, Const, CustomTypeChecking];
+  let Prototype = "void(...)";
+}
+
 def ReduceMax : Builtin {
   let Spellings = ["__builtin_reduce_max"];
   let Attributes = [NoThrow, Const, CustomTypeChecking, Constexpr];
 
@@ -757,14 +757,6 @@ let Features = "f16c", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] i
   def vcvtps2ph256 : X86Builtin<"_Vector<8, short>(_Vector<8, float>, _Constant int)">;
 }
 
-let Features = "f16c", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
-  def vcvtph2ps : X86Builtin<"_Vector<4, float>(_Vector<8, short>)">;
-}
-
-let Features = "f16c", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
-  def vcvtph2ps256 : X86Builtin<"_Vector<8, float>(_Vector<8, short>)">;
-}
-
 let Features = "rdrnd", Attributes = [NoThrow] in {
   def rdrand16_step : X86Builtin<"unsigned int(unsigned short *)">;
   def rdrand32_step : X86Builtin<"unsigned int(unsigned int *)">;
 
@@ -4030,6 +4030,13 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
   case Builtin::BI__builtin_elementwise_fma:
     return RValue::get(
         emitBuiltinWithOneOverloadedType<3>(*this, E, Intrinsic::fma));
+  case Builtin::BI__builtin_elementwise_fshl:
+    return RValue::get(
+        emitBuiltinWithOneOverloadedType<3>(*this, E, Intrinsic::fshl));
+  case Builtin::BI__builtin_elementwise_fshr:
+    return RValue::get(
+        emitBuiltinWithOneOverloadedType<3>(*this, E, Intrinsic::fshr));
+
   case Builtin::BI__builtin_elementwise_add_sat:
   case Builtin::BI__builtin_elementwise_sub_sat: {
     Value *Op0 = EmitScalarExpr(E->getArg(0));
 
@@ -7535,7 +7535,32 @@ class MappableExprsHandler {
     // dimension.
     uint64_t DimSize = 1;
 
-    bool IsNonContiguous = CombinedInfo.NonContigInfo.IsNonContiguous;
+    // Detects non-contiguous updates due to strided accesses.
+    // Sets the 'IsNonContiguous' flag so that the 'MapType' bits are set
+    // correctly when generating information to be passed to the runtime. The
+    // flag is set to true if any array section has a stride not equal to 1, or
+    // if the stride is not a constant expression (conservatively assumed
+    // non-contiguous).
+    bool IsNonContiguous =
+        CombinedInfo.NonContigInfo.IsNonContiguous ||
+        any_of(Components, [&](const auto &Component) {
+          const auto *OASE =
+              dyn_cast<ArraySectionExpr>(Component.getAssociatedExpression());
+          if (!OASE) // Not an array section: this check does not apply.
+            return false;
+
+          const Expr *StrideExpr = OASE->getStride();
+          if (!StrideExpr) // No explicit stride: not flagged by this check.
+            return false;
+
+          const auto Constant =
+              StrideExpr->getIntegerConstantExpr(CGF.getContext());
+          if (!Constant) // Stride unknown at compile time: be conservative.
+            return true;
+
+          return !Constant->isOne();
+        });
+
     bool IsPrevMemberReference = false;
 
     bool IsPartialMapped =
 
@@ -2841,8 +2841,6 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
     return getCmpIntrinsicCall(Intrinsic::x86_sse2_cmp_sd, 7);
 
   // f16c half2float intrinsics
-  case X86::BI__builtin_ia32_vcvtph2ps:
-  case X86::BI__builtin_ia32_vcvtph2ps256:
   case X86::BI__builtin_ia32_vcvtph2ps_mask:
   case X86::BI__builtin_ia32_vcvtph2ps256_mask:
   case X86::BI__builtin_ia32_vcvtph2ps512_mask: {
 
@@ -3381,7 +3381,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvttps_epi32(__m128 __a) {
 /// \param __a
 ///    A 32-bit signed integer operand.
 /// \returns A 128-bit vector of [4 x i32].
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtsi32_si128(int __a) {
+static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_cvtsi32_si128(int __a) {
   return __extension__(__m128i)(__v4si){__a, 0, 0, 0};
 }
 
@@ -3396,7 +3397,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtsi32_si128(int __a) {
 /// \param __a
 ///    A 64-bit signed integer operand containing the value to be converted.
 /// \returns A 128-bit vector of [2 x i64] containing the converted value.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtsi64_si128(long long __a) {
+static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_cvtsi64_si128(long long __a) {
   return __extension__(__m128i)(__v2di){__a, 0};
 }
 
@@ -3411,7 +3413,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtsi64_si128(long long __a) {
 ///    A vector of [4 x i32]. The least significant 32 bits are moved to the
 ///    destination.
 /// \returns A 32-bit signed integer containing the moved value.
-static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtsi128_si32(__m128i __a) {
+static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_cvtsi128_si32(__m128i __a) {
   __v4si __b = (__v4si)__a;
   return __b[0];
 }
@@ -3427,7 +3430,8 @@ static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtsi128_si32(__m128i __a) {
 ///    A vector of [2 x i64]. The least significant 64 bits are moved to the
 ///    destination.
 /// \returns A 64-bit signed integer containing the moved value.
-static __inline__ long long __DEFAULT_FN_ATTRS _mm_cvtsi128_si64(__m128i __a) {
+static __inline__ long long __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_cvtsi128_si64(__m128i __a) {
   return __a[0];
 }
 
 
@@ -38,9 +38,7 @@
 static __inline float __DEFAULT_FN_ATTRS128
 _cvtsh_ss(unsigned short __a)
 {
-  __v8hi __v = {(short)__a, 0, 0, 0, 0, 0, 0, 0};
-  __v4sf __r = __builtin_ia32_vcvtph2ps(__v);
-  return __r[0];
+  return (float)__builtin_bit_cast(__fp16, __a);
 }
 
 /// Converts a 32-bit single-precision float value to a 16-bit
@@ -109,7 +107,10 @@ _cvtsh_ss(unsigned short __a)
 static __inline __m128 __DEFAULT_FN_ATTRS128
 _mm_cvtph_ps(__m128i __a)
 {
-  return (__m128)__builtin_ia32_vcvtph2ps((__v8hi)__a);
+  typedef __fp16 __v4fp16 __attribute__((__vector_size__(8)));
+
+  __v4hi __v = __builtin_shufflevector((__v8hi)__a, (__v8hi)__a, 0, 1, 2, 3);
+  return (__m128) __builtin_convertvector((__v4fp16)__v, __v4sf);
 }
 
 /// Converts a 256-bit vector of [8 x float] into a 128-bit vector
@@ -153,7 +154,9 @@ _mm_cvtph_ps(__m128i __a)
 static __inline __m256 __DEFAULT_FN_ATTRS256
 _mm256_cvtph_ps(__m128i __a)
 {
-  return (__m256)__builtin_ia32_vcvtph2ps256((__v8hi)__a);
+  typedef __fp16 __v8fp16 __attribute__((__vector_size__(16), __aligned__(16)));
+
+  return (__m256) __builtin_convertvector((__v8fp16)__a, __v8sf);
 }
 
 #undef __DEFAULT_FN_ATTRS128
Original file line number	Diff line number	Diff line change
`@@ -38,9 +38,7 @@`
`38`	`38`	`static __inline float __DEFAULT_FN_ATTRS128`
`39`	`39`	`_cvtsh_ss(unsigned short __a)`
`40`	`40`	`{`
`41`		`- __v8hi __v = {(short)__a, 0, 0, 0, 0, 0, 0, 0};`
`42`		`- __v4sf __r = __builtin_ia32_vcvtph2ps(__v);`
`43`		`- return __r[0];`
	`41`	`+ return (float)__builtin_bit_cast(__fp16, __a);`
`44`	`42`	`}`
`45`	`43`
`46`	`44`	`/// Converts a 32-bit single-precision float value to a 16-bit`
`@@ -109,7 +107,10 @@ _cvtsh_ss(unsigned short __a)`
`109`	`107`	`static __inline __m128 __DEFAULT_FN_ATTRS128`
`110`	`108`	`_mm_cvtph_ps(__m128i __a)`
`111`	`109`	`{`
`112`		`- return (__m128)__builtin_ia32_vcvtph2ps((__v8hi)__a);`
	`110`	`+ typedef __fp16 __v4fp16 __attribute__((__vector_size__(8)));`
	`111`	`+`
	`112`	`+ __v4hi __v = __builtin_shufflevector((__v8hi)__a, (__v8hi)__a, 0, 1, 2, 3);`
	`113`	`+ return (__m128) __builtin_convertvector((__v4fp16)__v, __v4sf);`
`113`	`114`	`}`
`114`	`115`
`115`	`116`	`/// Converts a 256-bit vector of [8 x float] into a 128-bit vector`
`@@ -153,7 +154,9 @@ _mm_cvtph_ps(__m128i __a)`
`153`	`154`	`static __inline __m256 __DEFAULT_FN_ATTRS256`
`154`	`155`	`_mm256_cvtph_ps(__m128i __a)`
`155`	`156`	`{`
`156`		`- return (__m256)__builtin_ia32_vcvtph2ps256((__v8hi)__a);`
	`157`	`+ typedef __fp16 __v8fp16 __attribute__((__vector_size__(16), __aligned__(16)));`
	`158`	`+`
	`159`	`+ return (__m256) __builtin_convertvector((__v8fp16)__a, __v8sf);`
`157`	`160`	`}`
`158`	`161`
`159`	`162`	`#undef __DEFAULT_FN_ATTRS128`