[Headers][X86] VectorExprEvaluator::VisitCallExpr - allow SSE/AVX2/AVX512 pack intrinsics to be used in constexpr #156003

woruyu · 2025-08-29T10:36:09Z

Summary

This PR resolves #154283

llvmbot · 2025-08-29T10:36:42Z

@llvm/pr-subscribers-clang

Author: woruyu (woruyu)

Changes

Summary

This PR resolves #154283

Patch is 29.64 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/156003.diff

12 Files Affected:

(modified) clang/include/clang/Basic/BuiltinsX86.td (+16-12)
(modified) clang/lib/AST/ExprConstant.cpp (+99)
(modified) clang/lib/Headers/avx2intrin.h (+8-12)
(modified) clang/lib/Headers/avx512bwintrin.h (+8-12)
(modified) clang/lib/Headers/emmintrin.h (+6-6)
(modified) clang/lib/Headers/mmintrin.h (+12-15)
(modified) clang/lib/Headers/smmintrin.h (+2-2)
(modified) clang/test/CodeGen/X86/avx2-builtins.c (+4)
(modified) clang/test/CodeGen/X86/avx512bw-builtins.c (+4)
(modified) clang/test/CodeGen/X86/mmx-builtins.c (+3)
(modified) clang/test/CodeGen/X86/sse2-builtins.c (+3)
(modified) clang/test/CodeGen/X86/sse41-builtins.c (+1)

diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td
index 527acd9ef086e..737be0f673e96 100644
--- a/clang/include/clang/Basic/BuiltinsX86.td
+++ b/clang/include/clang/Basic/BuiltinsX86.td
@@ -95,9 +95,6 @@ let Attributes = [Const, NoThrow, RequiredVectorWidth<128>] in {
   let Features = "sse2" in {
     def pavgb128 : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>)">;
     def pavgw128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>)">;
-    def packsswb128 : X86Builtin<"_Vector<16, char>(_Vector<8, short>, _Vector<8, short>)">;
-    def packssdw128 : X86Builtin<"_Vector<8, short>(_Vector<4, int>, _Vector<4, int>)">;
-    def packuswb128 : X86Builtin<"_Vector<16, char>(_Vector<8, short>, _Vector<8, short>)">;
     def vec_ext_v2di : X86Builtin<"long long int(_Vector<2, long long int>, _Constant int)">;
     def vec_ext_v4si : X86Builtin<"int(_Vector<4, int>, _Constant int)">;
     def vec_ext_v4sf : X86Builtin<"float(_Vector<4, float>, _Constant int)">;
@@ -108,6 +105,9 @@ let Attributes = [Const, NoThrow, RequiredVectorWidth<128>] in {
   let Features = "sse2", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
     def pmulhw128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>)">;
     def pmulhuw128 : X86Builtin<"_Vector<8, unsigned short>(_Vector<8, unsigned short>, _Vector<8, unsigned short>)">;
+    def packsswb128 : X86Builtin<"_Vector<16, char>(_Vector<8, short>, _Vector<8, short>)">;
+    def packssdw128 : X86Builtin<"_Vector<8, short>(_Vector<4, int>, _Vector<4, int>)">;
+    def packuswb128 : X86Builtin<"_Vector<16, char>(_Vector<8, short>, _Vector<8, short>)">;
   }
 
   let Features = "sse3" in {
@@ -314,7 +314,6 @@ let Features = "sse4.1", Attributes = [NoThrow, Const, RequiredVectorWidth<128>]
   def blendps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Constant int)">;
   def blendvpd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Vector<2, double>)">;
   def blendvps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>)">;
-  def packusdw128 : X86Builtin<"_Vector<8, short>(_Vector<4, int>, _Vector<4, int>)">;
   def roundps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Constant int)">;
   def roundss : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Constant int)">;
   def roundsd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Constant int)">;
@@ -333,6 +332,7 @@ let Features = "sse4.1", Attributes = [NoThrow, Const, RequiredVectorWidth<128>]
 
 let Features = "sse4.1", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
   def pmuldq128 : X86Builtin<"_Vector<2, long long int>(_Vector<4, int>, _Vector<4, int>)">;
+  def packusdw128 : X86Builtin<"_Vector<8, short>(_Vector<4, int>, _Vector<4, int>)">;
 }
 
 let Features = "sse4.2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
@@ -568,10 +568,6 @@ let Features = "avx", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in
 
 let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
   def mpsadbw256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>, _Constant char)">;
-  def packsswb256 : X86Builtin<"_Vector<32, char>(_Vector<16, short>, _Vector<16, short>)">;
-  def packssdw256 : X86Builtin<"_Vector<16, short>(_Vector<8, int>, _Vector<8, int>)">;
-  def packuswb256 : X86Builtin<"_Vector<32, char>(_Vector<16, short>, _Vector<16, short>)">;
-  def packusdw256 : X86Builtin<"_Vector<16, short>(_Vector<8, int>, _Vector<8, int>)">;
   def palignr256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>, _Constant int)">;
   def pavgb256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>)">;
   def pavgw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">;
@@ -636,6 +632,11 @@ let Features = "avx2", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWi
   def psrlv8si : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>)">;
   def psllv4di : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>)">;
   def psrlv4di : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>)">;
+
+  def packusdw256 : X86Builtin<"_Vector<16, short>(_Vector<8, int>, _Vector<8, int>)">;
+  def packsswb256 : X86Builtin<"_Vector<32, char>(_Vector<16, short>, _Vector<16, short>)">;
+  def packssdw256 : X86Builtin<"_Vector<16, short>(_Vector<8, int>, _Vector<8, int>)">;
+  def packuswb256 : X86Builtin<"_Vector<32, char>(_Vector<16, short>, _Vector<16, short>)">;
 }
 
 let Features = "avx2", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
@@ -1355,15 +1356,18 @@ let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWi
 
 let Features = "avx512bw,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
   def ucmpw512_mask : X86Builtin<"unsigned int(_Vector<32, short>, _Vector<32, short>, _Constant int, unsigned int)">;
-  def packssdw512 : X86Builtin<"_Vector<32, short>(_Vector<16, int>, _Vector<16, int>)">;
-  def packsswb512 : X86Builtin<"_Vector<64, char>(_Vector<32, short>, _Vector<32, short>)">;
-  def packusdw512 : X86Builtin<"_Vector<32, short>(_Vector<16, int>, _Vector<16, int>)">;
-  def packuswb512 : X86Builtin<"_Vector<64, char>(_Vector<32, short>, _Vector<32, short>)">;
   def pavgb512 : X86Builtin<"_Vector<64, char>(_Vector<64, char>, _Vector<64, char>)">;
   def pavgw512 : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Vector<32, short>)">;
   def pshufb512 : X86Builtin<"_Vector<64, char>(_Vector<64, char>, _Vector<64, char>)">;
 }
 
+let Features = "avx512bw,evex512", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in {
+  def packsswb512 : X86Builtin<"_Vector<64, char>(_Vector<32, short>, _Vector<32, short>)">;
+  def packssdw512 : X86Builtin<"_Vector<32, short>(_Vector<16, int>, _Vector<16, int>)">;
+  def packuswb512 : X86Builtin<"_Vector<64, char>(_Vector<32, short>, _Vector<32, short>)">;
+  def packusdw512 : X86Builtin<"_Vector<32, short>(_Vector<16, int>, _Vector<16, int>)">;
+}
+
 let Features = "avx512cd,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
   def vpconflictdi_128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>)">;
 }
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index a71cb8b0143be..8f649eca776ec 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -11599,6 +11599,89 @@ static bool handleVectorElementCast(EvalInfo &Info, const FPOptions FPO,
   return false;
 }
 
+enum class PackKind {
+  SSWB,
+  USWB,
+  SSDW,
+  USDW
+}; // 16→8 signed/unsigned; 32→16 signed/unsigned
+
+static bool evalPackBuiltin(const CallExpr *E, EvalInfo &Info, APValue &Result,
+                            PackKind K) {
+  APValue L, R;
+  if (!EvaluateAsRValue(Info, E->getArg(0), L) ||
+      !EvaluateAsRValue(Info, E->getArg(1), R))
+    return false;
+
+  unsigned SrcBits = (K == PackKind::SSWB || K == PackKind::USWB) ? 16 : 32;
+  unsigned DstBits = SrcBits / 2;
+
+  unsigned NL = L.getVectorLength();
+  unsigned NR = R.getVectorLength();
+  if (NL == 0 || NR == 0 || NL != NR)
+    return false;
+
+  // Bounds for saturation (extended to SrcBits for compares).
+  APInt Lo = (K == PackKind::USWB || K == PackKind::USDW)
+                 ? APInt(SrcBits, 0)
+                 : APInt::getSignedMinValue(DstBits).sext(SrcBits);
+  APInt Hi = (K == PackKind::USWB || K == PackKind::USDW)
+                 ? APInt::getMaxValue(DstBits).zext(SrcBits)
+                 : APInt::getSignedMaxValue(DstBits).sext(SrcBits);
+
+  // Result element signedness follows the builtin's return vector element type.
+  QualType DestEltTy = E->getType()->castAs<VectorType>()->getElementType();
+  bool DestIsUnsigned = DestEltTy->isUnsignedIntegerType();
+
+  // Clamp one source element to the target range and narrow to DstBits.
+  auto clampOne = [&](const APSInt &X) -> APSInt {
+    APInt V = X;
+    if (V.getBitWidth() != SrcBits)
+      V = V.sextOrTrunc(SrcBits);
+
+    if (K == PackKind::USWB || K == PackKind::USDW) {
+      if (V.isNegative())
+        V = Lo;
+      else if (V.ugt(Hi))
+        V = Hi;
+      APInt Narrow = V.zextOrTrunc(DstBits);
+      return APSInt(Narrow, /*isUnsigned=*/true);
+    } else {
+      if (V.sgt(Hi))
+        V = Hi;
+      else if (V.slt(Lo))
+        V = Lo;
+      APInt Narrow = V.sextOrTrunc(DstBits);
+      return APSInt(Narrow, /*isUnsigned=*/DestIsUnsigned);
+    }
+  };
+
+  SmallVector<APValue, 64> Out;
+  Out.reserve(NL + NR);
+
+  // Process per 128-bit lane (MMX 64-bit uses a single lane).
+  unsigned VectorBits = NL * SrcBits;
+  unsigned srcPerLane, lanes;
+  if (VectorBits >= 128) {
+    srcPerLane = 128 / SrcBits; // 8 (16→8) or 4 (32→16)
+    lanes = VectorBits / 128;   // 1 (128b), 2 (256b), 4 (512b)
+  } else {
+    srcPerLane = NL; // MMX
+    lanes = 1;
+  }
+
+  for (unsigned lane = 0; lane != lanes; ++lane) {
+    unsigned base = lane * srcPerLane;
+    for (unsigned i = 0; i != srcPerLane; ++i)
+      Out.push_back(APValue(clampOne(L.getVectorElt(base + i).getInt())));
+    for (unsigned i = 0; i != srcPerLane; ++i)
+      Out.push_back(APValue(clampOne(R.getVectorElt(base + i).getInt())));
+  }
+
+  Result = APValue(Out.data(), Out.size());
+  return true;
+}
+
 bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
   if (!IsConstantEvaluatedBuiltinCall(E))
     return ExprEvaluatorBaseTy::VisitCallExpr(E);
@@ -11752,6 +11835,22 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
 
     return Success(APValue(ResultElements.data(), ResultElements.size()), E);
   }
+  case X86::BI__builtin_ia32_packsswb128:
+  case X86::BI__builtin_ia32_packsswb256:
+  case X86::BI__builtin_ia32_packsswb512:
+    return evalPackBuiltin(E, Info, Result, PackKind::SSWB);
+  case X86::BI__builtin_ia32_packuswb128:
+  case X86::BI__builtin_ia32_packuswb256:
+  case X86::BI__builtin_ia32_packuswb512:
+    return evalPackBuiltin(E, Info, Result, PackKind::USWB);
+  case X86::BI__builtin_ia32_packssdw128:
+  case X86::BI__builtin_ia32_packssdw256:
+  case X86::BI__builtin_ia32_packssdw512:
+    return evalPackBuiltin(E, Info, Result, PackKind::SSDW);
+  case X86::BI__builtin_ia32_packusdw128:
+  case X86::BI__builtin_ia32_packusdw256:
+  case X86::BI__builtin_ia32_packusdw512:
+    return evalPackBuiltin(E, Info, Result, PackKind::USDW);
   case clang::X86::BI__builtin_ia32_pmuldq128:
   case clang::X86::BI__builtin_ia32_pmuldq256:
   case clang::X86::BI__builtin_ia32_pmuldq512:
diff --git a/clang/lib/Headers/avx2intrin.h b/clang/lib/Headers/avx2intrin.h
index ce5b2b7544d8c..22d7157f0db58 100644
--- a/clang/lib/Headers/avx2intrin.h
+++ b/clang/lib/Headers/avx2intrin.h
@@ -177,9 +177,8 @@ _mm256_abs_epi32(__m256i __a)
 ///    A 256-bit vector of [16 x i16] used to generate result[127:64] and
 ///    result[255:192].
 /// \returns A 256-bit integer vector containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_packs_epi16(__m256i __a, __m256i __b)
-{
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_packs_epi16(__m256i __a, __m256i __b) {
   return (__m256i)__builtin_ia32_packsswb256((__v16hi)__a, (__v16hi)__b);
 }
 
@@ -209,9 +208,8 @@ _mm256_packs_epi16(__m256i __a, __m256i __b)
 ///    A 256-bit vector of [8 x i32] used to generate result[127:64] and
 ///    result[255:192].
 /// \returns A 256-bit vector of [16 x i16] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_packs_epi32(__m256i __a, __m256i __b)
-{
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_packs_epi32(__m256i __a, __m256i __b) {
   return (__m256i)__builtin_ia32_packssdw256((__v8si)__a, (__v8si)__b);
 }
 
@@ -240,9 +238,8 @@ _mm256_packs_epi32(__m256i __a, __m256i __b)
 ///    A 256-bit vector of [16 x i16] used to generate result[127:64] and
 ///    result[255:192].
 /// \returns A 256-bit integer vector containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_packus_epi16(__m256i __a, __m256i __b)
-{
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_packus_epi16(__m256i __a, __m256i __b) {
   return (__m256i)__builtin_ia32_packuswb256((__v16hi)__a, (__v16hi)__b);
 }
 
@@ -272,9 +269,8 @@ _mm256_packus_epi16(__m256i __a, __m256i __b)
 ///    A 256-bit vector of [8 x i32] used to generate result[127:64] and
 ///    result[255:192].
 /// \returns A 256-bit vector of [16 x i16] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_packus_epi32(__m256i __V1, __m256i __V2)
-{
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_packus_epi32(__m256i __V1, __m256i __V2) {
   return (__m256i) __builtin_ia32_packusdw256((__v8si)__V1, (__v8si)__V2);
 }
 
diff --git a/clang/lib/Headers/avx512bwintrin.h b/clang/lib/Headers/avx512bwintrin.h
index 9263f7af3ee2f..b56f53107fa92 100644
--- a/clang/lib/Headers/avx512bwintrin.h
+++ b/clang/lib/Headers/avx512bwintrin.h
@@ -525,9 +525,8 @@ _mm512_maskz_abs_epi16 (__mmask32 __U, __m512i __A)
                                              (__v32hi)_mm512_setzero_si512());
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_packs_epi32(__m512i __A, __m512i __B)
-{
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_packs_epi32(__m512i __A, __m512i __B) {
   return (__m512i)__builtin_ia32_packssdw512((__v16si)__A, (__v16si)__B);
 }
 
@@ -547,9 +546,8 @@ _mm512_mask_packs_epi32(__m512i __W, __mmask32 __M, __m512i __A, __m512i __B)
                                        (__v32hi)__W);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_packs_epi16(__m512i __A, __m512i __B)
-{
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_packs_epi16(__m512i __A, __m512i __B) {
   return (__m512i)__builtin_ia32_packsswb512((__v32hi)__A, (__v32hi) __B);
 }
 
@@ -569,9 +567,8 @@ _mm512_maskz_packs_epi16(__mmask64 __M, __m512i __A, __m512i __B)
                                         (__v64qi)_mm512_setzero_si512());
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_packus_epi32(__m512i __A, __m512i __B)
-{
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_packus_epi32(__m512i __A, __m512i __B) {
   return (__m512i)__builtin_ia32_packusdw512((__v16si) __A, (__v16si) __B);
 }
 
@@ -591,9 +588,8 @@ _mm512_mask_packus_epi32(__m512i __W, __mmask32 __M, __m512i __A, __m512i __B)
                                        (__v32hi)__W);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_packus_epi16(__m512i __A, __m512i __B)
-{
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_packus_epi16(__m512i __A, __m512i __B) {
   return (__m512i)__builtin_ia32_packuswb512((__v32hi) __A, (__v32hi) __B);
 }
 
diff --git a/clang/lib/Headers/emmintrin.h b/clang/lib/Headers/emmintrin.h
index 8b6b62458dac1..f3c9eb7158c1f 100644
--- a/clang/lib/Headers/emmintrin.h
+++ b/clang/lib/Headers/emmintrin.h
@@ -4166,8 +4166,8 @@ void _mm_mfence(void);
 ///   A 128-bit integer vector of [8 x i16]. The converted [8 x i8] values are
 ///   written to the higher 64 bits of the result.
 /// \returns A 128-bit vector of [16 x i8] containing the converted values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi16(__m128i __a,
-                                                             __m128i __b) {
+static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_packs_epi16(__m128i __a, __m128i __b) {
   return (__m128i)__builtin_ia32_packsswb128((__v8hi)__a, (__v8hi)__b);
 }
 
@@ -4189,8 +4189,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi16(__m128i __a,
 ///    A 128-bit integer vector of [4 x i32]. The converted [4 x i16] values
 ///    are written to the higher 64 bits of the result.
 /// \returns A 128-bit vector of [8 x i16] containing the converted values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi32(__m128i __a,
-                                                             __m128i __b) {
+static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_packs_epi32(__m128i __a, __m128i __b) {
   return (__m128i)__builtin_ia32_packssdw128((__v4si)__a, (__v4si)__b);
 }
 
@@ -4212,8 +4212,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi32(__m128i __a,
 ///    A 128-bit integer vector of [8 x i16]. The converted [8 x i8] values are
 ///    written to the higher 64 bits of the result.
 /// \returns A 128-bit vector of [16 x i8] containing the converted values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packus_epi16(__m128i __a,
-                                                              __m128i __b) {
+static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_packus_epi16(__m128i __a, __m128i __b) {
   return (__m128i)__builtin_ia32_packuswb128((__v8hi)__a, (__v8hi)__b);
 }
 
diff --git a/clang/lib/Headers/mmintrin.h b/clang/lib/Headers/mmintrin.h
index 6fe9d67b8976d..aa6845f60ee99 100644
--- a/clang/lib/Headers/mmintrin.h
+++ b/clang/lib/Headers/mmintrin.h
@@ -162,11 +162,10 @@ _mm_cvtm64_si64(__m64 __m)
 ///    written to the upper 32 bits of the result.
 /// \returns A 64-bit integer vector of [8 x i8] containing the converted
 ///    values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
-_mm_packs_pi16(__m64 __m1, __m64 __m2)
-{
-    return __trunc64(__builtin_ia32_packsswb128(
-        (__v8hi)__builtin_shufflevector(__m1, __m2, 0, 1), (__v8hi){}));
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
+_mm_packs_pi16(__m64 __m1, __m64 __m2) {
+  return __trunc64(__builtin_ia32_packsswb128(
+      (__v8hi)__builtin_shufflevector(__m1, __m2, 0, 1), (__v8hi){}));
 }
 
 /// Converts, with saturation, 32-bit signed integers from both 64-bit integer
@@ -188,11 +187,10 @@ _mm_packs_pi16(__m64 __m1, __m64 __m2)
 ///    written to the upper 32 bits of the result.
 /// \returns A 64-bit integer vector of [4 x i16] containing the converted
 ///    values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
-_mm_packs_pi32(__m64 __m1, __m64 __m2)
-{
-    return __trunc64(__builtin_ia32_packssdw128(
-        (__v4si)__builtin_shufflevector(__m1, __m2, 0, 1), (__v4si){}));
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
+_mm_packs_pi32(__m64 __m1, __m64 __m2) {
+  return __trunc64(__builtin_ia32_packssdw128(
+      (__v4si)__builtin_shufflevector(__m1, __m2, 0, 1), (__v4si){}));
 }
 
 /// Converts, with saturation, 16-bit signed integers from both 64-bit integer
@@ -214,11 +212,10 @@ _mm_packs_pi32(__m64 __m1, __m64 __m2)
 ///    written to the upper 32 bits of the result.
 /// \returns A 64-bit integer vector of [8 x i8] containing the converted
 ///    values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
-_mm_packs_pu16(__m64 __m1, __m64 __m2)
-{
-    return __trunc64(__builtin_ia32_packuswb128(
-        (__v8hi)__builtin_shufflevector(__m1, __m2, 0, 1), (__v8hi){}));
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
+_mm_packs_pu16(__m64 __m1, __m64 __m2) {
+  return __trunc64(__builtin_ia32_packuswb128(
+      (__v8hi)__builtin_shufflevector(__m1, __m2, 0, 1), (__v8hi){}));
 }
 
 /// Unpacks the upper 32 bits from two 64-bit integer vectors of [8 x i8]
diff --git a/clang/lib/Headers/smmintrin.h b/clang/lib/Headers/smmintrin.h
index 57d0d329312af..8acf4929d1125 100644
--- a/clang/lib/Headers/smmintrin.h
+++ b/clang/lib/Headers/smmintrin.h
@@ -1475,8 +1475,8 @@ _mm_cvtepu32_epi64(__m128i __V) {
 ///    A 128-bit vector of [4 x i32]. The converted [4 x i16] values are
 ///    written to the higher 64 bits of the result.
 /// \returns A 128-bit vector of [8 x i16] containing the converted values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packus_epi32(__m128i __V1,
-                                                              __m128i __V2) {
+static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_packus_epi32(__m128i __V1, __m128i __V2) {
   return (__m128i)__builtin_ia32_packusdw128((__v4si)__V1, (__v4si)__V2);
 }
 
diff --git a/clang/test/CodeGen/X86/avx2-builtins.c b/clang/test/CodeGen/X86/avx2-builtins.c
index 29cb3e8860be9..d2d8b53c24a96 100644
--- a/clang/test/CodeGen/X86/avx2-builtins.c
+++ b/clang/test/CodeGen/X86/...
[truncated]

llvmbot · 2025-08-29T10:36:43Z

@llvm/pr-subscribers-backend-x86

Author: woruyu (woruyu)

Changes

Summary

This PR resolves #154283

Patch is 29.64 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/156003.diff

12 Files Affected:

(modified) clang/include/clang/Basic/BuiltinsX86.td (+16-12)
(modified) clang/lib/AST/ExprConstant.cpp (+99)
(modified) clang/lib/Headers/avx2intrin.h (+8-12)
(modified) clang/lib/Headers/avx512bwintrin.h (+8-12)
(modified) clang/lib/Headers/emmintrin.h (+6-6)
(modified) clang/lib/Headers/mmintrin.h (+12-15)
(modified) clang/lib/Headers/smmintrin.h (+2-2)
(modified) clang/test/CodeGen/X86/avx2-builtins.c (+4)
(modified) clang/test/CodeGen/X86/avx512bw-builtins.c (+4)
(modified) clang/test/CodeGen/X86/mmx-builtins.c (+3)
(modified) clang/test/CodeGen/X86/sse2-builtins.c (+3)
(modified) clang/test/CodeGen/X86/sse41-builtins.c (+1)

diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td
index 527acd9ef086e..737be0f673e96 100644
--- a/clang/include/clang/Basic/BuiltinsX86.td
+++ b/clang/include/clang/Basic/BuiltinsX86.td
@@ -95,9 +95,6 @@ let Attributes = [Const, NoThrow, RequiredVectorWidth<128>] in {
   let Features = "sse2" in {
     def pavgb128 : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>)">;
     def pavgw128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>)">;
-    def packsswb128 : X86Builtin<"_Vector<16, char>(_Vector<8, short>, _Vector<8, short>)">;
-    def packssdw128 : X86Builtin<"_Vector<8, short>(_Vector<4, int>, _Vector<4, int>)">;
-    def packuswb128 : X86Builtin<"_Vector<16, char>(_Vector<8, short>, _Vector<8, short>)">;
     def vec_ext_v2di : X86Builtin<"long long int(_Vector<2, long long int>, _Constant int)">;
     def vec_ext_v4si : X86Builtin<"int(_Vector<4, int>, _Constant int)">;
     def vec_ext_v4sf : X86Builtin<"float(_Vector<4, float>, _Constant int)">;
@@ -108,6 +105,9 @@ let Attributes = [Const, NoThrow, RequiredVectorWidth<128>] in {
   let Features = "sse2", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
     def pmulhw128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>)">;
     def pmulhuw128 : X86Builtin<"_Vector<8, unsigned short>(_Vector<8, unsigned short>, _Vector<8, unsigned short>)">;
+    def packsswb128 : X86Builtin<"_Vector<16, char>(_Vector<8, short>, _Vector<8, short>)">;
+    def packssdw128 : X86Builtin<"_Vector<8, short>(_Vector<4, int>, _Vector<4, int>)">;
+    def packuswb128 : X86Builtin<"_Vector<16, char>(_Vector<8, short>, _Vector<8, short>)">;
   }
 
   let Features = "sse3" in {
@@ -314,7 +314,6 @@ let Features = "sse4.1", Attributes = [NoThrow, Const, RequiredVectorWidth<128>]
   def blendps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Constant int)">;
   def blendvpd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Vector<2, double>)">;
   def blendvps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>)">;
-  def packusdw128 : X86Builtin<"_Vector<8, short>(_Vector<4, int>, _Vector<4, int>)">;
   def roundps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Constant int)">;
   def roundss : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Constant int)">;
   def roundsd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Constant int)">;
@@ -333,6 +332,7 @@ let Features = "sse4.1", Attributes = [NoThrow, Const, RequiredVectorWidth<128>]
 
 let Features = "sse4.1", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
   def pmuldq128 : X86Builtin<"_Vector<2, long long int>(_Vector<4, int>, _Vector<4, int>)">;
+  def packusdw128 : X86Builtin<"_Vector<8, short>(_Vector<4, int>, _Vector<4, int>)">;
 }
 
 let Features = "sse4.2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
@@ -568,10 +568,6 @@ let Features = "avx", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in
 
 let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
   def mpsadbw256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>, _Constant char)">;
-  def packsswb256 : X86Builtin<"_Vector<32, char>(_Vector<16, short>, _Vector<16, short>)">;
-  def packssdw256 : X86Builtin<"_Vector<16, short>(_Vector<8, int>, _Vector<8, int>)">;
-  def packuswb256 : X86Builtin<"_Vector<32, char>(_Vector<16, short>, _Vector<16, short>)">;
-  def packusdw256 : X86Builtin<"_Vector<16, short>(_Vector<8, int>, _Vector<8, int>)">;
   def palignr256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>, _Constant int)">;
   def pavgb256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>)">;
   def pavgw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">;
@@ -636,6 +632,11 @@ let Features = "avx2", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWi
   def psrlv8si : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>)">;
   def psllv4di : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>)">;
   def psrlv4di : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>)">;
+
+  def packusdw256 : X86Builtin<"_Vector<16, short>(_Vector<8, int>, _Vector<8, int>)">;
+  def packsswb256 : X86Builtin<"_Vector<32, char>(_Vector<16, short>, _Vector<16, short>)">;
+  def packssdw256 : X86Builtin<"_Vector<16, short>(_Vector<8, int>, _Vector<8, int>)">;
+  def packuswb256 : X86Builtin<"_Vector<32, char>(_Vector<16, short>, _Vector<16, short>)">;
 }
 
 let Features = "avx2", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
@@ -1355,15 +1356,18 @@ let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWi
 
 let Features = "avx512bw,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
   def ucmpw512_mask : X86Builtin<"unsigned int(_Vector<32, short>, _Vector<32, short>, _Constant int, unsigned int)">;
-  def packssdw512 : X86Builtin<"_Vector<32, short>(_Vector<16, int>, _Vector<16, int>)">;
-  def packsswb512 : X86Builtin<"_Vector<64, char>(_Vector<32, short>, _Vector<32, short>)">;
-  def packusdw512 : X86Builtin<"_Vector<32, short>(_Vector<16, int>, _Vector<16, int>)">;
-  def packuswb512 : X86Builtin<"_Vector<64, char>(_Vector<32, short>, _Vector<32, short>)">;
   def pavgb512 : X86Builtin<"_Vector<64, char>(_Vector<64, char>, _Vector<64, char>)">;
   def pavgw512 : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Vector<32, short>)">;
   def pshufb512 : X86Builtin<"_Vector<64, char>(_Vector<64, char>, _Vector<64, char>)">;
 }
 
+let Features = "avx512bw,evex512", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in {
+  def packsswb512 : X86Builtin<"_Vector<64, char>(_Vector<32, short>, _Vector<32, short>)">;
+  def packssdw512 : X86Builtin<"_Vector<32, short>(_Vector<16, int>, _Vector<16, int>)">;
+  def packuswb512 : X86Builtin<"_Vector<64, char>(_Vector<32, short>, _Vector<32, short>)">;
+  def packusdw512 : X86Builtin<"_Vector<32, short>(_Vector<16, int>, _Vector<16, int>)">;
+}
+
 let Features = "avx512cd,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
   def vpconflictdi_128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>)">;
 }
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index a71cb8b0143be..8f649eca776ec 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -11599,6 +11599,89 @@ static bool handleVectorElementCast(EvalInfo &Info, const FPOptions FPO,
   return false;
 }
 
+enum class PackKind {
+  SSWB,
+  USWB,
+  SSDW,
+  USDW
+}; // 16→8 signed/unsigned; 32→16 signed/unsigned
+
+static bool evalPackBuiltin(const CallExpr *E, EvalInfo &Info, APValue &Result,
+                            PackKind K) {
+  APValue L, R;
+  if (!EvaluateAsRValue(Info, E->getArg(0), L) ||
+      !EvaluateAsRValue(Info, E->getArg(1), R))
+    return false;
+
+  unsigned SrcBits = (K == PackKind::SSWB || K == PackKind::USWB) ? 16 : 32;
+  unsigned DstBits = SrcBits / 2;
+
+  unsigned NL = L.getVectorLength();
+  unsigned NR = R.getVectorLength();
+  if (NL == 0 || NR == 0 || NL != NR)
+    return false;
+
+  // Bounds for saturation (extended to SrcBits for compares).
+  APInt Lo = (K == PackKind::USWB || K == PackKind::USDW)
+                 ? APInt(SrcBits, 0)
+                 : APInt::getSignedMinValue(DstBits).sext(SrcBits);
+  APInt Hi = (K == PackKind::USWB || K == PackKind::USDW)
+                 ? APInt::getMaxValue(DstBits).zext(SrcBits)
+                 : APInt::getSignedMaxValue(DstBits).sext(SrcBits);
+
+  // Result element signedness follows the builtin's return vector element type.
+  QualType DestEltTy = E->getType()->castAs<VectorType>()->getElementType();
+  bool DestIsUnsigned = DestEltTy->isUnsignedIntegerType();
+
+  // Clamp one source element to the target range and narrow to DstBits.
+  auto clampOne = [&](const APSInt &X) -> APSInt {
+    APInt V = X;
+    if (V.getBitWidth() != SrcBits)
+      V = V.sextOrTrunc(SrcBits);
+
+    if (K == PackKind::USWB || K == PackKind::USDW) {
+      if (V.isNegative())
+        V = Lo;
+      else if (V.ugt(Hi))
+        V = Hi;
+      APInt Narrow = V.zextOrTrunc(DstBits);
+      return APSInt(Narrow, /*isUnsigned=*/true);
+    } else {
+      if (V.sgt(Hi))
+        V = Hi;
+      else if (V.slt(Lo))
+        V = Lo;
+      APInt Narrow = V.sextOrTrunc(DstBits);
+      return APSInt(Narrow, /*isUnsigned=*/DestIsUnsigned);
+    }
+  };
+
+  SmallVector<APValue, 64> Out;
+  Out.reserve(NL + NR);
+
+  // Process per 128-bit lane (MMX 64-bit uses a single lane).
+  unsigned VectorBits = NL * SrcBits;
+  unsigned srcPerLane, lanes;
+  if (VectorBits >= 128) {
+    srcPerLane = 128 / SrcBits; // 8 (16→8) or 4 (32→16)
+    lanes = VectorBits / 128;   // 1 (128b), 2 (256b), 4 (512b)
+  } else {
+    srcPerLane = NL; // MMX
+    lanes = 1;
+  }
+
+  for (unsigned lane = 0; lane != lanes; ++lane) {
+    unsigned base = lane * srcPerLane;
+    for (unsigned i = 0; i != srcPerLane; ++i)
+      Out.push_back(APValue(clampOne(L.getVectorElt(base + i).getInt())));
+    for (unsigned i = 0; i != srcPerLane; ++i)
+      Out.push_back(APValue(clampOne(R.getVectorElt(base + i).getInt())));
+  }
+
+  Result = APValue(Out.data(), Out.size());
+  return true;
+}
+
 bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
   if (!IsConstantEvaluatedBuiltinCall(E))
     return ExprEvaluatorBaseTy::VisitCallExpr(E);
@@ -11752,6 +11835,22 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
 
     return Success(APValue(ResultElements.data(), ResultElements.size()), E);
   }
+  case X86::BI__builtin_ia32_packsswb128:
+  case X86::BI__builtin_ia32_packsswb256:
+  case X86::BI__builtin_ia32_packsswb512:
+    return evalPackBuiltin(E, Info, Result, PackKind::SSWB);
+  case X86::BI__builtin_ia32_packuswb128:
+  case X86::BI__builtin_ia32_packuswb256:
+  case X86::BI__builtin_ia32_packuswb512:
+    return evalPackBuiltin(E, Info, Result, PackKind::USWB);
+  case X86::BI__builtin_ia32_packssdw128:
+  case X86::BI__builtin_ia32_packssdw256:
+  case X86::BI__builtin_ia32_packssdw512:
+    return evalPackBuiltin(E, Info, Result, PackKind::SSDW);
+  case X86::BI__builtin_ia32_packusdw128:
+  case X86::BI__builtin_ia32_packusdw256:
+  case X86::BI__builtin_ia32_packusdw512:
+    return evalPackBuiltin(E, Info, Result, PackKind::USDW);
   case clang::X86::BI__builtin_ia32_pmuldq128:
   case clang::X86::BI__builtin_ia32_pmuldq256:
   case clang::X86::BI__builtin_ia32_pmuldq512:
diff --git a/clang/lib/Headers/avx2intrin.h b/clang/lib/Headers/avx2intrin.h
index ce5b2b7544d8c..22d7157f0db58 100644
--- a/clang/lib/Headers/avx2intrin.h
+++ b/clang/lib/Headers/avx2intrin.h
@@ -177,9 +177,8 @@ _mm256_abs_epi32(__m256i __a)
 ///    A 256-bit vector of [16 x i16] used to generate result[127:64] and
 ///    result[255:192].
 /// \returns A 256-bit integer vector containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_packs_epi16(__m256i __a, __m256i __b)
-{
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_packs_epi16(__m256i __a, __m256i __b) {
   return (__m256i)__builtin_ia32_packsswb256((__v16hi)__a, (__v16hi)__b);
 }
 
@@ -209,9 +208,8 @@ _mm256_packs_epi16(__m256i __a, __m256i __b)
 ///    A 256-bit vector of [8 x i32] used to generate result[127:64] and
 ///    result[255:192].
 /// \returns A 256-bit vector of [16 x i16] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_packs_epi32(__m256i __a, __m256i __b)
-{
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_packs_epi32(__m256i __a, __m256i __b) {
   return (__m256i)__builtin_ia32_packssdw256((__v8si)__a, (__v8si)__b);
 }
 
@@ -240,9 +238,8 @@ _mm256_packs_epi32(__m256i __a, __m256i __b)
 ///    A 256-bit vector of [16 x i16] used to generate result[127:64] and
 ///    result[255:192].
 /// \returns A 256-bit integer vector containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_packus_epi16(__m256i __a, __m256i __b)
-{
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_packus_epi16(__m256i __a, __m256i __b) {
   return (__m256i)__builtin_ia32_packuswb256((__v16hi)__a, (__v16hi)__b);
 }
 
@@ -272,9 +269,8 @@ _mm256_packus_epi16(__m256i __a, __m256i __b)
 ///    A 256-bit vector of [8 x i32] used to generate result[127:64] and
 ///    result[255:192].
 /// \returns A 256-bit vector of [16 x i16] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_packus_epi32(__m256i __V1, __m256i __V2)
-{
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_packus_epi32(__m256i __V1, __m256i __V2) {
   return (__m256i) __builtin_ia32_packusdw256((__v8si)__V1, (__v8si)__V2);
 }
 
diff --git a/clang/lib/Headers/avx512bwintrin.h b/clang/lib/Headers/avx512bwintrin.h
index 9263f7af3ee2f..b56f53107fa92 100644
--- a/clang/lib/Headers/avx512bwintrin.h
+++ b/clang/lib/Headers/avx512bwintrin.h
@@ -525,9 +525,8 @@ _mm512_maskz_abs_epi16 (__mmask32 __U, __m512i __A)
                                              (__v32hi)_mm512_setzero_si512());
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_packs_epi32(__m512i __A, __m512i __B)
-{
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_packs_epi32(__m512i __A, __m512i __B) {
   return (__m512i)__builtin_ia32_packssdw512((__v16si)__A, (__v16si)__B);
 }
 
@@ -547,9 +546,8 @@ _mm512_mask_packs_epi32(__m512i __W, __mmask32 __M, __m512i __A, __m512i __B)
                                        (__v32hi)__W);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_packs_epi16(__m512i __A, __m512i __B)
-{
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_packs_epi16(__m512i __A, __m512i __B) {
   return (__m512i)__builtin_ia32_packsswb512((__v32hi)__A, (__v32hi) __B);
 }
 
@@ -569,9 +567,8 @@ _mm512_maskz_packs_epi16(__mmask64 __M, __m512i __A, __m512i __B)
                                         (__v64qi)_mm512_setzero_si512());
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_packus_epi32(__m512i __A, __m512i __B)
-{
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_packus_epi32(__m512i __A, __m512i __B) {
   return (__m512i)__builtin_ia32_packusdw512((__v16si) __A, (__v16si) __B);
 }
 
@@ -591,9 +588,8 @@ _mm512_mask_packus_epi32(__m512i __W, __mmask32 __M, __m512i __A, __m512i __B)
                                        (__v32hi)__W);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_packus_epi16(__m512i __A, __m512i __B)
-{
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_packus_epi16(__m512i __A, __m512i __B) {
   return (__m512i)__builtin_ia32_packuswb512((__v32hi) __A, (__v32hi) __B);
 }
 
diff --git a/clang/lib/Headers/emmintrin.h b/clang/lib/Headers/emmintrin.h
index 8b6b62458dac1..f3c9eb7158c1f 100644
--- a/clang/lib/Headers/emmintrin.h
+++ b/clang/lib/Headers/emmintrin.h
@@ -4166,8 +4166,8 @@ void _mm_mfence(void);
 ///   A 128-bit integer vector of [8 x i16]. The converted [8 x i8] values are
 ///   written to the higher 64 bits of the result.
 /// \returns A 128-bit vector of [16 x i8] containing the converted values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi16(__m128i __a,
-                                                             __m128i __b) {
+static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_packs_epi16(__m128i __a, __m128i __b) {
   return (__m128i)__builtin_ia32_packsswb128((__v8hi)__a, (__v8hi)__b);
 }
 
@@ -4189,8 +4189,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi16(__m128i __a,
 ///    A 128-bit integer vector of [4 x i32]. The converted [4 x i16] values
 ///    are written to the higher 64 bits of the result.
 /// \returns A 128-bit vector of [8 x i16] containing the converted values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi32(__m128i __a,
-                                                             __m128i __b) {
+static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_packs_epi32(__m128i __a, __m128i __b) {
   return (__m128i)__builtin_ia32_packssdw128((__v4si)__a, (__v4si)__b);
 }
 
@@ -4212,8 +4212,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi32(__m128i __a,
 ///    A 128-bit integer vector of [8 x i16]. The converted [8 x i8] values are
 ///    written to the higher 64 bits of the result.
 /// \returns A 128-bit vector of [16 x i8] containing the converted values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packus_epi16(__m128i __a,
-                                                              __m128i __b) {
+static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_packus_epi16(__m128i __a, __m128i __b) {
   return (__m128i)__builtin_ia32_packuswb128((__v8hi)__a, (__v8hi)__b);
 }
 
diff --git a/clang/lib/Headers/mmintrin.h b/clang/lib/Headers/mmintrin.h
index 6fe9d67b8976d..aa6845f60ee99 100644
--- a/clang/lib/Headers/mmintrin.h
+++ b/clang/lib/Headers/mmintrin.h
@@ -162,11 +162,10 @@ _mm_cvtm64_si64(__m64 __m)
 ///    written to the upper 32 bits of the result.
 /// \returns A 64-bit integer vector of [8 x i8] containing the converted
 ///    values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
-_mm_packs_pi16(__m64 __m1, __m64 __m2)
-{
-    return __trunc64(__builtin_ia32_packsswb128(
-        (__v8hi)__builtin_shufflevector(__m1, __m2, 0, 1), (__v8hi){}));
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
+_mm_packs_pi16(__m64 __m1, __m64 __m2) {
+  return __trunc64(__builtin_ia32_packsswb128(
+      (__v8hi)__builtin_shufflevector(__m1, __m2, 0, 1), (__v8hi){}));
 }
 
 /// Converts, with saturation, 32-bit signed integers from both 64-bit integer
@@ -188,11 +187,10 @@ _mm_packs_pi16(__m64 __m1, __m64 __m2)
 ///    written to the upper 32 bits of the result.
 /// \returns A 64-bit integer vector of [4 x i16] containing the converted
 ///    values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
-_mm_packs_pi32(__m64 __m1, __m64 __m2)
-{
-    return __trunc64(__builtin_ia32_packssdw128(
-        (__v4si)__builtin_shufflevector(__m1, __m2, 0, 1), (__v4si){}));
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
+_mm_packs_pi32(__m64 __m1, __m64 __m2) {
+  return __trunc64(__builtin_ia32_packssdw128(
+      (__v4si)__builtin_shufflevector(__m1, __m2, 0, 1), (__v4si){}));
 }
 
 /// Converts, with saturation, 16-bit signed integers from both 64-bit integer
@@ -214,11 +212,10 @@ _mm_packs_pi32(__m64 __m1, __m64 __m2)
 ///    written to the upper 32 bits of the result.
 /// \returns A 64-bit integer vector of [8 x i8] containing the converted
 ///    values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
-_mm_packs_pu16(__m64 __m1, __m64 __m2)
-{
-    return __trunc64(__builtin_ia32_packuswb128(
-        (__v8hi)__builtin_shufflevector(__m1, __m2, 0, 1), (__v8hi){}));
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
+_mm_packs_pu16(__m64 __m1, __m64 __m2) {
+  return __trunc64(__builtin_ia32_packuswb128(
+      (__v8hi)__builtin_shufflevector(__m1, __m2, 0, 1), (__v8hi){}));
 }
 
 /// Unpacks the upper 32 bits from two 64-bit integer vectors of [8 x i8]
diff --git a/clang/lib/Headers/smmintrin.h b/clang/lib/Headers/smmintrin.h
index 57d0d329312af..8acf4929d1125 100644
--- a/clang/lib/Headers/smmintrin.h
+++ b/clang/lib/Headers/smmintrin.h
@@ -1475,8 +1475,8 @@ _mm_cvtepu32_epi64(__m128i __V) {
 ///    A 128-bit vector of [4 x i32]. The converted [4 x i16] values are
 ///    written to the higher 64 bits of the result.
 /// \returns A 128-bit vector of [8 x i16] containing the converted values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packus_epi32(__m128i __V1,
-                                                              __m128i __V2) {
+static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_packus_epi32(__m128i __V1, __m128i __V2) {
   return (__m128i)__builtin_ia32_packusdw128((__v4si)__V1, (__v4si)__V2);
 }
 
diff --git a/clang/test/CodeGen/X86/avx2-builtins.c b/clang/test/CodeGen/X86/avx2-builtins.c
index 29cb3e8860be9..d2d8b53c24a96 100644
--- a/clang/test/CodeGen/X86/avx2-builtins.c
+++ b/clang/test/CodeGen/X86/...
[truncated]

RKSimon

You need to add handling in InterpBuiltin.cpp as well for the new interpreter - I'd recommend waiting until I've completed #155891 - but I'd also recommend you replace the PackKind enum with a llvm::function_ref callback approach like I've used in #155891

clang/lib/AST/ExprConstant.cpp

woruyu · 2025-09-17T08:28:21Z

You need to add handling in InterpBuiltin.cpp as well for the new interpreter - I'd recommend waiting until I've completed #155891 - but I'd also recommend you replace the PackKind enum with a llvm::function_ref callback approach like I've used in #155891

Thanks for the pointers. I have finished New interpreter and callback replacement. By the way, both New interpreter and AST use the same NarrowElement logic; right now it’s duplicated.

clang/lib/AST/ByteCode/InterpBuiltin.cpp

clang/include/clang/Basic/BuiltinsX86.td

RKSimon

Make use of your callback functions already knowing what form of pack instruction they use - no need to pass in usat / bitwidth etc. in.

clang/lib/AST/ByteCode/InterpBuiltin.cpp

github-actions · 2025-09-19T04:12:16Z

✅ With the latest revision this PR passed the C/C++ code formatter.

clang/lib/AST/ByteCode/InterpBuiltin.cpp

clang/lib/AST/ExprConstant.cpp

…X512 pack intrinsics to be used in constexpr

clang/lib/AST/ByteCode/InterpBuiltin.cpp

clang/lib/AST/ExprConstant.cpp

RKSimon

one minor

clang/lib/AST/ExprConstant.cpp

RKSimon

LGTM - cheers!

…e-constexpr

…X512 pack intrinsics to be used in constexpr (llvm#156003) Fixes llvm#154283

llvmbot added clang Clang issues not falling into any other category backend:X86 clang:frontend Language frontend issues, e.g. anything involving "Sema" clang:headers Headers provided by Clang, e.g. for intrinsics labels Aug 29, 2025

woruyu requested a review from RKSimon August 29, 2025 10:39

RKSimon requested changes Aug 29, 2025

View reviewed changes

shafik reviewed Aug 29, 2025

View reviewed changes

clang/lib/AST/ExprConstant.cpp Outdated Show resolved Hide resolved

clang/lib/AST/ExprConstant.cpp Outdated Show resolved Hide resolved

clang/lib/AST/ExprConstant.cpp Outdated Show resolved Hide resolved

clang/lib/AST/ExprConstant.cpp Outdated Show resolved Hide resolved

RKSimon mentioned this pull request Sep 4, 2025

[X86] Add MMX/SSE/AVX PHADD/SUB & HADDPS/D intrinsics to be used in constexpr #156822

Merged

woruyu self-assigned this Sep 16, 2025

woruyu force-pushed the fix/allow-SSE-AVX2-AVX512-pack-intrinsics-use-constexpr branch from 372f6ce to 1e95c44 Compare September 17, 2025 08:17

llvmbot added the clang:bytecode Issues for the clang bytecode constexpr interpreter label Sep 17, 2025

woruyu force-pushed the fix/allow-SSE-AVX2-AVX512-pack-intrinsics-use-constexpr branch from 1e95c44 to c43b49e Compare September 17, 2025 08:22

woruyu force-pushed the fix/allow-SSE-AVX2-AVX512-pack-intrinsics-use-constexpr branch from c43b49e to 9bb6c25 Compare September 17, 2025 08:31

RKSimon requested changes Sep 17, 2025

View reviewed changes

clang/lib/AST/ByteCode/InterpBuiltin.cpp Outdated Show resolved Hide resolved

clang/lib/AST/ByteCode/InterpBuiltin.cpp Outdated Show resolved Hide resolved

RKSimon reviewed Sep 17, 2025

View reviewed changes

clang/include/clang/Basic/BuiltinsX86.td Outdated Show resolved Hide resolved

woruyu force-pushed the fix/allow-SSE-AVX2-AVX512-pack-intrinsics-use-constexpr branch from 9bb6c25 to f00dd0f Compare September 18, 2025 03:16

RKSimon requested changes Sep 18, 2025

View reviewed changes

woruyu force-pushed the fix/allow-SSE-AVX2-AVX512-pack-intrinsics-use-constexpr branch from f00dd0f to 6fd3434 Compare September 19, 2025 04:08

tbaederr reviewed Sep 19, 2025

View reviewed changes

woruyu force-pushed the fix/allow-SSE-AVX2-AVX512-pack-intrinsics-use-constexpr branch from 4a03b16 to d19686d Compare September 19, 2025 09:21

RKSimon requested changes Sep 19, 2025

View reviewed changes

clang/lib/AST/ByteCode/InterpBuiltin.cpp Outdated Show resolved Hide resolved

clang/lib/AST/ByteCode/InterpBuiltin.cpp Outdated Show resolved Hide resolved

clang/lib/AST/ExprConstant.cpp Outdated Show resolved Hide resolved

[Headers][X86] VectorExprEvaluator::VisitCallExpr - allow SSE/AVX2/AV…

751b39b

…X512 pack intrinsics to be used in constexpr

tbaederr reviewed Sep 22, 2025

View reviewed changes

clang/lib/AST/ByteCode/InterpBuiltin.cpp Outdated Show resolved Hide resolved

fix: review

e09530b

woruyu force-pushed the fix/allow-SSE-AVX2-AVX512-pack-intrinsics-use-constexpr branch from d19686d to e09530b Compare September 22, 2025 03:34

fix: unnecessary include

3b3c325

RKSimon requested changes Sep 22, 2025

View reviewed changes

clang/lib/AST/ByteCode/InterpBuiltin.cpp Outdated Show resolved Hide resolved

clang/lib/AST/ExprConstant.cpp Outdated Show resolved Hide resolved

fix: review

83e4b0f

RKSimon requested changes Sep 24, 2025

View reviewed changes

clang/lib/AST/ExprConstant.cpp Outdated Show resolved Hide resolved

fix: modify capitalize first letter

ddfb042

RKSimon approved these changes Sep 24, 2025

View reviewed changes

Merge branch 'main' into fix/allow-SSE-AVX2-AVX512-pack-intrinsics-us…

857bc25

…e-constexpr

RKSimon enabled auto-merge (squash) September 24, 2025 12:39

RKSimon merged commit 0a268f8 into llvm:main Sep 24, 2025
9 checks passed

mahesh-attarde pushed a commit to mahesh-attarde/llvm-project that referenced this pull request Oct 3, 2025

[Headers][X86] VectorExprEvaluator::VisitCallExpr - allow SSE/AVX2/AV…

5e291a6

…X512 pack intrinsics to be used in constexpr (llvm#156003) Fixes llvm#154283

Uh oh!

[Headers][X86] VectorExprEvaluator::VisitCallExpr - allow SSE/AVX2/AVX512 pack intrinsics to be used in constexpr #156003

[Headers][X86] VectorExprEvaluator::VisitCallExpr - allow SSE/AVX2/AVX512 pack intrinsics to be used in constexpr #156003

Uh oh!

Conversation

woruyu commented Aug 29, 2025

Summary

Uh oh!

llvmbot commented Aug 29, 2025

Summary

Uh oh!

llvmbot commented Aug 29, 2025

Summary

Uh oh!

RKSimon left a comment

Choose a reason for hiding this comment

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

woruyu commented Sep 17, 2025

Uh oh!

Uh oh!

Uh oh!

Uh oh!

RKSimon left a comment • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

github-actions bot commented Sep 19, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

RKSimon left a comment

Choose a reason for hiding this comment

Uh oh!

Uh oh!

RKSimon left a comment

Choose a reason for hiding this comment

Uh oh!

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

5 participants

RKSimon left a comment •

edited

Loading

github-actions bot commented Sep 19, 2025 •

edited

Loading