Conversation

pedroclobo (Member)

Closes #153002.
Part of #30794.

Adds constexpr support for the pmuludq and pmuldq intrinsics.
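
For illustration, a minimal sketch of what this enables (not part of the patch; assumes C++ with the updated headers on an SSE4.1-or-later target, and the inputs and assert messages below are made up):

  // Illustrative only: after this change, the 32x32->64 multiply intrinsics
  // fold in constant expressions. _mm_mul_epu32 zero-extends lanes 0 and 2 of
  // each operand before multiplying; _mm_mul_epi32 sign-extends them.
  #include <immintrin.h>

  constexpr __m128i U = _mm_mul_epu32((__m128i)(__v4si){3, 0, -1, 0},
                                      (__m128i)(__v4si){4, 0, 2, 0});
  static_assert(U[0] == 12 && U[1] == 0x1FFFFFFFELL, "pmuludq folds at compile time");

  constexpr __m128i S = _mm_mul_epi32((__m128i)(__v4si){-3, 0, 7, 0},
                                      (__m128i)(__v4si){5, 0, -2, 0});
  static_assert(S[0] == -15 && S[1] == -14, "pmuldq folds at compile time");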

@llvmbot added the clang, backend:X86, clang:frontend, clang:headers, llvm:support, and llvm:adt labels on Aug 12, 2025

llvmbot commented Aug 12, 2025

@llvm/pr-subscribers-llvm-adt

Author: Pedro Lobo (pedroclobo)

Changes

Closes #153002.
Part of #30794.

Adds constexpr support for the pmuludq and pmuldq intrinsics.


Full diff: https://github.com/llvm/llvm-project/pull/153293.diff

13 Files Affected:

  • (modified) clang/include/clang/Basic/BuiltinsX86.td (+16-4)
  • (modified) clang/lib/AST/ExprConstant.cpp (+37)
  • (modified) clang/lib/Headers/avx2intrin.h (+4-6)
  • (modified) clang/lib/Headers/avx512fintrin.h (+4-6)
  • (modified) clang/lib/Headers/emmintrin.h (+9-5)
  • (modified) clang/lib/Headers/smmintrin.h (+2-2)
  • (modified) clang/test/CodeGen/X86/avx2-builtins.c (+2)
  • (modified) clang/test/CodeGen/X86/avx512f-builtins.c (+2)
  • (modified) clang/test/CodeGen/X86/mmx-builtins.c (+1)
  • (modified) clang/test/CodeGen/X86/sse2-builtins.c (+1)
  • (modified) clang/test/CodeGen/X86/sse41-builtins.c (+1)
  • (modified) llvm/include/llvm/ADT/APInt.h (+6)
  • (modified) llvm/lib/Support/APInt.cpp (+16)
diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td
index 3efc0be8fa698..ca2cc081b787a 100644
--- a/clang/include/clang/Basic/BuiltinsX86.td
+++ b/clang/include/clang/Basic/BuiltinsX86.td
@@ -268,7 +268,6 @@ let Header = "emmintrin.h", Attributes = [NoThrow, RequireDeclaration] in {
 }
 
 let Features = "sse2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
-  def pmuludq128 : X86Builtin<"_Vector<2, long long int>(_Vector<4, int>, _Vector<4, int>)">;
   def psraw128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>)">;
   def psrad128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">;
   def psrlw128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>)">;
@@ -290,6 +289,10 @@ let Features = "sse2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] i
   def psrldqi128_byteshift : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Constant int)">;
 }
 
+let Features = "sse2", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
+  def pmuludq128 : X86Builtin<"_Vector<2, long long int>(_Vector<4, int>, _Vector<4, int>)">;
+}
+
 let Features = "sse3", Attributes = [NoThrow] in {
   def monitor : X86Builtin<"void(void const *, unsigned int, unsigned int)">;
   def mwait : X86Builtin<"void(unsigned int, unsigned int)">;
@@ -312,7 +315,6 @@ let Features = "sse4.1", Attributes = [NoThrow, Const, RequiredVectorWidth<128>]
   def blendvpd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Vector<2, double>)">;
   def blendvps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>)">;
   def packusdw128 : X86Builtin<"_Vector<8, short>(_Vector<4, int>, _Vector<4, int>)">;
-  def pmuldq128 : X86Builtin<"_Vector<2, long long int>(_Vector<4, int>, _Vector<4, int>)">;
   def roundps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Constant int)">;
   def roundss : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Constant int)">;
   def roundsd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Constant int)">;
@@ -329,6 +331,10 @@ let Features = "sse4.1", Attributes = [NoThrow, Const, RequiredVectorWidth<128>]
   def vec_set_v4si : X86Builtin<"_Vector<4, int>(_Vector<4, int>, int, _Constant int)">;
 }
 
+let Features = "sse4.1", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
+  def pmuldq128 : X86Builtin<"_Vector<2, long long int>(_Vector<4, int>, _Vector<4, int>)">;
+}
+
 let Features = "sse4.2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
   def pcmpistrm128 : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>, _Constant char)">;
   def pcmpistri128 : X86Builtin<"int(_Vector<16, char>, _Vector<16, char>, _Constant char)">;
@@ -580,9 +586,7 @@ let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] i
   def pmaddubsw256 : X86Builtin<"_Vector<16, short>(_Vector<32, char>, _Vector<32, char>)">;
   def pmaddwd256 : X86Builtin<"_Vector<8, int>(_Vector<16, short>, _Vector<16, short>)">;
   def pmovmskb256 : X86Builtin<"int(_Vector<32, char>)">;
-  def pmuldq256 : X86Builtin<"_Vector<4, long long int>(_Vector<8, int>, _Vector<8, int>)">;
   def pmulhrsw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">;
-  def pmuludq256 : X86Builtin<"_Vector<4, long long int>(_Vector<8, int>, _Vector<8, int>)">;
   def psadbw256 : X86Builtin<"_Vector<4, long long int>(_Vector<32, char>, _Vector<32, char>)">;
   def pshufb256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>)">;
   def pshufd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Constant int)">;
@@ -620,6 +624,11 @@ let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] i
   def insert128i256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<2, long long int>, _Constant int)">;
 }
 
+let Features = "avx2", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in {
+  def pmuldq256 : X86Builtin<"_Vector<4, long long int>(_Vector<8, int>, _Vector<8, int>)">;
+  def pmuludq256 : X86Builtin<"_Vector<4, long long int>(_Vector<8, int>, _Vector<8, int>)">;
+}
+
 let Features = "avx2", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in {
   def pmulhuw256 : X86Builtin<"_Vector<16, unsigned short>(_Vector<16, unsigned short>, _Vector<16, unsigned short>)">;
   def pmulhw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">;
@@ -1090,6 +1099,9 @@ let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWi
   def cvtpd2ps512_mask : X86Builtin<"_Vector<8, float>(_Vector<8, double>, _Vector<8, float>, unsigned char, _Constant int)">;
   def vcvtps2ph512_mask : X86Builtin<"_Vector<16, short>(_Vector<16, float>, _Constant int, _Vector<16, short>, unsigned short)">;
   def vcvtph2ps512_mask : X86Builtin<"_Vector<16, float>(_Vector<16, short>, _Vector<16, float>, unsigned short, _Constant int)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in {
   def pmuldq512 : X86Builtin<"_Vector<8, long long int>(_Vector<16, int>, _Vector<16, int>)">;
   def pmuludq512 : X86Builtin<"_Vector<8, long long int>(_Vector<16, int>, _Vector<16, int>)">;
 }
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index 36dd0f5d7a065..234f257122d8f 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -11688,6 +11688,43 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
 
     return Success(APValue(ResultElements.data(), ResultElements.size()), E);
   }
+  case clang::X86::BI__builtin_ia32_pmuldq128:
+  case clang::X86::BI__builtin_ia32_pmuldq256:
+  case clang::X86::BI__builtin_ia32_pmuldq512:
+  case clang::X86::BI__builtin_ia32_pmuludq128:
+  case clang::X86::BI__builtin_ia32_pmuludq256:
+  case clang::X86::BI__builtin_ia32_pmuludq512: {
+    APValue SourceLHS, SourceRHS;
+    if (!EvaluateAsRValue(Info, E->getArg(0), SourceLHS) ||
+        !EvaluateAsRValue(Info, E->getArg(1), SourceRHS))
+      return false;
+
+    unsigned SourceLen = SourceLHS.getVectorLength();
+    SmallVector<APValue, 4> ResultElements;
+    ResultElements.reserve(SourceLen / 2);
+
+    for (unsigned EltNum = 0; EltNum < SourceLen; EltNum += 2) {
+      APSInt LHS = SourceLHS.getVectorElt(EltNum).getInt();
+      APSInt RHS = SourceRHS.getVectorElt(EltNum).getInt();
+
+      switch (E->getBuiltinCallee()) {
+      case clang::X86::BI__builtin_ia32_pmuludq128:
+      case clang::X86::BI__builtin_ia32_pmuludq256:
+      case clang::X86::BI__builtin_ia32_pmuludq512:
+        ResultElements.push_back(
+            APValue(APSInt(llvm::APIntOps::muludq(LHS, RHS), true)));
+        break;
+      case clang::X86::BI__builtin_ia32_pmuldq128:
+      case clang::X86::BI__builtin_ia32_pmuldq256:
+      case clang::X86::BI__builtin_ia32_pmuldq512:
+        ResultElements.push_back(
+            APValue(APSInt(llvm::APIntOps::muldq(LHS, RHS), false)));
+        break;
+      }
+    }
+
+    return Success(APValue(ResultElements.data(), ResultElements.size()), E);
+  }
   case Builtin::BI__builtin_elementwise_max:
   case Builtin::BI__builtin_elementwise_min: {
     APValue SourceLHS, SourceRHS;
diff --git a/clang/lib/Headers/avx2intrin.h b/clang/lib/Headers/avx2intrin.h
index f00a8a516ecfe..b098417173ba3 100644
--- a/clang/lib/Headers/avx2intrin.h
+++ b/clang/lib/Headers/avx2intrin.h
@@ -1671,9 +1671,8 @@ _mm256_cvtepu32_epi64(__m128i __V) {
 /// \param __b
 ///    A 256-bit vector of [8 x i32] containing one of the source operands.
 /// \returns A 256-bit vector of [4 x i64] containing the products.
-static __inline__  __m256i __DEFAULT_FN_ATTRS256
-_mm256_mul_epi32(__m256i __a, __m256i __b)
-{
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_mul_epi32(__m256i __a, __m256i __b) {
   return (__m256i)__builtin_ia32_pmuldq256((__v8si)__a, (__v8si)__b);
 }
 
@@ -1800,9 +1799,8 @@ _mm256_mullo_epi32 (__m256i __a, __m256i __b)
 /// \param __b
 ///    A 256-bit vector of [8 x i32] containing one of the source operands.
 /// \returns A 256-bit vector of [4 x i64] containing the products.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mul_epu32(__m256i __a, __m256i __b)
-{
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_mul_epu32(__m256i __a, __m256i __b) {
   return __builtin_ia32_pmuludq256((__v8si)__a, (__v8si)__b);
 }
 
diff --git a/clang/lib/Headers/avx512fintrin.h b/clang/lib/Headers/avx512fintrin.h
index 9fc1df3acd3d0..0ba5aa338a6b8 100644
--- a/clang/lib/Headers/avx512fintrin.h
+++ b/clang/lib/Headers/avx512fintrin.h
@@ -1417,9 +1417,8 @@ _mm512_maskz_min_epu64 (__mmask8 __M, __m512i __A, __m512i __B)
                                              (__v8di)_mm512_setzero_si512());
 }
 
-static __inline __m512i __DEFAULT_FN_ATTRS512
-_mm512_mul_epi32(__m512i __X, __m512i __Y)
-{
+static __inline __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_mul_epi32(__m512i __X, __m512i __Y) {
   return (__m512i)__builtin_ia32_pmuldq512((__v16si)__X, (__v16si) __Y);
 }
 
@@ -1439,9 +1438,8 @@ _mm512_maskz_mul_epi32(__mmask8 __M, __m512i __X, __m512i __Y)
                                              (__v8di)_mm512_setzero_si512 ());
 }
 
-static __inline __m512i __DEFAULT_FN_ATTRS512
-_mm512_mul_epu32(__m512i __X, __m512i __Y)
-{
+static __inline __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_mul_epu32(__m512i __X, __m512i __Y) {
   return (__m512i)__builtin_ia32_pmuludq512((__v16si)__X, (__v16si)__Y);
 }
 
diff --git a/clang/lib/Headers/emmintrin.h b/clang/lib/Headers/emmintrin.h
index e632191113c4a..2ca25149db280 100644
--- a/clang/lib/Headers/emmintrin.h
+++ b/clang/lib/Headers/emmintrin.h
@@ -67,6 +67,9 @@ typedef __bf16 __m128bh __attribute__((__vector_size__(16), __aligned__(16)));
 
 #define __trunc64(x)                                                           \
   (__m64) __builtin_shufflevector((__v2di)(x), __extension__(__v2di){}, 0)
+#define __zext128(x)                                                           \
+  (__m128i) __builtin_shufflevector((__v2si)(x), __extension__(__v2si){}, 0,   \
+                                    1, 2, 3)
 #define __anyext128(x)                                                         \
   (__m128i) __builtin_shufflevector((__v2si)(x), __extension__(__v2si){}, 0,   \
                                     1, -1, -1)
@@ -2450,9 +2453,10 @@ _mm_mullo_epi16(__m128i __a, __m128i __b) {
 /// \param __b
 ///    A 64-bit integer containing one of the source operands.
 /// \returns A 64-bit integer vector containing the product of both operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_mul_su32(__m64 __a, __m64 __b) {
-  return __trunc64(__builtin_ia32_pmuludq128((__v4si)__anyext128(__a),
-                                             (__v4si)__anyext128(__b)));
+static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_mul_su32(__m64 __a,
+                                                                  __m64 __b) {
+  return __trunc64(__builtin_ia32_pmuludq128((__v4si)__zext128(__a),
+                                             (__v4si)__zext128(__b)));
 }
 
 /// Multiplies 32-bit unsigned integer values contained in the lower
@@ -2468,8 +2472,8 @@ static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_mul_su32(__m64 __a, __m64 __b) {
 /// \param __b
 ///    A [2 x i64] vector containing one of the source operands.
 /// \returns A [2 x i64] vector containing the product of both operands.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mul_epu32(__m128i __a,
-                                                           __m128i __b) {
+static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_mul_epu32(__m128i __a, __m128i __b) {
   return __builtin_ia32_pmuludq128((__v4si)__a, (__v4si)__b);
 }
 
diff --git a/clang/lib/Headers/smmintrin.h b/clang/lib/Headers/smmintrin.h
index c5075c419b70b..57d0d329312af 100644
--- a/clang/lib/Headers/smmintrin.h
+++ b/clang/lib/Headers/smmintrin.h
@@ -567,8 +567,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mullo_epi32(__m128i __V1,
 ///    A 128-bit vector of [4 x i32].
 /// \returns A 128-bit vector of [2 x i64] containing the products of both
 ///    operands.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mul_epi32(__m128i __V1,
-                                                           __m128i __V2) {
+static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_mul_epi32(__m128i __V1, __m128i __V2) {
   return (__m128i)__builtin_ia32_pmuldq128((__v4si)__V1, (__v4si)__V2);
 }
 
diff --git a/clang/test/CodeGen/X86/avx2-builtins.c b/clang/test/CodeGen/X86/avx2-builtins.c
index 8790485f00a8c..bd7390d58a74c 100644
--- a/clang/test/CodeGen/X86/avx2-builtins.c
+++ b/clang/test/CodeGen/X86/avx2-builtins.c
@@ -903,6 +903,7 @@ __m256i test_mm256_mul_epi32(__m256i a, __m256i b) {
   // CHECK: mul <4 x i64> %{{.*}}, %{{.*}}
   return _mm256_mul_epi32(a, b);
 }
+TEST_CONSTEXPR(match_v8si(_mm256_mul_epi32((__m256i)(__v8si){+1, -2, +3, -4, +5, -6, +7, -8}, (__m256i)(__v8si){-16, -14, +12, +10, -8, +6, -4, +2}), -16, -1, 36, 0, -40, -1, -28, -1));
 
 __m256i test_mm256_mul_epu32(__m256i a, __m256i b) {
   // CHECK-LABEL: test_mm256_mul_epu32
@@ -911,6 +912,7 @@ __m256i test_mm256_mul_epu32(__m256i a, __m256i b) {
   // CHECK: mul <4 x i64> %{{.*}}, %{{.*}}
   return _mm256_mul_epu32(a, b);
 }
+TEST_CONSTEXPR(match_v8si(_mm256_mul_epu32((__m256i)(__v8si){+1, -2, +3, -4, +5, -6, +7, -8}, (__m256i)(__v8si){-16, -14, +12, +10, -8, +6, -4, +2}), -16, 0, 36, 0, -40, 4, -28, 6));
 
 __m256i test_mm256_mulhi_epu16(__m256i a, __m256i b) {
   // CHECK-LABEL: test_mm256_mulhi_epu16
diff --git a/clang/test/CodeGen/X86/avx512f-builtins.c b/clang/test/CodeGen/X86/avx512f-builtins.c
index 048bc3057b5f4..68da485213941 100644
--- a/clang/test/CodeGen/X86/avx512f-builtins.c
+++ b/clang/test/CodeGen/X86/avx512f-builtins.c
@@ -3027,6 +3027,7 @@ __m512i test_mm512_mul_epi32(__m512i __A, __m512i __B) {
   //CHECK: mul <8 x i64> %{{.*}}, %{{.*}}
   return _mm512_mul_epi32(__A,__B);
 }
+TEST_CONSTEXPR(match_m512i(_mm512_mul_epi32((__m512i){1, 2, 3, 4, 5, 6, 7, 8}, (__m512i){16, 14, 12, 10, 8, 6, 4, 2}), 16, 28, 36, 40, 40, 36, 28, 16));
 
 __m512i test_mm512_maskz_mul_epi32 (__mmask8 __k,__m512i __A, __m512i __B) {
   //CHECK-LABEL: test_mm512_maskz_mul_epi32
@@ -3057,6 +3058,7 @@ __m512i test_mm512_mul_epu32 (__m512i __A, __m512i __B) {
   //CHECK: mul <8 x i64> %{{.*}}, %{{.*}}
   return _mm512_mul_epu32(__A,__B);
 }
+TEST_CONSTEXPR(match_m512i(_mm512_mul_epu32((__m512i){1, 2, 3, 4, 5, 6, 7, 8}, (__m512i){16, 14, 12, 10, 8, 6, 4, 2}), 16, 28, 36, 40, 40, 36, 28, 16));
 
 __m512i test_mm512_maskz_mul_epu32 (__mmask8 __k,__m512i __A, __m512i __B) {
   //CHECK-LABEL: test_mm512_maskz_mul_epu32
diff --git a/clang/test/CodeGen/X86/mmx-builtins.c b/clang/test/CodeGen/X86/mmx-builtins.c
index 07fab2d39054c..5c700107c58e9 100644
--- a/clang/test/CodeGen/X86/mmx-builtins.c
+++ b/clang/test/CodeGen/X86/mmx-builtins.c
@@ -385,6 +385,7 @@ __m64 test_mm_mul_su32(__m64 a, __m64 b) {
   // CHECK: mul <2 x i64> %{{.*}}, %{{.*}}
   return _mm_mul_su32(a, b);
 }
+TEST_CONSTEXPR(match_v4hi(_mm_mul_su32((__m64)(__v4hi){1, 2, 3, 4}, (__m64)(__v4hi){10, 8, 6, 4}), 10, 28, 16, 0));
 
 __m64 test_mm_mulhi_pi16(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_mulhi_pi16
diff --git a/clang/test/CodeGen/X86/sse2-builtins.c b/clang/test/CodeGen/X86/sse2-builtins.c
index 4ff14adcea0e1..0e00b2763db35 100644
--- a/clang/test/CodeGen/X86/sse2-builtins.c
+++ b/clang/test/CodeGen/X86/sse2-builtins.c
@@ -921,6 +921,7 @@ __m128i test_mm_mul_epu32(__m128i A, __m128i B) {
   // CHECK: mul <2 x i64> %{{.*}}, %{{.*}}
   return _mm_mul_epu32(A, B);
 }
+TEST_CONSTEXPR(match_v8hi(_mm_mul_epu32((__m128i)(__v8hi){+1, -2, +3, -4, +5, -6, +7, -8}, (__m128i)(__v8hi){-16, -14, +12, +10, -8, +6, -4, +2}), -16, 18, 11, -15, -40, 82, -50, 6));
 
 __m128d test_mm_mul_pd(__m128d A, __m128d B) {
   // CHECK-LABEL: test_mm_mul_pd
diff --git a/clang/test/CodeGen/X86/sse41-builtins.c b/clang/test/CodeGen/X86/sse41-builtins.c
index 10deb386d82aa..b7fc582788c8c 100644
--- a/clang/test/CodeGen/X86/sse41-builtins.c
+++ b/clang/test/CodeGen/X86/sse41-builtins.c
@@ -344,6 +344,7 @@ __m128i test_mm_mul_epi32(__m128i x, __m128i y) {
   // CHECK: mul <2 x i64> %{{.*}}, %{{.*}}
   return _mm_mul_epi32(x, y);
 }
+TEST_CONSTEXPR(match_v8hi(_mm_mul_epi32((__m128i)(__v8hi){+1, -2, +3, -4, +5, -6, +7, -8}, (__m128i)(__v8hi){-16, -14, +12, +10, -8, +6, -4, +2}), -16, 18, 26, 0, -40, 82, -42, -1));
 
 __m128i test_mm_mullo_epi32(__m128i x, __m128i y) {
   // CHECK-LABEL: test_mm_mullo_epi32
diff --git a/llvm/include/llvm/ADT/APInt.h b/llvm/include/llvm/ADT/APInt.h
index 295506393a1c4..cf7ddf3a9719f 100644
--- a/llvm/include/llvm/ADT/APInt.h
+++ b/llvm/include/llvm/ADT/APInt.h
@@ -2294,6 +2294,12 @@ LLVM_ABI APInt mulhs(const APInt &C1, const APInt &C2);
 /// Returns the high N bits of the multiplication result.
 LLVM_ABI APInt mulhu(const APInt &C1, const APInt &C2);
 
+/// Performs (2*N)-bit multiplication on sign-extended operands.
+LLVM_ABI APInt muldq(const APInt &C1, const APInt &C2);
+
+/// Performs (2*N)-bit multiplication on zero-extended operands.
+LLVM_ABI APInt muludq(const APInt &C1, const APInt &C2);
+
 /// Compute X^N for N>=0.
 /// 0^0 is supported and returns 1.
 LLVM_ABI APInt pow(const APInt &X, int64_t N);
diff --git a/llvm/lib/Support/APInt.cpp b/llvm/lib/Support/APInt.cpp
index 954af7fff92a8..2dd83e8bbe2fd 100644
--- a/llvm/lib/Support/APInt.cpp
+++ b/llvm/lib/Support/APInt.cpp
@@ -3136,6 +3136,22 @@ APInt APIntOps::mulhu(const APInt &C1, const APInt &C2) {
   return (C1Ext * C2Ext).extractBits(C1.getBitWidth(), C1.getBitWidth());
 }
 
+APInt APIntOps::muludq(const APInt &C1, const APInt &C2) {
+  assert(C1.getBitWidth() == C2.getBitWidth() && "Unequal bitwidths");
+  unsigned FullWidth = C1.getBitWidth() * 2;
+  APInt C1Ext = C1.zext(FullWidth);
+  APInt C2Ext = C2.zext(FullWidth);
+  return C1Ext * C2Ext;
+}
+
+APInt APIntOps::muldq(const APInt &C1, const APInt &C2) {
+  assert(C1.getBitWidth() == C2.getBitWidth() && "Unequal bitwidths");
+  unsigned FullWidth = C1.getBitWidth() * 2;
+  APInt C1Ext = C1.sext(FullWidth);
+  APInt C2Ext = C2.sext(FullWidth);
+  return C1Ext * C2Ext;
+}
+
 APInt APIntOps::pow(const APInt &X, int64_t N) {
   assert(N >= 0 && "negative exponents not supported.");
   APInt Acc = APInt(X.getBitWidth(), 1);
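
For reference, a standalone sketch of how the new APIntOps::muludq / APIntOps::muldq helpers behave (illustrative, not part of the diff above): both widen their operands to twice the bit width, zero-extending for muludq and sign-extending for muldq, and return the full-width product.

  #include "llvm/ADT/APInt.h"
  #include <cassert>

  int main() {
    // 32-bit inputs produce 64-bit products.
    llvm::APInt A(32, 0xFFFFFFFFu);               // largest unsigned 32-bit value
    llvm::APInt B(32, 2);
    llvm::APInt U = llvm::APIntOps::muludq(A, B); // zero-extend, then multiply
    assert(U.getBitWidth() == 64 && U == 0x1FFFFFFFEull);

    llvm::APInt C(32, -3, /*isSigned=*/true);
    llvm::APInt S = llvm::APIntOps::muldq(C, B);  // sign-extend, then multiply
    assert(S.getBitWidth() == 64 && S.getSExtValue() == -6);
    return 0;
  }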

llvmbot commented Aug 12, 2025

@llvm/pr-subscribers-llvm-support

llvmbot commented Aug 12, 2025

@llvm/pr-subscribers-clang

@llvmbot
Member

llvmbot commented Aug 12, 2025

@llvm/pr-subscribers-backend-x86

Author: Pedro Lobo (pedroclobo)

Changes

Closes #153002.
Part of #30794.

Adds constexpr support for pmuludq and pmuldq intrinsics.


Full diff: https://github.com/llvm/llvm-project/pull/153293.diff
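To make the effect concrete, here is a minimal usage sketch (illustrative only, not part of the patch): it mirrors the style of the TEST_CONSTEXPR tests in the diff and assumes a C++ translation unit compiled with SSE4.1 enabled.

```c++
#include <immintrin.h>

// pmuludq/pmuldq read the even 32-bit lanes (0 and 2) of each operand, widen
// them to 64 bits (zero- or sign-extending), and multiply; the odd lanes are ignored.
constexpr __m128i U = _mm_mul_epu32((__m128i)(__v4su){1u, 99u, 3u, 99u},
                                    (__m128i)(__v4su){10u, 99u, 30u, 99u});
// U holds {10, 90} as [2 x i64]; the 99 lanes do not participate.

constexpr __m128i S = _mm_mul_epi32((__m128i)(__v4si){-1, 0, 4, 0},
                                    (__m128i)(__v4si){10, 0, -5, 0});
// S holds {-10, -20} as [2 x i64], using sign-extended lanes.
```

With the builtins marked Constexpr, both initializers are folded by the constant evaluator instead of lowering to pmuludq/pmuldq instructions.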


@pedroclobo pedroclobo requested a review from RKSimon August 12, 2025 20:58
Member

@kuhar kuhar left a comment


We prefer to land ADT changes separately and rely on C++ unit tests for correctness. This is primarily because reverting anything in ADT is painful, since it forces a rebuild of the whole project.
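As a reference point, here is a minimal sketch of such a C++ unit test, assuming it would sit next to the existing coverage in llvm/unittests/ADT/APIntTest.cpp (the file location and test name are illustrative; the helper names are the ones in this PR's diff):

```c++
#include "llvm/ADT/APInt.h"
#include "gtest/gtest.h"
#include <cstdint>
using namespace llvm;

TEST(APIntTest, MulDQExtended) {
  APInt A(32, 0x80000000ULL); // 2^31, i.e. INT32_MIN when read as signed
  APInt B(32, 2);

  APInt U = APIntOps::muludq(A, B); // zero-extend both operands to 64 bits, then multiply
  EXPECT_EQ(64u, U.getBitWidth());
  EXPECT_EQ(uint64_t(1) << 32, U.getZExtValue()); // 2^31 * 2 == 2^32

  APInt S = APIntOps::muldq(A, B); // sign-extend both operands to 64 bits, then multiply
  EXPECT_EQ(64u, S.getBitWidth());
  EXPECT_EQ(-(int64_t(1) << 32), S.getSExtValue()); // INT32_MIN * 2 == -2^32
}
```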

@pedroclobo
Member Author

Just opened a separate PR for the ADT changes (#153399).

pedroclobo added a commit that referenced this pull request Aug 13, 2025
Adds `mulsExtended` and `muluExtended` methods to `APInt`, as suggested
in #153293.
These are based on the `MULDQ` and `MULUDQ` x86 intrinsics.
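A hypothetical usage of those renamed helpers, assuming their signatures mirror the muldq/muludq free functions in the diff above (equal-width operands, double-width result):

```c++
#include "llvm/ADT/APInt.h"
using namespace llvm;

void example() {
  APInt A(32, 7);
  APInt B(32, -3, /*isSigned=*/true); // -3 as a 32-bit APInt

  // Both return a 64-bit product of the 32-bit inputs.
  APInt S = APIntOps::mulsExtended(A, B); // sign-extended: 7 * -3 == -21
  APInt U = APIntOps::muluExtended(A, B); // zero-extended: 7 * (2^32 - 3) == 30064771051
  (void)S; (void)U;
}
```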
@pedroclobo pedroclobo force-pushed the x86-muludq-muldq-constexpr branch from fb8aa21 to c38db4c Compare August 13, 2025 21:51
@pedroclobo
Member Author

Rebased over 08eff57.

Collaborator

@RKSimon RKSimon left a comment


one last minor

Collaborator

@RKSimon RKSimon left a comment


LGTM - cheers

@pedroclobo
Member Author

Should I wait for @phoebewang's review before merging?

@RKSimon
Collaborator

RKSimon commented Aug 15, 2025

Should I wait for @phoebewang's review before merging?

If you want a second opinion, it's fine to wait, but you're under no obligation to.

@pedroclobo pedroclobo force-pushed the x86-muludq-muldq-constexpr branch from 9ad0839 to c82f901 Compare August 15, 2025 17:54
Adds `constexpr` support for `pmuludq` and `pmuldq` intrinsics.
@pedroclobo pedroclobo force-pushed the x86-muludq-muldq-constexpr branch from c82f901 to 4163545 Compare August 15, 2025 17:57
@kuhar kuhar requested review from kuhar and removed request for kuhar August 15, 2025 18:03
@pedroclobo pedroclobo requested a review from phoebewang August 15, 2025 18:21
Contributor

@phoebewang phoebewang left a comment


LGTM.

@pedroclobo pedroclobo merged commit d42a1d4 into llvm:main Aug 17, 2025
9 checks passed
@pedroclobo pedroclobo deleted the x86-muludq-muldq-constexpr branch August 17, 2025 23:32

Labels

backend:X86 clang:frontend Language frontend issues, e.g. anything involving "Sema" clang:headers Headers provided by Clang, e.g. for intrinsics clang Clang issues not falling into any other category llvm:adt llvm:support

Development

Successfully merging this pull request may close these issues.

[Headers][X86] Allow pmuludq/pmuldq to be used in constexpr

5 participants