Skip to content

Conversation

@ahmednoursphinx
Copy link
Contributor

This PR Resolves #165161

@llvmbot llvmbot added clang Clang issues not falling into any other category backend:X86 clang:frontend Language frontend issues, e.g. anything involving "Sema" clang:bytecode Issues for the clang bytecode constexpr interpreter labels Oct 29, 2025
@llvmbot
Copy link
Member

llvmbot commented Oct 29, 2025

@llvm/pr-subscribers-clang

@llvm/pr-subscribers-backend-x86

Author: Ahmed Nour (ahmednoursphinx)

Changes

This PR Resolves #165161


Full diff: https://github.com/llvm/llvm-project/pull/165513.diff

4 Files Affected:

  • (modified) clang/include/clang/Basic/BuiltinsX86.td (+4-1)
  • (modified) clang/lib/AST/ByteCode/InterpBuiltin.cpp (+36-4)
  • (modified) clang/lib/AST/ExprConstant.cpp (+35-4)
  • (modified) clang/test/CodeGen/X86/sse41-builtins.c (+10)
diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td
index 0c85e280e748b..a431fc36b41c1 100644
--- a/clang/include/clang/Basic/BuiltinsX86.td
+++ b/clang/include/clang/Basic/BuiltinsX86.td
@@ -327,8 +327,11 @@ let Features = "ssse3", Attributes = [NoThrow, Const, Constexpr, RequiredVectorW
   }
 }
 
-let Features = "sse4.1", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+let Features = "sse4.1", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
   def insertps128 : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Constant char)">;
+}
+
+let Features = "sse4.1", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
   def roundps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Constant int)">;
   def roundss : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Constant int)">;
   def roundsd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Constant int)">;
diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
index 8f23001ea5a39..b1f0832860476 100644
--- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp
+++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
@@ -3358,7 +3358,8 @@ static bool interp__builtin_x86_byteshift(
 static bool interp__builtin_ia32_shuffle_generic(
     InterpState &S, CodePtr OpPC, const CallExpr *Call,
     llvm::function_ref<std::pair<unsigned, unsigned>(unsigned, unsigned)>
-        GetSourceIndex) {
+        GetSourceIndex,
+    llvm::function_ref<bool(unsigned, unsigned)> ShouldZero = nullptr) {
 
   assert(Call->getNumArgs() == 3);
   unsigned ShuffleMask = popToAPSInt(S, Call->getArg(2)).getZExtValue();
@@ -3373,9 +3374,20 @@ static bool interp__builtin_ia32_shuffle_generic(
   const Pointer &Dst = S.Stk.peek<Pointer>();
 
   for (unsigned DstIdx = 0; DstIdx != NumElems; ++DstIdx) {
-    auto [SrcVecIdx, SrcIdx] = GetSourceIndex(DstIdx, ShuffleMask);
-    const Pointer &Src = (SrcVecIdx == 0) ? A : B;
-    TYPE_SWITCH(ElemT, { Dst.elem<T>(DstIdx) = Src.elem<T>(SrcIdx); });
+    if (ShouldZero && ShouldZero(DstIdx, ShuffleMask)) {
+      // Zero out this element
+      if (ElemT == PT_Float) {
+        Dst.elem<Floating>(DstIdx) = Floating(S.getASTContext().getFloatTypeSemantics(VecT->getElementType()));
+      } else {
+        INT_TYPE_SWITCH_NO_BOOL(ElemT, {
+          Dst.elem<T>(DstIdx) = T::from(0);
+        });
+      }
+    } else {
+      auto [SrcVecIdx, SrcIdx] = GetSourceIndex(DstIdx, ShuffleMask);
+      const Pointer &Src = (SrcVecIdx == 0) ? A : B;
+      TYPE_SWITCH(ElemT, { Dst.elem<T>(DstIdx) = Src.elem<T>(SrcIdx); });
+    }
   }
   Dst.initializeAllElements();
 
@@ -4348,6 +4360,26 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
           unsigned Index = (ShuffleMask >> BitIndex) & IndexMask;
           return std::pair<unsigned, unsigned>{SrcIdx, LaneOffset + Index};
         });
+  case X86::BI__builtin_ia32_insertps128:
+    return interp__builtin_ia32_shuffle_generic(
+        S, OpPC, Call,
+        [](unsigned DstIdx, unsigned Mask) {
+          // Bits [7:6]: select element from source vector Y (0-3)
+          // Bits [5:4]: select destination position (0-3)
+          unsigned SrcElem = (Mask >> 6) & 0x3;
+          unsigned DstElem = (Mask >> 4) & 0x3;
+          if (DstIdx == DstElem) {
+            // Insert element from source vector (B) at this position
+            return std::pair<unsigned, unsigned>{1, SrcElem};
+          } else {
+            // Copy from destination vector (A)
+            return std::pair<unsigned, unsigned>{0, DstIdx};
+          }
+        },
+        [](unsigned DstIdx, unsigned Mask) {
+          // Bits [3:0]: zero mask
+          return (Mask & (1 << DstIdx)) != 0;
+        });
   case X86::BI__builtin_ia32_pshufb128:
   case X86::BI__builtin_ia32_pshufb256:
   case X86::BI__builtin_ia32_pshufb512:
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index 29ee089505125..17c966b8c9f4c 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -11622,7 +11622,8 @@ static bool evalPackBuiltin(const CallExpr *E, EvalInfo &Info, APValue &Result,
 static bool evalShuffleGeneric(
     EvalInfo &Info, const CallExpr *Call, APValue &Out,
     llvm::function_ref<std::pair<unsigned, unsigned>(unsigned, unsigned)>
-        GetSourceIndex) {
+        GetSourceIndex,
+    llvm::function_ref<bool(unsigned, unsigned)> ShouldZero = nullptr) {
 
   const auto *VT = Call->getType()->getAs<VectorType>();
   if (!VT)
@@ -11643,9 +11644,15 @@ static bool evalShuffleGeneric(
   ResultElements.reserve(NumElts);
 
   for (unsigned DstIdx = 0; DstIdx != NumElts; ++DstIdx) {
-    auto [SrcVecIdx, SrcIdx] = GetSourceIndex(DstIdx, ShuffleMask);
-    const APValue &Src = (SrcVecIdx == 0) ? A : B;
-    ResultElements.push_back(Src.getVectorElt(SrcIdx));
+    if (ShouldZero && ShouldZero(DstIdx, ShuffleMask)) {
+      // Zero out this element
+      QualType ElemTy = VT->getElementType();
+      ResultElements.push_back(APValue(APFloat::getZero(Info.Ctx.getFloatTypeSemantics(ElemTy))));
+    } else {
+      auto [SrcVecIdx, SrcIdx] = GetSourceIndex(DstIdx, ShuffleMask);
+      const APValue &Src = (SrcVecIdx == 0) ? A : B;
+      ResultElements.push_back(Src.getVectorElt(SrcIdx));
+    }
   }
 
   Out = APValue(ResultElements.data(), ResultElements.size());
@@ -12481,6 +12488,30 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
       return false;
     return Success(R, E);
   }
+  case X86::BI__builtin_ia32_insertps128: {
+    APValue R;
+    if (!evalShuffleGeneric(
+            Info, E, R,
+            [](unsigned DstIdx, unsigned Mask) -> std::pair<unsigned, unsigned> {
+              // Bits [7:6]: select element from source vector Y (0-3)
+              // Bits [5:4]: select destination position (0-3)
+              unsigned SrcElem = (Mask >> 6) & 0x3;
+              unsigned DstElem = (Mask >> 4) & 0x3;
+              if (DstIdx == DstElem) {
+                // Insert element from source vector (B) at this position
+                return {1, SrcElem};
+              } else {
+                // Copy from destination vector (A)
+                return {0, DstIdx};
+              }
+            },
+            [](unsigned DstIdx, unsigned Mask) -> bool {
+              // Bits [3:0]: zero mask
+              return (Mask & (1 << DstIdx)) != 0;
+            }))
+      return false;
+    return Success(R, E);
+  }
   case X86::BI__builtin_ia32_pshufb128:
   case X86::BI__builtin_ia32_pshufb256:
   case X86::BI__builtin_ia32_pshufb512: {
diff --git a/clang/test/CodeGen/X86/sse41-builtins.c b/clang/test/CodeGen/X86/sse41-builtins.c
index 62cd392824bb2..35fa65a99836b 100644
--- a/clang/test/CodeGen/X86/sse41-builtins.c
+++ b/clang/test/CodeGen/X86/sse41-builtins.c
@@ -307,6 +307,16 @@ __m128 test_mm_insert_ps(__m128 x, __m128 y) {
   return _mm_insert_ps(x, y, 4);
 }
 
+TEST_CONSTEXPR((match_m128(_mm_insert_ps(((__m128)(__v4sf){1.0f, 2.0f, 3.0f, 4.0f}), ((__m128)(__v4sf){10.0f, 20.0f, 30.0f, 40.0f}), 0x10), 1.0f, 10.0f, 3.0f, 4.0f))); // Insert Y[0] into X[1]
+TEST_CONSTEXPR((match_m128(_mm_insert_ps(((__m128)(__v4sf){1.0f, 2.0f, 3.0f, 4.0f}), ((__m128)(__v4sf){10.0f, 20.0f, 30.0f, 40.0f}), 0x00), 10.0f, 2.0f, 3.0f, 4.0f))); // Insert Y[0] into X[0]
+TEST_CONSTEXPR((match_m128(_mm_insert_ps(((__m128)(__v4sf){1.0f, 2.0f, 3.0f, 4.0f}), ((__m128)(__v4sf){10.0f, 20.0f, 30.0f, 40.0f}), 0x20), 1.0f, 2.0f, 10.0f, 4.0f))); // Insert Y[0] into X[2]
+TEST_CONSTEXPR((match_m128(_mm_insert_ps(((__m128)(__v4sf){1.0f, 2.0f, 3.0f, 4.0f}), ((__m128)(__v4sf){10.0f, 20.0f, 30.0f, 40.0f}), 0x30), 1.0f, 2.0f, 3.0f, 10.0f))); // Insert Y[0] into X[3]
+TEST_CONSTEXPR((match_m128(_mm_insert_ps(((__m128)(__v4sf){1.0f, 2.0f, 3.0f, 4.0f}), ((__m128)(__v4sf){10.0f, 20.0f, 30.0f, 40.0f}), 0x80), 30.0f, 2.0f, 3.0f, 4.0f))); // Insert Y[2] into X[0]
+TEST_CONSTEXPR((match_m128(_mm_insert_ps(((__m128)(__v4sf){1.0f, 2.0f, 3.0f, 4.0f}), ((__m128)(__v4sf){10.0f, 20.0f, 30.0f, 40.0f}), 0x01), 0.0f, 2.0f, 3.0f, 4.0f))); // Insert Y[0] into X[0], zero X[0]
+TEST_CONSTEXPR((match_m128(_mm_insert_ps(((__m128)(__v4sf){1.0f, 2.0f, 3.0f, 4.0f}), ((__m128)(__v4sf){10.0f, 20.0f, 30.0f, 40.0f}), 0x0A), 10.0f, 0.0f, 3.0f, 0.0f))); // Insert Y[0] into X[0], zero X[1] and X[3]
+TEST_CONSTEXPR((match_m128(_mm_insert_ps(((__m128)(__v4sf){1.0f, 2.0f, 3.0f, 4.0f}), ((__m128)(__v4sf){10.0f, 20.0f, 30.0f, 40.0f}), 0x0F), 0.0f, 0.0f, 0.0f, 0.0f))); // Insert Y[0] into X[0], zero all
+TEST_CONSTEXPR((match_m128(_mm_insert_ps(((__m128)(__v4sf){1.0f, 2.0f, 3.0f, 4.0f}), ((__m128)(__v4sf){10.0f, 20.0f, 30.0f, 40.0f}), 0xCF), 0.0f, 0.0f, 0.0f, 0.0f))); // Insert Y[3] into X[0], zero all
+
 __m128i test_mm_max_epi8(__m128i x, __m128i y) {
   // CHECK-LABEL: test_mm_max_epi8
   // CHECK: call <16 x i8> @llvm.smax.v16i8(<16 x i8> %{{.*}}, <16 x i8> %{{.*}})

@tbaederr tbaederr requested a review from RKSimon October 29, 2025 06:02
@github-actions
Copy link

github-actions bot commented Oct 29, 2025

✅ With the latest revision this PR passed the C/C++ code formatter.

llvm::function_ref<std::pair<unsigned, unsigned>(unsigned, unsigned)>
GetSourceIndex) {
GetSourceIndex,
llvm::function_ref<bool(unsigned, unsigned)> ShouldZero = nullptr) {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We could just adjust the GetSourceIndex signature to either be std::tuple<unsigned, unsigned, bool> or std::pair<unsigned, int> and treat negative cases as zero.

There will be other shuffles (PSHUFB in particular) that will want zero handling as well.

CC @chaitanyav

Copy link
Collaborator

@RKSimon RKSimon left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks - the single callback approach looks better to me

EvalInfo &Info, const CallExpr *Call, APValue &Out,
llvm::function_ref<std::pair<unsigned, unsigned>(unsigned, unsigned)>
GetSourceIndex) {
GetSourceIndex,
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Remove ShouldZero and update this callback to match interp__builtin_ia32_shuffle_generic

@ahmednoursphinx
Copy link
Contributor Author

Hey @RKSimon, Thanks for the feedback
PR is ready for review again when you have time

Copy link
Collaborator

@RKSimon RKSimon left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LGTM - cheers

@RKSimon RKSimon enabled auto-merge (squash) October 30, 2025 18:19
@RKSimon RKSimon merged commit cf85cf4 into llvm:main Oct 30, 2025
8 of 9 checks passed
luciechoi pushed a commit to luciechoi/llvm-project that referenced this pull request Nov 1, 2025
DEBADRIBASAK pushed a commit to DEBADRIBASAK/llvm-project that referenced this pull request Nov 3, 2025
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

backend:X86 clang:bytecode Issues for the clang bytecode constexpr interpreter clang:frontend Language frontend issues, e.g. anything involving "Sema" clang Clang issues not falling into any other category

Projects

None yet

Development

Successfully merging this pull request may close these issues.

[Clang] VectorExprEvaluator::VisitCallExpr / InterpretBuiltin - allow insertps intrinsic to be used in constexp

3 participants