Address reviews + upstream tests for pshufd

Thibault-Monnier · Thibault-Monnier · commit 8e2a43b1c21c · 2025-11-25T22:34:53.000+01:00
diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
@@ -70,11 +70,12 @@ static mlir::Value emitVectorFCmp(CIRGenBuilderTy &builder,
 
 static cir::VecShuffleOp emitPshufW(CIRGenFunction &cgf,
                                     CIRGenBuilderTy &builder,
-                                    llvm::SmallVector<mlir::Value> &ops,
+                                    const mlir::Value vec,
+                                    const mlir::Value immediate,
                                     const CallExpr *expr, const bool isLow) {
-  uint32_t imm = cgf.getZExtIntValueFromConstOp(ops[1]);
+  uint32_t imm = cgf.getZExtIntValueFromConstOp(immediate);
 
-  auto vecTy = cast<cir::VectorType>(ops[0].getType());
+  auto vecTy = cast<cir::VectorType>(vec.getType());
   unsigned numElts = vecTy.getSize();
 
   unsigned firstHalfStart = isLow ? 0 : 4;
@@ -93,10 +94,35 @@ static cir::VecShuffleOp emitPshufW(CIRGenFunction &cgf,
       indices[l + i] = l + i;
   }
 
-  return builder.createVecShuffle(cgf.getLoc(expr->getExprLoc()), ops[0],
+  return builder.createVecShuffle(cgf.getLoc(expr->getExprLoc()), vec,
                                   ArrayRef(indices, numElts));
 }
 
+static llvm::SmallVector<int64_t, 16>
+computeMaskPshufDOrShufP(CIRGenFunction &cgf, const mlir::Value vec,
+                         uint32_t imm, const bool isShufP) {
+  auto vecTy = cast<cir::VectorType>(vec.getType());
+  unsigned numElts = vecTy.getSize();
+  unsigned numLanes = cgf.cgm.getDataLayout().getTypeSizeInBits(vecTy) / 128;
+  unsigned numLaneElts = numElts / numLanes;
+
+  // Splat the 8-bits of immediate 4 times to help the loop wrap around.
+  imm = (imm & 0xff) * 0x01010101;
+
+  llvm::SmallVector<int64_t, 16> indices(numElts);
+  for (unsigned l = 0; l != numElts; l += numLaneElts) {
+    for (unsigned i = 0; i != numLaneElts; ++i) {
+      uint32_t idx = imm % numLaneElts;
+      imm /= numLaneElts;
+      if (isShufP && i >= (numLaneElts / 2))
+        idx += numElts;
+      indices[l + i] = l + idx;
+    }
+  }
+
+  return indices;
+}
+
 mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID,
                                                const CallExpr *expr) {
   if (builtinID == Builtin::BI__builtin_cpu_is) {
@@ -550,19 +576,19 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID,
   case X86::BI__builtin_ia32_pblendw256:
   case X86::BI__builtin_ia32_pblendd128:
   case X86::BI__builtin_ia32_pblendd256:
-  cgm.errorNYI(expr->getSourceRange(),
-           std::string("unimplemented X86 builtin call: ") +
-               getContext().BuiltinInfo.getName(builtinID));
-  return {};
+    cgm.errorNYI(expr->getSourceRange(),
+                 std::string("unimplemented X86 builtin call: ") +
+                     getContext().BuiltinInfo.getName(builtinID));
+    return {};
   case X86::BI__builtin_ia32_pshuflw:
   case X86::BI__builtin_ia32_pshuflw256:
   case X86::BI__builtin_ia32_pshuflw512: {
-    return emitPshufW(*this, builder, ops, expr, true);
+    return emitPshufW(*this, builder, ops[0], ops[1], expr, true);
   }
   case X86::BI__builtin_ia32_pshufhw:
   case X86::BI__builtin_ia32_pshufhw256:
   case X86::BI__builtin_ia32_pshufhw512: {
-    return emitPshufW(*this, builder, ops, expr, false);
+    return emitPshufW(*this, builder, ops[0], ops[1], expr, false);
   }
   case X86::BI__builtin_ia32_pshufd:
   case X86::BI__builtin_ia32_pshufd256:
@@ -573,59 +599,24 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID,
   case X86::BI__builtin_ia32_vpermilps256:
   case X86::BI__builtin_ia32_vpermilpd512:
   case X86::BI__builtin_ia32_vpermilps512: {
-    // TODO: Add tests for this branch.
-    uint32_t imm = getSExtIntValueFromConstOp(ops[1]);
-
-    auto vecTy = cast<cir::VectorType>(ops[0].getType());
-    unsigned numElts = vecTy.getSize();
-    auto eltTy = vecTy.getElementType();
+    const uint32_t imm = getSExtIntValueFromConstOp(ops[1]);
+    const llvm::SmallVector<int64_t, 16> mask =
+        computeMaskPshufDOrShufP(*this, ops[0], imm, false);
 
-    unsigned eltBitWidth = getTypeSizeInBits(eltTy).getFixedValue();
-    unsigned numLaneElts = 128 / eltBitWidth;
-
-    // Splat the 8-bits of immediate 4 times to help the loop wrap around.
-    imm = (imm & 0xff) * 0x01010101;
-
-    llvm::SmallVector<int64_t, 16> indices;
-    for (unsigned l = 0; l != numElts; l += numLaneElts) {
-      for (unsigned i = 0; i != numLaneElts; ++i) {
-        indices.push_back((imm % numLaneElts) + l);
-        imm /= numLaneElts;
-      }
-    }
-
-    return builder.createVecShuffle(getLoc(expr->getExprLoc()), ops[0],
-                                    indices);
+    return builder.createVecShuffle(getLoc(expr->getExprLoc()), ops[0], mask);
   }
   case X86::BI__builtin_ia32_shufpd:
   case X86::BI__builtin_ia32_shufpd256:
   case X86::BI__builtin_ia32_shufpd512:
   case X86::BI__builtin_ia32_shufps:
   case X86::BI__builtin_ia32_shufps256:
   case X86::BI__builtin_ia32_shufps512: {
-    uint32_t imm = getZExtIntValueFromConstOp(ops[2]);
-
-    auto vecTy = cast<cir::VectorType>(ops[0].getType());
-    unsigned numElts = vecTy.getSize();
-    unsigned numLanes = cgm.getDataLayout().getTypeSizeInBits(vecTy) / 128;
-    unsigned numLaneElts = numElts / numLanes;
-
-    // Splat the 8-bits of immediate 4 times to help the loop wrap around.
-    imm = (imm & 0xff) * 0x01010101;
-
-    int64_t indices[16];
-    for (unsigned l = 0; l != numElts; l += numLaneElts) {
-      for (unsigned i = 0; i != numLaneElts; ++i) {
-        uint32_t idx = imm % numLaneElts;
-        imm /= numLaneElts;
-        if (i >= (numLaneElts / 2))
-          idx += numElts;
-        indices[l + i] = l + idx;
-      }
-    }
+    const uint32_t imm = getZExtIntValueFromConstOp(ops[2]);
+    const llvm::SmallVector<int64_t, 16> mask =
+        computeMaskPshufDOrShufP(*this, ops[0], imm, true);
 
     return builder.createVecShuffle(getLoc(expr->getExprLoc()), ops[0], ops[1],
-                                    ArrayRef(indices, numElts));
+                                    mask);
   }
   case X86::BI__builtin_ia32_permdi256:
   case X86::BI__builtin_ia32_permdf256:
diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.h b/clang/lib/CIR/CodeGen/CIRGenFunction.h
@@ -202,6 +202,22 @@ class CIRGenFunction : public CIRGenTypeCache {
     return convertType(getContext().getTypeDeclType(t));
   }
 
+  /// Sign-extended int value of \p val, which must come from a cir::ConstantOp.
+  static int64_t getSExtIntValueFromConstOp(mlir::Value val) {
+    auto constOp = val.getDefiningOp<cir::ConstantOp>();
+    assert(constOp && "getSExtIntValueFromConstOp call with non ConstantOp");
+    return constOp.getIntValue().getSExtValue();
+  }
+
+  /// Zero-extended int value of \p val, which must come from a cir::ConstantOp.
+  /// NOTE(review): getZExtValue() result (uint64_t for APInt) is returned signed.
+  static int64_t getZExtIntValueFromConstOp(mlir::Value val) {
+    auto constOp = val.getDefiningOp<cir::ConstantOp>();
+    assert(constOp &&
+           "getZExtIntValueFromConstOp called with a non-ConstantOp value");
+    return constOp.getIntValue().getZExtValue();
+  }
+
   ///  Return the cir::TypeEvaluationKind of QualType \c type.
   static cir::TypeEvaluationKind getEvaluationKind(clang::QualType type);
 
@@ -1349,28 +1365,6 @@ class CIRGenFunction : public CIRGenTypeCache {
                                     cir::IntType resType, mlir::Value emittedE,
                                     bool isDynamic);
 
-  /// Get integer from a mlir::Value that is an int constant or a constant op.
-  static int64_t getSExtIntValueFromConstOp(mlir::Value val) {
-    auto constOp = val.getDefiningOp<cir::ConstantOp>();
-    assert(constOp && "getIntValueFromConstOp call with non ConstantOp");
-    return constOp.getIntValue().getSExtValue();
-  }
-
-  /// Get zero-extended integer from a mlir::Value that is an int constant or a
-  /// constant op.
-  static int64_t getZExtIntValueFromConstOp(mlir::Value val) {
-    auto constOp = val.getDefiningOp<cir::ConstantOp>();
-    assert(constOp &&
-           "getZeroExtendedIntValueFromConstOp call with non ConstantOp");
-    return constOp.getIntValue().getZExtValue();
-  }
-
-  /// Get size of type in bits using SizedTypeInterface
-  llvm::TypeSize getTypeSizeInBits(mlir::Type ty) const {
-    assert(cir::isSized(ty) && "Type must implement SizedTypeInterface");
-    return cgm.getDataLayout().getTypeSizeInBits(ty);
-  }
-
   mlir::Value evaluateOrEmitBuiltinObjectSize(const clang::Expr *e,
                                               unsigned type,
                                               cir::IntType resType,
diff --git a/clang/test/CIR/CodeGen/X86/builtin-x86-pshufd.cpp b/clang/test/CIR/CodeGen/X86/builtin-x86-pshufd.cpp
@@ -0,0 +1,113 @@
+// RUN: %clang_cc1 -std=c++20 -triple x86_64-unknown-linux-gnu -fclangir -emit-cir %s -o %t.cir
+// RUN: FileCheck --input-file=%t.cir %s
+
+// Test that __builtin_ia32_pshufd and the __builtin_ia32_vpermilp{s,d} builtins generate correct CIR vec.shuffle operations
+// This verifies the fix for SIMD intrinsic support that was previously NYI
+
+typedef int __v4si __attribute__((__vector_size__(16)));
+typedef float __v4sf __attribute__((__vector_size__(16)));
+typedef double __v2df __attribute__((__vector_size__(16)));
+typedef float __v8sf __attribute__((__vector_size__(32)));
+typedef double __v4df __attribute__((__vector_size__(32)));
+typedef float __v16sf __attribute__((__vector_size__(64)));
+typedef double __v8df __attribute__((__vector_size__(64)));
+
+typedef __v4si __m128i;
+typedef __v4sf __m128;
+typedef __v2df __m128d;
+typedef __v8sf __m256;
+typedef __v4df __m256d;
+typedef __v16sf __m512;
+typedef __v8df __m512d;
+
+// CHECK-LABEL: @_Z11test_pshufdv
+void test_pshufd() {
+    __m128i vec = {1, 2, 3, 4};
+    // CHECK: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x !s32i>) [#cir.int<2> : !s32i, #cir.int<3> : !s32i, #cir.int<0> : !s32i, #cir.int<1> : !s32i] : !cir.vector<4 x !s32i>
+    __m128i result = __builtin_ia32_pshufd(vec, 0x4E);
+}
+
+// CHECK-LABEL: @_Z19test_different_maskv
+void test_different_mask() {
+    __m128i vec = {10, 20, 30, 40};
+    // Test different immediate value: 0x1B = 00011011 = [3,2,1,0] reversed
+    // CHECK: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x !s32i>) [#cir.int<3> : !s32i, #cir.int<2> : !s32i, #cir.int<1> : !s32i, #cir.int<0> : !s32i] : !cir.vector<4 x !s32i>
+    __m128i result = __builtin_ia32_pshufd(vec, 0x1B);
+}
+
+// CHECK-LABEL: @_Z9test_casev
+void test_case() {
+    __m128i p0 = {1, 2, 3, 4};
+
+    // This reproduces the exact pattern from stb_image.h:2685 that was failing:
+    // _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p0, 0x4e));
+    // Which expands to: __builtin_ia32_pshufd(p0, 0x4e)
+    // CHECK: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x !s32i>) [#cir.int<2> : !s32i, #cir.int<3> : !s32i, #cir.int<0> : !s32i, #cir.int<1> : !s32i] : !cir.vector<4 x !s32i>
+    __m128i out_vec = __builtin_ia32_pshufd(p0, 0x4e);
+}
+
+// CHECK-LABEL: @_Z15test_vpermilps4v
+void test_vpermilps4() {
+    __m128 vec = {1.0f, 2.0f, 3.0f, 4.0f};
+    // vpermilps with immediate 0x4E = 01001110 = [2,3,0,1] for 4 elements
+    // CHECK: cir.vec.shuffle(%{{.*}}, %{{.*}} :  !cir.vector<4 x !cir.float>) [#cir.int<2> : !s32i, #cir.int<3> : !s32i, #cir.int<0> : !s32i, #cir.int<1> : !s32i] : !cir.vector<4 x !cir.float>
+    __m128 result = __builtin_ia32_vpermilps(vec, 0x4E);
+}
+
+// CHECK-LABEL: @_Z15test_vpermilpd2v
+void test_vpermilpd2() {
+    __m128d vec = {1.0, 2.0};
+    // vpermilpd with immediate 0x1 = 01 = [1,0] for 2 elements
+    // CHECK: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<2 x !cir.double>) [#cir.int<1> : !s32i, #cir.int<0> : !s32i] : !cir.vector<2 x !cir.double>
+    __m128d result = __builtin_ia32_vpermilpd(vec, 0x1);
+}
+
+// CHECK-LABEL: @_Z17test_vpermilps256v
+void test_vpermilps256() {
+    __m256 vec = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f};
+    // vpermilps256 with immediate 0x1B = 00011011 = [3,2,1,0] for each 128-bit lane
+    // CHECK: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<8 x !cir.float>) [#cir.int<3> : !s32i, #cir.int<2> : !s32i, #cir.int<1> : !s32i, #cir.int<0> : !s32i, #cir.int<7> : !s32i, #cir.int<6> : !s32i, #cir.int<5> : !s32i, #cir.int<4> : !s32i] : !cir.vector<8 x !cir.float>
+    __m256 result = __builtin_ia32_vpermilps256(vec, 0x1B);
+}
+
+// CHECK-LABEL: @_Z17test_vpermilpd256v
+void test_vpermilpd256() {
+    __m256d vec = {1.0, 2.0, 3.0, 4.0};
+    // vpermilpd256 with immediate 0x5 = 0101 = [1,0,1,0] for 4 elements
+    // CHECK: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x !cir.double>) [#cir.int<1> : !s32i, #cir.int<0> : !s32i, #cir.int<3> : !s32i, #cir.int<2> : !s32i] : !cir.vector<4 x !cir.double>
+    __m256d result = __builtin_ia32_vpermilpd256(vec, 0x5);
+}
+
+// CHECK-LABEL: @_Z17test_vpermilps512v
+void test_vpermilps512() {
+    __m512 vec = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f,
+                  9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f};
+    // vpermilps512 with immediate 0x4E = 01001110 = [2,3,0,1] for each 128-bit lane
+    // CHECK: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<16 x !cir.float>) [#cir.int<2> : !s32i, #cir.int<3> : !s32i, #cir.int<0> : !s32i, #cir.int<1> : !s32i, #cir.int<6> : !s32i, #cir.int<7> : !s32i, #cir.int<4> : !s32i, #cir.int<5> : !s32i, #cir.int<10> : !s32i, #cir.int<11> : !s32i, #cir.int<8> : !s32i, #cir.int<9> : !s32i, #cir.int<14> : !s32i, #cir.int<15> : !s32i, #cir.int<12> : !s32i, #cir.int<13> : !s32i] : !cir.vector<16 x !cir.float>
+    __m512 result = __builtin_ia32_vpermilps512(vec, 0x4E);
+}
+
+// CHECK-LABEL: @_Z17test_vpermilpd512v
+void test_vpermilpd512() {
+    __m512d vec = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0};
+    // vpermilpd512 with immediate 0x55 = 01010101 = [1,0,1,0,1,0,1,0] for 8 elements
+    // CHECK: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<8 x !cir.double>) [#cir.int<1> : !s32i, #cir.int<0> : !s32i, #cir.int<3> : !s32i, #cir.int<2> : !s32i, #cir.int<5> : !s32i, #cir.int<4> : !s32i, #cir.int<7> : !s32i, #cir.int<6> : !s32i] : !cir.vector<8 x !cir.double>
+    __m512d result = __builtin_ia32_vpermilpd512(vec, 0x55);
+}
+
+// Test different immediate values
+// CHECK-LABEL: @_Z24test_vpermilps_differentv
+void test_vpermilps_different() {
+    __m128 vec = {10.0f, 20.0f, 30.0f, 40.0f};
+    // Test different immediate value: 0x1B = 00011011 = [3,2,1,0] reversed
+    // CHECK: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x !cir.float>) [#cir.int<3> : !s32i, #cir.int<2> : !s32i, #cir.int<1> : !s32i, #cir.int<0> : !s32i] : !cir.vector<4 x !cir.float>
+    __m128 result = __builtin_ia32_vpermilps(vec, 0x1B);
+}
+
+// CHECK-LABEL: @_Z24test_vpermilpd_differentv
+void test_vpermilpd_different() {
+    __m128d vec = {100.0, 200.0};
+    // Test immediate 0x0 = 00 = [0,0] - duplicate first element
+    // CHECK: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<2 x !cir.double>) [#cir.int<0> : !s32i, #cir.int<0> : !s32i] : !cir.vector<2 x !cir.double>
+    __m128d result = __builtin_ia32_vpermilpd(vec, 0x0);
+}
diff --git a/clang/test/CIR/CodeGen/X86/sse2-builtins.c b/clang/test/CIR/CodeGen/X86/sse2-builtins.c
@@ -146,4 +146,4 @@ __m128d test_mm_shuffle_pd(__m128d A, __m128d B) {
   // OGCG-LABEL: test_mm_shuffle_pd
   // OGCG: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x i32> <i32 1, i32 2>
   return _mm_shuffle_pd(A, B, 1);
-}
+}