Finish upstreaming CIR codegen for sse and sse2 builtins

Thibault-Monnier · Thibault-Monnier · commit b6118d40c4a1 · 2025-11-12T20:50:08.000+01:00
diff --git a/clang/lib/CIR/CodeGen/CIRGenBuilder.h b/clang/lib/CIR/CodeGen/CIRGenBuilder.h
@@ -462,6 +462,36 @@ class CIRGenBuilderTy : public cir::CIRBaseBuilderTy {
                                          align, order);
   }
 
+  cir::VecShuffleOp
+  createVecShuffle(mlir::Location loc, mlir::Value vec1, mlir::Value vec2,
+                   llvm::ArrayRef<mlir::Attribute> maskAttrs) {
+    auto vecType = mlir::cast<cir::VectorType>(vec1.getType());
+    auto resultTy = cir::VectorType::get(getContext(), vecType.getElementType(),
+                                         maskAttrs.size());
+    return cir::VecShuffleOp::create(*this, loc, resultTy, vec1, vec2,
+                                     getArrayAttr(maskAttrs));
+  }
+
+  cir::VecShuffleOp createVecShuffle(mlir::Location loc, mlir::Value vec1,
+                                     mlir::Value vec2,
+                                     llvm::ArrayRef<int64_t> mask) {
+    llvm::SmallVector<mlir::Attribute, 4> maskAttrs;
+    for (int32_t idx : mask) {
+      maskAttrs.push_back(cir::IntAttr::get(getSInt32Ty(), idx));
+    }
+
+    return createVecShuffle(loc, vec1, vec2, maskAttrs);
+  }
+
+  cir::VecShuffleOp createVecShuffle(mlir::Location loc, mlir::Value vec1,
+                                     llvm::ArrayRef<int64_t> mask) {
+    /// Create a unary shuffle. The second vector operand of the IR instruction
+    /// is poison.
+    return createVecShuffle(
+        loc, vec1, getConstant(loc, getAttr<cir::PoisonAttr>(vec1.getType())),
+        mask);
+  }
+
   /// Create a cir.complex.real_ptr operation that derives a pointer to the real
   /// part of the complex value pointed to by the specified pointer value.
   mlir::Value createComplexRealPtr(mlir::Location loc, mlir::Value value) {
diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
@@ -134,9 +134,17 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID,
   case X86::BI__builtin_ia32_tzcnt_u16:
   case X86::BI__builtin_ia32_tzcnt_u32:
   case X86::BI__builtin_ia32_tzcnt_u64:
+    UNIMPLEMENTED_BUILTIN();
   case X86::BI__builtin_ia32_undef128:
   case X86::BI__builtin_ia32_undef256:
   case X86::BI__builtin_ia32_undef512:
+    // The x86 definition of "undef" is not the same as the LLVM definition
+    // (PR32176). We leave optimizing away an unnecessary zero constant to the
+    // IR optimizer and backend.
+    // TODO: If we had a "freeze" IR instruction to generate a fixed undef
+    //  value, we should use that here instead of a zero.
+    return builder.getNullValue(convertType(e->getType()),
+                                getLoc(e->getExprLoc()));
   case X86::BI__builtin_ia32_vec_ext_v4hi:
   case X86::BI__builtin_ia32_vec_ext_v16qi:
   case X86::BI__builtin_ia32_vec_ext_v8hi:
@@ -146,7 +154,24 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID,
   case X86::BI__builtin_ia32_vec_ext_v32qi:
   case X86::BI__builtin_ia32_vec_ext_v16hi:
   case X86::BI__builtin_ia32_vec_ext_v8si:
-  case X86::BI__builtin_ia32_vec_ext_v4di:
+  case X86::BI__builtin_ia32_vec_ext_v4di: {
+    unsigned NumElts = cast<cir::VectorType>(ops[0].getType()).getSize();
+
+    uint64_t index =
+        ops[1].getDefiningOp<cir::ConstantOp>().getIntValue().getZExtValue();
+
+    index &= NumElts - 1;
+
+    auto indexAttr = cir::IntAttr::get(
+        cir::IntType::get(&getMLIRContext(), 64, false), index);
+    auto indexVal =
+        cir::ConstantOp::create(builder, getLoc(e->getExprLoc()), indexAttr);
+
+    // These builtins exist so we can ensure the index is an ICE and in range.
+    // Otherwise we could just do this in the header file.
+    return cir::VecExtractOp::create(builder, getLoc(e->getExprLoc()), ops[0],
+                                     indexVal);
+  }
   case X86::BI__builtin_ia32_vec_set_v4hi:
   case X86::BI__builtin_ia32_vec_set_v16qi:
   case X86::BI__builtin_ia32_vec_set_v8hi:
@@ -499,12 +524,55 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID,
   case X86::BI__builtin_ia32_pblendw256:
   case X86::BI__builtin_ia32_pblendd128:
   case X86::BI__builtin_ia32_pblendd256:
+    UNIMPLEMENTED_BUILTIN();
   case X86::BI__builtin_ia32_pshuflw:
   case X86::BI__builtin_ia32_pshuflw256:
-  case X86::BI__builtin_ia32_pshuflw512:
+  case X86::BI__builtin_ia32_pshuflw512: {
+    unsigned imm =
+        ops[1].getDefiningOp<cir::ConstantOp>().getIntValue().getZExtValue();
+    auto Ty = cast<cir::VectorType>(ops[0].getType());
+    unsigned numElts = Ty.getSize();
+
+    // Splat the 8-bits of immediate 4 times to help the loop wrap around.
+    imm = (imm & 0xff) * 0x01010101;
+
+    int64_t indices[32];
+    for (unsigned l = 0; l != numElts; l += 8) {
+      for (unsigned i = 0; i != 4; ++i) {
+        indices[l + i] = l + (imm & 3);
+        imm >>= 2;
+      }
+      for (unsigned i = 4; i != 8; ++i)
+        indices[l + i] = l + i;
+    }
+
+    return builder.createVecShuffle(getLoc(e->getExprLoc()), ops[0],
+                                    ArrayRef(indices, numElts));
+  }
   case X86::BI__builtin_ia32_pshufhw:
   case X86::BI__builtin_ia32_pshufhw256:
-  case X86::BI__builtin_ia32_pshufhw512:
+  case X86::BI__builtin_ia32_pshufhw512: {
+    unsigned imm =
+        ops[1].getDefiningOp<cir::ConstantOp>().getIntValue().getZExtValue();
+    auto ty = cast<cir::VectorType>(ops[0].getType());
+    unsigned numElts = ty.getSize();
+
+    // Splat the 8-bits of immediate 4 times to help the loop wrap around.
+    imm = (imm & 0xff) * 0x01010101;
+
+    int64_t indices[32];
+    for (unsigned l = 0; l != numElts; l += 8) {
+      for (unsigned i = 0; i != 4; ++i)
+        indices[l + i] = l + i;
+      for (unsigned i = 4; i != 8; ++i) {
+        indices[l + i] = l + 4 + (imm & 3);
+        imm >>= 2;
+      }
+    }
+
+    return builder.createVecShuffle(getLoc(e->getExprLoc()), ops[0],
+                                    ArrayRef(indices, numElts));
+  }
   case X86::BI__builtin_ia32_pshufd:
   case X86::BI__builtin_ia32_pshufd256:
   case X86::BI__builtin_ia32_pshufd512:
@@ -514,12 +582,37 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID,
   case X86::BI__builtin_ia32_vpermilps256:
   case X86::BI__builtin_ia32_vpermilpd512:
   case X86::BI__builtin_ia32_vpermilps512:
+    UNIMPLEMENTED_BUILTIN();
   case X86::BI__builtin_ia32_shufpd:
   case X86::BI__builtin_ia32_shufpd256:
   case X86::BI__builtin_ia32_shufpd512:
   case X86::BI__builtin_ia32_shufps:
   case X86::BI__builtin_ia32_shufps256:
-  case X86::BI__builtin_ia32_shufps512:
+  case X86::BI__builtin_ia32_shufps512: {
+    unsigned imm =
+        ops[2].getDefiningOp<cir::ConstantOp>().getIntValue().getZExtValue();
+    auto ty = cast<cir::VectorType>(ops[0].getType());
+    unsigned numElts = ty.getSize();
+    unsigned numLanes = cgm.getDataLayout().getTypeSizeInBits(ty) / 128;
+    unsigned numLaneElts = numElts / numLanes;
+
+    // Splat the 8-bits of immediate 4 times to help the loop wrap around.
+    imm = (imm & 0xff) * 0x01010101;
+
+    int64_t indices[16];
+    for (unsigned l = 0; l != numElts; l += numLaneElts) {
+      for (unsigned i = 0; i != numLaneElts; ++i) {
+        unsigned index = imm % numLaneElts;
+        imm /= numLaneElts;
+        if (i >= (numLaneElts / 2))
+          index += numElts;
+        indices[l + i] = l + index;
+      }
+    }
+
+    return builder.createVecShuffle(getLoc(e->getExprLoc()), ops[0], ops[1],
+                                    ArrayRef(indices, numElts));
+  }
   case X86::BI__builtin_ia32_permdi256:
   case X86::BI__builtin_ia32_permdf256:
   case X86::BI__builtin_ia32_permdi512:
diff --git a/clang/test/CIR/CodeGen/X86/sse-builtins.c b/clang/test/CIR/CodeGen/X86/sse-builtins.c
@@ -8,6 +8,9 @@
 // RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +sse -fclangir -emit-llvm -o %t.ll -Wall -Werror
 // RUN: FileCheck --check-prefixes=LLVM --input-file=%t.ll %s
 
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +sse -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=OGCG
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +sse -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=OGCG
+
 // This test mimics clang/test/CodeGen/X86/sse-builtins.c, which eventually
 // CIR shall be able to support fully.
 
@@ -50,3 +53,15 @@ unsigned int test_mm_getcsr(void) {
   // LLVM: load i32
   return _mm_getcsr();
 }
+
+__m128 test_mm_shuffle_ps(__m128 A, __m128 B) {
+  // CIR-LABEL: _mm_shuffle_ps
+  // CIR: %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x !cir.float>) [#cir.int<0> : !s32i, #cir.int<0> : !s32i, #cir.int<4> : !s32i, #cir.int<4> : !s32i] : !cir.vector<4 x !cir.float>
+
+  // CHECK-LABEL: test_mm_shuffle_ps
+  // CHECK: shufflevector <4 x float> {{.*}}, <4 x float> {{.*}}, <4 x i32> <i32 0, i32 0, i32 4, i32 4>
+
+  // OGCG-LABEL: test_mm_shuffle_ps
+  // OGCG: shufflevector <4 x float> {{.*}}, <4 x float> {{.*}}, <4 x i32> <i32 0, i32 0, i32 4, i32 4>
+  return _mm_shuffle_ps(A, B, 0);
+}
diff --git a/clang/test/CIR/CodeGen/X86/sse2-builtins.c b/clang/test/CIR/CodeGen/X86/sse2-builtins.c
@@ -1,12 +1,18 @@
 // RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +sse2 -fclangir -emit-cir -o %t.cir -Wall -Werror
-// RUN: FileCheck --check-prefixes=CIR-CHECK --input-file=%t.cir %s
+// RUN: FileCheck --check-prefixes=CIR-CHECK,CIR-X64 --input-file=%t.cir %s
 // RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +sse2 -fno-signed-char -fclangir -emit-cir -o %t.cir -Wall -Werror
-// RUN: FileCheck --check-prefixes=CIR-CHECK --input-file=%t.cir %s
+// RUN: FileCheck --check-prefixes=CIR-CHECK,CIR-X64 --input-file=%t.cir %s
 
 // RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +sse2 -fclangir -emit-llvm -o %t.ll -Wall -Werror
-// RUN: FileCheck --check-prefixes=LLVM-CHECK --input-file=%t.ll %s
+// RUN: FileCheck --check-prefixes=LLVM-CHECK,LLVM-X64 --input-file=%t.ll %s
 // RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +sse2 -fno-signed-char -fclangir -emit-llvm -o %t.ll -Wall -Werror
-// RUN: FileCheck --check-prefixes=LLVM-CHECK --input-file=%t.ll %s
+// RUN: FileCheck --check-prefixes=LLVM-CHECK,LLVM-X64 --input-file=%t.ll %s
+
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +sse2 -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=OGCG
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +sse2 -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=OGCG
+
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +sse2 -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=OGCG
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +sse2 -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=OGCG
 
 // This test mimics clang/test/CodeGen/X86/sse2-builtins.c, which eventually
 // CIR shall be able to support fully.
@@ -21,6 +27,44 @@ void test_mm_clflush(void* A) {
   // LLVM-CHECK: call void @llvm.x86.sse2.clflush(ptr {{%.*}})
 }
 
+__m128d test_mm_undefined_pd(void) {
+  // CIR-X64-LABEL: _mm_undefined_pd
+  // CIR-X64: %{{.*}} = cir.const #cir.zero : !cir.vector<2 x !cir.double>
+  // CIR-X64: cir.return %{{.*}} : !cir.vector<2 x !cir.double>
+
+  // LLVM-X64-LABEL: test_mm_undefined_pd
+  // LLVM-X64: store <2 x double> zeroinitializer, ptr %[[A:.*]], align 16
+  // LLVM-X64: %{{.*}} = load <2 x double>, ptr %[[A]], align 16
+  // LLVM-X64: ret <2 x double> %{{.*}}
+  return _mm_undefined_pd();
+}
+
+__m128i test_mm_undefined_si128(void) {
+  // CIR-LABEL: _mm_undefined_si128
+  // CIR-CHECK: %[[A:.*]] = cir.const #cir.zero : !cir.vector<2 x !cir.double>
+  // CIR-CHECK: %{{.*}} = cir.cast bitcast %[[A]] : !cir.vector<2 x !cir.double> -> !cir.vector<2 x !s64i>
+  // CIR-CHECK: cir.return %{{.*}} : !cir.vector<2 x !s64i>
+
+  // LLVM-CHECK-LABEL: test_mm_undefined_si128
+  // LLVM-CHECK: store <2 x i64> zeroinitializer, ptr %[[A:.*]], align 16
+  // LLVM-CHECK: %{{.*}} = load <2 x i64>, ptr %[[A]], align 16
+  // LLVM-CHECK: ret <2 x i64> %{{.*}}
+  return _mm_undefined_si128();
+}
+
+// Lowering to pextrw requires optimization.
+int test_mm_extract_epi16(__m128i A) {
+
+  // CIR-CHECK-LABEL: test_mm_extract_epi16
+  // CIR-CHECK %{{.*}} = cir.vec.extract %{{.*}}[%{{.*}} : {{!u32i|!u64i}}] : !cir.vector<!s16i x 8>
+  // CIR-CHECK %{{.*}} = cir.cast integral %{{.*}} : !u16i -> !s32i
+
+  // LLVM-CHECK-LABEL: test_mm_extract_epi16
+  // LLVM-CHECK: extractelement <8 x i16> %{{.*}}, {{i32|i64}} 1
+  // LLVM-CHECK: zext i16 %{{.*}} to i32
+  return _mm_extract_epi16(A, 1);
+}
+
 void test_mm_lfence(void) {
   // CIR-CHECK-LABEL: test_mm_lfence
   // LLVM-CHECK-LABEL: test_mm_lfence
@@ -35,4 +79,40 @@ void test_mm_mfence(void) {
   _mm_mfence();
   // CIR-CHECK: {{%.*}} = cir.llvm.intrinsic "x86.sse2.mfence" : () -> !void
   // LLVM-CHECK: call void @llvm.x86.sse2.mfence()
-}
+}
+
+__m128i test_mm_shufflelo_epi16(__m128i A) {
+  // CIR-LABEL: _mm_shufflelo_epi16
+  // CIR: %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<!s16i x 8>) [#cir.int<0> : !s32i, #cir.int<0> : !s32i, #cir.int<0> : !s32i, #cir.int<0> : !s32i, #cir.int<4> : !s32i, #cir.int<5> : !s32i, #cir.int<6> : !s32i, #cir.int<7> : !s32i] : !cir.vector<!s16i x 8>
+
+  // LLVM-LABEL: test_mm_shufflelo_epi16
+  // LLVM: shufflevector <8 x i16> %{{.*}}, <8 x i16> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7>
+
+  // OGCG-LABEL: test_mm_shufflelo_epi16
+  // OGCG: shufflevector <8 x i16> %{{.*}}, <8 x i16> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7>
+  return _mm_shufflelo_epi16(A, 0);
+}
+
+__m128i test_mm_shufflehi_epi16(__m128i A) {
+  // CIR-LABEL: _mm_shufflehi_epi16
+  // CIR: %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<!s16i x 8>) [#cir.int<0> : !s32i, #cir.int<1> : !s32i, #cir.int<2> : !s32i, #cir.int<3> : !s32i, #cir.int<4> : !s32i, #cir.int<4> : !s32i, #cir.int<4> : !s32i, #cir.int<4> : !s32i] : !cir.vector<!s16i x 8>
+
+  // LLVM-LABEL: test_mm_shufflehi_epi16
+  // LLVM: shufflevector <8 x i16> %{{.*}}, <8 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4>
+
+  // OGCG-LABEL: test_mm_shufflehi_epi16
+  // OGCG: shufflevector <8 x i16> %{{.*}}, <8 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4>
+  return _mm_shufflehi_epi16(A, 0);
+}
+
+__m128d test_mm_shuffle_pd(__m128d A, __m128d B) {
+  // CIR-LABEL: test_mm_shuffle_pd
+  // CIR: %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<!cir.double x 2>) [#cir.int<1> : !s32i, #cir.int<2> : !s32i] : !cir.vector<!cir.double x 2>
+
+  // CHECK-LABEL: test_mm_shuffle_pd
+  // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x i32> <i32 1, i32 2>
+
+  // OGCG-LABEL: test_mm_shuffle_pd
+  // OGCG: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x i32> <i32 1, i32 2>
+  return _mm_shuffle_pd(A, B, 1);
+}