llvm · andykaylor · Dec 2, 2025 · Nov 19, 2025 · Nov 19, 2025 · Nov 19, 2025
@@ -211,6 +211,44 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID,
                  std::string("unimplemented X86 builtin call: ") +
                      getContext().BuiltinInfo.getName(builtinID));
     return {};
+
+  case X86::BI__builtin_ia32_kunpckdi:
+  case X86::BI__builtin_ia32_kunpcksi:
+  case X86::BI__builtin_ia32_kunpckhi: {
+    // Mask unpack: the result holds the first half of ops[1]'s lanes followed
+    // by the first half of ops[0]'s lanes, cast back to ops[0]'s integer type.
+
+    // The bit width of the first operand is used as the lane count.
+    unsigned numElems = cast<cir::IntType>(ops[0].getType()).getWidth();
+    unsigned halfElems = numElems / 2;
+
+    // Convert both operands to mask vectors.
+    mlir::Value lhs = getMaskVecValue(*this, expr, ops[0], numElems);
+    mlir::Value rhs = getMaskVecValue(*this, expr, ops[1], numElems);
+
+    mlir::Location loc = getLoc(expr->getExprLoc());
+
+    // Build the identity indices [0, numElems) once. The whole list drives the
+    // final concatenation; its first half drives the two extractions, so a
+    // second index list is not needed.
+    SmallVector<mlir::Attribute, 64> indices;
+    mlir::Type i32Ty = builder.getSInt32Ty();
+    for (auto i : llvm::seq<unsigned>(0, numElems))
+      indices.push_back(cir::IntAttr::get(i32Ty, i));
+    llvm::ArrayRef<mlir::Attribute> firstHalf =
+        llvm::ArrayRef<mlir::Attribute>(indices).take_front(halfElems);
+
+    // Extract the first half of each vector. This gives better codegen than
+    // doing it in a single shuffle.
+    lhs = builder.createVecShuffle(loc, lhs, lhs, firstHalf);
+    rhs = builder.createVecShuffle(loc, rhs, rhs, firstHalf);
+
+    // Concatenate the halves.
+    // NOTE: Operands are swapped to match the intrinsic definition. Both
+    // inputs now have halfElems lanes, so indices [0, halfElems) select from
+    // rhs and indices [halfElems, numElems) select from lhs.
+    mlir::Value res = builder.createVecShuffle(loc, rhs, lhs, indices);
+    return builder.createBitcast(res, ops[0].getType());
+  }
+
   case X86::BI_mm_setcsr:
   case X86::BI__builtin_ia32_ldmxcsr: {
     mlir::Location loc = getLoc(expr->getExprLoc());
@@ -775,9 +813,6 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID,
   case X86::BI__builtin_ia32_kmovw:
   case X86::BI__builtin_ia32_kmovd:
   case X86::BI__builtin_ia32_kmovq:
-  case X86::BI__builtin_ia32_kunpckdi:
-  case X86::BI__builtin_ia32_kunpcksi:
-  case X86::BI__builtin_ia32_kunpckhi:
   case X86::BI__builtin_ia32_sqrtsh_round_mask:
   case X86::BI__builtin_ia32_sqrtsd_round_mask:
   case X86::BI__builtin_ia32_sqrtss_round_mask:

diff --git a/clang/test/CIR/CodeGen/X86/avx512f-builtins.c b/clang/test/CIR/CodeGen/X86/avx512f-builtins.c
@@ -77,3 +77,27 @@ __m512i test_mm512_undefined_epi32(void) {
   // OGCG: ret <8 x i64> zeroinitializer
   return _mm512_undefined_epi32();
 }
+
+__mmask16 test_mm512_kunpackb(__mmask16 A, __mmask16 B) {
+  // CIR-LABEL: test_mm512_kunpackb
+  // CIR: cir.call @{{.*}}kunpackb{{.*}}
+  // Above, only a cir.call to a callee whose name contains "kunpackb" is required; the expanded shuffle sequence is checked below.
+  // LLVM-LABEL: test_mm512_kunpackb
+  // LLVM: [[A_VEC:%.*]] = bitcast i16 %{{.*}} to <16 x i1>
+  // LLVM: [[B_VEC:%.*]] = bitcast i16 %{{.*}} to <16 x i1>
+  // LLVM: [[A_HALF:%.*]] = shufflevector <16 x i1> [[A_VEC]], <16 x i1> [[A_VEC]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  // LLVM: [[B_HALF:%.*]] = shufflevector <16 x i1> [[B_VEC]], <16 x i1> [[B_VEC]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  // LLVM: [[RES:%.*]] = shufflevector <8 x i1> [[B_HALF]], <8 x i1> [[A_HALF]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  // LLVM: %{{.*}} = bitcast <16 x i1> [[RES]] to i16
+  // LLVM: ret i16 %{{.*}}
+  // The OGCG checks repeat the same sequence: lanes 0-7 of each <16 x i1> mask, B_HALF lanes first and A_HALF lanes second, then a bitcast back to i16.
+  // OGCG-LABEL: test_mm512_kunpackb
+  // OGCG: [[A_VEC:%.*]] = bitcast i16 %{{.*}} to <16 x i1>
+  // OGCG: [[B_VEC:%.*]] = bitcast i16 %{{.*}} to <16 x i1>
+  // OGCG: [[A_HALF:%.*]] = shufflevector <16 x i1> [[A_VEC]], <16 x i1> [[A_VEC]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  // OGCG: [[B_HALF:%.*]] = shufflevector <16 x i1> [[B_VEC]], <16 x i1> [[B_VEC]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  // OGCG: [[RES:%.*]] = shufflevector <8 x i1> [[B_HALF]], <8 x i1> [[A_HALF]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  // OGCG: %{{.*}} = bitcast <16 x i1> [[RES]] to i16
+  // OGCG: ret i16 %{{.*}}
+  return _mm512_kunpackb(A, B);
+}