Skip to content

Commit e71717c

Browse files
authored
[CIR][X86] Implement lowering for pmuldq / pmuludq builtins (#169853)
Part of [#167765](#167765). This patch adds CIR codegen support for the X86 pmuldq and pmuludq operations, covering the signed and unsigned variants across all supported vector widths. The builtins now lower to the expected CIR representation, matching the semantics of the corresponding LLVM intrinsics.
1 parent c8d3b0c commit e71717c

File tree

5 files changed

+222
-2
lines changed

5 files changed

+222
-2
lines changed

clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp

Lines changed: 50 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -269,6 +269,40 @@ static mlir::Value emitX86FunnelShift(CIRGenBuilderTy &builder,
269269
mlir::ValueRange{op0, op1, amt});
270270
}
271271

272+
// Emit the CIR lowering for the x86 pmuldq / pmuludq builtin family.
//
// Both operations multiply the low 32 bits of every 64-bit lane of the two
// operands, producing full 64-bit products. Rather than calling a target
// intrinsic, the semantics are expressed with generic vector ops:
//   - signed (pmuldq):    shl 32 followed by (arithmetic) shr 32
//     sign-extends the low half of each lane in place;
//   - unsigned (pmuludq): masking with 0xffffffff zero-extends it.
// The widened lanes are then combined with an ordinary vector multiply.
static mlir::Value emitX86Muldq(CIRGenBuilderTy &builder, mlir::Location loc,
                                bool isSigned,
                                SmallVectorImpl<mlir::Value> &ops,
                                unsigned opTypePrimitiveSizeInBits) {
  // View both inputs as vectors of s64 lanes (e.g. 128 bits -> 2 lanes).
  mlir::Type vecTy = cir::VectorType::get(builder.getSInt64Ty(),
                                          opTypePrimitiveSizeInBits / 64);
  mlir::Value a = builder.createBitcast(loc, ops[0], vecTy);
  mlir::Value b = builder.createBitcast(loc, ops[1], vecTy);
  if (isSigned) {
    // Splat the shift amount (32) across every lane.
    cir::ConstantOp thirtyTwo =
        builder.getConstant(loc, cir::IntAttr::get(builder.getSInt64Ty(), 32));
    cir::VecSplatOp shiftVecOp =
        cir::VecSplatOp::create(builder, loc, vecTy, thirtyTwo.getResult());
    mlir::Value shiftVec = shiftVecOp.getResult();
    // CIR chooses an arithmetic vs. logical right shift from the operand's
    // signedness; on an !s64i vector the left/right shift pair therefore
    // propagates the sign bit of the 32-bit input through the upper half of
    // each lane (an in-place sign extension).
    a = builder.createShift(loc, a, shiftVec, /*isShiftLeft=*/true);
    a = builder.createShift(loc, a, shiftVec, /*isShiftLeft=*/false);
    b = builder.createShift(loc, b, shiftVec, /*isShiftLeft=*/true);
    b = builder.createShift(loc, b, shiftVec, /*isShiftLeft=*/false);
  } else {
    // Zero-extend instead: keep only the low 32 bits of each lane.
    cir::ConstantOp lowMask = builder.getConstant(
        loc, cir::IntAttr::get(builder.getSInt64Ty(), 0xffffffff));
    cir::VecSplatOp maskVec =
        cir::VecSplatOp::create(builder, loc, vecTy, lowMask.getResult());
    a = builder.createAnd(loc, a, maskVec);
    b = builder.createAnd(loc, b, maskVec);
  }
  return builder.createMul(loc, a, b);
}
305+
272306
mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID,
273307
const CallExpr *expr) {
274308
if (builtinID == Builtin::BI__builtin_cpu_is) {
@@ -1212,12 +1246,26 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID,
12121246
case X86::BI__builtin_ia32_sqrtph512:
12131247
case X86::BI__builtin_ia32_sqrtps512:
12141248
case X86::BI__builtin_ia32_sqrtpd512:
1249+
cgm.errorNYI(expr->getSourceRange(),
1250+
std::string("unimplemented X86 builtin call: ") +
1251+
getContext().BuiltinInfo.getName(builtinID));
1252+
return {};
12151253
case X86::BI__builtin_ia32_pmuludq128:
12161254
case X86::BI__builtin_ia32_pmuludq256:
1217-
case X86::BI__builtin_ia32_pmuludq512:
1255+
case X86::BI__builtin_ia32_pmuludq512: {
1256+
unsigned opTypePrimitiveSizeInBits =
1257+
cgm.getDataLayout().getTypeSizeInBits(ops[0].getType());
1258+
return emitX86Muldq(builder, getLoc(expr->getExprLoc()), /*isSigned*/ false,
1259+
ops, opTypePrimitiveSizeInBits);
1260+
}
12181261
case X86::BI__builtin_ia32_pmuldq128:
12191262
case X86::BI__builtin_ia32_pmuldq256:
1220-
case X86::BI__builtin_ia32_pmuldq512:
1263+
case X86::BI__builtin_ia32_pmuldq512: {
1264+
unsigned opTypePrimitiveSizeInBits =
1265+
cgm.getDataLayout().getTypeSizeInBits(ops[0].getType());
1266+
return emitX86Muldq(builder, getLoc(expr->getExprLoc()), /*isSigned*/ true,
1267+
ops, opTypePrimitiveSizeInBits);
1268+
}
12211269
case X86::BI__builtin_ia32_pternlogd512_mask:
12221270
case X86::BI__builtin_ia32_pternlogq512_mask:
12231271
case X86::BI__builtin_ia32_pternlogd128_mask:

clang/test/CIR/CodeGenBuiltins/X86/avx2-builtins.c

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,3 +51,55 @@ __m256i test_mm256_shufflehi_epi16(__m256i a) {
5151
// OGCG: shufflevector <16 x i16> %{{.*}}, <16 x i16> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 6, i32 5, i32 8, i32 9, i32 10, i32 11, i32 15, i32 14, i32 14, i32 13>
5252
return _mm256_shufflehi_epi16(a, 107);
5353
}
54+
55+
__m256i test_mm256_mul_epu32(__m256i a, __m256i b) {
56+
// Checks the 256-bit pmuludq lowering: bitcast to <4 x i64>, mask each lane
// to its low 32 bits, then a plain 64-bit vector mul (no @llvm.x86 call),
// consistently across CIR, direct LLVM, and original CodeGen (OGCG).
// CIR-LABEL: _mm256_mul_epu32
57+
// CIR: [[BC_A:%.*]] = cir.cast bitcast %{{.*}} : {{.*}} -> !cir.vector<4 x !s64i>
58+
// CIR: [[BC_B:%.*]] = cir.cast bitcast %{{.*}} : {{.*}} -> !cir.vector<4 x !s64i>
59+
// CIR: [[MASK_SCALAR:%.*]] = cir.const #cir.int<4294967295> : !s64i
60+
// CIR: [[MASK_VEC:%.*]] = cir.vec.splat [[MASK_SCALAR]] : !s64i, !cir.vector<4 x !s64i>
61+
// CIR: [[AND_A:%.*]] = cir.binop(and, [[BC_A]], [[MASK_VEC]])
62+
// CIR: [[AND_B:%.*]] = cir.binop(and, [[BC_B]], [[MASK_VEC]])
63+
// CIR: [[MUL:%.*]] = cir.binop(mul, [[AND_A]], [[AND_B]])
64+
65+
// LLVM-LABEL: _mm256_mul_epu32
66+
// LLVM: and <4 x i64> %{{.*}}, splat (i64 4294967295)
67+
// LLVM: and <4 x i64> %{{.*}}, splat (i64 4294967295)
68+
// LLVM: mul <4 x i64> %{{.*}}, %{{.*}}
69+
70+
// OGCG-LABEL: _mm256_mul_epu32
71+
// OGCG: and <4 x i64> %{{.*}}, splat (i64 4294967295)
72+
// OGCG: and <4 x i64> %{{.*}}, splat (i64 4294967295)
73+
// OGCG: mul <4 x i64> %{{.*}}, %{{.*}}
74+
75+
return _mm256_mul_epu32(a, b);
76+
}
77+
78+
__m256i test_mm256_mul_epi32(__m256i a, __m256i b) {
79+
// Checks the 256-bit pmuldq lowering: bitcast to <4 x i64>, sign-extend the
// low 32 bits of each lane via a shl/ashr-by-32 pair, then a 64-bit vector
// mul; the same shape is expected from CIR, direct LLVM, and OGCG.
// CIR-LABEL: _mm256_mul_epi32
80+
// CIR: [[A64:%.*]] = cir.cast bitcast %{{.*}} : {{.*}} -> !cir.vector<4 x !s64i>
81+
// CIR: [[B64:%.*]] = cir.cast bitcast %{{.*}} : {{.*}} -> !cir.vector<4 x !s64i>
82+
// CIR: [[SC:%.*]] = cir.const #cir.int<32> : !s64i
83+
// CIR: [[SV:%.*]] = cir.vec.splat [[SC]] : !s64i, !cir.vector<4 x !s64i>
84+
// CIR: [[SHL_A:%.*]] = cir.shift(left, [[A64]] : !cir.vector<4 x !s64i>, [[SV]] : !cir.vector<4 x !s64i>)
85+
// CIR: [[ASHR_A:%.*]] = cir.shift(right, [[SHL_A]] : !cir.vector<4 x !s64i>, [[SV]] : !cir.vector<4 x !s64i>)
86+
// CIR: [[SHL_B:%.*]] = cir.shift(left, [[B64]] : !cir.vector<4 x !s64i>, [[SV]] : !cir.vector<4 x !s64i>)
87+
// CIR: [[ASHR_B:%.*]] = cir.shift(right, [[SHL_B]] : !cir.vector<4 x !s64i>, [[SV]] : !cir.vector<4 x !s64i>)
88+
// CIR: [[MUL:%.*]] = cir.binop(mul, [[ASHR_A]], [[ASHR_B]])
89+
90+
// LLVM-LABEL: _mm256_mul_epi32
91+
// LLVM: shl <4 x i64> %{{.*}}, splat (i64 32)
92+
// LLVM: ashr <4 x i64> %{{.*}}, splat (i64 32)
93+
// LLVM: shl <4 x i64> %{{.*}}, splat (i64 32)
94+
// LLVM: ashr <4 x i64> %{{.*}}, splat (i64 32)
95+
// LLVM: mul <4 x i64> %{{.*}}, %{{.*}}
96+
97+
// OGCG-LABEL: _mm256_mul_epi32
98+
// OGCG: shl <4 x i64> %{{.*}}, splat (i64 32)
99+
// OGCG: ashr <4 x i64> %{{.*}}, splat (i64 32)
100+
// OGCG: shl <4 x i64> %{{.*}}, splat (i64 32)
101+
// OGCG: ashr <4 x i64> %{{.*}}, splat (i64 32)
102+
// OGCG: mul <4 x i64> %{{.*}}, %{{.*}}
103+
104+
return _mm256_mul_epi32(a, b);
105+
}

clang/test/CIR/CodeGenBuiltins/X86/avx512f-builtins.c

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -695,3 +695,55 @@ void test_mm512_mask_i64scatter_epi32(void *__addr, __mmask8 __mask, __m512i __i
695695
// OGCG: @llvm.x86.avx512.mask.scatter.qpi.512
696696
return _mm512_mask_i64scatter_epi32(__addr, __mask, __index, __v1, 2);
697697
}
698+
699+
__m512i test_mm512_mul_epi32(__m512i __A, __m512i __B) {
700+
// Checks the 512-bit pmuldq lowering: bitcast <16 x s32> to <8 x i64>,
// sign-extend each lane's low half via shl/ashr-by-32, then a 64-bit
// vector mul; identical shape expected from CIR, direct LLVM, and OGCG.
// CIR-LABEL: _mm512_mul_epi32
701+
// CIR: [[A64:%.*]] = cir.cast bitcast %{{.*}} : !cir.vector<16 x !s32i> -> !cir.vector<8 x !s64i>
702+
// CIR: [[B64:%.*]] = cir.cast bitcast %{{.*}} : !cir.vector<16 x !s32i> -> !cir.vector<8 x !s64i>
703+
// CIR: [[SC:%.*]] = cir.const #cir.int<32> : !s64i
704+
// CIR: [[SV:%.*]] = cir.vec.splat [[SC]] : !s64i, !cir.vector<8 x !s64i>
705+
// CIR: [[SHL_A:%.*]] = cir.shift(left, [[A64]] : !cir.vector<8 x !s64i>, [[SV]] : !cir.vector<8 x !s64i>)
706+
// CIR: [[ASHR_A:%.*]] = cir.shift(right, [[SHL_A]] : !cir.vector<8 x !s64i>, [[SV]] : !cir.vector<8 x !s64i>)
707+
// CIR: [[SHL_B:%.*]] = cir.shift(left, [[B64]] : !cir.vector<8 x !s64i>, [[SV]] : !cir.vector<8 x !s64i>)
708+
// CIR: [[ASHR_B:%.*]] = cir.shift(right, [[SHL_B]] : !cir.vector<8 x !s64i>, [[SV]] : !cir.vector<8 x !s64i>)
709+
// CIR: [[MUL:%.*]] = cir.binop(mul, [[ASHR_A]], [[ASHR_B]])
710+
711+
// LLVM-LABEL: _mm512_mul_epi32
712+
// LLVM: shl <8 x i64> %{{.*}}, splat (i64 32)
713+
// LLVM: ashr <8 x i64> %{{.*}}, splat (i64 32)
714+
// LLVM: shl <8 x i64> %{{.*}}, splat (i64 32)
715+
// LLVM: ashr <8 x i64> %{{.*}}, splat (i64 32)
716+
// LLVM: mul <8 x i64> %{{.*}}, %{{.*}}
717+
718+
// OGCG-LABEL: _mm512_mul_epi32
719+
// OGCG: shl <8 x i64> %{{.*}}, splat (i64 32)
720+
// OGCG: ashr <8 x i64> %{{.*}}, splat (i64 32)
721+
// OGCG: shl <8 x i64> %{{.*}}, splat (i64 32)
722+
// OGCG: ashr <8 x i64> %{{.*}}, splat (i64 32)
723+
// OGCG: mul <8 x i64> %{{.*}}, %{{.*}}
724+
725+
return _mm512_mul_epi32(__A, __B);
726+
}
727+
728+
__m512i test_mm512_mul_epu32(__m512i __A, __m512i __B) {
729+
// Checks the 512-bit pmuludq lowering: bitcast to <8 x i64>, mask each lane
// to its low 32 bits, then a plain 64-bit vector mul, consistent across
// CIR, direct LLVM, and OGCG.
// CIR-LABEL: _mm512_mul_epu32
730+
// CIR: [[BC_A:%.*]] = cir.cast bitcast %{{.*}} : {{.*}} -> !cir.vector<8 x !s64i>
731+
// CIR: [[BC_B:%.*]] = cir.cast bitcast %{{.*}} : {{.*}} -> !cir.vector<8 x !s64i>
732+
// CIR: [[MASK_SCALAR:%.*]] = cir.const #cir.int<4294967295> : !s64i
733+
// CIR: [[MASK_VEC:%.*]] = cir.vec.splat [[MASK_SCALAR]] : !s64i, !cir.vector<8 x !s64i>
734+
// CIR: [[AND_A:%.*]] = cir.binop(and, [[BC_A]], [[MASK_VEC]])
735+
// CIR: [[AND_B:%.*]] = cir.binop(and, [[BC_B]], [[MASK_VEC]])
736+
// CIR: [[MUL:%.*]] = cir.binop(mul, [[AND_A]], [[AND_B]])
737+
738+
// LLVM-LABEL: _mm512_mul_epu32
739+
// LLVM: and <8 x i64> %{{.*}}, splat (i64 4294967295)
740+
// LLVM: and <8 x i64> %{{.*}}, splat (i64 4294967295)
741+
// LLVM: mul <8 x i64> %{{.*}}, %{{.*}}
742+
743+
// OGCG-LABEL: _mm512_mul_epu32
744+
// OGCG: and <8 x i64> %{{.*}}, splat (i64 4294967295)
745+
// OGCG: and <8 x i64> %{{.*}}, splat (i64 4294967295)
746+
// OGCG: mul <8 x i64> %{{.*}}, %{{.*}}
747+
748+
return _mm512_mul_epu32(__A, __B);
749+
}

clang/test/CIR/CodeGenBuiltins/X86/sse2-builtins.c

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -159,3 +159,26 @@ __m128i test_mm_shuffle_epi32(__m128i A) {
159159
// OGCG: shufflevector <4 x i32> %{{.*}}, <4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
160160
return _mm_shuffle_epi32(A, 0x4E);
161161
}
162+
163+
__m128i test_mm_mul_epu32(__m128i A, __m128i B) {
164+
// Checks the 128-bit pmuludq lowering: bitcast to <2 x i64>, mask each lane
// to its low 32 bits, then a plain 64-bit vector mul, consistent across
// CIR, direct LLVM, and OGCG.
// CIR-LABEL: _mm_mul_epu32
165+
// CIR: [[BC_A:%.*]] = cir.cast bitcast %{{.*}} : {{.*}} -> !cir.vector<2 x !s64i>
166+
// CIR: [[BC_B:%.*]] = cir.cast bitcast %{{.*}} : {{.*}} -> !cir.vector<2 x !s64i>
167+
// CIR: [[MASK_SCALAR:%.*]] = cir.const #cir.int<4294967295> : !s64i
168+
// CIR: [[MASK_VEC:%.*]] = cir.vec.splat [[MASK_SCALAR]] : !s64i, !cir.vector<2 x !s64i>
169+
// CIR: [[AND_A:%.*]] = cir.binop(and, [[BC_A]], [[MASK_VEC]])
170+
// CIR: [[AND_B:%.*]] = cir.binop(and, [[BC_B]], [[MASK_VEC]])
171+
// CIR: [[MUL:%.*]] = cir.binop(mul, [[AND_A]], [[AND_B]])
172+
173+
// LLVM-LABEL: _mm_mul_epu32
174+
// LLVM: and <2 x i64> %{{.*}}, splat (i64 4294967295)
175+
// LLVM: and <2 x i64> %{{.*}}, splat (i64 4294967295)
176+
// LLVM: mul <2 x i64> %{{.*}}, %{{.*}}
177+
178+
// OGCG-LABEL: _mm_mul_epu32
179+
// OGCG: and <2 x i64> %{{.*}}, splat (i64 4294967295)
180+
// OGCG: and <2 x i64> %{{.*}}, splat (i64 4294967295)
181+
// OGCG: mul <2 x i64> %{{.*}}, %{{.*}}
182+
183+
return _mm_mul_epu32(A, B);
184+
}
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +sse4.1 -fclangir -emit-cir -o %t.cir -Wall -Werror -Wsign-conversion
2+
// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
3+
// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +sse4.1 -fclangir -emit-llvm -o %t.ll -Wall -Werror -Wsign-conversion
4+
// RUN: FileCheck --check-prefixes=LLVM --input-file=%t.ll %s
5+
6+
// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +sse4.1 -fclangir -emit-cir -o %t.cir -Wall -Werror -Wsign-conversion
7+
// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
8+
// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +sse4.1 -fclangir -emit-llvm -o %t.ll -Wall -Werror -Wsign-conversion
9+
// RUN: FileCheck --check-prefixes=LLVM --input-file=%t.ll %s
10+
11+
// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +sse4.1 -emit-llvm -o - -Wall -Werror -Wsign-conversion | FileCheck %s --check-prefixes=OGCG
12+
// RUN: %clang_cc1 -x c -flax-vector-conversions=none -fms-extensions -fms-compatibility -ffreestanding %s -triple=x86_64-windows-msvc -target-feature +sse4.1 -emit-llvm -o - -Wall -Werror -Wsign-conversion | FileCheck %s --check-prefixes=OGCG
13+
// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +sse4.1 -emit-llvm -o - -Wall -Werror -Wsign-conversion | FileCheck %s --check-prefixes=OGCG
14+
// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -fms-extensions -fms-compatibility -ffreestanding %s -triple=x86_64-windows-msvc -target-feature +sse4.1 -emit-llvm -o - -Wall -Werror -Wsign-conversion | FileCheck %s --check-prefixes=OGCG
15+
16+
#include <immintrin.h>
17+
18+
__m128i test_mm_mul_epi32(__m128i x, __m128i y) {
19+
// Checks the 128-bit (SSE4.1) pmuldq lowering: bitcast to <2 x i64>,
// sign-extend each lane's low half via shl/ashr-by-32, then a 64-bit
// vector mul; identical shape expected from CIR, direct LLVM, and OGCG.
// CIR-LABEL: _mm_mul_epi32
20+
// CIR: [[A64:%.*]] = cir.cast bitcast %{{.*}} : {{.*}} -> !cir.vector<2 x !s64i>
21+
// CIR: [[B64:%.*]] = cir.cast bitcast %{{.*}} : {{.*}} -> !cir.vector<2 x !s64i>
22+
// CIR: [[SC:%.*]] = cir.const #cir.int<32> : !s64i
23+
// CIR: [[SV:%.*]] = cir.vec.splat [[SC]] : !s64i, !cir.vector<2 x !s64i>
24+
// CIR: [[SHL_A:%.*]] = cir.shift(left, [[A64]] : !cir.vector<2 x !s64i>, [[SV]] : !cir.vector<2 x !s64i>)
25+
// CIR: [[ASHR_A:%.*]] = cir.shift(right, [[SHL_A]] : !cir.vector<2 x !s64i>, [[SV]] : !cir.vector<2 x !s64i>)
26+
// CIR: [[SHL_B:%.*]] = cir.shift(left, [[B64]] : !cir.vector<2 x !s64i>, [[SV]] : !cir.vector<2 x !s64i>)
27+
// CIR: [[ASHR_B:%.*]] = cir.shift(right, [[SHL_B]] : !cir.vector<2 x !s64i>, [[SV]] : !cir.vector<2 x !s64i>)
28+
// CIR: [[MUL:%.*]] = cir.binop(mul, [[ASHR_A]], [[ASHR_B]])
29+
30+
// LLVM-LABEL: _mm_mul_epi32
31+
// LLVM: shl <2 x i64> %{{.*}}, splat (i64 32)
32+
// LLVM: ashr <2 x i64> %{{.*}}, splat (i64 32)
33+
// LLVM: shl <2 x i64> %{{.*}}, splat (i64 32)
34+
// LLVM: ashr <2 x i64> %{{.*}}, splat (i64 32)
35+
// LLVM: mul <2 x i64> %{{.*}}, %{{.*}}
36+
37+
// OGCG-LABEL: _mm_mul_epi32
38+
// OGCG: shl <2 x i64> %{{.*}}, splat (i64 32)
39+
// OGCG: ashr <2 x i64> %{{.*}}, splat (i64 32)
40+
// OGCG: shl <2 x i64> %{{.*}}, splat (i64 32)
41+
// OGCG: ashr <2 x i64> %{{.*}}, splat (i64 32)
42+
// OGCG: mul <2 x i64> %{{.*}}, %{{.*}}
43+
44+
return _mm_mul_epi32(x, y);
45+
}

0 commit comments

Comments
 (0)