Skip to content

Commit a24ba5b

Browse files
committed
[CIR][X86] Implement lowering for pmuldq / pmuludq builtins
This patch adds CIR codegen support for X86 pmuldq and pmuludq operations, covering the signed and unsigned variants across all supported vector widths. The builtins now lower to the expected CIR representation matching the semantics of the corresponding LLVM intrinsics.
1 parent 965c3d7 commit a24ba5b

File tree

5 files changed

+236
-0
lines changed

5 files changed

+236
-0
lines changed

clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,47 @@ static mlir::Value getMaskVecValue(CIRGenBuilderTy &builder, mlir::Location loc,
8585
return maskVec;
8686
}
8787

88+
static mlir::Value emitX86Muldq(CIRGenFunction &cgf, const CallExpr *expr,
89+
bool isSigned,
90+
SmallVectorImpl<mlir::Value> &ops) {
91+
CIRGenBuilderTy &builder = cgf.getBuilder();
92+
mlir::Location loc = cgf.getLoc(expr->getExprLoc());
93+
mlir::Type ty = ops[0].getType();
94+
unsigned tyPrimitiveSizeInBits =
95+
cgf.cgm.getDataLayout().getTypeSizeInBits(ty);
96+
mlir::Value lhs, rhs;
97+
// in cir, if a shiftOperation is shift right,it will be translated into Ashr
98+
// or lShr automatically in match and rewrite stage according to its operand's
99+
// type
100+
if (isSigned) {
101+
ty =
102+
cir::VectorType::get(builder.getSInt64Ty(), tyPrimitiveSizeInBits / 64);
103+
cir::ConstantOp shiftAmt =
104+
builder.getConstant(loc, cir::IntAttr::get(builder.getSInt64Ty(), 32));
105+
cir::VecSplatOp shiftSplatVecOp =
106+
cir::VecSplatOp::create(builder, loc, ty, shiftAmt.getResult());
107+
mlir::Value shiftSplatValue = shiftSplatVecOp.getResult();
108+
lhs = builder.createBitcast(loc, ops[0], ty);
109+
rhs = builder.createBitcast(loc, ops[1], ty);
110+
lhs = builder.createShift(loc, lhs, shiftSplatValue, true);
111+
lhs = builder.createShift(loc, lhs, shiftSplatValue, false);
112+
rhs = builder.createShift(loc, rhs, shiftSplatValue, true);
113+
rhs = builder.createShift(loc, rhs, shiftSplatValue, false);
114+
} else {
115+
ty =
116+
cir::VectorType::get(builder.getSInt64Ty(), tyPrimitiveSizeInBits / 64);
117+
cir::ConstantOp maskScalar = builder.getConstant(
118+
loc, cir::IntAttr::get(builder.getSInt64Ty(), 0xffffffff));
119+
cir::VecSplatOp mask =
120+
cir::VecSplatOp::create(builder, loc, ty, maskScalar.getResult());
121+
lhs = builder.createBitcast(loc, ops[0], ty);
122+
rhs = builder.createBitcast(loc, ops[1], ty);
123+
lhs = builder.createAnd(loc, lhs, mask);
124+
rhs = builder.createAnd(loc, rhs, mask);
125+
}
126+
return builder.createMul(loc, lhs, rhs);
127+
}
128+
88129
mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID,
89130
const CallExpr *expr) {
90131
if (builtinID == Builtin::BI__builtin_cpu_is) {
@@ -784,12 +825,18 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID,
784825
case X86::BI__builtin_ia32_sqrtph512:
785826
case X86::BI__builtin_ia32_sqrtps512:
786827
case X86::BI__builtin_ia32_sqrtpd512:
828+
cgm.errorNYI(expr->getSourceRange(),
829+
std::string("unimplemented X86 builtin call: ") +
830+
getContext().BuiltinInfo.getName(builtinID));
831+
return {};
787832
case X86::BI__builtin_ia32_pmuludq128:
788833
case X86::BI__builtin_ia32_pmuludq256:
789834
case X86::BI__builtin_ia32_pmuludq512:
835+
return emitX86Muldq(*this, expr, /*IsSigned*/ false, ops);
790836
case X86::BI__builtin_ia32_pmuldq128:
791837
case X86::BI__builtin_ia32_pmuldq256:
792838
case X86::BI__builtin_ia32_pmuldq512:
839+
return emitX86Muldq(*this, expr, /*IsSigned*/ true, ops);
793840
case X86::BI__builtin_ia32_pternlogd512_mask:
794841
case X86::BI__builtin_ia32_pternlogq512_mask:
795842
case X86::BI__builtin_ia32_pternlogd128_mask:
Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx2 -fclangir -emit-cir -o %t.cir -Wall -Werror -Wsign-conversion
2+
// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
3+
// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx2 -fclangir -emit-llvm -o %t.ll -Wall -Werror -Wsign-conversion
4+
// RUN: FileCheck --check-prefixes=LLVM --input-file=%t.ll %s
5+
6+
// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx2 -fclangir -emit-cir -o %t.cir -Wall -Werror -Wsign-conversion
7+
// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
8+
// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx2 -fclangir -emit-llvm -o %t.ll -Wall -Werror -Wsign-conversion
9+
// RUN: FileCheck --check-prefixes=LLVM --input-file=%t.ll %s
10+
11+
// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx2 -emit-llvm -o - -Wall -Werror -Wsign-conversion | FileCheck %s --check-prefixes=OGCG
12+
// RUN: %clang_cc1 -x c -flax-vector-conversions=none -fms-extensions -fms-compatibility -ffreestanding %s -triple=x86_64-windows-msvc -target-feature +avx2 -emit-llvm -o - -Wall -Werror -Wsign-conversion | FileCheck %s --check-prefixes=OGCG
13+
// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx2 -emit-llvm -o - -Wall -Werror -Wsign-conversion | FileCheck %s --check-prefixes=OGCG
14+
// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -fms-extensions -fms-compatibility -ffreestanding %s -triple=x86_64-windows-msvc -target-feature +avx2 -emit-llvm -o - -Wall -Werror -Wsign-conversion | FileCheck %s --check-prefixes=OGCG
15+
16+
#include <immintrin.h>
17+
18+
__m256i test_mm256_mul_epu32(__m256i a, __m256i b) {
19+
// CIR-LABEL: _mm256_mul_epu32
20+
// CIR: [[MASK_SCALAR:%.*]] = cir.const #cir.int<4294967295> : !s64i
21+
// CIR: [[MASK_VEC:%.*]] = cir.vec.splat [[MASK_SCALAR]] : !s64i, !cir.vector<4 x !s64i>
22+
// CIR: [[BC_A:%.*]] = cir.cast bitcast %{{.*}} : {{.*}} -> !cir.vector<4 x !s64i>
23+
// CIR: [[BC_B:%.*]] = cir.cast bitcast %{{.*}} : {{.*}} -> !cir.vector<4 x !s64i>
24+
// CIR: [[AND_A:%.*]] = cir.binop(and, [[BC_A]], [[MASK_VEC]])
25+
// CIR: [[AND_B:%.*]] = cir.binop(and, [[BC_B]], [[MASK_VEC]])
26+
// CIR: [[MUL:%.*]] = cir.binop(mul, [[AND_A]], [[AND_B]])
27+
28+
// LLVM-LABEL: _mm256_mul_epu32
29+
// LLVM: and <4 x i64> %{{.*}}, splat (i64 4294967295)
30+
// LLVM: and <4 x i64> %{{.*}}, splat (i64 4294967295)
31+
// LLVM: mul <4 x i64> %{{.*}}, %{{.*}}
32+
33+
// OGCG-LABEL: _mm256_mul_epu32
34+
// OGCG: and <4 x i64> %{{.*}}, splat (i64 4294967295)
35+
// OGCG: and <4 x i64> %{{.*}}, splat (i64 4294967295)
36+
// OGCG: mul <4 x i64> %{{.*}}, %{{.*}}
37+
38+
return _mm256_mul_epu32(a, b);
39+
}
40+
41+
__m256i test_mm256_mul_epi32(__m256i a, __m256i b) {
42+
// CIR-LABEL: _mm256_mul_epi32
43+
// CIR: [[SC:%.*]] = cir.const #cir.int<32> : !s64i
44+
// CIR: [[SV:%.*]] = cir.vec.splat [[SC]] : !s64i, !cir.vector<4 x !s64i>
45+
// CIR: [[A64:%.*]] = cir.cast bitcast %{{.*}} : {{.*}} -> !cir.vector<4 x !s64i>
46+
// CIR: [[B64:%.*]] = cir.cast bitcast %{{.*}} : {{.*}} -> !cir.vector<4 x !s64i>
47+
// CIR: [[SHL_A:%.*]] = cir.shift(left, [[A64]] : !cir.vector<4 x !s64i>, [[SV]] : !cir.vector<4 x !s64i>)
48+
// CIR: [[ASHR_A:%.*]] = cir.shift(right, [[SHL_A]] : !cir.vector<4 x !s64i>, [[SV]] : !cir.vector<4 x !s64i>)
49+
// CIR: [[SHL_B:%.*]] = cir.shift(left, [[B64]] : !cir.vector<4 x !s64i>, [[SV]] : !cir.vector<4 x !s64i>)
50+
// CIR: [[ASHR_B:%.*]] = cir.shift(right, [[SHL_B]] : !cir.vector<4 x !s64i>, [[SV]] : !cir.vector<4 x !s64i>)
51+
// CIR: [[MUL:%.*]] = cir.binop(mul, [[ASHR_A]], [[ASHR_B]])
52+
53+
// LLVM-LABEL: _mm256_mul_epi32
54+
// LLVM: shl <4 x i64> %{{.*}}, splat (i64 32)
55+
// LLVM: ashr <4 x i64> %{{.*}}, splat (i64 32)
56+
// LLVM: shl <4 x i64> %{{.*}}, splat (i64 32)
57+
// LLVM: ashr <4 x i64> %{{.*}}, splat (i64 32)
58+
// LLVM: mul <4 x i64> %{{.*}}, %{{.*}}
59+
60+
// OGCG-LABEL: _mm256_mul_epi32
61+
// OGCG: shl <4 x i64> %{{.*}}, splat (i64 32)
62+
// OGCG: ashr <4 x i64> %{{.*}}, splat (i64 32)
63+
// OGCG: shl <4 x i64> %{{.*}}, splat (i64 32)
64+
// OGCG: ashr <4 x i64> %{{.*}}, splat (i64 32)
65+
// OGCG: mul <4 x i64> %{{.*}}, %{{.*}}
66+
67+
return _mm256_mul_epi32(a, b);
68+
}

clang/test/CIR/CodeGenBuiltins/X86/avx512f-builtins.c

100644100755
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,3 +77,56 @@ __m512i test_mm512_undefined_epi32(void) {
7777
// OGCG: ret <8 x i64> zeroinitializer
7878
return _mm512_undefined_epi32();
7979
}
80+
81+
__m512i test_mm512_mul_epi32(__m512i __A, __m512i __B) {
82+
// CIR-LABEL: _mm512_mul_epi32
83+
// CIR: [[SC:%.*]] = cir.const #cir.int<32> : !s64i
84+
// CIR: [[SV:%.*]] = cir.vec.splat [[SC]] : !s64i, !cir.vector<8 x !s64i>
85+
// CIR: [[A64:%.*]] = cir.cast bitcast %{{.*}} : !cir.vector<16 x !s32i> -> !cir.vector<8 x !s64i>
86+
// CIR: [[B64:%.*]] = cir.cast bitcast %{{.*}} : !cir.vector<16 x !s32i> -> !cir.vector<8 x !s64i>
87+
// CIR: [[SHL_A:%.*]] = cir.shift(left, [[A64]] : !cir.vector<8 x !s64i>, [[SV]] : !cir.vector<8 x !s64i>)
88+
// CIR: [[ASHR_A:%.*]] = cir.shift(right, [[SHL_A]] : !cir.vector<8 x !s64i>, [[SV]] : !cir.vector<8 x !s64i>)
89+
// CIR: [[SHL_B:%.*]] = cir.shift(left, [[B64]] : !cir.vector<8 x !s64i>, [[SV]] : !cir.vector<8 x !s64i>)
90+
// CIR: [[ASHR_B:%.*]] = cir.shift(right, [[SHL_B]] : !cir.vector<8 x !s64i>, [[SV]] : !cir.vector<8 x !s64i>)
91+
// CIR: [[MUL:%.*]] = cir.binop(mul, [[ASHR_A]], [[ASHR_B]])
92+
93+
// LLVM-LABEL: _mm512_mul_epi32
94+
// LLVM: shl <8 x i64> %{{.*}}, splat (i64 32)
95+
// LLVM: ashr <8 x i64> %{{.*}}, splat (i64 32)
96+
// LLVM: shl <8 x i64> %{{.*}}, splat (i64 32)
97+
// LLVM: ashr <8 x i64> %{{.*}}, splat (i64 32)
98+
// LLVM: mul <8 x i64> %{{.*}}, %{{.*}}
99+
100+
// OGCG-LABEL: _mm512_mul_epi32
101+
// OGCG: shl <8 x i64> %{{.*}}, splat (i64 32)
102+
// OGCG: ashr <8 x i64> %{{.*}}, splat (i64 32)
103+
// OGCG: shl <8 x i64> %{{.*}}, splat (i64 32)
104+
// OGCG: ashr <8 x i64> %{{.*}}, splat (i64 32)
105+
// OGCG: mul <8 x i64> %{{.*}}, %{{.*}}
106+
107+
return _mm512_mul_epi32(__A, __B);
108+
}
109+
110+
111+
__m512i test_mm512_mul_epu32(__m512i __A, __m512i __B) {
112+
// CIR-LABEL: _mm512_mul_epu32
113+
// CIR: [[MASK_SCALAR:%.*]] = cir.const #cir.int<4294967295> : !s64i
114+
// CIR: [[MASK_VEC:%.*]] = cir.vec.splat [[MASK_SCALAR]] : !s64i, !cir.vector<8 x !s64i>
115+
// CIR: [[BC_A:%.*]] = cir.cast bitcast %{{.*}} : {{.*}} -> !cir.vector<8 x !s64i>
116+
// CIR: [[BC_B:%.*]] = cir.cast bitcast %{{.*}} : {{.*}} -> !cir.vector<8 x !s64i>
117+
// CIR: [[AND_A:%.*]] = cir.binop(and, [[BC_A]], [[MASK_VEC]])
118+
// CIR: [[AND_B:%.*]] = cir.binop(and, [[BC_B]], [[MASK_VEC]])
119+
// CIR: [[MUL:%.*]] = cir.binop(mul, [[AND_A]], [[AND_B]])
120+
121+
// LLVM-LABEL: _mm512_mul_epu32
122+
// LLVM: and <8 x i64> %{{.*}}, splat (i64 4294967295)
123+
// LLVM: and <8 x i64> %{{.*}}, splat (i64 4294967295)
124+
// LLVM: mul <8 x i64> %{{.*}}, %{{.*}}
125+
126+
// OGCG-LABEL: _mm512_mul_epu32
127+
// OGCG: and <8 x i64> %{{.*}}, splat (i64 4294967295)
128+
// OGCG: and <8 x i64> %{{.*}}, splat (i64 4294967295)
129+
// OGCG: mul <8 x i64> %{{.*}}, %{{.*}}
130+
131+
return _mm512_mul_epu32(__A, __B);
132+
}

clang/test/CIR/CodeGenBuiltins/X86/sse2-builtins.c

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -108,3 +108,26 @@ void test_mm_pause(void) {
108108
// LLVM: call void @llvm.x86.sse2.pause()
109109
// OGCG: call void @llvm.x86.sse2.pause()
110110
}
111+
112+
__m128i test_mm_mul_epu32(__m128i A, __m128i B) {
113+
// CIR-LABEL: _mm_mul_epu32
114+
// CIR: [[MASK_SCALAR:%.*]] = cir.const #cir.int<4294967295> : !s64i
115+
// CIR: [[MASK_VEC:%.*]] = cir.vec.splat [[MASK_SCALAR]] : !s64i, !cir.vector<2 x !s64i>
116+
// CIR: [[BC_A:%.*]] = cir.cast bitcast %{{.*}} : {{.*}} -> !cir.vector<2 x !s64i>
117+
// CIR: [[BC_B:%.*]] = cir.cast bitcast %{{.*}} : {{.*}} -> !cir.vector<2 x !s64i>
118+
// CIR: [[AND_A:%.*]] = cir.binop(and, [[BC_A]], [[MASK_VEC]])
119+
// CIR: [[AND_B:%.*]] = cir.binop(and, [[BC_B]], [[MASK_VEC]])
120+
// CIR: [[MUL:%.*]] = cir.binop(mul, [[AND_A]], [[AND_B]])
121+
122+
// LLVM-LABEL: _mm_mul_epu32
123+
// LLVM: and <2 x i64> %{{.*}}, splat (i64 4294967295)
124+
// LLVM: and <2 x i64> %{{.*}}, splat (i64 4294967295)
125+
// LLVM: mul <2 x i64> %{{.*}}, %{{.*}}
126+
127+
// OGCG-LABEL: _mm_mul_epu32
128+
// OGCG: and <2 x i64> %{{.*}}, splat (i64 4294967295)
129+
// OGCG: and <2 x i64> %{{.*}}, splat (i64 4294967295)
130+
// OGCG: mul <2 x i64> %{{.*}}, %{{.*}}
131+
132+
return _mm_mul_epu32(A, B);
133+
}
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +sse4.1 -fclangir -emit-cir -o %t.cir -Wall -Werror -Wsign-conversion
2+
// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
3+
// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +sse4.1 -fclangir -emit-llvm -o %t.ll -Wall -Werror -Wsign-conversion
4+
// RUN: FileCheck --check-prefixes=LLVM --input-file=%t.ll %s
5+
6+
// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +sse4.1 -fclangir -emit-cir -o %t.cir -Wall -Werror -Wsign-conversion
7+
// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
8+
// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +sse4.1 -fclangir -emit-llvm -o %t.ll -Wall -Werror -Wsign-conversion
9+
// RUN: FileCheck --check-prefixes=LLVM --input-file=%t.ll %s
10+
11+
// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +sse4.1 -emit-llvm -o - -Wall -Werror -Wsign-conversion | FileCheck %s --check-prefixes=OGCG
12+
// RUN: %clang_cc1 -x c -flax-vector-conversions=none -fms-extensions -fms-compatibility -ffreestanding %s -triple=x86_64-windows-msvc -target-feature +sse4.1 -emit-llvm -o - -Wall -Werror -Wsign-conversion | FileCheck %s --check-prefixes=OGCG
13+
// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +sse4.1 -emit-llvm -o - -Wall -Werror -Wsign-conversion | FileCheck %s --check-prefixes=OGCG
14+
// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -fms-extensions -fms-compatibility -ffreestanding %s -triple=x86_64-windows-msvc -target-feature +sse4.1 -emit-llvm -o - -Wall -Werror -Wsign-conversion | FileCheck %s --check-prefixes=OGCG
15+
16+
#include <immintrin.h>
17+
18+
__m128i test_mm_mul_epi32(__m128i x, __m128i y) {
19+
// CIR-LABEL: _mm_mul_epi32
20+
// CIR: [[SC:%.*]] = cir.const #cir.int<32> : !s64i
21+
// CIR: [[SV:%.*]] = cir.vec.splat [[SC]] : !s64i, !cir.vector<2 x !s64i>
22+
// CIR: [[A64:%.*]] = cir.cast bitcast %{{.*}} : {{.*}} -> !cir.vector<2 x !s64i>
23+
// CIR: [[B64:%.*]] = cir.cast bitcast %{{.*}} : {{.*}} -> !cir.vector<2 x !s64i>
24+
// CIR: [[SHL_A:%.*]] = cir.shift(left, [[A64]] : !cir.vector<2 x !s64i>, [[SV]] : !cir.vector<2 x !s64i>)
25+
// CIR: [[ASHR_A:%.*]] = cir.shift(right, [[SHL_A]] : !cir.vector<2 x !s64i>, [[SV]] : !cir.vector<2 x !s64i>)
26+
// CIR: [[SHL_B:%.*]] = cir.shift(left, [[B64]] : !cir.vector<2 x !s64i>, [[SV]] : !cir.vector<2 x !s64i>)
27+
// CIR: [[ASHR_B:%.*]] = cir.shift(right, [[SHL_B]] : !cir.vector<2 x !s64i>, [[SV]] : !cir.vector<2 x !s64i>)
28+
// CIR: [[MUL:%.*]] = cir.binop(mul, [[ASHR_A]], [[ASHR_B]])
29+
30+
// LLVM-LABEL: _mm_mul_epi32
31+
// LLVM: shl <2 x i64> %{{.*}}, splat (i64 32)
32+
// LLVM: ashr <2 x i64> %{{.*}}, splat (i64 32)
33+
// LLVM: shl <2 x i64> %{{.*}}, splat (i64 32)
34+
// LLVM: ashr <2 x i64> %{{.*}}, splat (i64 32)
35+
// LLVM: mul <2 x i64> %{{.*}}, %{{.*}}
36+
37+
// OGCG-LABEL: _mm_mul_epi32
38+
// OGCG: shl <2 x i64> %{{.*}}, splat (i64 32)
39+
// OGCG: ashr <2 x i64> %{{.*}}, splat (i64 32)
40+
// OGCG: shl <2 x i64> %{{.*}}, splat (i64 32)
41+
// OGCG: ashr <2 x i64> %{{.*}}, splat (i64 32)
42+
// OGCG: mul <2 x i64> %{{.*}}, %{{.*}}
43+
44+
return _mm_mul_epi32(x, y);
45+
}

0 commit comments

Comments
 (0)