Skip to content

Commit 14da804

Browse files
committed
[CIR][X86] Implement lowering for pmuldq / pmuludq builtins
This patch adds CIR codegen support for X86 pmuldq and pmuludq operations, covering the signed and unsigned variants across all supported vector widths. The builtins now lower to the expected CIR representation matching the semantics of the corresponding LLVM intrinsics.
1 parent 036279a commit 14da804

File tree

5 files changed

+238
-0
lines changed

5 files changed

+238
-0
lines changed

clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,47 @@ static mlir::Value emitX86MaskLogic(CIRGenBuilderTy &builder,
115115
ops[0].getType());
116116
}
117117

118+
static mlir::Value emitX86Muldq(CIRGenFunction &cgf, const CallExpr *expr,
119+
bool isSigned,
120+
SmallVectorImpl<mlir::Value> &ops) {
121+
CIRGenBuilderTy &builder = cgf.getBuilder();
122+
mlir::Location loc = cgf.getLoc(expr->getExprLoc());
123+
mlir::Type ty = ops[0].getType();
124+
unsigned tyPrimitiveSizeInBits =
125+
cgf.cgm.getDataLayout().getTypeSizeInBits(ty);
126+
mlir::Value lhs, rhs;
127+
// in cir, if a shiftOperation is shift right,it will be translated into Ashr
128+
// or lShr automatically in match and rewrite stage according to its operand's
129+
// type
130+
if (isSigned) {
131+
ty =
132+
cir::VectorType::get(builder.getSInt64Ty(), tyPrimitiveSizeInBits / 64);
133+
cir::ConstantOp shiftAmt =
134+
builder.getConstant(loc, cir::IntAttr::get(builder.getSInt64Ty(), 32));
135+
cir::VecSplatOp shiftSplatVecOp =
136+
cir::VecSplatOp::create(builder, loc, ty, shiftAmt.getResult());
137+
mlir::Value shiftSplatValue = shiftSplatVecOp.getResult();
138+
lhs = builder.createBitcast(loc, ops[0], ty);
139+
rhs = builder.createBitcast(loc, ops[1], ty);
140+
lhs = builder.createShift(loc, lhs, shiftSplatValue, true);
141+
lhs = builder.createShift(loc, lhs, shiftSplatValue, false);
142+
rhs = builder.createShift(loc, rhs, shiftSplatValue, true);
143+
rhs = builder.createShift(loc, rhs, shiftSplatValue, false);
144+
} else {
145+
ty =
146+
cir::VectorType::get(builder.getSInt64Ty(), tyPrimitiveSizeInBits / 64);
147+
cir::ConstantOp maskScalar = builder.getConstant(
148+
loc, cir::IntAttr::get(builder.getSInt64Ty(), 0xffffffff));
149+
cir::VecSplatOp mask =
150+
cir::VecSplatOp::create(builder, loc, ty, maskScalar.getResult());
151+
lhs = builder.createBitcast(loc, ops[0], ty);
152+
rhs = builder.createBitcast(loc, ops[1], ty);
153+
lhs = builder.createAnd(loc, lhs, mask);
154+
rhs = builder.createAnd(loc, rhs, mask);
155+
}
156+
return builder.createMul(loc, lhs, rhs);
157+
}
158+
118159
mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID,
119160
const CallExpr *expr) {
120161
if (builtinID == Builtin::BI__builtin_cpu_is) {
@@ -851,12 +892,18 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID,
851892
case X86::BI__builtin_ia32_sqrtph512:
852893
case X86::BI__builtin_ia32_sqrtps512:
853894
case X86::BI__builtin_ia32_sqrtpd512:
895+
cgm.errorNYI(expr->getSourceRange(),
896+
std::string("unimplemented X86 builtin call: ") +
897+
getContext().BuiltinInfo.getName(builtinID));
898+
return {};
854899
case X86::BI__builtin_ia32_pmuludq128:
855900
case X86::BI__builtin_ia32_pmuludq256:
856901
case X86::BI__builtin_ia32_pmuludq512:
902+
return emitX86Muldq(*this, expr, /*IsSigned*/ false, ops);
857903
case X86::BI__builtin_ia32_pmuldq128:
858904
case X86::BI__builtin_ia32_pmuldq256:
859905
case X86::BI__builtin_ia32_pmuldq512:
906+
return emitX86Muldq(*this, expr, /*IsSigned*/ true, ops);
860907
case X86::BI__builtin_ia32_pternlogd512_mask:
861908
case X86::BI__builtin_ia32_pternlogq512_mask:
862909
case X86::BI__builtin_ia32_pternlogd128_mask:
Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx2 -fclangir -emit-cir -o %t.cir -Wall -Werror -Wsign-conversion
2+
// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
3+
// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx2 -fclangir -emit-llvm -o %t.ll -Wall -Werror -Wsign-conversion
4+
// RUN: FileCheck --check-prefixes=LLVM --input-file=%t.ll %s
5+
6+
// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx2 -fclangir -emit-cir -o %t.cir -Wall -Werror -Wsign-conversion
7+
// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
8+
// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx2 -fclangir -emit-llvm -o %t.ll -Wall -Werror -Wsign-conversion
9+
// RUN: FileCheck --check-prefixes=LLVM --input-file=%t.ll %s
10+
11+
// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx2 -emit-llvm -o - -Wall -Werror -Wsign-conversion | FileCheck %s --check-prefixes=OGCG
12+
// RUN: %clang_cc1 -x c -flax-vector-conversions=none -fms-extensions -fms-compatibility -ffreestanding %s -triple=x86_64-windows-msvc -target-feature +avx2 -emit-llvm -o - -Wall -Werror -Wsign-conversion | FileCheck %s --check-prefixes=OGCG
13+
// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx2 -emit-llvm -o - -Wall -Werror -Wsign-conversion | FileCheck %s --check-prefixes=OGCG
14+
// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -fms-extensions -fms-compatibility -ffreestanding %s -triple=x86_64-windows-msvc -target-feature +avx2 -emit-llvm -o - -Wall -Werror -Wsign-conversion | FileCheck %s --check-prefixes=OGCG
15+
16+
#include <immintrin.h>
17+
18+
__m256i test_mm256_mul_epu32(__m256i a, __m256i b) {
19+
// CIR-LABEL: _mm256_mul_epu32
20+
// CIR: [[MASK_SCALAR:%.*]] = cir.const #cir.int<4294967295> : !s64i
21+
// CIR: [[MASK_VEC:%.*]] = cir.vec.splat [[MASK_SCALAR]] : !s64i, !cir.vector<4 x !s64i>
22+
// CIR: [[BC_A:%.*]] = cir.cast bitcast %{{.*}} : {{.*}} -> !cir.vector<4 x !s64i>
23+
// CIR: [[BC_B:%.*]] = cir.cast bitcast %{{.*}} : {{.*}} -> !cir.vector<4 x !s64i>
24+
// CIR: [[AND_A:%.*]] = cir.binop(and, [[BC_A]], [[MASK_VEC]])
25+
// CIR: [[AND_B:%.*]] = cir.binop(and, [[BC_B]], [[MASK_VEC]])
26+
// CIR: [[MUL:%.*]] = cir.binop(mul, [[AND_A]], [[AND_B]])
27+
28+
// LLVM-LABEL: _mm256_mul_epu32
29+
// LLVM: and <4 x i64> %{{.*}}, splat (i64 4294967295)
30+
// LLVM: and <4 x i64> %{{.*}}, splat (i64 4294967295)
31+
// LLVM: mul <4 x i64> %{{.*}}, %{{.*}}
32+
33+
// OGCG-LABEL: _mm256_mul_epu32
34+
// OGCG: and <4 x i64> %{{.*}}, splat (i64 4294967295)
35+
// OGCG: and <4 x i64> %{{.*}}, splat (i64 4294967295)
36+
// OGCG: mul <4 x i64> %{{.*}}, %{{.*}}
37+
38+
return _mm256_mul_epu32(a, b);
39+
}
40+
41+
__m256i test_mm256_mul_epi32(__m256i a, __m256i b) {
42+
// CIR-LABEL: _mm256_mul_epi32
43+
// CIR: [[SC:%.*]] = cir.const #cir.int<32> : !s64i
44+
// CIR: [[SV:%.*]] = cir.vec.splat [[SC]] : !s64i, !cir.vector<4 x !s64i>
45+
// CIR: [[A64:%.*]] = cir.cast bitcast %{{.*}} : {{.*}} -> !cir.vector<4 x !s64i>
46+
// CIR: [[B64:%.*]] = cir.cast bitcast %{{.*}} : {{.*}} -> !cir.vector<4 x !s64i>
47+
// CIR: [[SHL_A:%.*]] = cir.shift(left, [[A64]] : !cir.vector<4 x !s64i>, [[SV]] : !cir.vector<4 x !s64i>)
48+
// CIR: [[ASHR_A:%.*]] = cir.shift(right, [[SHL_A]] : !cir.vector<4 x !s64i>, [[SV]] : !cir.vector<4 x !s64i>)
49+
// CIR: [[SHL_B:%.*]] = cir.shift(left, [[B64]] : !cir.vector<4 x !s64i>, [[SV]] : !cir.vector<4 x !s64i>)
50+
// CIR: [[ASHR_B:%.*]] = cir.shift(right, [[SHL_B]] : !cir.vector<4 x !s64i>, [[SV]] : !cir.vector<4 x !s64i>)
51+
// CIR: [[MUL:%.*]] = cir.binop(mul, [[ASHR_A]], [[ASHR_B]])
52+
53+
// LLVM-LABEL: _mm256_mul_epi32
54+
// LLVM: shl <4 x i64> %{{.*}}, splat (i64 32)
55+
// LLVM: ashr <4 x i64> %{{.*}}, splat (i64 32)
56+
// LLVM: shl <4 x i64> %{{.*}}, splat (i64 32)
57+
// LLVM: ashr <4 x i64> %{{.*}}, splat (i64 32)
58+
// LLVM: mul <4 x i64> %{{.*}}, %{{.*}}
59+
60+
// OGCG-LABEL: _mm256_mul_epi32
61+
// OGCG: shl <4 x i64> %{{.*}}, splat (i64 32)
62+
// OGCG: ashr <4 x i64> %{{.*}}, splat (i64 32)
63+
// OGCG: shl <4 x i64> %{{.*}}, splat (i64 32)
64+
// OGCG: ashr <4 x i64> %{{.*}}, splat (i64 32)
65+
// OGCG: mul <4 x i64> %{{.*}}, %{{.*}}
66+
67+
return _mm256_mul_epi32(a, b);
68+
}

clang/test/CIR/CodeGenBuiltins/X86/avx512f-builtins.c

100644100755
Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -228,3 +228,58 @@ __mmask16 test_kmov_w(__mmask16 A) {
228228
// OGCG: bitcast <16 x i1> {{.*}} to i16
229229
return __builtin_ia32_kmovw(A);
230230
}
231+
232+
233+
__m512i test_mm512_mul_epi32(__m512i __A, __m512i __B) {
234+
// CIR-LABEL: _mm512_mul_epi32
235+
// CIR: [[SC:%.*]] = cir.const #cir.int<32> : !s64i
236+
// CIR: [[SV:%.*]] = cir.vec.splat [[SC]] : !s64i, !cir.vector<8 x !s64i>
237+
// CIR: [[A64:%.*]] = cir.cast bitcast %{{.*}} : !cir.vector<16 x !s32i> -> !cir.vector<8 x !s64i>
238+
// CIR: [[B64:%.*]] = cir.cast bitcast %{{.*}} : !cir.vector<16 x !s32i> -> !cir.vector<8 x !s64i>
239+
// CIR: [[SHL_A:%.*]] = cir.shift(left, [[A64]] : !cir.vector<8 x !s64i>, [[SV]] : !cir.vector<8 x !s64i>)
240+
// CIR: [[ASHR_A:%.*]] = cir.shift(right, [[SHL_A]] : !cir.vector<8 x !s64i>, [[SV]] : !cir.vector<8 x !s64i>)
241+
// CIR: [[SHL_B:%.*]] = cir.shift(left, [[B64]] : !cir.vector<8 x !s64i>, [[SV]] : !cir.vector<8 x !s64i>)
242+
// CIR: [[ASHR_B:%.*]] = cir.shift(right, [[SHL_B]] : !cir.vector<8 x !s64i>, [[SV]] : !cir.vector<8 x !s64i>)
243+
// CIR: [[MUL:%.*]] = cir.binop(mul, [[ASHR_A]], [[ASHR_B]])
244+
245+
// LLVM-LABEL: _mm512_mul_epi32
246+
// LLVM: shl <8 x i64> %{{.*}}, splat (i64 32)
247+
// LLVM: ashr <8 x i64> %{{.*}}, splat (i64 32)
248+
// LLVM: shl <8 x i64> %{{.*}}, splat (i64 32)
249+
// LLVM: ashr <8 x i64> %{{.*}}, splat (i64 32)
250+
// LLVM: mul <8 x i64> %{{.*}}, %{{.*}}
251+
252+
// OGCG-LABEL: _mm512_mul_epi32
253+
// OGCG: shl <8 x i64> %{{.*}}, splat (i64 32)
254+
// OGCG: ashr <8 x i64> %{{.*}}, splat (i64 32)
255+
// OGCG: shl <8 x i64> %{{.*}}, splat (i64 32)
256+
// OGCG: ashr <8 x i64> %{{.*}}, splat (i64 32)
257+
// OGCG: mul <8 x i64> %{{.*}}, %{{.*}}
258+
259+
return _mm512_mul_epi32(__A, __B);
260+
}
261+
262+
263+
__m512i test_mm512_mul_epu32(__m512i __A, __m512i __B) {
264+
// CIR-LABEL: _mm512_mul_epu32
265+
// CIR: [[MASK_SCALAR:%.*]] = cir.const #cir.int<4294967295> : !s64i
266+
// CIR: [[MASK_VEC:%.*]] = cir.vec.splat [[MASK_SCALAR]] : !s64i, !cir.vector<8 x !s64i>
267+
// CIR: [[BC_A:%.*]] = cir.cast bitcast %{{.*}} : {{.*}} -> !cir.vector<8 x !s64i>
268+
// CIR: [[BC_B:%.*]] = cir.cast bitcast %{{.*}} : {{.*}} -> !cir.vector<8 x !s64i>
269+
// CIR: [[AND_A:%.*]] = cir.binop(and, [[BC_A]], [[MASK_VEC]])
270+
// CIR: [[AND_B:%.*]] = cir.binop(and, [[BC_B]], [[MASK_VEC]])
271+
// CIR: [[MUL:%.*]] = cir.binop(mul, [[AND_A]], [[AND_B]])
272+
273+
// LLVM-LABEL: _mm512_mul_epu32
274+
// LLVM: and <8 x i64> %{{.*}}, splat (i64 4294967295)
275+
// LLVM: and <8 x i64> %{{.*}}, splat (i64 4294967295)
276+
// LLVM: mul <8 x i64> %{{.*}}, %{{.*}}
277+
278+
// OGCG-LABEL: _mm512_mul_epu32
279+
// OGCG: and <8 x i64> %{{.*}}, splat (i64 4294967295)
280+
// OGCG: and <8 x i64> %{{.*}}, splat (i64 4294967295)
281+
// OGCG: mul <8 x i64> %{{.*}}, %{{.*}}
282+
283+
return _mm512_mul_epu32(__A, __B);
284+
}
285+

clang/test/CIR/CodeGenBuiltins/X86/sse2-builtins.c

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -108,3 +108,26 @@ void test_mm_pause(void) {
108108
// LLVM: call void @llvm.x86.sse2.pause()
109109
// OGCG: call void @llvm.x86.sse2.pause()
110110
}
111+
112+
__m128i test_mm_mul_epu32(__m128i A, __m128i B) {
113+
// CIR-LABEL: _mm_mul_epu32
114+
// CIR: [[MASK_SCALAR:%.*]] = cir.const #cir.int<4294967295> : !s64i
115+
// CIR: [[MASK_VEC:%.*]] = cir.vec.splat [[MASK_SCALAR]] : !s64i, !cir.vector<2 x !s64i>
116+
// CIR: [[BC_A:%.*]] = cir.cast bitcast %{{.*}} : {{.*}} -> !cir.vector<2 x !s64i>
117+
// CIR: [[BC_B:%.*]] = cir.cast bitcast %{{.*}} : {{.*}} -> !cir.vector<2 x !s64i>
118+
// CIR: [[AND_A:%.*]] = cir.binop(and, [[BC_A]], [[MASK_VEC]])
119+
// CIR: [[AND_B:%.*]] = cir.binop(and, [[BC_B]], [[MASK_VEC]])
120+
// CIR: [[MUL:%.*]] = cir.binop(mul, [[AND_A]], [[AND_B]])
121+
122+
// LLVM-LABEL: _mm_mul_epu32
123+
// LLVM: and <2 x i64> %{{.*}}, splat (i64 4294967295)
124+
// LLVM: and <2 x i64> %{{.*}}, splat (i64 4294967295)
125+
// LLVM: mul <2 x i64> %{{.*}}, %{{.*}}
126+
127+
// OGCG-LABEL: _mm_mul_epu32
128+
// OGCG: and <2 x i64> %{{.*}}, splat (i64 4294967295)
129+
// OGCG: and <2 x i64> %{{.*}}, splat (i64 4294967295)
130+
// OGCG: mul <2 x i64> %{{.*}}, %{{.*}}
131+
132+
return _mm_mul_epu32(A, B);
133+
}
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +sse4.1 -fclangir -emit-cir -o %t.cir -Wall -Werror -Wsign-conversion
2+
// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
3+
// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +sse4.1 -fclangir -emit-llvm -o %t.ll -Wall -Werror -Wsign-conversion
4+
// RUN: FileCheck --check-prefixes=LLVM --input-file=%t.ll %s
5+
6+
// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +sse4.1 -fclangir -emit-cir -o %t.cir -Wall -Werror -Wsign-conversion
7+
// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
8+
// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +sse4.1 -fclangir -emit-llvm -o %t.ll -Wall -Werror -Wsign-conversion
9+
// RUN: FileCheck --check-prefixes=LLVM --input-file=%t.ll %s
10+
11+
// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +sse4.1 -emit-llvm -o - -Wall -Werror -Wsign-conversion | FileCheck %s --check-prefixes=OGCG
12+
// RUN: %clang_cc1 -x c -flax-vector-conversions=none -fms-extensions -fms-compatibility -ffreestanding %s -triple=x86_64-windows-msvc -target-feature +sse4.1 -emit-llvm -o - -Wall -Werror -Wsign-conversion | FileCheck %s --check-prefixes=OGCG
13+
// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +sse4.1 -emit-llvm -o - -Wall -Werror -Wsign-conversion | FileCheck %s --check-prefixes=OGCG
14+
// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -fms-extensions -fms-compatibility -ffreestanding %s -triple=x86_64-windows-msvc -target-feature +sse4.1 -emit-llvm -o - -Wall -Werror -Wsign-conversion | FileCheck %s --check-prefixes=OGCG
15+
16+
#include <immintrin.h>
17+
18+
__m128i test_mm_mul_epi32(__m128i x, __m128i y) {
19+
// CIR-LABEL: _mm_mul_epi32
20+
// CIR: [[SC:%.*]] = cir.const #cir.int<32> : !s64i
21+
// CIR: [[SV:%.*]] = cir.vec.splat [[SC]] : !s64i, !cir.vector<2 x !s64i>
22+
// CIR: [[A64:%.*]] = cir.cast bitcast %{{.*}} : {{.*}} -> !cir.vector<2 x !s64i>
23+
// CIR: [[B64:%.*]] = cir.cast bitcast %{{.*}} : {{.*}} -> !cir.vector<2 x !s64i>
24+
// CIR: [[SHL_A:%.*]] = cir.shift(left, [[A64]] : !cir.vector<2 x !s64i>, [[SV]] : !cir.vector<2 x !s64i>)
25+
// CIR: [[ASHR_A:%.*]] = cir.shift(right, [[SHL_A]] : !cir.vector<2 x !s64i>, [[SV]] : !cir.vector<2 x !s64i>)
26+
// CIR: [[SHL_B:%.*]] = cir.shift(left, [[B64]] : !cir.vector<2 x !s64i>, [[SV]] : !cir.vector<2 x !s64i>)
27+
// CIR: [[ASHR_B:%.*]] = cir.shift(right, [[SHL_B]] : !cir.vector<2 x !s64i>, [[SV]] : !cir.vector<2 x !s64i>)
28+
// CIR: [[MUL:%.*]] = cir.binop(mul, [[ASHR_A]], [[ASHR_B]])
29+
30+
// LLVM-LABEL: _mm_mul_epi32
31+
// LLVM: shl <2 x i64> %{{.*}}, splat (i64 32)
32+
// LLVM: ashr <2 x i64> %{{.*}}, splat (i64 32)
33+
// LLVM: shl <2 x i64> %{{.*}}, splat (i64 32)
34+
// LLVM: ashr <2 x i64> %{{.*}}, splat (i64 32)
35+
// LLVM: mul <2 x i64> %{{.*}}, %{{.*}}
36+
37+
// OGCG-LABEL: _mm_mul_epi32
38+
// OGCG: shl <2 x i64> %{{.*}}, splat (i64 32)
39+
// OGCG: ashr <2 x i64> %{{.*}}, splat (i64 32)
40+
// OGCG: shl <2 x i64> %{{.*}}, splat (i64 32)
41+
// OGCG: ashr <2 x i64> %{{.*}}, splat (i64 32)
42+
// OGCG: mul <2 x i64> %{{.*}}, %{{.*}}
43+
44+
return _mm_mul_epi32(x, y);
45+
}

0 commit comments

Comments
 (0)