Skip to content

Commit b6118d4

Browse files
Finish upstreaming CIR codegen for sse and sse2 builtins
1 parent c562999 commit b6118d4

File tree

4 files changed

+227
-9
lines changed

4 files changed

+227
-9
lines changed

clang/lib/CIR/CodeGen/CIRGenBuilder.h

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -462,6 +462,36 @@ class CIRGenBuilderTy : public cir::CIRBaseBuilderTy {
462462
align, order);
463463
}
464464

465+
/// Create a cir::VecShuffleOp over \p vec1 and \p vec2 whose mask is supplied
/// as already-built attributes. The result is a vector with the element type
/// of \p vec1 and one element per entry in \p maskAttrs.
cir::VecShuffleOp
466+
createVecShuffle(mlir::Location loc, mlir::Value vec1, mlir::Value vec2,
467+
llvm::ArrayRef<mlir::Attribute> maskAttrs) {
468+
// vec1 is cast to cir::VectorType, so callers must pass a vector value.
auto vecType = mlir::cast<cir::VectorType>(vec1.getType());
469+
// The result length is taken from the mask, not from the input vectors.
auto resultTy = cir::VectorType::get(getContext(), vecType.getElementType(),
470+
maskAttrs.size());
471+
return cir::VecShuffleOp::create(*this, loc, resultTy, vec1, vec2,
472+
getArrayAttr(maskAttrs));
473+
}
474+
475+
/// Create a cir::VecShuffleOp over \p vec1 and \p vec2 from plain integer mask
/// indices. Each index is wrapped in a cir::IntAttr typed with getSInt32Ty()
/// and the call is forwarded to the attribute-based overload.
cir::VecShuffleOp createVecShuffle(mlir::Location loc, mlir::Value vec1,
476+
mlir::Value vec2,
477+
llvm::ArrayRef<int64_t> mask) {
478+
llvm::SmallVector<mlir::Attribute, 4> maskAttrs;
479+
// Iterate with the mask's own element type: declaring the loop variable as
// int32_t would silently narrow each index before the attribute is built.
for (int64_t idx : mask) {
480+
maskAttrs.push_back(cir::IntAttr::get(getSInt32Ty(), idx));
481+
}
482+
483+
return createVecShuffle(loc, vec1, vec2, maskAttrs);
484+
}
485+
486+
/// Create a shuffle of the single vector \p vec1 using the integer indices in
/// \p mask. Forwards to the two-vector overload with a poison constant of
/// vec1's type as the second operand.
cir::VecShuffleOp createVecShuffle(mlir::Location loc, mlir::Value vec1,
487+
llvm::ArrayRef<int64_t> mask) {
488+
// Create a unary shuffle. The second vector operand of the IR instruction
489+
// is poison.
490+
return createVecShuffle(
491+
loc, vec1, getConstant(loc, getAttr<cir::PoisonAttr>(vec1.getType())),
492+
mask);
493+
}
494+
465495
/// Create a cir.complex.real_ptr operation that derives a pointer to the real
466496
/// part of the complex value pointed to by the specified pointer value.
467497
mlir::Value createComplexRealPtr(mlir::Location loc, mlir::Value value) {

clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp

Lines changed: 97 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -134,9 +134,17 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID,
134134
case X86::BI__builtin_ia32_tzcnt_u16:
135135
case X86::BI__builtin_ia32_tzcnt_u32:
136136
case X86::BI__builtin_ia32_tzcnt_u64:
137+
UNIMPLEMENTED_BUILTIN();
137138
case X86::BI__builtin_ia32_undef128:
138139
case X86::BI__builtin_ia32_undef256:
139140
case X86::BI__builtin_ia32_undef512:
141+
// The x86 definition of "undef" is not the same as the LLVM definition
142+
// (PR32176). We leave optimizing away an unnecessary zero constant to the
143+
// IR optimizer and backend.
144+
// TODO: If we had a "freeze" IR instruction to generate a fixed undef
145+
// value, we should use that here instead of a zero.
146+
return builder.getNullValue(convertType(e->getType()),
147+
getLoc(e->getExprLoc()));
140148
case X86::BI__builtin_ia32_vec_ext_v4hi:
141149
case X86::BI__builtin_ia32_vec_ext_v16qi:
142150
case X86::BI__builtin_ia32_vec_ext_v8hi:
@@ -146,7 +154,24 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID,
146154
case X86::BI__builtin_ia32_vec_ext_v32qi:
147155
case X86::BI__builtin_ia32_vec_ext_v16hi:
148156
case X86::BI__builtin_ia32_vec_ext_v8si:
149-
case X86::BI__builtin_ia32_vec_ext_v4di:
157+
case X86::BI__builtin_ia32_vec_ext_v4di: {
158+
unsigned NumElts = cast<cir::VectorType>(ops[0].getType()).getSize();
159+
160+
uint64_t index =
161+
ops[1].getDefiningOp<cir::ConstantOp>().getIntValue().getZExtValue();
162+
163+
index &= NumElts - 1;
164+
165+
auto indexAttr = cir::IntAttr::get(
166+
cir::IntType::get(&getMLIRContext(), 64, false), index);
167+
auto indexVal =
168+
cir::ConstantOp::create(builder, getLoc(e->getExprLoc()), indexAttr);
169+
170+
// These builtins exist so we can ensure the index is an ICE and in range.
171+
// Otherwise we could just do this in the header file.
172+
return cir::VecExtractOp::create(builder, getLoc(e->getExprLoc()), ops[0],
173+
indexVal);
174+
}
150175
case X86::BI__builtin_ia32_vec_set_v4hi:
151176
case X86::BI__builtin_ia32_vec_set_v16qi:
152177
case X86::BI__builtin_ia32_vec_set_v8hi:
@@ -499,12 +524,55 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID,
499524
case X86::BI__builtin_ia32_pblendw256:
500525
case X86::BI__builtin_ia32_pblendd128:
501526
case X86::BI__builtin_ia32_pblendd256:
527+
UNIMPLEMENTED_BUILTIN();
502528
case X86::BI__builtin_ia32_pshuflw:
503529
case X86::BI__builtin_ia32_pshuflw256:
504-
case X86::BI__builtin_ia32_pshuflw512:
530+
case X86::BI__builtin_ia32_pshuflw512: {
531+
unsigned imm =
532+
ops[1].getDefiningOp<cir::ConstantOp>().getIntValue().getZExtValue();
533+
auto Ty = cast<cir::VectorType>(ops[0].getType());
534+
unsigned numElts = Ty.getSize();
535+
536+
// Splat the 8-bits of immediate 4 times to help the loop wrap around.
537+
imm = (imm & 0xff) * 0x01010101;
538+
539+
int64_t indices[32];
540+
for (unsigned l = 0; l != numElts; l += 8) {
541+
for (unsigned i = 0; i != 4; ++i) {
542+
indices[l + i] = l + (imm & 3);
543+
imm >>= 2;
544+
}
545+
for (unsigned i = 4; i != 8; ++i)
546+
indices[l + i] = l + i;
547+
}
548+
549+
return builder.createVecShuffle(getLoc(e->getExprLoc()), ops[0],
550+
ArrayRef(indices, numElts));
551+
}
505552
case X86::BI__builtin_ia32_pshufhw:
506553
case X86::BI__builtin_ia32_pshufhw256:
507-
case X86::BI__builtin_ia32_pshufhw512:
554+
case X86::BI__builtin_ia32_pshufhw512: {
555+
unsigned imm =
556+
ops[1].getDefiningOp<cir::ConstantOp>().getIntValue().getZExtValue();
557+
auto ty = cast<cir::VectorType>(ops[0].getType());
558+
unsigned numElts = ty.getSize();
559+
560+
// Splat the 8-bits of immediate 4 times to help the loop wrap around.
561+
imm = (imm & 0xff) * 0x01010101;
562+
563+
int64_t indices[32];
564+
for (unsigned l = 0; l != numElts; l += 8) {
565+
for (unsigned i = 0; i != 4; ++i)
566+
indices[l + i] = l + i;
567+
for (unsigned i = 4; i != 8; ++i) {
568+
indices[l + i] = l + 4 + (imm & 3);
569+
imm >>= 2;
570+
}
571+
}
572+
573+
return builder.createVecShuffle(getLoc(e->getExprLoc()), ops[0],
574+
ArrayRef(indices, numElts));
575+
}
508576
case X86::BI__builtin_ia32_pshufd:
509577
case X86::BI__builtin_ia32_pshufd256:
510578
case X86::BI__builtin_ia32_pshufd512:
@@ -514,12 +582,37 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID,
514582
case X86::BI__builtin_ia32_vpermilps256:
515583
case X86::BI__builtin_ia32_vpermilpd512:
516584
case X86::BI__builtin_ia32_vpermilps512:
585+
UNIMPLEMENTED_BUILTIN();
517586
case X86::BI__builtin_ia32_shufpd:
518587
case X86::BI__builtin_ia32_shufpd256:
519588
case X86::BI__builtin_ia32_shufpd512:
520589
case X86::BI__builtin_ia32_shufps:
521590
case X86::BI__builtin_ia32_shufps256:
522-
case X86::BI__builtin_ia32_shufps512:
591+
case X86::BI__builtin_ia32_shufps512: {
592+
unsigned imm =
593+
ops[2].getDefiningOp<cir::ConstantOp>().getIntValue().getZExtValue();
594+
auto ty = cast<cir::VectorType>(ops[0].getType());
595+
unsigned numElts = ty.getSize();
596+
unsigned numLanes = cgm.getDataLayout().getTypeSizeInBits(ty) / 128;
597+
unsigned numLaneElts = numElts / numLanes;
598+
599+
// Splat the 8-bits of immediate 4 times to help the loop wrap around.
600+
imm = (imm & 0xff) * 0x01010101;
601+
602+
int64_t indices[16];
603+
for (unsigned l = 0; l != numElts; l += numLaneElts) {
604+
for (unsigned i = 0; i != numLaneElts; ++i) {
605+
unsigned index = imm % numLaneElts;
606+
imm /= numLaneElts;
607+
if (i >= (numLaneElts / 2))
608+
index += numElts;
609+
indices[l + i] = l + index;
610+
}
611+
}
612+
613+
return builder.createVecShuffle(getLoc(e->getExprLoc()), ops[0], ops[1],
614+
ArrayRef(indices, numElts));
615+
}
523616
case X86::BI__builtin_ia32_permdi256:
524617
case X86::BI__builtin_ia32_permdf256:
525618
case X86::BI__builtin_ia32_permdi512:

clang/test/CIR/CodeGen/X86/sse-builtins.c

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,9 @@
88
// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +sse -fclangir -emit-llvm -o %t.ll -Wall -Werror
99
// RUN: FileCheck --check-prefixes=LLVM --input-file=%t.ll %s
1010

11+
// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +sse -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=OGCG
12+
// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +sse -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=OGCG
13+
1114
// This test mimics clang/test/CodeGen/X86/sse-builtins.c, which eventually
1215
// CIR shall be able to support fully.
1316

@@ -50,3 +53,15 @@ unsigned int test_mm_getcsr(void) {
5053
// LLVM: load i32
5154
return _mm_getcsr();
5255
}
56+
57+
// With selector 0 the expected shuffle mask is <0, 0, 4, 4> for all three
// pipelines (CIR, CIR lowered to LLVM IR, and classic codegen).
__m128 test_mm_shuffle_ps(__m128 A, __m128 B) {
58+
// CIR-LABEL: _mm_shuffle_ps
59+
// CIR: %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x !cir.float>) [#cir.int<0> : !s32i, #cir.int<0> : !s32i, #cir.int<4> : !s32i, #cir.int<4> : !s32i] : !cir.vector<4 x !cir.float>
60+
61+
// LLVM-LABEL: test_mm_shuffle_ps
62+
// LLVM: shufflevector <4 x float> {{.*}}, <4 x float> {{.*}}, <4 x i32> <i32 0, i32 0, i32 4, i32 4>
63+
64+
// OGCG-LABEL: test_mm_shuffle_ps
65+
// OGCG: shufflevector <4 x float> {{.*}}, <4 x float> {{.*}}, <4 x i32> <i32 0, i32 0, i32 4, i32 4>
66+
return _mm_shuffle_ps(A, B, 0);
67+
}

clang/test/CIR/CodeGen/X86/sse2-builtins.c

Lines changed: 85 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,18 @@
11
// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +sse2 -fclangir -emit-cir -o %t.cir -Wall -Werror
2-
// RUN: FileCheck --check-prefixes=CIR-CHECK --input-file=%t.cir %s
2+
// RUN: FileCheck --check-prefixes=CIR-CHECK,CIR-X64 --input-file=%t.cir %s
33
// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +sse2 -fno-signed-char -fclangir -emit-cir -o %t.cir -Wall -Werror
4-
// RUN: FileCheck --check-prefixes=CIR-CHECK --input-file=%t.cir %s
4+
// RUN: FileCheck --check-prefixes=CIR-CHECK,CIR-X64 --input-file=%t.cir %s
55

66
// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +sse2 -fclangir -emit-llvm -o %t.ll -Wall -Werror
7-
// RUN: FileCheck --check-prefixes=LLVM-CHECK --input-file=%t.ll %s
7+
// RUN: FileCheck --check-prefixes=LLVM-CHECK,LLVM-X64 --input-file=%t.ll %s
88
// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +sse2 -fno-signed-char -fclangir -emit-llvm -o %t.ll -Wall -Werror
9-
// RUN: FileCheck --check-prefixes=LLVM-CHECK --input-file=%t.ll %s
9+
// RUN: FileCheck --check-prefixes=LLVM-CHECK,LLVM-X64 --input-file=%t.ll %s
10+
11+
// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +sse2 -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=OGCG
12+
// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +sse2 -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=OGCG
13+
14+
// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +sse2 -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=OGCG
15+
// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +sse2 -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=OGCG
1016

1117
// This test mimics clang/test/CodeGen/X86/sse2-builtins.c, which eventually
1218
// CIR shall be able to support fully.
@@ -21,6 +27,44 @@ void test_mm_clflush(void* A) {
2127
// LLVM-CHECK: call void @llvm.x86.sse2.clflush(ptr {{%.*}})
2228
}
2329

30+
// The checks below expect the "undefined" vector to be materialized as a zero
// constant in CIR and as a zeroinitializer store/load pair after lowering.
__m128d test_mm_undefined_pd(void) {
31+
// CIR-X64-LABEL: _mm_undefined_pd
32+
// CIR-X64: %{{.*}} = cir.const #cir.zero : !cir.vector<2 x !cir.double>
33+
// CIR-X64: cir.return %{{.*}} : !cir.vector<2 x !cir.double>
34+
35+
// LLVM-X64-LABEL: test_mm_undefined_pd
36+
// LLVM-X64: store <2 x double> zeroinitializer, ptr %[[A:.*]], align 16
37+
// LLVM-X64: %{{.*}} = load <2 x double>, ptr %[[A]], align 16
38+
// LLVM-X64: ret <2 x double> %{{.*}}
39+
return _mm_undefined_pd();
40+
}
41+
42+
// The checks below expect a zero <2 x double> constant that is bitcast to
// <2 x i64> in CIR, and a zeroinitializer store/load pair after lowering.
__m128i test_mm_undefined_si128(void) {
43+
// CIR-CHECK-LABEL: _mm_undefined_si128
44+
// CIR-CHECK: %[[A:.*]] = cir.const #cir.zero : !cir.vector<2 x !cir.double>
45+
// CIR-CHECK: %{{.*}} = cir.cast bitcast %[[A]] : !cir.vector<2 x !cir.double> -> !cir.vector<2 x !s64i>
46+
// CIR-CHECK: cir.return %{{.*}} : !cir.vector<2 x !s64i>
47+
48+
// LLVM-CHECK-LABEL: test_mm_undefined_si128
49+
// LLVM-CHECK: store <2 x i64> zeroinitializer, ptr %[[A:.*]], align 16
50+
// LLVM-CHECK: %{{.*}} = load <2 x i64>, ptr %[[A]], align 16
51+
// LLVM-CHECK: ret <2 x i64> %{{.*}}
52+
return _mm_undefined_si128();
53+
}
54+
55+
// Lowering to pextrw requires optimization.
56+
// Extracting element 1 is expected to produce a vector element extract
// followed by a widening of the unsigned 16-bit value to a 32-bit int.
int test_mm_extract_epi16(__m128i A) {
57+
58+
// CIR-CHECK-LABEL: test_mm_extract_epi16
59+
// CIR-CHECK: %{{.*}} = cir.vec.extract %{{.*}}[%{{.*}} : {{!u32i|!u64i}}] : !cir.vector<8 x !s16i>
60+
// CIR-CHECK: %{{.*}} = cir.cast integral %{{.*}} : !u16i -> !s32i
61+
62+
// LLVM-CHECK-LABEL: test_mm_extract_epi16
63+
// LLVM-CHECK: extractelement <8 x i16> %{{.*}}, {{i32|i64}} 1
64+
// LLVM-CHECK: zext i16 %{{.*}} to i32
65+
return _mm_extract_epi16(A, 1);
66+
}
67+
2468
void test_mm_lfence(void) {
2569
// CIR-CHECK-LABEL: test_mm_lfence
2670
// LLVM-CHECK-LABEL: test_mm_lfence
@@ -35,4 +79,40 @@ void test_mm_mfence(void) {
3579
_mm_mfence();
3680
// CIR-CHECK: {{%.*}} = cir.llvm.intrinsic "x86.sse2.mfence" : () -> !void
3781
// LLVM-CHECK: call void @llvm.x86.sse2.mfence()
38-
}
82+
}
83+
84+
// With selector 0 the low four elements all read element 0 and the high four
// elements pass through unchanged (mask <0, 0, 0, 0, 4, 5, 6, 7>).
__m128i test_mm_shufflelo_epi16(__m128i A) {
85+
// CIR-CHECK-LABEL: _mm_shufflelo_epi16
86+
// CIR-CHECK: %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<8 x !s16i>) [#cir.int<0> : !s32i, #cir.int<0> : !s32i, #cir.int<0> : !s32i, #cir.int<0> : !s32i, #cir.int<4> : !s32i, #cir.int<5> : !s32i, #cir.int<6> : !s32i, #cir.int<7> : !s32i] : !cir.vector<8 x !s16i>
87+
88+
// LLVM-CHECK-LABEL: test_mm_shufflelo_epi16
89+
// LLVM-CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7>
90+
91+
// OGCG-LABEL: test_mm_shufflelo_epi16
92+
// OGCG: shufflevector <8 x i16> %{{.*}}, <8 x i16> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7>
93+
return _mm_shufflelo_epi16(A, 0);
94+
}
95+
96+
// With selector 0 the low four elements pass through unchanged and the high
// four elements all read element 4 (mask <0, 1, 2, 3, 4, 4, 4, 4>).
__m128i test_mm_shufflehi_epi16(__m128i A) {
97+
// CIR-CHECK-LABEL: _mm_shufflehi_epi16
98+
// CIR-CHECK: %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<8 x !s16i>) [#cir.int<0> : !s32i, #cir.int<1> : !s32i, #cir.int<2> : !s32i, #cir.int<3> : !s32i, #cir.int<4> : !s32i, #cir.int<4> : !s32i, #cir.int<4> : !s32i, #cir.int<4> : !s32i] : !cir.vector<8 x !s16i>
99+
100+
// LLVM-CHECK-LABEL: test_mm_shufflehi_epi16
101+
// LLVM-CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4>
102+
103+
// OGCG-LABEL: test_mm_shufflehi_epi16
104+
// OGCG: shufflevector <8 x i16> %{{.*}}, <8 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4>
105+
return _mm_shufflehi_epi16(A, 0);
106+
}
107+
108+
// With selector 1 the expected shuffle mask is <1, 2>: element 1 of A followed
// by element 0 of B.
__m128d test_mm_shuffle_pd(__m128d A, __m128d B) {
109+
// CIR-CHECK-LABEL: test_mm_shuffle_pd
110+
// CIR-CHECK: %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<2 x !cir.double>) [#cir.int<1> : !s32i, #cir.int<2> : !s32i] : !cir.vector<2 x !cir.double>
111+
112+
// LLVM-CHECK-LABEL: test_mm_shuffle_pd
113+
// LLVM-CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x i32> <i32 1, i32 2>
114+
115+
// OGCG-LABEL: test_mm_shuffle_pd
116+
// OGCG: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x i32> <i32 1, i32 2>
117+
return _mm_shuffle_pd(A, B, 1);
118+
}

0 commit comments

Comments
 (0)