Skip to content

Commit af2c1b7

Browse files
Thibault-MonnierHoney Goyal
authored andcommitted
[CIR] Upstream vec shuffle builtins in CIR codegen (llvm#169178)
This PR is part of llvm#167752. It upstreams the codegen and tests for the shuffle builtins implemented in the incubator, including: - `vinsert` + `insert` - `pblend` + `blend` - `vpermilp` - `pshuf` + `shufp` - `palignr` It does NOT upstream the `perm`, `vperm2`, `vpshuf`, `shuf_i` / `shuf_f` and `align` builtins, which are not yet implemented in the incubator. This _is_ a large commit, but most of it is tests. The `pshufd` / `vpermilp` builtins seem to have no test coverage in the incubator, what should I do?
1 parent 02466f5 commit af2c1b7

File tree

8 files changed

+394
-18
lines changed

8 files changed

+394
-18
lines changed

clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp

Lines changed: 89 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,69 @@ static mlir::Value getMaskVecValue(CIRGenBuilderTy &builder, mlir::Location loc,
8585
return maskVec;
8686
}
8787

88+
// Builds the VecShuffleOp for pshuflw and pshufhw x86 builtins.
89+
//
90+
// The vector is split into lanes of 8 word elements (16 bits). The lower or
91+
// upper half of each lane, controlled by `isLow`, is shuffled in the following
92+
// way: The immediate is truncated to 8 bits, separated into 4 2-bit fields. The
93+
// i-th field's value represents the resulting index of the i-th element in the
94+
// half lane after shuffling. The other half of the lane remains unchanged.
95+
static cir::VecShuffleOp emitPshufWord(CIRGenBuilderTy &builder,
96+
const mlir::Value vec,
97+
const mlir::Value immediate,
98+
const mlir::Location loc,
99+
const bool isLow) {
100+
uint32_t imm = CIRGenFunction::getZExtIntValueFromConstOp(immediate);
101+
102+
auto vecTy = cast<cir::VectorType>(vec.getType());
103+
unsigned numElts = vecTy.getSize();
104+
105+
unsigned firstHalfStart = isLow ? 0 : 4;
106+
unsigned secondHalfStart = 4 - firstHalfStart;
107+
108+
// Splat the 8-bits of immediate 4 times to help the loop wrap around.
109+
imm = (imm & 0xff) * 0x01010101;
110+
111+
int64_t indices[32];
112+
for (unsigned l = 0; l != numElts; l += 8) {
113+
for (unsigned i = firstHalfStart; i != firstHalfStart + 4; ++i) {
114+
indices[l + i] = l + (imm & 3) + firstHalfStart;
115+
imm >>= 2;
116+
}
117+
for (unsigned i = secondHalfStart; i != secondHalfStart + 4; ++i)
118+
indices[l + i] = l + i;
119+
}
120+
121+
return builder.createVecShuffle(loc, vec, ArrayRef(indices, numElts));
122+
}
123+
124+
// Builds the shuffle mask for pshufd and shufpd/shufps x86 builtins.
125+
// The shuffle mask is written to outIndices.
126+
static void
127+
computeFullLaneShuffleMask(CIRGenFunction &cgf, const mlir::Value vec,
128+
uint32_t imm, const bool isShufP,
129+
llvm::SmallVectorImpl<int64_t> &outIndices) {
130+
auto vecTy = cast<cir::VectorType>(vec.getType());
131+
unsigned numElts = vecTy.getSize();
132+
unsigned numLanes = cgf.cgm.getDataLayout().getTypeSizeInBits(vecTy) / 128;
133+
unsigned numLaneElts = numElts / numLanes;
134+
135+
// Splat the 8-bits of immediate 4 times to help the loop wrap around.
136+
imm = (imm & 0xff) * 0x01010101;
137+
138+
for (unsigned l = 0; l != numElts; l += numLaneElts) {
139+
for (unsigned i = 0; i != numLaneElts; ++i) {
140+
uint32_t idx = imm % numLaneElts;
141+
imm /= numLaneElts;
142+
if (isShufP && i >= (numLaneElts / 2))
143+
idx += numElts;
144+
outIndices[l + i] = l + idx;
145+
}
146+
}
147+
148+
outIndices.resize(numElts);
149+
}
150+
88151
static mlir::Value emitX86MaskAddLogic(CIRGenBuilderTy &builder,
89152
mlir::Location loc,
90153
const std::string &intrinsicName,
@@ -270,9 +333,7 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID,
270333
case X86::BI__builtin_ia32_vec_ext_v4di: {
271334
unsigned numElts = cast<cir::VectorType>(ops[0].getType()).getSize();
272335

273-
uint64_t index =
274-
ops[1].getDefiningOp<cir::ConstantOp>().getIntValue().getZExtValue();
275-
336+
uint64_t index = getZExtIntValueFromConstOp(ops[1]);
276337
index &= numElts - 1;
277338

278339
cir::ConstantOp indexVal =
@@ -728,12 +789,20 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID,
728789
case X86::BI__builtin_ia32_pblendw256:
729790
case X86::BI__builtin_ia32_pblendd128:
730791
case X86::BI__builtin_ia32_pblendd256:
792+
cgm.errorNYI(expr->getSourceRange(),
793+
std::string("unimplemented X86 builtin call: ") +
794+
getContext().BuiltinInfo.getName(builtinID));
795+
return {};
731796
case X86::BI__builtin_ia32_pshuflw:
732797
case X86::BI__builtin_ia32_pshuflw256:
733798
case X86::BI__builtin_ia32_pshuflw512:
799+
return emitPshufWord(builder, ops[0], ops[1], getLoc(expr->getExprLoc()),
800+
true);
734801
case X86::BI__builtin_ia32_pshufhw:
735802
case X86::BI__builtin_ia32_pshufhw256:
736803
case X86::BI__builtin_ia32_pshufhw512:
804+
return emitPshufWord(builder, ops[0], ops[1], getLoc(expr->getExprLoc()),
805+
false);
737806
case X86::BI__builtin_ia32_pshufd:
738807
case X86::BI__builtin_ia32_pshufd256:
739808
case X86::BI__builtin_ia32_pshufd512:
@@ -742,13 +811,28 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID,
742811
case X86::BI__builtin_ia32_vpermilpd256:
743812
case X86::BI__builtin_ia32_vpermilps256:
744813
case X86::BI__builtin_ia32_vpermilpd512:
745-
case X86::BI__builtin_ia32_vpermilps512:
814+
case X86::BI__builtin_ia32_vpermilps512: {
815+
const uint32_t imm = getSExtIntValueFromConstOp(ops[1]);
816+
817+
llvm::SmallVector<int64_t, 16> mask(16);
818+
computeFullLaneShuffleMask(*this, ops[0], imm, false, mask);
819+
820+
return builder.createVecShuffle(getLoc(expr->getExprLoc()), ops[0], mask);
821+
}
746822
case X86::BI__builtin_ia32_shufpd:
747823
case X86::BI__builtin_ia32_shufpd256:
748824
case X86::BI__builtin_ia32_shufpd512:
749825
case X86::BI__builtin_ia32_shufps:
750826
case X86::BI__builtin_ia32_shufps256:
751-
case X86::BI__builtin_ia32_shufps512:
827+
case X86::BI__builtin_ia32_shufps512: {
828+
const uint32_t imm = getZExtIntValueFromConstOp(ops[2]);
829+
830+
llvm::SmallVector<int64_t, 16> mask(16);
831+
computeFullLaneShuffleMask(*this, ops[0], imm, true, mask);
832+
833+
return builder.createVecShuffle(getLoc(expr->getExprLoc()), ops[0], ops[1],
834+
mask);
835+
}
752836
case X86::BI__builtin_ia32_permdi256:
753837
case X86::BI__builtin_ia32_permdf256:
754838
case X86::BI__builtin_ia32_permdi512:

clang/lib/CIR/CodeGen/CIRGenFunction.h

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -203,6 +203,22 @@ class CIRGenFunction : public CIRGenTypeCache {
203203
return convertType(getContext().getTypeDeclType(t));
204204
}
205205

206+
/// Get integer from a mlir::Value that is an int constant or a constant op.
207+
static int64_t getSExtIntValueFromConstOp(mlir::Value val) {
208+
auto constOp = val.getDefiningOp<cir::ConstantOp>();
209+
assert(constOp && "getIntValueFromConstOp call with non ConstantOp");
210+
return constOp.getIntValue().getSExtValue();
211+
}
212+
213+
/// Get zero-extended integer from a mlir::Value that is an int constant or a
214+
/// constant op.
215+
static int64_t getZExtIntValueFromConstOp(mlir::Value val) {
216+
auto constOp = val.getDefiningOp<cir::ConstantOp>();
217+
assert(constOp &&
218+
"getZeroExtendedIntValueFromConstOp call with non ConstantOp");
219+
return constOp.getIntValue().getZExtValue();
220+
}
221+
206222
/// Return the cir::TypeEvaluationKind of QualType \c type.
207223
static cir::TypeEvaluationKind getEvaluationKind(clang::QualType type);
208224

@@ -1816,7 +1832,7 @@ class CIRGenFunction : public CIRGenTypeCache {
18161832

18171833
mlir::LogicalResult emitWhileStmt(const clang::WhileStmt &s);
18181834

1819-
mlir::Value emitX86BuiltinExpr(unsigned builtinID, const CallExpr *e);
1835+
mlir::Value emitX86BuiltinExpr(unsigned builtinID, const CallExpr *expr);
18201836

18211837
/// Given an assignment `*lhs = rhs`, emit a test that checks if \p rhs is
18221838
/// nonnull, if 1\p LHS is marked _Nonnull.

clang/test/CIR/CodeGenBuiltins/X86/avx-builtins.c

Lines changed: 73 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -73,4 +73,76 @@ __m256i test_mm256_undefined_si256(void) {
7373
// OGCG-LABEL: test_mm256_undefined_si256
7474
// OGCG: ret <4 x i64> zeroinitializer
7575
return _mm256_undefined_si256();
76-
}
76+
}
77+
78+
__m256d test_mm256_shuffle_pd(__m256d A, __m256d B) {
79+
// CIR-LABEL: test_mm256_shuffle_pd
80+
// CIR: %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x !cir.double>) [#cir.int<0> : !s32i, #cir.int<4> : !s32i, #cir.int<2> : !s32i, #cir.int<6> : !s32i] : !cir.vector<4 x !cir.double>
81+
82+
// LLVM-LABEL: test_mm256_shuffle_pd
83+
// LLVM: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
84+
85+
// OGCG-LABEL: test_mm256_shuffle_pd
86+
// OGCG: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
87+
return _mm256_shuffle_pd(A, B, 0);
88+
}
89+
90+
__m256 test_mm256_shuffle_ps(__m256 A, __m256 B) {
91+
// CIR-LABEL: test_mm256_shuffle_ps
92+
// CIR: %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<8 x !cir.float>) [#cir.int<0> : !s32i, #cir.int<0> : !s32i, #cir.int<8> : !s32i, #cir.int<8> : !s32i, #cir.int<4> : !s32i, #cir.int<4> : !s32i, #cir.int<12> : !s32i, #cir.int<12> : !s32i] : !cir.vector<8 x !cir.float>
93+
94+
// LLVM-LABEL: test_mm256_shuffle_ps
95+
// LLVM: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> <i32 0, i32 0, i32 8, i32 8, i32 4, i32 4, i32 12, i32 12>
96+
97+
// OGCG-LABEL: test_mm256_shuffle_ps
98+
// OGCG: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> <i32 0, i32 0, i32 8, i32 8, i32 4, i32 4, i32 12, i32 12>
99+
return _mm256_shuffle_ps(A, B, 0);
100+
}
101+
102+
__m128 test_mm_permute_ps(__m128 A) {
103+
// CIR-LABEL: test_mm_permute_ps
104+
// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x !cir.float>) [#cir.int<2> : !s32i, #cir.int<3> : !s32i, #cir.int<0> : !s32i, #cir.int<1> : !s32i] : !cir.vector<4 x !cir.float>
105+
106+
// LLVM-LABEL: test_mm_permute_ps
107+
// LLVM: shufflevector <4 x float> %{{.*}}, <4 x float> poison, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
108+
109+
// OGCG-LABEL: test_mm_permute_ps
110+
// OGCG: shufflevector <4 x float> %{{.*}}, <4 x float> poison, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
111+
return _mm_permute_ps(A, 0x4E);
112+
}
113+
114+
__m256 test_mm256_permute_ps(__m256 A) {
115+
// CIR-LABEL: test_mm256_permute_ps
116+
// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<8 x !cir.float>) [#cir.int<2> : !s32i, #cir.int<3> : !s32i, #cir.int<0> : !s32i, #cir.int<1> : !s32i, #cir.int<6> : !s32i, #cir.int<7> : !s32i, #cir.int<4> : !s32i, #cir.int<5> : !s32i] : !cir.vector<8 x !cir.float>
117+
118+
// LLVM-LABEL: test_mm256_permute_ps
119+
// LLVM: shufflevector <8 x float> %{{.*}}, <8 x float> poison, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
120+
121+
// OGCG-LABEL: test_mm256_permute_ps
122+
// OGCG: shufflevector <8 x float> %{{.*}}, <8 x float> poison, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
123+
return _mm256_permute_ps(A, 0x4E);
124+
}
125+
126+
__m128d test_mm_permute_pd(__m128d A) {
127+
// CIR-LABEL: test_mm_permute_pd
128+
// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<2 x !cir.double>) [#cir.int<1> : !s32i, #cir.int<0> : !s32i] : !cir.vector<2 x !cir.double>
129+
130+
// LLVM-LABEL: test_mm_permute_pd
131+
// LLVM: shufflevector <2 x double> %{{.*}}, <2 x double> poison, <2 x i32> <i32 1, i32 0>
132+
133+
// OGCG-LABEL: test_mm_permute_pd
134+
// OGCG: shufflevector <2 x double> %{{.*}}, <2 x double> poison, <2 x i32> <i32 1, i32 0>
135+
return _mm_permute_pd(A, 0x1);
136+
}
137+
138+
__m256d test_mm256_permute_pd(__m256d A) {
139+
// CIR-LABEL: test_mm256_permute_pd
140+
// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x !cir.double>) [#cir.int<1> : !s32i, #cir.int<0> : !s32i, #cir.int<3> : !s32i, #cir.int<2> : !s32i] : !cir.vector<4 x !cir.double>
141+
142+
// LLVM-LABEL: test_mm256_permute_pd
143+
// LLVM: shufflevector <4 x double> %{{.*}}, <4 x double> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
144+
145+
// OGCG-LABEL: test_mm256_permute_pd
146+
// OGCG: shufflevector <4 x double> %{{.*}}, <4 x double> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
147+
return _mm256_permute_pd(A, 0x5);
148+
}
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx2 -fclangir -emit-cir -o %t.cir -Wall -Werror
2+
// RUN: FileCheck --check-prefixes=CIR --input-file=%t.cir %s
3+
// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx2 -fno-signed-char -fclangir -emit-cir -o %t.cir -Wall -Werror
4+
// RUN: FileCheck --check-prefixes=CIR --input-file=%t.cir %s
5+
6+
// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx2 -fclangir -emit-llvm -o %t.ll -Wall -Werror
7+
// RUN: FileCheck --check-prefixes=LLVM --input-file=%t.ll %s
8+
// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx2 -fno-signed-char -fclangir -emit-llvm -o %t.ll -Wall -Werror
9+
// RUN: FileCheck --check-prefixes=LLVM --input-file=%t.ll %s
10+
11+
// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx2 -fclangir -emit-cir -o %t.cir -Wall -Werror
12+
// RUN: FileCheck --check-prefixes=CIR --input-file=%t.cir %s
13+
// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx2 -fno-signed-char -fclangir -emit-cir -o %t.cir -Wall -Werror
14+
// RUN: FileCheck --check-prefixes=CIR --input-file=%t.cir %s
15+
16+
// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx2 -fclangir -emit-llvm -o %t.ll -Wall -Werror
17+
// RUN: FileCheck --check-prefixes=LLVM --input-file=%t.ll %s
18+
// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx2 -fno-signed-char -fclangir -emit-llvm -o %t.ll -Wall -Werror
19+
// RUN: FileCheck --check-prefixes=LLVM --input-file=%t.ll %s
20+
21+
// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx2 -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=OGCG
22+
// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx2 -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=OGCG
23+
// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx2 -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=OGCG
24+
// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx2 -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=OGCG
25+
26+
// This test mimics clang/test/CodeGen/X86/avx2-builtins.c, which eventually
27+
// CIR shall be able to support fully.
28+
29+
#include <immintrin.h>
30+
31+
__m256i test_mm256_shufflelo_epi16(__m256i a) {
32+
// CIR-LABEL: _mm256_shufflelo_epi16
33+
// CIR: %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<16 x !s16i>) [#cir.int<3> : !s32i, #cir.int<0> : !s32i, #cir.int<1> : !s32i, #cir.int<1> : !s32i, #cir.int<4> : !s32i, #cir.int<5> : !s32i, #cir.int<6> : !s32i, #cir.int<7> : !s32i, #cir.int<11> : !s32i, #cir.int<8> : !s32i, #cir.int<9> : !s32i, #cir.int<9> : !s32i, #cir.int<12> : !s32i, #cir.int<13> : !s32i, #cir.int<14> : !s32i, #cir.int<15> : !s32i] : !cir.vector<16 x !s16i>
34+
35+
// LLVM-LABEL: test_mm256_shufflelo_epi16
36+
// LLVM: shufflevector <16 x i16> %{{.*}}, <16 x i16> poison, <16 x i32> <i32 3, i32 0, i32 1, i32 1, i32 4, i32 5, i32 6, i32 7, i32 11, i32 8, i32 9, i32 9, i32 12, i32 13, i32 14, i32 15>
37+
38+
// OGCG-LABEL: test_mm256_shufflelo_epi16
39+
// OGCG: shufflevector <16 x i16> %{{.*}}, <16 x i16> poison, <16 x i32> <i32 3, i32 0, i32 1, i32 1, i32 4, i32 5, i32 6, i32 7, i32 11, i32 8, i32 9, i32 9, i32 12, i32 13, i32 14, i32 15>
40+
return _mm256_shufflelo_epi16(a, 83);
41+
}
42+
43+
__m256i test_mm256_shufflehi_epi16(__m256i a) {
44+
// CIR-LABEL: _mm256_shufflehi_epi16
45+
// CIR: %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<16 x !s16i>) [#cir.int<0> : !s32i, #cir.int<1> : !s32i, #cir.int<2> : !s32i, #cir.int<3> : !s32i, #cir.int<7> : !s32i, #cir.int<6> : !s32i, #cir.int<6> : !s32i, #cir.int<5> : !s32i, #cir.int<8> : !s32i, #cir.int<9> : !s32i, #cir.int<10> : !s32i, #cir.int<11> : !s32i, #cir.int<15> : !s32i, #cir.int<14> : !s32i, #cir.int<14> : !s32i, #cir.int<13> : !s32i] : !cir.vector<16 x !s16i>
46+
47+
// LLVM-LABEL: test_mm256_shufflehi_epi16
48+
// LLVM: shufflevector <16 x i16> %{{.*}}, <16 x i16> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 6, i32 5, i32 8, i32 9, i32 10, i32 11, i32 15, i32 14, i32 14, i32 13>
49+
50+
// OGCG-LABEL: test_mm256_shufflehi_epi16
51+
// OGCG: shufflevector <16 x i16> %{{.*}}, <16 x i16> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 6, i32 5, i32 8, i32 9, i32 10, i32 11, i32 15, i32 14, i32 14, i32 13>
52+
return _mm256_shufflehi_epi16(a, 107);
53+
}

0 commit comments

Comments
 (0)