Skip to content

Commit 8e2a43b

Browse files
Address reviews + upstream tests for pshufd
1 parent 8e108fb commit 8e2a43b

File tree

4 files changed

+174
-76
lines changed

4 files changed

+174
-76
lines changed

clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp

Lines changed: 44 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -70,11 +70,12 @@ static mlir::Value emitVectorFCmp(CIRGenBuilderTy &builder,
7070

7171
static cir::VecShuffleOp emitPshufW(CIRGenFunction &cgf,
7272
CIRGenBuilderTy &builder,
73-
llvm::SmallVector<mlir::Value> &ops,
73+
const mlir::Value vec,
74+
const mlir::Value immediate,
7475
const CallExpr *expr, const bool isLow) {
75-
uint32_t imm = cgf.getZExtIntValueFromConstOp(ops[1]);
76+
uint32_t imm = cgf.getZExtIntValueFromConstOp(immediate);
7677

77-
auto vecTy = cast<cir::VectorType>(ops[0].getType());
78+
auto vecTy = cast<cir::VectorType>(vec.getType());
7879
unsigned numElts = vecTy.getSize();
7980

8081
unsigned firstHalfStart = isLow ? 0 : 4;
@@ -93,10 +94,35 @@ static cir::VecShuffleOp emitPshufW(CIRGenFunction &cgf,
9394
indices[l + i] = l + i;
9495
}
9596

96-
return builder.createVecShuffle(cgf.getLoc(expr->getExprLoc()), ops[0],
97+
return builder.createVecShuffle(cgf.getLoc(expr->getExprLoc()), vec,
9798
ArrayRef(indices, numElts));
9899
}
99100

101+
static llvm::SmallVector<int64_t, 16>
102+
computeMaskPshufDOrShufP(CIRGenFunction &cgf, const mlir::Value vec,
103+
uint32_t imm, const bool isShufP) {
104+
auto vecTy = cast<cir::VectorType>(vec.getType());
105+
unsigned numElts = vecTy.getSize();
106+
unsigned numLanes = cgf.cgm.getDataLayout().getTypeSizeInBits(vecTy) / 128;
107+
unsigned numLaneElts = numElts / numLanes;
108+
109+
// Splat the 8-bits of immediate 4 times to help the loop wrap around.
110+
imm = (imm & 0xff) * 0x01010101;
111+
112+
llvm::SmallVector<int64_t, 16> indices(numElts);
113+
for (unsigned l = 0; l != numElts; l += numLaneElts) {
114+
for (unsigned i = 0; i != numLaneElts; ++i) {
115+
uint32_t idx = imm % numLaneElts;
116+
imm /= numLaneElts;
117+
if (isShufP && i >= (numLaneElts / 2))
118+
idx += numElts;
119+
indices[l + i] = l + idx;
120+
}
121+
}
122+
123+
return indices;
124+
}
125+
100126
mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID,
101127
const CallExpr *expr) {
102128
if (builtinID == Builtin::BI__builtin_cpu_is) {
@@ -550,19 +576,19 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID,
550576
case X86::BI__builtin_ia32_pblendw256:
551577
case X86::BI__builtin_ia32_pblendd128:
552578
case X86::BI__builtin_ia32_pblendd256:
553-
cgm.errorNYI(expr->getSourceRange(),
554-
std::string("unimplemented X86 builtin call: ") +
555-
getContext().BuiltinInfo.getName(builtinID));
556-
return {};
579+
cgm.errorNYI(expr->getSourceRange(),
580+
std::string("unimplemented X86 builtin call: ") +
581+
getContext().BuiltinInfo.getName(builtinID));
582+
return {};
557583
case X86::BI__builtin_ia32_pshuflw:
558584
case X86::BI__builtin_ia32_pshuflw256:
559585
case X86::BI__builtin_ia32_pshuflw512: {
560-
return emitPshufW(*this, builder, ops, expr, true);
586+
return emitPshufW(*this, builder, ops[0], ops[1], expr, true);
561587
}
562588
case X86::BI__builtin_ia32_pshufhw:
563589
case X86::BI__builtin_ia32_pshufhw256:
564590
case X86::BI__builtin_ia32_pshufhw512: {
565-
return emitPshufW(*this, builder, ops, expr, false);
591+
return emitPshufW(*this, builder, ops[0], ops[1], expr, false);
566592
}
567593
case X86::BI__builtin_ia32_pshufd:
568594
case X86::BI__builtin_ia32_pshufd256:
@@ -573,59 +599,24 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID,
573599
case X86::BI__builtin_ia32_vpermilps256:
574600
case X86::BI__builtin_ia32_vpermilpd512:
575601
case X86::BI__builtin_ia32_vpermilps512: {
576-
// TODO: Add tests for this branch.
577-
uint32_t imm = getSExtIntValueFromConstOp(ops[1]);
578-
579-
auto vecTy = cast<cir::VectorType>(ops[0].getType());
580-
unsigned numElts = vecTy.getSize();
581-
auto eltTy = vecTy.getElementType();
602+
const uint32_t imm = getSExtIntValueFromConstOp(ops[1]);
603+
const llvm::SmallVector<int64_t, 16> mask =
604+
computeMaskPshufDOrShufP(*this, ops[0], imm, false);
582605

583-
unsigned eltBitWidth = getTypeSizeInBits(eltTy).getFixedValue();
584-
unsigned numLaneElts = 128 / eltBitWidth;
585-
586-
// Splat the 8-bits of immediate 4 times to help the loop wrap around.
587-
imm = (imm & 0xff) * 0x01010101;
588-
589-
llvm::SmallVector<int64_t, 16> indices;
590-
for (unsigned l = 0; l != numElts; l += numLaneElts) {
591-
for (unsigned i = 0; i != numLaneElts; ++i) {
592-
indices.push_back((imm % numLaneElts) + l);
593-
imm /= numLaneElts;
594-
}
595-
}
596-
597-
return builder.createVecShuffle(getLoc(expr->getExprLoc()), ops[0],
598-
indices);
606+
return builder.createVecShuffle(getLoc(expr->getExprLoc()), ops[0], mask);
599607
}
600608
case X86::BI__builtin_ia32_shufpd:
601609
case X86::BI__builtin_ia32_shufpd256:
602610
case X86::BI__builtin_ia32_shufpd512:
603611
case X86::BI__builtin_ia32_shufps:
604612
case X86::BI__builtin_ia32_shufps256:
605613
case X86::BI__builtin_ia32_shufps512: {
606-
uint32_t imm = getZExtIntValueFromConstOp(ops[2]);
607-
608-
auto vecTy = cast<cir::VectorType>(ops[0].getType());
609-
unsigned numElts = vecTy.getSize();
610-
unsigned numLanes = cgm.getDataLayout().getTypeSizeInBits(vecTy) / 128;
611-
unsigned numLaneElts = numElts / numLanes;
612-
613-
// Splat the 8-bits of immediate 4 times to help the loop wrap around.
614-
imm = (imm & 0xff) * 0x01010101;
615-
616-
int64_t indices[16];
617-
for (unsigned l = 0; l != numElts; l += numLaneElts) {
618-
for (unsigned i = 0; i != numLaneElts; ++i) {
619-
uint32_t idx = imm % numLaneElts;
620-
imm /= numLaneElts;
621-
if (i >= (numLaneElts / 2))
622-
idx += numElts;
623-
indices[l + i] = l + idx;
624-
}
625-
}
614+
const uint32_t imm = getZExtIntValueFromConstOp(ops[2]);
615+
const llvm::SmallVector<int64_t, 16> mask =
616+
computeMaskPshufDOrShufP(*this, ops[0], imm, true);
626617

627618
return builder.createVecShuffle(getLoc(expr->getExprLoc()), ops[0], ops[1],
628-
ArrayRef(indices, numElts));
619+
mask);
629620
}
630621
case X86::BI__builtin_ia32_permdi256:
631622
case X86::BI__builtin_ia32_permdf256:

clang/lib/CIR/CodeGen/CIRGenFunction.h

Lines changed: 16 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -202,6 +202,22 @@ class CIRGenFunction : public CIRGenTypeCache {
202202
return convertType(getContext().getTypeDeclType(t));
203203
}
204204

205+
/// Get integer from a mlir::Value that is an int constant or a constant op.
206+
static int64_t getSExtIntValueFromConstOp(mlir::Value val) {
207+
auto constOp = val.getDefiningOp<cir::ConstantOp>();
208+
assert(constOp && "getIntValueFromConstOp call with non ConstantOp");
209+
return constOp.getIntValue().getSExtValue();
210+
}
211+
212+
/// Get zero-extended integer from a mlir::Value that is an int constant or a
213+
/// constant op.
214+
static int64_t getZExtIntValueFromConstOp(mlir::Value val) {
215+
auto constOp = val.getDefiningOp<cir::ConstantOp>();
216+
assert(constOp &&
217+
"getZeroExtendedIntValueFromConstOp call with non ConstantOp");
218+
return constOp.getIntValue().getZExtValue();
219+
}
220+
205221
/// Return the cir::TypeEvaluationKind of QualType \c type.
206222
static cir::TypeEvaluationKind getEvaluationKind(clang::QualType type);
207223

@@ -1349,28 +1365,6 @@ class CIRGenFunction : public CIRGenTypeCache {
13491365
cir::IntType resType, mlir::Value emittedE,
13501366
bool isDynamic);
13511367

1352-
/// Get integer from a mlir::Value that is an int constant or a constant op.
1353-
static int64_t getSExtIntValueFromConstOp(mlir::Value val) {
1354-
auto constOp = val.getDefiningOp<cir::ConstantOp>();
1355-
assert(constOp && "getIntValueFromConstOp call with non ConstantOp");
1356-
return constOp.getIntValue().getSExtValue();
1357-
}
1358-
1359-
/// Get zero-extended integer from a mlir::Value that is an int constant or a
1360-
/// constant op.
1361-
static int64_t getZExtIntValueFromConstOp(mlir::Value val) {
1362-
auto constOp = val.getDefiningOp<cir::ConstantOp>();
1363-
assert(constOp &&
1364-
"getZeroExtendedIntValueFromConstOp call with non ConstantOp");
1365-
return constOp.getIntValue().getZExtValue();
1366-
}
1367-
1368-
/// Get size of type in bits using SizedTypeInterface
1369-
llvm::TypeSize getTypeSizeInBits(mlir::Type ty) const {
1370-
assert(cir::isSized(ty) && "Type must implement SizedTypeInterface");
1371-
return cgm.getDataLayout().getTypeSizeInBits(ty);
1372-
}
1373-
13741368
mlir::Value evaluateOrEmitBuiltinObjectSize(const clang::Expr *e,
13751369
unsigned type,
13761370
cir::IntType resType,
Lines changed: 113 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,113 @@
1+
// RUN: %clang_cc1 -std=c++20 -triple x86_64-unknown-linux-gnu -fclangir -emit-cir %s -o %t.cir
2+
// RUN: FileCheck --input-file=%t.cir %s
3+
4+
// Test that __builtin_ia32_pshufd and __builtin_ia32_vpermilp generates correct CIR vec.shuffle operations
5+
// This verifies the fix for SIMD intrinsic support that was previously NYI
6+
7+
typedef int __v4si __attribute__((__vector_size__(16)));
8+
typedef float __v4sf __attribute__((__vector_size__(16)));
9+
typedef double __v2df __attribute__((__vector_size__(16)));
10+
typedef float __v8sf __attribute__((__vector_size__(32)));
11+
typedef double __v4df __attribute__((__vector_size__(32)));
12+
typedef float __v16sf __attribute__((__vector_size__(64)));
13+
typedef double __v8df __attribute__((__vector_size__(64)));
14+
15+
typedef __v4si __m128i;
16+
typedef __v4sf __m128;
17+
typedef __v2df __m128d;
18+
typedef __v8sf __m256;
19+
typedef __v4df __m256d;
20+
typedef __v16sf __m512;
21+
typedef __v8df __m512d;
22+
23+
// CHECK-LABEL: @_Z11test_pshufdv
24+
void test_pshufd() {
25+
__m128i vec = {1, 2, 3, 4};
26+
// CHECK: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x !s32i>) [#cir.int<2> : !s32i, #cir.int<3> : !s32i, #cir.int<0> : !s32i, #cir.int<1> : !s32i] : !cir.vector<4 x !s32i>
27+
__m128i result = __builtin_ia32_pshufd(vec, 0x4E);
28+
}
29+
30+
// CHECK-LABEL: @_Z19test_different_maskv
31+
void test_different_mask() {
32+
__m128i vec = {10, 20, 30, 40};
33+
// Test different immediate value: 0x1B = 00011011 = [3,2,1,0] reversed
34+
// CHECK: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x !s32i>) [#cir.int<3> : !s32i, #cir.int<2> : !s32i, #cir.int<1> : !s32i, #cir.int<0> : !s32i] : !cir.vector<4 x !s32i>
35+
__m128i result = __builtin_ia32_pshufd(vec, 0x1B);
36+
}
37+
38+
// CHECK-LABEL: @_Z9test_casev
39+
void test_case() {
40+
__m128i p0 = {1, 2, 3, 4};
41+
42+
// This reproduces the exact pattern from stb_image.h:2685 that was failing:
43+
// _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p0, 0x4e));
44+
// Which expands to: __builtin_ia32_pshufd(p0, 0x4e)
45+
// CHECK: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x !s32i>) [#cir.int<2> : !s32i, #cir.int<3> : !s32i, #cir.int<0> : !s32i, #cir.int<1> : !s32i] : !cir.vector<4 x !s32i>
46+
__m128i out_vec = __builtin_ia32_pshufd(p0, 0x4e);
47+
}
48+
49+
// CHECK-LABEL: @_Z15test_vpermilps4v
50+
void test_vpermilps4() {
51+
__m128 vec = {1.0f, 2.0f, 3.0f, 4.0f};
52+
// vpermilps with immediate 0x4E = 01001110 = [1,3,2,0] for 4 elements
53+
// CHECK: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x !cir.float>) [#cir.int<2> : !s32i, #cir.int<3> : !s32i, #cir.int<0> : !s32i, #cir.int<1> : !s32i] : !cir.vector<4 x !cir.float>
54+
__m128 result = __builtin_ia32_vpermilps(vec, 0x4E);
55+
}
56+
57+
// CHECK-LABEL: @_Z15test_vpermilpd2v
58+
void test_vpermilpd2() {
59+
__m128d vec = {1.0, 2.0};
60+
// vpermilpd with immediate 0x1 = 01 = [1,0] for 2 elements
61+
// CHECK: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<2 x !cir.double>) [#cir.int<1> : !s32i, #cir.int<0> : !s32i] : !cir.vector<2 x !cir.double>
62+
__m128d result = __builtin_ia32_vpermilpd(vec, 0x1);
63+
}
64+
65+
// CHECK-LABEL: @_Z17test_vpermilps256v
66+
void test_vpermilps256() {
67+
__m256 vec = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f};
68+
// vpermilps256 with immediate 0x1B = 00011011 = [3,2,1,0] for each 128-bit lane
69+
// CHECK: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<8 x !cir.float>) [#cir.int<3> : !s32i, #cir.int<2> : !s32i, #cir.int<1> : !s32i, #cir.int<0> : !s32i, #cir.int<7> : !s32i, #cir.int<6> : !s32i, #cir.int<5> : !s32i, #cir.int<4> : !s32i] : !cir.vector<8 x !cir.float>
70+
__m256 result = __builtin_ia32_vpermilps256(vec, 0x1B);
71+
}
72+
73+
// CHECK-LABEL: @_Z17test_vpermilpd256v
74+
void test_vpermilpd256() {
75+
__m256d vec = {1.0, 2.0, 3.0, 4.0};
76+
// vpermilpd256 with immediate 0x5 = 0101 = [1,0,1,0] for 4 elements
77+
// CHECK: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x !cir.double>) [#cir.int<1> : !s32i, #cir.int<0> : !s32i, #cir.int<3> : !s32i, #cir.int<2> : !s32i] : !cir.vector<4 x !cir.double>
78+
__m256d result = __builtin_ia32_vpermilpd256(vec, 0x5);
79+
}
80+
81+
// CHECK-LABEL: @_Z17test_vpermilps512v
82+
void test_vpermilps512() {
83+
__m512 vec = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f,
84+
9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f};
85+
// vpermilps512 with immediate 0x4E = 01001110 = [1,3,2,0] for each 128-bit lane
86+
// CHECK: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<16 x !cir.float>) [#cir.int<2> : !s32i, #cir.int<3> : !s32i, #cir.int<0> : !s32i, #cir.int<1> : !s32i, #cir.int<6> : !s32i, #cir.int<7> : !s32i, #cir.int<4> : !s32i, #cir.int<5> : !s32i, #cir.int<10> : !s32i, #cir.int<11> : !s32i, #cir.int<8> : !s32i, #cir.int<9> : !s32i, #cir.int<14> : !s32i, #cir.int<15> : !s32i, #cir.int<12> : !s32i, #cir.int<13> : !s32i] : !cir.vector<16 x !cir.float>
87+
__m512 result = __builtin_ia32_vpermilps512(vec, 0x4E);
88+
}
89+
90+
// CHECK-LABEL: @_Z17test_vpermilpd512v
91+
void test_vpermilpd512() {
92+
__m512d vec = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0};
93+
// vpermilpd512 with immediate 0x55 = 01010101 = [1,0,1,0,1,0,1,0] for 8 elements
94+
// CHECK: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<8 x !cir.double>) [#cir.int<1> : !s32i, #cir.int<0> : !s32i, #cir.int<3> : !s32i, #cir.int<2> : !s32i, #cir.int<5> : !s32i, #cir.int<4> : !s32i, #cir.int<7> : !s32i, #cir.int<6> : !s32i] : !cir.vector<8 x !cir.double>
95+
__m512d result = __builtin_ia32_vpermilpd512(vec, 0x55);
96+
}
97+
98+
// Test different immediate values
99+
// CHECK-LABEL: @_Z24test_vpermilps_differentv
100+
void test_vpermilps_different() {
101+
__m128 vec = {10.0f, 20.0f, 30.0f, 40.0f};
102+
// Test different immediate value: 0x1B = 00011011 = [3,2,1,0] reversed
103+
// CHECK: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x !cir.float>) [#cir.int<3> : !s32i, #cir.int<2> : !s32i, #cir.int<1> : !s32i, #cir.int<0> : !s32i] : !cir.vector<4 x !cir.float>
104+
__m128 result = __builtin_ia32_vpermilps(vec, 0x1B);
105+
}
106+
107+
// CHECK-LABEL: @_Z24test_vpermilpd_differentv
108+
void test_vpermilpd_different() {
109+
__m128d vec = {100.0, 200.0};
110+
// Test immediate 0x0 = 00 = [0,0] - duplicate first element
111+
// CHECK: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<2 x !cir.double>) [#cir.int<0> : !s32i, #cir.int<0> : !s32i] : !cir.vector<2 x !cir.double>
112+
__m128d result = __builtin_ia32_vpermilpd(vec, 0x0);
113+
}

clang/test/CIR/CodeGen/X86/sse2-builtins.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -146,4 +146,4 @@ __m128d test_mm_shuffle_pd(__m128d A, __m128d B) {
146146
// OGCG-LABEL: test_mm_shuffle_pd
147147
// OGCG: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x i32> <i32 1, i32 2>
148148
return _mm_shuffle_pd(A, B, 1);
149-
}
149+
}

0 commit comments

Comments
 (0)