
Commit c76fc2d

Upstream vec shuffle builtins in CIR codegen
1 parent 1264620 commit c76fc2d

15 files changed: +905 -13 lines changed

clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp

Lines changed: 173 additions & 10 deletions
@@ -68,6 +68,35 @@ static mlir::Value emitVectorFCmp(CIRGenBuilderTy &builder,
   return bitCast;
 }
 
+static cir::VecShuffleOp emitPshufW(CIRGenFunction &cgf,
+                                    CIRGenBuilderTy &builder,
+                                    llvm::SmallVector<mlir::Value> &ops,
+                                    const CallExpr *expr, const bool isLow) {
+  uint32_t imm = cgf.getZExtIntValueFromConstOp(ops[1]);
+
+  auto vecTy = cast<cir::VectorType>(ops[0].getType());
+  unsigned numElts = vecTy.getSize();
+
+  unsigned firstHalfStart = isLow ? 0 : 4;
+  unsigned secondHalfStart = 4 - firstHalfStart;
+
+  // Splat the 8-bits of immediate 4 times to help the loop wrap around.
+  imm = (imm & 0xff) * 0x01010101;
+
+  int64_t indices[32];
+  for (unsigned l = 0; l != numElts; l += 8) {
+    for (unsigned i = firstHalfStart; i != firstHalfStart + 4; ++i) {
+      indices[l + i] = l + (imm & 3) + firstHalfStart;
+      imm /= 4;
+    }
+    for (unsigned i = secondHalfStart; i != secondHalfStart + 4; ++i)
+      indices[l + i] = l + i;
+  }
+
+  return builder.createVecShuffle(cgf.getLoc(expr->getExprLoc()), ops[0],
+                                  ArrayRef(indices, numElts));
+}
+
 mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID,
                                                const CallExpr *expr) {
   if (builtinID == Builtin::BI__builtin_cpu_is) {
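Note (not part of the patch): the index math in emitPshufW is easiest to see on a single 128-bit lane. The standalone sketch below mirrors the loop above for an assumed 8 x i16 vector and an assumed immediate of 0x1b, producing indices 3 2 1 0 4 5 6 7 for the pshuflw case (low four words permuted, high four copied through).

#include <cstdint>
#include <cstdio>

// Standalone sketch: decode a pshuflw/pshufhw immediate into shuffle indices
// the same way emitPshufW does. 'isLow' selects pshuflw (true) vs pshufhw.
int main() {
  const unsigned numElts = 8;  // one 128-bit lane of i16 (assumed)
  const bool isLow = true;     // pshuflw
  uint32_t imm = 0x1b;         // assumed immediate: 2-bit fields 3,2,1,0

  unsigned firstHalfStart = isLow ? 0 : 4;
  unsigned secondHalfStart = 4 - firstHalfStart;

  // Splat the 8-bit immediate so the 2-bit fields repeat per 128-bit lane.
  imm = (imm & 0xff) * 0x01010101;

  int64_t indices[8];
  for (unsigned l = 0; l != numElts; l += 8) {
    for (unsigned i = firstHalfStart; i != firstHalfStart + 4; ++i) {
      indices[l + i] = l + (imm & 3) + firstHalfStart;
      imm /= 4;
    }
    for (unsigned i = secondHalfStart; i != secondHalfStart + 4; ++i)
      indices[l + i] = l + i; // the untouched half is copied through
  }

  for (unsigned i = 0; i != numElts; ++i)
    printf("%lld ", (long long)indices[i]);
  printf("\n"); // prints: 3 2 1 0 4 5 6 7
  return 0;
}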
@@ -163,9 +192,7 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID,
   case X86::BI__builtin_ia32_vec_ext_v4di: {
     unsigned numElts = cast<cir::VectorType>(ops[0].getType()).getSize();
 
-    uint64_t index =
-        ops[1].getDefiningOp<cir::ConstantOp>().getIntValue().getZExtValue();
-
+    uint64_t index = getZExtIntValueFromConstOp(ops[1]);
     index &= numElts - 1;
 
     cir::ConstantOp indexVal =
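Note (illustration only, with assumed values): masking the extract index keeps an out-of-range immediate inside the vector, so the builtin never indexes past the last element.

#include <cstdint>
#include <cstdio>

int main() {
  unsigned numElts = 4;  // e.g. __builtin_ia32_vec_ext_v4di (assumed)
  uint64_t index = 5;    // assumed immediate taken from ops[1]
  index &= numElts - 1;  // 5 & 3 == 1
  printf("%llu\n", (unsigned long long)index);
  return 0;
}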
@@ -497,6 +524,10 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID,
   case X86::BI__builtin_ia32_extracti64x2_256_mask:
   case X86::BI__builtin_ia32_extractf64x2_512_mask:
   case X86::BI__builtin_ia32_extracti64x2_512_mask:
+    cgm.errorNYI(expr->getSourceRange(),
+                 std::string("unimplemented X86 builtin call: ") +
+                     getContext().BuiltinInfo.getName(builtinID));
+    return {};
   case X86::BI__builtin_ia32_vinsertf128_pd256:
   case X86::BI__builtin_ia32_vinsertf128_ps256:
   case X86::BI__builtin_ia32_vinsertf128_si256:
@@ -512,23 +543,69 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID,
   case X86::BI__builtin_ia32_insertf64x2_256:
   case X86::BI__builtin_ia32_inserti64x2_256:
   case X86::BI__builtin_ia32_insertf64x2_512:
-  case X86::BI__builtin_ia32_inserti64x2_512:
+  case X86::BI__builtin_ia32_inserti64x2_512: {
+    unsigned dstNumElts = cast<cir::VectorType>(ops[0].getType()).getSize();
+    unsigned srcNumElts = cast<cir::VectorType>(ops[1].getType()).getSize();
+    unsigned subVectors = dstNumElts / srcNumElts;
+    assert(llvm::isPowerOf2_32(subVectors) && "Expected power of 2 subvectors");
+
+    uint64_t index = getZExtIntValueFromConstOp(ops[2]);
+    index &= subVectors - 1; // Remove any extra bits.
+    index *= srcNumElts;
+
+    int64_t indices[16];
+    for (unsigned i = 0; i != dstNumElts; ++i)
+      indices[i] = (i >= srcNumElts) ? srcNumElts + (i % srcNumElts) : i;
+
+    mlir::Value op1 = builder.createVecShuffle(
+        getLoc(expr->getExprLoc()), ops[1], ArrayRef(indices, dstNumElts));
+
+    for (unsigned i = 0; i != dstNumElts; ++i) {
+      if (i >= index && i < (index + srcNumElts))
+        indices[i] = (i - index) + dstNumElts;
+      else
+        indices[i] = i;
+    }
+
+    return builder.createVecShuffle(getLoc(expr->getExprLoc()), ops[0], op1,
+                                    ArrayRef(indices, dstNumElts));
+  }
   case X86::BI__builtin_ia32_pmovqd512_mask:
   case X86::BI__builtin_ia32_pmovwb512_mask:
+    cgm.errorNYI(expr->getSourceRange(),
+                 std::string("unimplemented X86 builtin call: ") +
+                     getContext().BuiltinInfo.getName(builtinID));
+    return {};
   case X86::BI__builtin_ia32_pblendw128:
   case X86::BI__builtin_ia32_blendpd:
   case X86::BI__builtin_ia32_blendps:
   case X86::BI__builtin_ia32_blendpd256:
   case X86::BI__builtin_ia32_blendps256:
   case X86::BI__builtin_ia32_pblendw256:
   case X86::BI__builtin_ia32_pblendd128:
-  case X86::BI__builtin_ia32_pblendd256:
+  case X86::BI__builtin_ia32_pblendd256: {
+    uint32_t imm = getZExtIntValueFromConstOp(ops[2]);
+    unsigned numElts = cast<cir::VectorType>(ops[0].getType()).getSize();
+
+    int64_t indices[16];
+    // If there are more than 8 elements, the immediate is used twice so make
+    // sure we handle that.
+    for (unsigned i = 0; i != numElts; ++i)
+      indices[i] = ((imm >> (i % 8)) & 0x1) ? numElts + i : i;
+
+    return builder.createVecShuffle(getLoc(expr->getExprLoc()), ops[0], ops[1],
+                                    ArrayRef(indices, numElts));
+  }
   case X86::BI__builtin_ia32_pshuflw:
   case X86::BI__builtin_ia32_pshuflw256:
-  case X86::BI__builtin_ia32_pshuflw512:
+  case X86::BI__builtin_ia32_pshuflw512: {
+    return emitPshufW(*this, builder, ops, expr, true);
+  }
   case X86::BI__builtin_ia32_pshufhw:
   case X86::BI__builtin_ia32_pshufhw256:
-  case X86::BI__builtin_ia32_pshufhw512:
+  case X86::BI__builtin_ia32_pshufhw512: {
+    return emitPshufW(*this, builder, ops, expr, false);
+  }
   case X86::BI__builtin_ia32_pshufd:
   case X86::BI__builtin_ia32_pshufd256:
   case X86::BI__builtin_ia32_pshufd512:
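Note (not part of the patch): the insert128/insert*x* lowering above is a two-shuffle pattern: first widen the small source vector to the destination width, then select between the destination and the widened source. The standalone sketch below reproduces the index computation for the assumed case of _mm256_insertf128_ps(A, B, 1) (8-element destination, 4-element source) and ends with the mask 0 1 2 3 8 9 10 11 seen in the avx-builtins.c test.

#include <cstdint>
#include <cstdio>

int main() {
  const unsigned dstNumElts = 8, srcNumElts = 4; // assumed sizes
  const unsigned subVectors = dstNumElts / srcNumElts; // 2
  uint64_t index = 1;                                  // assumed immediate
  index &= subVectors - 1;
  index *= srcNumElts; // element offset of the inserted lane: 4

  // Step 1: widen the 128-bit source to the destination width.
  int64_t widen[8];
  for (unsigned i = 0; i != dstNumElts; ++i)
    widen[i] = (i >= srcNumElts) ? srcNumElts + (i % srcNumElts) : i;
  // widen = 0 1 2 3 4 5 6 7; the upper half reads the second (poison) operand.

  // Step 2: select between the destination and the widened source.
  int64_t blend[8];
  for (unsigned i = 0; i != dstNumElts; ++i)
    blend[i] = (i >= index && i < index + srcNumElts)
                   ? (int64_t)(i - index) + dstNumElts
                   : (int64_t)i;
  // blend = 0 1 2 3 8 9 10 11: keep A's low lane, insert B on top.

  for (unsigned i = 0; i != dstNumElts; ++i)
    printf("%lld ", (long long)blend[i]);
  printf("\n");
  return 0;
}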
@@ -537,20 +614,106 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID,
   case X86::BI__builtin_ia32_vpermilpd256:
   case X86::BI__builtin_ia32_vpermilps256:
   case X86::BI__builtin_ia32_vpermilpd512:
-  case X86::BI__builtin_ia32_vpermilps512:
+  case X86::BI__builtin_ia32_vpermilps512: {
+    // TODO: Add tests for this branch.
+    uint32_t imm = getSExtIntValueFromConstOp(ops[1]);
+
+    auto vecTy = cast<cir::VectorType>(ops[0].getType());
+    unsigned numElts = vecTy.getSize();
+    auto eltTy = vecTy.getElementType();
+
+    unsigned eltBitWidth = getTypeSizeInBits(eltTy).getFixedValue();
+    unsigned numLaneElts = 128 / eltBitWidth;
+
+    // Splat the 8-bits of immediate 4 times to help the loop wrap around.
+    imm = (imm & 0xff) * 0x01010101;
+
+    llvm::SmallVector<int64_t, 16> indices;
+    for (unsigned l = 0; l != numElts; l += numLaneElts) {
+      for (unsigned i = 0; i != numLaneElts; ++i) {
+        indices.push_back((imm % numLaneElts) + l);
+        imm /= numLaneElts;
+      }
+    }
+
+    return builder.createVecShuffle(getLoc(expr->getExprLoc()), ops[0],
+                                    indices);
+  }
   case X86::BI__builtin_ia32_shufpd:
   case X86::BI__builtin_ia32_shufpd256:
   case X86::BI__builtin_ia32_shufpd512:
   case X86::BI__builtin_ia32_shufps:
   case X86::BI__builtin_ia32_shufps256:
-  case X86::BI__builtin_ia32_shufps512:
+  case X86::BI__builtin_ia32_shufps512: {
+    uint32_t imm = getZExtIntValueFromConstOp(ops[2]);
+
+    auto vecTy = cast<cir::VectorType>(ops[0].getType());
+    unsigned numElts = vecTy.getSize();
+    unsigned numLanes = cgm.getDataLayout().getTypeSizeInBits(vecTy) / 128;
+    unsigned numLaneElts = numElts / numLanes;
+
+    // Splat the 8-bits of immediate 4 times to help the loop wrap around.
+    imm = (imm & 0xff) * 0x01010101;
+
+    int64_t indices[16];
+    for (unsigned l = 0; l != numElts; l += numLaneElts) {
+      for (unsigned i = 0; i != numLaneElts; ++i) {
+        uint32_t idx = imm % numLaneElts;
+        imm /= numLaneElts;
+        if (i >= (numLaneElts / 2))
+          idx += numElts;
+        indices[l + i] = l + idx;
+      }
+    }
+
+    return builder.createVecShuffle(getLoc(expr->getExprLoc()), ops[0], ops[1],
+                                    ArrayRef(indices, numElts));
+  }
   case X86::BI__builtin_ia32_permdi256:
   case X86::BI__builtin_ia32_permdf256:
   case X86::BI__builtin_ia32_permdi512:
   case X86::BI__builtin_ia32_permdf512:
+    cgm.errorNYI(expr->getSourceRange(),
+                 std::string("unimplemented X86 builtin call: ") +
+                     getContext().BuiltinInfo.getName(builtinID));
+    return {};
   case X86::BI__builtin_ia32_palignr128:
   case X86::BI__builtin_ia32_palignr256:
-  case X86::BI__builtin_ia32_palignr512:
+  case X86::BI__builtin_ia32_palignr512: {
+    uint32_t shiftVal = getZExtIntValueFromConstOp(ops[2]) & 0xff;
+
+    unsigned numElts = cast<cir::VectorType>(ops[0].getType()).getSize();
+    assert(numElts % 16 == 0);
+
+    // If palignr is shifting the pair of vectors more than the size of two
+    // lanes, emit zero.
+    if (shiftVal >= 32)
+      return builder.getNullValue(convertType(expr->getType()),
+                                  getLoc(expr->getExprLoc()));
+
+    // If palignr is shifting the pair of input vectors more than one lane,
+    // but less than two lanes, convert to shifting in zeroes.
+    if (shiftVal > 16) {
+      shiftVal -= 16;
+      ops[1] = ops[0];
+      ops[0] =
+          builder.getNullValue(ops[0].getType(), getLoc(expr->getExprLoc()));
+    }
+
+    int64_t indices[64];
+    // 256-bit palignr operates on 128-bit lanes so we need to handle that
+    for (unsigned l = 0; l != numElts; l += 16) {
+      for (unsigned i = 0; i != 16; ++i) {
+        uint32_t idx = shiftVal + i;
+        if (idx >= 16)
+          idx += numElts - 16; // End of lane, switch operand.
+        indices[l + i] = l + idx;
+      }
+    }
+
+    return builder.createVecShuffle(getLoc(expr->getExprLoc()), ops[1], ops[0],
+                                    ArrayRef(indices, numElts));
+  }
   case X86::BI__builtin_ia32_alignd128:
   case X86::BI__builtin_ia32_alignd256:
   case X86::BI__builtin_ia32_alignd512:
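Note (not part of the patch): the shufps lowering decodes the immediate per 128-bit lane, with the upper half of each lane taken from the second operand. The standalone sketch below replays that computation for the assumed case of _mm256_shuffle_ps(A, B, 0) and prints the mask 0 0 8 8 4 4 12 12 that the new test expects.

#include <cstdint>
#include <cstdio>

int main() {
  const unsigned numElts = 8, numLanes = 2;        // assumed: 8 floats, 2 lanes
  const unsigned numLaneElts = numElts / numLanes; // 4
  uint32_t imm = 0;                                // assumed immediate

  // Splat the 8-bit immediate so both lanes decode the same 2-bit fields.
  imm = (imm & 0xff) * 0x01010101;

  int64_t indices[8];
  for (unsigned l = 0; l != numElts; l += numLaneElts) {
    for (unsigned i = 0; i != numLaneElts; ++i) {
      uint32_t idx = imm % numLaneElts;
      imm /= numLaneElts;
      if (i >= numLaneElts / 2)
        idx += numElts; // second half of the lane comes from the second operand
      indices[l + i] = l + idx;
    }
  }

  for (unsigned i = 0; i != numElts; ++i)
    printf("%lld ", (long long)indices[i]);
  printf("\n"); // prints: 0 0 8 8 4 4 12 12 (as in the avx-builtins.c test)
  return 0;
}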

clang/lib/CIR/CodeGen/CIRGenFunction.h

Lines changed: 23 additions & 1 deletion
@@ -1349,6 +1349,28 @@ class CIRGenFunction : public CIRGenTypeCache {
                                     cir::IntType resType, mlir::Value emittedE,
                                     bool isDynamic);
 
+  /// Get sign-extended integer from a mlir::Value defined by a constant op.
+  static int64_t getSExtIntValueFromConstOp(mlir::Value val) {
+    auto constOp = val.getDefiningOp<cir::ConstantOp>();
+    assert(constOp && "getSExtIntValueFromConstOp called with non-ConstantOp");
+    return constOp.getIntValue().getSExtValue();
+  }
+
+  /// Get zero-extended integer from a mlir::Value defined by a constant
+  /// op.
+  static int64_t getZExtIntValueFromConstOp(mlir::Value val) {
+    auto constOp = val.getDefiningOp<cir::ConstantOp>();
+    assert(constOp &&
+           "getZExtIntValueFromConstOp called with non-ConstantOp");
+    return constOp.getIntValue().getZExtValue();
+  }
+
+  /// Get the size of a type in bits using SizedTypeInterface.
+  llvm::TypeSize getTypeSizeInBits(mlir::Type ty) const {
+    assert(cir::isSized(ty) && "Type must implement SizedTypeInterface");
+    return cgm.getDataLayout().getTypeSizeInBits(ty);
+  }
+
   mlir::Value evaluateOrEmitBuiltinObjectSize(const clang::Expr *e,
                                               unsigned type,
                                               cir::IntType resType,
@@ -1804,7 +1826,7 @@ class CIRGenFunction : public CIRGenTypeCache {
 
   mlir::LogicalResult emitWhileStmt(const clang::WhileStmt &s);
 
-  mlir::Value emitX86BuiltinExpr(unsigned builtinID, const CallExpr *e);
+  mlir::Value emitX86BuiltinExpr(unsigned builtinID, const CallExpr *expr);
 
   /// Given an assignment `*lhs = rhs`, emit a test that checks if \p rhs is
   /// nonnull, if \p lhs is marked _Nonnull.

clang/test/CIR/CodeGen/X86/avx-builtins.c

Lines changed: 81 additions & 0 deletions
@@ -73,4 +73,85 @@ __m256i test_mm256_undefined_si256(void) {
   // OGCG-LABEL: test_mm256_undefined_si256
   // OGCG: ret <4 x i64> zeroinitializer
   return _mm256_undefined_si256();
+}
+
+__m256d test_mm256_blend_pd(__m256d A, __m256d B) {
+  // CIR-LABEL: test_mm256_blend_pd
+  // CIR: %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x !cir.double>) [#cir.int<4> : !s32i, #cir.int<1> : !s32i, #cir.int<6> : !s32i, #cir.int<3> : !s32i] : !cir.vector<4 x !cir.double>
+
+  // LLVM-LABEL: test_mm256_blend_pd
+  // LLVM: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
+
+  // OGCG-LABEL: test_mm256_blend_pd
+  // OGCG: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
+  return _mm256_blend_pd(A, B, 0x05);
+}
+
+__m256 test_mm256_blend_ps(__m256 A, __m256 B) {
+  // CIR-LABEL: test_mm256_blend_ps
+  // CIR: %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<8 x !cir.float>) [#cir.int<8> : !s32i, #cir.int<1> : !s32i, #cir.int<10> : !s32i, #cir.int<3> : !s32i, #cir.int<12> : !s32i, #cir.int<13> : !s32i, #cir.int<6> : !s32i, #cir.int<7> : !s32i] : !cir.vector<8 x !cir.float>
+
+  // LLVM-LABEL: test_mm256_blend_ps
+  // LLVM: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> <i32 8, i32 1, i32 10, i32 3, i32 12, i32 13, i32 6, i32 7>
+
+  // OGCG-LABEL: test_mm256_blend_ps
+  // OGCG: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> <i32 8, i32 1, i32 10, i32 3, i32 12, i32 13, i32 6, i32 7>
+  return _mm256_blend_ps(A, B, 0x35);
+}
+
+__m256d test_mm256_insertf128_pd(__m256d A, __m128d B) {
+  // CIR-LABEL: test_mm256_insertf128_pd
+  // %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<2 x !cir.double>) [#cir.int<0> : !s32i, #cir.int<1> : !s32i, #cir.int<2> : !s32i, #cir.int<3> : !s32i] : !cir.vector<4 x !cir.double>
+  // %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x !cir.double>) [#cir.int<4> : !s32i, #cir.int<5> : !s32i, #cir.int<2> : !s32i, #cir.int<3> : !s32i] : !cir.vector<4 x !cir.double>
+
+  // LLVM-LABEL: test_mm256_insertf128_pd
+  // LLVM: shufflevector <2 x double> %{{.*}}, <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  // LLVM: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+  return _mm256_insertf128_pd(A, B, 0);
+}
+
+__m256 test_mm256_insertf128_ps(__m256 A, __m128 B) {
+  // CIR-LABEL: test_mm256_insertf128_ps
+  // %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x !cir.float>) [#cir.int<0> : !s32i, #cir.int<1> : !s32i, #cir.int<2> : !s32i, #cir.int<3> : !s32i] : !cir.vector<8 x !cir.float>
+  // %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<8 x !cir.float>) [#cir.int<0> : !s32i, #cir.int<1> : !s32i, #cir.int<2> : !s32i, #cir.int<3> : !s32i, #cir.int<8> : !s32i, #cir.int<9> : !s32i, #cir.int<10> : !s32i, #cir.int<11> : !s32i] : !cir.vector<8 x !cir.float>
+
+  // LLVM-LABEL: test_mm256_insertf128_ps
+  // LLVM: shufflevector <4 x float> %{{.*}}, <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  // LLVM: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+  return _mm256_insertf128_ps(A, B, 1);
+}
+
+__m256i test_mm256_insertf128_si256(__m256i A, __m128i B) {
+  // CIR-LABEL: test_mm256_insertf128_si256
+  // %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x !s32i>) [#cir.int<0> : !s32i, #cir.int<1> : !s32i, #cir.int<2> : !s32i, #cir.int<3> : !s32i] : !cir.vector<8 x !s32i>
+  // %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<8 x !s32i>) [#cir.int<8> : !s32i, #cir.int<9> : !s32i, #cir.int<10> : !s32i, #cir.int<11> : !s32i, #cir.int<4> : !s32i, #cir.int<5> : !s32i, #cir.int<6> : !s32i, #cir.int<7> : !s32i]
+
+  // LLVM-LABEL: test_mm256_insertf128_si256
+  // LLVM: shufflevector <4 x i32> %{{.*}}, <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  // LLVM: shufflevector <8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+  return _mm256_insertf128_si256(A, B, 0);
+}
+
+__m256d test_mm256_shuffle_pd(__m256d A, __m256d B) {
+  // CIR-LABEL: test_mm256_shuffle_pd
+  // CIR: %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x !cir.double>) [#cir.int<0> : !s32i, #cir.int<4> : !s32i, #cir.int<2> : !s32i, #cir.int<6> : !s32i] : !cir.vector<4 x !cir.double>
+
+  // CHECK-LABEL: test_mm256_shuffle_pd
+  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+
+  // OGCG-LABEL: test_mm256_shuffle_pd
+  // OGCG: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+  return _mm256_shuffle_pd(A, B, 0);
+}
+
+__m256 test_mm256_shuffle_ps(__m256 A, __m256 B) {
+  // CIR-LABEL: test_mm256_shuffle_ps
+  // CIR: %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<8 x !cir.float>) [#cir.int<0> : !s32i, #cir.int<0> : !s32i, #cir.int<8> : !s32i, #cir.int<8> : !s32i, #cir.int<4> : !s32i, #cir.int<4> : !s32i, #cir.int<12> : !s32i, #cir.int<12> : !s32i] : !cir.vector<8 x !cir.float>
+
+  // CHECK-LABEL: test_mm256_shuffle_ps
+  // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> <i32 0, i32 0, i32 8, i32 8, i32 4, i32 4, i32 12, i32 12>
+
+  // OGCG-LABEL: test_mm256_shuffle_ps
+  // OGCG: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> <i32 0, i32 0, i32 8, i32 8, i32 4, i32 4, i32 12, i32 12>
+  return _mm256_shuffle_ps(A, B, 0);
 }
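Note (not part of the patch): for the blend builtins each immediate bit picks between the two operands. The standalone sketch below decodes 0x35 for an assumed 8-element vector the same way the pblendd/blendps case does and reproduces the mask 8 1 10 3 12 13 6 7 checked in test_mm256_blend_ps.

#include <cstdint>
#include <cstdio>

int main() {
  const unsigned numElts = 8; // assumed: 8 x float
  uint32_t imm = 0x35;        // assumed immediate from the test

  int64_t indices[8];
  // A set bit selects the element from the second operand (index numElts + i).
  for (unsigned i = 0; i != numElts; ++i)
    indices[i] = ((imm >> (i % 8)) & 0x1) ? numElts + i : i;

  for (unsigned i = 0; i != numElts; ++i)
    printf("%lld ", (long long)indices[i]);
  printf("\n"); // prints: 8 1 10 3 12 13 6 7, the mask in the LLVM/OGCG lines
  return 0;
}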
