Skip to content

Commit 19b7f07

Browse files
[CIR] Added support for __builtin_ia32_pshufd (#1861)
## Summary This PR implements support for the `__builtin_ia32_pshufd` SIMD intrinsic in ClangIR, which was previously unimplemented and causing compilation failures. ## Problem The `__builtin_ia32_pshufd` intrinsic (used by `_mm_shuffle_epi32`) was hitting an `NYI` error, preventing compilation of code that uses common SIMD operations.
1 parent 90fc6c4 commit 19b7f07

File tree

2 files changed

+146
-2
lines changed

2 files changed

+146
-2
lines changed

clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp

Lines changed: 33 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
#include "clang/Basic/TargetBuiltins.h"
2626
#include "clang/CIR/Dialect/IR/CIRDialect.h"
2727
#include "clang/CIR/Dialect/IR/CIRTypes.h"
28+
#include "llvm/ADT/TypeSwitch.h"
2829
#include "llvm/IR/IntrinsicsX86.h"
2930
#include "llvm/Support/ErrorHandling.h"
3031

@@ -1046,8 +1047,38 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned BuiltinID,
10461047
case X86::BI__builtin_ia32_vpermilpd256:
10471048
case X86::BI__builtin_ia32_vpermilps256:
10481049
case X86::BI__builtin_ia32_vpermilpd512:
1049-
case X86::BI__builtin_ia32_vpermilps512:
1050-
llvm_unreachable("pshufd NYI");
1050+
case X86::BI__builtin_ia32_vpermilps512: {
1051+
uint32_t imm = getIntValueFromConstOp(Ops[1]);
1052+
auto vecTy = cast<cir::VectorType>(Ops[0].getType());
1053+
unsigned numElts = vecTy.getSize();
1054+
auto eltTy = vecTy.getElementType();
1055+
1056+
assert(isSized(eltTy) && "Element type must be a sized type");
1057+
1058+
unsigned eltBitWidth =
1059+
llvm::TypeSwitch<mlir::Type, unsigned>(eltTy)
1060+
.Case<cir::IntType>([](auto intTy) { return intTy.getWidth(); })
1061+
.Case<cir::SingleType>([](auto) { return 32; })
1062+
.Case<cir::DoubleType>([](auto) { return 64; })
1063+
.Default([](auto) {
1064+
llvm_unreachable("NYI: Unsupported type");
1065+
return 0;
1066+
});
1067+
1068+
unsigned vecBitWidth = numElts * eltBitWidth;
1069+
unsigned numLanes = vecBitWidth / 128;
1070+
unsigned numLaneElts = numElts / numLanes;
1071+
1072+
imm = (imm & 0xff) * 0x01010101;
1073+
llvm::SmallVector<int64_t, 16> indices;
1074+
for (unsigned l = 0; l != numElts; l += numLaneElts) {
1075+
for (unsigned i = 0; i != numLaneElts; ++i) {
1076+
indices.push_back((imm % numLaneElts) + l);
1077+
imm /= numLaneElts;
1078+
}
1079+
}
1080+
return builder.createVecShuffle(getLoc(E->getExprLoc()), Ops[0], indices);
1081+
}
10511082
case X86::BI__builtin_ia32_shufpd:
10521083
case X86::BI__builtin_ia32_shufpd256:
10531084
case X86::BI__builtin_ia32_shufpd512:
Lines changed: 113 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,113 @@
1+
// RUN: %clang_cc1 -std=c++20 -triple x86_64-unknown-linux-gnu -fclangir -emit-cir %s -o %t.cir
2+
// RUN: FileCheck --input-file=%t.cir %s
3+
4+
// Test that __builtin_ia32_pshufd and __builtin_ia32_vpermilp generates correct CIR vec.shuffle operations
5+
// This verifies the fix for SIMD intrinsic support that was previously NYI
6+
7+
typedef int __v4si __attribute__((__vector_size__(16)));
8+
typedef float __v4sf __attribute__((__vector_size__(16)));
9+
typedef double __v2df __attribute__((__vector_size__(16)));
10+
typedef float __v8sf __attribute__((__vector_size__(32)));
11+
typedef double __v4df __attribute__((__vector_size__(32)));
12+
typedef float __v16sf __attribute__((__vector_size__(64)));
13+
typedef double __v8df __attribute__((__vector_size__(64)));
14+
15+
typedef __v4si __m128i;
16+
typedef __v4sf __m128;
17+
typedef __v2df __m128d;
18+
typedef __v8sf __m256;
19+
typedef __v4df __m256d;
20+
typedef __v16sf __m512;
21+
typedef __v8df __m512d;
22+
23+
// CHECK-LABEL: @_Z11test_pshufdv
24+
void test_pshufd() {
25+
__m128i vec = {1, 2, 3, 4};
26+
// CHECK: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<!s32i x 4>) [#cir.int<2> : !s32i, #cir.int<3> : !s32i, #cir.int<0> : !s32i, #cir.int<1> : !s32i] : !cir.vector<!s32i x 4>
27+
__m128i result = __builtin_ia32_pshufd(vec, 0x4E);
28+
}
29+
30+
// CHECK-LABEL: @_Z19test_different_maskv
31+
void test_different_mask() {
32+
__m128i vec = {10, 20, 30, 40};
33+
// Test different immediate value: 0x1B = 00011011 = [3,2,1,0] reversed
34+
// CHECK: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<!s32i x 4>) [#cir.int<3> : !s32i, #cir.int<2> : !s32i, #cir.int<1> : !s32i, #cir.int<0> : !s32i] : !cir.vector<!s32i x 4>
35+
__m128i result = __builtin_ia32_pshufd(vec, 0x1B);
36+
}
37+
38+
// CHECK-LABEL: @_Z9test_casev
39+
void test_case() {
40+
__m128i p0 = {1, 2, 3, 4};
41+
42+
// This reproduces the exact pattern from stb_image.h:2685 that was failing:
43+
// _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p0, 0x4e));
44+
// Which expands to: __builtin_ia32_pshufd(p0, 0x4e)
45+
// CHECK: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<!s32i x 4>) [#cir.int<2> : !s32i, #cir.int<3> : !s32i, #cir.int<0> : !s32i, #cir.int<1> : !s32i] : !cir.vector<!s32i x 4>
46+
__m128i out_vec = __builtin_ia32_pshufd(p0, 0x4e);
47+
}
48+
49+
// CHECK-LABEL: @_Z15test_vpermilps4v
50+
void test_vpermilps4() {
51+
__m128 vec = {1.0f, 2.0f, 3.0f, 4.0f};
52+
// vpermilps with immediate 0x4E = 01001110 = [1,3,2,0] for 4 elements
53+
// CHECK: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<!cir.float x 4>) [#cir.int<2> : !s32i, #cir.int<3> : !s32i, #cir.int<0> : !s32i, #cir.int<1> : !s32i] : !cir.vector<!cir.float x 4>
54+
__m128 result = __builtin_ia32_vpermilps(vec, 0x4E);
55+
}
56+
57+
// CHECK-LABEL: @_Z15test_vpermilpd2v
58+
void test_vpermilpd2() {
59+
__m128d vec = {1.0, 2.0};
60+
// vpermilpd with immediate 0x1 = 01 = [1,0] for 2 elements
61+
// CHECK: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<!cir.double x 2>) [#cir.int<1> : !s32i, #cir.int<0> : !s32i] : !cir.vector<!cir.double x 2>
62+
__m128d result = __builtin_ia32_vpermilpd(vec, 0x1);
63+
}
64+
65+
// CHECK-LABEL: @_Z17test_vpermilps256v
66+
void test_vpermilps256() {
67+
__m256 vec = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f};
68+
// vpermilps256 with immediate 0x1B = 00011011 = [3,2,1,0] for each 128-bit lane
69+
// CHECK: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<!cir.float x 8>) [#cir.int<3> : !s32i, #cir.int<2> : !s32i, #cir.int<1> : !s32i, #cir.int<0> : !s32i, #cir.int<7> : !s32i, #cir.int<6> : !s32i, #cir.int<5> : !s32i, #cir.int<4> : !s32i] : !cir.vector<!cir.float x 8>
70+
__m256 result = __builtin_ia32_vpermilps256(vec, 0x1B);
71+
}
72+
73+
// CHECK-LABEL: @_Z17test_vpermilpd256v
74+
void test_vpermilpd256() {
75+
__m256d vec = {1.0, 2.0, 3.0, 4.0};
76+
// vpermilpd256 with immediate 0x5 = 0101 = [1,0,1,0] for 4 elements
77+
// CHECK: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<!cir.double x 4>) [#cir.int<1> : !s32i, #cir.int<0> : !s32i, #cir.int<3> : !s32i, #cir.int<2> : !s32i] : !cir.vector<!cir.double x 4>
78+
__m256d result = __builtin_ia32_vpermilpd256(vec, 0x5);
79+
}
80+
81+
// CHECK-LABEL: @_Z17test_vpermilps512v
82+
void test_vpermilps512() {
83+
__m512 vec = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f,
84+
9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f};
85+
// vpermilps512 with immediate 0x4E = 01001110 = [1,3,2,0] for each 128-bit lane
86+
// CHECK: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<!cir.float x 16>) [#cir.int<2> : !s32i, #cir.int<3> : !s32i, #cir.int<0> : !s32i, #cir.int<1> : !s32i, #cir.int<6> : !s32i, #cir.int<7> : !s32i, #cir.int<4> : !s32i, #cir.int<5> : !s32i, #cir.int<10> : !s32i, #cir.int<11> : !s32i, #cir.int<8> : !s32i, #cir.int<9> : !s32i, #cir.int<14> : !s32i, #cir.int<15> : !s32i, #cir.int<12> : !s32i, #cir.int<13> : !s32i] : !cir.vector<!cir.float x 16>
87+
__m512 result = __builtin_ia32_vpermilps512(vec, 0x4E);
88+
}
89+
90+
// CHECK-LABEL: @_Z17test_vpermilpd512v
91+
void test_vpermilpd512() {
92+
__m512d vec = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0};
93+
// vpermilpd512 with immediate 0x55 = 01010101 = [1,0,1,0,1,0,1,0] for 8 elements
94+
// CHECK: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<!cir.double x 8>) [#cir.int<1> : !s32i, #cir.int<0> : !s32i, #cir.int<3> : !s32i, #cir.int<2> : !s32i, #cir.int<5> : !s32i, #cir.int<4> : !s32i, #cir.int<7> : !s32i, #cir.int<6> : !s32i] : !cir.vector<!cir.double x 8>
95+
__m512d result = __builtin_ia32_vpermilpd512(vec, 0x55);
96+
}
97+
98+
// Test different immediate values
99+
// CHECK-LABEL: @_Z24test_vpermilps_differentv
100+
void test_vpermilps_different() {
101+
__m128 vec = {10.0f, 20.0f, 30.0f, 40.0f};
102+
// Test different immediate value: 0x1B = 00011011 = [3,2,1,0] reversed
103+
// CHECK: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<!cir.float x 4>) [#cir.int<3> : !s32i, #cir.int<2> : !s32i, #cir.int<1> : !s32i, #cir.int<0> : !s32i] : !cir.vector<!cir.float x 4>
104+
__m128 result = __builtin_ia32_vpermilps(vec, 0x1B);
105+
}
106+
107+
// CHECK-LABEL: @_Z24test_vpermilpd_differentv
108+
void test_vpermilpd_different() {
109+
__m128d vec = {100.0, 200.0};
110+
// Test immediate 0x0 = 00 = [0,0] - duplicate first element
111+
// CHECK: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<!cir.double x 2>) [#cir.int<0> : !s32i, #cir.int<0> : !s32i] : !cir.vector<!cir.double x 2>
112+
__m128d result = __builtin_ia32_vpermilpd(vec, 0x0);
113+
}

0 commit comments

Comments
 (0)