
Commit 885eaa5

[CIR] Implement x86 rotate builtins
1 parent c8031c3 commit 885eaa5

File tree

4 files changed: +190, -9 lines

clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp

Lines changed: 42 additions & 0 deletions
@@ -11,10 +11,14 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "CIRGenBuilder.h"
 #include "CIRGenFunction.h"
 #include "CIRGenModule.h"
+#include "mlir/IR/Location.h"
+#include "mlir/IR/ValueRange.h"
 #include "clang/Basic/Builtins.h"
 #include "clang/Basic/TargetBuiltins.h"
+#include "clang/CIR/Dialect/IR/CIRTypes.h"
 #include "clang/CIR/MissingFeatures.h"
 
 using namespace clang;
@@ -115,6 +119,40 @@ static mlir::Value emitX86MaskLogic(CIRGenBuilderTy &builder,
                              ops[0].getType());
 }
 
+static mlir::Value emitX86FunnelShift(CIRGenBuilderTy &builder,
+                                      mlir::Location location, mlir::Value &op0,
+                                      mlir::Value &op1, mlir::Value &amt,
+                                      bool isRight) {
+  mlir::Type op0Ty = op0.getType();
+
+  // The amount may be a scalar immediate, in which case create a splat vector.
+  // Funnel-shift amounts are treated modulo the bit width, and the types are
+  // all powers of two, so we only care about the lowest log2(width) bits anyway.
+  if (amt.getType() != op0Ty) {
+    auto vecTy = mlir::cast<cir::VectorType>(op0Ty);
+    uint64_t numElems = vecTy.getSize();
+
+    auto amtTy = mlir::cast<cir::IntType>(amt.getType());
+    auto vecElemTy = mlir::cast<cir::IntType>(vecTy.getElementType());
+
+    // Cast to an unsigned type of the same width if not already unsigned.
+    if (amtTy.isSigned()) {
+      cir::IntType unsignedAmtTy = builder.getUIntNTy(amtTy.getWidth());
+      amt = builder.createIntCast(amt, unsignedAmtTy);
+    }
+    // Cast the unsigned `amt` to the unsigned type of the operand's element width.
+    cir::IntType unsignedVecElemType = builder.getUIntNTy(vecElemTy.getWidth());
+    amt = builder.createIntCast(amt, unsignedVecElemType);
+    amt = cir::VecSplatOp::create(
+        builder, location, cir::VectorType::get(unsignedVecElemType, numElems),
+        amt);
+  }
+
+  const StringRef intrinsicName = isRight ? "fshr" : "fshl";
+  return emitIntrinsicCallOp(builder, location, intrinsicName, op0Ty,
+                             mlir::ValueRange{op0, op1, amt});
+}
+
 mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID,
                                                const CallExpr *expr) {
   if (builtinID == Builtin::BI__builtin_cpu_is) {
@@ -691,12 +729,16 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID,
   case X86::BI__builtin_ia32_prolq128:
   case X86::BI__builtin_ia32_prolq256:
   case X86::BI__builtin_ia32_prolq512:
+    return emitX86FunnelShift(this->getBuilder(), getLoc(expr->getExprLoc()),
+                              ops[0], ops[0], ops[1], false);
   case X86::BI__builtin_ia32_prord128:
   case X86::BI__builtin_ia32_prord256:
   case X86::BI__builtin_ia32_prord512:
   case X86::BI__builtin_ia32_prorq128:
   case X86::BI__builtin_ia32_prorq256:
   case X86::BI__builtin_ia32_prorq512:
+    return emitX86FunnelShift(this->getBuilder(), getLoc(expr->getExprLoc()),
+                              ops[0], ops[0], ops[1], true);
   case X86::BI__builtin_ia32_selectb_128:
   case X86::BI__builtin_ia32_selectb_256:
   case X86::BI__builtin_ia32_selectb_512:
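
The dispatch above lowers the rotate builtins to LLVM's funnel-shift intrinsics by passing the same vector as both data operands: the __builtin_ia32_prol* cases use fshl (isRight == false) and the __builtin_ia32_pror* cases use fshr (isRight == true). A minimal scalar sketch of that identity, for illustration only and not part of the commit (llvm.fshl/llvm.fshr take the shift amount modulo the bit width, which is why the helper only widens and splats the amount and never range-checks it):

#include <cstdint>

// rotl(x, n) == fshl(x, x, n): shift x left by n and refill the vacated low
// bits from the high bits of the same value. The amount is reduced mod 32.
static uint32_t rotl32(uint32_t x, uint32_t amt) {
  uint32_t n = amt % 32;
  return n == 0 ? x : (x << n) | (x >> (32 - n));
}

// rotr(x, n) == fshr(x, x, n), matching the __builtin_ia32_pror* cases.
static uint32_t rotr32(uint32_t x, uint32_t amt) {
  uint32_t n = amt % 32;
  return n == 0 ? x : (x >> n) | (x << (32 - n));
}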

clang/test/CIR/CodeGenBuiltins/X86/avx512f-builtins.c

Lines changed: 42 additions & 9 deletions
@@ -18,12 +18,12 @@
 __m512 test_mm512_undefined(void) {
   // CIR-LABEL: _mm512_undefined
   // CIR: %[[A:.*]] = cir.const #cir.zero : !cir.vector<8 x !cir.double>
-  // CIR: %{{.*}} = cir.cast bitcast %[[A]] : !cir.vector<8 x !cir.double> -> !cir.vector<16 x !cir.float>
+  // CIR: cir.cast bitcast %[[A]] : !cir.vector<8 x !cir.double> -> !cir.vector<16 x !cir.float>
   // CIR: cir.return %{{.*}} : !cir.vector<16 x !cir.float>
 
   // LLVM-LABEL: test_mm512_undefined
   // LLVM: store <16 x float> zeroinitializer, ptr %[[A:.*]], align 64
-  // LLVM: %{{.*}} = load <16 x float>, ptr %[[A]], align 64
+  // LLVM: load <16 x float>, ptr %[[A]], align 64
   // LLVM: ret <16 x float> %{{.*}}
 
   // OGCG-LABEL: test_mm512_undefined
@@ -34,12 +34,12 @@ __m512 test_mm512_undefined(void) {
 __m512 test_mm512_undefined_ps(void) {
   // CIR-LABEL: _mm512_undefined_ps
   // CIR: %[[A:.*]] = cir.const #cir.zero : !cir.vector<8 x !cir.double>
-  // CIR: %{{.*}} = cir.cast bitcast %[[A]] : !cir.vector<8 x !cir.double> -> !cir.vector<16 x !cir.float>
+  // CIR: cir.cast bitcast %[[A]] : !cir.vector<8 x !cir.double> -> !cir.vector<16 x !cir.float>
   // CIR: cir.return %{{.*}} : !cir.vector<16 x !cir.float>
 
   // LLVM-LABEL: test_mm512_undefined_ps
   // LLVM: store <16 x float> zeroinitializer, ptr %[[A:.*]], align 64
-  // LLVM: %{{.*}} = load <16 x float>, ptr %[[A]], align 64
+  // LLVM: load <16 x float>, ptr %[[A]], align 64
   // LLVM: ret <16 x float> %{{.*}}
 
   // OGCG-LABEL: test_mm512_undefined_ps
@@ -49,12 +49,12 @@ __m512 test_mm512_undefined_ps(void) {
 
 __m512d test_mm512_undefined_pd(void) {
   // CIR-LABEL: _mm512_undefined_pd
-  // CIR: %{{.*}} = cir.const #cir.zero : !cir.vector<8 x !cir.double>
+  // CIR: cir.const #cir.zero : !cir.vector<8 x !cir.double>
   // CIR: cir.return %{{.*}} : !cir.vector<8 x !cir.double>
 
   // LLVM-LABEL: test_mm512_undefined_pd
   // LLVM: store <8 x double> zeroinitializer, ptr %[[A:.*]], align 64
-  // LLVM: %{{.*}} = load <8 x double>, ptr %[[A]], align 64
+  // LLVM: load <8 x double>, ptr %[[A]], align 64
   // LLVM: ret <8 x double> %{{.*}}
 
   // OGCG-LABEL: test_mm512_undefined_pd
@@ -64,13 +64,13 @@ __m512d test_mm512_undefined_pd(void) {
 
 __m512i test_mm512_undefined_epi32(void) {
   // CIR-LABEL: _mm512_undefined_epi32
-  // CIR: %[[A:.*]] = cir.const #cir.zero : !cir.vector<8 x !cir.double>
-  // CIR: %{{.*}} = cir.cast bitcast %[[A]] : !cir.vector<8 x !cir.double> -> !cir.vector<8 x !s64i>
+  // CIR: cir.const #cir.zero : !cir.vector<8 x !cir.double>
+  // CIR: cir.cast bitcast %[[A]] : !cir.vector<8 x !cir.double> -> !cir.vector<8 x !s64i>
   // CIR: cir.return %{{.*}} : !cir.vector<8 x !s64i>
 
   // LLVM-LABEL: test_mm512_undefined_epi32
   // LLVM: store <8 x i64> zeroinitializer, ptr %[[A:.*]], align 64
-  // LLVM: %{{.*}} = load <8 x i64>, ptr %[[A]], align 64
+  // LLVM: load <8 x i64>, ptr %[[A]], align 64
   // LLVM: ret <8 x i64> %{{.*}}
 
   // OGCG-LABEL: test_mm512_undefined_epi32
@@ -228,3 +228,36 @@ __mmask16 test_kmov_w(__mmask16 A) {
   // OGCG: bitcast <16 x i1> {{.*}} to i16
   return __builtin_ia32_kmovw(A);
 }
+
+__m512i test_mm512_ror_epi32(__m512i __A) {
+  // CIR-LABEL: test_mm512_ror_epi32
+  // CIR: cir.cast integral %{{.*}} : !s32i -> !u32i
+  // CIR: cir.vec.splat %{{.*}} : !u32i, !cir.vector<16 x !u32i>
+  // CIR: cir.call_llvm_intrinsic "fshr" {{%.*}}: (!cir.vector<16 x !s32i>, !cir.vector<16 x !s32i>, !cir.vector<16 x !u32i>) -> !cir.vector<16 x !s32i>
+
+  // LLVM-LABEL: test_mm512_ror_epi32
+  // LLVM: %[[CASTED_VAR:.*]] = bitcast <8 x i64> %{{.*}} to <16 x i32>
+  // LLVM: call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %[[CASTED_VAR]], <16 x i32> %[[CASTED_VAR]], <16 x i32> splat (i32 5))
+
+  // OGCG-LABEL: test_mm512_ror_epi32
+  // OGCG: %[[CASTED_VAR:.*]] = bitcast <8 x i64> %{{.*}} to <16 x i32>
+  // OGCG: call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %[[CASTED_VAR]], <16 x i32> %[[CASTED_VAR]], <16 x i32> splat (i32 5))
+  return _mm512_ror_epi32(__A, 5);
+}
+
+__m512i test_mm512_ror_epi64(__m512i __A) {
+  // CIR-LABEL: test_mm512_ror_epi64
+  // CIR: cir.cast integral %{{.*}} : !s32i -> !u32i
+  // CIR: cir.cast integral %{{.*}} : !u32i -> !u64i
+  // CIR: cir.vec.splat %{{.*}} : !u64i, !cir.vector<8 x !u64i>
+  // CIR: cir.call_llvm_intrinsic "fshr" {{%.*}}: (!cir.vector<8 x !s64i>, !cir.vector<8 x !s64i>, !cir.vector<8 x !u64i>) -> !cir.vector<8 x !s64i>
+
+  // LLVM-LABEL: test_mm512_ror_epi64
+  // LLVM: %[[VAR:.*]] = load <8 x i64>, ptr {{%.*}}, align 64
+  // LLVM: call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %[[VAR]], <8 x i64> %[[VAR]], <8 x i64> splat (i64 5))
+
+  // OGCG-LABEL: test_mm512_ror_epi64
+  // OGCG: %[[VAR:.*]] = load <8 x i64>, ptr {{%.*}}, align 64
+  // OGCG: call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %[[VAR]], <8 x i64> %[[VAR]], <8 x i64> splat (i64 5))
+  return _mm512_ror_epi64(__A, 5);
+}
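
A note on the two integral casts the test_mm512_ror_epi64 checks look for: both come from emitX86FunnelShift, which first makes the signed 32-bit rotate count unsigned at the same width, then widens it to the 64-bit element width before splatting. A rough scalar model of that chain, for illustration only and not code from the commit:

#include <cstdint>

// Mirrors the amount-widening the epi64 test checks for: signed 32-bit count
// -> same-width unsigned -> unsigned element width -> splat.
static uint64_t widenRotateAmount(int count) {
  uint32_t amtU32 = static_cast<uint32_t>(count); // cir.cast integral !s32i -> !u32i
  uint64_t amtU64 = amtU32;                       // cir.cast integral !u32i -> !u64i
  return amtU64; // cir.vec.splat broadcasts this to <8 x !u64i> for llvm.fshr.v8i64
}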
Lines changed: 92 additions & 0 deletions
@@ -0,0 +1,92 @@
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +xop -emit-cir -o %t.cir
+// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +xop -fno-signed-char -emit-cir -o %t.cir
+// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
+
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +xop -fclangir -emit-llvm -o %t.ll
+// RUN: FileCheck --check-prefix=LLVM --input-file=%t.ll %s
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +xop -fno-signed-char -fclangir -emit-llvm -o %t.ll
+// RUN: FileCheck --check-prefix=LLVM --input-file=%t.ll %s
+
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +xop -emit-cir -o %t.cir
+// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +xop -fno-signed-char -emit-cir -o %t.cir
+// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
+
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +xop -fclangir -emit-llvm -o %t.ll
+// RUN: FileCheck --check-prefix=LLVM --input-file=%t.ll %s
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +xop -fno-signed-char -fclangir -emit-llvm -o %t.ll
+// RUN: FileCheck --check-prefix=LLVM --input-file=%t.ll %s
+
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +xop -emit-llvm -o - -Wall -Werror | FileCheck %s -check-prefix=OGCG
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +xop -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s -check-prefix=OGCG
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +xop -emit-llvm -o - -Wall -Werror | FileCheck %s -check-prefix=OGCG
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +xop -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s -check-prefix=OGCG
+
+#include <x86intrin.h>
+
+// This test mimics clang/test/CodeGen/X86/xop-builtins.c, which eventually
+// CIR shall be able to support fully.
+
+__m128i test_mm_roti_epi8(__m128i a) {
+  // CIR-LABEL: test_mm_roti_epi8
+  // CIR: cir.vec.splat %{{.*}} : !{{[us]}}8i, !cir.vector<16 x !{{[us]}}8i>
+  // CIR: cir.call_llvm_intrinsic "fshl" {{.*}} : (!cir.vector<16 x !{{[su]}}8i>, !cir.vector<16 x !{{[su]}}8i>, !cir.vector<16 x !{{[su]}}8i>) -> !cir.vector<16 x !{{[su]}}8i>
+
+  // LLVM-LABEL: test_mm_roti_epi8
+  // LLVM: %[[CASTED_VAR:.*]] = bitcast <2 x i64> %{{.*}} to <16 x i8>
+  // LLVM: call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %[[CASTED_VAR]], <16 x i8> %[[CASTED_VAR]], <16 x i8> splat (i8 1))
+
+  // OGCG-LABEL: test_mm_roti_epi8
+  // OGCG: %[[CASTED_VAR:.*]] = bitcast <2 x i64> %{{.*}} to <16 x i8>
+  // OGCG: call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %[[CASTED_VAR]], <16 x i8> %[[CASTED_VAR]], <16 x i8> splat (i8 1))
+  return _mm_roti_epi8(a, 1);
+}
+
+__m128i test_mm_roti_epi16(__m128i a) {
+  // CIR-LABEL: test_mm_roti_epi16
+  // CIR: cir.cast integral %{{.*}} : !u8i -> !u16i
+  // CIR: cir.vec.splat %{{.*}} : !{{[us]}}16i, !cir.vector<8 x !u16i>
+  // CIR: cir.call_llvm_intrinsic "fshl" {{.*}} : (!cir.vector<8 x !{{[su]}}16i>, !cir.vector<8 x !{{[su]}}16i>, !cir.vector<8 x !u16i>) -> !cir.vector<8 x !{{[su]}}16i>
+
+  // LLVM-LABEL: test_mm_roti_epi16
+  // LLVM: %[[CASTED_VAR:.*]] = bitcast <2 x i64> %{{.*}} to <8 x i16>
+  // LLVM: call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %[[CASTED_VAR]], <8 x i16> %[[CASTED_VAR]], <8 x i16> splat (i16 50))
+
+  // OGCG-LABEL: test_mm_roti_epi16
+  // OGCG: %[[CASTED_VAR:.*]] = bitcast <2 x i64> %{{.*}} to <8 x i16>
+  // OGCG: call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %[[CASTED_VAR]], <8 x i16> %[[CASTED_VAR]], <8 x i16> splat (i16 50))
+  return _mm_roti_epi16(a, 50);
+}
+
+__m128i test_mm_roti_epi32(__m128i a) {
+  // CIR-LABEL: test_mm_roti_epi32
+  // CIR: cir.cast integral %{{.*}} : !u8i -> !u32i
+  // CIR: cir.vec.splat %{{.*}} : !{{[us]}}32i, !cir.vector<4 x !u32i>
+  // CIR: cir.call_llvm_intrinsic "fshl" {{.*}} : (!cir.vector<4 x !{{[su]}}32i>, !cir.vector<4 x !{{[su]}}32i>, !cir.vector<4 x !u32i>) -> !cir.vector<4 x !{{[su]}}32i>
+
+  // LLVM-LABEL: test_mm_roti_epi32
+  // LLVM: %[[CASTED_VAR:.*]] = bitcast <2 x i64> %{{.*}} to <4 x i32>
+  // LLVM: call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %[[CASTED_VAR]], <4 x i32> %[[CASTED_VAR]], <4 x i32> splat (i32 226))
+
+  // OGCG-LABEL: test_mm_roti_epi32
+  // OGCG: %[[CASTED_VAR:.*]] = bitcast <2 x i64> %{{.*}} to <4 x i32>
+  // OGCG: call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %[[CASTED_VAR]], <4 x i32> %[[CASTED_VAR]], <4 x i32> splat (i32 226))
+  return _mm_roti_epi32(a, -30);
+}
+
+__m128i test_mm_roti_epi64(__m128i a) {
+  // CIR-LABEL: test_mm_roti_epi64
+  // CIR: cir.cast integral %{{.*}} : !u8i -> !u64i
+  // CIR: cir.vec.splat %{{.*}} : !u64i, !cir.vector<2 x !u64i>
+  // CIR: cir.call_llvm_intrinsic "fshl" {{.*}} : (!cir.vector<2 x !{{[su]}}64i>, !cir.vector<2 x !{{[su]}}64i>, !cir.vector<2 x !u64i>) -> !cir.vector<2 x !s64i>
+
+  // LLVM-LABEL: test_mm_roti_epi64
+  // LLVM: %[[VAR:.*]] = load <2 x i64>, ptr %{{.*}}, align 16
+  // LLVM: call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %[[VAR]], <2 x i64> %[[VAR]], <2 x i64> splat (i64 100))
+
+  // OGCG-LABEL: test_mm_roti_epi64
+  // OGCG: %[[VAR:.*]] = load <2 x i64>, ptr %{{.*}}, align 16
+  // OGCG: call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %[[VAR]], <2 x i64> %[[VAR]], <2 x i64> splat (i64 100))
+  return _mm_roti_epi64(a, 100);
+}
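
The _mm_roti_epi32(a, -30) case is the one non-obvious constant in these checks: the XOP rotate count starts out as an 8-bit value (the !u8i in the CIR checks), so -30 wraps to 226 before being widened and splatted, and llvm.fshl then takes 226 modulo 32, giving a rotate left by 2, which is the same as a rotate right by 30. A compile-time check of that arithmetic, for illustration only and not part of the test:

// C++ sketch of the constant folding the "splat (i32 226)" check relies on.
static_assert(static_cast<unsigned char>(-30) == 226,
              "8-bit rotate immediate wraps: 256 - 30 == 226");
static_assert(226 % 32 == 2,
              "fshl reduces the amount mod 32: rotl by 2 == rotr by 30");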

shell.nix

Lines changed: 14 additions & 0 deletions
@@ -0,0 +1,14 @@
+let
+  nixpkgs = fetchTarball "https://github.com/NixOS/nixpkgs/tarball/nixos-24.05";
+  pkgs = import nixpkgs { config = {}; overlays = []; };
+in
+
+
+pkgs.mkShellNoCC {
+  packages = with pkgs; [
+    cmake
+    ninja
+    llvmPackages_latest.llvm
+  ];
+  stdenv = pkgs.clangStdenv;
+}
