Skip to content

Commit a35f3ca

Browse files
committed
[CIR] Implement x86 rotate builtins
1 parent d97746c commit a35f3ca

File tree

4 files changed

+190
-9
lines changed

4 files changed

+190
-9
lines changed

clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,10 +11,14 @@
1111
//
1212
//===----------------------------------------------------------------------===//
1313

14+
#include "CIRGenBuilder.h"
1415
#include "CIRGenFunction.h"
1516
#include "CIRGenModule.h"
17+
#include "mlir/IR/Location.h"
18+
#include "mlir/IR/ValueRange.h"
1619
#include "clang/Basic/Builtins.h"
1720
#include "clang/Basic/TargetBuiltins.h"
21+
#include "clang/CIR/Dialect/IR/CIRTypes.h"
1822
#include "clang/CIR/MissingFeatures.h"
1923

2024
using namespace clang;
@@ -168,6 +172,40 @@ static mlir::Value emitVecInsert(CIRGenBuilderTy &builder, mlir::Location loc,
168172
return cir::VecInsertOp::create(builder, loc, vec, value, indexVal);
169173
}
170174

175+
static mlir::Value emitX86FunnelShift(CIRGenBuilderTy &builder,
176+
mlir::Location location, mlir::Value &op0,
177+
mlir::Value &op1, mlir::Value &amt,
178+
bool isRight) {
179+
mlir::Type op0Ty = op0.getType();
180+
181+
// Amount may be scalar immediate, in which case create a splat vector.
182+
// Funnel shifts amounts are treated as modulo and types are all power-of-2
183+
// so we only care about the lowest log2 bits anyway.
184+
if (amt.getType() != op0Ty) {
185+
auto vecTy = mlir::cast<cir::VectorType>(op0Ty);
186+
uint64_t numElems = vecTy.getSize();
187+
188+
auto amtTy = mlir::cast<cir::IntType>(amt.getType());
189+
auto vecElemTy = mlir::cast<cir::IntType>(vecTy.getElementType());
190+
191+
// Cast to same width unsigned if not already unsigned.
192+
if (amtTy.isSigned()) {
193+
cir::IntType unsignedAmtTy = builder.getUIntNTy(amtTy.getWidth());
194+
amt = builder.createIntCast(amt, unsignedAmtTy);
195+
}
196+
// Cast the unsigned `amt` to operand element type's width unsigned.
197+
cir::IntType unsignedVecElemType = builder.getUIntNTy(vecElemTy.getWidth());
198+
amt = builder.createIntCast(amt, unsignedVecElemType);
199+
amt = cir::VecSplatOp::create(
200+
builder, location, cir::VectorType::get(unsignedVecElemType, numElems),
201+
amt);
202+
}
203+
204+
const StringRef intrinsicName = isRight ? "fshr" : "fshl";
205+
return emitIntrinsicCallOp(builder, location, intrinsicName, op0Ty,
206+
mlir::ValueRange{op0, op1, amt});
207+
}
208+
171209
mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID,
172210
const CallExpr *expr) {
173211
if (builtinID == Builtin::BI__builtin_cpu_is) {
@@ -842,12 +880,16 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID,
842880
case X86::BI__builtin_ia32_prolq128:
843881
case X86::BI__builtin_ia32_prolq256:
844882
case X86::BI__builtin_ia32_prolq512:
883+
return emitX86FunnelShift(this->getBuilder(), getLoc(expr->getExprLoc()),
884+
ops[0], ops[0], ops[1], false);
845885
case X86::BI__builtin_ia32_prord128:
846886
case X86::BI__builtin_ia32_prord256:
847887
case X86::BI__builtin_ia32_prord512:
848888
case X86::BI__builtin_ia32_prorq128:
849889
case X86::BI__builtin_ia32_prorq256:
850890
case X86::BI__builtin_ia32_prorq512:
891+
return emitX86FunnelShift(this->getBuilder(), getLoc(expr->getExprLoc()),
892+
ops[0], ops[0], ops[1], true);
851893
case X86::BI__builtin_ia32_selectb_128:
852894
case X86::BI__builtin_ia32_selectb_256:
853895
case X86::BI__builtin_ia32_selectb_512:

clang/test/CIR/CodeGenBuiltins/X86/avx512f-builtins.c

Lines changed: 42 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -18,12 +18,12 @@
1818
__m512 test_mm512_undefined(void) {
1919
// CIR-LABEL: _mm512_undefined
2020
// CIR: %[[A:.*]] = cir.const #cir.zero : !cir.vector<8 x !cir.double>
21-
// CIR: %{{.*}} = cir.cast bitcast %[[A]] : !cir.vector<8 x !cir.double> -> !cir.vector<16 x !cir.float>
21+
// CIR: cir.cast bitcast %[[A]] : !cir.vector<8 x !cir.double> -> !cir.vector<16 x !cir.float>
2222
// CIR: cir.return %{{.*}} : !cir.vector<16 x !cir.float>
2323

2424
// LLVM-LABEL: test_mm512_undefined
2525
// LLVM: store <16 x float> zeroinitializer, ptr %[[A:.*]], align 64
26-
// LLVM: %{{.*}} = load <16 x float>, ptr %[[A]], align 64
26+
// LLVM: load <16 x float>, ptr %[[A]], align 64
2727
// LLVM: ret <16 x float> %{{.*}}
2828

2929
// OGCG-LABEL: test_mm512_undefined
@@ -34,12 +34,12 @@ __m512 test_mm512_undefined(void) {
3434
__m512 test_mm512_undefined_ps(void) {
3535
// CIR-LABEL: _mm512_undefined_ps
3636
// CIR: %[[A:.*]] = cir.const #cir.zero : !cir.vector<8 x !cir.double>
37-
// CIR: %{{.*}} = cir.cast bitcast %[[A]] : !cir.vector<8 x !cir.double> -> !cir.vector<16 x !cir.float>
37+
// CIR: cir.cast bitcast %[[A]] : !cir.vector<8 x !cir.double> -> !cir.vector<16 x !cir.float>
3838
// CIR: cir.return %{{.*}} : !cir.vector<16 x !cir.float>
3939

4040
// LLVM-LABEL: test_mm512_undefined_ps
4141
// LLVM: store <16 x float> zeroinitializer, ptr %[[A:.*]], align 64
42-
// LLVM: %{{.*}} = load <16 x float>, ptr %[[A]], align 64
42+
// LLVM: load <16 x float>, ptr %[[A]], align 64
4343
// LLVM: ret <16 x float> %{{.*}}
4444

4545
// OGCG-LABEL: test_mm512_undefined_ps
@@ -49,12 +49,12 @@ __m512 test_mm512_undefined_ps(void) {
4949

5050
__m512d test_mm512_undefined_pd(void) {
5151
// CIR-LABEL: _mm512_undefined_pd
52-
// CIR: %{{.*}} = cir.const #cir.zero : !cir.vector<8 x !cir.double>
52+
// CIR: cir.const #cir.zero : !cir.vector<8 x !cir.double>
5353
// CIR: cir.return %{{.*}} : !cir.vector<8 x !cir.double>
5454

5555
// LLVM-LABEL: test_mm512_undefined_pd
5656
// LLVM: store <8 x double> zeroinitializer, ptr %[[A:.*]], align 64
57-
// LLVM: %{{.*}} = load <8 x double>, ptr %[[A]], align 64
57+
// LLVM: load <8 x double>, ptr %[[A]], align 64
5858
// LLVM: ret <8 x double> %{{.*}}
5959

6060
// OGCG-LABEL: test_mm512_undefined_pd
@@ -64,13 +64,13 @@ __m512d test_mm512_undefined_pd(void) {
6464

6565
__m512i test_mm512_undefined_epi32(void) {
6666
// CIR-LABEL: _mm512_undefined_epi32
67-
// CIR: %[[A:.*]] = cir.const #cir.zero : !cir.vector<8 x !cir.double>
68-
// CIR: %{{.*}} = cir.cast bitcast %[[A]] : !cir.vector<8 x !cir.double> -> !cir.vector<8 x !s64i>
67+
// CIR: cir.const #cir.zero : !cir.vector<8 x !cir.double>
68+
// CIR: cir.cast bitcast %[[A]] : !cir.vector<8 x !cir.double> -> !cir.vector<8 x !s64i>
6969
// CIR: cir.return %{{.*}} : !cir.vector<8 x !s64i>
7070

7171
// LLVM-LABEL: test_mm512_undefined_epi32
7272
// LLVM: store <8 x i64> zeroinitializer, ptr %[[A:.*]], align 64
73-
// LLVM: %{{.*}} = load <8 x i64>, ptr %[[A]], align 64
73+
// LLVM: load <8 x i64>, ptr %[[A]], align 64
7474
// LLVM: ret <8 x i64> %{{.*}}
7575

7676
// OGCG-LABEL: test_mm512_undefined_epi32
@@ -446,3 +446,36 @@ __m512i test_mm512_mask_i32gather_epi64(__m512i __v1_old, __mmask8 __mask, __m25
446446
// OGCG: call <8 x i64> @llvm.x86.avx512.mask.gather.dpq.512
447447
return _mm512_mask_i32gather_epi64(__v1_old, __mask, __index, __addr, 2);
448448
}
449+
450+
__m512i test_mm512_ror_epi32(__m512i __A) {
451+
// CIR-LABEL: test_mm512_ror_epi32
452+
// CIR: cir.cast integral %{{.*}} : !s32i -> !u32i
453+
// CIR: cir.vec.splat %{{.*}} : !u32i, !cir.vector<16 x !u32i>
454+
// CIR: cir.call_llvm_intrinsic "fshr" {{%.*}}: (!cir.vector<16 x !s32i>, !cir.vector<16 x !s32i>, !cir.vector<16 x !u32i>) -> !cir.vector<16 x !s32i>
455+
456+
// LLVM-LABEL: test_mm512_ror_epi32
457+
// LLVM: %[[CASTED_VAR:.*]] = bitcast <8 x i64> %{{.*}} to <16 x i32>
458+
// LLVM: call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %[[CASTED_VAR]], <16 x i32> %[[CASTED_VAR]], <16 x i32> splat (i32 5))
459+
460+
// OGCG-LABEL: test_mm512_ror_epi32
461+
// OGCG: %[[CASTED_VAR:.*]] = bitcast <8 x i64> %{{.*}} to <16 x i32>
462+
// OGCG: call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %[[CASTED_VAR]], <16 x i32> %[[CASTED_VAR]], <16 x i32> splat (i32 5))
463+
return _mm512_ror_epi32(__A, 5);
464+
}
465+
466+
__m512i test_mm512_ror_epi64(__m512i __A) {
467+
// CIR-LABEL: test_mm512_ror_epi64
468+
// CIR: cir.cast integral %{{.*}} : !s32i -> !u32i
469+
// CIR: cir.cast integral %{{.*}} : !u32i -> !u64i
470+
// CIR: cir.vec.splat %{{.*}} : !u64i, !cir.vector<8 x !u64i>
471+
// CIR: cir.call_llvm_intrinsic "fshr" {{%.*}}: (!cir.vector<8 x !s64i>, !cir.vector<8 x !s64i>, !cir.vector<8 x !u64i>) -> !cir.vector<8 x !s64i>
472+
473+
// LLVM-LABEL: test_mm512_ror_epi64
474+
// LLVM: %[[VAR:.*]] = load <8 x i64>, ptr {{%.*}}, align 64
475+
// LLVM: call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %[[VAR]], <8 x i64> %[[VAR]], <8 x i64> splat (i64 5))
476+
477+
// OGCG-LABEL: test_mm512_ror_epi64
478+
// OGCG: %[[VAR:.*]] = load <8 x i64>, ptr {{%.*}}, align 64
479+
// OGCG: call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %[[VAR]], <8 x i64> %[[VAR]], <8 x i64> splat (i64 5))
480+
return _mm512_ror_epi64(__A, 5);
481+
}
Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +xop -emit-cir -o %t.cir
2+
// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
3+
// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +xop -fno-signed-char -emit-cir -o %t.cir
4+
// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
5+
6+
// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +xop -fclangir -emit-llvm -o %t.ll
7+
// RUN: FileCheck --check-prefix=LLVM --input-file=%t.ll %s
8+
// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +xop -fno-signed-char -fclangir -emit-llvm -o %t.ll
9+
// RUN: FileCheck --check-prefix=LLVM --input-file=%t.ll %s
10+
11+
// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +xop -emit-cir -o %t.cir
12+
// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
13+
// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +xop -fno-signed-char -emit-cir -o %t.cir
14+
// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
15+
16+
// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +xop -fclangir -emit-llvm -o %t.ll
17+
// RUN: FileCheck --check-prefix=LLVM --input-file=%t.ll %s
18+
// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +xop -fno-signed-char -fclangir -emit-llvm -o %t.ll
19+
// RUN: FileCheck --check-prefix=LLVM --input-file=%t.ll %s
20+
21+
// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +xop -emit-llvm -o - -Wall -Werror | FileCheck %s -check-prefix=OGCG
22+
// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +xop -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s -check-prefix=OGCG
23+
// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +xop -emit-llvm -o - -Wall -Werror | FileCheck %s -check-prefix=OGCG
24+
// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +xop -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s -check-prefix=OGCG
25+
26+
#include <x86intrin.h>
27+
28+
// This test mimics clang/test/CodeGen/X86/xop-builtins.c, which eventually
29+
// CIR shall be able to support fully.
30+
31+
__m128i test_mm_roti_epi8(__m128i a) {
32+
// CIR-LABEL: test_mm_roti_epi8
33+
// CIR: cir.vec.splat %{{.*}} : !{{[us]}}8i, !cir.vector<16 x !{{[us]}}8i>
34+
// CIR: cir.call_llvm_intrinsic "fshl" {{.*}} : (!cir.vector<16 x !{{[su]}}8i>, !cir.vector<16 x !{{[su]}}8i>, !cir.vector<16 x !{{[su]}}8i>) -> !cir.vector<16 x !{{[su]}}8i>
35+
36+
// LLVM-LABEL: test_mm_roti_epi8
37+
// LLVM: %[[CASTED_VAR:.*]] = bitcast <2 x i64> %{{.*}} to <16 x i8>
38+
// LLVM: call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %[[CASTED_VAR]], <16 x i8> %[[CASTED_VAR]], <16 x i8> splat (i8 1))
39+
40+
// OGCG-LABEL: test_mm_roti_epi8
41+
// OGCG: %[[CASTED_VAR:.*]] = bitcast <2 x i64> %{{.*}} to <16 x i8>
42+
// OGCG: call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %[[CASTED_VAR]], <16 x i8> %[[CASTED_VAR]], <16 x i8> splat (i8 1))
43+
return _mm_roti_epi8(a, 1);
44+
}
45+
46+
__m128i test_mm_roti_epi16(__m128i a) {
47+
// CIR-LABEL: test_mm_roti_epi16
48+
// CIR: cir.cast integral %{{.*}} : !u8i -> !u16i
49+
// CIR: cir.vec.splat %{{.*}} : !{{[us]}}16i, !cir.vector<8 x !u16i>
50+
// CIR: cir.call_llvm_intrinsic "fshl" {{.*}} : (!cir.vector<8 x !{{[su]}}16i>, !cir.vector<8 x !{{[su]}}16i>, !cir.vector<8 x !u16i>) -> !cir.vector<8 x !{{[su]}}16i>
51+
52+
// LLVM-LABEL: test_mm_roti_epi16
53+
// LLVM: %[[CASTED_VAR:.*]] = bitcast <2 x i64> %{{.*}} to <8 x i16>
54+
// LLVM: call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %[[CASTED_VAR]], <8 x i16> %[[CASTED_VAR]], <8 x i16> splat (i16 50))
55+
56+
// OGCG-LABEL: test_mm_roti_epi16
57+
// OGCG: %[[CASTED_VAR:.*]] = bitcast <2 x i64> %{{.*}} to <8 x i16>
58+
// OGCG: call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %[[CASTED_VAR]], <8 x i16> %[[CASTED_VAR]], <8 x i16> splat (i16 50))
59+
return _mm_roti_epi16(a, 50);
60+
}
61+
62+
__m128i test_mm_roti_epi32(__m128i a) {
63+
// CIR-LABEL: test_mm_roti_epi32
64+
// CIR: cir.cast integral %{{.*}} : !u8i -> !u32i
65+
// CIR: cir.vec.splat %{{.*}} : !{{[us]}}32i, !cir.vector<4 x !u32i>
66+
// CIR: cir.call_llvm_intrinsic "fshl" {{.*}} : (!cir.vector<4 x !{{[su]}}32i>, !cir.vector<4 x !{{[su]}}32i>, !cir.vector<4 x !u32i>) -> !cir.vector<4 x !{{[su]}}32i>
67+
68+
// LLVM-LABEL: test_mm_roti_epi32
69+
// LLVM: %[[CASTED_VAR:.*]] = bitcast <2 x i64> %{{.*}} to <4 x i32>
70+
// LLVM: call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %[[CASTED_VAR]], <4 x i32> %[[CASTED_VAR]], <4 x i32> splat (i32 226))
71+
72+
// OGCG-LABEL: test_mm_roti_epi32
73+
// OGCG: %[[CASTED_VAR:.*]] = bitcast <2 x i64> %{{.*}} to <4 x i32>
74+
// OGCG: call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %[[CASTED_VAR]], <4 x i32> %[[CASTED_VAR]], <4 x i32> splat (i32 226))
75+
return _mm_roti_epi32(a, -30);
76+
}
77+
78+
__m128i test_mm_roti_epi64(__m128i a) {
79+
// CIR-LABEL: test_mm_roti_epi64
80+
// CIR: cir.cast integral %{{.*}} : !u8i -> !u64i
81+
// CIR: cir.vec.splat %{{.*}} : !u64i, !cir.vector<2 x !u64i>
82+
// CIR: cir.call_llvm_intrinsic "fshl" {{.*}} : (!cir.vector<2 x !{{[su]}}64i>, !cir.vector<2 x !{{[su]}}64i>, !cir.vector<2 x !u64i>) -> !cir.vector<2 x !s64i>
83+
84+
// LLVM-LABEL: test_mm_roti_epi64
85+
// LLVM: %[[VAR:.*]] = load <2 x i64>, ptr %{{.*}}, align 16
86+
// LLVM: call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %[[VAR]], <2 x i64> %[[VAR]], <2 x i64> splat (i64 100))
87+
88+
// OGCG-LABEL: test_mm_roti_epi64
89+
// OGCG: %[[VAR:.*]] = load <2 x i64>, ptr %{{.*}}, align 16
90+
// OGCG: call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %[[VAR]], <2 x i64> %[[VAR]], <2 x i64> splat (i64 100))
91+
return _mm_roti_epi64(a, 100);
92+
}

shell.nix

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
let
2+
nixpkgs = fetchTarball "https://github.com/NixOS/nixpkgs/tarball/nixos-24.05";
3+
pkgs = import nixpkgs { config = {}; overlays = []; };
4+
in
5+
6+
7+
pkgs.mkShellNoCC {
8+
packages = with pkgs; [
9+
cmake
10+
ninja
11+
llvmPackages_latest.llvm
12+
];
13+
stdenv = pkgs.clangStdenv;
14+
}

0 commit comments

Comments
 (0)