Commit 271e99d

[CIR] Support x86 builtin rotate (llvm#169566)
This PR implements CIR CodeGen for the x86 rotate builtins, lowering them to funnel-shift (fshl/fshr) intrinsic calls. Issue llvm#167765
1 parent 7685e1f commit 271e99d

File tree

3 files changed: +167 -0 lines changed
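
For context on the lowering below: a rotate is a funnel shift whose two data operands are the same value, i.e. rotl(x, n) is fshl(x, x, n) and rotr(x, n) is fshr(x, x, n), with the shift amount reduced modulo the lane width. A minimal scalar sketch of that identity in C (illustrative only, not code from this commit; the rotl32/rotr32 helpers are hypothetical):

#include <stdint.h>

// Scalar model of llvm.fshl(x, x, n) on a 32-bit lane: rotate left by n,
// with n reduced modulo the lane width.
static inline uint32_t rotl32(uint32_t x, uint32_t n) {
  n &= 31;          // funnel-shift amounts are taken modulo the width
  if (n == 0)
    return x;       // avoid the undefined shift by 32
  return (x << n) | (x >> (32 - n));
}

// Mirror image, modelling llvm.fshr(x, x, n): rotate right by n.
static inline uint32_t rotr32(uint32_t x, uint32_t n) {
  n &= 31;
  if (n == 0)
    return x;
  return (x >> n) | (x << (32 - n));
}

The vector builtins handled in this patch apply the same identity lane-wise, which is why emitX86FunnelShift below receives ops[0] as both data operands.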

clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp

Lines changed: 42 additions & 0 deletions
@@ -11,10 +11,14 @@
 //
 //===----------------------------------------------------------------------===//

+#include "CIRGenBuilder.h"
 #include "CIRGenFunction.h"
 #include "CIRGenModule.h"
+#include "mlir/IR/Location.h"
+#include "mlir/IR/ValueRange.h"
 #include "clang/Basic/Builtins.h"
 #include "clang/Basic/TargetBuiltins.h"
+#include "clang/CIR/Dialect/IR/CIRTypes.h"
 #include "clang/CIR/MissingFeatures.h"

 using namespace clang;
@@ -231,6 +235,40 @@ static mlir::Value emitVecInsert(CIRGenBuilderTy &builder, mlir::Location loc,
   return cir::VecInsertOp::create(builder, loc, vec, value, indexVal);
 }

+static mlir::Value emitX86FunnelShift(CIRGenBuilderTy &builder,
+                                      mlir::Location location, mlir::Value &op0,
+                                      mlir::Value &op1, mlir::Value &amt,
+                                      bool isRight) {
+  mlir::Type op0Ty = op0.getType();
+
+  // The amount may be a scalar immediate, in which case we create a splat
+  // vector. Funnel-shift amounts are treated as modulo and the types are all
+  // power-of-2, so we only care about the lowest log2 bits anyway.
+  if (amt.getType() != op0Ty) {
+    auto vecTy = mlir::cast<cir::VectorType>(op0Ty);
+    uint64_t numElems = vecTy.getSize();
+
+    auto amtTy = mlir::cast<cir::IntType>(amt.getType());
+    auto vecElemTy = mlir::cast<cir::IntType>(vecTy.getElementType());
+
+    // If signed, cast to the same width but unsigned first to ensure
+    // zero-extension when casting to a wider unsigned `vecElemTy`.
+    if (amtTy.isSigned()) {
+      cir::IntType unsignedAmtTy = builder.getUIntNTy(amtTy.getWidth());
+      amt = builder.createIntCast(amt, unsignedAmtTy);
+    }
+    cir::IntType unsignedVecElemType = builder.getUIntNTy(vecElemTy.getWidth());
+    amt = builder.createIntCast(amt, unsignedVecElemType);
+    amt = cir::VecSplatOp::create(
+        builder, location, cir::VectorType::get(unsignedVecElemType, numElems),
+        amt);
+  }
+
+  const StringRef intrinsicName = isRight ? "fshr" : "fshl";
+  return emitIntrinsicCallOp(builder, location, intrinsicName, op0Ty,
+                             mlir::ValueRange{op0, op1, amt});
+}
+
 mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID,
                                                const CallExpr *expr) {
   if (builtinID == Builtin::BI__builtin_cpu_is) {
@@ -926,12 +964,16 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID,
   case X86::BI__builtin_ia32_prolq128:
   case X86::BI__builtin_ia32_prolq256:
   case X86::BI__builtin_ia32_prolq512:
+    return emitX86FunnelShift(builder, getLoc(expr->getExprLoc()), ops[0],
+                              ops[0], ops[1], false);
   case X86::BI__builtin_ia32_prord128:
   case X86::BI__builtin_ia32_prord256:
   case X86::BI__builtin_ia32_prord512:
   case X86::BI__builtin_ia32_prorq128:
   case X86::BI__builtin_ia32_prorq256:
   case X86::BI__builtin_ia32_prorq512:
+    return emitX86FunnelShift(builder, getLoc(expr->getExprLoc()), ops[0],
+                              ops[0], ops[1], true);
   case X86::BI__builtin_ia32_selectb_128:
   case X86::BI__builtin_ia32_selectb_256:
   case X86::BI__builtin_ia32_selectb_512:

clang/test/CIR/CodeGenBuiltins/X86/avx512f-builtins.c

Lines changed: 33 additions & 0 deletions
@@ -494,3 +494,36 @@ __m512i test_mm512_mask_i32gather_epi64(__m512i __v1_old, __mmask8 __mask, __m25
   // OGCG: call <8 x i64> @llvm.x86.avx512.mask.gather.dpq.512
   return _mm512_mask_i32gather_epi64(__v1_old, __mask, __index, __addr, 2);
 }
+
+__m512i test_mm512_ror_epi32(__m512i __A) {
+  // CIR-LABEL: test_mm512_ror_epi32
+  // CIR: cir.cast integral %{{.*}} : !s32i -> !u32i
+  // CIR: cir.vec.splat %{{.*}} : !u32i, !cir.vector<16 x !u32i>
+  // CIR: cir.call_llvm_intrinsic "fshr" %{{.*}}: (!cir.vector<16 x !s32i>, !cir.vector<16 x !s32i>, !cir.vector<16 x !u32i>) -> !cir.vector<16 x !s32i>
+
+  // LLVM-LABEL: test_mm512_ror_epi32
+  // LLVM: %[[CASTED_VAR:.*]] = bitcast <8 x i64> %{{.*}} to <16 x i32>
+  // LLVM: call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %[[CASTED_VAR]], <16 x i32> %[[CASTED_VAR]], <16 x i32> splat (i32 5))
+
+  // OGCG-LABEL: test_mm512_ror_epi32
+  // OGCG: %[[CASTED_VAR:.*]] = bitcast <8 x i64> %{{.*}} to <16 x i32>
+  // OGCG: call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %[[CASTED_VAR]], <16 x i32> %[[CASTED_VAR]], <16 x i32> splat (i32 5))
+  return _mm512_ror_epi32(__A, 5);
+}
+
+__m512i test_mm512_ror_epi64(__m512i __A) {
+  // CIR-LABEL: test_mm512_ror_epi64
+  // CIR: cir.cast integral %{{.*}} : !s32i -> !u32i
+  // CIR: cir.cast integral %{{.*}} : !u32i -> !u64i
+  // CIR: cir.vec.splat %{{.*}} : !u64i, !cir.vector<8 x !u64i>
+  // CIR: cir.call_llvm_intrinsic "fshr" %{{.*}}: (!cir.vector<8 x !s64i>, !cir.vector<8 x !s64i>, !cir.vector<8 x !u64i>) -> !cir.vector<8 x !s64i>
+
+  // LLVM-LABEL: test_mm512_ror_epi64
+  // LLVM: %[[VAR:.*]] = load <8 x i64>, ptr %{{.*}}, align 64
+  // LLVM: call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %[[VAR]], <8 x i64> %[[VAR]], <8 x i64> splat (i64 5))
+
+  // OGCG-LABEL: test_mm512_ror_epi64
+  // OGCG: %[[VAR:.*]] = load <8 x i64>, ptr %{{.*}}, align 64
+  // OGCG: call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %[[VAR]], <8 x i64> %[[VAR]], <8 x i64> splat (i64 5))
+  return _mm512_ror_epi64(__A, 5);
+}
Lines changed: 92 additions & 0 deletions
@@ -0,0 +1,92 @@
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +xop -emit-cir -o %t.cir
+// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +xop -fno-signed-char -emit-cir -o %t.cir
+// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
+
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +xop -fclangir -emit-llvm -o %t.ll
+// RUN: FileCheck --check-prefix=LLVM --input-file=%t.ll %s
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +xop -fno-signed-char -fclangir -emit-llvm -o %t.ll
+// RUN: FileCheck --check-prefix=LLVM --input-file=%t.ll %s
+
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +xop -emit-cir -o %t.cir
+// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +xop -fno-signed-char -emit-cir -o %t.cir
+// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
+
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +xop -fclangir -emit-llvm -o %t.ll
+// RUN: FileCheck --check-prefix=LLVM --input-file=%t.ll %s
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +xop -fno-signed-char -fclangir -emit-llvm -o %t.ll
+// RUN: FileCheck --check-prefix=LLVM --input-file=%t.ll %s
+
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +xop -emit-llvm -o - -Wall -Werror | FileCheck %s -check-prefix=OGCG
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +xop -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s -check-prefix=OGCG
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +xop -emit-llvm -o - -Wall -Werror | FileCheck %s -check-prefix=OGCG
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +xop -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s -check-prefix=OGCG
+
+#include <x86intrin.h>
+
+// This test mimics clang/test/CodeGen/X86/xop-builtins.c, which CIR shall
+// eventually be able to support fully.
+
+__m128i test_mm_roti_epi8(__m128i a) {
+  // CIR-LABEL: test_mm_roti_epi8
+  // CIR: cir.vec.splat %{{.*}} : !{{[us]}}8i, !cir.vector<16 x !{{[us]}}8i>
+  // CIR: cir.call_llvm_intrinsic "fshl" %{{.*}} : (!cir.vector<16 x !{{[su]}}8i>, !cir.vector<16 x !{{[su]}}8i>, !cir.vector<16 x !{{[su]}}8i>) -> !cir.vector<16 x !{{[su]}}8i>
+
+  // LLVM-LABEL: test_mm_roti_epi8
+  // LLVM: %[[CASTED_VAR:.*]] = bitcast <2 x i64> %{{.*}} to <16 x i8>
+  // LLVM: call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %[[CASTED_VAR]], <16 x i8> %[[CASTED_VAR]], <16 x i8> splat (i8 1))
+
+  // OGCG-LABEL: test_mm_roti_epi8
+  // OGCG: %[[CASTED_VAR:.*]] = bitcast <2 x i64> %{{.*}} to <16 x i8>
+  // OGCG: call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %[[CASTED_VAR]], <16 x i8> %[[CASTED_VAR]], <16 x i8> splat (i8 1))
+  return _mm_roti_epi8(a, 1);
+}
+
+__m128i test_mm_roti_epi16(__m128i a) {
+  // CIR-LABEL: test_mm_roti_epi16
+  // CIR: cir.cast integral %{{.*}} : !u8i -> !u16i
+  // CIR: cir.vec.splat %{{.*}} : !{{[us]}}16i, !cir.vector<8 x !u16i>
+  // CIR: cir.call_llvm_intrinsic "fshl" %{{.*}} : (!cir.vector<8 x !{{[su]}}16i>, !cir.vector<8 x !{{[su]}}16i>, !cir.vector<8 x !u16i>) -> !cir.vector<8 x !{{[su]}}16i>
+
+  // LLVM-LABEL: test_mm_roti_epi16
+  // LLVM: %[[CASTED_VAR:.*]] = bitcast <2 x i64> %{{.*}} to <8 x i16>
+  // LLVM: call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %[[CASTED_VAR]], <8 x i16> %[[CASTED_VAR]], <8 x i16> splat (i16 50))
+
+  // OGCG-LABEL: test_mm_roti_epi16
+  // OGCG: %[[CASTED_VAR:.*]] = bitcast <2 x i64> %{{.*}} to <8 x i16>
+  // OGCG: call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %[[CASTED_VAR]], <8 x i16> %[[CASTED_VAR]], <8 x i16> splat (i16 50))
+  return _mm_roti_epi16(a, 50);
+}
+
+__m128i test_mm_roti_epi32(__m128i a) {
+  // CIR-LABEL: test_mm_roti_epi32
+  // CIR: cir.cast integral %{{.*}} : !u8i -> !u32i
+  // CIR: cir.vec.splat %{{.*}} : !{{[us]}}32i, !cir.vector<4 x !u32i>
+  // CIR: cir.call_llvm_intrinsic "fshl" %{{.*}} : (!cir.vector<4 x !{{[su]}}32i>, !cir.vector<4 x !{{[su]}}32i>, !cir.vector<4 x !u32i>) -> !cir.vector<4 x !{{[su]}}32i>
+
+  // LLVM-LABEL: test_mm_roti_epi32
+  // LLVM: %[[CASTED_VAR:.*]] = bitcast <2 x i64> %{{.*}} to <4 x i32>
+  // LLVM: call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %[[CASTED_VAR]], <4 x i32> %[[CASTED_VAR]], <4 x i32> splat (i32 226))
+
+  // OGCG-LABEL: test_mm_roti_epi32
+  // OGCG: %[[CASTED_VAR:.*]] = bitcast <2 x i64> %{{.*}} to <4 x i32>
+  // OGCG: call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %[[CASTED_VAR]], <4 x i32> %[[CASTED_VAR]], <4 x i32> splat (i32 226))
+  return _mm_roti_epi32(a, -30);
+}
+
+__m128i test_mm_roti_epi64(__m128i a) {
+  // CIR-LABEL: test_mm_roti_epi64
+  // CIR: cir.cast integral %{{.*}} : !u8i -> !u64i
+  // CIR: cir.vec.splat %{{.*}} : !u64i, !cir.vector<2 x !u64i>
+  // CIR: cir.call_llvm_intrinsic "fshl" %{{.*}} : (!cir.vector<2 x !{{[su]}}64i>, !cir.vector<2 x !{{[su]}}64i>, !cir.vector<2 x !u64i>) -> !cir.vector<2 x !s64i>
+
+  // LLVM-LABEL: test_mm_roti_epi64
+  // LLVM: %[[VAR:.*]] = load <2 x i64>, ptr %{{.*}}, align 16
+  // LLVM: call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %[[VAR]], <2 x i64> %[[VAR]], <2 x i64> splat (i64 100))
+
+  // OGCG-LABEL: test_mm_roti_epi64
+  // OGCG: %[[VAR:.*]] = load <2 x i64>, ptr %{{.*}}, align 16
+  // OGCG: call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %[[VAR]], <2 x i64> %[[VAR]], <2 x i64> splat (i64 100))
+  return _mm_roti_epi64(a, 100);
+}
