-
Notifications
You must be signed in to change notification settings - Fork 15.3k
[CIR] Support x86 builtin rotate #169566
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
[CIR] Support x86 builtin rotate #169566
Conversation
|
@llvm/pr-subscribers-clang Author: Omar Hossam (moar55) ChangesThis PR implements CodeGen for rotate builtins in CIR upstream. I couldn't figure out how to properly handle the case where the Full diff: https://github.com/llvm/llvm-project/pull/169566.diff 2 Files Affected:
diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
index e7aa8a234efd9..b52dc7b1f4b91 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
@@ -13,8 +13,10 @@
#include "CIRGenFunction.h"
#include "CIRGenModule.h"
+#include "mlir/IR/ValueRange.h"
#include "clang/Basic/Builtins.h"
#include "clang/Basic/TargetBuiltins.h"
+#include "clang/CIR/Dialect/IR/CIRTypes.h"
#include "clang/CIR/MissingFeatures.h"
using namespace clang;
@@ -90,6 +92,33 @@ static mlir::Value getMaskVecValue(CIRGenFunction &cgf, const CallExpr *expr,
return maskVec;
}
+static mlir::Value emitX86FunnelShift(CIRGenFunction &cgf, const CallExpr *e,
+ mlir::Value &op0, mlir::Value &op1,
+ mlir::Value &amt, bool isRight) {
+ auto ty = op0.getType();
+
+ // Amount may be scalar immediate, in which case create a splat vector.
+ // Funnel shifts amounts are treated as modulo and types are all power-of-2
+ // so we only care about the lowest log2 bits anyway.
+ if (amt.getType() != ty) {
+ auto vecTy = mlir::cast<cir::VectorType>(ty);
+
+ auto numElems = vecTy.getSize();
+ auto vecElemType = mlir::cast<cir::IntType>(vecTy.getElementType());
+ auto signlessType =
+ cir::IntType::get(&cgf.getMLIRContext(), vecElemType.getWidth(), false);
+ amt = cgf.getBuilder().createIntCast(amt, signlessType);
+
+ amt = cir::VecSplatOp::create(cgf.getBuilder(), cgf.getLoc(e->getExprLoc()),
+ cir::VectorType::get(signlessType, numElems),
+ amt);
+ }
+
+ const std::string intrinsicName = isRight ? "fshr" : "fshl";
+ return emitIntrinsicCallOp(cgf, e, intrinsicName, ty,
+ mlir::ValueRange{op0, op1, amt});
+}
+
mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID,
const CallExpr *expr) {
if (builtinID == Builtin::BI__builtin_cpu_is) {
@@ -661,12 +690,14 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID,
case X86::BI__builtin_ia32_prolq128:
case X86::BI__builtin_ia32_prolq256:
case X86::BI__builtin_ia32_prolq512:
+ return emitX86FunnelShift(*this, expr, ops[0], ops[0], ops[1], false);
case X86::BI__builtin_ia32_prord128:
case X86::BI__builtin_ia32_prord256:
case X86::BI__builtin_ia32_prord512:
case X86::BI__builtin_ia32_prorq128:
case X86::BI__builtin_ia32_prorq256:
case X86::BI__builtin_ia32_prorq512:
+ return emitX86FunnelShift(*this, expr, ops[0], ops[0], ops[1], true);
case X86::BI__builtin_ia32_selectb_128:
case X86::BI__builtin_ia32_selectb_256:
case X86::BI__builtin_ia32_selectb_512:
diff --git a/clang/test/CIR/CodeGen/X86/xop-builtins.c b/clang/test/CIR/CodeGen/X86/xop-builtins.c
new file mode 100644
index 0000000000000..c8ae5eb0fd82d
--- /dev/null
+++ b/clang/test/CIR/CodeGen/X86/xop-builtins.c
@@ -0,0 +1,79 @@
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +xop -emit-cir -o %t.cir
+// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +xop -fno-signed-char -emit-cir -o %t.cir
+// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
+
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +xop -fclangir -emit-llvm -o %t.ll
+// RUN: FileCheck --check-prefix=LLVM --input-file=%t.ll %s
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +xop -fno-signed-char -fclangir -emit-llvm -o %t.ll
+// RUN: FileCheck --check-prefix=LLVM --input-file=%t.ll %s
+
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +xop -emit-cir -o %t.cir
+// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +xop -fno-signed-char -emit-cir -o %t.cir
+// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
+
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +xop -fclangir -emit-llvm -o %t.ll
+// RUN: FileCheck --check-prefix=LLVM --input-file=%t.ll %s
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +xop -fno-signed-char -fclangir -emit-llvm -o %t.ll
+// RUN: FileCheck --check-prefix=LLVM --input-file=%t.ll %s
+
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +xop -emit-llvm -o - -Wall -Werror | FileCheck %s -check-prefix=OGCG
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +xop -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s -check-prefix=OGCG
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +xop -emit-llvm -o - -Wall -Werror | FileCheck %s -check-prefix=OGCG
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +xop -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s -check-prefix=OGCG
+
+#include <x86intrin.h>
+
+// This test mimics clang/test/CodeGen/X86/xop-builtins.c, which eventually
+// CIR shall be able to support fully.
+
+__m128i test_mm_roti_epi8(__m128i a) {
+ // CIR-LABEL: test_mm_roti_epi8
+ // CIR: {{%.*}} = cir.vec.splat {{%.*}} : !{{[us]}}8i, !cir.vector<16 x !{{[us]}}8i>
+ // CIR: {{%.*}} = cir.call_llvm_intrinsic "fshl" {{.*}} : (!cir.vector<16 x !{{[su]}}8i>, !cir.vector<16 x !{{[su]}}8i>, !cir.vector<16 x !{{[su]}}8i>) -> !cir.vector<16 x !{{[su]}}8i>
+ // LLVM-LABEL: test_mm_roti_epi8
+ // LLVM: %[[CASTED_VAR:.*]] = bitcast <2 x i64> {{%.*}} to <16 x i8>
+ // LLVM: {{%.*}} = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %[[CASTED_VAR]], <16 x i8> %[[CASTED_VAR]], <16 x i8> splat (i8 1))
+ // OGCG-LABEL: test_mm_roti_epi8
+ // OGCG: %[[CASTED_VAR:.*]] = bitcast <2 x i64> {{%.*}} to <16 x i8>
+ // OGCG: {{%.*}} = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %[[CASTED_VAR]], <16 x i8> %[[CASTED_VAR]], <16 x i8> splat (i8 1))
+ return _mm_roti_epi8(a, 1);
+}
+
+__m128i test_mm_roti_epi16(__m128i a) {
+ // CIR-LABEL: test_mm_roti_epi16
+ // CIR: {{%.*}} = cir.cast integral {{%.*}} : !{{[us]}}8i -> !u16i
+ // CIR: {{%.*}} = cir.vec.splat {{%.*}} : !{{[us]}}16i, !cir.vector<8 x !{{[us]}}16i>
+ // CIR: {{%.*}} = cir.call_llvm_intrinsic "fshl" {{.*}} : (!cir.vector<8 x !{{[su]}}16i>, !cir.vector<8 x !{{[su]}}16i>, !cir.vector<8 x !{{[su]}}16i>) -> !cir.vector<8 x !{{[su]}}16i>
+ // LLVM-LABEL: test_mm_roti_epi16
+ // LLVM: %[[CASTED_VAR:.*]] = bitcast <2 x i64> {{%.*}} to <8 x i16>
+ // LLVM: {{%.*}} = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %[[CASTED_VAR]], <8 x i16> %[[CASTED_VAR]], <8 x i16> splat (i16 50))
+ // OGCG-LABEL: test_mm_roti_epi16
+ // OGCG: %[[CASTED_VAR:.*]] = bitcast <2 x i64> {{%.*}} to <8 x i16>
+ // OGCG: {{%.*}} = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %[[CASTED_VAR]], <8 x i16> %[[CASTED_VAR]], <8 x i16> splat (i16 50))
+ return _mm_roti_epi16(a, 50);
+ }
+
+//NOTE: This only works as I expect for CIR but not for LLVMIR
+__m128i test_mm_roti_epi32(__m128i a) {
+ // CIR-LABEL: test_mm_roti_epi32
+ // CIR: {{%.*}} = cir.cast integral {{%.*}} : !{{[us]}}8i -> !u32i
+ // CIR: {{%.*}} = cir.vec.splat {{%.*}} : !{{[us]}}32i, !cir.vector<4 x !{{[us]}}32i>
+ // CIR: {{%.*}} = cir.call_llvm_intrinsic "fshl" {{.*}} : (!cir.vector<4 x !{{[su]}}32i>, !cir.vector<4 x !{{[su]}}32i>, !cir.vector<4 x !{{[su]}}32i>) -> !cir.vector<4 x !{{[su]}}32i>
+ return _mm_roti_epi32(a, -30);
+ }
+
+__m128i test_mm_roti_epi64(__m128i a) {
+ // CIR-LABEL: test_mm_roti_epi64
+ // CIR: {{%.*}} = cir.cast integral {{%.*}} : !{{[us]}}8i -> !u64i
+ // CIR: {{%.*}} = cir.vec.splat {{%.*}} : !{{.}}64i, !cir.vector<2 x !{{[us]}}64i>
+ // CIR: {{%.*}} = cir.call_llvm_intrinsic "fshl" {{.*}} : (!cir.vector<2 x !{{[su]}}64i>, !cir.vector<2 x !{{[su]}}64i>, !cir.vector<2 x !u64i>) -> !cir.vector<2 x !{{[su]}}64i>
+ // LLVM-LABEL: test_mm_roti_epi64
+ // LLVM: %[[VAR:.*]] = load <2 x i64>, ptr {{%.*}}, align 16
+ // LLVM: {{%.*}} = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %[[VAR]], <2 x i64> %[[VAR]], <2 x i64> splat (i64 100))
+ // OGCG-LABEL: test_mm_roti_epi64
+ // OGCG: %[[VAR:.*]] = load <2 x i64>, ptr {{%.*}}, align 16
+ // OGCG: {{%.*}} = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %[[VAR]], <2 x i64> %[[VAR]], <2 x i64> splat (i64 100))
+ return _mm_roti_epi64(a, 100);
+ }
|
|
@llvm/pr-subscribers-clangir Author: Omar Hossam (moar55) ChangesThis PR implements CodeGen for rotate builtins in CIR upstream. I couldn't figure out how to properly handle the case where the Full diff: https://github.com/llvm/llvm-project/pull/169566.diff 2 Files Affected:
diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
index e7aa8a234efd9..b52dc7b1f4b91 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
@@ -13,8 +13,10 @@
#include "CIRGenFunction.h"
#include "CIRGenModule.h"
+#include "mlir/IR/ValueRange.h"
#include "clang/Basic/Builtins.h"
#include "clang/Basic/TargetBuiltins.h"
+#include "clang/CIR/Dialect/IR/CIRTypes.h"
#include "clang/CIR/MissingFeatures.h"
using namespace clang;
@@ -90,6 +92,33 @@ static mlir::Value getMaskVecValue(CIRGenFunction &cgf, const CallExpr *expr,
return maskVec;
}
+static mlir::Value emitX86FunnelShift(CIRGenFunction &cgf, const CallExpr *e,
+ mlir::Value &op0, mlir::Value &op1,
+ mlir::Value &amt, bool isRight) {
+ auto ty = op0.getType();
+
+ // Amount may be scalar immediate, in which case create a splat vector.
+ // Funnel shifts amounts are treated as modulo and types are all power-of-2
+ // so we only care about the lowest log2 bits anyway.
+ if (amt.getType() != ty) {
+ auto vecTy = mlir::cast<cir::VectorType>(ty);
+
+ auto numElems = vecTy.getSize();
+ auto vecElemType = mlir::cast<cir::IntType>(vecTy.getElementType());
+ auto signlessType =
+ cir::IntType::get(&cgf.getMLIRContext(), vecElemType.getWidth(), false);
+ amt = cgf.getBuilder().createIntCast(amt, signlessType);
+
+ amt = cir::VecSplatOp::create(cgf.getBuilder(), cgf.getLoc(e->getExprLoc()),
+ cir::VectorType::get(signlessType, numElems),
+ amt);
+ }
+
+ const std::string intrinsicName = isRight ? "fshr" : "fshl";
+ return emitIntrinsicCallOp(cgf, e, intrinsicName, ty,
+ mlir::ValueRange{op0, op1, amt});
+}
+
mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID,
const CallExpr *expr) {
if (builtinID == Builtin::BI__builtin_cpu_is) {
@@ -661,12 +690,14 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID,
case X86::BI__builtin_ia32_prolq128:
case X86::BI__builtin_ia32_prolq256:
case X86::BI__builtin_ia32_prolq512:
+ return emitX86FunnelShift(*this, expr, ops[0], ops[0], ops[1], false);
case X86::BI__builtin_ia32_prord128:
case X86::BI__builtin_ia32_prord256:
case X86::BI__builtin_ia32_prord512:
case X86::BI__builtin_ia32_prorq128:
case X86::BI__builtin_ia32_prorq256:
case X86::BI__builtin_ia32_prorq512:
+ return emitX86FunnelShift(*this, expr, ops[0], ops[0], ops[1], true);
case X86::BI__builtin_ia32_selectb_128:
case X86::BI__builtin_ia32_selectb_256:
case X86::BI__builtin_ia32_selectb_512:
diff --git a/clang/test/CIR/CodeGen/X86/xop-builtins.c b/clang/test/CIR/CodeGen/X86/xop-builtins.c
new file mode 100644
index 0000000000000..c8ae5eb0fd82d
--- /dev/null
+++ b/clang/test/CIR/CodeGen/X86/xop-builtins.c
@@ -0,0 +1,79 @@
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +xop -emit-cir -o %t.cir
+// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +xop -fno-signed-char -emit-cir -o %t.cir
+// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
+
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +xop -fclangir -emit-llvm -o %t.ll
+// RUN: FileCheck --check-prefix=LLVM --input-file=%t.ll %s
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +xop -fno-signed-char -fclangir -emit-llvm -o %t.ll
+// RUN: FileCheck --check-prefix=LLVM --input-file=%t.ll %s
+
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +xop -emit-cir -o %t.cir
+// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +xop -fno-signed-char -emit-cir -o %t.cir
+// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
+
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +xop -fclangir -emit-llvm -o %t.ll
+// RUN: FileCheck --check-prefix=LLVM --input-file=%t.ll %s
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +xop -fno-signed-char -fclangir -emit-llvm -o %t.ll
+// RUN: FileCheck --check-prefix=LLVM --input-file=%t.ll %s
+
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +xop -emit-llvm -o - -Wall -Werror | FileCheck %s -check-prefix=OGCG
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +xop -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s -check-prefix=OGCG
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +xop -emit-llvm -o - -Wall -Werror | FileCheck %s -check-prefix=OGCG
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +xop -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s -check-prefix=OGCG
+
+#include <x86intrin.h>
+
+// This test mimics clang/test/CodeGen/X86/xop-builtins.c, which eventually
+// CIR shall be able to support fully.
+
+__m128i test_mm_roti_epi8(__m128i a) {
+ // CIR-LABEL: test_mm_roti_epi8
+ // CIR: {{%.*}} = cir.vec.splat {{%.*}} : !{{[us]}}8i, !cir.vector<16 x !{{[us]}}8i>
+ // CIR: {{%.*}} = cir.call_llvm_intrinsic "fshl" {{.*}} : (!cir.vector<16 x !{{[su]}}8i>, !cir.vector<16 x !{{[su]}}8i>, !cir.vector<16 x !{{[su]}}8i>) -> !cir.vector<16 x !{{[su]}}8i>
+ // LLVM-LABEL: test_mm_roti_epi8
+ // LLVM: %[[CASTED_VAR:.*]] = bitcast <2 x i64> {{%.*}} to <16 x i8>
+ // LLVM: {{%.*}} = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %[[CASTED_VAR]], <16 x i8> %[[CASTED_VAR]], <16 x i8> splat (i8 1))
+ // OGCG-LABEL: test_mm_roti_epi8
+ // OGCG: %[[CASTED_VAR:.*]] = bitcast <2 x i64> {{%.*}} to <16 x i8>
+ // OGCG: {{%.*}} = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %[[CASTED_VAR]], <16 x i8> %[[CASTED_VAR]], <16 x i8> splat (i8 1))
+ return _mm_roti_epi8(a, 1);
+}
+
+__m128i test_mm_roti_epi16(__m128i a) {
+ // CIR-LABEL: test_mm_roti_epi16
+ // CIR: {{%.*}} = cir.cast integral {{%.*}} : !{{[us]}}8i -> !u16i
+ // CIR: {{%.*}} = cir.vec.splat {{%.*}} : !{{[us]}}16i, !cir.vector<8 x !{{[us]}}16i>
+ // CIR: {{%.*}} = cir.call_llvm_intrinsic "fshl" {{.*}} : (!cir.vector<8 x !{{[su]}}16i>, !cir.vector<8 x !{{[su]}}16i>, !cir.vector<8 x !{{[su]}}16i>) -> !cir.vector<8 x !{{[su]}}16i>
+ // LLVM-LABEL: test_mm_roti_epi16
+ // LLVM: %[[CASTED_VAR:.*]] = bitcast <2 x i64> {{%.*}} to <8 x i16>
+ // LLVM: {{%.*}} = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %[[CASTED_VAR]], <8 x i16> %[[CASTED_VAR]], <8 x i16> splat (i16 50))
+ // OGCG-LABEL: test_mm_roti_epi16
+ // OGCG: %[[CASTED_VAR:.*]] = bitcast <2 x i64> {{%.*}} to <8 x i16>
+ // OGCG: {{%.*}} = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %[[CASTED_VAR]], <8 x i16> %[[CASTED_VAR]], <8 x i16> splat (i16 50))
+ return _mm_roti_epi16(a, 50);
+ }
+
+//NOTE: This only works as I expect for CIR but not for LLVMIR
+__m128i test_mm_roti_epi32(__m128i a) {
+ // CIR-LABEL: test_mm_roti_epi32
+ // CIR: {{%.*}} = cir.cast integral {{%.*}} : !{{[us]}}8i -> !u32i
+ // CIR: {{%.*}} = cir.vec.splat {{%.*}} : !{{[us]}}32i, !cir.vector<4 x !{{[us]}}32i>
+ // CIR: {{%.*}} = cir.call_llvm_intrinsic "fshl" {{.*}} : (!cir.vector<4 x !{{[su]}}32i>, !cir.vector<4 x !{{[su]}}32i>, !cir.vector<4 x !{{[su]}}32i>) -> !cir.vector<4 x !{{[su]}}32i>
+ return _mm_roti_epi32(a, -30);
+ }
+
+__m128i test_mm_roti_epi64(__m128i a) {
+ // CIR-LABEL: test_mm_roti_epi64
+ // CIR: {{%.*}} = cir.cast integral {{%.*}} : !{{[us]}}8i -> !u64i
+ // CIR: {{%.*}} = cir.vec.splat {{%.*}} : !{{.}}64i, !cir.vector<2 x !{{[us]}}64i>
+ // CIR: {{%.*}} = cir.call_llvm_intrinsic "fshl" {{.*}} : (!cir.vector<2 x !{{[su]}}64i>, !cir.vector<2 x !{{[su]}}64i>, !cir.vector<2 x !u64i>) -> !cir.vector<2 x !{{[su]}}64i>
+ // LLVM-LABEL: test_mm_roti_epi64
+ // LLVM: %[[VAR:.*]] = load <2 x i64>, ptr {{%.*}}, align 16
+ // LLVM: {{%.*}} = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %[[VAR]], <2 x i64> %[[VAR]], <2 x i64> splat (i64 100))
+ // OGCG-LABEL: test_mm_roti_epi64
+ // OGCG: %[[VAR:.*]] = load <2 x i64>, ptr {{%.*}}, align 16
+ // OGCG: {{%.*}} = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %[[VAR]], <2 x i64> %[[VAR]], <2 x i64> splat (i64 100))
+ return _mm_roti_epi64(a, 100);
+ }
|
a014064 to
e556764
Compare
| } | ||
| // Cast the unsigned `amt` to operand element type's width unsigned. | ||
| auto unsingedVecElemType = builder.getUIntNTy(vecElemTy.getWidth()); | ||
| amt = builder.createIntCast(amt, unsingedVecElemType); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't think you need the cast above. This should change the size and make it unsigned in one step.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I had to do this for it to be reflected in the LLVM IR. Otherwise I still get a negative value there 🤔
In the OGCG a passed negative amt is zero extended to be unsigned (and then const folded).
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
So passing amt -30 for instance when widening it and make it unsigned in one gave sth like this LLVMIR:
%5 = bitcast <2 x i64> %4 to <4 x i32>
%6 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %5, <4 x i32> %5, <4 x i32> splat (i32 -30))
when the OGCG should be:
%1 = bitcast <2 x i64> %0 to <4 x i32>
%2 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %1, <4 x i32> %1, <4 x i32> splat (i32 226))
8762ba4 to
ac53a36
Compare
🐧 Linux x64 Test Results
|
c816ccd to
3c9ec7b
Compare
AmrDeveloper
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
NIT
| } | ||
|
|
||
| const std::string intrinsicName = isRight ? "fshr" : "fshl"; | ||
| return emitIntrinsicCallOp(cgf.getBuilder(), location, intrinsicName, op0Ty, |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| return emitIntrinsicCallOp(cgf.getBuilder(), location, intrinsicName, op0Ty, | |
| return emitIntrinsicCallOp(builder, location, intrinsicName, op0Ty, |
AmrDeveloper
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit
| amt); | ||
| } | ||
|
|
||
| const std::string intrinsicName = isRight ? "fshr" : "fshl"; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| const std::string intrinsicName = isRight ? "fshr" : "fshl"; | |
| const StringRef intrinsicName = isRight ? "fshr" : "fshl"; |
This PR implements CodeGen for rotate builtins in CIR upstream.
Issue #167765