[CIR] Implement x86 rotate builtins

moar55 · moar55 · commit a35f3cab846a · 2025-12-02T22:36:15.000+01:00
diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
@@ -11,10 +11,14 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "CIRGenBuilder.h"
 #include "CIRGenFunction.h"
 #include "CIRGenModule.h"
+#include "mlir/IR/Location.h"
+#include "mlir/IR/ValueRange.h"
 #include "clang/Basic/Builtins.h"
 #include "clang/Basic/TargetBuiltins.h"
+#include "clang/CIR/Dialect/IR/CIRTypes.h"
 #include "clang/CIR/MissingFeatures.h"
 
 using namespace clang;
@@ -168,6 +172,40 @@ static mlir::Value emitVecInsert(CIRGenBuilderTy &builder, mlir::Location loc,
   return cir::VecInsertOp::create(builder, loc, vec, value, indexVal);
 }
 
+static mlir::Value emitX86FunnelShift(CIRGenBuilderTy &builder,
+                                      mlir::Location location, mlir::Value &op0,
+                                      mlir::Value &op1, mlir::Value &amt,
+                                      bool isRight) {
+  mlir::Type op0Ty = op0.getType();
+
+  // Amount may be scalar immediate, in which case create a splat vector.
+  // Funnel shifts amounts are treated as modulo and types are all power-of-2
+  // so we only care about the lowest log2 bits anyway.
+  if (amt.getType() != op0Ty) {
+    auto vecTy = mlir::cast<cir::VectorType>(op0Ty);
+    uint64_t numElems = vecTy.getSize();
+
+    auto amtTy = mlir::cast<cir::IntType>(amt.getType());
+    auto vecElemTy = mlir::cast<cir::IntType>(vecTy.getElementType());
+
+    // Cast to same width unsigned if not already unsigned.
+    if (amtTy.isSigned()) {
+      cir::IntType unsignedAmtTy = builder.getUIntNTy(amtTy.getWidth());
+      amt = builder.createIntCast(amt, unsignedAmtTy);
+    }
+    // Cast the unsigned `amt` to operand element type's width unsigned.
+    cir::IntType unsignedVecElemType = builder.getUIntNTy(vecElemTy.getWidth());
+    amt = builder.createIntCast(amt, unsignedVecElemType);
+    amt = cir::VecSplatOp::create(
+        builder, location, cir::VectorType::get(unsignedVecElemType, numElems),
+        amt);
+  }
+
+  const StringRef intrinsicName = isRight ? "fshr" : "fshl";
+  return emitIntrinsicCallOp(builder, location, intrinsicName, op0Ty,
+                             mlir::ValueRange{op0, op1, amt});
+}
+
 mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID,
                                                const CallExpr *expr) {
   if (builtinID == Builtin::BI__builtin_cpu_is) {
@@ -842,12 +880,16 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID,
   case X86::BI__builtin_ia32_prolq128:
   case X86::BI__builtin_ia32_prolq256:
   case X86::BI__builtin_ia32_prolq512:
+    return emitX86FunnelShift(this->getBuilder(), getLoc(expr->getExprLoc()),
+                              ops[0], ops[0], ops[1], false);
   case X86::BI__builtin_ia32_prord128:
   case X86::BI__builtin_ia32_prord256:
   case X86::BI__builtin_ia32_prord512:
   case X86::BI__builtin_ia32_prorq128:
   case X86::BI__builtin_ia32_prorq256:
   case X86::BI__builtin_ia32_prorq512:
+    return emitX86FunnelShift(this->getBuilder(), getLoc(expr->getExprLoc()),
+                              ops[0], ops[0], ops[1], true);
   case X86::BI__builtin_ia32_selectb_128:
   case X86::BI__builtin_ia32_selectb_256:
   case X86::BI__builtin_ia32_selectb_512:
diff --git a/clang/test/CIR/CodeGenBuiltins/X86/avx512f-builtins.c b/clang/test/CIR/CodeGenBuiltins/X86/avx512f-builtins.c
@@ -18,12 +18,12 @@
 __m512 test_mm512_undefined(void) {
   // CIR-LABEL: _mm512_undefined
   // CIR: %[[A:.*]] = cir.const #cir.zero : !cir.vector<8 x !cir.double>
-  // CIR: %{{.*}} = cir.cast bitcast %[[A]] : !cir.vector<8 x !cir.double> -> !cir.vector<16 x !cir.float>
+  // CIR: cir.cast bitcast %[[A]] : !cir.vector<8 x !cir.double> -> !cir.vector<16 x !cir.float>
   // CIR: cir.return %{{.*}} : !cir.vector<16 x !cir.float>
 
   // LLVM-LABEL: test_mm512_undefined
   // LLVM: store <16 x float> zeroinitializer, ptr %[[A:.*]], align 64
-  // LLVM: %{{.*}} = load <16 x float>, ptr %[[A]], align 64
+  // LLVM: load <16 x float>, ptr %[[A]], align 64
   // LLVM: ret <16 x float> %{{.*}}
 
   // OGCG-LABEL: test_mm512_undefined
@@ -34,12 +34,12 @@ __m512 test_mm512_undefined(void) {
 __m512 test_mm512_undefined_ps(void) {
   // CIR-LABEL: _mm512_undefined_ps
   // CIR: %[[A:.*]] = cir.const #cir.zero : !cir.vector<8 x !cir.double>
-  // CIR: %{{.*}} = cir.cast bitcast %[[A]] : !cir.vector<8 x !cir.double> -> !cir.vector<16 x !cir.float>
+  // CIR: cir.cast bitcast %[[A]] : !cir.vector<8 x !cir.double> -> !cir.vector<16 x !cir.float>
   // CIR: cir.return %{{.*}} : !cir.vector<16 x !cir.float>
 
   // LLVM-LABEL: test_mm512_undefined_ps
   // LLVM: store <16 x float> zeroinitializer, ptr %[[A:.*]], align 64
-  // LLVM: %{{.*}} = load <16 x float>, ptr %[[A]], align 64
+  // LLVM: load <16 x float>, ptr %[[A]], align 64
   // LLVM: ret <16 x float> %{{.*}}
 
   // OGCG-LABEL: test_mm512_undefined_ps
@@ -49,12 +49,12 @@ __m512 test_mm512_undefined_ps(void) {
 
 __m512d test_mm512_undefined_pd(void) {
   // CIR-LABEL: _mm512_undefined_pd
-  // CIR: %{{.*}} = cir.const #cir.zero : !cir.vector<8 x !cir.double>
+  // CIR: cir.const #cir.zero : !cir.vector<8 x !cir.double>
   // CIR: cir.return %{{.*}} : !cir.vector<8 x !cir.double>
 
   // LLVM-LABEL: test_mm512_undefined_pd
   // LLVM: store <8 x double> zeroinitializer, ptr %[[A:.*]], align 64
-  // LLVM: %{{.*}} = load <8 x double>, ptr %[[A]], align 64
+  // LLVM: load <8 x double>, ptr %[[A]], align 64
   // LLVM: ret <8 x double> %{{.*}}
 
   // OGCG-LABEL: test_mm512_undefined_pd
@@ -64,13 +64,13 @@ __m512d test_mm512_undefined_pd(void) {
 
 __m512i test_mm512_undefined_epi32(void) {
   // CIR-LABEL: _mm512_undefined_epi32
-  // CIR: %[[A:.*]] = cir.const #cir.zero : !cir.vector<8 x !cir.double>
-  // CIR: %{{.*}} = cir.cast bitcast %[[A]] : !cir.vector<8 x !cir.double> -> !cir.vector<8 x !s64i>
+  // CIR: cir.const #cir.zero : !cir.vector<8 x !cir.double>
+  // CIR: cir.cast bitcast %[[A]] : !cir.vector<8 x !cir.double> -> !cir.vector<8 x !s64i>
   // CIR: cir.return %{{.*}} : !cir.vector<8 x !s64i>
 
   // LLVM-LABEL: test_mm512_undefined_epi32
   // LLVM: store <8 x i64> zeroinitializer, ptr %[[A:.*]], align 64
-  // LLVM: %{{.*}} = load <8 x i64>, ptr %[[A]], align 64
+  // LLVM: load <8 x i64>, ptr %[[A]], align 64
   // LLVM: ret <8 x i64> %{{.*}}
 
   // OGCG-LABEL: test_mm512_undefined_epi32
@@ -446,3 +446,36 @@ __m512i test_mm512_mask_i32gather_epi64(__m512i __v1_old, __mmask8 __mask, __m25
   // OGCG: call <8 x i64> @llvm.x86.avx512.mask.gather.dpq.512
   return _mm512_mask_i32gather_epi64(__v1_old, __mask, __index, __addr, 2);
 }
+
+__m512i test_mm512_ror_epi32(__m512i __A) {
+  // CIR-LABEL: test_mm512_ror_epi32
+  // CIR: cir.cast integral %{{.*}} : !s32i -> !u32i
+  // CIR: cir.vec.splat %{{.*}} : !u32i, !cir.vector<16 x !u32i>
+  // CIR: cir.call_llvm_intrinsic "fshr" {{%.*}}: (!cir.vector<16 x !s32i>, !cir.vector<16 x !s32i>, !cir.vector<16 x !u32i>) -> !cir.vector<16 x !s32i> 
+
+  // LLVM-LABEL: test_mm512_ror_epi32
+  // LLVM: %[[CASTED_VAR:.*]] = bitcast <8 x i64> %{{.*}} to <16 x i32>
+  // LLVM: call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %[[CASTED_VAR]], <16 x i32> %[[CASTED_VAR]], <16 x i32> splat (i32 5))
+
+  // OGCG-LABEL: test_mm512_ror_epi32
+  // OGCG: %[[CASTED_VAR:.*]] = bitcast <8 x i64> %{{.*}} to <16 x i32>
+  // OGCG: call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %[[CASTED_VAR]], <16 x i32> %[[CASTED_VAR]], <16 x i32> splat (i32 5))
+  return _mm512_ror_epi32(__A, 5); 
+}
+
+__m512i test_mm512_ror_epi64(__m512i __A) {
+  // CIR-LABEL: test_mm512_ror_epi64
+  // CIR: cir.cast integral %{{.*}} : !s32i -> !u32i
+  // CIR: cir.cast integral %{{.*}} : !u32i -> !u64i
+  // CIR: cir.vec.splat %{{.*}} : !u64i, !cir.vector<8 x !u64i>
+  // CIR: cir.call_llvm_intrinsic "fshr" {{%.*}}: (!cir.vector<8 x !s64i>, !cir.vector<8 x !s64i>, !cir.vector<8 x !u64i>) -> !cir.vector<8 x !s64i> 
+
+  // LLVM-LABEL: test_mm512_ror_epi64
+  // LLVM: %[[VAR:.*]] = load <8 x i64>, ptr {{%.*}}, align 64
+  // LLVM: call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %[[VAR]], <8 x i64> %[[VAR]], <8 x i64> splat (i64 5))
+
+  // OGCG-LABEL: test_mm512_ror_epi64
+  // OGCG: %[[VAR:.*]] = load <8 x i64>, ptr {{%.*}}, align 64
+  // OGCG: call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %[[VAR]], <8 x i64> %[[VAR]], <8 x i64> splat (i64 5))
+  return _mm512_ror_epi64(__A, 5); 
+}
diff --git a/clang/test/CIR/CodeGenBuiltins/X86/xop-builtins.c b/clang/test/CIR/CodeGenBuiltins/X86/xop-builtins.c
@@ -0,0 +1,92 @@
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +xop -emit-cir -o %t.cir
+// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +xop -fno-signed-char -emit-cir -o %t.cir
+// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
+
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +xop -fclangir -emit-llvm -o %t.ll
+// RUN: FileCheck --check-prefix=LLVM --input-file=%t.ll %s
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +xop -fno-signed-char -fclangir -emit-llvm -o %t.ll
+// RUN: FileCheck --check-prefix=LLVM --input-file=%t.ll %s
+
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +xop -emit-cir -o %t.cir
+// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +xop -fno-signed-char -emit-cir -o %t.cir
+// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
+
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +xop -fclangir -emit-llvm -o %t.ll
+// RUN: FileCheck --check-prefix=LLVM --input-file=%t.ll %s
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +xop -fno-signed-char -fclangir -emit-llvm -o %t.ll
+// RUN: FileCheck --check-prefix=LLVM --input-file=%t.ll %s
+
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +xop -emit-llvm -o - -Wall -Werror | FileCheck %s -check-prefix=OGCG
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +xop -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s -check-prefix=OGCG
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +xop -emit-llvm -o - -Wall -Werror | FileCheck %s -check-prefix=OGCG
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +xop -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s -check-prefix=OGCG
+
+#include <x86intrin.h>
+
+// This test mimics clang/test/CodeGen/X86/xop-builtins.c, which eventually
+// CIR shall be able to support fully.
+
+__m128i test_mm_roti_epi8(__m128i a) {
+  // CIR-LABEL: test_mm_roti_epi8
+  // CIR: cir.vec.splat %{{.*}} : !{{[us]}}8i, !cir.vector<16 x !{{[us]}}8i> 
+  // CIR: cir.call_llvm_intrinsic "fshl" {{.*}} : (!cir.vector<16 x !{{[su]}}8i>, !cir.vector<16 x !{{[su]}}8i>, !cir.vector<16 x !{{[su]}}8i>) -> !cir.vector<16 x !{{[su]}}8i> 
+  
+  // LLVM-LABEL: test_mm_roti_epi8
+  // LLVM: %[[CASTED_VAR:.*]] = bitcast <2 x i64> %{{.*}} to <16 x i8>
+  // LLVM: call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %[[CASTED_VAR]], <16 x i8> %[[CASTED_VAR]], <16 x i8> splat (i8 1))
+  
+  // OGCG-LABEL: test_mm_roti_epi8
+  // OGCG: %[[CASTED_VAR:.*]] = bitcast <2 x i64> %{{.*}} to <16 x i8>
+  // OGCG: call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %[[CASTED_VAR]], <16 x i8> %[[CASTED_VAR]], <16 x i8> splat (i8 1))
+  return _mm_roti_epi8(a, 1);
+}
+
+__m128i test_mm_roti_epi16(__m128i a) {
+  // CIR-LABEL: test_mm_roti_epi16
+  // CIR: cir.cast integral %{{.*}} : !u8i -> !u16i
+  // CIR: cir.vec.splat %{{.*}} : !{{[us]}}16i, !cir.vector<8 x !u16i> 
+  // CIR: cir.call_llvm_intrinsic "fshl" {{.*}} : (!cir.vector<8 x !{{[su]}}16i>, !cir.vector<8 x !{{[su]}}16i>, !cir.vector<8 x !u16i>) -> !cir.vector<8 x !{{[su]}}16i> 
+  
+  // LLVM-LABEL: test_mm_roti_epi16
+  // LLVM: %[[CASTED_VAR:.*]] = bitcast <2 x i64> %{{.*}} to <8 x i16>
+  // LLVM: call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %[[CASTED_VAR]], <8 x i16> %[[CASTED_VAR]], <8 x i16> splat (i16 50))
+  
+  // OGCG-LABEL: test_mm_roti_epi16
+  // OGCG: %[[CASTED_VAR:.*]] = bitcast <2 x i64> %{{.*}} to <8 x i16>
+  // OGCG: call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %[[CASTED_VAR]], <8 x i16> %[[CASTED_VAR]], <8 x i16> splat (i16 50))
+  return _mm_roti_epi16(a, 50);
+ }
+
+__m128i test_mm_roti_epi32(__m128i a) {
+  // CIR-LABEL: test_mm_roti_epi32
+  // CIR: cir.cast integral %{{.*}} : !u8i -> !u32i
+  // CIR: cir.vec.splat %{{.*}} : !{{[us]}}32i, !cir.vector<4 x !u32i> 
+  // CIR: cir.call_llvm_intrinsic "fshl" {{.*}} : (!cir.vector<4 x !{{[su]}}32i>, !cir.vector<4 x !{{[su]}}32i>, !cir.vector<4 x !u32i>) -> !cir.vector<4 x !{{[su]}}32i> 
+  
+  // LLVM-LABEL: test_mm_roti_epi32
+  // LLVM: %[[CASTED_VAR:.*]] = bitcast <2 x i64> %{{.*}} to <4 x i32>
+  // LLVM: call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %[[CASTED_VAR]], <4 x i32> %[[CASTED_VAR]], <4 x i32> splat (i32 226))
+  
+  // OGCG-LABEL: test_mm_roti_epi32
+  // OGCG: %[[CASTED_VAR:.*]] = bitcast <2 x i64> %{{.*}} to <4 x i32>
+  // OGCG: call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %[[CASTED_VAR]], <4 x i32> %[[CASTED_VAR]], <4 x i32> splat (i32 226))
+  return _mm_roti_epi32(a, -30);
+ }
+
+__m128i test_mm_roti_epi64(__m128i a) {
+  // CIR-LABEL: test_mm_roti_epi64
+  // CIR: cir.cast integral %{{.*}} : !u8i -> !u64i
+  // CIR: cir.vec.splat %{{.*}} : !u64i, !cir.vector<2 x !u64i> 
+  // CIR: cir.call_llvm_intrinsic "fshl" {{.*}} : (!cir.vector<2 x !{{[su]}}64i>, !cir.vector<2 x !{{[su]}}64i>, !cir.vector<2 x !u64i>) -> !cir.vector<2 x !s64i> 
+  
+  // LLVM-LABEL: test_mm_roti_epi64
+  // LLVM: %[[VAR:.*]] = load <2 x i64>, ptr %{{.*}}, align 16
+  // LLVM: call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %[[VAR]], <2 x i64> %[[VAR]], <2 x i64> splat (i64 100))
+  
+  // OGCG-LABEL: test_mm_roti_epi64
+  // OGCG: %[[VAR:.*]] = load <2 x i64>, ptr %{{.*}}, align 16
+  // OGCG: call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %[[VAR]], <2 x i64> %[[VAR]], <2 x i64> splat (i64 100))
+  return _mm_roti_epi64(a, 100);
+ }
diff --git a/shell.nix b/shell.nix
@@ -0,0 +1,14 @@
+let
+  nixpkgs = fetchTarball "https://github.com/NixOS/nixpkgs/tarball/nixos-24.05";
+  pkgs = import nixpkgs { config = {}; overlays = []; };
+in
+
+
+pkgs.mkShellNoCC {
+  packages = with pkgs; [
+    cmake
+    ninja
+    llvmPackages_latest.llvm
+  ];
+stdenv = pkgs.clangStdenv;
+}