Skip to content
42 changes: 42 additions & 0 deletions clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,11 @@

#include "CIRGenFunction.h"
#include "CIRGenModule.h"
#include "mlir/IR/Location.h"
#include "mlir/IR/ValueRange.h"
#include "clang/Basic/Builtins.h"
#include "clang/Basic/TargetBuiltins.h"
#include "clang/CIR/Dialect/IR/CIRTypes.h"
#include "clang/CIR/MissingFeatures.h"

using namespace clang;
Expand Down Expand Up @@ -85,6 +88,41 @@ static mlir::Value getMaskVecValue(CIRGenBuilderTy &builder, mlir::Location loc,
return maskVec;
}

// Emit an x86 rotate/funnel shift as a call to the LLVM fshl/fshr intrinsic.
// `op0` supplies the high bits and `op1` the low bits (for rotates both are
// the same value); `amt` is the shift amount, which may be a scalar immediate
// and is then splatted to a vector matching op0's element count. Returns the
// shifted vector value.
static mlir::Value emitX86FunnelShift(CIRGenFunction &cgf,
                                      mlir::Location location, mlir::Value &op0,
                                      mlir::Value &op1, mlir::Value &amt,
                                      bool isRight) {
  CIRGenBuilderTy &builder = cgf.getBuilder();
  mlir::Type op0Ty = op0.getType();

  // Amount may be scalar immediate, in which case create a splat vector.
  // Funnel shifts amounts are treated as modulo and types are all power-of-2
  // so we only care about the lowest log2 bits anyway.
  if (amt.getType() != op0Ty) {
    auto vecTy = mlir::cast<cir::VectorType>(op0Ty);
    uint64_t numElems = vecTy.getSize();

    auto amtTy = mlir::cast<cir::IntType>(amt.getType());
    auto vecElemTy = mlir::cast<cir::IntType>(vecTy.getElementType());

    // Cast to same width unsigned if not already unsigned.
    if (amtTy.isSigned()) {
      cir::IntType unsignedAmtTy = builder.getUIntNTy(amtTy.getWidth());
      amt = builder.createIntCast(amt, unsignedAmtTy);
    }
    // Cast the unsigned `amt` to operand element type's width unsigned.
    cir::IntType unsignedVecElemType = builder.getUIntNTy(vecElemTy.getWidth());
    amt = builder.createIntCast(amt, unsignedVecElemType);
    amt = cir::VecSplatOp::create(
        builder, location, cir::VectorType::get(unsignedVecElemType, numElems),
        amt);
  }

  // fshl shifts left (rotate-left); fshr shifts right (rotate-right).
  // StringRef avoids a heap-allocating std::string for a literal name.
  const StringRef intrinsicName = isRight ? "fshr" : "fshl";
  return emitIntrinsicCallOp(builder, location, intrinsicName, op0Ty,
                             mlir::ValueRange{op0, op1, amt});
}

mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID,
const CallExpr *expr) {
if (builtinID == Builtin::BI__builtin_cpu_is) {
Expand Down Expand Up @@ -661,12 +699,16 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID,
case X86::BI__builtin_ia32_prolq128:
case X86::BI__builtin_ia32_prolq256:
case X86::BI__builtin_ia32_prolq512:
return emitX86FunnelShift(*this, getLoc(expr->getExprLoc()), ops[0], ops[0],
ops[1], false);
case X86::BI__builtin_ia32_prord128:
case X86::BI__builtin_ia32_prord256:
case X86::BI__builtin_ia32_prord512:
case X86::BI__builtin_ia32_prorq128:
case X86::BI__builtin_ia32_prorq256:
case X86::BI__builtin_ia32_prorq512:
return emitX86FunnelShift(*this, getLoc(expr->getExprLoc()), ops[0], ops[0],
ops[1], true);
case X86::BI__builtin_ia32_selectb_128:
case X86::BI__builtin_ia32_selectb_256:
case X86::BI__builtin_ia32_selectb_512:
Expand Down
33 changes: 33 additions & 0 deletions clang/test/CIR/CodeGenBuiltins/X86/avx512f-builtins.c
Original file line number Diff line number Diff line change
Expand Up @@ -77,3 +77,36 @@ __m512i test_mm512_undefined_epi32(void) {
// OGCG: ret <8 x i64> zeroinitializer
return _mm512_undefined_epi32();
}

// Rotate each 32-bit lane of __A right by the immediate 5; lowered to the
// LLVM fshr funnel-shift intrinsic with both data operands the same vector.
__m512i test_mm512_ror_epi32(__m512i __A) {
// CIR-LABEL: test_mm512_ror_epi32
// CIR: {{%.*}} = cir.cast integral {{%.*}} : !s32i -> !u32i
// CIR: {{%.*}} = cir.vec.splat {{%.*}} : !u32i, !cir.vector<16 x !u32i>
// CIR: {{%.*}} = cir.call_llvm_intrinsic "fshr" {{%.*}}: (!cir.vector<16 x !s32i>, !cir.vector<16 x !s32i>, !cir.vector<16 x !u32i>) -> !cir.vector<16 x !s32i>

// LLVM-LABEL: test_mm512_ror_epi32
// LLVM: %[[CASTED_VAR:.*]] = bitcast <8 x i64> {{%.*}} to <16 x i32>
// LLVM: {{%.*}} = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %[[CASTED_VAR]], <16 x i32> %[[CASTED_VAR]], <16 x i32> splat (i32 5))

// OGCG-LABEL: test_mm512_ror_epi32
// OGCG: %[[CASTED_VAR:.*]] = bitcast <8 x i64> {{%.*}} to <16 x i32>
// OGCG: {{%.*}} = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %[[CASTED_VAR]], <16 x i32> %[[CASTED_VAR]], <16 x i32> splat (i32 5))
return _mm512_ror_epi32(__A, 5);
}

// Rotate each 64-bit lane of __A right by the immediate 5. The s32 immediate
// is cast u32 -> u64 before being splatted to match the element width.
__m512i test_mm512_ror_epi64(__m512i __A) {
// CIR-LABEL: test_mm512_ror_epi64
// CIR: {{%.*}} = cir.cast integral {{%.*}} : !s32i -> !u32i
// CIR: {{%.*}} = cir.cast integral {{%.*}} : !u32i -> !u64i
// CIR: {{%.*}} = cir.vec.splat {{%.*}} : !u64i, !cir.vector<8 x !u64i>
// CIR: {{%.*}} = cir.call_llvm_intrinsic "fshr" {{%.*}}: (!cir.vector<8 x !s64i>, !cir.vector<8 x !s64i>, !cir.vector<8 x !u64i>) -> !cir.vector<8 x !s64i>

// LLVM-LABEL: test_mm512_ror_epi64
// LLVM: %[[VAR:.*]] = load <8 x i64>, ptr {{%.*}}, align 64
// LLVM: {{%.*}} = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %[[VAR]], <8 x i64> %[[VAR]], <8 x i64> splat (i64 5))

// OGCG-LABEL: test_mm512_ror_epi64
// OGCG: %[[VAR:.*]] = load <8 x i64>, ptr {{%.*}}, align 64
// OGCG: {{%.*}} = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %[[VAR]], <8 x i64> %[[VAR]], <8 x i64> splat (i64 5))
return _mm512_ror_epi64(__A, 5);
}
84 changes: 84 additions & 0 deletions clang/test/CIR/CodeGenBuiltins/X86/xop-builtins.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +xop -emit-cir -o %t.cir
// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +xop -fno-signed-char -emit-cir -o %t.cir
// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s

// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +xop -fclangir -emit-llvm -o %t.ll
// RUN: FileCheck --check-prefix=LLVM --input-file=%t.ll %s
// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +xop -fno-signed-char -fclangir -emit-llvm -o %t.ll
// RUN: FileCheck --check-prefix=LLVM --input-file=%t.ll %s

// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +xop -emit-cir -o %t.cir
// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +xop -fno-signed-char -emit-cir -o %t.cir
// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s

// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +xop -fclangir -emit-llvm -o %t.ll
// RUN: FileCheck --check-prefix=LLVM --input-file=%t.ll %s
// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +xop -fno-signed-char -fclangir -emit-llvm -o %t.ll
// RUN: FileCheck --check-prefix=LLVM --input-file=%t.ll %s

// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +xop -emit-llvm -o - -Wall -Werror | FileCheck %s -check-prefix=OGCG
// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +xop -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s -check-prefix=OGCG
// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +xop -emit-llvm -o - -Wall -Werror | FileCheck %s -check-prefix=OGCG
// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +xop -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s -check-prefix=OGCG

#include <x86intrin.h>

// This test mirrors clang/test/CodeGen/X86/xop-builtins.c, which ClangIR
// should eventually be able to support in full.

// XOP rotate-left of each 8-bit lane by 1; lowered to fshl with both data
// operands the same vector. The [us]8i regexes cover the -fno-signed-char runs.
__m128i test_mm_roti_epi8(__m128i a) {
// CIR-LABEL: test_mm_roti_epi8
// CIR: {{%.*}} = cir.vec.splat {{%.*}} : !{{[us]}}8i, !cir.vector<16 x !{{[us]}}8i>
// CIR: {{%.*}} = cir.call_llvm_intrinsic "fshl" {{.*}} : (!cir.vector<16 x !{{[su]}}8i>, !cir.vector<16 x !{{[su]}}8i>, !cir.vector<16 x !{{[su]}}8i>) -> !cir.vector<16 x !{{[su]}}8i>
// LLVM-LABEL: test_mm_roti_epi8
// LLVM: %[[CASTED_VAR:.*]] = bitcast <2 x i64> {{%.*}} to <16 x i8>
// LLVM: {{%.*}} = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %[[CASTED_VAR]], <16 x i8> %[[CASTED_VAR]], <16 x i8> splat (i8 1))
// OGCG-LABEL: test_mm_roti_epi8
// OGCG: %[[CASTED_VAR:.*]] = bitcast <2 x i64> {{%.*}} to <16 x i8>
// OGCG: {{%.*}} = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %[[CASTED_VAR]], <16 x i8> %[[CASTED_VAR]], <16 x i8> splat (i8 1))
return _mm_roti_epi8(a, 1);
}

// XOP rotate-left of each 16-bit lane by 50; the u8 immediate is widened to
// the element width (u16) before the splat.
__m128i test_mm_roti_epi16(__m128i a) {
// CIR-LABEL: test_mm_roti_epi16
// CIR: {{%.*}} = cir.cast integral {{%.*}} : !u8i -> !u16i
// CIR: {{%.*}} = cir.vec.splat {{%.*}} : !{{[us]}}16i, !cir.vector<8 x !u16i>
// CIR: {{%.*}} = cir.call_llvm_intrinsic "fshl" {{.*}} : (!cir.vector<8 x !{{[su]}}16i>, !cir.vector<8 x !{{[su]}}16i>, !cir.vector<8 x !u16i>) -> !cir.vector<8 x !{{[su]}}16i>
// LLVM-LABEL: test_mm_roti_epi16
// LLVM: %[[CASTED_VAR:.*]] = bitcast <2 x i64> {{%.*}} to <8 x i16>
// LLVM: {{%.*}} = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %[[CASTED_VAR]], <8 x i16> %[[CASTED_VAR]], <8 x i16> splat (i16 50))
// OGCG-LABEL: test_mm_roti_epi16
// OGCG: %[[CASTED_VAR:.*]] = bitcast <2 x i64> {{%.*}} to <8 x i16>
// OGCG: {{%.*}} = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %[[CASTED_VAR]], <8 x i16> %[[CASTED_VAR]], <8 x i16> splat (i16 50))
return _mm_roti_epi16(a, 50);
}

// XOP rotate-left of each 32-bit lane by -30; the immediate is truncated to
// imm8 first, so -30 becomes 226 in the splat below.
__m128i test_mm_roti_epi32(__m128i a) {
// CIR-LABEL: test_mm_roti_epi32
// CIR: {{%.*}} = cir.cast integral {{%.*}} : !u8i -> !u32i
// CIR: {{%.*}} = cir.vec.splat {{%.*}} : !{{[us]}}32i, !cir.vector<4 x !u32i>
// CIR: {{%.*}} = cir.call_llvm_intrinsic "fshl" {{.*}} : (!cir.vector<4 x !{{[su]}}32i>, !cir.vector<4 x !{{[su]}}32i>, !cir.vector<4 x !u32i>) -> !cir.vector<4 x !{{[su]}}32i>
// LLVM-LABEL: test_mm_roti_epi32
// LLVM: %[[CASTED_VAR:.*]] = bitcast <2 x i64> {{%.*}} to <4 x i32>
// LLVM: {{%.*}} = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %[[CASTED_VAR]], <4 x i32> %[[CASTED_VAR]], <4 x i32> splat (i32 226))
// OGCG-LABEL: test_mm_roti_epi32
// OGCG: %[[CASTED_VAR:.*]] = bitcast <2 x i64> {{%.*}} to <4 x i32>
// OGCG: {{%.*}} = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %[[CASTED_VAR]], <4 x i32> %[[CASTED_VAR]], <4 x i32> splat (i32 226))
return _mm_roti_epi32(a, -30);
}

// XOP rotate-left of each 64-bit lane by 100 (fshl with both data operands
// equal). The intrinsic result pattern uses the !{{[su]}}64i regex for
// consistency with the other roti tests in this file.
__m128i test_mm_roti_epi64(__m128i a) {
// CIR-LABEL: test_mm_roti_epi64
// CIR: {{%.*}} = cir.cast integral {{%.*}} : !u8i -> !u64i
// CIR: {{%.*}} = cir.vec.splat {{%.*}} : !u64i, !cir.vector<2 x !u64i>
// CIR: {{%.*}} = cir.call_llvm_intrinsic "fshl" {{.*}} : (!cir.vector<2 x !{{[su]}}64i>, !cir.vector<2 x !{{[su]}}64i>, !cir.vector<2 x !u64i>) -> !cir.vector<2 x !{{[su]}}64i>
// LLVM-LABEL: test_mm_roti_epi64
// LLVM: %[[VAR:.*]] = load <2 x i64>, ptr {{%.*}}, align 16
// LLVM: {{%.*}} = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %[[VAR]], <2 x i64> %[[VAR]], <2 x i64> splat (i64 100))
// OGCG-LABEL: test_mm_roti_epi64
// OGCG: %[[VAR:.*]] = load <2 x i64>, ptr {{%.*}}, align 16
// OGCG: {{%.*}} = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %[[VAR]], <2 x i64> %[[VAR]], <2 x i64> splat (i64 100))
return _mm_roti_epi64(a, 100);
}