[CIR] Implement __builtin_ia32_cmpnleps/cmpnlepd (#1893)

badumbatish · web-flow · commit 9fc2a9151349 · 2025-09-22T16:03:44.000-07:00
Fixes #1818 - Implement createVecCompare, getCIRIntOrFloatBitWidth, getVectorFCmpIR helper for VecCmp op creation. - Add clang/test/CIR/CodeGen/builtin-fcmp-sse.c test. in OG, there is a sext from bool to int before casting to float vector since fcmp's result in llvm ir is boolean-like, while VecCmpOp in CIR returns int in the form of 0 or -1. There is also a boolean `shouldInvert` in CIR since CIR doesn't contain optimized unordered comparison, for example: OLE is the inverse predicate of UGT. So if we need UGT, we have to pass in OLE and `shouldInvert = true`
diff --git a/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h b/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h
@@ -89,6 +89,14 @@ class CIRBaseBuilderTy : public mlir::OpBuilder {
     return cir::IntType::get(getContext(), N, false);
   }
 
+  static unsigned getCIRIntOrFloatBitWidth(mlir::Type eltTy) {
+    if (auto intType = mlir::dyn_cast<cir::IntTypeInterface>(eltTy))
+      return intType.getWidth();
+    if (auto floatType = mlir::dyn_cast<cir::FPTypeInterface>(eltTy))
+      return floatType.getWidth();
+
+    llvm_unreachable("Wrong type passed in or Non-CIR type passed in");
+  }
   cir::IntType getSIntNTy(int N) {
     return cir::IntType::get(getContext(), N, true);
   }
@@ -188,6 +196,16 @@ class CIRBaseBuilderTy : public mlir::OpBuilder {
     return cir::CmpOp::create(*this, loc, getBoolTy(), kind, lhs, rhs);
   }
 
+  cir::VecCmpOp createVecCompare(mlir::Location loc, cir::CmpOpKind kind,
+                                 mlir::Value lhs, mlir::Value rhs) {
+    VectorType vecCast = mlir::cast<VectorType>(lhs.getType());
+    auto integralTy =
+        getSIntNTy(getCIRIntOrFloatBitWidth(vecCast.getElementType()));
+    VectorType integralVecTy =
+        VectorType::get(context, integralTy, vecCast.getSize());
+    return cir::VecCmpOp::create(*this, loc, integralVecTy, kind, lhs, rhs);
+  }
+
   mlir::Value createIsNaN(mlir::Location loc, mlir::Value operand) {
     return createCompare(loc, cir::CmpOpKind::ne, operand, operand);
   }
diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
@@ -306,6 +306,44 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned BuiltinID,
     Ops.push_back(emitScalarOrConstFoldImmArg(ICEArguments, i, E));
   }
 
+  // OG has unordered comparison as a form of optimization in addition to
+  // ordered comparison, while CIR doesn't.
+  //
+  // This means that we can't encode the comparison code of UGT (unordered
+  // greater than), at least not at the CIR level.
+  //
+  // The boolean shouldInvert compensates for this.
+  // For example: to get to the comparison code UGT, we pass in
+  // getVectorFCmpIR(OLE, shouldInvert = true) since OLE is the inverse of UGT.
+
+  // There are several ways to support this otherwise:
+  // - register extra CmpOpKind for unordered comparison types and build the
+  // translation code for
+  //    to go from CIR -> LLVM dialect. Notice we get this naturally with
+  //    shouldInvert, benefiting from existing infrastructure, albeit having to
+  //    generate an extra `not` at CIR).
+  // - Just add extra comparison code to a new VecCmpOpKind instead of
+  // cluttering CmpOpKind.
+  // - Add a boolean in VecCmpOp to indicate if it's doing unordered or ordered
+  // comparison
+  // - Just emit the intrinsics call instead of calling this helper, see how the
+  // LLVM lowering handles this.
+  auto getVectorFCmpIR = [this, &Ops, &E](cir::CmpOpKind pred,
+                                          bool shouldInvert, bool isSignaling) {
+    assert(!cir::MissingFeatures::CGFPOptionsRAII());
+    auto loc = getLoc(E->getExprLoc());
+    mlir::Value cmp;
+    if (builder.getIsFPConstrained())
+      // TODO: Add isSignaling boolean once emitConstrainedFPCall implemented
+      assert(cir::MissingFeatures::emitConstrainedFPCall());
+    else
+      cmp = builder.createVecCompare(loc, pred, Ops[0], Ops[1]);
+
+    mlir::Value bitCast = builder.createBitcast(
+        shouldInvert ? builder.createNot(cmp) : cmp, Ops[0].getType());
+    return bitCast;
+  };
+
   switch (BuiltinID) {
   default:
     return nullptr;
@@ -1705,7 +1743,8 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned BuiltinID,
     llvm_unreachable("cmpnltps NYI");
   case X86::BI__builtin_ia32_cmpnleps:
   case X86::BI__builtin_ia32_cmpnlepd:
-    llvm_unreachable("cmpnleps NYI");
+    return getVectorFCmpIR(cir::CmpOpKind::le, /*shouldInvert=*/true,
+                           /*isSignaling=*/true);
   case X86::BI__builtin_ia32_cmpordps:
   case X86::BI__builtin_ia32_cmpordpd:
     llvm_unreachable("cmpordps NYI");
diff --git a/clang/test/CIR/CodeGen/builtin-fcmp-sse.c b/clang/test/CIR/CodeGen/builtin-fcmp-sse.c
@@ -0,0 +1,55 @@
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -O2 -fclangir -emit-cir %s -o -  | FileCheck %s --check-prefix=CIR
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -O2 -fclangir -emit-llvm %s -o - | FileCheck %s -check-prefix=LLVM
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -O2 -emit-llvm %s -o - | FileCheck %s -check-prefix=OG
+
+typedef float __m128 __attribute__((__vector_size__(16), __aligned__(16)));
+typedef double __m128d __attribute__((__vector_size__(16), __aligned__(16)));
+
+__m128 test_cmpnleps(__m128 A, __m128 B) {
+
+  // CIR-LABEL: @test_cmpnleps
+  // CIR: [[CMP:%.*]] = cir.vec.cmp(le, [[A:%.*]], [[B:%.*]]) : !cir.vector<!cir.float x 4>, !cir.vector<!s32i x 4>
+  // CIR: [[NOTCMP:%.*]] = cir.unary(not, [[CMP]]) : !cir.vector<!s32i x 4>, !cir.vector<!s32i x 4>
+  // CIR-NEXT: [[CAST:%.*]] = cir.cast(bitcast, [[NOTCMP:%.*]] : !cir.vector<!s32i x 4>), !cir.vector<!cir.float x 4>
+  // CIR-NEXT: cir.store [[CAST]], [[ALLOCA:%.*]] :  !cir.vector<!cir.float x 4>, !cir.ptr<!cir.vector<!cir.float x 4>>
+  // CIR-NEXT: [[LD:%.*]] = cir.load [[ALLOCA]] :
+  // CIR-NEXT: cir.return [[LD]] : !cir.vector<!cir.float x 4>
+
+  // LLVM-LABEL: test_cmpnleps
+  // LLVM: [[CMP:%.*]] = fcmp ugt <4 x float> {{.*}}, {{.*}}
+  // LLVM-NEXT: [[SEXT:%.*]] = sext <4 x i1> [[CMP]] to <4 x i32>
+  // LLVM-NEXT: [[CAST:%.*]] = bitcast <4 x i32> [[SEXT]] to <4 x float>
+  // LLVM-NEXT: ret <4 x float> [[CAST]]
+
+  // OG-LABEL: test_cmpnleps
+  // OG: [[CMP:%.*]] = fcmp ugt <4 x float> {{.*}}, {{.*}}
+  // OG-NEXT: [[SEXT:%.*]] = sext <4 x i1> [[CMP]] to <4 x i32>
+  // OG-NEXT: [[CAST:%.*]] = bitcast <4 x i32> [[SEXT]] to <4 x float>
+  // OG-NEXT: ret <4 x float> [[CAST]]
+  return __builtin_ia32_cmpnleps(A, B);
+}
+
+
+__m128d test_cmpnlepd(__m128d A, __m128d B) {
+
+  // CIR-LABEL: @test_cmpnlepd
+  // CIR: [[CMP:%.*]] = cir.vec.cmp(le, [[A:%.*]], [[B:%.*]]) :  !cir.vector<!cir.double x 2>, !cir.vector<!s64i x 2>
+  // CIR-NEXT: [[NOTCMP:%.*]] = cir.unary(not, [[CMP]]) : !cir.vector<!s64i x 2>, !cir.vector<!s64i x 2>
+  // CIR-NEXT: [[CAST:%.*]] = cir.cast(bitcast, [[NOTCMP]] :  !cir.vector<!s64i x 2>), !cir.vector<!cir.double x 2>
+  // CIR-NEXT: cir.store [[CAST]], [[ALLOCA:%.*]] : !cir.vector<!cir.double x 2>, !cir.ptr<!cir.vector<!cir.double x 2>>
+  // CIR-NEXT: [[LD:%.*]] = cir.load [[ALLOCA]] :
+  // CIR-NEXT: cir.return [[LD]] : !cir.vector<!cir.double x 2>
+
+  // LLVM-LABEL: test_cmpnlepd
+  // LLVM: [[CMP:%.*]] = fcmp ugt <2 x double> {{.*}}, {{.*}}
+  // LLVM-NEXT: [[SEXT:%.*]] = sext <2 x i1> [[CMP]] to <2 x i64>
+  // LLVM-NEXT: [[CAST:%.*]] = bitcast <2 x i64> [[SEXT]] to <2 x double>
+  // LLVM-NEXT: ret <2 x double> [[CAST]]
+
+  // OG-LABEL: test_cmpnlepd
+  // OG: [[CMP:%.*]] = fcmp ugt <2 x double> {{.*}}, {{.*}}
+  // OG-NEXT: [[SEXT:%.*]] = sext <2 x i1> [[CMP]] to <2 x i64>
+  // OG-NEXT: [[CAST:%.*]] = bitcast <2 x i64> [[SEXT]] to <2 x double>
+  // OG-NEXT: ret <2 x double> [[CAST]]
+ return  __builtin_ia32_cmpnlepd(A, B);
+}