From f8d58bf7e6cdd81ea509d6f5bce894eb2eacc7d8 Mon Sep 17 00:00:00 2001 From: kumarak Date: Tue, 22 Jul 2025 10:41:33 -0400 Subject: [PATCH 01/63] [CT]implementation of ct.select intrinsics and clang frontend changes for X86 architecture (#21) --- clang/include/clang/Basic/Builtins.td | 8 + clang/lib/CodeGen/CGBuiltin.cpp | 38 + clang/lib/Sema/SemaChecking.cpp | 87 ++ .../test/Sema/builtin-ct-select-edge-cases.c | 384 ++++++ clang/test/Sema/builtin-ct-select.c | 683 ++++++++++ llvm/include/llvm/CodeGen/ISDOpcodes.h | 4 + llvm/include/llvm/CodeGen/SelectionDAG.h | 7 + llvm/include/llvm/CodeGen/TargetLowering.h | 18 +- llvm/include/llvm/IR/Intrinsics.td | 7 + .../include/llvm/Target/TargetSelectionDAG.td | 5 + llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 111 +- llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 12 +- .../SelectionDAG/LegalizeFloatTypes.cpp | 17 +- .../SelectionDAG/LegalizeIntegerTypes.cpp | 20 + llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h | 6 +- .../SelectionDAG/LegalizeTypesGeneric.cpp | 14 + .../SelectionDAG/LegalizeVectorTypes.cpp | 13 + .../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 1 + .../SelectionDAG/SelectionDAGBuilder.cpp | 29 + .../SelectionDAG/SelectionDAGDumper.cpp | 1 + llvm/lib/Target/X86/X86.td | 5 +- llvm/lib/Target/X86/X86ISelLowering.cpp | 839 ++++++++++++ llvm/lib/Target/X86/X86ISelLowering.h | 13 + llvm/lib/Target/X86/X86InstrCMovSetCC.td | 133 ++ llvm/lib/Target/X86/X86InstrCompiler.td | 39 + llvm/lib/Target/X86/X86InstrFragments.td | 5 + llvm/lib/Target/X86/X86InstrInfo.cpp | 53 + llvm/lib/Target/X86/X86InstrInfo.h | 3 + llvm/lib/Target/X86/X86InstrPredicates.td | 1 + llvm/test/CodeGen/X86/ctselect-edge-cases.ll | 336 +++++ .../test/CodeGen/X86/ctselect-optimization.ll | 294 +++++ llvm/test/CodeGen/X86/ctselect-vector.ll | 1151 +++++++++++++++++ llvm/test/CodeGen/X86/ctselect.ll | 376 ++++++ 33 files changed, 4700 insertions(+), 13 deletions(-) create mode 100644 clang/test/Sema/builtin-ct-select-edge-cases.c create mode 100644 clang/test/Sema/builtin-ct-select.c create mode 100644 llvm/test/CodeGen/X86/ctselect-edge-cases.ll create mode 100644 llvm/test/CodeGen/X86/ctselect-optimization.ll create mode 100644 llvm/test/CodeGen/X86/ctselect-vector.ll create mode 100644 llvm/test/CodeGen/X86/ctselect.ll diff --git a/clang/include/clang/Basic/Builtins.td b/clang/include/clang/Basic/Builtins.td index 792e2e07ec594..551e0cbfb3971 100644 --- a/clang/include/clang/Basic/Builtins.td +++ b/clang/include/clang/Basic/Builtins.td @@ -5253,3 +5253,11 @@ def CountedByRef : Builtin { let Attributes = [NoThrow, CustomTypeChecking]; let Prototype = "int(...)"; } + +// Constant-time select builtin +def CtSelect : Builtin { + let Spellings = ["__builtin_ct_select"]; + let Attributes = [NoThrow, Const, UnevaluatedArguments, + ConstIgnoringExceptions, CustomTypeChecking]; + let Prototype = "void(...)"; +} diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 92dba32698e51..25b95ce0289b7 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -26,6 +26,10 @@ #include "TargetInfo.h" #include "clang/AST/OSLog.h" #include "clang/AST/StmtVisitor.h" +#include "clang/AST/OperationKinds.h" +#include "clang/AST/Type.h" +#include "clang/Basic/DiagnosticSema.h" +#include "clang/Basic/TargetBuiltins.h" #include "clang/Basic/TargetInfo.h" #include "clang/Frontend/FrontendDiagnostic.h" #include "llvm/IR/InlineAsm.h" @@ -6441,6 +6445,40 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, auto 
Str = CGM.GetAddrOfConstantCString(Name, "");
     return RValue::get(Str.getPointer());
   }
+  case Builtin::BI__builtin_ct_select: {
+    if (E->getNumArgs() != 3) {
+      CGM.getDiags().Report(E->getBeginLoc(),
+                            E->getNumArgs() > 3
+                                ? diag::err_typecheck_call_too_many_args
+                                : diag::err_typecheck_call_too_few_args);
+      return GetUndefRValue(E->getType());
+    }
+
+    auto *Cond = EmitScalarExpr(E->getArg(0));
+    auto *A = EmitScalarExpr(E->getArg(1));
+    auto *B = EmitScalarExpr(E->getArg(2));
+
+    // Verify that both value operands have the same IR type.
+    if (A->getType() != B->getType()) {
+      CGM.getDiags().Report(E->getBeginLoc(),
+                            diag::err_typecheck_convert_incompatible);
+      return GetUndefRValue(E->getType());
+    }
+
+    // Verify that the condition is an integer type.
+    if (!Cond->getType()->isIntegerTy()) {
+      CGM.getDiags().Report(E->getBeginLoc(), diag::err_typecheck_expect_int);
+      return GetUndefRValue(E->getType());
+    }
+
+    if (Cond->getType()->getIntegerBitWidth() != 1)
+      Cond = Builder.CreateICmpNE(
+          Cond, llvm::ConstantInt::get(Cond->getType(), 0), "cond.bool");
+
+    llvm::Function *Fn =
+        CGM.getIntrinsic(llvm::Intrinsic::ct_select, {A->getType()});
+    return RValue::get(Builder.CreateCall(Fn, {Cond, A, B}));
+  }
   }
 
   // If this is an alias for a lib function (e.g. __builtin_sin), emit
diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index 652527a88b160..12be5426ccd23 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -3472,6 +3472,93 @@ Sema::CheckBuiltinFunctionCall(FunctionDecl *FDecl, unsigned BuiltinID,
     if (BuiltinCountedByRef(TheCall))
       return ExprError();
     break;
+
+  case Builtin::BI__builtin_ct_select: {
+    if (TheCall->getNumArgs() != 3) {
+      // Emit the standard too-few/too-many argument diagnostics.
+      if (TheCall->getNumArgs() < 3) {
+        return Diag(TheCall->getEndLoc(), diag::err_typecheck_call_too_few_args_at_least)
+               << 0 << 3 << TheCall->getNumArgs() << 0
+               << TheCall->getCallee()->getSourceRange();
+      } else {
+        return Diag(TheCall->getEndLoc(), diag::err_typecheck_call_too_many_args)
+               << 0 << 3 << TheCall->getNumArgs() << 0
+               << TheCall->getCallee()->getSourceRange();
+      }
+    }
+    auto *Cond = TheCall->getArg(0);
+    auto *A = TheCall->getArg(1);
+    auto *B = TheCall->getArg(2);
+
+    QualType CondTy = Cond->getType();
+    if (!CondTy->isIntegerType()) {
+      return Diag(Cond->getBeginLoc(), diag::err_typecheck_cond_expect_scalar)
+             << CondTy << Cond->getSourceRange();
+    }
+
+    QualType ATy = A->getType();
+    QualType BTy = B->getType();
+
+    // Both value operands must be scalar or vector types.
+    if ((!ATy->isScalarType() && !ATy->isVectorType()) ||
+        (!BTy->isScalarType() && !BTy->isVectorType())) {
+      return Diag(A->getBeginLoc(),
+                  diag::err_typecheck_cond_incompatible_operands)
+             << ATy << BTy << A->getSourceRange() << B->getSourceRange();
+    }
+
+    // Check if both operands have the same type or can be implicitly converted
+    QualType ResultTy;
+    if (Context.hasSameType(ATy, BTy)) {
+      ResultTy = ATy;
+    } else {
+      // Try to find a common type using the same logic as conditional
+      // expressions
+      ExprResult ARes = ExprResult(A);
+      ExprResult BRes = ExprResult(B);
+
+      // For arithmetic types, allow promotions within the same category only
+      if (ATy->isArithmeticType() && BTy->isArithmeticType()) {
+        // Check if both are integer types or both are floating types
+        bool AIsInteger = ATy->isIntegerType();
+        bool BIsInteger = BTy->isIntegerType();
+        bool AIsFloating = ATy->isFloatingType();
+        bool BIsFloating = BTy->isFloatingType();
+
+        if ((AIsInteger && BIsInteger) || (AIsFloating && BIsFloating)) {
+
// Both are in the same category, allow usual arithmetic conversions + ResultTy = UsualArithmeticConversions( + ARes, BRes, TheCall->getBeginLoc(), ACK_Conditional); + if (ARes.isInvalid() || BRes.isInvalid() || ResultTy.isNull()) { + return Diag(A->getBeginLoc(), + diag::err_typecheck_cond_incompatible_operands) + << ATy << BTy << A->getSourceRange() << B->getSourceRange(); + } + // Update the arguments with any necessary implicit casts + TheCall->setArg(1, ARes.get()); + TheCall->setArg(2, BRes.get()); + } else { + // Different categories (int vs float), not allowed + return Diag(A->getBeginLoc(), + diag::err_typecheck_cond_incompatible_operands) + << ATy << BTy << A->getSourceRange() << B->getSourceRange(); + } + } else { + // For non-arithmetic types, they must be exactly the same + return Diag(A->getBeginLoc(), + diag::err_typecheck_cond_incompatible_operands) + << ATy << BTy << A->getSourceRange() << B->getSourceRange(); + } + } + + ExprResult CondRes = PerformContextuallyConvertToBool(Cond); + if (CondRes.isInvalid()) + return ExprError(); + + TheCall->setArg(0, CondRes.get()); + TheCall->setType(ResultTy); + return TheCall; + } break; } if (getLangOpts().HLSL && HLSL().CheckBuiltinFunctionCall(BuiltinID, TheCall)) diff --git a/clang/test/Sema/builtin-ct-select-edge-cases.c b/clang/test/Sema/builtin-ct-select-edge-cases.c new file mode 100644 index 0000000000000..3998e9d68748d --- /dev/null +++ b/clang/test/Sema/builtin-ct-select-edge-cases.c @@ -0,0 +1,384 @@ +// RUN: %clang_cc1 -fsyntax-only -verify %s +// RUN: %clang_cc1 -fsyntax-only -verify %s -fexperimental-new-constant-interpreter + +// Test with various condition expressions +int test_conditional_expressions(int x, int y, int a, int b) { + // Logical expressions + int result1 = __builtin_ct_select(x && y, a, b); + int result2 = __builtin_ct_select(x || y, a, b); + int result3 = __builtin_ct_select(!x, a, b); + + // Comparison expressions + int result4 = __builtin_ct_select(x == y, a, b); + int result5 = __builtin_ct_select(x != y, a, b); + int result6 = __builtin_ct_select(x < y, a, b); + int result7 = __builtin_ct_select(x > y, a, b); + int result8 = __builtin_ct_select(x <= y, a, b); + int result9 = __builtin_ct_select(x >= y, a, b); + + // Bitwise expressions + int result10 = __builtin_ct_select(x & y, a, b); + int result11 = __builtin_ct_select(x | y, a, b); + int result12 = __builtin_ct_select(x ^ y, a, b); + int result13 = __builtin_ct_select(~x, a, b); + + // Arithmetic expressions + int result14 = __builtin_ct_select(x + y, a, b); + int result15 = __builtin_ct_select(x - y, a, b); + int result16 = __builtin_ct_select(x * y, a, b); + int result17 = __builtin_ct_select(x / y, a, b); + int result18 = __builtin_ct_select(x % y, a, b); + + return result1 + result2 + result3 + result4 + result5 + result6 + result7 + result8 + result9 + result10 + result11 + result12 + result13 + result14 + result15 + result16 + result17 + result18; +} + +// Test with extreme values +int test_extreme_values(int cond) { + // Maximum and minimum values + int max_int = __builtin_ct_select(cond, __INT_MAX__, -__INT_MAX__ - 1); + + // Very large numbers + long long max_ll = __builtin_ct_select(cond, __LONG_LONG_MAX__, -__LONG_LONG_MAX__ - 1); + + // Floating point extremes + float max_float = __builtin_ct_select(cond, __FLT_MAX__, -__FLT_MAX__); + double max_double = __builtin_ct_select(cond, __DBL_MAX__, -__DBL_MAX__); + + return max_int; +} + +// Test with zero and negative zero +int test_zero_values(int cond) { + // Integer zeros + int 
zero_int = __builtin_ct_select(cond, 0, -0);
+
+  // Floating point zeros
+  float zero_float = __builtin_ct_select(cond, 0.0f, -0.0f);
+  double zero_double = __builtin_ct_select(cond, 0.0, -0.0);
+
+  return zero_int;
+}
+
+// Test with infinity and NaN
+int test_special_float_values(int cond) {
+  // Infinity
+  float inf_float = __builtin_ct_select(cond, __builtin_inff(), -__builtin_inff());
+  double inf_double = __builtin_ct_select(cond, __builtin_inf(), -__builtin_inf());
+
+  // NaN
+  float nan_float = __builtin_ct_select(cond, __builtin_nanf(""), __builtin_nanf(""));
+  double nan_double = __builtin_ct_select(cond, __builtin_nan(""), __builtin_nan(""));
+
+  return 0;
+}
+
+// Test with complex pointer scenarios
+int test_pointer_edge_cases(int cond) {
+  int arr[10];
+  int *ptr1 = arr;
+  int *ptr2 = arr + 5;
+
+  // Array pointers
+  int *result1 = __builtin_ct_select(cond, ptr1, ptr2);
+
+  // Pointer arithmetic
+  int *result2 = __builtin_ct_select(cond, arr + 1, arr + 2);
+
+  // NULL vs non-NULL
+  int *result3 = __builtin_ct_select(cond, ptr1, (int*)0);
+
+  // Different pointer types (should fail)
+  float *fptr = (float*)0;
+  int *result4 = __builtin_ct_select(cond, ptr1, fptr); // expected-error {{incompatible operand types ('int *' and 'float *')}}
+
+  return *result1;
+}
+
+// Test with function pointers
+int func1(int x) { return x; }
+int func2(int x) { return x * 2; }
+float func3(float x) { return x; }
+
+int test_function_pointers(int cond, int x) {
+  // Function pointers with the same signature
+  int (*fptr)(int) = __builtin_ct_select(cond, &func1, &func2);
+
+  // Different signature function pointers (should fail)
+  int (*bad_fptr)(int) = __builtin_ct_select(cond, &func1, &func3); // expected-error {{incompatible operand types ('int (*)(int)' and 'float (*)(float)')}}
+
+  return fptr(x);
+}
+
+// Test with void pointers
+void *test_void_pointers(int cond, void *a, void *b) {
+  return __builtin_ct_select(cond, a, b);
+}
+
+// Test with const/volatile qualifiers
+int test_qualifiers(int cond) {
+  const int ca = 10;
+  const int cb = 20;
+  volatile int va = 30;
+  volatile int vb = 40;
+  const volatile int cva = 50;
+  const volatile int cvb = 60;
+
+  // const to const
+  const int result1 = __builtin_ct_select(cond, ca, cb);
+
+  // volatile to volatile
+  volatile int result2 = __builtin_ct_select(cond, va, vb);
+
+  // const volatile to const volatile
+  const volatile int result3 = __builtin_ct_select(cond, cva, cvb);
+
+  return result1 + result2 + result3;
+}
+
+// Test with arrays (should fail as they're not arithmetic or pointer)
+int test_arrays(int cond) {
+  int arr1[5] = {1, 2, 3, 4, 5};
+  int arr2[5] = {6, 7, 8, 9, 10};
+
+  // This should fail: arrays are not scalar or vector types.
+ int *result = __builtin_ct_select(cond, arr1, arr2); // expected-error {{incompatible operand types ('int[5]' and 'int[5]')}} + + return result[0]; +} + +// Test with structures (should fail) +struct Point { + int x, y; +}; + +struct Point test_structs(int cond) { + struct Point p1 = {1, 2}; + struct Point p2 = {3, 4}; + + return __builtin_ct_select(cond, p1, p2); // expected-error {{incompatible operand types ('struct Point' and 'struct Point')}} +} + +// Test with unions (should fail) +union Data { + int i; + float f; +}; + +union Data test_unions(int cond) { + union Data d1 = {.i = 10}; + union Data d2 = {.i = 20}; + + return __builtin_ct_select(cond, d1, d2); // expected-error {{incompatible operand types ('union Data' and 'union Data')}} +} + +// Test with bit fields (should work as they're integers) +struct BitField { + int a : 4; + int b : 4; +}; + +int test_bit_fields(int cond) { + struct BitField bf1 = {1, 2}; + struct BitField bf2 = {3, 4}; + + // Individual bit fields should work + int result1 = __builtin_ct_select(cond, bf1.a, bf2.a); + int result2 = __builtin_ct_select(cond, bf1.b, bf2.b); + + return result1 + result2; +} + +// Test with designated initializers +int test_designated_init(int cond) { + int arr1[3] = {[0] = 1, [1] = 2, [2] = 3}; + int arr2[3] = {[0] = 4, [1] = 5, [2] = 6}; + + // Access specific elements + int result1 = __builtin_ct_select(cond, arr1[0], arr2[0]); + int result2 = __builtin_ct_select(cond, arr1[1], arr2[1]); + + return result1 + result2; +} + +// Test with complex expressions in arguments +int complex_expr(int x) { return x * x; } + +int test_complex_arguments(int cond, int x, int y) { + // Function calls as arguments + int result1 = __builtin_ct_select(cond, complex_expr(x), complex_expr(y)); + + // Ternary operator as arguments + int result2 = __builtin_ct_select(cond, x > 0 ? x : -x, y > 0 ? 
y : -y); + + // Compound literals + int result3 = __builtin_ct_select(cond, (int){x}, (int){y}); + + return result1 + result2 + result3; +} + +// Test with preprocessor macros +#define MACRO_A 42 +#define MACRO_B 24 +#define MACRO_COND(x) (x > 0) + +int test_macros(int x) { + int result1 = __builtin_ct_select(MACRO_COND(x), MACRO_A, MACRO_B); + + // Nested macros + #define NESTED_SELECT(c, a, b) __builtin_ct_select(c, a, b) + int result2 = NESTED_SELECT(x, 10, 20); + + return result1 + result2; +} + +// Test with string literals (should fail) +const char *test_strings(int cond) { + return __builtin_ct_select(cond, "hello", "world"); // expected-error {{incompatible operand types ('char[6]' and 'char[6]')}} +} + +// Test with variable length arrays (VLA) +int test_vla(int cond, int n) { + int vla1[n]; + int vla2[n]; + + // Individual elements should work + vla1[0] = 1; + vla2[0] = 2; + int result = __builtin_ct_select(cond, vla1[0], vla2[0]); + + return result; +} + +// Test with typedef +typedef int MyInt; +typedef float MyFloat; + +MyInt test_typedef(int cond, MyInt a, MyInt b) { + return __builtin_ct_select(cond, a, b); +} + +// Test with different typedef types (should fail) +MyInt test_different_typedef(int cond, MyInt a, MyFloat b) { + return __builtin_ct_select(cond, a, b); // expected-error {{incompatible operand types ('MyInt' (aka 'int') and 'MyFloat' (aka 'float'))}} +} + +// Test with side effects (should be evaluated) +int side_effect_counter = 0; +int side_effect_func(int x) { + side_effect_counter++; + return x; +} + +int test_side_effects(int cond) { + // Both arguments should be evaluated + int result = __builtin_ct_select(cond, side_effect_func(10), side_effect_func(20)); + return result; +} + +// Test with goto labels (context where expressions are used) +int test_goto_context(int cond, int a, int b) { + int result = __builtin_ct_select(cond, a, b); + + if (result > 0) { + goto positive; + } else { + goto negative; + } + +positive: + return result; + +negative: + return -result; +} + +// Test with switch statements +int test_switch_context(int cond, int a, int b) { + int result = __builtin_ct_select(cond, a, b); + + switch (result) { + case 0: + return 0; + case 1: + return 1; + default: + return -1; + } +} + +// Test with loops +int test_loop_context(int cond, int a, int b) { + int result = __builtin_ct_select(cond, a, b); + int sum = 0; + + for (int i = 0; i < result; i++) { + sum += i; + } + + return sum; +} + +// Test with recursive functions +int factorial(int n) { + if (n <= 1) return 1; + return n * factorial(n - 1); +} + +int test_recursive(int cond, int n) { + int result = __builtin_ct_select(cond, n, n + 1); + return factorial(result); +} + +// Test with inline functions +static inline int inline_func(int x) { + return x * 2; +} + +int test_inline(int cond, int a, int b) { + return __builtin_ct_select(cond, inline_func(a), inline_func(b)); +} + +// Test with static variables +int test_static_vars(int cond) { + static int static_a = 10; + static int static_b = 20; + + return __builtin_ct_select(cond, static_a, static_b); +} + +// Test with extern variables +extern int extern_a; +extern int extern_b; + +int test_extern_vars(int cond) { + return __builtin_ct_select(cond, extern_a, extern_b); +} + +// Test with register variables +int test_register_vars(int cond) { + register int reg_a = 30; + register int reg_b = 40; + + return __builtin_ct_select(cond, reg_a, reg_b); +} + +// Test with thread-local variables (C11) +#if __STDC_VERSION__ >= 201112L +_Thread_local 
int tls_a = 50;
+_Thread_local int tls_b = 60;
+
+int test_tls_vars(int cond) {
+  return __builtin_ct_select(cond, tls_a, tls_b);
+}
+#endif
+
+// Test with atomic variables (C11)
+#if __STDC_VERSION__ >= 201112L
+#include <stdatomic.h>
+atomic_int atomic_a = 70;
+atomic_int atomic_b = 80;
+
+int test_atomic_vars(int cond) {
+  return __builtin_ct_select(cond, atomic_a, atomic_b); // expected-error {{incompatible operand types ('atomic_int' (aka '_Atomic(int)') and 'atomic_int')}}
+}
+#endif
diff --git a/clang/test/Sema/builtin-ct-select.c b/clang/test/Sema/builtin-ct-select.c
new file mode 100644
index 0000000000000..7749eb52eecb3
--- /dev/null
+++ b/clang/test/Sema/builtin-ct-select.c
@@ -0,0 +1,683 @@
+// RUN: %clang_cc1 -emit-llvm -o - %s | FileCheck %s
+
+// Test integer types
+int test_int(int cond, int a, int b) {
+  // CHECK-LABEL: define {{.*}} @test_int
+  // CHECK: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0
+  // CHECK: [[RESULT:%.*]] = call i32 @llvm.ct.select.i32(i1 [[COND]], i32 %{{.*}}, i32 %{{.*}})
+  // CHECK: ret i32 [[RESULT]]
+  return __builtin_ct_select(cond, a, b);
+}
+
+long test_long(int cond, long a, long b) {
+  // CHECK-LABEL: define {{.*}} @test_long
+  // CHECK: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0
+  // CHECK: [[RESULT:%.*]] = call i64 @llvm.ct.select.i64(i1 [[COND]], i64 %{{.*}}, i64 %{{.*}})
+  // CHECK: ret i64 [[RESULT]]
+  return __builtin_ct_select(cond, a, b);
+}
+
+short test_short(int cond, short a, short b) {
+  // CHECK-LABEL: define {{.*}} @test_short
+  // CHECK: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0
+  // CHECK: [[RESULT:%.*]] = call i16 @llvm.ct.select.i16(i1 [[COND]], i16 %{{.*}}, i16 %{{.*}})
+  // CHECK: ret i16 [[RESULT]]
+  return __builtin_ct_select(cond, a, b);
+}
+
+unsigned char test_uchar(int cond, unsigned char a, unsigned char b) {
+  // CHECK-LABEL: define {{.*}} @test_uchar
+  // CHECK: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0
+  // CHECK: [[RESULT:%.*]] = call i8 @llvm.ct.select.i8(i1 [[COND]], i8 %{{.*}}, i8 %{{.*}})
+  // CHECK: ret i8 [[RESULT]]
+  return __builtin_ct_select(cond, a, b);
+}
+
+long long test_longlong(int cond, long long a, long long b) {
+  // CHECK-LABEL: define {{.*}} @test_longlong
+  // CHECK: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0
+  // CHECK: [[RESULT:%.*]] = call i64 @llvm.ct.select.i64(i1 [[COND]], i64 %{{.*}}, i64 %{{.*}})
+  // CHECK: ret i64 [[RESULT]]
+  return __builtin_ct_select(cond, a, b);
+}
+
+// Test floating point types
+float test_float(int cond, float a, float b) {
+  // CHECK-LABEL: define {{.*}} @test_float
+  // CHECK: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0
+  // CHECK: [[RESULT:%.*]] = call float @llvm.ct.select.f32(i1 [[COND]], float %{{.*}}, float %{{.*}})
+  // CHECK: ret float [[RESULT]]
+  return __builtin_ct_select(cond, a, b);
+}
+
+double test_double(int cond, double a, double b) {
+  // CHECK-LABEL: define {{.*}} @test_double
+  // CHECK: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0
+  // CHECK: [[RESULT:%.*]] = call double @llvm.ct.select.f64(i1 [[COND]], double %{{.*}}, double %{{.*}})
+  // CHECK: ret double [[RESULT]]
+  return __builtin_ct_select(cond, a, b);
+}
+
+// Test pointer types
+int *test_pointer(int cond, int *a, int *b) {
+  // CHECK-LABEL: define {{.*}} @test_pointer
+  // CHECK: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0
+  // CHECK: [[RESULT:%.*]] = call ptr @llvm.ct.select.p0(i1 [[COND]], ptr %{{.*}}, ptr %{{.*}})
+  // CHECK: ret ptr [[RESULT]]
+  return __builtin_ct_select(cond, a, b);
+}
+
+// Test with different condition types
+int test_char_cond(char cond, int a, int b) {
+  // CHECK-LABEL: define {{.*}} @test_char_cond
+  //
CHECK: [[COND:%.*]] = icmp ne i8 %{{.*}}, 0 + // CHECK: [[RESULT:%.*]] = call i32 @llvm.ct.select.i32(i1 [[COND]], i32 %{{.*}}, i32 %{{.*}}) + // CHECK: ret i32 [[RESULT]] + return __builtin_ct_select(cond, a, b); +} + +int test_long_cond(long cond, int a, int b) { + // CHECK-LABEL: define {{.*}} @test_long_cond + // CHECK: [[COND:%.*]] = icmp ne i64 %{{.*}}, 0 + // CHECK: [[RESULT:%.*]] = call i32 @llvm.ct.select.i32(i1 [[COND]], i32 %{{.*}}, i32 %{{.*}}) + // CHECK: ret i32 [[RESULT]] + return __builtin_ct_select(cond, a, b); +} + +// Test with boolean condition +int test_bool_cond(_Bool cond, int a, int b) { + // CHECK-LABEL: define {{.*}} @test_bool_cond + // CHECK: [[COND:%.*]] = trunc i8 %{{.*}} to i1 + // CHECK: [[RESULT:%.*]] = call i32 @llvm.ct.select.i32(i1 [[COND]], i32 %{{.*}}, i32 %{{.*}}) + // CHECK: ret i32 [[RESULT]] + return __builtin_ct_select(cond, a, b); +} + +// Test with constants +int test_constant_cond(void) { + // CHECK-LABEL: define {{.*}} @test_constant_cond + // CHECK: [[RESULT:%.*]] = call i32 @llvm.ct.select.i32(i1 true, i32 42, i32 24) + // CHECK: ret i32 [[RESULT]] + return __builtin_ct_select(1, 42, 24); +} + +int test_zero_cond(void) { + // CHECK-LABEL: define {{.*}} @test_zero_cond + // CHECK: [[RESULT:%.*]] = call i32 @llvm.ct.select.i32(i1 false, i32 42, i32 24) + // CHECK: ret i32 [[RESULT]] + return __builtin_ct_select(0, 42, 24); +} + +// Test type promotion +int test_promotion(int cond, short a, short b) { + // CHECK-LABEL: define {{.*}} @test_promotion + // CHECK-DAG: [[A_EXT:%.*]] = sext i16 %{{.*}} to i32 + // CHECK-DAG: [[B_EXT:%.*]] = sext i16 %{{.*}} to i32 + // CHECK-DAG: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK-DAG: [[RESULT:%.*]] = call i32 @llvm.ct.select.i32(i1 [[COND]], i32 [[A_EXT]], i32 [[B_EXT]]) + // CHECK: ret i32 [[RESULT]] + return __builtin_ct_select(cond, (int)a, (int)b); +} + +// Test mixed signedness +unsigned int test_mixed_signedness(int cond, int a, unsigned int b) { + // CHECK-LABEL: define {{.*}} @test_mixed_signedness + // CHECK-DAG: [[A_EXT:%.*]] = sext i32 %{{.*}} to i64 + // CHECK-DAG: [[B_EXT:%.*]] = zext i32 %{{.*}} to i64 + // CHECK-DAG: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK-DAG: [[RESULT:%.*]] = call i64 @llvm.ct.select.i64(i1 [[COND]], i64 [[A_EXT]], i64 [[B_EXT]]) + // CHECK: [[RESULT_TRUNC:%.*]] = trunc i64 [[RESULT]] to i32 + // CHECK: ret i32 [[RESULT_TRUNC]] + return __builtin_ct_select(cond, (long)a, (long)b); +} + +// Test complex expression +int test_complex_expr_alt(int x, int y) { + // CHECK-LABEL: define {{.*}} @test_complex_expr_alt + // CHECK-DAG: [[CMP:%.*]] = icmp sgt i32 %{{.*}}, 0 + // CHECK-DAG: [[ADD:%.*]] = add nsw i32 %{{.*}}, %{{.*}} + // CHECK-DAG: [[SUB:%.*]] = sub nsw i32 %{{.*}}, %{{.*}} + // Separate the final sequence to ensure proper ordering + // CHECK-NEXT: [[RESULT:%.*]] = call i32 @llvm.ct.select.i32(i1 [[CMP]], i32 [[ADD]], i32 [[SUB]]) + // CHECK-NEXT: ret i32 [[RESULT]] + return __builtin_ct_select(x > 0, x + y, x - y); +} + +// Test nested calls +int test_nested_structured(int cond1, int cond2, int a, int b, int c) { + // CHECK-LABEL: define {{.*}} @test_nested_structured + // Phase 1: Conditions (order doesn't matter) + // CHECK-DAG: [[COND1:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK-DAG: [[COND2:%.*]] = icmp ne i32 %{{.*}}, 0 + + // Phase 2: Inner select (must happen before outer) + // CHECK: [[INNER:%.*]] = call i32 @llvm.ct.select.i32(i1 [[COND2]], i32 %{{.*}}, i32 %{{.*}}) + + // Phase 3: Outer select (must use inner result) + // CHECK: [[RESULT:%.*]] = 
call i32 @llvm.ct.select.i32(i1 [[COND1]], i32 [[INNER]], i32 %{{.*}}) + // CHECK: ret i32 [[RESULT]] + return __builtin_ct_select(cond1, __builtin_ct_select(cond2, a, b), c); +} + +// Test with function calls +int helper(int x) { return x * 2; } +int test_function_calls(int cond, int x, int y) { + // CHECK-LABEL: define {{.*}} @test_function_calls + // CHECK-DAG: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK-DAG: [[CALL1:%.*]] = call i32 @helper(i32 noundef %{{.*}}) + // CHECK-DAG: [[CALL2:%.*]] = call i32 @helper(i32 noundef %{{.*}}) + // CHECK-DAG: [[RESULT:%.*]] = call i32 @llvm.ct.select.i32(i1 [[COND]], i32 [[CALL1]], i32 [[CALL2]]) + // CHECK: ret i32 [[RESULT]] + return __builtin_ct_select(cond, helper(x), helper(y)); +} + +// Test using ct_select as condition for another ct_select +int test_intrinsic_condition(int cond1, int cond2, int a, int b, int c, int d) { + // CHECK-LABEL: define {{.*}} @test_intrinsic_condition + // CHECK-DAG: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK-DAG: [[INNER_COND:%.*]] = call i32 @llvm.ct.select.i32(i1 [[COND]], i32 %{{.*}}, i32 %{{.*}}) + // CHECK-DAG: [[FINAL_COND:%.*]] = icmp ne i32 [[INNER_COND]], 0 + // CHECK-DAG: [[RESULT:%.*]] = call i32 @llvm.ct.select.i32(i1 [[FINAL_COND]], i32 %{{.*}}, i32 %{{.*}}) + // CHECK: ret i32 [[RESULT]] + return __builtin_ct_select(__builtin_ct_select(cond1, cond2, a), b, c); +} + +// Test using comparison result of ct_select as condition +int test_comparison_condition(int cond, int a, int b, int c, int d) { + // CHECK-LABEL: define {{.*}} @test_comparison_condition + // CHECK-DAG: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK: [[FIRST_SELECT:%.*]] = call i32 @llvm.ct.select.i32(i1 [[COND]], i32 %{{.*}}, i32 %{{.*}}) + // CHECK: [[CMP:%.*]] = icmp sgt i32 [[FIRST_SELECT]], %{{.*}} + // CHECK: [[RESULT:%.*]] = call i32 @llvm.ct.select.i32(i1 [[CMP]], i32 %{{.*}}, i32 %{{.*}}) + // CHECK: ret i32 [[RESULT]] + return __builtin_ct_select(__builtin_ct_select(cond, a, b) > c, d, a); +} + +// Test using ct_select result in arithmetic as condition +int test_arithmetic_condition(int cond, int a, int b, int c, int d) { + // CHECK-LABEL: define {{.*}} @test_arithmetic_condition + // CHECK-DAG: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK: [[FIRST_SELECT:%.*]] = call i32 @llvm.ct.select.i32(i1 [[COND]], i32 %{{.*}}, i32 %{{.*}}) + // CHECK: [[ADD:%.*]] = add nsw i32 [[FIRST_SELECT]], %{{.*}} + // CHECK: [[FINAL_COND:%.*]] = icmp ne i32 [[ADD]], 0 + // CHECK: [[RESULT:%.*]] = call i32 @llvm.ct.select.i32(i1 [[FINAL_COND]], i32 %{{.*}}, i32 %{{.*}}) + // CHECK: ret i32 [[RESULT]] + return __builtin_ct_select(__builtin_ct_select(cond, a, b) + c, d, a); +} + +// Test chained ct_select as conditions +int test_chained_conditions(int cond1, int cond2, int cond3, int a, int b, int c, int d, int e) { + // CHECK-LABEL: define {{.*}} @test_chained_conditions + // CHECK: [[COND1:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK-DAG: [[FIRST:%.*]] = call i32 @llvm.ct.select.i32(i1 [[COND1]], i32 %{{.*}}, i32 %{{.*}}) + // CHECK-DAG: [[COND2:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK-DAG: [[SECOND:%.*]] = call i32 @llvm.ct.select.i32(i1 [[COND2]], i32 %{{.*}}, i32 %{{.*}}) + // CHECK-DAG: [[FINAL_COND:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK-DAG: [[RESULT:%.*]] = call i32 @llvm.ct.select.i32(i1 [[FINAL_COND]], i32 %{{.*}}, i32 %{{.*}}) + // CHECK: ret i32 [[RESULT]] + int first_select = __builtin_ct_select(cond1, a, b); + int second_select = __builtin_ct_select(cond2, first_select, c); + return __builtin_ct_select(second_select, d, 
e); +} + +// Test using ct_select with pointer condition +//int test_pointer_condition(int *ptr1, int *ptr2, int a, int b, int c) { + // NO-CHECK-LABEL: define {{.*}} @test_pointer_condition + // NO-CHECK: [[PTR_COND:%.*]] = icmp ne ptr %{{.*}}, null + // NO-CHECK: [[PTR_SELECT:%.*]] = call ptr @llvm.ct.select.p0(i1 [[PTR_COND]], ptr %{{.*}}, ptr %{{.*}}) + // NO-CHECK: [[FINAL_COND:%.*]] = icmp ne ptr [[PTR_SELECT]], null + // NO-CHECK: [[RESULT:%.*]] = call i32 @llvm.ct.select.i32(i1 [[FINAL_COND]], i32 %{{.*}}, i32 %{{.*}}) + // NO-CHECK: ret i32 [[RESULT]] +// return __builtin_ct_select(__builtin_ct_select(ptr1, ptr1, ptr2), a, b); +//} + + +// Test using ct_select result in logical operations as condition +int test_logical_condition(int cond1, int cond2, int a, int b, int c, int d) { + // CHECK-LABEL: define {{.*}} @test_logical_condition + // CHECK-DAG: [[COND1:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK-DAG: [[COND2:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK-DAG: [[FIRST_SELECT:%.*]] = call i32 @llvm.ct.select.i32(i1 [[COND1]], i32 %{{.*}}, i32 %{{.*}}) + // CHECK-DAG: [[SELECT_BOOL:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK-DAG: [[RESULT:%.*]] = call i32 @llvm.ct.select.i32(i1 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}) + // CHECK: ret i32 [[RESULT]] + return __builtin_ct_select(__builtin_ct_select(cond1, a, b) && cond2, c, d); +} + +// Test multiple levels of ct_select as conditions +int test_deep_condition_nesting(int cond1, int cond2, int cond3, int a, int b, int c, int d, int e, int f) { + // CHECK-LABEL: define {{.*}} @test_deep_condition_nesting + // CHECK-DAG: [[COND1:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK-DAG: [[COND2:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK-DAG: [[INNER1:%.*]] = call i32 @llvm.ct.select.i32(i1 [[COND2]], i32 %{{.*}}, i32 %{{.*}}) + // CHECK-DAG: [[INNER1_COND:%.*]] = icmp ne i32 [[INNER1]], 0 + // CHECK-DAG: [[INNER2:%.*]] = call i32 @llvm.ct.select.i32(i1 [[INNER1_COND]], i32 %{{.*}}, i32 %{{.*}}) + // CHECK-DAG: [[OUTER:%.*]] = call i32 @llvm.ct.select.i32(i1 [[COND1]], i32 [[INNER2]], i32 %{{.*}}) + // CHECK-DAG: [[FINAL_COND:%.*]] = icmp ne i32 [[OUTER]], 0 + // CHECK-DAG: [[RESULT:%.*]] = call i32 @llvm.ct.select.i32(i1 [[FINAL_COND]], i32 %{{.*}}, i32 %{{.*}}) + // CHECK: ret i32 [[RESULT]] + return __builtin_ct_select(__builtin_ct_select(cond1, __builtin_ct_select(__builtin_ct_select(cond2, a, b), c, d), e), f, a); +} + +// Test ct_select with complex condition expressions +int test_complex_condition_expr(int x, int y, int z, int a, int b) { + // CHECK-LABEL: define {{.*}} @test_complex_condition_expr + // CHECK: [[CMP1:%.*]] = icmp sgt i32 %{{.*}}, %{{.*}} + // CHECK: [[SELECT1:%.*]] = call i32 @llvm.ct.select.i32(i1 [[CMP1]], i32 %{{.*}}, i32 %{{.*}}) + // CHECK: [[CMP2:%.*]] = icmp slt i32 [[SELECT1]], %{{.*}} + // CHECK: [[RESULT:%.*]] = call i32 @llvm.ct.select.i32(i1 [[CMP2]], i32 %{{.*}}, i32 %{{.*}}) + // CHECK: ret i32 [[RESULT]] + return __builtin_ct_select(__builtin_ct_select(x > y, x, y) < z, a, b); +} + +// Test vector types - 128-bit vectors +typedef int __attribute__((vector_size(16))) int4; +typedef float __attribute__((vector_size(16))) float4; +typedef short __attribute__((vector_size(16))) short8; +typedef char __attribute__((vector_size(16))) char16; + +int4 test_vector_int4(int cond, int4 a, int4 b) { + // CHECK-LABEL: define {{.*}} @test_vector_int4 + // CHECK: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK: [[RESULT:%.*]] = call <4 x i32> @llvm.ct.select.v4i32(i1 [[COND]], <4 x i32> %{{.*}}, <4 x i32> %{{.*}}) + // CHECK: ret <4 x 
i32> [[RESULT]] + return __builtin_ct_select(cond, a, b); +} + +float4 test_vector_float4(int cond, float4 a, float4 b) { + // CHECK-LABEL: define {{.*}} @test_vector_float4 + // CHECK: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK: [[RESULT:%.*]] = call <4 x float> @llvm.ct.select.v4f32(i1 [[COND]], <4 x float> %{{.*}}, <4 x float> %{{.*}}) + // CHECK: ret <4 x float> [[RESULT]] + return __builtin_ct_select(cond, a, b); +} + +short8 test_vector_short8(int cond, short8 a, short8 b) { + // CHECK-LABEL: define {{.*}} @test_vector_short8 + // CHECK: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK: [[RESULT:%.*]] = call <8 x i16> @llvm.ct.select.v8i16(i1 [[COND]], <8 x i16> %{{.*}}, <8 x i16> %{{.*}}) + // CHECK: ret <8 x i16> [[RESULT]] + return __builtin_ct_select(cond, a, b); +} + +char16 test_vector_char16(int cond, char16 a, char16 b) { + // CHECK-LABEL: define {{.*}} @test_vector_char16 + // CHECK: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK: [[RESULT:%.*]] = call <16 x i8> @llvm.ct.select.v16i8(i1 [[COND]], <16 x i8> %{{.*}}, <16 x i8> %{{.*}}) + // CHECK: ret <16 x i8> [[RESULT]] + return __builtin_ct_select(cond, a, b); +} + +// Test 256-bit vectors +typedef int __attribute__((vector_size(32))) int8; +typedef float __attribute__((vector_size(32))) float8; +typedef double __attribute__((vector_size(32))) double4; + +int8 test_vector_int8(int cond, int8 a, int8 b) { + // CHECK-LABEL: define {{.*}} @test_vector_int8 + // CHECK-DAG: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK-DAG: [[RESULT:%.*]] = call <8 x i32> @llvm.ct.select.v8i32(i1 [[COND]], <8 x i32> %{{.*}}, <8 x i32> %{{.*}}) + return __builtin_ct_select(cond, a, b); +} + +float8 test_vector_float8(int cond, float8 a, float8 b) { + // CHECK-LABEL: define {{.*}} @test_vector_float8 + // CHECK-DAG: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK-DAG: [[RESULT:%.*]] = call <8 x float> @llvm.ct.select.v8f32(i1 [[COND]], <8 x float> %{{.*}}, <8 x float> %{{.*}}) + return __builtin_ct_select(cond, a, b); +} + +double4 test_vector_double4(int cond, double4 a, double4 b) { + // CHECK-LABEL: define {{.*}} @test_vector_double4 + // CHECK-DAG: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK-DAG: [[RESULT:%.*]] = call <4 x double> @llvm.ct.select.v4f64(i1 [[COND]], <4 x double> %{{.*}}, <4 x double> %{{.*}}) + return __builtin_ct_select(cond, a, b); +} + +// Test 512-bit vectors +typedef int __attribute__((vector_size(64))) int16; +typedef float __attribute__((vector_size(64))) float16; + +int16 test_vector_int16(int cond, int16 a, int16 b) { + // CHECK-LABEL: define {{.*}} @test_vector_int16 + // CHECK: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK: [[RESULT:%.*]] = call <16 x i32> @llvm.ct.select.v16i32(i1 [[COND]], <16 x i32> %{{.*}}, <16 x i32> %{{.*}}) + return __builtin_ct_select(cond, a, b); +} + +float16 test_vector_float16(int cond, float16 a, float16 b) { + // CHECK-LABEL: define {{.*}} @test_vector_float16 + // CHECK: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK: [[RESULT:%.*]] = call <16 x float> @llvm.ct.select.v16f32(i1 [[COND]], <16 x float> %{{.*}}, <16 x float> %{{.*}}) + return __builtin_ct_select(cond, a, b); +} + +// Test vector operations with different condition types +int4 test_vector_char_cond(char cond, int4 a, int4 b) { + // CHECK-LABEL: define {{.*}} @test_vector_char_cond + // CHECK: [[COND:%.*]] = icmp ne i8 %{{.*}}, 0 + // CHECK: [[RESULT:%.*]] = call <4 x i32> @llvm.ct.select.v4i32(i1 [[COND]], <4 x i32> %{{.*}}, <4 x i32> %{{.*}}) + // CHECK: ret <4 x i32> [[RESULT]] + return 
__builtin_ct_select(cond, a, b); +} + +float4 test_vector_long_cond(long cond, float4 a, float4 b) { + // CHECK-LABEL: define {{.*}} @test_vector_long_cond + // CHECK: [[COND:%.*]] = icmp ne i64 %{{.*}}, 0 + // CHECK: [[RESULT:%.*]] = call <4 x float> @llvm.ct.select.v4f32(i1 [[COND]], <4 x float> %{{.*}}, <4 x float> %{{.*}}) + // CHECK: ret <4 x float> [[RESULT]] + return __builtin_ct_select(cond, a, b); +} + +// Test vector constants +int4 test_vector_constant_cond(void) { + // CHECK-LABEL: define {{.*}} @test_vector_constant_cond + // CHECK: [[RESULT:%.*]] = call <4 x i32> @llvm.ct.select.v4i32(i1 true, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}) + // CHECK: ret <4 x i32> [[RESULT]] + int4 a = {1, 2, 3, 4}; + int4 b = {5, 6, 7, 8}; + return __builtin_ct_select(1, a, b); +} + +float4 test_vector_zero_cond(void) { + // CHECK-LABEL: define {{.*}} @test_vector_zero_cond + // CHECK: [[RESULT:%.*]] = call <4 x float> @llvm.ct.select.v4f32(i1 false, <4 x float> %{{.*}}, <4 x float> %{{.*}}) + // CHECK: ret <4 x float> [[RESULT]] + float4 a = {1.0f, 2.0f, 3.0f, 4.0f}; + float4 b = {5.0f, 6.0f, 7.0f, 8.0f}; + return __builtin_ct_select(0, a, b); +} + +// Test nested vector selections +int4 test_vector_nested(int cond1, int cond2, int4 a, int4 b, int4 c) { + // CHECK-LABEL: define {{.*}} @test_vector_nested + // CHECK-DAG: [[COND1:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK-DAG: [[COND2:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK: [[INNER:%.*]] = call <4 x i32> @llvm.ct.select.v4i32(i1 [[COND2]], <4 x i32> %{{.*}}, <4 x i32> %{{.*}}) + // CHECK: [[RESULT:%.*]] = call <4 x i32> @llvm.ct.select.v4i32(i1 [[COND1]], <4 x i32> [[INNER]], <4 x i32> %{{.*}}) + // CHECK: ret <4 x i32> [[RESULT]] + return __builtin_ct_select(cond1, __builtin_ct_select(cond2, a, b), c); +} + +// Test vector selection with complex expressions +float4 test_vector_complex_expr(int x, int y, float4 a, float4 b) { + // CHECK-LABEL: define {{.*}} @test_vector_complex_expr + // CHECK: [[CMP:%.*]] = icmp sgt i32 %{{.*}}, %{{.*}} + // CHECK: [[RESULT:%.*]] = call <4 x float> @llvm.ct.select.v4f32(i1 [[CMP]], <4 x float> %{{.*}}, <4 x float> %{{.*}}) + // CHECK: ret <4 x float> [[RESULT]] + return __builtin_ct_select(x > y, a, b); +} + +// Test vector with different element sizes +typedef long long __attribute__((vector_size(16))) long2; +typedef double __attribute__((vector_size(16))) double2; + +long2 test_vector_long2(int cond, long2 a, long2 b) { + // CHECK-LABEL: define {{.*}} @test_vector_long2 + // CHECK: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK: [[RESULT:%.*]] = call <2 x i64> @llvm.ct.select.v2i64(i1 [[COND]], <2 x i64> %{{.*}}, <2 x i64> %{{.*}}) + // CHECK: ret <2 x i64> [[RESULT]] + return __builtin_ct_select(cond, a, b); +} + +double2 test_vector_double2(int cond, double2 a, double2 b) { + // CHECK-LABEL: define {{.*}} @test_vector_double2 + // CHECK: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK: [[RESULT:%.*]] = call <2 x double> @llvm.ct.select.v2f64(i1 [[COND]], <2 x double> %{{.*}}, <2 x double> %{{.*}}) + // CHECK: ret <2 x double> [[RESULT]] + return __builtin_ct_select(cond, a, b); +} + +// Test mixed vector operations +int4 test_vector_from_scalar_condition(int4 vec_cond, int4 a, int4 b) { + // CHECK-LABEL: define {{.*}} @test_vector_from_scalar_condition + // Extract first element and use as condition + int scalar_cond = vec_cond[0]; + // CHECK: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK: [[RESULT:%.*]] = call <4 x i32> @llvm.ct.select.v4i32(i1 [[COND]], <4 x i32> %{{.*}}, <4 x i32> %{{.*}}) + // 
CHECK: ret <4 x i32> [[RESULT]] + return __builtin_ct_select(scalar_cond, a, b); +} + +// Test vector chaining +float4 test_vector_chaining(int cond1, int cond2, int cond3, float4 a, float4 b, float4 c, float4 d) { + // CHECK-LABEL: define {{.*}} @test_vector_chaining + // CHECK-DAG: [[COND1:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK-DAG: [[COND2:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK-DAG: [[COND3:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK-DAG: [[FIRST:%.*]] = call <4 x float> @llvm.ct.select.v4f32(i1 [[COND1]], <4 x float> %{{.*}}, <4 x float> %{{.*}}) + // CHECK-DAG: [[SECOND:%.*]] = call <4 x float> @llvm.ct.select.v4f32(i1 [[COND2]], <4 x float> %{{.*}}, <4 x float> %{{.*}}) + // CHECK-DAG: [[RESULT:%.*]] = call <4 x float> @llvm.ct.select.v4f32(i1 [[COND3]], <4 x float> %{{.*}}, <4 x float> %{{.*}}) + // CHECK: ret <4 x float> [[RESULT]] + float4 first = __builtin_ct_select(cond1, a, b); + float4 second = __builtin_ct_select(cond2, first, c); + return __builtin_ct_select(cond3, second, d); +} + +// Test special floating point values - NaN +float test_nan_operands(int cond) { + // CHECK-LABEL: define {{.*}} @test_nan_operands + // CHECK-DAG: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK-DAG: [[RESULT:%.*]] = call float @llvm.ct.select.f32(i1 [[COND]], float %{{.*}}, float 1.000000e+00) + // CHECK: ret float [[RESULT]] + float nan_val = __builtin_nanf(""); + return __builtin_ct_select(cond, nan_val, 1.0f); +} + +double test_nan_double_operands(int cond) { + // CHECK-LABEL: define {{.*}} @test_nan_double_operands + // CHECK-DAG: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK-DAG: [[RESULT:%.*]] = call double @llvm.ct.select.f64(i1 [[COND]], double %{{.*}}, double 2.000000e+00) + // CHECK: ret double [[RESULT]] + double nan_val = __builtin_nan(""); + return __builtin_ct_select(cond, nan_val, 2.0); +} + +// Test infinity values +float test_infinity_operands(int cond) { + // CHECK-LABEL: define {{.*}} @test_infinity_operands + // CHECK-DAG: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK-DAG: [[RESULT:%.*]] = call float @llvm.ct.select.f32(i1 [[COND]], float %{{.*}}, float %{{.*}}) + // CHECK: ret float [[RESULT]] + float pos_inf = __builtin_inff(); + float neg_inf = -__builtin_inff(); + return __builtin_ct_select(cond, pos_inf, neg_inf); +} + +double test_infinity_double_operands(int cond) { + // CHECK-LABEL: define {{.*}} @test_infinity_double_operands + // CHECK-DAG: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK-DAG: [[RESULT:%.*]] = call double @llvm.ct.select.f64(i1 [[COND]], double %{{.*}}, double %{{.*}}) + // CHECK: ret double [[RESULT]] + double pos_inf = __builtin_inf(); + double neg_inf = -__builtin_inf(); + return __builtin_ct_select(cond, pos_inf, neg_inf); +} + +// Test subnormal/denormal values +float test_subnormal_operands(int cond) { + // CHECK-LABEL: define {{.*}} @test_subnormal_operands + // CHECK-DAG: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK-DAG: [[RESULT:%.*]] = call float @llvm.ct.select.f32(i1 [[COND]], float %{{.*}}, float %{{.*}}) + // CHECK: ret float [[RESULT]] + // Very small subnormal values + float subnormal1 = 1e-40f; + float subnormal2 = 1e-45f; + return __builtin_ct_select(cond, subnormal1, subnormal2); +} + +// Test integer overflow boundaries +int test_integer_overflow_operands(int cond) { + // CHECK-LABEL: define {{.*}} @test_integer_overflow_operands + // CHECK-DAG: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK-DAG: [[RESULT:%.*]] = call i32 @llvm.ct.select.i32(i1 [[COND]], i32 %{{.*}}, i32 %{{.*}}) + // CHECK: ret i32 [[RESULT]] + 
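+  // The operands are the extreme signed 32-bit values, so any narrowing or
+  // clamping in the lowering would change the selected result.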
int max_int = __INT_MAX__; + int min_int = (-__INT_MAX__ - 1); + return __builtin_ct_select(cond, max_int, min_int); +} + +long long test_longlong_overflow_operands(int cond) { + // CHECK-LABEL: define {{.*}} @test_longlong_overflow_operands + // CHECK-DAG: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK-DAG: [[RESULT:%.*]] = call i64 @llvm.ct.select.i64(i1 [[COND]], i64 %{{.*}}, i64 %{{.*}}) + // CHECK: ret i64 [[RESULT]] + long long max_ll = __LONG_LONG_MAX__; + long long min_ll = (-__LONG_LONG_MAX__ - 1); + return __builtin_ct_select(cond, max_ll, min_ll); +} + +// Test unsigned overflow boundaries +unsigned int test_unsigned_overflow_operands(int cond) { + // CHECK-LABEL: define {{.*}} @test_unsigned_overflow_operands + // CHECK-DAG: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK-DAG: [[RESULT:%.*]] = call i32 @llvm.ct.select.i32(i1 [[COND]], i32 %{{.*}}, i32 %{{.*}}) + // CHECK: ret i32 [[RESULT]] + unsigned int max_uint = 4294967295; + unsigned int min_uint = 0; + return __builtin_ct_select(cond, max_uint, min_uint); +} + +// Test null pointer dereference avoidance +int* test_null_pointer_operands(int cond, int* valid_ptr) { + // CHECK: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK: [[RESULT:%.*]] = call ptr @llvm.ct.select.p0(i1 [[COND]], ptr %{{.*}}, ptr %{{.*}}) + // CHECK: ret ptr [[RESULT]] + int* null_ptr = (int*)0; + return __builtin_ct_select(cond, null_ptr, valid_ptr); +} + +// Test volatile operations +volatile int global_volatile = 42; +int test_volatile_operands(int cond) { + // CHECK-LABEL: define {{.*}} @test_volatile_operands + // CHECK-DAG: [[VOLATILE_LOAD:%.*]] = load volatile i32, ptr {{.*}} + // CHECK-DAG: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK-DAG: [[RESULT:%.*]] = call i32 @llvm.ct.select.i32(i1 [[COND]], i32 %{{.*}}, i32 100) + // CHECK: ret i32 [[RESULT]] + volatile int vol_val = global_volatile; + return __builtin_ct_select(cond, vol_val, 100); +} + +// Test uninitialized variable behavior (should still work with ct_select) +int test_uninitialized_operands(int cond, int initialized) { + // CHECK-LABEL: define {{.*}} @test_uninitialized_operands + // CHECK: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK: [[RESULT:%.*]] = call i32 @llvm.ct.select.i32(i1 [[COND]], i32 %{{.*}}, i32 %{{.*}}) + // CHECK: ret i32 [[RESULT]] + int uninitialized; // Intentionally uninitialized + return __builtin_ct_select(cond, uninitialized, initialized); +} + +// Test zero division avoidance patterns +int test_division_by_zero_avoidance(int cond, int dividend, int divisor) { + // CHECK-LABEL: define {{.*}} @test_division_by_zero_avoidance + // CHECK-DAG: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK-DAG: [[DIV_RESULT:%.*]] = sdiv i32 %{{.*}}, %{{.*}} + // CHECK-DAG: [[SAFE_DIVISOR:%.*]] = call i32 @llvm.ct.select.i32(i1 [[COND]], i32 %{{.*}}, i32 1) + // First get a safe divisor (never zero) + int safe_divisor = __builtin_ct_select(divisor != 0, divisor, 1); + // Then perform division with guaranteed non-zero divisor + return dividend / safe_divisor; +} + +// Test array bounds checking patterns +int test_array_bounds_protection(int cond, int index, int* array) { + // CHECK-LABEL: define {{.*}} @test_array_bounds_protection + // CHECK-DAG: [[SAFE_INDEX:%.*]] = call i32 @llvm.ct.select.i32(i1 {{.*}}, i32 %{{.*}}, i32 0) + // Use ct_select to ensure safe array indexing + int safe_index = __builtin_ct_select(index >= 0 && index < 10, index, 0); + return array[safe_index]; +} + +// Test bit manipulation edge cases +unsigned int test_bit_manipulation_edge_cases(int 
cond, unsigned int value) { + // CHECK-LABEL: define {{.*}} @test_bit_manipulation_edge_cases + // CHECK-DAG: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK-DAG: [[SHIFT_LEFT:%.*]] = shl i32 %{{.*}}, 31 + // CHECK-DAG: [[SHIFT_RIGHT:%.*]] = lshr i32 %{{.*}}, 31 + // CHECK-DAG: [[RESULT:%.*]] = call i32 @llvm.ct.select.i32(i1 [[COND]], i32 %{{.*}}, i32 %{{.*}}) + // CHECK: ret i32 [[RESULT]] + // Test extreme bit shifts that could cause undefined behavior + unsigned int left_shift = value << 31; // Could overflow + unsigned int right_shift = value >> 31; // Extract sign bit + return __builtin_ct_select(cond, left_shift, right_shift); +} + +// Test signed integer wraparound +int test_signed_wraparound(int cond, int a, int b) { + // CHECK-LABEL: define {{.*}} @test_signed_wraparound + // CHECK-DAG: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK-DAG: [[ADD:%.*]] = add nsw i32 %{{.*}}, %{{.*}} + // CHECK-DAG: [[SUB:%.*]] = sub nsw i32 %{{.*}}, %{{.*}} + // CHECK-DAG: [[RESULT:%.*]] = call i32 @llvm.ct.select.i32(i1 [[COND]], i32 %{{.*}}, i32 %{{.*}}) + // CHECK: ret i32 [[RESULT]] + int sum = a + b; // Could overflow + int diff = a - b; // Could underflow + return __builtin_ct_select(cond, sum, diff); +} + +// Test vector NaN handling +float4 test_vector_nan_operands(int cond) { + // CHECK-LABEL: define {{.*}} @test_vector_nan_operands + // CHECK: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK: [[RESULT:%.*]] = call <4 x float> @llvm.ct.select.v4f32(i1 [[COND]], <4 x float> %{{.*}}, <4 x float> %{{.*}}) + // CHECK: ret <4 x float> [[RESULT]] + float nan_val = __builtin_nanf(""); + float4 nan_vec = {nan_val, nan_val, nan_val, nan_val}; + float4 normal_vec = {1.0f, 2.0f, 3.0f, 4.0f}; + return __builtin_ct_select(cond, nan_vec, normal_vec); +} + +// Test vector infinity handling +float4 test_vector_infinity_operands(int cond) { + // CHECK-LABEL: define {{.*}} @test_vector_infinity_operands + // CHECK: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK: [[RESULT:%.*]] = call <4 x float> @llvm.ct.select.v4f32(i1 [[COND]], <4 x float> %{{.*}}, <4 x float> %{{.*}}) + // CHECK: ret <4 x float> [[RESULT]] + float pos_inf = __builtin_inff(); + float neg_inf = -__builtin_inff(); + float4 inf_vec = {pos_inf, neg_inf, pos_inf, neg_inf}; + float4 zero_vec = {0.0f, 0.0f, 0.0f, 0.0f}; + return __builtin_ct_select(cond, inf_vec, zero_vec); +} + +// Test mixed special values +double test_mixed_special_values(int cond) { + // CHECK-LABEL: define {{.*}} @test_mixed_special_values + // CHECK: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK: [[RESULT:%.*]] = call double @llvm.ct.select.f64(i1 [[COND]], double %{{.*}}, double %{{.*}}) + // CHECK: ret double [[RESULT]] + double nan_val = __builtin_nan(""); + double inf_val = __builtin_inf(); + return __builtin_ct_select(cond, nan_val, inf_val); +} + +// Test constant-time memory access pattern +int test_constant_time_memory_access(int secret_index, int* data_array) { + // CHECK-LABEL: define {{.*}} @test_constant_time_memory_access + // This pattern ensures constant-time memory access regardless of secret_index value + int result = 0; + // Use ct_select to accumulate values without revealing the secret index + for (int i = 0; i < 8; i++) { + int is_target = (i == secret_index); + int current_value = data_array[i]; + int selected_value = __builtin_ct_select(is_target, current_value, 0); + result += selected_value; + } + return result; +} + +// Test timing-attack resistant comparison +int test_timing_resistant_comparison(const char* secret, const char* guess) { + 
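+  // The loop below always runs a fixed 32 iterations and updates `match`
+  // only through ct_select, keeping the comparison result in data flow
+  // rather than control flow.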
// CHECK-LABEL: define {{.*}} @test_timing_resistant_comparison
+  // Constant-time string comparison using ct_select
+  int match = 1;
+  for (int i = 0; i < 32; i++) {
+    int chars_equal = (secret[i] == guess[i]);
+    int both_null = (secret[i] == 0) && (guess[i] == 0);
+    int still_matching = __builtin_ct_select(chars_equal || both_null, match, 0);
+    match = __builtin_ct_select(both_null, match, still_matching);
+  }
+  return match;
+}
diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h
index ff3dd0d4c3c51..656f6e718f029 100644
--- a/llvm/include/llvm/CodeGen/ISDOpcodes.h
+++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h
@@ -783,6 +783,10 @@ enum NodeType {
   /// i1 then the high bits must conform to getBooleanContents.
   SELECT,
 
+  /// Constant-time select. Behaves like SELECT, but is lowered to branch-free
+  /// code (e.g. CMOV on x86) so the chosen operand is not observable via timing.
+  CTSELECT,
+
   /// Select with a vector condition (op #0) and two vector operands (ops #1
   /// and #2), returning a vector result. All vectors have the same length.
   /// Much like the scalar select and setcc, each bit in the condition selects
diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h
index df6ce0fe1b037..00d2f5bd6c8eb 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAG.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAG.h
@@ -1348,6 +1348,13 @@ class SelectionDAG {
     return getNode(Opcode, DL, VT, Cond, LHS, RHS, Flags);
   }
 
+  SDValue getCTSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS,
+                      SDValue RHS, SDNodeFlags Flags = SDNodeFlags()) {
+    assert(LHS.getValueType() == VT && RHS.getValueType() == VT &&
+           "Cannot use select on differing types");
+    return getNode(ISD::CTSELECT, DL, VT, Cond, LHS, RHS, Flags);
+  }
+
   /// Helper function to make it easier to build SelectCC's if you just have an
   /// ISD::CondCode instead of an SDValue.
   SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True,
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 73f2c55a71125..375a4bf4c5c03 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -242,11 +242,15 @@ class LLVM_ABI TargetLoweringBase {
 
   /// Enum that describes what type of support for selects the target has.
   enum SelectSupportKind {
-    ScalarValSelect,     // The target supports scalar selects (ex: cmov).
-    ScalarCondVectorVal, // The target supports selects with a scalar condition
-                         // and vector values (ex: cmov).
-    VectorMaskSelect     // The target supports vector selects with a vector
-                         // mask (ex: x86 blends).
+    ScalarValSelect,     // The target supports scalar selects (ex: cmov).
+    ScalarCondVectorVal, // The target supports selects with a scalar condition
+                         // and vector values (ex: cmov).
+    VectorMaskSelect,    // The target supports vector selects with a vector
+                         // mask (ex: x86 blends).
+    CtSelect,            // The target implements a custom constant-time select.
+    ScalarCondVectorValCtSelect, // The target supports selects with a scalar
+                                 // condition and vector values, in constant time.
+    VectorMaskValCtSelect,       // The target supports vector selects with a vector
+                                 // mask, in constant time.
   };
 
   /// Enum that specifies what an atomic load/AtomicRMWInst is expanded
@@ -476,8 +480,8 @@ class LLVM_ABI TargetLoweringBase {
   MachineMemOperand::Flags
   getVPIntrinsicMemOperandFlags(const VPIntrinsic &VPIntrin) const;
 
-  virtual bool isSelectSupported(SelectSupportKind /*kind*/) const {
-    return true;
+  virtual bool isSelectSupported(SelectSupportKind kind) const {
+    return kind != CtSelect;
   }
 
   /// Return true if the @llvm.get.active.lane.mask intrinsic should be expanded
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index 8856eda250ed6..8c76ed010096b 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -1813,6 +1813,13 @@ def int_coro_subfn_addr : DefaultAttrsIntrinsic<
     [IntrReadMem, IntrArgMemOnly, ReadOnly<ArgIndex<0>>,
      NoCapture<ArgIndex<0>>]>;
 
+///===-------------------------- Constant Time Intrinsics --------------------------===//
+//
+// Intrinsic to support constant time select
+def int_ct_select : DefaultAttrsIntrinsic<[llvm_any_ty],
+    [llvm_i1_ty, LLVMMatchType<0>, LLVMMatchType<0>],
+    [IntrWriteMem, IntrWillReturn, NoUndef<RetIndex>]>;
+
 ///===-------------------------- Other Intrinsics --------------------------===//
 //
 // TODO: We should introduce a new memory kind fo traps (and other side effects
diff --git a/llvm/include/llvm/Target/TargetSelectionDAG.td b/llvm/include/llvm/Target/TargetSelectionDAG.td
index 07a858fd682fc..c783a2aa9258f 100644
--- a/llvm/include/llvm/Target/TargetSelectionDAG.td
+++ b/llvm/include/llvm/Target/TargetSelectionDAG.td
@@ -214,6 +214,10 @@ def SDTSelect : SDTypeProfile<1, 3, [ // select
   SDTCisInt<1>, SDTCisSameAs<0, 2>, SDTCisSameAs<2, 3>
 ]>;
 
+def SDTCtSelect : SDTypeProfile<1, 3, [ // ctselect
+  SDTCisInt<1>, SDTCisSameAs<0, 2>, SDTCisSameAs<2, 3>
+]>;
+
 def SDTVSelect : SDTypeProfile<1, 3, [ // vselect
   SDTCisVec<0>, SDTCisInt<1>, SDTCisSameAs<0, 2>, SDTCisSameAs<2, 3>, SDTCisSameNumEltsAs<0, 1>
 ]>;
@@ -717,6 +721,7 @@ def reset_fpmode : SDNode<"ISD::RESET_FPMODE", SDTNone, [SDNPHasChain]>;
 
 def setcc      : SDNode<"ISD::SETCC"     , SDTSetCC>;
 def select     : SDNode<"ISD::SELECT"    , SDTSelect>;
+def ctselect   : SDNode<"ISD::CTSELECT"  , SDTCtSelect>;
 def vselect    : SDNode<"ISD::VSELECT"   , SDTVSelect>;
 def selectcc   : SDNode<"ISD::SELECT_CC" , SDTSelectCC>;
 
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index c97300d64d455..cac7e813101c8 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -484,6 +484,7 @@ namespace {
     SDValue visitCTTZ_ZERO_UNDEF(SDNode *N);
     SDValue visitCTPOP(SDNode *N);
     SDValue visitSELECT(SDNode *N);
+    SDValue visitCTSELECT(SDNode *N);
     SDValue visitVSELECT(SDNode *N);
     SDValue visitVP_SELECT(SDNode *N);
     SDValue visitSELECT_CC(SDNode *N);
@@ -1968,6 +1969,7 @@ SDValue DAGCombiner::visit(SDNode *N) {
   case ISD::CTTZ_ZERO_UNDEF:    return visitCTTZ_ZERO_UNDEF(N);
   case ISD::CTPOP:              return visitCTPOP(N);
   case ISD::SELECT:             return visitSELECT(N);
+  case ISD::CTSELECT:           return visitCTSELECT(N);
   case ISD::VSELECT:            return visitVSELECT(N);
   case ISD::SELECT_CC:          return visitSELECT_CC(N);
   case ISD::SETCC:              return visitSETCC(N);
@@ -6016,6 +6018,7 @@ static SDValue isSaturatingMinMax(SDValue N0, SDValue N1, SDValue N2,
     N0CC = cast<CondCodeSDNode>(N0.getOperand(4))->get();
     break;
   case ISD::SELECT:
+  case ISD::CTSELECT:
   case ISD::VSELECT:
     if (N0.getOperand(0).getOpcode() != ISD::SETCC)
       return SDValue();
@@ -12168,8 +12171,9 @@ template <class MatchContextClass>
 static
SDValue foldBoolSelectToLogic(SDNode *N, const SDLoc &DL, SelectionDAG &DAG) { assert((N->getOpcode() == ISD::SELECT || N->getOpcode() == ISD::VSELECT || - N->getOpcode() == ISD::VP_SELECT) && - "Expected a (v)(vp.)select"); + N->getOpcode() == ISD::VP_SELECT || + N->getOpcode() == ISD::CTSELECT) && + "Expected a (v)(vp.)(ct) select"); SDValue Cond = N->getOperand(0); SDValue T = N->getOperand(1), F = N->getOperand(2); EVT VT = N->getValueType(0); @@ -12531,6 +12535,109 @@ SDValue DAGCombiner::visitSELECT(SDNode *N) { return SDValue(); } +SDValue DAGCombiner::visitCTSELECT(SDNode *N) { + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + SDValue N2 = N->getOperand(2); + EVT VT = N->getValueType(0); + EVT VT0 = N0.getValueType(); + SDLoc DL(N); + SDNodeFlags Flags = N->getFlags(); + + if (SDValue V = foldBoolSelectToLogic(N, DL, DAG)) + return V; + + // ctselect (not Cond), N1, N2 -> ctselect Cond, N2, N1 + if (SDValue F = extractBooleanFlip(N0, DAG, TLI, false)) { + SDValue SelectOp = DAG.getNode(ISD::CTSELECT, DL, VT, F, N2, N1); + SelectOp->setFlags(Flags); + return SelectOp; + } + + if (VT0 == MVT::i1) { + // The code in this block deals with the following 2 equivalences: + // select(C0|C1, x, y) <=> select(C0, x, select(C1, x, y)) + // select(C0&C1, x, y) <=> select(C0, select(C1, x, y), y) + // The target can specify its preferred form with the + // shouldNormalizeToSelectSequence() callback. However we always transform + // to the right anyway if we find the inner select exists in the DAG anyway + // and we always transform to the left side if we know that we can further + // optimize the combination of the conditions. + bool normalizeToSequence = + TLI.shouldNormalizeToSelectSequence(*DAG.getContext(), VT); + // ctselect (and Cond0, Cond1), X, Y + // -> ctselect Cond0, (ctselect Cond1, X, Y), Y + if (N0->getOpcode() == ISD::AND && N0->hasOneUse()) { + SDValue Cond0 = N0->getOperand(0); + SDValue Cond1 = N0->getOperand(1); + SDValue InnerSelect = DAG.getNode(ISD::CTSELECT, DL, N1.getValueType(), + Cond1, N1, N2, Flags); + if (normalizeToSequence || !InnerSelect.use_empty()) + return DAG.getNode(ISD::CTSELECT, DL, N1.getValueType(), Cond0, + InnerSelect, N2, Flags); + // Cleanup on failure. + if (InnerSelect.use_empty()) + recursivelyDeleteUnusedNodes(InnerSelect.getNode()); + } + // ctselect (or Cond0, Cond1), X, Y -> ctselect Cond0, X, (ctselect Cond1, + // X, Y) + if (N0->getOpcode() == ISD::OR && N0->hasOneUse()) { + SDValue Cond0 = N0->getOperand(0); + SDValue Cond1 = N0->getOperand(1); + SDValue InnerSelect = DAG.getNode(ISD::CTSELECT, DL, N1.getValueType(), + Cond1, N1, N2, Flags); + if (normalizeToSequence || !InnerSelect.use_empty()) + return DAG.getNode(ISD::CTSELECT, DL, N1.getValueType(), Cond0, N1, + InnerSelect, Flags); + // Cleanup on failure. + if (InnerSelect.use_empty()) + recursivelyDeleteUnusedNodes(InnerSelect.getNode()); + } + + // ctselect Cond0, (ctselect Cond1, X, Y), Y -> ctselect (and Cond0, Cond1), + // X, Y + if (N1->getOpcode() == ISD::CTSELECT && N1->hasOneUse()) { + SDValue N1_0 = N1->getOperand(0); + SDValue N1_1 = N1->getOperand(1); + SDValue N1_2 = N1->getOperand(2); + if (N1_2 == N2 && N0.getValueType() == N1_0.getValueType()) { + // Create the actual and node if we can generate good code for it. 
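+        // The AND of the two conditions is evaluated unconditionally, so the
+        // folded form stays branch-free and preserves the constant-time
+        // property.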
+ if (!normalizeToSequence) { + SDValue And = DAG.getNode(ISD::AND, DL, N0.getValueType(), N0, N1_0); + return DAG.getNode(ISD::CTSELECT, DL, N1.getValueType(), And, N1_1, + N2, Flags); + } + // Otherwise see if we can optimize the "and" to a better pattern. + if (SDValue Combined = visitANDLike(N0, N1_0, N)) { + return DAG.getNode(ISD::CTSELECT, DL, N1.getValueType(), Combined, + N1_1, N2, Flags); + } + } + } + // ctselect Cond0, X, (ctselect Cond1, X, Y) -> ctselect (or Cond0, Cond1), + // X, Y + if (N2->getOpcode() == ISD::CTSELECT && N2->hasOneUse()) { + SDValue N2_0 = N2->getOperand(0); + SDValue N2_1 = N2->getOperand(1); + SDValue N2_2 = N2->getOperand(2); + if (N2_1 == N1 && N0.getValueType() == N2_0.getValueType()) { + // Create the actual or node if we can generate good code for it. + if (!normalizeToSequence) { + SDValue Or = DAG.getNode(ISD::OR, DL, N0.getValueType(), N0, N2_0); + return DAG.getNode(ISD::CTSELECT, DL, N1.getValueType(), Or, N1, N2_2, + Flags); + } + // Otherwise see if we can optimize to a better pattern. + if (SDValue Combined = visitORLike(N0, N2_0, DL)) + return DAG.getNode(ISD::CTSELECT, DL, N1.getValueType(), Combined, N1, + N2_2, Flags); + } + } + } + + return SDValue(); +} + // This function assumes all the vselect's arguments are CONCAT_VECTOR // nodes and that the condition is a BV of ConstantSDNodes (or undefs). static SDValue ConvertSelectToConcatVector(SDNode *N, SelectionDAG &DAG) { diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index 5fb7e63cfb605..c7c7bf28de79e 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -4135,6 +4135,15 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) { } Results.push_back(Tmp1); break; + case ISD::CTSELECT: { + Tmp1 = Node->getOperand(0); + Tmp2 = Node->getOperand(1); + Tmp3 = Node->getOperand(2); + Tmp1 = DAG.getCTSelect(dl, Tmp1.getValueType(), Tmp1, Tmp2, Tmp3); + Tmp1->setFlags(Node->getFlags()); + Results.push_back(Tmp1); + break; + } case ISD::BR_JT: { SDValue Chain = Node->getOperand(0); SDValue Table = Node->getOperand(1); @@ -5473,7 +5482,8 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) { Results.push_back(DAG.getNode(ISD::TRUNCATE, dl, OVT, Tmp2)); break; } - case ISD::SELECT: { + case ISD::SELECT: + case ISD::CTSELECT: { unsigned ExtOp, TruncOp; if (Node->getValueType(0).isVector() || Node->getValueType(0).getSizeInBits() == NVT.getSizeInBits()) { diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp index 437d0f4654096..61251e58046d3 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp @@ -159,6 +159,7 @@ void DAGTypeLegalizer::SoftenFloatResult(SDNode *N, unsigned ResNo) { case ISD::ATOMIC_LOAD: R = SoftenFloatRes_ATOMIC_LOAD(N); break; case ISD::ATOMIC_SWAP: R = BitcastToInt_ATOMIC_SWAP(N); break; case ISD::SELECT: R = SoftenFloatRes_SELECT(N); break; + case ISD::CTSELECT: R = SoftenFloatRes_CTSELECT(N); break; case ISD::SELECT_CC: R = SoftenFloatRes_SELECT_CC(N); break; case ISD::FREEZE: R = SoftenFloatRes_FREEZE(N); break; case ISD::STRICT_SINT_TO_FP: @@ -1041,6 +1042,13 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_SELECT(SDNode *N) { LHS.getValueType(), N->getOperand(0), LHS, RHS); } +SDValue DAGTypeLegalizer::SoftenFloatRes_CTSELECT(SDNode *N) { + SDValue LHS = GetSoftenedFloat(N->getOperand(1)); + SDValue RHS = 
GetSoftenedFloat(N->getOperand(2)); + return DAG.getCTSelect(SDLoc(N), LHS.getValueType(), N->getOperand(0), LHS, + RHS); +} + SDValue DAGTypeLegalizer::SoftenFloatRes_SELECT_CC(SDNode *N) { SDValue LHS = GetSoftenedFloat(N->getOperand(2)); SDValue RHS = GetSoftenedFloat(N->getOperand(3)); @@ -1541,6 +1549,7 @@ void DAGTypeLegalizer::ExpandFloatResult(SDNode *N, unsigned ResNo) { case ISD::POISON: case ISD::UNDEF: SplitRes_UNDEF(N, Lo, Hi); break; case ISD::SELECT: SplitRes_Select(N, Lo, Hi); break; + case ISD::CTSELECT: SplitRes_Select(N, Lo, Hi); break; case ISD::SELECT_CC: SplitRes_SELECT_CC(N, Lo, Hi); break; case ISD::MERGE_VALUES: ExpandRes_MERGE_VALUES(N, ResNo, Lo, Hi); break; @@ -2897,6 +2906,9 @@ void DAGTypeLegalizer::PromoteFloatResult(SDNode *N, unsigned ResNo) { R = PromoteFloatRes_ATOMIC_LOAD(N); break; case ISD::SELECT: R = PromoteFloatRes_SELECT(N); break; + case ISD::CTSELECT: + R = PromoteFloatRes_SELECT(N); + break; case ISD::SELECT_CC: R = PromoteFloatRes_SELECT_CC(N); break; case ISD::SINT_TO_FP: @@ -3199,7 +3211,7 @@ SDValue DAGTypeLegalizer::PromoteFloatRes_SELECT(SDNode *N) { SDValue TrueVal = GetPromotedFloat(N->getOperand(1)); SDValue FalseVal = GetPromotedFloat(N->getOperand(2)); - return DAG.getNode(ISD::SELECT, SDLoc(N), TrueVal->getValueType(0), + return DAG.getNode(N->getOpcode(), SDLoc(N), TrueVal->getValueType(0), N->getOperand(0), TrueVal, FalseVal); } @@ -3383,6 +3395,9 @@ void DAGTypeLegalizer::SoftPromoteHalfResult(SDNode *N, unsigned ResNo) { R = SoftPromoteHalfRes_ATOMIC_LOAD(N); break; case ISD::SELECT: R = SoftPromoteHalfRes_SELECT(N); break; + case ISD::CTSELECT: + R = SoftPromoteHalfRes_SELECT(N); + break; case ISD::SELECT_CC: R = SoftPromoteHalfRes_SELECT_CC(N); break; case ISD::STRICT_SINT_TO_FP: case ISD::STRICT_UINT_TO_FP: diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp index 88a4a8b16373b..124f61df9679b 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -95,6 +95,7 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) { Res = PromoteIntRes_VECTOR_COMPRESS(N); break; case ISD::SELECT: + case ISD::CTSELECT: case ISD::VSELECT: case ISD::VP_SELECT: case ISD::VP_MERGE: @@ -2000,6 +2001,9 @@ bool DAGTypeLegalizer::PromoteIntegerOperand(SDNode *N, unsigned OpNo) { break; case ISD::VSELECT: case ISD::SELECT: Res = PromoteIntOp_SELECT(N, OpNo); break; + case ISD::CTSELECT: + Res = PromoteIntOp_CTSELECT(N, OpNo); + break; case ISD::SELECT_CC: Res = PromoteIntOp_SELECT_CC(N, OpNo); break; case ISD::VP_SETCC: case ISD::SETCC: Res = PromoteIntOp_SETCC(N, OpNo); break; @@ -2377,6 +2381,19 @@ SDValue DAGTypeLegalizer::PromoteIntOp_SELECT(SDNode *N, unsigned OpNo) { N->getOperand(2)), 0); } +SDValue DAGTypeLegalizer::PromoteIntOp_CTSELECT(SDNode *N, unsigned OpNo) { + assert(OpNo == 0 && "Only know how to promote the condition!"); + SDValue Cond = N->getOperand(0); + EVT OpTy = N->getOperand(1).getValueType(); + + // Promote all the way up to the canonical SetCC type. + EVT OpVT = N->getOpcode() == ISD::CTSELECT ? 
OpTy.getScalarType() : OpTy; + Cond = PromoteTargetBoolean(Cond, OpVT); + + return SDValue( + DAG.UpdateNodeOperands(N, Cond, N->getOperand(1), N->getOperand(2)), 0); +} + SDValue DAGTypeLegalizer::PromoteIntOp_SELECT_CC(SDNode *N, unsigned OpNo) { assert(OpNo == 0 && "Don't know how to promote this operand!"); @@ -2978,6 +2995,9 @@ void DAGTypeLegalizer::ExpandIntegerResult(SDNode *N, unsigned ResNo) { case ISD::ARITH_FENCE: SplitRes_ARITH_FENCE(N, Lo, Hi); break; case ISD::MERGE_VALUES: SplitRes_MERGE_VALUES(N, ResNo, Lo, Hi); break; case ISD::SELECT: SplitRes_Select(N, Lo, Hi); break; + case ISD::CTSELECT: + SplitRes_Select(N, Lo, Hi); + break; case ISD::SELECT_CC: SplitRes_SELECT_CC(N, Lo, Hi); break; case ISD::POISON: case ISD::UNDEF: SplitRes_UNDEF(N, Lo, Hi); break; diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h index 603dc34ce72a7..f76520ad07508 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -401,6 +401,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { SDValue PromoteIntOp_CONCAT_VECTORS(SDNode *N); SDValue PromoteIntOp_ScalarOp(SDNode *N); SDValue PromoteIntOp_SELECT(SDNode *N, unsigned OpNo); + SDValue PromoteIntOp_CTSELECT(SDNode *N, unsigned OpNo); SDValue PromoteIntOp_SELECT_CC(SDNode *N, unsigned OpNo); SDValue PromoteIntOp_SETCC(SDNode *N, unsigned OpNo); SDValue PromoteIntOp_Shift(SDNode *N); @@ -633,6 +634,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { SDValue SoftenFloatRes_LOAD(SDNode *N); SDValue SoftenFloatRes_ATOMIC_LOAD(SDNode *N); SDValue SoftenFloatRes_SELECT(SDNode *N); + SDValue SoftenFloatRes_CTSELECT(SDNode *N); SDValue SoftenFloatRes_SELECT_CC(SDNode *N); SDValue SoftenFloatRes_UNDEF(SDNode *N); SDValue SoftenFloatRes_VAARG(SDNode *N); @@ -893,6 +895,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { SDValue ScalarizeVecRes_SCALAR_TO_VECTOR(SDNode *N); SDValue ScalarizeVecRes_VSELECT(SDNode *N); SDValue ScalarizeVecRes_SELECT(SDNode *N); + SDValue ScalarizeVecRes_CTSELECT(SDNode *N); SDValue ScalarizeVecRes_SELECT_CC(SDNode *N); SDValue ScalarizeVecRes_SETCC(SDNode *N); SDValue ScalarizeVecRes_UNDEF(SDNode *N); @@ -1221,7 +1224,8 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { SDValue &Lo, SDValue &Hi); void SplitVecRes_AssertZext (SDNode *N, SDValue &Lo, SDValue &Hi); void SplitRes_ARITH_FENCE (SDNode *N, SDValue &Lo, SDValue &Hi); - void SplitRes_Select (SDNode *N, SDValue &Lo, SDValue &Hi); + void SplitRes_Select(SDNode *N, SDValue &Lo, SDValue &Hi); + void SplitRes_CTSELECT(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitRes_SELECT_CC (SDNode *N, SDValue &Lo, SDValue &Hi); void SplitRes_UNDEF (SDNode *N, SDValue &Lo, SDValue &Hi); void SplitRes_FREEZE (SDNode *N, SDValue &Lo, SDValue &Hi); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp index 88c1af20a321e..098368ef2f6b3 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp @@ -570,6 +570,20 @@ void DAGTypeLegalizer::SplitRes_Select(SDNode *N, SDValue &Lo, SDValue &Hi) { Hi = DAG.getNode(Opcode, dl, LH.getValueType(), CH, LH, RH, EVLHi); } +void DAGTypeLegalizer::SplitRes_CTSELECT(SDNode *N, SDValue &Lo, SDValue &Hi) { + SDValue LL, LH, RL, RH, CL, CH; + SDLoc dl(N); + GetSplitOp(N->getOperand(1), LL, LH); + GetSplitOp(N->getOperand(2), RL, RH); + + SDValue Cond = N->getOperand(0); + CL = CH = Cond; + 
assert(!Cond.getValueType().isVector() && "Unsupported vector type"); + + Lo = DAG.getNode(N->getOpcode(), dl, LL.getValueType(), CL, LL, RL); + Hi = DAG.getNode(N->getOpcode(), dl, LH.getValueType(), CH, LH, RH); +} + void DAGTypeLegalizer::SplitRes_SELECT_CC(SDNode *N, SDValue &Lo, SDValue &Hi) { SDValue LL, LH, RL, RH; diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index 3b5f83f7c089a..4ecc12c1f0e31 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -74,6 +74,9 @@ void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) { case ISD::SIGN_EXTEND_INREG: R = ScalarizeVecRes_InregOp(N); break; case ISD::VSELECT: R = ScalarizeVecRes_VSELECT(N); break; case ISD::SELECT: R = ScalarizeVecRes_SELECT(N); break; + case ISD::CTSELECT: + R = ScalarizeVecRes_CTSELECT(N); + break; case ISD::SELECT_CC: R = ScalarizeVecRes_SELECT_CC(N); break; case ISD::SETCC: R = ScalarizeVecRes_SETCC(N); break; case ISD::POISON: @@ -655,6 +658,12 @@ SDValue DAGTypeLegalizer::ScalarizeVecRes_SELECT(SDNode *N) { GetScalarizedVector(N->getOperand(2))); } +SDValue DAGTypeLegalizer::ScalarizeVecRes_CTSELECT(SDNode *N) { + SDValue LHS = GetScalarizedVector(N->getOperand(1)); + return DAG.getCTSelect(SDLoc(N), LHS.getValueType(), N->getOperand(0), LHS, + GetScalarizedVector(N->getOperand(2))); +} + SDValue DAGTypeLegalizer::ScalarizeVecRes_SELECT_CC(SDNode *N) { SDValue LHS = GetScalarizedVector(N->getOperand(2)); return DAG.getNode(ISD::SELECT_CC, SDLoc(N), LHS.getValueType(), @@ -1189,6 +1198,9 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) { case ISD::SELECT: case ISD::VP_MERGE: case ISD::VP_SELECT: SplitRes_Select(N, Lo, Hi); break; + case ISD::CTSELECT: + SplitRes_CTSELECT(N, Lo, Hi); + break; case ISD::SELECT_CC: SplitRes_SELECT_CC(N, Lo, Hi); break; case ISD::POISON: case ISD::UNDEF: SplitRes_UNDEF(N, Lo, Hi); break; @@ -4854,6 +4866,7 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) { case ISD::SIGN_EXTEND_INREG: Res = WidenVecRes_InregOp(N); break; case ISD::VSELECT: case ISD::SELECT: + case ISD::CTSELECT: case ISD::VP_SELECT: case ISD::VP_MERGE: Res = WidenVecRes_Select(N); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 90edaf3ef5471..8e14aa1f869a3 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -8249,6 +8249,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, return V; break; } + case ISD::SELECT: case ISD::VSELECT: if (SDValue V = simplifySelect(N1, N2, N3)) diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index cb0038c54f8c7..316fa19d0b37c 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -6667,6 +6667,35 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, updateDAGForMaybeTailCall(MC); return; } + case Intrinsic::ct_select: { + SDLoc DL = getCurSDLoc(); + + SDValue Cond = getValue(I.getArgOperand(0)); // i1 + SDValue A = getValue(I.getArgOperand(1)); // T + SDValue B = getValue(I.getArgOperand(2)); // T + + assert((A.getValueType() == B.getValueType()) && + "Operands are of different types"); + + EVT VT = A.getValueType(); + EVT CondVT = Cond.getValueType(); + + // assert if 
Cond type is Vector + assert(!CondVT.isVector() && "Vector type cond not supported yet"); + + // Handle scalar types + if (TLI.isSelectSupported( + TargetLoweringBase::SelectSupportKind::CtSelect) && + !CondVT.isVector()) { + SDValue Result = DAG.getNode(ISD::CTSELECT, DL, VT, Cond, A, B); + setValue(&I, Result); + return; + } + + SDValue Result = DAG.getNode(ISD::SELECT, DL, VT, Cond, A, B); + setValue(&I, Result); + return; + } case Intrinsic::call_preallocated_setup: { const CallBase *PreallocatedCall = FindPreallocatedCall(&I); SDValue SrcValue = DAG.getSrcValue(PreallocatedCall); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp index 39cbfad6d0be1..274a1cd4f7594 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -332,6 +332,7 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const { case ISD::FPOWI: return "fpowi"; case ISD::STRICT_FPOWI: return "strict_fpowi"; case ISD::SETCC: return "setcc"; + case ISD::CTSELECT: return "ctselect"; case ISD::SETCCCARRY: return "setcccarry"; case ISD::STRICT_FSETCC: return "strict_fsetcc"; case ISD::STRICT_FSETCCS: return "strict_fsetccs"; diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td index 8e08d16342975..638ecb2491fe0 100644 --- a/llvm/lib/Target/X86/X86.td +++ b/llvm/lib/Target/X86/X86.td @@ -41,6 +41,9 @@ def FeatureNOPL : SubtargetFeature<"nopl", "HasNOPL", "true", def FeatureCMOV : SubtargetFeature<"cmov","HasCMOV", "true", "Enable conditional move instructions">; +def FeatureCtSelect : SubtargetFeature<"ctselect", "HasCtSelect", "true", + "Enable feature to implement constant-time select">; + def FeatureCX8 : SubtargetFeature<"cx8", "HasCX8", "true", "Support CMPXCHG8B instructions">; @@ -830,7 +833,7 @@ def ProcessorFeatures { // x86-64 micro-architecture levels: x86-64 and x86-64-v[234] list X86_64V1Features = [ FeatureX87, FeatureCX8, FeatureCMOV, FeatureMMX, FeatureSSE2, - FeatureFXSR, FeatureNOPL, FeatureX86_64, + FeatureFXSR, FeatureNOPL, FeatureX86_64, FeatureCtSelect, ]; list X86_64V1Tuning = [ TuningMacroFusion, diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index a0b64ff370b10..fd7a70b32a23d 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -12,6 +12,7 @@ //===----------------------------------------------------------------------===// #include "X86ISelLowering.h" +#include "MCTargetDesc/X86MCTargetDesc.h" #include "MCTargetDesc/X86ShuffleDecode.h" #include "X86.h" #include "X86FrameLowering.h" @@ -29,6 +30,8 @@ #include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/Analysis/VectorUtils.h" #include "llvm/CodeGen/LivePhysRegs.h" +#include "llvm/CodeGen/ISDOpcodes.h" +#include "llvm/CodeGen/IntrinsicLowering.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" @@ -48,6 +51,7 @@ #include "llvm/IR/GlobalAlias.h" #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InlineAsm.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/PatternMatch.h" @@ -488,6 +492,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // X86 wants to expand cmov itself. 
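+  // Every type given custom SELECT lowering below also registers CTSELECT,
+  // so LowerCTSELECT can emit a branch-free sequence for the same types.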
for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) { setOperationAction(ISD::SELECT, VT, Custom); + setOperationAction(ISD::CTSELECT, VT, Custom); setOperationAction(ISD::SETCC, VT, Custom); setOperationAction(ISD::STRICT_FSETCC, VT, Custom); setOperationAction(ISD::STRICT_FSETCCS, VT, Custom); @@ -496,11 +501,13 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, if (VT == MVT::i64 && !Subtarget.is64Bit()) continue; setOperationAction(ISD::SELECT, VT, Custom); + setOperationAction(ISD::CTSELECT, VT, Custom); setOperationAction(ISD::SETCC, VT, Custom); } // Custom action for SELECT MMX and expand action for SELECT_CC MMX setOperationAction(ISD::SELECT, MVT::x86mmx, Custom); + setOperationAction(ISD::CTSELECT, MVT::x86mmx, Custom); setOperationAction(ISD::SELECT_CC, MVT::x86mmx, Expand); setOperationAction(ISD::EH_RETURN , MVT::Other, Custom); @@ -630,6 +637,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::BR_CC, VT, Action); setOperationAction(ISD::SETCC, VT, Action); setOperationAction(ISD::SELECT, VT, Custom); + setOperationAction(ISD::CTSELECT, VT, Custom); setOperationAction(ISD::SELECT_CC, VT, Action); setOperationAction(ISD::FROUND, VT, Action); setOperationAction(ISD::FROUNDEVEN, VT, Action); @@ -1067,6 +1075,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::VSELECT, MVT::v4f32, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom); setOperationAction(ISD::SELECT, MVT::v4f32, Custom); + setOperationAction(ISD::CTSELECT, MVT::v4f32, Custom); setOperationAction(ISD::FCANONICALIZE, MVT::v4f32, Custom); setOperationAction(ISD::LOAD, MVT::v2f32, Custom); @@ -1220,6 +1229,13 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SELECT, MVT::v8f16, Custom); setOperationAction(ISD::SELECT, MVT::v16i8, Custom); + setOperationAction(ISD::CTSELECT, MVT::v2f64, Custom); + setOperationAction(ISD::CTSELECT, MVT::v2i64, Custom); + setOperationAction(ISD::CTSELECT, MVT::v4i32, Custom); + setOperationAction(ISD::CTSELECT, MVT::v8i16, Custom); + setOperationAction(ISD::CTSELECT, MVT::v8f16, Custom); + setOperationAction(ISD::CTSELECT, MVT::v16i8, Custom); + setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Custom); setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom); @@ -1541,6 +1557,14 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SELECT, MVT::v32i8, Custom); setOperationAction(ISD::SELECT, MVT::v8f32, Custom); + setOperationAction(ISD::CTSELECT, MVT::v4f64, Custom); + setOperationAction(ISD::CTSELECT, MVT::v4i64, Custom); + setOperationAction(ISD::CTSELECT, MVT::v8i32, Custom); + setOperationAction(ISD::CTSELECT, MVT::v16i16, Custom); + setOperationAction(ISD::CTSELECT, MVT::v16f16, Custom); + setOperationAction(ISD::CTSELECT, MVT::v32i8, Custom); + setOperationAction(ISD::CTSELECT, MVT::v8f32, Custom); + for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) { setOperationAction(ISD::SIGN_EXTEND, VT, Custom); setOperationAction(ISD::ZERO_EXTEND, VT, Custom); @@ -1727,6 +1751,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, addRegisterClass(MVT::v16i1, &X86::VK16RegClass); setOperationAction(ISD::SELECT, MVT::v1i1, Custom); + setOperationAction(ISD::CTSELECT, MVT::v1i1, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v1i1, Custom); setOperationAction(ISD::BUILD_VECTOR, MVT::v1i1, Custom); @@ 
-1772,6 +1797,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) { setOperationAction(ISD::SETCC, VT, Custom); setOperationAction(ISD::SELECT, VT, Custom); + setOperationAction(ISD::CTSELECT, VT, Custom); setOperationAction(ISD::TRUNCATE, VT, Custom); setOperationAction(ISD::BUILD_VECTOR, VT, Custom); @@ -2038,6 +2064,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal); setOperationAction(ISD::SELECT, VT, Custom); + setOperationAction(ISD::CTSELECT, VT, Custom); setOperationAction(ISD::VSELECT, VT, Custom); setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); @@ -2203,6 +2230,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::SELECT, VT, Custom); + setOperationAction(ISD::CTSELECT, VT, Custom); setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); @@ -2269,6 +2297,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::VSELECT, VT, Legal); setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::SELECT, VT, Custom); + setOperationAction(ISD::CTSELECT, VT, Custom); setOperationAction(ISD::FNEG, VT, Custom); setOperationAction(ISD::FABS, VT, Custom); @@ -2643,6 +2672,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, ISD::BITCAST, ISD::VSELECT, ISD::SELECT, + ISD::CTSELECT, ISD::SHL, ISD::SRA, ISD::SRL, @@ -25321,6 +25351,316 @@ static SDValue LowerSIGN_EXTEND_Mask(SDValue Op, const SDLoc &dl, return V; } +SDValue X86TargetLowering::LowerCTSELECT(SDValue Op, SelectionDAG &DAG) const { + SDValue Cond = Op.getOperand(0); + SDValue Op1 = Op.getOperand(1); + SDValue Op2 = Op.getOperand(2); + SDLoc DL(Op); + MVT VT = Op1.getSimpleValueType(); + + // Handle soft float16 by converting to integer operations + if (isSoftF16(VT, Subtarget)) { + MVT NVT = VT.changeTypeToInteger(); + return DAG.getBitcast(VT, DAG.getNode(ISD::CTSELECT, DL, NVT, Cond, + DAG.getBitcast(NVT, Op1), + DAG.getBitcast(NVT, Op2))); + } + + // Handle vector types + if (VT.isVector()) { + // Handle soft float16 vectors + if (isSoftF16(VT, Subtarget)) { + MVT NVT = VT.changeVectorElementTypeToInteger(); + return DAG.getBitcast(VT, DAG.getNode(ISD::CTSELECT, DL, NVT, Cond, + DAG.getBitcast(NVT, Op1), + DAG.getBitcast(NVT, Op2))); + } + + unsigned VectorWidth = VT.getSizeInBits(); + MVT EltVT = VT.getVectorElementType(); + unsigned NumElts = VT.getVectorNumElements(); + + // Check if we have the necessary SIMD support + bool HasSSE = Subtarget.hasSSE1(); + bool HasAVX = Subtarget.hasAVX(); + bool HasAVX512 = Subtarget.hasAVX512(); + + // For 512-bit vectors, we need AVX512 + if (VectorWidth == 512 && !HasAVX512) + return SDValue(); + + // For 256-bit vectors, we need at least AVX + if (VectorWidth == 256 && !HasAVX) + return SDValue(); + + // For 128-bit vectors, we need at least SSE + if (VectorWidth == 128 && !HasSSE) + return SDValue(); + + // Handle special cases for floating point vectors + if (EltVT.isFloatingPoint()) { + // For AVX-512, use mask-based selection for better performance + if (HasAVX512 && VectorWidth == 512) { + // Convert 
+      // condition to a mask and use a masked select.
+      MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
+      SDValue Mask = DAG.getSetCC(DL, MaskVT, Cond,
+                                  DAG.getConstant(0, DL, Cond.getValueType()),
+                                  ISD::SETNE);
+      return DAG.getSelect(DL, VT, Mask, Op1, Op2);
+    }
+
+    // For vector floating point with AVX, use VBLENDV-style operations
+    if (HasAVX && (VectorWidth == 256 || VectorWidth == 128)) {
+      // Convert to bitwise operations using the condition
+      MVT IntVT = VT.changeVectorElementTypeToInteger();
+      SDValue IntOp1 = DAG.getBitcast(IntVT, Op1);
+      SDValue IntOp2 = DAG.getBitcast(IntVT, Op2);
+
+      // Create the CTSELECT node with integer types
+      SDValue IntResult =
+          DAG.getNode(X86ISD::CTSELECT, DL, IntVT, IntOp2, IntOp1,
+                      DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8),
+                      EmitTest(Cond, X86::COND_NE, DL, DAG, Subtarget));
+      return DAG.getBitcast(VT, IntResult);
+    }
+  }
+
+  // For integer vectors, or when we don't have advanced SIMD support, use the
+  // generic X86 CTSELECT node, which will be matched by the patterns below.
+  SDValue CC = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8);
+  SDValue EFLAGS = EmitTest(Cond, X86::COND_NE, DL, DAG, Subtarget);
+
+  // Create the X86 CTSELECT node. Note the operand order: false, true, cc,
+  // flags.
+  return DAG.getNode(X86ISD::CTSELECT, DL, VT, Op2, Op1, CC, EFLAGS);
+  }
+
+  // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE
+  // ops are available. Otherwise FP cmovs get lowered into a less efficient
+  // branch sequence later.
+  if (Cond.getOpcode() == ISD::SETCC && isScalarFPTypeInSSEReg(VT) &&
+      VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
+    SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
+    bool IsAlwaysSignaling;
+    unsigned SSECC =
+        translateX86FSETCC(cast<CondCodeSDNode>(Cond.getOperand(2))->get(),
+                           CondOp0, CondOp1, IsAlwaysSignaling);
+
+    // TODO: CTSELECT does not yet check for AVX support and optimize this
+    // using a vector select.
+    if (SSECC < 8) {
+      SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
+                                DAG.getTargetConstant(SSECC, DL, MVT::i8));
+      SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
+      SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
+      return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
+    }
+  }
+
+  // Try to optimize special patterns when comparing with zero
+  if (Cond.getOpcode() == X86ISD::SETCC &&
+      Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
+      isNullConstant(Cond.getOperand(1).getOperand(1))) {
+
+    SDValue CmpOp0 = Cond.getOperand(1).getOperand(0);
+    unsigned CondCode = Cond.getConstantOperandVal(0);
+
+    // Special handling for the __builtin_ffs(X) - 1 pattern
+    auto MatchFFSMinus1 = [&](SDValue Op1, SDValue Op2) {
+      return (Op1.getOpcode() == ISD::CTTZ_ZERO_UNDEF && Op1.hasOneUse() &&
+              Op1.getOperand(0) == CmpOp0 && isAllOnesConstant(Op2));
+    };
+
+    if ((VT == MVT::i32 || VT == MVT::i64) &&
+        ((CondCode == X86::COND_NE && MatchFFSMinus1(Op1, Op2)) ||
+         (CondCode == X86::COND_E && MatchFFSMinus1(Op2, Op1)))) {
+      // Keep the original comparison for the FFS pattern.
+    } else {
+
+      auto TryOptimizeAndOneSelect =
+          [&](SDValue CmpOp0, SDValue Op1, SDValue Op2, unsigned CondCode,
+              SDLoc DL, SelectionDAG &DAG,
+              const X86Subtarget &Subtarget) -> SDValue {
+        if (CondCode != X86::COND_E || CmpOp0.getOpcode() != ISD::AND ||
+            !isOneConstant(CmpOp0.getOperand(1)))
+          return SDValue();
+
+        EVT CmpVT = CmpOp0.getValueType();
+        EVT SelectVT = Op1.getValueType();
+
+        /// Helper to splat the LSB of CmpOp0 into a full mask.
+        auto SplatLSB = [&](EVT SplatVT) {
+          SDValue AdjustedValue;
+
+          if (CmpVT.bitsGT(SplatVT)) {
+            AdjustedValue = DAG.getNode(ISD::TRUNCATE, DL, SplatVT, CmpOp0);
+          } else if (CmpVT.bitsLT(SplatVT)) {
+            SDValue Extended =
+                DAG.getNode(ISD::ANY_EXTEND, DL, SplatVT, CmpOp0.getOperand(0));
+            AdjustedValue = DAG.getNode(ISD::AND, DL, SplatVT, Extended,
+                                        DAG.getConstant(1, DL, SplatVT));
+          } else {
+            AdjustedValue = CmpOp0;
+          }
+
+          return DAG.getNegative(AdjustedValue, DL, SplatVT);
+        };
+
+        // CTSELECT (AND(X,1) == 0), 0, -1 -> NEG(AND(X,1))
+        if (isNullConstant(Op1) && isAllOnesConstant(Op2))
+          return SplatLSB(SelectVT);
+
+        // CTSELECT (AND(X,1) == 0), C1, C2 ->
+        // XOR(C1, AND(NEG(AND(X,1)), XOR(C1, C2)))
+        if (!Subtarget.canUseCMOV() && isa<ConstantSDNode>(Op1) &&
+            isa<ConstantSDNode>(Op2)) {
+          SDValue Mask = SplatLSB(SelectVT);
+          SDValue Diff = DAG.getNode(ISD::XOR, DL, SelectVT, Op1, Op2);
+          SDValue Flip = DAG.getNode(ISD::AND, DL, SelectVT, Mask, Diff);
+          return DAG.getNode(ISD::XOR, DL, SelectVT, Op1, Flip);
+        }
+
+        return SDValue();
+      };
+
+      /// Try to optimize min/max patterns with sign-bit operations.
+      auto TryOptimizeMinMaxPattern =
+          [&](SDValue CmpOp0, SDValue Op1, SDValue Op2, unsigned CondCode,
+              MVT VT, SDLoc DL, SelectionDAG &DAG,
+              const X86Subtarget &Subtarget) -> SDValue {
+        if ((VT != MVT::i32 && VT != MVT::i64) || !isNullConstant(Op2) ||
+            CmpOp0 != Op1)
+          return SDValue();
+
+        if (CondCode == X86::COND_S ||                     // smin(x, 0)
+            (CondCode == X86::COND_G && hasAndNot(Op1))) { // smax(x, 0)
+          // (ctselect (x < 0), x, 0) -> ((x >> (size_in_bits(x)-1))) & x
+          // (ctselect (x > 0), x, 0) -> (~(x >> (size_in_bits(x)-1))) & x
+          unsigned ShCt = VT.getSizeInBits() - 1;
+          SDValue ShiftAmt = DAG.getConstant(ShCt, DL, VT);
+          SDValue Shift = DAG.getNode(ISD::SRA, DL, VT, Op1, ShiftAmt);
+          if (CondCode == X86::COND_G)
+            Shift = DAG.getNOT(DL, Shift, VT);
+          return DAG.getNode(ISD::AND, DL, VT, Shift, Op1);
+        }
+        return SDValue();
+      };
+
+      // Try the AND(X,1) optimizations.
+      if (SDValue OptResult = TryOptimizeAndOneSelect(
+              CmpOp0, Op1, Op2, CondCode, DL, DAG, Subtarget))
+        return OptResult;
+
+      // Try the min/max pattern optimizations.
+      if (SDValue OptResult = TryOptimizeMinMaxPattern(
+              CmpOp0, Op1, Op2, CondCode, VT, DL, DAG, Subtarget))
+        return OptResult;
+    }
+  }
+
+  // Look past (and (setcc_carry (cmp ...)), 1)
+  if (Cond.getOpcode() == ISD::AND &&
+      Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
+      isOneConstant(Cond.getOperand(1)))
+    Cond = Cond.getOperand(0);
+
+  /// Process condition flags and prepare for CTSELECT node creation.
+  auto ProcessConditionFlags =
+      [&](SDValue Cond, MVT VT, SDLoc DL, SelectionDAG &DAG,
+          const X86Subtarget &Subtarget) -> std::pair<SDValue, SDValue> {
+    SDValue CC;
+    bool AddTest = true;
+
+    unsigned CondOpcode = Cond.getOpcode();
+    if (CondOpcode == X86ISD::SETCC || CondOpcode == X86ISD::SETCC_CARRY) {
+      CC = Cond.getOperand(0);
+      SDValue Cmp = Cond.getOperand(1);
+
+      bool IllegalFPCMov = false;
+      if (VT.isFloatingPoint() && !VT.isVector() &&
+          !isScalarFPTypeInSSEReg(VT) && Subtarget.canUseCMOV())
+        IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
+
+      if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
+          Cmp.getOpcode() == X86ISD::BT) {
+        Cond = Cmp;
+        AddTest = false;
+      }
+    } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
+               CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
+               CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) {
+      SDValue Value;
+      X86::CondCode X86Cond;
+      std::tie(Value, Cond) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
+      CC = DAG.getTargetConstant(X86Cond, DL, MVT::i8);
+      AddTest = false;
+    }
+
+    if (AddTest) {
+      //
Look past the truncate if the high bits are known zero + if (isTruncWithZeroHighBitsInput(Cond, DAG)) + Cond = Cond.getOperand(0); + + // Try to match AND to BT instruction + if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) { + X86::CondCode X86CondCode; + if (SDValue BT = LowerAndToBT(Cond, ISD::SETNE, DL, DAG, X86CondCode)) { + CC = DAG.getTargetConstant(X86CondCode, DL, MVT::i8); + Cond = BT; + AddTest = false; + } + } + } + + if (AddTest) { + CC = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8); + Cond = EmitTest(Cond, X86::COND_NE, DL, DAG, Subtarget); + } + + return {CC, Cond}; + }; + + // Process condition flags and prepare for CTSELECT + auto [CC, ProcessedCond] = + ProcessConditionFlags(Cond, VT, DL, DAG, Subtarget); + + // Handle i8 CTSELECT with truncate optimization + if (Op.getValueType() == MVT::i8 && + Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) { + SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0); + if (T1.getValueType() == T2.getValueType() && + T1.getOpcode() != ISD::CopyFromReg && + T2.getOpcode() != ISD::CopyFromReg) { + SDValue Cmov = DAG.getNode(X86ISD::CTSELECT, DL, T1.getValueType(), T2, + T1, CC, ProcessedCond); + return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov); + } + } + + // Promote small integer types to avoid partial register stalls + if ((Op.getValueType() == MVT::i8) || + (Op.getValueType() == MVT::i16 && !X86::mayFoldLoad(Op1, Subtarget) && + !X86::mayFoldLoad(Op2, Subtarget))) { + Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1); + Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2); + SDValue Ops[] = {Op2, Op1, CC, ProcessedCond}; + SDValue Cmov = DAG.getNode(X86ISD::CTSELECT, DL, MVT::i32, Ops); + return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov); + } + + if (isScalarFPTypeInSSEReg(VT)) { + MVT IntVT = (VT == MVT::f32) ? MVT::i32 : MVT::i64; + Op1 = DAG.getBitcast(IntVT, Op1); + Op2 = DAG.getBitcast(IntVT, Op2); + SDValue Ops[] = {Op2, Op1, CC, ProcessedCond}; + SDValue CtSelect = DAG.getNode(X86ISD::CTSELECT, DL, IntVT, Ops); + return DAG.getBitcast(VT, CtSelect); + } + + // Create final CTSELECT node + SDValue Ops[] = {Op2, Op1, CC, ProcessedCond}; + return DAG.getNode(X86ISD::CTSELECT, DL, Op.getValueType(), Ops, + Op->getFlags()); +} + static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { SDValue In = Op->getOperand(0); @@ -33684,6 +34024,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::STRICT_FSETCCS: return LowerSETCC(Op, DAG); case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG); case ISD::SELECT: return LowerSELECT(Op, DAG); + case ISD::CTSELECT: return LowerCTSELECT(Op, DAG); case ISD::BRCOND: return LowerBRCOND(Op, DAG); case ISD::JumpTable: return LowerJumpTable(Op, DAG); case ISD::VASTART: return LowerVASTART(Op, DAG); @@ -33767,6 +34108,12 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { } } +bool X86TargetLowering::isSelectSupported(SelectSupportKind Kind) const { + if (Kind == SelectSupportKind::CtSelect) { + return true; + } + return TargetLoweringBase::isSelectSupported(Kind); +} /// Replace a node with an illegal result type with a new node built out of /// custom code. 
void X86TargetLowering::ReplaceNodeResults(SDNode *N, @@ -34994,6 +35341,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(STRICT_CMPM) NODE_NAME_CASE(CMPMM_SAE) NODE_NAME_CASE(SETCC) + NODE_NAME_CASE(CTSELECT) NODE_NAME_CASE(SETCC_CARRY) NODE_NAME_CASE(FSETCC) NODE_NAME_CASE(FSETCCM) @@ -36372,6 +36720,456 @@ X86TargetLowering::EmitLoweredCascadedSelect(MachineInstr &FirstCMOV, return SinkMBB; } +struct CtSelectInstructions { + unsigned PAndOpc; + unsigned PAndnOpc; + unsigned POrOpc; + unsigned BroadcastOpc; + unsigned IntMoveOpc; + unsigned MoveOpc; + bool Use256; + bool UseVEX; + bool UseBlendInstr; +}; + +static CtSelectInstructions +getCtSelectInstructions(unsigned Opcode, const X86Subtarget &Subtarget) { + CtSelectInstructions Instructions = {}; + + switch (Opcode) { + case X86::CTSELECT_V2F64: + if (Subtarget.hasSSE2()) { + Instructions.PAndOpc = X86::PANDrr; + Instructions.PAndnOpc = X86::PANDNrr; + Instructions.POrOpc = X86::PORrr; + Instructions.BroadcastOpc = X86::PSHUFDri; + Instructions.IntMoveOpc = X86::MOVDI2PDIrr; + Instructions.MoveOpc = X86::MOVAPDrr; + } else { + llvm_unreachable("Double precision vectors require SSE2"); + } + break; + case X86::CTSELECT_V4F32: + if (Subtarget.hasSSE41()) { + Instructions.PAndOpc = X86::PANDrr; + Instructions.PAndnOpc = X86::PANDNrr; + Instructions.POrOpc = X86::PORrr; + Instructions.BroadcastOpc = X86::PSHUFDri; + Instructions.IntMoveOpc = X86::MOVDI2PDIrr; + Instructions.MoveOpc = X86::MOVAPSrr; + Instructions.UseBlendInstr = true; + } else if (Subtarget.hasSSE2()) { + Instructions.PAndOpc = X86::PANDrr; + Instructions.PAndnOpc = X86::PANDNrr; + Instructions.POrOpc = X86::PORrr; + Instructions.BroadcastOpc = X86::PSHUFDri; + Instructions.IntMoveOpc = X86::MOVDI2PDIrr; + Instructions.MoveOpc = X86::MOVAPSrr; + } else { + Instructions.PAndOpc = X86::ANDPSrr; + Instructions.PAndnOpc = X86::ANDNPSrr; + Instructions.POrOpc = X86::ORPSrr; + Instructions.BroadcastOpc = X86::SHUFPSrri; + Instructions.IntMoveOpc = X86::MOVSS2DIrr; + Instructions.MoveOpc = X86::MOVAPSrr; + } + break; + case X86::CTSELECT_V4I32: + case X86::CTSELECT_V2I64: + case X86::CTSELECT_V8I16: + case X86::CTSELECT_V16I8: + if (Subtarget.hasSSE2()) { + Instructions.PAndOpc = X86::PANDrr; + Instructions.PAndnOpc = X86::PANDNrr; + Instructions.POrOpc = X86::PORrr; + Instructions.BroadcastOpc = X86::PSHUFDri; + Instructions.IntMoveOpc = X86::MOVDI2PDIrr; + Instructions.MoveOpc = X86::MOVDQArr; + } else { + llvm_unreachable("Integer vector operations require SSE2"); + } + break; + case X86::CTSELECT_V8F16: + if (Subtarget.hasSSE2()) { + Instructions.PAndOpc = X86::PANDrr; + Instructions.PAndnOpc = X86::PANDNrr; + Instructions.POrOpc = X86::PORrr; + Instructions.BroadcastOpc = X86::PSHUFDri; + Instructions.IntMoveOpc = X86::MOVDI2PDIrr; + Instructions.MoveOpc = X86::MOVDQArr; + } else { + llvm_unreachable("FP16 vector operations require SSE2"); + } + break; + case X86::CTSELECT_V4F32X: + case X86::CTSELECT_V4I32X: + case X86::CTSELECT_V2F64X: + case X86::CTSELECT_V2I64X: + case X86::CTSELECT_V8I16X: + case X86::CTSELECT_V16I8X: + case X86::CTSELECT_V8F16X: + if (Subtarget.hasAVX()) { + Instructions.PAndOpc = X86::VPANDrr; + Instructions.PAndnOpc = X86::VPANDNrr; + Instructions.POrOpc = X86::VPORrr; + Instructions.BroadcastOpc = X86::VPSHUFDri; + Instructions.IntMoveOpc = X86::VMOVDI2PDIrr; + Instructions.MoveOpc = (Opcode == X86::CTSELECT_V4F32X) ? X86::VMOVAPSrr + : (Opcode == X86::CTSELECT_V2F64X) + ? 
X86::VMOVAPDrr + : X86::VMOVDQArr; + Instructions.UseVEX = true; + } else { + llvm_unreachable("AVX variants require AVX support"); + } + break; + case X86::CTSELECT_V8F32: + case X86::CTSELECT_V8I32: + if (Subtarget.hasAVX()) { + Instructions.PAndOpc = X86::VPANDYrr; + Instructions.PAndnOpc = X86::VPANDNYrr; + Instructions.POrOpc = X86::VPORYrr; + Instructions.BroadcastOpc = X86::VPERMILPSYri; + Instructions.IntMoveOpc = X86::VMOVDI2PDIrr; + Instructions.MoveOpc = + (Opcode == X86::CTSELECT_V8F32) ? X86::VMOVAPSYrr : X86::VMOVDQAYrr; + Instructions.Use256 = true; + Instructions.UseVEX = true; + } else { + llvm_unreachable("256-bit vectors require AVX"); + } + break; + case X86::CTSELECT_V4F64: + case X86::CTSELECT_V4I64: + if (Subtarget.hasAVX()) { + Instructions.PAndOpc = X86::VPANDYrr; + Instructions.PAndnOpc = X86::VPANDNYrr; + Instructions.POrOpc = X86::VPORYrr; + Instructions.BroadcastOpc = X86::VPERMILPDYri; + Instructions.IntMoveOpc = X86::VMOVDI2PDIrr; + Instructions.MoveOpc = + (Opcode == X86::CTSELECT_V4F64) ? X86::VMOVAPDYrr : X86::VMOVDQAYrr; + Instructions.Use256 = true; + Instructions.UseVEX = true; + } else { + llvm_unreachable("256-bit vectors require AVX"); + } + break; + case X86::CTSELECT_V16I16: + case X86::CTSELECT_V32I8: + case X86::CTSELECT_V16F16: + if (Subtarget.hasAVX2()) { + Instructions.PAndOpc = X86::VPANDYrr; + Instructions.PAndnOpc = X86::VPANDNYrr; + Instructions.POrOpc = X86::VPORYrr; + Instructions.BroadcastOpc = X86::VPERMILPSYri; + Instructions.IntMoveOpc = X86::VMOVDI2PDIrr; + Instructions.MoveOpc = X86::VMOVDQAYrr; + Instructions.Use256 = true; + Instructions.UseVEX = true; + } else if (Subtarget.hasAVX()) { + Instructions.PAndOpc = X86::VPANDYrr; + Instructions.PAndnOpc = X86::VPANDNYrr; + Instructions.POrOpc = X86::VPORYrr; + Instructions.BroadcastOpc = X86::VPERMILPSYri; + Instructions.IntMoveOpc = X86::VMOVDI2PDIrr; + Instructions.MoveOpc = X86::VMOVDQAYrr; + Instructions.Use256 = true; + Instructions.UseVEX = true; + } else { + llvm_unreachable("256-bit integer vectors require AVX"); + } + break; + default: + llvm_unreachable("Unexpected CTSELECT opcode"); + } + + return Instructions; +} + +static Register createScalarMask(MachineBasicBlock *MBB, MachineInstr &MI, + const MIMetadata &MIMD, + const TargetInstrInfo *TII, + MachineRegisterInfo &MRI) { + const TargetRegisterClass *GR8Class = &X86::GR8RegClass; + const TargetRegisterClass *GR32Class = &X86::GR32RegClass; + + Register CondByteReg = MRI.createVirtualRegister(GR8Class); + Register CondReg = MRI.createVirtualRegister(GR32Class); + Register ScalarMaskReg = MRI.createVirtualRegister(GR32Class); + + // Create a condition value using appropriate SETCC instruction + BuildMI(*MBB, MI, MIMD, TII->get(X86::SETCCr), CondByteReg) + .addImm(X86::COND_E) + .setMIFlag(MachineInstr::MIFlag::NoMerge); + + // Zero-extend byte to 32-bit register (movzbl %al, %eax) + BuildMI(*MBB, MI, MIMD, TII->get(X86::MOVZX32rr8), CondReg) + .addReg(CondByteReg) + .setMIFlag(MachineInstr::MIFlag::NoMerge); + + // Negate to convert 1 -> 0xFFFFFFFF, 0 -> 0x00000000 (negl %eax) + BuildMI(*MBB, MI, MIMD, TII->get(X86::NEG32r), ScalarMaskReg) + .addReg(CondReg) + .setMIFlag(MachineInstr::MIFlag::NoMerge); + + return ScalarMaskReg; +} + +static Register broadcastScalarMask( + MachineBasicBlock *MBB, MachineInstr &MI, const MIMetadata &MIMD, + const TargetInstrInfo *TII, MachineRegisterInfo &MRI, + Register ScalarMaskReg, const TargetRegisterClass *RC, + const CtSelectInstructions &Instructions, const X86Subtarget &Subtarget) 
{ + // Step 1: Move scalar mask to vector register + Register VecFromScalarReg = MRI.createVirtualRegister(RC); + BuildMI(*MBB, MI, MIMD, TII->get(Instructions.IntMoveOpc), VecFromScalarReg) + .addReg(ScalarMaskReg) + .setMIFlag(MachineInstr::MIFlag::NoMerge); + + // Step 2: Broadcast mask across all elements + Register MaskReg = MRI.createVirtualRegister(RC); + if (Instructions.Use256) { + // For 256-bit vectors, broadcast across all elements + BuildMI(*MBB, MI, MIMD, TII->get(Instructions.BroadcastOpc), MaskReg) + .addReg(VecFromScalarReg) + .addImm(0) + .setMIFlag(MachineInstr::MIFlag::NoMerge); // Broadcast element 0 to all + // positions + } else { + // For 128-bit vectors + if (Subtarget.hasSSE2() || Instructions.UseVEX) { + // Use PSHUFD for efficient broadcasting + BuildMI(*MBB, MI, MIMD, TII->get(Instructions.BroadcastOpc), MaskReg) + .addReg(VecFromScalarReg) + .addImm(0x00) + .setMIFlag(MachineInstr::MIFlag::NoMerge); // Broadcast element 0 to + // all positions + } else { + // SSE1 fallback using SHUFPS + BuildMI(*MBB, MI, MIMD, TII->get(Instructions.BroadcastOpc), MaskReg) + .addReg(VecFromScalarReg) + .addReg(VecFromScalarReg) + .addImm(0x00) + .setMIFlag(MachineInstr::MIFlag::NoMerge); // Broadcast element 0 to + // all positions + } + } + + return MaskReg; +} + +MachineBasicBlock * +X86TargetLowering::EmitLoweredCtSelect(MachineInstr &MI, + MachineBasicBlock *ThisMBB) const { + const TargetInstrInfo *TII = Subtarget.getInstrInfo(); + const MIMetadata MIMD(MI); + + MachineRegisterInfo &MRI = ThisMBB->getParent()->getRegInfo(); + DebugLoc DL = MI.getDebugLoc(); + + // Extract operands: dst = ctselect src1, src2, cond + Register DstReg = MI.getOperand(0).getReg(); + Register TrueReg = MI.getOperand(1).getReg(); + Register FalseReg = MI.getOperand(2).getReg(); + // Note: CondCode from MI.getOperand(3).getImm() is not used - we hardcode + // COND_E for sete + + // Get the vector type to determine the appropriate instructions + const TargetRegisterClass *RC = MRI.getRegClass(DstReg); + unsigned Opcode = MI.getOpcode(); + + // Get instruction opcodes for this operation + CtSelectInstructions Instructions = + getCtSelectInstructions(Opcode, Subtarget); + + // Step 1: Create scalar mask using SETCC + NEG + Register ScalarMaskReg = createScalarMask(ThisMBB, MI, MIMD, TII, MRI); + + // Step 2: Move scalar mask to vector register and broadcast + Register MaskReg = broadcastScalarMask( + ThisMBB, MI, MIMD, TII, MRI, ScalarMaskReg, RC, Instructions, Subtarget); + + // Step 3: Implement blend operation + if (Instructions.UseBlendInstr && Subtarget.hasSSE41() && + !Instructions.Use256) { + // Use dedicated blend instructions for SSE4.1+ + unsigned BlendOpc; + switch (Opcode) { + case X86::CTSELECT_V4F32: + BlendOpc = X86::BLENDVPSrr0; + break; + case X86::CTSELECT_V2F64: + BlendOpc = X86::BLENDVPDrr0; + break; + default: + BlendOpc = X86::PBLENDVBrr0; + break; + } + + // BLENDV uses XMM0 as implicit mask register + BuildMI(*ThisMBB, MI, MIMD, TII->get(X86::MOVAPSrr), X86::XMM0) + .addReg(MaskReg) + .setMIFlag(MachineInstr::MIFlag::NoMerge); + + BuildMI(*ThisMBB, MI, MIMD, TII->get(BlendOpc), DstReg) + .addReg(FalseReg) + .addReg(TrueReg) + .setMIFlag(MachineInstr::MIFlag::NoMerge); + } else { + // Use traditional AND/ANDN/OR approach + Register TempReg = MRI.createVirtualRegister(RC); + Register MaskCopyReg = MRI.createVirtualRegister(RC); + Register VecAndReg = MRI.createVirtualRegister(RC); + Register VecAndnReg = MRI.createVirtualRegister(RC); + Register FinalResultReg = 
MRI.createVirtualRegister(RC); + + // Copy mask for first operation + BuildMI(*ThisMBB, MI, MIMD, TII->get(Instructions.MoveOpc), TempReg) + .addReg(MaskReg) + .setMIFlag(MachineInstr::MIFlag::NoMerge); + + // mask & true_val + BuildMI(*ThisMBB, MI, MIMD, TII->get(Instructions.PAndOpc), VecAndReg) + .addReg(TempReg) + .addReg(TrueReg) + .setMIFlag(MachineInstr::MIFlag::NoMerge); + + // Copy mask for second operation + BuildMI(*ThisMBB, MI, MIMD, TII->get(Instructions.MoveOpc), MaskCopyReg) + .addReg(MaskReg) + .setMIFlag(MachineInstr::MIFlag::NoMerge); + + // ~mask & false_val + BuildMI(*ThisMBB, MI, MIMD, TII->get(Instructions.PAndnOpc), VecAndnReg) + .addReg(MaskCopyReg) + .addReg(FalseReg) + .setMIFlag(MachineInstr::MIFlag::NoMerge); + + // Combine results + BuildMI(*ThisMBB, MI, MIMD, TII->get(Instructions.POrOpc), FinalResultReg) + .addReg(VecAndReg) + .addReg(VecAndnReg) + .setMIFlag(MachineInstr::MIFlag::NoMerge); + + // Move final result to destination + BuildMI(*ThisMBB, MI, MIMD, TII->get(Instructions.MoveOpc), DstReg) + .addReg(FinalResultReg) + .setMIFlag(MachineInstr::MIFlag::NoMerge); + } + // Remove the original instruction + MI.eraseFromParent(); + return ThisMBB; +} + +MachineBasicBlock * +X86TargetLowering::EmitLoweredCtSelectNoCMOV(MachineInstr &MI, + MachineBasicBlock *ThisMBB) const { + const TargetInstrInfo *TII = Subtarget.getInstrInfo(); + const MIMetadata MIMD(MI); + MachineRegisterInfo &MRI = ThisMBB->getParent()->getRegInfo(); + DebugLoc DL = MI.getDebugLoc(); + + // Get operands + Register DstReg = MI.getOperand(0).getReg(); + Register TrueReg = MI.getOperand(1).getReg(); + Register FalseReg = MI.getOperand(2).getReg(); + + X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm()); + X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC); + + const TargetRegisterClass *RC = MRI.getRegClass(DstReg); + unsigned SETCCOp, MOVZXOp, NEGOp, ANDOp, XOROp, OROp; + const TargetRegisterClass *condRC; + + if (RC == &X86::GR8RegClass) { + SETCCOp = X86::SETCCr; + MOVZXOp = 0; // No extension needed for 8-bit + NEGOp = X86::NEG8r; + ANDOp = X86::AND8rr; + XOROp = X86::XOR8ri; + OROp = X86::OR8rr; + condRC = &X86::GR8RegClass; + } else if (RC == &X86::GR16RegClass) { + SETCCOp = X86::SETCCr; + MOVZXOp = X86::MOVZX16rr8; + NEGOp = X86::NEG16r; + ANDOp = X86::AND16rr; + XOROp = X86::XOR16ri; + OROp = X86::OR16rr; + condRC = &X86::GR16RegClass; + } else if (RC == &X86::GR32RegClass) { + SETCCOp = X86::SETCCr; + MOVZXOp = X86::MOVZX32rr8; + NEGOp = X86::NEG32r; + ANDOp = X86::AND32rr; + XOROp = X86::XOR32ri; + OROp = X86::OR32rr; + condRC = &X86::GR32RegClass; + } else { + llvm_unreachable("Unsupported register class for conditional select"); + } + + // Step 1: Create condition value using SETCC instruction + Register CondByteReg = MRI.createVirtualRegister(&X86::GR8RegClass); + BuildMI(*ThisMBB, MI, MIMD, TII->get(SETCCOp), CondByteReg) + .addImm(OppCC) + .setMIFlag(MachineInstr::MIFlag::NoMerge); + + Register CondReg; + if (RC == &X86::GR8RegClass) { + // For 8-bit, use the byte register directly + CondReg = CondByteReg; + } else { + // For 16/32-bit, zero-extend the byte to the target size + CondReg = MRI.createVirtualRegister(condRC); + BuildMI(*ThisMBB, MI, MIMD, TII->get(MOVZXOp), CondReg) + .addReg(CondByteReg) + .setMIFlag(MachineInstr::MIFlag::NoMerge); + } + + // Step 2: Convert condition to mask (1 -> 0xFFFF..., 0 -> 0x0000...) 
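+  // (NEG computes 0 - x, so a condition of 1 becomes all ones and 0 stays 0.)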
+ // Use NEG to create all-ones mask when condition is true + Register MaskReg = MRI.createVirtualRegister(condRC); + BuildMI(*ThisMBB, MI, MIMD, TII->get(NEGOp), MaskReg) + .addReg(CondReg) + .setMIFlag(MachineInstr::MIFlag::NoMerge); + + // Step 3: Implement conditional select using bitwise operations + // Result = (TrueReg & Mask) | (FalseReg & ~Mask) + + // Create inverted mask (~Mask) + Register InvMaskReg = MRI.createVirtualRegister(condRC); + BuildMI(*ThisMBB, MI, MIMD, TII->get(XOROp), InvMaskReg) + .addReg(MaskReg) + .addImm(-1) + .setMIFlag(MachineInstr::MIFlag::NoMerge); // XOR with all 1s to invert + + // Compute TrueReg & Mask + Register TrueMaskedReg = MRI.createVirtualRegister(condRC); + BuildMI(*ThisMBB, MI, MIMD, TII->get(ANDOp), TrueMaskedReg) + .addReg(TrueReg) + .addReg(MaskReg) + .setMIFlag(MachineInstr::MIFlag::NoMerge); + + // Compute FalseReg & ~Mask + Register FalseMaskedReg = MRI.createVirtualRegister(condRC); + BuildMI(*ThisMBB, MI, MIMD, TII->get(ANDOp), FalseMaskedReg) + .addReg(FalseReg) + .addReg(InvMaskReg) + .setMIFlag(MachineInstr::MIFlag::NoMerge); + + // Final result: (TrueReg & Mask) | (FalseReg & ~Mask) + BuildMI(*ThisMBB, MI, MIMD, TII->get(OROp), DstReg) + .addReg(TrueMaskedReg) + .addReg(FalseMaskedReg) + .setMIFlag(MachineInstr::MIFlag::NoMerge); + + // Remove the original instruction + MI.eraseFromParent(); + return ThisMBB; +} + MachineBasicBlock * X86TargetLowering::EmitLoweredSelect(MachineInstr &MI, MachineBasicBlock *ThisMBB) const { @@ -37828,6 +38626,47 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, case X86::CMOV_VK64: return EmitLoweredSelect(MI, BB); + case X86::CTSELECT_V2F64: + case X86::CTSELECT_V4F32: + case X86::CTSELECT_V8F16: + case X86::CTSELECT_V2I64: + case X86::CTSELECT_V4I32: + case X86::CTSELECT_V8I16: + case X86::CTSELECT_V16I8: + case X86::CTSELECT_V2F64X: + case X86::CTSELECT_V4F32X: + case X86::CTSELECT_V8F16X: + case X86::CTSELECT_V2I64X: + case X86::CTSELECT_V4I32X: + case X86::CTSELECT_V8I16X: + case X86::CTSELECT_V16I8X: + case X86::CTSELECT_V4I64: + case X86::CTSELECT_V8I32: + case X86::CTSELECT_V16I16: + case X86::CTSELECT_V32I8: + case X86::CTSELECT_V4F64: + case X86::CTSELECT_V8F32: + case X86::CTSELECT_V16F16: + case X86::CTSELECT_V8I64: + case X86::CTSELECT_V16I32: + case X86::CTSELECT_V32I16: + case X86::CTSELECT_V64I8: + case X86::CTSELECT_V8F64: + case X86::CTSELECT_V16F32: + case X86::CTSELECT_V32F16: + return EmitLoweredCtSelect(MI, BB); + + case X86::CTSELECT_GR16rr: + case X86::CTSELECT_GR32rr: + return EmitLoweredCtSelectNoCMOV(MI, BB); + + case X86::CTSELECT_FP32rr: + case X86::CTSELECT_FP64rr: + case X86::CTSELECT_FP80rr: + case X86::CTSELECT_VR64rr: + return EmitLoweredSelect( + MI, BB); // TODO: Implement this to generate for Constant time version + case X86::FP80_ADDr: case X86::FP80_ADDm32: { // Change the floating point control register to use double extended diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h index e28b9c11a04cd..0c6f47a9f3ee5 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -114,6 +114,10 @@ namespace llvm { /// X86 Select SELECTS, + /// X86 Constant-time Select, implemented with CMOV instruction. This is + /// used to implement constant-time select. + CTSELECT, + // Same as SETCC except it's materialized with a sbb and the value is all // one's or all zero's. SETCC_CARRY, // R = carry_bit ? 
// ~0 : 0
@@ -1139,6 +1143,8 @@ namespace llvm {
     ///
     SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;

+    bool isSelectSupported(SelectSupportKind Kind) const override;
+
     /// Replace the results of node with an illegal result
     /// type with new values built out of custom code.
     ///
@@ -1766,6 +1772,7 @@ namespace llvm {
     SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerCTSELECT(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
@@ -1865,6 +1872,12 @@ namespace llvm {
     MachineBasicBlock *EmitLoweredSelect(MachineInstr &I,
                                          MachineBasicBlock *BB) const;

+    MachineBasicBlock *EmitLoweredCtSelectNoCMOV(MachineInstr &MI,
+                                                 MachineBasicBlock *BB) const;
+
+    MachineBasicBlock *EmitLoweredCtSelect(MachineInstr &MI,
+                                           MachineBasicBlock *BB) const;
+
     MachineBasicBlock *EmitLoweredCatchRet(MachineInstr &MI,
                                            MachineBasicBlock *BB) const;

diff --git a/llvm/lib/Target/X86/X86InstrCMovSetCC.td b/llvm/lib/Target/X86/X86InstrCMovSetCC.td
index 7d5d7cf4a83ab..68c17f889e6d6 100644
--- a/llvm/lib/Target/X86/X86InstrCMovSetCC.td
+++ b/llvm/lib/Target/X86/X86InstrCMovSetCC.td
@@ -106,6 +106,139 @@ let Predicates = [HasCMOV, HasNDD] in {
   def : Pat<(X86cmov (loadi64 addr:$src1), GR64:$src2, timm:$cond, EFLAGS),
             (CMOV64rm_ND GR64:$src2, addr:$src1, (inv_cond_XFORM timm:$cond))>;
 }
+
+// Create pseudo instructions and pattern-match on them. We use a machine pass
+// to lower these pseudos into cmov late, so that earlier backend optimizations
+// cannot rewrite the constant-time sequence.
+let Uses = [EFLAGS], isNotDuplicable = 1, isPseudo = 1 in {
+
+  multiclass CTSELECT<X86TypeInfo t> {
+    // register-only
+    let isCommutable = 0, SchedRW = [WriteCMOV] in {
+      def rr : PseudoI<(outs t.RegClass:$dst),
+                       (ins t.RegClass:$src1, t.RegClass:$src2, i8imm:$cond),
+                       [(set t.RegClass:$dst, (X86ctselect t.RegClass:$src1, t.RegClass:$src2, timm:$cond, EFLAGS))]>;
+    }
+
+    // register-memory
+    let SchedRW = [WriteCMOV.Folded, WriteCMOV.ReadAfterFold] in {
+      def rm : PseudoI<(outs t.RegClass:$dst),
+                       (ins t.RegClass:$src1, t.MemOperand:$src2, i8imm:$cond),
+                       [(set t.RegClass:$dst, (X86ctselect t.RegClass:$src1, (t.LoadNode addr:$src2), timm:$cond, EFLAGS))]>;
+    }
+  }
+}
+
+let isCodeGenOnly = 1, hasSideEffects = 1, ForceDisassemble = 1 in {
+  let Predicates = [HasCMOV], Constraints = "$dst = $src1" in {
+    defm CTSELECT16 : CTSELECT<Xi16>;
+    defm CTSELECT32 : CTSELECT<Xi32>;
+    defm CTSELECT64 : CTSELECT<Xi64>;
+  }
+}
+
+let Uses = [EFLAGS], usesCustomInserter = 1, isNotDuplicable = 1, isPseudo = 1, hasSideEffects = 1 in {
+  // 128-bit vector types
+  let Predicates = [HasSSE1] in {
+    def CTSELECT_V4F32 : PseudoI<(outs VR128:$dst), (ins VR128:$t, VR128:$f, i8imm:$cond),
+                                 [(set VR128:$dst, (v4f32 (X86ctselect VR128:$t, VR128:$f, timm:$cond, EFLAGS)))]>,
+                         Sched<[]>;
+    def CTSELECT_V2F64 : PseudoI<(outs VR128:$dst), (ins VR128:$t, VR128:$f, i8imm:$cond),
+                                 [(set VR128:$dst, (v2f64 (X86ctselect VR128:$t, VR128:$f, timm:$cond, EFLAGS)))]>,
+                         Sched<[]>;
+    def CTSELECT_V4I32 : PseudoI<(outs VR128:$dst), (ins VR128:$t, VR128:$f, i8imm:$cond),
+                                 [(set VR128:$dst, (v4i32 (X86ctselect VR128:$t, VR128:$f, timm:$cond, EFLAGS)))]>,
+                         Sched<[]>;
+    def CTSELECT_V2I64 : PseudoI<(outs VR128:$dst), (ins VR128:$t, VR128:$f, i8imm:$cond),
+                                 [(set VR128:$dst, (v2i64 (X86ctselect VR128:$t, VR128:$f, 
timm:$cond, EFLAGS)))]>, + Sched<[]>; + def CTSELECT_V8I16 : PseudoI<(outs VR128:$dst), (ins VR128:$t, VR128:$f, i8imm:$cond), + [(set VR128:$dst, (v8i16 (X86ctselect VR128:$t, VR128:$f, timm:$cond, EFLAGS)))]>, + Sched<[]>; + def CTSELECT_V16I8 : PseudoI<(outs VR128:$dst), (ins VR128:$t, VR128:$f, i8imm:$cond), + [(set VR128:$dst, (v16i8 (X86ctselect VR128:$t, VR128:$f, timm:$cond, EFLAGS)))]>, + Sched<[]>; + def CTSELECT_V8F16 : PseudoI<(outs VR128:$dst), (ins VR128:$t, VR128:$f, i8imm:$cond), + [(set VR128:$dst, (v8f16 (X86ctselect VR128:$t, VR128:$f, timm:$cond, EFLAGS)))]>, + Sched<[]>; + } + + // 128-bit vector types (AVX versions) + let Predicates = [HasAVX] in { + def CTSELECT_V4F32X : PseudoI<(outs VR128X:$dst), (ins VR128X:$t, VR128X:$f, i8imm:$cond), + [(set VR128X:$dst, (v4f32 (X86ctselect VR128X:$t, VR128X:$f, timm:$cond, EFLAGS)))]>, + Sched<[]>; + def CTSELECT_V2F64X : PseudoI<(outs VR128X:$dst), (ins VR128X:$t, VR128X:$f, i8imm:$cond), + [(set VR128X:$dst, (v2f64 (X86ctselect VR128X:$t, VR128X:$f, timm:$cond, EFLAGS)))]>, + Sched<[]>; + def CTSELECT_V4I32X : PseudoI<(outs VR128X:$dst), (ins VR128X:$t, VR128X:$f, i8imm:$cond), + [(set VR128X:$dst, (v4i32 (X86ctselect VR128X:$t, VR128X:$f, timm:$cond, EFLAGS)))]>, + Sched<[]>; + def CTSELECT_V2I64X : PseudoI<(outs VR128X:$dst), (ins VR128X:$t, VR128X:$f, i8imm:$cond), + [(set VR128X:$dst, (v2i64 (X86ctselect VR128X:$t, VR128X:$f, timm:$cond, EFLAGS)))]>, + Sched<[]>; + def CTSELECT_V8I16X : PseudoI<(outs VR128X:$dst), (ins VR128X:$t, VR128X:$f, i8imm:$cond), + [(set VR128X:$dst, (v8i16 (X86ctselect VR128X:$t, VR128X:$f, timm:$cond, EFLAGS)))]>, + Sched<[]>; + def CTSELECT_V16I8X : PseudoI<(outs VR128X:$dst), (ins VR128X:$t, VR128X:$f, i8imm:$cond), + [(set VR128X:$dst, (v16i8 (X86ctselect VR128X:$t, VR128X:$f, timm:$cond, EFLAGS)))]>, + Sched<[]>; + def CTSELECT_V8F16X : PseudoI<(outs VR128X:$dst), (ins VR128X:$t, VR128X:$f, i8imm:$cond), + [(set VR128X:$dst, (v8f16 (X86ctselect VR128X:$t, VR128X:$f, timm:$cond, EFLAGS)))]>, + Sched<[]>; + } + + // 256-bit vector types + let Predicates = [HasAVX] in { + def CTSELECT_V8F32 : PseudoI<(outs VR256:$dst), (ins VR256:$t, VR256:$f, i8imm:$cond), + [(set VR256:$dst, (v8f32 (X86ctselect VR256:$t, VR256:$f, timm:$cond, EFLAGS)))]>, + Sched<[]>; + def CTSELECT_V4F64 : PseudoI<(outs VR256:$dst), (ins VR256:$t, VR256:$f, i8imm:$cond), + [(set VR256:$dst, (v4f64 (X86ctselect VR256:$t, VR256:$f, timm:$cond, EFLAGS)))]>, + Sched<[]>; + def CTSELECT_V8I32 : PseudoI<(outs VR256:$dst), (ins VR256:$t, VR256:$f, i8imm:$cond), + [(set VR256:$dst, (v8i32 (X86ctselect VR256:$t, VR256:$f, timm:$cond, EFLAGS)))]>, + Sched<[]>; + def CTSELECT_V4I64 : PseudoI<(outs VR256:$dst), (ins VR256:$t, VR256:$f, i8imm:$cond), + [(set VR256:$dst, (v4i64 (X86ctselect VR256:$t, VR256:$f, timm:$cond, EFLAGS)))]>, + Sched<[]>; + def CTSELECT_V16I16 : PseudoI<(outs VR256:$dst), (ins VR256:$t, VR256:$f, i8imm:$cond), + [(set VR256:$dst, (v16i16 (X86ctselect VR256:$t, VR256:$f, timm:$cond, EFLAGS)))]>, + Sched<[]>; + def CTSELECT_V32I8 : PseudoI<(outs VR256:$dst), (ins VR256:$t, VR256:$f, i8imm:$cond), + [(set VR256:$dst, (v32i8 (X86ctselect VR256:$t, VR256:$f, timm:$cond, EFLAGS)))]>, + Sched<[]>; + def CTSELECT_V16F16 : PseudoI<(outs VR256:$dst), (ins VR256:$t, VR256:$f, i8imm:$cond), + [(set VR256:$dst, (v16f16 (X86ctselect VR256:$t, VR256:$f, timm:$cond, EFLAGS)))]>, + Sched<[]>; + } + + // 512-bit vector types + let Predicates = [HasAVX512] in { + def CTSELECT_V16F32 : PseudoI<(outs VR512:$dst), (ins VR512:$t, 
VR512:$f, i8imm:$cond),
+                                  [(set VR512:$dst, (v16f32 (X86ctselect VR512:$t, VR512:$f, timm:$cond, EFLAGS)))]>,
+                          Sched<[WriteCMOV]>;
+    def CTSELECT_V8F64 : PseudoI<(outs VR512:$dst), (ins VR512:$t, VR512:$f, i8imm:$cond),
+                                 [(set VR512:$dst, (v8f64 (X86ctselect VR512:$t, VR512:$f, timm:$cond, EFLAGS)))]>,
+                         Sched<[WriteCMOV]>;
+    def CTSELECT_V16I32 : PseudoI<(outs VR512:$dst), (ins VR512:$t, VR512:$f, i8imm:$cond),
+                                  [(set VR512:$dst, (v16i32 (X86ctselect VR512:$t, VR512:$f, timm:$cond, EFLAGS)))]>,
+                          Sched<[WriteCMOV]>;
+    def CTSELECT_V8I64 : PseudoI<(outs VR512:$dst), (ins VR512:$t, VR512:$f, i8imm:$cond),
+                                 [(set VR512:$dst, (v8i64 (X86ctselect VR512:$t, VR512:$f, timm:$cond, EFLAGS)))]>,
+                         Sched<[WriteCMOV]>;
+    def CTSELECT_V32I16 : PseudoI<(outs VR512:$dst), (ins VR512:$t, VR512:$f, i8imm:$cond),
+                                  [(set VR512:$dst, (v32i16 (X86ctselect VR512:$t, VR512:$f, timm:$cond, EFLAGS)))]>,
+                          Sched<[WriteCMOV]>;
+    def CTSELECT_V64I8 : PseudoI<(outs VR512:$dst), (ins VR512:$t, VR512:$f, i8imm:$cond),
+                                 [(set VR512:$dst, (v64i8 (X86ctselect VR512:$t, VR512:$f, timm:$cond, EFLAGS)))]>,
+                         Sched<[WriteCMOV]>;
+    def CTSELECT_V32F16 : PseudoI<(outs VR512:$dst), (ins VR512:$t, VR512:$f, i8imm:$cond),
+                                  [(set VR512:$dst, (v32f16 (X86ctselect VR512:$t, VR512:$f, timm:$cond, EFLAGS)))]>,
+                          Sched<[WriteCMOV]>;
+  }
+}
+
 let Predicates = [HasCMOV, HasCF] in {
   def : Pat<(X86cmov GR16:$src1, 0, timm:$cond, EFLAGS),
             (CFCMOV16rr GR16:$src1, (inv_cond_XFORM timm:$cond))>;
diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td b/llvm/lib/Target/X86/X86InstrCompiler.td
index ec31675731b79..2806e1a174abc 100644
--- a/llvm/lib/Target/X86/X86InstrCompiler.td
+++ b/llvm/lib/Target/X86/X86InstrCompiler.td
@@ -693,6 +693,45 @@ def : Pat<(v16f32 (X86cmov VR512:$t, VR512:$f, timm:$cond, EFLAGS)),
 def : Pat<(v8f64 (X86cmov VR512:$t, VR512:$f, timm:$cond, EFLAGS)),
           (CMOV_VR512 VR512:$t, VR512:$f, timm:$cond)>;
 
+// CTSELECT
+
+let Uses = [EFLAGS], isNotDuplicable = 1, isPseudo = 1 in {
+  multiclass CTSELECT_NOCMOV<RegisterClass RC, ValueType VT> {
+    let hasNoSchedulingInfo = 1 in {
+      def rr : PseudoI<(outs RC:$dst),
+                       (ins RC:$src1, RC:$src2, i8imm:$cond),
+                       [(set RC:$dst, (VT (X86ctselect RC:$src1, RC:$src2, timm:$cond, EFLAGS)))]>;
+
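+      // These pseudos are expanded by a custom inserter (the defm block
+      // below sets usesCustomInserter; see EmitLoweredCtSelectNoCMOV in
+      // X86ISelLowering.cpp) rather than by the post-RA pseudo expansion
+      // used for the CMOV-based CTSELECT pseudos. Without CMOV the select
+      // presumably lowers to the same branch-free mask-and-blend sequence
+      // seen for the vector pseudos (sete/neg to build an all-ones or
+      // all-zeros mask, then and/andn/or; compare the SSE2 CHECK lines in
+      // ctselect-vector.ll).
+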
+      // TODO: Do we need a register-memory variant?
+      //def rm : PseudoI<(outs t.RegClass:$dst),
+      //                 (ins t.RegClass:$src1, t.MemOperand:$src2, i8imm:$cond),
+      //                 [(set t.RegClass:$dst, (X86ctselect t.RegClass:$src1, (t.LoadNode addr:$src2), timm:$cond, EFLAGS))]>;
+    }
+  }
+}
+
+let usesCustomInserter = 1,
+    isCodeGenOnly = 1,
+    hasSideEffects = 1,
+    ForceDisassemble = 1,
+    Constraints = "$dst = $src1" in {
+  let Predicates = [NoCMOV] in {
+    defm CTSELECT_GR16 : CTSELECT_NOCMOV<GR16, i16>;
+    defm CTSELECT_GR32 : CTSELECT_NOCMOV<GR32, i32>;
+  }
+
+  let Predicates = [FPStackf32] in
+    defm CTSELECT_FP32 : CTSELECT_NOCMOV<RFP32, f32>;
+
+  let Predicates = [FPStackf64] in
+    defm CTSELECT_FP64 : CTSELECT_NOCMOV<RFP64, f64>;
+
+  defm CTSELECT_FP80 : CTSELECT_NOCMOV<RFP80, f80>;
+
+  let Predicates = [HasMMX] in
+    defm CTSELECT_VR64 : CTSELECT_NOCMOV<VR64, x86mmx>;
+}
+
 //===----------------------------------------------------------------------===//
 // Normal-Instructions-With-Lock-Prefix Pseudo Instructions
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/X86/X86InstrFragments.td b/llvm/lib/Target/X86/X86InstrFragments.td
index 116986a0fffea..0f912f23bbb62 100644
--- a/llvm/lib/Target/X86/X86InstrFragments.td
+++ b/llvm/lib/Target/X86/X86InstrFragments.td
@@ -28,6 +28,10 @@ def SDTX86Cmov : SDTypeProfile<1, 4,
                                   [SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>,
                                    SDTCisVT<3, i8>, SDTCisVT<4, i32>]>;
+def SDTX86CtSelect : SDTypeProfile<1, 4,
+                                   [SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>,
+                                    SDTCisVT<3, i8>, SDTCisVT<4, i32>]>;
+
 // Unary and binary operator instructions that set EFLAGS as a side-effect.
 def SDTUnaryArithWithFlags : SDTypeProfile<2, 1,
                                            [SDTCisSameAs<0, 2>,
@@ -151,6 +155,7 @@ def X86ctest : SDNode<"X86ISD::CTEST", SDTX86Ccmp>;
 def X86cload : SDNode<"X86ISD::CLOAD", SDTX86Cload, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
 def X86cstore : SDNode<"X86ISD::CSTORE", SDTX86Cstore, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def X86ctselect : SDNode<"X86ISD::CTSELECT", SDTX86CtSelect>;
 
 def X86cmov : SDNode<"X86ISD::CMOV", SDTX86Cmov>;
 def X86brcond : SDNode<"X86ISD::BRCOND", SDTX86BrCond, [SDNPHasChain]>;
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index 1d2cd39951bf4..4ef8c04d49e6f 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -475,6 +475,41 @@ bool X86InstrInfo::isFrameOperand(const MachineInstr &MI, unsigned int Op,
   return false;
 }
 
+bool X86InstrInfo::expandCtSelect(unsigned Opcode,
+                                  MachineInstrBuilder &MIB) const {
+  MachineInstr *MI = MIB.getInstr();
+  MachineBasicBlock &MBB = *MIB->getParent();
+  DebugLoc DL = MIB->getDebugLoc();
+
+  // CTSELECT pseudo has: (outs dst), (ins true_val, false_val, cond)
+  MachineOperand &OperandRes = MI->getOperand(0);  // destination register
+  MachineOperand &OperandTrue = MI->getOperand(1); // true value
+  MachineOperand &OperandCond = MI->getOperand(3); // condition code
+
+  assert(OperandTrue.isReg() && OperandRes.isReg() && OperandCond.isImm() &&
+         "Invalid operand types");
+  assert(OperandTrue.getReg() == OperandRes.getReg() &&
+         "Result register different from True register");
+
+  assert(Subtarget.hasCMOV() && "target does not support CMOV instructions");
+
+  if (Subtarget.hasCMOV()) {
+    // Build the CMOV instruction: copy every operand of the pseudo (dst,
+    // true value, false value, condition code) onto the new instruction.
+    MachineInstrBuilder CmovBuilder =
+        BuildMI(MBB, MIB.getInstr(), DL, get(Opcode));
+    for (unsigned i = 0; i < MI->getNumOperands(); ++i) {
+      CmovBuilder.add(MIB->getOperand(i));
+    }
+  } else {
+    llvm_unreachable("target does not
support cmov"); + } + + // Remove the original CTSELECT instruction + MI->eraseFromParent(); + return true; +} + static bool isFrameLoadOpcode(int Opcode, TypeSize &MemBytes) { switch (Opcode) { default: @@ -6411,6 +6446,24 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { case X86::ADD64ri32_DB: MIB->setDesc(get(X86::OR64ri32)); break; + case X86::CTSELECT64rr: + expandCtSelect(X86::CMOV64rr, MIB); + break; + case X86::CTSELECT32rr: + expandCtSelect(X86::CMOV32rr, MIB); + break; + case X86::CTSELECT16rr: + expandCtSelect(X86::CMOV16rr, MIB); + break; + case X86::CTSELECT64rm: + expandCtSelect(X86::CMOV64rm, MIB); + break; + case X86::CTSELECT32rm: + expandCtSelect(X86::CMOV32rm, MIB); + break; + case X86::CTSELECT16rm: + expandCtSelect(X86::CMOV16rm, MIB); + break; } return false; } diff --git a/llvm/lib/Target/X86/X86InstrInfo.h b/llvm/lib/Target/X86/X86InstrInfo.h index 5f75559bd9598..e5c105ceb615b 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.h +++ b/llvm/lib/Target/X86/X86InstrInfo.h @@ -724,6 +724,9 @@ class X86InstrInfo final : public X86GenInstrInfo { bool isFrameOperand(const MachineInstr &MI, unsigned int Op, int &FrameIndex) const; + /// Expand the CTSELECT pseudo-instructions. + bool expandCtSelect(unsigned Opcode, MachineInstrBuilder &MIB) const; + /// Returns true iff the routine could find two commutable operands in the /// given machine instruction with 3 vector inputs. /// The 'SrcOpIdx1' and 'SrcOpIdx2' are INPUT and OUTPUT arguments. Their diff --git a/llvm/lib/Target/X86/X86InstrPredicates.td b/llvm/lib/Target/X86/X86InstrPredicates.td index c20bb05018b4d..2ffee14b34bad 100644 --- a/llvm/lib/Target/X86/X86InstrPredicates.td +++ b/llvm/lib/Target/X86/X86InstrPredicates.td @@ -49,6 +49,7 @@ def HasZU : Predicate<"Subtarget->hasZU()">; def HasCF : Predicate<"Subtarget->hasCF()">; def HasCMOV : Predicate<"Subtarget->canUseCMOV()">; def NoCMOV : Predicate<"!Subtarget->canUseCMOV()">; +def HasCtSelect : Predicate<"Subtarget->hasCtSelect()">; def HasNOPL : Predicate<"Subtarget->hasNOPL()">; def HasMMX : Predicate<"Subtarget->hasMMX()">; def HasSSE1 : Predicate<"Subtarget->hasSSE1()">; diff --git a/llvm/test/CodeGen/X86/ctselect-edge-cases.ll b/llvm/test/CodeGen/X86/ctselect-edge-cases.ll new file mode 100644 index 0000000000000..06791a3262749 --- /dev/null +++ b/llvm/test/CodeGen/X86/ctselect-edge-cases.ll @@ -0,0 +1,336 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+cmov | FileCheck %s --check-prefix=X64 +; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mattr=+cmov | FileCheck %s --check-prefix=X32 + +; Test ct.select edge cases and corner cases + +; Test with very large integers +define i128 @test_ctselect_i128(i1 %cond, i128 %a, i128 %b) { +; X64-LABEL: test_ctselect_i128: +; X64: # %bb.0: +; X64-NEXT: movq %rcx, %rax +; X64-NEXT: testb $1, %dil +; X64-NEXT: cmovneq %rsi, %rax +; X64-NEXT: cmovneq %rdx, %r8 +; X64-NEXT: movq %r8, %rdx +; X64: retq +; +; X32-LABEL: test_ctselect_i128: +; X32: # %bb.0: +; X32: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-NEXT: movl {{[0-9]+}}(%esp), %edi +; X32-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %esi +; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %edi +; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %ecx +; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %edx +; X32-NEXT: movl %edx, {{[0-9]+}}(%eax) +; X32-NEXT: 
movl %ecx, {{[0-9]+}}(%eax) +; X32-NEXT: movl %edi, {{[0-9]+}}(%eax) +; X32-NEXT: movl %esi, (%eax) +; X32: retl $4 + %result = call i128 @llvm.ct.select.i128(i1 %cond, i128 %a, i128 %b) + ret i128 %result +} + +; Test with small integer types +define i1 @test_ctselect_i1(i1 %cond, i1 %a, i1 %b) { +; X64-LABEL: test_ctselect_i1: +; X64: # %bb.0: +; X64-NEXT: movl %edx, %eax +; X64-NEXT: testb $1, %dil +; X64-NEXT: cmovnel %esi, %eax +; X64: retq +; +; X32-LABEL: test_ctselect_i1: +; X32: # %bb.0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax +; X32: retl + %result = call i1 @llvm.ct.select.i1(i1 %cond, i1 %a, i1 %b) + ret i1 %result +} + +; Test with extremal values +define i32 @test_ctselect_extremal_values(i1 %cond) { +; X64-LABEL: test_ctselect_extremal_values: +; X64: # %bb.0: +; X64-NEXT: testb $1, %dil +; X64-NEXT: movl $2147483647, %ecx +; X64-NEXT: movl $-2147483648, %eax +; X64-NEXT: cmovnel %ecx, %eax +; X64-NEXT: retq +; +; X32-LABEL: test_ctselect_extremal_values: +; X32: # %bb.0: +; X32-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NEXT: movl $2147483647, %ecx +; X32-NEXT: movl $-2147483648, %eax +; X32-NEXT: cmovnel %ecx, %eax +; X32-NEXT: retl + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 2147483647, i32 -2147483648) + ret i32 %result +} + +; Test with floating point special values +define float @test_ctselect_f32_special_values(i1 %cond) { +; X64-LABEL: test_ctselect_f32_special_values: +; X64: # %bb.0: +; X64-NEXT: testb $1, %dil +; X64-NEXT: movl $2143289344, %eax +; X64-NEXT: movl $2139095040, %ecx +; X64-NEXT: cmovnel %eax, %ecx +; X64-NEXT: movd %ecx, %xmm0 +; X64-NEXT: retq +; +; X32-LABEL: test_ctselect_f32_special_values: +; X32: # %bb.0: +; X32-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NEXT: flds .LCPI3_0 +; X32-NEXT: flds .LCPI3_1 +; X32-NEXT: jne .LBB3_2 +; X32-NEXT: # %bb.1: +; X32-NEXT: fstp %st(1) +; X32-NEXT: fldz +; X32-NEXT: .LBB3_2: +; X32-NEXT: fstp %st(0) +; X32-NEXT: retl + %result = call float @llvm.ct.select.f32(i1 %cond, float 0x7FF8000000000000, float 0x7FF0000000000000) + ret float %result +} + +define double @test_ctselect_f64_special_values(i1 %cond) { +; X64-LABEL: test_ctselect_f64_special_values: +; X64: # %bb.0: +; X64-NEXT: testb $1, %dil +; X64-NEXT: movabsq $9221120237041090560, %rax +; X64-NEXT: movabsq $9218868437227405312, %rcx +; X64-NEXT: cmovneq %rax, %rcx +; X64-NEXT: movq %rcx, %xmm0 +; X64-NEXT: retq +; +; X32-LABEL: test_ctselect_f64_special_values: +; X32: # %bb.0: +; X32-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NEXT: flds .LCPI4_0 +; X32-NEXT: flds .LCPI4_1 +; X32-NEXT: jne .LBB4_2 +; X32-NEXT: # %bb.1: +; X32-NEXT: fstp %st(1) +; X32-NEXT: fldz +; X32-NEXT: .LBB4_2: +; X32-NEXT: fstp %st(0) +; X32-NEXT: retl + %result = call double @llvm.ct.select.f64(i1 %cond, double 0x7FF8000000000000, double 0x7FF0000000000000) + ret double %result +} + +; Test with null pointers +define ptr @test_ctselect_null_ptr(i1 %cond, ptr %ptr) { +; X64-LABEL: test_ctselect_null_ptr: +; X64: # %bb.0: +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: testb $1, %dil +; X64-NEXT: cmovneq %rsi, %rax +; X64-NEXT: retq +; +; X32-LABEL: test_ctselect_null_ptr: +; X32: # %bb.0: +; X32-NEXT: xorl %eax, %eax +; X32-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax +; X32-NEXT: retl + %result = call ptr @llvm.ct.select.p0(i1 %cond, ptr %ptr, ptr null) + ret ptr %result +} + +; Test with function pointers +define ptr @test_ctselect_function_ptr(i1 %cond, ptr %func1, 
ptr %func2) { +; X64-LABEL: test_ctselect_function_ptr: +; X64: # %bb.0: +; X64-NEXT: movq %rdx, %rax +; X64-NEXT: testb $1, %dil +; X64-NEXT: cmovneq %rsi, %rax +; X64-NEXT: retq +; +; X32-LABEL: test_ctselect_function_ptr: +; X32: # %bb.0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax +; X32-NEXT: retl + %result = call ptr @llvm.ct.select.p0(i1 %cond, ptr %func1, ptr %func2) + ret ptr %result +} + +; Test with volatile loads +define i32 @test_ctselect_volatile_load(i1 %cond, ptr %p1, ptr %p2) { +; X64-LABEL: test_ctselect_volatile_load: +; X64: # %bb.0: +; X64-NEXT: movl (%rsi), %ecx +; X64-NEXT: movl (%rdx), %eax +; X64-NEXT: testb $1, %dil +; X64-NEXT: cmovnel %ecx, %eax +; X64-NEXT: retq +; +; X32-LABEL: test_ctselect_volatile_load: +; X32: # %bb.0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl (%ecx), %ecx +; X32-NEXT: movl (%eax), %eax +; X32-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NEXT: cmovnel %ecx, %eax +; X32-NEXT: retl + %a = load volatile i32, ptr %p1 + %b = load volatile i32, ptr %p2 + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) + ret i32 %result +} + +; Test with atomic loads +define i32 @test_ctselect_atomic_load(i1 %cond, ptr %p1, ptr %p2) { +; X64-LABEL: test_ctselect_atomic_load: +; X64: # %bb.0: +; X64-NEXT: movl (%rsi), %ecx +; X64-NEXT: movl (%rdx), %eax +; X64-NEXT: testb $1, %dil +; X64-NEXT: cmovnel %ecx, %eax +; X64-NEXT: retq +; +; X32-LABEL: test_ctselect_atomic_load: +; X32: # %bb.0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl (%ecx), %ecx +; X32-NEXT: movl (%eax), %eax +; X32-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NEXT: cmovnel %ecx, %eax +; X32-NEXT: retl + %a = load atomic i32, ptr %p1 acquire, align 4 + %b = load atomic i32, ptr %p2 acquire, align 4 + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) + ret i32 %result +} + +; Test with condition from icmp on pointers +define ptr @test_ctselect_ptr_cmp(ptr %p1, ptr %p2, ptr %a, ptr %b) { +; X64-LABEL: test_ctselect_ptr_cmp: +; X64: # %bb.0: +; X64-NEXT: movq %rcx, %rax +; X64-NEXT: cmpq %rsi, %rdi +; X64-NEXT: sete %cl +; X64-NEXT: testb %cl, %cl +; X64-NEXT: cmovneq %rdx, %rax +; X64-NEXT: retq +; +; X32-LABEL: test_ctselect_ptr_cmp: +; X32: # %bb.0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: cmpl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: sete %cl +; X32-NEXT: testb %cl, %cl +; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax +; X32-NEXT: retl + %cmp = icmp eq ptr %p1, %p2 + %result = call ptr @llvm.ct.select.p0(i1 %cmp, ptr %a, ptr %b) + ret ptr %result +} + +; Test with struct pointer types (struct types themselves may not be directly supported) +%struct.pair = type { i32, i32 } + +define ptr @test_ctselect_struct_ptr(i1 %cond, ptr %a, ptr %b) { +; X64-LABEL: test_ctselect_struct_ptr: +; X64: # %bb.0: +; X64-NEXT: movq %rdx, %rax +; X64-NEXT: testb $1, %dil +; X64-NEXT: cmovneq %rsi, %rax +; X64-NEXT: retq +; +; X32-LABEL: test_ctselect_struct_ptr: +; X32: # %bb.0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax +; X32-NEXT: retl + %result = call ptr @llvm.ct.select.p0(i1 %cond, ptr %a, ptr %b) + ret ptr %result +} + +; Test with deeply nested conditions (stress test for instruction selection) +define i32 @test_ctselect_deeply_nested(i1 %c1, i1 %c2, i1 %c3, i1 %c4, i32 
%a, i32 %b, i32 %c, i32 %d, i32 %e) { +; X64-LABEL: test_ctselect_deeply_nested: +; X64: # %bb.0: +; X64-NEXT: movl 24(%rsp), %eax +; X64-NEXT: movl 16(%rsp), %r10d +; X64-NEXT: movl 8(%rsp), %r11d +; X64-NEXT: testb $1, %dil +; X64-NEXT: cmovnel %r8d, %r9d +; X64-NEXT: testb $1, %sil +; X64-NEXT: cmovnel %r9d, %r11d +; X64-NEXT: testb $1, %dl +; X64-NEXT: cmovnel %r11d, %r10d +; X64-NEXT: testb $1, %cl +; X64-NEXT: cmovnel %r10d, %eax +; X64-NEXT: retq +; +; X32-LABEL: test_ctselect_deeply_nested: +; X32: # %bb.0: +; X32-NEXT: pushl %esi +; X32: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %esi +; X32-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NEXT: cmovnel %esi, %edx +; X32-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NEXT: cmovnel %edx, %ecx +; X32-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NEXT: cmovnel %ecx, %eax +; X32-NEXT: popl %esi +; X32: retl + %sel1 = call i32 @llvm.ct.select.i32(i1 %c1, i32 %a, i32 %b) + %sel2 = call i32 @llvm.ct.select.i32(i1 %c2, i32 %sel1, i32 %c) + %sel3 = call i32 @llvm.ct.select.i32(i1 %c3, i32 %sel2, i32 %d) + %sel4 = call i32 @llvm.ct.select.i32(i1 %c4, i32 %sel3, i32 %e) + ret i32 %sel4 +} + +; Test with misaligned loads +define i32 @test_ctselect_misaligned_load(i1 %cond, ptr %p1, ptr %p2) { +; X64-LABEL: test_ctselect_misaligned_load: +; X64: # %bb.0: +; X64-NEXT: movl (%rdx), %eax +; X64-NEXT: testb $1, %dil +; X64-NEXT: cmovnel (%rsi), %eax +; X64-NEXT: retq +; +; X32-LABEL: test_ctselect_misaligned_load: +; X32: # %bb.0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl (%eax), %eax +; X32-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NEXT: cmovnel (%ecx), %eax +; X32-NEXT: retl + %a = load i32, ptr %p1, align 1 + %b = load i32, ptr %p2, align 1 + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) + ret i32 %result +} + +; Declare the intrinsics +declare i1 @llvm.ct.select.i1(i1, i1, i1) +declare i128 @llvm.ct.select.i128(i1, i128, i128) +declare i32 @llvm.ct.select.i32(i1, i32, i32) +declare float @llvm.ct.select.f32(i1, float, float) +declare double @llvm.ct.select.f64(i1, double, double) +declare ptr @llvm.ct.select.p0(i1, ptr, ptr) diff --git a/llvm/test/CodeGen/X86/ctselect-optimization.ll b/llvm/test/CodeGen/X86/ctselect-optimization.ll new file mode 100644 index 0000000000000..fff76e34ec7e6 --- /dev/null +++ b/llvm/test/CodeGen/X86/ctselect-optimization.ll @@ -0,0 +1,294 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+cmov | FileCheck %s + +; Test ct.select optimization patterns + +; Test smin(x, 0) pattern optimization +define i32 @test_ctselect_smin_zero(i32 %x) { +; CHECK-LABEL: test_ctselect_smin_zero: +; CHECK: # %bb.0: +; CHECK-NEXT: testl %edi, %edi +; CHECK-NEXT: sets %cl +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: testb %cl, %cl +; CHECK-NEXT: cmovnel %edi, %eax +; CHECK-NEXT: retq + %cmp = icmp slt i32 %x, 0 + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 0) + ret i32 %result +} + +; Test smax(x, 0) pattern optimization +define i32 @test_ctselect_smax_zero(i32 %x) { +; CHECK-LABEL: test_ctselect_smax_zero: +; CHECK: # %bb.0: +; CHECK-NEXT: testl %edi, %edi +; CHECK-NEXT: setg %cl +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: testb %cl, %cl +; CHECK-NEXT: cmovnel %edi, %eax +; CHECK-NEXT: retq + 
%cmp = icmp sgt i32 %x, 0 + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 0) + ret i32 %result +} + +; Test generic smin pattern +define i32 @test_ctselect_smin_generic(i32 %x, i32 %y) { +; CHECK-LABEL: test_ctselect_smin_generic: +; CHECK: # %bb.0: +; CHECK-NEXT: movl %esi, %eax +; CHECK-NEXT: cmpl %esi, %edi +; CHECK-NEXT: setl %cl +; CHECK-NEXT: testb %cl, %cl +; CHECK-NEXT: cmovnel %edi, %eax +; CHECK-NEXT: retq + %cmp = icmp slt i32 %x, %y + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 %y) + ret i32 %result +} + +; Test generic smax pattern +define i32 @test_ctselect_smax_generic(i32 %x, i32 %y) { +; CHECK-LABEL: test_ctselect_smax_generic: +; CHECK: # %bb.0: +; CHECK-NEXT: movl %esi, %eax +; CHECK-NEXT: cmpl %esi, %edi +; CHECK-NEXT: setg %cl +; CHECK-NEXT: testb %cl, %cl +; CHECK-NEXT: cmovnel %edi, %eax +; CHECK-NEXT: retq + %cmp = icmp sgt i32 %x, %y + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 %y) + ret i32 %result +} + +; Test umin pattern +define i32 @test_ctselect_umin_generic(i32 %x, i32 %y) { +; CHECK-LABEL: test_ctselect_umin_generic: +; CHECK: # %bb.0: +; CHECK-NEXT: movl %esi, %eax +; CHECK-NEXT: cmpl %esi, %edi +; CHECK-NEXT: setb %cl +; CHECK-NEXT: testb %cl, %cl +; CHECK-NEXT: cmovnel %edi, %eax +; CHECK-NEXT: retq + %cmp = icmp ult i32 %x, %y + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 %y) + ret i32 %result +} + +; Test umax pattern +define i32 @test_ctselect_umax_generic(i32 %x, i32 %y) { +; CHECK-LABEL: test_ctselect_umax_generic: +; CHECK: # %bb.0: +; CHECK-NEXT: movl %esi, %eax +; CHECK-NEXT: cmpl %esi, %edi +; CHECK-NEXT: seta %cl +; CHECK-NEXT: testb %cl, %cl +; CHECK-NEXT: cmovnel %edi, %eax +; CHECK-NEXT: retq + %cmp = icmp ugt i32 %x, %y + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 %y) + ret i32 %result +} + +; Test abs pattern +define i32 @test_ctselect_abs(i32 %x) { +; CHECK-LABEL: test_ctselect_abs: +; CHECK: # %bb.0: +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: movl %edi, %ecx +; CHECK-NEXT: negl %ecx +; CHECK-NEXT: testl %edi, %edi +; CHECK-NEXT: sets %dl +; CHECK-NEXT: testb %dl, %dl +; CHECK-NEXT: cmovnel %ecx, %eax +; CHECK-NEXT: retq + %neg = sub i32 0, %x + %cmp = icmp slt i32 %x, 0 + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %neg, i32 %x) + ret i32 %result +} + +; Test nabs pattern (negative abs) +define i32 @test_ctselect_nabs(i32 %x) { +; CHECK-LABEL: test_ctselect_nabs: +; CHECK: # %bb.0: +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: negl %eax +; CHECK-NEXT: testl %edi, %edi +; CHECK-NEXT: sets %cl +; CHECK-NEXT: testb %cl, %cl +; CHECK-NEXT: cmovnel %edi, %eax +; CHECK-NEXT: retq + %neg = sub i32 0, %x + %cmp = icmp slt i32 %x, 0 + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 %neg) + ret i32 %result +} + +; Test sign extension pattern +define i32 @test_ctselect_sign_extend(i32 %x) { +; CHECK-LABEL: test_ctselect_sign_extend: +; CHECK: # %bb.0: +; CHECK-NEXT: testl %edi, %edi +; CHECK-NEXT: sets %cl +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: testb %cl, %cl +; CHECK-NEXT: movl $-1, %ecx +; CHECK-NEXT: cmovnel %ecx, %eax +; CHECK-NEXT: retq + %cmp = icmp slt i32 %x, 0 + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 -1, i32 0) + ret i32 %result +} + +; Test zero extension pattern +define i32 @test_ctselect_zero_extend(i32 %x) { +; CHECK-LABEL: test_ctselect_zero_extend: +; CHECK: # %bb.0: +; CHECK-NEXT: testl %edi, %edi +; CHECK-NEXT: setne %cl +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: testb %cl, %cl +; CHECK-NEXT: movl $1, %ecx 
+; CHECK-NEXT: cmovnel %ecx, %eax +; CHECK-NEXT: retq + %cmp = icmp ne i32 %x, 0 + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 1, i32 0) + ret i32 %result +} + +; Test mask generation pattern +define i32 @test_ctselect_mask_generation(i32 %x) { +; CHECK-LABEL: test_ctselect_mask_generation: +; CHECK: # %bb.0: +; CHECK-NEXT: testl %edi, %edi +; CHECK-NEXT: sets %cl +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: testb %cl, %cl +; CHECK-NEXT: movl $-1, %ecx +; CHECK-NEXT: cmovnel %ecx, %eax +; CHECK-NEXT: retq + %cmp = icmp slt i32 %x, 0 + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 -1, i32 0) + ret i32 %result +} + +; Test constant folding with known condition +define i32 @test_ctselect_constant_folding_true(i32 %a, i32 %b) { +; CHECK-LABEL: test_ctselect_constant_folding_true: +; CHECK: # %bb.0: +; CHECK-NEXT: movl %esi, %eax +; CHECK-NEXT: movb $1, %cl +; CHECK-NEXT: testb %cl, %cl +; CHECK-NEXT: cmovnel %edi, %eax +; CHECK-NEXT: retq + %result = call i32 @llvm.ct.select.i32(i1 true, i32 %a, i32 %b) + ret i32 %result +} + +define i32 @test_ctselect_constant_folding_false(i32 %a, i32 %b) { +; CHECK-LABEL: test_ctselect_constant_folding_false: +; CHECK: # %bb.0: +; CHECK-NEXT: movl %esi, %eax +; CHECK-NEXT: xorl %ecx, %ecx +; CHECK-NEXT: testb %cl, %cl +; CHECK-NEXT: cmovnel %edi, %eax +; CHECK-NEXT: retq + %result = call i32 @llvm.ct.select.i32(i1 false, i32 %a, i32 %b) + ret i32 %result +} + +; Test with identical operands +define i32 @test_ctselect_identical_operands(i1 %cond, i32 %x) { +; CHECK-LABEL: test_ctselect_identical_operands: +; CHECK: # %bb.0: +; CHECK-NEXT: movl %esi, %eax +; CHECK-NEXT: testb $1, %dil +; CHECK-NEXT: cmovnel %esi, %eax +; CHECK-NEXT: retq + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %x, i32 %x) + ret i32 %result +} + +; Test with inverted condition +define i32 @test_ctselect_inverted_condition(i32 %x, i32 %y, i32 %a, i32 %b) { +; CHECK-LABEL: test_ctselect_inverted_condition: +; CHECK: # %bb.0: +; CHECK-NEXT: movl %edx, %eax +; CHECK-NEXT: cmpl %esi, %edi +; CHECK-NEXT: sete %dl +; CHECK-NEXT: testb %dl, %dl +; CHECK-NEXT: cmovnel %ecx, %eax +; CHECK-NEXT: retq + %cmp = icmp eq i32 %x, %y + %not_cmp = xor i1 %cmp, true + %result = call i32 @llvm.ct.select.i32(i1 %not_cmp, i32 %a, i32 %b) + ret i32 %result +} + +; Test for 64-bit specific optimizations +define i64 @test_ctselect_i64_smin_zero(i64 %x) { +; CHECK-LABEL: test_ctselect_i64_smin_zero: +; CHECK: # %bb.0: +; CHECK-NEXT: testq %rdi, %rdi +; CHECK-NEXT: sets %cl +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: testb %cl, %cl +; CHECK-NEXT: cmovneq %rdi, %rax +; CHECK-NEXT: retq + %cmp = icmp slt i64 %x, 0 + %result = call i64 @llvm.ct.select.i64(i1 %cmp, i64 %x, i64 0) + ret i64 %result +} + +; Test for floating point optimizations +define float @test_ctselect_f32_zero_positive(float %x) { +; CHECK-LABEL: test_ctselect_f32_zero_positive: +; CHECK: # %bb.0: +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: cmpltss %xmm0, %xmm1 +; CHECK-NEXT: andps %xmm1, %xmm0 +; CHECK-NEXT: retq + %cmp = fcmp ogt float %x, 0.0 + %result = call float @llvm.ct.select.f32(i1 %cmp, float %x, float 0.0) + ret float %result +} + +define double @test_ctselect_f64_zero_positive(double %x) { +; CHECK-LABEL: test_ctselect_f64_zero_positive: +; CHECK: # %bb.0: +; CHECK-NEXT: xorpd %xmm1, %xmm1 +; CHECK-NEXT: cmpltsd %xmm0, %xmm1 +; CHECK-NEXT: andpd %xmm1, %xmm0 +; CHECK-NEXT: retq + %cmp = fcmp ogt double %x, 0.0 + %result = call double @llvm.ct.select.f64(i1 %cmp, double %x, double 0.0) + ret double %result 
+} + +; Test chain of ct.select operations +define i32 @test_ctselect_chain(i1 %c1, i1 %c2, i1 %c3, i32 %a, i32 %b, i32 %c, i32 %d) { +; CHECK-LABEL: test_ctselect_chain: +; CHECK: # %bb.0: +; CHECK-NEXT: movl 8(%rsp), %eax +; CHECK-NEXT: testb $1, %dil +; CHECK-NEXT: cmovnel %ecx, %r8d +; CHECK-NEXT: testb $1, %sil +; CHECK-NEXT: cmovnel %r8d, %r9d +; CHECK-NEXT: testb $1, %dl +; CHECK-NEXT: cmovnel %r9d, %eax +; CHECK-NEXT: retq + %sel1 = call i32 @llvm.ct.select.i32(i1 %c1, i32 %a, i32 %b) + %sel2 = call i32 @llvm.ct.select.i32(i1 %c2, i32 %sel1, i32 %c) + %sel3 = call i32 @llvm.ct.select.i32(i1 %c3, i32 %sel2, i32 %d) + ret i32 %sel3 +} + +; Declare the intrinsics +declare i32 @llvm.ct.select.i32(i1, i32, i32) +declare i64 @llvm.ct.select.i64(i1, i64, i64) +declare float @llvm.ct.select.f32(i1, float, float) +declare double @llvm.ct.select.f64(i1, double, double) \ No newline at end of file diff --git a/llvm/test/CodeGen/X86/ctselect-vector.ll b/llvm/test/CodeGen/X86/ctselect-vector.ll new file mode 100644 index 0000000000000..29877e374efa2 --- /dev/null +++ b/llvm/test/CodeGen/X86/ctselect-vector.ll @@ -0,0 +1,1151 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2 | FileCheck %s --check-prefix=SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx | FileCheck %s --check-prefix=AVX +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2 | FileCheck %s --check-prefix=AVX2 + +; Test ct.select functionality for vector types + +; 128-bit vectors +define <4 x i32> @test_ctselect_v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b) { +; SSE2-LABEL: test_ctselect_v4i32: +; SSE2: # %bb.0: +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: testb $1, %dil +; SSE2-NEXT: sete %al +; SSE2-NEXT: negl %eax +; SSE2-NEXT: movd %eax, %xmm2 +; SSE2-NEXT: pshufd $0, %xmm2, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: pandn %xmm0, %xmm2 +; SSE2-NEXT: por %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; AVX-LABEL: test_ctselect_v4i32: +; AVX: # %bb.0: +; AVX-NEXT: xorl %eax, %eax +; AVX-NEXT: testb $1, %dil +; AVX-NEXT: sete %al +; AVX-NEXT: negl %eax +; AVX-NEXT: movd %eax, %xmm2 +; AVX-NEXT: pshufd $0, %xmm2, %xmm2 +; AVX-NEXT: movdqa %xmm2, %xmm2 +; AVX-NEXT: pand %xmm2, %xmm1 +; AVX-NEXT: pandn %xmm0, %xmm2 +; AVX-NEXT: por %xmm1, %xmm2 +; AVX-NEXT: movdqa %xmm2, %xmm0 +; AVX-NEXT: retq +; +; AVX2-LABEL: test_ctselect_v4i32: +; AVX2: # %bb.0: +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: testb $1, %dil +; AVX2-NEXT: sete %al +; AVX2-NEXT: negl %eax +; AVX2-NEXT: movd %eax, %xmm2 +; AVX2-NEXT: pshufd $0, %xmm2, %xmm2 +; AVX2-NEXT: movdqa %xmm2, %xmm2 +; AVX2-NEXT: pand %xmm2, %xmm1 +; AVX2-NEXT: pandn %xmm0, %xmm2 +; AVX2-NEXT: por %xmm1, %xmm2 +; AVX2-NEXT: movdqa %xmm2, %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_ctselect_v4i32: +; AVX512: # %bb.0: +; AVX512-NEXT: testb %dil, %dil +; AVX512-NEXT: je .LBB0_2 +; AVX512-NEXT: # %bb.1: +; AVX512-NEXT: vmovaps %xmm0, %xmm1 +; AVX512-NEXT: .LBB0_2: +; AVX512-NEXT: vmovaps %xmm1, %xmm0 +; AVX512-NEXT: retq + %result = call <4 x i32> @llvm.ct.select.v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b) + ret <4 x i32> %result +} + +define <4 x float> @test_ctselect_v4f32(i1 %cond, <4 x float> %a, <4 x float> %b) { +; SSE2-LABEL: test_ctselect_v4f32: +; SSE2: # %bb.0: +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: testb $1, %dil +; SSE2-NEXT: sete %al +; SSE2-NEXT: negl %eax +; SSE2-NEXT: movd %eax, %xmm2 +; 
SSE2-NEXT: pshufd $0, %xmm2, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: pandn %xmm0, %xmm2 +; SSE2-NEXT: por %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; AVX-LABEL: test_ctselect_v4f32: +; AVX: # %bb.0: +; AVX-NEXT: xorl %eax, %eax +; AVX-NEXT: testb $1, %dil +; AVX-NEXT: sete %al +; AVX-NEXT: negl %eax +; AVX-NEXT: movd %eax, %xmm2 +; AVX-NEXT: pshufd $0, %xmm2, %xmm2 +; AVX-NEXT: movdqa %xmm2, %xmm2 +; AVX-NEXT: pand %xmm2, %xmm1 +; AVX-NEXT: pandn %xmm0, %xmm2 +; AVX-NEXT: por %xmm1, %xmm2 +; AVX-NEXT: movdqa %xmm2, %xmm0 +; AVX-NEXT: retq +; +; AVX2-LABEL: test_ctselect_v4f32: +; AVX2: # %bb.0: +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: testb $1, %dil +; AVX2-NEXT: sete %al +; AVX2-NEXT: negl %eax +; AVX2-NEXT: movd %eax, %xmm2 +; AVX2-NEXT: pshufd $0, %xmm2, %xmm2 +; AVX2-NEXT: movdqa %xmm2, %xmm2 +; AVX2-NEXT: pand %xmm2, %xmm1 +; AVX2-NEXT: pandn %xmm0, %xmm2 +; AVX2-NEXT: por %xmm1, %xmm2 +; AVX2-NEXT: movdqa %xmm2, %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_ctselect_v4f32: +; AVX512: # %bb.0: +; AVX512-NEXT: testb %dil, %dil +; AVX512-NEXT: je .LBB1_2 +; AVX512-NEXT: # %bb.1: +; AVX512-NEXT: vmovaps %xmm0, %xmm1 +; AVX512-NEXT: .LBB1_2: +; AVX512-NEXT: vmovaps %xmm1, %xmm0 +; AVX512-NEXT: retq + %result = call <4 x float> @llvm.ct.select.v4f32(i1 %cond, <4 x float> %a, <4 x float> %b) + ret <4 x float> %result +} + +define <2 x i64> @test_ctselect_v2i64(i1 %cond, <2 x i64> %a, <2 x i64> %b) { +; SSE2-LABEL: test_ctselect_v2i64: +; SSE2: # %bb.0: +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: testb $1, %dil +; SSE2-NEXT: sete %al +; SSE2-NEXT: negl %eax +; SSE2-NEXT: movd %eax, %xmm2 +; SSE2-NEXT: pshufd $0, %xmm2, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: pandn %xmm0, %xmm2 +; SSE2-NEXT: por %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; AVX-LABEL: test_ctselect_v2i64: +; AVX: # %bb.0: +; AVX-NEXT: xorl %eax, %eax +; AVX-NEXT: testb $1, %dil +; AVX-NEXT: sete %al +; AVX-NEXT: negl %eax +; AVX-NEXT: movd %eax, %xmm2 +; AVX-NEXT: pshufd $0, %xmm2, %xmm2 +; AVX-NEXT: movdqa %xmm2, %xmm2 +; AVX-NEXT: pand %xmm2, %xmm1 +; AVX-NEXT: pandn %xmm0, %xmm2 +; AVX-NEXT: por %xmm1, %xmm2 +; AVX-NEXT: movdqa %xmm2, %xmm0 +; AVX-NEXT: retq +; +; AVX2-LABEL: test_ctselect_v2i64: +; AVX2: # %bb.0: +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: testb $1, %dil +; AVX2-NEXT: sete %al +; AVX2-NEXT: negl %eax +; AVX2-NEXT: movd %eax, %xmm2 +; AVX2-NEXT: pshufd $0, %xmm2, %xmm2 +; AVX2-NEXT: movdqa %xmm2, %xmm2 +; AVX2-NEXT: pand %xmm2, %xmm1 +; AVX2-NEXT: pandn %xmm0, %xmm2 +; AVX2-NEXT: por %xmm1, %xmm2 +; AVX2-NEXT: movdqa %xmm2, %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_ctselect_v2i64: +; AVX512: # %bb.0: +; AVX512-NEXT: testb %dil, %dil +; AVX512-NEXT: je .LBB2_2 +; AVX512-NEXT: # %bb.1: +; AVX512-NEXT: vmovaps %xmm0, %xmm1 +; AVX512-NEXT: .LBB2_2: +; AVX512-NEXT: vmovaps %xmm1, %xmm0 +; AVX512-NEXT: retq + %result = call <2 x i64> @llvm.ct.select.v2i64(i1 %cond, <2 x i64> %a, <2 x i64> %b) + ret <2 x i64> %result +} + +define <2 x double> @test_ctselect_v2f64(i1 %cond, <2 x double> %a, <2 x double> %b) { +; SSE2-LABEL: test_ctselect_v2f64: +; SSE2: # %bb.0: +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: testb $1, %dil +; SSE2-NEXT: sete %al +; SSE2-NEXT: negl %eax +; SSE2-NEXT: movd %eax, %xmm2 +; SSE2-NEXT: pshufd $0, %xmm2, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: pandn %xmm0, %xmm2 +; SSE2-NEXT: por %xmm1, 
%xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; AVX-LABEL: test_ctselect_v2f64: +; AVX: # %bb.0: +; AVX-NEXT: xorl %eax, %eax +; AVX-NEXT: testb $1, %dil +; AVX-NEXT: sete %al +; AVX-NEXT: negl %eax +; AVX-NEXT: movd %eax, %xmm2 +; AVX-NEXT: pshufd $0, %xmm2, %xmm2 +; AVX-NEXT: movdqa %xmm2, %xmm2 +; AVX-NEXT: pand %xmm2, %xmm1 +; AVX-NEXT: pandn %xmm0, %xmm2 +; AVX-NEXT: por %xmm1, %xmm2 +; AVX-NEXT: movdqa %xmm2, %xmm0 +; AVX-NEXT: retq +; +; AVX2-LABEL: test_ctselect_v2f64: +; AVX2: # %bb.0: +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: testb $1, %dil +; AVX2-NEXT: sete %al +; AVX2-NEXT: negl %eax +; AVX2-NEXT: movd %eax, %xmm2 +; AVX2-NEXT: pshufd $0, %xmm2, %xmm2 +; AVX2-NEXT: movdqa %xmm2, %xmm2 +; AVX2-NEXT: pand %xmm2, %xmm1 +; AVX2-NEXT: pandn %xmm0, %xmm2 +; AVX2-NEXT: por %xmm1, %xmm2 +; AVX2-NEXT: movdqa %xmm2, %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_ctselect_v2f64: +; AVX512: # %bb.0: +; AVX512-NEXT: testb %dil, %dil +; AVX512-NEXT: je .LBB3_2 +; AVX512-NEXT: # %bb.1: +; AVX512-NEXT: vmovapd %xmm0, %xmm1 +; AVX512-NEXT: .LBB3_2: +; AVX512-NEXT: vmovapd %xmm1, %xmm0 +; AVX512-NEXT: retq + %result = call <2 x double> @llvm.ct.select.v2f64(i1 %cond, <2 x double> %a, <2 x double> %b) + ret <2 x double> %result +} + +; 256-bit vectors +define <8 x i32> @test_ctselect_v8i32(i1 %cond, <8 x i32> %a, <8 x i32> %b) { +; SSE2-LABEL: test_ctselect_v8i32: +; SSE2: # %bb.0: +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: testb $1, %dil +; SSE2-NEXT: sete %al +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: negl %eax +; SSE2-NEXT: movd %eax, %xmm4 +; SSE2-NEXT: pshufd $0, %xmm4, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm2 +; SSE2-NEXT: pandn %xmm0, %xmm4 +; SSE2-NEXT: por %xmm2, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm0 +; SSE2-NEXT: sete %cl +; SSE2-NEXT: negl %ecx +; SSE2-NEXT: movd %ecx, %xmm2 +; SSE2-NEXT: pshufd $0, %xmm2, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm3 +; SSE2-NEXT: pandn %xmm1, %xmm2 +; SSE2-NEXT: por %xmm3, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: retq +; +; AVX-LABEL: test_ctselect_v8i32: +; AVX: # %bb.0: +; AVX-NEXT: xorl %eax, %eax +; AVX-NEXT: testb $1, %dil +; AVX-NEXT: sete %al +; AVX-NEXT: negl %eax +; AVX-NEXT: vmovd %eax, %ymm2 +; AVX-NEXT: vshufps $0, %ymm2, %ymm2, %ymm2 +; AVX-NEXT: vmovaps %ymm2, %ymm2 +; AVX-NEXT: vandps %ymm1, %ymm2, %ymm1 +; AVX-NEXT: vandnps %ymm0, %ymm2, %ymm0 +; AVX-NEXT: vorps %ymm0, %ymm1, %ymm0 +; AVX-NEXT: vmovaps %ymm0, %ymm0 +; AVX-NEXT: retq +; +; AVX2-LABEL: test_ctselect_v8i32: +; AVX2: # %bb.0: +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: testb $1, %dil +; AVX2-NEXT: sete %al +; AVX2-NEXT: negl %eax +; AVX2-NEXT: vmovd %eax, %ymm2 +; AVX2-NEXT: vpshufd $0, %ymm2, %ymm2 +; AVX2-NEXT: vmovdqa %ymm2, %ymm2 +; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpandn %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_ctselect_v8i32: +; AVX512: # %bb.0: +; AVX512-NEXT: testb %dil, %dil +; AVX512-NEXT: je .LBB4_2 +; AVX512-NEXT: # %bb.1: +; AVX512-NEXT: vmovaps %ymm0, %ymm1 +; AVX512-NEXT: .LBB4_2: +; AVX512-NEXT: vmovaps %ymm1, %ymm0 +; AVX512-NEXT: retq + %result = call <8 x i32> @llvm.ct.select.v8i32(i1 %cond, <8 x i32> %a, <8 x i32> %b) + ret <8 x i32> %result +} + +define <8 x float> @test_ctselect_v8f32(i1 %cond, <8 x float> %a, <8 x float> %b) { +; SSE2-LABEL: test_ctselect_v8f32: +; SSE2: # %bb.0: +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: testb $1, %dil +; 
SSE2-NEXT: sete %al +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: negl %eax +; SSE2-NEXT: movd %eax, %xmm4 +; SSE2-NEXT: pshufd $0, %xmm4, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm2 +; SSE2-NEXT: pandn %xmm0, %xmm4 +; SSE2-NEXT: por %xmm2, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm0 +; SSE2-NEXT: sete %cl +; SSE2-NEXT: negl %ecx +; SSE2-NEXT: movd %ecx, %xmm2 +; SSE2-NEXT: pshufd $0, %xmm2, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm3 +; SSE2-NEXT: pandn %xmm1, %xmm2 +; SSE2-NEXT: por %xmm3, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: retq +; +; AVX-LABEL: test_ctselect_v8f32: +; AVX: # %bb.0: +; AVX-NEXT: xorl %eax, %eax +; AVX-NEXT: testb $1, %dil +; AVX-NEXT: sete %al +; AVX-NEXT: negl %eax +; AVX-NEXT: vmovd %eax, %ymm2 +; AVX-NEXT: vshufps $0, %ymm2, %ymm2, %ymm2 +; AVX-NEXT: vmovaps %ymm2, %ymm2 +; AVX-NEXT: vandps %ymm1, %ymm2, %ymm1 +; AVX-NEXT: vandnps %ymm0, %ymm2, %ymm0 +; AVX-NEXT: vorps %ymm0, %ymm1, %ymm0 +; AVX-NEXT: vmovaps %ymm0, %ymm0 +; AVX-NEXT: retq +; +; AVX2-LABEL: test_ctselect_v8f32: +; AVX2: # %bb.0: +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: testb $1, %dil +; AVX2-NEXT: sete %al +; AVX2-NEXT: negl %eax +; AVX2-NEXT: vmovd %eax, %ymm2 +; AVX2-NEXT: vpshufd $0, %ymm2, %ymm2 +; AVX2-NEXT: vmovdqa %ymm2, %ymm2 +; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpandn %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_ctselect_v8f32: +; AVX512: # %bb.0: +; AVX512-NEXT: testb %dil, %dil +; AVX512-NEXT: je .LBB5_2 +; AVX512-NEXT: # %bb.1: +; AVX512-NEXT: vmovaps %ymm0, %ymm1 +; AVX512-NEXT: .LBB5_2: +; AVX512-NEXT: vmovaps %ymm1, %ymm0 +; AVX512-NEXT: retq + %result = call <8 x float> @llvm.ct.select.v8f32(i1 %cond, <8 x float> %a, <8 x float> %b) + ret <8 x float> %result +} + +define <4 x i64> @test_ctselect_v4i64(i1 %cond, <4 x i64> %a, <4 x i64> %b) { +; SSE2-LABEL: test_ctselect_v4i64: +; SSE2: # %bb.0: +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: testb $1, %dil +; SSE2-NEXT: sete %al +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: negl %eax +; SSE2-NEXT: movd %eax, %xmm4 +; SSE2-NEXT: pshufd $0, %xmm4, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm2 +; SSE2-NEXT: pandn %xmm0, %xmm4 +; SSE2-NEXT: por %xmm2, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm0 +; SSE2-NEXT: sete %cl +; SSE2-NEXT: negl %ecx +; SSE2-NEXT: movd %ecx, %xmm2 +; SSE2-NEXT: pshufd $0, %xmm2, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm3 +; SSE2-NEXT: pandn %xmm1, %xmm2 +; SSE2-NEXT: por %xmm3, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: retq +; +; AVX-LABEL: test_ctselect_v4i64: +; AVX: # %bb.0: +; AVX-NEXT: xorl %eax, %eax +; AVX-NEXT: testb $1, %dil +; AVX-NEXT: sete %al +; AVX-NEXT: negl %eax +; AVX-NEXT: vmovd %eax, %ymm2 +; AVX-NEXT: vshufpd $0, %ymm2, %ymm2, %ymm2 +; AVX-NEXT: vmovapd %ymm2, %ymm2 +; AVX-NEXT: vandpd %ymm1, %ymm2, %ymm1 +; AVX-NEXT: vandnpd %ymm0, %ymm2, %ymm0 +; AVX-NEXT: vorpd %ymm0, %ymm1, %ymm0 +; AVX-NEXT: vmovapd %ymm0, %ymm0 +; AVX-NEXT: retq +; +; AVX2-LABEL: test_ctselect_v4i64: +; AVX2: # %bb.0: +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: testb $1, %dil +; AVX2-NEXT: sete %al +; AVX2-NEXT: negl %eax +; AVX2-NEXT: vmovd %eax, %ymm2 +; AVX2-NEXT: vshufpd $0, %ymm2, %ymm2, %ymm2 +; AVX2-NEXT: vmovapd %ymm2, %ymm2 +; AVX2-NEXT: vandpd %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vandnpd %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vorpd %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vmovapd %ymm0, %ymm0 +; 
AVX2-NEXT: retq +; +; AVX512-LABEL: test_ctselect_v4i64: +; AVX512: # %bb.0: +; AVX512-NEXT: testb %dil, %dil +; AVX512-NEXT: je .LBB6_2 +; AVX512-NEXT: # %bb.1: +; AVX512-NEXT: vmovaps %ymm0, %ymm1 +; AVX512-NEXT: .LBB6_2: +; AVX512-NEXT: vmovaps %ymm1, %ymm0 +; AVX512-NEXT: retq + %result = call <4 x i64> @llvm.ct.select.v4i64(i1 %cond, <4 x i64> %a, <4 x i64> %b) + ret <4 x i64> %result +} + +define <4 x double> @test_ctselect_v4f64(i1 %cond, <4 x double> %a, <4 x double> %b) { +; SSE2-LABEL: test_ctselect_v4f64: +; SSE2: # %bb.0: +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: testb $1, %dil +; SSE2-NEXT: sete %al +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: negl %eax +; SSE2-NEXT: movd %eax, %xmm4 +; SSE2-NEXT: pshufd $0, %xmm4, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm2 +; SSE2-NEXT: pandn %xmm0, %xmm4 +; SSE2-NEXT: por %xmm2, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm0 +; SSE2-NEXT: sete %cl +; SSE2-NEXT: negl %ecx +; SSE2-NEXT: movd %ecx, %xmm2 +; SSE2-NEXT: pshufd $0, %xmm2, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm3 +; SSE2-NEXT: pandn %xmm1, %xmm2 +; SSE2-NEXT: por %xmm3, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: retq +; +; AVX-LABEL: test_ctselect_v4f64: +; AVX: # %bb.0: +; AVX-NEXT: xorl %eax, %eax +; AVX-NEXT: testb $1, %dil +; AVX-NEXT: sete %al +; AVX-NEXT: negl %eax +; AVX-NEXT: vmovd %eax, %ymm2 +; AVX-NEXT: vshufpd $0, %ymm2, %ymm2, %ymm2 +; AVX-NEXT: vmovapd %ymm2, %ymm2 +; AVX-NEXT: vandpd %ymm1, %ymm2, %ymm1 +; AVX-NEXT: vandnpd %ymm0, %ymm2, %ymm0 +; AVX-NEXT: vorpd %ymm0, %ymm1, %ymm0 +; AVX-NEXT: vmovapd %ymm0, %ymm0 +; AVX-NEXT: retq +; +; AVX2-LABEL: test_ctselect_v4f64: +; AVX2: # %bb.0: +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: testb $1, %dil +; AVX2-NEXT: sete %al +; AVX2-NEXT: negl %eax +; AVX2-NEXT: vmovd %eax, %ymm2 +; AVX2-NEXT: vshufpd $0, %ymm2, %ymm2, %ymm2 +; AVX2-NEXT: vmovapd %ymm2, %ymm2 +; AVX2-NEXT: vandpd %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vandnpd %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vorpd %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vmovapd %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_ctselect_v4f64: +; AVX512: # %bb.0: +; AVX512-NEXT: testb %dil, %dil +; AVX512-NEXT: je .LBB7_2 +; AVX512-NEXT: # %bb.1: +; AVX512-NEXT: vmovapd %ymm0, %ymm1 +; AVX512-NEXT: .LBB7_2: +; AVX512-NEXT: vmovapd %ymm1, %ymm0 +; AVX512-NEXT: retq + %result = call <4 x double> @llvm.ct.select.v4f64(i1 %cond, <4 x double> %a, <4 x double> %b) + ret <4 x double> %result +} + +; 512-bit vectors (AVX512 only) +define <16 x i32> @test_ctselect_v16i32(i1 %cond, <16 x i32> %a, <16 x i32> %b) { +; SSE2-LABEL: test_ctselect_v16i32: +; SSE2: # %bb.0: +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: testb $1, %dil +; SSE2-NEXT: sete %al +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: negl %eax +; SSE2-NEXT: movd %eax, %xmm8 +; SSE2-NEXT: pshufd $0, %xmm8, %xmm8 +; SSE2-NEXT: movdqa %xmm8, %xmm8 +; SSE2-NEXT: pand %xmm8, %xmm4 +; SSE2-NEXT: pandn %xmm0, %xmm8 +; SSE2-NEXT: por %xmm4, %xmm8 +; SSE2-NEXT: movdqa %xmm8, %xmm0 +; SSE2-NEXT: sete %cl +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: negl %ecx +; SSE2-NEXT: movd %ecx, %xmm4 +; SSE2-NEXT: pshufd $0, %xmm4, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm5 +; SSE2-NEXT: pandn %xmm1, %xmm4 +; SSE2-NEXT: por %xmm5, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm1 +; SSE2-NEXT: sete %al +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: negl %eax +; SSE2-NEXT: movd %eax, %xmm4 +; SSE2-NEXT: pshufd $0, %xmm4, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm4 +; SSE2-NEXT: pand %xmm4, 
%xmm6 +; SSE2-NEXT: pandn %xmm2, %xmm4 +; SSE2-NEXT: por %xmm6, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm2 +; SSE2-NEXT: sete %cl +; SSE2-NEXT: negl %ecx +; SSE2-NEXT: movd %ecx, %xmm4 +; SSE2-NEXT: pshufd $0, %xmm4, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm7 +; SSE2-NEXT: pandn %xmm3, %xmm4 +; SSE2-NEXT: por %xmm7, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm3 +; SSE2-NEXT: retq +; +; AVX-LABEL: test_ctselect_v16i32: +; AVX: # %bb.0: +; AVX-NEXT: xorl %eax, %eax +; AVX-NEXT: testb $1, %dil +; AVX-NEXT: sete %al +; AVX-NEXT: xorl %ecx, %ecx +; AVX-NEXT: negl %eax +; AVX-NEXT: vmovd %eax, %ymm4 +; AVX-NEXT: vshufps $0, %ymm4, %ymm4, %ymm4 +; AVX-NEXT: vmovaps %ymm4, %ymm4 +; AVX-NEXT: vandps %ymm2, %ymm4, %ymm2 +; AVX-NEXT: vandnps %ymm0, %ymm4, %ymm0 +; AVX-NEXT: vorps %ymm0, %ymm2, %ymm0 +; AVX-NEXT: vmovaps %ymm0, %ymm0 +; AVX-NEXT: sete %cl +; AVX-NEXT: negl %ecx +; AVX-NEXT: vmovd %ecx, %ymm2 +; AVX-NEXT: vshufps $0, %ymm2, %ymm2, %ymm2 +; AVX-NEXT: vmovaps %ymm2, %ymm2 +; AVX-NEXT: vandps %ymm3, %ymm2, %ymm3 +; AVX-NEXT: vandnps %ymm1, %ymm2, %ymm1 +; AVX-NEXT: vorps %ymm1, %ymm3, %ymm1 +; AVX-NEXT: vmovaps %ymm1, %ymm1 +; AVX-NEXT: retq +; +; AVX2-LABEL: test_ctselect_v16i32: +; AVX2: # %bb.0: +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: testb $1, %dil +; AVX2-NEXT: sete %al +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: negl %eax +; AVX2-NEXT: vmovd %eax, %ymm4 +; AVX2-NEXT: vpshufd $0, %ymm4, %ymm4 +; AVX2-NEXT: vmovdqa %ymm4, %ymm4 +; AVX2-NEXT: vpand %ymm2, %ymm4, %ymm2 +; AVX2-NEXT: vpandn %ymm0, %ymm4, %ymm0 +; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, %ymm0 +; AVX2-NEXT: sete %cl +; AVX2-NEXT: negl %ecx +; AVX2-NEXT: vmovd %ecx, %ymm2 +; AVX2-NEXT: vpshufd $0, %ymm2, %ymm2 +; AVX2-NEXT: vmovdqa %ymm2, %ymm2 +; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm3 +; AVX2-NEXT: vpandn %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX2-NEXT: vmovdqa %ymm1, %ymm1 +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_ctselect_v16i32: +; AVX512: # %bb.0: +; AVX512-NEXT: testb %dil, %dil +; AVX512-NEXT: je .LBB8_2 +; AVX512-NEXT: # %bb.1: +; AVX512-NEXT: vmovaps %zmm0, %zmm1 +; AVX512-NEXT: .LBB8_2: +; AVX512-NEXT: vmovaps %zmm1, %zmm0 +; AVX512-NEXT: retq + %result = call <16 x i32> @llvm.ct.select.v16i32(i1 %cond, <16 x i32> %a, <16 x i32> %b) + ret <16 x i32> %result +} + +define <16 x float> @test_ctselect_v16f32(i1 %cond, <16 x float> %a, <16 x float> %b) { +; SSE2-LABEL: test_ctselect_v16f32: +; SSE2: # %bb.0: +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: testb $1, %dil +; SSE2-NEXT: sete %al +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: negl %eax +; SSE2-NEXT: movd %eax, %xmm8 +; SSE2-NEXT: pshufd $0, %xmm8, %xmm8 +; SSE2-NEXT: movdqa %xmm8, %xmm8 +; SSE2-NEXT: pand %xmm8, %xmm4 +; SSE2-NEXT: pandn %xmm0, %xmm8 +; SSE2-NEXT: por %xmm4, %xmm8 +; SSE2-NEXT: movdqa %xmm8, %xmm0 +; SSE2-NEXT: sete %cl +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: negl %ecx +; SSE2-NEXT: movd %ecx, %xmm4 +; SSE2-NEXT: pshufd $0, %xmm4, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm5 +; SSE2-NEXT: pandn %xmm1, %xmm4 +; SSE2-NEXT: por %xmm5, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm1 +; SSE2-NEXT: sete %al +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: negl %eax +; SSE2-NEXT: movd %eax, %xmm4 +; SSE2-NEXT: pshufd $0, %xmm4, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm6 +; SSE2-NEXT: pandn %xmm2, %xmm4 +; SSE2-NEXT: por %xmm6, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm2 +; SSE2-NEXT: sete %cl +; SSE2-NEXT: negl %ecx +; SSE2-NEXT: movd 
%ecx, %xmm4 +; SSE2-NEXT: pshufd $0, %xmm4, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm7 +; SSE2-NEXT: pandn %xmm3, %xmm4 +; SSE2-NEXT: por %xmm7, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm3 +; SSE2-NEXT: retq +; +; AVX-LABEL: test_ctselect_v16f32: +; AVX: # %bb.0: +; AVX-NEXT: xorl %eax, %eax +; AVX-NEXT: testb $1, %dil +; AVX-NEXT: sete %al +; AVX-NEXT: xorl %ecx, %ecx +; AVX-NEXT: negl %eax +; AVX-NEXT: vmovd %eax, %ymm4 +; AVX-NEXT: vshufps $0, %ymm4, %ymm4, %ymm4 +; AVX-NEXT: vmovaps %ymm4, %ymm4 +; AVX-NEXT: vandps %ymm2, %ymm4, %ymm2 +; AVX-NEXT: vandnps %ymm0, %ymm4, %ymm0 +; AVX-NEXT: vorps %ymm0, %ymm2, %ymm0 +; AVX-NEXT: vmovaps %ymm0, %ymm0 +; AVX-NEXT: sete %cl +; AVX-NEXT: negl %ecx +; AVX-NEXT: vmovd %ecx, %ymm2 +; AVX-NEXT: vshufps $0, %ymm2, %ymm2, %ymm2 +; AVX-NEXT: vmovaps %ymm2, %ymm2 +; AVX-NEXT: vandps %ymm3, %ymm2, %ymm3 +; AVX-NEXT: vandnps %ymm1, %ymm2, %ymm1 +; AVX-NEXT: vorps %ymm1, %ymm3, %ymm1 +; AVX-NEXT: vmovaps %ymm1, %ymm1 +; AVX-NEXT: retq +; +; AVX2-LABEL: test_ctselect_v16f32: +; AVX2: # %bb.0: +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: testb $1, %dil +; AVX2-NEXT: sete %al +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: negl %eax +; AVX2-NEXT: vmovd %eax, %ymm4 +; AVX2-NEXT: vpshufd $0, %ymm4, %ymm4 +; AVX2-NEXT: vmovdqa %ymm4, %ymm4 +; AVX2-NEXT: vpand %ymm2, %ymm4, %ymm2 +; AVX2-NEXT: vpandn %ymm0, %ymm4, %ymm0 +; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, %ymm0 +; AVX2-NEXT: sete %cl +; AVX2-NEXT: negl %ecx +; AVX2-NEXT: vmovd %ecx, %ymm2 +; AVX2-NEXT: vpshufd $0, %ymm2, %ymm2 +; AVX2-NEXT: vmovdqa %ymm2, %ymm2 +; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm3 +; AVX2-NEXT: vpandn %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX2-NEXT: vmovdqa %ymm1, %ymm1 +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_ctselect_v16f32: +; AVX512: # %bb.0: +; AVX512-NEXT: testb %dil, %dil +; AVX512-NEXT: je .LBB9_2 +; AVX512-NEXT: # %bb.1: +; AVX512-NEXT: vmovaps %zmm0, %zmm1 +; AVX512-NEXT: .LBB9_2: +; AVX512-NEXT: vmovaps %zmm1, %zmm0 +; AVX512-NEXT: retq + %result = call <16 x float> @llvm.ct.select.v16f32(i1 %cond, <16 x float> %a, <16 x float> %b) + ret <16 x float> %result +} + +define <8 x i64> @test_ctselect_v8i64(i1 %cond, <8 x i64> %a, <8 x i64> %b) { +; SSE2-LABEL: test_ctselect_v8i64: +; SSE2: # %bb.0: +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: testb $1, %dil +; SSE2-NEXT: sete %al +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: negl %eax +; SSE2-NEXT: movd %eax, %xmm8 +; SSE2-NEXT: pshufd $0, %xmm8, %xmm8 +; SSE2-NEXT: movdqa %xmm8, %xmm8 +; SSE2-NEXT: pand %xmm8, %xmm4 +; SSE2-NEXT: pandn %xmm0, %xmm8 +; SSE2-NEXT: por %xmm4, %xmm8 +; SSE2-NEXT: movdqa %xmm8, %xmm0 +; SSE2-NEXT: sete %cl +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: negl %ecx +; SSE2-NEXT: movd %ecx, %xmm4 +; SSE2-NEXT: pshufd $0, %xmm4, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm5 +; SSE2-NEXT: pandn %xmm1, %xmm4 +; SSE2-NEXT: por %xmm5, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm1 +; SSE2-NEXT: sete %al +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: negl %eax +; SSE2-NEXT: movd %eax, %xmm4 +; SSE2-NEXT: pshufd $0, %xmm4, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm6 +; SSE2-NEXT: pandn %xmm2, %xmm4 +; SSE2-NEXT: por %xmm6, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm2 +; SSE2-NEXT: sete %cl +; SSE2-NEXT: negl %ecx +; SSE2-NEXT: movd %ecx, %xmm4 +; SSE2-NEXT: pshufd $0, %xmm4, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm7 +; SSE2-NEXT: pandn %xmm3, %xmm4 +; SSE2-NEXT: por %xmm7, 
%xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm3 +; SSE2-NEXT: retq +; +; AVX-LABEL: test_ctselect_v8i64: +; AVX: # %bb.0: +; AVX-NEXT: xorl %eax, %eax +; AVX-NEXT: testb $1, %dil +; AVX-NEXT: sete %al +; AVX-NEXT: xorl %ecx, %ecx +; AVX-NEXT: negl %eax +; AVX-NEXT: vmovd %eax, %ymm4 +; AVX-NEXT: vshufpd $0, %ymm4, %ymm4, %ymm4 +; AVX-NEXT: vmovapd %ymm4, %ymm4 +; AVX-NEXT: vandpd %ymm2, %ymm4, %ymm2 +; AVX-NEXT: vandnpd %ymm0, %ymm4, %ymm0 +; AVX-NEXT: vorpd %ymm0, %ymm2, %ymm0 +; AVX-NEXT: vmovapd %ymm0, %ymm0 +; AVX-NEXT: sete %cl +; AVX-NEXT: negl %ecx +; AVX-NEXT: vmovd %ecx, %ymm2 +; AVX-NEXT: vshufpd $0, %ymm2, %ymm2, %ymm2 +; AVX-NEXT: vmovapd %ymm2, %ymm2 +; AVX-NEXT: vandpd %ymm3, %ymm2, %ymm3 +; AVX-NEXT: vandnpd %ymm1, %ymm2, %ymm1 +; AVX-NEXT: vorpd %ymm1, %ymm3, %ymm1 +; AVX-NEXT: vmovapd %ymm1, %ymm1 +; AVX-NEXT: retq +; +; AVX2-LABEL: test_ctselect_v8i64: +; AVX2: # %bb.0: +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: testb $1, %dil +; AVX2-NEXT: sete %al +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: negl %eax +; AVX2-NEXT: vmovd %eax, %ymm4 +; AVX2-NEXT: vshufpd $0, %ymm4, %ymm4, %ymm4 +; AVX2-NEXT: vmovapd %ymm4, %ymm4 +; AVX2-NEXT: vandpd %ymm2, %ymm4, %ymm2 +; AVX2-NEXT: vandnpd %ymm0, %ymm4, %ymm0 +; AVX2-NEXT: vorpd %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vmovapd %ymm0, %ymm0 +; AVX2-NEXT: sete %cl +; AVX2-NEXT: negl %ecx +; AVX2-NEXT: vmovd %ecx, %ymm2 +; AVX2-NEXT: vshufpd $0, %ymm2, %ymm2, %ymm2 +; AVX2-NEXT: vmovapd %ymm2, %ymm2 +; AVX2-NEXT: vandpd %ymm3, %ymm2, %ymm3 +; AVX2-NEXT: vandnpd %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vorpd %ymm1, %ymm3, %ymm1 +; AVX2-NEXT: vmovapd %ymm1, %ymm1 +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_ctselect_v8i64: +; AVX512: # %bb.0: +; AVX512-NEXT: testb %dil, %dil +; AVX512-NEXT: je .LBB10_2 +; AVX512-NEXT: # %bb.1: +; AVX512-NEXT: vmovaps %zmm0, %zmm1 +; AVX512-NEXT: .LBB10_2: +; AVX512-NEXT: vmovaps %zmm1, %zmm0 +; AVX512-NEXT: retq + %result = call <8 x i64> @llvm.ct.select.v8i64(i1 %cond, <8 x i64> %a, <8 x i64> %b) + ret <8 x i64> %result +} + +define <8 x double> @test_ctselect_v8f64(i1 %cond, <8 x double> %a, <8 x double> %b) { +; SSE2-LABEL: test_ctselect_v8f64: +; SSE2: # %bb.0: +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: testb $1, %dil +; SSE2-NEXT: sete %al +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: negl %eax +; SSE2-NEXT: movd %eax, %xmm8 +; SSE2-NEXT: pshufd $0, %xmm8, %xmm8 +; SSE2-NEXT: movdqa %xmm8, %xmm8 +; SSE2-NEXT: pand %xmm8, %xmm4 +; SSE2-NEXT: pandn %xmm0, %xmm8 +; SSE2-NEXT: por %xmm4, %xmm8 +; SSE2-NEXT: movdqa %xmm8, %xmm0 +; SSE2-NEXT: sete %cl +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: negl %ecx +; SSE2-NEXT: movd %ecx, %xmm4 +; SSE2-NEXT: pshufd $0, %xmm4, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm5 +; SSE2-NEXT: pandn %xmm1, %xmm4 +; SSE2-NEXT: por %xmm5, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm1 +; SSE2-NEXT: sete %al +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: negl %eax +; SSE2-NEXT: movd %eax, %xmm4 +; SSE2-NEXT: pshufd $0, %xmm4, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm6 +; SSE2-NEXT: pandn %xmm2, %xmm4 +; SSE2-NEXT: por %xmm6, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm2 +; SSE2-NEXT: sete %cl +; SSE2-NEXT: negl %ecx +; SSE2-NEXT: movd %ecx, %xmm4 +; SSE2-NEXT: pshufd $0, %xmm4, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm7 +; SSE2-NEXT: pandn %xmm3, %xmm4 +; SSE2-NEXT: por %xmm7, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm3 +; SSE2-NEXT: retq +; +; AVX-LABEL: test_ctselect_v8f64: +; AVX: # %bb.0: +; AVX-NEXT: xorl %eax, %eax +; AVX-NEXT: testb 
$1, %dil +; AVX-NEXT: sete %al +; AVX-NEXT: xorl %ecx, %ecx +; AVX-NEXT: negl %eax +; AVX-NEXT: vmovd %eax, %ymm4 +; AVX-NEXT: vshufpd $0, %ymm4, %ymm4, %ymm4 +; AVX-NEXT: vmovapd %ymm4, %ymm4 +; AVX-NEXT: vandpd %ymm2, %ymm4, %ymm2 +; AVX-NEXT: vandnpd %ymm0, %ymm4, %ymm0 +; AVX-NEXT: vorpd %ymm0, %ymm2, %ymm0 +; AVX-NEXT: vmovapd %ymm0, %ymm0 +; AVX-NEXT: sete %cl +; AVX-NEXT: negl %ecx +; AVX-NEXT: vmovd %ecx, %ymm2 +; AVX-NEXT: vshufpd $0, %ymm2, %ymm2, %ymm2 +; AVX-NEXT: vmovapd %ymm2, %ymm2 +; AVX-NEXT: vandpd %ymm3, %ymm2, %ymm3 +; AVX-NEXT: vandnpd %ymm1, %ymm2, %ymm1 +; AVX-NEXT: vorpd %ymm1, %ymm3, %ymm1 +; AVX-NEXT: vmovapd %ymm1, %ymm1 +; AVX-NEXT: retq +; +; AVX2-LABEL: test_ctselect_v8f64: +; AVX2: # %bb.0: +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: testb $1, %dil +; AVX2-NEXT: sete %al +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: negl %eax +; AVX2-NEXT: vmovd %eax, %ymm4 +; AVX2-NEXT: vshufpd $0, %ymm4, %ymm4, %ymm4 +; AVX2-NEXT: vmovapd %ymm4, %ymm4 +; AVX2-NEXT: vandpd %ymm2, %ymm4, %ymm2 +; AVX2-NEXT: vandnpd %ymm0, %ymm4, %ymm0 +; AVX2-NEXT: vorpd %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vmovapd %ymm0, %ymm0 +; AVX2-NEXT: sete %cl +; AVX2-NEXT: negl %ecx +; AVX2-NEXT: vmovd %ecx, %ymm2 +; AVX2-NEXT: vshufpd $0, %ymm2, %ymm2, %ymm2 +; AVX2-NEXT: vmovapd %ymm2, %ymm2 +; AVX2-NEXT: vandpd %ymm3, %ymm2, %ymm3 +; AVX2-NEXT: vandnpd %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vorpd %ymm1, %ymm3, %ymm1 +; AVX2-NEXT: vmovapd %ymm1, %ymm1 +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_ctselect_v8f64: +; AVX512: # %bb.0: +; AVX512-NEXT: testb %dil, %dil +; AVX512-NEXT: je .LBB11_2 +; AVX512-NEXT: # %bb.1: +; AVX512-NEXT: vmovapd %zmm0, %zmm1 +; AVX512-NEXT: .LBB11_2: +; AVX512-NEXT: vmovapd %zmm1, %zmm0 +; AVX512-NEXT: retq + %result = call <8 x double> @llvm.ct.select.v8f64(i1 %cond, <8 x double> %a, <8 x double> %b) + ret <8 x double> %result +} + +; Test with constant conditions for vector types +define <4 x i32> @test_ctselect_v4i32_const_true(<4 x i32> %a, <4 x i32> %b) { +; SSE2-LABEL: test_ctselect_v4i32_const_true: +; SSE2: # %bb.0: +; SSE2-NEXT: movb $1, %al +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: testb %al, %al +; SSE2-NEXT: sete %cl +; SSE2-NEXT: negl %ecx +; SSE2-NEXT: movd %ecx, %xmm2 +; SSE2-NEXT: pshufd $0, %xmm2, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: pandn %xmm0, %xmm2 +; SSE2-NEXT: por %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; AVX-LABEL: test_ctselect_v4i32_const_true: +; AVX: # %bb.0: +; AVX-NEXT: movb $1, %al +; AVX-NEXT: xorl %ecx, %ecx +; AVX-NEXT: testb %al, %al +; AVX-NEXT: sete %cl +; AVX-NEXT: negl %ecx +; AVX-NEXT: movd %ecx, %xmm2 +; AVX-NEXT: pshufd $0, %xmm2, %xmm2 +; AVX-NEXT: movdqa %xmm2, %xmm2 +; AVX-NEXT: pand %xmm2, %xmm1 +; AVX-NEXT: pandn %xmm0, %xmm2 +; AVX-NEXT: por %xmm1, %xmm2 +; AVX-NEXT: movdqa %xmm2, %xmm0 +; AVX-NEXT: retq +; +; AVX2-LABEL: test_ctselect_v4i32_const_true: +; AVX2: # %bb.0: +; AVX2-NEXT: movb $1, %al +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: testb %al, %al +; AVX2-NEXT: sete %cl +; AVX2-NEXT: negl %ecx +; AVX2-NEXT: movd %ecx, %xmm2 +; AVX2-NEXT: pshufd $0, %xmm2, %xmm2 +; AVX2-NEXT: movdqa %xmm2, %xmm2 +; AVX2-NEXT: pand %xmm2, %xmm1 +; AVX2-NEXT: pandn %xmm0, %xmm2 +; AVX2-NEXT: por %xmm1, %xmm2 +; AVX2-NEXT: movdqa %xmm2, %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_ctselect_v4i32_const_true: +; AVX512: # %bb.0: +; AVX512-NEXT: retq + %result = call <4 x i32> @llvm.ct.select.v4i32(i1 true, <4 x i32> %a, <4 x i32> %b) + ret <4 x i32> %result +} + 
+define <4 x i32> @test_ctselect_v4i32_const_false(<4 x i32> %a, <4 x i32> %b) { +; SSE2-LABEL: test_ctselect_v4i32_const_false: +; SSE2: # %bb.0: +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: testb %al, %al +; SSE2-NEXT: sete %cl +; SSE2-NEXT: negl %ecx +; SSE2-NEXT: movd %ecx, %xmm2 +; SSE2-NEXT: pshufd $0, %xmm2, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: pandn %xmm0, %xmm2 +; SSE2-NEXT: por %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; AVX-LABEL: test_ctselect_v4i32_const_false: +; AVX: # %bb.0: +; AVX-NEXT: xorl %eax, %eax +; AVX-NEXT: xorl %ecx, %ecx +; AVX-NEXT: testb %al, %al +; AVX-NEXT: sete %cl +; AVX-NEXT: negl %ecx +; AVX-NEXT: movd %ecx, %xmm2 +; AVX-NEXT: pshufd $0, %xmm2, %xmm2 +; AVX-NEXT: movdqa %xmm2, %xmm2 +; AVX-NEXT: pand %xmm2, %xmm1 +; AVX-NEXT: pandn %xmm0, %xmm2 +; AVX-NEXT: por %xmm1, %xmm2 +; AVX-NEXT: movdqa %xmm2, %xmm0 +; AVX-NEXT: retq +; +; AVX2-LABEL: test_ctselect_v4i32_const_false: +; AVX2: # %bb.0: +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: testb %al, %al +; AVX2-NEXT: sete %cl +; AVX2-NEXT: negl %ecx +; AVX2-NEXT: movd %ecx, %xmm2 +; AVX2-NEXT: pshufd $0, %xmm2, %xmm2 +; AVX2-NEXT: movdqa %xmm2, %xmm2 +; AVX2-NEXT: pand %xmm2, %xmm1 +; AVX2-NEXT: pandn %xmm0, %xmm2 +; AVX2-NEXT: por %xmm1, %xmm2 +; AVX2-NEXT: movdqa %xmm2, %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_ctselect_v4i32_const_false: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovaps %xmm1, %xmm0 +; AVX512-NEXT: retq + %result = call <4 x i32> @llvm.ct.select.v4i32(i1 false, <4 x i32> %a, <4 x i32> %b) + ret <4 x i32> %result +} + +; Test with comparison conditions for vector types +define <4 x i32> @test_ctselect_v4i32_icmp(i32 %x, i32 %y, <4 x i32> %a, <4 x i32> %b) { +; SSE2-LABEL: test_ctselect_v4i32_icmp: +; SSE2: # %bb.0: +; SSE2-NEXT: cmpl %esi, %edi +; SSE2-NEXT: sete %al +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: testb %al, %al +; SSE2-NEXT: sete %cl +; SSE2-NEXT: negl %ecx +; SSE2-NEXT: movd %ecx, %xmm2 +; SSE2-NEXT: pshufd $0, %xmm2, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: pandn %xmm0, %xmm2 +; SSE2-NEXT: por %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; AVX-LABEL: test_ctselect_v4i32_icmp: +; AVX: # %bb.0: +; AVX-NEXT: cmpl %esi, %edi +; AVX-NEXT: sete %al +; AVX-NEXT: xorl %ecx, %ecx +; AVX-NEXT: testb %al, %al +; AVX-NEXT: sete %cl +; AVX-NEXT: negl %ecx +; AVX-NEXT: movd %ecx, %xmm2 +; AVX-NEXT: pshufd $0, %xmm2, %xmm2 +; AVX-NEXT: movdqa %xmm2, %xmm2 +; AVX-NEXT: pand %xmm2, %xmm1 +; AVX-NEXT: pandn %xmm0, %xmm2 +; AVX-NEXT: por %xmm1, %xmm2 +; AVX-NEXT: movdqa %xmm2, %xmm0 +; AVX-NEXT: retq +; +; AVX2-LABEL: test_ctselect_v4i32_icmp: +; AVX2: # %bb.0: +; AVX2-NEXT: cmpl %esi, %edi +; AVX2-NEXT: sete %al +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: testb %al, %al +; AVX2-NEXT: sete %cl +; AVX2-NEXT: negl %ecx +; AVX2-NEXT: movd %ecx, %xmm2 +; AVX2-NEXT: pshufd $0, %xmm2, %xmm2 +; AVX2-NEXT: movdqa %xmm2, %xmm2 +; AVX2-NEXT: pand %xmm2, %xmm1 +; AVX2-NEXT: pandn %xmm0, %xmm2 +; AVX2-NEXT: por %xmm1, %xmm2 +; AVX2-NEXT: movdqa %xmm2, %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_ctselect_v4i32_icmp: +; AVX512: # %bb.0: +; AVX512-NEXT: cmpl %esi, %edi +; AVX512-NEXT: je .LBB14_2 +; AVX512-NEXT: # %bb.1: +; AVX512-NEXT: vmovaps %xmm1, %xmm0 +; AVX512-NEXT: .LBB14_2: +; AVX512-NEXT: retq + %cond = icmp eq i32 %x, %y + %result = call <4 x i32> @llvm.ct.select.v4i32(i1 %cond, <4 x 
i32> %a, <4 x i32> %b) + ret <4 x i32> %result +} + +; Declare the intrinsics +declare <4 x i32> @llvm.ct.select.v4i32(i1, <4 x i32>, <4 x i32>) +declare <4 x float> @llvm.ct.select.v4f32(i1, <4 x float>, <4 x float>) +declare <2 x i64> @llvm.ct.select.v2i64(i1, <2 x i64>, <2 x i64>) +declare <2 x double> @llvm.ct.select.v2f64(i1, <2 x double>, <2 x double>) +declare <8 x i32> @llvm.ct.select.v8i32(i1, <8 x i32>, <8 x i32>) +declare <8 x float> @llvm.ct.select.v8f32(i1, <8 x float>, <8 x float>) +declare <4 x i64> @llvm.ct.select.v4i64(i1, <4 x i64>, <4 x i64>) +declare <4 x double> @llvm.ct.select.v4f64(i1, <4 x double>, <4 x double>) +declare <16 x i32> @llvm.ct.select.v16i32(i1, <16 x i32>, <16 x i32>) +declare <16 x float> @llvm.ct.select.v16f32(i1, <16 x float>, <16 x float>) +declare <8 x i64> @llvm.ct.select.v8i64(i1, <8 x i64>, <8 x i64>) +declare <8 x double> @llvm.ct.select.v8f64(i1, <8 x double>, <8 x double>) diff --git a/llvm/test/CodeGen/X86/ctselect.ll b/llvm/test/CodeGen/X86/ctselect.ll new file mode 100644 index 0000000000000..0f8d3bb78f851 --- /dev/null +++ b/llvm/test/CodeGen/X86/ctselect.ll @@ -0,0 +1,376 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+cmov | FileCheck %s --check-prefix=X64 +; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mattr=+cmov | FileCheck %s --check-prefix=X32 + +; Test basic ct.select functionality for scalar types + +define i8 @test_ctselect_i8(i1 %cond, i8 %a, i8 %b) { +; X64-LABEL: test_ctselect_i8: +; X64: # %bb.0: +; X64-NEXT: movl %edx, %eax +; X64-NEXT: testb $1, %dil +; X64-NEXT: cmovnel %esi, %eax +; X64: retq +; +; X32-LABEL: test_ctselect_i8: +; X32: # %bb.0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax +; X32: retl + %result = call i8 @llvm.ct.select.i8(i1 %cond, i8 %a, i8 %b) + ret i8 %result +} + +define i16 @test_ctselect_i16(i1 %cond, i16 %a, i16 %b) { +; X64-LABEL: test_ctselect_i16: +; X64: # %bb.0: +; X64-NEXT: movl %edx, %eax +; X64-NEXT: testb $1, %dil +; X64-NEXT: cmovnel %esi, %eax +; X64: retq +; +; X32-LABEL: test_ctselect_i16: +; X32: # %bb.0: +; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X32-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NEXT: cmovnew {{[0-9]+}}(%esp), %ax +; X32: retl + %result = call i16 @llvm.ct.select.i16(i1 %cond, i16 %a, i16 %b) + ret i16 %result +} + +define i32 @test_ctselect_i32(i1 %cond, i32 %a, i32 %b) { +; X64-LABEL: test_ctselect_i32: +; X64: # %bb.0: +; X64-NEXT: movl %edx, %eax +; X64-NEXT: testb $1, %dil +; X64-NEXT: cmovnel %esi, %eax +; X64: retq +; +; X32-LABEL: test_ctselect_i32: +; X32: # %bb.0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax +; X32: retl + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) + ret i32 %result +} + +define i64 @test_ctselect_i64(i1 %cond, i64 %a, i64 %b) { +; X64-LABEL: test_ctselect_i64: +; X64: # %bb.0: +; X64-NEXT: movq %rdx, %rax +; X64-NEXT: testb $1, %dil +; X64-NEXT: cmovneq %rsi, %rax +; X64-NEXT: retq +; +; X32-LABEL: test_ctselect_i64: +; X32: # %bb.0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax +; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %edx +; X32-NEXT: retl + %result = call i64 @llvm.ct.select.i64(i1 %cond, i64 %a, i64 %b) + ret i64 %result +} + +define float 
@test_ctselect_f32(i1 %cond, float %a, float %b) { +; X64-LABEL: test_ctselect_f32: +; X64: # %bb.0: +; X64-NEXT: movd %xmm0, %eax +; X64-NEXT: movd %xmm1, %ecx +; X64-NEXT: testb $1, %dil +; X64-NEXT: cmovnel %eax, %ecx +; X64-NEXT: movd %ecx, %xmm0 +; X64-NEXT: retq +; +; X32-LABEL: test_ctselect_f32: +; X32: # %bb.0: +; X32-NEXT: flds {{[0-9]+}}(%esp) +; X32-NEXT: flds {{[0-9]+}}(%esp) +; X32-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NEXT: jne .LBB4_2 +; X32-NEXT: # %bb.1: +; X32-NEXT: fstp %st(1) +; X32-NEXT: fldz +; X32-NEXT: .LBB4_2: +; X32-NEXT: fstp %st(0) +; X32-NEXT: retl + %result = call float @llvm.ct.select.f32(i1 %cond, float %a, float %b) + ret float %result +} + +define double @test_ctselect_f64(i1 %cond, double %a, double %b) { +; X64-LABEL: test_ctselect_f64: +; X64: # %bb.0: +; X64-NEXT: movq %xmm0, %rax +; X64-NEXT: movq %xmm1, %rcx +; X64-NEXT: testb $1, %dil +; X64-NEXT: cmovneq %rax, %rcx +; X64-NEXT: movq %rcx, %xmm0 +; X64-NEXT: retq +; +; X32-LABEL: test_ctselect_f64: +; X32: # %bb.0: +; X32-NEXT: fldl {{[0-9]+}}(%esp) +; X32-NEXT: fldl {{[0-9]+}}(%esp) +; X32-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NEXT: jne .LBB5_2 +; X32-NEXT: # %bb.1: +; X32-NEXT: fstp %st(1) +; X32-NEXT: fldz +; X32-NEXT: .LBB5_2: +; X32-NEXT: fstp %st(0) +; X32-NEXT: retl + %result = call double @llvm.ct.select.f64(i1 %cond, double %a, double %b) + ret double %result +} + +define ptr @test_ctselect_ptr(i1 %cond, ptr %a, ptr %b) { +; X64-LABEL: test_ctselect_ptr: +; X64: # %bb.0: +; X64-NEXT: movq %rdx, %rax +; X64-NEXT: testb $1, %dil +; X64-NEXT: cmovneq %rsi, %rax +; X64-NEXT: retq +; +; X32-LABEL: test_ctselect_ptr: +; X32: # %bb.0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax +; X32-NEXT: retl + %result = call ptr @llvm.ct.select.p0(i1 %cond, ptr %a, ptr %b) + ret ptr %result +} + +; Test with constant conditions +define i32 @test_ctselect_const_true(i32 %a, i32 %b) { +; X64-LABEL: test_ctselect_const_true: +; X64: # %bb.0: +; X64-NEXT: movl %esi, %eax +; X64-NEXT: movb $1, %cl +; X64-NEXT: testb %cl, %cl +; X64-NEXT: cmovnel %edi, %eax +; X64-NEXT: retq +; +; X32-LABEL: test_ctselect_const_true: +; X32: # %bb.0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movb $1, %cl +; X32-NEXT: testb %cl, %cl +; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax +; X32-NEXT: retl + %result = call i32 @llvm.ct.select.i32(i1 true, i32 %a, i32 %b) + ret i32 %result +} + +define i32 @test_ctselect_const_false(i32 %a, i32 %b) { +; X64-LABEL: test_ctselect_const_false: +; X64: # %bb.0: +; X64-NEXT: movl %esi, %eax +; X64-NEXT: xorl %ecx, %ecx +; X64-NEXT: testb %cl, %cl +; X64-NEXT: cmovnel %edi, %eax +; X64-NEXT: retq +; +; X32-LABEL: test_ctselect_const_false: +; X32: # %bb.0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: xorl %ecx, %ecx +; X32-NEXT: testb %cl, %cl +; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax +; X32-NEXT: retl + %result = call i32 @llvm.ct.select.i32(i1 false, i32 %a, i32 %b) + ret i32 %result +} + +; Test with comparison conditions +define i32 @test_ctselect_icmp_eq(i32 %x, i32 %y, i32 %a, i32 %b) { +; X64-LABEL: test_ctselect_icmp_eq: +; X64: # %bb.0: +; X64-NEXT: movl %ecx, %eax +; X64-NEXT: cmpl %esi, %edi +; X64-NEXT: sete %cl +; X64-NEXT: testb %cl, %cl +; X64-NEXT: cmovnel %edx, %eax +; X64-NEXT: retq +; +; X32-LABEL: test_ctselect_icmp_eq: +; X32: # %bb.0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: cmpl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: sete 
%cl +; X32-NEXT: testb %cl, %cl +; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax +; X32-NEXT: retl + %cond = icmp eq i32 %x, %y + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) + ret i32 %result +} + +define i32 @test_ctselect_icmp_ne(i32 %x, i32 %y, i32 %a, i32 %b) { +; X64-LABEL: test_ctselect_icmp_ne: +; X64: # %bb.0: +; X64-NEXT: movl %ecx, %eax +; X64-NEXT: cmpl %esi, %edi +; X64-NEXT: setne %cl +; X64-NEXT: testb %cl, %cl +; X64-NEXT: cmovnel %edx, %eax +; X64-NEXT: retq +; +; X32-LABEL: test_ctselect_icmp_ne: +; X32: # %bb.0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: cmpl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: setne %cl +; X32-NEXT: testb %cl, %cl +; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax +; X32-NEXT: retl + %cond = icmp ne i32 %x, %y + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) + ret i32 %result +} + +define i32 @test_ctselect_icmp_slt(i32 %x, i32 %y, i32 %a, i32 %b) { +; X64-LABEL: test_ctselect_icmp_slt: +; X64: # %bb.0: +; X64-NEXT: movl %ecx, %eax +; X64-NEXT: cmpl %esi, %edi +; X64-NEXT: setl %cl +; X64-NEXT: testb %cl, %cl +; X64-NEXT: cmovnel %edx, %eax +; X64-NEXT: retq +; +; X32-LABEL: test_ctselect_icmp_slt: +; X32: # %bb.0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: cmpl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: setl %cl +; X32-NEXT: testb %cl, %cl +; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax +; X32-NEXT: retl + %cond = icmp slt i32 %x, %y + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) + ret i32 %result +} + +define i32 @test_ctselect_icmp_ult(i32 %x, i32 %y, i32 %a, i32 %b) { +; X64-LABEL: test_ctselect_icmp_ult: +; X64: # %bb.0: +; X64-NEXT: movl %ecx, %eax +; X64-NEXT: cmpl %esi, %edi +; X64-NEXT: setb %cl +; X64-NEXT: testb %cl, %cl +; X64-NEXT: cmovnel %edx, %eax +; X64-NEXT: retq +; +; X32-LABEL: test_ctselect_icmp_ult: +; X32: # %bb.0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: cmpl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: setb %cl +; X32-NEXT: testb %cl, %cl +; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax +; X32-NEXT: retl + %cond = icmp ult i32 %x, %y + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) + ret i32 %result +} + +define float @test_ctselect_fcmp_oeq(float %x, float %y, float %a, float %b) { +; X64-LABEL: test_ctselect_fcmp_oeq: +; X64: # %bb.0: +; X64-NEXT: cmpeqss %xmm1, %xmm0 +; X64-NEXT: andps %xmm0, %xmm2 +; X64-NEXT: andnps %xmm3, %xmm0 +; X64-NEXT: orps %xmm2, %xmm0 +; X64-NEXT: retq +; +; X32-LABEL: test_ctselect_fcmp_oeq: +; X32: # %bb.0: +; X32-NEXT: flds {{[0-9]+}}(%esp) +; X32-NEXT: flds {{[0-9]+}}(%esp) +; X32-NEXT: flds {{[0-9]+}}(%esp) +; X32-NEXT: flds {{[0-9]+}}(%esp) +; X32-NEXT: fucompi %st(1), %st +; X32-NEXT: fstp %st(0) +; X32-NEXT: setnp %al +; X32-NEXT: sete %cl +; X32-NEXT: testb %al, %cl +; X32-NEXT: jne .LBB13_2 +; X32-NEXT: # %bb.1: +; X32-NEXT: fstp %st(1) +; X32-NEXT: fldz +; X32-NEXT: .LBB13_2: +; X32-NEXT: fstp %st(0) +; X32-NEXT: retl + %cond = fcmp oeq float %x, %y + %result = call float @llvm.ct.select.f32(i1 %cond, float %a, float %b) + ret float %result +} + +; Test with memory operands +define i32 @test_ctselect_load(i1 %cond, ptr %p1, ptr %p2) { +; X64-LABEL: test_ctselect_load: +; X64: # %bb.0: +; X64-NEXT: movl (%rdx), %eax +; X64-NEXT: testb $1, %dil +; X64-NEXT: cmovnel (%rsi), %eax +; X64-NEXT: retq +; +; X32-LABEL: test_ctselect_load: +; X32: # %bb.0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; 
X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl (%eax), %eax
+; X32-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT: cmovnel (%ecx), %eax
+; X32-NEXT: retl
+  %a = load i32, ptr %p1
+  %b = load i32, ptr %p2
+  %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b)
+  ret i32 %result
+}
+
+; Test nested ctselect calls
+define i32 @test_ctselect_nested(i1 %cond1, i1 %cond2, i32 %a, i32 %b, i32 %c) {
+; X64-LABEL: test_ctselect_nested:
+; X64: # %bb.0:
+; X64-NEXT: movl %r8d, %eax
+; X64-NEXT: testb $1, %sil
+; X64-NEXT: cmovnel %edx, %ecx
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: cmovnel %ecx, %eax
+; X64-NEXT: retq
+;
+; X32-LABEL: test_ctselect_nested:
+; X32: # %bb.0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT: cmovnel %ecx, %eax
+; X32-NEXT: retl
+  %inner = call i32 @llvm.ct.select.i32(i1 %cond2, i32 %a, i32 %b)
+  %result = call i32 @llvm.ct.select.i32(i1 %cond1, i32 %inner, i32 %c)
+  ret i32 %result
+}
+
+; Declare the intrinsics
+declare i8 @llvm.ct.select.i8(i1, i8, i8)
+declare i16 @llvm.ct.select.i16(i1, i16, i16)
+declare i32 @llvm.ct.select.i32(i1, i32, i32)
+declare i64 @llvm.ct.select.i64(i1, i64, i64)
+declare float @llvm.ct.select.f32(i1, float, float)
+declare double @llvm.ct.select.f64(i1, double, double)
+declare ptr @llvm.ct.select.p0(i1, ptr, ptr)

From f578008cb08ce993cd1a7323f43336d21c4d52b2 Mon Sep 17 00:00:00 2001
From: Francesco Bertolaccini
Date: Tue, 29 Jul 2025 17:26:05 +0200
Subject: [PATCH 02/63] [CT] Expand vector CTSELECT (#26)

---
 llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index c7c7bf28de79e..8be31f6c13490 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -4139,8 +4139,22 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
     Tmp1 = Node->getOperand(0);
     Tmp2 = Node->getOperand(1);
     Tmp3 = Node->getOperand(2);
-    Tmp1 = DAG.getCTSelect(dl, Tmp1.getValueType(), Tmp1, Tmp2, Tmp3);
-    Tmp1->setFlags(Node->getFlags());
+    EVT VT = Tmp2.getValueType();
+    if (VT.isVector()) {
+      SmallVector<SDValue> Elements;
+      unsigned NumElements = VT.getVectorNumElements();
+      EVT ScalarVT = VT.getScalarType();
+      for (unsigned Idx = 0; Idx < NumElements; ++Idx) {
+        SDValue IdxVal = DAG.getConstant(Idx, dl, MVT::i64);
+        SDValue TVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, Tmp2, IdxVal);
+        SDValue FVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, Tmp3, IdxVal);
+        Elements.push_back(DAG.getCTSelect(dl, ScalarVT, Tmp1, TVal, FVal, Node->getFlags()));
+      }
+      Tmp1 = DAG.getBuildVector(VT, dl, Elements);
+    } else {
+      Tmp1 = DAG.getCTSelect(dl, Tmp1.getValueType(), Tmp1, Tmp2, Tmp3);
+      Tmp1->setFlags(Node->getFlags());
+    }
     Results.push_back(Tmp1);
     break;
   }

From 7607d4288ca5e81c4e6ff905341425b8b379df6d Mon Sep 17 00:00:00 2001
From: Francesco Bertolaccini
Date: Tue, 29 Jul 2025 17:26:36 +0200
Subject: [PATCH 03/63] [CT] Fix promotion of (CT)SELECT nodes (#24)

---
 llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index 8be31f6c13490..c1aa1992170ce 100644
---
a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -5515,7 +5515,8 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) {
     Tmp2 = DAG.getNode(ExtOp, dl, NVT, Node->getOperand(1));
     Tmp3 = DAG.getNode(ExtOp, dl, NVT, Node->getOperand(2));
     // Perform the larger operation, then round down.
-    Tmp1 = DAG.getSelect(dl, NVT, Tmp1, Tmp2, Tmp3);
+    Tmp1 = DAG.getNode(Node->getOpcode(), dl, NVT, Tmp1, Tmp2, Tmp3);
+    Tmp1->setFlags(Node->getFlags());
     if (TruncOp != ISD::FP_ROUND)
       Tmp1 = DAG.getNode(TruncOp, dl, Node->getValueType(0), Tmp1);
     else

From 648cfce4f4567e5f55443ffd8cd73ad5932985e0 Mon Sep 17 00:00:00 2001
From: Francesco Bertolaccini
Date: Tue, 5 Aug 2025 15:29:34 +0200
Subject: [PATCH 04/63] [CT] Expand float and integer CTSELECTs (#27)

---
 llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index c1aa1992170ce..3b134220368dc 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -4151,8 +4151,21 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
         Elements.push_back(DAG.getCTSelect(dl, ScalarVT, Tmp1, TVal, FVal, Node->getFlags()));
       }
       Tmp1 = DAG.getBuildVector(VT, dl, Elements);
+    } else if (VT.isFloatingPoint()) {
+      EVT IntegerVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
+      Tmp2 = DAG.getBitcast(IntegerVT, Tmp2);
+      Tmp3 = DAG.getBitcast(IntegerVT, Tmp3);
+      Tmp1 = DAG.getBitcast(VT, DAG.getCTSelect(dl, IntegerVT, Tmp1, Tmp2, Tmp3, Node->getFlags()));
     } else {
-      Tmp1 = DAG.getCTSelect(dl, Tmp1.getValueType(), Tmp1, Tmp2, Tmp3);
+      assert(VT.isInteger());
+      EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
+      SDValue Tmp2Lo, Tmp2Hi;
+      SDValue Tmp3Lo, Tmp3Hi;
+      std::tie(Tmp2Lo, Tmp2Hi) = DAG.SplitScalar(Tmp2, dl, HalfVT, HalfVT);
+      std::tie(Tmp3Lo, Tmp3Hi) = DAG.SplitScalar(Tmp3, dl, HalfVT, HalfVT);
+      SDValue ResLo = DAG.getCTSelect(dl, HalfVT, Tmp1, Tmp2Lo, Tmp3Lo, Node->getFlags());
+      SDValue ResHi = DAG.getCTSelect(dl, HalfVT, Tmp1, Tmp2Hi, Tmp3Hi, Node->getFlags());
+      Tmp1 = DAG.getNode(ISD::BUILD_PAIR, dl, VT, ResLo, ResHi);
       Tmp1->setFlags(Node->getFlags());
     }
     Results.push_back(Tmp1);

From 18323aee4cfc98d77b3bcfc938c3d96eb0eede84 Mon Sep 17 00:00:00 2001
From: wizardengineer
Date: Thu, 7 Aug 2025 11:49:25 -0400
Subject: [PATCH 05/63] Enhance CT Checks for all supported Archs

---
 clang/lib/Basic/Targets/AArch64.cpp       |  1 +
 clang/lib/Basic/Targets/ARM.cpp           |  1 +
 clang/lib/Basic/Targets/X86.cpp           |  1 +
 clang/lib/Sema/SemaChecking.cpp           |  6 ++++++
 llvm/lib/Target/X86/X86.td                | 10 ++++------
 llvm/lib/Target/X86/X86InstrPredicates.td |  1 -
 6 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/clang/lib/Basic/Targets/AArch64.cpp b/clang/lib/Basic/Targets/AArch64.cpp
index 18641a96063cd..17993660ba395 100644
--- a/clang/lib/Basic/Targets/AArch64.cpp
+++ b/clang/lib/Basic/Targets/AArch64.cpp
@@ -872,6 +872,7 @@ bool AArch64TargetInfo::hasFeature(StringRef Feature) const {
       .Case("ssve-fp8fma", HasSSVE_FP8FMA)
       .Case("sme-f8f32", HasSME_F8F32)
       .Case("sme-f8f16", HasSME_F8F16)
+      .Case("ctselect", true)
       .Default(false);
 }

diff --git a/clang/lib/Basic/Targets/ARM.cpp b/clang/lib/Basic/Targets/ARM.cpp
index 3de17d2c829f1..423a7b8749658 100644
--- a/clang/lib/Basic/Targets/ARM.cpp
+++ b/clang/lib/Basic/Targets/ARM.cpp
@@ -664,6 +664,7 @@ bool ARMTargetInfo::hasFeature(StringRef Feature) const {
.Case("hwdiv", HWDiv & HWDivThumb)
       .Case("hwdiv-arm", HWDiv & HWDivARM)
       .Case("mve", hasMVE())
+      .Case("ctselect", true)
       .Default(false);
 }

diff --git a/clang/lib/Basic/Targets/X86.cpp b/clang/lib/Basic/Targets/X86.cpp
index e71f10c4c16fc..45fa0379783fe 100644
--- a/clang/lib/Basic/Targets/X86.cpp
+++ b/clang/lib/Basic/Targets/X86.cpp
@@ -1298,6 +1298,7 @@ bool X86TargetInfo::hasFeature(StringRef Feature) const {
       .Case("cf", HasCF)
       .Case("zu", HasZU)
       .Case("branch-hint", HasBranchHint)
+      .Case("ctselect", true)
       .Default(false);
 }

diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index 12be5426ccd23..219fb2860031e 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -3474,6 +3474,12 @@ Sema::CheckBuiltinFunctionCall(FunctionDecl *FDecl, unsigned BuiltinID,
     break;
 
   case Builtin::BI__builtin_ct_select: {
+    // check to see if the Arch supports it
+    if (!Context.getTargetInfo().hasFeature("ctselect")) {
+      return Diag(TheCall->getBeginLoc(), diag::err_builtin_target_unsupported)
+             << TheCall->getSourceRange();
+    }
+
     if (TheCall->getNumArgs() != 3) {
       // Simple argument count check without complex diagnostics
       if (TheCall->getNumArgs() < 3) {

diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
index 638ecb2491fe0..d306d489a43d2 100644
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -41,9 +41,6 @@ def FeatureNOPL : SubtargetFeature<"nopl", "HasNOPL", "true",
 def FeatureCMOV : SubtargetFeature<"cmov","HasCMOV", "true",
                                    "Enable conditional move instructions">;
 
-def FeatureCtSelect : SubtargetFeature<"ctselect", "HasCtSelect", "true",
-            "Enable feature to implement constant-time select">;
-
 def FeatureCX8 : SubtargetFeature<"cx8", "HasCX8", "true",
                                   "Support CMPXCHG8B instructions">;
 
@@ -831,9 +828,10 @@ include "X86SchedSapphireRapids.td"
 
 def ProcessorFeatures {
   // x86-64 micro-architecture levels: x86-64 and x86-64-v[234]
-  list<SubtargetFeature> X86_64V1Features = [
-    FeatureX87, FeatureCX8, FeatureCMOV, FeatureMMX, FeatureSSE2,
-    FeatureFXSR, FeatureNOPL, FeatureX86_64, FeatureCtSelect,
-  ];
+  list<SubtargetFeature> X86_64V1Features = [FeatureX87, FeatureCX8,
+                                             FeatureCMOV, FeatureMMX,
+                                             FeatureSSE2, FeatureFXSR,
+                                             FeatureNOPL, FeatureX86_64,
+  ];
   list<SubtargetFeature> X86_64V1Tuning = [
     TuningMacroFusion,

diff --git a/llvm/lib/Target/X86/X86InstrPredicates.td b/llvm/lib/Target/X86/X86InstrPredicates.td
index 2ffee14b34bad..c20bb05018b4d 100644
--- a/llvm/lib/Target/X86/X86InstrPredicates.td
+++ b/llvm/lib/Target/X86/X86InstrPredicates.td
@@ -49,7 +49,6 @@ def HasZU : Predicate<"Subtarget->hasZU()">;
 def HasCF : Predicate<"Subtarget->hasCF()">;
 def HasCMOV : Predicate<"Subtarget->canUseCMOV()">;
 def NoCMOV : Predicate<"!Subtarget->canUseCMOV()">;
-def HasCtSelect : Predicate<"Subtarget->hasCtSelect()">;
 def HasNOPL : Predicate<"Subtarget->hasNOPL()">;
 def HasMMX : Predicate<"Subtarget->hasMMX()">;
 def HasSSE1 : Predicate<"Subtarget->hasSSE1()">;

From e4ff2e4ed1df561123b7b76a7ca0cc1662c70410 Mon Sep 17 00:00:00 2001
From: wizardengineer
Date: Sun, 10 Aug 2025 03:02:20 -0400
Subject: [PATCH 06/63] Initial implementation of fallback method

---
 llvm/include/llvm/CodeGen/SelectionDAGNodes.h |  5 ++
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 71 +++++++++++++++++++
 .../SelectionDAG/SelectionDAGBuilder.cpp      | 29 +++++++-
 .../SelectionDAG/SelectionDAGBuilder.h        |  3 +
 llvm/test/CodeGen/RISCV/ctselect-fallback.ll  | 17 +++++
 5 files changed, 123 insertions(+), 2 deletions(-)
 create mode 100644 llvm/test/CodeGen/RISCV/ctselect-fallback.ll

diff --git
a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h index 69713d0d84011..1052f51ead48c 100644 --- a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h +++ b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h @@ -426,6 +426,9 @@ struct SDNodeFlags { NonNeg | NoNaNs | NoInfs | SameSign, FastMathFlags = NoNaNs | NoInfs | NoSignedZeros | AllowReciprocal | AllowContract | ApproximateFuncs | AllowReassociation, + + // Flag for disabling optimization + NoCtSelectOpt = 1 << 15, }; /// Default constructor turns off all optimization flags. @@ -458,6 +461,7 @@ struct SDNodeFlags { void setAllowReassociation(bool b) { setFlag(b); } void setNoFPExcept(bool b) { setFlag(b); } void setUnpredictable(bool b) { setFlag(b); } + void setNoCtSelectOpt(bool b) { setFlag(b); } // These are accessors for each flag. bool hasNoUnsignedWrap() const { return Flags & NoUnsignedWrap; } @@ -475,6 +479,7 @@ struct SDNodeFlags { bool hasAllowReassociation() const { return Flags & AllowReassociation; } bool hasNoFPExcept() const { return Flags & NoFPExcept; } bool hasUnpredictable() const { return Flags & Unpredictable; } + bool hasNoCtSelectOpt() const { return Flags & NoCtSelectOpt; } bool operator==(const SDNodeFlags &Other) const { return Flags == Other.Flags; diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index cac7e813101c8..bd8ae9eb77680 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -4116,6 +4116,9 @@ SDValue DAGCombiner::visitSUB(SDNode *N) { unsigned BitWidth = VT.getScalarSizeInBits(); SDLoc DL(N); + if (N->getFlags().hasNoCtSelectOpt()) + return SDValue(); + if (SDValue V = foldSubCtlzNot(N, DAG)) return V; @@ -7447,6 +7450,9 @@ SDValue DAGCombiner::visitAND(SDNode *N) { EVT VT = N1.getValueType(); SDLoc DL(N); + if (N->getFlags().hasNoCtSelectOpt()) + return SDValue(); + // x & x --> x if (N0 == N1) return N0; @@ -8375,12 +8381,74 @@ static SDValue visitORCommutative(SelectionDAG &DAG, SDValue N0, SDValue N1, return SDValue(); } +static inline bool hasCtSelectProtection(const SDValue &V) { + if (SDNode *N = V.getNode()) { + if (N->getFlags().hasNoCtSelectOpt()) + return true; + + // if V is (~X) expressed as (xor x, -1) also check X + if (V.getOpcode() == ISD::XOR) { + if (isAllOnesConstant(V.getOperand(1)) && V.getOperand(0).getNode() && + V.getOperand(0)->getFlags().hasNoCtSelectOpt()) + return true; + if (isAllOnesConstant(V.getOperand(0)) && V.getOperand(1).getNode() && + V.getOperand(1)->getFlags().hasNoCtSelectOpt()) + return true; + } + } + return false; +} + +static inline bool isComplementPair(const SDValue &M, const SDValue &NM) { + // NM == (xor M, -1) + if (NM.getOpcode() == ISD::XOR && isAllOnesConstant(NM.getOperand(0)) && + NM.getOperand(1) == M) + return true; + if (NM.getOpcode() == ISD::XOR && isAllOnesConstant(NM.getOperand(1)) && + NM.getOperand(0) == M) + return true; + + // M == (xor NM, -1) + if (M.getOpcode() == ISD::XOR && isAllOnesConstant(M.getOperand(0)) && + M.getOperand(1) == NM) + return true; + if (M.getOpcode() == ISD::XOR && isAllOnesConstant(M.getOperand(1)) && + M.getOperand(0) == NM) + return true; + + return false; +} + SDValue DAGCombiner::visitOR(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N1.getValueType(); SDLoc DL(N); + if (N->getFlags().hasNoCtSelectOpt()) + return SDValue(); + + // recognize (and x, m) | (and y, ~m) -> select m, x, y + if (N0.getOpcode() == ISD::AND && 
N1.getOpcode() == ISD::AND) { + SDNode *AND0 = N0.getNode(); + SDNode *AND1 = N1.getNode(); + + SDValue X = AND0->getOperand(0); + SDValue M = AND0->getOperand(1); + SDValue Y = AND1->getOperand(0); + SDValue NM = AND1->getOperand(1); + // The fold fires only if M and NM are complements. + if (isComplementPair(M, NM)) { + // bail if any participant is protected + if (hasCtSelectProtection(N0) || hasCtSelectProtection(N1) || + hasCtSelectProtection(M) || hasCtSelectProtection(NM) || + hasCtSelectProtection(X) || hasCtSelectProtection(Y)) { + // Do not fold to SELECT; keep the masked arithmetic as-is. + return SDValue(); + } + } + } + // x | x --> x if (N0 == N1) return N0; @@ -9926,6 +9994,9 @@ SDValue DAGCombiner::visitXOR(SDNode *N) { EVT VT = N0.getValueType(); SDLoc DL(N); + if (N->getFlags().hasNoCtSelectOpt()) + return SDValue(); + // fold (xor undef, undef) -> 0. This is a common idiom (misuse). if (N0.isUndef() && N1.isUndef()) return DAG.getConstant(0, DL, VT); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 316fa19d0b37c..b5f04d9cb9f48 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -6489,6 +6489,32 @@ void SelectionDAGBuilder::visitVectorExtractLastActive(const CallInst &I, setValue(&I, Result); } +SDValue SelectionDAGBuilder::createProtectedCtSelectFallback( + SelectionDAG &DAG, const SDLoc &DL, SDValue Cond, SDValue T, SDValue F, + EVT VT) { + SDNodeFlags ProtectedFlag; + ProtectedFlag.setNoCtSelectOpt(true); + + // Extend cond to VT and normalize to 0 or 1 + if (Cond.getValueType() != VT) + Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond, ProtectedFlag); + + SDValue One = DAG.getConstant(1, DL, VT); + SDValue Norm = DAG.getNode(ISD::AND, DL, VT, Cond, One, ProtectedFlag); + + // Mask = 0 - Norm + SDValue Zero = DAG.getConstant(0, DL, VT); + SDValue Mask = DAG.getNode(ISD::SUB, DL, VT, Zero, Norm, ProtectedFlag); + + SDValue AllOnes = DAG.getAllOnesConstant(DL, VT); + SDValue Invert = DAG.getNode(ISD::XOR, DL, VT, Mask, AllOnes, ProtectedFlag); + + // (T & Mask) | (F & ~Mask) + SDValue TM = DAG.getNode(ISD::AND, DL, VT, Mask, T, ProtectedFlag); + SDValue FM = DAG.getNode(ISD::AND, DL, VT, Invert, F, ProtectedFlag); + return DAG.getNode(ISD::OR, DL, VT, TM, FM, ProtectedFlag); +} + /// Lower the call to the specified intrinsic function. 
void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
                                              unsigned Intrinsic) {
@@ -6692,8 +6718,7 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
       return;
     }
 
-    SDValue Result = DAG.getNode(ISD::SELECT, DL, VT, Cond, A, B);
-    setValue(&I, Result);
+    setValue(&I, createProtectedCtSelectFallback(DAG, DL, Cond, A, B, VT));
     return;
   }
   case Intrinsic::call_preallocated_setup: {

diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
index c7577fa335feb..6a307a33c6271 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
@@ -214,6 +214,9 @@ class SelectionDAGBuilder {
   peelDominantCaseCluster(const SwitchInst &SI,
                           SwitchCG::CaseClusterVector &Clusters,
                           BranchProbability &PeeledCaseProb);
+  SDValue createProtectedCtSelectFallback(SelectionDAG &DAG, const SDLoc &DL,
+                                          SDValue Cond, SDValue T, SDValue F,
+                                          EVT VT);
 
 private:
   const TargetMachine &TM;

diff --git a/llvm/test/CodeGen/RISCV/ctselect-fallback.ll b/llvm/test/CodeGen/RISCV/ctselect-fallback.ll
new file mode 100644
index 0000000000000..f07d78bc84b5f
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/ctselect-fallback.ll
@@ -0,0 +1,17 @@
+; RUN: llc < %s -mtriple=riscv64 -O3 | FileCheck %s --check-prefix=RV64
+
+declare i32 @llvm.ct.select.i32(i1, i32, i32)
+
+define i32 @test_ctselect_i32(i1 %cond, i32 %a, i32 %b) {
+; RV64-LABEL: test_ctselect_i32:
+; RV64: # %bb.0:
+; RV64-NEXT: andi
+; RV64-NEXT: addi
+; RV64-NEXT: neg
+; RV64-NEXT: and
+; RV64-NEXT: and
+; RV64-NEXT: or
+; RV64-NEXT: ret
+  %r = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b)
+  ret i32 %r
+}

From ceaf0e3c7e3c2ba69c47449bb604193b1e042f1a Mon Sep 17 00:00:00 2001
From: wizardengineer
Date: Sun, 10 Aug 2025 17:31:47 -0400
Subject: [PATCH 07/63] Added tests for RISC-V, MIPS and WebAssembly

---
 clang/lib/Sema/SemaChecking.cpp               |   6 -
 llvm/include/llvm/CodeGen/SelectionDAGNodes.h |   6 +-
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 100 +--
 .../SelectionDAG/SelectionDAGBuilder.cpp      |   4 +-
 .../Mips/ctselect-fallback-edge-cases.ll      | 267 +++++++
 .../Mips/ctselect-fallback-patterns.ll        | 440 ++++++++++++
 llvm/test/CodeGen/Mips/ctselect-fallback.ll   | 384 ++++++++++
 .../RISCV/ctselect-fallback-edge-cases.ll     | 236 ++++++
 .../RISCV/ctselect-fallback-patterns.ll       | 403 +++++++++++
 llvm/test/CodeGen/RISCV/ctselect-fallback.ll  | 351 ++++++++-
 .../ctselect-fallback-edge-cases.ll           | 415 +++++++++++
 .../WebAssembly/ctselect-fallback-patterns.ll | 673 ++++++++++++++++++
 .../CodeGen/WebAssembly/ctselect-fallback.ll  | 576 +++++++++++++++
 13 files changed, 3773 insertions(+), 88 deletions(-)
 create mode 100644 llvm/test/CodeGen/Mips/ctselect-fallback-edge-cases.ll
 create mode 100644 llvm/test/CodeGen/Mips/ctselect-fallback-patterns.ll
 create mode 100644 llvm/test/CodeGen/Mips/ctselect-fallback.ll
 create mode 100644 llvm/test/CodeGen/RISCV/ctselect-fallback-edge-cases.ll
 create mode 100644 llvm/test/CodeGen/RISCV/ctselect-fallback-patterns.ll
 create mode 100644 llvm/test/CodeGen/WebAssembly/ctselect-fallback-edge-cases.ll
 create mode 100644 llvm/test/CodeGen/WebAssembly/ctselect-fallback-patterns.ll
 create mode 100644 llvm/test/CodeGen/WebAssembly/ctselect-fallback.ll

diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index 219fb2860031e..12be5426ccd23 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -3474,12 +3474,6 @@
break; case Builtin::BI__builtin_ct_select: { - // check to see if the Arch supports it - if (!Context.getTargetInfo().hasFeature("ctselect")) { - return Diag(TheCall->getBeginLoc(), diag::err_builtin_target_unsupported) - << TheCall->getSourceRange(); - } - if (TheCall->getNumArgs() != 3) { // Simple argument count check without complex diagnostics if (TheCall->getNumArgs() < 3) { diff --git a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h index 1052f51ead48c..62e917666e531 100644 --- a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h +++ b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h @@ -428,7 +428,7 @@ struct SDNodeFlags { AllowContract | ApproximateFuncs | AllowReassociation, // Flag for disabling optimization - NoCtSelectOpt = 1 << 15, + NoMerge = 1 << 15, }; /// Default constructor turns off all optimization flags. @@ -461,7 +461,7 @@ struct SDNodeFlags { void setAllowReassociation(bool b) { setFlag(b); } void setNoFPExcept(bool b) { setFlag(b); } void setUnpredictable(bool b) { setFlag(b); } - void setNoCtSelectOpt(bool b) { setFlag(b); } + void setNoMerge(bool b) { setFlag(b); } // These are accessors for each flag. bool hasNoUnsignedWrap() const { return Flags & NoUnsignedWrap; } @@ -479,7 +479,7 @@ struct SDNodeFlags { bool hasAllowReassociation() const { return Flags & AllowReassociation; } bool hasNoFPExcept() const { return Flags & NoFPExcept; } bool hasUnpredictable() const { return Flags & Unpredictable; } - bool hasNoCtSelectOpt() const { return Flags & NoCtSelectOpt; } + bool hasNoMerge() const { return Flags & NoMerge; } bool operator==(const SDNodeFlags &Other) const { return Flags == Other.Flags; diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index bd8ae9eb77680..a5d30daac1d14 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -1898,7 +1898,39 @@ void DAGCombiner::Run(CombineLevel AtLevel) { DAG.RemoveDeadNodes(); } +static inline bool hasNoMergeProtection(const SDValue &V) { + if (SDNode *N = V.getNode()) { + if (N->getFlags().hasNoMerge()) + return true; + + // if V is (~X) expressed as (xor X, -1) also check X + if (V.getOpcode() == ISD::XOR) { + if (isAllOnesConstant(V.getOperand(1)) && V.getOperand(0).getNode() && + V.getOperand(0)->getFlags().hasNoMerge()) + return true; + if (isAllOnesConstant(V.getOperand(0)) && V.getOperand(1).getNode() && + V.getOperand(1)->getFlags().hasNoMerge()) + return true; + } + } + return false; +} + +static inline bool touchesNoMerge(SDNode *N) { + if (N->getFlags().hasNoMerge()) + return true; + + for (const SDUse &U : N->ops()) { + if (hasNoMergeProtection(U.get())) + return true; + } + return false; +} + SDValue DAGCombiner::visit(SDNode *N) { + if (touchesNoMerge(N)) + return SDValue(); + // clang-format off switch (N->getOpcode()) { default: break; @@ -7450,9 +7482,6 @@ SDValue DAGCombiner::visitAND(SDNode *N) { EVT VT = N1.getValueType(); SDLoc DL(N); - if (N->getFlags().hasNoCtSelectOpt()) - return SDValue(); - // x & x --> x if (N0 == N1) return N0; @@ -8381,74 +8410,12 @@ static SDValue visitORCommutative(SelectionDAG &DAG, SDValue N0, SDValue N1, return SDValue(); } -static inline bool hasCtSelectProtection(const SDValue &V) { - if (SDNode *N = V.getNode()) { - if (N->getFlags().hasNoCtSelectOpt()) - return true; - - // if V is (~X) expressed as (xor x, -1) also check X - if (V.getOpcode() == ISD::XOR) { - if (isAllOnesConstant(V.getOperand(1)) && 
V.getOperand(0).getNode() && - V.getOperand(0)->getFlags().hasNoCtSelectOpt()) - return true; - if (isAllOnesConstant(V.getOperand(0)) && V.getOperand(1).getNode() && - V.getOperand(1)->getFlags().hasNoCtSelectOpt()) - return true; - } - } - return false; -} - -static inline bool isComplementPair(const SDValue &M, const SDValue &NM) { - // NM == (xor M, -1) - if (NM.getOpcode() == ISD::XOR && isAllOnesConstant(NM.getOperand(0)) && - NM.getOperand(1) == M) - return true; - if (NM.getOpcode() == ISD::XOR && isAllOnesConstant(NM.getOperand(1)) && - NM.getOperand(0) == M) - return true; - - // M == (xor NM, -1) - if (M.getOpcode() == ISD::XOR && isAllOnesConstant(M.getOperand(0)) && - M.getOperand(1) == NM) - return true; - if (M.getOpcode() == ISD::XOR && isAllOnesConstant(M.getOperand(1)) && - M.getOperand(0) == NM) - return true; - - return false; -} - SDValue DAGCombiner::visitOR(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N1.getValueType(); SDLoc DL(N); - if (N->getFlags().hasNoCtSelectOpt()) - return SDValue(); - - // recognize (and x, m) | (and y, ~m) -> select m, x, y - if (N0.getOpcode() == ISD::AND && N1.getOpcode() == ISD::AND) { - SDNode *AND0 = N0.getNode(); - SDNode *AND1 = N1.getNode(); - - SDValue X = AND0->getOperand(0); - SDValue M = AND0->getOperand(1); - SDValue Y = AND1->getOperand(0); - SDValue NM = AND1->getOperand(1); - // The fold fires only if M and NM are complements. - if (isComplementPair(M, NM)) { - // bail if any participant is protected - if (hasCtSelectProtection(N0) || hasCtSelectProtection(N1) || - hasCtSelectProtection(M) || hasCtSelectProtection(NM) || - hasCtSelectProtection(X) || hasCtSelectProtection(Y)) { - // Do not fold to SELECT; keep the masked arithmetic as-is. - return SDValue(); - } - } - } - // x | x --> x if (N0 == N1) return N0; @@ -9994,9 +9961,6 @@ SDValue DAGCombiner::visitXOR(SDNode *N) { EVT VT = N0.getValueType(); SDLoc DL(N); - if (N->getFlags().hasNoCtSelectOpt()) - return SDValue(); - // fold (xor undef, undef) -> 0. This is a common idiom (misuse). 
if (N0.isUndef() && N1.isUndef()) return DAG.getConstant(0, DL, VT); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index b5f04d9cb9f48..ea1b1b8634bef 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -6493,7 +6493,7 @@ SDValue SelectionDAGBuilder::createProtectedCtSelectFallback( SelectionDAG &DAG, const SDLoc &DL, SDValue Cond, SDValue T, SDValue F, EVT VT) { SDNodeFlags ProtectedFlag; - ProtectedFlag.setNoCtSelectOpt(true); + ProtectedFlag.setNoMerge(true); // Extend cond to VT and normalize to 0 or 1 if (Cond.getValueType() != VT) @@ -6509,7 +6509,7 @@ SDValue SelectionDAGBuilder::createProtectedCtSelectFallback( SDValue AllOnes = DAG.getAllOnesConstant(DL, VT); SDValue Invert = DAG.getNode(ISD::XOR, DL, VT, Mask, AllOnes, ProtectedFlag); - // (T & Mask) | (F & ~Mask) + // (or (and T, Mask), (and F, ~Mask)) SDValue TM = DAG.getNode(ISD::AND, DL, VT, Mask, T, ProtectedFlag); SDValue FM = DAG.getNode(ISD::AND, DL, VT, Invert, F, ProtectedFlag); return DAG.getNode(ISD::OR, DL, VT, TM, FM, ProtectedFlag); diff --git a/llvm/test/CodeGen/Mips/ctselect-fallback-edge-cases.ll b/llvm/test/CodeGen/Mips/ctselect-fallback-edge-cases.ll new file mode 100644 index 0000000000000..c66e0a41644ff --- /dev/null +++ b/llvm/test/CodeGen/Mips/ctselect-fallback-edge-cases.ll @@ -0,0 +1,267 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=mipsel-unknown-linux-gnu -O3 | FileCheck %s --check-prefix=M32 +; RUN: llc < %s -mtriple=mips64el-unknown-linux-gnu -O3 | FileCheck %s --check-prefix=M64 + +; Portable edge case tests + +; Test with small integer types +define i1 @test_ctselect_i1(i1 %cond, i1 %a, i1 %b) { +; M32-LABEL: test_ctselect_i1: +; M32: # %bb.0: +; M32-NEXT: negu $1, $4 +; M32-NEXT: and $2, $1, $5 +; M32-NEXT: xori $1, $1, 1 +; M32-NEXT: and $1, $1, $6 +; M32-NEXT: jr $ra +; M32-NEXT: or $2, $2, $1 +; +; M64-LABEL: test_ctselect_i1: +; M64: # %bb.0: +; M64-NEXT: sll $1, $4, 0 +; M64-NEXT: sll $2, $5, 0 +; M64-NEXT: sll $3, $6, 0 +; M64-NEXT: negu $1, $1 +; M64-NEXT: and $2, $1, $2 +; M64-NEXT: xori $1, $1, 1 +; M64-NEXT: and $1, $1, $3 +; M64-NEXT: jr $ra +; M64-NEXT: or $2, $2, $1 + %result = call i1 @llvm.ct.select.i1(i1 %cond, i1 %a, i1 %b) + ret i1 %result +} + +; Test with extremal values +define i32 @test_ctselect_extremal_values(i1 %cond) { +; M32-LABEL: test_ctselect_extremal_values: +; M32: # %bb.0: +; M32-NEXT: andi $1, $4, 1 +; M32-NEXT: lui $2, 32767 +; M32-NEXT: lui $3, 32768 +; M32-NEXT: andi $1, $1, 1 +; M32-NEXT: ori $2, $2, 65535 +; M32-NEXT: negu $1, $1 +; M32-NEXT: and $2, $1, $2 +; M32-NEXT: not $1, $1 +; M32-NEXT: and $1, $1, $3 +; M32-NEXT: jr $ra +; M32-NEXT: or $2, $2, $1 +; +; M64-LABEL: test_ctselect_extremal_values: +; M64: # %bb.0: +; M64-NEXT: sll $1, $4, 0 +; M64-NEXT: lui $2, 32767 +; M64-NEXT: lui $3, 32768 +; M64-NEXT: andi $1, $1, 1 +; M64-NEXT: ori $2, $2, 65535 +; M64-NEXT: andi $1, $1, 1 +; M64-NEXT: negu $1, $1 +; M64-NEXT: and $2, $1, $2 +; M64-NEXT: not $1, $1 +; M64-NEXT: and $1, $1, $3 +; M64-NEXT: jr $ra +; M64-NEXT: or $2, $2, $1 + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 2147483647, i32 -2147483648) + ret i32 %result +} + +; Test with null pointers +define ptr @test_ctselect_null_ptr(i1 %cond, ptr %ptr) { +; M32-LABEL: test_ctselect_null_ptr: +; M32: # %bb.0: +; M32-NEXT: andi $1, $4, 1 +; M32-NEXT: andi $1, 
$1, 1 +; M32-NEXT: negu $1, $1 +; M32-NEXT: jr $ra +; M32-NEXT: and $2, $1, $5 +; +; M64-LABEL: test_ctselect_null_ptr: +; M64: # %bb.0: +; M64-NEXT: andi $1, $4, 1 +; M64-NEXT: andi $1, $1, 1 +; M64-NEXT: dnegu $1, $1 +; M64-NEXT: jr $ra +; M64-NEXT: and $2, $1, $5 + %result = call ptr @llvm.ct.select.p0(i1 %cond, ptr %ptr, ptr null) + ret ptr %result +} + +; Test with function pointers +define ptr @test_ctselect_function_ptr(i1 %cond, ptr %func1, ptr %func2) { +; M32-LABEL: test_ctselect_function_ptr: +; M32: # %bb.0: +; M32-NEXT: andi $1, $4, 1 +; M32-NEXT: andi $1, $1, 1 +; M32-NEXT: negu $1, $1 +; M32-NEXT: and $2, $1, $5 +; M32-NEXT: not $1, $1 +; M32-NEXT: and $1, $1, $6 +; M32-NEXT: jr $ra +; M32-NEXT: or $2, $2, $1 +; +; M64-LABEL: test_ctselect_function_ptr: +; M64: # %bb.0: +; M64-NEXT: andi $1, $4, 1 +; M64-NEXT: daddiu $3, $zero, -1 +; M64-NEXT: andi $1, $1, 1 +; M64-NEXT: dnegu $1, $1 +; M64-NEXT: and $2, $1, $5 +; M64-NEXT: xor $1, $1, $3 +; M64-NEXT: and $1, $1, $6 +; M64-NEXT: jr $ra +; M64-NEXT: or $2, $2, $1 + %result = call ptr @llvm.ct.select.p0(i1 %cond, ptr %func1, ptr %func2) + ret ptr %result +} + +; Test with condition from icmp on pointers +define ptr @test_ctselect_ptr_cmp(ptr %p1, ptr %p2, ptr %a, ptr %b) { +; M32-LABEL: test_ctselect_ptr_cmp: +; M32: # %bb.0: +; M32-NEXT: xor $1, $4, $5 +; M32-NEXT: sltiu $1, $1, 1 +; M32-NEXT: negu $1, $1 +; M32-NEXT: and $2, $1, $6 +; M32-NEXT: not $1, $1 +; M32-NEXT: and $1, $1, $7 +; M32-NEXT: jr $ra +; M32-NEXT: or $2, $2, $1 +; +; M64-LABEL: test_ctselect_ptr_cmp: +; M64: # %bb.0: +; M64-NEXT: xor $1, $4, $5 +; M64-NEXT: daddiu $2, $zero, -1 +; M64-NEXT: sltiu $1, $1, 1 +; M64-NEXT: dsll $1, $1, 32 +; M64-NEXT: dsrl $1, $1, 32 +; M64-NEXT: andi $1, $1, 1 +; M64-NEXT: dnegu $1, $1 +; M64-NEXT: and $3, $1, $6 +; M64-NEXT: xor $1, $1, $2 +; M64-NEXT: and $1, $1, $7 +; M64-NEXT: jr $ra +; M64-NEXT: or $2, $3, $1 + %cmp = icmp eq ptr %p1, %p2 + %result = call ptr @llvm.ct.select.p0(i1 %cmp, ptr %a, ptr %b) + ret ptr %result +} + +; Test with struct pointer types +%struct.pair = type { i32, i32 } + +define ptr @test_ctselect_struct_ptr(i1 %cond, ptr %a, ptr %b) { +; M32-LABEL: test_ctselect_struct_ptr: +; M32: # %bb.0: +; M32-NEXT: andi $1, $4, 1 +; M32-NEXT: andi $1, $1, 1 +; M32-NEXT: negu $1, $1 +; M32-NEXT: and $2, $1, $5 +; M32-NEXT: not $1, $1 +; M32-NEXT: and $1, $1, $6 +; M32-NEXT: jr $ra +; M32-NEXT: or $2, $2, $1 +; +; M64-LABEL: test_ctselect_struct_ptr: +; M64: # %bb.0: +; M64-NEXT: andi $1, $4, 1 +; M64-NEXT: daddiu $3, $zero, -1 +; M64-NEXT: andi $1, $1, 1 +; M64-NEXT: dnegu $1, $1 +; M64-NEXT: and $2, $1, $5 +; M64-NEXT: xor $1, $1, $3 +; M64-NEXT: and $1, $1, $6 +; M64-NEXT: jr $ra +; M64-NEXT: or $2, $2, $1 + %result = call ptr @llvm.ct.select.p0(i1 %cond, ptr %a, ptr %b) + ret ptr %result +} + +; Test with deeply nested conditions +define i32 @test_ctselect_deeply_nested(i1 %c1, i1 %c2, i1 %c3, i1 %c4, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e) { +; M32-LABEL: test_ctselect_deeply_nested: +; M32: # %bb.0: +; M32-NEXT: andi $1, $4, 1 +; M32-NEXT: lw $2, 16($sp) +; M32-NEXT: lw $3, 20($sp) +; M32-NEXT: andi $4, $6, 1 +; M32-NEXT: lw $6, 32($sp) +; M32-NEXT: andi $1, $1, 1 +; M32-NEXT: andi $4, $4, 1 +; M32-NEXT: negu $1, $1 +; M32-NEXT: negu $4, $4 +; M32-NEXT: and $2, $1, $2 +; M32-NEXT: not $1, $1 +; M32-NEXT: and $1, $1, $3 +; M32-NEXT: andi $3, $7, 1 +; M32-NEXT: lw $7, 24($sp) +; M32-NEXT: andi $3, $3, 1 +; M32-NEXT: or $1, $2, $1 +; M32-NEXT: negu $2, $3 +; M32-NEXT: andi $3, $5, 1 +; M32-NEXT: andi $3, $3, 
1 +; M32-NEXT: not $5, $2 +; M32-NEXT: negu $3, $3 +; M32-NEXT: and $1, $3, $1 +; M32-NEXT: not $3, $3 +; M32-NEXT: and $3, $3, $7 +; M32-NEXT: or $1, $1, $3 +; M32-NEXT: not $3, $4 +; M32-NEXT: and $1, $4, $1 +; M32-NEXT: lw $4, 28($sp) +; M32-NEXT: and $3, $3, $4 +; M32-NEXT: or $1, $1, $3 +; M32-NEXT: and $1, $2, $1 +; M32-NEXT: and $2, $5, $6 +; M32-NEXT: jr $ra +; M32-NEXT: or $2, $1, $2 +; +; M64-LABEL: test_ctselect_deeply_nested: +; M64: # %bb.0: +; M64-NEXT: sll $2, $4, 0 +; M64-NEXT: sll $4, $5, 0 +; M64-NEXT: sll $3, $8, 0 +; M64-NEXT: sll $5, $9, 0 +; M64-NEXT: sll $6, $6, 0 +; M64-NEXT: sll $1, $7, 0 +; M64-NEXT: lw $8, 0($sp) +; M64-NEXT: andi $2, $2, 1 +; M64-NEXT: andi $4, $4, 1 +; M64-NEXT: andi $6, $6, 1 +; M64-NEXT: andi $1, $1, 1 +; M64-NEXT: andi $2, $2, 1 +; M64-NEXT: andi $4, $4, 1 +; M64-NEXT: andi $6, $6, 1 +; M64-NEXT: andi $1, $1, 1 +; M64-NEXT: negu $2, $2 +; M64-NEXT: negu $4, $4 +; M64-NEXT: negu $6, $6 +; M64-NEXT: negu $1, $1 +; M64-NEXT: and $3, $2, $3 +; M64-NEXT: not $2, $2 +; M64-NEXT: not $7, $1 +; M64-NEXT: and $2, $2, $5 +; M64-NEXT: or $2, $3, $2 +; M64-NEXT: not $3, $4 +; M64-NEXT: and $2, $4, $2 +; M64-NEXT: sll $4, $10, 0 +; M64-NEXT: and $3, $3, $4 +; M64-NEXT: sll $4, $11, 0 +; M64-NEXT: or $2, $2, $3 +; M64-NEXT: not $3, $6 +; M64-NEXT: and $2, $6, $2 +; M64-NEXT: and $3, $3, $4 +; M64-NEXT: or $2, $2, $3 +; M64-NEXT: and $1, $1, $2 +; M64-NEXT: and $2, $7, $8 +; M64-NEXT: jr $ra +; M64-NEXT: or $2, $1, $2 + %sel1 = call i32 @llvm.ct.select.i32(i1 %c1, i32 %a, i32 %b) + %sel2 = call i32 @llvm.ct.select.i32(i1 %c2, i32 %sel1, i32 %c) + %sel3 = call i32 @llvm.ct.select.i32(i1 %c3, i32 %sel2, i32 %d) + %sel4 = call i32 @llvm.ct.select.i32(i1 %c4, i32 %sel3, i32 %e) + ret i32 %sel4 +} + +; Declare the intrinsics +declare i1 @llvm.ct.select.i1(i1, i1, i1) +declare i32 @llvm.ct.select.i32(i1, i32, i32) +declare ptr @llvm.ct.select.p0(i1, ptr, ptr) diff --git a/llvm/test/CodeGen/Mips/ctselect-fallback-patterns.ll b/llvm/test/CodeGen/Mips/ctselect-fallback-patterns.ll new file mode 100644 index 0000000000000..46c74b1d3db36 --- /dev/null +++ b/llvm/test/CodeGen/Mips/ctselect-fallback-patterns.ll @@ -0,0 +1,440 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=mipsel-unknown-linux-gnu -O3 | FileCheck %s --check-prefix=M32 +; RUN: llc < %s -mtriple=mips64el-unknown-linux-gnu -O3 | FileCheck %s --check-prefix=M64 + +; Test smin(x, 0) pattern +define i32 @test_ctselect_smin_zero(i32 %x) { +; M32-LABEL: test_ctselect_smin_zero: +; M32: # %bb.0: +; M32-NEXT: slti $1, $4, 0 +; M32-NEXT: negu $1, $1 +; M32-NEXT: jr $ra +; M32-NEXT: and $2, $1, $4 +; +; M64-LABEL: test_ctselect_smin_zero: +; M64: # %bb.0: +; M64-NEXT: sll $1, $4, 0 +; M64-NEXT: slti $2, $1, 0 +; M64-NEXT: negu $2, $2 +; M64-NEXT: jr $ra +; M64-NEXT: and $2, $2, $1 + %cmp = icmp slt i32 %x, 0 + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 0) + ret i32 %result +} + +; Test smax(x, 0) pattern +define i32 @test_ctselect_smax_zero(i32 %x) { +; M32-LABEL: test_ctselect_smax_zero: +; M32: # %bb.0: +; M32-NEXT: slt $1, $zero, $4 +; M32-NEXT: negu $1, $1 +; M32-NEXT: jr $ra +; M32-NEXT: and $2, $1, $4 +; +; M64-LABEL: test_ctselect_smax_zero: +; M64: # %bb.0: +; M64-NEXT: sll $1, $4, 0 +; M64-NEXT: slt $2, $zero, $1 +; M64-NEXT: negu $2, $2 +; M64-NEXT: jr $ra +; M64-NEXT: and $2, $2, $1 + %cmp = icmp sgt i32 %x, 0 + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 0) + ret i32 %result +} + +; Test 
generic smin pattern +define i32 @test_ctselect_smin_generic(i32 %x, i32 %y) { +; M32-LABEL: test_ctselect_smin_generic: +; M32: # %bb.0: +; M32-NEXT: slt $1, $4, $5 +; M32-NEXT: negu $1, $1 +; M32-NEXT: and $2, $1, $4 +; M32-NEXT: not $1, $1 +; M32-NEXT: and $1, $1, $5 +; M32-NEXT: jr $ra +; M32-NEXT: or $2, $2, $1 +; +; M64-LABEL: test_ctselect_smin_generic: +; M64: # %bb.0: +; M64-NEXT: sll $1, $5, 0 +; M64-NEXT: sll $2, $4, 0 +; M64-NEXT: slt $3, $2, $1 +; M64-NEXT: negu $3, $3 +; M64-NEXT: and $2, $3, $2 +; M64-NEXT: not $3, $3 +; M64-NEXT: and $1, $3, $1 +; M64-NEXT: jr $ra +; M64-NEXT: or $2, $2, $1 + %cmp = icmp slt i32 %x, %y + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 %y) + ret i32 %result +} + +; Test generic smax pattern +define i32 @test_ctselect_smax_generic(i32 %x, i32 %y) { +; M32-LABEL: test_ctselect_smax_generic: +; M32: # %bb.0: +; M32-NEXT: slt $1, $5, $4 +; M32-NEXT: negu $1, $1 +; M32-NEXT: and $2, $1, $4 +; M32-NEXT: not $1, $1 +; M32-NEXT: and $1, $1, $5 +; M32-NEXT: jr $ra +; M32-NEXT: or $2, $2, $1 +; +; M64-LABEL: test_ctselect_smax_generic: +; M64: # %bb.0: +; M64-NEXT: sll $1, $4, 0 +; M64-NEXT: sll $2, $5, 0 +; M64-NEXT: slt $3, $2, $1 +; M64-NEXT: negu $3, $3 +; M64-NEXT: and $1, $3, $1 +; M64-NEXT: not $3, $3 +; M64-NEXT: and $2, $3, $2 +; M64-NEXT: jr $ra +; M64-NEXT: or $2, $1, $2 + %cmp = icmp sgt i32 %x, %y + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 %y) + ret i32 %result +} + +; Test umin pattern +define i32 @test_ctselect_umin_generic(i32 %x, i32 %y) { +; M32-LABEL: test_ctselect_umin_generic: +; M32: # %bb.0: +; M32-NEXT: sltu $1, $4, $5 +; M32-NEXT: negu $1, $1 +; M32-NEXT: and $2, $1, $4 +; M32-NEXT: not $1, $1 +; M32-NEXT: and $1, $1, $5 +; M32-NEXT: jr $ra +; M32-NEXT: or $2, $2, $1 +; +; M64-LABEL: test_ctselect_umin_generic: +; M64: # %bb.0: +; M64-NEXT: sll $1, $5, 0 +; M64-NEXT: sll $2, $4, 0 +; M64-NEXT: sltu $3, $2, $1 +; M64-NEXT: negu $3, $3 +; M64-NEXT: and $2, $3, $2 +; M64-NEXT: not $3, $3 +; M64-NEXT: and $1, $3, $1 +; M64-NEXT: jr $ra +; M64-NEXT: or $2, $2, $1 + %cmp = icmp ult i32 %x, %y + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 %y) + ret i32 %result +} + +; Test umax pattern +define i32 @test_ctselect_umax_generic(i32 %x, i32 %y) { +; M32-LABEL: test_ctselect_umax_generic: +; M32: # %bb.0: +; M32-NEXT: sltu $1, $5, $4 +; M32-NEXT: negu $1, $1 +; M32-NEXT: and $2, $1, $4 +; M32-NEXT: not $1, $1 +; M32-NEXT: and $1, $1, $5 +; M32-NEXT: jr $ra +; M32-NEXT: or $2, $2, $1 +; +; M64-LABEL: test_ctselect_umax_generic: +; M64: # %bb.0: +; M64-NEXT: sll $1, $4, 0 +; M64-NEXT: sll $2, $5, 0 +; M64-NEXT: sltu $3, $2, $1 +; M64-NEXT: negu $3, $3 +; M64-NEXT: and $1, $3, $1 +; M64-NEXT: not $3, $3 +; M64-NEXT: and $2, $3, $2 +; M64-NEXT: jr $ra +; M64-NEXT: or $2, $1, $2 + %cmp = icmp ugt i32 %x, %y + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 %y) + ret i32 %result +} + +; Test abs pattern +define i32 @test_ctselect_abs(i32 %x) { +; M32-LABEL: test_ctselect_abs: +; M32: # %bb.0: +; M32-NEXT: slti $2, $4, 0 +; M32-NEXT: negu $1, $4 +; M32-NEXT: negu $2, $2 +; M32-NEXT: and $1, $2, $1 +; M32-NEXT: not $2, $2 +; M32-NEXT: and $2, $2, $4 +; M32-NEXT: jr $ra +; M32-NEXT: or $2, $1, $2 +; +; M64-LABEL: test_ctselect_abs: +; M64: # %bb.0: +; M64-NEXT: sll $1, $4, 0 +; M64-NEXT: slti $3, $1, 0 +; M64-NEXT: negu $2, $1 +; M64-NEXT: negu $3, $3 +; M64-NEXT: and $2, $3, $2 +; M64-NEXT: not $3, $3 +; M64-NEXT: and $1, $3, $1 +; M64-NEXT: jr $ra +; M64-NEXT: or $2, $2, $1 + %neg = sub 
i32 0, %x + %cmp = icmp slt i32 %x, 0 + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %neg, i32 %x) + ret i32 %result +} + +; Test nabs pattern (negative abs) +define i32 @test_ctselect_nabs(i32 %x) { +; M32-LABEL: test_ctselect_nabs: +; M32: # %bb.0: +; M32-NEXT: slti $2, $4, 0 +; M32-NEXT: negu $1, $4 +; M32-NEXT: negu $2, $2 +; M32-NEXT: not $3, $2 +; M32-NEXT: and $2, $2, $4 +; M32-NEXT: and $1, $3, $1 +; M32-NEXT: jr $ra +; M32-NEXT: or $2, $2, $1 +; +; M64-LABEL: test_ctselect_nabs: +; M64: # %bb.0: +; M64-NEXT: sll $1, $4, 0 +; M64-NEXT: slti $3, $1, 0 +; M64-NEXT: negu $2, $1 +; M64-NEXT: negu $3, $3 +; M64-NEXT: not $4, $3 +; M64-NEXT: and $1, $3, $1 +; M64-NEXT: and $2, $4, $2 +; M64-NEXT: jr $ra +; M64-NEXT: or $2, $1, $2 + %neg = sub i32 0, %x + %cmp = icmp slt i32 %x, 0 + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 %neg) + ret i32 %result +} + +; Test sign extension pattern +define i32 @test_ctselect_sign_extend(i32 %x) { +; M32-LABEL: test_ctselect_sign_extend: +; M32: # %bb.0: +; M32-NEXT: slti $1, $4, 0 +; M32-NEXT: jr $ra +; M32-NEXT: negu $2, $1 +; +; M64-LABEL: test_ctselect_sign_extend: +; M64: # %bb.0: +; M64-NEXT: sll $1, $4, 0 +; M64-NEXT: slti $1, $1, 0 +; M64-NEXT: jr $ra +; M64-NEXT: negu $2, $1 + %cmp = icmp slt i32 %x, 0 + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 -1, i32 0) + ret i32 %result +} + +; Test zero extension pattern +define i32 @test_ctselect_zero_extend(i32 %x) { +; M32-LABEL: test_ctselect_zero_extend: +; M32: # %bb.0: +; M32-NEXT: sltu $1, $zero, $4 +; M32-NEXT: negu $1, $1 +; M32-NEXT: jr $ra +; M32-NEXT: andi $2, $1, 1 +; +; M64-LABEL: test_ctselect_zero_extend: +; M64: # %bb.0: +; M64-NEXT: sll $1, $4, 0 +; M64-NEXT: sltu $1, $zero, $1 +; M64-NEXT: negu $1, $1 +; M64-NEXT: jr $ra +; M64-NEXT: andi $2, $1, 1 + %cmp = icmp ne i32 %x, 0 + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 1, i32 0) + ret i32 %result +} + +; Test constant folding with known condition +define i32 @test_ctselect_constant_folding_true(i32 %a, i32 %b) { +; M32-LABEL: test_ctselect_constant_folding_true: +; M32: # %bb.0: +; M32-NEXT: jr $ra +; M32-NEXT: move $2, $4 +; +; M64-LABEL: test_ctselect_constant_folding_true: +; M64: # %bb.0: +; M64-NEXT: jr $ra +; M64-NEXT: sll $2, $4, 0 + %result = call i32 @llvm.ct.select.i32(i1 true, i32 %a, i32 %b) + ret i32 %result +} + +define i32 @test_ctselect_constant_folding_false(i32 %a, i32 %b) { +; M32-LABEL: test_ctselect_constant_folding_false: +; M32: # %bb.0: +; M32-NEXT: jr $ra +; M32-NEXT: move $2, $5 +; +; M64-LABEL: test_ctselect_constant_folding_false: +; M64: # %bb.0: +; M64-NEXT: jr $ra +; M64-NEXT: sll $2, $5, 0 + %result = call i32 @llvm.ct.select.i32(i1 false, i32 %a, i32 %b) + ret i32 %result +} + +; Test with identical operands +define i32 @test_ctselect_identical_operands(i1 %cond, i32 %x) { +; M32-LABEL: test_ctselect_identical_operands: +; M32: # %bb.0: +; M32-NEXT: andi $1, $4, 1 +; M32-NEXT: andi $1, $1, 1 +; M32-NEXT: negu $1, $1 +; M32-NEXT: and $2, $1, $5 +; M32-NEXT: not $1, $1 +; M32-NEXT: and $1, $1, $5 +; M32-NEXT: jr $ra +; M32-NEXT: or $2, $2, $1 +; +; M64-LABEL: test_ctselect_identical_operands: +; M64: # %bb.0: +; M64-NEXT: sll $1, $4, 0 +; M64-NEXT: sll $2, $5, 0 +; M64-NEXT: andi $1, $1, 1 +; M64-NEXT: andi $1, $1, 1 +; M64-NEXT: negu $1, $1 +; M64-NEXT: and $3, $1, $2 +; M64-NEXT: not $1, $1 +; M64-NEXT: and $1, $1, $2 +; M64-NEXT: jr $ra +; M64-NEXT: or $2, $3, $1 + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %x, i32 %x) + ret i32 %result +} + +; Test 
with inverted condition +define i32 @test_ctselect_inverted_condition(i32 %x, i32 %y, i32 %a, i32 %b) { +; M32-LABEL: test_ctselect_inverted_condition: +; M32: # %bb.0: +; M32-NEXT: xor $1, $4, $5 +; M32-NEXT: sltu $1, $zero, $1 +; M32-NEXT: negu $1, $1 +; M32-NEXT: and $2, $1, $6 +; M32-NEXT: not $1, $1 +; M32-NEXT: and $1, $1, $7 +; M32-NEXT: jr $ra +; M32-NEXT: or $2, $2, $1 +; +; M64-LABEL: test_ctselect_inverted_condition: +; M64: # %bb.0: +; M64-NEXT: sll $1, $5, 0 +; M64-NEXT: sll $2, $4, 0 +; M64-NEXT: sll $3, $7, 0 +; M64-NEXT: xor $1, $2, $1 +; M64-NEXT: sll $2, $6, 0 +; M64-NEXT: sltu $1, $zero, $1 +; M64-NEXT: negu $1, $1 +; M64-NEXT: and $2, $1, $2 +; M64-NEXT: not $1, $1 +; M64-NEXT: and $1, $1, $3 +; M64-NEXT: jr $ra +; M64-NEXT: or $2, $2, $1 + %cmp = icmp eq i32 %x, %y + %not_cmp = xor i1 %cmp, true + %result = call i32 @llvm.ct.select.i32(i1 %not_cmp, i32 %a, i32 %b) + ret i32 %result +} + +; Test chain of ct.select operations +define i32 @test_ctselect_chain(i1 %c1, i1 %c2, i1 %c3, i32 %a, i32 %b, i32 %c, i32 %d) { +; M32-LABEL: test_ctselect_chain: +; M32: # %bb.0: +; M32-NEXT: andi $1, $4, 1 +; M32-NEXT: lw $3, 16($sp) +; M32-NEXT: andi $1, $1, 1 +; M32-NEXT: negu $1, $1 +; M32-NEXT: and $2, $1, $7 +; M32-NEXT: not $1, $1 +; M32-NEXT: and $1, $1, $3 +; M32-NEXT: lw $3, 20($sp) +; M32-NEXT: or $1, $2, $1 +; M32-NEXT: andi $2, $5, 1 +; M32-NEXT: andi $2, $2, 1 +; M32-NEXT: negu $2, $2 +; M32-NEXT: and $1, $2, $1 +; M32-NEXT: not $2, $2 +; M32-NEXT: and $2, $2, $3 +; M32-NEXT: andi $3, $6, 1 +; M32-NEXT: or $1, $1, $2 +; M32-NEXT: andi $2, $3, 1 +; M32-NEXT: lw $3, 24($sp) +; M32-NEXT: negu $2, $2 +; M32-NEXT: and $1, $2, $1 +; M32-NEXT: not $2, $2 +; M32-NEXT: and $2, $2, $3 +; M32-NEXT: jr $ra +; M32-NEXT: or $2, $1, $2 +; +; M64-LABEL: test_ctselect_chain: +; M64: # %bb.0: +; M64-NEXT: sll $1, $4, 0 +; M64-NEXT: sll $3, $5, 0 +; M64-NEXT: sll $2, $7, 0 +; M64-NEXT: sll $4, $8, 0 +; M64-NEXT: andi $1, $1, 1 +; M64-NEXT: andi $3, $3, 1 +; M64-NEXT: andi $1, $1, 1 +; M64-NEXT: andi $3, $3, 1 +; M64-NEXT: negu $1, $1 +; M64-NEXT: negu $3, $3 +; M64-NEXT: and $2, $1, $2 +; M64-NEXT: not $1, $1 +; M64-NEXT: and $1, $1, $4 +; M64-NEXT: or $1, $2, $1 +; M64-NEXT: not $2, $3 +; M64-NEXT: and $1, $3, $1 +; M64-NEXT: sll $3, $9, 0 +; M64-NEXT: and $2, $2, $3 +; M64-NEXT: sll $3, $6, 0 +; M64-NEXT: andi $3, $3, 1 +; M64-NEXT: or $1, $1, $2 +; M64-NEXT: andi $2, $3, 1 +; M64-NEXT: sll $3, $10, 0 +; M64-NEXT: negu $2, $2 +; M64-NEXT: and $1, $2, $1 +; M64-NEXT: not $2, $2 +; M64-NEXT: and $2, $2, $3 +; M64-NEXT: jr $ra +; M64-NEXT: or $2, $1, $2 + %sel1 = call i32 @llvm.ct.select.i32(i1 %c1, i32 %a, i32 %b) + %sel2 = call i32 @llvm.ct.select.i32(i1 %c2, i32 %sel1, i32 %c) + %sel3 = call i32 @llvm.ct.select.i32(i1 %c3, i32 %sel2, i32 %d) + ret i32 %sel3 +} + +; Test for 64-bit operations (supported on all 64-bit architectures) +define i64 @test_ctselect_i64_smin_zero(i64 %x) { +; M32-LABEL: test_ctselect_i64_smin_zero: +; M32: # %bb.0: +; M32-NEXT: slti $1, $5, 0 +; M32-NEXT: negu $1, $1 +; M32-NEXT: and $2, $1, $4 +; M32-NEXT: jr $ra +; M32-NEXT: and $3, $1, $5 +; +; M64-LABEL: test_ctselect_i64_smin_zero: +; M64: # %bb.0: +; M64-NEXT: dsrl $1, $4, 63 +; M64-NEXT: dnegu $1, $1 +; M64-NEXT: jr $ra +; M64-NEXT: and $2, $1, $4 + %cmp = icmp slt i64 %x, 0 + %result = call i64 @llvm.ct.select.i64(i1 %cmp, i64 %x, i64 0) + ret i64 %result +} + +; Declare the intrinsics +declare i32 @llvm.ct.select.i32(i1, i32, i32) +declare i64 @llvm.ct.select.i64(i1, i64, i64) diff --git 
a/llvm/test/CodeGen/Mips/ctselect-fallback.ll b/llvm/test/CodeGen/Mips/ctselect-fallback.ll new file mode 100644 index 0000000000000..4993deed7be5f --- /dev/null +++ b/llvm/test/CodeGen/Mips/ctselect-fallback.ll @@ -0,0 +1,384 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=mipsel-unknown-linux-gnu -O3 | FileCheck %s --check-prefix=M32 +; RUN: llc < %s -mtriple=mips64el-unknown-linux-gnu -O3 | FileCheck %s --check-prefix=M64 + +; Test basic ct.select functionality for scalar types +define i8 @test_ctselect_i8(i1 %cond, i8 %a, i8 %b) { +; M32-LABEL: test_ctselect_i8: +; M32: # %bb.0: +; M32-NEXT: andi $1, $4, 1 +; M32-NEXT: addiu $2, $1, -1 +; M32-NEXT: negu $1, $1 +; M32-NEXT: and $2, $2, $6 +; M32-NEXT: and $1, $1, $5 +; M32-NEXT: jr $ra +; M32-NEXT: or $2, $1, $2 +; +; M64-LABEL: test_ctselect_i8: +; M64: # %bb.0: +; M64-NEXT: sll $2, $4, 0 +; M64-NEXT: sll $1, $6, 0 +; M64-NEXT: andi $2, $2, 1 +; M64-NEXT: addiu $3, $2, -1 +; M64-NEXT: negu $2, $2 +; M64-NEXT: and $1, $3, $1 +; M64-NEXT: sll $3, $5, 0 +; M64-NEXT: and $2, $2, $3 +; M64-NEXT: jr $ra +; M64-NEXT: or $2, $2, $1 + %result = call i8 @llvm.ct.select.i8(i1 %cond, i8 %a, i8 %b) + ret i8 %result +} + +define i16 @test_ctselect_i16(i1 %cond, i16 %a, i16 %b) { +; M32-LABEL: test_ctselect_i16: +; M32: # %bb.0: +; M32-NEXT: andi $1, $4, 1 +; M32-NEXT: addiu $2, $1, -1 +; M32-NEXT: negu $1, $1 +; M32-NEXT: and $2, $2, $6 +; M32-NEXT: and $1, $1, $5 +; M32-NEXT: jr $ra +; M32-NEXT: or $2, $1, $2 +; +; M64-LABEL: test_ctselect_i16: +; M64: # %bb.0: +; M64-NEXT: sll $2, $4, 0 +; M64-NEXT: sll $1, $6, 0 +; M64-NEXT: andi $2, $2, 1 +; M64-NEXT: addiu $3, $2, -1 +; M64-NEXT: negu $2, $2 +; M64-NEXT: and $1, $3, $1 +; M64-NEXT: sll $3, $5, 0 +; M64-NEXT: and $2, $2, $3 +; M64-NEXT: jr $ra +; M64-NEXT: or $2, $2, $1 + %result = call i16 @llvm.ct.select.i16(i1 %cond, i16 %a, i16 %b) + ret i16 %result +} + +define i32 @test_ctselect_i32(i1 %cond, i32 %a, i32 %b) { +; M32-LABEL: test_ctselect_i32: +; M32: # %bb.0: +; M32-NEXT: andi $1, $4, 1 +; M32-NEXT: andi $1, $1, 1 +; M32-NEXT: negu $1, $1 +; M32-NEXT: and $2, $1, $5 +; M32-NEXT: not $1, $1 +; M32-NEXT: and $1, $1, $6 +; M32-NEXT: jr $ra +; M32-NEXT: or $2, $2, $1 +; +; M64-LABEL: test_ctselect_i32: +; M64: # %bb.0: +; M64-NEXT: sll $1, $4, 0 +; M64-NEXT: sll $2, $5, 0 +; M64-NEXT: sll $3, $6, 0 +; M64-NEXT: andi $1, $1, 1 +; M64-NEXT: andi $1, $1, 1 +; M64-NEXT: negu $1, $1 +; M64-NEXT: and $2, $1, $2 +; M64-NEXT: not $1, $1 +; M64-NEXT: and $1, $1, $3 +; M64-NEXT: jr $ra +; M64-NEXT: or $2, $2, $1 + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) + ret i32 %result +} + +define i64 @test_ctselect_i64(i1 %cond, i64 %a, i64 %b) { +; M32-LABEL: test_ctselect_i64: +; M32: # %bb.0: +; M32-NEXT: andi $1, $4, 1 +; M32-NEXT: lw $2, 16($sp) +; M32-NEXT: addiu $3, $1, -1 +; M32-NEXT: negu $1, $1 +; M32-NEXT: and $4, $1, $6 +; M32-NEXT: and $2, $3, $2 +; M32-NEXT: and $1, $1, $7 +; M32-NEXT: or $2, $4, $2 +; M32-NEXT: lw $4, 20($sp) +; M32-NEXT: and $3, $3, $4 +; M32-NEXT: jr $ra +; M32-NEXT: or $3, $1, $3 +; +; M64-LABEL: test_ctselect_i64: +; M64: # %bb.0: +; M64-NEXT: andi $1, $4, 1 +; M64-NEXT: daddiu $3, $zero, -1 +; M64-NEXT: andi $1, $1, 1 +; M64-NEXT: dnegu $1, $1 +; M64-NEXT: and $2, $1, $5 +; M64-NEXT: xor $1, $1, $3 +; M64-NEXT: and $1, $1, $6 +; M64-NEXT: jr $ra +; M64-NEXT: or $2, $2, $1 + %result = call i64 @llvm.ct.select.i64(i1 %cond, i64 %a, i64 %b) + ret i64 %result +} + +define ptr 
@test_ctselect_ptr(i1 %cond, ptr %a, ptr %b) { +; M32-LABEL: test_ctselect_ptr: +; M32: # %bb.0: +; M32-NEXT: andi $1, $4, 1 +; M32-NEXT: andi $1, $1, 1 +; M32-NEXT: negu $1, $1 +; M32-NEXT: and $2, $1, $5 +; M32-NEXT: not $1, $1 +; M32-NEXT: and $1, $1, $6 +; M32-NEXT: jr $ra +; M32-NEXT: or $2, $2, $1 +; +; M64-LABEL: test_ctselect_ptr: +; M64: # %bb.0: +; M64-NEXT: andi $1, $4, 1 +; M64-NEXT: daddiu $3, $zero, -1 +; M64-NEXT: andi $1, $1, 1 +; M64-NEXT: dnegu $1, $1 +; M64-NEXT: and $2, $1, $5 +; M64-NEXT: xor $1, $1, $3 +; M64-NEXT: and $1, $1, $6 +; M64-NEXT: jr $ra +; M64-NEXT: or $2, $2, $1 + %result = call ptr @llvm.ct.select.p0(i1 %cond, ptr %a, ptr %b) + ret ptr %result +} + +; Test with constant conditions +define i32 @test_ctselect_const_true(i32 %a, i32 %b) { +; M32-LABEL: test_ctselect_const_true: +; M32: # %bb.0: +; M32-NEXT: jr $ra +; M32-NEXT: move $2, $4 +; +; M64-LABEL: test_ctselect_const_true: +; M64: # %bb.0: +; M64-NEXT: jr $ra +; M64-NEXT: sll $2, $4, 0 + %result = call i32 @llvm.ct.select.i32(i1 true, i32 %a, i32 %b) + ret i32 %result +} + +define i32 @test_ctselect_const_false(i32 %a, i32 %b) { +; M32-LABEL: test_ctselect_const_false: +; M32: # %bb.0: +; M32-NEXT: jr $ra +; M32-NEXT: move $2, $5 +; +; M64-LABEL: test_ctselect_const_false: +; M64: # %bb.0: +; M64-NEXT: jr $ra +; M64-NEXT: sll $2, $5, 0 + %result = call i32 @llvm.ct.select.i32(i1 false, i32 %a, i32 %b) + ret i32 %result +} + +; Test with comparison conditions +define i32 @test_ctselect_icmp_eq(i32 %x, i32 %y, i32 %a, i32 %b) { +; M32-LABEL: test_ctselect_icmp_eq: +; M32: # %bb.0: +; M32-NEXT: xor $1, $4, $5 +; M32-NEXT: sltiu $1, $1, 1 +; M32-NEXT: negu $1, $1 +; M32-NEXT: and $2, $1, $6 +; M32-NEXT: not $1, $1 +; M32-NEXT: and $1, $1, $7 +; M32-NEXT: jr $ra +; M32-NEXT: or $2, $2, $1 +; +; M64-LABEL: test_ctselect_icmp_eq: +; M64: # %bb.0: +; M64-NEXT: sll $1, $5, 0 +; M64-NEXT: sll $2, $4, 0 +; M64-NEXT: sll $3, $7, 0 +; M64-NEXT: xor $1, $2, $1 +; M64-NEXT: sll $2, $6, 0 +; M64-NEXT: sltiu $1, $1, 1 +; M64-NEXT: negu $1, $1 +; M64-NEXT: and $2, $1, $2 +; M64-NEXT: not $1, $1 +; M64-NEXT: and $1, $1, $3 +; M64-NEXT: jr $ra +; M64-NEXT: or $2, $2, $1 + %cond = icmp eq i32 %x, %y + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) + ret i32 %result +} + +define i32 @test_ctselect_icmp_ne(i32 %x, i32 %y, i32 %a, i32 %b) { +; M32-LABEL: test_ctselect_icmp_ne: +; M32: # %bb.0: +; M32-NEXT: xor $1, $4, $5 +; M32-NEXT: sltu $1, $zero, $1 +; M32-NEXT: negu $1, $1 +; M32-NEXT: and $2, $1, $6 +; M32-NEXT: not $1, $1 +; M32-NEXT: and $1, $1, $7 +; M32-NEXT: jr $ra +; M32-NEXT: or $2, $2, $1 +; +; M64-LABEL: test_ctselect_icmp_ne: +; M64: # %bb.0: +; M64-NEXT: sll $1, $5, 0 +; M64-NEXT: sll $2, $4, 0 +; M64-NEXT: sll $3, $7, 0 +; M64-NEXT: xor $1, $2, $1 +; M64-NEXT: sll $2, $6, 0 +; M64-NEXT: sltu $1, $zero, $1 +; M64-NEXT: negu $1, $1 +; M64-NEXT: and $2, $1, $2 +; M64-NEXT: not $1, $1 +; M64-NEXT: and $1, $1, $3 +; M64-NEXT: jr $ra +; M64-NEXT: or $2, $2, $1 + %cond = icmp ne i32 %x, %y + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) + ret i32 %result +} + +define i32 @test_ctselect_icmp_slt(i32 %x, i32 %y, i32 %a, i32 %b) { +; M32-LABEL: test_ctselect_icmp_slt: +; M32: # %bb.0: +; M32-NEXT: slt $1, $4, $5 +; M32-NEXT: negu $1, $1 +; M32-NEXT: and $2, $1, $6 +; M32-NEXT: not $1, $1 +; M32-NEXT: and $1, $1, $7 +; M32-NEXT: jr $ra +; M32-NEXT: or $2, $2, $1 +; +; M64-LABEL: test_ctselect_icmp_slt: +; M64: # %bb.0: +; M64-NEXT: sll $1, $5, 0 +; M64-NEXT: sll $2, $4, 0 +; 
M64-NEXT: sll $3, $7, 0 +; M64-NEXT: slt $1, $2, $1 +; M64-NEXT: sll $2, $6, 0 +; M64-NEXT: negu $1, $1 +; M64-NEXT: and $2, $1, $2 +; M64-NEXT: not $1, $1 +; M64-NEXT: and $1, $1, $3 +; M64-NEXT: jr $ra +; M64-NEXT: or $2, $2, $1 + %cond = icmp slt i32 %x, %y + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) + ret i32 %result +} + +define i32 @test_ctselect_icmp_ult(i32 %x, i32 %y, i32 %a, i32 %b) { +; M32-LABEL: test_ctselect_icmp_ult: +; M32: # %bb.0: +; M32-NEXT: sltu $1, $4, $5 +; M32-NEXT: negu $1, $1 +; M32-NEXT: and $2, $1, $6 +; M32-NEXT: not $1, $1 +; M32-NEXT: and $1, $1, $7 +; M32-NEXT: jr $ra +; M32-NEXT: or $2, $2, $1 +; +; M64-LABEL: test_ctselect_icmp_ult: +; M64: # %bb.0: +; M64-NEXT: sll $1, $5, 0 +; M64-NEXT: sll $2, $4, 0 +; M64-NEXT: sll $3, $7, 0 +; M64-NEXT: sltu $1, $2, $1 +; M64-NEXT: sll $2, $6, 0 +; M64-NEXT: negu $1, $1 +; M64-NEXT: and $2, $1, $2 +; M64-NEXT: not $1, $1 +; M64-NEXT: and $1, $1, $3 +; M64-NEXT: jr $ra +; M64-NEXT: or $2, $2, $1 + %cond = icmp ult i32 %x, %y + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) + ret i32 %result +} + +; Test with memory operands +define i32 @test_ctselect_load(i1 %cond, ptr %p1, ptr %p2) { +; M32-LABEL: test_ctselect_load: +; M32: # %bb.0: +; M32-NEXT: andi $1, $4, 1 +; M32-NEXT: lw $2, 0($5) +; M32-NEXT: lw $3, 0($6) +; M32-NEXT: andi $1, $1, 1 +; M32-NEXT: negu $1, $1 +; M32-NEXT: and $2, $1, $2 +; M32-NEXT: not $1, $1 +; M32-NEXT: and $1, $1, $3 +; M32-NEXT: jr $ra +; M32-NEXT: or $2, $2, $1 +; +; M64-LABEL: test_ctselect_load: +; M64: # %bb.0: +; M64-NEXT: sll $1, $4, 0 +; M64-NEXT: lw $2, 0($5) +; M64-NEXT: lw $3, 0($6) +; M64-NEXT: andi $1, $1, 1 +; M64-NEXT: andi $1, $1, 1 +; M64-NEXT: negu $1, $1 +; M64-NEXT: and $2, $1, $2 +; M64-NEXT: not $1, $1 +; M64-NEXT: and $1, $1, $3 +; M64-NEXT: jr $ra +; M64-NEXT: or $2, $2, $1 + %a = load i32, ptr %p1 + %b = load i32, ptr %p2 + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) + ret i32 %result +} + +; Test nested ctselect calls +define i32 @test_ctselect_nested(i1 %cond1, i1 %cond2, i32 %a, i32 %b, i32 %c) { +; M32-LABEL: test_ctselect_nested: +; M32: # %bb.0: +; M32-NEXT: andi $1, $5, 1 +; M32-NEXT: lw $3, 16($sp) +; M32-NEXT: andi $1, $1, 1 +; M32-NEXT: negu $1, $1 +; M32-NEXT: and $2, $1, $6 +; M32-NEXT: not $1, $1 +; M32-NEXT: and $1, $1, $7 +; M32-NEXT: or $1, $2, $1 +; M32-NEXT: andi $2, $4, 1 +; M32-NEXT: andi $2, $2, 1 +; M32-NEXT: negu $2, $2 +; M32-NEXT: and $1, $2, $1 +; M32-NEXT: not $2, $2 +; M32-NEXT: and $2, $2, $3 +; M32-NEXT: jr $ra +; M32-NEXT: or $2, $1, $2 +; +; M64-LABEL: test_ctselect_nested: +; M64: # %bb.0: +; M64-NEXT: sll $1, $5, 0 +; M64-NEXT: sll $2, $6, 0 +; M64-NEXT: sll $3, $7, 0 +; M64-NEXT: andi $1, $1, 1 +; M64-NEXT: andi $1, $1, 1 +; M64-NEXT: negu $1, $1 +; M64-NEXT: and $2, $1, $2 +; M64-NEXT: not $1, $1 +; M64-NEXT: and $1, $1, $3 +; M64-NEXT: sll $3, $8, 0 +; M64-NEXT: or $1, $2, $1 +; M64-NEXT: sll $2, $4, 0 +; M64-NEXT: andi $2, $2, 1 +; M64-NEXT: andi $2, $2, 1 +; M64-NEXT: negu $2, $2 +; M64-NEXT: and $1, $2, $1 +; M64-NEXT: not $2, $2 +; M64-NEXT: and $2, $2, $3 +; M64-NEXT: jr $ra +; M64-NEXT: or $2, $1, $2 + %inner = call i32 @llvm.ct.select.i32(i1 %cond2, i32 %a, i32 %b) + %result = call i32 @llvm.ct.select.i32(i1 %cond1, i32 %inner, i32 %c) + ret i32 %result +} + +; Declare the intrinsics +declare i8 @llvm.ct.select.i8(i1, i8, i8) +declare i16 @llvm.ct.select.i16(i1, i16, i16) +declare i32 @llvm.ct.select.i32(i1, i32, i32) +declare i64 @llvm.ct.select.i64(i1, i64, i64) 
+declare ptr @llvm.ct.select.p0(i1, ptr, ptr) diff --git a/llvm/test/CodeGen/RISCV/ctselect-fallback-edge-cases.ll b/llvm/test/CodeGen/RISCV/ctselect-fallback-edge-cases.ll new file mode 100644 index 0000000000000..d590d48a4f3eb --- /dev/null +++ b/llvm/test/CodeGen/RISCV/ctselect-fallback-edge-cases.ll @@ -0,0 +1,236 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=riscv64 -O3 | FileCheck %s --check-prefix=RV64 +; RUN: llc < %s -mtriple=riscv32 -O3 | FileCheck %s --check-prefix=RV32 + +; Test with small integer types +define i1 @test_ctselect_i1(i1 %cond, i1 %a, i1 %b) { +; RV64-LABEL: test_ctselect_i1: +; RV64: # %bb.0: +; RV64-NEXT: neg a0, a0 +; RV64-NEXT: and a1, a0, a1 +; RV64-NEXT: xori a0, a0, 1 +; RV64-NEXT: and a0, a0, a2 +; RV64-NEXT: or a0, a1, a0 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_i1: +; RV32: # %bb.0: +; RV32-NEXT: neg a0, a0 +; RV32-NEXT: and a1, a0, a1 +; RV32-NEXT: xori a0, a0, 1 +; RV32-NEXT: and a0, a0, a2 +; RV32-NEXT: or a0, a1, a0 +; RV32-NEXT: ret + %result = call i1 @llvm.ct.select.i1(i1 %cond, i1 %a, i1 %b) + ret i1 %result +} + +; Test with extremal values +define i32 @test_ctselect_extremal_values(i1 %cond) { +; RV64-LABEL: test_ctselect_extremal_values: +; RV64: # %bb.0: +; RV64-NEXT: andi a0, a0, 1 +; RV64-NEXT: lui a1, 524288 +; RV64-NEXT: addi a2, a0, -1 +; RV64-NEXT: negw a0, a0 +; RV64-NEXT: and a1, a2, a1 +; RV64-NEXT: slli a0, a0, 33 +; RV64-NEXT: srli a0, a0, 33 +; RV64-NEXT: or a0, a0, a1 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_extremal_values: +; RV32: # %bb.0: +; RV32-NEXT: andi a0, a0, 1 +; RV32-NEXT: lui a1, 524288 +; RV32-NEXT: slli a0, a0, 31 +; RV32-NEXT: srai a0, a0, 31 +; RV32-NEXT: not a2, a0 +; RV32-NEXT: slli a0, a0, 1 +; RV32-NEXT: and a1, a2, a1 +; RV32-NEXT: srli a0, a0, 1 +; RV32-NEXT: or a0, a0, a1 +; RV32-NEXT: ret + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 2147483647, i32 -2147483648) + ret i32 %result +} + +; Test with null pointers +define ptr @test_ctselect_null_ptr(i1 %cond, ptr %ptr) { +; RV64-LABEL: test_ctselect_null_ptr: +; RV64: # %bb.0: +; RV64-NEXT: andi a0, a0, 1 +; RV64-NEXT: slli a0, a0, 63 +; RV64-NEXT: srai a0, a0, 63 +; RV64-NEXT: and a0, a0, a1 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_null_ptr: +; RV32: # %bb.0: +; RV32-NEXT: andi a0, a0, 1 +; RV32-NEXT: slli a0, a0, 31 +; RV32-NEXT: srai a0, a0, 31 +; RV32-NEXT: and a0, a0, a1 +; RV32-NEXT: ret + %result = call ptr @llvm.ct.select.p0(i1 %cond, ptr %ptr, ptr null) + ret ptr %result +} + +; Test with function pointers +define ptr @test_ctselect_function_ptr(i1 %cond, ptr %func1, ptr %func2) { +; RV64-LABEL: test_ctselect_function_ptr: +; RV64: # %bb.0: +; RV64-NEXT: andi a0, a0, 1 +; RV64-NEXT: slli a0, a0, 63 +; RV64-NEXT: srai a0, a0, 63 +; RV64-NEXT: and a1, a0, a1 +; RV64-NEXT: not a0, a0 +; RV64-NEXT: and a0, a0, a2 +; RV64-NEXT: or a0, a1, a0 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_function_ptr: +; RV32: # %bb.0: +; RV32-NEXT: andi a0, a0, 1 +; RV32-NEXT: slli a0, a0, 31 +; RV32-NEXT: srai a0, a0, 31 +; RV32-NEXT: and a1, a0, a1 +; RV32-NEXT: not a0, a0 +; RV32-NEXT: and a0, a0, a2 +; RV32-NEXT: or a0, a1, a0 +; RV32-NEXT: ret + %result = call ptr @llvm.ct.select.p0(i1 %cond, ptr %func1, ptr %func2) + ret ptr %result +} + +; Test with condition from icmp on pointers +define ptr @test_ctselect_ptr_cmp(ptr %p1, ptr %p2, ptr %a, ptr %b) { +; RV64-LABEL: test_ctselect_ptr_cmp: +; RV64: # %bb.0: +; RV64-NEXT: xor a0, a0, a1 +; 
RV64-NEXT: snez a0, a0 +; RV64-NEXT: addi a0, a0, -1 +; RV64-NEXT: and a2, a0, a2 +; RV64-NEXT: not a0, a0 +; RV64-NEXT: and a0, a0, a3 +; RV64-NEXT: or a0, a2, a0 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_ptr_cmp: +; RV32: # %bb.0: +; RV32-NEXT: xor a0, a0, a1 +; RV32-NEXT: snez a0, a0 +; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: and a2, a0, a2 +; RV32-NEXT: not a0, a0 +; RV32-NEXT: and a0, a0, a3 +; RV32-NEXT: or a0, a2, a0 +; RV32-NEXT: ret + %cmp = icmp eq ptr %p1, %p2 + %result = call ptr @llvm.ct.select.p0(i1 %cmp, ptr %a, ptr %b) + ret ptr %result +} + +; Test with struct pointer types +%struct.pair = type { i32, i32 } + +define ptr @test_ctselect_struct_ptr(i1 %cond, ptr %a, ptr %b) { +; RV64-LABEL: test_ctselect_struct_ptr: +; RV64: # %bb.0: +; RV64-NEXT: andi a0, a0, 1 +; RV64-NEXT: slli a0, a0, 63 +; RV64-NEXT: srai a0, a0, 63 +; RV64-NEXT: and a1, a0, a1 +; RV64-NEXT: not a0, a0 +; RV64-NEXT: and a0, a0, a2 +; RV64-NEXT: or a0, a1, a0 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_struct_ptr: +; RV32: # %bb.0: +; RV32-NEXT: andi a0, a0, 1 +; RV32-NEXT: slli a0, a0, 31 +; RV32-NEXT: srai a0, a0, 31 +; RV32-NEXT: and a1, a0, a1 +; RV32-NEXT: not a0, a0 +; RV32-NEXT: and a0, a0, a2 +; RV32-NEXT: or a0, a1, a0 +; RV32-NEXT: ret + %result = call ptr @llvm.ct.select.p0(i1 %cond, ptr %a, ptr %b) + ret ptr %result +} + +; Test with deeply nested conditions +define i32 @test_ctselect_deeply_nested(i1 %c1, i1 %c2, i1 %c3, i1 %c4, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e) { +; RV64-LABEL: test_ctselect_deeply_nested: +; RV64: # %bb.0: +; RV64-NEXT: lw t0, 0(sp) +; RV64-NEXT: andi a0, a0, 1 +; RV64-NEXT: andi a1, a1, 1 +; RV64-NEXT: andi a2, a2, 1 +; RV64-NEXT: andi a3, a3, 1 +; RV64-NEXT: addi t1, a0, -1 +; RV64-NEXT: neg a0, a0 +; RV64-NEXT: and a5, t1, a5 +; RV64-NEXT: neg t1, a1 +; RV64-NEXT: addi a1, a1, -1 +; RV64-NEXT: and a0, a0, a4 +; RV64-NEXT: neg a4, a2 +; RV64-NEXT: addi a2, a2, -1 +; RV64-NEXT: and a1, a1, a6 +; RV64-NEXT: neg a6, a3 +; RV64-NEXT: addi a3, a3, -1 +; RV64-NEXT: and a2, a2, a7 +; RV64-NEXT: or a0, a0, a5 +; RV64-NEXT: and a0, t1, a0 +; RV64-NEXT: or a0, a0, a1 +; RV64-NEXT: and a0, a4, a0 +; RV64-NEXT: or a0, a0, a2 +; RV64-NEXT: and a0, a6, a0 +; RV64-NEXT: and a1, a3, t0 +; RV64-NEXT: or a0, a0, a1 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_deeply_nested: +; RV32: # %bb.0: +; RV32-NEXT: lw t0, 0(sp) +; RV32-NEXT: andi a0, a0, 1 +; RV32-NEXT: andi a1, a1, 1 +; RV32-NEXT: andi a2, a2, 1 +; RV32-NEXT: andi a3, a3, 1 +; RV32-NEXT: slli a0, a0, 31 +; RV32-NEXT: slli a1, a1, 31 +; RV32-NEXT: slli a2, a2, 31 +; RV32-NEXT: slli a3, a3, 31 +; RV32-NEXT: srai a0, a0, 31 +; RV32-NEXT: srai a1, a1, 31 +; RV32-NEXT: srai a2, a2, 31 +; RV32-NEXT: srai a3, a3, 31 +; RV32-NEXT: and a4, a0, a4 +; RV32-NEXT: not a0, a0 +; RV32-NEXT: and a0, a0, a5 +; RV32-NEXT: not a5, a1 +; RV32-NEXT: and a5, a5, a6 +; RV32-NEXT: not a6, a2 +; RV32-NEXT: and a6, a6, a7 +; RV32-NEXT: not a7, a3 +; RV32-NEXT: or a0, a4, a0 +; RV32-NEXT: and a0, a1, a0 +; RV32-NEXT: or a0, a0, a5 +; RV32-NEXT: and a0, a2, a0 +; RV32-NEXT: or a0, a0, a6 +; RV32-NEXT: and a0, a3, a0 +; RV32-NEXT: and a1, a7, t0 +; RV32-NEXT: or a0, a0, a1 +; RV32-NEXT: ret + %sel1 = call i32 @llvm.ct.select.i32(i1 %c1, i32 %a, i32 %b) + %sel2 = call i32 @llvm.ct.select.i32(i1 %c2, i32 %sel1, i32 %c) + %sel3 = call i32 @llvm.ct.select.i32(i1 %c3, i32 %sel2, i32 %d) + %sel4 = call i32 @llvm.ct.select.i32(i1 %c4, i32 %sel3, i32 %e) + ret i32 %sel4 +} + +; Declare the intrinsics +declare i1 @llvm.ct.select.i1(i1, i1, 
i1) +declare i32 @llvm.ct.select.i32(i1, i32, i32) +declare ptr @llvm.ct.select.p0(i1, ptr, ptr) diff --git a/llvm/test/CodeGen/RISCV/ctselect-fallback-patterns.ll b/llvm/test/CodeGen/RISCV/ctselect-fallback-patterns.ll new file mode 100644 index 0000000000000..1dbade44cc1f4 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/ctselect-fallback-patterns.ll @@ -0,0 +1,403 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=riscv64 -O3 | FileCheck %s --check-prefix=RV64 +; RUN: llc < %s -mtriple=riscv32 -O3 | FileCheck %s --check-prefix=RV32 + +; Test smin(x, 0) pattern +define i32 @test_ctselect_smin_zero(i32 %x) { +; RV64-LABEL: test_ctselect_smin_zero: +; RV64: # %bb.0: +; RV64-NEXT: sraiw a1, a0, 31 +; RV64-NEXT: and a0, a1, a0 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_smin_zero: +; RV32: # %bb.0: +; RV32-NEXT: srai a1, a0, 31 +; RV32-NEXT: and a0, a1, a0 +; RV32-NEXT: ret + %cmp = icmp slt i32 %x, 0 + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 0) + ret i32 %result +} + +; Test smax(x, 0) pattern +define i32 @test_ctselect_smax_zero(i32 %x) { +; RV64-LABEL: test_ctselect_smax_zero: +; RV64: # %bb.0: +; RV64-NEXT: sext.w a1, a0 +; RV64-NEXT: sgtz a1, a1 +; RV64-NEXT: neg a1, a1 +; RV64-NEXT: and a0, a1, a0 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_smax_zero: +; RV32: # %bb.0: +; RV32-NEXT: sgtz a1, a0 +; RV32-NEXT: neg a1, a1 +; RV32-NEXT: and a0, a1, a0 +; RV32-NEXT: ret + %cmp = icmp sgt i32 %x, 0 + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 0) + ret i32 %result +} + +; Test generic smin pattern +define i32 @test_ctselect_smin_generic(i32 %x, i32 %y) { +; RV64-LABEL: test_ctselect_smin_generic: +; RV64: # %bb.0: +; RV64-NEXT: sext.w a2, a1 +; RV64-NEXT: sext.w a3, a0 +; RV64-NEXT: slt a2, a3, a2 +; RV64-NEXT: addi a3, a2, -1 +; RV64-NEXT: neg a2, a2 +; RV64-NEXT: and a1, a3, a1 +; RV64-NEXT: and a0, a2, a0 +; RV64-NEXT: or a0, a0, a1 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_smin_generic: +; RV32: # %bb.0: +; RV32-NEXT: slt a2, a0, a1 +; RV32-NEXT: neg a2, a2 +; RV32-NEXT: and a0, a2, a0 +; RV32-NEXT: not a2, a2 +; RV32-NEXT: and a1, a2, a1 +; RV32-NEXT: or a0, a0, a1 +; RV32-NEXT: ret + %cmp = icmp slt i32 %x, %y + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 %y) + ret i32 %result +} + +; Test generic smax pattern +define i32 @test_ctselect_smax_generic(i32 %x, i32 %y) { +; RV64-LABEL: test_ctselect_smax_generic: +; RV64: # %bb.0: +; RV64-NEXT: sext.w a2, a0 +; RV64-NEXT: sext.w a3, a1 +; RV64-NEXT: slt a2, a3, a2 +; RV64-NEXT: addi a3, a2, -1 +; RV64-NEXT: neg a2, a2 +; RV64-NEXT: and a1, a3, a1 +; RV64-NEXT: and a0, a2, a0 +; RV64-NEXT: or a0, a0, a1 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_smax_generic: +; RV32: # %bb.0: +; RV32-NEXT: slt a2, a1, a0 +; RV32-NEXT: neg a2, a2 +; RV32-NEXT: and a0, a2, a0 +; RV32-NEXT: not a2, a2 +; RV32-NEXT: and a1, a2, a1 +; RV32-NEXT: or a0, a0, a1 +; RV32-NEXT: ret + %cmp = icmp sgt i32 %x, %y + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 %y) + ret i32 %result +} + +; Test umin pattern +define i32 @test_ctselect_umin_generic(i32 %x, i32 %y) { +; RV64-LABEL: test_ctselect_umin_generic: +; RV64: # %bb.0: +; RV64-NEXT: sext.w a2, a1 +; RV64-NEXT: sext.w a3, a0 +; RV64-NEXT: sltu a2, a3, a2 +; RV64-NEXT: addi a3, a2, -1 +; RV64-NEXT: neg a2, a2 +; RV64-NEXT: and a1, a3, a1 +; RV64-NEXT: and a0, a2, a0 +; RV64-NEXT: or a0, a0, a1 +; RV64-NEXT: ret +; +; RV32-LABEL: 
test_ctselect_umin_generic: +; RV32: # %bb.0: +; RV32-NEXT: sltu a2, a0, a1 +; RV32-NEXT: neg a2, a2 +; RV32-NEXT: and a0, a2, a0 +; RV32-NEXT: not a2, a2 +; RV32-NEXT: and a1, a2, a1 +; RV32-NEXT: or a0, a0, a1 +; RV32-NEXT: ret + %cmp = icmp ult i32 %x, %y + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 %y) + ret i32 %result +} + +; Test umax pattern +define i32 @test_ctselect_umax_generic(i32 %x, i32 %y) { +; RV64-LABEL: test_ctselect_umax_generic: +; RV64: # %bb.0: +; RV64-NEXT: sext.w a2, a0 +; RV64-NEXT: sext.w a3, a1 +; RV64-NEXT: sltu a2, a3, a2 +; RV64-NEXT: addi a3, a2, -1 +; RV64-NEXT: neg a2, a2 +; RV64-NEXT: and a1, a3, a1 +; RV64-NEXT: and a0, a2, a0 +; RV64-NEXT: or a0, a0, a1 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_umax_generic: +; RV32: # %bb.0: +; RV32-NEXT: sltu a2, a1, a0 +; RV32-NEXT: neg a2, a2 +; RV32-NEXT: and a0, a2, a0 +; RV32-NEXT: not a2, a2 +; RV32-NEXT: and a1, a2, a1 +; RV32-NEXT: or a0, a0, a1 +; RV32-NEXT: ret + %cmp = icmp ugt i32 %x, %y + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 %y) + ret i32 %result +} + +; Test abs pattern +define i32 @test_ctselect_abs(i32 %x) { +; RV64-LABEL: test_ctselect_abs: +; RV64: # %bb.0: +; RV64-NEXT: sext.w a1, a0 +; RV64-NEXT: negw a2, a0 +; RV64-NEXT: slti a1, a1, 0 +; RV64-NEXT: negw a3, a1 +; RV64-NEXT: addi a1, a1, -1 +; RV64-NEXT: and a2, a3, a2 +; RV64-NEXT: and a0, a1, a0 +; RV64-NEXT: or a0, a2, a0 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_abs: +; RV32: # %bb.0: +; RV32-NEXT: neg a1, a0 +; RV32-NEXT: srai a2, a0, 31 +; RV32-NEXT: and a1, a2, a1 +; RV32-NEXT: not a2, a2 +; RV32-NEXT: and a0, a2, a0 +; RV32-NEXT: or a0, a1, a0 +; RV32-NEXT: ret + %neg = sub i32 0, %x + %cmp = icmp slt i32 %x, 0 + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %neg, i32 %x) + ret i32 %result +} + +; Test nabs pattern (negative abs) +define i32 @test_ctselect_nabs(i32 %x) { +; RV64-LABEL: test_ctselect_nabs: +; RV64: # %bb.0: +; RV64-NEXT: sext.w a1, a0 +; RV64-NEXT: negw a2, a0 +; RV64-NEXT: slti a1, a1, 0 +; RV64-NEXT: addi a3, a1, -1 +; RV64-NEXT: neg a1, a1 +; RV64-NEXT: and a2, a3, a2 +; RV64-NEXT: and a0, a1, a0 +; RV64-NEXT: or a0, a0, a2 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_nabs: +; RV32: # %bb.0: +; RV32-NEXT: neg a1, a0 +; RV32-NEXT: srai a2, a0, 31 +; RV32-NEXT: and a0, a2, a0 +; RV32-NEXT: not a2, a2 +; RV32-NEXT: and a1, a2, a1 +; RV32-NEXT: or a0, a0, a1 +; RV32-NEXT: ret + %neg = sub i32 0, %x + %cmp = icmp slt i32 %x, 0 + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 %neg) + ret i32 %result +} + +; Test sign extension pattern +define i32 @test_ctselect_sign_extend(i32 %x) { +; RV64-LABEL: test_ctselect_sign_extend: +; RV64: # %bb.0: +; RV64-NEXT: sraiw a0, a0, 31 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_sign_extend: +; RV32: # %bb.0: +; RV32-NEXT: srai a0, a0, 31 +; RV32-NEXT: ret + %cmp = icmp slt i32 %x, 0 + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 -1, i32 0) + ret i32 %result +} + +; Test zero extension pattern +define i32 @test_ctselect_zero_extend(i32 %x) { +; RV64-LABEL: test_ctselect_zero_extend: +; RV64: # %bb.0: +; RV64-NEXT: sext.w a0, a0 +; RV64-NEXT: snez a0, a0 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_zero_extend: +; RV32: # %bb.0: +; RV32-NEXT: seqz a0, a0 +; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: andi a0, a0, 1 +; RV32-NEXT: ret + %cmp = icmp ne i32 %x, 0 + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 1, i32 0) + ret i32 %result +} + +; Test constant folding with known condition +define 
i32 @test_ctselect_constant_folding_true(i32 %a, i32 %b) { +; RV64-LABEL: test_ctselect_constant_folding_true: +; RV64: # %bb.0: +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_constant_folding_true: +; RV32: # %bb.0: +; RV32-NEXT: ret + %result = call i32 @llvm.ct.select.i32(i1 true, i32 %a, i32 %b) + ret i32 %result +} + +define i32 @test_ctselect_constant_folding_false(i32 %a, i32 %b) { +; RV64-LABEL: test_ctselect_constant_folding_false: +; RV64: # %bb.0: +; RV64-NEXT: mv a0, a1 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_constant_folding_false: +; RV32: # %bb.0: +; RV32-NEXT: mv a0, a1 +; RV32-NEXT: ret + %result = call i32 @llvm.ct.select.i32(i1 false, i32 %a, i32 %b) + ret i32 %result +} + +; Test with identical operands +define i32 @test_ctselect_identical_operands(i1 %cond, i32 %x) { +; RV64-LABEL: test_ctselect_identical_operands: +; RV64: # %bb.0: +; RV64-NEXT: mv a0, a1 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_identical_operands: +; RV32: # %bb.0: +; RV32-NEXT: andi a0, a0, 1 +; RV32-NEXT: slli a0, a0, 31 +; RV32-NEXT: srai a0, a0, 31 +; RV32-NEXT: and a2, a0, a1 +; RV32-NEXT: not a0, a0 +; RV32-NEXT: and a0, a0, a1 +; RV32-NEXT: or a0, a2, a0 +; RV32-NEXT: ret + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %x, i32 %x) + ret i32 %result +} + +; Test with inverted condition +define i32 @test_ctselect_inverted_condition(i32 %x, i32 %y, i32 %a, i32 %b) { +; RV64-LABEL: test_ctselect_inverted_condition: +; RV64: # %bb.0: +; RV64-NEXT: sext.w a1, a1 +; RV64-NEXT: sext.w a0, a0 +; RV64-NEXT: xor a0, a0, a1 +; RV64-NEXT: snez a0, a0 +; RV64-NEXT: addi a1, a0, -1 +; RV64-NEXT: neg a0, a0 +; RV64-NEXT: and a1, a1, a3 +; RV64-NEXT: and a0, a0, a2 +; RV64-NEXT: or a0, a0, a1 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_inverted_condition: +; RV32: # %bb.0: +; RV32-NEXT: xor a0, a0, a1 +; RV32-NEXT: seqz a0, a0 +; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: and a2, a0, a2 +; RV32-NEXT: not a0, a0 +; RV32-NEXT: and a0, a0, a3 +; RV32-NEXT: or a0, a2, a0 +; RV32-NEXT: ret + %cmp = icmp eq i32 %x, %y + %not_cmp = xor i1 %cmp, true + %result = call i32 @llvm.ct.select.i32(i1 %not_cmp, i32 %a, i32 %b) + ret i32 %result +} + +; Test chain of ct.select operations +define i32 @test_ctselect_chain(i1 %c1, i1 %c2, i1 %c3, i32 %a, i32 %b, i32 %c, i32 %d) { +; RV64-LABEL: test_ctselect_chain: +; RV64: # %bb.0: +; RV64-NEXT: andi a0, a0, 1 +; RV64-NEXT: andi a1, a1, 1 +; RV64-NEXT: andi a2, a2, 1 +; RV64-NEXT: addi a7, a0, -1 +; RV64-NEXT: neg a0, a0 +; RV64-NEXT: and a4, a7, a4 +; RV64-NEXT: neg a7, a1 +; RV64-NEXT: addi a1, a1, -1 +; RV64-NEXT: and a0, a0, a3 +; RV64-NEXT: neg a3, a2 +; RV64-NEXT: addi a2, a2, -1 +; RV64-NEXT: and a1, a1, a5 +; RV64-NEXT: or a0, a0, a4 +; RV64-NEXT: and a0, a7, a0 +; RV64-NEXT: or a0, a0, a1 +; RV64-NEXT: and a0, a3, a0 +; RV64-NEXT: and a1, a2, a6 +; RV64-NEXT: or a0, a0, a1 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_chain: +; RV32: # %bb.0: +; RV32-NEXT: andi a0, a0, 1 +; RV32-NEXT: andi a1, a1, 1 +; RV32-NEXT: andi a2, a2, 1 +; RV32-NEXT: slli a0, a0, 31 +; RV32-NEXT: slli a1, a1, 31 +; RV32-NEXT: slli a2, a2, 31 +; RV32-NEXT: srai a0, a0, 31 +; RV32-NEXT: srai a1, a1, 31 +; RV32-NEXT: srai a2, a2, 31 +; RV32-NEXT: and a3, a0, a3 +; RV32-NEXT: not a0, a0 +; RV32-NEXT: and a0, a0, a4 +; RV32-NEXT: not a4, a1 +; RV32-NEXT: and a4, a4, a5 +; RV32-NEXT: not a5, a2 +; RV32-NEXT: or a0, a3, a0 +; RV32-NEXT: and a0, a1, a0 +; RV32-NEXT: or a0, a0, a4 +; RV32-NEXT: and a0, a2, a0 +; RV32-NEXT: and a1, a5, a6 +; RV32-NEXT: or a0, a0, a1 +; 
RV32-NEXT: ret + %sel1 = call i32 @llvm.ct.select.i32(i1 %c1, i32 %a, i32 %b) + %sel2 = call i32 @llvm.ct.select.i32(i1 %c2, i32 %sel1, i32 %c) + %sel3 = call i32 @llvm.ct.select.i32(i1 %c3, i32 %sel2, i32 %d) + ret i32 %sel3 +} + +; Test for 64-bit operations (supported on all 64-bit architectures) +define i64 @test_ctselect_i64_smin_zero(i64 %x) { +; RV64-LABEL: test_ctselect_i64_smin_zero: +; RV64: # %bb.0: +; RV64-NEXT: srai a1, a0, 63 +; RV64-NEXT: and a0, a1, a0 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_i64_smin_zero: +; RV32: # %bb.0: +; RV32-NEXT: srai a2, a1, 31 +; RV32-NEXT: and a0, a2, a0 +; RV32-NEXT: and a1, a2, a1 +; RV32-NEXT: ret + %cmp = icmp slt i64 %x, 0 + %result = call i64 @llvm.ct.select.i64(i1 %cmp, i64 %x, i64 0) + ret i64 %result +} + +; Declare the intrinsics +declare i32 @llvm.ct.select.i32(i1, i32, i32) +declare i64 @llvm.ct.select.i64(i1, i64, i64) diff --git a/llvm/test/CodeGen/RISCV/ctselect-fallback.ll b/llvm/test/CodeGen/RISCV/ctselect-fallback.ll index f07d78bc84b5f..c9bf9b579cf29 100644 --- a/llvm/test/CodeGen/RISCV/ctselect-fallback.ll +++ b/llvm/test/CodeGen/RISCV/ctselect-fallback.ll @@ -1,17 +1,350 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=riscv64 -O3 | FileCheck %s --check-prefix=RV64 +; RUN: llc < %s -mtriple=riscv32 -O3 | FileCheck %s --check-prefix=RV32 -declare i32 @llvm.ct.select.i32(i1, i32, i32) +; Test basic ct.select functionality for scalar types +define i8 @test_ctselect_i8(i1 %cond, i8 %a, i8 %b) { +; RV64-LABEL: test_ctselect_i8: +; RV64: # %bb.0: +; RV64-NEXT: andi a0, a0, 1 +; RV64-NEXT: addi a3, a0, -1 +; RV64-NEXT: neg a0, a0 +; RV64-NEXT: and a2, a3, a2 +; RV64-NEXT: and a0, a0, a1 +; RV64-NEXT: or a0, a0, a2 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_i8: +; RV32: # %bb.0: +; RV32-NEXT: andi a0, a0, 1 +; RV32-NEXT: addi a3, a0, -1 +; RV32-NEXT: neg a0, a0 +; RV32-NEXT: and a2, a3, a2 +; RV32-NEXT: and a0, a0, a1 +; RV32-NEXT: or a0, a0, a2 +; RV32-NEXT: ret + %result = call i8 @llvm.ct.select.i8(i1 %cond, i8 %a, i8 %b) + ret i8 %result +} + +define i16 @test_ctselect_i16(i1 %cond, i16 %a, i16 %b) { +; RV64-LABEL: test_ctselect_i16: +; RV64: # %bb.0: +; RV64-NEXT: andi a0, a0, 1 +; RV64-NEXT: addi a3, a0, -1 +; RV64-NEXT: neg a0, a0 +; RV64-NEXT: and a2, a3, a2 +; RV64-NEXT: and a0, a0, a1 +; RV64-NEXT: or a0, a0, a2 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_i16: +; RV32: # %bb.0: +; RV32-NEXT: andi a0, a0, 1 +; RV32-NEXT: addi a3, a0, -1 +; RV32-NEXT: neg a0, a0 +; RV32-NEXT: and a2, a3, a2 +; RV32-NEXT: and a0, a0, a1 +; RV32-NEXT: or a0, a0, a2 +; RV32-NEXT: ret + %result = call i16 @llvm.ct.select.i16(i1 %cond, i16 %a, i16 %b) + ret i16 %result +} define i32 @test_ctselect_i32(i1 %cond, i32 %a, i32 %b) { ; RV64-LABEL: test_ctselect_i32: ; RV64: # %bb.0: -; RV64-NEXT: andi -; RV64-NEXT: addi -; RV64-NEXT: neg -; RV64-NEXT: and -; RV64-NEXT: and -; RV64-NEXT: or +; RV64-NEXT: andi a0, a0, 1 +; RV64-NEXT: addi a3, a0, -1 +; RV64-NEXT: neg a0, a0 +; RV64-NEXT: and a2, a3, a2 +; RV64-NEXT: and a0, a0, a1 +; RV64-NEXT: or a0, a0, a2 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_i32: +; RV32: # %bb.0: +; RV32-NEXT: andi a0, a0, 1 +; RV32-NEXT: slli a0, a0, 31 +; RV32-NEXT: srai a0, a0, 31 +; RV32-NEXT: and a1, a0, a1 +; RV32-NEXT: not a0, a0 +; RV32-NEXT: and a0, a0, a2 +; RV32-NEXT: or a0, a1, a0 +; RV32-NEXT: ret + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) + ret i32 %result +} + +define i64 
@test_ctselect_i64(i1 %cond, i64 %a, i64 %b) { +; RV64-LABEL: test_ctselect_i64: +; RV64: # %bb.0: +; RV64-NEXT: andi a0, a0, 1 +; RV64-NEXT: slli a0, a0, 63 +; RV64-NEXT: srai a0, a0, 63 +; RV64-NEXT: and a1, a0, a1 +; RV64-NEXT: not a0, a0 +; RV64-NEXT: and a0, a0, a2 +; RV64-NEXT: or a0, a1, a0 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_i64: +; RV32: # %bb.0: +; RV32-NEXT: andi a0, a0, 1 +; RV32-NEXT: addi a5, a0, -1 +; RV32-NEXT: neg a0, a0 +; RV32-NEXT: and a3, a5, a3 +; RV32-NEXT: and a1, a0, a1 +; RV32-NEXT: and a4, a5, a4 +; RV32-NEXT: and a2, a0, a2 +; RV32-NEXT: or a0, a1, a3 +; RV32-NEXT: or a1, a2, a4 +; RV32-NEXT: ret + %result = call i64 @llvm.ct.select.i64(i1 %cond, i64 %a, i64 %b) + ret i64 %result +} + +define ptr @test_ctselect_ptr(i1 %cond, ptr %a, ptr %b) { +; RV64-LABEL: test_ctselect_ptr: +; RV64: # %bb.0: +; RV64-NEXT: andi a0, a0, 1 +; RV64-NEXT: slli a0, a0, 63 +; RV64-NEXT: srai a0, a0, 63 +; RV64-NEXT: and a1, a0, a1 +; RV64-NEXT: not a0, a0 +; RV64-NEXT: and a0, a0, a2 +; RV64-NEXT: or a0, a1, a0 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_ptr: +; RV32: # %bb.0: +; RV32-NEXT: andi a0, a0, 1 +; RV32-NEXT: slli a0, a0, 31 +; RV32-NEXT: srai a0, a0, 31 +; RV32-NEXT: and a1, a0, a1 +; RV32-NEXT: not a0, a0 +; RV32-NEXT: and a0, a0, a2 +; RV32-NEXT: or a0, a1, a0 +; RV32-NEXT: ret + %result = call ptr @llvm.ct.select.p0(i1 %cond, ptr %a, ptr %b) + ret ptr %result +} + +; Test with constant conditions +define i32 @test_ctselect_const_true(i32 %a, i32 %b) { +; RV64-LABEL: test_ctselect_const_true: +; RV64: # %bb.0: +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_const_true: +; RV32: # %bb.0: +; RV32-NEXT: ret + %result = call i32 @llvm.ct.select.i32(i1 true, i32 %a, i32 %b) + ret i32 %result +} + +define i32 @test_ctselect_const_false(i32 %a, i32 %b) { +; RV64-LABEL: test_ctselect_const_false: +; RV64: # %bb.0: +; RV64-NEXT: mv a0, a1 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_const_false: +; RV32: # %bb.0: +; RV32-NEXT: mv a0, a1 +; RV32-NEXT: ret + %result = call i32 @llvm.ct.select.i32(i1 false, i32 %a, i32 %b) + ret i32 %result +} + +; Test with comparison conditions +define i32 @test_ctselect_icmp_eq(i32 %x, i32 %y, i32 %a, i32 %b) { +; RV64-LABEL: test_ctselect_icmp_eq: +; RV64: # %bb.0: +; RV64-NEXT: sext.w a1, a1 +; RV64-NEXT: sext.w a0, a0 +; RV64-NEXT: xor a0, a0, a1 +; RV64-NEXT: seqz a0, a0 +; RV64-NEXT: addi a1, a0, -1 +; RV64-NEXT: neg a0, a0 +; RV64-NEXT: and a1, a1, a3 +; RV64-NEXT: and a0, a0, a2 +; RV64-NEXT: or a0, a0, a1 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_icmp_eq: +; RV32: # %bb.0: +; RV32-NEXT: xor a0, a0, a1 +; RV32-NEXT: snez a0, a0 +; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: and a2, a0, a2 +; RV32-NEXT: not a0, a0 +; RV32-NEXT: and a0, a0, a3 +; RV32-NEXT: or a0, a2, a0 +; RV32-NEXT: ret + %cond = icmp eq i32 %x, %y + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) + ret i32 %result +} + +define i32 @test_ctselect_icmp_ne(i32 %x, i32 %y, i32 %a, i32 %b) { +; RV64-LABEL: test_ctselect_icmp_ne: +; RV64: # %bb.0: +; RV64-NEXT: sext.w a1, a1 +; RV64-NEXT: sext.w a0, a0 +; RV64-NEXT: xor a0, a0, a1 +; RV64-NEXT: snez a0, a0 +; RV64-NEXT: addi a1, a0, -1 +; RV64-NEXT: neg a0, a0 +; RV64-NEXT: and a1, a1, a3 +; RV64-NEXT: and a0, a0, a2 +; RV64-NEXT: or a0, a0, a1 ; RV64-NEXT: ret - %r = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) - ret i32 %r +; +; RV32-LABEL: test_ctselect_icmp_ne: +; RV32: # %bb.0: +; RV32-NEXT: xor a0, a0, a1 +; RV32-NEXT: seqz a0, a0 +; RV32-NEXT: addi a0, a0, -1 +; 
RV32-NEXT: and a2, a0, a2 +; RV32-NEXT: not a0, a0 +; RV32-NEXT: and a0, a0, a3 +; RV32-NEXT: or a0, a2, a0 +; RV32-NEXT: ret + %cond = icmp ne i32 %x, %y + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) + ret i32 %result } + +define i32 @test_ctselect_icmp_slt(i32 %x, i32 %y, i32 %a, i32 %b) { +; RV64-LABEL: test_ctselect_icmp_slt: +; RV64: # %bb.0: +; RV64-NEXT: sext.w a1, a1 +; RV64-NEXT: sext.w a0, a0 +; RV64-NEXT: slt a0, a0, a1 +; RV64-NEXT: addi a1, a0, -1 +; RV64-NEXT: neg a0, a0 +; RV64-NEXT: and a1, a1, a3 +; RV64-NEXT: and a0, a0, a2 +; RV64-NEXT: or a0, a0, a1 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_icmp_slt: +; RV32: # %bb.0: +; RV32-NEXT: slt a0, a0, a1 +; RV32-NEXT: neg a0, a0 +; RV32-NEXT: and a2, a0, a2 +; RV32-NEXT: not a0, a0 +; RV32-NEXT: and a0, a0, a3 +; RV32-NEXT: or a0, a2, a0 +; RV32-NEXT: ret + %cond = icmp slt i32 %x, %y + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) + ret i32 %result +} + +define i32 @test_ctselect_icmp_ult(i32 %x, i32 %y, i32 %a, i32 %b) { +; RV64-LABEL: test_ctselect_icmp_ult: +; RV64: # %bb.0: +; RV64-NEXT: sext.w a1, a1 +; RV64-NEXT: sext.w a0, a0 +; RV64-NEXT: sltu a0, a0, a1 +; RV64-NEXT: addi a1, a0, -1 +; RV64-NEXT: neg a0, a0 +; RV64-NEXT: and a1, a1, a3 +; RV64-NEXT: and a0, a0, a2 +; RV64-NEXT: or a0, a0, a1 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_icmp_ult: +; RV32: # %bb.0: +; RV32-NEXT: sltu a0, a0, a1 +; RV32-NEXT: neg a0, a0 +; RV32-NEXT: and a2, a0, a2 +; RV32-NEXT: not a0, a0 +; RV32-NEXT: and a0, a0, a3 +; RV32-NEXT: or a0, a2, a0 +; RV32-NEXT: ret + %cond = icmp ult i32 %x, %y + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) + ret i32 %result +} + +; Test with memory operands +define i32 @test_ctselect_load(i1 %cond, ptr %p1, ptr %p2) { +; RV64-LABEL: test_ctselect_load: +; RV64: # %bb.0: +; RV64-NEXT: lw a1, 0(a1) +; RV64-NEXT: lw a2, 0(a2) +; RV64-NEXT: andi a0, a0, 1 +; RV64-NEXT: addi a3, a0, -1 +; RV64-NEXT: neg a0, a0 +; RV64-NEXT: and a2, a3, a2 +; RV64-NEXT: and a0, a0, a1 +; RV64-NEXT: or a0, a0, a2 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_load: +; RV32: # %bb.0: +; RV32-NEXT: lw a1, 0(a1) +; RV32-NEXT: lw a2, 0(a2) +; RV32-NEXT: andi a0, a0, 1 +; RV32-NEXT: slli a0, a0, 31 +; RV32-NEXT: srai a0, a0, 31 +; RV32-NEXT: and a1, a0, a1 +; RV32-NEXT: not a0, a0 +; RV32-NEXT: and a0, a0, a2 +; RV32-NEXT: or a0, a1, a0 +; RV32-NEXT: ret + %a = load i32, ptr %p1 + %b = load i32, ptr %p2 + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) + ret i32 %result +} + +; Test nested ctselect calls +define i32 @test_ctselect_nested(i1 %cond1, i1 %cond2, i32 %a, i32 %b, i32 %c) { +; RV64-LABEL: test_ctselect_nested: +; RV64: # %bb.0: +; RV64-NEXT: andi a1, a1, 1 +; RV64-NEXT: andi a0, a0, 1 +; RV64-NEXT: addi a5, a1, -1 +; RV64-NEXT: neg a1, a1 +; RV64-NEXT: and a3, a5, a3 +; RV64-NEXT: neg a5, a0 +; RV64-NEXT: addi a0, a0, -1 +; RV64-NEXT: and a1, a1, a2 +; RV64-NEXT: or a1, a1, a3 +; RV64-NEXT: and a1, a5, a1 +; RV64-NEXT: and a0, a0, a4 +; RV64-NEXT: or a0, a1, a0 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_nested: +; RV32: # %bb.0: +; RV32-NEXT: andi a1, a1, 1 +; RV32-NEXT: andi a0, a0, 1 +; RV32-NEXT: slli a1, a1, 31 +; RV32-NEXT: slli a0, a0, 31 +; RV32-NEXT: srai a1, a1, 31 +; RV32-NEXT: srai a0, a0, 31 +; RV32-NEXT: and a2, a1, a2 +; RV32-NEXT: not a1, a1 +; RV32-NEXT: and a1, a1, a3 +; RV32-NEXT: not a3, a0 +; RV32-NEXT: or a1, a2, a1 +; RV32-NEXT: and a0, a0, a1 +; RV32-NEXT: and a3, a3, a4 +; RV32-NEXT: or a0, a0, a3 +; 
RV32-NEXT: ret + %inner = call i32 @llvm.ct.select.i32(i1 %cond2, i32 %a, i32 %b) + %result = call i32 @llvm.ct.select.i32(i1 %cond1, i32 %inner, i32 %c) + ret i32 %result +} + +; Declare the intrinsics +declare i8 @llvm.ct.select.i8(i1, i8, i8) +declare i16 @llvm.ct.select.i16(i1, i16, i16) +declare i32 @llvm.ct.select.i32(i1, i32, i32) +declare i64 @llvm.ct.select.i64(i1, i64, i64) +declare ptr @llvm.ct.select.p0(i1, ptr, ptr) diff --git a/llvm/test/CodeGen/WebAssembly/ctselect-fallback-edge-cases.ll b/llvm/test/CodeGen/WebAssembly/ctselect-fallback-edge-cases.ll new file mode 100644 index 0000000000000..18fcb8ac243d4 --- /dev/null +++ b/llvm/test/CodeGen/WebAssembly/ctselect-fallback-edge-cases.ll @@ -0,0 +1,415 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=wasm32-unknown-unknown -O3 -filetype=asm | FileCheck %s --check-prefix=W32 +; RUN: llc < %s -mtriple=wasm64-unknown-unknown -O3 -filetype=asm | FileCheck %s --check-prefix=W64 + +; Test with small integer types +define i1 @test_ctselect_i1(i1 %cond, i1 %a, i1 %b) { +; W32-LABEL: test_ctselect_i1: +; W32: .functype test_ctselect_i1 (i32, i32, i32) -> (i32) +; W32-NEXT: # %bb.0: +; W32-NEXT: i32.const 0 +; W32-NEXT: local.get 0 +; W32-NEXT: i32.sub +; W32-NEXT: local.tee 0 +; W32-NEXT: local.get 1 +; W32-NEXT: i32.and +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const 1 +; W32-NEXT: i32.xor +; W32-NEXT: local.get 2 +; W32-NEXT: i32.and +; W32-NEXT: i32.or +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_i1: +; W64: .functype test_ctselect_i1 (i32, i32, i32) -> (i32) +; W64-NEXT: # %bb.0: +; W64-NEXT: i32.const 0 +; W64-NEXT: local.get 0 +; W64-NEXT: i32.sub +; W64-NEXT: local.tee 0 +; W64-NEXT: local.get 1 +; W64-NEXT: i32.and +; W64-NEXT: local.get 0 +; W64-NEXT: i32.const 1 +; W64-NEXT: i32.xor +; W64-NEXT: local.get 2 +; W64-NEXT: i32.and +; W64-NEXT: i32.or +; W64-NEXT: # fallthrough-return + %result = call i1 @llvm.ct.select.i1(i1 %cond, i1 %a, i1 %b) + ret i1 %result +} + +; Test with extremal values +define i32 @test_ctselect_extremal_values(i1 %cond) { +; W32-LABEL: test_ctselect_extremal_values: +; W32: .functype test_ctselect_extremal_values (i32) -> (i32) +; W32-NEXT: # %bb.0: +; W32-NEXT: i32.const 0 +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const 1 +; W32-NEXT: i32.and +; W32-NEXT: i32.const 1 +; W32-NEXT: i32.and +; W32-NEXT: i32.sub +; W32-NEXT: local.tee 0 +; W32-NEXT: i32.const 2147483647 +; W32-NEXT: i32.and +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.xor +; W32-NEXT: i32.const -2147483648 +; W32-NEXT: i32.and +; W32-NEXT: i32.or +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_extremal_values: +; W64: .functype test_ctselect_extremal_values (i32) -> (i32) +; W64-NEXT: # %bb.0: +; W64-NEXT: i32.const 0 +; W64-NEXT: local.get 0 +; W64-NEXT: i32.const 1 +; W64-NEXT: i32.and +; W64-NEXT: i32.const 1 +; W64-NEXT: i32.and +; W64-NEXT: i32.sub +; W64-NEXT: local.tee 0 +; W64-NEXT: i32.const 2147483647 +; W64-NEXT: i32.and +; W64-NEXT: local.get 0 +; W64-NEXT: i32.const -1 +; W64-NEXT: i32.xor +; W64-NEXT: i32.const -2147483648 +; W64-NEXT: i32.and +; W64-NEXT: i32.or +; W64-NEXT: # fallthrough-return + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 2147483647, i32 -2147483648) + ret i32 %result +} + +; Test with null pointers +define ptr @test_ctselect_null_ptr(i1 %cond, ptr %ptr) { +; W32-LABEL: test_ctselect_null_ptr: +; W32: .functype test_ctselect_null_ptr (i32, i32) -> (i32) +; 
W32-NEXT: # %bb.0: +; W32-NEXT: i32.const 0 +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const 1 +; W32-NEXT: i32.and +; W32-NEXT: i32.const 1 +; W32-NEXT: i32.and +; W32-NEXT: i32.sub +; W32-NEXT: local.get 1 +; W32-NEXT: i32.and +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_null_ptr: +; W64: .functype test_ctselect_null_ptr (i32, i64) -> (i64) +; W64-NEXT: # %bb.0: +; W64-NEXT: i64.const 0 +; W64-NEXT: local.get 0 +; W64-NEXT: i64.extend_i32_u +; W64-NEXT: i64.const 1 +; W64-NEXT: i64.and +; W64-NEXT: i64.const 1 +; W64-NEXT: i64.and +; W64-NEXT: i64.sub +; W64-NEXT: local.get 1 +; W64-NEXT: i64.and +; W64-NEXT: # fallthrough-return + %result = call ptr @llvm.ct.select.p0(i1 %cond, ptr %ptr, ptr null) + ret ptr %result +} + +; Test with function pointers +define ptr @test_ctselect_function_ptr(i1 %cond, ptr %func1, ptr %func2) { +; W32-LABEL: test_ctselect_function_ptr: +; W32: .functype test_ctselect_function_ptr (i32, i32, i32) -> (i32) +; W32-NEXT: # %bb.0: +; W32-NEXT: i32.const 0 +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const 1 +; W32-NEXT: i32.and +; W32-NEXT: i32.const 1 +; W32-NEXT: i32.and +; W32-NEXT: i32.sub +; W32-NEXT: local.tee 0 +; W32-NEXT: local.get 1 +; W32-NEXT: i32.and +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.xor +; W32-NEXT: local.get 2 +; W32-NEXT: i32.and +; W32-NEXT: i32.or +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_function_ptr: +; W64: .functype test_ctselect_function_ptr (i32, i64, i64) -> (i64) +; W64-NEXT: .local i64 +; W64-NEXT: # %bb.0: +; W64-NEXT: i64.const 0 +; W64-NEXT: local.get 0 +; W64-NEXT: i64.extend_i32_u +; W64-NEXT: i64.const 1 +; W64-NEXT: i64.and +; W64-NEXT: i64.const 1 +; W64-NEXT: i64.and +; W64-NEXT: i64.sub +; W64-NEXT: local.tee 3 +; W64-NEXT: local.get 1 +; W64-NEXT: i64.and +; W64-NEXT: local.get 3 +; W64-NEXT: i64.const -1 +; W64-NEXT: i64.xor +; W64-NEXT: local.get 2 +; W64-NEXT: i64.and +; W64-NEXT: i64.or +; W64-NEXT: # fallthrough-return + %result = call ptr @llvm.ct.select.p0(i1 %cond, ptr %func1, ptr %func2) + ret ptr %result +} + +; Test with condition from icmp on pointers +define ptr @test_ctselect_ptr_cmp(ptr %p1, ptr %p2, ptr %a, ptr %b) { +; W32-LABEL: test_ctselect_ptr_cmp: +; W32: .functype test_ctselect_ptr_cmp (i32, i32, i32, i32) -> (i32) +; W32-NEXT: # %bb.0: +; W32-NEXT: i32.const 0 +; W32-NEXT: local.get 0 +; W32-NEXT: local.get 1 +; W32-NEXT: i32.eq +; W32-NEXT: i32.sub +; W32-NEXT: local.tee 1 +; W32-NEXT: local.get 2 +; W32-NEXT: i32.and +; W32-NEXT: local.get 1 +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.xor +; W32-NEXT: local.get 3 +; W32-NEXT: i32.and +; W32-NEXT: i32.or +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_ptr_cmp: +; W64: .functype test_ctselect_ptr_cmp (i64, i64, i64, i64) -> (i64) +; W64-NEXT: # %bb.0: +; W64-NEXT: i64.const 0 +; W64-NEXT: local.get 0 +; W64-NEXT: local.get 1 +; W64-NEXT: i64.eq +; W64-NEXT: i64.extend_i32_u +; W64-NEXT: i64.const 1 +; W64-NEXT: i64.and +; W64-NEXT: i64.sub +; W64-NEXT: local.tee 1 +; W64-NEXT: local.get 2 +; W64-NEXT: i64.and +; W64-NEXT: local.get 1 +; W64-NEXT: i64.const -1 +; W64-NEXT: i64.xor +; W64-NEXT: local.get 3 +; W64-NEXT: i64.and +; W64-NEXT: i64.or +; W64-NEXT: # fallthrough-return + %cmp = icmp eq ptr %p1, %p2 + %result = call ptr @llvm.ct.select.p0(i1 %cmp, ptr %a, ptr %b) + ret ptr %result +} + +; Test with struct pointer types +%struct.pair = type { i32, i32 } + +define ptr @test_ctselect_struct_ptr(i1 %cond, ptr %a, ptr %b) { +; W32-LABEL: 
test_ctselect_struct_ptr: +; W32: .functype test_ctselect_struct_ptr (i32, i32, i32) -> (i32) +; W32-NEXT: # %bb.0: +; W32-NEXT: i32.const 0 +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const 1 +; W32-NEXT: i32.and +; W32-NEXT: i32.const 1 +; W32-NEXT: i32.and +; W32-NEXT: i32.sub +; W32-NEXT: local.tee 0 +; W32-NEXT: local.get 1 +; W32-NEXT: i32.and +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.xor +; W32-NEXT: local.get 2 +; W32-NEXT: i32.and +; W32-NEXT: i32.or +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_struct_ptr: +; W64: .functype test_ctselect_struct_ptr (i32, i64, i64) -> (i64) +; W64-NEXT: .local i64 +; W64-NEXT: # %bb.0: +; W64-NEXT: i64.const 0 +; W64-NEXT: local.get 0 +; W64-NEXT: i64.extend_i32_u +; W64-NEXT: i64.const 1 +; W64-NEXT: i64.and +; W64-NEXT: i64.const 1 +; W64-NEXT: i64.and +; W64-NEXT: i64.sub +; W64-NEXT: local.tee 3 +; W64-NEXT: local.get 1 +; W64-NEXT: i64.and +; W64-NEXT: local.get 3 +; W64-NEXT: i64.const -1 +; W64-NEXT: i64.xor +; W64-NEXT: local.get 2 +; W64-NEXT: i64.and +; W64-NEXT: i64.or +; W64-NEXT: # fallthrough-return + %result = call ptr @llvm.ct.select.p0(i1 %cond, ptr %a, ptr %b) + ret ptr %result +} + +; Test with deeply nested conditions +define i32 @test_ctselect_deeply_nested(i1 %c1, i1 %c2, i1 %c3, i1 %c4, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e) { +; W32-LABEL: test_ctselect_deeply_nested: +; W32: .functype test_ctselect_deeply_nested (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> (i32) +; W32-NEXT: # %bb.0: +; W32-NEXT: i32.const 0 +; W32-NEXT: local.get 3 +; W32-NEXT: i32.const 1 +; W32-NEXT: i32.and +; W32-NEXT: i32.const 1 +; W32-NEXT: i32.and +; W32-NEXT: i32.sub +; W32-NEXT: local.tee 3 +; W32-NEXT: i32.const 0 +; W32-NEXT: local.get 2 +; W32-NEXT: i32.const 1 +; W32-NEXT: i32.and +; W32-NEXT: i32.const 1 +; W32-NEXT: i32.and +; W32-NEXT: i32.sub +; W32-NEXT: local.tee 2 +; W32-NEXT: i32.const 0 +; W32-NEXT: local.get 1 +; W32-NEXT: i32.const 1 +; W32-NEXT: i32.and +; W32-NEXT: i32.const 1 +; W32-NEXT: i32.and +; W32-NEXT: i32.sub +; W32-NEXT: local.tee 1 +; W32-NEXT: i32.const 0 +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const 1 +; W32-NEXT: i32.and +; W32-NEXT: i32.const 1 +; W32-NEXT: i32.and +; W32-NEXT: i32.sub +; W32-NEXT: local.tee 0 +; W32-NEXT: local.get 4 +; W32-NEXT: i32.and +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.xor +; W32-NEXT: local.get 5 +; W32-NEXT: i32.and +; W32-NEXT: i32.or +; W32-NEXT: i32.and +; W32-NEXT: local.get 1 +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.xor +; W32-NEXT: local.get 6 +; W32-NEXT: i32.and +; W32-NEXT: i32.or +; W32-NEXT: i32.and +; W32-NEXT: local.get 2 +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.xor +; W32-NEXT: local.get 7 +; W32-NEXT: i32.and +; W32-NEXT: i32.or +; W32-NEXT: i32.and +; W32-NEXT: local.get 3 +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.xor +; W32-NEXT: local.get 8 +; W32-NEXT: i32.and +; W32-NEXT: i32.or +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_deeply_nested: +; W64: .functype test_ctselect_deeply_nested (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> (i32) +; W64-NEXT: # %bb.0: +; W64-NEXT: i32.const 0 +; W64-NEXT: local.get 3 +; W64-NEXT: i32.const 1 +; W64-NEXT: i32.and +; W64-NEXT: i32.const 1 +; W64-NEXT: i32.and +; W64-NEXT: i32.sub +; W64-NEXT: local.tee 3 +; W64-NEXT: i32.const 0 +; W64-NEXT: local.get 2 +; W64-NEXT: i32.const 1 +; W64-NEXT: i32.and +; W64-NEXT: i32.const 1 +; W64-NEXT: i32.and +; W64-NEXT: i32.sub +; W64-NEXT: local.tee 2 +; W64-NEXT: i32.const 0 +; W64-NEXT: 
local.get 1 +; W64-NEXT: i32.const 1 +; W64-NEXT: i32.and +; W64-NEXT: i32.const 1 +; W64-NEXT: i32.and +; W64-NEXT: i32.sub +; W64-NEXT: local.tee 1 +; W64-NEXT: i32.const 0 +; W64-NEXT: local.get 0 +; W64-NEXT: i32.const 1 +; W64-NEXT: i32.and +; W64-NEXT: i32.const 1 +; W64-NEXT: i32.and +; W64-NEXT: i32.sub +; W64-NEXT: local.tee 0 +; W64-NEXT: local.get 4 +; W64-NEXT: i32.and +; W64-NEXT: local.get 0 +; W64-NEXT: i32.const -1 +; W64-NEXT: i32.xor +; W64-NEXT: local.get 5 +; W64-NEXT: i32.and +; W64-NEXT: i32.or +; W64-NEXT: i32.and +; W64-NEXT: local.get 1 +; W64-NEXT: i32.const -1 +; W64-NEXT: i32.xor +; W64-NEXT: local.get 6 +; W64-NEXT: i32.and +; W64-NEXT: i32.or +; W64-NEXT: i32.and +; W64-NEXT: local.get 2 +; W64-NEXT: i32.const -1 +; W64-NEXT: i32.xor +; W64-NEXT: local.get 7 +; W64-NEXT: i32.and +; W64-NEXT: i32.or +; W64-NEXT: i32.and +; W64-NEXT: local.get 3 +; W64-NEXT: i32.const -1 +; W64-NEXT: i32.xor +; W64-NEXT: local.get 8 +; W64-NEXT: i32.and +; W64-NEXT: i32.or +; W64-NEXT: # fallthrough-return + %sel1 = call i32 @llvm.ct.select.i32(i1 %c1, i32 %a, i32 %b) + %sel2 = call i32 @llvm.ct.select.i32(i1 %c2, i32 %sel1, i32 %c) + %sel3 = call i32 @llvm.ct.select.i32(i1 %c3, i32 %sel2, i32 %d) + %sel4 = call i32 @llvm.ct.select.i32(i1 %c4, i32 %sel3, i32 %e) + ret i32 %sel4 +} + +; Declare the intrinsics +declare i1 @llvm.ct.select.i1(i1, i1, i1) +declare i32 @llvm.ct.select.i32(i1, i32, i32) +declare ptr @llvm.ct.select.p0(i1, ptr, ptr) diff --git a/llvm/test/CodeGen/WebAssembly/ctselect-fallback-patterns.ll b/llvm/test/CodeGen/WebAssembly/ctselect-fallback-patterns.ll new file mode 100644 index 0000000000000..3d6035b64430e --- /dev/null +++ b/llvm/test/CodeGen/WebAssembly/ctselect-fallback-patterns.ll @@ -0,0 +1,673 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=wasm32-unknown-unknown -O3 -filetype=asm | FileCheck %s --check-prefix=W32 +; RUN: llc < %s -mtriple=wasm64-unknown-unknown -O3 -filetype=asm | FileCheck %s --check-prefix=W64 + +; Test smin(x, 0) pattern +define i32 @test_ctselect_smin_zero(i32 %x) { +; W32-LABEL: test_ctselect_smin_zero: +; W32: .functype test_ctselect_smin_zero (i32) -> (i32) +; W32-NEXT: # %bb.0: +; W32-NEXT: i32.const 0 +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const 0 +; W32-NEXT: i32.lt_s +; W32-NEXT: i32.sub +; W32-NEXT: local.get 0 +; W32-NEXT: i32.and +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_smin_zero: +; W64: .functype test_ctselect_smin_zero (i32) -> (i32) +; W64-NEXT: # %bb.0: +; W64-NEXT: i32.const 0 +; W64-NEXT: local.get 0 +; W64-NEXT: i32.const 0 +; W64-NEXT: i32.lt_s +; W64-NEXT: i32.sub +; W64-NEXT: local.get 0 +; W64-NEXT: i32.and +; W64-NEXT: # fallthrough-return + %cmp = icmp slt i32 %x, 0 + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 0) + ret i32 %result +} + +; Test smax(x, 0) pattern +define i32 @test_ctselect_smax_zero(i32 %x) { +; W32-LABEL: test_ctselect_smax_zero: +; W32: .functype test_ctselect_smax_zero (i32) -> (i32) +; W32-NEXT: # %bb.0: +; W32-NEXT: i32.const 0 +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const 0 +; W32-NEXT: i32.gt_s +; W32-NEXT: i32.sub +; W32-NEXT: local.get 0 +; W32-NEXT: i32.and +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_smax_zero: +; W64: .functype test_ctselect_smax_zero (i32) -> (i32) +; W64-NEXT: # %bb.0: +; W64-NEXT: i32.const 0 +; W64-NEXT: local.get 0 +; W64-NEXT: i32.const 0 +; W64-NEXT: i32.gt_s +; W64-NEXT: i32.sub +; W64-NEXT: local.get 0 +; 
W64-NEXT: i32.and +; W64-NEXT: # fallthrough-return + %cmp = icmp sgt i32 %x, 0 + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 0) + ret i32 %result +} + +; Test generic smin pattern +define i32 @test_ctselect_smin_generic(i32 %x, i32 %y) { +; W32-LABEL: test_ctselect_smin_generic: +; W32: .functype test_ctselect_smin_generic (i32, i32) -> (i32) +; W32-NEXT: .local i32 +; W32-NEXT: # %bb.0: +; W32-NEXT: i32.const 0 +; W32-NEXT: local.get 0 +; W32-NEXT: local.get 1 +; W32-NEXT: i32.lt_s +; W32-NEXT: i32.sub +; W32-NEXT: local.tee 2 +; W32-NEXT: local.get 0 +; W32-NEXT: i32.and +; W32-NEXT: local.get 2 +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.xor +; W32-NEXT: local.get 1 +; W32-NEXT: i32.and +; W32-NEXT: i32.or +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_smin_generic: +; W64: .functype test_ctselect_smin_generic (i32, i32) -> (i32) +; W64-NEXT: .local i32 +; W64-NEXT: # %bb.0: +; W64-NEXT: i32.const 0 +; W64-NEXT: local.get 0 +; W64-NEXT: local.get 1 +; W64-NEXT: i32.lt_s +; W64-NEXT: i32.sub +; W64-NEXT: local.tee 2 +; W64-NEXT: local.get 0 +; W64-NEXT: i32.and +; W64-NEXT: local.get 2 +; W64-NEXT: i32.const -1 +; W64-NEXT: i32.xor +; W64-NEXT: local.get 1 +; W64-NEXT: i32.and +; W64-NEXT: i32.or +; W64-NEXT: # fallthrough-return + %cmp = icmp slt i32 %x, %y + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 %y) + ret i32 %result +} + +; Test generic smax pattern +define i32 @test_ctselect_smax_generic(i32 %x, i32 %y) { +; W32-LABEL: test_ctselect_smax_generic: +; W32: .functype test_ctselect_smax_generic (i32, i32) -> (i32) +; W32-NEXT: .local i32 +; W32-NEXT: # %bb.0: +; W32-NEXT: i32.const 0 +; W32-NEXT: local.get 0 +; W32-NEXT: local.get 1 +; W32-NEXT: i32.gt_s +; W32-NEXT: i32.sub +; W32-NEXT: local.tee 2 +; W32-NEXT: local.get 0 +; W32-NEXT: i32.and +; W32-NEXT: local.get 2 +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.xor +; W32-NEXT: local.get 1 +; W32-NEXT: i32.and +; W32-NEXT: i32.or +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_smax_generic: +; W64: .functype test_ctselect_smax_generic (i32, i32) -> (i32) +; W64-NEXT: .local i32 +; W64-NEXT: # %bb.0: +; W64-NEXT: i32.const 0 +; W64-NEXT: local.get 0 +; W64-NEXT: local.get 1 +; W64-NEXT: i32.gt_s +; W64-NEXT: i32.sub +; W64-NEXT: local.tee 2 +; W64-NEXT: local.get 0 +; W64-NEXT: i32.and +; W64-NEXT: local.get 2 +; W64-NEXT: i32.const -1 +; W64-NEXT: i32.xor +; W64-NEXT: local.get 1 +; W64-NEXT: i32.and +; W64-NEXT: i32.or +; W64-NEXT: # fallthrough-return + %cmp = icmp sgt i32 %x, %y + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 %y) + ret i32 %result +} + +; Test umin pattern +define i32 @test_ctselect_umin_generic(i32 %x, i32 %y) { +; W32-LABEL: test_ctselect_umin_generic: +; W32: .functype test_ctselect_umin_generic (i32, i32) -> (i32) +; W32-NEXT: .local i32 +; W32-NEXT: # %bb.0: +; W32-NEXT: i32.const 0 +; W32-NEXT: local.get 0 +; W32-NEXT: local.get 1 +; W32-NEXT: i32.lt_u +; W32-NEXT: i32.sub +; W32-NEXT: local.tee 2 +; W32-NEXT: local.get 0 +; W32-NEXT: i32.and +; W32-NEXT: local.get 2 +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.xor +; W32-NEXT: local.get 1 +; W32-NEXT: i32.and +; W32-NEXT: i32.or +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_umin_generic: +; W64: .functype test_ctselect_umin_generic (i32, i32) -> (i32) +; W64-NEXT: .local i32 +; W64-NEXT: # %bb.0: +; W64-NEXT: i32.const 0 +; W64-NEXT: local.get 0 +; W64-NEXT: local.get 1 +; W64-NEXT: i32.lt_u +; W64-NEXT: i32.sub +; W64-NEXT: local.tee 2 +; W64-NEXT: local.get 
0 +; W64-NEXT: i32.and +; W64-NEXT: local.get 2 +; W64-NEXT: i32.const -1 +; W64-NEXT: i32.xor +; W64-NEXT: local.get 1 +; W64-NEXT: i32.and +; W64-NEXT: i32.or +; W64-NEXT: # fallthrough-return + %cmp = icmp ult i32 %x, %y + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 %y) + ret i32 %result +} + +; Test umax pattern +define i32 @test_ctselect_umax_generic(i32 %x, i32 %y) { +; W32-LABEL: test_ctselect_umax_generic: +; W32: .functype test_ctselect_umax_generic (i32, i32) -> (i32) +; W32-NEXT: .local i32 +; W32-NEXT: # %bb.0: +; W32-NEXT: i32.const 0 +; W32-NEXT: local.get 0 +; W32-NEXT: local.get 1 +; W32-NEXT: i32.gt_u +; W32-NEXT: i32.sub +; W32-NEXT: local.tee 2 +; W32-NEXT: local.get 0 +; W32-NEXT: i32.and +; W32-NEXT: local.get 2 +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.xor +; W32-NEXT: local.get 1 +; W32-NEXT: i32.and +; W32-NEXT: i32.or +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_umax_generic: +; W64: .functype test_ctselect_umax_generic (i32, i32) -> (i32) +; W64-NEXT: .local i32 +; W64-NEXT: # %bb.0: +; W64-NEXT: i32.const 0 +; W64-NEXT: local.get 0 +; W64-NEXT: local.get 1 +; W64-NEXT: i32.gt_u +; W64-NEXT: i32.sub +; W64-NEXT: local.tee 2 +; W64-NEXT: local.get 0 +; W64-NEXT: i32.and +; W64-NEXT: local.get 2 +; W64-NEXT: i32.const -1 +; W64-NEXT: i32.xor +; W64-NEXT: local.get 1 +; W64-NEXT: i32.and +; W64-NEXT: i32.or +; W64-NEXT: # fallthrough-return + %cmp = icmp ugt i32 %x, %y + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 %y) + ret i32 %result +} + +; Test abs pattern +define i32 @test_ctselect_abs(i32 %x) { +; W32-LABEL: test_ctselect_abs: +; W32: .functype test_ctselect_abs (i32) -> (i32) +; W32-NEXT: .local i32 +; W32-NEXT: # %bb.0: +; W32-NEXT: i32.const 0 +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const 0 +; W32-NEXT: i32.lt_s +; W32-NEXT: i32.sub +; W32-NEXT: local.tee 1 +; W32-NEXT: i32.const 0 +; W32-NEXT: local.get 0 +; W32-NEXT: i32.sub +; W32-NEXT: i32.and +; W32-NEXT: local.get 1 +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.xor +; W32-NEXT: local.get 0 +; W32-NEXT: i32.and +; W32-NEXT: i32.or +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_abs: +; W64: .functype test_ctselect_abs (i32) -> (i32) +; W64-NEXT: .local i32 +; W64-NEXT: # %bb.0: +; W64-NEXT: i32.const 0 +; W64-NEXT: local.get 0 +; W64-NEXT: i32.const 0 +; W64-NEXT: i32.lt_s +; W64-NEXT: i32.sub +; W64-NEXT: local.tee 1 +; W64-NEXT: i32.const 0 +; W64-NEXT: local.get 0 +; W64-NEXT: i32.sub +; W64-NEXT: i32.and +; W64-NEXT: local.get 1 +; W64-NEXT: i32.const -1 +; W64-NEXT: i32.xor +; W64-NEXT: local.get 0 +; W64-NEXT: i32.and +; W64-NEXT: i32.or +; W64-NEXT: # fallthrough-return + %neg = sub i32 0, %x + %cmp = icmp slt i32 %x, 0 + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %neg, i32 %x) + ret i32 %result +} + +; Test nabs pattern (negative abs) +define i32 @test_ctselect_nabs(i32 %x) { +; W32-LABEL: test_ctselect_nabs: +; W32: .functype test_ctselect_nabs (i32) -> (i32) +; W32-NEXT: .local i32 +; W32-NEXT: # %bb.0: +; W32-NEXT: i32.const 0 +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const 0 +; W32-NEXT: i32.lt_s +; W32-NEXT: i32.sub +; W32-NEXT: local.tee 1 +; W32-NEXT: local.get 0 +; W32-NEXT: i32.and +; W32-NEXT: local.get 1 +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.xor +; W32-NEXT: i32.const 0 +; W32-NEXT: local.get 0 +; W32-NEXT: i32.sub +; W32-NEXT: i32.and +; W32-NEXT: i32.or +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_nabs: +; W64: .functype test_ctselect_nabs (i32) -> (i32) +; W64-NEXT: .local i32 +; 
W64-NEXT: # %bb.0: +; W64-NEXT: i32.const 0 +; W64-NEXT: local.get 0 +; W64-NEXT: i32.const 0 +; W64-NEXT: i32.lt_s +; W64-NEXT: i32.sub +; W64-NEXT: local.tee 1 +; W64-NEXT: local.get 0 +; W64-NEXT: i32.and +; W64-NEXT: local.get 1 +; W64-NEXT: i32.const -1 +; W64-NEXT: i32.xor +; W64-NEXT: i32.const 0 +; W64-NEXT: local.get 0 +; W64-NEXT: i32.sub +; W64-NEXT: i32.and +; W64-NEXT: i32.or +; W64-NEXT: # fallthrough-return + %neg = sub i32 0, %x + %cmp = icmp slt i32 %x, 0 + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 %neg) + ret i32 %result +} + +; Test sign extension pattern +define i32 @test_ctselect_sign_extend(i32 %x) { +; W32-LABEL: test_ctselect_sign_extend: +; W32: .functype test_ctselect_sign_extend (i32) -> (i32) +; W32-NEXT: # %bb.0: +; W32-NEXT: i32.const 0 +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const 0 +; W32-NEXT: i32.lt_s +; W32-NEXT: i32.sub +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_sign_extend: +; W64: .functype test_ctselect_sign_extend (i32) -> (i32) +; W64-NEXT: # %bb.0: +; W64-NEXT: i32.const 0 +; W64-NEXT: local.get 0 +; W64-NEXT: i32.const 0 +; W64-NEXT: i32.lt_s +; W64-NEXT: i32.sub +; W64-NEXT: # fallthrough-return + %cmp = icmp slt i32 %x, 0 + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 -1, i32 0) + ret i32 %result +} + +; Test zero extension pattern +define i32 @test_ctselect_zero_extend(i32 %x) { +; W32-LABEL: test_ctselect_zero_extend: +; W32: .functype test_ctselect_zero_extend (i32) -> (i32) +; W32-NEXT: # %bb.0: +; W32-NEXT: i32.const 0 +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const 0 +; W32-NEXT: i32.ne +; W32-NEXT: i32.sub +; W32-NEXT: i32.const 1 +; W32-NEXT: i32.and +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_zero_extend: +; W64: .functype test_ctselect_zero_extend (i32) -> (i32) +; W64-NEXT: # %bb.0: +; W64-NEXT: i32.const 0 +; W64-NEXT: local.get 0 +; W64-NEXT: i32.const 0 +; W64-NEXT: i32.ne +; W64-NEXT: i32.sub +; W64-NEXT: i32.const 1 +; W64-NEXT: i32.and +; W64-NEXT: # fallthrough-return + %cmp = icmp ne i32 %x, 0 + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 1, i32 0) + ret i32 %result +} + +; Test constant folding with known condition +define i32 @test_ctselect_constant_folding_true(i32 %a, i32 %b) { +; W32-LABEL: test_ctselect_constant_folding_true: +; W32: .functype test_ctselect_constant_folding_true (i32, i32) -> (i32) +; W32-NEXT: # %bb.0: +; W32-NEXT: local.get 0 +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_constant_folding_true: +; W64: .functype test_ctselect_constant_folding_true (i32, i32) -> (i32) +; W64-NEXT: # %bb.0: +; W64-NEXT: local.get 0 +; W64-NEXT: # fallthrough-return + %result = call i32 @llvm.ct.select.i32(i1 true, i32 %a, i32 %b) + ret i32 %result +} + +define i32 @test_ctselect_constant_folding_false(i32 %a, i32 %b) { +; W32-LABEL: test_ctselect_constant_folding_false: +; W32: .functype test_ctselect_constant_folding_false (i32, i32) -> (i32) +; W32-NEXT: # %bb.0: +; W32-NEXT: local.get 1 +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_constant_folding_false: +; W64: .functype test_ctselect_constant_folding_false (i32, i32) -> (i32) +; W64-NEXT: # %bb.0: +; W64-NEXT: local.get 1 +; W64-NEXT: # fallthrough-return + %result = call i32 @llvm.ct.select.i32(i1 false, i32 %a, i32 %b) + ret i32 %result +} + +; Test with identical operands +define i32 @test_ctselect_identical_operands(i1 %cond, i32 %x) { +; W32-LABEL: test_ctselect_identical_operands: +; W32: .functype test_ctselect_identical_operands (i32, i32) -> 
(i32) +; W32-NEXT: # %bb.0: +; W32-NEXT: i32.const 0 +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const 1 +; W32-NEXT: i32.and +; W32-NEXT: i32.const 1 +; W32-NEXT: i32.and +; W32-NEXT: i32.sub +; W32-NEXT: local.tee 0 +; W32-NEXT: local.get 1 +; W32-NEXT: i32.and +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.xor +; W32-NEXT: local.get 1 +; W32-NEXT: i32.and +; W32-NEXT: i32.or +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_identical_operands: +; W64: .functype test_ctselect_identical_operands (i32, i32) -> (i32) +; W64-NEXT: # %bb.0: +; W64-NEXT: i32.const 0 +; W64-NEXT: local.get 0 +; W64-NEXT: i32.const 1 +; W64-NEXT: i32.and +; W64-NEXT: i32.const 1 +; W64-NEXT: i32.and +; W64-NEXT: i32.sub +; W64-NEXT: local.tee 0 +; W64-NEXT: local.get 1 +; W64-NEXT: i32.and +; W64-NEXT: local.get 0 +; W64-NEXT: i32.const -1 +; W64-NEXT: i32.xor +; W64-NEXT: local.get 1 +; W64-NEXT: i32.and +; W64-NEXT: i32.or +; W64-NEXT: # fallthrough-return + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %x, i32 %x) + ret i32 %result +} + +; Test with inverted condition +define i32 @test_ctselect_inverted_condition(i32 %x, i32 %y, i32 %a, i32 %b) { +; W32-LABEL: test_ctselect_inverted_condition: +; W32: .functype test_ctselect_inverted_condition (i32, i32, i32, i32) -> (i32) +; W32-NEXT: # %bb.0: +; W32-NEXT: i32.const 0 +; W32-NEXT: local.get 0 +; W32-NEXT: local.get 1 +; W32-NEXT: i32.ne +; W32-NEXT: i32.sub +; W32-NEXT: local.tee 1 +; W32-NEXT: local.get 2 +; W32-NEXT: i32.and +; W32-NEXT: local.get 1 +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.xor +; W32-NEXT: local.get 3 +; W32-NEXT: i32.and +; W32-NEXT: i32.or +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_inverted_condition: +; W64: .functype test_ctselect_inverted_condition (i32, i32, i32, i32) -> (i32) +; W64-NEXT: # %bb.0: +; W64-NEXT: i32.const 0 +; W64-NEXT: local.get 0 +; W64-NEXT: local.get 1 +; W64-NEXT: i32.ne +; W64-NEXT: i32.sub +; W64-NEXT: local.tee 1 +; W64-NEXT: local.get 2 +; W64-NEXT: i32.and +; W64-NEXT: local.get 1 +; W64-NEXT: i32.const -1 +; W64-NEXT: i32.xor +; W64-NEXT: local.get 3 +; W64-NEXT: i32.and +; W64-NEXT: i32.or +; W64-NEXT: # fallthrough-return + %cmp = icmp eq i32 %x, %y + %not_cmp = xor i1 %cmp, true + %result = call i32 @llvm.ct.select.i32(i1 %not_cmp, i32 %a, i32 %b) + ret i32 %result +} + +; Test chain of ct.select operations +define i32 @test_ctselect_chain(i1 %c1, i1 %c2, i1 %c3, i32 %a, i32 %b, i32 %c, i32 %d) { +; W32-LABEL: test_ctselect_chain: +; W32: .functype test_ctselect_chain (i32, i32, i32, i32, i32, i32, i32) -> (i32) +; W32-NEXT: # %bb.0: +; W32-NEXT: i32.const 0 +; W32-NEXT: local.get 2 +; W32-NEXT: i32.const 1 +; W32-NEXT: i32.and +; W32-NEXT: i32.const 1 +; W32-NEXT: i32.and +; W32-NEXT: i32.sub +; W32-NEXT: local.tee 2 +; W32-NEXT: i32.const 0 +; W32-NEXT: local.get 1 +; W32-NEXT: i32.const 1 +; W32-NEXT: i32.and +; W32-NEXT: i32.const 1 +; W32-NEXT: i32.and +; W32-NEXT: i32.sub +; W32-NEXT: local.tee 1 +; W32-NEXT: i32.const 0 +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const 1 +; W32-NEXT: i32.and +; W32-NEXT: i32.const 1 +; W32-NEXT: i32.and +; W32-NEXT: i32.sub +; W32-NEXT: local.tee 0 +; W32-NEXT: local.get 3 +; W32-NEXT: i32.and +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.xor +; W32-NEXT: local.get 4 +; W32-NEXT: i32.and +; W32-NEXT: i32.or +; W32-NEXT: i32.and +; W32-NEXT: local.get 1 +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.xor +; W32-NEXT: local.get 5 +; W32-NEXT: i32.and +; W32-NEXT: i32.or +; W32-NEXT: 
i32.and +; W32-NEXT: local.get 2 +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.xor +; W32-NEXT: local.get 6 +; W32-NEXT: i32.and +; W32-NEXT: i32.or +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_chain: +; W64: .functype test_ctselect_chain (i32, i32, i32, i32, i32, i32, i32) -> (i32) +; W64-NEXT: # %bb.0: +; W64-NEXT: i32.const 0 +; W64-NEXT: local.get 2 +; W64-NEXT: i32.const 1 +; W64-NEXT: i32.and +; W64-NEXT: i32.const 1 +; W64-NEXT: i32.and +; W64-NEXT: i32.sub +; W64-NEXT: local.tee 2 +; W64-NEXT: i32.const 0 +; W64-NEXT: local.get 1 +; W64-NEXT: i32.const 1 +; W64-NEXT: i32.and +; W64-NEXT: i32.const 1 +; W64-NEXT: i32.and +; W64-NEXT: i32.sub +; W64-NEXT: local.tee 1 +; W64-NEXT: i32.const 0 +; W64-NEXT: local.get 0 +; W64-NEXT: i32.const 1 +; W64-NEXT: i32.and +; W64-NEXT: i32.const 1 +; W64-NEXT: i32.and +; W64-NEXT: i32.sub +; W64-NEXT: local.tee 0 +; W64-NEXT: local.get 3 +; W64-NEXT: i32.and +; W64-NEXT: local.get 0 +; W64-NEXT: i32.const -1 +; W64-NEXT: i32.xor +; W64-NEXT: local.get 4 +; W64-NEXT: i32.and +; W64-NEXT: i32.or +; W64-NEXT: i32.and +; W64-NEXT: local.get 1 +; W64-NEXT: i32.const -1 +; W64-NEXT: i32.xor +; W64-NEXT: local.get 5 +; W64-NEXT: i32.and +; W64-NEXT: i32.or +; W64-NEXT: i32.and +; W64-NEXT: local.get 2 +; W64-NEXT: i32.const -1 +; W64-NEXT: i32.xor +; W64-NEXT: local.get 6 +; W64-NEXT: i32.and +; W64-NEXT: i32.or +; W64-NEXT: # fallthrough-return + %sel1 = call i32 @llvm.ct.select.i32(i1 %c1, i32 %a, i32 %b) + %sel2 = call i32 @llvm.ct.select.i32(i1 %c2, i32 %sel1, i32 %c) + %sel3 = call i32 @llvm.ct.select.i32(i1 %c3, i32 %sel2, i32 %d) + ret i32 %sel3 +} + +; Test for 64-bit operations (supported on all 64-bit architectures) +define i64 @test_ctselect_i64_smin_zero(i64 %x) { +; W32-LABEL: test_ctselect_i64_smin_zero: +; W32: .functype test_ctselect_i64_smin_zero (i64) -> (i64) +; W32-NEXT: # %bb.0: +; W32-NEXT: i64.const 0 +; W32-NEXT: local.get 0 +; W32-NEXT: i64.const 63 +; W32-NEXT: i64.shr_u +; W32-NEXT: i64.sub +; W32-NEXT: local.get 0 +; W32-NEXT: i64.and +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_i64_smin_zero: +; W64: .functype test_ctselect_i64_smin_zero (i64) -> (i64) +; W64-NEXT: # %bb.0: +; W64-NEXT: i64.const 0 +; W64-NEXT: local.get 0 +; W64-NEXT: i64.const 63 +; W64-NEXT: i64.shr_u +; W64-NEXT: i64.sub +; W64-NEXT: local.get 0 +; W64-NEXT: i64.and +; W64-NEXT: # fallthrough-return + %cmp = icmp slt i64 %x, 0 + %result = call i64 @llvm.ct.select.i64(i1 %cmp, i64 %x, i64 0) + ret i64 %result +} + +; Declare the intrinsics +declare i32 @llvm.ct.select.i32(i1, i32, i32) +declare i64 @llvm.ct.select.i64(i1, i64, i64) diff --git a/llvm/test/CodeGen/WebAssembly/ctselect-fallback.ll b/llvm/test/CodeGen/WebAssembly/ctselect-fallback.ll new file mode 100644 index 0000000000000..bc1d12c9ad83e --- /dev/null +++ b/llvm/test/CodeGen/WebAssembly/ctselect-fallback.ll @@ -0,0 +1,576 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=wasm32-unknown-unknown -O3 -filetype=asm | FileCheck %s --check-prefix=W32 +; RUN: llc < %s -mtriple=wasm64-unknown-unknown -O3 -filetype=asm | FileCheck %s --check-prefix=W64 + +; Test basic ct.select functionality for scalar types +define i8 @test_ctselect_i8(i1 %cond, i8 %a, i8 %b) { +; W32-LABEL: test_ctselect_i8: +; W32: .functype test_ctselect_i8 (i32, i32, i32) -> (i32) +; W32-NEXT: # %bb.0: +; W32-NEXT: i32.const 0 +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const 1 +; W32-NEXT: i32.and +; W32-NEXT: 
local.tee 0 +; W32-NEXT: i32.sub +; W32-NEXT: local.get 1 +; W32-NEXT: i32.and +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.add +; W32-NEXT: local.get 2 +; W32-NEXT: i32.and +; W32-NEXT: i32.or +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_i8: +; W64: .functype test_ctselect_i8 (i32, i32, i32) -> (i32) +; W64-NEXT: # %bb.0: +; W64-NEXT: i32.const 0 +; W64-NEXT: local.get 0 +; W64-NEXT: i32.const 1 +; W64-NEXT: i32.and +; W64-NEXT: local.tee 0 +; W64-NEXT: i32.sub +; W64-NEXT: local.get 1 +; W64-NEXT: i32.and +; W64-NEXT: local.get 0 +; W64-NEXT: i32.const -1 +; W64-NEXT: i32.add +; W64-NEXT: local.get 2 +; W64-NEXT: i32.and +; W64-NEXT: i32.or +; W64-NEXT: # fallthrough-return + %result = call i8 @llvm.ct.select.i8(i1 %cond, i8 %a, i8 %b) + ret i8 %result +} + +define i16 @test_ctselect_i16(i1 %cond, i16 %a, i16 %b) { +; W32-LABEL: test_ctselect_i16: +; W32: .functype test_ctselect_i16 (i32, i32, i32) -> (i32) +; W32-NEXT: # %bb.0: +; W32-NEXT: i32.const 0 +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const 1 +; W32-NEXT: i32.and +; W32-NEXT: local.tee 0 +; W32-NEXT: i32.sub +; W32-NEXT: local.get 1 +; W32-NEXT: i32.and +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.add +; W32-NEXT: local.get 2 +; W32-NEXT: i32.and +; W32-NEXT: i32.or +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_i16: +; W64: .functype test_ctselect_i16 (i32, i32, i32) -> (i32) +; W64-NEXT: # %bb.0: +; W64-NEXT: i32.const 0 +; W64-NEXT: local.get 0 +; W64-NEXT: i32.const 1 +; W64-NEXT: i32.and +; W64-NEXT: local.tee 0 +; W64-NEXT: i32.sub +; W64-NEXT: local.get 1 +; W64-NEXT: i32.and +; W64-NEXT: local.get 0 +; W64-NEXT: i32.const -1 +; W64-NEXT: i32.add +; W64-NEXT: local.get 2 +; W64-NEXT: i32.and +; W64-NEXT: i32.or +; W64-NEXT: # fallthrough-return + %result = call i16 @llvm.ct.select.i16(i1 %cond, i16 %a, i16 %b) + ret i16 %result +} + +define i32 @test_ctselect_i32(i1 %cond, i32 %a, i32 %b) { +; W32-LABEL: test_ctselect_i32: +; W32: .functype test_ctselect_i32 (i32, i32, i32) -> (i32) +; W32-NEXT: # %bb.0: +; W32-NEXT: i32.const 0 +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const 1 +; W32-NEXT: i32.and +; W32-NEXT: i32.const 1 +; W32-NEXT: i32.and +; W32-NEXT: i32.sub +; W32-NEXT: local.tee 0 +; W32-NEXT: local.get 1 +; W32-NEXT: i32.and +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.xor +; W32-NEXT: local.get 2 +; W32-NEXT: i32.and +; W32-NEXT: i32.or +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_i32: +; W64: .functype test_ctselect_i32 (i32, i32, i32) -> (i32) +; W64-NEXT: # %bb.0: +; W64-NEXT: i32.const 0 +; W64-NEXT: local.get 0 +; W64-NEXT: i32.const 1 +; W64-NEXT: i32.and +; W64-NEXT: i32.const 1 +; W64-NEXT: i32.and +; W64-NEXT: i32.sub +; W64-NEXT: local.tee 0 +; W64-NEXT: local.get 1 +; W64-NEXT: i32.and +; W64-NEXT: local.get 0 +; W64-NEXT: i32.const -1 +; W64-NEXT: i32.xor +; W64-NEXT: local.get 2 +; W64-NEXT: i32.and +; W64-NEXT: i32.or +; W64-NEXT: # fallthrough-return + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) + ret i32 %result +} + +define i64 @test_ctselect_i64(i1 %cond, i64 %a, i64 %b) { +; W32-LABEL: test_ctselect_i64: +; W32: .functype test_ctselect_i64 (i32, i64, i64) -> (i64) +; W32-NEXT: .local i64 +; W32-NEXT: # %bb.0: +; W32-NEXT: i64.const 0 +; W32-NEXT: local.get 0 +; W32-NEXT: i64.extend_i32_u +; W32-NEXT: i64.const 1 +; W32-NEXT: i64.and +; W32-NEXT: i64.const 1 +; W32-NEXT: i64.and +; W32-NEXT: i64.sub +; W32-NEXT: local.tee 3 +; W32-NEXT: local.get 1 
+; W32-NEXT: i64.and +; W32-NEXT: local.get 3 +; W32-NEXT: i64.const -1 +; W32-NEXT: i64.xor +; W32-NEXT: local.get 2 +; W32-NEXT: i64.and +; W32-NEXT: i64.or +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_i64: +; W64: .functype test_ctselect_i64 (i32, i64, i64) -> (i64) +; W64-NEXT: .local i64 +; W64-NEXT: # %bb.0: +; W64-NEXT: i64.const 0 +; W64-NEXT: local.get 0 +; W64-NEXT: i64.extend_i32_u +; W64-NEXT: i64.const 1 +; W64-NEXT: i64.and +; W64-NEXT: i64.const 1 +; W64-NEXT: i64.and +; W64-NEXT: i64.sub +; W64-NEXT: local.tee 3 +; W64-NEXT: local.get 1 +; W64-NEXT: i64.and +; W64-NEXT: local.get 3 +; W64-NEXT: i64.const -1 +; W64-NEXT: i64.xor +; W64-NEXT: local.get 2 +; W64-NEXT: i64.and +; W64-NEXT: i64.or +; W64-NEXT: # fallthrough-return + %result = call i64 @llvm.ct.select.i64(i1 %cond, i64 %a, i64 %b) + ret i64 %result +} + +define ptr @test_ctselect_ptr(i1 %cond, ptr %a, ptr %b) { +; W32-LABEL: test_ctselect_ptr: +; W32: .functype test_ctselect_ptr (i32, i32, i32) -> (i32) +; W32-NEXT: # %bb.0: +; W32-NEXT: i32.const 0 +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const 1 +; W32-NEXT: i32.and +; W32-NEXT: i32.const 1 +; W32-NEXT: i32.and +; W32-NEXT: i32.sub +; W32-NEXT: local.tee 0 +; W32-NEXT: local.get 1 +; W32-NEXT: i32.and +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.xor +; W32-NEXT: local.get 2 +; W32-NEXT: i32.and +; W32-NEXT: i32.or +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_ptr: +; W64: .functype test_ctselect_ptr (i32, i64, i64) -> (i64) +; W64-NEXT: .local i64 +; W64-NEXT: # %bb.0: +; W64-NEXT: i64.const 0 +; W64-NEXT: local.get 0 +; W64-NEXT: i64.extend_i32_u +; W64-NEXT: i64.const 1 +; W64-NEXT: i64.and +; W64-NEXT: i64.const 1 +; W64-NEXT: i64.and +; W64-NEXT: i64.sub +; W64-NEXT: local.tee 3 +; W64-NEXT: local.get 1 +; W64-NEXT: i64.and +; W64-NEXT: local.get 3 +; W64-NEXT: i64.const -1 +; W64-NEXT: i64.xor +; W64-NEXT: local.get 2 +; W64-NEXT: i64.and +; W64-NEXT: i64.or +; W64-NEXT: # fallthrough-return + %result = call ptr @llvm.ct.select.p0(i1 %cond, ptr %a, ptr %b) + ret ptr %result +} + +; Test with constant conditions +define i32 @test_ctselect_const_true(i32 %a, i32 %b) { +; W32-LABEL: test_ctselect_const_true: +; W32: .functype test_ctselect_const_true (i32, i32) -> (i32) +; W32-NEXT: # %bb.0: +; W32-NEXT: local.get 0 +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_const_true: +; W64: .functype test_ctselect_const_true (i32, i32) -> (i32) +; W64-NEXT: # %bb.0: +; W64-NEXT: local.get 0 +; W64-NEXT: # fallthrough-return + %result = call i32 @llvm.ct.select.i32(i1 true, i32 %a, i32 %b) + ret i32 %result +} + +define i32 @test_ctselect_const_false(i32 %a, i32 %b) { +; W32-LABEL: test_ctselect_const_false: +; W32: .functype test_ctselect_const_false (i32, i32) -> (i32) +; W32-NEXT: # %bb.0: +; W32-NEXT: local.get 1 +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_const_false: +; W64: .functype test_ctselect_const_false (i32, i32) -> (i32) +; W64-NEXT: # %bb.0: +; W64-NEXT: local.get 1 +; W64-NEXT: # fallthrough-return + %result = call i32 @llvm.ct.select.i32(i1 false, i32 %a, i32 %b) + ret i32 %result +} + +; Test with comparison conditions +define i32 @test_ctselect_icmp_eq(i32 %x, i32 %y, i32 %a, i32 %b) { +; W32-LABEL: test_ctselect_icmp_eq: +; W32: .functype test_ctselect_icmp_eq (i32, i32, i32, i32) -> (i32) +; W32-NEXT: # %bb.0: +; W32-NEXT: i32.const 0 +; W32-NEXT: local.get 0 +; W32-NEXT: local.get 1 +; W32-NEXT: i32.eq +; W32-NEXT: i32.sub +; W32-NEXT: local.tee 
1 +; W32-NEXT: local.get 2 +; W32-NEXT: i32.and +; W32-NEXT: local.get 1 +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.xor +; W32-NEXT: local.get 3 +; W32-NEXT: i32.and +; W32-NEXT: i32.or +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_icmp_eq: +; W64: .functype test_ctselect_icmp_eq (i32, i32, i32, i32) -> (i32) +; W64-NEXT: # %bb.0: +; W64-NEXT: i32.const 0 +; W64-NEXT: local.get 0 +; W64-NEXT: local.get 1 +; W64-NEXT: i32.eq +; W64-NEXT: i32.sub +; W64-NEXT: local.tee 1 +; W64-NEXT: local.get 2 +; W64-NEXT: i32.and +; W64-NEXT: local.get 1 +; W64-NEXT: i32.const -1 +; W64-NEXT: i32.xor +; W64-NEXT: local.get 3 +; W64-NEXT: i32.and +; W64-NEXT: i32.or +; W64-NEXT: # fallthrough-return + %cond = icmp eq i32 %x, %y + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) + ret i32 %result +} + +define i32 @test_ctselect_icmp_ne(i32 %x, i32 %y, i32 %a, i32 %b) { +; W32-LABEL: test_ctselect_icmp_ne: +; W32: .functype test_ctselect_icmp_ne (i32, i32, i32, i32) -> (i32) +; W32-NEXT: # %bb.0: +; W32-NEXT: i32.const 0 +; W32-NEXT: local.get 0 +; W32-NEXT: local.get 1 +; W32-NEXT: i32.ne +; W32-NEXT: i32.sub +; W32-NEXT: local.tee 1 +; W32-NEXT: local.get 2 +; W32-NEXT: i32.and +; W32-NEXT: local.get 1 +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.xor +; W32-NEXT: local.get 3 +; W32-NEXT: i32.and +; W32-NEXT: i32.or +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_icmp_ne: +; W64: .functype test_ctselect_icmp_ne (i32, i32, i32, i32) -> (i32) +; W64-NEXT: # %bb.0: +; W64-NEXT: i32.const 0 +; W64-NEXT: local.get 0 +; W64-NEXT: local.get 1 +; W64-NEXT: i32.ne +; W64-NEXT: i32.sub +; W64-NEXT: local.tee 1 +; W64-NEXT: local.get 2 +; W64-NEXT: i32.and +; W64-NEXT: local.get 1 +; W64-NEXT: i32.const -1 +; W64-NEXT: i32.xor +; W64-NEXT: local.get 3 +; W64-NEXT: i32.and +; W64-NEXT: i32.or +; W64-NEXT: # fallthrough-return + %cond = icmp ne i32 %x, %y + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) + ret i32 %result +} + +define i32 @test_ctselect_icmp_slt(i32 %x, i32 %y, i32 %a, i32 %b) { +; W32-LABEL: test_ctselect_icmp_slt: +; W32: .functype test_ctselect_icmp_slt (i32, i32, i32, i32) -> (i32) +; W32-NEXT: # %bb.0: +; W32-NEXT: i32.const 0 +; W32-NEXT: local.get 0 +; W32-NEXT: local.get 1 +; W32-NEXT: i32.lt_s +; W32-NEXT: i32.sub +; W32-NEXT: local.tee 1 +; W32-NEXT: local.get 2 +; W32-NEXT: i32.and +; W32-NEXT: local.get 1 +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.xor +; W32-NEXT: local.get 3 +; W32-NEXT: i32.and +; W32-NEXT: i32.or +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_icmp_slt: +; W64: .functype test_ctselect_icmp_slt (i32, i32, i32, i32) -> (i32) +; W64-NEXT: # %bb.0: +; W64-NEXT: i32.const 0 +; W64-NEXT: local.get 0 +; W64-NEXT: local.get 1 +; W64-NEXT: i32.lt_s +; W64-NEXT: i32.sub +; W64-NEXT: local.tee 1 +; W64-NEXT: local.get 2 +; W64-NEXT: i32.and +; W64-NEXT: local.get 1 +; W64-NEXT: i32.const -1 +; W64-NEXT: i32.xor +; W64-NEXT: local.get 3 +; W64-NEXT: i32.and +; W64-NEXT: i32.or +; W64-NEXT: # fallthrough-return + %cond = icmp slt i32 %x, %y + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) + ret i32 %result +} + +define i32 @test_ctselect_icmp_ult(i32 %x, i32 %y, i32 %a, i32 %b) { +; W32-LABEL: test_ctselect_icmp_ult: +; W32: .functype test_ctselect_icmp_ult (i32, i32, i32, i32) -> (i32) +; W32-NEXT: # %bb.0: +; W32-NEXT: i32.const 0 +; W32-NEXT: local.get 0 +; W32-NEXT: local.get 1 +; W32-NEXT: i32.lt_u +; W32-NEXT: i32.sub +; W32-NEXT: local.tee 1 +; W32-NEXT: local.get 2 +; 
W32-NEXT: i32.and +; W32-NEXT: local.get 1 +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.xor +; W32-NEXT: local.get 3 +; W32-NEXT: i32.and +; W32-NEXT: i32.or +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_icmp_ult: +; W64: .functype test_ctselect_icmp_ult (i32, i32, i32, i32) -> (i32) +; W64-NEXT: # %bb.0: +; W64-NEXT: i32.const 0 +; W64-NEXT: local.get 0 +; W64-NEXT: local.get 1 +; W64-NEXT: i32.lt_u +; W64-NEXT: i32.sub +; W64-NEXT: local.tee 1 +; W64-NEXT: local.get 2 +; W64-NEXT: i32.and +; W64-NEXT: local.get 1 +; W64-NEXT: i32.const -1 +; W64-NEXT: i32.xor +; W64-NEXT: local.get 3 +; W64-NEXT: i32.and +; W64-NEXT: i32.or +; W64-NEXT: # fallthrough-return + %cond = icmp ult i32 %x, %y + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) + ret i32 %result +} + +; Test with memory operands +define i32 @test_ctselect_load(i1 %cond, ptr %p1, ptr %p2) { +; W32-LABEL: test_ctselect_load: +; W32: .functype test_ctselect_load (i32, i32, i32) -> (i32) +; W32-NEXT: # %bb.0: +; W32-NEXT: i32.const 0 +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const 1 +; W32-NEXT: i32.and +; W32-NEXT: i32.const 1 +; W32-NEXT: i32.and +; W32-NEXT: i32.sub +; W32-NEXT: local.tee 0 +; W32-NEXT: local.get 1 +; W32-NEXT: i32.load 0 +; W32-NEXT: i32.and +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.xor +; W32-NEXT: local.get 2 +; W32-NEXT: i32.load 0 +; W32-NEXT: i32.and +; W32-NEXT: i32.or +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_load: +; W64: .functype test_ctselect_load (i32, i64, i64) -> (i32) +; W64-NEXT: # %bb.0: +; W64-NEXT: i32.const 0 +; W64-NEXT: local.get 0 +; W64-NEXT: i32.const 1 +; W64-NEXT: i32.and +; W64-NEXT: i32.const 1 +; W64-NEXT: i32.and +; W64-NEXT: i32.sub +; W64-NEXT: local.tee 0 +; W64-NEXT: local.get 1 +; W64-NEXT: i32.load 0 +; W64-NEXT: i32.and +; W64-NEXT: local.get 0 +; W64-NEXT: i32.const -1 +; W64-NEXT: i32.xor +; W64-NEXT: local.get 2 +; W64-NEXT: i32.load 0 +; W64-NEXT: i32.and +; W64-NEXT: i32.or +; W64-NEXT: # fallthrough-return + %a = load i32, ptr %p1 + %b = load i32, ptr %p2 + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) + ret i32 %result +} + +; Test nested ctselect calls +define i32 @test_ctselect_nested(i1 %cond1, i1 %cond2, i32 %a, i32 %b, i32 %c) { +; W32-LABEL: test_ctselect_nested: +; W32: .functype test_ctselect_nested (i32, i32, i32, i32, i32) -> (i32) +; W32-NEXT: # %bb.0: +; W32-NEXT: i32.const 0 +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const 1 +; W32-NEXT: i32.and +; W32-NEXT: i32.const 1 +; W32-NEXT: i32.and +; W32-NEXT: i32.sub +; W32-NEXT: local.tee 0 +; W32-NEXT: i32.const 0 +; W32-NEXT: local.get 1 +; W32-NEXT: i32.const 1 +; W32-NEXT: i32.and +; W32-NEXT: i32.const 1 +; W32-NEXT: i32.and +; W32-NEXT: i32.sub +; W32-NEXT: local.tee 1 +; W32-NEXT: local.get 2 +; W32-NEXT: i32.and +; W32-NEXT: local.get 1 +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.xor +; W32-NEXT: local.get 3 +; W32-NEXT: i32.and +; W32-NEXT: i32.or +; W32-NEXT: i32.and +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.xor +; W32-NEXT: local.get 4 +; W32-NEXT: i32.and +; W32-NEXT: i32.or +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_nested: +; W64: .functype test_ctselect_nested (i32, i32, i32, i32, i32) -> (i32) +; W64-NEXT: # %bb.0: +; W64-NEXT: i32.const 0 +; W64-NEXT: local.get 0 +; W64-NEXT: i32.const 1 +; W64-NEXT: i32.and +; W64-NEXT: i32.const 1 +; W64-NEXT: i32.and +; W64-NEXT: i32.sub +; W64-NEXT: local.tee 0 +; W64-NEXT: i32.const 0 +; W64-NEXT: local.get 1 
+; W64-NEXT: i32.const 1 +; W64-NEXT: i32.and +; W64-NEXT: i32.const 1 +; W64-NEXT: i32.and +; W64-NEXT: i32.sub +; W64-NEXT: local.tee 1 +; W64-NEXT: local.get 2 +; W64-NEXT: i32.and +; W64-NEXT: local.get 1 +; W64-NEXT: i32.const -1 +; W64-NEXT: i32.xor +; W64-NEXT: local.get 3 +; W64-NEXT: i32.and +; W64-NEXT: i32.or +; W64-NEXT: i32.and +; W64-NEXT: local.get 0 +; W64-NEXT: i32.const -1 +; W64-NEXT: i32.xor +; W64-NEXT: local.get 4 +; W64-NEXT: i32.and +; W64-NEXT: i32.or +; W64-NEXT: # fallthrough-return + %inner = call i32 @llvm.ct.select.i32(i1 %cond2, i32 %a, i32 %b) + %result = call i32 @llvm.ct.select.i32(i1 %cond1, i32 %inner, i32 %c) + ret i32 %result +} + +; Declare the intrinsics +declare i8 @llvm.ct.select.i8(i1, i8, i8) +declare i16 @llvm.ct.select.i16(i1, i16, i16) +declare i32 @llvm.ct.select.i32(i1, i32, i32) +declare i64 @llvm.ct.select.i64(i1, i64, i64) +declare ptr @llvm.ct.select.p0(i1, ptr, ptr) From f5d45aa2a524abae6110c5f7bfda7f29e5a7e5a2 Mon Sep 17 00:00:00 2001 From: wizardengineer Date: Thu, 14 Aug 2025 08:47:54 -0400 Subject: [PATCH 08/63] Used a more generic and simpler approach for managing NoMerge flag --- llvm/include/llvm/CodeGen/SelectionDAGNodes.h | 3 +- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 31 +-- .../SelectionDAG/SelectionDAGBuilder.cpp | 8 + .../CodeGen/Mips/ctselect-side-effects.ll | 184 ++++++++++++++ .../CodeGen/RISCV/ctselect-side-effects.ll | 178 ++++++++++++++ .../WebAssembly/ctselect-side-effects.ll | 230 ++++++++++++++++++ 6 files changed, 603 insertions(+), 31 deletions(-) create mode 100644 llvm/test/CodeGen/Mips/ctselect-side-effects.ll create mode 100644 llvm/test/CodeGen/RISCV/ctselect-side-effects.ll create mode 100644 llvm/test/CodeGen/WebAssembly/ctselect-side-effects.ll diff --git a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h index 62e917666e531..55c62ff7e7216 100644 --- a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h +++ b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h @@ -427,7 +427,8 @@ struct SDNodeFlags { FastMathFlags = NoNaNs | NoInfs | NoSignedZeros | AllowReciprocal | AllowContract | ApproximateFuncs | AllowReassociation, - // Flag for disabling optimization + // Instructs DAGCombiner to skip optimization passes for this node. + // Preserves the operation as-is without folding, merging, or elimination. 
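+ // A minimal usage sketch (hypothetical node N; setNoMerge/hasNoMerge are the existing SDNodeFlags accessors): + // SDNodeFlags Flags = N->getFlags(); Flags.setNoMerge(true); N->setFlags(Flags); + // After this, DAGCombiner::visit(N) returns early (see the DAGCombiner change below).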
NoMerge = 1 << 15, }; diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index a5d30daac1d14..589f85671eddd 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -1898,37 +1898,8 @@ void DAGCombiner::Run(CombineLevel AtLevel) { DAG.RemoveDeadNodes(); } -static inline bool hasNoMergeProtection(const SDValue &V) { - if (SDNode *N = V.getNode()) { - if (N->getFlags().hasNoMerge()) - return true; - - // if V is (~X) expressed as (xor X, -1) also check X - if (V.getOpcode() == ISD::XOR) { - if (isAllOnesConstant(V.getOperand(1)) && V.getOperand(0).getNode() && - V.getOperand(0)->getFlags().hasNoMerge()) - return true; - if (isAllOnesConstant(V.getOperand(0)) && V.getOperand(1).getNode() && - V.getOperand(1)->getFlags().hasNoMerge()) - return true; - } - } - return false; -} - -static inline bool touchesNoMerge(SDNode *N) { - if (N->getFlags().hasNoMerge()) - return true; - - for (const SDUse &U : N->ops()) { - if (hasNoMergeProtection(U.get())) - return true; - } - return false; -} - SDValue DAGCombiner::visit(SDNode *N) { - if (touchesNoMerge(N)) + if (N->getFlags().hasNoMerge()) return SDValue(); // clang-format off diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index ea1b1b8634bef..6f973f7ade1a0 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -6718,6 +6718,14 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, return; } + // Assert that the payload type is not a vector. + assert(!VT.isVector() && + "Vector type not supported yet for fallback implementation"); + + // We don't support floating-point types yet. + assert(!VT.isFloatingPoint() && + "Floating point type not supported yet by fallback implementation"); + setValue(&I, createProtectedCtSelectFallback(DAG, DL, Cond, A, B, VT)); return; } diff --git a/llvm/test/CodeGen/Mips/ctselect-side-effects.ll b/llvm/test/CodeGen/Mips/ctselect-side-effects.ll new file mode 100644 index 0000000000000..86b5952ffd19c --- /dev/null +++ b/llvm/test/CodeGen/Mips/ctselect-side-effects.ll @@ -0,0 +1,184 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=mipsel-unknown-linux-gnu -O3 | FileCheck %s --check-prefix=M32 +; RUN: llc < %s -mtriple=mips64el-unknown-linux-gnu -O3 | FileCheck %s --check-prefix=M64 + +; Test 1: Basic optimizations should still work +define i32 @test_basic_opts(i32 %x) { +; M32-LABEL: test_basic_opts: +; M32: # %bb.0: +; M32-NEXT: jr $ra +; M32-NEXT: move $2, $4 +; +; M64-LABEL: test_basic_opts: +; M64: # %bb.0: +; M64-NEXT: jr $ra +; M64-NEXT: sll $2, $4, 0 + %a = or i32 %x, 0 + %b = and i32 %a, -1 + %c = xor i32 %b, 0 + ret i32 %c +} + +; Test 2: Constant folding should work +define i32 @test_constant_fold() { +; M32-LABEL: test_constant_fold: +; M32: # %bb.0: +; M32-NEXT: jr $ra +; M32-NEXT: addiu $2, $zero, 0 +; +; M64-LABEL: test_constant_fold: +; M64: # %bb.0: +; M64-NEXT: jr $ra +; M64-NEXT: addiu $2, $zero, 0 + %a = xor i32 -1, -1 ; Should fold to 0 + ret i32 %a +} + +; Test 3: Protected pattern should NOT have branches +define i32 @test_protected_no_branch(i1 %cond, i32 %a, i32 %b) { +; M32-LABEL: test_protected_no_branch: +; M32: # %bb.0: +; M32-NEXT: andi $1, $4, 1 +; M32-NEXT: andi $1, $1, 1 +; M32-NEXT: negu $1, $1 +; M32-NEXT: and $2, $1, $5 +; M32-NEXT: not $1, $1 +; M32-NEXT: and 
$1, $1, $6 +; M32-NEXT: jr $ra +; M32-NEXT: or $2, $2, $1 +; +; M64-LABEL: test_protected_no_branch: +; M64: # %bb.0: +; M64-NEXT: sll $1, $4, 0 +; M64-NEXT: sll $2, $5, 0 +; M64-NEXT: sll $3, $6, 0 +; M64-NEXT: andi $1, $1, 1 +; M64-NEXT: andi $1, $1, 1 +; M64-NEXT: negu $1, $1 +; M64-NEXT: and $2, $1, $2 +; M64-NEXT: not $1, $1 +; M64-NEXT: and $1, $1, $3 +; M64-NEXT: jr $ra +; M64-NEXT: or $2, $2, $1 + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) + ret i32 %result +} + +; Test 4: Explicit branch should still generate branches +define i32 @test_explicit_branch(i1 %cond, i32 %a, i32 %b) { +; M32-LABEL: test_explicit_branch: +; M32: # %bb.0: +; M32-NEXT: andi $1, $4, 1 +; M32-NEXT: beqz $1, $BB3_2 +; M32-NEXT: nop +; M32-NEXT: # %bb.1: # %true +; M32-NEXT: jr $ra +; M32-NEXT: move $2, $5 +; M32-NEXT: $BB3_2: # %false +; M32-NEXT: jr $ra +; M32-NEXT: move $2, $6 +; +; M64-LABEL: test_explicit_branch: +; M64: # %bb.0: +; M64-NEXT: sll $1, $4, 0 +; M64-NEXT: andi $1, $1, 1 +; M64-NEXT: beqz $1, .LBB3_2 +; M64-NEXT: nop +; M64-NEXT: # %bb.1: # %true +; M64-NEXT: jr $ra +; M64-NEXT: sll $2, $5, 0 +; M64-NEXT: .LBB3_2: # %false +; M64-NEXT: jr $ra +; M64-NEXT: sll $2, $6, 0 + br i1 %cond, label %true, label %false +true: + ret i32 %a +false: + ret i32 %b +} + +; Test 5: Regular select (not ct.select) - whatever the target wants to do +define i32 @test_regular_select(i1 %cond, i32 %a, i32 %b) { +; M32-LABEL: test_regular_select: +; M32: # %bb.0: +; M32-NEXT: andi $1, $4, 1 +; M32-NEXT: movn $6, $5, $1 +; M32-NEXT: jr $ra +; M32-NEXT: move $2, $6 +; +; M64-LABEL: test_regular_select: +; M64: # %bb.0: +; M64-NEXT: sll $3, $4, 0 +; M64-NEXT: sll $2, $6, 0 +; M64-NEXT: sll $1, $5, 0 +; M64-NEXT: andi $3, $3, 1 +; M64-NEXT: jr $ra +; M64-NEXT: movn $2, $1, $3 + %result = select i1 %cond, i32 %a, i32 %b + ret i32 %result +} + +; Test if XOR with all-ones still gets optimized +define i32 @test_xor_all_ones() { +; M32-LABEL: test_xor_all_ones: +; M32: # %bb.0: +; M32-NEXT: jr $ra +; M32-NEXT: addiu $2, $zero, 0 +; +; M64-LABEL: test_xor_all_ones: +; M64: # %bb.0: +; M64-NEXT: jr $ra +; M64-NEXT: addiu $2, $zero, 0 + %xor1 = xor i32 -1, -1 ; Should optimize to 0 + ret i32 %xor1 +} + +define i32 @test_xor_same_value(i32 %x) { +; M32-LABEL: test_xor_same_value: +; M32: # %bb.0: +; M32-NEXT: jr $ra +; M32-NEXT: addiu $2, $zero, 0 +; +; M64-LABEL: test_xor_same_value: +; M64: # %bb.0: +; M64-NEXT: jr $ra +; M64-NEXT: addiu $2, $zero, 0 + %xor2 = xor i32 %x, %x ; Should optimize to 0 + ret i32 %xor2 +} + +define i32 @test_normal_ops(i32 %x) { +; M32-LABEL: test_normal_ops: +; M32: # %bb.0: +; M32-NEXT: jr $ra +; M32-NEXT: move $2, $4 +; +; M64-LABEL: test_normal_ops: +; M64: # %bb.0: +; M64-NEXT: jr $ra +; M64-NEXT: sll $2, $4, 0 + %or1 = or i32 %x, 0 ; Should optimize to %x + %and1 = and i32 %or1, -1 ; Should optimize to %x + %xor1 = xor i32 %and1, 0 ; Should optimize to %x + ret i32 %xor1 +} + +; This simulates what the reviewer is worried about +define i32 @test_xor_with_const_operands() { +; M32-LABEL: test_xor_with_const_operands: +; M32: # %bb.0: +; M32-NEXT: jr $ra +; M32-NEXT: addiu $2, $zero, 0 +; +; M64-LABEL: test_xor_with_const_operands: +; M64: # %bb.0: +; M64-NEXT: jr $ra +; M64-NEXT: addiu $2, $zero, 0 + %a = xor i32 -1, -1 + %b = xor i32 0, 0 + %c = xor i32 42, 42 + %result = or i32 %a, %b + %final = or i32 %result, %c + ret i32 %final ; Should optimize to 0 +} + +declare i32 @llvm.ct.select.i32(i1, i32, i32) diff --git a/llvm/test/CodeGen/RISCV/ctselect-side-effects.ll 
b/llvm/test/CodeGen/RISCV/ctselect-side-effects.ll new file mode 100644 index 0000000000000..a37a57578523f --- /dev/null +++ b/llvm/test/CodeGen/RISCV/ctselect-side-effects.ll @@ -0,0 +1,178 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=riscv64 -O3 -filetype=asm | FileCheck %s --check-prefix=RV64 +; RUN: llc < %s -mtriple=riscv32 -O3 -filetype=asm | FileCheck %s --check-prefix=RV32 + +; Test 1: Basic optimizations should still work +define i32 @test_basic_opts(i32 %x) { +; RV64-LABEL: test_basic_opts: +; RV64: # %bb.0: +; RV64-NEXT: ret +; +; RV32-LABEL: test_basic_opts: +; RV32: # %bb.0: +; RV32-NEXT: ret + %a = or i32 %x, 0 ; Should eliminate + %b = and i32 %a, -1 ; Should eliminate + %c = xor i32 %b, 0 ; Should eliminate + ret i32 %c +} + +; Test 2: Constant folding should work +define i32 @test_constant_fold() { +; RV64-LABEL: test_constant_fold: +; RV64: # %bb.0: +; RV64-NEXT: li a0, 0 +; RV64-NEXT: ret +; +; RV32-LABEL: test_constant_fold: +; RV32: # %bb.0: +; RV32-NEXT: li a0, 0 +; RV32-NEXT: ret + %a = xor i32 -1, -1 ; Should fold to 0 + ret i32 %a +} + +; Test 3: Protected pattern should NOT have branches +define i32 @test_protected_no_branch(i1 %cond, i32 %a, i32 %b) { +; RV64-LABEL: test_protected_no_branch: +; RV64: # %bb.0: +; RV64-NEXT: andi a0, a0, 1 +; RV64-NEXT: addi a3, a0, -1 +; RV64-NEXT: neg a0, a0 +; RV64-NEXT: and a2, a3, a2 +; RV64-NEXT: and a0, a0, a1 +; RV64-NEXT: or a0, a0, a2 +; RV64-NEXT: ret +; +; RV32-LABEL: test_protected_no_branch: +; RV32: # %bb.0: +; RV32-NEXT: andi a0, a0, 1 +; RV32-NEXT: slli a0, a0, 31 +; RV32-NEXT: srai a0, a0, 31 +; RV32-NEXT: and a1, a0, a1 +; RV32-NEXT: not a0, a0 +; RV32-NEXT: and a0, a0, a2 +; RV32-NEXT: or a0, a1, a0 +; RV32-NEXT: ret + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) + ret i32 %result +} + +; Test 4: Explicit branch should still generate branches +define i32 @test_explicit_branch(i1 %cond, i32 %a, i32 %b) { +; RV64-LABEL: test_explicit_branch: +; RV64: # %bb.0: +; RV64-NEXT: andi a0, a0, 1 +; RV64-NEXT: beqz a0, .LBB3_2 +; RV64-NEXT: # %bb.1: # %true +; RV64-NEXT: mv a0, a1 +; RV64-NEXT: ret +; RV64-NEXT: .LBB3_2: # %false +; RV64-NEXT: mv a0, a2 +; RV64-NEXT: ret +; +; RV32-LABEL: test_explicit_branch: +; RV32: # %bb.0: +; RV32-NEXT: andi a0, a0, 1 +; RV32-NEXT: beqz a0, .LBB3_2 +; RV32-NEXT: # %bb.1: # %true +; RV32-NEXT: mv a0, a1 +; RV32-NEXT: ret +; RV32-NEXT: .LBB3_2: # %false +; RV32-NEXT: mv a0, a2 +; RV32-NEXT: ret + br i1 %cond, label %true, label %false +true: + ret i32 %a +false: + ret i32 %b +} + +; Test 5: Regular select (not ct.select) - whatever the target wants to do +define i32 @test_regular_select(i1 %cond, i32 %a, i32 %b) { +; RV64-LABEL: test_regular_select: +; RV64: # %bb.0: +; RV64-NEXT: andi a3, a0, 1 +; RV64-NEXT: mv a0, a1 +; RV64-NEXT: bnez a3, .LBB4_2 +; RV64-NEXT: # %bb.1: +; RV64-NEXT: mv a0, a2 +; RV64-NEXT: .LBB4_2: +; RV64-NEXT: ret +; +; RV32-LABEL: test_regular_select: +; RV32: # %bb.0: +; RV32-NEXT: andi a3, a0, 1 +; RV32-NEXT: mv a0, a1 +; RV32-NEXT: bnez a3, .LBB4_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: mv a0, a2 +; RV32-NEXT: .LBB4_2: +; RV32-NEXT: ret + %result = select i1 %cond, i32 %a, i32 %b + ret i32 %result +} + +; Test if XOR with all-ones still gets optimized +define i32 @test_xor_all_ones() { +; RV64-LABEL: test_xor_all_ones: +; RV64: # %bb.0: +; RV64-NEXT: li a0, 0 +; RV64-NEXT: ret +; +; RV32-LABEL: test_xor_all_ones: +; RV32: # %bb.0: +; RV32-NEXT: li a0, 0 +; 
RV32-NEXT: ret + %xor1 = xor i32 -1, -1 ; Should optimize to 0 + ret i32 %xor1 +} + +define i32 @test_xor_same_value(i32 %x) { +; RV64-LABEL: test_xor_same_value: +; RV64: # %bb.0: +; RV64-NEXT: li a0, 0 +; RV64-NEXT: ret +; +; RV32-LABEL: test_xor_same_value: +; RV32: # %bb.0: +; RV32-NEXT: li a0, 0 +; RV32-NEXT: ret + %xor2 = xor i32 %x, %x ; Should optimize to 0 + ret i32 %xor2 +} + +define i32 @test_normal_ops(i32 %x) { +; RV64-LABEL: test_normal_ops: +; RV64: # %bb.0: +; RV64-NEXT: ret +; +; RV32-LABEL: test_normal_ops: +; RV32: # %bb.0: +; RV32-NEXT: ret + %or1 = or i32 %x, 0 ; Should optimize to %x + %and1 = and i32 %or1, -1 ; Should optimize to %x + %xor1 = xor i32 %and1, 0 ; Should optimize to %x + ret i32 %xor1 +} + +; This simulates what the reviewer is worried about +define i32 @test_xor_with_const_operands() { +; RV64-LABEL: test_xor_with_const_operands: +; RV64: # %bb.0: +; RV64-NEXT: li a0, 0 +; RV64-NEXT: ret +; +; RV32-LABEL: test_xor_with_const_operands: +; RV32: # %bb.0: +; RV32-NEXT: li a0, 0 +; RV32-NEXT: ret + %a = xor i32 -1, -1 ; -1 ^ -1 should become 0 + %b = xor i32 0, 0 ; 0 ^ 0 should become 0 + %c = xor i32 42, 42 ; 42 ^ 42 should become 0 + %result = or i32 %a, %b + %final = or i32 %result, %c + ret i32 %final ; Should optimize to 0 +} + +declare i32 @llvm.ct.select.i32(i1, i32, i32) diff --git a/llvm/test/CodeGen/WebAssembly/ctselect-side-effects.ll b/llvm/test/CodeGen/WebAssembly/ctselect-side-effects.ll new file mode 100644 index 0000000000000..036160b6dbadb --- /dev/null +++ b/llvm/test/CodeGen/WebAssembly/ctselect-side-effects.ll @@ -0,0 +1,230 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=wasm32-unknown-unknown -O3 -filetype=asm | FileCheck %s --check-prefix=W32 +; RUN: llc < %s -mtriple=wasm64-unknown-unknown -O3 -filetype=asm | FileCheck %s --check-prefix=W64 + +; Test 1: Basic optimizations should still work +define i32 @test_basic_opts(i32 %x) { +; W32-LABEL: test_basic_opts: +; W32: .functype test_basic_opts (i32) -> (i32) +; W32-NEXT: # %bb.0: +; W32-NEXT: local.get 0 +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_basic_opts: +; W64: .functype test_basic_opts (i32) -> (i32) +; W64-NEXT: # %bb.0: +; W64-NEXT: local.get 0 +; W64-NEXT: # fallthrough-return + %a = or i32 %x, 0 ; Should eliminate + %b = and i32 %a, -1 ; Should eliminate + %c = xor i32 %b, 0 ; Should eliminate + ret i32 %c +} + +; Test 2: Constant folding should work +define i32 @test_constant_fold() { +; W32-LABEL: test_constant_fold: +; W32: .functype test_constant_fold () -> (i32) +; W32-NEXT: # %bb.0: +; W32-NEXT: i32.const 0 +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_constant_fold: +; W64: .functype test_constant_fold () -> (i32) +; W64-NEXT: # %bb.0: +; W64-NEXT: i32.const 0 +; W64-NEXT: # fallthrough-return + %a = xor i32 -1, -1 ; Should fold to 0 + ret i32 %a +} + +; Test 3: Protected pattern should NOT have branches +define i32 @test_protected_no_branch(i1 %cond, i32 %a, i32 %b) { +; W32-LABEL: test_protected_no_branch: +; W32: .functype test_protected_no_branch (i32, i32, i32) -> (i32) +; W32-NEXT: # %bb.0: +; W32-NEXT: i32.const 0 +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const 1 +; W32-NEXT: i32.and +; W32-NEXT: i32.const 1 +; W32-NEXT: i32.and +; W32-NEXT: i32.sub +; W32-NEXT: local.tee 0 +; W32-NEXT: local.get 1 +; W32-NEXT: i32.and +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.xor +; W32-NEXT: local.get 2 +; W32-NEXT: i32.and +; W32-NEXT: i32.or +; 
W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_protected_no_branch: +; W64: .functype test_protected_no_branch (i32, i32, i32) -> (i32) +; W64-NEXT: # %bb.0: +; W64-NEXT: i32.const 0 +; W64-NEXT: local.get 0 +; W64-NEXT: i32.const 1 +; W64-NEXT: i32.and +; W64-NEXT: i32.const 1 +; W64-NEXT: i32.and +; W64-NEXT: i32.sub +; W64-NEXT: local.tee 0 +; W64-NEXT: local.get 1 +; W64-NEXT: i32.and +; W64-NEXT: local.get 0 +; W64-NEXT: i32.const -1 +; W64-NEXT: i32.xor +; W64-NEXT: local.get 2 +; W64-NEXT: i32.and +; W64-NEXT: i32.or +; W64-NEXT: # fallthrough-return + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) + ret i32 %result +} + +; Test 4: Explicit branch should still generate branches +define i32 @test_explicit_branch(i1 %cond, i32 %a, i32 %b) { +; W32-LABEL: test_explicit_branch: +; W32: .functype test_explicit_branch (i32, i32, i32) -> (i32) +; W32-NEXT: # %bb.0: +; W32-NEXT: block +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const 1 +; W32-NEXT: i32.and +; W32-NEXT: i32.eqz +; W32-NEXT: br_if 0 # 0: down to label0 +; W32-NEXT: # %bb.1: # %true +; W32-NEXT: local.get 1 +; W32-NEXT: return +; W32-NEXT: .LBB3_2: # %false +; W32-NEXT: end_block # label0: +; W32-NEXT: local.get 2 +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_explicit_branch: +; W64: .functype test_explicit_branch (i32, i32, i32) -> (i32) +; W64-NEXT: # %bb.0: +; W64-NEXT: block +; W64-NEXT: local.get 0 +; W64-NEXT: i32.const 1 +; W64-NEXT: i32.and +; W64-NEXT: i32.eqz +; W64-NEXT: br_if 0 # 0: down to label0 +; W64-NEXT: # %bb.1: # %true +; W64-NEXT: local.get 1 +; W64-NEXT: return +; W64-NEXT: .LBB3_2: # %false +; W64-NEXT: end_block # label0: +; W64-NEXT: local.get 2 +; W64-NEXT: # fallthrough-return + br i1 %cond, label %true, label %false +true: + ret i32 %a +false: + ret i32 %b +} + +; Test 5: Regular select (not ct.select) - whatever wasm wants to do +define i32 @test_regular_select(i1 %cond, i32 %a, i32 %b) { +; W32-LABEL: test_regular_select: +; W32: .functype test_regular_select (i32, i32, i32) -> (i32) +; W32-NEXT: # %bb.0: +; W32-NEXT: local.get 1 +; W32-NEXT: local.get 2 +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const 1 +; W32-NEXT: i32.and +; W32-NEXT: i32.select +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_regular_select: +; W64: .functype test_regular_select (i32, i32, i32) -> (i32) +; W64-NEXT: # %bb.0: +; W64-NEXT: local.get 1 +; W64-NEXT: local.get 2 +; W64-NEXT: local.get 0 +; W64-NEXT: i32.const 1 +; W64-NEXT: i32.and +; W64-NEXT: i32.select +; W64-NEXT: # fallthrough-return + %result = select i1 %cond, i32 %a, i32 %b + ret i32 %result +} + +; Test if XOR with all-ones still gets optimized +define i32 @test_xor_all_ones() { +; W32-LABEL: test_xor_all_ones: +; W32: .functype test_xor_all_ones () -> (i32) +; W32-NEXT: # %bb.0: +; W32-NEXT: i32.const 0 +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_xor_all_ones: +; W64: .functype test_xor_all_ones () -> (i32) +; W64-NEXT: # %bb.0: +; W64-NEXT: i32.const 0 +; W64-NEXT: # fallthrough-return + %xor1 = xor i32 -1, -1 ; Should optimize to 0 + ret i32 %xor1 +} + +define i32 @test_xor_same_value(i32 %x) { +; W32-LABEL: test_xor_same_value: +; W32: .functype test_xor_same_value (i32) -> (i32) +; W32-NEXT: # %bb.0: +; W32-NEXT: i32.const 0 +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_xor_same_value: +; W64: .functype test_xor_same_value (i32) -> (i32) +; W64-NEXT: # %bb.0: +; W64-NEXT: i32.const 0 +; W64-NEXT: # fallthrough-return + %xor2 = xor i32 %x, %x ; Should optimize to 0 + ret i32 %xor2 +} + +define 
i32 @test_normal_ops(i32 %x) { +; W32-LABEL: test_normal_ops: +; W32: .functype test_normal_ops (i32) -> (i32) +; W32-NEXT: # %bb.0: +; W32-NEXT: local.get 0 +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_normal_ops: +; W64: .functype test_normal_ops (i32) -> (i32) +; W64-NEXT: # %bb.0: +; W64-NEXT: local.get 0 +; W64-NEXT: # fallthrough-return + %or1 = or i32 %x, 0 + %and1 = and i32 %or1, -1 + %xor1 = xor i32 %and1, 0 + ret i32 %xor1 +} + +; This simulates what the reviewer is worried about +define i32 @test_xor_with_const_operands() { +; W32-LABEL: test_xor_with_const_operands: +; W32: .functype test_xor_with_const_operands () -> (i32) +; W32-NEXT: # %bb.0: +; W32-NEXT: i32.const 0 +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_xor_with_const_operands: +; W64: .functype test_xor_with_const_operands () -> (i32) +; W64-NEXT: # %bb.0: +; W64-NEXT: i32.const 0 +; W64-NEXT: # fallthrough-return + %a = xor i32 -1, -1 + %b = xor i32 0, 0 + %c = xor i32 42, 42 + %result = or i32 %a, %b + %final = or i32 %result, %c + ret i32 %final ; Should optimize to 0 +} + +declare i32 @llvm.ct.select.i32(i1, i32, i32) + From 17d1d9c4f266494779bf1d9b89bf91a86a04ed53 Mon Sep 17 00:00:00 2001 From: Francesco Bertolaccini Date: Fri, 15 Aug 2025 16:23:33 +0200 Subject: [PATCH 09/63] [CT] Implement CTSELECT pseudo instruction for AArch64 * [CT] Implement CTSELECT for AArch64 on scalars * [CT] Support CTSELECT on AArch64 vectors * [CT] Add AArch64 tests * [CT] Use custom inserter for AArch64 --- .../Target/AArch64/AArch64ISelLowering.cpp | 58 + llvm/lib/Target/AArch64/AArch64ISelLowering.h | 1028 +++++++++++++++++ llvm/lib/Target/AArch64/AArch64InstrInfo.td | 45 + llvm/test/CodeGen/AArch64/ctselect.ll | 125 ++ 4 files changed, 1256 insertions(+) create mode 100644 llvm/test/CodeGen/AArch64/ctselect.ll diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 662d84b7a60a8..5e5fec19d5713 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -511,12 +511,28 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::BR_CC, MVT::f64, Custom); setOperationAction(ISD::SELECT, MVT::i32, Custom); setOperationAction(ISD::SELECT, MVT::i64, Custom); + setOperationAction(ISD::CTSELECT, MVT::i8, Promote); + setOperationAction(ISD::CTSELECT, MVT::i16, Promote); + setOperationAction(ISD::CTSELECT, MVT::i32, Custom); + setOperationAction(ISD::CTSELECT, MVT::i64, Custom); if (Subtarget->hasFPARMv8()) { setOperationAction(ISD::SELECT, MVT::f16, Custom); setOperationAction(ISD::SELECT, MVT::bf16, Custom); } + if (Subtarget->hasFullFP16()) { + setOperationAction(ISD::CTSELECT, MVT::f16, Custom); + setOperationAction(ISD::CTSELECT, MVT::bf16, Custom); + } else { + setOperationAction(ISD::CTSELECT, MVT::f16, Promote); + setOperationAction(ISD::CTSELECT, MVT::bf16, Promote); + } setOperationAction(ISD::SELECT, MVT::f32, Custom); setOperationAction(ISD::SELECT, MVT::f64, Custom); + setOperationAction(ISD::CTSELECT, MVT::f32, Custom); + setOperationAction(ISD::CTSELECT, MVT::f64, Custom); + for (MVT VT : MVT::vector_valuetypes()) { + setOperationAction(ISD::CTSELECT, VT, Expand); + } setOperationAction(ISD::SELECT_CC, MVT::i32, Custom); setOperationAction(ISD::SELECT_CC, MVT::i64, Custom); setOperationAction(ISD::SELECT_CC, MVT::f16, Custom); @@ -3328,6 +3344,18 @@ void AArch64TargetLowering::fixupPtrauthDiscriminator( IntDiscOp.setImm(IntDisc); } 
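+// EmitCTSELECT expands the target-independent CTSELECT pseudo at custom-inserter time: it rebuilds the instruction with the concrete conditional-select opcode supplied by the caller (the CSEL/FCSEL variants listed in EmitInstrWithCustomInserter below), copies the pseudo's operands verbatim, tags the new instruction MachineInstr::NoMerge so later machine passes keep the branchless select intact, and removes the pseudo from the block. A hedged sketch of the resulting MIR for the i32 case (register and condition-code operand names here are hypothetical): +// %dst:gpr32 = CSELWr %tval:gpr32, %fval:gpr32, %cc, implicit $nzcv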
+MachineBasicBlock *AArch64TargetLowering::EmitCTSELECT(MachineInstr &MI, MachineBasicBlock *MBB, unsigned Opcode) const { + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); + DebugLoc DL = MI.getDebugLoc(); + MachineInstrBuilder Builder = BuildMI(*MBB, MI, DL, TII->get(Opcode)); + for (unsigned Idx = 0; Idx < MI.getNumOperands(); ++Idx) { + Builder.add(MI.getOperand(Idx)); + } + Builder->setFlag(MachineInstr::NoMerge); + MBB->remove_instr(&MI); + return MBB; +} + MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter( MachineInstr &MI, MachineBasicBlock *BB) const { @@ -3370,6 +3398,18 @@ MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter( return EmitEntryPStateSM(MI, BB); case AArch64::F128CSEL: return EmitF128CSEL(MI, BB); + case AArch64::I32CTSELECT: + return EmitCTSELECT(MI, BB, AArch64::CSELWr); + case AArch64::I64CTSELECT: + return EmitCTSELECT(MI, BB, AArch64::CSELXr); + case AArch64::BF16CTSELECT: + return EmitCTSELECT(MI, BB, AArch64::FCSELHrrr); + case AArch64::F16CTSELECT: + return EmitCTSELECT(MI, BB, AArch64::FCSELHrrr); + case AArch64::F32CTSELECT: + return EmitCTSELECT(MI, BB, AArch64::FCSELSrrr); + case AArch64::F64CTSELECT: + return EmitCTSELECT(MI, BB, AArch64::FCSELDrrr); case TargetOpcode::STATEPOINT: // STATEPOINT is a pseudo instruction which has no implicit defs/uses // while bl call instruction (where statepoint will be lowered at the end) @@ -7590,6 +7630,8 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op, return LowerSELECT(Op, DAG); case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); + case ISD::CTSELECT: + return LowerCTSELECT(Op, DAG); case ISD::JumpTable: return LowerJumpTable(Op, DAG); case ISD::BR_JT: @@ -12146,6 +12188,22 @@ SDValue AArch64TargetLowering::LowerSELECT(SDValue Op, return Res; } +SDValue AArch64TargetLowering::LowerCTSELECT(SDValue Op, + SelectionDAG &DAG) const { + SDValue CCVal = Op->getOperand(0); + SDValue TVal = Op->getOperand(1); + SDValue FVal = Op->getOperand(2); + SDLoc DL(Op); + + EVT VT = Op.getValueType(); + + SDValue Zero = DAG.getConstant(0, DL, CCVal.getValueType()); + SDValue CC; + SDValue Cmp = getAArch64Cmp(CCVal, Zero, ISD::SETNE, CC, DAG, DL); + + return DAG.getNode(AArch64ISD::CTSELECT, DL, VT, TVal, FVal, CC, Cmp); +} + SDValue AArch64TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const { // Jump table entries as PC relative offsets. No additional tweaking diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index 9495c9ffc47aa..e83694d8b0a6b 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -24,6 +24,1027 @@ namespace llvm { class AArch64TargetMachine; +namespace AArch64ISD { + +// For predicated nodes where the result is a vector, the operation is +// controlled by a governing predicate and the inactive lanes are explicitly +// defined with a value, please stick the following naming convention: +// +// _MERGE_OP The result value is a vector with inactive lanes equal +// to source operand OP. +// +// _MERGE_ZERO The result value is a vector with inactive lanes +// actively zeroed. +// +// _MERGE_PASSTHRU The result value is a vector with inactive lanes equal +// to the last source operand which only purpose is being +// a passthru value. 
+//
+// For other cases where no explicit action is needed to set the inactive lanes,
+// or when the result is not a vector and it is needed or helpful to
+// distinguish a node from similar unpredicated nodes, use:
+//
+//   _PRED
+//
+enum NodeType : unsigned {
+  FIRST_NUMBER = ISD::BUILTIN_OP_END,
+  WrapperLarge, // 4-instruction MOVZ/MOVK sequence for 64-bit addresses.
+  CALL, // Function call.
+ + // Pseudo for a OBJC call that gets emitted together with a special `mov + // x29, x29` marker instruction. + CALL_RVMARKER, + + CALL_BTI, // Function call followed by a BTI instruction. + + // Function call, authenticating the callee value first: + // AUTH_CALL chain, callee, auth key #, int disc, addr disc, operands. + AUTH_CALL, + // AUTH_TC_RETURN chain, callee, fpdiff, auth key #, int disc, addr disc, + // operands. + AUTH_TC_RETURN, + + // Authenticated variant of CALL_RVMARKER. + AUTH_CALL_RVMARKER, + + COALESCER_BARRIER, + + VG_SAVE, + VG_RESTORE, + + SMSTART, + SMSTOP, + RESTORE_ZA, + RESTORE_ZT, + SAVE_ZT, + + // A call with the callee in x16, i.e. "blr x16". + CALL_ARM64EC_TO_X64, + + // Produces the full sequence of instructions for getting the thread pointer + // offset of a variable into X0, using the TLSDesc model. + TLSDESC_CALLSEQ, + TLSDESC_AUTH_CALLSEQ, + ADRP, // Page address of a TargetGlobalAddress operand. + ADR, // ADR + ADDlow, // Add the low 12 bits of a TargetGlobalAddress operand. + LOADgot, // Load from automatically generated descriptor (e.g. Global + // Offset Table, TLS record). + RET_GLUE, // Return with a glue operand. Operand 0 is the chain operand. + BRCOND, // Conditional branch instruction; "b.cond". + CSEL, + CSINV, // Conditional select invert. + CSNEG, // Conditional select negate. + CSINC, // Conditional select increment. + + CTSELECT, // AArch64 Constant-time conditional select, implemented with CSEL + + // Pointer to the thread's local storage area. Materialised from TPIDR_EL0 on + // ELF. + THREAD_POINTER, + ADC, + SBC, // adc, sbc instructions + + // To avoid stack clash, allocation is performed by block and each block is + // probed. + PROBED_ALLOCA, + + // Predicated instructions where inactive lanes produce undefined results. + ABDS_PRED, + ABDU_PRED, + FADD_PRED, + FDIV_PRED, + FMA_PRED, + FMAX_PRED, + FMAXNM_PRED, + FMIN_PRED, + FMINNM_PRED, + FMUL_PRED, + FSUB_PRED, + HADDS_PRED, + HADDU_PRED, + MUL_PRED, + MULHS_PRED, + MULHU_PRED, + RHADDS_PRED, + RHADDU_PRED, + SDIV_PRED, + SHL_PRED, + SMAX_PRED, + SMIN_PRED, + SRA_PRED, + SRL_PRED, + UDIV_PRED, + UMAX_PRED, + UMIN_PRED, + + // Unpredicated vector instructions + BIC, + + SRAD_MERGE_OP1, + + // Predicated instructions with the result of inactive lanes provided by the + // last operand. + FABS_MERGE_PASSTHRU, + FCEIL_MERGE_PASSTHRU, + FFLOOR_MERGE_PASSTHRU, + FNEARBYINT_MERGE_PASSTHRU, + FNEG_MERGE_PASSTHRU, + FRECPX_MERGE_PASSTHRU, + FRINT_MERGE_PASSTHRU, + FROUND_MERGE_PASSTHRU, + FROUNDEVEN_MERGE_PASSTHRU, + FSQRT_MERGE_PASSTHRU, + FTRUNC_MERGE_PASSTHRU, + FP_ROUND_MERGE_PASSTHRU, + FP_EXTEND_MERGE_PASSTHRU, + UINT_TO_FP_MERGE_PASSTHRU, + SINT_TO_FP_MERGE_PASSTHRU, + FCVTX_MERGE_PASSTHRU, + FCVTZU_MERGE_PASSTHRU, + FCVTZS_MERGE_PASSTHRU, + SIGN_EXTEND_INREG_MERGE_PASSTHRU, + ZERO_EXTEND_INREG_MERGE_PASSTHRU, + ABS_MERGE_PASSTHRU, + NEG_MERGE_PASSTHRU, + + SETCC_MERGE_ZERO, + + // Arithmetic instructions which write flags. + ADDS, + SUBS, + ADCS, + SBCS, + ANDS, + + // Conditional compares. Operands: left,right,falsecc,cc,flags + CCMP, + CCMN, + FCCMP, + + // Floating point comparison + FCMP, + + // Scalar-to-vector duplication + DUP, + DUPLANE8, + DUPLANE16, + DUPLANE32, + DUPLANE64, + DUPLANE128, + + // Vector immedate moves + MOVI, + MOVIshift, + MOVIedit, + MOVImsl, + FMOV, + MVNIshift, + MVNImsl, + + // Vector immediate ops + BICi, + ORRi, + + // Vector bitwise select: similar to ISD::VSELECT but not all bits within an + // element must be identical. 
+ BSP, + + // Vector shuffles + ZIP1, + ZIP2, + UZP1, + UZP2, + TRN1, + TRN2, + REV16, + REV32, + REV64, + EXT, + SPLICE, + + // Vector shift by scalar + VSHL, + VLSHR, + VASHR, + + // Vector shift by scalar (again) + SQSHL_I, + UQSHL_I, + SQSHLU_I, + SRSHR_I, + URSHR_I, + URSHR_I_PRED, + + // Vector narrowing shift by immediate (bottom) + RSHRNB_I, + + // Vector shift by constant and insert + VSLI, + VSRI, + + // Vector comparisons + CMEQ, + CMGE, + CMGT, + CMHI, + CMHS, + FCMEQ, + FCMGE, + FCMGT, + + // Vector zero comparisons + CMEQz, + CMGEz, + CMGTz, + CMLEz, + CMLTz, + FCMEQz, + FCMGEz, + FCMGTz, + FCMLEz, + FCMLTz, + + // Round wide FP to narrow FP with inexact results to odd. + FCVTXN, + + // Vector across-lanes addition + // Only the lower result lane is defined. + SADDV, + UADDV, + + // Unsigned sum Long across Vector + UADDLV, + SADDLV, + + // Wide adds + SADDWT, + SADDWB, + UADDWT, + UADDWB, + + // Add Pairwise of two vectors + ADDP, + // Add Long Pairwise + SADDLP, + UADDLP, + + // udot/sdot/usdot instructions + UDOT, + SDOT, + USDOT, + + // Vector across-lanes min/max + // Only the lower result lane is defined. + SMINV, + UMINV, + SMAXV, + UMAXV, + + SADDV_PRED, + UADDV_PRED, + SMAXV_PRED, + UMAXV_PRED, + SMINV_PRED, + UMINV_PRED, + ORV_PRED, + EORV_PRED, + ANDV_PRED, + + // Compare-and-branch + CBZ, + CBNZ, + TBZ, + TBNZ, + + // Tail calls + TC_RETURN, + + // Custom prefetch handling + PREFETCH, + + // {s|u}int to FP within a FP register. + SITOF, + UITOF, + + /// Natural vector cast. ISD::BITCAST is not natural in the big-endian + /// world w.r.t vectors; which causes additional REV instructions to be + /// generated to compensate for the byte-swapping. But sometimes we do + /// need to re-interpret the data in SIMD vector registers in big-endian + /// mode without emitting such REV instructions. + NVCAST, + + MRS, // MRS, also sets the flags via a glue. + + SMULL, + UMULL, + + PMULL, + + // Reciprocal estimates and steps. + FRECPE, + FRECPS, + FRSQRTE, + FRSQRTS, + + SUNPKHI, + SUNPKLO, + UUNPKHI, + UUNPKLO, + + CLASTA_N, + CLASTB_N, + LASTA, + LASTB, + TBL, + + // Floating-point reductions. + FADDA_PRED, + FADDV_PRED, + FMAXV_PRED, + FMAXNMV_PRED, + FMINV_PRED, + FMINNMV_PRED, + + INSR, + PTEST, + PTEST_ANY, + PTRUE, + + CTTZ_ELTS, + + BITREVERSE_MERGE_PASSTHRU, + BSWAP_MERGE_PASSTHRU, + REVH_MERGE_PASSTHRU, + REVW_MERGE_PASSTHRU, + CTLZ_MERGE_PASSTHRU, + CTPOP_MERGE_PASSTHRU, + DUP_MERGE_PASSTHRU, + INDEX_VECTOR, + + // Cast between vectors of the same element type but differ in length. + REINTERPRET_CAST, + + // Nodes to build an LD64B / ST64B 64-bit quantity out of i64, and vice versa + LS64_BUILD, + LS64_EXTRACT, + + LD1_MERGE_ZERO, + LD1S_MERGE_ZERO, + LDNF1_MERGE_ZERO, + LDNF1S_MERGE_ZERO, + LDFF1_MERGE_ZERO, + LDFF1S_MERGE_ZERO, + LD1RQ_MERGE_ZERO, + LD1RO_MERGE_ZERO, + + // Structured loads. + SVE_LD2_MERGE_ZERO, + SVE_LD3_MERGE_ZERO, + SVE_LD4_MERGE_ZERO, + + // Unsigned gather loads. + GLD1_MERGE_ZERO, + GLD1_SCALED_MERGE_ZERO, + GLD1_UXTW_MERGE_ZERO, + GLD1_SXTW_MERGE_ZERO, + GLD1_UXTW_SCALED_MERGE_ZERO, + GLD1_SXTW_SCALED_MERGE_ZERO, + GLD1_IMM_MERGE_ZERO, + GLD1Q_MERGE_ZERO, + GLD1Q_INDEX_MERGE_ZERO, + + // Signed gather loads + GLD1S_MERGE_ZERO, + GLD1S_SCALED_MERGE_ZERO, + GLD1S_UXTW_MERGE_ZERO, + GLD1S_SXTW_MERGE_ZERO, + GLD1S_UXTW_SCALED_MERGE_ZERO, + GLD1S_SXTW_SCALED_MERGE_ZERO, + GLD1S_IMM_MERGE_ZERO, + + // Unsigned gather loads. 
+ GLDFF1_MERGE_ZERO, + GLDFF1_SCALED_MERGE_ZERO, + GLDFF1_UXTW_MERGE_ZERO, + GLDFF1_SXTW_MERGE_ZERO, + GLDFF1_UXTW_SCALED_MERGE_ZERO, + GLDFF1_SXTW_SCALED_MERGE_ZERO, + GLDFF1_IMM_MERGE_ZERO, + + // Signed gather loads. + GLDFF1S_MERGE_ZERO, + GLDFF1S_SCALED_MERGE_ZERO, + GLDFF1S_UXTW_MERGE_ZERO, + GLDFF1S_SXTW_MERGE_ZERO, + GLDFF1S_UXTW_SCALED_MERGE_ZERO, + GLDFF1S_SXTW_SCALED_MERGE_ZERO, + GLDFF1S_IMM_MERGE_ZERO, + + // Non-temporal gather loads + GLDNT1_MERGE_ZERO, + GLDNT1_INDEX_MERGE_ZERO, + GLDNT1S_MERGE_ZERO, + + // Contiguous masked store. + ST1_PRED, + + // Scatter store + SST1_PRED, + SST1_SCALED_PRED, + SST1_UXTW_PRED, + SST1_SXTW_PRED, + SST1_UXTW_SCALED_PRED, + SST1_SXTW_SCALED_PRED, + SST1_IMM_PRED, + SST1Q_PRED, + SST1Q_INDEX_PRED, + + // Non-temporal scatter store + SSTNT1_PRED, + SSTNT1_INDEX_PRED, + + // SME + RDSVL, + REVD_MERGE_PASSTHRU, + ALLOCATE_ZA_BUFFER, + INIT_TPIDR2OBJ, + + // Needed for __arm_agnostic("sme_za_state") + GET_SME_SAVE_SIZE, + ALLOC_SME_SAVE_BUFFER, + + // Asserts that a function argument (i32) is zero-extended to i8 by + // the caller + ASSERT_ZEXT_BOOL, + + // 128-bit system register accesses + // lo64, hi64, chain = MRRS(chain, sysregname) + MRRS, + // chain = MSRR(chain, sysregname, lo64, hi64) + MSRR, + + // Strict (exception-raising) floating point comparison + FIRST_STRICTFP_OPCODE, + STRICT_FCMP = FIRST_STRICTFP_OPCODE, + STRICT_FCMPE, + LAST_STRICTFP_OPCODE = STRICT_FCMPE, + + // NEON Load/Store with post-increment base updates + FIRST_MEMORY_OPCODE, + LD2post = FIRST_MEMORY_OPCODE, + LD3post, + LD4post, + ST2post, + ST3post, + ST4post, + LD1x2post, + LD1x3post, + LD1x4post, + ST1x2post, + ST1x3post, + ST1x4post, + LD1DUPpost, + LD2DUPpost, + LD3DUPpost, + LD4DUPpost, + LD1LANEpost, + LD2LANEpost, + LD3LANEpost, + LD4LANEpost, + ST2LANEpost, + ST3LANEpost, + ST4LANEpost, + + STG, + STZG, + ST2G, + STZ2G, + + LDP, + LDIAPP, + LDNP, + STP, + STILP, + STNP, + LAST_MEMORY_OPCODE = STNP, + + // SME ZA loads and stores + SME_ZA_LDR, + SME_ZA_STR, +}; + +} // end namespace AArch64ISD namespace AArch64 { /// Possible values of current rounding mode, which is specified in bits @@ -202,6 +1223,8 @@ class AArch64TargetLowering : public TargetLowering { MachineOperand &AddrDiscOp, const TargetRegisterClass *AddrDiscRC) const; + MachineBasicBlock *EmitCTSELECT(MachineInstr &MI, MachineBasicBlock *BB, unsigned Opcode) const; + MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const override; @@ -685,6 +1708,7 @@ class AArch64TargetLowering : public TargetLowering { iterator_range Users, SDNodeFlags Flags, const SDLoc &dl, SelectionDAG &DAG) const; + SDValue LowerCTSELECT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const; @@ -920,6 +1944,10 @@ class AArch64TargetLowering : public TargetLowering { bool hasMultipleConditionRegisters(EVT VT) const override { return VT.isScalableVector(); } + + bool isSelectSupported(SelectSupportKind Kind) const override { + return true; + } }; namespace AArch64 { diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index f788c7510f80c..48a847244b672 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -464,6 +464,11 @@ def SDT_AArch64cbz : SDTypeProfile<0, 2, [SDTCisInt<0>, SDTCisVT<1, OtherVT>]>; 
def SDT_AArch64tbz : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>, SDTCisVT<2, OtherVT>]>; +def SDT_AArch64CtSelect : SDTypeProfile<1, 4, + [SDTCisSameAs<0, 1>, + SDTCisSameAs<0, 2>, + SDTCisInt<3>, + SDTCisVT<4, i32>]>; def SDT_AArch64CSel : SDTypeProfile<1, 4, [SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, @@ -831,6 +836,7 @@ def AArch64tbz : SDNode<"AArch64ISD::TBZ", SDT_AArch64tbz, def AArch64tbnz : SDNode<"AArch64ISD::TBNZ", SDT_AArch64tbz, [SDNPHasChain]>; +def AArch64ctselect : SDNode<"AArch64ISD::CTSELECT", SDT_AArch64CtSelect>; def AArch64csel : SDNode<"AArch64ISD::CSEL", SDT_AArch64CSel>; // Conditional select invert. @@ -5683,6 +5689,45 @@ def F128CSEL : Pseudo<(outs FPR128:$Rd), let hasNoSchedulingInfo = 1; } +//===----------------------------------------------------------------------===// +// Constant-time conditional selection instructions +//===----------------------------------------------------------------------===// + +let hasSideEffects = 1, isPseudo = 1, hasNoSchedulingInfo = 1, usesCustomInserter = 1 in { + def I32CTSELECT : Pseudo<(outs GPR32:$dst), + (ins GPR32:$tval, GPR32:$fval, i32imm:$cc), + [(set (i32 GPR32:$dst), + (AArch64ctselect GPR32:$tval, GPR32:$fval, + (i32 imm:$cc), NZCV))]>; + def I64CTSELECT : Pseudo<(outs GPR64:$dst), + (ins GPR64:$tval, GPR64:$fval, i32imm:$cc), + [(set (i64 GPR64:$dst), + (AArch64ctselect GPR64:$tval, GPR64:$fval, + (i32 imm:$cc), NZCV))]>; + let Predicates = [HasFullFP16] in { + def F16CTSELECT : Pseudo<(outs FPR16:$dst), + (ins FPR16:$tval, FPR16:$fval, i32imm:$cc), + [(set (f16 FPR16:$dst), + (AArch64ctselect (f16 FPR16:$tval), (f16 FPR16:$fval), + (i32 imm:$cc), NZCV))]>; + def BF16CTSELECT : Pseudo<(outs FPR16:$dst), + (ins FPR16:$tval, FPR16:$fval, i32imm:$cc), + [(set (bf16 FPR16:$dst), + (AArch64ctselect (bf16 FPR16:$tval), (bf16 FPR16:$fval), + (i32 imm:$cc), NZCV))]>; + } + def F32CTSELECT : Pseudo<(outs FPR32:$dst), + (ins FPR32:$tval, FPR32:$fval, i32imm:$cc), + [(set (f32 FPR32:$dst), + (AArch64ctselect FPR32:$tval, FPR32:$fval, + (i32 imm:$cc), NZCV))]>; + def F64CTSELECT : Pseudo<(outs FPR64:$dst), + (ins FPR64:$tval, FPR64:$fval, i32imm:$cc), + [(set (f64 FPR64:$dst), + (AArch64ctselect FPR64:$tval, FPR64:$fval, + (i32 imm:$cc), NZCV))]>; +} + //===----------------------------------------------------------------------===// // Instructions used for emitting unwind opcodes on ARM64 Windows. 
//===----------------------------------------------------------------------===// diff --git a/llvm/test/CodeGen/AArch64/ctselect.ll b/llvm/test/CodeGen/AArch64/ctselect.ll new file mode 100644 index 0000000000000..4cde9fe8a866a --- /dev/null +++ b/llvm/test/CodeGen/AArch64/ctselect.ll @@ -0,0 +1,125 @@ +; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-eabi | FileCheck %s --check-prefixes=DEFAULT,NOFP16 +; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-eabi -mattr=+fullfp16 | FileCheck %s --check-prefixes=DEFAULT,FP16 + +define i1 @ct_i1(i1 %cond, i1 %a, i1 %b) { +; DEFAULT-LABEL: ct_i1: +; DEFAULT: csel +; DEFAULT-NOT: b{{eq|ne}} +; DEFAULT-NOT: j +; DEFAULT-NOT: {{mov|ldr}} + %1 = call i1 @llvm.ct.select.i1(i1 %cond, i1 %a, i1 %b) + ret i1 %1 +} + +define i8 @ct_i8(i1 %cond, i8 %a, i8 %b) { +; DEFAULT-LABEL: ct_i8: +; DEFAULT: csel +; DEFAULT-NOT: b{{eq|ne}} +; DEFAULT-NOT: j +; DEFAULT-NOT: {{mov|ldr}} + %1 = call i8 @llvm.ct.select.i8(i1 %cond, i8 %a, i8 %b) + ret i8 %1 +} + +define i16 @ct_i16(i1 %cond, i16 %a, i16 %b) { +; DEFAULT-LABEL: ct_i16: +; DEFAULT: csel +; DEFAULT-NOT: b{{eq|ne}} +; DEFAULT-NOT: j +; DEFAULT-NOT: {{mov|ldr}} + %1 = call i16 @llvm.ct.select.i16(i1 %cond, i16 %a, i16 %b) + ret i16 %1 +} + +define i32 @ct_i32(i1 %cond, i32 %a, i32 %b) { +; DEFAULT-LABEL: ct_i32: +; DEFAULT: csel +; DEFAULT-NOT: b{{eq|ne}} +; DEFAULT-NOT: j +; DEFAULT-NOT: {{mov|ldr}} + %1 = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) + ret i32 %1 +} + +define i64 @ct_i64(i1 %cond, i64 %a, i64 %b) { +; DEFAULT-LABEL: ct_i64: +; DEFAULT: csel +; DEFAULT-NOT: b{{eq|ne}} +; DEFAULT-NOT: j +; DEFAULT-NOT: {{mov|ldr}} + %1 = call i64 @llvm.ct.select.i64(i1 %cond, i64 %a, i64 %b) + ret i64 %1 +} + +define i128 @ct_i128(i1 %cond, i128 %a, i128 %b) { +; DEFAULT-LABEL: ct_i128: +; DEFAULT: csel +; DEFAULT-NOT: b{{eq|ne}} +; DEFAULT-NOT: j +; DEFAULT-NOT: {{mov|ldr}} +; DEFAULT: csel +; DEFAULT-NOT: b{{eq|ne}} +; DEFAULT-NOT: j +; DEFAULT-NOT: {{mov|ldr}} + %1 = call i128 @llvm.ct.select.i128(i1 %cond, i128 %a, i128 %b) + ret i128 %1 +} + +define half @ct_f16(i1 %cond, half %a, half %b) { +; DEFAULT-LABEL: ct_f16: +; NOFP16: fcvt +; NOFP16: csel +; FP16: fcsel +; DEFAULT-NOT: b{{eq|ne}} +; DEFAULT-NOT: j +; DEFAULT-NOT: {{mov|ldr}} +; NOFP16: fcvt + %1 = call half @llvm.ct.select.f16(i1 %cond, half %a, half %b) + ret half %1 +} + +define float @ct_f32(i1 %cond, float %a, float %b) { +; DEFAULT-LABEL: ct_f32: +; DEFAULT: fcsel +; DEFAULT-NOT: b{{eq|ne}} +; DEFAULT-NOT: j +; DEFAULT-NOT: {{mov|ldr}} + %1 = call float @llvm.ct.select.f32(i1 %cond, float %a, float %b) + ret float %1 +} + +define double @ct_f64(i1 %cond, double %a, double %b) { +; DEFAULT-LABEL: ct_f64: +; DEFAULT: fcsel +; DEFAULT-NOT: b{{eq|ne}} +; DEFAULT-NOT: j +; DEFAULT-NOT: {{mov|ldr}} + %1 = call double @llvm.ct.select.f64(i1 %cond, double %a, double %b) + ret double %1 +} + +define <4 x i32> @ct_v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b) { +; DEFAULT-LABEL: ct_v4i32: +; DEFAULT: csel +; DEFAULT: csel +; DEFAULT: csel +; DEFAULT: csel +; DEFAULT-NOT: b{{eq|ne}} +; DEFAULT-NOT: j +; DEFAULT-NOT: {{ldr}} + %1 = call <4 x i32> @llvm.ct.select.v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b) + ret <4 x i32> %1 +} + +define <4 x float> @ct_v4f32(i1 %cond, <4 x float> %a, <4 x float> %b) { +; DEFAULT-LABEL: ct_v4f32: +; DEFAULT: fcsel +; DEFAULT: fcsel +; DEFAULT: fcsel +; DEFAULT: fcsel +; DEFAULT-NOT: b{{eq|ne}} +; DEFAULT-NOT: j +; DEFAULT-NOT: {{ldr}} + %1 = call <4 x float> @llvm.ct.select.v4f32(i1 %cond, <4 
x float> %a, <4 x float> %b)
+  ret <4 x float> %1
+}
\ No newline at end of file

From c16db7b4955ec5e59dd93c752a7ddc837ce57fb6 Mon Sep 17 00:00:00 2001
From: kumarak
Date: Mon, 18 Aug 2025 12:52:36 -0400
Subject: [PATCH 10/63] Move pseudo instruction expansion to post-RA
 target-specific expansion pass and bundle the instructions (#31)

---
 llvm/lib/Target/X86/X86ISelLowering.cpp       |  789 ++--------
 llvm/lib/Target/X86/X86ISelLowering.h         |    3 -
 llvm/lib/Target/X86/X86InstrCMovSetCC.td      |  289 ++--
 llvm/lib/Target/X86/X86InstrFragments.td      |    2 +-
 llvm/lib/Target/X86/X86InstrInfo.cpp          |  402 ++++-
 llvm/lib/Target/X86/X86InstrInfo.h            |    4 +-
 .../test/CodeGen/X86/ctselect-optimization.ll |   18 +-
 llvm/test/CodeGen/X86/ctselect-vector.ll      | 1400 +++++++++--------
 llvm/test/CodeGen/X86/ctselect.ll             |   18 +-
 9 files changed, 1512 insertions(+), 1413 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index fd7a70b32a23d..9f400a970793c 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -25352,18 +25352,19 @@ static SDValue LowerSIGN_EXTEND_Mask(SDValue Op, const SDLoc &dl,
 }
 
 SDValue X86TargetLowering::LowerCTSELECT(SDValue Op, SelectionDAG &DAG) const {
-  SDValue Cond = Op.getOperand(0);
-  SDValue Op1 = Op.getOperand(1);
-  SDValue Op2 = Op.getOperand(2);
+  SDValue Cond = Op.getOperand(0);    // condition
+  SDValue TrueOp = Op.getOperand(1);  // true_value
+  SDValue FalseOp = Op.getOperand(2); // false_value
   SDLoc DL(Op);
-  MVT VT = Op1.getSimpleValueType();
+  MVT VT = TrueOp.getSimpleValueType();
 
   // Handle soft float16 by converting to integer operations. Note that the
   // generic ISD::CTSELECT node keeps the (cond, true, false) operand order.
   if (isSoftF16(VT, Subtarget)) {
     MVT NVT = VT.changeTypeToInteger();
-    return DAG.getBitcast(VT, DAG.getNode(ISD::CTSELECT, DL, NVT, Cond,
-                                          DAG.getBitcast(NVT, Op1),
-                                          DAG.getBitcast(NVT, Op2)));
+    SDValue CtSelect =
+        DAG.getNode(ISD::CTSELECT, DL, NVT, Cond, DAG.getBitcast(NVT, TrueOp),
+                    DAG.getBitcast(NVT, FalseOp));
+    return DAG.getBitcast(VT, CtSelect);
   }
 
   // Handle vector types
@@ -25371,15 +25372,14 @@ SDValue X86TargetLowering::LowerCTSELECT(SDValue Op, SelectionDAG &DAG) const {
   // Handle soft float16 vectors
   if (isSoftF16(VT, Subtarget)) {
     MVT NVT = VT.changeVectorElementTypeToInteger();
-    return DAG.getBitcast(VT, DAG.getNode(ISD::CTSELECT, DL, NVT, Cond,
-                                          DAG.getBitcast(NVT, Op1),
-                                          DAG.getBitcast(NVT, Op2)));
+    SDValue CtSelect = DAG.getNode(ISD::CTSELECT, DL, NVT, Cond,
+                                   DAG.getBitcast(NVT, TrueOp),
+                                   DAG.getBitcast(NVT, FalseOp));
+    return DAG.getBitcast(VT, CtSelect);
   }
 
   unsigned VectorWidth = VT.getSizeInBits();
   MVT EltVT = VT.getVectorElementType();
-  unsigned NumElts = VT.getVectorNumElements();
-
   // Check if we have the necessary SIMD support
   bool HasSSE = Subtarget.hasSSE1();
   bool HasAVX = Subtarget.hasAVX();
@@ -25399,22 +25399,12 @@ SDValue X86TargetLowering::LowerCTSELECT(SDValue Op, SelectionDAG &DAG) const {
 
   // Handle special cases for floating point vectors
   if (EltVT.isFloatingPoint()) {
-    // For AVX-512, use mask-based selection for better performance
-    if (HasAVX512 && VectorWidth == 512) {
-      // Convert condition to mask and use masked select
-      MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
-      SDValue Mask = DAG.getSetCC(DL, MaskVT, Cond,
-                                  DAG.getConstant(0, DL, Cond.getValueType()),
-                                  ISD::SETNE);
-      return DAG.getSelect(DL, VT, Mask, Op1, Op2);
-    }
-
    // For vector floating point with AVX, use VBLENDV-style operations
    if (HasAVX && (VectorWidth == 256 || VectorWidth == 128)) {
      // Convert to bitwise operations using the condition
      MVT IntVT = VT.changeVectorElementTypeToInteger();
-      SDValue IntOp1 = DAG.getBitcast(IntVT, Op1);
-      SDValue IntOp2 = DAG.getBitcast(IntVT, Op2);
+      SDValue IntOp1 = DAG.getBitcast(IntVT, TrueOp);
+      SDValue IntOp2 = DAG.getBitcast(IntVT, FalseOp);
 
       // Create the CTSELECT node with integer types
       SDValue IntResult =
@@ -25429,131 +25419,8 @@ SDValue X86TargetLowering::LowerCTSELECT(SDValue Op, SelectionDAG &DAG) const {
   // use the generic X86 CTSELECT node which will be matched by the patterns
   SDValue CC = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8);
   SDValue EFLAGS = EmitTest(Cond, X86::COND_NE, DL, DAG, Subtarget);
-
-  // Create the X86 CTSELECT node - note operand order: false, true, cc, flags
-  return DAG.getNode(X86ISD::CTSELECT, DL, VT, Op2, Op1, CC, EFLAGS);
-  }
-
-  // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
-  // are available, Otherwise FP cmovs get lowered into a less efficient branch
-  // sequence later.
-  if (Cond.getOpcode() == ISD::SETCC && isScalarFPTypeInSSEReg(VT) &&
-      VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
-    SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
-    bool IsAlwaysSignaling;
-    unsigned SSECC =
-        translateX86FSETCC(cast<CondCodeSDNode>(Cond.getOperand(2))->get(),
-                           CondOp0, CondOp1, IsAlwaysSignaling);
-
-    // TODO: CTSELECT does not look for AVX support and optimize it using vector
-    // select
-    if (SSECC < 8) {
-      SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
-                                DAG.getTargetConstant(SSECC, DL, MVT::i8));
-      SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
-      SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
-      return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
-    }
-  }
-
-  // Try to optimize special patterns when comparing with zero
-  if (Cond.getOpcode() == X86ISD::SETCC &&
-      Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
-      isNullConstant(Cond.getOperand(1).getOperand(1))) {
-
-    SDValue CmpOp0 = Cond.getOperand(1).getOperand(0);
-    unsigned CondCode = Cond.getConstantOperandVal(0);
-
-    // Special handling for __builtin_ffs(X) - 1 pattern
-    auto MatchFFSMinus1 = [&](SDValue Op1, SDValue Op2) {
-      return (Op1.getOpcode() == ISD::CTTZ_ZERO_UNDEF && Op1.hasOneUse() &&
-              Op1.getOperand(0) == CmpOp0 && isAllOnesConstant(Op2));
-    };
-
-    if ((VT == MVT::i32 || VT == MVT::i64) &&
-        ((CondCode == X86::COND_NE && MatchFFSMinus1(Op1, Op2)) ||
-         (CondCode == X86::COND_E && MatchFFSMinus1(Op2, Op1)))) {
-      // Keep original comparison for FFS pattern
-    } else {
-
-      auto TryOptimizeAndOneSelect =
-          [&](SDValue CmpOp0, SDValue Op1, SDValue Op2, unsigned CondCode,
-              SDLoc DL, SelectionDAG &DAG,
-              const X86Subtarget &Subtarget) -> SDValue {
-        if (CondCode != X86::COND_E || CmpOp0.getOpcode() != ISD::AND ||
-            !isOneConstant(CmpOp0.getOperand(1)))
-          return SDValue();
-
-        EVT CmpVT = CmpOp0.getValueType();
-        EVT SelectVT = Op1.getValueType();
-
-        /// function to create a mask for LSB operations
-        auto SplatLSB = [&](EVT SplatVT) {
-          SDValue AdjustedValue;
-
-          if (CmpVT.bitsGT(SplatVT)) {
-            AdjustedValue = DAG.getNode(ISD::TRUNCATE, DL, SplatVT, CmpOp0);
-          } else if (CmpVT.bitsLT(SplatVT)) {
-            SDValue Extended =
-                DAG.getNode(ISD::ANY_EXTEND, DL, SplatVT, CmpOp0.getOperand(0));
-            AdjustedValue = DAG.getNode(ISD::AND, DL, SplatVT, Extended,
-                                        DAG.getConstant(1, DL, SplatVT));
-          } else {
-            AdjustedValue = CmpOp0;
-          }
-
-          return DAG.getNegative(AdjustedValue, DL, SplatVT);
-        };
-
-        // CTSELECT (AND(X,1) == 0), 0, -1 -> NEG(AND(X,1))
-        if (isNullConstant(Op1) && isAllOnesConstant(Op2))
-            return SplatLSB(SelectVT);
-
-        // CTSELECT (AND(X,1) == 0), C1, C2 ->
-        // XOR(C1,AND(NEG(AND(X,1)),XOR(C1,C2))
-        if (!Subtarget.canUseCMOV() && isa<ConstantSDNode>(Op1) &&
-            isa<ConstantSDNode>(Op2)) {
-          SDValue Mask = SplatLSB(SelectVT);
-          SDValue Diff = DAG.getNode(ISD::XOR, DL, SelectVT, Op1, Op2);
-          SDValue Flip = DAG.getNode(ISD::AND, DL, SelectVT, Mask, Diff);
-          return DAG.getNode(ISD::XOR, DL, SelectVT, Op1, Flip);
-        }
-
-        return SDValue();
-      };
-
-      /// Try to optimize min/max patterns with sign bit operations
-      auto TryOptimizeMinMaxPattern =
-          [&](SDValue CmpOp0, SDValue Op1, SDValue Op2, unsigned CondCode,
-              MVT VT, SDLoc DL, SelectionDAG &DAG,
-              const X86Subtarget &Subtarget) -> SDValue {
-        if ((VT != MVT::i32 && VT != MVT::i64) || !isNullConstant(Op2) ||
-            CmpOp0 != Op1)
-          return SDValue();
-
-        if (CondCode == X86::COND_S || // smin(x, 0)
-            (CondCode == X86::COND_G && hasAndNot(Op1))) { // smax(x, 0)
-          // (ctselect (x < 0), x, 0) -> ((x >> (size_in_bits(x)-1))) & x
-          // (ctselect (x > 0), x, 0) -> (~(x >> (size_in_bits(x)-1))) & x
-          unsigned ShCt = VT.getSizeInBits() - 1;
-          SDValue ShiftAmt = DAG.getConstant(ShCt, DL, VT);
-          SDValue Shift = DAG.getNode(ISD::SRA, DL, VT, Op1, ShiftAmt);
-          if (CondCode == X86::COND_G)
-            Shift = DAG.getNOT(DL, Shift, VT);
-          return DAG.getNode(ISD::AND, DL, VT, Shift, Op1);
-        }
-        return SDValue();
-      };
-      // Try AND(X,1) optimizations
-      if (SDValue OptResult = TryOptimizeAndOneSelect(
-              CmpOp0, Op1, Op2, CondCode, DL, DAG, Subtarget))
-        return OptResult;
-
-      // Try min/max pattern optimizations
-      if (SDValue OptResult = TryOptimizeMinMaxPattern(
-              CmpOp0, Op1, Op2, CondCode, VT, DL, DAG, Subtarget))
-        return OptResult;
-    }
+  // Create the X86 CTSELECT node - note operand order: false, true, cc, flags
+  return DAG.getNode(X86ISD::CTSELECT, DL, VT, FalseOp, TrueOp, CC, EFLAGS);
   }
 
   // Look past (and (setcc_carry (cmp ...)), 1)
@@ -25623,9 +25490,9 @@ SDValue X86TargetLowering::LowerCTSELECT(SDValue Op, SelectionDAG &DAG) const {
   ProcessConditionFlags(Cond, VT, DL, DAG, Subtarget);
 
   // Handle i8 CTSELECT with truncate optimization
-  if (Op.getValueType() == MVT::i8 &&
-      Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
-    SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
+  if (Op.getValueType() == MVT::i8 && TrueOp.getOpcode() == ISD::TRUNCATE &&
+      FalseOp.getOpcode() == ISD::TRUNCATE) {
+    SDValue T1 = TrueOp.getOperand(0), T2 = FalseOp.getOperand(0);
     if (T1.getValueType() == T2.getValueType() &&
         T1.getOpcode() != ISD::CopyFromReg &&
         T2.getOpcode() != ISD::CopyFromReg) {
@@ -25637,26 +25504,26 @@ SDValue X86TargetLowering::LowerCTSELECT(SDValue Op, SelectionDAG &DAG) const {
 
   // Promote small integer types to avoid partial register stalls
   if ((Op.getValueType() == MVT::i8) ||
-      (Op.getValueType() == MVT::i16 && !X86::mayFoldLoad(Op1, Subtarget) &&
-       !X86::mayFoldLoad(Op2, Subtarget))) {
-    Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
-    Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
-    SDValue Ops[] = {Op2, Op1, CC, ProcessedCond};
+      (Op.getValueType() == MVT::i16 && !X86::mayFoldLoad(TrueOp, Subtarget) &&
+       !X86::mayFoldLoad(FalseOp, Subtarget))) {
+    TrueOp = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, TrueOp);
+    FalseOp = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, FalseOp);
+    SDValue Ops[] = {FalseOp, TrueOp, CC, ProcessedCond};
     SDValue Cmov = DAG.getNode(X86ISD::CTSELECT, DL, MVT::i32, Ops);
     return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
   }
 
   if (isScalarFPTypeInSSEReg(VT)) {
     MVT IntVT = (VT == MVT::f32) ? 
MVT::i32 : MVT::i64; - Op1 = DAG.getBitcast(IntVT, Op1); - Op2 = DAG.getBitcast(IntVT, Op2); - SDValue Ops[] = {Op2, Op1, CC, ProcessedCond}; + TrueOp = DAG.getBitcast(IntVT, TrueOp); + FalseOp = DAG.getBitcast(IntVT, FalseOp); + SDValue Ops[] = {FalseOp, TrueOp, CC, ProcessedCond}; SDValue CtSelect = DAG.getNode(X86ISD::CTSELECT, DL, IntVT, Ops); return DAG.getBitcast(VT, CtSelect); } // Create final CTSELECT node - SDValue Ops[] = {Op2, Op1, CC, ProcessedCond}; + SDValue Ops[] = {FalseOp, TrueOp, CC, ProcessedCond}; return DAG.getNode(X86ISD::CTSELECT, DL, Op.getValueType(), Ops, Op->getFlags()); } @@ -36720,456 +36587,6 @@ X86TargetLowering::EmitLoweredCascadedSelect(MachineInstr &FirstCMOV, return SinkMBB; } -struct CtSelectInstructions { - unsigned PAndOpc; - unsigned PAndnOpc; - unsigned POrOpc; - unsigned BroadcastOpc; - unsigned IntMoveOpc; - unsigned MoveOpc; - bool Use256; - bool UseVEX; - bool UseBlendInstr; -}; - -static CtSelectInstructions -getCtSelectInstructions(unsigned Opcode, const X86Subtarget &Subtarget) { - CtSelectInstructions Instructions = {}; - - switch (Opcode) { - case X86::CTSELECT_V2F64: - if (Subtarget.hasSSE2()) { - Instructions.PAndOpc = X86::PANDrr; - Instructions.PAndnOpc = X86::PANDNrr; - Instructions.POrOpc = X86::PORrr; - Instructions.BroadcastOpc = X86::PSHUFDri; - Instructions.IntMoveOpc = X86::MOVDI2PDIrr; - Instructions.MoveOpc = X86::MOVAPDrr; - } else { - llvm_unreachable("Double precision vectors require SSE2"); - } - break; - case X86::CTSELECT_V4F32: - if (Subtarget.hasSSE41()) { - Instructions.PAndOpc = X86::PANDrr; - Instructions.PAndnOpc = X86::PANDNrr; - Instructions.POrOpc = X86::PORrr; - Instructions.BroadcastOpc = X86::PSHUFDri; - Instructions.IntMoveOpc = X86::MOVDI2PDIrr; - Instructions.MoveOpc = X86::MOVAPSrr; - Instructions.UseBlendInstr = true; - } else if (Subtarget.hasSSE2()) { - Instructions.PAndOpc = X86::PANDrr; - Instructions.PAndnOpc = X86::PANDNrr; - Instructions.POrOpc = X86::PORrr; - Instructions.BroadcastOpc = X86::PSHUFDri; - Instructions.IntMoveOpc = X86::MOVDI2PDIrr; - Instructions.MoveOpc = X86::MOVAPSrr; - } else { - Instructions.PAndOpc = X86::ANDPSrr; - Instructions.PAndnOpc = X86::ANDNPSrr; - Instructions.POrOpc = X86::ORPSrr; - Instructions.BroadcastOpc = X86::SHUFPSrri; - Instructions.IntMoveOpc = X86::MOVSS2DIrr; - Instructions.MoveOpc = X86::MOVAPSrr; - } - break; - case X86::CTSELECT_V4I32: - case X86::CTSELECT_V2I64: - case X86::CTSELECT_V8I16: - case X86::CTSELECT_V16I8: - if (Subtarget.hasSSE2()) { - Instructions.PAndOpc = X86::PANDrr; - Instructions.PAndnOpc = X86::PANDNrr; - Instructions.POrOpc = X86::PORrr; - Instructions.BroadcastOpc = X86::PSHUFDri; - Instructions.IntMoveOpc = X86::MOVDI2PDIrr; - Instructions.MoveOpc = X86::MOVDQArr; - } else { - llvm_unreachable("Integer vector operations require SSE2"); - } - break; - case X86::CTSELECT_V8F16: - if (Subtarget.hasSSE2()) { - Instructions.PAndOpc = X86::PANDrr; - Instructions.PAndnOpc = X86::PANDNrr; - Instructions.POrOpc = X86::PORrr; - Instructions.BroadcastOpc = X86::PSHUFDri; - Instructions.IntMoveOpc = X86::MOVDI2PDIrr; - Instructions.MoveOpc = X86::MOVDQArr; - } else { - llvm_unreachable("FP16 vector operations require SSE2"); - } - break; - case X86::CTSELECT_V4F32X: - case X86::CTSELECT_V4I32X: - case X86::CTSELECT_V2F64X: - case X86::CTSELECT_V2I64X: - case X86::CTSELECT_V8I16X: - case X86::CTSELECT_V16I8X: - case X86::CTSELECT_V8F16X: - if (Subtarget.hasAVX()) { - Instructions.PAndOpc = X86::VPANDrr; - Instructions.PAndnOpc = X86::VPANDNrr; 
- Instructions.POrOpc = X86::VPORrr; - Instructions.BroadcastOpc = X86::VPSHUFDri; - Instructions.IntMoveOpc = X86::VMOVDI2PDIrr; - Instructions.MoveOpc = (Opcode == X86::CTSELECT_V4F32X) ? X86::VMOVAPSrr - : (Opcode == X86::CTSELECT_V2F64X) - ? X86::VMOVAPDrr - : X86::VMOVDQArr; - Instructions.UseVEX = true; - } else { - llvm_unreachable("AVX variants require AVX support"); - } - break; - case X86::CTSELECT_V8F32: - case X86::CTSELECT_V8I32: - if (Subtarget.hasAVX()) { - Instructions.PAndOpc = X86::VPANDYrr; - Instructions.PAndnOpc = X86::VPANDNYrr; - Instructions.POrOpc = X86::VPORYrr; - Instructions.BroadcastOpc = X86::VPERMILPSYri; - Instructions.IntMoveOpc = X86::VMOVDI2PDIrr; - Instructions.MoveOpc = - (Opcode == X86::CTSELECT_V8F32) ? X86::VMOVAPSYrr : X86::VMOVDQAYrr; - Instructions.Use256 = true; - Instructions.UseVEX = true; - } else { - llvm_unreachable("256-bit vectors require AVX"); - } - break; - case X86::CTSELECT_V4F64: - case X86::CTSELECT_V4I64: - if (Subtarget.hasAVX()) { - Instructions.PAndOpc = X86::VPANDYrr; - Instructions.PAndnOpc = X86::VPANDNYrr; - Instructions.POrOpc = X86::VPORYrr; - Instructions.BroadcastOpc = X86::VPERMILPDYri; - Instructions.IntMoveOpc = X86::VMOVDI2PDIrr; - Instructions.MoveOpc = - (Opcode == X86::CTSELECT_V4F64) ? X86::VMOVAPDYrr : X86::VMOVDQAYrr; - Instructions.Use256 = true; - Instructions.UseVEX = true; - } else { - llvm_unreachable("256-bit vectors require AVX"); - } - break; - case X86::CTSELECT_V16I16: - case X86::CTSELECT_V32I8: - case X86::CTSELECT_V16F16: - if (Subtarget.hasAVX2()) { - Instructions.PAndOpc = X86::VPANDYrr; - Instructions.PAndnOpc = X86::VPANDNYrr; - Instructions.POrOpc = X86::VPORYrr; - Instructions.BroadcastOpc = X86::VPERMILPSYri; - Instructions.IntMoveOpc = X86::VMOVDI2PDIrr; - Instructions.MoveOpc = X86::VMOVDQAYrr; - Instructions.Use256 = true; - Instructions.UseVEX = true; - } else if (Subtarget.hasAVX()) { - Instructions.PAndOpc = X86::VPANDYrr; - Instructions.PAndnOpc = X86::VPANDNYrr; - Instructions.POrOpc = X86::VPORYrr; - Instructions.BroadcastOpc = X86::VPERMILPSYri; - Instructions.IntMoveOpc = X86::VMOVDI2PDIrr; - Instructions.MoveOpc = X86::VMOVDQAYrr; - Instructions.Use256 = true; - Instructions.UseVEX = true; - } else { - llvm_unreachable("256-bit integer vectors require AVX"); - } - break; - default: - llvm_unreachable("Unexpected CTSELECT opcode"); - } - - return Instructions; -} - -static Register createScalarMask(MachineBasicBlock *MBB, MachineInstr &MI, - const MIMetadata &MIMD, - const TargetInstrInfo *TII, - MachineRegisterInfo &MRI) { - const TargetRegisterClass *GR8Class = &X86::GR8RegClass; - const TargetRegisterClass *GR32Class = &X86::GR32RegClass; - - Register CondByteReg = MRI.createVirtualRegister(GR8Class); - Register CondReg = MRI.createVirtualRegister(GR32Class); - Register ScalarMaskReg = MRI.createVirtualRegister(GR32Class); - - // Create a condition value using appropriate SETCC instruction - BuildMI(*MBB, MI, MIMD, TII->get(X86::SETCCr), CondByteReg) - .addImm(X86::COND_E) - .setMIFlag(MachineInstr::MIFlag::NoMerge); - - // Zero-extend byte to 32-bit register (movzbl %al, %eax) - BuildMI(*MBB, MI, MIMD, TII->get(X86::MOVZX32rr8), CondReg) - .addReg(CondByteReg) - .setMIFlag(MachineInstr::MIFlag::NoMerge); - - // Negate to convert 1 -> 0xFFFFFFFF, 0 -> 0x00000000 (negl %eax) - BuildMI(*MBB, MI, MIMD, TII->get(X86::NEG32r), ScalarMaskReg) - .addReg(CondReg) - .setMIFlag(MachineInstr::MIFlag::NoMerge); - - return ScalarMaskReg; -} - -static Register broadcastScalarMask( - 
MachineBasicBlock *MBB, MachineInstr &MI, const MIMetadata &MIMD, - const TargetInstrInfo *TII, MachineRegisterInfo &MRI, - Register ScalarMaskReg, const TargetRegisterClass *RC, - const CtSelectInstructions &Instructions, const X86Subtarget &Subtarget) { - // Step 1: Move scalar mask to vector register - Register VecFromScalarReg = MRI.createVirtualRegister(RC); - BuildMI(*MBB, MI, MIMD, TII->get(Instructions.IntMoveOpc), VecFromScalarReg) - .addReg(ScalarMaskReg) - .setMIFlag(MachineInstr::MIFlag::NoMerge); - - // Step 2: Broadcast mask across all elements - Register MaskReg = MRI.createVirtualRegister(RC); - if (Instructions.Use256) { - // For 256-bit vectors, broadcast across all elements - BuildMI(*MBB, MI, MIMD, TII->get(Instructions.BroadcastOpc), MaskReg) - .addReg(VecFromScalarReg) - .addImm(0) - .setMIFlag(MachineInstr::MIFlag::NoMerge); // Broadcast element 0 to all - // positions - } else { - // For 128-bit vectors - if (Subtarget.hasSSE2() || Instructions.UseVEX) { - // Use PSHUFD for efficient broadcasting - BuildMI(*MBB, MI, MIMD, TII->get(Instructions.BroadcastOpc), MaskReg) - .addReg(VecFromScalarReg) - .addImm(0x00) - .setMIFlag(MachineInstr::MIFlag::NoMerge); // Broadcast element 0 to - // all positions - } else { - // SSE1 fallback using SHUFPS - BuildMI(*MBB, MI, MIMD, TII->get(Instructions.BroadcastOpc), MaskReg) - .addReg(VecFromScalarReg) - .addReg(VecFromScalarReg) - .addImm(0x00) - .setMIFlag(MachineInstr::MIFlag::NoMerge); // Broadcast element 0 to - // all positions - } - } - - return MaskReg; -} - -MachineBasicBlock * -X86TargetLowering::EmitLoweredCtSelect(MachineInstr &MI, - MachineBasicBlock *ThisMBB) const { - const TargetInstrInfo *TII = Subtarget.getInstrInfo(); - const MIMetadata MIMD(MI); - - MachineRegisterInfo &MRI = ThisMBB->getParent()->getRegInfo(); - DebugLoc DL = MI.getDebugLoc(); - - // Extract operands: dst = ctselect src1, src2, cond - Register DstReg = MI.getOperand(0).getReg(); - Register TrueReg = MI.getOperand(1).getReg(); - Register FalseReg = MI.getOperand(2).getReg(); - // Note: CondCode from MI.getOperand(3).getImm() is not used - we hardcode - // COND_E for sete - - // Get the vector type to determine the appropriate instructions - const TargetRegisterClass *RC = MRI.getRegClass(DstReg); - unsigned Opcode = MI.getOpcode(); - - // Get instruction opcodes for this operation - CtSelectInstructions Instructions = - getCtSelectInstructions(Opcode, Subtarget); - - // Step 1: Create scalar mask using SETCC + NEG - Register ScalarMaskReg = createScalarMask(ThisMBB, MI, MIMD, TII, MRI); - - // Step 2: Move scalar mask to vector register and broadcast - Register MaskReg = broadcastScalarMask( - ThisMBB, MI, MIMD, TII, MRI, ScalarMaskReg, RC, Instructions, Subtarget); - - // Step 3: Implement blend operation - if (Instructions.UseBlendInstr && Subtarget.hasSSE41() && - !Instructions.Use256) { - // Use dedicated blend instructions for SSE4.1+ - unsigned BlendOpc; - switch (Opcode) { - case X86::CTSELECT_V4F32: - BlendOpc = X86::BLENDVPSrr0; - break; - case X86::CTSELECT_V2F64: - BlendOpc = X86::BLENDVPDrr0; - break; - default: - BlendOpc = X86::PBLENDVBrr0; - break; - } - - // BLENDV uses XMM0 as implicit mask register - BuildMI(*ThisMBB, MI, MIMD, TII->get(X86::MOVAPSrr), X86::XMM0) - .addReg(MaskReg) - .setMIFlag(MachineInstr::MIFlag::NoMerge); - - BuildMI(*ThisMBB, MI, MIMD, TII->get(BlendOpc), DstReg) - .addReg(FalseReg) - .addReg(TrueReg) - .setMIFlag(MachineInstr::MIFlag::NoMerge); - } else { - // Use traditional AND/ANDN/OR approach - 
Register TempReg = MRI.createVirtualRegister(RC); - Register MaskCopyReg = MRI.createVirtualRegister(RC); - Register VecAndReg = MRI.createVirtualRegister(RC); - Register VecAndnReg = MRI.createVirtualRegister(RC); - Register FinalResultReg = MRI.createVirtualRegister(RC); - - // Copy mask for first operation - BuildMI(*ThisMBB, MI, MIMD, TII->get(Instructions.MoveOpc), TempReg) - .addReg(MaskReg) - .setMIFlag(MachineInstr::MIFlag::NoMerge); - - // mask & true_val - BuildMI(*ThisMBB, MI, MIMD, TII->get(Instructions.PAndOpc), VecAndReg) - .addReg(TempReg) - .addReg(TrueReg) - .setMIFlag(MachineInstr::MIFlag::NoMerge); - - // Copy mask for second operation - BuildMI(*ThisMBB, MI, MIMD, TII->get(Instructions.MoveOpc), MaskCopyReg) - .addReg(MaskReg) - .setMIFlag(MachineInstr::MIFlag::NoMerge); - - // ~mask & false_val - BuildMI(*ThisMBB, MI, MIMD, TII->get(Instructions.PAndnOpc), VecAndnReg) - .addReg(MaskCopyReg) - .addReg(FalseReg) - .setMIFlag(MachineInstr::MIFlag::NoMerge); - - // Combine results - BuildMI(*ThisMBB, MI, MIMD, TII->get(Instructions.POrOpc), FinalResultReg) - .addReg(VecAndReg) - .addReg(VecAndnReg) - .setMIFlag(MachineInstr::MIFlag::NoMerge); - - // Move final result to destination - BuildMI(*ThisMBB, MI, MIMD, TII->get(Instructions.MoveOpc), DstReg) - .addReg(FinalResultReg) - .setMIFlag(MachineInstr::MIFlag::NoMerge); - } - // Remove the original instruction - MI.eraseFromParent(); - return ThisMBB; -} - -MachineBasicBlock * -X86TargetLowering::EmitLoweredCtSelectNoCMOV(MachineInstr &MI, - MachineBasicBlock *ThisMBB) const { - const TargetInstrInfo *TII = Subtarget.getInstrInfo(); - const MIMetadata MIMD(MI); - MachineRegisterInfo &MRI = ThisMBB->getParent()->getRegInfo(); - DebugLoc DL = MI.getDebugLoc(); - - // Get operands - Register DstReg = MI.getOperand(0).getReg(); - Register TrueReg = MI.getOperand(1).getReg(); - Register FalseReg = MI.getOperand(2).getReg(); - - X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm()); - X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC); - - const TargetRegisterClass *RC = MRI.getRegClass(DstReg); - unsigned SETCCOp, MOVZXOp, NEGOp, ANDOp, XOROp, OROp; - const TargetRegisterClass *condRC; - - if (RC == &X86::GR8RegClass) { - SETCCOp = X86::SETCCr; - MOVZXOp = 0; // No extension needed for 8-bit - NEGOp = X86::NEG8r; - ANDOp = X86::AND8rr; - XOROp = X86::XOR8ri; - OROp = X86::OR8rr; - condRC = &X86::GR8RegClass; - } else if (RC == &X86::GR16RegClass) { - SETCCOp = X86::SETCCr; - MOVZXOp = X86::MOVZX16rr8; - NEGOp = X86::NEG16r; - ANDOp = X86::AND16rr; - XOROp = X86::XOR16ri; - OROp = X86::OR16rr; - condRC = &X86::GR16RegClass; - } else if (RC == &X86::GR32RegClass) { - SETCCOp = X86::SETCCr; - MOVZXOp = X86::MOVZX32rr8; - NEGOp = X86::NEG32r; - ANDOp = X86::AND32rr; - XOROp = X86::XOR32ri; - OROp = X86::OR32rr; - condRC = &X86::GR32RegClass; - } else { - llvm_unreachable("Unsupported register class for conditional select"); - } - - // Step 1: Create condition value using SETCC instruction - Register CondByteReg = MRI.createVirtualRegister(&X86::GR8RegClass); - BuildMI(*ThisMBB, MI, MIMD, TII->get(SETCCOp), CondByteReg) - .addImm(OppCC) - .setMIFlag(MachineInstr::MIFlag::NoMerge); - - Register CondReg; - if (RC == &X86::GR8RegClass) { - // For 8-bit, use the byte register directly - CondReg = CondByteReg; - } else { - // For 16/32-bit, zero-extend the byte to the target size - CondReg = MRI.createVirtualRegister(condRC); - BuildMI(*ThisMBB, MI, MIMD, TII->get(MOVZXOp), CondReg) - .addReg(CondByteReg) - 
.setMIFlag(MachineInstr::MIFlag::NoMerge);
-  }
-
-  // Step 2: Convert condition to mask (1 -> 0xFFFF..., 0 -> 0x0000...)
-  // Use NEG to create all-ones mask when condition is true
-  Register MaskReg = MRI.createVirtualRegister(condRC);
-  BuildMI(*ThisMBB, MI, MIMD, TII->get(NEGOp), MaskReg)
-      .addReg(CondReg)
-      .setMIFlag(MachineInstr::MIFlag::NoMerge);
-
-  // Step 3: Implement conditional select using bitwise operations
-  // Result = (TrueReg & Mask) | (FalseReg & ~Mask)
-
-  // Create inverted mask (~Mask)
-  Register InvMaskReg = MRI.createVirtualRegister(condRC);
-  BuildMI(*ThisMBB, MI, MIMD, TII->get(XOROp), InvMaskReg)
-      .addReg(MaskReg)
-      .addImm(-1)
-      .setMIFlag(MachineInstr::MIFlag::NoMerge); // XOR with all 1s to invert
-
-  // Compute TrueReg & Mask
-  Register TrueMaskedReg = MRI.createVirtualRegister(condRC);
-  BuildMI(*ThisMBB, MI, MIMD, TII->get(ANDOp), TrueMaskedReg)
-      .addReg(TrueReg)
-      .addReg(MaskReg)
-      .setMIFlag(MachineInstr::MIFlag::NoMerge);
-
-  // Compute FalseReg & ~Mask
-  Register FalseMaskedReg = MRI.createVirtualRegister(condRC);
-  BuildMI(*ThisMBB, MI, MIMD, TII->get(ANDOp), FalseMaskedReg)
-      .addReg(FalseReg)
-      .addReg(InvMaskReg)
-      .setMIFlag(MachineInstr::MIFlag::NoMerge);
-
-  // Final result: (TrueReg & Mask) | (FalseReg & ~Mask)
-  BuildMI(*ThisMBB, MI, MIMD, TII->get(OROp), DstReg)
-      .addReg(TrueMaskedReg)
-      .addReg(FalseMaskedReg)
-      .setMIFlag(MachineInstr::MIFlag::NoMerge);
-
-  // Remove the original instruction
-  MI.eraseFromParent();
-  return ThisMBB;
-}
-
 MachineBasicBlock *
 X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
                                      MachineBasicBlock *ThisMBB) const {
@@ -38565,6 +37982,124 @@ X86TargetLowering::emitPatchableEventCall(MachineInstr &MI,
   return BB;
 }
 
+MachineBasicBlock *
+X86TargetLowering::EmitLoweredCtSelect(MachineInstr &MI,
+                                       MachineBasicBlock *ThisMBB) const {
+  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+  const MIMetadata MIMD(MI);
+  MachineRegisterInfo &MRI = ThisMBB->getParent()->getRegInfo();
+
+  // Get operands
+  Register DstReg = MI.getOperand(0).getReg();
+  Register TrueReg = MI.getOperand(1).getReg();
+  Register FalseReg = MI.getOperand(2).getReg();
+
+  X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm());
+  X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
+
+  const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
+  unsigned SETCCOp, MOVZXOp, NEGOp, ANDOp, XOROp, OROp;
+  const TargetRegisterClass *condRC;
+
+  if (RC == &X86::GR8RegClass) {
+    SETCCOp = X86::SETCCr;
+    MOVZXOp = 0; // No extension needed for 8-bit
+    NEGOp = X86::NEG8r;
+    ANDOp = X86::AND8rr;
+    XOROp = X86::XOR8ri;
+    OROp = X86::OR8rr;
+    condRC = &X86::GR8RegClass;
+  } else if (RC == &X86::GR16RegClass) {
+    SETCCOp = X86::SETCCr;
+    MOVZXOp = X86::MOVZX16rr8;
+    NEGOp = X86::NEG16r;
+    ANDOp = X86::AND16rr;
+    XOROp = X86::XOR16ri;
+    OROp = X86::OR16rr;
+    condRC = &X86::GR16RegClass;
+  } else if (RC == &X86::GR32RegClass) {
+    SETCCOp = X86::SETCCr;
+    MOVZXOp = X86::MOVZX32rr8;
+    NEGOp = X86::NEG32r;
+    ANDOp = X86::AND32rr;
+    XOROp = X86::XOR32ri;
+    OROp = X86::OR32rr;
+    condRC = &X86::GR32RegClass;
+  } else {
+    llvm_unreachable("Unsupported register class for conditional select");
+  }
+
+  // Step 1: Create condition value using SETCC instruction. Keep a pointer to
+  // the first instruction of the expansion; the whole emitted sequence is
+  // bundled below, after the pseudo has been erased.
+  Register CondByteReg = MRI.createVirtualRegister(&X86::GR8RegClass);
+  MachineInstr *FirstEmitted =
+      BuildMI(*ThisMBB, MI, MIMD, TII->get(SETCCOp), CondByteReg)
+          .addImm(OppCC)
+          .setMIFlag(MachineInstr::MIFlag::NoMerge);
+
+  Register CondReg;
+  if (RC == &X86::GR8RegClass) {
+    // For 8-bit, use the byte register directly
+    CondReg = CondByteReg;
+  } else {
+    // For 16/32-bit, zero-extend the byte to the target size
+    CondReg = MRI.createVirtualRegister(condRC);
+    BuildMI(*ThisMBB, MI, MIMD, TII->get(MOVZXOp), CondReg)
+        .addReg(CondByteReg)
+        .setMIFlag(MachineInstr::MIFlag::NoMerge);
+  }
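+
+  // The remaining steps compute the classic branch-free select identity
+  //   Result = (True & Mask) | (False & ~Mask)
+  // With Mask == all-ones this yields (T & -1) | (F & 0) == T, and with
+  // Mask == 0 it yields (T & 0) | (F & -1) == F, so the result never depends
+  // on condition-directed control flow.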
+
+  // Step 2: Convert condition to mask (1 -> 0xFFFF..., 0 -> 0x0000...)
+  // Use NEG to create all-ones mask when condition is true
+  Register MaskReg = MRI.createVirtualRegister(condRC);
+  BuildMI(*ThisMBB, MI, MIMD, TII->get(NEGOp), MaskReg)
+      .addReg(CondReg)
+      .setMIFlag(MachineInstr::MIFlag::NoMerge);
+
+  // Step 3: Implement conditional select using bitwise operations
+  // Result = (TrueReg & Mask) | (FalseReg & ~Mask)
+
+  // Create inverted mask (~Mask)
+  Register InvMaskReg = MRI.createVirtualRegister(condRC);
+  BuildMI(*ThisMBB, MI, MIMD, TII->get(XOROp), InvMaskReg)
+      .addReg(MaskReg)
+      .addImm(-1)
+      .setMIFlag(MachineInstr::MIFlag::NoMerge); // XOR with all 1s to invert
+
+  // Compute TrueReg & Mask
+  Register TrueMaskedReg = MRI.createVirtualRegister(condRC);
+  BuildMI(*ThisMBB, MI, MIMD, TII->get(ANDOp), TrueMaskedReg)
+      .addReg(TrueReg)
+      .addReg(MaskReg)
+      .setMIFlag(MachineInstr::MIFlag::NoMerge);
+
+  // Compute FalseReg & ~Mask
+  Register FalseMaskedReg = MRI.createVirtualRegister(condRC);
+  BuildMI(*ThisMBB, MI, MIMD, TII->get(ANDOp), FalseMaskedReg)
+      .addReg(FalseReg)
+      .addReg(InvMaskReg)
+      .setMIFlag(MachineInstr::MIFlag::NoMerge);
+
+  // Final result: (TrueReg & Mask) | (FalseReg & ~Mask)
+  BuildMI(*ThisMBB, MI, MIMD, TII->get(OROp), DstReg)
+      .addReg(TrueMaskedReg)
+      .addReg(FalseMaskedReg)
+      .setMIFlag(MachineInstr::MIFlag::NoMerge);
+
+  // Record the last emitted instruction (the OR above) before erasing the
+  // pseudo; iterators to the surviving instructions stay valid across the
+  // erase, whereas MI's own iterator does not.
+  auto LastEmitted = std::prev(MI.getIterator());
+
+  // Remove the original instruction
+  MI.eraseFromParent();
+
+  // Bundle the expanded sequence so post-RA passes cannot reorder or split
+  // it; finalizeBundle creates the BUNDLE header for the range [first, last).
+  finalizeBundle(*ThisMBB, FirstEmitted->getIterator(),
+                 std::next(LastEmitted));
+  return ThisMBB;
+}
+
 MachineBasicBlock *
 X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
                                                MachineBasicBlock *BB) const {
@@ -38626,39 +38161,9 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
   case X86::CMOV_VK64:
     return EmitLoweredSelect(MI, BB);
 
-  case X86::CTSELECT_V2F64:
-  case X86::CTSELECT_V4F32:
-  case X86::CTSELECT_V8F16:
-  case X86::CTSELECT_V2I64:
-  case X86::CTSELECT_V4I32:
-  case X86::CTSELECT_V8I16:
-  case X86::CTSELECT_V16I8:
-  case X86::CTSELECT_V2F64X:
-  case X86::CTSELECT_V4F32X:
-  case X86::CTSELECT_V8F16X:
-  case X86::CTSELECT_V2I64X:
-  case X86::CTSELECT_V4I32X:
-  case X86::CTSELECT_V8I16X:
-  case X86::CTSELECT_V16I8X:
-  case X86::CTSELECT_V4I64:
-  case X86::CTSELECT_V8I32:
-  case X86::CTSELECT_V16I16:
-  case X86::CTSELECT_V32I8:
-  case X86::CTSELECT_V4F64:
-  case X86::CTSELECT_V8F32:
-  case X86::CTSELECT_V16F16:
-  case X86::CTSELECT_V8I64:
-  case X86::CTSELECT_V16I32:
-  case X86::CTSELECT_V32I16:
-  case X86::CTSELECT_V64I8:
-  case X86::CTSELECT_V8F64:
-  case X86::CTSELECT_V16F32:
-  case X86::CTSELECT_V32F16:
-    return EmitLoweredCtSelect(MI, BB);
-
   case X86::CTSELECT_GR16rr:
   case X86::CTSELECT_GR32rr:
-    return EmitLoweredCtSelectNoCMOV(MI, BB);
+    return EmitLoweredCtSelect(MI, BB);
 
   case X86::CTSELECT_FP32rr:
   case X86::CTSELECT_FP64rr:
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index 0c6f47a9f3ee5..7cd63a1d77c7d 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index 0c6f47a9f3ee5..7cd63a1d77c7d 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -1872,9 +1872,6 @@ namespace llvm {
     MachineBasicBlock *EmitLoweredSelect(MachineInstr &I,
                                          MachineBasicBlock *BB) const;
 
-    MachineBasicBlock *EmitLoweredCtSelectNoCMOV(MachineInstr &MI,
-                                                 MachineBasicBlock *BB) const;
-
     MachineBasicBlock *EmitLoweredCtSelect(MachineInstr &MI,
                                            MachineBasicBlock *BB) const;
 
diff --git a/llvm/lib/Target/X86/X86InstrCMovSetCC.td b/llvm/lib/Target/X86/X86InstrCMovSetCC.td
index 68c17f889e6d6..3081d9d22ab5d 100644
--- a/llvm/lib/Target/X86/X86InstrCMovSetCC.td
+++ b/llvm/lib/Target/X86/X86InstrCMovSetCC.td
@@ -114,14 +114,16 @@ let Uses = [EFLAGS], isNotDuplicable = 1, isPseudo = 1 in {
 multiclass CTSELECT<X86TypeInfo t> {
   // register-only
-  let isCommutable = 0, SchedRW = [WriteCMOV] in {
+  let isCommutable = 0, SchedRW = [WriteCMOV],
+      AsmString = "ctselect\t$dst, $src1, $src2, $cond" in {
     def rr : PseudoI<(outs t.RegClass:$dst),
                      (ins t.RegClass:$src1, t.RegClass:$src2, i8imm:$cond),
                      [(set t.RegClass:$dst, (X86ctselect t.RegClass:$src1, t.RegClass:$src2, timm:$cond, EFLAGS))]>;
   }
 
   // register-memory
-  let SchedRW = [WriteCMOV.Folded, WriteCMOV.ReadAfterFold] in {
+  let SchedRW = [WriteCMOV.Folded, WriteCMOV.ReadAfterFold],
+      AsmString = "ctselect\t$dst, $src1, $src2, $cond" in {
     def rm : PseudoI<(outs t.RegClass:$dst),
                      (ins t.RegClass:$src1, t.MemOperand:$src2, i8imm:$cond),
                      [(set t.RegClass:$dst, (X86ctselect t.RegClass:$src1, (t.LoadNode addr:$src2), timm:$cond, EFLAGS))]>;
@@ -137,106 +139,203 @@ let isCodeGenOnly = 1, hasSideEffects = 1, ForceDisassemble = 1 in {
   }
 }
 
-let Uses = [EFLAGS], usesCustomInserter = 1, isNotDuplicable = 1, isPseudo = 1, hasSideEffects = 1 in {
-  // 128-bit vector types
-  let Predicates = [HasSSE1] in {
-    def CTSELECT_V4F32 : PseudoI<(outs VR128:$dst), (ins VR128:$t, VR128:$f, i8imm:$cond),
-                          [(set VR128:$dst, (v4f32 (X86ctselect VR128:$t, VR128:$f, timm:$cond, EFLAGS)))]>,
-                          Sched<[]>;
-    def CTSELECT_V2F64 : PseudoI<(outs VR128:$dst), (ins VR128:$t, VR128:$f, i8imm:$cond),
-                          [(set VR128:$dst, (v2f64 (X86ctselect VR128:$t, VR128:$f, timm:$cond, EFLAGS)))]>,
-                          Sched<[]>;
-    def CTSELECT_V4I32 : PseudoI<(outs VR128:$dst), (ins VR128:$t, VR128:$f, i8imm:$cond),
-                          [(set VR128:$dst, (v4i32 (X86ctselect VR128:$t, VR128:$f, timm:$cond, EFLAGS)))]>,
-                          Sched<[]>;
-    def CTSELECT_V2I64 : PseudoI<(outs VR128:$dst), (ins VR128:$t, VR128:$f, i8imm:$cond),
-                          [(set VR128:$dst, (v2i64 (X86ctselect VR128:$t, VR128:$f, timm:$cond, EFLAGS)))]>,
-                          Sched<[]>;
-    def CTSELECT_V8I16 : PseudoI<(outs VR128:$dst), (ins VR128:$t, VR128:$f, i8imm:$cond),
-                          [(set VR128:$dst, (v8i16 (X86ctselect VR128:$t, VR128:$f, timm:$cond, EFLAGS)))]>,
-                          Sched<[]>;
-    def CTSELECT_V16I8 : PseudoI<(outs VR128:$dst), (ins VR128:$t, VR128:$f, i8imm:$cond),
-                          [(set VR128:$dst, (v16i8 (X86ctselect VR128:$t, VR128:$f, timm:$cond, EFLAGS)))]>,
-                          Sched<[]>;
-    def CTSELECT_V8F16 : PseudoI<(outs VR128:$dst), (ins VR128:$t, VR128:$f, i8imm:$cond),
-                          [(set VR128:$dst, (v8f16 (X86ctselect VR128:$t, VR128:$f, timm:$cond, EFLAGS)))]>,
-                          Sched<[]>;
-  }
+// CTSELECT_VEC base class. Besides the result, each pseudo defines two
+// scratch operands the post-RA expander needs: a vector temporary for the
+// mask ($tmpx) and a GPR temporary for the SETcc/NEG sequence ($tmpg). All
+// defs are early-clobber so the register allocator cannot assign them to
+// any of the input registers.
+class CTSELECT_VEC<RegisterClass VRc, RegisterClass GRc>
+    : PseudoI<(outs VRc:$dst, VRc:$tmpx, GRc:$tmpg),
+              (ins VRc:$t, VRc:$f, i8imm:$cond), []> {
+  let Uses = [EFLAGS];
+  let isPseudo = 1;
+  let isNotDuplicable = 1;
+  let hasSideEffects = 1;
+  let AsmString = "ctselect\t$dst, $f, $t, $cond";
+  let SchedRW = [];
+  let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg";
+}
+
+// Width-specific class aliases
+class CTSELECT_VEC128  : CTSELECT_VEC<VR128,  GR32>;
+class CTSELECT_VEC128X : CTSELECT_VEC<VR128X, GR32>;
+class CTSELECT_VEC256  : CTSELECT_VEC<VR256,  GR32>;
+class CTSELECT_VEC512  : CTSELECT_VEC<VR512,  GR32>;
+
+//===----------------------------------------------------------------------===//
+// 128-bit pseudos (SSE2 baseline; we use PXOR/PAND/MOVD/PSHUFD in the expander)
+//===----------------------------------------------------------------------===//
+let Predicates = [HasSSE2] in {
+  def CTSELECT_V4F32 : CTSELECT_VEC128;
+  def CTSELECT_V2F64 : CTSELECT_VEC128;
+  def CTSELECT_V4I32 : CTSELECT_VEC128;
+  def CTSELECT_V2I64 : CTSELECT_VEC128;
+  def CTSELECT_V8I16 : CTSELECT_VEC128;
+  def CTSELECT_V16I8 : CTSELECT_VEC128;
+  // Requires a build with the v8f16 type.
+  def CTSELECT_V8F16 : CTSELECT_VEC128;
+}
 
-  // 128-bit vector types (AVX versions)
-  let Predicates = [HasAVX] in {
-    def CTSELECT_V4F32X : PseudoI<(outs VR128X:$dst), (ins VR128X:$t, VR128X:$f, i8imm:$cond),
-                          [(set VR128X:$dst, (v4f32 (X86ctselect VR128X:$t, VR128X:$f, timm:$cond, EFLAGS)))]>,
-                          Sched<[]>;
-    def CTSELECT_V2F64X : PseudoI<(outs VR128X:$dst), (ins VR128X:$t, VR128X:$f, i8imm:$cond),
-                          [(set VR128X:$dst, (v2f64 (X86ctselect VR128X:$t, VR128X:$f, timm:$cond, EFLAGS)))]>,
-                          Sched<[]>;
-    def CTSELECT_V4I32X : PseudoI<(outs VR128X:$dst), (ins VR128X:$t, VR128X:$f, i8imm:$cond),
-                          [(set VR128X:$dst, (v4i32 (X86ctselect VR128X:$t, VR128X:$f, timm:$cond, EFLAGS)))]>,
-                          Sched<[]>;
-    def CTSELECT_V2I64X : PseudoI<(outs VR128X:$dst), (ins VR128X:$t, VR128X:$f, i8imm:$cond),
-                          [(set VR128X:$dst, (v2i64 (X86ctselect VR128X:$t, VR128X:$f, timm:$cond, EFLAGS)))]>,
-                          Sched<[]>;
-    def CTSELECT_V8I16X : PseudoI<(outs VR128X:$dst), (ins VR128X:$t, VR128X:$f, i8imm:$cond),
-                          [(set VR128X:$dst, (v8i16 (X86ctselect VR128X:$t, VR128X:$f, timm:$cond, EFLAGS)))]>,
-                          Sched<[]>;
-    def CTSELECT_V16I8X : PseudoI<(outs VR128X:$dst), (ins VR128X:$t, VR128X:$f, i8imm:$cond),
-                          [(set VR128X:$dst, (v16i8 (X86ctselect VR128X:$t, VR128X:$f, timm:$cond, EFLAGS)))]>,
-                          Sched<[]>;
-    def CTSELECT_V8F16X : PseudoI<(outs VR128X:$dst), (ins VR128X:$t, VR128X:$f, i8imm:$cond),
-                          [(set VR128X:$dst, (v8f16 (X86ctselect VR128X:$t, VR128X:$f, timm:$cond, EFLAGS)))]>,
-                          Sched<[]>;
-  }
+let Predicates = [HasAVX] in {
+  def CTSELECT_V4F32X : CTSELECT_VEC128X;
+  def CTSELECT_V2F64X : CTSELECT_VEC128X;
+  def CTSELECT_V4I32X : CTSELECT_VEC128X;
+  def CTSELECT_V2I64X : CTSELECT_VEC128X;
+  def CTSELECT_V8I16X : CTSELECT_VEC128X;
+  def CTSELECT_V16I8X : CTSELECT_VEC128X;
+  // Requires a build with the v8f16 type.
+  def CTSELECT_V8F16X : CTSELECT_VEC128X;
+}
 
-  // 256-bit vector types
-  let Predicates = [HasAVX] in {
-    def CTSELECT_V8F32 : PseudoI<(outs VR256:$dst), (ins VR256:$t, VR256:$f, i8imm:$cond),
-                          [(set VR256:$dst, (v8f32 (X86ctselect VR256:$t, VR256:$f, timm:$cond, EFLAGS)))]>,
-                          Sched<[]>;
-    def CTSELECT_V4F64 : PseudoI<(outs VR256:$dst), (ins VR256:$t, VR256:$f, i8imm:$cond),
-                          [(set VR256:$dst, (v4f64 (X86ctselect VR256:$t, VR256:$f, timm:$cond, EFLAGS)))]>,
-                          Sched<[]>;
-    def CTSELECT_V8I32 : PseudoI<(outs VR256:$dst), (ins VR256:$t, VR256:$f, i8imm:$cond),
-                          [(set VR256:$dst, (v8i32 (X86ctselect VR256:$t, VR256:$f, timm:$cond, EFLAGS)))]>,
-                          Sched<[]>;
-    def CTSELECT_V4I64 : PseudoI<(outs VR256:$dst), (ins VR256:$t, VR256:$f, i8imm:$cond),
-                          [(set VR256:$dst, (v4i64 (X86ctselect VR256:$t, VR256:$f, timm:$cond, EFLAGS)))]>,
-                          Sched<[]>;
-    def CTSELECT_V16I16 : PseudoI<(outs VR256:$dst), (ins VR256:$t, VR256:$f, i8imm:$cond),
-                          [(set VR256:$dst, (v16i16 (X86ctselect VR256:$t, VR256:$f, timm:$cond, EFLAGS)))]>,
-                          Sched<[]>;
-    def CTSELECT_V32I8 : PseudoI<(outs VR256:$dst), (ins VR256:$t, VR256:$f, i8imm:$cond),
-                          [(set VR256:$dst, (v32i8 (X86ctselect VR256:$t, VR256:$f, timm:$cond, EFLAGS)))]>,
-                          Sched<[]>;
-    def CTSELECT_V16F16 : PseudoI<(outs VR256:$dst), (ins VR256:$t, VR256:$f, i8imm:$cond),
-                          [(set VR256:$dst, (v16f16 (X86ctselect VR256:$t, VR256:$f, timm:$cond, EFLAGS)))]>,
-                          Sched<[]>;
-  }
+//===----------------------------------------------------------------------===//
+// 256-bit pseudos
+//===----------------------------------------------------------------------===//
+let Predicates = [HasAVX] in {
+  def CTSELECT_V8F32  : CTSELECT_VEC256;
+  def CTSELECT_V4F64  : CTSELECT_VEC256;
+  def CTSELECT_V8I32  : CTSELECT_VEC256;
+  def CTSELECT_V4I64  : CTSELECT_VEC256;
+  def CTSELECT_V16I16 : CTSELECT_VEC256;
+  def CTSELECT_V32I8  : CTSELECT_VEC256;
+  // Requires a build with the v16f16 type.
+  def CTSELECT_V16F16 : CTSELECT_VEC256;
+}
 
-  // 512-bit vector types
-  let Predicates = [HasAVX512] in {
-    def CTSELECT_V16F32 : PseudoI<(outs VR512:$dst), (ins VR512:$t, VR512:$f, i8imm:$cond),
-                          [(set VR512:$dst, (v16f32 (X86ctselect VR512:$t, VR512:$f, timm:$cond, EFLAGS)))]>,
-                          Sched<[WriteCMOV]>;
-    def CTSELECT_V8F64 : PseudoI<(outs VR512:$dst), (ins VR512:$t, VR512:$f, i8imm:$cond),
-                          [(set VR512:$dst, (v8f64 (X86ctselect VR512:$t, VR512:$f, timm:$cond, EFLAGS)))]>,
-                          Sched<[WriteCMOV]>;
-    def CTSELECT_V16I32 : PseudoI<(outs VR512:$dst), (ins VR512:$t, VR512:$f, i8imm:$cond),
-                          [(set VR512:$dst, (v16i32 (X86ctselect VR512:$t, VR512:$f, timm:$cond, EFLAGS)))]>,
-                          Sched<[WriteCMOV]>;
-    def CTSELECT_V8I64 : PseudoI<(outs VR512:$dst), (ins VR512:$t, VR512:$f, i8imm:$cond),
-                          [(set VR512:$dst, (v8i64 (X86ctselect VR512:$t, VR512:$f, timm:$cond, EFLAGS)))]>,
-                          Sched<[WriteCMOV]>;
-    def CTSELECT_V32I16 : PseudoI<(outs VR512:$dst), (ins VR512:$t, VR512:$f, i8imm:$cond),
-                          [(set VR512:$dst, (v32i16 (X86ctselect VR512:$t, VR512:$f, timm:$cond, EFLAGS)))]>,
-                          Sched<[WriteCMOV]>;
-    def CTSELECT_V64I8 : PseudoI<(outs VR512:$dst), (ins VR512:$t, VR512:$f, i8imm:$cond),
-                          [(set VR512:$dst, (v64i8 (X86ctselect VR512:$t, VR512:$f, timm:$cond, EFLAGS)))]>,
-                          Sched<[WriteCMOV]>;
-    def CTSELECT_V32F16 : PseudoI<(outs VR512:$dst), (ins VR512:$t, VR512:$f, i8imm:$cond),
-                          [(set VR512:$dst, (v32f16 (X86ctselect VR512:$t, VR512:$f, timm:$cond, EFLAGS)))]>,
-                          Sched<[WriteCMOV]>;
-  }
-}
+//===----------------------------------------------------------------------===//
+// 512-bit pseudos
+//===----------------------------------------------------------------------===//
+
+// Core AVX-512F types
+let Predicates = [HasAVX512] in {
+  def CTSELECT_V16F32 : CTSELECT_VEC512;
+  def CTSELECT_V8F64  : CTSELECT_VEC512;
+  def CTSELECT_V16I32 : CTSELECT_VEC512;
+  def CTSELECT_V8I64  : CTSELECT_VEC512;
+}
+
+//===----------------------------------------------------------------------===//
+// Selection patterns: X86ctselect(...), EFLAGS -> CTSELECT_V*
+//
+// NOTE:
+// * The SDNode carries Glue from CMP/TEST (due to SDNPInGlue).
+// * We list EFLAGS explicitly in the pattern (X86 style) to model the arch read.
+// * Temps ($tmpx, $tmpg) are not in the pattern; they're outs allocated by RA.
+//===----------------------------------------------------------------------===//
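The pseudos above reserve $tmpx and $tmpg precisely so the post-RA expander can rebuild the scalar mask trick lane-wise: SETcc and NEG produce a 32-bit mask in $tmpg, MOVD and PSHUFD splat it across $tmpx, and PAND/PANDN/POR blend the inputs. A rough C++ model of the 4 x i32 case (illustrative only; names are invented here, and cond is assumed already reduced to 0 or 1):

// Illustrative only: a lane-wise model of the expansion for a 4 x i32 select.
#include <cassert>
#include <cstdint>

void ctSelectV4I32(uint32_t cond, const uint32_t t[4], const uint32_t f[4],
                   uint32_t out[4]) {
  // SETcc + NEG in $tmpg: 0/1 becomes an all-zeros/all-ones 32-bit mask.
  uint32_t mask = 0u - (cond & 1u);
  // MOVD + PSHUFD: the scalar mask is splat across every lane of $tmpx.
  for (int i = 0; i < 4; ++i)
    out[i] = (t[i] & mask) | (f[i] & ~mask); // PAND / PANDN / POR
}

int main() {
  uint32_t a[4] = {1, 2, 3, 4}, b[4] = {5, 6, 7, 8}, r[4];
  ctSelectV4I32(1, a, b, r);
  assert(r[0] == 1 && r[3] == 4);
  ctSelectV4I32(0, a, b, r);
  assert(r[0] == 5 && r[3] == 8);
  return 0;
}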
+//===----------------------------------------------------------------------===// + +let Predicates = [HasSSE2] in { + + // 128-bit integer + def : Pat<(v4i32 (X86ctselect VR128:$t, VR128:$f, (i8 timm:$cc), EFLAGS)), + (CTSELECT_V4I32 VR128:$t, VR128:$f, timm:$cc)>; + def : Pat<(v2i64 (X86ctselect VR128:$t, VR128:$f, (i8 timm:$cc), EFLAGS)), + (CTSELECT_V2I64 VR128:$t, VR128:$f, timm:$cc)>; + def : Pat<(v8i16 (X86ctselect VR128:$t, VR128:$f, (i8 timm:$cc), EFLAGS)), + (CTSELECT_V8I16 VR128:$t, VR128:$f, timm:$cc)>; + def : Pat<(v16i8 (X86ctselect VR128:$t, VR128:$f, (i8 timm:$cc), EFLAGS)), + (CTSELECT_V16I8 VR128:$t, VR128:$f, timm:$cc)>; + + // 128-bit float (bitwise-equivalent ops in expander) + def : Pat<(v4f32 (X86ctselect VR128:$t, VR128:$f, (i8 timm:$cc), EFLAGS)), + (CTSELECT_V4F32 VR128:$t, VR128:$f, timm:$cc)>; + def : Pat<(v2f64 (X86ctselect VR128:$t, VR128:$f, (i8 timm:$cc), EFLAGS)), + (CTSELECT_V2F64 VR128:$t, VR128:$f, timm:$cc)>; + + // 128-bit f16 (optional) + def : Pat<(v8f16 (X86ctselect VR128:$t, VR128:$f, (i8 timm:$cc), EFLAGS)), + (CTSELECT_V8F16 VR128:$t, VR128:$f, timm:$cc)>; +} + +let Predicates = [HasAVX] in { + + // 256-bit integer + def : Pat<(v8i32 (X86ctselect VR256:$t, VR256:$f, (i8 timm:$cc), EFLAGS)), + (CTSELECT_V8I32 VR256:$t, VR256:$f, timm:$cc)>; + def : Pat<(v4i64 (X86ctselect VR256:$t, VR256:$f, (i8 timm:$cc), EFLAGS)), + (CTSELECT_V4I64 VR256:$t, VR256:$f, timm:$cc)>; + def : Pat<(v16i16 (X86ctselect VR256:$t, VR256:$f, (i8 timm:$cc), EFLAGS)), + (CTSELECT_V16I16 VR256:$t, VR256:$f, timm:$cc)>; + def : Pat<(v32i8 (X86ctselect VR256:$t, VR256:$f, (i8 timm:$cc), EFLAGS)), + (CTSELECT_V32I8 VR256:$t, VR256:$f, timm:$cc)>; + + // 256-bit float (bitwise-equivalent ops in expander) + def : Pat<(v8f32 (X86ctselect VR256:$t, VR256:$f, (i8 timm:$cc), EFLAGS)), + (CTSELECT_V8F32 VR256:$t, VR256:$f, timm:$cc)>; + def : Pat<(v4f64 (X86ctselect VR256:$t, VR256:$f, (i8 timm:$cc), EFLAGS)), + (CTSELECT_V4F64 VR256:$t, VR256:$f, timm:$cc)>; + + // 256-bit f16 (optional) + def : Pat<(v16f16 (X86ctselect VR256:$t, VR256:$f, (i8 timm:$cc), EFLAGS)), + (CTSELECT_V16F16 VR256:$t, VR256:$f, timm:$cc)>; +} + +//===----------------------------------------------------------------------===// +// 512-bit selection patterns +// Note: SDNode is X86ctselect with SDNPInGlue; pattern names EFLAGS explicitly. +// Temps ($tmpx,$tmpg) are outs allocated by RA; not mentioned in patterns. 
+//===----------------------------------------------------------------------===// + +let Predicates = [HasAVX512] in { + def : Pat<(v16f32 (X86ctselect VR512:$t, VR512:$f, (i8 timm:$cc), EFLAGS)), + (CTSELECT_V16F32 VR512:$t, VR512:$f, timm:$cc)>; + def : Pat<(v8f64 (X86ctselect VR512:$t, VR512:$f, (i8 timm:$cc), EFLAGS)), + (CTSELECT_V8F64 VR512:$t, VR512:$f, timm:$cc)>; + + def : Pat<(v16i32 (X86ctselect VR512:$t, VR512:$f, (i8 timm:$cc), EFLAGS)), + (CTSELECT_V16I32 VR512:$t, VR512:$f, timm:$cc)>; + def : Pat<(v8i64 (X86ctselect VR512:$t, VR512:$f, (i8 timm:$cc), EFLAGS)), + (CTSELECT_V8I64 VR512:$t, VR512:$f, timm:$cc)>; } let Predicates = [HasCMOV, HasCF] in { diff --git a/llvm/lib/Target/X86/X86InstrFragments.td b/llvm/lib/Target/X86/X86InstrFragments.td index 0f912f23bbb62..4c9e5bae3b46c 100644 --- a/llvm/lib/Target/X86/X86InstrFragments.td +++ b/llvm/lib/Target/X86/X86InstrFragments.td @@ -155,7 +155,7 @@ def X86ctest : SDNode<"X86ISD::CTEST", SDTX86Ccmp>; def X86cload : SDNode<"X86ISD::CLOAD", SDTX86Cload, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; def X86cstore : SDNode<"X86ISD::CSTORE", SDTX86Cstore, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; -def X86ctselect: SDNode<"X86ISD::CTSELECT", SDTX86CtSelect>; +def X86ctselect: SDNode<"X86ISD::CTSELECT", SDTX86CtSelect, [SDNPInGlue]>; def X86cmov : SDNode<"X86ISD::CMOV", SDTX86Cmov>; def X86brcond : SDNode<"X86ISD::BRCOND", SDTX86BrCond, [SDNPHasChain]>; diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp index 4ef8c04d49e6f..1463eeec6c696 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -475,16 +475,318 @@ bool X86InstrInfo::isFrameOperand(const MachineInstr &MI, unsigned int Op, return false; } -bool X86InstrInfo::expandCtSelect(unsigned Opcode, - MachineInstrBuilder &MIB) const { - MachineInstr *MI = MIB.getInstr(); - MachineBasicBlock &MBB = *MIB->getParent(); - DebugLoc DL = MIB->getDebugLoc(); +struct CtSelectInstructions { + unsigned PAndOpc; + unsigned PAndnOpc; + unsigned POrOpc; + unsigned BroadcastOpc; + unsigned IntMoveOpc; + unsigned MoveOpc; + bool Use256; + bool UseVEX; + bool UseBlendInstr; +}; + +static CtSelectInstructions +getCtSelectInstructions(unsigned Opcode, const X86Subtarget &Subtarget) { + CtSelectInstructions Instructions = {}; + + switch (Opcode) { + case X86::CTSELECT_V2F64: + if (Subtarget.hasSSE2()) { + Instructions.PAndOpc = X86::PANDrr; + Instructions.PAndnOpc = X86::PANDNrr; + Instructions.POrOpc = X86::PORrr; + Instructions.BroadcastOpc = X86::PSHUFDri; + Instructions.IntMoveOpc = X86::MOVDI2PDIrr; + Instructions.MoveOpc = X86::MOVAPDrr; + } else { + llvm_unreachable("Double precision vectors require SSE2"); + } + break; + case X86::CTSELECT_V4F32: + if (Subtarget.hasSSE41()) { + Instructions.PAndOpc = X86::PANDrr; + Instructions.PAndnOpc = X86::PANDNrr; + Instructions.POrOpc = X86::PORrr; + Instructions.BroadcastOpc = X86::PSHUFDri; + Instructions.IntMoveOpc = X86::MOVDI2PDIrr; + Instructions.MoveOpc = X86::MOVAPSrr; + Instructions.UseBlendInstr = true; + } else if (Subtarget.hasSSE2()) { + Instructions.PAndOpc = X86::PANDrr; + Instructions.PAndnOpc = X86::PANDNrr; + Instructions.POrOpc = X86::PORrr; + Instructions.BroadcastOpc = X86::PSHUFDri; + Instructions.IntMoveOpc = X86::MOVDI2PDIrr; + Instructions.MoveOpc = X86::MOVAPSrr; + } else { + Instructions.PAndOpc = X86::ANDPSrr; + Instructions.PAndnOpc = X86::ANDNPSrr; + Instructions.POrOpc = X86::ORPSrr; + Instructions.BroadcastOpc = X86::SHUFPSrri; + 
Instructions.IntMoveOpc = X86::MOVSS2DIrr; + Instructions.MoveOpc = X86::MOVAPSrr; + } + break; + case X86::CTSELECT_V4I32: + case X86::CTSELECT_V2I64: + case X86::CTSELECT_V8I16: + case X86::CTSELECT_V16I8: + if (Subtarget.hasSSE2()) { + Instructions.PAndOpc = X86::PANDrr; + Instructions.PAndnOpc = X86::PANDNrr; + Instructions.POrOpc = X86::PORrr; + Instructions.BroadcastOpc = X86::PSHUFDri; + Instructions.IntMoveOpc = X86::MOVDI2PDIrr; + Instructions.MoveOpc = X86::MOVDQArr; + } else { + llvm_unreachable("Integer vector operations require SSE2"); + } + break; + case X86::CTSELECT_V8F16: + if (Subtarget.hasSSE2()) { + Instructions.PAndOpc = X86::PANDrr; + Instructions.PAndnOpc = X86::PANDNrr; + Instructions.POrOpc = X86::PORrr; + Instructions.BroadcastOpc = X86::PSHUFDri; + Instructions.IntMoveOpc = X86::MOVDI2PDIrr; + Instructions.MoveOpc = X86::MOVDQArr; + } else { + llvm_unreachable("FP16 vector operations require SSE2"); + } + break; + case X86::CTSELECT_V4F32X: + case X86::CTSELECT_V4I32X: + case X86::CTSELECT_V2F64X: + case X86::CTSELECT_V2I64X: + case X86::CTSELECT_V8I16X: + case X86::CTSELECT_V16I8X: + case X86::CTSELECT_V8F16X: + if (Subtarget.hasAVX()) { + Instructions.PAndOpc = X86::VPANDrr; + Instructions.PAndnOpc = X86::VPANDNrr; + Instructions.POrOpc = X86::VPORrr; + Instructions.BroadcastOpc = X86::VPSHUFDri; + Instructions.IntMoveOpc = X86::VMOVDI2PDIrr; + Instructions.MoveOpc = (Opcode == X86::CTSELECT_V4F32X) ? X86::VMOVAPSrr + : (Opcode == X86::CTSELECT_V2F64X) + ? X86::VMOVAPDrr + : X86::VMOVDQArr; + Instructions.UseVEX = true; + } else { + llvm_unreachable("AVX variants require AVX support"); + } + break; + case X86::CTSELECT_V8F32: + case X86::CTSELECT_V8I32: + if (Subtarget.hasAVX()) { + Instructions.PAndOpc = X86::VPANDYrr; + Instructions.PAndnOpc = X86::VPANDNYrr; + Instructions.POrOpc = X86::VPORYrr; + Instructions.BroadcastOpc = X86::VPERMILPSYri; + Instructions.IntMoveOpc = X86::VMOVDI2PDIrr; + Instructions.MoveOpc = + (Opcode == X86::CTSELECT_V8F32) ? X86::VMOVAPSYrr : X86::VMOVDQAYrr; + Instructions.Use256 = true; + Instructions.UseVEX = true; + } else { + llvm_unreachable("256-bit vectors require AVX"); + } + break; + case X86::CTSELECT_V4F64: + case X86::CTSELECT_V4I64: + if (Subtarget.hasAVX()) { + Instructions.PAndOpc = X86::VPANDYrr; + Instructions.PAndnOpc = X86::VPANDNYrr; + Instructions.POrOpc = X86::VPORYrr; + Instructions.BroadcastOpc = X86::VPERMILPDYri; + Instructions.IntMoveOpc = X86::VMOVDI2PDIrr; + Instructions.MoveOpc = + (Opcode == X86::CTSELECT_V4F64) ? 
X86::VMOVAPDYrr : X86::VMOVDQAYrr;
+      Instructions.Use256 = true;
+      Instructions.UseVEX = true;
+    } else {
+      llvm_unreachable("256-bit vectors require AVX");
+    }
+    break;
+  case X86::CTSELECT_V16I16:
+  case X86::CTSELECT_V32I8:
+  case X86::CTSELECT_V16F16:
+    // AVX2 and plain AVX use the same opcodes here, so one check suffices.
+    if (Subtarget.hasAVX()) {
+      Instructions.PAndOpc = X86::VPANDYrr;
+      Instructions.PAndnOpc = X86::VPANDNYrr;
+      Instructions.POrOpc = X86::VPORYrr;
+      Instructions.BroadcastOpc = X86::VPERMILPSYri;
+      Instructions.IntMoveOpc = X86::VMOVDI2PDIrr;
+      Instructions.MoveOpc = X86::VMOVDQAYrr;
+      Instructions.Use256 = true;
+      Instructions.UseVEX = true;
+    } else {
+      llvm_unreachable("256-bit integer vectors require AVX");
+    }
+    break;
+  default:
+    llvm_unreachable("Unexpected CTSELECT opcode");
+  }
+
+  return Instructions;
+}
+
+bool X86InstrInfo::expandCtSelectVector(MachineInstr &MI) const {
+  unsigned Opcode = MI.getOpcode();
+  const DebugLoc &DL = MI.getDebugLoc();
+  auto Instruction = getCtSelectInstructions(Opcode, Subtarget);
+
+  MachineBasicBlock *MBB = MI.getParent();
+
+  // Operand layout matches the TableGen definition:
+  //   (outs VRc:$dst, VRc:$tmpx, GRc:$tmpg),
+  //   (ins VRc:$t, VRc:$f, i8imm:$cond)
+  // As with X86ISD::CMOV, the value operands are in (false, true) order.
+  Register Dst = MI.getOperand(0).getReg();
+  Register MaskReg = MI.getOperand(1).getReg();  // vector mask temp ($tmpx)
+  Register TmpGPR = MI.getOperand(2).getReg();   // scalar mask temp ($tmpg)
+  Register FalseVal = MI.getOperand(3).getReg(); // value when cond is false
+  Register TrueVal = MI.getOperand(4).getReg();  // value when cond is true
+  X86::CondCode CC = X86::CondCode(MI.getOperand(5).getImm()); // condition
+
+  // Every instruction below is inserted immediately before MI. Remember the
+  // instruction preceding MI (if any) so the emitted sequence can still be
+  // found, and bundled, after MI has been erased.
+  MachineBasicBlock::instr_iterator PrevInst =
+      MI.getIterator() == MBB->instr_begin() ? MBB->instr_end()
+                                             : std::prev(MI.getIterator());
+
+  // Create the scalar mask in TmpGPR. Clear it with MOV rather than XOR:
+  // XOR would clobber EFLAGS, which SETcc still needs to read.
+  BuildMI(*MBB, MI, DL, get(X86::MOV32ri), TmpGPR)
+      .addImm(0)
+      .setMIFlags(MachineInstr::MIFlag::NoMerge);
+
+  const TargetRegisterInfo *TRI = &getRegisterInfo();
+  auto SubReg = TRI->getSubReg(TmpGPR, X86::sub_8bit);
+  BuildMI(*MBB, MI, DL, get(X86::SETCCr), SubReg)
+      .addImm(CC)
+      .setMIFlags(MachineInstr::MIFlag::NoMerge);
+
+  // Zero-extend byte to 32-bit register (movzbl %al, %eax)
+  BuildMI(*MBB, MI, DL, get(X86::MOVZX32rr8), TmpGPR)
+      .addReg(SubReg)
+      .setMIFlags(MachineInstr::MIFlag::NoMerge);
+
+  // Negate to convert 1 -> 0xFFFFFFFF, 0 -> 0x00000000 (negl %eax)
+  BuildMI(*MBB, MI, DL, get(X86::NEG32r), TmpGPR)
+      .addReg(TmpGPR)
+      .setMIFlags(MachineInstr::MIFlag::NoMerge);
+
+  // Zero the vector mask register before inserting the scalar mask
+  BuildMI(*MBB, MI, DL, get(X86::PXORrr), MaskReg)
+      .addReg(MaskReg)
+      .addReg(MaskReg)
+      .setMIFlags(MachineInstr::MIFlag::NoMerge);
+
+  // Move scalar mask to vector register
+  BuildMI(*MBB, MI, DL, get(Instruction.IntMoveOpc), MaskReg)
+      .addReg(TmpGPR)
+      .setMIFlags(MachineInstr::MIFlag::NoMerge);
+
+  if (Instruction.Use256) {
+    // Broadcast to 256-bit vector register
+    BuildMI(*MBB, MI, DL, get(Instruction.BroadcastOpc), MaskReg)
+        .addReg(MaskReg)
+        .addImm(0)
+        .setMIFlags(MachineInstr::MIFlag::NoMerge);
+  } else {
+    if (Subtarget.hasSSE2() || Instruction.UseVEX) {
+      // PSHUFD-style broadcast: one source register plus shuffle immediate
+      BuildMI(*MBB, MI, DL, get(Instruction.BroadcastOpc), MaskReg)
+          .addReg(MaskReg)
+          .addImm(0x00)
+          .setMIFlags(MachineInstr::MIFlag::NoMerge);
+    } else {
+      // SHUFPS-style broadcast: two source registers plus shuffle immediate
+      BuildMI(*MBB, MI, DL, get(Instruction.BroadcastOpc), MaskReg)
+          .addReg(MaskReg)
+          .addReg(MaskReg)
+          .addImm(0x00)
+          .setMIFlags(MachineInstr::MIFlag::NoMerge);
+    }
+  }
+
+  if (Instruction.UseBlendInstr && Subtarget.hasSSE41() &&
+      !Instruction.Use256) {
+    // Use dedicated blend instructions for SSE4.1+
+    unsigned BlendOpc;
+    switch (Opcode) {
+    case X86::CTSELECT_V4F32:
+      BlendOpc = X86::BLENDVPSrr0;
+      break;
+    case X86::CTSELECT_V2F64:
+      BlendOpc = X86::BLENDVPDrr0;
+      break;
+    default:
+      BlendOpc = X86::PBLENDVBrr0;
+      break;
+    }
+
+    // BLENDV reads the implicit mask from XMM0 and is destructive on its
+    // first source, so stage the false value in Dst first.
+    // https://www.felixcloutier.com/x86/pblendvb
+    // FIXME: this assumes XMM0 is not one of the input registers.
+    BuildMI(*MBB, MI, DL, get(X86::MOVAPSrr), Dst)
+        .addReg(FalseVal)
+        .setMIFlags(MachineInstr::MIFlag::NoMerge);
+
+    BuildMI(*MBB, MI, DL, get(X86::MOVAPSrr), X86::XMM0)
+        .addReg(MaskReg)
+        .setMIFlags(MachineInstr::MIFlag::NoMerge);
+
+    BuildMI(*MBB, MI, DL, get(BlendOpc), Dst)
+        .addReg(Dst)
+        .addReg(TrueVal)
+        .setMIFlags(MachineInstr::MIFlag::NoMerge);
+
+  } else {
+
+    // dst = mask
+    BuildMI(*MBB, MI, DL, get(Instruction.MoveOpc), Dst)
+        .addReg(MaskReg)
+        .setMIFlags(MachineInstr::MIFlag::NoMerge);
+
+    // mask &= true_val
+    BuildMI(*MBB, MI, DL, get(Instruction.PAndOpc), MaskReg)
+        .addReg(MaskReg)
+        .addReg(TrueVal)
+        .setMIFlags(MachineInstr::MIFlag::NoMerge);
+
+    // dst = ~mask & false_val
+    BuildMI(*MBB, MI, DL, get(Instruction.PAndnOpc), Dst)
+        .addReg(Dst)
+        .addReg(FalseVal)
+        .setMIFlags(MachineInstr::MIFlag::NoMerge);
+
+    // dst |= mask; (mask & t) | (~mask & f)
+    BuildMI(*MBB, MI, DL, get(Instruction.POrOpc), Dst)
+        .addReg(Dst)
+        .addReg(MaskReg)
+        .setMIFlags(MachineInstr::MIFlag::NoMerge);
+  }
+
+  // Compute the bundle bounds before erasing MI; its iterator must not be
+  // used afterwards. finalizeBundle() inserts the BUNDLE header itself.
+  MachineBasicBlock::instr_iterator BundleEnd = std::next(MI.getIterator());
+  MI.eraseFromParent();
+  MachineBasicBlock::instr_iterator BundleStart =
+      PrevInst == MBB->instr_end() ? MBB->instr_begin() : std::next(PrevInst);
+  if (BundleStart != BundleEnd)
+    finalizeBundle(*MBB, BundleStart, BundleEnd);
+
+  return true;
+}
+
+bool X86InstrInfo::expandCtSelectWithCMOV(MachineInstr &MI) const {
+  MachineBasicBlock *MBB = MI.getParent();
+  DebugLoc DL = MI.getDebugLoc();
 
   // CTSELECT pseudo has: (outs dst), (ins true_val, false_val, cond)
-  MachineOperand &OperandRes = MI->getOperand(0); // destination register
-  MachineOperand &OperandTrue = MI->getOperand(1); // true value
-  MachineOperand &OperandCond = MI->getOperand(3); // condition code
+  MachineOperand &OperandRes = MI.getOperand(0);  // destination register
+  MachineOperand &OperandTrue = MI.getOperand(1); // true value
+  MachineOperand &OperandCond = MI.getOperand(3); // condition code
 
   assert(OperandTrue.isReg() && OperandRes.isReg() && OperandCond.isImm() &&
          "Invalid operand types");
@@ -493,20 +795,44 @@ bool X86InstrInfo::expandCtSelect(unsigned Opcode,
 
   assert(Subtarget.hasCMOV() && "target does not support CMOV instructions");
 
-  if (Subtarget.hasCMOV()) {
-    // Build CMOV instruction: copy the first 3 operands (dst, true, false) and
-    // add condition code
-    MachineInstrBuilder CmovBuilder =
-        BuildMI(MBB, MIB.getInstr(), DL, get(Opcode));
-    for (unsigned i = 0; i < MI->getNumOperands(); ++i) { // Copy
-      CmovBuilder.add(MIB->getOperand(i));
-    }
-  } else {
+  unsigned Opcode = 0;
+
+  switch (MI.getOpcode()) {
+  case X86::CTSELECT16rr:
+    Opcode = X86::CMOV16rr;
+    break;
+  case X86::CTSELECT32rr:
+    Opcode = X86::CMOV32rr;
+    break;
+  case X86::CTSELECT64rr:
+    Opcode = X86::CMOV64rr;
+    break;
+  case X86::CTSELECT16rm:
+    Opcode = X86::CMOV16rm;
+    break;
+  case X86::CTSELECT32rm:
+    Opcode = X86::CMOV32rm;
+    break;
+  case X86::CTSELECT64rm:
+    Opcode = X86::CMOV64rm;
+    break;
+  default:
+    llvm_unreachable("Invalid CTSELECT opcode");
+  }
+
+  if (!Subtarget.hasCMOV()) {
    llvm_unreachable("target does not support cmov");
   }
 
+  // Rebuild the pseudo as the corresponding CMOV: copy all operands
+  // (dst, src1, src2[, memory operands], cond) in their original order.
+  MachineInstrBuilder CmovBuilder = BuildMI(*MBB, MI, DL, get(Opcode));
+  for (unsigned i = 0u; i < MI.getNumOperands(); ++i) { // Copy
+    CmovBuilder.add(MI.getOperand(i));
+  }
+
   // Remove the original CTSELECT instruction
-  MI->eraseFromParent();
+  MI.eraseFromParent();
 
   return true;
 }
@@ -6447,23 +6773,39 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
     MIB->setDesc(get(X86::OR64ri32));
     break;
   case X86::CTSELECT64rr:
-    expandCtSelect(X86::CMOV64rr, MIB);
-    break;
   case X86::CTSELECT32rr:
-    expandCtSelect(X86::CMOV32rr, MIB);
-    break;
   case X86::CTSELECT16rr:
-    expandCtSelect(X86::CMOV16rr, MIB);
-    break;
   case X86::CTSELECT64rm:
-    expandCtSelect(X86::CMOV64rm, MIB);
-    break;
   case X86::CTSELECT32rm:
-    expandCtSelect(X86::CMOV32rm, MIB);
-    break;
   case X86::CTSELECT16rm:
-    expandCtSelect(X86::CMOV16rm, MIB);
-    break;
+    return expandCtSelectWithCMOV(MI);
+
+  case X86::CTSELECT_V2F64:
+  case X86::CTSELECT_V4F32:
+  case X86::CTSELECT_V8F16:
+  case X86::CTSELECT_V2I64:
+  case X86::CTSELECT_V4I32:
+  case X86::CTSELECT_V8I16:
+  case X86::CTSELECT_V16I8:
+  case X86::CTSELECT_V2F64X:
+  case X86::CTSELECT_V4F32X:
+  case X86::CTSELECT_V8F16X:
+  case X86::CTSELECT_V2I64X:
+  case X86::CTSELECT_V4I32X:
+  case X86::CTSELECT_V8I16X:
+  case X86::CTSELECT_V16I8X:
+  case X86::CTSELECT_V4I64:
+  case X86::CTSELECT_V8I32:
+  case X86::CTSELECT_V16I16:
+  case X86::CTSELECT_V32I8:
+  case X86::CTSELECT_V4F64:
+  case X86::CTSELECT_V8F32:
+  case X86::CTSELECT_V16F16:
+  case X86::CTSELECT_V8I64:
+  case X86::CTSELECT_V16I32:
+  case X86::CTSELECT_V8F64:
+  case X86::CTSELECT_V16F32:
+    return expandCtSelectVector(MI);
   }
   return false;
 }
diff --git a/llvm/lib/Target/X86/X86InstrInfo.h b/llvm/lib/Target/X86/X86InstrInfo.h
index e5c105ceb615b..a8d61cfd579ad 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.h
+++ b/llvm/lib/Target/X86/X86InstrInfo.h
@@ -725,7 +725,9 @@ class X86InstrInfo final : public X86GenInstrInfo {
                                   int &FrameIndex) const;
 
   /// Expand the CTSELECT pseudo-instructions.
-  bool expandCtSelect(unsigned Opcode, MachineInstrBuilder &MIB) const;
+  bool expandCtSelectWithCMOV(MachineInstr &MI) const;
+
+  bool expandCtSelectVector(MachineInstr &MI) const;
 
   /// Returns true iff the routine could find two commutable operands in the
   /// given machine instruction with 3 vector inputs.
diff --git a/llvm/test/CodeGen/X86/ctselect-optimization.ll b/llvm/test/CodeGen/X86/ctselect-optimization.ll index fff76e34ec7e6..4c94107665601 100644 --- a/llvm/test/CodeGen/X86/ctselect-optimization.ll +++ b/llvm/test/CodeGen/X86/ctselect-optimization.ll @@ -248,9 +248,14 @@ define i64 @test_ctselect_i64_smin_zero(i64 %x) { define float @test_ctselect_f32_zero_positive(float %x) { ; CHECK-LABEL: test_ctselect_f32_zero_positive: ; CHECK: # %bb.0: +; CHECK-NEXT: movd %xmm0, %eax ; CHECK-NEXT: xorps %xmm1, %xmm1 -; CHECK-NEXT: cmpltss %xmm0, %xmm1 -; CHECK-NEXT: andps %xmm1, %xmm0 +; CHECK-NEXT: ucomiss %xmm1, %xmm0 +; CHECK-NEXT: seta %cl +; CHECK-NEXT: xorl %edx, %edx +; CHECK-NEXT: testb %cl, %cl +; CHECK-NEXT: cmovnel %eax, %edx +; CHECK-NEXT: movd %edx, %xmm0 ; CHECK-NEXT: retq %cmp = fcmp ogt float %x, 0.0 %result = call float @llvm.ct.select.f32(i1 %cmp, float %x, float 0.0) @@ -260,9 +265,14 @@ define float @test_ctselect_f32_zero_positive(float %x) { define double @test_ctselect_f64_zero_positive(double %x) { ; CHECK-LABEL: test_ctselect_f64_zero_positive: ; CHECK: # %bb.0: +; CHECK-NEXT: movq %xmm0, %rax ; CHECK-NEXT: xorpd %xmm1, %xmm1 -; CHECK-NEXT: cmpltsd %xmm0, %xmm1 -; CHECK-NEXT: andpd %xmm1, %xmm0 +; CHECK-NEXT: ucomisd %xmm1, %xmm0 +; CHECK-NEXT: seta %cl +; CHECK-NEXT: xorl %edx, %edx +; CHECK-NEXT: testb %cl, %cl +; CHECK-NEXT: cmovneq %rax, %rdx +; CHECK-NEXT: movq %rdx, %xmm0 ; CHECK-NEXT: retq %cmp = fcmp ogt double %x, 0.0 %result = call double @llvm.ct.select.f64(i1 %cmp, double %x, double 0.0) diff --git a/llvm/test/CodeGen/X86/ctselect-vector.ll b/llvm/test/CodeGen/X86/ctselect-vector.ll index 29877e374efa2..46ea1a0f83991 100644 --- a/llvm/test/CodeGen/X86/ctselect-vector.ll +++ b/llvm/test/CodeGen/X86/ctselect-vector.ll @@ -9,47 +9,53 @@ define <4 x i32> @test_ctselect_v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b) { ; SSE2-LABEL: test_ctselect_v4i32: ; SSE2: # %bb.0: -; SSE2-NEXT: xorl %eax, %eax ; SSE2-NEXT: testb $1, %dil -; SSE2-NEXT: sete %al +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: setne %al +; SSE2-NEXT: movzbl %al, %eax ; SSE2-NEXT: negl %eax -; SSE2-NEXT: movd %eax, %xmm2 -; SSE2-NEXT: pshufd $0, %xmm2, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: pandn %xmm0, %xmm2 -; SSE2-NEXT: por %xmm1, %xmm2 +; SSE2-NEXT: xorps %xmm3, %xmm3 +; SSE2-NEXT: movd %eax, %xmm3 +; SSE2-NEXT: pshufd $0, %xmm3, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: pand %xmm0, %xmm3 +; SSE2-NEXT: pandn %xmm1, %xmm2 +; SSE2-NEXT: por %xmm3, %xmm2 ; SSE2-NEXT: movdqa %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; AVX-LABEL: test_ctselect_v4i32: ; AVX: # %bb.0: -; AVX-NEXT: xorl %eax, %eax ; AVX-NEXT: testb $1, %dil -; AVX-NEXT: sete %al +; AVX-NEXT: movl $0, %eax +; AVX-NEXT: setne %al +; AVX-NEXT: movzbl %al, %eax ; AVX-NEXT: negl %eax -; AVX-NEXT: movd %eax, %xmm2 -; AVX-NEXT: pshufd $0, %xmm2, %xmm2 -; AVX-NEXT: movdqa %xmm2, %xmm2 -; AVX-NEXT: pand %xmm2, %xmm1 -; AVX-NEXT: pandn %xmm0, %xmm2 -; AVX-NEXT: por %xmm1, %xmm2 -; AVX-NEXT: movdqa %xmm2, %xmm0 +; AVX-NEXT: xorps %xmm3, %xmm3 +; AVX-NEXT: movd %eax, %xmm3 +; AVX-NEXT: pshufd $0, %xmm3, %xmm3 +; AVX-NEXT: movdqa %xmm3, %xmm2 +; AVX-NEXT: pand %xmm0, %xmm3 +; AVX-NEXT: pandn %xmm1, %xmm2 +; AVX-NEXT: por %xmm3, %xmm2 +; AVX-NEXT: vmovdqa %xmm2, %xmm0 ; AVX-NEXT: retq ; ; AVX2-LABEL: test_ctselect_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: xorl %eax, %eax ; AVX2-NEXT: testb $1, %dil -; AVX2-NEXT: sete %al +; AVX2-NEXT: movl $0, %eax +; AVX2-NEXT: setne %al +; AVX2-NEXT: movzbl %al, %eax ; 
AVX2-NEXT: negl %eax -; AVX2-NEXT: movd %eax, %xmm2 -; AVX2-NEXT: pshufd $0, %xmm2, %xmm2 -; AVX2-NEXT: movdqa %xmm2, %xmm2 -; AVX2-NEXT: pand %xmm2, %xmm1 -; AVX2-NEXT: pandn %xmm0, %xmm2 -; AVX2-NEXT: por %xmm1, %xmm2 -; AVX2-NEXT: movdqa %xmm2, %xmm0 +; AVX2-NEXT: xorps %xmm3, %xmm3 +; AVX2-NEXT: movd %eax, %xmm3 +; AVX2-NEXT: pshufd $0, %xmm3, %xmm3 +; AVX2-NEXT: movdqa %xmm3, %xmm2 +; AVX2-NEXT: pand %xmm0, %xmm3 +; AVX2-NEXT: pandn %xmm1, %xmm2 +; AVX2-NEXT: por %xmm3, %xmm2 +; AVX2-NEXT: vmovdqa %xmm2, %xmm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: test_ctselect_v4i32: @@ -68,47 +74,53 @@ define <4 x i32> @test_ctselect_v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b) { define <4 x float> @test_ctselect_v4f32(i1 %cond, <4 x float> %a, <4 x float> %b) { ; SSE2-LABEL: test_ctselect_v4f32: ; SSE2: # %bb.0: -; SSE2-NEXT: xorl %eax, %eax ; SSE2-NEXT: testb $1, %dil -; SSE2-NEXT: sete %al +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: setne %al +; SSE2-NEXT: movzbl %al, %eax ; SSE2-NEXT: negl %eax -; SSE2-NEXT: movd %eax, %xmm2 -; SSE2-NEXT: pshufd $0, %xmm2, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: pandn %xmm0, %xmm2 -; SSE2-NEXT: por %xmm1, %xmm2 +; SSE2-NEXT: xorps %xmm3, %xmm3 +; SSE2-NEXT: movd %eax, %xmm3 +; SSE2-NEXT: pshufd $0, %xmm3, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: pand %xmm0, %xmm3 +; SSE2-NEXT: pandn %xmm1, %xmm2 +; SSE2-NEXT: por %xmm3, %xmm2 ; SSE2-NEXT: movdqa %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; AVX-LABEL: test_ctselect_v4f32: ; AVX: # %bb.0: -; AVX-NEXT: xorl %eax, %eax ; AVX-NEXT: testb $1, %dil -; AVX-NEXT: sete %al +; AVX-NEXT: movl $0, %eax +; AVX-NEXT: setne %al +; AVX-NEXT: movzbl %al, %eax ; AVX-NEXT: negl %eax -; AVX-NEXT: movd %eax, %xmm2 -; AVX-NEXT: pshufd $0, %xmm2, %xmm2 -; AVX-NEXT: movdqa %xmm2, %xmm2 -; AVX-NEXT: pand %xmm2, %xmm1 -; AVX-NEXT: pandn %xmm0, %xmm2 -; AVX-NEXT: por %xmm1, %xmm2 -; AVX-NEXT: movdqa %xmm2, %xmm0 +; AVX-NEXT: xorps %xmm3, %xmm3 +; AVX-NEXT: movd %eax, %xmm3 +; AVX-NEXT: pshufd $0, %xmm3, %xmm3 +; AVX-NEXT: movdqa %xmm3, %xmm2 +; AVX-NEXT: pand %xmm0, %xmm3 +; AVX-NEXT: pandn %xmm1, %xmm2 +; AVX-NEXT: por %xmm3, %xmm2 +; AVX-NEXT: vmovdqa %xmm2, %xmm0 ; AVX-NEXT: retq ; ; AVX2-LABEL: test_ctselect_v4f32: ; AVX2: # %bb.0: -; AVX2-NEXT: xorl %eax, %eax ; AVX2-NEXT: testb $1, %dil -; AVX2-NEXT: sete %al +; AVX2-NEXT: movl $0, %eax +; AVX2-NEXT: setne %al +; AVX2-NEXT: movzbl %al, %eax ; AVX2-NEXT: negl %eax -; AVX2-NEXT: movd %eax, %xmm2 -; AVX2-NEXT: pshufd $0, %xmm2, %xmm2 -; AVX2-NEXT: movdqa %xmm2, %xmm2 -; AVX2-NEXT: pand %xmm2, %xmm1 -; AVX2-NEXT: pandn %xmm0, %xmm2 -; AVX2-NEXT: por %xmm1, %xmm2 -; AVX2-NEXT: movdqa %xmm2, %xmm0 +; AVX2-NEXT: xorps %xmm3, %xmm3 +; AVX2-NEXT: movd %eax, %xmm3 +; AVX2-NEXT: pshufd $0, %xmm3, %xmm3 +; AVX2-NEXT: movdqa %xmm3, %xmm2 +; AVX2-NEXT: pand %xmm0, %xmm3 +; AVX2-NEXT: pandn %xmm1, %xmm2 +; AVX2-NEXT: por %xmm3, %xmm2 +; AVX2-NEXT: vmovdqa %xmm2, %xmm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: test_ctselect_v4f32: @@ -127,47 +139,53 @@ define <4 x float> @test_ctselect_v4f32(i1 %cond, <4 x float> %a, <4 x float> %b define <2 x i64> @test_ctselect_v2i64(i1 %cond, <2 x i64> %a, <2 x i64> %b) { ; SSE2-LABEL: test_ctselect_v2i64: ; SSE2: # %bb.0: -; SSE2-NEXT: xorl %eax, %eax ; SSE2-NEXT: testb $1, %dil -; SSE2-NEXT: sete %al +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: setne %al +; SSE2-NEXT: movzbl %al, %eax ; SSE2-NEXT: negl %eax -; SSE2-NEXT: movd %eax, %xmm2 -; SSE2-NEXT: pshufd $0, %xmm2, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm2 -; SSE2-NEXT: pand 
%xmm2, %xmm1 -; SSE2-NEXT: pandn %xmm0, %xmm2 -; SSE2-NEXT: por %xmm1, %xmm2 +; SSE2-NEXT: xorps %xmm3, %xmm3 +; SSE2-NEXT: movd %eax, %xmm3 +; SSE2-NEXT: pshufd $0, %xmm3, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: pand %xmm0, %xmm3 +; SSE2-NEXT: pandn %xmm1, %xmm2 +; SSE2-NEXT: por %xmm3, %xmm2 ; SSE2-NEXT: movdqa %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; AVX-LABEL: test_ctselect_v2i64: ; AVX: # %bb.0: -; AVX-NEXT: xorl %eax, %eax ; AVX-NEXT: testb $1, %dil -; AVX-NEXT: sete %al +; AVX-NEXT: movl $0, %eax +; AVX-NEXT: setne %al +; AVX-NEXT: movzbl %al, %eax ; AVX-NEXT: negl %eax -; AVX-NEXT: movd %eax, %xmm2 -; AVX-NEXT: pshufd $0, %xmm2, %xmm2 -; AVX-NEXT: movdqa %xmm2, %xmm2 -; AVX-NEXT: pand %xmm2, %xmm1 -; AVX-NEXT: pandn %xmm0, %xmm2 -; AVX-NEXT: por %xmm1, %xmm2 -; AVX-NEXT: movdqa %xmm2, %xmm0 +; AVX-NEXT: xorps %xmm3, %xmm3 +; AVX-NEXT: movd %eax, %xmm3 +; AVX-NEXT: pshufd $0, %xmm3, %xmm3 +; AVX-NEXT: movdqa %xmm3, %xmm2 +; AVX-NEXT: pand %xmm0, %xmm3 +; AVX-NEXT: pandn %xmm1, %xmm2 +; AVX-NEXT: por %xmm3, %xmm2 +; AVX-NEXT: vmovdqa %xmm2, %xmm0 ; AVX-NEXT: retq ; ; AVX2-LABEL: test_ctselect_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: xorl %eax, %eax ; AVX2-NEXT: testb $1, %dil -; AVX2-NEXT: sete %al +; AVX2-NEXT: movl $0, %eax +; AVX2-NEXT: setne %al +; AVX2-NEXT: movzbl %al, %eax ; AVX2-NEXT: negl %eax -; AVX2-NEXT: movd %eax, %xmm2 -; AVX2-NEXT: pshufd $0, %xmm2, %xmm2 -; AVX2-NEXT: movdqa %xmm2, %xmm2 -; AVX2-NEXT: pand %xmm2, %xmm1 -; AVX2-NEXT: pandn %xmm0, %xmm2 -; AVX2-NEXT: por %xmm1, %xmm2 -; AVX2-NEXT: movdqa %xmm2, %xmm0 +; AVX2-NEXT: xorps %xmm3, %xmm3 +; AVX2-NEXT: movd %eax, %xmm3 +; AVX2-NEXT: pshufd $0, %xmm3, %xmm3 +; AVX2-NEXT: movdqa %xmm3, %xmm2 +; AVX2-NEXT: pand %xmm0, %xmm3 +; AVX2-NEXT: pandn %xmm1, %xmm2 +; AVX2-NEXT: por %xmm3, %xmm2 +; AVX2-NEXT: vmovdqa %xmm2, %xmm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: test_ctselect_v2i64: @@ -186,47 +204,53 @@ define <2 x i64> @test_ctselect_v2i64(i1 %cond, <2 x i64> %a, <2 x i64> %b) { define <2 x double> @test_ctselect_v2f64(i1 %cond, <2 x double> %a, <2 x double> %b) { ; SSE2-LABEL: test_ctselect_v2f64: ; SSE2: # %bb.0: -; SSE2-NEXT: xorl %eax, %eax ; SSE2-NEXT: testb $1, %dil -; SSE2-NEXT: sete %al +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: setne %al +; SSE2-NEXT: movzbl %al, %eax ; SSE2-NEXT: negl %eax -; SSE2-NEXT: movd %eax, %xmm2 -; SSE2-NEXT: pshufd $0, %xmm2, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: pandn %xmm0, %xmm2 -; SSE2-NEXT: por %xmm1, %xmm2 +; SSE2-NEXT: xorps %xmm3, %xmm3 +; SSE2-NEXT: movd %eax, %xmm3 +; SSE2-NEXT: pshufd $0, %xmm3, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: pand %xmm0, %xmm3 +; SSE2-NEXT: pandn %xmm1, %xmm2 +; SSE2-NEXT: por %xmm3, %xmm2 ; SSE2-NEXT: movdqa %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; AVX-LABEL: test_ctselect_v2f64: ; AVX: # %bb.0: -; AVX-NEXT: xorl %eax, %eax ; AVX-NEXT: testb $1, %dil -; AVX-NEXT: sete %al +; AVX-NEXT: movl $0, %eax +; AVX-NEXT: setne %al +; AVX-NEXT: movzbl %al, %eax ; AVX-NEXT: negl %eax -; AVX-NEXT: movd %eax, %xmm2 -; AVX-NEXT: pshufd $0, %xmm2, %xmm2 -; AVX-NEXT: movdqa %xmm2, %xmm2 -; AVX-NEXT: pand %xmm2, %xmm1 -; AVX-NEXT: pandn %xmm0, %xmm2 -; AVX-NEXT: por %xmm1, %xmm2 -; AVX-NEXT: movdqa %xmm2, %xmm0 +; AVX-NEXT: xorps %xmm3, %xmm3 +; AVX-NEXT: movd %eax, %xmm3 +; AVX-NEXT: pshufd $0, %xmm3, %xmm3 +; AVX-NEXT: movdqa %xmm3, %xmm2 +; AVX-NEXT: pand %xmm0, %xmm3 +; AVX-NEXT: pandn %xmm1, %xmm2 +; AVX-NEXT: por %xmm3, %xmm2 +; AVX-NEXT: vmovdqa %xmm2, %xmm0 ; AVX-NEXT: retq ; ; AVX2-LABEL: 
test_ctselect_v2f64: ; AVX2: # %bb.0: -; AVX2-NEXT: xorl %eax, %eax ; AVX2-NEXT: testb $1, %dil -; AVX2-NEXT: sete %al +; AVX2-NEXT: movl $0, %eax +; AVX2-NEXT: setne %al +; AVX2-NEXT: movzbl %al, %eax ; AVX2-NEXT: negl %eax -; AVX2-NEXT: movd %eax, %xmm2 -; AVX2-NEXT: pshufd $0, %xmm2, %xmm2 -; AVX2-NEXT: movdqa %xmm2, %xmm2 -; AVX2-NEXT: pand %xmm2, %xmm1 -; AVX2-NEXT: pandn %xmm0, %xmm2 -; AVX2-NEXT: por %xmm1, %xmm2 -; AVX2-NEXT: movdqa %xmm2, %xmm0 +; AVX2-NEXT: xorps %xmm3, %xmm3 +; AVX2-NEXT: movd %eax, %xmm3 +; AVX2-NEXT: pshufd $0, %xmm3, %xmm3 +; AVX2-NEXT: movdqa %xmm3, %xmm2 +; AVX2-NEXT: pand %xmm0, %xmm3 +; AVX2-NEXT: pandn %xmm1, %xmm2 +; AVX2-NEXT: por %xmm3, %xmm2 +; AVX2-NEXT: vmovdqa %xmm2, %xmm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: test_ctselect_v2f64: @@ -246,57 +270,65 @@ define <2 x double> @test_ctselect_v2f64(i1 %cond, <2 x double> %a, <2 x double> define <8 x i32> @test_ctselect_v8i32(i1 %cond, <8 x i32> %a, <8 x i32> %b) { ; SSE2-LABEL: test_ctselect_v8i32: ; SSE2: # %bb.0: -; SSE2-NEXT: xorl %eax, %eax ; SSE2-NEXT: testb $1, %dil -; SSE2-NEXT: sete %al -; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: setne %al +; SSE2-NEXT: movzbl %al, %eax ; SSE2-NEXT: negl %eax -; SSE2-NEXT: movd %eax, %xmm4 -; SSE2-NEXT: pshufd $0, %xmm4, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm2 -; SSE2-NEXT: pandn %xmm0, %xmm4 -; SSE2-NEXT: por %xmm2, %xmm4 +; SSE2-NEXT: xorps %xmm5, %xmm5 +; SSE2-NEXT: movd %eax, %xmm5 +; SSE2-NEXT: pshufd $0, %xmm5, %xmm5 +; SSE2-NEXT: movdqa %xmm5, %xmm4 +; SSE2-NEXT: pand %xmm0, %xmm5 +; SSE2-NEXT: pandn %xmm2, %xmm4 +; SSE2-NEXT: por %xmm5, %xmm4 +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: setne %al +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: negl %eax +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: pshufd $0, %xmm0, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: pandn %xmm3, %xmm2 +; SSE2-NEXT: por %xmm0, %xmm2 ; SSE2-NEXT: movdqa %xmm4, %xmm0 -; SSE2-NEXT: sete %cl -; SSE2-NEXT: negl %ecx -; SSE2-NEXT: movd %ecx, %xmm2 -; SSE2-NEXT: pshufd $0, %xmm2, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm3 -; SSE2-NEXT: pandn %xmm1, %xmm2 -; SSE2-NEXT: por %xmm3, %xmm2 ; SSE2-NEXT: movdqa %xmm2, %xmm1 ; SSE2-NEXT: retq ; ; AVX-LABEL: test_ctselect_v8i32: ; AVX: # %bb.0: -; AVX-NEXT: xorl %eax, %eax ; AVX-NEXT: testb $1, %dil -; AVX-NEXT: sete %al +; AVX-NEXT: movl $0, %eax +; AVX-NEXT: setne %al +; AVX-NEXT: movzbl %al, %eax ; AVX-NEXT: negl %eax -; AVX-NEXT: vmovd %eax, %ymm2 -; AVX-NEXT: vshufps $0, %ymm2, %ymm2, %ymm2 -; AVX-NEXT: vmovaps %ymm2, %ymm2 -; AVX-NEXT: vandps %ymm1, %ymm2, %ymm1 -; AVX-NEXT: vandnps %ymm0, %ymm2, %ymm0 -; AVX-NEXT: vorps %ymm0, %ymm1, %ymm0 -; AVX-NEXT: vmovaps %ymm0, %ymm0 +; AVX-NEXT: xorps %ymm3, %ymm3 +; AVX-NEXT: vmovd %eax, %ymm3 +; AVX-NEXT: vshufps $0, %ymm3, %ymm3, %ymm3 +; AVX-NEXT: vmovaps %ymm3, %ymm2 +; AVX-NEXT: andps %ymm0, %ymm3 +; AVX-NEXT: andnps %ymm1, %ymm2 +; AVX-NEXT: orps %ymm3, %ymm2 +; AVX-NEXT: vmovaps %ymm2, %ymm0 ; AVX-NEXT: retq ; ; AVX2-LABEL: test_ctselect_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: xorl %eax, %eax ; AVX2-NEXT: testb $1, %dil -; AVX2-NEXT: sete %al +; AVX2-NEXT: movl $0, %eax +; AVX2-NEXT: setne %al +; AVX2-NEXT: movzbl %al, %eax ; AVX2-NEXT: negl %eax -; AVX2-NEXT: vmovd %eax, %ymm2 -; AVX2-NEXT: vpshufd $0, %ymm2, %ymm2 -; AVX2-NEXT: vmovdqa %ymm2, %ymm2 -; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm1 -; AVX2-NEXT: vpandn %ymm0, %ymm2, %ymm0 -; 
AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, %ymm0 +; AVX2-NEXT: xorps %ymm3, %ymm3 +; AVX2-NEXT: vmovd %eax, %ymm3 +; AVX2-NEXT: vpshufd $0, %ymm3, %ymm3 +; AVX2-NEXT: vmovdqa %ymm3, %ymm2 +; AVX2-NEXT: pand %ymm0, %ymm3 +; AVX2-NEXT: pandn %ymm1, %ymm2 +; AVX2-NEXT: por %ymm3, %ymm2 +; AVX2-NEXT: vmovdqa %ymm2, %ymm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: test_ctselect_v8i32: @@ -315,57 +347,65 @@ define <8 x i32> @test_ctselect_v8i32(i1 %cond, <8 x i32> %a, <8 x i32> %b) { define <8 x float> @test_ctselect_v8f32(i1 %cond, <8 x float> %a, <8 x float> %b) { ; SSE2-LABEL: test_ctselect_v8f32: ; SSE2: # %bb.0: -; SSE2-NEXT: xorl %eax, %eax ; SSE2-NEXT: testb $1, %dil -; SSE2-NEXT: sete %al -; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: setne %al +; SSE2-NEXT: movzbl %al, %eax ; SSE2-NEXT: negl %eax -; SSE2-NEXT: movd %eax, %xmm4 -; SSE2-NEXT: pshufd $0, %xmm4, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm2 -; SSE2-NEXT: pandn %xmm0, %xmm4 -; SSE2-NEXT: por %xmm2, %xmm4 +; SSE2-NEXT: xorps %xmm5, %xmm5 +; SSE2-NEXT: movd %eax, %xmm5 +; SSE2-NEXT: pshufd $0, %xmm5, %xmm5 +; SSE2-NEXT: movdqa %xmm5, %xmm4 +; SSE2-NEXT: pand %xmm0, %xmm5 +; SSE2-NEXT: pandn %xmm2, %xmm4 +; SSE2-NEXT: por %xmm5, %xmm4 +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: setne %al +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: negl %eax +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: pshufd $0, %xmm0, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: pandn %xmm3, %xmm2 +; SSE2-NEXT: por %xmm0, %xmm2 ; SSE2-NEXT: movdqa %xmm4, %xmm0 -; SSE2-NEXT: sete %cl -; SSE2-NEXT: negl %ecx -; SSE2-NEXT: movd %ecx, %xmm2 -; SSE2-NEXT: pshufd $0, %xmm2, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm3 -; SSE2-NEXT: pandn %xmm1, %xmm2 -; SSE2-NEXT: por %xmm3, %xmm2 ; SSE2-NEXT: movdqa %xmm2, %xmm1 ; SSE2-NEXT: retq ; ; AVX-LABEL: test_ctselect_v8f32: ; AVX: # %bb.0: -; AVX-NEXT: xorl %eax, %eax ; AVX-NEXT: testb $1, %dil -; AVX-NEXT: sete %al +; AVX-NEXT: movl $0, %eax +; AVX-NEXT: setne %al +; AVX-NEXT: movzbl %al, %eax ; AVX-NEXT: negl %eax -; AVX-NEXT: vmovd %eax, %ymm2 -; AVX-NEXT: vshufps $0, %ymm2, %ymm2, %ymm2 -; AVX-NEXT: vmovaps %ymm2, %ymm2 -; AVX-NEXT: vandps %ymm1, %ymm2, %ymm1 -; AVX-NEXT: vandnps %ymm0, %ymm2, %ymm0 -; AVX-NEXT: vorps %ymm0, %ymm1, %ymm0 -; AVX-NEXT: vmovaps %ymm0, %ymm0 +; AVX-NEXT: xorps %ymm3, %ymm3 +; AVX-NEXT: vmovd %eax, %ymm3 +; AVX-NEXT: vshufps $0, %ymm3, %ymm3, %ymm3 +; AVX-NEXT: vmovaps %ymm3, %ymm2 +; AVX-NEXT: andps %ymm0, %ymm3 +; AVX-NEXT: andnps %ymm1, %ymm2 +; AVX-NEXT: orps %ymm3, %ymm2 +; AVX-NEXT: vmovaps %ymm2, %ymm0 ; AVX-NEXT: retq ; ; AVX2-LABEL: test_ctselect_v8f32: ; AVX2: # %bb.0: -; AVX2-NEXT: xorl %eax, %eax ; AVX2-NEXT: testb $1, %dil -; AVX2-NEXT: sete %al +; AVX2-NEXT: movl $0, %eax +; AVX2-NEXT: setne %al +; AVX2-NEXT: movzbl %al, %eax ; AVX2-NEXT: negl %eax -; AVX2-NEXT: vmovd %eax, %ymm2 -; AVX2-NEXT: vpshufd $0, %ymm2, %ymm2 -; AVX2-NEXT: vmovdqa %ymm2, %ymm2 -; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm1 -; AVX2-NEXT: vpandn %ymm0, %ymm2, %ymm0 -; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, %ymm0 +; AVX2-NEXT: xorps %ymm3, %ymm3 +; AVX2-NEXT: vmovd %eax, %ymm3 +; AVX2-NEXT: vpshufd $0, %ymm3, %ymm3 +; AVX2-NEXT: vmovdqa %ymm3, %ymm2 +; AVX2-NEXT: pand %ymm0, %ymm3 +; AVX2-NEXT: pandn %ymm1, %ymm2 +; AVX2-NEXT: por %ymm3, %ymm2 +; AVX2-NEXT: vmovdqa %ymm2, %ymm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: 
test_ctselect_v8f32: @@ -384,57 +424,65 @@ define <8 x float> @test_ctselect_v8f32(i1 %cond, <8 x float> %a, <8 x float> %b define <4 x i64> @test_ctselect_v4i64(i1 %cond, <4 x i64> %a, <4 x i64> %b) { ; SSE2-LABEL: test_ctselect_v4i64: ; SSE2: # %bb.0: -; SSE2-NEXT: xorl %eax, %eax ; SSE2-NEXT: testb $1, %dil -; SSE2-NEXT: sete %al -; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: setne %al +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: negl %eax +; SSE2-NEXT: xorps %xmm5, %xmm5 +; SSE2-NEXT: movd %eax, %xmm5 +; SSE2-NEXT: pshufd $0, %xmm5, %xmm5 +; SSE2-NEXT: movdqa %xmm5, %xmm4 +; SSE2-NEXT: pand %xmm0, %xmm5 +; SSE2-NEXT: pandn %xmm2, %xmm4 +; SSE2-NEXT: por %xmm5, %xmm4 +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: setne %al +; SSE2-NEXT: movzbl %al, %eax ; SSE2-NEXT: negl %eax -; SSE2-NEXT: movd %eax, %xmm4 -; SSE2-NEXT: pshufd $0, %xmm4, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm2 -; SSE2-NEXT: pandn %xmm0, %xmm4 -; SSE2-NEXT: por %xmm2, %xmm4 +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: pshufd $0, %xmm0, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: pandn %xmm3, %xmm2 +; SSE2-NEXT: por %xmm0, %xmm2 ; SSE2-NEXT: movdqa %xmm4, %xmm0 -; SSE2-NEXT: sete %cl -; SSE2-NEXT: negl %ecx -; SSE2-NEXT: movd %ecx, %xmm2 -; SSE2-NEXT: pshufd $0, %xmm2, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm3 -; SSE2-NEXT: pandn %xmm1, %xmm2 -; SSE2-NEXT: por %xmm3, %xmm2 ; SSE2-NEXT: movdqa %xmm2, %xmm1 ; SSE2-NEXT: retq ; ; AVX-LABEL: test_ctselect_v4i64: ; AVX: # %bb.0: -; AVX-NEXT: xorl %eax, %eax ; AVX-NEXT: testb $1, %dil -; AVX-NEXT: sete %al +; AVX-NEXT: movl $0, %eax +; AVX-NEXT: setne %al +; AVX-NEXT: movzbl %al, %eax ; AVX-NEXT: negl %eax -; AVX-NEXT: vmovd %eax, %ymm2 -; AVX-NEXT: vshufpd $0, %ymm2, %ymm2, %ymm2 -; AVX-NEXT: vmovapd %ymm2, %ymm2 -; AVX-NEXT: vandpd %ymm1, %ymm2, %ymm1 -; AVX-NEXT: vandnpd %ymm0, %ymm2, %ymm0 -; AVX-NEXT: vorpd %ymm0, %ymm1, %ymm0 -; AVX-NEXT: vmovapd %ymm0, %ymm0 +; AVX-NEXT: xorps %ymm3, %ymm3 +; AVX-NEXT: vmovd %eax, %ymm3 +; AVX-NEXT: vshufpd $0, %ymm3, %ymm3, %ymm3 +; AVX-NEXT: vmovapd %ymm3, %ymm2 +; AVX-NEXT: andpd %ymm0, %ymm3 +; AVX-NEXT: andnpd %ymm1, %ymm2 +; AVX-NEXT: orpd %ymm3, %ymm2 +; AVX-NEXT: vmovapd %ymm2, %ymm0 ; AVX-NEXT: retq ; ; AVX2-LABEL: test_ctselect_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: xorl %eax, %eax ; AVX2-NEXT: testb $1, %dil -; AVX2-NEXT: sete %al +; AVX2-NEXT: movl $0, %eax +; AVX2-NEXT: setne %al +; AVX2-NEXT: movzbl %al, %eax ; AVX2-NEXT: negl %eax -; AVX2-NEXT: vmovd %eax, %ymm2 -; AVX2-NEXT: vshufpd $0, %ymm2, %ymm2, %ymm2 -; AVX2-NEXT: vmovapd %ymm2, %ymm2 -; AVX2-NEXT: vandpd %ymm1, %ymm2, %ymm1 -; AVX2-NEXT: vandnpd %ymm0, %ymm2, %ymm0 -; AVX2-NEXT: vorpd %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: vmovapd %ymm0, %ymm0 +; AVX2-NEXT: xorps %ymm3, %ymm3 +; AVX2-NEXT: vmovd %eax, %ymm3 +; AVX2-NEXT: vshufpd $0, %ymm3, %ymm3, %ymm3 +; AVX2-NEXT: vmovapd %ymm3, %ymm2 +; AVX2-NEXT: andpd %ymm0, %ymm3 +; AVX2-NEXT: andnpd %ymm1, %ymm2 +; AVX2-NEXT: orpd %ymm3, %ymm2 +; AVX2-NEXT: vmovapd %ymm2, %ymm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: test_ctselect_v4i64: @@ -453,57 +501,65 @@ define <4 x i64> @test_ctselect_v4i64(i1 %cond, <4 x i64> %a, <4 x i64> %b) { define <4 x double> @test_ctselect_v4f64(i1 %cond, <4 x double> %a, <4 x double> %b) { ; SSE2-LABEL: test_ctselect_v4f64: ; SSE2: # %bb.0: -; SSE2-NEXT: xorl %eax, %eax ; SSE2-NEXT: testb $1, %dil -; SSE2-NEXT: sete %al -; SSE2-NEXT: xorl %ecx, %ecx 
+; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: setne %al +; SSE2-NEXT: movzbl %al, %eax ; SSE2-NEXT: negl %eax -; SSE2-NEXT: movd %eax, %xmm4 -; SSE2-NEXT: pshufd $0, %xmm4, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm2 -; SSE2-NEXT: pandn %xmm0, %xmm4 -; SSE2-NEXT: por %xmm2, %xmm4 +; SSE2-NEXT: xorps %xmm5, %xmm5 +; SSE2-NEXT: movd %eax, %xmm5 +; SSE2-NEXT: pshufd $0, %xmm5, %xmm5 +; SSE2-NEXT: movdqa %xmm5, %xmm4 +; SSE2-NEXT: pand %xmm0, %xmm5 +; SSE2-NEXT: pandn %xmm2, %xmm4 +; SSE2-NEXT: por %xmm5, %xmm4 +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: setne %al +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: negl %eax +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: pshufd $0, %xmm0, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: pandn %xmm3, %xmm2 +; SSE2-NEXT: por %xmm0, %xmm2 ; SSE2-NEXT: movdqa %xmm4, %xmm0 -; SSE2-NEXT: sete %cl -; SSE2-NEXT: negl %ecx -; SSE2-NEXT: movd %ecx, %xmm2 -; SSE2-NEXT: pshufd $0, %xmm2, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm3 -; SSE2-NEXT: pandn %xmm1, %xmm2 -; SSE2-NEXT: por %xmm3, %xmm2 ; SSE2-NEXT: movdqa %xmm2, %xmm1 ; SSE2-NEXT: retq ; ; AVX-LABEL: test_ctselect_v4f64: ; AVX: # %bb.0: -; AVX-NEXT: xorl %eax, %eax ; AVX-NEXT: testb $1, %dil -; AVX-NEXT: sete %al +; AVX-NEXT: movl $0, %eax +; AVX-NEXT: setne %al +; AVX-NEXT: movzbl %al, %eax ; AVX-NEXT: negl %eax -; AVX-NEXT: vmovd %eax, %ymm2 -; AVX-NEXT: vshufpd $0, %ymm2, %ymm2, %ymm2 -; AVX-NEXT: vmovapd %ymm2, %ymm2 -; AVX-NEXT: vandpd %ymm1, %ymm2, %ymm1 -; AVX-NEXT: vandnpd %ymm0, %ymm2, %ymm0 -; AVX-NEXT: vorpd %ymm0, %ymm1, %ymm0 -; AVX-NEXT: vmovapd %ymm0, %ymm0 +; AVX-NEXT: xorps %ymm3, %ymm3 +; AVX-NEXT: vmovd %eax, %ymm3 +; AVX-NEXT: vshufpd $0, %ymm3, %ymm3, %ymm3 +; AVX-NEXT: vmovapd %ymm3, %ymm2 +; AVX-NEXT: andpd %ymm0, %ymm3 +; AVX-NEXT: andnpd %ymm1, %ymm2 +; AVX-NEXT: orpd %ymm3, %ymm2 +; AVX-NEXT: vmovapd %ymm2, %ymm0 ; AVX-NEXT: retq ; ; AVX2-LABEL: test_ctselect_v4f64: ; AVX2: # %bb.0: -; AVX2-NEXT: xorl %eax, %eax ; AVX2-NEXT: testb $1, %dil -; AVX2-NEXT: sete %al +; AVX2-NEXT: movl $0, %eax +; AVX2-NEXT: setne %al +; AVX2-NEXT: movzbl %al, %eax ; AVX2-NEXT: negl %eax -; AVX2-NEXT: vmovd %eax, %ymm2 -; AVX2-NEXT: vshufpd $0, %ymm2, %ymm2, %ymm2 -; AVX2-NEXT: vmovapd %ymm2, %ymm2 -; AVX2-NEXT: vandpd %ymm1, %ymm2, %ymm1 -; AVX2-NEXT: vandnpd %ymm0, %ymm2, %ymm0 -; AVX2-NEXT: vorpd %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: vmovapd %ymm0, %ymm0 +; AVX2-NEXT: xorps %ymm3, %ymm3 +; AVX2-NEXT: vmovd %eax, %ymm3 +; AVX2-NEXT: vshufpd $0, %ymm3, %ymm3, %ymm3 +; AVX2-NEXT: vmovapd %ymm3, %ymm2 +; AVX2-NEXT: andpd %ymm0, %ymm3 +; AVX2-NEXT: andnpd %ymm1, %ymm2 +; AVX2-NEXT: orpd %ymm3, %ymm2 +; AVX2-NEXT: vmovapd %ymm2, %ymm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: test_ctselect_v4f64: @@ -523,97 +579,113 @@ define <4 x double> @test_ctselect_v4f64(i1 %cond, <4 x double> %a, <4 x double> define <16 x i32> @test_ctselect_v16i32(i1 %cond, <16 x i32> %a, <16 x i32> %b) { ; SSE2-LABEL: test_ctselect_v16i32: ; SSE2: # %bb.0: -; SSE2-NEXT: xorl %eax, %eax ; SSE2-NEXT: testb $1, %dil -; SSE2-NEXT: sete %al -; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: setne %al +; SSE2-NEXT: movzbl %al, %eax ; SSE2-NEXT: negl %eax -; SSE2-NEXT: movd %eax, %xmm8 -; SSE2-NEXT: pshufd $0, %xmm8, %xmm8 -; SSE2-NEXT: movdqa %xmm8, %xmm8 -; SSE2-NEXT: pand %xmm8, %xmm4 -; SSE2-NEXT: pandn %xmm0, %xmm8 -; SSE2-NEXT: por %xmm4, %xmm8 +; SSE2-NEXT: xorps %xmm9, %xmm9 +; SSE2-NEXT: movd %eax, 
%xmm9 +; SSE2-NEXT: pshufd $0, %xmm9, %xmm9 +; SSE2-NEXT: movdqa %xmm9, %xmm8 +; SSE2-NEXT: pand %xmm0, %xmm9 +; SSE2-NEXT: pandn %xmm4, %xmm8 +; SSE2-NEXT: por %xmm9, %xmm8 +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: setne %al +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: negl %eax +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: pshufd $0, %xmm0, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: pandn %xmm5, %xmm4 +; SSE2-NEXT: por %xmm0, %xmm4 +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: setne %al +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: negl %eax +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: pshufd $0, %xmm0, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm5 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: pandn %xmm6, %xmm5 +; SSE2-NEXT: por %xmm0, %xmm5 +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: setne %al +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: negl %eax +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: pshufd $0, %xmm0, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm6 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pandn %xmm7, %xmm6 +; SSE2-NEXT: por %xmm0, %xmm6 ; SSE2-NEXT: movdqa %xmm8, %xmm0 -; SSE2-NEXT: sete %cl -; SSE2-NEXT: xorl %eax, %eax -; SSE2-NEXT: negl %ecx -; SSE2-NEXT: movd %ecx, %xmm4 -; SSE2-NEXT: pshufd $0, %xmm4, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm5 -; SSE2-NEXT: pandn %xmm1, %xmm4 -; SSE2-NEXT: por %xmm5, %xmm4 ; SSE2-NEXT: movdqa %xmm4, %xmm1 -; SSE2-NEXT: sete %al -; SSE2-NEXT: xorl %ecx, %ecx -; SSE2-NEXT: negl %eax -; SSE2-NEXT: movd %eax, %xmm4 -; SSE2-NEXT: pshufd $0, %xmm4, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm6 -; SSE2-NEXT: pandn %xmm2, %xmm4 -; SSE2-NEXT: por %xmm6, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm2 -; SSE2-NEXT: sete %cl -; SSE2-NEXT: negl %ecx -; SSE2-NEXT: movd %ecx, %xmm4 -; SSE2-NEXT: pshufd $0, %xmm4, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm7 -; SSE2-NEXT: pandn %xmm3, %xmm4 -; SSE2-NEXT: por %xmm7, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm3 +; SSE2-NEXT: movdqa %xmm5, %xmm2 +; SSE2-NEXT: movdqa %xmm6, %xmm3 ; SSE2-NEXT: retq ; ; AVX-LABEL: test_ctselect_v16i32: ; AVX: # %bb.0: -; AVX-NEXT: xorl %eax, %eax ; AVX-NEXT: testb $1, %dil -; AVX-NEXT: sete %al -; AVX-NEXT: xorl %ecx, %ecx +; AVX-NEXT: movl $0, %eax +; AVX-NEXT: setne %al +; AVX-NEXT: movzbl %al, %eax ; AVX-NEXT: negl %eax -; AVX-NEXT: vmovd %eax, %ymm4 -; AVX-NEXT: vshufps $0, %ymm4, %ymm4, %ymm4 -; AVX-NEXT: vmovaps %ymm4, %ymm4 -; AVX-NEXT: vandps %ymm2, %ymm4, %ymm2 -; AVX-NEXT: vandnps %ymm0, %ymm4, %ymm0 -; AVX-NEXT: vorps %ymm0, %ymm2, %ymm0 -; AVX-NEXT: vmovaps %ymm0, %ymm0 -; AVX-NEXT: sete %cl -; AVX-NEXT: negl %ecx -; AVX-NEXT: vmovd %ecx, %ymm2 -; AVX-NEXT: vshufps $0, %ymm2, %ymm2, %ymm2 -; AVX-NEXT: vmovaps %ymm2, %ymm2 -; AVX-NEXT: vandps %ymm3, %ymm2, %ymm3 -; AVX-NEXT: vandnps %ymm1, %ymm2, %ymm1 -; AVX-NEXT: vorps %ymm1, %ymm3, %ymm1 -; AVX-NEXT: vmovaps %ymm1, %ymm1 +; AVX-NEXT: xorps %ymm5, %ymm5 +; AVX-NEXT: vmovd %eax, %ymm5 +; AVX-NEXT: vshufps $0, %ymm5, %ymm5, %ymm5 +; AVX-NEXT: vmovaps %ymm5, %ymm4 +; AVX-NEXT: andps %ymm0, %ymm5 +; AVX-NEXT: andnps %ymm2, %ymm4 +; AVX-NEXT: orps %ymm5, %ymm4 +; AVX-NEXT: movl $0, %eax +; AVX-NEXT: setne %al +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: negl %eax +; AVX-NEXT: xorps %ymm0, %ymm0 +; AVX-NEXT: vmovd %eax, %ymm0 +; AVX-NEXT: vshufps $0, %ymm0, %ymm0, %ymm0 +; AVX-NEXT: vmovaps %ymm0, %ymm2 +; AVX-NEXT: andps %ymm1, 
%ymm0 +; AVX-NEXT: andnps %ymm3, %ymm2 +; AVX-NEXT: orps %ymm0, %ymm2 +; AVX-NEXT: vmovaps %ymm4, %ymm0 +; AVX-NEXT: vmovaps %ymm2, %ymm1 ; AVX-NEXT: retq ; ; AVX2-LABEL: test_ctselect_v16i32: ; AVX2: # %bb.0: -; AVX2-NEXT: xorl %eax, %eax ; AVX2-NEXT: testb $1, %dil -; AVX2-NEXT: sete %al -; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: movl $0, %eax +; AVX2-NEXT: setne %al +; AVX2-NEXT: movzbl %al, %eax ; AVX2-NEXT: negl %eax -; AVX2-NEXT: vmovd %eax, %ymm4 -; AVX2-NEXT: vpshufd $0, %ymm4, %ymm4 -; AVX2-NEXT: vmovdqa %ymm4, %ymm4 -; AVX2-NEXT: vpand %ymm2, %ymm4, %ymm2 -; AVX2-NEXT: vpandn %ymm0, %ymm4, %ymm0 -; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, %ymm0 -; AVX2-NEXT: sete %cl -; AVX2-NEXT: negl %ecx -; AVX2-NEXT: vmovd %ecx, %ymm2 -; AVX2-NEXT: vpshufd $0, %ymm2, %ymm2 -; AVX2-NEXT: vmovdqa %ymm2, %ymm2 -; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm3 -; AVX2-NEXT: vpandn %ymm1, %ymm2, %ymm1 -; AVX2-NEXT: vpor %ymm1, %ymm3, %ymm1 -; AVX2-NEXT: vmovdqa %ymm1, %ymm1 +; AVX2-NEXT: xorps %ymm5, %ymm5 +; AVX2-NEXT: vmovd %eax, %ymm5 +; AVX2-NEXT: vpshufd $0, %ymm5, %ymm5 +; AVX2-NEXT: vmovdqa %ymm5, %ymm4 +; AVX2-NEXT: pand %ymm0, %ymm5 +; AVX2-NEXT: pandn %ymm2, %ymm4 +; AVX2-NEXT: por %ymm5, %ymm4 +; AVX2-NEXT: movl $0, %eax +; AVX2-NEXT: setne %al +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: negl %eax +; AVX2-NEXT: pxor %ymm0, %ymm0 +; AVX2-NEXT: vmovd %eax, %ymm0 +; AVX2-NEXT: vpshufd $0, %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, %ymm2 +; AVX2-NEXT: pand %ymm1, %ymm0 +; AVX2-NEXT: pandn %ymm3, %ymm2 +; AVX2-NEXT: por %ymm0, %ymm2 +; AVX2-NEXT: vmovdqa %ymm4, %ymm0 +; AVX2-NEXT: vmovdqa %ymm2, %ymm1 ; AVX2-NEXT: retq ; ; AVX512-LABEL: test_ctselect_v16i32: @@ -632,97 +704,113 @@ define <16 x i32> @test_ctselect_v16i32(i1 %cond, <16 x i32> %a, <16 x i32> %b) define <16 x float> @test_ctselect_v16f32(i1 %cond, <16 x float> %a, <16 x float> %b) { ; SSE2-LABEL: test_ctselect_v16f32: ; SSE2: # %bb.0: -; SSE2-NEXT: xorl %eax, %eax ; SSE2-NEXT: testb $1, %dil -; SSE2-NEXT: sete %al -; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: setne %al +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: negl %eax +; SSE2-NEXT: xorps %xmm9, %xmm9 +; SSE2-NEXT: movd %eax, %xmm9 +; SSE2-NEXT: pshufd $0, %xmm9, %xmm9 +; SSE2-NEXT: movdqa %xmm9, %xmm8 +; SSE2-NEXT: pand %xmm0, %xmm9 +; SSE2-NEXT: pandn %xmm4, %xmm8 +; SSE2-NEXT: por %xmm9, %xmm8 +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: setne %al +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: negl %eax +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: pshufd $0, %xmm0, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: pandn %xmm5, %xmm4 +; SSE2-NEXT: por %xmm0, %xmm4 +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: setne %al +; SSE2-NEXT: movzbl %al, %eax ; SSE2-NEXT: negl %eax -; SSE2-NEXT: movd %eax, %xmm8 -; SSE2-NEXT: pshufd $0, %xmm8, %xmm8 -; SSE2-NEXT: movdqa %xmm8, %xmm8 -; SSE2-NEXT: pand %xmm8, %xmm4 -; SSE2-NEXT: pandn %xmm0, %xmm8 -; SSE2-NEXT: por %xmm4, %xmm8 +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: pshufd $0, %xmm0, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm5 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: pandn %xmm6, %xmm5 +; SSE2-NEXT: por %xmm0, %xmm5 +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: setne %al +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: negl %eax +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: pshufd $0, %xmm0, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm6 +; SSE2-NEXT: pand %xmm3, %xmm0 +; 
SSE2-NEXT: pandn %xmm7, %xmm6 +; SSE2-NEXT: por %xmm0, %xmm6 ; SSE2-NEXT: movdqa %xmm8, %xmm0 -; SSE2-NEXT: sete %cl -; SSE2-NEXT: xorl %eax, %eax -; SSE2-NEXT: negl %ecx -; SSE2-NEXT: movd %ecx, %xmm4 -; SSE2-NEXT: pshufd $0, %xmm4, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm5 -; SSE2-NEXT: pandn %xmm1, %xmm4 -; SSE2-NEXT: por %xmm5, %xmm4 ; SSE2-NEXT: movdqa %xmm4, %xmm1 -; SSE2-NEXT: sete %al -; SSE2-NEXT: xorl %ecx, %ecx -; SSE2-NEXT: negl %eax -; SSE2-NEXT: movd %eax, %xmm4 -; SSE2-NEXT: pshufd $0, %xmm4, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm6 -; SSE2-NEXT: pandn %xmm2, %xmm4 -; SSE2-NEXT: por %xmm6, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm2 -; SSE2-NEXT: sete %cl -; SSE2-NEXT: negl %ecx -; SSE2-NEXT: movd %ecx, %xmm4 -; SSE2-NEXT: pshufd $0, %xmm4, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm7 -; SSE2-NEXT: pandn %xmm3, %xmm4 -; SSE2-NEXT: por %xmm7, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm3 +; SSE2-NEXT: movdqa %xmm5, %xmm2 +; SSE2-NEXT: movdqa %xmm6, %xmm3 ; SSE2-NEXT: retq ; ; AVX-LABEL: test_ctselect_v16f32: ; AVX: # %bb.0: -; AVX-NEXT: xorl %eax, %eax ; AVX-NEXT: testb $1, %dil -; AVX-NEXT: sete %al -; AVX-NEXT: xorl %ecx, %ecx +; AVX-NEXT: movl $0, %eax +; AVX-NEXT: setne %al +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: negl %eax +; AVX-NEXT: xorps %ymm5, %ymm5 +; AVX-NEXT: vmovd %eax, %ymm5 +; AVX-NEXT: vshufps $0, %ymm5, %ymm5, %ymm5 +; AVX-NEXT: vmovaps %ymm5, %ymm4 +; AVX-NEXT: andps %ymm0, %ymm5 +; AVX-NEXT: andnps %ymm2, %ymm4 +; AVX-NEXT: orps %ymm5, %ymm4 +; AVX-NEXT: movl $0, %eax +; AVX-NEXT: setne %al +; AVX-NEXT: movzbl %al, %eax ; AVX-NEXT: negl %eax -; AVX-NEXT: vmovd %eax, %ymm4 -; AVX-NEXT: vshufps $0, %ymm4, %ymm4, %ymm4 -; AVX-NEXT: vmovaps %ymm4, %ymm4 -; AVX-NEXT: vandps %ymm2, %ymm4, %ymm2 -; AVX-NEXT: vandnps %ymm0, %ymm4, %ymm0 -; AVX-NEXT: vorps %ymm0, %ymm2, %ymm0 -; AVX-NEXT: vmovaps %ymm0, %ymm0 -; AVX-NEXT: sete %cl -; AVX-NEXT: negl %ecx -; AVX-NEXT: vmovd %ecx, %ymm2 -; AVX-NEXT: vshufps $0, %ymm2, %ymm2, %ymm2 -; AVX-NEXT: vmovaps %ymm2, %ymm2 -; AVX-NEXT: vandps %ymm3, %ymm2, %ymm3 -; AVX-NEXT: vandnps %ymm1, %ymm2, %ymm1 -; AVX-NEXT: vorps %ymm1, %ymm3, %ymm1 -; AVX-NEXT: vmovaps %ymm1, %ymm1 +; AVX-NEXT: xorps %ymm0, %ymm0 +; AVX-NEXT: vmovd %eax, %ymm0 +; AVX-NEXT: vshufps $0, %ymm0, %ymm0, %ymm0 +; AVX-NEXT: vmovaps %ymm0, %ymm2 +; AVX-NEXT: andps %ymm1, %ymm0 +; AVX-NEXT: andnps %ymm3, %ymm2 +; AVX-NEXT: orps %ymm0, %ymm2 +; AVX-NEXT: vmovaps %ymm4, %ymm0 +; AVX-NEXT: vmovaps %ymm2, %ymm1 ; AVX-NEXT: retq ; ; AVX2-LABEL: test_ctselect_v16f32: ; AVX2: # %bb.0: -; AVX2-NEXT: xorl %eax, %eax ; AVX2-NEXT: testb $1, %dil -; AVX2-NEXT: sete %al -; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: movl $0, %eax +; AVX2-NEXT: setne %al +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: negl %eax +; AVX2-NEXT: xorps %ymm5, %ymm5 +; AVX2-NEXT: vmovd %eax, %ymm5 +; AVX2-NEXT: vpshufd $0, %ymm5, %ymm5 # ymm5 = ymm5[0,0,0,0,4,4,4,4] +; AVX2-NEXT: vmovdqa %ymm5, %ymm4 +; AVX2-NEXT: pand %ymm0, %ymm5 +; AVX2-NEXT: pandn %ymm2, %ymm4 +; AVX2-NEXT: por %ymm5, %ymm4 +; AVX2-NEXT: movl $0, %eax +; AVX2-NEXT: setne %al +; AVX2-NEXT: movzbl %al, %eax ; AVX2-NEXT: negl %eax -; AVX2-NEXT: vmovd %eax, %ymm4 -; AVX2-NEXT: vpshufd $0, %ymm4, %ymm4 -; AVX2-NEXT: vmovdqa %ymm4, %ymm4 -; AVX2-NEXT: vpand %ymm2, %ymm4, %ymm2 -; AVX2-NEXT: vpandn %ymm0, %ymm4, %ymm0 -; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, %ymm0 -; AVX2-NEXT: sete %cl -; AVX2-NEXT: negl %ecx -; AVX2-NEXT: vmovd 
%ecx, %ymm2 -; AVX2-NEXT: vpshufd $0, %ymm2, %ymm2 -; AVX2-NEXT: vmovdqa %ymm2, %ymm2 -; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm3 -; AVX2-NEXT: vpandn %ymm1, %ymm2, %ymm1 -; AVX2-NEXT: vpor %ymm1, %ymm3, %ymm1 -; AVX2-NEXT: vmovdqa %ymm1, %ymm1 +; AVX2-NEXT: pxor %ymm0, %ymm0 +; AVX2-NEXT: vmovd %eax, %ymm0 +; AVX2-NEXT: vpshufd $0, %ymm0, %ymm0 # ymm0 = ymm0[0,0,0,0,4,4,4,4] +; AVX2-NEXT: vmovdqa %ymm0, %ymm2 +; AVX2-NEXT: pand %ymm1, %ymm0 +; AVX2-NEXT: pandn %ymm3, %ymm2 +; AVX2-NEXT: por %ymm0, %ymm2 +; AVX2-NEXT: vmovdqa %ymm4, %ymm0 +; AVX2-NEXT: vmovdqa %ymm2, %ymm1 ; AVX2-NEXT: retq ; ; AVX512-LABEL: test_ctselect_v16f32: @@ -741,97 +829,113 @@ define <16 x float> @test_ctselect_v16f32(i1 %cond, <16 x float> %a, <16 x float define <8 x i64> @test_ctselect_v8i64(i1 %cond, <8 x i64> %a, <8 x i64> %b) { ; SSE2-LABEL: test_ctselect_v8i64: ; SSE2: # %bb.0: -; SSE2-NEXT: xorl %eax, %eax ; SSE2-NEXT: testb $1, %dil -; SSE2-NEXT: sete %al -; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: setne %al +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: negl %eax +; SSE2-NEXT: xorps %xmm9, %xmm9 +; SSE2-NEXT: movd %eax, %xmm9 +; SSE2-NEXT: pshufd $0, %xmm9, %xmm9 +; SSE2-NEXT: movdqa %xmm9, %xmm8 +; SSE2-NEXT: pand %xmm0, %xmm9 +; SSE2-NEXT: pandn %xmm4, %xmm8 +; SSE2-NEXT: por %xmm9, %xmm8 +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: setne %al +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: negl %eax +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: pshufd $0, %xmm0, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: pandn %xmm5, %xmm4 +; SSE2-NEXT: por %xmm0, %xmm4 +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: setne %al +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: negl %eax +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: pshufd $0, %xmm0, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm5 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: pandn %xmm6, %xmm5 +; SSE2-NEXT: por %xmm0, %xmm5 +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: setne %al +; SSE2-NEXT: movzbl %al, %eax ; SSE2-NEXT: negl %eax -; SSE2-NEXT: movd %eax, %xmm8 -; SSE2-NEXT: pshufd $0, %xmm8, %xmm8 -; SSE2-NEXT: movdqa %xmm8, %xmm8 -; SSE2-NEXT: pand %xmm8, %xmm4 -; SSE2-NEXT: pandn %xmm0, %xmm8 -; SSE2-NEXT: por %xmm4, %xmm8 +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: pshufd $0, %xmm0, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm6 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pandn %xmm7, %xmm6 +; SSE2-NEXT: por %xmm0, %xmm6 ; SSE2-NEXT: movdqa %xmm8, %xmm0 -; SSE2-NEXT: sete %cl -; SSE2-NEXT: xorl %eax, %eax -; SSE2-NEXT: negl %ecx -; SSE2-NEXT: movd %ecx, %xmm4 -; SSE2-NEXT: pshufd $0, %xmm4, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm5 -; SSE2-NEXT: pandn %xmm1, %xmm4 -; SSE2-NEXT: por %xmm5, %xmm4 ; SSE2-NEXT: movdqa %xmm4, %xmm1 -; SSE2-NEXT: sete %al -; SSE2-NEXT: xorl %ecx, %ecx -; SSE2-NEXT: negl %eax -; SSE2-NEXT: movd %eax, %xmm4 -; SSE2-NEXT: pshufd $0, %xmm4, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm6 -; SSE2-NEXT: pandn %xmm2, %xmm4 -; SSE2-NEXT: por %xmm6, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm2 -; SSE2-NEXT: sete %cl -; SSE2-NEXT: negl %ecx -; SSE2-NEXT: movd %ecx, %xmm4 -; SSE2-NEXT: pshufd $0, %xmm4, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm7 -; SSE2-NEXT: pandn %xmm3, %xmm4 -; SSE2-NEXT: por %xmm7, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm3 +; SSE2-NEXT: movdqa %xmm5, %xmm2 +; SSE2-NEXT: movdqa %xmm6, %xmm3 ; SSE2-NEXT: retq ; ; 
AVX-LABEL: test_ctselect_v8i64: ; AVX: # %bb.0: -; AVX-NEXT: xorl %eax, %eax ; AVX-NEXT: testb $1, %dil -; AVX-NEXT: sete %al -; AVX-NEXT: xorl %ecx, %ecx +; AVX-NEXT: movl $0, %eax +; AVX-NEXT: setne %al +; AVX-NEXT: movzbl %al, %eax ; AVX-NEXT: negl %eax -; AVX-NEXT: vmovd %eax, %ymm4 -; AVX-NEXT: vshufpd $0, %ymm4, %ymm4, %ymm4 -; AVX-NEXT: vmovapd %ymm4, %ymm4 -; AVX-NEXT: vandpd %ymm2, %ymm4, %ymm2 -; AVX-NEXT: vandnpd %ymm0, %ymm4, %ymm0 -; AVX-NEXT: vorpd %ymm0, %ymm2, %ymm0 -; AVX-NEXT: vmovapd %ymm0, %ymm0 -; AVX-NEXT: sete %cl -; AVX-NEXT: negl %ecx -; AVX-NEXT: vmovd %ecx, %ymm2 -; AVX-NEXT: vshufpd $0, %ymm2, %ymm2, %ymm2 -; AVX-NEXT: vmovapd %ymm2, %ymm2 -; AVX-NEXT: vandpd %ymm3, %ymm2, %ymm3 -; AVX-NEXT: vandnpd %ymm1, %ymm2, %ymm1 -; AVX-NEXT: vorpd %ymm1, %ymm3, %ymm1 -; AVX-NEXT: vmovapd %ymm1, %ymm1 +; AVX-NEXT: xorps %ymm5, %ymm5 +; AVX-NEXT: vmovd %eax, %ymm5 +; AVX-NEXT: vshufpd $0, %ymm5, %ymm5, %ymm5 +; AVX-NEXT: vmovapd %ymm5, %ymm4 +; AVX-NEXT: andpd %ymm0, %ymm5 +; AVX-NEXT: andnpd %ymm2, %ymm4 +; AVX-NEXT: orpd %ymm5, %ymm4 +; AVX-NEXT: movl $0, %eax +; AVX-NEXT: setne %al +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: negl %eax +; AVX-NEXT: xorpd %ymm0, %ymm0 +; AVX-NEXT: vmovd %eax, %ymm0 +; AVX-NEXT: vshufpd $0, %ymm0, %ymm0, %ymm0 +; AVX-NEXT: vmovapd %ymm0, %ymm2 +; AVX-NEXT: andpd %ymm1, %ymm0 +; AVX-NEXT: andnpd %ymm3, %ymm2 +; AVX-NEXT: orpd %ymm0, %ymm2 +; AVX-NEXT: vmovapd %ymm4, %ymm0 +; AVX-NEXT: vmovapd %ymm2, %ymm1 ; AVX-NEXT: retq ; ; AVX2-LABEL: test_ctselect_v8i64: ; AVX2: # %bb.0: -; AVX2-NEXT: xorl %eax, %eax ; AVX2-NEXT: testb $1, %dil -; AVX2-NEXT: sete %al -; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: movl $0, %eax +; AVX2-NEXT: setne %al +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: negl %eax +; AVX2-NEXT: xorps %ymm5, %ymm5 +; AVX2-NEXT: vmovd %eax, %ymm5 +; AVX2-NEXT: vshufpd $0, %ymm5, %ymm5, %ymm5 # ymm5 = ymm5[0,0,2,2] +; AVX2-NEXT: vmovapd %ymm5, %ymm4 +; AVX2-NEXT: andpd %ymm0, %ymm5 +; AVX2-NEXT: andnpd %ymm2, %ymm4 +; AVX2-NEXT: orpd %ymm5, %ymm4 +; AVX2-NEXT: movl $0, %eax +; AVX2-NEXT: setne %al +; AVX2-NEXT: movzbl %al, %eax ; AVX2-NEXT: negl %eax -; AVX2-NEXT: vmovd %eax, %ymm4 -; AVX2-NEXT: vshufpd $0, %ymm4, %ymm4, %ymm4 -; AVX2-NEXT: vmovapd %ymm4, %ymm4 -; AVX2-NEXT: vandpd %ymm2, %ymm4, %ymm2 -; AVX2-NEXT: vandnpd %ymm0, %ymm4, %ymm0 -; AVX2-NEXT: vorpd %ymm0, %ymm2, %ymm0 -; AVX2-NEXT: vmovapd %ymm0, %ymm0 -; AVX2-NEXT: sete %cl -; AVX2-NEXT: negl %ecx -; AVX2-NEXT: vmovd %ecx, %ymm2 -; AVX2-NEXT: vshufpd $0, %ymm2, %ymm2, %ymm2 -; AVX2-NEXT: vmovapd %ymm2, %ymm2 -; AVX2-NEXT: vandpd %ymm3, %ymm2, %ymm3 -; AVX2-NEXT: vandnpd %ymm1, %ymm2, %ymm1 -; AVX2-NEXT: vorpd %ymm1, %ymm3, %ymm1 -; AVX2-NEXT: vmovapd %ymm1, %ymm1 +; AVX2-NEXT: xorpd %ymm0, %ymm0 +; AVX2-NEXT: vmovd %eax, %ymm0 +; AVX2-NEXT: vshufpd $0, %ymm0, %ymm0, %ymm0 # ymm0 = ymm0[0,0,2,2] +; AVX2-NEXT: vmovapd %ymm0, %ymm2 +; AVX2-NEXT: andpd %ymm1, %ymm0 +; AVX2-NEXT: andnpd %ymm3, %ymm2 +; AVX2-NEXT: orpd %ymm0, %ymm2 +; AVX2-NEXT: vmovapd %ymm4, %ymm0 +; AVX2-NEXT: vmovapd %ymm2, %ymm1 ; AVX2-NEXT: retq ; ; AVX512-LABEL: test_ctselect_v8i64: @@ -850,97 +954,113 @@ define <8 x i64> @test_ctselect_v8i64(i1 %cond, <8 x i64> %a, <8 x i64> %b) { define <8 x double> @test_ctselect_v8f64(i1 %cond, <8 x double> %a, <8 x double> %b) { ; SSE2-LABEL: test_ctselect_v8f64: ; SSE2: # %bb.0: -; SSE2-NEXT: xorl %eax, %eax ; SSE2-NEXT: testb $1, %dil -; SSE2-NEXT: sete %al -; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: setne %al +; SSE2-NEXT: movzbl %al, 
%eax +; SSE2-NEXT: negl %eax +; SSE2-NEXT: xorps %xmm9, %xmm9 +; SSE2-NEXT: movd %eax, %xmm9 +; SSE2-NEXT: pshufd $0, %xmm9, %xmm9 +; SSE2-NEXT: movdqa %xmm9, %xmm8 +; SSE2-NEXT: pand %xmm0, %xmm9 +; SSE2-NEXT: pandn %xmm4, %xmm8 +; SSE2-NEXT: por %xmm9, %xmm8 +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: setne %al +; SSE2-NEXT: movzbl %al, %eax ; SSE2-NEXT: negl %eax -; SSE2-NEXT: movd %eax, %xmm8 -; SSE2-NEXT: pshufd $0, %xmm8, %xmm8 -; SSE2-NEXT: movdqa %xmm8, %xmm8 -; SSE2-NEXT: pand %xmm8, %xmm4 -; SSE2-NEXT: pandn %xmm0, %xmm8 -; SSE2-NEXT: por %xmm4, %xmm8 +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: pshufd $0, %xmm0, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: pandn %xmm5, %xmm4 +; SSE2-NEXT: por %xmm0, %xmm4 +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: setne %al +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: negl %eax +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: pshufd $0, %xmm0, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm5 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: pandn %xmm6, %xmm5 +; SSE2-NEXT: por %xmm0, %xmm5 +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: setne %al +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: negl %eax +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: pshufd $0, %xmm0, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm6 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pandn %xmm7, %xmm6 +; SSE2-NEXT: por %xmm0, %xmm6 ; SSE2-NEXT: movdqa %xmm8, %xmm0 -; SSE2-NEXT: sete %cl -; SSE2-NEXT: xorl %eax, %eax -; SSE2-NEXT: negl %ecx -; SSE2-NEXT: movd %ecx, %xmm4 -; SSE2-NEXT: pshufd $0, %xmm4, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm5 -; SSE2-NEXT: pandn %xmm1, %xmm4 -; SSE2-NEXT: por %xmm5, %xmm4 ; SSE2-NEXT: movdqa %xmm4, %xmm1 -; SSE2-NEXT: sete %al -; SSE2-NEXT: xorl %ecx, %ecx -; SSE2-NEXT: negl %eax -; SSE2-NEXT: movd %eax, %xmm4 -; SSE2-NEXT: pshufd $0, %xmm4, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm6 -; SSE2-NEXT: pandn %xmm2, %xmm4 -; SSE2-NEXT: por %xmm6, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm2 -; SSE2-NEXT: sete %cl -; SSE2-NEXT: negl %ecx -; SSE2-NEXT: movd %ecx, %xmm4 -; SSE2-NEXT: pshufd $0, %xmm4, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm7 -; SSE2-NEXT: pandn %xmm3, %xmm4 -; SSE2-NEXT: por %xmm7, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm3 +; SSE2-NEXT: movdqa %xmm5, %xmm2 +; SSE2-NEXT: movdqa %xmm6, %xmm3 ; SSE2-NEXT: retq ; ; AVX-LABEL: test_ctselect_v8f64: ; AVX: # %bb.0: -; AVX-NEXT: xorl %eax, %eax ; AVX-NEXT: testb $1, %dil -; AVX-NEXT: sete %al -; AVX-NEXT: xorl %ecx, %ecx +; AVX-NEXT: movl $0, %eax +; AVX-NEXT: setne %al +; AVX-NEXT: movzbl %al, %eax ; AVX-NEXT: negl %eax -; AVX-NEXT: vmovd %eax, %ymm4 -; AVX-NEXT: vshufpd $0, %ymm4, %ymm4, %ymm4 -; AVX-NEXT: vmovapd %ymm4, %ymm4 -; AVX-NEXT: vandpd %ymm2, %ymm4, %ymm2 -; AVX-NEXT: vandnpd %ymm0, %ymm4, %ymm0 -; AVX-NEXT: vorpd %ymm0, %ymm2, %ymm0 -; AVX-NEXT: vmovapd %ymm0, %ymm0 -; AVX-NEXT: sete %cl -; AVX-NEXT: negl %ecx -; AVX-NEXT: vmovd %ecx, %ymm2 -; AVX-NEXT: vshufpd $0, %ymm2, %ymm2, %ymm2 -; AVX-NEXT: vmovapd %ymm2, %ymm2 -; AVX-NEXT: vandpd %ymm3, %ymm2, %ymm3 -; AVX-NEXT: vandnpd %ymm1, %ymm2, %ymm1 -; AVX-NEXT: vorpd %ymm1, %ymm3, %ymm1 -; AVX-NEXT: vmovapd %ymm1, %ymm1 +; AVX-NEXT: xorps %ymm5, %ymm5 +; AVX-NEXT: vmovd %eax, %ymm5 +; AVX-NEXT: vshufpd $0, %ymm5, %ymm5, %ymm5 +; AVX-NEXT: vmovapd %ymm5, %ymm4 +; AVX-NEXT: andpd %ymm0, %ymm5 +; AVX-NEXT: andnpd %ymm2, %ymm4 +; AVX-NEXT: orpd 
%ymm5, %ymm4 +; AVX-NEXT: movl $0, %eax +; AVX-NEXT: setne %al +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: negl %eax +; AVX-NEXT: xorpd %ymm0, %ymm0 +; AVX-NEXT: vmovd %eax, %ymm0 +; AVX-NEXT: vshufpd $0, %ymm0, %ymm0, %ymm0 +; AVX-NEXT: vmovapd %ymm0, %ymm2 +; AVX-NEXT: andpd %ymm1, %ymm0 +; AVX-NEXT: andnpd %ymm3, %ymm2 +; AVX-NEXT: orpd %ymm0, %ymm2 +; AVX-NEXT: vmovapd %ymm4, %ymm0 +; AVX-NEXT: vmovapd %ymm2, %ymm1 ; AVX-NEXT: retq ; ; AVX2-LABEL: test_ctselect_v8f64: ; AVX2: # %bb.0: -; AVX2-NEXT: xorl %eax, %eax ; AVX2-NEXT: testb $1, %dil -; AVX2-NEXT: sete %al -; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: movl $0, %eax +; AVX2-NEXT: setne %al +; AVX2-NEXT: movzbl %al, %eax ; AVX2-NEXT: negl %eax -; AVX2-NEXT: vmovd %eax, %ymm4 -; AVX2-NEXT: vshufpd $0, %ymm4, %ymm4, %ymm4 -; AVX2-NEXT: vmovapd %ymm4, %ymm4 -; AVX2-NEXT: vandpd %ymm2, %ymm4, %ymm2 -; AVX2-NEXT: vandnpd %ymm0, %ymm4, %ymm0 -; AVX2-NEXT: vorpd %ymm0, %ymm2, %ymm0 -; AVX2-NEXT: vmovapd %ymm0, %ymm0 -; AVX2-NEXT: sete %cl -; AVX2-NEXT: negl %ecx -; AVX2-NEXT: vmovd %ecx, %ymm2 -; AVX2-NEXT: vshufpd $0, %ymm2, %ymm2, %ymm2 -; AVX2-NEXT: vmovapd %ymm2, %ymm2 -; AVX2-NEXT: vandpd %ymm3, %ymm2, %ymm3 -; AVX2-NEXT: vandnpd %ymm1, %ymm2, %ymm1 -; AVX2-NEXT: vorpd %ymm1, %ymm3, %ymm1 -; AVX2-NEXT: vmovapd %ymm1, %ymm1 +; AVX2-NEXT: xorps %ymm5, %ymm5 +; AVX2-NEXT: vmovd %eax, %ymm5 +; AVX2-NEXT: vshufpd $0, %ymm5, %ymm5, %ymm5 # ymm5 = ymm5[0,0,2,2] +; AVX2-NEXT: vmovapd %ymm5, %ymm4 +; AVX2-NEXT: andpd %ymm0, %ymm5 +; AVX2-NEXT: andnpd %ymm2, %ymm4 +; AVX2-NEXT: orpd %ymm5, %ymm4 +; AVX2-NEXT: movl $0, %eax +; AVX2-NEXT: setne %al +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: negl %eax +; AVX2-NEXT: xorpd %ymm0, %ymm0 +; AVX2-NEXT: vmovd %eax, %ymm0 +; AVX2-NEXT: vshufpd $0, %ymm0, %ymm0, %ymm0 # ymm0 = ymm0[0,0,2,2] +; AVX2-NEXT: vmovapd %ymm0, %ymm2 +; AVX2-NEXT: andpd %ymm1, %ymm0 +; AVX2-NEXT: andnpd %ymm3, %ymm2 +; AVX2-NEXT: orpd %ymm0, %ymm2 +; AVX2-NEXT: vmovapd %ymm4, %ymm0 +; AVX2-NEXT: vmovapd %ymm2, %ymm1 ; AVX2-NEXT: retq ; ; AVX512-LABEL: test_ctselect_v8f64: @@ -961,49 +1081,55 @@ define <4 x i32> @test_ctselect_v4i32_const_true(<4 x i32> %a, <4 x i32> %b) { ; SSE2-LABEL: test_ctselect_v4i32_const_true: ; SSE2: # %bb.0: ; SSE2-NEXT: movb $1, %al -; SSE2-NEXT: xorl %ecx, %ecx ; SSE2-NEXT: testb %al, %al -; SSE2-NEXT: sete %cl -; SSE2-NEXT: negl %ecx -; SSE2-NEXT: movd %ecx, %xmm2 -; SSE2-NEXT: pshufd $0, %xmm2, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: pandn %xmm0, %xmm2 -; SSE2-NEXT: por %xmm1, %xmm2 +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: setne %al +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: negl %eax +; SSE2-NEXT: xorps %xmm3, %xmm3 +; SSE2-NEXT: movd %eax, %xmm3 +; SSE2-NEXT: pshufd $0, %xmm3, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: pand %xmm0, %xmm3 +; SSE2-NEXT: pandn %xmm1, %xmm2 +; SSE2-NEXT: por %xmm3, %xmm2 ; SSE2-NEXT: movdqa %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; AVX-LABEL: test_ctselect_v4i32_const_true: ; AVX: # %bb.0: ; AVX-NEXT: movb $1, %al -; AVX-NEXT: xorl %ecx, %ecx ; AVX-NEXT: testb %al, %al -; AVX-NEXT: sete %cl -; AVX-NEXT: negl %ecx -; AVX-NEXT: movd %ecx, %xmm2 -; AVX-NEXT: pshufd $0, %xmm2, %xmm2 -; AVX-NEXT: movdqa %xmm2, %xmm2 -; AVX-NEXT: pand %xmm2, %xmm1 -; AVX-NEXT: pandn %xmm0, %xmm2 -; AVX-NEXT: por %xmm1, %xmm2 -; AVX-NEXT: movdqa %xmm2, %xmm0 +; AVX-NEXT: movl $0, %eax +; AVX-NEXT: setne %al +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: negl %eax +; AVX-NEXT: xorps %xmm3, %xmm3 +; AVX-NEXT: movd %eax, %xmm3 +; AVX-NEXT: pshufd 
$0, %xmm3, %xmm3 +; AVX-NEXT: movdqa %xmm3, %xmm2 +; AVX-NEXT: pand %xmm0, %xmm3 +; AVX-NEXT: pandn %xmm1, %xmm2 +; AVX-NEXT: por %xmm3, %xmm2 +; AVX-NEXT: vmovdqa %xmm2, %xmm0 ; AVX-NEXT: retq ; ; AVX2-LABEL: test_ctselect_v4i32_const_true: ; AVX2: # %bb.0: ; AVX2-NEXT: movb $1, %al -; AVX2-NEXT: xorl %ecx, %ecx ; AVX2-NEXT: testb %al, %al -; AVX2-NEXT: sete %cl -; AVX2-NEXT: negl %ecx -; AVX2-NEXT: movd %ecx, %xmm2 -; AVX2-NEXT: pshufd $0, %xmm2, %xmm2 -; AVX2-NEXT: movdqa %xmm2, %xmm2 -; AVX2-NEXT: pand %xmm2, %xmm1 -; AVX2-NEXT: pandn %xmm0, %xmm2 -; AVX2-NEXT: por %xmm1, %xmm2 -; AVX2-NEXT: movdqa %xmm2, %xmm0 +; AVX2-NEXT: movl $0, %eax +; AVX2-NEXT: setne %al +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: negl %eax +; AVX2-NEXT: xorps %xmm3, %xmm3 +; AVX2-NEXT: movd %eax, %xmm3 +; AVX2-NEXT: pshufd $0, %xmm3, %xmm3 # xmm3 = xmm3[0,0,0,0] +; AVX2-NEXT: movdqa %xmm3, %xmm2 +; AVX2-NEXT: pand %xmm0, %xmm3 +; AVX2-NEXT: pandn %xmm1, %xmm2 +; AVX2-NEXT: por %xmm3, %xmm2 +; AVX2-NEXT: vmovdqa %xmm2, %xmm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: test_ctselect_v4i32_const_true: @@ -1017,49 +1143,55 @@ define <4 x i32> @test_ctselect_v4i32_const_false(<4 x i32> %a, <4 x i32> %b) { ; SSE2-LABEL: test_ctselect_v4i32_const_false: ; SSE2: # %bb.0: ; SSE2-NEXT: xorl %eax, %eax -; SSE2-NEXT: xorl %ecx, %ecx ; SSE2-NEXT: testb %al, %al -; SSE2-NEXT: sete %cl -; SSE2-NEXT: negl %ecx -; SSE2-NEXT: movd %ecx, %xmm2 -; SSE2-NEXT: pshufd $0, %xmm2, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: pandn %xmm0, %xmm2 -; SSE2-NEXT: por %xmm1, %xmm2 +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: setne %al +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: negl %eax +; SSE2-NEXT: xorps %xmm3, %xmm3 +; SSE2-NEXT: movd %eax, %xmm3 +; SSE2-NEXT: pshufd $0, %xmm3, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: pand %xmm0, %xmm3 +; SSE2-NEXT: pandn %xmm1, %xmm2 +; SSE2-NEXT: por %xmm3, %xmm2 ; SSE2-NEXT: movdqa %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; AVX-LABEL: test_ctselect_v4i32_const_false: ; AVX: # %bb.0: ; AVX-NEXT: xorl %eax, %eax -; AVX-NEXT: xorl %ecx, %ecx ; AVX-NEXT: testb %al, %al -; AVX-NEXT: sete %cl -; AVX-NEXT: negl %ecx -; AVX-NEXT: movd %ecx, %xmm2 -; AVX-NEXT: pshufd $0, %xmm2, %xmm2 -; AVX-NEXT: movdqa %xmm2, %xmm2 -; AVX-NEXT: pand %xmm2, %xmm1 -; AVX-NEXT: pandn %xmm0, %xmm2 -; AVX-NEXT: por %xmm1, %xmm2 -; AVX-NEXT: movdqa %xmm2, %xmm0 +; AVX-NEXT: movl $0, %eax +; AVX-NEXT: setne %al +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: negl %eax +; AVX-NEXT: xorps %xmm3, %xmm3 +; AVX-NEXT: movd %eax, %xmm3 +; AVX-NEXT: pshufd $0, %xmm3, %xmm3 +; AVX-NEXT: movdqa %xmm3, %xmm2 +; AVX-NEXT: pand %xmm0, %xmm3 +; AVX-NEXT: pandn %xmm1, %xmm2 +; AVX-NEXT: por %xmm3, %xmm2 +; AVX-NEXT: vmovdqa %xmm2, %xmm0 ; AVX-NEXT: retq ; ; AVX2-LABEL: test_ctselect_v4i32_const_false: ; AVX2: # %bb.0: ; AVX2-NEXT: xorl %eax, %eax -; AVX2-NEXT: xorl %ecx, %ecx ; AVX2-NEXT: testb %al, %al -; AVX2-NEXT: sete %cl -; AVX2-NEXT: negl %ecx -; AVX2-NEXT: movd %ecx, %xmm2 -; AVX2-NEXT: pshufd $0, %xmm2, %xmm2 -; AVX2-NEXT: movdqa %xmm2, %xmm2 -; AVX2-NEXT: pand %xmm2, %xmm1 -; AVX2-NEXT: pandn %xmm0, %xmm2 -; AVX2-NEXT: por %xmm1, %xmm2 -; AVX2-NEXT: movdqa %xmm2, %xmm0 +; AVX2-NEXT: movl $0, %eax +; AVX2-NEXT: setne %al +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: negl %eax +; AVX2-NEXT: xorps %xmm3, %xmm3 +; AVX2-NEXT: movd %eax, %xmm3 +; AVX2-NEXT: pshufd $0, %xmm3, %xmm3 # xmm3 = xmm3[0,0,0,0] +; AVX2-NEXT: movdqa %xmm3, %xmm2 +; AVX2-NEXT: pand %xmm0, %xmm3 +; AVX2-NEXT: pandn %xmm1, %xmm2 +; AVX2-NEXT: 
por %xmm3, %xmm2 +; AVX2-NEXT: vmovdqa %xmm2, %xmm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: test_ctselect_v4i32_const_false: @@ -1076,16 +1208,18 @@ define <4 x i32> @test_ctselect_v4i32_icmp(i32 %x, i32 %y, <4 x i32> %a, <4 x i3 ; SSE2: # %bb.0: ; SSE2-NEXT: cmpl %esi, %edi ; SSE2-NEXT: sete %al -; SSE2-NEXT: xorl %ecx, %ecx ; SSE2-NEXT: testb %al, %al -; SSE2-NEXT: sete %cl -; SSE2-NEXT: negl %ecx -; SSE2-NEXT: movd %ecx, %xmm2 -; SSE2-NEXT: pshufd $0, %xmm2, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: pandn %xmm0, %xmm2 -; SSE2-NEXT: por %xmm1, %xmm2 +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: setne %al +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: negl %eax +; SSE2-NEXT: xorps %xmm3, %xmm3 +; SSE2-NEXT: movd %eax, %xmm3 +; SSE2-NEXT: pshufd $0, %xmm3, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: pand %xmm0, %xmm3 +; SSE2-NEXT: pandn %xmm1, %xmm2 +; SSE2-NEXT: por %xmm3, %xmm2 ; SSE2-NEXT: movdqa %xmm2, %xmm0 ; SSE2-NEXT: retq ; @@ -1093,34 +1227,38 @@ define <4 x i32> @test_ctselect_v4i32_icmp(i32 %x, i32 %y, <4 x i32> %a, <4 x i3 ; AVX: # %bb.0: ; AVX-NEXT: cmpl %esi, %edi ; AVX-NEXT: sete %al -; AVX-NEXT: xorl %ecx, %ecx ; AVX-NEXT: testb %al, %al -; AVX-NEXT: sete %cl -; AVX-NEXT: negl %ecx -; AVX-NEXT: movd %ecx, %xmm2 -; AVX-NEXT: pshufd $0, %xmm2, %xmm2 -; AVX-NEXT: movdqa %xmm2, %xmm2 -; AVX-NEXT: pand %xmm2, %xmm1 -; AVX-NEXT: pandn %xmm0, %xmm2 -; AVX-NEXT: por %xmm1, %xmm2 -; AVX-NEXT: movdqa %xmm2, %xmm0 +; AVX-NEXT: movl $0, %eax +; AVX-NEXT: setne %al +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: negl %eax +; AVX-NEXT: xorps %xmm3, %xmm3 +; AVX-NEXT: movd %eax, %xmm3 +; AVX-NEXT: pshufd $0, %xmm3, %xmm3 +; AVX-NEXT: movdqa %xmm3, %xmm2 +; AVX-NEXT: pand %xmm0, %xmm3 +; AVX-NEXT: pandn %xmm1, %xmm2 +; AVX-NEXT: por %xmm3, %xmm2 +; AVX-NEXT: vmovdqa %xmm2, %xmm0 ; AVX-NEXT: retq ; ; AVX2-LABEL: test_ctselect_v4i32_icmp: ; AVX2: # %bb.0: ; AVX2-NEXT: cmpl %esi, %edi ; AVX2-NEXT: sete %al -; AVX2-NEXT: xorl %ecx, %ecx ; AVX2-NEXT: testb %al, %al -; AVX2-NEXT: sete %cl -; AVX2-NEXT: negl %ecx -; AVX2-NEXT: movd %ecx, %xmm2 -; AVX2-NEXT: pshufd $0, %xmm2, %xmm2 -; AVX2-NEXT: movdqa %xmm2, %xmm2 -; AVX2-NEXT: pand %xmm2, %xmm1 -; AVX2-NEXT: pandn %xmm0, %xmm2 -; AVX2-NEXT: por %xmm1, %xmm2 -; AVX2-NEXT: movdqa %xmm2, %xmm0 +; AVX2-NEXT: movl $0, %eax +; AVX2-NEXT: setne %al +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: negl %eax +; AVX2-NEXT: xorps %xmm3, %xmm3 +; AVX2-NEXT: movd %eax, %xmm3 +; AVX2-NEXT: pshufd $0, %xmm3, %xmm3 # xmm3 = xmm3[0,0,0,0] +; AVX2-NEXT: movdqa %xmm3, %xmm2 +; AVX2-NEXT: pand %xmm0, %xmm3 +; AVX2-NEXT: pandn %xmm1, %xmm2 +; AVX2-NEXT: por %xmm3, %xmm2 +; AVX2-NEXT: vmovdqa %xmm2, %xmm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: test_ctselect_v4i32_icmp: diff --git a/llvm/test/CodeGen/X86/ctselect.ll b/llvm/test/CodeGen/X86/ctselect.ll index 0f8d3bb78f851..71a847f00d166 100644 --- a/llvm/test/CodeGen/X86/ctselect.ll +++ b/llvm/test/CodeGen/X86/ctselect.ll @@ -10,7 +10,8 @@ define i8 @test_ctselect_i8(i1 %cond, i8 %a, i8 %b) { ; X64-NEXT: movl %edx, %eax ; X64-NEXT: testb $1, %dil ; X64-NEXT: cmovnel %esi, %eax -; X64: retq +; X64-NEXT: # kill: def $al killed $al killed $eax +; X64-NEXT: retq ; ; X32-LABEL: test_ctselect_i8: ; X32: # %bb.0: @@ -28,7 +29,8 @@ define i16 @test_ctselect_i16(i1 %cond, i16 %a, i16 %b) { ; X64-NEXT: movl %edx, %eax ; X64-NEXT: testb $1, %dil ; X64-NEXT: cmovnel %esi, %eax -; X64: retq +; X64-NEXT: # kill: def $ax killed $ax killed $eax +; X64-NEXT: retq ; ; X32-LABEL: test_ctselect_i16: ; X32: 
# %bb.0: @@ -289,10 +291,14 @@ define i32 @test_ctselect_icmp_ult(i32 %x, i32 %y, i32 %a, i32 %b) { define float @test_ctselect_fcmp_oeq(float %x, float %y, float %a, float %b) { ; X64-LABEL: test_ctselect_fcmp_oeq: ; X64: # %bb.0: -; X64-NEXT: cmpeqss %xmm1, %xmm0 -; X64-NEXT: andps %xmm0, %xmm2 -; X64-NEXT: andnps %xmm3, %xmm0 -; X64-NEXT: orps %xmm2, %xmm0 +; X64-NEXT: movd %xmm2, %eax +; X64-NEXT: movd %xmm3, %ecx +; X64-NEXT: ucomiss %xmm1, %xmm0 +; X64-NEXT: setnp %dl +; X64-NEXT: sete %sil +; X64-NEXT: testb %dl, %sil +; X64-NEXT: cmovnel %eax, %ecx +; X64-NEXT: movd %ecx, %xmm0 ; X64-NEXT: retq ; ; X32-LABEL: test_ctselect_fcmp_oeq: From 3912a7d1db8629b72db68de67e33dfb3ec53242c Mon Sep 17 00:00:00 2001 From: wizardengineer Date: Fri, 15 Aug 2025 18:55:48 -0400 Subject: [PATCH 11/63] Added guards for non-integral pointers --- .../CodeGen/SelectionDAG/SelectionDAGBuilder.cpp | 9 +++++++++ .../RISCV/ctselect-fallback-nonintegral-fail.ll | 16 ++++++++++++++++ 2 files changed, 25 insertions(+) create mode 100644 llvm/test/CodeGen/RISCV/ctselect-fallback-nonintegral-fail.ll diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 6f973f7ade1a0..a0b7915d89453 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -6726,6 +6726,15 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, assert(!VT.isFloatingPoint() && "Float point type not supported yet fallback implementation"); + Type *CurrType = VT.getTypeForEVT(*Context); + if (CurrType->isPointerTy()) { + unsigned AS = CurrType->getPointerAddressSpace(); + if (DAG.getDataLayout().isNonIntegralAddressSpace(AS)) { + report_fatal_error( + "llvm.ct.select: non-integral pointers are not supported"); + } + } + setValue(&I, createProtectedCtSelectFallback(DAG, DL, Cond, A, B, VT)); return; } diff --git a/llvm/test/CodeGen/RISCV/ctselect-fallback-nonintegral-fail.ll b/llvm/test/CodeGen/RISCV/ctselect-fallback-nonintegral-fail.ll new file mode 100644 index 0000000000000..1c9e5beceea85 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/ctselect-fallback-nonintegral-fail.ll @@ -0,0 +1,16 @@ +; XFAIL: * +; RUN: llc -mtriple=riscv32 < %s | FileCheck %s +; RUN: llc -mtriple=riscv64 < %s | FileCheck %s + +; This test should fail +target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128-ni:200" +target triple = "riscv64-unknown-linux-gnu" + +define i32 addrspace(200)* @test_ctselect_ptr(i1 %c, + i32 addrspace(200)* %a, + i32 addrspace(200)* %b) { + %r = call i32 addrspace(200)* @llvm.ct.select.p200i32(i1 %c, + i32 addrspace(200)* %a, + i32 addrspace(200)* %b) + ret i32 addrspace(200)* %r +} From 741889878c5fba09bbee97e8647db5b77d0eff8c Mon Sep 17 00:00:00 2001 From: wizardengineer Date: Sun, 17 Aug 2025 12:17:29 -0400 Subject: [PATCH 12/63] Added support for Float/Double data types --- .../SelectionDAG/SelectionDAGBuilder.cpp | 53 ++- llvm/test/CodeGen/Mips/ctselect-fallback.ll | 256 +++++++++++- .../ctselect-fallback-nonintegral-fail.ll | 4 +- llvm/test/CodeGen/RISCV/ctselect-fallback.ll | 264 ++++++++++++ .../CodeGen/WebAssembly/ctselect-fallback.ll | 377 ++++++++++++++++++ 5 files changed, 936 insertions(+), 18 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index a0b7915d89453..c4ee79441f137 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ 
b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -15,6 +15,7 @@
 #include "llvm/ADT/APFloat.h"
 #include "llvm/ADT/APInt.h"
 #include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/BitmaskEnum.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/StringExtras.h"
@@ -6489,30 +6490,50 @@ void SelectionDAGBuilder::visitVectorExtractLastActive(const CallInst &I,
   setValue(&I, Result);
 }
 
+/// Fallback lowering for targets without a native constant-time select:
+/// selects branch-free via a condition-derived bit mask, (T & M) | (F & ~M).
 SDValue SelectionDAGBuilder::createProtectedCtSelectFallback(
     SelectionDAG &DAG, const SDLoc &DL, SDValue Cond, SDValue T, SDValue F,
     EVT VT) {
   SDNodeFlags ProtectedFlag;
   ProtectedFlag.setNoMerge(true);
 
-  // Extend cond to VT and normalize to 0 or 1
-  if (Cond.getValueType() != VT)
-    Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond, ProtectedFlag);
+  SDValue WorkingT = T;
+  SDValue WorkingF = F;
+  EVT WorkingVT = VT;
 
-  SDValue One = DAG.getConstant(1, DL, VT);
-  SDValue Norm = DAG.getNode(ISD::AND, DL, VT, Cond, One, ProtectedFlag);
+  if (VT.isFloatingPoint()) {
+    unsigned BitWidth = VT.getSizeInBits();
+    WorkingVT = EVT::getIntegerVT(*DAG.getContext(), BitWidth);
+    WorkingT = DAG.getBitcast(WorkingVT, T);
+    WorkingF = DAG.getBitcast(WorkingVT, F);
+  }
+
+  // Extend cond to WorkingVT and normalize to 0 or 1
+  if (Cond.getValueType() != WorkingVT)
+    Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, WorkingVT, Cond, ProtectedFlag);
+
+  SDValue One = DAG.getConstant(1, DL, WorkingVT);
+  SDValue Norm = DAG.getNode(ISD::AND, DL, WorkingVT, Cond, One, ProtectedFlag);
 
   // Mask = 0 - Norm
-  SDValue Zero = DAG.getConstant(0, DL, VT);
-  SDValue Mask = DAG.getNode(ISD::SUB, DL, VT, Zero, Norm, ProtectedFlag);
+  SDValue Zero = DAG.getConstant(0, DL, WorkingVT);
+  SDValue Mask = DAG.getNode(ISD::SUB, DL, WorkingVT, Zero, Norm, ProtectedFlag);
+
+  SDValue AllOnes = DAG.getAllOnesConstant(DL, WorkingVT);
+  SDValue Invert = DAG.getNode(ISD::XOR, DL, WorkingVT, Mask, AllOnes, ProtectedFlag);
 
-  SDValue AllOnes = DAG.getAllOnesConstant(DL, VT);
-  SDValue Invert = DAG.getNode(ISD::XOR, DL, VT, Mask, AllOnes, ProtectedFlag);
+  // (or (and WorkingT, Mask), (and WorkingF, ~Mask))
+  SDValue TM = DAG.getNode(ISD::AND, DL, WorkingVT, Mask, WorkingT, ProtectedFlag);
+  SDValue FM = DAG.getNode(ISD::AND, DL, WorkingVT, Invert, WorkingF, ProtectedFlag);
+  SDValue Result = DAG.getNode(ISD::OR, DL, WorkingVT, TM, FM, ProtectedFlag);
 
-  // (or (and T, Mask), (and F, ~Mask))
-  SDValue TM = DAG.getNode(ISD::AND, DL, VT, Mask, T, ProtectedFlag);
-  SDValue FM = DAG.getNode(ISD::AND, DL, VT, Invert, F, ProtectedFlag);
-  return DAG.getNode(ISD::OR, DL, VT, TM, FM, ProtectedFlag);
+  // Bitcast back to the original floating-point type if needed.
+  if (VT.isFloatingPoint()) {
+    Result = DAG.getBitcast(VT, Result);
+  }
+
+  return Result;
 }
 
 /// Lower the call to the specified intrinsic function. 
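To make the lowering above easier to audit, here is a minimal sketch in LLVM IR of the branch-free sequence createProtectedCtSelectFallback builds for an f32 select. The function and value names are illustrative only, not part of the patch, and the real lowering additionally tags every node with the NoMerge flag so that later optimizations are discouraged from folding the sequence back into a conditional.

; Sketch only: mirrors the DAG nodes built above, with hypothetical names.
define float @ctselect_f32_sketch(i1 %cond, float %a, float %b) {
  ; Floating-point values are selected on their bit patterns, so first
  ; bitcast both operands to same-width integers.
  %ta = bitcast float %a to i32
  %fb = bitcast float %b to i32
  ; Normalize the condition to 0/1, then stretch it to 0/-1 (all-ones mask).
  %c32 = zext i1 %cond to i32
  %norm = and i32 %c32, 1
  %mask = sub i32 0, %norm
  %inv = xor i32 %mask, -1
  ; (a & mask) | (b & ~mask): yields %a when %cond is 1, %b when it is 0.
  %tm = and i32 %ta, %mask
  %fm = and i32 %fb, %inv
  %bits = or i32 %tm, %fm
  ; Bitcast the selected bits back to the original floating-point type.
  %res = bitcast i32 %bits to float
  ret float %res
}

Every step is a data-dependent bitwise operation with no control flow, which is what makes the expansion constant-time on targets without a native conditional-select instruction.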
@@ -6722,9 +6743,9 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, assert(!VT.isVector() && "Vector type not supported yet for fallback implementation"); - // We don't support floating points yet - assert(!VT.isFloatingPoint() && - "Float point type not supported yet fallback implementation"); + // // We don't support floating points yet + // assert(!VT.isFloatingPoint() && + // "Float point type not supported yet fallback implementation"); Type *CurrType = VT.getTypeForEVT(*Context); if (CurrType->isPointerTy()) { diff --git a/llvm/test/CodeGen/Mips/ctselect-fallback.ll b/llvm/test/CodeGen/Mips/ctselect-fallback.ll index 4993deed7be5f..61e2011f891b9 100644 --- a/llvm/test/CodeGen/Mips/ctselect-fallback.ll +++ b/llvm/test/CodeGen/Mips/ctselect-fallback.ll @@ -376,9 +376,263 @@ define i32 @test_ctselect_nested(i1 %cond1, i1 %cond2, i32 %a, i32 %b, i32 %c) { ret i32 %result } +; Test float (32-bit) +define float @test_ctselect_f32(i1 %cond, float %a, float %b) { +; M32-LABEL: test_ctselect_f32: +; M32: # %bb.0: +; M32-NEXT: andi $1, $4, 1 +; M32-NEXT: andi $1, $1, 1 +; M32-NEXT: negu $1, $1 +; M32-NEXT: and $2, $1, $5 +; M32-NEXT: not $1, $1 +; M32-NEXT: and $1, $1, $6 +; M32-NEXT: or $1, $2, $1 +; M32-NEXT: jr $ra +; M32-NEXT: mtc1 $1, $f0 +; +; M64-LABEL: test_ctselect_f32: +; M64: # %bb.0: +; M64-NEXT: sll $1, $4, 0 +; M64-NEXT: mfc1 $2, $f13 +; M64-NEXT: mfc1 $3, $f14 +; M64-NEXT: andi $1, $1, 1 +; M64-NEXT: andi $1, $1, 1 +; M64-NEXT: negu $1, $1 +; M64-NEXT: and $2, $1, $2 +; M64-NEXT: not $1, $1 +; M64-NEXT: and $1, $1, $3 +; M64-NEXT: or $1, $2, $1 +; M64-NEXT: jr $ra +; M64-NEXT: mtc1 $1, $f0 + %result = call float @llvm.ct.select.f32(i1 %cond, float %a, float %b) + ret float %result +} + +; Test double (64-bit) +define double @test_ctselect_f64(i1 %cond, double %a, double %b) { +; M32-LABEL: test_ctselect_f64: +; M32: # %bb.0: +; M32-NEXT: addiu $sp, $sp, -16 +; M32-NEXT: .cfi_def_cfa_offset 16 +; M32-NEXT: mtc1 $6, $f0 +; M32-NEXT: mtc1 $7, $f1 +; M32-NEXT: andi $1, $4, 1 +; M32-NEXT: lw $3, 36($sp) +; M32-NEXT: addiu $2, $1, -1 +; M32-NEXT: negu $1, $1 +; M32-NEXT: sdc1 $f0, 8($sp) +; M32-NEXT: and $3, $2, $3 +; M32-NEXT: lw $4, 12($sp) +; M32-NEXT: and $4, $1, $4 +; M32-NEXT: or $3, $4, $3 +; M32-NEXT: sw $3, 4($sp) +; M32-NEXT: lw $3, 32($sp) +; M32-NEXT: and $2, $2, $3 +; M32-NEXT: lw $3, 8($sp) +; M32-NEXT: and $1, $1, $3 +; M32-NEXT: or $1, $1, $2 +; M32-NEXT: sw $1, 0($sp) +; M32-NEXT: ldc1 $f0, 0($sp) +; M32-NEXT: jr $ra +; M32-NEXT: addiu $sp, $sp, 16 +; +; M64-LABEL: test_ctselect_f64: +; M64: # %bb.0: +; M64-NEXT: andi $1, $4, 1 +; M64-NEXT: dmfc1 $2, $f13 +; M64-NEXT: daddiu $3, $zero, -1 +; M64-NEXT: andi $1, $1, 1 +; M64-NEXT: dnegu $1, $1 +; M64-NEXT: and $2, $1, $2 +; M64-NEXT: xor $1, $1, $3 +; M64-NEXT: dmfc1 $3, $f14 +; M64-NEXT: and $1, $1, $3 +; M64-NEXT: or $1, $2, $1 +; M64-NEXT: jr $ra +; M64-NEXT: dmtc1 $1, $f0 + %result = call double @llvm.ct.select.f64(i1 %cond, double %a, double %b) + ret double %result +} + + +; Test chained float selects +define float @test_ctselect_f32_chain(i1 %cond1, i1 %cond2, float %a, float %b, float %c) { +; M32-LABEL: test_ctselect_f32_chain: +; M32: # %bb.0: +; M32-NEXT: andi $1, $4, 1 +; M32-NEXT: lw $3, 16($sp) +; M32-NEXT: andi $1, $1, 1 +; M32-NEXT: negu $1, $1 +; M32-NEXT: and $2, $1, $6 +; M32-NEXT: not $1, $1 +; M32-NEXT: and $1, $1, $7 +; M32-NEXT: or $1, $2, $1 +; M32-NEXT: andi $2, $5, 1 +; M32-NEXT: andi $2, $2, 1 +; M32-NEXT: negu $2, $2 +; M32-NEXT: and $1, $2, $1 +; M32-NEXT: not $2, $2 +; 
M32-NEXT: and $2, $2, $3 +; M32-NEXT: or $1, $1, $2 +; M32-NEXT: jr $ra +; M32-NEXT: mtc1 $1, $f0 +; +; M64-LABEL: test_ctselect_f32_chain: +; M64: # %bb.0: +; M64-NEXT: sll $1, $4, 0 +; M64-NEXT: mfc1 $2, $f14 +; M64-NEXT: mfc1 $3, $f15 +; M64-NEXT: andi $1, $1, 1 +; M64-NEXT: andi $1, $1, 1 +; M64-NEXT: negu $1, $1 +; M64-NEXT: and $2, $1, $2 +; M64-NEXT: not $1, $1 +; M64-NEXT: and $1, $1, $3 +; M64-NEXT: mfc1 $3, $f16 +; M64-NEXT: or $1, $2, $1 +; M64-NEXT: sll $2, $5, 0 +; M64-NEXT: andi $2, $2, 1 +; M64-NEXT: andi $2, $2, 1 +; M64-NEXT: negu $2, $2 +; M64-NEXT: and $1, $2, $1 +; M64-NEXT: not $2, $2 +; M64-NEXT: and $2, $2, $3 +; M64-NEXT: or $1, $1, $2 +; M64-NEXT: jr $ra +; M64-NEXT: mtc1 $1, $f0 + %tmp = call float @llvm.ct.select.f32(i1 %cond1, float %a, float %b) + %result = call float @llvm.ct.select.f32(i1 %cond2, float %tmp, float %c) + ret float %result +} + +; Test with float load +define float @test_ctselect_f32_load(i1 %cond, ptr %p1, ptr %p2) { +; M32-LABEL: test_ctselect_f32_load: +; M32: # %bb.0: +; M32-NEXT: andi $1, $4, 1 +; M32-NEXT: lw $2, 0($5) +; M32-NEXT: lw $3, 0($6) +; M32-NEXT: andi $1, $1, 1 +; M32-NEXT: negu $1, $1 +; M32-NEXT: and $2, $1, $2 +; M32-NEXT: not $1, $1 +; M32-NEXT: and $1, $1, $3 +; M32-NEXT: or $1, $2, $1 +; M32-NEXT: jr $ra +; M32-NEXT: mtc1 $1, $f0 +; +; M64-LABEL: test_ctselect_f32_load: +; M64: # %bb.0: +; M64-NEXT: sll $1, $4, 0 +; M64-NEXT: lw $2, 0($5) +; M64-NEXT: lw $3, 0($6) +; M64-NEXT: andi $1, $1, 1 +; M64-NEXT: andi $1, $1, 1 +; M64-NEXT: negu $1, $1 +; M64-NEXT: and $2, $1, $2 +; M64-NEXT: not $1, $1 +; M64-NEXT: and $1, $1, $3 +; M64-NEXT: or $1, $2, $1 +; M64-NEXT: jr $ra +; M64-NEXT: mtc1 $1, $f0 + %a = load float, ptr %p1 + %b = load float, ptr %p2 + %result = call float @llvm.ct.select.f32(i1 %cond, float %a, float %b) + ret float %result +} + +; Test with double load +define double @test_ctselect_f64_load(i1 %cond, ptr %p1, ptr %p2) { +; M32-LABEL: test_ctselect_f64_load: +; M32: # %bb.0: +; M32-NEXT: addiu $sp, $sp, -8 +; M32-NEXT: .cfi_def_cfa_offset 8 +; M32-NEXT: andi $2, $4, 1 +; M32-NEXT: lw $1, 4($6) +; M32-NEXT: lw $4, 4($5) +; M32-NEXT: addiu $3, $2, -1 +; M32-NEXT: negu $2, $2 +; M32-NEXT: and $1, $3, $1 +; M32-NEXT: and $4, $2, $4 +; M32-NEXT: or $1, $4, $1 +; M32-NEXT: sw $1, 4($sp) +; M32-NEXT: lw $1, 0($6) +; M32-NEXT: and $1, $3, $1 +; M32-NEXT: lw $3, 0($5) +; M32-NEXT: and $2, $2, $3 +; M32-NEXT: or $1, $2, $1 +; M32-NEXT: sw $1, 0($sp) +; M32-NEXT: ldc1 $f0, 0($sp) +; M32-NEXT: jr $ra +; M32-NEXT: addiu $sp, $sp, 8 +; +; M64-LABEL: test_ctselect_f64_load: +; M64: # %bb.0: +; M64-NEXT: andi $1, $4, 1 +; M64-NEXT: ld $2, 0($5) +; M64-NEXT: daddiu $3, $zero, -1 +; M64-NEXT: andi $1, $1, 1 +; M64-NEXT: dnegu $1, $1 +; M64-NEXT: and $2, $1, $2 +; M64-NEXT: xor $1, $1, $3 +; M64-NEXT: ld $3, 0($6) +; M64-NEXT: and $1, $1, $3 +; M64-NEXT: or $1, $2, $1 +; M64-NEXT: jr $ra +; M64-NEXT: dmtc1 $1, $f0 + %a = load double, ptr %p1 + %b = load double, ptr %p2 + %result = call double @llvm.ct.select.f64(i1 %cond, double %a, double %b) + ret double %result +} + +; Test mixed with arithmetic +define float @test_ctselect_f32_arithmetic(i1 %cond, float %x, float %y) { +; M32-LABEL: test_ctselect_f32_arithmetic: +; M32: # %bb.0: +; M32-NEXT: mtc1 $6, $f0 +; M32-NEXT: mtc1 $5, $f1 +; M32-NEXT: andi $1, $4, 1 +; M32-NEXT: andi $1, $1, 1 +; M32-NEXT: add.s $f2, $f1, $f0 +; M32-NEXT: sub.s $f0, $f1, $f0 +; M32-NEXT: negu $1, $1 +; M32-NEXT: mfc1 $2, $f2 +; M32-NEXT: mfc1 $3, $f0 +; M32-NEXT: and $2, $1, $2 +; M32-NEXT: not $1, 
$1 +; M32-NEXT: and $1, $1, $3 +; M32-NEXT: or $1, $2, $1 +; M32-NEXT: jr $ra +; M32-NEXT: mtc1 $1, $f0 +; +; M64-LABEL: test_ctselect_f32_arithmetic: +; M64: # %bb.0: +; M64-NEXT: add.s $f0, $f13, $f14 +; M64-NEXT: sll $1, $4, 0 +; M64-NEXT: andi $1, $1, 1 +; M64-NEXT: andi $1, $1, 1 +; M64-NEXT: negu $1, $1 +; M64-NEXT: mfc1 $2, $f0 +; M64-NEXT: sub.s $f0, $f13, $f14 +; M64-NEXT: and $2, $1, $2 +; M64-NEXT: not $1, $1 +; M64-NEXT: mfc1 $3, $f0 +; M64-NEXT: and $1, $1, $3 +; M64-NEXT: or $1, $2, $1 +; M64-NEXT: jr $ra +; M64-NEXT: mtc1 $1, $f0 + %sum = fadd float %x, %y + %diff = fsub float %x, %y + %result = call float @llvm.ct.select.f32(i1 %cond, float %sum, float %diff) + ret float %result +} + +; Declare the intrinsics ; Declare the intrinsics declare i8 @llvm.ct.select.i8(i1, i8, i8) declare i16 @llvm.ct.select.i16(i1, i16, i16) declare i32 @llvm.ct.select.i32(i1, i32, i32) -declare i64 @llvm.ct.select.i64(i1, i64, i64) declare ptr @llvm.ct.select.p0(i1, ptr, ptr) +declare i64 @llvm.ct.select.i64(i1, i64, i64) +declare double @llvm.ct.select.f64(i1, double, double) diff --git a/llvm/test/CodeGen/RISCV/ctselect-fallback-nonintegral-fail.ll b/llvm/test/CodeGen/RISCV/ctselect-fallback-nonintegral-fail.ll index 1c9e5beceea85..cbfc3c7d0e399 100644 --- a/llvm/test/CodeGen/RISCV/ctselect-fallback-nonintegral-fail.ll +++ b/llvm/test/CodeGen/RISCV/ctselect-fallback-nonintegral-fail.ll @@ -9,8 +9,10 @@ target triple = "riscv64-unknown-linux-gnu" define i32 addrspace(200)* @test_ctselect_ptr(i1 %c, i32 addrspace(200)* %a, i32 addrspace(200)* %b) { - %r = call i32 addrspace(200)* @llvm.ct.select.p200i32(i1 %c, + %r = call i32 addrspace(200)* @llvm.ct.select.p0(i1 %c, i32 addrspace(200)* %a, i32 addrspace(200)* %b) ret i32 addrspace(200)* %r } + +declare i32 @llvm.ct.select.p0(i1, i32, i32) diff --git a/llvm/test/CodeGen/RISCV/ctselect-fallback.ll b/llvm/test/CodeGen/RISCV/ctselect-fallback.ll index c9bf9b579cf29..17fcd6f9371ea 100644 --- a/llvm/test/CodeGen/RISCV/ctselect-fallback.ll +++ b/llvm/test/CodeGen/RISCV/ctselect-fallback.ll @@ -342,9 +342,273 @@ define i32 @test_ctselect_nested(i1 %cond1, i1 %cond2, i32 %a, i32 %b, i32 %c) { ret i32 %result } +; Test float (32-bit) +define float @test_ctselect_f32(i1 %cond, float %a, float %b) { +; RV64-LABEL: test_ctselect_f32: +; RV64: # %bb.0: +; RV64-NEXT: andi a0, a0, 1 +; RV64-NEXT: addi a3, a0, -1 +; RV64-NEXT: neg a0, a0 +; RV64-NEXT: and a2, a3, a2 +; RV64-NEXT: and a0, a0, a1 +; RV64-NEXT: or a0, a0, a2 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_f32: +; RV32: # %bb.0: +; RV32-NEXT: andi a0, a0, 1 +; RV32-NEXT: slli a0, a0, 31 +; RV32-NEXT: srai a0, a0, 31 +; RV32-NEXT: and a1, a0, a1 +; RV32-NEXT: not a0, a0 +; RV32-NEXT: and a0, a0, a2 +; RV32-NEXT: or a0, a1, a0 +; RV32-NEXT: ret + %result = call float @llvm.ct.select.f32(i1 %cond, float %a, float %b) + ret float %result +} + +; Test double (64-bit) +define double @test_ctselect_f64(i1 %cond, double %a, double %b) { +; RV64-LABEL: test_ctselect_f64: +; RV64: # %bb.0: +; RV64-NEXT: andi a0, a0, 1 +; RV64-NEXT: slli a0, a0, 63 +; RV64-NEXT: srai a0, a0, 63 +; RV64-NEXT: and a1, a0, a1 +; RV64-NEXT: not a0, a0 +; RV64-NEXT: and a0, a0, a2 +; RV64-NEXT: or a0, a1, a0 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_f64: +; RV32: # %bb.0: +; RV32-NEXT: andi a0, a0, 1 +; RV32-NEXT: addi a5, a0, -1 +; RV32-NEXT: neg a0, a0 +; RV32-NEXT: and a3, a5, a3 +; RV32-NEXT: and a1, a0, a1 +; RV32-NEXT: and a4, a5, a4 +; RV32-NEXT: and a2, a0, a2 +; RV32-NEXT: or a0, a1, a3 +; RV32-NEXT: or a1, a2, 
a4 +; RV32-NEXT: ret + %result = call double @llvm.ct.select.f64(i1 %cond, double %a, double %b) + ret double %result +} + + +; Test chained float selects +define float @test_ctselect_f32_chain(i1 %cond1, i1 %cond2, float %a, float %b, float %c) { +; RV64-LABEL: test_ctselect_f32_chain: +; RV64: # %bb.0: +; RV64-NEXT: andi a0, a0, 1 +; RV64-NEXT: andi a1, a1, 1 +; RV64-NEXT: addi a5, a0, -1 +; RV64-NEXT: neg a0, a0 +; RV64-NEXT: and a3, a5, a3 +; RV64-NEXT: neg a5, a1 +; RV64-NEXT: addi a1, a1, -1 +; RV64-NEXT: and a0, a0, a2 +; RV64-NEXT: or a0, a0, a3 +; RV64-NEXT: and a0, a5, a0 +; RV64-NEXT: and a1, a1, a4 +; RV64-NEXT: or a0, a0, a1 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_f32_chain: +; RV32: # %bb.0: +; RV32-NEXT: andi a0, a0, 1 +; RV32-NEXT: andi a1, a1, 1 +; RV32-NEXT: slli a0, a0, 31 +; RV32-NEXT: slli a1, a1, 31 +; RV32-NEXT: srai a0, a0, 31 +; RV32-NEXT: srai a1, a1, 31 +; RV32-NEXT: and a2, a0, a2 +; RV32-NEXT: not a0, a0 +; RV32-NEXT: and a0, a0, a3 +; RV32-NEXT: not a3, a1 +; RV32-NEXT: or a0, a2, a0 +; RV32-NEXT: and a0, a1, a0 +; RV32-NEXT: and a3, a3, a4 +; RV32-NEXT: or a0, a0, a3 +; RV32-NEXT: ret + %tmp = call float @llvm.ct.select.f32(i1 %cond1, float %a, float %b) + %result = call float @llvm.ct.select.f32(i1 %cond2, float %tmp, float %c) + ret float %result +} + +; Test with float load +define float @test_ctselect_f32_load(i1 %cond, ptr %p1, ptr %p2) { +; RV64-LABEL: test_ctselect_f32_load: +; RV64: # %bb.0: +; RV64-NEXT: lw a1, 0(a1) +; RV64-NEXT: lw a2, 0(a2) +; RV64-NEXT: andi a0, a0, 1 +; RV64-NEXT: addi a3, a0, -1 +; RV64-NEXT: neg a0, a0 +; RV64-NEXT: and a2, a3, a2 +; RV64-NEXT: and a0, a0, a1 +; RV64-NEXT: or a0, a0, a2 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_f32_load: +; RV32: # %bb.0: +; RV32-NEXT: lw a1, 0(a1) +; RV32-NEXT: lw a2, 0(a2) +; RV32-NEXT: andi a0, a0, 1 +; RV32-NEXT: slli a0, a0, 31 +; RV32-NEXT: srai a0, a0, 31 +; RV32-NEXT: and a1, a0, a1 +; RV32-NEXT: not a0, a0 +; RV32-NEXT: and a0, a0, a2 +; RV32-NEXT: or a0, a1, a0 +; RV32-NEXT: ret + %a = load float, ptr %p1 + %b = load float, ptr %p2 + %result = call float @llvm.ct.select.f32(i1 %cond, float %a, float %b) + ret float %result +} + +; Test with double load +define double @test_ctselect_f64_load(i1 %cond, ptr %p1, ptr %p2) { +; RV64-LABEL: test_ctselect_f64_load: +; RV64: # %bb.0: +; RV64-NEXT: ld a1, 0(a1) +; RV64-NEXT: ld a2, 0(a2) +; RV64-NEXT: andi a0, a0, 1 +; RV64-NEXT: slli a0, a0, 63 +; RV64-NEXT: srai a0, a0, 63 +; RV64-NEXT: and a1, a0, a1 +; RV64-NEXT: not a0, a0 +; RV64-NEXT: and a0, a0, a2 +; RV64-NEXT: or a0, a1, a0 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_f64_load: +; RV32: # %bb.0: +; RV32-NEXT: lw a3, 0(a1) +; RV32-NEXT: lw a1, 4(a1) +; RV32-NEXT: lw a4, 0(a2) +; RV32-NEXT: lw a2, 4(a2) +; RV32-NEXT: andi a0, a0, 1 +; RV32-NEXT: addi a5, a0, -1 +; RV32-NEXT: neg a0, a0 +; RV32-NEXT: and a4, a5, a4 +; RV32-NEXT: and a3, a0, a3 +; RV32-NEXT: and a2, a5, a2 +; RV32-NEXT: and a1, a0, a1 +; RV32-NEXT: or a0, a3, a4 +; RV32-NEXT: or a1, a1, a2 +; RV32-NEXT: ret + %a = load double, ptr %p1 + %b = load double, ptr %p2 + %result = call double @llvm.ct.select.f64(i1 %cond, double %a, double %b) + ret double %result +} + +; Test mixed with arithmetic +define float @test_ctselect_f32_arithmetic(i1 %cond, float %x, float %y) { +; RV64-LABEL: test_ctselect_f32_arithmetic: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -48 +; RV64-NEXT: .cfi_def_cfa_offset 48 +; RV64-NEXT: sd ra, 40(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s0, 32(sp) # 8-byte Folded Spill +; 
RV64-NEXT: sd s1, 24(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s2, 16(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s3, 8(sp) # 8-byte Folded Spill +; RV64-NEXT: .cfi_offset ra, -8 +; RV64-NEXT: .cfi_offset s0, -16 +; RV64-NEXT: .cfi_offset s1, -24 +; RV64-NEXT: .cfi_offset s2, -32 +; RV64-NEXT: .cfi_offset s3, -40 +; RV64-NEXT: mv s0, a2 +; RV64-NEXT: mv s1, a1 +; RV64-NEXT: mv s2, a0 +; RV64-NEXT: mv a0, a1 +; RV64-NEXT: mv a1, a2 +; RV64-NEXT: call __addsf3 +; RV64-NEXT: mv s3, a0 +; RV64-NEXT: mv a0, s1 +; RV64-NEXT: mv a1, s0 +; RV64-NEXT: call __subsf3 +; RV64-NEXT: andi a1, s2, 1 +; RV64-NEXT: neg a2, a1 +; RV64-NEXT: addi a1, a1, -1 +; RV64-NEXT: and a2, a2, s3 +; RV64-NEXT: and a0, a1, a0 +; RV64-NEXT: or a0, a2, a0 +; RV64-NEXT: ld ra, 40(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s0, 32(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s1, 24(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s2, 16(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s3, 8(sp) # 8-byte Folded Reload +; RV64-NEXT: .cfi_restore ra +; RV64-NEXT: .cfi_restore s0 +; RV64-NEXT: .cfi_restore s1 +; RV64-NEXT: .cfi_restore s2 +; RV64-NEXT: .cfi_restore s3 +; RV64-NEXT: addi sp, sp, 48 +; RV64-NEXT: .cfi_def_cfa_offset 0 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_f32_arithmetic: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 +; RV32-NEXT: sw ra, 28(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s0, 24(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s1, 20(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s2, 16(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s3, 12(sp) # 4-byte Folded Spill +; RV32-NEXT: .cfi_offset ra, -4 +; RV32-NEXT: .cfi_offset s0, -8 +; RV32-NEXT: .cfi_offset s1, -12 +; RV32-NEXT: .cfi_offset s2, -16 +; RV32-NEXT: .cfi_offset s3, -20 +; RV32-NEXT: mv s0, a2 +; RV32-NEXT: mv s1, a1 +; RV32-NEXT: mv s2, a0 +; RV32-NEXT: mv a0, a1 +; RV32-NEXT: mv a1, a2 +; RV32-NEXT: call __addsf3 +; RV32-NEXT: mv s3, a0 +; RV32-NEXT: mv a0, s1 +; RV32-NEXT: mv a1, s0 +; RV32-NEXT: call __subsf3 +; RV32-NEXT: andi a1, s2, 1 +; RV32-NEXT: slli a1, a1, 31 +; RV32-NEXT: srai a1, a1, 31 +; RV32-NEXT: and a2, a1, s3 +; RV32-NEXT: not a1, a1 +; RV32-NEXT: and a0, a1, a0 +; RV32-NEXT: or a0, a2, a0 +; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s0, 24(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s1, 20(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s2, 16(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s3, 12(sp) # 4-byte Folded Reload +; RV32-NEXT: .cfi_restore ra +; RV32-NEXT: .cfi_restore s0 +; RV32-NEXT: .cfi_restore s1 +; RV32-NEXT: .cfi_restore s2 +; RV32-NEXT: .cfi_restore s3 +; RV32-NEXT: addi sp, sp, 32 +; RV32-NEXT: .cfi_def_cfa_offset 0 +; RV32-NEXT: ret + %sum = fadd float %x, %y + %diff = fsub float %x, %y + %result = call float @llvm.ct.select.f32(i1 %cond, float %sum, float %diff) + ret float %result +} + +; Declare the intrinsics ; Declare the intrinsics declare i8 @llvm.ct.select.i8(i1, i8, i8) declare i16 @llvm.ct.select.i16(i1, i16, i16) declare i32 @llvm.ct.select.i32(i1, i32, i32) declare i64 @llvm.ct.select.i64(i1, i64, i64) declare ptr @llvm.ct.select.p0(i1, ptr, ptr) +declare float @llvm.ct.select.f32(i1, float, float) +declare double @llvm.ct.select.f64(i1, double, double) diff --git a/llvm/test/CodeGen/WebAssembly/ctselect-fallback.ll b/llvm/test/CodeGen/WebAssembly/ctselect-fallback.ll index bc1d12c9ad83e..39d38a415ec42 100644 --- a/llvm/test/CodeGen/WebAssembly/ctselect-fallback.ll +++ b/llvm/test/CodeGen/WebAssembly/ctselect-fallback.ll @@ -568,9 +568,386 @@ define 
i32 @test_ctselect_nested(i1 %cond1, i1 %cond2, i32 %a, i32 %b, i32 %c) { ret i32 %result } +; Test float (32-bit) +define float @test_ctselect_f32(i1 %cond, float %a, float %b) { +; W32-LABEL: test_ctselect_f32: +; W32: .functype test_ctselect_f32 (i32, f32, f32) -> (f32) +; W32-NEXT: # %bb.0: +; W32-NEXT: i32.const 0 +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const 1 +; W32-NEXT: i32.and +; W32-NEXT: i32.const 1 +; W32-NEXT: i32.and +; W32-NEXT: i32.sub +; W32-NEXT: local.tee 0 +; W32-NEXT: local.get 1 +; W32-NEXT: i32.reinterpret_f32 +; W32-NEXT: i32.and +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.xor +; W32-NEXT: local.get 2 +; W32-NEXT: i32.reinterpret_f32 +; W32-NEXT: i32.and +; W32-NEXT: i32.or +; W32-NEXT: f32.reinterpret_i32 +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_f32: +; W64: .functype test_ctselect_f32 (i32, f32, f32) -> (f32) +; W64-NEXT: # %bb.0: +; W64-NEXT: i32.const 0 +; W64-NEXT: local.get 0 +; W64-NEXT: i32.const 1 +; W64-NEXT: i32.and +; W64-NEXT: i32.const 1 +; W64-NEXT: i32.and +; W64-NEXT: i32.sub +; W64-NEXT: local.tee 0 +; W64-NEXT: local.get 1 +; W64-NEXT: i32.reinterpret_f32 +; W64-NEXT: i32.and +; W64-NEXT: local.get 0 +; W64-NEXT: i32.const -1 +; W64-NEXT: i32.xor +; W64-NEXT: local.get 2 +; W64-NEXT: i32.reinterpret_f32 +; W64-NEXT: i32.and +; W64-NEXT: i32.or +; W64-NEXT: f32.reinterpret_i32 +; W64-NEXT: # fallthrough-return + %result = call float @llvm.ct.select.f32(i1 %cond, float %a, float %b) + ret float %result +} + +; Test double (64-bit) +define double @test_ctselect_f64(i1 %cond, double %a, double %b) { +; W32-LABEL: test_ctselect_f64: +; W32: .functype test_ctselect_f64 (i32, f64, f64) -> (f64) +; W32-NEXT: .local i64 +; W32-NEXT: # %bb.0: +; W32-NEXT: i64.const 0 +; W32-NEXT: local.get 0 +; W32-NEXT: i64.extend_i32_u +; W32-NEXT: i64.const 1 +; W32-NEXT: i64.and +; W32-NEXT: i64.const 1 +; W32-NEXT: i64.and +; W32-NEXT: i64.sub +; W32-NEXT: local.tee 3 +; W32-NEXT: local.get 1 +; W32-NEXT: i64.reinterpret_f64 +; W32-NEXT: i64.and +; W32-NEXT: local.get 3 +; W32-NEXT: i64.const -1 +; W32-NEXT: i64.xor +; W32-NEXT: local.get 2 +; W32-NEXT: i64.reinterpret_f64 +; W32-NEXT: i64.and +; W32-NEXT: i64.or +; W32-NEXT: f64.reinterpret_i64 +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_f64: +; W64: .functype test_ctselect_f64 (i32, f64, f64) -> (f64) +; W64-NEXT: .local i64 +; W64-NEXT: # %bb.0: +; W64-NEXT: i64.const 0 +; W64-NEXT: local.get 0 +; W64-NEXT: i64.extend_i32_u +; W64-NEXT: i64.const 1 +; W64-NEXT: i64.and +; W64-NEXT: i64.const 1 +; W64-NEXT: i64.and +; W64-NEXT: i64.sub +; W64-NEXT: local.tee 3 +; W64-NEXT: local.get 1 +; W64-NEXT: i64.reinterpret_f64 +; W64-NEXT: i64.and +; W64-NEXT: local.get 3 +; W64-NEXT: i64.const -1 +; W64-NEXT: i64.xor +; W64-NEXT: local.get 2 +; W64-NEXT: i64.reinterpret_f64 +; W64-NEXT: i64.and +; W64-NEXT: i64.or +; W64-NEXT: f64.reinterpret_i64 +; W64-NEXT: # fallthrough-return + %result = call double @llvm.ct.select.f64(i1 %cond, double %a, double %b) + ret double %result +} + + +; Test chained float selects +define float @test_ctselect_f32_chain(i1 %cond1, i1 %cond2, float %a, float %b, float %c) { +; W32-LABEL: test_ctselect_f32_chain: +; W32: .functype test_ctselect_f32_chain (i32, i32, f32, f32, f32) -> (f32) +; W32-NEXT: # %bb.0: +; W32-NEXT: i32.const 0 +; W32-NEXT: local.get 1 +; W32-NEXT: i32.const 1 +; W32-NEXT: i32.and +; W32-NEXT: i32.const 1 +; W32-NEXT: i32.and +; W32-NEXT: i32.sub +; W32-NEXT: local.tee 1 +; W32-NEXT: i32.const 0 +; 
W32-NEXT: local.get 0 +; W32-NEXT: i32.const 1 +; W32-NEXT: i32.and +; W32-NEXT: i32.const 1 +; W32-NEXT: i32.and +; W32-NEXT: i32.sub +; W32-NEXT: local.tee 0 +; W32-NEXT: local.get 2 +; W32-NEXT: i32.reinterpret_f32 +; W32-NEXT: i32.and +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.xor +; W32-NEXT: local.get 3 +; W32-NEXT: i32.reinterpret_f32 +; W32-NEXT: i32.and +; W32-NEXT: i32.or +; W32-NEXT: i32.and +; W32-NEXT: local.get 1 +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.xor +; W32-NEXT: local.get 4 +; W32-NEXT: i32.reinterpret_f32 +; W32-NEXT: i32.and +; W32-NEXT: i32.or +; W32-NEXT: f32.reinterpret_i32 +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_f32_chain: +; W64: .functype test_ctselect_f32_chain (i32, i32, f32, f32, f32) -> (f32) +; W64-NEXT: # %bb.0: +; W64-NEXT: i32.const 0 +; W64-NEXT: local.get 1 +; W64-NEXT: i32.const 1 +; W64-NEXT: i32.and +; W64-NEXT: i32.const 1 +; W64-NEXT: i32.and +; W64-NEXT: i32.sub +; W64-NEXT: local.tee 1 +; W64-NEXT: i32.const 0 +; W64-NEXT: local.get 0 +; W64-NEXT: i32.const 1 +; W64-NEXT: i32.and +; W64-NEXT: i32.const 1 +; W64-NEXT: i32.and +; W64-NEXT: i32.sub +; W64-NEXT: local.tee 0 +; W64-NEXT: local.get 2 +; W64-NEXT: i32.reinterpret_f32 +; W64-NEXT: i32.and +; W64-NEXT: local.get 0 +; W64-NEXT: i32.const -1 +; W64-NEXT: i32.xor +; W64-NEXT: local.get 3 +; W64-NEXT: i32.reinterpret_f32 +; W64-NEXT: i32.and +; W64-NEXT: i32.or +; W64-NEXT: i32.and +; W64-NEXT: local.get 1 +; W64-NEXT: i32.const -1 +; W64-NEXT: i32.xor +; W64-NEXT: local.get 4 +; W64-NEXT: i32.reinterpret_f32 +; W64-NEXT: i32.and +; W64-NEXT: i32.or +; W64-NEXT: f32.reinterpret_i32 +; W64-NEXT: # fallthrough-return + %tmp = call float @llvm.ct.select.f32(i1 %cond1, float %a, float %b) + %result = call float @llvm.ct.select.f32(i1 %cond2, float %tmp, float %c) + ret float %result +} + +; Test with float load +define float @test_ctselect_f32_load(i1 %cond, ptr %p1, ptr %p2) { +; W32-LABEL: test_ctselect_f32_load: +; W32: .functype test_ctselect_f32_load (i32, i32, i32) -> (f32) +; W32-NEXT: # %bb.0: +; W32-NEXT: i32.const 0 +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const 1 +; W32-NEXT: i32.and +; W32-NEXT: i32.const 1 +; W32-NEXT: i32.and +; W32-NEXT: i32.sub +; W32-NEXT: local.tee 0 +; W32-NEXT: local.get 1 +; W32-NEXT: i32.load 0 +; W32-NEXT: i32.and +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.xor +; W32-NEXT: local.get 2 +; W32-NEXT: i32.load 0 +; W32-NEXT: i32.and +; W32-NEXT: i32.or +; W32-NEXT: f32.reinterpret_i32 +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_f32_load: +; W64: .functype test_ctselect_f32_load (i32, i64, i64) -> (f32) +; W64-NEXT: # %bb.0: +; W64-NEXT: i32.const 0 +; W64-NEXT: local.get 0 +; W64-NEXT: i32.const 1 +; W64-NEXT: i32.and +; W64-NEXT: i32.const 1 +; W64-NEXT: i32.and +; W64-NEXT: i32.sub +; W64-NEXT: local.tee 0 +; W64-NEXT: local.get 1 +; W64-NEXT: i32.load 0 +; W64-NEXT: i32.and +; W64-NEXT: local.get 0 +; W64-NEXT: i32.const -1 +; W64-NEXT: i32.xor +; W64-NEXT: local.get 2 +; W64-NEXT: i32.load 0 +; W64-NEXT: i32.and +; W64-NEXT: i32.or +; W64-NEXT: f32.reinterpret_i32 +; W64-NEXT: # fallthrough-return + %a = load float, ptr %p1 + %b = load float, ptr %p2 + %result = call float @llvm.ct.select.f32(i1 %cond, float %a, float %b) + ret float %result +} + +; Test with double load +define double @test_ctselect_f64_load(i1 %cond, ptr %p1, ptr %p2) { +; W32-LABEL: test_ctselect_f64_load: +; W32: .functype test_ctselect_f64_load (i32, i32, i32) -> (f64) +; W32-NEXT: 
.local i64 +; W32-NEXT: # %bb.0: +; W32-NEXT: i64.const 0 +; W32-NEXT: local.get 0 +; W32-NEXT: i64.extend_i32_u +; W32-NEXT: i64.const 1 +; W32-NEXT: i64.and +; W32-NEXT: i64.const 1 +; W32-NEXT: i64.and +; W32-NEXT: i64.sub +; W32-NEXT: local.tee 3 +; W32-NEXT: local.get 1 +; W32-NEXT: i64.load 0 +; W32-NEXT: i64.and +; W32-NEXT: local.get 3 +; W32-NEXT: i64.const -1 +; W32-NEXT: i64.xor +; W32-NEXT: local.get 2 +; W32-NEXT: i64.load 0 +; W32-NEXT: i64.and +; W32-NEXT: i64.or +; W32-NEXT: f64.reinterpret_i64 +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_f64_load: +; W64: .functype test_ctselect_f64_load (i32, i64, i64) -> (f64) +; W64-NEXT: .local i64 +; W64-NEXT: # %bb.0: +; W64-NEXT: i64.const 0 +; W64-NEXT: local.get 0 +; W64-NEXT: i64.extend_i32_u +; W64-NEXT: i64.const 1 +; W64-NEXT: i64.and +; W64-NEXT: i64.const 1 +; W64-NEXT: i64.and +; W64-NEXT: i64.sub +; W64-NEXT: local.tee 3 +; W64-NEXT: local.get 1 +; W64-NEXT: i64.load 0 +; W64-NEXT: i64.and +; W64-NEXT: local.get 3 +; W64-NEXT: i64.const -1 +; W64-NEXT: i64.xor +; W64-NEXT: local.get 2 +; W64-NEXT: i64.load 0 +; W64-NEXT: i64.and +; W64-NEXT: i64.or +; W64-NEXT: f64.reinterpret_i64 +; W64-NEXT: # fallthrough-return + %a = load double, ptr %p1 + %b = load double, ptr %p2 + %result = call double @llvm.ct.select.f64(i1 %cond, double %a, double %b) + ret double %result +} + +; Test mixed with arithmetic +define float @test_ctselect_f32_arithmetic(i1 %cond, float %x, float %y) { +; W32-LABEL: test_ctselect_f32_arithmetic: +; W32: .functype test_ctselect_f32_arithmetic (i32, f32, f32) -> (f32) +; W32-NEXT: # %bb.0: +; W32-NEXT: i32.const 0 +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const 1 +; W32-NEXT: i32.and +; W32-NEXT: i32.const 1 +; W32-NEXT: i32.and +; W32-NEXT: i32.sub +; W32-NEXT: local.tee 0 +; W32-NEXT: local.get 1 +; W32-NEXT: local.get 2 +; W32-NEXT: f32.add +; W32-NEXT: i32.reinterpret_f32 +; W32-NEXT: i32.and +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.xor +; W32-NEXT: local.get 1 +; W32-NEXT: local.get 2 +; W32-NEXT: f32.sub +; W32-NEXT: i32.reinterpret_f32 +; W32-NEXT: i32.and +; W32-NEXT: i32.or +; W32-NEXT: f32.reinterpret_i32 +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_f32_arithmetic: +; W64: .functype test_ctselect_f32_arithmetic (i32, f32, f32) -> (f32) +; W64-NEXT: # %bb.0: +; W64-NEXT: i32.const 0 +; W64-NEXT: local.get 0 +; W64-NEXT: i32.const 1 +; W64-NEXT: i32.and +; W64-NEXT: i32.const 1 +; W64-NEXT: i32.and +; W64-NEXT: i32.sub +; W64-NEXT: local.tee 0 +; W64-NEXT: local.get 1 +; W64-NEXT: local.get 2 +; W64-NEXT: f32.add +; W64-NEXT: i32.reinterpret_f32 +; W64-NEXT: i32.and +; W64-NEXT: local.get 0 +; W64-NEXT: i32.const -1 +; W64-NEXT: i32.xor +; W64-NEXT: local.get 1 +; W64-NEXT: local.get 2 +; W64-NEXT: f32.sub +; W64-NEXT: i32.reinterpret_f32 +; W64-NEXT: i32.and +; W64-NEXT: i32.or +; W64-NEXT: f32.reinterpret_i32 +; W64-NEXT: # fallthrough-return + %sum = fadd float %x, %y + %diff = fsub float %x, %y + %result = call float @llvm.ct.select.f32(i1 %cond, float %sum, float %diff) + ret float %result +} + ; Declare the intrinsics declare i8 @llvm.ct.select.i8(i1, i8, i8) declare i16 @llvm.ct.select.i16(i1, i16, i16) declare i32 @llvm.ct.select.i32(i1, i32, i32) declare i64 @llvm.ct.select.i64(i1, i64, i64) declare ptr @llvm.ct.select.p0(i1, ptr, ptr) +declare float @llvm.ct.select.f32(i1, float, float) +declare double @llvm.ct.select.f64(i1, double, double) From 5cde83a8d621fb498f2aa99b4ba4dab3115432b0 Mon Sep
17 00:00:00 2001 From: wizardengineer Date: Sun, 17 Aug 2025 17:34:05 -0400 Subject: [PATCH 13/63] Added support for Vector types --- .../SelectionDAG/SelectionDAGBuilder.cpp | 72 +- .../CodeGen/Mips/ctselect-fallback-vector.ll | 712 ++++++++++++++++++ .../CodeGen/Mips/ctselect-side-effects.ll | 3 +- .../WebAssembly/ctselect-fallback-vector.ll | 566 ++++++++++++++ 4 files changed, 1329 insertions(+), 24 deletions(-) create mode 100644 llvm/test/CodeGen/Mips/ctselect-fallback-vector.ll create mode 100644 llvm/test/CodeGen/WebAssembly/ctselect-fallback-vector.ll diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index c4ee79441f137..6b34cbea723db 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -6490,6 +6490,24 @@ void SelectionDAGBuilder::visitVectorExtractLastActive(const CallInst &I, setValue(&I, Result); } +static SDValue createMask(SelectionDAG &DAG, const SDLoc &DL, SDValue Cond, + EVT WorkingVT, const SDNodeFlags &Flag) { + if (WorkingVT.isVector()) + return DAG.getSExtOrTrunc(Cond, DL, WorkingVT); + + // Extend cond to WorkingVT and normalize to 0 or 1 + if (Cond.getValueType() != WorkingVT) + Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, WorkingVT, Cond, Flag); + + // Normalize + SDValue One = DAG.getConstant(1, DL, WorkingVT); + SDValue Norm = DAG.getNode(ISD::AND, DL, WorkingVT, Cond, One, Flag); + + // Mask = 0 - Norm + SDValue Zero = DAG.getConstant(0, DL, WorkingVT); + return DAG.getNode(ISD::SUB, DL, WorkingVT, Zero, Norm, Flag); +} + /// Fallback implementation is an alternative approach for managing architectures that don't have /// native support for Constant-Time Select. SDValue SelectionDAGBuilder::createProtectedCtSelectFallback( @@ -6502,23 +6520,29 @@ SDValue SelectionDAGBuilder::createProtectedCtSelectFallback( SDValue WorkingF = F; EVT WorkingVT = VT; + if (VT.isVector() && !Cond.getValueType().isVector()) { + unsigned NumElems = VT.getVectorNumElements(); + EVT CondVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElems); + Cond = DAG.getSplatBuildVector(CondVT, DL, Cond); + } + if (VT.isFloatingPoint()) { - unsigned int BitWidth = VT.getSizeInBits(); - WorkingVT = EVT::getIntegerVT(*DAG.getContext(), BitWidth); + if (VT.isVector()) { + // float vector -> int vector + EVT ElemVT = VT.getVectorElementType(); + unsigned int ElemBitWidth = ElemVT.getSizeInBits(); + EVT IntElemVT = EVT::getIntegerVT(*DAG.getContext(), ElemBitWidth); + WorkingVT = EVT::getVectorVT(*DAG.getContext(), IntElemVT, + VT.getVectorNumElements()); + } else { + WorkingVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits()); + } + WorkingT = DAG.getBitcast(WorkingVT, T); WorkingF = DAG.getBitcast(WorkingVT, F); } - // Extend cond to WorkingVT and normalize to 0 or 1 - if (Cond.getValueType() != WorkingVT) - Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, WorkingVT, Cond, ProtectedFlag); - - SDValue One = DAG.getConstant(1, DL, WorkingVT); - SDValue Norm = DAG.getNode(ISD::AND, DL, WorkingVT, Cond, One, ProtectedFlag); - - // Mask = 0 - Norm - SDValue Zero = DAG.getConstant(0, DL, WorkingVT); - SDValue Mask = DAG.getNode(ISD::SUB, DL, WorkingVT, Zero, Norm, ProtectedFlag); + SDValue Mask = createMask(DAG, DL, Cond, WorkingVT, ProtectedFlag); SDValue AllOnes = DAG.getAllOnesConstant(DL, WorkingVT); SDValue Invert = DAG.getNode(ISD::XOR, DL, WorkingVT, Mask, AllOnes, ProtectedFlag); @@ -6528,8 +6552,8 @@ SDValue 
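+// In C-like pseudocode, the scalar path of createMask above computes,
+// for an unsigned word type W as wide as the payload (a sketch of the
+// intent, not additional DAG nodes):
+//   W c    = (W)cond & 1;  // normalize to 0 or 1
+//   W mask = (W)0 - c;     // 0x00...00 or 0xFF...FF
+// The vector path obtains the same 0/-1 lanes in one step by
+// sign-extending the i1 condition lanes (getSExtOrTrunc).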
SelectionDAGBuilder::createProtectedCtSelectFallback( SDValue FM = DAG.getNode(ISD::AND, DL, WorkingVT, Invert, WorkingF, ProtectedFlag); SDValue Result = DAG.getNode(ISD::OR, DL, WorkingVT, TM, FM, ProtectedFlag); - // Convert back to Float if needed - if (VT.isFloatingPoint()) { + // Convert back if needed + if (WorkingVT != VT) { Result = DAG.getBitcast(VT, Result); } @@ -6727,8 +6751,11 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, EVT VT = A.getValueType(); EVT CondVT = Cond.getValueType(); + // For now we'll only support scalar predicates // assert if Cond type is Vector - assert(!CondVT.isVector() && "Vector type cond not supported yet"); + // TODO: Maybe look into supporting vector predicates? + assert(!CondVT.isVector() && + "ct.select fallback only supports scalar conditions"); // Handle scalar types if (TLI.isSelectSupported( @@ -6739,14 +6766,7 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, return; } - // assert if Payloads type are Vector - assert(!VT.isVector() && - "Vector type not supported yet for fallback implementation"); - - // // We don't support floating points yet - // assert(!VT.isFloatingPoint() && - // "Float point type not supported yet fallback implementation"); - + // We don't support non-integral pointers Type *CurrType = VT.getTypeForEVT(*Context); if (CurrType->isPointerTy()) { unsigned AS = CurrType->getPointerAddressSpace(); @@ -6756,6 +6776,12 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, } } + // We don't support scalable vector types yet; for now only + // fixed-width vectors are handled + // TODO: Add support for scalable vectors + assert(!VT.isScalableVector() && + "ct.select fallback doesn't support scalable vectors"); + setValue(&I, createProtectedCtSelectFallback(DAG, DL, Cond, A, B, VT)); return; } diff --git a/llvm/test/CodeGen/Mips/ctselect-fallback-vector.ll b/llvm/test/CodeGen/Mips/ctselect-fallback-vector.ll new file mode 100644 index 0000000000000..1e18a87ea6605 --- /dev/null +++ b/llvm/test/CodeGen/Mips/ctselect-fallback-vector.ll @@ -0,0 +1,712 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=mips64-unknown-linux-gnu -mcpu=mips64r6 -mattr=+msa -O3 | FileCheck %s --check-prefix=MIPS64-MSA +; RUN: llc < %s -mtriple=mips-unknown-linux-gnu -mcpu=mips32r6 -mattr=+msa -O3 | FileCheck %s --check-prefix=MIPS32-MSA + +; Test 32-bit integer vector (128 bits) +define <4 x i32> @test_ctselect_v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b) { +; MIPS64-MSA-LABEL: test_ctselect_v4i32: +; MIPS64-MSA: # %bb.0: +; MIPS64-MSA-NEXT: insert.d $w0[0], $7 +; MIPS64-MSA-NEXT: insert.d $w1[0], $5 +; MIPS64-MSA-NEXT: sll $1, $4, 0 +; MIPS64-MSA-NEXT: fill.w $w2, $1 +; MIPS64-MSA-NEXT: insert.d $w0[1], $8 +; MIPS64-MSA-NEXT: insert.d $w1[1], $6 +; MIPS64-MSA-NEXT: slli.w $w2, $w2, 31 +; MIPS64-MSA-NEXT: shf.w $w0, $w0, 177 +; MIPS64-MSA-NEXT: shf.w $w1, $w1, 177 +; MIPS64-MSA-NEXT: srai.w $w2, $w2, 31 +; MIPS64-MSA-NEXT: bsel.v $w2, $w0, $w1 +; MIPS64-MSA-NEXT: shf.w $w0, $w2, 177 +; MIPS64-MSA-NEXT: copy_s.d $2, $w0[0] +; MIPS64-MSA-NEXT: jr $ra +; MIPS64-MSA-NEXT: copy_s.d $3, $w0[1] +; +; MIPS32-MSA-LABEL: test_ctselect_v4i32: +; MIPS32-MSA: # %bb.0: +; MIPS32-MSA-NEXT: lw $2, 24($sp) +; MIPS32-MSA-NEXT: insert.w $w1[0], $6 +; MIPS32-MSA-NEXT: lw $1, 28($sp) +; MIPS32-MSA-NEXT: fill.w $w2, $4 +; MIPS32-MSA-NEXT: insert.w $w0[0], $2 +; MIPS32-MSA-NEXT: insert.w $w1[1], $7 +; MIPS32-MSA-NEXT: lw $2, 20($sp) +; MIPS32-MSA-NEXT:
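+; The recurring fill.w/slli.w/srai.w prologue in these checks
+; materializes the lane mask: fill.w splats the i32 condition, and
+; shifting left then arithmetic-right by 31 sign-extends bit 0 across
+; each 32-bit lane (all zeros or all ones) before bsel.v picks bits per
+; lane. As an IR-level sketch of the generic fallback, per lane:
+;   %mask = sext i1 %cond to i32   ; 0 or -1, no branch
+;   %inv  = xor i32 %mask, -1
+;   %tm   = and i32 %t, %mask
+;   %fm   = and i32 %f, %inv
+;   %res  = or i32 %tm, %fm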
slli.w $w2, $w2, 31 +; MIPS32-MSA-NEXT: srai.w $w2, $w2, 31 +; MIPS32-MSA-NEXT: insert.w $w0[1], $1 +; MIPS32-MSA-NEXT: lw $1, 16($sp) +; MIPS32-MSA-NEXT: insert.w $w1[2], $1 +; MIPS32-MSA-NEXT: lw $1, 32($sp) +; MIPS32-MSA-NEXT: insert.w $w0[2], $1 +; MIPS32-MSA-NEXT: lw $1, 36($sp) +; MIPS32-MSA-NEXT: insert.w $w1[3], $2 +; MIPS32-MSA-NEXT: insert.w $w0[3], $1 +; MIPS32-MSA-NEXT: bsel.v $w2, $w0, $w1 +; MIPS32-MSA-NEXT: copy_s.w $2, $w2[0] +; MIPS32-MSA-NEXT: copy_s.w $3, $w2[1] +; MIPS32-MSA-NEXT: copy_s.w $4, $w2[2] +; MIPS32-MSA-NEXT: jr $ra +; MIPS32-MSA-NEXT: copy_s.w $5, $w2[3] + %result = call <4 x i32> @llvm.ct.select.v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b) + ret <4 x i32> %result +} + +; Test 16-bit integer vector (8 x i16 = 128-bit) +define <8 x i16> @test_ctselect_v8i16(i1 %cond, <8 x i16> %a, <8 x i16> %b) { +; MIPS64-MSA-LABEL: test_ctselect_v8i16: +; MIPS64-MSA: # %bb.0: +; MIPS64-MSA-NEXT: insert.d $w0[0], $7 +; MIPS64-MSA-NEXT: insert.d $w1[0], $5 +; MIPS64-MSA-NEXT: sll $1, $4, 0 +; MIPS64-MSA-NEXT: fill.h $w2, $1 +; MIPS64-MSA-NEXT: insert.d $w0[1], $8 +; MIPS64-MSA-NEXT: insert.d $w1[1], $6 +; MIPS64-MSA-NEXT: slli.h $w2, $w2, 15 +; MIPS64-MSA-NEXT: shf.h $w0, $w0, 27 +; MIPS64-MSA-NEXT: shf.h $w1, $w1, 27 +; MIPS64-MSA-NEXT: srai.h $w2, $w2, 15 +; MIPS64-MSA-NEXT: bsel.v $w2, $w0, $w1 +; MIPS64-MSA-NEXT: shf.h $w0, $w2, 27 +; MIPS64-MSA-NEXT: copy_s.d $2, $w0[0] +; MIPS64-MSA-NEXT: jr $ra +; MIPS64-MSA-NEXT: copy_s.d $3, $w0[1] +; +; MIPS32-MSA-LABEL: test_ctselect_v8i16: +; MIPS32-MSA: # %bb.0: +; MIPS32-MSA-NEXT: lw $2, 24($sp) +; MIPS32-MSA-NEXT: insert.w $w0[0], $6 +; MIPS32-MSA-NEXT: lw $1, 28($sp) +; MIPS32-MSA-NEXT: fill.h $w2, $4 +; MIPS32-MSA-NEXT: insert.w $w1[0], $2 +; MIPS32-MSA-NEXT: insert.w $w0[1], $7 +; MIPS32-MSA-NEXT: lw $2, 32($sp) +; MIPS32-MSA-NEXT: slli.h $w2, $w2, 15 +; MIPS32-MSA-NEXT: srai.h $w2, $w2, 15 +; MIPS32-MSA-NEXT: insert.w $w1[1], $1 +; MIPS32-MSA-NEXT: lw $1, 16($sp) +; MIPS32-MSA-NEXT: insert.w $w0[2], $1 +; MIPS32-MSA-NEXT: lw $1, 20($sp) +; MIPS32-MSA-NEXT: insert.w $w1[2], $2 +; MIPS32-MSA-NEXT: insert.w $w0[3], $1 +; MIPS32-MSA-NEXT: lw $1, 36($sp) +; MIPS32-MSA-NEXT: insert.w $w1[3], $1 +; MIPS32-MSA-NEXT: shf.h $w0, $w0, 177 +; MIPS32-MSA-NEXT: shf.h $w1, $w1, 177 +; MIPS32-MSA-NEXT: bsel.v $w2, $w1, $w0 +; MIPS32-MSA-NEXT: shf.h $w0, $w2, 177 +; MIPS32-MSA-NEXT: copy_s.w $2, $w0[0] +; MIPS32-MSA-NEXT: copy_s.w $3, $w0[1] +; MIPS32-MSA-NEXT: copy_s.w $4, $w0[2] +; MIPS32-MSA-NEXT: jr $ra +; MIPS32-MSA-NEXT: copy_s.w $5, $w0[3] + %result = call <8 x i16> @llvm.ct.select.v8i16(i1 %cond, <8 x i16> %a, <8 x i16> %b) + ret <8 x i16> %result +} + +; Test byte vector (16 x i8 = 128-bit) +define <16 x i8> @test_ctselect_v16i8(i1 %cond, <16 x i8> %a, <16 x i8> %b) { +; MIPS64-MSA-LABEL: test_ctselect_v16i8: +; MIPS64-MSA: # %bb.0: +; MIPS64-MSA-NEXT: insert.d $w0[0], $7 +; MIPS64-MSA-NEXT: insert.d $w1[0], $5 +; MIPS64-MSA-NEXT: sll $1, $4, 0 +; MIPS64-MSA-NEXT: fill.b $w2, $1 +; MIPS64-MSA-NEXT: insert.d $w0[1], $8 +; MIPS64-MSA-NEXT: insert.d $w1[1], $6 +; MIPS64-MSA-NEXT: slli.b $w2, $w2, 7 +; MIPS64-MSA-NEXT: shf.b $w0, $w0, 27 +; MIPS64-MSA-NEXT: shf.b $w1, $w1, 27 +; MIPS64-MSA-NEXT: srai.b $w2, $w2, 7 +; MIPS64-MSA-NEXT: shf.w $w1, $w1, 177 +; MIPS64-MSA-NEXT: shf.w $w0, $w0, 177 +; MIPS64-MSA-NEXT: bmnz.v $w0, $w1, $w2 +; MIPS64-MSA-NEXT: shf.b $w0, $w0, 27 +; MIPS64-MSA-NEXT: shf.w $w0, $w0, 177 +; MIPS64-MSA-NEXT: copy_s.d $2, $w0[0] +; MIPS64-MSA-NEXT: jr $ra +; MIPS64-MSA-NEXT: copy_s.d $3, $w0[1] +; +; 
MIPS32-MSA-LABEL: test_ctselect_v16i8: +; MIPS32-MSA: # %bb.0: +; MIPS32-MSA-NEXT: lw $2, 24($sp) +; MIPS32-MSA-NEXT: insert.w $w0[0], $6 +; MIPS32-MSA-NEXT: lw $1, 28($sp) +; MIPS32-MSA-NEXT: fill.b $w2, $4 +; MIPS32-MSA-NEXT: insert.w $w1[0], $2 +; MIPS32-MSA-NEXT: insert.w $w0[1], $7 +; MIPS32-MSA-NEXT: lw $2, 32($sp) +; MIPS32-MSA-NEXT: slli.b $w2, $w2, 7 +; MIPS32-MSA-NEXT: srai.b $w2, $w2, 7 +; MIPS32-MSA-NEXT: insert.w $w1[1], $1 +; MIPS32-MSA-NEXT: lw $1, 16($sp) +; MIPS32-MSA-NEXT: insert.w $w0[2], $1 +; MIPS32-MSA-NEXT: lw $1, 20($sp) +; MIPS32-MSA-NEXT: insert.w $w1[2], $2 +; MIPS32-MSA-NEXT: insert.w $w0[3], $1 +; MIPS32-MSA-NEXT: lw $1, 36($sp) +; MIPS32-MSA-NEXT: insert.w $w1[3], $1 +; MIPS32-MSA-NEXT: shf.b $w0, $w0, 27 +; MIPS32-MSA-NEXT: shf.b $w1, $w1, 27 +; MIPS32-MSA-NEXT: bmnz.v $w1, $w0, $w2 +; MIPS32-MSA-NEXT: shf.b $w0, $w1, 27 +; MIPS32-MSA-NEXT: copy_s.w $2, $w0[0] +; MIPS32-MSA-NEXT: copy_s.w $3, $w0[1] +; MIPS32-MSA-NEXT: copy_s.w $4, $w0[2] +; MIPS32-MSA-NEXT: jr $ra +; MIPS32-MSA-NEXT: copy_s.w $5, $w0[3] + %result = call <16 x i8> @llvm.ct.select.v16i8(i1 %cond, <16 x i8> %a, <16 x i8> %b) + ret <16 x i8> %result +} + +; Test 64-bit integer vector (2 x i64 = 128-bit) +define <2 x i64> @test_ctselect_v2i64(i1 %cond, <2 x i64> %a, <2 x i64> %b) { +; MIPS64-MSA-LABEL: test_ctselect_v2i64: +; MIPS64-MSA: # %bb.0: +; MIPS64-MSA-NEXT: insert.d $w0[0], $5 +; MIPS64-MSA-NEXT: insert.d $w1[0], $7 +; MIPS64-MSA-NEXT: fill.d $w2, $4 +; MIPS64-MSA-NEXT: slli.d $w2, $w2, 63 +; MIPS64-MSA-NEXT: insert.d $w0[1], $6 +; MIPS64-MSA-NEXT: insert.d $w1[1], $8 +; MIPS64-MSA-NEXT: srai.d $w2, $w2, 63 +; MIPS64-MSA-NEXT: bsel.v $w2, $w1, $w0 +; MIPS64-MSA-NEXT: copy_s.d $2, $w2[0] +; MIPS64-MSA-NEXT: jr $ra +; MIPS64-MSA-NEXT: copy_s.d $3, $w2[1] +; +; MIPS32-MSA-LABEL: test_ctselect_v2i64: +; MIPS32-MSA: # %bb.0: +; MIPS32-MSA-NEXT: addiu $sp, $sp, -32 +; MIPS32-MSA-NEXT: .cfi_def_cfa_offset 32 +; MIPS32-MSA-NEXT: sw $ra, 28($sp) # 4-byte Folded Spill +; MIPS32-MSA-NEXT: sw $fp, 24($sp) # 4-byte Folded Spill +; MIPS32-MSA-NEXT: .cfi_offset 31, -4 +; MIPS32-MSA-NEXT: .cfi_offset 30, -8 +; MIPS32-MSA-NEXT: move $fp, $sp +; MIPS32-MSA-NEXT: .cfi_def_cfa_register 30 +; MIPS32-MSA-NEXT: addiu $1, $zero, -16 +; MIPS32-MSA-NEXT: and $sp, $sp, $1 +; MIPS32-MSA-NEXT: lw $2, 56($fp) +; MIPS32-MSA-NEXT: lw $1, 60($fp) +; MIPS32-MSA-NEXT: insert.w $w1[0], $6 +; MIPS32-MSA-NEXT: sw $4, 12($sp) +; MIPS32-MSA-NEXT: sw $4, 4($sp) +; MIPS32-MSA-NEXT: ld.d $w2, 0($sp) +; MIPS32-MSA-NEXT: insert.w $w0[0], $2 +; MIPS32-MSA-NEXT: insert.w $w1[1], $7 +; MIPS32-MSA-NEXT: slli.d $w2, $w2, 63 +; MIPS32-MSA-NEXT: insert.w $w0[1], $1 +; MIPS32-MSA-NEXT: lw $1, 64($fp) +; MIPS32-MSA-NEXT: srai.d $w2, $w2, 63 +; MIPS32-MSA-NEXT: insert.w $w0[2], $1 +; MIPS32-MSA-NEXT: lw $1, 68($fp) +; MIPS32-MSA-NEXT: insert.w $w0[3], $1 +; MIPS32-MSA-NEXT: lw $1, 48($fp) +; MIPS32-MSA-NEXT: insert.w $w1[2], $1 +; MIPS32-MSA-NEXT: lw $1, 52($fp) +; MIPS32-MSA-NEXT: shf.w $w0, $w0, 177 +; MIPS32-MSA-NEXT: insert.w $w1[3], $1 +; MIPS32-MSA-NEXT: shf.w $w1, $w1, 177 +; MIPS32-MSA-NEXT: bsel.v $w2, $w0, $w1 +; MIPS32-MSA-NEXT: shf.w $w0, $w2, 177 +; MIPS32-MSA-NEXT: copy_s.w $2, $w0[0] +; MIPS32-MSA-NEXT: copy_s.w $3, $w0[1] +; MIPS32-MSA-NEXT: copy_s.w $4, $w0[2] +; MIPS32-MSA-NEXT: copy_s.w $5, $w0[3] +; MIPS32-MSA-NEXT: move $sp, $fp +; MIPS32-MSA-NEXT: lw $fp, 24($sp) # 4-byte Folded Reload +; MIPS32-MSA-NEXT: lw $ra, 28($sp) # 4-byte Folded Reload +; MIPS32-MSA-NEXT: jr $ra +; MIPS32-MSA-NEXT: addiu $sp, $sp, 32 + %result = 
call <2 x i64> @llvm.ct.select.v2i64(i1 %cond, <2 x i64> %a, <2 x i64> %b) + ret <2 x i64> %result +} + +; Test single-precision float vector (4 x float = 128-bit) +define <4 x float> @test_ctselect_v4f32(i1 %cond, <4 x float> %a, <4 x float> %b) { +; MIPS64-MSA-LABEL: test_ctselect_v4f32: +; MIPS64-MSA: # %bb.0: +; MIPS64-MSA-NEXT: insert.d $w0[0], $7 +; MIPS64-MSA-NEXT: insert.d $w1[0], $5 +; MIPS64-MSA-NEXT: sll $1, $4, 0 +; MIPS64-MSA-NEXT: fill.w $w2, $1 +; MIPS64-MSA-NEXT: insert.d $w0[1], $8 +; MIPS64-MSA-NEXT: insert.d $w1[1], $6 +; MIPS64-MSA-NEXT: slli.w $w2, $w2, 31 +; MIPS64-MSA-NEXT: shf.w $w0, $w0, 177 +; MIPS64-MSA-NEXT: shf.w $w1, $w1, 177 +; MIPS64-MSA-NEXT: srai.w $w2, $w2, 31 +; MIPS64-MSA-NEXT: bsel.v $w2, $w0, $w1 +; MIPS64-MSA-NEXT: shf.w $w0, $w2, 177 +; MIPS64-MSA-NEXT: copy_s.d $2, $w0[0] +; MIPS64-MSA-NEXT: jr $ra +; MIPS64-MSA-NEXT: copy_s.d $3, $w0[1] +; +; MIPS32-MSA-LABEL: test_ctselect_v4f32: +; MIPS32-MSA: # %bb.0: +; MIPS32-MSA-NEXT: lw $2, 24($sp) +; MIPS32-MSA-NEXT: insert.w $w1[0], $6 +; MIPS32-MSA-NEXT: lw $1, 28($sp) +; MIPS32-MSA-NEXT: fill.w $w2, $5 +; MIPS32-MSA-NEXT: insert.w $w0[0], $2 +; MIPS32-MSA-NEXT: insert.w $w1[1], $7 +; MIPS32-MSA-NEXT: lw $2, 20($sp) +; MIPS32-MSA-NEXT: slli.w $w2, $w2, 31 +; MIPS32-MSA-NEXT: srai.w $w2, $w2, 31 +; MIPS32-MSA-NEXT: insert.w $w0[1], $1 +; MIPS32-MSA-NEXT: lw $1, 16($sp) +; MIPS32-MSA-NEXT: insert.w $w1[2], $1 +; MIPS32-MSA-NEXT: lw $1, 32($sp) +; MIPS32-MSA-NEXT: insert.w $w0[2], $1 +; MIPS32-MSA-NEXT: lw $1, 36($sp) +; MIPS32-MSA-NEXT: insert.w $w1[3], $2 +; MIPS32-MSA-NEXT: insert.w $w0[3], $1 +; MIPS32-MSA-NEXT: bsel.v $w2, $w0, $w1 +; MIPS32-MSA-NEXT: jr $ra +; MIPS32-MSA-NEXT: st.w $w2, 0($4) + %result = call <4 x float> @llvm.ct.select.v4f32(i1 %cond, <4 x float> %a, <4 x float> %b) + ret <4 x float> %result +} + +; Test double-precision float vector (2 x double = 128-bit) +define <2 x double> @test_ctselect_v2f64(i1 %cond, <2 x double> %a, <2 x double> %b) { +; MIPS64-MSA-LABEL: test_ctselect_v2f64: +; MIPS64-MSA: # %bb.0: +; MIPS64-MSA-NEXT: insert.d $w0[0], $5 +; MIPS64-MSA-NEXT: insert.d $w1[0], $7 +; MIPS64-MSA-NEXT: fill.d $w2, $4 +; MIPS64-MSA-NEXT: slli.d $w2, $w2, 63 +; MIPS64-MSA-NEXT: insert.d $w0[1], $6 +; MIPS64-MSA-NEXT: insert.d $w1[1], $8 +; MIPS64-MSA-NEXT: srai.d $w2, $w2, 63 +; MIPS64-MSA-NEXT: bsel.v $w2, $w1, $w0 +; MIPS64-MSA-NEXT: copy_s.d $2, $w2[0] +; MIPS64-MSA-NEXT: jr $ra +; MIPS64-MSA-NEXT: copy_s.d $3, $w2[1] +; +; MIPS32-MSA-LABEL: test_ctselect_v2f64: +; MIPS32-MSA: # %bb.0: +; MIPS32-MSA-NEXT: addiu $sp, $sp, -32 +; MIPS32-MSA-NEXT: .cfi_def_cfa_offset 32 +; MIPS32-MSA-NEXT: sw $ra, 28($sp) # 4-byte Folded Spill +; MIPS32-MSA-NEXT: sw $fp, 24($sp) # 4-byte Folded Spill +; MIPS32-MSA-NEXT: .cfi_offset 31, -4 +; MIPS32-MSA-NEXT: .cfi_offset 30, -8 +; MIPS32-MSA-NEXT: move $fp, $sp +; MIPS32-MSA-NEXT: .cfi_def_cfa_register 30 +; MIPS32-MSA-NEXT: addiu $1, $zero, -16 +; MIPS32-MSA-NEXT: and $sp, $sp, $1 +; MIPS32-MSA-NEXT: lw $2, 56($fp) +; MIPS32-MSA-NEXT: lw $1, 60($fp) +; MIPS32-MSA-NEXT: insert.w $w1[0], $6 +; MIPS32-MSA-NEXT: sw $5, 12($sp) +; MIPS32-MSA-NEXT: sw $5, 4($sp) +; MIPS32-MSA-NEXT: ld.d $w2, 0($sp) +; MIPS32-MSA-NEXT: insert.w $w0[0], $2 +; MIPS32-MSA-NEXT: insert.w $w1[1], $7 +; MIPS32-MSA-NEXT: slli.d $w2, $w2, 63 +; MIPS32-MSA-NEXT: insert.w $w0[1], $1 +; MIPS32-MSA-NEXT: lw $1, 64($fp) +; MIPS32-MSA-NEXT: srai.d $w2, $w2, 63 +; MIPS32-MSA-NEXT: insert.w $w0[2], $1 +; MIPS32-MSA-NEXT: lw $1, 68($fp) +; MIPS32-MSA-NEXT: insert.w $w0[3], $1 +; 
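+; There is no FP data path in the fallback: per the SelectionDAGBuilder
+; change above, float vectors are bitcast to integer vectors of the
+; same lane width (e.g. <2 x double> -> <2 x i64>), masked with the
+; usual and/xor/or sequence, and bitcast back, so these FP checks match
+; the integer patterns lane for lane.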
MIPS32-MSA-NEXT: lw $1, 48($fp) +; MIPS32-MSA-NEXT: insert.w $w1[2], $1 +; MIPS32-MSA-NEXT: lw $1, 52($fp) +; MIPS32-MSA-NEXT: shf.w $w0, $w0, 177 +; MIPS32-MSA-NEXT: insert.w $w1[3], $1 +; MIPS32-MSA-NEXT: shf.w $w1, $w1, 177 +; MIPS32-MSA-NEXT: bsel.v $w2, $w0, $w1 +; MIPS32-MSA-NEXT: st.d $w2, 0($4) +; MIPS32-MSA-NEXT: move $sp, $fp +; MIPS32-MSA-NEXT: lw $fp, 24($sp) # 4-byte Folded Reload +; MIPS32-MSA-NEXT: lw $ra, 28($sp) # 4-byte Folded Reload +; MIPS32-MSA-NEXT: jr $ra +; MIPS32-MSA-NEXT: addiu $sp, $sp, 32 + %result = call <2 x double> @llvm.ct.select.v2f64(i1 %cond, <2 x double> %a, <2 x double> %b) + ret <2 x double> %result +} + +; Test with aligned loads (common case) +define <4 x i32> @test_ctselect_v4i32_aligned_load(i1 %cond, ptr %p1, ptr %p2) { +; MIPS64-MSA-LABEL: test_ctselect_v4i32_aligned_load: +; MIPS64-MSA: # %bb.0: +; MIPS64-MSA-NEXT: sll $1, $4, 0 +; MIPS64-MSA-NEXT: ld.w $w1, 0($5) +; MIPS64-MSA-NEXT: ld.w $w2, 0($6) +; MIPS64-MSA-NEXT: fill.w $w0, $1 +; MIPS64-MSA-NEXT: slli.w $w0, $w0, 31 +; MIPS64-MSA-NEXT: srai.w $w0, $w0, 31 +; MIPS64-MSA-NEXT: bsel.v $w0, $w2, $w1 +; MIPS64-MSA-NEXT: shf.w $w0, $w0, 177 +; MIPS64-MSA-NEXT: copy_s.d $2, $w0[0] +; MIPS64-MSA-NEXT: jr $ra +; MIPS64-MSA-NEXT: copy_s.d $3, $w0[1] +; +; MIPS32-MSA-LABEL: test_ctselect_v4i32_aligned_load: +; MIPS32-MSA: # %bb.0: +; MIPS32-MSA-NEXT: fill.w $w0, $4 +; MIPS32-MSA-NEXT: ld.w $w1, 0($5) +; MIPS32-MSA-NEXT: ld.w $w2, 0($6) +; MIPS32-MSA-NEXT: slli.w $w0, $w0, 31 +; MIPS32-MSA-NEXT: srai.w $w0, $w0, 31 +; MIPS32-MSA-NEXT: bsel.v $w0, $w2, $w1 +; MIPS32-MSA-NEXT: copy_s.w $2, $w0[0] +; MIPS32-MSA-NEXT: copy_s.w $3, $w0[1] +; MIPS32-MSA-NEXT: copy_s.w $4, $w0[2] +; MIPS32-MSA-NEXT: jr $ra +; MIPS32-MSA-NEXT: copy_s.w $5, $w0[3] + %a = load <4 x i32>, ptr %p1, align 16 + %b = load <4 x i32>, ptr %p2, align 16 + %result = call <4 x i32> @llvm.ct.select.v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b) + ret <4 x i32> %result +} + +; Test with unaligned loads (stress test) +define <4 x i32> @test_ctselect_v4i32_unaligned_load(i1 %cond, ptr %p1, ptr %p2) { +; MIPS64-MSA-LABEL: test_ctselect_v4i32_unaligned_load: +; MIPS64-MSA: # %bb.0: +; MIPS64-MSA-NEXT: sll $1, $4, 0 +; MIPS64-MSA-NEXT: ld.w $w1, 0($5) +; MIPS64-MSA-NEXT: ld.w $w2, 0($6) +; MIPS64-MSA-NEXT: fill.w $w0, $1 +; MIPS64-MSA-NEXT: slli.w $w0, $w0, 31 +; MIPS64-MSA-NEXT: srai.w $w0, $w0, 31 +; MIPS64-MSA-NEXT: bsel.v $w0, $w2, $w1 +; MIPS64-MSA-NEXT: shf.w $w0, $w0, 177 +; MIPS64-MSA-NEXT: copy_s.d $2, $w0[0] +; MIPS64-MSA-NEXT: jr $ra +; MIPS64-MSA-NEXT: copy_s.d $3, $w0[1] +; +; MIPS32-MSA-LABEL: test_ctselect_v4i32_unaligned_load: +; MIPS32-MSA: # %bb.0: +; MIPS32-MSA-NEXT: fill.w $w0, $4 +; MIPS32-MSA-NEXT: ld.w $w1, 0($5) +; MIPS32-MSA-NEXT: ld.w $w2, 0($6) +; MIPS32-MSA-NEXT: slli.w $w0, $w0, 31 +; MIPS32-MSA-NEXT: srai.w $w0, $w0, 31 +; MIPS32-MSA-NEXT: bsel.v $w0, $w2, $w1 +; MIPS32-MSA-NEXT: copy_s.w $2, $w0[0] +; MIPS32-MSA-NEXT: copy_s.w $3, $w0[1] +; MIPS32-MSA-NEXT: copy_s.w $4, $w0[2] +; MIPS32-MSA-NEXT: jr $ra +; MIPS32-MSA-NEXT: copy_s.w $5, $w0[3] + %a = load <4 x i32>, ptr %p1, align 4 + %b = load <4 x i32>, ptr %p2, align 4 + %result = call <4 x i32> @llvm.ct.select.v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b) + ret <4 x i32> %result +} + +; Test with stores to verify result handling +define void @test_ctselect_v4i32_store(i1 %cond, <4 x i32> %a, <4 x i32> %b, ptr %out) { +; MIPS64-MSA-LABEL: test_ctselect_v4i32_store: +; MIPS64-MSA: # %bb.0: +; MIPS64-MSA-NEXT: insert.d $w0[0], $7 +; MIPS64-MSA-NEXT: insert.d $w1[0], 
$5 +; MIPS64-MSA-NEXT: sll $1, $4, 0 +; MIPS64-MSA-NEXT: fill.w $w2, $1 +; MIPS64-MSA-NEXT: insert.d $w0[1], $8 +; MIPS64-MSA-NEXT: insert.d $w1[1], $6 +; MIPS64-MSA-NEXT: slli.w $w2, $w2, 31 +; MIPS64-MSA-NEXT: shf.w $w0, $w0, 177 +; MIPS64-MSA-NEXT: shf.w $w1, $w1, 177 +; MIPS64-MSA-NEXT: srai.w $w2, $w2, 31 +; MIPS64-MSA-NEXT: bsel.v $w2, $w0, $w1 +; MIPS64-MSA-NEXT: jr $ra +; MIPS64-MSA-NEXT: st.w $w2, 0($9) +; +; MIPS32-MSA-LABEL: test_ctselect_v4i32_store: +; MIPS32-MSA: # %bb.0: +; MIPS32-MSA-NEXT: lw $2, 24($sp) +; MIPS32-MSA-NEXT: insert.w $w1[0], $6 +; MIPS32-MSA-NEXT: lw $1, 28($sp) +; MIPS32-MSA-NEXT: fill.w $w2, $4 +; MIPS32-MSA-NEXT: insert.w $w0[0], $2 +; MIPS32-MSA-NEXT: insert.w $w1[1], $7 +; MIPS32-MSA-NEXT: lw $2, 20($sp) +; MIPS32-MSA-NEXT: slli.w $w2, $w2, 31 +; MIPS32-MSA-NEXT: srai.w $w2, $w2, 31 +; MIPS32-MSA-NEXT: insert.w $w0[1], $1 +; MIPS32-MSA-NEXT: lw $1, 16($sp) +; MIPS32-MSA-NEXT: insert.w $w1[2], $1 +; MIPS32-MSA-NEXT: lw $1, 32($sp) +; MIPS32-MSA-NEXT: insert.w $w0[2], $1 +; MIPS32-MSA-NEXT: lw $1, 36($sp) +; MIPS32-MSA-NEXT: insert.w $w1[3], $2 +; MIPS32-MSA-NEXT: insert.w $w0[3], $1 +; MIPS32-MSA-NEXT: lw $1, 40($sp) +; MIPS32-MSA-NEXT: bsel.v $w2, $w0, $w1 +; MIPS32-MSA-NEXT: jr $ra +; MIPS32-MSA-NEXT: st.w $w2, 0($1) + %result = call <4 x i32> @llvm.ct.select.v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b) + store <4 x i32> %result, ptr %out, align 16 + ret void +} + +; Test chained selects (multiple conditions) +define <4 x i32> @test_ctselect_v4i32_chain(i1 %cond1, i1 %cond2, <4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; MIPS64-MSA-LABEL: test_ctselect_v4i32_chain: +; MIPS64-MSA: # %bb.0: +; MIPS64-MSA-NEXT: insert.d $w0[0], $8 +; MIPS64-MSA-NEXT: insert.d $w1[0], $6 +; MIPS64-MSA-NEXT: sll $1, $4, 0 +; MIPS64-MSA-NEXT: fill.w $w2, $1 +; MIPS64-MSA-NEXT: sll $1, $5, 0 +; MIPS64-MSA-NEXT: insert.d $w0[1], $9 +; MIPS64-MSA-NEXT: insert.d $w1[1], $7 +; MIPS64-MSA-NEXT: slli.w $w2, $w2, 31 +; MIPS64-MSA-NEXT: shf.w $w0, $w0, 177 +; MIPS64-MSA-NEXT: shf.w $w1, $w1, 177 +; MIPS64-MSA-NEXT: srai.w $w2, $w2, 31 +; MIPS64-MSA-NEXT: bsel.v $w2, $w0, $w1 +; MIPS64-MSA-NEXT: insert.d $w0[0], $10 +; MIPS64-MSA-NEXT: fill.w $w1, $1 +; MIPS64-MSA-NEXT: insert.d $w0[1], $11 +; MIPS64-MSA-NEXT: slli.w $w1, $w1, 31 +; MIPS64-MSA-NEXT: srai.w $w1, $w1, 31 +; MIPS64-MSA-NEXT: shf.w $w0, $w0, 177 +; MIPS64-MSA-NEXT: bsel.v $w1, $w0, $w2 +; MIPS64-MSA-NEXT: shf.w $w0, $w1, 177 +; MIPS64-MSA-NEXT: copy_s.d $2, $w0[0] +; MIPS64-MSA-NEXT: jr $ra +; MIPS64-MSA-NEXT: copy_s.d $3, $w0[1] +; +; MIPS32-MSA-LABEL: test_ctselect_v4i32_chain: +; MIPS32-MSA: # %bb.0: +; MIPS32-MSA-NEXT: lw $2, 24($sp) +; MIPS32-MSA-NEXT: insert.w $w1[0], $6 +; MIPS32-MSA-NEXT: lw $1, 28($sp) +; MIPS32-MSA-NEXT: fill.w $w2, $4 +; MIPS32-MSA-NEXT: insert.w $w0[0], $2 +; MIPS32-MSA-NEXT: insert.w $w1[1], $7 +; MIPS32-MSA-NEXT: lw $2, 20($sp) +; MIPS32-MSA-NEXT: slli.w $w2, $w2, 31 +; MIPS32-MSA-NEXT: srai.w $w2, $w2, 31 +; MIPS32-MSA-NEXT: insert.w $w0[1], $1 +; MIPS32-MSA-NEXT: lw $1, 16($sp) +; MIPS32-MSA-NEXT: insert.w $w1[2], $1 +; MIPS32-MSA-NEXT: lw $1, 32($sp) +; MIPS32-MSA-NEXT: insert.w $w0[2], $1 +; MIPS32-MSA-NEXT: lw $1, 36($sp) +; MIPS32-MSA-NEXT: insert.w $w1[3], $2 +; MIPS32-MSA-NEXT: lw $2, 40($sp) +; MIPS32-MSA-NEXT: insert.w $w0[3], $1 +; MIPS32-MSA-NEXT: lw $1, 44($sp) +; MIPS32-MSA-NEXT: bsel.v $w2, $w0, $w1 +; MIPS32-MSA-NEXT: insert.w $w0[0], $2 +; MIPS32-MSA-NEXT: fill.w $w1, $5 +; MIPS32-MSA-NEXT: insert.w $w0[1], $1 +; MIPS32-MSA-NEXT: lw $1, 48($sp) +; MIPS32-MSA-NEXT: slli.w $w1, 
$w1, 31 +; MIPS32-MSA-NEXT: srai.w $w1, $w1, 31 +; MIPS32-MSA-NEXT: insert.w $w0[2], $1 +; MIPS32-MSA-NEXT: lw $1, 52($sp) +; MIPS32-MSA-NEXT: insert.w $w0[3], $1 +; MIPS32-MSA-NEXT: bsel.v $w1, $w0, $w2 +; MIPS32-MSA-NEXT: copy_s.w $2, $w1[0] +; MIPS32-MSA-NEXT: copy_s.w $3, $w1[1] +; MIPS32-MSA-NEXT: copy_s.w $4, $w1[2] +; MIPS32-MSA-NEXT: jr $ra +; MIPS32-MSA-NEXT: copy_s.w $5, $w1[3] + %tmp = call <4 x i32> @llvm.ct.select.v4i32(i1 %cond1, <4 x i32> %a, <4 x i32> %b) + %result = call <4 x i32> @llvm.ct.select.v4i32(i1 %cond2, <4 x i32> %tmp, <4 x i32> %c) + ret <4 x i32> %result +} + +; Test with arithmetic operations (ensure float vectors work with FP ops) +define <4 x float> @test_ctselect_v4f32_arithmetic(i1 %cond, <4 x float> %x, <4 x float> %y) { +; MIPS64-MSA-LABEL: test_ctselect_v4f32_arithmetic: +; MIPS64-MSA: # %bb.0: +; MIPS64-MSA-NEXT: insert.d $w0[0], $7 +; MIPS64-MSA-NEXT: insert.d $w1[0], $5 +; MIPS64-MSA-NEXT: sll $1, $4, 0 +; MIPS64-MSA-NEXT: insert.d $w0[1], $8 +; MIPS64-MSA-NEXT: insert.d $w1[1], $6 +; MIPS64-MSA-NEXT: shf.w $w0, $w0, 177 +; MIPS64-MSA-NEXT: shf.w $w1, $w1, 177 +; MIPS64-MSA-NEXT: fsub.w $w2, $w1, $w0 +; MIPS64-MSA-NEXT: fadd.w $w0, $w1, $w0 +; MIPS64-MSA-NEXT: fill.w $w1, $1 +; MIPS64-MSA-NEXT: slli.w $w1, $w1, 31 +; MIPS64-MSA-NEXT: srai.w $w1, $w1, 31 +; MIPS64-MSA-NEXT: bsel.v $w1, $w2, $w0 +; MIPS64-MSA-NEXT: shf.w $w0, $w1, 177 +; MIPS64-MSA-NEXT: copy_s.d $2, $w0[0] +; MIPS64-MSA-NEXT: jr $ra +; MIPS64-MSA-NEXT: copy_s.d $3, $w0[1] +; +; MIPS32-MSA-LABEL: test_ctselect_v4f32_arithmetic: +; MIPS32-MSA: # %bb.0: +; MIPS32-MSA-NEXT: lw $2, 24($sp) +; MIPS32-MSA-NEXT: lw $1, 28($sp) +; MIPS32-MSA-NEXT: insert.w $w1[0], $6 +; MIPS32-MSA-NEXT: insert.w $w0[0], $2 +; MIPS32-MSA-NEXT: insert.w $w1[1], $7 +; MIPS32-MSA-NEXT: insert.w $w0[1], $1 +; MIPS32-MSA-NEXT: lw $1, 32($sp) +; MIPS32-MSA-NEXT: insert.w $w0[2], $1 +; MIPS32-MSA-NEXT: lw $1, 36($sp) +; MIPS32-MSA-NEXT: insert.w $w0[3], $1 +; MIPS32-MSA-NEXT: lw $1, 16($sp) +; MIPS32-MSA-NEXT: insert.w $w1[2], $1 +; MIPS32-MSA-NEXT: lw $1, 20($sp) +; MIPS32-MSA-NEXT: insert.w $w1[3], $1 +; MIPS32-MSA-NEXT: fsub.w $w2, $w1, $w0 +; MIPS32-MSA-NEXT: fadd.w $w0, $w1, $w0 +; MIPS32-MSA-NEXT: fill.w $w1, $5 +; MIPS32-MSA-NEXT: slli.w $w1, $w1, 31 +; MIPS32-MSA-NEXT: srai.w $w1, $w1, 31 +; MIPS32-MSA-NEXT: bsel.v $w1, $w2, $w0 +; MIPS32-MSA-NEXT: jr $ra +; MIPS32-MSA-NEXT: st.w $w1, 0($4) + %sum = fadd <4 x float> %x, %y + %diff = fsub <4 x float> %x, %y + %result = call <4 x float> @llvm.ct.select.v4f32(i1 %cond, <4 x float> %sum, <4 x float> %diff) + ret <4 x float> %result +} + +; Test with mixed operations (load, compute, select, store) +define void @test_ctselect_v4i32_mixed(i1 %cond, ptr %p1, ptr %p2, ptr %out) { +; MIPS64-MSA-LABEL: test_ctselect_v4i32_mixed: +; MIPS64-MSA: # %bb.0: +; MIPS64-MSA-NEXT: sll $1, $4, 0 +; MIPS64-MSA-NEXT: ld.w $w0, 0($5) +; MIPS64-MSA-NEXT: ld.w $w1, 0($6) +; MIPS64-MSA-NEXT: fill.w $w2, $1 +; MIPS64-MSA-NEXT: addvi.w $w0, $w0, 1 +; MIPS64-MSA-NEXT: addvi.w $w1, $w1, 2 +; MIPS64-MSA-NEXT: slli.w $w2, $w2, 31 +; MIPS64-MSA-NEXT: srai.w $w2, $w2, 31 +; MIPS64-MSA-NEXT: bsel.v $w2, $w1, $w0 +; MIPS64-MSA-NEXT: jr $ra +; MIPS64-MSA-NEXT: st.w $w2, 0($7) +; +; MIPS32-MSA-LABEL: test_ctselect_v4i32_mixed: +; MIPS32-MSA: # %bb.0: +; MIPS32-MSA-NEXT: ld.w $w0, 0($5) +; MIPS32-MSA-NEXT: ld.w $w1, 0($6) +; MIPS32-MSA-NEXT: fill.w $w2, $4 +; MIPS32-MSA-NEXT: slli.w $w2, $w2, 31 +; MIPS32-MSA-NEXT: addvi.w $w0, $w0, 1 +; MIPS32-MSA-NEXT: addvi.w $w1, $w1, 2 +; MIPS32-MSA-NEXT: 
srai.w $w2, $w2, 31 +; MIPS32-MSA-NEXT: bsel.v $w2, $w1, $w0 +; MIPS32-MSA-NEXT: jr $ra +; MIPS32-MSA-NEXT: st.w $w2, 0($7) + %a = load <4 x i32>, ptr %p1, align 16 + %b = load <4 x i32>, ptr %p2, align 16 + %a_plus_1 = add <4 x i32> %a, <i32 1, i32 1, i32 1, i32 1> + %b_plus_2 = add <4 x i32> %b, <i32 2, i32 2, i32 2, i32 2> + %result = call <4 x i32> @llvm.ct.select.v4i32(i1 %cond, <4 x i32> %a_plus_1, <4 x i32> %b_plus_2) + store <4 x i32> %result, ptr %out, align 16 + ret void +} + +; Test with function arguments directly (no loads) +define <4 x i32> @test_ctselect_v4i32_args(i1 %cond, <4 x i32> %a, <4 x i32> %b) nounwind { +; MIPS64-MSA-LABEL: test_ctselect_v4i32_args: +; MIPS64-MSA: # %bb.0: +; MIPS64-MSA-NEXT: insert.d $w0[0], $7 +; MIPS64-MSA-NEXT: insert.d $w1[0], $5 +; MIPS64-MSA-NEXT: sll $1, $4, 0 +; MIPS64-MSA-NEXT: fill.w $w2, $1 +; MIPS64-MSA-NEXT: insert.d $w0[1], $8 +; MIPS64-MSA-NEXT: insert.d $w1[1], $6 +; MIPS64-MSA-NEXT: slli.w $w2, $w2, 31 +; MIPS64-MSA-NEXT: shf.w $w0, $w0, 177 +; MIPS64-MSA-NEXT: shf.w $w1, $w1, 177 +; MIPS64-MSA-NEXT: srai.w $w2, $w2, 31 +; MIPS64-MSA-NEXT: bsel.v $w2, $w0, $w1 +; MIPS64-MSA-NEXT: shf.w $w0, $w2, 177 +; MIPS64-MSA-NEXT: copy_s.d $2, $w0[0] +; MIPS64-MSA-NEXT: jr $ra +; MIPS64-MSA-NEXT: copy_s.d $3, $w0[1] +; +; MIPS32-MSA-LABEL: test_ctselect_v4i32_args: +; MIPS32-MSA: # %bb.0: +; MIPS32-MSA-NEXT: lw $2, 24($sp) +; MIPS32-MSA-NEXT: insert.w $w1[0], $6 +; MIPS32-MSA-NEXT: lw $1, 28($sp) +; MIPS32-MSA-NEXT: fill.w $w2, $4 +; MIPS32-MSA-NEXT: insert.w $w0[0], $2 +; MIPS32-MSA-NEXT: insert.w $w1[1], $7 +; MIPS32-MSA-NEXT: lw $2, 20($sp) +; MIPS32-MSA-NEXT: slli.w $w2, $w2, 31 +; MIPS32-MSA-NEXT: srai.w $w2, $w2, 31 +; MIPS32-MSA-NEXT: insert.w $w0[1], $1 +; MIPS32-MSA-NEXT: lw $1, 16($sp) +; MIPS32-MSA-NEXT: insert.w $w1[2], $1 +; MIPS32-MSA-NEXT: lw $1, 32($sp) +; MIPS32-MSA-NEXT: insert.w $w0[2], $1 +; MIPS32-MSA-NEXT: lw $1, 36($sp) +; MIPS32-MSA-NEXT: insert.w $w1[3], $2 +; MIPS32-MSA-NEXT: insert.w $w0[3], $1 +; MIPS32-MSA-NEXT: bsel.v $w2, $w0, $w1 +; MIPS32-MSA-NEXT: copy_s.w $2, $w2[0] +; MIPS32-MSA-NEXT: copy_s.w $3, $w2[1] +; MIPS32-MSA-NEXT: copy_s.w $4, $w2[2] +; MIPS32-MSA-NEXT: jr $ra +; MIPS32-MSA-NEXT: copy_s.w $5, $w2[3] + %result = call <4 x i32> @llvm.ct.select.v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b) + ret <4 x i32> %result +} + +; Test with multiple uses of result +define <4 x i32> @test_ctselect_v4i32_multi_use(i1 %cond, <4 x i32> %a, <4 x i32> %b) { +; MIPS64-MSA-LABEL: test_ctselect_v4i32_multi_use: +; MIPS64-MSA: # %bb.0: +; MIPS64-MSA-NEXT: insert.d $w0[0], $7 +; MIPS64-MSA-NEXT: insert.d $w1[0], $5 +; MIPS64-MSA-NEXT: sll $1, $4, 0 +; MIPS64-MSA-NEXT: fill.w $w2, $1 +; MIPS64-MSA-NEXT: insert.d $w0[1], $8 +; MIPS64-MSA-NEXT: insert.d $w1[1], $6 +; MIPS64-MSA-NEXT: slli.w $w2, $w2, 31 +; MIPS64-MSA-NEXT: shf.w $w0, $w0, 177 +; MIPS64-MSA-NEXT: shf.w $w1, $w1, 177 +; MIPS64-MSA-NEXT: srai.w $w2, $w2, 31 +; MIPS64-MSA-NEXT: bsel.v $w2, $w0, $w1 +; MIPS64-MSA-NEXT: addv.w $w0, $w2, $w2 +; MIPS64-MSA-NEXT: shf.w $w0, $w0, 177 +; MIPS64-MSA-NEXT: copy_s.d $2, $w0[0] +; MIPS64-MSA-NEXT: jr $ra +; MIPS64-MSA-NEXT: copy_s.d $3, $w0[1] +; +; MIPS32-MSA-LABEL: test_ctselect_v4i32_multi_use: +; MIPS32-MSA: # %bb.0: +; MIPS32-MSA-NEXT: lw $2, 24($sp) +; MIPS32-MSA-NEXT: insert.w $w1[0], $6 +; MIPS32-MSA-NEXT: lw $1, 28($sp) +; MIPS32-MSA-NEXT: fill.w $w2, $4 +; MIPS32-MSA-NEXT: insert.w $w0[0], $2 +; MIPS32-MSA-NEXT: insert.w $w1[1], $7 +; MIPS32-MSA-NEXT: lw $2, 20($sp) +; MIPS32-MSA-NEXT: slli.w $w2, $w2, 31 +; MIPS32-MSA-NEXT: srai.w $w2, $w2, 31 +; MIPS32-MSA-NEXT:
insert.w $w0[1], $1 +; MIPS32-MSA-NEXT: lw $1, 16($sp) +; MIPS32-MSA-NEXT: insert.w $w1[2], $1 +; MIPS32-MSA-NEXT: lw $1, 32($sp) +; MIPS32-MSA-NEXT: insert.w $w0[2], $1 +; MIPS32-MSA-NEXT: lw $1, 36($sp) +; MIPS32-MSA-NEXT: insert.w $w1[3], $2 +; MIPS32-MSA-NEXT: insert.w $w0[3], $1 +; MIPS32-MSA-NEXT: bsel.v $w2, $w0, $w1 +; MIPS32-MSA-NEXT: addv.w $w0, $w2, $w2 +; MIPS32-MSA-NEXT: copy_s.w $2, $w0[0] +; MIPS32-MSA-NEXT: copy_s.w $3, $w0[1] +; MIPS32-MSA-NEXT: copy_s.w $4, $w0[2] +; MIPS32-MSA-NEXT: jr $ra +; MIPS32-MSA-NEXT: copy_s.w $5, $w0[3] + %sel = call <4 x i32> @llvm.ct.select.v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b) + %add = add <4 x i32> %sel, %sel ; Use result twice + ret <4 x i32> %add +} + +declare <4 x i32> @llvm.ct.select.v4i32(i1, <4 x i32>, <4 x i32>) +declare <8 x i16> @llvm.ct.select.v8i16(i1, <8 x i16>, <8 x i16>) +declare <16 x i8> @llvm.ct.select.v16i8(i1, <16 x i8>, <16 x i8>) +declare <2 x i64> @llvm.ct.select.v2i64(i1, <2 x i64>, <2 x i64>) +declare <4 x float> @llvm.ct.select.v4f32(i1, <4 x float>, <4 x float>) +declare <2 x double> @llvm.ct.select.v2f64(i1, <2 x double>, <2 x double>) diff --git a/llvm/test/CodeGen/Mips/ctselect-side-effects.ll b/llvm/test/CodeGen/Mips/ctselect-side-effects.ll index 86b5952ffd19c..3ceffeeaaca05 100644 --- a/llvm/test/CodeGen/Mips/ctselect-side-effects.ll +++ b/llvm/test/CodeGen/Mips/ctselect-side-effects.ll @@ -166,7 +166,8 @@ define i32 @test_normal_ops(i32 %x) { ; This simulates what the reviewer is worried about define i32 @test_xor_with_const_operands() { ; M32-LABEL: test_xor_with_const_operands: -; M32: # %bb.0: M32-NEXT: jr $ra +; M32: # %bb.0: +; M32-NEXT: jr $ra ; M32-NEXT: addiu $2, $zero, 0 ; ; M64-LABEL: test_xor_with_const_operands: diff --git a/llvm/test/CodeGen/WebAssembly/ctselect-fallback-vector.ll b/llvm/test/CodeGen/WebAssembly/ctselect-fallback-vector.ll new file mode 100644 index 0000000000000..daa7370fb481a --- /dev/null +++ b/llvm/test/CodeGen/WebAssembly/ctselect-fallback-vector.ll @@ -0,0 +1,566 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=wasm32-unknown-unknown -O3 -mattr=+simd128 | FileCheck %s --check-prefix=WASM32 +; RUN: llc < %s -mtriple=wasm64-unknown-unknown -O3 -mattr=+simd128 | FileCheck %s --check-prefix=WASM64 + +; Test 32-bit integer vector (4 x i32 = 128-bit) +define <4 x i32> @test_ctselect_v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b) { +; WASM32-LABEL: test_ctselect_v4i32: +; WASM32: .functype test_ctselect_v4i32 (i32, v128, v128) -> (v128) +; WASM32-NEXT: # %bb.0: +; WASM32-NEXT: local.get 1 +; WASM32-NEXT: local.get 2 +; WASM32-NEXT: local.get 0 +; WASM32-NEXT: i32x4.splat +; WASM32-NEXT: i32.const 31 +; WASM32-NEXT: i32x4.shl +; WASM32-NEXT: i32.const 31 +; WASM32-NEXT: i32x4.shr_s +; WASM32-NEXT: v128.bitselect +; WASM32-NEXT: # fallthrough-return +; +; WASM64-LABEL: test_ctselect_v4i32: +; WASM64: .functype test_ctselect_v4i32 (i32, v128, v128) -> (v128) +; WASM64-NEXT: # %bb.0: +; WASM64-NEXT: local.get 1 +; WASM64-NEXT: local.get 2 +; WASM64-NEXT: local.get 0 +; WASM64-NEXT: i32x4.splat +; WASM64-NEXT: i32.const 31 +; WASM64-NEXT: i32x4.shl +; WASM64-NEXT: i32.const 31 +; WASM64-NEXT: i32x4.shr_s +; WASM64-NEXT: v128.bitselect +; WASM64-NEXT: # fallthrough-return + %result = call <4 x i32> @llvm.ct.select.v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b) + ret <4 x i32> %result +} + +; Test 16-bit integer vector (8 x i16 = 128-bit) +define <8 x i16> @test_ctselect_v8i16(i1 %cond, <8 x i16> %a, <8 x 
i16> %b) { +; WASM32-LABEL: test_ctselect_v8i16: +; WASM32: .functype test_ctselect_v8i16 (i32, v128, v128) -> (v128) +; WASM32-NEXT: # %bb.0: +; WASM32-NEXT: local.get 1 +; WASM32-NEXT: local.get 2 +; WASM32-NEXT: local.get 0 +; WASM32-NEXT: i16x8.splat +; WASM32-NEXT: i32.const 15 +; WASM32-NEXT: i16x8.shl +; WASM32-NEXT: i32.const 15 +; WASM32-NEXT: i16x8.shr_s +; WASM32-NEXT: v128.bitselect +; WASM32-NEXT: # fallthrough-return +; +; WASM64-LABEL: test_ctselect_v8i16: +; WASM64: .functype test_ctselect_v8i16 (i32, v128, v128) -> (v128) +; WASM64-NEXT: # %bb.0: +; WASM64-NEXT: local.get 1 +; WASM64-NEXT: local.get 2 +; WASM64-NEXT: local.get 0 +; WASM64-NEXT: i16x8.splat +; WASM64-NEXT: i32.const 15 +; WASM64-NEXT: i16x8.shl +; WASM64-NEXT: i32.const 15 +; WASM64-NEXT: i16x8.shr_s +; WASM64-NEXT: v128.bitselect +; WASM64-NEXT: # fallthrough-return + %result = call <8 x i16> @llvm.ct.select.v8i16(i1 %cond, <8 x i16> %a, <8 x i16> %b) + ret <8 x i16> %result +} + +; Test byte vector (16 x i8 = 128-bit) +define <16 x i8> @test_ctselect_v16i8(i1 %cond, <16 x i8> %a, <16 x i8> %b) { +; WASM32-LABEL: test_ctselect_v16i8: +; WASM32: .functype test_ctselect_v16i8 (i32, v128, v128) -> (v128) +; WASM32-NEXT: # %bb.0: +; WASM32-NEXT: local.get 1 +; WASM32-NEXT: local.get 2 +; WASM32-NEXT: local.get 0 +; WASM32-NEXT: i8x16.splat +; WASM32-NEXT: i32.const 7 +; WASM32-NEXT: i8x16.shl +; WASM32-NEXT: i32.const 7 +; WASM32-NEXT: i8x16.shr_s +; WASM32-NEXT: v128.bitselect +; WASM32-NEXT: # fallthrough-return +; +; WASM64-LABEL: test_ctselect_v16i8: +; WASM64: .functype test_ctselect_v16i8 (i32, v128, v128) -> (v128) +; WASM64-NEXT: # %bb.0: +; WASM64-NEXT: local.get 1 +; WASM64-NEXT: local.get 2 +; WASM64-NEXT: local.get 0 +; WASM64-NEXT: i8x16.splat +; WASM64-NEXT: i32.const 7 +; WASM64-NEXT: i8x16.shl +; WASM64-NEXT: i32.const 7 +; WASM64-NEXT: i8x16.shr_s +; WASM64-NEXT: v128.bitselect +; WASM64-NEXT: # fallthrough-return + %result = call <16 x i8> @llvm.ct.select.v16i8(i1 %cond, <16 x i8> %a, <16 x i8> %b) + ret <16 x i8> %result +} + +; Test 64-bit integer vector (2 x i64 = 128-bit) +define <2 x i64> @test_ctselect_v2i64(i1 %cond, <2 x i64> %a, <2 x i64> %b) { +; WASM32-LABEL: test_ctselect_v2i64: +; WASM32: .functype test_ctselect_v2i64 (i32, v128, v128) -> (v128) +; WASM32-NEXT: # %bb.0: +; WASM32-NEXT: local.get 1 +; WASM32-NEXT: local.get 2 +; WASM32-NEXT: local.get 0 +; WASM32-NEXT: i32x4.splat +; WASM32-NEXT: i32.const 63 +; WASM32-NEXT: i64x2.shl +; WASM32-NEXT: i32.const 63 +; WASM32-NEXT: i64x2.shr_s +; WASM32-NEXT: v128.bitselect +; WASM32-NEXT: # fallthrough-return +; +; WASM64-LABEL: test_ctselect_v2i64: +; WASM64: .functype test_ctselect_v2i64 (i32, v128, v128) -> (v128) +; WASM64-NEXT: # %bb.0: +; WASM64-NEXT: local.get 1 +; WASM64-NEXT: local.get 2 +; WASM64-NEXT: local.get 0 +; WASM64-NEXT: i32x4.splat +; WASM64-NEXT: i32.const 63 +; WASM64-NEXT: i64x2.shl +; WASM64-NEXT: i32.const 63 +; WASM64-NEXT: i64x2.shr_s +; WASM64-NEXT: v128.bitselect +; WASM64-NEXT: # fallthrough-return + %result = call <2 x i64> @llvm.ct.select.v2i64(i1 %cond, <2 x i64> %a, <2 x i64> %b) + ret <2 x i64> %result +} + +; Test single-precision float vector (4 x float = 128-bit) +define <4 x float> @test_ctselect_v4f32(i1 %cond, <4 x float> %a, <4 x float> %b) { +; WASM32-LABEL: test_ctselect_v4f32: +; WASM32: .functype test_ctselect_v4f32 (i32, v128, v128) -> (v128) +; WASM32-NEXT: # %bb.0: +; WASM32-NEXT: local.get 1 +; WASM32-NEXT: local.get 2 +; WASM32-NEXT: local.get 0 +; WASM32-NEXT: i32x4.splat +; 
WASM32-NEXT: i32.const 31 +; WASM32-NEXT: i32x4.shl +; WASM32-NEXT: i32.const 31 +; WASM32-NEXT: i32x4.shr_s +; WASM32-NEXT: v128.bitselect +; WASM32-NEXT: # fallthrough-return +; +; WASM64-LABEL: test_ctselect_v4f32: +; WASM64: .functype test_ctselect_v4f32 (i32, v128, v128) -> (v128) +; WASM64-NEXT: # %bb.0: +; WASM64-NEXT: local.get 1 +; WASM64-NEXT: local.get 2 +; WASM64-NEXT: local.get 0 +; WASM64-NEXT: i32x4.splat +; WASM64-NEXT: i32.const 31 +; WASM64-NEXT: i32x4.shl +; WASM64-NEXT: i32.const 31 +; WASM64-NEXT: i32x4.shr_s +; WASM64-NEXT: v128.bitselect +; WASM64-NEXT: # fallthrough-return + %result = call <4 x float> @llvm.ct.select.v4f32(i1 %cond, <4 x float> %a, <4 x float> %b) + ret <4 x float> %result +} + +; Test double-precision float vector (2 x double = 128-bit) +define <2 x double> @test_ctselect_v2f64(i1 %cond, <2 x double> %a, <2 x double> %b) { +; WASM32-LABEL: test_ctselect_v2f64: +; WASM32: .functype test_ctselect_v2f64 (i32, v128, v128) -> (v128) +; WASM32-NEXT: # %bb.0: +; WASM32-NEXT: local.get 1 +; WASM32-NEXT: local.get 2 +; WASM32-NEXT: local.get 0 +; WASM32-NEXT: i32x4.splat +; WASM32-NEXT: i32.const 63 +; WASM32-NEXT: i64x2.shl +; WASM32-NEXT: i32.const 63 +; WASM32-NEXT: i64x2.shr_s +; WASM32-NEXT: v128.bitselect +; WASM32-NEXT: # fallthrough-return +; +; WASM64-LABEL: test_ctselect_v2f64: +; WASM64: .functype test_ctselect_v2f64 (i32, v128, v128) -> (v128) +; WASM64-NEXT: # %bb.0: +; WASM64-NEXT: local.get 1 +; WASM64-NEXT: local.get 2 +; WASM64-NEXT: local.get 0 +; WASM64-NEXT: i32x4.splat +; WASM64-NEXT: i32.const 63 +; WASM64-NEXT: i64x2.shl +; WASM64-NEXT: i32.const 63 +; WASM64-NEXT: i64x2.shr_s +; WASM64-NEXT: v128.bitselect +; WASM64-NEXT: # fallthrough-return + %result = call <2 x double> @llvm.ct.select.v2f64(i1 %cond, <2 x double> %a, <2 x double> %b) + ret <2 x double> %result +} + +; Test with aligned loads (common case) +define <4 x i32> @test_ctselect_v4i32_aligned_load(i1 %cond, ptr %p1, ptr %p2) { +; WASM32-LABEL: test_ctselect_v4i32_aligned_load: +; WASM32: .functype test_ctselect_v4i32_aligned_load (i32, i32, i32) -> (v128) +; WASM32-NEXT: # %bb.0: +; WASM32-NEXT: local.get 1 +; WASM32-NEXT: v128.load 0 +; WASM32-NEXT: local.get 2 +; WASM32-NEXT: v128.load 0 +; WASM32-NEXT: local.get 0 +; WASM32-NEXT: i32x4.splat +; WASM32-NEXT: i32.const 31 +; WASM32-NEXT: i32x4.shl +; WASM32-NEXT: i32.const 31 +; WASM32-NEXT: i32x4.shr_s +; WASM32-NEXT: v128.bitselect +; WASM32-NEXT: # fallthrough-return +; +; WASM64-LABEL: test_ctselect_v4i32_aligned_load: +; WASM64: .functype test_ctselect_v4i32_aligned_load (i32, i64, i64) -> (v128) +; WASM64-NEXT: # %bb.0: +; WASM64-NEXT: local.get 1 +; WASM64-NEXT: v128.load 0 +; WASM64-NEXT: local.get 2 +; WASM64-NEXT: v128.load 0 +; WASM64-NEXT: local.get 0 +; WASM64-NEXT: i32x4.splat +; WASM64-NEXT: i32.const 31 +; WASM64-NEXT: i32x4.shl +; WASM64-NEXT: i32.const 31 +; WASM64-NEXT: i32x4.shr_s +; WASM64-NEXT: v128.bitselect +; WASM64-NEXT: # fallthrough-return + %a = load <4 x i32>, ptr %p1, align 16 + %b = load <4 x i32>, ptr %p2, align 16 + %result = call <4 x i32> @llvm.ct.select.v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b) + ret <4 x i32> %result +} + +; Test with unaligned loads (stress test) +define <4 x i32> @test_ctselect_v4i32_unaligned_load(i1 %cond, ptr %p1, ptr %p2) { +; WASM32-LABEL: test_ctselect_v4i32_unaligned_load: +; WASM32: .functype test_ctselect_v4i32_unaligned_load (i32, i32, i32) -> (v128) +; WASM32-NEXT: # %bb.0: +; WASM32-NEXT: local.get 1 +; WASM32-NEXT: v128.load 0:p2align=2 +; 
WASM32-NEXT: local.get 2 +; WASM32-NEXT: v128.load 0:p2align=2 +; WASM32-NEXT: local.get 0 +; WASM32-NEXT: i32x4.splat +; WASM32-NEXT: i32.const 31 +; WASM32-NEXT: i32x4.shl +; WASM32-NEXT: i32.const 31 +; WASM32-NEXT: i32x4.shr_s +; WASM32-NEXT: v128.bitselect +; WASM32-NEXT: # fallthrough-return +; +; WASM64-LABEL: test_ctselect_v4i32_unaligned_load: +; WASM64: .functype test_ctselect_v4i32_unaligned_load (i32, i64, i64) -> (v128) +; WASM64-NEXT: # %bb.0: +; WASM64-NEXT: local.get 1 +; WASM64-NEXT: v128.load 0:p2align=2 +; WASM64-NEXT: local.get 2 +; WASM64-NEXT: v128.load 0:p2align=2 +; WASM64-NEXT: local.get 0 +; WASM64-NEXT: i32x4.splat +; WASM64-NEXT: i32.const 31 +; WASM64-NEXT: i32x4.shl +; WASM64-NEXT: i32.const 31 +; WASM64-NEXT: i32x4.shr_s +; WASM64-NEXT: v128.bitselect +; WASM64-NEXT: # fallthrough-return + %a = load <4 x i32>, ptr %p1, align 4 + %b = load <4 x i32>, ptr %p2, align 4 + %result = call <4 x i32> @llvm.ct.select.v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b) + ret <4 x i32> %result +} + +; Test with stores to verify result handling +define void @test_ctselect_v4i32_store(i1 %cond, <4 x i32> %a, <4 x i32> %b, ptr %out) { +; WASM32-LABEL: test_ctselect_v4i32_store: +; WASM32: .functype test_ctselect_v4i32_store (i32, v128, v128, i32) -> () +; WASM32-NEXT: # %bb.0: +; WASM32-NEXT: local.get 3 +; WASM32-NEXT: local.get 1 +; WASM32-NEXT: local.get 2 +; WASM32-NEXT: local.get 0 +; WASM32-NEXT: i32x4.splat +; WASM32-NEXT: i32.const 31 +; WASM32-NEXT: i32x4.shl +; WASM32-NEXT: i32.const 31 +; WASM32-NEXT: i32x4.shr_s +; WASM32-NEXT: v128.bitselect +; WASM32-NEXT: v128.store 0 +; WASM32-NEXT: # fallthrough-return +; +; WASM64-LABEL: test_ctselect_v4i32_store: +; WASM64: .functype test_ctselect_v4i32_store (i32, v128, v128, i64) -> () +; WASM64-NEXT: # %bb.0: +; WASM64-NEXT: local.get 3 +; WASM64-NEXT: local.get 1 +; WASM64-NEXT: local.get 2 +; WASM64-NEXT: local.get 0 +; WASM64-NEXT: i32x4.splat +; WASM64-NEXT: i32.const 31 +; WASM64-NEXT: i32x4.shl +; WASM64-NEXT: i32.const 31 +; WASM64-NEXT: i32x4.shr_s +; WASM64-NEXT: v128.bitselect +; WASM64-NEXT: v128.store 0 +; WASM64-NEXT: # fallthrough-return + %result = call <4 x i32> @llvm.ct.select.v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b) + store <4 x i32> %result, ptr %out, align 16 + ret void +} + +; Test chained selects (multiple conditions) +define <4 x i32> @test_ctselect_v4i32_chain(i1 %cond1, i1 %cond2, <4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; WASM32-LABEL: test_ctselect_v4i32_chain: +; WASM32: .functype test_ctselect_v4i32_chain (i32, i32, v128, v128, v128) -> (v128) +; WASM32-NEXT: # %bb.0: +; WASM32-NEXT: local.get 2 +; WASM32-NEXT: local.get 3 +; WASM32-NEXT: local.get 0 +; WASM32-NEXT: i32x4.splat +; WASM32-NEXT: i32.const 31 +; WASM32-NEXT: i32x4.shl +; WASM32-NEXT: i32.const 31 +; WASM32-NEXT: i32x4.shr_s +; WASM32-NEXT: v128.bitselect +; WASM32-NEXT: local.get 4 +; WASM32-NEXT: local.get 1 +; WASM32-NEXT: i32x4.splat +; WASM32-NEXT: i32.const 31 +; WASM32-NEXT: i32x4.shl +; WASM32-NEXT: i32.const 31 +; WASM32-NEXT: i32x4.shr_s +; WASM32-NEXT: v128.bitselect +; WASM32-NEXT: # fallthrough-return +; +; WASM64-LABEL: test_ctselect_v4i32_chain: +; WASM64: .functype test_ctselect_v4i32_chain (i32, i32, v128, v128, v128) -> (v128) +; WASM64-NEXT: # %bb.0: +; WASM64-NEXT: local.get 2 +; WASM64-NEXT: local.get 3 +; WASM64-NEXT: local.get 0 +; WASM64-NEXT: i32x4.splat +; WASM64-NEXT: i32.const 31 +; WASM64-NEXT: i32x4.shl +; WASM64-NEXT: i32.const 31 +; WASM64-NEXT: i32x4.shr_s +; WASM64-NEXT: v128.bitselect +; 
WASM64-NEXT: local.get 4 +; WASM64-NEXT: local.get 1 +; WASM64-NEXT: i32x4.splat +; WASM64-NEXT: i32.const 31 +; WASM64-NEXT: i32x4.shl +; WASM64-NEXT: i32.const 31 +; WASM64-NEXT: i32x4.shr_s +; WASM64-NEXT: v128.bitselect +; WASM64-NEXT: # fallthrough-return + %tmp = call <4 x i32> @llvm.ct.select.v4i32(i1 %cond1, <4 x i32> %a, <4 x i32> %b) + %result = call <4 x i32> @llvm.ct.select.v4i32(i1 %cond2, <4 x i32> %tmp, <4 x i32> %c) + ret <4 x i32> %result +} + +; Test with arithmetic operations (ensure float vectors work with FP ops) +define <4 x float> @test_ctselect_v4f32_arithmetic(i1 %cond, <4 x float> %x, <4 x float> %y) { +; WASM32-LABEL: test_ctselect_v4f32_arithmetic: +; WASM32: .functype test_ctselect_v4f32_arithmetic (i32, v128, v128) -> (v128) +; WASM32-NEXT: # %bb.0: +; WASM32-NEXT: local.get 1 +; WASM32-NEXT: local.get 2 +; WASM32-NEXT: f32x4.add +; WASM32-NEXT: local.get 1 +; WASM32-NEXT: local.get 2 +; WASM32-NEXT: f32x4.sub +; WASM32-NEXT: local.get 0 +; WASM32-NEXT: i32x4.splat +; WASM32-NEXT: i32.const 31 +; WASM32-NEXT: i32x4.shl +; WASM32-NEXT: i32.const 31 +; WASM32-NEXT: i32x4.shr_s +; WASM32-NEXT: v128.bitselect +; WASM32-NEXT: # fallthrough-return +; +; WASM64-LABEL: test_ctselect_v4f32_arithmetic: +; WASM64: .functype test_ctselect_v4f32_arithmetic (i32, v128, v128) -> (v128) +; WASM64-NEXT: # %bb.0: +; WASM64-NEXT: local.get 1 +; WASM64-NEXT: local.get 2 +; WASM64-NEXT: f32x4.add +; WASM64-NEXT: local.get 1 +; WASM64-NEXT: local.get 2 +; WASM64-NEXT: f32x4.sub +; WASM64-NEXT: local.get 0 +; WASM64-NEXT: i32x4.splat +; WASM64-NEXT: i32.const 31 +; WASM64-NEXT: i32x4.shl +; WASM64-NEXT: i32.const 31 +; WASM64-NEXT: i32x4.shr_s +; WASM64-NEXT: v128.bitselect +; WASM64-NEXT: # fallthrough-return + %sum = fadd <4 x float> %x, %y + %diff = fsub <4 x float> %x, %y + %result = call <4 x float> @llvm.ct.select.v4f32(i1 %cond, <4 x float> %sum, <4 x float> %diff) + ret <4 x float> %result +} + +; Test with zero vectors +define <4 x i32> @test_ctselect_v4i32_zeros(i1 %cond, <4 x i32> %a) { +; WASM32-LABEL: test_ctselect_v4i32_zeros: +; WASM32: .functype test_ctselect_v4i32_zeros (i32, v128) -> (v128) +; WASM32-NEXT: # %bb.0: +; WASM32-NEXT: local.get 0 +; WASM32-NEXT: i32x4.splat +; WASM32-NEXT: i32.const 31 +; WASM32-NEXT: i32x4.shl +; WASM32-NEXT: i32.const 31 +; WASM32-NEXT: i32x4.shr_s +; WASM32-NEXT: local.get 1 +; WASM32-NEXT: v128.and +; WASM32-NEXT: # fallthrough-return +; +; WASM64-LABEL: test_ctselect_v4i32_zeros: +; WASM64: .functype test_ctselect_v4i32_zeros (i32, v128) -> (v128) +; WASM64-NEXT: # %bb.0: +; WASM64-NEXT: local.get 0 +; WASM64-NEXT: i32x4.splat +; WASM64-NEXT: i32.const 31 +; WASM64-NEXT: i32x4.shl +; WASM64-NEXT: i32.const 31 +; WASM64-NEXT: i32x4.shr_s +; WASM64-NEXT: local.get 1 +; WASM64-NEXT: v128.and +; WASM64-NEXT: # fallthrough-return + %result = call <4 x i32> @llvm.ct.select.v4i32(i1 %cond, + <4 x i32> %a, + <4 x i32> zeroinitializer) + ret <4 x i32> %result +} + +; Test with function arguments directly (no loads) +define <4 x i32> @test_ctselect_v4i32_args(i1 %cond, <4 x i32> %a, <4 x i32> %b) nounwind { +; WASM32-LABEL: test_ctselect_v4i32_args: +; WASM32: .functype test_ctselect_v4i32_args (i32, v128, v128) -> (v128) +; WASM32-NEXT: # %bb.0: +; WASM32-NEXT: local.get 1 +; WASM32-NEXT: local.get 2 +; WASM32-NEXT: local.get 0 +; WASM32-NEXT: i32x4.splat +; WASM32-NEXT: i32.const 31 +; WASM32-NEXT: i32x4.shl +; WASM32-NEXT: i32.const 31 +; WASM32-NEXT: i32x4.shr_s +; WASM32-NEXT: v128.bitselect +; WASM32-NEXT: # fallthrough-return +; +; 
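+; The splat/shl/shr_s/bitselect sequence is the wasm SIMD form of the
+; mask select: i32x4.splat broadcasts the condition, shifting left then
+; arithmetic-right by 31 sign-extends bit 0 across each lane to 0 or
+; -1, and v128.bitselect merges the two operands bitwise under that
+; mask, with no branch on the condition.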
WASM64-LABEL: test_ctselect_v4i32_args: +; WASM64: .functype test_ctselect_v4i32_args (i32, v128, v128) -> (v128) +; WASM64-NEXT: # %bb.0: +; WASM64-NEXT: local.get 1 +; WASM64-NEXT: local.get 2 +; WASM64-NEXT: local.get 0 +; WASM64-NEXT: i32x4.splat +; WASM64-NEXT: i32.const 31 +; WASM64-NEXT: i32x4.shl +; WASM64-NEXT: i32.const 31 +; WASM64-NEXT: i32x4.shr_s +; WASM64-NEXT: v128.bitselect +; WASM64-NEXT: # fallthrough-return + %result = call <4 x i32> @llvm.ct.select.v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b) + ret <4 x i32> %result +} + +; Test with multiple uses of result +define <4 x i32> @test_ctselect_v4i32_multi_use(i1 %cond, <4 x i32> %a, <4 x i32> %b) { +; WASM32-LABEL: test_ctselect_v4i32_multi_use: +; WASM32: .functype test_ctselect_v4i32_multi_use (i32, v128, v128) -> (v128) +; WASM32-NEXT: # %bb.0: +; WASM32-NEXT: local.get 1 +; WASM32-NEXT: local.get 2 +; WASM32-NEXT: local.get 0 +; WASM32-NEXT: i32x4.splat +; WASM32-NEXT: i32.const 31 +; WASM32-NEXT: i32x4.shl +; WASM32-NEXT: i32.const 31 +; WASM32-NEXT: i32x4.shr_s +; WASM32-NEXT: v128.bitselect +; WASM32-NEXT: local.tee 2 +; WASM32-NEXT: local.get 2 +; WASM32-NEXT: i32x4.add +; WASM32-NEXT: # fallthrough-return +; +; WASM64-LABEL: test_ctselect_v4i32_multi_use: +; WASM64: .functype test_ctselect_v4i32_multi_use (i32, v128, v128) -> (v128) +; WASM64-NEXT: # %bb.0: +; WASM64-NEXT: local.get 1 +; WASM64-NEXT: local.get 2 +; WASM64-NEXT: local.get 0 +; WASM64-NEXT: i32x4.splat +; WASM64-NEXT: i32.const 31 +; WASM64-NEXT: i32x4.shl +; WASM64-NEXT: i32.const 31 +; WASM64-NEXT: i32x4.shr_s +; WASM64-NEXT: v128.bitselect +; WASM64-NEXT: local.tee 2 +; WASM64-NEXT: local.get 2 +; WASM64-NEXT: i32x4.add +; WASM64-NEXT: # fallthrough-return + %sel = call <4 x i32> @llvm.ct.select.v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b) + %add = add <4 x i32> %sel, %sel ; Use result twice + ret <4 x i32> %add +} + +; Test byte vector with operations +define <16 x i8> @test_ctselect_v16i8_ops(i1 %cond, <16 x i8> %x, <16 x i8> %y) { +; WASM32-LABEL: test_ctselect_v16i8_ops: +; WASM32: .functype test_ctselect_v16i8_ops (i32, v128, v128) -> (v128) +; WASM32-NEXT: # %bb.0: +; WASM32-NEXT: local.get 1 +; WASM32-NEXT: local.get 2 +; WASM32-NEXT: v128.xor +; WASM32-NEXT: local.get 1 +; WASM32-NEXT: local.get 2 +; WASM32-NEXT: v128.and +; WASM32-NEXT: local.get 0 +; WASM32-NEXT: i8x16.splat +; WASM32-NEXT: i32.const 7 +; WASM32-NEXT: i8x16.shl +; WASM32-NEXT: i32.const 7 +; WASM32-NEXT: i8x16.shr_s +; WASM32-NEXT: v128.bitselect +; WASM32-NEXT: # fallthrough-return +; +; WASM64-LABEL: test_ctselect_v16i8_ops: +; WASM64: .functype test_ctselect_v16i8_ops (i32, v128, v128) -> (v128) +; WASM64-NEXT: # %bb.0: +; WASM64-NEXT: local.get 1 +; WASM64-NEXT: local.get 2 +; WASM64-NEXT: v128.xor +; WASM64-NEXT: local.get 1 +; WASM64-NEXT: local.get 2 +; WASM64-NEXT: v128.and +; WASM64-NEXT: local.get 0 +; WASM64-NEXT: i8x16.splat +; WASM64-NEXT: i32.const 7 +; WASM64-NEXT: i8x16.shl +; WASM64-NEXT: i32.const 7 +; WASM64-NEXT: i8x16.shr_s +; WASM64-NEXT: v128.bitselect +; WASM64-NEXT: # fallthrough-return + %xor = xor <16 x i8> %x, %y + %and = and <16 x i8> %x, %y + %result = call <16 x i8> @llvm.ct.select.v16i8(i1 %cond, <16 x i8> %xor, <16 x i8> %and) + ret <16 x i8> %result +} + +declare <4 x i32> @llvm.ct.select.v4i32(i1, <4 x i32>, <4 x i32>) +declare <8 x i16> @llvm.ct.select.v8i16(i1, <8 x i16>, <8 x i16>) +declare <16 x i8> @llvm.ct.select.v16i8(i1, <16 x i8>, <16 x i8>) +declare <2 x i64> @llvm.ct.select.v2i64(i1, <2 x i64>, <2 x i64>) +declare <4 x float> 
@llvm.ct.select.v4f32(i1, <4 x float>, <4 x float>) +declare <2 x double> @llvm.ct.select.v2f64(i1, <2 x double>, <2 x double>) From 2afa2a2ea667ccb23da0a8d1638db5c1cc1b6019 Mon Sep 17 00:00:00 2001 From: wizardengineer Date: Tue, 19 Aug 2025 08:23:34 -0400 Subject: [PATCH 14/63] [CT] Used sext for normalizing the predicate --- .../SelectionDAG/SelectionDAGBuilder.cpp | 33 ++-- .../Mips/ctselect-fallback-edge-cases.ll | 141 ++++++++---------- .../Mips/ctselect-fallback-patterns.ll | 124 +++++++-------- llvm/test/CodeGen/Mips/ctselect-fallback.ll | 67 +++------ .../CodeGen/Mips/ctselect-side-effects.ll | 8 +- .../RISCV/ctselect-fallback-edge-cases.ll | 15 +- .../ctselect-fallback-nonintegral-fail.ll | 18 --- .../RISCV/ctselect-fallback-patterns.ll | 38 ++--- llvm/test/CodeGen/RISCV/ctselect-fallback.ll | 30 +--- .../CodeGen/RISCV/ctselect-side-effects.ll | 1 - .../ctselect-fallback-edge-cases.ll | 47 +----- .../WebAssembly/ctselect-fallback-patterns.ll | 118 ++++++--------- .../CodeGen/WebAssembly/ctselect-fallback.ll | 76 ++-------- .../WebAssembly/ctselect-side-effects.ll | 4 - 14 files changed, 243 insertions(+), 477 deletions(-) delete mode 100644 llvm/test/CodeGen/RISCV/ctselect-fallback-nonintegral-fail.ll diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 6b34cbea723db..31b4b8b8ddde6 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -15,7 +15,6 @@ #include "llvm/ADT/APFloat.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/BitVector.h" -#include "llvm/ADT/BitmaskEnum.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/StringExtras.h" @@ -6490,24 +6489,6 @@ void SelectionDAGBuilder::visitVectorExtractLastActive(const CallInst &I, setValue(&I, Result); } -static SDValue createMask(SelectionDAG &DAG, const SDLoc &DL, SDValue Cond, - EVT WorkingVT, const SDNodeFlags &Flag) { - if (WorkingVT.isVector()) - return DAG.getSExtOrTrunc(Cond, DL, WorkingVT); - - // Extend cond to WorkingVT and normalize to 0 or 1 - if (Cond.getValueType() != WorkingVT) - Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, WorkingVT, Cond, Flag); - - // Normalize - SDValue One = DAG.getConstant(1, DL, WorkingVT); - SDValue Norm = DAG.getNode(ISD::AND, DL, WorkingVT, Cond, One, Flag); - - // Mask = 0 - Norm - SDValue Zero = DAG.getConstant(0, DL, WorkingVT); - return DAG.getNode(ISD::SUB, DL, WorkingVT, Zero, Norm, Flag); -} - /// Fallback implementation is an alternative approach for managing architectures that don't have /// native support for Constant-Time Select. SDValue SelectionDAGBuilder::createProtectedCtSelectFallback( @@ -6542,7 +6523,7 @@ SDValue SelectionDAGBuilder::createProtectedCtSelectFallback( WorkingF = DAG.getBitcast(WorkingVT, F); } - SDValue Mask = createMask(DAG, DL, Cond, WorkingVT, ProtectedFlag); + SDValue Mask = DAG.getSExtOrTrunc(Cond, DL, WorkingVT); SDValue AllOnes = DAG.getAllOnesConstant(DL, WorkingVT); SDValue Invert = DAG.getNode(ISD::XOR, DL, WorkingVT, Mask, AllOnes, ProtectedFlag); @@ -6754,8 +6735,10 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, // For now we'll only support scalar predicates // assert if Cond type is Vector // TODO: Maybe look into supporting vector predicates? 
- assert(!CondVT.isVector() && - "ct.select fallback only supports scalar conditions"); + if (CondVT.isVector()) { + report_fatal_error( + "llvm.ct.select: predicates with vector types not supported yet"); + } // Handle scalar types if (TLI.isSelectSupported( @@ -6779,8 +6762,10 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, // We don't support scalable vector types yet, for now it'll only be // fix-width vector // TODO: Add support for scalable vectors - assert(!VT.isScalableVector() && - "ct.select fallback doesn't supports scalable vectors"); + if (VT.isScalableVector()) { + report_fatal_error( + "llvm.ct.select: fallback doesn't support scalable vectors"); + } setValue(&I, createProtectedCtSelectFallback(DAG, DL, Cond, A, B, VT)); return; diff --git a/llvm/test/CodeGen/Mips/ctselect-fallback-edge-cases.ll b/llvm/test/CodeGen/Mips/ctselect-fallback-edge-cases.ll index c66e0a41644ff..5b4bdffbc76f6 100644 --- a/llvm/test/CodeGen/Mips/ctselect-fallback-edge-cases.ll +++ b/llvm/test/CodeGen/Mips/ctselect-fallback-edge-cases.ll @@ -8,22 +8,20 @@ define i1 @test_ctselect_i1(i1 %cond, i1 %a, i1 %b) { ; M32-LABEL: test_ctselect_i1: ; M32: # %bb.0: -; M32-NEXT: negu $1, $4 -; M32-NEXT: and $2, $1, $5 -; M32-NEXT: xori $1, $1, 1 -; M32-NEXT: and $1, $1, $6 +; M32-NEXT: xori $2, $4, 1 +; M32-NEXT: and $1, $4, $5 +; M32-NEXT: and $2, $2, $6 ; M32-NEXT: jr $ra -; M32-NEXT: or $2, $2, $1 +; M32-NEXT: or $2, $1, $2 ; ; M64-LABEL: test_ctselect_i1: ; M64: # %bb.0: -; M64-NEXT: sll $1, $4, 0 -; M64-NEXT: sll $2, $5, 0 -; M64-NEXT: sll $3, $6, 0 -; M64-NEXT: negu $1, $1 -; M64-NEXT: and $2, $1, $2 -; M64-NEXT: xori $1, $1, 1 -; M64-NEXT: and $1, $1, $3 +; M64-NEXT: sll $2, $4, 0 +; M64-NEXT: sll $1, $6, 0 +; M64-NEXT: xori $2, $2, 1 +; M64-NEXT: and $1, $2, $1 +; M64-NEXT: and $2, $4, $5 +; M64-NEXT: sll $2, $2, 0 ; M64-NEXT: jr $ra ; M64-NEXT: or $2, $2, $1 %result = call i1 @llvm.ct.select.i1(i1 %cond, i1 %a, i1 %b) @@ -37,9 +35,8 @@ define i32 @test_ctselect_extremal_values(i1 %cond) { ; M32-NEXT: andi $1, $4, 1 ; M32-NEXT: lui $2, 32767 ; M32-NEXT: lui $3, 32768 -; M32-NEXT: andi $1, $1, 1 -; M32-NEXT: ori $2, $2, 65535 ; M32-NEXT: negu $1, $1 +; M32-NEXT: ori $2, $2, 65535 ; M32-NEXT: and $2, $1, $2 ; M32-NEXT: not $1, $1 ; M32-NEXT: and $1, $1, $3 @@ -53,7 +50,6 @@ define i32 @test_ctselect_extremal_values(i1 %cond) { ; M64-NEXT: lui $3, 32768 ; M64-NEXT: andi $1, $1, 1 ; M64-NEXT: ori $2, $2, 65535 -; M64-NEXT: andi $1, $1, 1 ; M64-NEXT: negu $1, $1 ; M64-NEXT: and $2, $1, $2 ; M64-NEXT: not $1, $1 @@ -69,7 +65,6 @@ define ptr @test_ctselect_null_ptr(i1 %cond, ptr %ptr) { ; M32-LABEL: test_ctselect_null_ptr: ; M32: # %bb.0: ; M32-NEXT: andi $1, $4, 1 -; M32-NEXT: andi $1, $1, 1 ; M32-NEXT: negu $1, $1 ; M32-NEXT: jr $ra ; M32-NEXT: and $2, $1, $5 ; ; M64-LABEL: test_ctselect_null_ptr: ; M64: # %bb.0: ; M64-NEXT: andi $1, $4, 1 -; M64-NEXT: andi $1, $1, 1 ; M64-NEXT: dnegu $1, $1 ; M64-NEXT: jr $ra ; M64-NEXT: and $2, $1, $5 @@ -90,7 +84,6 @@ define ptr @test_ctselect_function_ptr(i1 %cond, ptr %func1, ptr %func2) { ; M32-LABEL: test_ctselect_function_ptr: ; M32: # %bb.0: ; M32-NEXT: andi $1, $4, 1 -; M32-NEXT: andi $1, $1, 1 ; M32-NEXT: negu $1, $1 ; M32-NEXT: and $2, $1, $5 ; M32-NEXT: not $1, $1 @@ -102,7 +95,6 @@ define ptr @test_ctselect_function_ptr(i1 %cond, ptr %func1, ptr %func2) { ; M64: # %bb.0: ; M64-NEXT: andi $1, $4, 1 ; M64-NEXT: daddiu $3, $zero, -1 -; M64-NEXT: andi $1, $1, 1 ; M64-NEXT: 
dnegu $1, $1 ; M64-NEXT: and $2, $1, $5 ; M64-NEXT: xor $1, $1, $3 @@ -118,8 +110,8 @@ define ptr @test_ctselect_ptr_cmp(ptr %p1, ptr %p2, ptr %a, ptr %b) { ; M32-LABEL: test_ctselect_ptr_cmp: ; M32: # %bb.0: ; M32-NEXT: xor $1, $4, $5 -; M32-NEXT: sltiu $1, $1, 1 -; M32-NEXT: negu $1, $1 +; M32-NEXT: sltu $1, $zero, $1 +; M32-NEXT: addiu $1, $1, -1 ; M32-NEXT: and $2, $1, $6 ; M32-NEXT: not $1, $1 ; M32-NEXT: and $1, $1, $7 @@ -129,17 +121,14 @@ define ptr @test_ctselect_ptr_cmp(ptr %p1, ptr %p2, ptr %a, ptr %b) { ; M64-LABEL: test_ctselect_ptr_cmp: ; M64: # %bb.0: ; M64-NEXT: xor $1, $4, $5 +; M64-NEXT: daddiu $3, $zero, -1 ; M64-NEXT: daddiu $2, $zero, -1 -; M64-NEXT: sltiu $1, $1, 1 -; M64-NEXT: dsll $1, $1, 32 -; M64-NEXT: dsrl $1, $1, 32 -; M64-NEXT: andi $1, $1, 1 -; M64-NEXT: dnegu $1, $1 -; M64-NEXT: and $3, $1, $6 -; M64-NEXT: xor $1, $1, $2 -; M64-NEXT: and $1, $1, $7 +; M64-NEXT: movn $3, $zero, $1 +; M64-NEXT: xor $2, $3, $2 +; M64-NEXT: and $1, $3, $6 +; M64-NEXT: and $2, $2, $7 ; M64-NEXT: jr $ra -; M64-NEXT: or $2, $3, $1 +; M64-NEXT: or $2, $1, $2 %cmp = icmp eq ptr %p1, %p2 %result = call ptr @llvm.ct.select.p0(i1 %cmp, ptr %a, ptr %b) ret ptr %result @@ -152,7 +141,6 @@ define ptr @test_ctselect_struct_ptr(i1 %cond, ptr %a, ptr %b) { ; M32-LABEL: test_ctselect_struct_ptr: ; M32: # %bb.0: ; M32-NEXT: andi $1, $4, 1 -; M32-NEXT: andi $1, $1, 1 ; M32-NEXT: negu $1, $1 ; M32-NEXT: and $2, $1, $5 ; M32-NEXT: not $1, $1 @@ -164,7 +152,6 @@ define ptr @test_ctselect_struct_ptr(i1 %cond, ptr %a, ptr %b) { ; M64: # %bb.0: ; M64-NEXT: andi $1, $4, 1 ; M64-NEXT: daddiu $3, $zero, -1 -; M64-NEXT: andi $1, $1, 1 ; M64-NEXT: dnegu $1, $1 ; M64-NEXT: and $2, $1, $5 ; M64-NEXT: xor $1, $1, $3 @@ -182,76 +169,68 @@ define i32 @test_ctselect_deeply_nested(i1 %c1, i1 %c2, i1 %c3, i1 %c4, i32 %a, ; M32-NEXT: andi $1, $4, 1 ; M32-NEXT: lw $2, 16($sp) ; M32-NEXT: lw $3, 20($sp) -; M32-NEXT: andi $4, $6, 1 -; M32-NEXT: lw $6, 32($sp) -; M32-NEXT: andi $1, $1, 1 -; M32-NEXT: andi $4, $4, 1 +; M32-NEXT: lw $4, 24($sp) ; M32-NEXT: negu $1, $1 -; M32-NEXT: negu $4, $4 ; M32-NEXT: and $2, $1, $2 ; M32-NEXT: not $1, $1 ; M32-NEXT: and $1, $1, $3 -; M32-NEXT: andi $3, $7, 1 -; M32-NEXT: lw $7, 24($sp) -; M32-NEXT: andi $3, $3, 1 ; M32-NEXT: or $1, $2, $1 -; M32-NEXT: negu $2, $3 -; M32-NEXT: andi $3, $5, 1 -; M32-NEXT: andi $3, $3, 1 -; M32-NEXT: not $5, $2 +; M32-NEXT: andi $2, $5, 1 +; M32-NEXT: negu $2, $2 +; M32-NEXT: not $3, $2 +; M32-NEXT: and $1, $2, $1 +; M32-NEXT: and $2, $3, $4 +; M32-NEXT: andi $4, $6, 1 +; M32-NEXT: andi $3, $7, 1 +; M32-NEXT: lw $6, 32($sp) +; M32-NEXT: negu $4, $4 +; M32-NEXT: or $1, $1, $2 ; M32-NEXT: negu $3, $3 -; M32-NEXT: and $1, $3, $1 -; M32-NEXT: not $3, $3 -; M32-NEXT: and $3, $3, $7 -; M32-NEXT: or $1, $1, $3 -; M32-NEXT: not $3, $4 ; M32-NEXT: and $1, $4, $1 +; M32-NEXT: not $2, $4 ; M32-NEXT: lw $4, 28($sp) -; M32-NEXT: and $3, $3, $4 -; M32-NEXT: or $1, $1, $3 -; M32-NEXT: and $1, $2, $1 +; M32-NEXT: not $5, $3 +; M32-NEXT: and $2, $2, $4 +; M32-NEXT: or $1, $1, $2 ; M32-NEXT: and $2, $5, $6 +; M32-NEXT: and $1, $3, $1 ; M32-NEXT: jr $ra ; M32-NEXT: or $2, $1, $2 ; ; M64-LABEL: test_ctselect_deeply_nested: ; M64: # %bb.0: -; M64-NEXT: sll $2, $4, 0 -; M64-NEXT: sll $4, $5, 0 -; M64-NEXT: sll $3, $8, 0 -; M64-NEXT: sll $5, $9, 0 -; M64-NEXT: sll $6, $6, 0 -; M64-NEXT: sll $1, $7, 0 -; M64-NEXT: lw $8, 0($sp) -; M64-NEXT: andi $2, $2, 1 -; M64-NEXT: andi $4, $4, 1 -; M64-NEXT: andi $6, $6, 1 +; M64-NEXT: sll $1, $4, 0 +; M64-NEXT: sll $2, $7, 0 +; M64-NEXT: 
sll $5, $5, 0 +; M64-NEXT: sll $4, $9, 0 +; M64-NEXT: sll $7, $8, 0 ; M64-NEXT: andi $1, $1, 1 +; M64-NEXT: andi $5, $5, 1 ; M64-NEXT: andi $2, $2, 1 -; M64-NEXT: andi $4, $4, 1 -; M64-NEXT: andi $6, $6, 1 -; M64-NEXT: andi $1, $1, 1 +; M64-NEXT: negu $1, $1 +; M64-NEXT: negu $5, $5 ; M64-NEXT: negu $2, $2 +; M64-NEXT: not $3, $1 +; M64-NEXT: and $1, $1, $7 +; M64-NEXT: lw $7, 0($sp) +; M64-NEXT: and $3, $3, $4 +; M64-NEXT: sll $4, $6, 0 +; M64-NEXT: not $6, $2 +; M64-NEXT: or $1, $1, $3 +; M64-NEXT: not $3, $5 +; M64-NEXT: andi $4, $4, 1 +; M64-NEXT: and $1, $5, $1 +; M64-NEXT: sll $5, $10, 0 ; M64-NEXT: negu $4, $4 -; M64-NEXT: negu $6, $6 -; M64-NEXT: negu $1, $1 -; M64-NEXT: and $3, $2, $3 -; M64-NEXT: not $2, $2 -; M64-NEXT: not $7, $1 -; M64-NEXT: and $2, $2, $5 -; M64-NEXT: or $2, $3, $2 +; M64-NEXT: and $3, $3, $5 +; M64-NEXT: or $1, $1, $3 ; M64-NEXT: not $3, $4 -; M64-NEXT: and $2, $4, $2 -; M64-NEXT: sll $4, $10, 0 -; M64-NEXT: and $3, $3, $4 +; M64-NEXT: and $1, $4, $1 ; M64-NEXT: sll $4, $11, 0 -; M64-NEXT: or $2, $2, $3 -; M64-NEXT: not $3, $6 -; M64-NEXT: and $2, $6, $2 ; M64-NEXT: and $3, $3, $4 -; M64-NEXT: or $2, $2, $3 -; M64-NEXT: and $1, $1, $2 -; M64-NEXT: and $2, $7, $8 +; M64-NEXT: or $1, $1, $3 +; M64-NEXT: and $1, $2, $1 +; M64-NEXT: and $2, $6, $7 ; M64-NEXT: jr $ra ; M64-NEXT: or $2, $1, $2 %sel1 = call i32 @llvm.ct.select.i32(i1 %c1, i32 %a, i32 %b) diff --git a/llvm/test/CodeGen/Mips/ctselect-fallback-patterns.ll b/llvm/test/CodeGen/Mips/ctselect-fallback-patterns.ll index 46c74b1d3db36..e195bdb369dae 100644 --- a/llvm/test/CodeGen/Mips/ctselect-fallback-patterns.ll +++ b/llvm/test/CodeGen/Mips/ctselect-fallback-patterns.ll @@ -6,16 +6,14 @@ define i32 @test_ctselect_smin_zero(i32 %x) { ; M32-LABEL: test_ctselect_smin_zero: ; M32: # %bb.0: -; M32-NEXT: slti $1, $4, 0 -; M32-NEXT: negu $1, $1 +; M32-NEXT: sra $1, $4, 31 ; M32-NEXT: jr $ra ; M32-NEXT: and $2, $1, $4 ; ; M64-LABEL: test_ctselect_smin_zero: ; M64: # %bb.0: ; M64-NEXT: sll $1, $4, 0 -; M64-NEXT: slti $2, $1, 0 -; M64-NEXT: negu $2, $2 +; M64-NEXT: sra $2, $1, 31 ; M64-NEXT: jr $ra ; M64-NEXT: and $2, $2, $1 %cmp = icmp slt i32 %x, 0 @@ -27,16 +25,16 @@ define i32 @test_ctselect_smin_zero(i32 %x) { define i32 @test_ctselect_smax_zero(i32 %x) { ; M32-LABEL: test_ctselect_smax_zero: ; M32: # %bb.0: -; M32-NEXT: slt $1, $zero, $4 -; M32-NEXT: negu $1, $1 +; M32-NEXT: slti $1, $4, 1 +; M32-NEXT: addiu $1, $1, -1 ; M32-NEXT: jr $ra ; M32-NEXT: and $2, $1, $4 ; ; M64-LABEL: test_ctselect_smax_zero: ; M64: # %bb.0: ; M64-NEXT: sll $1, $4, 0 -; M64-NEXT: slt $2, $zero, $1 -; M64-NEXT: negu $2, $2 +; M64-NEXT: slti $2, $1, 1 +; M64-NEXT: addiu $2, $2, -1 ; M64-NEXT: jr $ra ; M64-NEXT: and $2, $2, $1 %cmp = icmp sgt i32 %x, 0 @@ -49,7 +47,8 @@ define i32 @test_ctselect_smin_generic(i32 %x, i32 %y) { ; M32-LABEL: test_ctselect_smin_generic: ; M32: # %bb.0: ; M32-NEXT: slt $1, $4, $5 -; M32-NEXT: negu $1, $1 +; M32-NEXT: xori $1, $1, 1 +; M32-NEXT: addiu $1, $1, -1 ; M32-NEXT: and $2, $1, $4 ; M32-NEXT: not $1, $1 ; M32-NEXT: and $1, $1, $5 @@ -61,7 +60,8 @@ define i32 @test_ctselect_smin_generic(i32 %x, i32 %y) { ; M64-NEXT: sll $1, $5, 0 ; M64-NEXT: sll $2, $4, 0 ; M64-NEXT: slt $3, $2, $1 -; M64-NEXT: negu $3, $3 +; M64-NEXT: xori $3, $3, 1 +; M64-NEXT: addiu $3, $3, -1 ; M64-NEXT: and $2, $3, $2 ; M64-NEXT: not $3, $3 ; M64-NEXT: and $1, $3, $1 @@ -77,7 +77,8 @@ define i32 @test_ctselect_smax_generic(i32 %x, i32 %y) { ; M32-LABEL: test_ctselect_smax_generic: ; M32: # %bb.0: ; M32-NEXT: slt $1, $5, $4 -; 
M32-NEXT: negu $1, $1 +; M32-NEXT: xori $1, $1, 1 +; M32-NEXT: addiu $1, $1, -1 ; M32-NEXT: and $2, $1, $4 ; M32-NEXT: not $1, $1 ; M32-NEXT: and $1, $1, $5 @@ -89,7 +90,8 @@ define i32 @test_ctselect_smax_generic(i32 %x, i32 %y) { ; M64-NEXT: sll $1, $4, 0 ; M64-NEXT: sll $2, $5, 0 ; M64-NEXT: slt $3, $2, $1 -; M64-NEXT: negu $3, $3 +; M64-NEXT: xori $3, $3, 1 +; M64-NEXT: addiu $3, $3, -1 ; M64-NEXT: and $1, $3, $1 ; M64-NEXT: not $3, $3 ; M64-NEXT: and $2, $3, $2 @@ -105,7 +107,8 @@ define i32 @test_ctselect_umin_generic(i32 %x, i32 %y) { ; M32-LABEL: test_ctselect_umin_generic: ; M32: # %bb.0: ; M32-NEXT: sltu $1, $4, $5 -; M32-NEXT: negu $1, $1 +; M32-NEXT: xori $1, $1, 1 +; M32-NEXT: addiu $1, $1, -1 ; M32-NEXT: and $2, $1, $4 ; M32-NEXT: not $1, $1 ; M32-NEXT: and $1, $1, $5 @@ -117,7 +120,8 @@ define i32 @test_ctselect_umin_generic(i32 %x, i32 %y) { ; M64-NEXT: sll $1, $5, 0 ; M64-NEXT: sll $2, $4, 0 ; M64-NEXT: sltu $3, $2, $1 -; M64-NEXT: negu $3, $3 +; M64-NEXT: xori $3, $3, 1 +; M64-NEXT: addiu $3, $3, -1 ; M64-NEXT: and $2, $3, $2 ; M64-NEXT: not $3, $3 ; M64-NEXT: and $1, $3, $1 @@ -133,7 +137,8 @@ define i32 @test_ctselect_umax_generic(i32 %x, i32 %y) { ; M32-LABEL: test_ctselect_umax_generic: ; M32: # %bb.0: ; M32-NEXT: sltu $1, $5, $4 -; M32-NEXT: negu $1, $1 +; M32-NEXT: xori $1, $1, 1 +; M32-NEXT: addiu $1, $1, -1 ; M32-NEXT: and $2, $1, $4 ; M32-NEXT: not $1, $1 ; M32-NEXT: and $1, $1, $5 @@ -145,7 +150,8 @@ define i32 @test_ctselect_umax_generic(i32 %x, i32 %y) { ; M64-NEXT: sll $1, $4, 0 ; M64-NEXT: sll $2, $5, 0 ; M64-NEXT: sltu $3, $2, $1 -; M64-NEXT: negu $3, $3 +; M64-NEXT: xori $3, $3, 1 +; M64-NEXT: addiu $3, $3, -1 ; M64-NEXT: and $1, $3, $1 ; M64-NEXT: not $3, $3 ; M64-NEXT: and $2, $3, $2 @@ -160,9 +166,8 @@ define i32 @test_ctselect_umax_generic(i32 %x, i32 %y) { define i32 @test_ctselect_abs(i32 %x) { ; M32-LABEL: test_ctselect_abs: ; M32: # %bb.0: -; M32-NEXT: slti $2, $4, 0 ; M32-NEXT: negu $1, $4 -; M32-NEXT: negu $2, $2 +; M32-NEXT: sra $2, $4, 31 ; M32-NEXT: and $1, $2, $1 ; M32-NEXT: not $2, $2 ; M32-NEXT: and $2, $2, $4 @@ -172,9 +177,8 @@ define i32 @test_ctselect_abs(i32 %x) { ; M64-LABEL: test_ctselect_abs: ; M64: # %bb.0: ; M64-NEXT: sll $1, $4, 0 -; M64-NEXT: slti $3, $1, 0 ; M64-NEXT: negu $2, $1 -; M64-NEXT: negu $3, $3 +; M64-NEXT: sra $3, $1, 31 ; M64-NEXT: and $2, $3, $2 ; M64-NEXT: not $3, $3 ; M64-NEXT: and $1, $3, $1 @@ -190,26 +194,24 @@ define i32 @test_ctselect_abs(i32 %x) { define i32 @test_ctselect_nabs(i32 %x) { ; M32-LABEL: test_ctselect_nabs: ; M32: # %bb.0: -; M32-NEXT: slti $2, $4, 0 -; M32-NEXT: negu $1, $4 -; M32-NEXT: negu $2, $2 -; M32-NEXT: not $3, $2 -; M32-NEXT: and $2, $2, $4 -; M32-NEXT: and $1, $3, $1 +; M32-NEXT: sra $1, $4, 31 +; M32-NEXT: negu $3, $4 +; M32-NEXT: and $2, $1, $4 +; M32-NEXT: not $1, $1 +; M32-NEXT: and $1, $1, $3 ; M32-NEXT: jr $ra ; M32-NEXT: or $2, $2, $1 ; ; M64-LABEL: test_ctselect_nabs: ; M64: # %bb.0: ; M64-NEXT: sll $1, $4, 0 -; M64-NEXT: slti $3, $1, 0 -; M64-NEXT: negu $2, $1 -; M64-NEXT: negu $3, $3 -; M64-NEXT: not $4, $3 -; M64-NEXT: and $1, $3, $1 -; M64-NEXT: and $2, $4, $2 +; M64-NEXT: sra $2, $1, 31 +; M64-NEXT: and $3, $2, $1 +; M64-NEXT: negu $1, $1 +; M64-NEXT: not $2, $2 +; M64-NEXT: and $1, $2, $1 ; M64-NEXT: jr $ra -; M64-NEXT: or $2, $1, $2 +; M64-NEXT: or $2, $3, $1 %neg = sub i32 0, %x %cmp = icmp slt i32 %x, 0 %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 %neg) @@ -220,16 +222,14 @@ define i32 @test_ctselect_nabs(i32 %x) { define i32 
@test_ctselect_sign_extend(i32 %x) { ; M32-LABEL: test_ctselect_sign_extend: ; M32: # %bb.0: -; M32-NEXT: slti $1, $4, 0 ; M32-NEXT: jr $ra -; M32-NEXT: negu $2, $1 +; M32-NEXT: sra $2, $4, 31 ; ; M64-LABEL: test_ctselect_sign_extend: ; M64: # %bb.0: ; M64-NEXT: sll $1, $4, 0 -; M64-NEXT: slti $1, $1, 0 ; M64-NEXT: jr $ra -; M64-NEXT: negu $2, $1 +; M64-NEXT: sra $2, $1, 31 %cmp = icmp slt i32 %x, 0 %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 -1, i32 0) ret i32 %result @@ -239,16 +239,16 @@ define i32 @test_ctselect_sign_extend(i32 %x) { define i32 @test_ctselect_zero_extend(i32 %x) { ; M32-LABEL: test_ctselect_zero_extend: ; M32: # %bb.0: -; M32-NEXT: sltu $1, $zero, $4 -; M32-NEXT: negu $1, $1 +; M32-NEXT: sltiu $1, $4, 1 +; M32-NEXT: addiu $1, $1, -1 ; M32-NEXT: jr $ra ; M32-NEXT: andi $2, $1, 1 ; ; M64-LABEL: test_ctselect_zero_extend: ; M64: # %bb.0: ; M64-NEXT: sll $1, $4, 0 -; M64-NEXT: sltu $1, $zero, $1 -; M64-NEXT: negu $1, $1 +; M64-NEXT: sltiu $1, $1, 1 +; M64-NEXT: addiu $1, $1, -1 ; M64-NEXT: jr $ra ; M64-NEXT: andi $2, $1, 1 %cmp = icmp ne i32 %x, 0 @@ -290,7 +290,6 @@ define i32 @test_ctselect_identical_operands(i1 %cond, i32 %x) { ; M32-LABEL: test_ctselect_identical_operands: ; M32: # %bb.0: ; M32-NEXT: andi $1, $4, 1 -; M32-NEXT: andi $1, $1, 1 ; M32-NEXT: negu $1, $1 ; M32-NEXT: and $2, $1, $5 ; M32-NEXT: not $1, $1 @@ -303,7 +302,6 @@ define i32 @test_ctselect_identical_operands(i1 %cond, i32 %x) { ; M64-NEXT: sll $1, $4, 0 ; M64-NEXT: sll $2, $5, 0 ; M64-NEXT: andi $1, $1, 1 -; M64-NEXT: andi $1, $1, 1 ; M64-NEXT: negu $1, $1 ; M64-NEXT: and $3, $1, $2 ; M64-NEXT: not $1, $1 @@ -319,8 +317,8 @@ define i32 @test_ctselect_inverted_condition(i32 %x, i32 %y, i32 %a, i32 %b) { ; M32-LABEL: test_ctselect_inverted_condition: ; M32: # %bb.0: ; M32-NEXT: xor $1, $4, $5 -; M32-NEXT: sltu $1, $zero, $1 -; M32-NEXT: negu $1, $1 +; M32-NEXT: sltiu $1, $1, 1 +; M32-NEXT: addiu $1, $1, -1 ; M32-NEXT: and $2, $1, $6 ; M32-NEXT: not $1, $1 ; M32-NEXT: and $1, $1, $7 @@ -334,8 +332,8 @@ define i32 @test_ctselect_inverted_condition(i32 %x, i32 %y, i32 %a, i32 %b) { ; M64-NEXT: sll $3, $7, 0 ; M64-NEXT: xor $1, $2, $1 ; M64-NEXT: sll $2, $6, 0 -; M64-NEXT: sltu $1, $zero, $1 -; M64-NEXT: negu $1, $1 +; M64-NEXT: sltiu $1, $1, 1 +; M64-NEXT: addiu $1, $1, -1 ; M64-NEXT: and $2, $1, $2 ; M64-NEXT: not $1, $1 ; M64-NEXT: and $1, $1, $3 @@ -353,7 +351,6 @@ define i32 @test_ctselect_chain(i1 %c1, i1 %c2, i1 %c3, i32 %a, i32 %b, i32 %c, ; M32: # %bb.0: ; M32-NEXT: andi $1, $4, 1 ; M32-NEXT: lw $3, 16($sp) -; M32-NEXT: andi $1, $1, 1 ; M32-NEXT: negu $1, $1 ; M32-NEXT: and $2, $1, $7 ; M32-NEXT: not $1, $1 @@ -361,15 +358,13 @@ define i32 @test_ctselect_chain(i1 %c1, i1 %c2, i1 %c3, i32 %a, i32 %b, i32 %c, ; M32-NEXT: lw $3, 20($sp) ; M32-NEXT: or $1, $2, $1 ; M32-NEXT: andi $2, $5, 1 -; M32-NEXT: andi $2, $2, 1 ; M32-NEXT: negu $2, $2 ; M32-NEXT: and $1, $2, $1 ; M32-NEXT: not $2, $2 ; M32-NEXT: and $2, $2, $3 -; M32-NEXT: andi $3, $6, 1 -; M32-NEXT: or $1, $1, $2 -; M32-NEXT: andi $2, $3, 1 ; M32-NEXT: lw $3, 24($sp) +; M32-NEXT: or $1, $1, $2 +; M32-NEXT: andi $2, $6, 1 ; M32-NEXT: negu $2, $2 ; M32-NEXT: and $1, $2, $1 ; M32-NEXT: not $2, $2 @@ -380,25 +375,22 @@ define i32 @test_ctselect_chain(i1 %c1, i1 %c2, i1 %c3, i32 %a, i32 %b, i32 %c, ; M64-LABEL: test_ctselect_chain: ; M64: # %bb.0: ; M64-NEXT: sll $1, $4, 0 -; M64-NEXT: sll $3, $5, 0 ; M64-NEXT: sll $2, $7, 0 -; M64-NEXT: sll $4, $8, 0 +; M64-NEXT: sll $3, $8, 0 ; M64-NEXT: andi $1, $1, 1 -; M64-NEXT: andi $3, $3, 1 -; 
M64-NEXT: andi $1, $1, 1 -; M64-NEXT: andi $3, $3, 1 ; M64-NEXT: negu $1, $1 -; M64-NEXT: negu $3, $3 ; M64-NEXT: and $2, $1, $2 ; M64-NEXT: not $1, $1 -; M64-NEXT: and $1, $1, $4 -; M64-NEXT: or $1, $2, $1 -; M64-NEXT: not $2, $3 -; M64-NEXT: and $1, $3, $1 +; M64-NEXT: and $1, $1, $3 ; M64-NEXT: sll $3, $9, 0 +; M64-NEXT: or $1, $2, $1 +; M64-NEXT: sll $2, $5, 0 +; M64-NEXT: andi $2, $2, 1 +; M64-NEXT: negu $2, $2 +; M64-NEXT: and $1, $2, $1 +; M64-NEXT: not $2, $2 ; M64-NEXT: and $2, $2, $3 ; M64-NEXT: sll $3, $6, 0 -; M64-NEXT: andi $3, $3, 1 ; M64-NEXT: or $1, $1, $2 ; M64-NEXT: andi $2, $3, 1 ; M64-NEXT: sll $3, $10, 0 @@ -418,16 +410,14 @@ define i32 @test_ctselect_chain(i1 %c1, i1 %c2, i1 %c3, i32 %a, i32 %b, i32 %c, define i64 @test_ctselect_i64_smin_zero(i64 %x) { ; M32-LABEL: test_ctselect_i64_smin_zero: ; M32: # %bb.0: -; M32-NEXT: slti $1, $5, 0 -; M32-NEXT: negu $1, $1 +; M32-NEXT: sra $1, $5, 31 ; M32-NEXT: and $2, $1, $4 ; M32-NEXT: jr $ra ; M32-NEXT: and $3, $1, $5 ; ; M64-LABEL: test_ctselect_i64_smin_zero: ; M64: # %bb.0: -; M64-NEXT: dsrl $1, $4, 63 -; M64-NEXT: dnegu $1, $1 +; M64-NEXT: dsra $1, $4, 63 ; M64-NEXT: jr $ra ; M64-NEXT: and $2, $1, $4 %cmp = icmp slt i64 %x, 0 diff --git a/llvm/test/CodeGen/Mips/ctselect-fallback.ll b/llvm/test/CodeGen/Mips/ctselect-fallback.ll index 61e2011f891b9..7f924253b4ccc 100644 --- a/llvm/test/CodeGen/Mips/ctselect-fallback.ll +++ b/llvm/test/CodeGen/Mips/ctselect-fallback.ll @@ -61,7 +61,6 @@ define i32 @test_ctselect_i32(i1 %cond, i32 %a, i32 %b) { ; M32-LABEL: test_ctselect_i32: ; M32: # %bb.0: ; M32-NEXT: andi $1, $4, 1 -; M32-NEXT: andi $1, $1, 1 ; M32-NEXT: negu $1, $1 ; M32-NEXT: and $2, $1, $5 ; M32-NEXT: not $1, $1 @@ -75,7 +74,6 @@ define i32 @test_ctselect_i32(i1 %cond, i32 %a, i32 %b) { ; M64-NEXT: sll $2, $5, 0 ; M64-NEXT: sll $3, $6, 0 ; M64-NEXT: andi $1, $1, 1 -; M64-NEXT: andi $1, $1, 1 ; M64-NEXT: negu $1, $1 ; M64-NEXT: and $2, $1, $2 ; M64-NEXT: not $1, $1 @@ -106,7 +104,6 @@ define i64 @test_ctselect_i64(i1 %cond, i64 %a, i64 %b) { ; M64: # %bb.0: ; M64-NEXT: andi $1, $4, 1 ; M64-NEXT: daddiu $3, $zero, -1 -; M64-NEXT: andi $1, $1, 1 ; M64-NEXT: dnegu $1, $1 ; M64-NEXT: and $2, $1, $5 ; M64-NEXT: xor $1, $1, $3 @@ -121,7 +118,6 @@ define ptr @test_ctselect_ptr(i1 %cond, ptr %a, ptr %b) { ; M32-LABEL: test_ctselect_ptr: ; M32: # %bb.0: ; M32-NEXT: andi $1, $4, 1 -; M32-NEXT: andi $1, $1, 1 ; M32-NEXT: negu $1, $1 ; M32-NEXT: and $2, $1, $5 ; M32-NEXT: not $1, $1 @@ -133,7 +129,6 @@ define ptr @test_ctselect_ptr(i1 %cond, ptr %a, ptr %b) { ; M64: # %bb.0: ; M64-NEXT: andi $1, $4, 1 ; M64-NEXT: daddiu $3, $zero, -1 -; M64-NEXT: andi $1, $1, 1 ; M64-NEXT: dnegu $1, $1 ; M64-NEXT: and $2, $1, $5 ; M64-NEXT: xor $1, $1, $3 @@ -178,8 +173,8 @@ define i32 @test_ctselect_icmp_eq(i32 %x, i32 %y, i32 %a, i32 %b) { ; M32-LABEL: test_ctselect_icmp_eq: ; M32: # %bb.0: ; M32-NEXT: xor $1, $4, $5 -; M32-NEXT: sltiu $1, $1, 1 -; M32-NEXT: negu $1, $1 +; M32-NEXT: sltu $1, $zero, $1 +; M32-NEXT: addiu $1, $1, -1 ; M32-NEXT: and $2, $1, $6 ; M32-NEXT: not $1, $1 ; M32-NEXT: and $1, $1, $7 @@ -193,8 +188,8 @@ define i32 @test_ctselect_icmp_eq(i32 %x, i32 %y, i32 %a, i32 %b) { ; M64-NEXT: sll $3, $7, 0 ; M64-NEXT: xor $1, $2, $1 ; M64-NEXT: sll $2, $6, 0 -; M64-NEXT: sltiu $1, $1, 1 -; M64-NEXT: negu $1, $1 +; M64-NEXT: sltu $1, $zero, $1 +; M64-NEXT: addiu $1, $1, -1 ; M64-NEXT: and $2, $1, $2 ; M64-NEXT: not $1, $1 ; M64-NEXT: and $1, $1, $3 @@ -209,8 +204,8 @@ define i32 @test_ctselect_icmp_ne(i32 %x, i32 %y, i32 %a, i32 %b) { ; 
M32-LABEL: test_ctselect_icmp_ne: ; M32: # %bb.0: ; M32-NEXT: xor $1, $4, $5 -; M32-NEXT: sltu $1, $zero, $1 -; M32-NEXT: negu $1, $1 +; M32-NEXT: sltiu $1, $1, 1 +; M32-NEXT: addiu $1, $1, -1 ; M32-NEXT: and $2, $1, $6 ; M32-NEXT: not $1, $1 ; M32-NEXT: and $1, $1, $7 @@ -224,8 +219,8 @@ define i32 @test_ctselect_icmp_ne(i32 %x, i32 %y, i32 %a, i32 %b) { ; M64-NEXT: sll $3, $7, 0 ; M64-NEXT: xor $1, $2, $1 ; M64-NEXT: sll $2, $6, 0 -; M64-NEXT: sltu $1, $zero, $1 -; M64-NEXT: negu $1, $1 +; M64-NEXT: sltiu $1, $1, 1 +; M64-NEXT: addiu $1, $1, -1 ; M64-NEXT: and $2, $1, $2 ; M64-NEXT: not $1, $1 ; M64-NEXT: and $1, $1, $3 @@ -240,7 +235,8 @@ define i32 @test_ctselect_icmp_slt(i32 %x, i32 %y, i32 %a, i32 %b) { ; M32-LABEL: test_ctselect_icmp_slt: ; M32: # %bb.0: ; M32-NEXT: slt $1, $4, $5 -; M32-NEXT: negu $1, $1 +; M32-NEXT: xori $1, $1, 1 +; M32-NEXT: addiu $1, $1, -1 ; M32-NEXT: and $2, $1, $6 ; M32-NEXT: not $1, $1 ; M32-NEXT: and $1, $1, $7 @@ -254,7 +250,8 @@ define i32 @test_ctselect_icmp_slt(i32 %x, i32 %y, i32 %a, i32 %b) { ; M64-NEXT: sll $3, $7, 0 ; M64-NEXT: slt $1, $2, $1 ; M64-NEXT: sll $2, $6, 0 -; M64-NEXT: negu $1, $1 +; M64-NEXT: xori $1, $1, 1 +; M64-NEXT: addiu $1, $1, -1 ; M64-NEXT: and $2, $1, $2 ; M64-NEXT: not $1, $1 ; M64-NEXT: and $1, $1, $3 @@ -269,7 +266,8 @@ define i32 @test_ctselect_icmp_ult(i32 %x, i32 %y, i32 %a, i32 %b) { ; M32-LABEL: test_ctselect_icmp_ult: ; M32: # %bb.0: ; M32-NEXT: sltu $1, $4, $5 -; M32-NEXT: negu $1, $1 +; M32-NEXT: xori $1, $1, 1 +; M32-NEXT: addiu $1, $1, -1 ; M32-NEXT: and $2, $1, $6 ; M32-NEXT: not $1, $1 ; M32-NEXT: and $1, $1, $7 @@ -283,7 +281,8 @@ define i32 @test_ctselect_icmp_ult(i32 %x, i32 %y, i32 %a, i32 %b) { ; M64-NEXT: sll $3, $7, 0 ; M64-NEXT: sltu $1, $2, $1 ; M64-NEXT: sll $2, $6, 0 -; M64-NEXT: negu $1, $1 +; M64-NEXT: xori $1, $1, 1 +; M64-NEXT: addiu $1, $1, -1 ; M64-NEXT: and $2, $1, $2 ; M64-NEXT: not $1, $1 ; M64-NEXT: and $1, $1, $3 @@ -301,7 +300,6 @@ define i32 @test_ctselect_load(i1 %cond, ptr %p1, ptr %p2) { ; M32-NEXT: andi $1, $4, 1 ; M32-NEXT: lw $2, 0($5) ; M32-NEXT: lw $3, 0($6) -; M32-NEXT: andi $1, $1, 1 ; M32-NEXT: negu $1, $1 ; M32-NEXT: and $2, $1, $2 ; M32-NEXT: not $1, $1 @@ -315,7 +313,6 @@ define i32 @test_ctselect_load(i1 %cond, ptr %p1, ptr %p2) { ; M64-NEXT: lw $2, 0($5) ; M64-NEXT: lw $3, 0($6) ; M64-NEXT: andi $1, $1, 1 -; M64-NEXT: andi $1, $1, 1 ; M64-NEXT: negu $1, $1 ; M64-NEXT: and $2, $1, $2 ; M64-NEXT: not $1, $1 @@ -334,14 +331,12 @@ define i32 @test_ctselect_nested(i1 %cond1, i1 %cond2, i32 %a, i32 %b, i32 %c) { ; M32: # %bb.0: ; M32-NEXT: andi $1, $5, 1 ; M32-NEXT: lw $3, 16($sp) -; M32-NEXT: andi $1, $1, 1 ; M32-NEXT: negu $1, $1 ; M32-NEXT: and $2, $1, $6 ; M32-NEXT: not $1, $1 ; M32-NEXT: and $1, $1, $7 ; M32-NEXT: or $1, $2, $1 ; M32-NEXT: andi $2, $4, 1 -; M32-NEXT: andi $2, $2, 1 ; M32-NEXT: negu $2, $2 ; M32-NEXT: and $1, $2, $1 ; M32-NEXT: not $2, $2 @@ -355,7 +350,6 @@ define i32 @test_ctselect_nested(i1 %cond1, i1 %cond2, i32 %a, i32 %b, i32 %c) { ; M64-NEXT: sll $2, $6, 0 ; M64-NEXT: sll $3, $7, 0 ; M64-NEXT: andi $1, $1, 1 -; M64-NEXT: andi $1, $1, 1 ; M64-NEXT: negu $1, $1 ; M64-NEXT: and $2, $1, $2 ; M64-NEXT: not $1, $1 @@ -364,7 +358,6 @@ define i32 @test_ctselect_nested(i1 %cond1, i1 %cond2, i32 %a, i32 %b, i32 %c) { ; M64-NEXT: or $1, $2, $1 ; M64-NEXT: sll $2, $4, 0 ; M64-NEXT: andi $2, $2, 1 -; M64-NEXT: andi $2, $2, 1 ; M64-NEXT: negu $2, $2 ; M64-NEXT: and $1, $2, $1 ; M64-NEXT: not $2, $2 @@ -381,7 +374,6 @@ define float @test_ctselect_f32(i1 %cond, float 
%a, float %b) { ; M32-LABEL: test_ctselect_f32: ; M32: # %bb.0: ; M32-NEXT: andi $1, $4, 1 -; M32-NEXT: andi $1, $1, 1 ; M32-NEXT: negu $1, $1 ; M32-NEXT: and $2, $1, $5 ; M32-NEXT: not $1, $1 @@ -396,7 +388,6 @@ define float @test_ctselect_f32(i1 %cond, float %a, float %b) { ; M64-NEXT: mfc1 $2, $f13 ; M64-NEXT: mfc1 $3, $f14 ; M64-NEXT: andi $1, $1, 1 -; M64-NEXT: andi $1, $1, 1 ; M64-NEXT: negu $1, $1 ; M64-NEXT: and $2, $1, $2 ; M64-NEXT: not $1, $1 @@ -441,7 +432,6 @@ define double @test_ctselect_f64(i1 %cond, double %a, double %b) { ; M64-NEXT: andi $1, $4, 1 ; M64-NEXT: dmfc1 $2, $f13 ; M64-NEXT: daddiu $3, $zero, -1 -; M64-NEXT: andi $1, $1, 1 ; M64-NEXT: dnegu $1, $1 ; M64-NEXT: and $2, $1, $2 ; M64-NEXT: xor $1, $1, $3 @@ -461,14 +451,12 @@ define float @test_ctselect_f32_chain(i1 %cond1, i1 %cond2, float %a, float %b, ; M32: # %bb.0: ; M32-NEXT: andi $1, $4, 1 ; M32-NEXT: lw $3, 16($sp) -; M32-NEXT: andi $1, $1, 1 ; M32-NEXT: negu $1, $1 ; M32-NEXT: and $2, $1, $6 ; M32-NEXT: not $1, $1 ; M32-NEXT: and $1, $1, $7 ; M32-NEXT: or $1, $2, $1 ; M32-NEXT: andi $2, $5, 1 -; M32-NEXT: andi $2, $2, 1 ; M32-NEXT: negu $2, $2 ; M32-NEXT: and $1, $2, $1 ; M32-NEXT: not $2, $2 @@ -483,7 +471,6 @@ define float @test_ctselect_f32_chain(i1 %cond1, i1 %cond2, float %a, float %b, ; M64-NEXT: mfc1 $2, $f14 ; M64-NEXT: mfc1 $3, $f15 ; M64-NEXT: andi $1, $1, 1 -; M64-NEXT: andi $1, $1, 1 ; M64-NEXT: negu $1, $1 ; M64-NEXT: and $2, $1, $2 ; M64-NEXT: not $1, $1 @@ -492,7 +479,6 @@ define float @test_ctselect_f32_chain(i1 %cond1, i1 %cond2, float %a, float %b, ; M64-NEXT: or $1, $2, $1 ; M64-NEXT: sll $2, $5, 0 ; M64-NEXT: andi $2, $2, 1 -; M64-NEXT: andi $2, $2, 1 ; M64-NEXT: negu $2, $2 ; M64-NEXT: and $1, $2, $1 ; M64-NEXT: not $2, $2 @@ -512,7 +498,6 @@ define float @test_ctselect_f32_load(i1 %cond, ptr %p1, ptr %p2) { ; M32-NEXT: andi $1, $4, 1 ; M32-NEXT: lw $2, 0($5) ; M32-NEXT: lw $3, 0($6) -; M32-NEXT: andi $1, $1, 1 ; M32-NEXT: negu $1, $1 ; M32-NEXT: and $2, $1, $2 ; M32-NEXT: not $1, $1 @@ -527,7 +512,6 @@ define float @test_ctselect_f32_load(i1 %cond, ptr %p1, ptr %p2) { ; M64-NEXT: lw $2, 0($5) ; M64-NEXT: lw $3, 0($6) ; M64-NEXT: andi $1, $1, 1 -; M64-NEXT: andi $1, $1, 1 ; M64-NEXT: negu $1, $1 ; M64-NEXT: and $2, $1, $2 ; M64-NEXT: not $1, $1 @@ -571,7 +555,6 @@ define double @test_ctselect_f64_load(i1 %cond, ptr %p1, ptr %p2) { ; M64-NEXT: andi $1, $4, 1 ; M64-NEXT: ld $2, 0($5) ; M64-NEXT: daddiu $3, $zero, -1 -; M64-NEXT: andi $1, $1, 1 ; M64-NEXT: dnegu $1, $1 ; M64-NEXT: and $2, $1, $2 ; M64-NEXT: xor $1, $1, $3 @@ -592,16 +575,15 @@ define float @test_ctselect_f32_arithmetic(i1 %cond, float %x, float %y) { ; M32: # %bb.0: ; M32-NEXT: mtc1 $6, $f0 ; M32-NEXT: mtc1 $5, $f1 -; M32-NEXT: andi $1, $4, 1 -; M32-NEXT: andi $1, $1, 1 -; M32-NEXT: add.s $f2, $f1, $f0 -; M32-NEXT: sub.s $f0, $f1, $f0 -; M32-NEXT: negu $1, $1 -; M32-NEXT: mfc1 $2, $f2 +; M32-NEXT: andi $2, $4, 1 +; M32-NEXT: negu $2, $2 +; M32-NEXT: sub.s $f2, $f1, $f0 +; M32-NEXT: add.s $f0, $f1, $f0 +; M32-NEXT: not $3, $2 +; M32-NEXT: mfc1 $1, $f2 +; M32-NEXT: and $1, $3, $1 ; M32-NEXT: mfc1 $3, $f0 -; M32-NEXT: and $2, $1, $2 -; M32-NEXT: not $1, $1 -; M32-NEXT: and $1, $1, $3 +; M32-NEXT: and $2, $2, $3 ; M32-NEXT: or $1, $2, $1 ; M32-NEXT: jr $ra ; M32-NEXT: mtc1 $1, $f0 @@ -611,7 +593,6 @@ define float @test_ctselect_f32_arithmetic(i1 %cond, float %x, float %y) { ; M64-NEXT: add.s $f0, $f13, $f14 ; M64-NEXT: sll $1, $4, 0 ; M64-NEXT: andi $1, $1, 1 -; M64-NEXT: andi $1, $1, 1 ; M64-NEXT: negu $1, $1 ; M64-NEXT: 
mfc1 $2, $f0 ; M64-NEXT: sub.s $f0, $f13, $f14 diff --git a/llvm/test/CodeGen/Mips/ctselect-side-effects.ll b/llvm/test/CodeGen/Mips/ctselect-side-effects.ll index 3ceffeeaaca05..f0f6a18ae73bf 100644 --- a/llvm/test/CodeGen/Mips/ctselect-side-effects.ll +++ b/llvm/test/CodeGen/Mips/ctselect-side-effects.ll @@ -39,7 +39,6 @@ define i32 @test_protected_no_branch(i1 %cond, i32 %a, i32 %b) { ; M32-LABEL: test_protected_no_branch: ; M32: # %bb.0: ; M32-NEXT: andi $1, $4, 1 -; M32-NEXT: andi $1, $1, 1 ; M32-NEXT: negu $1, $1 ; M32-NEXT: and $2, $1, $5 ; M32-NEXT: not $1, $1 @@ -53,7 +52,6 @@ define i32 @test_protected_no_branch(i1 %cond, i32 %a, i32 %b) { ; M64-NEXT: sll $2, $5, 0 ; M64-NEXT: sll $3, $6, 0 ; M64-NEXT: andi $1, $1, 1 -; M64-NEXT: andi $1, $1, 1 ; M64-NEXT: negu $1, $1 ; M64-NEXT: and $2, $1, $2 ; M64-NEXT: not $1, $1 @@ -174,9 +172,9 @@ define i32 @test_xor_with_const_operands() { ; M64: # %bb.0: ; M64-NEXT: jr $ra ; M64-NEXT: addiu $2, $zero, 0 - %a = xor i32 -1, -1 - %b = xor i32 0, 0 - %c = xor i32 42, 42 + %a = xor i32 -1, -1 + %b = xor i32 0, 0 + %c = xor i32 42, 42 %result = or i32 %a, %b %final = or i32 %result, %c ret i32 %final ; Should optimize to 0 diff --git a/llvm/test/CodeGen/RISCV/ctselect-fallback-edge-cases.ll b/llvm/test/CodeGen/RISCV/ctselect-fallback-edge-cases.ll index d590d48a4f3eb..323be38d3d865 100644 --- a/llvm/test/CodeGen/RISCV/ctselect-fallback-edge-cases.ll +++ b/llvm/test/CodeGen/RISCV/ctselect-fallback-edge-cases.ll @@ -6,7 +6,6 @@ define i1 @test_ctselect_i1(i1 %cond, i1 %a, i1 %b) { ; RV64-LABEL: test_ctselect_i1: ; RV64: # %bb.0: -; RV64-NEXT: neg a0, a0 ; RV64-NEXT: and a1, a0, a1 ; RV64-NEXT: xori a0, a0, 1 ; RV64-NEXT: and a0, a0, a2 @@ -15,7 +14,6 @@ define i1 @test_ctselect_i1(i1 %cond, i1 %a, i1 %b) { ; ; RV32-LABEL: test_ctselect_i1: ; RV32: # %bb.0: -; RV32-NEXT: neg a0, a0 ; RV32-NEXT: and a1, a0, a1 ; RV32-NEXT: xori a0, a0, 1 ; RV32-NEXT: and a0, a0, a2 @@ -41,9 +39,8 @@ define i32 @test_ctselect_extremal_values(i1 %cond) { ; ; RV32-LABEL: test_ctselect_extremal_values: ; RV32: # %bb.0: -; RV32-NEXT: andi a0, a0, 1 -; RV32-NEXT: lui a1, 524288 ; RV32-NEXT: slli a0, a0, 31 +; RV32-NEXT: lui a1, 524288 ; RV32-NEXT: srai a0, a0, 31 ; RV32-NEXT: not a2, a0 ; RV32-NEXT: slli a0, a0, 1 @@ -59,7 +56,6 @@ define i32 @test_ctselect_extremal_values(i1 %cond) { define ptr @test_ctselect_null_ptr(i1 %cond, ptr %ptr) { ; RV64-LABEL: test_ctselect_null_ptr: ; RV64: # %bb.0: -; RV64-NEXT: andi a0, a0, 1 ; RV64-NEXT: slli a0, a0, 63 ; RV64-NEXT: srai a0, a0, 63 ; RV64-NEXT: and a0, a0, a1 @@ -67,7 +63,6 @@ define ptr @test_ctselect_null_ptr(i1 %cond, ptr %ptr) { ; ; RV32-LABEL: test_ctselect_null_ptr: ; RV32: # %bb.0: -; RV32-NEXT: andi a0, a0, 1 ; RV32-NEXT: slli a0, a0, 31 ; RV32-NEXT: srai a0, a0, 31 ; RV32-NEXT: and a0, a0, a1 @@ -80,7 +75,6 @@ define ptr @test_ctselect_null_ptr(i1 %cond, ptr %ptr) { define ptr @test_ctselect_function_ptr(i1 %cond, ptr %func1, ptr %func2) { ; RV64-LABEL: test_ctselect_function_ptr: ; RV64: # %bb.0: -; RV64-NEXT: andi a0, a0, 1 ; RV64-NEXT: slli a0, a0, 63 ; RV64-NEXT: srai a0, a0, 63 ; RV64-NEXT: and a1, a0, a1 @@ -91,7 +85,6 @@ define ptr @test_ctselect_function_ptr(i1 %cond, ptr %func1, ptr %func2) { ; ; RV32-LABEL: test_ctselect_function_ptr: ; RV32: # %bb.0: -; RV32-NEXT: andi a0, a0, 1 ; RV32-NEXT: slli a0, a0, 31 ; RV32-NEXT: srai a0, a0, 31 ; RV32-NEXT: and a1, a0, a1 @@ -137,7 +130,6 @@ define ptr @test_ctselect_ptr_cmp(ptr %p1, ptr %p2, ptr %a, ptr %b) { define ptr @test_ctselect_struct_ptr(i1 %cond, ptr 
%a, ptr %b) { ; RV64-LABEL: test_ctselect_struct_ptr: ; RV64: # %bb.0: -; RV64-NEXT: andi a0, a0, 1 ; RV64-NEXT: slli a0, a0, 63 ; RV64-NEXT: srai a0, a0, 63 ; RV64-NEXT: and a1, a0, a1 @@ -148,7 +140,6 @@ define ptr @test_ctselect_struct_ptr(i1 %cond, ptr %a, ptr %b) { ; ; RV32-LABEL: test_ctselect_struct_ptr: ; RV32: # %bb.0: -; RV32-NEXT: andi a0, a0, 1 ; RV32-NEXT: slli a0, a0, 31 ; RV32-NEXT: srai a0, a0, 31 ; RV32-NEXT: and a1, a0, a1 @@ -194,10 +185,6 @@ define i32 @test_ctselect_deeply_nested(i1 %c1, i1 %c2, i1 %c3, i1 %c4, i32 %a, ; RV32-LABEL: test_ctselect_deeply_nested: ; RV32: # %bb.0: ; RV32-NEXT: lw t0, 0(sp) -; RV32-NEXT: andi a0, a0, 1 -; RV32-NEXT: andi a1, a1, 1 -; RV32-NEXT: andi a2, a2, 1 -; RV32-NEXT: andi a3, a3, 1 ; RV32-NEXT: slli a0, a0, 31 ; RV32-NEXT: slli a1, a1, 31 ; RV32-NEXT: slli a2, a2, 31 diff --git a/llvm/test/CodeGen/RISCV/ctselect-fallback-nonintegral-fail.ll b/llvm/test/CodeGen/RISCV/ctselect-fallback-nonintegral-fail.ll deleted file mode 100644 index cbfc3c7d0e399..0000000000000 --- a/llvm/test/CodeGen/RISCV/ctselect-fallback-nonintegral-fail.ll +++ /dev/null @@ -1,18 +0,0 @@ -; XFAIL: * -; RUN: llc -mtriple=riscv32 < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 < %s | FileCheck %s - -; This test should fail -target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128-ni:200" -target triple = "riscv64-unknown-linux-gnu" - -define i32 addrspace(200)* @test_ctselect_ptr(i1 %c, - i32 addrspace(200)* %a, - i32 addrspace(200)* %b) { - %r = call i32 addrspace(200)* @llvm.ct.select.p0(i1 %c, - i32 addrspace(200)* %a, - i32 addrspace(200)* %b) - ret i32 addrspace(200)* %r -} - -declare i32 @llvm.ct.select.p0(i1, i32, i32) diff --git a/llvm/test/CodeGen/RISCV/ctselect-fallback-patterns.ll b/llvm/test/CodeGen/RISCV/ctselect-fallback-patterns.ll index 1dbade44cc1f4..c21404dbc6317 100644 --- a/llvm/test/CodeGen/RISCV/ctselect-fallback-patterns.ll +++ b/llvm/test/CodeGen/RISCV/ctselect-fallback-patterns.ll @@ -157,14 +157,12 @@ define i32 @test_ctselect_umax_generic(i32 %x, i32 %y) { define i32 @test_ctselect_abs(i32 %x) { ; RV64-LABEL: test_ctselect_abs: ; RV64: # %bb.0: -; RV64-NEXT: sext.w a1, a0 -; RV64-NEXT: negw a2, a0 -; RV64-NEXT: slti a1, a1, 0 -; RV64-NEXT: negw a3, a1 -; RV64-NEXT: addi a1, a1, -1 -; RV64-NEXT: and a2, a3, a2 -; RV64-NEXT: and a0, a1, a0 -; RV64-NEXT: or a0, a2, a0 +; RV64-NEXT: negw a1, a0 +; RV64-NEXT: sraiw a2, a0, 31 +; RV64-NEXT: and a1, a2, a1 +; RV64-NEXT: not a2, a2 +; RV64-NEXT: and a0, a2, a0 +; RV64-NEXT: or a0, a1, a0 ; RV64-NEXT: ret ; ; RV32-LABEL: test_ctselect_abs: @@ -186,14 +184,12 @@ define i32 @test_ctselect_abs(i32 %x) { define i32 @test_ctselect_nabs(i32 %x) { ; RV64-LABEL: test_ctselect_nabs: ; RV64: # %bb.0: -; RV64-NEXT: sext.w a1, a0 -; RV64-NEXT: negw a2, a0 -; RV64-NEXT: slti a1, a1, 0 -; RV64-NEXT: addi a3, a1, -1 -; RV64-NEXT: neg a1, a1 -; RV64-NEXT: and a2, a3, a2 -; RV64-NEXT: and a0, a1, a0 -; RV64-NEXT: or a0, a0, a2 +; RV64-NEXT: negw a1, a0 +; RV64-NEXT: sraiw a2, a0, 31 +; RV64-NEXT: and a0, a2, a0 +; RV64-NEXT: not a2, a2 +; RV64-NEXT: and a1, a2, a1 +; RV64-NEXT: or a0, a0, a1 ; RV64-NEXT: ret ; ; RV32-LABEL: test_ctselect_nabs: @@ -282,7 +278,6 @@ define i32 @test_ctselect_identical_operands(i1 %cond, i32 %x) { ; ; RV32-LABEL: test_ctselect_identical_operands: ; RV32: # %bb.0: -; RV32-NEXT: andi a0, a0, 1 ; RV32-NEXT: slli a0, a0, 31 ; RV32-NEXT: srai a0, a0, 31 ; RV32-NEXT: and a2, a0, a1 @@ -301,9 +296,9 @@ define i32 @test_ctselect_inverted_condition(i32 %x, i32 %y, i32 %a, i32 %b) { ; 
RV64-NEXT: sext.w a1, a1 ; RV64-NEXT: sext.w a0, a0 ; RV64-NEXT: xor a0, a0, a1 -; RV64-NEXT: snez a0, a0 -; RV64-NEXT: addi a1, a0, -1 -; RV64-NEXT: neg a0, a0 +; RV64-NEXT: seqz a0, a0 +; RV64-NEXT: neg a1, a0 +; RV64-NEXT: addi a0, a0, -1 ; RV64-NEXT: and a1, a1, a3 ; RV64-NEXT: and a0, a0, a2 ; RV64-NEXT: or a0, a0, a1 @@ -351,9 +346,6 @@ define i32 @test_ctselect_chain(i1 %c1, i1 %c2, i1 %c3, i32 %a, i32 %b, i32 %c, ; ; RV32-LABEL: test_ctselect_chain: ; RV32: # %bb.0: -; RV32-NEXT: andi a0, a0, 1 -; RV32-NEXT: andi a1, a1, 1 -; RV32-NEXT: andi a2, a2, 1 ; RV32-NEXT: slli a0, a0, 31 ; RV32-NEXT: slli a1, a1, 31 ; RV32-NEXT: slli a2, a2, 31 diff --git a/llvm/test/CodeGen/RISCV/ctselect-fallback.ll b/llvm/test/CodeGen/RISCV/ctselect-fallback.ll index 17fcd6f9371ea..4f25ea0a2d4c1 100644 --- a/llvm/test/CodeGen/RISCV/ctselect-fallback.ll +++ b/llvm/test/CodeGen/RISCV/ctselect-fallback.ll @@ -64,7 +64,6 @@ define i32 @test_ctselect_i32(i1 %cond, i32 %a, i32 %b) { ; ; RV32-LABEL: test_ctselect_i32: ; RV32: # %bb.0: -; RV32-NEXT: andi a0, a0, 1 ; RV32-NEXT: slli a0, a0, 31 ; RV32-NEXT: srai a0, a0, 31 ; RV32-NEXT: and a1, a0, a1 @@ -79,7 +78,6 @@ define i32 @test_ctselect_i32(i1 %cond, i32 %a, i32 %b) { define i64 @test_ctselect_i64(i1 %cond, i64 %a, i64 %b) { ; RV64-LABEL: test_ctselect_i64: ; RV64: # %bb.0: -; RV64-NEXT: andi a0, a0, 1 ; RV64-NEXT: slli a0, a0, 63 ; RV64-NEXT: srai a0, a0, 63 ; RV64-NEXT: and a1, a0, a1 @@ -107,7 +105,6 @@ define i64 @test_ctselect_i64(i1 %cond, i64 %a, i64 %b) { define ptr @test_ctselect_ptr(i1 %cond, ptr %a, ptr %b) { ; RV64-LABEL: test_ctselect_ptr: ; RV64: # %bb.0: -; RV64-NEXT: andi a0, a0, 1 ; RV64-NEXT: slli a0, a0, 63 ; RV64-NEXT: srai a0, a0, 63 ; RV64-NEXT: and a1, a0, a1 @@ -118,7 +115,6 @@ define ptr @test_ctselect_ptr(i1 %cond, ptr %a, ptr %b) { ; ; RV32-LABEL: test_ctselect_ptr: ; RV32: # %bb.0: -; RV32-NEXT: andi a0, a0, 1 ; RV32-NEXT: slli a0, a0, 31 ; RV32-NEXT: srai a0, a0, 31 ; RV32-NEXT: and a1, a0, a1 @@ -164,9 +160,9 @@ define i32 @test_ctselect_icmp_eq(i32 %x, i32 %y, i32 %a, i32 %b) { ; RV64-NEXT: sext.w a1, a1 ; RV64-NEXT: sext.w a0, a0 ; RV64-NEXT: xor a0, a0, a1 -; RV64-NEXT: seqz a0, a0 -; RV64-NEXT: addi a1, a0, -1 -; RV64-NEXT: neg a0, a0 +; RV64-NEXT: snez a0, a0 +; RV64-NEXT: neg a1, a0 +; RV64-NEXT: addi a0, a0, -1 ; RV64-NEXT: and a1, a1, a3 ; RV64-NEXT: and a0, a0, a2 ; RV64-NEXT: or a0, a0, a1 @@ -193,9 +189,9 @@ define i32 @test_ctselect_icmp_ne(i32 %x, i32 %y, i32 %a, i32 %b) { ; RV64-NEXT: sext.w a1, a1 ; RV64-NEXT: sext.w a0, a0 ; RV64-NEXT: xor a0, a0, a1 -; RV64-NEXT: snez a0, a0 -; RV64-NEXT: addi a1, a0, -1 -; RV64-NEXT: neg a0, a0 +; RV64-NEXT: seqz a0, a0 +; RV64-NEXT: neg a1, a0 +; RV64-NEXT: addi a0, a0, -1 ; RV64-NEXT: and a1, a1, a3 ; RV64-NEXT: and a0, a0, a2 ; RV64-NEXT: or a0, a0, a1 @@ -288,7 +284,6 @@ define i32 @test_ctselect_load(i1 %cond, ptr %p1, ptr %p2) { ; RV32: # %bb.0: ; RV32-NEXT: lw a1, 0(a1) ; RV32-NEXT: lw a2, 0(a2) -; RV32-NEXT: andi a0, a0, 1 ; RV32-NEXT: slli a0, a0, 31 ; RV32-NEXT: srai a0, a0, 31 ; RV32-NEXT: and a1, a0, a1 @@ -322,8 +317,6 @@ define i32 @test_ctselect_nested(i1 %cond1, i1 %cond2, i32 %a, i32 %b, i32 %c) { ; ; RV32-LABEL: test_ctselect_nested: ; RV32: # %bb.0: -; RV32-NEXT: andi a1, a1, 1 -; RV32-NEXT: andi a0, a0, 1 ; RV32-NEXT: slli a1, a1, 31 ; RV32-NEXT: slli a0, a0, 31 ; RV32-NEXT: srai a1, a1, 31 @@ -356,7 +349,6 @@ define float @test_ctselect_f32(i1 %cond, float %a, float %b) { ; ; RV32-LABEL: test_ctselect_f32: ; RV32: # %bb.0: -; RV32-NEXT: andi a0, a0, 1 ; 
RV32-NEXT: slli a0, a0, 31 ; RV32-NEXT: srai a0, a0, 31 ; RV32-NEXT: and a1, a0, a1 @@ -372,7 +364,6 @@ define float @test_ctselect_f32(i1 %cond, float %a, float %b) { define double @test_ctselect_f64(i1 %cond, double %a, double %b) { ; RV64-LABEL: test_ctselect_f64: ; RV64: # %bb.0: -; RV64-NEXT: andi a0, a0, 1 ; RV64-NEXT: slli a0, a0, 63 ; RV64-NEXT: srai a0, a0, 63 ; RV64-NEXT: and a1, a0, a1 @@ -418,8 +409,6 @@ define float @test_ctselect_f32_chain(i1 %cond1, i1 %cond2, float %a, float %b, ; ; RV32-LABEL: test_ctselect_f32_chain: ; RV32: # %bb.0: -; RV32-NEXT: andi a0, a0, 1 -; RV32-NEXT: andi a1, a1, 1 ; RV32-NEXT: slli a0, a0, 31 ; RV32-NEXT: slli a1, a1, 31 ; RV32-NEXT: srai a0, a0, 31 @@ -456,7 +445,6 @@ define float @test_ctselect_f32_load(i1 %cond, ptr %p1, ptr %p2) { ; RV32: # %bb.0: ; RV32-NEXT: lw a1, 0(a1) ; RV32-NEXT: lw a2, 0(a2) -; RV32-NEXT: andi a0, a0, 1 ; RV32-NEXT: slli a0, a0, 31 ; RV32-NEXT: srai a0, a0, 31 ; RV32-NEXT: and a1, a0, a1 @@ -476,7 +464,6 @@ define double @test_ctselect_f64_load(i1 %cond, ptr %p1, ptr %p2) { ; RV64: # %bb.0: ; RV64-NEXT: ld a1, 0(a1) ; RV64-NEXT: ld a2, 0(a2) -; RV64-NEXT: andi a0, a0, 1 ; RV64-NEXT: slli a0, a0, 63 ; RV64-NEXT: srai a0, a0, 63 ; RV64-NEXT: and a1, a0, a1 @@ -577,9 +564,8 @@ define float @test_ctselect_f32_arithmetic(i1 %cond, float %x, float %y) { ; RV32-NEXT: mv a0, s1 ; RV32-NEXT: mv a1, s0 ; RV32-NEXT: call __subsf3 -; RV32-NEXT: andi a1, s2, 1 -; RV32-NEXT: slli a1, a1, 31 -; RV32-NEXT: srai a1, a1, 31 +; RV32-NEXT: slli s2, s2, 31 +; RV32-NEXT: srai a1, s2, 31 ; RV32-NEXT: and a2, a1, s3 ; RV32-NEXT: not a1, a1 ; RV32-NEXT: and a0, a1, a0 diff --git a/llvm/test/CodeGen/RISCV/ctselect-side-effects.ll b/llvm/test/CodeGen/RISCV/ctselect-side-effects.ll index a37a57578523f..e6a48a914bd0f 100644 --- a/llvm/test/CodeGen/RISCV/ctselect-side-effects.ll +++ b/llvm/test/CodeGen/RISCV/ctselect-side-effects.ll @@ -46,7 +46,6 @@ define i32 @test_protected_no_branch(i1 %cond, i32 %a, i32 %b) { ; ; RV32-LABEL: test_protected_no_branch: ; RV32: # %bb.0: -; RV32-NEXT: andi a0, a0, 1 ; RV32-NEXT: slli a0, a0, 31 ; RV32-NEXT: srai a0, a0, 31 ; RV32-NEXT: and a1, a0, a1 diff --git a/llvm/test/CodeGen/WebAssembly/ctselect-fallback-edge-cases.ll b/llvm/test/CodeGen/WebAssembly/ctselect-fallback-edge-cases.ll index 18fcb8ac243d4..ce3099e593282 100644 --- a/llvm/test/CodeGen/WebAssembly/ctselect-fallback-edge-cases.ll +++ b/llvm/test/CodeGen/WebAssembly/ctselect-fallback-edge-cases.ll @@ -7,10 +7,7 @@ define i1 @test_ctselect_i1(i1 %cond, i1 %a, i1 %b) { ; W32-LABEL: test_ctselect_i1: ; W32: .functype test_ctselect_i1 (i32, i32, i32) -> (i32) ; W32-NEXT: # %bb.0: -; W32-NEXT: i32.const 0 ; W32-NEXT: local.get 0 -; W32-NEXT: i32.sub -; W32-NEXT: local.tee 0 ; W32-NEXT: local.get 1 ; W32-NEXT: i32.and ; W32-NEXT: local.get 0 @@ -24,10 +21,7 @@ define i1 @test_ctselect_i1(i1 %cond, i1 %a, i1 %b) { ; W64-LABEL: test_ctselect_i1: ; W64: .functype test_ctselect_i1 (i32, i32, i32) -> (i32) ; W64-NEXT: # %bb.0: -; W64-NEXT: i32.const 0 ; W64-NEXT: local.get 0 -; W64-NEXT: i32.sub -; W64-NEXT: local.tee 0 ; W64-NEXT: local.get 1 ; W64-NEXT: i32.and ; W64-NEXT: local.get 0 @@ -50,8 +44,6 @@ define i32 @test_ctselect_extremal_values(i1 %cond) { ; W32-NEXT: local.get 0 ; W32-NEXT: i32.const 1 ; W32-NEXT: i32.and -; W32-NEXT: i32.const 1 -; W32-NEXT: i32.and ; W32-NEXT: i32.sub ; W32-NEXT: local.tee 0 ; W32-NEXT: i32.const 2147483647 @@ -71,8 +63,6 @@ define i32 @test_ctselect_extremal_values(i1 %cond) { ; W64-NEXT: local.get 0 ; W64-NEXT: i32.const 
1 ; W64-NEXT: i32.and -; W64-NEXT: i32.const 1 -; W64-NEXT: i32.and ; W64-NEXT: i32.sub ; W64-NEXT: local.tee 0 ; W64-NEXT: i32.const 2147483647 @@ -97,8 +87,6 @@ define ptr @test_ctselect_null_ptr(i1 %cond, ptr %ptr) { ; W32-NEXT: local.get 0 ; W32-NEXT: i32.const 1 ; W32-NEXT: i32.and -; W32-NEXT: i32.const 1 -; W32-NEXT: i32.and ; W32-NEXT: i32.sub ; W32-NEXT: local.get 1 ; W32-NEXT: i32.and @@ -112,8 +100,6 @@ define ptr @test_ctselect_null_ptr(i1 %cond, ptr %ptr) { ; W64-NEXT: i64.extend_i32_u ; W64-NEXT: i64.const 1 ; W64-NEXT: i64.and -; W64-NEXT: i64.const 1 -; W64-NEXT: i64.and ; W64-NEXT: i64.sub ; W64-NEXT: local.get 1 ; W64-NEXT: i64.and @@ -131,8 +117,6 @@ define ptr @test_ctselect_function_ptr(i1 %cond, ptr %func1, ptr %func2) { ; W32-NEXT: local.get 0 ; W32-NEXT: i32.const 1 ; W32-NEXT: i32.and -; W32-NEXT: i32.const 1 -; W32-NEXT: i32.and ; W32-NEXT: i32.sub ; W32-NEXT: local.tee 0 ; W32-NEXT: local.get 1 @@ -154,8 +138,6 @@ define ptr @test_ctselect_function_ptr(i1 %cond, ptr %func1, ptr %func2) { ; W64-NEXT: i64.extend_i32_u ; W64-NEXT: i64.const 1 ; W64-NEXT: i64.and -; W64-NEXT: i64.const 1 -; W64-NEXT: i64.and ; W64-NEXT: i64.sub ; W64-NEXT: local.tee 3 ; W64-NEXT: local.get 1 @@ -176,11 +158,12 @@ define ptr @test_ctselect_ptr_cmp(ptr %p1, ptr %p2, ptr %a, ptr %b) { ; W32-LABEL: test_ctselect_ptr_cmp: ; W32: .functype test_ctselect_ptr_cmp (i32, i32, i32, i32) -> (i32) ; W32-NEXT: # %bb.0: +; W32-NEXT: i32.const -1 ; W32-NEXT: i32.const 0 ; W32-NEXT: local.get 0 ; W32-NEXT: local.get 1 ; W32-NEXT: i32.eq -; W32-NEXT: i32.sub +; W32-NEXT: i32.select ; W32-NEXT: local.tee 1 ; W32-NEXT: local.get 2 ; W32-NEXT: i32.and @@ -195,14 +178,12 @@ define ptr @test_ctselect_ptr_cmp(ptr %p1, ptr %p2, ptr %a, ptr %b) { ; W64-LABEL: test_ctselect_ptr_cmp: ; W64: .functype test_ctselect_ptr_cmp (i64, i64, i64, i64) -> (i64) ; W64-NEXT: # %bb.0: +; W64-NEXT: i64.const -1 ; W64-NEXT: i64.const 0 ; W64-NEXT: local.get 0 ; W64-NEXT: local.get 1 ; W64-NEXT: i64.eq -; W64-NEXT: i64.extend_i32_u -; W64-NEXT: i64.const 1 -; W64-NEXT: i64.and -; W64-NEXT: i64.sub +; W64-NEXT: i64.select ; W64-NEXT: local.tee 1 ; W64-NEXT: local.get 2 ; W64-NEXT: i64.and @@ -229,8 +210,6 @@ define ptr @test_ctselect_struct_ptr(i1 %cond, ptr %a, ptr %b) { ; W32-NEXT: local.get 0 ; W32-NEXT: i32.const 1 ; W32-NEXT: i32.and -; W32-NEXT: i32.const 1 -; W32-NEXT: i32.and ; W32-NEXT: i32.sub ; W32-NEXT: local.tee 0 ; W32-NEXT: local.get 1 @@ -252,8 +231,6 @@ define ptr @test_ctselect_struct_ptr(i1 %cond, ptr %a, ptr %b) { ; W64-NEXT: i64.extend_i32_u ; W64-NEXT: i64.const 1 ; W64-NEXT: i64.and -; W64-NEXT: i64.const 1 -; W64-NEXT: i64.and ; W64-NEXT: i64.sub ; W64-NEXT: local.tee 3 ; W64-NEXT: local.get 1 @@ -278,32 +255,24 @@ define i32 @test_ctselect_deeply_nested(i1 %c1, i1 %c2, i1 %c3, i1 %c4, i32 %a, ; W32-NEXT: local.get 3 ; W32-NEXT: i32.const 1 ; W32-NEXT: i32.and -; W32-NEXT: i32.const 1 -; W32-NEXT: i32.and ; W32-NEXT: i32.sub ; W32-NEXT: local.tee 3 ; W32-NEXT: i32.const 0 ; W32-NEXT: local.get 2 ; W32-NEXT: i32.const 1 ; W32-NEXT: i32.and -; W32-NEXT: i32.const 1 -; W32-NEXT: i32.and ; W32-NEXT: i32.sub ; W32-NEXT: local.tee 2 ; W32-NEXT: i32.const 0 ; W32-NEXT: local.get 1 ; W32-NEXT: i32.const 1 ; W32-NEXT: i32.and -; W32-NEXT: i32.const 1 -; W32-NEXT: i32.and ; W32-NEXT: i32.sub ; W32-NEXT: local.tee 1 ; W32-NEXT: i32.const 0 ; W32-NEXT: local.get 0 ; W32-NEXT: i32.const 1 ; W32-NEXT: i32.and -; W32-NEXT: i32.const 1 -; W32-NEXT: i32.and ; W32-NEXT: i32.sub ; W32-NEXT: local.tee 0 ; W32-NEXT: 
local.get 4 @@ -344,32 +313,24 @@ define i32 @test_ctselect_deeply_nested(i1 %c1, i1 %c2, i1 %c3, i1 %c4, i32 %a, ; W64-NEXT: local.get 3 ; W64-NEXT: i32.const 1 ; W64-NEXT: i32.and -; W64-NEXT: i32.const 1 -; W64-NEXT: i32.and ; W64-NEXT: i32.sub ; W64-NEXT: local.tee 3 ; W64-NEXT: i32.const 0 ; W64-NEXT: local.get 2 ; W64-NEXT: i32.const 1 ; W64-NEXT: i32.and -; W64-NEXT: i32.const 1 -; W64-NEXT: i32.and ; W64-NEXT: i32.sub ; W64-NEXT: local.tee 2 ; W64-NEXT: i32.const 0 ; W64-NEXT: local.get 1 ; W64-NEXT: i32.const 1 ; W64-NEXT: i32.and -; W64-NEXT: i32.const 1 -; W64-NEXT: i32.and ; W64-NEXT: i32.sub ; W64-NEXT: local.tee 1 ; W64-NEXT: i32.const 0 ; W64-NEXT: local.get 0 ; W64-NEXT: i32.const 1 ; W64-NEXT: i32.and -; W64-NEXT: i32.const 1 -; W64-NEXT: i32.and ; W64-NEXT: i32.sub ; W64-NEXT: local.tee 0 ; W64-NEXT: local.get 4 diff --git a/llvm/test/CodeGen/WebAssembly/ctselect-fallback-patterns.ll b/llvm/test/CodeGen/WebAssembly/ctselect-fallback-patterns.ll index 3d6035b64430e..85acc50210b4f 100644 --- a/llvm/test/CodeGen/WebAssembly/ctselect-fallback-patterns.ll +++ b/llvm/test/CodeGen/WebAssembly/ctselect-fallback-patterns.ll @@ -7,11 +7,9 @@ define i32 @test_ctselect_smin_zero(i32 %x) { ; W32-LABEL: test_ctselect_smin_zero: ; W32: .functype test_ctselect_smin_zero (i32) -> (i32) ; W32-NEXT: # %bb.0: -; W32-NEXT: i32.const 0 ; W32-NEXT: local.get 0 -; W32-NEXT: i32.const 0 -; W32-NEXT: i32.lt_s -; W32-NEXT: i32.sub +; W32-NEXT: i32.const 31 +; W32-NEXT: i32.shr_s ; W32-NEXT: local.get 0 ; W32-NEXT: i32.and ; W32-NEXT: # fallthrough-return @@ -19,11 +17,9 @@ define i32 @test_ctselect_smin_zero(i32 %x) { ; W64-LABEL: test_ctselect_smin_zero: ; W64: .functype test_ctselect_smin_zero (i32) -> (i32) ; W64-NEXT: # %bb.0: -; W64-NEXT: i32.const 0 ; W64-NEXT: local.get 0 -; W64-NEXT: i32.const 0 -; W64-NEXT: i32.lt_s -; W64-NEXT: i32.sub +; W64-NEXT: i32.const 31 +; W64-NEXT: i32.shr_s ; W64-NEXT: local.get 0 ; W64-NEXT: i32.and ; W64-NEXT: # fallthrough-return @@ -37,11 +33,12 @@ define i32 @test_ctselect_smax_zero(i32 %x) { ; W32-LABEL: test_ctselect_smax_zero: ; W32: .functype test_ctselect_smax_zero (i32) -> (i32) ; W32-NEXT: # %bb.0: +; W32-NEXT: i32.const -1 ; W32-NEXT: i32.const 0 ; W32-NEXT: local.get 0 ; W32-NEXT: i32.const 0 ; W32-NEXT: i32.gt_s -; W32-NEXT: i32.sub +; W32-NEXT: i32.select ; W32-NEXT: local.get 0 ; W32-NEXT: i32.and ; W32-NEXT: # fallthrough-return @@ -49,11 +46,12 @@ define i32 @test_ctselect_smax_zero(i32 %x) { ; W64-LABEL: test_ctselect_smax_zero: ; W64: .functype test_ctselect_smax_zero (i32) -> (i32) ; W64-NEXT: # %bb.0: +; W64-NEXT: i32.const -1 ; W64-NEXT: i32.const 0 ; W64-NEXT: local.get 0 ; W64-NEXT: i32.const 0 ; W64-NEXT: i32.gt_s -; W64-NEXT: i32.sub +; W64-NEXT: i32.select ; W64-NEXT: local.get 0 ; W64-NEXT: i32.and ; W64-NEXT: # fallthrough-return @@ -68,11 +66,12 @@ define i32 @test_ctselect_smin_generic(i32 %x, i32 %y) { ; W32: .functype test_ctselect_smin_generic (i32, i32) -> (i32) ; W32-NEXT: .local i32 ; W32-NEXT: # %bb.0: +; W32-NEXT: i32.const -1 ; W32-NEXT: i32.const 0 ; W32-NEXT: local.get 0 ; W32-NEXT: local.get 1 ; W32-NEXT: i32.lt_s -; W32-NEXT: i32.sub +; W32-NEXT: i32.select ; W32-NEXT: local.tee 2 ; W32-NEXT: local.get 0 ; W32-NEXT: i32.and @@ -88,11 +87,12 @@ define i32 @test_ctselect_smin_generic(i32 %x, i32 %y) { ; W64: .functype test_ctselect_smin_generic (i32, i32) -> (i32) ; W64-NEXT: .local i32 ; W64-NEXT: # %bb.0: +; W64-NEXT: i32.const -1 ; W64-NEXT: i32.const 0 ; W64-NEXT: local.get 0 ; W64-NEXT: local.get 1 ; W64-NEXT: 
i32.lt_s -; W64-NEXT: i32.sub +; W64-NEXT: i32.select ; W64-NEXT: local.tee 2 ; W64-NEXT: local.get 0 ; W64-NEXT: i32.and @@ -114,11 +114,12 @@ define i32 @test_ctselect_smax_generic(i32 %x, i32 %y) { ; W32: .functype test_ctselect_smax_generic (i32, i32) -> (i32) ; W32-NEXT: .local i32 ; W32-NEXT: # %bb.0: +; W32-NEXT: i32.const -1 ; W32-NEXT: i32.const 0 ; W32-NEXT: local.get 0 ; W32-NEXT: local.get 1 ; W32-NEXT: i32.gt_s -; W32-NEXT: i32.sub +; W32-NEXT: i32.select ; W32-NEXT: local.tee 2 ; W32-NEXT: local.get 0 ; W32-NEXT: i32.and @@ -134,11 +135,12 @@ define i32 @test_ctselect_smax_generic(i32 %x, i32 %y) { ; W64: .functype test_ctselect_smax_generic (i32, i32) -> (i32) ; W64-NEXT: .local i32 ; W64-NEXT: # %bb.0: +; W64-NEXT: i32.const -1 ; W64-NEXT: i32.const 0 ; W64-NEXT: local.get 0 ; W64-NEXT: local.get 1 ; W64-NEXT: i32.gt_s -; W64-NEXT: i32.sub +; W64-NEXT: i32.select ; W64-NEXT: local.tee 2 ; W64-NEXT: local.get 0 ; W64-NEXT: i32.and @@ -160,11 +162,12 @@ define i32 @test_ctselect_umin_generic(i32 %x, i32 %y) { ; W32: .functype test_ctselect_umin_generic (i32, i32) -> (i32) ; W32-NEXT: .local i32 ; W32-NEXT: # %bb.0: +; W32-NEXT: i32.const -1 ; W32-NEXT: i32.const 0 ; W32-NEXT: local.get 0 ; W32-NEXT: local.get 1 ; W32-NEXT: i32.lt_u -; W32-NEXT: i32.sub +; W32-NEXT: i32.select ; W32-NEXT: local.tee 2 ; W32-NEXT: local.get 0 ; W32-NEXT: i32.and @@ -180,11 +183,12 @@ define i32 @test_ctselect_umin_generic(i32 %x, i32 %y) { ; W64: .functype test_ctselect_umin_generic (i32, i32) -> (i32) ; W64-NEXT: .local i32 ; W64-NEXT: # %bb.0: +; W64-NEXT: i32.const -1 ; W64-NEXT: i32.const 0 ; W64-NEXT: local.get 0 ; W64-NEXT: local.get 1 ; W64-NEXT: i32.lt_u -; W64-NEXT: i32.sub +; W64-NEXT: i32.select ; W64-NEXT: local.tee 2 ; W64-NEXT: local.get 0 ; W64-NEXT: i32.and @@ -206,11 +210,12 @@ define i32 @test_ctselect_umax_generic(i32 %x, i32 %y) { ; W32: .functype test_ctselect_umax_generic (i32, i32) -> (i32) ; W32-NEXT: .local i32 ; W32-NEXT: # %bb.0: +; W32-NEXT: i32.const -1 ; W32-NEXT: i32.const 0 ; W32-NEXT: local.get 0 ; W32-NEXT: local.get 1 ; W32-NEXT: i32.gt_u -; W32-NEXT: i32.sub +; W32-NEXT: i32.select ; W32-NEXT: local.tee 2 ; W32-NEXT: local.get 0 ; W32-NEXT: i32.and @@ -226,11 +231,12 @@ define i32 @test_ctselect_umax_generic(i32 %x, i32 %y) { ; W64: .functype test_ctselect_umax_generic (i32, i32) -> (i32) ; W64-NEXT: .local i32 ; W64-NEXT: # %bb.0: +; W64-NEXT: i32.const -1 ; W64-NEXT: i32.const 0 ; W64-NEXT: local.get 0 ; W64-NEXT: local.get 1 ; W64-NEXT: i32.gt_u -; W64-NEXT: i32.sub +; W64-NEXT: i32.select ; W64-NEXT: local.tee 2 ; W64-NEXT: local.get 0 ; W64-NEXT: i32.and @@ -252,11 +258,9 @@ define i32 @test_ctselect_abs(i32 %x) { ; W32: .functype test_ctselect_abs (i32) -> (i32) ; W32-NEXT: .local i32 ; W32-NEXT: # %bb.0: -; W32-NEXT: i32.const 0 ; W32-NEXT: local.get 0 -; W32-NEXT: i32.const 0 -; W32-NEXT: i32.lt_s -; W32-NEXT: i32.sub +; W32-NEXT: i32.const 31 +; W32-NEXT: i32.shr_s ; W32-NEXT: local.tee 1 ; W32-NEXT: i32.const 0 ; W32-NEXT: local.get 0 @@ -274,11 +278,9 @@ define i32 @test_ctselect_abs(i32 %x) { ; W64: .functype test_ctselect_abs (i32) -> (i32) ; W64-NEXT: .local i32 ; W64-NEXT: # %bb.0: -; W64-NEXT: i32.const 0 ; W64-NEXT: local.get 0 -; W64-NEXT: i32.const 0 -; W64-NEXT: i32.lt_s -; W64-NEXT: i32.sub +; W64-NEXT: i32.const 31 +; W64-NEXT: i32.shr_s ; W64-NEXT: local.tee 1 ; W64-NEXT: i32.const 0 ; W64-NEXT: local.get 0 @@ -303,11 +305,9 @@ define i32 @test_ctselect_nabs(i32 %x) { ; W32: .functype test_ctselect_nabs (i32) -> (i32) ; W32-NEXT: .local 
i32 ; W32-NEXT: # %bb.0: -; W32-NEXT: i32.const 0 ; W32-NEXT: local.get 0 -; W32-NEXT: i32.const 0 -; W32-NEXT: i32.lt_s -; W32-NEXT: i32.sub +; W32-NEXT: i32.const 31 +; W32-NEXT: i32.shr_s ; W32-NEXT: local.tee 1 ; W32-NEXT: local.get 0 ; W32-NEXT: i32.and @@ -325,11 +325,9 @@ define i32 @test_ctselect_nabs(i32 %x) { ; W64: .functype test_ctselect_nabs (i32) -> (i32) ; W64-NEXT: .local i32 ; W64-NEXT: # %bb.0: -; W64-NEXT: i32.const 0 ; W64-NEXT: local.get 0 -; W64-NEXT: i32.const 0 -; W64-NEXT: i32.lt_s -; W64-NEXT: i32.sub +; W64-NEXT: i32.const 31 +; W64-NEXT: i32.shr_s ; W64-NEXT: local.tee 1 ; W64-NEXT: local.get 0 ; W64-NEXT: i32.and @@ -353,21 +351,17 @@ define i32 @test_ctselect_sign_extend(i32 %x) { ; W32-LABEL: test_ctselect_sign_extend: ; W32: .functype test_ctselect_sign_extend (i32) -> (i32) ; W32-NEXT: # %bb.0: -; W32-NEXT: i32.const 0 ; W32-NEXT: local.get 0 -; W32-NEXT: i32.const 0 -; W32-NEXT: i32.lt_s -; W32-NEXT: i32.sub +; W32-NEXT: i32.const 31 +; W32-NEXT: i32.shr_s ; W32-NEXT: # fallthrough-return ; ; W64-LABEL: test_ctselect_sign_extend: ; W64: .functype test_ctselect_sign_extend (i32) -> (i32) ; W64-NEXT: # %bb.0: -; W64-NEXT: i32.const 0 ; W64-NEXT: local.get 0 -; W64-NEXT: i32.const 0 -; W64-NEXT: i32.lt_s -; W64-NEXT: i32.sub +; W64-NEXT: i32.const 31 +; W64-NEXT: i32.shr_s ; W64-NEXT: # fallthrough-return %cmp = icmp slt i32 %x, 0 %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 -1, i32 0) @@ -379,11 +373,10 @@ define i32 @test_ctselect_zero_extend(i32 %x) { ; W32-LABEL: test_ctselect_zero_extend: ; W32: .functype test_ctselect_zero_extend (i32) -> (i32) ; W32-NEXT: # %bb.0: +; W32-NEXT: i32.const -1 ; W32-NEXT: i32.const 0 ; W32-NEXT: local.get 0 -; W32-NEXT: i32.const 0 -; W32-NEXT: i32.ne -; W32-NEXT: i32.sub +; W32-NEXT: i32.select ; W32-NEXT: i32.const 1 ; W32-NEXT: i32.and ; W32-NEXT: # fallthrough-return @@ -391,11 +384,10 @@ define i32 @test_ctselect_zero_extend(i32 %x) { ; W64-LABEL: test_ctselect_zero_extend: ; W64: .functype test_ctselect_zero_extend (i32) -> (i32) ; W64-NEXT: # %bb.0: +; W64-NEXT: i32.const -1 ; W64-NEXT: i32.const 0 ; W64-NEXT: local.get 0 -; W64-NEXT: i32.const 0 -; W64-NEXT: i32.ne -; W64-NEXT: i32.sub +; W64-NEXT: i32.select ; W64-NEXT: i32.const 1 ; W64-NEXT: i32.and ; W64-NEXT: # fallthrough-return @@ -446,8 +438,6 @@ define i32 @test_ctselect_identical_operands(i1 %cond, i32 %x) { ; W32-NEXT: local.get 0 ; W32-NEXT: i32.const 1 ; W32-NEXT: i32.and -; W32-NEXT: i32.const 1 -; W32-NEXT: i32.and ; W32-NEXT: i32.sub ; W32-NEXT: local.tee 0 ; W32-NEXT: local.get 1 @@ -467,8 +457,6 @@ define i32 @test_ctselect_identical_operands(i1 %cond, i32 %x) { ; W64-NEXT: local.get 0 ; W64-NEXT: i32.const 1 ; W64-NEXT: i32.and -; W64-NEXT: i32.const 1 -; W64-NEXT: i32.and ; W64-NEXT: i32.sub ; W64-NEXT: local.tee 0 ; W64-NEXT: local.get 1 @@ -489,11 +477,12 @@ define i32 @test_ctselect_inverted_condition(i32 %x, i32 %y, i32 %a, i32 %b) { ; W32-LABEL: test_ctselect_inverted_condition: ; W32: .functype test_ctselect_inverted_condition (i32, i32, i32, i32) -> (i32) ; W32-NEXT: # %bb.0: +; W32-NEXT: i32.const -1 ; W32-NEXT: i32.const 0 ; W32-NEXT: local.get 0 ; W32-NEXT: local.get 1 ; W32-NEXT: i32.ne -; W32-NEXT: i32.sub +; W32-NEXT: i32.select ; W32-NEXT: local.tee 1 ; W32-NEXT: local.get 2 ; W32-NEXT: i32.and @@ -508,11 +497,12 @@ define i32 @test_ctselect_inverted_condition(i32 %x, i32 %y, i32 %a, i32 %b) { ; W64-LABEL: test_ctselect_inverted_condition: ; W64: .functype test_ctselect_inverted_condition (i32, i32, i32, i32) -> (i32) ; 
W64-NEXT: # %bb.0: +; W64-NEXT: i32.const -1 ; W64-NEXT: i32.const 0 ; W64-NEXT: local.get 0 ; W64-NEXT: local.get 1 ; W64-NEXT: i32.ne -; W64-NEXT: i32.sub +; W64-NEXT: i32.select ; W64-NEXT: local.tee 1 ; W64-NEXT: local.get 2 ; W64-NEXT: i32.and @@ -538,24 +528,18 @@ define i32 @test_ctselect_chain(i1 %c1, i1 %c2, i1 %c3, i32 %a, i32 %b, i32 %c, ; W32-NEXT: local.get 2 ; W32-NEXT: i32.const 1 ; W32-NEXT: i32.and -; W32-NEXT: i32.const 1 -; W32-NEXT: i32.and ; W32-NEXT: i32.sub ; W32-NEXT: local.tee 2 ; W32-NEXT: i32.const 0 ; W32-NEXT: local.get 1 ; W32-NEXT: i32.const 1 ; W32-NEXT: i32.and -; W32-NEXT: i32.const 1 -; W32-NEXT: i32.and ; W32-NEXT: i32.sub ; W32-NEXT: local.tee 1 ; W32-NEXT: i32.const 0 ; W32-NEXT: local.get 0 ; W32-NEXT: i32.const 1 ; W32-NEXT: i32.and -; W32-NEXT: i32.const 1 -; W32-NEXT: i32.and ; W32-NEXT: i32.sub ; W32-NEXT: local.tee 0 ; W32-NEXT: local.get 3 @@ -589,24 +573,18 @@ define i32 @test_ctselect_chain(i1 %c1, i1 %c2, i1 %c3, i32 %a, i32 %b, i32 %c, ; W64-NEXT: local.get 2 ; W64-NEXT: i32.const 1 ; W64-NEXT: i32.and -; W64-NEXT: i32.const 1 -; W64-NEXT: i32.and ; W64-NEXT: i32.sub ; W64-NEXT: local.tee 2 ; W64-NEXT: i32.const 0 ; W64-NEXT: local.get 1 ; W64-NEXT: i32.const 1 ; W64-NEXT: i32.and -; W64-NEXT: i32.const 1 -; W64-NEXT: i32.and ; W64-NEXT: i32.sub ; W64-NEXT: local.tee 1 ; W64-NEXT: i32.const 0 ; W64-NEXT: local.get 0 ; W64-NEXT: i32.const 1 ; W64-NEXT: i32.and -; W64-NEXT: i32.const 1 -; W64-NEXT: i32.and ; W64-NEXT: i32.sub ; W64-NEXT: local.tee 0 ; W64-NEXT: local.get 3 @@ -643,11 +621,9 @@ define i64 @test_ctselect_i64_smin_zero(i64 %x) { ; W32-LABEL: test_ctselect_i64_smin_zero: ; W32: .functype test_ctselect_i64_smin_zero (i64) -> (i64) ; W32-NEXT: # %bb.0: -; W32-NEXT: i64.const 0 ; W32-NEXT: local.get 0 ; W32-NEXT: i64.const 63 -; W32-NEXT: i64.shr_u -; W32-NEXT: i64.sub +; W32-NEXT: i64.shr_s ; W32-NEXT: local.get 0 ; W32-NEXT: i64.and ; W32-NEXT: # fallthrough-return @@ -655,11 +631,9 @@ define i64 @test_ctselect_i64_smin_zero(i64 %x) { ; W64-LABEL: test_ctselect_i64_smin_zero: ; W64: .functype test_ctselect_i64_smin_zero (i64) -> (i64) ; W64-NEXT: # %bb.0: -; W64-NEXT: i64.const 0 ; W64-NEXT: local.get 0 ; W64-NEXT: i64.const 63 -; W64-NEXT: i64.shr_u -; W64-NEXT: i64.sub +; W64-NEXT: i64.shr_s ; W64-NEXT: local.get 0 ; W64-NEXT: i64.and ; W64-NEXT: # fallthrough-return diff --git a/llvm/test/CodeGen/WebAssembly/ctselect-fallback.ll b/llvm/test/CodeGen/WebAssembly/ctselect-fallback.ll index 39d38a415ec42..db21bd58f6797 100644 --- a/llvm/test/CodeGen/WebAssembly/ctselect-fallback.ll +++ b/llvm/test/CodeGen/WebAssembly/ctselect-fallback.ll @@ -95,8 +95,6 @@ define i32 @test_ctselect_i32(i1 %cond, i32 %a, i32 %b) { ; W32-NEXT: local.get 0 ; W32-NEXT: i32.const 1 ; W32-NEXT: i32.and -; W32-NEXT: i32.const 1 -; W32-NEXT: i32.and ; W32-NEXT: i32.sub ; W32-NEXT: local.tee 0 ; W32-NEXT: local.get 1 @@ -116,8 +114,6 @@ define i32 @test_ctselect_i32(i1 %cond, i32 %a, i32 %b) { ; W64-NEXT: local.get 0 ; W64-NEXT: i32.const 1 ; W64-NEXT: i32.and -; W64-NEXT: i32.const 1 -; W64-NEXT: i32.and ; W64-NEXT: i32.sub ; W64-NEXT: local.tee 0 ; W64-NEXT: local.get 1 @@ -143,8 +139,6 @@ define i64 @test_ctselect_i64(i1 %cond, i64 %a, i64 %b) { ; W32-NEXT: i64.extend_i32_u ; W32-NEXT: i64.const 1 ; W32-NEXT: i64.and -; W32-NEXT: i64.const 1 -; W32-NEXT: i64.and ; W32-NEXT: i64.sub ; W32-NEXT: local.tee 3 ; W32-NEXT: local.get 1 @@ -166,8 +160,6 @@ define i64 @test_ctselect_i64(i1 %cond, i64 %a, i64 %b) { ; W64-NEXT: i64.extend_i32_u ; W64-NEXT: i64.const 
1 ; W64-NEXT: i64.and -; W64-NEXT: i64.const 1 -; W64-NEXT: i64.and ; W64-NEXT: i64.sub ; W64-NEXT: local.tee 3 ; W64-NEXT: local.get 1 @@ -191,8 +183,6 @@ define ptr @test_ctselect_ptr(i1 %cond, ptr %a, ptr %b) { ; W32-NEXT: local.get 0 ; W32-NEXT: i32.const 1 ; W32-NEXT: i32.and -; W32-NEXT: i32.const 1 -; W32-NEXT: i32.and ; W32-NEXT: i32.sub ; W32-NEXT: local.tee 0 ; W32-NEXT: local.get 1 @@ -214,8 +204,6 @@ define ptr @test_ctselect_ptr(i1 %cond, ptr %a, ptr %b) { ; W64-NEXT: i64.extend_i32_u ; W64-NEXT: i64.const 1 ; W64-NEXT: i64.and -; W64-NEXT: i64.const 1 -; W64-NEXT: i64.and ; W64-NEXT: i64.sub ; W64-NEXT: local.tee 3 ; W64-NEXT: local.get 1 @@ -269,11 +257,12 @@ define i32 @test_ctselect_icmp_eq(i32 %x, i32 %y, i32 %a, i32 %b) { ; W32-LABEL: test_ctselect_icmp_eq: ; W32: .functype test_ctselect_icmp_eq (i32, i32, i32, i32) -> (i32) ; W32-NEXT: # %bb.0: +; W32-NEXT: i32.const -1 ; W32-NEXT: i32.const 0 ; W32-NEXT: local.get 0 ; W32-NEXT: local.get 1 ; W32-NEXT: i32.eq -; W32-NEXT: i32.sub +; W32-NEXT: i32.select ; W32-NEXT: local.tee 1 ; W32-NEXT: local.get 2 ; W32-NEXT: i32.and @@ -288,11 +277,12 @@ define i32 @test_ctselect_icmp_eq(i32 %x, i32 %y, i32 %a, i32 %b) { ; W64-LABEL: test_ctselect_icmp_eq: ; W64: .functype test_ctselect_icmp_eq (i32, i32, i32, i32) -> (i32) ; W64-NEXT: # %bb.0: +; W64-NEXT: i32.const -1 ; W64-NEXT: i32.const 0 ; W64-NEXT: local.get 0 ; W64-NEXT: local.get 1 ; W64-NEXT: i32.eq -; W64-NEXT: i32.sub +; W64-NEXT: i32.select ; W64-NEXT: local.tee 1 ; W64-NEXT: local.get 2 ; W64-NEXT: i32.and @@ -312,11 +302,12 @@ define i32 @test_ctselect_icmp_ne(i32 %x, i32 %y, i32 %a, i32 %b) { ; W32-LABEL: test_ctselect_icmp_ne: ; W32: .functype test_ctselect_icmp_ne (i32, i32, i32, i32) -> (i32) ; W32-NEXT: # %bb.0: +; W32-NEXT: i32.const -1 ; W32-NEXT: i32.const 0 ; W32-NEXT: local.get 0 ; W32-NEXT: local.get 1 ; W32-NEXT: i32.ne -; W32-NEXT: i32.sub +; W32-NEXT: i32.select ; W32-NEXT: local.tee 1 ; W32-NEXT: local.get 2 ; W32-NEXT: i32.and @@ -331,11 +322,12 @@ define i32 @test_ctselect_icmp_ne(i32 %x, i32 %y, i32 %a, i32 %b) { ; W64-LABEL: test_ctselect_icmp_ne: ; W64: .functype test_ctselect_icmp_ne (i32, i32, i32, i32) -> (i32) ; W64-NEXT: # %bb.0: +; W64-NEXT: i32.const -1 ; W64-NEXT: i32.const 0 ; W64-NEXT: local.get 0 ; W64-NEXT: local.get 1 ; W64-NEXT: i32.ne -; W64-NEXT: i32.sub +; W64-NEXT: i32.select ; W64-NEXT: local.tee 1 ; W64-NEXT: local.get 2 ; W64-NEXT: i32.and @@ -355,11 +347,12 @@ define i32 @test_ctselect_icmp_slt(i32 %x, i32 %y, i32 %a, i32 %b) { ; W32-LABEL: test_ctselect_icmp_slt: ; W32: .functype test_ctselect_icmp_slt (i32, i32, i32, i32) -> (i32) ; W32-NEXT: # %bb.0: +; W32-NEXT: i32.const -1 ; W32-NEXT: i32.const 0 ; W32-NEXT: local.get 0 ; W32-NEXT: local.get 1 ; W32-NEXT: i32.lt_s -; W32-NEXT: i32.sub +; W32-NEXT: i32.select ; W32-NEXT: local.tee 1 ; W32-NEXT: local.get 2 ; W32-NEXT: i32.and @@ -374,11 +367,12 @@ define i32 @test_ctselect_icmp_slt(i32 %x, i32 %y, i32 %a, i32 %b) { ; W64-LABEL: test_ctselect_icmp_slt: ; W64: .functype test_ctselect_icmp_slt (i32, i32, i32, i32) -> (i32) ; W64-NEXT: # %bb.0: +; W64-NEXT: i32.const -1 ; W64-NEXT: i32.const 0 ; W64-NEXT: local.get 0 ; W64-NEXT: local.get 1 ; W64-NEXT: i32.lt_s -; W64-NEXT: i32.sub +; W64-NEXT: i32.select ; W64-NEXT: local.tee 1 ; W64-NEXT: local.get 2 ; W64-NEXT: i32.and @@ -398,11 +392,12 @@ define i32 @test_ctselect_icmp_ult(i32 %x, i32 %y, i32 %a, i32 %b) { ; W32-LABEL: test_ctselect_icmp_ult: ; W32: .functype test_ctselect_icmp_ult (i32, i32, i32, i32) -> (i32) ; 
W32-NEXT: # %bb.0: +; W32-NEXT: i32.const -1 ; W32-NEXT: i32.const 0 ; W32-NEXT: local.get 0 ; W32-NEXT: local.get 1 ; W32-NEXT: i32.lt_u -; W32-NEXT: i32.sub +; W32-NEXT: i32.select ; W32-NEXT: local.tee 1 ; W32-NEXT: local.get 2 ; W32-NEXT: i32.and @@ -417,11 +412,12 @@ define i32 @test_ctselect_icmp_ult(i32 %x, i32 %y, i32 %a, i32 %b) { ; W64-LABEL: test_ctselect_icmp_ult: ; W64: .functype test_ctselect_icmp_ult (i32, i32, i32, i32) -> (i32) ; W64-NEXT: # %bb.0: +; W64-NEXT: i32.const -1 ; W64-NEXT: i32.const 0 ; W64-NEXT: local.get 0 ; W64-NEXT: local.get 1 ; W64-NEXT: i32.lt_u -; W64-NEXT: i32.sub +; W64-NEXT: i32.select ; W64-NEXT: local.tee 1 ; W64-NEXT: local.get 2 ; W64-NEXT: i32.and @@ -446,8 +442,6 @@ define i32 @test_ctselect_load(i1 %cond, ptr %p1, ptr %p2) { ; W32-NEXT: local.get 0 ; W32-NEXT: i32.const 1 ; W32-NEXT: i32.and -; W32-NEXT: i32.const 1 -; W32-NEXT: i32.and ; W32-NEXT: i32.sub ; W32-NEXT: local.tee 0 ; W32-NEXT: local.get 1 @@ -469,8 +463,6 @@ define i32 @test_ctselect_load(i1 %cond, ptr %p1, ptr %p2) { ; W64-NEXT: local.get 0 ; W64-NEXT: i32.const 1 ; W64-NEXT: i32.and -; W64-NEXT: i32.const 1 -; W64-NEXT: i32.and ; W64-NEXT: i32.sub ; W64-NEXT: local.tee 0 ; W64-NEXT: local.get 1 @@ -499,16 +491,12 @@ define i32 @test_ctselect_nested(i1 %cond1, i1 %cond2, i32 %a, i32 %b, i32 %c) { ; W32-NEXT: local.get 0 ; W32-NEXT: i32.const 1 ; W32-NEXT: i32.and -; W32-NEXT: i32.const 1 -; W32-NEXT: i32.and ; W32-NEXT: i32.sub ; W32-NEXT: local.tee 0 ; W32-NEXT: i32.const 0 ; W32-NEXT: local.get 1 ; W32-NEXT: i32.const 1 ; W32-NEXT: i32.and -; W32-NEXT: i32.const 1 -; W32-NEXT: i32.and ; W32-NEXT: i32.sub ; W32-NEXT: local.tee 1 ; W32-NEXT: local.get 2 @@ -535,16 +523,12 @@ define i32 @test_ctselect_nested(i1 %cond1, i1 %cond2, i32 %a, i32 %b, i32 %c) { ; W64-NEXT: local.get 0 ; W64-NEXT: i32.const 1 ; W64-NEXT: i32.and -; W64-NEXT: i32.const 1 -; W64-NEXT: i32.and ; W64-NEXT: i32.sub ; W64-NEXT: local.tee 0 ; W64-NEXT: i32.const 0 ; W64-NEXT: local.get 1 ; W64-NEXT: i32.const 1 ; W64-NEXT: i32.and -; W64-NEXT: i32.const 1 -; W64-NEXT: i32.and ; W64-NEXT: i32.sub ; W64-NEXT: local.tee 1 ; W64-NEXT: local.get 2 @@ -577,8 +561,6 @@ define float @test_ctselect_f32(i1 %cond, float %a, float %b) { ; W32-NEXT: local.get 0 ; W32-NEXT: i32.const 1 ; W32-NEXT: i32.and -; W32-NEXT: i32.const 1 -; W32-NEXT: i32.and ; W32-NEXT: i32.sub ; W32-NEXT: local.tee 0 ; W32-NEXT: local.get 1 @@ -601,8 +583,6 @@ define float @test_ctselect_f32(i1 %cond, float %a, float %b) { ; W64-NEXT: local.get 0 ; W64-NEXT: i32.const 1 ; W64-NEXT: i32.and -; W64-NEXT: i32.const 1 -; W64-NEXT: i32.and ; W64-NEXT: i32.sub ; W64-NEXT: local.tee 0 ; W64-NEXT: local.get 1 @@ -632,8 +612,6 @@ define double @test_ctselect_f64(i1 %cond, double %a, double %b) { ; W32-NEXT: i64.extend_i32_u ; W32-NEXT: i64.const 1 ; W32-NEXT: i64.and -; W32-NEXT: i64.const 1 -; W32-NEXT: i64.and ; W32-NEXT: i64.sub ; W32-NEXT: local.tee 3 ; W32-NEXT: local.get 1 @@ -658,8 +636,6 @@ define double @test_ctselect_f64(i1 %cond, double %a, double %b) { ; W64-NEXT: i64.extend_i32_u ; W64-NEXT: i64.const 1 ; W64-NEXT: i64.and -; W64-NEXT: i64.const 1 -; W64-NEXT: i64.and ; W64-NEXT: i64.sub ; W64-NEXT: local.tee 3 ; W64-NEXT: local.get 1 @@ -688,16 +664,12 @@ define float @test_ctselect_f32_chain(i1 %cond1, i1 %cond2, float %a, float %b, ; W32-NEXT: local.get 1 ; W32-NEXT: i32.const 1 ; W32-NEXT: i32.and -; W32-NEXT: i32.const 1 -; W32-NEXT: i32.and ; W32-NEXT: i32.sub ; W32-NEXT: local.tee 1 ; W32-NEXT: i32.const 0 ; W32-NEXT: local.get 0 ; 
W32-NEXT: i32.const 1 ; W32-NEXT: i32.and -; W32-NEXT: i32.const 1 -; W32-NEXT: i32.and ; W32-NEXT: i32.sub ; W32-NEXT: local.tee 0 ; W32-NEXT: local.get 2 @@ -728,16 +700,12 @@ define float @test_ctselect_f32_chain(i1 %cond1, i1 %cond2, float %a, float %b, ; W64-NEXT: local.get 1 ; W64-NEXT: i32.const 1 ; W64-NEXT: i32.and -; W64-NEXT: i32.const 1 -; W64-NEXT: i32.and ; W64-NEXT: i32.sub ; W64-NEXT: local.tee 1 ; W64-NEXT: i32.const 0 ; W64-NEXT: local.get 0 ; W64-NEXT: i32.const 1 ; W64-NEXT: i32.and -; W64-NEXT: i32.const 1 -; W64-NEXT: i32.and ; W64-NEXT: i32.sub ; W64-NEXT: local.tee 0 ; W64-NEXT: local.get 2 @@ -774,8 +742,6 @@ define float @test_ctselect_f32_load(i1 %cond, ptr %p1, ptr %p2) { ; W32-NEXT: local.get 0 ; W32-NEXT: i32.const 1 ; W32-NEXT: i32.and -; W32-NEXT: i32.const 1 -; W32-NEXT: i32.and ; W32-NEXT: i32.sub ; W32-NEXT: local.tee 0 ; W32-NEXT: local.get 1 @@ -798,8 +764,6 @@ define float @test_ctselect_f32_load(i1 %cond, ptr %p1, ptr %p2) { ; W64-NEXT: local.get 0 ; W64-NEXT: i32.const 1 ; W64-NEXT: i32.and -; W64-NEXT: i32.const 1 -; W64-NEXT: i32.and ; W64-NEXT: i32.sub ; W64-NEXT: local.tee 0 ; W64-NEXT: local.get 1 @@ -831,8 +795,6 @@ define double @test_ctselect_f64_load(i1 %cond, ptr %p1, ptr %p2) { ; W32-NEXT: i64.extend_i32_u ; W32-NEXT: i64.const 1 ; W32-NEXT: i64.and -; W32-NEXT: i64.const 1 -; W32-NEXT: i64.and ; W32-NEXT: i64.sub ; W32-NEXT: local.tee 3 ; W32-NEXT: local.get 1 @@ -857,8 +819,6 @@ define double @test_ctselect_f64_load(i1 %cond, ptr %p1, ptr %p2) { ; W64-NEXT: i64.extend_i32_u ; W64-NEXT: i64.const 1 ; W64-NEXT: i64.and -; W64-NEXT: i64.const 1 -; W64-NEXT: i64.and ; W64-NEXT: i64.sub ; W64-NEXT: local.tee 3 ; W64-NEXT: local.get 1 @@ -888,8 +848,6 @@ define float @test_ctselect_f32_arithmetic(i1 %cond, float %x, float %y) { ; W32-NEXT: local.get 0 ; W32-NEXT: i32.const 1 ; W32-NEXT: i32.and -; W32-NEXT: i32.const 1 -; W32-NEXT: i32.and ; W32-NEXT: i32.sub ; W32-NEXT: local.tee 0 ; W32-NEXT: local.get 1 @@ -916,8 +874,6 @@ define float @test_ctselect_f32_arithmetic(i1 %cond, float %x, float %y) { ; W64-NEXT: local.get 0 ; W64-NEXT: i32.const 1 ; W64-NEXT: i32.and -; W64-NEXT: i32.const 1 -; W64-NEXT: i32.and ; W64-NEXT: i32.sub ; W64-NEXT: local.tee 0 ; W64-NEXT: local.get 1 diff --git a/llvm/test/CodeGen/WebAssembly/ctselect-side-effects.ll b/llvm/test/CodeGen/WebAssembly/ctselect-side-effects.ll index 036160b6dbadb..cc5746389aa93 100644 --- a/llvm/test/CodeGen/WebAssembly/ctselect-side-effects.ll +++ b/llvm/test/CodeGen/WebAssembly/ctselect-side-effects.ll @@ -47,8 +47,6 @@ define i32 @test_protected_no_branch(i1 %cond, i32 %a, i32 %b) { ; W32-NEXT: local.get 0 ; W32-NEXT: i32.const 1 ; W32-NEXT: i32.and -; W32-NEXT: i32.const 1 -; W32-NEXT: i32.and ; W32-NEXT: i32.sub ; W32-NEXT: local.tee 0 ; W32-NEXT: local.get 1 @@ -68,8 +66,6 @@ define i32 @test_protected_no_branch(i1 %cond, i32 %a, i32 %b) { ; W64-NEXT: local.get 0 ; W64-NEXT: i32.const 1 ; W64-NEXT: i32.and -; W64-NEXT: i32.const 1 -; W64-NEXT: i32.and ; W64-NEXT: i32.sub ; W64-NEXT: local.tee 0 ; W64-NEXT: local.get 1 From 71b49557450985eccf95452981c7fb78738c0ff8 Mon Sep 17 00:00:00 2001 From: wizardengineer Date: Mon, 18 Aug 2025 06:33:24 -0400 Subject: [PATCH 15/63] Add specific test: cmovznz4 --- .../Mips/ctselect-fallback-edge-cases.ll | 210 +++++++++++++ .../RISCV/ctselect-fallback-edge-cases.ll | 239 ++++++++++++++ .../ctselect-fallback-edge-cases.ll | 291 ++++++++++++++++++ 3 files changed, 740 insertions(+) diff --git
a/llvm/test/CodeGen/Mips/ctselect-fallback-edge-cases.ll b/llvm/test/CodeGen/Mips/ctselect-fallback-edge-cases.ll index 5b4bdffbc76f6..1e0478cabe744 100644 --- a/llvm/test/CodeGen/Mips/ctselect-fallback-edge-cases.ll +++ b/llvm/test/CodeGen/Mips/ctselect-fallback-edge-cases.ll @@ -240,6 +240,216 @@ define i32 @test_ctselect_deeply_nested(i1 %c1, i1 %c2, i1 %c3, i1 %c4, i32 %a, ret i32 %sel4 } + ; This test demonstrates the FStar cmovznz4 pattern using ct.select +; Based on https://godbolt.org/z/6Kb71Ks7z +; Shows that NoMerge flag prevents DAG optimization from introducing branches +define void @cmovznz4_fstar_original(i64 %cin, ptr %x, ptr %y, ptr %r) { +; M32-LABEL: cmovznz4_fstar_original: +; M32: # %bb.0: # %entry +; M32-NEXT: or $1, $4, $5 +; M32-NEXT: addiu $2, $7, 16 +; M32-NEXT: addiu $3, $6, 16 +; M32-NEXT: addiu $4, $6, 8 +; M32-NEXT: movz $2, $3, $1 +; M32-NEXT: addiu $3, $7, 8 +; M32-NEXT: movz $3, $4, $1 +; M32-NEXT: addiu $4, $7, 24 +; M32-NEXT: movz $7, $6, $1 +; M32-NEXT: addiu $6, $6, 24 +; M32-NEXT: lw $9, 4($2) +; M32-NEXT: lw $2, 0($2) +; M32-NEXT: movz $4, $6, $1 +; M32-NEXT: lw $5, 4($7) +; M32-NEXT: lw $8, 4($3) +; M32-NEXT: lw $7, 0($7) +; M32-NEXT: lw $3, 0($3) +; M32-NEXT: lw $6, 16($sp) +; M32-NEXT: lw $1, 4($4) +; M32-NEXT: lw $4, 0($4) +; M32-NEXT: sw $4, 24($6) +; M32-NEXT: sw $1, 28($6) +; M32-NEXT: sw $2, 16($6) +; M32-NEXT: sw $9, 20($6) +; M32-NEXT: sw $3, 8($6) +; M32-NEXT: sw $8, 12($6) +; M32-NEXT: sw $7, 0($6) +; M32-NEXT: jr $ra +; M32-NEXT: sw $5, 4($6) +; +; M64-LABEL: cmovznz4_fstar_original: +; M64: # %bb.0: # %entry +; M64-NEXT: daddiu $1, $6, 8 +; M64-NEXT: daddiu $2, $5, 8 +; M64-NEXT: daddiu $3, $6, 16 +; M64-NEXT: daddiu $8, $5, 16 +; M64-NEXT: movz $1, $2, $4 +; M64-NEXT: move $2, $6 +; M64-NEXT: daddiu $6, $6, 24 +; M64-NEXT: movz $3, $8, $4 +; M64-NEXT: movz $2, $5, $4 +; M64-NEXT: daddiu $5, $5, 24 +; M64-NEXT: ld $1, 0($1) +; M64-NEXT: ld $3, 0($3) +; M64-NEXT: movz $6, $5, $4 +; M64-NEXT: ld $2, 0($2) +; M64-NEXT: ld $4, 0($6) +; M64-NEXT: sd $4, 24($7) +; M64-NEXT: sd $3, 16($7) +; M64-NEXT: sd $1, 8($7) +; M64-NEXT: jr $ra +; M64-NEXT: sd $2, 0($7) +entry: + %.not.i = icmp eq i64 %cin, 0 + %0 = load i64, ptr %y, align 8 + %1 = load i64, ptr %x, align 8 + %or = select i1 %.not.i, i64 %1, i64 %0 + %arrayidx4 = getelementptr inbounds nuw i8, ptr %y, i64 8 + %2 = load i64, ptr %arrayidx4, align 8 + %arrayidx6 = getelementptr inbounds nuw i8, ptr %x, i64 8 + %3 = load i64, ptr %arrayidx6, align 8 + %or9 = select i1 %.not.i, i64 %3, i64 %2 + %arrayidx10 = getelementptr inbounds nuw i8, ptr %y, i64 16 + %4 = load i64, ptr %arrayidx10, align 8 + %arrayidx12 = getelementptr inbounds nuw i8, ptr %x, i64 16 + %5 = load i64, ptr %arrayidx12, align 8 + %or15 = select i1 %.not.i, i64 %5, i64 %4 + %arrayidx16 = getelementptr inbounds nuw i8, ptr %y, i64 24 + %6 = load i64, ptr %arrayidx16, align 8 + %arrayidx18 = getelementptr inbounds nuw i8, ptr %x, i64 24 + %7 = load i64, ptr %arrayidx18, align 8 + %or21 = select i1 %.not.i, i64 %7, i64 %6 + store i64 %or, ptr %r, align 8 + %arrayidx23 = getelementptr inbounds nuw i8, ptr %r, i64 8 + store i64 %or9, ptr %arrayidx23, align 8 + %arrayidx24 = getelementptr inbounds nuw i8, ptr %r, i64 16 + store i64 %or15, ptr %arrayidx24, align 8 + %arrayidx25 = getelementptr inbounds nuw i8, ptr %r, i64 24 + store i64 %or21, ptr %arrayidx25, align 8 + ret void +} + +define void @cmovznz4_builtin_ctselect(i64 %cin, ptr %x, ptr %y, ptr %r) { +; M32-LABEL: cmovznz4_builtin_ctselect: +; M32: # %bb.0: # %entry +; 
M32-NEXT: or $1, $4, $5 +; M32-NEXT: lw $3, 4($7) +; M32-NEXT: lw $4, 4($6) +; M32-NEXT: sltiu $1, $1, 1 +; M32-NEXT: addiu $2, $1, -1 +; M32-NEXT: negu $1, $1 +; M32-NEXT: and $3, $2, $3 +; M32-NEXT: and $4, $1, $4 +; M32-NEXT: or $3, $4, $3 +; M32-NEXT: lw $4, 16($sp) +; M32-NEXT: sw $3, 4($4) +; M32-NEXT: lw $3, 0($7) +; M32-NEXT: lw $5, 0($6) +; M32-NEXT: and $3, $2, $3 +; M32-NEXT: and $5, $1, $5 +; M32-NEXT: or $3, $5, $3 +; M32-NEXT: sw $3, 0($4) +; M32-NEXT: lw $3, 12($7) +; M32-NEXT: lw $5, 12($6) +; M32-NEXT: and $3, $2, $3 +; M32-NEXT: and $5, $1, $5 +; M32-NEXT: or $3, $5, $3 +; M32-NEXT: sw $3, 12($4) +; M32-NEXT: lw $3, 8($7) +; M32-NEXT: lw $5, 8($6) +; M32-NEXT: and $3, $2, $3 +; M32-NEXT: and $5, $1, $5 +; M32-NEXT: or $3, $5, $3 +; M32-NEXT: sw $3, 8($4) +; M32-NEXT: lw $3, 20($7) +; M32-NEXT: lw $5, 20($6) +; M32-NEXT: and $3, $2, $3 +; M32-NEXT: and $5, $1, $5 +; M32-NEXT: or $3, $5, $3 +; M32-NEXT: sw $3, 20($4) +; M32-NEXT: lw $3, 16($7) +; M32-NEXT: lw $5, 16($6) +; M32-NEXT: and $3, $2, $3 +; M32-NEXT: and $5, $1, $5 +; M32-NEXT: or $3, $5, $3 +; M32-NEXT: sw $3, 16($4) +; M32-NEXT: lw $3, 28($7) +; M32-NEXT: lw $5, 28($6) +; M32-NEXT: and $3, $2, $3 +; M32-NEXT: and $5, $1, $5 +; M32-NEXT: or $3, $5, $3 +; M32-NEXT: sw $3, 28($4) +; M32-NEXT: lw $3, 24($7) +; M32-NEXT: and $2, $2, $3 +; M32-NEXT: lw $3, 24($6) +; M32-NEXT: and $1, $1, $3 +; M32-NEXT: or $1, $1, $2 +; M32-NEXT: jr $ra +; M32-NEXT: sw $1, 24($4) +; +; M64-LABEL: cmovznz4_builtin_ctselect: +; M64: # %bb.0: # %entry +; M64-NEXT: sltiu $1, $4, 1 +; M64-NEXT: daddiu $3, $zero, -1 +; M64-NEXT: dsll $1, $1, 32 +; M64-NEXT: ld $2, 0($5) +; M64-NEXT: ld $4, 0($6) +; M64-NEXT: dsrl $1, $1, 32 +; M64-NEXT: andi $1, $1, 1 +; M64-NEXT: dnegu $1, $1 +; M64-NEXT: xor $3, $1, $3 +; M64-NEXT: and $2, $1, $2 +; M64-NEXT: and $4, $3, $4 +; M64-NEXT: or $2, $2, $4 +; M64-NEXT: sd $2, 0($7) +; M64-NEXT: ld $2, 8($6) +; M64-NEXT: ld $4, 8($5) +; M64-NEXT: and $2, $3, $2 +; M64-NEXT: and $4, $1, $4 +; M64-NEXT: or $2, $4, $2 +; M64-NEXT: sd $2, 8($7) +; M64-NEXT: ld $2, 16($6) +; M64-NEXT: ld $4, 16($5) +; M64-NEXT: and $2, $3, $2 +; M64-NEXT: and $4, $1, $4 +; M64-NEXT: or $2, $4, $2 +; M64-NEXT: sd $2, 16($7) +; M64-NEXT: ld $2, 24($6) +; M64-NEXT: and $2, $3, $2 +; M64-NEXT: ld $3, 24($5) +; M64-NEXT: and $1, $1, $3 +; M64-NEXT: or $1, $1, $2 +; M64-NEXT: jr $ra +; M64-NEXT: sd $1, 24($7) +entry: + %cmp = icmp eq i64 %cin, 0 + %0 = load i64, ptr %x, align 8 + %1 = load i64, ptr %y, align 8 + %2 = tail call i64 @llvm.ct.select.i64(i1 %cmp, i64 %0, i64 %1) + store i64 %2, ptr %r, align 8 + %arrayidx4 = getelementptr inbounds nuw i8, ptr %x, i64 8 + %3 = load i64, ptr %arrayidx4, align 8 + %arrayidx5 = getelementptr inbounds nuw i8, ptr %y, i64 8 + %4 = load i64, ptr %arrayidx5, align 8 + %5 = tail call i64 @llvm.ct.select.i64(i1 %cmp, i64 %3, i64 %4) + %arrayidx6 = getelementptr inbounds nuw i8, ptr %r, i64 8 + store i64 %5, ptr %arrayidx6, align 8 + %arrayidx8 = getelementptr inbounds nuw i8, ptr %x, i64 16 + %6 = load i64, ptr %arrayidx8, align 8 + %arrayidx9 = getelementptr inbounds nuw i8, ptr %y, i64 16 + %7 = load i64, ptr %arrayidx9, align 8 + %8 = tail call i64 @llvm.ct.select.i64(i1 %cmp, i64 %6, i64 %7) + %arrayidx10 = getelementptr inbounds nuw i8, ptr %r, i64 16 + store i64 %8, ptr %arrayidx10, align 8 + %arrayidx12 = getelementptr inbounds nuw i8, ptr %x, i64 24 + %9 = load i64, ptr %arrayidx12, align 8 + %arrayidx13 = getelementptr inbounds nuw i8, ptr %y, i64 24 + %10 = load i64, ptr %arrayidx13, align 8 + 
%11 = tail call i64 @llvm.ct.select.i64(i1 %cmp, i64 %9, i64 %10) + %arrayidx14 = getelementptr inbounds nuw i8, ptr %r, i64 24 + store i64 %11, ptr %arrayidx14, align 8 + ret void +} + ; Declare the intrinsics declare i1 @llvm.ct.select.i1(i1, i1, i1) declare i32 @llvm.ct.select.i32(i1, i32, i32) diff --git a/llvm/test/CodeGen/RISCV/ctselect-fallback-edge-cases.ll b/llvm/test/CodeGen/RISCV/ctselect-fallback-edge-cases.ll index 323be38d3d865..a4e204d915334 100644 --- a/llvm/test/CodeGen/RISCV/ctselect-fallback-edge-cases.ll +++ b/llvm/test/CodeGen/RISCV/ctselect-fallback-edge-cases.ll @@ -217,6 +217,245 @@ define i32 @test_ctselect_deeply_nested(i1 %c1, i1 %c2, i1 %c3, i1 %c4, i32 %a, ret i32 %sel4 } +; This test demonstrates the FStar cmovznz4 pattern using ct.select +; Based on https://godbolt.org/z/6Kb71Ks7z +; Shows that NoMerge flag prevents DAG optimization from introducing branches +define void @cmovznz4_fstar_original(i64 %cin, ptr %x, ptr %y, ptr %r) { +; RV64-LABEL: cmovznz4_fstar_original: +; RV64: # %bb.0: # %entry +; RV64-NEXT: mv a4, a1 +; RV64-NEXT: beqz a0, .LBB7_2 +; RV64-NEXT: # %bb.1: # %entry +; RV64-NEXT: mv a4, a2 +; RV64-NEXT: .LBB7_2: # %entry +; RV64-NEXT: beqz a0, .LBB7_6 +; RV64-NEXT: # %bb.3: # %entry +; RV64-NEXT: addi a5, a2, 8 +; RV64-NEXT: bnez a0, .LBB7_7 +; RV64-NEXT: .LBB7_4: +; RV64-NEXT: addi a6, a1, 16 +; RV64-NEXT: ld a4, 0(a4) +; RV64-NEXT: ld a5, 0(a5) +; RV64-NEXT: ld a6, 0(a6) +; RV64-NEXT: bnez a0, .LBB7_8 +; RV64-NEXT: .LBB7_5: +; RV64-NEXT: addi a1, a1, 24 +; RV64-NEXT: ld a0, 0(a1) +; RV64-NEXT: sd a4, 0(a3) +; RV64-NEXT: sd a5, 8(a3) +; RV64-NEXT: sd a6, 16(a3) +; RV64-NEXT: sd a0, 24(a3) +; RV64-NEXT: ret +; RV64-NEXT: .LBB7_6: +; RV64-NEXT: addi a5, a1, 8 +; RV64-NEXT: beqz a0, .LBB7_4 +; RV64-NEXT: .LBB7_7: # %entry +; RV64-NEXT: addi a6, a2, 16 +; RV64-NEXT: ld a4, 0(a4) +; RV64-NEXT: ld a5, 0(a5) +; RV64-NEXT: ld a6, 0(a6) +; RV64-NEXT: beqz a0, .LBB7_5 +; RV64-NEXT: .LBB7_8: # %entry +; RV64-NEXT: addi a1, a2, 24 +; RV64-NEXT: ld a0, 0(a1) +; RV64-NEXT: sd a4, 0(a3) +; RV64-NEXT: sd a5, 8(a3) +; RV64-NEXT: sd a6, 16(a3) +; RV64-NEXT: sd a0, 24(a3) +; RV64-NEXT: ret +; +; RV32-LABEL: cmovznz4_fstar_original: +; RV32: # %bb.0: # %entry +; RV32-NEXT: or a0, a0, a1 +; RV32-NEXT: mv a1, a2 +; RV32-NEXT: beqz a0, .LBB7_2 +; RV32-NEXT: # %bb.1: # %entry +; RV32-NEXT: mv a1, a3 +; RV32-NEXT: .LBB7_2: # %entry +; RV32-NEXT: beqz a0, .LBB7_5 +; RV32-NEXT: # %bb.3: # %entry +; RV32-NEXT: addi a5, a3, 8 +; RV32-NEXT: bnez a0, .LBB7_6 +; RV32-NEXT: .LBB7_4: +; RV32-NEXT: addi t0, a2, 16 +; RV32-NEXT: j .LBB7_7 +; RV32-NEXT: .LBB7_5: +; RV32-NEXT: addi a5, a2, 8 +; RV32-NEXT: beqz a0, .LBB7_4 +; RV32-NEXT: .LBB7_6: # %entry +; RV32-NEXT: addi t0, a3, 16 +; RV32-NEXT: .LBB7_7: # %entry +; RV32-NEXT: lw a6, 0(a1) +; RV32-NEXT: lw a1, 4(a1) +; RV32-NEXT: lw a7, 0(a5) +; RV32-NEXT: lw a5, 4(a5) +; RV32-NEXT: lw t1, 0(t0) +; RV32-NEXT: lw t0, 4(t0) +; RV32-NEXT: beqz a0, .LBB7_9 +; RV32-NEXT: # %bb.8: # %entry +; RV32-NEXT: addi a2, a3, 24 +; RV32-NEXT: j .LBB7_10 +; RV32-NEXT: .LBB7_9: +; RV32-NEXT: addi a2, a2, 24 +; RV32-NEXT: .LBB7_10: # %entry +; RV32-NEXT: lw a0, 0(a2) +; RV32-NEXT: lw a2, 4(a2) +; RV32-NEXT: sw a6, 0(a4) +; RV32-NEXT: sw a1, 4(a4) +; RV32-NEXT: sw a7, 8(a4) +; RV32-NEXT: sw a5, 12(a4) +; RV32-NEXT: sw t1, 16(a4) +; RV32-NEXT: sw t0, 20(a4) +; RV32-NEXT: sw a0, 24(a4) +; RV32-NEXT: sw a2, 28(a4) +; RV32-NEXT: ret +entry: + %.not.i = icmp eq i64 %cin, 0 + %0 = load i64, ptr %y, align 8 + %1 = load i64, ptr %x, align 8 + %or = select 
i1 %.not.i, i64 %1, i64 %0 + %arrayidx4 = getelementptr inbounds nuw i8, ptr %y, i64 8 + %2 = load i64, ptr %arrayidx4, align 8 + %arrayidx6 = getelementptr inbounds nuw i8, ptr %x, i64 8 + %3 = load i64, ptr %arrayidx6, align 8 + %or9 = select i1 %.not.i, i64 %3, i64 %2 + %arrayidx10 = getelementptr inbounds nuw i8, ptr %y, i64 16 + %4 = load i64, ptr %arrayidx10, align 8 + %arrayidx12 = getelementptr inbounds nuw i8, ptr %x, i64 16 + %5 = load i64, ptr %arrayidx12, align 8 + %or15 = select i1 %.not.i, i64 %5, i64 %4 + %arrayidx16 = getelementptr inbounds nuw i8, ptr %y, i64 24 + %6 = load i64, ptr %arrayidx16, align 8 + %arrayidx18 = getelementptr inbounds nuw i8, ptr %x, i64 24 + %7 = load i64, ptr %arrayidx18, align 8 + %or21 = select i1 %.not.i, i64 %7, i64 %6 + store i64 %or, ptr %r, align 8 + %arrayidx23 = getelementptr inbounds nuw i8, ptr %r, i64 8 + store i64 %or9, ptr %arrayidx23, align 8 + %arrayidx24 = getelementptr inbounds nuw i8, ptr %r, i64 16 + store i64 %or15, ptr %arrayidx24, align 8 + %arrayidx25 = getelementptr inbounds nuw i8, ptr %r, i64 24 + store i64 %or21, ptr %arrayidx25, align 8 + ret void +} + +define void @cmovznz4_builtin_ctselect(i64 %cin, ptr %x, ptr %y, ptr %r) { +; RV64-LABEL: cmovznz4_builtin_ctselect: +; RV64: # %bb.0: # %entry +; RV64-NEXT: snez a0, a0 +; RV64-NEXT: ld a4, 0(a1) +; RV64-NEXT: ld a5, 0(a2) +; RV64-NEXT: addi a0, a0, -1 +; RV64-NEXT: not a6, a0 +; RV64-NEXT: and a4, a0, a4 +; RV64-NEXT: and a5, a6, a5 +; RV64-NEXT: or a4, a4, a5 +; RV64-NEXT: sd a4, 0(a3) +; RV64-NEXT: ld a4, 8(a2) +; RV64-NEXT: ld a5, 8(a1) +; RV64-NEXT: and a4, a6, a4 +; RV64-NEXT: and a5, a0, a5 +; RV64-NEXT: or a4, a5, a4 +; RV64-NEXT: sd a4, 8(a3) +; RV64-NEXT: ld a4, 16(a2) +; RV64-NEXT: ld a5, 16(a1) +; RV64-NEXT: and a4, a6, a4 +; RV64-NEXT: and a5, a0, a5 +; RV64-NEXT: or a4, a5, a4 +; RV64-NEXT: sd a4, 16(a3) +; RV64-NEXT: ld a2, 24(a2) +; RV64-NEXT: ld a1, 24(a1) +; RV64-NEXT: and a2, a6, a2 +; RV64-NEXT: and a0, a0, a1 +; RV64-NEXT: or a0, a0, a2 +; RV64-NEXT: sd a0, 24(a3) +; RV64-NEXT: ret +; +; RV32-LABEL: cmovznz4_builtin_ctselect: +; RV32: # %bb.0: # %entry +; RV32-NEXT: or a0, a0, a1 +; RV32-NEXT: lw a1, 0(a2) +; RV32-NEXT: lw a5, 4(a2) +; RV32-NEXT: lw a6, 0(a3) +; RV32-NEXT: lw a7, 4(a3) +; RV32-NEXT: seqz t0, a0 +; RV32-NEXT: addi a0, t0, -1 +; RV32-NEXT: neg t0, t0 +; RV32-NEXT: and a6, a0, a6 +; RV32-NEXT: and a1, t0, a1 +; RV32-NEXT: and a7, a0, a7 +; RV32-NEXT: and a5, t0, a5 +; RV32-NEXT: or a1, a1, a6 +; RV32-NEXT: or a5, a5, a7 +; RV32-NEXT: sw a1, 0(a4) +; RV32-NEXT: sw a5, 4(a4) +; RV32-NEXT: lw a1, 8(a3) +; RV32-NEXT: lw a5, 8(a2) +; RV32-NEXT: lw a6, 12(a3) +; RV32-NEXT: lw a7, 12(a2) +; RV32-NEXT: and a1, a0, a1 +; RV32-NEXT: and a5, t0, a5 +; RV32-NEXT: and a6, a0, a6 +; RV32-NEXT: and a7, t0, a7 +; RV32-NEXT: or a1, a5, a1 +; RV32-NEXT: or a5, a7, a6 +; RV32-NEXT: sw a1, 8(a4) +; RV32-NEXT: sw a5, 12(a4) +; RV32-NEXT: lw a1, 16(a3) +; RV32-NEXT: lw a5, 16(a2) +; RV32-NEXT: lw a6, 20(a3) +; RV32-NEXT: lw a7, 20(a2) +; RV32-NEXT: and a1, a0, a1 +; RV32-NEXT: and a5, t0, a5 +; RV32-NEXT: and a6, a0, a6 +; RV32-NEXT: and a7, t0, a7 +; RV32-NEXT: or a1, a5, a1 +; RV32-NEXT: or a5, a7, a6 +; RV32-NEXT: sw a1, 16(a4) +; RV32-NEXT: sw a5, 20(a4) +; RV32-NEXT: lw a1, 24(a3) +; RV32-NEXT: lw a5, 24(a2) +; RV32-NEXT: lw a3, 28(a3) +; RV32-NEXT: lw a2, 28(a2) +; RV32-NEXT: and a1, a0, a1 +; RV32-NEXT: and a5, t0, a5 +; RV32-NEXT: and a0, a0, a3 +; RV32-NEXT: and a2, t0, a2 +; RV32-NEXT: or a1, a5, a1 +; RV32-NEXT: or a0, a2, a0 +; RV32-NEXT: sw 
a1, 24(a4) +; RV32-NEXT: sw a0, 28(a4) +; RV32-NEXT: ret +entry: + %cmp = icmp eq i64 %cin, 0 + %0 = load i64, ptr %x, align 8 + %1 = load i64, ptr %y, align 8 + %2 = tail call i64 @llvm.ct.select.i64(i1 %cmp, i64 %0, i64 %1) + store i64 %2, ptr %r, align 8 + %arrayidx4 = getelementptr inbounds nuw i8, ptr %x, i64 8 + %3 = load i64, ptr %arrayidx4, align 8 + %arrayidx5 = getelementptr inbounds nuw i8, ptr %y, i64 8 + %4 = load i64, ptr %arrayidx5, align 8 + %5 = tail call i64 @llvm.ct.select.i64(i1 %cmp, i64 %3, i64 %4) + %arrayidx6 = getelementptr inbounds nuw i8, ptr %r, i64 8 + store i64 %5, ptr %arrayidx6, align 8 + %arrayidx8 = getelementptr inbounds nuw i8, ptr %x, i64 16 + %6 = load i64, ptr %arrayidx8, align 8 + %arrayidx9 = getelementptr inbounds nuw i8, ptr %y, i64 16 + %7 = load i64, ptr %arrayidx9, align 8 + %8 = tail call i64 @llvm.ct.select.i64(i1 %cmp, i64 %6, i64 %7) + %arrayidx10 = getelementptr inbounds nuw i8, ptr %r, i64 16 + store i64 %8, ptr %arrayidx10, align 8 + %arrayidx12 = getelementptr inbounds nuw i8, ptr %x, i64 24 + %9 = load i64, ptr %arrayidx12, align 8 + %arrayidx13 = getelementptr inbounds nuw i8, ptr %y, i64 24 + %10 = load i64, ptr %arrayidx13, align 8 + %11 = tail call i64 @llvm.ct.select.i64(i1 %cmp, i64 %9, i64 %10) + %arrayidx14 = getelementptr inbounds nuw i8, ptr %r, i64 24 + store i64 %11, ptr %arrayidx14, align 8 + ret void +} + ; Declare the intrinsics declare i1 @llvm.ct.select.i1(i1, i1, i1) declare i32 @llvm.ct.select.i32(i1, i32, i32) diff --git a/llvm/test/CodeGen/WebAssembly/ctselect-fallback-edge-cases.ll b/llvm/test/CodeGen/WebAssembly/ctselect-fallback-edge-cases.ll index ce3099e593282..384ab5c174026 100644 --- a/llvm/test/CodeGen/WebAssembly/ctselect-fallback-edge-cases.ll +++ b/llvm/test/CodeGen/WebAssembly/ctselect-fallback-edge-cases.ll @@ -370,6 +370,297 @@ define i32 @test_ctselect_deeply_nested(i1 %c1, i1 %c2, i1 %c3, i1 %c4, i32 %a, ret i32 %sel4 } +; This test demonstrates the FStar cmovznz4 pattern using ct.select +; Based on https://godbolt.org/z/6Kb71Ks7z +; Shows that NoMerge flag prevents DAG optimization from introducing branches +define void @cmovznz4_fstar_original(i64 %cin, ptr %x, ptr %y, ptr %r) { +; W32-LABEL: cmovznz4_fstar_original: +; W32: .functype cmovznz4_fstar_original (i64, i32, i32, i32) -> () +; W32-NEXT: .local i32, i64, i64 +; W32-NEXT: # %bb.0: # %entry +; W32-NEXT: local.get 1 +; W32-NEXT: local.get 2 +; W32-NEXT: local.get 0 +; W32-NEXT: i64.eqz +; W32-NEXT: local.tee 4 +; W32-NEXT: i32.select +; W32-NEXT: i64.load 0 +; W32-NEXT: local.set 0 +; W32-NEXT: local.get 1 +; W32-NEXT: i32.const 8 +; W32-NEXT: i32.add +; W32-NEXT: local.get 2 +; W32-NEXT: i32.const 8 +; W32-NEXT: i32.add +; W32-NEXT: local.get 4 +; W32-NEXT: i32.select +; W32-NEXT: i64.load 0 +; W32-NEXT: local.set 5 +; W32-NEXT: local.get 1 +; W32-NEXT: i32.const 16 +; W32-NEXT: i32.add +; W32-NEXT: local.get 2 +; W32-NEXT: i32.const 16 +; W32-NEXT: i32.add +; W32-NEXT: local.get 4 +; W32-NEXT: i32.select +; W32-NEXT: i64.load 0 +; W32-NEXT: local.set 6 +; W32-NEXT: local.get 3 +; W32-NEXT: local.get 1 +; W32-NEXT: i32.const 24 +; W32-NEXT: i32.add +; W32-NEXT: local.get 2 +; W32-NEXT: i32.const 24 +; W32-NEXT: i32.add +; W32-NEXT: local.get 4 +; W32-NEXT: i32.select +; W32-NEXT: i64.load 0 +; W32-NEXT: i64.store 24 +; W32-NEXT: local.get 3 +; W32-NEXT: local.get 6 +; W32-NEXT: i64.store 16 +; W32-NEXT: local.get 3 +; W32-NEXT: local.get 5 +; W32-NEXT: i64.store 8 +; W32-NEXT: local.get 3 +; W32-NEXT: local.get 0 +; W32-NEXT: i64.store 0 
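+; NOTE (editorial, not from the original patch): the checks above show that for the plain IR `select` version, wasm happens to emit its branch-free i32.select over the two source addresses; generic `select` carries no such guarantee on other targets (the RISC-V copy of this test lowers the same IR through beqz/bnez branches). Only the @llvm.ct.select variant in cmovznz4_builtin_ctselect below is required to take the masked and/xor/or expansion.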
+; W32-NEXT: # fallthrough-return +; +; W64-LABEL: cmovznz4_fstar_original: +; W64: .functype cmovznz4_fstar_original (i64, i64, i64, i64) -> () +; W64-NEXT: .local i32, i64, i64 +; W64-NEXT: # %bb.0: # %entry +; W64-NEXT: local.get 1 +; W64-NEXT: local.get 2 +; W64-NEXT: local.get 0 +; W64-NEXT: i64.eqz +; W64-NEXT: local.tee 4 +; W64-NEXT: i64.select +; W64-NEXT: i64.load 0 +; W64-NEXT: local.set 0 +; W64-NEXT: local.get 1 +; W64-NEXT: i64.const 8 +; W64-NEXT: i64.add +; W64-NEXT: local.get 2 +; W64-NEXT: i64.const 8 +; W64-NEXT: i64.add +; W64-NEXT: local.get 4 +; W64-NEXT: i64.select +; W64-NEXT: i64.load 0 +; W64-NEXT: local.set 5 +; W64-NEXT: local.get 1 +; W64-NEXT: i64.const 16 +; W64-NEXT: i64.add +; W64-NEXT: local.get 2 +; W64-NEXT: i64.const 16 +; W64-NEXT: i64.add +; W64-NEXT: local.get 4 +; W64-NEXT: i64.select +; W64-NEXT: i64.load 0 +; W64-NEXT: local.set 6 +; W64-NEXT: local.get 3 +; W64-NEXT: local.get 1 +; W64-NEXT: i64.const 24 +; W64-NEXT: i64.add +; W64-NEXT: local.get 2 +; W64-NEXT: i64.const 24 +; W64-NEXT: i64.add +; W64-NEXT: local.get 4 +; W64-NEXT: i64.select +; W64-NEXT: i64.load 0 +; W64-NEXT: i64.store 24 +; W64-NEXT: local.get 3 +; W64-NEXT: local.get 6 +; W64-NEXT: i64.store 16 +; W64-NEXT: local.get 3 +; W64-NEXT: local.get 5 +; W64-NEXT: i64.store 8 +; W64-NEXT: local.get 3 +; W64-NEXT: local.get 0 +; W64-NEXT: i64.store 0 +; W64-NEXT: # fallthrough-return +entry: + %.not.i = icmp eq i64 %cin, 0 + %0 = load i64, ptr %y, align 8 + %1 = load i64, ptr %x, align 8 + %or = select i1 %.not.i, i64 %1, i64 %0 + %arrayidx4 = getelementptr inbounds nuw i8, ptr %y, i64 8 + %2 = load i64, ptr %arrayidx4, align 8 + %arrayidx6 = getelementptr inbounds nuw i8, ptr %x, i64 8 + %3 = load i64, ptr %arrayidx6, align 8 + %or9 = select i1 %.not.i, i64 %3, i64 %2 + %arrayidx10 = getelementptr inbounds nuw i8, ptr %y, i64 16 + %4 = load i64, ptr %arrayidx10, align 8 + %arrayidx12 = getelementptr inbounds nuw i8, ptr %x, i64 16 + %5 = load i64, ptr %arrayidx12, align 8 + %or15 = select i1 %.not.i, i64 %5, i64 %4 + %arrayidx16 = getelementptr inbounds nuw i8, ptr %y, i64 24 + %6 = load i64, ptr %arrayidx16, align 8 + %arrayidx18 = getelementptr inbounds nuw i8, ptr %x, i64 24 + %7 = load i64, ptr %arrayidx18, align 8 + %or21 = select i1 %.not.i, i64 %7, i64 %6 + store i64 %or, ptr %r, align 8 + %arrayidx23 = getelementptr inbounds nuw i8, ptr %r, i64 8 + store i64 %or9, ptr %arrayidx23, align 8 + %arrayidx24 = getelementptr inbounds nuw i8, ptr %r, i64 16 + store i64 %or15, ptr %arrayidx24, align 8 + %arrayidx25 = getelementptr inbounds nuw i8, ptr %r, i64 24 + store i64 %or21, ptr %arrayidx25, align 8 + ret void +} + +define void @cmovznz4_builtin_ctselect(i64 %cin, ptr %x, ptr %y, ptr %r) { +; W32-LABEL: cmovznz4_builtin_ctselect: +; W32: .functype cmovznz4_builtin_ctselect (i64, i32, i32, i32) -> () +; W32-NEXT: .local i64 +; W32-NEXT: # %bb.0: # %entry +; W32-NEXT: local.get 3 +; W32-NEXT: i64.const 0 +; W32-NEXT: local.get 0 +; W32-NEXT: i64.eqz +; W32-NEXT: i64.extend_i32_u +; W32-NEXT: i64.const 1 +; W32-NEXT: i64.and +; W32-NEXT: i64.sub +; W32-NEXT: local.tee 0 +; W32-NEXT: local.get 1 +; W32-NEXT: i64.load 0 +; W32-NEXT: i64.and +; W32-NEXT: local.get 0 +; W32-NEXT: i64.const -1 +; W32-NEXT: i64.xor +; W32-NEXT: local.tee 4 +; W32-NEXT: local.get 2 +; W32-NEXT: i64.load 0 +; W32-NEXT: i64.and +; W32-NEXT: i64.or +; W32-NEXT: i64.store 0 +; W32-NEXT: local.get 3 +; W32-NEXT: local.get 0 +; W32-NEXT: local.get 1 +; W32-NEXT: i64.load 8 +; W32-NEXT: i64.and +; W32-NEXT: 
local.get 4 +; W32-NEXT: local.get 2 +; W32-NEXT: i64.load 8 +; W32-NEXT: i64.and +; W32-NEXT: i64.or +; W32-NEXT: i64.store 8 +; W32-NEXT: local.get 3 +; W32-NEXT: local.get 0 +; W32-NEXT: local.get 1 +; W32-NEXT: i64.load 16 +; W32-NEXT: i64.and +; W32-NEXT: local.get 4 +; W32-NEXT: local.get 2 +; W32-NEXT: i64.load 16 +; W32-NEXT: i64.and +; W32-NEXT: i64.or +; W32-NEXT: i64.store 16 +; W32-NEXT: local.get 3 +; W32-NEXT: local.get 0 +; W32-NEXT: local.get 1 +; W32-NEXT: i64.load 24 +; W32-NEXT: i64.and +; W32-NEXT: local.get 4 +; W32-NEXT: local.get 2 +; W32-NEXT: i64.load 24 +; W32-NEXT: i64.and +; W32-NEXT: i64.or +; W32-NEXT: i64.store 24 +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: cmovznz4_builtin_ctselect: +; W64: .functype cmovznz4_builtin_ctselect (i64, i64, i64, i64) -> () +; W64-NEXT: .local i64 +; W64-NEXT: # %bb.0: # %entry +; W64-NEXT: local.get 3 +; W64-NEXT: i64.const 0 +; W64-NEXT: local.get 0 +; W64-NEXT: i64.eqz +; W64-NEXT: i64.extend_i32_u +; W64-NEXT: i64.const 1 +; W64-NEXT: i64.and +; W64-NEXT: i64.sub +; W64-NEXT: local.tee 0 +; W64-NEXT: local.get 1 +; W64-NEXT: i64.load 0 +; W64-NEXT: i64.and +; W64-NEXT: local.get 0 +; W64-NEXT: i64.const -1 +; W64-NEXT: i64.xor +; W64-NEXT: local.tee 4 +; W64-NEXT: local.get 2 +; W64-NEXT: i64.load 0 +; W64-NEXT: i64.and +; W64-NEXT: i64.or +; W64-NEXT: i64.store 0 +; W64-NEXT: local.get 3 +; W64-NEXT: local.get 0 +; W64-NEXT: local.get 1 +; W64-NEXT: i64.load 8 +; W64-NEXT: i64.and +; W64-NEXT: local.get 4 +; W64-NEXT: local.get 2 +; W64-NEXT: i64.load 8 +; W64-NEXT: i64.and +; W64-NEXT: i64.or +; W64-NEXT: i64.store 8 +; W64-NEXT: local.get 3 +; W64-NEXT: local.get 0 +; W64-NEXT: local.get 1 +; W64-NEXT: i64.load 16 +; W64-NEXT: i64.and +; W64-NEXT: local.get 4 +; W64-NEXT: local.get 2 +; W64-NEXT: i64.load 16 +; W64-NEXT: i64.and +; W64-NEXT: i64.or +; W64-NEXT: i64.store 16 +; W64-NEXT: local.get 3 +; W64-NEXT: local.get 0 +; W64-NEXT: local.get 1 +; W64-NEXT: i64.load 24 +; W64-NEXT: i64.and +; W64-NEXT: local.get 4 +; W64-NEXT: local.get 2 +; W64-NEXT: i64.load 24 +; W64-NEXT: i64.and +; W64-NEXT: i64.or +; W64-NEXT: i64.store 24 +; W64-NEXT: # fallthrough-return +entry: + %cmp = icmp eq i64 %cin, 0 + %0 = load i64, ptr %x, align 8 + %1 = load i64, ptr %y, align 8 + %2 = tail call i64 @llvm.ct.select.i64(i1 %cmp, i64 %0, i64 %1) + store i64 %2, ptr %r, align 8 + %arrayidx4 = getelementptr inbounds nuw i8, ptr %x, i64 8 + %3 = load i64, ptr %arrayidx4, align 8 + %arrayidx5 = getelementptr inbounds nuw i8, ptr %y, i64 8 + %4 = load i64, ptr %arrayidx5, align 8 + %5 = tail call i64 @llvm.ct.select.i64(i1 %cmp, i64 %3, i64 %4) + %arrayidx6 = getelementptr inbounds nuw i8, ptr %r, i64 8 + store i64 %5, ptr %arrayidx6, align 8 + %arrayidx8 = getelementptr inbounds nuw i8, ptr %x, i64 16 + %6 = load i64, ptr %arrayidx8, align 8 + %arrayidx9 = getelementptr inbounds nuw i8, ptr %y, i64 16 + %7 = load i64, ptr %arrayidx9, align 8 + %8 = tail call i64 @llvm.ct.select.i64(i1 %cmp, i64 %6, i64 %7) + %arrayidx10 = getelementptr inbounds nuw i8, ptr %r, i64 16 + store i64 %8, ptr %arrayidx10, align 8 + %arrayidx12 = getelementptr inbounds nuw i8, ptr %x, i64 24 + %9 = load i64, ptr %arrayidx12, align 8 + %arrayidx13 = getelementptr inbounds nuw i8, ptr %y, i64 24 + %10 = load i64, ptr %arrayidx13, align 8 + %11 = tail call i64 @llvm.ct.select.i64(i1 %cmp, i64 %9, i64 %10) + %arrayidx14 = getelementptr inbounds nuw i8, ptr %r, i64 24 + store i64 %11, ptr %arrayidx14, align 8 + ret void +} + ; Declare the intrinsics declare i1 
@llvm.ct.select.i1(i1, i1, i1) declare i32 @llvm.ct.select.i32(i1, i32, i32) From 14b276859ebfc0a59aceb41b1c0906ed6ae6f82a Mon Sep 17 00:00:00 2001 From: wizardengineer Date: Tue, 19 Aug 2025 10:41:52 -0400 Subject: [PATCH 16/63] [CT] Update tests to reflect the current fallback changes, which were merged into our ct-llvmorg-20.1.4 branch --- .../Mips/ctselect-fallback-edge-cases.ll | 55 +++++++++---------- .../RISCV/ctselect-fallback-edge-cases.ll | 6 +- .../ctselect-fallback-edge-cases.ll | 12 ++-- 3 files changed, 33 insertions(+), 40 deletions(-) diff --git a/llvm/test/CodeGen/Mips/ctselect-fallback-edge-cases.ll b/llvm/test/CodeGen/Mips/ctselect-fallback-edge-cases.ll index 1e0478cabe744..afdb85ee69e16 100644 --- a/llvm/test/CodeGen/Mips/ctselect-fallback-edge-cases.ll +++ b/llvm/test/CodeGen/Mips/ctselect-fallback-edge-cases.ll @@ -334,9 +334,9 @@ define void @cmovznz4_builtin_ctselect(i64 %cin, ptr %x, ptr %y, ptr %r) { ; M32-NEXT: or $1, $4, $5 ; M32-NEXT: lw $3, 4($7) ; M32-NEXT: lw $4, 4($6) -; M32-NEXT: sltiu $1, $1, 1 -; M32-NEXT: addiu $2, $1, -1 -; M32-NEXT: negu $1, $1 +; M32-NEXT: sltu $1, $zero, $1 +; M32-NEXT: negu $2, $1 +; M32-NEXT: addiu $1, $1, -1 ; M32-NEXT: and $3, $2, $3 ; M32-NEXT: and $4, $1, $4 ; M32-NEXT: or $3, $4, $3 @@ -388,36 +388,33 @@ define void @cmovznz4_builtin_ctselect(i64 %cin, ptr %x, ptr %y, ptr %r) { ; ; M64-LABEL: cmovznz4_builtin_ctselect: ; M64: # %bb.0: # %entry -; M64-NEXT: sltiu $1, $4, 1 -; M64-NEXT: daddiu $3, $zero, -1 -; M64-NEXT: dsll $1, $1, 32 -; M64-NEXT: ld $2, 0($5) +; M64-NEXT: daddiu $2, $zero, -1 +; M64-NEXT: daddiu $1, $zero, -1 +; M64-NEXT: ld $3, 0($5) +; M64-NEXT: movn $2, $zero, $4 ; M64-NEXT: ld $4, 0($6) -; M64-NEXT: dsrl $1, $1, 32 -; M64-NEXT: andi $1, $1, 1 -; M64-NEXT: dnegu $1, $1 -; M64-NEXT: xor $3, $1, $3 -; M64-NEXT: and $2, $1, $2 -; M64-NEXT: and $4, $3, $4 -; M64-NEXT: or $2, $2, $4 -; M64-NEXT: sd $2, 0($7) -; M64-NEXT: ld $2, 8($6) -; M64-NEXT: ld $4, 8($5) -; M64-NEXT: and $2, $3, $2 +; M64-NEXT: xor $1, $2, $1 +; M64-NEXT: and $3, $2, $3 ; M64-NEXT: and $4, $1, $4 -; M64-NEXT: or $2, $4, $2 -; M64-NEXT: sd $2, 8($7) -; M64-NEXT: ld $2, 16($6) +; M64-NEXT: or $3, $3, $4 +; M64-NEXT: sd $3, 0($7) +; M64-NEXT: ld $3, 8($6) +; M64-NEXT: ld $4, 8($5) +; M64-NEXT: and $3, $1, $3 +; M64-NEXT: and $4, $2, $4 +; M64-NEXT: or $3, $4, $3 +; M64-NEXT: sd $3, 8($7) +; M64-NEXT: ld $3, 16($6) ; M64-NEXT: ld $4, 16($5) -; M64-NEXT: and $2, $3, $2 -; M64-NEXT: and $4, $1, $4 -; M64-NEXT: or $2, $4, $2 -; M64-NEXT: sd $2, 16($7) -; M64-NEXT: ld $2, 24($6) -; M64-NEXT: and $2, $3, $2 -; M64-NEXT: ld $3, 24($5) +; M64-NEXT: and $3, $1, $3 +; M64-NEXT: and $4, $2, $4 +; M64-NEXT: or $3, $4, $3 +; M64-NEXT: sd $3, 16($7) +; M64-NEXT: ld $3, 24($6) ; M64-NEXT: and $1, $1, $3 -; M64-NEXT: or $1, $1, $2 +; M64-NEXT: ld $3, 24($5) +; M64-NEXT: and $2, $2, $3 +; M64-NEXT: or $1, $2, $1 ; M64-NEXT: jr $ra ; M64-NEXT: sd $1, 24($7) entry: diff --git a/llvm/test/CodeGen/RISCV/ctselect-fallback-edge-cases.ll b/llvm/test/CodeGen/RISCV/ctselect-fallback-edge-cases.ll index a4e204d915334..7545a9c94f229 100644 --- a/llvm/test/CodeGen/RISCV/ctselect-fallback-edge-cases.ll +++ b/llvm/test/CodeGen/RISCV/ctselect-fallback-edge-cases.ll @@ -378,9 +378,9 @@ define void @cmovznz4_builtin_ctselect(i64 %cin, ptr %x, ptr %y, ptr %r) { ; RV32-NEXT: lw a5, 4(a2) ; RV32-NEXT: lw a6, 0(a3) ; RV32-NEXT: lw a7, 4(a3) -; RV32-NEXT: seqz t0, a0 -; RV32-NEXT: addi a0, t0, -1 -; RV32-NEXT: neg t0, t0 +; RV32-NEXT: snez t0, a0 +; RV32-NEXT: neg a0, t0 +;
RV32-NEXT: addi t0, t0, -1 ; RV32-NEXT: and a6, a0, a6 ; RV32-NEXT: and a1, t0, a1 ; RV32-NEXT: and a7, a0, a7 diff --git a/llvm/test/CodeGen/WebAssembly/ctselect-fallback-edge-cases.ll b/llvm/test/CodeGen/WebAssembly/ctselect-fallback-edge-cases.ll index 384ab5c174026..53b12e67a9204 100644 --- a/llvm/test/CodeGen/WebAssembly/ctselect-fallback-edge-cases.ll +++ b/llvm/test/CodeGen/WebAssembly/ctselect-fallback-edge-cases.ll @@ -517,13 +517,11 @@ define void @cmovznz4_builtin_ctselect(i64 %cin, ptr %x, ptr %y, ptr %r) { ; W32-NEXT: .local i64 ; W32-NEXT: # %bb.0: # %entry ; W32-NEXT: local.get 3 +; W32-NEXT: i64.const -1 ; W32-NEXT: i64.const 0 ; W32-NEXT: local.get 0 ; W32-NEXT: i64.eqz -; W32-NEXT: i64.extend_i32_u -; W32-NEXT: i64.const 1 -; W32-NEXT: i64.and -; W32-NEXT: i64.sub +; W32-NEXT: i64.select ; W32-NEXT: local.tee 0 ; W32-NEXT: local.get 1 ; W32-NEXT: i64.load 0 @@ -577,13 +575,11 @@ define void @cmovznz4_builtin_ctselect(i64 %cin, ptr %x, ptr %y, ptr %r) { ; W64-NEXT: .local i64 ; W64-NEXT: # %bb.0: # %entry ; W64-NEXT: local.get 3 +; W64-NEXT: i64.const -1 ; W64-NEXT: i64.const 0 ; W64-NEXT: local.get 0 ; W64-NEXT: i64.eqz -; W64-NEXT: i64.extend_i32_u -; W64-NEXT: i64.const 1 -; W64-NEXT: i64.and -; W64-NEXT: i64.sub +; W64-NEXT: i64.select ; W64-NEXT: local.tee 0 ; W64-NEXT: local.get 1 ; W64-NEXT: i64.load 0 From 28bf3742b1a48c6fdb7ae7e8d47b8afdfe2dce72 Mon Sep 17 00:00:00 2001 From: Francesco Bertolaccini Date: Tue, 19 Aug 2025 12:46:30 +0200 Subject: [PATCH 17/63] [CT] Lower CTSELECT in post-RA pass for AArch64 --- .../Target/AArch64/AArch64ISelLowering.cpp | 12 ------ llvm/lib/Target/AArch64/AArch64InstrInfo.cpp | 38 +++++++++++++++++-- llvm/lib/Target/AArch64/AArch64InstrInfo.td | 2 +- .../lib/Target/AArch64/AArch64MCInstLower.cpp | 18 +++++++++ 4 files changed, 53 insertions(+), 17 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 5e5fec19d5713..17e76a2945db3 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -3398,18 +3398,6 @@ MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter( return EmitEntryPStateSM(MI, BB); case AArch64::F128CSEL: return EmitF128CSEL(MI, BB); - case AArch64::I32CTSELECT: - return EmitCTSELECT(MI, BB, AArch64::CSELWr); - case AArch64::I64CTSELECT: - return EmitCTSELECT(MI, BB, AArch64::CSELXr); - case AArch64::BF16CTSELECT: - return EmitCTSELECT(MI, BB, AArch64::FCSELHrrr); - case AArch64::F16CTSELECT: - return EmitCTSELECT(MI, BB, AArch64::FCSELHrrr); - case AArch64::F32CTSELECT: - return EmitCTSELECT(MI, BB, AArch64::FCSELSrrr); - case AArch64::F64CTSELECT: - return EmitCTSELECT(MI, BB, AArch64::FCSELDrrr); case TargetOpcode::STATEPOINT: // STATEPOINT is a pseudo instruction which has no implicit defs/uses // while bl call instruction (where statepoint will be lowered at the end) diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index 12c600f0f2661..7b3fbc64ada36 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -2113,16 +2113,46 @@ bool AArch64InstrInfo::removeCmpToZeroOrOne( return true; } -bool AArch64InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { - if (MI.getOpcode() != TargetOpcode::LOAD_STACK_GUARD && - MI.getOpcode() != AArch64::CATCHRET) - return false; +static inline void expandCtSelect(MachineBasicBlock &MBB, MachineInstr &MI, DebugLoc &DL, const 
MCInstrDesc &MCID) { + MachineInstrBuilder Builder = BuildMI(MBB, MI, DL, MCID); + for (unsigned Idx = 0; Idx < MI.getNumOperands(); ++Idx) { + Builder.add(MI.getOperand(Idx)); + } + Builder->setFlag(MachineInstr::NoMerge); + MBB.remove_instr(&MI); +} +bool AArch64InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { MachineBasicBlock &MBB = *MI.getParent(); auto &Subtarget = MBB.getParent()->getSubtarget(); auto TRI = Subtarget.getRegisterInfo(); DebugLoc DL = MI.getDebugLoc(); + switch (MI.getOpcode()) { + case AArch64::I32CTSELECT: + expandCtSelect(MBB, MI, DL, get(AArch64::CSELWr)); + return true; + case AArch64::I64CTSELECT: + expandCtSelect(MBB, MI, DL, get(AArch64::CSELXr)); + return true; + case AArch64::BF16CTSELECT: + expandCtSelect(MBB, MI, DL, get(AArch64::FCSELHrrr)); + return true; + case AArch64::F16CTSELECT: + expandCtSelect(MBB, MI, DL, get(AArch64::FCSELHrrr)); + return true; + case AArch64::F32CTSELECT: + expandCtSelect(MBB, MI, DL, get(AArch64::FCSELSrrr)); + return true; + case AArch64::F64CTSELECT: + expandCtSelect(MBB, MI, DL, get(AArch64::FCSELDrrr)); + return true; + } + + if (MI.getOpcode() != TargetOpcode::LOAD_STACK_GUARD && + MI.getOpcode() != AArch64::CATCHRET) + return false; + if (MI.getOpcode() == AArch64::CATCHRET) { // Skip to the first instruction before the epilog. const TargetInstrInfo *TII = diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 48a847244b672..64de1674b494d 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -5693,7 +5693,7 @@ def F128CSEL : Pseudo<(outs FPR128:$Rd), // Constant-time conditional selection instructions //===----------------------------------------------------------------------===// -let hasSideEffects = 1, isPseudo = 1, hasNoSchedulingInfo = 1, usesCustomInserter = 1 in { +let hasSideEffects = 1, isPseudo = 1, hasNoSchedulingInfo = 1, Uses = [NZCV] in { def I32CTSELECT : Pseudo<(outs GPR32:$dst), (ins GPR32:$tval, GPR32:$fval, i32imm:$cc), [(set (i32 GPR32:$dst), diff --git a/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp b/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp index 39946633603f6..e2ec9118eb5ee 100644 --- a/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp +++ b/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp @@ -393,5 +393,23 @@ void AArch64MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const { OutMI.setOpcode(AArch64::RET); OutMI.addOperand(MCOperand::createReg(AArch64::LR)); break; + case AArch64::I32CTSELECT: + OutMI.setOpcode(AArch64::CSELWr); + break; + case AArch64::I64CTSELECT: + OutMI.setOpcode(AArch64::CSELXr); + break; + case AArch64::BF16CTSELECT: + OutMI.setOpcode(AArch64::FCSELHrrr); + break; + case AArch64::F16CTSELECT: + OutMI.setOpcode(AArch64::FCSELHrrr); + break; + case AArch64::F32CTSELECT: + OutMI.setOpcode(AArch64::FCSELSrrr); + break; + case AArch64::F64CTSELECT: + OutMI.setOpcode(AArch64::FCSELDrrr); + break; } } From 211f5677d796b4b5589258aa48c4405c05950215 Mon Sep 17 00:00:00 2001 From: wizardengineer Date: Tue, 19 Aug 2025 12:15:07 -0400 Subject: [PATCH 18/63] [CT] Added support for Scalable Vector types --- .../SelectionDAG/SelectionDAGBuilder.cpp | 34 +- .../RISCV/ctselect-fallback-vector-rvv.ll | 804 ++++++++++++++++++ 2 files changed, 825 insertions(+), 13 deletions(-) create mode 100644 llvm/test/CodeGen/RISCV/ctselect-fallback-vector-rvv.ll diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp 
b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 31b4b8b8ddde6..08852d5dc9b7a 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -6502,19 +6502,25 @@ SDValue SelectionDAGBuilder::createProtectedCtSelectFallback( EVT WorkingVT = VT; if (VT.isVector() && !Cond.getValueType().isVector()) { - unsigned NumElems = VT.getVectorNumElements(); + ElementCount NumElems = VT.getVectorElementCount(); EVT CondVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElems); - Cond = DAG.getSplatBuildVector(CondVT, DL, Cond); + + if (VT.isScalableVector()) { + Cond = DAG.getSplatVector(CondVT, DL, Cond); + } else { + Cond = DAG.getSplatBuildVector(CondVT, DL, Cond); + } } if (VT.isFloatingPoint()) { if (VT.isVector()) { // float vector -> int vector EVT ElemVT = VT.getVectorElementType(); - unsigned int ElemBitWidth = ElemVT.getSizeInBits(); + unsigned int ElemBitWidth = ElemVT.getScalarSizeInBits(); EVT IntElemVT = EVT::getIntegerVT(*DAG.getContext(), ElemBitWidth); + WorkingVT = EVT::getVectorVT(*DAG.getContext(), IntElemVT, - VT.getVectorNumElements()); + VT.getVectorElementCount()); } else { WorkingVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits()); } @@ -6525,7 +6531,17 @@ SDValue SelectionDAGBuilder::createProtectedCtSelectFallback( SDValue Mask = DAG.getSExtOrTrunc(Cond, DL, WorkingVT); - SDValue AllOnes = DAG.getAllOnesConstant(DL, WorkingVT); + SDValue AllOnes; + if (WorkingVT.isScalableVector()) { + unsigned BitWidth = WorkingVT.getScalarSizeInBits(); + APInt AllOnesVal = APInt::getAllOnes(BitWidth); + SDValue ScalarAllOnes = + DAG.getConstant(AllOnesVal, DL, WorkingVT.getScalarType()); + AllOnes = DAG.getSplatVector(WorkingVT, DL, ScalarAllOnes); + } else { + AllOnes = DAG.getAllOnesConstant(DL, WorkingVT); + } + SDValue Invert = DAG.getNode(ISD::XOR, DL, WorkingVT, Mask, AllOnes, ProtectedFlag); // (or (and WorkingT, Mask), (and F, ~Mask)) @@ -6759,14 +6775,6 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, } } - // We don't support scalable vector types yet, for now it'll only be - // fix-width vector - // TODO: Add support for scalable vectors - if (VT.isScalableVector()) { - report_fatal_error( - "llvm.ct.select: fallback doesn't supports scalable vectors"); - } - setValue(&I, createProtectedCtSelectFallback(DAG, DL, Cond, A, B, VT)); return; } diff --git a/llvm/test/CodeGen/RISCV/ctselect-fallback-vector-rvv.ll b/llvm/test/CodeGen/RISCV/ctselect-fallback-vector-rvv.ll new file mode 100644 index 0000000000000..014d95c3883b9 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/ctselect-fallback-vector-rvv.ll @@ -0,0 +1,804 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=riscv64 -mattr=+v -O3 | FileCheck %s --check-prefix=RV64 +; RUN: llc < %s -mtriple=riscv32 -mattr=+v -O3 | FileCheck %s --check-prefix=RV32 +; RUN: llc < %s -mtriple=riscv32 -mattr=+v,+zvl128b -O3 | FileCheck %s --check-prefix=RV32-V128 +; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+zvl256b -O3 | FileCheck %s --check-prefix=RV64-V256 + + +; Basic pass-through select on nxv4i32 +define <vscale x 4 x i32> @ctsel_nxv4i32_basic(i1 %cond, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b) { +; RV64-LABEL: ctsel_nxv4i32_basic: +; RV64: # %bb.0: +; RV64-NEXT: andi a0, a0, 1 +; RV64-NEXT: vsetvli a1, zero, e8, mf2, ta, ma +; RV64-NEXT: vmv.v.x v12, a0 +; RV64-NEXT: vmsne.vi v0, v12, 0 +; RV64-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV64-NEXT: vmv.v.i v12, 0 +; RV64-NEXT: vmerge.vim v12, v12, -1, v0 +; RV64-NEXT: vand.vv v8, v12, v8 +; RV64-NEXT: vnot.v v12, v12 +; RV64-NEXT: vand.vv v10, v12, v10 +; RV64-NEXT: vor.vv v8, v8, v10 +; RV64-NEXT: ret +; +; RV32-LABEL: ctsel_nxv4i32_basic: +; RV32: # %bb.0: +; RV32-NEXT: andi a0, a0, 1 +; RV32-NEXT: vsetvli a1, zero, e8, mf2, ta, ma +; RV32-NEXT: vmv.v.x v12, a0 +; RV32-NEXT: vmsne.vi v0, v12, 0 +; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV32-NEXT: vmv.v.i v12, 0 +; RV32-NEXT: vmerge.vim v12, v12, -1, v0 +; RV32-NEXT: vand.vv v8, v12, v8 +; RV32-NEXT: vnot.v v12, v12 +; RV32-NEXT: vand.vv v10, v12, v10 +; RV32-NEXT: vor.vv v8, v8, v10 +; RV32-NEXT: ret +; +; RV32-V128-LABEL: ctsel_nxv4i32_basic: +; RV32-V128: # %bb.0: +; RV32-V128-NEXT: andi a0, a0, 1 +; RV32-V128-NEXT: vsetvli a1, zero, e8, mf2, ta, ma +; RV32-V128-NEXT: vmv.v.x v12, a0 +; RV32-V128-NEXT: vmsne.vi v0, v12, 0 +; RV32-V128-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV32-V128-NEXT: vmv.v.i v12, 0 +; RV32-V128-NEXT: vmerge.vim v12, v12, -1, v0 +; RV32-V128-NEXT: vand.vv v8, v12, v8 +; RV32-V128-NEXT: vnot.v v12, v12 +; RV32-V128-NEXT: vand.vv v10, v12, v10 +; RV32-V128-NEXT: vor.vv v8, v8, v10 +; RV32-V128-NEXT: ret +; +; RV64-V256-LABEL: ctsel_nxv4i32_basic: +; RV64-V256: # %bb.0: +; RV64-V256-NEXT: andi a0, a0, 1 +; RV64-V256-NEXT: vsetvli a1, zero, e8, mf2, ta, ma +; RV64-V256-NEXT: vmv.v.x v12, a0 +; RV64-V256-NEXT: vmsne.vi v0, v12, 0 +; RV64-V256-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV64-V256-NEXT: vmv.v.i v12, 0 +; RV64-V256-NEXT: vmerge.vim v12, v12, -1, v0 +; RV64-V256-NEXT: vand.vv v8, v12, v8 +; RV64-V256-NEXT: vnot.v v12, v12 +; RV64-V256-NEXT: vand.vv v10, v12, v10 +; RV64-V256-NEXT: vor.vv v8, v8, v10 +; RV64-V256-NEXT: ret + %r = call <vscale x 4 x i32> @llvm.ct.select.nxv4i32(i1 %cond, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b) + ret <vscale x 4 x i32> %r +} + +; Select with loads (aligned) +define <vscale x 4 x i32> @ctsel_nxv4i32_load(i1 %cond, ptr %p1, ptr %p2) { +; RV64-LABEL: ctsel_nxv4i32_load: +; RV64: # %bb.0: +; RV64-NEXT: vl2re32.v v8, (a1) +; RV64-NEXT: vl2re32.v v10, (a2) +; RV64-NEXT: andi a0, a0, 1 +; RV64-NEXT: vsetvli a1, zero, e8, mf2, ta, ma +; RV64-NEXT: vmv.v.x v12, a0 +; RV64-NEXT: vmsne.vi v0, v12, 0 +; RV64-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV64-NEXT: vmv.v.i v12, 0 +; RV64-NEXT: vmerge.vim v12, v12, -1, v0 +; RV64-NEXT: vand.vv v8, v12, v8 +; RV64-NEXT: vnot.v v12, v12 +; RV64-NEXT: vand.vv v10, v12, v10 +; RV64-NEXT: vor.vv v8, v8, v10 +; RV64-NEXT: ret +; +; RV32-LABEL: ctsel_nxv4i32_load: +; RV32: # %bb.0: +; RV32-NEXT: vl2re32.v v8, (a1) +; RV32-NEXT: vl2re32.v v10, (a2) +; RV32-NEXT: andi a0, a0, 1 +; RV32-NEXT: vsetvli a1, zero, e8, mf2, ta, ma +; RV32-NEXT: vmv.v.x v12, a0 +; RV32-NEXT: vmsne.vi v0, v12, 0 +; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV32-NEXT: vmv.v.i v12, 0 +; RV32-NEXT: vmerge.vim v12, v12, -1, v0 +; RV32-NEXT: vand.vv v8, v12, v8 +; RV32-NEXT: vnot.v v12, v12 +; RV32-NEXT: vand.vv v10, v12, v10 +; RV32-NEXT: vor.vv v8, v8, v10 +; RV32-NEXT: ret +; +; RV32-V128-LABEL: ctsel_nxv4i32_load: +; RV32-V128: # %bb.0: +; RV32-V128-NEXT: vl2re32.v v8, (a1) +; RV32-V128-NEXT: vl2re32.v v10, (a2) +; RV32-V128-NEXT: andi a0, a0, 1 +; RV32-V128-NEXT: vsetvli a1, zero, e8, mf2, ta, ma +; RV32-V128-NEXT: vmv.v.x v12, a0 +; RV32-V128-NEXT: vmsne.vi v0, v12, 0 +; RV32-V128-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV32-V128-NEXT: vmv.v.i v12, 0 +; RV32-V128-NEXT: vmerge.vim v12, v12, -1, v0 +; RV32-V128-NEXT: vand.vv v8, v12, v8 +; RV32-V128-NEXT: vnot.v v12, v12 +; RV32-V128-NEXT: vand.vv v10, v12, v10 +; RV32-V128-NEXT: vor.vv v8, v8, v10 +; RV32-V128-NEXT: ret +; +; RV64-V256-LABEL: ctsel_nxv4i32_load: +; RV64-V256: # %bb.0: +; RV64-V256-NEXT: vl2re32.v v8, (a1) +; RV64-V256-NEXT: vl2re32.v v10, (a2) +; RV64-V256-NEXT: andi a0, a0, 1 +; RV64-V256-NEXT: vsetvli a1, zero, e8, mf2, ta, ma +; RV64-V256-NEXT: vmv.v.x v12, a0 +; RV64-V256-NEXT: vmsne.vi v0, v12, 0 +; RV64-V256-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV64-V256-NEXT: vmv.v.i v12, 0 +; RV64-V256-NEXT: vmerge.vim v12, v12, -1, v0 +; RV64-V256-NEXT: vand.vv v8, v12, v8 +; RV64-V256-NEXT: vnot.v v12, v12 +; RV64-V256-NEXT: vand.vv v10, v12, v10 +; RV64-V256-NEXT: vor.vv v8, v8, v10 +; RV64-V256-NEXT: ret + %a = load <vscale x 4 x i32>, ptr %p1, align 16 + %b = load <vscale x 4 x i32>, ptr %p2, align 16 + %r = call <vscale x 4 x i32> @llvm.ct.select.nxv4i32(i1 %cond, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b) + ret <vscale x 4 x i32> %r +} + +; Mixed: do arithmetic first, then select, then store +define void @ctsel_nxv4i32_mixed(i1 %cond, ptr %p1, ptr %p2, ptr %out) { +; RV64-LABEL: ctsel_nxv4i32_mixed: +; RV64: # %bb.0: +; RV64-NEXT: vl2re32.v v8, (a1) +; RV64-NEXT: vl2re32.v v10, (a2) +; RV64-NEXT: andi a0, a0, 1 +; RV64-NEXT: vsetvli a1, zero, e8, mf2, ta, ma +; RV64-NEXT: vmv.v.x v12, a0 +; RV64-NEXT: vmsne.vi v0, v12, 0 +; RV64-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV64-NEXT: vmv.v.i v12, 0 +; RV64-NEXT: vmerge.vim v12, v12, -1, v0 +; RV64-NEXT: vadd.vv v8, v8, v8 +; RV64-NEXT: vadd.vv v10, v10, v10 +; RV64-NEXT: vand.vv v8, v12, v8 +; RV64-NEXT: vnot.v v12, v12 +; RV64-NEXT: vand.vv v10, v12, v10 +; RV64-NEXT: vor.vv v8, v8, v10 +; RV64-NEXT: vs2r.v v8, (a3) +; RV64-NEXT: ret +; +; RV32-LABEL: ctsel_nxv4i32_mixed: +; RV32: # %bb.0: +; RV32-NEXT: vl2re32.v v8, (a1) +; RV32-NEXT: vl2re32.v v10, (a2) +; RV32-NEXT: andi a0, a0, 1 +; RV32-NEXT: vsetvli a1, zero, e8, mf2, ta, ma +; RV32-NEXT: vmv.v.x v12, a0 +; RV32-NEXT: vmsne.vi v0, v12, 0 +; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV32-NEXT: vmv.v.i v12, 0 +; RV32-NEXT: vmerge.vim v12, v12, -1, v0 +; RV32-NEXT: vadd.vv v8, v8, v8 +; RV32-NEXT: vadd.vv v10, v10, v10 +; RV32-NEXT: vand.vv v8, v12, v8 +; RV32-NEXT: vnot.v v12, v12 +; RV32-NEXT: vand.vv v10, v12, v10 +; RV32-NEXT: vor.vv v8, v8, v10 +; RV32-NEXT: vs2r.v v8, (a3) +; RV32-NEXT: ret +; +; RV32-V128-LABEL: ctsel_nxv4i32_mixed: +; RV32-V128: # %bb.0: +; RV32-V128-NEXT: vl2re32.v v8, (a1) +; RV32-V128-NEXT: vl2re32.v v10, (a2) +; RV32-V128-NEXT: andi a0, a0, 1 +; RV32-V128-NEXT: vsetvli a1, zero, e8, mf2, ta, ma +; RV32-V128-NEXT: vmv.v.x v12, a0 +; RV32-V128-NEXT: vmsne.vi v0, v12, 0 +; RV32-V128-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV32-V128-NEXT: vmv.v.i v12, 0 +; RV32-V128-NEXT: vmerge.vim v12, v12, -1, v0 +; RV32-V128-NEXT: vadd.vv v8, v8, v8 +; RV32-V128-NEXT: vadd.vv v10, v10, v10 +; RV32-V128-NEXT: vand.vv v8, v12, v8 +; RV32-V128-NEXT: vnot.v v12, v12 +; RV32-V128-NEXT: vand.vv v10, v12, v10 +; RV32-V128-NEXT: vor.vv v8, v8, v10 +; RV32-V128-NEXT: vs2r.v v8, (a3) +; RV32-V128-NEXT: ret +; +; RV64-V256-LABEL: ctsel_nxv4i32_mixed: +; RV64-V256: # %bb.0: +; RV64-V256-NEXT: vl2re32.v v8, (a1) +; RV64-V256-NEXT: vl2re32.v v10, (a2) +; RV64-V256-NEXT: andi a0, a0, 1 +; RV64-V256-NEXT: vsetvli a1, zero, e8, mf2, ta, ma +; RV64-V256-NEXT: vmv.v.x v12, a0 +; RV64-V256-NEXT: vmsne.vi v0, v12, 0 +; RV64-V256-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV64-V256-NEXT: vmv.v.i v12, 0 +; RV64-V256-NEXT: vmerge.vim v12, v12, -1, v0 +; RV64-V256-NEXT: vadd.vv v8, v8, v8 +; RV64-V256-NEXT: vadd.vv v10, v10, v10 +; RV64-V256-NEXT: vand.vv v8, v12, v8 +; RV64-V256-NEXT: vnot.v v12, v12 +; RV64-V256-NEXT: vand.vv v10, v12, v10 +; RV64-V256-NEXT: vor.vv v8, v8, v10 +; RV64-V256-NEXT: vs2r.v v8, (a3) +; RV64-V256-NEXT: ret + %a = load <vscale x 4 x i32>, ptr %p1, align 16 + %b = load <vscale x 4 x i32>, ptr %p2, align 16 + ; avoid scalable vector constants: use %a+%a and %b+%b + %a2 = add <vscale x 4 x i32> %a, %a + %b2 = add <vscale x 4 x i32> %b, %b + %r = call <vscale x 4 x i32> @llvm.ct.select.nxv4i32(i1 %cond, <vscale x 4 x i32> %a2, <vscale x 4 x i32> %b2) + store <vscale x 4 x i32> %r, ptr %out, align 16 + ret void +} + +; Const-true/false fold smoke tests +define <vscale x 4 x i32> @ctsel_nxv4i32_true(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) { +; RV64-LABEL: ctsel_nxv4i32_true: +; RV64: # %bb.0: +; RV64-NEXT: ret +; +; RV32-LABEL: ctsel_nxv4i32_true: +; RV32: # %bb.0: +; RV32-NEXT: ret +; +; RV32-V128-LABEL: ctsel_nxv4i32_true: +; RV32-V128: # %bb.0: +; RV32-V128-NEXT: ret +; +; RV64-V256-LABEL: ctsel_nxv4i32_true: +; RV64-V256: # %bb.0: +; RV64-V256-NEXT: ret + %r = call <vscale x 4 x i32> @llvm.ct.select.nxv4i32(i1 true, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b) + ret <vscale x 4 x i32> %r +} + +define <vscale x 4 x i32> @ctsel_nxv4i32_false(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) { +; RV64-LABEL: ctsel_nxv4i32_false: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; RV64-NEXT: vmv2r.v v8, v10 +; RV64-NEXT: ret +; +; RV32-LABEL: ctsel_nxv4i32_false: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; RV32-NEXT: vmv2r.v v8, v10 +; RV32-NEXT: ret +; +; RV32-V128-LABEL: ctsel_nxv4i32_false: +; RV32-V128: # %bb.0: +; RV32-V128-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; RV32-V128-NEXT: vmv2r.v v8, v10 +; RV32-V128-NEXT: ret +; +; RV64-V256-LABEL: ctsel_nxv4i32_false: +; RV64-V256: # %bb.0: +; RV64-V256-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; RV64-V256-NEXT: vmv2r.v v8, v10 +; RV64-V256-NEXT: ret + %r = call <vscale x 4 x i32> @llvm.ct.select.nxv4i32(i1 false, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b) + ret <vscale x 4 x i32> %r +} + +; Chain two selects to ensure masks don't get merged away +define <vscale x 4 x i32> @ctsel_nxv4i32_chain(i1 %c1, i1 %c2, +; RV64-LABEL: ctsel_nxv4i32_chain: +; RV64: # %bb.0: +; RV64-NEXT: andi a0, a0, 1 +; RV64-NEXT: vsetvli a2, zero, e32, m2, ta, ma +; RV64-NEXT: vmv.v.i v14, 0 +; RV64-NEXT: andi a1, a1, 1 +; RV64-NEXT: vsetvli zero, zero, e8, mf2, ta, ma +; RV64-NEXT: vmv.v.x v16, a0 +; RV64-NEXT: vmsne.vi v0, v16, 0 +; RV64-NEXT: vmv.v.x v16, a1 +; RV64-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV64-NEXT: vmerge.vim v18, v14, -1, v0 +; RV64-NEXT: vsetvli zero, zero, e8, mf2, ta, ma +; RV64-NEXT: vmsne.vi v0, v16, 0 +; RV64-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV64-NEXT: vmerge.vim v14, v14, -1, v0 +; RV64-NEXT: vand.vv v8, v18, v8 +; RV64-NEXT: vnot.v v16, v18 +; RV64-NEXT: vand.vv v10, v16, v10 +; RV64-NEXT: vnot.v v16, v14 +; RV64-NEXT: vor.vv v8, v8, v10 +; RV64-NEXT: vand.vv v8, v14, v8 +; RV64-NEXT: vand.vv v10, v16, v12 +; RV64-NEXT: vor.vv v8, v8, v10 +; RV64-NEXT: ret +; +; RV32-LABEL: ctsel_nxv4i32_chain: +; RV32: # %bb.0: +; RV32-NEXT: andi a0, a0, 1 +; RV32-NEXT: vsetvli a2, zero, e32, m2, ta, ma +; RV32-NEXT: vmv.v.i v14, 0 +; RV32-NEXT: andi a1, a1, 1 +; RV32-NEXT: vsetvli zero, zero, e8, mf2, ta, ma +; RV32-NEXT: vmv.v.x v16, a0 +; RV32-NEXT: vmsne.vi v0, v16, 0 +; RV32-NEXT: vmv.v.x v16, a1 +; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV32-NEXT: vmerge.vim v18, v14, -1, v0 +; RV32-NEXT: vsetvli zero, zero, e8, mf2, ta, ma +; RV32-NEXT: vmsne.vi v0, v16, 0 +; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV32-NEXT: vmerge.vim v14, v14, -1, v0 +; RV32-NEXT: vand.vv v8, v18, v8 +; RV32-NEXT: vnot.v v16, v18 +; RV32-NEXT: vand.vv v10, v16, v10 +; RV32-NEXT: vnot.v v16, v14 +; RV32-NEXT: vor.vv v8, v8, v10 +; RV32-NEXT: vand.vv v8, v14, v8 +; RV32-NEXT: vand.vv v10, v16, v12 +; RV32-NEXT: vor.vv v8, v8, v10 +; RV32-NEXT: ret +; +; RV32-V128-LABEL: ctsel_nxv4i32_chain: +; RV32-V128: # %bb.0: +; RV32-V128-NEXT: andi a0, a0, 1 +; RV32-V128-NEXT: vsetvli a2, zero, e32, m2, ta, ma +; RV32-V128-NEXT: vmv.v.i v14, 0 +; RV32-V128-NEXT: andi a1, a1, 1 +; RV32-V128-NEXT: vsetvli zero, zero, e8, mf2, ta, ma +; RV32-V128-NEXT: vmv.v.x v16, a0 +; RV32-V128-NEXT: vmsne.vi v0, v16, 0 +; RV32-V128-NEXT: vmv.v.x v16, a1 +; RV32-V128-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV32-V128-NEXT: vmerge.vim v18, v14, -1, v0 +; RV32-V128-NEXT: vsetvli zero, zero, e8, mf2, ta, ma +; RV32-V128-NEXT: vmsne.vi v0, v16, 0 +; RV32-V128-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV32-V128-NEXT: vmerge.vim v14, v14, -1, v0 +; RV32-V128-NEXT: vand.vv v8, v18, v8 +; RV32-V128-NEXT: vnot.v v16, v18 +; RV32-V128-NEXT: vand.vv v10, v16, v10 +; RV32-V128-NEXT: vnot.v v16, v14 +; RV32-V128-NEXT: vor.vv v8, v8, v10 +; RV32-V128-NEXT: vand.vv v8, v14, v8 +; RV32-V128-NEXT: vand.vv v10, v16, v12 +; RV32-V128-NEXT: vor.vv v8, v8, v10 +; RV32-V128-NEXT: ret +; +; RV64-V256-LABEL: ctsel_nxv4i32_chain: +; RV64-V256: # %bb.0: +; RV64-V256-NEXT: andi a0, a0, 1 +; RV64-V256-NEXT: vsetvli a2, zero, e32, m2, ta, ma +; RV64-V256-NEXT: vmv.v.i v14, 0 +; RV64-V256-NEXT: andi a1, a1, 1 +; RV64-V256-NEXT: vsetvli zero, zero, e8, mf2, ta, ma +; RV64-V256-NEXT: vmv.v.x v16, a0 +; RV64-V256-NEXT: vmsne.vi v0, v16, 0 +; RV64-V256-NEXT: vmv.v.x v16, a1 +; RV64-V256-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV64-V256-NEXT: vmerge.vim v18, v14, -1, v0 +; RV64-V256-NEXT: vsetvli zero, zero, e8, mf2, ta, ma +; RV64-V256-NEXT: vmsne.vi v0, v16, 0 +; RV64-V256-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV64-V256-NEXT: vmerge.vim v14, v14, -1, v0 +; RV64-V256-NEXT: vand.vv v8, v18, v8 +; RV64-V256-NEXT: vnot.v v16, v18 +; RV64-V256-NEXT: vand.vv v10, v16, v10 +; RV64-V256-NEXT: vnot.v v16, v14 +; RV64-V256-NEXT: vor.vv v8, v8, v10 +; RV64-V256-NEXT: vand.vv v8, v14, v8 +; RV64-V256-NEXT: vand.vv v10, v16, v12 +; RV64-V256-NEXT: vor.vv v8, v8, v10 +; RV64-V256-NEXT: ret + <vscale x 4 x i32> %a, + <vscale x 4 x i32> %b, + <vscale x 4 x i32> %c) { + %t = call <vscale x 4 x i32> @llvm.ct.select.nxv4i32(i1 %c1, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b) + %r = call <vscale x 4 x i32> @llvm.ct.select.nxv4i32(i1 %c2, <vscale x 4 x i32> %t, <vscale x 4 x i32> %c) + ret <vscale x 4 x i32> %r +} + +; A different element width +define <vscale x 8 x i16> @ctsel_nxv8i16_basic(i1 %cond, <vscale x 8 x i16> %a, <vscale x 8 x i16> %b) { +; RV64-LABEL: ctsel_nxv8i16_basic: +; RV64: # %bb.0: +; RV64-NEXT: andi a0, a0, 1 +; RV64-NEXT: vsetvli a1, zero, e8, m1, ta, ma +; RV64-NEXT: vmv.v.x v12, a0 +; RV64-NEXT: vmsne.vi v0, v12, 0 +; RV64-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; RV64-NEXT: vmv.v.i v12, 0 +; RV64-NEXT: vmerge.vim v12, v12, -1, v0 +; RV64-NEXT: vand.vv v8, v12, v8 +; RV64-NEXT: vnot.v v12, v12 +; RV64-NEXT: vand.vv v10, v12, v10 +; RV64-NEXT: vor.vv v8, v8, v10 +; RV64-NEXT: ret +; +; RV32-LABEL: ctsel_nxv8i16_basic: +; RV32: # %bb.0: +; RV32-NEXT: andi a0, a0, 1 +; RV32-NEXT: vsetvli a1, zero, e8, m1, ta, ma +; RV32-NEXT: vmv.v.x v12, a0 +; RV32-NEXT: vmsne.vi v0, v12, 0 +; RV32-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; RV32-NEXT: vmv.v.i v12, 0 +; RV32-NEXT: vmerge.vim v12, v12, -1, v0 +; RV32-NEXT: vand.vv v8, v12, v8 +; RV32-NEXT: vnot.v v12, v12 +; RV32-NEXT: vand.vv v10, v12, v10 +; RV32-NEXT: vor.vv v8, v8, v10 +; RV32-NEXT: ret +; +; RV32-V128-LABEL: ctsel_nxv8i16_basic: +; RV32-V128: # %bb.0: +; RV32-V128-NEXT: andi a0, a0, 1 +; RV32-V128-NEXT: vsetvli a1, zero, e8, m1, ta, ma +; RV32-V128-NEXT: vmv.v.x v12, a0 +; RV32-V128-NEXT: vmsne.vi v0, v12, 0 +; RV32-V128-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; RV32-V128-NEXT: vmv.v.i v12, 0 +; RV32-V128-NEXT: vmerge.vim v12, v12, -1, v0 +; RV32-V128-NEXT: vand.vv v8, v12, v8 +; RV32-V128-NEXT: vnot.v v12, v12 +; RV32-V128-NEXT: vand.vv v10, v12, v10 +; RV32-V128-NEXT: vor.vv v8, v8, v10 +; RV32-V128-NEXT: ret +; +; RV64-V256-LABEL: ctsel_nxv8i16_basic: +; RV64-V256: # %bb.0: +; RV64-V256-NEXT: andi a0, a0, 1 +; RV64-V256-NEXT: vsetvli a1, zero, e8, m1, ta, ma +; RV64-V256-NEXT: vmv.v.x v12, a0 +; RV64-V256-NEXT: vmsne.vi v0, v12, 0 +; RV64-V256-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; RV64-V256-NEXT: vmv.v.i v12, 0 +; RV64-V256-NEXT: vmerge.vim v12, v12, -1, v0 +; RV64-V256-NEXT: vand.vv v8, v12, v8 +; RV64-V256-NEXT: vnot.v v12, v12 +; RV64-V256-NEXT: vand.vv v10, v12, v10 +; RV64-V256-NEXT: vor.vv v8, v8, v10 +; RV64-V256-NEXT: ret + %r = call <vscale x 8 x i16> @llvm.ct.select.nxv8i16(i1 %cond, <vscale x 8 x i16> %a, <vscale x 8 x i16> %b) + ret <vscale x 8 x i16> %r +} + +define <vscale x 16 x i8> @ctsel_nxv16i8_basic(i1 %cond, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b) { +; RV64-LABEL: ctsel_nxv16i8_basic: +; RV64: # %bb.0: +; RV64-NEXT: andi a0, a0, 1 +; RV64-NEXT: vsetvli a1, zero, e8, m2, ta, ma +; RV64-NEXT: vmv.v.x v12, a0 +; RV64-NEXT: vmsne.vi v0, v12, 0 +; RV64-NEXT: vmv.v.i v12, 0 +; RV64-NEXT: vmerge.vim v12, v12, -1, v0 +; RV64-NEXT: vand.vv v8, v12, v8 +; RV64-NEXT: vnot.v v12, v12 +; RV64-NEXT: vand.vv v10, v12, v10 +; RV64-NEXT: vor.vv v8, v8, v10 +; RV64-NEXT: ret +; +; RV32-LABEL: ctsel_nxv16i8_basic: +; RV32: # %bb.0: +; RV32-NEXT: andi a0, a0, 1 +; RV32-NEXT: vsetvli a1, zero, e8, m2, ta, ma +; RV32-NEXT: vmv.v.x v12, a0 +; RV32-NEXT: vmsne.vi v0, v12, 0 +; RV32-NEXT: vmv.v.i v12, 0 +; RV32-NEXT: vmerge.vim v12, v12, -1, v0 +; RV32-NEXT: vand.vv v8, v12, v8 +; RV32-NEXT: vnot.v v12, v12 +; RV32-NEXT: vand.vv v10, v12, v10 +; RV32-NEXT: vor.vv v8, v8, v10 +; RV32-NEXT: ret +; +; RV32-V128-LABEL: ctsel_nxv16i8_basic: +; RV32-V128: # %bb.0: +; RV32-V128-NEXT: andi a0, a0, 1 +; RV32-V128-NEXT: vsetvli a1, zero, e8, m2, ta, ma +; RV32-V128-NEXT: vmv.v.x v12, a0 +; RV32-V128-NEXT: vmsne.vi v0, v12, 0 +; RV32-V128-NEXT: vmv.v.i v12, 0 +; RV32-V128-NEXT: vmerge.vim v12, v12, -1, v0 +; RV32-V128-NEXT: vand.vv v8, v12, v8 +; RV32-V128-NEXT: vnot.v v12, v12 +; RV32-V128-NEXT: vand.vv v10, v12, v10 +; RV32-V128-NEXT: vor.vv v8, v8, v10 +; RV32-V128-NEXT: ret +; +; RV64-V256-LABEL: ctsel_nxv16i8_basic: +; RV64-V256: # %bb.0: +; RV64-V256-NEXT: andi a0, a0, 1 +; RV64-V256-NEXT: vsetvli a1, zero, e8, m2, ta, ma +; RV64-V256-NEXT: vmv.v.x v12, a0 +; RV64-V256-NEXT: vmsne.vi v0, v12, 0 +; RV64-V256-NEXT: vmv.v.i v12, 0 +; RV64-V256-NEXT: vmerge.vim v12, v12, -1, v0 +; RV64-V256-NEXT: vand.vv v8, v12, v8 +; RV64-V256-NEXT: vnot.v v12, v12 +; RV64-V256-NEXT: vand.vv v10, v12, v10 +; RV64-V256-NEXT: vor.vv v8, v8, v10 +; RV64-V256-NEXT: ret + %r = call <vscale x 16 x i8> @llvm.ct.select.nxv16i8(i1 %cond, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b) + ret <vscale x 16 x i8> %r +} + +; 64-bit elements (useful on RV64) +define <vscale x 2 x i64> @ctsel_nxv2i64_basic(i1 %cond, <vscale x 2 x i64> %a, <vscale x 2 x i64> %b) { +; RV64-LABEL: ctsel_nxv2i64_basic: +; RV64: # %bb.0: +; RV64-NEXT: andi a0, a0, 1 +; RV64-NEXT: vsetvli a1, zero, e8, mf4, ta, ma +; RV64-NEXT: vmv.v.x v12, a0 +; RV64-NEXT: vmsne.vi v0, v12, 0 +; RV64-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RV64-NEXT: vmv.v.i v12, 0 +; RV64-NEXT: vmerge.vim v12, v12, -1, v0 +; RV64-NEXT: vand.vv v8, v12, v8 +; RV64-NEXT: vnot.v v12, v12 +; RV64-NEXT: vand.vv v10, v12, v10 +; RV64-NEXT: vor.vv v8, v8, v10 +; RV64-NEXT: ret +; +; RV32-LABEL: ctsel_nxv2i64_basic: +; RV32: # %bb.0: +; RV32-NEXT: andi a0, a0, 1 +; RV32-NEXT: vsetvli a1, zero, e8, mf4, ta, ma +; RV32-NEXT: vmv.v.x v12, a0 +; RV32-NEXT: vmsne.vi v0, v12, 0 +; RV32-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RV32-NEXT: vmv.v.i v12, 0 +; RV32-NEXT: vmerge.vim v12, v12, -1, v0 +; RV32-NEXT: vand.vv v8, v12, v8 +; RV32-NEXT: vnot.v v12, v12 +; RV32-NEXT: vand.vv v10, v12, v10 +; RV32-NEXT: vor.vv v8,
v8, v10 +; RV32-NEXT: ret +; +; RV32-V128-LABEL: ctsel_nxv2i64_basic: +; RV32-V128: # %bb.0: +; RV32-V128-NEXT: andi a0, a0, 1 +; RV32-V128-NEXT: vsetvli a1, zero, e8, mf4, ta, ma +; RV32-V128-NEXT: vmv.v.x v12, a0 +; RV32-V128-NEXT: vmsne.vi v0, v12, 0 +; RV32-V128-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RV32-V128-NEXT: vmv.v.i v12, 0 +; RV32-V128-NEXT: vmerge.vim v12, v12, -1, v0 +; RV32-V128-NEXT: vand.vv v8, v12, v8 +; RV32-V128-NEXT: vnot.v v12, v12 +; RV32-V128-NEXT: vand.vv v10, v12, v10 +; RV32-V128-NEXT: vor.vv v8, v8, v10 +; RV32-V128-NEXT: ret +; +; RV64-V256-LABEL: ctsel_nxv2i64_basic: +; RV64-V256: # %bb.0: +; RV64-V256-NEXT: andi a0, a0, 1 +; RV64-V256-NEXT: vsetvli a1, zero, e8, mf4, ta, ma +; RV64-V256-NEXT: vmv.v.x v12, a0 +; RV64-V256-NEXT: vmsne.vi v0, v12, 0 +; RV64-V256-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RV64-V256-NEXT: vmv.v.i v12, 0 +; RV64-V256-NEXT: vmerge.vim v12, v12, -1, v0 +; RV64-V256-NEXT: vand.vv v8, v12, v8 +; RV64-V256-NEXT: vnot.v v12, v12 +; RV64-V256-NEXT: vand.vv v10, v12, v10 +; RV64-V256-NEXT: vor.vv v8, v8, v10 +; RV64-V256-NEXT: ret + %r = call @llvm.ct.select.nxv2i64(i1 %cond, %a, %b) + ret %r +} + +; Floating-point scalable vectors (bitcasted in your fallback) +define @ctsel_nxv4f32_basic(i1 %cond, %a, %b) { +; RV64-LABEL: ctsel_nxv4f32_basic: +; RV64: # %bb.0: +; RV64-NEXT: andi a0, a0, 1 +; RV64-NEXT: vsetvli a1, zero, e8, mf2, ta, ma +; RV64-NEXT: vmv.v.x v12, a0 +; RV64-NEXT: vmsne.vi v0, v12, 0 +; RV64-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV64-NEXT: vmv.v.i v12, 0 +; RV64-NEXT: vmerge.vim v12, v12, -1, v0 +; RV64-NEXT: vand.vv v8, v12, v8 +; RV64-NEXT: vnot.v v12, v12 +; RV64-NEXT: vand.vv v10, v12, v10 +; RV64-NEXT: vor.vv v8, v8, v10 +; RV64-NEXT: ret +; +; RV32-LABEL: ctsel_nxv4f32_basic: +; RV32: # %bb.0: +; RV32-NEXT: andi a0, a0, 1 +; RV32-NEXT: vsetvli a1, zero, e8, mf2, ta, ma +; RV32-NEXT: vmv.v.x v12, a0 +; RV32-NEXT: vmsne.vi v0, v12, 0 +; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV32-NEXT: vmv.v.i v12, 0 +; RV32-NEXT: vmerge.vim v12, v12, -1, v0 +; RV32-NEXT: vand.vv v8, v12, v8 +; RV32-NEXT: vnot.v v12, v12 +; RV32-NEXT: vand.vv v10, v12, v10 +; RV32-NEXT: vor.vv v8, v8, v10 +; RV32-NEXT: ret +; +; RV32-V128-LABEL: ctsel_nxv4f32_basic: +; RV32-V128: # %bb.0: +; RV32-V128-NEXT: andi a0, a0, 1 +; RV32-V128-NEXT: vsetvli a1, zero, e8, mf2, ta, ma +; RV32-V128-NEXT: vmv.v.x v12, a0 +; RV32-V128-NEXT: vmsne.vi v0, v12, 0 +; RV32-V128-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV32-V128-NEXT: vmv.v.i v12, 0 +; RV32-V128-NEXT: vmerge.vim v12, v12, -1, v0 +; RV32-V128-NEXT: vand.vv v8, v12, v8 +; RV32-V128-NEXT: vnot.v v12, v12 +; RV32-V128-NEXT: vand.vv v10, v12, v10 +; RV32-V128-NEXT: vor.vv v8, v8, v10 +; RV32-V128-NEXT: ret +; +; RV64-V256-LABEL: ctsel_nxv4f32_basic: +; RV64-V256: # %bb.0: +; RV64-V256-NEXT: andi a0, a0, 1 +; RV64-V256-NEXT: vsetvli a1, zero, e8, mf2, ta, ma +; RV64-V256-NEXT: vmv.v.x v12, a0 +; RV64-V256-NEXT: vmsne.vi v0, v12, 0 +; RV64-V256-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV64-V256-NEXT: vmv.v.i v12, 0 +; RV64-V256-NEXT: vmerge.vim v12, v12, -1, v0 +; RV64-V256-NEXT: vand.vv v8, v12, v8 +; RV64-V256-NEXT: vnot.v v12, v12 +; RV64-V256-NEXT: vand.vv v10, v12, v10 +; RV64-V256-NEXT: vor.vv v8, v8, v10 +; RV64-V256-NEXT: ret + %r = call @llvm.ct.select.nxv4f32(i1 %cond, %a, %b) + ret %r +} + +; FP arithmetic around select +define @ctsel_nxv4f32_arith(i1 %cond, %x, %y) { +; RV64-LABEL: ctsel_nxv4f32_arith: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e32, m2, ta, ma 
+; RV64-NEXT: vfadd.vv v12, v8, v10 +; RV64-NEXT: vfsub.vv v8, v8, v10 +; RV64-NEXT: andi a0, a0, 1 +; RV64-NEXT: vsetvli zero, zero, e8, mf2, ta, ma +; RV64-NEXT: vmv.v.x v10, a0 +; RV64-NEXT: vmsne.vi v0, v10, 0 +; RV64-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV64-NEXT: vmv.v.i v10, 0 +; RV64-NEXT: vmerge.vim v10, v10, -1, v0 +; RV64-NEXT: vand.vv v12, v10, v12 +; RV64-NEXT: vnot.v v10, v10 +; RV64-NEXT: vand.vv v8, v10, v8 +; RV64-NEXT: vor.vv v8, v12, v8 +; RV64-NEXT: ret +; +; RV32-LABEL: ctsel_nxv4f32_arith: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e32, m2, ta, ma +; RV32-NEXT: vfadd.vv v12, v8, v10 +; RV32-NEXT: vfsub.vv v8, v8, v10 +; RV32-NEXT: andi a0, a0, 1 +; RV32-NEXT: vsetvli zero, zero, e8, mf2, ta, ma +; RV32-NEXT: vmv.v.x v10, a0 +; RV32-NEXT: vmsne.vi v0, v10, 0 +; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV32-NEXT: vmv.v.i v10, 0 +; RV32-NEXT: vmerge.vim v10, v10, -1, v0 +; RV32-NEXT: vand.vv v12, v10, v12 +; RV32-NEXT: vnot.v v10, v10 +; RV32-NEXT: vand.vv v8, v10, v8 +; RV32-NEXT: vor.vv v8, v12, v8 +; RV32-NEXT: ret +; +; RV32-V128-LABEL: ctsel_nxv4f32_arith: +; RV32-V128: # %bb.0: +; RV32-V128-NEXT: vsetvli a1, zero, e32, m2, ta, ma +; RV32-V128-NEXT: vfadd.vv v12, v8, v10 +; RV32-V128-NEXT: vfsub.vv v8, v8, v10 +; RV32-V128-NEXT: andi a0, a0, 1 +; RV32-V128-NEXT: vsetvli zero, zero, e8, mf2, ta, ma +; RV32-V128-NEXT: vmv.v.x v10, a0 +; RV32-V128-NEXT: vmsne.vi v0, v10, 0 +; RV32-V128-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV32-V128-NEXT: vmv.v.i v10, 0 +; RV32-V128-NEXT: vmerge.vim v10, v10, -1, v0 +; RV32-V128-NEXT: vand.vv v12, v10, v12 +; RV32-V128-NEXT: vnot.v v10, v10 +; RV32-V128-NEXT: vand.vv v8, v10, v8 +; RV32-V128-NEXT: vor.vv v8, v12, v8 +; RV32-V128-NEXT: ret +; +; RV64-V256-LABEL: ctsel_nxv4f32_arith: +; RV64-V256: # %bb.0: +; RV64-V256-NEXT: vsetvli a1, zero, e32, m2, ta, ma +; RV64-V256-NEXT: vfadd.vv v12, v8, v10 +; RV64-V256-NEXT: vfsub.vv v8, v8, v10 +; RV64-V256-NEXT: andi a0, a0, 1 +; RV64-V256-NEXT: vsetvli zero, zero, e8, mf2, ta, ma +; RV64-V256-NEXT: vmv.v.x v10, a0 +; RV64-V256-NEXT: vmsne.vi v0, v10, 0 +; RV64-V256-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV64-V256-NEXT: vmv.v.i v10, 0 +; RV64-V256-NEXT: vmerge.vim v10, v10, -1, v0 +; RV64-V256-NEXT: vand.vv v12, v10, v12 +; RV64-V256-NEXT: vnot.v v10, v10 +; RV64-V256-NEXT: vand.vv v8, v10, v8 +; RV64-V256-NEXT: vor.vv v8, v12, v8 +; RV64-V256-NEXT: ret + %sum = fadd %x, %y + %diff = fsub %x, %y + %r = call @llvm.ct.select.nxv4f32(i1 %cond, %sum, %diff) + ret %r +} + +define @ctsel_nxv2f64_basic(i1 %cond, %a, %b) { +; RV64-LABEL: ctsel_nxv2f64_basic: +; RV64: # %bb.0: +; RV64-NEXT: andi a0, a0, 1 +; RV64-NEXT: vsetvli a1, zero, e8, mf4, ta, ma +; RV64-NEXT: vmv.v.x v12, a0 +; RV64-NEXT: vmsne.vi v0, v12, 0 +; RV64-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RV64-NEXT: vmv.v.i v12, 0 +; RV64-NEXT: vmerge.vim v12, v12, -1, v0 +; RV64-NEXT: vand.vv v8, v12, v8 +; RV64-NEXT: vnot.v v12, v12 +; RV64-NEXT: vand.vv v10, v12, v10 +; RV64-NEXT: vor.vv v8, v8, v10 +; RV64-NEXT: ret +; +; RV32-LABEL: ctsel_nxv2f64_basic: +; RV32: # %bb.0: +; RV32-NEXT: andi a0, a0, 1 +; RV32-NEXT: vsetvli a1, zero, e8, mf4, ta, ma +; RV32-NEXT: vmv.v.x v12, a0 +; RV32-NEXT: vmsne.vi v0, v12, 0 +; RV32-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RV32-NEXT: vmv.v.i v12, 0 +; RV32-NEXT: vmerge.vim v12, v12, -1, v0 +; RV32-NEXT: vand.vv v8, v12, v8 +; RV32-NEXT: vnot.v v12, v12 +; RV32-NEXT: vand.vv v10, v12, v10 +; RV32-NEXT: vor.vv v8, v8, v10 +; RV32-NEXT: ret +; +; RV32-V128-LABEL: 
+; RV32-V128: # %bb.0:
+; RV32-V128-NEXT: andi a0, a0, 1
+; RV32-V128-NEXT: vsetvli a1, zero, e8, mf4, ta, ma
+; RV32-V128-NEXT: vmv.v.x v12, a0
+; RV32-V128-NEXT: vmsne.vi v0, v12, 0
+; RV32-V128-NEXT: vsetvli zero, zero, e64, m2, ta, ma
+; RV32-V128-NEXT: vmv.v.i v12, 0
+; RV32-V128-NEXT: vmerge.vim v12, v12, -1, v0
+; RV32-V128-NEXT: vand.vv v8, v12, v8
+; RV32-V128-NEXT: vnot.v v12, v12
+; RV32-V128-NEXT: vand.vv v10, v12, v10
+; RV32-V128-NEXT: vor.vv v8, v8, v10
+; RV32-V128-NEXT: ret
+;
+; RV64-V256-LABEL: ctsel_nxv2f64_basic:
+; RV64-V256: # %bb.0:
+; RV64-V256-NEXT: andi a0, a0, 1
+; RV64-V256-NEXT: vsetvli a1, zero, e8, mf4, ta, ma
+; RV64-V256-NEXT: vmv.v.x v12, a0
+; RV64-V256-NEXT: vmsne.vi v0, v12, 0
+; RV64-V256-NEXT: vsetvli zero, zero, e64, m2, ta, ma
+; RV64-V256-NEXT: vmv.v.i v12, 0
+; RV64-V256-NEXT: vmerge.vim v12, v12, -1, v0
+; RV64-V256-NEXT: vand.vv v8, v12, v8
+; RV64-V256-NEXT: vnot.v v12, v12
+; RV64-V256-NEXT: vand.vv v10, v12, v10
+; RV64-V256-NEXT: vor.vv v8, v8, v10
+; RV64-V256-NEXT: ret
+  %r = call <vscale x 2 x double> @llvm.ct.select.nxv2f64(i1 %cond, <vscale x 2 x double> %a, <vscale x 2 x double> %b)
+  ret <vscale x 2 x double> %r
+}
+
+declare <vscale x 4 x i32> @llvm.ct.select.nxv4i32(i1, <vscale x 4 x i32>, <vscale x 4 x i32>)
+declare <vscale x 8 x i16> @llvm.ct.select.nxv8i16(i1, <vscale x 8 x i16>, <vscale x 8 x i16>)
+declare <vscale x 16 x i8> @llvm.ct.select.nxv16i8(i1, <vscale x 16 x i8>, <vscale x 16 x i8>)
+declare <vscale x 2 x i64> @llvm.ct.select.nxv2i64(i1, <vscale x 2 x i64>, <vscale x 2 x i64>)
+declare <vscale x 4 x float> @llvm.ct.select.nxv4f32(i1, <vscale x 4 x float>, <vscale x 4 x float>)
+declare <vscale x 2 x double> @llvm.ct.select.nxv2f64(i1, <vscale x 2 x double>, <vscale x 2 x double>)

From b2484766304933e5515fc2c3dd163aefd07178be Mon Sep 17 00:00:00 2001
From: wizardengineer
Date: Mon, 25 Aug 2025 13:12:41 -0400
Subject: [PATCH 19/63] Changes for DAG chaining

---
 .../SelectionDAG/SelectionDAGBuilder.cpp      | 30 +++++++++++++++++++
 .../RISCV/ctselect-fallback-edge-cases.ll     | 28 +++++++++--------
 .../RISCV/ctselect-fallback-patterns.ll       |  8 ++++-
 llvm/test/CodeGen/RISCV/ctselect-fallback.ll  |  3 +-
 4 files changed, 54 insertions(+), 15 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 08852d5dc9b7a..38cec9eb5f174 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -6501,6 +6501,9 @@ SDValue SelectionDAGBuilder::createProtectedCtSelectFallback(
   SDValue WorkingF = F;
   EVT WorkingVT = VT;
 
+  SDValue Chain = DAG.getEntryNode();
+  MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
+
   if (VT.isVector() && !Cond.getValueType().isVector()) {
     ElementCount NumElems = VT.getVectorElementCount();
     EVT CondVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElems);
@@ -6547,6 +6550,33 @@ SDValue SelectionDAGBuilder::createProtectedCtSelectFallback(
   // (or (and WorkingT, Mask), (and F, ~Mask))
   SDValue TM = DAG.getNode(ISD::AND, DL, WorkingVT, Mask, WorkingT, ProtectedFlag);
   SDValue FM = DAG.getNode(ISD::AND, DL, WorkingVT, Invert, WorkingF, ProtectedFlag);
+
+  // Only apply chaining for non-scalable types or when we can get a register class
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  bool CanUseChaining = false;
+
+  if (!WorkingVT.isScalableVector()) {
+    // For fixed-size vectors and scalars, we can safely use register classes
+    CanUseChaining = TLI.isTypeLegal(WorkingVT.getSimpleVT());
+  } else {
+    // For scalable vectors, check if the target has register class support
+    // This is target-specific - RISC-V might not support this directly
+    CanUseChaining = false; // Conservative: disable for scalable vectors
+  }
+
+  if (CanUseChaining) {
+    // Apply chaining through registers for additional protection
+    const TargetRegisterClass *RC =
TLI.getRegClassFor(WorkingVT.getSimpleVT()); + + Register TMReg = MRI.createVirtualRegister(RC); + Chain = DAG.getCopyToReg(Chain, DL, TMReg, TM); + TM = DAG.getCopyFromReg(Chain, DL, TMReg, WorkingVT); + + Register FMReg = MRI.createVirtualRegister(RC); + Chain = DAG.getCopyToReg(Chain, DL, FMReg, FM); + FM = DAG.getCopyFromReg(Chain, DL, FMReg, WorkingVT); + } + SDValue Result = DAG.getNode(ISD::OR, DL, WorkingVT, TM, FM, ProtectedFlag); // Convert back if needed diff --git a/llvm/test/CodeGen/RISCV/ctselect-fallback-edge-cases.ll b/llvm/test/CodeGen/RISCV/ctselect-fallback-edge-cases.ll index 7545a9c94f229..7fe76cf7f574c 100644 --- a/llvm/test/CodeGen/RISCV/ctselect-fallback-edge-cases.ll +++ b/llvm/test/CodeGen/RISCV/ctselect-fallback-edge-cases.ll @@ -59,6 +59,7 @@ define ptr @test_ctselect_null_ptr(i1 %cond, ptr %ptr) { ; RV64-NEXT: slli a0, a0, 63 ; RV64-NEXT: srai a0, a0, 63 ; RV64-NEXT: and a0, a0, a1 +; RV64-NEXT: or a0, a0, zero ; RV64-NEXT: ret ; ; RV32-LABEL: test_ctselect_null_ptr: @@ -66,6 +67,7 @@ define ptr @test_ctselect_null_ptr(i1 %cond, ptr %ptr) { ; RV32-NEXT: slli a0, a0, 31 ; RV32-NEXT: srai a0, a0, 31 ; RV32-NEXT: and a0, a0, a1 +; RV32-NEXT: or a0, a0, zero ; RV32-NEXT: ret %result = call ptr @llvm.ct.select.p0(i1 %cond, ptr %ptr, ptr null) ret ptr %result @@ -351,23 +353,23 @@ define void @cmovznz4_builtin_ctselect(i64 %cin, ptr %x, ptr %y, ptr %r) { ; RV64-NEXT: and a5, a6, a5 ; RV64-NEXT: or a4, a4, a5 ; RV64-NEXT: sd a4, 0(a3) -; RV64-NEXT: ld a4, 8(a2) -; RV64-NEXT: ld a5, 8(a1) -; RV64-NEXT: and a4, a6, a4 -; RV64-NEXT: and a5, a0, a5 -; RV64-NEXT: or a4, a5, a4 +; RV64-NEXT: ld a4, 8(a1) +; RV64-NEXT: ld a5, 8(a2) +; RV64-NEXT: and a4, a0, a4 +; RV64-NEXT: and a5, a6, a5 +; RV64-NEXT: or a4, a4, a5 ; RV64-NEXT: sd a4, 8(a3) -; RV64-NEXT: ld a4, 16(a2) -; RV64-NEXT: ld a5, 16(a1) -; RV64-NEXT: and a4, a6, a4 -; RV64-NEXT: and a5, a0, a5 -; RV64-NEXT: or a4, a5, a4 +; RV64-NEXT: ld a4, 16(a1) +; RV64-NEXT: ld a5, 16(a2) +; RV64-NEXT: and a4, a0, a4 +; RV64-NEXT: and a5, a6, a5 +; RV64-NEXT: or a4, a4, a5 ; RV64-NEXT: sd a4, 16(a3) -; RV64-NEXT: ld a2, 24(a2) ; RV64-NEXT: ld a1, 24(a1) -; RV64-NEXT: and a2, a6, a2 +; RV64-NEXT: ld a2, 24(a2) ; RV64-NEXT: and a0, a0, a1 -; RV64-NEXT: or a0, a0, a2 +; RV64-NEXT: and a1, a6, a2 +; RV64-NEXT: or a0, a0, a1 ; RV64-NEXT: sd a0, 24(a3) ; RV64-NEXT: ret ; diff --git a/llvm/test/CodeGen/RISCV/ctselect-fallback-patterns.ll b/llvm/test/CodeGen/RISCV/ctselect-fallback-patterns.ll index c21404dbc6317..9a4106c489a6d 100644 --- a/llvm/test/CodeGen/RISCV/ctselect-fallback-patterns.ll +++ b/llvm/test/CodeGen/RISCV/ctselect-fallback-patterns.ll @@ -14,6 +14,7 @@ define i32 @test_ctselect_smin_zero(i32 %x) { ; RV32: # %bb.0: ; RV32-NEXT: srai a1, a0, 31 ; RV32-NEXT: and a0, a1, a0 +; RV32-NEXT: or a0, a0, zero ; RV32-NEXT: ret %cmp = icmp slt i32 %x, 0 %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 0) @@ -35,6 +36,7 @@ define i32 @test_ctselect_smax_zero(i32 %x) { ; RV32-NEXT: sgtz a1, a0 ; RV32-NEXT: neg a1, a1 ; RV32-NEXT: and a0, a1, a0 +; RV32-NEXT: or a0, a0, zero ; RV32-NEXT: ret %cmp = icmp sgt i32 %x, 0 %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 0) @@ -217,6 +219,7 @@ define i32 @test_ctselect_sign_extend(i32 %x) { ; RV32-LABEL: test_ctselect_sign_extend: ; RV32: # %bb.0: ; RV32-NEXT: srai a0, a0, 31 +; RV32-NEXT: or a0, a0, zero ; RV32-NEXT: ret %cmp = icmp slt i32 %x, 0 %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 -1, i32 0) @@ -236,6 +239,7 @@ define i32 
@test_ctselect_zero_extend(i32 %x) { ; RV32-NEXT: seqz a0, a0 ; RV32-NEXT: addi a0, a0, -1 ; RV32-NEXT: andi a0, a0, 1 +; RV32-NEXT: or a0, a0, zero ; RV32-NEXT: ret %cmp = icmp ne i32 %x, 0 %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 1, i32 0) @@ -250,6 +254,7 @@ define i32 @test_ctselect_constant_folding_true(i32 %a, i32 %b) { ; ; RV32-LABEL: test_ctselect_constant_folding_true: ; RV32: # %bb.0: +; RV32-NEXT: or a0, a0, zero ; RV32-NEXT: ret %result = call i32 @llvm.ct.select.i32(i1 true, i32 %a, i32 %b) ret i32 %result @@ -263,7 +268,7 @@ define i32 @test_ctselect_constant_folding_false(i32 %a, i32 %b) { ; ; RV32-LABEL: test_ctselect_constant_folding_false: ; RV32: # %bb.0: -; RV32-NEXT: mv a0, a1 +; RV32-NEXT: or a0, zero, a1 ; RV32-NEXT: ret %result = call i32 @llvm.ct.select.i32(i1 false, i32 %a, i32 %b) ret i32 %result @@ -377,6 +382,7 @@ define i64 @test_ctselect_i64_smin_zero(i64 %x) { ; RV64: # %bb.0: ; RV64-NEXT: srai a1, a0, 63 ; RV64-NEXT: and a0, a1, a0 +; RV64-NEXT: or a0, a0, zero ; RV64-NEXT: ret ; ; RV32-LABEL: test_ctselect_i64_smin_zero: diff --git a/llvm/test/CodeGen/RISCV/ctselect-fallback.ll b/llvm/test/CodeGen/RISCV/ctselect-fallback.ll index 4f25ea0a2d4c1..33df9fdefc0d7 100644 --- a/llvm/test/CodeGen/RISCV/ctselect-fallback.ll +++ b/llvm/test/CodeGen/RISCV/ctselect-fallback.ll @@ -134,6 +134,7 @@ define i32 @test_ctselect_const_true(i32 %a, i32 %b) { ; ; RV32-LABEL: test_ctselect_const_true: ; RV32: # %bb.0: +; RV32-NEXT: or a0, a0, zero ; RV32-NEXT: ret %result = call i32 @llvm.ct.select.i32(i1 true, i32 %a, i32 %b) ret i32 %result @@ -147,7 +148,7 @@ define i32 @test_ctselect_const_false(i32 %a, i32 %b) { ; ; RV32-LABEL: test_ctselect_const_false: ; RV32: # %bb.0: -; RV32-NEXT: mv a0, a1 +; RV32-NEXT: or a0, zero, a1 ; RV32-NEXT: ret %result = call i32 @llvm.ct.select.i32(i1 false, i32 %a, i32 %b) ret i32 %result From 9af537e1b4b3ccc6361aeccce8fa8ac34ed92dd1 Mon Sep 17 00:00:00 2001 From: wizardengineer Date: Mon, 25 Aug 2025 13:18:13 -0400 Subject: [PATCH 20/63] [CT] Added diffs for webasm tests --- .../ctselect-fallback-edge-cases.ll | 4 + .../WebAssembly/ctselect-fallback-patterns.ll | 28 ++ .../WebAssembly/ctselect-fallback-vector.ll | 388 ++++++++++++------ .../CodeGen/WebAssembly/ctselect-fallback.ll | 8 + 4 files changed, 310 insertions(+), 118 deletions(-) diff --git a/llvm/test/CodeGen/WebAssembly/ctselect-fallback-edge-cases.ll b/llvm/test/CodeGen/WebAssembly/ctselect-fallback-edge-cases.ll index 53b12e67a9204..4361393ca0339 100644 --- a/llvm/test/CodeGen/WebAssembly/ctselect-fallback-edge-cases.ll +++ b/llvm/test/CodeGen/WebAssembly/ctselect-fallback-edge-cases.ll @@ -90,6 +90,8 @@ define ptr @test_ctselect_null_ptr(i1 %cond, ptr %ptr) { ; W32-NEXT: i32.sub ; W32-NEXT: local.get 1 ; W32-NEXT: i32.and +; W32-NEXT: i32.const 0 +; W32-NEXT: i32.or ; W32-NEXT: # fallthrough-return ; ; W64-LABEL: test_ctselect_null_ptr: @@ -103,6 +105,8 @@ define ptr @test_ctselect_null_ptr(i1 %cond, ptr %ptr) { ; W64-NEXT: i64.sub ; W64-NEXT: local.get 1 ; W64-NEXT: i64.and +; W64-NEXT: i64.const 0 +; W64-NEXT: i64.or ; W64-NEXT: # fallthrough-return %result = call ptr @llvm.ct.select.p0(i1 %cond, ptr %ptr, ptr null) ret ptr %result diff --git a/llvm/test/CodeGen/WebAssembly/ctselect-fallback-patterns.ll b/llvm/test/CodeGen/WebAssembly/ctselect-fallback-patterns.ll index 85acc50210b4f..0316249d869f6 100644 --- a/llvm/test/CodeGen/WebAssembly/ctselect-fallback-patterns.ll +++ b/llvm/test/CodeGen/WebAssembly/ctselect-fallback-patterns.ll @@ -12,6 +12,8 @@ define i32 
@test_ctselect_smin_zero(i32 %x) { ; W32-NEXT: i32.shr_s ; W32-NEXT: local.get 0 ; W32-NEXT: i32.and +; W32-NEXT: i32.const 0 +; W32-NEXT: i32.or ; W32-NEXT: # fallthrough-return ; ; W64-LABEL: test_ctselect_smin_zero: @@ -22,6 +24,8 @@ define i32 @test_ctselect_smin_zero(i32 %x) { ; W64-NEXT: i32.shr_s ; W64-NEXT: local.get 0 ; W64-NEXT: i32.and +; W64-NEXT: i32.const 0 +; W64-NEXT: i32.or ; W64-NEXT: # fallthrough-return %cmp = icmp slt i32 %x, 0 %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 0) @@ -41,6 +45,8 @@ define i32 @test_ctselect_smax_zero(i32 %x) { ; W32-NEXT: i32.select ; W32-NEXT: local.get 0 ; W32-NEXT: i32.and +; W32-NEXT: i32.const 0 +; W32-NEXT: i32.or ; W32-NEXT: # fallthrough-return ; ; W64-LABEL: test_ctselect_smax_zero: @@ -54,6 +60,8 @@ define i32 @test_ctselect_smax_zero(i32 %x) { ; W64-NEXT: i32.select ; W64-NEXT: local.get 0 ; W64-NEXT: i32.and +; W64-NEXT: i32.const 0 +; W64-NEXT: i32.or ; W64-NEXT: # fallthrough-return %cmp = icmp sgt i32 %x, 0 %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 0) @@ -354,6 +362,8 @@ define i32 @test_ctselect_sign_extend(i32 %x) { ; W32-NEXT: local.get 0 ; W32-NEXT: i32.const 31 ; W32-NEXT: i32.shr_s +; W32-NEXT: i32.const 0 +; W32-NEXT: i32.or ; W32-NEXT: # fallthrough-return ; ; W64-LABEL: test_ctselect_sign_extend: @@ -362,6 +372,8 @@ define i32 @test_ctselect_sign_extend(i32 %x) { ; W64-NEXT: local.get 0 ; W64-NEXT: i32.const 31 ; W64-NEXT: i32.shr_s +; W64-NEXT: i32.const 0 +; W64-NEXT: i32.or ; W64-NEXT: # fallthrough-return %cmp = icmp slt i32 %x, 0 %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 -1, i32 0) @@ -379,6 +391,8 @@ define i32 @test_ctselect_zero_extend(i32 %x) { ; W32-NEXT: i32.select ; W32-NEXT: i32.const 1 ; W32-NEXT: i32.and +; W32-NEXT: i32.const 0 +; W32-NEXT: i32.or ; W32-NEXT: # fallthrough-return ; ; W64-LABEL: test_ctselect_zero_extend: @@ -390,6 +404,8 @@ define i32 @test_ctselect_zero_extend(i32 %x) { ; W64-NEXT: i32.select ; W64-NEXT: i32.const 1 ; W64-NEXT: i32.and +; W64-NEXT: i32.const 0 +; W64-NEXT: i32.or ; W64-NEXT: # fallthrough-return %cmp = icmp ne i32 %x, 0 %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 1, i32 0) @@ -402,12 +418,16 @@ define i32 @test_ctselect_constant_folding_true(i32 %a, i32 %b) { ; W32: .functype test_ctselect_constant_folding_true (i32, i32) -> (i32) ; W32-NEXT: # %bb.0: ; W32-NEXT: local.get 0 +; W32-NEXT: i32.const 0 +; W32-NEXT: i32.or ; W32-NEXT: # fallthrough-return ; ; W64-LABEL: test_ctselect_constant_folding_true: ; W64: .functype test_ctselect_constant_folding_true (i32, i32) -> (i32) ; W64-NEXT: # %bb.0: ; W64-NEXT: local.get 0 +; W64-NEXT: i32.const 0 +; W64-NEXT: i32.or ; W64-NEXT: # fallthrough-return %result = call i32 @llvm.ct.select.i32(i1 true, i32 %a, i32 %b) ret i32 %result @@ -417,13 +437,17 @@ define i32 @test_ctselect_constant_folding_false(i32 %a, i32 %b) { ; W32-LABEL: test_ctselect_constant_folding_false: ; W32: .functype test_ctselect_constant_folding_false (i32, i32) -> (i32) ; W32-NEXT: # %bb.0: +; W32-NEXT: i32.const 0 ; W32-NEXT: local.get 1 +; W32-NEXT: i32.or ; W32-NEXT: # fallthrough-return ; ; W64-LABEL: test_ctselect_constant_folding_false: ; W64: .functype test_ctselect_constant_folding_false (i32, i32) -> (i32) ; W64-NEXT: # %bb.0: +; W64-NEXT: i32.const 0 ; W64-NEXT: local.get 1 +; W64-NEXT: i32.or ; W64-NEXT: # fallthrough-return %result = call i32 @llvm.ct.select.i32(i1 false, i32 %a, i32 %b) ret i32 %result @@ -626,6 +650,8 @@ define i64 @test_ctselect_i64_smin_zero(i64 %x) { ; W32-NEXT: i64.shr_s 
; W32-NEXT: local.get 0 ; W32-NEXT: i64.and +; W32-NEXT: i64.const 0 +; W32-NEXT: i64.or ; W32-NEXT: # fallthrough-return ; ; W64-LABEL: test_ctselect_i64_smin_zero: @@ -636,6 +662,8 @@ define i64 @test_ctselect_i64_smin_zero(i64 %x) { ; W64-NEXT: i64.shr_s ; W64-NEXT: local.get 0 ; W64-NEXT: i64.and +; W64-NEXT: i64.const 0 +; W64-NEXT: i64.or ; W64-NEXT: # fallthrough-return %cmp = icmp slt i64 %x, 0 %result = call i64 @llvm.ct.select.i64(i1 %cmp, i64 %x, i64 0) diff --git a/llvm/test/CodeGen/WebAssembly/ctselect-fallback-vector.ll b/llvm/test/CodeGen/WebAssembly/ctselect-fallback-vector.ll index daa7370fb481a..cac4f721beccc 100644 --- a/llvm/test/CodeGen/WebAssembly/ctselect-fallback-vector.ll +++ b/llvm/test/CodeGen/WebAssembly/ctselect-fallback-vector.ll @@ -6,30 +6,40 @@ define <4 x i32> @test_ctselect_v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b) { ; WASM32-LABEL: test_ctselect_v4i32: ; WASM32: .functype test_ctselect_v4i32 (i32, v128, v128) -> (v128) +; WASM32-NEXT: .local v128 ; WASM32-NEXT: # %bb.0: -; WASM32-NEXT: local.get 1 -; WASM32-NEXT: local.get 2 ; WASM32-NEXT: local.get 0 ; WASM32-NEXT: i32x4.splat ; WASM32-NEXT: i32.const 31 ; WASM32-NEXT: i32x4.shl ; WASM32-NEXT: i32.const 31 ; WASM32-NEXT: i32x4.shr_s -; WASM32-NEXT: v128.bitselect +; WASM32-NEXT: local.tee 3 +; WASM32-NEXT: local.get 1 +; WASM32-NEXT: v128.and +; WASM32-NEXT: local.get 2 +; WASM32-NEXT: local.get 3 +; WASM32-NEXT: v128.andnot +; WASM32-NEXT: v128.or ; WASM32-NEXT: # fallthrough-return ; ; WASM64-LABEL: test_ctselect_v4i32: ; WASM64: .functype test_ctselect_v4i32 (i32, v128, v128) -> (v128) +; WASM64-NEXT: .local v128 ; WASM64-NEXT: # %bb.0: -; WASM64-NEXT: local.get 1 -; WASM64-NEXT: local.get 2 ; WASM64-NEXT: local.get 0 ; WASM64-NEXT: i32x4.splat ; WASM64-NEXT: i32.const 31 ; WASM64-NEXT: i32x4.shl ; WASM64-NEXT: i32.const 31 ; WASM64-NEXT: i32x4.shr_s -; WASM64-NEXT: v128.bitselect +; WASM64-NEXT: local.tee 3 +; WASM64-NEXT: local.get 1 +; WASM64-NEXT: v128.and +; WASM64-NEXT: local.get 2 +; WASM64-NEXT: local.get 3 +; WASM64-NEXT: v128.andnot +; WASM64-NEXT: v128.or ; WASM64-NEXT: # fallthrough-return %result = call <4 x i32> @llvm.ct.select.v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b) ret <4 x i32> %result @@ -39,30 +49,40 @@ define <4 x i32> @test_ctselect_v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b) { define <8 x i16> @test_ctselect_v8i16(i1 %cond, <8 x i16> %a, <8 x i16> %b) { ; WASM32-LABEL: test_ctselect_v8i16: ; WASM32: .functype test_ctselect_v8i16 (i32, v128, v128) -> (v128) +; WASM32-NEXT: .local v128 ; WASM32-NEXT: # %bb.0: -; WASM32-NEXT: local.get 1 -; WASM32-NEXT: local.get 2 ; WASM32-NEXT: local.get 0 ; WASM32-NEXT: i16x8.splat ; WASM32-NEXT: i32.const 15 ; WASM32-NEXT: i16x8.shl ; WASM32-NEXT: i32.const 15 ; WASM32-NEXT: i16x8.shr_s -; WASM32-NEXT: v128.bitselect +; WASM32-NEXT: local.tee 3 +; WASM32-NEXT: local.get 1 +; WASM32-NEXT: v128.and +; WASM32-NEXT: local.get 2 +; WASM32-NEXT: local.get 3 +; WASM32-NEXT: v128.andnot +; WASM32-NEXT: v128.or ; WASM32-NEXT: # fallthrough-return ; ; WASM64-LABEL: test_ctselect_v8i16: ; WASM64: .functype test_ctselect_v8i16 (i32, v128, v128) -> (v128) +; WASM64-NEXT: .local v128 ; WASM64-NEXT: # %bb.0: -; WASM64-NEXT: local.get 1 -; WASM64-NEXT: local.get 2 ; WASM64-NEXT: local.get 0 ; WASM64-NEXT: i16x8.splat ; WASM64-NEXT: i32.const 15 ; WASM64-NEXT: i16x8.shl ; WASM64-NEXT: i32.const 15 ; WASM64-NEXT: i16x8.shr_s -; WASM64-NEXT: v128.bitselect +; WASM64-NEXT: local.tee 3 +; WASM64-NEXT: local.get 1 +; WASM64-NEXT: v128.and +; WASM64-NEXT: 
local.get 2 +; WASM64-NEXT: local.get 3 +; WASM64-NEXT: v128.andnot +; WASM64-NEXT: v128.or ; WASM64-NEXT: # fallthrough-return %result = call <8 x i16> @llvm.ct.select.v8i16(i1 %cond, <8 x i16> %a, <8 x i16> %b) ret <8 x i16> %result @@ -72,30 +92,40 @@ define <8 x i16> @test_ctselect_v8i16(i1 %cond, <8 x i16> %a, <8 x i16> %b) { define <16 x i8> @test_ctselect_v16i8(i1 %cond, <16 x i8> %a, <16 x i8> %b) { ; WASM32-LABEL: test_ctselect_v16i8: ; WASM32: .functype test_ctselect_v16i8 (i32, v128, v128) -> (v128) +; WASM32-NEXT: .local v128 ; WASM32-NEXT: # %bb.0: -; WASM32-NEXT: local.get 1 -; WASM32-NEXT: local.get 2 ; WASM32-NEXT: local.get 0 ; WASM32-NEXT: i8x16.splat ; WASM32-NEXT: i32.const 7 ; WASM32-NEXT: i8x16.shl ; WASM32-NEXT: i32.const 7 ; WASM32-NEXT: i8x16.shr_s -; WASM32-NEXT: v128.bitselect +; WASM32-NEXT: local.tee 3 +; WASM32-NEXT: local.get 1 +; WASM32-NEXT: v128.and +; WASM32-NEXT: local.get 2 +; WASM32-NEXT: local.get 3 +; WASM32-NEXT: v128.andnot +; WASM32-NEXT: v128.or ; WASM32-NEXT: # fallthrough-return ; ; WASM64-LABEL: test_ctselect_v16i8: ; WASM64: .functype test_ctselect_v16i8 (i32, v128, v128) -> (v128) +; WASM64-NEXT: .local v128 ; WASM64-NEXT: # %bb.0: -; WASM64-NEXT: local.get 1 -; WASM64-NEXT: local.get 2 ; WASM64-NEXT: local.get 0 ; WASM64-NEXT: i8x16.splat ; WASM64-NEXT: i32.const 7 ; WASM64-NEXT: i8x16.shl ; WASM64-NEXT: i32.const 7 ; WASM64-NEXT: i8x16.shr_s -; WASM64-NEXT: v128.bitselect +; WASM64-NEXT: local.tee 3 +; WASM64-NEXT: local.get 1 +; WASM64-NEXT: v128.and +; WASM64-NEXT: local.get 2 +; WASM64-NEXT: local.get 3 +; WASM64-NEXT: v128.andnot +; WASM64-NEXT: v128.or ; WASM64-NEXT: # fallthrough-return %result = call <16 x i8> @llvm.ct.select.v16i8(i1 %cond, <16 x i8> %a, <16 x i8> %b) ret <16 x i8> %result @@ -105,30 +135,40 @@ define <16 x i8> @test_ctselect_v16i8(i1 %cond, <16 x i8> %a, <16 x i8> %b) { define <2 x i64> @test_ctselect_v2i64(i1 %cond, <2 x i64> %a, <2 x i64> %b) { ; WASM32-LABEL: test_ctselect_v2i64: ; WASM32: .functype test_ctselect_v2i64 (i32, v128, v128) -> (v128) +; WASM32-NEXT: .local v128 ; WASM32-NEXT: # %bb.0: -; WASM32-NEXT: local.get 1 -; WASM32-NEXT: local.get 2 ; WASM32-NEXT: local.get 0 ; WASM32-NEXT: i32x4.splat ; WASM32-NEXT: i32.const 63 ; WASM32-NEXT: i64x2.shl ; WASM32-NEXT: i32.const 63 ; WASM32-NEXT: i64x2.shr_s -; WASM32-NEXT: v128.bitselect +; WASM32-NEXT: local.tee 3 +; WASM32-NEXT: local.get 1 +; WASM32-NEXT: v128.and +; WASM32-NEXT: local.get 2 +; WASM32-NEXT: local.get 3 +; WASM32-NEXT: v128.andnot +; WASM32-NEXT: v128.or ; WASM32-NEXT: # fallthrough-return ; ; WASM64-LABEL: test_ctselect_v2i64: ; WASM64: .functype test_ctselect_v2i64 (i32, v128, v128) -> (v128) +; WASM64-NEXT: .local v128 ; WASM64-NEXT: # %bb.0: -; WASM64-NEXT: local.get 1 -; WASM64-NEXT: local.get 2 ; WASM64-NEXT: local.get 0 ; WASM64-NEXT: i32x4.splat ; WASM64-NEXT: i32.const 63 ; WASM64-NEXT: i64x2.shl ; WASM64-NEXT: i32.const 63 ; WASM64-NEXT: i64x2.shr_s -; WASM64-NEXT: v128.bitselect +; WASM64-NEXT: local.tee 3 +; WASM64-NEXT: local.get 1 +; WASM64-NEXT: v128.and +; WASM64-NEXT: local.get 2 +; WASM64-NEXT: local.get 3 +; WASM64-NEXT: v128.andnot +; WASM64-NEXT: v128.or ; WASM64-NEXT: # fallthrough-return %result = call <2 x i64> @llvm.ct.select.v2i64(i1 %cond, <2 x i64> %a, <2 x i64> %b) ret <2 x i64> %result @@ -138,30 +178,40 @@ define <2 x i64> @test_ctselect_v2i64(i1 %cond, <2 x i64> %a, <2 x i64> %b) { define <4 x float> @test_ctselect_v4f32(i1 %cond, <4 x float> %a, <4 x float> %b) { ; WASM32-LABEL: test_ctselect_v4f32: ; 
WASM32: .functype test_ctselect_v4f32 (i32, v128, v128) -> (v128) +; WASM32-NEXT: .local v128 ; WASM32-NEXT: # %bb.0: -; WASM32-NEXT: local.get 1 -; WASM32-NEXT: local.get 2 ; WASM32-NEXT: local.get 0 ; WASM32-NEXT: i32x4.splat ; WASM32-NEXT: i32.const 31 ; WASM32-NEXT: i32x4.shl ; WASM32-NEXT: i32.const 31 ; WASM32-NEXT: i32x4.shr_s -; WASM32-NEXT: v128.bitselect +; WASM32-NEXT: local.tee 3 +; WASM32-NEXT: local.get 1 +; WASM32-NEXT: v128.and +; WASM32-NEXT: local.get 2 +; WASM32-NEXT: local.get 3 +; WASM32-NEXT: v128.andnot +; WASM32-NEXT: v128.or ; WASM32-NEXT: # fallthrough-return ; ; WASM64-LABEL: test_ctselect_v4f32: ; WASM64: .functype test_ctselect_v4f32 (i32, v128, v128) -> (v128) +; WASM64-NEXT: .local v128 ; WASM64-NEXT: # %bb.0: -; WASM64-NEXT: local.get 1 -; WASM64-NEXT: local.get 2 ; WASM64-NEXT: local.get 0 ; WASM64-NEXT: i32x4.splat ; WASM64-NEXT: i32.const 31 ; WASM64-NEXT: i32x4.shl ; WASM64-NEXT: i32.const 31 ; WASM64-NEXT: i32x4.shr_s -; WASM64-NEXT: v128.bitselect +; WASM64-NEXT: local.tee 3 +; WASM64-NEXT: local.get 1 +; WASM64-NEXT: v128.and +; WASM64-NEXT: local.get 2 +; WASM64-NEXT: local.get 3 +; WASM64-NEXT: v128.andnot +; WASM64-NEXT: v128.or ; WASM64-NEXT: # fallthrough-return %result = call <4 x float> @llvm.ct.select.v4f32(i1 %cond, <4 x float> %a, <4 x float> %b) ret <4 x float> %result @@ -171,30 +221,40 @@ define <4 x float> @test_ctselect_v4f32(i1 %cond, <4 x float> %a, <4 x float> %b define <2 x double> @test_ctselect_v2f64(i1 %cond, <2 x double> %a, <2 x double> %b) { ; WASM32-LABEL: test_ctselect_v2f64: ; WASM32: .functype test_ctselect_v2f64 (i32, v128, v128) -> (v128) +; WASM32-NEXT: .local v128 ; WASM32-NEXT: # %bb.0: -; WASM32-NEXT: local.get 1 -; WASM32-NEXT: local.get 2 ; WASM32-NEXT: local.get 0 ; WASM32-NEXT: i32x4.splat ; WASM32-NEXT: i32.const 63 ; WASM32-NEXT: i64x2.shl ; WASM32-NEXT: i32.const 63 ; WASM32-NEXT: i64x2.shr_s -; WASM32-NEXT: v128.bitselect +; WASM32-NEXT: local.tee 3 +; WASM32-NEXT: local.get 1 +; WASM32-NEXT: v128.and +; WASM32-NEXT: local.get 2 +; WASM32-NEXT: local.get 3 +; WASM32-NEXT: v128.andnot +; WASM32-NEXT: v128.or ; WASM32-NEXT: # fallthrough-return ; ; WASM64-LABEL: test_ctselect_v2f64: ; WASM64: .functype test_ctselect_v2f64 (i32, v128, v128) -> (v128) +; WASM64-NEXT: .local v128 ; WASM64-NEXT: # %bb.0: -; WASM64-NEXT: local.get 1 -; WASM64-NEXT: local.get 2 ; WASM64-NEXT: local.get 0 ; WASM64-NEXT: i32x4.splat ; WASM64-NEXT: i32.const 63 ; WASM64-NEXT: i64x2.shl ; WASM64-NEXT: i32.const 63 ; WASM64-NEXT: i64x2.shr_s -; WASM64-NEXT: v128.bitselect +; WASM64-NEXT: local.tee 3 +; WASM64-NEXT: local.get 1 +; WASM64-NEXT: v128.and +; WASM64-NEXT: local.get 2 +; WASM64-NEXT: local.get 3 +; WASM64-NEXT: v128.andnot +; WASM64-NEXT: v128.or ; WASM64-NEXT: # fallthrough-return %result = call <2 x double> @llvm.ct.select.v2f64(i1 %cond, <2 x double> %a, <2 x double> %b) ret <2 x double> %result @@ -204,34 +264,44 @@ define <2 x double> @test_ctselect_v2f64(i1 %cond, <2 x double> %a, <2 x double> define <4 x i32> @test_ctselect_v4i32_aligned_load(i1 %cond, ptr %p1, ptr %p2) { ; WASM32-LABEL: test_ctselect_v4i32_aligned_load: ; WASM32: .functype test_ctselect_v4i32_aligned_load (i32, i32, i32) -> (v128) +; WASM32-NEXT: .local v128 ; WASM32-NEXT: # %bb.0: -; WASM32-NEXT: local.get 1 -; WASM32-NEXT: v128.load 0 -; WASM32-NEXT: local.get 2 -; WASM32-NEXT: v128.load 0 ; WASM32-NEXT: local.get 0 ; WASM32-NEXT: i32x4.splat ; WASM32-NEXT: i32.const 31 ; WASM32-NEXT: i32x4.shl ; WASM32-NEXT: i32.const 31 ; WASM32-NEXT: i32x4.shr_s -; 
WASM32-NEXT: v128.bitselect +; WASM32-NEXT: local.tee 3 +; WASM32-NEXT: local.get 1 +; WASM32-NEXT: v128.load 0 +; WASM32-NEXT: v128.and +; WASM32-NEXT: local.get 2 +; WASM32-NEXT: v128.load 0 +; WASM32-NEXT: local.get 3 +; WASM32-NEXT: v128.andnot +; WASM32-NEXT: v128.or ; WASM32-NEXT: # fallthrough-return ; ; WASM64-LABEL: test_ctselect_v4i32_aligned_load: ; WASM64: .functype test_ctselect_v4i32_aligned_load (i32, i64, i64) -> (v128) +; WASM64-NEXT: .local v128 ; WASM64-NEXT: # %bb.0: -; WASM64-NEXT: local.get 1 -; WASM64-NEXT: v128.load 0 -; WASM64-NEXT: local.get 2 -; WASM64-NEXT: v128.load 0 ; WASM64-NEXT: local.get 0 ; WASM64-NEXT: i32x4.splat ; WASM64-NEXT: i32.const 31 ; WASM64-NEXT: i32x4.shl ; WASM64-NEXT: i32.const 31 ; WASM64-NEXT: i32x4.shr_s -; WASM64-NEXT: v128.bitselect +; WASM64-NEXT: local.tee 3 +; WASM64-NEXT: local.get 1 +; WASM64-NEXT: v128.load 0 +; WASM64-NEXT: v128.and +; WASM64-NEXT: local.get 2 +; WASM64-NEXT: v128.load 0 +; WASM64-NEXT: local.get 3 +; WASM64-NEXT: v128.andnot +; WASM64-NEXT: v128.or ; WASM64-NEXT: # fallthrough-return %a = load <4 x i32>, ptr %p1, align 16 %b = load <4 x i32>, ptr %p2, align 16 @@ -243,34 +313,44 @@ define <4 x i32> @test_ctselect_v4i32_aligned_load(i1 %cond, ptr %p1, ptr %p2) { define <4 x i32> @test_ctselect_v4i32_unaligned_load(i1 %cond, ptr %p1, ptr %p2) { ; WASM32-LABEL: test_ctselect_v4i32_unaligned_load: ; WASM32: .functype test_ctselect_v4i32_unaligned_load (i32, i32, i32) -> (v128) +; WASM32-NEXT: .local v128 ; WASM32-NEXT: # %bb.0: -; WASM32-NEXT: local.get 1 -; WASM32-NEXT: v128.load 0:p2align=2 -; WASM32-NEXT: local.get 2 -; WASM32-NEXT: v128.load 0:p2align=2 ; WASM32-NEXT: local.get 0 ; WASM32-NEXT: i32x4.splat ; WASM32-NEXT: i32.const 31 ; WASM32-NEXT: i32x4.shl ; WASM32-NEXT: i32.const 31 ; WASM32-NEXT: i32x4.shr_s -; WASM32-NEXT: v128.bitselect +; WASM32-NEXT: local.tee 3 +; WASM32-NEXT: local.get 1 +; WASM32-NEXT: v128.load 0:p2align=2 +; WASM32-NEXT: v128.and +; WASM32-NEXT: local.get 2 +; WASM32-NEXT: v128.load 0:p2align=2 +; WASM32-NEXT: local.get 3 +; WASM32-NEXT: v128.andnot +; WASM32-NEXT: v128.or ; WASM32-NEXT: # fallthrough-return ; ; WASM64-LABEL: test_ctselect_v4i32_unaligned_load: ; WASM64: .functype test_ctselect_v4i32_unaligned_load (i32, i64, i64) -> (v128) +; WASM64-NEXT: .local v128 ; WASM64-NEXT: # %bb.0: -; WASM64-NEXT: local.get 1 -; WASM64-NEXT: v128.load 0:p2align=2 -; WASM64-NEXT: local.get 2 -; WASM64-NEXT: v128.load 0:p2align=2 ; WASM64-NEXT: local.get 0 ; WASM64-NEXT: i32x4.splat ; WASM64-NEXT: i32.const 31 ; WASM64-NEXT: i32x4.shl ; WASM64-NEXT: i32.const 31 ; WASM64-NEXT: i32x4.shr_s -; WASM64-NEXT: v128.bitselect +; WASM64-NEXT: local.tee 3 +; WASM64-NEXT: local.get 1 +; WASM64-NEXT: v128.load 0:p2align=2 +; WASM64-NEXT: v128.and +; WASM64-NEXT: local.get 2 +; WASM64-NEXT: v128.load 0:p2align=2 +; WASM64-NEXT: local.get 3 +; WASM64-NEXT: v128.andnot +; WASM64-NEXT: v128.or ; WASM64-NEXT: # fallthrough-return %a = load <4 x i32>, ptr %p1, align 4 %b = load <4 x i32>, ptr %p2, align 4 @@ -282,33 +362,43 @@ define <4 x i32> @test_ctselect_v4i32_unaligned_load(i1 %cond, ptr %p1, ptr %p2) define void @test_ctselect_v4i32_store(i1 %cond, <4 x i32> %a, <4 x i32> %b, ptr %out) { ; WASM32-LABEL: test_ctselect_v4i32_store: ; WASM32: .functype test_ctselect_v4i32_store (i32, v128, v128, i32) -> () +; WASM32-NEXT: .local v128 ; WASM32-NEXT: # %bb.0: ; WASM32-NEXT: local.get 3 -; WASM32-NEXT: local.get 1 -; WASM32-NEXT: local.get 2 ; WASM32-NEXT: local.get 0 ; WASM32-NEXT: i32x4.splat ; 
WASM32-NEXT: i32.const 31 ; WASM32-NEXT: i32x4.shl ; WASM32-NEXT: i32.const 31 ; WASM32-NEXT: i32x4.shr_s -; WASM32-NEXT: v128.bitselect +; WASM32-NEXT: local.tee 4 +; WASM32-NEXT: local.get 1 +; WASM32-NEXT: v128.and +; WASM32-NEXT: local.get 2 +; WASM32-NEXT: local.get 4 +; WASM32-NEXT: v128.andnot +; WASM32-NEXT: v128.or ; WASM32-NEXT: v128.store 0 ; WASM32-NEXT: # fallthrough-return ; ; WASM64-LABEL: test_ctselect_v4i32_store: ; WASM64: .functype test_ctselect_v4i32_store (i32, v128, v128, i64) -> () +; WASM64-NEXT: .local v128 ; WASM64-NEXT: # %bb.0: ; WASM64-NEXT: local.get 3 -; WASM64-NEXT: local.get 1 -; WASM64-NEXT: local.get 2 ; WASM64-NEXT: local.get 0 ; WASM64-NEXT: i32x4.splat ; WASM64-NEXT: i32.const 31 ; WASM64-NEXT: i32x4.shl ; WASM64-NEXT: i32.const 31 ; WASM64-NEXT: i32x4.shr_s -; WASM64-NEXT: v128.bitselect +; WASM64-NEXT: local.tee 4 +; WASM64-NEXT: local.get 1 +; WASM64-NEXT: v128.and +; WASM64-NEXT: local.get 2 +; WASM64-NEXT: local.get 4 +; WASM64-NEXT: v128.andnot +; WASM64-NEXT: v128.or ; WASM64-NEXT: v128.store 0 ; WASM64-NEXT: # fallthrough-return %result = call <4 x i32> @llvm.ct.select.v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b) @@ -320,46 +410,64 @@ define void @test_ctselect_v4i32_store(i1 %cond, <4 x i32> %a, <4 x i32> %b, ptr define <4 x i32> @test_ctselect_v4i32_chain(i1 %cond1, i1 %cond2, <4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { ; WASM32-LABEL: test_ctselect_v4i32_chain: ; WASM32: .functype test_ctselect_v4i32_chain (i32, i32, v128, v128, v128) -> (v128) +; WASM32-NEXT: .local v128, v128 ; WASM32-NEXT: # %bb.0: -; WASM32-NEXT: local.get 2 -; WASM32-NEXT: local.get 3 -; WASM32-NEXT: local.get 0 +; WASM32-NEXT: local.get 1 ; WASM32-NEXT: i32x4.splat ; WASM32-NEXT: i32.const 31 ; WASM32-NEXT: i32x4.shl ; WASM32-NEXT: i32.const 31 ; WASM32-NEXT: i32x4.shr_s -; WASM32-NEXT: v128.bitselect -; WASM32-NEXT: local.get 4 -; WASM32-NEXT: local.get 1 +; WASM32-NEXT: local.tee 5 +; WASM32-NEXT: local.get 0 ; WASM32-NEXT: i32x4.splat ; WASM32-NEXT: i32.const 31 ; WASM32-NEXT: i32x4.shl ; WASM32-NEXT: i32.const 31 ; WASM32-NEXT: i32x4.shr_s -; WASM32-NEXT: v128.bitselect +; WASM32-NEXT: local.tee 6 +; WASM32-NEXT: local.get 2 +; WASM32-NEXT: v128.and +; WASM32-NEXT: local.get 3 +; WASM32-NEXT: local.get 6 +; WASM32-NEXT: v128.andnot +; WASM32-NEXT: v128.or +; WASM32-NEXT: v128.and +; WASM32-NEXT: local.get 4 +; WASM32-NEXT: local.get 5 +; WASM32-NEXT: v128.andnot +; WASM32-NEXT: v128.or ; WASM32-NEXT: # fallthrough-return ; ; WASM64-LABEL: test_ctselect_v4i32_chain: ; WASM64: .functype test_ctselect_v4i32_chain (i32, i32, v128, v128, v128) -> (v128) +; WASM64-NEXT: .local v128, v128 ; WASM64-NEXT: # %bb.0: -; WASM64-NEXT: local.get 2 -; WASM64-NEXT: local.get 3 -; WASM64-NEXT: local.get 0 +; WASM64-NEXT: local.get 1 ; WASM64-NEXT: i32x4.splat ; WASM64-NEXT: i32.const 31 ; WASM64-NEXT: i32x4.shl ; WASM64-NEXT: i32.const 31 ; WASM64-NEXT: i32x4.shr_s -; WASM64-NEXT: v128.bitselect -; WASM64-NEXT: local.get 4 -; WASM64-NEXT: local.get 1 +; WASM64-NEXT: local.tee 5 +; WASM64-NEXT: local.get 0 ; WASM64-NEXT: i32x4.splat ; WASM64-NEXT: i32.const 31 ; WASM64-NEXT: i32x4.shl ; WASM64-NEXT: i32.const 31 ; WASM64-NEXT: i32x4.shr_s -; WASM64-NEXT: v128.bitselect +; WASM64-NEXT: local.tee 6 +; WASM64-NEXT: local.get 2 +; WASM64-NEXT: v128.and +; WASM64-NEXT: local.get 3 +; WASM64-NEXT: local.get 6 +; WASM64-NEXT: v128.andnot +; WASM64-NEXT: v128.or +; WASM64-NEXT: v128.and +; WASM64-NEXT: local.get 4 +; WASM64-NEXT: local.get 5 +; WASM64-NEXT: v128.andnot +; WASM64-NEXT: v128.or 
; WASM64-NEXT: # fallthrough-return %tmp = call <4 x i32> @llvm.ct.select.v4i32(i1 %cond1, <4 x i32> %a, <4 x i32> %b) %result = call <4 x i32> @llvm.ct.select.v4i32(i1 %cond2, <4 x i32> %tmp, <4 x i32> %c) @@ -370,38 +478,48 @@ define <4 x i32> @test_ctselect_v4i32_chain(i1 %cond1, i1 %cond2, <4 x i32> %a, define <4 x float> @test_ctselect_v4f32_arithmetic(i1 %cond, <4 x float> %x, <4 x float> %y) { ; WASM32-LABEL: test_ctselect_v4f32_arithmetic: ; WASM32: .functype test_ctselect_v4f32_arithmetic (i32, v128, v128) -> (v128) +; WASM32-NEXT: .local v128 ; WASM32-NEXT: # %bb.0: -; WASM32-NEXT: local.get 1 -; WASM32-NEXT: local.get 2 -; WASM32-NEXT: f32x4.add -; WASM32-NEXT: local.get 1 -; WASM32-NEXT: local.get 2 -; WASM32-NEXT: f32x4.sub ; WASM32-NEXT: local.get 0 ; WASM32-NEXT: i32x4.splat ; WASM32-NEXT: i32.const 31 ; WASM32-NEXT: i32x4.shl ; WASM32-NEXT: i32.const 31 ; WASM32-NEXT: i32x4.shr_s -; WASM32-NEXT: v128.bitselect +; WASM32-NEXT: local.tee 3 +; WASM32-NEXT: local.get 1 +; WASM32-NEXT: local.get 2 +; WASM32-NEXT: f32x4.add +; WASM32-NEXT: v128.and +; WASM32-NEXT: local.get 1 +; WASM32-NEXT: local.get 2 +; WASM32-NEXT: f32x4.sub +; WASM32-NEXT: local.get 3 +; WASM32-NEXT: v128.andnot +; WASM32-NEXT: v128.or ; WASM32-NEXT: # fallthrough-return ; ; WASM64-LABEL: test_ctselect_v4f32_arithmetic: ; WASM64: .functype test_ctselect_v4f32_arithmetic (i32, v128, v128) -> (v128) +; WASM64-NEXT: .local v128 ; WASM64-NEXT: # %bb.0: -; WASM64-NEXT: local.get 1 -; WASM64-NEXT: local.get 2 -; WASM64-NEXT: f32x4.add -; WASM64-NEXT: local.get 1 -; WASM64-NEXT: local.get 2 -; WASM64-NEXT: f32x4.sub ; WASM64-NEXT: local.get 0 ; WASM64-NEXT: i32x4.splat ; WASM64-NEXT: i32.const 31 ; WASM64-NEXT: i32x4.shl ; WASM64-NEXT: i32.const 31 ; WASM64-NEXT: i32x4.shr_s -; WASM64-NEXT: v128.bitselect +; WASM64-NEXT: local.tee 3 +; WASM64-NEXT: local.get 1 +; WASM64-NEXT: local.get 2 +; WASM64-NEXT: f32x4.add +; WASM64-NEXT: v128.and +; WASM64-NEXT: local.get 1 +; WASM64-NEXT: local.get 2 +; WASM64-NEXT: f32x4.sub +; WASM64-NEXT: local.get 3 +; WASM64-NEXT: v128.andnot +; WASM64-NEXT: v128.or ; WASM64-NEXT: # fallthrough-return %sum = fadd <4 x float> %x, %y %diff = fsub <4 x float> %x, %y @@ -422,6 +540,8 @@ define <4 x i32> @test_ctselect_v4i32_zeros(i1 %cond, <4 x i32> %a) { ; WASM32-NEXT: i32x4.shr_s ; WASM32-NEXT: local.get 1 ; WASM32-NEXT: v128.and +; WASM32-NEXT: v128.const 0, 0, 0, 0 +; WASM32-NEXT: v128.or ; WASM32-NEXT: # fallthrough-return ; ; WASM64-LABEL: test_ctselect_v4i32_zeros: @@ -435,6 +555,8 @@ define <4 x i32> @test_ctselect_v4i32_zeros(i1 %cond, <4 x i32> %a) { ; WASM64-NEXT: i32x4.shr_s ; WASM64-NEXT: local.get 1 ; WASM64-NEXT: v128.and +; WASM64-NEXT: v128.const 0, 0, 0, 0 +; WASM64-NEXT: v128.or ; WASM64-NEXT: # fallthrough-return %result = call <4 x i32> @llvm.ct.select.v4i32(i1 %cond, <4 x i32> %a, @@ -446,30 +568,40 @@ define <4 x i32> @test_ctselect_v4i32_zeros(i1 %cond, <4 x i32> %a) { define <4 x i32> @test_ctselect_v4i32_args(i1 %cond, <4 x i32> %a, <4 x i32> %b) nounwind { ; WASM32-LABEL: test_ctselect_v4i32_args: ; WASM32: .functype test_ctselect_v4i32_args (i32, v128, v128) -> (v128) +; WASM32-NEXT: .local v128 ; WASM32-NEXT: # %bb.0: -; WASM32-NEXT: local.get 1 -; WASM32-NEXT: local.get 2 ; WASM32-NEXT: local.get 0 ; WASM32-NEXT: i32x4.splat ; WASM32-NEXT: i32.const 31 ; WASM32-NEXT: i32x4.shl ; WASM32-NEXT: i32.const 31 ; WASM32-NEXT: i32x4.shr_s -; WASM32-NEXT: v128.bitselect +; WASM32-NEXT: local.tee 3 +; WASM32-NEXT: local.get 1 +; WASM32-NEXT: v128.and +; WASM32-NEXT: 
local.get 2 +; WASM32-NEXT: local.get 3 +; WASM32-NEXT: v128.andnot +; WASM32-NEXT: v128.or ; WASM32-NEXT: # fallthrough-return ; ; WASM64-LABEL: test_ctselect_v4i32_args: ; WASM64: .functype test_ctselect_v4i32_args (i32, v128, v128) -> (v128) +; WASM64-NEXT: .local v128 ; WASM64-NEXT: # %bb.0: -; WASM64-NEXT: local.get 1 -; WASM64-NEXT: local.get 2 ; WASM64-NEXT: local.get 0 ; WASM64-NEXT: i32x4.splat ; WASM64-NEXT: i32.const 31 ; WASM64-NEXT: i32x4.shl ; WASM64-NEXT: i32.const 31 ; WASM64-NEXT: i32x4.shr_s -; WASM64-NEXT: v128.bitselect +; WASM64-NEXT: local.tee 3 +; WASM64-NEXT: local.get 1 +; WASM64-NEXT: v128.and +; WASM64-NEXT: local.get 2 +; WASM64-NEXT: local.get 3 +; WASM64-NEXT: v128.andnot +; WASM64-NEXT: v128.or ; WASM64-NEXT: # fallthrough-return %result = call <4 x i32> @llvm.ct.select.v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b) ret <4 x i32> %result @@ -479,35 +611,45 @@ define <4 x i32> @test_ctselect_v4i32_args(i1 %cond, <4 x i32> %a, <4 x i32> %b) define <4 x i32> @test_ctselect_v4i32_multi_use(i1 %cond, <4 x i32> %a, <4 x i32> %b) { ; WASM32-LABEL: test_ctselect_v4i32_multi_use: ; WASM32: .functype test_ctselect_v4i32_multi_use (i32, v128, v128) -> (v128) +; WASM32-NEXT: .local v128 ; WASM32-NEXT: # %bb.0: -; WASM32-NEXT: local.get 1 -; WASM32-NEXT: local.get 2 ; WASM32-NEXT: local.get 0 ; WASM32-NEXT: i32x4.splat ; WASM32-NEXT: i32.const 31 ; WASM32-NEXT: i32x4.shl ; WASM32-NEXT: i32.const 31 ; WASM32-NEXT: i32x4.shr_s -; WASM32-NEXT: v128.bitselect -; WASM32-NEXT: local.tee 2 +; WASM32-NEXT: local.tee 3 +; WASM32-NEXT: local.get 1 +; WASM32-NEXT: v128.and ; WASM32-NEXT: local.get 2 +; WASM32-NEXT: local.get 3 +; WASM32-NEXT: v128.andnot +; WASM32-NEXT: v128.or +; WASM32-NEXT: local.tee 1 +; WASM32-NEXT: local.get 1 ; WASM32-NEXT: i32x4.add ; WASM32-NEXT: # fallthrough-return ; ; WASM64-LABEL: test_ctselect_v4i32_multi_use: ; WASM64: .functype test_ctselect_v4i32_multi_use (i32, v128, v128) -> (v128) +; WASM64-NEXT: .local v128 ; WASM64-NEXT: # %bb.0: -; WASM64-NEXT: local.get 1 -; WASM64-NEXT: local.get 2 ; WASM64-NEXT: local.get 0 ; WASM64-NEXT: i32x4.splat ; WASM64-NEXT: i32.const 31 ; WASM64-NEXT: i32x4.shl ; WASM64-NEXT: i32.const 31 ; WASM64-NEXT: i32x4.shr_s -; WASM64-NEXT: v128.bitselect -; WASM64-NEXT: local.tee 2 +; WASM64-NEXT: local.tee 3 +; WASM64-NEXT: local.get 1 +; WASM64-NEXT: v128.and ; WASM64-NEXT: local.get 2 +; WASM64-NEXT: local.get 3 +; WASM64-NEXT: v128.andnot +; WASM64-NEXT: v128.or +; WASM64-NEXT: local.tee 1 +; WASM64-NEXT: local.get 1 ; WASM64-NEXT: i32x4.add ; WASM64-NEXT: # fallthrough-return %sel = call <4 x i32> @llvm.ct.select.v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b) @@ -519,38 +661,48 @@ define <4 x i32> @test_ctselect_v4i32_multi_use(i1 %cond, <4 x i32> %a, <4 x i32 define <16 x i8> @test_ctselect_v16i8_ops(i1 %cond, <16 x i8> %x, <16 x i8> %y) { ; WASM32-LABEL: test_ctselect_v16i8_ops: ; WASM32: .functype test_ctselect_v16i8_ops (i32, v128, v128) -> (v128) +; WASM32-NEXT: .local v128 ; WASM32-NEXT: # %bb.0: -; WASM32-NEXT: local.get 1 -; WASM32-NEXT: local.get 2 -; WASM32-NEXT: v128.xor -; WASM32-NEXT: local.get 1 -; WASM32-NEXT: local.get 2 -; WASM32-NEXT: v128.and ; WASM32-NEXT: local.get 0 ; WASM32-NEXT: i8x16.splat ; WASM32-NEXT: i32.const 7 ; WASM32-NEXT: i8x16.shl ; WASM32-NEXT: i32.const 7 ; WASM32-NEXT: i8x16.shr_s -; WASM32-NEXT: v128.bitselect +; WASM32-NEXT: local.tee 3 +; WASM32-NEXT: local.get 1 +; WASM32-NEXT: local.get 2 +; WASM32-NEXT: v128.xor +; WASM32-NEXT: v128.and +; WASM32-NEXT: local.get 1 +; WASM32-NEXT: 
local.get 2 +; WASM32-NEXT: v128.and +; WASM32-NEXT: local.get 3 +; WASM32-NEXT: v128.andnot +; WASM32-NEXT: v128.or ; WASM32-NEXT: # fallthrough-return ; ; WASM64-LABEL: test_ctselect_v16i8_ops: ; WASM64: .functype test_ctselect_v16i8_ops (i32, v128, v128) -> (v128) +; WASM64-NEXT: .local v128 ; WASM64-NEXT: # %bb.0: -; WASM64-NEXT: local.get 1 -; WASM64-NEXT: local.get 2 -; WASM64-NEXT: v128.xor -; WASM64-NEXT: local.get 1 -; WASM64-NEXT: local.get 2 -; WASM64-NEXT: v128.and ; WASM64-NEXT: local.get 0 ; WASM64-NEXT: i8x16.splat ; WASM64-NEXT: i32.const 7 ; WASM64-NEXT: i8x16.shl ; WASM64-NEXT: i32.const 7 ; WASM64-NEXT: i8x16.shr_s -; WASM64-NEXT: v128.bitselect +; WASM64-NEXT: local.tee 3 +; WASM64-NEXT: local.get 1 +; WASM64-NEXT: local.get 2 +; WASM64-NEXT: v128.xor +; WASM64-NEXT: v128.and +; WASM64-NEXT: local.get 1 +; WASM64-NEXT: local.get 2 +; WASM64-NEXT: v128.and +; WASM64-NEXT: local.get 3 +; WASM64-NEXT: v128.andnot +; WASM64-NEXT: v128.or ; WASM64-NEXT: # fallthrough-return %xor = xor <16 x i8> %x, %y %and = and <16 x i8> %x, %y diff --git a/llvm/test/CodeGen/WebAssembly/ctselect-fallback.ll b/llvm/test/CodeGen/WebAssembly/ctselect-fallback.ll index db21bd58f6797..2a694db6d88d9 100644 --- a/llvm/test/CodeGen/WebAssembly/ctselect-fallback.ll +++ b/llvm/test/CodeGen/WebAssembly/ctselect-fallback.ll @@ -225,12 +225,16 @@ define i32 @test_ctselect_const_true(i32 %a, i32 %b) { ; W32: .functype test_ctselect_const_true (i32, i32) -> (i32) ; W32-NEXT: # %bb.0: ; W32-NEXT: local.get 0 +; W32-NEXT: i32.const 0 +; W32-NEXT: i32.or ; W32-NEXT: # fallthrough-return ; ; W64-LABEL: test_ctselect_const_true: ; W64: .functype test_ctselect_const_true (i32, i32) -> (i32) ; W64-NEXT: # %bb.0: ; W64-NEXT: local.get 0 +; W64-NEXT: i32.const 0 +; W64-NEXT: i32.or ; W64-NEXT: # fallthrough-return %result = call i32 @llvm.ct.select.i32(i1 true, i32 %a, i32 %b) ret i32 %result @@ -240,13 +244,17 @@ define i32 @test_ctselect_const_false(i32 %a, i32 %b) { ; W32-LABEL: test_ctselect_const_false: ; W32: .functype test_ctselect_const_false (i32, i32) -> (i32) ; W32-NEXT: # %bb.0: +; W32-NEXT: i32.const 0 ; W32-NEXT: local.get 1 +; W32-NEXT: i32.or ; W32-NEXT: # fallthrough-return ; ; W64-LABEL: test_ctselect_const_false: ; W64: .functype test_ctselect_const_false (i32, i32) -> (i32) ; W64-NEXT: # %bb.0: +; W64-NEXT: i32.const 0 ; W64-NEXT: local.get 1 +; W64-NEXT: i32.or ; W64-NEXT: # fallthrough-return %result = call i32 @llvm.ct.select.i32(i1 false, i32 %a, i32 %b) ret i32 %result From f0c36cb1cde5378819cd23b470dfd88790e8b3d6 Mon Sep 17 00:00:00 2001 From: wizardengineer Date: Tue, 26 Aug 2025 10:57:03 -0400 Subject: [PATCH 21/63] [CT] Removed NoMerge flag and added chain dependency between both ANDs --- .../SelectionDAG/SelectionDAGBuilder.cpp | 37 +- .../RISCV/ctselect-fallback-edge-cases.ll | 156 ++++--- .../RISCV/ctselect-fallback-patterns.ll | 75 ++-- llvm/test/CodeGen/RISCV/ctselect-fallback.ll | 197 +++++---- .../CodeGen/RISCV/ctselect-side-effects.ll | 12 +- .../ctselect-fallback-edge-cases.ll | 60 ++- .../WebAssembly/ctselect-fallback-patterns.ll | 96 +---- .../WebAssembly/ctselect-fallback-vector.ll | 388 ++++++------------ .../CodeGen/WebAssembly/ctselect-fallback.ll | 112 +++-- .../WebAssembly/ctselect-side-effects.ll | 8 +- 10 files changed, 446 insertions(+), 695 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 38cec9eb5f174..ee791310dd5e4 100644 --- 
a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -6494,8 +6494,6 @@ void SelectionDAGBuilder::visitVectorExtractLastActive(const CallInst &I, SDValue SelectionDAGBuilder::createProtectedCtSelectFallback( SelectionDAG &DAG, const SDLoc &DL, SDValue Cond, SDValue T, SDValue F, EVT VT) { - SDNodeFlags ProtectedFlag; - ProtectedFlag.setNoMerge(true); SDValue WorkingT = T; SDValue WorkingF = F; @@ -6545,39 +6543,36 @@ SDValue SelectionDAGBuilder::createProtectedCtSelectFallback( AllOnes = DAG.getAllOnesConstant(DL, WorkingVT); } - SDValue Invert = DAG.getNode(ISD::XOR, DL, WorkingVT, Mask, AllOnes, ProtectedFlag); + SDValue Invert = DAG.getNode(ISD::XOR, DL, WorkingVT, Mask, AllOnes); // (or (and WorkingT, Mask), (and F, ~Mask)) - SDValue TM = DAG.getNode(ISD::AND, DL, WorkingVT, Mask, WorkingT, ProtectedFlag); - SDValue FM = DAG.getNode(ISD::AND, DL, WorkingVT, Invert, WorkingF, ProtectedFlag); + SDValue TM = DAG.getNode(ISD::AND, DL, WorkingVT, Mask, WorkingT); - // Only apply chaining for non-scalable types or when we can get a register class - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); bool CanUseChaining = false; - - if (!WorkingVT.isScalableVector()) { - // For fixed-size vectors and scalars, we can safely use register classes - CanUseChaining = TLI.isTypeLegal(WorkingVT.getSimpleVT()); - } else { - // For scalable vectors, check if the target has register class support - // This is target-specific - RISC-V might not support this directly - CanUseChaining = false; // Conservative: disable for scalable vectors - } + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (CanUseChaining) { // Apply chaining through registers for additional protection - const TargetRegisterClass *RC = TLI.getRegClassFor(WorkingVT.getSimpleVT()); + const TargetRegisterClass *RC = TLI.getRegClassFor(WorkingVT.getSimpleVT()); Register TMReg = MRI.createVirtualRegister(RC); Chain = DAG.getCopyToReg(Chain, DL, TMReg, TM); TM = DAG.getCopyFromReg(Chain, DL, TMReg, WorkingVT); + } + + SDValue FM = DAG.getNode(ISD::AND, DL, WorkingVT, Invert, WorkingF); - Register FMReg = MRI.createVirtualRegister(RC); - Chain = DAG.getCopyToReg(Chain, DL, FMReg, FM); - FM = DAG.getCopyFromReg(Chain, DL, FMReg, WorkingVT); + if (!WorkingVT.isScalableVector()) { + // For fixed-size vectors and scalars, we can safely use register classes + CanUseChaining = TLI.isTypeLegal(WorkingVT.getSimpleVT()); + } else { + // For scalable vectors, check if the target has register class support + // This is target-specific - RISC-V might not support this directly + CanUseChaining = false; // Conservative: disable for scalable vectors } - SDValue Result = DAG.getNode(ISD::OR, DL, WorkingVT, TM, FM, ProtectedFlag); + + SDValue Result = DAG.getNode(ISD::OR, DL, WorkingVT, TM, FM); // Convert back if needed if (WorkingVT != VT) { diff --git a/llvm/test/CodeGen/RISCV/ctselect-fallback-edge-cases.ll b/llvm/test/CodeGen/RISCV/ctselect-fallback-edge-cases.ll index 7fe76cf7f574c..860f64c3672b0 100644 --- a/llvm/test/CodeGen/RISCV/ctselect-fallback-edge-cases.ll +++ b/llvm/test/CodeGen/RISCV/ctselect-fallback-edge-cases.ll @@ -39,12 +39,12 @@ define i32 @test_ctselect_extremal_values(i1 %cond) { ; ; RV32-LABEL: test_ctselect_extremal_values: ; RV32: # %bb.0: -; RV32-NEXT: slli a0, a0, 31 +; RV32-NEXT: andi a0, a0, 1 ; RV32-NEXT: lui a1, 524288 -; RV32-NEXT: srai a0, a0, 31 -; RV32-NEXT: not a2, a0 -; RV32-NEXT: slli a0, a0, 1 +; RV32-NEXT: addi a2, a0, -1 +; RV32-NEXT: 
neg a0, a0 ; RV32-NEXT: and a1, a2, a1 +; RV32-NEXT: slli a0, a0, 1 ; RV32-NEXT: srli a0, a0, 1 ; RV32-NEXT: or a0, a0, a1 ; RV32-NEXT: ret @@ -59,7 +59,6 @@ define ptr @test_ctselect_null_ptr(i1 %cond, ptr %ptr) { ; RV64-NEXT: slli a0, a0, 63 ; RV64-NEXT: srai a0, a0, 63 ; RV64-NEXT: and a0, a0, a1 -; RV64-NEXT: or a0, a0, zero ; RV64-NEXT: ret ; ; RV32-LABEL: test_ctselect_null_ptr: @@ -67,7 +66,6 @@ define ptr @test_ctselect_null_ptr(i1 %cond, ptr %ptr) { ; RV32-NEXT: slli a0, a0, 31 ; RV32-NEXT: srai a0, a0, 31 ; RV32-NEXT: and a0, a0, a1 -; RV32-NEXT: or a0, a0, zero ; RV32-NEXT: ret %result = call ptr @llvm.ct.select.p0(i1 %cond, ptr %ptr, ptr null) ret ptr %result @@ -77,22 +75,22 @@ define ptr @test_ctselect_null_ptr(i1 %cond, ptr %ptr) { define ptr @test_ctselect_function_ptr(i1 %cond, ptr %func1, ptr %func2) { ; RV64-LABEL: test_ctselect_function_ptr: ; RV64: # %bb.0: -; RV64-NEXT: slli a0, a0, 63 -; RV64-NEXT: srai a0, a0, 63 -; RV64-NEXT: and a1, a0, a1 -; RV64-NEXT: not a0, a0 -; RV64-NEXT: and a0, a0, a2 -; RV64-NEXT: or a0, a1, a0 +; RV64-NEXT: andi a0, a0, 1 +; RV64-NEXT: addi a3, a0, -1 +; RV64-NEXT: neg a0, a0 +; RV64-NEXT: and a2, a3, a2 +; RV64-NEXT: and a0, a0, a1 +; RV64-NEXT: or a0, a0, a2 ; RV64-NEXT: ret ; ; RV32-LABEL: test_ctselect_function_ptr: ; RV32: # %bb.0: -; RV32-NEXT: slli a0, a0, 31 -; RV32-NEXT: srai a0, a0, 31 -; RV32-NEXT: and a1, a0, a1 -; RV32-NEXT: not a0, a0 -; RV32-NEXT: and a0, a0, a2 -; RV32-NEXT: or a0, a1, a0 +; RV32-NEXT: andi a0, a0, 1 +; RV32-NEXT: addi a3, a0, -1 +; RV32-NEXT: neg a0, a0 +; RV32-NEXT: and a2, a3, a2 +; RV32-NEXT: and a0, a0, a1 +; RV32-NEXT: or a0, a0, a2 ; RV32-NEXT: ret %result = call ptr @llvm.ct.select.p0(i1 %cond, ptr %func1, ptr %func2) ret ptr %result @@ -104,22 +102,22 @@ define ptr @test_ctselect_ptr_cmp(ptr %p1, ptr %p2, ptr %a, ptr %b) { ; RV64: # %bb.0: ; RV64-NEXT: xor a0, a0, a1 ; RV64-NEXT: snez a0, a0 +; RV64-NEXT: neg a1, a0 ; RV64-NEXT: addi a0, a0, -1 -; RV64-NEXT: and a2, a0, a2 -; RV64-NEXT: not a0, a0 -; RV64-NEXT: and a0, a0, a3 -; RV64-NEXT: or a0, a2, a0 +; RV64-NEXT: and a1, a1, a3 +; RV64-NEXT: and a0, a0, a2 +; RV64-NEXT: or a0, a0, a1 ; RV64-NEXT: ret ; ; RV32-LABEL: test_ctselect_ptr_cmp: ; RV32: # %bb.0: ; RV32-NEXT: xor a0, a0, a1 ; RV32-NEXT: snez a0, a0 +; RV32-NEXT: neg a1, a0 ; RV32-NEXT: addi a0, a0, -1 -; RV32-NEXT: and a2, a0, a2 -; RV32-NEXT: not a0, a0 -; RV32-NEXT: and a0, a0, a3 -; RV32-NEXT: or a0, a2, a0 +; RV32-NEXT: and a1, a1, a3 +; RV32-NEXT: and a0, a0, a2 +; RV32-NEXT: or a0, a0, a1 ; RV32-NEXT: ret %cmp = icmp eq ptr %p1, %p2 %result = call ptr @llvm.ct.select.p0(i1 %cmp, ptr %a, ptr %b) @@ -132,22 +130,22 @@ define ptr @test_ctselect_ptr_cmp(ptr %p1, ptr %p2, ptr %a, ptr %b) { define ptr @test_ctselect_struct_ptr(i1 %cond, ptr %a, ptr %b) { ; RV64-LABEL: test_ctselect_struct_ptr: ; RV64: # %bb.0: -; RV64-NEXT: slli a0, a0, 63 -; RV64-NEXT: srai a0, a0, 63 -; RV64-NEXT: and a1, a0, a1 -; RV64-NEXT: not a0, a0 -; RV64-NEXT: and a0, a0, a2 -; RV64-NEXT: or a0, a1, a0 +; RV64-NEXT: andi a0, a0, 1 +; RV64-NEXT: addi a3, a0, -1 +; RV64-NEXT: neg a0, a0 +; RV64-NEXT: and a2, a3, a2 +; RV64-NEXT: and a0, a0, a1 +; RV64-NEXT: or a0, a0, a2 ; RV64-NEXT: ret ; ; RV32-LABEL: test_ctselect_struct_ptr: ; RV32: # %bb.0: -; RV32-NEXT: slli a0, a0, 31 -; RV32-NEXT: srai a0, a0, 31 -; RV32-NEXT: and a1, a0, a1 -; RV32-NEXT: not a0, a0 -; RV32-NEXT: and a0, a0, a2 -; RV32-NEXT: or a0, a1, a0 +; RV32-NEXT: andi a0, a0, 1 +; RV32-NEXT: addi a3, a0, -1 +; RV32-NEXT: neg a0, a0 +; RV32-NEXT: 
and a2, a3, a2 +; RV32-NEXT: and a0, a0, a1 +; RV32-NEXT: or a0, a0, a2 ; RV32-NEXT: ret %result = call ptr @llvm.ct.select.p0(i1 %cond, ptr %a, ptr %b) ret ptr %result @@ -187,29 +185,29 @@ define i32 @test_ctselect_deeply_nested(i1 %c1, i1 %c2, i1 %c3, i1 %c4, i32 %a, ; RV32-LABEL: test_ctselect_deeply_nested: ; RV32: # %bb.0: ; RV32-NEXT: lw t0, 0(sp) -; RV32-NEXT: slli a0, a0, 31 -; RV32-NEXT: slli a1, a1, 31 -; RV32-NEXT: slli a2, a2, 31 -; RV32-NEXT: slli a3, a3, 31 -; RV32-NEXT: srai a0, a0, 31 -; RV32-NEXT: srai a1, a1, 31 -; RV32-NEXT: srai a2, a2, 31 -; RV32-NEXT: srai a3, a3, 31 -; RV32-NEXT: and a4, a0, a4 -; RV32-NEXT: not a0, a0 -; RV32-NEXT: and a0, a0, a5 -; RV32-NEXT: not a5, a1 -; RV32-NEXT: and a5, a5, a6 -; RV32-NEXT: not a6, a2 -; RV32-NEXT: and a6, a6, a7 -; RV32-NEXT: not a7, a3 -; RV32-NEXT: or a0, a4, a0 -; RV32-NEXT: and a0, a1, a0 +; RV32-NEXT: andi a0, a0, 1 +; RV32-NEXT: andi a1, a1, 1 +; RV32-NEXT: andi a2, a2, 1 +; RV32-NEXT: andi a3, a3, 1 +; RV32-NEXT: addi t1, a0, -1 +; RV32-NEXT: neg a0, a0 +; RV32-NEXT: and a5, t1, a5 +; RV32-NEXT: neg t1, a1 +; RV32-NEXT: addi a1, a1, -1 +; RV32-NEXT: and a0, a0, a4 +; RV32-NEXT: neg a4, a2 +; RV32-NEXT: addi a2, a2, -1 +; RV32-NEXT: and a1, a1, a6 +; RV32-NEXT: neg a6, a3 +; RV32-NEXT: addi a3, a3, -1 +; RV32-NEXT: and a2, a2, a7 ; RV32-NEXT: or a0, a0, a5 -; RV32-NEXT: and a0, a2, a0 -; RV32-NEXT: or a0, a0, a6 -; RV32-NEXT: and a0, a3, a0 -; RV32-NEXT: and a1, a7, t0 +; RV32-NEXT: and a0, t1, a0 +; RV32-NEXT: or a0, a0, a1 +; RV32-NEXT: and a0, a4, a0 +; RV32-NEXT: or a0, a0, a2 +; RV32-NEXT: and a0, a6, a0 +; RV32-NEXT: and a1, a3, t0 ; RV32-NEXT: or a0, a0, a1 ; RV32-NEXT: ret %sel1 = call i32 @llvm.ct.select.i32(i1 %c1, i32 %a, i32 %b) @@ -345,31 +343,31 @@ define void @cmovznz4_builtin_ctselect(i64 %cin, ptr %x, ptr %y, ptr %r) { ; RV64-LABEL: cmovznz4_builtin_ctselect: ; RV64: # %bb.0: # %entry ; RV64-NEXT: snez a0, a0 -; RV64-NEXT: ld a4, 0(a1) -; RV64-NEXT: ld a5, 0(a2) +; RV64-NEXT: ld a4, 0(a2) +; RV64-NEXT: ld a5, 0(a1) +; RV64-NEXT: neg a6, a0 ; RV64-NEXT: addi a0, a0, -1 -; RV64-NEXT: not a6, a0 -; RV64-NEXT: and a4, a0, a4 -; RV64-NEXT: and a5, a6, a5 -; RV64-NEXT: or a4, a4, a5 +; RV64-NEXT: and a4, a6, a4 +; RV64-NEXT: and a5, a0, a5 +; RV64-NEXT: or a4, a5, a4 ; RV64-NEXT: sd a4, 0(a3) -; RV64-NEXT: ld a4, 8(a1) -; RV64-NEXT: ld a5, 8(a2) -; RV64-NEXT: and a4, a0, a4 -; RV64-NEXT: and a5, a6, a5 -; RV64-NEXT: or a4, a4, a5 +; RV64-NEXT: ld a4, 8(a2) +; RV64-NEXT: ld a5, 8(a1) +; RV64-NEXT: and a4, a6, a4 +; RV64-NEXT: and a5, a0, a5 +; RV64-NEXT: or a4, a5, a4 ; RV64-NEXT: sd a4, 8(a3) -; RV64-NEXT: ld a4, 16(a1) -; RV64-NEXT: ld a5, 16(a2) -; RV64-NEXT: and a4, a0, a4 -; RV64-NEXT: and a5, a6, a5 -; RV64-NEXT: or a4, a4, a5 +; RV64-NEXT: ld a4, 16(a2) +; RV64-NEXT: ld a5, 16(a1) +; RV64-NEXT: and a4, a6, a4 +; RV64-NEXT: and a5, a0, a5 +; RV64-NEXT: or a4, a5, a4 ; RV64-NEXT: sd a4, 16(a3) -; RV64-NEXT: ld a1, 24(a1) ; RV64-NEXT: ld a2, 24(a2) +; RV64-NEXT: ld a1, 24(a1) +; RV64-NEXT: and a2, a6, a2 ; RV64-NEXT: and a0, a0, a1 -; RV64-NEXT: and a1, a6, a2 -; RV64-NEXT: or a0, a0, a1 +; RV64-NEXT: or a0, a0, a2 ; RV64-NEXT: sd a0, 24(a3) ; RV64-NEXT: ret ; diff --git a/llvm/test/CodeGen/RISCV/ctselect-fallback-patterns.ll b/llvm/test/CodeGen/RISCV/ctselect-fallback-patterns.ll index 9a4106c489a6d..27c0d521bb631 100644 --- a/llvm/test/CodeGen/RISCV/ctselect-fallback-patterns.ll +++ b/llvm/test/CodeGen/RISCV/ctselect-fallback-patterns.ll @@ -14,7 +14,6 @@ define i32 @test_ctselect_smin_zero(i32 %x) { ; 
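The cmovznz4 test above is the classic multi-limb constant-time conditional move from field-arithmetic code: the condition is normalized once (snez) and expanded into a mask pair (neg / addi -1) that is reused across all four limbs. A plain-C rendering of the pattern, with hypothetical names and argument order:

#include <stdint.h>

/* One mask pair drives every limb; loads, stores, and ALU ops are the
 * same whichever arm is selected. */
void ct_cmov4(uint64_t cond, const uint64_t a[4], const uint64_t b[4],
              uint64_t r[4]) {
  uint64_t m = -(uint64_t)(cond != 0);  /* the snez + neg of the RV64 checks */
  for (int i = 0; i < 4; i++)
    r[i] = (a[i] & m) | (b[i] & ~m);
}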
RV32: # %bb.0: ; RV32-NEXT: srai a1, a0, 31 ; RV32-NEXT: and a0, a1, a0 -; RV32-NEXT: or a0, a0, zero ; RV32-NEXT: ret %cmp = icmp slt i32 %x, 0 %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 0) @@ -36,7 +35,6 @@ define i32 @test_ctselect_smax_zero(i32 %x) { ; RV32-NEXT: sgtz a1, a0 ; RV32-NEXT: neg a1, a1 ; RV32-NEXT: and a0, a1, a0 -; RV32-NEXT: or a0, a0, zero ; RV32-NEXT: ret %cmp = icmp sgt i32 %x, 0 %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 0) @@ -60,10 +58,10 @@ define i32 @test_ctselect_smin_generic(i32 %x, i32 %y) { ; RV32-LABEL: test_ctselect_smin_generic: ; RV32: # %bb.0: ; RV32-NEXT: slt a2, a0, a1 +; RV32-NEXT: addi a3, a2, -1 ; RV32-NEXT: neg a2, a2 +; RV32-NEXT: and a1, a3, a1 ; RV32-NEXT: and a0, a2, a0 -; RV32-NEXT: not a2, a2 -; RV32-NEXT: and a1, a2, a1 ; RV32-NEXT: or a0, a0, a1 ; RV32-NEXT: ret %cmp = icmp slt i32 %x, %y @@ -88,10 +86,10 @@ define i32 @test_ctselect_smax_generic(i32 %x, i32 %y) { ; RV32-LABEL: test_ctselect_smax_generic: ; RV32: # %bb.0: ; RV32-NEXT: slt a2, a1, a0 +; RV32-NEXT: addi a3, a2, -1 ; RV32-NEXT: neg a2, a2 +; RV32-NEXT: and a1, a3, a1 ; RV32-NEXT: and a0, a2, a0 -; RV32-NEXT: not a2, a2 -; RV32-NEXT: and a1, a2, a1 ; RV32-NEXT: or a0, a0, a1 ; RV32-NEXT: ret %cmp = icmp sgt i32 %x, %y @@ -116,10 +114,10 @@ define i32 @test_ctselect_umin_generic(i32 %x, i32 %y) { ; RV32-LABEL: test_ctselect_umin_generic: ; RV32: # %bb.0: ; RV32-NEXT: sltu a2, a0, a1 +; RV32-NEXT: addi a3, a2, -1 ; RV32-NEXT: neg a2, a2 +; RV32-NEXT: and a1, a3, a1 ; RV32-NEXT: and a0, a2, a0 -; RV32-NEXT: not a2, a2 -; RV32-NEXT: and a1, a2, a1 ; RV32-NEXT: or a0, a0, a1 ; RV32-NEXT: ret %cmp = icmp ult i32 %x, %y @@ -144,10 +142,10 @@ define i32 @test_ctselect_umax_generic(i32 %x, i32 %y) { ; RV32-LABEL: test_ctselect_umax_generic: ; RV32: # %bb.0: ; RV32-NEXT: sltu a2, a1, a0 +; RV32-NEXT: addi a3, a2, -1 ; RV32-NEXT: neg a2, a2 +; RV32-NEXT: and a1, a3, a1 ; RV32-NEXT: and a0, a2, a0 -; RV32-NEXT: not a2, a2 -; RV32-NEXT: and a1, a2, a1 ; RV32-NEXT: or a0, a0, a1 ; RV32-NEXT: ret %cmp = icmp ugt i32 %x, %y @@ -219,7 +217,6 @@ define i32 @test_ctselect_sign_extend(i32 %x) { ; RV32-LABEL: test_ctselect_sign_extend: ; RV32: # %bb.0: ; RV32-NEXT: srai a0, a0, 31 -; RV32-NEXT: or a0, a0, zero ; RV32-NEXT: ret %cmp = icmp slt i32 %x, 0 %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 -1, i32 0) @@ -236,10 +233,7 @@ define i32 @test_ctselect_zero_extend(i32 %x) { ; ; RV32-LABEL: test_ctselect_zero_extend: ; RV32: # %bb.0: -; RV32-NEXT: seqz a0, a0 -; RV32-NEXT: addi a0, a0, -1 -; RV32-NEXT: andi a0, a0, 1 -; RV32-NEXT: or a0, a0, zero +; RV32-NEXT: snez a0, a0 ; RV32-NEXT: ret %cmp = icmp ne i32 %x, 0 %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 1, i32 0) @@ -254,7 +248,6 @@ define i32 @test_ctselect_constant_folding_true(i32 %a, i32 %b) { ; ; RV32-LABEL: test_ctselect_constant_folding_true: ; RV32: # %bb.0: -; RV32-NEXT: or a0, a0, zero ; RV32-NEXT: ret %result = call i32 @llvm.ct.select.i32(i1 true, i32 %a, i32 %b) ret i32 %result @@ -268,7 +261,7 @@ define i32 @test_ctselect_constant_folding_false(i32 %a, i32 %b) { ; ; RV32-LABEL: test_ctselect_constant_folding_false: ; RV32: # %bb.0: -; RV32-NEXT: or a0, zero, a1 +; RV32-NEXT: mv a0, a1 ; RV32-NEXT: ret %result = call i32 @llvm.ct.select.i32(i1 false, i32 %a, i32 %b) ret i32 %result @@ -283,12 +276,7 @@ define i32 @test_ctselect_identical_operands(i1 %cond, i32 %x) { ; ; RV32-LABEL: test_ctselect_identical_operands: ; RV32: # %bb.0: -; RV32-NEXT: slli a0, a0, 31 -; RV32-NEXT: srai a0, 
a0, 31 -; RV32-NEXT: and a2, a0, a1 -; RV32-NEXT: not a0, a0 -; RV32-NEXT: and a0, a0, a1 -; RV32-NEXT: or a0, a2, a0 +; RV32-NEXT: mv a0, a1 ; RV32-NEXT: ret %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %x, i32 %x) ret i32 %result @@ -313,11 +301,11 @@ define i32 @test_ctselect_inverted_condition(i32 %x, i32 %y, i32 %a, i32 %b) { ; RV32: # %bb.0: ; RV32-NEXT: xor a0, a0, a1 ; RV32-NEXT: seqz a0, a0 +; RV32-NEXT: neg a1, a0 ; RV32-NEXT: addi a0, a0, -1 -; RV32-NEXT: and a2, a0, a2 -; RV32-NEXT: not a0, a0 -; RV32-NEXT: and a0, a0, a3 -; RV32-NEXT: or a0, a2, a0 +; RV32-NEXT: and a1, a1, a3 +; RV32-NEXT: and a0, a0, a2 +; RV32-NEXT: or a0, a0, a1 ; RV32-NEXT: ret %cmp = icmp eq i32 %x, %y %not_cmp = xor i1 %cmp, true @@ -351,23 +339,23 @@ define i32 @test_ctselect_chain(i1 %c1, i1 %c2, i1 %c3, i32 %a, i32 %b, i32 %c, ; ; RV32-LABEL: test_ctselect_chain: ; RV32: # %bb.0: -; RV32-NEXT: slli a0, a0, 31 -; RV32-NEXT: slli a1, a1, 31 -; RV32-NEXT: slli a2, a2, 31 -; RV32-NEXT: srai a0, a0, 31 -; RV32-NEXT: srai a1, a1, 31 -; RV32-NEXT: srai a2, a2, 31 -; RV32-NEXT: and a3, a0, a3 -; RV32-NEXT: not a0, a0 -; RV32-NEXT: and a0, a0, a4 -; RV32-NEXT: not a4, a1 -; RV32-NEXT: and a4, a4, a5 -; RV32-NEXT: not a5, a2 -; RV32-NEXT: or a0, a3, a0 -; RV32-NEXT: and a0, a1, a0 +; RV32-NEXT: andi a0, a0, 1 +; RV32-NEXT: andi a1, a1, 1 +; RV32-NEXT: andi a2, a2, 1 +; RV32-NEXT: addi a7, a0, -1 +; RV32-NEXT: neg a0, a0 +; RV32-NEXT: and a4, a7, a4 +; RV32-NEXT: neg a7, a1 +; RV32-NEXT: addi a1, a1, -1 +; RV32-NEXT: and a0, a0, a3 +; RV32-NEXT: neg a3, a2 +; RV32-NEXT: addi a2, a2, -1 +; RV32-NEXT: and a1, a1, a5 ; RV32-NEXT: or a0, a0, a4 -; RV32-NEXT: and a0, a2, a0 -; RV32-NEXT: and a1, a5, a6 +; RV32-NEXT: and a0, a7, a0 +; RV32-NEXT: or a0, a0, a1 +; RV32-NEXT: and a0, a3, a0 +; RV32-NEXT: and a1, a2, a6 ; RV32-NEXT: or a0, a0, a1 ; RV32-NEXT: ret %sel1 = call i32 @llvm.ct.select.i32(i1 %c1, i32 %a, i32 %b) @@ -382,7 +370,6 @@ define i64 @test_ctselect_i64_smin_zero(i64 %x) { ; RV64: # %bb.0: ; RV64-NEXT: srai a1, a0, 63 ; RV64-NEXT: and a0, a1, a0 -; RV64-NEXT: or a0, a0, zero ; RV64-NEXT: ret ; ; RV32-LABEL: test_ctselect_i64_smin_zero: diff --git a/llvm/test/CodeGen/RISCV/ctselect-fallback.ll b/llvm/test/CodeGen/RISCV/ctselect-fallback.ll index 33df9fdefc0d7..1625c8db2d85c 100644 --- a/llvm/test/CodeGen/RISCV/ctselect-fallback.ll +++ b/llvm/test/CodeGen/RISCV/ctselect-fallback.ll @@ -64,12 +64,12 @@ define i32 @test_ctselect_i32(i1 %cond, i32 %a, i32 %b) { ; ; RV32-LABEL: test_ctselect_i32: ; RV32: # %bb.0: -; RV32-NEXT: slli a0, a0, 31 -; RV32-NEXT: srai a0, a0, 31 -; RV32-NEXT: and a1, a0, a1 -; RV32-NEXT: not a0, a0 -; RV32-NEXT: and a0, a0, a2 -; RV32-NEXT: or a0, a1, a0 +; RV32-NEXT: andi a0, a0, 1 +; RV32-NEXT: addi a3, a0, -1 +; RV32-NEXT: neg a0, a0 +; RV32-NEXT: and a2, a3, a2 +; RV32-NEXT: and a0, a0, a1 +; RV32-NEXT: or a0, a0, a2 ; RV32-NEXT: ret %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) ret i32 %result @@ -78,12 +78,12 @@ define i32 @test_ctselect_i32(i1 %cond, i32 %a, i32 %b) { define i64 @test_ctselect_i64(i1 %cond, i64 %a, i64 %b) { ; RV64-LABEL: test_ctselect_i64: ; RV64: # %bb.0: -; RV64-NEXT: slli a0, a0, 63 -; RV64-NEXT: srai a0, a0, 63 -; RV64-NEXT: and a1, a0, a1 -; RV64-NEXT: not a0, a0 -; RV64-NEXT: and a0, a0, a2 -; RV64-NEXT: or a0, a1, a0 +; RV64-NEXT: andi a0, a0, 1 +; RV64-NEXT: addi a3, a0, -1 +; RV64-NEXT: neg a0, a0 +; RV64-NEXT: and a2, a3, a2 +; RV64-NEXT: and a0, a0, a1 +; RV64-NEXT: or a0, a0, a2 ; RV64-NEXT: ret ; ; RV32-LABEL: 
test_ctselect_i64: @@ -105,22 +105,22 @@ define i64 @test_ctselect_i64(i1 %cond, i64 %a, i64 %b) { define ptr @test_ctselect_ptr(i1 %cond, ptr %a, ptr %b) { ; RV64-LABEL: test_ctselect_ptr: ; RV64: # %bb.0: -; RV64-NEXT: slli a0, a0, 63 -; RV64-NEXT: srai a0, a0, 63 -; RV64-NEXT: and a1, a0, a1 -; RV64-NEXT: not a0, a0 -; RV64-NEXT: and a0, a0, a2 -; RV64-NEXT: or a0, a1, a0 +; RV64-NEXT: andi a0, a0, 1 +; RV64-NEXT: addi a3, a0, -1 +; RV64-NEXT: neg a0, a0 +; RV64-NEXT: and a2, a3, a2 +; RV64-NEXT: and a0, a0, a1 +; RV64-NEXT: or a0, a0, a2 ; RV64-NEXT: ret ; ; RV32-LABEL: test_ctselect_ptr: ; RV32: # %bb.0: -; RV32-NEXT: slli a0, a0, 31 -; RV32-NEXT: srai a0, a0, 31 -; RV32-NEXT: and a1, a0, a1 -; RV32-NEXT: not a0, a0 -; RV32-NEXT: and a0, a0, a2 -; RV32-NEXT: or a0, a1, a0 +; RV32-NEXT: andi a0, a0, 1 +; RV32-NEXT: addi a3, a0, -1 +; RV32-NEXT: neg a0, a0 +; RV32-NEXT: and a2, a3, a2 +; RV32-NEXT: and a0, a0, a1 +; RV32-NEXT: or a0, a0, a2 ; RV32-NEXT: ret %result = call ptr @llvm.ct.select.p0(i1 %cond, ptr %a, ptr %b) ret ptr %result @@ -134,7 +134,6 @@ define i32 @test_ctselect_const_true(i32 %a, i32 %b) { ; ; RV32-LABEL: test_ctselect_const_true: ; RV32: # %bb.0: -; RV32-NEXT: or a0, a0, zero ; RV32-NEXT: ret %result = call i32 @llvm.ct.select.i32(i1 true, i32 %a, i32 %b) ret i32 %result @@ -148,7 +147,7 @@ define i32 @test_ctselect_const_false(i32 %a, i32 %b) { ; ; RV32-LABEL: test_ctselect_const_false: ; RV32: # %bb.0: -; RV32-NEXT: or a0, zero, a1 +; RV32-NEXT: mv a0, a1 ; RV32-NEXT: ret %result = call i32 @llvm.ct.select.i32(i1 false, i32 %a, i32 %b) ret i32 %result @@ -173,11 +172,11 @@ define i32 @test_ctselect_icmp_eq(i32 %x, i32 %y, i32 %a, i32 %b) { ; RV32: # %bb.0: ; RV32-NEXT: xor a0, a0, a1 ; RV32-NEXT: snez a0, a0 +; RV32-NEXT: neg a1, a0 ; RV32-NEXT: addi a0, a0, -1 -; RV32-NEXT: and a2, a0, a2 -; RV32-NEXT: not a0, a0 -; RV32-NEXT: and a0, a0, a3 -; RV32-NEXT: or a0, a2, a0 +; RV32-NEXT: and a1, a1, a3 +; RV32-NEXT: and a0, a0, a2 +; RV32-NEXT: or a0, a0, a1 ; RV32-NEXT: ret %cond = icmp eq i32 %x, %y %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) @@ -202,11 +201,11 @@ define i32 @test_ctselect_icmp_ne(i32 %x, i32 %y, i32 %a, i32 %b) { ; RV32: # %bb.0: ; RV32-NEXT: xor a0, a0, a1 ; RV32-NEXT: seqz a0, a0 +; RV32-NEXT: neg a1, a0 ; RV32-NEXT: addi a0, a0, -1 -; RV32-NEXT: and a2, a0, a2 -; RV32-NEXT: not a0, a0 -; RV32-NEXT: and a0, a0, a3 -; RV32-NEXT: or a0, a2, a0 +; RV32-NEXT: and a1, a1, a3 +; RV32-NEXT: and a0, a0, a2 +; RV32-NEXT: or a0, a0, a1 ; RV32-NEXT: ret %cond = icmp ne i32 %x, %y %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) @@ -229,11 +228,11 @@ define i32 @test_ctselect_icmp_slt(i32 %x, i32 %y, i32 %a, i32 %b) { ; RV32-LABEL: test_ctselect_icmp_slt: ; RV32: # %bb.0: ; RV32-NEXT: slt a0, a0, a1 +; RV32-NEXT: addi a1, a0, -1 ; RV32-NEXT: neg a0, a0 -; RV32-NEXT: and a2, a0, a2 -; RV32-NEXT: not a0, a0 -; RV32-NEXT: and a0, a0, a3 -; RV32-NEXT: or a0, a2, a0 +; RV32-NEXT: and a1, a1, a3 +; RV32-NEXT: and a0, a0, a2 +; RV32-NEXT: or a0, a0, a1 ; RV32-NEXT: ret %cond = icmp slt i32 %x, %y %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) @@ -256,11 +255,11 @@ define i32 @test_ctselect_icmp_ult(i32 %x, i32 %y, i32 %a, i32 %b) { ; RV32-LABEL: test_ctselect_icmp_ult: ; RV32: # %bb.0: ; RV32-NEXT: sltu a0, a0, a1 +; RV32-NEXT: addi a1, a0, -1 ; RV32-NEXT: neg a0, a0 -; RV32-NEXT: and a2, a0, a2 -; RV32-NEXT: not a0, a0 -; RV32-NEXT: and a0, a0, a3 -; RV32-NEXT: or a0, a2, a0 +; RV32-NEXT: and a1, a1, a3 +; 
RV32-NEXT: and a0, a0, a2 +; RV32-NEXT: or a0, a0, a1 ; RV32-NEXT: ret %cond = icmp ult i32 %x, %y %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) @@ -285,12 +284,12 @@ define i32 @test_ctselect_load(i1 %cond, ptr %p1, ptr %p2) { ; RV32: # %bb.0: ; RV32-NEXT: lw a1, 0(a1) ; RV32-NEXT: lw a2, 0(a2) -; RV32-NEXT: slli a0, a0, 31 -; RV32-NEXT: srai a0, a0, 31 -; RV32-NEXT: and a1, a0, a1 -; RV32-NEXT: not a0, a0 -; RV32-NEXT: and a0, a0, a2 -; RV32-NEXT: or a0, a1, a0 +; RV32-NEXT: andi a0, a0, 1 +; RV32-NEXT: addi a3, a0, -1 +; RV32-NEXT: neg a0, a0 +; RV32-NEXT: and a2, a3, a2 +; RV32-NEXT: and a0, a0, a1 +; RV32-NEXT: or a0, a0, a2 ; RV32-NEXT: ret %a = load i32, ptr %p1 %b = load i32, ptr %p2 @@ -318,18 +317,18 @@ define i32 @test_ctselect_nested(i1 %cond1, i1 %cond2, i32 %a, i32 %b, i32 %c) { ; ; RV32-LABEL: test_ctselect_nested: ; RV32: # %bb.0: -; RV32-NEXT: slli a1, a1, 31 -; RV32-NEXT: slli a0, a0, 31 -; RV32-NEXT: srai a1, a1, 31 -; RV32-NEXT: srai a0, a0, 31 -; RV32-NEXT: and a2, a1, a2 -; RV32-NEXT: not a1, a1 -; RV32-NEXT: and a1, a1, a3 -; RV32-NEXT: not a3, a0 -; RV32-NEXT: or a1, a2, a1 -; RV32-NEXT: and a0, a0, a1 -; RV32-NEXT: and a3, a3, a4 -; RV32-NEXT: or a0, a0, a3 +; RV32-NEXT: andi a1, a1, 1 +; RV32-NEXT: andi a0, a0, 1 +; RV32-NEXT: addi a5, a1, -1 +; RV32-NEXT: neg a1, a1 +; RV32-NEXT: and a3, a5, a3 +; RV32-NEXT: neg a5, a0 +; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: and a1, a1, a2 +; RV32-NEXT: or a1, a1, a3 +; RV32-NEXT: and a1, a5, a1 +; RV32-NEXT: and a0, a0, a4 +; RV32-NEXT: or a0, a1, a0 ; RV32-NEXT: ret %inner = call i32 @llvm.ct.select.i32(i1 %cond2, i32 %a, i32 %b) %result = call i32 @llvm.ct.select.i32(i1 %cond1, i32 %inner, i32 %c) @@ -350,12 +349,12 @@ define float @test_ctselect_f32(i1 %cond, float %a, float %b) { ; ; RV32-LABEL: test_ctselect_f32: ; RV32: # %bb.0: -; RV32-NEXT: slli a0, a0, 31 -; RV32-NEXT: srai a0, a0, 31 -; RV32-NEXT: and a1, a0, a1 -; RV32-NEXT: not a0, a0 -; RV32-NEXT: and a0, a0, a2 -; RV32-NEXT: or a0, a1, a0 +; RV32-NEXT: andi a0, a0, 1 +; RV32-NEXT: addi a3, a0, -1 +; RV32-NEXT: neg a0, a0 +; RV32-NEXT: and a2, a3, a2 +; RV32-NEXT: and a0, a0, a1 +; RV32-NEXT: or a0, a0, a2 ; RV32-NEXT: ret %result = call float @llvm.ct.select.f32(i1 %cond, float %a, float %b) ret float %result @@ -365,12 +364,12 @@ define float @test_ctselect_f32(i1 %cond, float %a, float %b) { define double @test_ctselect_f64(i1 %cond, double %a, double %b) { ; RV64-LABEL: test_ctselect_f64: ; RV64: # %bb.0: -; RV64-NEXT: slli a0, a0, 63 -; RV64-NEXT: srai a0, a0, 63 -; RV64-NEXT: and a1, a0, a1 -; RV64-NEXT: not a0, a0 -; RV64-NEXT: and a0, a0, a2 -; RV64-NEXT: or a0, a1, a0 +; RV64-NEXT: andi a0, a0, 1 +; RV64-NEXT: addi a3, a0, -1 +; RV64-NEXT: neg a0, a0 +; RV64-NEXT: and a2, a3, a2 +; RV64-NEXT: and a0, a0, a1 +; RV64-NEXT: or a0, a0, a2 ; RV64-NEXT: ret ; ; RV32-LABEL: test_ctselect_f64: @@ -410,18 +409,18 @@ define float @test_ctselect_f32_chain(i1 %cond1, i1 %cond2, float %a, float %b, ; ; RV32-LABEL: test_ctselect_f32_chain: ; RV32: # %bb.0: -; RV32-NEXT: slli a0, a0, 31 -; RV32-NEXT: slli a1, a1, 31 -; RV32-NEXT: srai a0, a0, 31 -; RV32-NEXT: srai a1, a1, 31 -; RV32-NEXT: and a2, a0, a2 -; RV32-NEXT: not a0, a0 -; RV32-NEXT: and a0, a0, a3 -; RV32-NEXT: not a3, a1 -; RV32-NEXT: or a0, a2, a0 -; RV32-NEXT: and a0, a1, a0 -; RV32-NEXT: and a3, a3, a4 +; RV32-NEXT: andi a0, a0, 1 +; RV32-NEXT: andi a1, a1, 1 +; RV32-NEXT: addi a5, a0, -1 +; RV32-NEXT: neg a0, a0 +; RV32-NEXT: and a3, a5, a3 +; RV32-NEXT: neg a5, a1 +; RV32-NEXT: addi 
a1, a1, -1 +; RV32-NEXT: and a0, a0, a2 ; RV32-NEXT: or a0, a0, a3 +; RV32-NEXT: and a0, a5, a0 +; RV32-NEXT: and a1, a1, a4 +; RV32-NEXT: or a0, a0, a1 ; RV32-NEXT: ret %tmp = call float @llvm.ct.select.f32(i1 %cond1, float %a, float %b) %result = call float @llvm.ct.select.f32(i1 %cond2, float %tmp, float %c) @@ -446,12 +445,12 @@ define float @test_ctselect_f32_load(i1 %cond, ptr %p1, ptr %p2) { ; RV32: # %bb.0: ; RV32-NEXT: lw a1, 0(a1) ; RV32-NEXT: lw a2, 0(a2) -; RV32-NEXT: slli a0, a0, 31 -; RV32-NEXT: srai a0, a0, 31 -; RV32-NEXT: and a1, a0, a1 -; RV32-NEXT: not a0, a0 -; RV32-NEXT: and a0, a0, a2 -; RV32-NEXT: or a0, a1, a0 +; RV32-NEXT: andi a0, a0, 1 +; RV32-NEXT: addi a3, a0, -1 +; RV32-NEXT: neg a0, a0 +; RV32-NEXT: and a2, a3, a2 +; RV32-NEXT: and a0, a0, a1 +; RV32-NEXT: or a0, a0, a2 ; RV32-NEXT: ret %a = load float, ptr %p1 %b = load float, ptr %p2 @@ -465,12 +464,12 @@ define double @test_ctselect_f64_load(i1 %cond, ptr %p1, ptr %p2) { ; RV64: # %bb.0: ; RV64-NEXT: ld a1, 0(a1) ; RV64-NEXT: ld a2, 0(a2) -; RV64-NEXT: slli a0, a0, 63 -; RV64-NEXT: srai a0, a0, 63 -; RV64-NEXT: and a1, a0, a1 -; RV64-NEXT: not a0, a0 -; RV64-NEXT: and a0, a0, a2 -; RV64-NEXT: or a0, a1, a0 +; RV64-NEXT: andi a0, a0, 1 +; RV64-NEXT: addi a3, a0, -1 +; RV64-NEXT: neg a0, a0 +; RV64-NEXT: and a2, a3, a2 +; RV64-NEXT: and a0, a0, a1 +; RV64-NEXT: or a0, a0, a2 ; RV64-NEXT: ret ; ; RV32-LABEL: test_ctselect_f64_load: @@ -565,10 +564,10 @@ define float @test_ctselect_f32_arithmetic(i1 %cond, float %x, float %y) { ; RV32-NEXT: mv a0, s1 ; RV32-NEXT: mv a1, s0 ; RV32-NEXT: call __subsf3 -; RV32-NEXT: slli s2, s2, 31 -; RV32-NEXT: srai a1, s2, 31 -; RV32-NEXT: and a2, a1, s3 -; RV32-NEXT: not a1, a1 +; RV32-NEXT: andi a1, s2, 1 +; RV32-NEXT: neg a2, a1 +; RV32-NEXT: addi a1, a1, -1 +; RV32-NEXT: and a2, a2, s3 ; RV32-NEXT: and a0, a1, a0 ; RV32-NEXT: or a0, a2, a0 ; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload diff --git a/llvm/test/CodeGen/RISCV/ctselect-side-effects.ll b/llvm/test/CodeGen/RISCV/ctselect-side-effects.ll index e6a48a914bd0f..60f6350d6508d 100644 --- a/llvm/test/CodeGen/RISCV/ctselect-side-effects.ll +++ b/llvm/test/CodeGen/RISCV/ctselect-side-effects.ll @@ -46,12 +46,12 @@ define i32 @test_protected_no_branch(i1 %cond, i32 %a, i32 %b) { ; ; RV32-LABEL: test_protected_no_branch: ; RV32: # %bb.0: -; RV32-NEXT: slli a0, a0, 31 -; RV32-NEXT: srai a0, a0, 31 -; RV32-NEXT: and a1, a0, a1 -; RV32-NEXT: not a0, a0 -; RV32-NEXT: and a0, a0, a2 -; RV32-NEXT: or a0, a1, a0 +; RV32-NEXT: andi a0, a0, 1 +; RV32-NEXT: addi a3, a0, -1 +; RV32-NEXT: neg a0, a0 +; RV32-NEXT: and a2, a3, a2 +; RV32-NEXT: and a0, a0, a1 +; RV32-NEXT: or a0, a0, a2 ; RV32-NEXT: ret %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) ret i32 %result diff --git a/llvm/test/CodeGen/WebAssembly/ctselect-fallback-edge-cases.ll b/llvm/test/CodeGen/WebAssembly/ctselect-fallback-edge-cases.ll index 4361393ca0339..19f01b37ba8cb 100644 --- a/llvm/test/CodeGen/WebAssembly/ctselect-fallback-edge-cases.ll +++ b/llvm/test/CodeGen/WebAssembly/ctselect-fallback-edge-cases.ll @@ -44,13 +44,13 @@ define i32 @test_ctselect_extremal_values(i1 %cond) { ; W32-NEXT: local.get 0 ; W32-NEXT: i32.const 1 ; W32-NEXT: i32.and -; W32-NEXT: i32.sub ; W32-NEXT: local.tee 0 +; W32-NEXT: i32.sub ; W32-NEXT: i32.const 2147483647 ; W32-NEXT: i32.and ; W32-NEXT: local.get 0 ; W32-NEXT: i32.const -1 -; W32-NEXT: i32.xor +; W32-NEXT: i32.add ; W32-NEXT: i32.const -2147483648 ; W32-NEXT: i32.and ; W32-NEXT: i32.or @@ -63,13 +63,13 @@ define 
i32 @test_ctselect_extremal_values(i1 %cond) { ; W64-NEXT: local.get 0 ; W64-NEXT: i32.const 1 ; W64-NEXT: i32.and -; W64-NEXT: i32.sub ; W64-NEXT: local.tee 0 +; W64-NEXT: i32.sub ; W64-NEXT: i32.const 2147483647 ; W64-NEXT: i32.and ; W64-NEXT: local.get 0 ; W64-NEXT: i32.const -1 -; W64-NEXT: i32.xor +; W64-NEXT: i32.add ; W64-NEXT: i32.const -2147483648 ; W64-NEXT: i32.and ; W64-NEXT: i32.or @@ -90,8 +90,6 @@ define ptr @test_ctselect_null_ptr(i1 %cond, ptr %ptr) { ; W32-NEXT: i32.sub ; W32-NEXT: local.get 1 ; W32-NEXT: i32.and -; W32-NEXT: i32.const 0 -; W32-NEXT: i32.or ; W32-NEXT: # fallthrough-return ; ; W64-LABEL: test_ctselect_null_ptr: @@ -105,8 +103,6 @@ define ptr @test_ctselect_null_ptr(i1 %cond, ptr %ptr) { ; W64-NEXT: i64.sub ; W64-NEXT: local.get 1 ; W64-NEXT: i64.and -; W64-NEXT: i64.const 0 -; W64-NEXT: i64.or ; W64-NEXT: # fallthrough-return %result = call ptr @llvm.ct.select.p0(i1 %cond, ptr %ptr, ptr null) ret ptr %result @@ -121,13 +117,13 @@ define ptr @test_ctselect_function_ptr(i1 %cond, ptr %func1, ptr %func2) { ; W32-NEXT: local.get 0 ; W32-NEXT: i32.const 1 ; W32-NEXT: i32.and -; W32-NEXT: i32.sub ; W32-NEXT: local.tee 0 +; W32-NEXT: i32.sub ; W32-NEXT: local.get 1 ; W32-NEXT: i32.and ; W32-NEXT: local.get 0 ; W32-NEXT: i32.const -1 -; W32-NEXT: i32.xor +; W32-NEXT: i32.add ; W32-NEXT: local.get 2 ; W32-NEXT: i32.and ; W32-NEXT: i32.or @@ -142,13 +138,13 @@ define ptr @test_ctselect_function_ptr(i1 %cond, ptr %func1, ptr %func2) { ; W64-NEXT: i64.extend_i32_u ; W64-NEXT: i64.const 1 ; W64-NEXT: i64.and -; W64-NEXT: i64.sub ; W64-NEXT: local.tee 3 +; W64-NEXT: i64.sub ; W64-NEXT: local.get 1 ; W64-NEXT: i64.and ; W64-NEXT: local.get 3 ; W64-NEXT: i64.const -1 -; W64-NEXT: i64.xor +; W64-NEXT: i64.add ; W64-NEXT: local.get 2 ; W64-NEXT: i64.and ; W64-NEXT: i64.or @@ -214,13 +210,13 @@ define ptr @test_ctselect_struct_ptr(i1 %cond, ptr %a, ptr %b) { ; W32-NEXT: local.get 0 ; W32-NEXT: i32.const 1 ; W32-NEXT: i32.and -; W32-NEXT: i32.sub ; W32-NEXT: local.tee 0 +; W32-NEXT: i32.sub ; W32-NEXT: local.get 1 ; W32-NEXT: i32.and ; W32-NEXT: local.get 0 ; W32-NEXT: i32.const -1 -; W32-NEXT: i32.xor +; W32-NEXT: i32.add ; W32-NEXT: local.get 2 ; W32-NEXT: i32.and ; W32-NEXT: i32.or @@ -235,13 +231,13 @@ define ptr @test_ctselect_struct_ptr(i1 %cond, ptr %a, ptr %b) { ; W64-NEXT: i64.extend_i32_u ; W64-NEXT: i64.const 1 ; W64-NEXT: i64.and -; W64-NEXT: i64.sub ; W64-NEXT: local.tee 3 +; W64-NEXT: i64.sub ; W64-NEXT: local.get 1 ; W64-NEXT: i64.and ; W64-NEXT: local.get 3 ; W64-NEXT: i64.const -1 -; W64-NEXT: i64.xor +; W64-NEXT: i64.add ; W64-NEXT: local.get 2 ; W64-NEXT: i64.and ; W64-NEXT: i64.or @@ -259,52 +255,52 @@ define i32 @test_ctselect_deeply_nested(i1 %c1, i1 %c2, i1 %c3, i1 %c4, i32 %a, ; W32-NEXT: local.get 3 ; W32-NEXT: i32.const 1 ; W32-NEXT: i32.and -; W32-NEXT: i32.sub ; W32-NEXT: local.tee 3 +; W32-NEXT: i32.sub ; W32-NEXT: i32.const 0 ; W32-NEXT: local.get 2 ; W32-NEXT: i32.const 1 ; W32-NEXT: i32.and -; W32-NEXT: i32.sub ; W32-NEXT: local.tee 2 +; W32-NEXT: i32.sub ; W32-NEXT: i32.const 0 ; W32-NEXT: local.get 1 ; W32-NEXT: i32.const 1 ; W32-NEXT: i32.and -; W32-NEXT: i32.sub ; W32-NEXT: local.tee 1 +; W32-NEXT: i32.sub ; W32-NEXT: i32.const 0 ; W32-NEXT: local.get 0 ; W32-NEXT: i32.const 1 ; W32-NEXT: i32.and -; W32-NEXT: i32.sub ; W32-NEXT: local.tee 0 +; W32-NEXT: i32.sub ; W32-NEXT: local.get 4 ; W32-NEXT: i32.and ; W32-NEXT: local.get 0 ; W32-NEXT: i32.const -1 -; W32-NEXT: i32.xor +; W32-NEXT: i32.add ; W32-NEXT: local.get 5 ; W32-NEXT: i32.and ; 
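Nesting adds nothing special: each ct.select consumes the previous result as an ordinary operand, so a four-deep chain like test_ctselect_deeply_nested here stays branch-free end to end. A sketch under the same mask convention as above (helper and argument names are illustrative):

#include <stdint.h>

static inline uint32_t ct_select32(uint32_t c, uint32_t t, uint32_t f) {
  uint32_t m = -(c & 1u);
  return (t & m) | (f & ~m);
}

/* Innermost select first, mirroring how %sel1 feeds the next level in
 * the IR; every arm of every level is evaluated regardless of the
 * condition values. */
uint32_t nested(uint32_t c1, uint32_t c2, uint32_t c3, uint32_t c4,
                uint32_t a, uint32_t b, uint32_t c, uint32_t d, uint32_t e) {
  return ct_select32(c4,
           ct_select32(c3,
             ct_select32(c2,
               ct_select32(c1, a, b), c), d), e);
}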
W32-NEXT: i32.or ; W32-NEXT: i32.and ; W32-NEXT: local.get 1 ; W32-NEXT: i32.const -1 -; W32-NEXT: i32.xor +; W32-NEXT: i32.add ; W32-NEXT: local.get 6 ; W32-NEXT: i32.and ; W32-NEXT: i32.or ; W32-NEXT: i32.and ; W32-NEXT: local.get 2 ; W32-NEXT: i32.const -1 -; W32-NEXT: i32.xor +; W32-NEXT: i32.add ; W32-NEXT: local.get 7 ; W32-NEXT: i32.and ; W32-NEXT: i32.or ; W32-NEXT: i32.and ; W32-NEXT: local.get 3 ; W32-NEXT: i32.const -1 -; W32-NEXT: i32.xor +; W32-NEXT: i32.add ; W32-NEXT: local.get 8 ; W32-NEXT: i32.and ; W32-NEXT: i32.or @@ -317,52 +313,52 @@ define i32 @test_ctselect_deeply_nested(i1 %c1, i1 %c2, i1 %c3, i1 %c4, i32 %a, ; W64-NEXT: local.get 3 ; W64-NEXT: i32.const 1 ; W64-NEXT: i32.and -; W64-NEXT: i32.sub ; W64-NEXT: local.tee 3 +; W64-NEXT: i32.sub ; W64-NEXT: i32.const 0 ; W64-NEXT: local.get 2 ; W64-NEXT: i32.const 1 ; W64-NEXT: i32.and -; W64-NEXT: i32.sub ; W64-NEXT: local.tee 2 +; W64-NEXT: i32.sub ; W64-NEXT: i32.const 0 ; W64-NEXT: local.get 1 ; W64-NEXT: i32.const 1 ; W64-NEXT: i32.and -; W64-NEXT: i32.sub ; W64-NEXT: local.tee 1 +; W64-NEXT: i32.sub ; W64-NEXT: i32.const 0 ; W64-NEXT: local.get 0 ; W64-NEXT: i32.const 1 ; W64-NEXT: i32.and -; W64-NEXT: i32.sub ; W64-NEXT: local.tee 0 +; W64-NEXT: i32.sub ; W64-NEXT: local.get 4 ; W64-NEXT: i32.and ; W64-NEXT: local.get 0 ; W64-NEXT: i32.const -1 -; W64-NEXT: i32.xor +; W64-NEXT: i32.add ; W64-NEXT: local.get 5 ; W64-NEXT: i32.and ; W64-NEXT: i32.or ; W64-NEXT: i32.and ; W64-NEXT: local.get 1 ; W64-NEXT: i32.const -1 -; W64-NEXT: i32.xor +; W64-NEXT: i32.add ; W64-NEXT: local.get 6 ; W64-NEXT: i32.and ; W64-NEXT: i32.or ; W64-NEXT: i32.and ; W64-NEXT: local.get 2 ; W64-NEXT: i32.const -1 -; W64-NEXT: i32.xor +; W64-NEXT: i32.add ; W64-NEXT: local.get 7 ; W64-NEXT: i32.and ; W64-NEXT: i32.or ; W64-NEXT: i32.and ; W64-NEXT: local.get 3 ; W64-NEXT: i32.const -1 -; W64-NEXT: i32.xor +; W64-NEXT: i32.add ; W64-NEXT: local.get 8 ; W64-NEXT: i32.and ; W64-NEXT: i32.or diff --git a/llvm/test/CodeGen/WebAssembly/ctselect-fallback-patterns.ll b/llvm/test/CodeGen/WebAssembly/ctselect-fallback-patterns.ll index 0316249d869f6..5c8d66249a95a 100644 --- a/llvm/test/CodeGen/WebAssembly/ctselect-fallback-patterns.ll +++ b/llvm/test/CodeGen/WebAssembly/ctselect-fallback-patterns.ll @@ -12,8 +12,6 @@ define i32 @test_ctselect_smin_zero(i32 %x) { ; W32-NEXT: i32.shr_s ; W32-NEXT: local.get 0 ; W32-NEXT: i32.and -; W32-NEXT: i32.const 0 -; W32-NEXT: i32.or ; W32-NEXT: # fallthrough-return ; ; W64-LABEL: test_ctselect_smin_zero: @@ -24,8 +22,6 @@ define i32 @test_ctselect_smin_zero(i32 %x) { ; W64-NEXT: i32.shr_s ; W64-NEXT: local.get 0 ; W64-NEXT: i32.and -; W64-NEXT: i32.const 0 -; W64-NEXT: i32.or ; W64-NEXT: # fallthrough-return %cmp = icmp slt i32 %x, 0 %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 0) @@ -37,31 +33,23 @@ define i32 @test_ctselect_smax_zero(i32 %x) { ; W32-LABEL: test_ctselect_smax_zero: ; W32: .functype test_ctselect_smax_zero (i32) -> (i32) ; W32-NEXT: # %bb.0: -; W32-NEXT: i32.const -1 +; W32-NEXT: local.get 0 ; W32-NEXT: i32.const 0 ; W32-NEXT: local.get 0 ; W32-NEXT: i32.const 0 ; W32-NEXT: i32.gt_s ; W32-NEXT: i32.select -; W32-NEXT: local.get 0 -; W32-NEXT: i32.and -; W32-NEXT: i32.const 0 -; W32-NEXT: i32.or ; W32-NEXT: # fallthrough-return ; ; W64-LABEL: test_ctselect_smax_zero: ; W64: .functype test_ctselect_smax_zero (i32) -> (i32) ; W64-NEXT: # %bb.0: -; W64-NEXT: i32.const -1 +; W64-NEXT: local.get 0 ; W64-NEXT: i32.const 0 ; W64-NEXT: local.get 0 ; W64-NEXT: i32.const 0 ; W64-NEXT: 
i32.gt_s ; W64-NEXT: i32.select -; W64-NEXT: local.get 0 -; W64-NEXT: i32.and -; W64-NEXT: i32.const 0 -; W64-NEXT: i32.or ; W64-NEXT: # fallthrough-return %cmp = icmp sgt i32 %x, 0 %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 0) @@ -362,8 +350,6 @@ define i32 @test_ctselect_sign_extend(i32 %x) { ; W32-NEXT: local.get 0 ; W32-NEXT: i32.const 31 ; W32-NEXT: i32.shr_s -; W32-NEXT: i32.const 0 -; W32-NEXT: i32.or ; W32-NEXT: # fallthrough-return ; ; W64-LABEL: test_ctselect_sign_extend: @@ -372,8 +358,6 @@ define i32 @test_ctselect_sign_extend(i32 %x) { ; W64-NEXT: local.get 0 ; W64-NEXT: i32.const 31 ; W64-NEXT: i32.shr_s -; W64-NEXT: i32.const 0 -; W64-NEXT: i32.or ; W64-NEXT: # fallthrough-return %cmp = icmp slt i32 %x, 0 %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 -1, i32 0) @@ -385,27 +369,17 @@ define i32 @test_ctselect_zero_extend(i32 %x) { ; W32-LABEL: test_ctselect_zero_extend: ; W32: .functype test_ctselect_zero_extend (i32) -> (i32) ; W32-NEXT: # %bb.0: -; W32-NEXT: i32.const -1 -; W32-NEXT: i32.const 0 ; W32-NEXT: local.get 0 -; W32-NEXT: i32.select -; W32-NEXT: i32.const 1 -; W32-NEXT: i32.and ; W32-NEXT: i32.const 0 -; W32-NEXT: i32.or +; W32-NEXT: i32.ne ; W32-NEXT: # fallthrough-return ; ; W64-LABEL: test_ctselect_zero_extend: ; W64: .functype test_ctselect_zero_extend (i32) -> (i32) ; W64-NEXT: # %bb.0: -; W64-NEXT: i32.const -1 -; W64-NEXT: i32.const 0 ; W64-NEXT: local.get 0 -; W64-NEXT: i32.select -; W64-NEXT: i32.const 1 -; W64-NEXT: i32.and ; W64-NEXT: i32.const 0 -; W64-NEXT: i32.or +; W64-NEXT: i32.ne ; W64-NEXT: # fallthrough-return %cmp = icmp ne i32 %x, 0 %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 1, i32 0) @@ -418,16 +392,12 @@ define i32 @test_ctselect_constant_folding_true(i32 %a, i32 %b) { ; W32: .functype test_ctselect_constant_folding_true (i32, i32) -> (i32) ; W32-NEXT: # %bb.0: ; W32-NEXT: local.get 0 -; W32-NEXT: i32.const 0 -; W32-NEXT: i32.or ; W32-NEXT: # fallthrough-return ; ; W64-LABEL: test_ctselect_constant_folding_true: ; W64: .functype test_ctselect_constant_folding_true (i32, i32) -> (i32) ; W64-NEXT: # %bb.0: ; W64-NEXT: local.get 0 -; W64-NEXT: i32.const 0 -; W64-NEXT: i32.or ; W64-NEXT: # fallthrough-return %result = call i32 @llvm.ct.select.i32(i1 true, i32 %a, i32 %b) ret i32 %result @@ -437,17 +407,13 @@ define i32 @test_ctselect_constant_folding_false(i32 %a, i32 %b) { ; W32-LABEL: test_ctselect_constant_folding_false: ; W32: .functype test_ctselect_constant_folding_false (i32, i32) -> (i32) ; W32-NEXT: # %bb.0: -; W32-NEXT: i32.const 0 ; W32-NEXT: local.get 1 -; W32-NEXT: i32.or ; W32-NEXT: # fallthrough-return ; ; W64-LABEL: test_ctselect_constant_folding_false: ; W64: .functype test_ctselect_constant_folding_false (i32, i32) -> (i32) ; W64-NEXT: # %bb.0: -; W64-NEXT: i32.const 0 ; W64-NEXT: local.get 1 -; W64-NEXT: i32.or ; W64-NEXT: # fallthrough-return %result = call i32 @llvm.ct.select.i32(i1 false, i32 %a, i32 %b) ret i32 %result @@ -458,39 +424,13 @@ define i32 @test_ctselect_identical_operands(i1 %cond, i32 %x) { ; W32-LABEL: test_ctselect_identical_operands: ; W32: .functype test_ctselect_identical_operands (i32, i32) -> (i32) ; W32-NEXT: # %bb.0: -; W32-NEXT: i32.const 0 -; W32-NEXT: local.get 0 -; W32-NEXT: i32.const 1 -; W32-NEXT: i32.and -; W32-NEXT: i32.sub -; W32-NEXT: local.tee 0 ; W32-NEXT: local.get 1 -; W32-NEXT: i32.and -; W32-NEXT: local.get 0 -; W32-NEXT: i32.const -1 -; W32-NEXT: i32.xor -; W32-NEXT: local.get 1 -; W32-NEXT: i32.and -; W32-NEXT: i32.or ; W32-NEXT: # fallthrough-return 
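Several of the regenerated checks in this file are ordinary algebraic folds of the mask form rather than weakened protection: with m = -(c & 1), (x & m) | (x & ~m) is x, (x & m) | (0 & ~m) is x & m, and selecting 1 against 0 is just the condition bit, which is why test_ctselect_zero_extend collapses to a single i32.ne. A small self-checking C sketch of those identities (names are mine):

#include <assert.h>
#include <stdint.h>

static inline uint32_t ct_select32(uint32_t c, uint32_t t, uint32_t f) {
  uint32_t m = -(c & 1u);
  return (t & m) | (f & ~m);
}

int main(void) {
  uint32_t x = 0xDEADBEEFu;
  for (uint32_t c = 0; c <= 1u; c++) {
    assert(ct_select32(c, x, x) == x);         /* identical operands fold */
    assert(ct_select32(c, x, 0) == (x & -c));  /* the OR with zero is dead */
    assert(ct_select32(c, 1, 0) == c);         /* just the condition bit */
  }
  assert(ct_select32(1, 5, 6) == 5);           /* known-true condition */
  assert(ct_select32(0, 5, 6) == 6);           /* known-false condition */
  return 0;
}

The i32.const 0 / i32.or pairs the earlier output emitted were already dead; dropping them changes nothing about the data-independence of the remaining sequence.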
; ; W64-LABEL: test_ctselect_identical_operands: ; W64: .functype test_ctselect_identical_operands (i32, i32) -> (i32) ; W64-NEXT: # %bb.0: -; W64-NEXT: i32.const 0 -; W64-NEXT: local.get 0 -; W64-NEXT: i32.const 1 -; W64-NEXT: i32.and -; W64-NEXT: i32.sub -; W64-NEXT: local.tee 0 ; W64-NEXT: local.get 1 -; W64-NEXT: i32.and -; W64-NEXT: local.get 0 -; W64-NEXT: i32.const -1 -; W64-NEXT: i32.xor -; W64-NEXT: local.get 1 -; W64-NEXT: i32.and -; W64-NEXT: i32.or ; W64-NEXT: # fallthrough-return %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %x, i32 %x) ret i32 %result @@ -552,39 +492,39 @@ define i32 @test_ctselect_chain(i1 %c1, i1 %c2, i1 %c3, i32 %a, i32 %b, i32 %c, ; W32-NEXT: local.get 2 ; W32-NEXT: i32.const 1 ; W32-NEXT: i32.and -; W32-NEXT: i32.sub ; W32-NEXT: local.tee 2 +; W32-NEXT: i32.sub ; W32-NEXT: i32.const 0 ; W32-NEXT: local.get 1 ; W32-NEXT: i32.const 1 ; W32-NEXT: i32.and -; W32-NEXT: i32.sub ; W32-NEXT: local.tee 1 +; W32-NEXT: i32.sub ; W32-NEXT: i32.const 0 ; W32-NEXT: local.get 0 ; W32-NEXT: i32.const 1 ; W32-NEXT: i32.and -; W32-NEXT: i32.sub ; W32-NEXT: local.tee 0 +; W32-NEXT: i32.sub ; W32-NEXT: local.get 3 ; W32-NEXT: i32.and ; W32-NEXT: local.get 0 ; W32-NEXT: i32.const -1 -; W32-NEXT: i32.xor +; W32-NEXT: i32.add ; W32-NEXT: local.get 4 ; W32-NEXT: i32.and ; W32-NEXT: i32.or ; W32-NEXT: i32.and ; W32-NEXT: local.get 1 ; W32-NEXT: i32.const -1 -; W32-NEXT: i32.xor +; W32-NEXT: i32.add ; W32-NEXT: local.get 5 ; W32-NEXT: i32.and ; W32-NEXT: i32.or ; W32-NEXT: i32.and ; W32-NEXT: local.get 2 ; W32-NEXT: i32.const -1 -; W32-NEXT: i32.xor +; W32-NEXT: i32.add ; W32-NEXT: local.get 6 ; W32-NEXT: i32.and ; W32-NEXT: i32.or @@ -597,39 +537,39 @@ define i32 @test_ctselect_chain(i1 %c1, i1 %c2, i1 %c3, i32 %a, i32 %b, i32 %c, ; W64-NEXT: local.get 2 ; W64-NEXT: i32.const 1 ; W64-NEXT: i32.and -; W64-NEXT: i32.sub ; W64-NEXT: local.tee 2 +; W64-NEXT: i32.sub ; W64-NEXT: i32.const 0 ; W64-NEXT: local.get 1 ; W64-NEXT: i32.const 1 ; W64-NEXT: i32.and -; W64-NEXT: i32.sub ; W64-NEXT: local.tee 1 +; W64-NEXT: i32.sub ; W64-NEXT: i32.const 0 ; W64-NEXT: local.get 0 ; W64-NEXT: i32.const 1 ; W64-NEXT: i32.and -; W64-NEXT: i32.sub ; W64-NEXT: local.tee 0 +; W64-NEXT: i32.sub ; W64-NEXT: local.get 3 ; W64-NEXT: i32.and ; W64-NEXT: local.get 0 ; W64-NEXT: i32.const -1 -; W64-NEXT: i32.xor +; W64-NEXT: i32.add ; W64-NEXT: local.get 4 ; W64-NEXT: i32.and ; W64-NEXT: i32.or ; W64-NEXT: i32.and ; W64-NEXT: local.get 1 ; W64-NEXT: i32.const -1 -; W64-NEXT: i32.xor +; W64-NEXT: i32.add ; W64-NEXT: local.get 5 ; W64-NEXT: i32.and ; W64-NEXT: i32.or ; W64-NEXT: i32.and ; W64-NEXT: local.get 2 ; W64-NEXT: i32.const -1 -; W64-NEXT: i32.xor +; W64-NEXT: i32.add ; W64-NEXT: local.get 6 ; W64-NEXT: i32.and ; W64-NEXT: i32.or @@ -650,8 +590,6 @@ define i64 @test_ctselect_i64_smin_zero(i64 %x) { ; W32-NEXT: i64.shr_s ; W32-NEXT: local.get 0 ; W32-NEXT: i64.and -; W32-NEXT: i64.const 0 -; W32-NEXT: i64.or ; W32-NEXT: # fallthrough-return ; ; W64-LABEL: test_ctselect_i64_smin_zero: @@ -662,8 +600,6 @@ define i64 @test_ctselect_i64_smin_zero(i64 %x) { ; W64-NEXT: i64.shr_s ; W64-NEXT: local.get 0 ; W64-NEXT: i64.and -; W64-NEXT: i64.const 0 -; W64-NEXT: i64.or ; W64-NEXT: # fallthrough-return %cmp = icmp slt i64 %x, 0 %result = call i64 @llvm.ct.select.i64(i1 %cmp, i64 %x, i64 0) diff --git a/llvm/test/CodeGen/WebAssembly/ctselect-fallback-vector.ll b/llvm/test/CodeGen/WebAssembly/ctselect-fallback-vector.ll index cac4f721beccc..daa7370fb481a 100644 --- 
a/llvm/test/CodeGen/WebAssembly/ctselect-fallback-vector.ll +++ b/llvm/test/CodeGen/WebAssembly/ctselect-fallback-vector.ll @@ -6,40 +6,30 @@ define <4 x i32> @test_ctselect_v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b) { ; WASM32-LABEL: test_ctselect_v4i32: ; WASM32: .functype test_ctselect_v4i32 (i32, v128, v128) -> (v128) -; WASM32-NEXT: .local v128 ; WASM32-NEXT: # %bb.0: +; WASM32-NEXT: local.get 1 +; WASM32-NEXT: local.get 2 ; WASM32-NEXT: local.get 0 ; WASM32-NEXT: i32x4.splat ; WASM32-NEXT: i32.const 31 ; WASM32-NEXT: i32x4.shl ; WASM32-NEXT: i32.const 31 ; WASM32-NEXT: i32x4.shr_s -; WASM32-NEXT: local.tee 3 -; WASM32-NEXT: local.get 1 -; WASM32-NEXT: v128.and -; WASM32-NEXT: local.get 2 -; WASM32-NEXT: local.get 3 -; WASM32-NEXT: v128.andnot -; WASM32-NEXT: v128.or +; WASM32-NEXT: v128.bitselect ; WASM32-NEXT: # fallthrough-return ; ; WASM64-LABEL: test_ctselect_v4i32: ; WASM64: .functype test_ctselect_v4i32 (i32, v128, v128) -> (v128) -; WASM64-NEXT: .local v128 ; WASM64-NEXT: # %bb.0: +; WASM64-NEXT: local.get 1 +; WASM64-NEXT: local.get 2 ; WASM64-NEXT: local.get 0 ; WASM64-NEXT: i32x4.splat ; WASM64-NEXT: i32.const 31 ; WASM64-NEXT: i32x4.shl ; WASM64-NEXT: i32.const 31 ; WASM64-NEXT: i32x4.shr_s -; WASM64-NEXT: local.tee 3 -; WASM64-NEXT: local.get 1 -; WASM64-NEXT: v128.and -; WASM64-NEXT: local.get 2 -; WASM64-NEXT: local.get 3 -; WASM64-NEXT: v128.andnot -; WASM64-NEXT: v128.or +; WASM64-NEXT: v128.bitselect ; WASM64-NEXT: # fallthrough-return %result = call <4 x i32> @llvm.ct.select.v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b) ret <4 x i32> %result @@ -49,40 +39,30 @@ define <4 x i32> @test_ctselect_v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b) { define <8 x i16> @test_ctselect_v8i16(i1 %cond, <8 x i16> %a, <8 x i16> %b) { ; WASM32-LABEL: test_ctselect_v8i16: ; WASM32: .functype test_ctselect_v8i16 (i32, v128, v128) -> (v128) -; WASM32-NEXT: .local v128 ; WASM32-NEXT: # %bb.0: +; WASM32-NEXT: local.get 1 +; WASM32-NEXT: local.get 2 ; WASM32-NEXT: local.get 0 ; WASM32-NEXT: i16x8.splat ; WASM32-NEXT: i32.const 15 ; WASM32-NEXT: i16x8.shl ; WASM32-NEXT: i32.const 15 ; WASM32-NEXT: i16x8.shr_s -; WASM32-NEXT: local.tee 3 -; WASM32-NEXT: local.get 1 -; WASM32-NEXT: v128.and -; WASM32-NEXT: local.get 2 -; WASM32-NEXT: local.get 3 -; WASM32-NEXT: v128.andnot -; WASM32-NEXT: v128.or +; WASM32-NEXT: v128.bitselect ; WASM32-NEXT: # fallthrough-return ; ; WASM64-LABEL: test_ctselect_v8i16: ; WASM64: .functype test_ctselect_v8i16 (i32, v128, v128) -> (v128) -; WASM64-NEXT: .local v128 ; WASM64-NEXT: # %bb.0: +; WASM64-NEXT: local.get 1 +; WASM64-NEXT: local.get 2 ; WASM64-NEXT: local.get 0 ; WASM64-NEXT: i16x8.splat ; WASM64-NEXT: i32.const 15 ; WASM64-NEXT: i16x8.shl ; WASM64-NEXT: i32.const 15 ; WASM64-NEXT: i16x8.shr_s -; WASM64-NEXT: local.tee 3 -; WASM64-NEXT: local.get 1 -; WASM64-NEXT: v128.and -; WASM64-NEXT: local.get 2 -; WASM64-NEXT: local.get 3 -; WASM64-NEXT: v128.andnot -; WASM64-NEXT: v128.or +; WASM64-NEXT: v128.bitselect ; WASM64-NEXT: # fallthrough-return %result = call <8 x i16> @llvm.ct.select.v8i16(i1 %cond, <8 x i16> %a, <8 x i16> %b) ret <8 x i16> %result @@ -92,40 +72,30 @@ define <8 x i16> @test_ctselect_v8i16(i1 %cond, <8 x i16> %a, <8 x i16> %b) { define <16 x i8> @test_ctselect_v16i8(i1 %cond, <16 x i8> %a, <16 x i8> %b) { ; WASM32-LABEL: test_ctselect_v16i8: ; WASM32: .functype test_ctselect_v16i8 (i32, v128, v128) -> (v128) -; WASM32-NEXT: .local v128 ; WASM32-NEXT: # %bb.0: +; WASM32-NEXT: local.get 1 +; WASM32-NEXT: local.get 2 ; WASM32-NEXT: local.get 0 
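On the vector side the fallback now lands on WebAssembly's native v128.bitselect: the i1 condition is splatted into every lane, shifted so the bit sits in the lane's sign position, then arithmetic-shifted back to yield an all-ones or all-zeros lane mask, and bitselect computes (v1 & mask) | (v2 & ~mask) in a single instruction. A scalar C model of one 32-bit lane (assumed names; the real operation is 128 bits wide):

#include <stdint.h>

/* What i32x4.splat + i32x4.shl 31 + i32x4.shr_s 31 achieves per lane,
 * written without shifts to sidestep signed-shift pitfalls in portable C. */
static inline uint32_t lane_mask(uint32_t cond) {
  return -(cond & 1u);  /* 0 -> 0x00000000, 1 -> 0xFFFFFFFF */
}

/* v128.bitselect semantics, one 32-bit lane at a time. */
static inline uint32_t bitselect32(uint32_t v1, uint32_t v2, uint32_t mask) {
  return (v1 & mask) | (v2 & ~mask);
}

Because bitselect is a single data-independent SIMD instruction, the scratch .local and the and/andnot/or triple disappear without reintroducing any branch.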
; WASM32-NEXT: i8x16.splat ; WASM32-NEXT: i32.const 7 ; WASM32-NEXT: i8x16.shl ; WASM32-NEXT: i32.const 7 ; WASM32-NEXT: i8x16.shr_s -; WASM32-NEXT: local.tee 3 -; WASM32-NEXT: local.get 1 -; WASM32-NEXT: v128.and -; WASM32-NEXT: local.get 2 -; WASM32-NEXT: local.get 3 -; WASM32-NEXT: v128.andnot -; WASM32-NEXT: v128.or +; WASM32-NEXT: v128.bitselect ; WASM32-NEXT: # fallthrough-return ; ; WASM64-LABEL: test_ctselect_v16i8: ; WASM64: .functype test_ctselect_v16i8 (i32, v128, v128) -> (v128) -; WASM64-NEXT: .local v128 ; WASM64-NEXT: # %bb.0: +; WASM64-NEXT: local.get 1 +; WASM64-NEXT: local.get 2 ; WASM64-NEXT: local.get 0 ; WASM64-NEXT: i8x16.splat ; WASM64-NEXT: i32.const 7 ; WASM64-NEXT: i8x16.shl ; WASM64-NEXT: i32.const 7 ; WASM64-NEXT: i8x16.shr_s -; WASM64-NEXT: local.tee 3 -; WASM64-NEXT: local.get 1 -; WASM64-NEXT: v128.and -; WASM64-NEXT: local.get 2 -; WASM64-NEXT: local.get 3 -; WASM64-NEXT: v128.andnot -; WASM64-NEXT: v128.or +; WASM64-NEXT: v128.bitselect ; WASM64-NEXT: # fallthrough-return %result = call <16 x i8> @llvm.ct.select.v16i8(i1 %cond, <16 x i8> %a, <16 x i8> %b) ret <16 x i8> %result @@ -135,40 +105,30 @@ define <16 x i8> @test_ctselect_v16i8(i1 %cond, <16 x i8> %a, <16 x i8> %b) { define <2 x i64> @test_ctselect_v2i64(i1 %cond, <2 x i64> %a, <2 x i64> %b) { ; WASM32-LABEL: test_ctselect_v2i64: ; WASM32: .functype test_ctselect_v2i64 (i32, v128, v128) -> (v128) -; WASM32-NEXT: .local v128 ; WASM32-NEXT: # %bb.0: +; WASM32-NEXT: local.get 1 +; WASM32-NEXT: local.get 2 ; WASM32-NEXT: local.get 0 ; WASM32-NEXT: i32x4.splat ; WASM32-NEXT: i32.const 63 ; WASM32-NEXT: i64x2.shl ; WASM32-NEXT: i32.const 63 ; WASM32-NEXT: i64x2.shr_s -; WASM32-NEXT: local.tee 3 -; WASM32-NEXT: local.get 1 -; WASM32-NEXT: v128.and -; WASM32-NEXT: local.get 2 -; WASM32-NEXT: local.get 3 -; WASM32-NEXT: v128.andnot -; WASM32-NEXT: v128.or +; WASM32-NEXT: v128.bitselect ; WASM32-NEXT: # fallthrough-return ; ; WASM64-LABEL: test_ctselect_v2i64: ; WASM64: .functype test_ctselect_v2i64 (i32, v128, v128) -> (v128) -; WASM64-NEXT: .local v128 ; WASM64-NEXT: # %bb.0: +; WASM64-NEXT: local.get 1 +; WASM64-NEXT: local.get 2 ; WASM64-NEXT: local.get 0 ; WASM64-NEXT: i32x4.splat ; WASM64-NEXT: i32.const 63 ; WASM64-NEXT: i64x2.shl ; WASM64-NEXT: i32.const 63 ; WASM64-NEXT: i64x2.shr_s -; WASM64-NEXT: local.tee 3 -; WASM64-NEXT: local.get 1 -; WASM64-NEXT: v128.and -; WASM64-NEXT: local.get 2 -; WASM64-NEXT: local.get 3 -; WASM64-NEXT: v128.andnot -; WASM64-NEXT: v128.or +; WASM64-NEXT: v128.bitselect ; WASM64-NEXT: # fallthrough-return %result = call <2 x i64> @llvm.ct.select.v2i64(i1 %cond, <2 x i64> %a, <2 x i64> %b) ret <2 x i64> %result @@ -178,40 +138,30 @@ define <2 x i64> @test_ctselect_v2i64(i1 %cond, <2 x i64> %a, <2 x i64> %b) { define <4 x float> @test_ctselect_v4f32(i1 %cond, <4 x float> %a, <4 x float> %b) { ; WASM32-LABEL: test_ctselect_v4f32: ; WASM32: .functype test_ctselect_v4f32 (i32, v128, v128) -> (v128) -; WASM32-NEXT: .local v128 ; WASM32-NEXT: # %bb.0: +; WASM32-NEXT: local.get 1 +; WASM32-NEXT: local.get 2 ; WASM32-NEXT: local.get 0 ; WASM32-NEXT: i32x4.splat ; WASM32-NEXT: i32.const 31 ; WASM32-NEXT: i32x4.shl ; WASM32-NEXT: i32.const 31 ; WASM32-NEXT: i32x4.shr_s -; WASM32-NEXT: local.tee 3 -; WASM32-NEXT: local.get 1 -; WASM32-NEXT: v128.and -; WASM32-NEXT: local.get 2 -; WASM32-NEXT: local.get 3 -; WASM32-NEXT: v128.andnot -; WASM32-NEXT: v128.or +; WASM32-NEXT: v128.bitselect ; WASM32-NEXT: # fallthrough-return ; ; WASM64-LABEL: test_ctselect_v4f32: ; WASM64: .functype 
test_ctselect_v4f32 (i32, v128, v128) -> (v128) -; WASM64-NEXT: .local v128 ; WASM64-NEXT: # %bb.0: +; WASM64-NEXT: local.get 1 +; WASM64-NEXT: local.get 2 ; WASM64-NEXT: local.get 0 ; WASM64-NEXT: i32x4.splat ; WASM64-NEXT: i32.const 31 ; WASM64-NEXT: i32x4.shl ; WASM64-NEXT: i32.const 31 ; WASM64-NEXT: i32x4.shr_s -; WASM64-NEXT: local.tee 3 -; WASM64-NEXT: local.get 1 -; WASM64-NEXT: v128.and -; WASM64-NEXT: local.get 2 -; WASM64-NEXT: local.get 3 -; WASM64-NEXT: v128.andnot -; WASM64-NEXT: v128.or +; WASM64-NEXT: v128.bitselect ; WASM64-NEXT: # fallthrough-return %result = call <4 x float> @llvm.ct.select.v4f32(i1 %cond, <4 x float> %a, <4 x float> %b) ret <4 x float> %result @@ -221,40 +171,30 @@ define <4 x float> @test_ctselect_v4f32(i1 %cond, <4 x float> %a, <4 x float> %b define <2 x double> @test_ctselect_v2f64(i1 %cond, <2 x double> %a, <2 x double> %b) { ; WASM32-LABEL: test_ctselect_v2f64: ; WASM32: .functype test_ctselect_v2f64 (i32, v128, v128) -> (v128) -; WASM32-NEXT: .local v128 ; WASM32-NEXT: # %bb.0: +; WASM32-NEXT: local.get 1 +; WASM32-NEXT: local.get 2 ; WASM32-NEXT: local.get 0 ; WASM32-NEXT: i32x4.splat ; WASM32-NEXT: i32.const 63 ; WASM32-NEXT: i64x2.shl ; WASM32-NEXT: i32.const 63 ; WASM32-NEXT: i64x2.shr_s -; WASM32-NEXT: local.tee 3 -; WASM32-NEXT: local.get 1 -; WASM32-NEXT: v128.and -; WASM32-NEXT: local.get 2 -; WASM32-NEXT: local.get 3 -; WASM32-NEXT: v128.andnot -; WASM32-NEXT: v128.or +; WASM32-NEXT: v128.bitselect ; WASM32-NEXT: # fallthrough-return ; ; WASM64-LABEL: test_ctselect_v2f64: ; WASM64: .functype test_ctselect_v2f64 (i32, v128, v128) -> (v128) -; WASM64-NEXT: .local v128 ; WASM64-NEXT: # %bb.0: +; WASM64-NEXT: local.get 1 +; WASM64-NEXT: local.get 2 ; WASM64-NEXT: local.get 0 ; WASM64-NEXT: i32x4.splat ; WASM64-NEXT: i32.const 63 ; WASM64-NEXT: i64x2.shl ; WASM64-NEXT: i32.const 63 ; WASM64-NEXT: i64x2.shr_s -; WASM64-NEXT: local.tee 3 -; WASM64-NEXT: local.get 1 -; WASM64-NEXT: v128.and -; WASM64-NEXT: local.get 2 -; WASM64-NEXT: local.get 3 -; WASM64-NEXT: v128.andnot -; WASM64-NEXT: v128.or +; WASM64-NEXT: v128.bitselect ; WASM64-NEXT: # fallthrough-return %result = call <2 x double> @llvm.ct.select.v2f64(i1 %cond, <2 x double> %a, <2 x double> %b) ret <2 x double> %result @@ -264,44 +204,34 @@ define <2 x double> @test_ctselect_v2f64(i1 %cond, <2 x double> %a, <2 x double> define <4 x i32> @test_ctselect_v4i32_aligned_load(i1 %cond, ptr %p1, ptr %p2) { ; WASM32-LABEL: test_ctselect_v4i32_aligned_load: ; WASM32: .functype test_ctselect_v4i32_aligned_load (i32, i32, i32) -> (v128) -; WASM32-NEXT: .local v128 ; WASM32-NEXT: # %bb.0: +; WASM32-NEXT: local.get 1 +; WASM32-NEXT: v128.load 0 +; WASM32-NEXT: local.get 2 +; WASM32-NEXT: v128.load 0 ; WASM32-NEXT: local.get 0 ; WASM32-NEXT: i32x4.splat ; WASM32-NEXT: i32.const 31 ; WASM32-NEXT: i32x4.shl ; WASM32-NEXT: i32.const 31 ; WASM32-NEXT: i32x4.shr_s -; WASM32-NEXT: local.tee 3 -; WASM32-NEXT: local.get 1 -; WASM32-NEXT: v128.load 0 -; WASM32-NEXT: v128.and -; WASM32-NEXT: local.get 2 -; WASM32-NEXT: v128.load 0 -; WASM32-NEXT: local.get 3 -; WASM32-NEXT: v128.andnot -; WASM32-NEXT: v128.or +; WASM32-NEXT: v128.bitselect ; WASM32-NEXT: # fallthrough-return ; ; WASM64-LABEL: test_ctselect_v4i32_aligned_load: ; WASM64: .functype test_ctselect_v4i32_aligned_load (i32, i64, i64) -> (v128) -; WASM64-NEXT: .local v128 ; WASM64-NEXT: # %bb.0: +; WASM64-NEXT: local.get 1 +; WASM64-NEXT: v128.load 0 +; WASM64-NEXT: local.get 2 +; WASM64-NEXT: v128.load 0 ; WASM64-NEXT: local.get 0 ; WASM64-NEXT: 
i32x4.splat ; WASM64-NEXT: i32.const 31 ; WASM64-NEXT: i32x4.shl ; WASM64-NEXT: i32.const 31 ; WASM64-NEXT: i32x4.shr_s -; WASM64-NEXT: local.tee 3 -; WASM64-NEXT: local.get 1 -; WASM64-NEXT: v128.load 0 -; WASM64-NEXT: v128.and -; WASM64-NEXT: local.get 2 -; WASM64-NEXT: v128.load 0 -; WASM64-NEXT: local.get 3 -; WASM64-NEXT: v128.andnot -; WASM64-NEXT: v128.or +; WASM64-NEXT: v128.bitselect ; WASM64-NEXT: # fallthrough-return %a = load <4 x i32>, ptr %p1, align 16 %b = load <4 x i32>, ptr %p2, align 16 @@ -313,44 +243,34 @@ define <4 x i32> @test_ctselect_v4i32_aligned_load(i1 %cond, ptr %p1, ptr %p2) { define <4 x i32> @test_ctselect_v4i32_unaligned_load(i1 %cond, ptr %p1, ptr %p2) { ; WASM32-LABEL: test_ctselect_v4i32_unaligned_load: ; WASM32: .functype test_ctselect_v4i32_unaligned_load (i32, i32, i32) -> (v128) -; WASM32-NEXT: .local v128 ; WASM32-NEXT: # %bb.0: +; WASM32-NEXT: local.get 1 +; WASM32-NEXT: v128.load 0:p2align=2 +; WASM32-NEXT: local.get 2 +; WASM32-NEXT: v128.load 0:p2align=2 ; WASM32-NEXT: local.get 0 ; WASM32-NEXT: i32x4.splat ; WASM32-NEXT: i32.const 31 ; WASM32-NEXT: i32x4.shl ; WASM32-NEXT: i32.const 31 ; WASM32-NEXT: i32x4.shr_s -; WASM32-NEXT: local.tee 3 -; WASM32-NEXT: local.get 1 -; WASM32-NEXT: v128.load 0:p2align=2 -; WASM32-NEXT: v128.and -; WASM32-NEXT: local.get 2 -; WASM32-NEXT: v128.load 0:p2align=2 -; WASM32-NEXT: local.get 3 -; WASM32-NEXT: v128.andnot -; WASM32-NEXT: v128.or +; WASM32-NEXT: v128.bitselect ; WASM32-NEXT: # fallthrough-return ; ; WASM64-LABEL: test_ctselect_v4i32_unaligned_load: ; WASM64: .functype test_ctselect_v4i32_unaligned_load (i32, i64, i64) -> (v128) -; WASM64-NEXT: .local v128 ; WASM64-NEXT: # %bb.0: +; WASM64-NEXT: local.get 1 +; WASM64-NEXT: v128.load 0:p2align=2 +; WASM64-NEXT: local.get 2 +; WASM64-NEXT: v128.load 0:p2align=2 ; WASM64-NEXT: local.get 0 ; WASM64-NEXT: i32x4.splat ; WASM64-NEXT: i32.const 31 ; WASM64-NEXT: i32x4.shl ; WASM64-NEXT: i32.const 31 ; WASM64-NEXT: i32x4.shr_s -; WASM64-NEXT: local.tee 3 -; WASM64-NEXT: local.get 1 -; WASM64-NEXT: v128.load 0:p2align=2 -; WASM64-NEXT: v128.and -; WASM64-NEXT: local.get 2 -; WASM64-NEXT: v128.load 0:p2align=2 -; WASM64-NEXT: local.get 3 -; WASM64-NEXT: v128.andnot -; WASM64-NEXT: v128.or +; WASM64-NEXT: v128.bitselect ; WASM64-NEXT: # fallthrough-return %a = load <4 x i32>, ptr %p1, align 4 %b = load <4 x i32>, ptr %p2, align 4 @@ -362,43 +282,33 @@ define <4 x i32> @test_ctselect_v4i32_unaligned_load(i1 %cond, ptr %p1, ptr %p2) define void @test_ctselect_v4i32_store(i1 %cond, <4 x i32> %a, <4 x i32> %b, ptr %out) { ; WASM32-LABEL: test_ctselect_v4i32_store: ; WASM32: .functype test_ctselect_v4i32_store (i32, v128, v128, i32) -> () -; WASM32-NEXT: .local v128 ; WASM32-NEXT: # %bb.0: ; WASM32-NEXT: local.get 3 +; WASM32-NEXT: local.get 1 +; WASM32-NEXT: local.get 2 ; WASM32-NEXT: local.get 0 ; WASM32-NEXT: i32x4.splat ; WASM32-NEXT: i32.const 31 ; WASM32-NEXT: i32x4.shl ; WASM32-NEXT: i32.const 31 ; WASM32-NEXT: i32x4.shr_s -; WASM32-NEXT: local.tee 4 -; WASM32-NEXT: local.get 1 -; WASM32-NEXT: v128.and -; WASM32-NEXT: local.get 2 -; WASM32-NEXT: local.get 4 -; WASM32-NEXT: v128.andnot -; WASM32-NEXT: v128.or +; WASM32-NEXT: v128.bitselect ; WASM32-NEXT: v128.store 0 ; WASM32-NEXT: # fallthrough-return ; ; WASM64-LABEL: test_ctselect_v4i32_store: ; WASM64: .functype test_ctselect_v4i32_store (i32, v128, v128, i64) -> () -; WASM64-NEXT: .local v128 ; WASM64-NEXT: # %bb.0: ; WASM64-NEXT: local.get 3 +; WASM64-NEXT: local.get 1 +; WASM64-NEXT: local.get 2 ; 
WASM64-NEXT: local.get 0 ; WASM64-NEXT: i32x4.splat ; WASM64-NEXT: i32.const 31 ; WASM64-NEXT: i32x4.shl ; WASM64-NEXT: i32.const 31 ; WASM64-NEXT: i32x4.shr_s -; WASM64-NEXT: local.tee 4 -; WASM64-NEXT: local.get 1 -; WASM64-NEXT: v128.and -; WASM64-NEXT: local.get 2 -; WASM64-NEXT: local.get 4 -; WASM64-NEXT: v128.andnot -; WASM64-NEXT: v128.or +; WASM64-NEXT: v128.bitselect ; WASM64-NEXT: v128.store 0 ; WASM64-NEXT: # fallthrough-return %result = call <4 x i32> @llvm.ct.select.v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b) @@ -410,64 +320,46 @@ define void @test_ctselect_v4i32_store(i1 %cond, <4 x i32> %a, <4 x i32> %b, ptr define <4 x i32> @test_ctselect_v4i32_chain(i1 %cond1, i1 %cond2, <4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { ; WASM32-LABEL: test_ctselect_v4i32_chain: ; WASM32: .functype test_ctselect_v4i32_chain (i32, i32, v128, v128, v128) -> (v128) -; WASM32-NEXT: .local v128, v128 ; WASM32-NEXT: # %bb.0: -; WASM32-NEXT: local.get 1 +; WASM32-NEXT: local.get 2 +; WASM32-NEXT: local.get 3 +; WASM32-NEXT: local.get 0 ; WASM32-NEXT: i32x4.splat ; WASM32-NEXT: i32.const 31 ; WASM32-NEXT: i32x4.shl ; WASM32-NEXT: i32.const 31 ; WASM32-NEXT: i32x4.shr_s -; WASM32-NEXT: local.tee 5 -; WASM32-NEXT: local.get 0 +; WASM32-NEXT: v128.bitselect +; WASM32-NEXT: local.get 4 +; WASM32-NEXT: local.get 1 ; WASM32-NEXT: i32x4.splat ; WASM32-NEXT: i32.const 31 ; WASM32-NEXT: i32x4.shl ; WASM32-NEXT: i32.const 31 ; WASM32-NEXT: i32x4.shr_s -; WASM32-NEXT: local.tee 6 -; WASM32-NEXT: local.get 2 -; WASM32-NEXT: v128.and -; WASM32-NEXT: local.get 3 -; WASM32-NEXT: local.get 6 -; WASM32-NEXT: v128.andnot -; WASM32-NEXT: v128.or -; WASM32-NEXT: v128.and -; WASM32-NEXT: local.get 4 -; WASM32-NEXT: local.get 5 -; WASM32-NEXT: v128.andnot -; WASM32-NEXT: v128.or +; WASM32-NEXT: v128.bitselect ; WASM32-NEXT: # fallthrough-return ; ; WASM64-LABEL: test_ctselect_v4i32_chain: ; WASM64: .functype test_ctselect_v4i32_chain (i32, i32, v128, v128, v128) -> (v128) -; WASM64-NEXT: .local v128, v128 ; WASM64-NEXT: # %bb.0: -; WASM64-NEXT: local.get 1 +; WASM64-NEXT: local.get 2 +; WASM64-NEXT: local.get 3 +; WASM64-NEXT: local.get 0 ; WASM64-NEXT: i32x4.splat ; WASM64-NEXT: i32.const 31 ; WASM64-NEXT: i32x4.shl ; WASM64-NEXT: i32.const 31 ; WASM64-NEXT: i32x4.shr_s -; WASM64-NEXT: local.tee 5 -; WASM64-NEXT: local.get 0 +; WASM64-NEXT: v128.bitselect +; WASM64-NEXT: local.get 4 +; WASM64-NEXT: local.get 1 ; WASM64-NEXT: i32x4.splat ; WASM64-NEXT: i32.const 31 ; WASM64-NEXT: i32x4.shl ; WASM64-NEXT: i32.const 31 ; WASM64-NEXT: i32x4.shr_s -; WASM64-NEXT: local.tee 6 -; WASM64-NEXT: local.get 2 -; WASM64-NEXT: v128.and -; WASM64-NEXT: local.get 3 -; WASM64-NEXT: local.get 6 -; WASM64-NEXT: v128.andnot -; WASM64-NEXT: v128.or -; WASM64-NEXT: v128.and -; WASM64-NEXT: local.get 4 -; WASM64-NEXT: local.get 5 -; WASM64-NEXT: v128.andnot -; WASM64-NEXT: v128.or +; WASM64-NEXT: v128.bitselect ; WASM64-NEXT: # fallthrough-return %tmp = call <4 x i32> @llvm.ct.select.v4i32(i1 %cond1, <4 x i32> %a, <4 x i32> %b) %result = call <4 x i32> @llvm.ct.select.v4i32(i1 %cond2, <4 x i32> %tmp, <4 x i32> %c) @@ -478,48 +370,38 @@ define <4 x i32> @test_ctselect_v4i32_chain(i1 %cond1, i1 %cond2, <4 x i32> %a, define <4 x float> @test_ctselect_v4f32_arithmetic(i1 %cond, <4 x float> %x, <4 x float> %y) { ; WASM32-LABEL: test_ctselect_v4f32_arithmetic: ; WASM32: .functype test_ctselect_v4f32_arithmetic (i32, v128, v128) -> (v128) -; WASM32-NEXT: .local v128 ; WASM32-NEXT: # %bb.0: -; WASM32-NEXT: local.get 0 -; WASM32-NEXT: i32x4.splat -; 
WASM32-NEXT: i32.const 31 -; WASM32-NEXT: i32x4.shl -; WASM32-NEXT: i32.const 31 -; WASM32-NEXT: i32x4.shr_s -; WASM32-NEXT: local.tee 3 ; WASM32-NEXT: local.get 1 ; WASM32-NEXT: local.get 2 ; WASM32-NEXT: f32x4.add -; WASM32-NEXT: v128.and ; WASM32-NEXT: local.get 1 ; WASM32-NEXT: local.get 2 ; WASM32-NEXT: f32x4.sub -; WASM32-NEXT: local.get 3 -; WASM32-NEXT: v128.andnot -; WASM32-NEXT: v128.or +; WASM32-NEXT: local.get 0 +; WASM32-NEXT: i32x4.splat +; WASM32-NEXT: i32.const 31 +; WASM32-NEXT: i32x4.shl +; WASM32-NEXT: i32.const 31 +; WASM32-NEXT: i32x4.shr_s +; WASM32-NEXT: v128.bitselect ; WASM32-NEXT: # fallthrough-return ; ; WASM64-LABEL: test_ctselect_v4f32_arithmetic: ; WASM64: .functype test_ctselect_v4f32_arithmetic (i32, v128, v128) -> (v128) -; WASM64-NEXT: .local v128 ; WASM64-NEXT: # %bb.0: -; WASM64-NEXT: local.get 0 -; WASM64-NEXT: i32x4.splat -; WASM64-NEXT: i32.const 31 -; WASM64-NEXT: i32x4.shl -; WASM64-NEXT: i32.const 31 -; WASM64-NEXT: i32x4.shr_s -; WASM64-NEXT: local.tee 3 ; WASM64-NEXT: local.get 1 ; WASM64-NEXT: local.get 2 ; WASM64-NEXT: f32x4.add -; WASM64-NEXT: v128.and ; WASM64-NEXT: local.get 1 ; WASM64-NEXT: local.get 2 ; WASM64-NEXT: f32x4.sub -; WASM64-NEXT: local.get 3 -; WASM64-NEXT: v128.andnot -; WASM64-NEXT: v128.or +; WASM64-NEXT: local.get 0 +; WASM64-NEXT: i32x4.splat +; WASM64-NEXT: i32.const 31 +; WASM64-NEXT: i32x4.shl +; WASM64-NEXT: i32.const 31 +; WASM64-NEXT: i32x4.shr_s +; WASM64-NEXT: v128.bitselect ; WASM64-NEXT: # fallthrough-return %sum = fadd <4 x float> %x, %y %diff = fsub <4 x float> %x, %y @@ -540,8 +422,6 @@ define <4 x i32> @test_ctselect_v4i32_zeros(i1 %cond, <4 x i32> %a) { ; WASM32-NEXT: i32x4.shr_s ; WASM32-NEXT: local.get 1 ; WASM32-NEXT: v128.and -; WASM32-NEXT: v128.const 0, 0, 0, 0 -; WASM32-NEXT: v128.or ; WASM32-NEXT: # fallthrough-return ; ; WASM64-LABEL: test_ctselect_v4i32_zeros: @@ -555,8 +435,6 @@ define <4 x i32> @test_ctselect_v4i32_zeros(i1 %cond, <4 x i32> %a) { ; WASM64-NEXT: i32x4.shr_s ; WASM64-NEXT: local.get 1 ; WASM64-NEXT: v128.and -; WASM64-NEXT: v128.const 0, 0, 0, 0 -; WASM64-NEXT: v128.or ; WASM64-NEXT: # fallthrough-return %result = call <4 x i32> @llvm.ct.select.v4i32(i1 %cond, <4 x i32> %a, @@ -568,40 +446,30 @@ define <4 x i32> @test_ctselect_v4i32_zeros(i1 %cond, <4 x i32> %a) { define <4 x i32> @test_ctselect_v4i32_args(i1 %cond, <4 x i32> %a, <4 x i32> %b) nounwind { ; WASM32-LABEL: test_ctselect_v4i32_args: ; WASM32: .functype test_ctselect_v4i32_args (i32, v128, v128) -> (v128) -; WASM32-NEXT: .local v128 ; WASM32-NEXT: # %bb.0: +; WASM32-NEXT: local.get 1 +; WASM32-NEXT: local.get 2 ; WASM32-NEXT: local.get 0 ; WASM32-NEXT: i32x4.splat ; WASM32-NEXT: i32.const 31 ; WASM32-NEXT: i32x4.shl ; WASM32-NEXT: i32.const 31 ; WASM32-NEXT: i32x4.shr_s -; WASM32-NEXT: local.tee 3 -; WASM32-NEXT: local.get 1 -; WASM32-NEXT: v128.and -; WASM32-NEXT: local.get 2 -; WASM32-NEXT: local.get 3 -; WASM32-NEXT: v128.andnot -; WASM32-NEXT: v128.or +; WASM32-NEXT: v128.bitselect ; WASM32-NEXT: # fallthrough-return ; ; WASM64-LABEL: test_ctselect_v4i32_args: ; WASM64: .functype test_ctselect_v4i32_args (i32, v128, v128) -> (v128) -; WASM64-NEXT: .local v128 ; WASM64-NEXT: # %bb.0: +; WASM64-NEXT: local.get 1 +; WASM64-NEXT: local.get 2 ; WASM64-NEXT: local.get 0 ; WASM64-NEXT: i32x4.splat ; WASM64-NEXT: i32.const 31 ; WASM64-NEXT: i32x4.shl ; WASM64-NEXT: i32.const 31 ; WASM64-NEXT: i32x4.shr_s -; WASM64-NEXT: local.tee 3 -; WASM64-NEXT: local.get 1 -; WASM64-NEXT: v128.and -; WASM64-NEXT: local.get 2 -; 
WASM64-NEXT: local.get 3 -; WASM64-NEXT: v128.andnot -; WASM64-NEXT: v128.or +; WASM64-NEXT: v128.bitselect ; WASM64-NEXT: # fallthrough-return %result = call <4 x i32> @llvm.ct.select.v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b) ret <4 x i32> %result @@ -611,45 +479,35 @@ define <4 x i32> @test_ctselect_v4i32_args(i1 %cond, <4 x i32> %a, <4 x i32> %b) define <4 x i32> @test_ctselect_v4i32_multi_use(i1 %cond, <4 x i32> %a, <4 x i32> %b) { ; WASM32-LABEL: test_ctselect_v4i32_multi_use: ; WASM32: .functype test_ctselect_v4i32_multi_use (i32, v128, v128) -> (v128) -; WASM32-NEXT: .local v128 ; WASM32-NEXT: # %bb.0: +; WASM32-NEXT: local.get 1 +; WASM32-NEXT: local.get 2 ; WASM32-NEXT: local.get 0 ; WASM32-NEXT: i32x4.splat ; WASM32-NEXT: i32.const 31 ; WASM32-NEXT: i32x4.shl ; WASM32-NEXT: i32.const 31 ; WASM32-NEXT: i32x4.shr_s -; WASM32-NEXT: local.tee 3 -; WASM32-NEXT: local.get 1 -; WASM32-NEXT: v128.and +; WASM32-NEXT: v128.bitselect +; WASM32-NEXT: local.tee 2 ; WASM32-NEXT: local.get 2 -; WASM32-NEXT: local.get 3 -; WASM32-NEXT: v128.andnot -; WASM32-NEXT: v128.or -; WASM32-NEXT: local.tee 1 -; WASM32-NEXT: local.get 1 ; WASM32-NEXT: i32x4.add ; WASM32-NEXT: # fallthrough-return ; ; WASM64-LABEL: test_ctselect_v4i32_multi_use: ; WASM64: .functype test_ctselect_v4i32_multi_use (i32, v128, v128) -> (v128) -; WASM64-NEXT: .local v128 ; WASM64-NEXT: # %bb.0: +; WASM64-NEXT: local.get 1 +; WASM64-NEXT: local.get 2 ; WASM64-NEXT: local.get 0 ; WASM64-NEXT: i32x4.splat ; WASM64-NEXT: i32.const 31 ; WASM64-NEXT: i32x4.shl ; WASM64-NEXT: i32.const 31 ; WASM64-NEXT: i32x4.shr_s -; WASM64-NEXT: local.tee 3 -; WASM64-NEXT: local.get 1 -; WASM64-NEXT: v128.and +; WASM64-NEXT: v128.bitselect +; WASM64-NEXT: local.tee 2 ; WASM64-NEXT: local.get 2 -; WASM64-NEXT: local.get 3 -; WASM64-NEXT: v128.andnot -; WASM64-NEXT: v128.or -; WASM64-NEXT: local.tee 1 -; WASM64-NEXT: local.get 1 ; WASM64-NEXT: i32x4.add ; WASM64-NEXT: # fallthrough-return %sel = call <4 x i32> @llvm.ct.select.v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b) @@ -661,48 +519,38 @@ define <4 x i32> @test_ctselect_v4i32_multi_use(i1 %cond, <4 x i32> %a, <4 x i32 define <16 x i8> @test_ctselect_v16i8_ops(i1 %cond, <16 x i8> %x, <16 x i8> %y) { ; WASM32-LABEL: test_ctselect_v16i8_ops: ; WASM32: .functype test_ctselect_v16i8_ops (i32, v128, v128) -> (v128) -; WASM32-NEXT: .local v128 ; WASM32-NEXT: # %bb.0: -; WASM32-NEXT: local.get 0 -; WASM32-NEXT: i8x16.splat -; WASM32-NEXT: i32.const 7 -; WASM32-NEXT: i8x16.shl -; WASM32-NEXT: i32.const 7 -; WASM32-NEXT: i8x16.shr_s -; WASM32-NEXT: local.tee 3 ; WASM32-NEXT: local.get 1 ; WASM32-NEXT: local.get 2 ; WASM32-NEXT: v128.xor -; WASM32-NEXT: v128.and ; WASM32-NEXT: local.get 1 ; WASM32-NEXT: local.get 2 ; WASM32-NEXT: v128.and -; WASM32-NEXT: local.get 3 -; WASM32-NEXT: v128.andnot -; WASM32-NEXT: v128.or +; WASM32-NEXT: local.get 0 +; WASM32-NEXT: i8x16.splat +; WASM32-NEXT: i32.const 7 +; WASM32-NEXT: i8x16.shl +; WASM32-NEXT: i32.const 7 +; WASM32-NEXT: i8x16.shr_s +; WASM32-NEXT: v128.bitselect ; WASM32-NEXT: # fallthrough-return ; ; WASM64-LABEL: test_ctselect_v16i8_ops: ; WASM64: .functype test_ctselect_v16i8_ops (i32, v128, v128) -> (v128) -; WASM64-NEXT: .local v128 ; WASM64-NEXT: # %bb.0: -; WASM64-NEXT: local.get 0 -; WASM64-NEXT: i8x16.splat -; WASM64-NEXT: i32.const 7 -; WASM64-NEXT: i8x16.shl -; WASM64-NEXT: i32.const 7 -; WASM64-NEXT: i8x16.shr_s -; WASM64-NEXT: local.tee 3 ; WASM64-NEXT: local.get 1 ; WASM64-NEXT: local.get 2 ; WASM64-NEXT: v128.xor -; WASM64-NEXT: v128.and ; 
WASM64-NEXT: local.get 1 ; WASM64-NEXT: local.get 2 ; WASM64-NEXT: v128.and -; WASM64-NEXT: local.get 3 -; WASM64-NEXT: v128.andnot -; WASM64-NEXT: v128.or +; WASM64-NEXT: local.get 0 +; WASM64-NEXT: i8x16.splat +; WASM64-NEXT: i32.const 7 +; WASM64-NEXT: i8x16.shl +; WASM64-NEXT: i32.const 7 +; WASM64-NEXT: i8x16.shr_s +; WASM64-NEXT: v128.bitselect ; WASM64-NEXT: # fallthrough-return %xor = xor <16 x i8> %x, %y %and = and <16 x i8> %x, %y diff --git a/llvm/test/CodeGen/WebAssembly/ctselect-fallback.ll b/llvm/test/CodeGen/WebAssembly/ctselect-fallback.ll index 2a694db6d88d9..4e356f8562b39 100644 --- a/llvm/test/CodeGen/WebAssembly/ctselect-fallback.ll +++ b/llvm/test/CodeGen/WebAssembly/ctselect-fallback.ll @@ -95,13 +95,13 @@ define i32 @test_ctselect_i32(i1 %cond, i32 %a, i32 %b) { ; W32-NEXT: local.get 0 ; W32-NEXT: i32.const 1 ; W32-NEXT: i32.and -; W32-NEXT: i32.sub ; W32-NEXT: local.tee 0 +; W32-NEXT: i32.sub ; W32-NEXT: local.get 1 ; W32-NEXT: i32.and ; W32-NEXT: local.get 0 ; W32-NEXT: i32.const -1 -; W32-NEXT: i32.xor +; W32-NEXT: i32.add ; W32-NEXT: local.get 2 ; W32-NEXT: i32.and ; W32-NEXT: i32.or @@ -114,13 +114,13 @@ define i32 @test_ctselect_i32(i1 %cond, i32 %a, i32 %b) { ; W64-NEXT: local.get 0 ; W64-NEXT: i32.const 1 ; W64-NEXT: i32.and -; W64-NEXT: i32.sub ; W64-NEXT: local.tee 0 +; W64-NEXT: i32.sub ; W64-NEXT: local.get 1 ; W64-NEXT: i32.and ; W64-NEXT: local.get 0 ; W64-NEXT: i32.const -1 -; W64-NEXT: i32.xor +; W64-NEXT: i32.add ; W64-NEXT: local.get 2 ; W64-NEXT: i32.and ; W64-NEXT: i32.or @@ -139,13 +139,13 @@ define i64 @test_ctselect_i64(i1 %cond, i64 %a, i64 %b) { ; W32-NEXT: i64.extend_i32_u ; W32-NEXT: i64.const 1 ; W32-NEXT: i64.and -; W32-NEXT: i64.sub ; W32-NEXT: local.tee 3 +; W32-NEXT: i64.sub ; W32-NEXT: local.get 1 ; W32-NEXT: i64.and ; W32-NEXT: local.get 3 ; W32-NEXT: i64.const -1 -; W32-NEXT: i64.xor +; W32-NEXT: i64.add ; W32-NEXT: local.get 2 ; W32-NEXT: i64.and ; W32-NEXT: i64.or @@ -160,13 +160,13 @@ define i64 @test_ctselect_i64(i1 %cond, i64 %a, i64 %b) { ; W64-NEXT: i64.extend_i32_u ; W64-NEXT: i64.const 1 ; W64-NEXT: i64.and -; W64-NEXT: i64.sub ; W64-NEXT: local.tee 3 +; W64-NEXT: i64.sub ; W64-NEXT: local.get 1 ; W64-NEXT: i64.and ; W64-NEXT: local.get 3 ; W64-NEXT: i64.const -1 -; W64-NEXT: i64.xor +; W64-NEXT: i64.add ; W64-NEXT: local.get 2 ; W64-NEXT: i64.and ; W64-NEXT: i64.or @@ -183,13 +183,13 @@ define ptr @test_ctselect_ptr(i1 %cond, ptr %a, ptr %b) { ; W32-NEXT: local.get 0 ; W32-NEXT: i32.const 1 ; W32-NEXT: i32.and -; W32-NEXT: i32.sub ; W32-NEXT: local.tee 0 +; W32-NEXT: i32.sub ; W32-NEXT: local.get 1 ; W32-NEXT: i32.and ; W32-NEXT: local.get 0 ; W32-NEXT: i32.const -1 -; W32-NEXT: i32.xor +; W32-NEXT: i32.add ; W32-NEXT: local.get 2 ; W32-NEXT: i32.and ; W32-NEXT: i32.or @@ -204,13 +204,13 @@ define ptr @test_ctselect_ptr(i1 %cond, ptr %a, ptr %b) { ; W64-NEXT: i64.extend_i32_u ; W64-NEXT: i64.const 1 ; W64-NEXT: i64.and -; W64-NEXT: i64.sub ; W64-NEXT: local.tee 3 +; W64-NEXT: i64.sub ; W64-NEXT: local.get 1 ; W64-NEXT: i64.and ; W64-NEXT: local.get 3 ; W64-NEXT: i64.const -1 -; W64-NEXT: i64.xor +; W64-NEXT: i64.add ; W64-NEXT: local.get 2 ; W64-NEXT: i64.and ; W64-NEXT: i64.or @@ -225,16 +225,12 @@ define i32 @test_ctselect_const_true(i32 %a, i32 %b) { ; W32: .functype test_ctselect_const_true (i32, i32) -> (i32) ; W32-NEXT: # %bb.0: ; W32-NEXT: local.get 0 -; W32-NEXT: i32.const 0 -; W32-NEXT: i32.or ; W32-NEXT: # fallthrough-return ; ; W64-LABEL: test_ctselect_const_true: ; W64: .functype test_ctselect_const_true (i32, 
i32) -> (i32) ; W64-NEXT: # %bb.0: ; W64-NEXT: local.get 0 -; W64-NEXT: i32.const 0 -; W64-NEXT: i32.or ; W64-NEXT: # fallthrough-return %result = call i32 @llvm.ct.select.i32(i1 true, i32 %a, i32 %b) ret i32 %result @@ -244,17 +240,13 @@ define i32 @test_ctselect_const_false(i32 %a, i32 %b) { ; W32-LABEL: test_ctselect_const_false: ; W32: .functype test_ctselect_const_false (i32, i32) -> (i32) ; W32-NEXT: # %bb.0: -; W32-NEXT: i32.const 0 ; W32-NEXT: local.get 1 -; W32-NEXT: i32.or ; W32-NEXT: # fallthrough-return ; ; W64-LABEL: test_ctselect_const_false: ; W64: .functype test_ctselect_const_false (i32, i32) -> (i32) ; W64-NEXT: # %bb.0: -; W64-NEXT: i32.const 0 ; W64-NEXT: local.get 1 -; W64-NEXT: i32.or ; W64-NEXT: # fallthrough-return %result = call i32 @llvm.ct.select.i32(i1 false, i32 %a, i32 %b) ret i32 %result @@ -450,14 +442,14 @@ define i32 @test_ctselect_load(i1 %cond, ptr %p1, ptr %p2) { ; W32-NEXT: local.get 0 ; W32-NEXT: i32.const 1 ; W32-NEXT: i32.and -; W32-NEXT: i32.sub ; W32-NEXT: local.tee 0 +; W32-NEXT: i32.sub ; W32-NEXT: local.get 1 ; W32-NEXT: i32.load 0 ; W32-NEXT: i32.and ; W32-NEXT: local.get 0 ; W32-NEXT: i32.const -1 -; W32-NEXT: i32.xor +; W32-NEXT: i32.add ; W32-NEXT: local.get 2 ; W32-NEXT: i32.load 0 ; W32-NEXT: i32.and @@ -471,14 +463,14 @@ define i32 @test_ctselect_load(i1 %cond, ptr %p1, ptr %p2) { ; W64-NEXT: local.get 0 ; W64-NEXT: i32.const 1 ; W64-NEXT: i32.and -; W64-NEXT: i32.sub ; W64-NEXT: local.tee 0 +; W64-NEXT: i32.sub ; W64-NEXT: local.get 1 ; W64-NEXT: i32.load 0 ; W64-NEXT: i32.and ; W64-NEXT: local.get 0 ; W64-NEXT: i32.const -1 -; W64-NEXT: i32.xor +; W64-NEXT: i32.add ; W64-NEXT: local.get 2 ; W64-NEXT: i32.load 0 ; W64-NEXT: i32.and @@ -499,26 +491,26 @@ define i32 @test_ctselect_nested(i1 %cond1, i1 %cond2, i32 %a, i32 %b, i32 %c) { ; W32-NEXT: local.get 0 ; W32-NEXT: i32.const 1 ; W32-NEXT: i32.and -; W32-NEXT: i32.sub ; W32-NEXT: local.tee 0 +; W32-NEXT: i32.sub ; W32-NEXT: i32.const 0 ; W32-NEXT: local.get 1 ; W32-NEXT: i32.const 1 ; W32-NEXT: i32.and -; W32-NEXT: i32.sub ; W32-NEXT: local.tee 1 +; W32-NEXT: i32.sub ; W32-NEXT: local.get 2 ; W32-NEXT: i32.and ; W32-NEXT: local.get 1 ; W32-NEXT: i32.const -1 -; W32-NEXT: i32.xor +; W32-NEXT: i32.add ; W32-NEXT: local.get 3 ; W32-NEXT: i32.and ; W32-NEXT: i32.or ; W32-NEXT: i32.and ; W32-NEXT: local.get 0 ; W32-NEXT: i32.const -1 -; W32-NEXT: i32.xor +; W32-NEXT: i32.add ; W32-NEXT: local.get 4 ; W32-NEXT: i32.and ; W32-NEXT: i32.or @@ -531,26 +523,26 @@ define i32 @test_ctselect_nested(i1 %cond1, i1 %cond2, i32 %a, i32 %b, i32 %c) { ; W64-NEXT: local.get 0 ; W64-NEXT: i32.const 1 ; W64-NEXT: i32.and -; W64-NEXT: i32.sub ; W64-NEXT: local.tee 0 +; W64-NEXT: i32.sub ; W64-NEXT: i32.const 0 ; W64-NEXT: local.get 1 ; W64-NEXT: i32.const 1 ; W64-NEXT: i32.and -; W64-NEXT: i32.sub ; W64-NEXT: local.tee 1 +; W64-NEXT: i32.sub ; W64-NEXT: local.get 2 ; W64-NEXT: i32.and ; W64-NEXT: local.get 1 ; W64-NEXT: i32.const -1 -; W64-NEXT: i32.xor +; W64-NEXT: i32.add ; W64-NEXT: local.get 3 ; W64-NEXT: i32.and ; W64-NEXT: i32.or ; W64-NEXT: i32.and ; W64-NEXT: local.get 0 ; W64-NEXT: i32.const -1 -; W64-NEXT: i32.xor +; W64-NEXT: i32.add ; W64-NEXT: local.get 4 ; W64-NEXT: i32.and ; W64-NEXT: i32.or @@ -569,14 +561,14 @@ define float @test_ctselect_f32(i1 %cond, float %a, float %b) { ; W32-NEXT: local.get 0 ; W32-NEXT: i32.const 1 ; W32-NEXT: i32.and -; W32-NEXT: i32.sub ; W32-NEXT: local.tee 0 +; W32-NEXT: i32.sub ; W32-NEXT: local.get 1 ; W32-NEXT: i32.reinterpret_f32 ; W32-NEXT: i32.and ; 
W32-NEXT: local.get 0 ; W32-NEXT: i32.const -1 -; W32-NEXT: i32.xor +; W32-NEXT: i32.add ; W32-NEXT: local.get 2 ; W32-NEXT: i32.reinterpret_f32 ; W32-NEXT: i32.and @@ -591,14 +583,14 @@ define float @test_ctselect_f32(i1 %cond, float %a, float %b) { ; W64-NEXT: local.get 0 ; W64-NEXT: i32.const 1 ; W64-NEXT: i32.and -; W64-NEXT: i32.sub ; W64-NEXT: local.tee 0 +; W64-NEXT: i32.sub ; W64-NEXT: local.get 1 ; W64-NEXT: i32.reinterpret_f32 ; W64-NEXT: i32.and ; W64-NEXT: local.get 0 ; W64-NEXT: i32.const -1 -; W64-NEXT: i32.xor +; W64-NEXT: i32.add ; W64-NEXT: local.get 2 ; W64-NEXT: i32.reinterpret_f32 ; W64-NEXT: i32.and @@ -620,14 +612,14 @@ define double @test_ctselect_f64(i1 %cond, double %a, double %b) { ; W32-NEXT: i64.extend_i32_u ; W32-NEXT: i64.const 1 ; W32-NEXT: i64.and -; W32-NEXT: i64.sub ; W32-NEXT: local.tee 3 +; W32-NEXT: i64.sub ; W32-NEXT: local.get 1 ; W32-NEXT: i64.reinterpret_f64 ; W32-NEXT: i64.and ; W32-NEXT: local.get 3 ; W32-NEXT: i64.const -1 -; W32-NEXT: i64.xor +; W32-NEXT: i64.add ; W32-NEXT: local.get 2 ; W32-NEXT: i64.reinterpret_f64 ; W32-NEXT: i64.and @@ -644,14 +636,14 @@ define double @test_ctselect_f64(i1 %cond, double %a, double %b) { ; W64-NEXT: i64.extend_i32_u ; W64-NEXT: i64.const 1 ; W64-NEXT: i64.and -; W64-NEXT: i64.sub ; W64-NEXT: local.tee 3 +; W64-NEXT: i64.sub ; W64-NEXT: local.get 1 ; W64-NEXT: i64.reinterpret_f64 ; W64-NEXT: i64.and ; W64-NEXT: local.get 3 ; W64-NEXT: i64.const -1 -; W64-NEXT: i64.xor +; W64-NEXT: i64.add ; W64-NEXT: local.get 2 ; W64-NEXT: i64.reinterpret_f64 ; W64-NEXT: i64.and @@ -672,20 +664,20 @@ define float @test_ctselect_f32_chain(i1 %cond1, i1 %cond2, float %a, float %b, ; W32-NEXT: local.get 1 ; W32-NEXT: i32.const 1 ; W32-NEXT: i32.and -; W32-NEXT: i32.sub ; W32-NEXT: local.tee 1 +; W32-NEXT: i32.sub ; W32-NEXT: i32.const 0 ; W32-NEXT: local.get 0 ; W32-NEXT: i32.const 1 ; W32-NEXT: i32.and -; W32-NEXT: i32.sub ; W32-NEXT: local.tee 0 +; W32-NEXT: i32.sub ; W32-NEXT: local.get 2 ; W32-NEXT: i32.reinterpret_f32 ; W32-NEXT: i32.and ; W32-NEXT: local.get 0 ; W32-NEXT: i32.const -1 -; W32-NEXT: i32.xor +; W32-NEXT: i32.add ; W32-NEXT: local.get 3 ; W32-NEXT: i32.reinterpret_f32 ; W32-NEXT: i32.and @@ -693,7 +685,7 @@ define float @test_ctselect_f32_chain(i1 %cond1, i1 %cond2, float %a, float %b, ; W32-NEXT: i32.and ; W32-NEXT: local.get 1 ; W32-NEXT: i32.const -1 -; W32-NEXT: i32.xor +; W32-NEXT: i32.add ; W32-NEXT: local.get 4 ; W32-NEXT: i32.reinterpret_f32 ; W32-NEXT: i32.and @@ -708,20 +700,20 @@ define float @test_ctselect_f32_chain(i1 %cond1, i1 %cond2, float %a, float %b, ; W64-NEXT: local.get 1 ; W64-NEXT: i32.const 1 ; W64-NEXT: i32.and -; W64-NEXT: i32.sub ; W64-NEXT: local.tee 1 +; W64-NEXT: i32.sub ; W64-NEXT: i32.const 0 ; W64-NEXT: local.get 0 ; W64-NEXT: i32.const 1 ; W64-NEXT: i32.and -; W64-NEXT: i32.sub ; W64-NEXT: local.tee 0 +; W64-NEXT: i32.sub ; W64-NEXT: local.get 2 ; W64-NEXT: i32.reinterpret_f32 ; W64-NEXT: i32.and ; W64-NEXT: local.get 0 ; W64-NEXT: i32.const -1 -; W64-NEXT: i32.xor +; W64-NEXT: i32.add ; W64-NEXT: local.get 3 ; W64-NEXT: i32.reinterpret_f32 ; W64-NEXT: i32.and @@ -729,7 +721,7 @@ define float @test_ctselect_f32_chain(i1 %cond1, i1 %cond2, float %a, float %b, ; W64-NEXT: i32.and ; W64-NEXT: local.get 1 ; W64-NEXT: i32.const -1 -; W64-NEXT: i32.xor +; W64-NEXT: i32.add ; W64-NEXT: local.get 4 ; W64-NEXT: i32.reinterpret_f32 ; W64-NEXT: i32.and @@ -750,14 +742,14 @@ define float @test_ctselect_f32_load(i1 %cond, ptr %p1, ptr %p2) { ; W32-NEXT: local.get 0 ; W32-NEXT: i32.const 1 
; W32-NEXT: i32.and -; W32-NEXT: i32.sub ; W32-NEXT: local.tee 0 +; W32-NEXT: i32.sub ; W32-NEXT: local.get 1 ; W32-NEXT: i32.load 0 ; W32-NEXT: i32.and ; W32-NEXT: local.get 0 ; W32-NEXT: i32.const -1 -; W32-NEXT: i32.xor +; W32-NEXT: i32.add ; W32-NEXT: local.get 2 ; W32-NEXT: i32.load 0 ; W32-NEXT: i32.and @@ -772,14 +764,14 @@ define float @test_ctselect_f32_load(i1 %cond, ptr %p1, ptr %p2) { ; W64-NEXT: local.get 0 ; W64-NEXT: i32.const 1 ; W64-NEXT: i32.and -; W64-NEXT: i32.sub ; W64-NEXT: local.tee 0 +; W64-NEXT: i32.sub ; W64-NEXT: local.get 1 ; W64-NEXT: i32.load 0 ; W64-NEXT: i32.and ; W64-NEXT: local.get 0 ; W64-NEXT: i32.const -1 -; W64-NEXT: i32.xor +; W64-NEXT: i32.add ; W64-NEXT: local.get 2 ; W64-NEXT: i32.load 0 ; W64-NEXT: i32.and @@ -803,14 +795,14 @@ define double @test_ctselect_f64_load(i1 %cond, ptr %p1, ptr %p2) { ; W32-NEXT: i64.extend_i32_u ; W32-NEXT: i64.const 1 ; W32-NEXT: i64.and -; W32-NEXT: i64.sub ; W32-NEXT: local.tee 3 +; W32-NEXT: i64.sub ; W32-NEXT: local.get 1 ; W32-NEXT: i64.load 0 ; W32-NEXT: i64.and ; W32-NEXT: local.get 3 ; W32-NEXT: i64.const -1 -; W32-NEXT: i64.xor +; W32-NEXT: i64.add ; W32-NEXT: local.get 2 ; W32-NEXT: i64.load 0 ; W32-NEXT: i64.and @@ -827,14 +819,14 @@ define double @test_ctselect_f64_load(i1 %cond, ptr %p1, ptr %p2) { ; W64-NEXT: i64.extend_i32_u ; W64-NEXT: i64.const 1 ; W64-NEXT: i64.and -; W64-NEXT: i64.sub ; W64-NEXT: local.tee 3 +; W64-NEXT: i64.sub ; W64-NEXT: local.get 1 ; W64-NEXT: i64.load 0 ; W64-NEXT: i64.and ; W64-NEXT: local.get 3 ; W64-NEXT: i64.const -1 -; W64-NEXT: i64.xor +; W64-NEXT: i64.add ; W64-NEXT: local.get 2 ; W64-NEXT: i64.load 0 ; W64-NEXT: i64.and @@ -856,8 +848,8 @@ define float @test_ctselect_f32_arithmetic(i1 %cond, float %x, float %y) { ; W32-NEXT: local.get 0 ; W32-NEXT: i32.const 1 ; W32-NEXT: i32.and -; W32-NEXT: i32.sub ; W32-NEXT: local.tee 0 +; W32-NEXT: i32.sub ; W32-NEXT: local.get 1 ; W32-NEXT: local.get 2 ; W32-NEXT: f32.add @@ -865,7 +857,7 @@ define float @test_ctselect_f32_arithmetic(i1 %cond, float %x, float %y) { ; W32-NEXT: i32.and ; W32-NEXT: local.get 0 ; W32-NEXT: i32.const -1 -; W32-NEXT: i32.xor +; W32-NEXT: i32.add ; W32-NEXT: local.get 1 ; W32-NEXT: local.get 2 ; W32-NEXT: f32.sub @@ -882,8 +874,8 @@ define float @test_ctselect_f32_arithmetic(i1 %cond, float %x, float %y) { ; W64-NEXT: local.get 0 ; W64-NEXT: i32.const 1 ; W64-NEXT: i32.and -; W64-NEXT: i32.sub ; W64-NEXT: local.tee 0 +; W64-NEXT: i32.sub ; W64-NEXT: local.get 1 ; W64-NEXT: local.get 2 ; W64-NEXT: f32.add @@ -891,7 +883,7 @@ define float @test_ctselect_f32_arithmetic(i1 %cond, float %x, float %y) { ; W64-NEXT: i32.and ; W64-NEXT: local.get 0 ; W64-NEXT: i32.const -1 -; W64-NEXT: i32.xor +; W64-NEXT: i32.add ; W64-NEXT: local.get 1 ; W64-NEXT: local.get 2 ; W64-NEXT: f32.sub diff --git a/llvm/test/CodeGen/WebAssembly/ctselect-side-effects.ll b/llvm/test/CodeGen/WebAssembly/ctselect-side-effects.ll index cc5746389aa93..5b20e892c64d2 100644 --- a/llvm/test/CodeGen/WebAssembly/ctselect-side-effects.ll +++ b/llvm/test/CodeGen/WebAssembly/ctselect-side-effects.ll @@ -47,13 +47,13 @@ define i32 @test_protected_no_branch(i1 %cond, i32 %a, i32 %b) { ; W32-NEXT: local.get 0 ; W32-NEXT: i32.const 1 ; W32-NEXT: i32.and -; W32-NEXT: i32.sub ; W32-NEXT: local.tee 0 +; W32-NEXT: i32.sub ; W32-NEXT: local.get 1 ; W32-NEXT: i32.and ; W32-NEXT: local.get 0 ; W32-NEXT: i32.const -1 -; W32-NEXT: i32.xor +; W32-NEXT: i32.add ; W32-NEXT: local.get 2 ; W32-NEXT: i32.and ; W32-NEXT: i32.or @@ -66,13 +66,13 @@ define i32 
@test_protected_no_branch(i1 %cond, i32 %a, i32 %b) { ; W64-NEXT: local.get 0 ; W64-NEXT: i32.const 1 ; W64-NEXT: i32.and -; W64-NEXT: i32.sub ; W64-NEXT: local.tee 0 +; W64-NEXT: i32.sub ; W64-NEXT: local.get 1 ; W64-NEXT: i32.and ; W64-NEXT: local.get 0 ; W64-NEXT: i32.const -1 -; W64-NEXT: i32.xor +; W64-NEXT: i32.add ; W64-NEXT: local.get 2 ; W64-NEXT: i32.and ; W64-NEXT: i32.or From ab6692c8b3228c948cb0cb8dbbc79a26106800df Mon Sep 17 00:00:00 2001 From: wizardengineer Date: Wed, 27 Aug 2025 07:49:43 -0400 Subject: [PATCH 22/63] [CT] Added both createProtect function for chaining and for NoMerge Flag --- .../SelectionDAG/SelectionDAGBuilder.cpp | 81 ++++++++++++++++++- .../SelectionDAG/SelectionDAGBuilder.h | 9 ++- 2 files changed, 83 insertions(+), 7 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index ee791310dd5e4..8276bd25b840d 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -6489,9 +6489,10 @@ void SelectionDAGBuilder::visitVectorExtractLastActive(const CallInst &I, setValue(&I, Result); } -/// Fallback implementation is an alternative approach for managing architectures that don't have -/// native support for Constant-Time Select. -SDValue SelectionDAGBuilder::createProtectedCtSelectFallback( +/// Fallback implementation is an alternative approach for managing +/// architectures that don't have native support for Constant-Time Select. This +/// function uses DAG Chaining +SDValue SelectionDAGBuilder::createProtectedCtSelectFallbackChain( SelectionDAG &DAG, const SDLoc &DL, SDValue Cond, SDValue T, SDValue F, EVT VT) { @@ -6582,6 +6583,78 @@ SDValue SelectionDAGBuilder::createProtectedCtSelectFallback( return Result; } +/// Fallback implementation is an alternative approach for managing +/// architectures that don't have native support for Constant-Time Select. 
This
+/// function uses the NoMerge flag.
+SDValue SelectionDAGBuilder::createProtectedCtSelectFallbackNoMerge(
+    SelectionDAG &DAG, const SDLoc &DL, SDValue Cond, SDValue T, SDValue F,
+    EVT VT) {
+  SDNodeFlags ProtectedFlag;
+  ProtectedFlag.setNoMerge(true);
+
+  SDValue WorkingT = T;
+  SDValue WorkingF = F;
+  EVT WorkingVT = VT;
+
+  if (VT.isVector() && !Cond.getValueType().isVector()) {
+    ElementCount NumElems = VT.getVectorElementCount();
+    EVT CondVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElems);
+
+    if (VT.isScalableVector()) {
+      Cond = DAG.getSplatVector(CondVT, DL, Cond);
+    } else {
+      Cond = DAG.getSplatBuildVector(CondVT, DL, Cond);
+    }
+  }
+
+  if (VT.isFloatingPoint()) {
+    if (VT.isVector()) {
+      // float vector -> int vector
+      EVT ElemVT = VT.getVectorElementType();
+      unsigned int ElemBitWidth = ElemVT.getScalarSizeInBits();
+      EVT IntElemVT = EVT::getIntegerVT(*DAG.getContext(), ElemBitWidth);
+
+      WorkingVT = EVT::getVectorVT(*DAG.getContext(), IntElemVT,
+                                   VT.getVectorElementCount());
+    } else {
+      WorkingVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
+    }
+
+    WorkingT = DAG.getBitcast(WorkingVT, T);
+    WorkingF = DAG.getBitcast(WorkingVT, F);
+  }
+
+  SDValue Mask = DAG.getSExtOrTrunc(Cond, DL, WorkingVT);
+
+  SDValue AllOnes;
+  if (WorkingVT.isScalableVector()) {
+    unsigned BitWidth = WorkingVT.getScalarSizeInBits();
+    APInt AllOnesVal = APInt::getAllOnes(BitWidth);
+    SDValue ScalarAllOnes =
+        DAG.getConstant(AllOnesVal, DL, WorkingVT.getScalarType());
+    AllOnes = DAG.getSplatVector(WorkingVT, DL, ScalarAllOnes);
+  } else {
+    AllOnes = DAG.getAllOnesConstant(DL, WorkingVT);
+  }
+
+  SDValue Invert =
+      DAG.getNode(ISD::XOR, DL, WorkingVT, Mask, AllOnes, ProtectedFlag);
+
+  // (or (and WorkingT, Mask), (and WorkingF, ~Mask))
+  SDValue TM =
+      DAG.getNode(ISD::AND, DL, WorkingVT, Mask, WorkingT, ProtectedFlag);
+  SDValue FM =
+      DAG.getNode(ISD::AND, DL, WorkingVT, Invert, WorkingF, ProtectedFlag);
+  SDValue Result = DAG.getNode(ISD::OR, DL, WorkingVT, TM, FM, ProtectedFlag);
+
+  // Convert back if needed
+  if (WorkingVT != VT) {
+    Result = DAG.getBitcast(VT, Result);
+  }
+
+  return Result;
+}
+
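Both fallbacks reduce to the same branchless bit-masking idiom; they differ only in how the emitted nodes are shielded from later DAG combines (an artificial chain versus the NoMerge flag). A minimal C model of the computation being built here (illustrative only, with uint64_t standing in for the working integer type):

    #include <stdint.h>

    // Branchless select: mask is all-ones when the condition holds and
    // all-zeros otherwise, so exactly one operand survives the AND/OR net.
    static inline uint64_t ct_select_model(uint64_t cond, uint64_t t,
                                           uint64_t f) {
      uint64_t mask = 0 - (cond & 1); // models the sign-extend of Cond
      uint64_t inv = mask ^ ~0ULL;    // XOR with all-ones, as in the lowering
      return (t & mask) | (f & inv);  // (or (and T, M), (and F, ~M))
    }

 /// Lower the call to the specified intrinsic function.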
void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { @@ -6800,7 +6873,7 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, } } - setValue(&I, createProtectedCtSelectFallback(DAG, DL, Cond, A, B, VT)); + setValue(&I, createProtectedCtSelectFallbackChain(DAG, DL, Cond, A, B, VT)); return; } case Intrinsic::call_preallocated_setup: { diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h index 6a307a33c6271..6068818a32656 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h @@ -214,9 +214,12 @@ class SelectionDAGBuilder { peelDominantCaseCluster(const SwitchInst &SI, SwitchCG::CaseClusterVector &Clusters, BranchProbability &PeeledCaseProb); - SDValue createProtectedCtSelectFallback(SelectionDAG &DAG, const SDLoc &DL, - SDValue Cond, SDValue T, SDValue F, - EVT VT); + SDValue createProtectedCtSelectFallbackChain(SelectionDAG &DAG, + const SDLoc &DL, SDValue Cond, + SDValue T, SDValue F, EVT VT); + SDValue createProtectedCtSelectFallbackNoMerge(SelectionDAG &DAG, + const SDLoc &DL, SDValue Cond, + SDValue T, SDValue F, EVT VT); private: const TargetMachine &TM; From 3353218813af97fc21fa83deda1ede2fa3c53a2c Mon Sep 17 00:00:00 2001 From: wizardengineer Date: Thu, 28 Aug 2025 12:57:46 -0400 Subject: [PATCH 23/63] [CT] Fix for x86 floating point vector types --- llvm/lib/Target/X86/X86InstrInfo.cpp | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp index 1463eeec6c696..690c23879edae 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -649,6 +649,18 @@ bool X86InstrInfo::expandCtSelectVector(MachineInstr &MI) const { MachineBasicBlock *MBB = MI.getParent(); + bool IsFloatVector = ( + // SSE float moves + Instruction.MoveOpc == X86::MOVAPSrr || // 128-bit single precision + Instruction.MoveOpc == X86::MOVAPDrr || // 128-bit double precision + // AVX float moves + Instruction.MoveOpc == X86::VMOVAPSrr || // VEX 128-bit single + Instruction.MoveOpc == X86::VMOVAPDrr || // VEX 128-bit double + // AVX 256-bit float moves + Instruction.MoveOpc == X86::VMOVAPSYrr || // VEX 256-bit single + Instruction.MoveOpc == X86::VMOVAPDYrr // VEX 256-bit double + ); + // Operand layout matches the TableGen definition: // (outs VR128:$dst, VR128:$tmpx, GR32:$tmpg), // (ins VR128:$t, VR128:$f, i8imm:$cond) @@ -678,8 +690,14 @@ bool X86InstrInfo::expandCtSelectVector(MachineInstr &MI) const { .addReg(SubReg) .setMIFlags(MachineInstr::MIFlag::NoMerge); - // Negate to convert 1 -> 0xFFFFFFFF, 0 -> 0x00000000 (negl %eax) - BuildMI(*MBB, MI, DL, get(X86::NEG32r), TmpGPR).addReg(TmpGPR); + if (IsFloatVector) { + // Shift left 31 bits to convert 1 -> 0x80000000, 0 -> 0x00000000 (shll $31, + // %eax) + BuildMI(*MBB, MI, DL, get(X86::SHL32ri), TmpGPR).addReg(TmpGPR).addImm(31); + } else { + // Negate to convert 1 -> 0xFFFFFFFF, 0 -> 0x00000000 (negl %eax) + BuildMI(*MBB, MI, DL, get(X86::NEG32r), TmpGPR).addReg(TmpGPR); + } // Broadcast to TmpX (vector mask) BuildMI(*MBB, MI, DL, get(X86::PXORrr), MaskReg) From ea30b338119872cbcca66fa7426b90c055fa1016 Mon Sep 17 00:00:00 2001 From: AkshayK Date: Fri, 29 Aug 2025 09:22:33 -0400 Subject: [PATCH 24/63] [CT] check for xmm0 register as live and handle accordingly --- llvm/lib/Target/X86/X86InstrInfo.cpp | 78 ++++++++++++++++++++++++---- 1 file 
changed, 69 insertions(+), 9 deletions(-) diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp index 690c23879edae..6f9ad5e678e90 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -747,17 +747,77 @@ bool X86InstrInfo::expandCtSelectVector(MachineInstr &MI) const { break; } - // BLENDV uses XMM0 as implicit mask register - // https://www.felixcloutier.com/x86/pblendvb - BuildMI(*MBB, MI, DL, get(X86::MOVAPSrr), X86::XMM0) - .addReg(MaskReg) - .setMIFlag(MachineInstr::MIFlag::NoMerge); + // Check if XMM0 is used as one of source registers, if yes then save it + // in Dst register and update FalseVal and TrueVal to Dst register + bool DidSaveXMM0 = false; + Register SavedXMM0 = X86::XMM0; + if (FalseVal == X86::XMM0 || TrueVal == X86::XMM0) { + Register SrcXMM0 = (FalseVal == X86::XMM0) ? FalseVal : TrueVal; + + // if XMM0 is one of the source registers, it will not match with Dst + // registers, so we need to move it to Dst register + BuildMI(*MBB, MI, DL, get(X86::MOVAPSrr), Dst) + .addReg(SrcXMM0) + .setMIFlags(MachineInstr::MIFlag::NoMerge); - BuildMI(*MBB, MI, DL, get(BlendOpc), Dst) - .addReg(FalseVal) - .addReg(TrueVal) - .setMIFlags(MachineInstr::MIFlag::NoMerge); + // update FalseVal and TrueVal to Dst register + if (FalseVal == X86::XMM0) + FalseVal = Dst; + if (TrueVal == X86::XMM0) + TrueVal = Dst; + + // update SavedXMM0 to Dst register + SavedXMM0 = Dst; + + // set DidSaveXMM0 to true to indicate that we saved XMM0 into Dst + // register + DidSaveXMM0 = true; + } + + if (MaskReg != X86::XMM0) { + // BLENDV uses XMM0 as implicit mask register + // https://www.felixcloutier.com/x86/pblendvb + BuildMI(*MBB, MI, DL, get(X86::MOVAPSrr), X86::XMM0) + .addReg(MaskReg) + .setMIFlag(MachineInstr::MIFlag::NoMerge); + + // move FalseVal to mask (use MaskReg as the dst of the blend) + BuildMI(*MBB, MI, DL, get(X86::MOVAPSrr), MaskReg) + .addReg(FalseVal) + .setMIFlags(MachineInstr::MIFlag::NoMerge); + // MaskReg := blend(MaskReg /*false*/, TrueVal /*true*/) ; mask in + // xmm0 + BuildMI(*MBB, MI, DL, get(BlendOpc), MaskReg) + .addReg(MaskReg) + .addReg(TrueVal) + .addReg(X86::XMM0) + .setMIFlags(MachineInstr::MIFlag::NoMerge); + + // restore XMM0 from SavedXMM0 if we saved it into Dst + if (DidSaveXMM0) { + BuildMI(*MBB, MI, DL, get(X86::MOVAPSrr), X86::XMM0) + .addReg(SavedXMM0) + .setMIFlags(MachineInstr::MIFlag::NoMerge); + } + // dst = result (now in MaskReg) + BuildMI(*MBB, MI, DL, get(X86::MOVAPSrr), Dst) + .addReg(MaskReg) + .setMIFlags(MachineInstr::MIFlag::NoMerge); + } else { + // move FalseVal to Dst register since MaskReg is XMM0 and Dst is not + BuildMI(*MBB, MI, DL, get(X86::MOVAPSrr), Dst) + .addReg(FalseVal) + .setMIFlags(MachineInstr::MIFlag::NoMerge); + + // Dst := blend(Dst /*false*/, TrueVal /*true*/) ; mask in + // xmm0 + BuildMI(*MBB, MI, DL, get(BlendOpc), Dst) + .addReg(Dst) + .addReg(TrueVal) + .addReg(X86::XMM0) + .setMIFlags(MachineInstr::MIFlag::NoMerge); + } } else { // dst = mask From 9702887d33814a98ad1442fbd71d7ff2aeb98a8b Mon Sep 17 00:00:00 2001 From: AkshayK Date: Fri, 29 Aug 2025 10:09:04 -0400 Subject: [PATCH 25/63] [CT] save and restore XMM0 for using as mask register --- llvm/lib/Target/X86/X86InstrInfo.cpp | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp index 6f9ad5e678e90..d5d7e02aab2e1 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/llvm/lib/Target/X86/X86InstrInfo.cpp @@ 
-772,6 +772,15 @@ bool X86InstrInfo::expandCtSelectVector(MachineInstr &MI) const {
     // set DidSaveXMM0 to true to indicate that we saved XMM0 into Dst
     // register
     DidSaveXMM0 = true;
+  } else if (MaskReg != X86::XMM0 && Dst != X86::XMM0) {
+
+    // if XMM0 is not allocated for any of the registers, we still need to
+    // save and restore it after using it as the mask register
+    BuildMI(*MBB, MI, DL, get(X86::MOVAPSrr), Dst)
+        .addReg(X86::XMM0)
+        .setMIFlags(MachineInstr::MIFlag::NoMerge);
+    SavedXMM0 = Dst;
+    DidSaveXMM0 = true;
   }

From 28ca4d5f7b98edd48ed9e20e16095eeac10d85fd Mon Sep 17 00:00:00 2001
From: wizardengineer
Date: Tue, 2 Sep 2025 15:18:37 -0400
Subject: [PATCH 26/63] [CT] Move floating-point logic to the right place

---
 llvm/lib/Target/X86/X86InstrInfo.cpp | 28 ++++++++--------------------
 1 file changed, 7 insertions(+), 21 deletions(-)

diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index d5d7e02aab2e1..d1057e11ba615 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -649,18 +649,6 @@ bool X86InstrInfo::expandCtSelectVector(MachineInstr &MI) const {

   MachineBasicBlock *MBB = MI.getParent();

-  bool IsFloatVector = (
-      // SSE float moves
-      Instruction.MoveOpc == X86::MOVAPSrr ||  // 128-bit single precision
-      Instruction.MoveOpc == X86::MOVAPDrr ||  // 128-bit double precision
-      // AVX float moves
-      Instruction.MoveOpc == X86::VMOVAPSrr || // VEX 128-bit single
-      Instruction.MoveOpc == X86::VMOVAPDrr || // VEX 128-bit double
-      // AVX 256-bit float moves
-      Instruction.MoveOpc == X86::VMOVAPSYrr || // VEX 256-bit single
-      Instruction.MoveOpc == X86::VMOVAPDYrr    // VEX 256-bit double
-  );
-
   // Operand layout matches the TableGen definition:
   //   (outs VR128:$dst, VR128:$tmpx, GR32:$tmpg),
   //   (ins VR128:$t, VR128:$f, i8imm:$cond)
@@ -690,15 +678,6 @@ bool X86InstrInfo::expandCtSelectVector(MachineInstr &MI) const {
       .addReg(SubReg)
       .setMIFlags(MachineInstr::MIFlag::NoMerge);

-  if (IsFloatVector) {
-    // Shift left 31 bits to convert 1 -> 0x80000000, 0 -> 0x00000000 (shll $31,
-    // %eax)
-    BuildMI(*MBB, MI, DL, get(X86::SHL32ri), TmpGPR).addReg(TmpGPR).addImm(31);
-  } else {
-    // Negate to convert 1 -> 0xFFFFFFFF, 0 -> 0x00000000 (negl %eax)
-    BuildMI(*MBB, MI, DL, get(X86::NEG32r), TmpGPR).addReg(TmpGPR);
-  }
-
   // Broadcast to TmpX (vector mask)
   BuildMI(*MBB, MI, DL, get(X86::PXORrr), MaskReg)
       .addReg(MaskReg)
@@ -747,6 +726,10 @@ bool X86InstrInfo::expandCtSelectVector(MachineInstr &MI) const {
       break;
     }

+    // Shift left 31 bits to convert 1 -> 0x80000000, 0 -> 0x00000000 (shll $31,
+    // %eax)
+    BuildMI(*MBB, MI, DL, get(X86::SHL32ri), TmpGPR).addReg(TmpGPR).addImm(31);
+
     // Check if XMM0 is used as one of source registers, if yes then save it
     // in Dst register and update FalseVal and TrueVal to Dst register
     bool DidSaveXMM0 = false;
@@ -814,6 +797,9 @@ bool X86InstrInfo::expandCtSelectVector(MachineInstr &MI) const {
         .addReg(MaskReg)
         .setMIFlags(MachineInstr::MIFlag::NoMerge);
   } else {
+    // Negate to convert 1 -> 0xFFFFFFFF, 0 -> 0x00000000 (negl %eax)
+    BuildMI(*MBB, MI, DL, get(X86::NEG32r), TmpGPR).addReg(TmpGPR);
+
     // move FalseVal to Dst register since MaskReg is XMM0 and Dst is not
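Why the mask construction differs between the two expansion strategies: the BLENDV family (blendvps/blendvpd/pblendvb) consults only the most significant bit of each mask lane, so shifting the 0/1 condition into bit 31 suffices, while the PAND/PANDN/POR path needs a full all-ones or all-zeros lane. A minimal C model of the two masks (names here are illustrative, not the expander's API):

    #include <stdint.h>

    // cond_bit comes from SETcc and is 0 or 1.
    static inline void ct_masks_model(uint32_t cond_bit, uint32_t *blend_mask,
                                      uint32_t *full_mask) {
      *blend_mask = cond_bit << 31; // shll $31: 0x80000000 or 0 (sign bit only)
      *full_mask = 0 - cond_bit;    // negl: 0xFFFFFFFF or 0 (whole lane)
    }

From 2f6082b1a4b954c5a355caee2ba3507dfeffdbe4 Mon Sep 17 00:00:00 2001
From: AkshayK
Date: Tue, 2 Sep 2025 15:51:59 -0400
Subject: [PATCH 27/63] [CT] update handling of mask and remove avx512 pseudo
 and usage pattern

---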
llvm/lib/Target/X86/X86InstrCMovSetCC.td | 76 ++++-------------------- llvm/lib/Target/X86/X86InstrInfo.cpp | 64 +++++++------------- 2 files changed, 34 insertions(+), 106 deletions(-) diff --git a/llvm/lib/Target/X86/X86InstrCMovSetCC.td b/llvm/lib/Target/X86/X86InstrCMovSetCC.td index 3081d9d22ab5d..a22cc3b265710 100644 --- a/llvm/lib/Target/X86/X86InstrCMovSetCC.td +++ b/llvm/lib/Target/X86/X86InstrCMovSetCC.td @@ -164,11 +164,16 @@ class CTSELECT_VEC512 : CTSELECT_VEC; //===----------------------------------------------------------------------===// // 128-bit pseudos (SSE2 baseline; we use PXOR/PAND/MOVD/PSHUFD in the expander) //===----------------------------------------------------------------------===// -let Predicates = [HasSSE2] in { + +let Predicates = [HasSSE1] in { def CTSELECT_V4F32 : CTSELECT_VEC128 { let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg"; } +} + +let Predicates = [HasSSE2] in { + def CTSELECT_V2F64 : CTSELECT_VEC128 { let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg"; } @@ -184,10 +189,6 @@ let Predicates = [HasSSE2] in { def CTSELECT_V16I8 : CTSELECT_VEC128 { let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg"; } - // If your build has v8f16, keep this; otherwise comment it out. - def CTSELECT_V8F16 : CTSELECT_VEC128 { - let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg"; - } } let Predicates = [HasAVX] in { @@ -210,10 +211,6 @@ let Predicates = [HasAVX] in { def CTSELECT_V16I8X : CTSELECT_VEC128X { let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg"; } - // If your build has v8f16, keep this; otherwise comment it out. - def CTSELECT_V8F16X : CTSELECT_VEC128X { - let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg"; - } } //===----------------------------------------------------------------------===// @@ -239,30 +236,6 @@ let Predicates = [HasAVX] in { def CTSELECT_V32I8 : CTSELECT_VEC256 { let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg"; } - // If your build has v16f16, keep this; otherwise comment it out. - def CTSELECT_V16F16 : CTSELECT_VEC256 { - let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg"; - } -} - -//===----------------------------------------------------------------------===// -// 512-bit pseudos -//===----------------------------------------------------------------------===// - -// Core AVX-512F types -let Predicates = [HasAVX512] in { - def CTSELECT_V16F32 : CTSELECT_VEC512 { - let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg"; - } - def CTSELECT_V8F64 : CTSELECT_VEC512 { - let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg"; - } - def CTSELECT_V16I32 : CTSELECT_VEC512 { - let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg"; - } - def CTSELECT_V8I64 : CTSELECT_VEC512{ - let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg"; - } } //===----------------------------------------------------------------------===// @@ -274,6 +247,13 @@ let Predicates = [HasAVX512] in { // * Temps (tmpx/tmpy,tmpg) are not in the pattern; they’re outs allocated by RA. 
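// * For illustration, the post-RA expander is expected to turn one of these
//   pseudos into roughly the following sequence (a sketch, not exact
//   encodings; see X86InstrInfo::expandCtSelectVector for the real emitter):
//     mov $0, %tmpg ; set<cc> %tmpg_8     ; materialize the condition as 0/1
//     neg %tmpg (or shl $31, %tmpg)       ; all-ones vs. sign-bit lane mask
//     movd %tmpg, %tmpx ; pshufd $0, ...  ; broadcast the mask to the vector
//     pand/pandn/por (or [p]blendv*)      ; merge $t/$f into $dst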
//===----------------------------------------------------------------------===// +let Predicates = [HasSSE1] in { + + // 128-bit float (bitwise-equivalent ops in expander) + def : Pat<(v4f32 (X86ctselect VR128:$t, VR128:$f, (i8 timm:$cc), EFLAGS)), + (CTSELECT_V4F32 VR128:$t, VR128:$f, timm:$cc)>; +} + let Predicates = [HasSSE2] in { // 128-bit integer @@ -285,16 +265,8 @@ let Predicates = [HasSSE2] in { (CTSELECT_V8I16 VR128:$t, VR128:$f, timm:$cc)>; def : Pat<(v16i8 (X86ctselect VR128:$t, VR128:$f, (i8 timm:$cc), EFLAGS)), (CTSELECT_V16I8 VR128:$t, VR128:$f, timm:$cc)>; - - // 128-bit float (bitwise-equivalent ops in expander) - def : Pat<(v4f32 (X86ctselect VR128:$t, VR128:$f, (i8 timm:$cc), EFLAGS)), - (CTSELECT_V4F32 VR128:$t, VR128:$f, timm:$cc)>; def : Pat<(v2f64 (X86ctselect VR128:$t, VR128:$f, (i8 timm:$cc), EFLAGS)), (CTSELECT_V2F64 VR128:$t, VR128:$f, timm:$cc)>; - - // 128-bit f16 (optional) - def : Pat<(v8f16 (X86ctselect VR128:$t, VR128:$f, (i8 timm:$cc), EFLAGS)), - (CTSELECT_V8F16 VR128:$t, VR128:$f, timm:$cc)>; } let Predicates = [HasAVX] in { @@ -314,28 +286,6 @@ let Predicates = [HasAVX] in { (CTSELECT_V8F32 VR256:$t, VR256:$f, timm:$cc)>; def : Pat<(v4f64 (X86ctselect VR256:$t, VR256:$f, (i8 timm:$cc), EFLAGS)), (CTSELECT_V4F64 VR256:$t, VR256:$f, timm:$cc)>; - - // 256-bit f16 (optional) - def : Pat<(v16f16 (X86ctselect VR256:$t, VR256:$f, (i8 timm:$cc), EFLAGS)), - (CTSELECT_V16F16 VR256:$t, VR256:$f, timm:$cc)>; -} - -//===----------------------------------------------------------------------===// -// 512-bit selection patterns -// Note: SDNode is X86ctselect with SDNPInGlue; pattern names EFLAGS explicitly. -// Temps ($tmpx,$tmpg) are outs allocated by RA; not mentioned in patterns. -//===----------------------------------------------------------------------===// - -let Predicates = [HasAVX512] in { - def : Pat<(v16f32 (X86ctselect VR512:$t, VR512:$f, (i8 timm:$cc), EFLAGS)), - (CTSELECT_V16F32 VR512:$t, VR512:$f, timm:$cc)>; - def : Pat<(v8f64 (X86ctselect VR512:$t, VR512:$f, (i8 timm:$cc), EFLAGS)), - (CTSELECT_V8F64 VR512:$t, VR512:$f, timm:$cc)>; - - def : Pat<(v16i32 (X86ctselect VR512:$t, VR512:$f, (i8 timm:$cc), EFLAGS)), - (CTSELECT_V16I32 VR512:$t, VR512:$f, timm:$cc)>; - def : Pat<(v8i64 (X86ctselect VR512:$t, VR512:$f, (i8 timm:$cc), EFLAGS)), - (CTSELECT_V8I64 VR512:$t, VR512:$f, timm:$cc)>; } let Predicates = [HasCMOV, HasCF] in { diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp index d1057e11ba615..1fd6efaaf7b1e 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -483,7 +483,6 @@ struct CtSelectInstructions { unsigned IntMoveOpc; unsigned MoveOpc; bool Use256; - bool UseVEX; bool UseBlendInstr; }; @@ -500,6 +499,7 @@ getCtSelectInstructions(unsigned Opcode, const X86Subtarget &Subtarget) { Instructions.BroadcastOpc = X86::PSHUFDri; Instructions.IntMoveOpc = X86::MOVDI2PDIrr; Instructions.MoveOpc = X86::MOVAPDrr; + Instructions.UseBlendInstr = true; } else { llvm_unreachable("Double precision vectors require SSE2"); } @@ -521,6 +521,8 @@ getCtSelectInstructions(unsigned Opcode, const X86Subtarget &Subtarget) { Instructions.IntMoveOpc = X86::MOVDI2PDIrr; Instructions.MoveOpc = X86::MOVAPSrr; } else { + // fallback to SSE1, only support four 32-bit single precision + // floating-point values Instructions.PAndOpc = X86::ANDPSrr; Instructions.PAndnOpc = X86::ANDNPSrr; Instructions.POrOpc = X86::ORPSrr; @@ -544,25 +546,12 @@ getCtSelectInstructions(unsigned Opcode, const 
X86Subtarget &Subtarget) { llvm_unreachable("Integer vector operations require SSE2"); } break; - case X86::CTSELECT_V8F16: - if (Subtarget.hasSSE2()) { - Instructions.PAndOpc = X86::PANDrr; - Instructions.PAndnOpc = X86::PANDNrr; - Instructions.POrOpc = X86::PORrr; - Instructions.BroadcastOpc = X86::PSHUFDri; - Instructions.IntMoveOpc = X86::MOVDI2PDIrr; - Instructions.MoveOpc = X86::MOVDQArr; - } else { - llvm_unreachable("FP16 vector operations require SSE2"); - } - break; case X86::CTSELECT_V4F32X: case X86::CTSELECT_V4I32X: case X86::CTSELECT_V2F64X: case X86::CTSELECT_V2I64X: case X86::CTSELECT_V8I16X: case X86::CTSELECT_V16I8X: - case X86::CTSELECT_V8F16X: if (Subtarget.hasAVX()) { Instructions.PAndOpc = X86::VPANDrr; Instructions.PAndnOpc = X86::VPANDNrr; @@ -573,7 +562,6 @@ getCtSelectInstructions(unsigned Opcode, const X86Subtarget &Subtarget) { : (Opcode == X86::CTSELECT_V2F64X) ? X86::VMOVAPDrr : X86::VMOVDQArr; - Instructions.UseVEX = true; } else { llvm_unreachable("AVX variants require AVX support"); } @@ -589,7 +577,6 @@ getCtSelectInstructions(unsigned Opcode, const X86Subtarget &Subtarget) { Instructions.MoveOpc = (Opcode == X86::CTSELECT_V8F32) ? X86::VMOVAPSYrr : X86::VMOVDQAYrr; Instructions.Use256 = true; - Instructions.UseVEX = true; } else { llvm_unreachable("256-bit vectors require AVX"); } @@ -605,14 +592,12 @@ getCtSelectInstructions(unsigned Opcode, const X86Subtarget &Subtarget) { Instructions.MoveOpc = (Opcode == X86::CTSELECT_V4F64) ? X86::VMOVAPDYrr : X86::VMOVDQAYrr; Instructions.Use256 = true; - Instructions.UseVEX = true; } else { llvm_unreachable("256-bit vectors require AVX"); } break; case X86::CTSELECT_V16I16: case X86::CTSELECT_V32I8: - case X86::CTSELECT_V16F16: if (Subtarget.hasAVX2()) { Instructions.PAndOpc = X86::VPANDYrr; Instructions.PAndnOpc = X86::VPANDNYrr; @@ -621,7 +606,6 @@ getCtSelectInstructions(unsigned Opcode, const X86Subtarget &Subtarget) { Instructions.IntMoveOpc = X86::VMOVDI2PDIrr; Instructions.MoveOpc = X86::VMOVDQAYrr; Instructions.Use256 = true; - Instructions.UseVEX = true; } else if (Subtarget.hasAVX()) { Instructions.PAndOpc = X86::VPANDYrr; Instructions.PAndnOpc = X86::VPANDNYrr; @@ -630,7 +614,6 @@ getCtSelectInstructions(unsigned Opcode, const X86Subtarget &Subtarget) { Instructions.IntMoveOpc = X86::VMOVDI2PDIrr; Instructions.MoveOpc = X86::VMOVDQAYrr; Instructions.Use256 = true; - Instructions.UseVEX = true; } else { llvm_unreachable("256-bit integer vectors require AVX"); } @@ -678,6 +661,15 @@ bool X86InstrInfo::expandCtSelectVector(MachineInstr &MI) const { .addReg(SubReg) .setMIFlags(MachineInstr::MIFlag::NoMerge); + if (Instruction.UseBlendInstr && Subtarget.hasSSE41()) { + // Shift left 31 bits to convert 1 -> 0x80000000, 0 -> 0x00000000 (shll $31, + // %eax) + BuildMI(*MBB, MI, DL, get(X86::SHL32ri), TmpGPR).addReg(TmpGPR).addImm(31); + } else { + // Negate to convert 1 -> 0xFFFFFFFF, 0 -> 0x00000000 (negl %eax) + BuildMI(*MBB, MI, DL, get(X86::NEG32r), TmpGPR).addReg(TmpGPR); + } + // Broadcast to TmpX (vector mask) BuildMI(*MBB, MI, DL, get(X86::PXORrr), MaskReg) .addReg(MaskReg) @@ -696,7 +688,7 @@ bool X86InstrInfo::expandCtSelectVector(MachineInstr &MI) const { .addImm(0) .setMIFlags(MachineInstr::MIFlag::NoMerge); } else { - if (Subtarget.hasSSE2() || Instruction.UseVEX) { + if (Subtarget.hasSSE2() || Subtarget.hasAVX()) { BuildMI(*MBB, MI, DL, get(Instruction.BroadcastOpc), MaskReg) .addReg(MaskReg) .addImm(0x00) @@ -710,8 +702,7 @@ bool X86InstrInfo::expandCtSelectVector(MachineInstr &MI) const { } } - 
if (Instruction.UseBlendInstr && Subtarget.hasSSE41() && - !Instruction.Use256) { + if (Instruction.UseBlendInstr && Subtarget.hasSSE41()) { // Use dedicated blend instructions for SSE4.1+ unsigned BlendOpc; switch (Opcode) { @@ -722,14 +713,11 @@ bool X86InstrInfo::expandCtSelectVector(MachineInstr &MI) const { BlendOpc = X86::BLENDVPDrr0; break; default: + // alias for pblendvb that takes xmm0 as implicit mask register BlendOpc = X86::PBLENDVBrr0; break; } - // Shift left 31 bits to convert 1 -> 0x80000000, 0 -> 0x00000000 (shll $31, - // %eax) - BuildMI(*MBB, MI, DL, get(X86::SHL32ri), TmpGPR).addReg(TmpGPR).addImm(31); - // Check if XMM0 is used as one of source registers, if yes then save it // in Dst register and update FalseVal and TrueVal to Dst register bool DidSaveXMM0 = false; @@ -739,7 +727,7 @@ bool X86InstrInfo::expandCtSelectVector(MachineInstr &MI) const { // if XMM0 is one of the source registers, it will not match with Dst // registers, so we need to move it to Dst register - BuildMI(*MBB, MI, DL, get(X86::MOVAPSrr), Dst) + BuildMI(*MBB, MI, DL, get(Instruction.MoveOpc), Dst) .addReg(SrcXMM0) .setMIFlags(MachineInstr::MIFlag::NoMerge); @@ -759,7 +747,7 @@ bool X86InstrInfo::expandCtSelectVector(MachineInstr &MI) const { // if XMM0 is not allocated for any of the register, we stil need to save // and restore it after using as mask register - BuildMI(*MBB, MI, DL, get(X86::MOVAPSrr), Dst) + BuildMI(*MBB, MI, DL, get(Instruction.MoveOpc), Dst) .addReg(X86::XMM0) .setMIFlags(MachineInstr::MIFlag::NoMerge); SavedXMM0 = Dst; @@ -769,7 +757,7 @@ bool X86InstrInfo::expandCtSelectVector(MachineInstr &MI) const { if (MaskReg != X86::XMM0) { // BLENDV uses XMM0 as implicit mask register // https://www.felixcloutier.com/x86/pblendvb - BuildMI(*MBB, MI, DL, get(X86::MOVAPSrr), X86::XMM0) + BuildMI(*MBB, MI, DL, get(Instruction.MoveOpc), X86::XMM0) .addReg(MaskReg) .setMIFlag(MachineInstr::MIFlag::NoMerge); @@ -788,20 +776,17 @@ bool X86InstrInfo::expandCtSelectVector(MachineInstr &MI) const { // restore XMM0 from SavedXMM0 if we saved it into Dst if (DidSaveXMM0) { - BuildMI(*MBB, MI, DL, get(X86::MOVAPSrr), X86::XMM0) + BuildMI(*MBB, MI, DL, get(Instruction.MoveOpc), X86::XMM0) .addReg(SavedXMM0) .setMIFlags(MachineInstr::MIFlag::NoMerge); } // dst = result (now in MaskReg) - BuildMI(*MBB, MI, DL, get(X86::MOVAPSrr), Dst) + BuildMI(*MBB, MI, DL, get(Instruction.MoveOpc), Dst) .addReg(MaskReg) .setMIFlags(MachineInstr::MIFlag::NoMerge); } else { - // Negate to convert 1 -> 0xFFFFFFFF, 0 -> 0x00000000 (negl %eax) - BuildMI(*MBB, MI, DL, get(X86::NEG32r), TmpGPR).addReg(TmpGPR); - // move FalseVal to Dst register since MaskReg is XMM0 and Dst is not - BuildMI(*MBB, MI, DL, get(X86::MOVAPSrr), Dst) + BuildMI(*MBB, MI, DL, get(Instruction.MoveOpc), Dst) .addReg(FalseVal) .setMIFlags(MachineInstr::MIFlag::NoMerge); @@ -6855,14 +6840,12 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { case X86::CTSELECT_V2F64: case X86::CTSELECT_V4F32: - case X86::CTSELECT_V8F16: case X86::CTSELECT_V2I64: case X86::CTSELECT_V4I32: case X86::CTSELECT_V8I16: case X86::CTSELECT_V16I8: case X86::CTSELECT_V2F64X: case X86::CTSELECT_V4F32X: - case X86::CTSELECT_V8F16X: case X86::CTSELECT_V2I64X: case X86::CTSELECT_V4I32X: case X86::CTSELECT_V8I16X: @@ -6873,11 +6856,6 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { case X86::CTSELECT_V32I8: case X86::CTSELECT_V4F64: case X86::CTSELECT_V8F32: - case X86::CTSELECT_V16F16: - case X86::CTSELECT_V8I64: - case X86::CTSELECT_V16I32: 
- case X86::CTSELECT_V8F64: - case X86::CTSELECT_V16F32: return expandCtSelectVector(MI); } return false; From 82c59e7c2e502c3243921d48da1542f4ac3a4020 Mon Sep 17 00:00:00 2001 From: AkshayK Date: Tue, 2 Sep 2025 17:46:14 -0400 Subject: [PATCH 28/63] [CT] report error on vector types with avx512 and improve lower ctselect --- .../SelectionDAG/SelectionDAGBuilder.cpp | 6 ++++ llvm/lib/Target/X86/X86ISelLowering.cpp | 31 ++++++------------- 2 files changed, 15 insertions(+), 22 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 8276bd25b840d..1c8069228dd15 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -6854,6 +6854,12 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, "llvm.ct.select: predicates with vector types not supported yet"); } + // report error for vector types of 512 bits + if (VT.isVector() && VT.getSizeInBits() == 512) { + report_fatal_error( + "llvm.ct.select: vector types of 512 bits not supported yet"); + } + // Handle scalar types if (TLI.isSelectSupported( TargetLoweringBase::SelectSupportKind::CtSelect) && diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 9f400a970793c..073938be5c04d 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -25380,27 +25380,20 @@ SDValue X86TargetLowering::LowerCTSELECT(SDValue Op, SelectionDAG &DAG) const { unsigned VectorWidth = VT.getSizeInBits(); MVT EltVT = VT.getVectorElementType(); - // Check if we have the necessary SIMD support - bool HasSSE = Subtarget.hasSSE1(); - bool HasAVX = Subtarget.hasAVX(); - bool HasAVX512 = Subtarget.hasAVX512(); - // For 512-bit vectors, we need AVX512 - if (VectorWidth == 512 && !HasAVX512) + // dont support 512-bit vectors yet; report error?? 
+ if (VectorWidth == 512) return SDValue(); - // For 256-bit vectors, we need at least AVX - if (VectorWidth == 256 && !HasAVX) + if (VectorWidth == 256 && !Subtarget.hasAVX()) return SDValue(); - - // For 128-bit vectors, we need at least SSE - if (VectorWidth == 128 && !HasSSE) + if (VectorWidth == 128 && !Subtarget.hasSSE1()) return SDValue(); // Handle special cases for floating point vectors if (EltVT.isFloatingPoint()) { // For vector floating point with AVX, use VBLENDV-style operations - if (HasAVX && (VectorWidth == 256 || VectorWidth == 128)) { + if (Subtarget.hasAVX() && (VectorWidth == 256 || VectorWidth == 128)) { // Convert to bitwise operations using the condition MVT IntVT = VT.changeVectorElementTypeToInteger(); SDValue IntOp1 = DAG.getBitcast(IntVT, TrueOp); @@ -25441,13 +25434,7 @@ SDValue X86TargetLowering::LowerCTSELECT(SDValue Op, SelectionDAG &DAG) const { CC = Cond.getOperand(0); SDValue Cmp = Cond.getOperand(1); - bool IllegalFPCMov = false; - if (VT.isFloatingPoint() && !VT.isVector() && - !isScalarFPTypeInSSEReg(VT) && Subtarget.canUseCMOV()) - IllegalFPCMov = !hasFPCMov(cast(CC)->getSExtValue()); - - if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) || - Cmp.getOpcode() == X86ISD::BT) { + if ((isX86LogicalCmp(Cmp)) || Cmp.getOpcode() == X86ISD::BT) { Cond = Cmp; AddTest = false; } @@ -25496,9 +25483,9 @@ SDValue X86TargetLowering::LowerCTSELECT(SDValue Op, SelectionDAG &DAG) const { if (T1.getValueType() == T2.getValueType() && T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode() != ISD::CopyFromReg) { - SDValue Cmov = DAG.getNode(X86ISD::CTSELECT, DL, T1.getValueType(), T2, - T1, CC, ProcessedCond); - return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov); + SDValue CtSelect = DAG.getNode(X86ISD::CTSELECT, DL, T1.getValueType(), + T2, T1, CC, ProcessedCond); + return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), CtSelect); } } From 4d00929303131b143c9c3a06265636decca26679 Mon Sep 17 00:00:00 2001 From: AkshayK Date: Tue, 2 Sep 2025 21:53:48 -0400 Subject: [PATCH 29/63] [CT] add back support for v8f16 vector types that was removed --- llvm/lib/Target/X86/X86ISelLowering.cpp | 7 ++++--- llvm/lib/Target/X86/X86InstrCMovSetCC.td | 23 +++++++++++++++++++++++ llvm/lib/Target/X86/X86InstrInfo.cpp | 14 ++++++++++++++ 3 files changed, 41 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 073938be5c04d..199b47aba3da3 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -25381,12 +25381,13 @@ SDValue X86TargetLowering::LowerCTSELECT(SDValue Op, SelectionDAG &DAG) const { unsigned VectorWidth = VT.getSizeInBits(); MVT EltVT = VT.getVectorElementType(); - // dont support 512-bit vectors yet; report error?? + // don't support 512-bit vectors yet; report error?? 
if (VectorWidth == 512) return SDValue(); if (VectorWidth == 256 && !Subtarget.hasAVX()) return SDValue(); + if (VectorWidth == 128 && !Subtarget.hasSSE1()) return SDValue(); @@ -25496,8 +25497,8 @@ SDValue X86TargetLowering::LowerCTSELECT(SDValue Op, SelectionDAG &DAG) const { TrueOp = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, TrueOp); FalseOp = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, FalseOp); SDValue Ops[] = {FalseOp, TrueOp, CC, ProcessedCond}; - SDValue Cmov = DAG.getNode(X86ISD::CTSELECT, DL, MVT::i32, Ops); - return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov); + SDValue CtSelect = DAG.getNode(X86ISD::CTSELECT, DL, MVT::i32, Ops); + return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), CtSelect); } if (isScalarFPTypeInSSEReg(VT)) { diff --git a/llvm/lib/Target/X86/X86InstrCMovSetCC.td b/llvm/lib/Target/X86/X86InstrCMovSetCC.td index a22cc3b265710..ede8d80f0b897 100644 --- a/llvm/lib/Target/X86/X86InstrCMovSetCC.td +++ b/llvm/lib/Target/X86/X86InstrCMovSetCC.td @@ -189,6 +189,11 @@ let Predicates = [HasSSE2] in { def CTSELECT_V16I8 : CTSELECT_VEC128 { let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg"; } + + // If your build has v8f16, keep this; otherwise comment it out. + def CTSELECT_V8F16 : CTSELECT_VEC128 { + let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg"; + } } let Predicates = [HasAVX] in { @@ -211,6 +216,11 @@ let Predicates = [HasAVX] in { def CTSELECT_V16I8X : CTSELECT_VEC128X { let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg"; } + + // If your build has v8f16, keep this; otherwise comment it out. + def CTSELECT_V8F16X : CTSELECT_VEC128X { + let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg"; + } } //===----------------------------------------------------------------------===// @@ -236,6 +246,11 @@ let Predicates = [HasAVX] in { def CTSELECT_V32I8 : CTSELECT_VEC256 { let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg"; } + + // If your build has v16f16, keep this; otherwise comment it out. 
+ def CTSELECT_V16F16 : CTSELECT_VEC256 { + let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg"; + } } //===----------------------------------------------------------------------===// @@ -267,6 +282,10 @@ let Predicates = [HasSSE2] in { (CTSELECT_V16I8 VR128:$t, VR128:$f, timm:$cc)>; def : Pat<(v2f64 (X86ctselect VR128:$t, VR128:$f, (i8 timm:$cc), EFLAGS)), (CTSELECT_V2F64 VR128:$t, VR128:$f, timm:$cc)>; + + // 128-bit f16 (optional) + def : Pat<(v8f16 (X86ctselect VR128:$t, VR128:$f, (i8 timm:$cc), EFLAGS)), + (CTSELECT_V8F16 VR128:$t, VR128:$f, timm:$cc)>; } let Predicates = [HasAVX] in { @@ -286,6 +305,10 @@ let Predicates = [HasAVX] in { (CTSELECT_V8F32 VR256:$t, VR256:$f, timm:$cc)>; def : Pat<(v4f64 (X86ctselect VR256:$t, VR256:$f, (i8 timm:$cc), EFLAGS)), (CTSELECT_V4F64 VR256:$t, VR256:$f, timm:$cc)>; + + // 256-bit f16 (optional) + def : Pat<(v16f16 (X86ctselect VR256:$t, VR256:$f, (i8 timm:$cc), EFLAGS)), + (CTSELECT_V16F16 VR256:$t, VR256:$f, timm:$cc)>; } let Predicates = [HasCMOV, HasCF] in { diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp index 1fd6efaaf7b1e..536e706334cc0 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -546,12 +546,25 @@ getCtSelectInstructions(unsigned Opcode, const X86Subtarget &Subtarget) { llvm_unreachable("Integer vector operations require SSE2"); } break; + case X86::CTSELECT_V8F16: + if (Subtarget.hasSSE2()) { + Instructions.PAndOpc = X86::PANDrr; + Instructions.PAndnOpc = X86::PANDNrr; + Instructions.POrOpc = X86::PORrr; + Instructions.BroadcastOpc = X86::PSHUFDri; + Instructions.IntMoveOpc = X86::MOVDI2PDIrr; + Instructions.MoveOpc = X86::MOVDQArr; + } else { + llvm_unreachable("FP16 vector operations require SSE2"); + } + break; case X86::CTSELECT_V4F32X: case X86::CTSELECT_V4I32X: case X86::CTSELECT_V2F64X: case X86::CTSELECT_V2I64X: case X86::CTSELECT_V8I16X: case X86::CTSELECT_V16I8X: + case X86::CTSELECT_V8F16X: if (Subtarget.hasAVX()) { Instructions.PAndOpc = X86::VPANDrr; Instructions.PAndnOpc = X86::VPANDNrr; @@ -598,6 +611,7 @@ getCtSelectInstructions(unsigned Opcode, const X86Subtarget &Subtarget) { break; case X86::CTSELECT_V16I16: case X86::CTSELECT_V32I8: + case X86::CTSELECT_V16F16: if (Subtarget.hasAVX2()) { Instructions.PAndOpc = X86::VPANDYrr; Instructions.PAndnOpc = X86::VPANDNYrr; From b62fd47e817641f94f6629eaf17435bd29874d68 Mon Sep 17 00:00:00 2001 From: AkshayK Date: Thu, 4 Sep 2025 10:22:35 -0400 Subject: [PATCH 30/63] [CT] disable bundling for vector types --- llvm/lib/Target/X86/X86InstrInfo.cpp | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp index 536e706334cc0..ea779fe66946f 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -656,8 +656,6 @@ bool X86InstrInfo::expandCtSelectVector(MachineInstr &MI) const { Register TrueVal = MI.getOperand(4).getReg(); // false_value X86::CondCode CC = X86::CondCode(MI.getOperand(5).getImm()); // condition - auto BundleStart = MI.getIterator(); - // Create scalar mask in tempGPR and broadcast to vector mask BuildMI(*MBB, MI, DL, get(X86::MOV32ri), TmpGPR) .addImm(0) @@ -838,15 +836,12 @@ bool X86InstrInfo::expandCtSelectVector(MachineInstr &MI) const { .setMIFlags(MachineInstr::MIFlag::NoMerge); } - MI.eraseFromParent(); + // TODO: Bundle instructions to avoid future optimizations from breaking up + // the instructions 
sequence. However, bundled instructions disappear after + // unpack-mi-bundles pass. Look into the issue and fix it before enabling the + // instruction bundling. - auto BundleEnd = MI.getIterator(); - if (BundleStart != BundleEnd) { - // Only bundle if we have multiple instructions - MachineInstr *BundleHeader = - BuildMI(*MBB, BundleStart, DL, get(TargetOpcode::BUNDLE)); - finalizeBundle(*MBB, BundleHeader->getIterator(), std::next(BundleEnd)); - } + MI.eraseFromParent(); return true; }
From 00355a28e999d7bbe377010c0be305bf8c2798c6 Mon Sep 17 00:00:00 2001 From: AkshayK Date: Fri, 5 Sep 2025 01:45:48 -0400 Subject: [PATCH 31/63] [CT] legalize vector types and remove error reporting for 512-bit vectors --- .../SelectionDAG/SelectionDAGBuilder.cpp | 6 - llvm/lib/Target/X86/X86ISelLowering.cpp | 25 ++- llvm/test/CodeGen/X86/ctselect-edge-cases.ll | 65 +++++--- .../test/CodeGen/X86/ctselect-optimization.ll | 4 +- llvm/test/CodeGen/X86/ctselect-vector.ll | 153 ++++++++---------- llvm/test/CodeGen/X86/ctselect.ll | 13 +- 6 files changed, 137 insertions(+), 129 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 1c8069228dd15..8276bd25b840d 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -6854,12 +6854,6 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, "llvm.ct.select: predicates with vector types not supported yet"); } - // report error for vector types of 512 bits - if (VT.isVector() && VT.getSizeInBits() == 512) { - report_fatal_error( - "llvm.ct.select: vector types of 512 bits not supported yet"); - } - // Handle scalar types if (TLI.isSelectSupported( TargetLoweringBase::SelectSupportKind::CtSelect) &&
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 199b47aba3da3..91b5ed5c3f8b6 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -2567,6 +2567,22 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, addRegisterClass(MVT::x86amx, &X86::TILERegClass); } + // Handle 512-bit vector CTSELECT without AVX512 by setting them to Expand + // This allows type legalization to split them into smaller vectors + for (auto VT : {MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64, MVT::v32f16, + MVT::v16f32, MVT::v8f64}) { + setOperationAction(ISD::CTSELECT, VT, Expand); + } + + // Handle 256-bit vector CTSELECT without AVX by setting them to Expand + // This allows type legalization to split them into 128-bit vectors + if (!Subtarget.hasAVX()) { + for (auto VT : {MVT::v4f64, MVT::v4i64, MVT::v8i32, MVT::v16i16, + MVT::v16f16, MVT::v32i8, MVT::v8f32}) { + setOperationAction(ISD::CTSELECT, VT, Expand); + } + } + // We want to custom lower some of our intrinsics. setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom); @@ -25381,12 +25397,9 @@ SDValue X86TargetLowering::LowerCTSELECT(SDValue Op, SelectionDAG &DAG) const { unsigned VectorWidth = VT.getSizeInBits(); MVT EltVT = VT.getVectorElementType(); - // don't support 512-bit vectors yet; report error??
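For reference, the masked-blend sequence that the expanded vector CTSELECT pseudos emit on SSE2 (neg to turn the i1 condition into an all-ones or all-zeros word, movd plus pshufd to broadcast it, then pand/pandn/por to blend) corresponds to the intrinsics-level sketch below. The function name and the use of _mm_set1_epi32 for the broadcast are illustrative assumptions, not code taken from the patch:

#include <emmintrin.h>

// Constant-time 128-bit select: both inputs are always read and the same
// instruction sequence executes regardless of cond.
static __m128i ct_select_128(int cond, __m128i t, __m128i f) {
  int m = -(cond != 0);                       // 0x00000000 or 0xFFFFFFFF, branch-free
  __m128i mask = _mm_set1_epi32(m);           // movd + pshufd $0 broadcast
  __m128i tpart = _mm_and_si128(mask, t);     // pand: keep t where mask is all-ones
  __m128i fpart = _mm_andnot_si128(mask, f);  // pandn: keep f where mask is all-zeros
  return _mm_or_si128(tpart, fpart);          // por: combine the two halves
}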
- if (VectorWidth == 512) - return SDValue(); - - if (VectorWidth == 256 && !Subtarget.hasAVX()) - return SDValue(); + // 512-bit vectors without AVX512 are now handled by type legalization + // (Expand action); 256-bit vectors without AVX are likewise handled by + // type legalization (Expand action). if (VectorWidth == 128 && !Subtarget.hasSSE1()) return SDValue();
diff --git a/llvm/test/CodeGen/X86/ctselect-edge-cases.ll b/llvm/test/CodeGen/X86/ctselect-edge-cases.ll index 06791a3262749..fb6b4706d62d8 100644 --- a/llvm/test/CodeGen/X86/ctselect-edge-cases.ll +++ b/llvm/test/CodeGen/X86/ctselect-edge-cases.ll @@ -13,11 +13,17 @@ define i128 @test_ctselect_i128(i1 %cond, i128 %a, i128 %b) { ; X64-NEXT: cmovneq %rsi, %rax ; X64-NEXT: cmovneq %rdx, %r8 ; X64-NEXT: movq %r8, %rdx -; X64: retq +; X64-NEXT: retq ; ; X32-LABEL: test_ctselect_i128: ; X32: # %bb.0: -; X32: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: pushl %edi +; X32-NEXT: .cfi_def_cfa_offset 8 +; X32-NEXT: pushl %esi +; X32-NEXT: .cfi_def_cfa_offset 12 +; X32-NEXT: .cfi_offset %esi, -12 +; X32-NEXT: .cfi_offset %edi, -8 +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: movl {{[0-9]+}}(%esp), %edx ; X32-NEXT: movl {{[0-9]+}}(%esp), %esi @@ -27,11 +33,15 @@ define i128 @test_ctselect_i128(i1 %cond, i128 %a, i128 %b) { ; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %edi ; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %ecx ; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %edx -; X32-NEXT: movl %edx, {{[0-9]+}}(%eax) -; X32-NEXT: movl %ecx, {{[0-9]+}}(%eax) -; X32-NEXT: movl %edi, {{[0-9]+}}(%eax) +; X32-NEXT: movl %edx, 12(%eax) +; X32-NEXT: movl %ecx, 8(%eax) +; X32-NEXT: movl %edi, 4(%eax) ; X32-NEXT: movl %esi, (%eax) -; X32: retl $4 +; X32-NEXT: popl %esi +; X32-NEXT: .cfi_def_cfa_offset 8 +; X32-NEXT: popl %edi +; X32-NEXT: .cfi_def_cfa_offset 4 +; X32-NEXT: retl $4 %result = call i128 @llvm.ct.select.i128(i1 %cond, i128 %a, i128 %b) ret i128 %result } @@ -43,14 +53,16 @@ define i1 @test_ctselect_i1(i1 %cond, i1 %a, i1 %b) { ; X64-NEXT: movl %edx, %eax ; X64-NEXT: testb $1, %dil ; X64-NEXT: cmovnel %esi, %eax -; X64: retq +; X64-NEXT: # kill: def $al killed $al killed $eax +; X64-NEXT: retq ; ; X32-LABEL: test_ctselect_i1: ; X32: # %bb.0: ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: testb $1, {{[0-9]+}}(%esp) ; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax -; X32: retl +; X32-NEXT: # kill: def $al killed $al killed $eax +; X32-NEXT: retl %result = call i1 @llvm.ct.select.i1(i1 %cond, i1 %a, i1 %b) ret i1 %result } @@ -60,16 +72,16 @@ define i32 @test_ctselect_extremal_values(i1 %cond) { ; X64-LABEL: test_ctselect_extremal_values: ; X64: # %bb.0: ; X64-NEXT: testb $1, %dil -; X64-NEXT: movl $2147483647, %ecx -; X64-NEXT: movl $-2147483648, %eax +; X64-NEXT: movl $2147483647, %ecx # imm = 0x7FFFFFFF +; X64-NEXT: movl $-2147483648, %eax # imm = 0x80000000 ; X64-NEXT: cmovnel %ecx, %eax ; X64-NEXT: retq ; ; X32-LABEL: test_ctselect_extremal_values: ; X32: # %bb.0: ; X32-NEXT: testb $1, {{[0-9]+}}(%esp) -; X32-NEXT: movl $2147483647, %ecx -; X32-NEXT: movl $-2147483648, %eax +; X32-NEXT: movl $2147483647, %ecx # imm = 0x7FFFFFFF +; X32-NEXT: movl $-2147483648, %eax # imm = 0x80000000 ; X32-NEXT: cmovnel %ecx, %eax ; X32-NEXT: retl %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 2147483647, i32 -2147483648) @@ -81,8 +93,8 @@ define float @test_ctselect_f32_special_values(i1 %cond) { ; X64-LABEL: test_ctselect_f32_special_values: ; X64: # %bb.0: ; X64-NEXT: testb $1, %dil -; X64-NEXT: movl $2143289344, %eax -; X64-NEXT: movl
$2139095040, %ecx +; X64-NEXT: movl $2143289344, %eax # imm = 0x7FC00000 +; X64-NEXT: movl $2139095040, %ecx # imm = 0x7F800000 ; X64-NEXT: cmovnel %eax, %ecx ; X64-NEXT: movd %ecx, %xmm0 ; X64-NEXT: retq @@ -90,8 +102,8 @@ define float @test_ctselect_f32_special_values(i1 %cond) { ; X32-LABEL: test_ctselect_f32_special_values: ; X32: # %bb.0: ; X32-NEXT: testb $1, {{[0-9]+}}(%esp) -; X32-NEXT: flds .LCPI3_0 -; X32-NEXT: flds .LCPI3_1 +; X32-NEXT: flds {{\.?LCPI[0-9]+_[0-9]+}} +; X32-NEXT: flds {{\.?LCPI[0-9]+_[0-9]+}} ; X32-NEXT: jne .LBB3_2 ; X32-NEXT: # %bb.1: ; X32-NEXT: fstp %st(1) @@ -107,8 +119,8 @@ define double @test_ctselect_f64_special_values(i1 %cond) { ; X64-LABEL: test_ctselect_f64_special_values: ; X64: # %bb.0: ; X64-NEXT: testb $1, %dil -; X64-NEXT: movabsq $9221120237041090560, %rax -; X64-NEXT: movabsq $9218868437227405312, %rcx +; X64-NEXT: movabsq $9221120237041090560, %rax # imm = 0x7FF8000000000000 +; X64-NEXT: movabsq $9218868437227405312, %rcx # imm = 0x7FF0000000000000 ; X64-NEXT: cmovneq %rax, %rcx ; X64-NEXT: movq %rcx, %xmm0 ; X64-NEXT: retq @@ -116,8 +128,8 @@ define double @test_ctselect_f64_special_values(i1 %cond) { ; X32-LABEL: test_ctselect_f64_special_values: ; X32: # %bb.0: ; X32-NEXT: testb $1, {{[0-9]+}}(%esp) -; X32-NEXT: flds .LCPI4_0 -; X32-NEXT: flds .LCPI4_1 +; X32-NEXT: flds {{\.?LCPI[0-9]+_[0-9]+}} +; X32-NEXT: flds {{\.?LCPI[0-9]+_[0-9]+}} ; X32-NEXT: jne .LBB4_2 ; X32-NEXT: # %bb.1: ; X32-NEXT: fstp %st(1) @@ -267,9 +279,9 @@ define ptr @test_ctselect_struct_ptr(i1 %cond, ptr %a, ptr %b) { define i32 @test_ctselect_deeply_nested(i1 %c1, i1 %c2, i1 %c3, i1 %c4, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e) { ; X64-LABEL: test_ctselect_deeply_nested: ; X64: # %bb.0: -; X64-NEXT: movl 24(%rsp), %eax -; X64-NEXT: movl 16(%rsp), %r10d -; X64-NEXT: movl 8(%rsp), %r11d +; X64-NEXT: movl {{[0-9]+}}(%rsp), %eax +; X64-NEXT: movl {{[0-9]+}}(%rsp), %r10d +; X64-NEXT: movl {{[0-9]+}}(%rsp), %r11d ; X64-NEXT: testb $1, %dil ; X64-NEXT: cmovnel %r8d, %r9d ; X64-NEXT: testb $1, %sil @@ -283,7 +295,9 @@ define i32 @test_ctselect_deeply_nested(i1 %c1, i1 %c2, i1 %c3, i1 %c4, i32 %a, ; X32-LABEL: test_ctselect_deeply_nested: ; X32: # %bb.0: ; X32-NEXT: pushl %esi -; X32: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: .cfi_def_cfa_offset 8 +; X32-NEXT: .cfi_offset %esi, -8 +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: movl {{[0-9]+}}(%esp), %edx ; X32-NEXT: movl {{[0-9]+}}(%esp), %esi @@ -296,7 +310,8 @@ define i32 @test_ctselect_deeply_nested(i1 %c1, i1 %c2, i1 %c3, i1 %c4, i32 %a, ; X32-NEXT: testb $1, {{[0-9]+}}(%esp) ; X32-NEXT: cmovnel %ecx, %eax ; X32-NEXT: popl %esi -; X32: retl +; X32-NEXT: .cfi_def_cfa_offset 4 +; X32-NEXT: retl %sel1 = call i32 @llvm.ct.select.i32(i1 %c1, i32 %a, i32 %b) %sel2 = call i32 @llvm.ct.select.i32(i1 %c2, i32 %sel1, i32 %c) %sel3 = call i32 @llvm.ct.select.i32(i1 %c3, i32 %sel2, i32 %d) diff --git a/llvm/test/CodeGen/X86/ctselect-optimization.ll b/llvm/test/CodeGen/X86/ctselect-optimization.ll index 4c94107665601..481d49971a937 100644 --- a/llvm/test/CodeGen/X86/ctselect-optimization.ll +++ b/llvm/test/CodeGen/X86/ctselect-optimization.ll @@ -283,7 +283,7 @@ define double @test_ctselect_f64_zero_positive(double %x) { define i32 @test_ctselect_chain(i1 %c1, i1 %c2, i1 %c3, i32 %a, i32 %b, i32 %c, i32 %d) { ; CHECK-LABEL: test_ctselect_chain: ; CHECK: # %bb.0: -; CHECK-NEXT: movl 8(%rsp), %eax +; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax ; CHECK-NEXT: testb $1, %dil ; CHECK-NEXT: cmovnel %ecx, 
%r8d ; CHECK-NEXT: testb $1, %sil @@ -301,4 +301,4 @@ define i32 @test_ctselect_chain(i1 %c1, i1 %c2, i1 %c3, i32 %a, i32 %b, i32 %c, declare i32 @llvm.ct.select.i32(i1, i32, i32) declare i64 @llvm.ct.select.i64(i1, i64, i64) declare float @llvm.ct.select.f32(i1, float, float) -declare double @llvm.ct.select.f64(i1, double, double) \ No newline at end of file +declare double @llvm.ct.select.f64(i1, double, double) diff --git a/llvm/test/CodeGen/X86/ctselect-vector.ll b/llvm/test/CodeGen/X86/ctselect-vector.ll index 46ea1a0f83991..0e53a8324e5ce 100644 --- a/llvm/test/CodeGen/X86/ctselect-vector.ll +++ b/llvm/test/CodeGen/X86/ctselect-vector.ll @@ -16,7 +16,7 @@ define <4 x i32> @test_ctselect_v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b) { ; SSE2-NEXT: negl %eax ; SSE2-NEXT: xorps %xmm3, %xmm3 ; SSE2-NEXT: movd %eax, %xmm3 -; SSE2-NEXT: pshufd $0, %xmm3, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] ; SSE2-NEXT: movdqa %xmm3, %xmm2 ; SSE2-NEXT: pand %xmm0, %xmm3 ; SSE2-NEXT: pandn %xmm1, %xmm2 @@ -33,7 +33,7 @@ define <4 x i32> @test_ctselect_v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b) { ; AVX-NEXT: negl %eax ; AVX-NEXT: xorps %xmm3, %xmm3 ; AVX-NEXT: movd %eax, %xmm3 -; AVX-NEXT: pshufd $0, %xmm3, %xmm3 +; AVX-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] ; AVX-NEXT: movdqa %xmm3, %xmm2 ; AVX-NEXT: pand %xmm0, %xmm3 ; AVX-NEXT: pandn %xmm1, %xmm2 @@ -50,14 +50,13 @@ define <4 x i32> @test_ctselect_v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b) { ; AVX2-NEXT: negl %eax ; AVX2-NEXT: xorps %xmm3, %xmm3 ; AVX2-NEXT: movd %eax, %xmm3 -; AVX2-NEXT: pshufd $0, %xmm3, %xmm3 +; AVX2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] ; AVX2-NEXT: movdqa %xmm3, %xmm2 ; AVX2-NEXT: pand %xmm0, %xmm3 ; AVX2-NEXT: pandn %xmm1, %xmm2 ; AVX2-NEXT: por %xmm3, %xmm2 ; AVX2-NEXT: vmovdqa %xmm2, %xmm0 ; AVX2-NEXT: retq -; ; AVX512-LABEL: test_ctselect_v4i32: ; AVX512: # %bb.0: ; AVX512-NEXT: testb %dil, %dil @@ -81,7 +80,7 @@ define <4 x float> @test_ctselect_v4f32(i1 %cond, <4 x float> %a, <4 x float> %b ; SSE2-NEXT: negl %eax ; SSE2-NEXT: xorps %xmm3, %xmm3 ; SSE2-NEXT: movd %eax, %xmm3 -; SSE2-NEXT: pshufd $0, %xmm3, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] ; SSE2-NEXT: movdqa %xmm3, %xmm2 ; SSE2-NEXT: pand %xmm0, %xmm3 ; SSE2-NEXT: pandn %xmm1, %xmm2 @@ -98,7 +97,7 @@ define <4 x float> @test_ctselect_v4f32(i1 %cond, <4 x float> %a, <4 x float> %b ; AVX-NEXT: negl %eax ; AVX-NEXT: xorps %xmm3, %xmm3 ; AVX-NEXT: movd %eax, %xmm3 -; AVX-NEXT: pshufd $0, %xmm3, %xmm3 +; AVX-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] ; AVX-NEXT: movdqa %xmm3, %xmm2 ; AVX-NEXT: pand %xmm0, %xmm3 ; AVX-NEXT: pandn %xmm1, %xmm2 @@ -115,14 +114,13 @@ define <4 x float> @test_ctselect_v4f32(i1 %cond, <4 x float> %a, <4 x float> %b ; AVX2-NEXT: negl %eax ; AVX2-NEXT: xorps %xmm3, %xmm3 ; AVX2-NEXT: movd %eax, %xmm3 -; AVX2-NEXT: pshufd $0, %xmm3, %xmm3 +; AVX2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] ; AVX2-NEXT: movdqa %xmm3, %xmm2 ; AVX2-NEXT: pand %xmm0, %xmm3 ; AVX2-NEXT: pandn %xmm1, %xmm2 ; AVX2-NEXT: por %xmm3, %xmm2 ; AVX2-NEXT: vmovdqa %xmm2, %xmm0 ; AVX2-NEXT: retq -; ; AVX512-LABEL: test_ctselect_v4f32: ; AVX512: # %bb.0: ; AVX512-NEXT: testb %dil, %dil @@ -146,7 +144,7 @@ define <2 x i64> @test_ctselect_v2i64(i1 %cond, <2 x i64> %a, <2 x i64> %b) { ; SSE2-NEXT: negl %eax ; SSE2-NEXT: xorps %xmm3, %xmm3 ; SSE2-NEXT: movd %eax, %xmm3 -; SSE2-NEXT: pshufd $0, %xmm3, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] ; SSE2-NEXT: movdqa %xmm3, %xmm2 ; SSE2-NEXT: pand %xmm0, %xmm3 ; SSE2-NEXT: pandn %xmm1, 
%xmm2 @@ -163,7 +161,7 @@ define <2 x i64> @test_ctselect_v2i64(i1 %cond, <2 x i64> %a, <2 x i64> %b) { ; AVX-NEXT: negl %eax ; AVX-NEXT: xorps %xmm3, %xmm3 ; AVX-NEXT: movd %eax, %xmm3 -; AVX-NEXT: pshufd $0, %xmm3, %xmm3 +; AVX-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] ; AVX-NEXT: movdqa %xmm3, %xmm2 ; AVX-NEXT: pand %xmm0, %xmm3 ; AVX-NEXT: pandn %xmm1, %xmm2 @@ -180,14 +178,13 @@ define <2 x i64> @test_ctselect_v2i64(i1 %cond, <2 x i64> %a, <2 x i64> %b) { ; AVX2-NEXT: negl %eax ; AVX2-NEXT: xorps %xmm3, %xmm3 ; AVX2-NEXT: movd %eax, %xmm3 -; AVX2-NEXT: pshufd $0, %xmm3, %xmm3 +; AVX2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] ; AVX2-NEXT: movdqa %xmm3, %xmm2 ; AVX2-NEXT: pand %xmm0, %xmm3 ; AVX2-NEXT: pandn %xmm1, %xmm2 ; AVX2-NEXT: por %xmm3, %xmm2 ; AVX2-NEXT: vmovdqa %xmm2, %xmm0 ; AVX2-NEXT: retq -; ; AVX512-LABEL: test_ctselect_v2i64: ; AVX512: # %bb.0: ; AVX512-NEXT: testb %dil, %dil @@ -211,7 +208,7 @@ define <2 x double> @test_ctselect_v2f64(i1 %cond, <2 x double> %a, <2 x double> ; SSE2-NEXT: negl %eax ; SSE2-NEXT: xorps %xmm3, %xmm3 ; SSE2-NEXT: movd %eax, %xmm3 -; SSE2-NEXT: pshufd $0, %xmm3, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] ; SSE2-NEXT: movdqa %xmm3, %xmm2 ; SSE2-NEXT: pand %xmm0, %xmm3 ; SSE2-NEXT: pandn %xmm1, %xmm2 @@ -228,7 +225,7 @@ define <2 x double> @test_ctselect_v2f64(i1 %cond, <2 x double> %a, <2 x double> ; AVX-NEXT: negl %eax ; AVX-NEXT: xorps %xmm3, %xmm3 ; AVX-NEXT: movd %eax, %xmm3 -; AVX-NEXT: pshufd $0, %xmm3, %xmm3 +; AVX-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] ; AVX-NEXT: movdqa %xmm3, %xmm2 ; AVX-NEXT: pand %xmm0, %xmm3 ; AVX-NEXT: pandn %xmm1, %xmm2 @@ -245,14 +242,13 @@ define <2 x double> @test_ctselect_v2f64(i1 %cond, <2 x double> %a, <2 x double> ; AVX2-NEXT: negl %eax ; AVX2-NEXT: xorps %xmm3, %xmm3 ; AVX2-NEXT: movd %eax, %xmm3 -; AVX2-NEXT: pshufd $0, %xmm3, %xmm3 +; AVX2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] ; AVX2-NEXT: movdqa %xmm3, %xmm2 ; AVX2-NEXT: pand %xmm0, %xmm3 ; AVX2-NEXT: pandn %xmm1, %xmm2 ; AVX2-NEXT: por %xmm3, %xmm2 ; AVX2-NEXT: vmovdqa %xmm2, %xmm0 ; AVX2-NEXT: retq -; ; AVX512-LABEL: test_ctselect_v2f64: ; AVX512: # %bb.0: ; AVX512-NEXT: testb %dil, %dil @@ -277,7 +273,7 @@ define <8 x i32> @test_ctselect_v8i32(i1 %cond, <8 x i32> %a, <8 x i32> %b) { ; SSE2-NEXT: negl %eax ; SSE2-NEXT: xorps %xmm5, %xmm5 ; SSE2-NEXT: movd %eax, %xmm5 -; SSE2-NEXT: pshufd $0, %xmm5, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,0,0] ; SSE2-NEXT: movdqa %xmm5, %xmm4 ; SSE2-NEXT: pand %xmm0, %xmm5 ; SSE2-NEXT: pandn %xmm2, %xmm4 @@ -288,7 +284,7 @@ define <8 x i32> @test_ctselect_v8i32(i1 %cond, <8 x i32> %a, <8 x i32> %b) { ; SSE2-NEXT: negl %eax ; SSE2-NEXT: pxor %xmm0, %xmm0 ; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: pshufd $0, %xmm0, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: pand %xmm1, %xmm0 ; SSE2-NEXT: pandn %xmm3, %xmm2 @@ -306,7 +302,7 @@ define <8 x i32> @test_ctselect_v8i32(i1 %cond, <8 x i32> %a, <8 x i32> %b) { ; AVX-NEXT: negl %eax ; AVX-NEXT: xorps %ymm3, %ymm3 ; AVX-NEXT: vmovd %eax, %ymm3 -; AVX-NEXT: vshufps $0, %ymm3, %ymm3, %ymm3 +; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,0,0,0,4,4,4,4] ; AVX-NEXT: vmovaps %ymm3, %ymm2 ; AVX-NEXT: andps %ymm0, %ymm3 ; AVX-NEXT: andnps %ymm1, %ymm2 @@ -323,14 +319,13 @@ define <8 x i32> @test_ctselect_v8i32(i1 %cond, <8 x i32> %a, <8 x i32> %b) { ; AVX2-NEXT: negl %eax ; AVX2-NEXT: xorps %ymm3, %ymm3 ; AVX2-NEXT: vmovd %eax, %ymm3 -; AVX2-NEXT: vpshufd $0, %ymm3, %ymm3 +; AVX2-NEXT: vpshufd 
{{.*#+}} ymm3 = ymm3[0,0,0,0,4,4,4,4] ; AVX2-NEXT: vmovdqa %ymm3, %ymm2 ; AVX2-NEXT: pand %ymm0, %ymm3 ; AVX2-NEXT: pandn %ymm1, %ymm2 ; AVX2-NEXT: por %ymm3, %ymm2 ; AVX2-NEXT: vmovdqa %ymm2, %ymm0 ; AVX2-NEXT: retq -; ; AVX512-LABEL: test_ctselect_v8i32: ; AVX512: # %bb.0: ; AVX512-NEXT: testb %dil, %dil @@ -354,7 +349,7 @@ define <8 x float> @test_ctselect_v8f32(i1 %cond, <8 x float> %a, <8 x float> %b ; SSE2-NEXT: negl %eax ; SSE2-NEXT: xorps %xmm5, %xmm5 ; SSE2-NEXT: movd %eax, %xmm5 -; SSE2-NEXT: pshufd $0, %xmm5, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,0,0] ; SSE2-NEXT: movdqa %xmm5, %xmm4 ; SSE2-NEXT: pand %xmm0, %xmm5 ; SSE2-NEXT: pandn %xmm2, %xmm4 @@ -365,7 +360,7 @@ define <8 x float> @test_ctselect_v8f32(i1 %cond, <8 x float> %a, <8 x float> %b ; SSE2-NEXT: negl %eax ; SSE2-NEXT: pxor %xmm0, %xmm0 ; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: pshufd $0, %xmm0, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: pand %xmm1, %xmm0 ; SSE2-NEXT: pandn %xmm3, %xmm2 @@ -383,7 +378,7 @@ define <8 x float> @test_ctselect_v8f32(i1 %cond, <8 x float> %a, <8 x float> %b ; AVX-NEXT: negl %eax ; AVX-NEXT: xorps %ymm3, %ymm3 ; AVX-NEXT: vmovd %eax, %ymm3 -; AVX-NEXT: vshufps $0, %ymm3, %ymm3, %ymm3 +; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,0,0,0,4,4,4,4] ; AVX-NEXT: vmovaps %ymm3, %ymm2 ; AVX-NEXT: andps %ymm0, %ymm3 ; AVX-NEXT: andnps %ymm1, %ymm2 @@ -400,14 +395,13 @@ define <8 x float> @test_ctselect_v8f32(i1 %cond, <8 x float> %a, <8 x float> %b ; AVX2-NEXT: negl %eax ; AVX2-NEXT: xorps %ymm3, %ymm3 ; AVX2-NEXT: vmovd %eax, %ymm3 -; AVX2-NEXT: vpshufd $0, %ymm3, %ymm3 +; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,0,0,4,4,4,4] ; AVX2-NEXT: vmovdqa %ymm3, %ymm2 ; AVX2-NEXT: pand %ymm0, %ymm3 ; AVX2-NEXT: pandn %ymm1, %ymm2 ; AVX2-NEXT: por %ymm3, %ymm2 ; AVX2-NEXT: vmovdqa %ymm2, %ymm0 ; AVX2-NEXT: retq -; ; AVX512-LABEL: test_ctselect_v8f32: ; AVX512: # %bb.0: ; AVX512-NEXT: testb %dil, %dil @@ -431,7 +425,7 @@ define <4 x i64> @test_ctselect_v4i64(i1 %cond, <4 x i64> %a, <4 x i64> %b) { ; SSE2-NEXT: negl %eax ; SSE2-NEXT: xorps %xmm5, %xmm5 ; SSE2-NEXT: movd %eax, %xmm5 -; SSE2-NEXT: pshufd $0, %xmm5, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,0,0] ; SSE2-NEXT: movdqa %xmm5, %xmm4 ; SSE2-NEXT: pand %xmm0, %xmm5 ; SSE2-NEXT: pandn %xmm2, %xmm4 @@ -442,7 +436,7 @@ define <4 x i64> @test_ctselect_v4i64(i1 %cond, <4 x i64> %a, <4 x i64> %b) { ; SSE2-NEXT: negl %eax ; SSE2-NEXT: pxor %xmm0, %xmm0 ; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: pshufd $0, %xmm0, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: pand %xmm1, %xmm0 ; SSE2-NEXT: pandn %xmm3, %xmm2 @@ -460,7 +454,7 @@ define <4 x i64> @test_ctselect_v4i64(i1 %cond, <4 x i64> %a, <4 x i64> %b) { ; AVX-NEXT: negl %eax ; AVX-NEXT: xorps %ymm3, %ymm3 ; AVX-NEXT: vmovd %eax, %ymm3 -; AVX-NEXT: vshufpd $0, %ymm3, %ymm3, %ymm3 +; AVX-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0,0,2,2] ; AVX-NEXT: vmovapd %ymm3, %ymm2 ; AVX-NEXT: andpd %ymm0, %ymm3 ; AVX-NEXT: andnpd %ymm1, %ymm2 @@ -477,14 +471,13 @@ define <4 x i64> @test_ctselect_v4i64(i1 %cond, <4 x i64> %a, <4 x i64> %b) { ; AVX2-NEXT: negl %eax ; AVX2-NEXT: xorps %ymm3, %ymm3 ; AVX2-NEXT: vmovd %eax, %ymm3 -; AVX2-NEXT: vshufpd $0, %ymm3, %ymm3, %ymm3 +; AVX2-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0,0,2,2] ; AVX2-NEXT: vmovapd %ymm3, %ymm2 ; AVX2-NEXT: andpd %ymm0, %ymm3 ; AVX2-NEXT: andnpd %ymm1, %ymm2 ; AVX2-NEXT: orpd %ymm3, %ymm2 ; AVX2-NEXT: vmovapd %ymm2, %ymm0 ; 
AVX2-NEXT: retq -; ; AVX512-LABEL: test_ctselect_v4i64: ; AVX512: # %bb.0: ; AVX512-NEXT: testb %dil, %dil @@ -508,7 +501,7 @@ define <4 x double> @test_ctselect_v4f64(i1 %cond, <4 x double> %a, <4 x double> ; SSE2-NEXT: negl %eax ; SSE2-NEXT: xorps %xmm5, %xmm5 ; SSE2-NEXT: movd %eax, %xmm5 -; SSE2-NEXT: pshufd $0, %xmm5, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,0,0] ; SSE2-NEXT: movdqa %xmm5, %xmm4 ; SSE2-NEXT: pand %xmm0, %xmm5 ; SSE2-NEXT: pandn %xmm2, %xmm4 @@ -519,7 +512,7 @@ define <4 x double> @test_ctselect_v4f64(i1 %cond, <4 x double> %a, <4 x double> ; SSE2-NEXT: negl %eax ; SSE2-NEXT: pxor %xmm0, %xmm0 ; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: pshufd $0, %xmm0, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: pand %xmm1, %xmm0 ; SSE2-NEXT: pandn %xmm3, %xmm2 @@ -537,7 +530,7 @@ define <4 x double> @test_ctselect_v4f64(i1 %cond, <4 x double> %a, <4 x double> ; AVX-NEXT: negl %eax ; AVX-NEXT: xorps %ymm3, %ymm3 ; AVX-NEXT: vmovd %eax, %ymm3 -; AVX-NEXT: vshufpd $0, %ymm3, %ymm3, %ymm3 +; AVX-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0,0,2,2] ; AVX-NEXT: vmovapd %ymm3, %ymm2 ; AVX-NEXT: andpd %ymm0, %ymm3 ; AVX-NEXT: andnpd %ymm1, %ymm2 @@ -554,14 +547,13 @@ define <4 x double> @test_ctselect_v4f64(i1 %cond, <4 x double> %a, <4 x double> ; AVX2-NEXT: negl %eax ; AVX2-NEXT: xorps %ymm3, %ymm3 ; AVX2-NEXT: vmovd %eax, %ymm3 -; AVX2-NEXT: vshufpd $0, %ymm3, %ymm3, %ymm3 +; AVX2-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0,0,2,2] ; AVX2-NEXT: vmovapd %ymm3, %ymm2 ; AVX2-NEXT: andpd %ymm0, %ymm3 ; AVX2-NEXT: andnpd %ymm1, %ymm2 ; AVX2-NEXT: orpd %ymm3, %ymm2 ; AVX2-NEXT: vmovapd %ymm2, %ymm0 ; AVX2-NEXT: retq -; ; AVX512-LABEL: test_ctselect_v4f64: ; AVX512: # %bb.0: ; AVX512-NEXT: testb %dil, %dil @@ -586,7 +578,7 @@ define <16 x i32> @test_ctselect_v16i32(i1 %cond, <16 x i32> %a, <16 x i32> %b) ; SSE2-NEXT: negl %eax ; SSE2-NEXT: xorps %xmm9, %xmm9 ; SSE2-NEXT: movd %eax, %xmm9 -; SSE2-NEXT: pshufd $0, %xmm9, %xmm9 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,0,0,0] ; SSE2-NEXT: movdqa %xmm9, %xmm8 ; SSE2-NEXT: pand %xmm0, %xmm9 ; SSE2-NEXT: pandn %xmm4, %xmm8 @@ -597,7 +589,7 @@ define <16 x i32> @test_ctselect_v16i32(i1 %cond, <16 x i32> %a, <16 x i32> %b) ; SSE2-NEXT: negl %eax ; SSE2-NEXT: pxor %xmm0, %xmm0 ; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: pshufd $0, %xmm0, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE2-NEXT: movdqa %xmm0, %xmm4 ; SSE2-NEXT: pand %xmm1, %xmm0 ; SSE2-NEXT: pandn %xmm5, %xmm4 @@ -608,7 +600,7 @@ define <16 x i32> @test_ctselect_v16i32(i1 %cond, <16 x i32> %a, <16 x i32> %b) ; SSE2-NEXT: negl %eax ; SSE2-NEXT: pxor %xmm0, %xmm0 ; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: pshufd $0, %xmm0, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE2-NEXT: movdqa %xmm0, %xmm5 ; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: pandn %xmm6, %xmm5 @@ -619,7 +611,7 @@ define <16 x i32> @test_ctselect_v16i32(i1 %cond, <16 x i32> %a, <16 x i32> %b) ; SSE2-NEXT: negl %eax ; SSE2-NEXT: pxor %xmm0, %xmm0 ; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: pshufd $0, %xmm0, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE2-NEXT: movdqa %xmm0, %xmm6 ; SSE2-NEXT: pand %xmm3, %xmm0 ; SSE2-NEXT: pandn %xmm7, %xmm6 @@ -639,7 +631,7 @@ define <16 x i32> @test_ctselect_v16i32(i1 %cond, <16 x i32> %a, <16 x i32> %b) ; AVX-NEXT: negl %eax ; AVX-NEXT: xorps %ymm5, %ymm5 ; AVX-NEXT: vmovd %eax, %ymm5 -; AVX-NEXT: vshufps $0, %ymm5, %ymm5, %ymm5 +; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,0,0,0,4,4,4,4] ; 
AVX-NEXT: vmovaps %ymm5, %ymm4 ; AVX-NEXT: andps %ymm0, %ymm5 ; AVX-NEXT: andnps %ymm2, %ymm4 @@ -650,7 +642,7 @@ define <16 x i32> @test_ctselect_v16i32(i1 %cond, <16 x i32> %a, <16 x i32> %b) ; AVX-NEXT: negl %eax ; AVX-NEXT: xorps %ymm0, %ymm0 ; AVX-NEXT: vmovd %eax, %ymm0 -; AVX-NEXT: vshufps $0, %ymm0, %ymm0, %ymm0 +; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] ; AVX-NEXT: vmovaps %ymm0, %ymm2 ; AVX-NEXT: andps %ymm1, %ymm0 ; AVX-NEXT: andnps %ymm3, %ymm2 @@ -668,7 +660,7 @@ define <16 x i32> @test_ctselect_v16i32(i1 %cond, <16 x i32> %a, <16 x i32> %b) ; AVX2-NEXT: negl %eax ; AVX2-NEXT: xorps %ymm5, %ymm5 ; AVX2-NEXT: vmovd %eax, %ymm5 -; AVX2-NEXT: vpshufd $0, %ymm5, %ymm5 +; AVX2-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,0,0,0,4,4,4,4] ; AVX2-NEXT: vmovdqa %ymm5, %ymm4 ; AVX2-NEXT: pand %ymm0, %ymm5 ; AVX2-NEXT: pandn %ymm2, %ymm4 @@ -679,7 +671,7 @@ define <16 x i32> @test_ctselect_v16i32(i1 %cond, <16 x i32> %a, <16 x i32> %b) ; AVX2-NEXT: negl %eax ; AVX2-NEXT: pxor %ymm0, %ymm0 ; AVX2-NEXT: vmovd %eax, %ymm0 -; AVX2-NEXT: vpshufd $0, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] ; AVX2-NEXT: vmovdqa %ymm0, %ymm2 ; AVX2-NEXT: pand %ymm1, %ymm0 ; AVX2-NEXT: pandn %ymm3, %ymm2 @@ -687,7 +679,6 @@ define <16 x i32> @test_ctselect_v16i32(i1 %cond, <16 x i32> %a, <16 x i32> %b) ; AVX2-NEXT: vmovdqa %ymm4, %ymm0 ; AVX2-NEXT: vmovdqa %ymm2, %ymm1 ; AVX2-NEXT: retq -; ; AVX512-LABEL: test_ctselect_v16i32: ; AVX512: # %bb.0: ; AVX512-NEXT: testb %dil, %dil @@ -711,7 +702,7 @@ define <16 x float> @test_ctselect_v16f32(i1 %cond, <16 x float> %a, <16 x float ; SSE2-NEXT: negl %eax ; SSE2-NEXT: xorps %xmm9, %xmm9 ; SSE2-NEXT: movd %eax, %xmm9 -; SSE2-NEXT: pshufd $0, %xmm9, %xmm9 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,0,0,0] ; SSE2-NEXT: movdqa %xmm9, %xmm8 ; SSE2-NEXT: pand %xmm0, %xmm9 ; SSE2-NEXT: pandn %xmm4, %xmm8 @@ -722,7 +713,7 @@ define <16 x float> @test_ctselect_v16f32(i1 %cond, <16 x float> %a, <16 x float ; SSE2-NEXT: negl %eax ; SSE2-NEXT: pxor %xmm0, %xmm0 ; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: pshufd $0, %xmm0, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE2-NEXT: movdqa %xmm0, %xmm4 ; SSE2-NEXT: pand %xmm1, %xmm0 ; SSE2-NEXT: pandn %xmm5, %xmm4 @@ -733,7 +724,7 @@ define <16 x float> @test_ctselect_v16f32(i1 %cond, <16 x float> %a, <16 x float ; SSE2-NEXT: negl %eax ; SSE2-NEXT: pxor %xmm0, %xmm0 ; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: pshufd $0, %xmm0, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE2-NEXT: movdqa %xmm0, %xmm5 ; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: pandn %xmm6, %xmm5 @@ -744,7 +735,7 @@ define <16 x float> @test_ctselect_v16f32(i1 %cond, <16 x float> %a, <16 x float ; SSE2-NEXT: negl %eax ; SSE2-NEXT: pxor %xmm0, %xmm0 ; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: pshufd $0, %xmm0, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE2-NEXT: movdqa %xmm0, %xmm6 ; SSE2-NEXT: pand %xmm3, %xmm0 ; SSE2-NEXT: pandn %xmm7, %xmm6 @@ -764,7 +755,7 @@ define <16 x float> @test_ctselect_v16f32(i1 %cond, <16 x float> %a, <16 x float ; AVX-NEXT: negl %eax ; AVX-NEXT: xorps %ymm5, %ymm5 ; AVX-NEXT: vmovd %eax, %ymm5 -; AVX-NEXT: vshufps $0, %ymm5, %ymm5, %ymm5 +; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,0,0,0,4,4,4,4] ; AVX-NEXT: vmovaps %ymm5, %ymm4 ; AVX-NEXT: andps %ymm0, %ymm5 ; AVX-NEXT: andnps %ymm2, %ymm4 @@ -775,7 +766,7 @@ define <16 x float> @test_ctselect_v16f32(i1 %cond, <16 x float> %a, <16 x float ; AVX-NEXT: negl %eax ; AVX-NEXT: xorps %ymm0, %ymm0 ; AVX-NEXT: vmovd 
%eax, %ymm0 -; AVX-NEXT: vshufps $0, %ymm0, %ymm0, %ymm0 +; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] ; AVX-NEXT: vmovaps %ymm0, %ymm2 ; AVX-NEXT: andps %ymm1, %ymm0 ; AVX-NEXT: andnps %ymm3, %ymm2 @@ -793,7 +784,7 @@ define <16 x float> @test_ctselect_v16f32(i1 %cond, <16 x float> %a, <16 x float ; AVX2-NEXT: negl %eax ; AVX2-NEXT: xorps %ymm5, %ymm5 ; AVX2-NEXT: vmovd %eax, %ymm5 -; AVX2-NEXT: vpshufd $0, %ymm5, %ymm5 # ymm5 = ymm5[0,0,0,0,4,4,4,4] +; AVX2-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,0,0,0,4,4,4,4] ; AVX2-NEXT: vmovdqa %ymm5, %ymm4 ; AVX2-NEXT: pand %ymm0, %ymm5 ; AVX2-NEXT: pandn %ymm2, %ymm4 @@ -804,7 +795,7 @@ define <16 x float> @test_ctselect_v16f32(i1 %cond, <16 x float> %a, <16 x float ; AVX2-NEXT: negl %eax ; AVX2-NEXT: pxor %ymm0, %ymm0 ; AVX2-NEXT: vmovd %eax, %ymm0 -; AVX2-NEXT: vpshufd $0, %ymm0, %ymm0 # ymm0 = ymm0[0,0,0,0,4,4,4,4] +; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] ; AVX2-NEXT: vmovdqa %ymm0, %ymm2 ; AVX2-NEXT: pand %ymm1, %ymm0 ; AVX2-NEXT: pandn %ymm3, %ymm2 @@ -812,7 +803,6 @@ define <16 x float> @test_ctselect_v16f32(i1 %cond, <16 x float> %a, <16 x float ; AVX2-NEXT: vmovdqa %ymm4, %ymm0 ; AVX2-NEXT: vmovdqa %ymm2, %ymm1 ; AVX2-NEXT: retq -; ; AVX512-LABEL: test_ctselect_v16f32: ; AVX512: # %bb.0: ; AVX512-NEXT: testb %dil, %dil @@ -836,7 +826,7 @@ define <8 x i64> @test_ctselect_v8i64(i1 %cond, <8 x i64> %a, <8 x i64> %b) { ; SSE2-NEXT: negl %eax ; SSE2-NEXT: xorps %xmm9, %xmm9 ; SSE2-NEXT: movd %eax, %xmm9 -; SSE2-NEXT: pshufd $0, %xmm9, %xmm9 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,0,0,0] ; SSE2-NEXT: movdqa %xmm9, %xmm8 ; SSE2-NEXT: pand %xmm0, %xmm9 ; SSE2-NEXT: pandn %xmm4, %xmm8 @@ -847,7 +837,7 @@ define <8 x i64> @test_ctselect_v8i64(i1 %cond, <8 x i64> %a, <8 x i64> %b) { ; SSE2-NEXT: negl %eax ; SSE2-NEXT: pxor %xmm0, %xmm0 ; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: pshufd $0, %xmm0, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE2-NEXT: movdqa %xmm0, %xmm4 ; SSE2-NEXT: pand %xmm1, %xmm0 ; SSE2-NEXT: pandn %xmm5, %xmm4 @@ -858,7 +848,7 @@ define <8 x i64> @test_ctselect_v8i64(i1 %cond, <8 x i64> %a, <8 x i64> %b) { ; SSE2-NEXT: negl %eax ; SSE2-NEXT: pxor %xmm0, %xmm0 ; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: pshufd $0, %xmm0, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE2-NEXT: movdqa %xmm0, %xmm5 ; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: pandn %xmm6, %xmm5 @@ -869,7 +859,7 @@ define <8 x i64> @test_ctselect_v8i64(i1 %cond, <8 x i64> %a, <8 x i64> %b) { ; SSE2-NEXT: negl %eax ; SSE2-NEXT: pxor %xmm0, %xmm0 ; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: pshufd $0, %xmm0, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE2-NEXT: movdqa %xmm0, %xmm6 ; SSE2-NEXT: pand %xmm3, %xmm0 ; SSE2-NEXT: pandn %xmm7, %xmm6 @@ -889,7 +879,7 @@ define <8 x i64> @test_ctselect_v8i64(i1 %cond, <8 x i64> %a, <8 x i64> %b) { ; AVX-NEXT: negl %eax ; AVX-NEXT: xorps %ymm5, %ymm5 ; AVX-NEXT: vmovd %eax, %ymm5 -; AVX-NEXT: vshufpd $0, %ymm5, %ymm5, %ymm5 +; AVX-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[0,0,2,2] ; AVX-NEXT: vmovapd %ymm5, %ymm4 ; AVX-NEXT: andpd %ymm0, %ymm5 ; AVX-NEXT: andnpd %ymm2, %ymm4 @@ -900,7 +890,7 @@ define <8 x i64> @test_ctselect_v8i64(i1 %cond, <8 x i64> %a, <8 x i64> %b) { ; AVX-NEXT: negl %eax ; AVX-NEXT: xorpd %ymm0, %ymm0 ; AVX-NEXT: vmovd %eax, %ymm0 -; AVX-NEXT: vshufpd $0, %ymm0, %ymm0, %ymm0 +; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0,0,2,2] ; AVX-NEXT: vmovapd %ymm0, %ymm2 ; AVX-NEXT: andpd %ymm1, %ymm0 ; AVX-NEXT: andnpd %ymm3, %ymm2 @@ -918,7 +908,7 @@ 
define <8 x i64> @test_ctselect_v8i64(i1 %cond, <8 x i64> %a, <8 x i64> %b) { ; AVX2-NEXT: negl %eax ; AVX2-NEXT: xorps %ymm5, %ymm5 ; AVX2-NEXT: vmovd %eax, %ymm5 -; AVX2-NEXT: vshufpd $0, %ymm5, %ymm5, %ymm5 # ymm5 = ymm5[0,0,2,2] +; AVX2-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[0,0,2,2] ; AVX2-NEXT: vmovapd %ymm5, %ymm4 ; AVX2-NEXT: andpd %ymm0, %ymm5 ; AVX2-NEXT: andnpd %ymm2, %ymm4 @@ -929,7 +919,7 @@ define <8 x i64> @test_ctselect_v8i64(i1 %cond, <8 x i64> %a, <8 x i64> %b) { ; AVX2-NEXT: negl %eax ; AVX2-NEXT: xorpd %ymm0, %ymm0 ; AVX2-NEXT: vmovd %eax, %ymm0 -; AVX2-NEXT: vshufpd $0, %ymm0, %ymm0, %ymm0 # ymm0 = ymm0[0,0,2,2] +; AVX2-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0,0,2,2] ; AVX2-NEXT: vmovapd %ymm0, %ymm2 ; AVX2-NEXT: andpd %ymm1, %ymm0 ; AVX2-NEXT: andnpd %ymm3, %ymm2 @@ -937,7 +927,6 @@ define <8 x i64> @test_ctselect_v8i64(i1 %cond, <8 x i64> %a, <8 x i64> %b) { ; AVX2-NEXT: vmovapd %ymm4, %ymm0 ; AVX2-NEXT: vmovapd %ymm2, %ymm1 ; AVX2-NEXT: retq -; ; AVX512-LABEL: test_ctselect_v8i64: ; AVX512: # %bb.0: ; AVX512-NEXT: testb %dil, %dil @@ -961,7 +950,7 @@ define <8 x double> @test_ctselect_v8f64(i1 %cond, <8 x double> %a, <8 x double> ; SSE2-NEXT: negl %eax ; SSE2-NEXT: xorps %xmm9, %xmm9 ; SSE2-NEXT: movd %eax, %xmm9 -; SSE2-NEXT: pshufd $0, %xmm9, %xmm9 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,0,0,0] ; SSE2-NEXT: movdqa %xmm9, %xmm8 ; SSE2-NEXT: pand %xmm0, %xmm9 ; SSE2-NEXT: pandn %xmm4, %xmm8 @@ -972,7 +961,7 @@ define <8 x double> @test_ctselect_v8f64(i1 %cond, <8 x double> %a, <8 x double> ; SSE2-NEXT: negl %eax ; SSE2-NEXT: pxor %xmm0, %xmm0 ; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: pshufd $0, %xmm0, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE2-NEXT: movdqa %xmm0, %xmm4 ; SSE2-NEXT: pand %xmm1, %xmm0 ; SSE2-NEXT: pandn %xmm5, %xmm4 @@ -983,7 +972,7 @@ define <8 x double> @test_ctselect_v8f64(i1 %cond, <8 x double> %a, <8 x double> ; SSE2-NEXT: negl %eax ; SSE2-NEXT: pxor %xmm0, %xmm0 ; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: pshufd $0, %xmm0, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE2-NEXT: movdqa %xmm0, %xmm5 ; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: pandn %xmm6, %xmm5 @@ -994,7 +983,7 @@ define <8 x double> @test_ctselect_v8f64(i1 %cond, <8 x double> %a, <8 x double> ; SSE2-NEXT: negl %eax ; SSE2-NEXT: pxor %xmm0, %xmm0 ; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: pshufd $0, %xmm0, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE2-NEXT: movdqa %xmm0, %xmm6 ; SSE2-NEXT: pand %xmm3, %xmm0 ; SSE2-NEXT: pandn %xmm7, %xmm6 @@ -1014,7 +1003,7 @@ define <8 x double> @test_ctselect_v8f64(i1 %cond, <8 x double> %a, <8 x double> ; AVX-NEXT: negl %eax ; AVX-NEXT: xorps %ymm5, %ymm5 ; AVX-NEXT: vmovd %eax, %ymm5 -; AVX-NEXT: vshufpd $0, %ymm5, %ymm5, %ymm5 +; AVX-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[0,0,2,2] ; AVX-NEXT: vmovapd %ymm5, %ymm4 ; AVX-NEXT: andpd %ymm0, %ymm5 ; AVX-NEXT: andnpd %ymm2, %ymm4 @@ -1025,7 +1014,7 @@ define <8 x double> @test_ctselect_v8f64(i1 %cond, <8 x double> %a, <8 x double> ; AVX-NEXT: negl %eax ; AVX-NEXT: xorpd %ymm0, %ymm0 ; AVX-NEXT: vmovd %eax, %ymm0 -; AVX-NEXT: vshufpd $0, %ymm0, %ymm0, %ymm0 +; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0,0,2,2] ; AVX-NEXT: vmovapd %ymm0, %ymm2 ; AVX-NEXT: andpd %ymm1, %ymm0 ; AVX-NEXT: andnpd %ymm3, %ymm2 @@ -1043,7 +1032,7 @@ define <8 x double> @test_ctselect_v8f64(i1 %cond, <8 x double> %a, <8 x double> ; AVX2-NEXT: negl %eax ; AVX2-NEXT: xorps %ymm5, %ymm5 ; AVX2-NEXT: vmovd %eax, %ymm5 -; AVX2-NEXT: vshufpd $0, %ymm5, %ymm5, %ymm5 # ymm5 = 
ymm5[0,0,2,2] +; AVX2-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[0,0,2,2] ; AVX2-NEXT: vmovapd %ymm5, %ymm4 ; AVX2-NEXT: andpd %ymm0, %ymm5 ; AVX2-NEXT: andnpd %ymm2, %ymm4 @@ -1054,7 +1043,7 @@ define <8 x double> @test_ctselect_v8f64(i1 %cond, <8 x double> %a, <8 x double> ; AVX2-NEXT: negl %eax ; AVX2-NEXT: xorpd %ymm0, %ymm0 ; AVX2-NEXT: vmovd %eax, %ymm0 -; AVX2-NEXT: vshufpd $0, %ymm0, %ymm0, %ymm0 # ymm0 = ymm0[0,0,2,2] +; AVX2-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0,0,2,2] ; AVX2-NEXT: vmovapd %ymm0, %ymm2 ; AVX2-NEXT: andpd %ymm1, %ymm0 ; AVX2-NEXT: andnpd %ymm3, %ymm2 @@ -1062,7 +1051,6 @@ define <8 x double> @test_ctselect_v8f64(i1 %cond, <8 x double> %a, <8 x double> ; AVX2-NEXT: vmovapd %ymm4, %ymm0 ; AVX2-NEXT: vmovapd %ymm2, %ymm1 ; AVX2-NEXT: retq -; ; AVX512-LABEL: test_ctselect_v8f64: ; AVX512: # %bb.0: ; AVX512-NEXT: testb %dil, %dil @@ -1088,7 +1076,7 @@ define <4 x i32> @test_ctselect_v4i32_const_true(<4 x i32> %a, <4 x i32> %b) { ; SSE2-NEXT: negl %eax ; SSE2-NEXT: xorps %xmm3, %xmm3 ; SSE2-NEXT: movd %eax, %xmm3 -; SSE2-NEXT: pshufd $0, %xmm3, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] ; SSE2-NEXT: movdqa %xmm3, %xmm2 ; SSE2-NEXT: pand %xmm0, %xmm3 ; SSE2-NEXT: pandn %xmm1, %xmm2 @@ -1106,7 +1094,7 @@ define <4 x i32> @test_ctselect_v4i32_const_true(<4 x i32> %a, <4 x i32> %b) { ; AVX-NEXT: negl %eax ; AVX-NEXT: xorps %xmm3, %xmm3 ; AVX-NEXT: movd %eax, %xmm3 -; AVX-NEXT: pshufd $0, %xmm3, %xmm3 +; AVX-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] ; AVX-NEXT: movdqa %xmm3, %xmm2 ; AVX-NEXT: pand %xmm0, %xmm3 ; AVX-NEXT: pandn %xmm1, %xmm2 @@ -1124,14 +1112,13 @@ define <4 x i32> @test_ctselect_v4i32_const_true(<4 x i32> %a, <4 x i32> %b) { ; AVX2-NEXT: negl %eax ; AVX2-NEXT: xorps %xmm3, %xmm3 ; AVX2-NEXT: movd %eax, %xmm3 -; AVX2-NEXT: pshufd $0, %xmm3, %xmm3 # xmm3 = xmm3[0,0,0,0] +; AVX2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] ; AVX2-NEXT: movdqa %xmm3, %xmm2 ; AVX2-NEXT: pand %xmm0, %xmm3 ; AVX2-NEXT: pandn %xmm1, %xmm2 ; AVX2-NEXT: por %xmm3, %xmm2 ; AVX2-NEXT: vmovdqa %xmm2, %xmm0 ; AVX2-NEXT: retq -; ; AVX512-LABEL: test_ctselect_v4i32_const_true: ; AVX512: # %bb.0: ; AVX512-NEXT: retq @@ -1150,7 +1137,7 @@ define <4 x i32> @test_ctselect_v4i32_const_false(<4 x i32> %a, <4 x i32> %b) { ; SSE2-NEXT: negl %eax ; SSE2-NEXT: xorps %xmm3, %xmm3 ; SSE2-NEXT: movd %eax, %xmm3 -; SSE2-NEXT: pshufd $0, %xmm3, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] ; SSE2-NEXT: movdqa %xmm3, %xmm2 ; SSE2-NEXT: pand %xmm0, %xmm3 ; SSE2-NEXT: pandn %xmm1, %xmm2 @@ -1168,7 +1155,7 @@ define <4 x i32> @test_ctselect_v4i32_const_false(<4 x i32> %a, <4 x i32> %b) { ; AVX-NEXT: negl %eax ; AVX-NEXT: xorps %xmm3, %xmm3 ; AVX-NEXT: movd %eax, %xmm3 -; AVX-NEXT: pshufd $0, %xmm3, %xmm3 +; AVX-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] ; AVX-NEXT: movdqa %xmm3, %xmm2 ; AVX-NEXT: pand %xmm0, %xmm3 ; AVX-NEXT: pandn %xmm1, %xmm2 @@ -1186,14 +1173,13 @@ define <4 x i32> @test_ctselect_v4i32_const_false(<4 x i32> %a, <4 x i32> %b) { ; AVX2-NEXT: negl %eax ; AVX2-NEXT: xorps %xmm3, %xmm3 ; AVX2-NEXT: movd %eax, %xmm3 -; AVX2-NEXT: pshufd $0, %xmm3, %xmm3 # xmm3 = xmm3[0,0,0,0] +; AVX2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] ; AVX2-NEXT: movdqa %xmm3, %xmm2 ; AVX2-NEXT: pand %xmm0, %xmm3 ; AVX2-NEXT: pandn %xmm1, %xmm2 ; AVX2-NEXT: por %xmm3, %xmm2 ; AVX2-NEXT: vmovdqa %xmm2, %xmm0 ; AVX2-NEXT: retq -; ; AVX512-LABEL: test_ctselect_v4i32_const_false: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovaps %xmm1, %xmm0 @@ -1215,7 +1201,7 @@ define <4 x i32> @test_ctselect_v4i32_icmp(i32 %x, i32 
%y, <4 x i32> %a, <4 x i3 ; SSE2-NEXT: negl %eax ; SSE2-NEXT: xorps %xmm3, %xmm3 ; SSE2-NEXT: movd %eax, %xmm3 -; SSE2-NEXT: pshufd $0, %xmm3, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] ; SSE2-NEXT: movdqa %xmm3, %xmm2 ; SSE2-NEXT: pand %xmm0, %xmm3 ; SSE2-NEXT: pandn %xmm1, %xmm2 @@ -1234,7 +1220,7 @@ define <4 x i32> @test_ctselect_v4i32_icmp(i32 %x, i32 %y, <4 x i32> %a, <4 x i3 ; AVX-NEXT: negl %eax ; AVX-NEXT: xorps %xmm3, %xmm3 ; AVX-NEXT: movd %eax, %xmm3 -; AVX-NEXT: pshufd $0, %xmm3, %xmm3 +; AVX-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] ; AVX-NEXT: movdqa %xmm3, %xmm2 ; AVX-NEXT: pand %xmm0, %xmm3 ; AVX-NEXT: pandn %xmm1, %xmm2 @@ -1253,14 +1239,13 @@ define <4 x i32> @test_ctselect_v4i32_icmp(i32 %x, i32 %y, <4 x i32> %a, <4 x i3 ; AVX2-NEXT: negl %eax ; AVX2-NEXT: xorps %xmm3, %xmm3 ; AVX2-NEXT: movd %eax, %xmm3 -; AVX2-NEXT: pshufd $0, %xmm3, %xmm3 # xmm3 = xmm3[0,0,0,0] +; AVX2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] ; AVX2-NEXT: movdqa %xmm3, %xmm2 ; AVX2-NEXT: pand %xmm0, %xmm3 ; AVX2-NEXT: pandn %xmm1, %xmm2 ; AVX2-NEXT: por %xmm3, %xmm2 ; AVX2-NEXT: vmovdqa %xmm2, %xmm0 ; AVX2-NEXT: retq -; ; AVX512-LABEL: test_ctselect_v4i32_icmp: ; AVX512: # %bb.0: ; AVX512-NEXT: cmpl %esi, %edi diff --git a/llvm/test/CodeGen/X86/ctselect.ll b/llvm/test/CodeGen/X86/ctselect.ll index 71a847f00d166..580253b27d44e 100644 --- a/llvm/test/CodeGen/X86/ctselect.ll +++ b/llvm/test/CodeGen/X86/ctselect.ll @@ -10,7 +10,7 @@ define i8 @test_ctselect_i8(i1 %cond, i8 %a, i8 %b) { ; X64-NEXT: movl %edx, %eax ; X64-NEXT: testb $1, %dil ; X64-NEXT: cmovnel %esi, %eax -; X64-NEXT: # kill: def $al killed $al killed $eax +; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq ; ; X32-LABEL: test_ctselect_i8: @@ -18,7 +18,8 @@ define i8 @test_ctselect_i8(i1 %cond, i8 %a, i8 %b) { ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: testb $1, {{[0-9]+}}(%esp) ; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax -; X32: retl +; X32-NEXT: # kill: def $al killed $al killed $eax +; X32-NEXT: retl %result = call i8 @llvm.ct.select.i8(i1 %cond, i8 %a, i8 %b) ret i8 %result } @@ -29,7 +30,7 @@ define i16 @test_ctselect_i16(i1 %cond, i16 %a, i16 %b) { ; X64-NEXT: movl %edx, %eax ; X64-NEXT: testb $1, %dil ; X64-NEXT: cmovnel %esi, %eax -; X64-NEXT: # kill: def $ax killed $ax killed $eax +; X64-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NEXT: retq ; ; X32-LABEL: test_ctselect_i16: @@ -37,7 +38,7 @@ define i16 @test_ctselect_i16(i1 %cond, i16 %a, i16 %b) { ; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X32-NEXT: testb $1, {{[0-9]+}}(%esp) ; X32-NEXT: cmovnew {{[0-9]+}}(%esp), %ax -; X32: retl +; X32-NEXT: retl %result = call i16 @llvm.ct.select.i16(i1 %cond, i16 %a, i16 %b) ret i16 %result } @@ -48,14 +49,14 @@ define i32 @test_ctselect_i32(i1 %cond, i32 %a, i32 %b) { ; X64-NEXT: movl %edx, %eax ; X64-NEXT: testb $1, %dil ; X64-NEXT: cmovnel %esi, %eax -; X64: retq +; X64-NEXT: retq ; ; X32-LABEL: test_ctselect_i32: ; X32: # %bb.0: ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: testb $1, {{[0-9]+}}(%esp) ; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax -; X32: retl +; X32-NEXT: retl %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) ret i32 %result } From 6945ccc16cdb1f8401fbc7ff9de961e4472faed7 Mon Sep 17 00:00:00 2001 From: AkshayK Date: Fri, 30 May 2025 10:30:46 -0400 Subject: [PATCH 32/63] [CT]implementation of ct.select intrinsics and clang frontend changes for X86 architecture --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 9 +++ 
llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 31 +-------- llvm/lib/Target/X86/X86ISelLowering.cpp | 53 ++++++++------- llvm/test/CodeGen/X86/ctselect-edge-cases.ll | 65 +++++++------------ llvm/test/CodeGen/X86/ctselect-vector.ll | 1 + llvm/test/CodeGen/X86/ctselect.ll | 13 ++-- 6 files changed, 69 insertions(+), 103 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 589f85671eddd..61feb82a585c9 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -12550,6 +12550,9 @@ SDValue DAGCombiner::visitCTSELECT(SDNode *N) { SDLoc DL(N); SDNodeFlags Flags = N->getFlags(); + // if (SDValue V = DAG.simplifySelect(N0, N1, N2)) + // return V; + if (SDValue V = foldBoolSelectToLogic(N, DL, DAG)) return V; @@ -12560,6 +12563,9 @@ SDValue DAGCombiner::visitCTSELECT(SDNode *N) { return SelectOp; } + // if (SDValue V = foldSelectOfConstants(N)) + // return V; + if (VT0 == MVT::i1) { // The code in this block deals with the following 2 equivalences: // select(C0|C1, x, y) <=> select(C0, x, select(C1, x, y)) @@ -12641,6 +12647,9 @@ SDValue DAGCombiner::visitCTSELECT(SDNode *N) { } } + if (SDValue R = combineSelectAsExtAnd(N0, N1, N2, DL, DAG)) + return R; + return SDValue(); }
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index 3b134220368dc..cae3c8ab30cfe 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -4139,35 +4139,8 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) { Tmp1 = Node->getOperand(0); Tmp2 = Node->getOperand(1); Tmp3 = Node->getOperand(2); - EVT VT = Tmp2.getValueType(); - if (VT.isVector()) { - SmallVector<SDValue> Elements; - unsigned NumElements = VT.getVectorNumElements(); - EVT ScalarVT = VT.getScalarType(); - for (unsigned Idx = 0; Idx < NumElements; ++Idx) { - SDValue IdxVal = DAG.getConstant(Idx, dl, MVT::i64); - SDValue TVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, Tmp2, IdxVal); - SDValue FVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, Tmp3, IdxVal); - Elements.push_back(DAG.getCTSelect(dl, ScalarVT, Tmp1, TVal, FVal, Node->getFlags())); - } - Tmp1 = DAG.getBuildVector(VT, dl, Elements); - } else if (VT.isFloatingPoint()) { - EVT IntegerVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits()); - Tmp2 = DAG.getBitcast(IntegerVT, Tmp2); - Tmp3 = DAG.getBitcast(IntegerVT, Tmp3); - Tmp1 = DAG.getBitcast(VT, DAG.getCTSelect(dl, IntegerVT, Tmp1, Tmp2, Tmp3, Node->getFlags())); - } else { - assert(VT.isInteger()); - EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext()); - SDValue Tmp2Lo, Tmp2Hi; - SDValue Tmp3Lo, Tmp3Hi; - std::tie(Tmp2Lo, Tmp2Hi) = DAG.SplitScalar(Tmp2, dl, HalfVT, HalfVT); - std::tie(Tmp3Lo, Tmp3Hi) = DAG.SplitScalar(Tmp3, dl, HalfVT, HalfVT); - SDValue ResLo = DAG.getCTSelect(dl, HalfVT, Tmp1, Tmp2Lo, Tmp3Lo, Node->getFlags()); - SDValue ResHi = DAG.getCTSelect(dl, HalfVT, Tmp1, Tmp2Hi, Tmp3Hi, Node->getFlags()); - Tmp1 = DAG.getNode(ISD::BUILD_PAIR, dl, VT, ResLo, ResHi); - Tmp1->setFlags(Node->getFlags()); - } + Tmp1 = DAG.getCTSelect(dl, Tmp2.getValueType(), Tmp1, Tmp2, Tmp3); + Tmp1->setFlags(Node->getFlags()); Results.push_back(Tmp1); break; }
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 91b5ed5c3f8b6..9f400a970793c 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -2567,22 +2567,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, addRegisterClass(MVT::x86amx, &X86::TILERegClass); } - // Handle 512-bit vector CTSELECT without AVX512 by setting them to Expand - // This allows type legalization to split them into smaller vectors - for (auto VT : {MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64, MVT::v32f16, - MVT::v16f32, MVT::v8f64}) { - setOperationAction(ISD::CTSELECT, VT, Expand); - } - - // Handle 256-bit vector CTSELECT without AVX by setting them to Expand - // This allows type legalization to split them into 128-bit vectors - if (!Subtarget.hasAVX()) { - for (auto VT : {MVT::v4f64, MVT::v4i64, MVT::v8i32, MVT::v16i16, - MVT::v16f16, MVT::v32i8, MVT::v8f32}) { - setOperationAction(ISD::CTSELECT, VT, Expand); - } - } - // We want to custom lower some of our intrinsics. setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom); @@ -25396,18 +25380,27 @@ SDValue X86TargetLowering::LowerCTSELECT(SDValue Op, SelectionDAG &DAG) const { unsigned VectorWidth = VT.getSizeInBits(); MVT EltVT = VT.getVectorElementType(); + // Check if we have the necessary SIMD support + bool HasSSE = Subtarget.hasSSE1(); + bool HasAVX = Subtarget.hasAVX(); + bool HasAVX512 = Subtarget.hasAVX512(); - // 512-bit vectors without AVX512 are now handled by type legalization - // (Expand action); 256-bit vectors without AVX are likewise handled by - // type legalization (Expand action). + // For 512-bit vectors, we need AVX512 + if (VectorWidth == 512 && !HasAVX512) + return SDValue(); - if (VectorWidth == 128 && !Subtarget.hasSSE1()) + // For 256-bit vectors, we need at least AVX + if (VectorWidth == 256 && !HasAVX) + return SDValue(); + + // For 128-bit vectors, we need at least SSE + if (VectorWidth == 128 && !HasSSE) return SDValue(); // Handle special cases for floating point vectors if (EltVT.isFloatingPoint()) { // For vector floating point with AVX, use VBLENDV-style operations - if (Subtarget.hasAVX() && (VectorWidth == 256 || VectorWidth == 128)) { + if (HasAVX && (VectorWidth == 256 || VectorWidth == 128)) { // Convert to bitwise operations using the condition MVT IntVT = VT.changeVectorElementTypeToInteger(); SDValue IntOp1 = DAG.getBitcast(IntVT, TrueOp); @@ -25448,7 +25441,13 @@ SDValue X86TargetLowering::LowerCTSELECT(SDValue Op, SelectionDAG &DAG) const { CC = Cond.getOperand(0); SDValue Cmp = Cond.getOperand(1); - if ((isX86LogicalCmp(Cmp)) || Cmp.getOpcode() == X86ISD::BT) { + bool IllegalFPCMov = false; + if (VT.isFloatingPoint() && !VT.isVector() && + !isScalarFPTypeInSSEReg(VT) && Subtarget.canUseCMOV()) + IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue()); + + if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) || + Cmp.getOpcode() == X86ISD::BT) { Cond = Cmp; AddTest = false; } @@ -25497,9 +25496,9 @@ SDValue X86TargetLowering::LowerCTSELECT(SDValue Op, SelectionDAG &DAG) const { if (T1.getValueType() == T2.getValueType() && T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode() != ISD::CopyFromReg) { - SDValue CtSelect = DAG.getNode(X86ISD::CTSELECT, DL, T1.getValueType(), - T2, T1, CC, ProcessedCond); - return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), CtSelect); + SDValue Cmov = DAG.getNode(X86ISD::CTSELECT, DL, T1.getValueType(), T2, + T1, CC, ProcessedCond); + return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov); } } @@ -25510,8 +25509,8 @@ SDValue X86TargetLowering::LowerCTSELECT(SDValue Op, SelectionDAG &DAG) const { TrueOp = DAG.getNode(ISD::ANY_EXTEND, DL,
MVT::i32, TrueOp); FalseOp = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, FalseOp); SDValue Ops[] = {FalseOp, TrueOp, CC, ProcessedCond}; - SDValue CtSelect = DAG.getNode(X86ISD::CTSELECT, DL, MVT::i32, Ops); - return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), CtSelect); + SDValue Cmov = DAG.getNode(X86ISD::CTSELECT, DL, MVT::i32, Ops); + return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov); } if (isScalarFPTypeInSSEReg(VT)) { diff --git a/llvm/test/CodeGen/X86/ctselect-edge-cases.ll b/llvm/test/CodeGen/X86/ctselect-edge-cases.ll index fb6b4706d62d8..06791a3262749 100644 --- a/llvm/test/CodeGen/X86/ctselect-edge-cases.ll +++ b/llvm/test/CodeGen/X86/ctselect-edge-cases.ll @@ -13,17 +13,11 @@ define i128 @test_ctselect_i128(i1 %cond, i128 %a, i128 %b) { ; X64-NEXT: cmovneq %rsi, %rax ; X64-NEXT: cmovneq %rdx, %r8 ; X64-NEXT: movq %r8, %rdx -; X64-NEXT: retq +; X64: retq ; ; X32-LABEL: test_ctselect_i128: ; X32: # %bb.0: -; X32-NEXT: pushl %edi -; X32-NEXT: .cfi_def_cfa_offset 8 -; X32-NEXT: pushl %esi -; X32-NEXT: .cfi_def_cfa_offset 12 -; X32-NEXT: .cfi_offset %esi, -12 -; X32-NEXT: .cfi_offset %edi, -8 -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: movl {{[0-9]+}}(%esp), %edx ; X32-NEXT: movl {{[0-9]+}}(%esp), %esi @@ -33,15 +27,11 @@ define i128 @test_ctselect_i128(i1 %cond, i128 %a, i128 %b) { ; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %edi ; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %ecx ; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %edx -; X32-NEXT: movl %edx, 12(%eax) -; X32-NEXT: movl %ecx, 8(%eax) -; X32-NEXT: movl %edi, 4(%eax) +; X32-NEXT: movl %edx, {{[0-9]+}}(%eax) +; X32-NEXT: movl %ecx, {{[0-9]+}}(%eax) +; X32-NEXT: movl %edi, {{[0-9]+}}(%eax) ; X32-NEXT: movl %esi, (%eax) -; X32-NEXT: popl %esi -; X32-NEXT: .cfi_def_cfa_offset 8 -; X32-NEXT: popl %edi -; X32-NEXT: .cfi_def_cfa_offset 4 -; X32-NEXT: retl $4 +; X32: retl $4 %result = call i128 @llvm.ct.select.i128(i1 %cond, i128 %a, i128 %b) ret i128 %result } @@ -53,16 +43,14 @@ define i1 @test_ctselect_i1(i1 %cond, i1 %a, i1 %b) { ; X64-NEXT: movl %edx, %eax ; X64-NEXT: testb $1, %dil ; X64-NEXT: cmovnel %esi, %eax -; X64-NEXT: # kill: def $al killed $al killed $eax -; X64-NEXT: retq +; X64: retq ; ; X32-LABEL: test_ctselect_i1: ; X32: # %bb.0: ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: testb $1, {{[0-9]+}}(%esp) ; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax -; X32-NEXT: # kill: def $al killed $al killed $eax -; X32-NEXT: retl +; X32: retl %result = call i1 @llvm.ct.select.i1(i1 %cond, i1 %a, i1 %b) ret i1 %result } @@ -72,16 +60,16 @@ define i32 @test_ctselect_extremal_values(i1 %cond) { ; X64-LABEL: test_ctselect_extremal_values: ; X64: # %bb.0: ; X64-NEXT: testb $1, %dil -; X64-NEXT: movl $2147483647, %ecx # imm = 0x7FFFFFFF -; X64-NEXT: movl $-2147483648, %eax # imm = 0x80000000 +; X64-NEXT: movl $2147483647, %ecx +; X64-NEXT: movl $-2147483648, %eax ; X64-NEXT: cmovnel %ecx, %eax ; X64-NEXT: retq ; ; X32-LABEL: test_ctselect_extremal_values: ; X32: # %bb.0: ; X32-NEXT: testb $1, {{[0-9]+}}(%esp) -; X32-NEXT: movl $2147483647, %ecx # imm = 0x7FFFFFFF -; X32-NEXT: movl $-2147483648, %eax # imm = 0x80000000 +; X32-NEXT: movl $2147483647, %ecx +; X32-NEXT: movl $-2147483648, %eax ; X32-NEXT: cmovnel %ecx, %eax ; X32-NEXT: retl %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 2147483647, i32 -2147483648) @@ -93,8 +81,8 @@ define float @test_ctselect_f32_special_values(i1 %cond) { ; X64-LABEL: test_ctselect_f32_special_values: ; X64: # 
%bb.0: ; X64-NEXT: testb $1, %dil -; X64-NEXT: movl $2143289344, %eax # imm = 0x7FC00000 -; X64-NEXT: movl $2139095040, %ecx # imm = 0x7F800000 +; X64-NEXT: movl $2143289344, %eax +; X64-NEXT: movl $2139095040, %ecx ; X64-NEXT: cmovnel %eax, %ecx ; X64-NEXT: movd %ecx, %xmm0 ; X64-NEXT: retq @@ -102,8 +90,8 @@ define float @test_ctselect_f32_special_values(i1 %cond) { ; X32-LABEL: test_ctselect_f32_special_values: ; X32: # %bb.0: ; X32-NEXT: testb $1, {{[0-9]+}}(%esp) -; X32-NEXT: flds {{\.?LCPI[0-9]+_[0-9]+}} -; X32-NEXT: flds {{\.?LCPI[0-9]+_[0-9]+}} +; X32-NEXT: flds .LCPI3_0 +; X32-NEXT: flds .LCPI3_1 ; X32-NEXT: jne .LBB3_2 ; X32-NEXT: # %bb.1: ; X32-NEXT: fstp %st(1) @@ -119,8 +107,8 @@ define double @test_ctselect_f64_special_values(i1 %cond) { ; X64-LABEL: test_ctselect_f64_special_values: ; X64: # %bb.0: ; X64-NEXT: testb $1, %dil -; X64-NEXT: movabsq $9221120237041090560, %rax # imm = 0x7FF8000000000000 -; X64-NEXT: movabsq $9218868437227405312, %rcx # imm = 0x7FF0000000000000 +; X64-NEXT: movabsq $9221120237041090560, %rax +; X64-NEXT: movabsq $9218868437227405312, %rcx ; X64-NEXT: cmovneq %rax, %rcx ; X64-NEXT: movq %rcx, %xmm0 ; X64-NEXT: retq @@ -128,8 +116,8 @@ define double @test_ctselect_f64_special_values(i1 %cond) { ; X32-LABEL: test_ctselect_f64_special_values: ; X32: # %bb.0: ; X32-NEXT: testb $1, {{[0-9]+}}(%esp) -; X32-NEXT: flds {{\.?LCPI[0-9]+_[0-9]+}} -; X32-NEXT: flds {{\.?LCPI[0-9]+_[0-9]+}} +; X32-NEXT: flds .LCPI4_0 +; X32-NEXT: flds .LCPI4_1 ; X32-NEXT: jne .LBB4_2 ; X32-NEXT: # %bb.1: ; X32-NEXT: fstp %st(1) @@ -279,9 +267,9 @@ define ptr @test_ctselect_struct_ptr(i1 %cond, ptr %a, ptr %b) { define i32 @test_ctselect_deeply_nested(i1 %c1, i1 %c2, i1 %c3, i1 %c4, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e) { ; X64-LABEL: test_ctselect_deeply_nested: ; X64: # %bb.0: -; X64-NEXT: movl {{[0-9]+}}(%rsp), %eax -; X64-NEXT: movl {{[0-9]+}}(%rsp), %r10d -; X64-NEXT: movl {{[0-9]+}}(%rsp), %r11d +; X64-NEXT: movl 24(%rsp), %eax +; X64-NEXT: movl 16(%rsp), %r10d +; X64-NEXT: movl 8(%rsp), %r11d ; X64-NEXT: testb $1, %dil ; X64-NEXT: cmovnel %r8d, %r9d ; X64-NEXT: testb $1, %sil @@ -295,9 +283,7 @@ define i32 @test_ctselect_deeply_nested(i1 %c1, i1 %c2, i1 %c3, i1 %c4, i32 %a, ; X32-LABEL: test_ctselect_deeply_nested: ; X32: # %bb.0: ; X32-NEXT: pushl %esi -; X32-NEXT: .cfi_def_cfa_offset 8 -; X32-NEXT: .cfi_offset %esi, -8 -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: movl {{[0-9]+}}(%esp), %edx ; X32-NEXT: movl {{[0-9]+}}(%esp), %esi @@ -310,8 +296,7 @@ define i32 @test_ctselect_deeply_nested(i1 %c1, i1 %c2, i1 %c3, i1 %c4, i32 %a, ; X32-NEXT: testb $1, {{[0-9]+}}(%esp) ; X32-NEXT: cmovnel %ecx, %eax ; X32-NEXT: popl %esi -; X32-NEXT: .cfi_def_cfa_offset 4 -; X32-NEXT: retl +; X32: retl %sel1 = call i32 @llvm.ct.select.i32(i1 %c1, i32 %a, i32 %b) %sel2 = call i32 @llvm.ct.select.i32(i1 %c2, i32 %sel1, i32 %c) %sel3 = call i32 @llvm.ct.select.i32(i1 %c3, i32 %sel2, i32 %d) diff --git a/llvm/test/CodeGen/X86/ctselect-vector.ll b/llvm/test/CodeGen/X86/ctselect-vector.ll index 0e53a8324e5ce..a3a7934d2bb92 100644 --- a/llvm/test/CodeGen/X86/ctselect-vector.ll +++ b/llvm/test/CodeGen/X86/ctselect-vector.ll @@ -2,6 +2,7 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2 | FileCheck %s --check-prefix=SSE2 ; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx | FileCheck %s --check-prefix=AVX ; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2 | FileCheck %s 
--check-prefix=AVX2 +; COM: RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 ; Test ct.select functionality for vector types
diff --git a/llvm/test/CodeGen/X86/ctselect.ll b/llvm/test/CodeGen/X86/ctselect.ll index 580253b27d44e..71a847f00d166 100644 --- a/llvm/test/CodeGen/X86/ctselect.ll +++ b/llvm/test/CodeGen/X86/ctselect.ll @@ -10,7 +10,7 @@ define i8 @test_ctselect_i8(i1 %cond, i8 %a, i8 %b) { ; X64-NEXT: movl %edx, %eax ; X64-NEXT: testb $1, %dil ; X64-NEXT: cmovnel %esi, %eax -; X64-NEXT: # kill: def $al killed $al killed $eax +; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq ; ; X32-LABEL: test_ctselect_i8: @@ -18,8 +18,7 @@ define i8 @test_ctselect_i8(i1 %cond, i8 %a, i8 %b) { ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: testb $1, {{[0-9]+}}(%esp) ; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax -; X32-NEXT: # kill: def $al killed $al killed $eax -; X32-NEXT: retl +; X32: retl %result = call i8 @llvm.ct.select.i8(i1 %cond, i8 %a, i8 %b) ret i8 %result } @@ -30,7 +29,7 @@ define i16 @test_ctselect_i16(i1 %cond, i16 %a, i16 %b) { ; X64-NEXT: movl %edx, %eax ; X64-NEXT: testb $1, %dil ; X64-NEXT: cmovnel %esi, %eax -; X64-NEXT: # kill: def $ax killed $ax killed $eax +; X64-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NEXT: retq ; ; X32-LABEL: test_ctselect_i16: @@ -38,7 +37,7 @@ define i16 @test_ctselect_i16(i1 %cond, i16 %a, i16 %b) { ; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X32-NEXT: testb $1, {{[0-9]+}}(%esp) ; X32-NEXT: cmovnew {{[0-9]+}}(%esp), %ax -; X32-NEXT: retl +; X32: retl %result = call i16 @llvm.ct.select.i16(i1 %cond, i16 %a, i16 %b) ret i16 %result } @@ -49,14 +48,14 @@ define i32 @test_ctselect_i32(i1 %cond, i32 %a, i32 %b) { ; X64-NEXT: movl %edx, %eax ; X64-NEXT: testb $1, %dil ; X64-NEXT: cmovnel %esi, %eax -; X64-NEXT: retq +; X64: retq ; ; X32-LABEL: test_ctselect_i32: ; X32: # %bb.0: ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: testb $1, {{[0-9]+}}(%esp) ; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax -; X32-NEXT: retl +; X32: retl %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) ret i32 %result }
From 1815b06cdf37419e7dc9aa9ed868f101baa27275 Mon Sep 17 00:00:00 2001 From: Francesco Bertolaccini Date: Tue, 15 Jul 2025 15:04:05 +0200 Subject: [PATCH 33/63] [CT] WIP: Implement CTSELECT for ARM --- llvm/lib/Target/ARM/ARMISelLowering.cpp | 48 +++++++++++++++++++++++++ llvm/lib/Target/ARM/ARMISelLowering.h | 1 + 2 files changed, 49 insertions(+)
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index 67ea2dd3df792..5210f23384fcb 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -203,6 +203,7 @@ void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT) { setOperationAction(ISD::SELECT, VT, Expand); setOperationAction(ISD::SELECT_CC, VT, Expand); setOperationAction(ISD::VSELECT, VT, Expand); + setOperationAction(ISD::CTSELECT, VT, Expand); setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand); if (VT.isInteger()) { setOperationAction(ISD::SHL, VT, Custom); @@ -304,6 +305,7 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) { setOperationAction(ISD::CTPOP, VT, Expand); setOperationAction(ISD::SELECT, VT, Expand); setOperationAction(ISD::SELECT_CC, VT, Expand); + setOperationAction(ISD::CTSELECT, VT, Expand); // Vector reductions setOperationAction(ISD::VECREDUCE_ADD, VT, Legal); @@ -355,6 +357,7 @@
setOperationAction(ISD::MSTORE, VT, Legal); setOperationAction(ISD::SELECT, VT, Expand); setOperationAction(ISD::SELECT_CC, VT, Expand); + setOperationAction(ISD::CTSELECT, VT, Expand); // Pre and Post inc are supported on loads and stores for (unsigned im = (unsigned)ISD::PRE_INC; @@ -474,6 +477,7 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) { setOperationAction(ISD::VSELECT, VT, Expand); setOperationAction(ISD::SELECT, VT, Expand); setOperationAction(ISD::SELECT_CC, VT, Expand); + setOperationAction(ISD::CTSELECT, VT, Expand); if (!HasMVEFP) { setOperationAction(ISD::SINT_TO_FP, VT, Expand); @@ -1237,10 +1241,15 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_, setOperationAction(ISD::SELECT_CC, MVT::i32, Custom); setOperationAction(ISD::SELECT_CC, MVT::f32, Custom); setOperationAction(ISD::SELECT_CC, MVT::f64, Custom); + setOperationAction(ISD::CTSELECT, MVT::i32, Custom); + setOperationAction(ISD::CTSELECT, MVT::f32, Custom); + setOperationAction(ISD::CTSELECT, MVT::i64, Custom); + setOperationAction(ISD::CTSELECT, MVT::f64, Custom); if (Subtarget->hasFullFP16()) { setOperationAction(ISD::SETCC, MVT::f16, Expand); setOperationAction(ISD::SELECT, MVT::f16, Custom); setOperationAction(ISD::SELECT_CC, MVT::f16, Custom); + setOperationAction(ISD::CTSELECT, MVT::f16, Custom); } setOperationAction(ISD::SETCCCARRY, MVT::i32, Custom); @@ -5103,6 +5112,23 @@ SDValue ARMTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { SelectTrue, SelectFalse, ISD::SETNE); } +SDValue ARMTargetLowering::LowerCTSELECT(SDValue Op, SelectionDAG &DAG) const { + SDValue Cond = Op.getOperand(0); + SDValue SelectTrue = Op.getOperand(1); + SDValue SelectFalse = Op.getOperand(2); + EVT VT = Op.getValueType(); + SDLoc DL(Op); + + SDValue Zero = DAG.getConstant(0, DL, Cond.getValueType()); + SDValue Val = DAG.getSetCC(DL, VT, Cond, Zero, ISD::SETNE); + SDValue Mask = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Val); + SDValue MaskNeg = DAG.getNOT(DL, Mask, VT); + + SelectTrue = DAG.getNode(ISD::AND, DL, VT, SelectTrue, Mask); + SelectFalse = DAG.getNode(ISD::AND, DL, VT, SelectFalse, MaskNeg); + return DAG.getNode(ISD::OR, DL, VT, SelectTrue, SelectFalse); +} + static void checkVSELConstraints(ISD::CondCode CC, ARMCC::CondCodes &CondCode, bool &swpCmpOps, bool &swpVselOps) { // Start by selecting the GE condition code for opcodes that return true for @@ -10599,6 +10625,7 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG); case ISD::SELECT: return LowerSELECT(Op, DAG); case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); + case ISD::CTSELECT: return LowerCTSELECT(Op, DAG); case ISD::BRCOND: return LowerBRCOND(Op, DAG); case ISD::BR_CC: return LowerBR_CC(Op, DAG); case ISD::BR_JT: return LowerBR_JT(Op, DAG); @@ -10751,6 +10778,24 @@ static void ReplaceLongIntrinsic(SDNode *N, SmallVectorImpl &Results, LongMul.getValue(0), LongMul.getValue(1))); } +static SDValue ExpandCTSELECT(SDNode *N, SelectionDAG &DAG) { + SDValue Cond = N->getOperand(0); + SDValue TrueValue = N->getOperand(1); + SDValue FalseValue = N->getOperand(2); + SDLoc DL(N); + + SDValue TrueValueHi, TrueValueLo; + std::tie(TrueValueHi, TrueValueLo) = DAG.SplitScalar(TrueValue, DL, MVT::i32, MVT::i32); + + SDValue FalseValueHi, FalseValueLo; + std::tie(FalseValueHi, FalseValueLo) = DAG.SplitScalar(FalseValue, DL, MVT::i32, MVT::i32); + + SDValue ResHi = DAG.getNode(ISD::CTSELECT, DL, MVT::i32, {Cond, 
TrueValueHi, FalseValueHi}); + SDValue ResLo = DAG.getNode(ISD::CTSELECT, DL, MVT::i32, {Cond, TrueValueLo, FalseValueLo}); + + return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, ResLo, ResHi); +} + /// ReplaceNodeResults - Replace the results of node with an illegal result /// type with new values built out of custom code. void ARMTargetLowering::ReplaceNodeResults(SDNode *N, @@ -10815,6 +10860,9 @@ void ARMTargetLowering::ReplaceNodeResults(SDNode *N, case ISD::FP_TO_UINT_SAT: Res = LowerFP_TO_INT_SAT(SDValue(N, 0), DAG, Subtarget); break; + case ISD::CTSELECT: + Res = ExpandCTSELECT(N, DAG); + break; } if (Res.getNode()) Results.push_back(Res); diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h index 70aa001a41885..0e4f66e0e391d 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.h +++ b/llvm/lib/Target/ARM/ARMISelLowering.h @@ -880,6 +880,7 @@ class VectorType; SDValue LowerUnsignedALUO(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerCTSELECT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const; From 62e58e5295a10a3ee0deb650995dc076648c97ff Mon Sep 17 00:00:00 2001 From: Kelly Kaoudis Date: Thu, 17 Jul 2025 09:52:34 -0600 Subject: [PATCH 34/63] WIP: integrate the ctselect intrinsic so it can be recognised and lowered, connecting it to the lowering function --- llvm/lib/Target/ARM/ARMArchitectures.td | 15 +- llvm/lib/Target/ARM/ARMFeatures.td | 8 + llvm/lib/Target/ARM/ARMISelLowering.cpp | 98 +++++++-- llvm/lib/Target/ARM/ARMISelLowering.h | 14 +- llvm/lib/Target/ARM/ARMInstrInfo.td | 9 + llvm/test/CodeGen/ARM/ctselect.ll | 263 ++++++++++++++++++++++++ 6 files changed, 379 insertions(+), 28 deletions(-) create mode 100644 llvm/test/CodeGen/ARM/ctselect.ll diff --git a/llvm/lib/Target/ARM/ARMArchitectures.td b/llvm/lib/Target/ARM/ARMArchitectures.td index 301ed5bf3e3fc..ea32d200fdb83 100644 --- a/llvm/lib/Target/ARM/ARMArchitectures.td +++ b/llvm/lib/Target/ARM/ARMArchitectures.td @@ -46,7 +46,8 @@ def ARMv7a : Architecture<"armv7-a", "ARMv7a", [HasV7Ops, FeatureDB, FeatureDSP, FeatureAClass, - FeaturePerfMon]>; + FeaturePerfMon, + FeatureCtSelect]>; def ARMv7ve : Architecture<"armv7ve", "ARMv7ve", [HasV7Ops, FeatureNEON, @@ -56,14 +57,16 @@ def ARMv7ve : Architecture<"armv7ve", "ARMv7ve", [HasV7Ops, FeatureMP, FeatureVirtualization, FeatureAClass, - FeaturePerfMon]>; + FeaturePerfMon, + FeatureCtSelect]>; def ARMv7r : Architecture<"armv7-r", "ARMv7r", [HasV7Ops, FeatureDB, FeatureDSP, FeatureHWDivThumb, FeatureRClass, - FeaturePerfMon]>; + FeaturePerfMon, + FeatureCtSelect]>; def ARMv7m : Architecture<"armv7-m", "ARMv7m", [HasV7Ops, FeatureThumb2, @@ -71,7 +74,8 @@ def ARMv7m : Architecture<"armv7-m", "ARMv7m", [HasV7Ops, ModeThumb, FeatureDB, FeatureHWDivThumb, - FeatureMClass]>; + FeatureMClass, + FeatureCtSelect]>; def ARMv7em : Architecture<"armv7e-m", "ARMv7em", [HasV7Ops, FeatureThumb2, @@ -80,7 +84,8 @@ def ARMv7em : Architecture<"armv7e-m", "ARMv7em", [HasV7Ops, FeatureDB, FeatureHWDivThumb, FeatureMClass, - FeatureDSP]>; + FeatureDSP, + FeatureCtSelect]>; def ARMv8a : Architecture<"armv8-a", "ARMv8a", [HasV8Ops, FeatureAClass, diff --git a/llvm/lib/Target/ARM/ARMFeatures.td b/llvm/lib/Target/ARM/ARMFeatures.td index 9b1fa5d7b99d8..ea6baf50ee3e2 100644 --- 
a/llvm/lib/Target/ARM/ARMFeatures.td
+++ b/llvm/lib/Target/ARM/ARMFeatures.td
@@ -753,6 +753,14 @@ def FeatureHardenSlsNoComdat : SubtargetFeature<"harden-sls-nocomdat",
   "HardenSlsNoComdat", "true",
   "Generate thunk code for SLS mitigation in the normal text section">;
 
+//===----------------------------------------------------------------------===//
+// Support for constant-time coding for branch timing mitigation.
+//===----------------------------------------------------------------------===//
+
+// Constant-time selection is ARMv7 only, for the moment!
+def FeatureCtSelect : SubtargetFeature<"ctselect", "HasCtSelect", "true",
+    "Enable feature to implement constant-time select">;
+
 //===----------------------------------------------------------------------===//
 // Endianness of instruction encodings in memory.
 //
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 5210f23384fcb..b1d074558bf46 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -203,7 +203,7 @@ void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT) {
   setOperationAction(ISD::SELECT, VT, Expand);
   setOperationAction(ISD::SELECT_CC, VT, Expand);
   setOperationAction(ISD::VSELECT, VT, Expand);
-  setOperationAction(ISD::CTSELECT, VT, Expand);
+  setOperationAction(ISD::CTSELECT, VT, Custom);
   setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
   if (VT.isInteger()) {
     setOperationAction(ISD::SHL, VT, Custom);
@@ -305,7 +305,7 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) {
   setOperationAction(ISD::CTPOP, VT, Expand);
   setOperationAction(ISD::SELECT, VT, Expand);
   setOperationAction(ISD::SELECT_CC, VT, Expand);
-  setOperationAction(ISD::CTSELECT, VT, Expand);
+  setOperationAction(ISD::CTSELECT, VT, Custom);
 
   // Vector reductions
   setOperationAction(ISD::VECREDUCE_ADD, VT, Legal);
@@ -357,7 +357,7 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) {
   setOperationAction(ISD::MSTORE, VT, Legal);
   setOperationAction(ISD::SELECT, VT, Expand);
   setOperationAction(ISD::SELECT_CC, VT, Expand);
-  setOperationAction(ISD::CTSELECT, VT, Expand);
+  setOperationAction(ISD::CTSELECT, VT, Custom);
 
   // Pre and Post inc are supported on loads and stores
   for (unsigned im = (unsigned)ISD::PRE_INC;
@@ -422,6 +422,7 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) {
   setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
   setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
   setOperationAction(ISD::VSELECT, VT, Legal);
+  setOperationAction(ISD::CTSELECT, VT, Custom);
   setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
 }
 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2f64, Legal);
@@ -474,7 +478,7 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) {
   setOperationAction(ISD::VSELECT, VT, Expand);
   setOperationAction(ISD::SELECT, VT, Expand);
   setOperationAction(ISD::SELECT_CC, VT, Expand);
-  setOperationAction(ISD::CTSELECT, VT, Expand);
+  setOperationAction(ISD::CTSELECT, VT, Custom);
 
   if (!HasMVEFP) {
     setOperationAction(ISD::SINT_TO_FP, VT, Expand);
@@ -1241,9 +1242,11 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_,
   setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
   setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
   setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
+  setOperationAction(ISD::CTSELECT, MVT::i8, Promote);
+  setOperationAction(ISD::CTSELECT, MVT::i16, Promote);
   setOperationAction(ISD::CTSELECT, MVT::i32, Custom);
+  setOperationAction(ISD::CTSELECT, MVT::i64, Expand);
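+  // (i64 is Expand rather than Custom: ExpandCTSELECT below splits it into
+  //  two i32 ct.selects that are recombined with BUILD_PAIR, so no 64-bit
+  //  conditional move is ever needed.)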
setOperationAction(ISD::CTSELECT, MVT::f32, Custom); - setOperationAction(ISD::CTSELECT, MVT::i64, Custom); setOperationAction(ISD::CTSELECT, MVT::f64, Custom); if (Subtarget->hasFullFP16()) { setOperationAction(ISD::SETCC, MVT::f16, Expand); @@ -1576,6 +1579,7 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { MAKE_CASE(ARMISD::BCC_i64) MAKE_CASE(ARMISD::FMSTAT) MAKE_CASE(ARMISD::CMOV) + MAKE_CASE(ARMISD::CTSELECT) MAKE_CASE(ARMISD::SSAT) MAKE_CASE(ARMISD::USAT) MAKE_CASE(ARMISD::ASRL) @@ -5112,21 +5116,74 @@ SDValue ARMTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { SelectTrue, SelectFalse, ISD::SETNE); } +SDValue BuildCtSelectMask(SDValue Cond, EVT MaskVT, SDLoc DL, SelectionDAG &DAG) { + Cond = DAG.getNode(ISD::AND, DL, MaskVT, Cond, DAG.getConstant(1, DL, MaskVT)); + SDValue Zero = DAG.getConstant(0, DL, MaskVT); + return DAG.getNode(ISD::SUB, DL, MaskVT, Zero, Cond); // mask = -cond +} + SDValue ARMTargetLowering::LowerCTSELECT(SDValue Op, SelectionDAG &DAG) const { + SDLoc DL(Op); SDValue Cond = Op.getOperand(0); SDValue SelectTrue = Op.getOperand(1); SDValue SelectFalse = Op.getOperand(2); EVT VT = Op.getValueType(); - SDLoc DL(Op); - SDValue Zero = DAG.getConstant(0, DL, Cond.getValueType()); - SDValue Val = DAG.getSetCC(DL, VT, Cond, Zero, ISD::SETNE); - SDValue Mask = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Val); - SDValue MaskNeg = DAG.getNOT(DL, Mask, VT); + const ARMSubtarget &Subtarget = DAG.getSubtarget(); + if (!Subtarget.hasCtSelect()) { + return DAG.getNode(ISD::SELECT, DL, VT, Cond, SelectTrue, SelectFalse); + } + + EVT ElemVT = VT; + EVT MaskVT = VT; + + if (!VT.isVector()) { + if (VT == MVT::f64) { + // Use <2 x i32> vector mask for scalar f64 + ElemVT = EVT::getIntegerVT(*DAG.getContext(), 32); + MaskVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, 2); + } else { + // float masks as i32 + ElemVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits()); + MaskVT = ElemVT; + } + } else { + ElemVT = EVT::getIntegerVT(*DAG.getContext(), VT.getScalarSizeInBits()); + MaskVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, VT.getVectorNumElements()); + } + + const EVT CondVT = Cond.getValueType(); + if (MaskVT.isVector() && !CondVT.isVector()) { + unsigned CondBits = CondVT.getSizeInBits(); + unsigned ElemBits = ElemVT.getSizeInBits(); - SelectTrue = DAG.getNode(ISD::AND, DL, VT, SelectTrue, Mask); - SelectFalse = DAG.getNode(ISD::AND, DL, VT, SelectFalse, MaskNeg); - return DAG.getNode(ISD::OR, DL, VT, SelectTrue, SelectFalse); + if (CondBits < ElemBits) { + Cond = DAG.getZExtOrTrunc(Cond, DL, ElemVT); + } + + Cond = DAG.getSplatBuildVector(MaskVT, DL, Cond); + } else if (CondVT != MaskVT) { + Cond = DAG.getZExtOrTrunc(Cond, DL, MaskVT); + } + + if (VT.isFloatingPoint()) { + SelectTrue = DAG.getBitcast(MaskVT, SelectTrue); + SelectFalse = DAG.getBitcast(MaskVT, SelectFalse); + } + + SDValue Mask = BuildCtSelectMask(Cond, MaskVT, DL, DAG); + SDValue TrueMasked = DAG.getNode(ISD::AND, DL, MaskVT, SelectTrue, Mask); + + SDValue MaskNeg = DAG.getNOT(DL, Mask, MaskVT); + SDValue FalseMasked = DAG.getNode(ISD::AND, DL, MaskVT, SelectFalse, MaskNeg); + + SDValue ResultInt = DAG.getNode(ISD::OR, DL, MaskVT, TrueMasked, FalseMasked); + + if (VT.isFloatingPoint()) { + return DAG.getBitcast(VT, ResultInt); + } + + return ResultInt; } static void checkVSELConstraints(ISD::CondCode CC, ARMCC::CondCodes &CondCode, @@ -10784,15 +10841,14 @@ static SDValue ExpandCTSELECT(SDNode *N, SelectionDAG &DAG) { SDValue FalseValue = 
N->getOperand(2); SDLoc DL(N); - SDValue TrueValueHi, TrueValueLo; - std::tie(TrueValueHi, TrueValueLo) = DAG.SplitScalar(TrueValue, DL, MVT::i32, MVT::i32); - - SDValue FalseValueHi, FalseValueLo; - std::tie(FalseValueHi, FalseValueLo) = DAG.SplitScalar(FalseValue, DL, MVT::i32, MVT::i32); - - SDValue ResHi = DAG.getNode(ISD::CTSELECT, DL, MVT::i32, {Cond, TrueValueHi, FalseValueHi}); - SDValue ResLo = DAG.getNode(ISD::CTSELECT, DL, MVT::i32, {Cond, TrueValueLo, FalseValueLo}); + SDValue TrueLo, TrueHi, FalseLo, FalseHi; + std::tie(TrueLo, TrueHi) = + DAG.SplitScalar(TrueValue, DL, MVT::i32, MVT::i32); + std::tie(FalseLo, FalseHi) = + DAG.SplitScalar(FalseValue, DL, MVT::i32, MVT::i32); + SDValue ResLo = DAG.getNode(ISD::CTSELECT, DL, MVT::i32, {Cond, TrueLo, FalseLo}); + SDValue ResHi = DAG.getNode(ISD::CTSELECT, DL, MVT::i32, {Cond, TrueHi, FalseHi}); return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, ResLo, ResHi); } diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h index 0e4f66e0e391d..94ec74d62954d 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.h +++ b/llvm/lib/Target/ARM/ARMISelLowering.h @@ -97,6 +97,9 @@ class VectorType; CMOV, // ARM conditional move instructions. + CTSELECT, // ARM constant-time select, implemented with constant-time + // bitwise arithmetic instructions. + SSAT, // Signed saturation USAT, // Unsigned saturation @@ -430,8 +433,12 @@ class VectorType; const char *getTargetNodeName(unsigned Opcode) const override; bool isSelectSupported(SelectSupportKind Kind) const override { - // ARM does not support scalar condition selects on vectors. - return (Kind != ScalarCondVectorVal); + if (Kind == SelectSupportKind::CtSelect) { + return true; + } else { + // ARM does not support scalar condition selects on vectors. 
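+        // CtSelect, by contrast, is always reported as supported, so that a
+        // ct.select is never rewritten into control flow on its way to
+        // instruction selection.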
+ return (Kind != SelectSupportKind::ScalarCondVectorVal); + } } bool isReadOnly(const GlobalValue *GV) const; @@ -1026,6 +1033,9 @@ class VectorType; MachineBasicBlock *MBB) const; MachineBasicBlock *EmitLowered__dbzchk(MachineInstr &MI, MachineBasicBlock *MBB) const; + + MachineBasicBlock *EmitLoweredCtSelect(MachineInstr &MI, + MachineBasicBlock *MBB) const; void addMVEVectorTypes(bool HasMVEFP); void addAllExtLoads(const MVT From, const MVT To, LegalizeAction Action); void setAllExpand(MVT VT); diff --git a/llvm/lib/Target/ARM/ARMInstrInfo.td b/llvm/lib/Target/ARM/ARMInstrInfo.td index 282ff534fc112..00f8853cb1d30 100644 --- a/llvm/lib/Target/ARM/ARMInstrInfo.td +++ b/llvm/lib/Target/ARM/ARMInstrInfo.td @@ -32,6 +32,14 @@ def SDT_ARMSaveCallPC : SDTypeProfile<0, 1, []>; def SDT_ARMcall : SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>; +def SDT_ARMCtSelect : SDTypeProfile<1, 4, [ + /* any */ // result + SDTCisSameAs<1, 0>, // value on false + SDTCisSameAs<2, 0>, // value on true + SDTCisVT<3, CondCodeVT>, // condition code + SDTCisVT<4, FlagsVT>, // in flags +]>; + def SDT_ARMCMov : SDTypeProfile<1, 4, [ /* any */ // result SDTCisSameAs<1, 0>, // value on false @@ -188,6 +196,7 @@ def ARMseretglue : SDNode<"ARMISD::SERET_GLUE", SDTNone, [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; def ARMintretglue : SDNode<"ARMISD::INTRET_GLUE", SDT_ARMcall, [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; +def ARMctselect : SDNode<"ARMISD::CTSELECT", SDT_ARMCtSelect>; def ARMcmov : SDNode<"ARMISD::CMOV", SDT_ARMCMov>; def ARMssat : SDNode<"ARMISD::SSAT", SDTIntSatNoShOp, []>; diff --git a/llvm/test/CodeGen/ARM/ctselect.ll b/llvm/test/CodeGen/ARM/ctselect.ll new file mode 100644 index 0000000000000..5813796c65a52 --- /dev/null +++ b/llvm/test/CodeGen/ARM/ctselect.ll @@ -0,0 +1,263 @@ +; RUN: llc < %s -mtriple=armv7-none-eabi -verify-machineinstrs | FileCheck --check-prefixes=CT %s +; RUN: llc < %s -mtriple=armv6 -mattr=+ctselect -verify-machineinstrs | FileCheck --check-prefix=TEST-CT %s +; RUN: llc < %s -mtriple=armv6 -verify-machineinstrs | FileCheck --check-prefix=DEFAULT %s + +define i8 @ct_int8(i1 %cond, i8 %a, i8 %b) { +; CT-LABEL: ct_int8: +; CT: and +; CT: sub +; CT: rsb +; CT-NEXT: and +; CT-NEXT: and +; CT-NEXT: orr +; CT-NOT: b{{eq|ne|lt|gt|le|ge}} +; CT-NOT: j +; CT-NOT: mov +; CT-NOT: ldr + +; TEST-CT: and +; TEST-CT: sub +; TEST-CT: rsb +; TEST-CT-NEXT: and +; TEST-CT-NEXT: and +; TEST-CT-NEXT: orr +; TEST-CT-NOT: b{{eq|ne|lt|gt|le|ge}} +; TEST-CT-NOT: j +; TEST-CT-NOT: mov + +; DEFAULT: {{mov|ldr}} +entry: + %sel = call i8 @llvm.ct.select.i8(i1 %cond, i8 %a, i8 %b) + ret i8 %sel +} + +define i16 @ct_int16(i1 %cond, i16 %a, i16 %b) { +; CT-LABEL: ct_int16: +; CT: and +; CT: sub +; CT: rsb +; CT-NEXT: and +; CT-NEXT: and +; CT-NEXT: orr +; CT-NOT: b{{eq|ne|lt|gt|le|ge}} +; CT-NOT: j +; CT-NOT: mov +; CT-NOT: ldr + +; TEST-CT: and +; TEST-CT: sub +; TEST-CT: rsb +; TEST-CT-NEXT: and +; TEST-CT-NEXT: and +; TEST-CT-NEXT: orr +; TEST-CT-NOT: b{{eq|ne|lt|gt|le|ge}} +; TEST-CT-NOT: j +; TEST-CT-NOT: mov + +; DEFAULT: {{mov|ldr}} +entry: + %sel = call i16 @llvm.ct.select.i16(i1 %cond, i16 %a, i16 %b) + ret i16 %sel +} + +define i32 @ct_int32(i1 %cond, i32 %a, i32 %b) { +; CT-LABEL: ct_int32: +; CT: and +; CT: sub +; CT: rsb +; CT-NEXT: and +; CT-NEXT: and +; CT-NEXT: orr +; CT-NOT: b{{eq|ne|lt|gt|le|ge}} +; CT-NOT: j +; CT-NOT: mov +; CT-NOT: ldr + +; TEST-CT: and +; TEST-CT: sub +; TEST-CT: rsb +; TEST-CT-NEXT: and +; TEST-CT-NEXT: and +; TEST-CT-NEXT: orr +; TEST-CT-NOT: b{{eq|ne|lt|gt|le|ge}} +; 
TEST-CT-NOT: j +; TEST-CT-NOT: mov + +; DEFAULT: {{mov|ldr}} +entry: + %sel = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) + ret i32 %sel +} + +define i64 @ct_int64(i1 %cond, i64 %a, i64 %b) { +; CT-LABEL: ct_int64: +; CT: sub +; CT: rsb +; CT: and +; CT: and +; CT: and +; CT-NEXT: and +; CT-NEXT: orr +; CT-NOT: b{{eq|ne|lt|gt|le|ge}} +; CT-NOT: j +; CT-NOT: mov +; CT-NOT: ldr + +; TEST-CT-NOT: b{{eq|ne|lt|gt|le|ge}} +; TEST-CT-NOT: j +; TEST-CT-NOT: mov + +; DEFAULT: {{mov|ldr}} +entry: + %sel = call i64 @llvm.ct.select.i64(i1 %cond, i64 %a, i64 %b) + ret i64 %sel +} + +define float @ct_float(i1 %cond, float %a, float %b) { +; CT-LABEL: ct_float: +; CT: and +; CT: sub +; CT: rsb +; CT-NEXT: and +; CT-NEXT: and +; CT-NEXT: orr +; CT-NOT: b{{eq|ne|lt|gt|le|ge}} +; CT-NOT: j +; CT-NOT: mov +; CT-NOT: ldr + +; TEST-CT-NOT: b{{eq|ne|lt|gt|le|ge}} +; TEST-CT-NOT: j +; TEST-CT-NOT: mov + +; DEFAULT: {{mov|ldr}} +entry: + %sel = call float @llvm.ct.select.f32(i1 %cond, float %a, float %b) + ret float %sel +} + +define double @ct_f64(i1 %cond, double %a, double %b) { +; CT-LABEL: ct_f64: +; CT: vand +; CT-NEXT: vldr +; CT-NEXT: vneg +; CT-NEXT: vbsl +; CT-NOT: ldr +; CT-NOT: vldr +; CT-NOT: b{{eq|ne|lt|gt|le|ge}} +; CT-NOT: j + +; TEST-CT-NOT: b{{eq|ne|lt|gt|le|ge}} +; TEST-CT-NOT: j +; TEST-CT-NOT: mov + +; DEFAULT: {{mov|ldr|vldr}} +entry: + %sel = call double @llvm.ct.select.f64(i1 %cond, double %a, double %b) + ret double %sel +} + +define <8 x i8> @ct_v8i8(i1 %cond, <8 x i8> %a, <8 x i8> %b) { +; CT-LABEL: ct_v8i8: +; CT: vand +; CT-NEXT: vldr +; CT-NEXT: vneg +; CT-NEXT: vbsl +; CT-NOT: ldr +; CT-NOT: vldr +; CT-NOT: b{{eq|ne|lt|gt|le|ge}} +; CT-NOT: j + +; TEST-CT-NOT: b{{eq|ne|lt|gt|le|ge}} +; TEST-CT-NOT: j +; TEST-CT-NOT: mov + +; DEFAULT: {{mov|ldr|vldr}} +entry: + %sel = call <8 x i8> @llvm.ct.select.v8i8(i1 %cond, <8 x i8> %a, <8 x i8> %b) + ret <8 x i8> %sel +} + +define <4 x i16> @ct_v4i16(i1 %cond, <4 x i16> %a, <4 x i16> %b) { +; CT-LABEL: ct_v4i16: +; CT: vand +; CT-NEXT: vldr +; CT-NEXT: vneg +; CT-NEXT: vbsl +; CT-NOT: ldr +; CT-NOT: vldr +; CT-NOT: b{{eq|ne|lt|gt|le|ge}} +; CT-NOT: j + +; TEST-CT-NOT: b{{eq|ne|lt|gt|le|ge}} +; TEST-CT-NOT: j +; TEST-CT-NOT: mov + +; DEFAULT: {{mov|ldr|vldr}} +entry: + %sel = call <4 x i16> @llvm.ct.select.v4i16(i1 %cond, <4 x i16> %a, <4 x i16> %b) + ret <4 x i16> %sel +} + +; Technically this should be handled the exact same as double. 
+define <2 x i32> @ct_v2i32(i1 %cond, <2 x i32> %a, <2 x i32> %b) { +; CT-LABEL: ct_v2i32: +; CT: vand +; CT-NEXT: vldr +; CT-NEXT: vneg +; CT-NEXT: vbsl +; CT-NOT: ldr +; CT-NOT: vldr +; CT-NOT: b{{eq|ne|lt|gt|le|ge}} +; CT-NOT: j + +; TEST-CT-NOT: b{{eq|ne|lt|gt|le|ge}} +; TEST-CT-NOT: j +; TEST-CT-NOT: mov + +; DEFAULT: {{mov|ldr|vldr}} +entry: + %sel = call <2 x i32> @llvm.ct.select.v2i32(i1 %cond, <2 x i32> %a, <2 x i32> %b) + ret <2 x i32> %sel +} + +define <2 x float> @ct_v2f32(i1 %cond, <2 x float> %a, <2 x float> %b) { +; CT-LABEL: ct_v2f32: +; CT: vand +; CT-NEXT: vldr +; CT-NEXT: vneg +; CT-NEXT: vbsl +; CT-NOT: ldr +; CT-NOT: vldr +; CT-NOT: b{{eq|ne|lt|gt|le|ge}} +; CT-NOT: j + +; TEST-CT-NOT: b{{eq|ne|lt|gt|le|ge}} +; TEST-CT-NOT: j +; TEST-CT-NOT: mov + +; DEFAULT: {{mov|ldr|vldr}} +entry: + %sel = call <2 x float> @llvm.ct.select.v2f32(i1 %cond, <2 x float> %a, <2 x float> %b) + ret <2 x float> %sel +} + +define <4 x float> @ct_v4f32(i1 %cond, <4 x float> %a, <4 x float> %b) { +; CT-LABEL: ct_v4f32: +; CT: vand +; CT: vldr +; CT: vneg +; CT: vbsl +; CT-NOT: ldr +; CT-NOT: b{{eq|ne|lt|gt|le|ge}} +; CT-NOT: j + +; TEST-CT-NOT: b{{eq|ne|lt|gt|le|ge}} +; TEST-CT-NOT: j +; TEST-CT-NOT: mov + +; DEFAULT: {{mov|ldr|vldr}} +entry: + %sel = call <4 x float> @llvm.ct.select.v4f32(i1 %cond, <4 x float> %a, <4 x float> %b) + ret <4 x float> %sel +} \ No newline at end of file From 82e3c4c910d9bbb248313ba0fa5a3daf576f5f24 Mon Sep 17 00:00:00 2001 From: Francesco Bertolaccini Date: Tue, 5 Aug 2025 15:29:34 +0200 Subject: [PATCH 35/63] [CT] Expand float and integer CTSELECTs (#27) --- llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 31 +++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index cae3c8ab30cfe..3b134220368dc 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -4139,8 +4139,35 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) { Tmp1 = Node->getOperand(0); Tmp2 = Node->getOperand(1); Tmp3 = Node->getOperand(2); - Tmp1 = DAG.getCTSelect(dl, Tmp1.getValueType(), Tmp1, Tmp2, Tmp3); - Tmp1->setFlags(Node->getFlags()); + EVT VT = Tmp2.getValueType(); + if (VT.isVector()) { + SmallVector Elements; + unsigned NumElements = VT.getVectorNumElements(); + EVT ScalarVT = VT.getScalarType(); + for (unsigned Idx = 0; Idx < NumElements; ++Idx) { + SDValue IdxVal = DAG.getConstant(Idx, dl, MVT::i64); + SDValue TVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, Tmp2, IdxVal); + SDValue FVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, Tmp3, IdxVal); + Elements.push_back(DAG.getCTSelect(dl, ScalarVT, Tmp1, TVal, FVal, Node->getFlags())); + } + Tmp1 = DAG.getBuildVector(VT, dl, Elements); + } else if (VT.isFloatingPoint()) { + EVT IntegerVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits()); + Tmp2 = DAG.getBitcast(IntegerVT, Tmp2); + Tmp3 = DAG.getBitcast(IntegerVT, Tmp3); + Tmp1 = DAG.getBitcast(VT, DAG.getCTSelect(dl, IntegerVT, Tmp1, Tmp2, Tmp3, Node->getFlags())); + } else { + assert(VT.isInteger()); + EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext()); + SDValue Tmp2Lo, Tmp2Hi; + SDValue Tmp3Lo, Tmp3Hi; + std::tie(Tmp2Lo, Tmp2Hi) = DAG.SplitScalar(Tmp2, dl, HalfVT, HalfVT); + std::tie(Tmp3Lo, Tmp3Hi) = DAG.SplitScalar(Tmp3, dl, HalfVT, HalfVT); + SDValue ResLo = DAG.getCTSelect(dl, HalfVT, Tmp1, Tmp2Lo, Tmp3Lo, Node->getFlags()); + SDValue ResHi = DAG.getCTSelect(dl, HalfVT, 
Tmp1, Tmp2Hi, Tmp3Hi, Node->getFlags()); + Tmp1 = DAG.getNode(ISD::BUILD_PAIR, dl, VT, ResLo, ResHi); + Tmp1->setFlags(Node->getFlags()); + } Results.push_back(Tmp1); break; } From 3863def640c7c932f7d277ded881f7f7d4090904 Mon Sep 17 00:00:00 2001 From: Kelly Kaoudis Date: Wed, 6 Aug 2025 18:05:56 -0600 Subject: [PATCH 36/63] [CT] initial ARM lowering to pseudo and expansion to bitwise instructions --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 9 - llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 6 +- llvm/lib/Target/ARM/ARMArchitectures.td | 15 +- llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp | 192 +++- llvm/lib/Target/ARM/ARMBaseInstrInfo.h | 2 + llvm/lib/Target/ARM/ARMFeatures.td | 8 - llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp | 86 ++ llvm/lib/Target/ARM/ARMISelLowering.cpp | 210 +++-- llvm/lib/Target/ARM/ARMISelLowering.h | 2 - llvm/lib/Target/ARM/ARMInstrInfo.td | 225 ++++- llvm/lib/Target/X86/X86ISelLowering.cpp | 53 +- llvm/test/CodeGen/ARM/ctselect-half.ll | 477 ++++++++++ llvm/test/CodeGen/ARM/ctselect-vector.ll | 855 ++++++++++++++++++ llvm/test/CodeGen/ARM/ctselect.ll | 371 ++++---- llvm/test/CodeGen/X86/ctselect-edge-cases.ll | 65 +- llvm/test/CodeGen/X86/ctselect-vector.ll | 1 - llvm/test/CodeGen/X86/ctselect.ll | 13 +- 17 files changed, 2192 insertions(+), 398 deletions(-) create mode 100644 llvm/test/CodeGen/ARM/ctselect-half.ll create mode 100644 llvm/test/CodeGen/ARM/ctselect-vector.ll diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 61feb82a585c9..589f85671eddd 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -12550,9 +12550,6 @@ SDValue DAGCombiner::visitCTSELECT(SDNode *N) { SDLoc DL(N); SDNodeFlags Flags = N->getFlags(); - // if (SDValue V = DAG.simplifySelect(N0, N1, N2)) - // return V; - if (SDValue V = foldBoolSelectToLogic(N, DL, DAG)) return V; @@ -12563,9 +12560,6 @@ SDValue DAGCombiner::visitCTSELECT(SDNode *N) { return SelectOp; } - // if (SDValue V = foldSelectOfConstants(N)) - // return V; - if (VT0 == MVT::i1) { // The code in this block deals with the following 2 equivalences: // select(C0|C1, x, y) <=> select(C0, x, select(C1, x, y)) @@ -12647,9 +12641,6 @@ SDValue DAGCombiner::visitCTSELECT(SDNode *N) { } } - if (SDValue R = combineSelectAsExtAnd(N0, N1, N2, DL, DAG)) - return R; - return SDValue(); } diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index 3b134220368dc..54d51aaa15442 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -4159,10 +4159,8 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) { } else { assert(VT.isInteger()); EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext()); - SDValue Tmp2Lo, Tmp2Hi; - SDValue Tmp3Lo, Tmp3Hi; - std::tie(Tmp2Lo, Tmp2Hi) = DAG.SplitScalar(Tmp2, dl, HalfVT, HalfVT); - std::tie(Tmp3Lo, Tmp3Hi) = DAG.SplitScalar(Tmp3, dl, HalfVT, HalfVT); + auto [Tmp2Lo, Tmp2Hi] = DAG.SplitScalar(Tmp2, dl, HalfVT, HalfVT); + auto [Tmp3Lo, Tmp3Hi] = DAG.SplitScalar(Tmp3, dl, HalfVT, HalfVT); SDValue ResLo = DAG.getCTSelect(dl, HalfVT, Tmp1, Tmp2Lo, Tmp3Lo, Node->getFlags()); SDValue ResHi = DAG.getCTSelect(dl, HalfVT, Tmp1, Tmp2Hi, Tmp3Hi, Node->getFlags()); Tmp1 = DAG.getNode(ISD::BUILD_PAIR, dl, VT, ResLo, ResHi); diff --git a/llvm/lib/Target/ARM/ARMArchitectures.td b/llvm/lib/Target/ARM/ARMArchitectures.td index ea32d200fdb83..301ed5bf3e3fc 100644 --- 
a/llvm/lib/Target/ARM/ARMArchitectures.td
+++ b/llvm/lib/Target/ARM/ARMArchitectures.td
@@ -46,8 +46,7 @@ def ARMv7a : Architecture<"armv7-a", "ARMv7a", [HasV7Ops,
   FeatureDB,
   FeatureDSP,
   FeatureAClass,
-  FeaturePerfMon,
-  FeatureCtSelect]>;
+  FeaturePerfMon]>;
 
 def ARMv7ve : Architecture<"armv7ve", "ARMv7ve", [HasV7Ops,
   FeatureNEON,
@@ -57,16 +56,14 @@ def ARMv7ve : Architecture<"armv7ve", "ARMv7ve", [HasV7Ops,
   FeatureMP,
   FeatureVirtualization,
   FeatureAClass,
-  FeaturePerfMon,
-  FeatureCtSelect]>;
+  FeaturePerfMon]>;
 
 def ARMv7r : Architecture<"armv7-r", "ARMv7r", [HasV7Ops,
   FeatureDB,
   FeatureDSP,
   FeatureHWDivThumb,
   FeatureRClass,
-  FeaturePerfMon,
-  FeatureCtSelect]>;
+  FeaturePerfMon]>;
 
 def ARMv7m : Architecture<"armv7-m", "ARMv7m", [HasV7Ops,
   FeatureThumb2,
@@ -74,8 +71,7 @@ def ARMv7m : Architecture<"armv7-m", "ARMv7m", [HasV7Ops,
   ModeThumb,
   FeatureDB,
   FeatureHWDivThumb,
-  FeatureMClass,
-  FeatureCtSelect]>;
+  FeatureMClass]>;
 
 def ARMv7em : Architecture<"armv7e-m", "ARMv7em", [HasV7Ops,
   FeatureThumb2,
@@ -84,8 +80,7 @@ def ARMv7em : Architecture<"armv7e-m", "ARMv7em", [HasV7Ops,
   FeatureDB,
   FeatureHWDivThumb,
   FeatureMClass,
-  FeatureDSP,
-  FeatureCtSelect]>;
+  FeatureDSP]>;
 
 def ARMv8a : Architecture<"armv8-a", "ARMv8a", [HasV8Ops,
   FeatureAClass,
diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
index 22769dbf38719..b1e2c843a0cf2 100644
--- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -1526,18 +1526,206 @@ void ARMBaseInstrInfo::expandMEMCPY(MachineBasicBlock::iterator MI) const {
   BB->erase(MI);
 }
 
+// Expands the ctselect pseudo, post-RA.
+bool ARMBaseInstrInfo::expandCtSelect(MachineInstr &MI) const {
+  MachineBasicBlock *MBB = MI.getParent();
+  DebugLoc DL = MI.getDebugLoc();
+
+  Register DestReg = MI.getOperand(0).getReg();
+  Register MaskReg = MI.getOperand(1).getReg();
+  const TargetRegisterInfo *TRI = &getRegisterInfo();
+  const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(DestReg);
+
+  Register DestRegSavedRef = DestReg;
+  Register VectorMaskReg = 0;
+  Register Src1Reg, Src2Reg, CondReg;
+
+  // These operations will differ by operand register size.
+  unsigned AndOp = ARM::ANDrr;
+  unsigned BicOp = ARM::BICrr;
+  unsigned OrrOp = ARM::ORRrr;
+  unsigned BroadcastOp = ARM::VDUP32d;
+
+  unsigned Opcode = MI.getOpcode();
+  bool IsVector = false;
+
+  if (ARM::QPRRegClass.hasSubClassEq(RC)) {
+    AndOp = ARM::VANDq;
+    BicOp = ARM::VBICq;
+    OrrOp = ARM::VORRq;
+    BroadcastOp = ARM::VDUP32q;
+    IsVector = true;
+  } else if (ARM::DPRRegClass.hasSubClassEq(RC)) {
+    AndOp = ARM::VANDd;
+    BicOp = ARM::VBICd;
+    OrrOp = ARM::VORRd;
+    IsVector = true;
+  }
+
+  // NB: we handle f64 as a vec of two f32s.
+  if (Opcode == ARM::CTSELECTf64) {
+    IsVector = true;
+  }
+
+  bool IsFloat = Opcode == ARM::CTSELECTf32 || Opcode == ARM::CTSELECTf16 ||
+                 Opcode == ARM::CTSELECTbf16;
+  if (IsFloat) {
+    // Each float pseudo has: (outs $dst, $tmp_mask, $scratch1, $scratch2), (ins $src1, $src2, $cond).
+    // We use two scratch registers from TableGen for bitwise ops on float types.
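+    // VMOVRS/VMOVSR are single unconditional register moves, so routing the
+    // FP values through GPRs keeps the whole expansion branch-free.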
+ Register GPRScratch1 = MI.getOperand(2).getReg(); + Register GPRScratch2 = MI.getOperand(3).getReg(); + + // choice a from __builtin_ct_select(cond, a, b) + Src1Reg = MI.getOperand(4).getReg(); + // choice b from __builtin_ct_select(cond, a, b) + Src2Reg = MI.getOperand(5).getReg(); + // cond from __builtin_ct_select(cond, a, b) + CondReg = MI.getOperand(6).getReg(); + + // Move fp src1 to GPR scratch1 so we can do our bitwise ops + BuildMI(*MBB, MI, DL, get(ARM::VMOVRS), GPRScratch1) + .addReg(Src1Reg) + .add(predOps(ARMCC::AL)) + .setMIFlag(MachineInstr::MIFlag::NoMerge); + + // Move src2 to scratch2 + BuildMI(*MBB, MI, DL, get(ARM::VMOVRS), GPRScratch2) + .addReg(Src2Reg) + .add(predOps(ARMCC::AL)) + .setMIFlag(MachineInstr::MIFlag::NoMerge); + + Src1Reg = GPRScratch1; + Src2Reg = GPRScratch2; + // Reuse GPRScratch1 for dest after we are done working with src1. + DestReg = GPRScratch1; + } else if (IsVector) { + // Any vector pseudo has: ((outs $dst, $tmp_mask, $bcast_mask), (ins $src1, $src2, $cond)) + VectorMaskReg = MI.getOperand(2).getReg(); + Src1Reg = MI.getOperand(3).getReg(); + Src2Reg = MI.getOperand(4).getReg(); + CondReg = MI.getOperand(5).getReg(); + } else { + // Any non-float, non-vector pseudo has: (outs $dst, $tmp_mask), (ins $src1, $src2, $cond)) + Src1Reg = MI.getOperand(2).getReg(); + Src2Reg = MI.getOperand(3).getReg(); + CondReg = MI.getOperand(4).getReg(); + } + + // The following sequence of steps yields: (src1 & mask) | (src2 & ~mask) + + // 1. mask = 0 - cond + // When cond = 0: mask = 0x00000000. + // When cond = 1: mask = 0xFFFFFFFF. + BuildMI(*MBB, MI, DL, get(ARM::RSBri), MaskReg) + .addReg(CondReg) + .addImm(0) + .add(predOps(ARMCC::AL)) + .add(condCodeOp()) + .setMIFlag(MachineInstr::MIFlag::NoMerge); + + // 2. A = src1 & mask + if (IsVector) { + // For vectors, broadcast the scalar mask so it matches operand size. + BuildMI(*MBB, MI, DL, get(BroadcastOp), VectorMaskReg) + .addReg(MaskReg) + .add(predOps(ARMCC::AL)) + .setMIFlag(MachineInstr::MIFlag::NoMerge); + + BuildMI(*MBB, MI, DL, get(AndOp), DestReg) + .addReg(Src1Reg) + .addReg(VectorMaskReg) + .add(predOps(ARMCC::AL)) + .setMIFlag(MachineInstr::MIFlag::NoMerge); + } else { + BuildMI(*MBB, MI, DL, get(AndOp), DestReg) + .addReg(Src1Reg) + .addReg(MaskReg) + .add(predOps(ARMCC::AL)) + .add(condCodeOp()) + .setMIFlag(MachineInstr::MIFlag::NoMerge); + } + + // 3. B = src2 & ~mask + if (IsVector) { + BuildMI(*MBB, MI, DL, get(BicOp), VectorMaskReg) + .addReg(Src2Reg) + .addReg(VectorMaskReg) + .add(predOps(ARMCC::AL)) + .setMIFlag(MachineInstr::MIFlag::NoMerge); + } else { + BuildMI(*MBB, MI, DL, get(BicOp), MaskReg) + .addReg(Src2Reg) + .addReg(MaskReg) + .add(predOps(ARMCC::AL)) + .add(condCodeOp()) + .setMIFlag(MachineInstr::MIFlag::NoMerge); + } + + // 4. result = A | B + if (IsVector) { + BuildMI(*MBB, MI, DL, get(OrrOp), DestReg) + .addReg(DestReg) + .addReg(VectorMaskReg) + .add(predOps(ARMCC::AL)) + .setMIFlag(MachineInstr::MIFlag::NoMerge); + } else { + BuildMI(*MBB, MI, DL, get(OrrOp), DestReg) + .addReg(DestReg) + .addReg(MaskReg) + .add(predOps(ARMCC::AL)) + .add(condCodeOp()) + .setMIFlag(MachineInstr::MIFlag::NoMerge); + } + + if (IsFloat) { + // Return our result from GPR to the correct register type. 
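+    // DestReg still aliases GPRScratch1 at this point; DestRegSavedRef names
+    // the original FP destination register we must write back to.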
+ BuildMI(*MBB, MI, DL, get(ARM::VMOVSR), DestRegSavedRef) + .addReg(DestReg) + .add(predOps(ARMCC::AL)) + .setMIFlag(MachineInstr::MIFlag::NoMerge); + } + + MI.eraseFromParent(); + return true; +} + bool ARMBaseInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { - if (MI.getOpcode() == TargetOpcode::LOAD_STACK_GUARD) { + auto opcode = MI.getOpcode(); + + if (opcode == TargetOpcode::LOAD_STACK_GUARD) { expandLoadStackGuard(MI); MI.getParent()->erase(MI); return true; } - if (MI.getOpcode() == ARM::MEMCPY) { + if (opcode == ARM::MEMCPY) { expandMEMCPY(MI); return true; } + if (opcode == ARM::CTSELECTint || + opcode == ARM::CTSELECTf16 || + opcode == ARM::CTSELECTbf16 || + opcode == ARM::CTSELECTf32 || + opcode == ARM::CTSELECTf64 || + opcode == ARM::CTSELECTv8i8 || + opcode == ARM::CTSELECTv4i16 || + opcode == ARM::CTSELECTv2i32 || + opcode == ARM::CTSELECTv1i64 || + opcode == ARM::CTSELECTv2f32 || + opcode == ARM::CTSELECTv4f16 || + opcode == ARM::CTSELECTv4bf16 || + opcode == ARM::CTSELECTv16i8 || + opcode == ARM::CTSELECTv8i16 || + opcode == ARM::CTSELECTv4i32 || + opcode == ARM::CTSELECTv2i64 || + opcode == ARM::CTSELECTv4f32 || + opcode == ARM::CTSELECTv2f64 || + opcode == ARM::CTSELECTv8f16 || + opcode == ARM::CTSELECTv8bf16) { + LLVM_DEBUG(dbgs() << "Opcode " << opcode << "replaced by: " << MI); + return expandCtSelect(MI); + } + // This hook gets to expand COPY instructions before they become // copyPhysReg() calls. Look for VMOVS instructions that can legally be // widened to VMOVD. We prefer the VMOVD when possible because it may be diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h index 2869e7f708046..9fc13d1a8e977 100644 --- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h +++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h @@ -221,6 +221,8 @@ class ARMBaseInstrInfo : public ARMGenInstrInfo { const TargetRegisterInfo *TRI, Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; + bool expandCtSelect(MachineInstr &MI) const; + bool expandPostRAPseudo(MachineInstr &MI) const override; bool shouldSink(const MachineInstr &MI) const override; diff --git a/llvm/lib/Target/ARM/ARMFeatures.td b/llvm/lib/Target/ARM/ARMFeatures.td index ea6baf50ee3e2..9b1fa5d7b99d8 100644 --- a/llvm/lib/Target/ARM/ARMFeatures.td +++ b/llvm/lib/Target/ARM/ARMFeatures.td @@ -753,14 +753,6 @@ def FeatureHardenSlsNoComdat : SubtargetFeature<"harden-sls-nocomdat", "HardenSlsNoComdat", "true", "Generate thunk code for SLS mitigation in the normal text section">; -//===----------------------------------------------------------------------===// -// Support for constant-time coding for branch timing mitigation. -//===----------------------------------------------------------------------===// - -// Constant-time selection is ARMv7-A only, for the moment! -def FeatureCtSelect : SubtargetFeature<"ctselect", "HasCtSelect", "true", - "Enable feature to implement constant-time select">; - //===----------------------------------------------------------------------===// // Endianness of instruction encodings in memory. // diff --git a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp index 847b7af5a9b11..62f5b21a738dd 100644 --- a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp +++ b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp @@ -4200,6 +4200,92 @@ void ARMDAGToDAGISel::Select(SDNode *N) { // Other cases are autogenerated. 
break; } + case ARMISD::CTSELECT: { + EVT VT = N->getValueType(0); + unsigned PseudoOpcode; + bool IsFloat = false; + bool IsVector = false; + + if (VT == MVT::f16) { + PseudoOpcode = ARM::CTSELECTf16; + IsFloat = true; + } else if (VT == MVT::bf16) { + PseudoOpcode = ARM::CTSELECTbf16; + IsFloat = true; + } else if (VT == MVT::f32) { + PseudoOpcode = ARM::CTSELECTf32; + IsFloat = true; + } else if (VT == MVT::f64) { + PseudoOpcode = ARM::CTSELECTf64; + IsVector = true; + } else if (VT == MVT::v8i8) { + PseudoOpcode = ARM::CTSELECTv8i8; + IsVector = true; + } else if (VT == MVT::v4i16) { + PseudoOpcode = ARM::CTSELECTv4i16; + IsVector = true; + } else if (VT == MVT::v2i32) { + PseudoOpcode = ARM::CTSELECTv2i32; + IsVector = true; + } else if (VT == MVT::v1i64) { + PseudoOpcode = ARM::CTSELECTv1i64; + IsVector = true; + } else if (VT == MVT::v2f32) { + PseudoOpcode = ARM::CTSELECTv2f32; + IsVector = true; + } else if (VT == MVT::v4f16) { + PseudoOpcode = ARM::CTSELECTv4f16; + IsVector = true; + } else if (VT == MVT::v4bf16) { + PseudoOpcode = ARM::CTSELECTv4bf16; + IsVector = true; + } else if (VT == MVT::v16i8) { + PseudoOpcode = ARM::CTSELECTv16i8; + IsVector = true; + } else if (VT == MVT::v8i16) { + PseudoOpcode = ARM::CTSELECTv8i16; + IsVector = true; + } else if (VT == MVT::v4i32) { + PseudoOpcode = ARM::CTSELECTv4i32; + IsVector = true; + } else if (VT == MVT::v2i64) { + PseudoOpcode = ARM::CTSELECTv2i64; + IsVector = true; + } else if (VT == MVT::v4f32) { + PseudoOpcode = ARM::CTSELECTv4f32; + IsVector = true; + } else if (VT == MVT::v2f64) { + PseudoOpcode = ARM::CTSELECTv2f64; + IsVector = true; + } else if (VT == MVT::v8f16) { + PseudoOpcode = ARM::CTSELECTv8f16; + IsVector = true; + } else if (VT == MVT::v8bf16) { + PseudoOpcode = ARM::CTSELECTv8bf16; + IsVector = true; + } else { + // i1, i8, i16, i32, i64 + PseudoOpcode = ARM::CTSELECTint; + } + + SmallVector VTs; + VTs.push_back(VT); // $dst + VTs.push_back(MVT::i32); // $tmp_mask (always GPR) + + if (IsVector) { + VTs.push_back(VT); // $bcast_mask (same type as dst for vectors) + } else if (IsFloat) { + VTs.push_back(MVT::i32); // $scratch1 (GPR) + VTs.push_back(MVT::i32); // $scratch2 (GPR) + } + + // src1, src2, cond + SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2) }; + + SDNode *ResNode = CurDAG->getMachineNode(PseudoOpcode, SDLoc(N), VTs, Ops); + ReplaceNode(N, ResNode); + return; + } case ARMISD::VZIP: { EVT VT = N->getValueType(0); // vzip.32 Dd, Dm is a pseudo-instruction expanded to vtrn.32 Dd, Dm. 
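The pseudos selected above are expanded after register allocation (see expandCtSelect in ARMBaseInstrInfo.cpp) into a branch-free mask sequence. As a reference model only, not part of the patch, the scalar expansion computes the equivalent of the following C (the helper name is invented for illustration):

    #include <stdint.h>

    /* Branch-free select: returns a when the low bit of cond is 1, else b.
       mask is 0x00000000 or 0xFFFFFFFF, mirroring the RSB/AND/BIC/ORR
       sequence the pseudo expands to. */
    static inline uint32_t ct_select_u32(uint32_t cond, uint32_t a,
                                         uint32_t b) {
      uint32_t mask = 0u - (cond & 1u); /* all-ones iff bit 0 of cond set */
      return (a & mask) | (b & ~mask);  /* no branch, no data-dependent time */
    }

Unlike a conditional branch, nothing in this sequence varies its execution time with cond, which is the property the pseudo-instruction approach is designed to preserve.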
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index b1d074558bf46..c5729aa990bf6 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -411,6 +411,28 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) { setOperationAction(ISD::VECREDUCE_FMIN, MVT::v2f16, Custom); setOperationAction(ISD::VECREDUCE_FMAX, MVT::v2f16, Custom); + if (Subtarget->hasFullFP16()) { + setOperationAction(ISD::CTSELECT, MVT::v4f16, Custom); + setOperationAction(ISD::CTSELECT, MVT::v8f16, Custom); + } + + if (Subtarget->hasBF16()) { + setOperationAction(ISD::CTSELECT, MVT::v4bf16, Custom); + setOperationAction(ISD::CTSELECT, MVT::v8bf16, Custom); + } + + // small exotic vectors get scalarised for ctselect + setOperationAction(ISD::CTSELECT, MVT::v1i8, Expand); + setOperationAction(ISD::CTSELECT, MVT::v1i16, Expand); + setOperationAction(ISD::CTSELECT, MVT::v1i32, Expand); + setOperationAction(ISD::CTSELECT, MVT::v1f32, Expand); + setOperationAction(ISD::CTSELECT, MVT::v2i8, Expand); + + setOperationAction(ISD::CTSELECT, MVT::v2i16, Promote); + setOperationPromotedToType(ISD::CTSELECT, MVT::v2i16, MVT::v4i16); + setOperationAction(ISD::CTSELECT, MVT::v4i8, Promote); + setOperationPromotedToType(ISD::CTSELECT, MVT::v4i8, MVT::v8i8); + // We 'support' these types up to bitcast/load/store level, regardless of // MVE integer-only / float support. Only doing FP data processing on the FP // vector types is inhibited at integer-only level. @@ -1244,10 +1266,16 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_, setOperationAction(ISD::SELECT_CC, MVT::f64, Custom); setOperationAction(ISD::CTSELECT, MVT::i8, Promote); setOperationAction(ISD::CTSELECT, MVT::i16, Promote); + setOperationPromotedToType(ISD::CTSELECT, MVT::i16, MVT::i32); + setOperationAction(ISD::CTSELECT, MVT::i32, Custom); setOperationAction(ISD::CTSELECT, MVT::i64, Expand); setOperationAction(ISD::CTSELECT, MVT::f32, Custom); setOperationAction(ISD::CTSELECT, MVT::f64, Custom); + + // Handle f16 and bf16 without falling back to select from ctselect. 
+ setTargetDAGCombine({ISD::CTSELECT}); + if (Subtarget->hasFullFP16()) { setOperationAction(ISD::SETCC, MVT::f16, Expand); setOperationAction(ISD::SELECT, MVT::f16, Custom); @@ -1255,6 +1283,10 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_, setOperationAction(ISD::CTSELECT, MVT::f16, Custom); } + if (Subtarget->hasBF16()) { + setOperationAction(ISD::CTSELECT, MVT::bf16, Custom); + } + setOperationAction(ISD::SETCCCARRY, MVT::i32, Custom); setOperationAction(ISD::BRCOND, MVT::Other, Custom); @@ -5116,74 +5148,18 @@ SDValue ARMTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { SelectTrue, SelectFalse, ISD::SETNE); } -SDValue BuildCtSelectMask(SDValue Cond, EVT MaskVT, SDLoc DL, SelectionDAG &DAG) { - Cond = DAG.getNode(ISD::AND, DL, MaskVT, Cond, DAG.getConstant(1, DL, MaskVT)); - SDValue Zero = DAG.getConstant(0, DL, MaskVT); - return DAG.getNode(ISD::SUB, DL, MaskVT, Zero, Cond); // mask = -cond -} - SDValue ARMTargetLowering::LowerCTSELECT(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); + SDValue Cond = Op.getOperand(0); - SDValue SelectTrue = Op.getOperand(1); - SDValue SelectFalse = Op.getOperand(2); + SDValue TrueVal = Op.getOperand(1); + SDValue FalseVal = Op.getOperand(2); EVT VT = Op.getValueType(); - const ARMSubtarget &Subtarget = DAG.getSubtarget(); - if (!Subtarget.hasCtSelect()) { - return DAG.getNode(ISD::SELECT, DL, VT, Cond, SelectTrue, SelectFalse); - } - - EVT ElemVT = VT; - EVT MaskVT = VT; - - if (!VT.isVector()) { - if (VT == MVT::f64) { - // Use <2 x i32> vector mask for scalar f64 - ElemVT = EVT::getIntegerVT(*DAG.getContext(), 32); - MaskVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, 2); - } else { - // float masks as i32 - ElemVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits()); - MaskVT = ElemVT; - } - } else { - ElemVT = EVT::getIntegerVT(*DAG.getContext(), VT.getScalarSizeInBits()); - MaskVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, VT.getVectorNumElements()); - } - - const EVT CondVT = Cond.getValueType(); - if (MaskVT.isVector() && !CondVT.isVector()) { - unsigned CondBits = CondVT.getSizeInBits(); - unsigned ElemBits = ElemVT.getSizeInBits(); - - if (CondBits < ElemBits) { - Cond = DAG.getZExtOrTrunc(Cond, DL, ElemVT); - } - - Cond = DAG.getSplatBuildVector(MaskVT, DL, Cond); - } else if (CondVT != MaskVT) { - Cond = DAG.getZExtOrTrunc(Cond, DL, MaskVT); - } - - if (VT.isFloatingPoint()) { - SelectTrue = DAG.getBitcast(MaskVT, SelectTrue); - SelectFalse = DAG.getBitcast(MaskVT, SelectFalse); - } - - SDValue Mask = BuildCtSelectMask(Cond, MaskVT, DL, DAG); - SDValue TrueMasked = DAG.getNode(ISD::AND, DL, MaskVT, SelectTrue, Mask); - - SDValue MaskNeg = DAG.getNOT(DL, Mask, MaskVT); - SDValue FalseMasked = DAG.getNode(ISD::AND, DL, MaskVT, SelectFalse, MaskNeg); - - SDValue ResultInt = DAG.getNode(ISD::OR, DL, MaskVT, TrueMasked, FalseMasked); - - if (VT.isFloatingPoint()) { - return DAG.getBitcast(VT, ResultInt); - } - - return ResultInt; + // Normalise the condition to 0 or 1. 
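+  // Only bit 0 of the incoming i1 is meaningful; clearing the upper bits
+  // here keeps any stray bits from corrupting the -cond mask built later.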
+ SDValue One = DAG.getConstant(1, DL, MVT::i32); + SDValue CondNode = DAG.getNode(ISD::AND, DL, MVT::i32, Cond, One); + return DAG.getNode(ARMISD::CTSELECT, DL, VT, TrueVal, FalseVal, CondNode); } static void checkVSELConstraints(ISD::CondCode CC, ARMCC::CondCodes &CondCode, @@ -10835,23 +10811,6 @@ static void ReplaceLongIntrinsic(SDNode *N, SmallVectorImpl &Results, LongMul.getValue(0), LongMul.getValue(1))); } -static SDValue ExpandCTSELECT(SDNode *N, SelectionDAG &DAG) { - SDValue Cond = N->getOperand(0); - SDValue TrueValue = N->getOperand(1); - SDValue FalseValue = N->getOperand(2); - SDLoc DL(N); - - SDValue TrueLo, TrueHi, FalseLo, FalseHi; - std::tie(TrueLo, TrueHi) = - DAG.SplitScalar(TrueValue, DL, MVT::i32, MVT::i32); - std::tie(FalseLo, FalseHi) = - DAG.SplitScalar(FalseValue, DL, MVT::i32, MVT::i32); - - SDValue ResLo = DAG.getNode(ISD::CTSELECT, DL, MVT::i32, {Cond, TrueLo, FalseLo}); - SDValue ResHi = DAG.getNode(ISD::CTSELECT, DL, MVT::i32, {Cond, TrueHi, FalseHi}); - return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, ResLo, ResHi); -} - /// ReplaceNodeResults - Replace the results of node with an illegal result /// type with new values built out of custom code. void ARMTargetLowering::ReplaceNodeResults(SDNode *N, @@ -10916,9 +10875,36 @@ void ARMTargetLowering::ReplaceNodeResults(SDNode *N, case ISD::FP_TO_UINT_SAT: Res = LowerFP_TO_INT_SAT(SDValue(N, 0), DAG, Subtarget); break; - case ISD::CTSELECT: - Res = ExpandCTSELECT(N, DAG); - break; + case ISD::CTSELECT: { + EVT VT = N->getValueType(0); + + // Handle f16/bf16 type promotion while preserving ctselect + if (VT == MVT::f16 || VT == MVT::bf16) { + SDLoc DL(N); + SDValue Cond = N->getOperand(0); + SDValue TrueVal = N->getOperand(1); + SDValue FalseVal = N->getOperand(2); + + // Bitcast to i16, then promote to i32 + SDValue TrueInt = DAG.getBitcast(MVT::i16, TrueVal); + SDValue FalseInt = DAG.getBitcast(MVT::i16, FalseVal); + + TrueInt = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, TrueInt); + FalseInt = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, FalseInt); + + // Normalize condition + SDValue One = DAG.getConstant(1, DL, MVT::i32); + SDValue CondNorm = DAG.getNode(ISD::AND, DL, MVT::i32, Cond, One); + + // Create i32 ctselect that will go through normal lowering + Res = DAG.getNode(ISD::CTSELECT, DL, MVT::i32, + CondNorm, TrueInt, FalseInt); + } else { + // For other types, use existing lowering + Res = LowerCTSELECT(SDValue(N, 0), DAG); + } + break; + } } if (Res.getNode()) Results.push_back(Res); @@ -13475,6 +13461,63 @@ static SDValue PerformVQDMULHCombine(SDNode *N, SelectionDAG &DAG) { DAG.getNode(ISD::CONCAT_VECTORS, DL, VecVT, Parts)); } +static SDValue PerformCTSELECTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, + const ARMSubtarget *Subtarget) { + if (!DCI.isBeforeLegalize()) { + return SDValue(); + } + + SelectionDAG &DAG = DCI.DAG; + SDLoc DL(N); + + EVT VT = N->getValueType(0); + if (VT == MVT::f16 || VT == MVT::bf16) { + SDValue Cond = N->getOperand(0); + SDValue TrueVal = N->getOperand(1); + SDValue FalseVal = N->getOperand(2); + + SDValue TrueInt = DAG.getBitcast(MVT::i16, TrueVal); + SDValue FalseInt = DAG.getBitcast(MVT::i16, FalseVal); + + // Create i16 ctselect - this will be promoted to i32 ctselect naturally + SDValue Result = DAG.getNode(ISD::CTSELECT, DL, MVT::i16, + Cond, TrueInt, FalseInt); + + return DAG.getBitcast(VT, Result); + } else if (VT.isVector()) { + EVT EltVT = VT.getVectorElementType(); + if (EltVT == MVT::f16 || EltVT == MVT::bf16) { + SDValue Cond = N->getOperand(0); + 
SDValue TrueVal = N->getOperand(1);
+      SDValue FalseVal = N->getOperand(2);
+
+      EVT IntVT;
+      switch (VT.getSimpleVT().SimpleTy) {
+      case MVT::v4f16:
+      case MVT::v4bf16:
+        IntVT = MVT::v4i16;
+        break;
+      case MVT::v8f16:
+      case MVT::v8bf16:
+        IntVT = MVT::v8i16;
+        break;
+      default:
+        return SDValue(); // Unsupported vector type
+      }
+
+      SDValue TrueInt = DAG.getBitcast(IntVT, TrueVal);
+      SDValue FalseInt = DAG.getBitcast(IntVT, FalseVal);
+
+      SDValue Result = DAG.getNode(ISD::CTSELECT, DL, IntVT,
+                                   Cond, TrueInt, FalseInt);
+
+      return DAG.getBitcast(VT, Result);
+    }
+  }
+
+  return SDValue();
+}
+
 static SDValue PerformVSELECTCombine(SDNode *N,
                                      TargetLowering::DAGCombinerInfo &DCI,
                                      const ARMSubtarget *Subtarget) {
@@ -18978,6 +19021,7 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
   case ISD::SELECT_CC:
   case ISD::SELECT: return PerformSELECTCombine(N, DCI, Subtarget);
   case ISD::VSELECT: return PerformVSELECTCombine(N, DCI, Subtarget);
+  case ISD::CTSELECT: return PerformCTSELECTCombine(N, DCI, Subtarget);
   case ISD::SETCC: return PerformVSetCCToVCTPCombine(N, DCI, Subtarget);
   case ARMISD::ADDE: return PerformADDECombine(N, DCI, Subtarget);
   case ARMISD::UMLAL: return PerformUMLALCombine(N, DCI.DAG, Subtarget);
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h
index 94ec74d62954d..5ca1769087873 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.h
+++ b/llvm/lib/Target/ARM/ARMISelLowering.h
@@ -1034,8 +1034,6 @@ class VectorType;
     MachineBasicBlock *EmitLowered__dbzchk(MachineInstr &MI,
                                            MachineBasicBlock *MBB) const;
-    MachineBasicBlock *EmitLoweredCtSelect(MachineInstr &MI,
-                                           MachineBasicBlock *MBB) const;
     void addMVEVectorTypes(bool HasMVEFP);
     void addAllExtLoads(const MVT From, const MVT To, LegalizeAction Action);
     void setAllExpand(MVT VT);
diff --git a/llvm/lib/Target/ARM/ARMInstrInfo.td b/llvm/lib/Target/ARM/ARMInstrInfo.td
index 00f8853cb1d30..b8597f97b43df 100644
--- a/llvm/lib/Target/ARM/ARMInstrInfo.td
+++ b/llvm/lib/Target/ARM/ARMInstrInfo.td
@@ -32,12 +32,11 @@ def SDT_ARMSaveCallPC : SDTypeProfile<0, 1, []>;
 
 def SDT_ARMcall : SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>;
 
-def SDT_ARMCtSelect : SDTypeProfile<1, 4, [
+def SDT_ARMCtSelect : SDTypeProfile<1, 3, [
   /* any */                // result
   SDTCisSameAs<1, 0>,      // value on true
   SDTCisSameAs<2, 0>,      // value on false
-  SDTCisVT<3, CondCodeVT>, // condition code
-  SDTCisVT<4, FlagsVT>,    // in flags
+  SDTCisVT<3, i32>         // cond
 ]>;
 
 def SDT_ARMCMov : SDTypeProfile<1, 4, [
@@ -5117,6 +5116,226 @@ def : ARMPat<(ARMcmov i32:$false, mod_imm_not:$imm, imm:$cc, CPSR),
 def : ARMV6T2Pat<(ARMcmov i32:$false, imm:$src, imm:$cc, CPSR),
                  (MOVCCi32imm $false, imm:$src, imm:$cc, CPSR)>;
 
+//===----------------------------------------------------------------------===//
+// Constant-time selection pseudoinstructions.
+// We lower these pseudos in a machine pass, as appropriate for the subtarget,
+// so that backend optimizations cannot invalidate the constant-time guarantee
+// made to the source programmer through node merging or other rewrites that
+// would yield machine code that does not run in constant time.
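+// Post-RA, ARMBaseInstrInfo::expandCtSelect rewrites each of these pseudos
+// into (roughly, for the scalar case):
+//   rsb  mask, cond, #0     ; mask = 0x00000000 or 0xFFFFFFFF
+//   and  dst,  src1, mask
+//   bic  mask, src2, mask   ; mask = src2 & ~mask
+//   orr  dst,  dst,  mask   ; dst = (src1 & mask) | (src2 & ~mask)
+// with VDUP/VAND/VBIC/VORR taking the place of the scalar ops for the
+// vector pseudos.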
+let isNotDuplicable = 1, + isPseudo = 1, + hasNoSchedulingInfo = 1 in { + + // i1, i8, i16, i32, i64 + def CTSELECTint : ARMPseudoInst< + (outs GPR:$dst, GPR:$tmp_mask), + (ins GPR:$src1, GPR:$src2, GPR:$cond), + 4, + NoItinerary, + [] + > { + let Constraints = "@earlyclobber $dst,@earlyclobber $tmp_mask"; + } + + def CTSELECTf16 : ARMPseudoInst< + (outs HPR:$dst, GPR:$tmp_mask, GPR:$scratch1, GPR:$scratch2), + (ins HPR:$src1, HPR:$src2, GPR:$cond), + 4, + NoItinerary, + [] + > { + let Constraints = "@earlyclobber $dst,@earlyclobber $tmp_mask,@earlyclobber $scratch1,@earlyclobber $scratch2"; + } + + def CTSELECTbf16 : ARMPseudoInst< + (outs HPR:$dst, GPR:$tmp_mask, GPR:$scratch1, GPR:$scratch2), + (ins HPR:$src1, HPR:$src2, GPR:$cond), + 4, + NoItinerary, + [] + > { + let Constraints = "@earlyclobber $dst,@earlyclobber $tmp_mask,@earlyclobber $scratch1,@earlyclobber $scratch2"; + } + + def CTSELECTf32 : ARMPseudoInst< + (outs SPR:$dst, GPR:$tmp_mask, GPR:$scratch1, GPR:$scratch2), + (ins SPR:$src1, SPR:$src2, GPR:$cond), + 4, + NoItinerary, + [] + > { + let Constraints = "@earlyclobber $dst,@earlyclobber $tmp_mask,@earlyclobber $scratch1,@earlyclobber $scratch2"; + } + + let Predicates = [HasDPVFP] in { + def CTSELECTf64 : ARMPseudoInst< + (outs DPR:$dst, GPR:$tmp_mask, DPR:$bcast_mask), + (ins DPR:$src1, DPR:$src2, GPR:$cond), + 4, + NoItinerary, + [] + > { + let Constraints = "@earlyclobber $dst,@earlyclobber $tmp_mask,@earlyclobber $bcast_mask"; + } + } + + let Predicates = [HasNEON] in { + // DPR + def CTSELECTv8i8 : ARMPseudoInst< + (outs DPR:$dst, GPR:$tmp_mask, DPR:$bcast_mask), + (ins DPR:$src1, DPR:$src2, GPR:$cond), + 4, + NoItinerary, + [] + > { + let Constraints = "@earlyclobber $dst,@earlyclobber $tmp_mask,@earlyclobber $bcast_mask"; + } + + def CTSELECTv4i16 : ARMPseudoInst< + (outs DPR:$dst, GPR:$tmp_mask, DPR:$bcast_mask), + (ins DPR:$src1, DPR:$src2, GPR:$cond), + 4, + NoItinerary, + [] + > { + let Constraints = "@earlyclobber $dst,@earlyclobber $tmp_mask,@earlyclobber $bcast_mask"; + } + + def CTSELECTv2i32 : ARMPseudoInst< + (outs DPR:$dst, GPR:$tmp_mask, DPR:$bcast_mask), + (ins DPR:$src1, DPR:$src2, GPR:$cond), + 4, + NoItinerary, + [] + > { + let Constraints = "@earlyclobber $dst,@earlyclobber $tmp_mask,@earlyclobber $bcast_mask"; + } + + def CTSELECTv1i64 : ARMPseudoInst< + (outs DPR:$dst, GPR:$tmp_mask, DPR:$bcast_mask), + (ins DPR:$src1, DPR:$src2, GPR:$cond), + 4, + NoItinerary, + [] + > { + let Constraints = "@earlyclobber $dst,@earlyclobber $tmp_mask,@earlyclobber $bcast_mask"; + } + + def CTSELECTv2f32 : ARMPseudoInst< + (outs DPR:$dst, GPR:$tmp_mask, DPR:$bcast_mask), + (ins DPR:$src1, DPR:$src2, GPR:$cond), + 4, + NoItinerary, + [] + > { + let Constraints = "@earlyclobber $dst,@earlyclobber $tmp_mask,@earlyclobber $bcast_mask"; + } + + def CTSELECTv4f16 : ARMPseudoInst< + (outs DPR:$dst, GPR:$tmp_mask, DPR:$bcast_mask), + (ins DPR:$src1, DPR:$src2, GPR:$cond), + 4, + NoItinerary, + [] + > { + let Constraints = "@earlyclobber $dst,@earlyclobber $tmp_mask,@earlyclobber $bcast_mask"; + } + + def CTSELECTv4bf16 : ARMPseudoInst< + (outs DPR:$dst, GPR:$tmp_mask, DPR:$bcast_mask), + (ins DPR:$src1, DPR:$src2, GPR:$cond), + 4, + NoItinerary, + [] + > { + let Constraints = "@earlyclobber $dst,@earlyclobber $tmp_mask,@earlyclobber $bcast_mask"; + } + + + // QPR + def CTSELECTv16i8 : ARMPseudoInst< + (outs QPR:$dst, GPR:$tmp_mask, QPR:$bcast_mask), + (ins QPR:$src1, QPR:$src2, GPR:$cond), + 4, + NoItinerary, + [] + > { + let Constraints = "@earlyclobber 
$dst,@earlyclobber $tmp_mask,@earlyclobber $bcast_mask"; + } + + def CTSELECTv8i16 : ARMPseudoInst< + (outs QPR:$dst, GPR:$tmp_mask, QPR:$bcast_mask), + (ins QPR:$src1, QPR:$src2, GPR:$cond), + 4, + NoItinerary, + [] + > { + let Constraints = "@earlyclobber $dst,@earlyclobber $tmp_mask,@earlyclobber $bcast_mask"; + } + + def CTSELECTv4i32 : ARMPseudoInst< + (outs QPR:$dst, GPR:$tmp_mask, QPR:$bcast_mask), + (ins QPR:$src1, QPR:$src2, GPR:$cond), + 4, + NoItinerary, + [] + > { + let Constraints = "@earlyclobber $dst,@earlyclobber $tmp_mask,@earlyclobber $bcast_mask"; + } + + def CTSELECTv2i64 : ARMPseudoInst< + (outs QPR:$dst, GPR:$tmp_mask, QPR:$bcast_mask), + (ins QPR:$src1, QPR:$src2, GPR:$cond), + 4, + NoItinerary, + [] + > { + let Constraints = "@earlyclobber $dst,@earlyclobber $tmp_mask,@earlyclobber $bcast_mask"; + } + + def CTSELECTv4f32 : ARMPseudoInst< + (outs QPR:$dst, GPR:$tmp_mask, QPR:$bcast_mask), + (ins QPR:$src1, QPR:$src2, GPR:$cond), + 4, + NoItinerary, + [] + > { + let Constraints = "@earlyclobber $dst,@earlyclobber $tmp_mask,@earlyclobber $bcast_mask"; + } + + def CTSELECTv2f64 : ARMPseudoInst< + (outs QPR:$dst, GPR:$tmp_mask, QPR:$bcast_mask), + (ins QPR:$src1, QPR:$src2, GPR:$cond), + 4, + NoItinerary, + [] + > { + let Constraints = "@earlyclobber $dst,@earlyclobber $tmp_mask,@earlyclobber $bcast_mask"; + } + + def CTSELECTv8f16 : ARMPseudoInst< + (outs QPR:$dst, GPR:$tmp_mask, QPR:$bcast_mask), + (ins QPR:$src1, QPR:$src2, GPR:$cond), + 4, + NoItinerary, + [] + > { + let Constraints = "@earlyclobber $dst,@earlyclobber $tmp_mask,@earlyclobber $bcast_mask"; + } + + def CTSELECTv8bf16 : ARMPseudoInst< + (outs QPR:$dst, GPR:$tmp_mask, QPR:$bcast_mask), + (ins QPR:$src1, QPR:$src2, GPR:$cond), + 4, + NoItinerary, + [] + > { + let Constraints = "@earlyclobber $dst,@earlyclobber $tmp_mask,@earlyclobber $bcast_mask"; + } + + } +} + //===----------------------------------------------------------------------===// // Atomic operations intrinsics // diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 9f400a970793c..91b5ed5c3f8b6 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -2567,6 +2567,22 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, addRegisterClass(MVT::x86amx, &X86::TILERegClass); } + // Handle 512-bit vector CTSELECT without AVX512 by setting them to Expand + // This allows type legalization to split them into smaller vectors + for (auto VT : {MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64, MVT::v32f16, + MVT::v16f32, MVT::v8f64}) { + setOperationAction(ISD::CTSELECT, VT, Expand); + } + + // Handle 256-bit vector CTSELECT without AVX by setting them to Expand + // This allows type legalization to split them into 128-bit vectors + if (!Subtarget.hasAVX()) { + for (auto VT : {MVT::v4f64, MVT::v4i64, MVT::v8i32, MVT::v16i16, + MVT::v16f16, MVT::v32i8, MVT::v8f32}) { + setOperationAction(ISD::CTSELECT, VT, Expand); + } + } + // We want to custom lower some of our intrinsics. 
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
 
@@ -25380,27 +25396,18 @@ SDValue X86TargetLowering::LowerCTSELECT(SDValue Op, SelectionDAG &DAG) const {
     unsigned VectorWidth = VT.getSizeInBits();
     MVT EltVT = VT.getVectorElementType();
 
-    // Check if we have the necessary SIMD support
-    bool HasSSE = Subtarget.hasSSE1();
-    bool HasAVX = Subtarget.hasAVX();
-    bool HasAVX512 = Subtarget.hasAVX512();
-
-    // For 512-bit vectors, we need AVX512
-    if (VectorWidth == 512 && !HasAVX512)
-      return SDValue();
+    // 512-bit vectors without AVX512, and 256-bit vectors without AVX, are
+    // now handled by type legalization: the Expand action set above splits
+    // them into smaller legal vectors.
 
-    // For 256-bit vectors, we need at least AVX
-    if (VectorWidth == 256 && !HasAVX)
-      return SDValue();
-
-    // For 128-bit vectors, we need at least SSE
-    if (VectorWidth == 128 && !HasSSE)
+    if (VectorWidth == 128 && !Subtarget.hasSSE1())
       return SDValue();
 
     // Handle special cases for floating point vectors
     if (EltVT.isFloatingPoint()) {
       // For vector floating point with AVX, use VBLENDV-style operations
-      if (HasAVX && (VectorWidth == 256 || VectorWidth == 128)) {
+      if (Subtarget.hasAVX() && (VectorWidth == 256 || VectorWidth == 128)) {
         // Convert to bitwise operations using the condition
         MVT IntVT = VT.changeVectorElementTypeToInteger();
         SDValue IntOp1 = DAG.getBitcast(IntVT, TrueOp);
@@ -25441,13 +25448,7 @@ SDValue X86TargetLowering::LowerCTSELECT(SDValue Op, SelectionDAG &DAG) const {
       CC = Cond.getOperand(0);
       SDValue Cmp = Cond.getOperand(1);
 
-      bool IllegalFPCMov = false;
-      if (VT.isFloatingPoint() && !VT.isVector() &&
-          !isScalarFPTypeInSSEReg(VT) && Subtarget.canUseCMOV())
-        IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
-
-      if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
-          Cmp.getOpcode() == X86ISD::BT) {
+      if (isX86LogicalCmp(Cmp) || Cmp.getOpcode() == X86ISD::BT) {
         Cond = Cmp;
         AddTest = false;
       }
@@ -25496,9 +25497,9 @@ SDValue X86TargetLowering::LowerCTSELECT(SDValue Op, SelectionDAG &DAG) const {
     if (T1.getValueType() == T2.getValueType() &&
         T1.getOpcode() != ISD::CopyFromReg &&
        T2.getOpcode() != ISD::CopyFromReg) {
-      SDValue Cmov = DAG.getNode(X86ISD::CTSELECT, DL, T1.getValueType(), T2,
-                                 T1, CC, ProcessedCond);
-      return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
+      SDValue CtSelect = DAG.getNode(X86ISD::CTSELECT, DL, T1.getValueType(),
+                                     T2, T1, CC, ProcessedCond);
+      return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), CtSelect);
     }
   }
 
@@ -25509,8 +25510,8 @@ SDValue X86TargetLowering::LowerCTSELECT(SDValue Op, SelectionDAG &DAG) const {
     TrueOp = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, TrueOp);
     FalseOp = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, FalseOp);
     SDValue Ops[] = {FalseOp, TrueOp, CC, ProcessedCond};
-    SDValue Cmov = DAG.getNode(X86ISD::CTSELECT, DL, MVT::i32, Ops);
-    return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
+    SDValue CtSelect = DAG.getNode(X86ISD::CTSELECT, DL, MVT::i32, Ops);
+    return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), CtSelect);
   }
 
   if (isScalarFPTypeInSSEReg(VT)) {
diff --git a/llvm/test/CodeGen/ARM/ctselect-half.ll b/llvm/test/CodeGen/ARM/ctselect-half.ll
new file mode 100644
index 0000000000000..0f1b4a4b14ac1
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/ctselect-half.ll
@@ -0,0 +1,477 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=armv7-none-eabi -verify-machineinstrs | 
FileCheck --check-prefixes=CT %s +; RUN: llc < %s -mtriple=armv8.6a-none-eabi -verify-machineinstrs | FileCheck --check-prefixes=BFLOAT-F16-NATIVE %s +; RUN: llc < %s -mtriple=armv8.2a-none-eabi -verify-machineinstrs | FileCheck --check-prefixes=F16-NATIVE %s + +define half @ct_half(i1 %cond, half %a, half %b) { +; CT-LABEL: ct_half: +; CT: @ %bb.0: @ %entry +; CT-NEXT: and r3, r0, #1 +; CT-NEXT: rsb r12, r3, #0 +; CT-NEXT: and r0, r1, r12 +; CT-NEXT: bic r12, r2, r12 +; CT-NEXT: orr r0, r0, r12 +; CT-NEXT: bx lr +; +; BFLOAT-F16-NATIVE-LABEL: ct_half: +; BFLOAT-F16-NATIVE: @ %bb.0: @ %entry +; BFLOAT-F16-NATIVE-NEXT: and r3, r0, #1 +; BFLOAT-F16-NATIVE-NEXT: rsb r12, r3, #0 +; BFLOAT-F16-NATIVE-NEXT: and r0, r1, r12 +; BFLOAT-F16-NATIVE-NEXT: bic r12, r2, r12 +; BFLOAT-F16-NATIVE-NEXT: orr r0, r0, r12 +; BFLOAT-F16-NATIVE-NEXT: bx lr +; +; F16-NATIVE-LABEL: ct_half: +; F16-NATIVE: @ %bb.0: @ %entry +; F16-NATIVE-NEXT: and r3, r0, #1 +; F16-NATIVE-NEXT: rsb r12, r3, #0 +; F16-NATIVE-NEXT: and r0, r1, r12 +; F16-NATIVE-NEXT: bic r12, r2, r12 +; F16-NATIVE-NEXT: orr r0, r0, r12 +; F16-NATIVE-NEXT: bx lr +entry: + %sel = call half @llvm.ct.select.f16(i1 %cond, half %a, half %b) + ret half %sel +} + +define bfloat @ct_bf16(i1 %cond, bfloat %a, bfloat %b) { +; CT-LABEL: ct_bf16: +; CT: @ %bb.0: @ %entry +; CT-NEXT: and r3, r0, #1 +; CT-NEXT: rsb r12, r3, #0 +; CT-NEXT: and r0, r1, r12 +; CT-NEXT: bic r12, r2, r12 +; CT-NEXT: orr r0, r0, r12 +; CT-NEXT: bx lr +; +; BFLOAT-F16-NATIVE-LABEL: ct_bf16: +; BFLOAT-F16-NATIVE: @ %bb.0: @ %entry +; BFLOAT-F16-NATIVE-NEXT: .pad #4 +; BFLOAT-F16-NATIVE-NEXT: sub sp, sp, #4 +; BFLOAT-F16-NATIVE-NEXT: and r0, r0, #1 +; BFLOAT-F16-NATIVE-NEXT: rsb r12, r0, #0 +; BFLOAT-F16-NATIVE-NEXT: and r3, r1, r12 +; BFLOAT-F16-NATIVE-NEXT: bic r12, r2, r12 +; BFLOAT-F16-NATIVE-NEXT: orr r3, r3, r12 +; BFLOAT-F16-NATIVE-NEXT: strh r3, [sp, #2] +; BFLOAT-F16-NATIVE-NEXT: ldrh r0, [sp, #2] +; BFLOAT-F16-NATIVE-NEXT: add sp, sp, #4 +; BFLOAT-F16-NATIVE-NEXT: bx lr +; +; F16-NATIVE-LABEL: ct_bf16: +; F16-NATIVE: @ %bb.0: @ %entry +; F16-NATIVE-NEXT: and r3, r0, #1 +; F16-NATIVE-NEXT: rsb r12, r3, #0 +; F16-NATIVE-NEXT: and r0, r1, r12 +; F16-NATIVE-NEXT: bic r12, r2, r12 +; F16-NATIVE-NEXT: orr r0, r0, r12 +; F16-NATIVE-NEXT: bx lr +entry: + %sel = call bfloat @llvm.ct.select.bf16(i1 %cond, bfloat %a, bfloat %b) + ret bfloat %sel +} + +define <4 x half> @ct_v4f16(i1 %cond, <4 x half> %a, <4 x half> %b) { +; CT-LABEL: ct_v4f16: +; CT: @ %bb.0: @ %entry +; CT-NEXT: .save {r4, r5, r6, lr} +; CT-NEXT: push {r4, r5, r6, lr} +; CT-NEXT: ldrh r1, [sp, #20] +; CT-NEXT: pkhbt r2, r2, r3, lsl #16 +; CT-NEXT: ldrh r4, [sp, #16] +; CT-NEXT: and r0, r0, #1 +; CT-NEXT: ldrh r12, [sp, #36] +; CT-NEXT: ldrh lr, [sp, #28] +; CT-NEXT: orr r1, r4, r1, lsl #16 +; CT-NEXT: ldrh r6, [sp, #24] +; CT-NEXT: ldrh r5, [sp, #32] +; CT-NEXT: vmov d17, r2, r1 +; CT-NEXT: rsb r1, r0, #0 +; CT-NEXT: orr r6, r6, lr, lsl #16 +; CT-NEXT: orr r3, r5, r12, lsl #16 +; CT-NEXT: vdup.32 d19, r1 +; CT-NEXT: vmov d16, r6, r3 +; CT-NEXT: vand d18, d17, d19 +; CT-NEXT: vbic d19, d16, d19 +; CT-NEXT: vorr d18, d18, d19 +; CT-NEXT: vmov.u16 r0, d18[0] +; CT-NEXT: vmov.u16 r1, d18[1] +; CT-NEXT: vmov.u16 r2, d18[2] +; CT-NEXT: vmov.u16 r3, d18[3] +; CT-NEXT: pop {r4, r5, r6, pc} +; +; BFLOAT-F16-NATIVE-LABEL: ct_v4f16: +; BFLOAT-F16-NATIVE: @ %bb.0: @ %entry +; BFLOAT-F16-NATIVE-NEXT: .save {r4, r5, r6, lr} +; BFLOAT-F16-NATIVE-NEXT: push {r4, r5, r6, lr} +; BFLOAT-F16-NATIVE-NEXT: ldrh r1, [sp, #20] +; 
BFLOAT-F16-NATIVE-NEXT: pkhbt r2, r2, r3, lsl #16 +; BFLOAT-F16-NATIVE-NEXT: ldrh r4, [sp, #16] +; BFLOAT-F16-NATIVE-NEXT: and r0, r0, #1 +; BFLOAT-F16-NATIVE-NEXT: ldrh r12, [sp, #36] +; BFLOAT-F16-NATIVE-NEXT: ldrh lr, [sp, #28] +; BFLOAT-F16-NATIVE-NEXT: orr r1, r4, r1, lsl #16 +; BFLOAT-F16-NATIVE-NEXT: ldrh r6, [sp, #24] +; BFLOAT-F16-NATIVE-NEXT: ldrh r5, [sp, #32] +; BFLOAT-F16-NATIVE-NEXT: vmov d17, r2, r1 +; BFLOAT-F16-NATIVE-NEXT: rsb r1, r0, #0 +; BFLOAT-F16-NATIVE-NEXT: orr r6, r6, lr, lsl #16 +; BFLOAT-F16-NATIVE-NEXT: orr r3, r5, r12, lsl #16 +; BFLOAT-F16-NATIVE-NEXT: vdup.32 d19, r1 +; BFLOAT-F16-NATIVE-NEXT: vmov d16, r6, r3 +; BFLOAT-F16-NATIVE-NEXT: vand d18, d17, d19 +; BFLOAT-F16-NATIVE-NEXT: vbic d19, d16, d19 +; BFLOAT-F16-NATIVE-NEXT: vorr d18, d18, d19 +; BFLOAT-F16-NATIVE-NEXT: vmov.u16 r0, d18[0] +; BFLOAT-F16-NATIVE-NEXT: vmov.u16 r1, d18[1] +; BFLOAT-F16-NATIVE-NEXT: vmov.u16 r2, d18[2] +; BFLOAT-F16-NATIVE-NEXT: vmov.u16 r3, d18[3] +; BFLOAT-F16-NATIVE-NEXT: pop {r4, r5, r6, pc} +; +; F16-NATIVE-LABEL: ct_v4f16: +; F16-NATIVE: @ %bb.0: @ %entry +; F16-NATIVE-NEXT: .save {r4, r5, r6, lr} +; F16-NATIVE-NEXT: push {r4, r5, r6, lr} +; F16-NATIVE-NEXT: ldrh r1, [sp, #20] +; F16-NATIVE-NEXT: pkhbt r2, r2, r3, lsl #16 +; F16-NATIVE-NEXT: ldrh r4, [sp, #16] +; F16-NATIVE-NEXT: and r0, r0, #1 +; F16-NATIVE-NEXT: ldrh r12, [sp, #36] +; F16-NATIVE-NEXT: ldrh lr, [sp, #28] +; F16-NATIVE-NEXT: orr r1, r4, r1, lsl #16 +; F16-NATIVE-NEXT: ldrh r6, [sp, #24] +; F16-NATIVE-NEXT: ldrh r5, [sp, #32] +; F16-NATIVE-NEXT: vmov d17, r2, r1 +; F16-NATIVE-NEXT: rsb r1, r0, #0 +; F16-NATIVE-NEXT: orr r6, r6, lr, lsl #16 +; F16-NATIVE-NEXT: orr r3, r5, r12, lsl #16 +; F16-NATIVE-NEXT: vdup.32 d19, r1 +; F16-NATIVE-NEXT: vmov d16, r6, r3 +; F16-NATIVE-NEXT: vand d18, d17, d19 +; F16-NATIVE-NEXT: vbic d19, d16, d19 +; F16-NATIVE-NEXT: vorr d18, d18, d19 +; F16-NATIVE-NEXT: vmov.u16 r0, d18[0] +; F16-NATIVE-NEXT: vmov.u16 r1, d18[1] +; F16-NATIVE-NEXT: vmov.u16 r2, d18[2] +; F16-NATIVE-NEXT: vmov.u16 r3, d18[3] +; F16-NATIVE-NEXT: pop {r4, r5, r6, pc} +entry: + %sel = call <4 x half> @llvm.ct.select.v4f16(i1 %cond, <4 x half> %a, <4 x half> %b) + ret <4 x half> %sel +} + +define <4 x bfloat> @ct_v4bf16(i1 %cond, <4 x bfloat> %a, <4 x bfloat> %b) { +; CT-LABEL: ct_v4bf16: +; CT: @ %bb.0: @ %entry +; CT-NEXT: .save {r4, r5, r6, lr} +; CT-NEXT: push {r4, r5, r6, lr} +; CT-NEXT: ldrh r1, [sp, #20] +; CT-NEXT: pkhbt r2, r2, r3, lsl #16 +; CT-NEXT: ldrh r4, [sp, #16] +; CT-NEXT: and r0, r0, #1 +; CT-NEXT: ldrh r12, [sp, #36] +; CT-NEXT: ldrh lr, [sp, #28] +; CT-NEXT: orr r1, r4, r1, lsl #16 +; CT-NEXT: ldrh r6, [sp, #24] +; CT-NEXT: ldrh r5, [sp, #32] +; CT-NEXT: vmov d17, r2, r1 +; CT-NEXT: rsb r1, r0, #0 +; CT-NEXT: orr r6, r6, lr, lsl #16 +; CT-NEXT: orr r3, r5, r12, lsl #16 +; CT-NEXT: vdup.32 d19, r1 +; CT-NEXT: vmov d16, r6, r3 +; CT-NEXT: vand d18, d17, d19 +; CT-NEXT: vbic d19, d16, d19 +; CT-NEXT: vorr d18, d18, d19 +; CT-NEXT: vmov.u16 r0, d18[0] +; CT-NEXT: vmov.u16 r1, d18[1] +; CT-NEXT: vmov.u16 r2, d18[2] +; CT-NEXT: vmov.u16 r3, d18[3] +; CT-NEXT: pop {r4, r5, r6, pc} +; +; BFLOAT-F16-NATIVE-LABEL: ct_v4bf16: +; BFLOAT-F16-NATIVE: @ %bb.0: @ %entry +; BFLOAT-F16-NATIVE-NEXT: and r0, r0, #1 +; BFLOAT-F16-NATIVE-NEXT: vldr d16, [sp] +; BFLOAT-F16-NATIVE-NEXT: rsb r1, r0, #0 +; BFLOAT-F16-NATIVE-NEXT: vmov d17, r2, r3 +; BFLOAT-F16-NATIVE-NEXT: vdup.32 d19, r1 +; BFLOAT-F16-NATIVE-NEXT: vand d18, d17, d19 +; BFLOAT-F16-NATIVE-NEXT: vbic d19, d16, d19 +; BFLOAT-F16-NATIVE-NEXT: vorr 
d18, d18, d19 +; BFLOAT-F16-NATIVE-NEXT: vmov r0, r1, d18 +; BFLOAT-F16-NATIVE-NEXT: bx lr +; +; F16-NATIVE-LABEL: ct_v4bf16: +; F16-NATIVE: @ %bb.0: @ %entry +; F16-NATIVE-NEXT: .save {r4, r5, r6, lr} +; F16-NATIVE-NEXT: push {r4, r5, r6, lr} +; F16-NATIVE-NEXT: ldrh r1, [sp, #20] +; F16-NATIVE-NEXT: pkhbt r2, r2, r3, lsl #16 +; F16-NATIVE-NEXT: ldrh r4, [sp, #16] +; F16-NATIVE-NEXT: and r0, r0, #1 +; F16-NATIVE-NEXT: ldrh r12, [sp, #36] +; F16-NATIVE-NEXT: ldrh lr, [sp, #28] +; F16-NATIVE-NEXT: orr r1, r4, r1, lsl #16 +; F16-NATIVE-NEXT: ldrh r6, [sp, #24] +; F16-NATIVE-NEXT: ldrh r5, [sp, #32] +; F16-NATIVE-NEXT: vmov d17, r2, r1 +; F16-NATIVE-NEXT: rsb r1, r0, #0 +; F16-NATIVE-NEXT: orr r6, r6, lr, lsl #16 +; F16-NATIVE-NEXT: orr r3, r5, r12, lsl #16 +; F16-NATIVE-NEXT: vdup.32 d19, r1 +; F16-NATIVE-NEXT: vmov d16, r6, r3 +; F16-NATIVE-NEXT: vand d18, d17, d19 +; F16-NATIVE-NEXT: vbic d19, d16, d19 +; F16-NATIVE-NEXT: vorr d18, d18, d19 +; F16-NATIVE-NEXT: vmov.u16 r0, d18[0] +; F16-NATIVE-NEXT: vmov.u16 r1, d18[1] +; F16-NATIVE-NEXT: vmov.u16 r2, d18[2] +; F16-NATIVE-NEXT: vmov.u16 r3, d18[3] +; F16-NATIVE-NEXT: pop {r4, r5, r6, pc} +entry: + %sel = call <4 x bfloat> @llvm.ct.select.v4bf16(i1 %cond, <4 x bfloat> %a, <4 x bfloat> %b) + ret <4 x bfloat> %sel +} + +define <8 x half> @ct_v8f16(i1 %cond, <8 x half> %a, <8 x half> %b) { +; CT-LABEL: ct_v8f16: +; CT: @ %bb.0: @ %entry +; CT-NEXT: .save {r4, r5, r6, r7, r8, lr} +; CT-NEXT: push {r4, r5, r6, r7, r8, lr} +; CT-NEXT: ldrh r12, [sp, #36] +; CT-NEXT: pkhbt r2, r2, r3, lsl #16 +; CT-NEXT: ldrh r7, [sp, #32] +; CT-NEXT: and r1, r1, #1 +; CT-NEXT: ldrh r3, [sp, #52] +; CT-NEXT: vmov.32 d16[0], r2 +; CT-NEXT: ldrh r2, [sp, #48] +; CT-NEXT: orr r7, r7, r12, lsl #16 +; CT-NEXT: ldrh r5, [sp, #68] +; CT-NEXT: orr r2, r2, r3, lsl #16 +; CT-NEXT: vmov.32 d17[0], r7 +; CT-NEXT: ldrh r7, [sp, #64] +; CT-NEXT: ldrh r3, [sp, #28] +; CT-NEXT: vmov.32 d18[0], r2 +; CT-NEXT: ldrh r2, [sp, #24] +; CT-NEXT: orr r7, r7, r5, lsl #16 +; CT-NEXT: ldrh r5, [sp, #76] +; CT-NEXT: vmov.32 d19[0], r7 +; CT-NEXT: orr r2, r2, r3, lsl #16 +; CT-NEXT: ldrh r7, [sp, #72] +; CT-NEXT: ldrh lr, [sp, #60] +; CT-NEXT: vmov.32 d16[1], r2 +; CT-NEXT: orr r2, r7, r5, lsl #16 +; CT-NEXT: ldrh r4, [sp, #56] +; CT-NEXT: ldrh r8, [sp, #44] +; CT-NEXT: vmov.32 d19[1], r2 +; CT-NEXT: orr r2, r4, lr, lsl #16 +; CT-NEXT: ldrh r6, [sp, #40] +; CT-NEXT: vmov.32 d18[1], r2 +; CT-NEXT: orr r2, r6, r8, lsl #16 +; CT-NEXT: vmov.32 d17[1], r2 +; CT-NEXT: rsb r2, r1, #0 +; CT-NEXT: vdup.32 q11, r2 +; CT-NEXT: vand q10, q8, q11 +; CT-NEXT: vbic q11, q9, q11 +; CT-NEXT: vorr q10, q10, q11 +; CT-NEXT: vst1.64 {d20, d21}, [r0:128] +; CT-NEXT: pop {r4, r5, r6, r7, r8, pc} +; +; BFLOAT-F16-NATIVE-LABEL: ct_v8f16: +; BFLOAT-F16-NATIVE: @ %bb.0: @ %entry +; BFLOAT-F16-NATIVE-NEXT: .save {r4, r5, r6, r7, r8, lr} +; BFLOAT-F16-NATIVE-NEXT: push {r4, r5, r6, r7, r8, lr} +; BFLOAT-F16-NATIVE-NEXT: ldrh r12, [sp, #36] +; BFLOAT-F16-NATIVE-NEXT: pkhbt r2, r2, r3, lsl #16 +; BFLOAT-F16-NATIVE-NEXT: ldrh r7, [sp, #32] +; BFLOAT-F16-NATIVE-NEXT: and r1, r1, #1 +; BFLOAT-F16-NATIVE-NEXT: ldrh r3, [sp, #52] +; BFLOAT-F16-NATIVE-NEXT: vmov.32 d16[0], r2 +; BFLOAT-F16-NATIVE-NEXT: ldrh r2, [sp, #48] +; BFLOAT-F16-NATIVE-NEXT: orr r7, r7, r12, lsl #16 +; BFLOAT-F16-NATIVE-NEXT: ldrh r5, [sp, #68] +; BFLOAT-F16-NATIVE-NEXT: orr r2, r2, r3, lsl #16 +; BFLOAT-F16-NATIVE-NEXT: vmov.32 d17[0], r7 +; BFLOAT-F16-NATIVE-NEXT: ldrh r7, [sp, #64] +; BFLOAT-F16-NATIVE-NEXT: ldrh r3, [sp, #28] +; 
BFLOAT-F16-NATIVE-NEXT: vmov.32 d18[0], r2 +; BFLOAT-F16-NATIVE-NEXT: ldrh r2, [sp, #24] +; BFLOAT-F16-NATIVE-NEXT: orr r7, r7, r5, lsl #16 +; BFLOAT-F16-NATIVE-NEXT: ldrh r5, [sp, #76] +; BFLOAT-F16-NATIVE-NEXT: vmov.32 d19[0], r7 +; BFLOAT-F16-NATIVE-NEXT: orr r2, r2, r3, lsl #16 +; BFLOAT-F16-NATIVE-NEXT: ldrh r7, [sp, #72] +; BFLOAT-F16-NATIVE-NEXT: ldrh lr, [sp, #60] +; BFLOAT-F16-NATIVE-NEXT: vmov.32 d16[1], r2 +; BFLOAT-F16-NATIVE-NEXT: orr r2, r7, r5, lsl #16 +; BFLOAT-F16-NATIVE-NEXT: ldrh r4, [sp, #56] +; BFLOAT-F16-NATIVE-NEXT: ldrh r8, [sp, #44] +; BFLOAT-F16-NATIVE-NEXT: vmov.32 d19[1], r2 +; BFLOAT-F16-NATIVE-NEXT: orr r2, r4, lr, lsl #16 +; BFLOAT-F16-NATIVE-NEXT: ldrh r6, [sp, #40] +; BFLOAT-F16-NATIVE-NEXT: vmov.32 d18[1], r2 +; BFLOAT-F16-NATIVE-NEXT: orr r2, r6, r8, lsl #16 +; BFLOAT-F16-NATIVE-NEXT: vmov.32 d17[1], r2 +; BFLOAT-F16-NATIVE-NEXT: rsb r2, r1, #0 +; BFLOAT-F16-NATIVE-NEXT: vdup.32 q11, r2 +; BFLOAT-F16-NATIVE-NEXT: vand q10, q8, q11 +; BFLOAT-F16-NATIVE-NEXT: vbic q11, q9, q11 +; BFLOAT-F16-NATIVE-NEXT: vorr q10, q10, q11 +; BFLOAT-F16-NATIVE-NEXT: vst1.64 {d20, d21}, [r0:128] +; BFLOAT-F16-NATIVE-NEXT: pop {r4, r5, r6, r7, r8, pc} +; +; F16-NATIVE-LABEL: ct_v8f16: +; F16-NATIVE: @ %bb.0: @ %entry +; F16-NATIVE-NEXT: .save {r4, r5, r6, r7, r8, lr} +; F16-NATIVE-NEXT: push {r4, r5, r6, r7, r8, lr} +; F16-NATIVE-NEXT: ldrh r12, [sp, #36] +; F16-NATIVE-NEXT: pkhbt r2, r2, r3, lsl #16 +; F16-NATIVE-NEXT: ldrh r7, [sp, #32] +; F16-NATIVE-NEXT: and r1, r1, #1 +; F16-NATIVE-NEXT: ldrh r3, [sp, #52] +; F16-NATIVE-NEXT: vmov.32 d16[0], r2 +; F16-NATIVE-NEXT: ldrh r2, [sp, #48] +; F16-NATIVE-NEXT: orr r7, r7, r12, lsl #16 +; F16-NATIVE-NEXT: ldrh r5, [sp, #68] +; F16-NATIVE-NEXT: orr r2, r2, r3, lsl #16 +; F16-NATIVE-NEXT: vmov.32 d17[0], r7 +; F16-NATIVE-NEXT: ldrh r7, [sp, #64] +; F16-NATIVE-NEXT: ldrh r3, [sp, #28] +; F16-NATIVE-NEXT: vmov.32 d18[0], r2 +; F16-NATIVE-NEXT: ldrh r2, [sp, #24] +; F16-NATIVE-NEXT: orr r7, r7, r5, lsl #16 +; F16-NATIVE-NEXT: ldrh r5, [sp, #76] +; F16-NATIVE-NEXT: vmov.32 d19[0], r7 +; F16-NATIVE-NEXT: orr r2, r2, r3, lsl #16 +; F16-NATIVE-NEXT: ldrh r7, [sp, #72] +; F16-NATIVE-NEXT: ldrh lr, [sp, #60] +; F16-NATIVE-NEXT: vmov.32 d16[1], r2 +; F16-NATIVE-NEXT: orr r2, r7, r5, lsl #16 +; F16-NATIVE-NEXT: ldrh r4, [sp, #56] +; F16-NATIVE-NEXT: ldrh r8, [sp, #44] +; F16-NATIVE-NEXT: vmov.32 d19[1], r2 +; F16-NATIVE-NEXT: orr r2, r4, lr, lsl #16 +; F16-NATIVE-NEXT: ldrh r6, [sp, #40] +; F16-NATIVE-NEXT: vmov.32 d18[1], r2 +; F16-NATIVE-NEXT: orr r2, r6, r8, lsl #16 +; F16-NATIVE-NEXT: vmov.32 d17[1], r2 +; F16-NATIVE-NEXT: rsb r2, r1, #0 +; F16-NATIVE-NEXT: vdup.32 q11, r2 +; F16-NATIVE-NEXT: vand q10, q8, q11 +; F16-NATIVE-NEXT: vbic q11, q9, q11 +; F16-NATIVE-NEXT: vorr q10, q10, q11 +; F16-NATIVE-NEXT: vst1.64 {d20, d21}, [r0:128] +; F16-NATIVE-NEXT: pop {r4, r5, r6, r7, r8, pc} +entry: + %sel = call <8 x half> @llvm.ct.select.v8f16(i1 %cond, <8 x half> %a, <8 x half> %b) + ret <8 x half> %sel +} + +define <8 x bfloat> @ct_v8bf16(i1 %cond, <8 x bfloat> %a, <8 x bfloat> %b) { +; CT-LABEL: ct_v8bf16: +; CT: @ %bb.0: @ %entry +; CT-NEXT: .save {r4, r5, r6, r7, r8, lr} +; CT-NEXT: push {r4, r5, r6, r7, r8, lr} +; CT-NEXT: ldrh r12, [sp, #36] +; CT-NEXT: pkhbt r2, r2, r3, lsl #16 +; CT-NEXT: ldrh r7, [sp, #32] +; CT-NEXT: and r1, r1, #1 +; CT-NEXT: ldrh r3, [sp, #52] +; CT-NEXT: vmov.32 d16[0], r2 +; CT-NEXT: ldrh r2, [sp, #48] +; CT-NEXT: orr r7, r7, r12, lsl #16 +; CT-NEXT: ldrh r5, [sp, #68] +; CT-NEXT: orr r2, r2, r3, lsl #16 +; CT-NEXT: 
vmov.32 d17[0], r7 +; CT-NEXT: ldrh r7, [sp, #64] +; CT-NEXT: ldrh r3, [sp, #28] +; CT-NEXT: vmov.32 d18[0], r2 +; CT-NEXT: ldrh r2, [sp, #24] +; CT-NEXT: orr r7, r7, r5, lsl #16 +; CT-NEXT: ldrh r5, [sp, #76] +; CT-NEXT: vmov.32 d19[0], r7 +; CT-NEXT: orr r2, r2, r3, lsl #16 +; CT-NEXT: ldrh r7, [sp, #72] +; CT-NEXT: ldrh lr, [sp, #60] +; CT-NEXT: vmov.32 d16[1], r2 +; CT-NEXT: orr r2, r7, r5, lsl #16 +; CT-NEXT: ldrh r4, [sp, #56] +; CT-NEXT: ldrh r8, [sp, #44] +; CT-NEXT: vmov.32 d19[1], r2 +; CT-NEXT: orr r2, r4, lr, lsl #16 +; CT-NEXT: ldrh r6, [sp, #40] +; CT-NEXT: vmov.32 d18[1], r2 +; CT-NEXT: orr r2, r6, r8, lsl #16 +; CT-NEXT: vmov.32 d17[1], r2 +; CT-NEXT: rsb r2, r1, #0 +; CT-NEXT: vdup.32 q11, r2 +; CT-NEXT: vand q10, q8, q11 +; CT-NEXT: vbic q11, q9, q11 +; CT-NEXT: vorr q10, q10, q11 +; CT-NEXT: vst1.64 {d20, d21}, [r0:128] +; CT-NEXT: pop {r4, r5, r6, r7, r8, pc} +; +; BFLOAT-F16-NATIVE-LABEL: ct_v8bf16: +; BFLOAT-F16-NATIVE: @ %bb.0: @ %entry +; BFLOAT-F16-NATIVE-NEXT: add r1, sp, #8 +; BFLOAT-F16-NATIVE-NEXT: and r0, r0, #1 +; BFLOAT-F16-NATIVE-NEXT: vld1.64 {d18, d19}, [r1] +; BFLOAT-F16-NATIVE-NEXT: rsb r1, r0, #0 +; BFLOAT-F16-NATIVE-NEXT: vldr d17, [sp] +; BFLOAT-F16-NATIVE-NEXT: vmov d16, r2, r3 +; BFLOAT-F16-NATIVE-NEXT: vdup.32 q11, r1 +; BFLOAT-F16-NATIVE-NEXT: vand q10, q8, q11 +; BFLOAT-F16-NATIVE-NEXT: vbic q11, q9, q11 +; BFLOAT-F16-NATIVE-NEXT: vorr q10, q10, q11 +; BFLOAT-F16-NATIVE-NEXT: vmov r0, r1, d20 +; BFLOAT-F16-NATIVE-NEXT: vmov r2, r3, d21 +; BFLOAT-F16-NATIVE-NEXT: bx lr +; +; F16-NATIVE-LABEL: ct_v8bf16: +; F16-NATIVE: @ %bb.0: @ %entry +; F16-NATIVE-NEXT: .save {r4, r5, r6, r7, r8, lr} +; F16-NATIVE-NEXT: push {r4, r5, r6, r7, r8, lr} +; F16-NATIVE-NEXT: ldrh r12, [sp, #36] +; F16-NATIVE-NEXT: pkhbt r2, r2, r3, lsl #16 +; F16-NATIVE-NEXT: ldrh r7, [sp, #32] +; F16-NATIVE-NEXT: and r1, r1, #1 +; F16-NATIVE-NEXT: ldrh r3, [sp, #52] +; F16-NATIVE-NEXT: vmov.32 d16[0], r2 +; F16-NATIVE-NEXT: ldrh r2, [sp, #48] +; F16-NATIVE-NEXT: orr r7, r7, r12, lsl #16 +; F16-NATIVE-NEXT: ldrh r5, [sp, #68] +; F16-NATIVE-NEXT: orr r2, r2, r3, lsl #16 +; F16-NATIVE-NEXT: vmov.32 d17[0], r7 +; F16-NATIVE-NEXT: ldrh r7, [sp, #64] +; F16-NATIVE-NEXT: ldrh r3, [sp, #28] +; F16-NATIVE-NEXT: vmov.32 d18[0], r2 +; F16-NATIVE-NEXT: ldrh r2, [sp, #24] +; F16-NATIVE-NEXT: orr r7, r7, r5, lsl #16 +; F16-NATIVE-NEXT: ldrh r5, [sp, #76] +; F16-NATIVE-NEXT: vmov.32 d19[0], r7 +; F16-NATIVE-NEXT: orr r2, r2, r3, lsl #16 +; F16-NATIVE-NEXT: ldrh r7, [sp, #72] +; F16-NATIVE-NEXT: ldrh lr, [sp, #60] +; F16-NATIVE-NEXT: vmov.32 d16[1], r2 +; F16-NATIVE-NEXT: orr r2, r7, r5, lsl #16 +; F16-NATIVE-NEXT: ldrh r4, [sp, #56] +; F16-NATIVE-NEXT: ldrh r8, [sp, #44] +; F16-NATIVE-NEXT: vmov.32 d19[1], r2 +; F16-NATIVE-NEXT: orr r2, r4, lr, lsl #16 +; F16-NATIVE-NEXT: ldrh r6, [sp, #40] +; F16-NATIVE-NEXT: vmov.32 d18[1], r2 +; F16-NATIVE-NEXT: orr r2, r6, r8, lsl #16 +; F16-NATIVE-NEXT: vmov.32 d17[1], r2 +; F16-NATIVE-NEXT: rsb r2, r1, #0 +; F16-NATIVE-NEXT: vdup.32 q11, r2 +; F16-NATIVE-NEXT: vand q10, q8, q11 +; F16-NATIVE-NEXT: vbic q11, q9, q11 +; F16-NATIVE-NEXT: vorr q10, q10, q11 +; F16-NATIVE-NEXT: vst1.64 {d20, d21}, [r0:128] +; F16-NATIVE-NEXT: pop {r4, r5, r6, r7, r8, pc} +entry: + %sel = call <8 x bfloat> @llvm.ct.select.v8bf16(i1 %cond, <8 x bfloat> %a, <8 x bfloat> %b) + ret <8 x bfloat> %sel +} diff --git a/llvm/test/CodeGen/ARM/ctselect-vector.ll b/llvm/test/CodeGen/ARM/ctselect-vector.ll new file mode 100644 index 0000000000000..8afa8275d9aff --- /dev/null +++ 
b/llvm/test/CodeGen/ARM/ctselect-vector.ll @@ -0,0 +1,855 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=armv7-none-eabi -verify-machineinstrs | FileCheck --check-prefixes=CT %s +; RUN: llc < %s -mtriple=armv6 -verify-machineinstrs | FileCheck --check-prefix=DEFAULT %s + +define <8 x i8> @ct_v8i8(i1 %cond, <8 x i8> %a, <8 x i8> %b) { +; CT-LABEL: ct_v8i8: +; CT: @ %bb.0: @ %entry +; CT-NEXT: and r0, r0, #1 +; CT-NEXT: vldr d16, [sp] +; CT-NEXT: rsb r1, r0, #0 +; CT-NEXT: vmov d17, r2, r3 +; CT-NEXT: vdup.32 d19, r1 +; CT-NEXT: vand d18, d17, d19 +; CT-NEXT: vbic d19, d16, d19 +; CT-NEXT: vorr d18, d18, d19 +; CT-NEXT: vmov r0, r1, d18 +; CT-NEXT: bx lr +; +; DEFAULT-LABEL: ct_v8i8: +; DEFAULT: @ %bb.0: @ %entry +; DEFAULT-NEXT: push {r4, r5, r11, lr} +; DEFAULT-NEXT: and lr, r1, #1 +; DEFAULT-NEXT: ldrb r12, [sp, #68] +; DEFAULT-NEXT: ldrb r1, [sp, #36] +; DEFAULT-NEXT: rsb r5, lr, #0 +; DEFAULT-NEXT: and r4, r1, r5 +; DEFAULT-NEXT: bic r5, r12, r5 +; DEFAULT-NEXT: orr r4, r4, r5 +; DEFAULT-NEXT: ldrb r12, [sp, #64] +; DEFAULT-NEXT: ldrb r5, [sp, #32] +; DEFAULT-NEXT: rsb r1, lr, #0 +; DEFAULT-NEXT: strb r4, [r0, #7] +; DEFAULT-NEXT: and r4, r5, r1 +; DEFAULT-NEXT: bic r1, r12, r1 +; DEFAULT-NEXT: ldrb r12, [sp, #60] +; DEFAULT-NEXT: orr r4, r4, r1 +; DEFAULT-NEXT: ldrb r5, [sp, #28] +; DEFAULT-NEXT: rsb r1, lr, #0 +; DEFAULT-NEXT: strb r4, [r0, #6] +; DEFAULT-NEXT: and r4, r5, r1 +; DEFAULT-NEXT: bic r1, r12, r1 +; DEFAULT-NEXT: ldrb r12, [sp, #56] +; DEFAULT-NEXT: orr r4, r4, r1 +; DEFAULT-NEXT: ldrb r5, [sp, #24] +; DEFAULT-NEXT: rsb r1, lr, #0 +; DEFAULT-NEXT: strb r4, [r0, #5] +; DEFAULT-NEXT: and r4, r5, r1 +; DEFAULT-NEXT: bic r1, r12, r1 +; DEFAULT-NEXT: ldrb r12, [sp, #52] +; DEFAULT-NEXT: orr r4, r4, r1 +; DEFAULT-NEXT: ldrb r5, [sp, #20] +; DEFAULT-NEXT: rsb r1, lr, #0 +; DEFAULT-NEXT: strb r4, [r0, #4] +; DEFAULT-NEXT: and r4, r5, r1 +; DEFAULT-NEXT: bic r1, r12, r1 +; DEFAULT-NEXT: ldrb r12, [sp, #48] +; DEFAULT-NEXT: orr r4, r4, r1 +; DEFAULT-NEXT: ldrb r5, [sp, #16] +; DEFAULT-NEXT: rsb r1, lr, #0 +; DEFAULT-NEXT: strb r4, [r0, #3] +; DEFAULT-NEXT: and r4, r5, r1 +; DEFAULT-NEXT: bic r1, r12, r1 +; DEFAULT-NEXT: orr r4, r4, r1 +; DEFAULT-NEXT: ldrb r1, [sp, #44] +; DEFAULT-NEXT: strb r4, [r0, #2] +; DEFAULT-NEXT: rsb r4, lr, #0 +; DEFAULT-NEXT: and r5, r3, r4 +; DEFAULT-NEXT: bic r4, r1, r4 +; DEFAULT-NEXT: orr r5, r5, r4 +; DEFAULT-NEXT: ldrb r1, [sp, #40] +; DEFAULT-NEXT: strb r5, [r0, #1] +; DEFAULT-NEXT: rsb r5, lr, #0 +; DEFAULT-NEXT: and r3, r2, r5 +; DEFAULT-NEXT: bic r5, r1, r5 +; DEFAULT-NEXT: orr r3, r3, r5 +; DEFAULT-NEXT: strb r3, [r0] +; DEFAULT-NEXT: pop {r4, r5, r11, pc} +entry: + %sel = call <8 x i8> @llvm.ct.select.v8i8(i1 %cond, <8 x i8> %a, <8 x i8> %b) + ret <8 x i8> %sel +} + +define <4 x i16> @ct_v4i16(i1 %cond, <4 x i16> %a, <4 x i16> %b) { +; CT-LABEL: ct_v4i16: +; CT: @ %bb.0: @ %entry +; CT-NEXT: and r0, r0, #1 +; CT-NEXT: vldr d16, [sp] +; CT-NEXT: rsb r1, r0, #0 +; CT-NEXT: vmov d17, r2, r3 +; CT-NEXT: vdup.32 d19, r1 +; CT-NEXT: vand d18, d17, d19 +; CT-NEXT: vbic d19, d16, d19 +; CT-NEXT: vorr d18, d18, d19 +; CT-NEXT: vmov r0, r1, d18 +; CT-NEXT: bx lr +; +; DEFAULT-LABEL: ct_v4i16: +; DEFAULT: @ %bb.0: @ %entry +; DEFAULT-NEXT: push {r4, r5, r11, lr} +; DEFAULT-NEXT: and r12, r0, #1 +; DEFAULT-NEXT: ldrh r1, [sp, #24] +; DEFAULT-NEXT: rsb lr, r12, #0 +; DEFAULT-NEXT: rsb r4, r12, #0 +; DEFAULT-NEXT: and r0, r2, lr +; DEFAULT-NEXT: bic lr, r1, lr +; DEFAULT-NEXT: 
ldrh r2, [sp, #28] +; DEFAULT-NEXT: orr r0, r0, lr +; DEFAULT-NEXT: rsb lr, r12, #0 +; DEFAULT-NEXT: rsb r5, r12, #0 +; DEFAULT-NEXT: and r1, r3, lr +; DEFAULT-NEXT: bic lr, r2, lr +; DEFAULT-NEXT: orr r1, r1, lr +; DEFAULT-NEXT: ldrh r3, [sp, #16] +; DEFAULT-NEXT: ldrh lr, [sp, #32] +; DEFAULT-NEXT: and r2, r3, r4 +; DEFAULT-NEXT: bic r4, lr, r4 +; DEFAULT-NEXT: ldrh lr, [sp, #36] +; DEFAULT-NEXT: orr r2, r2, r4 +; DEFAULT-NEXT: ldrh r4, [sp, #20] +; DEFAULT-NEXT: and r3, r4, r5 +; DEFAULT-NEXT: bic r5, lr, r5 +; DEFAULT-NEXT: orr r3, r3, r5 +; DEFAULT-NEXT: pop {r4, r5, r11, pc} +entry: + %sel = call <4 x i16> @llvm.ct.select.v4i16(i1 %cond, <4 x i16> %a, <4 x i16> %b) + ret <4 x i16> %sel +} + +define <2 x i32> @ct_v2i32(i1 %cond, <2 x i32> %a, <2 x i32> %b) { +; CT-LABEL: ct_v2i32: +; CT: @ %bb.0: @ %entry +; CT-NEXT: and r0, r0, #1 +; CT-NEXT: vldr d16, [sp] +; CT-NEXT: rsb r1, r0, #0 +; CT-NEXT: vmov d17, r2, r3 +; CT-NEXT: vdup.32 d19, r1 +; CT-NEXT: vand d18, d17, d19 +; CT-NEXT: vbic d19, d16, d19 +; CT-NEXT: vorr d18, d18, d19 +; CT-NEXT: vmov r0, r1, d18 +; CT-NEXT: bx lr +; +; DEFAULT-LABEL: ct_v2i32: +; DEFAULT: @ %bb.0: @ %entry +; DEFAULT-NEXT: push {r11, lr} +; DEFAULT-NEXT: and r12, r0, #1 +; DEFAULT-NEXT: ldr r1, [sp, #8] +; DEFAULT-NEXT: rsb lr, r12, #0 +; DEFAULT-NEXT: and r0, r2, lr +; DEFAULT-NEXT: bic lr, r1, lr +; DEFAULT-NEXT: ldr r2, [sp, #12] +; DEFAULT-NEXT: orr r0, r0, lr +; DEFAULT-NEXT: rsb lr, r12, #0 +; DEFAULT-NEXT: and r1, r3, lr +; DEFAULT-NEXT: bic lr, r2, lr +; DEFAULT-NEXT: orr r1, r1, lr +; DEFAULT-NEXT: pop {r11, pc} +entry: + %sel = call <2 x i32> @llvm.ct.select.v2i32(i1 %cond, <2 x i32> %a, <2 x i32> %b) + ret <2 x i32> %sel +} + +define <1 x i64> @ct_v1i64(i1 %cond, <1 x i64> %a, <1 x i64> %b) { +; CT-LABEL: ct_v1i64: +; CT: @ %bb.0: @ %entry +; CT-NEXT: and r0, r0, #1 +; CT-NEXT: vldr d16, [sp] +; CT-NEXT: rsb r1, r0, #0 +; CT-NEXT: vmov d17, r2, r3 +; CT-NEXT: vdup.32 d19, r1 +; CT-NEXT: vand d18, d17, d19 +; CT-NEXT: vbic d19, d16, d19 +; CT-NEXT: vorr d18, d18, d19 +; CT-NEXT: vmov r0, r1, d18 +; CT-NEXT: bx lr +; +; DEFAULT-LABEL: ct_v1i64: +; DEFAULT: @ %bb.0: @ %entry +; DEFAULT-NEXT: push {r11, lr} +; DEFAULT-NEXT: and r12, r0, #1 +; DEFAULT-NEXT: ldr r1, [sp, #8] +; DEFAULT-NEXT: rsb lr, r12, #0 +; DEFAULT-NEXT: and r0, r2, lr +; DEFAULT-NEXT: bic lr, r1, lr +; DEFAULT-NEXT: ldr r2, [sp, #12] +; DEFAULT-NEXT: orr r0, r0, lr +; DEFAULT-NEXT: rsb lr, r12, #0 +; DEFAULT-NEXT: and r1, r3, lr +; DEFAULT-NEXT: bic lr, r2, lr +; DEFAULT-NEXT: orr r1, r1, lr +; DEFAULT-NEXT: pop {r11, pc} +entry: + %sel = call <1 x i64> @llvm.ct.select.v1i64(i1 %cond, <1 x i64> %a, <1 x i64> %b) + ret <1 x i64> %sel +} + +define <2 x float> @ct_v2f32(i1 %cond, <2 x float> %a, <2 x float> %b) { +; CT-LABEL: ct_v2f32: +; CT: @ %bb.0: @ %entry +; CT-NEXT: and r0, r0, #1 +; CT-NEXT: vldr d16, [sp] +; CT-NEXT: rsb r1, r0, #0 +; CT-NEXT: vmov d17, r2, r3 +; CT-NEXT: vdup.32 d19, r1 +; CT-NEXT: vand d18, d17, d19 +; CT-NEXT: vbic d19, d16, d19 +; CT-NEXT: vorr d18, d18, d19 +; CT-NEXT: vmov r0, r1, d18 +; CT-NEXT: bx lr +; +; DEFAULT-LABEL: ct_v2f32: +; DEFAULT: @ %bb.0: @ %entry +; DEFAULT-NEXT: push {r11, lr} +; DEFAULT-NEXT: and r12, r0, #1 +; DEFAULT-NEXT: ldr r1, [sp, #8] +; DEFAULT-NEXT: rsb lr, r12, #0 +; DEFAULT-NEXT: and r0, r2, lr +; DEFAULT-NEXT: bic lr, r1, lr +; DEFAULT-NEXT: ldr r2, [sp, #12] +; DEFAULT-NEXT: orr r0, r0, lr +; DEFAULT-NEXT: rsb lr, r12, #0 +; DEFAULT-NEXT: and r1, r3, lr +; DEFAULT-NEXT: bic lr, r2, lr +; DEFAULT-NEXT: orr r1, r1, lr +; 
DEFAULT-NEXT: pop {r11, pc} +entry: + %sel = call <2 x float> @llvm.ct.select.v2f32(i1 %cond, <2 x float> %a, <2 x float> %b) + ret <2 x float> %sel +} + +define <16 x i8> @ct_v16i8(i1 %cond, <16 x i8> %a, <16 x i8> %b) { +; CT-LABEL: ct_v16i8: +; CT: @ %bb.0: @ %entry +; CT-NEXT: add r1, sp, #8 +; CT-NEXT: and r0, r0, #1 +; CT-NEXT: vld1.64 {d18, d19}, [r1] +; CT-NEXT: rsb r1, r0, #0 +; CT-NEXT: vldr d17, [sp] +; CT-NEXT: vmov d16, r2, r3 +; CT-NEXT: vdup.32 q11, r1 +; CT-NEXT: vand q10, q8, q11 +; CT-NEXT: vbic q11, q9, q11 +; CT-NEXT: vorr q10, q10, q11 +; CT-NEXT: vmov r0, r1, d20 +; CT-NEXT: vmov r2, r3, d21 +; CT-NEXT: bx lr +; +; DEFAULT-LABEL: ct_v16i8: +; DEFAULT: @ %bb.0: @ %entry +; DEFAULT-NEXT: push {r4, r5, r11, lr} +; DEFAULT-NEXT: and lr, r1, #1 +; DEFAULT-NEXT: ldrb r12, [sp, #132] +; DEFAULT-NEXT: ldrb r1, [sp, #68] +; DEFAULT-NEXT: rsb r5, lr, #0 +; DEFAULT-NEXT: and r4, r1, r5 +; DEFAULT-NEXT: bic r5, r12, r5 +; DEFAULT-NEXT: orr r4, r4, r5 +; DEFAULT-NEXT: ldrb r12, [sp, #128] +; DEFAULT-NEXT: ldrb r5, [sp, #64] +; DEFAULT-NEXT: rsb r1, lr, #0 +; DEFAULT-NEXT: strb r4, [r0, #15] +; DEFAULT-NEXT: and r4, r5, r1 +; DEFAULT-NEXT: bic r1, r12, r1 +; DEFAULT-NEXT: ldrb r12, [sp, #124] +; DEFAULT-NEXT: orr r4, r4, r1 +; DEFAULT-NEXT: ldrb r5, [sp, #60] +; DEFAULT-NEXT: rsb r1, lr, #0 +; DEFAULT-NEXT: strb r4, [r0, #14] +; DEFAULT-NEXT: and r4, r5, r1 +; DEFAULT-NEXT: bic r1, r12, r1 +; DEFAULT-NEXT: ldrb r12, [sp, #120] +; DEFAULT-NEXT: orr r4, r4, r1 +; DEFAULT-NEXT: ldrb r5, [sp, #56] +; DEFAULT-NEXT: rsb r1, lr, #0 +; DEFAULT-NEXT: strb r4, [r0, #13] +; DEFAULT-NEXT: and r4, r5, r1 +; DEFAULT-NEXT: bic r1, r12, r1 +; DEFAULT-NEXT: ldrb r12, [sp, #116] +; DEFAULT-NEXT: orr r4, r4, r1 +; DEFAULT-NEXT: ldrb r5, [sp, #52] +; DEFAULT-NEXT: rsb r1, lr, #0 +; DEFAULT-NEXT: strb r4, [r0, #12] +; DEFAULT-NEXT: and r4, r5, r1 +; DEFAULT-NEXT: bic r1, r12, r1 +; DEFAULT-NEXT: ldrb r12, [sp, #112] +; DEFAULT-NEXT: orr r4, r4, r1 +; DEFAULT-NEXT: ldrb r5, [sp, #48] +; DEFAULT-NEXT: rsb r1, lr, #0 +; DEFAULT-NEXT: strb r4, [r0, #11] +; DEFAULT-NEXT: and r4, r5, r1 +; DEFAULT-NEXT: bic r1, r12, r1 +; DEFAULT-NEXT: ldrb r12, [sp, #108] +; DEFAULT-NEXT: orr r4, r4, r1 +; DEFAULT-NEXT: ldrb r5, [sp, #44] +; DEFAULT-NEXT: rsb r1, lr, #0 +; DEFAULT-NEXT: strb r4, [r0, #10] +; DEFAULT-NEXT: and r4, r5, r1 +; DEFAULT-NEXT: bic r1, r12, r1 +; DEFAULT-NEXT: ldrb r12, [sp, #104] +; DEFAULT-NEXT: orr r4, r4, r1 +; DEFAULT-NEXT: ldrb r5, [sp, #40] +; DEFAULT-NEXT: rsb r1, lr, #0 +; DEFAULT-NEXT: strb r4, [r0, #9] +; DEFAULT-NEXT: and r4, r5, r1 +; DEFAULT-NEXT: bic r1, r12, r1 +; DEFAULT-NEXT: ldrb r12, [sp, #100] +; DEFAULT-NEXT: orr r4, r4, r1 +; DEFAULT-NEXT: ldrb r5, [sp, #36] +; DEFAULT-NEXT: rsb r1, lr, #0 +; DEFAULT-NEXT: strb r4, [r0, #8] +; DEFAULT-NEXT: and r4, r5, r1 +; DEFAULT-NEXT: bic r1, r12, r1 +; DEFAULT-NEXT: ldrb r12, [sp, #96] +; DEFAULT-NEXT: orr r4, r4, r1 +; DEFAULT-NEXT: ldrb r5, [sp, #32] +; DEFAULT-NEXT: rsb r1, lr, #0 +; DEFAULT-NEXT: strb r4, [r0, #7] +; DEFAULT-NEXT: and r4, r5, r1 +; DEFAULT-NEXT: bic r1, r12, r1 +; DEFAULT-NEXT: ldrb r12, [sp, #92] +; DEFAULT-NEXT: orr r4, r4, r1 +; DEFAULT-NEXT: ldrb r5, [sp, #28] +; DEFAULT-NEXT: rsb r1, lr, #0 +; DEFAULT-NEXT: strb r4, [r0, #6] +; DEFAULT-NEXT: and r4, r5, r1 +; DEFAULT-NEXT: bic r1, r12, r1 +; DEFAULT-NEXT: ldrb r12, [sp, #88] +; DEFAULT-NEXT: orr r4, r4, r1 +; DEFAULT-NEXT: ldrb r5, [sp, #24] +; DEFAULT-NEXT: rsb r1, lr, #0 +; DEFAULT-NEXT: strb r4, [r0, #5] +; DEFAULT-NEXT: and r4, r5, r1 +; DEFAULT-NEXT: bic 
r1, r12, r1 +; DEFAULT-NEXT: ldrb r12, [sp, #84] +; DEFAULT-NEXT: orr r4, r4, r1 +; DEFAULT-NEXT: ldrb r5, [sp, #20] +; DEFAULT-NEXT: rsb r1, lr, #0 +; DEFAULT-NEXT: strb r4, [r0, #4] +; DEFAULT-NEXT: and r4, r5, r1 +; DEFAULT-NEXT: bic r1, r12, r1 +; DEFAULT-NEXT: ldrb r12, [sp, #80] +; DEFAULT-NEXT: orr r4, r4, r1 +; DEFAULT-NEXT: ldrb r5, [sp, #16] +; DEFAULT-NEXT: rsb r1, lr, #0 +; DEFAULT-NEXT: strb r4, [r0, #3] +; DEFAULT-NEXT: and r4, r5, r1 +; DEFAULT-NEXT: bic r1, r12, r1 +; DEFAULT-NEXT: orr r4, r4, r1 +; DEFAULT-NEXT: ldrb r1, [sp, #76] +; DEFAULT-NEXT: strb r4, [r0, #2] +; DEFAULT-NEXT: rsb r4, lr, #0 +; DEFAULT-NEXT: and r5, r3, r4 +; DEFAULT-NEXT: bic r4, r1, r4 +; DEFAULT-NEXT: orr r5, r5, r4 +; DEFAULT-NEXT: ldrb r1, [sp, #72] +; DEFAULT-NEXT: strb r5, [r0, #1] +; DEFAULT-NEXT: rsb r5, lr, #0 +; DEFAULT-NEXT: and r3, r2, r5 +; DEFAULT-NEXT: bic r5, r1, r5 +; DEFAULT-NEXT: orr r3, r3, r5 +; DEFAULT-NEXT: strb r3, [r0] +; DEFAULT-NEXT: pop {r4, r5, r11, pc} +entry: + %sel = call <16 x i8> @llvm.ct.select.v16i8(i1 %cond, <16 x i8> %a, <16 x i8> %b) + ret <16 x i8> %sel +} + +define <8 x i16> @ct_v8i16(i1 %cond, <8 x i16> %a, <8 x i16> %b) { +; CT-LABEL: ct_v8i16: +; CT: @ %bb.0: @ %entry +; CT-NEXT: add r1, sp, #8 +; CT-NEXT: and r0, r0, #1 +; CT-NEXT: vld1.64 {d18, d19}, [r1] +; CT-NEXT: rsb r1, r0, #0 +; CT-NEXT: vldr d17, [sp] +; CT-NEXT: vmov d16, r2, r3 +; CT-NEXT: vdup.32 q11, r1 +; CT-NEXT: vand q10, q8, q11 +; CT-NEXT: vbic q11, q9, q11 +; CT-NEXT: vorr q10, q10, q11 +; CT-NEXT: vmov r0, r1, d20 +; CT-NEXT: vmov r2, r3, d21 +; CT-NEXT: bx lr +; +; DEFAULT-LABEL: ct_v8i16: +; DEFAULT: @ %bb.0: @ %entry +; DEFAULT-NEXT: push {r4, r5, r11, lr} +; DEFAULT-NEXT: and lr, r1, #1 +; DEFAULT-NEXT: ldrh r12, [sp, #68] +; DEFAULT-NEXT: ldrh r1, [sp, #36] +; DEFAULT-NEXT: rsb r5, lr, #0 +; DEFAULT-NEXT: and r4, r1, r5 +; DEFAULT-NEXT: bic r5, r12, r5 +; DEFAULT-NEXT: orr r4, r4, r5 +; DEFAULT-NEXT: ldrh r12, [sp, #64] +; DEFAULT-NEXT: ldrh r5, [sp, #32] +; DEFAULT-NEXT: rsb r1, lr, #0 +; DEFAULT-NEXT: strh r4, [r0, #14] +; DEFAULT-NEXT: and r4, r5, r1 +; DEFAULT-NEXT: bic r1, r12, r1 +; DEFAULT-NEXT: ldrh r12, [sp, #60] +; DEFAULT-NEXT: orr r4, r4, r1 +; DEFAULT-NEXT: ldrh r5, [sp, #28] +; DEFAULT-NEXT: rsb r1, lr, #0 +; DEFAULT-NEXT: strh r4, [r0, #12] +; DEFAULT-NEXT: and r4, r5, r1 +; DEFAULT-NEXT: bic r1, r12, r1 +; DEFAULT-NEXT: ldrh r12, [sp, #56] +; DEFAULT-NEXT: orr r4, r4, r1 +; DEFAULT-NEXT: ldrh r5, [sp, #24] +; DEFAULT-NEXT: rsb r1, lr, #0 +; DEFAULT-NEXT: strh r4, [r0, #10] +; DEFAULT-NEXT: and r4, r5, r1 +; DEFAULT-NEXT: bic r1, r12, r1 +; DEFAULT-NEXT: ldrh r12, [sp, #52] +; DEFAULT-NEXT: orr r4, r4, r1 +; DEFAULT-NEXT: ldrh r5, [sp, #20] +; DEFAULT-NEXT: rsb r1, lr, #0 +; DEFAULT-NEXT: strh r4, [r0, #8] +; DEFAULT-NEXT: and r4, r5, r1 +; DEFAULT-NEXT: bic r1, r12, r1 +; DEFAULT-NEXT: ldrh r12, [sp, #48] +; DEFAULT-NEXT: orr r4, r4, r1 +; DEFAULT-NEXT: ldrh r5, [sp, #16] +; DEFAULT-NEXT: rsb r1, lr, #0 +; DEFAULT-NEXT: strh r4, [r0, #6] +; DEFAULT-NEXT: and r4, r5, r1 +; DEFAULT-NEXT: bic r1, r12, r1 +; DEFAULT-NEXT: orr r4, r4, r1 +; DEFAULT-NEXT: ldrh r1, [sp, #44] +; DEFAULT-NEXT: strh r4, [r0, #4] +; DEFAULT-NEXT: rsb r4, lr, #0 +; DEFAULT-NEXT: and r5, r3, r4 +; DEFAULT-NEXT: bic r4, r1, r4 +; DEFAULT-NEXT: orr r5, r5, r4 +; DEFAULT-NEXT: ldrh r1, [sp, #40] +; DEFAULT-NEXT: strh r5, [r0, #2] +; DEFAULT-NEXT: rsb r5, lr, #0 +; DEFAULT-NEXT: and r3, r2, r5 +; DEFAULT-NEXT: bic r5, r1, r5 +; DEFAULT-NEXT: orr r3, r3, r5 +; DEFAULT-NEXT: strh r3, [r0] +; 
DEFAULT-NEXT: pop {r4, r5, r11, pc} +entry: + %sel = call <8 x i16> @llvm.ct.select.v8i16(i1 %cond, <8 x i16> %a, <8 x i16> %b) + ret <8 x i16> %sel +} + +define <4 x i32> @ct_v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b) { +; CT-LABEL: ct_v4i32: +; CT: @ %bb.0: @ %entry +; CT-NEXT: add r1, sp, #8 +; CT-NEXT: and r0, r0, #1 +; CT-NEXT: vld1.64 {d18, d19}, [r1] +; CT-NEXT: rsb r1, r0, #0 +; CT-NEXT: vldr d17, [sp] +; CT-NEXT: vmov d16, r2, r3 +; CT-NEXT: vdup.32 q11, r1 +; CT-NEXT: vand q10, q8, q11 +; CT-NEXT: vbic q11, q9, q11 +; CT-NEXT: vorr q10, q10, q11 +; CT-NEXT: vmov r0, r1, d20 +; CT-NEXT: vmov r2, r3, d21 +; CT-NEXT: bx lr +; +; DEFAULT-LABEL: ct_v4i32: +; DEFAULT: @ %bb.0: @ %entry +; DEFAULT-NEXT: push {r4, r5, r11, lr} +; DEFAULT-NEXT: and r12, r0, #1 +; DEFAULT-NEXT: ldr r1, [sp, #24] +; DEFAULT-NEXT: rsb lr, r12, #0 +; DEFAULT-NEXT: rsb r4, r12, #0 +; DEFAULT-NEXT: and r0, r2, lr +; DEFAULT-NEXT: bic lr, r1, lr +; DEFAULT-NEXT: ldr r2, [sp, #28] +; DEFAULT-NEXT: orr r0, r0, lr +; DEFAULT-NEXT: rsb lr, r12, #0 +; DEFAULT-NEXT: rsb r5, r12, #0 +; DEFAULT-NEXT: and r1, r3, lr +; DEFAULT-NEXT: bic lr, r2, lr +; DEFAULT-NEXT: orr r1, r1, lr +; DEFAULT-NEXT: ldr r3, [sp, #16] +; DEFAULT-NEXT: ldr lr, [sp, #32] +; DEFAULT-NEXT: and r2, r3, r4 +; DEFAULT-NEXT: bic r4, lr, r4 +; DEFAULT-NEXT: ldr lr, [sp, #36] +; DEFAULT-NEXT: orr r2, r2, r4 +; DEFAULT-NEXT: ldr r4, [sp, #20] +; DEFAULT-NEXT: and r3, r4, r5 +; DEFAULT-NEXT: bic r5, lr, r5 +; DEFAULT-NEXT: orr r3, r3, r5 +; DEFAULT-NEXT: pop {r4, r5, r11, pc} +entry: + %sel = call <4 x i32> @llvm.ct.select.v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b) + ret <4 x i32> %sel +} + +define <2 x i64> @ct_v2i64(i1 %cond, <2 x i64> %a, <2 x i64> %b) { +; CT-LABEL: ct_v2i64: +; CT: @ %bb.0: @ %entry +; CT-NEXT: add r1, sp, #8 +; CT-NEXT: and r0, r0, #1 +; CT-NEXT: vld1.64 {d18, d19}, [r1] +; CT-NEXT: rsb r1, r0, #0 +; CT-NEXT: vldr d17, [sp] +; CT-NEXT: vmov d16, r2, r3 +; CT-NEXT: vdup.32 q11, r1 +; CT-NEXT: vand q10, q8, q11 +; CT-NEXT: vbic q11, q9, q11 +; CT-NEXT: vorr q10, q10, q11 +; CT-NEXT: vmov r0, r1, d20 +; CT-NEXT: vmov r2, r3, d21 +; CT-NEXT: bx lr +; +; DEFAULT-LABEL: ct_v2i64: +; DEFAULT: @ %bb.0: @ %entry +; DEFAULT-NEXT: push {r4, r5, r11, lr} +; DEFAULT-NEXT: and r12, r0, #1 +; DEFAULT-NEXT: ldr r1, [sp, #24] +; DEFAULT-NEXT: rsb lr, r12, #0 +; DEFAULT-NEXT: rsb r4, r12, #0 +; DEFAULT-NEXT: and r0, r2, lr +; DEFAULT-NEXT: bic lr, r1, lr +; DEFAULT-NEXT: ldr r2, [sp, #28] +; DEFAULT-NEXT: orr r0, r0, lr +; DEFAULT-NEXT: rsb lr, r12, #0 +; DEFAULT-NEXT: rsb r5, r12, #0 +; DEFAULT-NEXT: and r1, r3, lr +; DEFAULT-NEXT: bic lr, r2, lr +; DEFAULT-NEXT: orr r1, r1, lr +; DEFAULT-NEXT: ldr r3, [sp, #16] +; DEFAULT-NEXT: ldr lr, [sp, #32] +; DEFAULT-NEXT: and r2, r3, r4 +; DEFAULT-NEXT: bic r4, lr, r4 +; DEFAULT-NEXT: ldr lr, [sp, #36] +; DEFAULT-NEXT: orr r2, r2, r4 +; DEFAULT-NEXT: ldr r4, [sp, #20] +; DEFAULT-NEXT: and r3, r4, r5 +; DEFAULT-NEXT: bic r5, lr, r5 +; DEFAULT-NEXT: orr r3, r3, r5 +; DEFAULT-NEXT: pop {r4, r5, r11, pc} +entry: + %sel = call <2 x i64> @llvm.ct.select.v2i64(i1 %cond, <2 x i64> %a, <2 x i64> %b) + ret <2 x i64> %sel +} + +define <4 x float> @ct_v4f32(i1 %cond, <4 x float> %a, <4 x float> %b) { +; CT-LABEL: ct_v4f32: +; CT: @ %bb.0: @ %entry +; CT-NEXT: add r1, sp, #8 +; CT-NEXT: and r0, r0, #1 +; CT-NEXT: vld1.64 {d18, d19}, [r1] +; CT-NEXT: rsb r1, r0, #0 +; CT-NEXT: vldr d17, [sp] +; CT-NEXT: vmov d16, r2, r3 +; CT-NEXT: vdup.32 q11, r1 +; CT-NEXT: vand q10, q8, q11 +; CT-NEXT: vbic q11, q9, q11 +; CT-NEXT: 
vorr q10, q10, q11 +; CT-NEXT: vmov r0, r1, d20 +; CT-NEXT: vmov r2, r3, d21 +; CT-NEXT: bx lr +; +; DEFAULT-LABEL: ct_v4f32: +; DEFAULT: @ %bb.0: @ %entry +; DEFAULT-NEXT: push {r4, r5, r11, lr} +; DEFAULT-NEXT: and r12, r0, #1 +; DEFAULT-NEXT: ldr r1, [sp, #24] +; DEFAULT-NEXT: rsb lr, r12, #0 +; DEFAULT-NEXT: rsb r4, r12, #0 +; DEFAULT-NEXT: and r0, r2, lr +; DEFAULT-NEXT: bic lr, r1, lr +; DEFAULT-NEXT: ldr r2, [sp, #28] +; DEFAULT-NEXT: orr r0, r0, lr +; DEFAULT-NEXT: rsb lr, r12, #0 +; DEFAULT-NEXT: rsb r5, r12, #0 +; DEFAULT-NEXT: and r1, r3, lr +; DEFAULT-NEXT: bic lr, r2, lr +; DEFAULT-NEXT: orr r1, r1, lr +; DEFAULT-NEXT: ldr r3, [sp, #16] +; DEFAULT-NEXT: ldr lr, [sp, #32] +; DEFAULT-NEXT: and r2, r3, r4 +; DEFAULT-NEXT: bic r4, lr, r4 +; DEFAULT-NEXT: ldr lr, [sp, #36] +; DEFAULT-NEXT: orr r2, r2, r4 +; DEFAULT-NEXT: ldr r4, [sp, #20] +; DEFAULT-NEXT: and r3, r4, r5 +; DEFAULT-NEXT: bic r5, lr, r5 +; DEFAULT-NEXT: orr r3, r3, r5 +; DEFAULT-NEXT: pop {r4, r5, r11, pc} +entry: + %sel = call <4 x float> @llvm.ct.select.v4f32(i1 %cond, <4 x float> %a, <4 x float> %b) + ret <4 x float> %sel +} + +define <2 x double> @ct_v2f64(i1 %cond, <2 x double> %a, <2 x double> %b) { +; CT-LABEL: ct_v2f64: +; CT: @ %bb.0: @ %entry +; CT-NEXT: add r1, sp, #8 +; CT-NEXT: and r0, r0, #1 +; CT-NEXT: vld1.64 {d18, d19}, [r1] +; CT-NEXT: rsb r1, r0, #0 +; CT-NEXT: vldr d17, [sp] +; CT-NEXT: vmov d16, r2, r3 +; CT-NEXT: vdup.32 q11, r1 +; CT-NEXT: vand q10, q8, q11 +; CT-NEXT: vbic q11, q9, q11 +; CT-NEXT: vorr q10, q10, q11 +; CT-NEXT: vmov r0, r1, d20 +; CT-NEXT: vmov r2, r3, d21 +; CT-NEXT: bx lr +; +; DEFAULT-LABEL: ct_v2f64: +; DEFAULT: @ %bb.0: @ %entry +; DEFAULT-NEXT: push {r4, r5, r11, lr} +; DEFAULT-NEXT: and r12, r0, #1 +; DEFAULT-NEXT: ldr r1, [sp, #24] +; DEFAULT-NEXT: rsb lr, r12, #0 +; DEFAULT-NEXT: rsb r4, r12, #0 +; DEFAULT-NEXT: and r0, r2, lr +; DEFAULT-NEXT: bic lr, r1, lr +; DEFAULT-NEXT: ldr r2, [sp, #28] +; DEFAULT-NEXT: orr r0, r0, lr +; DEFAULT-NEXT: rsb lr, r12, #0 +; DEFAULT-NEXT: rsb r5, r12, #0 +; DEFAULT-NEXT: and r1, r3, lr +; DEFAULT-NEXT: bic lr, r2, lr +; DEFAULT-NEXT: orr r1, r1, lr +; DEFAULT-NEXT: ldr r3, [sp, #16] +; DEFAULT-NEXT: ldr lr, [sp, #32] +; DEFAULT-NEXT: and r2, r3, r4 +; DEFAULT-NEXT: bic r4, lr, r4 +; DEFAULT-NEXT: ldr lr, [sp, #36] +; DEFAULT-NEXT: orr r2, r2, r4 +; DEFAULT-NEXT: ldr r4, [sp, #20] +; DEFAULT-NEXT: and r3, r4, r5 +; DEFAULT-NEXT: bic r5, lr, r5 +; DEFAULT-NEXT: orr r3, r3, r5 +; DEFAULT-NEXT: pop {r4, r5, r11, pc} +entry: + %sel = call <2 x double> @llvm.ct.select.v2f64(i1 %cond, <2 x double> %a, <2 x double> %b) + ret <2 x double> %sel +} + +; +; itty bitty vector type edge cases follow. these should be scalarised. 
+; +define <1 x i8> @ct_v1i8(i1 %cond, <1 x i8> %a, <1 x i8> %b) { +; CT-LABEL: ct_v1i8: +; CT: @ %bb.0: @ %entry +; CT-NEXT: and r3, r0, #1 +; CT-NEXT: rsb r12, r3, #0 +; CT-NEXT: and r0, r1, r12 +; CT-NEXT: bic r12, r2, r12 +; CT-NEXT: orr r0, r0, r12 +; CT-NEXT: bx lr +; +; DEFAULT-LABEL: ct_v1i8: +; DEFAULT: @ %bb.0: @ %entry +; DEFAULT-NEXT: and r3, r0, #1 +; DEFAULT-NEXT: rsb r12, r3, #0 +; DEFAULT-NEXT: and r0, r1, r12 +; DEFAULT-NEXT: bic r12, r2, r12 +; DEFAULT-NEXT: orr r0, r0, r12 +; DEFAULT-NEXT: bx lr +entry: + %sel = call <1 x i8> @llvm.ct.select.i8(i1 %cond, <1 x i8> %a, <1 x i8> %b) + ret <1 x i8> %sel +} + +define <2 x i8> @ct_v2i8(i1 %cond, <2 x i8> %a, <2 x i8> %b) { +; CT-LABEL: ct_v2i8: +; CT: @ %bb.0: @ %entry +; CT-NEXT: and r0, r0, #1 +; CT-NEXT: vldr d16, [sp] +; CT-NEXT: rsb r1, r0, #0 +; CT-NEXT: vmov d17, r2, r3 +; CT-NEXT: vdup.32 d19, r1 +; CT-NEXT: vand d18, d17, d19 +; CT-NEXT: vbic d19, d16, d19 +; CT-NEXT: vorr d18, d18, d19 +; CT-NEXT: vmov r0, r1, d18 +; CT-NEXT: bx lr +; +; DEFAULT-LABEL: ct_v2i8: +; DEFAULT: @ %bb.0: @ %entry +; DEFAULT-NEXT: push {r11, lr} +; DEFAULT-NEXT: and r12, r0, #1 +; DEFAULT-NEXT: rsb lr, r12, #0 +; DEFAULT-NEXT: and r0, r1, lr +; DEFAULT-NEXT: bic lr, r3, lr +; DEFAULT-NEXT: ldrb r3, [sp, #8] +; DEFAULT-NEXT: orr r0, r0, lr +; DEFAULT-NEXT: rsb lr, r12, #0 +; DEFAULT-NEXT: and r1, r2, lr +; DEFAULT-NEXT: bic lr, r3, lr +; DEFAULT-NEXT: orr r1, r1, lr +; DEFAULT-NEXT: pop {r11, pc} +entry: + %sel = call <2 x i8> @llvm.ct.select.i16(i1 %cond, <2 x i8> %a, <2 x i8> %b) + ret <2 x i8> %sel +} + +define <4 x i8> @ct_v4i8(i1 %cond, <4 x i8> %a, <4 x i8> %b) { +; CT-LABEL: ct_v4i8: +; CT: @ %bb.0: @ %entry +; CT-NEXT: and r0, r0, #1 +; CT-NEXT: vldr d16, [sp] +; CT-NEXT: rsb r1, r0, #0 +; CT-NEXT: vmov d17, r2, r3 +; CT-NEXT: vdup.32 d19, r1 +; CT-NEXT: vand d18, d17, d19 +; CT-NEXT: vbic d19, d16, d19 +; CT-NEXT: vorr d18, d18, d19 +; CT-NEXT: vmov r0, r1, d18 +; CT-NEXT: bx lr +; +; DEFAULT-LABEL: ct_v4i8: +; DEFAULT: @ %bb.0: @ %entry +; DEFAULT-NEXT: push {r4, r5, r11, lr} +; DEFAULT-NEXT: and r12, r0, #1 +; DEFAULT-NEXT: ldrb lr, [sp, #20] +; DEFAULT-NEXT: rsb r4, r12, #0 +; DEFAULT-NEXT: rsb r5, r12, #0 +; DEFAULT-NEXT: and r0, r1, r4 +; DEFAULT-NEXT: bic r4, lr, r4 +; DEFAULT-NEXT: orr r0, r0, r4 +; DEFAULT-NEXT: ldrb r4, [sp, #24] +; DEFAULT-NEXT: rsb lr, r12, #0 +; DEFAULT-NEXT: and r1, r2, lr +; DEFAULT-NEXT: bic lr, r4, lr +; DEFAULT-NEXT: ldrb r4, [sp, #28] +; DEFAULT-NEXT: orr r1, r1, lr +; DEFAULT-NEXT: rsb lr, r12, #0 +; DEFAULT-NEXT: and r2, r3, lr +; DEFAULT-NEXT: bic lr, r4, lr +; DEFAULT-NEXT: orr r2, r2, lr +; DEFAULT-NEXT: ldrb r4, [sp, #16] +; DEFAULT-NEXT: ldrb lr, [sp, #32] +; DEFAULT-NEXT: and r3, r4, r5 +; DEFAULT-NEXT: bic r5, lr, r5 +; DEFAULT-NEXT: orr r3, r3, r5 +; DEFAULT-NEXT: pop {r4, r5, r11, pc} +entry: + %sel = call <4 x i8> @llvm.ct.select.i32(i1 %cond, <4 x i8> %a, <4 x i8> %b) + ret <4 x i8> %sel +} + +define <1 x i16> @ct_v1i16(i1 %cond, <1 x i16> %a, <1 x i16> %b) { +; CT-LABEL: ct_v1i16: +; CT: @ %bb.0: @ %entry +; CT-NEXT: and r3, r0, #1 +; CT-NEXT: rsb r12, r3, #0 +; CT-NEXT: and r0, r1, r12 +; CT-NEXT: bic r12, r2, r12 +; CT-NEXT: orr r0, r0, r12 +; CT-NEXT: bx lr +; +; DEFAULT-LABEL: ct_v1i16: +; DEFAULT: @ %bb.0: @ %entry +; DEFAULT-NEXT: and r3, r0, #1 +; DEFAULT-NEXT: rsb r12, r3, #0 +; DEFAULT-NEXT: and r0, r1, r12 +; DEFAULT-NEXT: bic r12, r2, r12 +; DEFAULT-NEXT: orr r0, r0, r12 +; DEFAULT-NEXT: bx lr +entry: + %sel = call <1 x i16> @llvm.ct.select.i16(i1 %cond, <1 x i16> %a, <1 x 
i16> %b) + ret <1 x i16> %sel +} + +define <2 x i16> @ct_v2i16(i1 %cond, <2 x i16> %a, <2 x i16> %b) { +; CT-LABEL: ct_v2i16: +; CT: @ %bb.0: @ %entry +; CT-NEXT: and r0, r0, #1 +; CT-NEXT: vldr d16, [sp] +; CT-NEXT: rsb r1, r0, #0 +; CT-NEXT: vmov d17, r2, r3 +; CT-NEXT: vdup.32 d19, r1 +; CT-NEXT: vand d18, d17, d19 +; CT-NEXT: vbic d19, d16, d19 +; CT-NEXT: vorr d18, d18, d19 +; CT-NEXT: vmov r0, r1, d18 +; CT-NEXT: bx lr +; +; DEFAULT-LABEL: ct_v2i16: +; DEFAULT: @ %bb.0: @ %entry +; DEFAULT-NEXT: push {r11, lr} +; DEFAULT-NEXT: and r12, r0, #1 +; DEFAULT-NEXT: rsb lr, r12, #0 +; DEFAULT-NEXT: and r0, r1, lr +; DEFAULT-NEXT: bic lr, r3, lr +; DEFAULT-NEXT: ldrh r3, [sp, #8] +; DEFAULT-NEXT: orr r0, r0, lr +; DEFAULT-NEXT: rsb lr, r12, #0 +; DEFAULT-NEXT: and r1, r2, lr +; DEFAULT-NEXT: bic lr, r3, lr +; DEFAULT-NEXT: orr r1, r1, lr +; DEFAULT-NEXT: pop {r11, pc} +entry: + %sel = call <2 x i16> @llvm.ct.select.i32(i1 %cond, <2 x i16> %a, <2 x i16> %b) + ret <2 x i16> %sel +} + +define <1 x i32> @ct_v1i32(i1 %cond, <1 x i32> %a, <1 x i32> %b) { +; CT-LABEL: ct_v1i32: +; CT: @ %bb.0: @ %entry +; CT-NEXT: and r3, r0, #1 +; CT-NEXT: rsb r12, r3, #0 +; CT-NEXT: and r0, r1, r12 +; CT-NEXT: bic r12, r2, r12 +; CT-NEXT: orr r0, r0, r12 +; CT-NEXT: bx lr +; +; DEFAULT-LABEL: ct_v1i32: +; DEFAULT: @ %bb.0: @ %entry +; DEFAULT-NEXT: and r3, r0, #1 +; DEFAULT-NEXT: rsb r12, r3, #0 +; DEFAULT-NEXT: and r0, r1, r12 +; DEFAULT-NEXT: bic r12, r2, r12 +; DEFAULT-NEXT: orr r0, r0, r12 +; DEFAULT-NEXT: bx lr +entry: + %sel = call <1 x i32> @llvm.ct.select.i32(i1 %cond, <1 x i32> %a, <1 x i32> %b) + ret <1 x i32> %sel +} + +define <1 x float> @ct_v1f32(i1 %cond, <1 x float> %a, <1 x float> %b) { +; CT-LABEL: ct_v1f32: +; CT: @ %bb.0: @ %entry +; CT-NEXT: vmov s0, r2 +; CT-NEXT: and r0, r0, #1 +; CT-NEXT: vmov s2, r1 +; CT-NEXT: rsb r1, r0, #0 +; CT-NEXT: vmov r3, s0 +; CT-NEXT: vmov r2, s2 +; CT-NEXT: and r2, r2, r1 +; CT-NEXT: bic r1, r3, r1 +; CT-NEXT: orr r2, r2, r1 +; CT-NEXT: vmov s4, r2 +; CT-NEXT: vmov r0, s4 +; CT-NEXT: bx lr +; +; DEFAULT-LABEL: ct_v1f32: +; DEFAULT: @ %bb.0: @ %entry +; DEFAULT-NEXT: and r3, r0, #1 +; DEFAULT-NEXT: rsb r12, r3, #0 +; DEFAULT-NEXT: and r0, r1, r12 +; DEFAULT-NEXT: bic r12, r2, r12 +; DEFAULT-NEXT: orr r0, r0, r12 +; DEFAULT-NEXT: bx lr +entry: + %sel = call <1 x float> @llvm.ct.select.f32(i1 %cond, <1 x float> %a, <1 x float> %b) + ret <1 x float> %sel +} diff --git a/llvm/test/CodeGen/ARM/ctselect.ll b/llvm/test/CodeGen/ARM/ctselect.ll index 5813796c65a52..e054f99cf0db8 100644 --- a/llvm/test/CodeGen/ARM/ctselect.ll +++ b/llvm/test/CodeGen/ARM/ctselect.ll @@ -1,31 +1,48 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=armv7-none-eabi -verify-machineinstrs | FileCheck --check-prefixes=CT %s -; RUN: llc < %s -mtriple=armv6 -mattr=+ctselect -verify-machineinstrs | FileCheck --check-prefix=TEST-CT %s ; RUN: llc < %s -mtriple=armv6 -verify-machineinstrs | FileCheck --check-prefix=DEFAULT %s +define i1 @ct_i1(i1 %cond, i1 %a, i1 %b) { +; CT-LABEL: ct_i1: +; CT: @ %bb.0: @ %entry +; CT-NEXT: and r3, r0, #1 +; CT-NEXT: rsb r12, r3, #0 +; CT-NEXT: and r0, r1, r12 +; CT-NEXT: bic r12, r2, r12 +; CT-NEXT: orr r0, r0, r12 +; CT-NEXT: bx lr +; +; DEFAULT-LABEL: ct_i1: +; DEFAULT: @ %bb.0: @ %entry +; DEFAULT-NEXT: and r3, r0, #1 +; DEFAULT-NEXT: rsb r12, r3, #0 +; DEFAULT-NEXT: and r0, r1, r12 +; DEFAULT-NEXT: bic r12, r2, r12 +; DEFAULT-NEXT: orr r0, r0, r12 +; DEFAULT-NEXT: bx lr +entry: + %sel 
= call i1 @llvm.ct.select.i1(i1 %cond, i1 %a, i1 %b)
+  ret i1 %sel
+}
+
 define i8 @ct_int8(i1 %cond, i8 %a, i8 %b) {
 ; CT-LABEL: ct_int8:
-; CT: and
-; CT: sub
-; CT: rsb
-; CT-NEXT: and
-; CT-NEXT: and
-; CT-NEXT: orr
-; CT-NOT: b{{eq|ne|lt|gt|le|ge}}
-; CT-NOT: j
-; CT-NOT: mov
-; CT-NOT: ldr
-
-; TEST-CT: and
-; TEST-CT: sub
-; TEST-CT: rsb
-; TEST-CT-NEXT: and
-; TEST-CT-NEXT: and
-; TEST-CT-NEXT: orr
-; TEST-CT-NOT: b{{eq|ne|lt|gt|le|ge}}
-; TEST-CT-NOT: j
-; TEST-CT-NOT: mov
-
-; DEFAULT: {{mov|ldr}}
+; CT: @ %bb.0: @ %entry
+; CT-NEXT: and r3, r0, #1
+; CT-NEXT: rsb r12, r3, #0
+; CT-NEXT: and r0, r1, r12
+; CT-NEXT: bic r12, r2, r12
+; CT-NEXT: orr r0, r0, r12
+; CT-NEXT: bx lr
+;
+; DEFAULT-LABEL: ct_int8:
+; DEFAULT: @ %bb.0: @ %entry
+; DEFAULT-NEXT: and r3, r0, #1
+; DEFAULT-NEXT: rsb r12, r3, #0
+; DEFAULT-NEXT: and r0, r1, r12
+; DEFAULT-NEXT: bic r12, r2, r12
+; DEFAULT-NEXT: orr r0, r0, r12
+; DEFAULT-NEXT: bx lr
 entry:
   %sel = call i8 @llvm.ct.select.i8(i1 %cond, i8 %a, i8 %b)
   ret i8 %sel
@@ -33,28 +50,22 @@ entry:
 
 define i16 @ct_int16(i1 %cond, i16 %a, i16 %b) {
 ; CT-LABEL: ct_int16:
-; CT: and
-; CT: sub
-; CT: rsb
-; CT-NEXT: and
-; CT-NEXT: and
-; CT-NEXT: orr
-; CT-NOT: b{{eq|ne|lt|gt|le|ge}}
-; CT-NOT: j
-; CT-NOT: mov
-; CT-NOT: ldr
-
-; TEST-CT: and
-; TEST-CT: sub
-; TEST-CT: rsb
-; TEST-CT-NEXT: and
-; TEST-CT-NEXT: and
-; TEST-CT-NEXT: orr
-; TEST-CT-NOT: b{{eq|ne|lt|gt|le|ge}}
-; TEST-CT-NOT: j
-; TEST-CT-NOT: mov
-
-; DEFAULT: {{mov|ldr}}
+; CT: @ %bb.0: @ %entry
+; CT-NEXT: and r3, r0, #1
+; CT-NEXT: rsb r12, r3, #0
+; CT-NEXT: and r0, r1, r12
+; CT-NEXT: bic r12, r2, r12
+; CT-NEXT: orr r0, r0, r12
+; CT-NEXT: bx lr
+;
+; DEFAULT-LABEL: ct_int16:
+; DEFAULT: @ %bb.0: @ %entry
+; DEFAULT-NEXT: and r3, r0, #1
+; DEFAULT-NEXT: rsb r12, r3, #0
+; DEFAULT-NEXT: and r0, r1, r12
+; DEFAULT-NEXT: bic r12, r2, r12
+; DEFAULT-NEXT: orr r0, r0, r12
+; DEFAULT-NEXT: bx lr
 entry:
   %sel = call i16 @llvm.ct.select.i16(i1 %cond, i16 %a, i16 %b)
   ret i16 %sel
@@ -62,28 +73,22 @@ entry:
 
 define i32 @ct_int32(i1 %cond, i32 %a, i32 %b) {
 ; CT-LABEL: ct_int32:
-; CT: and
-; CT: sub
-; CT: rsb
-; CT-NEXT: and
-; CT-NEXT: and
-; CT-NEXT: orr
-; CT-NOT: b{{eq|ne|lt|gt|le|ge}}
-; CT-NOT: j
-; CT-NOT: mov
-; CT-NOT: ldr
-
-; TEST-CT: and
-; TEST-CT: sub
-; TEST-CT: rsb
-; TEST-CT-NEXT: and
-; TEST-CT-NEXT: and
-; TEST-CT-NEXT: orr
-; TEST-CT-NOT: b{{eq|ne|lt|gt|le|ge}}
-; TEST-CT-NOT: j
-; TEST-CT-NOT: mov
-
-; DEFAULT: {{mov|ldr}}
+; CT: @ %bb.0: @ %entry
+; CT-NEXT: and r3, r0, #1
+; CT-NEXT: rsb r12, r3, #0
+; CT-NEXT: and r0, r1, r12
+; CT-NEXT: bic r12, r2, r12
+; CT-NEXT: orr r0, r0, r12
+; CT-NEXT: bx lr
+;
+; DEFAULT-LABEL: ct_int32:
+; DEFAULT: @ %bb.0: @ %entry
+; DEFAULT-NEXT: and r3, r0, #1
+; DEFAULT-NEXT: rsb r12, r3, #0
+; DEFAULT-NEXT: and r0, r1, r12
+; DEFAULT-NEXT: bic r12, r2, r12
+; DEFAULT-NEXT: orr r0, r0, r12
+; DEFAULT-NEXT: bx lr
 entry:
   %sel = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b)
   ret i32 %sel
@@ -91,23 +96,37 @@ entry:
 
 define i64 @ct_int64(i1 %cond, i64 %a, i64 %b) {
 ; CT-LABEL: ct_int64:
-; CT: sub
-; CT: rsb
-; CT: and
-; CT: and
-; CT: and
-; CT-NEXT: and
-; CT-NEXT: orr
-; CT-NOT: b{{eq|ne|lt|gt|le|ge}}
-; CT-NOT: j
-; CT-NOT: mov
-; CT-NOT: ldr
-
-; TEST-CT-NOT: b{{eq|ne|lt|gt|le|ge}}
-; TEST-CT-NOT: j
-; TEST-CT-NOT: mov
-
-; DEFAULT: {{mov|ldr}}
+; CT: @ %bb.0: @ %entry
+; CT-NEXT: .save {r4, lr}
+; CT-NEXT: push {r4, lr}
+; CT-NEXT: and lr, r0, #1
+; CT-NEXT: ldr r12, [sp, #12]
+; CT-NEXT: rsb r4, lr, #0
+; CT-NEXT: ldr r1, [sp, #8]
+; 
CT-NEXT: and r0, r2, r4 +; CT-NEXT: rsb r2, lr, #0 +; CT-NEXT: bic r4, r1, r4 +; CT-NEXT: and r1, r3, r2 +; CT-NEXT: bic r2, r12, r2 +; CT-NEXT: orr r0, r0, r4 +; CT-NEXT: orr r1, r1, r2 +; CT-NEXT: pop {r4, pc} +; +; DEFAULT-LABEL: ct_int64: +; DEFAULT: @ %bb.0: @ %entry +; DEFAULT-NEXT: push {r11, lr} +; DEFAULT-NEXT: and r12, r0, #1 +; DEFAULT-NEXT: ldr r1, [sp, #8] +; DEFAULT-NEXT: rsb lr, r12, #0 +; DEFAULT-NEXT: and r0, r2, lr +; DEFAULT-NEXT: bic lr, r1, lr +; DEFAULT-NEXT: ldr r2, [sp, #12] +; DEFAULT-NEXT: orr r0, r0, lr +; DEFAULT-NEXT: rsb lr, r12, #0 +; DEFAULT-NEXT: and r1, r3, lr +; DEFAULT-NEXT: bic lr, r2, lr +; DEFAULT-NEXT: orr r1, r1, lr +; DEFAULT-NEXT: pop {r11, pc} entry: %sel = call i64 @llvm.ct.select.i64(i1 %cond, i64 %a, i64 %b) ret i64 %sel @@ -115,22 +134,28 @@ entry: define float @ct_float(i1 %cond, float %a, float %b) { ; CT-LABEL: ct_float: -; CT: and -; CT: sub -; CT: rsb -; CT-NEXT: and -; CT-NEXT: and -; CT-NEXT: orr -; CT-NOT: b{{eq|ne|lt|gt|le|ge}} -; CT-NOT: j -; CT-NOT: mov -; CT-NOT: ldr - -; TEST-CT-NOT: b{{eq|ne|lt|gt|le|ge}} -; TEST-CT-NOT: j -; TEST-CT-NOT: mov - -; DEFAULT: {{mov|ldr}} +; CT: @ %bb.0: @ %entry +; CT-NEXT: vmov s0, r2 +; CT-NEXT: and r0, r0, #1 +; CT-NEXT: vmov s2, r1 +; CT-NEXT: rsb r1, r0, #0 +; CT-NEXT: vmov r3, s0 +; CT-NEXT: vmov r2, s2 +; CT-NEXT: and r2, r2, r1 +; CT-NEXT: bic r1, r3, r1 +; CT-NEXT: orr r2, r2, r1 +; CT-NEXT: vmov s4, r2 +; CT-NEXT: vmov r0, s4 +; CT-NEXT: bx lr +; +; DEFAULT-LABEL: ct_float: +; DEFAULT: @ %bb.0: @ %entry +; DEFAULT-NEXT: and r3, r0, #1 +; DEFAULT-NEXT: rsb r12, r3, #0 +; DEFAULT-NEXT: and r0, r1, r12 +; DEFAULT-NEXT: bic r12, r2, r12 +; DEFAULT-NEXT: orr r0, r0, r12 +; DEFAULT-NEXT: bx lr entry: %sel = call float @llvm.ct.select.f32(i1 %cond, float %a, float %b) ret float %sel @@ -138,126 +163,34 @@ entry: define double @ct_f64(i1 %cond, double %a, double %b) { ; CT-LABEL: ct_f64: -; CT: vand -; CT-NEXT: vldr -; CT-NEXT: vneg -; CT-NEXT: vbsl -; CT-NOT: ldr -; CT-NOT: vldr -; CT-NOT: b{{eq|ne|lt|gt|le|ge}} -; CT-NOT: j - -; TEST-CT-NOT: b{{eq|ne|lt|gt|le|ge}} -; TEST-CT-NOT: j -; TEST-CT-NOT: mov - -; DEFAULT: {{mov|ldr|vldr}} +; CT: @ %bb.0: @ %entry +; CT-NEXT: and r0, r0, #1 +; CT-NEXT: vldr d16, [sp] +; CT-NEXT: rsb r1, r0, #0 +; CT-NEXT: vmov d17, r2, r3 +; CT-NEXT: vdup.32 d19, r1 +; CT-NEXT: vand d18, d17, d19 +; CT-NEXT: vbic d19, d16, d19 +; CT-NEXT: vorr d18, d18, d19 +; CT-NEXT: vmov r0, r1, d18 +; CT-NEXT: bx lr +; +; DEFAULT-LABEL: ct_f64: +; DEFAULT: @ %bb.0: @ %entry +; DEFAULT-NEXT: push {r11, lr} +; DEFAULT-NEXT: and r12, r0, #1 +; DEFAULT-NEXT: ldr r1, [sp, #8] +; DEFAULT-NEXT: rsb lr, r12, #0 +; DEFAULT-NEXT: and r0, r2, lr +; DEFAULT-NEXT: bic lr, r1, lr +; DEFAULT-NEXT: ldr r2, [sp, #12] +; DEFAULT-NEXT: orr r0, r0, lr +; DEFAULT-NEXT: rsb lr, r12, #0 +; DEFAULT-NEXT: and r1, r3, lr +; DEFAULT-NEXT: bic lr, r2, lr +; DEFAULT-NEXT: orr r1, r1, lr +; DEFAULT-NEXT: pop {r11, pc} entry: %sel = call double @llvm.ct.select.f64(i1 %cond, double %a, double %b) ret double %sel } - -define <8 x i8> @ct_v8i8(i1 %cond, <8 x i8> %a, <8 x i8> %b) { -; CT-LABEL: ct_v8i8: -; CT: vand -; CT-NEXT: vldr -; CT-NEXT: vneg -; CT-NEXT: vbsl -; CT-NOT: ldr -; CT-NOT: vldr -; CT-NOT: b{{eq|ne|lt|gt|le|ge}} -; CT-NOT: j - -; TEST-CT-NOT: b{{eq|ne|lt|gt|le|ge}} -; TEST-CT-NOT: j -; TEST-CT-NOT: mov - -; DEFAULT: {{mov|ldr|vldr}} -entry: - %sel = call <8 x i8> @llvm.ct.select.v8i8(i1 %cond, <8 x i8> %a, <8 x i8> %b) - ret <8 x i8> %sel -} - -define <4 x i16> @ct_v4i16(i1 %cond, <4 x i16> %a, <4 x 
i16> %b) { -; CT-LABEL: ct_v4i16: -; CT: vand -; CT-NEXT: vldr -; CT-NEXT: vneg -; CT-NEXT: vbsl -; CT-NOT: ldr -; CT-NOT: vldr -; CT-NOT: b{{eq|ne|lt|gt|le|ge}} -; CT-NOT: j - -; TEST-CT-NOT: b{{eq|ne|lt|gt|le|ge}} -; TEST-CT-NOT: j -; TEST-CT-NOT: mov - -; DEFAULT: {{mov|ldr|vldr}} -entry: - %sel = call <4 x i16> @llvm.ct.select.v4i16(i1 %cond, <4 x i16> %a, <4 x i16> %b) - ret <4 x i16> %sel -} - -; Technically this should be handled the exact same as double. -define <2 x i32> @ct_v2i32(i1 %cond, <2 x i32> %a, <2 x i32> %b) { -; CT-LABEL: ct_v2i32: -; CT: vand -; CT-NEXT: vldr -; CT-NEXT: vneg -; CT-NEXT: vbsl -; CT-NOT: ldr -; CT-NOT: vldr -; CT-NOT: b{{eq|ne|lt|gt|le|ge}} -; CT-NOT: j - -; TEST-CT-NOT: b{{eq|ne|lt|gt|le|ge}} -; TEST-CT-NOT: j -; TEST-CT-NOT: mov - -; DEFAULT: {{mov|ldr|vldr}} -entry: - %sel = call <2 x i32> @llvm.ct.select.v2i32(i1 %cond, <2 x i32> %a, <2 x i32> %b) - ret <2 x i32> %sel -} - -define <2 x float> @ct_v2f32(i1 %cond, <2 x float> %a, <2 x float> %b) { -; CT-LABEL: ct_v2f32: -; CT: vand -; CT-NEXT: vldr -; CT-NEXT: vneg -; CT-NEXT: vbsl -; CT-NOT: ldr -; CT-NOT: vldr -; CT-NOT: b{{eq|ne|lt|gt|le|ge}} -; CT-NOT: j - -; TEST-CT-NOT: b{{eq|ne|lt|gt|le|ge}} -; TEST-CT-NOT: j -; TEST-CT-NOT: mov - -; DEFAULT: {{mov|ldr|vldr}} -entry: - %sel = call <2 x float> @llvm.ct.select.v2f32(i1 %cond, <2 x float> %a, <2 x float> %b) - ret <2 x float> %sel -} - -define <4 x float> @ct_v4f32(i1 %cond, <4 x float> %a, <4 x float> %b) { -; CT-LABEL: ct_v4f32: -; CT: vand -; CT: vldr -; CT: vneg -; CT: vbsl -; CT-NOT: ldr -; CT-NOT: b{{eq|ne|lt|gt|le|ge}} -; CT-NOT: j - -; TEST-CT-NOT: b{{eq|ne|lt|gt|le|ge}} -; TEST-CT-NOT: j -; TEST-CT-NOT: mov - -; DEFAULT: {{mov|ldr|vldr}} -entry: - %sel = call <4 x float> @llvm.ct.select.v4f32(i1 %cond, <4 x float> %a, <4 x float> %b) - ret <4 x float> %sel -} \ No newline at end of file diff --git a/llvm/test/CodeGen/X86/ctselect-edge-cases.ll b/llvm/test/CodeGen/X86/ctselect-edge-cases.ll index 06791a3262749..fb6b4706d62d8 100644 --- a/llvm/test/CodeGen/X86/ctselect-edge-cases.ll +++ b/llvm/test/CodeGen/X86/ctselect-edge-cases.ll @@ -13,11 +13,17 @@ define i128 @test_ctselect_i128(i1 %cond, i128 %a, i128 %b) { ; X64-NEXT: cmovneq %rsi, %rax ; X64-NEXT: cmovneq %rdx, %r8 ; X64-NEXT: movq %r8, %rdx -; X64: retq +; X64-NEXT: retq ; ; X32-LABEL: test_ctselect_i128: ; X32: # %bb.0: -; X32: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: pushl %edi +; X32-NEXT: .cfi_def_cfa_offset 8 +; X32-NEXT: pushl %esi +; X32-NEXT: .cfi_def_cfa_offset 12 +; X32-NEXT: .cfi_offset %esi, -12 +; X32-NEXT: .cfi_offset %edi, -8 +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: movl {{[0-9]+}}(%esp), %edx ; X32-NEXT: movl {{[0-9]+}}(%esp), %esi @@ -27,11 +33,15 @@ define i128 @test_ctselect_i128(i1 %cond, i128 %a, i128 %b) { ; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %edi ; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %ecx ; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %edx -; X32-NEXT: movl %edx, {{[0-9]+}}(%eax) -; X32-NEXT: movl %ecx, {{[0-9]+}}(%eax) -; X32-NEXT: movl %edi, {{[0-9]+}}(%eax) +; X32-NEXT: movl %edx, 12(%eax) +; X32-NEXT: movl %ecx, 8(%eax) +; X32-NEXT: movl %edi, 4(%eax) ; X32-NEXT: movl %esi, (%eax) -; X32: retl $4 +; X32-NEXT: popl %esi +; X32-NEXT: .cfi_def_cfa_offset 8 +; X32-NEXT: popl %edi +; X32-NEXT: .cfi_def_cfa_offset 4 +; X32-NEXT: retl $4 %result = call i128 @llvm.ct.select.i128(i1 %cond, i128 %a, i128 %b) ret i128 %result } @@ -43,14 +53,16 @@ define i1 @test_ctselect_i1(i1 %cond, i1 %a, i1 %b) { ; X64-NEXT: movl %edx, 
%eax ; X64-NEXT: testb $1, %dil ; X64-NEXT: cmovnel %esi, %eax -; X64: retq +; X64-NEXT: # kill: def $al killed $al killed $eax +; X64-NEXT: retq ; ; X32-LABEL: test_ctselect_i1: ; X32: # %bb.0: ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: testb $1, {{[0-9]+}}(%esp) ; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax -; X32: retl +; X32-NEXT: # kill: def $al killed $al killed $eax +; X32-NEXT: retl %result = call i1 @llvm.ct.select.i1(i1 %cond, i1 %a, i1 %b) ret i1 %result } @@ -60,16 +72,16 @@ define i32 @test_ctselect_extremal_values(i1 %cond) { ; X64-LABEL: test_ctselect_extremal_values: ; X64: # %bb.0: ; X64-NEXT: testb $1, %dil -; X64-NEXT: movl $2147483647, %ecx -; X64-NEXT: movl $-2147483648, %eax +; X64-NEXT: movl $2147483647, %ecx # imm = 0x7FFFFFFF +; X64-NEXT: movl $-2147483648, %eax # imm = 0x80000000 ; X64-NEXT: cmovnel %ecx, %eax ; X64-NEXT: retq ; ; X32-LABEL: test_ctselect_extremal_values: ; X32: # %bb.0: ; X32-NEXT: testb $1, {{[0-9]+}}(%esp) -; X32-NEXT: movl $2147483647, %ecx -; X32-NEXT: movl $-2147483648, %eax +; X32-NEXT: movl $2147483647, %ecx # imm = 0x7FFFFFFF +; X32-NEXT: movl $-2147483648, %eax # imm = 0x80000000 ; X32-NEXT: cmovnel %ecx, %eax ; X32-NEXT: retl %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 2147483647, i32 -2147483648) @@ -81,8 +93,8 @@ define float @test_ctselect_f32_special_values(i1 %cond) { ; X64-LABEL: test_ctselect_f32_special_values: ; X64: # %bb.0: ; X64-NEXT: testb $1, %dil -; X64-NEXT: movl $2143289344, %eax -; X64-NEXT: movl $2139095040, %ecx +; X64-NEXT: movl $2143289344, %eax # imm = 0x7FC00000 +; X64-NEXT: movl $2139095040, %ecx # imm = 0x7F800000 ; X64-NEXT: cmovnel %eax, %ecx ; X64-NEXT: movd %ecx, %xmm0 ; X64-NEXT: retq @@ -90,8 +102,8 @@ define float @test_ctselect_f32_special_values(i1 %cond) { ; X32-LABEL: test_ctselect_f32_special_values: ; X32: # %bb.0: ; X32-NEXT: testb $1, {{[0-9]+}}(%esp) -; X32-NEXT: flds .LCPI3_0 -; X32-NEXT: flds .LCPI3_1 +; X32-NEXT: flds {{\.?LCPI[0-9]+_[0-9]+}} +; X32-NEXT: flds {{\.?LCPI[0-9]+_[0-9]+}} ; X32-NEXT: jne .LBB3_2 ; X32-NEXT: # %bb.1: ; X32-NEXT: fstp %st(1) @@ -107,8 +119,8 @@ define double @test_ctselect_f64_special_values(i1 %cond) { ; X64-LABEL: test_ctselect_f64_special_values: ; X64: # %bb.0: ; X64-NEXT: testb $1, %dil -; X64-NEXT: movabsq $9221120237041090560, %rax -; X64-NEXT: movabsq $9218868437227405312, %rcx +; X64-NEXT: movabsq $9221120237041090560, %rax # imm = 0x7FF8000000000000 +; X64-NEXT: movabsq $9218868437227405312, %rcx # imm = 0x7FF0000000000000 ; X64-NEXT: cmovneq %rax, %rcx ; X64-NEXT: movq %rcx, %xmm0 ; X64-NEXT: retq @@ -116,8 +128,8 @@ define double @test_ctselect_f64_special_values(i1 %cond) { ; X32-LABEL: test_ctselect_f64_special_values: ; X32: # %bb.0: ; X32-NEXT: testb $1, {{[0-9]+}}(%esp) -; X32-NEXT: flds .LCPI4_0 -; X32-NEXT: flds .LCPI4_1 +; X32-NEXT: flds {{\.?LCPI[0-9]+_[0-9]+}} +; X32-NEXT: flds {{\.?LCPI[0-9]+_[0-9]+}} ; X32-NEXT: jne .LBB4_2 ; X32-NEXT: # %bb.1: ; X32-NEXT: fstp %st(1) @@ -267,9 +279,9 @@ define ptr @test_ctselect_struct_ptr(i1 %cond, ptr %a, ptr %b) { define i32 @test_ctselect_deeply_nested(i1 %c1, i1 %c2, i1 %c3, i1 %c4, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e) { ; X64-LABEL: test_ctselect_deeply_nested: ; X64: # %bb.0: -; X64-NEXT: movl 24(%rsp), %eax -; X64-NEXT: movl 16(%rsp), %r10d -; X64-NEXT: movl 8(%rsp), %r11d +; X64-NEXT: movl {{[0-9]+}}(%rsp), %eax +; X64-NEXT: movl {{[0-9]+}}(%rsp), %r10d +; X64-NEXT: movl {{[0-9]+}}(%rsp), %r11d ; X64-NEXT: testb $1, %dil ; X64-NEXT: cmovnel %r8d, %r9d ; X64-NEXT: testb $1, %sil @@ 
-283,7 +295,9 @@ define i32 @test_ctselect_deeply_nested(i1 %c1, i1 %c2, i1 %c3, i1 %c4, i32 %a, ; X32-LABEL: test_ctselect_deeply_nested: ; X32: # %bb.0: ; X32-NEXT: pushl %esi -; X32: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: .cfi_def_cfa_offset 8 +; X32-NEXT: .cfi_offset %esi, -8 +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: movl {{[0-9]+}}(%esp), %edx ; X32-NEXT: movl {{[0-9]+}}(%esp), %esi @@ -296,7 +310,8 @@ define i32 @test_ctselect_deeply_nested(i1 %c1, i1 %c2, i1 %c3, i1 %c4, i32 %a, ; X32-NEXT: testb $1, {{[0-9]+}}(%esp) ; X32-NEXT: cmovnel %ecx, %eax ; X32-NEXT: popl %esi -; X32: retl +; X32-NEXT: .cfi_def_cfa_offset 4 +; X32-NEXT: retl %sel1 = call i32 @llvm.ct.select.i32(i1 %c1, i32 %a, i32 %b) %sel2 = call i32 @llvm.ct.select.i32(i1 %c2, i32 %sel1, i32 %c) %sel3 = call i32 @llvm.ct.select.i32(i1 %c3, i32 %sel2, i32 %d) diff --git a/llvm/test/CodeGen/X86/ctselect-vector.ll b/llvm/test/CodeGen/X86/ctselect-vector.ll index a3a7934d2bb92..0e53a8324e5ce 100644 --- a/llvm/test/CodeGen/X86/ctselect-vector.ll +++ b/llvm/test/CodeGen/X86/ctselect-vector.ll @@ -2,7 +2,6 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2 | FileCheck %s --check-prefix=SSE2 ; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx | FileCheck %s --check-prefix=AVX ; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2 | FileCheck %s --check-prefix=AVX2 -// ; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 ; Test ct.select functionality for vector types diff --git a/llvm/test/CodeGen/X86/ctselect.ll b/llvm/test/CodeGen/X86/ctselect.ll index 71a847f00d166..580253b27d44e 100644 --- a/llvm/test/CodeGen/X86/ctselect.ll +++ b/llvm/test/CodeGen/X86/ctselect.ll @@ -10,7 +10,7 @@ define i8 @test_ctselect_i8(i1 %cond, i8 %a, i8 %b) { ; X64-NEXT: movl %edx, %eax ; X64-NEXT: testb $1, %dil ; X64-NEXT: cmovnel %esi, %eax -; X64-NEXT: # kill: def $al killed $al killed $eax +; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq ; ; X32-LABEL: test_ctselect_i8: @@ -18,7 +18,8 @@ define i8 @test_ctselect_i8(i1 %cond, i8 %a, i8 %b) { ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: testb $1, {{[0-9]+}}(%esp) ; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax -; X32: retl +; X32-NEXT: # kill: def $al killed $al killed $eax +; X32-NEXT: retl %result = call i8 @llvm.ct.select.i8(i1 %cond, i8 %a, i8 %b) ret i8 %result } @@ -29,7 +30,7 @@ define i16 @test_ctselect_i16(i1 %cond, i16 %a, i16 %b) { ; X64-NEXT: movl %edx, %eax ; X64-NEXT: testb $1, %dil ; X64-NEXT: cmovnel %esi, %eax -; X64-NEXT: # kill: def $ax killed $ax killed $eax +; X64-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NEXT: retq ; ; X32-LABEL: test_ctselect_i16: @@ -37,7 +38,7 @@ define i16 @test_ctselect_i16(i1 %cond, i16 %a, i16 %b) { ; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X32-NEXT: testb $1, {{[0-9]+}}(%esp) ; X32-NEXT: cmovnew {{[0-9]+}}(%esp), %ax -; X32: retl +; X32-NEXT: retl %result = call i16 @llvm.ct.select.i16(i1 %cond, i16 %a, i16 %b) ret i16 %result } @@ -48,14 +49,14 @@ define i32 @test_ctselect_i32(i1 %cond, i32 %a, i32 %b) { ; X64-NEXT: movl %edx, %eax ; X64-NEXT: testb $1, %dil ; X64-NEXT: cmovnel %esi, %eax -; X64: retq +; X64-NEXT: retq ; ; X32-LABEL: test_ctselect_i32: ; X32: # %bb.0: ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: testb $1, {{[0-9]+}}(%esp) ; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax -; X32: retl +; X32-NEXT: retl %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) 
ret i32 %result } From 384b8fe4be3a0bd3ea9fe06db8eb1545a32b4fd3 Mon Sep 17 00:00:00 2001 From: Kelly Kaoudis Date: Wed, 10 Sep 2025 17:22:50 -0600 Subject: [PATCH 37/63] [CT] code quality improvements to ARM implementation (#43) * splits vector and scalar handling for ctselect pseudo expansion --- llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp | 182 ++++++++++++----------- llvm/lib/Target/ARM/ARMBaseInstrInfo.h | 2 + 2 files changed, 101 insertions(+), 83 deletions(-) diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp index b1e2c843a0cf2..027cfb95b7f51 100644 --- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp +++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -1526,46 +1526,94 @@ void ARMBaseInstrInfo::expandMEMCPY(MachineBasicBlock::iterator MI) const { BB->erase(MI); } -// Expands the ctselect pseudo, post-RA. -bool ARMBaseInstrInfo::expandCtSelect(MachineInstr &MI) const { +// Expands the ctselect pseudo for vector operands, post-RA. +bool ARMBaseInstrInfo::expandCtSelectVector(MachineInstr &MI) const { MachineBasicBlock *MBB = MI.getParent(); DebugLoc DL = MI.getDebugLoc(); Register DestReg = MI.getOperand(0).getReg(); Register MaskReg = MI.getOperand(1).getReg(); - const TargetRegisterInfo *TRI = &getRegisterInfo(); - const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(DestReg); - - Register DestRegSavedRef = DestReg; - Register VectorMaskReg = 0; - Register Src1Reg, Src2Reg, CondReg; // These operations will differ by operand register size. - unsigned AndOp = ARM::ANDrr; - unsigned BicOp = ARM::BICrr; - unsigned OrrOp = ARM::ORRrr; + unsigned AndOp = ARM::VANDd; + unsigned BicOp = ARM::VBICd; + unsigned OrrOp = ARM::VORRd; unsigned BroadcastOp = ARM::VDUP32d; - unsigned Opcode = MI.getOpcode(); - bool IsVector = false; - + const TargetRegisterInfo *TRI = &getRegisterInfo(); + const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(DestReg); + if (ARM::QPRRegClass.hasSubClassEq(RC)) { AndOp = ARM::VANDq; BicOp = ARM::VBICq; OrrOp = ARM::VORRq; BroadcastOp = ARM::VDUP32q; - IsVector = true; - } else if (ARM::DPRRegClass.hasSubClassEq(RC)) { - AndOp = ARM::VANDd; - BicOp = ARM::VBICd; - OrrOp = ARM::VORRd; - IsVector = true; } - // NB: we handle f64 as a vec of two f32s. - if (Opcode == ARM::CTSELECTf64) { - IsVector = true; - } + // Any vector pseudo has: ((outs $dst, $tmp_mask, $bcast_mask), (ins $src1, $src2, $cond)) + Register VectorMaskReg = MI.getOperand(2).getReg(); + Register Src1Reg = MI.getOperand(3).getReg(); + Register Src2Reg = MI.getOperand(4).getReg(); + Register CondReg = MI.getOperand(5).getReg(); + + // The following sequence of steps yields: (src1 & mask) | (src2 & ~mask) + + // 1. mask = 0 - cond + // When cond = 0: mask = 0x00000000. + // When cond = 1: mask = 0xFFFFFFFF. + BuildMI(*MBB, MI, DL, get(ARM::RSBri), MaskReg) + .addReg(CondReg) + .addImm(0) + .add(predOps(ARMCC::AL)) + .add(condCodeOp()) + .setMIFlag(MachineInstr::MIFlag::NoMerge); + + // 2. A = src1 & mask + // For vectors, broadcast the scalar mask so it matches operand size. + BuildMI(*MBB, MI, DL, get(BroadcastOp), VectorMaskReg) + .addReg(MaskReg) + .add(predOps(ARMCC::AL)) + .setMIFlag(MachineInstr::MIFlag::NoMerge); + + BuildMI(*MBB, MI, DL, get(AndOp), DestReg) + .addReg(Src1Reg) + .addReg(VectorMaskReg) + .add(predOps(ARMCC::AL)) + .setMIFlag(MachineInstr::MIFlag::NoMerge); + + // 3. 
B = src2 & ~mask + BuildMI(*MBB, MI, DL, get(BicOp), VectorMaskReg) + .addReg(Src2Reg) + .addReg(VectorMaskReg) + .add(predOps(ARMCC::AL)) + .setMIFlag(MachineInstr::MIFlag::NoMerge); + + // 4. result = A | B + BuildMI(*MBB, MI, DL, get(OrrOp), DestReg) + .addReg(DestReg) + .addReg(VectorMaskReg) + .add(predOps(ARMCC::AL)) + .setMIFlag(MachineInstr::MIFlag::NoMerge); + + MI.eraseFromParent(); + return true; +} + +// Expands the ctselect pseudo, post-RA. +bool ARMBaseInstrInfo::expandCtSelect(MachineInstr &MI) const { + MachineBasicBlock *MBB = MI.getParent(); + DebugLoc DL = MI.getDebugLoc(); + + Register DestReg = MI.getOperand(0).getReg(); + Register MaskReg = MI.getOperand(1).getReg(); + Register DestRegSavedRef = DestReg; + Register Src1Reg, Src2Reg, CondReg; + + // These operations will differ by operand register size. + unsigned AndOp = ARM::ANDrr; + unsigned BicOp = ARM::BICrr; + unsigned OrrOp = ARM::ORRrr; + unsigned Opcode = MI.getOpcode(); bool IsFloat = Opcode == ARM::CTSELECTf32 || Opcode == ARM::CTSELECTf16 || Opcode == ARM::CTSELECTbf16; if (IsFloat) { @@ -1597,12 +1645,6 @@ bool ARMBaseInstrInfo::expandCtSelect(MachineInstr &MI) const { Src2Reg = GPRScratch2; // Reuse GPRScratch1 for dest after we are done working with src1. DestReg = GPRScratch1; - } else if (IsVector) { - // Any vector pseudo has: ((outs $dst, $tmp_mask, $bcast_mask), (ins $src1, $src2, $cond)) - VectorMaskReg = MI.getOperand(2).getReg(); - Src1Reg = MI.getOperand(3).getReg(); - Src2Reg = MI.getOperand(4).getReg(); - CondReg = MI.getOperand(5).getReg(); } else { // Any non-float, non-vector pseudo has: (outs $dst, $tmp_mask), (ins $src1, $src2, $cond)) Src1Reg = MI.getOperand(2).getReg(); @@ -1623,58 +1665,28 @@ bool ARMBaseInstrInfo::expandCtSelect(MachineInstr &MI) const { .setMIFlag(MachineInstr::MIFlag::NoMerge); // 2. A = src1 & mask - if (IsVector) { - // For vectors, broadcast the scalar mask so it matches operand size. - BuildMI(*MBB, MI, DL, get(BroadcastOp), VectorMaskReg) - .addReg(MaskReg) - .add(predOps(ARMCC::AL)) - .setMIFlag(MachineInstr::MIFlag::NoMerge); - - BuildMI(*MBB, MI, DL, get(AndOp), DestReg) - .addReg(Src1Reg) - .addReg(VectorMaskReg) - .add(predOps(ARMCC::AL)) - .setMIFlag(MachineInstr::MIFlag::NoMerge); - } else { - BuildMI(*MBB, MI, DL, get(AndOp), DestReg) - .addReg(Src1Reg) - .addReg(MaskReg) - .add(predOps(ARMCC::AL)) - .add(condCodeOp()) - .setMIFlag(MachineInstr::MIFlag::NoMerge); - } + BuildMI(*MBB, MI, DL, get(AndOp), DestReg) + .addReg(Src1Reg) + .addReg(MaskReg) + .add(predOps(ARMCC::AL)) + .add(condCodeOp()) + .setMIFlag(MachineInstr::MIFlag::NoMerge); // 3. B = src2 & ~mask - if (IsVector) { - BuildMI(*MBB, MI, DL, get(BicOp), VectorMaskReg) - .addReg(Src2Reg) - .addReg(VectorMaskReg) - .add(predOps(ARMCC::AL)) - .setMIFlag(MachineInstr::MIFlag::NoMerge); - } else { - BuildMI(*MBB, MI, DL, get(BicOp), MaskReg) - .addReg(Src2Reg) - .addReg(MaskReg) - .add(predOps(ARMCC::AL)) - .add(condCodeOp()) - .setMIFlag(MachineInstr::MIFlag::NoMerge); - } + BuildMI(*MBB, MI, DL, get(BicOp), MaskReg) + .addReg(Src2Reg) + .addReg(MaskReg) + .add(predOps(ARMCC::AL)) + .add(condCodeOp()) + .setMIFlag(MachineInstr::MIFlag::NoMerge); // 4. 
result = A | B
-  if (IsVector) {
-    BuildMI(*MBB, MI, DL, get(OrrOp), DestReg)
-        .addReg(DestReg)
-        .addReg(VectorMaskReg)
-        .add(predOps(ARMCC::AL))
-        .setMIFlag(MachineInstr::MIFlag::NoMerge);
-  } else {
-    BuildMI(*MBB, MI, DL, get(OrrOp), DestReg)
-        .addReg(DestReg)
-        .addReg(MaskReg)
-        .add(predOps(ARMCC::AL))
-        .add(condCodeOp())
-        .setMIFlag(MachineInstr::MIFlag::NoMerge);
-  }
+  BuildMI(*MBB, MI, DL, get(OrrOp), DestReg)
+      .addReg(DestReg)
+      .addReg(MaskReg)
+      .add(predOps(ARMCC::AL))
+      .add(condCodeOp())
+      .setMIFlag(MachineInstr::MIFlag::NoMerge);
 
   if (IsFloat) {
     // Return our result from GPR to the correct register type.
@@ -1702,11 +1714,7 @@ bool ARMBaseInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
     return true;
   }
 
-  if (opcode == ARM::CTSELECTint ||
-      opcode == ARM::CTSELECTf16 ||
-      opcode == ARM::CTSELECTbf16 ||
-      opcode == ARM::CTSELECTf32 ||
-      opcode == ARM::CTSELECTf64 ||
+  if (opcode == ARM::CTSELECTf64 ||
       opcode == ARM::CTSELECTv8i8 ||
       opcode == ARM::CTSELECTv4i16 ||
       opcode == ARM::CTSELECTv2i32 ||
@@ -1722,6 +1730,14 @@ bool ARMBaseInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
       opcode == ARM::CTSELECTv2f64 ||
      opcode == ARM::CTSELECTv8f16 ||
       opcode == ARM::CTSELECTv8bf16) {
+    LLVM_DEBUG(dbgs() << "Opcode (vector) " << opcode << " replaced by: " << MI);
+    return expandCtSelectVector(MI);
+  }
+
+  if (opcode == ARM::CTSELECTint ||
+      opcode == ARM::CTSELECTf16 ||
+      opcode == ARM::CTSELECTbf16 ||
+      opcode == ARM::CTSELECTf32) {
     LLVM_DEBUG(dbgs() << "Opcode " << opcode << "replaced by: " << MI);
     return expandCtSelect(MI);
   }
diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h
index 9fc13d1a8e977..b692010087148 100644
--- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h
+++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h
@@ -221,6 +221,8 @@ class ARMBaseInstrInfo : public ARMGenInstrInfo {
                         const TargetRegisterInfo *TRI, Register VReg,
                         MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override;
 
+  bool expandCtSelectVector(MachineInstr &MI) const;
+
   bool expandCtSelect(MachineInstr &MI) const;
 
   bool expandPostRAPseudo(MachineInstr &MI) const override;

From 9ea628ec48ba844e19d52cfcc652cd09901aa40e Mon Sep 17 00:00:00 2001
From: wizardengineer
Date: Tue, 16 Sep 2025 08:28:29 -0400
Subject: [PATCH 38/63] [CT][NFC] Added Mips code-gen tests that are affected
 by DAG chaining

---
 .../Mips/ctselect-fallback-edge-cases.ll      | 154 ++++----
 .../Mips/ctselect-fallback-patterns.ll        | 161 ++++-----
 llvm/test/CodeGen/Mips/ctselect-fallback.ll   | 332 +++++++++---------
 .../CodeGen/Mips/ctselect-side-effects.ll     |  24 +-
 4 files changed, 324 insertions(+), 347 deletions(-)

diff --git a/llvm/test/CodeGen/Mips/ctselect-fallback-edge-cases.ll b/llvm/test/CodeGen/Mips/ctselect-fallback-edge-cases.ll
index afdb85ee69e16..42f460f2c598f 100644
--- a/llvm/test/CodeGen/Mips/ctselect-fallback-edge-cases.ll
+++ b/llvm/test/CodeGen/Mips/ctselect-fallback-edge-cases.ll
@@ -33,29 +33,29 @@ define i32 @test_ctselect_extremal_values(i1 %cond) {
 ; M32-LABEL: test_ctselect_extremal_values:
 ; M32:       # %bb.0:
 ; M32-NEXT:    andi $1, $4, 1
-; M32-NEXT:    lui $2, 32767
 ; M32-NEXT:    lui $3, 32768
+; M32-NEXT:    addiu $2, $1, -1
 ; M32-NEXT:    negu $1, $1
-; M32-NEXT:    ori $2, $2, 65535
-; M32-NEXT:    and $2, $1, $2
-; M32-NEXT:    not $1, $1
+; M32-NEXT:    and $2, $2, $3
+; M32-NEXT:    lui $3, 32767
+; M32-NEXT:    ori $3, $3, 65535
 ; M32-NEXT:    and $1, $1, $3
 ; M32-NEXT:    jr $ra
-; M32-NEXT:    or $2, $2, $1
+; M32-NEXT:    or $2, $1, $2
 ;
 ; M64-LABEL: test_ctselect_extremal_values:
 ; M64:       # %bb.0:
 ; M64-NEXT:    sll
$1, $4, 0 -; M64-NEXT: lui $2, 32767 ; M64-NEXT: lui $3, 32768 ; M64-NEXT: andi $1, $1, 1 -; M64-NEXT: ori $2, $2, 65535 +; M64-NEXT: addiu $2, $1, -1 ; M64-NEXT: negu $1, $1 -; M64-NEXT: and $2, $1, $2 -; M64-NEXT: not $1, $1 +; M64-NEXT: and $2, $2, $3 +; M64-NEXT: lui $3, 32767 +; M64-NEXT: ori $3, $3, 65535 ; M64-NEXT: and $1, $1, $3 ; M64-NEXT: jr $ra -; M64-NEXT: or $2, $2, $1 +; M64-NEXT: or $2, $1, $2 %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 2147483647, i32 -2147483648) ret i32 %result } @@ -84,23 +84,22 @@ define ptr @test_ctselect_function_ptr(i1 %cond, ptr %func1, ptr %func2) { ; M32-LABEL: test_ctselect_function_ptr: ; M32: # %bb.0: ; M32-NEXT: andi $1, $4, 1 +; M32-NEXT: addiu $2, $1, -1 ; M32-NEXT: negu $1, $1 -; M32-NEXT: and $2, $1, $5 -; M32-NEXT: not $1, $1 -; M32-NEXT: and $1, $1, $6 +; M32-NEXT: and $2, $2, $6 +; M32-NEXT: and $1, $1, $5 ; M32-NEXT: jr $ra -; M32-NEXT: or $2, $2, $1 +; M32-NEXT: or $2, $1, $2 ; ; M64-LABEL: test_ctselect_function_ptr: ; M64: # %bb.0: ; M64-NEXT: andi $1, $4, 1 -; M64-NEXT: daddiu $3, $zero, -1 +; M64-NEXT: daddiu $2, $1, -1 ; M64-NEXT: dnegu $1, $1 -; M64-NEXT: and $2, $1, $5 -; M64-NEXT: xor $1, $1, $3 -; M64-NEXT: and $1, $1, $6 +; M64-NEXT: and $2, $2, $6 +; M64-NEXT: and $1, $1, $5 ; M64-NEXT: jr $ra -; M64-NEXT: or $2, $2, $1 +; M64-NEXT: or $2, $1, $2 %result = call ptr @llvm.ct.select.p0(i1 %cond, ptr %func1, ptr %func2) ret ptr %result } @@ -111,12 +110,12 @@ define ptr @test_ctselect_ptr_cmp(ptr %p1, ptr %p2, ptr %a, ptr %b) { ; M32: # %bb.0: ; M32-NEXT: xor $1, $4, $5 ; M32-NEXT: sltu $1, $zero, $1 +; M32-NEXT: negu $2, $1 ; M32-NEXT: addiu $1, $1, -1 -; M32-NEXT: and $2, $1, $6 -; M32-NEXT: not $1, $1 -; M32-NEXT: and $1, $1, $7 +; M32-NEXT: and $2, $2, $7 +; M32-NEXT: and $1, $1, $6 ; M32-NEXT: jr $ra -; M32-NEXT: or $2, $2, $1 +; M32-NEXT: or $2, $1, $2 ; ; M64-LABEL: test_ctselect_ptr_cmp: ; M64: # %bb.0: @@ -141,23 +140,22 @@ define ptr @test_ctselect_struct_ptr(i1 %cond, ptr %a, ptr %b) { ; M32-LABEL: test_ctselect_struct_ptr: ; M32: # %bb.0: ; M32-NEXT: andi $1, $4, 1 +; M32-NEXT: addiu $2, $1, -1 ; M32-NEXT: negu $1, $1 -; M32-NEXT: and $2, $1, $5 -; M32-NEXT: not $1, $1 -; M32-NEXT: and $1, $1, $6 +; M32-NEXT: and $2, $2, $6 +; M32-NEXT: and $1, $1, $5 ; M32-NEXT: jr $ra -; M32-NEXT: or $2, $2, $1 +; M32-NEXT: or $2, $1, $2 ; ; M64-LABEL: test_ctselect_struct_ptr: ; M64: # %bb.0: ; M64-NEXT: andi $1, $4, 1 -; M64-NEXT: daddiu $3, $zero, -1 +; M64-NEXT: daddiu $2, $1, -1 ; M64-NEXT: dnegu $1, $1 -; M64-NEXT: and $2, $1, $5 -; M64-NEXT: xor $1, $1, $3 -; M64-NEXT: and $1, $1, $6 +; M64-NEXT: and $2, $2, $6 +; M64-NEXT: and $1, $1, $5 ; M64-NEXT: jr $ra -; M64-NEXT: or $2, $2, $1 +; M64-NEXT: or $2, $1, $2 %result = call ptr @llvm.ct.select.p0(i1 %cond, ptr %a, ptr %b) ret ptr %result } @@ -167,70 +165,70 @@ define i32 @test_ctselect_deeply_nested(i1 %c1, i1 %c2, i1 %c3, i1 %c4, i32 %a, ; M32-LABEL: test_ctselect_deeply_nested: ; M32: # %bb.0: ; M32-NEXT: andi $1, $4, 1 -; M32-NEXT: lw $2, 16($sp) ; M32-NEXT: lw $3, 20($sp) -; M32-NEXT: lw $4, 24($sp) +; M32-NEXT: addiu $2, $1, -1 ; M32-NEXT: negu $1, $1 -; M32-NEXT: and $2, $1, $2 -; M32-NEXT: not $1, $1 +; M32-NEXT: and $2, $2, $3 +; M32-NEXT: lw $3, 16($sp) ; M32-NEXT: and $1, $1, $3 -; M32-NEXT: or $1, $2, $1 +; M32-NEXT: or $1, $1, $2 ; M32-NEXT: andi $2, $5, 1 -; M32-NEXT: negu $2, $2 -; M32-NEXT: not $3, $2 -; M32-NEXT: and $1, $2, $1 -; M32-NEXT: and $2, $3, $4 -; M32-NEXT: andi $4, $6, 1 +; M32-NEXT: negu $3, $2 +; M32-NEXT: addiu $2, $2, -1 +; 
M32-NEXT: and $1, $3, $1 +; M32-NEXT: lw $3, 24($sp) +; M32-NEXT: and $2, $2, $3 ; M32-NEXT: andi $3, $7, 1 +; M32-NEXT: or $1, $1, $2 +; M32-NEXT: andi $2, $6, 1 ; M32-NEXT: lw $6, 32($sp) -; M32-NEXT: negu $4, $4 +; M32-NEXT: negu $4, $3 +; M32-NEXT: addiu $3, $3, -1 +; M32-NEXT: negu $5, $2 +; M32-NEXT: addiu $2, $2, -1 +; M32-NEXT: and $1, $5, $1 +; M32-NEXT: lw $5, 28($sp) +; M32-NEXT: and $2, $2, $5 ; M32-NEXT: or $1, $1, $2 -; M32-NEXT: negu $3, $3 +; M32-NEXT: and $2, $3, $6 ; M32-NEXT: and $1, $4, $1 -; M32-NEXT: not $2, $4 -; M32-NEXT: lw $4, 28($sp) -; M32-NEXT: not $5, $3 -; M32-NEXT: and $2, $2, $4 -; M32-NEXT: or $1, $1, $2 -; M32-NEXT: and $2, $5, $6 -; M32-NEXT: and $1, $3, $1 ; M32-NEXT: jr $ra ; M32-NEXT: or $2, $1, $2 ; ; M64-LABEL: test_ctselect_deeply_nested: ; M64: # %bb.0: ; M64-NEXT: sll $1, $4, 0 -; M64-NEXT: sll $2, $7, 0 -; M64-NEXT: sll $5, $5, 0 ; M64-NEXT: sll $4, $9, 0 -; M64-NEXT: sll $7, $8, 0 +; M64-NEXT: sll $3, $8, 0 +; M64-NEXT: sll $8, $11, 0 +; M64-NEXT: lw $9, 0($sp) ; M64-NEXT: andi $1, $1, 1 -; M64-NEXT: andi $5, $5, 1 -; M64-NEXT: andi $2, $2, 1 -; M64-NEXT: negu $1, $1 -; M64-NEXT: negu $5, $5 -; M64-NEXT: negu $2, $2 -; M64-NEXT: not $3, $1 -; M64-NEXT: and $1, $1, $7 -; M64-NEXT: lw $7, 0($sp) -; M64-NEXT: and $3, $3, $4 -; M64-NEXT: sll $4, $6, 0 -; M64-NEXT: not $6, $2 -; M64-NEXT: or $1, $1, $3 -; M64-NEXT: not $3, $5 +; M64-NEXT: negu $2, $1 +; M64-NEXT: addiu $1, $1, -1 +; M64-NEXT: and $1, $1, $4 +; M64-NEXT: sll $4, $5, 0 +; M64-NEXT: and $2, $2, $3 +; M64-NEXT: sll $3, $6, 0 +; M64-NEXT: sll $5, $7, 0 ; M64-NEXT: andi $4, $4, 1 -; M64-NEXT: and $1, $5, $1 -; M64-NEXT: sll $5, $10, 0 -; M64-NEXT: negu $4, $4 -; M64-NEXT: and $3, $3, $5 -; M64-NEXT: or $1, $1, $3 -; M64-NEXT: not $3, $4 -; M64-NEXT: and $1, $4, $1 -; M64-NEXT: sll $4, $11, 0 -; M64-NEXT: and $3, $3, $4 -; M64-NEXT: or $1, $1, $3 +; M64-NEXT: or $1, $2, $1 +; M64-NEXT: andi $3, $3, 1 +; M64-NEXT: andi $5, $5, 1 +; M64-NEXT: negu $2, $4 +; M64-NEXT: addiu $4, $4, -1 +; M64-NEXT: negu $7, $3 +; M64-NEXT: negu $6, $5 +; M64-NEXT: addiu $5, $5, -1 ; M64-NEXT: and $1, $2, $1 -; M64-NEXT: and $2, $6, $7 +; M64-NEXT: sll $2, $10, 0 +; M64-NEXT: and $2, $4, $2 +; M64-NEXT: or $1, $1, $2 +; M64-NEXT: addiu $2, $3, -1 +; M64-NEXT: and $1, $7, $1 +; M64-NEXT: and $2, $2, $8 +; M64-NEXT: or $1, $1, $2 +; M64-NEXT: and $2, $5, $9 +; M64-NEXT: and $1, $6, $1 ; M64-NEXT: jr $ra ; M64-NEXT: or $2, $1, $2 %sel1 = call i32 @llvm.ct.select.i32(i1 %c1, i32 %a, i32 %b) diff --git a/llvm/test/CodeGen/Mips/ctselect-fallback-patterns.ll b/llvm/test/CodeGen/Mips/ctselect-fallback-patterns.ll index e195bdb369dae..8fc1af159ec17 100644 --- a/llvm/test/CodeGen/Mips/ctselect-fallback-patterns.ll +++ b/llvm/test/CodeGen/Mips/ctselect-fallback-patterns.ll @@ -26,17 +26,16 @@ define i32 @test_ctselect_smax_zero(i32 %x) { ; M32-LABEL: test_ctselect_smax_zero: ; M32: # %bb.0: ; M32-NEXT: slti $1, $4, 1 -; M32-NEXT: addiu $1, $1, -1 +; M32-NEXT: movn $4, $zero, $1 ; M32-NEXT: jr $ra -; M32-NEXT: and $2, $1, $4 +; M32-NEXT: move $2, $4 ; ; M64-LABEL: test_ctselect_smax_zero: ; M64: # %bb.0: -; M64-NEXT: sll $1, $4, 0 -; M64-NEXT: slti $2, $1, 1 -; M64-NEXT: addiu $2, $2, -1 +; M64-NEXT: sll $2, $4, 0 +; M64-NEXT: slti $1, $2, 1 ; M64-NEXT: jr $ra -; M64-NEXT: and $2, $2, $1 +; M64-NEXT: movn $2, $zero, $1 %cmp = icmp sgt i32 %x, 0 %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 0) ret i32 %result @@ -48,12 +47,12 @@ define i32 @test_ctselect_smin_generic(i32 %x, i32 %y) { ; M32: # %bb.0: ; 
M32-NEXT: slt $1, $4, $5 ; M32-NEXT: xori $1, $1, 1 +; M32-NEXT: negu $2, $1 ; M32-NEXT: addiu $1, $1, -1 -; M32-NEXT: and $2, $1, $4 -; M32-NEXT: not $1, $1 -; M32-NEXT: and $1, $1, $5 +; M32-NEXT: and $2, $2, $5 +; M32-NEXT: and $1, $1, $4 ; M32-NEXT: jr $ra -; M32-NEXT: or $2, $2, $1 +; M32-NEXT: or $2, $1, $2 ; ; M64-LABEL: test_ctselect_smin_generic: ; M64: # %bb.0: @@ -61,10 +60,10 @@ define i32 @test_ctselect_smin_generic(i32 %x, i32 %y) { ; M64-NEXT: sll $2, $4, 0 ; M64-NEXT: slt $3, $2, $1 ; M64-NEXT: xori $3, $3, 1 +; M64-NEXT: negu $4, $3 ; M64-NEXT: addiu $3, $3, -1 +; M64-NEXT: and $1, $4, $1 ; M64-NEXT: and $2, $3, $2 -; M64-NEXT: not $3, $3 -; M64-NEXT: and $1, $3, $1 ; M64-NEXT: jr $ra ; M64-NEXT: or $2, $2, $1 %cmp = icmp slt i32 %x, %y @@ -78,12 +77,12 @@ define i32 @test_ctselect_smax_generic(i32 %x, i32 %y) { ; M32: # %bb.0: ; M32-NEXT: slt $1, $5, $4 ; M32-NEXT: xori $1, $1, 1 +; M32-NEXT: negu $2, $1 ; M32-NEXT: addiu $1, $1, -1 -; M32-NEXT: and $2, $1, $4 -; M32-NEXT: not $1, $1 -; M32-NEXT: and $1, $1, $5 +; M32-NEXT: and $2, $2, $5 +; M32-NEXT: and $1, $1, $4 ; M32-NEXT: jr $ra -; M32-NEXT: or $2, $2, $1 +; M32-NEXT: or $2, $1, $2 ; ; M64-LABEL: test_ctselect_smax_generic: ; M64: # %bb.0: @@ -91,10 +90,10 @@ define i32 @test_ctselect_smax_generic(i32 %x, i32 %y) { ; M64-NEXT: sll $2, $5, 0 ; M64-NEXT: slt $3, $2, $1 ; M64-NEXT: xori $3, $3, 1 +; M64-NEXT: negu $4, $3 ; M64-NEXT: addiu $3, $3, -1 +; M64-NEXT: and $2, $4, $2 ; M64-NEXT: and $1, $3, $1 -; M64-NEXT: not $3, $3 -; M64-NEXT: and $2, $3, $2 ; M64-NEXT: jr $ra ; M64-NEXT: or $2, $1, $2 %cmp = icmp sgt i32 %x, %y @@ -108,12 +107,12 @@ define i32 @test_ctselect_umin_generic(i32 %x, i32 %y) { ; M32: # %bb.0: ; M32-NEXT: sltu $1, $4, $5 ; M32-NEXT: xori $1, $1, 1 +; M32-NEXT: negu $2, $1 ; M32-NEXT: addiu $1, $1, -1 -; M32-NEXT: and $2, $1, $4 -; M32-NEXT: not $1, $1 -; M32-NEXT: and $1, $1, $5 +; M32-NEXT: and $2, $2, $5 +; M32-NEXT: and $1, $1, $4 ; M32-NEXT: jr $ra -; M32-NEXT: or $2, $2, $1 +; M32-NEXT: or $2, $1, $2 ; ; M64-LABEL: test_ctselect_umin_generic: ; M64: # %bb.0: @@ -121,10 +120,10 @@ define i32 @test_ctselect_umin_generic(i32 %x, i32 %y) { ; M64-NEXT: sll $2, $4, 0 ; M64-NEXT: sltu $3, $2, $1 ; M64-NEXT: xori $3, $3, 1 +; M64-NEXT: negu $4, $3 ; M64-NEXT: addiu $3, $3, -1 +; M64-NEXT: and $1, $4, $1 ; M64-NEXT: and $2, $3, $2 -; M64-NEXT: not $3, $3 -; M64-NEXT: and $1, $3, $1 ; M64-NEXT: jr $ra ; M64-NEXT: or $2, $2, $1 %cmp = icmp ult i32 %x, %y @@ -138,12 +137,12 @@ define i32 @test_ctselect_umax_generic(i32 %x, i32 %y) { ; M32: # %bb.0: ; M32-NEXT: sltu $1, $5, $4 ; M32-NEXT: xori $1, $1, 1 +; M32-NEXT: negu $2, $1 ; M32-NEXT: addiu $1, $1, -1 -; M32-NEXT: and $2, $1, $4 -; M32-NEXT: not $1, $1 -; M32-NEXT: and $1, $1, $5 +; M32-NEXT: and $2, $2, $5 +; M32-NEXT: and $1, $1, $4 ; M32-NEXT: jr $ra -; M32-NEXT: or $2, $2, $1 +; M32-NEXT: or $2, $1, $2 ; ; M64-LABEL: test_ctselect_umax_generic: ; M64: # %bb.0: @@ -151,10 +150,10 @@ define i32 @test_ctselect_umax_generic(i32 %x, i32 %y) { ; M64-NEXT: sll $2, $5, 0 ; M64-NEXT: sltu $3, $2, $1 ; M64-NEXT: xori $3, $3, 1 +; M64-NEXT: negu $4, $3 ; M64-NEXT: addiu $3, $3, -1 +; M64-NEXT: and $2, $4, $2 ; M64-NEXT: and $1, $3, $1 -; M64-NEXT: not $3, $3 -; M64-NEXT: and $2, $3, $2 ; M64-NEXT: jr $ra ; M64-NEXT: or $2, $1, $2 %cmp = icmp ugt i32 %x, %y @@ -239,18 +238,14 @@ define i32 @test_ctselect_sign_extend(i32 %x) { define i32 @test_ctselect_zero_extend(i32 %x) { ; M32-LABEL: test_ctselect_zero_extend: ; M32: # %bb.0: -; M32-NEXT: sltiu $1, $4, 
1 -; M32-NEXT: addiu $1, $1, -1 ; M32-NEXT: jr $ra -; M32-NEXT: andi $2, $1, 1 +; M32-NEXT: sltu $2, $zero, $4 ; ; M64-LABEL: test_ctselect_zero_extend: ; M64: # %bb.0: ; M64-NEXT: sll $1, $4, 0 -; M64-NEXT: sltiu $1, $1, 1 -; M64-NEXT: addiu $1, $1, -1 ; M64-NEXT: jr $ra -; M64-NEXT: andi $2, $1, 1 +; M64-NEXT: sltu $2, $zero, $1 %cmp = icmp ne i32 %x, 0 %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 1, i32 0) ret i32 %result @@ -289,25 +284,13 @@ define i32 @test_ctselect_constant_folding_false(i32 %a, i32 %b) { define i32 @test_ctselect_identical_operands(i1 %cond, i32 %x) { ; M32-LABEL: test_ctselect_identical_operands: ; M32: # %bb.0: -; M32-NEXT: andi $1, $4, 1 -; M32-NEXT: negu $1, $1 -; M32-NEXT: and $2, $1, $5 -; M32-NEXT: not $1, $1 -; M32-NEXT: and $1, $1, $5 ; M32-NEXT: jr $ra -; M32-NEXT: or $2, $2, $1 +; M32-NEXT: move $2, $5 ; ; M64-LABEL: test_ctselect_identical_operands: ; M64: # %bb.0: -; M64-NEXT: sll $1, $4, 0 -; M64-NEXT: sll $2, $5, 0 -; M64-NEXT: andi $1, $1, 1 -; M64-NEXT: negu $1, $1 -; M64-NEXT: and $3, $1, $2 -; M64-NEXT: not $1, $1 -; M64-NEXT: and $1, $1, $2 ; M64-NEXT: jr $ra -; M64-NEXT: or $2, $3, $1 +; M64-NEXT: sll $2, $5, 0 %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %x, i32 %x) ret i32 %result } @@ -318,12 +301,12 @@ define i32 @test_ctselect_inverted_condition(i32 %x, i32 %y, i32 %a, i32 %b) { ; M32: # %bb.0: ; M32-NEXT: xor $1, $4, $5 ; M32-NEXT: sltiu $1, $1, 1 +; M32-NEXT: negu $2, $1 ; M32-NEXT: addiu $1, $1, -1 -; M32-NEXT: and $2, $1, $6 -; M32-NEXT: not $1, $1 -; M32-NEXT: and $1, $1, $7 +; M32-NEXT: and $2, $2, $7 +; M32-NEXT: and $1, $1, $6 ; M32-NEXT: jr $ra -; M32-NEXT: or $2, $2, $1 +; M32-NEXT: or $2, $1, $2 ; ; M64-LABEL: test_ctselect_inverted_condition: ; M64: # %bb.0: @@ -331,14 +314,14 @@ define i32 @test_ctselect_inverted_condition(i32 %x, i32 %y, i32 %a, i32 %b) { ; M64-NEXT: sll $2, $4, 0 ; M64-NEXT: sll $3, $7, 0 ; M64-NEXT: xor $1, $2, $1 -; M64-NEXT: sll $2, $6, 0 ; M64-NEXT: sltiu $1, $1, 1 +; M64-NEXT: negu $2, $1 ; M64-NEXT: addiu $1, $1, -1 -; M64-NEXT: and $2, $1, $2 -; M64-NEXT: not $1, $1 +; M64-NEXT: and $2, $2, $3 +; M64-NEXT: sll $3, $6, 0 ; M64-NEXT: and $1, $1, $3 ; M64-NEXT: jr $ra -; M64-NEXT: or $2, $2, $1 +; M64-NEXT: or $2, $1, $2 %cmp = icmp eq i32 %x, %y %not_cmp = xor i1 %cmp, true %result = call i32 @llvm.ct.select.i32(i1 %not_cmp, i32 %a, i32 %b) @@ -351,53 +334,53 @@ define i32 @test_ctselect_chain(i1 %c1, i1 %c2, i1 %c3, i32 %a, i32 %b, i32 %c, ; M32: # %bb.0: ; M32-NEXT: andi $1, $4, 1 ; M32-NEXT: lw $3, 16($sp) +; M32-NEXT: addiu $2, $1, -1 ; M32-NEXT: negu $1, $1 -; M32-NEXT: and $2, $1, $7 -; M32-NEXT: not $1, $1 -; M32-NEXT: and $1, $1, $3 -; M32-NEXT: lw $3, 20($sp) -; M32-NEXT: or $1, $2, $1 +; M32-NEXT: and $2, $2, $3 +; M32-NEXT: and $1, $1, $7 +; M32-NEXT: or $1, $1, $2 ; M32-NEXT: andi $2, $5, 1 -; M32-NEXT: negu $2, $2 -; M32-NEXT: and $1, $2, $1 -; M32-NEXT: not $2, $2 +; M32-NEXT: negu $3, $2 +; M32-NEXT: addiu $2, $2, -1 +; M32-NEXT: and $1, $3, $1 +; M32-NEXT: lw $3, 20($sp) ; M32-NEXT: and $2, $2, $3 -; M32-NEXT: lw $3, 24($sp) ; M32-NEXT: or $1, $1, $2 ; M32-NEXT: andi $2, $6, 1 -; M32-NEXT: negu $2, $2 -; M32-NEXT: and $1, $2, $1 -; M32-NEXT: not $2, $2 +; M32-NEXT: negu $3, $2 +; M32-NEXT: addiu $2, $2, -1 +; M32-NEXT: and $1, $3, $1 +; M32-NEXT: lw $3, 24($sp) ; M32-NEXT: and $2, $2, $3 ; M32-NEXT: jr $ra ; M32-NEXT: or $2, $1, $2 ; ; M64-LABEL: test_ctselect_chain: ; M64: # %bb.0: -; M64-NEXT: sll $1, $4, 0 -; M64-NEXT: sll $2, $7, 0 -; M64-NEXT: sll $3, $8, 0 -; 
M64-NEXT: andi $1, $1, 1 -; M64-NEXT: negu $1, $1 -; M64-NEXT: and $2, $1, $2 -; M64-NEXT: not $1, $1 -; M64-NEXT: and $1, $1, $3 -; M64-NEXT: sll $3, $9, 0 +; M64-NEXT: sll $2, $4, 0 +; M64-NEXT: sll $1, $8, 0 +; M64-NEXT: sll $4, $10, 0 +; M64-NEXT: andi $2, $2, 1 +; M64-NEXT: addiu $3, $2, -1 +; M64-NEXT: negu $2, $2 +; M64-NEXT: and $1, $3, $1 +; M64-NEXT: sll $3, $7, 0 +; M64-NEXT: and $2, $2, $3 ; M64-NEXT: or $1, $2, $1 ; M64-NEXT: sll $2, $5, 0 ; M64-NEXT: andi $2, $2, 1 -; M64-NEXT: negu $2, $2 -; M64-NEXT: and $1, $2, $1 -; M64-NEXT: not $2, $2 +; M64-NEXT: negu $3, $2 +; M64-NEXT: addiu $2, $2, -1 +; M64-NEXT: and $1, $3, $1 +; M64-NEXT: sll $3, $9, 0 ; M64-NEXT: and $2, $2, $3 -; M64-NEXT: sll $3, $6, 0 ; M64-NEXT: or $1, $1, $2 -; M64-NEXT: andi $2, $3, 1 -; M64-NEXT: sll $3, $10, 0 -; M64-NEXT: negu $2, $2 -; M64-NEXT: and $1, $2, $1 -; M64-NEXT: not $2, $2 -; M64-NEXT: and $2, $2, $3 +; M64-NEXT: sll $2, $6, 0 +; M64-NEXT: andi $2, $2, 1 +; M64-NEXT: negu $3, $2 +; M64-NEXT: addiu $2, $2, -1 +; M64-NEXT: and $1, $3, $1 +; M64-NEXT: and $2, $2, $4 ; M64-NEXT: jr $ra ; M64-NEXT: or $2, $1, $2 %sel1 = call i32 @llvm.ct.select.i32(i1 %c1, i32 %a, i32 %b) diff --git a/llvm/test/CodeGen/Mips/ctselect-fallback.ll b/llvm/test/CodeGen/Mips/ctselect-fallback.ll index 7f924253b4ccc..22b24b33cff3c 100644 --- a/llvm/test/CodeGen/Mips/ctselect-fallback.ll +++ b/llvm/test/CodeGen/Mips/ctselect-fallback.ll @@ -61,23 +61,23 @@ define i32 @test_ctselect_i32(i1 %cond, i32 %a, i32 %b) { ; M32-LABEL: test_ctselect_i32: ; M32: # %bb.0: ; M32-NEXT: andi $1, $4, 1 +; M32-NEXT: addiu $2, $1, -1 ; M32-NEXT: negu $1, $1 -; M32-NEXT: and $2, $1, $5 -; M32-NEXT: not $1, $1 -; M32-NEXT: and $1, $1, $6 +; M32-NEXT: and $2, $2, $6 +; M32-NEXT: and $1, $1, $5 ; M32-NEXT: jr $ra -; M32-NEXT: or $2, $2, $1 +; M32-NEXT: or $2, $1, $2 ; ; M64-LABEL: test_ctselect_i32: ; M64: # %bb.0: -; M64-NEXT: sll $1, $4, 0 -; M64-NEXT: sll $2, $5, 0 -; M64-NEXT: sll $3, $6, 0 -; M64-NEXT: andi $1, $1, 1 -; M64-NEXT: negu $1, $1 -; M64-NEXT: and $2, $1, $2 -; M64-NEXT: not $1, $1 -; M64-NEXT: and $1, $1, $3 +; M64-NEXT: sll $2, $4, 0 +; M64-NEXT: sll $1, $6, 0 +; M64-NEXT: andi $2, $2, 1 +; M64-NEXT: addiu $3, $2, -1 +; M64-NEXT: negu $2, $2 +; M64-NEXT: and $1, $3, $1 +; M64-NEXT: sll $3, $5, 0 +; M64-NEXT: and $2, $2, $3 ; M64-NEXT: jr $ra ; M64-NEXT: or $2, $2, $1 %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) @@ -103,13 +103,12 @@ define i64 @test_ctselect_i64(i1 %cond, i64 %a, i64 %b) { ; M64-LABEL: test_ctselect_i64: ; M64: # %bb.0: ; M64-NEXT: andi $1, $4, 1 -; M64-NEXT: daddiu $3, $zero, -1 +; M64-NEXT: daddiu $2, $1, -1 ; M64-NEXT: dnegu $1, $1 -; M64-NEXT: and $2, $1, $5 -; M64-NEXT: xor $1, $1, $3 -; M64-NEXT: and $1, $1, $6 +; M64-NEXT: and $2, $2, $6 +; M64-NEXT: and $1, $1, $5 ; M64-NEXT: jr $ra -; M64-NEXT: or $2, $2, $1 +; M64-NEXT: or $2, $1, $2 %result = call i64 @llvm.ct.select.i64(i1 %cond, i64 %a, i64 %b) ret i64 %result } @@ -118,23 +117,22 @@ define ptr @test_ctselect_ptr(i1 %cond, ptr %a, ptr %b) { ; M32-LABEL: test_ctselect_ptr: ; M32: # %bb.0: ; M32-NEXT: andi $1, $4, 1 +; M32-NEXT: addiu $2, $1, -1 ; M32-NEXT: negu $1, $1 -; M32-NEXT: and $2, $1, $5 -; M32-NEXT: not $1, $1 -; M32-NEXT: and $1, $1, $6 +; M32-NEXT: and $2, $2, $6 +; M32-NEXT: and $1, $1, $5 ; M32-NEXT: jr $ra -; M32-NEXT: or $2, $2, $1 +; M32-NEXT: or $2, $1, $2 ; ; M64-LABEL: test_ctselect_ptr: ; M64: # %bb.0: ; M64-NEXT: andi $1, $4, 1 -; M64-NEXT: daddiu $3, $zero, -1 +; M64-NEXT: daddiu $2, $1, -1 ; M64-NEXT: 
dnegu $1, $1 -; M64-NEXT: and $2, $1, $5 -; M64-NEXT: xor $1, $1, $3 -; M64-NEXT: and $1, $1, $6 +; M64-NEXT: and $2, $2, $6 +; M64-NEXT: and $1, $1, $5 ; M64-NEXT: jr $ra -; M64-NEXT: or $2, $2, $1 +; M64-NEXT: or $2, $1, $2 %result = call ptr @llvm.ct.select.p0(i1 %cond, ptr %a, ptr %b) ret ptr %result } @@ -174,12 +172,12 @@ define i32 @test_ctselect_icmp_eq(i32 %x, i32 %y, i32 %a, i32 %b) { ; M32: # %bb.0: ; M32-NEXT: xor $1, $4, $5 ; M32-NEXT: sltu $1, $zero, $1 +; M32-NEXT: negu $2, $1 ; M32-NEXT: addiu $1, $1, -1 -; M32-NEXT: and $2, $1, $6 -; M32-NEXT: not $1, $1 -; M32-NEXT: and $1, $1, $7 +; M32-NEXT: and $2, $2, $7 +; M32-NEXT: and $1, $1, $6 ; M32-NEXT: jr $ra -; M32-NEXT: or $2, $2, $1 +; M32-NEXT: or $2, $1, $2 ; ; M64-LABEL: test_ctselect_icmp_eq: ; M64: # %bb.0: @@ -187,14 +185,14 @@ define i32 @test_ctselect_icmp_eq(i32 %x, i32 %y, i32 %a, i32 %b) { ; M64-NEXT: sll $2, $4, 0 ; M64-NEXT: sll $3, $7, 0 ; M64-NEXT: xor $1, $2, $1 -; M64-NEXT: sll $2, $6, 0 ; M64-NEXT: sltu $1, $zero, $1 +; M64-NEXT: negu $2, $1 ; M64-NEXT: addiu $1, $1, -1 -; M64-NEXT: and $2, $1, $2 -; M64-NEXT: not $1, $1 +; M64-NEXT: and $2, $2, $3 +; M64-NEXT: sll $3, $6, 0 ; M64-NEXT: and $1, $1, $3 ; M64-NEXT: jr $ra -; M64-NEXT: or $2, $2, $1 +; M64-NEXT: or $2, $1, $2 %cond = icmp eq i32 %x, %y %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) ret i32 %result @@ -205,12 +203,12 @@ define i32 @test_ctselect_icmp_ne(i32 %x, i32 %y, i32 %a, i32 %b) { ; M32: # %bb.0: ; M32-NEXT: xor $1, $4, $5 ; M32-NEXT: sltiu $1, $1, 1 +; M32-NEXT: negu $2, $1 ; M32-NEXT: addiu $1, $1, -1 -; M32-NEXT: and $2, $1, $6 -; M32-NEXT: not $1, $1 -; M32-NEXT: and $1, $1, $7 +; M32-NEXT: and $2, $2, $7 +; M32-NEXT: and $1, $1, $6 ; M32-NEXT: jr $ra -; M32-NEXT: or $2, $2, $1 +; M32-NEXT: or $2, $1, $2 ; ; M64-LABEL: test_ctselect_icmp_ne: ; M64: # %bb.0: @@ -218,14 +216,14 @@ define i32 @test_ctselect_icmp_ne(i32 %x, i32 %y, i32 %a, i32 %b) { ; M64-NEXT: sll $2, $4, 0 ; M64-NEXT: sll $3, $7, 0 ; M64-NEXT: xor $1, $2, $1 -; M64-NEXT: sll $2, $6, 0 ; M64-NEXT: sltiu $1, $1, 1 +; M64-NEXT: negu $2, $1 ; M64-NEXT: addiu $1, $1, -1 -; M64-NEXT: and $2, $1, $2 -; M64-NEXT: not $1, $1 +; M64-NEXT: and $2, $2, $3 +; M64-NEXT: sll $3, $6, 0 ; M64-NEXT: and $1, $1, $3 ; M64-NEXT: jr $ra -; M64-NEXT: or $2, $2, $1 +; M64-NEXT: or $2, $1, $2 %cond = icmp ne i32 %x, %y %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) ret i32 %result @@ -236,12 +234,12 @@ define i32 @test_ctselect_icmp_slt(i32 %x, i32 %y, i32 %a, i32 %b) { ; M32: # %bb.0: ; M32-NEXT: slt $1, $4, $5 ; M32-NEXT: xori $1, $1, 1 +; M32-NEXT: negu $2, $1 ; M32-NEXT: addiu $1, $1, -1 -; M32-NEXT: and $2, $1, $6 -; M32-NEXT: not $1, $1 -; M32-NEXT: and $1, $1, $7 +; M32-NEXT: and $2, $2, $7 +; M32-NEXT: and $1, $1, $6 ; M32-NEXT: jr $ra -; M32-NEXT: or $2, $2, $1 +; M32-NEXT: or $2, $1, $2 ; ; M64-LABEL: test_ctselect_icmp_slt: ; M64: # %bb.0: @@ -249,14 +247,14 @@ define i32 @test_ctselect_icmp_slt(i32 %x, i32 %y, i32 %a, i32 %b) { ; M64-NEXT: sll $2, $4, 0 ; M64-NEXT: sll $3, $7, 0 ; M64-NEXT: slt $1, $2, $1 -; M64-NEXT: sll $2, $6, 0 ; M64-NEXT: xori $1, $1, 1 +; M64-NEXT: negu $2, $1 ; M64-NEXT: addiu $1, $1, -1 -; M64-NEXT: and $2, $1, $2 -; M64-NEXT: not $1, $1 +; M64-NEXT: and $2, $2, $3 +; M64-NEXT: sll $3, $6, 0 ; M64-NEXT: and $1, $1, $3 ; M64-NEXT: jr $ra -; M64-NEXT: or $2, $2, $1 +; M64-NEXT: or $2, $1, $2 %cond = icmp slt i32 %x, %y %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) ret i32 %result @@ -267,12 +265,12 @@ 
define i32 @test_ctselect_icmp_ult(i32 %x, i32 %y, i32 %a, i32 %b) { ; M32: # %bb.0: ; M32-NEXT: sltu $1, $4, $5 ; M32-NEXT: xori $1, $1, 1 +; M32-NEXT: negu $2, $1 ; M32-NEXT: addiu $1, $1, -1 -; M32-NEXT: and $2, $1, $6 -; M32-NEXT: not $1, $1 -; M32-NEXT: and $1, $1, $7 +; M32-NEXT: and $2, $2, $7 +; M32-NEXT: and $1, $1, $6 ; M32-NEXT: jr $ra -; M32-NEXT: or $2, $2, $1 +; M32-NEXT: or $2, $1, $2 ; ; M64-LABEL: test_ctselect_icmp_ult: ; M64: # %bb.0: @@ -280,14 +278,14 @@ define i32 @test_ctselect_icmp_ult(i32 %x, i32 %y, i32 %a, i32 %b) { ; M64-NEXT: sll $2, $4, 0 ; M64-NEXT: sll $3, $7, 0 ; M64-NEXT: sltu $1, $2, $1 -; M64-NEXT: sll $2, $6, 0 ; M64-NEXT: xori $1, $1, 1 +; M64-NEXT: negu $2, $1 ; M64-NEXT: addiu $1, $1, -1 -; M64-NEXT: and $2, $1, $2 -; M64-NEXT: not $1, $1 +; M64-NEXT: and $2, $2, $3 +; M64-NEXT: sll $3, $6, 0 ; M64-NEXT: and $1, $1, $3 ; M64-NEXT: jr $ra -; M64-NEXT: or $2, $2, $1 +; M64-NEXT: or $2, $1, $2 %cond = icmp ult i32 %x, %y %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) ret i32 %result @@ -297,26 +295,26 @@ define i32 @test_ctselect_icmp_ult(i32 %x, i32 %y, i32 %a, i32 %b) { define i32 @test_ctselect_load(i1 %cond, ptr %p1, ptr %p2) { ; M32-LABEL: test_ctselect_load: ; M32: # %bb.0: -; M32-NEXT: andi $1, $4, 1 -; M32-NEXT: lw $2, 0($5) -; M32-NEXT: lw $3, 0($6) -; M32-NEXT: negu $1, $1 -; M32-NEXT: and $2, $1, $2 -; M32-NEXT: not $1, $1 -; M32-NEXT: and $1, $1, $3 +; M32-NEXT: andi $2, $4, 1 +; M32-NEXT: lw $1, 0($6) +; M32-NEXT: addiu $3, $2, -1 +; M32-NEXT: negu $2, $2 +; M32-NEXT: and $1, $3, $1 +; M32-NEXT: lw $3, 0($5) +; M32-NEXT: and $2, $2, $3 ; M32-NEXT: jr $ra ; M32-NEXT: or $2, $2, $1 ; ; M64-LABEL: test_ctselect_load: ; M64: # %bb.0: -; M64-NEXT: sll $1, $4, 0 -; M64-NEXT: lw $2, 0($5) -; M64-NEXT: lw $3, 0($6) -; M64-NEXT: andi $1, $1, 1 -; M64-NEXT: negu $1, $1 -; M64-NEXT: and $2, $1, $2 -; M64-NEXT: not $1, $1 -; M64-NEXT: and $1, $1, $3 +; M64-NEXT: sll $2, $4, 0 +; M64-NEXT: lw $1, 0($6) +; M64-NEXT: andi $2, $2, 1 +; M64-NEXT: addiu $3, $2, -1 +; M64-NEXT: negu $2, $2 +; M64-NEXT: and $1, $3, $1 +; M64-NEXT: lw $3, 0($5) +; M64-NEXT: and $2, $2, $3 ; M64-NEXT: jr $ra ; M64-NEXT: or $2, $2, $1 %a = load i32, ptr %p1 @@ -330,37 +328,37 @@ define i32 @test_ctselect_nested(i1 %cond1, i1 %cond2, i32 %a, i32 %b, i32 %c) { ; M32-LABEL: test_ctselect_nested: ; M32: # %bb.0: ; M32-NEXT: andi $1, $5, 1 -; M32-NEXT: lw $3, 16($sp) +; M32-NEXT: addiu $2, $1, -1 ; M32-NEXT: negu $1, $1 -; M32-NEXT: and $2, $1, $6 -; M32-NEXT: not $1, $1 -; M32-NEXT: and $1, $1, $7 -; M32-NEXT: or $1, $2, $1 +; M32-NEXT: and $2, $2, $7 +; M32-NEXT: and $1, $1, $6 +; M32-NEXT: or $1, $1, $2 ; M32-NEXT: andi $2, $4, 1 -; M32-NEXT: negu $2, $2 -; M32-NEXT: and $1, $2, $1 -; M32-NEXT: not $2, $2 +; M32-NEXT: negu $3, $2 +; M32-NEXT: addiu $2, $2, -1 +; M32-NEXT: and $1, $3, $1 +; M32-NEXT: lw $3, 16($sp) ; M32-NEXT: and $2, $2, $3 ; M32-NEXT: jr $ra ; M32-NEXT: or $2, $1, $2 ; ; M64-LABEL: test_ctselect_nested: ; M64: # %bb.0: -; M64-NEXT: sll $1, $5, 0 -; M64-NEXT: sll $2, $6, 0 -; M64-NEXT: sll $3, $7, 0 -; M64-NEXT: andi $1, $1, 1 -; M64-NEXT: negu $1, $1 -; M64-NEXT: and $2, $1, $2 -; M64-NEXT: not $1, $1 -; M64-NEXT: and $1, $1, $3 -; M64-NEXT: sll $3, $8, 0 +; M64-NEXT: sll $2, $5, 0 +; M64-NEXT: sll $1, $7, 0 +; M64-NEXT: andi $2, $2, 1 +; M64-NEXT: addiu $3, $2, -1 +; M64-NEXT: negu $2, $2 +; M64-NEXT: and $1, $3, $1 +; M64-NEXT: sll $3, $6, 0 +; M64-NEXT: and $2, $2, $3 ; M64-NEXT: or $1, $2, $1 ; M64-NEXT: sll $2, $4, 0 ; M64-NEXT: andi $2, $2, 
1 -; M64-NEXT: negu $2, $2 -; M64-NEXT: and $1, $2, $1 -; M64-NEXT: not $2, $2 +; M64-NEXT: negu $3, $2 +; M64-NEXT: addiu $2, $2, -1 +; M64-NEXT: and $1, $3, $1 +; M64-NEXT: sll $3, $8, 0 ; M64-NEXT: and $2, $2, $3 ; M64-NEXT: jr $ra ; M64-NEXT: or $2, $1, $2 @@ -374,24 +372,24 @@ define float @test_ctselect_f32(i1 %cond, float %a, float %b) { ; M32-LABEL: test_ctselect_f32: ; M32: # %bb.0: ; M32-NEXT: andi $1, $4, 1 +; M32-NEXT: addiu $2, $1, -1 ; M32-NEXT: negu $1, $1 -; M32-NEXT: and $2, $1, $5 -; M32-NEXT: not $1, $1 -; M32-NEXT: and $1, $1, $6 -; M32-NEXT: or $1, $2, $1 +; M32-NEXT: and $2, $2, $6 +; M32-NEXT: and $1, $1, $5 +; M32-NEXT: or $1, $1, $2 ; M32-NEXT: jr $ra ; M32-NEXT: mtc1 $1, $f0 ; ; M64-LABEL: test_ctselect_f32: ; M64: # %bb.0: -; M64-NEXT: sll $1, $4, 0 -; M64-NEXT: mfc1 $2, $f13 -; M64-NEXT: mfc1 $3, $f14 -; M64-NEXT: andi $1, $1, 1 -; M64-NEXT: negu $1, $1 -; M64-NEXT: and $2, $1, $2 -; M64-NEXT: not $1, $1 -; M64-NEXT: and $1, $1, $3 +; M64-NEXT: sll $2, $4, 0 +; M64-NEXT: mfc1 $1, $f14 +; M64-NEXT: andi $2, $2, 1 +; M64-NEXT: addiu $3, $2, -1 +; M64-NEXT: negu $2, $2 +; M64-NEXT: and $1, $3, $1 +; M64-NEXT: mfc1 $3, $f13 +; M64-NEXT: and $2, $2, $3 ; M64-NEXT: or $1, $2, $1 ; M64-NEXT: jr $ra ; M64-NEXT: mtc1 $1, $f0 @@ -429,14 +427,13 @@ define double @test_ctselect_f64(i1 %cond, double %a, double %b) { ; ; M64-LABEL: test_ctselect_f64: ; M64: # %bb.0: -; M64-NEXT: andi $1, $4, 1 -; M64-NEXT: dmfc1 $2, $f13 -; M64-NEXT: daddiu $3, $zero, -1 -; M64-NEXT: dnegu $1, $1 -; M64-NEXT: and $2, $1, $2 -; M64-NEXT: xor $1, $1, $3 -; M64-NEXT: dmfc1 $3, $f14 -; M64-NEXT: and $1, $1, $3 +; M64-NEXT: andi $2, $4, 1 +; M64-NEXT: dmfc1 $1, $f14 +; M64-NEXT: daddiu $3, $2, -1 +; M64-NEXT: dnegu $2, $2 +; M64-NEXT: and $1, $3, $1 +; M64-NEXT: dmfc1 $3, $f13 +; M64-NEXT: and $2, $2, $3 ; M64-NEXT: or $1, $2, $1 ; M64-NEXT: jr $ra ; M64-NEXT: dmtc1 $1, $f0 @@ -450,16 +447,16 @@ define float @test_ctselect_f32_chain(i1 %cond1, i1 %cond2, float %a, float %b, ; M32-LABEL: test_ctselect_f32_chain: ; M32: # %bb.0: ; M32-NEXT: andi $1, $4, 1 -; M32-NEXT: lw $3, 16($sp) +; M32-NEXT: addiu $2, $1, -1 ; M32-NEXT: negu $1, $1 -; M32-NEXT: and $2, $1, $6 -; M32-NEXT: not $1, $1 -; M32-NEXT: and $1, $1, $7 -; M32-NEXT: or $1, $2, $1 +; M32-NEXT: and $2, $2, $7 +; M32-NEXT: and $1, $1, $6 +; M32-NEXT: or $1, $1, $2 ; M32-NEXT: andi $2, $5, 1 -; M32-NEXT: negu $2, $2 -; M32-NEXT: and $1, $2, $1 -; M32-NEXT: not $2, $2 +; M32-NEXT: negu $3, $2 +; M32-NEXT: addiu $2, $2, -1 +; M32-NEXT: and $1, $3, $1 +; M32-NEXT: lw $3, 16($sp) ; M32-NEXT: and $2, $2, $3 ; M32-NEXT: or $1, $1, $2 ; M32-NEXT: jr $ra @@ -467,21 +464,21 @@ define float @test_ctselect_f32_chain(i1 %cond1, i1 %cond2, float %a, float %b, ; ; M64-LABEL: test_ctselect_f32_chain: ; M64: # %bb.0: -; M64-NEXT: sll $1, $4, 0 -; M64-NEXT: mfc1 $2, $f14 -; M64-NEXT: mfc1 $3, $f15 -; M64-NEXT: andi $1, $1, 1 -; M64-NEXT: negu $1, $1 -; M64-NEXT: and $2, $1, $2 -; M64-NEXT: not $1, $1 -; M64-NEXT: and $1, $1, $3 -; M64-NEXT: mfc1 $3, $f16 +; M64-NEXT: sll $2, $4, 0 +; M64-NEXT: mfc1 $1, $f15 +; M64-NEXT: andi $2, $2, 1 +; M64-NEXT: addiu $3, $2, -1 +; M64-NEXT: negu $2, $2 +; M64-NEXT: and $1, $3, $1 +; M64-NEXT: mfc1 $3, $f14 +; M64-NEXT: and $2, $2, $3 ; M64-NEXT: or $1, $2, $1 ; M64-NEXT: sll $2, $5, 0 ; M64-NEXT: andi $2, $2, 1 -; M64-NEXT: negu $2, $2 -; M64-NEXT: and $1, $2, $1 -; M64-NEXT: not $2, $2 +; M64-NEXT: negu $3, $2 +; M64-NEXT: addiu $2, $2, -1 +; M64-NEXT: and $1, $3, $1 +; M64-NEXT: mfc1 $3, $f16 ; M64-NEXT: and $2, $2, $3 ; 
M64-NEXT: or $1, $1, $2 ; M64-NEXT: jr $ra @@ -495,27 +492,27 @@ define float @test_ctselect_f32_chain(i1 %cond1, i1 %cond2, float %a, float %b, define float @test_ctselect_f32_load(i1 %cond, ptr %p1, ptr %p2) { ; M32-LABEL: test_ctselect_f32_load: ; M32: # %bb.0: -; M32-NEXT: andi $1, $4, 1 -; M32-NEXT: lw $2, 0($5) -; M32-NEXT: lw $3, 0($6) -; M32-NEXT: negu $1, $1 -; M32-NEXT: and $2, $1, $2 -; M32-NEXT: not $1, $1 -; M32-NEXT: and $1, $1, $3 +; M32-NEXT: andi $2, $4, 1 +; M32-NEXT: lw $1, 0($6) +; M32-NEXT: addiu $3, $2, -1 +; M32-NEXT: negu $2, $2 +; M32-NEXT: and $1, $3, $1 +; M32-NEXT: lw $3, 0($5) +; M32-NEXT: and $2, $2, $3 ; M32-NEXT: or $1, $2, $1 ; M32-NEXT: jr $ra ; M32-NEXT: mtc1 $1, $f0 ; ; M64-LABEL: test_ctselect_f32_load: ; M64: # %bb.0: -; M64-NEXT: sll $1, $4, 0 -; M64-NEXT: lw $2, 0($5) -; M64-NEXT: lw $3, 0($6) -; M64-NEXT: andi $1, $1, 1 -; M64-NEXT: negu $1, $1 -; M64-NEXT: and $2, $1, $2 -; M64-NEXT: not $1, $1 -; M64-NEXT: and $1, $1, $3 +; M64-NEXT: sll $2, $4, 0 +; M64-NEXT: lw $1, 0($6) +; M64-NEXT: andi $2, $2, 1 +; M64-NEXT: addiu $3, $2, -1 +; M64-NEXT: negu $2, $2 +; M64-NEXT: and $1, $3, $1 +; M64-NEXT: lw $3, 0($5) +; M64-NEXT: and $2, $2, $3 ; M64-NEXT: or $1, $2, $1 ; M64-NEXT: jr $ra ; M64-NEXT: mtc1 $1, $f0 @@ -552,14 +549,13 @@ define double @test_ctselect_f64_load(i1 %cond, ptr %p1, ptr %p2) { ; ; M64-LABEL: test_ctselect_f64_load: ; M64: # %bb.0: -; M64-NEXT: andi $1, $4, 1 -; M64-NEXT: ld $2, 0($5) -; M64-NEXT: daddiu $3, $zero, -1 -; M64-NEXT: dnegu $1, $1 -; M64-NEXT: and $2, $1, $2 -; M64-NEXT: xor $1, $1, $3 -; M64-NEXT: ld $3, 0($6) -; M64-NEXT: and $1, $1, $3 +; M64-NEXT: andi $2, $4, 1 +; M64-NEXT: ld $1, 0($6) +; M64-NEXT: daddiu $3, $2, -1 +; M64-NEXT: dnegu $2, $2 +; M64-NEXT: and $1, $3, $1 +; M64-NEXT: ld $3, 0($5) +; M64-NEXT: and $2, $2, $3 ; M64-NEXT: or $1, $2, $1 ; M64-NEXT: jr $ra ; M64-NEXT: dmtc1 $1, $f0 @@ -575,15 +571,15 @@ define float @test_ctselect_f32_arithmetic(i1 %cond, float %x, float %y) { ; M32: # %bb.0: ; M32-NEXT: mtc1 $6, $f0 ; M32-NEXT: mtc1 $5, $f1 -; M32-NEXT: andi $2, $4, 1 -; M32-NEXT: negu $2, $2 -; M32-NEXT: sub.s $f2, $f1, $f0 -; M32-NEXT: add.s $f0, $f1, $f0 -; M32-NEXT: not $3, $2 -; M32-NEXT: mfc1 $1, $f2 -; M32-NEXT: and $1, $3, $1 -; M32-NEXT: mfc1 $3, $f0 +; M32-NEXT: andi $1, $4, 1 +; M32-NEXT: negu $2, $1 +; M32-NEXT: addiu $1, $1, -1 +; M32-NEXT: add.s $f2, $f1, $f0 +; M32-NEXT: sub.s $f0, $f1, $f0 +; M32-NEXT: mfc1 $3, $f2 ; M32-NEXT: and $2, $2, $3 +; M32-NEXT: mfc1 $3, $f0 +; M32-NEXT: and $1, $1, $3 ; M32-NEXT: or $1, $2, $1 ; M32-NEXT: jr $ra ; M32-NEXT: mtc1 $1, $f0 @@ -593,11 +589,11 @@ define float @test_ctselect_f32_arithmetic(i1 %cond, float %x, float %y) { ; M64-NEXT: add.s $f0, $f13, $f14 ; M64-NEXT: sll $1, $4, 0 ; M64-NEXT: andi $1, $1, 1 -; M64-NEXT: negu $1, $1 -; M64-NEXT: mfc1 $2, $f0 +; M64-NEXT: negu $2, $1 +; M64-NEXT: addiu $1, $1, -1 +; M64-NEXT: mfc1 $3, $f0 ; M64-NEXT: sub.s $f0, $f13, $f14 -; M64-NEXT: and $2, $1, $2 -; M64-NEXT: not $1, $1 +; M64-NEXT: and $2, $2, $3 ; M64-NEXT: mfc1 $3, $f0 ; M64-NEXT: and $1, $1, $3 ; M64-NEXT: or $1, $2, $1 diff --git a/llvm/test/CodeGen/Mips/ctselect-side-effects.ll b/llvm/test/CodeGen/Mips/ctselect-side-effects.ll index f0f6a18ae73bf..9a0263ad5915c 100644 --- a/llvm/test/CodeGen/Mips/ctselect-side-effects.ll +++ b/llvm/test/CodeGen/Mips/ctselect-side-effects.ll @@ -39,23 +39,23 @@ define i32 @test_protected_no_branch(i1 %cond, i32 %a, i32 %b) { ; M32-LABEL: test_protected_no_branch: ; M32: # %bb.0: ; M32-NEXT: andi $1, $4, 1 +; 
M32-NEXT:    addiu $2, $1, -1
 ; M32-NEXT:    negu $1, $1
-; M32-NEXT:    and $2, $1, $5
-; M32-NEXT:    not $1, $1
-; M32-NEXT:    and $1, $1, $6
+; M32-NEXT:    and $2, $2, $6
+; M32-NEXT:    and $1, $1, $5
 ; M32-NEXT:    jr $ra
-; M32-NEXT:    or $2, $2, $1
+; M32-NEXT:    or $2, $1, $2
 ;
 ; M64-LABEL: test_protected_no_branch:
 ; M64:       # %bb.0:
-; M64-NEXT:    sll $1, $4, 0
-; M64-NEXT:    sll $2, $5, 0
-; M64-NEXT:    sll $3, $6, 0
-; M64-NEXT:    andi $1, $1, 1
-; M64-NEXT:    negu $1, $1
-; M64-NEXT:    and $2, $1, $2
-; M64-NEXT:    not $1, $1
-; M64-NEXT:    and $1, $1, $3
+; M64-NEXT:    sll $2, $4, 0
+; M64-NEXT:    sll $1, $6, 0
+; M64-NEXT:    andi $2, $2, 1
+; M64-NEXT:    addiu $3, $2, -1
+; M64-NEXT:    negu $2, $2
+; M64-NEXT:    and $1, $3, $1
+; M64-NEXT:    sll $3, $5, 0
+; M64-NEXT:    and $2, $2, $3
 ; M64-NEXT:    jr $ra
 ; M64-NEXT:    or $2, $2, $1
 %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b)

From 034f1e85c33371ed93317c34bafb70eab84b15be Mon Sep 17 00:00:00 2001
From: Kelly Kaoudis
Date: Tue, 16 Sep 2025 11:20:36 -0700
Subject: [PATCH 39/63] [CT] implements thumb1 and thumb2 lowering for ARM back
 end

---
 llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp |   99 +-
 llvm/lib/Target/ARM/ARMBaseInstrInfo.h   |    2 +
 llvm/test/CodeGen/ARM/ctselect-half.ll   |  478 ++++++++
 llvm/test/CodeGen/ARM/ctselect-vector.ll | 1272 ++++++++++++++++++++++
 llvm/test/CodeGen/ARM/ctselect.ll        |  357 ++++++
 5 files changed, 2201 insertions(+), 7 deletions(-)

diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
index 027cfb95b7f51..198daa292f786 100644
--- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -1550,6 +1550,8 @@ bool ARMBaseInstrInfo::expandCtSelectVector(MachineInstr &MI) const {
     BroadcastOp = ARM::VDUP32q;
   }
 
+  unsigned RsbOp = Subtarget.isThumb2() ? ARM::t2RSBri : ARM::RSBri;
+
   // Any vector pseudo has: ((outs $dst, $tmp_mask, $bcast_mask), (ins $src1, $src2, $cond))
   Register VectorMaskReg = MI.getOperand(2).getReg();
   Register Src1Reg = MI.getOperand(3).getReg();
@@ -1561,7 +1563,8 @@ bool ARMBaseInstrInfo::expandCtSelectVector(MachineInstr &MI) const {
   // 1. mask = 0 - cond
   // When cond = 0: mask = 0x00000000.
   // When cond = 1: mask = 0xFFFFFFFF.
-  BuildMI(*MBB, MI, DL, get(ARM::RSBri), MaskReg)
+
+  BuildMI(*MBB, MI, DL, get(RsbOp), MaskReg)
       .addReg(CondReg)
       .addImm(0)
       .add(predOps(ARMCC::AL))
@@ -1599,6 +1602,66 @@ bool ARMBaseInstrInfo::expandCtSelectVector(MachineInstr &MI) const {
   return true;
 }
 
+// Expands the ctselect pseudo for thumb1, post-RA.
+bool ARMBaseInstrInfo::expandCtSelectThumb(MachineInstr &MI) const {
+  MachineBasicBlock *MBB = MI.getParent();
+  DebugLoc DL = MI.getDebugLoc();
+
+  // Pseudos in thumb1 mode have: ((outs $dst, $tmp_mask), (ins $src1, $src2, $cond))
+  // The register class here is always tGPR.
+  Register DestReg = MI.getOperand(0).getReg();
+  Register MaskReg = MI.getOperand(1).getReg();
+  Register Src1Reg = MI.getOperand(2).getReg();
+  Register Src2Reg = MI.getOperand(3).getReg();
+  Register CondReg = MI.getOperand(4).getReg();
+
+  // The following sequence of steps yields: (src1 & mask) | (src2 & ~mask)
+  // 1. mask = -cond
+  BuildMI(*MBB, MI, DL, get(ARM::tRSB), MaskReg)
+      .add(t1CondCodeOp())
+      .addReg(CondReg)
+      .add(predOps(ARMCC::AL))
+      .setMIFlag(MachineInstr::MIFlag::NoMerge);
+
+  BuildMI(*MBB, MI, DL, get(ARM::tMOVr), DestReg)
+      .addReg(Src1Reg)
+      .add(predOps(ARMCC::AL))
+      .setMIFlag(MachineInstr::MIFlag::NoMerge);
+
+  // 2.
A = src1 & mask + BuildMI(*MBB, MI, DL, get(ARM::tAND), DestReg) + .add(t1CondCodeOp()) + .addReg(DestReg, RegState::Kill) + .addReg(MaskReg) + .add(predOps(ARMCC::AL)) + .setMIFlag(MachineInstr::MIFlag::NoMerge); + + // 3. B = src2 & ~mask + Register BICScratch = Src1Reg; + BuildMI(*MBB, MI, DL, get(ARM::tMOVr), BICScratch) + .addReg(Src2Reg) + .add(predOps(ARMCC::AL)) + .setMIFlag(MachineInstr::MIFlag::NoMerge); + + BuildMI(*MBB, MI, DL, get(ARM::tBIC), BICScratch) + .add(t1CondCodeOp()) + .addReg(BICScratch, RegState::Kill) + .addReg(MaskReg) + .add(predOps(ARMCC::AL)) + .setMIFlag(MachineInstr::MIFlag::NoMerge); + + // 4. result = A | B + BuildMI(*MBB, MI, DL, get(ARM::tORR), DestReg) + .add(t1CondCodeOp()) + .addReg(DestReg, RegState::Kill) + .addReg(BICScratch) + .add(predOps(ARMCC::AL)) + .setMIFlag(MachineInstr::MIFlag::NoMerge); + + MI.eraseFromParent(); + return true; +} + // Expands the ctselect pseudo, post-RA. bool ARMBaseInstrInfo::expandCtSelect(MachineInstr &MI) const { MachineBasicBlock *MBB = MI.getParent(); @@ -1610,11 +1673,19 @@ bool ARMBaseInstrInfo::expandCtSelect(MachineInstr &MI) const { Register Src1Reg, Src2Reg, CondReg; // These operations will differ by operand register size. + unsigned RsbOp = ARM::RSBri; unsigned AndOp = ARM::ANDrr; unsigned BicOp = ARM::BICrr; unsigned OrrOp = ARM::ORRrr; - unsigned Opcode = MI.getOpcode(); + if (Subtarget.isThumb2()) { + RsbOp = ARM::t2RSBri; + AndOp = ARM::t2ANDrr; + BicOp = ARM::t2BICrr; + OrrOp = ARM::t2ORRrr; + } + + unsigned Opcode = MI.getOpcode(); bool IsFloat = Opcode == ARM::CTSELECTf32 || Opcode == ARM::CTSELECTf16 || Opcode == ARM::CTSELECTbf16; if (IsFloat) { // Each float pseudo has: (outs $dst, $tmp_mask, $scratch1, $scratch2), (ins $src1, $src2, $cond)) @@ -1657,7 +1728,7 @@ bool ARMBaseInstrInfo::expandCtSelect(MachineInstr &MI) const { // 1. mask = 0 - cond // When cond = 0: mask = 0x00000000. // When cond = 1: mask = 0xFFFFFFFF. 
-  BuildMI(*MBB, MI, DL, get(ARM::RSBri), MaskReg)
+  BuildMI(*MBB, MI, DL, get(RsbOp), MaskReg)
       .addReg(CondReg)
       .addImm(0)
       .add(predOps(ARMCC::AL))
@@ -1714,8 +1785,17 @@ bool ARMBaseInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
     return true;
   }
 
-  if (opcode == ARM::CTSELECTf64 ||
-      opcode == ARM::CTSELECTv8i8 ||
+  if (opcode == ARM::CTSELECTf64) {
+    if (Subtarget.isThumb1Only()) {
+      LLVM_DEBUG(dbgs() << "Opcode (thumb1 subtarget) " << opcode << " replaced by: " << MI);
+      return expandCtSelectThumb(MI);
+    } else {
+      LLVM_DEBUG(dbgs() << "Opcode (vector) " << opcode << " replaced by: " << MI);
+      return expandCtSelectVector(MI);
+    }
+  }
+
+  if (opcode == ARM::CTSELECTv8i8 ||
       opcode == ARM::CTSELECTv4i16 ||
       opcode == ARM::CTSELECTv2i32 ||
       opcode == ARM::CTSELECTv1i64 ||
@@ -1738,8 +1818,13 @@ bool ARMBaseInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
       opcode == ARM::CTSELECTf16 ||
       opcode == ARM::CTSELECTbf16 ||
       opcode == ARM::CTSELECTf32) {
-    LLVM_DEBUG(dbgs() << "Opcode " << opcode << "replaced by: " << MI);
-    return expandCtSelect(MI);
+    if (Subtarget.isThumb1Only()) {
+      LLVM_DEBUG(dbgs() << "Opcode (thumb1 subtarget) " << opcode << " replaced by: " << MI);
+      return expandCtSelectThumb(MI);
+    } else {
+      LLVM_DEBUG(dbgs() << "Opcode " << opcode << " replaced by: " << MI);
+      return expandCtSelect(MI);
+    }
   }
 
   // This hook gets to expand COPY instructions before they become
diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h
index b692010087148..f0e090f09f5dc 100644
--- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h
+++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h
@@ -223,6 +223,8 @@ class ARMBaseInstrInfo : public ARMGenInstrInfo {
 
   bool expandCtSelectVector(MachineInstr &MI) const;
 
+  bool expandCtSelectThumb(MachineInstr &MI) const;
+
   bool expandCtSelect(MachineInstr &MI) const;
 
   bool expandPostRAPseudo(MachineInstr &MI) const override;
diff --git a/llvm/test/CodeGen/ARM/ctselect-half.ll b/llvm/test/CodeGen/ARM/ctselect-half.ll
index 0f1b4a4b14ac1..40a5655be28e9 100644
--- a/llvm/test/CodeGen/ARM/ctselect-half.ll
+++ b/llvm/test/CodeGen/ARM/ctselect-half.ll
@@ -2,6 +2,8 @@
 ; RUN: llc < %s -mtriple=armv7-none-eabi -verify-machineinstrs | FileCheck --check-prefixes=CT %s
 ; RUN: llc < %s -mtriple=armv8.6a-none-eabi -verify-machineinstrs | FileCheck --check-prefixes=BFLOAT-F16-NATIVE %s
 ; RUN: llc < %s -mtriple=armv8.2a-none-eabi -verify-machineinstrs | FileCheck --check-prefixes=F16-NATIVE %s
+; RUN: llc < %s -mtriple=thumbv6m-none-eabi -verify-machineinstrs | FileCheck --check-prefix=THUMB1 %s
+; RUN: llc < %s -mtriple=thumbv7m-none-eabi -verify-machineinstrs | FileCheck --check-prefix=THUMB2 %s
 
 define half @ct_half(i1 %cond, half %a, half %b) {
 ; CT-LABEL: ct_half:
@@ -30,6 +32,30 @@ define half @ct_half(i1 %cond, half %a, half %b) {
 ; F16-NATIVE-NEXT:    bic r12, r2, r12
 ; F16-NATIVE-NEXT:    orr r0, r0, r12
 ; F16-NATIVE-NEXT:    bx lr
+;
+; THUMB1-LABEL: ct_half:
+; THUMB1:       @ %bb.0: @ %entry
+; THUMB1-NEXT:    .save {r4, lr}
+; THUMB1-NEXT:    push {r4, lr}
+; THUMB1-NEXT:    movs r3, #1
+; THUMB1-NEXT:    ands r3, r0
+; THUMB1-NEXT:    rsbs r4, r3, #0
+; THUMB1-NEXT:    mov r0, r1
+; THUMB1-NEXT:    ands r0, r4
+; THUMB1-NEXT:    mov r1, r2
+; THUMB1-NEXT:    bics r1, r4
+; THUMB1-NEXT:    orrs r0, r1
+; THUMB1-NEXT:    pop {r4, pc}
+;
+; THUMB2-LABEL: ct_half:
+; THUMB2:       @ %bb.0: @ %entry
+; THUMB2-NEXT:    and r3, r0, #1
+; THUMB2-NEXT:    rsb.w r12, r3, #0
+; THUMB2-NEXT:    and.w r0, r1, r12
+; THUMB2-NEXT:    bic.w r12, r2, r12
+; THUMB2-NEXT:    orr.w r0, r0, r12
+; THUMB2-NEXT:    bx lr
+; THUMB2-NOT: it{{[te]+}}
 entry:
   %sel = call half @llvm.ct.select.f16(i1 %cond, half %a, half %b)
   ret half %sel
@@ -67,6 +93,30 @@ define bfloat @ct_bf16(i1 %cond, bfloat %a, bfloat %b) {
 ; F16-NATIVE-NEXT:    bic r12, r2, r12
 ; F16-NATIVE-NEXT:    orr r0, r0, r12
 ; F16-NATIVE-NEXT:    bx lr
+;
+; THUMB1-LABEL: ct_bf16:
+; THUMB1:       @ %bb.0: @ %entry
+; THUMB1-NEXT:    .save {r4, lr}
+; THUMB1-NEXT:    push {r4, lr}
+; THUMB1-NEXT:    movs r3, #1
+; THUMB1-NEXT:    ands r3, r0
+; THUMB1-NEXT:    rsbs r4, r3, #0
+; THUMB1-NEXT:    mov r0, r1
+; THUMB1-NEXT:    ands r0, r4
+; THUMB1-NEXT:    mov r1, r2
+; THUMB1-NEXT:    bics r1, r4
+; THUMB1-NEXT:    orrs r0, r1
+; THUMB1-NEXT:    pop {r4, pc}
+;
+; THUMB2-LABEL: ct_bf16:
+; THUMB2:       @ %bb.0: @ %entry
+; THUMB2-NEXT:    and r3, r0, #1
+; THUMB2-NEXT:    rsb.w r12, r3, #0
+; THUMB2-NEXT:    and.w r0, r1, r12
+; THUMB2-NEXT:    bic.w r12, r2, r12
+; THUMB2-NEXT:    orr.w r0, r0, r12
+; THUMB2-NEXT:    bx lr
+; THUMB2-NOT: it{{[te]+}}
 entry:
   %sel = call bfloat @llvm.ct.select.bf16(i1 %cond, bfloat %a, bfloat %b)
   ret bfloat %sel
@@ -156,6 +206,77 @@ define <4 x half> @ct_v4f16(i1 %cond, <4 x half> %a, <4 x half> %b) {
 ; F16-NATIVE-NEXT:    vmov.u16 r2, d18[2]
 ; F16-NATIVE-NEXT:    vmov.u16 r3, d18[3]
 ; F16-NATIVE-NEXT:    pop {r4, r5, r6, pc}
+;
+; THUMB1-LABEL: ct_v4f16:
+; THUMB1:       @ %bb.0: @ %entry
+; THUMB1-NEXT:    .save {r4, r5, r6, r7, lr}
+; THUMB1-NEXT:    push {r4, r5, r6, r7, lr}
+; THUMB1-NEXT:    .pad #4
+; THUMB1-NEXT:    sub sp, #4
+; THUMB1-NEXT:    movs r4, #1
+; THUMB1-NEXT:    ands r4, r0
+; THUMB1-NEXT:    ldr r1, [sp, #32]
+; THUMB1-NEXT:    rsbs r5, r4, #0
+; THUMB1-NEXT:    mov r0, r2
+; THUMB1-NEXT:    ands r0, r5
+; THUMB1-NEXT:    mov r2, r1
+; THUMB1-NEXT:    bics r2, r5
+; THUMB1-NEXT:    orrs r0, r2
+; THUMB1-NEXT:    ldr r2, [sp, #36]
+; THUMB1-NEXT:    rsbs r5, r4, #0
+; THUMB1-NEXT:    mov r1, r3
+; THUMB1-NEXT:    ands r1, r5
+; THUMB1-NEXT:    mov r3, r2
+; THUMB1-NEXT:    bics r3, r5
+; THUMB1-NEXT:    orrs r1, r3
+; THUMB1-NEXT:    ldr r3, [sp, #40]
+; THUMB1-NEXT:    ldr r5, [sp, #24]
+; THUMB1-NEXT:    rsbs r6, r4, #0
+; THUMB1-NEXT:    mov r2, r5
+; THUMB1-NEXT:    ands r2, r6
+; THUMB1-NEXT:    mov r5, r3
+; THUMB1-NEXT:    bics r5, r6
+; THUMB1-NEXT:    orrs r2, r5
+; THUMB1-NEXT:    ldr r5, [sp, #44]
+; THUMB1-NEXT:    ldr r6, [sp, #28]
+; THUMB1-NEXT:    rsbs r7, r4, #0
+; THUMB1-NEXT:    mov r3, r6
+; THUMB1-NEXT:    ands r3, r7
+; THUMB1-NEXT:    mov r6, r5
+; THUMB1-NEXT:    bics r6, r7
+; THUMB1-NEXT:    orrs r3, r6
+; THUMB1-NEXT:    add sp, #4
+; THUMB1-NEXT:    pop {r4, r5, r6, r7, pc}
+;
+; THUMB2-LABEL: ct_v4f16:
+; THUMB2:       @ %bb.0: @ %entry
+; THUMB2-NEXT:    .save {r4, r5, r7, lr}
+; THUMB2-NEXT:    push {r4, r5, r7, lr}
+; THUMB2-NEXT:    and r12, r0, #1
+; THUMB2-NEXT:    ldrh.w r1, [sp, #24]
+; THUMB2-NEXT:    rsb.w lr, r12, #0
+; THUMB2-NEXT:    rsb.w r4, r12, #0
+; THUMB2-NEXT:    and.w r0, r2, lr
+; THUMB2-NEXT:    bic.w lr, r1, lr
+; THUMB2-NEXT:    ldrh.w r2, [sp, #28]
+; THUMB2-NEXT:    orr.w r0, r0, lr
+; THUMB2-NEXT:    rsb.w lr, r12, #0
+; THUMB2-NEXT:    rsb.w r5, r12, #0
+; THUMB2-NEXT:    and.w r1, r3, lr
+; THUMB2-NEXT:    bic.w lr, r2, lr
+; THUMB2-NEXT:    orr.w r1, r1, lr
+; THUMB2-NEXT:    ldrh.w r3, [sp, #16]
+; THUMB2-NEXT:    ldrh.w lr, [sp, #32]
+; THUMB2-NEXT:    and.w r2, r3, r4
+; THUMB2-NEXT:    bic.w r4, lr, r4
+; THUMB2-NEXT:    ldrh.w lr, [sp, #36]
+; THUMB2-NEXT:    orrs r2, r4
+; THUMB2-NEXT:    ldrh.w r4, [sp, #20]
+; THUMB2-NEXT:    and.w r3, r4, r5
+; THUMB2-NEXT:    bic.w r5, lr, r5
+; THUMB2-NEXT:    orrs r3, r5
+; THUMB2-NEXT:    pop {r4, r5, r7, pc}
+; THUMB2-NOT:
it{{[te]+}} entry: %sel = call half @llvm.ct.select.f16(i1 %cond, half %a, half %b) ret half %sel @@ -67,6 +93,30 @@ define bfloat @ct_bf16(i1 %cond, bfloat %a, bfloat %b) { ; F16-NATIVE-NEXT: bic r12, r2, r12 ; F16-NATIVE-NEXT: orr r0, r0, r12 ; F16-NATIVE-NEXT: bx lr +; +; THUMB1-LABEL: ct_bf16: +; THUMB1: @ %bb.0: @ %entry +; THUMB1-NEXT: .save {r4, lr} +; THUMB1-NEXT: push {r4, lr} +; THUMB1-NEXT: movs r3, #1 +; THUMB1-NEXT: ands r3, r0 +; THUMB1-NEXT: rsbs r4, r3, #0 +; THUMB1-NEXT: mov r0, r1 +; THUMB1-NEXT: ands r0, r4 +; THUMB1-NEXT: mov r1, r2 +; THUMB1-NEXT: bics r1, r4 +; THUMB1-NEXT: orrs r0, r1 +; THUMB1-NEXT: pop {r4, pc} +; +; THUMB2-LABEL: ct_bf16: +; THUMB2: @ %bb.0: @ %entry +; THUMB2-NEXT: and r3, r0, #1 +; THUMB2-NEXT: rsb.w r12, r3, #0 +; THUMB2-NEXT: and.w r0, r1, r12 +; THUMB2-NEXT: bic.w r12, r2, r12 +; THUMB2-NEXT: orr.w r0, r0, r12 +; THUMB2-NEXT: bx lr +; THUMB2-NOT: it{{[te]+}} entry: %sel = call bfloat @llvm.ct.select.bf16(i1 %cond, bfloat %a, bfloat %b) ret bfloat %sel @@ -156,6 +206,77 @@ define <4 x half> @ct_v4f16(i1 %cond, <4 x half> %a, <4 x half> %b) { ; F16-NATIVE-NEXT: vmov.u16 r2, d18[2] ; F16-NATIVE-NEXT: vmov.u16 r3, d18[3] ; F16-NATIVE-NEXT: pop {r4, r5, r6, pc} +; +; THUMB1-LABEL: ct_v4f16: +; THUMB1: @ %bb.0: @ %entry +; THUMB1-NEXT: .save {r4, r5, r6, r7, lr} +; THUMB1-NEXT: push {r4, r5, r6, r7, lr} +; THUMB1-NEXT: .pad #4 +; THUMB1-NEXT: sub sp, #4 +; THUMB1-NEXT: movs r4, #1 +; THUMB1-NEXT: ands r4, r0 +; THUMB1-NEXT: ldr r1, [sp, #32] +; THUMB1-NEXT: rsbs r5, r4, #0 +; THUMB1-NEXT: mov r0, r2 +; THUMB1-NEXT: ands r0, r5 +; THUMB1-NEXT: mov r2, r1 +; THUMB1-NEXT: bics r2, r5 +; THUMB1-NEXT: orrs r0, r2 +; THUMB1-NEXT: ldr r2, [sp, #36] +; THUMB1-NEXT: rsbs r5, r4, #0 +; THUMB1-NEXT: mov r1, r3 +; THUMB1-NEXT: ands r1, r5 +; THUMB1-NEXT: mov r3, r2 +; THUMB1-NEXT: bics r3, r5 +; THUMB1-NEXT: orrs r1, r3 +; THUMB1-NEXT: ldr r3, [sp, #40] +; THUMB1-NEXT: ldr r5, [sp, #24] +; THUMB1-NEXT: rsbs r6, r4, #0 +; THUMB1-NEXT: mov r2, r5 +; THUMB1-NEXT: ands r2, r6 +; THUMB1-NEXT: mov r5, r3 +; THUMB1-NEXT: bics r5, r6 +; THUMB1-NEXT: orrs r2, r5 +; THUMB1-NEXT: ldr r5, [sp, #44] +; THUMB1-NEXT: ldr r6, [sp, #28] +; THUMB1-NEXT: rsbs r7, r4, #0 +; THUMB1-NEXT: mov r3, r6 +; THUMB1-NEXT: ands r3, r7 +; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: bics r6, r7 +; THUMB1-NEXT: orrs r3, r6 +; THUMB1-NEXT: add sp, #4 +; THUMB1-NEXT: pop {r4, r5, r6, r7, pc} +; +; THUMB2-LABEL: ct_v4f16: +; THUMB2: @ %bb.0: @ %entry +; THUMB2-NEXT: .save {r4, r5, r7, lr} +; THUMB2-NEXT: push {r4, r5, r7, lr} +; THUMB2-NEXT: and r12, r0, #1 +; THUMB2-NEXT: ldrh.w r1, [sp, #24] +; THUMB2-NEXT: rsb.w lr, r12, #0 +; THUMB2-NEXT: rsb.w r4, r12, #0 +; THUMB2-NEXT: and.w r0, r2, lr +; THUMB2-NEXT: bic.w lr, r1, lr +; THUMB2-NEXT: ldrh.w r2, [sp, #28] +; THUMB2-NEXT: orr.w r0, r0, lr +; THUMB2-NEXT: rsb.w lr, r12, #0 +; THUMB2-NEXT: rsb.w r5, r12, #0 +; THUMB2-NEXT: and.w r1, r3, lr +; THUMB2-NEXT: bic.w lr, r2, lr +; THUMB2-NEXT: orr.w r1, r1, lr +; THUMB2-NEXT: ldrh.w r3, [sp, #16] +; THUMB2-NEXT: ldrh.w lr, [sp, #32] +; THUMB2-NEXT: and.w r2, r3, r4 +; THUMB2-NEXT: bic.w r4, lr, r4 +; THUMB2-NEXT: ldrh.w lr, [sp, #36] +; THUMB2-NEXT: orrs r2, r4 +; THUMB2-NEXT: ldrh.w r4, [sp, #20] +; THUMB2-NEXT: and.w r3, r4, r5 +; THUMB2-NEXT: bic.w r5, lr, r5 +; THUMB2-NEXT: orrs r3, r5 +; THUMB2-NEXT: pop {r4, r5, r7, pc} +; THUMB2-NOT: it{{[te]+}} entry: %sel = call <4 x half> @llvm.ct.select.v4f16(i1 %cond, <4 x half> %a, <4 x half> %b) ret <4 x half> %sel @@ -230,6 +351,77 @@ define <4 x bfloat> 
@ct_v4bf16(i1 %cond, <4 x bfloat> %a, <4 x bfloat> %b) { ; F16-NATIVE-NEXT: vmov.u16 r2, d18[2] ; F16-NATIVE-NEXT: vmov.u16 r3, d18[3] ; F16-NATIVE-NEXT: pop {r4, r5, r6, pc} +; +; THUMB1-LABEL: ct_v4bf16: +; THUMB1: @ %bb.0: @ %entry +; THUMB1-NEXT: .save {r4, r5, r6, r7, lr} +; THUMB1-NEXT: push {r4, r5, r6, r7, lr} +; THUMB1-NEXT: .pad #4 +; THUMB1-NEXT: sub sp, #4 +; THUMB1-NEXT: movs r4, #1 +; THUMB1-NEXT: ands r4, r0 +; THUMB1-NEXT: ldr r1, [sp, #32] +; THUMB1-NEXT: rsbs r5, r4, #0 +; THUMB1-NEXT: mov r0, r2 +; THUMB1-NEXT: ands r0, r5 +; THUMB1-NEXT: mov r2, r1 +; THUMB1-NEXT: bics r2, r5 +; THUMB1-NEXT: orrs r0, r2 +; THUMB1-NEXT: ldr r2, [sp, #36] +; THUMB1-NEXT: rsbs r5, r4, #0 +; THUMB1-NEXT: mov r1, r3 +; THUMB1-NEXT: ands r1, r5 +; THUMB1-NEXT: mov r3, r2 +; THUMB1-NEXT: bics r3, r5 +; THUMB1-NEXT: orrs r1, r3 +; THUMB1-NEXT: ldr r3, [sp, #40] +; THUMB1-NEXT: ldr r5, [sp, #24] +; THUMB1-NEXT: rsbs r6, r4, #0 +; THUMB1-NEXT: mov r2, r5 +; THUMB1-NEXT: ands r2, r6 +; THUMB1-NEXT: mov r5, r3 +; THUMB1-NEXT: bics r5, r6 +; THUMB1-NEXT: orrs r2, r5 +; THUMB1-NEXT: ldr r5, [sp, #44] +; THUMB1-NEXT: ldr r6, [sp, #28] +; THUMB1-NEXT: rsbs r7, r4, #0 +; THUMB1-NEXT: mov r3, r6 +; THUMB1-NEXT: ands r3, r7 +; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: bics r6, r7 +; THUMB1-NEXT: orrs r3, r6 +; THUMB1-NEXT: add sp, #4 +; THUMB1-NEXT: pop {r4, r5, r6, r7, pc} +; +; THUMB2-LABEL: ct_v4bf16: +; THUMB2: @ %bb.0: @ %entry +; THUMB2-NEXT: .save {r4, r5, r7, lr} +; THUMB2-NEXT: push {r4, r5, r7, lr} +; THUMB2-NEXT: and r12, r0, #1 +; THUMB2-NEXT: ldrh.w r1, [sp, #24] +; THUMB2-NEXT: rsb.w lr, r12, #0 +; THUMB2-NEXT: rsb.w r4, r12, #0 +; THUMB2-NEXT: and.w r0, r2, lr +; THUMB2-NEXT: bic.w lr, r1, lr +; THUMB2-NEXT: ldrh.w r2, [sp, #28] +; THUMB2-NEXT: orr.w r0, r0, lr +; THUMB2-NEXT: rsb.w lr, r12, #0 +; THUMB2-NEXT: rsb.w r5, r12, #0 +; THUMB2-NEXT: and.w r1, r3, lr +; THUMB2-NEXT: bic.w lr, r2, lr +; THUMB2-NEXT: orr.w r1, r1, lr +; THUMB2-NEXT: ldrh.w r3, [sp, #16] +; THUMB2-NEXT: ldrh.w lr, [sp, #32] +; THUMB2-NEXT: and.w r2, r3, r4 +; THUMB2-NEXT: bic.w r4, lr, r4 +; THUMB2-NEXT: ldrh.w lr, [sp, #36] +; THUMB2-NEXT: orrs r2, r4 +; THUMB2-NEXT: ldrh.w r4, [sp, #20] +; THUMB2-NEXT: and.w r3, r4, r5 +; THUMB2-NEXT: bic.w r5, lr, r5 +; THUMB2-NEXT: orrs r3, r5 +; THUMB2-NEXT: pop {r4, r5, r7, pc} +; THUMB2-NOT: it{{[te]+}} entry: %sel = call <4 x bfloat> @llvm.ct.select.v4bf16(i1 %cond, <4 x bfloat> %a, <4 x bfloat> %b) ret <4 x bfloat> %sel @@ -364,6 +556,149 @@ define <8 x half> @ct_v8f16(i1 %cond, <8 x half> %a, <8 x half> %b) { ; F16-NATIVE-NEXT: vorr q10, q10, q11 ; F16-NATIVE-NEXT: vst1.64 {d20, d21}, [r0:128] ; F16-NATIVE-NEXT: pop {r4, r5, r6, r7, r8, pc} +; +; THUMB1-LABEL: ct_v8f16: +; THUMB1: @ %bb.0: @ %entry +; THUMB1-NEXT: .save {r4, r5, r6, r7, lr} +; THUMB1-NEXT: push {r4, r5, r6, r7, lr} +; THUMB1-NEXT: .pad #4 +; THUMB1-NEXT: sub sp, #4 +; THUMB1-NEXT: movs r4, #1 +; THUMB1-NEXT: ands r4, r1 +; THUMB1-NEXT: ldr r1, [sp, #76] +; THUMB1-NEXT: ldr r5, [sp, #44] +; THUMB1-NEXT: rsbs r7, r4, #0 +; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: ands r6, r7 +; THUMB1-NEXT: mov r5, r1 +; THUMB1-NEXT: bics r5, r7 +; THUMB1-NEXT: orrs r6, r5 +; THUMB1-NEXT: strh r6, [r0, #14] +; THUMB1-NEXT: ldr r1, [sp, #72] +; THUMB1-NEXT: ldr r5, [sp, #40] +; THUMB1-NEXT: rsbs r7, r4, #0 +; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: ands r6, r7 +; THUMB1-NEXT: mov r5, r1 +; THUMB1-NEXT: bics r5, r7 +; THUMB1-NEXT: orrs r6, r5 +; THUMB1-NEXT: strh r6, [r0, #12] +; THUMB1-NEXT: ldr r1, [sp, #68] +; THUMB1-NEXT: ldr 
r5, [sp, #36] +; THUMB1-NEXT: rsbs r7, r4, #0 +; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: ands r6, r7 +; THUMB1-NEXT: mov r5, r1 +; THUMB1-NEXT: bics r5, r7 +; THUMB1-NEXT: orrs r6, r5 +; THUMB1-NEXT: strh r6, [r0, #10] +; THUMB1-NEXT: ldr r1, [sp, #64] +; THUMB1-NEXT: ldr r5, [sp, #32] +; THUMB1-NEXT: rsbs r7, r4, #0 +; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: ands r6, r7 +; THUMB1-NEXT: mov r5, r1 +; THUMB1-NEXT: bics r5, r7 +; THUMB1-NEXT: orrs r6, r5 +; THUMB1-NEXT: strh r6, [r0, #8] +; THUMB1-NEXT: ldr r1, [sp, #60] +; THUMB1-NEXT: ldr r5, [sp, #28] +; THUMB1-NEXT: rsbs r7, r4, #0 +; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: ands r6, r7 +; THUMB1-NEXT: mov r5, r1 +; THUMB1-NEXT: bics r5, r7 +; THUMB1-NEXT: orrs r6, r5 +; THUMB1-NEXT: strh r6, [r0, #6] +; THUMB1-NEXT: ldr r1, [sp, #56] +; THUMB1-NEXT: ldr r5, [sp, #24] +; THUMB1-NEXT: rsbs r7, r4, #0 +; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: ands r6, r7 +; THUMB1-NEXT: mov r5, r1 +; THUMB1-NEXT: bics r5, r7 +; THUMB1-NEXT: orrs r6, r5 +; THUMB1-NEXT: strh r6, [r0, #4] +; THUMB1-NEXT: ldr r1, [sp, #52] +; THUMB1-NEXT: rsbs r6, r4, #0 +; THUMB1-NEXT: mov r5, r3 +; THUMB1-NEXT: ands r5, r6 +; THUMB1-NEXT: mov r3, r1 +; THUMB1-NEXT: bics r3, r6 +; THUMB1-NEXT: orrs r5, r3 +; THUMB1-NEXT: strh r5, [r0, #2] +; THUMB1-NEXT: ldr r1, [sp, #48] +; THUMB1-NEXT: rsbs r5, r4, #0 +; THUMB1-NEXT: mov r3, r2 +; THUMB1-NEXT: ands r3, r5 +; THUMB1-NEXT: mov r2, r1 +; THUMB1-NEXT: bics r2, r5 +; THUMB1-NEXT: orrs r3, r2 +; THUMB1-NEXT: strh r3, [r0] +; THUMB1-NEXT: add sp, #4 +; THUMB1-NEXT: pop {r4, r5, r6, r7, pc} +; +; THUMB2-LABEL: ct_v8f16: +; THUMB2: @ %bb.0: @ %entry +; THUMB2-NEXT: .save {r4, r5, r7, lr} +; THUMB2-NEXT: push {r4, r5, r7, lr} +; THUMB2-NEXT: and lr, r1, #1 +; THUMB2-NEXT: ldrh.w r12, [sp, #68] +; THUMB2-NEXT: ldrh.w r1, [sp, #36] +; THUMB2-NEXT: rsb.w r5, lr, #0 +; THUMB2-NEXT: and.w r4, r1, r5 +; THUMB2-NEXT: bic.w r5, r12, r5 +; THUMB2-NEXT: orrs r4, r5 +; THUMB2-NEXT: ldrh.w r12, [sp, #64] +; THUMB2-NEXT: ldrh.w r5, [sp, #32] +; THUMB2-NEXT: rsb.w r1, lr, #0 +; THUMB2-NEXT: strh r4, [r0, #14] +; THUMB2-NEXT: and.w r4, r5, r1 +; THUMB2-NEXT: bic.w r1, r12, r1 +; THUMB2-NEXT: ldrh.w r12, [sp, #60] +; THUMB2-NEXT: orrs r4, r1 +; THUMB2-NEXT: ldrh.w r5, [sp, #28] +; THUMB2-NEXT: rsb.w r1, lr, #0 +; THUMB2-NEXT: strh r4, [r0, #12] +; THUMB2-NEXT: and.w r4, r5, r1 +; THUMB2-NEXT: bic.w r1, r12, r1 +; THUMB2-NEXT: ldrh.w r12, [sp, #56] +; THUMB2-NEXT: orrs r4, r1 +; THUMB2-NEXT: ldrh.w r5, [sp, #24] +; THUMB2-NEXT: rsb.w r1, lr, #0 +; THUMB2-NEXT: strh r4, [r0, #10] +; THUMB2-NEXT: and.w r4, r5, r1 +; THUMB2-NEXT: bic.w r1, r12, r1 +; THUMB2-NEXT: ldrh.w r12, [sp, #52] +; THUMB2-NEXT: orrs r4, r1 +; THUMB2-NEXT: ldrh.w r5, [sp, #20] +; THUMB2-NEXT: rsb.w r1, lr, #0 +; THUMB2-NEXT: strh r4, [r0, #8] +; THUMB2-NEXT: and.w r4, r5, r1 +; THUMB2-NEXT: bic.w r1, r12, r1 +; THUMB2-NEXT: ldrh.w r12, [sp, #48] +; THUMB2-NEXT: orrs r4, r1 +; THUMB2-NEXT: ldrh.w r5, [sp, #16] +; THUMB2-NEXT: rsb.w r1, lr, #0 +; THUMB2-NEXT: strh r4, [r0, #6] +; THUMB2-NEXT: and.w r4, r5, r1 +; THUMB2-NEXT: bic.w r1, r12, r1 +; THUMB2-NEXT: orrs r4, r1 +; THUMB2-NEXT: ldrh.w r1, [sp, #44] +; THUMB2-NEXT: strh r4, [r0, #4] +; THUMB2-NEXT: rsb.w r4, lr, #0 +; THUMB2-NEXT: and.w r5, r3, r4 +; THUMB2-NEXT: bic.w r4, r1, r4 +; THUMB2-NEXT: orrs r5, r4 +; THUMB2-NEXT: ldrh.w r1, [sp, #40] +; THUMB2-NEXT: strh r5, [r0, #2] +; THUMB2-NEXT: rsb.w r5, lr, #0 +; THUMB2-NEXT: and.w r3, r2, r5 +; THUMB2-NEXT: bic.w r5, r1, r5 +; THUMB2-NEXT: orrs r3, r5 +; THUMB2-NEXT: strh r3, 
[r0] +; THUMB2-NEXT: pop {r4, r5, r7, pc} +; THUMB2-NOT: it{{[te]+}} entry: %sel = call <8 x half> @llvm.ct.select.v8f16(i1 %cond, <8 x half> %a, <8 x half> %b) ret <8 x half> %sel @@ -471,6 +806,149 @@ define <8 x bfloat> @ct_v8bf16(i1 %cond, <8 x bfloat> %a, <8 x bfloat> %b) { ; F16-NATIVE-NEXT: vorr q10, q10, q11 ; F16-NATIVE-NEXT: vst1.64 {d20, d21}, [r0:128] ; F16-NATIVE-NEXT: pop {r4, r5, r6, r7, r8, pc} +; +; THUMB1-LABEL: ct_v8bf16: +; THUMB1: @ %bb.0: @ %entry +; THUMB1-NEXT: .save {r4, r5, r6, r7, lr} +; THUMB1-NEXT: push {r4, r5, r6, r7, lr} +; THUMB1-NEXT: .pad #4 +; THUMB1-NEXT: sub sp, #4 +; THUMB1-NEXT: movs r4, #1 +; THUMB1-NEXT: ands r4, r1 +; THUMB1-NEXT: ldr r1, [sp, #76] +; THUMB1-NEXT: ldr r5, [sp, #44] +; THUMB1-NEXT: rsbs r7, r4, #0 +; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: ands r6, r7 +; THUMB1-NEXT: mov r5, r1 +; THUMB1-NEXT: bics r5, r7 +; THUMB1-NEXT: orrs r6, r5 +; THUMB1-NEXT: strh r6, [r0, #14] +; THUMB1-NEXT: ldr r1, [sp, #72] +; THUMB1-NEXT: ldr r5, [sp, #40] +; THUMB1-NEXT: rsbs r7, r4, #0 +; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: ands r6, r7 +; THUMB1-NEXT: mov r5, r1 +; THUMB1-NEXT: bics r5, r7 +; THUMB1-NEXT: orrs r6, r5 +; THUMB1-NEXT: strh r6, [r0, #12] +; THUMB1-NEXT: ldr r1, [sp, #68] +; THUMB1-NEXT: ldr r5, [sp, #36] +; THUMB1-NEXT: rsbs r7, r4, #0 +; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: ands r6, r7 +; THUMB1-NEXT: mov r5, r1 +; THUMB1-NEXT: bics r5, r7 +; THUMB1-NEXT: orrs r6, r5 +; THUMB1-NEXT: strh r6, [r0, #10] +; THUMB1-NEXT: ldr r1, [sp, #64] +; THUMB1-NEXT: ldr r5, [sp, #32] +; THUMB1-NEXT: rsbs r7, r4, #0 +; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: ands r6, r7 +; THUMB1-NEXT: mov r5, r1 +; THUMB1-NEXT: bics r5, r7 +; THUMB1-NEXT: orrs r6, r5 +; THUMB1-NEXT: strh r6, [r0, #8] +; THUMB1-NEXT: ldr r1, [sp, #60] +; THUMB1-NEXT: ldr r5, [sp, #28] +; THUMB1-NEXT: rsbs r7, r4, #0 +; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: ands r6, r7 +; THUMB1-NEXT: mov r5, r1 +; THUMB1-NEXT: bics r5, r7 +; THUMB1-NEXT: orrs r6, r5 +; THUMB1-NEXT: strh r6, [r0, #6] +; THUMB1-NEXT: ldr r1, [sp, #56] +; THUMB1-NEXT: ldr r5, [sp, #24] +; THUMB1-NEXT: rsbs r7, r4, #0 +; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: ands r6, r7 +; THUMB1-NEXT: mov r5, r1 +; THUMB1-NEXT: bics r5, r7 +; THUMB1-NEXT: orrs r6, r5 +; THUMB1-NEXT: strh r6, [r0, #4] +; THUMB1-NEXT: ldr r1, [sp, #52] +; THUMB1-NEXT: rsbs r6, r4, #0 +; THUMB1-NEXT: mov r5, r3 +; THUMB1-NEXT: ands r5, r6 +; THUMB1-NEXT: mov r3, r1 +; THUMB1-NEXT: bics r3, r6 +; THUMB1-NEXT: orrs r5, r3 +; THUMB1-NEXT: strh r5, [r0, #2] +; THUMB1-NEXT: ldr r1, [sp, #48] +; THUMB1-NEXT: rsbs r5, r4, #0 +; THUMB1-NEXT: mov r3, r2 +; THUMB1-NEXT: ands r3, r5 +; THUMB1-NEXT: mov r2, r1 +; THUMB1-NEXT: bics r2, r5 +; THUMB1-NEXT: orrs r3, r2 +; THUMB1-NEXT: strh r3, [r0] +; THUMB1-NEXT: add sp, #4 +; THUMB1-NEXT: pop {r4, r5, r6, r7, pc} +; +; THUMB2-LABEL: ct_v8bf16: +; THUMB2: @ %bb.0: @ %entry +; THUMB2-NEXT: .save {r4, r5, r7, lr} +; THUMB2-NEXT: push {r4, r5, r7, lr} +; THUMB2-NEXT: and lr, r1, #1 +; THUMB2-NEXT: ldrh.w r12, [sp, #68] +; THUMB2-NEXT: ldrh.w r1, [sp, #36] +; THUMB2-NEXT: rsb.w r5, lr, #0 +; THUMB2-NEXT: and.w r4, r1, r5 +; THUMB2-NEXT: bic.w r5, r12, r5 +; THUMB2-NEXT: orrs r4, r5 +; THUMB2-NEXT: ldrh.w r12, [sp, #64] +; THUMB2-NEXT: ldrh.w r5, [sp, #32] +; THUMB2-NEXT: rsb.w r1, lr, #0 +; THUMB2-NEXT: strh r4, [r0, #14] +; THUMB2-NEXT: and.w r4, r5, r1 +; THUMB2-NEXT: bic.w r1, r12, r1 +; THUMB2-NEXT: ldrh.w r12, [sp, #60] +; THUMB2-NEXT: orrs r4, r1 +; THUMB2-NEXT: ldrh.w r5, [sp, #28] +; THUMB2-NEXT: rsb.w r1, lr, 
#0 +; THUMB2-NEXT: strh r4, [r0, #12] +; THUMB2-NEXT: and.w r4, r5, r1 +; THUMB2-NEXT: bic.w r1, r12, r1 +; THUMB2-NEXT: ldrh.w r12, [sp, #56] +; THUMB2-NEXT: orrs r4, r1 +; THUMB2-NEXT: ldrh.w r5, [sp, #24] +; THUMB2-NEXT: rsb.w r1, lr, #0 +; THUMB2-NEXT: strh r4, [r0, #10] +; THUMB2-NEXT: and.w r4, r5, r1 +; THUMB2-NEXT: bic.w r1, r12, r1 +; THUMB2-NEXT: ldrh.w r12, [sp, #52] +; THUMB2-NEXT: orrs r4, r1 +; THUMB2-NEXT: ldrh.w r5, [sp, #20] +; THUMB2-NEXT: rsb.w r1, lr, #0 +; THUMB2-NEXT: strh r4, [r0, #8] +; THUMB2-NEXT: and.w r4, r5, r1 +; THUMB2-NEXT: bic.w r1, r12, r1 +; THUMB2-NEXT: ldrh.w r12, [sp, #48] +; THUMB2-NEXT: orrs r4, r1 +; THUMB2-NEXT: ldrh.w r5, [sp, #16] +; THUMB2-NEXT: rsb.w r1, lr, #0 +; THUMB2-NEXT: strh r4, [r0, #6] +; THUMB2-NEXT: and.w r4, r5, r1 +; THUMB2-NEXT: bic.w r1, r12, r1 +; THUMB2-NEXT: orrs r4, r1 +; THUMB2-NEXT: ldrh.w r1, [sp, #44] +; THUMB2-NEXT: strh r4, [r0, #4] +; THUMB2-NEXT: rsb.w r4, lr, #0 +; THUMB2-NEXT: and.w r5, r3, r4 +; THUMB2-NEXT: bic.w r4, r1, r4 +; THUMB2-NEXT: orrs r5, r4 +; THUMB2-NEXT: ldrh.w r1, [sp, #40] +; THUMB2-NEXT: strh r5, [r0, #2] +; THUMB2-NEXT: rsb.w r5, lr, #0 +; THUMB2-NEXT: and.w r3, r2, r5 +; THUMB2-NEXT: bic.w r5, r1, r5 +; THUMB2-NEXT: orrs r3, r5 +; THUMB2-NEXT: strh r3, [r0] +; THUMB2-NEXT: pop {r4, r5, r7, pc} +; THUMB2-NOT: it{{[te]+}} entry: %sel = call <8 x bfloat> @llvm.ct.select.v8bf16(i1 %cond, <8 x bfloat> %a, <8 x bfloat> %b) ret <8 x bfloat> %sel diff --git a/llvm/test/CodeGen/ARM/ctselect-vector.ll b/llvm/test/CodeGen/ARM/ctselect-vector.ll index 8afa8275d9aff..946c10b033fd9 100644 --- a/llvm/test/CodeGen/ARM/ctselect-vector.ll +++ b/llvm/test/CodeGen/ARM/ctselect-vector.ll @@ -1,6 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=armv7-none-eabi -verify-machineinstrs | FileCheck --check-prefixes=CT %s ; RUN: llc < %s -mtriple=armv6 -verify-machineinstrs | FileCheck --check-prefix=DEFAULT %s +; RUN: llc < %s -mtriple=thumbv6m-none-eabi -verify-machineinstrs | FileCheck --check-prefix=THUMB1 %s +; RUN: llc < %s -mtriple=thumbv7m-none-eabi -verify-machineinstrs | FileCheck --check-prefix=THUMB2 %s define <8 x i8> @ct_v8i8(i1 %cond, <8 x i8> %a, <8 x i8> %b) { ; CT-LABEL: ct_v8i8: @@ -75,6 +77,149 @@ define <8 x i8> @ct_v8i8(i1 %cond, <8 x i8> %a, <8 x i8> %b) { ; DEFAULT-NEXT: orr r3, r3, r5 ; DEFAULT-NEXT: strb r3, [r0] ; DEFAULT-NEXT: pop {r4, r5, r11, pc} +; +; THUMB1-LABEL: ct_v8i8: +; THUMB1: @ %bb.0: @ %entry +; THUMB1-NEXT: .save {r4, r5, r6, r7, lr} +; THUMB1-NEXT: push {r4, r5, r6, r7, lr} +; THUMB1-NEXT: .pad #4 +; THUMB1-NEXT: sub sp, #4 +; THUMB1-NEXT: movs r4, #1 +; THUMB1-NEXT: ands r4, r1 +; THUMB1-NEXT: ldr r1, [sp, #76] +; THUMB1-NEXT: ldr r5, [sp, #44] +; THUMB1-NEXT: rsbs r7, r4, #0 +; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: ands r6, r7 +; THUMB1-NEXT: mov r5, r1 +; THUMB1-NEXT: bics r5, r7 +; THUMB1-NEXT: orrs r6, r5 +; THUMB1-NEXT: strb r6, [r0, #7] +; THUMB1-NEXT: ldr r1, [sp, #72] +; THUMB1-NEXT: ldr r5, [sp, #40] +; THUMB1-NEXT: rsbs r7, r4, #0 +; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: ands r6, r7 +; THUMB1-NEXT: mov r5, r1 +; THUMB1-NEXT: bics r5, r7 +; THUMB1-NEXT: orrs r6, r5 +; THUMB1-NEXT: strb r6, [r0, #6] +; THUMB1-NEXT: ldr r1, [sp, #68] +; THUMB1-NEXT: ldr r5, [sp, #36] +; THUMB1-NEXT: rsbs r7, r4, #0 +; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: ands r6, r7 +; THUMB1-NEXT: mov r5, r1 +; THUMB1-NEXT: bics r5, r7 +; THUMB1-NEXT: orrs r6, r5 +; THUMB1-NEXT: strb r6, [r0, #5] +; THUMB1-NEXT: ldr 
r1, [sp, #64] +; THUMB1-NEXT: ldr r5, [sp, #32] +; THUMB1-NEXT: rsbs r7, r4, #0 +; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: ands r6, r7 +; THUMB1-NEXT: mov r5, r1 +; THUMB1-NEXT: bics r5, r7 +; THUMB1-NEXT: orrs r6, r5 +; THUMB1-NEXT: strb r6, [r0, #4] +; THUMB1-NEXT: ldr r1, [sp, #60] +; THUMB1-NEXT: ldr r5, [sp, #28] +; THUMB1-NEXT: rsbs r7, r4, #0 +; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: ands r6, r7 +; THUMB1-NEXT: mov r5, r1 +; THUMB1-NEXT: bics r5, r7 +; THUMB1-NEXT: orrs r6, r5 +; THUMB1-NEXT: strb r6, [r0, #3] +; THUMB1-NEXT: ldr r1, [sp, #56] +; THUMB1-NEXT: ldr r5, [sp, #24] +; THUMB1-NEXT: rsbs r7, r4, #0 +; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: ands r6, r7 +; THUMB1-NEXT: mov r5, r1 +; THUMB1-NEXT: bics r5, r7 +; THUMB1-NEXT: orrs r6, r5 +; THUMB1-NEXT: strb r6, [r0, #2] +; THUMB1-NEXT: ldr r1, [sp, #52] +; THUMB1-NEXT: rsbs r6, r4, #0 +; THUMB1-NEXT: mov r5, r3 +; THUMB1-NEXT: ands r5, r6 +; THUMB1-NEXT: mov r3, r1 +; THUMB1-NEXT: bics r3, r6 +; THUMB1-NEXT: orrs r5, r3 +; THUMB1-NEXT: strb r5, [r0, #1] +; THUMB1-NEXT: ldr r1, [sp, #48] +; THUMB1-NEXT: rsbs r5, r4, #0 +; THUMB1-NEXT: mov r3, r2 +; THUMB1-NEXT: ands r3, r5 +; THUMB1-NEXT: mov r2, r1 +; THUMB1-NEXT: bics r2, r5 +; THUMB1-NEXT: orrs r3, r2 +; THUMB1-NEXT: strb r3, [r0] +; THUMB1-NEXT: add sp, #4 +; THUMB1-NEXT: pop {r4, r5, r6, r7, pc} +; +; THUMB2-LABEL: ct_v8i8: +; THUMB2: @ %bb.0: @ %entry +; THUMB2-NEXT: .save {r4, r5, r7, lr} +; THUMB2-NEXT: push {r4, r5, r7, lr} +; THUMB2-NEXT: and lr, r1, #1 +; THUMB2-NEXT: ldrb.w r12, [sp, #68] +; THUMB2-NEXT: ldrb.w r1, [sp, #36] +; THUMB2-NEXT: rsb.w r5, lr, #0 +; THUMB2-NEXT: and.w r4, r1, r5 +; THUMB2-NEXT: bic.w r5, r12, r5 +; THUMB2-NEXT: orrs r4, r5 +; THUMB2-NEXT: ldrb.w r12, [sp, #64] +; THUMB2-NEXT: ldrb.w r5, [sp, #32] +; THUMB2-NEXT: rsb.w r1, lr, #0 +; THUMB2-NEXT: strb r4, [r0, #7] +; THUMB2-NEXT: and.w r4, r5, r1 +; THUMB2-NEXT: bic.w r1, r12, r1 +; THUMB2-NEXT: ldrb.w r12, [sp, #60] +; THUMB2-NEXT: orrs r4, r1 +; THUMB2-NEXT: ldrb.w r5, [sp, #28] +; THUMB2-NEXT: rsb.w r1, lr, #0 +; THUMB2-NEXT: strb r4, [r0, #6] +; THUMB2-NEXT: and.w r4, r5, r1 +; THUMB2-NEXT: bic.w r1, r12, r1 +; THUMB2-NEXT: ldrb.w r12, [sp, #56] +; THUMB2-NEXT: orrs r4, r1 +; THUMB2-NEXT: ldrb.w r5, [sp, #24] +; THUMB2-NEXT: rsb.w r1, lr, #0 +; THUMB2-NEXT: strb r4, [r0, #5] +; THUMB2-NEXT: and.w r4, r5, r1 +; THUMB2-NEXT: bic.w r1, r12, r1 +; THUMB2-NEXT: ldrb.w r12, [sp, #52] +; THUMB2-NEXT: orrs r4, r1 +; THUMB2-NEXT: ldrb.w r5, [sp, #20] +; THUMB2-NEXT: rsb.w r1, lr, #0 +; THUMB2-NEXT: strb r4, [r0, #4] +; THUMB2-NEXT: and.w r4, r5, r1 +; THUMB2-NEXT: bic.w r1, r12, r1 +; THUMB2-NEXT: ldrb.w r12, [sp, #48] +; THUMB2-NEXT: orrs r4, r1 +; THUMB2-NEXT: ldrb.w r5, [sp, #16] +; THUMB2-NEXT: rsb.w r1, lr, #0 +; THUMB2-NEXT: strb r4, [r0, #3] +; THUMB2-NEXT: and.w r4, r5, r1 +; THUMB2-NEXT: bic.w r1, r12, r1 +; THUMB2-NEXT: orrs r4, r1 +; THUMB2-NEXT: ldrb.w r1, [sp, #44] +; THUMB2-NEXT: strb r4, [r0, #2] +; THUMB2-NEXT: rsb.w r4, lr, #0 +; THUMB2-NEXT: and.w r5, r3, r4 +; THUMB2-NEXT: bic.w r4, r1, r4 +; THUMB2-NEXT: orrs r5, r4 +; THUMB2-NEXT: ldrb.w r1, [sp, #40] +; THUMB2-NEXT: strb r5, [r0, #1] +; THUMB2-NEXT: rsb.w r5, lr, #0 +; THUMB2-NEXT: and.w r3, r2, r5 +; THUMB2-NEXT: bic.w r5, r1, r5 +; THUMB2-NEXT: orrs r3, r5 +; THUMB2-NEXT: strb r3, [r0] +; THUMB2-NEXT: pop {r4, r5, r7, pc} +; THUMB2-NOT: it{{[te]+}} entry: %sel = call <8 x i8> @llvm.ct.select.v8i8(i1 %cond, <8 x i8> %a, <8 x i8> %b) ret <8 x i8> %sel @@ -121,6 +266,77 @@ define <4 x i16> @ct_v4i16(i1 %cond, <4 x i16> %a, 
<4 x i16> %b) { ; DEFAULT-NEXT: bic r5, lr, r5 ; DEFAULT-NEXT: orr r3, r3, r5 ; DEFAULT-NEXT: pop {r4, r5, r11, pc} +; +; THUMB1-LABEL: ct_v4i16: +; THUMB1: @ %bb.0: @ %entry +; THUMB1-NEXT: .save {r4, r5, r6, r7, lr} +; THUMB1-NEXT: push {r4, r5, r6, r7, lr} +; THUMB1-NEXT: .pad #4 +; THUMB1-NEXT: sub sp, #4 +; THUMB1-NEXT: movs r4, #1 +; THUMB1-NEXT: ands r4, r0 +; THUMB1-NEXT: ldr r1, [sp, #32] +; THUMB1-NEXT: rsbs r5, r4, #0 +; THUMB1-NEXT: mov r0, r2 +; THUMB1-NEXT: ands r0, r5 +; THUMB1-NEXT: mov r2, r1 +; THUMB1-NEXT: bics r2, r5 +; THUMB1-NEXT: orrs r0, r2 +; THUMB1-NEXT: ldr r2, [sp, #36] +; THUMB1-NEXT: rsbs r5, r4, #0 +; THUMB1-NEXT: mov r1, r3 +; THUMB1-NEXT: ands r1, r5 +; THUMB1-NEXT: mov r3, r2 +; THUMB1-NEXT: bics r3, r5 +; THUMB1-NEXT: orrs r1, r3 +; THUMB1-NEXT: ldr r3, [sp, #40] +; THUMB1-NEXT: ldr r5, [sp, #24] +; THUMB1-NEXT: rsbs r6, r4, #0 +; THUMB1-NEXT: mov r2, r5 +; THUMB1-NEXT: ands r2, r6 +; THUMB1-NEXT: mov r5, r3 +; THUMB1-NEXT: bics r5, r6 +; THUMB1-NEXT: orrs r2, r5 +; THUMB1-NEXT: ldr r5, [sp, #44] +; THUMB1-NEXT: ldr r6, [sp, #28] +; THUMB1-NEXT: rsbs r7, r4, #0 +; THUMB1-NEXT: mov r3, r6 +; THUMB1-NEXT: ands r3, r7 +; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: bics r6, r7 +; THUMB1-NEXT: orrs r3, r6 +; THUMB1-NEXT: add sp, #4 +; THUMB1-NEXT: pop {r4, r5, r6, r7, pc} +; +; THUMB2-LABEL: ct_v4i16: +; THUMB2: @ %bb.0: @ %entry +; THUMB2-NEXT: .save {r4, r5, r7, lr} +; THUMB2-NEXT: push {r4, r5, r7, lr} +; THUMB2-NEXT: and r12, r0, #1 +; THUMB2-NEXT: ldrh.w r1, [sp, #24] +; THUMB2-NEXT: rsb.w lr, r12, #0 +; THUMB2-NEXT: rsb.w r4, r12, #0 +; THUMB2-NEXT: and.w r0, r2, lr +; THUMB2-NEXT: bic.w lr, r1, lr +; THUMB2-NEXT: ldrh.w r2, [sp, #28] +; THUMB2-NEXT: orr.w r0, r0, lr +; THUMB2-NEXT: rsb.w lr, r12, #0 +; THUMB2-NEXT: rsb.w r5, r12, #0 +; THUMB2-NEXT: and.w r1, r3, lr +; THUMB2-NEXT: bic.w lr, r2, lr +; THUMB2-NEXT: orr.w r1, r1, lr +; THUMB2-NEXT: ldrh.w r3, [sp, #16] +; THUMB2-NEXT: ldrh.w lr, [sp, #32] +; THUMB2-NEXT: and.w r2, r3, r4 +; THUMB2-NEXT: bic.w r4, lr, r4 +; THUMB2-NEXT: ldrh.w lr, [sp, #36] +; THUMB2-NEXT: orrs r2, r4 +; THUMB2-NEXT: ldrh.w r4, [sp, #20] +; THUMB2-NEXT: and.w r3, r4, r5 +; THUMB2-NEXT: bic.w r5, lr, r5 +; THUMB2-NEXT: orrs r3, r5 +; THUMB2-NEXT: pop {r4, r5, r7, pc} +; THUMB2-NOT: it{{[te]+}} entry: %sel = call <4 x i16> @llvm.ct.select.v4i16(i1 %cond, <4 x i16> %a, <4 x i16> %b) ret <4 x i16> %sel @@ -155,6 +371,46 @@ define <2 x i32> @ct_v2i32(i1 %cond, <2 x i32> %a, <2 x i32> %b) { ; DEFAULT-NEXT: bic lr, r2, lr ; DEFAULT-NEXT: orr r1, r1, lr ; DEFAULT-NEXT: pop {r11, pc} +; +; THUMB1-LABEL: ct_v2i32: +; THUMB1: @ %bb.0: @ %entry +; THUMB1-NEXT: .save {r4, r5, r7, lr} +; THUMB1-NEXT: push {r4, r5, r7, lr} +; THUMB1-NEXT: movs r4, #1 +; THUMB1-NEXT: ands r4, r0 +; THUMB1-NEXT: ldr r1, [sp, #16] +; THUMB1-NEXT: rsbs r5, r4, #0 +; THUMB1-NEXT: mov r0, r2 +; THUMB1-NEXT: ands r0, r5 +; THUMB1-NEXT: mov r2, r1 +; THUMB1-NEXT: bics r2, r5 +; THUMB1-NEXT: orrs r0, r2 +; THUMB1-NEXT: ldr r2, [sp, #20] +; THUMB1-NEXT: rsbs r5, r4, #0 +; THUMB1-NEXT: mov r1, r3 +; THUMB1-NEXT: ands r1, r5 +; THUMB1-NEXT: mov r3, r2 +; THUMB1-NEXT: bics r3, r5 +; THUMB1-NEXT: orrs r1, r3 +; THUMB1-NEXT: pop {r4, r5, r7, pc} +; +; THUMB2-LABEL: ct_v2i32: +; THUMB2: @ %bb.0: @ %entry +; THUMB2-NEXT: .save {r7, lr} +; THUMB2-NEXT: push {r7, lr} +; THUMB2-NEXT: and r12, r0, #1 +; THUMB2-NEXT: ldr r1, [sp, #8] +; THUMB2-NEXT: rsb.w lr, r12, #0 +; THUMB2-NEXT: and.w r0, r2, lr +; THUMB2-NEXT: bic.w lr, r1, lr +; THUMB2-NEXT: ldr r2, [sp, #12] +; THUMB2-NEXT: 
orr.w r0, r0, lr +; THUMB2-NEXT: rsb.w lr, r12, #0 +; THUMB2-NEXT: and.w r1, r3, lr +; THUMB2-NEXT: bic.w lr, r2, lr +; THUMB2-NEXT: orr.w r1, r1, lr +; THUMB2-NEXT: pop {r7, pc} +; THUMB2-NOT: it{{[te]+}} entry: %sel = call <2 x i32> @llvm.ct.select.v2i32(i1 %cond, <2 x i32> %a, <2 x i32> %b) ret <2 x i32> %sel @@ -189,6 +445,46 @@ define <1 x i64> @ct_v1i64(i1 %cond, <1 x i64> %a, <1 x i64> %b) { ; DEFAULT-NEXT: bic lr, r2, lr ; DEFAULT-NEXT: orr r1, r1, lr ; DEFAULT-NEXT: pop {r11, pc} +; +; THUMB1-LABEL: ct_v1i64: +; THUMB1: @ %bb.0: @ %entry +; THUMB1-NEXT: .save {r4, r5, r7, lr} +; THUMB1-NEXT: push {r4, r5, r7, lr} +; THUMB1-NEXT: movs r4, #1 +; THUMB1-NEXT: ands r4, r0 +; THUMB1-NEXT: ldr r1, [sp, #16] +; THUMB1-NEXT: rsbs r5, r4, #0 +; THUMB1-NEXT: mov r0, r2 +; THUMB1-NEXT: ands r0, r5 +; THUMB1-NEXT: mov r2, r1 +; THUMB1-NEXT: bics r2, r5 +; THUMB1-NEXT: orrs r0, r2 +; THUMB1-NEXT: ldr r2, [sp, #20] +; THUMB1-NEXT: rsbs r5, r4, #0 +; THUMB1-NEXT: mov r1, r3 +; THUMB1-NEXT: ands r1, r5 +; THUMB1-NEXT: mov r3, r2 +; THUMB1-NEXT: bics r3, r5 +; THUMB1-NEXT: orrs r1, r3 +; THUMB1-NEXT: pop {r4, r5, r7, pc} +; +; THUMB2-LABEL: ct_v1i64: +; THUMB2: @ %bb.0: @ %entry +; THUMB2-NEXT: .save {r7, lr} +; THUMB2-NEXT: push {r7, lr} +; THUMB2-NEXT: and r12, r0, #1 +; THUMB2-NEXT: ldr r1, [sp, #8] +; THUMB2-NEXT: rsb.w lr, r12, #0 +; THUMB2-NEXT: and.w r0, r2, lr +; THUMB2-NEXT: bic.w lr, r1, lr +; THUMB2-NEXT: ldr r2, [sp, #12] +; THUMB2-NEXT: orr.w r0, r0, lr +; THUMB2-NEXT: rsb.w lr, r12, #0 +; THUMB2-NEXT: and.w r1, r3, lr +; THUMB2-NEXT: bic.w lr, r2, lr +; THUMB2-NEXT: orr.w r1, r1, lr +; THUMB2-NEXT: pop {r7, pc} +; THUMB2-NOT: it{{[te]+}} entry: %sel = call <1 x i64> @llvm.ct.select.v1i64(i1 %cond, <1 x i64> %a, <1 x i64> %b) ret <1 x i64> %sel @@ -223,6 +519,46 @@ define <2 x float> @ct_v2f32(i1 %cond, <2 x float> %a, <2 x float> %b) { ; DEFAULT-NEXT: bic lr, r2, lr ; DEFAULT-NEXT: orr r1, r1, lr ; DEFAULT-NEXT: pop {r11, pc} +; +; THUMB1-LABEL: ct_v2f32: +; THUMB1: @ %bb.0: @ %entry +; THUMB1-NEXT: .save {r4, r5, r7, lr} +; THUMB1-NEXT: push {r4, r5, r7, lr} +; THUMB1-NEXT: movs r4, #1 +; THUMB1-NEXT: ands r4, r0 +; THUMB1-NEXT: ldr r1, [sp, #16] +; THUMB1-NEXT: rsbs r5, r4, #0 +; THUMB1-NEXT: mov r0, r2 +; THUMB1-NEXT: ands r0, r5 +; THUMB1-NEXT: mov r2, r1 +; THUMB1-NEXT: bics r2, r5 +; THUMB1-NEXT: orrs r0, r2 +; THUMB1-NEXT: ldr r2, [sp, #20] +; THUMB1-NEXT: rsbs r5, r4, #0 +; THUMB1-NEXT: mov r1, r3 +; THUMB1-NEXT: ands r1, r5 +; THUMB1-NEXT: mov r3, r2 +; THUMB1-NEXT: bics r3, r5 +; THUMB1-NEXT: orrs r1, r3 +; THUMB1-NEXT: pop {r4, r5, r7, pc} +; +; THUMB2-LABEL: ct_v2f32: +; THUMB2: @ %bb.0: @ %entry +; THUMB2-NEXT: .save {r7, lr} +; THUMB2-NEXT: push {r7, lr} +; THUMB2-NEXT: and r12, r0, #1 +; THUMB2-NEXT: ldr r1, [sp, #8] +; THUMB2-NEXT: rsb.w lr, r12, #0 +; THUMB2-NEXT: and.w r0, r2, lr +; THUMB2-NEXT: bic.w lr, r1, lr +; THUMB2-NEXT: ldr r2, [sp, #12] +; THUMB2-NEXT: orr.w r0, r0, lr +; THUMB2-NEXT: rsb.w lr, r12, #0 +; THUMB2-NEXT: and.w r1, r3, lr +; THUMB2-NEXT: bic.w lr, r2, lr +; THUMB2-NEXT: orr.w r1, r1, lr +; THUMB2-NEXT: pop {r7, pc} +; THUMB2-NOT: it{{[te]+}} entry: %sel = call <2 x float> @llvm.ct.select.v2f32(i1 %cond, <2 x float> %a, <2 x float> %b) ret <2 x float> %sel @@ -360,6 +696,277 @@ define <16 x i8> @ct_v16i8(i1 %cond, <16 x i8> %a, <16 x i8> %b) { ; DEFAULT-NEXT: orr r3, r3, r5 ; DEFAULT-NEXT: strb r3, [r0] ; DEFAULT-NEXT: pop {r4, r5, r11, pc} +; +; THUMB1-LABEL: ct_v16i8: +; THUMB1: @ %bb.0: @ %entry +; THUMB1-NEXT: .save {r4, r5, r6, r7, lr} +; 
THUMB1-NEXT: push {r4, r5, r6, r7, lr} +; THUMB1-NEXT: .pad #4 +; THUMB1-NEXT: sub sp, #4 +; THUMB1-NEXT: movs r4, #1 +; THUMB1-NEXT: ands r4, r1 +; THUMB1-NEXT: ldr r1, [sp, #140] +; THUMB1-NEXT: ldr r5, [sp, #76] +; THUMB1-NEXT: rsbs r7, r4, #0 +; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: ands r6, r7 +; THUMB1-NEXT: mov r5, r1 +; THUMB1-NEXT: bics r5, r7 +; THUMB1-NEXT: orrs r6, r5 +; THUMB1-NEXT: strb r6, [r0, #15] +; THUMB1-NEXT: ldr r1, [sp, #136] +; THUMB1-NEXT: ldr r5, [sp, #72] +; THUMB1-NEXT: rsbs r7, r4, #0 +; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: ands r6, r7 +; THUMB1-NEXT: mov r5, r1 +; THUMB1-NEXT: bics r5, r7 +; THUMB1-NEXT: orrs r6, r5 +; THUMB1-NEXT: strb r6, [r0, #14] +; THUMB1-NEXT: ldr r1, [sp, #132] +; THUMB1-NEXT: ldr r5, [sp, #68] +; THUMB1-NEXT: rsbs r7, r4, #0 +; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: ands r6, r7 +; THUMB1-NEXT: mov r5, r1 +; THUMB1-NEXT: bics r5, r7 +; THUMB1-NEXT: orrs r6, r5 +; THUMB1-NEXT: strb r6, [r0, #13] +; THUMB1-NEXT: ldr r1, [sp, #128] +; THUMB1-NEXT: ldr r5, [sp, #64] +; THUMB1-NEXT: rsbs r7, r4, #0 +; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: ands r6, r7 +; THUMB1-NEXT: mov r5, r1 +; THUMB1-NEXT: bics r5, r7 +; THUMB1-NEXT: orrs r6, r5 +; THUMB1-NEXT: strb r6, [r0, #12] +; THUMB1-NEXT: ldr r1, [sp, #124] +; THUMB1-NEXT: ldr r5, [sp, #60] +; THUMB1-NEXT: rsbs r7, r4, #0 +; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: ands r6, r7 +; THUMB1-NEXT: mov r5, r1 +; THUMB1-NEXT: bics r5, r7 +; THUMB1-NEXT: orrs r6, r5 +; THUMB1-NEXT: strb r6, [r0, #11] +; THUMB1-NEXT: ldr r1, [sp, #120] +; THUMB1-NEXT: ldr r5, [sp, #56] +; THUMB1-NEXT: rsbs r7, r4, #0 +; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: ands r6, r7 +; THUMB1-NEXT: mov r5, r1 +; THUMB1-NEXT: bics r5, r7 +; THUMB1-NEXT: orrs r6, r5 +; THUMB1-NEXT: strb r6, [r0, #10] +; THUMB1-NEXT: ldr r1, [sp, #116] +; THUMB1-NEXT: ldr r5, [sp, #52] +; THUMB1-NEXT: rsbs r7, r4, #0 +; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: ands r6, r7 +; THUMB1-NEXT: mov r5, r1 +; THUMB1-NEXT: bics r5, r7 +; THUMB1-NEXT: orrs r6, r5 +; THUMB1-NEXT: strb r6, [r0, #9] +; THUMB1-NEXT: ldr r1, [sp, #112] +; THUMB1-NEXT: ldr r5, [sp, #48] +; THUMB1-NEXT: rsbs r7, r4, #0 +; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: ands r6, r7 +; THUMB1-NEXT: mov r5, r1 +; THUMB1-NEXT: bics r5, r7 +; THUMB1-NEXT: orrs r6, r5 +; THUMB1-NEXT: strb r6, [r0, #8] +; THUMB1-NEXT: ldr r1, [sp, #108] +; THUMB1-NEXT: ldr r5, [sp, #44] +; THUMB1-NEXT: rsbs r7, r4, #0 +; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: ands r6, r7 +; THUMB1-NEXT: mov r5, r1 +; THUMB1-NEXT: bics r5, r7 +; THUMB1-NEXT: orrs r6, r5 +; THUMB1-NEXT: strb r6, [r0, #7] +; THUMB1-NEXT: ldr r1, [sp, #104] +; THUMB1-NEXT: ldr r5, [sp, #40] +; THUMB1-NEXT: rsbs r7, r4, #0 +; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: ands r6, r7 +; THUMB1-NEXT: mov r5, r1 +; THUMB1-NEXT: bics r5, r7 +; THUMB1-NEXT: orrs r6, r5 +; THUMB1-NEXT: strb r6, [r0, #6] +; THUMB1-NEXT: ldr r1, [sp, #100] +; THUMB1-NEXT: ldr r5, [sp, #36] +; THUMB1-NEXT: rsbs r7, r4, #0 +; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: ands r6, r7 +; THUMB1-NEXT: mov r5, r1 +; THUMB1-NEXT: bics r5, r7 +; THUMB1-NEXT: orrs r6, r5 +; THUMB1-NEXT: strb r6, [r0, #5] +; THUMB1-NEXT: ldr r1, [sp, #96] +; THUMB1-NEXT: ldr r5, [sp, #32] +; THUMB1-NEXT: rsbs r7, r4, #0 +; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: ands r6, r7 +; THUMB1-NEXT: mov r5, r1 +; THUMB1-NEXT: bics r5, r7 +; THUMB1-NEXT: orrs r6, r5 +; THUMB1-NEXT: strb r6, [r0, #4] +; THUMB1-NEXT: ldr r1, [sp, #92] +; THUMB1-NEXT: ldr r5, [sp, #28] +; THUMB1-NEXT: rsbs r7, r4, #0 +; THUMB1-NEXT: mov r6, r5 
+; THUMB1-NEXT: ands r6, r7 +; THUMB1-NEXT: mov r5, r1 +; THUMB1-NEXT: bics r5, r7 +; THUMB1-NEXT: orrs r6, r5 +; THUMB1-NEXT: strb r6, [r0, #3] +; THUMB1-NEXT: ldr r1, [sp, #88] +; THUMB1-NEXT: ldr r5, [sp, #24] +; THUMB1-NEXT: rsbs r7, r4, #0 +; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: ands r6, r7 +; THUMB1-NEXT: mov r5, r1 +; THUMB1-NEXT: bics r5, r7 +; THUMB1-NEXT: orrs r6, r5 +; THUMB1-NEXT: strb r6, [r0, #2] +; THUMB1-NEXT: ldr r1, [sp, #84] +; THUMB1-NEXT: rsbs r6, r4, #0 +; THUMB1-NEXT: mov r5, r3 +; THUMB1-NEXT: ands r5, r6 +; THUMB1-NEXT: mov r3, r1 +; THUMB1-NEXT: bics r3, r6 +; THUMB1-NEXT: orrs r5, r3 +; THUMB1-NEXT: strb r5, [r0, #1] +; THUMB1-NEXT: ldr r1, [sp, #80] +; THUMB1-NEXT: rsbs r5, r4, #0 +; THUMB1-NEXT: mov r3, r2 +; THUMB1-NEXT: ands r3, r5 +; THUMB1-NEXT: mov r2, r1 +; THUMB1-NEXT: bics r2, r5 +; THUMB1-NEXT: orrs r3, r2 +; THUMB1-NEXT: strb r3, [r0] +; THUMB1-NEXT: add sp, #4 +; THUMB1-NEXT: pop {r4, r5, r6, r7, pc} +; +; THUMB2-LABEL: ct_v16i8: +; THUMB2: @ %bb.0: @ %entry +; THUMB2-NEXT: .save {r4, r5, r7, lr} +; THUMB2-NEXT: push {r4, r5, r7, lr} +; THUMB2-NEXT: and lr, r1, #1 +; THUMB2-NEXT: ldrb.w r12, [sp, #132] +; THUMB2-NEXT: ldrb.w r1, [sp, #68] +; THUMB2-NEXT: rsb.w r5, lr, #0 +; THUMB2-NEXT: and.w r4, r1, r5 +; THUMB2-NEXT: bic.w r5, r12, r5 +; THUMB2-NEXT: orrs r4, r5 +; THUMB2-NEXT: ldrb.w r12, [sp, #128] +; THUMB2-NEXT: ldrb.w r5, [sp, #64] +; THUMB2-NEXT: rsb.w r1, lr, #0 +; THUMB2-NEXT: strb r4, [r0, #15] +; THUMB2-NEXT: and.w r4, r5, r1 +; THUMB2-NEXT: bic.w r1, r12, r1 +; THUMB2-NEXT: ldrb.w r12, [sp, #124] +; THUMB2-NEXT: orrs r4, r1 +; THUMB2-NEXT: ldrb.w r5, [sp, #60] +; THUMB2-NEXT: rsb.w r1, lr, #0 +; THUMB2-NEXT: strb r4, [r0, #14] +; THUMB2-NEXT: and.w r4, r5, r1 +; THUMB2-NEXT: bic.w r1, r12, r1 +; THUMB2-NEXT: ldrb.w r12, [sp, #120] +; THUMB2-NEXT: orrs r4, r1 +; THUMB2-NEXT: ldrb.w r5, [sp, #56] +; THUMB2-NEXT: rsb.w r1, lr, #0 +; THUMB2-NEXT: strb r4, [r0, #13] +; THUMB2-NEXT: and.w r4, r5, r1 +; THUMB2-NEXT: bic.w r1, r12, r1 +; THUMB2-NEXT: ldrb.w r12, [sp, #116] +; THUMB2-NEXT: orrs r4, r1 +; THUMB2-NEXT: ldrb.w r5, [sp, #52] +; THUMB2-NEXT: rsb.w r1, lr, #0 +; THUMB2-NEXT: strb r4, [r0, #12] +; THUMB2-NEXT: and.w r4, r5, r1 +; THUMB2-NEXT: bic.w r1, r12, r1 +; THUMB2-NEXT: ldrb.w r12, [sp, #112] +; THUMB2-NEXT: orrs r4, r1 +; THUMB2-NEXT: ldrb.w r5, [sp, #48] +; THUMB2-NEXT: rsb.w r1, lr, #0 +; THUMB2-NEXT: strb r4, [r0, #11] +; THUMB2-NEXT: and.w r4, r5, r1 +; THUMB2-NEXT: bic.w r1, r12, r1 +; THUMB2-NEXT: ldrb.w r12, [sp, #108] +; THUMB2-NEXT: orrs r4, r1 +; THUMB2-NEXT: ldrb.w r5, [sp, #44] +; THUMB2-NEXT: rsb.w r1, lr, #0 +; THUMB2-NEXT: strb r4, [r0, #10] +; THUMB2-NEXT: and.w r4, r5, r1 +; THUMB2-NEXT: bic.w r1, r12, r1 +; THUMB2-NEXT: ldrb.w r12, [sp, #104] +; THUMB2-NEXT: orrs r4, r1 +; THUMB2-NEXT: ldrb.w r5, [sp, #40] +; THUMB2-NEXT: rsb.w r1, lr, #0 +; THUMB2-NEXT: strb r4, [r0, #9] +; THUMB2-NEXT: and.w r4, r5, r1 +; THUMB2-NEXT: bic.w r1, r12, r1 +; THUMB2-NEXT: ldrb.w r12, [sp, #100] +; THUMB2-NEXT: orrs r4, r1 +; THUMB2-NEXT: ldrb.w r5, [sp, #36] +; THUMB2-NEXT: rsb.w r1, lr, #0 +; THUMB2-NEXT: strb r4, [r0, #8] +; THUMB2-NEXT: and.w r4, r5, r1 +; THUMB2-NEXT: bic.w r1, r12, r1 +; THUMB2-NEXT: ldrb.w r12, [sp, #96] +; THUMB2-NEXT: orrs r4, r1 +; THUMB2-NEXT: ldrb.w r5, [sp, #32] +; THUMB2-NEXT: rsb.w r1, lr, #0 +; THUMB2-NEXT: strb r4, [r0, #7] +; THUMB2-NEXT: and.w r4, r5, r1 +; THUMB2-NEXT: bic.w r1, r12, r1 +; THUMB2-NEXT: ldrb.w r12, [sp, #92] +; THUMB2-NEXT: orrs r4, r1 +; THUMB2-NEXT: ldrb.w r5, [sp, #28] +; 
THUMB2-NEXT: rsb.w r1, lr, #0 +; THUMB2-NEXT: strb r4, [r0, #6] +; THUMB2-NEXT: and.w r4, r5, r1 +; THUMB2-NEXT: bic.w r1, r12, r1 +; THUMB2-NEXT: ldrb.w r12, [sp, #88] +; THUMB2-NEXT: orrs r4, r1 +; THUMB2-NEXT: ldrb.w r5, [sp, #24] +; THUMB2-NEXT: rsb.w r1, lr, #0 +; THUMB2-NEXT: strb r4, [r0, #5] +; THUMB2-NEXT: and.w r4, r5, r1 +; THUMB2-NEXT: bic.w r1, r12, r1 +; THUMB2-NEXT: ldrb.w r12, [sp, #84] +; THUMB2-NEXT: orrs r4, r1 +; THUMB2-NEXT: ldrb.w r5, [sp, #20] +; THUMB2-NEXT: rsb.w r1, lr, #0 +; THUMB2-NEXT: strb r4, [r0, #4] +; THUMB2-NEXT: and.w r4, r5, r1 +; THUMB2-NEXT: bic.w r1, r12, r1 +; THUMB2-NEXT: ldrb.w r12, [sp, #80] +; THUMB2-NEXT: orrs r4, r1 +; THUMB2-NEXT: ldrb.w r5, [sp, #16] +; THUMB2-NEXT: rsb.w r1, lr, #0 +; THUMB2-NEXT: strb r4, [r0, #3] +; THUMB2-NEXT: and.w r4, r5, r1 +; THUMB2-NEXT: bic.w r1, r12, r1 +; THUMB2-NEXT: orrs r4, r1 +; THUMB2-NEXT: ldrb.w r1, [sp, #76] +; THUMB2-NEXT: strb r4, [r0, #2] +; THUMB2-NEXT: rsb.w r4, lr, #0 +; THUMB2-NEXT: and.w r5, r3, r4 +; THUMB2-NEXT: bic.w r4, r1, r4 +; THUMB2-NEXT: orrs r5, r4 +; THUMB2-NEXT: ldrb.w r1, [sp, #72] +; THUMB2-NEXT: strb r5, [r0, #1] +; THUMB2-NEXT: rsb.w r5, lr, #0 +; THUMB2-NEXT: and.w r3, r2, r5 +; THUMB2-NEXT: bic.w r5, r1, r5 +; THUMB2-NEXT: orrs r3, r5 +; THUMB2-NEXT: strb r3, [r0] +; THUMB2-NEXT: pop {r4, r5, r7, pc} +; THUMB2-NOT: it{{[te]+}} entry: %sel = call <16 x i8> @llvm.ct.select.v16i8(i1 %cond, <16 x i8> %a, <16 x i8> %b) ret <16 x i8> %sel @@ -441,6 +1048,149 @@ define <8 x i16> @ct_v8i16(i1 %cond, <8 x i16> %a, <8 x i16> %b) { ; DEFAULT-NEXT: orr r3, r3, r5 ; DEFAULT-NEXT: strh r3, [r0] ; DEFAULT-NEXT: pop {r4, r5, r11, pc} +; +; THUMB1-LABEL: ct_v8i16: +; THUMB1: @ %bb.0: @ %entry +; THUMB1-NEXT: .save {r4, r5, r6, r7, lr} +; THUMB1-NEXT: push {r4, r5, r6, r7, lr} +; THUMB1-NEXT: .pad #4 +; THUMB1-NEXT: sub sp, #4 +; THUMB1-NEXT: movs r4, #1 +; THUMB1-NEXT: ands r4, r1 +; THUMB1-NEXT: ldr r1, [sp, #76] +; THUMB1-NEXT: ldr r5, [sp, #44] +; THUMB1-NEXT: rsbs r7, r4, #0 +; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: ands r6, r7 +; THUMB1-NEXT: mov r5, r1 +; THUMB1-NEXT: bics r5, r7 +; THUMB1-NEXT: orrs r6, r5 +; THUMB1-NEXT: strh r6, [r0, #14] +; THUMB1-NEXT: ldr r1, [sp, #72] +; THUMB1-NEXT: ldr r5, [sp, #40] +; THUMB1-NEXT: rsbs r7, r4, #0 +; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: ands r6, r7 +; THUMB1-NEXT: mov r5, r1 +; THUMB1-NEXT: bics r5, r7 +; THUMB1-NEXT: orrs r6, r5 +; THUMB1-NEXT: strh r6, [r0, #12] +; THUMB1-NEXT: ldr r1, [sp, #68] +; THUMB1-NEXT: ldr r5, [sp, #36] +; THUMB1-NEXT: rsbs r7, r4, #0 +; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: ands r6, r7 +; THUMB1-NEXT: mov r5, r1 +; THUMB1-NEXT: bics r5, r7 +; THUMB1-NEXT: orrs r6, r5 +; THUMB1-NEXT: strh r6, [r0, #10] +; THUMB1-NEXT: ldr r1, [sp, #64] +; THUMB1-NEXT: ldr r5, [sp, #32] +; THUMB1-NEXT: rsbs r7, r4, #0 +; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: ands r6, r7 +; THUMB1-NEXT: mov r5, r1 +; THUMB1-NEXT: bics r5, r7 +; THUMB1-NEXT: orrs r6, r5 +; THUMB1-NEXT: strh r6, [r0, #8] +; THUMB1-NEXT: ldr r1, [sp, #60] +; THUMB1-NEXT: ldr r5, [sp, #28] +; THUMB1-NEXT: rsbs r7, r4, #0 +; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: ands r6, r7 +; THUMB1-NEXT: mov r5, r1 +; THUMB1-NEXT: bics r5, r7 +; THUMB1-NEXT: orrs r6, r5 +; THUMB1-NEXT: strh r6, [r0, #6] +; THUMB1-NEXT: ldr r1, [sp, #56] +; THUMB1-NEXT: ldr r5, [sp, #24] +; THUMB1-NEXT: rsbs r7, r4, #0 +; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: ands r6, r7 +; THUMB1-NEXT: mov r5, r1 +; THUMB1-NEXT: bics r5, r7 +; THUMB1-NEXT: orrs r6, r5 +; THUMB1-NEXT: strh r6, [r0, #4] +; 
THUMB1-NEXT: ldr r1, [sp, #52] +; THUMB1-NEXT: rsbs r6, r4, #0 +; THUMB1-NEXT: mov r5, r3 +; THUMB1-NEXT: ands r5, r6 +; THUMB1-NEXT: mov r3, r1 +; THUMB1-NEXT: bics r3, r6 +; THUMB1-NEXT: orrs r5, r3 +; THUMB1-NEXT: strh r5, [r0, #2] +; THUMB1-NEXT: ldr r1, [sp, #48] +; THUMB1-NEXT: rsbs r5, r4, #0 +; THUMB1-NEXT: mov r3, r2 +; THUMB1-NEXT: ands r3, r5 +; THUMB1-NEXT: mov r2, r1 +; THUMB1-NEXT: bics r2, r5 +; THUMB1-NEXT: orrs r3, r2 +; THUMB1-NEXT: strh r3, [r0] +; THUMB1-NEXT: add sp, #4 +; THUMB1-NEXT: pop {r4, r5, r6, r7, pc} +; +; THUMB2-LABEL: ct_v8i16: +; THUMB2: @ %bb.0: @ %entry +; THUMB2-NEXT: .save {r4, r5, r7, lr} +; THUMB2-NEXT: push {r4, r5, r7, lr} +; THUMB2-NEXT: and lr, r1, #1 +; THUMB2-NEXT: ldrh.w r12, [sp, #68] +; THUMB2-NEXT: ldrh.w r1, [sp, #36] +; THUMB2-NEXT: rsb.w r5, lr, #0 +; THUMB2-NEXT: and.w r4, r1, r5 +; THUMB2-NEXT: bic.w r5, r12, r5 +; THUMB2-NEXT: orrs r4, r5 +; THUMB2-NEXT: ldrh.w r12, [sp, #64] +; THUMB2-NEXT: ldrh.w r5, [sp, #32] +; THUMB2-NEXT: rsb.w r1, lr, #0 +; THUMB2-NEXT: strh r4, [r0, #14] +; THUMB2-NEXT: and.w r4, r5, r1 +; THUMB2-NEXT: bic.w r1, r12, r1 +; THUMB2-NEXT: ldrh.w r12, [sp, #60] +; THUMB2-NEXT: orrs r4, r1 +; THUMB2-NEXT: ldrh.w r5, [sp, #28] +; THUMB2-NEXT: rsb.w r1, lr, #0 +; THUMB2-NEXT: strh r4, [r0, #12] +; THUMB2-NEXT: and.w r4, r5, r1 +; THUMB2-NEXT: bic.w r1, r12, r1 +; THUMB2-NEXT: ldrh.w r12, [sp, #56] +; THUMB2-NEXT: orrs r4, r1 +; THUMB2-NEXT: ldrh.w r5, [sp, #24] +; THUMB2-NEXT: rsb.w r1, lr, #0 +; THUMB2-NEXT: strh r4, [r0, #10] +; THUMB2-NEXT: and.w r4, r5, r1 +; THUMB2-NEXT: bic.w r1, r12, r1 +; THUMB2-NEXT: ldrh.w r12, [sp, #52] +; THUMB2-NEXT: orrs r4, r1 +; THUMB2-NEXT: ldrh.w r5, [sp, #20] +; THUMB2-NEXT: rsb.w r1, lr, #0 +; THUMB2-NEXT: strh r4, [r0, #8] +; THUMB2-NEXT: and.w r4, r5, r1 +; THUMB2-NEXT: bic.w r1, r12, r1 +; THUMB2-NEXT: ldrh.w r12, [sp, #48] +; THUMB2-NEXT: orrs r4, r1 +; THUMB2-NEXT: ldrh.w r5, [sp, #16] +; THUMB2-NEXT: rsb.w r1, lr, #0 +; THUMB2-NEXT: strh r4, [r0, #6] +; THUMB2-NEXT: and.w r4, r5, r1 +; THUMB2-NEXT: bic.w r1, r12, r1 +; THUMB2-NEXT: orrs r4, r1 +; THUMB2-NEXT: ldrh.w r1, [sp, #44] +; THUMB2-NEXT: strh r4, [r0, #4] +; THUMB2-NEXT: rsb.w r4, lr, #0 +; THUMB2-NEXT: and.w r5, r3, r4 +; THUMB2-NEXT: bic.w r4, r1, r4 +; THUMB2-NEXT: orrs r5, r4 +; THUMB2-NEXT: ldrh.w r1, [sp, #40] +; THUMB2-NEXT: strh r5, [r0, #2] +; THUMB2-NEXT: rsb.w r5, lr, #0 +; THUMB2-NEXT: and.w r3, r2, r5 +; THUMB2-NEXT: bic.w r5, r1, r5 +; THUMB2-NEXT: orrs r3, r5 +; THUMB2-NEXT: strh r3, [r0] +; THUMB2-NEXT: pop {r4, r5, r7, pc} +; THUMB2-NOT: it{{[te]+}} entry: %sel = call <8 x i16> @llvm.ct.select.v8i16(i1 %cond, <8 x i16> %a, <8 x i16> %b) ret <8 x i16> %sel @@ -490,6 +1240,77 @@ define <4 x i32> @ct_v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b) { ; DEFAULT-NEXT: bic r5, lr, r5 ; DEFAULT-NEXT: orr r3, r3, r5 ; DEFAULT-NEXT: pop {r4, r5, r11, pc} +; +; THUMB1-LABEL: ct_v4i32: +; THUMB1: @ %bb.0: @ %entry +; THUMB1-NEXT: .save {r4, r5, r6, r7, lr} +; THUMB1-NEXT: push {r4, r5, r6, r7, lr} +; THUMB1-NEXT: .pad #4 +; THUMB1-NEXT: sub sp, #4 +; THUMB1-NEXT: movs r4, #1 +; THUMB1-NEXT: ands r4, r0 +; THUMB1-NEXT: ldr r1, [sp, #32] +; THUMB1-NEXT: rsbs r5, r4, #0 +; THUMB1-NEXT: mov r0, r2 +; THUMB1-NEXT: ands r0, r5 +; THUMB1-NEXT: mov r2, r1 +; THUMB1-NEXT: bics r2, r5 +; THUMB1-NEXT: orrs r0, r2 +; THUMB1-NEXT: ldr r2, [sp, #36] +; THUMB1-NEXT: rsbs r5, r4, #0 +; THUMB1-NEXT: mov r1, r3 +; THUMB1-NEXT: ands r1, r5 +; THUMB1-NEXT: mov r3, r2 +; THUMB1-NEXT: bics r3, r5 +; THUMB1-NEXT: orrs r1, r3 +; 
THUMB1-NEXT: ldr r3, [sp, #40] +; THUMB1-NEXT: ldr r5, [sp, #24] +; THUMB1-NEXT: rsbs r6, r4, #0 +; THUMB1-NEXT: mov r2, r5 +; THUMB1-NEXT: ands r2, r6 +; THUMB1-NEXT: mov r5, r3 +; THUMB1-NEXT: bics r5, r6 +; THUMB1-NEXT: orrs r2, r5 +; THUMB1-NEXT: ldr r5, [sp, #44] +; THUMB1-NEXT: ldr r6, [sp, #28] +; THUMB1-NEXT: rsbs r7, r4, #0 +; THUMB1-NEXT: mov r3, r6 +; THUMB1-NEXT: ands r3, r7 +; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: bics r6, r7 +; THUMB1-NEXT: orrs r3, r6 +; THUMB1-NEXT: add sp, #4 +; THUMB1-NEXT: pop {r4, r5, r6, r7, pc} +; +; THUMB2-LABEL: ct_v4i32: +; THUMB2: @ %bb.0: @ %entry +; THUMB2-NEXT: .save {r4, r5, r7, lr} +; THUMB2-NEXT: push {r4, r5, r7, lr} +; THUMB2-NEXT: and r12, r0, #1 +; THUMB2-NEXT: ldr r1, [sp, #24] +; THUMB2-NEXT: rsb.w lr, r12, #0 +; THUMB2-NEXT: rsb.w r4, r12, #0 +; THUMB2-NEXT: and.w r0, r2, lr +; THUMB2-NEXT: bic.w lr, r1, lr +; THUMB2-NEXT: ldr r2, [sp, #28] +; THUMB2-NEXT: orr.w r0, r0, lr +; THUMB2-NEXT: rsb.w lr, r12, #0 +; THUMB2-NEXT: rsb.w r5, r12, #0 +; THUMB2-NEXT: and.w r1, r3, lr +; THUMB2-NEXT: bic.w lr, r2, lr +; THUMB2-NEXT: orr.w r1, r1, lr +; THUMB2-NEXT: ldr r3, [sp, #16] +; THUMB2-NEXT: ldr.w lr, [sp, #32] +; THUMB2-NEXT: and.w r2, r3, r4 +; THUMB2-NEXT: bic.w r4, lr, r4 +; THUMB2-NEXT: ldr.w lr, [sp, #36] +; THUMB2-NEXT: orrs r2, r4 +; THUMB2-NEXT: ldr r4, [sp, #20] +; THUMB2-NEXT: and.w r3, r4, r5 +; THUMB2-NEXT: bic.w r5, lr, r5 +; THUMB2-NEXT: orrs r3, r5 +; THUMB2-NEXT: pop {r4, r5, r7, pc} +; THUMB2-NOT: it{{[te]+}} entry: %sel = call <4 x i32> @llvm.ct.select.v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b) ret <4 x i32> %sel @@ -539,6 +1360,77 @@ define <2 x i64> @ct_v2i64(i1 %cond, <2 x i64> %a, <2 x i64> %b) { ; DEFAULT-NEXT: bic r5, lr, r5 ; DEFAULT-NEXT: orr r3, r3, r5 ; DEFAULT-NEXT: pop {r4, r5, r11, pc} +; +; THUMB1-LABEL: ct_v2i64: +; THUMB1: @ %bb.0: @ %entry +; THUMB1-NEXT: .save {r4, r5, r6, r7, lr} +; THUMB1-NEXT: push {r4, r5, r6, r7, lr} +; THUMB1-NEXT: .pad #4 +; THUMB1-NEXT: sub sp, #4 +; THUMB1-NEXT: movs r4, #1 +; THUMB1-NEXT: ands r4, r0 +; THUMB1-NEXT: ldr r1, [sp, #32] +; THUMB1-NEXT: rsbs r5, r4, #0 +; THUMB1-NEXT: mov r0, r2 +; THUMB1-NEXT: ands r0, r5 +; THUMB1-NEXT: mov r2, r1 +; THUMB1-NEXT: bics r2, r5 +; THUMB1-NEXT: orrs r0, r2 +; THUMB1-NEXT: ldr r2, [sp, #36] +; THUMB1-NEXT: rsbs r5, r4, #0 +; THUMB1-NEXT: mov r1, r3 +; THUMB1-NEXT: ands r1, r5 +; THUMB1-NEXT: mov r3, r2 +; THUMB1-NEXT: bics r3, r5 +; THUMB1-NEXT: orrs r1, r3 +; THUMB1-NEXT: ldr r3, [sp, #40] +; THUMB1-NEXT: ldr r5, [sp, #24] +; THUMB1-NEXT: rsbs r6, r4, #0 +; THUMB1-NEXT: mov r2, r5 +; THUMB1-NEXT: ands r2, r6 +; THUMB1-NEXT: mov r5, r3 +; THUMB1-NEXT: bics r5, r6 +; THUMB1-NEXT: orrs r2, r5 +; THUMB1-NEXT: ldr r5, [sp, #44] +; THUMB1-NEXT: ldr r6, [sp, #28] +; THUMB1-NEXT: rsbs r7, r4, #0 +; THUMB1-NEXT: mov r3, r6 +; THUMB1-NEXT: ands r3, r7 +; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: bics r6, r7 +; THUMB1-NEXT: orrs r3, r6 +; THUMB1-NEXT: add sp, #4 +; THUMB1-NEXT: pop {r4, r5, r6, r7, pc} +; +; THUMB2-LABEL: ct_v2i64: +; THUMB2: @ %bb.0: @ %entry +; THUMB2-NEXT: .save {r4, r5, r7, lr} +; THUMB2-NEXT: push {r4, r5, r7, lr} +; THUMB2-NEXT: and r12, r0, #1 +; THUMB2-NEXT: ldr r1, [sp, #24] +; THUMB2-NEXT: rsb.w lr, r12, #0 +; THUMB2-NEXT: rsb.w r4, r12, #0 +; THUMB2-NEXT: and.w r0, r2, lr +; THUMB2-NEXT: bic.w lr, r1, lr +; THUMB2-NEXT: ldr r2, [sp, #28] +; THUMB2-NEXT: orr.w r0, r0, lr +; THUMB2-NEXT: rsb.w lr, r12, #0 +; THUMB2-NEXT: rsb.w r5, r12, #0 +; THUMB2-NEXT: and.w r1, r3, lr +; THUMB2-NEXT: bic.w lr, r2, lr +; 
THUMB2-NEXT: orr.w r1, r1, lr +; THUMB2-NEXT: ldr r3, [sp, #16] +; THUMB2-NEXT: ldr.w lr, [sp, #32] +; THUMB2-NEXT: and.w r2, r3, r4 +; THUMB2-NEXT: bic.w r4, lr, r4 +; THUMB2-NEXT: ldr.w lr, [sp, #36] +; THUMB2-NEXT: orrs r2, r4 +; THUMB2-NEXT: ldr r4, [sp, #20] +; THUMB2-NEXT: and.w r3, r4, r5 +; THUMB2-NEXT: bic.w r5, lr, r5 +; THUMB2-NEXT: orrs r3, r5 +; THUMB2-NEXT: pop {r4, r5, r7, pc} +; THUMB2-NOT: it{{[te]+}} entry: %sel = call <2 x i64> @llvm.ct.select.v2i64(i1 %cond, <2 x i64> %a, <2 x i64> %b) ret <2 x i64> %sel @@ -588,6 +1480,77 @@ define <4 x float> @ct_v4f32(i1 %cond, <4 x float> %a, <4 x float> %b) { ; DEFAULT-NEXT: bic r5, lr, r5 ; DEFAULT-NEXT: orr r3, r3, r5 ; DEFAULT-NEXT: pop {r4, r5, r11, pc} +; +; THUMB1-LABEL: ct_v4f32: +; THUMB1: @ %bb.0: @ %entry +; THUMB1-NEXT: .save {r4, r5, r6, r7, lr} +; THUMB1-NEXT: push {r4, r5, r6, r7, lr} +; THUMB1-NEXT: .pad #4 +; THUMB1-NEXT: sub sp, #4 +; THUMB1-NEXT: movs r4, #1 +; THUMB1-NEXT: ands r4, r0 +; THUMB1-NEXT: ldr r1, [sp, #32] +; THUMB1-NEXT: rsbs r5, r4, #0 +; THUMB1-NEXT: mov r0, r2 +; THUMB1-NEXT: ands r0, r5 +; THUMB1-NEXT: mov r2, r1 +; THUMB1-NEXT: bics r2, r5 +; THUMB1-NEXT: orrs r0, r2 +; THUMB1-NEXT: ldr r2, [sp, #36] +; THUMB1-NEXT: rsbs r5, r4, #0 +; THUMB1-NEXT: mov r1, r3 +; THUMB1-NEXT: ands r1, r5 +; THUMB1-NEXT: mov r3, r2 +; THUMB1-NEXT: bics r3, r5 +; THUMB1-NEXT: orrs r1, r3 +; THUMB1-NEXT: ldr r3, [sp, #40] +; THUMB1-NEXT: ldr r5, [sp, #24] +; THUMB1-NEXT: rsbs r6, r4, #0 +; THUMB1-NEXT: mov r2, r5 +; THUMB1-NEXT: ands r2, r6 +; THUMB1-NEXT: mov r5, r3 +; THUMB1-NEXT: bics r5, r6 +; THUMB1-NEXT: orrs r2, r5 +; THUMB1-NEXT: ldr r5, [sp, #44] +; THUMB1-NEXT: ldr r6, [sp, #28] +; THUMB1-NEXT: rsbs r7, r4, #0 +; THUMB1-NEXT: mov r3, r6 +; THUMB1-NEXT: ands r3, r7 +; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: bics r6, r7 +; THUMB1-NEXT: orrs r3, r6 +; THUMB1-NEXT: add sp, #4 +; THUMB1-NEXT: pop {r4, r5, r6, r7, pc} +; +; THUMB2-LABEL: ct_v4f32: +; THUMB2: @ %bb.0: @ %entry +; THUMB2-NEXT: .save {r4, r5, r7, lr} +; THUMB2-NEXT: push {r4, r5, r7, lr} +; THUMB2-NEXT: and r12, r0, #1 +; THUMB2-NEXT: ldr r1, [sp, #24] +; THUMB2-NEXT: rsb.w lr, r12, #0 +; THUMB2-NEXT: rsb.w r4, r12, #0 +; THUMB2-NEXT: and.w r0, r2, lr +; THUMB2-NEXT: bic.w lr, r1, lr +; THUMB2-NEXT: ldr r2, [sp, #28] +; THUMB2-NEXT: orr.w r0, r0, lr +; THUMB2-NEXT: rsb.w lr, r12, #0 +; THUMB2-NEXT: rsb.w r5, r12, #0 +; THUMB2-NEXT: and.w r1, r3, lr +; THUMB2-NEXT: bic.w lr, r2, lr +; THUMB2-NEXT: orr.w r1, r1, lr +; THUMB2-NEXT: ldr r3, [sp, #16] +; THUMB2-NEXT: ldr.w lr, [sp, #32] +; THUMB2-NEXT: and.w r2, r3, r4 +; THUMB2-NEXT: bic.w r4, lr, r4 +; THUMB2-NEXT: ldr.w lr, [sp, #36] +; THUMB2-NEXT: orrs r2, r4 +; THUMB2-NEXT: ldr r4, [sp, #20] +; THUMB2-NEXT: and.w r3, r4, r5 +; THUMB2-NEXT: bic.w r5, lr, r5 +; THUMB2-NEXT: orrs r3, r5 +; THUMB2-NEXT: pop {r4, r5, r7, pc} +; THUMB2-NOT: it{{[te]+}} entry: %sel = call <4 x float> @llvm.ct.select.v4f32(i1 %cond, <4 x float> %a, <4 x float> %b) ret <4 x float> %sel @@ -637,6 +1600,77 @@ define <2 x double> @ct_v2f64(i1 %cond, <2 x double> %a, <2 x double> %b) { ; DEFAULT-NEXT: bic r5, lr, r5 ; DEFAULT-NEXT: orr r3, r3, r5 ; DEFAULT-NEXT: pop {r4, r5, r11, pc} +; +; THUMB1-LABEL: ct_v2f64: +; THUMB1: @ %bb.0: @ %entry +; THUMB1-NEXT: .save {r4, r5, r6, r7, lr} +; THUMB1-NEXT: push {r4, r5, r6, r7, lr} +; THUMB1-NEXT: .pad #4 +; THUMB1-NEXT: sub sp, #4 +; THUMB1-NEXT: movs r4, #1 +; THUMB1-NEXT: ands r4, r0 +; THUMB1-NEXT: ldr r1, [sp, #32] +; THUMB1-NEXT: rsbs r5, r4, #0 +; THUMB1-NEXT: mov r0, r2 +; 
THUMB1-NEXT: ands r0, r5 +; THUMB1-NEXT: mov r2, r1 +; THUMB1-NEXT: bics r2, r5 +; THUMB1-NEXT: orrs r0, r2 +; THUMB1-NEXT: ldr r2, [sp, #36] +; THUMB1-NEXT: rsbs r5, r4, #0 +; THUMB1-NEXT: mov r1, r3 +; THUMB1-NEXT: ands r1, r5 +; THUMB1-NEXT: mov r3, r2 +; THUMB1-NEXT: bics r3, r5 +; THUMB1-NEXT: orrs r1, r3 +; THUMB1-NEXT: ldr r3, [sp, #40] +; THUMB1-NEXT: ldr r5, [sp, #24] +; THUMB1-NEXT: rsbs r6, r4, #0 +; THUMB1-NEXT: mov r2, r5 +; THUMB1-NEXT: ands r2, r6 +; THUMB1-NEXT: mov r5, r3 +; THUMB1-NEXT: bics r5, r6 +; THUMB1-NEXT: orrs r2, r5 +; THUMB1-NEXT: ldr r5, [sp, #44] +; THUMB1-NEXT: ldr r6, [sp, #28] +; THUMB1-NEXT: rsbs r7, r4, #0 +; THUMB1-NEXT: mov r3, r6 +; THUMB1-NEXT: ands r3, r7 +; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: bics r6, r7 +; THUMB1-NEXT: orrs r3, r6 +; THUMB1-NEXT: add sp, #4 +; THUMB1-NEXT: pop {r4, r5, r6, r7, pc} +; +; THUMB2-LABEL: ct_v2f64: +; THUMB2: @ %bb.0: @ %entry +; THUMB2-NEXT: .save {r4, r5, r7, lr} +; THUMB2-NEXT: push {r4, r5, r7, lr} +; THUMB2-NEXT: and r12, r0, #1 +; THUMB2-NEXT: ldr r1, [sp, #24] +; THUMB2-NEXT: rsb.w lr, r12, #0 +; THUMB2-NEXT: rsb.w r4, r12, #0 +; THUMB2-NEXT: and.w r0, r2, lr +; THUMB2-NEXT: bic.w lr, r1, lr +; THUMB2-NEXT: ldr r2, [sp, #28] +; THUMB2-NEXT: orr.w r0, r0, lr +; THUMB2-NEXT: rsb.w lr, r12, #0 +; THUMB2-NEXT: rsb.w r5, r12, #0 +; THUMB2-NEXT: and.w r1, r3, lr +; THUMB2-NEXT: bic.w lr, r2, lr +; THUMB2-NEXT: orr.w r1, r1, lr +; THUMB2-NEXT: ldr r3, [sp, #16] +; THUMB2-NEXT: ldr.w lr, [sp, #32] +; THUMB2-NEXT: and.w r2, r3, r4 +; THUMB2-NEXT: bic.w r4, lr, r4 +; THUMB2-NEXT: ldr.w lr, [sp, #36] +; THUMB2-NEXT: orrs r2, r4 +; THUMB2-NEXT: ldr r4, [sp, #20] +; THUMB2-NEXT: and.w r3, r4, r5 +; THUMB2-NEXT: bic.w r5, lr, r5 +; THUMB2-NEXT: orrs r3, r5 +; THUMB2-NEXT: pop {r4, r5, r7, pc} +; THUMB2-NOT: it{{[te]+}} entry: %sel = call <2 x double> @llvm.ct.select.v2f64(i1 %cond, <2 x double> %a, <2 x double> %b) ret <2 x double> %sel @@ -663,6 +1697,30 @@ define <1 x i8> @ct_v1i8(i1 %cond, <1 x i8> %a, <1 x i8> %b) { ; DEFAULT-NEXT: bic r12, r2, r12 ; DEFAULT-NEXT: orr r0, r0, r12 ; DEFAULT-NEXT: bx lr +; +; THUMB1-LABEL: ct_v1i8: +; THUMB1: @ %bb.0: @ %entry +; THUMB1-NEXT: .save {r4, lr} +; THUMB1-NEXT: push {r4, lr} +; THUMB1-NEXT: movs r3, #1 +; THUMB1-NEXT: ands r3, r0 +; THUMB1-NEXT: rsbs r4, r3, #0 +; THUMB1-NEXT: mov r0, r1 +; THUMB1-NEXT: ands r0, r4 +; THUMB1-NEXT: mov r1, r2 +; THUMB1-NEXT: bics r1, r4 +; THUMB1-NEXT: orrs r0, r1 +; THUMB1-NEXT: pop {r4, pc} +; +; THUMB2-LABEL: ct_v1i8: +; THUMB2: @ %bb.0: @ %entry +; THUMB2-NEXT: and r3, r0, #1 +; THUMB2-NEXT: rsb.w r12, r3, #0 +; THUMB2-NEXT: and.w r0, r1, r12 +; THUMB2-NEXT: bic.w r12, r2, r12 +; THUMB2-NEXT: orr.w r0, r0, r12 +; THUMB2-NEXT: bx lr +; THUMB2-NOT: it{{[te]+}} entry: %sel = call <1 x i8> @llvm.ct.select.i8(i1 %cond, <1 x i8> %a, <1 x i8> %b) ret <1 x i8> %sel @@ -696,6 +1754,44 @@ define <2 x i8> @ct_v2i8(i1 %cond, <2 x i8> %a, <2 x i8> %b) { ; DEFAULT-NEXT: bic lr, r3, lr ; DEFAULT-NEXT: orr r1, r1, lr ; DEFAULT-NEXT: pop {r11, pc} +; +; THUMB1-LABEL: ct_v2i8: +; THUMB1: @ %bb.0: @ %entry +; THUMB1-NEXT: .save {r4, r5, r7, lr} +; THUMB1-NEXT: push {r4, r5, r7, lr} +; THUMB1-NEXT: movs r4, #1 +; THUMB1-NEXT: ands r4, r0 +; THUMB1-NEXT: rsbs r5, r4, #0 +; THUMB1-NEXT: mov r0, r1 +; THUMB1-NEXT: ands r0, r5 +; THUMB1-NEXT: mov r1, r3 +; THUMB1-NEXT: bics r1, r5 +; THUMB1-NEXT: orrs r0, r1 +; THUMB1-NEXT: ldr r3, [sp, #16] +; THUMB1-NEXT: rsbs r5, r4, #0 +; THUMB1-NEXT: mov r1, r2 +; THUMB1-NEXT: ands r1, r5 +; THUMB1-NEXT: mov r2, r3 +; 
THUMB1-NEXT: bics r2, r5 +; THUMB1-NEXT: orrs r1, r2 +; THUMB1-NEXT: pop {r4, r5, r7, pc} +; +; THUMB2-LABEL: ct_v2i8: +; THUMB2: @ %bb.0: @ %entry +; THUMB2-NEXT: .save {r7, lr} +; THUMB2-NEXT: push {r7, lr} +; THUMB2-NEXT: and r12, r0, #1 +; THUMB2-NEXT: rsb.w lr, r12, #0 +; THUMB2-NEXT: and.w r0, r1, lr +; THUMB2-NEXT: bic.w lr, r3, lr +; THUMB2-NEXT: ldrb.w r3, [sp, #8] +; THUMB2-NEXT: orr.w r0, r0, lr +; THUMB2-NEXT: rsb.w lr, r12, #0 +; THUMB2-NEXT: and.w r1, r2, lr +; THUMB2-NEXT: bic.w lr, r3, lr +; THUMB2-NEXT: orr.w r1, r1, lr +; THUMB2-NEXT: pop {r7, pc} +; THUMB2-NOT: it{{[te]+}} entry: %sel = call <2 x i8> @llvm.ct.select.i16(i1 %cond, <2 x i8> %a, <2 x i8> %b) ret <2 x i8> %sel @@ -741,6 +1837,72 @@ define <4 x i8> @ct_v4i8(i1 %cond, <4 x i8> %a, <4 x i8> %b) { ; DEFAULT-NEXT: bic r5, lr, r5 ; DEFAULT-NEXT: orr r3, r3, r5 ; DEFAULT-NEXT: pop {r4, r5, r11, pc} +; +; THUMB1-LABEL: ct_v4i8: +; THUMB1: @ %bb.0: @ %entry +; THUMB1-NEXT: .save {r4, r5, r6, r7, lr} +; THUMB1-NEXT: push {r4, r5, r6, r7, lr} +; THUMB1-NEXT: movs r4, #1 +; THUMB1-NEXT: ands r4, r0 +; THUMB1-NEXT: ldr r5, [sp, #24] +; THUMB1-NEXT: rsbs r6, r4, #0 +; THUMB1-NEXT: mov r0, r1 +; THUMB1-NEXT: ands r0, r6 +; THUMB1-NEXT: mov r1, r5 +; THUMB1-NEXT: bics r1, r6 +; THUMB1-NEXT: orrs r0, r1 +; THUMB1-NEXT: ldr r5, [sp, #28] +; THUMB1-NEXT: rsbs r6, r4, #0 +; THUMB1-NEXT: mov r1, r2 +; THUMB1-NEXT: ands r1, r6 +; THUMB1-NEXT: mov r2, r5 +; THUMB1-NEXT: bics r2, r6 +; THUMB1-NEXT: orrs r1, r2 +; THUMB1-NEXT: ldr r5, [sp, #32] +; THUMB1-NEXT: rsbs r6, r4, #0 +; THUMB1-NEXT: mov r2, r3 +; THUMB1-NEXT: ands r2, r6 +; THUMB1-NEXT: mov r3, r5 +; THUMB1-NEXT: bics r3, r6 +; THUMB1-NEXT: orrs r2, r3 +; THUMB1-NEXT: ldr r5, [sp, #36] +; THUMB1-NEXT: ldr r6, [sp, #20] +; THUMB1-NEXT: rsbs r7, r4, #0 +; THUMB1-NEXT: mov r3, r6 +; THUMB1-NEXT: ands r3, r7 +; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: bics r6, r7 +; THUMB1-NEXT: orrs r3, r6 +; THUMB1-NEXT: pop {r4, r5, r6, r7, pc} +; +; THUMB2-LABEL: ct_v4i8: +; THUMB2: @ %bb.0: @ %entry +; THUMB2-NEXT: .save {r4, r5, r7, lr} +; THUMB2-NEXT: push {r4, r5, r7, lr} +; THUMB2-NEXT: and r12, r0, #1 +; THUMB2-NEXT: ldrb.w lr, [sp, #20] +; THUMB2-NEXT: rsb.w r4, r12, #0 +; THUMB2-NEXT: rsb.w r5, r12, #0 +; THUMB2-NEXT: and.w r0, r1, r4 +; THUMB2-NEXT: bic.w r4, lr, r4 +; THUMB2-NEXT: orrs r0, r4 +; THUMB2-NEXT: ldrb.w r4, [sp, #24] +; THUMB2-NEXT: rsb.w lr, r12, #0 +; THUMB2-NEXT: and.w r1, r2, lr +; THUMB2-NEXT: bic.w lr, r4, lr +; THUMB2-NEXT: ldrb.w r4, [sp, #28] +; THUMB2-NEXT: orr.w r1, r1, lr +; THUMB2-NEXT: rsb.w lr, r12, #0 +; THUMB2-NEXT: and.w r2, r3, lr +; THUMB2-NEXT: bic.w lr, r4, lr +; THUMB2-NEXT: orr.w r2, r2, lr +; THUMB2-NEXT: ldrb.w r4, [sp, #16] +; THUMB2-NEXT: ldrb.w lr, [sp, #32] +; THUMB2-NEXT: and.w r3, r4, r5 +; THUMB2-NEXT: bic.w r5, lr, r5 +; THUMB2-NEXT: orrs r3, r5 +; THUMB2-NEXT: pop {r4, r5, r7, pc} +; THUMB2-NOT: it{{[te]+}} entry: %sel = call <4 x i8> @llvm.ct.select.i32(i1 %cond, <4 x i8> %a, <4 x i8> %b) ret <4 x i8> %sel @@ -764,6 +1926,30 @@ define <1 x i16> @ct_v1i16(i1 %cond, <1 x i16> %a, <1 x i16> %b) { ; DEFAULT-NEXT: bic r12, r2, r12 ; DEFAULT-NEXT: orr r0, r0, r12 ; DEFAULT-NEXT: bx lr +; +; THUMB1-LABEL: ct_v1i16: +; THUMB1: @ %bb.0: @ %entry +; THUMB1-NEXT: .save {r4, lr} +; THUMB1-NEXT: push {r4, lr} +; THUMB1-NEXT: movs r3, #1 +; THUMB1-NEXT: ands r3, r0 +; THUMB1-NEXT: rsbs r4, r3, #0 +; THUMB1-NEXT: mov r0, r1 +; THUMB1-NEXT: ands r0, r4 +; THUMB1-NEXT: mov r1, r2 +; THUMB1-NEXT: bics r1, r4 +; THUMB1-NEXT: orrs r0, r1 +; THUMB1-NEXT: 
pop {r4, pc} +; +; THUMB2-LABEL: ct_v1i16: +; THUMB2: @ %bb.0: @ %entry +; THUMB2-NEXT: and r3, r0, #1 +; THUMB2-NEXT: rsb.w r12, r3, #0 +; THUMB2-NEXT: and.w r0, r1, r12 +; THUMB2-NEXT: bic.w r12, r2, r12 +; THUMB2-NEXT: orr.w r0, r0, r12 +; THUMB2-NEXT: bx lr +; THUMB2-NOT: it{{[te]+}} entry: %sel = call <1 x i16> @llvm.ct.select.i16(i1 %cond, <1 x i16> %a, <1 x i16> %b) ret <1 x i16> %sel @@ -797,6 +1983,44 @@ define <2 x i16> @ct_v2i16(i1 %cond, <2 x i16> %a, <2 x i16> %b) { ; DEFAULT-NEXT: bic lr, r3, lr ; DEFAULT-NEXT: orr r1, r1, lr ; DEFAULT-NEXT: pop {r11, pc} +; +; THUMB1-LABEL: ct_v2i16: +; THUMB1: @ %bb.0: @ %entry +; THUMB1-NEXT: .save {r4, r5, r7, lr} +; THUMB1-NEXT: push {r4, r5, r7, lr} +; THUMB1-NEXT: movs r4, #1 +; THUMB1-NEXT: ands r4, r0 +; THUMB1-NEXT: rsbs r5, r4, #0 +; THUMB1-NEXT: mov r0, r1 +; THUMB1-NEXT: ands r0, r5 +; THUMB1-NEXT: mov r1, r3 +; THUMB1-NEXT: bics r1, r5 +; THUMB1-NEXT: orrs r0, r1 +; THUMB1-NEXT: ldr r3, [sp, #16] +; THUMB1-NEXT: rsbs r5, r4, #0 +; THUMB1-NEXT: mov r1, r2 +; THUMB1-NEXT: ands r1, r5 +; THUMB1-NEXT: mov r2, r3 +; THUMB1-NEXT: bics r2, r5 +; THUMB1-NEXT: orrs r1, r2 +; THUMB1-NEXT: pop {r4, r5, r7, pc} +; +; THUMB2-LABEL: ct_v2i16: +; THUMB2: @ %bb.0: @ %entry +; THUMB2-NEXT: .save {r7, lr} +; THUMB2-NEXT: push {r7, lr} +; THUMB2-NEXT: and r12, r0, #1 +; THUMB2-NEXT: rsb.w lr, r12, #0 +; THUMB2-NEXT: and.w r0, r1, lr +; THUMB2-NEXT: bic.w lr, r3, lr +; THUMB2-NEXT: ldrh.w r3, [sp, #8] +; THUMB2-NEXT: orr.w r0, r0, lr +; THUMB2-NEXT: rsb.w lr, r12, #0 +; THUMB2-NEXT: and.w r1, r2, lr +; THUMB2-NEXT: bic.w lr, r3, lr +; THUMB2-NEXT: orr.w r1, r1, lr +; THUMB2-NEXT: pop {r7, pc} +; THUMB2-NOT: it{{[te]+}} entry: %sel = call <2 x i16> @llvm.ct.select.i32(i1 %cond, <2 x i16> %a, <2 x i16> %b) ret <2 x i16> %sel @@ -820,6 +2044,30 @@ define <1 x i32> @ct_v1i32(i1 %cond, <1 x i32> %a, <1 x i32> %b) { ; DEFAULT-NEXT: bic r12, r2, r12 ; DEFAULT-NEXT: orr r0, r0, r12 ; DEFAULT-NEXT: bx lr +; +; THUMB1-LABEL: ct_v1i32: +; THUMB1: @ %bb.0: @ %entry +; THUMB1-NEXT: .save {r4, lr} +; THUMB1-NEXT: push {r4, lr} +; THUMB1-NEXT: movs r3, #1 +; THUMB1-NEXT: ands r3, r0 +; THUMB1-NEXT: rsbs r4, r3, #0 +; THUMB1-NEXT: mov r0, r1 +; THUMB1-NEXT: ands r0, r4 +; THUMB1-NEXT: mov r1, r2 +; THUMB1-NEXT: bics r1, r4 +; THUMB1-NEXT: orrs r0, r1 +; THUMB1-NEXT: pop {r4, pc} +; +; THUMB2-LABEL: ct_v1i32: +; THUMB2: @ %bb.0: @ %entry +; THUMB2-NEXT: and r3, r0, #1 +; THUMB2-NEXT: rsb.w r12, r3, #0 +; THUMB2-NEXT: and.w r0, r1, r12 +; THUMB2-NEXT: bic.w r12, r2, r12 +; THUMB2-NEXT: orr.w r0, r0, r12 +; THUMB2-NEXT: bx lr +; THUMB2-NOT: it{{[te]+}} entry: %sel = call <1 x i32> @llvm.ct.select.i32(i1 %cond, <1 x i32> %a, <1 x i32> %b) ret <1 x i32> %sel @@ -849,6 +2097,30 @@ define <1 x float> @ct_v1f32(i1 %cond, <1 x float> %a, <1 x float> %b) { ; DEFAULT-NEXT: bic r12, r2, r12 ; DEFAULT-NEXT: orr r0, r0, r12 ; DEFAULT-NEXT: bx lr +; +; THUMB1-LABEL: ct_v1f32: +; THUMB1: @ %bb.0: @ %entry +; THUMB1-NEXT: .save {r4, lr} +; THUMB1-NEXT: push {r4, lr} +; THUMB1-NEXT: movs r3, #1 +; THUMB1-NEXT: ands r3, r0 +; THUMB1-NEXT: rsbs r4, r3, #0 +; THUMB1-NEXT: mov r0, r1 +; THUMB1-NEXT: ands r0, r4 +; THUMB1-NEXT: mov r1, r2 +; THUMB1-NEXT: bics r1, r4 +; THUMB1-NEXT: orrs r0, r1 +; THUMB1-NEXT: pop {r4, pc} +; +; THUMB2-LABEL: ct_v1f32: +; THUMB2: @ %bb.0: @ %entry +; THUMB2-NEXT: and r3, r0, #1 +; THUMB2-NEXT: rsb.w r12, r3, #0 +; THUMB2-NEXT: and.w r0, r1, r12 +; THUMB2-NEXT: bic.w r12, r2, r12 +; THUMB2-NEXT: orr.w r0, r0, r12 +; THUMB2-NEXT: bx lr +; THUMB2-NOT: 
it{{[te]+}} entry: %sel = call <1 x float> @llvm.ct.select.f32(i1 %cond, <1 x float> %a, <1 x float> %b) ret <1 x float> %sel diff --git a/llvm/test/CodeGen/ARM/ctselect.ll b/llvm/test/CodeGen/ARM/ctselect.ll index e054f99cf0db8..0dec8ce4a9725 100644 --- a/llvm/test/CodeGen/ARM/ctselect.ll +++ b/llvm/test/CodeGen/ARM/ctselect.ll @@ -1,6 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=armv7-none-eabi -verify-machineinstrs | FileCheck --check-prefixes=CT %s ; RUN: llc < %s -mtriple=armv6 -verify-machineinstrs | FileCheck --check-prefix=DEFAULT %s +; RUN: llc < %s -mtriple=thumbv6m-none-eabi -verify-machineinstrs | FileCheck --check-prefix=THUMB1 %s +; RUN: llc < %s -mtriple=thumbv7m-none-eabi -verify-machineinstrs | FileCheck --check-prefix=THUMB2 %s +; RUN: llc < %s -mtriple=thumbv7-linux-gnueabihf -mcpu=cortex-a9 -verify-machineinstrs | FileCheck --check-prefix=CORTEXA9 %s +; RUN: llc < %s -mtriple=armv7-linux-gnueabihf -mcpu=cortex-a9 -verify-machineinstrs | FileCheck --check-prefix=CORTEX-NOTHUMB %s define i1 @ct_i1(i1 %cond, i1 %a, i1 %b) { ; CT-LABEL: ct_i1: @@ -20,6 +24,48 @@ define i1 @ct_i1(i1 %cond, i1 %a, i1 %b) { ; DEFAULT-NEXT: bic r12, r2, r12 ; DEFAULT-NEXT: orr r0, r0, r12 ; DEFAULT-NEXT: bx lr +; +; THUMB1-LABEL: ct_i1: +; THUMB1: @ %bb.0: @ %entry +; THUMB1-NEXT: .save {r4, lr} +; THUMB1-NEXT: push {r4, lr} +; THUMB1-NEXT: movs r3, #1 +; THUMB1-NEXT: ands r3, r0 +; THUMB1-NEXT: rsbs r4, r3, #0 +; THUMB1-NEXT: mov r0, r1 +; THUMB1-NEXT: ands r0, r4 +; THUMB1-NEXT: mov r1, r2 +; THUMB1-NEXT: bics r1, r4 +; THUMB1-NEXT: orrs r0, r1 +; THUMB1-NEXT: pop {r4, pc} +; +; THUMB2-LABEL: ct_i1: +; THUMB2: @ %bb.0: @ %entry +; THUMB2-NEXT: and r3, r0, #1 +; THUMB2-NEXT: rsb.w r12, r3, #0 +; THUMB2-NEXT: and.w r0, r1, r12 +; THUMB2-NEXT: bic.w r12, r2, r12 +; THUMB2-NEXT: orr.w r0, r0, r12 +; THUMB2-NEXT: bx lr +; THUMB2-NOT: it{{[te]+}} +; +; CORTEXA9-LABEL: ct_i1: +; CORTEXA9: @ %bb.0: @ %entry +; CORTEXA9-NEXT: and r3, r0, #1 +; CORTEXA9-NEXT: rsb.w r12, r3, #0 +; CORTEXA9-NEXT: and.w r0, r1, r12 +; CORTEXA9-NEXT: bic.w r12, r2, r12 +; CORTEXA9-NEXT: orr.w r0, r0, r12 +; CORTEXA9-NEXT: bx lr +; +; CORTEX-NOTHUMB-LABEL: ct_i1: +; CORTEX-NOTHUMB: @ %bb.0: @ %entry +; CORTEX-NOTHUMB-NEXT: and r3, r0, #1 +; CORTEX-NOTHUMB-NEXT: rsb r12, r3, #0 +; CORTEX-NOTHUMB-NEXT: and r0, r1, r12 +; CORTEX-NOTHUMB-NEXT: bic r12, r2, r12 +; CORTEX-NOTHUMB-NEXT: orr r0, r0, r12 +; CORTEX-NOTHUMB-NEXT: bx lr entry: %sel = call i1 @llvm.ct.select.i1(i1 %cond, i1 %a, i1 %b) ret i1 %sel @@ -43,6 +89,48 @@ define i8 @ct_int8(i1 %cond, i8 %a, i8 %b) { ; DEFAULT-NEXT: bic r12, r2, r12 ; DEFAULT-NEXT: orr r0, r0, r12 ; DEFAULT-NEXT: bx lr +; +; THUMB1-LABEL: ct_int8: +; THUMB1: @ %bb.0: @ %entry +; THUMB1-NEXT: .save {r4, lr} +; THUMB1-NEXT: push {r4, lr} +; THUMB1-NEXT: movs r3, #1 +; THUMB1-NEXT: ands r3, r0 +; THUMB1-NEXT: rsbs r4, r3, #0 +; THUMB1-NEXT: mov r0, r1 +; THUMB1-NEXT: ands r0, r4 +; THUMB1-NEXT: mov r1, r2 +; THUMB1-NEXT: bics r1, r4 +; THUMB1-NEXT: orrs r0, r1 +; THUMB1-NEXT: pop {r4, pc} +; +; THUMB2-LABEL: ct_int8: +; THUMB2: @ %bb.0: @ %entry +; THUMB2-NEXT: and r3, r0, #1 +; THUMB2-NEXT: rsb.w r12, r3, #0 +; THUMB2-NEXT: and.w r0, r1, r12 +; THUMB2-NEXT: bic.w r12, r2, r12 +; THUMB2-NEXT: orr.w r0, r0, r12 +; THUMB2-NEXT: bx lr +; THUMB2-NOT: it{{[te]+}} +; +; CORTEXA9-LABEL: ct_int8: +; CORTEXA9: @ %bb.0: @ %entry +; CORTEXA9-NEXT: and r3, r0, #1 +; CORTEXA9-NEXT: rsb.w r12, r3, #0 +; CORTEXA9-NEXT: and.w 
r0, r1, r12 +; CORTEXA9-NEXT: bic.w r12, r2, r12 +; CORTEXA9-NEXT: orr.w r0, r0, r12 +; CORTEXA9-NEXT: bx lr +; +; CORTEX-NOTHUMB-LABEL: ct_int8: +; CORTEX-NOTHUMB: @ %bb.0: @ %entry +; CORTEX-NOTHUMB-NEXT: and r3, r0, #1 +; CORTEX-NOTHUMB-NEXT: rsb r12, r3, #0 +; CORTEX-NOTHUMB-NEXT: and r0, r1, r12 +; CORTEX-NOTHUMB-NEXT: bic r12, r2, r12 +; CORTEX-NOTHUMB-NEXT: orr r0, r0, r12 +; CORTEX-NOTHUMB-NEXT: bx lr entry: %sel = call i8 @llvm.ct.select.i8(i1 %cond, i8 %a, i8 %b) ret i8 %sel @@ -66,6 +154,48 @@ define i16 @ct_int16(i1 %cond, i16 %a, i16 %b) { ; DEFAULT-NEXT: bic r12, r2, r12 ; DEFAULT-NEXT: orr r0, r0, r12 ; DEFAULT-NEXT: bx lr +; +; THUMB1-LABEL: ct_int16: +; THUMB1: @ %bb.0: @ %entry +; THUMB1-NEXT: .save {r4, lr} +; THUMB1-NEXT: push {r4, lr} +; THUMB1-NEXT: movs r3, #1 +; THUMB1-NEXT: ands r3, r0 +; THUMB1-NEXT: rsbs r4, r3, #0 +; THUMB1-NEXT: mov r0, r1 +; THUMB1-NEXT: ands r0, r4 +; THUMB1-NEXT: mov r1, r2 +; THUMB1-NEXT: bics r1, r4 +; THUMB1-NEXT: orrs r0, r1 +; THUMB1-NEXT: pop {r4, pc} +; +; THUMB2-LABEL: ct_int16: +; THUMB2: @ %bb.0: @ %entry +; THUMB2-NEXT: and r3, r0, #1 +; THUMB2-NEXT: rsb.w r12, r3, #0 +; THUMB2-NEXT: and.w r0, r1, r12 +; THUMB2-NEXT: bic.w r12, r2, r12 +; THUMB2-NEXT: orr.w r0, r0, r12 +; THUMB2-NEXT: bx lr +; THUMB2-NOT: it{{[te]+}} +; +; CORTEXA9-LABEL: ct_int16: +; CORTEXA9: @ %bb.0: @ %entry +; CORTEXA9-NEXT: and r3, r0, #1 +; CORTEXA9-NEXT: rsb.w r12, r3, #0 +; CORTEXA9-NEXT: and.w r0, r1, r12 +; CORTEXA9-NEXT: bic.w r12, r2, r12 +; CORTEXA9-NEXT: orr.w r0, r0, r12 +; CORTEXA9-NEXT: bx lr +; +; CORTEX-NOTHUMB-LABEL: ct_int16: +; CORTEX-NOTHUMB: @ %bb.0: @ %entry +; CORTEX-NOTHUMB-NEXT: and r3, r0, #1 +; CORTEX-NOTHUMB-NEXT: rsb r12, r3, #0 +; CORTEX-NOTHUMB-NEXT: and r0, r1, r12 +; CORTEX-NOTHUMB-NEXT: bic r12, r2, r12 +; CORTEX-NOTHUMB-NEXT: orr r0, r0, r12 +; CORTEX-NOTHUMB-NEXT: bx lr entry: %sel = call i16 @llvm.ct.select.i16(i1 %cond, i16 %a, i16 %b) ret i16 %sel @@ -89,6 +219,48 @@ define i32 @ct_int32(i1 %cond, i32 %a, i32 %b) { ; DEFAULT-NEXT: bic r12, r2, r12 ; DEFAULT-NEXT: orr r0, r0, r12 ; DEFAULT-NEXT: bx lr +; +; THUMB1-LABEL: ct_int32: +; THUMB1: @ %bb.0: @ %entry +; THUMB1-NEXT: .save {r4, lr} +; THUMB1-NEXT: push {r4, lr} +; THUMB1-NEXT: movs r3, #1 +; THUMB1-NEXT: ands r3, r0 +; THUMB1-NEXT: rsbs r4, r3, #0 +; THUMB1-NEXT: mov r0, r1 +; THUMB1-NEXT: ands r0, r4 +; THUMB1-NEXT: mov r1, r2 +; THUMB1-NEXT: bics r1, r4 +; THUMB1-NEXT: orrs r0, r1 +; THUMB1-NEXT: pop {r4, pc} +; +; THUMB2-LABEL: ct_int32: +; THUMB2: @ %bb.0: @ %entry +; THUMB2-NEXT: and r3, r0, #1 +; THUMB2-NEXT: rsb.w r12, r3, #0 +; THUMB2-NEXT: and.w r0, r1, r12 +; THUMB2-NEXT: bic.w r12, r2, r12 +; THUMB2-NEXT: orr.w r0, r0, r12 +; THUMB2-NEXT: bx lr +; THUMB2-NOT: it{{[te]+}} +; +; CORTEXA9-LABEL: ct_int32: +; CORTEXA9: @ %bb.0: @ %entry +; CORTEXA9-NEXT: and r3, r0, #1 +; CORTEXA9-NEXT: rsb.w r12, r3, #0 +; CORTEXA9-NEXT: and.w r0, r1, r12 +; CORTEXA9-NEXT: bic.w r12, r2, r12 +; CORTEXA9-NEXT: orr.w r0, r0, r12 +; CORTEXA9-NEXT: bx lr +; +; CORTEX-NOTHUMB-LABEL: ct_int32: +; CORTEX-NOTHUMB: @ %bb.0: @ %entry +; CORTEX-NOTHUMB-NEXT: and r3, r0, #1 +; CORTEX-NOTHUMB-NEXT: rsb r12, r3, #0 +; CORTEX-NOTHUMB-NEXT: and r0, r1, r12 +; CORTEX-NOTHUMB-NEXT: bic r12, r2, r12 +; CORTEX-NOTHUMB-NEXT: orr r0, r0, r12 +; CORTEX-NOTHUMB-NEXT: bx lr entry: %sel = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) ret i32 %sel @@ -127,6 +299,79 @@ define i64 @ct_int64(i1 %cond, i64 %a, i64 %b) { ; DEFAULT-NEXT: bic lr, r2, lr ; DEFAULT-NEXT: orr r1, r1, lr ; 
DEFAULT-NEXT: pop {r11, pc} +; +; THUMB1-LABEL: ct_int64: +; THUMB1: @ %bb.0: @ %entry +; THUMB1-NEXT: .save {r4, r5, r7, lr} +; THUMB1-NEXT: push {r4, r5, r7, lr} +; THUMB1-NEXT: movs r4, #1 +; THUMB1-NEXT: ands r4, r0 +; THUMB1-NEXT: ldr r1, [sp, #16] +; THUMB1-NEXT: rsbs r5, r4, #0 +; THUMB1-NEXT: mov r0, r2 +; THUMB1-NEXT: ands r0, r5 +; THUMB1-NEXT: mov r2, r1 +; THUMB1-NEXT: bics r2, r5 +; THUMB1-NEXT: orrs r0, r2 +; THUMB1-NEXT: ldr r2, [sp, #20] +; THUMB1-NEXT: rsbs r5, r4, #0 +; THUMB1-NEXT: mov r1, r3 +; THUMB1-NEXT: ands r1, r5 +; THUMB1-NEXT: mov r3, r2 +; THUMB1-NEXT: bics r3, r5 +; THUMB1-NEXT: orrs r1, r3 +; THUMB1-NEXT: pop {r4, r5, r7, pc} +; +; THUMB2-LABEL: ct_int64: +; THUMB2: @ %bb.0: @ %entry +; THUMB2-NEXT: .save {r7, lr} +; THUMB2-NEXT: push {r7, lr} +; THUMB2-NEXT: and r12, r0, #1 +; THUMB2-NEXT: ldr r1, [sp, #8] +; THUMB2-NEXT: rsb.w lr, r12, #0 +; THUMB2-NEXT: and.w r0, r2, lr +; THUMB2-NEXT: bic.w lr, r1, lr +; THUMB2-NEXT: ldr r2, [sp, #12] +; THUMB2-NEXT: orr.w r0, r0, lr +; THUMB2-NEXT: rsb.w lr, r12, #0 +; THUMB2-NEXT: and.w r1, r3, lr +; THUMB2-NEXT: bic.w lr, r2, lr +; THUMB2-NEXT: orr.w r1, r1, lr +; THUMB2-NEXT: pop {r7, pc} +; THUMB2-NOT: it{{[te]+}} +; +; CORTEXA9-LABEL: ct_int64: +; CORTEXA9: @ %bb.0: @ %entry +; CORTEXA9-NEXT: .save {r4, lr} +; CORTEXA9-NEXT: push {r4, lr} +; CORTEXA9-NEXT: and lr, r0, #1 +; CORTEXA9-NEXT: ldrd r1, r12, [sp, #8] +; CORTEXA9-NEXT: rsb.w r4, lr, #0 +; CORTEXA9-NEXT: and.w r0, r2, r4 +; CORTEXA9-NEXT: rsb.w r2, lr, #0 +; CORTEXA9-NEXT: bic.w r4, r1, r4 +; CORTEXA9-NEXT: and.w r1, r3, r2 +; CORTEXA9-NEXT: bic.w r2, r12, r2 +; CORTEXA9-NEXT: orrs r0, r4 +; CORTEXA9-NEXT: orr.w r1, r1, r2 +; CORTEXA9-NEXT: pop {r4, pc} +; +; CORTEX-NOTHUMB-LABEL: ct_int64: +; CORTEX-NOTHUMB: @ %bb.0: @ %entry +; CORTEX-NOTHUMB-NEXT: .save {r4, lr} +; CORTEX-NOTHUMB-NEXT: push {r4, lr} +; CORTEX-NOTHUMB-NEXT: and lr, r0, #1 +; CORTEX-NOTHUMB-NEXT: ldr r12, [sp, #12] +; CORTEX-NOTHUMB-NEXT: ldr r1, [sp, #8] +; CORTEX-NOTHUMB-NEXT: rsb r4, lr, #0 +; CORTEX-NOTHUMB-NEXT: and r0, r2, r4 +; CORTEX-NOTHUMB-NEXT: rsb r2, lr, #0 +; CORTEX-NOTHUMB-NEXT: bic r4, r1, r4 +; CORTEX-NOTHUMB-NEXT: and r1, r3, r2 +; CORTEX-NOTHUMB-NEXT: bic r2, r12, r2 +; CORTEX-NOTHUMB-NEXT: orr r0, r0, r4 +; CORTEX-NOTHUMB-NEXT: orr r1, r1, r2 +; CORTEX-NOTHUMB-NEXT: pop {r4, pc} entry: %sel = call i64 @llvm.ct.select.i64(i1 %cond, i64 %a, i64 %b) ret i64 %sel @@ -156,6 +401,56 @@ define float @ct_float(i1 %cond, float %a, float %b) { ; DEFAULT-NEXT: bic r12, r2, r12 ; DEFAULT-NEXT: orr r0, r0, r12 ; DEFAULT-NEXT: bx lr +; +; THUMB1-LABEL: ct_float: +; THUMB1: @ %bb.0: @ %entry +; THUMB1-NEXT: .save {r4, lr} +; THUMB1-NEXT: push {r4, lr} +; THUMB1-NEXT: movs r3, #1 +; THUMB1-NEXT: ands r3, r0 +; THUMB1-NEXT: rsbs r4, r3, #0 +; THUMB1-NEXT: mov r0, r1 +; THUMB1-NEXT: ands r0, r4 +; THUMB1-NEXT: mov r1, r2 +; THUMB1-NEXT: bics r1, r4 +; THUMB1-NEXT: orrs r0, r1 +; THUMB1-NEXT: pop {r4, pc} +; +; THUMB2-LABEL: ct_float: +; THUMB2: @ %bb.0: @ %entry +; THUMB2-NEXT: and r3, r0, #1 +; THUMB2-NEXT: rsb.w r12, r3, #0 +; THUMB2-NEXT: and.w r0, r1, r12 +; THUMB2-NEXT: bic.w r12, r2, r12 +; THUMB2-NEXT: orr.w r0, r0, r12 +; THUMB2-NEXT: bx lr +; THUMB2-NOT: it{{[te]+}} +; +; CORTEXA9-LABEL: ct_float: +; CORTEXA9: @ %bb.0: @ %entry +; CORTEXA9-NEXT: and r0, r0, #1 +; CORTEXA9-NEXT: vmov r2, s0 +; CORTEXA9-NEXT: vmov r3, s1 +; CORTEXA9-NEXT: rsbs r1, r0, #0 +; CORTEXA9-NEXT: ands r2, r1 +; CORTEXA9-NEXT: bic.w r1, r3, r1 +; CORTEXA9-NEXT: orrs r2, r1 +; CORTEXA9-NEXT: vmov s2, r2 +; 
CORTEXA9-NEXT: vmov.f32 s0, s2
+; CORTEXA9-NEXT: bx lr
+;
+; CORTEX-NOTHUMB-LABEL: ct_float:
+; CORTEX-NOTHUMB: @ %bb.0: @ %entry
+; CORTEX-NOTHUMB-NEXT: and r0, r0, #1
+; CORTEX-NOTHUMB-NEXT: vmov r2, s0
+; CORTEX-NOTHUMB-NEXT: vmov r3, s1
+; CORTEX-NOTHUMB-NEXT: rsb r1, r0, #0
+; CORTEX-NOTHUMB-NEXT: and r2, r2, r1
+; CORTEX-NOTHUMB-NEXT: bic r1, r3, r1
+; CORTEX-NOTHUMB-NEXT: orr r2, r2, r1
+; CORTEX-NOTHUMB-NEXT: vmov s2, r2
+; CORTEX-NOTHUMB-NEXT: vmov.f32 s0, s2
+; CORTEX-NOTHUMB-NEXT: bx lr
 entry:
   %sel = call float @llvm.ct.select.f32(i1 %cond, float %a, float %b)
   ret float %sel
@@ -190,6 +485,68 @@ define double @ct_f64(i1 %cond, double %a, double %b) {
 ; DEFAULT-NEXT: bic lr, r2, lr
 ; DEFAULT-NEXT: orr r1, r1, lr
 ; DEFAULT-NEXT: pop {r11, pc}
+;
+; THUMB1-LABEL: ct_f64:
+; THUMB1: @ %bb.0: @ %entry
+; THUMB1-NEXT: .save {r4, r5, r7, lr}
+; THUMB1-NEXT: push {r4, r5, r7, lr}
+; THUMB1-NEXT: movs r4, #1
+; THUMB1-NEXT: ands r4, r0
+; THUMB1-NEXT: ldr r1, [sp, #16]
+; THUMB1-NEXT: rsbs r5, r4, #0
+; THUMB1-NEXT: mov r0, r2
+; THUMB1-NEXT: ands r0, r5
+; THUMB1-NEXT: mov r2, r1
+; THUMB1-NEXT: bics r2, r5
+; THUMB1-NEXT: orrs r0, r2
+; THUMB1-NEXT: ldr r2, [sp, #20]
+; THUMB1-NEXT: rsbs r5, r4, #0
+; THUMB1-NEXT: mov r1, r3
+; THUMB1-NEXT: ands r1, r5
+; THUMB1-NEXT: mov r3, r2
+; THUMB1-NEXT: bics r3, r5
+; THUMB1-NEXT: orrs r1, r3
+; THUMB1-NEXT: pop {r4, r5, r7, pc}
+;
+; THUMB2-LABEL: ct_f64:
+; THUMB2: @ %bb.0: @ %entry
+; THUMB2-NEXT: .save {r7, lr}
+; THUMB2-NEXT: push {r7, lr}
+; THUMB2-NEXT: and r12, r0, #1
+; THUMB2-NEXT: ldr r1, [sp, #8]
+; THUMB2-NEXT: rsb.w lr, r12, #0
+; THUMB2-NEXT: and.w r0, r2, lr
+; THUMB2-NEXT: bic.w lr, r1, lr
+; THUMB2-NEXT: ldr r2, [sp, #12]
+; THUMB2-NEXT: orr.w r0, r0, lr
+; THUMB2-NEXT: rsb.w lr, r12, #0
+; THUMB2-NEXT: and.w r1, r3, lr
+; THUMB2-NEXT: bic.w lr, r2, lr
+; THUMB2-NEXT: orr.w r1, r1, lr
+; THUMB2-NEXT: pop {r7, pc}
+; THUMB2-NOT: it{{[te]+}}
+;
+; CORTEXA9-LABEL: ct_f64:
+; CORTEXA9: @ %bb.0: @ %entry
+; CORTEXA9-NEXT: and r0, r0, #1
+; CORTEXA9-NEXT: rsbs r1, r0, #0
+; CORTEXA9-NEXT: vdup.32 d17, r1
+; CORTEXA9-NEXT: vand d16, d0, d17
+; CORTEXA9-NEXT: vbic d17, d1, d17
+; CORTEXA9-NEXT: vorr d16, d16, d17
+; CORTEXA9-NEXT: vorr d0, d16, d16
+; CORTEXA9-NEXT: bx lr
+;
+; CORTEX-NOTHUMB-LABEL: ct_f64:
+; CORTEX-NOTHUMB: @ %bb.0: @ %entry
+; CORTEX-NOTHUMB-NEXT: and r0, r0, #1
+; CORTEX-NOTHUMB-NEXT: rsb r1, r0, #0
+; CORTEX-NOTHUMB-NEXT: vdup.32 d17, r1
+; CORTEX-NOTHUMB-NEXT: vand d16, d0, d17
+; CORTEX-NOTHUMB-NEXT: vbic d17, d1, d17
+; CORTEX-NOTHUMB-NEXT: vorr d16, d16, d17
+; CORTEX-NOTHUMB-NEXT: vorr d0, d16, d16
+; CORTEX-NOTHUMB-NEXT: bx lr
 entry:
   %sel = call double @llvm.ct.select.f64(i1 %cond, double %a, double %b)
   ret double %sel

From 1fdd126e024936b12f8d0b2d7b57ef9e686ed844 Mon Sep 17 00:00:00 2001
From: Kelly Kaoudis
Date: Wed, 17 Sep 2025 13:08:52 -0700
Subject: [PATCH 40/63] [CT] removes constant folding from our intrinsic

---
 clang/include/clang/Basic/Builtins.td | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clang/include/clang/Basic/Builtins.td b/clang/include/clang/Basic/Builtins.td
index 551e0cbfb3971..37e6250e7c587 100644
--- a/clang/include/clang/Basic/Builtins.td
+++ b/clang/include/clang/Basic/Builtins.td
@@ -5257,7 +5257,7 @@ def CountedByRef : Builtin {
 // Constant-time select builtin
 def CtSelect : Builtin {
   let Spellings = ["__builtin_ct_select"];
-  let Attributes = [NoThrow, Const, UnevaluatedArguments,
+  let Attributes = [NoThrow, UnevaluatedArguments,
                     ConstIgnoringExceptions, CustomTypeChecking];
   let Prototype = "void(...)";
 }
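As a rough illustration of what this attribute change is for (a hypothetical snippet, not code from the series; the function name and constants are made up): with `Const` removed, even a fully constant call like the one below is no longer a candidate for frontend constant folding, so it still reaches the backend as a real `llvm.ct.select` and is lowered branch-free.

    #include <stdint.h>

    /* Hypothetical caller; ct_demo and the constants are illustrative only. */
    static uint32_t ct_demo(void) {
      /* Every operand is a compile-time constant. With the Const attribute,
         the whole call could be folded to 0xAAAAAAAAu during semantic
         analysis; without it, the builtin survives to code generation. */
      return __builtin_ct_select(1, 0xAAAAAAAAu, 0x55555555u);
    }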
From 5732f375d137c14ccc82b3a2b96b9170db8daf7c Mon Sep 17 00:00:00 2001
From: Kelly Kaoudis
Date: Wed, 17 Sep 2025 13:09:31 -0700
Subject: [PATCH 41/63] [CT] do not allow duplication and do not touch memory

---
 llvm/include/llvm/IR/Intrinsics.td | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index 8c76ed010096b..32f8fce3f05d9 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -1818,7 +1818,7 @@ def int_coro_subfn_addr : DefaultAttrsIntrinsic<
 // Intrinsic to support constant time select
 def int_ct_select : DefaultAttrsIntrinsic<[llvm_any_ty],
     [llvm_i1_ty, LLVMMatchType<0>, LLVMMatchType<0>],
-    [IntrWriteMem, IntrWillReturn, NoUndef]>;
+    [IntrNoMem, IntrWillReturn, IntrNoDuplicate, NoUndef]>;
 
 ///===-------------------------- Other Intrinsics --------------------------===//
 //

From 4440482e486fee029da57cd4b40cd293d6b1a0ef Mon Sep 17 00:00:00 2001
From: kumarak
Date: Sun, 21 Sep 2025 14:56:03 +0000
Subject: [PATCH 42/63] [CT] fix codegen for thumb1 instructions

---
 llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp |  58 +-
 llvm/test/CodeGen/ARM/ctselect-half.ll   | 240 +++++----
 llvm/test/CodeGen/ARM/ctselect-vector.ll | 648 ++++++++++++-----------
 llvm/test/CodeGen/ARM/ctselect.ll        |  88 +--
 4 files changed, 559 insertions(+), 475 deletions(-)

diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
index 198daa292f786..01603e85de2c3 100644
--- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -1615,46 +1615,56 @@ bool ARMBaseInstrInfo::expandCtSelectThumb(MachineInstr &MI) const {
   Register Src2Reg = MI.getOperand(3).getReg();
   Register CondReg = MI.getOperand(4).getReg();
 
-  // The following sequence of steps yields: (src1 & mask) | (src2 & ~mask)
-  // 1. mask = -cond
-  BuildMI(*MBB, MI, DL, get(ARM::tRSB), MaskReg)
-      .add(t1CondCodeOp())
+  // Access register info to compute the shift amount for the mask.
+  MachineFunction *MF = MBB->getParent();
+  const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
+  MachineRegisterInfo &MRI = MF->getRegInfo();
+
+  unsigned RegSize = TRI->getRegSizeInBits(MaskReg, MRI);
+  unsigned ShiftAmount = RegSize - 1;
+
+  // 1. mask = -cond, built with a shift pair instead of RSB.
+  BuildMI(*MBB, MI, DL, get(ARM::tMOVr), MaskReg)
      .addReg(CondReg)
      .add(predOps(ARMCC::AL))
      .setMIFlag(MachineInstr::MIFlag::NoMerge);
-  BuildMI(*MBB, MI, DL, get(ARM::tMOVr), DestReg)
-      .addReg(Src1Reg)
+  // LSL then ASR by RegSize - 1 sign-extends the condition bit into a full mask, avoiding the flag modification an RSB would cause.
+  BuildMI(*MBB, MI, DL, get(ARM::tLSLri), MaskReg)
+      .addReg(MaskReg)
+      .addImm(ShiftAmount)
      .add(predOps(ARMCC::AL))
      .setMIFlag(MachineInstr::MIFlag::NoMerge);
-
-  // 2. A = src1 & mask
-  BuildMI(*MBB, MI, DL, get(ARM::tAND), DestReg)
-      .add(t1CondCodeOp())
-      .addReg(DestReg, RegState::Kill)
+
+  BuildMI(*MBB, MI, DL, get(ARM::tASRri), MaskReg)
      .addReg(MaskReg)
+      .addImm(ShiftAmount)
      .add(predOps(ARMCC::AL))
      .setMIFlag(MachineInstr::MIFlag::NoMerge);
 
-  // 3. B = src2 & ~mask
-  Register BICScratch = Src1Reg;
-  BuildMI(*MBB, MI, DL, get(ARM::tMOVr), BICScratch)
+  // 2. xor_diff = src1 ^ src2
+  BuildMI(*MBB, MI, DL, get(ARM::tMOVr), DestReg)
+      .addReg(Src1Reg)
+      .add(predOps(ARMCC::AL))
+      .setMIFlag(MachineInstr::MIFlag::NoMerge);
+
+  BuildMI(*MBB, MI, DL, get(ARM::tEOR), DestReg)
+      .addReg(DestReg)
      .addReg(Src2Reg)
      .add(predOps(ARMCC::AL))
      .setMIFlag(MachineInstr::MIFlag::NoMerge);
-
-  BuildMI(*MBB, MI, DL, get(ARM::tBIC), BICScratch)
-      .add(t1CondCodeOp())
-      .addReg(BICScratch, RegState::Kill)
-      .addReg(MaskReg)
+
+  // 3. masked_xor = xor_diff & mask
+  BuildMI(*MBB, MI, DL, get(ARM::tAND), DestReg)
+      .addReg(DestReg)
+      .addReg(MaskReg, RegState::Kill)
      .add(predOps(ARMCC::AL))
      .setMIFlag(MachineInstr::MIFlag::NoMerge);
 
-  // 4. result = A | B
-  BuildMI(*MBB, MI, DL, get(ARM::tORR), DestReg)
-      .add(t1CondCodeOp())
-      .addReg(DestReg, RegState::Kill)
-      .addReg(BICScratch)
+  // 4. result = src2 ^ masked_xor
+  BuildMI(*MBB, MI, DL, get(ARM::tEOR), DestReg)
+      .addReg(DestReg)
+      .addReg(Src2Reg)
      .add(predOps(ARMCC::AL))
      .setMIFlag(MachineInstr::MIFlag::NoMerge);
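In C, the selection identity this new expansion emits can be modeled as below (an illustrative sketch for 32-bit registers, not code from the patch; the function name is made up). Sign-extending the condition bit yields a full-width mask, and the two XORs select src1 when the mask is all ones and src2 when it is zero, matching the tMOVr/tLSLri/tASRri and tEOR/tAND/tEOR sequences above:

    #include <stdint.h>

    /* Sketch of expandCtSelectThumb's output for RegSize == 32. */
    static uint32_t ct_select_model(uint32_t cond, uint32_t src1, uint32_t src2) {
      /* (cond << 31) >> 31 sign-extends bit 0 into 0xFFFFFFFF or 0x0,
         the same mask the tLSLri/tASRri pair builds in MaskReg. */
      uint32_t mask = (uint32_t)((int32_t)(cond << 31) >> 31);
      /* src2 ^ ((src1 ^ src2) & mask) is src1 when mask is all ones
         and src2 when mask is zero; straight-line code, no branch. */
      return src2 ^ ((src1 ^ src2) & mask);
    }

Unlike the old AND/BIC/ORR form, the XOR form also needs no scratch register to hold src2 & ~mask.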
diff --git a/llvm/test/CodeGen/ARM/ctselect-half.ll b/llvm/test/CodeGen/ARM/ctselect-half.ll
index 40a5655be28e9..f75707fc91af3 100644
--- a/llvm/test/CodeGen/ARM/ctselect-half.ll
+++ b/llvm/test/CodeGen/ARM/ctselect-half.ll
@@ -39,12 +39,13 @@ define half @ct_half(i1 %cond, half %a, half %b) {
 ; THUMB1-NEXT: push {r4, lr}
 ; THUMB1-NEXT: movs r3, #1
 ; THUMB1-NEXT: ands r3, r0
-; THUMB1-NEXT: rsbs r4, r3, #0
+; THUMB1-NEXT: mov r4, r3
+; THUMB1-NEXT: lsls r4, r4, #31
+; THUMB1-NEXT: asrs r4, r4, #31
 ; THUMB1-NEXT: mov r0, r1
+; THUMB1-NEXT: eors r0, r2
 ; THUMB1-NEXT: ands r0, r4
-; THUMB1-NEXT: mov r1, r2
-; THUMB1-NEXT: bics r1, r4
-; THUMB1-NEXT: orrs r0, r1
+; THUMB1-NEXT: eors r0, r2
 ; THUMB1-NEXT: pop {r4, pc}
 ;
 ; THUMB2-LABEL: ct_half:
@@ -55,7 +56,6 @@ define half @ct_half(i1 %cond, half %a, half %b) {
 ; THUMB2-NEXT: bic.w r12, r2, r12
 ; THUMB2-NEXT: orr.w r0, r0, r12
 ; THUMB2-NEXT: bx lr
-; THUMB2-NOT: it{{[te]+}}
 entry:
   %sel = call half @llvm.ct.select.f16(i1 %cond, half %a, half %b)
   ret half %sel
@@ -100,12 +100,13 @@ define bfloat @ct_bf16(i1 %cond, bfloat %a, bfloat %b) {
 ; THUMB1-NEXT: push {r4, lr}
 ; THUMB1-NEXT: movs r3, #1
 ; THUMB1-NEXT: ands r3, r0
-; THUMB1-NEXT: rsbs r4, r3, #0
+; THUMB1-NEXT: mov r4, r3
+; THUMB1-NEXT: lsls r4, r4, #31
+; THUMB1-NEXT: asrs r4, r4, #31
 ; THUMB1-NEXT: mov r0, r1
+; THUMB1-NEXT: eors r0, r2
 ; THUMB1-NEXT: ands r0, r4
-; THUMB1-NEXT: mov r1, r2
-; THUMB1-NEXT: bics r1, r4
-; THUMB1-NEXT: orrs r0, r1
+; THUMB1-NEXT: eors r0, r2
 ; THUMB1-NEXT: pop {r4, pc}
 ;
 ; THUMB2-LABEL: ct_bf16:
@@ -116,7 +117,6 @@ define bfloat @ct_bf16(i1 %cond, bfloat %a, bfloat %b) {
 ; THUMB2-NEXT: bic.w r12, r2, r12
 ; THUMB2-NEXT: orr.w r0, r0, r12
 ; THUMB2-NEXT: bx lr
-; THUMB2-NOT: it{{[te]+}}
 entry:
   %sel = call bfloat @llvm.ct.select.bf16(i1 %cond, bfloat %a, bfloat %b)
   ret bfloat %sel
@@ -216,35 +216,39 @@ define <4 x half> @ct_v4f16(i1 %cond, <4 x half> %a, <4 x half> %b) {
 ; THUMB1-NEXT: movs r4, #1
 ; THUMB1-NEXT: ands r4, r0
 ; THUMB1-NEXT: ldr r1, [sp, #32]
-; THUMB1-NEXT: rsbs r5, r4, #0
+; THUMB1-NEXT: mov r5, r4
+; THUMB1-NEXT: lsls r5, r5, #31
+; THUMB1-NEXT: asrs r5, r5, #31
 ; THUMB1-NEXT: mov r0, r2
+; THUMB1-NEXT: eors r0, r1
 ; THUMB1-NEXT: ands r0, r5
-; THUMB1-NEXT: mov r2, r1
-; THUMB1-NEXT: bics r2, r5
-; THUMB1-NEXT: orrs r0, r2
+; THUMB1-NEXT: eors r0, r1
 ; THUMB1-NEXT: ldr r2, [sp, #36]
-; THUMB1-NEXT: rsbs r5, r4, #0
+; THUMB1-NEXT: mov r5, r4
+; THUMB1-NEXT: lsls r5, r5, #31
+; THUMB1-NEXT: asrs r5, r5, #31
 ; THUMB1-NEXT: mov r1, r3
+; THUMB1-NEXT: eors r1, r2
; THUMB1-NEXT: ands r1, r5 -; THUMB1-NEXT: mov r3, r2 -; THUMB1-NEXT: bics r3, r5 -; THUMB1-NEXT: orrs r1, r3 +; THUMB1-NEXT: eors r1, r2 ; THUMB1-NEXT: ldr r3, [sp, #40] ; THUMB1-NEXT: ldr r5, [sp, #24] -; THUMB1-NEXT: rsbs r6, r4, #0 +; THUMB1-NEXT: mov r6, r4 +; THUMB1-NEXT: lsls r6, r6, #31 +; THUMB1-NEXT: asrs r6, r6, #31 ; THUMB1-NEXT: mov r2, r5 +; THUMB1-NEXT: eors r2, r3 ; THUMB1-NEXT: ands r2, r6 -; THUMB1-NEXT: mov r5, r3 -; THUMB1-NEXT: bics r5, r6 -; THUMB1-NEXT: orrs r2, r5 +; THUMB1-NEXT: eors r2, r3 ; THUMB1-NEXT: ldr r5, [sp, #44] ; THUMB1-NEXT: ldr r6, [sp, #28] -; THUMB1-NEXT: rsbs r7, r4, #0 +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 ; THUMB1-NEXT: mov r3, r6 +; THUMB1-NEXT: eors r3, r5 ; THUMB1-NEXT: ands r3, r7 -; THUMB1-NEXT: mov r6, r5 -; THUMB1-NEXT: bics r6, r7 -; THUMB1-NEXT: orrs r3, r6 +; THUMB1-NEXT: eors r3, r5 ; THUMB1-NEXT: add sp, #4 ; THUMB1-NEXT: pop {r4, r5, r6, r7, pc} ; @@ -276,7 +280,6 @@ define <4 x half> @ct_v4f16(i1 %cond, <4 x half> %a, <4 x half> %b) { ; THUMB2-NEXT: bic.w r5, lr, r5 ; THUMB2-NEXT: orrs r3, r5 ; THUMB2-NEXT: pop {r4, r5, r7, pc} -; THUMB2-NOT: it{{[te]+}} entry: %sel = call <4 x half> @llvm.ct.select.v4f16(i1 %cond, <4 x half> %a, <4 x half> %b) ret <4 x half> %sel @@ -361,35 +364,39 @@ define <4 x bfloat> @ct_v4bf16(i1 %cond, <4 x bfloat> %a, <4 x bfloat> %b) { ; THUMB1-NEXT: movs r4, #1 ; THUMB1-NEXT: ands r4, r0 ; THUMB1-NEXT: ldr r1, [sp, #32] -; THUMB1-NEXT: rsbs r5, r4, #0 +; THUMB1-NEXT: mov r5, r4 +; THUMB1-NEXT: lsls r5, r5, #31 +; THUMB1-NEXT: asrs r5, r5, #31 ; THUMB1-NEXT: mov r0, r2 +; THUMB1-NEXT: eors r0, r1 ; THUMB1-NEXT: ands r0, r5 -; THUMB1-NEXT: mov r2, r1 -; THUMB1-NEXT: bics r2, r5 -; THUMB1-NEXT: orrs r0, r2 +; THUMB1-NEXT: eors r0, r1 ; THUMB1-NEXT: ldr r2, [sp, #36] -; THUMB1-NEXT: rsbs r5, r4, #0 +; THUMB1-NEXT: mov r5, r4 +; THUMB1-NEXT: lsls r5, r5, #31 +; THUMB1-NEXT: asrs r5, r5, #31 ; THUMB1-NEXT: mov r1, r3 +; THUMB1-NEXT: eors r1, r2 ; THUMB1-NEXT: ands r1, r5 -; THUMB1-NEXT: mov r3, r2 -; THUMB1-NEXT: bics r3, r5 -; THUMB1-NEXT: orrs r1, r3 +; THUMB1-NEXT: eors r1, r2 ; THUMB1-NEXT: ldr r3, [sp, #40] ; THUMB1-NEXT: ldr r5, [sp, #24] -; THUMB1-NEXT: rsbs r6, r4, #0 +; THUMB1-NEXT: mov r6, r4 +; THUMB1-NEXT: lsls r6, r6, #31 +; THUMB1-NEXT: asrs r6, r6, #31 ; THUMB1-NEXT: mov r2, r5 +; THUMB1-NEXT: eors r2, r3 ; THUMB1-NEXT: ands r2, r6 -; THUMB1-NEXT: mov r5, r3 -; THUMB1-NEXT: bics r5, r6 -; THUMB1-NEXT: orrs r2, r5 +; THUMB1-NEXT: eors r2, r3 ; THUMB1-NEXT: ldr r5, [sp, #44] ; THUMB1-NEXT: ldr r6, [sp, #28] -; THUMB1-NEXT: rsbs r7, r4, #0 +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 ; THUMB1-NEXT: mov r3, r6 +; THUMB1-NEXT: eors r3, r5 ; THUMB1-NEXT: ands r3, r7 -; THUMB1-NEXT: mov r6, r5 -; THUMB1-NEXT: bics r6, r7 -; THUMB1-NEXT: orrs r3, r6 +; THUMB1-NEXT: eors r3, r5 ; THUMB1-NEXT: add sp, #4 ; THUMB1-NEXT: pop {r4, r5, r6, r7, pc} ; @@ -421,7 +428,6 @@ define <4 x bfloat> @ct_v4bf16(i1 %cond, <4 x bfloat> %a, <4 x bfloat> %b) { ; THUMB2-NEXT: bic.w r5, lr, r5 ; THUMB2-NEXT: orrs r3, r5 ; THUMB2-NEXT: pop {r4, r5, r7, pc} -; THUMB2-NOT: it{{[te]+}} entry: %sel = call <4 x bfloat> @llvm.ct.select.v4bf16(i1 %cond, <4 x bfloat> %a, <4 x bfloat> %b) ret <4 x bfloat> %sel @@ -567,73 +573,81 @@ define <8 x half> @ct_v8f16(i1 %cond, <8 x half> %a, <8 x half> %b) { ; THUMB1-NEXT: ands r4, r1 ; THUMB1-NEXT: ldr r1, [sp, #76] ; THUMB1-NEXT: ldr r5, [sp, #44] -; THUMB1-NEXT: rsbs r7, r4, #0 +; 
THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 ; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: eors r6, r1 ; THUMB1-NEXT: ands r6, r7 -; THUMB1-NEXT: mov r5, r1 -; THUMB1-NEXT: bics r5, r7 -; THUMB1-NEXT: orrs r6, r5 +; THUMB1-NEXT: eors r6, r1 ; THUMB1-NEXT: strh r6, [r0, #14] ; THUMB1-NEXT: ldr r1, [sp, #72] ; THUMB1-NEXT: ldr r5, [sp, #40] -; THUMB1-NEXT: rsbs r7, r4, #0 +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 ; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: eors r6, r1 ; THUMB1-NEXT: ands r6, r7 -; THUMB1-NEXT: mov r5, r1 -; THUMB1-NEXT: bics r5, r7 -; THUMB1-NEXT: orrs r6, r5 +; THUMB1-NEXT: eors r6, r1 ; THUMB1-NEXT: strh r6, [r0, #12] ; THUMB1-NEXT: ldr r1, [sp, #68] ; THUMB1-NEXT: ldr r5, [sp, #36] -; THUMB1-NEXT: rsbs r7, r4, #0 +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 ; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: eors r6, r1 ; THUMB1-NEXT: ands r6, r7 -; THUMB1-NEXT: mov r5, r1 -; THUMB1-NEXT: bics r5, r7 -; THUMB1-NEXT: orrs r6, r5 +; THUMB1-NEXT: eors r6, r1 ; THUMB1-NEXT: strh r6, [r0, #10] ; THUMB1-NEXT: ldr r1, [sp, #64] ; THUMB1-NEXT: ldr r5, [sp, #32] -; THUMB1-NEXT: rsbs r7, r4, #0 +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 ; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: eors r6, r1 ; THUMB1-NEXT: ands r6, r7 -; THUMB1-NEXT: mov r5, r1 -; THUMB1-NEXT: bics r5, r7 -; THUMB1-NEXT: orrs r6, r5 +; THUMB1-NEXT: eors r6, r1 ; THUMB1-NEXT: strh r6, [r0, #8] ; THUMB1-NEXT: ldr r1, [sp, #60] ; THUMB1-NEXT: ldr r5, [sp, #28] -; THUMB1-NEXT: rsbs r7, r4, #0 +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 ; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: eors r6, r1 ; THUMB1-NEXT: ands r6, r7 -; THUMB1-NEXT: mov r5, r1 -; THUMB1-NEXT: bics r5, r7 -; THUMB1-NEXT: orrs r6, r5 +; THUMB1-NEXT: eors r6, r1 ; THUMB1-NEXT: strh r6, [r0, #6] ; THUMB1-NEXT: ldr r1, [sp, #56] ; THUMB1-NEXT: ldr r5, [sp, #24] -; THUMB1-NEXT: rsbs r7, r4, #0 +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 ; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: eors r6, r1 ; THUMB1-NEXT: ands r6, r7 -; THUMB1-NEXT: mov r5, r1 -; THUMB1-NEXT: bics r5, r7 -; THUMB1-NEXT: orrs r6, r5 +; THUMB1-NEXT: eors r6, r1 ; THUMB1-NEXT: strh r6, [r0, #4] ; THUMB1-NEXT: ldr r1, [sp, #52] -; THUMB1-NEXT: rsbs r6, r4, #0 +; THUMB1-NEXT: mov r6, r4 +; THUMB1-NEXT: lsls r6, r6, #31 +; THUMB1-NEXT: asrs r6, r6, #31 ; THUMB1-NEXT: mov r5, r3 +; THUMB1-NEXT: eors r5, r1 ; THUMB1-NEXT: ands r5, r6 -; THUMB1-NEXT: mov r3, r1 -; THUMB1-NEXT: bics r3, r6 -; THUMB1-NEXT: orrs r5, r3 +; THUMB1-NEXT: eors r5, r1 ; THUMB1-NEXT: strh r5, [r0, #2] ; THUMB1-NEXT: ldr r1, [sp, #48] -; THUMB1-NEXT: rsbs r5, r4, #0 +; THUMB1-NEXT: mov r5, r4 +; THUMB1-NEXT: lsls r5, r5, #31 +; THUMB1-NEXT: asrs r5, r5, #31 ; THUMB1-NEXT: mov r3, r2 +; THUMB1-NEXT: eors r3, r1 ; THUMB1-NEXT: ands r3, r5 -; THUMB1-NEXT: mov r2, r1 -; THUMB1-NEXT: bics r2, r5 -; THUMB1-NEXT: orrs r3, r2 +; THUMB1-NEXT: eors r3, r1 ; THUMB1-NEXT: strh r3, [r0] ; THUMB1-NEXT: add sp, #4 ; THUMB1-NEXT: pop {r4, r5, r6, r7, pc} @@ -698,7 +712,6 @@ define <8 x half> @ct_v8f16(i1 %cond, <8 x half> %a, <8 x half> %b) { ; THUMB2-NEXT: orrs r3, r5 ; THUMB2-NEXT: strh r3, [r0] ; THUMB2-NEXT: pop {r4, r5, r7, pc} -; THUMB2-NOT: it{{[te]+}} entry: %sel = call <8 x half> @llvm.ct.select.v8f16(i1 %cond, <8 x half> %a, <8 x half> %b) ret <8 x half> %sel @@ 
-817,73 +830,81 @@ define <8 x bfloat> @ct_v8bf16(i1 %cond, <8 x bfloat> %a, <8 x bfloat> %b) { ; THUMB1-NEXT: ands r4, r1 ; THUMB1-NEXT: ldr r1, [sp, #76] ; THUMB1-NEXT: ldr r5, [sp, #44] -; THUMB1-NEXT: rsbs r7, r4, #0 +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 ; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: eors r6, r1 ; THUMB1-NEXT: ands r6, r7 -; THUMB1-NEXT: mov r5, r1 -; THUMB1-NEXT: bics r5, r7 -; THUMB1-NEXT: orrs r6, r5 +; THUMB1-NEXT: eors r6, r1 ; THUMB1-NEXT: strh r6, [r0, #14] ; THUMB1-NEXT: ldr r1, [sp, #72] ; THUMB1-NEXT: ldr r5, [sp, #40] -; THUMB1-NEXT: rsbs r7, r4, #0 +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 ; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: eors r6, r1 ; THUMB1-NEXT: ands r6, r7 -; THUMB1-NEXT: mov r5, r1 -; THUMB1-NEXT: bics r5, r7 -; THUMB1-NEXT: orrs r6, r5 +; THUMB1-NEXT: eors r6, r1 ; THUMB1-NEXT: strh r6, [r0, #12] ; THUMB1-NEXT: ldr r1, [sp, #68] ; THUMB1-NEXT: ldr r5, [sp, #36] -; THUMB1-NEXT: rsbs r7, r4, #0 +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 ; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: eors r6, r1 ; THUMB1-NEXT: ands r6, r7 -; THUMB1-NEXT: mov r5, r1 -; THUMB1-NEXT: bics r5, r7 -; THUMB1-NEXT: orrs r6, r5 +; THUMB1-NEXT: eors r6, r1 ; THUMB1-NEXT: strh r6, [r0, #10] ; THUMB1-NEXT: ldr r1, [sp, #64] ; THUMB1-NEXT: ldr r5, [sp, #32] -; THUMB1-NEXT: rsbs r7, r4, #0 +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 ; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: eors r6, r1 ; THUMB1-NEXT: ands r6, r7 -; THUMB1-NEXT: mov r5, r1 -; THUMB1-NEXT: bics r5, r7 -; THUMB1-NEXT: orrs r6, r5 +; THUMB1-NEXT: eors r6, r1 ; THUMB1-NEXT: strh r6, [r0, #8] ; THUMB1-NEXT: ldr r1, [sp, #60] ; THUMB1-NEXT: ldr r5, [sp, #28] -; THUMB1-NEXT: rsbs r7, r4, #0 +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 ; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: eors r6, r1 ; THUMB1-NEXT: ands r6, r7 -; THUMB1-NEXT: mov r5, r1 -; THUMB1-NEXT: bics r5, r7 -; THUMB1-NEXT: orrs r6, r5 +; THUMB1-NEXT: eors r6, r1 ; THUMB1-NEXT: strh r6, [r0, #6] ; THUMB1-NEXT: ldr r1, [sp, #56] ; THUMB1-NEXT: ldr r5, [sp, #24] -; THUMB1-NEXT: rsbs r7, r4, #0 +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 ; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: eors r6, r1 ; THUMB1-NEXT: ands r6, r7 -; THUMB1-NEXT: mov r5, r1 -; THUMB1-NEXT: bics r5, r7 -; THUMB1-NEXT: orrs r6, r5 +; THUMB1-NEXT: eors r6, r1 ; THUMB1-NEXT: strh r6, [r0, #4] ; THUMB1-NEXT: ldr r1, [sp, #52] -; THUMB1-NEXT: rsbs r6, r4, #0 +; THUMB1-NEXT: mov r6, r4 +; THUMB1-NEXT: lsls r6, r6, #31 +; THUMB1-NEXT: asrs r6, r6, #31 ; THUMB1-NEXT: mov r5, r3 +; THUMB1-NEXT: eors r5, r1 ; THUMB1-NEXT: ands r5, r6 -; THUMB1-NEXT: mov r3, r1 -; THUMB1-NEXT: bics r3, r6 -; THUMB1-NEXT: orrs r5, r3 +; THUMB1-NEXT: eors r5, r1 ; THUMB1-NEXT: strh r5, [r0, #2] ; THUMB1-NEXT: ldr r1, [sp, #48] -; THUMB1-NEXT: rsbs r5, r4, #0 +; THUMB1-NEXT: mov r5, r4 +; THUMB1-NEXT: lsls r5, r5, #31 +; THUMB1-NEXT: asrs r5, r5, #31 ; THUMB1-NEXT: mov r3, r2 +; THUMB1-NEXT: eors r3, r1 ; THUMB1-NEXT: ands r3, r5 -; THUMB1-NEXT: mov r2, r1 -; THUMB1-NEXT: bics r2, r5 -; THUMB1-NEXT: orrs r3, r2 +; THUMB1-NEXT: eors r3, r1 ; THUMB1-NEXT: strh r3, [r0] ; THUMB1-NEXT: add sp, #4 ; THUMB1-NEXT: pop {r4, r5, r6, r7, pc} @@ -948,7 +969,6 @@ define <8 x bfloat> @ct_v8bf16(i1 %cond, <8 x bfloat> %a, <8 x bfloat> %b) { ; 
THUMB2-NEXT: orrs r3, r5 ; THUMB2-NEXT: strh r3, [r0] ; THUMB2-NEXT: pop {r4, r5, r7, pc} -; THUMB2-NOT: it{{[te]+}} entry: %sel = call <8 x bfloat> @llvm.ct.select.v8bf16(i1 %cond, <8 x bfloat> %a, <8 x bfloat> %b) ret <8 x bfloat> %sel diff --git a/llvm/test/CodeGen/ARM/ctselect-vector.ll b/llvm/test/CodeGen/ARM/ctselect-vector.ll index 946c10b033fd9..c410f78b24c0e 100644 --- a/llvm/test/CodeGen/ARM/ctselect-vector.ll +++ b/llvm/test/CodeGen/ARM/ctselect-vector.ll @@ -88,73 +88,81 @@ define <8 x i8> @ct_v8i8(i1 %cond, <8 x i8> %a, <8 x i8> %b) { ; THUMB1-NEXT: ands r4, r1 ; THUMB1-NEXT: ldr r1, [sp, #76] ; THUMB1-NEXT: ldr r5, [sp, #44] -; THUMB1-NEXT: rsbs r7, r4, #0 +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 ; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: eors r6, r1 ; THUMB1-NEXT: ands r6, r7 -; THUMB1-NEXT: mov r5, r1 -; THUMB1-NEXT: bics r5, r7 -; THUMB1-NEXT: orrs r6, r5 +; THUMB1-NEXT: eors r6, r1 ; THUMB1-NEXT: strb r6, [r0, #7] ; THUMB1-NEXT: ldr r1, [sp, #72] ; THUMB1-NEXT: ldr r5, [sp, #40] -; THUMB1-NEXT: rsbs r7, r4, #0 +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 ; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: eors r6, r1 ; THUMB1-NEXT: ands r6, r7 -; THUMB1-NEXT: mov r5, r1 -; THUMB1-NEXT: bics r5, r7 -; THUMB1-NEXT: orrs r6, r5 +; THUMB1-NEXT: eors r6, r1 ; THUMB1-NEXT: strb r6, [r0, #6] ; THUMB1-NEXT: ldr r1, [sp, #68] ; THUMB1-NEXT: ldr r5, [sp, #36] -; THUMB1-NEXT: rsbs r7, r4, #0 +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 ; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: eors r6, r1 ; THUMB1-NEXT: ands r6, r7 -; THUMB1-NEXT: mov r5, r1 -; THUMB1-NEXT: bics r5, r7 -; THUMB1-NEXT: orrs r6, r5 +; THUMB1-NEXT: eors r6, r1 ; THUMB1-NEXT: strb r6, [r0, #5] ; THUMB1-NEXT: ldr r1, [sp, #64] ; THUMB1-NEXT: ldr r5, [sp, #32] -; THUMB1-NEXT: rsbs r7, r4, #0 +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 ; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: eors r6, r1 ; THUMB1-NEXT: ands r6, r7 -; THUMB1-NEXT: mov r5, r1 -; THUMB1-NEXT: bics r5, r7 -; THUMB1-NEXT: orrs r6, r5 +; THUMB1-NEXT: eors r6, r1 ; THUMB1-NEXT: strb r6, [r0, #4] ; THUMB1-NEXT: ldr r1, [sp, #60] ; THUMB1-NEXT: ldr r5, [sp, #28] -; THUMB1-NEXT: rsbs r7, r4, #0 +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 ; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: eors r6, r1 ; THUMB1-NEXT: ands r6, r7 -; THUMB1-NEXT: mov r5, r1 -; THUMB1-NEXT: bics r5, r7 -; THUMB1-NEXT: orrs r6, r5 +; THUMB1-NEXT: eors r6, r1 ; THUMB1-NEXT: strb r6, [r0, #3] ; THUMB1-NEXT: ldr r1, [sp, #56] ; THUMB1-NEXT: ldr r5, [sp, #24] -; THUMB1-NEXT: rsbs r7, r4, #0 +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 ; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: eors r6, r1 ; THUMB1-NEXT: ands r6, r7 -; THUMB1-NEXT: mov r5, r1 -; THUMB1-NEXT: bics r5, r7 -; THUMB1-NEXT: orrs r6, r5 +; THUMB1-NEXT: eors r6, r1 ; THUMB1-NEXT: strb r6, [r0, #2] ; THUMB1-NEXT: ldr r1, [sp, #52] -; THUMB1-NEXT: rsbs r6, r4, #0 +; THUMB1-NEXT: mov r6, r4 +; THUMB1-NEXT: lsls r6, r6, #31 +; THUMB1-NEXT: asrs r6, r6, #31 ; THUMB1-NEXT: mov r5, r3 +; THUMB1-NEXT: eors r5, r1 ; THUMB1-NEXT: ands r5, r6 -; THUMB1-NEXT: mov r3, r1 -; THUMB1-NEXT: bics r3, r6 -; THUMB1-NEXT: orrs r5, r3 +; THUMB1-NEXT: eors r5, r1 ; THUMB1-NEXT: strb r5, [r0, #1] ; THUMB1-NEXT: ldr r1, [sp, #48] -; THUMB1-NEXT: rsbs r5, r4, #0 +; THUMB1-NEXT: mov r5, r4 +; 
THUMB1-NEXT: lsls r5, r5, #31 +; THUMB1-NEXT: asrs r5, r5, #31 ; THUMB1-NEXT: mov r3, r2 +; THUMB1-NEXT: eors r3, r1 ; THUMB1-NEXT: ands r3, r5 -; THUMB1-NEXT: mov r2, r1 -; THUMB1-NEXT: bics r2, r5 -; THUMB1-NEXT: orrs r3, r2 +; THUMB1-NEXT: eors r3, r1 ; THUMB1-NEXT: strb r3, [r0] ; THUMB1-NEXT: add sp, #4 ; THUMB1-NEXT: pop {r4, r5, r6, r7, pc} @@ -219,7 +227,6 @@ define <8 x i8> @ct_v8i8(i1 %cond, <8 x i8> %a, <8 x i8> %b) { ; THUMB2-NEXT: orrs r3, r5 ; THUMB2-NEXT: strb r3, [r0] ; THUMB2-NEXT: pop {r4, r5, r7, pc} -; THUMB2-NOT: it{{[te]+}} entry: %sel = call <8 x i8> @llvm.ct.select.v8i8(i1 %cond, <8 x i8> %a, <8 x i8> %b) ret <8 x i8> %sel @@ -276,35 +283,39 @@ define <4 x i16> @ct_v4i16(i1 %cond, <4 x i16> %a, <4 x i16> %b) { ; THUMB1-NEXT: movs r4, #1 ; THUMB1-NEXT: ands r4, r0 ; THUMB1-NEXT: ldr r1, [sp, #32] -; THUMB1-NEXT: rsbs r5, r4, #0 +; THUMB1-NEXT: mov r5, r4 +; THUMB1-NEXT: lsls r5, r5, #31 +; THUMB1-NEXT: asrs r5, r5, #31 ; THUMB1-NEXT: mov r0, r2 +; THUMB1-NEXT: eors r0, r1 ; THUMB1-NEXT: ands r0, r5 -; THUMB1-NEXT: mov r2, r1 -; THUMB1-NEXT: bics r2, r5 -; THUMB1-NEXT: orrs r0, r2 +; THUMB1-NEXT: eors r0, r1 ; THUMB1-NEXT: ldr r2, [sp, #36] -; THUMB1-NEXT: rsbs r5, r4, #0 +; THUMB1-NEXT: mov r5, r4 +; THUMB1-NEXT: lsls r5, r5, #31 +; THUMB1-NEXT: asrs r5, r5, #31 ; THUMB1-NEXT: mov r1, r3 +; THUMB1-NEXT: eors r1, r2 ; THUMB1-NEXT: ands r1, r5 -; THUMB1-NEXT: mov r3, r2 -; THUMB1-NEXT: bics r3, r5 -; THUMB1-NEXT: orrs r1, r3 +; THUMB1-NEXT: eors r1, r2 ; THUMB1-NEXT: ldr r3, [sp, #40] ; THUMB1-NEXT: ldr r5, [sp, #24] -; THUMB1-NEXT: rsbs r6, r4, #0 +; THUMB1-NEXT: mov r6, r4 +; THUMB1-NEXT: lsls r6, r6, #31 +; THUMB1-NEXT: asrs r6, r6, #31 ; THUMB1-NEXT: mov r2, r5 +; THUMB1-NEXT: eors r2, r3 ; THUMB1-NEXT: ands r2, r6 -; THUMB1-NEXT: mov r5, r3 -; THUMB1-NEXT: bics r5, r6 -; THUMB1-NEXT: orrs r2, r5 +; THUMB1-NEXT: eors r2, r3 ; THUMB1-NEXT: ldr r5, [sp, #44] ; THUMB1-NEXT: ldr r6, [sp, #28] -; THUMB1-NEXT: rsbs r7, r4, #0 +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 ; THUMB1-NEXT: mov r3, r6 +; THUMB1-NEXT: eors r3, r5 ; THUMB1-NEXT: ands r3, r7 -; THUMB1-NEXT: mov r6, r5 -; THUMB1-NEXT: bics r6, r7 -; THUMB1-NEXT: orrs r3, r6 +; THUMB1-NEXT: eors r3, r5 ; THUMB1-NEXT: add sp, #4 ; THUMB1-NEXT: pop {r4, r5, r6, r7, pc} ; @@ -336,7 +347,6 @@ define <4 x i16> @ct_v4i16(i1 %cond, <4 x i16> %a, <4 x i16> %b) { ; THUMB2-NEXT: bic.w r5, lr, r5 ; THUMB2-NEXT: orrs r3, r5 ; THUMB2-NEXT: pop {r4, r5, r7, pc} -; THUMB2-NOT: it{{[te]+}} entry: %sel = call <4 x i16> @llvm.ct.select.v4i16(i1 %cond, <4 x i16> %a, <4 x i16> %b) ret <4 x i16> %sel @@ -379,19 +389,21 @@ define <2 x i32> @ct_v2i32(i1 %cond, <2 x i32> %a, <2 x i32> %b) { ; THUMB1-NEXT: movs r4, #1 ; THUMB1-NEXT: ands r4, r0 ; THUMB1-NEXT: ldr r1, [sp, #16] -; THUMB1-NEXT: rsbs r5, r4, #0 +; THUMB1-NEXT: mov r5, r4 +; THUMB1-NEXT: lsls r5, r5, #31 +; THUMB1-NEXT: asrs r5, r5, #31 ; THUMB1-NEXT: mov r0, r2 +; THUMB1-NEXT: eors r0, r1 ; THUMB1-NEXT: ands r0, r5 -; THUMB1-NEXT: mov r2, r1 -; THUMB1-NEXT: bics r2, r5 -; THUMB1-NEXT: orrs r0, r2 +; THUMB1-NEXT: eors r0, r1 ; THUMB1-NEXT: ldr r2, [sp, #20] -; THUMB1-NEXT: rsbs r5, r4, #0 +; THUMB1-NEXT: mov r5, r4 +; THUMB1-NEXT: lsls r5, r5, #31 +; THUMB1-NEXT: asrs r5, r5, #31 ; THUMB1-NEXT: mov r1, r3 +; THUMB1-NEXT: eors r1, r2 ; THUMB1-NEXT: ands r1, r5 -; THUMB1-NEXT: mov r3, r2 -; THUMB1-NEXT: bics r3, r5 -; THUMB1-NEXT: orrs r1, r3 +; THUMB1-NEXT: eors r1, r2 ; THUMB1-NEXT: pop {r4, r5, r7, pc} ; ; THUMB2-LABEL: 
ct_v2i32: @@ -410,7 +422,6 @@ define <2 x i32> @ct_v2i32(i1 %cond, <2 x i32> %a, <2 x i32> %b) { ; THUMB2-NEXT: bic.w lr, r2, lr ; THUMB2-NEXT: orr.w r1, r1, lr ; THUMB2-NEXT: pop {r7, pc} -; THUMB2-NOT: it{{[te]+}} entry: %sel = call <2 x i32> @llvm.ct.select.v2i32(i1 %cond, <2 x i32> %a, <2 x i32> %b) ret <2 x i32> %sel @@ -453,19 +464,21 @@ define <1 x i64> @ct_v1i64(i1 %cond, <1 x i64> %a, <1 x i64> %b) { ; THUMB1-NEXT: movs r4, #1 ; THUMB1-NEXT: ands r4, r0 ; THUMB1-NEXT: ldr r1, [sp, #16] -; THUMB1-NEXT: rsbs r5, r4, #0 +; THUMB1-NEXT: mov r5, r4 +; THUMB1-NEXT: lsls r5, r5, #31 +; THUMB1-NEXT: asrs r5, r5, #31 ; THUMB1-NEXT: mov r0, r2 +; THUMB1-NEXT: eors r0, r1 ; THUMB1-NEXT: ands r0, r5 -; THUMB1-NEXT: mov r2, r1 -; THUMB1-NEXT: bics r2, r5 -; THUMB1-NEXT: orrs r0, r2 +; THUMB1-NEXT: eors r0, r1 ; THUMB1-NEXT: ldr r2, [sp, #20] -; THUMB1-NEXT: rsbs r5, r4, #0 +; THUMB1-NEXT: mov r5, r4 +; THUMB1-NEXT: lsls r5, r5, #31 +; THUMB1-NEXT: asrs r5, r5, #31 ; THUMB1-NEXT: mov r1, r3 +; THUMB1-NEXT: eors r1, r2 ; THUMB1-NEXT: ands r1, r5 -; THUMB1-NEXT: mov r3, r2 -; THUMB1-NEXT: bics r3, r5 -; THUMB1-NEXT: orrs r1, r3 +; THUMB1-NEXT: eors r1, r2 ; THUMB1-NEXT: pop {r4, r5, r7, pc} ; ; THUMB2-LABEL: ct_v1i64: @@ -484,7 +497,6 @@ define <1 x i64> @ct_v1i64(i1 %cond, <1 x i64> %a, <1 x i64> %b) { ; THUMB2-NEXT: bic.w lr, r2, lr ; THUMB2-NEXT: orr.w r1, r1, lr ; THUMB2-NEXT: pop {r7, pc} -; THUMB2-NOT: it{{[te]+}} entry: %sel = call <1 x i64> @llvm.ct.select.v1i64(i1 %cond, <1 x i64> %a, <1 x i64> %b) ret <1 x i64> %sel @@ -527,19 +539,21 @@ define <2 x float> @ct_v2f32(i1 %cond, <2 x float> %a, <2 x float> %b) { ; THUMB1-NEXT: movs r4, #1 ; THUMB1-NEXT: ands r4, r0 ; THUMB1-NEXT: ldr r1, [sp, #16] -; THUMB1-NEXT: rsbs r5, r4, #0 +; THUMB1-NEXT: mov r5, r4 +; THUMB1-NEXT: lsls r5, r5, #31 +; THUMB1-NEXT: asrs r5, r5, #31 ; THUMB1-NEXT: mov r0, r2 +; THUMB1-NEXT: eors r0, r1 ; THUMB1-NEXT: ands r0, r5 -; THUMB1-NEXT: mov r2, r1 -; THUMB1-NEXT: bics r2, r5 -; THUMB1-NEXT: orrs r0, r2 +; THUMB1-NEXT: eors r0, r1 ; THUMB1-NEXT: ldr r2, [sp, #20] -; THUMB1-NEXT: rsbs r5, r4, #0 +; THUMB1-NEXT: mov r5, r4 +; THUMB1-NEXT: lsls r5, r5, #31 +; THUMB1-NEXT: asrs r5, r5, #31 ; THUMB1-NEXT: mov r1, r3 +; THUMB1-NEXT: eors r1, r2 ; THUMB1-NEXT: ands r1, r5 -; THUMB1-NEXT: mov r3, r2 -; THUMB1-NEXT: bics r3, r5 -; THUMB1-NEXT: orrs r1, r3 +; THUMB1-NEXT: eors r1, r2 ; THUMB1-NEXT: pop {r4, r5, r7, pc} ; ; THUMB2-LABEL: ct_v2f32: @@ -558,7 +572,6 @@ define <2 x float> @ct_v2f32(i1 %cond, <2 x float> %a, <2 x float> %b) { ; THUMB2-NEXT: bic.w lr, r2, lr ; THUMB2-NEXT: orr.w r1, r1, lr ; THUMB2-NEXT: pop {r7, pc} -; THUMB2-NOT: it{{[te]+}} entry: %sel = call <2 x float> @llvm.ct.select.v2f32(i1 %cond, <2 x float> %a, <2 x float> %b) ret <2 x float> %sel @@ -707,145 +720,161 @@ define <16 x i8> @ct_v16i8(i1 %cond, <16 x i8> %a, <16 x i8> %b) { ; THUMB1-NEXT: ands r4, r1 ; THUMB1-NEXT: ldr r1, [sp, #140] ; THUMB1-NEXT: ldr r5, [sp, #76] -; THUMB1-NEXT: rsbs r7, r4, #0 +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 ; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: eors r6, r1 ; THUMB1-NEXT: ands r6, r7 -; THUMB1-NEXT: mov r5, r1 -; THUMB1-NEXT: bics r5, r7 -; THUMB1-NEXT: orrs r6, r5 +; THUMB1-NEXT: eors r6, r1 ; THUMB1-NEXT: strb r6, [r0, #15] ; THUMB1-NEXT: ldr r1, [sp, #136] ; THUMB1-NEXT: ldr r5, [sp, #72] -; THUMB1-NEXT: rsbs r7, r4, #0 +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 ; THUMB1-NEXT: mov r6, r5 +; 
THUMB1-NEXT: eors r6, r1 ; THUMB1-NEXT: ands r6, r7 -; THUMB1-NEXT: mov r5, r1 -; THUMB1-NEXT: bics r5, r7 -; THUMB1-NEXT: orrs r6, r5 +; THUMB1-NEXT: eors r6, r1 ; THUMB1-NEXT: strb r6, [r0, #14] ; THUMB1-NEXT: ldr r1, [sp, #132] ; THUMB1-NEXT: ldr r5, [sp, #68] -; THUMB1-NEXT: rsbs r7, r4, #0 +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 ; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: eors r6, r1 ; THUMB1-NEXT: ands r6, r7 -; THUMB1-NEXT: mov r5, r1 -; THUMB1-NEXT: bics r5, r7 -; THUMB1-NEXT: orrs r6, r5 +; THUMB1-NEXT: eors r6, r1 ; THUMB1-NEXT: strb r6, [r0, #13] ; THUMB1-NEXT: ldr r1, [sp, #128] ; THUMB1-NEXT: ldr r5, [sp, #64] -; THUMB1-NEXT: rsbs r7, r4, #0 +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 ; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: eors r6, r1 ; THUMB1-NEXT: ands r6, r7 -; THUMB1-NEXT: mov r5, r1 -; THUMB1-NEXT: bics r5, r7 -; THUMB1-NEXT: orrs r6, r5 +; THUMB1-NEXT: eors r6, r1 ; THUMB1-NEXT: strb r6, [r0, #12] ; THUMB1-NEXT: ldr r1, [sp, #124] ; THUMB1-NEXT: ldr r5, [sp, #60] -; THUMB1-NEXT: rsbs r7, r4, #0 +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 ; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: eors r6, r1 ; THUMB1-NEXT: ands r6, r7 -; THUMB1-NEXT: mov r5, r1 -; THUMB1-NEXT: bics r5, r7 -; THUMB1-NEXT: orrs r6, r5 +; THUMB1-NEXT: eors r6, r1 ; THUMB1-NEXT: strb r6, [r0, #11] ; THUMB1-NEXT: ldr r1, [sp, #120] ; THUMB1-NEXT: ldr r5, [sp, #56] -; THUMB1-NEXT: rsbs r7, r4, #0 +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 ; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: eors r6, r1 ; THUMB1-NEXT: ands r6, r7 -; THUMB1-NEXT: mov r5, r1 -; THUMB1-NEXT: bics r5, r7 -; THUMB1-NEXT: orrs r6, r5 +; THUMB1-NEXT: eors r6, r1 ; THUMB1-NEXT: strb r6, [r0, #10] ; THUMB1-NEXT: ldr r1, [sp, #116] ; THUMB1-NEXT: ldr r5, [sp, #52] -; THUMB1-NEXT: rsbs r7, r4, #0 +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 ; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: eors r6, r1 ; THUMB1-NEXT: ands r6, r7 -; THUMB1-NEXT: mov r5, r1 -; THUMB1-NEXT: bics r5, r7 -; THUMB1-NEXT: orrs r6, r5 +; THUMB1-NEXT: eors r6, r1 ; THUMB1-NEXT: strb r6, [r0, #9] ; THUMB1-NEXT: ldr r1, [sp, #112] ; THUMB1-NEXT: ldr r5, [sp, #48] -; THUMB1-NEXT: rsbs r7, r4, #0 +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 ; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: eors r6, r1 ; THUMB1-NEXT: ands r6, r7 -; THUMB1-NEXT: mov r5, r1 -; THUMB1-NEXT: bics r5, r7 -; THUMB1-NEXT: orrs r6, r5 +; THUMB1-NEXT: eors r6, r1 ; THUMB1-NEXT: strb r6, [r0, #8] ; THUMB1-NEXT: ldr r1, [sp, #108] ; THUMB1-NEXT: ldr r5, [sp, #44] -; THUMB1-NEXT: rsbs r7, r4, #0 +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 ; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: eors r6, r1 ; THUMB1-NEXT: ands r6, r7 -; THUMB1-NEXT: mov r5, r1 -; THUMB1-NEXT: bics r5, r7 -; THUMB1-NEXT: orrs r6, r5 +; THUMB1-NEXT: eors r6, r1 ; THUMB1-NEXT: strb r6, [r0, #7] ; THUMB1-NEXT: ldr r1, [sp, #104] ; THUMB1-NEXT: ldr r5, [sp, #40] -; THUMB1-NEXT: rsbs r7, r4, #0 +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 ; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: eors r6, r1 ; THUMB1-NEXT: ands r6, r7 -; THUMB1-NEXT: mov r5, r1 -; THUMB1-NEXT: bics r5, r7 -; THUMB1-NEXT: orrs r6, r5 +; THUMB1-NEXT: eors r6, r1 ; THUMB1-NEXT: strb r6, [r0, #6] ; THUMB1-NEXT: 
ldr r1, [sp, #100] ; THUMB1-NEXT: ldr r5, [sp, #36] -; THUMB1-NEXT: rsbs r7, r4, #0 +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 ; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: eors r6, r1 ; THUMB1-NEXT: ands r6, r7 -; THUMB1-NEXT: mov r5, r1 -; THUMB1-NEXT: bics r5, r7 -; THUMB1-NEXT: orrs r6, r5 +; THUMB1-NEXT: eors r6, r1 ; THUMB1-NEXT: strb r6, [r0, #5] ; THUMB1-NEXT: ldr r1, [sp, #96] ; THUMB1-NEXT: ldr r5, [sp, #32] -; THUMB1-NEXT: rsbs r7, r4, #0 +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 ; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: eors r6, r1 ; THUMB1-NEXT: ands r6, r7 -; THUMB1-NEXT: mov r5, r1 -; THUMB1-NEXT: bics r5, r7 -; THUMB1-NEXT: orrs r6, r5 +; THUMB1-NEXT: eors r6, r1 ; THUMB1-NEXT: strb r6, [r0, #4] ; THUMB1-NEXT: ldr r1, [sp, #92] ; THUMB1-NEXT: ldr r5, [sp, #28] -; THUMB1-NEXT: rsbs r7, r4, #0 +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 ; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: eors r6, r1 ; THUMB1-NEXT: ands r6, r7 -; THUMB1-NEXT: mov r5, r1 -; THUMB1-NEXT: bics r5, r7 -; THUMB1-NEXT: orrs r6, r5 +; THUMB1-NEXT: eors r6, r1 ; THUMB1-NEXT: strb r6, [r0, #3] ; THUMB1-NEXT: ldr r1, [sp, #88] ; THUMB1-NEXT: ldr r5, [sp, #24] -; THUMB1-NEXT: rsbs r7, r4, #0 +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 ; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: eors r6, r1 ; THUMB1-NEXT: ands r6, r7 -; THUMB1-NEXT: mov r5, r1 -; THUMB1-NEXT: bics r5, r7 -; THUMB1-NEXT: orrs r6, r5 +; THUMB1-NEXT: eors r6, r1 ; THUMB1-NEXT: strb r6, [r0, #2] ; THUMB1-NEXT: ldr r1, [sp, #84] -; THUMB1-NEXT: rsbs r6, r4, #0 +; THUMB1-NEXT: mov r6, r4 +; THUMB1-NEXT: lsls r6, r6, #31 +; THUMB1-NEXT: asrs r6, r6, #31 ; THUMB1-NEXT: mov r5, r3 +; THUMB1-NEXT: eors r5, r1 ; THUMB1-NEXT: ands r5, r6 -; THUMB1-NEXT: mov r3, r1 -; THUMB1-NEXT: bics r3, r6 -; THUMB1-NEXT: orrs r5, r3 +; THUMB1-NEXT: eors r5, r1 ; THUMB1-NEXT: strb r5, [r0, #1] ; THUMB1-NEXT: ldr r1, [sp, #80] -; THUMB1-NEXT: rsbs r5, r4, #0 +; THUMB1-NEXT: mov r5, r4 +; THUMB1-NEXT: lsls r5, r5, #31 +; THUMB1-NEXT: asrs r5, r5, #31 ; THUMB1-NEXT: mov r3, r2 +; THUMB1-NEXT: eors r3, r1 ; THUMB1-NEXT: ands r3, r5 -; THUMB1-NEXT: mov r2, r1 -; THUMB1-NEXT: bics r2, r5 -; THUMB1-NEXT: orrs r3, r2 +; THUMB1-NEXT: eors r3, r1 ; THUMB1-NEXT: strb r3, [r0] ; THUMB1-NEXT: add sp, #4 ; THUMB1-NEXT: pop {r4, r5, r6, r7, pc} @@ -966,7 +995,6 @@ define <16 x i8> @ct_v16i8(i1 %cond, <16 x i8> %a, <16 x i8> %b) { ; THUMB2-NEXT: orrs r3, r5 ; THUMB2-NEXT: strb r3, [r0] ; THUMB2-NEXT: pop {r4, r5, r7, pc} -; THUMB2-NOT: it{{[te]+}} entry: %sel = call <16 x i8> @llvm.ct.select.v16i8(i1 %cond, <16 x i8> %a, <16 x i8> %b) ret <16 x i8> %sel @@ -1059,73 +1087,81 @@ define <8 x i16> @ct_v8i16(i1 %cond, <8 x i16> %a, <8 x i16> %b) { ; THUMB1-NEXT: ands r4, r1 ; THUMB1-NEXT: ldr r1, [sp, #76] ; THUMB1-NEXT: ldr r5, [sp, #44] -; THUMB1-NEXT: rsbs r7, r4, #0 +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 ; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: eors r6, r1 ; THUMB1-NEXT: ands r6, r7 -; THUMB1-NEXT: mov r5, r1 -; THUMB1-NEXT: bics r5, r7 -; THUMB1-NEXT: orrs r6, r5 +; THUMB1-NEXT: eors r6, r1 ; THUMB1-NEXT: strh r6, [r0, #14] ; THUMB1-NEXT: ldr r1, [sp, #72] ; THUMB1-NEXT: ldr r5, [sp, #40] -; THUMB1-NEXT: rsbs r7, r4, #0 +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 ; THUMB1-NEXT: mov r6, r5 +; 
THUMB1-NEXT: eors r6, r1 ; THUMB1-NEXT: ands r6, r7 -; THUMB1-NEXT: mov r5, r1 -; THUMB1-NEXT: bics r5, r7 -; THUMB1-NEXT: orrs r6, r5 +; THUMB1-NEXT: eors r6, r1 ; THUMB1-NEXT: strh r6, [r0, #12] ; THUMB1-NEXT: ldr r1, [sp, #68] ; THUMB1-NEXT: ldr r5, [sp, #36] -; THUMB1-NEXT: rsbs r7, r4, #0 +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 ; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: eors r6, r1 ; THUMB1-NEXT: ands r6, r7 -; THUMB1-NEXT: mov r5, r1 -; THUMB1-NEXT: bics r5, r7 -; THUMB1-NEXT: orrs r6, r5 +; THUMB1-NEXT: eors r6, r1 ; THUMB1-NEXT: strh r6, [r0, #10] ; THUMB1-NEXT: ldr r1, [sp, #64] ; THUMB1-NEXT: ldr r5, [sp, #32] -; THUMB1-NEXT: rsbs r7, r4, #0 +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 ; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: eors r6, r1 ; THUMB1-NEXT: ands r6, r7 -; THUMB1-NEXT: mov r5, r1 -; THUMB1-NEXT: bics r5, r7 -; THUMB1-NEXT: orrs r6, r5 +; THUMB1-NEXT: eors r6, r1 ; THUMB1-NEXT: strh r6, [r0, #8] ; THUMB1-NEXT: ldr r1, [sp, #60] ; THUMB1-NEXT: ldr r5, [sp, #28] -; THUMB1-NEXT: rsbs r7, r4, #0 +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 ; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: eors r6, r1 ; THUMB1-NEXT: ands r6, r7 -; THUMB1-NEXT: mov r5, r1 -; THUMB1-NEXT: bics r5, r7 -; THUMB1-NEXT: orrs r6, r5 +; THUMB1-NEXT: eors r6, r1 ; THUMB1-NEXT: strh r6, [r0, #6] ; THUMB1-NEXT: ldr r1, [sp, #56] ; THUMB1-NEXT: ldr r5, [sp, #24] -; THUMB1-NEXT: rsbs r7, r4, #0 +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 ; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: eors r6, r1 ; THUMB1-NEXT: ands r6, r7 -; THUMB1-NEXT: mov r5, r1 -; THUMB1-NEXT: bics r5, r7 -; THUMB1-NEXT: orrs r6, r5 +; THUMB1-NEXT: eors r6, r1 ; THUMB1-NEXT: strh r6, [r0, #4] ; THUMB1-NEXT: ldr r1, [sp, #52] -; THUMB1-NEXT: rsbs r6, r4, #0 +; THUMB1-NEXT: mov r6, r4 +; THUMB1-NEXT: lsls r6, r6, #31 +; THUMB1-NEXT: asrs r6, r6, #31 ; THUMB1-NEXT: mov r5, r3 +; THUMB1-NEXT: eors r5, r1 ; THUMB1-NEXT: ands r5, r6 -; THUMB1-NEXT: mov r3, r1 -; THUMB1-NEXT: bics r3, r6 -; THUMB1-NEXT: orrs r5, r3 +; THUMB1-NEXT: eors r5, r1 ; THUMB1-NEXT: strh r5, [r0, #2] ; THUMB1-NEXT: ldr r1, [sp, #48] -; THUMB1-NEXT: rsbs r5, r4, #0 +; THUMB1-NEXT: mov r5, r4 +; THUMB1-NEXT: lsls r5, r5, #31 +; THUMB1-NEXT: asrs r5, r5, #31 ; THUMB1-NEXT: mov r3, r2 +; THUMB1-NEXT: eors r3, r1 ; THUMB1-NEXT: ands r3, r5 -; THUMB1-NEXT: mov r2, r1 -; THUMB1-NEXT: bics r2, r5 -; THUMB1-NEXT: orrs r3, r2 +; THUMB1-NEXT: eors r3, r1 ; THUMB1-NEXT: strh r3, [r0] ; THUMB1-NEXT: add sp, #4 ; THUMB1-NEXT: pop {r4, r5, r6, r7, pc} @@ -1190,7 +1226,6 @@ define <8 x i16> @ct_v8i16(i1 %cond, <8 x i16> %a, <8 x i16> %b) { ; THUMB2-NEXT: orrs r3, r5 ; THUMB2-NEXT: strh r3, [r0] ; THUMB2-NEXT: pop {r4, r5, r7, pc} -; THUMB2-NOT: it{{[te]+}} entry: %sel = call <8 x i16> @llvm.ct.select.v8i16(i1 %cond, <8 x i16> %a, <8 x i16> %b) ret <8 x i16> %sel @@ -1250,35 +1285,39 @@ define <4 x i32> @ct_v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b) { ; THUMB1-NEXT: movs r4, #1 ; THUMB1-NEXT: ands r4, r0 ; THUMB1-NEXT: ldr r1, [sp, #32] -; THUMB1-NEXT: rsbs r5, r4, #0 +; THUMB1-NEXT: mov r5, r4 +; THUMB1-NEXT: lsls r5, r5, #31 +; THUMB1-NEXT: asrs r5, r5, #31 ; THUMB1-NEXT: mov r0, r2 +; THUMB1-NEXT: eors r0, r1 ; THUMB1-NEXT: ands r0, r5 -; THUMB1-NEXT: mov r2, r1 -; THUMB1-NEXT: bics r2, r5 -; THUMB1-NEXT: orrs r0, r2 +; THUMB1-NEXT: eors r0, r1 ; THUMB1-NEXT: ldr r2, [sp, #36] -; THUMB1-NEXT: rsbs 
r5, r4, #0 +; THUMB1-NEXT: mov r5, r4 +; THUMB1-NEXT: lsls r5, r5, #31 +; THUMB1-NEXT: asrs r5, r5, #31 ; THUMB1-NEXT: mov r1, r3 +; THUMB1-NEXT: eors r1, r2 ; THUMB1-NEXT: ands r1, r5 -; THUMB1-NEXT: mov r3, r2 -; THUMB1-NEXT: bics r3, r5 -; THUMB1-NEXT: orrs r1, r3 +; THUMB1-NEXT: eors r1, r2 ; THUMB1-NEXT: ldr r3, [sp, #40] ; THUMB1-NEXT: ldr r5, [sp, #24] -; THUMB1-NEXT: rsbs r6, r4, #0 +; THUMB1-NEXT: mov r6, r4 +; THUMB1-NEXT: lsls r6, r6, #31 +; THUMB1-NEXT: asrs r6, r6, #31 ; THUMB1-NEXT: mov r2, r5 +; THUMB1-NEXT: eors r2, r3 ; THUMB1-NEXT: ands r2, r6 -; THUMB1-NEXT: mov r5, r3 -; THUMB1-NEXT: bics r5, r6 -; THUMB1-NEXT: orrs r2, r5 +; THUMB1-NEXT: eors r2, r3 ; THUMB1-NEXT: ldr r5, [sp, #44] ; THUMB1-NEXT: ldr r6, [sp, #28] -; THUMB1-NEXT: rsbs r7, r4, #0 +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 ; THUMB1-NEXT: mov r3, r6 +; THUMB1-NEXT: eors r3, r5 ; THUMB1-NEXT: ands r3, r7 -; THUMB1-NEXT: mov r6, r5 -; THUMB1-NEXT: bics r6, r7 -; THUMB1-NEXT: orrs r3, r6 +; THUMB1-NEXT: eors r3, r5 ; THUMB1-NEXT: add sp, #4 ; THUMB1-NEXT: pop {r4, r5, r6, r7, pc} ; @@ -1310,7 +1349,6 @@ define <4 x i32> @ct_v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b) { ; THUMB2-NEXT: bic.w r5, lr, r5 ; THUMB2-NEXT: orrs r3, r5 ; THUMB2-NEXT: pop {r4, r5, r7, pc} -; THUMB2-NOT: it{{[te]+}} entry: %sel = call <4 x i32> @llvm.ct.select.v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b) ret <4 x i32> %sel @@ -1370,35 +1408,39 @@ define <2 x i64> @ct_v2i64(i1 %cond, <2 x i64> %a, <2 x i64> %b) { ; THUMB1-NEXT: movs r4, #1 ; THUMB1-NEXT: ands r4, r0 ; THUMB1-NEXT: ldr r1, [sp, #32] -; THUMB1-NEXT: rsbs r5, r4, #0 +; THUMB1-NEXT: mov r5, r4 +; THUMB1-NEXT: lsls r5, r5, #31 +; THUMB1-NEXT: asrs r5, r5, #31 ; THUMB1-NEXT: mov r0, r2 +; THUMB1-NEXT: eors r0, r1 ; THUMB1-NEXT: ands r0, r5 -; THUMB1-NEXT: mov r2, r1 -; THUMB1-NEXT: bics r2, r5 -; THUMB1-NEXT: orrs r0, r2 +; THUMB1-NEXT: eors r0, r1 ; THUMB1-NEXT: ldr r2, [sp, #36] -; THUMB1-NEXT: rsbs r5, r4, #0 +; THUMB1-NEXT: mov r5, r4 +; THUMB1-NEXT: lsls r5, r5, #31 +; THUMB1-NEXT: asrs r5, r5, #31 ; THUMB1-NEXT: mov r1, r3 +; THUMB1-NEXT: eors r1, r2 ; THUMB1-NEXT: ands r1, r5 -; THUMB1-NEXT: mov r3, r2 -; THUMB1-NEXT: bics r3, r5 -; THUMB1-NEXT: orrs r1, r3 +; THUMB1-NEXT: eors r1, r2 ; THUMB1-NEXT: ldr r3, [sp, #40] ; THUMB1-NEXT: ldr r5, [sp, #24] -; THUMB1-NEXT: rsbs r6, r4, #0 +; THUMB1-NEXT: mov r6, r4 +; THUMB1-NEXT: lsls r6, r6, #31 +; THUMB1-NEXT: asrs r6, r6, #31 ; THUMB1-NEXT: mov r2, r5 +; THUMB1-NEXT: eors r2, r3 ; THUMB1-NEXT: ands r2, r6 -; THUMB1-NEXT: mov r5, r3 -; THUMB1-NEXT: bics r5, r6 -; THUMB1-NEXT: orrs r2, r5 +; THUMB1-NEXT: eors r2, r3 ; THUMB1-NEXT: ldr r5, [sp, #44] ; THUMB1-NEXT: ldr r6, [sp, #28] -; THUMB1-NEXT: rsbs r7, r4, #0 +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 ; THUMB1-NEXT: mov r3, r6 +; THUMB1-NEXT: eors r3, r5 ; THUMB1-NEXT: ands r3, r7 -; THUMB1-NEXT: mov r6, r5 -; THUMB1-NEXT: bics r6, r7 -; THUMB1-NEXT: orrs r3, r6 +; THUMB1-NEXT: eors r3, r5 ; THUMB1-NEXT: add sp, #4 ; THUMB1-NEXT: pop {r4, r5, r6, r7, pc} ; @@ -1430,7 +1472,6 @@ define <2 x i64> @ct_v2i64(i1 %cond, <2 x i64> %a, <2 x i64> %b) { ; THUMB2-NEXT: bic.w r5, lr, r5 ; THUMB2-NEXT: orrs r3, r5 ; THUMB2-NEXT: pop {r4, r5, r7, pc} -; THUMB2-NOT: it{{[te]+}} entry: %sel = call <2 x i64> @llvm.ct.select.v2i64(i1 %cond, <2 x i64> %a, <2 x i64> %b) ret <2 x i64> %sel @@ -1490,35 +1531,39 @@ define <4 x float> @ct_v4f32(i1 %cond, <4 x float> %a, <4 x float> %b) { ; 
THUMB1-NEXT: movs r4, #1 ; THUMB1-NEXT: ands r4, r0 ; THUMB1-NEXT: ldr r1, [sp, #32] -; THUMB1-NEXT: rsbs r5, r4, #0 +; THUMB1-NEXT: mov r5, r4 +; THUMB1-NEXT: lsls r5, r5, #31 +; THUMB1-NEXT: asrs r5, r5, #31 ; THUMB1-NEXT: mov r0, r2 +; THUMB1-NEXT: eors r0, r1 ; THUMB1-NEXT: ands r0, r5 -; THUMB1-NEXT: mov r2, r1 -; THUMB1-NEXT: bics r2, r5 -; THUMB1-NEXT: orrs r0, r2 +; THUMB1-NEXT: eors r0, r1 ; THUMB1-NEXT: ldr r2, [sp, #36] -; THUMB1-NEXT: rsbs r5, r4, #0 +; THUMB1-NEXT: mov r5, r4 +; THUMB1-NEXT: lsls r5, r5, #31 +; THUMB1-NEXT: asrs r5, r5, #31 ; THUMB1-NEXT: mov r1, r3 +; THUMB1-NEXT: eors r1, r2 ; THUMB1-NEXT: ands r1, r5 -; THUMB1-NEXT: mov r3, r2 -; THUMB1-NEXT: bics r3, r5 -; THUMB1-NEXT: orrs r1, r3 +; THUMB1-NEXT: eors r1, r2 ; THUMB1-NEXT: ldr r3, [sp, #40] ; THUMB1-NEXT: ldr r5, [sp, #24] -; THUMB1-NEXT: rsbs r6, r4, #0 +; THUMB1-NEXT: mov r6, r4 +; THUMB1-NEXT: lsls r6, r6, #31 +; THUMB1-NEXT: asrs r6, r6, #31 ; THUMB1-NEXT: mov r2, r5 +; THUMB1-NEXT: eors r2, r3 ; THUMB1-NEXT: ands r2, r6 -; THUMB1-NEXT: mov r5, r3 -; THUMB1-NEXT: bics r5, r6 -; THUMB1-NEXT: orrs r2, r5 +; THUMB1-NEXT: eors r2, r3 ; THUMB1-NEXT: ldr r5, [sp, #44] ; THUMB1-NEXT: ldr r6, [sp, #28] -; THUMB1-NEXT: rsbs r7, r4, #0 +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 ; THUMB1-NEXT: mov r3, r6 +; THUMB1-NEXT: eors r3, r5 ; THUMB1-NEXT: ands r3, r7 -; THUMB1-NEXT: mov r6, r5 -; THUMB1-NEXT: bics r6, r7 -; THUMB1-NEXT: orrs r3, r6 +; THUMB1-NEXT: eors r3, r5 ; THUMB1-NEXT: add sp, #4 ; THUMB1-NEXT: pop {r4, r5, r6, r7, pc} ; @@ -1550,7 +1595,6 @@ define <4 x float> @ct_v4f32(i1 %cond, <4 x float> %a, <4 x float> %b) { ; THUMB2-NEXT: bic.w r5, lr, r5 ; THUMB2-NEXT: orrs r3, r5 ; THUMB2-NEXT: pop {r4, r5, r7, pc} -; THUMB2-NOT: it{{[te]+}} entry: %sel = call <4 x float> @llvm.ct.select.v4f32(i1 %cond, <4 x float> %a, <4 x float> %b) ret <4 x float> %sel @@ -1610,35 +1654,39 @@ define <2 x double> @ct_v2f64(i1 %cond, <2 x double> %a, <2 x double> %b) { ; THUMB1-NEXT: movs r4, #1 ; THUMB1-NEXT: ands r4, r0 ; THUMB1-NEXT: ldr r1, [sp, #32] -; THUMB1-NEXT: rsbs r5, r4, #0 +; THUMB1-NEXT: mov r5, r4 +; THUMB1-NEXT: lsls r5, r5, #31 +; THUMB1-NEXT: asrs r5, r5, #31 ; THUMB1-NEXT: mov r0, r2 +; THUMB1-NEXT: eors r0, r1 ; THUMB1-NEXT: ands r0, r5 -; THUMB1-NEXT: mov r2, r1 -; THUMB1-NEXT: bics r2, r5 -; THUMB1-NEXT: orrs r0, r2 +; THUMB1-NEXT: eors r0, r1 ; THUMB1-NEXT: ldr r2, [sp, #36] -; THUMB1-NEXT: rsbs r5, r4, #0 +; THUMB1-NEXT: mov r5, r4 +; THUMB1-NEXT: lsls r5, r5, #31 +; THUMB1-NEXT: asrs r5, r5, #31 ; THUMB1-NEXT: mov r1, r3 +; THUMB1-NEXT: eors r1, r2 ; THUMB1-NEXT: ands r1, r5 -; THUMB1-NEXT: mov r3, r2 -; THUMB1-NEXT: bics r3, r5 -; THUMB1-NEXT: orrs r1, r3 +; THUMB1-NEXT: eors r1, r2 ; THUMB1-NEXT: ldr r3, [sp, #40] ; THUMB1-NEXT: ldr r5, [sp, #24] -; THUMB1-NEXT: rsbs r6, r4, #0 +; THUMB1-NEXT: mov r6, r4 +; THUMB1-NEXT: lsls r6, r6, #31 +; THUMB1-NEXT: asrs r6, r6, #31 ; THUMB1-NEXT: mov r2, r5 +; THUMB1-NEXT: eors r2, r3 ; THUMB1-NEXT: ands r2, r6 -; THUMB1-NEXT: mov r5, r3 -; THUMB1-NEXT: bics r5, r6 -; THUMB1-NEXT: orrs r2, r5 +; THUMB1-NEXT: eors r2, r3 ; THUMB1-NEXT: ldr r5, [sp, #44] ; THUMB1-NEXT: ldr r6, [sp, #28] -; THUMB1-NEXT: rsbs r7, r4, #0 +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 ; THUMB1-NEXT: mov r3, r6 +; THUMB1-NEXT: eors r3, r5 ; THUMB1-NEXT: ands r3, r7 -; THUMB1-NEXT: mov r6, r5 -; THUMB1-NEXT: bics r6, r7 -; THUMB1-NEXT: orrs r3, r6 +; THUMB1-NEXT: eors r3, r5 ; THUMB1-NEXT: 
add sp, #4 ; THUMB1-NEXT: pop {r4, r5, r6, r7, pc} ; @@ -1670,7 +1718,6 @@ define <2 x double> @ct_v2f64(i1 %cond, <2 x double> %a, <2 x double> %b) { ; THUMB2-NEXT: bic.w r5, lr, r5 ; THUMB2-NEXT: orrs r3, r5 ; THUMB2-NEXT: pop {r4, r5, r7, pc} -; THUMB2-NOT: it{{[te]+}} entry: %sel = call <2 x double> @llvm.ct.select.v2f64(i1 %cond, <2 x double> %a, <2 x double> %b) ret <2 x double> %sel @@ -1704,12 +1751,13 @@ define <1 x i8> @ct_v1i8(i1 %cond, <1 x i8> %a, <1 x i8> %b) { ; THUMB1-NEXT: push {r4, lr} ; THUMB1-NEXT: movs r3, #1 ; THUMB1-NEXT: ands r3, r0 -; THUMB1-NEXT: rsbs r4, r3, #0 +; THUMB1-NEXT: mov r4, r3 +; THUMB1-NEXT: lsls r4, r4, #31 +; THUMB1-NEXT: asrs r4, r4, #31 ; THUMB1-NEXT: mov r0, r1 +; THUMB1-NEXT: eors r0, r2 ; THUMB1-NEXT: ands r0, r4 -; THUMB1-NEXT: mov r1, r2 -; THUMB1-NEXT: bics r1, r4 -; THUMB1-NEXT: orrs r0, r1 +; THUMB1-NEXT: eors r0, r2 ; THUMB1-NEXT: pop {r4, pc} ; ; THUMB2-LABEL: ct_v1i8: @@ -1720,7 +1768,6 @@ define <1 x i8> @ct_v1i8(i1 %cond, <1 x i8> %a, <1 x i8> %b) { ; THUMB2-NEXT: bic.w r12, r2, r12 ; THUMB2-NEXT: orr.w r0, r0, r12 ; THUMB2-NEXT: bx lr -; THUMB2-NOT: it{{[te]+}} entry: %sel = call <1 x i8> @llvm.ct.select.i8(i1 %cond, <1 x i8> %a, <1 x i8> %b) ret <1 x i8> %sel @@ -1761,19 +1808,21 @@ define <2 x i8> @ct_v2i8(i1 %cond, <2 x i8> %a, <2 x i8> %b) { ; THUMB1-NEXT: push {r4, r5, r7, lr} ; THUMB1-NEXT: movs r4, #1 ; THUMB1-NEXT: ands r4, r0 -; THUMB1-NEXT: rsbs r5, r4, #0 +; THUMB1-NEXT: mov r5, r4 +; THUMB1-NEXT: lsls r5, r5, #31 +; THUMB1-NEXT: asrs r5, r5, #31 ; THUMB1-NEXT: mov r0, r1 +; THUMB1-NEXT: eors r0, r3 ; THUMB1-NEXT: ands r0, r5 -; THUMB1-NEXT: mov r1, r3 -; THUMB1-NEXT: bics r1, r5 -; THUMB1-NEXT: orrs r0, r1 +; THUMB1-NEXT: eors r0, r3 ; THUMB1-NEXT: ldr r3, [sp, #16] -; THUMB1-NEXT: rsbs r5, r4, #0 +; THUMB1-NEXT: mov r5, r4 +; THUMB1-NEXT: lsls r5, r5, #31 +; THUMB1-NEXT: asrs r5, r5, #31 ; THUMB1-NEXT: mov r1, r2 +; THUMB1-NEXT: eors r1, r3 ; THUMB1-NEXT: ands r1, r5 -; THUMB1-NEXT: mov r2, r3 -; THUMB1-NEXT: bics r2, r5 -; THUMB1-NEXT: orrs r1, r2 +; THUMB1-NEXT: eors r1, r3 ; THUMB1-NEXT: pop {r4, r5, r7, pc} ; ; THUMB2-LABEL: ct_v2i8: @@ -1791,7 +1840,6 @@ define <2 x i8> @ct_v2i8(i1 %cond, <2 x i8> %a, <2 x i8> %b) { ; THUMB2-NEXT: bic.w lr, r3, lr ; THUMB2-NEXT: orr.w r1, r1, lr ; THUMB2-NEXT: pop {r7, pc} -; THUMB2-NOT: it{{[te]+}} entry: %sel = call <2 x i8> @llvm.ct.select.i16(i1 %cond, <2 x i8> %a, <2 x i8> %b) ret <2 x i8> %sel @@ -1845,34 +1893,38 @@ define <4 x i8> @ct_v4i8(i1 %cond, <4 x i8> %a, <4 x i8> %b) { ; THUMB1-NEXT: movs r4, #1 ; THUMB1-NEXT: ands r4, r0 ; THUMB1-NEXT: ldr r5, [sp, #24] -; THUMB1-NEXT: rsbs r6, r4, #0 +; THUMB1-NEXT: mov r6, r4 +; THUMB1-NEXT: lsls r6, r6, #31 +; THUMB1-NEXT: asrs r6, r6, #31 ; THUMB1-NEXT: mov r0, r1 +; THUMB1-NEXT: eors r0, r5 ; THUMB1-NEXT: ands r0, r6 -; THUMB1-NEXT: mov r1, r5 -; THUMB1-NEXT: bics r1, r6 -; THUMB1-NEXT: orrs r0, r1 +; THUMB1-NEXT: eors r0, r5 ; THUMB1-NEXT: ldr r5, [sp, #28] -; THUMB1-NEXT: rsbs r6, r4, #0 +; THUMB1-NEXT: mov r6, r4 +; THUMB1-NEXT: lsls r6, r6, #31 +; THUMB1-NEXT: asrs r6, r6, #31 ; THUMB1-NEXT: mov r1, r2 +; THUMB1-NEXT: eors r1, r5 ; THUMB1-NEXT: ands r1, r6 -; THUMB1-NEXT: mov r2, r5 -; THUMB1-NEXT: bics r2, r6 -; THUMB1-NEXT: orrs r1, r2 +; THUMB1-NEXT: eors r1, r5 ; THUMB1-NEXT: ldr r5, [sp, #32] -; THUMB1-NEXT: rsbs r6, r4, #0 +; THUMB1-NEXT: mov r6, r4 +; THUMB1-NEXT: lsls r6, r6, #31 +; THUMB1-NEXT: asrs r6, r6, #31 ; THUMB1-NEXT: mov r2, r3 +; THUMB1-NEXT: eors r2, r5 ; THUMB1-NEXT: ands r2, r6 -; THUMB1-NEXT: mov r3, 
r5 -; THUMB1-NEXT: bics r3, r6 -; THUMB1-NEXT: orrs r2, r3 +; THUMB1-NEXT: eors r2, r5 ; THUMB1-NEXT: ldr r5, [sp, #36] ; THUMB1-NEXT: ldr r6, [sp, #20] -; THUMB1-NEXT: rsbs r7, r4, #0 +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 ; THUMB1-NEXT: mov r3, r6 +; THUMB1-NEXT: eors r3, r5 ; THUMB1-NEXT: ands r3, r7 -; THUMB1-NEXT: mov r6, r5 -; THUMB1-NEXT: bics r6, r7 -; THUMB1-NEXT: orrs r3, r6 +; THUMB1-NEXT: eors r3, r5 ; THUMB1-NEXT: pop {r4, r5, r6, r7, pc} ; ; THUMB2-LABEL: ct_v4i8: @@ -1902,7 +1954,6 @@ define <4 x i8> @ct_v4i8(i1 %cond, <4 x i8> %a, <4 x i8> %b) { ; THUMB2-NEXT: bic.w r5, lr, r5 ; THUMB2-NEXT: orrs r3, r5 ; THUMB2-NEXT: pop {r4, r5, r7, pc} -; THUMB2-NOT: it{{[te]+}} entry: %sel = call <4 x i8> @llvm.ct.select.i32(i1 %cond, <4 x i8> %a, <4 x i8> %b) ret <4 x i8> %sel @@ -1933,12 +1984,13 @@ define <1 x i16> @ct_v1i16(i1 %cond, <1 x i16> %a, <1 x i16> %b) { ; THUMB1-NEXT: push {r4, lr} ; THUMB1-NEXT: movs r3, #1 ; THUMB1-NEXT: ands r3, r0 -; THUMB1-NEXT: rsbs r4, r3, #0 +; THUMB1-NEXT: mov r4, r3 +; THUMB1-NEXT: lsls r4, r4, #31 +; THUMB1-NEXT: asrs r4, r4, #31 ; THUMB1-NEXT: mov r0, r1 +; THUMB1-NEXT: eors r0, r2 ; THUMB1-NEXT: ands r0, r4 -; THUMB1-NEXT: mov r1, r2 -; THUMB1-NEXT: bics r1, r4 -; THUMB1-NEXT: orrs r0, r1 +; THUMB1-NEXT: eors r0, r2 ; THUMB1-NEXT: pop {r4, pc} ; ; THUMB2-LABEL: ct_v1i16: @@ -1949,7 +2001,6 @@ define <1 x i16> @ct_v1i16(i1 %cond, <1 x i16> %a, <1 x i16> %b) { ; THUMB2-NEXT: bic.w r12, r2, r12 ; THUMB2-NEXT: orr.w r0, r0, r12 ; THUMB2-NEXT: bx lr -; THUMB2-NOT: it{{[te]+}} entry: %sel = call <1 x i16> @llvm.ct.select.i16(i1 %cond, <1 x i16> %a, <1 x i16> %b) ret <1 x i16> %sel @@ -1990,19 +2041,21 @@ define <2 x i16> @ct_v2i16(i1 %cond, <2 x i16> %a, <2 x i16> %b) { ; THUMB1-NEXT: push {r4, r5, r7, lr} ; THUMB1-NEXT: movs r4, #1 ; THUMB1-NEXT: ands r4, r0 -; THUMB1-NEXT: rsbs r5, r4, #0 +; THUMB1-NEXT: mov r5, r4 +; THUMB1-NEXT: lsls r5, r5, #31 +; THUMB1-NEXT: asrs r5, r5, #31 ; THUMB1-NEXT: mov r0, r1 +; THUMB1-NEXT: eors r0, r3 ; THUMB1-NEXT: ands r0, r5 -; THUMB1-NEXT: mov r1, r3 -; THUMB1-NEXT: bics r1, r5 -; THUMB1-NEXT: orrs r0, r1 +; THUMB1-NEXT: eors r0, r3 ; THUMB1-NEXT: ldr r3, [sp, #16] -; THUMB1-NEXT: rsbs r5, r4, #0 +; THUMB1-NEXT: mov r5, r4 +; THUMB1-NEXT: lsls r5, r5, #31 +; THUMB1-NEXT: asrs r5, r5, #31 ; THUMB1-NEXT: mov r1, r2 +; THUMB1-NEXT: eors r1, r3 ; THUMB1-NEXT: ands r1, r5 -; THUMB1-NEXT: mov r2, r3 -; THUMB1-NEXT: bics r2, r5 -; THUMB1-NEXT: orrs r1, r2 +; THUMB1-NEXT: eors r1, r3 ; THUMB1-NEXT: pop {r4, r5, r7, pc} ; ; THUMB2-LABEL: ct_v2i16: @@ -2020,7 +2073,6 @@ define <2 x i16> @ct_v2i16(i1 %cond, <2 x i16> %a, <2 x i16> %b) { ; THUMB2-NEXT: bic.w lr, r3, lr ; THUMB2-NEXT: orr.w r1, r1, lr ; THUMB2-NEXT: pop {r7, pc} -; THUMB2-NOT: it{{[te]+}} entry: %sel = call <2 x i16> @llvm.ct.select.i32(i1 %cond, <2 x i16> %a, <2 x i16> %b) ret <2 x i16> %sel @@ -2051,12 +2103,13 @@ define <1 x i32> @ct_v1i32(i1 %cond, <1 x i32> %a, <1 x i32> %b) { ; THUMB1-NEXT: push {r4, lr} ; THUMB1-NEXT: movs r3, #1 ; THUMB1-NEXT: ands r3, r0 -; THUMB1-NEXT: rsbs r4, r3, #0 +; THUMB1-NEXT: mov r4, r3 +; THUMB1-NEXT: lsls r4, r4, #31 +; THUMB1-NEXT: asrs r4, r4, #31 ; THUMB1-NEXT: mov r0, r1 +; THUMB1-NEXT: eors r0, r2 ; THUMB1-NEXT: ands r0, r4 -; THUMB1-NEXT: mov r1, r2 -; THUMB1-NEXT: bics r1, r4 -; THUMB1-NEXT: orrs r0, r1 +; THUMB1-NEXT: eors r0, r2 ; THUMB1-NEXT: pop {r4, pc} ; ; THUMB2-LABEL: ct_v1i32: @@ -2067,7 +2120,6 @@ define <1 x i32> @ct_v1i32(i1 %cond, <1 x i32> %a, <1 x 
i32> %b) { ; THUMB2-NEXT: bic.w r12, r2, r12 ; THUMB2-NEXT: orr.w r0, r0, r12 ; THUMB2-NEXT: bx lr -; THUMB2-NOT: it{{[te]+}} entry: %sel = call <1 x i32> @llvm.ct.select.i32(i1 %cond, <1 x i32> %a, <1 x i32> %b) ret <1 x i32> %sel @@ -2104,12 +2156,13 @@ define <1 x float> @ct_v1f32(i1 %cond, <1 x float> %a, <1 x float> %b) { ; THUMB1-NEXT: push {r4, lr} ; THUMB1-NEXT: movs r3, #1 ; THUMB1-NEXT: ands r3, r0 -; THUMB1-NEXT: rsbs r4, r3, #0 +; THUMB1-NEXT: mov r4, r3 +; THUMB1-NEXT: lsls r4, r4, #31 +; THUMB1-NEXT: asrs r4, r4, #31 ; THUMB1-NEXT: mov r0, r1 +; THUMB1-NEXT: eors r0, r2 ; THUMB1-NEXT: ands r0, r4 -; THUMB1-NEXT: mov r1, r2 -; THUMB1-NEXT: bics r1, r4 -; THUMB1-NEXT: orrs r0, r1 +; THUMB1-NEXT: eors r0, r2 ; THUMB1-NEXT: pop {r4, pc} ; ; THUMB2-LABEL: ct_v1f32: @@ -2120,7 +2173,6 @@ define <1 x float> @ct_v1f32(i1 %cond, <1 x float> %a, <1 x float> %b) { ; THUMB2-NEXT: bic.w r12, r2, r12 ; THUMB2-NEXT: orr.w r0, r0, r12 ; THUMB2-NEXT: bx lr -; THUMB2-NOT: it{{[te]+}} entry: %sel = call <1 x float> @llvm.ct.select.f32(i1 %cond, <1 x float> %a, <1 x float> %b) ret <1 x float> %sel diff --git a/llvm/test/CodeGen/ARM/ctselect.ll b/llvm/test/CodeGen/ARM/ctselect.ll index 0dec8ce4a9725..40e17cb135627 100644 --- a/llvm/test/CodeGen/ARM/ctselect.ll +++ b/llvm/test/CodeGen/ARM/ctselect.ll @@ -31,12 +31,13 @@ define i1 @ct_i1(i1 %cond, i1 %a, i1 %b) { ; THUMB1-NEXT: push {r4, lr} ; THUMB1-NEXT: movs r3, #1 ; THUMB1-NEXT: ands r3, r0 -; THUMB1-NEXT: rsbs r4, r3, #0 +; THUMB1-NEXT: mov r4, r3 +; THUMB1-NEXT: lsls r4, r4, #31 +; THUMB1-NEXT: asrs r4, r4, #31 ; THUMB1-NEXT: mov r0, r1 +; THUMB1-NEXT: eors r0, r2 ; THUMB1-NEXT: ands r0, r4 -; THUMB1-NEXT: mov r1, r2 -; THUMB1-NEXT: bics r1, r4 -; THUMB1-NEXT: orrs r0, r1 +; THUMB1-NEXT: eors r0, r2 ; THUMB1-NEXT: pop {r4, pc} ; ; THUMB2-LABEL: ct_i1: @@ -47,7 +48,6 @@ define i1 @ct_i1(i1 %cond, i1 %a, i1 %b) { ; THUMB2-NEXT: bic.w r12, r2, r12 ; THUMB2-NEXT: orr.w r0, r0, r12 ; THUMB2-NEXT: bx lr -; THUMB2-NOT: it{{[te]+}} ; ; CORTEXA9-LABEL: ct_i1: ; CORTEXA9: @ %bb.0: @ %entry @@ -96,12 +96,13 @@ define i8 @ct_int8(i1 %cond, i8 %a, i8 %b) { ; THUMB1-NEXT: push {r4, lr} ; THUMB1-NEXT: movs r3, #1 ; THUMB1-NEXT: ands r3, r0 -; THUMB1-NEXT: rsbs r4, r3, #0 +; THUMB1-NEXT: mov r4, r3 +; THUMB1-NEXT: lsls r4, r4, #31 +; THUMB1-NEXT: asrs r4, r4, #31 ; THUMB1-NEXT: mov r0, r1 +; THUMB1-NEXT: eors r0, r2 ; THUMB1-NEXT: ands r0, r4 -; THUMB1-NEXT: mov r1, r2 -; THUMB1-NEXT: bics r1, r4 -; THUMB1-NEXT: orrs r0, r1 +; THUMB1-NEXT: eors r0, r2 ; THUMB1-NEXT: pop {r4, pc} ; ; THUMB2-LABEL: ct_int8: @@ -112,7 +113,6 @@ define i8 @ct_int8(i1 %cond, i8 %a, i8 %b) { ; THUMB2-NEXT: bic.w r12, r2, r12 ; THUMB2-NEXT: orr.w r0, r0, r12 ; THUMB2-NEXT: bx lr -; THUMB2-NOT: it{{[te]+}} ; ; CORTEXA9-LABEL: ct_int8: ; CORTEXA9: @ %bb.0: @ %entry @@ -161,12 +161,13 @@ define i16 @ct_int16(i1 %cond, i16 %a, i16 %b) { ; THUMB1-NEXT: push {r4, lr} ; THUMB1-NEXT: movs r3, #1 ; THUMB1-NEXT: ands r3, r0 -; THUMB1-NEXT: rsbs r4, r3, #0 +; THUMB1-NEXT: mov r4, r3 +; THUMB1-NEXT: lsls r4, r4, #31 +; THUMB1-NEXT: asrs r4, r4, #31 ; THUMB1-NEXT: mov r0, r1 +; THUMB1-NEXT: eors r0, r2 ; THUMB1-NEXT: ands r0, r4 -; THUMB1-NEXT: mov r1, r2 -; THUMB1-NEXT: bics r1, r4 -; THUMB1-NEXT: orrs r0, r1 +; THUMB1-NEXT: eors r0, r2 ; THUMB1-NEXT: pop {r4, pc} ; ; THUMB2-LABEL: ct_int16: @@ -177,7 +178,6 @@ define i16 @ct_int16(i1 %cond, i16 %a, i16 %b) { ; THUMB2-NEXT: bic.w r12, r2, r12 ; THUMB2-NEXT: orr.w r0, r0, r12 ; THUMB2-NEXT: bx lr -; THUMB2-NOT: it{{[te]+}} ; ; CORTEXA9-LABEL: 
ct_int16: ; CORTEXA9: @ %bb.0: @ %entry @@ -226,12 +226,13 @@ define i32 @ct_int32(i1 %cond, i32 %a, i32 %b) { ; THUMB1-NEXT: push {r4, lr} ; THUMB1-NEXT: movs r3, #1 ; THUMB1-NEXT: ands r3, r0 -; THUMB1-NEXT: rsbs r4, r3, #0 +; THUMB1-NEXT: mov r4, r3 +; THUMB1-NEXT: lsls r4, r4, #31 +; THUMB1-NEXT: asrs r4, r4, #31 ; THUMB1-NEXT: mov r0, r1 +; THUMB1-NEXT: eors r0, r2 ; THUMB1-NEXT: ands r0, r4 -; THUMB1-NEXT: mov r1, r2 -; THUMB1-NEXT: bics r1, r4 -; THUMB1-NEXT: orrs r0, r1 +; THUMB1-NEXT: eors r0, r2 ; THUMB1-NEXT: pop {r4, pc} ; ; THUMB2-LABEL: ct_int32: @@ -242,7 +243,6 @@ define i32 @ct_int32(i1 %cond, i32 %a, i32 %b) { ; THUMB2-NEXT: bic.w r12, r2, r12 ; THUMB2-NEXT: orr.w r0, r0, r12 ; THUMB2-NEXT: bx lr -; THUMB2-NOT: it{{[te]+}} ; ; CORTEXA9-LABEL: ct_int32: ; CORTEXA9: @ %bb.0: @ %entry @@ -307,19 +307,21 @@ define i64 @ct_int64(i1 %cond, i64 %a, i64 %b) { ; THUMB1-NEXT: movs r4, #1 ; THUMB1-NEXT: ands r4, r0 ; THUMB1-NEXT: ldr r1, [sp, #16] -; THUMB1-NEXT: rsbs r5, r4, #0 +; THUMB1-NEXT: mov r5, r4 +; THUMB1-NEXT: lsls r5, r5, #31 +; THUMB1-NEXT: asrs r5, r5, #31 ; THUMB1-NEXT: mov r0, r2 +; THUMB1-NEXT: eors r0, r1 ; THUMB1-NEXT: ands r0, r5 -; THUMB1-NEXT: mov r2, r1 -; THUMB1-NEXT: bics r2, r5 -; THUMB1-NEXT: orrs r0, r2 +; THUMB1-NEXT: eors r0, r1 ; THUMB1-NEXT: ldr r2, [sp, #20] -; THUMB1-NEXT: rsbs r5, r4, #0 +; THUMB1-NEXT: mov r5, r4 +; THUMB1-NEXT: lsls r5, r5, #31 +; THUMB1-NEXT: asrs r5, r5, #31 ; THUMB1-NEXT: mov r1, r3 +; THUMB1-NEXT: eors r1, r2 ; THUMB1-NEXT: ands r1, r5 -; THUMB1-NEXT: mov r3, r2 -; THUMB1-NEXT: bics r3, r5 -; THUMB1-NEXT: orrs r1, r3 +; THUMB1-NEXT: eors r1, r2 ; THUMB1-NEXT: pop {r4, r5, r7, pc} ; ; THUMB2-LABEL: ct_int64: @@ -338,7 +340,6 @@ define i64 @ct_int64(i1 %cond, i64 %a, i64 %b) { ; THUMB2-NEXT: bic.w lr, r2, lr ; THUMB2-NEXT: orr.w r1, r1, lr ; THUMB2-NEXT: pop {r7, pc} -; THUMB2-NOT: it{{[te]+}} ; ; CORTEXA9-LABEL: ct_int64: ; CORTEXA9: @ %bb.0: @ %entry @@ -408,12 +409,13 @@ define float @ct_float(i1 %cond, float %a, float %b) { ; THUMB1-NEXT: push {r4, lr} ; THUMB1-NEXT: movs r3, #1 ; THUMB1-NEXT: ands r3, r0 -; THUMB1-NEXT: rsbs r4, r3, #0 +; THUMB1-NEXT: mov r4, r3 +; THUMB1-NEXT: lsls r4, r4, #31 +; THUMB1-NEXT: asrs r4, r4, #31 ; THUMB1-NEXT: mov r0, r1 +; THUMB1-NEXT: eors r0, r2 ; THUMB1-NEXT: ands r0, r4 -; THUMB1-NEXT: mov r1, r2 -; THUMB1-NEXT: bics r1, r4 -; THUMB1-NEXT: orrs r0, r1 +; THUMB1-NEXT: eors r0, r2 ; THUMB1-NEXT: pop {r4, pc} ; ; THUMB2-LABEL: ct_float: @@ -424,7 +426,6 @@ define float @ct_float(i1 %cond, float %a, float %b) { ; THUMB2-NEXT: bic.w r12, r2, r12 ; THUMB2-NEXT: orr.w r0, r0, r12 ; THUMB2-NEXT: bx lr -; THUMB2-NOT: it{{[te]+}} ; ; CORTEXA9-LABEL: ct_float: ; CORTEXA9: @ %bb.0: @ %entry @@ -493,19 +494,21 @@ define double @ct_f64(i1 %cond, double %a, double %b) { ; THUMB1-NEXT: movs r4, #1 ; THUMB1-NEXT: ands r4, r0 ; THUMB1-NEXT: ldr r1, [sp, #16] -; THUMB1-NEXT: rsbs r5, r4, #0 +; THUMB1-NEXT: mov r5, r4 +; THUMB1-NEXT: lsls r5, r5, #31 +; THUMB1-NEXT: asrs r5, r5, #31 ; THUMB1-NEXT: mov r0, r2 +; THUMB1-NEXT: eors r0, r1 ; THUMB1-NEXT: ands r0, r5 -; THUMB1-NEXT: mov r2, r1 -; THUMB1-NEXT: bics r2, r5 -; THUMB1-NEXT: orrs r0, r2 +; THUMB1-NEXT: eors r0, r1 ; THUMB1-NEXT: ldr r2, [sp, #20] -; THUMB1-NEXT: rsbs r5, r4, #0 +; THUMB1-NEXT: mov r5, r4 +; THUMB1-NEXT: lsls r5, r5, #31 +; THUMB1-NEXT: asrs r5, r5, #31 ; THUMB1-NEXT: mov r1, r3 +; THUMB1-NEXT: eors r1, r2 ; THUMB1-NEXT: ands r1, r5 -; THUMB1-NEXT: mov r3, r2 -; THUMB1-NEXT: bics r3, r5 -; THUMB1-NEXT: orrs r1, r3 +; THUMB1-NEXT: 
eors r1, r2 ; THUMB1-NEXT: pop {r4, r5, r7, pc} ; ; THUMB2-LABEL: ct_f64: @@ -524,7 +527,6 @@ define double @ct_f64(i1 %cond, double %a, double %b) { ; THUMB2-NEXT: bic.w lr, r2, lr ; THUMB2-NEXT: orr.w r1, r1, lr ; THUMB2-NEXT: pop {r7, pc} -; THUMB2-NOT: it{{[te]+}} ; ; CORTEXA9-LABEL: ct_f64: ; CORTEXA9: @ %bb.0: @ %entry From c4b5a5be26ecf24074b57d827c6a652dc1c498e7 Mon Sep 17 00:00:00 2001 From: Francesco Bertolaccini Date: Mon, 22 Sep 2025 17:50:27 +0200 Subject: [PATCH 43/63] [CT] Fix legalization of vectors with small lanes on AArch64 --- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 17e76a2945db3..89e949d96146e 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -531,7 +531,14 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::CTSELECT, MVT::f32, Custom); setOperationAction(ISD::CTSELECT, MVT::f64, Custom); for (MVT VT : MVT::vector_valuetypes()) { - setOperationAction(ISD::CTSELECT, VT, Expand); + MVT elemType = VT.getVectorElementType(); + if (elemType == MVT::i8 || elemType == MVT::i16) { + setOperationAction(ISD::CTSELECT, VT, Promote); + } else if ((elemType == MVT::f16 || elemType == MVT::bf16) && !Subtarget->hasFullFP16()) { + setOperationAction(ISD::CTSELECT, VT, Promote); + } else { + setOperationAction(ISD::CTSELECT, VT, Expand); + } } setOperationAction(ISD::SELECT_CC, MVT::i32, Custom); setOperationAction(ISD::SELECT_CC, MVT::i64, Custom); From 97c00444a8c2aca21973b80327d32a0c9bb1dcdd Mon Sep 17 00:00:00 2001 From: kumarak Date: Wed, 24 Sep 2025 23:02:13 +0000 Subject: [PATCH 44/63] [CT] create instruction bundle during post RA expansion --- .../SelectionDAG/SelectionDAGBuilder.cpp | 4 ++ .../Target/AArch64/AArch64TargetMachine.cpp | 4 +- llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp | 59 +++++++++++++------ llvm/lib/Target/ARM/ARMTargetMachine.cpp | 3 +- llvm/lib/Target/X86/X86InstrInfo.cpp | 8 +-- llvm/lib/Target/X86/X86TargetMachine.cpp | 3 +- 6 files changed, 55 insertions(+), 26 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 8276bd25b840d..90d9ac76b6e57 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -6854,6 +6854,10 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, "llvm.ct.select: predicates with vector types not supported yet"); } + // Set function attribute to indicate ct.select usage + Function &F = DAG.getMachineFunction().getFunction(); + F.addFnAttr("ct-select"); + // Handle scalar types if (TLI.isSelectSupported( TargetLoweringBase::SelectSupportKind::CtSelect) && diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp index 5b80b08375f8c..e10652a118157 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -896,7 +896,9 @@ void AArch64PassConfig::addPostBBSections() { void AArch64PassConfig::addPreEmitPass2() { // SVE bundles move prefixes with destructive operations. BLR_RVMARKER pseudo // instructions are lowered to bundles as well. 
- addPass(createUnpackMachineBundles(nullptr)); + addPass(createUnpackMachineBundles([](const MachineFunction &MF) { + return MF.getFunction().hasFnAttribute("ct-select"); + })); } bool AArch64PassConfig::addRegAssignAndRewriteOptimized() { diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp index 01603e85de2c3..f0f57ade2bbe5 100644 --- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp +++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -1564,7 +1564,8 @@ bool ARMBaseInstrInfo::expandCtSelectVector(MachineInstr &MI) const { // When cond = 0: mask = 0x00000000. // When cond = 1: mask = 0xFFFFFFFF. - BuildMI(*MBB, MI, DL, get(RsbOp), MaskReg) + MachineInstr *FirstNewMI = + BuildMI(*MBB, MI, DL, get(RsbOp), MaskReg) .addReg(CondReg) .addImm(0) .add(predOps(ARMCC::AL)) @@ -1592,11 +1593,17 @@ .setMIFlag(MachineInstr::MIFlag::NoMerge); // 4. result = A | B - BuildMI(*MBB, MI, DL, get(OrrOp), DestReg) + auto LastNewMI = BuildMI(*MBB, MI, DL, get(OrrOp), DestReg) .addReg(DestReg) .addReg(VectorMaskReg) .add(predOps(ARMCC::AL)) .setMIFlag(MachineInstr::MIFlag::NoMerge); + + auto BundleStart = FirstNewMI->getIterator(); + auto BundleEnd = LastNewMI->getIterator(); + + // Add instruction bundling + finalizeBundle(*MBB, BundleStart, std::next(BundleEnd)); MI.eraseFromParent(); return true; @@ -1624,7 +1631,8 @@ unsigned ShiftAmount = RegSize - 1; // Option 1: Shift-based mask (preferred - no flag modification) - BuildMI(*MBB, MI, DL, get(ARM::tMOVr), MaskReg) + MachineInstr *FirstNewMI = + BuildMI(*MBB, MI, DL, get(ARM::tMOVr), MaskReg) .addReg(CondReg) .add(predOps(ARMCC::AL)) .setMIFlag(MachineInstr::MIFlag::NoMerge); @@ -1662,12 +1670,16 @@ .setMIFlag(MachineInstr::MIFlag::NoMerge); // 4. result = src2 ^ masked_xor - BuildMI(*MBB, MI, DL, get(ARM::tEOR), DestReg) + auto LastMI = BuildMI(*MBB, MI, DL, get(ARM::tEOR), DestReg) .addReg(DestReg) .addReg(Src2Reg) .add(predOps(ARMCC::AL)) .setMIFlag(MachineInstr::MIFlag::NoMerge); + // Add instruction bundling + auto BundleStart = FirstNewMI->getIterator(); + finalizeBundle(*MBB, BundleStart, std::next(LastMI->getIterator())); + MI.eraseFromParent(); return true; } @@ -1697,6 +1709,7 @@ unsigned Opcode = MI.getOpcode(); bool IsFloat = Opcode == ARM::CTSELECTf32 || Opcode == ARM::CTSELECTf16 || Opcode == ARM::CTSELECTbf16; + MachineInstr *FirstNewMI = nullptr; if (IsFloat) { // Each float pseudo has: (outs $dst, $tmp_mask, $scratch1, $scratch2), (ins $src1, $src2, $cond) // We use two scratch registers in tablegen for bitwise ops on float types.
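The Thumb1 expansion above, and the updated CHECK lines throughout the ARM tests, all compute the same branchless select: build an all-ones or all-zero mask from the condition bit, then combine the operands with plain bitwise operations. A minimal C++ sketch of the two formulations in play (illustrative only; the function names below are ours, not LLVM's):

    #include <cstdint>

    // AND/BIC/ORR form used by expandCtSelect and the vector expansion:
    // result = (a & mask) | (b & ~mask).
    static uint32_t ctSelectAndOr(uint32_t cond, uint32_t a, uint32_t b) {
      uint32_t mask = 0u - (cond & 1u); // RSB from 0: 1 -> 0xFFFFFFFF, 0 -> 0
      return (a & mask) | (b & ~mask);
    }

    // XOR form emitted by expandCtSelectThumb: the mask comes from a
    // lsls #31 / asrs #31 pair instead of a flag-setting subtract, and the
    // select needs one temporary fewer: result = b ^ ((a ^ b) & mask).
    static uint32_t ctSelectXor(uint32_t cond, uint32_t a, uint32_t b) {
      uint32_t mask = uint32_t(int32_t(cond << 31) >> 31); // sign-extend bit 0
      return b ^ ((a ^ b) & mask); // all-ones mask -> a, zero mask -> b
    }

Both forms execute the same instruction sequence regardless of cond; the mov/eors/ands/eors runs in the Thumb1 CHECK lines are exactly the second form.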
@@ -1710,11 +1723,11 @@ bool ARMBaseInstrInfo::expandCtSelect(MachineInstr &MI) const { // cond from __builtin_ct_select(cond, a, b) CondReg = MI.getOperand(6).getReg(); - // Move fp src1 to GPR scratch1 so we can do our bitwise ops - BuildMI(*MBB, MI, DL, get(ARM::VMOVRS), GPRScratch1) - .addReg(Src1Reg) - .add(predOps(ARMCC::AL)) - .setMIFlag(MachineInstr::MIFlag::NoMerge); + // Move fp src1 to GPR scratch1 so we can do our bitwise ops + FirstNewMI = BuildMI(*MBB, MI, DL, get(ARM::VMOVRS), GPRScratch1) + .addReg(Src1Reg) + .add(predOps(ARMCC::AL)) + .setMIFlag(MachineInstr::MIFlag::NoMerge); // Move src2 to scratch2 BuildMI(*MBB, MI, DL, get(ARM::VMOVRS), GPRScratch2) @@ -1738,13 +1751,17 @@ bool ARMBaseInstrInfo::expandCtSelect(MachineInstr &MI) const { // 1. mask = 0 - cond // When cond = 0: mask = 0x00000000. // When cond = 1: mask = 0xFFFFFFFF. - BuildMI(*MBB, MI, DL, get(RsbOp), MaskReg) - .addReg(CondReg) - .addImm(0) - .add(predOps(ARMCC::AL)) - .add(condCodeOp()) - .setMIFlag(MachineInstr::MIFlag::NoMerge); - + auto TmpNewMI = BuildMI(*MBB, MI, DL, get(RsbOp), MaskReg) + .addReg(CondReg) + .addImm(0) + .add(predOps(ARMCC::AL)) + .add(condCodeOp()) + .setMIFlag(MachineInstr::MIFlag::NoMerge); + + // Record the first instruction of the bundle if the float path above has not already set it. + if (!FirstNewMI) + FirstNewMI = TmpNewMI; + // 2. A = src1 & mask BuildMI(*MBB, MI, DL, get(AndOp), DestReg) .addReg(Src1Reg) @@ -1762,7 +1779,7 @@ bool ARMBaseInstrInfo::expandCtSelect(MachineInstr &MI) const { .setMIFlag(MachineInstr::MIFlag::NoMerge); // 4. result = A | B - BuildMI(*MBB, MI, DL, get(OrrOp), DestReg) + auto LastNewMI = BuildMI(*MBB, MI, DL, get(OrrOp), DestReg) .addReg(DestReg) .addReg(MaskReg) .add(predOps(ARMCC::AL)) @@ -1771,11 +1788,17 @@ bool ARMBaseInstrInfo::expandCtSelect(MachineInstr &MI) const { if (IsFloat) { // Return our result from GPR to the correct register type. - BuildMI(*MBB, MI, DL, get(ARM::VMOVSR), DestRegSavedRef) + LastNewMI = BuildMI(*MBB, MI, DL, get(ARM::VMOVSR), DestRegSavedRef) .addReg(DestReg) .add(predOps(ARMCC::AL)) .setMIFlag(MachineInstr::MIFlag::NoMerge); } + + auto BundleStart = FirstNewMI->getIterator(); + auto BundleEnd = LastNewMI->getIterator(); + + // Add instruction bundling + finalizeBundle(*MBB, BundleStart, std::next(BundleEnd)); MI.eraseFromParent(); return true; diff --git a/llvm/lib/Target/ARM/ARMTargetMachine.cpp b/llvm/lib/Target/ARM/ARMTargetMachine.cpp index 86740a92b32c5..18d47d9c68767 100644 --- a/llvm/lib/Target/ARM/ARMTargetMachine.cpp +++ b/llvm/lib/Target/ARM/ARMTargetMachine.cpp @@ -519,7 +519,8 @@ void ARMPassConfig::addPreEmitPass() { // Constant island pass works on unbundled instructions. addPass(createUnpackMachineBundles([](const MachineFunction &MF) { - return MF.getSubtarget<ARMSubtarget>().isThumb2(); + return MF.getSubtarget<ARMSubtarget>().isThumb2() || + MF.getFunction().hasFnAttribute("ct-select"); })); // Don't optimize barriers or block placement at -O0.
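The bundling pattern this patch applies in all three ARM expansions is the same: remember the first and last MachineInstr emitted for the pseudo, then hand finalizeBundle the half-open iterator range so that later machine passes cannot reorder or delete pieces of the constant-time sequence. A condensed C++ sketch of that shape (ours, assuming the usual LLVM includes; bundleExpansion is not an LLVM function):

    #include "llvm/CodeGen/MachineInstrBundle.h"
    #include <iterator>

    // Bundle the instructions emitted in place of the pseudo MI, from
    // FirstNewMI through LastNewMI inclusive, then delete the pseudo.
    static void bundleExpansion(llvm::MachineBasicBlock &MBB,
                                llvm::MachineInstr *FirstNewMI,
                                llvm::MachineInstr *LastNewMI,
                                llvm::MachineInstr &MI) {
      // finalizeBundle takes a half-open [first, last) range, hence the
      // std::next past the last emitted instruction.
      llvm::finalizeBundle(MBB, FirstNewMI->getIterator(),
                           std::next(LastNewMI->getIterator()));
      MI.eraseFromParent();
    }

The unpack side is gated on the "ct-select" function attribute set in SelectionDAGBuilder, so functions that never touch the intrinsic keep the existing unbundling behavior.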
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp index ea779fe66946f..98dc26c28e747 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -657,7 +657,7 @@ bool X86InstrInfo::expandCtSelectVector(MachineInstr &MI) const { X86::CondCode CC = X86::CondCode(MI.getOperand(5).getImm()); // condition // Create scalar mask in tempGPR and broadcast to vector mask - BuildMI(*MBB, MI, DL, get(X86::MOV32ri), TmpGPR) + auto FirstNewMI = BuildMI(*MBB, MI, DL, get(X86::MOV32ri), TmpGPR) .addImm(0) .setMIFlags(MachineInstr::MIFlag::NoMerge); @@ -836,10 +836,8 @@ bool X86InstrInfo::expandCtSelectVector(MachineInstr &MI) const { .setMIFlags(MachineInstr::MIFlag::NoMerge); } - // TODO: Bundle instructions to avoid future optimizations from breaking up - // the instructions sequence. However, bundled instructions disappears after - // unpack-mi-bundles pass. Look into the issue and fix it before enabling the - // instruction bundling. + // Add instruction bundling using the original pseudo as the exclusive end + finalizeBundle(*MBB, FirstNewMI->getIterator(), MI.getIterator()); MI.eraseFromParent(); diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp index 8dd6f3d97ccea..4425b3eebee8e 100644 --- a/llvm/lib/Target/X86/X86TargetMachine.cpp +++ b/llvm/lib/Target/X86/X86TargetMachine.cpp @@ -620,7 +620,8 @@ void X86PassConfig::addPreEmitPass2() { return M->getModuleFlag("kcfi") || (TT.isOSDarwin() && (M->getFunction("objc_retainAutoreleasedReturnValue") || - M->getFunction("objc_unsafeClaimAutoreleasedReturnValue"))); + M->getFunction("objc_unsafeClaimAutoreleasedReturnValue"))) || + F.hasFnAttribute("ct-select"); })); // Analyzes and emits pseudos to support Win x64 Unwind V2. This pass must run From e61e4be578a1c27d1cccd31160d5232b5a287dea Mon Sep 17 00:00:00 2001 From: kumarak Date: Thu, 25 Sep 2025 19:56:48 +0000 Subject: [PATCH 45/63] [CT] record last instruction during bundling for x86 --- .../Target/AArch64/AArch64TargetMachine.cpp | 4 +- llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp | 2 +- llvm/lib/Target/X86/X86InstrInfo.cpp | 173 ++++++++++-------- 3 files changed, 95 insertions(+), 84 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp index e10652a118157..5b80b08375f8c 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -896,9 +896,7 @@ void AArch64PassConfig::addPostBBSections() { void AArch64PassConfig::addPreEmitPass2() { // SVE bundles move prefixes with destructive operations. BLR_RVMARKER pseudo // instructions are lowered to bundles as well.
- addPass(createUnpackMachineBundles([](const MachineFunction &MF) { - return MF.getFunction().hasFnAttribute("ct-select"); - })); + addPass(createUnpackMachineBundles(nullptr)); } bool AArch64PassConfig::addRegAssignAndRewriteOptimized() { diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp index f0f57ade2bbe5..fa10c00526cf7 100644 --- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp +++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -1676,7 +1676,7 @@ bool ARMBaseInstrInfo::expandCtSelectThumb(MachineInstr &MI) const { .add(predOps(ARMCC::AL)) .setMIFlag(MachineInstr::MIFlag::NoMerge); - // Add instruction bundling + // Add instruction bundling auto BundleStart = FirstNewMI->getIterator(); finalizeBundle(*MBB, BundleStart, std::next(LastMI->getIterator())); diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp index 98dc26c28e747..af0eddd576aa1 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -656,61 +656,73 @@ bool X86InstrInfo::expandCtSelectVector(MachineInstr &MI) const { Register TrueVal = MI.getOperand(4).getReg(); // false_value X86::CondCode CC = X86::CondCode(MI.getOperand(5).getImm()); // condition + MachineInstr *FirstInstr = nullptr; + MachineInstr *LastInstr = nullptr; + auto recordInstr = [&](MachineInstrBuilder MIB) { + MachineInstr *NewMI = MIB.getInstr(); + LastInstr = NewMI; + if (!FirstInstr) + FirstInstr = NewMI; + }; + // Create scalar mask in tempGPR and broadcast to vector mask - auto FirstNewMI = BuildMI(*MBB, MI, DL, get(X86::MOV32ri), TmpGPR) - .addImm(0) - .setMIFlags(MachineInstr::MIFlag::NoMerge); + recordInstr(BuildMI(*MBB, MI, DL, get(X86::MOV32ri), TmpGPR) + .addImm(0) + .setMIFlags(MachineInstr::MIFlag::NoMerge)); const TargetRegisterInfo *TRI = &getRegisterInfo(); auto SubReg = TRI->getSubReg(TmpGPR, X86::sub_8bit); - BuildMI(*MBB, MI, DL, get(X86::SETCCr)) - .addReg(SubReg) - .addImm(CC) - .setMIFlags(MachineInstr::MIFlag::NoMerge); + recordInstr(BuildMI(*MBB, MI, DL, get(X86::SETCCr)) + .addReg(SubReg) + .addImm(CC) + .setMIFlags(MachineInstr::MIFlag::NoMerge)); // Zero-extend byte to 32-bit register (movzbl %al, %eax) - BuildMI(*MBB, MI, DL, get(X86::MOVZX32rr8), TmpGPR) - .addReg(SubReg) - .setMIFlags(MachineInstr::MIFlag::NoMerge); + recordInstr(BuildMI(*MBB, MI, DL, get(X86::MOVZX32rr8), TmpGPR) + .addReg(SubReg) + .setMIFlags(MachineInstr::MIFlag::NoMerge)); if (Instruction.UseBlendInstr && Subtarget.hasSSE41()) { // Shift left 31 bits to convert 1 -> 0x80000000, 0 -> 0x00000000 (shll $31, // %eax) - BuildMI(*MBB, MI, DL, get(X86::SHL32ri), TmpGPR).addReg(TmpGPR).addImm(31); + recordInstr(BuildMI(*MBB, MI, DL, get(X86::SHL32ri), TmpGPR) + .addReg(TmpGPR) + .addImm(31)); } else { // Negate to convert 1 -> 0xFFFFFFFF, 0 -> 0x00000000 (negl %eax) - BuildMI(*MBB, MI, DL, get(X86::NEG32r), TmpGPR).addReg(TmpGPR); + recordInstr(BuildMI(*MBB, MI, DL, get(X86::NEG32r), TmpGPR) + .addReg(TmpGPR)); } // Broadcast to TmpX (vector mask) - BuildMI(*MBB, MI, DL, get(X86::PXORrr), MaskReg) - .addReg(MaskReg) - .addReg(MaskReg) - .setMIFlags(MachineInstr::MIFlag::NoMerge); + recordInstr(BuildMI(*MBB, MI, DL, get(X86::PXORrr), MaskReg) + .addReg(MaskReg) + .addReg(MaskReg) + .setMIFlags(MachineInstr::MIFlag::NoMerge)); // Move scalar mask to vector register - BuildMI(*MBB, MI, DL, get(Instruction.IntMoveOpc), MaskReg) - .addReg(TmpGPR) - .setMIFlags(MachineInstr::MIFlag::NoMerge); + recordInstr(BuildMI(*MBB, MI, DL, get(Instruction.IntMoveOpc), 
MaskReg) .addReg(TmpGPR) .setMIFlags(MachineInstr::MIFlag::NoMerge)); if (Instruction.Use256) { // Broadcast to 256-bit vector register - BuildMI(*MBB, MI, DL, get(Instruction.BroadcastOpc), MaskReg) - .addReg(MaskReg) - .addImm(0) - .setMIFlags(MachineInstr::MIFlag::NoMerge); + recordInstr(BuildMI(*MBB, MI, DL, get(Instruction.BroadcastOpc), MaskReg) + .addReg(MaskReg) + .addImm(0) + .setMIFlags(MachineInstr::MIFlag::NoMerge)); } else { if (Subtarget.hasSSE2() || Subtarget.hasAVX()) { - BuildMI(*MBB, MI, DL, get(Instruction.BroadcastOpc), MaskReg) - .addReg(MaskReg) - .addImm(0x00) - .setMIFlags(MachineInstr::MIFlag::NoMerge); + recordInstr(BuildMI(*MBB, MI, DL, get(Instruction.BroadcastOpc), MaskReg) + .addReg(MaskReg) + .addImm(0x00) + .setMIFlags(MachineInstr::MIFlag::NoMerge)); } else { - BuildMI(*MBB, MI, DL, get(Instruction.BroadcastOpc), MaskReg) - .addReg(MaskReg) - .addReg(MaskReg) - .addImm(0x00) - .setMIFlags(MachineInstr::MIFlag::NoMerge); + recordInstr(BuildMI(*MBB, MI, DL, get(Instruction.BroadcastOpc), MaskReg) + .addReg(MaskReg) + .addReg(MaskReg) + .addImm(0x00) + .setMIFlags(MachineInstr::MIFlag::NoMerge)); } } @@ -739,9 +751,9 @@ bool X86InstrInfo::expandCtSelectVector(MachineInstr &MI) const { // if XMM0 is one of the source registers, it will not match with Dst // registers, so we need to move it to Dst register - BuildMI(*MBB, MI, DL, get(Instruction.MoveOpc), Dst) - .addReg(SrcXMM0) - .setMIFlags(MachineInstr::MIFlag::NoMerge); + recordInstr(BuildMI(*MBB, MI, DL, get(Instruction.MoveOpc), Dst) + .addReg(SrcXMM0) + .setMIFlags(MachineInstr::MIFlag::NoMerge)); // update FalseVal and TrueVal to Dst register if (FalseVal == X86::XMM0) @@ -759,9 +771,9 @@ bool X86InstrInfo::expandCtSelectVector(MachineInstr &MI) const { // if XMM0 is not allocated for any of the registers, we still need to save // and restore it after using as mask register - BuildMI(*MBB, MI, DL, get(Instruction.MoveOpc), Dst) - .addReg(X86::XMM0) - .setMIFlags(MachineInstr::MIFlag::NoMerge); + recordInstr(BuildMI(*MBB, MI, DL, get(Instruction.MoveOpc), Dst) + .addReg(X86::XMM0) + .setMIFlags(MachineInstr::MIFlag::NoMerge)); SavedXMM0 = Dst; DidSaveXMM0 = true; } @@ -769,75 +781,76 @@ bool X86InstrInfo::expandCtSelectVector(MachineInstr &MI) const { if (MaskReg != X86::XMM0) { // BLENDV uses XMM0 as implicit mask register // https://www.felixcloutier.com/x86/pblendvb - BuildMI(*MBB, MI, DL, get(Instruction.MoveOpc), X86::XMM0) - .addReg(MaskReg) - .setMIFlag(MachineInstr::MIFlag::NoMerge); + recordInstr(BuildMI(*MBB, MI, DL, get(Instruction.MoveOpc), X86::XMM0) + .addReg(MaskReg) + .setMIFlag(MachineInstr::MIFlag::NoMerge)); // move FalseVal to mask (use MaskReg as the dst of the blend) - BuildMI(*MBB, MI, DL, get(X86::MOVAPSrr), MaskReg) - .addReg(FalseVal) - .setMIFlags(MachineInstr::MIFlag::NoMerge); + recordInstr(BuildMI(*MBB, MI, DL, get(X86::MOVAPSrr), MaskReg) + .addReg(FalseVal) + .setMIFlags(MachineInstr::MIFlag::NoMerge)); // MaskReg := blend(MaskReg /*false*/, TrueVal /*true*/) ; mask in // xmm0 - BuildMI(*MBB, MI, DL, get(BlendOpc), MaskReg) - .addReg(MaskReg) - .addReg(TrueVal) - .addReg(X86::XMM0) - .setMIFlags(MachineInstr::MIFlag::NoMerge); + recordInstr(BuildMI(*MBB, MI, DL, get(BlendOpc), MaskReg) + .addReg(MaskReg) + .addReg(TrueVal) + .addReg(X86::XMM0) + .setMIFlags(MachineInstr::MIFlag::NoMerge)); // restore XMM0 from SavedXMM0 if we saved it into Dst if (DidSaveXMM0) { - BuildMI(*MBB, MI, DL, get(Instruction.MoveOpc), X86::XMM0) - .addReg(SavedXMM0) -
.setMIFlags(MachineInstr::MIFlag::NoMerge); + recordInstr(BuildMI(*MBB, MI, DL, get(Instruction.MoveOpc), X86::XMM0) + .addReg(SavedXMM0) + .setMIFlags(MachineInstr::MIFlag::NoMerge)); } // dst = result (now in MaskReg) - BuildMI(*MBB, MI, DL, get(Instruction.MoveOpc), Dst) - .addReg(MaskReg) - .setMIFlags(MachineInstr::MIFlag::NoMerge); + recordInstr(BuildMI(*MBB, MI, DL, get(Instruction.MoveOpc), Dst) + .addReg(MaskReg) + .setMIFlags(MachineInstr::MIFlag::NoMerge)); } else { // move FalseVal to Dst register since MaskReg is XMM0 and Dst is not - BuildMI(*MBB, MI, DL, get(Instruction.MoveOpc), Dst) - .addReg(FalseVal) - .setMIFlags(MachineInstr::MIFlag::NoMerge); + recordInstr(BuildMI(*MBB, MI, DL, get(Instruction.MoveOpc), Dst) + .addReg(FalseVal) + .setMIFlags(MachineInstr::MIFlag::NoMerge)); // Dst := blend(Dst /*false*/, TrueVal /*true*/) ; mask in // xmm0 - BuildMI(*MBB, MI, DL, get(BlendOpc), Dst) - .addReg(Dst) - .addReg(TrueVal) - .addReg(X86::XMM0) - .setMIFlags(MachineInstr::MIFlag::NoMerge); + recordInstr(BuildMI(*MBB, MI, DL, get(BlendOpc), Dst) + .addReg(Dst) + .addReg(TrueVal) + .addReg(X86::XMM0) + .setMIFlags(MachineInstr::MIFlag::NoMerge)); } } else { // dst = mask - BuildMI(*MBB, MI, DL, get(Instruction.MoveOpc), Dst) - .addReg(MaskReg) - .setMIFlags(MachineInstr::MIFlag::NoMerge); + recordInstr(BuildMI(*MBB, MI, DL, get(Instruction.MoveOpc), Dst) + .addReg(MaskReg) + .setMIFlags(MachineInstr::MIFlag::NoMerge)); // mask &= true_val - BuildMI(*MBB, MI, DL, get(X86::PANDrr), MaskReg) - .addReg(MaskReg) - .addReg(TrueVal) - .setMIFlags(MachineInstr::MIFlag::NoMerge); + recordInstr(BuildMI(*MBB, MI, DL, get(X86::PANDrr), MaskReg) + .addReg(MaskReg) + .addReg(TrueVal) + .setMIFlags(MachineInstr::MIFlag::NoMerge)); // dst = ~mask & false_val - BuildMI(*MBB, MI, DL, get(X86::PANDNrr), Dst) - .addReg(Dst) - .addReg(FalseVal) - .setMIFlags(MachineInstr::MIFlag::NoMerge); + recordInstr(BuildMI(*MBB, MI, DL, get(X86::PANDNrr), Dst) + .addReg(Dst) + .addReg(FalseVal) + .setMIFlags(MachineInstr::MIFlag::NoMerge)); // dst |= mask; (mask & t) | (~mask & f) - BuildMI(*MBB, MI, DL, get(X86::PORrr), Dst) - .addReg(Dst) - .addReg(MaskReg) - .setMIFlags(MachineInstr::MIFlag::NoMerge); + recordInstr(BuildMI(*MBB, MI, DL, get(X86::PORrr), Dst) + .addReg(Dst) + .addReg(MaskReg) + .setMIFlags(MachineInstr::MIFlag::NoMerge)); } - // Add instruction bundling using the original pseudo as the exclusive end - finalizeBundle(*MBB, FirstNewMI->getIterator(), MI.getIterator()); + assert(FirstInstr && LastInstr && "Expected at least one expanded instruction"); + auto BundleEnd = LastInstr->getIterator(); + finalizeBundle(*MBB, FirstInstr->getIterator(), std::next(BundleEnd)); MI.eraseFromParent(); From a9b12f0f962ca9fc17cd48d6278cee45d7157eff Mon Sep 17 00:00:00 2001 From: Henrik Brodin <90325907+hbrodin@users.noreply.github.com> Date: Mon, 29 Sep 2025 14:33:10 +0200 Subject: [PATCH 46/63] [CT] Integer constant-time selection without CMOV on i386 Implement constant time logic for integers on non-CMOV capable targets. 
- Only promote i8 to i32 when CMOV available, use native 8-bit otherwise - Add post-RA expansion for CTSELECT_I386_GR* pseudos with temp registers --- llvm/lib/Target/X86/X86ISelLowering.cpp | 176 ++++----- llvm/lib/Target/X86/X86ISelLowering.h | 3 - llvm/lib/Target/X86/X86InstrCMovSetCC.td | 6 +- llvm/lib/Target/X86/X86InstrCompiler.td | 67 +++- llvm/lib/Target/X86/X86InstrInfo.cpp | 118 ++++++ llvm/lib/Target/X86/X86InstrInfo.h | 1 + llvm/lib/Target/X86/X86InstrPredicates.td | 5 + llvm/test/CodeGen/X86/ctselect.ll | 432 ++++++++++++++++++++++ 8 files changed, 687 insertions(+), 121 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 91b5ed5c3f8b6..34a37253258b1 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -25504,7 +25504,8 @@ SDValue X86TargetLowering::LowerCTSELECT(SDValue Op, SelectionDAG &DAG) const { } // Promote small integer types to avoid partial register stalls - if ((Op.getValueType() == MVT::i8) || + // Exception: For i8 without CMOV, prefer native 8-bit constant-time operations + if ((Op.getValueType() == MVT::i8 && Subtarget.canUseCMOV()) || (Op.getValueType() == MVT::i16 && !X86::mayFoldLoad(TrueOp, Subtarget) && !X86::mayFoldLoad(FalseOp, Subtarget))) { TrueOp = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, TrueOp); @@ -37983,122 +37984,69 @@ X86TargetLowering::emitPatchableEventCall(MachineInstr &MI, return BB; } -MachineBasicBlock * -X86TargetLowering::EmitLoweredCtSelect(MachineInstr &MI, - MachineBasicBlock *ThisMBB) const { - const TargetInstrInfo *TII = Subtarget.getInstrInfo(); +/// Helper function to emit i386 CTSELECT with condition materialization. +/// This converts EFLAGS-based CTSELECT into a condition byte that can be +/// shared across multiple operations (critical for i64 type legalization). +/// +/// Phase 1: Materialize condition byte from EFLAGS using SETCC +/// Phase 2: Create internal pseudo with condition byte for post-RA expansion +/// +/// This approach ensures that when i64 is type-legalized into two i32 +/// operations, both operations share the same condition byte rather than +/// each independently reading (and destroying) EFLAGS. 
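Before the helper itself, the intent of the two phases is easiest to see in a plain C++ model (ours, not the emitted code): the condition byte is produced once, the way SETCC produces it, and both i32 halves of a type-legalized i64 select derive their masks from that byte without ever re-reading EFLAGS:

    #include <cstdint>

    static uint64_t ctSelectI64Model(bool cond, uint64_t a, uint64_t b) {
      uint8_t condByte = uint8_t(cond);        // Phase 1: one SETCC-style byte
      uint32_t mask = 0u - uint32_t(condByte); // MOVZX + NEG: 1 -> all ones
      uint32_t lo = (uint32_t(a) & mask) | (uint32_t(b) & ~mask);
      uint32_t hi = (uint32_t(a >> 32) & mask) |
                    (uint32_t(b >> 32) & ~mask); // same byte, no flags reread
      return (uint64_t(hi) << 32) | lo;
    }

In the real lowering each internal pseudo recomputes its mask from the shared byte; NEG clobbers EFLAGS, which is exactly why the byte, and not the flags, is the value that crosses pseudo boundaries.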
+static MachineBasicBlock * +emitCTSelectI386WithConditionMaterialization(MachineInstr &MI, + MachineBasicBlock *BB, + unsigned InternalPseudoOpcode) { + const TargetInstrInfo *TII = BB->getParent()->getSubtarget().getInstrInfo(); const MIMetadata MIMD(MI); - MachineRegisterInfo &MRI = ThisMBB->getParent()->getRegInfo(); - DebugLoc DL = MI.getDebugLoc(); + MachineFunction *MF = BB->getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); - // Get operands - Register DstReg = MI.getOperand(0).getReg(); - Register TrueReg = MI.getOperand(1).getReg(); - Register FalseReg = MI.getOperand(2).getReg(); + // Original pseudo operands: (outs dst), (ins src1, src2, cond) + Register Src1Reg = MI.getOperand(1).getReg(); + Register Src2Reg = MI.getOperand(2).getReg(); + X86::CondCode CC = static_cast<X86::CondCode>(MI.getOperand(3).getImm()); - X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm()); + // Get opposite condition (SETCC sets to 1 when condition is TRUE, + // but we want to select src1 when condition is FALSE for X86 semantics) X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC); - const TargetRegisterClass *RC = MRI.getRegClass(DstReg); - unsigned SETCCOp, MOVZXOp, NEGOp, ANDOp, XOROp, OROp; - const TargetRegisterClass *condRC; - - if (RC == &X86::GR8RegClass) { - SETCCOp = X86::SETCCr; - MOVZXOp = 0; // No extension needed for 8-bit - NEGOp = X86::NEG8r; - ANDOp = X86::AND8rr; - XOROp = X86::XOR8ri; - OROp = X86::OR8rr; - condRC = &X86::GR8RegClass; - } else if (RC == &X86::GR16RegClass) { - SETCCOp = X86::SETCCr; - MOVZXOp = X86::MOVZX16rr8; - NEGOp = X86::NEG16r; - ANDOp = X86::AND16rr; - XOROp = X86::XOR16ri; - OROp = X86::OR16rr; - condRC = &X86::GR16RegClass; - } else if (RC == &X86::GR32RegClass) { - SETCCOp = X86::SETCCr; - MOVZXOp = X86::MOVZX32rr8; - NEGOp = X86::NEG32r; - ANDOp = X86::AND32rr; - XOROp = X86::XOR32ri; - OROp = X86::OR32rr; - condRC = &X86::GR32RegClass; + // Step 1: Materialize condition byte from EFLAGS + // This is done OUTSIDE the constant-time bundle, before any EFLAGS corruption + Register CondByteReg = MRI.createVirtualRegister(&X86::GR8RegClass); + BuildMI(*BB, MI, MIMD, TII->get(X86::SETCCr), CondByteReg).addImm(OppCC); + + // Step 2: Create internal pseudo that takes condition byte as input + // This pseudo will be expanded post-RA into the actual constant-time bundle + // The condition byte can now be safely shared between multiple pseudos + + // Internal pseudo has operands: (outs dst, tmp_byte, tmp_mask), (ins src1, src2, cond_byte) + Register DstReg = MI.getOperand(0).getReg(); + + // Create virtual registers for the temporary outputs + Register TmpByteReg = MRI.createVirtualRegister(&X86::GR8RegClass); + Register TmpMaskReg; + + // Determine the register class for tmp_mask based on the data type + if (InternalPseudoOpcode == X86::CTSELECT_I386_INT_GR8rr || + InternalPseudoOpcode == X86::CTSELECT_I386_INT_GR16rr || + InternalPseudoOpcode == X86::CTSELECT_I386_INT_GR32rr) { + TmpMaskReg = MRI.createVirtualRegister(&X86::GR32RegClass); } else { - llvm_unreachable("Unsupported register class for conditional select"); + llvm_unreachable("Unknown internal pseudo opcode"); } - auto BundleStart = MI.getIterator(); + BuildMI(*BB, MI, MIMD, TII->get(InternalPseudoOpcode)) + .addDef(DstReg) // dst (output) + .addDef(TmpByteReg) // tmp_byte (output) + .addDef(TmpMaskReg) // tmp_mask (output) + .addReg(Src1Reg) // src1 (input) + .addReg(Src2Reg) // src2 (input) + .addReg(CondByteReg); // pre-materialized condition byte (input) - // Step 1: Create condition value 
using SETCC instruction - Register CondByteReg = MRI.createVirtualRegister(&X86::GR8RegClass); - BuildMI(*ThisMBB, MI, MIMD, TII->get(SETCCOp), CondByteReg) - .addImm(OppCC) - .setMIFlag(MachineInstr::MIFlag::NoMerge); - - Register CondReg; - if (RC == &X86::GR8RegClass) { - // For 8-bit, use the byte register directly - CondReg = CondByteReg; - } else { - // For 16/32-bit, zero-extend the byte to the target size - CondReg = MRI.createVirtualRegister(condRC); - BuildMI(*ThisMBB, MI, MIMD, TII->get(MOVZXOp), CondReg) - .addReg(CondByteReg) - .setMIFlag(MachineInstr::MIFlag::NoMerge); - } - - // Step 2: Convert condition to mask (1 -> 0xFFFF..., 0 -> 0x0000...) - // Use NEG to create all-ones mask when condition is true - Register MaskReg = MRI.createVirtualRegister(condRC); - BuildMI(*ThisMBB, MI, MIMD, TII->get(NEGOp), MaskReg) - .addReg(CondReg) - .setMIFlag(MachineInstr::MIFlag::NoMerge); - - // Step 3: Implement conditional select using bitwise operations - // Result = (TrueReg & Mask) | (FalseReg & ~Mask) - - // Create inverted mask (~Mask) - Register InvMaskReg = MRI.createVirtualRegister(condRC); - BuildMI(*ThisMBB, MI, MIMD, TII->get(XOROp), InvMaskReg) - .addReg(MaskReg) - .addImm(-1) - .setMIFlag(MachineInstr::MIFlag::NoMerge); // XOR with all 1s to invert - - // Compute TrueReg & Mask - Register TrueMaskedReg = MRI.createVirtualRegister(condRC); - BuildMI(*ThisMBB, MI, MIMD, TII->get(ANDOp), TrueMaskedReg) - .addReg(TrueReg) - .addReg(MaskReg) - .setMIFlag(MachineInstr::MIFlag::NoMerge); - - // Compute FalseReg & ~Mask - Register FalseMaskedReg = MRI.createVirtualRegister(condRC); - BuildMI(*ThisMBB, MI, MIMD, TII->get(ANDOp), FalseMaskedReg) - .addReg(FalseReg) - .addReg(InvMaskReg) - .setMIFlag(MachineInstr::MIFlag::NoMerge); - - // Final result: (TrueReg & Mask) | (FalseReg & ~Mask) - BuildMI(*ThisMBB, MI, MIMD, TII->get(OROp), DstReg) - .addReg(TrueMaskedReg) - .addReg(FalseMaskedReg) - .setMIFlag(MachineInstr::MIFlag::NoMerge); - - // Remove the original instruction MI.eraseFromParent(); - - auto BundleEnd = MI.getIterator(); - if (BundleStart != BundleEnd) { - // Only bundle if we have multiple instructions - MachineInstr *BundleHeader = - BuildMI(*ThisMBB, BundleStart, DL, TII->get(TargetOpcode::BUNDLE)); - finalizeBundle(*ThisMBB, BundleHeader->getIterator(), std::next(BundleEnd)); - } - return ThisMBB; + return BB; } MachineBasicBlock * @@ -38162,9 +38110,17 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, case X86::CMOV_VK64: return EmitLoweredSelect(MI, BB); - case X86::CTSELECT_GR16rr: - case X86::CTSELECT_GR32rr: - return EmitLoweredCtSelect(MI, BB); + case X86::CTSELECT_I386_GR8rr: + return emitCTSelectI386WithConditionMaterialization( + MI, BB, X86::CTSELECT_I386_INT_GR8rr); + + case X86::CTSELECT_I386_GR16rr: + return emitCTSelectI386WithConditionMaterialization( + MI, BB, X86::CTSELECT_I386_INT_GR16rr); + + case X86::CTSELECT_I386_GR32rr: + return emitCTSelectI386WithConditionMaterialization( + MI, BB, X86::CTSELECT_I386_INT_GR32rr); case X86::CTSELECT_FP32rr: case X86::CTSELECT_FP64rr: diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h index 7cd63a1d77c7d..f79eec03de23c 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -1872,9 +1872,6 @@ namespace llvm { MachineBasicBlock *EmitLoweredSelect(MachineInstr &I, MachineBasicBlock *BB) const; - MachineBasicBlock *EmitLoweredCtSelect(MachineInstr &MI, - MachineBasicBlock *BB) const; - MachineBasicBlock 
*EmitLoweredCatchRet(MachineInstr &MI, MachineBasicBlock *BB) const; diff --git a/llvm/lib/Target/X86/X86InstrCMovSetCC.td b/llvm/lib/Target/X86/X86InstrCMovSetCC.td index ede8d80f0b897..ecc3f11a478bf 100644 --- a/llvm/lib/Target/X86/X86InstrCMovSetCC.td +++ b/llvm/lib/Target/X86/X86InstrCMovSetCC.td @@ -114,7 +114,7 @@ let Uses = [EFLAGS], isNotDuplicable = 1, isPseudo = 1 in { multiclass CTSELECT { // register-only - let isCommutable = 0, SchedRW = [WriteCMOV], + let isCommutable = 0, SchedRW = [WriteCMOV], Predicates = [HasCMOV], AsmString = "ctselect\\t$dst, $src1, $src2, $cond" in { def rr : PseudoI<(outs t.RegClass:$dst), (ins t.RegClass:$src1, t.RegClass:$src2, i8imm:$cond), @@ -122,7 +122,7 @@ let Uses = [EFLAGS], isNotDuplicable = 1, isPseudo = 1 in { } // register-memory - let SchedRW = [WriteCMOV.Folded, WriteCMOV.ReadAfterFold], + let SchedRW = [WriteCMOV.Folded, WriteCMOV.ReadAfterFold], Predicates = [HasCMOV], AsmString = "ctselect\\t$dst, $src1, $src2, $cond" in { def rm : PseudoI<(outs t.RegClass:$dst), (ins t.RegClass:$src1, t.MemOperand:$src2, i8imm:$cond), @@ -132,7 +132,7 @@ let Uses = [EFLAGS], isNotDuplicable = 1, isPseudo = 1 in { } let isCodeGenOnly = 1, hasSideEffects = 1, ForceDisassemble = 1 in { - let Predicates = [HasCMOV], Constraints = "$dst = $src1" in { + let Constraints = "$dst = $src1" in { defm CTSELECT16 : CTSELECT; defm CTSELECT32 : CTSELECT; defm CTSELECT64 : CTSELECT; diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td b/llvm/lib/Target/X86/X86InstrCompiler.td index 2806e1a174abc..d5b30b50d0e57 100644 --- a/llvm/lib/Target/X86/X86InstrCompiler.td +++ b/llvm/lib/Target/X86/X86InstrCompiler.td @@ -694,8 +694,34 @@ def : Pat<(v8f64 (X86cmov VR512:$t, VR512:$f, timm:$cond, EFLAGS)), (CMOV_VR512 VR512:$t, VR512:$f, timm:$cond)>; // CTSELECT +// Enhanced CTSELECT pseudos for i386 with temporary register allocation +// These use a two-phase approach: +// 1. Custom inserter materializes condition byte from EFLAGS +// 2. 
Post-RA expansion generates constant-time instruction bundles + +let isPseudo = 1, isNotDuplicable = 1 in { + // Phase 1: Initial pseudos that consume EFLAGS (via custom inserter) + // These are matched by patterns and convert EFLAGS to condition byte + multiclass CTSELECT_I386_INITIAL { + let Uses = [EFLAGS], Defs = [EFLAGS], usesCustomInserter = 1, hasNoSchedulingInfo = 1 in { + def rr : PseudoI<(outs RC:$dst), + (ins RC:$src1, RC:$src2, i8imm:$cond), []>; + } + } + + // Phase 2: Internal pseudos with pre-materialized condition byte (post-RA expansion) + // These generate the actual constant-time instruction bundles + multiclass CTSELECT_I386_INTERNAL { + let hasNoSchedulingInfo = 1 in { + def rr : PseudoI<(outs RC:$dst, ByteRC:$tmp_byte, RC:$tmp_mask), + (ins RC:$src1, RC:$src2, ByteRC:$cond_byte), []> { + let Constraints = + "@earlyclobber $dst,@earlyclobber $tmp_byte,@earlyclobber " + "$tmp_mask"; + } + } + } -let Uses = [EFLAGS], isNotDuplicable = 1, isPseudo = 1 in { multiclass CTSELECT_NOCMOV { let hasNoSchedulingInfo = 1 in { def rr : PseudoI<(outs RC:$dst), @@ -710,15 +736,29 @@ let Uses = [EFLAGS], isNotDuplicable = 1, isPseudo = 1 in { } } +// Phase 1 pseudos for non-CMOV targets (custom inserter materializes condition) +let isCodeGenOnly = 1, hasSideEffects = 1, ForceDisassemble = 1 in { + let Predicates = [NoNativeCMOV] in { + defm CTSELECT_I386_GR8 : CTSELECT_I386_INITIAL; + defm CTSELECT_I386_GR16 : CTSELECT_I386_INITIAL; + defm CTSELECT_I386_GR32 : CTSELECT_I386_INITIAL; + } +} + +// Phase 2 pseudos (post-RA expansion with pre-materialized condition byte) +let isCodeGenOnly = 1, hasSideEffects = 1, ForceDisassemble = 1 in { + let Predicates = [NoNativeCMOV] in { + defm CTSELECT_I386_INT_GR8 : CTSELECT_I386_INTERNAL; + defm CTSELECT_I386_INT_GR16 : CTSELECT_I386_INTERNAL; + defm CTSELECT_I386_INT_GR32 : CTSELECT_I386_INTERNAL; + } +} + let usesCustomInserter = 1, isCodeGenOnly = 1, hasSideEffects = 1, ForceDisassemble = 1, Constraints = "$dst = $src1" in { - let Predicates = [NoCMOV] in { - defm CTSELECT_GR16 : CTSELECT_NOCMOV; - defm CTSELECT_GR32 : CTSELECT_NOCMOV; - } let Predicates = [FPStackf32] in defm CTSELECT_FP32 : CTSELECT_NOCMOV; @@ -732,6 +772,23 @@ let usesCustomInserter = 1, defm CTSELECT_VR64 : CTSELECT_NOCMOV; } + +// Pattern matching for non-native-CMOV CTSELECT (routes to custom inserter for condition materialization) +// NoNativeCMOV ensures these patterns are used when actual CMOV instruction is not available +// even if canUseCMOV() is true (e.g., i386 with SSE which can emulate CMOV) +let Predicates = [NoNativeCMOV] in { + def : Pat<(i8(X86ctselect GR8:$src1, GR8:$src2, timm:$cond, EFLAGS)), + (CTSELECT_I386_GR8rr GR8:$src1, GR8:$src2, timm:$cond)>; + + def : Pat<(i16(X86ctselect GR16:$src1, GR16:$src2, timm:$cond, EFLAGS)), + (CTSELECT_I386_GR16rr GR16:$src1, GR16:$src2, timm:$cond)>; + + def : Pat<(i32(X86ctselect GR32:$src1, GR32:$src2, timm:$cond, EFLAGS)), + (CTSELECT_I386_GR32rr GR32:$src1, GR32:$src2, timm:$cond)>; + + // i64 patterns handled automatically by type legalization +} + //===----------------------------------------------------------------------===// // Normal-Instructions-With-Lock-Prefix Pseudo Instructions //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp index af0eddd576aa1..58efc32d5da34 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -914,6 +914,117 @@ bool 
X86InstrInfo::expandCtSelectWithCMOV(MachineInstr &MI) const { return true; } +/// Expand i386-specific CTSELECT pseudo instructions (post-RA, constant-time) +/// These internal pseudos receive a pre-materialized condition byte from the +/// custom inserter, avoiding EFLAGS corruption issues during i64 type legalization. +bool X86InstrInfo::expandCtSelectIntWithoutCMOV(MachineInstr &MI) const { + MachineBasicBlock *MBB = MI.getParent(); + DebugLoc DL = MI.getDebugLoc(); + + // CTSELECT_I386_INT_GRxxrr has operands: (outs dst, tmp_byte, tmp_mask), + // (ins src1, src2, cond_byte) + // Note: cond_byte is pre-materialized by custom inserter, not EFLAGS-dependent + Register DstReg = MI.getOperand(0).getReg(); + Register TmpByteReg = MI.getOperand(1).getReg(); + Register TmpMaskReg = MI.getOperand(2).getReg(); + Register Src1Reg = MI.getOperand(3).getReg(); + Register Src2Reg = MI.getOperand(4).getReg(); + Register CondByteReg = MI.getOperand(5).getReg(); // Pre-materialized condition byte + + // Determine instruction opcodes based on register width + unsigned MovZXOp, NegOp, MovOp, AndOp, NotOp, OrOp; + if (MI.getOpcode() == X86::CTSELECT_I386_INT_GR8rr) { + MovZXOp = 0; // No zero-extend needed for GR8 + NegOp = X86::NEG8r; + MovOp = X86::MOV8rr; + AndOp = X86::AND8rr; + NotOp = X86::NOT8r; + OrOp = X86::OR8rr; + } else if (MI.getOpcode() == X86::CTSELECT_I386_INT_GR16rr) { + MovZXOp = X86::MOVZX16rr8; + NegOp = X86::NEG16r; + MovOp = X86::MOV16rr; + AndOp = X86::AND16rr; + NotOp = X86::NOT16r; + OrOp = X86::OR16rr; + } else { // X86::CTSELECT_I386_INT_GR32rr + MovZXOp = X86::MOVZX32rr8; + NegOp = X86::NEG32r; + MovOp = X86::MOV32rr; + AndOp = X86::AND32rr; + NotOp = X86::NOT32r; + OrOp = X86::OR32rr; + } + + // 7-instruction constant-time selection bundle (no SETCC inside): + // result = (true_val & mask) | (false_val & ~mask) + // The condition byte is already materialized, avoiding EFLAGS dependency + + // Step 1: Copy pre-materialized condition byte to TmpByteReg + // This allows the bundle to work with allocated temporaries + auto I1 = BuildMI(*MBB, MI, DL, get(X86::MOV8rr), TmpByteReg) + .addReg(CondByteReg) + .setMIFlag(MachineInstr::MIFlag::NoMerge); + auto BundleStart = I1->getIterator(); + + // Step 2: Zero-extend condition byte to register width (0 or 1) + if (MI.getOpcode() != X86::CTSELECT_I386_INT_GR8rr) { + BuildMI(*MBB, MI, DL, get(MovZXOp), TmpMaskReg) + .addReg(TmpByteReg) + .setMIFlag(MachineInstr::MIFlag::NoMerge); + } + + // Step 3: Convert condition to bitmask (NEG: 1 -> 0xFFFF..., 0 -> 0x0000...) + Register MaskReg = (MI.getOpcode() == X86::CTSELECT_I386_INT_GR8rr) ? 
TmpByteReg : TmpMaskReg; + BuildMI(*MBB, MI, DL, get(NegOp), MaskReg) + .addReg(MaskReg) + .setMIFlag(MachineInstr::MIFlag::NoMerge); + + // Step 4,5: Apply mask to true value - copy src1 to dest, then AND with mask + BuildMI(*MBB, MI, DL, get(MovOp), DstReg) + .addReg(Src1Reg) + .setMIFlag(MachineInstr::MIFlag::NoMerge); + + BuildMI(*MBB, MI, DL, get(AndOp), DstReg) + .addReg(DstReg) + .addReg(MaskReg) + .setMIFlag(MachineInstr::MIFlag::NoMerge); + + // Step 6: Create inverted mask inline (~mask) + BuildMI(*MBB, MI, DL, get(NotOp), MaskReg) + .addReg(MaskReg) + .setMIFlag(MachineInstr::MIFlag::NoMerge); + + // Step 7: Apply inverted mask to false value - reuse mask register directly + BuildMI(*MBB, MI, DL, get(AndOp), MaskReg) + .addReg(MaskReg) + .addReg(Src2Reg) + .setMIFlag(MachineInstr::MIFlag::NoMerge); + + // Step 8: Final result: (src1 & mask) | (src2 & ~mask) + auto LI = BuildMI(*MBB, MI, DL, get(OrOp), DstReg) + .addReg(DstReg) + .addReg(MaskReg) + .setMIFlag(MachineInstr::MIFlag::NoMerge); + + // Bundle all generated instructions for atomic execution before removing MI + auto BundleEnd = std::next(LI->getIterator()); + if (BundleStart != BundleEnd) { + // Only bundle if we have multiple instructions + finalizeBundle(*MBB, BundleStart, BundleEnd); + } + + // TODO: Optimization opportunity - The register allocator may choose callee-saved + // registers (e.g., %ebx, %esi) for TmpByteReg/TmpMaskReg, causing unnecessary + // save/restore overhead. Consider constraining these to caller-saved register + // classes (e.g., GR8_AL, GR32_CallSaved) in the TableGen definitions to improve + // constant-time performance by eliminating prologue/epilogue instructions. + + // Remove the original pseudo instruction + MI.eraseFromParent(); + return true; +} + static bool isFrameLoadOpcode(int Opcode, TypeSize &MemBytes) { switch (Opcode) { default: @@ -6858,6 +6969,13 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { case X86::CTSELECT16rm: return expandCtSelectWithCMOV(MI); + // non-cmov CTSELECT expansion (post-RA, constant-time) + // These are the internal pseudos with pre-materialized condition byte + case X86::CTSELECT_I386_INT_GR8rr: + case X86::CTSELECT_I386_INT_GR16rr: + case X86::CTSELECT_I386_INT_GR32rr: + return expandCtSelectIntWithoutCMOV(MI); + case X86::CTSELECT_V2F64: case X86::CTSELECT_V4F32: case X86::CTSELECT_V2I64: diff --git a/llvm/lib/Target/X86/X86InstrInfo.h b/llvm/lib/Target/X86/X86InstrInfo.h index a8d61cfd579ad..ebd7e070d5fe8 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.h +++ b/llvm/lib/Target/X86/X86InstrInfo.h @@ -726,6 +726,7 @@ class X86InstrInfo final : public X86GenInstrInfo { /// Expand the CTSELECT pseudo-instructions. 
bool expandCtSelectWithCMOV(MachineInstr &MI) const; + bool expandCtSelectIntWithoutCMOV(MachineInstr &MI) const; bool expandCtSelectVector(MachineInstr &MI) const; diff --git a/llvm/lib/Target/X86/X86InstrPredicates.td b/llvm/lib/Target/X86/X86InstrPredicates.td index c20bb05018b4d..e59cd4e1ddf7b 100644 --- a/llvm/lib/Target/X86/X86InstrPredicates.td +++ b/llvm/lib/Target/X86/X86InstrPredicates.td @@ -49,6 +49,11 @@ def HasZU : Predicate<"Subtarget->hasZU()">; def HasCF : Predicate<"Subtarget->hasCF()">; def HasCMOV : Predicate<"Subtarget->canUseCMOV()">; def NoCMOV : Predicate<"!Subtarget->canUseCMOV()">; +// Predicate for constant-time selection without native CMOV instruction +// Unlike NoCMOV, this checks hasCMOV() directly, not canUseCMOV() +// This ensures i386 with SSE (canUseCMOV=true but hasCMOV=false) uses +// the constant-time i386-specific implementation instead of CMOV emulation +def NoNativeCMOV : Predicate<"!Subtarget->hasCMOV()">; def HasNOPL : Predicate<"Subtarget->hasNOPL()">; def HasMMX : Predicate<"Subtarget->hasMMX()">; def HasSSE1 : Predicate<"Subtarget->hasSSE1()">; diff --git a/llvm/test/CodeGen/X86/ctselect.ll b/llvm/test/CodeGen/X86/ctselect.ll index 580253b27d44e..705e5377c75a4 100644 --- a/llvm/test/CodeGen/X86/ctselect.ll +++ b/llvm/test/CodeGen/X86/ctselect.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+cmov | FileCheck %s --check-prefix=X64 ; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mattr=+cmov | FileCheck %s --check-prefix=X32 +; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mattr=-cmov | FileCheck %s --check-prefix=X32-NOCMOV ; Test basic ct.select functionality for scalar types @@ -20,6 +21,26 @@ define i8 @test_ctselect_i8(i1 %cond, i8 %a, i8 %b) { ; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax ; X32-NEXT: # kill: def $al killed $al killed $eax ; X32-NEXT: retl +; +; X32-NOCMOV-LABEL: test_ctselect_i8: +; X32-NOCMOV: # %bb.0: +; X32-NOCMOV-NEXT: pushl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: .cfi_offset %esi, -8 +; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %edx +; X32-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NOCMOV-NEXT: sete %ah +; X32-NOCMOV-NEXT: movb %ah, %ch +; X32-NOCMOV-NEXT: negb %ch +; X32-NOCMOV-NEXT: movb %dl, %al +; X32-NOCMOV-NEXT: andb %ch, %al +; X32-NOCMOV-NEXT: notb %ch +; X32-NOCMOV-NEXT: andb %cl, %ch +; X32-NOCMOV-NEXT: orb %ch, %al +; X32-NOCMOV-NEXT: popl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4 +; X32-NOCMOV-NEXT: retl %result = call i8 @llvm.ct.select.i8(i1 %cond, i8 %a, i8 %b) ret i8 %result } @@ -39,6 +60,32 @@ define i16 @test_ctselect_i16(i1 %cond, i16 %a, i16 %b) { ; X32-NEXT: testb $1, {{[0-9]+}}(%esp) ; X32-NEXT: cmovnew {{[0-9]+}}(%esp), %ax ; X32-NEXT: retl +; +; X32-NOCMOV-LABEL: test_ctselect_i16: +; X32-NOCMOV: # %bb.0: +; X32-NOCMOV-NEXT: pushl %ebx +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: pushl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12 +; X32-NOCMOV-NEXT: .cfi_offset %esi, -12 +; X32-NOCMOV-NEXT: .cfi_offset %ebx, -8 +; X32-NOCMOV-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; X32-NOCMOV-NEXT: movzwl {{[0-9]+}}(%esp), %edx +; X32-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NOCMOV-NEXT: sete %bl +; X32-NOCMOV-NEXT: movb %bl, %bh +; X32-NOCMOV-NEXT: movzbw %bh, %esi +; X32-NOCMOV-NEXT: negw %esi +; X32-NOCMOV-NEXT: movw %dx, %ax +; X32-NOCMOV-NEXT: andw %esi, %ax +; X32-NOCMOV-NEXT: notw %esi +; 
X32-NOCMOV-NEXT: andw %cx, %esi +; X32-NOCMOV-NEXT: orw %esi, %ax +; X32-NOCMOV-NEXT: popl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: popl %ebx +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4 +; X32-NOCMOV-NEXT: retl %result = call i16 @llvm.ct.select.i16(i1 %cond, i16 %a, i16 %b) ret i16 %result } @@ -57,6 +104,32 @@ define i32 @test_ctselect_i32(i1 %cond, i32 %a, i32 %b) { ; X32-NEXT: testb $1, {{[0-9]+}}(%esp) ; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax ; X32-NEXT: retl +; +; X32-NOCMOV-LABEL: test_ctselect_i32: +; X32-NOCMOV: # %bb.0: +; X32-NOCMOV-NEXT: pushl %ebx +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: pushl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12 +; X32-NOCMOV-NEXT: .cfi_offset %esi, -12 +; X32-NOCMOV-NEXT: .cfi_offset %ebx, -8 +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NOCMOV-NEXT: sete %bl +; X32-NOCMOV-NEXT: movb %bl, %bh +; X32-NOCMOV-NEXT: movzbl %bh, %esi +; X32-NOCMOV-NEXT: negl %esi +; X32-NOCMOV-NEXT: movl %edx, %eax +; X32-NOCMOV-NEXT: andl %esi, %eax +; X32-NOCMOV-NEXT: notl %esi +; X32-NOCMOV-NEXT: andl %ecx, %esi +; X32-NOCMOV-NEXT: orl %esi, %eax +; X32-NOCMOV-NEXT: popl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: popl %ebx +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4 +; X32-NOCMOV-NEXT: retl %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) ret i32 %result } @@ -77,6 +150,52 @@ define i64 @test_ctselect_i64(i1 %cond, i64 %a, i64 %b) { ; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax ; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %edx ; X32-NEXT: retl +; +; X32-NOCMOV-LABEL: test_ctselect_i64: +; X32-NOCMOV: # %bb.0: +; X32-NOCMOV-NEXT: pushl %ebp +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: pushl %ebx +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12 +; X32-NOCMOV-NEXT: pushl %edi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 16 +; X32-NOCMOV-NEXT: pushl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 20 +; X32-NOCMOV-NEXT: .cfi_offset %esi, -20 +; X32-NOCMOV-NEXT: .cfi_offset %edi, -16 +; X32-NOCMOV-NEXT: .cfi_offset %ebx, -12 +; X32-NOCMOV-NEXT: .cfi_offset %ebp, -8 +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edi +; X32-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NOCMOV-NEXT: sete %bl +; X32-NOCMOV-NEXT: movb %bl, %bh +; X32-NOCMOV-NEXT: movzbl %bh, %ebp +; X32-NOCMOV-NEXT: negl %ebp +; X32-NOCMOV-NEXT: movl %edx, %eax +; X32-NOCMOV-NEXT: andl %ebp, %eax +; X32-NOCMOV-NEXT: notl %ebp +; X32-NOCMOV-NEXT: andl %ecx, %ebp +; X32-NOCMOV-NEXT: orl %ebp, %eax +; X32-NOCMOV-NEXT: movb %bl, %cl +; X32-NOCMOV-NEXT: movzbl %cl, %ebp +; X32-NOCMOV-NEXT: negl %ebp +; X32-NOCMOV-NEXT: movl %edi, %edx +; X32-NOCMOV-NEXT: andl %ebp, %edx +; X32-NOCMOV-NEXT: notl %ebp +; X32-NOCMOV-NEXT: andl %esi, %ebp +; X32-NOCMOV-NEXT: orl %ebp, %edx +; X32-NOCMOV-NEXT: popl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 16 +; X32-NOCMOV-NEXT: popl %edi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12 +; X32-NOCMOV-NEXT: popl %ebx +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: popl %ebp +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4 +; X32-NOCMOV-NEXT: retl %result = call i64 @llvm.ct.select.i64(i1 %cond, i64 %a, i64 %b) ret i64 %result } @@ -103,6 +222,19 @@ define float @test_ctselect_f32(i1 %cond, float %a, float %b) { ; X32-NEXT: .LBB4_2: ; X32-NEXT: fstp %st(0) ; 
X32-NEXT: retl +; +; X32-NOCMOV-LABEL: test_ctselect_f32: +; X32-NOCMOV: # %bb.0: +; X32-NOCMOV-NEXT: flds {{[0-9]+}}(%esp) +; X32-NOCMOV-NEXT: flds {{[0-9]+}}(%esp) +; X32-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NOCMOV-NEXT: jne .LBB4_2 +; X32-NOCMOV-NEXT: # %bb.1: +; X32-NOCMOV-NEXT: fstp %st(1) +; X32-NOCMOV-NEXT: fldz +; X32-NOCMOV-NEXT: .LBB4_2: +; X32-NOCMOV-NEXT: fstp %st(0) +; X32-NOCMOV-NEXT: retl %result = call float @llvm.ct.select.f32(i1 %cond, float %a, float %b) ret float %result } @@ -129,6 +261,19 @@ define double @test_ctselect_f64(i1 %cond, double %a, double %b) { ; X32-NEXT: .LBB5_2: ; X32-NEXT: fstp %st(0) ; X32-NEXT: retl +; +; X32-NOCMOV-LABEL: test_ctselect_f64: +; X32-NOCMOV: # %bb.0: +; X32-NOCMOV-NEXT: fldl {{[0-9]+}}(%esp) +; X32-NOCMOV-NEXT: fldl {{[0-9]+}}(%esp) +; X32-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NOCMOV-NEXT: jne .LBB5_2 +; X32-NOCMOV-NEXT: # %bb.1: +; X32-NOCMOV-NEXT: fstp %st(1) +; X32-NOCMOV-NEXT: fldz +; X32-NOCMOV-NEXT: .LBB5_2: +; X32-NOCMOV-NEXT: fstp %st(0) +; X32-NOCMOV-NEXT: retl %result = call double @llvm.ct.select.f64(i1 %cond, double %a, double %b) ret double %result } @@ -147,6 +292,32 @@ define ptr @test_ctselect_ptr(i1 %cond, ptr %a, ptr %b) { ; X32-NEXT: testb $1, {{[0-9]+}}(%esp) ; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax ; X32-NEXT: retl +; +; X32-NOCMOV-LABEL: test_ctselect_ptr: +; X32-NOCMOV: # %bb.0: +; X32-NOCMOV-NEXT: pushl %ebx +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: pushl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12 +; X32-NOCMOV-NEXT: .cfi_offset %esi, -12 +; X32-NOCMOV-NEXT: .cfi_offset %ebx, -8 +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NOCMOV-NEXT: sete %bl +; X32-NOCMOV-NEXT: movb %bl, %bh +; X32-NOCMOV-NEXT: movzbl %bh, %esi +; X32-NOCMOV-NEXT: negl %esi +; X32-NOCMOV-NEXT: movl %edx, %eax +; X32-NOCMOV-NEXT: andl %esi, %eax +; X32-NOCMOV-NEXT: notl %esi +; X32-NOCMOV-NEXT: andl %ecx, %esi +; X32-NOCMOV-NEXT: orl %esi, %eax +; X32-NOCMOV-NEXT: popl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: popl %ebx +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4 +; X32-NOCMOV-NEXT: retl %result = call ptr @llvm.ct.select.p0(i1 %cond, ptr %a, ptr %b) ret ptr %result } @@ -168,6 +339,33 @@ define i32 @test_ctselect_const_true(i32 %a, i32 %b) { ; X32-NEXT: testb %cl, %cl ; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax ; X32-NEXT: retl +; +; X32-NOCMOV-LABEL: test_ctselect_const_true: +; X32-NOCMOV: # %bb.0: +; X32-NOCMOV-NEXT: pushl %ebx +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: pushl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12 +; X32-NOCMOV-NEXT: .cfi_offset %esi, -12 +; X32-NOCMOV-NEXT: .cfi_offset %ebx, -8 +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NOCMOV-NEXT: movb $1, %al +; X32-NOCMOV-NEXT: testb %al, %al +; X32-NOCMOV-NEXT: sete %bl +; X32-NOCMOV-NEXT: movb %bl, %bh +; X32-NOCMOV-NEXT: movzbl %bh, %esi +; X32-NOCMOV-NEXT: negl %esi +; X32-NOCMOV-NEXT: movl %edx, %eax +; X32-NOCMOV-NEXT: andl %esi, %eax +; X32-NOCMOV-NEXT: notl %esi +; X32-NOCMOV-NEXT: andl %ecx, %esi +; X32-NOCMOV-NEXT: orl %esi, %eax +; X32-NOCMOV-NEXT: popl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: popl %ebx +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4 +; X32-NOCMOV-NEXT: retl %result = call i32 @llvm.ct.select.i32(i1 true, i32 %a, i32 %b) ret i32 %result } @@ -188,6 +386,33 @@ define i32 
@test_ctselect_const_false(i32 %a, i32 %b) { ; X32-NEXT: testb %cl, %cl ; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax ; X32-NEXT: retl +; +; X32-NOCMOV-LABEL: test_ctselect_const_false: +; X32-NOCMOV: # %bb.0: +; X32-NOCMOV-NEXT: pushl %ebx +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: pushl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12 +; X32-NOCMOV-NEXT: .cfi_offset %esi, -12 +; X32-NOCMOV-NEXT: .cfi_offset %ebx, -8 +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NOCMOV-NEXT: xorl %eax, %eax +; X32-NOCMOV-NEXT: testb %al, %al +; X32-NOCMOV-NEXT: sete %bl +; X32-NOCMOV-NEXT: movb %bl, %bh +; X32-NOCMOV-NEXT: movzbl %bh, %esi +; X32-NOCMOV-NEXT: negl %esi +; X32-NOCMOV-NEXT: movl %edx, %eax +; X32-NOCMOV-NEXT: andl %esi, %eax +; X32-NOCMOV-NEXT: notl %esi +; X32-NOCMOV-NEXT: andl %ecx, %esi +; X32-NOCMOV-NEXT: orl %esi, %eax +; X32-NOCMOV-NEXT: popl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: popl %ebx +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4 +; X32-NOCMOV-NEXT: retl %result = call i32 @llvm.ct.select.i32(i1 false, i32 %a, i32 %b) ret i32 %result } @@ -212,6 +437,35 @@ define i32 @test_ctselect_icmp_eq(i32 %x, i32 %y, i32 %a, i32 %b) { ; X32-NEXT: testb %cl, %cl ; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax ; X32-NEXT: retl +; +; X32-NOCMOV-LABEL: test_ctselect_icmp_eq: +; X32-NOCMOV: # %bb.0: +; X32-NOCMOV-NEXT: pushl %ebx +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: pushl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12 +; X32-NOCMOV-NEXT: .cfi_offset %esi, -12 +; X32-NOCMOV-NEXT: .cfi_offset %ebx, -8 +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NOCMOV-NEXT: cmpl {{[0-9]+}}(%esp), %eax +; X32-NOCMOV-NEXT: sete %al +; X32-NOCMOV-NEXT: testb %al, %al +; X32-NOCMOV-NEXT: sete %bl +; X32-NOCMOV-NEXT: movb %bl, %bh +; X32-NOCMOV-NEXT: movzbl %bh, %esi +; X32-NOCMOV-NEXT: negl %esi +; X32-NOCMOV-NEXT: movl %edx, %eax +; X32-NOCMOV-NEXT: andl %esi, %eax +; X32-NOCMOV-NEXT: notl %esi +; X32-NOCMOV-NEXT: andl %ecx, %esi +; X32-NOCMOV-NEXT: orl %esi, %eax +; X32-NOCMOV-NEXT: popl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: popl %ebx +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4 +; X32-NOCMOV-NEXT: retl %cond = icmp eq i32 %x, %y %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) ret i32 %result @@ -236,6 +490,35 @@ define i32 @test_ctselect_icmp_ne(i32 %x, i32 %y, i32 %a, i32 %b) { ; X32-NEXT: testb %cl, %cl ; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax ; X32-NEXT: retl +; +; X32-NOCMOV-LABEL: test_ctselect_icmp_ne: +; X32-NOCMOV: # %bb.0: +; X32-NOCMOV-NEXT: pushl %ebx +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: pushl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12 +; X32-NOCMOV-NEXT: .cfi_offset %esi, -12 +; X32-NOCMOV-NEXT: .cfi_offset %ebx, -8 +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NOCMOV-NEXT: cmpl {{[0-9]+}}(%esp), %eax +; X32-NOCMOV-NEXT: setne %al +; X32-NOCMOV-NEXT: testb %al, %al +; X32-NOCMOV-NEXT: sete %bl +; X32-NOCMOV-NEXT: movb %bl, %bh +; X32-NOCMOV-NEXT: movzbl %bh, %esi +; X32-NOCMOV-NEXT: negl %esi +; X32-NOCMOV-NEXT: movl %edx, %eax +; X32-NOCMOV-NEXT: andl %esi, %eax +; X32-NOCMOV-NEXT: notl %esi +; X32-NOCMOV-NEXT: andl %ecx, %esi +; X32-NOCMOV-NEXT: orl %esi, %eax +; X32-NOCMOV-NEXT: popl %esi +; 
X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: popl %ebx +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4 +; X32-NOCMOV-NEXT: retl %cond = icmp ne i32 %x, %y %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) ret i32 %result @@ -260,6 +543,35 @@ define i32 @test_ctselect_icmp_slt(i32 %x, i32 %y, i32 %a, i32 %b) { ; X32-NEXT: testb %cl, %cl ; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax ; X32-NEXT: retl +; +; X32-NOCMOV-LABEL: test_ctselect_icmp_slt: +; X32-NOCMOV: # %bb.0: +; X32-NOCMOV-NEXT: pushl %ebx +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: pushl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12 +; X32-NOCMOV-NEXT: .cfi_offset %esi, -12 +; X32-NOCMOV-NEXT: .cfi_offset %ebx, -8 +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NOCMOV-NEXT: cmpl {{[0-9]+}}(%esp), %eax +; X32-NOCMOV-NEXT: setl %al +; X32-NOCMOV-NEXT: testb %al, %al +; X32-NOCMOV-NEXT: sete %bl +; X32-NOCMOV-NEXT: movb %bl, %bh +; X32-NOCMOV-NEXT: movzbl %bh, %esi +; X32-NOCMOV-NEXT: negl %esi +; X32-NOCMOV-NEXT: movl %edx, %eax +; X32-NOCMOV-NEXT: andl %esi, %eax +; X32-NOCMOV-NEXT: notl %esi +; X32-NOCMOV-NEXT: andl %ecx, %esi +; X32-NOCMOV-NEXT: orl %esi, %eax +; X32-NOCMOV-NEXT: popl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: popl %ebx +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4 +; X32-NOCMOV-NEXT: retl %cond = icmp slt i32 %x, %y %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) ret i32 %result @@ -284,6 +596,35 @@ define i32 @test_ctselect_icmp_ult(i32 %x, i32 %y, i32 %a, i32 %b) { ; X32-NEXT: testb %cl, %cl ; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax ; X32-NEXT: retl +; +; X32-NOCMOV-LABEL: test_ctselect_icmp_ult: +; X32-NOCMOV: # %bb.0: +; X32-NOCMOV-NEXT: pushl %ebx +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: pushl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12 +; X32-NOCMOV-NEXT: .cfi_offset %esi, -12 +; X32-NOCMOV-NEXT: .cfi_offset %ebx, -8 +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NOCMOV-NEXT: cmpl {{[0-9]+}}(%esp), %eax +; X32-NOCMOV-NEXT: setb %al +; X32-NOCMOV-NEXT: testb %al, %al +; X32-NOCMOV-NEXT: sete %bl +; X32-NOCMOV-NEXT: movb %bl, %bh +; X32-NOCMOV-NEXT: movzbl %bh, %esi +; X32-NOCMOV-NEXT: negl %esi +; X32-NOCMOV-NEXT: movl %edx, %eax +; X32-NOCMOV-NEXT: andl %esi, %eax +; X32-NOCMOV-NEXT: notl %esi +; X32-NOCMOV-NEXT: andl %ecx, %esi +; X32-NOCMOV-NEXT: orl %esi, %eax +; X32-NOCMOV-NEXT: popl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: popl %ebx +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4 +; X32-NOCMOV-NEXT: retl %cond = icmp ult i32 %x, %y %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) ret i32 %result @@ -320,6 +661,27 @@ define float @test_ctselect_fcmp_oeq(float %x, float %y, float %a, float %b) { ; X32-NEXT: .LBB13_2: ; X32-NEXT: fstp %st(0) ; X32-NEXT: retl +; +; X32-NOCMOV-LABEL: test_ctselect_fcmp_oeq: +; X32-NOCMOV: # %bb.0: +; X32-NOCMOV-NEXT: flds {{[0-9]+}}(%esp) +; X32-NOCMOV-NEXT: flds {{[0-9]+}}(%esp) +; X32-NOCMOV-NEXT: flds {{[0-9]+}}(%esp) +; X32-NOCMOV-NEXT: flds {{[0-9]+}}(%esp) +; X32-NOCMOV-NEXT: fucompp +; X32-NOCMOV-NEXT: fnstsw %ax +; X32-NOCMOV-NEXT: # kill: def $ah killed $ah killed $ax +; X32-NOCMOV-NEXT: sahf +; X32-NOCMOV-NEXT: setnp %al +; X32-NOCMOV-NEXT: sete %cl +; X32-NOCMOV-NEXT: testb %al, %cl +; X32-NOCMOV-NEXT: jne .LBB13_2 +; 
X32-NOCMOV-NEXT: # %bb.1: +; X32-NOCMOV-NEXT: fstp %st(1) +; X32-NOCMOV-NEXT: fldz +; X32-NOCMOV-NEXT: .LBB13_2: +; X32-NOCMOV-NEXT: fstp %st(0) +; X32-NOCMOV-NEXT: retl %cond = fcmp oeq float %x, %y %result = call float @llvm.ct.select.f32(i1 %cond, float %a, float %b) ret float %result @@ -342,6 +704,34 @@ define i32 @test_ctselect_load(i1 %cond, ptr %p1, ptr %p2) { ; X32-NEXT: testb $1, {{[0-9]+}}(%esp) ; X32-NEXT: cmovnel (%ecx), %eax ; X32-NEXT: retl +; +; X32-NOCMOV-LABEL: test_ctselect_load: +; X32-NOCMOV: # %bb.0: +; X32-NOCMOV-NEXT: pushl %ebx +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: pushl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12 +; X32-NOCMOV-NEXT: .cfi_offset %esi, -12 +; X32-NOCMOV-NEXT: .cfi_offset %ebx, -8 +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NOCMOV-NEXT: movl (%ecx), %ecx +; X32-NOCMOV-NEXT: movl (%eax), %edx +; X32-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NOCMOV-NEXT: sete %bl +; X32-NOCMOV-NEXT: movb %bl, %bh +; X32-NOCMOV-NEXT: movzbl %bh, %esi +; X32-NOCMOV-NEXT: negl %esi +; X32-NOCMOV-NEXT: movl %edx, %eax +; X32-NOCMOV-NEXT: andl %esi, %eax +; X32-NOCMOV-NEXT: notl %esi +; X32-NOCMOV-NEXT: andl %ecx, %esi +; X32-NOCMOV-NEXT: orl %esi, %eax +; X32-NOCMOV-NEXT: popl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: popl %ebx +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4 +; X32-NOCMOV-NEXT: retl %a = load i32, ptr %p1 %b = load i32, ptr %p2 %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) @@ -368,6 +758,48 @@ define i32 @test_ctselect_nested(i1 %cond1, i1 %cond2, i32 %a, i32 %b, i32 %c) { ; X32-NEXT: testb $1, {{[0-9]+}}(%esp) ; X32-NEXT: cmovnel %ecx, %eax ; X32-NEXT: retl +; +; X32-NOCMOV-LABEL: test_ctselect_nested: +; X32-NOCMOV: # %bb.0: +; X32-NOCMOV-NEXT: pushl %ebx +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: pushl %edi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12 +; X32-NOCMOV-NEXT: pushl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 16 +; X32-NOCMOV-NEXT: .cfi_offset %esi, -16 +; X32-NOCMOV-NEXT: .cfi_offset %edi, -12 +; X32-NOCMOV-NEXT: .cfi_offset %ebx, -8 +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NOCMOV-NEXT: sete %bl +; X32-NOCMOV-NEXT: movb %bl, %bh +; X32-NOCMOV-NEXT: movzbl %bh, %edi +; X32-NOCMOV-NEXT: negl %edi +; X32-NOCMOV-NEXT: movl %edx, %esi +; X32-NOCMOV-NEXT: andl %edi, %esi +; X32-NOCMOV-NEXT: notl %edi +; X32-NOCMOV-NEXT: andl %eax, %edi +; X32-NOCMOV-NEXT: orl %edi, %esi +; X32-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NOCMOV-NEXT: sete %dl +; X32-NOCMOV-NEXT: movb %dl, %dh +; X32-NOCMOV-NEXT: movzbl %dh, %edi +; X32-NOCMOV-NEXT: negl %edi +; X32-NOCMOV-NEXT: movl %ecx, %eax +; X32-NOCMOV-NEXT: andl %edi, %eax +; X32-NOCMOV-NEXT: notl %edi +; X32-NOCMOV-NEXT: andl %esi, %edi +; X32-NOCMOV-NEXT: orl %edi, %eax +; X32-NOCMOV-NEXT: popl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12 +; X32-NOCMOV-NEXT: popl %edi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: popl %ebx +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4 +; X32-NOCMOV-NEXT: retl %inner = call i32 @llvm.ct.select.i32(i1 %cond2, i32 %a, i32 %b) %result = call i32 @llvm.ct.select.i32(i1 %cond1, i32 %inner, i32 %c) ret i32 %result From 449225865429f7ce34d4c9164df595e8955331ee Mon Sep 17 00:00:00 2001 From: Henrik Brodin <90325907+hbrodin@users.noreply.github.com> Date: Thu, 2 Oct 
2025 15:50:40 +0200
Subject: [PATCH 47/63] [CT] Make cmov predicates more stringent

Previously, in some cases, multiple patterns could match depending on which
attributes were used. Now we rely solely on hasCMOV().

Clarified a comment about why we are not widening 8-bit values in a non-CMOV
context.
---
 llvm/lib/Target/X86/X86ISelLowering.cpp   |  5 +++--
 llvm/lib/Target/X86/X86InstrCMovSetCC.td  |  4 ++--
 llvm/lib/Target/X86/X86InstrPredicates.td | 10 +++++-----
 3 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 34a37253258b1..101ed142f690a 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -25504,8 +25504,9 @@ SDValue X86TargetLowering::LowerCTSELECT(SDValue Op, SelectionDAG &DAG) const {
   }

   // Promote small integer types to avoid partial register stalls
-  // Exception: For i8 without CMOV, prefer native 8-bit constant-time operations
-  if ((Op.getValueType() == MVT::i8 && Subtarget.canUseCMOV()) ||
+  // Exception: For i8 without CMOV, we can generate a shorter instruction
+  // sequence without movzx, so keep it as is.
+  if ((Op.getValueType() == MVT::i8 && Subtarget.hasCMOV()) ||
       (Op.getValueType() == MVT::i16 && !X86::mayFoldLoad(TrueOp, Subtarget) && !X86::mayFoldLoad(FalseOp, Subtarget))) {
     TrueOp = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, TrueOp);
diff --git a/llvm/lib/Target/X86/X86InstrCMovSetCC.td b/llvm/lib/Target/X86/X86InstrCMovSetCC.td
index ecc3f11a478bf..9c34889f03354 100644
--- a/llvm/lib/Target/X86/X86InstrCMovSetCC.td
+++ b/llvm/lib/Target/X86/X86InstrCMovSetCC.td
@@ -114,7 +114,7 @@ let Uses = [EFLAGS], isNotDuplicable = 1, isPseudo = 1 in {
   multiclass CTSELECT {
     // register-only
-    let isCommutable = 0, SchedRW = [WriteCMOV], Predicates = [HasCMOV],
+    let isCommutable = 0, SchedRW = [WriteCMOV], Predicates = [HasNativeCMOV],
         AsmString = "ctselect\\t$dst, $src1, $src2, $cond" in {
       def rr : PseudoI<(outs t.RegClass:$dst), (ins t.RegClass:$src1, t.RegClass:$src2, i8imm:$cond),
@@ -122,7 +122,7 @@ let Uses = [EFLAGS], isNotDuplicable = 1, isPseudo = 1 in {
     }

     // register-memory
-    let SchedRW = [WriteCMOV.Folded, WriteCMOV.ReadAfterFold], Predicates = [HasCMOV],
+    let SchedRW = [WriteCMOV.Folded, WriteCMOV.ReadAfterFold], Predicates = [HasNativeCMOV],
         AsmString = "ctselect\\t$dst, $src1, $src2, $cond" in {
       def rm : PseudoI<(outs t.RegClass:$dst), (ins t.RegClass:$src1, t.MemOperand:$src2, i8imm:$cond),
diff --git a/llvm/lib/Target/X86/X86InstrPredicates.td b/llvm/lib/Target/X86/X86InstrPredicates.td
index e59cd4e1ddf7b..23841034ed411 100644
--- a/llvm/lib/Target/X86/X86InstrPredicates.td
+++ b/llvm/lib/Target/X86/X86InstrPredicates.td
@@ -49,11 +49,11 @@ def HasZU : Predicate<"Subtarget->hasZU()">;
 def HasCF : Predicate<"Subtarget->hasCF()">;
 def HasCMOV : Predicate<"Subtarget->canUseCMOV()">;
 def NoCMOV : Predicate<"!Subtarget->canUseCMOV()">;
-// Predicate for constant-time selection without native CMOV instruction
-// Unlike NoCMOV, this checks hasCMOV() directly, not canUseCMOV()
-// This ensures i386 with SSE (canUseCMOV=true but hasCMOV=false) uses
-// the constant-time i386-specific implementation instead of CMOV emulation
-def NoNativeCMOV : Predicate<"!Subtarget->hasCMOV()">;
+// Predicates for the native CMOV instruction (they check hasCMOV(), not canUseCMOV())
+// The HasCMOV predicate may be true even without native CMOV (e.g., via SSE emulation)
+// Use HasNativeCMOV/NoNativeCMOV for constant-time code that requires actual CMOV
+def HasNativeCMOV : Predicate<"Subtarget->hasCMOV()">;
+def NoNativeCMOV : Predicate<"!Subtarget->hasCMOV()">;
 def HasNOPL : Predicate<"Subtarget->hasNOPL()">;
 def HasMMX : Predicate<"Subtarget->hasMMX()">;
 def HasSSE1 : Predicate<"Subtarget->hasSSE1()">;
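For reference, the combinations these predicates distinguish, as described by
the comments above (a summary sketch; the authoritative definitions are the
X86Subtarget hooks, and the exact feature logic may differ):

    // target           hasCMOV()   canUseCMOV()   CTSELECT patterns selected
    // i686 / x86-64    true        true           HasNativeCMOV (CMOV-based pseudos)
    // i386 + SSE       false       true           NoNativeCMOV (bitwise mask expansion)
    // bare i386        false       false          NoNativeCMOV (bitwise mask expansion)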
From a20a64ca02049dd3b74a2509c8116ab0023b84b9 Mon Sep 17 00:00:00 2001
From: Henrik Brodin <90325907+hbrodin@users.noreply.github.com>
Date: Tue, 26 Aug 2025 08:54:08 +0200
Subject: [PATCH 48/63] [CT] Implement constant-time CTSELECT for i386 targets
 without CMOV

Add post-RA expansion for CTSELECT on i386 targets that lack CMOV support.
The lowering uses CMOV when available for optimal performance and falls back
to bundled bitwise operations for constant-time guarantees on legacy targets.
---
 llvm/lib/Target/X86/X86ISelLowering.cpp       |   4 +
 llvm/lib/Target/X86/X86InstrCompiler.td       |  12 +-
 llvm/lib/Target/X86/X86InstrInfo.cpp          |   8 +
 llvm/lib/Target/X86/X86InstrInfo.h            |   3 +
 .../CodeGen/X86/ctselect-constant-time.ll     | 159 +++++++++
 .../X86/ctselect-i386-bundle-expansion.ll     | 184 +++++++++++
 .../CodeGen/X86/ctselect-i386-security.ll     | 141 ++++++++
 llvm/test/CodeGen/X86/ctselect-i386.ll        | 312 ++++++++++++++++++
 8 files changed, 814 insertions(+), 9 deletions(-)
 create mode 100644 llvm/test/CodeGen/X86/ctselect-constant-time.ll
 create mode 100644 llvm/test/CodeGen/X86/ctselect-i386-bundle-expansion.ll
 create mode 100644 llvm/test/CodeGen/X86/ctselect-i386-security.ll
 create mode 100644 llvm/test/CodeGen/X86/ctselect-i386.ll

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 101ed142f690a..e0f16020be80f 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -25374,6 +25374,10 @@ SDValue X86TargetLowering::LowerCTSELECT(SDValue Op, SelectionDAG &DAG) const {
   SDLoc DL(Op);
   MVT VT = TrueOp.getSimpleValueType();

+  // Special handling for i386 targets (no CMOV) - route to post-RA expansion
+  // pseudos. Let standard type legalization handle i64 automatically (it
+  // splits i64 into EDX:EAX).
+
   // Handle soft float16 by converting to integer operations
   if (isSoftF16(VT, Subtarget)) {
     MVT NVT = VT.changeTypeToInteger();
diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td b/llvm/lib/Target/X86/X86InstrCompiler.td
index d5b30b50d0e57..a7a2986fa9548 100644
--- a/llvm/lib/Target/X86/X86InstrCompiler.td
+++ b/llvm/lib/Target/X86/X86InstrCompiler.td
@@ -724,14 +724,9 @@ let isPseudo = 1, isNotDuplicable = 1 in {
   multiclass CTSELECT_NOCMOV {
     let hasNoSchedulingInfo = 1 in {
-      def rr : PseudoI<(outs RC:$dst),
-                       (ins RC:$src1, RC:$src2, i8imm:$cond),
-                       [(set RC:$dst, ( VT (X86ctselect RC:$src1, RC:$src2, timm:$cond, EFLAGS)))]>;
-
-      // TODO: Do we need register-memory variant??
-      //def rm : PseudoI<(outs t.RegClass:$dst),
-      //                 (ins t.RegClass:$src1, t.MemOperand:$src2, i8imm:$cond),
-      //                 [(set t.RegClass:$dst, (X86ctselect t.RegClass:$src1, (t.LoadNode addr:$src2), timm:$cond, EFLAGS))]>;
+      def rr : PseudoI<(outs RC:$dst), (ins RC:$src1, RC:$src2, i8imm:$cond),
+                       [(set RC:$dst, (VT(X86ctselect RC:$src1, RC:$src2,
+                                          timm:$cond, EFLAGS)))]>;
     }
   }
 }
@@ -772,7 +767,6 @@ let usesCustomInserter = 1,
   defm CTSELECT_VR64 : CTSELECT_NOCMOV;
 }
-
 // Pattern matching for non-native-CMOV CTSELECT (routes to custom inserter for condition materialization)
 // NoNativeCMOV ensures these patterns are used when actual CMOV instruction is not available
 // even if canUseCMOV() is true (e.g., i386 with SSE which can emulate CMOV)
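The lowering comment above defers i64 to standard type legalization, which
splits the value into EDX:EAX and emits one bitwise select per 32-bit half;
this is why the 64-bit tests below check for two BUNDLE markers. A minimal
C++ sketch of the resulting computation, with illustrative names and the
condition assumed to be 0 or 1:

    #include <cstdint>

    uint64_t ct_select64(uint32_t cond, uint64_t a, uint64_t b) {
      uint32_t mask = 0u - cond;  // applied independently to each half
      uint32_t lo = ((uint32_t)a & mask) | ((uint32_t)b & ~mask);
      uint32_t hi = ((uint32_t)(a >> 32) & mask) | ((uint32_t)(b >> 32) & ~mask);
      return ((uint64_t)hi << 32) | lo;
    }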
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index 58efc32d5da34..1bd36bf7a9172 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -6961,12 +6961,15 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
   case X86::ADD64ri32_DB:
     MIB->setDesc(get(X86::OR64ri32));
     break;
+
   case X86::CTSELECT64rr:
   case X86::CTSELECT32rr:
   case X86::CTSELECT16rr:
   case X86::CTSELECT64rm:
   case X86::CTSELECT32rm:
   case X86::CTSELECT16rm:
+    // These CTSELECT pseudos are only selected when CMOV is available
+    // Pattern matching ensures we use CTSELECT_I386 when CMOV is not available
     return expandCtSelectWithCMOV(MI);

   // non-cmov CTSELECT expansion (post-RA, constant-time)
@@ -6995,6 +6998,11 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
   case X86::CTSELECT_V4F64:
   case X86::CTSELECT_V8F32:
     return expandCtSelectVector(MI);
+
+  // i386-specific CTSELECT expansion (post-RA, constant-time)
+  case X86::CTSELECT_I386_GR16rr:
+  case X86::CTSELECT_I386_GR32rr:
+    return expandCtSelectI386(MI);
   }
   return false;
 }
diff --git a/llvm/lib/Target/X86/X86InstrInfo.h b/llvm/lib/Target/X86/X86InstrInfo.h
index ebd7e070d5fe8..c4e2c4ee44460 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.h
+++ b/llvm/lib/Target/X86/X86InstrInfo.h
@@ -730,6 +730,9 @@ class X86InstrInfo final : public X86GenInstrInfo {

   bool expandCtSelectVector(MachineInstr &MI) const;

+  /// Expand i386-specific CTSELECT pseudo instructions (post-RA, constant-time)
+  bool expandCtSelectI386(MachineInstr &MI) const;
+
   /// Returns true iff the routine could find two commutable operands in the
   /// given machine instruction with 3 vector inputs.
   /// The 'SrcOpIdx1' and 'SrcOpIdx2' are INPUT and OUTPUT arguments. Their
diff --git a/llvm/test/CodeGen/X86/ctselect-constant-time.ll b/llvm/test/CodeGen/X86/ctselect-constant-time.ll
new file mode 100644
index 0000000000000..d4996e13cf047
--- /dev/null
+++ b/llvm/test/CodeGen/X86/ctselect-constant-time.ll
@@ -0,0 +1,159 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mattr=-cmov | FileCheck %s --check-prefix=CONSTANT-TIME
+; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mattr=-cmov -verify-machineinstrs | FileCheck %s --check-prefix=CONSTANT-TIME
+
+; Test constant-time properties of CTSELECT implementation on i386
+; This test verifies that:
+; 1. No conditional branches are generated
+; 2. Both true and false values are always loaded/computed
+; 3.
No conditional memory loads are performed + +define i32 @crypto_key_select(i32 %secret_bit, i32 %key1, i32 %key2) nounwind { +; CONSTANT-TIME-LABEL: crypto_key_select: +; CONSTANT-TIME: # %bb.0: +; CONSTANT-TIME-NEXT: pushl %ebx +; CONSTANT-TIME-NEXT: pushl %edi +; CONSTANT-TIME-NEXT: pushl %esi +; CONSTANT-TIME-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CONSTANT-TIME-NEXT: movl {{[0-9]+}}(%esp), %edx +; CONSTANT-TIME-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; CONSTANT-TIME-NEXT: setne %al +; CONSTANT-TIME-NEXT: testb %al, %al +; CONSTANT-TIME-NEXT: BUNDLE +; CONSTANT-TIME-NEXT: popl %esi +; CONSTANT-TIME-NEXT: popl %edi +; CONSTANT-TIME-NEXT: popl %ebx +; CONSTANT-TIME-NEXT: retl + %cond = icmp ne i32 %secret_bit, 0 + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %key1, i32 %key2) + ret i32 %result +} + +define i32 @constant_memory_access(i32 %secret, i32* %data1, i32* %data2) nounwind { +; CONSTANT-TIME-LABEL: constant_memory_access: +; CONSTANT-TIME: # %bb.0: +; CONSTANT-TIME-NEXT: pushl %ebx +; CONSTANT-TIME-NEXT: pushl %edi +; CONSTANT-TIME-NEXT: pushl %esi +; CONSTANT-TIME-NEXT: movl {{[0-9]+}}(%esp), %eax +; CONSTANT-TIME-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CONSTANT-TIME-NEXT: movl (%ecx), %ecx +; CONSTANT-TIME-NEXT: movl (%eax), %edx +; CONSTANT-TIME-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; CONSTANT-TIME-NEXT: setne %al +; CONSTANT-TIME-NEXT: testb %al, %al +; CONSTANT-TIME-NEXT: BUNDLE +; CONSTANT-TIME-NEXT: popl %esi +; CONSTANT-TIME-NEXT: popl %edi +; CONSTANT-TIME-NEXT: popl %ebx +; CONSTANT-TIME-NEXT: retl + %val1 = load i32, i32* %data1 + %val2 = load i32, i32* %data2 + %cond = icmp ne i32 %secret, 0 + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %val1, i32 %val2) + ret i32 %result +} + +define i64 @crypto_key_select_64bit(i32 %secret_bit, i64 %key1, i64 %key2) nounwind { +; CONSTANT-TIME-LABEL: crypto_key_select_64bit: +; CONSTANT-TIME: # %bb.0: +; CONSTANT-TIME-NEXT: pushl %ebp +; CONSTANT-TIME-NEXT: pushl %ebx +; CONSTANT-TIME-NEXT: pushl %edi +; CONSTANT-TIME-NEXT: pushl %esi +; CONSTANT-TIME-NEXT: movl {{[0-9]+}}(%esp), %edx +; CONSTANT-TIME-NEXT: movl {{[0-9]+}}(%esp), %edi +; CONSTANT-TIME-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; CONSTANT-TIME-NEXT: setne %al +; CONSTANT-TIME-NEXT: testb %al, %al +; CONSTANT-TIME-NEXT: movl {{[0-9]+}}(%esp), %esi +; CONSTANT-TIME-NEXT: BUNDLE +; CONSTANT-TIME-NEXT: movl {{[0-9]+}}(%esp), %esi +; CONSTANT-TIME-NEXT: BUNDLE +; CONSTANT-TIME-NEXT: popl %esi +; CONSTANT-TIME-NEXT: popl %edi +; CONSTANT-TIME-NEXT: popl %ebx +; CONSTANT-TIME-NEXT: popl %ebp +; CONSTANT-TIME-NEXT: retl + %cond = icmp ne i32 %secret_bit, 0 + %result = call i64 @llvm.ct.select.i64(i1 %cond, i64 %key1, i64 %key2) + ret i64 %result +} + +define i32 @array_index_protection(i32 %secret_index, i32* %array) nounwind { +; CONSTANT-TIME-LABEL: array_index_protection: +; CONSTANT-TIME: # %bb.0: +; CONSTANT-TIME-NEXT: pushl %ebx +; CONSTANT-TIME-NEXT: pushl %edi +; CONSTANT-TIME-NEXT: pushl %esi +; CONSTANT-TIME-NEXT: movl {{[0-9]+}}(%esp), %eax +; CONSTANT-TIME-NEXT: movl (%eax), %ecx +; CONSTANT-TIME-NEXT: movl 4(%eax), %edx +; CONSTANT-TIME-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; CONSTANT-TIME-NEXT: setne %al +; CONSTANT-TIME-NEXT: testb %al, %al +; CONSTANT-TIME-NEXT: BUNDLE +; CONSTANT-TIME-NEXT: popl %esi +; CONSTANT-TIME-NEXT: popl %edi +; CONSTANT-TIME-NEXT: popl %ebx +; CONSTANT-TIME-NEXT: retl + %val0 = load i32, i32* %array + %ptr1 = getelementptr i32, i32* %array, i32 1 + %val1 = load i32, i32* %ptr1 + %cond = icmp ne i32 %secret_index, 0 + %result = call i32 
@llvm.ct.select.i32(i1 %cond, i32 %val1, i32 %val0) + ret i32 %result +} + +; Test that complex expressions also work in constant-time +define i32 @complex_crypto_operation(i32 %secret, i32 %a, i32 %b) nounwind { +; CONSTANT-TIME-LABEL: complex_crypto_operation: +; CONSTANT-TIME: # %bb.0: +; CONSTANT-TIME-NEXT: pushl %ebx +; CONSTANT-TIME-NEXT: pushl %edi +; CONSTANT-TIME-NEXT: pushl %esi +; CONSTANT-TIME-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CONSTANT-TIME-NEXT: movl {{[0-9]+}}(%esp), %edx +; CONSTANT-TIME-NEXT: addl %ecx, %edx +; CONSTANT-TIME-NEXT: imull %ecx, %ecx +; CONSTANT-TIME-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; CONSTANT-TIME-NEXT: setne %al +; CONSTANT-TIME-NEXT: testb %al, %al +; CONSTANT-TIME-NEXT: BUNDLE +; CONSTANT-TIME-NEXT: popl %esi +; CONSTANT-TIME-NEXT: popl %edi +; CONSTANT-TIME-NEXT: popl %ebx +; CONSTANT-TIME-NEXT: retl + %expr1 = add i32 %a, %b + %expr2 = mul i32 %b, %b + %cond = icmp ne i32 %secret, 0 + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %expr1, i32 %expr2) + ret i32 %result +} + +; Verify that bundling prevents instruction reordering +; This is harder to test directly, but we can check that instructions appear +; in the expected sequence and no optimization has moved them around +define i32 @verify_bundling(i32 %secret, i32 %val1, i32 %val2) nounwind { +; CONSTANT-TIME-LABEL: verify_bundling: +; CONSTANT-TIME: # %bb.0: +; CONSTANT-TIME-NEXT: pushl %ebx +; CONSTANT-TIME-NEXT: pushl %edi +; CONSTANT-TIME-NEXT: pushl %esi +; CONSTANT-TIME-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CONSTANT-TIME-NEXT: movl {{[0-9]+}}(%esp), %edx +; CONSTANT-TIME-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; CONSTANT-TIME-NEXT: setne %al +; CONSTANT-TIME-NEXT: testb %al, %al +; CONSTANT-TIME-NEXT: BUNDLE +; CONSTANT-TIME-NEXT: popl %esi +; CONSTANT-TIME-NEXT: popl %edi +; CONSTANT-TIME-NEXT: popl %ebx +; CONSTANT-TIME-NEXT: retl + %cond = icmp ne i32 %secret, 0 + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %val1, i32 %val2) + ret i32 %result +} + +declare i32 @llvm.ct.select.i32(i1, i32, i32) +declare i64 @llvm.ct.select.i64(i1, i64, i64) diff --git a/llvm/test/CodeGen/X86/ctselect-i386-bundle-expansion.ll b/llvm/test/CodeGen/X86/ctselect-i386-bundle-expansion.ll new file mode 100644 index 0000000000000..2be7f8ac60578 --- /dev/null +++ b/llvm/test/CodeGen/X86/ctselect-i386-bundle-expansion.ll @@ -0,0 +1,184 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=i386-unknown-linux-gnu -mattr=-cmov -print-after=pseudo-probe-inserter < %s -o /dev/null 2>&1 | FileCheck %s --check-prefix=BUNDLE +; RUN: llc -mtriple=i386-unknown-linux-gnu -mattr=-cmov < %s | FileCheck %s --check-prefix=ASM + +; Test that CTSELECT expansion creates proper constant-time bundles with correct instruction sequences + +define i32 @test_ctselect_i32_bundle_expansion(i1 %cond, i32 %a, i32 %b) { +; ASM-LABEL: test_ctselect_i32_bundle_expansion: +; ASM: # %bb.0: +; ASM-NEXT: pushl %ebx +; ASM-NEXT: .cfi_def_cfa_offset 8 +; ASM-NEXT: pushl %edi +; ASM-NEXT: .cfi_def_cfa_offset 12 +; ASM-NEXT: pushl %esi +; ASM-NEXT: .cfi_def_cfa_offset 16 +; ASM-NEXT: .cfi_offset %esi, -16 +; ASM-NEXT: .cfi_offset %edi, -12 +; ASM-NEXT: .cfi_offset %ebx, -8 +; ASM-NEXT: movl {{[0-9]+}}(%esp), %ecx +; ASM-NEXT: movl {{[0-9]+}}(%esp), %edx +; ASM-NEXT: testb $1, {{[0-9]+}}(%esp) +; ASM-NEXT: BUNDLE +; ASM-NEXT: popl %esi +; ASM-NEXT: .cfi_def_cfa_offset 12 +; ASM-NEXT: popl %edi +; ASM-NEXT: .cfi_def_cfa_offset 8 +; ASM-NEXT: popl %ebx +; ASM-NEXT: 
.cfi_def_cfa_offset 4 +; ASM-NEXT: retl + + + %1 = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) + ret i32 %1 +} + +define i16 @test_ctselect_i16_bundle_expansion(i1 %cond, i16 %a, i16 %b) { +; ASM-LABEL: test_ctselect_i16_bundle_expansion: +; ASM: # %bb.0: +; ASM-NEXT: pushl %ebx +; ASM-NEXT: .cfi_def_cfa_offset 8 +; ASM-NEXT: pushl %edi +; ASM-NEXT: .cfi_def_cfa_offset 12 +; ASM-NEXT: pushl %esi +; ASM-NEXT: .cfi_def_cfa_offset 16 +; ASM-NEXT: .cfi_offset %esi, -16 +; ASM-NEXT: .cfi_offset %edi, -12 +; ASM-NEXT: .cfi_offset %ebx, -8 +; ASM-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; ASM-NEXT: movzwl {{[0-9]+}}(%esp), %edx +; ASM-NEXT: testb $1, {{[0-9]+}}(%esp) +; ASM-NEXT: BUNDLE +; ASM-NEXT: popl %esi +; ASM-NEXT: .cfi_def_cfa_offset 12 +; ASM-NEXT: popl %edi +; ASM-NEXT: .cfi_def_cfa_offset 8 +; ASM-NEXT: popl %ebx +; ASM-NEXT: .cfi_def_cfa_offset 4 +; ASM-NEXT: retl + + + %1 = call i16 @llvm.ct.select.i16(i1 %cond, i16 %a, i16 %b) + ret i16 %1 +} + +define i64 @test_ctselect_i64_bundle_expansion(i1 %cond, i64 %a, i64 %b) { +; ASM-LABEL: test_ctselect_i64_bundle_expansion: +; ASM: # %bb.0: +; ASM-NEXT: pushl %ebp +; ASM-NEXT: .cfi_def_cfa_offset 8 +; ASM-NEXT: pushl %ebx +; ASM-NEXT: .cfi_def_cfa_offset 12 +; ASM-NEXT: pushl %edi +; ASM-NEXT: .cfi_def_cfa_offset 16 +; ASM-NEXT: pushl %esi +; ASM-NEXT: .cfi_def_cfa_offset 20 +; ASM-NEXT: .cfi_offset %esi, -20 +; ASM-NEXT: .cfi_offset %edi, -16 +; ASM-NEXT: .cfi_offset %ebx, -12 +; ASM-NEXT: .cfi_offset %ebp, -8 +; ASM-NEXT: movl {{[0-9]+}}(%esp), %edx +; ASM-NEXT: movl {{[0-9]+}}(%esp), %edi +; ASM-NEXT: testb $1, {{[0-9]+}}(%esp) +; ASM-NEXT: movl {{[0-9]+}}(%esp), %esi +; ASM-NEXT: BUNDLE +; ASM-NEXT: movl {{[0-9]+}}(%esp), %esi +; ASM-NEXT: BUNDLE +; ASM-NEXT: popl %esi +; ASM-NEXT: .cfi_def_cfa_offset 16 +; ASM-NEXT: popl %edi +; ASM-NEXT: .cfi_def_cfa_offset 12 +; ASM-NEXT: popl %ebx +; ASM-NEXT: .cfi_def_cfa_offset 8 +; ASM-NEXT: popl %ebp +; ASM-NEXT: .cfi_def_cfa_offset 4 +; ASM-NEXT: retl +; First bundle for low 32 bits +; Second bundle for high 32 bits + + + %1 = call i64 @llvm.ct.select.i64(i1 %cond, i64 %a, i64 %b) + ret i64 %1 +} + +define i32 @test_ctselect_different_conditions(i32 %x, i32 %y, i32 %a, i32 %b) { +; Test different comparison conditions to ensure they all use the same bundle pattern +; ASM-LABEL: test_ctselect_different_conditions: +; ASM: # %bb.0: +; ASM-NEXT: pushl %ebx +; ASM-NEXT: .cfi_def_cfa_offset 8 +; ASM-NEXT: pushl %edi +; ASM-NEXT: .cfi_def_cfa_offset 12 +; ASM-NEXT: pushl %esi +; ASM-NEXT: .cfi_def_cfa_offset 16 +; ASM-NEXT: .cfi_offset %esi, -16 +; ASM-NEXT: .cfi_offset %edi, -12 +; ASM-NEXT: .cfi_offset %ebx, -8 +; ASM-NEXT: movl {{[0-9]+}}(%esp), %ecx +; ASM-NEXT: movl {{[0-9]+}}(%esp), %edx +; ASM-NEXT: movl {{[0-9]+}}(%esp), %eax +; ASM-NEXT: cmpl {{[0-9]+}}(%esp), %eax +; ASM-NEXT: setl %al +; ASM-NEXT: testb %al, %al +; ASM-NEXT: BUNDLE +; ASM-NEXT: popl %esi +; ASM-NEXT: .cfi_def_cfa_offset 12 +; ASM-NEXT: popl %edi +; ASM-NEXT: .cfi_def_cfa_offset 8 +; ASM-NEXT: popl %ebx +; ASM-NEXT: .cfi_def_cfa_offset 4 +; ASM-NEXT: retl + + + %cond = icmp slt i32 %x, %y + %1 = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) + ret i32 %1 +} + +define i32 @test_ctselect_constant_operands(i1 %cond) { +; Test with constant operands to ensure no special optimizations break bundling +; ASM-LABEL: test_ctselect_constant_operands: +; ASM: # %bb.0: +; ASM-NEXT: pushl %ebx +; ASM-NEXT: .cfi_def_cfa_offset 8 +; ASM-NEXT: pushl %edi +; ASM-NEXT: .cfi_def_cfa_offset 12 +; ASM-NEXT: pushl %esi +; 
ASM-NEXT: .cfi_def_cfa_offset 16 +; ASM-NEXT: .cfi_offset %esi, -16 +; ASM-NEXT: .cfi_offset %edi, -12 +; ASM-NEXT: .cfi_offset %ebx, -8 +; ASM-NEXT: xorl %ecx, %ecx +; ASM-NEXT: testb $1, {{[0-9]+}}(%esp) +; ASM-NEXT: movl $42, %edx +; ASM-NEXT: BUNDLE +; ASM-NEXT: popl %esi +; ASM-NEXT: .cfi_def_cfa_offset 12 +; ASM-NEXT: popl %edi +; ASM-NEXT: .cfi_def_cfa_offset 8 +; ASM-NEXT: popl %ebx +; ASM-NEXT: .cfi_def_cfa_offset 4 +; ASM-NEXT: retl + + + %1 = call i32 @llvm.ct.select.i32(i1 %cond, i32 42, i32 0) + ret i32 %1 +} + +; Verify that each bundle contains exactly the expected constant-time sequence: +; 1. SETCCr - Set condition code to register (sete/setne/setl etc) +; 2. MOVZX32rr8 - Zero-extend 8-bit to 32-bit +; 3. NEG32r/NEG16r - Negate to create bitmask (0 -> 0, 1 -> 0xFFFFFFFF) +; 4. MOV32rr/MOV16rr - Copy mask +; 5. XOR32ri/XOR16ri - Create inverted mask (~mask) +; 6. MOV32rr/MOV16rr - Copy first operand +; 7. AND32rr/AND16rr - Mask first operand (operand & mask) +; 8. MOV32rr/MOV16rr - Copy second operand +; 9. AND32rr/AND16rr - Mask second operand (operand & ~mask) +; 10. OR32rr/OR16rr - Combine results (result = (a & mask) | (b & ~mask)) + +declare i16 @llvm.ct.select.i16(i1, i16, i16) +declare i32 @llvm.ct.select.i32(i1, i32, i32) +declare i64 @llvm.ct.select.i64(i1, i64, i64) +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; BUNDLE: {{.*}} diff --git a/llvm/test/CodeGen/X86/ctselect-i386-security.ll b/llvm/test/CodeGen/X86/ctselect-i386-security.ll new file mode 100644 index 0000000000000..3cec0cf2c6df4 --- /dev/null +++ b/llvm/test/CodeGen/X86/ctselect-i386-security.ll @@ -0,0 +1,141 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mattr=-cmov | FileCheck %s --check-prefix=SECURE +; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mattr=-cmov -verify-machineinstrs | FileCheck %s --check-prefix=SECURE + +; Verify security properties of i386 CTSELECT post-RA expansion +; This test ensures: +; 1. No conditional branches (jcc instructions) in output +; 2. Both true and false values are always processed +; 3. No conditional memory accesses +; 4. 
Instructions are bundled for atomic treatment + +; Test: No conditional branches should appear in constant-time path +define i32 @test_no_conditional_branches(i32 %secret, i32 %val1, i32 %val2) nounwind { +; SECURE-LABEL: test_no_conditional_branches: +; SECURE: # %bb.0: +; SECURE-NEXT: pushl %ebx +; SECURE-NEXT: pushl %edi +; SECURE-NEXT: pushl %esi +; SECURE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; SECURE-NEXT: movl {{[0-9]+}}(%esp), %edx +; SECURE-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; SECURE-NEXT: setne %al +; SECURE-NEXT: testb %al, %al +; SECURE-NEXT: BUNDLE +; SECURE-NEXT: popl %esi +; SECURE-NEXT: popl %edi +; SECURE-NEXT: popl %ebx +; SECURE-NEXT: retl + %cond = icmp ne i32 %secret, 0 + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %val1, i32 %val2) + ret i32 %result +} + +; Test: Both values must always be processed (no conditional loads) +define i32 @test_always_process_both_values(i32 %secret, i32* %ptr1, i32* %ptr2) nounwind { +; SECURE-LABEL: test_always_process_both_values: +; SECURE: # %bb.0: +; SECURE-NEXT: pushl %ebx +; SECURE-NEXT: pushl %edi +; SECURE-NEXT: pushl %esi +; SECURE-NEXT: movl {{[0-9]+}}(%esp), %eax +; SECURE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; SECURE-NEXT: movl (%ecx), %ecx +; SECURE-NEXT: movl (%eax), %edx +; SECURE-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; SECURE-NEXT: setne %al +; SECURE-NEXT: testb %al, %al +; SECURE-NEXT: BUNDLE +; SECURE-NEXT: popl %esi +; SECURE-NEXT: popl %edi +; SECURE-NEXT: popl %ebx +; SECURE-NEXT: retl + %val1 = load i32, i32* %ptr1 + %val2 = load i32, i32* %ptr2 + %cond = icmp ne i32 %secret, 0 + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %val1, i32 %val2) + ret i32 %result +} + +; Test: 64-bit constant-time selection on i386 +define i64 @test_i64_constant_time(i32 %secret, i64 %val1, i64 %val2) nounwind { +; SECURE-LABEL: test_i64_constant_time: +; SECURE: # %bb.0: +; SECURE-NEXT: pushl %ebp +; SECURE-NEXT: pushl %ebx +; SECURE-NEXT: pushl %edi +; SECURE-NEXT: pushl %esi +; SECURE-NEXT: movl {{[0-9]+}}(%esp), %edx +; SECURE-NEXT: movl {{[0-9]+}}(%esp), %edi +; SECURE-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; SECURE-NEXT: setne %al +; SECURE-NEXT: testb %al, %al +; SECURE-NEXT: movl {{[0-9]+}}(%esp), %esi +; SECURE-NEXT: BUNDLE +; SECURE-NEXT: movl {{[0-9]+}}(%esp), %esi +; SECURE-NEXT: BUNDLE +; SECURE-NEXT: popl %esi +; SECURE-NEXT: popl %edi +; SECURE-NEXT: popl %ebx +; SECURE-NEXT: popl %ebp +; SECURE-NEXT: retl + %cond = icmp ne i32 %secret, 0 + %result = call i64 @llvm.ct.select.i64(i1 %cond, i64 %val1, i64 %val2) + ret i64 %result +} + +; Test: Verify instruction bundling prevents optimization between operations +define i32 @test_bundle_atomicity(i32 %secret, i32 %val1, i32 %val2) nounwind { +; SECURE-LABEL: test_bundle_atomicity: +; SECURE: # %bb.0: +; SECURE-NEXT: pushl %ebx +; SECURE-NEXT: pushl %edi +; SECURE-NEXT: pushl %esi +; SECURE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; SECURE-NEXT: movl {{[0-9]+}}(%esp), %edx +; SECURE-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; SECURE-NEXT: setne %al +; SECURE-NEXT: testb %al, %al +; SECURE-NEXT: BUNDLE +; SECURE-NEXT: popl %esi +; SECURE-NEXT: popl %edi +; SECURE-NEXT: popl %ebx +; SECURE-NEXT: retl + %cond = icmp ne i32 %secret, 0 + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %val1, i32 %val2) + ret i32 %result +} + +; Test: Multiple CTSELECT operations should maintain constant-time properties +define i32 @test_multiple_ctselect(i32 %secret1, i32 %secret2, i32 %val1, i32 %val2, i32 %val3, i32 %val4) nounwind { +; SECURE-LABEL: test_multiple_ctselect: +; SECURE: # %bb.0: +; 
SECURE-NEXT: pushl %ebp +; SECURE-NEXT: pushl %ebx +; SECURE-NEXT: pushl %edi +; SECURE-NEXT: pushl %esi +; SECURE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; SECURE-NEXT: movl {{[0-9]+}}(%esp), %eax +; SECURE-NEXT: movl {{[0-9]+}}(%esp), %edx +; SECURE-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; SECURE-NEXT: setne %bl +; SECURE-NEXT: testb %bl, %bl +; SECURE-NEXT: BUNDLE +; SECURE-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; SECURE-NEXT: setne %al +; SECURE-NEXT: testb %al, %al +; SECURE-NEXT: BUNDLE +; SECURE-NEXT: popl %esi +; SECURE-NEXT: popl %edi +; SECURE-NEXT: popl %ebx +; SECURE-NEXT: popl %ebp +; SECURE-NEXT: retl + %cond1 = icmp ne i32 %secret1, 0 + %tmp = call i32 @llvm.ct.select.i32(i1 %cond1, i32 %val1, i32 %val2) + %cond2 = icmp ne i32 %secret2, 0 + %result = call i32 @llvm.ct.select.i32(i1 %cond2, i32 %tmp, i32 %val3) + ret i32 %result +} + +declare i32 @llvm.ct.select.i32(i1, i32, i32) +declare i64 @llvm.ct.select.i64(i1, i64, i64) diff --git a/llvm/test/CodeGen/X86/ctselect-i386.ll b/llvm/test/CodeGen/X86/ctselect-i386.ll new file mode 100644 index 0000000000000..5e63f357d3c44 --- /dev/null +++ b/llvm/test/CodeGen/X86/ctselect-i386.ll @@ -0,0 +1,312 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mattr=-cmov | FileCheck %s --check-prefix=I386-NOCMOV +; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mattr=+cmov | FileCheck %s --check-prefix=I386-CMOV +; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mattr=-cmov -verify-machineinstrs | FileCheck %s --check-prefix=I386-NOCMOV + +; Test CTSELECT post-RA expansion on i386 targets +; - Without CMOV: constant-time implementation using new post-RA expansion +; - With CMOV: CMOV-based implementation +; All expansion happens post-RA for better optimization control and constant-time guarantees + +define i32 @test_ctselect_i32_reg(i32 %a, i32 %b, i32 %c) nounwind { +; I386-NOCMOV-LABEL: test_ctselect_i32_reg: +; I386-NOCMOV: # %bb.0: +; I386-NOCMOV-NEXT: pushl %ebx +; I386-NOCMOV-NEXT: pushl %edi +; I386-NOCMOV-NEXT: pushl %esi +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-NOCMOV-NEXT: cmpl %edx, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: sete %al +; I386-NOCMOV-NEXT: testb %al, %al +; I386-NOCMOV-NEXT: BUNDLE +; I386-NOCMOV-NEXT: popl %esi +; I386-NOCMOV-NEXT: popl %edi +; I386-NOCMOV-NEXT: popl %ebx +; I386-NOCMOV-NEXT: retl +; +; I386-CMOV-LABEL: test_ctselect_i32_reg: +; I386-CMOV: # %bb.0: +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; I386-CMOV-NEXT: cmpl %eax, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: sete %cl +; I386-CMOV-NEXT: testb %cl, %cl +; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %eax +; I386-CMOV-NEXT: retl + %cond = icmp eq i32 %a, %c + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %b, i32 %c) + ret i32 %result +} + +define i16 @test_ctselect_i16_reg(i16 %a, i16 %b, i16 %c) nounwind { +; I386-NOCMOV-LABEL: test_ctselect_i16_reg: +; I386-NOCMOV: # %bb.0: +; I386-NOCMOV-NEXT: pushl %ebx +; I386-NOCMOV-NEXT: pushl %edi +; I386-NOCMOV-NEXT: pushl %esi +; I386-NOCMOV-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; I386-NOCMOV-NEXT: movzwl {{[0-9]+}}(%esp), %edx +; I386-NOCMOV-NEXT: cmpw %dx, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: sete %al +; I386-NOCMOV-NEXT: testb %al, %al +; I386-NOCMOV-NEXT: BUNDLE +; I386-NOCMOV-NEXT: popl %esi +; I386-NOCMOV-NEXT: popl %edi +; I386-NOCMOV-NEXT: popl %ebx +; I386-NOCMOV-NEXT: retl +; +; I386-CMOV-LABEL: test_ctselect_i16_reg: +; I386-CMOV: # %bb.0: +; I386-CMOV-NEXT: movzwl 
{{[0-9]+}}(%esp), %eax +; I386-CMOV-NEXT: cmpw %ax, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: sete %cl +; I386-CMOV-NEXT: testb %cl, %cl +; I386-CMOV-NEXT: cmovnew {{[0-9]+}}(%esp), %ax +; I386-CMOV-NEXT: retl + %cond = icmp eq i16 %a, %c + %result = call i16 @llvm.ct.select.i16(i1 %cond, i16 %b, i16 %c) + ret i16 %result +} + +define i64 @test_ctselect_i64_reg(i64 %a, i64 %b, i64 %c) nounwind { +; I386-NOCMOV-LABEL: test_ctselect_i64_reg: +; I386-NOCMOV: # %bb.0: +; I386-NOCMOV-NEXT: pushl %ebp +; I386-NOCMOV-NEXT: pushl %ebx +; I386-NOCMOV-NEXT: pushl %edi +; I386-NOCMOV-NEXT: pushl %esi +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edi +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; I386-NOCMOV-NEXT: xorl %edi, %eax +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ebx +; I386-NOCMOV-NEXT: xorl %edx, %ebx +; I386-NOCMOV-NEXT: orl %eax, %ebx +; I386-NOCMOV-NEXT: sete %al +; I386-NOCMOV-NEXT: testb %al, %al +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %esi +; I386-NOCMOV-NEXT: BUNDLE +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %esi +; I386-NOCMOV-NEXT: BUNDLE +; I386-NOCMOV-NEXT: popl %esi +; I386-NOCMOV-NEXT: popl %edi +; I386-NOCMOV-NEXT: popl %ebx +; I386-NOCMOV-NEXT: popl %ebp +; I386-NOCMOV-NEXT: retl +; +; I386-CMOV-LABEL: test_ctselect_i64_reg: +; I386-CMOV: # %bb.0: +; I386-CMOV-NEXT: pushl %esi +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-CMOV-NEXT: xorl %edx, %ecx +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %esi +; I386-CMOV-NEXT: xorl %eax, %esi +; I386-CMOV-NEXT: orl %ecx, %esi +; I386-CMOV-NEXT: sete %cl +; I386-CMOV-NEXT: testb %cl, %cl +; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %eax +; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %edx +; I386-CMOV-NEXT: popl %esi +; I386-CMOV-NEXT: retl + %cond = icmp eq i64 %a, %c + %result = call i64 @llvm.ct.select.i64(i1 %cond, i64 %b, i64 %c) + ret i64 %result +} + +define i32 @test_ctselect_i32_mem(i32 %a, i32* %b_ptr, i32 %c) nounwind { +; I386-NOCMOV-LABEL: test_ctselect_i32_mem: +; I386-NOCMOV: # %bb.0: +; I386-NOCMOV-NEXT: pushl %ebx +; I386-NOCMOV-NEXT: pushl %edi +; I386-NOCMOV-NEXT: pushl %esi +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; I386-NOCMOV-NEXT: movl (%eax), %edx +; I386-NOCMOV-NEXT: cmpl %ecx, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: sete %al +; I386-NOCMOV-NEXT: testb %al, %al +; I386-NOCMOV-NEXT: BUNDLE +; I386-NOCMOV-NEXT: popl %esi +; I386-NOCMOV-NEXT: popl %edi +; I386-NOCMOV-NEXT: popl %ebx +; I386-NOCMOV-NEXT: retl +; +; I386-CMOV-LABEL: test_ctselect_i32_mem: +; I386-CMOV: # %bb.0: +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; I386-CMOV-NEXT: cmpl %eax, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: sete %dl +; I386-CMOV-NEXT: testb %dl, %dl +; I386-CMOV-NEXT: cmovnel (%ecx), %eax +; I386-CMOV-NEXT: retl + %b = load i32, i32* %b_ptr + %cond = icmp eq i32 %a, %c + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %b, i32 %c) + ret i32 %result +} + +; Test various condition codes +define i32 @test_ctselect_different_cond(i32 %a, i32 %b, i32 %c) nounwind { +; I386-NOCMOV-LABEL: test_ctselect_different_cond: +; I386-NOCMOV: # %bb.0: +; I386-NOCMOV-NEXT: pushl %ebx +; I386-NOCMOV-NEXT: pushl %edi +; I386-NOCMOV-NEXT: pushl %esi +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-NOCMOV-NEXT: cmpl %edx, {{[0-9]+}}(%esp) +; 
I386-NOCMOV-NEXT: setl %al +; I386-NOCMOV-NEXT: testb %al, %al +; I386-NOCMOV-NEXT: BUNDLE +; I386-NOCMOV-NEXT: popl %esi +; I386-NOCMOV-NEXT: popl %edi +; I386-NOCMOV-NEXT: popl %ebx +; I386-NOCMOV-NEXT: retl +; +; I386-CMOV-LABEL: test_ctselect_different_cond: +; I386-CMOV: # %bb.0: +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; I386-CMOV-NEXT: cmpl %eax, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: setl %cl +; I386-CMOV-NEXT: testb %cl, %cl +; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %eax +; I386-CMOV-NEXT: retl + %cond = icmp slt i32 %a, %c + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %b, i32 %c) + ret i32 %result +} + +; Verify no conditional branches in constant-time version +define i32 @test_no_branches(i32 %secret, i32 %public1, i32 %public2) nounwind { +; I386-NOCMOV-LABEL: test_no_branches: +; I386-NOCMOV: # %bb.0: +; I386-NOCMOV-NEXT: pushl %ebx +; I386-NOCMOV-NEXT: pushl %edi +; I386-NOCMOV-NEXT: pushl %esi +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-NOCMOV-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: setne %al +; I386-NOCMOV-NEXT: testb %al, %al +; I386-NOCMOV-NEXT: BUNDLE +; I386-NOCMOV-NEXT: popl %esi +; I386-NOCMOV-NEXT: popl %edi +; I386-NOCMOV-NEXT: popl %ebx +; I386-NOCMOV-NEXT: retl +; +; I386-CMOV-LABEL: test_no_branches: +; I386-CMOV: # %bb.0: +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; I386-CMOV-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: setne %cl +; I386-CMOV-NEXT: testb %cl, %cl +; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %eax +; I386-CMOV-NEXT: retl + %cond = icmp ne i32 %secret, 0 + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %public1, i32 %public2) + ret i32 %result +} + +; Test edge cases for post-RA expansion +define i32 @test_ctselect_zero_one(i32 %cond) nounwind { +; I386-NOCMOV-LABEL: test_ctselect_zero_one: +; I386-NOCMOV: # %bb.0: +; I386-NOCMOV-NEXT: pushl %ebx +; I386-NOCMOV-NEXT: pushl %edi +; I386-NOCMOV-NEXT: pushl %esi +; I386-NOCMOV-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: setne %al +; I386-NOCMOV-NEXT: xorl %ecx, %ecx +; I386-NOCMOV-NEXT: testb %al, %al +; I386-NOCMOV-NEXT: movl $1, %edx +; I386-NOCMOV-NEXT: BUNDLE +; I386-NOCMOV-NEXT: popl %esi +; I386-NOCMOV-NEXT: popl %edi +; I386-NOCMOV-NEXT: popl %ebx +; I386-NOCMOV-NEXT: retl +; +; I386-CMOV-LABEL: test_ctselect_zero_one: +; I386-CMOV: # %bb.0: +; I386-CMOV-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: setne %cl +; I386-CMOV-NEXT: xorl %eax, %eax +; I386-CMOV-NEXT: testb %cl, %cl +; I386-CMOV-NEXT: movl $1, %ecx +; I386-CMOV-NEXT: cmovnel %ecx, %eax +; I386-CMOV-NEXT: retl + %test = icmp ne i32 %cond, 0 + %result = call i32 @llvm.ct.select.i32(i1 %test, i32 1, i32 0) + ret i32 %result +} + +; Test bundling behavior - instructions should be bundled for atomic treatment +define i32 @test_ctselect_bundling(i32 %a, i32 %b, i32 %c) nounwind { +; I386-NOCMOV-LABEL: test_ctselect_bundling: +; I386-NOCMOV: # %bb.0: +; I386-NOCMOV-NEXT: pushl %ebx +; I386-NOCMOV-NEXT: pushl %edi +; I386-NOCMOV-NEXT: pushl %esi +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-NOCMOV-NEXT: cmpl %edx, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: sete %al +; I386-NOCMOV-NEXT: testb %al, %al +; I386-NOCMOV-NEXT: BUNDLE +; I386-NOCMOV-NEXT: popl %esi +; I386-NOCMOV-NEXT: popl %edi +; I386-NOCMOV-NEXT: popl %ebx +; I386-NOCMOV-NEXT: retl +; +; I386-CMOV-LABEL: test_ctselect_bundling: +; I386-CMOV: # %bb.0: +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; 
I386-CMOV-NEXT: cmpl %eax, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: sete %cl +; I386-CMOV-NEXT: testb %cl, %cl +; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %eax +; I386-CMOV-NEXT: retl + %cond = icmp eq i32 %a, %c + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %b, i32 %c) + ret i32 %result +} + +; Test i8 promotion to i32 for i386 +define i8 @test_ctselect_i8_promotion(i8 %a, i8 %b, i8 %c) nounwind { +; I386-NOCMOV-LABEL: test_ctselect_i8_promotion: +; I386-NOCMOV: # %bb.0: +; I386-NOCMOV-NEXT: pushl %ebx +; I386-NOCMOV-NEXT: pushl %edi +; I386-NOCMOV-NEXT: pushl %esi +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-NOCMOV-NEXT: cmpb %cl, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: sete %al +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-NOCMOV-NEXT: testb %al, %al +; I386-NOCMOV-NEXT: BUNDLE +; I386-NOCMOV-NEXT: # kill: def $al killed $al killed $eax +; I386-NOCMOV-NEXT: popl %esi +; I386-NOCMOV-NEXT: popl %edi +; I386-NOCMOV-NEXT: popl %ebx +; I386-NOCMOV-NEXT: retl +; +; I386-CMOV-LABEL: test_ctselect_i8_promotion: +; I386-CMOV: # %bb.0: +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; I386-CMOV-NEXT: cmpb %al, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: sete %cl +; I386-CMOV-NEXT: testb %cl, %cl +; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %eax +; I386-CMOV-NEXT: # kill: def $al killed $al killed $eax +; I386-CMOV-NEXT: retl + %cond = icmp eq i8 %a, %c + %result = call i8 @llvm.ct.select.i8(i1 %cond, i8 %b, i8 %c) + ret i8 %result +} + +declare i8 @llvm.ct.select.i8(i1, i8, i8) +declare i16 @llvm.ct.select.i16(i1, i16, i16) +declare i32 @llvm.ct.select.i32(i1, i32, i32) +declare i64 @llvm.ct.select.i64(i1, i64, i64) From 2102d145334c9f7d7f1303de2fde7dcd24a48820 Mon Sep 17 00:00:00 2001 From: Henrik Brodin <90325907+hbrodin@users.noreply.github.com> Date: Fri, 29 Aug 2025 10:02:27 +0200 Subject: [PATCH 49/63] [CT] VR64 support --- llvm/lib/Target/X86/X86InstrCompiler.td | 9 + llvm/lib/Target/X86/X86InstrInfo.cpp | 4 + llvm/lib/Target/X86/X86InstrInfo.h | 3 + .../CodeGen/X86/ctselect-constant-time.ll | 159 ------ .../X86/ctselect-i386-bundle-expansion.ll | 184 ------- llvm/test/CodeGen/X86/ctselect-i386-fp.ll | 465 ++++++++++++++++++ llvm/test/CodeGen/X86/ctselect-i386-mmx.ll | 343 +++++++++++++ .../CodeGen/X86/ctselect-i386-security.ll | 141 ------ llvm/test/CodeGen/X86/ctselect-i386.ll | 231 +++------ 9 files changed, 888 insertions(+), 651 deletions(-) delete mode 100644 llvm/test/CodeGen/X86/ctselect-constant-time.ll delete mode 100644 llvm/test/CodeGen/X86/ctselect-i386-bundle-expansion.ll create mode 100644 llvm/test/CodeGen/X86/ctselect-i386-fp.ll create mode 100644 llvm/test/CodeGen/X86/ctselect-i386-mmx.ll delete mode 100644 llvm/test/CodeGen/X86/ctselect-i386-security.ll diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td b/llvm/lib/Target/X86/X86InstrCompiler.td index a7a2986fa9548..69a01801c6fb5 100644 --- a/llvm/lib/Target/X86/X86InstrCompiler.td +++ b/llvm/lib/Target/X86/X86InstrCompiler.td @@ -747,6 +747,9 @@ let isCodeGenOnly = 1, hasSideEffects = 1, ForceDisassemble = 1 in { defm CTSELECT_I386_INT_GR16 : CTSELECT_I386_INTERNAL; defm CTSELECT_I386_INT_GR32 : CTSELECT_I386_INTERNAL; } + let Predicates = [NoCMOV, HasMMX] in { + defm CTSELECT_I386_VR64 : CTSELECT_I386_VR64; + } } let usesCustomInserter = 1, @@ -783,6 +786,12 @@ let Predicates = [NoNativeCMOV] in { // i64 patterns handled automatically by type legalization } +// Pattern matching for VR64 CTSELECT on i386 without CMOV (routes to post-RA expansion) +let Predicates = [NoCMOV, Not64BitMode, HasMMX] in { + 
def : Pat<(x86mmx(X86ctselect VR64:$src1, VR64:$src2, timm:$cond, EFLAGS)), + (CTSELECT_I386_VR64rr VR64:$src1, VR64:$src2, timm:$cond)>; +} + //===----------------------------------------------------------------------===// // Normal-Instructions-With-Lock-Prefix Pseudo Instructions //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp index 1bd36bf7a9172..fa98d0fc4cf27 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -7003,6 +7003,10 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { case X86::CTSELECT_I386_GR16rr: case X86::CTSELECT_I386_GR32rr: return expandCtSelectI386(MI); + + // VR64-specific CTSELECT expansion (post-RA, constant-time) + case X86::CTSELECT_I386_VR64rr: + return expandCtSelectI386VR64(MI); } return false; } diff --git a/llvm/lib/Target/X86/X86InstrInfo.h b/llvm/lib/Target/X86/X86InstrInfo.h index c4e2c4ee44460..064445d277574 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.h +++ b/llvm/lib/Target/X86/X86InstrInfo.h @@ -733,6 +733,9 @@ class X86InstrInfo final : public X86GenInstrInfo { /// Expand i386-specific CTSELECT pseudo instructions (post-RA, constant-time) bool expandCtSelectI386(MachineInstr &MI) const; + /// Expand VR64-specific CTSELECT pseudo instructions (post-RA, constant-time) + bool expandCtSelectI386VR64(MachineInstr &MI) const; + /// Returns true iff the routine could find two commutable operands in the /// given machine instruction with 3 vector inputs. /// The 'SrcOpIdx1' and 'SrcOpIdx2' are INPUT and OUTPUT arguments. Their diff --git a/llvm/test/CodeGen/X86/ctselect-constant-time.ll b/llvm/test/CodeGen/X86/ctselect-constant-time.ll deleted file mode 100644 index d4996e13cf047..0000000000000 --- a/llvm/test/CodeGen/X86/ctselect-constant-time.ll +++ /dev/null @@ -1,159 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mattr=-cmov | FileCheck %s --check-prefix=CONSTANT-TIME -; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mattr=-cmov -verify-machineinstrs | FileCheck %s --check-prefix=CONSTANT-TIME - -; Test constant-time properties of CTSELECT implementation on i386 -; This test verifies that: -; 1. No conditional branches are generated -; 2. Both true and false values are always loaded/computed -; 3. 
No conditional memory loads are performed - -define i32 @crypto_key_select(i32 %secret_bit, i32 %key1, i32 %key2) nounwind { -; CONSTANT-TIME-LABEL: crypto_key_select: -; CONSTANT-TIME: # %bb.0: -; CONSTANT-TIME-NEXT: pushl %ebx -; CONSTANT-TIME-NEXT: pushl %edi -; CONSTANT-TIME-NEXT: pushl %esi -; CONSTANT-TIME-NEXT: movl {{[0-9]+}}(%esp), %ecx -; CONSTANT-TIME-NEXT: movl {{[0-9]+}}(%esp), %edx -; CONSTANT-TIME-NEXT: cmpl $0, {{[0-9]+}}(%esp) -; CONSTANT-TIME-NEXT: setne %al -; CONSTANT-TIME-NEXT: testb %al, %al -; CONSTANT-TIME-NEXT: BUNDLE -; CONSTANT-TIME-NEXT: popl %esi -; CONSTANT-TIME-NEXT: popl %edi -; CONSTANT-TIME-NEXT: popl %ebx -; CONSTANT-TIME-NEXT: retl - %cond = icmp ne i32 %secret_bit, 0 - %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %key1, i32 %key2) - ret i32 %result -} - -define i32 @constant_memory_access(i32 %secret, i32* %data1, i32* %data2) nounwind { -; CONSTANT-TIME-LABEL: constant_memory_access: -; CONSTANT-TIME: # %bb.0: -; CONSTANT-TIME-NEXT: pushl %ebx -; CONSTANT-TIME-NEXT: pushl %edi -; CONSTANT-TIME-NEXT: pushl %esi -; CONSTANT-TIME-NEXT: movl {{[0-9]+}}(%esp), %eax -; CONSTANT-TIME-NEXT: movl {{[0-9]+}}(%esp), %ecx -; CONSTANT-TIME-NEXT: movl (%ecx), %ecx -; CONSTANT-TIME-NEXT: movl (%eax), %edx -; CONSTANT-TIME-NEXT: cmpl $0, {{[0-9]+}}(%esp) -; CONSTANT-TIME-NEXT: setne %al -; CONSTANT-TIME-NEXT: testb %al, %al -; CONSTANT-TIME-NEXT: BUNDLE -; CONSTANT-TIME-NEXT: popl %esi -; CONSTANT-TIME-NEXT: popl %edi -; CONSTANT-TIME-NEXT: popl %ebx -; CONSTANT-TIME-NEXT: retl - %val1 = load i32, i32* %data1 - %val2 = load i32, i32* %data2 - %cond = icmp ne i32 %secret, 0 - %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %val1, i32 %val2) - ret i32 %result -} - -define i64 @crypto_key_select_64bit(i32 %secret_bit, i64 %key1, i64 %key2) nounwind { -; CONSTANT-TIME-LABEL: crypto_key_select_64bit: -; CONSTANT-TIME: # %bb.0: -; CONSTANT-TIME-NEXT: pushl %ebp -; CONSTANT-TIME-NEXT: pushl %ebx -; CONSTANT-TIME-NEXT: pushl %edi -; CONSTANT-TIME-NEXT: pushl %esi -; CONSTANT-TIME-NEXT: movl {{[0-9]+}}(%esp), %edx -; CONSTANT-TIME-NEXT: movl {{[0-9]+}}(%esp), %edi -; CONSTANT-TIME-NEXT: cmpl $0, {{[0-9]+}}(%esp) -; CONSTANT-TIME-NEXT: setne %al -; CONSTANT-TIME-NEXT: testb %al, %al -; CONSTANT-TIME-NEXT: movl {{[0-9]+}}(%esp), %esi -; CONSTANT-TIME-NEXT: BUNDLE -; CONSTANT-TIME-NEXT: movl {{[0-9]+}}(%esp), %esi -; CONSTANT-TIME-NEXT: BUNDLE -; CONSTANT-TIME-NEXT: popl %esi -; CONSTANT-TIME-NEXT: popl %edi -; CONSTANT-TIME-NEXT: popl %ebx -; CONSTANT-TIME-NEXT: popl %ebp -; CONSTANT-TIME-NEXT: retl - %cond = icmp ne i32 %secret_bit, 0 - %result = call i64 @llvm.ct.select.i64(i1 %cond, i64 %key1, i64 %key2) - ret i64 %result -} - -define i32 @array_index_protection(i32 %secret_index, i32* %array) nounwind { -; CONSTANT-TIME-LABEL: array_index_protection: -; CONSTANT-TIME: # %bb.0: -; CONSTANT-TIME-NEXT: pushl %ebx -; CONSTANT-TIME-NEXT: pushl %edi -; CONSTANT-TIME-NEXT: pushl %esi -; CONSTANT-TIME-NEXT: movl {{[0-9]+}}(%esp), %eax -; CONSTANT-TIME-NEXT: movl (%eax), %ecx -; CONSTANT-TIME-NEXT: movl 4(%eax), %edx -; CONSTANT-TIME-NEXT: cmpl $0, {{[0-9]+}}(%esp) -; CONSTANT-TIME-NEXT: setne %al -; CONSTANT-TIME-NEXT: testb %al, %al -; CONSTANT-TIME-NEXT: BUNDLE -; CONSTANT-TIME-NEXT: popl %esi -; CONSTANT-TIME-NEXT: popl %edi -; CONSTANT-TIME-NEXT: popl %ebx -; CONSTANT-TIME-NEXT: retl - %val0 = load i32, i32* %array - %ptr1 = getelementptr i32, i32* %array, i32 1 - %val1 = load i32, i32* %ptr1 - %cond = icmp ne i32 %secret_index, 0 - %result = call i32 
@llvm.ct.select.i32(i1 %cond, i32 %val1, i32 %val0) - ret i32 %result -} - -; Test that complex expressions also work in constant-time -define i32 @complex_crypto_operation(i32 %secret, i32 %a, i32 %b) nounwind { -; CONSTANT-TIME-LABEL: complex_crypto_operation: -; CONSTANT-TIME: # %bb.0: -; CONSTANT-TIME-NEXT: pushl %ebx -; CONSTANT-TIME-NEXT: pushl %edi -; CONSTANT-TIME-NEXT: pushl %esi -; CONSTANT-TIME-NEXT: movl {{[0-9]+}}(%esp), %ecx -; CONSTANT-TIME-NEXT: movl {{[0-9]+}}(%esp), %edx -; CONSTANT-TIME-NEXT: addl %ecx, %edx -; CONSTANT-TIME-NEXT: imull %ecx, %ecx -; CONSTANT-TIME-NEXT: cmpl $0, {{[0-9]+}}(%esp) -; CONSTANT-TIME-NEXT: setne %al -; CONSTANT-TIME-NEXT: testb %al, %al -; CONSTANT-TIME-NEXT: BUNDLE -; CONSTANT-TIME-NEXT: popl %esi -; CONSTANT-TIME-NEXT: popl %edi -; CONSTANT-TIME-NEXT: popl %ebx -; CONSTANT-TIME-NEXT: retl - %expr1 = add i32 %a, %b - %expr2 = mul i32 %b, %b - %cond = icmp ne i32 %secret, 0 - %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %expr1, i32 %expr2) - ret i32 %result -} - -; Verify that bundling prevents instruction reordering -; This is harder to test directly, but we can check that instructions appear -; in the expected sequence and no optimization has moved them around -define i32 @verify_bundling(i32 %secret, i32 %val1, i32 %val2) nounwind { -; CONSTANT-TIME-LABEL: verify_bundling: -; CONSTANT-TIME: # %bb.0: -; CONSTANT-TIME-NEXT: pushl %ebx -; CONSTANT-TIME-NEXT: pushl %edi -; CONSTANT-TIME-NEXT: pushl %esi -; CONSTANT-TIME-NEXT: movl {{[0-9]+}}(%esp), %ecx -; CONSTANT-TIME-NEXT: movl {{[0-9]+}}(%esp), %edx -; CONSTANT-TIME-NEXT: cmpl $0, {{[0-9]+}}(%esp) -; CONSTANT-TIME-NEXT: setne %al -; CONSTANT-TIME-NEXT: testb %al, %al -; CONSTANT-TIME-NEXT: BUNDLE -; CONSTANT-TIME-NEXT: popl %esi -; CONSTANT-TIME-NEXT: popl %edi -; CONSTANT-TIME-NEXT: popl %ebx -; CONSTANT-TIME-NEXT: retl - %cond = icmp ne i32 %secret, 0 - %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %val1, i32 %val2) - ret i32 %result -} - -declare i32 @llvm.ct.select.i32(i1, i32, i32) -declare i64 @llvm.ct.select.i64(i1, i64, i64) diff --git a/llvm/test/CodeGen/X86/ctselect-i386-bundle-expansion.ll b/llvm/test/CodeGen/X86/ctselect-i386-bundle-expansion.ll deleted file mode 100644 index 2be7f8ac60578..0000000000000 --- a/llvm/test/CodeGen/X86/ctselect-i386-bundle-expansion.ll +++ /dev/null @@ -1,184 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=i386-unknown-linux-gnu -mattr=-cmov -print-after=pseudo-probe-inserter < %s -o /dev/null 2>&1 | FileCheck %s --check-prefix=BUNDLE -; RUN: llc -mtriple=i386-unknown-linux-gnu -mattr=-cmov < %s | FileCheck %s --check-prefix=ASM - -; Test that CTSELECT expansion creates proper constant-time bundles with correct instruction sequences - -define i32 @test_ctselect_i32_bundle_expansion(i1 %cond, i32 %a, i32 %b) { -; ASM-LABEL: test_ctselect_i32_bundle_expansion: -; ASM: # %bb.0: -; ASM-NEXT: pushl %ebx -; ASM-NEXT: .cfi_def_cfa_offset 8 -; ASM-NEXT: pushl %edi -; ASM-NEXT: .cfi_def_cfa_offset 12 -; ASM-NEXT: pushl %esi -; ASM-NEXT: .cfi_def_cfa_offset 16 -; ASM-NEXT: .cfi_offset %esi, -16 -; ASM-NEXT: .cfi_offset %edi, -12 -; ASM-NEXT: .cfi_offset %ebx, -8 -; ASM-NEXT: movl {{[0-9]+}}(%esp), %ecx -; ASM-NEXT: movl {{[0-9]+}}(%esp), %edx -; ASM-NEXT: testb $1, {{[0-9]+}}(%esp) -; ASM-NEXT: BUNDLE -; ASM-NEXT: popl %esi -; ASM-NEXT: .cfi_def_cfa_offset 12 -; ASM-NEXT: popl %edi -; ASM-NEXT: .cfi_def_cfa_offset 8 -; ASM-NEXT: popl %ebx -; ASM-NEXT: 
.cfi_def_cfa_offset 4 -; ASM-NEXT: retl - - - %1 = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) - ret i32 %1 -} - -define i16 @test_ctselect_i16_bundle_expansion(i1 %cond, i16 %a, i16 %b) { -; ASM-LABEL: test_ctselect_i16_bundle_expansion: -; ASM: # %bb.0: -; ASM-NEXT: pushl %ebx -; ASM-NEXT: .cfi_def_cfa_offset 8 -; ASM-NEXT: pushl %edi -; ASM-NEXT: .cfi_def_cfa_offset 12 -; ASM-NEXT: pushl %esi -; ASM-NEXT: .cfi_def_cfa_offset 16 -; ASM-NEXT: .cfi_offset %esi, -16 -; ASM-NEXT: .cfi_offset %edi, -12 -; ASM-NEXT: .cfi_offset %ebx, -8 -; ASM-NEXT: movzwl {{[0-9]+}}(%esp), %ecx -; ASM-NEXT: movzwl {{[0-9]+}}(%esp), %edx -; ASM-NEXT: testb $1, {{[0-9]+}}(%esp) -; ASM-NEXT: BUNDLE -; ASM-NEXT: popl %esi -; ASM-NEXT: .cfi_def_cfa_offset 12 -; ASM-NEXT: popl %edi -; ASM-NEXT: .cfi_def_cfa_offset 8 -; ASM-NEXT: popl %ebx -; ASM-NEXT: .cfi_def_cfa_offset 4 -; ASM-NEXT: retl - - - %1 = call i16 @llvm.ct.select.i16(i1 %cond, i16 %a, i16 %b) - ret i16 %1 -} - -define i64 @test_ctselect_i64_bundle_expansion(i1 %cond, i64 %a, i64 %b) { -; ASM-LABEL: test_ctselect_i64_bundle_expansion: -; ASM: # %bb.0: -; ASM-NEXT: pushl %ebp -; ASM-NEXT: .cfi_def_cfa_offset 8 -; ASM-NEXT: pushl %ebx -; ASM-NEXT: .cfi_def_cfa_offset 12 -; ASM-NEXT: pushl %edi -; ASM-NEXT: .cfi_def_cfa_offset 16 -; ASM-NEXT: pushl %esi -; ASM-NEXT: .cfi_def_cfa_offset 20 -; ASM-NEXT: .cfi_offset %esi, -20 -; ASM-NEXT: .cfi_offset %edi, -16 -; ASM-NEXT: .cfi_offset %ebx, -12 -; ASM-NEXT: .cfi_offset %ebp, -8 -; ASM-NEXT: movl {{[0-9]+}}(%esp), %edx -; ASM-NEXT: movl {{[0-9]+}}(%esp), %edi -; ASM-NEXT: testb $1, {{[0-9]+}}(%esp) -; ASM-NEXT: movl {{[0-9]+}}(%esp), %esi -; ASM-NEXT: BUNDLE -; ASM-NEXT: movl {{[0-9]+}}(%esp), %esi -; ASM-NEXT: BUNDLE -; ASM-NEXT: popl %esi -; ASM-NEXT: .cfi_def_cfa_offset 16 -; ASM-NEXT: popl %edi -; ASM-NEXT: .cfi_def_cfa_offset 12 -; ASM-NEXT: popl %ebx -; ASM-NEXT: .cfi_def_cfa_offset 8 -; ASM-NEXT: popl %ebp -; ASM-NEXT: .cfi_def_cfa_offset 4 -; ASM-NEXT: retl -; First bundle for low 32 bits -; Second bundle for high 32 bits - - - %1 = call i64 @llvm.ct.select.i64(i1 %cond, i64 %a, i64 %b) - ret i64 %1 -} - -define i32 @test_ctselect_different_conditions(i32 %x, i32 %y, i32 %a, i32 %b) { -; Test different comparison conditions to ensure they all use the same bundle pattern -; ASM-LABEL: test_ctselect_different_conditions: -; ASM: # %bb.0: -; ASM-NEXT: pushl %ebx -; ASM-NEXT: .cfi_def_cfa_offset 8 -; ASM-NEXT: pushl %edi -; ASM-NEXT: .cfi_def_cfa_offset 12 -; ASM-NEXT: pushl %esi -; ASM-NEXT: .cfi_def_cfa_offset 16 -; ASM-NEXT: .cfi_offset %esi, -16 -; ASM-NEXT: .cfi_offset %edi, -12 -; ASM-NEXT: .cfi_offset %ebx, -8 -; ASM-NEXT: movl {{[0-9]+}}(%esp), %ecx -; ASM-NEXT: movl {{[0-9]+}}(%esp), %edx -; ASM-NEXT: movl {{[0-9]+}}(%esp), %eax -; ASM-NEXT: cmpl {{[0-9]+}}(%esp), %eax -; ASM-NEXT: setl %al -; ASM-NEXT: testb %al, %al -; ASM-NEXT: BUNDLE -; ASM-NEXT: popl %esi -; ASM-NEXT: .cfi_def_cfa_offset 12 -; ASM-NEXT: popl %edi -; ASM-NEXT: .cfi_def_cfa_offset 8 -; ASM-NEXT: popl %ebx -; ASM-NEXT: .cfi_def_cfa_offset 4 -; ASM-NEXT: retl - - - %cond = icmp slt i32 %x, %y - %1 = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) - ret i32 %1 -} - -define i32 @test_ctselect_constant_operands(i1 %cond) { -; Test with constant operands to ensure no special optimizations break bundling -; ASM-LABEL: test_ctselect_constant_operands: -; ASM: # %bb.0: -; ASM-NEXT: pushl %ebx -; ASM-NEXT: .cfi_def_cfa_offset 8 -; ASM-NEXT: pushl %edi -; ASM-NEXT: .cfi_def_cfa_offset 12 -; ASM-NEXT: pushl %esi -; 
ASM-NEXT: .cfi_def_cfa_offset 16 -; ASM-NEXT: .cfi_offset %esi, -16 -; ASM-NEXT: .cfi_offset %edi, -12 -; ASM-NEXT: .cfi_offset %ebx, -8 -; ASM-NEXT: xorl %ecx, %ecx -; ASM-NEXT: testb $1, {{[0-9]+}}(%esp) -; ASM-NEXT: movl $42, %edx -; ASM-NEXT: BUNDLE -; ASM-NEXT: popl %esi -; ASM-NEXT: .cfi_def_cfa_offset 12 -; ASM-NEXT: popl %edi -; ASM-NEXT: .cfi_def_cfa_offset 8 -; ASM-NEXT: popl %ebx -; ASM-NEXT: .cfi_def_cfa_offset 4 -; ASM-NEXT: retl - - - %1 = call i32 @llvm.ct.select.i32(i1 %cond, i32 42, i32 0) - ret i32 %1 -} - -; Verify that each bundle contains exactly the expected constant-time sequence: -; 1. SETCCr - Set condition code to register (sete/setne/setl etc) -; 2. MOVZX32rr8 - Zero-extend 8-bit to 32-bit -; 3. NEG32r/NEG16r - Negate to create bitmask (0 -> 0, 1 -> 0xFFFFFFFF) -; 4. MOV32rr/MOV16rr - Copy mask -; 5. XOR32ri/XOR16ri - Create inverted mask (~mask) -; 6. MOV32rr/MOV16rr - Copy first operand -; 7. AND32rr/AND16rr - Mask first operand (operand & mask) -; 8. MOV32rr/MOV16rr - Copy second operand -; 9. AND32rr/AND16rr - Mask second operand (operand & ~mask) -; 10. OR32rr/OR16rr - Combine results (result = (a & mask) | (b & ~mask)) - -declare i16 @llvm.ct.select.i16(i1, i16, i16) -declare i32 @llvm.ct.select.i32(i1, i32, i32) -declare i64 @llvm.ct.select.i64(i1, i64, i64) -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; BUNDLE: {{.*}} diff --git a/llvm/test/CodeGen/X86/ctselect-i386-fp.ll b/llvm/test/CodeGen/X86/ctselect-i386-fp.ll new file mode 100644 index 0000000000000..b8daf26158d21 --- /dev/null +++ b/llvm/test/CodeGen/X86/ctselect-i386-fp.ll @@ -0,0 +1,465 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mattr=-cmov | FileCheck %s --check-prefix=I386-NOCMOV +; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mattr=+cmov | FileCheck %s --check-prefix=I386-CMOV +; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mattr=-cmov -verify-machineinstrs | FileCheck %s --check-prefix=I386-NOCMOV + +; Comprehensive CTSELECT tests for i386 targets with floating-point types +; - Without CMOV: constant-time implementation using FP->int conversion + existing post-RA CTSELECT +; - With CMOV: CMOV-based implementation (note that the x87 FP paths below still lower to jne-based selection, so they are not branch-free) +; - Verifies security properties on the no-CMOV path: no conditional branches, constant execution time +; Strategy: FP values are stored to memory, reloaded as integers, selected via the integer CTSELECT, then reloaded as FP + +; Test basic f32 functionality +define float @test_ctselect_f32_basic(i1 %cond, float %a, float %b) nounwind { +; I386-NOCMOV-LABEL: test_ctselect_f32_basic: +; I386-NOCMOV: # %bb.0: +; I386-NOCMOV-NEXT: pushl %ebx +; I386-NOCMOV-NEXT: pushl %esi +; I386-NOCMOV-NEXT: subl $12, %esp +; I386-NOCMOV-NEXT: flds {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: flds {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: fstps {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: fstps {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-NOCMOV-NEXT: BUNDLE +; I386-NOCMOV-NEXT: movl %edx, (%esp) +; I386-NOCMOV-NEXT: flds (%esp) +; I386-NOCMOV-NEXT: addl $12, %esp +; I386-NOCMOV-NEXT: popl %esi +; I386-NOCMOV-NEXT: popl %ebx +; I386-NOCMOV-NEXT: retl +; +; I386-CMOV-LABEL: test_ctselect_f32_basic: +; I386-CMOV: # %bb.0: +; I386-CMOV-NEXT: flds {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: flds {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: jne .LBB0_2 +;
I386-CMOV-NEXT: # %bb.1: +; I386-CMOV-NEXT: fstp %st(1) +; I386-CMOV-NEXT: fldz +; I386-CMOV-NEXT: .LBB0_2: +; I386-CMOV-NEXT: fstp %st(0) +; I386-CMOV-NEXT: retl + %result = call float @llvm.ct.select.f32(i1 %cond, float %a, float %b) + ret float %result +} + +; Test f32 with different condition codes +define float @test_ctselect_f32_eq(float %x, float %y, float %a, float %b) nounwind { +; I386-NOCMOV-LABEL: test_ctselect_f32_eq: +; I386-NOCMOV: # %bb.0: +; I386-NOCMOV-NEXT: pushl %ebx +; I386-NOCMOV-NEXT: pushl %esi +; I386-NOCMOV-NEXT: subl $12, %esp +; I386-NOCMOV-NEXT: flds {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: flds {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: flds {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: flds {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: fucompp +; I386-NOCMOV-NEXT: fnstsw %ax +; I386-NOCMOV-NEXT: # kill: def $ah killed $ah killed $ax +; I386-NOCMOV-NEXT: sahf +; I386-NOCMOV-NEXT: setnp %al +; I386-NOCMOV-NEXT: sete %cl +; I386-NOCMOV-NEXT: testb %al, %cl +; I386-NOCMOV-NEXT: fstps {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: fstps {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-NOCMOV-NEXT: BUNDLE +; I386-NOCMOV-NEXT: movl %edx, (%esp) +; I386-NOCMOV-NEXT: flds (%esp) +; I386-NOCMOV-NEXT: addl $12, %esp +; I386-NOCMOV-NEXT: popl %esi +; I386-NOCMOV-NEXT: popl %ebx +; I386-NOCMOV-NEXT: retl +; +; I386-CMOV-LABEL: test_ctselect_f32_eq: +; I386-CMOV: # %bb.0: +; I386-CMOV-NEXT: flds {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: flds {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: flds {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: flds {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: fucompi %st(1), %st +; I386-CMOV-NEXT: fstp %st(0) +; I386-CMOV-NEXT: setnp %al +; I386-CMOV-NEXT: sete %cl +; I386-CMOV-NEXT: testb %al, %cl +; I386-CMOV-NEXT: jne .LBB1_2 +; I386-CMOV-NEXT: # %bb.1: +; I386-CMOV-NEXT: fstp %st(1) +; I386-CMOV-NEXT: fldz +; I386-CMOV-NEXT: .LBB1_2: +; I386-CMOV-NEXT: fstp %st(0) +; I386-CMOV-NEXT: retl + %cmp = fcmp oeq float %x, %y + %result = call float @llvm.ct.select.f32(i1 %cmp, float %a, float %b) + ret float %result +} + +; Test basic f64 functionality +define double @test_ctselect_f64_basic(i1 %cond, double %a, double %b) nounwind { +; I386-NOCMOV-LABEL: test_ctselect_f64_basic: +; I386-NOCMOV: # %bb.0: +; I386-NOCMOV-NEXT: pushl %ebx +; I386-NOCMOV-NEXT: pushl %esi +; I386-NOCMOV-NEXT: subl $28, %esp +; I386-NOCMOV-NEXT: fldl {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: fldl {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: fstpl {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: fstpl {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-NOCMOV-NEXT: BUNDLE +; I386-NOCMOV-NEXT: movl %edx, (%esp) +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-NOCMOV-NEXT: BUNDLE +; I386-NOCMOV-NEXT: movl %edx, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: fldl (%esp) +; I386-NOCMOV-NEXT: addl $28, %esp +; I386-NOCMOV-NEXT: popl %esi +; I386-NOCMOV-NEXT: popl %ebx +; I386-NOCMOV-NEXT: retl +; +; I386-CMOV-LABEL: test_ctselect_f64_basic: +; I386-CMOV: # %bb.0: +; I386-CMOV-NEXT: fldl {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: fldl {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: jne .LBB2_2 +; I386-CMOV-NEXT: # %bb.1: +; I386-CMOV-NEXT: fstp %st(1) +; I386-CMOV-NEXT: fldz +; I386-CMOV-NEXT: .LBB2_2: +; I386-CMOV-NEXT: fstp %st(0) +; I386-CMOV-NEXT: retl + %result = call double @llvm.ct.select.f64(i1 
%cond, double %a, double %b) + ret double %result +} + +; Test basic x86_fp80 functionality +define x86_fp80 @test_ctselect_f80_basic(i1 %cond, x86_fp80 %a, x86_fp80 %b) nounwind { +; I386-NOCMOV-LABEL: test_ctselect_f80_basic: +; I386-NOCMOV: # %bb.0: +; I386-NOCMOV-NEXT: pushl %ebx +; I386-NOCMOV-NEXT: pushl %esi +; I386-NOCMOV-NEXT: subl $44, %esp +; I386-NOCMOV-NEXT: fldt {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: fldt {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: fstpt {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: fstpt {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-NOCMOV-NEXT: BUNDLE +; I386-NOCMOV-NEXT: movl %edx, (%esp) +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-NOCMOV-NEXT: BUNDLE +; I386-NOCMOV-NEXT: movl %edx, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; I386-NOCMOV-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; I386-NOCMOV-NEXT: BUNDLE +; I386-NOCMOV-NEXT: movw %dx, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: fldt (%esp) +; I386-NOCMOV-NEXT: addl $44, %esp +; I386-NOCMOV-NEXT: popl %esi +; I386-NOCMOV-NEXT: popl %ebx +; I386-NOCMOV-NEXT: retl +; +; I386-CMOV-LABEL: test_ctselect_f80_basic: +; I386-CMOV: # %bb.0: +; I386-CMOV-NEXT: fldt {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: fldt {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: jne .LBB3_2 +; I386-CMOV-NEXT: # %bb.1: +; I386-CMOV-NEXT: fstp %st(1) +; I386-CMOV-NEXT: fldz +; I386-CMOV-NEXT: .LBB3_2: +; I386-CMOV-NEXT: fstp %st(0) +; I386-CMOV-NEXT: retl + %result = call x86_fp80 @llvm.ct.select.f80(i1 %cond, x86_fp80 %a, x86_fp80 %b) + ret x86_fp80 %result +} + +; Test f32 with complex conditions +define float @test_ctselect_f32_gt(float %x, float %y, float %a, float %b) nounwind { +; I386-NOCMOV-LABEL: test_ctselect_f32_gt: +; I386-NOCMOV: # %bb.0: +; I386-NOCMOV-NEXT: pushl %ebx +; I386-NOCMOV-NEXT: pushl %esi +; I386-NOCMOV-NEXT: subl $12, %esp +; I386-NOCMOV-NEXT: flds {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: flds {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: flds {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: flds {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: fucompp +; I386-NOCMOV-NEXT: fnstsw %ax +; I386-NOCMOV-NEXT: # kill: def $ah killed $ah killed $ax +; I386-NOCMOV-NEXT: sahf +; I386-NOCMOV-NEXT: seta %al +; I386-NOCMOV-NEXT: testb %al, %al +; I386-NOCMOV-NEXT: fstps {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: fstps {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-NOCMOV-NEXT: BUNDLE +; I386-NOCMOV-NEXT: movl %edx, (%esp) +; I386-NOCMOV-NEXT: flds (%esp) +; I386-NOCMOV-NEXT: addl $12, %esp +; I386-NOCMOV-NEXT: popl %esi +; I386-NOCMOV-NEXT: popl %ebx +; I386-NOCMOV-NEXT: retl +; +; I386-CMOV-LABEL: test_ctselect_f32_gt: +; I386-CMOV: # %bb.0: +; I386-CMOV-NEXT: flds {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: flds {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: flds {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: flds {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: fucompi %st(1), %st +; I386-CMOV-NEXT: fstp %st(0) +; I386-CMOV-NEXT: seta %al +; I386-CMOV-NEXT: testb %al, %al +; I386-CMOV-NEXT: jne .LBB4_2 +; I386-CMOV-NEXT: # %bb.1: +; I386-CMOV-NEXT: fstp %st(1) +; I386-CMOV-NEXT: fldz +; I386-CMOV-NEXT: .LBB4_2: +; I386-CMOV-NEXT: fstp %st(0) +; I386-CMOV-NEXT: retl + %cmp = fcmp ogt float %x, %y + %result = call float @llvm.ct.select.f32(i1 %cmp, float %a, float %b) + ret float %result +} + +; Test constant-time 
properties: verify no branches in generated code +define float @test_ctselect_f32_no_branches(i1 %cond, float %a, float %b) nounwind { +; I386-NOCMOV-LABEL: test_ctselect_f32_no_branches: +; I386-NOCMOV: # %bb.0: +; I386-NOCMOV-NEXT: pushl %ebx +; I386-NOCMOV-NEXT: pushl %esi +; I386-NOCMOV-NEXT: subl $12, %esp +; I386-NOCMOV-NEXT: flds {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: flds {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: fstps {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: fstps {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-NOCMOV-NEXT: BUNDLE +; I386-NOCMOV-NEXT: movl %edx, (%esp) +; I386-NOCMOV-NEXT: flds (%esp) +; I386-NOCMOV-NEXT: addl $12, %esp +; I386-NOCMOV-NEXT: popl %esi +; I386-NOCMOV-NEXT: popl %ebx +; I386-NOCMOV-NEXT: retl +; +; I386-CMOV-LABEL: test_ctselect_f32_no_branches: +; I386-CMOV: # %bb.0: +; I386-CMOV-NEXT: flds {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: flds {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: jne .LBB5_2 +; I386-CMOV-NEXT: # %bb.1: +; I386-CMOV-NEXT: fstp %st(1) +; I386-CMOV-NEXT: fldz +; I386-CMOV-NEXT: .LBB5_2: +; I386-CMOV-NEXT: fstp %st(0) +; I386-CMOV-NEXT: retl + %result = call float @llvm.ct.select.f32(i1 %cond, float %a, float %b) + ret float %result +} + +; Test that BUNDLE directives are present for constant-time guarantees +define float @test_ctselect_f32_bundled(i1 %cond, float %a, float %b) nounwind { +; I386-NOCMOV-LABEL: test_ctselect_f32_bundled: +; I386-NOCMOV: # %bb.0: +; I386-NOCMOV-NEXT: pushl %ebx +; I386-NOCMOV-NEXT: pushl %esi +; I386-NOCMOV-NEXT: subl $12, %esp +; I386-NOCMOV-NEXT: flds {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: flds {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: fstps {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: fstps {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-NOCMOV-NEXT: BUNDLE +; I386-NOCMOV-NEXT: movl %edx, (%esp) +; I386-NOCMOV-NEXT: flds (%esp) +; I386-NOCMOV-NEXT: addl $12, %esp +; I386-NOCMOV-NEXT: popl %esi +; I386-NOCMOV-NEXT: popl %ebx +; I386-NOCMOV-NEXT: retl +; +; I386-CMOV-LABEL: test_ctselect_f32_bundled: +; I386-CMOV: # %bb.0: +; I386-CMOV-NEXT: flds {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: flds {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: jne .LBB6_2 +; I386-CMOV-NEXT: # %bb.1: +; I386-CMOV-NEXT: fstp %st(1) +; I386-CMOV-NEXT: fldz +; I386-CMOV-NEXT: .LBB6_2: +; I386-CMOV-NEXT: fstp %st(0) +; I386-CMOV-NEXT: retl + %result = call float @llvm.ct.select.f32(i1 %cond, float %a, float %b) + ret float %result +} + +; Test edge case: NaN handling +define float @test_ctselect_f32_nan(i1 %cond) nounwind { +; I386-NOCMOV-LABEL: test_ctselect_f32_nan: +; I386-NOCMOV: # %bb.0: +; I386-NOCMOV-NEXT: pushl %ebx +; I386-NOCMOV-NEXT: pushl %esi +; I386-NOCMOV-NEXT: subl $12, %esp +; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: flds {{\.?LCPI[0-9]+_[0-9]+}} +; I386-NOCMOV-NEXT: fldz +; I386-NOCMOV-NEXT: fstps {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: fstps {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-NOCMOV-NEXT: BUNDLE +; I386-NOCMOV-NEXT: movl %edx, (%esp) +; I386-NOCMOV-NEXT: flds (%esp) +; I386-NOCMOV-NEXT: addl $12, %esp +; I386-NOCMOV-NEXT: popl %esi +; I386-NOCMOV-NEXT: popl %ebx +; I386-NOCMOV-NEXT: retl +; +; I386-CMOV-LABEL: 
test_ctselect_f32_nan: +; I386-CMOV: # %bb.0: +; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: flds {{\.?LCPI[0-9]+_[0-9]+}} +; I386-CMOV-NEXT: fldz +; I386-CMOV-NEXT: jne .LBB7_2 +; I386-CMOV-NEXT: # %bb.1: +; I386-CMOV-NEXT: fstp %st(1) +; I386-CMOV-NEXT: fldz +; I386-CMOV-NEXT: .LBB7_2: +; I386-CMOV-NEXT: fstp %st(0) +; I386-CMOV-NEXT: retl + %nan = bitcast i32 2139095040 to float ; 0x7F800000 = +inf + %zero = bitcast i32 0 to float + %result = call float @llvm.ct.select.f32(i1 %cond, float %nan, float %zero) + ret float %result +} + +; Test memory alignment for f80 +define x86_fp80 @test_ctselect_f80_alignment(i1 %cond, x86_fp80 %a, x86_fp80 %b) nounwind { +; I386-NOCMOV-LABEL: test_ctselect_f80_alignment: +; I386-NOCMOV: # %bb.0: +; I386-NOCMOV-NEXT: pushl %ebx +; I386-NOCMOV-NEXT: pushl %esi +; I386-NOCMOV-NEXT: subl $44, %esp +; I386-NOCMOV-NEXT: fldt {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: fldt {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: fstpt {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: fstpt {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-NOCMOV-NEXT: BUNDLE +; I386-NOCMOV-NEXT: movl %edx, (%esp) +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-NOCMOV-NEXT: BUNDLE +; I386-NOCMOV-NEXT: movl %edx, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; I386-NOCMOV-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; I386-NOCMOV-NEXT: BUNDLE +; I386-NOCMOV-NEXT: movw %dx, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: fldt (%esp) +; I386-NOCMOV-NEXT: addl $44, %esp +; I386-NOCMOV-NEXT: popl %esi +; I386-NOCMOV-NEXT: popl %ebx +; I386-NOCMOV-NEXT: retl +; +; I386-CMOV-LABEL: test_ctselect_f80_alignment: +; I386-CMOV: # %bb.0: +; I386-CMOV-NEXT: fldt {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: fldt {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: jne .LBB8_2 +; I386-CMOV-NEXT: # %bb.1: +; I386-CMOV-NEXT: fstp %st(1) +; I386-CMOV-NEXT: fldz +; I386-CMOV-NEXT: .LBB8_2: +; I386-CMOV-NEXT: fstp %st(0) +; I386-CMOV-NEXT: retl + %result = call x86_fp80 @llvm.ct.select.f80(i1 %cond, x86_fp80 %a, x86_fp80 %b) + ret x86_fp80 %result +} + +; Stress test: multiple CTSELECT operations +define float @test_ctselect_f32_multiple(i1 %cond1, i1 %cond2, float %a, float %b, float %c, float %d) nounwind { +; I386-NOCMOV-LABEL: test_ctselect_f32_multiple: +; I386-NOCMOV: # %bb.0: +; I386-NOCMOV-NEXT: pushl %ebx +; I386-NOCMOV-NEXT: pushl %esi +; I386-NOCMOV-NEXT: subl $24, %esp +; I386-NOCMOV-NEXT: flds {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: flds {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: flds {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: fstps {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: fstps {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-NOCMOV-NEXT: BUNDLE +; I386-NOCMOV-NEXT: movl %edx, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: flds {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: fxch %st(1) +; I386-NOCMOV-NEXT: fstps {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: fstps {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-NOCMOV-NEXT: BUNDLE +; I386-NOCMOV-NEXT: movl %edx, (%esp) +; I386-NOCMOV-NEXT: flds (%esp) +; I386-NOCMOV-NEXT: addl $24, %esp +; I386-NOCMOV-NEXT: popl %esi +; I386-NOCMOV-NEXT: popl %ebx +; 
I386-NOCMOV-NEXT: retl +; +; I386-CMOV-LABEL: test_ctselect_f32_multiple: +; I386-CMOV: # %bb.0: +; I386-CMOV-NEXT: flds {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: flds {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: flds {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: jne .LBB9_2 +; I386-CMOV-NEXT: # %bb.1: +; I386-CMOV-NEXT: fstp %st(1) +; I386-CMOV-NEXT: fldz +; I386-CMOV-NEXT: .LBB9_2: +; I386-CMOV-NEXT: fstp %st(0) +; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: jne .LBB9_4 +; I386-CMOV-NEXT: # %bb.3: +; I386-CMOV-NEXT: fstp %st(0) +; I386-CMOV-NEXT: fldz +; I386-CMOV-NEXT: fxch %st(1) +; I386-CMOV-NEXT: .LBB9_4: +; I386-CMOV-NEXT: fstp %st(1) +; I386-CMOV-NEXT: retl + %sel1 = call float @llvm.ct.select.f32(i1 %cond1, float %a, float %b) + %sel2 = call float @llvm.ct.select.f32(i1 %cond2, float %sel1, float %c) + ret float %sel2 +} + +; Declare intrinsics +declare float @llvm.ct.select.f32(i1, float, float) +declare double @llvm.ct.select.f64(i1, double, double) +declare x86_fp80 @llvm.ct.select.f80(i1, x86_fp80, x86_fp80) diff --git a/llvm/test/CodeGen/X86/ctselect-i386-mmx.ll b/llvm/test/CodeGen/X86/ctselect-i386-mmx.ll new file mode 100644 index 0000000000000..bdd88c5d682d2 --- /dev/null +++ b/llvm/test/CodeGen/X86/ctselect-i386-mmx.ll @@ -0,0 +1,343 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=i386-unknown-linux-gnu -mattr=-cmov,+mmx < %s | FileCheck %s --check-prefix=I386-NOCMOV +; RUN: llc -mtriple=i386-unknown-linux-gnu -mattr=+cmov,+mmx < %s | FileCheck %s --check-prefix=I386-CMOV +; RUN: llc -mtriple=i386-unknown-linux-gnu -mattr=-cmov,+mmx -verify-machineinstrs < %s | FileCheck %s --check-prefix=I386-NOCMOV + +; Test constant-time selection with MMX intrinsics to exercise VR64 CTSELECT +; These tests use MMX intrinsics to create <1 x i64> values that get allocated to VR64 registers + +; Test MMX ct.select using paddd intrinsic to force VR64 allocation +define <1 x i64> @test_mmx_ctselect_with_paddd(i32 %cond, i64 %a, i64 %b) { +; I386-NOCMOV-LABEL: test_mmx_ctselect_with_paddd: +; I386-NOCMOV: # %bb.0: +; I386-NOCMOV-NEXT: pushl %ebp +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; I386-NOCMOV-NEXT: pushl %ebx +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 12 +; I386-NOCMOV-NEXT: pushl %edi +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 16 +; I386-NOCMOV-NEXT: pushl %esi +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 20 +; I386-NOCMOV-NEXT: subl $20, %esp +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 40 +; I386-NOCMOV-NEXT: .cfi_offset %esi, -20 +; I386-NOCMOV-NEXT: .cfi_offset %edi, -16 +; I386-NOCMOV-NEXT: .cfi_offset %ebx, -12 +; I386-NOCMOV-NEXT: .cfi_offset %ebp, -8 +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %esi +; I386-NOCMOV-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: setne %bl +; I386-NOCMOV-NEXT: testb %bl, %bl +; I386-NOCMOV-NEXT: BUNDLE +; I386-NOCMOV-NEXT: movl %edi, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: BUNDLE +; I386-NOCMOV-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: movq {{[0-9]+}}(%esp), %mm0 +; I386-NOCMOV-NEXT: paddd %mm0, %mm0 +; I386-NOCMOV-NEXT: movq %mm0, (%esp) +; I386-NOCMOV-NEXT: movl (%esp), %eax +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-NOCMOV-NEXT: addl $20, %esp +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 20 +; I386-NOCMOV-NEXT: popl %esi +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 16 +; I386-NOCMOV-NEXT: 
popl %edi +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 12 +; I386-NOCMOV-NEXT: popl %ebx +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; I386-NOCMOV-NEXT: popl %ebp +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 4 +; I386-NOCMOV-NEXT: retl +; +; I386-CMOV-LABEL: test_mmx_ctselect_with_paddd: +; I386-CMOV: # %bb.0: +; I386-CMOV-NEXT: subl $20, %esp +; I386-CMOV-NEXT: .cfi_def_cfa_offset 24 +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-CMOV-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: setne %dl +; I386-CMOV-NEXT: testb %dl, %dl +; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %ecx +; I386-CMOV-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %eax +; I386-CMOV-NEXT: movl %eax, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: movq {{[0-9]+}}(%esp), %mm0 +; I386-CMOV-NEXT: paddd %mm0, %mm0 +; I386-CMOV-NEXT: movq %mm0, (%esp) +; I386-CMOV-NEXT: movl (%esp), %eax +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-CMOV-NEXT: addl $20, %esp +; I386-CMOV-NEXT: .cfi_def_cfa_offset 4 +; I386-CMOV-NEXT: retl + %mmx_a = bitcast i64 %a to <1 x i64> + %mmx_b = bitcast i64 %b to <1 x i64> + %cmp = icmp ne i32 %cond, 0 + %sel = call <1 x i64> @llvm.ct.select.v1i64(i1 %cmp, <1 x i64> %mmx_a, <1 x i64> %mmx_b) + %result = call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %sel, <1 x i64> %sel) + ret <1 x i64> %result +} + +; Test MMX ct.select using psllw intrinsic +define <1 x i64> @test_mmx_ctselect_with_psllw(i32 %cond, i64 %a, i64 %b) { +; I386-NOCMOV-LABEL: test_mmx_ctselect_with_psllw: +; I386-NOCMOV: # %bb.0: +; I386-NOCMOV-NEXT: pushl %ebp +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; I386-NOCMOV-NEXT: pushl %ebx +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 12 +; I386-NOCMOV-NEXT: pushl %edi +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 16 +; I386-NOCMOV-NEXT: pushl %esi +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 20 +; I386-NOCMOV-NEXT: subl $20, %esp +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 40 +; I386-NOCMOV-NEXT: .cfi_offset %esi, -20 +; I386-NOCMOV-NEXT: .cfi_offset %edi, -16 +; I386-NOCMOV-NEXT: .cfi_offset %ebx, -12 +; I386-NOCMOV-NEXT: .cfi_offset %ebp, -8 +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %esi +; I386-NOCMOV-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: setne %bl +; I386-NOCMOV-NEXT: testb %bl, %bl +; I386-NOCMOV-NEXT: BUNDLE +; I386-NOCMOV-NEXT: movl %edi, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: BUNDLE +; I386-NOCMOV-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: movq {{[0-9]+}}(%esp), %mm0 +; I386-NOCMOV-NEXT: psllw %mm0, %mm0 +; I386-NOCMOV-NEXT: movq %mm0, (%esp) +; I386-NOCMOV-NEXT: movl (%esp), %eax +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-NOCMOV-NEXT: addl $20, %esp +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 20 +; I386-NOCMOV-NEXT: popl %esi +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 16 +; I386-NOCMOV-NEXT: popl %edi +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 12 +; I386-NOCMOV-NEXT: popl %ebx +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; I386-NOCMOV-NEXT: popl %ebp +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 4 +; I386-NOCMOV-NEXT: retl +; +; I386-CMOV-LABEL: test_mmx_ctselect_with_psllw: +; I386-CMOV: # %bb.0: +; I386-CMOV-NEXT: subl $20, %esp +; I386-CMOV-NEXT: .cfi_def_cfa_offset 24 +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-CMOV-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: setne %dl +; 
I386-CMOV-NEXT: testb %dl, %dl +; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %ecx +; I386-CMOV-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %eax +; I386-CMOV-NEXT: movl %eax, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: movq {{[0-9]+}}(%esp), %mm0 +; I386-CMOV-NEXT: psllw %mm0, %mm0 +; I386-CMOV-NEXT: movq %mm0, (%esp) +; I386-CMOV-NEXT: movl (%esp), %eax +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-CMOV-NEXT: addl $20, %esp +; I386-CMOV-NEXT: .cfi_def_cfa_offset 4 +; I386-CMOV-NEXT: retl + %mmx_a = bitcast i64 %a to <1 x i64> + %mmx_b = bitcast i64 %b to <1 x i64> + %cmp = icmp ne i32 %cond, 0 + %sel = call <1 x i64> @llvm.ct.select.v1i64(i1 %cmp, <1 x i64> %mmx_a, <1 x i64> %mmx_b) + %result = call <1 x i64> @llvm.x86.mmx.psll.w(<1 x i64> %sel, <1 x i64> %sel) + ret <1 x i64> %result +} + +; Test nested MMX ct.selects with pand intrinsic +define <1 x i64> @test_mmx_nested_ctselect_with_pand(i32 %cond1, i32 %cond2, i64 %a, i64 %b, i64 %c) { +; I386-NOCMOV-LABEL: test_mmx_nested_ctselect_with_pand: +; I386-NOCMOV: # %bb.0: +; I386-NOCMOV-NEXT: pushl %ebp +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; I386-NOCMOV-NEXT: pushl %ebx +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 12 +; I386-NOCMOV-NEXT: pushl %edi +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 16 +; I386-NOCMOV-NEXT: pushl %esi +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 20 +; I386-NOCMOV-NEXT: subl $20, %esp +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 40 +; I386-NOCMOV-NEXT: .cfi_offset %esi, -20 +; I386-NOCMOV-NEXT: .cfi_offset %edi, -16 +; I386-NOCMOV-NEXT: .cfi_offset %ebx, -12 +; I386-NOCMOV-NEXT: .cfi_offset %ebp, -8 +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %esi +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edi +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ebx +; I386-NOCMOV-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: setne %cl +; I386-NOCMOV-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: setne %ah +; I386-NOCMOV-NEXT: testb %ah, %ah +; I386-NOCMOV-NEXT: BUNDLE +; I386-NOCMOV-NEXT: BUNDLE +; I386-NOCMOV-NEXT: testb %cl, %cl +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edi +; I386-NOCMOV-NEXT: BUNDLE +; I386-NOCMOV-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edi +; I386-NOCMOV-NEXT: BUNDLE +; I386-NOCMOV-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: movq {{[0-9]+}}(%esp), %mm0 +; I386-NOCMOV-NEXT: pand %mm0, %mm0 +; I386-NOCMOV-NEXT: movq %mm0, (%esp) +; I386-NOCMOV-NEXT: movl (%esp), %eax +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-NOCMOV-NEXT: addl $20, %esp +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 20 +; I386-NOCMOV-NEXT: popl %esi +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 16 +; I386-NOCMOV-NEXT: popl %edi +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 12 +; I386-NOCMOV-NEXT: popl %ebx +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; I386-NOCMOV-NEXT: popl %ebp +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 4 +; I386-NOCMOV-NEXT: retl +; +; I386-CMOV-LABEL: test_mmx_nested_ctselect_with_pand: +; I386-CMOV: # %bb.0: +; I386-CMOV-NEXT: pushl %ebx +; I386-CMOV-NEXT: .cfi_def_cfa_offset 8 +; I386-CMOV-NEXT: pushl %esi +; I386-CMOV-NEXT: .cfi_def_cfa_offset 12 +; I386-CMOV-NEXT: subl $20, %esp +; I386-CMOV-NEXT: .cfi_def_cfa_offset 32 +; I386-CMOV-NEXT: .cfi_offset %esi, -12 +; I386-CMOV-NEXT: .cfi_offset %ebx, -8 +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %esi +; 
I386-CMOV-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: setne %bl +; I386-CMOV-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: setne %bh +; I386-CMOV-NEXT: testb %bh, %bh +; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %ecx +; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %esi +; I386-CMOV-NEXT: testb %bl, %bl +; I386-CMOV-NEXT: cmovnel %esi, %edx +; I386-CMOV-NEXT: movl %edx, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: cmovnel %ecx, %eax +; I386-CMOV-NEXT: movl %eax, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: movq {{[0-9]+}}(%esp), %mm0 +; I386-CMOV-NEXT: pand %mm0, %mm0 +; I386-CMOV-NEXT: movq %mm0, (%esp) +; I386-CMOV-NEXT: movl (%esp), %eax +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-CMOV-NEXT: addl $20, %esp +; I386-CMOV-NEXT: .cfi_def_cfa_offset 12 +; I386-CMOV-NEXT: popl %esi +; I386-CMOV-NEXT: .cfi_def_cfa_offset 8 +; I386-CMOV-NEXT: popl %ebx +; I386-CMOV-NEXT: .cfi_def_cfa_offset 4 +; I386-CMOV-NEXT: retl + %mmx_a = bitcast i64 %a to <1 x i64> + %mmx_b = bitcast i64 %b to <1 x i64> + %mmx_c = bitcast i64 %c to <1 x i64> + %cmp1 = icmp ne i32 %cond1, 0 + %cmp2 = icmp ne i32 %cond2, 0 + %sel1 = call <1 x i64> @llvm.ct.select.v1i64(i1 %cmp2, <1 x i64> %mmx_a, <1 x i64> %mmx_b) + %sel2 = call <1 x i64> @llvm.ct.select.v1i64(i1 %cmp1, <1 x i64> %sel1, <1 x i64> %mmx_c) + %result = call <1 x i64> @llvm.x86.mmx.pand(<1 x i64> %sel2, <1 x i64> %sel2) + ret <1 x i64> %result +} + +; Test MMX ct.select with por intrinsic +define <1 x i64> @test_mmx_ctselect_with_por(i32 %cond, i64 %a, i64 %b) { +; I386-NOCMOV-LABEL: test_mmx_ctselect_with_por: +; I386-NOCMOV: # %bb.0: +; I386-NOCMOV-NEXT: pushl %ebp +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; I386-NOCMOV-NEXT: pushl %ebx +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 12 +; I386-NOCMOV-NEXT: pushl %edi +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 16 +; I386-NOCMOV-NEXT: pushl %esi +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 20 +; I386-NOCMOV-NEXT: subl $20, %esp +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 40 +; I386-NOCMOV-NEXT: .cfi_offset %esi, -20 +; I386-NOCMOV-NEXT: .cfi_offset %edi, -16 +; I386-NOCMOV-NEXT: .cfi_offset %ebx, -12 +; I386-NOCMOV-NEXT: .cfi_offset %ebp, -8 +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %esi +; I386-NOCMOV-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: setne %bl +; I386-NOCMOV-NEXT: testb %bl, %bl +; I386-NOCMOV-NEXT: BUNDLE +; I386-NOCMOV-NEXT: movl %edi, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: BUNDLE +; I386-NOCMOV-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: movq {{[0-9]+}}(%esp), %mm0 +; I386-NOCMOV-NEXT: por %mm0, %mm0 +; I386-NOCMOV-NEXT: movq %mm0, (%esp) +; I386-NOCMOV-NEXT: movl (%esp), %eax +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-NOCMOV-NEXT: addl $20, %esp +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 20 +; I386-NOCMOV-NEXT: popl %esi +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 16 +; I386-NOCMOV-NEXT: popl %edi +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 12 +; I386-NOCMOV-NEXT: popl %ebx +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; I386-NOCMOV-NEXT: popl %ebp +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 4 +; I386-NOCMOV-NEXT: retl +; +; I386-CMOV-LABEL: test_mmx_ctselect_with_por: +; I386-CMOV: # %bb.0: +; I386-CMOV-NEXT: subl $20, %esp +; I386-CMOV-NEXT: .cfi_def_cfa_offset 24 +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-CMOV-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: setne %dl +; 
I386-CMOV-NEXT: testb %dl, %dl +; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %ecx +; I386-CMOV-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %eax +; I386-CMOV-NEXT: movl %eax, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: movq {{[0-9]+}}(%esp), %mm0 +; I386-CMOV-NEXT: por %mm0, %mm0 +; I386-CMOV-NEXT: movq %mm0, (%esp) +; I386-CMOV-NEXT: movl (%esp), %eax +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-CMOV-NEXT: addl $20, %esp +; I386-CMOV-NEXT: .cfi_def_cfa_offset 4 +; I386-CMOV-NEXT: retl + %mmx_a = bitcast i64 %a to <1 x i64> + %mmx_b = bitcast i64 %b to <1 x i64> + %cmp = icmp ne i32 %cond, 0 + %sel = call <1 x i64> @llvm.ct.select.v1i64(i1 %cmp, <1 x i64> %mmx_a, <1 x i64> %mmx_b) + %result = call <1 x i64> @llvm.x86.mmx.por(<1 x i64> %sel, <1 x i64> %sel) + ret <1 x i64> %result +} + +; Declare MMX intrinsics +declare <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64>, <1 x i64>) +declare <1 x i64> @llvm.x86.mmx.psll.w(<1 x i64>, <1 x i64>) +declare <1 x i64> @llvm.x86.mmx.pand(<1 x i64>, <1 x i64>) +declare <1 x i64> @llvm.x86.mmx.por(<1 x i64>, <1 x i64>) + +; Declare constant-time selection intrinsic +declare <1 x i64> @llvm.ct.select.v1i64(i1, <1 x i64>, <1 x i64>) diff --git a/llvm/test/CodeGen/X86/ctselect-i386-security.ll b/llvm/test/CodeGen/X86/ctselect-i386-security.ll deleted file mode 100644 index 3cec0cf2c6df4..0000000000000 --- a/llvm/test/CodeGen/X86/ctselect-i386-security.ll +++ /dev/null @@ -1,141 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mattr=-cmov | FileCheck %s --check-prefix=SECURE -; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mattr=-cmov -verify-machineinstrs | FileCheck %s --check-prefix=SECURE - -; Verify security properties of i386 CTSELECT post-RA expansion -; This test ensures: -; 1. No conditional branches (jcc instructions) in output -; 2. Both true and false values are always processed -; 3. No conditional memory accesses -; 4. 
Instructions are bundled for atomic treatment - -; Test: No conditional branches should appear in constant-time path -define i32 @test_no_conditional_branches(i32 %secret, i32 %val1, i32 %val2) nounwind { -; SECURE-LABEL: test_no_conditional_branches: -; SECURE: # %bb.0: -; SECURE-NEXT: pushl %ebx -; SECURE-NEXT: pushl %edi -; SECURE-NEXT: pushl %esi -; SECURE-NEXT: movl {{[0-9]+}}(%esp), %ecx -; SECURE-NEXT: movl {{[0-9]+}}(%esp), %edx -; SECURE-NEXT: cmpl $0, {{[0-9]+}}(%esp) -; SECURE-NEXT: setne %al -; SECURE-NEXT: testb %al, %al -; SECURE-NEXT: BUNDLE -; SECURE-NEXT: popl %esi -; SECURE-NEXT: popl %edi -; SECURE-NEXT: popl %ebx -; SECURE-NEXT: retl - %cond = icmp ne i32 %secret, 0 - %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %val1, i32 %val2) - ret i32 %result -} - -; Test: Both values must always be processed (no conditional loads) -define i32 @test_always_process_both_values(i32 %secret, i32* %ptr1, i32* %ptr2) nounwind { -; SECURE-LABEL: test_always_process_both_values: -; SECURE: # %bb.0: -; SECURE-NEXT: pushl %ebx -; SECURE-NEXT: pushl %edi -; SECURE-NEXT: pushl %esi -; SECURE-NEXT: movl {{[0-9]+}}(%esp), %eax -; SECURE-NEXT: movl {{[0-9]+}}(%esp), %ecx -; SECURE-NEXT: movl (%ecx), %ecx -; SECURE-NEXT: movl (%eax), %edx -; SECURE-NEXT: cmpl $0, {{[0-9]+}}(%esp) -; SECURE-NEXT: setne %al -; SECURE-NEXT: testb %al, %al -; SECURE-NEXT: BUNDLE -; SECURE-NEXT: popl %esi -; SECURE-NEXT: popl %edi -; SECURE-NEXT: popl %ebx -; SECURE-NEXT: retl - %val1 = load i32, i32* %ptr1 - %val2 = load i32, i32* %ptr2 - %cond = icmp ne i32 %secret, 0 - %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %val1, i32 %val2) - ret i32 %result -} - -; Test: 64-bit constant-time selection on i386 -define i64 @test_i64_constant_time(i32 %secret, i64 %val1, i64 %val2) nounwind { -; SECURE-LABEL: test_i64_constant_time: -; SECURE: # %bb.0: -; SECURE-NEXT: pushl %ebp -; SECURE-NEXT: pushl %ebx -; SECURE-NEXT: pushl %edi -; SECURE-NEXT: pushl %esi -; SECURE-NEXT: movl {{[0-9]+}}(%esp), %edx -; SECURE-NEXT: movl {{[0-9]+}}(%esp), %edi -; SECURE-NEXT: cmpl $0, {{[0-9]+}}(%esp) -; SECURE-NEXT: setne %al -; SECURE-NEXT: testb %al, %al -; SECURE-NEXT: movl {{[0-9]+}}(%esp), %esi -; SECURE-NEXT: BUNDLE -; SECURE-NEXT: movl {{[0-9]+}}(%esp), %esi -; SECURE-NEXT: BUNDLE -; SECURE-NEXT: popl %esi -; SECURE-NEXT: popl %edi -; SECURE-NEXT: popl %ebx -; SECURE-NEXT: popl %ebp -; SECURE-NEXT: retl - %cond = icmp ne i32 %secret, 0 - %result = call i64 @llvm.ct.select.i64(i1 %cond, i64 %val1, i64 %val2) - ret i64 %result -} - -; Test: Verify instruction bundling prevents optimization between operations -define i32 @test_bundle_atomicity(i32 %secret, i32 %val1, i32 %val2) nounwind { -; SECURE-LABEL: test_bundle_atomicity: -; SECURE: # %bb.0: -; SECURE-NEXT: pushl %ebx -; SECURE-NEXT: pushl %edi -; SECURE-NEXT: pushl %esi -; SECURE-NEXT: movl {{[0-9]+}}(%esp), %ecx -; SECURE-NEXT: movl {{[0-9]+}}(%esp), %edx -; SECURE-NEXT: cmpl $0, {{[0-9]+}}(%esp) -; SECURE-NEXT: setne %al -; SECURE-NEXT: testb %al, %al -; SECURE-NEXT: BUNDLE -; SECURE-NEXT: popl %esi -; SECURE-NEXT: popl %edi -; SECURE-NEXT: popl %ebx -; SECURE-NEXT: retl - %cond = icmp ne i32 %secret, 0 - %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %val1, i32 %val2) - ret i32 %result -} - -; Test: Multiple CTSELECT operations should maintain constant-time properties -define i32 @test_multiple_ctselect(i32 %secret1, i32 %secret2, i32 %val1, i32 %val2, i32 %val3, i32 %val4) nounwind { -; SECURE-LABEL: test_multiple_ctselect: -; SECURE: # %bb.0: -; 
SECURE-NEXT: pushl %ebp -; SECURE-NEXT: pushl %ebx -; SECURE-NEXT: pushl %edi -; SECURE-NEXT: pushl %esi -; SECURE-NEXT: movl {{[0-9]+}}(%esp), %ecx -; SECURE-NEXT: movl {{[0-9]+}}(%esp), %eax -; SECURE-NEXT: movl {{[0-9]+}}(%esp), %edx -; SECURE-NEXT: cmpl $0, {{[0-9]+}}(%esp) -; SECURE-NEXT: setne %bl -; SECURE-NEXT: testb %bl, %bl -; SECURE-NEXT: BUNDLE -; SECURE-NEXT: cmpl $0, {{[0-9]+}}(%esp) -; SECURE-NEXT: setne %al -; SECURE-NEXT: testb %al, %al -; SECURE-NEXT: BUNDLE -; SECURE-NEXT: popl %esi -; SECURE-NEXT: popl %edi -; SECURE-NEXT: popl %ebx -; SECURE-NEXT: popl %ebp -; SECURE-NEXT: retl - %cond1 = icmp ne i32 %secret1, 0 - %tmp = call i32 @llvm.ct.select.i32(i1 %cond1, i32 %val1, i32 %val2) - %cond2 = icmp ne i32 %secret2, 0 - %result = call i32 @llvm.ct.select.i32(i1 %cond2, i32 %tmp, i32 %val3) - ret i32 %result -} - -declare i32 @llvm.ct.select.i32(i1, i32, i32) -declare i64 @llvm.ct.select.i64(i1, i64, i64) diff --git a/llvm/test/CodeGen/X86/ctselect-i386.ll b/llvm/test/CodeGen/X86/ctselect-i386.ll index 5e63f357d3c44..653fe034d9128 100644 --- a/llvm/test/CodeGen/X86/ctselect-i386.ll +++ b/llvm/test/CodeGen/X86/ctselect-i386.ll @@ -3,188 +3,91 @@ ; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mattr=+cmov | FileCheck %s --check-prefix=I386-CMOV ; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mattr=-cmov -verify-machineinstrs | FileCheck %s --check-prefix=I386-NOCMOV -; Test CTSELECT post-RA expansion on i386 targets -; - Without CMOV: constant-time implementation using new post-RA expansion +; Comprehensive CTSELECT tests for i386 targets with scalar integer types +; - Without CMOV: constant-time implementation using post-RA expansion with bundled instructions ; - With CMOV: CMOV-based implementation +; - Verifies security properties: no conditional branches, constant execution time ; All expansion happens post-RA for better optimization control and constant-time guarantees -define i32 @test_ctselect_i32_reg(i32 %a, i32 %b, i32 %c) nounwind { -; I386-NOCMOV-LABEL: test_ctselect_i32_reg: +; Test basic i32 functionality +define i32 @test_ctselect_i32_basic(i1 %cond, i32 %a, i32 %b) nounwind { +; I386-NOCMOV-LABEL: test_ctselect_i32_basic: ; I386-NOCMOV: # %bb.0: ; I386-NOCMOV-NEXT: pushl %ebx -; I386-NOCMOV-NEXT: pushl %edi ; I386-NOCMOV-NEXT: pushl %esi ; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx ; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx -; I386-NOCMOV-NEXT: cmpl %edx, {{[0-9]+}}(%esp) -; I386-NOCMOV-NEXT: sete %al -; I386-NOCMOV-NEXT: testb %al, %al +; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) ; I386-NOCMOV-NEXT: BUNDLE ; I386-NOCMOV-NEXT: popl %esi -; I386-NOCMOV-NEXT: popl %edi ; I386-NOCMOV-NEXT: popl %ebx ; I386-NOCMOV-NEXT: retl ; -; I386-CMOV-LABEL: test_ctselect_i32_reg: +; I386-CMOV-LABEL: test_ctselect_i32_basic: ; I386-CMOV: # %bb.0: ; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax -; I386-CMOV-NEXT: cmpl %eax, {{[0-9]+}}(%esp) -; I386-CMOV-NEXT: sete %cl -; I386-CMOV-NEXT: testb %cl, %cl +; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp) ; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %eax ; I386-CMOV-NEXT: retl - %cond = icmp eq i32 %a, %c - %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %b, i32 %c) + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) ret i32 %result } -define i16 @test_ctselect_i16_reg(i16 %a, i16 %b, i16 %c) nounwind { -; I386-NOCMOV-LABEL: test_ctselect_i16_reg: +; Test i16 functionality +define i16 @test_ctselect_i16_basic(i1 %cond, i16 %a, i16 %b) nounwind { +; I386-NOCMOV-LABEL: test_ctselect_i16_basic: ; 
I386-NOCMOV: # %bb.0: ; I386-NOCMOV-NEXT: pushl %ebx -; I386-NOCMOV-NEXT: pushl %edi ; I386-NOCMOV-NEXT: pushl %esi ; I386-NOCMOV-NEXT: movzwl {{[0-9]+}}(%esp), %ecx ; I386-NOCMOV-NEXT: movzwl {{[0-9]+}}(%esp), %edx -; I386-NOCMOV-NEXT: cmpw %dx, {{[0-9]+}}(%esp) -; I386-NOCMOV-NEXT: sete %al -; I386-NOCMOV-NEXT: testb %al, %al +; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) ; I386-NOCMOV-NEXT: BUNDLE ; I386-NOCMOV-NEXT: popl %esi -; I386-NOCMOV-NEXT: popl %edi ; I386-NOCMOV-NEXT: popl %ebx ; I386-NOCMOV-NEXT: retl ; -; I386-CMOV-LABEL: test_ctselect_i16_reg: +; I386-CMOV-LABEL: test_ctselect_i16_basic: ; I386-CMOV: # %bb.0: ; I386-CMOV-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; I386-CMOV-NEXT: cmpw %ax, {{[0-9]+}}(%esp) -; I386-CMOV-NEXT: sete %cl -; I386-CMOV-NEXT: testb %cl, %cl +; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp) ; I386-CMOV-NEXT: cmovnew {{[0-9]+}}(%esp), %ax ; I386-CMOV-NEXT: retl - %cond = icmp eq i16 %a, %c - %result = call i16 @llvm.ct.select.i16(i1 %cond, i16 %b, i16 %c) + %result = call i16 @llvm.ct.select.i16(i1 %cond, i16 %a, i16 %b) ret i16 %result } -define i64 @test_ctselect_i64_reg(i64 %a, i64 %b, i64 %c) nounwind { -; I386-NOCMOV-LABEL: test_ctselect_i64_reg: -; I386-NOCMOV: # %bb.0: -; I386-NOCMOV-NEXT: pushl %ebp -; I386-NOCMOV-NEXT: pushl %ebx -; I386-NOCMOV-NEXT: pushl %edi -; I386-NOCMOV-NEXT: pushl %esi -; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx -; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edi -; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax -; I386-NOCMOV-NEXT: xorl %edi, %eax -; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ebx -; I386-NOCMOV-NEXT: xorl %edx, %ebx -; I386-NOCMOV-NEXT: orl %eax, %ebx -; I386-NOCMOV-NEXT: sete %al -; I386-NOCMOV-NEXT: testb %al, %al -; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %esi -; I386-NOCMOV-NEXT: BUNDLE -; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %esi -; I386-NOCMOV-NEXT: BUNDLE -; I386-NOCMOV-NEXT: popl %esi -; I386-NOCMOV-NEXT: popl %edi -; I386-NOCMOV-NEXT: popl %ebx -; I386-NOCMOV-NEXT: popl %ebp -; I386-NOCMOV-NEXT: retl -; -; I386-CMOV-LABEL: test_ctselect_i64_reg: -; I386-CMOV: # %bb.0: -; I386-CMOV-NEXT: pushl %esi -; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax -; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx -; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx -; I386-CMOV-NEXT: xorl %edx, %ecx -; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %esi -; I386-CMOV-NEXT: xorl %eax, %esi -; I386-CMOV-NEXT: orl %ecx, %esi -; I386-CMOV-NEXT: sete %cl -; I386-CMOV-NEXT: testb %cl, %cl -; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %eax -; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %edx -; I386-CMOV-NEXT: popl %esi -; I386-CMOV-NEXT: retl - %cond = icmp eq i64 %a, %c - %result = call i64 @llvm.ct.select.i64(i1 %cond, i64 %b, i64 %c) - ret i64 %result -} - -define i32 @test_ctselect_i32_mem(i32 %a, i32* %b_ptr, i32 %c) nounwind { -; I386-NOCMOV-LABEL: test_ctselect_i32_mem: -; I386-NOCMOV: # %bb.0: -; I386-NOCMOV-NEXT: pushl %ebx -; I386-NOCMOV-NEXT: pushl %edi -; I386-NOCMOV-NEXT: pushl %esi -; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx -; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax -; I386-NOCMOV-NEXT: movl (%eax), %edx -; I386-NOCMOV-NEXT: cmpl %ecx, {{[0-9]+}}(%esp) -; I386-NOCMOV-NEXT: sete %al -; I386-NOCMOV-NEXT: testb %al, %al -; I386-NOCMOV-NEXT: BUNDLE -; I386-NOCMOV-NEXT: popl %esi -; I386-NOCMOV-NEXT: popl %edi -; I386-NOCMOV-NEXT: popl %ebx -; I386-NOCMOV-NEXT: retl -; -; I386-CMOV-LABEL: test_ctselect_i32_mem: -; I386-CMOV: # %bb.0: -; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx -; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), 
%eax -; I386-CMOV-NEXT: cmpl %eax, {{[0-9]+}}(%esp) -; I386-CMOV-NEXT: sete %dl -; I386-CMOV-NEXT: testb %dl, %dl -; I386-CMOV-NEXT: cmovnel (%ecx), %eax -; I386-CMOV-NEXT: retl - %b = load i32, i32* %b_ptr - %cond = icmp eq i32 %a, %c - %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %b, i32 %c) - ret i32 %result -} - -; Test various condition codes -define i32 @test_ctselect_different_cond(i32 %a, i32 %b, i32 %c) nounwind { -; I386-NOCMOV-LABEL: test_ctselect_different_cond: +; Test i8 functionality +define i8 @test_ctselect_i8_basic(i1 %cond, i8 %a, i8 %b) nounwind { +; I386-NOCMOV-LABEL: test_ctselect_i8_basic: ; I386-NOCMOV: # %bb.0: ; I386-NOCMOV-NEXT: pushl %ebx -; I386-NOCMOV-NEXT: pushl %edi ; I386-NOCMOV-NEXT: pushl %esi ; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx ; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx -; I386-NOCMOV-NEXT: cmpl %edx, {{[0-9]+}}(%esp) -; I386-NOCMOV-NEXT: setl %al -; I386-NOCMOV-NEXT: testb %al, %al +; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) ; I386-NOCMOV-NEXT: BUNDLE +; I386-NOCMOV-NEXT: # kill: def $al killed $al killed $eax ; I386-NOCMOV-NEXT: popl %esi -; I386-NOCMOV-NEXT: popl %edi ; I386-NOCMOV-NEXT: popl %ebx ; I386-NOCMOV-NEXT: retl ; -; I386-CMOV-LABEL: test_ctselect_different_cond: +; I386-CMOV-LABEL: test_ctselect_i8_basic: ; I386-CMOV: # %bb.0: ; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax -; I386-CMOV-NEXT: cmpl %eax, {{[0-9]+}}(%esp) -; I386-CMOV-NEXT: setl %cl -; I386-CMOV-NEXT: testb %cl, %cl +; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp) ; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %eax +; I386-CMOV-NEXT: # kill: def $al killed $al killed $eax ; I386-CMOV-NEXT: retl - %cond = icmp slt i32 %a, %c - %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %b, i32 %c) - ret i32 %result + %result = call i8 @llvm.ct.select.i8(i1 %cond, i8 %a, i8 %b) + ret i8 %result } -; Verify no conditional branches in constant-time version -define i32 @test_no_branches(i32 %secret, i32 %public1, i32 %public2) nounwind { -; I386-NOCMOV-LABEL: test_no_branches: +; Test security property: constant-time execution for cryptographic use case +define i32 @test_crypto_key_select(i32 %secret_bit, i32 %key1, i32 %key2) nounwind { +; I386-NOCMOV-LABEL: test_crypto_key_select: ; I386-NOCMOV: # %bb.0: ; I386-NOCMOV-NEXT: pushl %ebx -; I386-NOCMOV-NEXT: pushl %edi ; I386-NOCMOV-NEXT: pushl %esi ; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx ; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx @@ -193,11 +96,10 @@ define i32 @test_no_branches(i32 %secret, i32 %public1, i32 %public2) nounwind { ; I386-NOCMOV-NEXT: testb %al, %al ; I386-NOCMOV-NEXT: BUNDLE ; I386-NOCMOV-NEXT: popl %esi -; I386-NOCMOV-NEXT: popl %edi ; I386-NOCMOV-NEXT: popl %ebx ; I386-NOCMOV-NEXT: retl ; -; I386-CMOV-LABEL: test_no_branches: +; I386-CMOV-LABEL: test_crypto_key_select: ; I386-CMOV: # %bb.0: ; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax ; I386-CMOV-NEXT: cmpl $0, {{[0-9]+}}(%esp) @@ -205,49 +107,45 @@ define i32 @test_no_branches(i32 %secret, i32 %public1, i32 %public2) nounwind { ; I386-CMOV-NEXT: testb %cl, %cl ; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %eax ; I386-CMOV-NEXT: retl - %cond = icmp ne i32 %secret, 0 - %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %public1, i32 %public2) + %cond = icmp ne i32 %secret_bit, 0 + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %key1, i32 %key2) ret i32 %result } -; Test edge cases for post-RA expansion -define i32 @test_ctselect_zero_one(i32 %cond) nounwind { -; I386-NOCMOV-LABEL: test_ctselect_zero_one: +; Test that no 
conditional branches appear in constant-time path +define i32 @test_no_conditional_branches(i32 %secret, i32 %val1, i32 %val2) nounwind { +; I386-NOCMOV-LABEL: test_no_conditional_branches: ; I386-NOCMOV: # %bb.0: ; I386-NOCMOV-NEXT: pushl %ebx -; I386-NOCMOV-NEXT: pushl %edi ; I386-NOCMOV-NEXT: pushl %esi +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx ; I386-NOCMOV-NEXT: cmpl $0, {{[0-9]+}}(%esp) ; I386-NOCMOV-NEXT: setne %al -; I386-NOCMOV-NEXT: xorl %ecx, %ecx ; I386-NOCMOV-NEXT: testb %al, %al -; I386-NOCMOV-NEXT: movl $1, %edx ; I386-NOCMOV-NEXT: BUNDLE ; I386-NOCMOV-NEXT: popl %esi -; I386-NOCMOV-NEXT: popl %edi ; I386-NOCMOV-NEXT: popl %ebx ; I386-NOCMOV-NEXT: retl ; -; I386-CMOV-LABEL: test_ctselect_zero_one: +; I386-CMOV-LABEL: test_no_conditional_branches: ; I386-CMOV: # %bb.0: +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax ; I386-CMOV-NEXT: cmpl $0, {{[0-9]+}}(%esp) ; I386-CMOV-NEXT: setne %cl -; I386-CMOV-NEXT: xorl %eax, %eax ; I386-CMOV-NEXT: testb %cl, %cl -; I386-CMOV-NEXT: movl $1, %ecx -; I386-CMOV-NEXT: cmovnel %ecx, %eax +; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %eax ; I386-CMOV-NEXT: retl - %test = icmp ne i32 %cond, 0 - %result = call i32 @llvm.ct.select.i32(i1 %test, i32 1, i32 0) + %cond = icmp ne i32 %secret, 0 + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %val1, i32 %val2) ret i32 %result } -; Test bundling behavior - instructions should be bundled for atomic treatment -define i32 @test_ctselect_bundling(i32 %a, i32 %b, i32 %c) nounwind { -; I386-NOCMOV-LABEL: test_ctselect_bundling: +; Test with comparison condition +define i32 @test_ctselect_i32_cmp(i32 %a, i32 %b, i32 %c) nounwind { +; I386-NOCMOV-LABEL: test_ctselect_i32_cmp: ; I386-NOCMOV: # %bb.0: ; I386-NOCMOV-NEXT: pushl %ebx -; I386-NOCMOV-NEXT: pushl %edi ; I386-NOCMOV-NEXT: pushl %esi ; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx ; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx @@ -256,11 +154,10 @@ define i32 @test_ctselect_bundling(i32 %a, i32 %b, i32 %c) nounwind { ; I386-NOCMOV-NEXT: testb %al, %al ; I386-NOCMOV-NEXT: BUNDLE ; I386-NOCMOV-NEXT: popl %esi -; I386-NOCMOV-NEXT: popl %edi ; I386-NOCMOV-NEXT: popl %ebx ; I386-NOCMOV-NEXT: retl ; -; I386-CMOV-LABEL: test_ctselect_bundling: +; I386-CMOV-LABEL: test_ctselect_i32_cmp: ; I386-CMOV: # %bb.0: ; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax ; I386-CMOV-NEXT: cmpl %eax, {{[0-9]+}}(%esp) @@ -273,40 +170,40 @@ define i32 @test_ctselect_bundling(i32 %a, i32 %b, i32 %c) nounwind { ret i32 %result } -; Test i8 promotion to i32 for i386 -define i8 @test_ctselect_i8_promotion(i8 %a, i8 %b, i8 %c) nounwind { -; I386-NOCMOV-LABEL: test_ctselect_i8_promotion: +; Test nested selects +define i32 @test_ctselect_nested(i1 %cond1, i1 %cond2, i32 %a, i32 %b, i32 %c) nounwind { +; I386-NOCMOV-LABEL: test_ctselect_nested: ; I386-NOCMOV: # %bb.0: ; I386-NOCMOV-NEXT: pushl %ebx ; I386-NOCMOV-NEXT: pushl %edi ; I386-NOCMOV-NEXT: pushl %esi ; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx -; I386-NOCMOV-NEXT: cmpb %cl, {{[0-9]+}}(%esp) -; I386-NOCMOV-NEXT: sete %al +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax ; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx -; I386-NOCMOV-NEXT: testb %al, %al +; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: BUNDLE +; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) ; I386-NOCMOV-NEXT: BUNDLE -; I386-NOCMOV-NEXT: # kill: def $al killed $al killed $eax ; I386-NOCMOV-NEXT: popl %esi ; I386-NOCMOV-NEXT: popl %edi ; I386-NOCMOV-NEXT: popl %ebx ; 
I386-NOCMOV-NEXT: retl ; -; I386-CMOV-LABEL: test_ctselect_i8_promotion: +; I386-CMOV-LABEL: test_ctselect_nested: ; I386-CMOV: # %bb.0: ; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax -; I386-CMOV-NEXT: cmpb %al, {{[0-9]+}}(%esp) -; I386-CMOV-NEXT: sete %cl -; I386-CMOV-NEXT: testb %cl, %cl -; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %eax -; I386-CMOV-NEXT: # kill: def $al killed $al killed $eax +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %ecx +; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: cmovnel %ecx, %eax ; I386-CMOV-NEXT: retl - %cond = icmp eq i8 %a, %c - %result = call i8 @llvm.ct.select.i8(i1 %cond, i8 %b, i8 %c) - ret i8 %result + %sel1 = call i32 @llvm.ct.select.i32(i1 %cond2, i32 %a, i32 %b) + %sel2 = call i32 @llvm.ct.select.i32(i1 %cond1, i32 %sel1, i32 %c) + ret i32 %sel2 } +; Declare ct.select intrinsics declare i8 @llvm.ct.select.i8(i1, i8, i8) declare i16 @llvm.ct.select.i16(i1, i16, i16) declare i32 @llvm.ct.select.i32(i1, i32, i32) -declare i64 @llvm.ct.select.i64(i1, i64, i64) From ab15655cf828f45b1855b059008e03c3a3e49552 Mon Sep 17 00:00:00 2001 From: Henrik Brodin <90325907+hbrodin@users.noreply.github.com> Date: Mon, 1 Sep 2025 17:20:12 +0200 Subject: [PATCH 50/63] [CT] Fix bundle unpacking --- llvm/lib/Target/X86/X86InstrInfo.cpp | 9 +++ llvm/test/CodeGen/X86/ctselect-i386-mmx.ll | 90 +++++++++++++++++++--- llvm/test/CodeGen/X86/ctselect-i386.ll | 72 +++++++++++++++-- 3 files changed, 153 insertions(+), 18 deletions(-) diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp index fa98d0fc4cf27..5769119e83081 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -1022,6 +1022,15 @@ bool X86InstrInfo::expandCtSelectIntWithoutCMOV(MachineInstr &MI) const { // Remove the original pseudo instruction MI.eraseFromParent(); + + // Bundle all generated instructions for atomic execution + auto BundleEnd = MI.getIterator(); + if (BundleStart != BundleEnd) { + // Only bundle if we have multiple instructions + MachineInstr *BundleHeader = + BuildMI(*MBB, BundleStart, DL, get(TargetOpcode::BUNDLE)); + finalizeBundle(*MBB, BundleHeader->getIterator(), std::next(BundleEnd)); + } return true; } diff --git a/llvm/test/CodeGen/X86/ctselect-i386-mmx.ll b/llvm/test/CodeGen/X86/ctselect-i386-mmx.ll index bdd88c5d682d2..f410560b7a78f 100644 --- a/llvm/test/CodeGen/X86/ctselect-i386-mmx.ll +++ b/llvm/test/CodeGen/X86/ctselect-i386-mmx.ll @@ -31,9 +31,23 @@ define <1 x i64> @test_mmx_ctselect_with_paddd(i32 %cond, i64 %a, i64 %b) { ; I386-NOCMOV-NEXT: cmpl $0, {{[0-9]+}}(%esp) ; I386-NOCMOV-NEXT: setne %bl ; I386-NOCMOV-NEXT: testb %bl, %bl -; I386-NOCMOV-NEXT: BUNDLE +; I386-NOCMOV-NEXT: sete %bl +; I386-NOCMOV-NEXT: movzbl %bl, %ebp +; I386-NOCMOV-NEXT: negl %ebp +; I386-NOCMOV-NEXT: movl %esi, %edi +; I386-NOCMOV-NEXT: andl %ebp, %edi +; I386-NOCMOV-NEXT: notl %ebp +; I386-NOCMOV-NEXT: andl %ecx, %ebp +; I386-NOCMOV-NEXT: orl %ebp, %edi ; I386-NOCMOV-NEXT: movl %edi, {{[0-9]+}}(%esp) -; I386-NOCMOV-NEXT: BUNDLE +; I386-NOCMOV-NEXT: sete %bl +; I386-NOCMOV-NEXT: movzbl %bl, %esi +; I386-NOCMOV-NEXT: negl %esi +; I386-NOCMOV-NEXT: movl %edx, %ecx +; I386-NOCMOV-NEXT: andl %esi, %ecx +; I386-NOCMOV-NEXT: notl %esi +; I386-NOCMOV-NEXT: andl %eax, %esi +; I386-NOCMOV-NEXT: orl %esi, %ecx ; I386-NOCMOV-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; I386-NOCMOV-NEXT: movq {{[0-9]+}}(%esp), %mm0 ; I386-NOCMOV-NEXT: paddd 
%mm0, %mm0 @@ -106,9 +120,23 @@ define <1 x i64> @test_mmx_ctselect_with_psllw(i32 %cond, i64 %a, i64 %b) { ; I386-NOCMOV-NEXT: cmpl $0, {{[0-9]+}}(%esp) ; I386-NOCMOV-NEXT: setne %bl ; I386-NOCMOV-NEXT: testb %bl, %bl -; I386-NOCMOV-NEXT: BUNDLE +; I386-NOCMOV-NEXT: sete %bl +; I386-NOCMOV-NEXT: movzbl %bl, %ebp +; I386-NOCMOV-NEXT: negl %ebp +; I386-NOCMOV-NEXT: movl %esi, %edi +; I386-NOCMOV-NEXT: andl %ebp, %edi +; I386-NOCMOV-NEXT: notl %ebp +; I386-NOCMOV-NEXT: andl %ecx, %ebp +; I386-NOCMOV-NEXT: orl %ebp, %edi ; I386-NOCMOV-NEXT: movl %edi, {{[0-9]+}}(%esp) -; I386-NOCMOV-NEXT: BUNDLE +; I386-NOCMOV-NEXT: sete %bl +; I386-NOCMOV-NEXT: movzbl %bl, %esi +; I386-NOCMOV-NEXT: negl %esi +; I386-NOCMOV-NEXT: movl %edx, %ecx +; I386-NOCMOV-NEXT: andl %esi, %ecx +; I386-NOCMOV-NEXT: notl %esi +; I386-NOCMOV-NEXT: andl %eax, %esi +; I386-NOCMOV-NEXT: orl %esi, %ecx ; I386-NOCMOV-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; I386-NOCMOV-NEXT: movq {{[0-9]+}}(%esp), %mm0 ; I386-NOCMOV-NEXT: psllw %mm0, %mm0 @@ -183,14 +211,42 @@ define <1 x i64> @test_mmx_nested_ctselect_with_pand(i32 %cond1, i32 %cond2, i64 ; I386-NOCMOV-NEXT: cmpl $0, {{[0-9]+}}(%esp) ; I386-NOCMOV-NEXT: setne %ah ; I386-NOCMOV-NEXT: testb %ah, %ah -; I386-NOCMOV-NEXT: BUNDLE -; I386-NOCMOV-NEXT: BUNDLE +; I386-NOCMOV-NEXT: sete %ch +; I386-NOCMOV-NEXT: movzbl %ch, %ebp +; I386-NOCMOV-NEXT: negl %ebp +; I386-NOCMOV-NEXT: movl %edi, %eax +; I386-NOCMOV-NEXT: andl %ebp, %eax +; I386-NOCMOV-NEXT: notl %ebp +; I386-NOCMOV-NEXT: andl %edx, %ebp +; I386-NOCMOV-NEXT: orl %ebp, %eax +; I386-NOCMOV-NEXT: sete %ch +; I386-NOCMOV-NEXT: movzbl %ch, %edi +; I386-NOCMOV-NEXT: negl %edi +; I386-NOCMOV-NEXT: movl %ebx, %edx +; I386-NOCMOV-NEXT: andl %edi, %edx +; I386-NOCMOV-NEXT: notl %edi +; I386-NOCMOV-NEXT: andl %esi, %edi +; I386-NOCMOV-NEXT: orl %edi, %edx ; I386-NOCMOV-NEXT: testb %cl, %cl ; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edi -; I386-NOCMOV-NEXT: BUNDLE +; I386-NOCMOV-NEXT: sete %bl +; I386-NOCMOV-NEXT: movzbl %bl, %esi +; I386-NOCMOV-NEXT: negl %esi +; I386-NOCMOV-NEXT: movl %edi, %ecx +; I386-NOCMOV-NEXT: andl %esi, %ecx +; I386-NOCMOV-NEXT: notl %esi +; I386-NOCMOV-NEXT: andl %edx, %esi +; I386-NOCMOV-NEXT: orl %esi, %ecx ; I386-NOCMOV-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edi -; I386-NOCMOV-NEXT: BUNDLE +; I386-NOCMOV-NEXT: sete %dl +; I386-NOCMOV-NEXT: movzbl %dl, %esi +; I386-NOCMOV-NEXT: negl %esi +; I386-NOCMOV-NEXT: movl %edi, %ecx +; I386-NOCMOV-NEXT: andl %esi, %ecx +; I386-NOCMOV-NEXT: notl %esi +; I386-NOCMOV-NEXT: andl %eax, %esi +; I386-NOCMOV-NEXT: orl %esi, %ecx ; I386-NOCMOV-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; I386-NOCMOV-NEXT: movq {{[0-9]+}}(%esp), %mm0 ; I386-NOCMOV-NEXT: pand %mm0, %mm0 @@ -283,9 +339,23 @@ define <1 x i64> @test_mmx_ctselect_with_por(i32 %cond, i64 %a, i64 %b) { ; I386-NOCMOV-NEXT: cmpl $0, {{[0-9]+}}(%esp) ; I386-NOCMOV-NEXT: setne %bl ; I386-NOCMOV-NEXT: testb %bl, %bl -; I386-NOCMOV-NEXT: BUNDLE +; I386-NOCMOV-NEXT: sete %bl +; I386-NOCMOV-NEXT: movzbl %bl, %ebp +; I386-NOCMOV-NEXT: negl %ebp +; I386-NOCMOV-NEXT: movl %esi, %edi +; I386-NOCMOV-NEXT: andl %ebp, %edi +; I386-NOCMOV-NEXT: notl %ebp +; I386-NOCMOV-NEXT: andl %ecx, %ebp +; I386-NOCMOV-NEXT: orl %ebp, %edi ; I386-NOCMOV-NEXT: movl %edi, {{[0-9]+}}(%esp) -; I386-NOCMOV-NEXT: BUNDLE +; I386-NOCMOV-NEXT: sete %bl +; I386-NOCMOV-NEXT: movzbl %bl, %esi +; I386-NOCMOV-NEXT: negl %esi +; I386-NOCMOV-NEXT: movl %edx, %ecx +; I386-NOCMOV-NEXT: andl %esi, %ecx +; I386-NOCMOV-NEXT: notl %esi +; 
I386-NOCMOV-NEXT: andl %eax, %esi +; I386-NOCMOV-NEXT: orl %esi, %ecx ; I386-NOCMOV-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; I386-NOCMOV-NEXT: movq {{[0-9]+}}(%esp), %mm0 ; I386-NOCMOV-NEXT: por %mm0, %mm0 diff --git a/llvm/test/CodeGen/X86/ctselect-i386.ll b/llvm/test/CodeGen/X86/ctselect-i386.ll index 653fe034d9128..94101f6a16b23 100644 --- a/llvm/test/CodeGen/X86/ctselect-i386.ll +++ b/llvm/test/CodeGen/X86/ctselect-i386.ll @@ -18,7 +18,14 @@ define i32 @test_ctselect_i32_basic(i1 %cond, i32 %a, i32 %b) nounwind { ; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx ; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx ; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) -; I386-NOCMOV-NEXT: BUNDLE +; I386-NOCMOV-NEXT: sete %bl +; I386-NOCMOV-NEXT: movzbl %bl, %esi +; I386-NOCMOV-NEXT: negl %esi +; I386-NOCMOV-NEXT: movl %edx, %eax +; I386-NOCMOV-NEXT: andl %esi, %eax +; I386-NOCMOV-NEXT: notl %esi +; I386-NOCMOV-NEXT: andl %ecx, %esi +; I386-NOCMOV-NEXT: orl %esi, %eax ; I386-NOCMOV-NEXT: popl %esi ; I386-NOCMOV-NEXT: popl %ebx ; I386-NOCMOV-NEXT: retl @@ -42,7 +49,14 @@ define i16 @test_ctselect_i16_basic(i1 %cond, i16 %a, i16 %b) nounwind { ; I386-NOCMOV-NEXT: movzwl {{[0-9]+}}(%esp), %ecx ; I386-NOCMOV-NEXT: movzwl {{[0-9]+}}(%esp), %edx ; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) -; I386-NOCMOV-NEXT: BUNDLE +; I386-NOCMOV-NEXT: sete %bl +; I386-NOCMOV-NEXT: movzbl %bl, %esi +; I386-NOCMOV-NEXT: negw %si +; I386-NOCMOV-NEXT: movl %edx, %eax +; I386-NOCMOV-NEXT: andw %si, %ax +; I386-NOCMOV-NEXT: notw %si +; I386-NOCMOV-NEXT: andw %cx, %si +; I386-NOCMOV-NEXT: orw %si, %ax ; I386-NOCMOV-NEXT: popl %esi ; I386-NOCMOV-NEXT: popl %ebx ; I386-NOCMOV-NEXT: retl @@ -66,7 +80,14 @@ define i8 @test_ctselect_i8_basic(i1 %cond, i8 %a, i8 %b) nounwind { ; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx ; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx ; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) -; I386-NOCMOV-NEXT: BUNDLE +; I386-NOCMOV-NEXT: sete %bl +; I386-NOCMOV-NEXT: movzbl %bl, %esi +; I386-NOCMOV-NEXT: negl %esi +; I386-NOCMOV-NEXT: movl %edx, %eax +; I386-NOCMOV-NEXT: andl %esi, %eax +; I386-NOCMOV-NEXT: notl %esi +; I386-NOCMOV-NEXT: andl %ecx, %esi +; I386-NOCMOV-NEXT: orl %esi, %eax ; I386-NOCMOV-NEXT: # kill: def $al killed $al killed $eax ; I386-NOCMOV-NEXT: popl %esi ; I386-NOCMOV-NEXT: popl %ebx @@ -94,7 +115,14 @@ define i32 @test_crypto_key_select(i32 %secret_bit, i32 %key1, i32 %key2) nounwi ; I386-NOCMOV-NEXT: cmpl $0, {{[0-9]+}}(%esp) ; I386-NOCMOV-NEXT: setne %al ; I386-NOCMOV-NEXT: testb %al, %al -; I386-NOCMOV-NEXT: BUNDLE +; I386-NOCMOV-NEXT: sete %bl +; I386-NOCMOV-NEXT: movzbl %bl, %esi +; I386-NOCMOV-NEXT: negl %esi +; I386-NOCMOV-NEXT: movl %edx, %eax +; I386-NOCMOV-NEXT: andl %esi, %eax +; I386-NOCMOV-NEXT: notl %esi +; I386-NOCMOV-NEXT: andl %ecx, %esi +; I386-NOCMOV-NEXT: orl %esi, %eax ; I386-NOCMOV-NEXT: popl %esi ; I386-NOCMOV-NEXT: popl %ebx ; I386-NOCMOV-NEXT: retl @@ -123,7 +151,14 @@ define i32 @test_no_conditional_branches(i32 %secret, i32 %val1, i32 %val2) noun ; I386-NOCMOV-NEXT: cmpl $0, {{[0-9]+}}(%esp) ; I386-NOCMOV-NEXT: setne %al ; I386-NOCMOV-NEXT: testb %al, %al -; I386-NOCMOV-NEXT: BUNDLE +; I386-NOCMOV-NEXT: sete %bl +; I386-NOCMOV-NEXT: movzbl %bl, %esi +; I386-NOCMOV-NEXT: negl %esi +; I386-NOCMOV-NEXT: movl %edx, %eax +; I386-NOCMOV-NEXT: andl %esi, %eax +; I386-NOCMOV-NEXT: notl %esi +; I386-NOCMOV-NEXT: andl %ecx, %esi +; I386-NOCMOV-NEXT: orl %esi, %eax ; I386-NOCMOV-NEXT: popl %esi ; I386-NOCMOV-NEXT: popl %ebx ; I386-NOCMOV-NEXT: retl @@ -152,7 +187,14 @@ 
define i32 @test_ctselect_i32_cmp(i32 %a, i32 %b, i32 %c) nounwind {
; I386-NOCMOV-NEXT: cmpl %edx, {{[0-9]+}}(%esp)
; I386-NOCMOV-NEXT: sete %al
; I386-NOCMOV-NEXT: testb %al, %al
-; I386-NOCMOV-NEXT: BUNDLE
+; I386-NOCMOV-NEXT: sete %bl
+; I386-NOCMOV-NEXT: movzbl %bl, %esi
+; I386-NOCMOV-NEXT: negl %esi
+; I386-NOCMOV-NEXT: movl %edx, %eax
+; I386-NOCMOV-NEXT: andl %esi, %eax
+; I386-NOCMOV-NEXT: notl %esi
+; I386-NOCMOV-NEXT: andl %ecx, %esi
+; I386-NOCMOV-NEXT: orl %esi, %eax
; I386-NOCMOV-NEXT: popl %esi
; I386-NOCMOV-NEXT: popl %ebx
; I386-NOCMOV-NEXT: retl
@@ -181,9 +223,23 @@ define i32 @test_ctselect_nested(i1 %cond1, i1 %cond2, i32 %a, i32 %b, i32 %c) n
; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
-; I386-NOCMOV-NEXT: BUNDLE
+; I386-NOCMOV-NEXT: sete %bl
+; I386-NOCMOV-NEXT: movzbl %bl, %edi
+; I386-NOCMOV-NEXT: negl %edi
+; I386-NOCMOV-NEXT: movl %edx, %esi
+; I386-NOCMOV-NEXT: andl %edi, %esi
+; I386-NOCMOV-NEXT: notl %edi
+; I386-NOCMOV-NEXT: andl %eax, %edi
+; I386-NOCMOV-NEXT: orl %edi, %esi
; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
-; I386-NOCMOV-NEXT: BUNDLE
+; I386-NOCMOV-NEXT: sete %dl
+; I386-NOCMOV-NEXT: movzbl %dl, %edi
+; I386-NOCMOV-NEXT: negl %edi
+; I386-NOCMOV-NEXT: movl %ecx, %eax
+; I386-NOCMOV-NEXT: andl %edi, %eax
+; I386-NOCMOV-NEXT: notl %edi
+; I386-NOCMOV-NEXT: andl %esi, %edi
+; I386-NOCMOV-NEXT: orl %edi, %eax
; I386-NOCMOV-NEXT: popl %esi
; I386-NOCMOV-NEXT: popl %edi
; I386-NOCMOV-NEXT: popl %ebx
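The sete/movzbl/negl/andl/notl/orl runs in the NOCMOV check lines above are the classic branchless mask select. As a reference point, a minimal C++ sketch of the pattern those expansions implement (an illustration of the technique, not code from this series):

    #include <cstdint>

    // mask is all-ones when cond is set and all-zeros otherwise, so the
    // AND/OR network keeps exactly one operand. Neither the control flow
    // nor the executed instruction sequence depends on cond, which is
    // what makes the selection constant-time.
    static inline uint32_t ct_select_u32(bool cond, uint32_t a, uint32_t b) {
      uint32_t mask = -static_cast<uint32_t>(cond); // 0x0 or 0xFFFFFFFF
      return (a & mask) | (b & ~mask);
    }

Wider types follow by reusing the same mask per 32-bit chunk, which is how the i64 cases legalize on i386.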
From c03aa50a8996797488a6cf0a949639d4c5b5592d Mon Sep 17 00:00:00 2001
From: Henrik Brodin <90325907+hbrodin@users.noreply.github.com>
Date: Wed, 24 Sep 2025 11:46:08 +0200
Subject: [PATCH 51/63] [CT] Properly expand BUNDLEs on x86
---
 llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp | 4 ++++
 llvm/lib/Target/X86/X86TargetMachine.cpp | 2 +-
 2 files changed, 5 insertions(+), 1 deletion(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 90d9ac76b6e57..b096953f67fc4 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -6834,6 +6834,10 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, return; }
 case Intrinsic::ct_select: {
+ // Set function attribute to indicate ct.select usage
+ Function &F = DAG.getMachineFunction().getFunction();
+ F.addFnAttr("ct-select");
+
 SDLoc DL = getCurSDLoc();
 SDValue Cond = getValue(I.getArgOperand(0)); // i1
diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp
index 4425b3eebee8e..a776b54912c16 100644
--- a/llvm/lib/Target/X86/X86TargetMachine.cpp
+++ b/llvm/lib/Target/X86/X86TargetMachine.cpp
@@ -617,7 +617,7 @@ void X86PassConfig::addPreEmitPass2() {
 // ObjC runtime functions present in the module.
 const Function &F = MF.getFunction();
 const Module *M = F.getParent();
- return M->getModuleFlag("kcfi") ||
+ return M->getModuleFlag("kcfi") || F.hasFnAttribute("ct-select") ||
 (TT.isOSDarwin() && (M->getFunction("objc_retainAutoreleasedReturnValue") || M->getFunction("objc_unsafeClaimAutoreleasedReturnValue"))) ||
From b321ad985a50b7286bc98771b460b850275d8430 Mon Sep 17 00:00:00 2001
From: Henrik Brodin <90325907+hbrodin@users.noreply.github.com>
Date: Wed, 24 Sep 2025 11:47:18 +0200
Subject: [PATCH 52/63] [CT] Fix bundled sequence for ints
---
 llvm/lib/Target/X86/X86InstrInfo.cpp | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index 5769119e83081..c1e09a3d22b7b 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -1027,10 +1027,11 @@ bool X86InstrInfo::expandCtSelectIntWithoutCMOV(MachineInstr &MI) const {
 auto BundleEnd = MI.getIterator();
 if (BundleStart != BundleEnd) {
 // Only bundle if we have multiple instructions
- MachineInstr *BundleHeader =
- BuildMI(*MBB, BundleStart, DL, get(TargetOpcode::BUNDLE));
- finalizeBundle(*MBB, BundleHeader->getIterator(), std::next(BundleEnd));
+ finalizeBundle(*MBB, BundleStart, BundleEnd);
 }
+
+ // Remove the original pseudo instruction
+ MI.eraseFromParent();
 return true;
}
From 4d69736b61d3e9a71a24081898fbcf21ca807147 Mon Sep 17 00:00:00 2001
From: Henrik Brodin <90325907+hbrodin@users.noreply.github.com>
Date: Wed, 24 Sep 2025 15:45:46 +0200
Subject: [PATCH 53/63] [CT] FP32 via integer ct-select
---
 llvm/lib/Target/X86/X86ISelLowering.cpp | 12 ++++++++++++
 1 file changed, 12 insertions(+)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index e0f16020be80f..793b2bf4731d4 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -25520,6 +25520,18 @@ SDValue X86TargetLowering::LowerCTSELECT(SDValue Op, SelectionDAG &DAG) const {
 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), CtSelect);
 }
+ // Handle floating point on i386 without SSE/CMOV (constant-time requirement)
+ if (!Subtarget.hasSSE1() && VT.isFloatingPoint() && !VT.isVector()) {
+ if (VT == MVT::f32) {
+ // Bitcast f32 to i32, use raw condition with ISD::CTSELECT (avoids EFLAGS redundancy)
+ TrueOp = DAG.getBitcast(MVT::i32, TrueOp);
+ FalseOp = DAG.getBitcast(MVT::i32, FalseOp);
+ SDValue CtSelect = DAG.getNode(ISD::CTSELECT, DL, MVT::i32, Cond, TrueOp, FalseOp);
+ return DAG.getBitcast(VT, CtSelect);
+ }
+ // For f64 and f80 on i386, fall through to generic handling for now
+ }
+
 if (isScalarFPTypeInSSEReg(VT)) {
 MVT IntVT = (VT == MVT::f32) ?
MVT::i32 : MVT::i64; TrueOp = DAG.getBitcast(IntVT, TrueOp); From 7a9c27a30ee1b90dd29b84ecc18af03546216136 Mon Sep 17 00:00:00 2001 From: Henrik Brodin <90325907+hbrodin@users.noreply.github.com> Date: Wed, 24 Sep 2025 17:53:47 +0200 Subject: [PATCH 54/63] [CT] FP64 via two 32-bit integer ctselect --- llvm/lib/Target/X86/X86ISelLowering.cpp | 48 +++++++++++++++++++++++-- 1 file changed, 46 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 793b2bf4731d4..60fc0faf84fdc 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -25528,8 +25528,52 @@ SDValue X86TargetLowering::LowerCTSELECT(SDValue Op, SelectionDAG &DAG) const { FalseOp = DAG.getBitcast(MVT::i32, FalseOp); SDValue CtSelect = DAG.getNode(ISD::CTSELECT, DL, MVT::i32, Cond, TrueOp, FalseOp); return DAG.getBitcast(VT, CtSelect); - } - // For f64 and f80 on i386, fall through to generic handling for now + } else if (VT == MVT::f64) { + // For f64 on i386, avoid all i64 operations by using memory to split/reassemble + // TODO: Consider creating CTSELECT_I386_F64mm pseudo instruction + // for single bundled 64-bit memory-based post-RA expansion + + SDValue Chain = DAG.getEntryNode(); + + // Create temporary stack slots for input f64 values + SDValue TrueSlot = DAG.CreateStackTemporary(MVT::f64); + SDValue FalseSlot = DAG.CreateStackTemporary(MVT::f64); + + // Store f64 values to memory + SDValue StoreTrueF64 = DAG.getStore(Chain, DL, TrueOp, TrueSlot, + MachinePointerInfo()); + SDValue StoreFalseF64 = DAG.getStore(Chain, DL, FalseOp, FalseSlot, + MachinePointerInfo()); + + // Load i32 parts from memory (lo at offset 0, hi at offset 4) + SDValue TrueLo = DAG.getLoad(MVT::i32, DL, StoreTrueF64, TrueSlot, + MachinePointerInfo()); + SDValue TrueHiPtr = DAG.getMemBasePlusOffset(TrueSlot, TypeSize::getFixed(4), DL); + SDValue TrueHi = DAG.getLoad(MVT::i32, DL, StoreTrueF64, TrueHiPtr, + MachinePointerInfo()); + + SDValue FalseLo = DAG.getLoad(MVT::i32, DL, StoreFalseF64, FalseSlot, + MachinePointerInfo()); + SDValue FalseHiPtr = DAG.getMemBasePlusOffset(FalseSlot, TypeSize::getFixed(4), DL); + SDValue FalseHi = DAG.getLoad(MVT::i32, DL, StoreFalseF64, FalseHiPtr, + MachinePointerInfo()); + + // Create two i32 CTSELECT operations + SDValue LoSelect = DAG.getNode(ISD::CTSELECT, DL, MVT::i32, Cond, TrueLo, FalseLo); + SDValue HiSelect = DAG.getNode(ISD::CTSELECT, DL, MVT::i32, Cond, TrueHi, FalseHi); + + // Create result stack slot and store the selected parts + SDValue ResultSlot = DAG.CreateStackTemporary(MVT::f64); + SDValue StoreResLo = DAG.getStore(Chain, DL, LoSelect, ResultSlot, + MachinePointerInfo()); + SDValue ResHiPtr = DAG.getMemBasePlusOffset(ResultSlot, TypeSize::getFixed(4), DL); + SDValue StoreResHi = DAG.getStore(StoreResLo, DL, HiSelect, ResHiPtr, + MachinePointerInfo()); + + // Load complete f64 result from memory + return DAG.getLoad(MVT::f64, DL, StoreResHi, ResultSlot, MachinePointerInfo()); + } + // For f80 on i386, fall through to generic handling for now } if (isScalarFPTypeInSSEReg(VT)) { From 878d53db18dd3da81a5146ac75bb576efc6da07a Mon Sep 17 00:00:00 2001 From: Henrik Brodin <90325907+hbrodin@users.noreply.github.com> Date: Wed, 24 Sep 2025 18:02:37 +0200 Subject: [PATCH 55/63] [CT] FP80 support via 32-bit integer ctselect --- llvm/lib/Target/X86/X86ISelLowering.cpp | 56 ++++++++++++++++++++++++- 1 file changed, 55 insertions(+), 1 deletion(-) diff --git 
a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 60fc0faf84fdc..c688894cefa0e 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -25572,8 +25572,62 @@ SDValue X86TargetLowering::LowerCTSELECT(SDValue Op, SelectionDAG &DAG) const {
 // Load complete f64 result from memory
 return DAG.getLoad(MVT::f64, DL, StoreResHi, ResultSlot, MachinePointerInfo());
+ } else if (VT == MVT::f80) {
+ // For f80 on i386, use memory-based approach with 3×32-bit chunks
+ // f80 is stored as 96 bits (80 bits + 16 padding), handled as 3×i32
+ // TODO: Consider creating CTSELECT_I386_F80mm pseudo instruction
+ // for single bundled 80-bit memory-based post-RA expansion
+
+ SDValue Chain = DAG.getEntryNode();
+
+ // Create temporary stack slots for input f80 values
+ SDValue TrueSlot = DAG.CreateStackTemporary(MVT::f80);
+ SDValue FalseSlot = DAG.CreateStackTemporary(MVT::f80);
+
+ // Store f80 values to memory
+ SDValue StoreTrueF80 = DAG.getStore(Chain, DL, TrueOp, TrueSlot,
+ MachinePointerInfo());
+ SDValue StoreFalseF80 = DAG.getStore(Chain, DL, FalseOp, FalseSlot,
+ MachinePointerInfo());
+
+ // Load i32 parts from memory (3 chunks: [0-3], [4-7], [8-11] bytes)
+ SDValue TruePart0 = DAG.getLoad(MVT::i32, DL, StoreTrueF80, TrueSlot,
+ MachinePointerInfo());
+ SDValue TruePart1Ptr = DAG.getMemBasePlusOffset(TrueSlot, TypeSize::getFixed(4), DL);
+ SDValue TruePart1 = DAG.getLoad(MVT::i32, DL, StoreTrueF80, TruePart1Ptr,
+ MachinePointerInfo());
+ SDValue TruePart2Ptr = DAG.getMemBasePlusOffset(TrueSlot, TypeSize::getFixed(8), DL);
+ SDValue TruePart2 = DAG.getLoad(MVT::i32, DL, StoreTrueF80, TruePart2Ptr,
+ MachinePointerInfo());
+
+ SDValue FalsePart0 = DAG.getLoad(MVT::i32, DL, StoreFalseF80, FalseSlot,
+ MachinePointerInfo());
+ SDValue FalsePart1Ptr = DAG.getMemBasePlusOffset(FalseSlot, TypeSize::getFixed(4), DL);
+ SDValue FalsePart1 = DAG.getLoad(MVT::i32, DL, StoreFalseF80, FalsePart1Ptr,
+ MachinePointerInfo());
+ SDValue FalsePart2Ptr = DAG.getMemBasePlusOffset(FalseSlot, TypeSize::getFixed(8), DL);
+ SDValue FalsePart2 = DAG.getLoad(MVT::i32, DL, StoreFalseF80, FalsePart2Ptr,
+ MachinePointerInfo());
+
+ // Create three i32 CTSELECT operations
+ SDValue Part0Select = DAG.getNode(ISD::CTSELECT, DL, MVT::i32, Cond, TruePart0, FalsePart0);
+ SDValue Part1Select = DAG.getNode(ISD::CTSELECT, DL, MVT::i32, Cond, TruePart1, FalsePart1);
+ SDValue Part2Select = DAG.getNode(ISD::CTSELECT, DL, MVT::i32, Cond, TruePart2, FalsePart2);
+
+ // Create result stack slot and store the selected parts
+ SDValue ResultSlot = DAG.CreateStackTemporary(MVT::f80);
+ SDValue StorePart0 = DAG.getStore(Chain, DL, Part0Select, ResultSlot,
+ MachinePointerInfo());
+ SDValue ResPart1Ptr = DAG.getMemBasePlusOffset(ResultSlot, TypeSize::getFixed(4), DL);
+ SDValue StorePart1 = DAG.getStore(StorePart0, DL, Part1Select, ResPart1Ptr,
+ MachinePointerInfo());
+ SDValue ResPart2Ptr = DAG.getMemBasePlusOffset(ResultSlot, TypeSize::getFixed(8), DL);
+ SDValue StorePart2 = DAG.getStore(StorePart1, DL, Part2Select, ResPart2Ptr,
+ MachinePointerInfo());
+
+ // Load complete f80 result from memory
+ return DAG.getLoad(MVT::f80, DL, StorePart2, ResultSlot, MachinePointerInfo());
 }
- // For f80 on i386, fall through to generic handling for now
 }
 if (isScalarFPTypeInSSEReg(VT)) {
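Patches 53 through 55 all lower FP ct.select the same way: spill both operands to stack slots, run the integer CTSELECT over 32-bit chunks of the raw bits (one chunk for f32, two for f64, three for the 96-bit f80 slot), and reload the result. A C++ model of the f64 case (a sketch of the idea, not the DAG code itself; for f80 only the chunk count changes):

    #include <cstdint>
    #include <cstring>

    static inline double ct_select_f64(bool cond, double a, double b) {
      uint32_t av[2], bv[2], rv[2];
      std::memcpy(av, &a, sizeof a); // low word first on little-endian x86
      std::memcpy(bv, &b, sizeof b);
      uint32_t mask = -static_cast<uint32_t>(cond);
      for (int i = 0; i < 2; ++i) // one ct-select per 32-bit chunk
        rv[i] = (av[i] & mask) | (bv[i] & ~mask);
      double r;
      std::memcpy(&r, rv, sizeof r);
      return r;
    }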
From e4fcc0fcfbbb14aaa35e36a4fbbe5f0f9f3b579b Mon Sep 17 00:00:00 2001
From: Henrik Brodin <90325907+hbrodin@users.noreply.github.com>
Date: Wed, 24 Sep 2025 18:13:51 +0200
Subject: [PATCH 56/63] [CT] Optimize for when operands are already on the stack - skip FP
---
 .../SelectionDAG/SelectionDAGBuilder.cpp | 3 -
 llvm/lib/Target/X86/X86ISelLowering.cpp | 109 +++++++++++++++++-
 llvm/lib/Target/X86/X86InstrCompiler.td | 9 --
 llvm/lib/Target/X86/X86InstrInfo.cpp | 19 +--
 4 files changed, 111 insertions(+), 29 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index b096953f67fc4..c80fa04cde4e1 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -6834,9 +6834,6 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, return; }
 case Intrinsic::ct_select: {
- // Set function attribute to indicate ct.select usage
- Function &F = DAG.getMachineFunction().getFunction();
- F.addFnAttr("ct-select");
 SDLoc DL = getCurSDLoc();
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index c688894cefa0e..fe4d527a55c04 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -25523,13 +25523,69 @@ SDValue X86TargetLowering::LowerCTSELECT(SDValue Op, SelectionDAG &DAG) const {
 // Handle floating point on i386 without SSE/CMOV (constant-time requirement)
 if (!Subtarget.hasSSE1() && VT.isFloatingPoint() && !VT.isVector()) {
 if (VT == MVT::f32) {
- // Bitcast f32 to i32, use raw condition with ISD::CTSELECT (avoids EFLAGS redundancy)
+ // Optimize: if operands are memory loads, access raw bits directly
+ if (TrueOp.getOpcode() == ISD::LOAD && FalseOp.getOpcode() == ISD::LOAD) {
+ LoadSDNode *TrueLoad = cast<LoadSDNode>(TrueOp.getNode());
+ LoadSDNode *FalseLoad = cast<LoadSDNode>(FalseOp.getNode());
+
+ // Load the same memory addresses as i32 (raw f32 bits)
+ SDValue TrueI32 = DAG.getLoad(MVT::i32, DL, TrueLoad->getChain(),
+ TrueLoad->getBasePtr(), TrueLoad->getPointerInfo());
+ SDValue FalseI32 = DAG.getLoad(MVT::i32, DL, FalseLoad->getChain(),
+ FalseLoad->getBasePtr(), FalseLoad->getPointerInfo());
+
+ // Direct CTSELECT on raw bits
+ SDValue CtSelect = DAG.getNode(ISD::CTSELECT, DL, MVT::i32, Cond, TrueI32, FalseI32);
+
+ // Store result and load back as f32
+ SDValue ResultSlot = DAG.CreateStackTemporary(MVT::f32);
+ SDValue Store = DAG.getStore(DAG.getEntryNode(), DL, CtSelect, ResultSlot,
+ MachinePointerInfo());
+ return DAG.getLoad(MVT::f32, DL, Store, ResultSlot, MachinePointerInfo());
+ }
+
+ // Fallback: bitcast approach for register values
 TrueOp = DAG.getBitcast(MVT::i32, TrueOp);
 FalseOp = DAG.getBitcast(MVT::i32, FalseOp);
 SDValue CtSelect = DAG.getNode(ISD::CTSELECT, DL, MVT::i32, Cond, TrueOp, FalseOp);
 return DAG.getBitcast(VT, CtSelect);
 } else if (VT == MVT::f64) {
- // For f64 on i386, avoid all i64 operations by using memory to split/reassemble
+ // Optimize: if operands are memory loads, access raw bits directly
+ if (TrueOp.getOpcode() == ISD::LOAD && FalseOp.getOpcode() == ISD::LOAD) {
+ LoadSDNode *TrueLoad = cast<LoadSDNode>(TrueOp.getNode());
+ LoadSDNode *FalseLoad = cast<LoadSDNode>(FalseOp.getNode());
+
+ // Load i32 parts directly from memory (lo/hi 32-bit chunks)
+ SDValue TrueLo = DAG.getLoad(MVT::i32, DL, TrueLoad->getChain(),
+ TrueLoad->getBasePtr(), TrueLoad->getPointerInfo());
+ SDValue TrueHiPtr = DAG.getMemBasePlusOffset(TrueLoad->getBasePtr(),
+ TypeSize::getFixed(4), DL);
+ SDValue TrueHi = DAG.getLoad(MVT::i32, DL, TrueLoad->getChain(),
+ TrueHiPtr, TrueLoad->getPointerInfo());
+
+ SDValue FalseLo = DAG.getLoad(MVT::i32, DL, FalseLoad->getChain(),
+ FalseLoad->getBasePtr(), FalseLoad->getPointerInfo());
+ SDValue FalseHiPtr = DAG.getMemBasePlusOffset(FalseLoad->getBasePtr(),
+ TypeSize::getFixed(4), DL);
+ SDValue FalseHi = DAG.getLoad(MVT::i32, DL, FalseLoad->getChain(),
+ FalseHiPtr, FalseLoad->getPointerInfo());
+
+ // Direct CTSELECT on both i32 parts
+ SDValue LoSelect = DAG.getNode(ISD::CTSELECT, DL, MVT::i32, Cond, TrueLo, FalseLo);
+ SDValue HiSelect = DAG.getNode(ISD::CTSELECT, DL, MVT::i32, Cond, TrueHi, FalseHi);
+
+ // Store result parts and load back as f64
+ SDValue ResultSlot = DAG.CreateStackTemporary(MVT::f64);
+ SDValue Chain = DAG.getEntryNode();
+ SDValue StoreResLo = DAG.getStore(Chain, DL, LoSelect, ResultSlot,
+ MachinePointerInfo());
+ SDValue ResHiPtr = DAG.getMemBasePlusOffset(ResultSlot, TypeSize::getFixed(4), DL);
+ SDValue StoreResHi = DAG.getStore(StoreResLo, DL, HiSelect, ResHiPtr,
+ MachinePointerInfo());
+ return DAG.getLoad(MVT::f64, DL, StoreResHi, ResultSlot, MachinePointerInfo());
+ }
+
+ // Fallback: memory-based approach for register values
 // TODO: Consider creating CTSELECT_I386_F64mm pseudo instruction
 // for single bundled 64-bit memory-based post-RA expansion
@@ -25573,7 +25629,54 @@ SDValue X86TargetLowering::LowerCTSELECT(SDValue Op, SelectionDAG &DAG) const {
 // Load complete f64 result from memory
 return DAG.getLoad(MVT::f64, DL, StoreResHi, ResultSlot, MachinePointerInfo());
 } else if (VT == MVT::f80) {
- // For f80 on i386, use memory-based approach with 3×32-bit chunks
+ // Optimize: if operands are memory loads, access raw bits directly
+ if (TrueOp.getOpcode() == ISD::LOAD && FalseOp.getOpcode() == ISD::LOAD) {
+ LoadSDNode *TrueLoad = cast<LoadSDNode>(TrueOp.getNode());
+ LoadSDNode *FalseLoad = cast<LoadSDNode>(FalseOp.getNode());
+
+ // Load i32 parts directly from memory (3 chunks: [0-3], [4-7], [8-11] bytes)
+ SDValue TruePart0 = DAG.getLoad(MVT::i32, DL, TrueLoad->getChain(),
+ TrueLoad->getBasePtr(), TrueLoad->getPointerInfo());
+ SDValue TruePart1Ptr = DAG.getMemBasePlusOffset(TrueLoad->getBasePtr(),
+ TypeSize::getFixed(4), DL);
+ SDValue TruePart1 = DAG.getLoad(MVT::i32, DL, TrueLoad->getChain(),
+ TruePart1Ptr, TrueLoad->getPointerInfo());
+ SDValue TruePart2Ptr = DAG.getMemBasePlusOffset(TrueLoad->getBasePtr(),
+ TypeSize::getFixed(8), DL);
+ SDValue TruePart2 = DAG.getLoad(MVT::i32, DL, TrueLoad->getChain(),
+ TruePart2Ptr, TrueLoad->getPointerInfo());
+
+ SDValue FalsePart0 = DAG.getLoad(MVT::i32, DL, FalseLoad->getChain(),
+ FalseLoad->getBasePtr(), FalseLoad->getPointerInfo());
+ SDValue FalsePart1Ptr = DAG.getMemBasePlusOffset(FalseLoad->getBasePtr(),
+ TypeSize::getFixed(4), DL);
+ SDValue FalsePart1 = DAG.getLoad(MVT::i32, DL, FalseLoad->getChain(),
+ FalsePart1Ptr, FalseLoad->getPointerInfo());
+ SDValue FalsePart2Ptr = DAG.getMemBasePlusOffset(FalseLoad->getBasePtr(),
+ TypeSize::getFixed(8), DL);
+ SDValue FalsePart2 = DAG.getLoad(MVT::i32, DL, FalseLoad->getChain(),
+ FalsePart2Ptr, FalseLoad->getPointerInfo());
+
+ // Direct CTSELECT on all three i32 parts
+ SDValue Part0Select = DAG.getNode(ISD::CTSELECT, DL, MVT::i32, Cond, TruePart0, FalsePart0);
+ SDValue Part1Select = DAG.getNode(ISD::CTSELECT, DL, MVT::i32, Cond, TruePart1, FalsePart1);
+ SDValue Part2Select = DAG.getNode(ISD::CTSELECT, DL, MVT::i32, Cond, TruePart2, FalsePart2);
+
+ // Store result parts and load back as f80
+ SDValue ResultSlot = DAG.CreateStackTemporary(MVT::f80);
+ SDValue Chain = DAG.getEntryNode();
+ SDValue StorePart0 = DAG.getStore(Chain, DL, Part0Select, ResultSlot,
+ MachinePointerInfo());
+ SDValue ResPart1Ptr = DAG.getMemBasePlusOffset(ResultSlot, TypeSize::getFixed(4), DL);
+ SDValue StorePart1 = DAG.getStore(StorePart0, DL, Part1Select, ResPart1Ptr,
+ MachinePointerInfo());
+ SDValue ResPart2Ptr = DAG.getMemBasePlusOffset(ResultSlot, TypeSize::getFixed(8), DL);
+ SDValue StorePart2 = DAG.getStore(StorePart1, DL, Part2Select, ResPart2Ptr,
+ MachinePointerInfo());
+ return DAG.getLoad(MVT::f80, DL, StorePart2, ResultSlot, MachinePointerInfo());
+ }
+
+ // Fallback: memory-based approach for register values
 // f80 is stored as 96 bits (80 bits + 16 padding), handled as 3×i32
 // TODO: Consider creating CTSELECT_I386_F80mm pseudo instruction
 // for single bundled 80-bit memory-based post-RA expansion
diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td b/llvm/lib/Target/X86/X86InstrCompiler.td
index 69a01801c6fb5..a7a2986fa9548 100644
--- a/llvm/lib/Target/X86/X86InstrCompiler.td
+++ b/llvm/lib/Target/X86/X86InstrCompiler.td
@@ -747,9 +747,6 @@ let isCodeGenOnly = 1, hasSideEffects = 1, ForceDisassemble = 1 in {
 defm CTSELECT_I386_INT_GR16 : CTSELECT_I386_INTERNAL;
 defm CTSELECT_I386_INT_GR32 : CTSELECT_I386_INTERNAL;
 }
- let Predicates = [NoCMOV, HasMMX] in {
- defm CTSELECT_I386_VR64 : CTSELECT_I386_VR64;
- }
}
 let usesCustomInserter = 1,
@@ -786,12 +783,6 @@ let Predicates = [NoNativeCMOV] in {
 // i64 patterns handled automatically by type legalization
}
-// Pattern matching for VR64 CTSELECT on i386 without CMOV (routes to post-RA expansion)
-let Predicates = [NoCMOV, Not64BitMode, HasMMX] in {
- def : Pat<(x86mmx (X86ctselect VR64:$src1, VR64:$src2, timm:$cond, EFLAGS)),
- (CTSELECT_I386_VR64rr VR64:$src1, VR64:$src2, timm:$cond)>;
-}
//===----------------------------------------------------------------------===//
// Normal-Instructions-With-Lock-Prefix Pseudo Instructions
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index c1e09a3d22b7b..9335b78b11785 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -1023,15 +1023,6 @@ bool X86InstrInfo::expandCtSelectIntWithoutCMOV(MachineInstr &MI) const {
 // Remove the original pseudo instruction
 MI.eraseFromParent();
- // Bundle all generated instructions for atomic execution
- auto BundleEnd = MI.getIterator();
- if (BundleStart != BundleEnd) {
- // Only bundle if we have multiple instructions
- finalizeBundle(*MBB, BundleStart, BundleEnd);
- }
-
- // Remove the original pseudo instruction
- MI.eraseFromParent();
 return true;
}
@@ -7010,13 +7001,13 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
 return expandCtSelectVector(MI);
 // i386-specific CTSELECT expansion (post-RA, constant-time)
- case X86::CTSELECT_I386_GR16rr:
- case X86::CTSELECT_I386_GR32rr:
- return expandCtSelectI386(MI);
+ //case X86::CTSELECT_I386_GR16rr:
+ //case X86::CTSELECT_I386_GR32rr:
+ // return expandCtSelectI386(MI);
 // VR64-specific CTSELECT expansion (post-RA, constant-time)
- case X86::CTSELECT_I386_VR64rr:
- return expandCtSelectI386VR64(MI);
+ //case X86::CTSELECT_I386_VR64rr:
+ // return expandCtSelectI386VR64(MI);
}
 return false;
}
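The optimization in patch 56 rests on the observation that when an FP operand is already a memory load, its raw bits can be re-read from the same address as an i32, so the value never has to round-trip through the x87 stack before the integer select. A short C++ sketch of that bit access (illustrative only; the actual lowering works on LoadSDNodes):

    #include <cstdint>
    #include <cstring>

    // Reinterpret an in-memory float as its raw 32-bit pattern. The
    // constant-time select then runs on integers only, and the result is
    // materialized as a float again by one final store/load pair.
    static inline uint32_t f32_bits(const float *p) {
      uint32_t bits;
      std::memcpy(&bits, p, sizeof bits); // same address, read as i32
      return bits;
    }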
From c80ffd7078c842dc687f5ca2261b3e0240d751eb Mon Sep 17 00:00:00 2001
From: kumarak
Date: Wed, 8 Oct 2025 13:47:07 +0000
Subject: [PATCH 57/63] [CT] Update handling of floating type and emit pseudo code in custom inserter
---
 llvm/lib/Target/X86/X86ISelLowering.cpp | 411 +++++++++++-------------
 llvm/lib/Target/X86/X86InstrCompiler.td | 31 +-
 2 files changed, 195 insertions(+), 247 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index fe4d527a55c04..8fad801b56b22 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -25520,219 +25520,6 @@ SDValue X86TargetLowering::LowerCTSELECT(SDValue Op, SelectionDAG &DAG) const {
 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), CtSelect);
 }
- // Handle floating point on i386 without SSE/CMOV (constant-time requirement)
- if (!Subtarget.hasSSE1() && VT.isFloatingPoint() && !VT.isVector()) {
- if (VT == MVT::f32) {
- // Optimize: if operands are memory loads, access raw bits directly
- if (TrueOp.getOpcode() == ISD::LOAD && FalseOp.getOpcode() == ISD::LOAD) {
- LoadSDNode *TrueLoad = cast<LoadSDNode>(TrueOp.getNode());
- LoadSDNode *FalseLoad = cast<LoadSDNode>(FalseOp.getNode());
-
- // Load the same memory addresses as i32 (raw f32 bits)
- SDValue TrueI32 = DAG.getLoad(MVT::i32, DL, TrueLoad->getChain(),
- TrueLoad->getBasePtr(), TrueLoad->getPointerInfo());
- SDValue FalseI32 = DAG.getLoad(MVT::i32, DL, FalseLoad->getChain(),
- FalseLoad->getBasePtr(), FalseLoad->getPointerInfo());
-
- // Direct CTSELECT on raw bits
- SDValue CtSelect = DAG.getNode(ISD::CTSELECT, DL, MVT::i32, Cond, TrueI32, FalseI32);
-
- // Store result and load back as f32
- SDValue ResultSlot = DAG.CreateStackTemporary(MVT::f32);
- SDValue Store = DAG.getStore(DAG.getEntryNode(), DL, CtSelect, ResultSlot,
- MachinePointerInfo());
- return DAG.getLoad(MVT::f32, DL, Store, ResultSlot, MachinePointerInfo());
- }
-
- // Fallback: bitcast approach for register values
- TrueOp = DAG.getBitcast(MVT::i32, TrueOp);
- FalseOp = DAG.getBitcast(MVT::i32, FalseOp);
- SDValue CtSelect = DAG.getNode(ISD::CTSELECT, DL, MVT::i32, Cond, TrueOp, FalseOp);
- return DAG.getBitcast(VT, CtSelect);
- } else if (VT == MVT::f64) {
- // Optimize: if operands are memory loads, access raw bits directly
- if (TrueOp.getOpcode() == ISD::LOAD && FalseOp.getOpcode() == ISD::LOAD) {
- LoadSDNode *TrueLoad = cast<LoadSDNode>(TrueOp.getNode());
- LoadSDNode *FalseLoad = cast<LoadSDNode>(FalseOp.getNode());
-
- // Load i32 parts directly from memory (lo/hi 32-bit chunks)
- SDValue TrueLo = DAG.getLoad(MVT::i32, DL, TrueLoad->getChain(),
- TrueLoad->getBasePtr(), TrueLoad->getPointerInfo());
- SDValue TrueHiPtr = DAG.getMemBasePlusOffset(TrueLoad->getBasePtr(),
- TypeSize::getFixed(4), DL);
- SDValue TrueHi = DAG.getLoad(MVT::i32, DL, TrueLoad->getChain(),
- TrueHiPtr, TrueLoad->getPointerInfo());
-
- SDValue FalseLo = DAG.getLoad(MVT::i32, DL, FalseLoad->getChain(),
- FalseLoad->getBasePtr(), FalseLoad->getPointerInfo());
- SDValue FalseHiPtr = DAG.getMemBasePlusOffset(FalseLoad->getBasePtr(),
- TypeSize::getFixed(4), DL);
- SDValue FalseHi = DAG.getLoad(MVT::i32, DL, FalseLoad->getChain(),
- FalseHiPtr, FalseLoad->getPointerInfo());
-
- // Direct CTSELECT on both i32 parts
- SDValue LoSelect = DAG.getNode(ISD::CTSELECT, DL, MVT::i32, Cond, TrueLo, FalseLo);
- SDValue HiSelect = DAG.getNode(ISD::CTSELECT, DL, MVT::i32, Cond, TrueHi, FalseHi);
-
- // Store result parts and load back as f64
- SDValue ResultSlot = DAG.CreateStackTemporary(MVT::f64);
- SDValue Chain = DAG.getEntryNode();
- SDValue StoreResLo = DAG.getStore(Chain, DL, LoSelect, ResultSlot,
- MachinePointerInfo());
- SDValue ResHiPtr = DAG.getMemBasePlusOffset(ResultSlot, TypeSize::getFixed(4), DL);
- SDValue StoreResHi = DAG.getStore(StoreResLo, DL, HiSelect, ResHiPtr,
- MachinePointerInfo());
- return DAG.getLoad(MVT::f64, DL, StoreResHi, ResultSlot, MachinePointerInfo());
- }
-
- // Fallback: memory-based approach for register values
- // TODO: Consider creating CTSELECT_I386_F64mm pseudo instruction
- // for single bundled 64-bit memory-based post-RA expansion
-
- SDValue Chain = DAG.getEntryNode();
-
- // Create temporary stack slots for input f64 values
- SDValue TrueSlot = DAG.CreateStackTemporary(MVT::f64);
- SDValue FalseSlot = DAG.CreateStackTemporary(MVT::f64);
-
- // Store f64 values to memory
- SDValue StoreTrueF64 = DAG.getStore(Chain, DL, TrueOp, TrueSlot,
- MachinePointerInfo());
- SDValue StoreFalseF64 = DAG.getStore(Chain, DL, FalseOp, FalseSlot,
- MachinePointerInfo());
-
- // Load i32 parts from memory (lo at offset 0, hi at offset 4)
- SDValue TrueLo = DAG.getLoad(MVT::i32, DL, StoreTrueF64, TrueSlot,
- MachinePointerInfo());
- SDValue TrueHiPtr = DAG.getMemBasePlusOffset(TrueSlot, TypeSize::getFixed(4), DL);
- SDValue TrueHi = DAG.getLoad(MVT::i32, DL, StoreTrueF64, TrueHiPtr,
- MachinePointerInfo());
-
- SDValue FalseLo = DAG.getLoad(MVT::i32, DL, StoreFalseF64, FalseSlot,
- MachinePointerInfo());
- SDValue FalseHiPtr = DAG.getMemBasePlusOffset(FalseSlot, TypeSize::getFixed(4), DL);
- SDValue FalseHi = DAG.getLoad(MVT::i32, DL, StoreFalseF64, FalseHiPtr,
- MachinePointerInfo());
-
- // Create two i32 CTSELECT operations
- SDValue LoSelect = DAG.getNode(ISD::CTSELECT, DL, MVT::i32, Cond, TrueLo, FalseLo);
- SDValue HiSelect = DAG.getNode(ISD::CTSELECT, DL, MVT::i32, Cond, TrueHi, FalseHi);
-
- // Create result stack slot and store the selected parts
- SDValue ResultSlot = DAG.CreateStackTemporary(MVT::f64);
- SDValue StoreResLo = DAG.getStore(Chain, DL, LoSelect, ResultSlot,
- MachinePointerInfo());
- SDValue ResHiPtr = DAG.getMemBasePlusOffset(ResultSlot, TypeSize::getFixed(4), DL);
- SDValue StoreResHi = DAG.getStore(StoreResLo, DL, HiSelect, ResHiPtr,
- MachinePointerInfo());
-
- // Load complete f64 result from memory
- return DAG.getLoad(MVT::f64, DL, StoreResHi, ResultSlot, MachinePointerInfo());
- } else if (VT == MVT::f80) {
- // Optimize: if operands are memory loads, access raw bits directly
- if (TrueOp.getOpcode() == ISD::LOAD && FalseOp.getOpcode() == ISD::LOAD) {
- LoadSDNode *TrueLoad = cast<LoadSDNode>(TrueOp.getNode());
- LoadSDNode *FalseLoad = cast<LoadSDNode>(FalseOp.getNode());
-
- // Load i32 parts directly from memory (3 chunks: [0-3], [4-7], [8-11] bytes)
- SDValue TruePart0 = DAG.getLoad(MVT::i32, DL, TrueLoad->getChain(),
- TrueLoad->getBasePtr(), TrueLoad->getPointerInfo());
- SDValue TruePart1Ptr = DAG.getMemBasePlusOffset(TrueLoad->getBasePtr(),
- TypeSize::getFixed(4), DL);
- SDValue TruePart1 = DAG.getLoad(MVT::i32, DL, TrueLoad->getChain(),
- TruePart1Ptr, TrueLoad->getPointerInfo());
- SDValue TruePart2Ptr = DAG.getMemBasePlusOffset(TrueLoad->getBasePtr(),
- TypeSize::getFixed(8), DL);
- SDValue TruePart2 = DAG.getLoad(MVT::i32, DL, TrueLoad->getChain(),
- TruePart2Ptr, TrueLoad->getPointerInfo());
-
- SDValue FalsePart0 = DAG.getLoad(MVT::i32, DL, FalseLoad->getChain(),
- FalseLoad->getBasePtr(), FalseLoad->getPointerInfo());
- SDValue FalsePart1Ptr = DAG.getMemBasePlusOffset(FalseLoad->getBasePtr(),
- TypeSize::getFixed(4), DL);
- SDValue FalsePart1 = DAG.getLoad(MVT::i32, DL, FalseLoad->getChain(),
- FalsePart1Ptr, FalseLoad->getPointerInfo());
- SDValue FalsePart2Ptr = DAG.getMemBasePlusOffset(FalseLoad->getBasePtr(),
- TypeSize::getFixed(8), DL);
- SDValue FalsePart2 = DAG.getLoad(MVT::i32, DL, FalseLoad->getChain(),
-
FalsePart2Ptr, FalseLoad->getPointerInfo()); - - // Direct CTSELECT on all three i32 parts - SDValue Part0Select = DAG.getNode(ISD::CTSELECT, DL, MVT::i32, Cond, TruePart0, FalsePart0); - SDValue Part1Select = DAG.getNode(ISD::CTSELECT, DL, MVT::i32, Cond, TruePart1, FalsePart1); - SDValue Part2Select = DAG.getNode(ISD::CTSELECT, DL, MVT::i32, Cond, TruePart2, FalsePart2); - - // Store result parts and load back as f80 - SDValue ResultSlot = DAG.CreateStackTemporary(MVT::f80); - SDValue Chain = DAG.getEntryNode(); - SDValue StorePart0 = DAG.getStore(Chain, DL, Part0Select, ResultSlot, - MachinePointerInfo()); - SDValue ResPart1Ptr = DAG.getMemBasePlusOffset(ResultSlot, TypeSize::getFixed(4), DL); - SDValue StorePart1 = DAG.getStore(StorePart0, DL, Part1Select, ResPart1Ptr, - MachinePointerInfo()); - SDValue ResPart2Ptr = DAG.getMemBasePlusOffset(ResultSlot, TypeSize::getFixed(8), DL); - SDValue StorePart2 = DAG.getStore(StorePart1, DL, Part2Select, ResPart2Ptr, - MachinePointerInfo()); - return DAG.getLoad(MVT::f80, DL, StorePart2, ResultSlot, MachinePointerInfo()); - } - - // Fallback: memory-based approach for register values - // f80 is stored as 96 bits (80 bits + 16 padding), handled as 3×i32 - // TODO: Consider creating CTSELECT_I386_F80mm pseudo instruction - // for single bundled 80-bit memory-based post-RA expansion - - SDValue Chain = DAG.getEntryNode(); - - // Create temporary stack slots for input f80 values - SDValue TrueSlot = DAG.CreateStackTemporary(MVT::f80); - SDValue FalseSlot = DAG.CreateStackTemporary(MVT::f80); - - // Store f80 values to memory - SDValue StoreTrueF80 = DAG.getStore(Chain, DL, TrueOp, TrueSlot, - MachinePointerInfo()); - SDValue StoreFalseF80 = DAG.getStore(Chain, DL, FalseOp, FalseSlot, - MachinePointerInfo()); - - // Load i32 parts from memory (3 chunks: [0-3], [4-7], [8-11] bytes) - SDValue TruePart0 = DAG.getLoad(MVT::i32, DL, StoreTrueF80, TrueSlot, - MachinePointerInfo()); - SDValue TruePart1Ptr = DAG.getMemBasePlusOffset(TrueSlot, TypeSize::getFixed(4), DL); - SDValue TruePart1 = DAG.getLoad(MVT::i32, DL, StoreTrueF80, TruePart1Ptr, - MachinePointerInfo()); - SDValue TruePart2Ptr = DAG.getMemBasePlusOffset(TrueSlot, TypeSize::getFixed(8), DL); - SDValue TruePart2 = DAG.getLoad(MVT::i32, DL, StoreTrueF80, TruePart2Ptr, - MachinePointerInfo()); - - SDValue FalsePart0 = DAG.getLoad(MVT::i32, DL, StoreFalseF80, FalseSlot, - MachinePointerInfo()); - SDValue FalsePart1Ptr = DAG.getMemBasePlusOffset(FalseSlot, TypeSize::getFixed(4), DL); - SDValue FalsePart1 = DAG.getLoad(MVT::i32, DL, StoreFalseF80, FalsePart1Ptr, - MachinePointerInfo()); - SDValue FalsePart2Ptr = DAG.getMemBasePlusOffset(FalseSlot, TypeSize::getFixed(8), DL); - SDValue FalsePart2 = DAG.getLoad(MVT::i32, DL, StoreFalseF80, FalsePart2Ptr, - MachinePointerInfo()); - - // Create three i32 CTSELECT operations - SDValue Part0Select = DAG.getNode(ISD::CTSELECT, DL, MVT::i32, Cond, TruePart0, FalsePart0); - SDValue Part1Select = DAG.getNode(ISD::CTSELECT, DL, MVT::i32, Cond, TruePart1, FalsePart1); - SDValue Part2Select = DAG.getNode(ISD::CTSELECT, DL, MVT::i32, Cond, TruePart2, FalsePart2); - - // Create result stack slot and store the selected parts - SDValue ResultSlot = DAG.CreateStackTemporary(MVT::f80); - SDValue StorePart0 = DAG.getStore(Chain, DL, Part0Select, ResultSlot, - MachinePointerInfo()); - SDValue ResPart1Ptr = DAG.getMemBasePlusOffset(ResultSlot, TypeSize::getFixed(4), DL); - SDValue StorePart1 = DAG.getStore(StorePart0, DL, Part1Select, ResPart1Ptr, - 
MachinePointerInfo());
- SDValue ResPart2Ptr = DAG.getMemBasePlusOffset(ResultSlot, TypeSize::getFixed(8), DL);
- SDValue StorePart2 = DAG.getStore(StorePart1, DL, Part2Select, ResPart2Ptr,
- MachinePointerInfo());
-
- // Load complete f80 result from memory
- return DAG.getLoad(MVT::f80, DL, StorePart2, ResultSlot, MachinePointerInfo());
- }
- }
-
 if (isScalarFPTypeInSSEReg(VT)) {
 MVT IntVT = (VT == MVT::f32) ? MVT::i32 : MVT::i64;
 TrueOp = DAG.getBitcast(IntVT, TrueOp);
@@ -38212,10 +37999,8 @@ X86TargetLowering::emitPatchableEventCall(MachineInstr &MI,
 /// This approach ensures that when i64 is type-legalized into two i32
 /// operations, both operations share the same condition byte rather than
 /// each independently reading (and destroying) EFLAGS.
-static MachineBasicBlock *
-emitCTSelectI386WithConditionMaterialization(MachineInstr &MI,
- MachineBasicBlock *BB,
- unsigned InternalPseudoOpcode) {
+static MachineBasicBlock *emitCTSelectI386WithConditionMaterialization(
+ MachineInstr &MI, MachineBasicBlock *BB, unsigned InternalPseudoOpcode) {
 const TargetInstrInfo *TII = BB->getParent()->getSubtarget().getInstrInfo();
 const MIMetadata MIMD(MI);
 MachineFunction *MF = BB->getParent();
@@ -38239,7 +38024,8 @@ emitCTSelectI386WithConditionMaterialization(MachineInstr &MI,
 // This pseudo will be expanded post-RA into the actual constant-time bundle
 // The condition byte can now be safely shared between multiple pseudos
- // Internal pseudo has operands: (outs dst, tmp_byte, tmp_mask), (ins src1, src2, cond_byte)
+ // Internal pseudo has operands: (outs dst, tmp_byte, tmp_mask), (ins src1,
+ // src2, cond_byte)
 Register DstReg = MI.getOperand(0).getReg();
 // Create virtual registers for the temporary outputs
@@ -38256,14 +38042,182 @@ emitCTSelectI386WithConditionMaterialization(MachineInstr &MI,
 }
 BuildMI(*BB, MI, MIMD, TII->get(InternalPseudoOpcode))
- .addDef(DstReg) // dst (output)
- .addDef(TmpByteReg) // tmp_byte (output)
- .addDef(TmpMaskReg) // tmp_mask (output)
- .addReg(Src1Reg) // src1 (input)
- .addReg(Src2Reg) // src2 (input)
- .addReg(CondByteReg); // pre-materialized condition byte (input)
+ .addDef(DstReg) // dst (output)
+ .addDef(TmpByteReg) // tmp_byte (output)
+ .addDef(TmpMaskReg) // tmp_mask (output)
+ .addReg(Src1Reg) // src1 (input)
+ .addReg(Src2Reg) // src2 (input)
+ .addReg(CondByteReg); // pre-materialized condition byte (input)
+
+ MI.eraseFromParent();
+ return BB;
+}
+
+static MachineBasicBlock *emitCTSelectI386WithFpType(MachineInstr &MI,
+ MachineBasicBlock *BB,
+ unsigned pseudoInstr) {
+ const TargetInstrInfo *TII = BB->getParent()->getSubtarget().getInstrInfo();
+ const MIMetadata MIMD(MI);
+ MachineFunction *MF = BB->getParent();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ MachineFrameInfo &MFI = MF->getFrameInfo();
+ unsigned RegSizeInByte = 4;
+
+ // Get operands
+ // MI operands: %result:rfp80 = CTSELECT_I386 %false:rfp80, %true:rfp80, %cond:i8imm
+ unsigned DestReg = MI.getOperand(0).getReg();
+ unsigned FalseReg = MI.getOperand(1).getReg();
+ unsigned TrueReg = MI.getOperand(2).getReg();
+ X86::CondCode CC = static_cast<X86::CondCode>(MI.getOperand(3).getImm());
+ X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
+
+ // Materialize condition byte from EFLAGS
+ Register CondByteReg = MRI.createVirtualRegister(&X86::GR8RegClass);
+ BuildMI(*BB, MI, MIMD, TII->get(X86::SETCCr), CondByteReg).addImm(OppCC);
+
+ // Create mask from condition: 0x00000000 or 0xFFFFFFFF
+ unsigned MaskReg = MRI.createVirtualRegister(&X86::GR32RegClass);
+ unsigned ExtReg = MRI.createVirtualRegister(&X86::GR32RegClass);
+
+ // Zero-extend i8 condition to i32
+ BuildMI(*BB, MI, MIMD, TII->get(X86::MOVZX32rr8), ExtReg)
+ .addReg(CondByteReg, RegState::Kill);
+
+ // Negate to create mask
+ BuildMI(*BB, MI, MIMD, TII->get(X86::NEG32r), MaskReg)
+ .addReg(ExtReg, RegState::Kill);
+
+ // Create inverted mask
+ unsigned InvMaskReg = MRI.createVirtualRegister(&X86::GR32RegClass);
+ BuildMI(*BB, MI, MIMD, TII->get(X86::NOT32r), InvMaskReg).addReg(MaskReg);
+
+ auto storeFpToSlot = [&](unsigned Opcode, int Slot, Register Reg) {
+ addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(Opcode)), Slot)
+ .addReg(Reg, RegState::Kill);
+ };
+
+ auto emitCtSelect = [&](unsigned NumValues, int TrueSlot, int FalseSlot,
+ int ResultSlot, bool KillMaskRegs) {
+ for (unsigned Val = 0; Val < NumValues; ++Val) {
+ unsigned Offset = Val * RegSizeInByte;
+ unsigned TrueReg = MRI.createVirtualRegister(&X86::GR32RegClass);
+ BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32rm), TrueReg)
+ .addFrameIndex(TrueSlot)
+ .addImm(1)
+ .addReg(0)
+ .addImm(Offset)
+ .addReg(0);
+
+ unsigned FalseReg = MRI.createVirtualRegister(&X86::GR32RegClass);
+ BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32rm), FalseReg)
+ .addFrameIndex(FalseSlot)
+ .addImm(1)
+ .addReg(0)
+ .addImm(Offset)
+ .addReg(0);
+
+ unsigned MaskedTrueReg = MRI.createVirtualRegister(&X86::GR32RegClass);
+ unsigned MaskedFalseReg = MRI.createVirtualRegister(&X86::GR32RegClass);
+ unsigned ResultReg = MRI.createVirtualRegister(&X86::GR32RegClass);
+
+ bool KillMasksNow = KillMaskRegs && Val + 1 == NumValues;
+
+ auto TrueMIB =
+ BuildMI(*BB, MI, MIMD, TII->get(X86::AND32rr), MaskedTrueReg);
+ TrueMIB.addReg(TrueReg, RegState::Kill);
+ if (KillMasksNow)
+ TrueMIB.addReg(MaskReg, RegState::Kill);
+ else
+ TrueMIB.addReg(MaskReg);
+
+ auto FalseMIB =
+ BuildMI(*BB, MI, MIMD, TII->get(X86::AND32rr), MaskedFalseReg);
+ FalseMIB.addReg(FalseReg, RegState::Kill);
+ if (KillMasksNow)
+ FalseMIB.addReg(InvMaskReg, RegState::Kill);
+ else
+ FalseMIB.addReg(InvMaskReg);
+
+ BuildMI(*BB, MI, MIMD, TII->get(X86::OR32rr), ResultReg)
+ .addReg(MaskedTrueReg, RegState::Kill)
+ .addReg(MaskedFalseReg, RegState::Kill);
+
+ BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32mr))
+ .addFrameIndex(ResultSlot)
+ .addImm(1)
+ .addReg(0)
+ .addImm(Offset)
+ .addReg(0)
+ .addReg(ResultReg, RegState::Kill);
+ }
+ };
+
+ switch (pseudoInstr) {
+ case X86::CTSELECT_I386_FP32rr: {
+
+ // Allocate stack slot for result (4 bytes for f32)
+ int ResultSlot = MFI.CreateStackObject(RegSizeInByte, Align(4), false);
+ int TrueSlot = MFI.CreateStackObject(RegSizeInByte, Align(4), false);
+ int FalseSlot = MFI.CreateStackObject(RegSizeInByte, Align(4), false);
+
+ // Store f32 to stack using pseudo instruction (ST_Fp32m will be handled by
+ // FP stackifier)
+ storeFpToSlot(X86::ST_Fp32m, TrueSlot, TrueReg);
+ storeFpToSlot(X86::ST_Fp32m, FalseSlot, FalseReg);
+
+ emitCtSelect(1, TrueSlot, FalseSlot, ResultSlot, true);
+
+ // Load as f32 to x87 stack using pseudo instruction
+ addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::LD_Fp32m), DestReg),
+ ResultSlot);
+ break;
+ }
+ case X86::CTSELECT_I386_FP64rr: {
+ unsigned StackSlotSize = 8;
+ // Allocate stack slots for temporaries (8 bytes for f64)
+ int TrueSlot = MFI.CreateStackObject(StackSlotSize, Align(4), false);
+ int FalseSlot = MFI.CreateStackObject(StackSlotSize, Align(4), false);
+ int ResultSlot = MFI.CreateStackObject(StackSlotSize, Align(4), false);
+
+ // Store x87 values to stack using pseudo instruction
+ // ST_Fp64m will be handled by the FP stackifier
+ storeFpToSlot(X86::ST_Fp64m, TrueSlot, TrueReg);
+ storeFpToSlot(X86::ST_Fp64m, FalseSlot, FalseReg);
+
+ emitCtSelect(StackSlotSize/RegSizeInByte, TrueSlot, FalseSlot, ResultSlot, true);
+
+ // Load final f64 result back to x87 stack using pseudo instruction
+ addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::LD_Fp64m), DestReg),
+ ResultSlot);
+ break;
+ }
+ case X86::CTSELECT_I386_FP80rr: {
+ // Allocate stack slots for temporaries
+ unsigned StackObjectSize = 12;
+ int TrueSlot = MFI.CreateStackObject(
+ StackObjectSize, Align(4), false); // 80-bit = 10 bytes, padded to 12
+ int FalseSlot = MFI.CreateStackObject(StackObjectSize, Align(4), false);
+ int ResultSlot = MFI.CreateStackObject(StackObjectSize, Align(4), false);
+
+ // Store x87 values to stack using pseudo instruction
+ // ST_FpP80m will be handled by the FP stackifier
+ storeFpToSlot(X86::ST_FpP80m, TrueSlot, TrueReg);
+ storeFpToSlot(X86::ST_FpP80m, FalseSlot, FalseReg);
+
+ // Process 3 x i32 parts (bytes 0-3, 4-7, 8-11)
+ emitCtSelect(StackObjectSize/RegSizeInByte, TrueSlot, FalseSlot, ResultSlot, true);
+
+ // Load final f80 result back to x87 stack using pseudo instruction
+ addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::LD_Fp80m), DestReg),
+ ResultSlot);
+ break;
+ }
+ default:
+ llvm_unreachable("Invalid CTSELECT opcode");
+ }
 MI.eraseFromParent();
+
 return BB;
}
@@ -38340,9 +38294,12 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
 return emitCTSelectI386WithConditionMaterialization(
 MI, BB, X86::CTSELECT_I386_INT_GR32rr);
- case X86::CTSELECT_FP32rr:
- case X86::CTSELECT_FP64rr:
- case X86::CTSELECT_FP80rr:
+ case X86::CTSELECT_I386_FP32rr:
+ return emitCTSelectI386WithFpType(MI, BB, X86::CTSELECT_I386_FP32rr);
+ case X86::CTSELECT_I386_FP64rr:
+ return emitCTSelectI386WithFpType(MI, BB, X86::CTSELECT_I386_FP64rr);
+ case X86::CTSELECT_I386_FP80rr:
+ return emitCTSelectI386WithFpType(MI, BB, X86::CTSELECT_I386_FP80rr);
 case X86::CTSELECT_VR64rr:
 return EmitLoweredSelect(
 MI, BB); // TODO: Implement this to generate for Constant time version
diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td b/llvm/lib/Target/X86/X86InstrCompiler.td
index a7a2986fa9548..cf3aaa265db7f 100644
--- a/llvm/lib/Target/X86/X86InstrCompiler.td
+++ b/llvm/lib/Target/X86/X86InstrCompiler.td
@@ -702,10 +702,11 @@ def : Pat<(v8f64 (X86cmov VR512:$t, VR512:$f, timm:$cond, EFLAGS)),
 let isPseudo = 1, isNotDuplicable = 1 in {
 // Phase 1: Initial pseudos that consume EFLAGS (via custom inserter)
 // These are matched by patterns and convert EFLAGS to condition byte
- multiclass CTSELECT_I386_INITIAL<RegisterClass RC> {
+ multiclass CTSELECT_I386_INITIAL<RegisterClass RC, ValueType VT> {
 let Uses = [EFLAGS], Defs = [EFLAGS], usesCustomInserter = 1, hasNoSchedulingInfo = 1 in {
 def rr : PseudoI<(outs RC:$dst),
- (ins RC:$src1, RC:$src2, i8imm:$cond), []>;
+ (ins RC:$src1, RC:$src2, i8imm:$cond),
+ [(set RC:$dst, (VT (X86ctselect RC:$src1, RC:$src2, timm:$cond, EFLAGS)))]>;
 }
 }
@@ -721,22 +722,14 @@ let isPseudo = 1, isNotDuplicable = 1 in {
 }
 }
 }
-
- multiclass CTSELECT_NOCMOV<RegisterClass RC, ValueType VT> {
- let hasNoSchedulingInfo = 1 in {
- def rr : PseudoI<(outs RC:$dst), (ins RC:$src1, RC:$src2, i8imm:$cond),
- [(set RC:$dst, (VT (X86ctselect RC:$src1, RC:$src2,
- timm:$cond, EFLAGS)))]>;
- }
- }
}
// Phase 1 pseudos for non-CMOV targets (custom inserter materializes condition)
let isCodeGenOnly = 1, hasSideEffects = 1, ForceDisassemble = 1 in {
 let Predicates = [NoNativeCMOV] in {
- defm CTSELECT_I386_GR8 : CTSELECT_I386_INITIAL<GR8>;
- defm CTSELECT_I386_GR16 : CTSELECT_I386_INITIAL<GR16>;
- defm CTSELECT_I386_GR32 : CTSELECT_I386_INITIAL<GR32>;
+ defm CTSELECT_I386_GR8 : CTSELECT_I386_INITIAL<GR8, i8>;
+ defm CTSELECT_I386_GR16 : CTSELECT_I386_INITIAL<GR16, i16>;
+ defm CTSELECT_I386_GR32 : CTSELECT_I386_INITIAL<GR32, i32>;
 }
}
@@ -749,22 +742,20 @@ let isCodeGenOnly = 1, hasSideEffects = 1, ForceDisassemble = 1 in {
 }
}
-let usesCustomInserter = 1,
- isCodeGenOnly = 1,
- hasSideEffects = 1,
+let hasSideEffects = 1,
 ForceDisassemble = 1,
 Constraints = "$dst = $src1" in {
 let Predicates = [FPStackf32] in
- defm CTSELECT_FP32 : CTSELECT_NOCMOV<RFP32, f32>;
+ defm CTSELECT_I386_FP32 : CTSELECT_I386_INITIAL<RFP32, f32>;
 let Predicates = [FPStackf64] in
- defm CTSELECT_FP64 : CTSELECT_NOCMOV<RFP64, f64>;
+ defm CTSELECT_I386_FP64 : CTSELECT_I386_INITIAL<RFP64, f64>;
- defm CTSELECT_FP80 : CTSELECT_NOCMOV<RFP80, f80>;
+ defm CTSELECT_I386_FP80 : CTSELECT_I386_INITIAL<RFP80, f80>;
 let Predicates = [HasMMX] in
- defm CTSELECT_VR64 : CTSELECT_NOCMOV<VR64, x86mmx>;
+ defm CTSELECT_VR64 : CTSELECT_I386_INITIAL<VR64, x86mmx>;
}
// Pattern matching for non-native-CMOV CTSELECT (routes to custom inserter for condition materialization)
From a5702d4f6db754efc7dcfd14bbd05cbd6494a984 Mon Sep 17 00:00:00 2001
From: kumarak
Date: Thu, 9 Oct 2025 03:00:24 +0000
Subject: [PATCH 58/63] [CT] Update unit tests for fp types
---
 llvm/test/CodeGen/X86/ctselect-i386-fp.ll | 575 ++++++++++++++-------
 llvm/test/CodeGen/X86/ctselect-i386-mmx.ll | 99 ++--
 2 files changed, 439 insertions(+), 235 deletions(-)
diff --git a/llvm/test/CodeGen/X86/ctselect-i386-fp.ll b/llvm/test/CodeGen/X86/ctselect-i386-fp.ll
index b8daf26158d21..b188c07164716 100644
--- a/llvm/test/CodeGen/X86/ctselect-i386-fp.ll
+++ b/llvm/test/CodeGen/X86/ctselect-i386-fp.ll
@@ -13,35 +13,48 @@ define float @test_ctselect_f32_basic(i1 %cond, float %a, float %b) nounwind {
; I386-NOCMOV-LABEL: test_ctselect_f32_basic:
; I386-NOCMOV: # %bb.0:
-; I386-NOCMOV-NEXT: pushl %ebx
-; I386-NOCMOV-NEXT: pushl %esi
; I386-NOCMOV-NEXT: subl $12, %esp
; I386-NOCMOV-NEXT: flds {{[0-9]+}}(%esp)
; I386-NOCMOV-NEXT: flds {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: xorl %eax, %eax
; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: sete %al
+; I386-NOCMOV-NEXT: negl %eax
+; I386-NOCMOV-NEXT: fxch %st(1)
; I386-NOCMOV-NEXT: fstps {{[0-9]+}}(%esp)
-; I386-NOCMOV-NEXT: fstps {{[0-9]+}}(%esp)
-; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
+; I386-NOCMOV-NEXT: fstps (%esp)
; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; I386-NOCMOV-NEXT: BUNDLE
-; I386-NOCMOV-NEXT: movl %edx, (%esp)
-; I386-NOCMOV-NEXT: flds (%esp)
+; I386-NOCMOV-NEXT: movl (%esp), %edx
+; I386-NOCMOV-NEXT: andl %eax, %ecx
+; I386-NOCMOV-NEXT: notl %eax
+; I386-NOCMOV-NEXT: andl %edx, %eax
+; I386-NOCMOV-NEXT: orl %ecx, %eax
+; I386-NOCMOV-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: flds {{[0-9]+}}(%esp)
; I386-NOCMOV-NEXT: addl $12, %esp
-; I386-NOCMOV-NEXT: popl %esi
-; I386-NOCMOV-NEXT: popl %ebx
; I386-NOCMOV-NEXT: retl
;
; I386-CMOV-LABEL: test_ctselect_f32_basic:
; I386-CMOV: # %bb.0:
+; I386-CMOV-NEXT: subl $12, %esp
; I386-CMOV-NEXT: flds {{[0-9]+}}(%esp)
; I386-CMOV-NEXT: flds {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: xorl %eax, %eax
; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
-; I386-CMOV-NEXT: jne .LBB0_2
-; I386-CMOV-NEXT: # %bb.1:
-; I386-CMOV-NEXT: fstp %st(1)
-; I386-CMOV-NEXT: fldz
-; I386-CMOV-NEXT: .LBB0_2:
-; I386-CMOV-NEXT: fstp %st(0)
+; I386-CMOV-NEXT: sete %al
+; I386-CMOV-NEXT: negl %eax
+; I386-CMOV-NEXT: fxch %st(1)
+; I386-CMOV-NEXT: fstps {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: fstps (%esp)
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT: movl
(%esp), %edx +; I386-CMOV-NEXT: andl %eax, %ecx +; I386-CMOV-NEXT: notl %eax +; I386-CMOV-NEXT: andl %edx, %eax +; I386-CMOV-NEXT: orl %ecx, %eax +; I386-CMOV-NEXT: movl %eax, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: flds {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: addl $12, %esp ; I386-CMOV-NEXT: retl %result = call float @llvm.ct.select.f32(i1 %cond, float %a, float %b) ret float %result @@ -51,8 +64,6 @@ define float @test_ctselect_f32_basic(i1 %cond, float %a, float %b) nounwind { define float @test_ctselect_f32_eq(float %x, float %y, float %a, float %b) nounwind { ; I386-NOCMOV-LABEL: test_ctselect_f32_eq: ; I386-NOCMOV: # %bb.0: -; I386-NOCMOV-NEXT: pushl %ebx -; I386-NOCMOV-NEXT: pushl %esi ; I386-NOCMOV-NEXT: subl $12, %esp ; I386-NOCMOV-NEXT: flds {{[0-9]+}}(%esp) ; I386-NOCMOV-NEXT: flds {{[0-9]+}}(%esp) @@ -62,38 +73,53 @@ define float @test_ctselect_f32_eq(float %x, float %y, float %a, float %b) nounw ; I386-NOCMOV-NEXT: fnstsw %ax ; I386-NOCMOV-NEXT: # kill: def $ah killed $ah killed $ax ; I386-NOCMOV-NEXT: sahf -; I386-NOCMOV-NEXT: setnp %al -; I386-NOCMOV-NEXT: sete %cl -; I386-NOCMOV-NEXT: testb %al, %cl -; I386-NOCMOV-NEXT: fstps {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: setnp %cl +; I386-NOCMOV-NEXT: sete %dl +; I386-NOCMOV-NEXT: xorl %eax, %eax +; I386-NOCMOV-NEXT: testb %cl, %dl +; I386-NOCMOV-NEXT: sete %al +; I386-NOCMOV-NEXT: negl %eax +; I386-NOCMOV-NEXT: fxch %st(1) ; I386-NOCMOV-NEXT: fstps {{[0-9]+}}(%esp) -; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; I386-NOCMOV-NEXT: fstps (%esp) ; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx -; I386-NOCMOV-NEXT: BUNDLE -; I386-NOCMOV-NEXT: movl %edx, (%esp) -; I386-NOCMOV-NEXT: flds (%esp) +; I386-NOCMOV-NEXT: movl (%esp), %edx +; I386-NOCMOV-NEXT: andl %eax, %ecx +; I386-NOCMOV-NEXT: notl %eax +; I386-NOCMOV-NEXT: andl %edx, %eax +; I386-NOCMOV-NEXT: orl %ecx, %eax +; I386-NOCMOV-NEXT: movl %eax, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: flds {{[0-9]+}}(%esp) ; I386-NOCMOV-NEXT: addl $12, %esp -; I386-NOCMOV-NEXT: popl %esi -; I386-NOCMOV-NEXT: popl %ebx ; I386-NOCMOV-NEXT: retl ; ; I386-CMOV-LABEL: test_ctselect_f32_eq: ; I386-CMOV: # %bb.0: +; I386-CMOV-NEXT: subl $12, %esp ; I386-CMOV-NEXT: flds {{[0-9]+}}(%esp) ; I386-CMOV-NEXT: flds {{[0-9]+}}(%esp) ; I386-CMOV-NEXT: flds {{[0-9]+}}(%esp) ; I386-CMOV-NEXT: flds {{[0-9]+}}(%esp) ; I386-CMOV-NEXT: fucompi %st(1), %st ; I386-CMOV-NEXT: fstp %st(0) -; I386-CMOV-NEXT: setnp %al -; I386-CMOV-NEXT: sete %cl -; I386-CMOV-NEXT: testb %al, %cl -; I386-CMOV-NEXT: jne .LBB1_2 -; I386-CMOV-NEXT: # %bb.1: -; I386-CMOV-NEXT: fstp %st(1) -; I386-CMOV-NEXT: fldz -; I386-CMOV-NEXT: .LBB1_2: -; I386-CMOV-NEXT: fstp %st(0) +; I386-CMOV-NEXT: setnp %cl +; I386-CMOV-NEXT: sete %dl +; I386-CMOV-NEXT: xorl %eax, %eax +; I386-CMOV-NEXT: testb %cl, %dl +; I386-CMOV-NEXT: sete %al +; I386-CMOV-NEXT: negl %eax +; I386-CMOV-NEXT: fxch %st(1) +; I386-CMOV-NEXT: fstps {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: fstps (%esp) +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-CMOV-NEXT: movl (%esp), %edx +; I386-CMOV-NEXT: andl %eax, %ecx +; I386-CMOV-NEXT: notl %eax +; I386-CMOV-NEXT: andl %edx, %eax +; I386-CMOV-NEXT: orl %ecx, %eax +; I386-CMOV-NEXT: movl %eax, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: flds {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: addl $12, %esp ; I386-CMOV-NEXT: retl %cmp = fcmp oeq float %x, %y %result = call float @llvm.ct.select.f32(i1 %cmp, float %a, float %b) @@ -104,39 +130,66 @@ define float @test_ctselect_f32_eq(float %x, float %y, float %a, float %b) nounw define double @test_ctselect_f64_basic(i1 
%cond, double %a, double %b) nounwind { ; I386-NOCMOV-LABEL: test_ctselect_f64_basic: ; I386-NOCMOV: # %bb.0: -; I386-NOCMOV-NEXT: pushl %ebx ; I386-NOCMOV-NEXT: pushl %esi -; I386-NOCMOV-NEXT: subl $28, %esp +; I386-NOCMOV-NEXT: subl $24, %esp ; I386-NOCMOV-NEXT: fldl {{[0-9]+}}(%esp) ; I386-NOCMOV-NEXT: fldl {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: xorl %eax, %eax ; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: sete %al +; I386-NOCMOV-NEXT: negl %eax +; I386-NOCMOV-NEXT: movl %eax, %ecx +; I386-NOCMOV-NEXT: notl %ecx +; I386-NOCMOV-NEXT: fxch %st(1) ; I386-NOCMOV-NEXT: fstpl {{[0-9]+}}(%esp) ; I386-NOCMOV-NEXT: fstpl {{[0-9]+}}(%esp) -; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax -; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx -; I386-NOCMOV-NEXT: BUNDLE -; I386-NOCMOV-NEXT: movl %edx, (%esp) -; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax -; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx -; I386-NOCMOV-NEXT: BUNDLE -; I386-NOCMOV-NEXT: movl %edx, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %esi +; I386-NOCMOV-NEXT: andl %eax, %edx +; I386-NOCMOV-NEXT: andl %ecx, %esi +; I386-NOCMOV-NEXT: orl %edx, %esi +; I386-NOCMOV-NEXT: movl %esi, (%esp) +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %esi +; I386-NOCMOV-NEXT: andl %eax, %edx +; I386-NOCMOV-NEXT: andl %ecx, %esi +; I386-NOCMOV-NEXT: orl %edx, %esi +; I386-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp) ; I386-NOCMOV-NEXT: fldl (%esp) -; I386-NOCMOV-NEXT: addl $28, %esp +; I386-NOCMOV-NEXT: addl $24, %esp ; I386-NOCMOV-NEXT: popl %esi -; I386-NOCMOV-NEXT: popl %ebx ; I386-NOCMOV-NEXT: retl ; ; I386-CMOV-LABEL: test_ctselect_f64_basic: ; I386-CMOV: # %bb.0: +; I386-CMOV-NEXT: pushl %esi +; I386-CMOV-NEXT: subl $24, %esp ; I386-CMOV-NEXT: fldl {{[0-9]+}}(%esp) ; I386-CMOV-NEXT: fldl {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: xorl %eax, %eax ; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp) -; I386-CMOV-NEXT: jne .LBB2_2 -; I386-CMOV-NEXT: # %bb.1: -; I386-CMOV-NEXT: fstp %st(1) -; I386-CMOV-NEXT: fldz -; I386-CMOV-NEXT: .LBB2_2: -; I386-CMOV-NEXT: fstp %st(0) +; I386-CMOV-NEXT: sete %al +; I386-CMOV-NEXT: negl %eax +; I386-CMOV-NEXT: movl %eax, %ecx +; I386-CMOV-NEXT: notl %ecx +; I386-CMOV-NEXT: fxch %st(1) +; I386-CMOV-NEXT: fstpl {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: fstpl {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %esi +; I386-CMOV-NEXT: andl %eax, %edx +; I386-CMOV-NEXT: andl %ecx, %esi +; I386-CMOV-NEXT: orl %edx, %esi +; I386-CMOV-NEXT: movl %esi, (%esp) +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %esi +; I386-CMOV-NEXT: andl %eax, %edx +; I386-CMOV-NEXT: andl %ecx, %esi +; I386-CMOV-NEXT: orl %edx, %esi +; I386-CMOV-NEXT: movl %esi, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: fldl (%esp) +; I386-CMOV-NEXT: addl $24, %esp +; I386-CMOV-NEXT: popl %esi ; I386-CMOV-NEXT: retl %result = call double @llvm.ct.select.f64(i1 %cond, double %a, double %b) ret double %result @@ -146,43 +199,78 @@ define double @test_ctselect_f64_basic(i1 %cond, double %a, double %b) nounwind define x86_fp80 @test_ctselect_f80_basic(i1 %cond, x86_fp80 %a, x86_fp80 %b) nounwind { ; I386-NOCMOV-LABEL: test_ctselect_f80_basic: ; I386-NOCMOV: # %bb.0: -; I386-NOCMOV-NEXT: pushl %ebx ; I386-NOCMOV-NEXT: pushl %esi -; I386-NOCMOV-NEXT: subl $44, %esp +; I386-NOCMOV-NEXT: subl $36, %esp ; I386-NOCMOV-NEXT: fldt {{[0-9]+}}(%esp) ; I386-NOCMOV-NEXT: fldt 
{{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: xorl %eax, %eax ; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: sete %al +; I386-NOCMOV-NEXT: negl %eax +; I386-NOCMOV-NEXT: movl %eax, %ecx +; I386-NOCMOV-NEXT: notl %ecx +; I386-NOCMOV-NEXT: fxch %st(1) ; I386-NOCMOV-NEXT: fstpt {{[0-9]+}}(%esp) ; I386-NOCMOV-NEXT: fstpt {{[0-9]+}}(%esp) -; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax -; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx -; I386-NOCMOV-NEXT: BUNDLE -; I386-NOCMOV-NEXT: movl %edx, (%esp) -; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax -; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx -; I386-NOCMOV-NEXT: BUNDLE -; I386-NOCMOV-NEXT: movl %edx, {{[0-9]+}}(%esp) -; I386-NOCMOV-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; I386-NOCMOV-NEXT: movzwl {{[0-9]+}}(%esp), %ecx -; I386-NOCMOV-NEXT: BUNDLE -; I386-NOCMOV-NEXT: movw %dx, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %esi +; I386-NOCMOV-NEXT: andl %eax, %edx +; I386-NOCMOV-NEXT: andl %ecx, %esi +; I386-NOCMOV-NEXT: orl %edx, %esi +; I386-NOCMOV-NEXT: movl %esi, (%esp) +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %esi +; I386-NOCMOV-NEXT: andl %eax, %edx +; I386-NOCMOV-NEXT: andl %ecx, %esi +; I386-NOCMOV-NEXT: orl %edx, %esi +; I386-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %esi +; I386-NOCMOV-NEXT: andl %eax, %edx +; I386-NOCMOV-NEXT: andl %ecx, %esi +; I386-NOCMOV-NEXT: orl %edx, %esi +; I386-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp) ; I386-NOCMOV-NEXT: fldt (%esp) -; I386-NOCMOV-NEXT: addl $44, %esp +; I386-NOCMOV-NEXT: addl $36, %esp ; I386-NOCMOV-NEXT: popl %esi -; I386-NOCMOV-NEXT: popl %ebx ; I386-NOCMOV-NEXT: retl ; ; I386-CMOV-LABEL: test_ctselect_f80_basic: ; I386-CMOV: # %bb.0: +; I386-CMOV-NEXT: pushl %esi +; I386-CMOV-NEXT: subl $36, %esp ; I386-CMOV-NEXT: fldt {{[0-9]+}}(%esp) ; I386-CMOV-NEXT: fldt {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: xorl %eax, %eax ; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp) -; I386-CMOV-NEXT: jne .LBB3_2 -; I386-CMOV-NEXT: # %bb.1: -; I386-CMOV-NEXT: fstp %st(1) -; I386-CMOV-NEXT: fldz -; I386-CMOV-NEXT: .LBB3_2: -; I386-CMOV-NEXT: fstp %st(0) +; I386-CMOV-NEXT: sete %al +; I386-CMOV-NEXT: negl %eax +; I386-CMOV-NEXT: movl %eax, %ecx +; I386-CMOV-NEXT: notl %ecx +; I386-CMOV-NEXT: fxch %st(1) +; I386-CMOV-NEXT: fstpt {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: fstpt {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %esi +; I386-CMOV-NEXT: andl %eax, %edx +; I386-CMOV-NEXT: andl %ecx, %esi +; I386-CMOV-NEXT: orl %edx, %esi +; I386-CMOV-NEXT: movl %esi, (%esp) +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %esi +; I386-CMOV-NEXT: andl %eax, %edx +; I386-CMOV-NEXT: andl %ecx, %esi +; I386-CMOV-NEXT: orl %edx, %esi +; I386-CMOV-NEXT: movl %esi, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %esi +; I386-CMOV-NEXT: andl %eax, %edx +; I386-CMOV-NEXT: andl %ecx, %esi +; I386-CMOV-NEXT: orl %edx, %esi +; I386-CMOV-NEXT: movl %esi, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: fldt (%esp) +; I386-CMOV-NEXT: addl $36, %esp +; I386-CMOV-NEXT: popl %esi ; I386-CMOV-NEXT: retl %result = call x86_fp80 @llvm.ct.select.f80(i1 %cond, x86_fp80 %a, x86_fp80 %b) ret x86_fp80 %result @@ -192,8 +280,6 @@ define x86_fp80 @test_ctselect_f80_basic(i1 %cond, x86_fp80 %a, x86_fp80 
%b) nou define float @test_ctselect_f32_gt(float %x, float %y, float %a, float %b) nounwind { ; I386-NOCMOV-LABEL: test_ctselect_f32_gt: ; I386-NOCMOV: # %bb.0: -; I386-NOCMOV-NEXT: pushl %ebx -; I386-NOCMOV-NEXT: pushl %esi ; I386-NOCMOV-NEXT: subl $12, %esp ; I386-NOCMOV-NEXT: flds {{[0-9]+}}(%esp) ; I386-NOCMOV-NEXT: flds {{[0-9]+}}(%esp) @@ -203,36 +289,51 @@ define float @test_ctselect_f32_gt(float %x, float %y, float %a, float %b) nounw ; I386-NOCMOV-NEXT: fnstsw %ax ; I386-NOCMOV-NEXT: # kill: def $ah killed $ah killed $ax ; I386-NOCMOV-NEXT: sahf -; I386-NOCMOV-NEXT: seta %al -; I386-NOCMOV-NEXT: testb %al, %al -; I386-NOCMOV-NEXT: fstps {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: seta %cl +; I386-NOCMOV-NEXT: xorl %eax, %eax +; I386-NOCMOV-NEXT: testb %cl, %cl +; I386-NOCMOV-NEXT: sete %al +; I386-NOCMOV-NEXT: negl %eax +; I386-NOCMOV-NEXT: fxch %st(1) ; I386-NOCMOV-NEXT: fstps {{[0-9]+}}(%esp) -; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; I386-NOCMOV-NEXT: fstps (%esp) ; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx -; I386-NOCMOV-NEXT: BUNDLE -; I386-NOCMOV-NEXT: movl %edx, (%esp) -; I386-NOCMOV-NEXT: flds (%esp) +; I386-NOCMOV-NEXT: movl (%esp), %edx +; I386-NOCMOV-NEXT: andl %eax, %ecx +; I386-NOCMOV-NEXT: notl %eax +; I386-NOCMOV-NEXT: andl %edx, %eax +; I386-NOCMOV-NEXT: orl %ecx, %eax +; I386-NOCMOV-NEXT: movl %eax, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: flds {{[0-9]+}}(%esp) ; I386-NOCMOV-NEXT: addl $12, %esp -; I386-NOCMOV-NEXT: popl %esi -; I386-NOCMOV-NEXT: popl %ebx ; I386-NOCMOV-NEXT: retl ; ; I386-CMOV-LABEL: test_ctselect_f32_gt: ; I386-CMOV: # %bb.0: +; I386-CMOV-NEXT: subl $12, %esp ; I386-CMOV-NEXT: flds {{[0-9]+}}(%esp) ; I386-CMOV-NEXT: flds {{[0-9]+}}(%esp) ; I386-CMOV-NEXT: flds {{[0-9]+}}(%esp) ; I386-CMOV-NEXT: flds {{[0-9]+}}(%esp) ; I386-CMOV-NEXT: fucompi %st(1), %st ; I386-CMOV-NEXT: fstp %st(0) -; I386-CMOV-NEXT: seta %al -; I386-CMOV-NEXT: testb %al, %al -; I386-CMOV-NEXT: jne .LBB4_2 -; I386-CMOV-NEXT: # %bb.1: -; I386-CMOV-NEXT: fstp %st(1) -; I386-CMOV-NEXT: fldz -; I386-CMOV-NEXT: .LBB4_2: -; I386-CMOV-NEXT: fstp %st(0) +; I386-CMOV-NEXT: seta %cl +; I386-CMOV-NEXT: xorl %eax, %eax +; I386-CMOV-NEXT: testb %cl, %cl +; I386-CMOV-NEXT: sete %al +; I386-CMOV-NEXT: negl %eax +; I386-CMOV-NEXT: fxch %st(1) +; I386-CMOV-NEXT: fstps {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: fstps (%esp) +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-CMOV-NEXT: movl (%esp), %edx +; I386-CMOV-NEXT: andl %eax, %ecx +; I386-CMOV-NEXT: notl %eax +; I386-CMOV-NEXT: andl %edx, %eax +; I386-CMOV-NEXT: orl %ecx, %eax +; I386-CMOV-NEXT: movl %eax, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: flds {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: addl $12, %esp ; I386-CMOV-NEXT: retl %cmp = fcmp ogt float %x, %y %result = call float @llvm.ct.select.f32(i1 %cmp, float %a, float %b) @@ -243,35 +344,48 @@ define float @test_ctselect_f32_gt(float %x, float %y, float %a, float %b) nounw define float @test_ctselect_f32_no_branches(i1 %cond, float %a, float %b) nounwind { ; I386-NOCMOV-LABEL: test_ctselect_f32_no_branches: ; I386-NOCMOV: # %bb.0: -; I386-NOCMOV-NEXT: pushl %ebx -; I386-NOCMOV-NEXT: pushl %esi ; I386-NOCMOV-NEXT: subl $12, %esp ; I386-NOCMOV-NEXT: flds {{[0-9]+}}(%esp) ; I386-NOCMOV-NEXT: flds {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: xorl %eax, %eax ; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: sete %al +; I386-NOCMOV-NEXT: negl %eax +; I386-NOCMOV-NEXT: fxch %st(1) ; I386-NOCMOV-NEXT: fstps {{[0-9]+}}(%esp) -; I386-NOCMOV-NEXT: fstps {{[0-9]+}}(%esp) -; I386-NOCMOV-NEXT: 
movl {{[0-9]+}}(%esp), %eax +; I386-NOCMOV-NEXT: fstps (%esp) ; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx -; I386-NOCMOV-NEXT: BUNDLE -; I386-NOCMOV-NEXT: movl %edx, (%esp) -; I386-NOCMOV-NEXT: flds (%esp) +; I386-NOCMOV-NEXT: movl (%esp), %edx +; I386-NOCMOV-NEXT: andl %eax, %ecx +; I386-NOCMOV-NEXT: notl %eax +; I386-NOCMOV-NEXT: andl %edx, %eax +; I386-NOCMOV-NEXT: orl %ecx, %eax +; I386-NOCMOV-NEXT: movl %eax, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: flds {{[0-9]+}}(%esp) ; I386-NOCMOV-NEXT: addl $12, %esp -; I386-NOCMOV-NEXT: popl %esi -; I386-NOCMOV-NEXT: popl %ebx ; I386-NOCMOV-NEXT: retl ; ; I386-CMOV-LABEL: test_ctselect_f32_no_branches: ; I386-CMOV: # %bb.0: +; I386-CMOV-NEXT: subl $12, %esp ; I386-CMOV-NEXT: flds {{[0-9]+}}(%esp) ; I386-CMOV-NEXT: flds {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: xorl %eax, %eax ; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp) -; I386-CMOV-NEXT: jne .LBB5_2 -; I386-CMOV-NEXT: # %bb.1: -; I386-CMOV-NEXT: fstp %st(1) -; I386-CMOV-NEXT: fldz -; I386-CMOV-NEXT: .LBB5_2: -; I386-CMOV-NEXT: fstp %st(0) +; I386-CMOV-NEXT: sete %al +; I386-CMOV-NEXT: negl %eax +; I386-CMOV-NEXT: fxch %st(1) +; I386-CMOV-NEXT: fstps {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: fstps (%esp) +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-CMOV-NEXT: movl (%esp), %edx +; I386-CMOV-NEXT: andl %eax, %ecx +; I386-CMOV-NEXT: notl %eax +; I386-CMOV-NEXT: andl %edx, %eax +; I386-CMOV-NEXT: orl %ecx, %eax +; I386-CMOV-NEXT: movl %eax, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: flds {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: addl $12, %esp ; I386-CMOV-NEXT: retl %result = call float @llvm.ct.select.f32(i1 %cond, float %a, float %b) ret float %result @@ -281,35 +395,48 @@ define float @test_ctselect_f32_no_branches(i1 %cond, float %a, float %b) nounwi define float @test_ctselect_f32_bundled(i1 %cond, float %a, float %b) nounwind { ; I386-NOCMOV-LABEL: test_ctselect_f32_bundled: ; I386-NOCMOV: # %bb.0: -; I386-NOCMOV-NEXT: pushl %ebx -; I386-NOCMOV-NEXT: pushl %esi ; I386-NOCMOV-NEXT: subl $12, %esp ; I386-NOCMOV-NEXT: flds {{[0-9]+}}(%esp) ; I386-NOCMOV-NEXT: flds {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: xorl %eax, %eax ; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: sete %al +; I386-NOCMOV-NEXT: negl %eax +; I386-NOCMOV-NEXT: fxch %st(1) ; I386-NOCMOV-NEXT: fstps {{[0-9]+}}(%esp) -; I386-NOCMOV-NEXT: fstps {{[0-9]+}}(%esp) -; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; I386-NOCMOV-NEXT: fstps (%esp) ; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx -; I386-NOCMOV-NEXT: BUNDLE -; I386-NOCMOV-NEXT: movl %edx, (%esp) -; I386-NOCMOV-NEXT: flds (%esp) +; I386-NOCMOV-NEXT: movl (%esp), %edx +; I386-NOCMOV-NEXT: andl %eax, %ecx +; I386-NOCMOV-NEXT: notl %eax +; I386-NOCMOV-NEXT: andl %edx, %eax +; I386-NOCMOV-NEXT: orl %ecx, %eax +; I386-NOCMOV-NEXT: movl %eax, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: flds {{[0-9]+}}(%esp) ; I386-NOCMOV-NEXT: addl $12, %esp -; I386-NOCMOV-NEXT: popl %esi -; I386-NOCMOV-NEXT: popl %ebx ; I386-NOCMOV-NEXT: retl ; ; I386-CMOV-LABEL: test_ctselect_f32_bundled: ; I386-CMOV: # %bb.0: +; I386-CMOV-NEXT: subl $12, %esp ; I386-CMOV-NEXT: flds {{[0-9]+}}(%esp) ; I386-CMOV-NEXT: flds {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: xorl %eax, %eax ; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp) -; I386-CMOV-NEXT: jne .LBB6_2 -; I386-CMOV-NEXT: # %bb.1: -; I386-CMOV-NEXT: fstp %st(1) -; I386-CMOV-NEXT: fldz -; I386-CMOV-NEXT: .LBB6_2: -; I386-CMOV-NEXT: fstp %st(0) +; I386-CMOV-NEXT: sete %al +; I386-CMOV-NEXT: negl %eax +; I386-CMOV-NEXT: fxch %st(1) +; I386-CMOV-NEXT: fstps {{[0-9]+}}(%esp) 
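+; The mask/blend run checked below is the core of ct.select lowering on
+; i386: once `negl` widens the `sete` result into an all-zeros/all-ones
+; mask, the AND/NOT/AND/OR sequence picks one operand without a branch.
+; Roughly, as a C sketch (names illustrative, not from this patch):
+;   uint32_t m = -(uint32_t)(cond == 0); /* all-ones when cond is false */
+;   res_bits   = (b_bits & m) | (a_bits & ~m);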
+; I386-CMOV-NEXT: fstps (%esp) +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-CMOV-NEXT: movl (%esp), %edx +; I386-CMOV-NEXT: andl %eax, %ecx +; I386-CMOV-NEXT: notl %eax +; I386-CMOV-NEXT: andl %edx, %eax +; I386-CMOV-NEXT: orl %ecx, %eax +; I386-CMOV-NEXT: movl %eax, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: flds {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: addl $12, %esp ; I386-CMOV-NEXT: retl %result = call float @llvm.ct.select.f32(i1 %cond, float %a, float %b) ret float %result @@ -319,35 +446,48 @@ define float @test_ctselect_f32_bundled(i1 %cond, float %a, float %b) nounwind { define float @test_ctselect_f32_nan(i1 %cond) nounwind { ; I386-NOCMOV-LABEL: test_ctselect_f32_nan: ; I386-NOCMOV: # %bb.0: -; I386-NOCMOV-NEXT: pushl %ebx -; I386-NOCMOV-NEXT: pushl %esi ; I386-NOCMOV-NEXT: subl $12, %esp +; I386-NOCMOV-NEXT: xorl %eax, %eax ; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) ; I386-NOCMOV-NEXT: flds {{\.?LCPI[0-9]+_[0-9]+}} ; I386-NOCMOV-NEXT: fldz +; I386-NOCMOV-NEXT: sete %al +; I386-NOCMOV-NEXT: negl %eax +; I386-NOCMOV-NEXT: fxch %st(1) ; I386-NOCMOV-NEXT: fstps {{[0-9]+}}(%esp) -; I386-NOCMOV-NEXT: fstps {{[0-9]+}}(%esp) -; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; I386-NOCMOV-NEXT: fstps (%esp) ; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx -; I386-NOCMOV-NEXT: BUNDLE -; I386-NOCMOV-NEXT: movl %edx, (%esp) -; I386-NOCMOV-NEXT: flds (%esp) +; I386-NOCMOV-NEXT: movl (%esp), %edx +; I386-NOCMOV-NEXT: andl %eax, %ecx +; I386-NOCMOV-NEXT: notl %eax +; I386-NOCMOV-NEXT: andl %edx, %eax +; I386-NOCMOV-NEXT: orl %ecx, %eax +; I386-NOCMOV-NEXT: movl %eax, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: flds {{[0-9]+}}(%esp) ; I386-NOCMOV-NEXT: addl $12, %esp -; I386-NOCMOV-NEXT: popl %esi -; I386-NOCMOV-NEXT: popl %ebx ; I386-NOCMOV-NEXT: retl ; ; I386-CMOV-LABEL: test_ctselect_f32_nan: ; I386-CMOV: # %bb.0: +; I386-CMOV-NEXT: subl $12, %esp +; I386-CMOV-NEXT: xorl %eax, %eax ; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp) ; I386-CMOV-NEXT: flds {{\.?LCPI[0-9]+_[0-9]+}} ; I386-CMOV-NEXT: fldz -; I386-CMOV-NEXT: jne .LBB7_2 -; I386-CMOV-NEXT: # %bb.1: -; I386-CMOV-NEXT: fstp %st(1) -; I386-CMOV-NEXT: fldz -; I386-CMOV-NEXT: .LBB7_2: -; I386-CMOV-NEXT: fstp %st(0) +; I386-CMOV-NEXT: sete %al +; I386-CMOV-NEXT: negl %eax +; I386-CMOV-NEXT: fxch %st(1) +; I386-CMOV-NEXT: fstps {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: fstps (%esp) +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-CMOV-NEXT: movl (%esp), %edx +; I386-CMOV-NEXT: andl %eax, %ecx +; I386-CMOV-NEXT: notl %eax +; I386-CMOV-NEXT: andl %edx, %eax +; I386-CMOV-NEXT: orl %ecx, %eax +; I386-CMOV-NEXT: movl %eax, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: flds {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: addl $12, %esp ; I386-CMOV-NEXT: retl %nan = bitcast i32 2139095040 to float ; 0x7F800000 = +inf %zero = bitcast i32 0 to float @@ -359,43 +499,78 @@ define float @test_ctselect_f32_nan(i1 %cond) nounwind { define x86_fp80 @test_ctselect_f80_alignment(i1 %cond, x86_fp80 %a, x86_fp80 %b) nounwind { ; I386-NOCMOV-LABEL: test_ctselect_f80_alignment: ; I386-NOCMOV: # %bb.0: -; I386-NOCMOV-NEXT: pushl %ebx ; I386-NOCMOV-NEXT: pushl %esi -; I386-NOCMOV-NEXT: subl $44, %esp +; I386-NOCMOV-NEXT: subl $36, %esp ; I386-NOCMOV-NEXT: fldt {{[0-9]+}}(%esp) ; I386-NOCMOV-NEXT: fldt {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: xorl %eax, %eax ; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: sete %al +; I386-NOCMOV-NEXT: negl %eax +; I386-NOCMOV-NEXT: movl %eax, %ecx +; I386-NOCMOV-NEXT: notl %ecx +; I386-NOCMOV-NEXT: fxch %st(1) ; I386-NOCMOV-NEXT: fstpt 
{{[0-9]+}}(%esp) ; I386-NOCMOV-NEXT: fstpt {{[0-9]+}}(%esp) -; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax -; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx -; I386-NOCMOV-NEXT: BUNDLE -; I386-NOCMOV-NEXT: movl %edx, (%esp) -; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax -; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx -; I386-NOCMOV-NEXT: BUNDLE -; I386-NOCMOV-NEXT: movl %edx, {{[0-9]+}}(%esp) -; I386-NOCMOV-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; I386-NOCMOV-NEXT: movzwl {{[0-9]+}}(%esp), %ecx -; I386-NOCMOV-NEXT: BUNDLE -; I386-NOCMOV-NEXT: movw %dx, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %esi +; I386-NOCMOV-NEXT: andl %eax, %edx +; I386-NOCMOV-NEXT: andl %ecx, %esi +; I386-NOCMOV-NEXT: orl %edx, %esi +; I386-NOCMOV-NEXT: movl %esi, (%esp) +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %esi +; I386-NOCMOV-NEXT: andl %eax, %edx +; I386-NOCMOV-NEXT: andl %ecx, %esi +; I386-NOCMOV-NEXT: orl %edx, %esi +; I386-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %esi +; I386-NOCMOV-NEXT: andl %eax, %edx +; I386-NOCMOV-NEXT: andl %ecx, %esi +; I386-NOCMOV-NEXT: orl %edx, %esi +; I386-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp) ; I386-NOCMOV-NEXT: fldt (%esp) -; I386-NOCMOV-NEXT: addl $44, %esp +; I386-NOCMOV-NEXT: addl $36, %esp ; I386-NOCMOV-NEXT: popl %esi -; I386-NOCMOV-NEXT: popl %ebx ; I386-NOCMOV-NEXT: retl ; ; I386-CMOV-LABEL: test_ctselect_f80_alignment: ; I386-CMOV: # %bb.0: +; I386-CMOV-NEXT: pushl %esi +; I386-CMOV-NEXT: subl $36, %esp ; I386-CMOV-NEXT: fldt {{[0-9]+}}(%esp) ; I386-CMOV-NEXT: fldt {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: xorl %eax, %eax ; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp) -; I386-CMOV-NEXT: jne .LBB8_2 -; I386-CMOV-NEXT: # %bb.1: -; I386-CMOV-NEXT: fstp %st(1) -; I386-CMOV-NEXT: fldz -; I386-CMOV-NEXT: .LBB8_2: -; I386-CMOV-NEXT: fstp %st(0) +; I386-CMOV-NEXT: sete %al +; I386-CMOV-NEXT: negl %eax +; I386-CMOV-NEXT: movl %eax, %ecx +; I386-CMOV-NEXT: notl %ecx +; I386-CMOV-NEXT: fxch %st(1) +; I386-CMOV-NEXT: fstpt {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: fstpt {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %esi +; I386-CMOV-NEXT: andl %eax, %edx +; I386-CMOV-NEXT: andl %ecx, %esi +; I386-CMOV-NEXT: orl %edx, %esi +; I386-CMOV-NEXT: movl %esi, (%esp) +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %esi +; I386-CMOV-NEXT: andl %eax, %edx +; I386-CMOV-NEXT: andl %ecx, %esi +; I386-CMOV-NEXT: orl %edx, %esi +; I386-CMOV-NEXT: movl %esi, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %esi +; I386-CMOV-NEXT: andl %eax, %edx +; I386-CMOV-NEXT: andl %ecx, %esi +; I386-CMOV-NEXT: orl %edx, %esi +; I386-CMOV-NEXT: movl %esi, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: fldt (%esp) +; I386-CMOV-NEXT: addl $36, %esp +; I386-CMOV-NEXT: popl %esi ; I386-CMOV-NEXT: retl %result = call x86_fp80 @llvm.ct.select.f80(i1 %cond, x86_fp80 %a, x86_fp80 %b) ret x86_fp80 %result @@ -405,54 +580,78 @@ define x86_fp80 @test_ctselect_f80_alignment(i1 %cond, x86_fp80 %a, x86_fp80 %b) define float @test_ctselect_f32_multiple(i1 %cond1, i1 %cond2, float %a, float %b, float %c, float %d) nounwind { ; I386-NOCMOV-LABEL: test_ctselect_f32_multiple: ; I386-NOCMOV: # %bb.0: -; I386-NOCMOV-NEXT: pushl %ebx -; I386-NOCMOV-NEXT: pushl %esi ; I386-NOCMOV-NEXT: 
subl $24, %esp ; I386-NOCMOV-NEXT: flds {{[0-9]+}}(%esp) ; I386-NOCMOV-NEXT: flds {{[0-9]+}}(%esp) ; I386-NOCMOV-NEXT: flds {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: xorl %eax, %eax ; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: sete %al +; I386-NOCMOV-NEXT: negl %eax +; I386-NOCMOV-NEXT: fxch %st(1) ; I386-NOCMOV-NEXT: fstps {{[0-9]+}}(%esp) ; I386-NOCMOV-NEXT: fstps {{[0-9]+}}(%esp) -; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax ; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx -; I386-NOCMOV-NEXT: BUNDLE -; I386-NOCMOV-NEXT: movl %edx, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-NOCMOV-NEXT: andl %eax, %ecx +; I386-NOCMOV-NEXT: notl %eax +; I386-NOCMOV-NEXT: andl %edx, %eax +; I386-NOCMOV-NEXT: orl %ecx, %eax +; I386-NOCMOV-NEXT: movl %eax, {{[0-9]+}}(%esp) ; I386-NOCMOV-NEXT: flds {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: xorl %eax, %eax ; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) -; I386-NOCMOV-NEXT: fxch %st(1) +; I386-NOCMOV-NEXT: sete %al +; I386-NOCMOV-NEXT: negl %eax ; I386-NOCMOV-NEXT: fstps {{[0-9]+}}(%esp) -; I386-NOCMOV-NEXT: fstps {{[0-9]+}}(%esp) -; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; I386-NOCMOV-NEXT: fstps (%esp) ; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx -; I386-NOCMOV-NEXT: BUNDLE -; I386-NOCMOV-NEXT: movl %edx, (%esp) -; I386-NOCMOV-NEXT: flds (%esp) +; I386-NOCMOV-NEXT: movl (%esp), %edx +; I386-NOCMOV-NEXT: andl %eax, %ecx +; I386-NOCMOV-NEXT: notl %eax +; I386-NOCMOV-NEXT: andl %edx, %eax +; I386-NOCMOV-NEXT: orl %ecx, %eax +; I386-NOCMOV-NEXT: movl %eax, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: flds {{[0-9]+}}(%esp) ; I386-NOCMOV-NEXT: addl $24, %esp -; I386-NOCMOV-NEXT: popl %esi -; I386-NOCMOV-NEXT: popl %ebx ; I386-NOCMOV-NEXT: retl ; ; I386-CMOV-LABEL: test_ctselect_f32_multiple: ; I386-CMOV: # %bb.0: +; I386-CMOV-NEXT: subl $24, %esp ; I386-CMOV-NEXT: flds {{[0-9]+}}(%esp) ; I386-CMOV-NEXT: flds {{[0-9]+}}(%esp) ; I386-CMOV-NEXT: flds {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: xorl %eax, %eax ; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp) -; I386-CMOV-NEXT: jne .LBB9_2 -; I386-CMOV-NEXT: # %bb.1: -; I386-CMOV-NEXT: fstp %st(1) -; I386-CMOV-NEXT: fldz -; I386-CMOV-NEXT: .LBB9_2: -; I386-CMOV-NEXT: fstp %st(0) -; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp) -; I386-CMOV-NEXT: jne .LBB9_4 -; I386-CMOV-NEXT: # %bb.3: -; I386-CMOV-NEXT: fstp %st(0) -; I386-CMOV-NEXT: fldz +; I386-CMOV-NEXT: sete %al +; I386-CMOV-NEXT: negl %eax ; I386-CMOV-NEXT: fxch %st(1) -; I386-CMOV-NEXT: .LBB9_4: -; I386-CMOV-NEXT: fstp %st(1) +; I386-CMOV-NEXT: fstps {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: fstps {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-CMOV-NEXT: andl %eax, %ecx +; I386-CMOV-NEXT: notl %eax +; I386-CMOV-NEXT: andl %edx, %eax +; I386-CMOV-NEXT: orl %ecx, %eax +; I386-CMOV-NEXT: movl %eax, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: flds {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: xorl %eax, %eax +; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: sete %al +; I386-CMOV-NEXT: negl %eax +; I386-CMOV-NEXT: fstps {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: fstps (%esp) +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-CMOV-NEXT: movl (%esp), %edx +; I386-CMOV-NEXT: andl %eax, %ecx +; I386-CMOV-NEXT: notl %eax +; I386-CMOV-NEXT: andl %edx, %eax +; I386-CMOV-NEXT: orl %ecx, %eax +; I386-CMOV-NEXT: movl %eax, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: flds {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: addl $24, %esp ; I386-CMOV-NEXT: retl %sel1 = call float @llvm.ct.select.f32(i1 
%cond1, float %a, float %b) %sel2 = call float @llvm.ct.select.f32(i1 %cond2, float %sel1, float %c) diff --git a/llvm/test/CodeGen/X86/ctselect-i386-mmx.ll b/llvm/test/CodeGen/X86/ctselect-i386-mmx.ll index f410560b7a78f..2cb67ba9c29b5 100644 --- a/llvm/test/CodeGen/X86/ctselect-i386-mmx.ll +++ b/llvm/test/CodeGen/X86/ctselect-i386-mmx.ll @@ -32,7 +32,8 @@ define <1 x i64> @test_mmx_ctselect_with_paddd(i32 %cond, i64 %a, i64 %b) { ; I386-NOCMOV-NEXT: setne %bl ; I386-NOCMOV-NEXT: testb %bl, %bl ; I386-NOCMOV-NEXT: sete %bl -; I386-NOCMOV-NEXT: movzbl %bl, %ebp +; I386-NOCMOV-NEXT: movb %bl, %bh +; I386-NOCMOV-NEXT: movzbl %bh, %ebp ; I386-NOCMOV-NEXT: negl %ebp ; I386-NOCMOV-NEXT: movl %esi, %edi ; I386-NOCMOV-NEXT: andl %ebp, %edi @@ -40,8 +41,8 @@ define <1 x i64> @test_mmx_ctselect_with_paddd(i32 %cond, i64 %a, i64 %b) { ; I386-NOCMOV-NEXT: andl %ecx, %ebp ; I386-NOCMOV-NEXT: orl %ebp, %edi ; I386-NOCMOV-NEXT: movl %edi, {{[0-9]+}}(%esp) -; I386-NOCMOV-NEXT: sete %bl -; I386-NOCMOV-NEXT: movzbl %bl, %esi +; I386-NOCMOV-NEXT: movb %bl, %bh +; I386-NOCMOV-NEXT: movzbl %bh, %esi ; I386-NOCMOV-NEXT: negl %esi ; I386-NOCMOV-NEXT: movl %edx, %ecx ; I386-NOCMOV-NEXT: andl %esi, %ecx @@ -121,7 +122,8 @@ define <1 x i64> @test_mmx_ctselect_with_psllw(i32 %cond, i64 %a, i64 %b) { ; I386-NOCMOV-NEXT: setne %bl ; I386-NOCMOV-NEXT: testb %bl, %bl ; I386-NOCMOV-NEXT: sete %bl -; I386-NOCMOV-NEXT: movzbl %bl, %ebp +; I386-NOCMOV-NEXT: movb %bl, %bh +; I386-NOCMOV-NEXT: movzbl %bh, %ebp ; I386-NOCMOV-NEXT: negl %ebp ; I386-NOCMOV-NEXT: movl %esi, %edi ; I386-NOCMOV-NEXT: andl %ebp, %edi @@ -129,8 +131,8 @@ define <1 x i64> @test_mmx_ctselect_with_psllw(i32 %cond, i64 %a, i64 %b) { ; I386-NOCMOV-NEXT: andl %ecx, %ebp ; I386-NOCMOV-NEXT: orl %ebp, %edi ; I386-NOCMOV-NEXT: movl %edi, {{[0-9]+}}(%esp) -; I386-NOCMOV-NEXT: sete %bl -; I386-NOCMOV-NEXT: movzbl %bl, %esi +; I386-NOCMOV-NEXT: movb %bl, %bh +; I386-NOCMOV-NEXT: movzbl %bh, %esi ; I386-NOCMOV-NEXT: negl %esi ; I386-NOCMOV-NEXT: movl %edx, %ecx ; I386-NOCMOV-NEXT: andl %esi, %ecx @@ -202,52 +204,54 @@ define <1 x i64> @test_mmx_nested_ctselect_with_pand(i32 %cond1, i32 %cond2, i64 ; I386-NOCMOV-NEXT: .cfi_offset %edi, -16 ; I386-NOCMOV-NEXT: .cfi_offset %ebx, -12 ; I386-NOCMOV-NEXT: .cfi_offset %ebp, -8 +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax ; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx ; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %esi -; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edi -; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ebx ; I386-NOCMOV-NEXT: cmpl $0, {{[0-9]+}}(%esp) -; I386-NOCMOV-NEXT: setne %cl -; I386-NOCMOV-NEXT: cmpl $0, {{[0-9]+}}(%esp) -; I386-NOCMOV-NEXT: setne %ah -; I386-NOCMOV-NEXT: testb %ah, %ah -; I386-NOCMOV-NEXT: sete %ch -; I386-NOCMOV-NEXT: movzbl %ch, %ebp +; I386-NOCMOV-NEXT: setne %bl +; I386-NOCMOV-NEXT: testb %bl, %bl +; I386-NOCMOV-NEXT: sete %bl +; I386-NOCMOV-NEXT: movb %bl, %bh +; I386-NOCMOV-NEXT: movzbl %bh, %ebp ; I386-NOCMOV-NEXT: negl %ebp -; I386-NOCMOV-NEXT: movl %edi, %eax -; I386-NOCMOV-NEXT: andl %ebp, %eax +; I386-NOCMOV-NEXT: movl %edx, %edi +; I386-NOCMOV-NEXT: andl %ebp, %edi ; I386-NOCMOV-NEXT: notl %ebp -; I386-NOCMOV-NEXT: andl %edx, %ebp -; I386-NOCMOV-NEXT: orl %ebp, %eax -; I386-NOCMOV-NEXT: sete %ch -; I386-NOCMOV-NEXT: movzbl %ch, %edi -; I386-NOCMOV-NEXT: negl %edi -; I386-NOCMOV-NEXT: movl %ebx, %edx -; I386-NOCMOV-NEXT: andl %edi, %edx -; I386-NOCMOV-NEXT: notl %edi -; I386-NOCMOV-NEXT: andl %esi, %edi -; I386-NOCMOV-NEXT: orl %edi, %edx +; I386-NOCMOV-NEXT: andl %eax, %ebp 
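+; (annotation: each 64-bit MMX operand is processed as two 32-bit halves,
+; and each half repeats the same blend, res = (t & m) | (f & ~m), so
+; neither half ever branches on the condition)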
+; I386-NOCMOV-NEXT: orl %ebp, %edi +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; I386-NOCMOV-NEXT: movb %bl, %bh +; I386-NOCMOV-NEXT: movzbl %bh, %ecx +; I386-NOCMOV-NEXT: negl %ecx +; I386-NOCMOV-NEXT: movl %esi, %ebp +; I386-NOCMOV-NEXT: andl %ecx, %ebp +; I386-NOCMOV-NEXT: notl %ecx +; I386-NOCMOV-NEXT: andl %eax, %ecx +; I386-NOCMOV-NEXT: orl %ecx, %ebp +; I386-NOCMOV-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: setne %cl ; I386-NOCMOV-NEXT: testb %cl, %cl -; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edi -; I386-NOCMOV-NEXT: sete %bl -; I386-NOCMOV-NEXT: movzbl %bl, %esi -; I386-NOCMOV-NEXT: negl %esi -; I386-NOCMOV-NEXT: movl %edi, %ecx -; I386-NOCMOV-NEXT: andl %esi, %ecx -; I386-NOCMOV-NEXT: notl %esi -; I386-NOCMOV-NEXT: andl %edx, %esi -; I386-NOCMOV-NEXT: orl %esi, %ecx -; I386-NOCMOV-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edi -; I386-NOCMOV-NEXT: sete %dl -; I386-NOCMOV-NEXT: movzbl %dl, %esi +; I386-NOCMOV-NEXT: sete %cl +; I386-NOCMOV-NEXT: movb %cl, %ch +; I386-NOCMOV-NEXT: movzbl %ch, %ebx +; I386-NOCMOV-NEXT: negl %ebx +; I386-NOCMOV-NEXT: movl %edx, %esi +; I386-NOCMOV-NEXT: andl %ebx, %esi +; I386-NOCMOV-NEXT: notl %ebx +; I386-NOCMOV-NEXT: andl %ebp, %ebx +; I386-NOCMOV-NEXT: orl %ebx, %esi +; I386-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; I386-NOCMOV-NEXT: movb %cl, %ch +; I386-NOCMOV-NEXT: movzbl %ch, %esi ; I386-NOCMOV-NEXT: negl %esi -; I386-NOCMOV-NEXT: movl %edi, %ecx -; I386-NOCMOV-NEXT: andl %esi, %ecx +; I386-NOCMOV-NEXT: movl %eax, %edx +; I386-NOCMOV-NEXT: andl %esi, %edx ; I386-NOCMOV-NEXT: notl %esi -; I386-NOCMOV-NEXT: andl %eax, %esi -; I386-NOCMOV-NEXT: orl %esi, %ecx -; I386-NOCMOV-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: andl %edi, %esi +; I386-NOCMOV-NEXT: orl %esi, %edx +; I386-NOCMOV-NEXT: movl %edx, {{[0-9]+}}(%esp) ; I386-NOCMOV-NEXT: movq {{[0-9]+}}(%esp), %mm0 ; I386-NOCMOV-NEXT: pand %mm0, %mm0 ; I386-NOCMOV-NEXT: movq %mm0, (%esp) @@ -340,7 +344,8 @@ define <1 x i64> @test_mmx_ctselect_with_por(i32 %cond, i64 %a, i64 %b) { ; I386-NOCMOV-NEXT: setne %bl ; I386-NOCMOV-NEXT: testb %bl, %bl ; I386-NOCMOV-NEXT: sete %bl -; I386-NOCMOV-NEXT: movzbl %bl, %ebp +; I386-NOCMOV-NEXT: movb %bl, %bh +; I386-NOCMOV-NEXT: movzbl %bh, %ebp ; I386-NOCMOV-NEXT: negl %ebp ; I386-NOCMOV-NEXT: movl %esi, %edi ; I386-NOCMOV-NEXT: andl %ebp, %edi @@ -348,8 +353,8 @@ define <1 x i64> @test_mmx_ctselect_with_por(i32 %cond, i64 %a, i64 %b) { ; I386-NOCMOV-NEXT: andl %ecx, %ebp ; I386-NOCMOV-NEXT: orl %ebp, %edi ; I386-NOCMOV-NEXT: movl %edi, {{[0-9]+}}(%esp) -; I386-NOCMOV-NEXT: sete %bl -; I386-NOCMOV-NEXT: movzbl %bl, %esi +; I386-NOCMOV-NEXT: movb %bl, %bh +; I386-NOCMOV-NEXT: movzbl %bh, %esi ; I386-NOCMOV-NEXT: negl %esi ; I386-NOCMOV-NEXT: movl %edx, %ecx ; I386-NOCMOV-NEXT: andl %esi, %ecx From dc9b327af736bb3eac9faca1cd420bba147a16db Mon Sep 17 00:00:00 2001 From: kumarak Date: Thu, 9 Oct 2025 03:43:12 +0000 Subject: [PATCH 59/63] [CT] fix mismatch of register type for CTSELECT_I386_GR16 pseudo instruction --- .../SelectionDAG/SelectionDAGBuilder.cpp | 1 - llvm/lib/Target/X86/X86ISelLowering.cpp | 129 ++-- llvm/lib/Target/X86/X86InstrCompiler.td | 3 - llvm/lib/Target/X86/X86InstrInfo.cpp | 9 - llvm/lib/Target/X86/X86InstrInfo.h | 6 - llvm/test/CodeGen/X86/ctselect-i386-fp.ll | 576 +++++++++++------- llvm/test/CodeGen/X86/ctselect-i386.ll | 48 +- 7 files changed, 440 insertions(+), 332 
deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index c80fa04cde4e1..90d9ac76b6e57 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -6834,7 +6834,6 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, return; } case Intrinsic::ct_select: { - SDLoc DL = getCurSDLoc(); SDValue Cond = getValue(I.getArgOperand(0)); // i1 diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 8fad801b56b22..c375676dafdc7 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -38033,11 +38033,13 @@ static MachineBasicBlock *emitCTSelectI386WithConditionMaterialization( Register TmpMaskReg; // Determine the register class for tmp_mask based on the data type - if (InternalPseudoOpcode == X86::CTSELECT_I386_INT_GR8rr || - InternalPseudoOpcode == X86::CTSELECT_I386_INT_GR16rr || - InternalPseudoOpcode == X86::CTSELECT_I386_INT_GR32rr) { + if (InternalPseudoOpcode == X86::CTSELECT_I386_INT_GR8rr) + TmpMaskReg = MRI.createVirtualRegister(&X86::GR8RegClass); + else if (InternalPseudoOpcode == X86::CTSELECT_I386_INT_GR16rr) + TmpMaskReg = MRI.createVirtualRegister(&X86::GR16RegClass); + else if (InternalPseudoOpcode == X86::CTSELECT_I386_INT_GR32rr) TmpMaskReg = MRI.createVirtualRegister(&X86::GR32RegClass); - } else { + else { llvm_unreachable("Unknown internal pseudo opcode"); } @@ -38075,139 +38077,109 @@ static MachineBasicBlock *emitCTSelectI386WithFpType(MachineInstr &MI, Register CondByteReg = MRI.createVirtualRegister(&X86::GR8RegClass); BuildMI(*BB, MI, MIMD, TII->get(X86::SETCCr), CondByteReg).addImm(OppCC); - // Create mask from condition: 0x00000000 or 0xFFFFFFFF - unsigned MaskReg = MRI.createVirtualRegister(&X86::GR32RegClass); - unsigned ExtReg = MRI.createVirtualRegister(&X86::GR32RegClass); - - // Zero-extend i8 condition to i32 - BuildMI(*BB, MI, MIMD, TII->get(X86::MOVZX32rr8), ExtReg) - .addReg(CondByteReg, RegState::Kill); - - // Negate to create mask - BuildMI(*BB, MI, MIMD, TII->get(X86::NEG32r), MaskReg) - .addReg(ExtReg, RegState::Kill); - - // Create inverted mask - unsigned InvMaskReg = MRI.createVirtualRegister(&X86::GR32RegClass); - BuildMI(*BB, MI, MIMD, TII->get(X86::NOT32r), InvMaskReg).addReg(MaskReg); - auto storeFpToSlot = [&](unsigned Opcode, int Slot, Register Reg) { addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(Opcode)), Slot) .addReg(Reg, RegState::Kill); }; - auto emitCtSelect = [&](unsigned NumValues, int TrueSlot, int FalseSlot, - int ResultSlot, bool KillMaskRegs) { + auto emitCtSelectWithPseudo = [&](unsigned NumValues, int TrueSlot, int FalseSlot, int ResultSlot) { for (unsigned Val = 0; Val < NumValues; ++Val) { unsigned Offset = Val * RegSizeInByte; - unsigned TrueReg = MRI.createVirtualRegister(&X86::GR32RegClass); - BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32rm), TrueReg) + + // Load true and false values from stack as 32-bit integers + unsigned TrueIntReg = MRI.createVirtualRegister(&X86::GR32RegClass); + BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32rm), TrueIntReg) .addFrameIndex(TrueSlot) .addImm(1) .addReg(0) .addImm(Offset) .addReg(0); - unsigned FalseReg = MRI.createVirtualRegister(&X86::GR32RegClass); - BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32rm), FalseReg) + unsigned FalseIntReg = MRI.createVirtualRegister(&X86::GR32RegClass); + BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32rm), FalseIntReg) 
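+          // The five operands below form the standard x86 memory reference
+          // for a frame-index load: base (the frame index), scale (1),
+          // index (none), displacement (Offset) and segment (none).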
.addFrameIndex(FalseSlot) .addImm(1) .addReg(0) .addImm(Offset) .addReg(0); - unsigned MaskedTrueReg = MRI.createVirtualRegister(&X86::GR32RegClass); - unsigned MaskedFalseReg = MRI.createVirtualRegister(&X86::GR32RegClass); - unsigned ResultReg = MRI.createVirtualRegister(&X86::GR32RegClass); - - bool KillMasksNow = KillMaskRegs && Val + 1 == NumValues; - - auto TrueMIB = - BuildMI(*BB, MI, MIMD, TII->get(X86::AND32rr), MaskedTrueReg); - TrueMIB.addReg(TrueReg, RegState::Kill); - if (KillMasksNow) - TrueMIB.addReg(MaskReg, RegState::Kill); - else - TrueMIB.addReg(MaskReg); - - auto FalseMIB = - BuildMI(*BB, MI, MIMD, TII->get(X86::AND32rr), MaskedFalseReg); - FalseMIB.addReg(FalseReg, RegState::Kill); - if (KillMasksNow) - FalseMIB.addReg(InvMaskReg, RegState::Kill); - else - FalseMIB.addReg(InvMaskReg); - - BuildMI(*BB, MI, MIMD, TII->get(X86::OR32rr), ResultReg) - .addReg(MaskedTrueReg, RegState::Kill) - .addReg(MaskedFalseReg, RegState::Kill); - + // Use CTSELECT_I386_INT_GR32 pseudo instruction for constant-time selection + unsigned ResultIntReg = MRI.createVirtualRegister(&X86::GR32RegClass); + unsigned TmpByteReg = MRI.createVirtualRegister(&X86::GR8RegClass); + unsigned TmpMaskReg = MRI.createVirtualRegister(&X86::GR32RegClass); + + BuildMI(*BB, MI, MIMD, TII->get(X86::CTSELECT_I386_INT_GR32rr)) + .addDef(ResultIntReg) // dst (output) + .addDef(TmpByteReg) // tmp_byte (output) + .addDef(TmpMaskReg) // tmp_mask (output) + .addReg(FalseIntReg) // src1 (input) - false value + .addReg(TrueIntReg) // src2 (input) - true value + .addReg(CondByteReg); // pre-materialized condition byte (input) + + // Store result back to result slot BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32mr)) .addFrameIndex(ResultSlot) .addImm(1) .addReg(0) .addImm(Offset) .addReg(0) - .addReg(ResultReg, RegState::Kill); + .addReg(ResultIntReg, RegState::Kill); } }; switch (pseudoInstr) { case X86::CTSELECT_I386_FP32rr: { - - // Allocate stack slot for result (4 bytes for f32) + // Allocate stack slots (4 bytes for f32) int ResultSlot = MFI.CreateStackObject(RegSizeInByte, Align(4), false); int TrueSlot = MFI.CreateStackObject(RegSizeInByte, Align(4), false); int FalseSlot = MFI.CreateStackObject(RegSizeInByte, Align(4), false); - // Store f32 to stack using pseudo instruction (ST_Fp32m will be handled by - // FP stackifier) + // Store f32 values to stack storeFpToSlot(X86::ST_Fp32m, TrueSlot, TrueReg); storeFpToSlot(X86::ST_Fp32m, FalseSlot, FalseReg); - emitCtSelect(1, TrueSlot, FalseSlot, ResultSlot, true); + // Use pseudo instruction for selection (1 x 32-bit value) + emitCtSelectWithPseudo(1, TrueSlot, FalseSlot, ResultSlot); - // Load as f32 to x87 stack using pseudo instruction + // Load result back as f32 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::LD_Fp32m), DestReg), ResultSlot); break; } case X86::CTSELECT_I386_FP64rr: { unsigned StackSlotSize = 8; - // Allocate stack slots for temporaries (8 bytes for f64) + // Allocate stack slots (8 bytes for f64) int TrueSlot = MFI.CreateStackObject(StackSlotSize, Align(4), false); int FalseSlot = MFI.CreateStackObject(StackSlotSize, Align(4), false); int ResultSlot = MFI.CreateStackObject(StackSlotSize, Align(4), false); - // Store x87 values to stack using pseudo instruction - // ST_Fp64m will be handled by the FP stackifier + // Store f64 values to stack storeFpToSlot(X86::ST_Fp64m, TrueSlot, TrueReg); storeFpToSlot(X86::ST_Fp64m, FalseSlot, FalseReg); - emitCtSelect(StackSlotSize/RegSizeInByte, TrueSlot, FalseSlot, ResultSlot, true); + // Use pseudo 
instruction for selection (2 x 32-bit values) + emitCtSelectWithPseudo(StackSlotSize/RegSizeInByte, TrueSlot, FalseSlot, ResultSlot); - // Load final f64 result back to x87 stack using pseudo instruction + // Load result back as f64 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::LD_Fp64m), DestReg), ResultSlot); break; } case X86::CTSELECT_I386_FP80rr: { - // Allocate stack slots for temporaries - unsigned StackObjctSize = 12; - int TrueSlot = MFI.CreateStackObject( - StackObjctSize, Align(4), false); // 80-bit = 10 bytes, aligned to 12 - int FalseSlot = MFI.CreateStackObject(StackObjctSize, Align(4), false); - int ResultSlot = MFI.CreateStackObject(StackObjctSize, Align(4), false); - - // Store x87 values to stack using pseudo instruction - // ST_FpP80m will be handled by the FP stackifier + // Allocate stack slots (12 bytes for f80 - 80-bit = 10 bytes, aligned to 12) + unsigned StackObjectSize = 12; + int TrueSlot = MFI.CreateStackObject(StackObjectSize, Align(4), false); + int FalseSlot = MFI.CreateStackObject(StackObjectSize, Align(4), false); + int ResultSlot = MFI.CreateStackObject(StackObjectSize, Align(4), false); + + // Store f80 values to stack storeFpToSlot(X86::ST_FpP80m, TrueSlot, TrueReg); storeFpToSlot(X86::ST_FpP80m, FalseSlot, FalseReg); - // Process 3 x i32 parts (bytes 0-3, 4-7, 8-11) - emitCtSelect(StackObjctSize/RegSizeInByte, TrueSlot, FalseSlot, ResultSlot, true); + // Use pseudo instruction for selection (3 x 32-bit values) + emitCtSelectWithPseudo(StackObjectSize/RegSizeInByte, TrueSlot, FalseSlot, ResultSlot); - // Load final f80 result back to x87 stack using pseudo instruction + // Load result back as f80 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::LD_Fp80m), DestReg), ResultSlot); break; @@ -38300,10 +38272,7 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, return emitCTSelectI386WithFpType(MI, BB, X86::CTSELECT_I386_FP64rr); case X86::CTSELECT_I386_FP80rr: return emitCTSelectI386WithFpType(MI, BB, X86::CTSELECT_I386_FP80rr); - case X86::CTSELECT_VR64rr: - return EmitLoweredSelect( - MI, BB); // TODO: Implement this to generate for Constant time version - + case X86::FP80_ADDr: case X86::FP80_ADDm32: { // Change the floating point control register to use double extended diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td b/llvm/lib/Target/X86/X86InstrCompiler.td index cf3aaa265db7f..b4d2993be72f4 100644 --- a/llvm/lib/Target/X86/X86InstrCompiler.td +++ b/llvm/lib/Target/X86/X86InstrCompiler.td @@ -753,9 +753,6 @@ let hasSideEffects = 1, defm CTSELECT_I386_FP64 : CTSELECT_I386_INITIAL; defm CTSELECT_I386_FP80 : CTSELECT_I386_INITIAL; - - let Predicates = [HasMMX] in - defm CTSELECT_VR64 : CTSELECT_I386_INITIAL; } // Pattern matching for non-native-CMOV CTSELECT (routes to custom inserter for condition materialization) diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp index 9335b78b11785..119e8f00d2f86 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -6999,15 +6999,6 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { case X86::CTSELECT_V4F64: case X86::CTSELECT_V8F32: return expandCtSelectVector(MI); - - // i386-specific CTSELECT expansion (post-RA, constant-time) - //case X86::CTSELECT_I386_GR16rr: - //case X86::CTSELECT_I386_GR32rr: - // return expandCtSelectI386(MI); - - // VR64-specific CTSELECT expansion (post-RA, constant-time) - //case X86::CTSELECT_I386_VR64rr: - // return expandCtSelectI386VR64(MI); } return false; 
} diff --git a/llvm/lib/Target/X86/X86InstrInfo.h b/llvm/lib/Target/X86/X86InstrInfo.h index 064445d277574..ebd7e070d5fe8 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.h +++ b/llvm/lib/Target/X86/X86InstrInfo.h @@ -730,12 +730,6 @@ class X86InstrInfo final : public X86GenInstrInfo { bool expandCtSelectVector(MachineInstr &MI) const; - /// Expand i386-specific CTSELECT pseudo instructions (post-RA, constant-time) - bool expandCtSelectI386(MachineInstr &MI) const; - - /// Expand VR64-specific CTSELECT pseudo instructions (post-RA, constant-time) - bool expandCtSelectI386VR64(MachineInstr &MI) const; - /// Returns true iff the routine could find two commutable operands in the /// given machine instruction with 3 vector inputs. /// The 'SrcOpIdx1' and 'SrcOpIdx2' are INPUT and OUTPUT arguments. Their diff --git a/llvm/test/CodeGen/X86/ctselect-i386-fp.ll b/llvm/test/CodeGen/X86/ctselect-i386-fp.ll index b188c07164716..b62bb9075f3db 100644 --- a/llvm/test/CodeGen/X86/ctselect-i386-fp.ll +++ b/llvm/test/CodeGen/X86/ctselect-i386-fp.ll @@ -13,48 +13,60 @@ define float @test_ctselect_f32_basic(i1 %cond, float %a, float %b) nounwind { ; I386-NOCMOV-LABEL: test_ctselect_f32_basic: ; I386-NOCMOV: # %bb.0: +; I386-NOCMOV-NEXT: pushl %edi +; I386-NOCMOV-NEXT: pushl %esi ; I386-NOCMOV-NEXT: subl $12, %esp ; I386-NOCMOV-NEXT: flds {{[0-9]+}}(%esp) ; I386-NOCMOV-NEXT: flds {{[0-9]+}}(%esp) -; I386-NOCMOV-NEXT: xorl %eax, %eax ; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) ; I386-NOCMOV-NEXT: sete %al -; I386-NOCMOV-NEXT: negl %eax ; I386-NOCMOV-NEXT: fxch %st(1) ; I386-NOCMOV-NEXT: fstps {{[0-9]+}}(%esp) ; I386-NOCMOV-NEXT: fstps (%esp) ; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx ; I386-NOCMOV-NEXT: movl (%esp), %edx -; I386-NOCMOV-NEXT: andl %eax, %ecx -; I386-NOCMOV-NEXT: notl %eax -; I386-NOCMOV-NEXT: andl %edx, %eax -; I386-NOCMOV-NEXT: orl %ecx, %eax -; I386-NOCMOV-NEXT: movl %eax, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: movb %al, %ah +; I386-NOCMOV-NEXT: movzbl %ah, %edi +; I386-NOCMOV-NEXT: negl %edi +; I386-NOCMOV-NEXT: movl %edx, %esi +; I386-NOCMOV-NEXT: andl %edi, %esi +; I386-NOCMOV-NEXT: notl %edi +; I386-NOCMOV-NEXT: andl %ecx, %edi +; I386-NOCMOV-NEXT: orl %edi, %esi +; I386-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp) ; I386-NOCMOV-NEXT: flds {{[0-9]+}}(%esp) ; I386-NOCMOV-NEXT: addl $12, %esp +; I386-NOCMOV-NEXT: popl %esi +; I386-NOCMOV-NEXT: popl %edi ; I386-NOCMOV-NEXT: retl ; ; I386-CMOV-LABEL: test_ctselect_f32_basic: ; I386-CMOV: # %bb.0: +; I386-CMOV-NEXT: pushl %edi +; I386-CMOV-NEXT: pushl %esi ; I386-CMOV-NEXT: subl $12, %esp ; I386-CMOV-NEXT: flds {{[0-9]+}}(%esp) ; I386-CMOV-NEXT: flds {{[0-9]+}}(%esp) -; I386-CMOV-NEXT: xorl %eax, %eax ; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp) ; I386-CMOV-NEXT: sete %al -; I386-CMOV-NEXT: negl %eax ; I386-CMOV-NEXT: fxch %st(1) ; I386-CMOV-NEXT: fstps {{[0-9]+}}(%esp) ; I386-CMOV-NEXT: fstps (%esp) ; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx ; I386-CMOV-NEXT: movl (%esp), %edx -; I386-CMOV-NEXT: andl %eax, %ecx -; I386-CMOV-NEXT: notl %eax -; I386-CMOV-NEXT: andl %edx, %eax -; I386-CMOV-NEXT: orl %ecx, %eax -; I386-CMOV-NEXT: movl %eax, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: movb %al, %ah +; I386-CMOV-NEXT: movzbl %ah, %edi +; I386-CMOV-NEXT: negl %edi +; I386-CMOV-NEXT: movl %edx, %esi +; I386-CMOV-NEXT: andl %edi, %esi +; I386-CMOV-NEXT: notl %edi +; I386-CMOV-NEXT: andl %ecx, %edi +; I386-CMOV-NEXT: orl %edi, %esi +; I386-CMOV-NEXT: movl %esi, {{[0-9]+}}(%esp) ; I386-CMOV-NEXT: flds {{[0-9]+}}(%esp) ; I386-CMOV-NEXT: addl $12, %esp +; 
I386-CMOV-NEXT: popl %esi +; I386-CMOV-NEXT: popl %edi ; I386-CMOV-NEXT: retl %result = call float @llvm.ct.select.f32(i1 %cond, float %a, float %b) ret float %result @@ -64,6 +76,8 @@ define float @test_ctselect_f32_basic(i1 %cond, float %a, float %b) nounwind { define float @test_ctselect_f32_eq(float %x, float %y, float %a, float %b) nounwind { ; I386-NOCMOV-LABEL: test_ctselect_f32_eq: ; I386-NOCMOV: # %bb.0: +; I386-NOCMOV-NEXT: pushl %edi +; I386-NOCMOV-NEXT: pushl %esi ; I386-NOCMOV-NEXT: subl $12, %esp ; I386-NOCMOV-NEXT: flds {{[0-9]+}}(%esp) ; I386-NOCMOV-NEXT: flds {{[0-9]+}}(%esp) @@ -73,28 +87,34 @@ define float @test_ctselect_f32_eq(float %x, float %y, float %a, float %b) nounw ; I386-NOCMOV-NEXT: fnstsw %ax ; I386-NOCMOV-NEXT: # kill: def $ah killed $ah killed $ax ; I386-NOCMOV-NEXT: sahf -; I386-NOCMOV-NEXT: setnp %cl -; I386-NOCMOV-NEXT: sete %dl -; I386-NOCMOV-NEXT: xorl %eax, %eax -; I386-NOCMOV-NEXT: testb %cl, %dl +; I386-NOCMOV-NEXT: setnp %al +; I386-NOCMOV-NEXT: sete %cl +; I386-NOCMOV-NEXT: testb %al, %cl ; I386-NOCMOV-NEXT: sete %al -; I386-NOCMOV-NEXT: negl %eax ; I386-NOCMOV-NEXT: fxch %st(1) ; I386-NOCMOV-NEXT: fstps {{[0-9]+}}(%esp) ; I386-NOCMOV-NEXT: fstps (%esp) ; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx ; I386-NOCMOV-NEXT: movl (%esp), %edx -; I386-NOCMOV-NEXT: andl %eax, %ecx -; I386-NOCMOV-NEXT: notl %eax -; I386-NOCMOV-NEXT: andl %edx, %eax -; I386-NOCMOV-NEXT: orl %ecx, %eax -; I386-NOCMOV-NEXT: movl %eax, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: movb %al, %ah +; I386-NOCMOV-NEXT: movzbl %ah, %edi +; I386-NOCMOV-NEXT: negl %edi +; I386-NOCMOV-NEXT: movl %edx, %esi +; I386-NOCMOV-NEXT: andl %edi, %esi +; I386-NOCMOV-NEXT: notl %edi +; I386-NOCMOV-NEXT: andl %ecx, %edi +; I386-NOCMOV-NEXT: orl %edi, %esi +; I386-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp) ; I386-NOCMOV-NEXT: flds {{[0-9]+}}(%esp) ; I386-NOCMOV-NEXT: addl $12, %esp +; I386-NOCMOV-NEXT: popl %esi +; I386-NOCMOV-NEXT: popl %edi ; I386-NOCMOV-NEXT: retl ; ; I386-CMOV-LABEL: test_ctselect_f32_eq: ; I386-CMOV: # %bb.0: +; I386-CMOV-NEXT: pushl %edi +; I386-CMOV-NEXT: pushl %esi ; I386-CMOV-NEXT: subl $12, %esp ; I386-CMOV-NEXT: flds {{[0-9]+}}(%esp) ; I386-CMOV-NEXT: flds {{[0-9]+}}(%esp) @@ -102,24 +122,28 @@ define float @test_ctselect_f32_eq(float %x, float %y, float %a, float %b) nounw ; I386-CMOV-NEXT: flds {{[0-9]+}}(%esp) ; I386-CMOV-NEXT: fucompi %st(1), %st ; I386-CMOV-NEXT: fstp %st(0) -; I386-CMOV-NEXT: setnp %cl -; I386-CMOV-NEXT: sete %dl -; I386-CMOV-NEXT: xorl %eax, %eax -; I386-CMOV-NEXT: testb %cl, %dl +; I386-CMOV-NEXT: setnp %al +; I386-CMOV-NEXT: sete %cl +; I386-CMOV-NEXT: testb %al, %cl ; I386-CMOV-NEXT: sete %al -; I386-CMOV-NEXT: negl %eax ; I386-CMOV-NEXT: fxch %st(1) ; I386-CMOV-NEXT: fstps {{[0-9]+}}(%esp) ; I386-CMOV-NEXT: fstps (%esp) ; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx ; I386-CMOV-NEXT: movl (%esp), %edx -; I386-CMOV-NEXT: andl %eax, %ecx -; I386-CMOV-NEXT: notl %eax -; I386-CMOV-NEXT: andl %edx, %eax -; I386-CMOV-NEXT: orl %ecx, %eax -; I386-CMOV-NEXT: movl %eax, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: movb %al, %ah +; I386-CMOV-NEXT: movzbl %ah, %edi +; I386-CMOV-NEXT: negl %edi +; I386-CMOV-NEXT: movl %edx, %esi +; I386-CMOV-NEXT: andl %edi, %esi +; I386-CMOV-NEXT: notl %edi +; I386-CMOV-NEXT: andl %ecx, %edi +; I386-CMOV-NEXT: orl %edi, %esi +; I386-CMOV-NEXT: movl %esi, {{[0-9]+}}(%esp) ; I386-CMOV-NEXT: flds {{[0-9]+}}(%esp) ; I386-CMOV-NEXT: addl $12, %esp +; I386-CMOV-NEXT: popl %esi +; I386-CMOV-NEXT: popl %edi ; I386-CMOV-NEXT: retl %cmp = fcmp oeq 
float %x, %y %result = call float @llvm.ct.select.f32(i1 %cmp, float %a, float %b) @@ -130,66 +154,82 @@ define float @test_ctselect_f32_eq(float %x, float %y, float %a, float %b) nounw define double @test_ctselect_f64_basic(i1 %cond, double %a, double %b) nounwind { ; I386-NOCMOV-LABEL: test_ctselect_f64_basic: ; I386-NOCMOV: # %bb.0: +; I386-NOCMOV-NEXT: pushl %edi ; I386-NOCMOV-NEXT: pushl %esi ; I386-NOCMOV-NEXT: subl $24, %esp ; I386-NOCMOV-NEXT: fldl {{[0-9]+}}(%esp) ; I386-NOCMOV-NEXT: fldl {{[0-9]+}}(%esp) -; I386-NOCMOV-NEXT: xorl %eax, %eax ; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) ; I386-NOCMOV-NEXT: sete %al -; I386-NOCMOV-NEXT: negl %eax -; I386-NOCMOV-NEXT: movl %eax, %ecx -; I386-NOCMOV-NEXT: notl %ecx ; I386-NOCMOV-NEXT: fxch %st(1) ; I386-NOCMOV-NEXT: fstpl {{[0-9]+}}(%esp) ; I386-NOCMOV-NEXT: fstpl {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx ; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx -; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %esi -; I386-NOCMOV-NEXT: andl %eax, %edx -; I386-NOCMOV-NEXT: andl %ecx, %esi -; I386-NOCMOV-NEXT: orl %edx, %esi +; I386-NOCMOV-NEXT: movb %al, %ah +; I386-NOCMOV-NEXT: movzbl %ah, %edi +; I386-NOCMOV-NEXT: negl %edi +; I386-NOCMOV-NEXT: movl %edx, %esi +; I386-NOCMOV-NEXT: andl %edi, %esi +; I386-NOCMOV-NEXT: notl %edi +; I386-NOCMOV-NEXT: andl %ecx, %edi +; I386-NOCMOV-NEXT: orl %edi, %esi ; I386-NOCMOV-NEXT: movl %esi, (%esp) +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx ; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx -; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %esi -; I386-NOCMOV-NEXT: andl %eax, %edx -; I386-NOCMOV-NEXT: andl %ecx, %esi -; I386-NOCMOV-NEXT: orl %edx, %esi +; I386-NOCMOV-NEXT: movb %al, %ah +; I386-NOCMOV-NEXT: movzbl %ah, %edi +; I386-NOCMOV-NEXT: negl %edi +; I386-NOCMOV-NEXT: movl %edx, %esi +; I386-NOCMOV-NEXT: andl %edi, %esi +; I386-NOCMOV-NEXT: notl %edi +; I386-NOCMOV-NEXT: andl %ecx, %edi +; I386-NOCMOV-NEXT: orl %edi, %esi ; I386-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp) ; I386-NOCMOV-NEXT: fldl (%esp) ; I386-NOCMOV-NEXT: addl $24, %esp ; I386-NOCMOV-NEXT: popl %esi +; I386-NOCMOV-NEXT: popl %edi ; I386-NOCMOV-NEXT: retl ; ; I386-CMOV-LABEL: test_ctselect_f64_basic: ; I386-CMOV: # %bb.0: +; I386-CMOV-NEXT: pushl %edi ; I386-CMOV-NEXT: pushl %esi ; I386-CMOV-NEXT: subl $24, %esp ; I386-CMOV-NEXT: fldl {{[0-9]+}}(%esp) ; I386-CMOV-NEXT: fldl {{[0-9]+}}(%esp) -; I386-CMOV-NEXT: xorl %eax, %eax ; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp) ; I386-CMOV-NEXT: sete %al -; I386-CMOV-NEXT: negl %eax -; I386-CMOV-NEXT: movl %eax, %ecx -; I386-CMOV-NEXT: notl %ecx ; I386-CMOV-NEXT: fxch %st(1) ; I386-CMOV-NEXT: fstpl {{[0-9]+}}(%esp) ; I386-CMOV-NEXT: fstpl {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx ; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx -; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %esi -; I386-CMOV-NEXT: andl %eax, %edx -; I386-CMOV-NEXT: andl %ecx, %esi -; I386-CMOV-NEXT: orl %edx, %esi +; I386-CMOV-NEXT: movb %al, %ah +; I386-CMOV-NEXT: movzbl %ah, %edi +; I386-CMOV-NEXT: negl %edi +; I386-CMOV-NEXT: movl %edx, %esi +; I386-CMOV-NEXT: andl %edi, %esi +; I386-CMOV-NEXT: notl %edi +; I386-CMOV-NEXT: andl %ecx, %edi +; I386-CMOV-NEXT: orl %edi, %esi ; I386-CMOV-NEXT: movl %esi, (%esp) +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx ; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx -; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %esi -; I386-CMOV-NEXT: andl %eax, %edx -; I386-CMOV-NEXT: andl %ecx, %esi -; I386-CMOV-NEXT: orl %edx, %esi +; I386-CMOV-NEXT: movb %al, %ah +; 
I386-CMOV-NEXT: movzbl %ah, %edi +; I386-CMOV-NEXT: negl %edi +; I386-CMOV-NEXT: movl %edx, %esi +; I386-CMOV-NEXT: andl %edi, %esi +; I386-CMOV-NEXT: notl %edi +; I386-CMOV-NEXT: andl %ecx, %edi +; I386-CMOV-NEXT: orl %edi, %esi ; I386-CMOV-NEXT: movl %esi, {{[0-9]+}}(%esp) ; I386-CMOV-NEXT: fldl (%esp) ; I386-CMOV-NEXT: addl $24, %esp ; I386-CMOV-NEXT: popl %esi +; I386-CMOV-NEXT: popl %edi ; I386-CMOV-NEXT: retl %result = call double @llvm.ct.select.f64(i1 %cond, double %a, double %b) ret double %result @@ -199,78 +239,104 @@ define double @test_ctselect_f64_basic(i1 %cond, double %a, double %b) nounwind define x86_fp80 @test_ctselect_f80_basic(i1 %cond, x86_fp80 %a, x86_fp80 %b) nounwind { ; I386-NOCMOV-LABEL: test_ctselect_f80_basic: ; I386-NOCMOV: # %bb.0: +; I386-NOCMOV-NEXT: pushl %edi ; I386-NOCMOV-NEXT: pushl %esi ; I386-NOCMOV-NEXT: subl $36, %esp ; I386-NOCMOV-NEXT: fldt {{[0-9]+}}(%esp) ; I386-NOCMOV-NEXT: fldt {{[0-9]+}}(%esp) -; I386-NOCMOV-NEXT: xorl %eax, %eax ; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) ; I386-NOCMOV-NEXT: sete %al -; I386-NOCMOV-NEXT: negl %eax -; I386-NOCMOV-NEXT: movl %eax, %ecx -; I386-NOCMOV-NEXT: notl %ecx ; I386-NOCMOV-NEXT: fxch %st(1) ; I386-NOCMOV-NEXT: fstpt {{[0-9]+}}(%esp) ; I386-NOCMOV-NEXT: fstpt {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx ; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx -; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %esi -; I386-NOCMOV-NEXT: andl %eax, %edx -; I386-NOCMOV-NEXT: andl %ecx, %esi -; I386-NOCMOV-NEXT: orl %edx, %esi +; I386-NOCMOV-NEXT: movb %al, %ah +; I386-NOCMOV-NEXT: movzbl %ah, %edi +; I386-NOCMOV-NEXT: negl %edi +; I386-NOCMOV-NEXT: movl %edx, %esi +; I386-NOCMOV-NEXT: andl %edi, %esi +; I386-NOCMOV-NEXT: notl %edi +; I386-NOCMOV-NEXT: andl %ecx, %edi +; I386-NOCMOV-NEXT: orl %edi, %esi ; I386-NOCMOV-NEXT: movl %esi, (%esp) +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx ; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx -; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %esi -; I386-NOCMOV-NEXT: andl %eax, %edx -; I386-NOCMOV-NEXT: andl %ecx, %esi -; I386-NOCMOV-NEXT: orl %edx, %esi +; I386-NOCMOV-NEXT: movb %al, %ah +; I386-NOCMOV-NEXT: movzbl %ah, %edi +; I386-NOCMOV-NEXT: negl %edi +; I386-NOCMOV-NEXT: movl %edx, %esi +; I386-NOCMOV-NEXT: andl %edi, %esi +; I386-NOCMOV-NEXT: notl %edi +; I386-NOCMOV-NEXT: andl %ecx, %edi +; I386-NOCMOV-NEXT: orl %edi, %esi ; I386-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx ; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx -; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %esi -; I386-NOCMOV-NEXT: andl %eax, %edx -; I386-NOCMOV-NEXT: andl %ecx, %esi -; I386-NOCMOV-NEXT: orl %edx, %esi +; I386-NOCMOV-NEXT: movb %al, %ah +; I386-NOCMOV-NEXT: movzbl %ah, %edi +; I386-NOCMOV-NEXT: negl %edi +; I386-NOCMOV-NEXT: movl %edx, %esi +; I386-NOCMOV-NEXT: andl %edi, %esi +; I386-NOCMOV-NEXT: notl %edi +; I386-NOCMOV-NEXT: andl %ecx, %edi +; I386-NOCMOV-NEXT: orl %edi, %esi ; I386-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp) ; I386-NOCMOV-NEXT: fldt (%esp) ; I386-NOCMOV-NEXT: addl $36, %esp ; I386-NOCMOV-NEXT: popl %esi +; I386-NOCMOV-NEXT: popl %edi ; I386-NOCMOV-NEXT: retl ; ; I386-CMOV-LABEL: test_ctselect_f80_basic: ; I386-CMOV: # %bb.0: +; I386-CMOV-NEXT: pushl %edi ; I386-CMOV-NEXT: pushl %esi ; I386-CMOV-NEXT: subl $36, %esp ; I386-CMOV-NEXT: fldt {{[0-9]+}}(%esp) ; I386-CMOV-NEXT: fldt {{[0-9]+}}(%esp) -; I386-CMOV-NEXT: xorl %eax, %eax ; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp) ; I386-CMOV-NEXT: sete %al -; I386-CMOV-NEXT: 
negl %eax -; I386-CMOV-NEXT: movl %eax, %ecx -; I386-CMOV-NEXT: notl %ecx ; I386-CMOV-NEXT: fxch %st(1) ; I386-CMOV-NEXT: fstpt {{[0-9]+}}(%esp) ; I386-CMOV-NEXT: fstpt {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx ; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx -; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %esi -; I386-CMOV-NEXT: andl %eax, %edx -; I386-CMOV-NEXT: andl %ecx, %esi -; I386-CMOV-NEXT: orl %edx, %esi +; I386-CMOV-NEXT: movb %al, %ah +; I386-CMOV-NEXT: movzbl %ah, %edi +; I386-CMOV-NEXT: negl %edi +; I386-CMOV-NEXT: movl %edx, %esi +; I386-CMOV-NEXT: andl %edi, %esi +; I386-CMOV-NEXT: notl %edi +; I386-CMOV-NEXT: andl %ecx, %edi +; I386-CMOV-NEXT: orl %edi, %esi ; I386-CMOV-NEXT: movl %esi, (%esp) +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx ; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx -; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %esi -; I386-CMOV-NEXT: andl %eax, %edx -; I386-CMOV-NEXT: andl %ecx, %esi -; I386-CMOV-NEXT: orl %edx, %esi +; I386-CMOV-NEXT: movb %al, %ah +; I386-CMOV-NEXT: movzbl %ah, %edi +; I386-CMOV-NEXT: negl %edi +; I386-CMOV-NEXT: movl %edx, %esi +; I386-CMOV-NEXT: andl %edi, %esi +; I386-CMOV-NEXT: notl %edi +; I386-CMOV-NEXT: andl %ecx, %edi +; I386-CMOV-NEXT: orl %edi, %esi ; I386-CMOV-NEXT: movl %esi, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx ; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx -; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %esi -; I386-CMOV-NEXT: andl %eax, %edx -; I386-CMOV-NEXT: andl %ecx, %esi -; I386-CMOV-NEXT: orl %edx, %esi +; I386-CMOV-NEXT: movb %al, %ah +; I386-CMOV-NEXT: movzbl %ah, %edi +; I386-CMOV-NEXT: negl %edi +; I386-CMOV-NEXT: movl %edx, %esi +; I386-CMOV-NEXT: andl %edi, %esi +; I386-CMOV-NEXT: notl %edi +; I386-CMOV-NEXT: andl %ecx, %edi +; I386-CMOV-NEXT: orl %edi, %esi ; I386-CMOV-NEXT: movl %esi, {{[0-9]+}}(%esp) ; I386-CMOV-NEXT: fldt (%esp) ; I386-CMOV-NEXT: addl $36, %esp ; I386-CMOV-NEXT: popl %esi +; I386-CMOV-NEXT: popl %edi ; I386-CMOV-NEXT: retl %result = call x86_fp80 @llvm.ct.select.f80(i1 %cond, x86_fp80 %a, x86_fp80 %b) ret x86_fp80 %result @@ -280,6 +346,8 @@ define x86_fp80 @test_ctselect_f80_basic(i1 %cond, x86_fp80 %a, x86_fp80 %b) nou define float @test_ctselect_f32_gt(float %x, float %y, float %a, float %b) nounwind { ; I386-NOCMOV-LABEL: test_ctselect_f32_gt: ; I386-NOCMOV: # %bb.0: +; I386-NOCMOV-NEXT: pushl %edi +; I386-NOCMOV-NEXT: pushl %esi ; I386-NOCMOV-NEXT: subl $12, %esp ; I386-NOCMOV-NEXT: flds {{[0-9]+}}(%esp) ; I386-NOCMOV-NEXT: flds {{[0-9]+}}(%esp) @@ -289,27 +357,33 @@ define float @test_ctselect_f32_gt(float %x, float %y, float %a, float %b) nounw ; I386-NOCMOV-NEXT: fnstsw %ax ; I386-NOCMOV-NEXT: # kill: def $ah killed $ah killed $ax ; I386-NOCMOV-NEXT: sahf -; I386-NOCMOV-NEXT: seta %cl -; I386-NOCMOV-NEXT: xorl %eax, %eax -; I386-NOCMOV-NEXT: testb %cl, %cl +; I386-NOCMOV-NEXT: seta %al +; I386-NOCMOV-NEXT: testb %al, %al ; I386-NOCMOV-NEXT: sete %al -; I386-NOCMOV-NEXT: negl %eax ; I386-NOCMOV-NEXT: fxch %st(1) ; I386-NOCMOV-NEXT: fstps {{[0-9]+}}(%esp) ; I386-NOCMOV-NEXT: fstps (%esp) ; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx ; I386-NOCMOV-NEXT: movl (%esp), %edx -; I386-NOCMOV-NEXT: andl %eax, %ecx -; I386-NOCMOV-NEXT: notl %eax -; I386-NOCMOV-NEXT: andl %edx, %eax -; I386-NOCMOV-NEXT: orl %ecx, %eax -; I386-NOCMOV-NEXT: movl %eax, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: movb %al, %ah +; I386-NOCMOV-NEXT: movzbl %ah, %edi +; I386-NOCMOV-NEXT: negl %edi +; I386-NOCMOV-NEXT: movl %edx, %esi +; I386-NOCMOV-NEXT: andl %edi, %esi +; I386-NOCMOV-NEXT: 
notl %edi +; I386-NOCMOV-NEXT: andl %ecx, %edi +; I386-NOCMOV-NEXT: orl %edi, %esi +; I386-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp) ; I386-NOCMOV-NEXT: flds {{[0-9]+}}(%esp) ; I386-NOCMOV-NEXT: addl $12, %esp +; I386-NOCMOV-NEXT: popl %esi +; I386-NOCMOV-NEXT: popl %edi ; I386-NOCMOV-NEXT: retl ; ; I386-CMOV-LABEL: test_ctselect_f32_gt: ; I386-CMOV: # %bb.0: +; I386-CMOV-NEXT: pushl %edi +; I386-CMOV-NEXT: pushl %esi ; I386-CMOV-NEXT: subl $12, %esp ; I386-CMOV-NEXT: flds {{[0-9]+}}(%esp) ; I386-CMOV-NEXT: flds {{[0-9]+}}(%esp) @@ -317,23 +391,27 @@ define float @test_ctselect_f32_gt(float %x, float %y, float %a, float %b) nounw ; I386-CMOV-NEXT: flds {{[0-9]+}}(%esp) ; I386-CMOV-NEXT: fucompi %st(1), %st ; I386-CMOV-NEXT: fstp %st(0) -; I386-CMOV-NEXT: seta %cl -; I386-CMOV-NEXT: xorl %eax, %eax -; I386-CMOV-NEXT: testb %cl, %cl +; I386-CMOV-NEXT: seta %al +; I386-CMOV-NEXT: testb %al, %al ; I386-CMOV-NEXT: sete %al -; I386-CMOV-NEXT: negl %eax ; I386-CMOV-NEXT: fxch %st(1) ; I386-CMOV-NEXT: fstps {{[0-9]+}}(%esp) ; I386-CMOV-NEXT: fstps (%esp) ; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx ; I386-CMOV-NEXT: movl (%esp), %edx -; I386-CMOV-NEXT: andl %eax, %ecx -; I386-CMOV-NEXT: notl %eax -; I386-CMOV-NEXT: andl %edx, %eax -; I386-CMOV-NEXT: orl %ecx, %eax -; I386-CMOV-NEXT: movl %eax, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: movb %al, %ah +; I386-CMOV-NEXT: movzbl %ah, %edi +; I386-CMOV-NEXT: negl %edi +; I386-CMOV-NEXT: movl %edx, %esi +; I386-CMOV-NEXT: andl %edi, %esi +; I386-CMOV-NEXT: notl %edi +; I386-CMOV-NEXT: andl %ecx, %edi +; I386-CMOV-NEXT: orl %edi, %esi +; I386-CMOV-NEXT: movl %esi, {{[0-9]+}}(%esp) ; I386-CMOV-NEXT: flds {{[0-9]+}}(%esp) ; I386-CMOV-NEXT: addl $12, %esp +; I386-CMOV-NEXT: popl %esi +; I386-CMOV-NEXT: popl %edi ; I386-CMOV-NEXT: retl %cmp = fcmp ogt float %x, %y %result = call float @llvm.ct.select.f32(i1 %cmp, float %a, float %b) @@ -344,48 +422,60 @@ define float @test_ctselect_f32_gt(float %x, float %y, float %a, float %b) nounw define float @test_ctselect_f32_no_branches(i1 %cond, float %a, float %b) nounwind { ; I386-NOCMOV-LABEL: test_ctselect_f32_no_branches: ; I386-NOCMOV: # %bb.0: +; I386-NOCMOV-NEXT: pushl %edi +; I386-NOCMOV-NEXT: pushl %esi ; I386-NOCMOV-NEXT: subl $12, %esp ; I386-NOCMOV-NEXT: flds {{[0-9]+}}(%esp) ; I386-NOCMOV-NEXT: flds {{[0-9]+}}(%esp) -; I386-NOCMOV-NEXT: xorl %eax, %eax ; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) ; I386-NOCMOV-NEXT: sete %al -; I386-NOCMOV-NEXT: negl %eax ; I386-NOCMOV-NEXT: fxch %st(1) ; I386-NOCMOV-NEXT: fstps {{[0-9]+}}(%esp) ; I386-NOCMOV-NEXT: fstps (%esp) ; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx ; I386-NOCMOV-NEXT: movl (%esp), %edx -; I386-NOCMOV-NEXT: andl %eax, %ecx -; I386-NOCMOV-NEXT: notl %eax -; I386-NOCMOV-NEXT: andl %edx, %eax -; I386-NOCMOV-NEXT: orl %ecx, %eax -; I386-NOCMOV-NEXT: movl %eax, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: movb %al, %ah +; I386-NOCMOV-NEXT: movzbl %ah, %edi +; I386-NOCMOV-NEXT: negl %edi +; I386-NOCMOV-NEXT: movl %edx, %esi +; I386-NOCMOV-NEXT: andl %edi, %esi +; I386-NOCMOV-NEXT: notl %edi +; I386-NOCMOV-NEXT: andl %ecx, %edi +; I386-NOCMOV-NEXT: orl %edi, %esi +; I386-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp) ; I386-NOCMOV-NEXT: flds {{[0-9]+}}(%esp) ; I386-NOCMOV-NEXT: addl $12, %esp +; I386-NOCMOV-NEXT: popl %esi +; I386-NOCMOV-NEXT: popl %edi ; I386-NOCMOV-NEXT: retl ; ; I386-CMOV-LABEL: test_ctselect_f32_no_branches: ; I386-CMOV: # %bb.0: +; I386-CMOV-NEXT: pushl %edi +; I386-CMOV-NEXT: pushl %esi ; I386-CMOV-NEXT: subl $12, %esp ; I386-CMOV-NEXT: flds 
{{[0-9]+}}(%esp) ; I386-CMOV-NEXT: flds {{[0-9]+}}(%esp) -; I386-CMOV-NEXT: xorl %eax, %eax ; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp) ; I386-CMOV-NEXT: sete %al -; I386-CMOV-NEXT: negl %eax ; I386-CMOV-NEXT: fxch %st(1) ; I386-CMOV-NEXT: fstps {{[0-9]+}}(%esp) ; I386-CMOV-NEXT: fstps (%esp) ; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx ; I386-CMOV-NEXT: movl (%esp), %edx -; I386-CMOV-NEXT: andl %eax, %ecx -; I386-CMOV-NEXT: notl %eax -; I386-CMOV-NEXT: andl %edx, %eax -; I386-CMOV-NEXT: orl %ecx, %eax -; I386-CMOV-NEXT: movl %eax, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: movb %al, %ah +; I386-CMOV-NEXT: movzbl %ah, %edi +; I386-CMOV-NEXT: negl %edi +; I386-CMOV-NEXT: movl %edx, %esi +; I386-CMOV-NEXT: andl %edi, %esi +; I386-CMOV-NEXT: notl %edi +; I386-CMOV-NEXT: andl %ecx, %edi +; I386-CMOV-NEXT: orl %edi, %esi +; I386-CMOV-NEXT: movl %esi, {{[0-9]+}}(%esp) ; I386-CMOV-NEXT: flds {{[0-9]+}}(%esp) ; I386-CMOV-NEXT: addl $12, %esp +; I386-CMOV-NEXT: popl %esi +; I386-CMOV-NEXT: popl %edi ; I386-CMOV-NEXT: retl %result = call float @llvm.ct.select.f32(i1 %cond, float %a, float %b) ret float %result @@ -395,48 +485,60 @@ define float @test_ctselect_f32_no_branches(i1 %cond, float %a, float %b) nounwi define float @test_ctselect_f32_bundled(i1 %cond, float %a, float %b) nounwind { ; I386-NOCMOV-LABEL: test_ctselect_f32_bundled: ; I386-NOCMOV: # %bb.0: +; I386-NOCMOV-NEXT: pushl %edi +; I386-NOCMOV-NEXT: pushl %esi ; I386-NOCMOV-NEXT: subl $12, %esp ; I386-NOCMOV-NEXT: flds {{[0-9]+}}(%esp) ; I386-NOCMOV-NEXT: flds {{[0-9]+}}(%esp) -; I386-NOCMOV-NEXT: xorl %eax, %eax ; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) ; I386-NOCMOV-NEXT: sete %al -; I386-NOCMOV-NEXT: negl %eax ; I386-NOCMOV-NEXT: fxch %st(1) ; I386-NOCMOV-NEXT: fstps {{[0-9]+}}(%esp) ; I386-NOCMOV-NEXT: fstps (%esp) ; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx ; I386-NOCMOV-NEXT: movl (%esp), %edx -; I386-NOCMOV-NEXT: andl %eax, %ecx -; I386-NOCMOV-NEXT: notl %eax -; I386-NOCMOV-NEXT: andl %edx, %eax -; I386-NOCMOV-NEXT: orl %ecx, %eax -; I386-NOCMOV-NEXT: movl %eax, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: movb %al, %ah +; I386-NOCMOV-NEXT: movzbl %ah, %edi +; I386-NOCMOV-NEXT: negl %edi +; I386-NOCMOV-NEXT: movl %edx, %esi +; I386-NOCMOV-NEXT: andl %edi, %esi +; I386-NOCMOV-NEXT: notl %edi +; I386-NOCMOV-NEXT: andl %ecx, %edi +; I386-NOCMOV-NEXT: orl %edi, %esi +; I386-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp) ; I386-NOCMOV-NEXT: flds {{[0-9]+}}(%esp) ; I386-NOCMOV-NEXT: addl $12, %esp +; I386-NOCMOV-NEXT: popl %esi +; I386-NOCMOV-NEXT: popl %edi ; I386-NOCMOV-NEXT: retl ; ; I386-CMOV-LABEL: test_ctselect_f32_bundled: ; I386-CMOV: # %bb.0: +; I386-CMOV-NEXT: pushl %edi +; I386-CMOV-NEXT: pushl %esi ; I386-CMOV-NEXT: subl $12, %esp ; I386-CMOV-NEXT: flds {{[0-9]+}}(%esp) ; I386-CMOV-NEXT: flds {{[0-9]+}}(%esp) -; I386-CMOV-NEXT: xorl %eax, %eax ; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp) ; I386-CMOV-NEXT: sete %al -; I386-CMOV-NEXT: negl %eax ; I386-CMOV-NEXT: fxch %st(1) ; I386-CMOV-NEXT: fstps {{[0-9]+}}(%esp) ; I386-CMOV-NEXT: fstps (%esp) ; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx ; I386-CMOV-NEXT: movl (%esp), %edx -; I386-CMOV-NEXT: andl %eax, %ecx -; I386-CMOV-NEXT: notl %eax -; I386-CMOV-NEXT: andl %edx, %eax -; I386-CMOV-NEXT: orl %ecx, %eax -; I386-CMOV-NEXT: movl %eax, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: movb %al, %ah +; I386-CMOV-NEXT: movzbl %ah, %edi +; I386-CMOV-NEXT: negl %edi +; I386-CMOV-NEXT: movl %edx, %esi +; I386-CMOV-NEXT: andl %edi, %esi +; I386-CMOV-NEXT: notl %edi +; I386-CMOV-NEXT: andl %ecx, %edi +; 
I386-CMOV-NEXT: orl %edi, %esi +; I386-CMOV-NEXT: movl %esi, {{[0-9]+}}(%esp) ; I386-CMOV-NEXT: flds {{[0-9]+}}(%esp) ; I386-CMOV-NEXT: addl $12, %esp +; I386-CMOV-NEXT: popl %esi +; I386-CMOV-NEXT: popl %edi ; I386-CMOV-NEXT: retl %result = call float @llvm.ct.select.f32(i1 %cond, float %a, float %b) ret float %result @@ -446,48 +548,60 @@ define float @test_ctselect_f32_bundled(i1 %cond, float %a, float %b) nounwind { define float @test_ctselect_f32_nan(i1 %cond) nounwind { ; I386-NOCMOV-LABEL: test_ctselect_f32_nan: ; I386-NOCMOV: # %bb.0: +; I386-NOCMOV-NEXT: pushl %edi +; I386-NOCMOV-NEXT: pushl %esi ; I386-NOCMOV-NEXT: subl $12, %esp -; I386-NOCMOV-NEXT: xorl %eax, %eax ; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) ; I386-NOCMOV-NEXT: flds {{\.?LCPI[0-9]+_[0-9]+}} ; I386-NOCMOV-NEXT: fldz ; I386-NOCMOV-NEXT: sete %al -; I386-NOCMOV-NEXT: negl %eax ; I386-NOCMOV-NEXT: fxch %st(1) ; I386-NOCMOV-NEXT: fstps {{[0-9]+}}(%esp) ; I386-NOCMOV-NEXT: fstps (%esp) ; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx ; I386-NOCMOV-NEXT: movl (%esp), %edx -; I386-NOCMOV-NEXT: andl %eax, %ecx -; I386-NOCMOV-NEXT: notl %eax -; I386-NOCMOV-NEXT: andl %edx, %eax -; I386-NOCMOV-NEXT: orl %ecx, %eax -; I386-NOCMOV-NEXT: movl %eax, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: movb %al, %ah +; I386-NOCMOV-NEXT: movzbl %ah, %edi +; I386-NOCMOV-NEXT: negl %edi +; I386-NOCMOV-NEXT: movl %edx, %esi +; I386-NOCMOV-NEXT: andl %edi, %esi +; I386-NOCMOV-NEXT: notl %edi +; I386-NOCMOV-NEXT: andl %ecx, %edi +; I386-NOCMOV-NEXT: orl %edi, %esi +; I386-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp) ; I386-NOCMOV-NEXT: flds {{[0-9]+}}(%esp) ; I386-NOCMOV-NEXT: addl $12, %esp +; I386-NOCMOV-NEXT: popl %esi +; I386-NOCMOV-NEXT: popl %edi ; I386-NOCMOV-NEXT: retl ; ; I386-CMOV-LABEL: test_ctselect_f32_nan: ; I386-CMOV: # %bb.0: +; I386-CMOV-NEXT: pushl %edi +; I386-CMOV-NEXT: pushl %esi ; I386-CMOV-NEXT: subl $12, %esp -; I386-CMOV-NEXT: xorl %eax, %eax ; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp) ; I386-CMOV-NEXT: flds {{\.?LCPI[0-9]+_[0-9]+}} ; I386-CMOV-NEXT: fldz ; I386-CMOV-NEXT: sete %al -; I386-CMOV-NEXT: negl %eax ; I386-CMOV-NEXT: fxch %st(1) ; I386-CMOV-NEXT: fstps {{[0-9]+}}(%esp) ; I386-CMOV-NEXT: fstps (%esp) ; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx ; I386-CMOV-NEXT: movl (%esp), %edx -; I386-CMOV-NEXT: andl %eax, %ecx -; I386-CMOV-NEXT: notl %eax -; I386-CMOV-NEXT: andl %edx, %eax -; I386-CMOV-NEXT: orl %ecx, %eax -; I386-CMOV-NEXT: movl %eax, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: movb %al, %ah +; I386-CMOV-NEXT: movzbl %ah, %edi +; I386-CMOV-NEXT: negl %edi +; I386-CMOV-NEXT: movl %edx, %esi +; I386-CMOV-NEXT: andl %edi, %esi +; I386-CMOV-NEXT: notl %edi +; I386-CMOV-NEXT: andl %ecx, %edi +; I386-CMOV-NEXT: orl %edi, %esi +; I386-CMOV-NEXT: movl %esi, {{[0-9]+}}(%esp) ; I386-CMOV-NEXT: flds {{[0-9]+}}(%esp) ; I386-CMOV-NEXT: addl $12, %esp +; I386-CMOV-NEXT: popl %esi +; I386-CMOV-NEXT: popl %edi ; I386-CMOV-NEXT: retl %nan = bitcast i32 2139095040 to float ; 0x7F800000 = +inf %zero = bitcast i32 0 to float @@ -499,78 +613,104 @@ define float @test_ctselect_f32_nan(i1 %cond) nounwind { define x86_fp80 @test_ctselect_f80_alignment(i1 %cond, x86_fp80 %a, x86_fp80 %b) nounwind { ; I386-NOCMOV-LABEL: test_ctselect_f80_alignment: ; I386-NOCMOV: # %bb.0: +; I386-NOCMOV-NEXT: pushl %edi ; I386-NOCMOV-NEXT: pushl %esi ; I386-NOCMOV-NEXT: subl $36, %esp ; I386-NOCMOV-NEXT: fldt {{[0-9]+}}(%esp) ; I386-NOCMOV-NEXT: fldt {{[0-9]+}}(%esp) -; I386-NOCMOV-NEXT: xorl %eax, %eax ; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) ; 
I386-NOCMOV-NEXT: sete %al -; I386-NOCMOV-NEXT: negl %eax -; I386-NOCMOV-NEXT: movl %eax, %ecx -; I386-NOCMOV-NEXT: notl %ecx ; I386-NOCMOV-NEXT: fxch %st(1) ; I386-NOCMOV-NEXT: fstpt {{[0-9]+}}(%esp) ; I386-NOCMOV-NEXT: fstpt {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx ; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx -; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %esi -; I386-NOCMOV-NEXT: andl %eax, %edx -; I386-NOCMOV-NEXT: andl %ecx, %esi -; I386-NOCMOV-NEXT: orl %edx, %esi +; I386-NOCMOV-NEXT: movb %al, %ah +; I386-NOCMOV-NEXT: movzbl %ah, %edi +; I386-NOCMOV-NEXT: negl %edi +; I386-NOCMOV-NEXT: movl %edx, %esi +; I386-NOCMOV-NEXT: andl %edi, %esi +; I386-NOCMOV-NEXT: notl %edi +; I386-NOCMOV-NEXT: andl %ecx, %edi +; I386-NOCMOV-NEXT: orl %edi, %esi ; I386-NOCMOV-NEXT: movl %esi, (%esp) +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx ; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx -; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %esi -; I386-NOCMOV-NEXT: andl %eax, %edx -; I386-NOCMOV-NEXT: andl %ecx, %esi -; I386-NOCMOV-NEXT: orl %edx, %esi +; I386-NOCMOV-NEXT: movb %al, %ah +; I386-NOCMOV-NEXT: movzbl %ah, %edi +; I386-NOCMOV-NEXT: negl %edi +; I386-NOCMOV-NEXT: movl %edx, %esi +; I386-NOCMOV-NEXT: andl %edi, %esi +; I386-NOCMOV-NEXT: notl %edi +; I386-NOCMOV-NEXT: andl %ecx, %edi +; I386-NOCMOV-NEXT: orl %edi, %esi ; I386-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx ; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx -; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %esi -; I386-NOCMOV-NEXT: andl %eax, %edx -; I386-NOCMOV-NEXT: andl %ecx, %esi -; I386-NOCMOV-NEXT: orl %edx, %esi +; I386-NOCMOV-NEXT: movb %al, %ah +; I386-NOCMOV-NEXT: movzbl %ah, %edi +; I386-NOCMOV-NEXT: negl %edi +; I386-NOCMOV-NEXT: movl %edx, %esi +; I386-NOCMOV-NEXT: andl %edi, %esi +; I386-NOCMOV-NEXT: notl %edi +; I386-NOCMOV-NEXT: andl %ecx, %edi +; I386-NOCMOV-NEXT: orl %edi, %esi ; I386-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp) ; I386-NOCMOV-NEXT: fldt (%esp) ; I386-NOCMOV-NEXT: addl $36, %esp ; I386-NOCMOV-NEXT: popl %esi +; I386-NOCMOV-NEXT: popl %edi ; I386-NOCMOV-NEXT: retl ; ; I386-CMOV-LABEL: test_ctselect_f80_alignment: ; I386-CMOV: # %bb.0: +; I386-CMOV-NEXT: pushl %edi ; I386-CMOV-NEXT: pushl %esi ; I386-CMOV-NEXT: subl $36, %esp ; I386-CMOV-NEXT: fldt {{[0-9]+}}(%esp) ; I386-CMOV-NEXT: fldt {{[0-9]+}}(%esp) -; I386-CMOV-NEXT: xorl %eax, %eax ; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp) ; I386-CMOV-NEXT: sete %al -; I386-CMOV-NEXT: negl %eax -; I386-CMOV-NEXT: movl %eax, %ecx -; I386-CMOV-NEXT: notl %ecx ; I386-CMOV-NEXT: fxch %st(1) ; I386-CMOV-NEXT: fstpt {{[0-9]+}}(%esp) ; I386-CMOV-NEXT: fstpt {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx ; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx -; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %esi -; I386-CMOV-NEXT: andl %eax, %edx -; I386-CMOV-NEXT: andl %ecx, %esi -; I386-CMOV-NEXT: orl %edx, %esi +; I386-CMOV-NEXT: movb %al, %ah +; I386-CMOV-NEXT: movzbl %ah, %edi +; I386-CMOV-NEXT: negl %edi +; I386-CMOV-NEXT: movl %edx, %esi +; I386-CMOV-NEXT: andl %edi, %esi +; I386-CMOV-NEXT: notl %edi +; I386-CMOV-NEXT: andl %ecx, %edi +; I386-CMOV-NEXT: orl %edi, %esi ; I386-CMOV-NEXT: movl %esi, (%esp) +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx ; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx -; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %esi -; I386-CMOV-NEXT: andl %eax, %edx -; I386-CMOV-NEXT: andl %ecx, %esi -; I386-CMOV-NEXT: orl %edx, %esi +; I386-CMOV-NEXT: movb %al, %ah +; I386-CMOV-NEXT: movzbl %ah, 
%edi +; I386-CMOV-NEXT: negl %edi +; I386-CMOV-NEXT: movl %edx, %esi +; I386-CMOV-NEXT: andl %edi, %esi +; I386-CMOV-NEXT: notl %edi +; I386-CMOV-NEXT: andl %ecx, %edi +; I386-CMOV-NEXT: orl %edi, %esi ; I386-CMOV-NEXT: movl %esi, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx ; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx -; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %esi -; I386-CMOV-NEXT: andl %eax, %edx -; I386-CMOV-NEXT: andl %ecx, %esi -; I386-CMOV-NEXT: orl %edx, %esi +; I386-CMOV-NEXT: movb %al, %ah +; I386-CMOV-NEXT: movzbl %ah, %edi +; I386-CMOV-NEXT: negl %edi +; I386-CMOV-NEXT: movl %edx, %esi +; I386-CMOV-NEXT: andl %edi, %esi +; I386-CMOV-NEXT: notl %edi +; I386-CMOV-NEXT: andl %ecx, %edi +; I386-CMOV-NEXT: orl %edi, %esi ; I386-CMOV-NEXT: movl %esi, {{[0-9]+}}(%esp) ; I386-CMOV-NEXT: fldt (%esp) ; I386-CMOV-NEXT: addl $36, %esp ; I386-CMOV-NEXT: popl %esi +; I386-CMOV-NEXT: popl %edi ; I386-CMOV-NEXT: retl %result = call x86_fp80 @llvm.ct.select.f80(i1 %cond, x86_fp80 %a, x86_fp80 %b) ret x86_fp80 %result @@ -580,78 +720,94 @@ define x86_fp80 @test_ctselect_f80_alignment(i1 %cond, x86_fp80 %a, x86_fp80 %b) define float @test_ctselect_f32_multiple(i1 %cond1, i1 %cond2, float %a, float %b, float %c, float %d) nounwind { ; I386-NOCMOV-LABEL: test_ctselect_f32_multiple: ; I386-NOCMOV: # %bb.0: +; I386-NOCMOV-NEXT: pushl %edi +; I386-NOCMOV-NEXT: pushl %esi ; I386-NOCMOV-NEXT: subl $24, %esp ; I386-NOCMOV-NEXT: flds {{[0-9]+}}(%esp) ; I386-NOCMOV-NEXT: flds {{[0-9]+}}(%esp) ; I386-NOCMOV-NEXT: flds {{[0-9]+}}(%esp) -; I386-NOCMOV-NEXT: xorl %eax, %eax ; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) ; I386-NOCMOV-NEXT: sete %al -; I386-NOCMOV-NEXT: negl %eax ; I386-NOCMOV-NEXT: fxch %st(1) ; I386-NOCMOV-NEXT: fstps {{[0-9]+}}(%esp) ; I386-NOCMOV-NEXT: fstps {{[0-9]+}}(%esp) ; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx ; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx -; I386-NOCMOV-NEXT: andl %eax, %ecx -; I386-NOCMOV-NEXT: notl %eax -; I386-NOCMOV-NEXT: andl %edx, %eax -; I386-NOCMOV-NEXT: orl %ecx, %eax -; I386-NOCMOV-NEXT: movl %eax, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: movb %al, %ah +; I386-NOCMOV-NEXT: movzbl %ah, %edi +; I386-NOCMOV-NEXT: negl %edi +; I386-NOCMOV-NEXT: movl %edx, %esi +; I386-NOCMOV-NEXT: andl %edi, %esi +; I386-NOCMOV-NEXT: notl %edi +; I386-NOCMOV-NEXT: andl %ecx, %edi +; I386-NOCMOV-NEXT: orl %edi, %esi +; I386-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp) ; I386-NOCMOV-NEXT: flds {{[0-9]+}}(%esp) -; I386-NOCMOV-NEXT: xorl %eax, %eax ; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) ; I386-NOCMOV-NEXT: sete %al -; I386-NOCMOV-NEXT: negl %eax ; I386-NOCMOV-NEXT: fstps {{[0-9]+}}(%esp) ; I386-NOCMOV-NEXT: fstps (%esp) ; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx ; I386-NOCMOV-NEXT: movl (%esp), %edx -; I386-NOCMOV-NEXT: andl %eax, %ecx -; I386-NOCMOV-NEXT: notl %eax -; I386-NOCMOV-NEXT: andl %edx, %eax -; I386-NOCMOV-NEXT: orl %ecx, %eax -; I386-NOCMOV-NEXT: movl %eax, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: movb %al, %ah +; I386-NOCMOV-NEXT: movzbl %ah, %edi +; I386-NOCMOV-NEXT: negl %edi +; I386-NOCMOV-NEXT: movl %edx, %esi +; I386-NOCMOV-NEXT: andl %edi, %esi +; I386-NOCMOV-NEXT: notl %edi +; I386-NOCMOV-NEXT: andl %ecx, %edi +; I386-NOCMOV-NEXT: orl %edi, %esi +; I386-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp) ; I386-NOCMOV-NEXT: flds {{[0-9]+}}(%esp) ; I386-NOCMOV-NEXT: addl $24, %esp +; I386-NOCMOV-NEXT: popl %esi +; I386-NOCMOV-NEXT: popl %edi ; I386-NOCMOV-NEXT: retl ; ; I386-CMOV-LABEL: test_ctselect_f32_multiple: ; I386-CMOV: # %bb.0: +; 
I386-CMOV-NEXT: pushl %edi +; I386-CMOV-NEXT: pushl %esi ; I386-CMOV-NEXT: subl $24, %esp ; I386-CMOV-NEXT: flds {{[0-9]+}}(%esp) ; I386-CMOV-NEXT: flds {{[0-9]+}}(%esp) ; I386-CMOV-NEXT: flds {{[0-9]+}}(%esp) -; I386-CMOV-NEXT: xorl %eax, %eax ; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp) ; I386-CMOV-NEXT: sete %al -; I386-CMOV-NEXT: negl %eax ; I386-CMOV-NEXT: fxch %st(1) ; I386-CMOV-NEXT: fstps {{[0-9]+}}(%esp) ; I386-CMOV-NEXT: fstps {{[0-9]+}}(%esp) ; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx ; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx -; I386-CMOV-NEXT: andl %eax, %ecx -; I386-CMOV-NEXT: notl %eax -; I386-CMOV-NEXT: andl %edx, %eax -; I386-CMOV-NEXT: orl %ecx, %eax -; I386-CMOV-NEXT: movl %eax, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: movb %al, %ah +; I386-CMOV-NEXT: movzbl %ah, %edi +; I386-CMOV-NEXT: negl %edi +; I386-CMOV-NEXT: movl %edx, %esi +; I386-CMOV-NEXT: andl %edi, %esi +; I386-CMOV-NEXT: notl %edi +; I386-CMOV-NEXT: andl %ecx, %edi +; I386-CMOV-NEXT: orl %edi, %esi +; I386-CMOV-NEXT: movl %esi, {{[0-9]+}}(%esp) ; I386-CMOV-NEXT: flds {{[0-9]+}}(%esp) -; I386-CMOV-NEXT: xorl %eax, %eax ; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp) ; I386-CMOV-NEXT: sete %al -; I386-CMOV-NEXT: negl %eax ; I386-CMOV-NEXT: fstps {{[0-9]+}}(%esp) ; I386-CMOV-NEXT: fstps (%esp) ; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx ; I386-CMOV-NEXT: movl (%esp), %edx -; I386-CMOV-NEXT: andl %eax, %ecx -; I386-CMOV-NEXT: notl %eax -; I386-CMOV-NEXT: andl %edx, %eax -; I386-CMOV-NEXT: orl %ecx, %eax -; I386-CMOV-NEXT: movl %eax, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: movb %al, %ah +; I386-CMOV-NEXT: movzbl %ah, %edi +; I386-CMOV-NEXT: negl %edi +; I386-CMOV-NEXT: movl %edx, %esi +; I386-CMOV-NEXT: andl %edi, %esi +; I386-CMOV-NEXT: notl %edi +; I386-CMOV-NEXT: andl %ecx, %edi +; I386-CMOV-NEXT: orl %edi, %esi +; I386-CMOV-NEXT: movl %esi, {{[0-9]+}}(%esp) ; I386-CMOV-NEXT: flds {{[0-9]+}}(%esp) ; I386-CMOV-NEXT: addl $24, %esp +; I386-CMOV-NEXT: popl %esi +; I386-CMOV-NEXT: popl %edi ; I386-CMOV-NEXT: retl %sel1 = call float @llvm.ct.select.f32(i1 %cond1, float %a, float %b) %sel2 = call float @llvm.ct.select.f32(i1 %cond2, float %sel1, float %c) diff --git a/llvm/test/CodeGen/X86/ctselect-i386.ll b/llvm/test/CodeGen/X86/ctselect-i386.ll index 94101f6a16b23..d7345f1121540 100644 --- a/llvm/test/CodeGen/X86/ctselect-i386.ll +++ b/llvm/test/CodeGen/X86/ctselect-i386.ll @@ -19,7 +19,8 @@ define i32 @test_ctselect_i32_basic(i1 %cond, i32 %a, i32 %b) nounwind { ; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx ; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) ; I386-NOCMOV-NEXT: sete %bl -; I386-NOCMOV-NEXT: movzbl %bl, %esi +; I386-NOCMOV-NEXT: movb %bl, %bh +; I386-NOCMOV-NEXT: movzbl %bh, %esi ; I386-NOCMOV-NEXT: negl %esi ; I386-NOCMOV-NEXT: movl %edx, %eax ; I386-NOCMOV-NEXT: andl %esi, %eax @@ -50,9 +51,10 @@ define i16 @test_ctselect_i16_basic(i1 %cond, i16 %a, i16 %b) nounwind { ; I386-NOCMOV-NEXT: movzwl {{[0-9]+}}(%esp), %edx ; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) ; I386-NOCMOV-NEXT: sete %bl -; I386-NOCMOV-NEXT: movzbl %bl, %esi +; I386-NOCMOV-NEXT: movb %bl, %bh +; I386-NOCMOV-NEXT: movzbw %bh, %si ; I386-NOCMOV-NEXT: negw %si -; I386-NOCMOV-NEXT: movl %edx, %eax +; I386-NOCMOV-NEXT: movw %dx, %ax ; I386-NOCMOV-NEXT: andw %si, %ax ; I386-NOCMOV-NEXT: notw %si ; I386-NOCMOV-NEXT: andw %cx, %si @@ -75,22 +77,17 @@ define i16 @test_ctselect_i16_basic(i1 %cond, i16 %a, i16 %b) nounwind { define i8 @test_ctselect_i8_basic(i1 %cond, i8 %a, i8 %b) nounwind { ; I386-NOCMOV-LABEL: test_ctselect_i8_basic: ; 
I386-NOCMOV: # %bb.0: -; I386-NOCMOV-NEXT: pushl %ebx -; I386-NOCMOV-NEXT: pushl %esi -; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx -; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; I386-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %edx ; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) -; I386-NOCMOV-NEXT: sete %bl -; I386-NOCMOV-NEXT: movzbl %bl, %esi -; I386-NOCMOV-NEXT: negl %esi -; I386-NOCMOV-NEXT: movl %edx, %eax -; I386-NOCMOV-NEXT: andl %esi, %eax -; I386-NOCMOV-NEXT: notl %esi -; I386-NOCMOV-NEXT: andl %ecx, %esi -; I386-NOCMOV-NEXT: orl %esi, %eax -; I386-NOCMOV-NEXT: # kill: def $al killed $al killed $eax -; I386-NOCMOV-NEXT: popl %esi -; I386-NOCMOV-NEXT: popl %ebx +; I386-NOCMOV-NEXT: sete %ah +; I386-NOCMOV-NEXT: movb %ah, %ch +; I386-NOCMOV-NEXT: negb %ch +; I386-NOCMOV-NEXT: movb %dl, %al +; I386-NOCMOV-NEXT: andb %ch, %al +; I386-NOCMOV-NEXT: notb %ch +; I386-NOCMOV-NEXT: andb %cl, %ch +; I386-NOCMOV-NEXT: orb %ch, %al ; I386-NOCMOV-NEXT: retl ; ; I386-CMOV-LABEL: test_ctselect_i8_basic: @@ -116,7 +113,8 @@ define i32 @test_crypto_key_select(i32 %secret_bit, i32 %key1, i32 %key2) nounwi ; I386-NOCMOV-NEXT: setne %al ; I386-NOCMOV-NEXT: testb %al, %al ; I386-NOCMOV-NEXT: sete %bl -; I386-NOCMOV-NEXT: movzbl %bl, %esi +; I386-NOCMOV-NEXT: movb %bl, %bh +; I386-NOCMOV-NEXT: movzbl %bh, %esi ; I386-NOCMOV-NEXT: negl %esi ; I386-NOCMOV-NEXT: movl %edx, %eax ; I386-NOCMOV-NEXT: andl %esi, %eax @@ -152,7 +150,8 @@ define i32 @test_no_conditional_branches(i32 %secret, i32 %val1, i32 %val2) noun ; I386-NOCMOV-NEXT: setne %al ; I386-NOCMOV-NEXT: testb %al, %al ; I386-NOCMOV-NEXT: sete %bl -; I386-NOCMOV-NEXT: movzbl %bl, %esi +; I386-NOCMOV-NEXT: movb %bl, %bh +; I386-NOCMOV-NEXT: movzbl %bh, %esi ; I386-NOCMOV-NEXT: negl %esi ; I386-NOCMOV-NEXT: movl %edx, %eax ; I386-NOCMOV-NEXT: andl %esi, %eax @@ -188,7 +187,8 @@ define i32 @test_ctselect_i32_cmp(i32 %a, i32 %b, i32 %c) nounwind { ; I386-NOCMOV-NEXT: sete %al ; I386-NOCMOV-NEXT: testb %al, %al ; I386-NOCMOV-NEXT: sete %bl -; I386-NOCMOV-NEXT: movzbl %bl, %esi +; I386-NOCMOV-NEXT: movb %bl, %bh +; I386-NOCMOV-NEXT: movzbl %bh, %esi ; I386-NOCMOV-NEXT: negl %esi ; I386-NOCMOV-NEXT: movl %edx, %eax ; I386-NOCMOV-NEXT: andl %esi, %eax @@ -224,7 +224,8 @@ define i32 @test_ctselect_nested(i1 %cond1, i1 %cond2, i32 %a, i32 %b, i32 %c) n ; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx ; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) ; I386-NOCMOV-NEXT: sete %bl -; I386-NOCMOV-NEXT: movzbl %bl, %edi +; I386-NOCMOV-NEXT: movb %bl, %bh +; I386-NOCMOV-NEXT: movzbl %bh, %edi ; I386-NOCMOV-NEXT: negl %edi ; I386-NOCMOV-NEXT: movl %edx, %esi ; I386-NOCMOV-NEXT: andl %edi, %esi @@ -233,7 +234,8 @@ define i32 @test_ctselect_nested(i1 %cond1, i1 %cond2, i32 %a, i32 %b, i32 %c) n ; I386-NOCMOV-NEXT: orl %edi, %esi ; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) ; I386-NOCMOV-NEXT: sete %dl -; I386-NOCMOV-NEXT: movzbl %dl, %edi +; I386-NOCMOV-NEXT: movb %dl, %dh +; I386-NOCMOV-NEXT: movzbl %dh, %edi ; I386-NOCMOV-NEXT: negl %edi ; I386-NOCMOV-NEXT: movl %ecx, %eax ; I386-NOCMOV-NEXT: andl %edi, %eax From def8b7b7d888259d7bc103d2caefec8db41801c6 Mon Sep 17 00:00:00 2001 From: kumarak Date: Thu, 9 Oct 2025 16:12:52 +0000 Subject: [PATCH 60/63] [CT] switch to using class for CTSELECT_I386* and fix formatting --- llvm/lib/Target/X86/X86ISelLowering.cpp | 26 +++++++------ llvm/lib/Target/X86/X86InstrCompiler.td | 50 +++++++++++++------------ llvm/lib/Target/X86/X86InstrInfo.cpp | 1 - 3 files changed, 40 
insertions(+), 37 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index c375676dafdc7..1ebbecc58d9cb 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -37999,8 +37999,10 @@ X86TargetLowering::emitPatchableEventCall(MachineInstr &MI,
 /// This approach ensures that when i64 is type-legalized into two i32
 /// operations, both operations share the same condition byte rather than
 /// each independently reading (and destroying) EFLAGS.
-static MachineBasicBlock *emitCTSelectI386WithConditionMaterialization(
-    MachineInstr &MI, MachineBasicBlock *BB, unsigned InternalPseudoOpcode) {
+static MachineBasicBlock *
+emitCTSelectI386WithConditionMaterialization(MachineInstr &MI,
+                                             MachineBasicBlock *BB,
+                                             unsigned InternalPseudoOpcode) {
   const TargetInstrInfo *TII = BB->getParent()->getSubtarget().getInstrInfo();
   const MIMetadata MIMD(MI);
   MachineFunction *MF = BB->getParent();
@@ -38033,23 +38035,23 @@ static MachineBasicBlock *emitCTSelectI386WithConditionMaterialization(
   Register TmpMaskReg;
 
   // Determine the register class for tmp_mask based on the data type
-  if (InternalPseudoOpcode == X86::CTSELECT_I386_INT_GR8rr)
+  if (InternalPseudoOpcode == X86::CTSELECT_I386_INT_GR8rr) {
     TmpMaskReg = MRI.createVirtualRegister(&X86::GR8RegClass);
-  else if (InternalPseudoOpcode == X86::CTSELECT_I386_INT_GR16rr)
+  } else if (InternalPseudoOpcode == X86::CTSELECT_I386_INT_GR16rr) {
     TmpMaskReg = MRI.createVirtualRegister(&X86::GR16RegClass);
-  else if (InternalPseudoOpcode == X86::CTSELECT_I386_INT_GR32rr)
+  } else if (InternalPseudoOpcode == X86::CTSELECT_I386_INT_GR32rr) {
     TmpMaskReg = MRI.createVirtualRegister(&X86::GR32RegClass);
-  else {
+  } else {
     llvm_unreachable("Unknown internal pseudo opcode");
   }
 
   BuildMI(*BB, MI, MIMD, TII->get(InternalPseudoOpcode))
-      .addDef(DstReg) // dst (output)
-      .addDef(TmpByteReg) // tmp_byte (output)
-      .addDef(TmpMaskReg) // tmp_mask (output)
-      .addReg(Src1Reg) // src1 (input)
-      .addReg(Src2Reg) // src2 (input)
-      .addReg(CondByteReg); // pre-materialized condition byte (input)
+      .addDef(DstReg)       // dst (output)
+      .addDef(TmpByteReg)   // tmp_byte (output)
+      .addDef(TmpMaskReg)   // tmp_mask (output)
+      .addReg(Src1Reg)      // src1 (input)
+      .addReg(Src2Reg)      // src2 (input)
+      .addReg(CondByteReg); // pre-materialized condition byte (input)
 
   MI.eraseFromParent();
   return BB;
diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td b/llvm/lib/Target/X86/X86InstrCompiler.td
index b4d2993be72f4..f4163f55d66ce 100644
--- a/llvm/lib/Target/X86/X86InstrCompiler.td
+++ b/llvm/lib/Target/X86/X86InstrCompiler.td
@@ -702,43 +702,45 @@ def : Pat<(v8f64 (X86cmov VR512:$t, VR512:$f, timm:$cond, EFLAGS)),
 let isPseudo = 1, isNotDuplicable = 1 in {
   // Phase 1: Initial pseudos that consume EFLAGS (via custom inserter)
   // These are matched by patterns and convert EFLAGS to condition byte
-  multiclass CTSELECT_I386_INITIAL<RegisterClass RC, ValueType VT> {
-    let Uses = [EFLAGS], Defs = [EFLAGS], usesCustomInserter = 1, hasNoSchedulingInfo = 1 in {
-      def rr : PseudoI<(outs RC:$dst),
-                       (ins RC:$src1, RC:$src2, i8imm:$cond),
-                       [(set RC:$dst, (VT(X86ctselect RC:$src1, RC:$src2, timm:$cond, EFLAGS)))]>;
-    }
+  class CTSELECT_I386_INITIAL<RegisterClass RC, ValueType VT>
+      : PseudoI<(outs RC:$dst),
+                (ins RC:$src1, RC:$src2, i8imm:$cond),
+                [(set RC:$dst, (VT(X86ctselect RC:$src1, RC:$src2, timm:$cond,
+                                   EFLAGS)))]> {
+    let Uses = [EFLAGS];
+    let Defs = [EFLAGS];
+    let usesCustomInserter = 1;
+    let hasNoSchedulingInfo = 1;
   }
 
   // Phase 2: Internal pseudos with pre-materialized condition byte (post-RA expansion)
   // These generate the actual constant-time instruction bundles
-  multiclass CTSELECT_I386_INTERNAL<RegisterClass RC, RegisterClass ByteRC> {
-    let hasNoSchedulingInfo = 1 in {
-      def rr : PseudoI<(outs RC:$dst, ByteRC:$tmp_byte, RC:$tmp_mask),
-                       (ins RC:$src1, RC:$src2, ByteRC:$cond_byte), []> {
-        let Constraints =
-            "@earlyclobber $dst,@earlyclobber $tmp_byte,@earlyclobber "
-            "$tmp_mask";
-      }
-    }
+  class CTSELECT_I386_INTERNAL<RegisterClass RC, RegisterClass ByteRC>
+      : PseudoI<(outs RC:$dst, ByteRC:$tmp_byte, RC:$tmp_mask),
+                (ins RC:$src1, RC:$src2, ByteRC:$cond_byte), []> {
+    let hasNoSchedulingInfo = 1;
+    let Constraints = "@earlyclobber $dst,@earlyclobber $tmp_byte,@earlyclobber $tmp_mask";
   }
 }
 
 // Phase 1 pseudos for non-CMOV targets (custom inserter materializes condition)
 let isCodeGenOnly = 1, hasSideEffects = 1, ForceDisassemble = 1 in {
   let Predicates = [NoNativeCMOV] in {
-    defm CTSELECT_I386_GR8 : CTSELECT_I386_INITIAL<GR8, i8>;
-    defm CTSELECT_I386_GR16 : CTSELECT_I386_INITIAL<GR16, i16>;
-    defm CTSELECT_I386_GR32 : CTSELECT_I386_INITIAL<GR32, i32>;
+    def CTSELECT_I386_GR8rr : CTSELECT_I386_INITIAL<GR8, i8>;
+    def CTSELECT_I386_GR16rr : CTSELECT_I386_INITIAL<GR16, i16>;
+    def CTSELECT_I386_GR32rr : CTSELECT_I386_INITIAL<GR32, i32>;
   }
 }
 
 // Phase 2 pseudos (post-RA expansion with pre-materialized condition byte)
 let isCodeGenOnly = 1, hasSideEffects = 1, ForceDisassemble = 1 in {
   let Predicates = [NoNativeCMOV] in {
-    defm CTSELECT_I386_INT_GR8 : CTSELECT_I386_INTERNAL<GR8, GR8>;
-    defm CTSELECT_I386_INT_GR16 : CTSELECT_I386_INTERNAL<GR16, GR8>;
-    defm CTSELECT_I386_INT_GR32 : CTSELECT_I386_INTERNAL<GR32, GR8>;
+    def CTSELECT_I386_INT_GR8rr :
+        CTSELECT_I386_INTERNAL<GR8, GR8>;
+    def CTSELECT_I386_INT_GR16rr :
+        CTSELECT_I386_INTERNAL<GR16, GR8>;
+    def CTSELECT_I386_INT_GR32rr :
+        CTSELECT_I386_INTERNAL<GR32, GR8>;
   }
 }
 
@@ -747,12 +749,12 @@
 let hasSideEffects = 1, Constraints = "$dst = $src1" in {
   let Predicates = [FPStackf32] in
-    defm CTSELECT_I386_FP32 : CTSELECT_I386_INITIAL<RFP32, f32>;
+    def CTSELECT_I386_FP32rr : CTSELECT_I386_INITIAL<RFP32, f32>;
 
   let Predicates = [FPStackf64] in
-    defm CTSELECT_I386_FP64 : CTSELECT_I386_INITIAL<RFP64, f64>;
+    def CTSELECT_I386_FP64rr : CTSELECT_I386_INITIAL<RFP64, f64>;
 
-  defm CTSELECT_I386_FP80 : CTSELECT_I386_INITIAL<RFP80, f80>;
+  def CTSELECT_I386_FP80rr : CTSELECT_I386_INITIAL<RFP80, f80>;
 }
 
 // Pattern matching for non-native-CMOV CTSELECT (routes to custom inserter for condition materialization)
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index 119e8f00d2f86..ef270fc49a224 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -1022,7 +1022,6 @@ bool X86InstrInfo::expandCtSelectIntWithoutCMOV(MachineInstr &MI) const {
 
   // Remove the original pseudo instruction
   MI.eraseFromParent();
-
   return true;
 }
 
From 6dcb509bafc75f3316fcb6f8ef572eeee0f1f841 Mon Sep 17 00:00:00 2001
From: Henrik Brodin <90325907+hbrodin@users.noreply.github.com>
Date: Tue, 14 Oct 2025 13:11:38 +0200
Subject: [PATCH 61/63] [CT] Generate more efficient code when fp is from memory

Typical sequence was:
1. Load fp from mem -> fp reg
2. Store fp -> stack
3. Load int from stack

New sequence is instead:
1. Load from mem (as int)

This improves codegen for:
- Global floating point variables
- Global floating point constants
- Stack based floating point variables
---
 llvm/lib/Target/X86/X86ISelLowering.cpp      | 316 +++++++++++++++++--
 llvm/test/CodeGen/X86/ctselect-edge-cases.ll |  82 ++++-
 llvm/test/CodeGen/X86/ctselect-i386-fp.ll    | 242 +++++---
 llvm/test/CodeGen/X86/ctselect.ll            | 249 +++++++++++----
 4 files changed, 622 insertions(+), 267 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 1ebbecc58d9cb..a11ef3833b69b 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -38057,6 +38057,121 @@ emitCTSelectI386WithConditionMaterialization(MachineInstr &MI,
   return BB;
 }
 
+// Helper structure to hold memory operand information for FP loads
+struct FPLoadMemOperands {
+  bool IsValid = false;
+  unsigned BaseReg = 0;
+  int64_t ScaleVal = 1;
+  unsigned IndexReg = 0;
+  int64_t Disp = 0;
+  unsigned SegReg = 0;
+  int FrameIndex = -1;
+  bool IsFrameIndex = false;
+  int ConstantPoolIndex = -1;
+  bool IsConstantPool = false;
+  const GlobalValue *Global = nullptr;
+  int64_t GlobalOffset = 0;
+  bool IsGlobal = false;
+};
+
+// Check if a virtual register is defined by a simple FP load instruction
+// Returns the memory operands if it's a simple load, otherwise returns invalid
+static FPLoadMemOperands getFPLoadMemOperands(Register Reg,
+                                              MachineRegisterInfo &MRI,
+                                              unsigned ExpectedLoadOpcode) {
+  FPLoadMemOperands Result;
+
+  if (!Reg.isVirtual())
+    return Result;
+
+  MachineInstr *DefMI = MRI.getVRegDef(Reg);
+  if (!DefMI)
+    return Result;
+
+  // Check if it's the expected load opcode (e.g., LD_Fp32m, LD_Fp64m, LD_Fp80m)
+  if (DefMI->getOpcode() != ExpectedLoadOpcode)
+    return Result;
+
+  // Check that this is a simple load - not volatile, not atomic, etc.
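+  // (hasOrderedMemoryRef() below is the conservative gate for this: it
+  // returns true for volatile, atomic/ordered, and unanalyzable accesses,
+  // so any load it flags is rejected rather than re-read as an integer.)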
+ // FP loads have hasSideEffects = 0 in their definition for simple loads + if (DefMI->hasOrderedMemoryRef()) + return Result; + + // The load should have a single def (the destination register) and memory operands + // Format: %reg = LD_Fpxxm , 1, %noreg, 0, %noreg + // or: %reg = LD_Fpxxm %base, scale, %index, disp, %segment + if (DefMI->getNumOperands() < 6) + return Result; + + // Operand 0 is the destination, operands 1-5 are the memory reference + MachineOperand &BaseMO = DefMI->getOperand(1); + MachineOperand &ScaleMO = DefMI->getOperand(2); + MachineOperand &IndexMO = DefMI->getOperand(3); + MachineOperand &DispMO = DefMI->getOperand(4); + MachineOperand &SegMO = DefMI->getOperand(5); + + // Check if this is a frame index load + if (BaseMO.isFI()) { + Result.IsValid = true; + Result.IsFrameIndex = true; + Result.FrameIndex = BaseMO.getIndex(); + Result.ScaleVal = ScaleMO.getImm(); + Result.IndexReg = IndexMO.getReg(); + Result.Disp = DispMO.getImm(); + Result.SegReg = SegMO.getReg(); + return Result; + } + + // Check if this is a constant pool load + // Format: %reg = LD_Fpxxm $noreg, 1, $noreg, %const.N, $noreg + if (BaseMO.isReg() && BaseMO.getReg() == X86::NoRegister && + ScaleMO.isImm() && IndexMO.isReg() && + IndexMO.getReg() == X86::NoRegister && + DispMO.isCPI() && SegMO.isReg()) { + Result.IsValid = true; + Result.IsConstantPool = true; + Result.ConstantPoolIndex = DispMO.getIndex(); + Result.ScaleVal = ScaleMO.getImm(); + Result.IndexReg = IndexMO.getReg(); + Result.Disp = 0; + Result.SegReg = SegMO.getReg(); + return Result; + } + + // Check if this is a global variable load + // Format: %reg = LD_Fpxxm $noreg, 1, $noreg, @global_name, $noreg + if (BaseMO.isReg() && BaseMO.getReg() == X86::NoRegister && + ScaleMO.isImm() && IndexMO.isReg() && + IndexMO.getReg() == X86::NoRegister && + DispMO.isGlobal() && SegMO.isReg()) { + Result.IsValid = true; + Result.IsGlobal = true; + Result.Global = DispMO.getGlobal(); + Result.GlobalOffset = DispMO.getOffset(); + Result.ScaleVal = ScaleMO.getImm(); + Result.IndexReg = IndexMO.getReg(); + Result.Disp = 0; + Result.SegReg = SegMO.getReg(); + return Result; + } + + // Regular memory operands (e.g., pointer loads) + if (BaseMO.isReg() && ScaleMO.isImm() && IndexMO.isReg() && + DispMO.isImm() && SegMO.isReg()) { + Result.IsValid = true; + Result.IsFrameIndex = false; + Result.IsConstantPool = false; + Result.BaseReg = BaseMO.getReg(); + Result.ScaleVal = ScaleMO.getImm(); + Result.IndexReg = IndexMO.getReg(); + Result.Disp = DispMO.getImm(); + Result.SegReg = SegMO.getReg(); + return Result; + } + + return Result; +} + static MachineBasicBlock *emitCTSelectI386WithFpType(MachineInstr &MI, MachineBasicBlock *BB, unsigned pseudoInstr) { @@ -38084,6 +38199,85 @@ static MachineBasicBlock *emitCTSelectI386WithFpType(MachineInstr &MI, .addReg(Reg, RegState::Kill); }; + // Helper to load integer from memory operands + auto loadIntFromMemOperands = [&](const FPLoadMemOperands &MemOps, + unsigned Offset) -> unsigned { + unsigned IntReg = MRI.createVirtualRegister(&X86::GR32RegClass); + MachineInstrBuilder MIB = + BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32rm), IntReg); + + if (MemOps.IsFrameIndex) { + // Frame index: addFrameIndex + scale + index + disp + segment + MIB.addFrameIndex(MemOps.FrameIndex) + .addImm(MemOps.ScaleVal) + .addReg(MemOps.IndexReg) + .addImm(MemOps.Disp + Offset) + .addReg(MemOps.SegReg); + } else if (MemOps.IsConstantPool) { + // Constant pool: base_reg + scale + index + CP_index + segment + // MOV32rm format: base, 
scale, index, displacement, segment + MIB.addReg(X86::NoRegister) // Base register + .addImm(MemOps.ScaleVal) // Scale + .addReg(MemOps.IndexReg) // Index register + .addConstantPoolIndex(MemOps.ConstantPoolIndex, Offset) // Displacement (CP index) + .addReg(MemOps.SegReg); // Segment + } else if (MemOps.IsGlobal) { + // Global variable: base_reg + scale + index + global + segment + // MOV32rm format: base, scale, index, displacement, segment + MIB.addReg(X86::NoRegister) // Base register + .addImm(MemOps.ScaleVal) // Scale + .addReg(MemOps.IndexReg) // Index register + .addGlobalAddress(MemOps.Global, MemOps.GlobalOffset + Offset) // Displacement (global address) + .addReg(MemOps.SegReg); // Segment + } else { + // Regular memory: base_reg + scale + index + disp + segment + MIB.addReg(MemOps.BaseReg) + .addImm(MemOps.ScaleVal) + .addReg(MemOps.IndexReg) + .addImm(MemOps.Disp + Offset) + .addReg(MemOps.SegReg); + } + + return IntReg; + }; + + // Optimized path: load integers directly from memory when both operands are + // memory loads, avoiding FP register round-trip + auto emitCtSelectFromMemory = [&](unsigned NumValues, + const FPLoadMemOperands &TrueMemOps, + const FPLoadMemOperands &FalseMemOps, + int ResultSlot) { + for (unsigned Val = 0; Val < NumValues; ++Val) { + unsigned Offset = Val * RegSizeInByte; + + // Load true and false values directly from their memory locations as integers + unsigned TrueIntReg = loadIntFromMemOperands(TrueMemOps, Offset); + unsigned FalseIntReg = loadIntFromMemOperands(FalseMemOps, Offset); + + // Use CTSELECT_I386_INT_GR32 pseudo instruction for constant-time selection + unsigned ResultIntReg = MRI.createVirtualRegister(&X86::GR32RegClass); + unsigned TmpByteReg = MRI.createVirtualRegister(&X86::GR8RegClass); + unsigned TmpMaskReg = MRI.createVirtualRegister(&X86::GR32RegClass); + + BuildMI(*BB, MI, MIMD, TII->get(X86::CTSELECT_I386_INT_GR32rr)) + .addDef(ResultIntReg) // dst (output) + .addDef(TmpByteReg) // tmp_byte (output) + .addDef(TmpMaskReg) // tmp_mask (output) + .addReg(FalseIntReg) // src1 (input) - false value + .addReg(TrueIntReg) // src2 (input) - true value + .addReg(CondByteReg); // pre-materialized condition byte (input) + + // Store result back to result slot + BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32mr)) + .addFrameIndex(ResultSlot) + .addImm(1) + .addReg(0) + .addImm(Offset) + .addReg(0) + .addReg(ResultIntReg, RegState::Kill); + } + }; + auto emitCtSelectWithPseudo = [&](unsigned NumValues, int TrueSlot, int FalseSlot, int ResultSlot) { for (unsigned Val = 0; Val < NumValues; ++Val) { unsigned Offset = Val * RegSizeInByte; @@ -38131,17 +38325,40 @@ static MachineBasicBlock *emitCTSelectI386WithFpType(MachineInstr &MI, switch (pseudoInstr) { case X86::CTSELECT_I386_FP32rr: { - // Allocate stack slots (4 bytes for f32) + // Check if both operands are simple memory loads + FPLoadMemOperands TrueMemOps = + getFPLoadMemOperands(TrueReg, MRI, X86::LD_Fp32m); + FPLoadMemOperands FalseMemOps = + getFPLoadMemOperands(FalseReg, MRI, X86::LD_Fp32m); + int ResultSlot = MFI.CreateStackObject(RegSizeInByte, Align(4), false); - int TrueSlot = MFI.CreateStackObject(RegSizeInByte, Align(4), false); - int FalseSlot = MFI.CreateStackObject(RegSizeInByte, Align(4), false); - // Store f32 values to stack - storeFpToSlot(X86::ST_Fp32m, TrueSlot, TrueReg); - storeFpToSlot(X86::ST_Fp32m, FalseSlot, FalseReg); + if (TrueMemOps.IsValid && FalseMemOps.IsValid) { + // Optimized path: load directly from memory as integers + // Works for both frame index loads 
(stack parameters) and + // constant pool loads (constants) + emitCtSelectFromMemory(1, TrueMemOps, FalseMemOps, ResultSlot); - // Use pseudo instruction for selection (1 x 32-bit value) - emitCtSelectWithPseudo(1, TrueSlot, FalseSlot, ResultSlot); + // Erase the original FP load instructions since we're not using them + // and have loaded the data directly as integers instead + if (MRI.hasOneUse(TrueReg)) { + if (MachineInstr *TrueDefMI = MRI.getVRegDef(TrueReg)) + TrueDefMI->eraseFromParent(); + } + if (MRI.hasOneUse(FalseReg)) { + if (MachineInstr *FalseDefMI = MRI.getVRegDef(FalseReg)) + FalseDefMI->eraseFromParent(); + } + } else { + // General path: spill FP registers to stack first + int TrueSlot = MFI.CreateStackObject(RegSizeInByte, Align(4), false); + int FalseSlot = MFI.CreateStackObject(RegSizeInByte, Align(4), false); + + storeFpToSlot(X86::ST_Fp32m, TrueSlot, TrueReg); + storeFpToSlot(X86::ST_Fp32m, FalseSlot, FalseReg); + + emitCtSelectWithPseudo(1, TrueSlot, FalseSlot, ResultSlot); + } // Load result back as f32 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::LD_Fp32m), DestReg), @@ -38150,17 +38367,42 @@ static MachineBasicBlock *emitCTSelectI386WithFpType(MachineInstr &MI, } case X86::CTSELECT_I386_FP64rr: { unsigned StackSlotSize = 8; - // Allocate stack slots (8 bytes for f64) - int TrueSlot = MFI.CreateStackObject(StackSlotSize, Align(4), false); - int FalseSlot = MFI.CreateStackObject(StackSlotSize, Align(4), false); + + // Check if both operands are simple memory loads + FPLoadMemOperands TrueMemOps = + getFPLoadMemOperands(TrueReg, MRI, X86::LD_Fp64m); + FPLoadMemOperands FalseMemOps = + getFPLoadMemOperands(FalseReg, MRI, X86::LD_Fp64m); + int ResultSlot = MFI.CreateStackObject(StackSlotSize, Align(4), false); - // Store f64 values to stack - storeFpToSlot(X86::ST_Fp64m, TrueSlot, TrueReg); - storeFpToSlot(X86::ST_Fp64m, FalseSlot, FalseReg); + if (TrueMemOps.IsValid && FalseMemOps.IsValid) { + // Optimized path: load directly from memory as integers + // Works for both frame index loads (stack parameters) and + // constant pool loads (constants) + emitCtSelectFromMemory(StackSlotSize / RegSizeInByte, TrueMemOps, + FalseMemOps, ResultSlot); - // Use pseudo instruction for selection (2 x 32-bit values) - emitCtSelectWithPseudo(StackSlotSize/RegSizeInByte, TrueSlot, FalseSlot, ResultSlot); + // Erase the original FP load instructions since we're not using them + if (MRI.hasOneUse(TrueReg)) { + if (MachineInstr *TrueDefMI = MRI.getVRegDef(TrueReg)) + TrueDefMI->eraseFromParent(); + } + if (MRI.hasOneUse(FalseReg)) { + if (MachineInstr *FalseDefMI = MRI.getVRegDef(FalseReg)) + FalseDefMI->eraseFromParent(); + } + } else { + // General path: spill FP registers to stack first + int TrueSlot = MFI.CreateStackObject(StackSlotSize, Align(4), false); + int FalseSlot = MFI.CreateStackObject(StackSlotSize, Align(4), false); + + storeFpToSlot(X86::ST_Fp64m, TrueSlot, TrueReg); + storeFpToSlot(X86::ST_Fp64m, FalseSlot, FalseReg); + + emitCtSelectWithPseudo(StackSlotSize / RegSizeInByte, TrueSlot, FalseSlot, + ResultSlot); + } // Load result back as f64 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::LD_Fp64m), DestReg), @@ -38168,18 +38410,44 @@ static MachineBasicBlock *emitCTSelectI386WithFpType(MachineInstr &MI, break; } case X86::CTSELECT_I386_FP80rr: { - // Allocate stack slots (12 bytes for f80 - 80-bit = 10 bytes, aligned to 12) + // f80 is 80 bits (10 bytes), but stored with 12-byte alignment unsigned StackObjectSize = 12; - int TrueSlot = 
MFI.CreateStackObject(StackObjectSize, Align(4), false); - int FalseSlot = MFI.CreateStackObject(StackObjectSize, Align(4), false); + + // Check if both operands are simple memory loads + FPLoadMemOperands TrueMemOps = + getFPLoadMemOperands(TrueReg, MRI, X86::LD_Fp80m); + FPLoadMemOperands FalseMemOps = + getFPLoadMemOperands(FalseReg, MRI, X86::LD_Fp80m); + int ResultSlot = MFI.CreateStackObject(StackObjectSize, Align(4), false); - // Store f80 values to stack - storeFpToSlot(X86::ST_FpP80m, TrueSlot, TrueReg); - storeFpToSlot(X86::ST_FpP80m, FalseSlot, FalseReg); + if (TrueMemOps.IsValid && FalseMemOps.IsValid) { + // Optimized path: load directly from memory as integers + // Works for both frame index loads (stack parameters) and + // constant pool loads (constants) + emitCtSelectFromMemory(StackObjectSize / RegSizeInByte, TrueMemOps, + FalseMemOps, ResultSlot); - // Use pseudo instruction for selection (3 x 32-bit values) - emitCtSelectWithPseudo(StackObjectSize/RegSizeInByte, TrueSlot, FalseSlot, ResultSlot); + // Erase the original FP load instructions since we're not using them + if (MRI.hasOneUse(TrueReg)) { + if (MachineInstr *TrueDefMI = MRI.getVRegDef(TrueReg)) + TrueDefMI->eraseFromParent(); + } + if (MRI.hasOneUse(FalseReg)) { + if (MachineInstr *FalseDefMI = MRI.getVRegDef(FalseReg)) + FalseDefMI->eraseFromParent(); + } + } else { + // General path: spill FP registers to stack first + int TrueSlot = MFI.CreateStackObject(StackObjectSize, Align(4), false); + int FalseSlot = MFI.CreateStackObject(StackObjectSize, Align(4), false); + + storeFpToSlot(X86::ST_FpP80m, TrueSlot, TrueReg); + storeFpToSlot(X86::ST_FpP80m, FalseSlot, FalseReg); + + emitCtSelectWithPseudo(StackObjectSize / RegSizeInByte, TrueSlot, + FalseSlot, ResultSlot); + } // Load result back as f80 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::LD_Fp80m), DestReg), diff --git a/llvm/test/CodeGen/X86/ctselect-edge-cases.ll b/llvm/test/CodeGen/X86/ctselect-edge-cases.ll index fb6b4706d62d8..c37daa41eeb72 100644 --- a/llvm/test/CodeGen/X86/ctselect-edge-cases.ll +++ b/llvm/test/CodeGen/X86/ctselect-edge-cases.ll @@ -101,15 +101,34 @@ define float @test_ctselect_f32_special_values(i1 %cond) { ; ; X32-LABEL: test_ctselect_f32_special_values: ; X32: # %bb.0: +; X32-NEXT: pushl %edi +; X32-NEXT: .cfi_def_cfa_offset 8 +; X32-NEXT: pushl %esi +; X32-NEXT: .cfi_def_cfa_offset 12 +; X32-NEXT: pushl %eax +; X32-NEXT: .cfi_def_cfa_offset 16 +; X32-NEXT: .cfi_offset %esi, -12 +; X32-NEXT: .cfi_offset %edi, -8 ; X32-NEXT: testb $1, {{[0-9]+}}(%esp) -; X32-NEXT: flds {{\.?LCPI[0-9]+_[0-9]+}} -; X32-NEXT: flds {{\.?LCPI[0-9]+_[0-9]+}} -; X32-NEXT: jne .LBB3_2 -; X32-NEXT: # %bb.1: -; X32-NEXT: fstp %st(1) -; X32-NEXT: fldz -; X32-NEXT: .LBB3_2: -; X32-NEXT: fstp %st(0) +; X32-NEXT: sete %al +; X32-NEXT: movl {{\.?LCPI[0-9]+_[0-9]+}}, %ecx +; X32-NEXT: movl {{\.?LCPI[0-9]+_[0-9]+}}, %edx +; X32-NEXT: movb %al, %ah +; X32-NEXT: movzbl %ah, %edi +; X32-NEXT: negl %edi +; X32-NEXT: movl %edx, %esi +; X32-NEXT: andl %edi, %esi +; X32-NEXT: notl %edi +; X32-NEXT: andl %ecx, %edi +; X32-NEXT: orl %edi, %esi +; X32-NEXT: movl %esi, (%esp) +; X32-NEXT: flds (%esp) +; X32-NEXT: addl $4, %esp +; X32-NEXT: .cfi_def_cfa_offset 12 +; X32-NEXT: popl %esi +; X32-NEXT: .cfi_def_cfa_offset 8 +; X32-NEXT: popl %edi +; X32-NEXT: .cfi_def_cfa_offset 4 ; X32-NEXT: retl %result = call float @llvm.ct.select.f32(i1 %cond, float 0x7FF8000000000000, float 0x7FF0000000000000) ret float %result @@ -127,15 +146,50 @@ define double 
@test_ctselect_f64_special_values(i1 %cond) { ; ; X32-LABEL: test_ctselect_f64_special_values: ; X32: # %bb.0: +; X32-NEXT: pushl %edi +; X32-NEXT: .cfi_def_cfa_offset 8 +; X32-NEXT: pushl %esi +; X32-NEXT: .cfi_def_cfa_offset 12 +; X32-NEXT: subl $24, %esp +; X32-NEXT: .cfi_def_cfa_offset 36 +; X32-NEXT: .cfi_offset %esi, -12 +; X32-NEXT: .cfi_offset %edi, -8 ; X32-NEXT: testb $1, {{[0-9]+}}(%esp) ; X32-NEXT: flds {{\.?LCPI[0-9]+_[0-9]+}} ; X32-NEXT: flds {{\.?LCPI[0-9]+_[0-9]+}} -; X32-NEXT: jne .LBB4_2 -; X32-NEXT: # %bb.1: -; X32-NEXT: fstp %st(1) -; X32-NEXT: fldz -; X32-NEXT: .LBB4_2: -; X32-NEXT: fstp %st(0) +; X32-NEXT: sete %al +; X32-NEXT: fxch %st(1) +; X32-NEXT: fstpl {{[0-9]+}}(%esp) +; X32-NEXT: fstpl (%esp) +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl (%esp), %edx +; X32-NEXT: movb %al, %ah +; X32-NEXT: movzbl %ah, %edi +; X32-NEXT: negl %edi +; X32-NEXT: movl %edx, %esi +; X32-NEXT: andl %edi, %esi +; X32-NEXT: notl %edi +; X32-NEXT: andl %ecx, %edi +; X32-NEXT: orl %edi, %esi +; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NEXT: movb %al, %ah +; X32-NEXT: movzbl %ah, %edi +; X32-NEXT: negl %edi +; X32-NEXT: movl %edx, %esi +; X32-NEXT: andl %edi, %esi +; X32-NEXT: notl %edi +; X32-NEXT: andl %ecx, %edi +; X32-NEXT: orl %edi, %esi +; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X32-NEXT: fldl {{[0-9]+}}(%esp) +; X32-NEXT: addl $24, %esp +; X32-NEXT: .cfi_def_cfa_offset 12 +; X32-NEXT: popl %esi +; X32-NEXT: .cfi_def_cfa_offset 8 +; X32-NEXT: popl %edi +; X32-NEXT: .cfi_def_cfa_offset 4 ; X32-NEXT: retl %result = call double @llvm.ct.select.f64(i1 %cond, double 0x7FF8000000000000, double 0x7FF0000000000000) ret double %result diff --git a/llvm/test/CodeGen/X86/ctselect-i386-fp.ll b/llvm/test/CodeGen/X86/ctselect-i386-fp.ll index b62bb9075f3db..ea943307c644f 100644 --- a/llvm/test/CodeGen/X86/ctselect-i386-fp.ll +++ b/llvm/test/CodeGen/X86/ctselect-i386-fp.ll @@ -15,16 +15,11 @@ define float @test_ctselect_f32_basic(i1 %cond, float %a, float %b) nounwind { ; I386-NOCMOV: # %bb.0: ; I386-NOCMOV-NEXT: pushl %edi ; I386-NOCMOV-NEXT: pushl %esi -; I386-NOCMOV-NEXT: subl $12, %esp -; I386-NOCMOV-NEXT: flds {{[0-9]+}}(%esp) -; I386-NOCMOV-NEXT: flds {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: pushl %eax ; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) ; I386-NOCMOV-NEXT: sete %al -; I386-NOCMOV-NEXT: fxch %st(1) -; I386-NOCMOV-NEXT: fstps {{[0-9]+}}(%esp) -; I386-NOCMOV-NEXT: fstps (%esp) ; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx -; I386-NOCMOV-NEXT: movl (%esp), %edx +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx ; I386-NOCMOV-NEXT: movb %al, %ah ; I386-NOCMOV-NEXT: movzbl %ah, %edi ; I386-NOCMOV-NEXT: negl %edi @@ -33,9 +28,9 @@ define float @test_ctselect_f32_basic(i1 %cond, float %a, float %b) nounwind { ; I386-NOCMOV-NEXT: notl %edi ; I386-NOCMOV-NEXT: andl %ecx, %edi ; I386-NOCMOV-NEXT: orl %edi, %esi -; I386-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp) -; I386-NOCMOV-NEXT: flds {{[0-9]+}}(%esp) -; I386-NOCMOV-NEXT: addl $12, %esp +; I386-NOCMOV-NEXT: movl %esi, (%esp) +; I386-NOCMOV-NEXT: flds (%esp) +; I386-NOCMOV-NEXT: addl $4, %esp ; I386-NOCMOV-NEXT: popl %esi ; I386-NOCMOV-NEXT: popl %edi ; I386-NOCMOV-NEXT: retl @@ -44,16 +39,11 @@ define float @test_ctselect_f32_basic(i1 %cond, float %a, float %b) nounwind { ; I386-CMOV: # %bb.0: ; I386-CMOV-NEXT: pushl %edi ; I386-CMOV-NEXT: pushl %esi -; I386-CMOV-NEXT: subl $12, %esp -; I386-CMOV-NEXT: flds {{[0-9]+}}(%esp) -; I386-CMOV-NEXT: 
flds {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: pushl %eax ; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp) ; I386-CMOV-NEXT: sete %al -; I386-CMOV-NEXT: fxch %st(1) -; I386-CMOV-NEXT: fstps {{[0-9]+}}(%esp) -; I386-CMOV-NEXT: fstps (%esp) ; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx -; I386-CMOV-NEXT: movl (%esp), %edx +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx ; I386-CMOV-NEXT: movb %al, %ah ; I386-CMOV-NEXT: movzbl %ah, %edi ; I386-CMOV-NEXT: negl %edi @@ -62,9 +52,9 @@ define float @test_ctselect_f32_basic(i1 %cond, float %a, float %b) nounwind { ; I386-CMOV-NEXT: notl %edi ; I386-CMOV-NEXT: andl %ecx, %edi ; I386-CMOV-NEXT: orl %edi, %esi -; I386-CMOV-NEXT: movl %esi, {{[0-9]+}}(%esp) -; I386-CMOV-NEXT: flds {{[0-9]+}}(%esp) -; I386-CMOV-NEXT: addl $12, %esp +; I386-CMOV-NEXT: movl %esi, (%esp) +; I386-CMOV-NEXT: flds (%esp) +; I386-CMOV-NEXT: addl $4, %esp ; I386-CMOV-NEXT: popl %esi ; I386-CMOV-NEXT: popl %edi ; I386-CMOV-NEXT: retl @@ -78,9 +68,7 @@ define float @test_ctselect_f32_eq(float %x, float %y, float %a, float %b) nounw ; I386-NOCMOV: # %bb.0: ; I386-NOCMOV-NEXT: pushl %edi ; I386-NOCMOV-NEXT: pushl %esi -; I386-NOCMOV-NEXT: subl $12, %esp -; I386-NOCMOV-NEXT: flds {{[0-9]+}}(%esp) -; I386-NOCMOV-NEXT: flds {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: pushl %eax ; I386-NOCMOV-NEXT: flds {{[0-9]+}}(%esp) ; I386-NOCMOV-NEXT: flds {{[0-9]+}}(%esp) ; I386-NOCMOV-NEXT: fucompp @@ -91,11 +79,8 @@ define float @test_ctselect_f32_eq(float %x, float %y, float %a, float %b) nounw ; I386-NOCMOV-NEXT: sete %cl ; I386-NOCMOV-NEXT: testb %al, %cl ; I386-NOCMOV-NEXT: sete %al -; I386-NOCMOV-NEXT: fxch %st(1) -; I386-NOCMOV-NEXT: fstps {{[0-9]+}}(%esp) -; I386-NOCMOV-NEXT: fstps (%esp) ; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx -; I386-NOCMOV-NEXT: movl (%esp), %edx +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx ; I386-NOCMOV-NEXT: movb %al, %ah ; I386-NOCMOV-NEXT: movzbl %ah, %edi ; I386-NOCMOV-NEXT: negl %edi @@ -104,9 +89,9 @@ define float @test_ctselect_f32_eq(float %x, float %y, float %a, float %b) nounw ; I386-NOCMOV-NEXT: notl %edi ; I386-NOCMOV-NEXT: andl %ecx, %edi ; I386-NOCMOV-NEXT: orl %edi, %esi -; I386-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp) -; I386-NOCMOV-NEXT: flds {{[0-9]+}}(%esp) -; I386-NOCMOV-NEXT: addl $12, %esp +; I386-NOCMOV-NEXT: movl %esi, (%esp) +; I386-NOCMOV-NEXT: flds (%esp) +; I386-NOCMOV-NEXT: addl $4, %esp ; I386-NOCMOV-NEXT: popl %esi ; I386-NOCMOV-NEXT: popl %edi ; I386-NOCMOV-NEXT: retl @@ -115,9 +100,7 @@ define float @test_ctselect_f32_eq(float %x, float %y, float %a, float %b) nounw ; I386-CMOV: # %bb.0: ; I386-CMOV-NEXT: pushl %edi ; I386-CMOV-NEXT: pushl %esi -; I386-CMOV-NEXT: subl $12, %esp -; I386-CMOV-NEXT: flds {{[0-9]+}}(%esp) -; I386-CMOV-NEXT: flds {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: pushl %eax ; I386-CMOV-NEXT: flds {{[0-9]+}}(%esp) ; I386-CMOV-NEXT: flds {{[0-9]+}}(%esp) ; I386-CMOV-NEXT: fucompi %st(1), %st @@ -126,11 +109,8 @@ define float @test_ctselect_f32_eq(float %x, float %y, float %a, float %b) nounw ; I386-CMOV-NEXT: sete %cl ; I386-CMOV-NEXT: testb %al, %cl ; I386-CMOV-NEXT: sete %al -; I386-CMOV-NEXT: fxch %st(1) -; I386-CMOV-NEXT: fstps {{[0-9]+}}(%esp) -; I386-CMOV-NEXT: fstps (%esp) ; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx -; I386-CMOV-NEXT: movl (%esp), %edx +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx ; I386-CMOV-NEXT: movb %al, %ah ; I386-CMOV-NEXT: movzbl %ah, %edi ; I386-CMOV-NEXT: negl %edi @@ -139,9 +119,9 @@ define float @test_ctselect_f32_eq(float %x, float %y, float %a, float %b) nounw ; I386-CMOV-NEXT: notl %edi 
; I386-CMOV-NEXT: andl %ecx, %edi ; I386-CMOV-NEXT: orl %edi, %esi -; I386-CMOV-NEXT: movl %esi, {{[0-9]+}}(%esp) -; I386-CMOV-NEXT: flds {{[0-9]+}}(%esp) -; I386-CMOV-NEXT: addl $12, %esp +; I386-CMOV-NEXT: movl %esi, (%esp) +; I386-CMOV-NEXT: flds (%esp) +; I386-CMOV-NEXT: addl $4, %esp ; I386-CMOV-NEXT: popl %esi ; I386-CMOV-NEXT: popl %edi ; I386-CMOV-NEXT: retl @@ -156,14 +136,9 @@ define double @test_ctselect_f64_basic(i1 %cond, double %a, double %b) nounwind ; I386-NOCMOV: # %bb.0: ; I386-NOCMOV-NEXT: pushl %edi ; I386-NOCMOV-NEXT: pushl %esi -; I386-NOCMOV-NEXT: subl $24, %esp -; I386-NOCMOV-NEXT: fldl {{[0-9]+}}(%esp) -; I386-NOCMOV-NEXT: fldl {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: subl $8, %esp ; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) ; I386-NOCMOV-NEXT: sete %al -; I386-NOCMOV-NEXT: fxch %st(1) -; I386-NOCMOV-NEXT: fstpl {{[0-9]+}}(%esp) -; I386-NOCMOV-NEXT: fstpl {{[0-9]+}}(%esp) ; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx ; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx ; I386-NOCMOV-NEXT: movb %al, %ah @@ -187,7 +162,7 @@ define double @test_ctselect_f64_basic(i1 %cond, double %a, double %b) nounwind ; I386-NOCMOV-NEXT: orl %edi, %esi ; I386-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp) ; I386-NOCMOV-NEXT: fldl (%esp) -; I386-NOCMOV-NEXT: addl $24, %esp +; I386-NOCMOV-NEXT: addl $8, %esp ; I386-NOCMOV-NEXT: popl %esi ; I386-NOCMOV-NEXT: popl %edi ; I386-NOCMOV-NEXT: retl @@ -196,14 +171,9 @@ define double @test_ctselect_f64_basic(i1 %cond, double %a, double %b) nounwind ; I386-CMOV: # %bb.0: ; I386-CMOV-NEXT: pushl %edi ; I386-CMOV-NEXT: pushl %esi -; I386-CMOV-NEXT: subl $24, %esp -; I386-CMOV-NEXT: fldl {{[0-9]+}}(%esp) -; I386-CMOV-NEXT: fldl {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: subl $8, %esp ; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp) ; I386-CMOV-NEXT: sete %al -; I386-CMOV-NEXT: fxch %st(1) -; I386-CMOV-NEXT: fstpl {{[0-9]+}}(%esp) -; I386-CMOV-NEXT: fstpl {{[0-9]+}}(%esp) ; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx ; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx ; I386-CMOV-NEXT: movb %al, %ah @@ -227,7 +197,7 @@ define double @test_ctselect_f64_basic(i1 %cond, double %a, double %b) nounwind ; I386-CMOV-NEXT: orl %edi, %esi ; I386-CMOV-NEXT: movl %esi, {{[0-9]+}}(%esp) ; I386-CMOV-NEXT: fldl (%esp) -; I386-CMOV-NEXT: addl $24, %esp +; I386-CMOV-NEXT: addl $8, %esp ; I386-CMOV-NEXT: popl %esi ; I386-CMOV-NEXT: popl %edi ; I386-CMOV-NEXT: retl @@ -241,14 +211,9 @@ define x86_fp80 @test_ctselect_f80_basic(i1 %cond, x86_fp80 %a, x86_fp80 %b) nou ; I386-NOCMOV: # %bb.0: ; I386-NOCMOV-NEXT: pushl %edi ; I386-NOCMOV-NEXT: pushl %esi -; I386-NOCMOV-NEXT: subl $36, %esp -; I386-NOCMOV-NEXT: fldt {{[0-9]+}}(%esp) -; I386-NOCMOV-NEXT: fldt {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: subl $12, %esp ; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) ; I386-NOCMOV-NEXT: sete %al -; I386-NOCMOV-NEXT: fxch %st(1) -; I386-NOCMOV-NEXT: fstpt {{[0-9]+}}(%esp) -; I386-NOCMOV-NEXT: fstpt {{[0-9]+}}(%esp) ; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx ; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx ; I386-NOCMOV-NEXT: movb %al, %ah @@ -283,7 +248,7 @@ define x86_fp80 @test_ctselect_f80_basic(i1 %cond, x86_fp80 %a, x86_fp80 %b) nou ; I386-NOCMOV-NEXT: orl %edi, %esi ; I386-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp) ; I386-NOCMOV-NEXT: fldt (%esp) -; I386-NOCMOV-NEXT: addl $36, %esp +; I386-NOCMOV-NEXT: addl $12, %esp ; I386-NOCMOV-NEXT: popl %esi ; I386-NOCMOV-NEXT: popl %edi ; I386-NOCMOV-NEXT: retl @@ -292,14 +257,9 @@ define x86_fp80 @test_ctselect_f80_basic(i1 %cond, x86_fp80 %a, x86_fp80 %b) nou 
; I386-CMOV: # %bb.0: ; I386-CMOV-NEXT: pushl %edi ; I386-CMOV-NEXT: pushl %esi -; I386-CMOV-NEXT: subl $36, %esp -; I386-CMOV-NEXT: fldt {{[0-9]+}}(%esp) -; I386-CMOV-NEXT: fldt {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: subl $12, %esp ; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp) ; I386-CMOV-NEXT: sete %al -; I386-CMOV-NEXT: fxch %st(1) -; I386-CMOV-NEXT: fstpt {{[0-9]+}}(%esp) -; I386-CMOV-NEXT: fstpt {{[0-9]+}}(%esp) ; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx ; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx ; I386-CMOV-NEXT: movb %al, %ah @@ -334,7 +294,7 @@ define x86_fp80 @test_ctselect_f80_basic(i1 %cond, x86_fp80 %a, x86_fp80 %b) nou ; I386-CMOV-NEXT: orl %edi, %esi ; I386-CMOV-NEXT: movl %esi, {{[0-9]+}}(%esp) ; I386-CMOV-NEXT: fldt (%esp) -; I386-CMOV-NEXT: addl $36, %esp +; I386-CMOV-NEXT: addl $12, %esp ; I386-CMOV-NEXT: popl %esi ; I386-CMOV-NEXT: popl %edi ; I386-CMOV-NEXT: retl @@ -348,9 +308,7 @@ define float @test_ctselect_f32_gt(float %x, float %y, float %a, float %b) nounw ; I386-NOCMOV: # %bb.0: ; I386-NOCMOV-NEXT: pushl %edi ; I386-NOCMOV-NEXT: pushl %esi -; I386-NOCMOV-NEXT: subl $12, %esp -; I386-NOCMOV-NEXT: flds {{[0-9]+}}(%esp) -; I386-NOCMOV-NEXT: flds {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: pushl %eax ; I386-NOCMOV-NEXT: flds {{[0-9]+}}(%esp) ; I386-NOCMOV-NEXT: flds {{[0-9]+}}(%esp) ; I386-NOCMOV-NEXT: fucompp @@ -360,11 +318,8 @@ define float @test_ctselect_f32_gt(float %x, float %y, float %a, float %b) nounw ; I386-NOCMOV-NEXT: seta %al ; I386-NOCMOV-NEXT: testb %al, %al ; I386-NOCMOV-NEXT: sete %al -; I386-NOCMOV-NEXT: fxch %st(1) -; I386-NOCMOV-NEXT: fstps {{[0-9]+}}(%esp) -; I386-NOCMOV-NEXT: fstps (%esp) ; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx -; I386-NOCMOV-NEXT: movl (%esp), %edx +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx ; I386-NOCMOV-NEXT: movb %al, %ah ; I386-NOCMOV-NEXT: movzbl %ah, %edi ; I386-NOCMOV-NEXT: negl %edi @@ -373,9 +328,9 @@ define float @test_ctselect_f32_gt(float %x, float %y, float %a, float %b) nounw ; I386-NOCMOV-NEXT: notl %edi ; I386-NOCMOV-NEXT: andl %ecx, %edi ; I386-NOCMOV-NEXT: orl %edi, %esi -; I386-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp) -; I386-NOCMOV-NEXT: flds {{[0-9]+}}(%esp) -; I386-NOCMOV-NEXT: addl $12, %esp +; I386-NOCMOV-NEXT: movl %esi, (%esp) +; I386-NOCMOV-NEXT: flds (%esp) +; I386-NOCMOV-NEXT: addl $4, %esp ; I386-NOCMOV-NEXT: popl %esi ; I386-NOCMOV-NEXT: popl %edi ; I386-NOCMOV-NEXT: retl @@ -384,9 +339,7 @@ define float @test_ctselect_f32_gt(float %x, float %y, float %a, float %b) nounw ; I386-CMOV: # %bb.0: ; I386-CMOV-NEXT: pushl %edi ; I386-CMOV-NEXT: pushl %esi -; I386-CMOV-NEXT: subl $12, %esp -; I386-CMOV-NEXT: flds {{[0-9]+}}(%esp) -; I386-CMOV-NEXT: flds {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: pushl %eax ; I386-CMOV-NEXT: flds {{[0-9]+}}(%esp) ; I386-CMOV-NEXT: flds {{[0-9]+}}(%esp) ; I386-CMOV-NEXT: fucompi %st(1), %st @@ -394,11 +347,8 @@ define float @test_ctselect_f32_gt(float %x, float %y, float %a, float %b) nounw ; I386-CMOV-NEXT: seta %al ; I386-CMOV-NEXT: testb %al, %al ; I386-CMOV-NEXT: sete %al -; I386-CMOV-NEXT: fxch %st(1) -; I386-CMOV-NEXT: fstps {{[0-9]+}}(%esp) -; I386-CMOV-NEXT: fstps (%esp) ; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx -; I386-CMOV-NEXT: movl (%esp), %edx +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx ; I386-CMOV-NEXT: movb %al, %ah ; I386-CMOV-NEXT: movzbl %ah, %edi ; I386-CMOV-NEXT: negl %edi @@ -407,9 +357,9 @@ define float @test_ctselect_f32_gt(float %x, float %y, float %a, float %b) nounw ; I386-CMOV-NEXT: notl %edi ; I386-CMOV-NEXT: andl %ecx, %edi ; 
I386-CMOV-NEXT: orl %edi, %esi -; I386-CMOV-NEXT: movl %esi, {{[0-9]+}}(%esp) -; I386-CMOV-NEXT: flds {{[0-9]+}}(%esp) -; I386-CMOV-NEXT: addl $12, %esp +; I386-CMOV-NEXT: movl %esi, (%esp) +; I386-CMOV-NEXT: flds (%esp) +; I386-CMOV-NEXT: addl $4, %esp ; I386-CMOV-NEXT: popl %esi ; I386-CMOV-NEXT: popl %edi ; I386-CMOV-NEXT: retl @@ -424,16 +374,11 @@ define float @test_ctselect_f32_no_branches(i1 %cond, float %a, float %b) nounwi ; I386-NOCMOV: # %bb.0: ; I386-NOCMOV-NEXT: pushl %edi ; I386-NOCMOV-NEXT: pushl %esi -; I386-NOCMOV-NEXT: subl $12, %esp -; I386-NOCMOV-NEXT: flds {{[0-9]+}}(%esp) -; I386-NOCMOV-NEXT: flds {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: pushl %eax ; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) ; I386-NOCMOV-NEXT: sete %al -; I386-NOCMOV-NEXT: fxch %st(1) -; I386-NOCMOV-NEXT: fstps {{[0-9]+}}(%esp) -; I386-NOCMOV-NEXT: fstps (%esp) ; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx -; I386-NOCMOV-NEXT: movl (%esp), %edx +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx ; I386-NOCMOV-NEXT: movb %al, %ah ; I386-NOCMOV-NEXT: movzbl %ah, %edi ; I386-NOCMOV-NEXT: negl %edi @@ -442,9 +387,9 @@ define float @test_ctselect_f32_no_branches(i1 %cond, float %a, float %b) nounwi ; I386-NOCMOV-NEXT: notl %edi ; I386-NOCMOV-NEXT: andl %ecx, %edi ; I386-NOCMOV-NEXT: orl %edi, %esi -; I386-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp) -; I386-NOCMOV-NEXT: flds {{[0-9]+}}(%esp) -; I386-NOCMOV-NEXT: addl $12, %esp +; I386-NOCMOV-NEXT: movl %esi, (%esp) +; I386-NOCMOV-NEXT: flds (%esp) +; I386-NOCMOV-NEXT: addl $4, %esp ; I386-NOCMOV-NEXT: popl %esi ; I386-NOCMOV-NEXT: popl %edi ; I386-NOCMOV-NEXT: retl @@ -453,16 +398,11 @@ define float @test_ctselect_f32_no_branches(i1 %cond, float %a, float %b) nounwi ; I386-CMOV: # %bb.0: ; I386-CMOV-NEXT: pushl %edi ; I386-CMOV-NEXT: pushl %esi -; I386-CMOV-NEXT: subl $12, %esp -; I386-CMOV-NEXT: flds {{[0-9]+}}(%esp) -; I386-CMOV-NEXT: flds {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: pushl %eax ; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp) ; I386-CMOV-NEXT: sete %al -; I386-CMOV-NEXT: fxch %st(1) -; I386-CMOV-NEXT: fstps {{[0-9]+}}(%esp) -; I386-CMOV-NEXT: fstps (%esp) ; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx -; I386-CMOV-NEXT: movl (%esp), %edx +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx ; I386-CMOV-NEXT: movb %al, %ah ; I386-CMOV-NEXT: movzbl %ah, %edi ; I386-CMOV-NEXT: negl %edi @@ -471,9 +411,9 @@ define float @test_ctselect_f32_no_branches(i1 %cond, float %a, float %b) nounwi ; I386-CMOV-NEXT: notl %edi ; I386-CMOV-NEXT: andl %ecx, %edi ; I386-CMOV-NEXT: orl %edi, %esi -; I386-CMOV-NEXT: movl %esi, {{[0-9]+}}(%esp) -; I386-CMOV-NEXT: flds {{[0-9]+}}(%esp) -; I386-CMOV-NEXT: addl $12, %esp +; I386-CMOV-NEXT: movl %esi, (%esp) +; I386-CMOV-NEXT: flds (%esp) +; I386-CMOV-NEXT: addl $4, %esp ; I386-CMOV-NEXT: popl %esi ; I386-CMOV-NEXT: popl %edi ; I386-CMOV-NEXT: retl @@ -487,16 +427,11 @@ define float @test_ctselect_f32_bundled(i1 %cond, float %a, float %b) nounwind { ; I386-NOCMOV: # %bb.0: ; I386-NOCMOV-NEXT: pushl %edi ; I386-NOCMOV-NEXT: pushl %esi -; I386-NOCMOV-NEXT: subl $12, %esp -; I386-NOCMOV-NEXT: flds {{[0-9]+}}(%esp) -; I386-NOCMOV-NEXT: flds {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: pushl %eax ; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) ; I386-NOCMOV-NEXT: sete %al -; I386-NOCMOV-NEXT: fxch %st(1) -; I386-NOCMOV-NEXT: fstps {{[0-9]+}}(%esp) -; I386-NOCMOV-NEXT: fstps (%esp) ; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx -; I386-NOCMOV-NEXT: movl (%esp), %edx +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx ; I386-NOCMOV-NEXT: movb %al, %ah ; 
I386-NOCMOV-NEXT: movzbl %ah, %edi ; I386-NOCMOV-NEXT: negl %edi @@ -505,9 +440,9 @@ define float @test_ctselect_f32_bundled(i1 %cond, float %a, float %b) nounwind { ; I386-NOCMOV-NEXT: notl %edi ; I386-NOCMOV-NEXT: andl %ecx, %edi ; I386-NOCMOV-NEXT: orl %edi, %esi -; I386-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp) -; I386-NOCMOV-NEXT: flds {{[0-9]+}}(%esp) -; I386-NOCMOV-NEXT: addl $12, %esp +; I386-NOCMOV-NEXT: movl %esi, (%esp) +; I386-NOCMOV-NEXT: flds (%esp) +; I386-NOCMOV-NEXT: addl $4, %esp ; I386-NOCMOV-NEXT: popl %esi ; I386-NOCMOV-NEXT: popl %edi ; I386-NOCMOV-NEXT: retl @@ -516,16 +451,11 @@ define float @test_ctselect_f32_bundled(i1 %cond, float %a, float %b) nounwind { ; I386-CMOV: # %bb.0: ; I386-CMOV-NEXT: pushl %edi ; I386-CMOV-NEXT: pushl %esi -; I386-CMOV-NEXT: subl $12, %esp -; I386-CMOV-NEXT: flds {{[0-9]+}}(%esp) -; I386-CMOV-NEXT: flds {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: pushl %eax ; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp) ; I386-CMOV-NEXT: sete %al -; I386-CMOV-NEXT: fxch %st(1) -; I386-CMOV-NEXT: fstps {{[0-9]+}}(%esp) -; I386-CMOV-NEXT: fstps (%esp) ; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx -; I386-CMOV-NEXT: movl (%esp), %edx +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx ; I386-CMOV-NEXT: movb %al, %ah ; I386-CMOV-NEXT: movzbl %ah, %edi ; I386-CMOV-NEXT: negl %edi @@ -534,9 +464,9 @@ define float @test_ctselect_f32_bundled(i1 %cond, float %a, float %b) nounwind { ; I386-CMOV-NEXT: notl %edi ; I386-CMOV-NEXT: andl %ecx, %edi ; I386-CMOV-NEXT: orl %edi, %esi -; I386-CMOV-NEXT: movl %esi, {{[0-9]+}}(%esp) -; I386-CMOV-NEXT: flds {{[0-9]+}}(%esp) -; I386-CMOV-NEXT: addl $12, %esp +; I386-CMOV-NEXT: movl %esi, (%esp) +; I386-CMOV-NEXT: flds (%esp) +; I386-CMOV-NEXT: addl $4, %esp ; I386-CMOV-NEXT: popl %esi ; I386-CMOV-NEXT: popl %edi ; I386-CMOV-NEXT: retl @@ -615,14 +545,9 @@ define x86_fp80 @test_ctselect_f80_alignment(i1 %cond, x86_fp80 %a, x86_fp80 %b) ; I386-NOCMOV: # %bb.0: ; I386-NOCMOV-NEXT: pushl %edi ; I386-NOCMOV-NEXT: pushl %esi -; I386-NOCMOV-NEXT: subl $36, %esp -; I386-NOCMOV-NEXT: fldt {{[0-9]+}}(%esp) -; I386-NOCMOV-NEXT: fldt {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: subl $12, %esp ; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) ; I386-NOCMOV-NEXT: sete %al -; I386-NOCMOV-NEXT: fxch %st(1) -; I386-NOCMOV-NEXT: fstpt {{[0-9]+}}(%esp) -; I386-NOCMOV-NEXT: fstpt {{[0-9]+}}(%esp) ; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx ; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx ; I386-NOCMOV-NEXT: movb %al, %ah @@ -657,7 +582,7 @@ define x86_fp80 @test_ctselect_f80_alignment(i1 %cond, x86_fp80 %a, x86_fp80 %b) ; I386-NOCMOV-NEXT: orl %edi, %esi ; I386-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp) ; I386-NOCMOV-NEXT: fldt (%esp) -; I386-NOCMOV-NEXT: addl $36, %esp +; I386-NOCMOV-NEXT: addl $12, %esp ; I386-NOCMOV-NEXT: popl %esi ; I386-NOCMOV-NEXT: popl %edi ; I386-NOCMOV-NEXT: retl @@ -666,14 +591,9 @@ define x86_fp80 @test_ctselect_f80_alignment(i1 %cond, x86_fp80 %a, x86_fp80 %b) ; I386-CMOV: # %bb.0: ; I386-CMOV-NEXT: pushl %edi ; I386-CMOV-NEXT: pushl %esi -; I386-CMOV-NEXT: subl $36, %esp -; I386-CMOV-NEXT: fldt {{[0-9]+}}(%esp) -; I386-CMOV-NEXT: fldt {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: subl $12, %esp ; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp) ; I386-CMOV-NEXT: sete %al -; I386-CMOV-NEXT: fxch %st(1) -; I386-CMOV-NEXT: fstpt {{[0-9]+}}(%esp) -; I386-CMOV-NEXT: fstpt {{[0-9]+}}(%esp) ; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx ; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx ; I386-CMOV-NEXT: movb %al, %ah @@ -708,7 +628,7 @@ define x86_fp80 
@test_ctselect_f80_alignment(i1 %cond, x86_fp80 %a, x86_fp80 %b) ; I386-CMOV-NEXT: orl %edi, %esi ; I386-CMOV-NEXT: movl %esi, {{[0-9]+}}(%esp) ; I386-CMOV-NEXT: fldt (%esp) -; I386-CMOV-NEXT: addl $36, %esp +; I386-CMOV-NEXT: addl $12, %esp ; I386-CMOV-NEXT: popl %esi ; I386-CMOV-NEXT: popl %edi ; I386-CMOV-NEXT: retl @@ -722,15 +642,9 @@ define float @test_ctselect_f32_multiple(i1 %cond1, i1 %cond2, float %a, float % ; I386-NOCMOV: # %bb.0: ; I386-NOCMOV-NEXT: pushl %edi ; I386-NOCMOV-NEXT: pushl %esi -; I386-NOCMOV-NEXT: subl $24, %esp -; I386-NOCMOV-NEXT: flds {{[0-9]+}}(%esp) -; I386-NOCMOV-NEXT: flds {{[0-9]+}}(%esp) -; I386-NOCMOV-NEXT: flds {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: subl $8, %esp ; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) ; I386-NOCMOV-NEXT: sete %al -; I386-NOCMOV-NEXT: fxch %st(1) -; I386-NOCMOV-NEXT: fstps {{[0-9]+}}(%esp) -; I386-NOCMOV-NEXT: fstps {{[0-9]+}}(%esp) ; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx ; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx ; I386-NOCMOV-NEXT: movb %al, %ah @@ -742,13 +656,10 @@ define float @test_ctselect_f32_multiple(i1 %cond1, i1 %cond2, float %a, float % ; I386-NOCMOV-NEXT: andl %ecx, %edi ; I386-NOCMOV-NEXT: orl %edi, %esi ; I386-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp) -; I386-NOCMOV-NEXT: flds {{[0-9]+}}(%esp) ; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) ; I386-NOCMOV-NEXT: sete %al -; I386-NOCMOV-NEXT: fstps {{[0-9]+}}(%esp) -; I386-NOCMOV-NEXT: fstps (%esp) ; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx -; I386-NOCMOV-NEXT: movl (%esp), %edx +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx ; I386-NOCMOV-NEXT: movb %al, %ah ; I386-NOCMOV-NEXT: movzbl %ah, %edi ; I386-NOCMOV-NEXT: negl %edi @@ -757,9 +668,9 @@ define float @test_ctselect_f32_multiple(i1 %cond1, i1 %cond2, float %a, float % ; I386-NOCMOV-NEXT: notl %edi ; I386-NOCMOV-NEXT: andl %ecx, %edi ; I386-NOCMOV-NEXT: orl %edi, %esi -; I386-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp) -; I386-NOCMOV-NEXT: flds {{[0-9]+}}(%esp) -; I386-NOCMOV-NEXT: addl $24, %esp +; I386-NOCMOV-NEXT: movl %esi, (%esp) +; I386-NOCMOV-NEXT: flds (%esp) +; I386-NOCMOV-NEXT: addl $8, %esp ; I386-NOCMOV-NEXT: popl %esi ; I386-NOCMOV-NEXT: popl %edi ; I386-NOCMOV-NEXT: retl @@ -768,15 +679,9 @@ define float @test_ctselect_f32_multiple(i1 %cond1, i1 %cond2, float %a, float % ; I386-CMOV: # %bb.0: ; I386-CMOV-NEXT: pushl %edi ; I386-CMOV-NEXT: pushl %esi -; I386-CMOV-NEXT: subl $24, %esp -; I386-CMOV-NEXT: flds {{[0-9]+}}(%esp) -; I386-CMOV-NEXT: flds {{[0-9]+}}(%esp) -; I386-CMOV-NEXT: flds {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: subl $8, %esp ; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp) ; I386-CMOV-NEXT: sete %al -; I386-CMOV-NEXT: fxch %st(1) -; I386-CMOV-NEXT: fstps {{[0-9]+}}(%esp) -; I386-CMOV-NEXT: fstps {{[0-9]+}}(%esp) ; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx ; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx ; I386-CMOV-NEXT: movb %al, %ah @@ -788,13 +693,10 @@ define float @test_ctselect_f32_multiple(i1 %cond1, i1 %cond2, float %a, float % ; I386-CMOV-NEXT: andl %ecx, %edi ; I386-CMOV-NEXT: orl %edi, %esi ; I386-CMOV-NEXT: movl %esi, {{[0-9]+}}(%esp) -; I386-CMOV-NEXT: flds {{[0-9]+}}(%esp) ; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp) ; I386-CMOV-NEXT: sete %al -; I386-CMOV-NEXT: fstps {{[0-9]+}}(%esp) -; I386-CMOV-NEXT: fstps (%esp) ; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx -; I386-CMOV-NEXT: movl (%esp), %edx +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx ; I386-CMOV-NEXT: movb %al, %ah ; I386-CMOV-NEXT: movzbl %ah, %edi ; I386-CMOV-NEXT: negl %edi @@ -803,9 +705,9 @@ define 
float @test_ctselect_f32_multiple(i1 %cond1, i1 %cond2, float %a, float % ; I386-CMOV-NEXT: notl %edi ; I386-CMOV-NEXT: andl %ecx, %edi ; I386-CMOV-NEXT: orl %edi, %esi -; I386-CMOV-NEXT: movl %esi, {{[0-9]+}}(%esp) -; I386-CMOV-NEXT: flds {{[0-9]+}}(%esp) -; I386-CMOV-NEXT: addl $24, %esp +; I386-CMOV-NEXT: movl %esi, (%esp) +; I386-CMOV-NEXT: flds (%esp) +; I386-CMOV-NEXT: addl $8, %esp ; I386-CMOV-NEXT: popl %esi ; I386-CMOV-NEXT: popl %edi ; I386-CMOV-NEXT: retl diff --git a/llvm/test/CodeGen/X86/ctselect.ll b/llvm/test/CodeGen/X86/ctselect.ll index 705e5377c75a4..3f6276add0a5c 100644 --- a/llvm/test/CodeGen/X86/ctselect.ll +++ b/llvm/test/CodeGen/X86/ctselect.ll @@ -24,9 +24,6 @@ define i8 @test_ctselect_i8(i1 %cond, i8 %a, i8 %b) { ; ; X32-NOCMOV-LABEL: test_ctselect_i8: ; X32-NOCMOV: # %bb.0: -; X32-NOCMOV-NEXT: pushl %esi -; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 -; X32-NOCMOV-NEXT: .cfi_offset %esi, -8 ; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %edx ; X32-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) @@ -38,8 +35,6 @@ define i8 @test_ctselect_i8(i1 %cond, i8 %a, i8 %b) { ; X32-NOCMOV-NEXT: notb %ch ; X32-NOCMOV-NEXT: andb %cl, %ch ; X32-NOCMOV-NEXT: orb %ch, %al -; X32-NOCMOV-NEXT: popl %esi -; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4 ; X32-NOCMOV-NEXT: retl %result = call i8 @llvm.ct.select.i8(i1 %cond, i8 %a, i8 %b) ret i8 %result @@ -74,13 +69,13 @@ define i16 @test_ctselect_i16(i1 %cond, i16 %a, i16 %b) { ; X32-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) ; X32-NOCMOV-NEXT: sete %bl ; X32-NOCMOV-NEXT: movb %bl, %bh -; X32-NOCMOV-NEXT: movzbw %bh, %esi -; X32-NOCMOV-NEXT: negw %esi +; X32-NOCMOV-NEXT: movzbw %bh, %si +; X32-NOCMOV-NEXT: negw %si ; X32-NOCMOV-NEXT: movw %dx, %ax -; X32-NOCMOV-NEXT: andw %esi, %ax -; X32-NOCMOV-NEXT: notw %esi -; X32-NOCMOV-NEXT: andw %cx, %esi -; X32-NOCMOV-NEXT: orw %esi, %ax +; X32-NOCMOV-NEXT: andw %si, %ax +; X32-NOCMOV-NEXT: notw %si +; X32-NOCMOV-NEXT: andw %cx, %si +; X32-NOCMOV-NEXT: orw %si, %ax ; X32-NOCMOV-NEXT: popl %esi ; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 ; X32-NOCMOV-NEXT: popl %ebx @@ -212,28 +207,66 @@ define float @test_ctselect_f32(i1 %cond, float %a, float %b) { ; ; X32-LABEL: test_ctselect_f32: ; X32: # %bb.0: -; X32-NEXT: flds {{[0-9]+}}(%esp) -; X32-NEXT: flds {{[0-9]+}}(%esp) +; X32-NEXT: pushl %edi +; X32-NEXT: .cfi_def_cfa_offset 8 +; X32-NEXT: pushl %esi +; X32-NEXT: .cfi_def_cfa_offset 12 +; X32-NEXT: pushl %eax +; X32-NEXT: .cfi_def_cfa_offset 16 +; X32-NEXT: .cfi_offset %esi, -12 +; X32-NEXT: .cfi_offset %edi, -8 ; X32-NEXT: testb $1, {{[0-9]+}}(%esp) -; X32-NEXT: jne .LBB4_2 -; X32-NEXT: # %bb.1: -; X32-NEXT: fstp %st(1) -; X32-NEXT: fldz -; X32-NEXT: .LBB4_2: -; X32-NEXT: fstp %st(0) +; X32-NEXT: sete %al +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NEXT: movb %al, %ah +; X32-NEXT: movzbl %ah, %edi +; X32-NEXT: negl %edi +; X32-NEXT: movl %edx, %esi +; X32-NEXT: andl %edi, %esi +; X32-NEXT: notl %edi +; X32-NEXT: andl %ecx, %edi +; X32-NEXT: orl %edi, %esi +; X32-NEXT: movl %esi, (%esp) +; X32-NEXT: flds (%esp) +; X32-NEXT: addl $4, %esp +; X32-NEXT: .cfi_def_cfa_offset 12 +; X32-NEXT: popl %esi +; X32-NEXT: .cfi_def_cfa_offset 8 +; X32-NEXT: popl %edi +; X32-NEXT: .cfi_def_cfa_offset 4 ; X32-NEXT: retl ; ; X32-NOCMOV-LABEL: test_ctselect_f32: ; X32-NOCMOV: # %bb.0: -; X32-NOCMOV-NEXT: flds {{[0-9]+}}(%esp) -; X32-NOCMOV-NEXT: flds {{[0-9]+}}(%esp) +; X32-NOCMOV-NEXT: pushl %edi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; 
X32-NOCMOV-NEXT: pushl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12 +; X32-NOCMOV-NEXT: pushl %eax +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 16 +; X32-NOCMOV-NEXT: .cfi_offset %esi, -12 +; X32-NOCMOV-NEXT: .cfi_offset %edi, -8 ; X32-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) -; X32-NOCMOV-NEXT: jne .LBB4_2 -; X32-NOCMOV-NEXT: # %bb.1: -; X32-NOCMOV-NEXT: fstp %st(1) -; X32-NOCMOV-NEXT: fldz -; X32-NOCMOV-NEXT: .LBB4_2: -; X32-NOCMOV-NEXT: fstp %st(0) +; X32-NOCMOV-NEXT: sete %al +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NOCMOV-NEXT: movb %al, %ah +; X32-NOCMOV-NEXT: movzbl %ah, %edi +; X32-NOCMOV-NEXT: negl %edi +; X32-NOCMOV-NEXT: movl %edx, %esi +; X32-NOCMOV-NEXT: andl %edi, %esi +; X32-NOCMOV-NEXT: notl %edi +; X32-NOCMOV-NEXT: andl %ecx, %edi +; X32-NOCMOV-NEXT: orl %edi, %esi +; X32-NOCMOV-NEXT: movl %esi, (%esp) +; X32-NOCMOV-NEXT: flds (%esp) +; X32-NOCMOV-NEXT: addl $4, %esp +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12 +; X32-NOCMOV-NEXT: popl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: popl %edi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4 ; X32-NOCMOV-NEXT: retl %result = call float @llvm.ct.select.f32(i1 %cond, float %a, float %b) ret float %result @@ -251,28 +284,88 @@ define double @test_ctselect_f64(i1 %cond, double %a, double %b) { ; ; X32-LABEL: test_ctselect_f64: ; X32: # %bb.0: -; X32-NEXT: fldl {{[0-9]+}}(%esp) -; X32-NEXT: fldl {{[0-9]+}}(%esp) +; X32-NEXT: pushl %edi +; X32-NEXT: .cfi_def_cfa_offset 8 +; X32-NEXT: pushl %esi +; X32-NEXT: .cfi_def_cfa_offset 12 +; X32-NEXT: subl $8, %esp +; X32-NEXT: .cfi_def_cfa_offset 20 +; X32-NEXT: .cfi_offset %esi, -12 +; X32-NEXT: .cfi_offset %edi, -8 ; X32-NEXT: testb $1, {{[0-9]+}}(%esp) -; X32-NEXT: jne .LBB5_2 -; X32-NEXT: # %bb.1: -; X32-NEXT: fstp %st(1) -; X32-NEXT: fldz -; X32-NEXT: .LBB5_2: -; X32-NEXT: fstp %st(0) +; X32-NEXT: sete %al +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NEXT: movb %al, %ah +; X32-NEXT: movzbl %ah, %edi +; X32-NEXT: negl %edi +; X32-NEXT: movl %edx, %esi +; X32-NEXT: andl %edi, %esi +; X32-NEXT: notl %edi +; X32-NEXT: andl %ecx, %edi +; X32-NEXT: orl %edi, %esi +; X32-NEXT: movl %esi, (%esp) +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NEXT: movb %al, %ah +; X32-NEXT: movzbl %ah, %edi +; X32-NEXT: negl %edi +; X32-NEXT: movl %edx, %esi +; X32-NEXT: andl %edi, %esi +; X32-NEXT: notl %edi +; X32-NEXT: andl %ecx, %edi +; X32-NEXT: orl %edi, %esi +; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X32-NEXT: fldl (%esp) +; X32-NEXT: addl $8, %esp +; X32-NEXT: .cfi_def_cfa_offset 12 +; X32-NEXT: popl %esi +; X32-NEXT: .cfi_def_cfa_offset 8 +; X32-NEXT: popl %edi +; X32-NEXT: .cfi_def_cfa_offset 4 ; X32-NEXT: retl ; ; X32-NOCMOV-LABEL: test_ctselect_f64: ; X32-NOCMOV: # %bb.0: -; X32-NOCMOV-NEXT: fldl {{[0-9]+}}(%esp) -; X32-NOCMOV-NEXT: fldl {{[0-9]+}}(%esp) +; X32-NOCMOV-NEXT: pushl %edi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: pushl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12 +; X32-NOCMOV-NEXT: subl $8, %esp +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 20 +; X32-NOCMOV-NEXT: .cfi_offset %esi, -12 +; X32-NOCMOV-NEXT: .cfi_offset %edi, -8 ; X32-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) -; X32-NOCMOV-NEXT: jne .LBB5_2 -; X32-NOCMOV-NEXT: # %bb.1: -; X32-NOCMOV-NEXT: fstp %st(1) -; X32-NOCMOV-NEXT: fldz -; X32-NOCMOV-NEXT: .LBB5_2: -; X32-NOCMOV-NEXT: fstp %st(0) +; X32-NOCMOV-NEXT: sete %al +; X32-NOCMOV-NEXT: movl 
{{[0-9]+}}(%esp), %ecx +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NOCMOV-NEXT: movb %al, %ah +; X32-NOCMOV-NEXT: movzbl %ah, %edi +; X32-NOCMOV-NEXT: negl %edi +; X32-NOCMOV-NEXT: movl %edx, %esi +; X32-NOCMOV-NEXT: andl %edi, %esi +; X32-NOCMOV-NEXT: notl %edi +; X32-NOCMOV-NEXT: andl %ecx, %edi +; X32-NOCMOV-NEXT: orl %edi, %esi +; X32-NOCMOV-NEXT: movl %esi, (%esp) +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NOCMOV-NEXT: movb %al, %ah +; X32-NOCMOV-NEXT: movzbl %ah, %edi +; X32-NOCMOV-NEXT: negl %edi +; X32-NOCMOV-NEXT: movl %edx, %esi +; X32-NOCMOV-NEXT: andl %edi, %esi +; X32-NOCMOV-NEXT: notl %edi +; X32-NOCMOV-NEXT: andl %ecx, %edi +; X32-NOCMOV-NEXT: orl %edi, %esi +; X32-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X32-NOCMOV-NEXT: fldl (%esp) +; X32-NOCMOV-NEXT: addl $8, %esp +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12 +; X32-NOCMOV-NEXT: popl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: popl %edi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4 ; X32-NOCMOV-NEXT: retl %result = call double @llvm.ct.select.f64(i1 %cond, double %a, double %b) ret double %result @@ -645,8 +738,14 @@ define float @test_ctselect_fcmp_oeq(float %x, float %y, float %a, float %b) { ; ; X32-LABEL: test_ctselect_fcmp_oeq: ; X32: # %bb.0: -; X32-NEXT: flds {{[0-9]+}}(%esp) -; X32-NEXT: flds {{[0-9]+}}(%esp) +; X32-NEXT: pushl %edi +; X32-NEXT: .cfi_def_cfa_offset 8 +; X32-NEXT: pushl %esi +; X32-NEXT: .cfi_def_cfa_offset 12 +; X32-NEXT: pushl %eax +; X32-NEXT: .cfi_def_cfa_offset 16 +; X32-NEXT: .cfi_offset %esi, -12 +; X32-NEXT: .cfi_offset %edi, -8 ; X32-NEXT: flds {{[0-9]+}}(%esp) ; X32-NEXT: flds {{[0-9]+}}(%esp) ; X32-NEXT: fucompi %st(1), %st @@ -654,18 +753,37 @@ define float @test_ctselect_fcmp_oeq(float %x, float %y, float %a, float %b) { ; X32-NEXT: setnp %al ; X32-NEXT: sete %cl ; X32-NEXT: testb %al, %cl -; X32-NEXT: jne .LBB13_2 -; X32-NEXT: # %bb.1: -; X32-NEXT: fstp %st(1) -; X32-NEXT: fldz -; X32-NEXT: .LBB13_2: -; X32-NEXT: fstp %st(0) +; X32-NEXT: sete %al +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NEXT: movb %al, %ah +; X32-NEXT: movzbl %ah, %edi +; X32-NEXT: negl %edi +; X32-NEXT: movl %edx, %esi +; X32-NEXT: andl %edi, %esi +; X32-NEXT: notl %edi +; X32-NEXT: andl %ecx, %edi +; X32-NEXT: orl %edi, %esi +; X32-NEXT: movl %esi, (%esp) +; X32-NEXT: flds (%esp) +; X32-NEXT: addl $4, %esp +; X32-NEXT: .cfi_def_cfa_offset 12 +; X32-NEXT: popl %esi +; X32-NEXT: .cfi_def_cfa_offset 8 +; X32-NEXT: popl %edi +; X32-NEXT: .cfi_def_cfa_offset 4 ; X32-NEXT: retl ; ; X32-NOCMOV-LABEL: test_ctselect_fcmp_oeq: ; X32-NOCMOV: # %bb.0: -; X32-NOCMOV-NEXT: flds {{[0-9]+}}(%esp) -; X32-NOCMOV-NEXT: flds {{[0-9]+}}(%esp) +; X32-NOCMOV-NEXT: pushl %edi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: pushl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12 +; X32-NOCMOV-NEXT: pushl %eax +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 16 +; X32-NOCMOV-NEXT: .cfi_offset %esi, -12 +; X32-NOCMOV-NEXT: .cfi_offset %edi, -8 ; X32-NOCMOV-NEXT: flds {{[0-9]+}}(%esp) ; X32-NOCMOV-NEXT: flds {{[0-9]+}}(%esp) ; X32-NOCMOV-NEXT: fucompp @@ -675,12 +793,25 @@ define float @test_ctselect_fcmp_oeq(float %x, float %y, float %a, float %b) { ; X32-NOCMOV-NEXT: setnp %al ; X32-NOCMOV-NEXT: sete %cl ; X32-NOCMOV-NEXT: testb %al, %cl -; X32-NOCMOV-NEXT: jne .LBB13_2 -; X32-NOCMOV-NEXT: # %bb.1: -; X32-NOCMOV-NEXT: fstp %st(1) -; X32-NOCMOV-NEXT: fldz -; X32-NOCMOV-NEXT: .LBB13_2: -; X32-NOCMOV-NEXT: 
fstp %st(0) +; X32-NOCMOV-NEXT: sete %al +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NOCMOV-NEXT: movb %al, %ah +; X32-NOCMOV-NEXT: movzbl %ah, %edi +; X32-NOCMOV-NEXT: negl %edi +; X32-NOCMOV-NEXT: movl %edx, %esi +; X32-NOCMOV-NEXT: andl %edi, %esi +; X32-NOCMOV-NEXT: notl %edi +; X32-NOCMOV-NEXT: andl %ecx, %edi +; X32-NOCMOV-NEXT: orl %edi, %esi +; X32-NOCMOV-NEXT: movl %esi, (%esp) +; X32-NOCMOV-NEXT: flds (%esp) +; X32-NOCMOV-NEXT: addl $4, %esp +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12 +; X32-NOCMOV-NEXT: popl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: popl %edi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4 ; X32-NOCMOV-NEXT: retl %cond = fcmp oeq float %x, %y %result = call float @llvm.ct.select.f32(i1 %cond, float %a, float %b) From 110dcc9da0136bf9230c4ef5dad12610b173df76 Mon Sep 17 00:00:00 2001 From: wizardengineer Date: Fri, 17 Oct 2025 21:02:28 -0400 Subject: [PATCH 62/63] [CT] Clean up, for rebase --- clang/lib/Sema/SemaChecking.cpp | 2 +- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 3 - llvm/lib/Target/AArch64/AArch64ISelLowering.h | 1024 +---------------- 3 files changed, 5 insertions(+), 1024 deletions(-) diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp index 12be5426ccd23..d7c283367353c 100644 --- a/clang/lib/Sema/SemaChecking.cpp +++ b/clang/lib/Sema/SemaChecking.cpp @@ -3528,7 +3528,7 @@ Sema::CheckBuiltinFunctionCall(FunctionDecl *FDecl, unsigned BuiltinID, if ((AIsInteger && BIsInteger) || (AIsFloating && BIsFloating)) { // Both are in the same category, allow usual arithmetic conversions ResultTy = UsualArithmeticConversions( - ARes, BRes, TheCall->getBeginLoc(), ACK_Conditional); + ARes, BRes, TheCall->getBeginLoc(), ArithConvKind::Conditional); if (ARes.isInvalid() || BRes.isInvalid() || ResultTy.isNull()) { return Diag(A->getBeginLoc(), diag::err_typecheck_cond_incompatible_operands) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 589f85671eddd..06167fb7c79d6 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -4119,9 +4119,6 @@ SDValue DAGCombiner::visitSUB(SDNode *N) { unsigned BitWidth = VT.getScalarSizeInBits(); SDLoc DL(N); - if (N->getFlags().hasNoCtSelectOpt()) - return SDValue(); - if (SDValue V = foldSubCtlzNot(N, DAG)) return V; diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index e83694d8b0a6b..415360ea57adf 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -23,1028 +23,12 @@ namespace llvm { -class AArch64TargetMachine; -namespace AArch64ISD { - -// For predicated nodes where the result is a vector, the operation is -// controlled by a governing predicate and the inactive lanes are explicitly -// defined with a value, please stick the following naming convention: -// -// _MERGE_OP The result value is a vector with inactive lanes equal -// to source operand OP. -// -// _MERGE_ZERO The result value is a vector with inactive lanes -// actively zeroed. -// -// _MERGE_PASSTHRU The result value is a vector with inactive lanes equal -// to the last source operand which only purpose is being -// a passthru value. 
-// -// For other cases where no explicit action is needed to set the inactive lanes, -// or when the result is not a vector and it is needed or helpful to -// distinguish a node from similar unpredicated nodes, use: -// -// _PRED -// -enum NodeType : unsigned { - FIRST_NUMBER = ISD::BUILTIN_OP_END, - WrapperLarge, // 4-instruction MOVZ/MOVK sequence for 64-bit addresses. - CALL, // Function call. - - // Pseudo for a OBJC call that gets emitted together with a special `mov - // x29, x29` marker instruction. - CALL_RVMARKER, - - CALL_BTI, // Function call followed by a BTI instruction. - - // Function call, authenticating the callee value first: - // AUTH_CALL chain, callee, auth key #, int disc, addr disc, operands. - AUTH_CALL, - // AUTH_TC_RETURN chain, callee, fpdiff, auth key #, int disc, addr disc, - // operands. - AUTH_TC_RETURN, - - // Authenticated variant of CALL_RVMARKER. - AUTH_CALL_RVMARKER, - - COALESCER_BARRIER, - - VG_SAVE, - VG_RESTORE, - - SMSTART, - SMSTOP, - RESTORE_ZA, - RESTORE_ZT, - SAVE_ZT, - - // A call with the callee in x16, i.e. "blr x16". - CALL_ARM64EC_TO_X64, - - // Produces the full sequence of instructions for getting the thread pointer - // offset of a variable into X0, using the TLSDesc model. - TLSDESC_CALLSEQ, - TLSDESC_AUTH_CALLSEQ, - ADRP, // Page address of a TargetGlobalAddress operand. - ADR, // ADR - ADDlow, // Add the low 12 bits of a TargetGlobalAddress operand. - LOADgot, // Load from automatically generated descriptor (e.g. Global - // Offset Table, TLS record). - RET_GLUE, // Return with a glue operand. Operand 0 is the chain operand. - BRCOND, // Conditional branch instruction; "b.cond". - CSEL, - CSINV, // Conditional select invert. - CSNEG, // Conditional select negate. - CSINC, // Conditional select increment. - - // Pointer to the thread's local storage area. Materialised from TPIDR_EL0 on - // ELF. - THREAD_POINTER, - ADC, - SBC, // adc, sbc instructions - - // To avoid stack clash, allocation is performed by block and each block is - // probed. - PROBED_ALLOCA, - - // Predicated instructions where inactive lanes produce undefined results. - ABDS_PRED, - ABDU_PRED, - FADD_PRED, - FDIV_PRED, - FMA_PRED, - FMAX_PRED, - FMAXNM_PRED, - FMIN_PRED, - FMINNM_PRED, - FMUL_PRED, - FSUB_PRED, - HADDS_PRED, - HADDU_PRED, - MUL_PRED, - MULHS_PRED, - MULHU_PRED, - RHADDS_PRED, - RHADDU_PRED, - SDIV_PRED, - SHL_PRED, - SMAX_PRED, - SMIN_PRED, - SRA_PRED, - SRL_PRED, - UDIV_PRED, - UMAX_PRED, - UMIN_PRED, - - // Unpredicated vector instructions - BIC, - - SRAD_MERGE_OP1, - - // Predicated instructions with the result of inactive lanes provided by the - // last operand. - FABS_MERGE_PASSTHRU, - FCEIL_MERGE_PASSTHRU, - FFLOOR_MERGE_PASSTHRU, - FNEARBYINT_MERGE_PASSTHRU, - FNEG_MERGE_PASSTHRU, - FRECPX_MERGE_PASSTHRU, - FRINT_MERGE_PASSTHRU, - FROUND_MERGE_PASSTHRU, - FROUNDEVEN_MERGE_PASSTHRU, - FSQRT_MERGE_PASSTHRU, - FTRUNC_MERGE_PASSTHRU, - FP_ROUND_MERGE_PASSTHRU, - FP_EXTEND_MERGE_PASSTHRU, - UINT_TO_FP_MERGE_PASSTHRU, - SINT_TO_FP_MERGE_PASSTHRU, - FCVTX_MERGE_PASSTHRU, - FCVTZU_MERGE_PASSTHRU, - FCVTZS_MERGE_PASSTHRU, - SIGN_EXTEND_INREG_MERGE_PASSTHRU, - ZERO_EXTEND_INREG_MERGE_PASSTHRU, - ABS_MERGE_PASSTHRU, - NEG_MERGE_PASSTHRU, - - SETCC_MERGE_ZERO, - - // Arithmetic instructions which write flags. - ADDS, - SUBS, - ADCS, - SBCS, - ANDS, - - // Conditional compares. 
Operands: left,right,falsecc,cc,flags - CCMP, - CCMN, - FCCMP, - - // Floating point comparison - FCMP, - - // Scalar-to-vector duplication - DUP, - DUPLANE8, - DUPLANE16, - DUPLANE32, - DUPLANE64, - DUPLANE128, - - // Vector immedate moves - MOVI, - MOVIshift, - MOVIedit, - MOVImsl, - FMOV, - MVNIshift, - MVNImsl, - - // Vector immediate ops - BICi, - ORRi, - - // Vector bitwise select: similar to ISD::VSELECT but not all bits within an - // element must be identical. - BSP, - - // Vector shuffles - ZIP1, - ZIP2, - UZP1, - UZP2, - TRN1, - TRN2, - REV16, - REV32, - REV64, - EXT, - SPLICE, - - // Vector shift by scalar - VSHL, - VLSHR, - VASHR, - - // Vector shift by scalar (again) - SQSHL_I, - UQSHL_I, - SQSHLU_I, - SRSHR_I, - URSHR_I, - URSHR_I_PRED, - - // Vector narrowing shift by immediate (bottom) - RSHRNB_I, - - // Vector shift by constant and insert - VSLI, - VSRI, - - // Vector comparisons - CMEQ, - CMGE, - CMGT, - CMHI, - CMHS, - FCMEQ, - FCMGE, - FCMGT, - - // Vector zero comparisons - CMEQz, - CMGEz, - CMGTz, - CMLEz, - CMLTz, - FCMEQz, - FCMGEz, - FCMGTz, - FCMLEz, - FCMLTz, - - // Round wide FP to narrow FP with inexact results to odd. - FCVTXN, - - // Vector across-lanes addition - // Only the lower result lane is defined. - SADDV, - UADDV, - - // Unsigned sum Long across Vector - UADDLV, - SADDLV, - - // Wide adds - SADDWT, - SADDWB, - UADDWT, - UADDWB, - - // Add Pairwise of two vectors - ADDP, - // Add Long Pairwise - SADDLP, - UADDLP, - - // udot/sdot/usdot instructions - UDOT, - SDOT, - USDOT, - - // Vector across-lanes min/max - // Only the lower result lane is defined. - SMINV, - UMINV, - SMAXV, - UMAXV, - - SADDV_PRED, - UADDV_PRED, - SMAXV_PRED, - UMAXV_PRED, - SMINV_PRED, - UMINV_PRED, - ORV_PRED, - EORV_PRED, - ANDV_PRED, - - // Compare-and-branch - CBZ, - CBNZ, - TBZ, - TBNZ, - - // Tail calls - TC_RETURN, - - // Custom prefetch handling - PREFETCH, - - // {s|u}int to FP within a FP register. - SITOF, - UITOF, - - /// Natural vector cast. ISD::BITCAST is not natural in the big-endian - /// world w.r.t vectors; which causes additional REV instructions to be - /// generated to compensate for the byte-swapping. But sometimes we do - /// need to re-interpret the data in SIMD vector registers in big-endian - /// mode without emitting such REV instructions. - NVCAST, - - MRS, // MRS, also sets the flags via a glue. - - SMULL, - UMULL, - - PMULL, - - // Reciprocal estimates and steps. - FRECPE, - FRECPS, - FRSQRTE, - FRSQRTS, - - SUNPKHI, - SUNPKLO, - UUNPKHI, - UUNPKLO, - - CLASTA_N, - CLASTB_N, - LASTA, - LASTB, - TBL, - - // Floating-point reductions. - FADDA_PRED, - FADDV_PRED, - FMAXV_PRED, - FMAXNMV_PRED, - FMINV_PRED, - FMINNMV_PRED, - - INSR, - PTEST, - PTEST_ANY, - PTRUE, - - CTTZ_ELTS, - - BITREVERSE_MERGE_PASSTHRU, - BSWAP_MERGE_PASSTHRU, - REVH_MERGE_PASSTHRU, - REVW_MERGE_PASSTHRU, - CTLZ_MERGE_PASSTHRU, - CTPOP_MERGE_PASSTHRU, - DUP_MERGE_PASSTHRU, - INDEX_VECTOR, - - // Cast between vectors of the same element type but differ in length. - REINTERPRET_CAST, - - // Nodes to build an LD64B / ST64B 64-bit quantity out of i64, and vice versa - LS64_BUILD, - LS64_EXTRACT, - - LD1_MERGE_ZERO, - LD1S_MERGE_ZERO, - LDNF1_MERGE_ZERO, - LDNF1S_MERGE_ZERO, - LDFF1_MERGE_ZERO, - LDFF1S_MERGE_ZERO, - LD1RQ_MERGE_ZERO, - LD1RO_MERGE_ZERO, - - // Structured loads. - SVE_LD2_MERGE_ZERO, - SVE_LD3_MERGE_ZERO, - SVE_LD4_MERGE_ZERO, - - // Unsigned gather loads. 
- GLD1_MERGE_ZERO, - GLD1_SCALED_MERGE_ZERO, - GLD1_UXTW_MERGE_ZERO, - GLD1_SXTW_MERGE_ZERO, - GLD1_UXTW_SCALED_MERGE_ZERO, - GLD1_SXTW_SCALED_MERGE_ZERO, - GLD1_IMM_MERGE_ZERO, - GLD1Q_MERGE_ZERO, - GLD1Q_INDEX_MERGE_ZERO, - - // Signed gather loads - GLD1S_MERGE_ZERO, - GLD1S_SCALED_MERGE_ZERO, - GLD1S_UXTW_MERGE_ZERO, - GLD1S_SXTW_MERGE_ZERO, - GLD1S_UXTW_SCALED_MERGE_ZERO, - GLD1S_SXTW_SCALED_MERGE_ZERO, - GLD1S_IMM_MERGE_ZERO, - - // Unsigned gather loads. - GLDFF1_MERGE_ZERO, - GLDFF1_SCALED_MERGE_ZERO, - GLDFF1_UXTW_MERGE_ZERO, - GLDFF1_SXTW_MERGE_ZERO, - GLDFF1_UXTW_SCALED_MERGE_ZERO, - GLDFF1_SXTW_SCALED_MERGE_ZERO, - GLDFF1_IMM_MERGE_ZERO, - - // Signed gather loads. - GLDFF1S_MERGE_ZERO, - GLDFF1S_SCALED_MERGE_ZERO, - GLDFF1S_UXTW_MERGE_ZERO, - GLDFF1S_SXTW_MERGE_ZERO, - GLDFF1S_UXTW_SCALED_MERGE_ZERO, - GLDFF1S_SXTW_SCALED_MERGE_ZERO, - GLDFF1S_IMM_MERGE_ZERO, - - // Non-temporal gather loads - GLDNT1_MERGE_ZERO, - GLDNT1_INDEX_MERGE_ZERO, - GLDNT1S_MERGE_ZERO, - - // Contiguous masked store. - ST1_PRED, - - // Scatter store - SST1_PRED, - SST1_SCALED_PRED, - SST1_UXTW_PRED, - SST1_SXTW_PRED, - SST1_UXTW_SCALED_PRED, - SST1_SXTW_SCALED_PRED, - SST1_IMM_PRED, - SST1Q_PRED, - SST1Q_INDEX_PRED, - - // Non-temporal scatter store - SSTNT1_PRED, - SSTNT1_INDEX_PRED, - - // SME - RDSVL, - REVD_MERGE_PASSTHRU, - ALLOCATE_ZA_BUFFER, - INIT_TPIDR2OBJ, - - // Needed for __arm_agnostic("sme_za_state") - GET_SME_SAVE_SIZE, - ALLOC_SME_SAVE_BUFFER, - - // Asserts that a function argument (i32) is zero-extended to i8 by - // the caller - ASSERT_ZEXT_BOOL, - - // 128-bit system register accesses - // lo64, hi64, chain = MRRS(chain, sysregname) - MRRS, - // chain = MSRR(chain, sysregname, lo64, hi64) - MSRR, - - // Strict (exception-raising) floating point comparison - FIRST_STRICTFP_OPCODE, - STRICT_FCMP = FIRST_STRICTFP_OPCODE, - STRICT_FCMPE, - LAST_STRICTFP_OPCODE = STRICT_FCMPE, - - // NEON Load/Store with post-increment base updates - FIRST_MEMORY_OPCODE, - LD2post = FIRST_MEMORY_OPCODE, - LD3post, - LD4post, - ST2post, - ST3post, - ST4post, - LD1x2post, - LD1x3post, - LD1x4post, - ST1x2post, - ST1x3post, - ST1x4post, - LD1DUPpost, - LD2DUPpost, - LD3DUPpost, - LD4DUPpost, - LD1LANEpost, - LD2LANEpost, - LD3LANEpost, - LD4LANEpost, - ST2LANEpost, - ST3LANEpost, - ST4LANEpost, - - STG, - STZG, - ST2G, - STZ2G, - - LDP, - LDIAPP, - LDNP, - STP, - STILP, - STNP, - LAST_MEMORY_OPCODE = STNP, - - // SME ZA loads and stores - SME_ZA_LDR, - SME_ZA_STR, -}; - -} // end namespace AArch64ISD - namespace AArch64ISD { +// Forward declare the enum from the generated file +enum GenNodeType : unsigned; +} // namespace AArch64ISD -// For predicated nodes where the result is a vector, the operation is -// controlled by a governing predicate and the inactive lanes are explicitly -// defined with a value, please stick the following naming convention: -// -// _MERGE_OP The result value is a vector with inactive lanes equal -// to source operand OP. -// -// _MERGE_ZERO The result value is a vector with inactive lanes -// actively zeroed. -// -// _MERGE_PASSTHRU The result value is a vector with inactive lanes equal -// to the last source operand which only purpose is being -// a passthru value. 
-// -// For other cases where no explicit action is needed to set the inactive lanes, -// or when the result is not a vector and it is needed or helpful to -// distinguish a node from similar unpredicated nodes, use: -// -// _PRED -// -enum NodeType : unsigned { - FIRST_NUMBER = ISD::BUILTIN_OP_END, - WrapperLarge, // 4-instruction MOVZ/MOVK sequence for 64-bit addresses. - CALL, // Function call. - - // Pseudo for a OBJC call that gets emitted together with a special `mov - // x29, x29` marker instruction. - CALL_RVMARKER, - - CALL_BTI, // Function call followed by a BTI instruction. - - // Function call, authenticating the callee value first: - // AUTH_CALL chain, callee, auth key #, int disc, addr disc, operands. - AUTH_CALL, - // AUTH_TC_RETURN chain, callee, fpdiff, auth key #, int disc, addr disc, - // operands. - AUTH_TC_RETURN, - - // Authenticated variant of CALL_RVMARKER. - AUTH_CALL_RVMARKER, - - COALESCER_BARRIER, - - VG_SAVE, - VG_RESTORE, - - SMSTART, - SMSTOP, - RESTORE_ZA, - RESTORE_ZT, - SAVE_ZT, - - // A call with the callee in x16, i.e. "blr x16". - CALL_ARM64EC_TO_X64, - - // Produces the full sequence of instructions for getting the thread pointer - // offset of a variable into X0, using the TLSDesc model. - TLSDESC_CALLSEQ, - TLSDESC_AUTH_CALLSEQ, - ADRP, // Page address of a TargetGlobalAddress operand. - ADR, // ADR - ADDlow, // Add the low 12 bits of a TargetGlobalAddress operand. - LOADgot, // Load from automatically generated descriptor (e.g. Global - // Offset Table, TLS record). - RET_GLUE, // Return with a glue operand. Operand 0 is the chain operand. - BRCOND, // Conditional branch instruction; "b.cond". - CSEL, - CSINV, // Conditional select invert. - CSNEG, // Conditional select negate. - CSINC, // Conditional select increment. - - CTSELECT, // AArch64 Constant-time conditional select, implemented with CSEL - - // Pointer to the thread's local storage area. Materialised from TPIDR_EL0 on - // ELF. - THREAD_POINTER, - ADC, - SBC, // adc, sbc instructions - - // To avoid stack clash, allocation is performed by block and each block is - // probed. - PROBED_ALLOCA, - - // Predicated instructions where inactive lanes produce undefined results. - ABDS_PRED, - ABDU_PRED, - FADD_PRED, - FDIV_PRED, - FMA_PRED, - FMAX_PRED, - FMAXNM_PRED, - FMIN_PRED, - FMINNM_PRED, - FMUL_PRED, - FSUB_PRED, - HADDS_PRED, - HADDU_PRED, - MUL_PRED, - MULHS_PRED, - MULHU_PRED, - RHADDS_PRED, - RHADDU_PRED, - SDIV_PRED, - SHL_PRED, - SMAX_PRED, - SMIN_PRED, - SRA_PRED, - SRL_PRED, - UDIV_PRED, - UMAX_PRED, - UMIN_PRED, - - // Unpredicated vector instructions - BIC, - - SRAD_MERGE_OP1, - - // Predicated instructions with the result of inactive lanes provided by the - // last operand. - FABS_MERGE_PASSTHRU, - FCEIL_MERGE_PASSTHRU, - FFLOOR_MERGE_PASSTHRU, - FNEARBYINT_MERGE_PASSTHRU, - FNEG_MERGE_PASSTHRU, - FRECPX_MERGE_PASSTHRU, - FRINT_MERGE_PASSTHRU, - FROUND_MERGE_PASSTHRU, - FROUNDEVEN_MERGE_PASSTHRU, - FSQRT_MERGE_PASSTHRU, - FTRUNC_MERGE_PASSTHRU, - FP_ROUND_MERGE_PASSTHRU, - FP_EXTEND_MERGE_PASSTHRU, - UINT_TO_FP_MERGE_PASSTHRU, - SINT_TO_FP_MERGE_PASSTHRU, - FCVTX_MERGE_PASSTHRU, - FCVTZU_MERGE_PASSTHRU, - FCVTZS_MERGE_PASSTHRU, - SIGN_EXTEND_INREG_MERGE_PASSTHRU, - ZERO_EXTEND_INREG_MERGE_PASSTHRU, - ABS_MERGE_PASSTHRU, - NEG_MERGE_PASSTHRU, - - SETCC_MERGE_ZERO, - - // Arithmetic instructions which write flags. - ADDS, - SUBS, - ADCS, - SBCS, - ANDS, - - // Conditional compares. 
Operands: left,right,falsecc,cc,flags - CCMP, - CCMN, - FCCMP, - - // Floating point comparison - FCMP, - - // Scalar-to-vector duplication - DUP, - DUPLANE8, - DUPLANE16, - DUPLANE32, - DUPLANE64, - DUPLANE128, - - // Vector immedate moves - MOVI, - MOVIshift, - MOVIedit, - MOVImsl, - FMOV, - MVNIshift, - MVNImsl, - - // Vector immediate ops - BICi, - ORRi, - - // Vector bitwise select: similar to ISD::VSELECT but not all bits within an - // element must be identical. - BSP, - - // Vector shuffles - ZIP1, - ZIP2, - UZP1, - UZP2, - TRN1, - TRN2, - REV16, - REV32, - REV64, - EXT, - SPLICE, - - // Vector shift by scalar - VSHL, - VLSHR, - VASHR, - - // Vector shift by scalar (again) - SQSHL_I, - UQSHL_I, - SQSHLU_I, - SRSHR_I, - URSHR_I, - URSHR_I_PRED, - - // Vector narrowing shift by immediate (bottom) - RSHRNB_I, - - // Vector shift by constant and insert - VSLI, - VSRI, - - // Vector comparisons - CMEQ, - CMGE, - CMGT, - CMHI, - CMHS, - FCMEQ, - FCMGE, - FCMGT, - - // Vector zero comparisons - CMEQz, - CMGEz, - CMGTz, - CMLEz, - CMLTz, - FCMEQz, - FCMGEz, - FCMGTz, - FCMLEz, - FCMLTz, - - // Round wide FP to narrow FP with inexact results to odd. - FCVTXN, - - // Vector across-lanes addition - // Only the lower result lane is defined. - SADDV, - UADDV, - - // Unsigned sum Long across Vector - UADDLV, - SADDLV, - - // Wide adds - SADDWT, - SADDWB, - UADDWT, - UADDWB, - - // Add Pairwise of two vectors - ADDP, - // Add Long Pairwise - SADDLP, - UADDLP, - - // udot/sdot/usdot instructions - UDOT, - SDOT, - USDOT, - - // Vector across-lanes min/max - // Only the lower result lane is defined. - SMINV, - UMINV, - SMAXV, - UMAXV, - - SADDV_PRED, - UADDV_PRED, - SMAXV_PRED, - UMAXV_PRED, - SMINV_PRED, - UMINV_PRED, - ORV_PRED, - EORV_PRED, - ANDV_PRED, - - // Compare-and-branch - CBZ, - CBNZ, - TBZ, - TBNZ, - - // Tail calls - TC_RETURN, - - // Custom prefetch handling - PREFETCH, - - // {s|u}int to FP within a FP register. - SITOF, - UITOF, - - /// Natural vector cast. ISD::BITCAST is not natural in the big-endian - /// world w.r.t vectors; which causes additional REV instructions to be - /// generated to compensate for the byte-swapping. But sometimes we do - /// need to re-interpret the data in SIMD vector registers in big-endian - /// mode without emitting such REV instructions. - NVCAST, - - MRS, // MRS, also sets the flags via a glue. - - SMULL, - UMULL, - - PMULL, - - // Reciprocal estimates and steps. - FRECPE, - FRECPS, - FRSQRTE, - FRSQRTS, - - SUNPKHI, - SUNPKLO, - UUNPKHI, - UUNPKLO, - - CLASTA_N, - CLASTB_N, - LASTA, - LASTB, - TBL, - - // Floating-point reductions. - FADDA_PRED, - FADDV_PRED, - FMAXV_PRED, - FMAXNMV_PRED, - FMINV_PRED, - FMINNMV_PRED, - - INSR, - PTEST, - PTEST_ANY, - PTRUE, - - CTTZ_ELTS, - - BITREVERSE_MERGE_PASSTHRU, - BSWAP_MERGE_PASSTHRU, - REVH_MERGE_PASSTHRU, - REVW_MERGE_PASSTHRU, - CTLZ_MERGE_PASSTHRU, - CTPOP_MERGE_PASSTHRU, - DUP_MERGE_PASSTHRU, - INDEX_VECTOR, - - // Cast between vectors of the same element type but differ in length. - REINTERPRET_CAST, - - // Nodes to build an LD64B / ST64B 64-bit quantity out of i64, and vice versa - LS64_BUILD, - LS64_EXTRACT, - - LD1_MERGE_ZERO, - LD1S_MERGE_ZERO, - LDNF1_MERGE_ZERO, - LDNF1S_MERGE_ZERO, - LDFF1_MERGE_ZERO, - LDFF1S_MERGE_ZERO, - LD1RQ_MERGE_ZERO, - LD1RO_MERGE_ZERO, - - // Structured loads. - SVE_LD2_MERGE_ZERO, - SVE_LD3_MERGE_ZERO, - SVE_LD4_MERGE_ZERO, - - // Unsigned gather loads. 
- GLD1_MERGE_ZERO, - GLD1_SCALED_MERGE_ZERO, - GLD1_UXTW_MERGE_ZERO, - GLD1_SXTW_MERGE_ZERO, - GLD1_UXTW_SCALED_MERGE_ZERO, - GLD1_SXTW_SCALED_MERGE_ZERO, - GLD1_IMM_MERGE_ZERO, - GLD1Q_MERGE_ZERO, - GLD1Q_INDEX_MERGE_ZERO, - - // Signed gather loads - GLD1S_MERGE_ZERO, - GLD1S_SCALED_MERGE_ZERO, - GLD1S_UXTW_MERGE_ZERO, - GLD1S_SXTW_MERGE_ZERO, - GLD1S_UXTW_SCALED_MERGE_ZERO, - GLD1S_SXTW_SCALED_MERGE_ZERO, - GLD1S_IMM_MERGE_ZERO, - - // Unsigned gather loads. - GLDFF1_MERGE_ZERO, - GLDFF1_SCALED_MERGE_ZERO, - GLDFF1_UXTW_MERGE_ZERO, - GLDFF1_SXTW_MERGE_ZERO, - GLDFF1_UXTW_SCALED_MERGE_ZERO, - GLDFF1_SXTW_SCALED_MERGE_ZERO, - GLDFF1_IMM_MERGE_ZERO, - - // Signed gather loads. - GLDFF1S_MERGE_ZERO, - GLDFF1S_SCALED_MERGE_ZERO, - GLDFF1S_UXTW_MERGE_ZERO, - GLDFF1S_SXTW_MERGE_ZERO, - GLDFF1S_UXTW_SCALED_MERGE_ZERO, - GLDFF1S_SXTW_SCALED_MERGE_ZERO, - GLDFF1S_IMM_MERGE_ZERO, - - // Non-temporal gather loads - GLDNT1_MERGE_ZERO, - GLDNT1_INDEX_MERGE_ZERO, - GLDNT1S_MERGE_ZERO, - - // Contiguous masked store. - ST1_PRED, - - // Scatter store - SST1_PRED, - SST1_SCALED_PRED, - SST1_UXTW_PRED, - SST1_SXTW_PRED, - SST1_UXTW_SCALED_PRED, - SST1_SXTW_SCALED_PRED, - SST1_IMM_PRED, - SST1Q_PRED, - SST1Q_INDEX_PRED, - - // Non-temporal scatter store - SSTNT1_PRED, - SSTNT1_INDEX_PRED, - - // SME - RDSVL, - REVD_MERGE_PASSTHRU, - ALLOCATE_ZA_BUFFER, - INIT_TPIDR2OBJ, - - // Needed for __arm_agnostic("sme_za_state") - GET_SME_SAVE_SIZE, - ALLOC_SME_SAVE_BUFFER, - - // Asserts that a function argument (i32) is zero-extended to i8 by - // the caller - ASSERT_ZEXT_BOOL, - - // 128-bit system register accesses - // lo64, hi64, chain = MRRS(chain, sysregname) - MRRS, - // chain = MSRR(chain, sysregname, lo64, hi64) - MSRR, - - // Strict (exception-raising) floating point comparison - FIRST_STRICTFP_OPCODE, - STRICT_FCMP = FIRST_STRICTFP_OPCODE, - STRICT_FCMPE, - LAST_STRICTFP_OPCODE = STRICT_FCMPE, - - // NEON Load/Store with post-increment base updates - FIRST_MEMORY_OPCODE, - LD2post = FIRST_MEMORY_OPCODE, - LD3post, - LD4post, - ST2post, - ST3post, - ST4post, - LD1x2post, - LD1x3post, - LD1x4post, - ST1x2post, - ST1x3post, - ST1x4post, - LD1DUPpost, - LD2DUPpost, - LD3DUPpost, - LD4DUPpost, - LD1LANEpost, - LD2LANEpost, - LD3LANEpost, - LD4LANEpost, - ST2LANEpost, - ST3LANEpost, - ST4LANEpost, - - STG, - STZG, - ST2G, - STZ2G, - - LDP, - LDIAPP, - LDNP, - STP, - STILP, - STNP, - LAST_MEMORY_OPCODE = STNP, - - // SME ZA loads and stores - SME_ZA_LDR, - SME_ZA_STR, -}; - -} // end namespace AArch64ISD +class AArch64TargetMachine; namespace AArch64 { /// Possible values of current rounding mode, which is specified in bits From 2ae33e95d945ef0e38e3d53b5529fd4183701a47 Mon Sep 17 00:00:00 2001 From: wizardengineer Date: Fri, 17 Oct 2025 22:35:58 -0400 Subject: [PATCH 63/63] [CT] X86 Change tests to work with LLVM 21 --- llvm/test/CodeGen/X86/ctselect-edge-cases.ll | 10 +- llvm/test/CodeGen/X86/ctselect-vector.ll | 448 +++++++++---------- 2 files changed, 231 insertions(+), 227 deletions(-) diff --git a/llvm/test/CodeGen/X86/ctselect-edge-cases.ll b/llvm/test/CodeGen/X86/ctselect-edge-cases.ll index c37daa41eeb72..0797265972a1f 100644 --- a/llvm/test/CodeGen/X86/ctselect-edge-cases.ll +++ b/llvm/test/CodeGen/X86/ctselect-edge-cases.ll @@ -21,6 +21,8 @@ define i128 @test_ctselect_i128(i1 %cond, i128 %a, i128 %b) { ; X32-NEXT: .cfi_def_cfa_offset 8 ; X32-NEXT: pushl %esi ; X32-NEXT: .cfi_def_cfa_offset 12 +; X32-NEXT: pushl %eax +; X32-NEXT: .cfi_def_cfa_offset 16 ; X32-NEXT: .cfi_offset %esi, -12 ; X32-NEXT: 
 ; X32-NEXT: .cfi_offset %edi, -8
 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
@@ -31,12 +33,14 @@ define i128 @test_ctselect_i128(i1 %cond, i128 %a, i128 %b) {
 ; X32-NEXT: testb $1, {{[0-9]+}}(%esp)
 ; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %esi
 ; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %edi
-; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %ecx
 ; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movl %edx, 12(%eax)
-; X32-NEXT: movl %ecx, 8(%eax)
+; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl %ecx, 12(%eax)
+; X32-NEXT: movl %edx, 8(%eax)
 ; X32-NEXT: movl %edi, 4(%eax)
 ; X32-NEXT: movl %esi, (%eax)
+; X32-NEXT: addl $4, %esp
+; X32-NEXT: .cfi_def_cfa_offset 12
 ; X32-NEXT: popl %esi
 ; X32-NEXT: .cfi_def_cfa_offset 8
 ; X32-NEXT: popl %edi
diff --git a/llvm/test/CodeGen/X86/ctselect-vector.ll b/llvm/test/CodeGen/X86/ctselect-vector.ll
index 0e53a8324e5ce..2206e32cd6d34 100644
--- a/llvm/test/CodeGen/X86/ctselect-vector.ll
+++ b/llvm/test/CodeGen/X86/ctselect-vector.ll
@@ -14,14 +14,14 @@ define <4 x i32> @test_ctselect_v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b) {
 ; SSE2-NEXT: setne %al
 ; SSE2-NEXT: movzbl %al, %eax
 ; SSE2-NEXT: negl %eax
-; SSE2-NEXT: xorps %xmm3, %xmm3
+; SSE2-NEXT: pxor %xmm3, %xmm3
 ; SSE2-NEXT: movd %eax, %xmm3
 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
 ; SSE2-NEXT: movdqa %xmm3, %xmm2
 ; SSE2-NEXT: pand %xmm0, %xmm3
 ; SSE2-NEXT: pandn %xmm1, %xmm2
 ; SSE2-NEXT: por %xmm3, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm0
+; SSE2-NEXT: movaps %xmm2, %xmm0
 ; SSE2-NEXT: retq
 ;
 ; AVX-LABEL: test_ctselect_v4i32:
@@ -31,14 +31,14 @@ define <4 x i32> @test_ctselect_v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b) {
 ; AVX-NEXT: setne %al
 ; AVX-NEXT: movzbl %al, %eax
 ; AVX-NEXT: negl %eax
-; AVX-NEXT: xorps %xmm3, %xmm3
+; AVX-NEXT: pxor %xmm3, %xmm3
 ; AVX-NEXT: movd %eax, %xmm3
 ; AVX-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
 ; AVX-NEXT: movdqa %xmm3, %xmm2
 ; AVX-NEXT: pand %xmm0, %xmm3
 ; AVX-NEXT: pandn %xmm1, %xmm2
 ; AVX-NEXT: por %xmm3, %xmm2
-; AVX-NEXT: vmovdqa %xmm2, %xmm0
+; AVX-NEXT: vmovaps %xmm2, %xmm0
 ; AVX-NEXT: retq
 ;
 ; AVX2-LABEL: test_ctselect_v4i32:
@@ -48,14 +48,14 @@ define <4 x i32> @test_ctselect_v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b) {
 ; AVX2-NEXT: setne %al
 ; AVX2-NEXT: movzbl %al, %eax
 ; AVX2-NEXT: negl %eax
-; AVX2-NEXT: xorps %xmm3, %xmm3
+; AVX2-NEXT: pxor %xmm3, %xmm3
 ; AVX2-NEXT: movd %eax, %xmm3
 ; AVX2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
 ; AVX2-NEXT: movdqa %xmm3, %xmm2
 ; AVX2-NEXT: pand %xmm0, %xmm3
 ; AVX2-NEXT: pandn %xmm1, %xmm2
 ; AVX2-NEXT: por %xmm3, %xmm2
-; AVX2-NEXT: vmovdqa %xmm2, %xmm0
+; AVX2-NEXT: vmovaps %xmm2, %xmm0
 ; AVX2-NEXT: retq
 ; AVX512-LABEL: test_ctselect_v4i32:
 ; AVX512: # %bb.0:
@@ -78,14 +78,14 @@ define <4 x float> @test_ctselect_v4f32(i1 %cond, <4 x float> %a, <4 x float> %b
 ; SSE2-NEXT: setne %al
 ; SSE2-NEXT: movzbl %al, %eax
 ; SSE2-NEXT: negl %eax
-; SSE2-NEXT: xorps %xmm3, %xmm3
+; SSE2-NEXT: pxor %xmm3, %xmm3
 ; SSE2-NEXT: movd %eax, %xmm3
 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
-; SSE2-NEXT: movdqa %xmm3, %xmm2
+; SSE2-NEXT: movaps %xmm3, %xmm2
 ; SSE2-NEXT: pand %xmm0, %xmm3
 ; SSE2-NEXT: pandn %xmm1, %xmm2
 ; SSE2-NEXT: por %xmm3, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm0
+; SSE2-NEXT: movaps %xmm2, %xmm0
 ; SSE2-NEXT: retq
 ;
 ; AVX-LABEL: test_ctselect_v4f32:
@@ -95,14 +95,14 @@ define <4 x float> @test_ctselect_v4f32(i1 %cond, <4 x float> %a, <4 x float> %b
 ; AVX-NEXT: setne %al
 ; AVX-NEXT: movzbl %al, %eax
 ; AVX-NEXT: negl %eax
-; AVX-NEXT: xorps %xmm3, %xmm3
+; AVX-NEXT: pxor %xmm3, %xmm3
 ; AVX-NEXT: movd %eax, %xmm3
 ; AVX-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
 ; AVX-NEXT: movdqa %xmm3, %xmm2
 ; AVX-NEXT: pand %xmm0, %xmm3
 ; AVX-NEXT: pandn %xmm1, %xmm2
 ; AVX-NEXT: por %xmm3, %xmm2
-; AVX-NEXT: vmovdqa %xmm2, %xmm0
+; AVX-NEXT: vmovaps %xmm2, %xmm0
 ; AVX-NEXT: retq
 ;
 ; AVX2-LABEL: test_ctselect_v4f32:
@@ -112,14 +112,14 @@ define <4 x float> @test_ctselect_v4f32(i1 %cond, <4 x float> %a, <4 x float> %b
 ; AVX2-NEXT: setne %al
 ; AVX2-NEXT: movzbl %al, %eax
 ; AVX2-NEXT: negl %eax
-; AVX2-NEXT: xorps %xmm3, %xmm3
+; AVX2-NEXT: pxor %xmm3, %xmm3
 ; AVX2-NEXT: movd %eax, %xmm3
 ; AVX2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
 ; AVX2-NEXT: movdqa %xmm3, %xmm2
 ; AVX2-NEXT: pand %xmm0, %xmm3
 ; AVX2-NEXT: pandn %xmm1, %xmm2
 ; AVX2-NEXT: por %xmm3, %xmm2
-; AVX2-NEXT: vmovdqa %xmm2, %xmm0
+; AVX2-NEXT: vmovaps %xmm2, %xmm0
 ; AVX2-NEXT: retq
 ; AVX512-LABEL: test_ctselect_v4f32:
 ; AVX512: # %bb.0:
@@ -142,14 +142,14 @@ define <2 x i64> @test_ctselect_v2i64(i1 %cond, <2 x i64> %a, <2 x i64> %b) {
 ; SSE2-NEXT: setne %al
 ; SSE2-NEXT: movzbl %al, %eax
 ; SSE2-NEXT: negl %eax
-; SSE2-NEXT: xorps %xmm3, %xmm3
+; SSE2-NEXT: pxor %xmm3, %xmm3
 ; SSE2-NEXT: movd %eax, %xmm3
 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
 ; SSE2-NEXT: movdqa %xmm3, %xmm2
 ; SSE2-NEXT: pand %xmm0, %xmm3
 ; SSE2-NEXT: pandn %xmm1, %xmm2
 ; SSE2-NEXT: por %xmm3, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm0
+; SSE2-NEXT: movaps %xmm2, %xmm0
 ; SSE2-NEXT: retq
 ;
 ; AVX-LABEL: test_ctselect_v2i64:
@@ -159,14 +159,14 @@ define <2 x i64> @test_ctselect_v2i64(i1 %cond, <2 x i64> %a, <2 x i64> %b) {
 ; AVX-NEXT: setne %al
 ; AVX-NEXT: movzbl %al, %eax
 ; AVX-NEXT: negl %eax
-; AVX-NEXT: xorps %xmm3, %xmm3
+; AVX-NEXT: pxor %xmm3, %xmm3
 ; AVX-NEXT: movd %eax, %xmm3
 ; AVX-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
 ; AVX-NEXT: movdqa %xmm3, %xmm2
 ; AVX-NEXT: pand %xmm0, %xmm3
 ; AVX-NEXT: pandn %xmm1, %xmm2
 ; AVX-NEXT: por %xmm3, %xmm2
-; AVX-NEXT: vmovdqa %xmm2, %xmm0
+; AVX-NEXT: vmovaps %xmm2, %xmm0
 ; AVX-NEXT: retq
 ;
 ; AVX2-LABEL: test_ctselect_v2i64:
@@ -176,14 +176,14 @@ define <2 x i64> @test_ctselect_v2i64(i1 %cond, <2 x i64> %a, <2 x i64> %b) {
 ; AVX2-NEXT: setne %al
 ; AVX2-NEXT: movzbl %al, %eax
 ; AVX2-NEXT: negl %eax
-; AVX2-NEXT: xorps %xmm3, %xmm3
+; AVX2-NEXT: pxor %xmm3, %xmm3
 ; AVX2-NEXT: movd %eax, %xmm3
 ; AVX2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
 ; AVX2-NEXT: movdqa %xmm3, %xmm2
 ; AVX2-NEXT: pand %xmm0, %xmm3
 ; AVX2-NEXT: pandn %xmm1, %xmm2
 ; AVX2-NEXT: por %xmm3, %xmm2
-; AVX2-NEXT: vmovdqa %xmm2, %xmm0
+; AVX2-NEXT: vmovaps %xmm2, %xmm0
 ; AVX2-NEXT: retq
 ; AVX512-LABEL: test_ctselect_v2i64:
 ; AVX512: # %bb.0:
@@ -206,14 +206,14 @@ define <2 x double> @test_ctselect_v2f64(i1 %cond, <2 x double> %a, <2 x double>
 ; SSE2-NEXT: setne %al
 ; SSE2-NEXT: movzbl %al, %eax
 ; SSE2-NEXT: negl %eax
-; SSE2-NEXT: xorps %xmm3, %xmm3
+; SSE2-NEXT: pxor %xmm3, %xmm3
 ; SSE2-NEXT: movd %eax, %xmm3
 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
-; SSE2-NEXT: movdqa %xmm3, %xmm2
+; SSE2-NEXT: movapd %xmm3, %xmm2
 ; SSE2-NEXT: pand %xmm0, %xmm3
 ; SSE2-NEXT: pandn %xmm1, %xmm2
 ; SSE2-NEXT: por %xmm3, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm0
+; SSE2-NEXT: movaps %xmm2, %xmm0
 ; SSE2-NEXT: retq
 ;
 ; AVX-LABEL: test_ctselect_v2f64:
@@ -223,14 +223,14 @@ define <2 x double> @test_ctselect_v2f64(i1 %cond, <2 x double> %a, <2 x double>
 ; AVX-NEXT: setne %al
 ; AVX-NEXT: movzbl %al, %eax
 ; AVX-NEXT: negl %eax
-; AVX-NEXT: xorps %xmm3, %xmm3
+; AVX-NEXT: pxor %xmm3, %xmm3
 ; AVX-NEXT: movd %eax, %xmm3
 ; AVX-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
 ; AVX-NEXT: movdqa %xmm3, %xmm2
 ; AVX-NEXT: pand %xmm0, %xmm3
 ; AVX-NEXT: pandn %xmm1, %xmm2
 ; AVX-NEXT: por %xmm3, %xmm2
-; AVX-NEXT: vmovdqa %xmm2, %xmm0
+; AVX-NEXT: vmovaps %xmm2, %xmm0
 ; AVX-NEXT: retq
 ;
 ; AVX2-LABEL: test_ctselect_v2f64:
@@ -240,14 +240,14 @@ define <2 x double> @test_ctselect_v2f64(i1 %cond, <2 x double> %a, <2 x double>
 ; AVX2-NEXT: setne %al
 ; AVX2-NEXT: movzbl %al, %eax
 ; AVX2-NEXT: negl %eax
-; AVX2-NEXT: xorps %xmm3, %xmm3
+; AVX2-NEXT: pxor %xmm3, %xmm3
 ; AVX2-NEXT: movd %eax, %xmm3
 ; AVX2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
 ; AVX2-NEXT: movdqa %xmm3, %xmm2
 ; AVX2-NEXT: pand %xmm0, %xmm3
 ; AVX2-NEXT: pandn %xmm1, %xmm2
 ; AVX2-NEXT: por %xmm3, %xmm2
-; AVX2-NEXT: vmovdqa %xmm2, %xmm0
+; AVX2-NEXT: vmovaps %xmm2, %xmm0
 ; AVX2-NEXT: retq
 ; AVX512-LABEL: test_ctselect_v2f64:
 ; AVX512: # %bb.0:
@@ -271,7 +271,7 @@ define <8 x i32> @test_ctselect_v8i32(i1 %cond, <8 x i32> %a, <8 x i32> %b) {
 ; SSE2-NEXT: setne %al
 ; SSE2-NEXT: movzbl %al, %eax
 ; SSE2-NEXT: negl %eax
-; SSE2-NEXT: xorps %xmm5, %xmm5
+; SSE2-NEXT: pxor %xmm5, %xmm5
 ; SSE2-NEXT: movd %eax, %xmm5
 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,0,0]
 ; SSE2-NEXT: movdqa %xmm5, %xmm4
@@ -289,8 +289,8 @@ define <8 x i32> @test_ctselect_v8i32(i1 %cond, <8 x i32> %a, <8 x i32> %b) {
 ; SSE2-NEXT: pand %xmm1, %xmm0
 ; SSE2-NEXT: pandn %xmm3, %xmm2
 ; SSE2-NEXT: por %xmm0, %xmm2
-; SSE2-NEXT: movdqa %xmm4, %xmm0
-; SSE2-NEXT: movdqa %xmm2, %xmm1
+; SSE2-NEXT: movaps %xmm4, %xmm0
+; SSE2-NEXT: movaps %xmm2, %xmm1
 ; SSE2-NEXT: retq
 ;
 ; AVX-LABEL: test_ctselect_v8i32:
@@ -300,13 +300,13 @@ define <8 x i32> @test_ctselect_v8i32(i1 %cond, <8 x i32> %a, <8 x i32> %b) {
 ; AVX-NEXT: setne %al
 ; AVX-NEXT: movzbl %al, %eax
 ; AVX-NEXT: negl %eax
-; AVX-NEXT: xorps %ymm3, %ymm3
+; AVX-NEXT: pxor %ymm3, %ymm3
 ; AVX-NEXT: vmovd %eax, %ymm3
-; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,0,0,0,4,4,4,4]
-; AVX-NEXT: vmovaps %ymm3, %ymm2
-; AVX-NEXT: andps %ymm0, %ymm3
-; AVX-NEXT: andnps %ymm1, %ymm2
-; AVX-NEXT: orps %ymm3, %ymm2
+; AVX-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[0,0,0,0,4,4,4,4]
+; AVX-NEXT: vmovdqa %ymm3, %ymm2
+; AVX-NEXT: pand %ymm0, %ymm3
+; AVX-NEXT: pandn %ymm1, %ymm2
+; AVX-NEXT: por %ymm3, %ymm2
 ; AVX-NEXT: vmovaps %ymm2, %ymm0
 ; AVX-NEXT: retq
 ;
@@ -317,14 +317,14 @@ define <8 x i32> @test_ctselect_v8i32(i1 %cond, <8 x i32> %a, <8 x i32> %b) {
 ; AVX2-NEXT: setne %al
 ; AVX2-NEXT: movzbl %al, %eax
 ; AVX2-NEXT: negl %eax
-; AVX2-NEXT: xorps %ymm3, %ymm3
+; AVX2-NEXT: pxor %ymm3, %ymm3
 ; AVX2-NEXT: vmovd %eax, %ymm3
-; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,0,0,4,4,4,4]
+; AVX2-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[0,0,0,0,4,4,4,4]
 ; AVX2-NEXT: vmovdqa %ymm3, %ymm2
 ; AVX2-NEXT: pand %ymm0, %ymm3
 ; AVX2-NEXT: pandn %ymm1, %ymm2
 ; AVX2-NEXT: por %ymm3, %ymm2
-; AVX2-NEXT: vmovdqa %ymm2, %ymm0
+; AVX2-NEXT: vmovaps %ymm2, %ymm0
 ; AVX2-NEXT: retq
 ; AVX512-LABEL: test_ctselect_v8i32:
 ; AVX512: # %bb.0:
@@ -347,10 +347,10 @@ define <8 x float> @test_ctselect_v8f32(i1 %cond, <8 x float> %a, <8 x float> %b
 ; SSE2-NEXT: setne %al
 ; SSE2-NEXT: movzbl %al, %eax
 ; SSE2-NEXT: negl %eax
-; SSE2-NEXT: xorps %xmm5, %xmm5
+; SSE2-NEXT: pxor %xmm5, %xmm5
 ; SSE2-NEXT: movd %eax, %xmm5
 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,0,0]
-; SSE2-NEXT: movdqa %xmm5, %xmm4
+; SSE2-NEXT: movaps %xmm5, %xmm4
 ; SSE2-NEXT: pand %xmm0, %xmm5
 ; SSE2-NEXT: pandn %xmm2, %xmm4
 ; SSE2-NEXT: por %xmm5, %xmm4
@@ -361,12 +361,12 @@ define <8 x float> @test_ctselect_v8f32(i1 %cond, <8 x float> %a, <8 x float> %b
 ; SSE2-NEXT: pxor %xmm0, %xmm0
 ; SSE2-NEXT: movd %eax, %xmm0
 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
-; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: movaps %xmm0, %xmm2
 ; SSE2-NEXT: pand %xmm1, %xmm0
 ; SSE2-NEXT: pandn %xmm3, %xmm2
 ; SSE2-NEXT: por %xmm0, %xmm2
-; SSE2-NEXT: movdqa %xmm4, %xmm0
-; SSE2-NEXT: movdqa %xmm2, %xmm1
+; SSE2-NEXT: movaps %xmm4, %xmm0
+; SSE2-NEXT: movaps %xmm2, %xmm1
 ; SSE2-NEXT: retq
 ;
 ; AVX-LABEL: test_ctselect_v8f32:
@@ -376,13 +376,13 @@ define <8 x float> @test_ctselect_v8f32(i1 %cond, <8 x float> %a, <8 x float> %b
 ; AVX-NEXT: setne %al
 ; AVX-NEXT: movzbl %al, %eax
 ; AVX-NEXT: negl %eax
-; AVX-NEXT: xorps %ymm3, %ymm3
+; AVX-NEXT: pxor %ymm3, %ymm3
 ; AVX-NEXT: vmovd %eax, %ymm3
-; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,0,0,0,4,4,4,4]
-; AVX-NEXT: vmovaps %ymm3, %ymm2
-; AVX-NEXT: andps %ymm0, %ymm3
-; AVX-NEXT: andnps %ymm1, %ymm2
-; AVX-NEXT: orps %ymm3, %ymm2
+; AVX-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[0,0,0,0,4,4,4,4]
+; AVX-NEXT: vmovdqa %ymm3, %ymm2
+; AVX-NEXT: pand %ymm0, %ymm3
+; AVX-NEXT: pandn %ymm1, %ymm2
+; AVX-NEXT: por %ymm3, %ymm2
 ; AVX-NEXT: vmovaps %ymm2, %ymm0
 ; AVX-NEXT: retq
 ;
@@ -393,14 +393,14 @@ define <8 x float> @test_ctselect_v8f32(i1 %cond, <8 x float> %a, <8 x float> %b
 ; AVX2-NEXT: setne %al
 ; AVX2-NEXT: movzbl %al, %eax
 ; AVX2-NEXT: negl %eax
-; AVX2-NEXT: xorps %ymm3, %ymm3
+; AVX2-NEXT: pxor %ymm3, %ymm3
 ; AVX2-NEXT: vmovd %eax, %ymm3
-; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,0,0,4,4,4,4]
+; AVX2-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[0,0,0,0,4,4,4,4]
 ; AVX2-NEXT: vmovdqa %ymm3, %ymm2
 ; AVX2-NEXT: pand %ymm0, %ymm3
 ; AVX2-NEXT: pandn %ymm1, %ymm2
 ; AVX2-NEXT: por %ymm3, %ymm2
-; AVX2-NEXT: vmovdqa %ymm2, %ymm0
+; AVX2-NEXT: vmovaps %ymm2, %ymm0
 ; AVX2-NEXT: retq
 ; AVX512-LABEL: test_ctselect_v8f32:
 ; AVX512: # %bb.0:
@@ -423,7 +423,7 @@ define <4 x i64> @test_ctselect_v4i64(i1 %cond, <4 x i64> %a, <4 x i64> %b) {
 ; SSE2-NEXT: setne %al
 ; SSE2-NEXT: movzbl %al, %eax
 ; SSE2-NEXT: negl %eax
-; SSE2-NEXT: xorps %xmm5, %xmm5
+; SSE2-NEXT: pxor %xmm5, %xmm5
 ; SSE2-NEXT: movd %eax, %xmm5
 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,0,0]
 ; SSE2-NEXT: movdqa %xmm5, %xmm4
@@ -441,8 +441,8 @@ define <4 x i64> @test_ctselect_v4i64(i1 %cond, <4 x i64> %a, <4 x i64> %b) {
 ; SSE2-NEXT: pand %xmm1, %xmm0
 ; SSE2-NEXT: pandn %xmm3, %xmm2
 ; SSE2-NEXT: por %xmm0, %xmm2
-; SSE2-NEXT: movdqa %xmm4, %xmm0
-; SSE2-NEXT: movdqa %xmm2, %xmm1
+; SSE2-NEXT: movaps %xmm4, %xmm0
+; SSE2-NEXT: movaps %xmm2, %xmm1
 ; SSE2-NEXT: retq
 ;
 ; AVX-LABEL: test_ctselect_v4i64:
@@ -452,14 +452,14 @@ define <4 x i64> @test_ctselect_v4i64(i1 %cond, <4 x i64> %a, <4 x i64> %b) {
 ; AVX-NEXT: setne %al
 ; AVX-NEXT: movzbl %al, %eax
 ; AVX-NEXT: negl %eax
-; AVX-NEXT: xorps %ymm3, %ymm3
+; AVX-NEXT: pxor %ymm3, %ymm3
 ; AVX-NEXT: vmovd %eax, %ymm3
-; AVX-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0,0,2,2]
-; AVX-NEXT: vmovapd %ymm3, %ymm2
-; AVX-NEXT: andpd %ymm0, %ymm3
-; AVX-NEXT: andnpd %ymm1, %ymm2
-; AVX-NEXT: orpd %ymm3, %ymm2
-; AVX-NEXT: vmovapd %ymm2, %ymm0
+; AVX-NEXT: vpermilpd {{.*#+}} ymm3 = ymm3[0,0,2,2]
+; AVX-NEXT: vmovdqa %ymm3, %ymm2
+; AVX-NEXT: pand %ymm0, %ymm3
+; AVX-NEXT: pandn %ymm1, %ymm2
+; AVX-NEXT: por %ymm3, %ymm2
+; AVX-NEXT: vmovaps %ymm2, %ymm0
 ; AVX-NEXT: retq
 ;
 ; AVX2-LABEL: test_ctselect_v4i64:
@@ -469,14 +469,14 @@ define <4 x i64> @test_ctselect_v4i64(i1 %cond, <4 x i64> %a, <4 x i64> %b) {
 ; AVX2-NEXT: setne %al
 ; AVX2-NEXT: movzbl %al, %eax
 ; AVX2-NEXT: negl %eax
-; AVX2-NEXT: xorps %ymm3, %ymm3
+; AVX2-NEXT: pxor %ymm3, %ymm3
 ; AVX2-NEXT: vmovd %eax, %ymm3
-; AVX2-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0,0,2,2]
-; AVX2-NEXT: vmovapd %ymm3, %ymm2
-; AVX2-NEXT: andpd %ymm0, %ymm3
-; AVX2-NEXT: andnpd %ymm1, %ymm2
-; AVX2-NEXT: orpd %ymm3, %ymm2
-; AVX2-NEXT: vmovapd %ymm2, %ymm0
+; AVX2-NEXT: vpermilpd {{.*#+}} ymm3 = ymm3[0,0,2,2]
+; AVX2-NEXT: vmovdqa %ymm3, %ymm2
+; AVX2-NEXT: pand %ymm0, %ymm3
+; AVX2-NEXT: pandn %ymm1, %ymm2
+; AVX2-NEXT: por %ymm3, %ymm2
+; AVX2-NEXT: vmovaps %ymm2, %ymm0
 ; AVX2-NEXT: retq
 ; AVX512-LABEL: test_ctselect_v4i64:
 ; AVX512: # %bb.0:
@@ -499,10 +499,10 @@ define <4 x double> @test_ctselect_v4f64(i1 %cond, <4 x double> %a, <4 x double>
 ; SSE2-NEXT: setne %al
 ; SSE2-NEXT: movzbl %al, %eax
 ; SSE2-NEXT: negl %eax
-; SSE2-NEXT: xorps %xmm5, %xmm5
+; SSE2-NEXT: pxor %xmm5, %xmm5
 ; SSE2-NEXT: movd %eax, %xmm5
 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,0,0]
-; SSE2-NEXT: movdqa %xmm5, %xmm4
+; SSE2-NEXT: movapd %xmm5, %xmm4
 ; SSE2-NEXT: pand %xmm0, %xmm5
 ; SSE2-NEXT: pandn %xmm2, %xmm4
 ; SSE2-NEXT: por %xmm5, %xmm4
@@ -513,12 +513,12 @@ define <4 x double> @test_ctselect_v4f64(i1 %cond, <4 x double> %a, <4 x double>
 ; SSE2-NEXT: pxor %xmm0, %xmm0
 ; SSE2-NEXT: movd %eax, %xmm0
 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
-; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: movapd %xmm0, %xmm2
 ; SSE2-NEXT: pand %xmm1, %xmm0
 ; SSE2-NEXT: pandn %xmm3, %xmm2
 ; SSE2-NEXT: por %xmm0, %xmm2
-; SSE2-NEXT: movdqa %xmm4, %xmm0
-; SSE2-NEXT: movdqa %xmm2, %xmm1
+; SSE2-NEXT: movaps %xmm4, %xmm0
+; SSE2-NEXT: movaps %xmm2, %xmm1
 ; SSE2-NEXT: retq
 ;
 ; AVX-LABEL: test_ctselect_v4f64:
@@ -528,14 +528,14 @@ define <4 x double> @test_ctselect_v4f64(i1 %cond, <4 x double> %a, <4 x double>
 ; AVX-NEXT: setne %al
 ; AVX-NEXT: movzbl %al, %eax
 ; AVX-NEXT: negl %eax
-; AVX-NEXT: xorps %ymm3, %ymm3
+; AVX-NEXT: pxor %ymm3, %ymm3
 ; AVX-NEXT: vmovd %eax, %ymm3
-; AVX-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0,0,2,2]
-; AVX-NEXT: vmovapd %ymm3, %ymm2
-; AVX-NEXT: andpd %ymm0, %ymm3
-; AVX-NEXT: andnpd %ymm1, %ymm2
-; AVX-NEXT: orpd %ymm3, %ymm2
-; AVX-NEXT: vmovapd %ymm2, %ymm0
+; AVX-NEXT: vpermilpd {{.*#+}} ymm3 = ymm3[0,0,2,2]
+; AVX-NEXT: vmovdqa %ymm3, %ymm2
+; AVX-NEXT: pand %ymm0, %ymm3
+; AVX-NEXT: pandn %ymm1, %ymm2
+; AVX-NEXT: por %ymm3, %ymm2
+; AVX-NEXT: vmovaps %ymm2, %ymm0
 ; AVX-NEXT: retq
 ;
 ; AVX2-LABEL: test_ctselect_v4f64:
@@ -545,14 +545,14 @@ define <4 x double> @test_ctselect_v4f64(i1 %cond, <4 x double> %a, <4 x double>
 ; AVX2-NEXT: setne %al
 ; AVX2-NEXT: movzbl %al, %eax
 ; AVX2-NEXT: negl %eax
-; AVX2-NEXT: xorps %ymm3, %ymm3
+; AVX2-NEXT: pxor %ymm3, %ymm3
 ; AVX2-NEXT: vmovd %eax, %ymm3
-; AVX2-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0,0,2,2]
-; AVX2-NEXT: vmovapd %ymm3, %ymm2
-; AVX2-NEXT: andpd %ymm0, %ymm3
-; AVX2-NEXT: andnpd %ymm1, %ymm2
-; AVX2-NEXT: orpd %ymm3, %ymm2
-; AVX2-NEXT: vmovapd %ymm2, %ymm0
+; AVX2-NEXT: vpermilpd {{.*#+}} ymm3 = ymm3[0,0,2,2]
+; AVX2-NEXT: vmovdqa %ymm3, %ymm2
+; AVX2-NEXT: pand %ymm0, %ymm3
+; AVX2-NEXT: pandn %ymm1, %ymm2
+; AVX2-NEXT: por %ymm3, %ymm2
+; AVX2-NEXT: vmovaps %ymm2, %ymm0
 ; AVX2-NEXT: retq
 ; AVX512-LABEL: test_ctselect_v4f64:
 ; AVX512: # %bb.0:
@@ -576,7 +576,7 @@ define <16 x i32> @test_ctselect_v16i32(i1 %cond, <16 x i32> %a, <16 x i32> %b)
 ; SSE2-NEXT: setne %al
 ; SSE2-NEXT: movzbl %al, %eax
 ; SSE2-NEXT: negl %eax
-; SSE2-NEXT: xorps %xmm9, %xmm9
+; SSE2-NEXT: pxor %xmm9, %xmm9
 ; SSE2-NEXT: movd %eax, %xmm9
 ; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,0,0,0]
 ; SSE2-NEXT: movdqa %xmm9, %xmm8
@@ -616,10 +616,10 @@ define <16 x i32> @test_ctselect_v16i32(i1 %cond, <16 x i32> %a, <16 x i32> %b)
 ; SSE2-NEXT: pand %xmm3, %xmm0
 ; SSE2-NEXT: pandn %xmm7, %xmm6
 ; SSE2-NEXT: por %xmm0, %xmm6
-; SSE2-NEXT: movdqa %xmm8, %xmm0
-; SSE2-NEXT: movdqa %xmm4, %xmm1
-; SSE2-NEXT: movdqa %xmm5, %xmm2
-; SSE2-NEXT: movdqa %xmm6, %xmm3
+; SSE2-NEXT: movaps %xmm8, %xmm0
+; SSE2-NEXT: movaps %xmm4, %xmm1
+; SSE2-NEXT: movaps %xmm5, %xmm2
+; SSE2-NEXT: movaps %xmm6, %xmm3
 ; SSE2-NEXT: retq
 ;
 ; AVX-LABEL: test_ctselect_v16i32:
@@ -629,24 +629,24 @@ define <16 x i32> @test_ctselect_v16i32(i1 %cond, <16 x i32> %a, <16 x i32> %b)
 ; AVX-NEXT: setne %al
 ; AVX-NEXT: movzbl %al, %eax
 ; AVX-NEXT: negl %eax
-; AVX-NEXT: xorps %ymm5, %ymm5
+; AVX-NEXT: pxor %ymm5, %ymm5
 ; AVX-NEXT: vmovd %eax, %ymm5
-; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,0,0,0,4,4,4,4]
-; AVX-NEXT: vmovaps %ymm5, %ymm4
-; AVX-NEXT: andps %ymm0, %ymm5
-; AVX-NEXT: andnps %ymm2, %ymm4
-; AVX-NEXT: orps %ymm5, %ymm4
+; AVX-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[0,0,0,0,4,4,4,4]
+; AVX-NEXT: vmovdqa %ymm5, %ymm4
+; AVX-NEXT: pand %ymm0, %ymm5
+; AVX-NEXT: pandn %ymm2, %ymm4
+; AVX-NEXT: por %ymm5, %ymm4
 ; AVX-NEXT: movl $0, %eax
 ; AVX-NEXT: setne %al
 ; AVX-NEXT: movzbl %al, %eax
 ; AVX-NEXT: negl %eax
-; AVX-NEXT: xorps %ymm0, %ymm0
+; AVX-NEXT: pxor %ymm0, %ymm0
 ; AVX-NEXT: vmovd %eax, %ymm0
-; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
-; AVX-NEXT: vmovaps %ymm0, %ymm2
-; AVX-NEXT: andps %ymm1, %ymm0
-; AVX-NEXT: andnps %ymm3, %ymm2
-; AVX-NEXT: orps %ymm0, %ymm2
+; AVX-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
+; AVX-NEXT: vmovdqa %ymm0, %ymm2
+; AVX-NEXT: pand %ymm1, %ymm0
+; AVX-NEXT: pandn %ymm3, %ymm2
+; AVX-NEXT: por %ymm0, %ymm2
 ; AVX-NEXT: vmovaps %ymm4, %ymm0
 ; AVX-NEXT: vmovaps %ymm2, %ymm1
 ; AVX-NEXT: retq
@@ -658,9 +658,9 @@ define <16 x i32> @test_ctselect_v16i32(i1 %cond, <16 x i32> %a, <16 x i32> %b)
 ; AVX2-NEXT: setne %al
 ; AVX2-NEXT: movzbl %al, %eax
 ; AVX2-NEXT: negl %eax
-; AVX2-NEXT: xorps %ymm5, %ymm5
+; AVX2-NEXT: pxor %ymm5, %ymm5
 ; AVX2-NEXT: vmovd %eax, %ymm5
-; AVX2-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,0,0,0,4,4,4,4]
+; AVX2-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[0,0,0,0,4,4,4,4]
 ; AVX2-NEXT: vmovdqa %ymm5, %ymm4
 ; AVX2-NEXT: pand %ymm0, %ymm5
 ; AVX2-NEXT: pandn %ymm2, %ymm4
@@ -671,13 +671,13 @@ define <16 x i32> @test_ctselect_v16i32(i1 %cond, <16 x i32> %a, <16 x i32> %b)
 ; AVX2-NEXT: negl %eax
 ; AVX2-NEXT: pxor %ymm0, %ymm0
 ; AVX2-NEXT: vmovd %eax, %ymm0
-; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
+; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
 ; AVX2-NEXT: vmovdqa %ymm0, %ymm2
 ; AVX2-NEXT: pand %ymm1, %ymm0
 ; AVX2-NEXT: pandn %ymm3, %ymm2
 ; AVX2-NEXT: por %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa %ymm4, %ymm0
-; AVX2-NEXT: vmovdqa %ymm2, %ymm1
+; AVX2-NEXT: vmovaps %ymm4, %ymm0
+; AVX2-NEXT: vmovaps %ymm2, %ymm1
 ; AVX2-NEXT: retq
 ; AVX512-LABEL: test_ctselect_v16i32:
 ; AVX512: # %bb.0:
@@ -700,10 +700,10 @@ define <16 x float> @test_ctselect_v16f32(i1 %cond, <16 x float> %a, <16 x float
 ; SSE2-NEXT: setne %al
 ; SSE2-NEXT: movzbl %al, %eax
 ; SSE2-NEXT: negl %eax
-; SSE2-NEXT: xorps %xmm9, %xmm9
+; SSE2-NEXT: pxor %xmm9, %xmm9
 ; SSE2-NEXT: movd %eax, %xmm9
 ; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,0,0,0]
-; SSE2-NEXT: movdqa %xmm9, %xmm8
+; SSE2-NEXT: movaps %xmm9, %xmm8
 ; SSE2-NEXT: pand %xmm0, %xmm9
 ; SSE2-NEXT: pandn %xmm4, %xmm8
 ; SSE2-NEXT: por %xmm9, %xmm8
@@ -714,7 +714,7 @@ define <16 x float> @test_ctselect_v16f32(i1 %cond, <16 x float> %a, <16 x float
 ; SSE2-NEXT: pxor %xmm0, %xmm0
 ; SSE2-NEXT: movd %eax, %xmm0
 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
-; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: movaps %xmm0, %xmm4
 ; SSE2-NEXT: pand %xmm1, %xmm0
 ; SSE2-NEXT: pandn %xmm5, %xmm4
 ; SSE2-NEXT: por %xmm0, %xmm4
@@ -725,7 +725,7 @@ define <16 x float> @test_ctselect_v16f32(i1 %cond, <16 x float> %a, <16 x float
 ; SSE2-NEXT: pxor %xmm0, %xmm0
 ; SSE2-NEXT: movd %eax, %xmm0
 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
-; SSE2-NEXT: movdqa %xmm0, %xmm5
+; SSE2-NEXT: movaps %xmm0, %xmm5
 ; SSE2-NEXT: pand %xmm2, %xmm0
 ; SSE2-NEXT: pandn %xmm6, %xmm5
 ; SSE2-NEXT: por %xmm0, %xmm5
@@ -736,14 +736,14 @@ define <16 x float> @test_ctselect_v16f32(i1 %cond, <16 x float> %a, <16 x float
 ; SSE2-NEXT: pxor %xmm0, %xmm0
 ; SSE2-NEXT: movd %eax, %xmm0
 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
-; SSE2-NEXT: movdqa %xmm0, %xmm6
+; SSE2-NEXT: movaps %xmm0, %xmm6
 ; SSE2-NEXT: pand %xmm3, %xmm0
 ; SSE2-NEXT: pandn %xmm7, %xmm6
 ; SSE2-NEXT: por %xmm0, %xmm6
-; SSE2-NEXT: movdqa %xmm8, %xmm0
-; SSE2-NEXT: movdqa %xmm4, %xmm1
-; SSE2-NEXT: movdqa %xmm5, %xmm2
-; SSE2-NEXT: movdqa %xmm6, %xmm3
+; SSE2-NEXT: movaps %xmm8, %xmm0
+; SSE2-NEXT: movaps %xmm4, %xmm1
+; SSE2-NEXT: movaps %xmm5, %xmm2
+; SSE2-NEXT: movaps %xmm6, %xmm3
 ; SSE2-NEXT: retq
 ;
 ; AVX-LABEL: test_ctselect_v16f32:
@@ -753,24 +753,24 @@ define <16 x float> @test_ctselect_v16f32(i1 %cond, <16 x float> %a, <16 x float
 ; AVX-NEXT: setne %al
 ; AVX-NEXT: movzbl %al, %eax
 ; AVX-NEXT: negl %eax
-; AVX-NEXT: xorps %ymm5, %ymm5
+; AVX-NEXT: pxor %ymm5, %ymm5
 ; AVX-NEXT: vmovd %eax, %ymm5
-; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,0,0,0,4,4,4,4]
-; AVX-NEXT: vmovaps %ymm5, %ymm4
-; AVX-NEXT: andps %ymm0, %ymm5
-; AVX-NEXT: andnps %ymm2, %ymm4
-; AVX-NEXT: orps %ymm5, %ymm4
+; AVX-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[0,0,0,0,4,4,4,4]
+; AVX-NEXT: vmovdqa %ymm5, %ymm4
+; AVX-NEXT: pand %ymm0, %ymm5
+; AVX-NEXT: pandn %ymm2, %ymm4
+; AVX-NEXT: por %ymm5, %ymm4
 ; AVX-NEXT: movl $0, %eax
 ; AVX-NEXT: setne %al
 ; AVX-NEXT: movzbl %al, %eax
 ; AVX-NEXT: negl %eax
-; AVX-NEXT: xorps %ymm0, %ymm0
+; AVX-NEXT: pxor %ymm0, %ymm0
 ; AVX-NEXT: vmovd %eax, %ymm0
-; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
-; AVX-NEXT: vmovaps %ymm0, %ymm2
-; AVX-NEXT: andps %ymm1, %ymm0
-; AVX-NEXT: andnps %ymm3, %ymm2
-; AVX-NEXT: orps %ymm0, %ymm2
+; AVX-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
+; AVX-NEXT: vmovdqa %ymm0, %ymm2
+; AVX-NEXT: pand %ymm1, %ymm0
+; AVX-NEXT: pandn %ymm3, %ymm2
+; AVX-NEXT: por %ymm0, %ymm2
 ; AVX-NEXT: vmovaps %ymm4, %ymm0
 ; AVX-NEXT: vmovaps %ymm2, %ymm1
 ; AVX-NEXT: retq
@@ -782,9 +782,9 @@ define <16 x float> @test_ctselect_v16f32(i1 %cond, <16 x float> %a, <16 x float
 ; AVX2-NEXT: setne %al
 ; AVX2-NEXT: movzbl %al, %eax
 ; AVX2-NEXT: negl %eax
-; AVX2-NEXT: xorps %ymm5, %ymm5
+; AVX2-NEXT: pxor %ymm5, %ymm5
 ; AVX2-NEXT: vmovd %eax, %ymm5
-; AVX2-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,0,0,0,4,4,4,4]
+; AVX2-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[0,0,0,0,4,4,4,4]
 ; AVX2-NEXT: vmovdqa %ymm5, %ymm4
 ; AVX2-NEXT: pand %ymm0, %ymm5
 ; AVX2-NEXT: pandn %ymm2, %ymm4
@@ -795,13 +795,13 @@ define <16 x float> @test_ctselect_v16f32(i1 %cond, <16 x float> %a, <16 x float
 ; AVX2-NEXT: negl %eax
 ; AVX2-NEXT: pxor %ymm0, %ymm0
 ; AVX2-NEXT: vmovd %eax, %ymm0
-; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
+; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
 ; AVX2-NEXT: vmovdqa %ymm0, %ymm2
 ; AVX2-NEXT: pand %ymm1, %ymm0
 ; AVX2-NEXT: pandn %ymm3, %ymm2
 ; AVX2-NEXT: por %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa %ymm4, %ymm0
-; AVX2-NEXT: vmovdqa %ymm2, %ymm1
+; AVX2-NEXT: vmovaps %ymm4, %ymm0
+; AVX2-NEXT: vmovaps %ymm2, %ymm1
 ; AVX2-NEXT: retq
 ; AVX512-LABEL: test_ctselect_v16f32:
 ; AVX512: # %bb.0:
@@ -824,7 +824,7 @@ define <8 x i64> @test_ctselect_v8i64(i1 %cond, <8 x i64> %a, <8 x i64> %b) {
 ; SSE2-NEXT: setne %al
 ; SSE2-NEXT: movzbl %al, %eax
 ; SSE2-NEXT: negl %eax
-; SSE2-NEXT: xorps %xmm9, %xmm9
+; SSE2-NEXT: pxor %xmm9, %xmm9
 ; SSE2-NEXT: movd %eax, %xmm9
 ; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,0,0,0]
 ; SSE2-NEXT: movdqa %xmm9, %xmm8
@@ -864,10 +864,10 @@ define <8 x i64> @test_ctselect_v8i64(i1 %cond, <8 x i64> %a, <8 x i64> %b) {
 ; SSE2-NEXT: pand %xmm3, %xmm0
 ; SSE2-NEXT: pandn %xmm7, %xmm6
 ; SSE2-NEXT: por %xmm0, %xmm6
-; SSE2-NEXT: movdqa %xmm8, %xmm0
-; SSE2-NEXT: movdqa %xmm4, %xmm1
-; SSE2-NEXT: movdqa %xmm5, %xmm2
-; SSE2-NEXT: movdqa %xmm6, %xmm3
+; SSE2-NEXT: movaps %xmm8, %xmm0
+; SSE2-NEXT: movaps %xmm4, %xmm1
+; SSE2-NEXT: movaps %xmm5, %xmm2
+; SSE2-NEXT: movaps %xmm6, %xmm3
 ; SSE2-NEXT: retq
 ;
 ; AVX-LABEL: test_ctselect_v8i64:
@@ -877,26 +877,26 @@ define <8 x i64> @test_ctselect_v8i64(i1 %cond, <8 x i64> %a, <8 x i64> %b) {
 ; AVX-NEXT: setne %al
 ; AVX-NEXT: movzbl %al, %eax
 ; AVX-NEXT: negl %eax
-; AVX-NEXT: xorps %ymm5, %ymm5
+; AVX-NEXT: pxor %ymm5, %ymm5
 ; AVX-NEXT: vmovd %eax, %ymm5
-; AVX-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[0,0,2,2]
-; AVX-NEXT: vmovapd %ymm5, %ymm4
-; AVX-NEXT: andpd %ymm0, %ymm5
-; AVX-NEXT: andnpd %ymm2, %ymm4
-; AVX-NEXT: orpd %ymm5, %ymm4
+; AVX-NEXT: vpermilpd {{.*#+}} ymm5 = ymm5[0,0,2,2]
+; AVX-NEXT: vmovdqa %ymm5, %ymm4
+; AVX-NEXT: pand %ymm0, %ymm5
+; AVX-NEXT: pandn %ymm2, %ymm4
+; AVX-NEXT: por %ymm5, %ymm4
 ; AVX-NEXT: movl $0, %eax
 ; AVX-NEXT: setne %al
 ; AVX-NEXT: movzbl %al, %eax
 ; AVX-NEXT: negl %eax
-; AVX-NEXT: xorpd %ymm0, %ymm0
+; AVX-NEXT: pxor %ymm0, %ymm0
 ; AVX-NEXT: vmovd %eax, %ymm0
-; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0,0,2,2]
-; AVX-NEXT: vmovapd %ymm0, %ymm2
-; AVX-NEXT: andpd %ymm1, %ymm0
-; AVX-NEXT: andnpd %ymm3, %ymm2
-; AVX-NEXT: orpd %ymm0, %ymm2
-; AVX-NEXT: vmovapd %ymm4, %ymm0
-; AVX-NEXT: vmovapd %ymm2, %ymm1
+; AVX-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; AVX-NEXT: vmovdqa %ymm0, %ymm2
+; AVX-NEXT: pand %ymm1, %ymm0
+; AVX-NEXT: pandn %ymm3, %ymm2
+; AVX-NEXT: por %ymm0, %ymm2
+; AVX-NEXT: vmovaps %ymm4, %ymm0
+; AVX-NEXT: vmovaps %ymm2, %ymm1
 ; AVX-NEXT: retq
 ;
 ; AVX2-LABEL: test_ctselect_v8i64:
@@ -906,26 +906,26 @@ define <8 x i64> @test_ctselect_v8i64(i1 %cond, <8 x i64> %a, <8 x i64> %b) {
 ; AVX2-NEXT: setne %al
 ; AVX2-NEXT: movzbl %al, %eax
 ; AVX2-NEXT: negl %eax
-; AVX2-NEXT: xorps %ymm5, %ymm5
+; AVX2-NEXT: pxor %ymm5, %ymm5
 ; AVX2-NEXT: vmovd %eax, %ymm5
-; AVX2-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[0,0,2,2]
-; AVX2-NEXT: vmovapd %ymm5, %ymm4
-; AVX2-NEXT: andpd %ymm0, %ymm5
-; AVX2-NEXT: andnpd %ymm2, %ymm4
-; AVX2-NEXT: orpd %ymm5, %ymm4
+; AVX2-NEXT: vpermilpd {{.*#+}} ymm5 = ymm5[0,0,2,2]
+; AVX2-NEXT: vmovdqa %ymm5, %ymm4
+; AVX2-NEXT: pand %ymm0, %ymm5
+; AVX2-NEXT: pandn %ymm2, %ymm4
+; AVX2-NEXT: por %ymm5, %ymm4
 ; AVX2-NEXT: movl $0, %eax
 ; AVX2-NEXT: setne %al
 ; AVX2-NEXT: movzbl %al, %eax
 ; AVX2-NEXT: negl %eax
-; AVX2-NEXT: xorpd %ymm0, %ymm0
+; AVX2-NEXT: pxor %ymm0, %ymm0
 ; AVX2-NEXT: vmovd %eax, %ymm0
-; AVX2-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0,0,2,2]
-; AVX2-NEXT: vmovapd %ymm0, %ymm2
-; AVX2-NEXT: andpd %ymm1, %ymm0
-; AVX2-NEXT: andnpd %ymm3, %ymm2
-; AVX2-NEXT: orpd %ymm0, %ymm2
-; AVX2-NEXT: vmovapd %ymm4, %ymm0
-; AVX2-NEXT: vmovapd %ymm2, %ymm1
+; AVX2-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; AVX2-NEXT: vmovdqa %ymm0, %ymm2
+; AVX2-NEXT: pand %ymm1, %ymm0
+; AVX2-NEXT: pandn %ymm3, %ymm2
+; AVX2-NEXT: por %ymm0, %ymm2
+; AVX2-NEXT: vmovaps %ymm4, %ymm0
+; AVX2-NEXT: vmovaps %ymm2, %ymm1
 ; AVX2-NEXT: retq
 ; AVX512-LABEL: test_ctselect_v8i64:
 ; AVX512: # %bb.0:
@@ -948,10 +948,10 @@ define <8 x double> @test_ctselect_v8f64(i1 %cond, <8 x double> %a, <8 x double>
 ; SSE2-NEXT: setne %al
 ; SSE2-NEXT: movzbl %al, %eax
 ; SSE2-NEXT: negl %eax
-; SSE2-NEXT: xorps %xmm9, %xmm9
+; SSE2-NEXT: pxor %xmm9, %xmm9
 ; SSE2-NEXT: movd %eax, %xmm9
 ; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,0,0,0]
-; SSE2-NEXT: movdqa %xmm9, %xmm8
+; SSE2-NEXT: movapd %xmm9, %xmm8
 ; SSE2-NEXT: pand %xmm0, %xmm9
 ; SSE2-NEXT: pandn %xmm4, %xmm8
 ; SSE2-NEXT: por %xmm9, %xmm8
@@ -962,7 +962,7 @@ define <8 x double> @test_ctselect_v8f64(i1 %cond, <8 x double> %a, <8 x double>
 ; SSE2-NEXT: pxor %xmm0, %xmm0
 ; SSE2-NEXT: movd %eax, %xmm0
 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
-; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: movapd %xmm0, %xmm4
 ; SSE2-NEXT: pand %xmm1, %xmm0
 ; SSE2-NEXT: pandn %xmm5, %xmm4
 ; SSE2-NEXT: por %xmm0, %xmm4
@@ -973,7 +973,7 @@ define <8 x double> @test_ctselect_v8f64(i1 %cond, <8 x double> %a, <8 x double>
 ; SSE2-NEXT: pxor %xmm0, %xmm0
 ; SSE2-NEXT: movd %eax, %xmm0
 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
-; SSE2-NEXT: movdqa %xmm0, %xmm5
+; SSE2-NEXT: movapd %xmm0, %xmm5
 ; SSE2-NEXT: pand %xmm2, %xmm0
 ; SSE2-NEXT: pandn %xmm6, %xmm5
 ; SSE2-NEXT: por %xmm0, %xmm5
@@ -984,14 +984,14 @@ define <8 x double> @test_ctselect_v8f64(i1 %cond, <8 x double> %a, <8 x double>
 ; SSE2-NEXT: pxor %xmm0, %xmm0
 ; SSE2-NEXT: movd %eax, %xmm0
 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
-; SSE2-NEXT: movdqa %xmm0, %xmm6
+; SSE2-NEXT: movapd %xmm0, %xmm6
 ; SSE2-NEXT: pand %xmm3, %xmm0
 ; SSE2-NEXT: pandn %xmm7, %xmm6
 ; SSE2-NEXT: por %xmm0, %xmm6
-; SSE2-NEXT: movdqa %xmm8, %xmm0
-; SSE2-NEXT: movdqa %xmm4, %xmm1
-; SSE2-NEXT: movdqa %xmm5, %xmm2
-; SSE2-NEXT: movdqa %xmm6, %xmm3
+; SSE2-NEXT: movaps %xmm8, %xmm0
+; SSE2-NEXT: movaps %xmm4, %xmm1
+; SSE2-NEXT: movaps %xmm5, %xmm2
+; SSE2-NEXT: movaps %xmm6, %xmm3
 ; SSE2-NEXT: retq
 ;
 ; AVX-LABEL: test_ctselect_v8f64:
@@ -1001,26 +1001,26 @@ define <8 x double> @test_ctselect_v8f64(i1 %cond, <8 x double> %a, <8 x double>
 ; AVX-NEXT: setne %al
 ; AVX-NEXT: movzbl %al, %eax
 ; AVX-NEXT: negl %eax
-; AVX-NEXT: xorps %ymm5, %ymm5
+; AVX-NEXT: pxor %ymm5, %ymm5
 ; AVX-NEXT: vmovd %eax, %ymm5
-; AVX-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[0,0,2,2]
-; AVX-NEXT: vmovapd %ymm5, %ymm4
-; AVX-NEXT: andpd %ymm0, %ymm5
-; AVX-NEXT: andnpd %ymm2, %ymm4
-; AVX-NEXT: orpd %ymm5, %ymm4
+; AVX-NEXT: vpermilpd {{.*#+}} ymm5 = ymm5[0,0,2,2]
+; AVX-NEXT: vmovdqa %ymm5, %ymm4
+; AVX-NEXT: pand %ymm0, %ymm5
+; AVX-NEXT: pandn %ymm2, %ymm4
+; AVX-NEXT: por %ymm5, %ymm4
 ; AVX-NEXT: movl $0, %eax
 ; AVX-NEXT: setne %al
 ; AVX-NEXT: movzbl %al, %eax
 ; AVX-NEXT: negl %eax
-; AVX-NEXT: xorpd %ymm0, %ymm0
+; AVX-NEXT: pxor %ymm0, %ymm0
 ; AVX-NEXT: vmovd %eax, %ymm0
-; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0,0,2,2]
-; AVX-NEXT: vmovapd %ymm0, %ymm2
-; AVX-NEXT: andpd %ymm1, %ymm0
-; AVX-NEXT: andnpd %ymm3, %ymm2
-; AVX-NEXT: orpd %ymm0, %ymm2
-; AVX-NEXT: vmovapd %ymm4, %ymm0
-; AVX-NEXT: vmovapd %ymm2, %ymm1
+; AVX-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; AVX-NEXT: vmovdqa %ymm0, %ymm2
+; AVX-NEXT: pand %ymm1, %ymm0
+; AVX-NEXT: pandn %ymm3, %ymm2
+; AVX-NEXT: por %ymm0, %ymm2
+; AVX-NEXT: vmovaps %ymm4, %ymm0
+; AVX-NEXT: vmovaps %ymm2, %ymm1
 ; AVX-NEXT: retq
 ;
 ; AVX2-LABEL: test_ctselect_v8f64:
@@ -1030,26 +1030,26 @@ define <8 x double> @test_ctselect_v8f64(i1 %cond, <8 x double> %a, <8 x double>
 ; AVX2-NEXT: setne %al
 ; AVX2-NEXT: movzbl %al, %eax
 ; AVX2-NEXT: negl %eax
-; AVX2-NEXT: xorps %ymm5, %ymm5
+; AVX2-NEXT: pxor %ymm5, %ymm5
 ; AVX2-NEXT: vmovd %eax, %ymm5
-; AVX2-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[0,0,2,2]
-; AVX2-NEXT: vmovapd %ymm5, %ymm4
-; AVX2-NEXT: andpd %ymm0, %ymm5
-; AVX2-NEXT: andnpd %ymm2, %ymm4
-; AVX2-NEXT: orpd %ymm5, %ymm4
+; AVX2-NEXT: vpermilpd {{.*#+}} ymm5 = ymm5[0,0,2,2]
+; AVX2-NEXT: vmovdqa %ymm5, %ymm4
+; AVX2-NEXT: pand %ymm0, %ymm5
+; AVX2-NEXT: pandn %ymm2, %ymm4
+; AVX2-NEXT: por %ymm5, %ymm4
 ; AVX2-NEXT: movl $0, %eax
 ; AVX2-NEXT: setne %al
 ; AVX2-NEXT: movzbl %al, %eax
 ; AVX2-NEXT: negl %eax
-; AVX2-NEXT: xorpd %ymm0, %ymm0
+; AVX2-NEXT: pxor %ymm0, %ymm0
 ; AVX2-NEXT: vmovd %eax, %ymm0
-; AVX2-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0,0,2,2]
-; AVX2-NEXT: vmovapd %ymm0, %ymm2
-; AVX2-NEXT: andpd %ymm1, %ymm0
-; AVX2-NEXT: andnpd %ymm3, %ymm2
-; AVX2-NEXT: orpd %ymm0, %ymm2
-; AVX2-NEXT: vmovapd %ymm4, %ymm0
-; AVX2-NEXT: vmovapd %ymm2, %ymm1
+; AVX2-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; AVX2-NEXT: vmovdqa %ymm0, %ymm2
+; AVX2-NEXT: pand %ymm1, %ymm0
+; AVX2-NEXT: pandn %ymm3, %ymm2
+; AVX2-NEXT: por %ymm0, %ymm2
+; AVX2-NEXT: vmovaps %ymm4, %ymm0
+; AVX2-NEXT: vmovaps %ymm2, %ymm1
 ; AVX2-NEXT: retq
 ; AVX512-LABEL: test_ctselect_v8f64:
 ; AVX512: # %bb.0:
@@ -1074,14 +1074,14 @@ define <4 x i32> @test_ctselect_v4i32_const_true(<4 x i32> %a, <4 x i32> %b) {
 ; SSE2-NEXT: setne %al
 ; SSE2-NEXT: movzbl %al, %eax
 ; SSE2-NEXT: negl %eax
-; SSE2-NEXT: xorps %xmm3, %xmm3
+; SSE2-NEXT: pxor %xmm3, %xmm3
 ; SSE2-NEXT: movd %eax, %xmm3
 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
 ; SSE2-NEXT: movdqa %xmm3, %xmm2
 ; SSE2-NEXT: pand %xmm0, %xmm3
 ; SSE2-NEXT: pandn %xmm1, %xmm2
 ; SSE2-NEXT: por %xmm3, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm0
+; SSE2-NEXT: movaps %xmm2, %xmm0
 ; SSE2-NEXT: retq
 ;
 ; AVX-LABEL: test_ctselect_v4i32_const_true:
@@ -1092,14 +1092,14 @@ define <4 x i32> @test_ctselect_v4i32_const_true(<4 x i32> %a, <4 x i32> %b) {
 ; AVX-NEXT: setne %al
 ; AVX-NEXT: movzbl %al, %eax
 ; AVX-NEXT: negl %eax
-; AVX-NEXT: xorps %xmm3, %xmm3
+; AVX-NEXT: pxor %xmm3, %xmm3
 ; AVX-NEXT: movd %eax, %xmm3
 ; AVX-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
 ; AVX-NEXT: movdqa %xmm3, %xmm2
 ; AVX-NEXT: pand %xmm0, %xmm3
 ; AVX-NEXT: pandn %xmm1, %xmm2
 ; AVX-NEXT: por %xmm3, %xmm2
-; AVX-NEXT: vmovdqa %xmm2, %xmm0
+; AVX-NEXT: vmovaps %xmm2, %xmm0
 ; AVX-NEXT: retq
 ;
 ; AVX2-LABEL: test_ctselect_v4i32_const_true:
@@ -1110,14 +1110,14 @@ define <4 x i32> @test_ctselect_v4i32_const_true(<4 x i32> %a, <4 x i32> %b) {
 ; AVX2-NEXT: setne %al
 ; AVX2-NEXT: movzbl %al, %eax
 ; AVX2-NEXT: negl %eax
-; AVX2-NEXT: xorps %xmm3, %xmm3
+; AVX2-NEXT: pxor %xmm3, %xmm3
 ; AVX2-NEXT: movd %eax, %xmm3
 ; AVX2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
 ; AVX2-NEXT: movdqa %xmm3, %xmm2
 ; AVX2-NEXT: pand %xmm0, %xmm3
 ; AVX2-NEXT: pandn %xmm1, %xmm2
 ; AVX2-NEXT: por %xmm3, %xmm2
-; AVX2-NEXT: vmovdqa %xmm2, %xmm0
+; AVX2-NEXT: vmovaps %xmm2, %xmm0
 ; AVX2-NEXT: retq
 ; AVX512-LABEL: test_ctselect_v4i32_const_true:
 ; AVX512: # %bb.0:
@@ -1135,14 +1135,14 @@ define <4 x i32> @test_ctselect_v4i32_const_false(<4 x i32> %a, <4 x i32> %b) {
 ; SSE2-NEXT: setne %al
 ; SSE2-NEXT: movzbl %al, %eax
 ; SSE2-NEXT: negl %eax
-; SSE2-NEXT: xorps %xmm3, %xmm3
+; SSE2-NEXT: pxor %xmm3, %xmm3
 ; SSE2-NEXT: movd %eax, %xmm3
 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
 ; SSE2-NEXT: movdqa %xmm3, %xmm2
 ; SSE2-NEXT: pand %xmm0, %xmm3
 ; SSE2-NEXT: pandn %xmm1, %xmm2
 ; SSE2-NEXT: por %xmm3, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm0
+; SSE2-NEXT: movaps %xmm2, %xmm0
 ; SSE2-NEXT: retq
 ;
 ; AVX-LABEL: test_ctselect_v4i32_const_false:
@@ -1153,14 +1153,14 @@ define <4 x i32> @test_ctselect_v4i32_const_false(<4 x i32> %a, <4 x i32> %b) {
 ; AVX-NEXT: setne %al
 ; AVX-NEXT: movzbl %al, %eax
 ; AVX-NEXT: negl %eax
-; AVX-NEXT: xorps %xmm3, %xmm3
+; AVX-NEXT: pxor %xmm3, %xmm3
 ; AVX-NEXT: movd %eax, %xmm3
 ; AVX-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
 ; AVX-NEXT: movdqa %xmm3, %xmm2
 ; AVX-NEXT: pand %xmm0, %xmm3
 ; AVX-NEXT: pandn %xmm1, %xmm2
 ; AVX-NEXT: por %xmm3, %xmm2
-; AVX-NEXT: vmovdqa %xmm2, %xmm0
+; AVX-NEXT: vmovaps %xmm2, %xmm0
 ; AVX-NEXT: retq
 ;
 ; AVX2-LABEL: test_ctselect_v4i32_const_false:
@@ -1171,14 +1171,14 @@ define <4 x i32> @test_ctselect_v4i32_const_false(<4 x i32> %a, <4 x i32> %b) {
 ; AVX2-NEXT: setne %al
 ; AVX2-NEXT: movzbl %al, %eax
 ; AVX2-NEXT: negl %eax
-; AVX2-NEXT: xorps %xmm3, %xmm3
+; AVX2-NEXT: pxor %xmm3, %xmm3
 ; AVX2-NEXT: movd %eax, %xmm3
 ; AVX2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
 ; AVX2-NEXT: movdqa %xmm3, %xmm2
 ; AVX2-NEXT: pand %xmm0, %xmm3
 ; AVX2-NEXT: pandn %xmm1, %xmm2
 ; AVX2-NEXT: por %xmm3, %xmm2
-; AVX2-NEXT: vmovdqa %xmm2, %xmm0
+; AVX2-NEXT: vmovaps %xmm2, %xmm0
 ; AVX2-NEXT: retq
 ; AVX512-LABEL: test_ctselect_v4i32_const_false:
 ; AVX512: # %bb.0:
@@ -1199,14 +1199,14 @@ define <4 x i32> @test_ctselect_v4i32_icmp(i32 %x, i32 %y, <4 x i32> %a, <4 x i3
 ; SSE2-NEXT: setne %al
 ; SSE2-NEXT: movzbl %al, %eax
 ; SSE2-NEXT: negl %eax
-; SSE2-NEXT: xorps %xmm3, %xmm3
+; SSE2-NEXT: pxor %xmm3, %xmm3
 ; SSE2-NEXT: movd %eax, %xmm3
 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
 ; SSE2-NEXT: movdqa %xmm3, %xmm2
 ; SSE2-NEXT: pand %xmm0, %xmm3
 ; SSE2-NEXT: pandn %xmm1, %xmm2
 ; SSE2-NEXT: por %xmm3, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm0
+; SSE2-NEXT: movaps %xmm2, %xmm0
 ; SSE2-NEXT: retq
 ;
 ; AVX-LABEL: test_ctselect_v4i32_icmp:
@@ -1218,14 +1218,14 @@ define <4 x i32> @test_ctselect_v4i32_icmp(i32 %x, i32 %y, <4 x i32> %a, <4 x i3
 ; AVX-NEXT: setne %al
 ; AVX-NEXT: movzbl %al, %eax
 ; AVX-NEXT: negl %eax
-; AVX-NEXT: xorps %xmm3, %xmm3
+; AVX-NEXT: pxor %xmm3, %xmm3
 ; AVX-NEXT: movd %eax, %xmm3
 ; AVX-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
 ; AVX-NEXT: movdqa %xmm3, %xmm2
 ; AVX-NEXT: pand %xmm0, %xmm3
 ; AVX-NEXT: pandn %xmm1, %xmm2
 ; AVX-NEXT: por %xmm3, %xmm2
-; AVX-NEXT: vmovdqa %xmm2, %xmm0
+; AVX-NEXT: vmovaps %xmm2, %xmm0
 ; AVX-NEXT: retq
 ;
 ; AVX2-LABEL: test_ctselect_v4i32_icmp:
@@ -1237,14 +1237,14 @@ define <4 x i32> @test_ctselect_v4i32_icmp(i32 %x, i32 %y, <4 x i32> %a, <4 x i3
 ; AVX2-NEXT: setne %al
 ; AVX2-NEXT: movzbl %al, %eax
 ; AVX2-NEXT: negl %eax
-; AVX2-NEXT: xorps %xmm3, %xmm3
+; AVX2-NEXT: pxor %xmm3, %xmm3
 ; AVX2-NEXT: movd %eax, %xmm3
 ; AVX2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
 ; AVX2-NEXT: movdqa %xmm3, %xmm2
 ; AVX2-NEXT: pand %xmm0, %xmm3
 ; AVX2-NEXT: pandn %xmm1, %xmm2
 ; AVX2-NEXT: por %xmm3, %xmm2
-; AVX2-NEXT: vmovdqa %xmm2, %xmm0
+; AVX2-NEXT: vmovaps %xmm2, %xmm0
 ; AVX2-NEXT: retq
 ; AVX512-LABEL: test_ctselect_v4i32_icmp:
 ; AVX512: # %bb.0:
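The updated CHECK lines throughout this patch all encode the same branch-free blend: materialize an all-ones or all-zeros mask from the condition (setne / movzbl / negl), splat it across the vector lanes (pshufd or vpermilps/vpermilpd), and combine the operands with pand / pandn / por so that no executed instruction depends on which value is selected. A minimal scalar sketch of that identity in C follows; the function name is illustrative only and is not part of the patch:

    #include <stdint.h>

    /* Branch-free select: mask is all-ones when cond != 0 and all-zeros
       otherwise, mirroring the setne + negl sequence in the checks above. */
    static inline uint32_t ct_select_u32(uint32_t cond, uint32_t a,
                                         uint32_t b) {
      uint32_t mask = (uint32_t)0 - (cond != 0); /* 0xFFFFFFFF or 0 */
      return (a & mask) | (b & ~mask);           /* pand / pandn / por */
    }

The vector lowerings exercised by these tests apply the same identity lane-wise, which is why this update only swaps register-move and zeroing mnemonics (movdqa/xorps to movaps/pxor and friends) while leaving the mask-and-blend structure untouched.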