From 995ba77e0ae68e9ea99098568645f47d094aafe4 Mon Sep 17 00:00:00 2001 From: Princeton Ferro Date: Tue, 12 Aug 2025 18:01:46 -0700 Subject: [PATCH 1/5] [NVPTX] legalize v2i32 to improve codegen of v2f32 ops Since v2f32 is legal but v2i32 is not, this causes some sequences of operations like bitcast (build_vector) to be lowered inefficiently. --- llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 1 + llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 184 ++++--- llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 8 +- llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td | 3 +- llvm/lib/Target/NVPTX/NVPTXUtilities.h | 4 +- .../test/CodeGen/NVPTX/f32x2-convert-i32x2.ll | 119 +++++ llvm/test/CodeGen/NVPTX/f32x2-instructions.ll | 170 ++++-- llvm/test/CodeGen/NVPTX/ldg-invariant-256.ll | 16 +- .../load-store-256-addressing-invariant.ll | 28 +- .../NVPTX/load-store-256-addressing.ll | 28 +- .../CodeGen/NVPTX/load-store-vectors-256.ll | 252 +++++---- .../CodeGen/NVPTX/reduction-intrinsics.ll | 486 ++++++++++++------ 12 files changed, 899 insertions(+), 400 deletions(-) create mode 100644 llvm/test/CodeGen/NVPTX/f32x2-convert-i32x2.ll diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index e86abb7203f2b..bef4868492d4e 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -1027,6 +1027,7 @@ pickOpcodeForVT(MVT::SimpleValueType VT, std::optional Opcode_i16, case MVT::f32: return Opcode_i32; case MVT::v2f32: + case MVT::v2i32: case MVT::i64: case MVT::f64: return Opcode_i64; diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index 68935784128c0..544f0bab9dfa9 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -226,21 +226,20 @@ getVectorLoweringShape(EVT VectorEVT, const NVPTXSubtarget &STI, switch (VectorVT.SimpleTy) { default: return std::nullopt; + case MVT::v4i64: case MVT::v4f64: - case MVT::v8i32: - // This is a "native" vector type iff the address space is global - // and the target supports 256-bit loads/stores + // This is a "native" vector type iff the address space is global and the + // target supports 256-bit loads/stores if (!CanLowerTo256Bit) return std::nullopt; LLVM_FALLTHROUGH; case MVT::v2i8: - case MVT::v2i32: case MVT::v2i64: case MVT::v2f64: - case MVT::v4i32: // This is a "native" vector type return std::pair(NumElts, EltVT); + case MVT::v16f16: // <8 x f16x2> case MVT::v16bf16: // <8 x bf16x2> case MVT::v16i16: // <8 x i16x2> @@ -264,12 +263,18 @@ getVectorLoweringShape(EVT VectorEVT, const NVPTXSubtarget &STI, case MVT::v16i8: // <4 x i8x4> PackRegSize = 32; break; + case MVT::v8f32: // <4 x f32x2> + case MVT::v8i32: // <4 x i32x2> + // This is a "native" vector type iff the address space is global and the + // target supports 256-bit loads/stores if (!CanLowerTo256Bit) return std::nullopt; LLVM_FALLTHROUGH; case MVT::v2f32: // <1 x f32x2> case MVT::v4f32: // <2 x f32x2> + case MVT::v2i32: // <1 x i32x2> + case MVT::v4i32: // <2 x i32x2> if (!STI.hasF32x2Instructions()) return std::pair(NumElts, EltVT); PackRegSize = 64; @@ -590,8 +595,10 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, addRegisterClass(MVT::bf16, &NVPTX::B16RegClass); addRegisterClass(MVT::v2bf16, &NVPTX::B32RegClass); - if (STI.hasF32x2Instructions()) + if (STI.hasF32x2Instructions()) { addRegisterClass(MVT::v2f32, &NVPTX::B64RegClass); + addRegisterClass(MVT::v2i32, &NVPTX::B64RegClass); + } // Conversion to/from FP16/FP16x2 is always legal. setOperationAction(ISD::BUILD_VECTOR, MVT::v2f16, Custom); @@ -628,12 +635,13 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i8, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i8, Custom); - // No support for these operations with v2f32. - setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f32, Expand); - setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f32, Expand); + // No support for these operations with v2f32/v2i32 + setOperationAction(ISD::INSERT_VECTOR_ELT, {MVT::v2f32, MVT::v2i32}, Expand); + setOperationAction(ISD::VECTOR_SHUFFLE, {MVT::v2f32, MVT::v2i32}, Expand); // Need custom lowering in case the index is dynamic. if (STI.hasF32x2Instructions()) - setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f32, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, {MVT::v2f32, MVT::v2i32}, + Custom); // Custom conversions to/from v2i8. setOperationAction(ISD::BITCAST, MVT::v2i8, Custom); @@ -661,14 +669,13 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, // Operations not directly supported by NVPTX. for (MVT VT : {MVT::bf16, MVT::f16, MVT::v2bf16, MVT::v2f16, MVT::f32, MVT::v2f32, MVT::f64, MVT::i1, MVT::i8, MVT::i16, MVT::v2i16, - MVT::v4i8, MVT::i32, MVT::i64}) { + MVT::v4i8, MVT::i32, MVT::v2i32, MVT::i64}) { setOperationAction(ISD::SELECT_CC, VT, Expand); setOperationAction(ISD::BR_CC, VT, Expand); } - // Not directly supported. TLI would attempt to expand operations like - // FMINIMUM(v2f32) using invalid SETCC and VSELECT nodes. - setOperationAction(ISD::VSELECT, MVT::v2f32, Expand); + // We don't want ops like FMINIMUM or UMAX to be lowered to SETCC+VSELECT. + setOperationAction(ISD::VSELECT, {MVT::v2f32, MVT::v2i32}, Expand); // Some SIGN_EXTEND_INREG can be done using cvt instruction. // For others we will expand to a SHL/SRA pair. @@ -815,7 +822,14 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, setOperationAction({ISD::SDIV, ISD::UDIV, ISD::SRA, ISD::SRL, ISD::MULHS, ISD::MULHU, ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::SETCC}, - MVT::v2i16, Expand); + {MVT::v2i16, MVT::v2i32}, Expand); + + // v2i32 is not supported for any arithmetic operations + setOperationAction({ISD::ABS, ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX, + ISD::CTPOP, ISD::CTLZ, ISD::ADD, ISD::SUB, ISD::MUL, + ISD::SHL, ISD::SRA, ISD::SRL, ISD::OR, ISD::AND, ISD::XOR, + ISD::SREM, ISD::UREM}, + MVT::v2i32, Expand); setOperationAction(ISD::ADDC, MVT::i32, Legal); setOperationAction(ISD::ADDE, MVT::i32, Legal); @@ -829,7 +843,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, } setOperationAction(ISD::CTTZ, MVT::i16, Expand); - setOperationAction(ISD::CTTZ, MVT::v2i16, Expand); + setOperationAction(ISD::CTTZ, {MVT::v2i16, MVT::v2i32}, Expand); setOperationAction(ISD::CTTZ, MVT::i32, Expand); setOperationAction(ISD::CTTZ, MVT::i64, Expand); @@ -1071,7 +1085,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, // Custom lowering for tcgen05.st vector operands setOperationAction(ISD::INTRINSIC_VOID, {MVT::v2i32, MVT::v4i32, MVT::v8i32, MVT::v16i32, - MVT::v32i32, MVT::v64i32, MVT::v128i32}, + MVT::v32i32, MVT::v64i32, MVT::v128i32, MVT::Other}, Custom); // Enable custom lowering for the following: @@ -2604,7 +2618,7 @@ static SDValue LowerVectorArith(SDValue Op, SelectionDAG &DAG) { return V; } -static SDValue LowerTcgen05St(SDValue Op, SelectionDAG &DAG) { +static SDValue lowerTcgen05St(SDValue Op, SelectionDAG &DAG) { SDNode *N = Op.getNode(); SDLoc DL(N); SmallVector Ops; @@ -2719,7 +2733,52 @@ static SDValue LowerTcgen05MMADisableOutputLane(SDValue Op, SelectionDAG &DAG) { return Tcgen05MMANode; } -static SDValue LowerIntrinsicVoid(SDValue Op, SelectionDAG &DAG) { +// Lower vector return type of tcgen05.ld intrinsics +static std::optional> +lowerTcgen05Ld(SDNode *N, SelectionDAG &DAG, bool HasOffset = false) { + SDLoc DL(N); + EVT ResVT = N->getValueType(0); + if (!ResVT.isVector()) + return {}; // already legalized. + + const unsigned NumElts = ResVT.getVectorNumElements(); + + // Create the return type of the instructions + SmallVector ListVTs; + for (unsigned i = 0; i < NumElts; ++i) + ListVTs.push_back(MVT::i32); + + ListVTs.push_back(N->getValueType(1)); // Chain + + SDVTList ResVTs = DAG.getVTList(ListVTs); + + SmallVector Ops{N->getOperand(0), N->getOperand(1), + N->getOperand(2)}; + + if (HasOffset) { + Ops.push_back(N->getOperand(3)); // offset + Ops.push_back(N->getOperand(4)); // Pack flag + } else + Ops.push_back(N->getOperand(3)); // Pack flag + + MemIntrinsicSDNode *MemSD = cast(N); + SDValue NewNode = + DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, ResVTs, Ops, + MemSD->getMemoryVT(), MemSD->getMemOperand()); + + // split the vector result + SmallVector ScalarRes; + for (unsigned i = 0; i < NumElts; ++i) { + SDValue Res = NewNode.getValue(i); + ScalarRes.push_back(Res); + } + + SDValue Chain = NewNode.getValue(NumElts); + SDValue BuildVector = DAG.getNode(ISD::BUILD_VECTOR, DL, ResVT, ScalarRes); + return {{BuildVector, Chain}}; +} + +static SDValue lowerIntrinsicVoid(SDValue Op, SelectionDAG &DAG) { SDNode *N = Op.getNode(); SDValue Intrin = N->getOperand(1); @@ -2765,7 +2824,7 @@ static SDValue LowerIntrinsicVoid(SDValue Op, SelectionDAG &DAG) { case Intrinsic::nvvm_tcgen05_st_16x64b_x64: case Intrinsic::nvvm_tcgen05_st_32x32b_x64: case Intrinsic::nvvm_tcgen05_st_32x32b_x128: - return LowerTcgen05St(Op, DAG); + return lowerTcgen05St(Op, DAG); case Intrinsic::nvvm_tcgen05_mma_shared_disable_output_lane_cg1: case Intrinsic::nvvm_tcgen05_mma_shared_disable_output_lane_cg2: case Intrinsic::nvvm_tcgen05_mma_shared_scale_d_disable_output_lane_cg1: @@ -2867,6 +2926,26 @@ static SDValue lowerPrmtIntrinsic(SDValue Op, SelectionDAG &DAG) { SDValue Selector = (Op->op_end() - 1)->get(); return getPRMT(A, B, Selector, DL, DAG, Mode); } + +static SDValue lowerIntrinsicWChain(SDValue Op, SelectionDAG &DAG) { + switch (Op->getConstantOperandVal(1)) { + default: + return Op; + + case Intrinsic::nvvm_tcgen05_ld_16x64b_x2: + case Intrinsic::nvvm_tcgen05_ld_16x128b_x1: + case Intrinsic::nvvm_tcgen05_ld_32x32b_x2: + if (auto Pair = lowerTcgen05Ld(Op.getNode(), DAG)) + return DAG.getMergeValues({Pair->first, Pair->second}, SDLoc(Op)); + return SDValue(); + + case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x2: + if (auto Pair = lowerTcgen05Ld(Op.getNode(), DAG, /*HasOffset=*/true)) + return DAG.getMergeValues({Pair->first, Pair->second}, SDLoc(Op)); + return SDValue(); + } +} + static SDValue lowerIntrinsicWOChain(SDValue Op, SelectionDAG &DAG) { switch (Op->getConstantOperandVal(0)) { default: @@ -3029,11 +3108,11 @@ NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::ADDRSPACECAST: return LowerADDRSPACECAST(Op, DAG); case ISD::INTRINSIC_W_CHAIN: - return Op; + return lowerIntrinsicWChain(Op, DAG); case ISD::INTRINSIC_WO_CHAIN: return lowerIntrinsicWOChain(Op, DAG); case ISD::INTRINSIC_VOID: - return LowerIntrinsicVoid(Op, DAG); + return lowerIntrinsicVoid(Op, DAG); case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG); case ISD::BITCAST: @@ -5920,7 +5999,7 @@ static SDValue PerformEXTRACTCombine(SDNode *N, IsPTXVectorType(VectorVT.getSimpleVT())) return SDValue(); // Native vector loads already combine nicely w/ // extract_vector_elt. - // Don't mess with singletons or packed types (v2f32, v2*16, v4i8 and v8i8), + // Don't mess with singletons or packed types (v2*32, v2*16, v4i8 and v8i8), // we already handle them OK. if (VectorVT.getVectorNumElements() == 1 || NVPTX::isPackedVectorTy(VectorVT) || VectorVT == MVT::v8i8) @@ -6300,53 +6379,6 @@ static void ReplaceBITCAST(SDNode *Node, SelectionDAG &DAG, DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2i8, {Vec0, Vec1})); } -// Lower vector return type of tcgen05.ld intrinsics -static void ReplaceTcgen05Ld(SDNode *N, SelectionDAG &DAG, - SmallVectorImpl &Results, - bool hasOffset = false) { - SDLoc DL(N); - EVT ResVT = N->getValueType(0); - if (!ResVT.isVector()) - return; // already legalized. - - const unsigned NumElts = ResVT.getVectorNumElements(); - - // Create the return type of the instructions - SmallVector ListVTs; - for (unsigned i = 0; i < NumElts; ++i) - ListVTs.push_back(MVT::i32); - - ListVTs.push_back(N->getValueType(1)); // Chain - - SDVTList ResVTs = DAG.getVTList(ListVTs); - - SmallVector Ops{N->getOperand(0), N->getOperand(1), - N->getOperand(2)}; - - if (hasOffset) { - Ops.push_back(N->getOperand(3)); // offset - Ops.push_back(N->getOperand(4)); // Pack flag - } else - Ops.push_back(N->getOperand(3)); // Pack flag - - MemIntrinsicSDNode *MemSD = cast(N); - SDValue NewNode = - DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, ResVTs, Ops, - MemSD->getMemoryVT(), MemSD->getMemOperand()); - - // split the vector result - SmallVector ScalarRes; - for (unsigned i = 0; i < NumElts; ++i) { - SDValue Res = NewNode.getValue(i); - ScalarRes.push_back(Res); - } - - SDValue Chain = NewNode.getValue(NumElts); - SDValue BuildVector = DAG.getNode(ISD::BUILD_VECTOR, DL, ResVT, ScalarRes); - Results.push_back(BuildVector); // Build Vector - Results.push_back(Chain); // Chain -} - static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG, SmallVectorImpl &Results) { SDValue Chain = N->getOperand(0); @@ -6482,7 +6514,11 @@ static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG, case Intrinsic::nvvm_tcgen05_ld_16x256b_x8: case Intrinsic::nvvm_tcgen05_ld_16x256b_x16: case Intrinsic::nvvm_tcgen05_ld_16x256b_x32: - return ReplaceTcgen05Ld(N, DAG, Results); + if (auto Pair = lowerTcgen05Ld(N, DAG)) { + Results.push_back(Pair->first); + Results.push_back(Pair->second); + } + return; case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x2: case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x4: @@ -6491,7 +6527,11 @@ static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG, case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x32: case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x64: case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x128: - return ReplaceTcgen05Ld(N, DAG, Results, /* Offset */ true); + if (auto Pair = lowerTcgen05Ld(N, DAG, /*HasOffset=*/true)) { + Results.push_back(Pair->first); + Results.push_back(Pair->second); + } + return; } } diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index 0b2bb694f1fd5..4cacee2290763 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -756,8 +756,10 @@ def : Pat<(vt (select i1:$p, vt:$a, vt:$b)), (SELP_b32rr $a, $b, $p)>; } -def : Pat<(v2f32 (select i1:$p, v2f32:$a, v2f32:$b)), +foreach vt = [v2f32, v2i32] in { +def : Pat<(vt (select i1:$p, vt:$a, vt:$b)), (SELP_b64rr $a, $b, $p)>; +} //----------------------------------- // Test Instructions @@ -2101,8 +2103,8 @@ foreach vt = [v2f16, v2bf16, v2i16] in { (V2I16toI32 $a, $b)>; } -// Same thing for the 64-bit type v2f32. -foreach vt = [v2f32] in { +// Handle extracting one element from the pair (64-bit types) +foreach vt = [v2f32, v2i32] in { def : Pat<(extractelt vt:$src, 0), (I64toI32L_Sink $src)>, Requires<[hasPTX<71>]>; def : Pat<(extractelt vt:$src, 1), (I64toI32H_Sink $src)>, Requires<[hasPTX<71>]>; diff --git a/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td b/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td index 2e81ab122d1df..913487b64617a 100644 --- a/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td @@ -54,7 +54,8 @@ def B16 : NVPTXRegClass<[i16, f16, bf16], 16, (add (sequence "RS%u", 0, 4))>; def B32 : NVPTXRegClass<[i32, v2f16, v2bf16, v2i16, v4i8, f32], 32, (add (sequence "R%u", 0, 4), VRFrame32, VRFrameLocal32)>; -def B64 : NVPTXRegClass<[i64, v2f32, f64], 64, (add (sequence "RL%u", 0, 4), +def B64 : NVPTXRegClass<[i64, v2i32, v2f32, f64], 64, + (add (sequence "RL%u", 0, 4), VRFrame64, VRFrameLocal64)>; // 128-bit regs are not defined as general regs in NVPTX. They are used for inlineASM only. def B128 : NVPTXRegClass<[i128], 128, (add (sequence "RQ%u", 0, 4))>; diff --git a/llvm/lib/Target/NVPTX/NVPTXUtilities.h b/llvm/lib/Target/NVPTX/NVPTXUtilities.h index a070789f85e0b..4b5cb30fd3036 100644 --- a/llvm/lib/Target/NVPTX/NVPTXUtilities.h +++ b/llvm/lib/Target/NVPTX/NVPTXUtilities.h @@ -99,8 +99,8 @@ namespace NVPTX { // register. NOTE: This must be kept in sync with the register classes // defined in NVPTXRegisterInfo.td. inline auto packed_types() { - static const auto PackedTypes = {MVT::v4i8, MVT::v2f16, MVT::v2bf16, - MVT::v2i16, MVT::v2f32}; + static const auto PackedTypes = {MVT::v4i8, MVT::v2f16, MVT::v2bf16, + MVT::v2i16, MVT::v2f32, MVT::v2i32}; return PackedTypes; } diff --git a/llvm/test/CodeGen/NVPTX/f32x2-convert-i32x2.ll b/llvm/test/CodeGen/NVPTX/f32x2-convert-i32x2.ll new file mode 100644 index 0000000000000..18fb87935d17d --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/f32x2-convert-i32x2.ll @@ -0,0 +1,119 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mcpu=sm_90a -O0 -disable-post-ra -frame-pointer=all \ +; RUN: -verify-machineinstrs | FileCheck --check-prefixes=CHECK,CHECK-SM90A %s +; RUN: %if ptxas-12.7 %{ \ +; RUN: llc < %s -mcpu=sm_90a -O0 -disable-post-ra -frame-pointer=all \ +; RUN: -verify-machineinstrs | %ptxas-verify -arch=sm_90a \ +; RUN: %} +; RUN: llc < %s -mcpu=sm_100 -O0 -disable-post-ra -frame-pointer=all \ +; RUN: -verify-machineinstrs | FileCheck --check-prefixes=CHECK,CHECK-SM100 %s +; RUN: %if ptxas-12.7 %{ \ +; RUN: llc < %s -mcpu=sm_100 -O0 -disable-post-ra -frame-pointer=all \ +; RUN: -verify-machineinstrs | %ptxas-verify -arch=sm_100 \ +; RUN: %} + +; Test that v2i32 -> v2f32 conversions don't emit bitwise operations on i64. + +target triple = "nvptx64-nvidia-cuda" + +declare <2 x i32> @return_i32x2(i32 %0) + +; Test with v2i32. +define ptx_kernel void @store_i32x2(i32 %0, ptr %p) { +; CHECK-SM90A-LABEL: store_i32x2( +; CHECK-SM90A: { +; CHECK-SM90A-NEXT: .reg .b32 %r<6>; +; CHECK-SM90A-NEXT: .reg .b64 %rd<2>; +; CHECK-SM90A-EMPTY: +; CHECK-SM90A-NEXT: // %bb.0: +; CHECK-SM90A-NEXT: ld.param.b64 %rd1, [store_i32x2_param_1]; +; CHECK-SM90A-NEXT: ld.param.b32 %r1, [store_i32x2_param_0]; +; CHECK-SM90A-NEXT: { // callseq 0, 0 +; CHECK-SM90A-NEXT: .param .b32 param0; +; CHECK-SM90A-NEXT: .param .align 8 .b8 retval0[8]; +; CHECK-SM90A-NEXT: st.param.b32 [param0], %r1; +; CHECK-SM90A-NEXT: call.uni (retval0), return_i32x2, (param0); +; CHECK-SM90A-NEXT: ld.param.v2.b32 {%r2, %r3}, [retval0]; +; CHECK-SM90A-NEXT: } // callseq 0 +; CHECK-SM90A-NEXT: add.rn.f32 %r4, %r3, %r3; +; CHECK-SM90A-NEXT: add.rn.f32 %r5, %r2, %r2; +; CHECK-SM90A-NEXT: st.v2.b32 [%rd1], {%r5, %r4}; +; CHECK-SM90A-NEXT: ret; +; +; CHECK-SM100-LABEL: store_i32x2( +; CHECK-SM100: { +; CHECK-SM100-NEXT: .reg .b32 %r<2>; +; CHECK-SM100-NEXT: .reg .b64 %rd<4>; +; CHECK-SM100-EMPTY: +; CHECK-SM100-NEXT: // %bb.0: +; CHECK-SM100-NEXT: ld.param.b64 %rd1, [store_i32x2_param_1]; +; CHECK-SM100-NEXT: ld.param.b32 %r1, [store_i32x2_param_0]; +; CHECK-SM100-NEXT: { // callseq 0, 0 +; CHECK-SM100-NEXT: .param .b32 param0; +; CHECK-SM100-NEXT: .param .align 8 .b8 retval0[8]; +; CHECK-SM100-NEXT: st.param.b32 [param0], %r1; +; CHECK-SM100-NEXT: call.uni (retval0), return_i32x2, (param0); +; CHECK-SM100-NEXT: ld.param.b64 %rd2, [retval0]; +; CHECK-SM100-NEXT: } // callseq 0 +; CHECK-SM100-NEXT: add.rn.f32x2 %rd3, %rd2, %rd2; +; CHECK-SM100-NEXT: st.b64 [%rd1], %rd3; +; CHECK-SM100-NEXT: ret; + %v = call <2 x i32> @return_i32x2(i32 %0) + %v.f32x2 = bitcast <2 x i32> %v to <2 x float> + %res = fadd <2 x float> %v.f32x2, %v.f32x2 + store <2 x float> %res, ptr %p, align 8 + ret void +} + +; Test with inline ASM returning { <1 x float>, <1 x float> }, which decays to +; v2i32. +define ptx_kernel void @inlineasm(ptr %p) { +; CHECK-SM90A-LABEL: inlineasm( +; CHECK-SM90A: { +; CHECK-SM90A-NEXT: .reg .b32 %r<7>; +; CHECK-SM90A-NEXT: .reg .b64 %rd<2>; +; CHECK-SM90A-EMPTY: +; CHECK-SM90A-NEXT: // %bb.0: +; CHECK-SM90A-NEXT: ld.param.b64 %rd1, [inlineasm_param_0]; +; CHECK-SM90A-NEXT: mov.b32 %r3, 0; +; CHECK-SM90A-NEXT: mov.b32 %r4, %r3; +; CHECK-SM90A-NEXT: mov.b32 %r2, %r4; +; CHECK-SM90A-NEXT: mov.b32 %r1, %r3; +; CHECK-SM90A-NEXT: // begin inline asm +; CHECK-SM90A-NEXT: // nop +; CHECK-SM90A-NEXT: // end inline asm +; CHECK-SM90A-NEXT: mul.rn.f32 %r5, %r2, 0f00000000; +; CHECK-SM90A-NEXT: mul.rn.f32 %r6, %r1, 0f00000000; +; CHECK-SM90A-NEXT: st.v2.b32 [%rd1], {%r6, %r5}; +; CHECK-SM90A-NEXT: ret; +; +; CHECK-SM100-LABEL: inlineasm( +; CHECK-SM100: { +; CHECK-SM100-NEXT: .reg .b32 %r<6>; +; CHECK-SM100-NEXT: .reg .b64 %rd<5>; +; CHECK-SM100-EMPTY: +; CHECK-SM100-NEXT: // %bb.0: +; CHECK-SM100-NEXT: ld.param.b64 %rd1, [inlineasm_param_0]; +; CHECK-SM100-NEXT: mov.b32 %r3, 0; +; CHECK-SM100-NEXT: mov.b32 %r4, %r3; +; CHECK-SM100-NEXT: mov.b32 %r2, %r4; +; CHECK-SM100-NEXT: mov.b32 %r1, %r3; +; CHECK-SM100-NEXT: // begin inline asm +; CHECK-SM100-NEXT: // nop +; CHECK-SM100-NEXT: // end inline asm +; CHECK-SM100-NEXT: mov.b64 %rd2, {%r1, %r2}; +; CHECK-SM100-NEXT: mov.b32 %r5, 0f00000000; +; CHECK-SM100-NEXT: mov.b64 %rd3, {%r5, %r5}; +; CHECK-SM100-NEXT: mul.rn.f32x2 %rd4, %rd2, %rd3; +; CHECK-SM100-NEXT: st.b64 [%rd1], %rd4; +; CHECK-SM100-NEXT: ret; + %r = call { <1 x float>, <1 x float> } asm sideeffect "// nop", "=f,=f,0,1"(<1 x float> zeroinitializer, <1 x float> zeroinitializer) + %i0 = extractvalue { <1 x float>, <1 x float> } %r, 0 + %i1 = extractvalue { <1 x float>, <1 x float> } %r, 1 + %i4 = shufflevector <1 x float> %i0, <1 x float> %i1, <2 x i32> + %mul = fmul < 2 x float> %i4, zeroinitializer + store <2 x float> %mul, ptr %p, align 8 + ret void +} +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; CHECK: {{.*}} diff --git a/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll b/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll index 217bb483682ff..a90cfff51e2c6 100644 --- a/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll @@ -1938,16 +1938,29 @@ define <2 x i64> @test_fptoui_2xi64(<2 x float> %a) #0 { } define <2 x float> @test_uitofp_2xi32(<2 x i32> %a) #0 { -; CHECK-LABEL: test_uitofp_2xi32( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<5>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_uitofp_2xi32_param_0]; -; CHECK-NEXT: cvt.rn.f32.u32 %r3, %r2; -; CHECK-NEXT: cvt.rn.f32.u32 %r4, %r1; -; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; -; CHECK-NEXT: ret; +; CHECK-NOF32X2-LABEL: test_uitofp_2xi32( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .b32 %r<5>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_uitofp_2xi32_param_0]; +; CHECK-NOF32X2-NEXT: cvt.rn.f32.u32 %r3, %r2; +; CHECK-NOF32X2-NEXT: cvt.rn.f32.u32 %r4, %r1; +; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_uitofp_2xi32( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .b32 %r<5>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<2>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_uitofp_2xi32_param_0]; +; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd1; +; CHECK-F32X2-NEXT: cvt.rn.f32.u32 %r3, %r2; +; CHECK-F32X2-NEXT: cvt.rn.f32.u32 %r4, %r1; +; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-F32X2-NEXT: ret; %r = uitofp <2 x i32> %a to <2 x float> ret <2 x float> %r } @@ -1969,16 +1982,29 @@ define <2 x float> @test_uitofp_2xi64(<2 x i64> %a) #0 { } define <2 x float> @test_sitofp_2xi32(<2 x i32> %a) #0 { -; CHECK-LABEL: test_sitofp_2xi32( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<5>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_sitofp_2xi32_param_0]; -; CHECK-NEXT: cvt.rn.f32.s32 %r3, %r2; -; CHECK-NEXT: cvt.rn.f32.s32 %r4, %r1; -; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; -; CHECK-NEXT: ret; +; CHECK-NOF32X2-LABEL: test_sitofp_2xi32( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .b32 %r<5>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_sitofp_2xi32_param_0]; +; CHECK-NOF32X2-NEXT: cvt.rn.f32.s32 %r3, %r2; +; CHECK-NOF32X2-NEXT: cvt.rn.f32.s32 %r4, %r1; +; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_sitofp_2xi32( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .b32 %r<5>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<2>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_sitofp_2xi32_param_0]; +; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd1; +; CHECK-F32X2-NEXT: cvt.rn.f32.s32 %r3, %r2; +; CHECK-F32X2-NEXT: cvt.rn.f32.s32 %r4, %r1; +; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-F32X2-NEXT: ret; %r = sitofp <2 x i32> %a to <2 x float> ret <2 x float> %r } @@ -2017,16 +2043,17 @@ define <2 x float> @test_uitofp_2xi32_fadd(<2 x i32> %a, <2 x float> %b) #0 { ; CHECK-F32X2-LABEL: test_uitofp_2xi32_fadd( ; CHECK-F32X2: { ; CHECK-F32X2-NEXT: .reg .b32 %r<5>; -; CHECK-F32X2-NEXT: .reg .b64 %rd<4>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<5>; ; CHECK-F32X2-EMPTY: ; CHECK-F32X2-NEXT: // %bb.0: -; CHECK-F32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_uitofp_2xi32_fadd_param_0]; -; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_uitofp_2xi32_fadd_param_1]; +; CHECK-F32X2-NEXT: ld.param.b64 %rd2, [test_uitofp_2xi32_fadd_param_1]; +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_uitofp_2xi32_fadd_param_0]; +; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd1; ; CHECK-F32X2-NEXT: cvt.rn.f32.u32 %r3, %r2; ; CHECK-F32X2-NEXT: cvt.rn.f32.u32 %r4, %r1; -; CHECK-F32X2-NEXT: mov.b64 %rd2, {%r4, %r3}; -; CHECK-F32X2-NEXT: add.rn.f32x2 %rd3, %rd1, %rd2; -; CHECK-F32X2-NEXT: st.param.b64 [func_retval0], %rd3; +; CHECK-F32X2-NEXT: mov.b64 %rd3, {%r4, %r3}; +; CHECK-F32X2-NEXT: add.rn.f32x2 %rd4, %rd2, %rd3; +; CHECK-F32X2-NEXT: st.param.b64 [func_retval0], %rd4; ; CHECK-F32X2-NEXT: ret; %c = uitofp <2 x i32> %a to <2 x float> %r = fadd <2 x float> %b, %c @@ -2114,14 +2141,23 @@ define <2 x i32> @test_bitcast_2xfloat_to_2xi32(<2 x float> %a) #0 { } define <2 x float> @test_bitcast_2xi32_to_2xfloat(<2 x i32> %a) #0 { -; CHECK-LABEL: test_bitcast_2xi32_to_2xfloat( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<3>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_bitcast_2xi32_to_2xfloat_param_0]; -; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r1, %r2}; -; CHECK-NEXT: ret; +; CHECK-NOF32X2-LABEL: test_bitcast_2xi32_to_2xfloat( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .b32 %r<3>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_bitcast_2xi32_to_2xfloat_param_0]; +; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r1, %r2}; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_bitcast_2xi32_to_2xfloat( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .b64 %rd<2>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_bitcast_2xi32_to_2xfloat_param_0]; +; CHECK-F32X2-NEXT: st.param.b64 [func_retval0], %rd1; +; CHECK-F32X2-NEXT: ret; %r = bitcast <2 x i32> %a to <2 x float> ret <2 x float> %r } @@ -2851,31 +2887,57 @@ define <2 x float> @test_insertelement(<2 x float> %a, float %x) #0 { } define <2 x float> @test_sitofp_2xi32_to_2xfloat(<2 x i32> %a) #0 { -; CHECK-LABEL: test_sitofp_2xi32_to_2xfloat( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<5>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_sitofp_2xi32_to_2xfloat_param_0]; -; CHECK-NEXT: cvt.rn.f32.s32 %r3, %r2; -; CHECK-NEXT: cvt.rn.f32.s32 %r4, %r1; -; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; -; CHECK-NEXT: ret; +; CHECK-NOF32X2-LABEL: test_sitofp_2xi32_to_2xfloat( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .b32 %r<5>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_sitofp_2xi32_to_2xfloat_param_0]; +; CHECK-NOF32X2-NEXT: cvt.rn.f32.s32 %r3, %r2; +; CHECK-NOF32X2-NEXT: cvt.rn.f32.s32 %r4, %r1; +; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_sitofp_2xi32_to_2xfloat( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .b32 %r<5>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<2>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_sitofp_2xi32_to_2xfloat_param_0]; +; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd1; +; CHECK-F32X2-NEXT: cvt.rn.f32.s32 %r3, %r2; +; CHECK-F32X2-NEXT: cvt.rn.f32.s32 %r4, %r1; +; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-F32X2-NEXT: ret; %r = sitofp <2 x i32> %a to <2 x float> ret <2 x float> %r } define <2 x float> @test_uitofp_2xi32_to_2xfloat(<2 x i32> %a) #0 { -; CHECK-LABEL: test_uitofp_2xi32_to_2xfloat( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<5>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_uitofp_2xi32_to_2xfloat_param_0]; -; CHECK-NEXT: cvt.rn.f32.u32 %r3, %r2; -; CHECK-NEXT: cvt.rn.f32.u32 %r4, %r1; -; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; -; CHECK-NEXT: ret; +; CHECK-NOF32X2-LABEL: test_uitofp_2xi32_to_2xfloat( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .b32 %r<5>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_uitofp_2xi32_to_2xfloat_param_0]; +; CHECK-NOF32X2-NEXT: cvt.rn.f32.u32 %r3, %r2; +; CHECK-NOF32X2-NEXT: cvt.rn.f32.u32 %r4, %r1; +; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_uitofp_2xi32_to_2xfloat( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .b32 %r<5>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<2>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_uitofp_2xi32_to_2xfloat_param_0]; +; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd1; +; CHECK-F32X2-NEXT: cvt.rn.f32.u32 %r3, %r2; +; CHECK-F32X2-NEXT: cvt.rn.f32.u32 %r4, %r1; +; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-F32X2-NEXT: ret; %r = uitofp <2 x i32> %a to <2 x float> ret <2 x float> %r } diff --git a/llvm/test/CodeGen/NVPTX/ldg-invariant-256.ll b/llvm/test/CodeGen/NVPTX/ldg-invariant-256.ll index d219493d2b31b..3fac29f74125b 100644 --- a/llvm/test/CodeGen/NVPTX/ldg-invariant-256.ll +++ b/llvm/test/CodeGen/NVPTX/ldg-invariant-256.ll @@ -346,15 +346,19 @@ define i32 @ld_global_v8i32(ptr addrspace(1) %ptr) { ; SM100-LABEL: ld_global_v8i32( ; SM100: { ; SM100-NEXT: .reg .b32 %r<16>; -; SM100-NEXT: .reg .b64 %rd<2>; +; SM100-NEXT: .reg .b64 %rd<6>; ; SM100-EMPTY: ; SM100-NEXT: // %bb.0: ; SM100-NEXT: ld.param.b64 %rd1, [ld_global_v8i32_param_0]; -; SM100-NEXT: ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1]; -; SM100-NEXT: add.s32 %r9, %r1, %r2; -; SM100-NEXT: add.s32 %r10, %r3, %r4; -; SM100-NEXT: add.s32 %r11, %r5, %r6; -; SM100-NEXT: add.s32 %r12, %r7, %r8; +; SM100-NEXT: ld.global.nc.v4.b64 {%rd2, %rd3, %rd4, %rd5}, [%rd1]; +; SM100-NEXT: mov.b64 {%r1, %r2}, %rd5; +; SM100-NEXT: mov.b64 {%r3, %r4}, %rd4; +; SM100-NEXT: mov.b64 {%r5, %r6}, %rd3; +; SM100-NEXT: mov.b64 {%r7, %r8}, %rd2; +; SM100-NEXT: add.s32 %r9, %r7, %r8; +; SM100-NEXT: add.s32 %r10, %r5, %r6; +; SM100-NEXT: add.s32 %r11, %r3, %r4; +; SM100-NEXT: add.s32 %r12, %r1, %r2; ; SM100-NEXT: add.s32 %r13, %r9, %r10; ; SM100-NEXT: add.s32 %r14, %r11, %r12; ; SM100-NEXT: add.s32 %r15, %r13, %r14; diff --git a/llvm/test/CodeGen/NVPTX/load-store-256-addressing-invariant.ll b/llvm/test/CodeGen/NVPTX/load-store-256-addressing-invariant.ll index 12e3287e73f0f..57852451c0c72 100644 --- a/llvm/test/CodeGen/NVPTX/load-store-256-addressing-invariant.ll +++ b/llvm/test/CodeGen/NVPTX/load-store-256-addressing-invariant.ll @@ -82,11 +82,11 @@ define void @avar_bfloat() { define void @avar_i32() { ; PTX-LABEL: avar_i32( ; PTX: { -; PTX-NEXT: .reg .b32 %r<9>; +; PTX-NEXT: .reg .b64 %rd<5>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: -; PTX-NEXT: ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin]; -; PTX-NEXT: st.global.v8.b32 [globalout], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; +; PTX-NEXT: ld.global.nc.v4.b64 {%rd1, %rd2, %rd3, %rd4}, [globalin]; +; PTX-NEXT: st.global.v4.b64 [globalout], {%rd1, %rd2, %rd3, %rd4}; ; PTX-NEXT: ret; %load = load <8 x i32>, ptr addrspace(1) @globalin, !invariant.load !0 store <8 x i32> %load, ptr addrspace(1) @globalout @@ -202,11 +202,11 @@ define void @asi_bfloat() { define void @asi_i32() { ; PTX-LABEL: asi_i32( ; PTX: { -; PTX-NEXT: .reg .b32 %r<9>; +; PTX-NEXT: .reg .b64 %rd<5>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: -; PTX-NEXT: ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin+32]; -; PTX-NEXT: st.global.v8.b32 [globalout+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; +; PTX-NEXT: ld.global.nc.v4.b64 {%rd1, %rd2, %rd3, %rd4}, [globalin+32]; +; PTX-NEXT: st.global.v4.b64 [globalout+32], {%rd1, %rd2, %rd3, %rd4}; ; PTX-NEXT: ret; %in.offset = getelementptr inbounds i8, ptr addrspace(1) @globalin, i32 32 %load = load <8 x i32>, ptr addrspace(1) %in.offset, !invariant.load !0 @@ -331,14 +331,13 @@ define void @areg_64_bfloat(ptr addrspace(1) %in, ptr addrspace(1) %out) { define void @areg_64_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; PTX-LABEL: areg_64_i32( ; PTX: { -; PTX-NEXT: .reg .b32 %r<9>; -; PTX-NEXT: .reg .b64 %rd<3>; +; PTX-NEXT: .reg .b64 %rd<7>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: ; PTX-NEXT: ld.param.b64 %rd1, [areg_64_i32_param_0]; -; PTX-NEXT: ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1]; -; PTX-NEXT: ld.param.b64 %rd2, [areg_64_i32_param_1]; -; PTX-NEXT: st.global.v8.b32 [%rd2], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; +; PTX-NEXT: ld.global.nc.v4.b64 {%rd2, %rd3, %rd4, %rd5}, [%rd1]; +; PTX-NEXT: ld.param.b64 %rd6, [areg_64_i32_param_1]; +; PTX-NEXT: st.global.v4.b64 [%rd6], {%rd2, %rd3, %rd4, %rd5}; ; PTX-NEXT: ret; %load = load <8 x i32>, ptr addrspace(1) %in, !invariant.load !0 store <8 x i32> %load, ptr addrspace(1) %out @@ -472,14 +471,13 @@ define void @ari_64_bfloat(ptr addrspace(1) %in, ptr addrspace(1) %out) { define void @ari_64_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; PTX-LABEL: ari_64_i32( ; PTX: { -; PTX-NEXT: .reg .b32 %r<9>; -; PTX-NEXT: .reg .b64 %rd<3>; +; PTX-NEXT: .reg .b64 %rd<7>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: ; PTX-NEXT: ld.param.b64 %rd1, [ari_64_i32_param_0]; ; PTX-NEXT: ld.param.b64 %rd2, [ari_64_i32_param_1]; -; PTX-NEXT: ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1+32]; -; PTX-NEXT: st.global.v8.b32 [%rd2+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; +; PTX-NEXT: ld.global.nc.v4.b64 {%rd3, %rd4, %rd5, %rd6}, [%rd1+32]; +; PTX-NEXT: st.global.v4.b64 [%rd2+32], {%rd3, %rd4, %rd5, %rd6}; ; PTX-NEXT: ret; %in.offset = getelementptr inbounds i8, ptr addrspace(1) %in, i32 32 %load = load <8 x i32>, ptr addrspace(1) %in.offset, !invariant.load !0 diff --git a/llvm/test/CodeGen/NVPTX/load-store-256-addressing.ll b/llvm/test/CodeGen/NVPTX/load-store-256-addressing.ll index b7fa1dd5f2c4d..21604dfbf0013 100644 --- a/llvm/test/CodeGen/NVPTX/load-store-256-addressing.ll +++ b/llvm/test/CodeGen/NVPTX/load-store-256-addressing.ll @@ -78,11 +78,11 @@ define void @avar_bfloat() { define void @avar_i32() { ; PTX-LABEL: avar_i32( ; PTX: { -; PTX-NEXT: .reg .b32 %r<9>; +; PTX-NEXT: .reg .b64 %rd<5>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: -; PTX-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin]; -; PTX-NEXT: st.global.v8.b32 [globalout], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; +; PTX-NEXT: ld.global.v4.b64 {%rd1, %rd2, %rd3, %rd4}, [globalin]; +; PTX-NEXT: st.global.v4.b64 [globalout], {%rd1, %rd2, %rd3, %rd4}; ; PTX-NEXT: ret; %load = load <8 x i32>, ptr addrspace(1) @globalin store <8 x i32> %load, ptr addrspace(1) @globalout @@ -198,11 +198,11 @@ define void @asi_bfloat() { define void @asi_i32() { ; PTX-LABEL: asi_i32( ; PTX: { -; PTX-NEXT: .reg .b32 %r<9>; +; PTX-NEXT: .reg .b64 %rd<5>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: -; PTX-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin+32]; -; PTX-NEXT: st.global.v8.b32 [globalout+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; +; PTX-NEXT: ld.global.v4.b64 {%rd1, %rd2, %rd3, %rd4}, [globalin+32]; +; PTX-NEXT: st.global.v4.b64 [globalout+32], {%rd1, %rd2, %rd3, %rd4}; ; PTX-NEXT: ret; %in.offset = getelementptr inbounds i8, ptr addrspace(1) @globalin, i32 32 %load = load <8 x i32>, ptr addrspace(1) %in.offset @@ -327,14 +327,13 @@ define void @areg_64_bfloat(ptr addrspace(1) %in, ptr addrspace(1) %out) { define void @areg_64_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; PTX-LABEL: areg_64_i32( ; PTX: { -; PTX-NEXT: .reg .b32 %r<9>; -; PTX-NEXT: .reg .b64 %rd<3>; +; PTX-NEXT: .reg .b64 %rd<7>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: ; PTX-NEXT: ld.param.b64 %rd1, [areg_64_i32_param_0]; -; PTX-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1]; -; PTX-NEXT: ld.param.b64 %rd2, [areg_64_i32_param_1]; -; PTX-NEXT: st.global.v8.b32 [%rd2], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; +; PTX-NEXT: ld.global.v4.b64 {%rd2, %rd3, %rd4, %rd5}, [%rd1]; +; PTX-NEXT: ld.param.b64 %rd6, [areg_64_i32_param_1]; +; PTX-NEXT: st.global.v4.b64 [%rd6], {%rd2, %rd3, %rd4, %rd5}; ; PTX-NEXT: ret; %load = load <8 x i32>, ptr addrspace(1) %in store <8 x i32> %load, ptr addrspace(1) %out @@ -468,14 +467,13 @@ define void @ari_64_bfloat(ptr addrspace(1) %in, ptr addrspace(1) %out) { define void @ari_64_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; PTX-LABEL: ari_64_i32( ; PTX: { -; PTX-NEXT: .reg .b32 %r<9>; -; PTX-NEXT: .reg .b64 %rd<3>; +; PTX-NEXT: .reg .b64 %rd<7>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: ; PTX-NEXT: ld.param.b64 %rd1, [ari_64_i32_param_0]; ; PTX-NEXT: ld.param.b64 %rd2, [ari_64_i32_param_1]; -; PTX-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1+32]; -; PTX-NEXT: st.global.v8.b32 [%rd2+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; +; PTX-NEXT: ld.global.v4.b64 {%rd3, %rd4, %rd5, %rd6}, [%rd1+32]; +; PTX-NEXT: st.global.v4.b64 [%rd2+32], {%rd3, %rd4, %rd5, %rd6}; ; PTX-NEXT: ret; %in.offset = getelementptr inbounds i8, ptr addrspace(1) %in, i32 32 %load = load <8 x i32>, ptr addrspace(1) %in.offset diff --git a/llvm/test/CodeGen/NVPTX/load-store-vectors-256.ll b/llvm/test/CodeGen/NVPTX/load-store-vectors-256.ll index e8b43ad28ad27..b5319935f0f9d 100644 --- a/llvm/test/CodeGen/NVPTX/load-store-vectors-256.ll +++ b/llvm/test/CodeGen/NVPTX/load-store-vectors-256.ll @@ -100,19 +100,32 @@ define void @generic_16xbfloat(ptr %a, ptr %b) { } define void @generic_8xi32(ptr %a, ptr %b) { -; CHECK-LABEL: generic_8xi32( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<9>; -; CHECK-NEXT: .reg .b64 %rd<3>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd1, [generic_8xi32_param_0]; -; CHECK-NEXT: ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; -; CHECK-NEXT: ld.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; -; CHECK-NEXT: ld.param.b64 %rd2, [generic_8xi32_param_1]; -; CHECK-NEXT: st.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; -; CHECK-NEXT: st.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; -; CHECK-NEXT: ret; +; SM90-LABEL: generic_8xi32( +; SM90: { +; SM90-NEXT: .reg .b32 %r<9>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [generic_8xi32_param_0]; +; SM90-NEXT: ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; +; SM90-NEXT: ld.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; +; SM90-NEXT: ld.param.b64 %rd2, [generic_8xi32_param_1]; +; SM90-NEXT: st.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; +; SM90-NEXT: st.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; +; SM90-NEXT: ret; +; +; SM100-LABEL: generic_8xi32( +; SM100: { +; SM100-NEXT: .reg .b64 %rd<7>; +; SM100-EMPTY: +; SM100-NEXT: // %bb.0: +; SM100-NEXT: ld.param.b64 %rd1, [generic_8xi32_param_0]; +; SM100-NEXT: ld.v2.b64 {%rd2, %rd3}, [%rd1]; +; SM100-NEXT: ld.v2.b64 {%rd4, %rd5}, [%rd1+16]; +; SM100-NEXT: ld.param.b64 %rd6, [generic_8xi32_param_1]; +; SM100-NEXT: st.v2.b64 [%rd6+16], {%rd4, %rd5}; +; SM100-NEXT: st.v2.b64 [%rd6], {%rd2, %rd3}; +; SM100-NEXT: ret; %a.load = load <8 x i32>, ptr %a store <8 x i32> %a.load, ptr %b ret void @@ -265,19 +278,32 @@ define void @generic_volatile_16xbfloat(ptr %a, ptr %b) { } define void @generic_volatile_8xi32(ptr %a, ptr %b) { -; CHECK-LABEL: generic_volatile_8xi32( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<9>; -; CHECK-NEXT: .reg .b64 %rd<3>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd1, [generic_volatile_8xi32_param_0]; -; CHECK-NEXT: ld.volatile.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; -; CHECK-NEXT: ld.volatile.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; -; CHECK-NEXT: ld.param.b64 %rd2, [generic_volatile_8xi32_param_1]; -; CHECK-NEXT: st.volatile.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; -; CHECK-NEXT: st.volatile.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; -; CHECK-NEXT: ret; +; SM90-LABEL: generic_volatile_8xi32( +; SM90: { +; SM90-NEXT: .reg .b32 %r<9>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [generic_volatile_8xi32_param_0]; +; SM90-NEXT: ld.volatile.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; +; SM90-NEXT: ld.volatile.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; +; SM90-NEXT: ld.param.b64 %rd2, [generic_volatile_8xi32_param_1]; +; SM90-NEXT: st.volatile.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; +; SM90-NEXT: st.volatile.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; +; SM90-NEXT: ret; +; +; SM100-LABEL: generic_volatile_8xi32( +; SM100: { +; SM100-NEXT: .reg .b64 %rd<7>; +; SM100-EMPTY: +; SM100-NEXT: // %bb.0: +; SM100-NEXT: ld.param.b64 %rd1, [generic_volatile_8xi32_param_0]; +; SM100-NEXT: ld.volatile.v2.b64 {%rd2, %rd3}, [%rd1]; +; SM100-NEXT: ld.volatile.v2.b64 {%rd4, %rd5}, [%rd1+16]; +; SM100-NEXT: ld.param.b64 %rd6, [generic_volatile_8xi32_param_1]; +; SM100-NEXT: st.volatile.v2.b64 [%rd6+16], {%rd4, %rd5}; +; SM100-NEXT: st.volatile.v2.b64 [%rd6], {%rd2, %rd3}; +; SM100-NEXT: ret; %a.load = load volatile <8 x i32>, ptr %a store volatile <8 x i32> %a.load, ptr %b ret void @@ -496,14 +522,13 @@ define void @global_8xi32(ptr addrspace(1) %a, ptr addrspace(1) %b) { ; ; SM100-LABEL: global_8xi32( ; SM100: { -; SM100-NEXT: .reg .b32 %r<9>; -; SM100-NEXT: .reg .b64 %rd<3>; +; SM100-NEXT: .reg .b64 %rd<7>; ; SM100-EMPTY: ; SM100-NEXT: // %bb.0: ; SM100-NEXT: ld.param.b64 %rd1, [global_8xi32_param_0]; -; SM100-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1]; -; SM100-NEXT: ld.param.b64 %rd2, [global_8xi32_param_1]; -; SM100-NEXT: st.global.v8.b32 [%rd2], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; +; SM100-NEXT: ld.global.v4.b64 {%rd2, %rd3, %rd4, %rd5}, [%rd1]; +; SM100-NEXT: ld.param.b64 %rd6, [global_8xi32_param_1]; +; SM100-NEXT: st.global.v4.b64 [%rd6], {%rd2, %rd3, %rd4, %rd5}; ; SM100-NEXT: ret; %a.load = load <8 x i32>, ptr addrspace(1) %a store <8 x i32> %a.load, ptr addrspace(1) %b @@ -741,14 +766,13 @@ define void @global_volatile_8xi32(ptr addrspace(1) %a, ptr addrspace(1) %b) { ; ; SM100-LABEL: global_volatile_8xi32( ; SM100: { -; SM100-NEXT: .reg .b32 %r<9>; -; SM100-NEXT: .reg .b64 %rd<3>; +; SM100-NEXT: .reg .b64 %rd<7>; ; SM100-EMPTY: ; SM100-NEXT: // %bb.0: ; SM100-NEXT: ld.param.b64 %rd1, [global_volatile_8xi32_param_0]; -; SM100-NEXT: ld.volatile.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1]; -; SM100-NEXT: ld.param.b64 %rd2, [global_volatile_8xi32_param_1]; -; SM100-NEXT: st.volatile.global.v8.b32 [%rd2], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; +; SM100-NEXT: ld.volatile.global.v4.b64 {%rd2, %rd3, %rd4, %rd5}, [%rd1]; +; SM100-NEXT: ld.param.b64 %rd6, [global_volatile_8xi32_param_1]; +; SM100-NEXT: st.volatile.global.v4.b64 [%rd6], {%rd2, %rd3, %rd4, %rd5}; ; SM100-NEXT: ret; %a.load = load volatile <8 x i32>, ptr addrspace(1) %a store volatile <8 x i32> %a.load, ptr addrspace(1) %b @@ -924,19 +948,32 @@ define void @shared_16xbfloat(ptr addrspace(3) %a, ptr addrspace(3) %b) { } define void @shared_8xi32(ptr addrspace(3) %a, ptr addrspace(3) %b) { -; CHECK-LABEL: shared_8xi32( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<9>; -; CHECK-NEXT: .reg .b64 %rd<3>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd1, [shared_8xi32_param_0]; -; CHECK-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; -; CHECK-NEXT: ld.shared.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; -; CHECK-NEXT: ld.param.b64 %rd2, [shared_8xi32_param_1]; -; CHECK-NEXT: st.shared.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; -; CHECK-NEXT: st.shared.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; -; CHECK-NEXT: ret; +; SM90-LABEL: shared_8xi32( +; SM90: { +; SM90-NEXT: .reg .b32 %r<9>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [shared_8xi32_param_0]; +; SM90-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; +; SM90-NEXT: ld.shared.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; +; SM90-NEXT: ld.param.b64 %rd2, [shared_8xi32_param_1]; +; SM90-NEXT: st.shared.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; +; SM90-NEXT: st.shared.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; +; SM90-NEXT: ret; +; +; SM100-LABEL: shared_8xi32( +; SM100: { +; SM100-NEXT: .reg .b64 %rd<7>; +; SM100-EMPTY: +; SM100-NEXT: // %bb.0: +; SM100-NEXT: ld.param.b64 %rd1, [shared_8xi32_param_0]; +; SM100-NEXT: ld.shared.v2.b64 {%rd2, %rd3}, [%rd1]; +; SM100-NEXT: ld.shared.v2.b64 {%rd4, %rd5}, [%rd1+16]; +; SM100-NEXT: ld.param.b64 %rd6, [shared_8xi32_param_1]; +; SM100-NEXT: st.shared.v2.b64 [%rd6+16], {%rd4, %rd5}; +; SM100-NEXT: st.shared.v2.b64 [%rd6], {%rd2, %rd3}; +; SM100-NEXT: ret; %a.load = load <8 x i32>, ptr addrspace(3) %a store <8 x i32> %a.load, ptr addrspace(3) %b ret void @@ -1089,19 +1126,32 @@ define void @shared_volatile_16xbfloat(ptr addrspace(3) %a, ptr addrspace(3) %b) } define void @shared_volatile_8xi32(ptr addrspace(3) %a, ptr addrspace(3) %b) { -; CHECK-LABEL: shared_volatile_8xi32( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<9>; -; CHECK-NEXT: .reg .b64 %rd<3>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd1, [shared_volatile_8xi32_param_0]; -; CHECK-NEXT: ld.volatile.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; -; CHECK-NEXT: ld.volatile.shared.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; -; CHECK-NEXT: ld.param.b64 %rd2, [shared_volatile_8xi32_param_1]; -; CHECK-NEXT: st.volatile.shared.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; -; CHECK-NEXT: st.volatile.shared.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; -; CHECK-NEXT: ret; +; SM90-LABEL: shared_volatile_8xi32( +; SM90: { +; SM90-NEXT: .reg .b32 %r<9>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [shared_volatile_8xi32_param_0]; +; SM90-NEXT: ld.volatile.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; +; SM90-NEXT: ld.volatile.shared.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; +; SM90-NEXT: ld.param.b64 %rd2, [shared_volatile_8xi32_param_1]; +; SM90-NEXT: st.volatile.shared.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; +; SM90-NEXT: st.volatile.shared.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; +; SM90-NEXT: ret; +; +; SM100-LABEL: shared_volatile_8xi32( +; SM100: { +; SM100-NEXT: .reg .b64 %rd<7>; +; SM100-EMPTY: +; SM100-NEXT: // %bb.0: +; SM100-NEXT: ld.param.b64 %rd1, [shared_volatile_8xi32_param_0]; +; SM100-NEXT: ld.volatile.shared.v2.b64 {%rd2, %rd3}, [%rd1]; +; SM100-NEXT: ld.volatile.shared.v2.b64 {%rd4, %rd5}, [%rd1+16]; +; SM100-NEXT: ld.param.b64 %rd6, [shared_volatile_8xi32_param_1]; +; SM100-NEXT: st.volatile.shared.v2.b64 [%rd6+16], {%rd4, %rd5}; +; SM100-NEXT: st.volatile.shared.v2.b64 [%rd6], {%rd2, %rd3}; +; SM100-NEXT: ret; %a.load = load volatile <8 x i32>, ptr addrspace(3) %a store volatile <8 x i32> %a.load, ptr addrspace(3) %b ret void @@ -1256,19 +1306,32 @@ define void @local_16xbfloat(ptr addrspace(5) %a, ptr addrspace(5) %b) { } define void @local_8xi32(ptr addrspace(5) %a, ptr addrspace(5) %b) { -; CHECK-LABEL: local_8xi32( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<9>; -; CHECK-NEXT: .reg .b64 %rd<3>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd1, [local_8xi32_param_0]; -; CHECK-NEXT: ld.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; -; CHECK-NEXT: ld.local.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; -; CHECK-NEXT: ld.param.b64 %rd2, [local_8xi32_param_1]; -; CHECK-NEXT: st.local.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; -; CHECK-NEXT: st.local.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; -; CHECK-NEXT: ret; +; SM90-LABEL: local_8xi32( +; SM90: { +; SM90-NEXT: .reg .b32 %r<9>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [local_8xi32_param_0]; +; SM90-NEXT: ld.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; +; SM90-NEXT: ld.local.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; +; SM90-NEXT: ld.param.b64 %rd2, [local_8xi32_param_1]; +; SM90-NEXT: st.local.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; +; SM90-NEXT: st.local.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; +; SM90-NEXT: ret; +; +; SM100-LABEL: local_8xi32( +; SM100: { +; SM100-NEXT: .reg .b64 %rd<7>; +; SM100-EMPTY: +; SM100-NEXT: // %bb.0: +; SM100-NEXT: ld.param.b64 %rd1, [local_8xi32_param_0]; +; SM100-NEXT: ld.local.v2.b64 {%rd2, %rd3}, [%rd1]; +; SM100-NEXT: ld.local.v2.b64 {%rd4, %rd5}, [%rd1+16]; +; SM100-NEXT: ld.param.b64 %rd6, [local_8xi32_param_1]; +; SM100-NEXT: st.local.v2.b64 [%rd6+16], {%rd4, %rd5}; +; SM100-NEXT: st.local.v2.b64 [%rd6], {%rd2, %rd3}; +; SM100-NEXT: ret; %a.load = load <8 x i32>, ptr addrspace(5) %a store <8 x i32> %a.load, ptr addrspace(5) %b ret void @@ -1421,19 +1484,32 @@ define void @local_volatile_16xbfloat(ptr addrspace(5) %a, ptr addrspace(5) %b) } define void @local_volatile_8xi32(ptr addrspace(5) %a, ptr addrspace(5) %b) { -; CHECK-LABEL: local_volatile_8xi32( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<9>; -; CHECK-NEXT: .reg .b64 %rd<3>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd1, [local_volatile_8xi32_param_0]; -; CHECK-NEXT: ld.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; -; CHECK-NEXT: ld.local.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; -; CHECK-NEXT: ld.param.b64 %rd2, [local_volatile_8xi32_param_1]; -; CHECK-NEXT: st.local.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; -; CHECK-NEXT: st.local.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; -; CHECK-NEXT: ret; +; SM90-LABEL: local_volatile_8xi32( +; SM90: { +; SM90-NEXT: .reg .b32 %r<9>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [local_volatile_8xi32_param_0]; +; SM90-NEXT: ld.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; +; SM90-NEXT: ld.local.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; +; SM90-NEXT: ld.param.b64 %rd2, [local_volatile_8xi32_param_1]; +; SM90-NEXT: st.local.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; +; SM90-NEXT: st.local.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; +; SM90-NEXT: ret; +; +; SM100-LABEL: local_volatile_8xi32( +; SM100: { +; SM100-NEXT: .reg .b64 %rd<7>; +; SM100-EMPTY: +; SM100-NEXT: // %bb.0: +; SM100-NEXT: ld.param.b64 %rd1, [local_volatile_8xi32_param_0]; +; SM100-NEXT: ld.local.v2.b64 {%rd2, %rd3}, [%rd1]; +; SM100-NEXT: ld.local.v2.b64 {%rd4, %rd5}, [%rd1+16]; +; SM100-NEXT: ld.param.b64 %rd6, [local_volatile_8xi32_param_1]; +; SM100-NEXT: st.local.v2.b64 [%rd6+16], {%rd4, %rd5}; +; SM100-NEXT: st.local.v2.b64 [%rd6], {%rd2, %rd3}; +; SM100-NEXT: ret; %a.load = load volatile <8 x i32>, ptr addrspace(5) %a store volatile <8 x i32> %a.load, ptr addrspace(5) %b ret void diff --git a/llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll b/llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll index f871e4039a558..87787ba2bf81c 100644 --- a/llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll +++ b/llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll @@ -1452,22 +1452,44 @@ define i16 @reduce_add_i16_nonpow2(<7 x i16> %in) { } define i32 @reduce_add_i32(<8 x i32> %in) { -; CHECK-LABEL: reduce_add_i32( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<16>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_add_i32_param_0+16]; -; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_add_i32_param_0]; -; CHECK-NEXT: add.s32 %r9, %r4, %r8; -; CHECK-NEXT: add.s32 %r10, %r2, %r6; -; CHECK-NEXT: add.s32 %r11, %r10, %r9; -; CHECK-NEXT: add.s32 %r12, %r3, %r7; -; CHECK-NEXT: add.s32 %r13, %r1, %r5; -; CHECK-NEXT: add.s32 %r14, %r13, %r12; -; CHECK-NEXT: add.s32 %r15, %r14, %r11; -; CHECK-NEXT: st.param.b32 [func_retval0], %r15; -; CHECK-NEXT: ret; +; CHECK-SM80-LABEL: reduce_add_i32( +; CHECK-SM80: { +; CHECK-SM80-NEXT: .reg .b32 %r<16>; +; CHECK-SM80-EMPTY: +; CHECK-SM80-NEXT: // %bb.0: +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_add_i32_param_0+16]; +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_add_i32_param_0]; +; CHECK-SM80-NEXT: add.s32 %r9, %r4, %r8; +; CHECK-SM80-NEXT: add.s32 %r10, %r2, %r6; +; CHECK-SM80-NEXT: add.s32 %r11, %r10, %r9; +; CHECK-SM80-NEXT: add.s32 %r12, %r3, %r7; +; CHECK-SM80-NEXT: add.s32 %r13, %r1, %r5; +; CHECK-SM80-NEXT: add.s32 %r14, %r13, %r12; +; CHECK-SM80-NEXT: add.s32 %r15, %r14, %r11; +; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r15; +; CHECK-SM80-NEXT: ret; +; +; CHECK-SM100-LABEL: reduce_add_i32( +; CHECK-SM100: { +; CHECK-SM100-NEXT: .reg .b32 %r<16>; +; CHECK-SM100-NEXT: .reg .b64 %rd<5>; +; CHECK-SM100-EMPTY: +; CHECK-SM100-NEXT: // %bb.0: +; CHECK-SM100-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_add_i32_param_0+16]; +; CHECK-SM100-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_add_i32_param_0]; +; CHECK-SM100-NEXT: mov.b64 {%r1, %r2}, %rd4; +; CHECK-SM100-NEXT: mov.b64 {%r3, %r4}, %rd2; +; CHECK-SM100-NEXT: add.s32 %r5, %r4, %r2; +; CHECK-SM100-NEXT: mov.b64 {%r6, %r7}, %rd3; +; CHECK-SM100-NEXT: mov.b64 {%r8, %r9}, %rd1; +; CHECK-SM100-NEXT: add.s32 %r10, %r9, %r7; +; CHECK-SM100-NEXT: add.s32 %r11, %r10, %r5; +; CHECK-SM100-NEXT: add.s32 %r12, %r3, %r1; +; CHECK-SM100-NEXT: add.s32 %r13, %r8, %r6; +; CHECK-SM100-NEXT: add.s32 %r14, %r13, %r12; +; CHECK-SM100-NEXT: add.s32 %r15, %r14, %r11; +; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r15; +; CHECK-SM100-NEXT: ret; %res = call i32 @llvm.vector.reduce.add(<8 x i32> %in) ret i32 %res } @@ -1543,22 +1565,44 @@ define i16 @reduce_mul_i16_nonpow2(<7 x i16> %in) { } define i32 @reduce_mul_i32(<8 x i32> %in) { -; CHECK-LABEL: reduce_mul_i32( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<16>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_mul_i32_param_0+16]; -; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_mul_i32_param_0]; -; CHECK-NEXT: mul.lo.s32 %r9, %r4, %r8; -; CHECK-NEXT: mul.lo.s32 %r10, %r2, %r6; -; CHECK-NEXT: mul.lo.s32 %r11, %r10, %r9; -; CHECK-NEXT: mul.lo.s32 %r12, %r3, %r7; -; CHECK-NEXT: mul.lo.s32 %r13, %r1, %r5; -; CHECK-NEXT: mul.lo.s32 %r14, %r13, %r12; -; CHECK-NEXT: mul.lo.s32 %r15, %r14, %r11; -; CHECK-NEXT: st.param.b32 [func_retval0], %r15; -; CHECK-NEXT: ret; +; CHECK-SM80-LABEL: reduce_mul_i32( +; CHECK-SM80: { +; CHECK-SM80-NEXT: .reg .b32 %r<16>; +; CHECK-SM80-EMPTY: +; CHECK-SM80-NEXT: // %bb.0: +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_mul_i32_param_0+16]; +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_mul_i32_param_0]; +; CHECK-SM80-NEXT: mul.lo.s32 %r9, %r4, %r8; +; CHECK-SM80-NEXT: mul.lo.s32 %r10, %r2, %r6; +; CHECK-SM80-NEXT: mul.lo.s32 %r11, %r10, %r9; +; CHECK-SM80-NEXT: mul.lo.s32 %r12, %r3, %r7; +; CHECK-SM80-NEXT: mul.lo.s32 %r13, %r1, %r5; +; CHECK-SM80-NEXT: mul.lo.s32 %r14, %r13, %r12; +; CHECK-SM80-NEXT: mul.lo.s32 %r15, %r14, %r11; +; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r15; +; CHECK-SM80-NEXT: ret; +; +; CHECK-SM100-LABEL: reduce_mul_i32( +; CHECK-SM100: { +; CHECK-SM100-NEXT: .reg .b32 %r<16>; +; CHECK-SM100-NEXT: .reg .b64 %rd<5>; +; CHECK-SM100-EMPTY: +; CHECK-SM100-NEXT: // %bb.0: +; CHECK-SM100-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_mul_i32_param_0+16]; +; CHECK-SM100-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_mul_i32_param_0]; +; CHECK-SM100-NEXT: mov.b64 {%r1, %r2}, %rd4; +; CHECK-SM100-NEXT: mov.b64 {%r3, %r4}, %rd2; +; CHECK-SM100-NEXT: mul.lo.s32 %r5, %r4, %r2; +; CHECK-SM100-NEXT: mov.b64 {%r6, %r7}, %rd3; +; CHECK-SM100-NEXT: mov.b64 {%r8, %r9}, %rd1; +; CHECK-SM100-NEXT: mul.lo.s32 %r10, %r9, %r7; +; CHECK-SM100-NEXT: mul.lo.s32 %r11, %r10, %r5; +; CHECK-SM100-NEXT: mul.lo.s32 %r12, %r3, %r1; +; CHECK-SM100-NEXT: mul.lo.s32 %r13, %r8, %r6; +; CHECK-SM100-NEXT: mul.lo.s32 %r14, %r13, %r12; +; CHECK-SM100-NEXT: mul.lo.s32 %r15, %r14, %r11; +; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r15; +; CHECK-SM100-NEXT: ret; %res = call i32 @llvm.vector.reduce.mul(<8 x i32> %in) ret i32 %res } @@ -1673,22 +1717,44 @@ define i16 @reduce_umax_i16_nonpow2(<7 x i16> %in) { } define i32 @reduce_umax_i32(<8 x i32> %in) { -; CHECK-LABEL: reduce_umax_i32( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<16>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_umax_i32_param_0+16]; -; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_umax_i32_param_0]; -; CHECK-NEXT: max.u32 %r9, %r4, %r8; -; CHECK-NEXT: max.u32 %r10, %r2, %r6; -; CHECK-NEXT: max.u32 %r11, %r10, %r9; -; CHECK-NEXT: max.u32 %r12, %r3, %r7; -; CHECK-NEXT: max.u32 %r13, %r1, %r5; -; CHECK-NEXT: max.u32 %r14, %r13, %r12; -; CHECK-NEXT: max.u32 %r15, %r14, %r11; -; CHECK-NEXT: st.param.b32 [func_retval0], %r15; -; CHECK-NEXT: ret; +; CHECK-SM80-LABEL: reduce_umax_i32( +; CHECK-SM80: { +; CHECK-SM80-NEXT: .reg .b32 %r<16>; +; CHECK-SM80-EMPTY: +; CHECK-SM80-NEXT: // %bb.0: +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_umax_i32_param_0+16]; +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_umax_i32_param_0]; +; CHECK-SM80-NEXT: max.u32 %r9, %r4, %r8; +; CHECK-SM80-NEXT: max.u32 %r10, %r2, %r6; +; CHECK-SM80-NEXT: max.u32 %r11, %r10, %r9; +; CHECK-SM80-NEXT: max.u32 %r12, %r3, %r7; +; CHECK-SM80-NEXT: max.u32 %r13, %r1, %r5; +; CHECK-SM80-NEXT: max.u32 %r14, %r13, %r12; +; CHECK-SM80-NEXT: max.u32 %r15, %r14, %r11; +; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r15; +; CHECK-SM80-NEXT: ret; +; +; CHECK-SM100-LABEL: reduce_umax_i32( +; CHECK-SM100: { +; CHECK-SM100-NEXT: .reg .b32 %r<16>; +; CHECK-SM100-NEXT: .reg .b64 %rd<5>; +; CHECK-SM100-EMPTY: +; CHECK-SM100-NEXT: // %bb.0: +; CHECK-SM100-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_umax_i32_param_0+16]; +; CHECK-SM100-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_umax_i32_param_0]; +; CHECK-SM100-NEXT: mov.b64 {%r1, %r2}, %rd4; +; CHECK-SM100-NEXT: mov.b64 {%r3, %r4}, %rd2; +; CHECK-SM100-NEXT: max.u32 %r5, %r4, %r2; +; CHECK-SM100-NEXT: mov.b64 {%r6, %r7}, %rd3; +; CHECK-SM100-NEXT: mov.b64 {%r8, %r9}, %rd1; +; CHECK-SM100-NEXT: max.u32 %r10, %r9, %r7; +; CHECK-SM100-NEXT: max.u32 %r11, %r10, %r5; +; CHECK-SM100-NEXT: max.u32 %r12, %r3, %r1; +; CHECK-SM100-NEXT: max.u32 %r13, %r8, %r6; +; CHECK-SM100-NEXT: max.u32 %r14, %r13, %r12; +; CHECK-SM100-NEXT: max.u32 %r15, %r14, %r11; +; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r15; +; CHECK-SM100-NEXT: ret; %res = call i32 @llvm.vector.reduce.umax(<8 x i32> %in) ret i32 %res } @@ -1803,22 +1869,44 @@ define i16 @reduce_umin_i16_nonpow2(<7 x i16> %in) { } define i32 @reduce_umin_i32(<8 x i32> %in) { -; CHECK-LABEL: reduce_umin_i32( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<16>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_umin_i32_param_0+16]; -; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_umin_i32_param_0]; -; CHECK-NEXT: min.u32 %r9, %r4, %r8; -; CHECK-NEXT: min.u32 %r10, %r2, %r6; -; CHECK-NEXT: min.u32 %r11, %r10, %r9; -; CHECK-NEXT: min.u32 %r12, %r3, %r7; -; CHECK-NEXT: min.u32 %r13, %r1, %r5; -; CHECK-NEXT: min.u32 %r14, %r13, %r12; -; CHECK-NEXT: min.u32 %r15, %r14, %r11; -; CHECK-NEXT: st.param.b32 [func_retval0], %r15; -; CHECK-NEXT: ret; +; CHECK-SM80-LABEL: reduce_umin_i32( +; CHECK-SM80: { +; CHECK-SM80-NEXT: .reg .b32 %r<16>; +; CHECK-SM80-EMPTY: +; CHECK-SM80-NEXT: // %bb.0: +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_umin_i32_param_0+16]; +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_umin_i32_param_0]; +; CHECK-SM80-NEXT: min.u32 %r9, %r4, %r8; +; CHECK-SM80-NEXT: min.u32 %r10, %r2, %r6; +; CHECK-SM80-NEXT: min.u32 %r11, %r10, %r9; +; CHECK-SM80-NEXT: min.u32 %r12, %r3, %r7; +; CHECK-SM80-NEXT: min.u32 %r13, %r1, %r5; +; CHECK-SM80-NEXT: min.u32 %r14, %r13, %r12; +; CHECK-SM80-NEXT: min.u32 %r15, %r14, %r11; +; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r15; +; CHECK-SM80-NEXT: ret; +; +; CHECK-SM100-LABEL: reduce_umin_i32( +; CHECK-SM100: { +; CHECK-SM100-NEXT: .reg .b32 %r<16>; +; CHECK-SM100-NEXT: .reg .b64 %rd<5>; +; CHECK-SM100-EMPTY: +; CHECK-SM100-NEXT: // %bb.0: +; CHECK-SM100-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_umin_i32_param_0+16]; +; CHECK-SM100-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_umin_i32_param_0]; +; CHECK-SM100-NEXT: mov.b64 {%r1, %r2}, %rd4; +; CHECK-SM100-NEXT: mov.b64 {%r3, %r4}, %rd2; +; CHECK-SM100-NEXT: min.u32 %r5, %r4, %r2; +; CHECK-SM100-NEXT: mov.b64 {%r6, %r7}, %rd3; +; CHECK-SM100-NEXT: mov.b64 {%r8, %r9}, %rd1; +; CHECK-SM100-NEXT: min.u32 %r10, %r9, %r7; +; CHECK-SM100-NEXT: min.u32 %r11, %r10, %r5; +; CHECK-SM100-NEXT: min.u32 %r12, %r3, %r1; +; CHECK-SM100-NEXT: min.u32 %r13, %r8, %r6; +; CHECK-SM100-NEXT: min.u32 %r14, %r13, %r12; +; CHECK-SM100-NEXT: min.u32 %r15, %r14, %r11; +; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r15; +; CHECK-SM100-NEXT: ret; %res = call i32 @llvm.vector.reduce.umin(<8 x i32> %in) ret i32 %res } @@ -1933,22 +2021,44 @@ define i16 @reduce_smax_i16_nonpow2(<7 x i16> %in) { } define i32 @reduce_smax_i32(<8 x i32> %in) { -; CHECK-LABEL: reduce_smax_i32( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<16>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_smax_i32_param_0+16]; -; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_smax_i32_param_0]; -; CHECK-NEXT: max.s32 %r9, %r4, %r8; -; CHECK-NEXT: max.s32 %r10, %r2, %r6; -; CHECK-NEXT: max.s32 %r11, %r10, %r9; -; CHECK-NEXT: max.s32 %r12, %r3, %r7; -; CHECK-NEXT: max.s32 %r13, %r1, %r5; -; CHECK-NEXT: max.s32 %r14, %r13, %r12; -; CHECK-NEXT: max.s32 %r15, %r14, %r11; -; CHECK-NEXT: st.param.b32 [func_retval0], %r15; -; CHECK-NEXT: ret; +; CHECK-SM80-LABEL: reduce_smax_i32( +; CHECK-SM80: { +; CHECK-SM80-NEXT: .reg .b32 %r<16>; +; CHECK-SM80-EMPTY: +; CHECK-SM80-NEXT: // %bb.0: +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_smax_i32_param_0+16]; +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_smax_i32_param_0]; +; CHECK-SM80-NEXT: max.s32 %r9, %r4, %r8; +; CHECK-SM80-NEXT: max.s32 %r10, %r2, %r6; +; CHECK-SM80-NEXT: max.s32 %r11, %r10, %r9; +; CHECK-SM80-NEXT: max.s32 %r12, %r3, %r7; +; CHECK-SM80-NEXT: max.s32 %r13, %r1, %r5; +; CHECK-SM80-NEXT: max.s32 %r14, %r13, %r12; +; CHECK-SM80-NEXT: max.s32 %r15, %r14, %r11; +; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r15; +; CHECK-SM80-NEXT: ret; +; +; CHECK-SM100-LABEL: reduce_smax_i32( +; CHECK-SM100: { +; CHECK-SM100-NEXT: .reg .b32 %r<16>; +; CHECK-SM100-NEXT: .reg .b64 %rd<5>; +; CHECK-SM100-EMPTY: +; CHECK-SM100-NEXT: // %bb.0: +; CHECK-SM100-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_smax_i32_param_0+16]; +; CHECK-SM100-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_smax_i32_param_0]; +; CHECK-SM100-NEXT: mov.b64 {%r1, %r2}, %rd4; +; CHECK-SM100-NEXT: mov.b64 {%r3, %r4}, %rd2; +; CHECK-SM100-NEXT: max.s32 %r5, %r4, %r2; +; CHECK-SM100-NEXT: mov.b64 {%r6, %r7}, %rd3; +; CHECK-SM100-NEXT: mov.b64 {%r8, %r9}, %rd1; +; CHECK-SM100-NEXT: max.s32 %r10, %r9, %r7; +; CHECK-SM100-NEXT: max.s32 %r11, %r10, %r5; +; CHECK-SM100-NEXT: max.s32 %r12, %r3, %r1; +; CHECK-SM100-NEXT: max.s32 %r13, %r8, %r6; +; CHECK-SM100-NEXT: max.s32 %r14, %r13, %r12; +; CHECK-SM100-NEXT: max.s32 %r15, %r14, %r11; +; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r15; +; CHECK-SM100-NEXT: ret; %res = call i32 @llvm.vector.reduce.smax(<8 x i32> %in) ret i32 %res } @@ -2063,22 +2173,44 @@ define i16 @reduce_smin_i16_nonpow2(<7 x i16> %in) { } define i32 @reduce_smin_i32(<8 x i32> %in) { -; CHECK-LABEL: reduce_smin_i32( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<16>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_smin_i32_param_0+16]; -; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_smin_i32_param_0]; -; CHECK-NEXT: min.s32 %r9, %r4, %r8; -; CHECK-NEXT: min.s32 %r10, %r2, %r6; -; CHECK-NEXT: min.s32 %r11, %r10, %r9; -; CHECK-NEXT: min.s32 %r12, %r3, %r7; -; CHECK-NEXT: min.s32 %r13, %r1, %r5; -; CHECK-NEXT: min.s32 %r14, %r13, %r12; -; CHECK-NEXT: min.s32 %r15, %r14, %r11; -; CHECK-NEXT: st.param.b32 [func_retval0], %r15; -; CHECK-NEXT: ret; +; CHECK-SM80-LABEL: reduce_smin_i32( +; CHECK-SM80: { +; CHECK-SM80-NEXT: .reg .b32 %r<16>; +; CHECK-SM80-EMPTY: +; CHECK-SM80-NEXT: // %bb.0: +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_smin_i32_param_0+16]; +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_smin_i32_param_0]; +; CHECK-SM80-NEXT: min.s32 %r9, %r4, %r8; +; CHECK-SM80-NEXT: min.s32 %r10, %r2, %r6; +; CHECK-SM80-NEXT: min.s32 %r11, %r10, %r9; +; CHECK-SM80-NEXT: min.s32 %r12, %r3, %r7; +; CHECK-SM80-NEXT: min.s32 %r13, %r1, %r5; +; CHECK-SM80-NEXT: min.s32 %r14, %r13, %r12; +; CHECK-SM80-NEXT: min.s32 %r15, %r14, %r11; +; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r15; +; CHECK-SM80-NEXT: ret; +; +; CHECK-SM100-LABEL: reduce_smin_i32( +; CHECK-SM100: { +; CHECK-SM100-NEXT: .reg .b32 %r<16>; +; CHECK-SM100-NEXT: .reg .b64 %rd<5>; +; CHECK-SM100-EMPTY: +; CHECK-SM100-NEXT: // %bb.0: +; CHECK-SM100-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_smin_i32_param_0+16]; +; CHECK-SM100-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_smin_i32_param_0]; +; CHECK-SM100-NEXT: mov.b64 {%r1, %r2}, %rd4; +; CHECK-SM100-NEXT: mov.b64 {%r3, %r4}, %rd2; +; CHECK-SM100-NEXT: min.s32 %r5, %r4, %r2; +; CHECK-SM100-NEXT: mov.b64 {%r6, %r7}, %rd3; +; CHECK-SM100-NEXT: mov.b64 {%r8, %r9}, %rd1; +; CHECK-SM100-NEXT: min.s32 %r10, %r9, %r7; +; CHECK-SM100-NEXT: min.s32 %r11, %r10, %r5; +; CHECK-SM100-NEXT: min.s32 %r12, %r3, %r1; +; CHECK-SM100-NEXT: min.s32 %r13, %r8, %r6; +; CHECK-SM100-NEXT: min.s32 %r14, %r13, %r12; +; CHECK-SM100-NEXT: min.s32 %r15, %r14, %r11; +; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r15; +; CHECK-SM100-NEXT: ret; %res = call i32 @llvm.vector.reduce.smin(<8 x i32> %in) ret i32 %res } @@ -2152,22 +2284,44 @@ define i16 @reduce_and_i16_nonpow2(<7 x i16> %in) { } define i32 @reduce_and_i32(<8 x i32> %in) { -; CHECK-LABEL: reduce_and_i32( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<16>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_and_i32_param_0+16]; -; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_and_i32_param_0]; -; CHECK-NEXT: and.b32 %r9, %r4, %r8; -; CHECK-NEXT: and.b32 %r10, %r2, %r6; -; CHECK-NEXT: and.b32 %r11, %r10, %r9; -; CHECK-NEXT: and.b32 %r12, %r3, %r7; -; CHECK-NEXT: and.b32 %r13, %r1, %r5; -; CHECK-NEXT: and.b32 %r14, %r13, %r12; -; CHECK-NEXT: and.b32 %r15, %r14, %r11; -; CHECK-NEXT: st.param.b32 [func_retval0], %r15; -; CHECK-NEXT: ret; +; CHECK-SM80-LABEL: reduce_and_i32( +; CHECK-SM80: { +; CHECK-SM80-NEXT: .reg .b32 %r<16>; +; CHECK-SM80-EMPTY: +; CHECK-SM80-NEXT: // %bb.0: +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_and_i32_param_0+16]; +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_and_i32_param_0]; +; CHECK-SM80-NEXT: and.b32 %r9, %r4, %r8; +; CHECK-SM80-NEXT: and.b32 %r10, %r2, %r6; +; CHECK-SM80-NEXT: and.b32 %r11, %r10, %r9; +; CHECK-SM80-NEXT: and.b32 %r12, %r3, %r7; +; CHECK-SM80-NEXT: and.b32 %r13, %r1, %r5; +; CHECK-SM80-NEXT: and.b32 %r14, %r13, %r12; +; CHECK-SM80-NEXT: and.b32 %r15, %r14, %r11; +; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r15; +; CHECK-SM80-NEXT: ret; +; +; CHECK-SM100-LABEL: reduce_and_i32( +; CHECK-SM100: { +; CHECK-SM100-NEXT: .reg .b32 %r<16>; +; CHECK-SM100-NEXT: .reg .b64 %rd<5>; +; CHECK-SM100-EMPTY: +; CHECK-SM100-NEXT: // %bb.0: +; CHECK-SM100-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_and_i32_param_0+16]; +; CHECK-SM100-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_and_i32_param_0]; +; CHECK-SM100-NEXT: mov.b64 {%r1, %r2}, %rd4; +; CHECK-SM100-NEXT: mov.b64 {%r3, %r4}, %rd2; +; CHECK-SM100-NEXT: and.b32 %r5, %r4, %r2; +; CHECK-SM100-NEXT: mov.b64 {%r6, %r7}, %rd3; +; CHECK-SM100-NEXT: mov.b64 {%r8, %r9}, %rd1; +; CHECK-SM100-NEXT: and.b32 %r10, %r9, %r7; +; CHECK-SM100-NEXT: and.b32 %r11, %r10, %r5; +; CHECK-SM100-NEXT: and.b32 %r12, %r3, %r1; +; CHECK-SM100-NEXT: and.b32 %r13, %r8, %r6; +; CHECK-SM100-NEXT: and.b32 %r14, %r13, %r12; +; CHECK-SM100-NEXT: and.b32 %r15, %r14, %r11; +; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r15; +; CHECK-SM100-NEXT: ret; %res = call i32 @llvm.vector.reduce.and(<8 x i32> %in) ret i32 %res } @@ -2241,22 +2395,44 @@ define i16 @reduce_or_i16_nonpow2(<7 x i16> %in) { } define i32 @reduce_or_i32(<8 x i32> %in) { -; CHECK-LABEL: reduce_or_i32( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<16>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_or_i32_param_0+16]; -; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_or_i32_param_0]; -; CHECK-NEXT: or.b32 %r9, %r4, %r8; -; CHECK-NEXT: or.b32 %r10, %r2, %r6; -; CHECK-NEXT: or.b32 %r11, %r10, %r9; -; CHECK-NEXT: or.b32 %r12, %r3, %r7; -; CHECK-NEXT: or.b32 %r13, %r1, %r5; -; CHECK-NEXT: or.b32 %r14, %r13, %r12; -; CHECK-NEXT: or.b32 %r15, %r14, %r11; -; CHECK-NEXT: st.param.b32 [func_retval0], %r15; -; CHECK-NEXT: ret; +; CHECK-SM80-LABEL: reduce_or_i32( +; CHECK-SM80: { +; CHECK-SM80-NEXT: .reg .b32 %r<16>; +; CHECK-SM80-EMPTY: +; CHECK-SM80-NEXT: // %bb.0: +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_or_i32_param_0+16]; +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_or_i32_param_0]; +; CHECK-SM80-NEXT: or.b32 %r9, %r4, %r8; +; CHECK-SM80-NEXT: or.b32 %r10, %r2, %r6; +; CHECK-SM80-NEXT: or.b32 %r11, %r10, %r9; +; CHECK-SM80-NEXT: or.b32 %r12, %r3, %r7; +; CHECK-SM80-NEXT: or.b32 %r13, %r1, %r5; +; CHECK-SM80-NEXT: or.b32 %r14, %r13, %r12; +; CHECK-SM80-NEXT: or.b32 %r15, %r14, %r11; +; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r15; +; CHECK-SM80-NEXT: ret; +; +; CHECK-SM100-LABEL: reduce_or_i32( +; CHECK-SM100: { +; CHECK-SM100-NEXT: .reg .b32 %r<16>; +; CHECK-SM100-NEXT: .reg .b64 %rd<5>; +; CHECK-SM100-EMPTY: +; CHECK-SM100-NEXT: // %bb.0: +; CHECK-SM100-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_or_i32_param_0+16]; +; CHECK-SM100-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_or_i32_param_0]; +; CHECK-SM100-NEXT: mov.b64 {%r1, %r2}, %rd4; +; CHECK-SM100-NEXT: mov.b64 {%r3, %r4}, %rd2; +; CHECK-SM100-NEXT: or.b32 %r5, %r4, %r2; +; CHECK-SM100-NEXT: mov.b64 {%r6, %r7}, %rd3; +; CHECK-SM100-NEXT: mov.b64 {%r8, %r9}, %rd1; +; CHECK-SM100-NEXT: or.b32 %r10, %r9, %r7; +; CHECK-SM100-NEXT: or.b32 %r11, %r10, %r5; +; CHECK-SM100-NEXT: or.b32 %r12, %r3, %r1; +; CHECK-SM100-NEXT: or.b32 %r13, %r8, %r6; +; CHECK-SM100-NEXT: or.b32 %r14, %r13, %r12; +; CHECK-SM100-NEXT: or.b32 %r15, %r14, %r11; +; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r15; +; CHECK-SM100-NEXT: ret; %res = call i32 @llvm.vector.reduce.or(<8 x i32> %in) ret i32 %res } @@ -2330,22 +2506,44 @@ define i16 @reduce_xor_i16_nonpow2(<7 x i16> %in) { } define i32 @reduce_xor_i32(<8 x i32> %in) { -; CHECK-LABEL: reduce_xor_i32( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<16>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_xor_i32_param_0+16]; -; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_xor_i32_param_0]; -; CHECK-NEXT: xor.b32 %r9, %r4, %r8; -; CHECK-NEXT: xor.b32 %r10, %r2, %r6; -; CHECK-NEXT: xor.b32 %r11, %r10, %r9; -; CHECK-NEXT: xor.b32 %r12, %r3, %r7; -; CHECK-NEXT: xor.b32 %r13, %r1, %r5; -; CHECK-NEXT: xor.b32 %r14, %r13, %r12; -; CHECK-NEXT: xor.b32 %r15, %r14, %r11; -; CHECK-NEXT: st.param.b32 [func_retval0], %r15; -; CHECK-NEXT: ret; +; CHECK-SM80-LABEL: reduce_xor_i32( +; CHECK-SM80: { +; CHECK-SM80-NEXT: .reg .b32 %r<16>; +; CHECK-SM80-EMPTY: +; CHECK-SM80-NEXT: // %bb.0: +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_xor_i32_param_0+16]; +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_xor_i32_param_0]; +; CHECK-SM80-NEXT: xor.b32 %r9, %r4, %r8; +; CHECK-SM80-NEXT: xor.b32 %r10, %r2, %r6; +; CHECK-SM80-NEXT: xor.b32 %r11, %r10, %r9; +; CHECK-SM80-NEXT: xor.b32 %r12, %r3, %r7; +; CHECK-SM80-NEXT: xor.b32 %r13, %r1, %r5; +; CHECK-SM80-NEXT: xor.b32 %r14, %r13, %r12; +; CHECK-SM80-NEXT: xor.b32 %r15, %r14, %r11; +; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r15; +; CHECK-SM80-NEXT: ret; +; +; CHECK-SM100-LABEL: reduce_xor_i32( +; CHECK-SM100: { +; CHECK-SM100-NEXT: .reg .b32 %r<16>; +; CHECK-SM100-NEXT: .reg .b64 %rd<5>; +; CHECK-SM100-EMPTY: +; CHECK-SM100-NEXT: // %bb.0: +; CHECK-SM100-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_xor_i32_param_0+16]; +; CHECK-SM100-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_xor_i32_param_0]; +; CHECK-SM100-NEXT: mov.b64 {%r1, %r2}, %rd4; +; CHECK-SM100-NEXT: mov.b64 {%r3, %r4}, %rd2; +; CHECK-SM100-NEXT: xor.b32 %r5, %r4, %r2; +; CHECK-SM100-NEXT: mov.b64 {%r6, %r7}, %rd3; +; CHECK-SM100-NEXT: mov.b64 {%r8, %r9}, %rd1; +; CHECK-SM100-NEXT: xor.b32 %r10, %r9, %r7; +; CHECK-SM100-NEXT: xor.b32 %r11, %r10, %r5; +; CHECK-SM100-NEXT: xor.b32 %r12, %r3, %r1; +; CHECK-SM100-NEXT: xor.b32 %r13, %r8, %r6; +; CHECK-SM100-NEXT: xor.b32 %r14, %r13, %r12; +; CHECK-SM100-NEXT: xor.b32 %r15, %r14, %r11; +; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r15; +; CHECK-SM100-NEXT: ret; %res = call i32 @llvm.vector.reduce.xor(<8 x i32> %in) ret i32 %res } From c21fd6fb59d64ef0646bfafb5c4d56d6c5314b3e Mon Sep 17 00:00:00 2001 From: Princeton Ferro Date: Thu, 25 Sep 2025 14:25:36 -0700 Subject: [PATCH 2/5] address reviewer comments --- llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index 544f0bab9dfa9..3e4abb72d377e 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -2932,6 +2932,8 @@ static SDValue lowerIntrinsicWChain(SDValue Op, SelectionDAG &DAG) { default: return Op; + // These tcgen05 intrinsics return a v2i32, which is legal, so we have to + // lower them through LowerOperation() instead of ReplaceNodeResults(). case Intrinsic::nvvm_tcgen05_ld_16x64b_x2: case Intrinsic::nvvm_tcgen05_ld_16x128b_x1: case Intrinsic::nvvm_tcgen05_ld_32x32b_x2: @@ -6487,21 +6489,18 @@ static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG, return; } - case Intrinsic::nvvm_tcgen05_ld_16x64b_x2: case Intrinsic::nvvm_tcgen05_ld_16x64b_x4: case Intrinsic::nvvm_tcgen05_ld_16x64b_x8: case Intrinsic::nvvm_tcgen05_ld_16x64b_x16: case Intrinsic::nvvm_tcgen05_ld_16x64b_x32: case Intrinsic::nvvm_tcgen05_ld_16x64b_x64: case Intrinsic::nvvm_tcgen05_ld_16x64b_x128: - case Intrinsic::nvvm_tcgen05_ld_32x32b_x2: case Intrinsic::nvvm_tcgen05_ld_32x32b_x4: case Intrinsic::nvvm_tcgen05_ld_32x32b_x8: case Intrinsic::nvvm_tcgen05_ld_32x32b_x16: case Intrinsic::nvvm_tcgen05_ld_32x32b_x32: case Intrinsic::nvvm_tcgen05_ld_32x32b_x64: case Intrinsic::nvvm_tcgen05_ld_32x32b_x128: - case Intrinsic::nvvm_tcgen05_ld_16x128b_x1: case Intrinsic::nvvm_tcgen05_ld_16x128b_x2: case Intrinsic::nvvm_tcgen05_ld_16x128b_x4: case Intrinsic::nvvm_tcgen05_ld_16x128b_x8: @@ -6520,7 +6519,6 @@ static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG, } return; - case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x2: case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x4: case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x8: case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x16: From 5e60c6288ed43147f054a289dfef5860758fdd43 Mon Sep 17 00:00:00 2001 From: Princeton Ferro Date: Thu, 25 Sep 2025 15:49:09 -0700 Subject: [PATCH 3/5] [NVPTX] add -O3 test for f32x2 instructions --- llvm/test/CodeGen/NVPTX/f32x2-instructions.ll | 1257 ++++++++++++++++- 1 file changed, 1242 insertions(+), 15 deletions(-) diff --git a/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll b/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll index a90cfff51e2c6..3f457de7798a6 100644 --- a/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll @@ -2,16 +2,22 @@ ; ## Full FP32x2 support enabled by default. ; RUN: llc < %s -mcpu=sm_80 -O0 -disable-post-ra -frame-pointer=all \ ; RUN: -verify-machineinstrs | FileCheck --check-prefixes=CHECK,CHECK-NOF32X2 %s -; RUN: %if ptxas-sm_80 %{ \ +; RUN: %if ptxas-sm_80 %{ \ ; RUN: llc < %s -mcpu=sm_80 -O0 -disable-post-ra -frame-pointer=all \ ; RUN: -verify-machineinstrs | %ptxas-verify -arch=sm_80 \ ; RUN: %} ; RUN: llc < %s -mcpu=sm_100 -O0 -disable-post-ra -frame-pointer=all \ ; RUN: -verify-machineinstrs | FileCheck --check-prefixes=CHECK,CHECK-F32X2 %s -; RUN: %if ptxas-sm_100 %{ \ +; RUN: %if ptxas-sm_100 %{ \ ; RUN: llc < %s -mcpu=sm_100 -O0 -disable-post-ra -frame-pointer=all \ ; RUN: -verify-machineinstrs | %ptxas-verify -arch=sm_100 \ ; RUN: %} +; RUN: llc < %s -mcpu=sm_100 -O3 -frame-pointer=all -verify-machineinstrs \ +; RUN: | FileCheck --check-prefixes=CHECK,CHECK-F32X2-O3 %s +; RUN: %if ptxas-sm_100 %{ \ +; RUN: llc < %s -mcpu=sm_100 -O3 -disable-post-ra -frame-pointer=all \ +; RUN: -verify-machineinstrs | %ptxas-verify -arch=sm_100 \ +; RUN: %} target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" target triple = "nvptx64-nvidia-cuda" @@ -47,6 +53,15 @@ define float @test_extract_0(<2 x float> %a) #0 { ; CHECK-F32X2-NEXT: mov.b64 {%r1, _}, %rd1; ; CHECK-F32X2-NEXT: st.param.b32 [func_retval0], %r1; ; CHECK-F32X2-NEXT: ret; +; +; CHECK-F32X2-O3-LABEL: test_extract_0( +; CHECK-F32X2-O3: { +; CHECK-F32X2-O3-NEXT: .reg .b32 %r<2>; +; CHECK-F32X2-O3-EMPTY: +; CHECK-F32X2-O3-NEXT: // %bb.0: +; CHECK-F32X2-O3-NEXT: ld.param.b32 %r1, [test_extract_0_param_0]; +; CHECK-F32X2-O3-NEXT: st.param.b32 [func_retval0], %r1; +; CHECK-F32X2-O3-NEXT: ret; %e = extractelement <2 x float> %a, i32 0 ret float %e } @@ -71,6 +86,15 @@ define float @test_extract_1(<2 x float> %a) #0 { ; CHECK-F32X2-NEXT: mov.b64 {_, %r1}, %rd1; ; CHECK-F32X2-NEXT: st.param.b32 [func_retval0], %r1; ; CHECK-F32X2-NEXT: ret; +; +; CHECK-F32X2-O3-LABEL: test_extract_1( +; CHECK-F32X2-O3: { +; CHECK-F32X2-O3-NEXT: .reg .b32 %r<2>; +; CHECK-F32X2-O3-EMPTY: +; CHECK-F32X2-O3-NEXT: // %bb.0: +; CHECK-F32X2-O3-NEXT: ld.param.b32 %r1, [test_extract_1_param_0+4]; +; CHECK-F32X2-O3-NEXT: st.param.b32 [func_retval0], %r1; +; CHECK-F32X2-O3-NEXT: ret; %e = extractelement <2 x float> %a, i32 1 ret float %e } @@ -112,6 +136,21 @@ define float @test_extract_i(<2 x float> %a, i64 %idx) #0 { ; CHECK-F32X2-NEXT: selp.f32 %r3, %r1, %r2, %p1; ; CHECK-F32X2-NEXT: st.param.b32 [func_retval0], %r3; ; CHECK-F32X2-NEXT: ret; +; +; CHECK-F32X2-O3-LABEL: test_extract_i( +; CHECK-F32X2-O3: { +; CHECK-F32X2-O3-NEXT: .reg .b32 %r<2>; +; CHECK-F32X2-O3-NEXT: .reg .b64 %rd<6>; +; CHECK-F32X2-O3-EMPTY: +; CHECK-F32X2-O3-NEXT: // %bb.0: +; CHECK-F32X2-O3-NEXT: ld.param.b64 %rd1, [test_extract_i_param_1]; +; CHECK-F32X2-O3-NEXT: and.b64 %rd2, %rd1, 1; +; CHECK-F32X2-O3-NEXT: shl.b64 %rd3, %rd2, 2; +; CHECK-F32X2-O3-NEXT: mov.b64 %rd4, test_extract_i_param_0; +; CHECK-F32X2-O3-NEXT: add.s64 %rd5, %rd4, %rd3; +; CHECK-F32X2-O3-NEXT: ld.param.b32 %r1, [%rd5]; +; CHECK-F32X2-O3-NEXT: st.param.b32 [func_retval0], %r1; +; CHECK-F32X2-O3-NEXT: ret; %e = extractelement <2 x float> %a, i64 %idx ret float %e } @@ -139,6 +178,17 @@ define <2 x float> @test_fadd(<2 x float> %a, <2 x float> %b) #0 { ; CHECK-F32X2-NEXT: add.rn.f32x2 %rd3, %rd1, %rd2; ; CHECK-F32X2-NEXT: st.param.b64 [func_retval0], %rd3; ; CHECK-F32X2-NEXT: ret; +; +; CHECK-F32X2-O3-LABEL: test_fadd( +; CHECK-F32X2-O3: { +; CHECK-F32X2-O3-NEXT: .reg .b64 %rd<4>; +; CHECK-F32X2-O3-EMPTY: +; CHECK-F32X2-O3-NEXT: // %bb.0: +; CHECK-F32X2-O3-NEXT: ld.param.b64 %rd1, [test_fadd_param_0]; +; CHECK-F32X2-O3-NEXT: ld.param.b64 %rd2, [test_fadd_param_1]; +; CHECK-F32X2-O3-NEXT: add.rn.f32x2 %rd3, %rd1, %rd2; +; CHECK-F32X2-O3-NEXT: st.param.b64 [func_retval0], %rd3; +; CHECK-F32X2-O3-NEXT: ret; %r = fadd <2 x float> %a, %b ret <2 x float> %r } @@ -168,6 +218,20 @@ define <2 x float> @test_fadd_imm_0(<2 x float> %a) #0 { ; CHECK-F32X2-NEXT: add.rn.f32x2 %rd3, %rd1, %rd2; ; CHECK-F32X2-NEXT: st.param.b64 [func_retval0], %rd3; ; CHECK-F32X2-NEXT: ret; +; +; CHECK-F32X2-O3-LABEL: test_fadd_imm_0( +; CHECK-F32X2-O3: { +; CHECK-F32X2-O3-NEXT: .reg .b32 %r<3>; +; CHECK-F32X2-O3-NEXT: .reg .b64 %rd<4>; +; CHECK-F32X2-O3-EMPTY: +; CHECK-F32X2-O3-NEXT: // %bb.0: +; CHECK-F32X2-O3-NEXT: ld.param.b64 %rd1, [test_fadd_imm_0_param_0]; +; CHECK-F32X2-O3-NEXT: mov.b32 %r1, 0f40000000; +; CHECK-F32X2-O3-NEXT: mov.b32 %r2, 0f3F800000; +; CHECK-F32X2-O3-NEXT: mov.b64 %rd2, {%r2, %r1}; +; CHECK-F32X2-O3-NEXT: add.rn.f32x2 %rd3, %rd1, %rd2; +; CHECK-F32X2-O3-NEXT: st.param.b64 [func_retval0], %rd3; +; CHECK-F32X2-O3-NEXT: ret; %r = fadd <2 x float> , %a ret <2 x float> %r } @@ -197,6 +261,20 @@ define <2 x float> @test_fadd_imm_1(<2 x float> %a) #0 { ; CHECK-F32X2-NEXT: add.rn.f32x2 %rd3, %rd1, %rd2; ; CHECK-F32X2-NEXT: st.param.b64 [func_retval0], %rd3; ; CHECK-F32X2-NEXT: ret; +; +; CHECK-F32X2-O3-LABEL: test_fadd_imm_1( +; CHECK-F32X2-O3: { +; CHECK-F32X2-O3-NEXT: .reg .b32 %r<3>; +; CHECK-F32X2-O3-NEXT: .reg .b64 %rd<4>; +; CHECK-F32X2-O3-EMPTY: +; CHECK-F32X2-O3-NEXT: // %bb.0: +; CHECK-F32X2-O3-NEXT: ld.param.b64 %rd1, [test_fadd_imm_1_param_0]; +; CHECK-F32X2-O3-NEXT: mov.b32 %r1, 0f40000000; +; CHECK-F32X2-O3-NEXT: mov.b32 %r2, 0f3F800000; +; CHECK-F32X2-O3-NEXT: mov.b64 %rd2, {%r2, %r1}; +; CHECK-F32X2-O3-NEXT: add.rn.f32x2 %rd3, %rd1, %rd2; +; CHECK-F32X2-O3-NEXT: st.param.b64 [func_retval0], %rd3; +; CHECK-F32X2-O3-NEXT: ret; %r = fadd <2 x float> %a, ret <2 x float> %r } @@ -227,6 +305,18 @@ define <4 x float> @test_fadd_v4(<4 x float> %a, <4 x float> %b) #0 { ; CHECK-F32X2-NEXT: add.rn.f32x2 %rd6, %rd1, %rd3; ; CHECK-F32X2-NEXT: st.param.v2.b64 [func_retval0], {%rd6, %rd5}; ; CHECK-F32X2-NEXT: ret; +; +; CHECK-F32X2-O3-LABEL: test_fadd_v4( +; CHECK-F32X2-O3: { +; CHECK-F32X2-O3-NEXT: .reg .b64 %rd<7>; +; CHECK-F32X2-O3-EMPTY: +; CHECK-F32X2-O3-NEXT: // %bb.0: +; CHECK-F32X2-O3-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_fadd_v4_param_0]; +; CHECK-F32X2-O3-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [test_fadd_v4_param_1]; +; CHECK-F32X2-O3-NEXT: add.rn.f32x2 %rd5, %rd2, %rd4; +; CHECK-F32X2-O3-NEXT: add.rn.f32x2 %rd6, %rd1, %rd3; +; CHECK-F32X2-O3-NEXT: st.param.v2.b64 [func_retval0], {%rd6, %rd5}; +; CHECK-F32X2-O3-NEXT: ret; %r = fadd <4 x float> %a, %b ret <4 x float> %r } @@ -262,6 +352,24 @@ define <4 x float> @test_fadd_imm_0_v4(<4 x float> %a) #0 { ; CHECK-F32X2-NEXT: add.rn.f32x2 %rd6, %rd1, %rd5; ; CHECK-F32X2-NEXT: st.param.v2.b64 [func_retval0], {%rd6, %rd4}; ; CHECK-F32X2-NEXT: ret; +; +; CHECK-F32X2-O3-LABEL: test_fadd_imm_0_v4( +; CHECK-F32X2-O3: { +; CHECK-F32X2-O3-NEXT: .reg .b32 %r<5>; +; CHECK-F32X2-O3-NEXT: .reg .b64 %rd<7>; +; CHECK-F32X2-O3-EMPTY: +; CHECK-F32X2-O3-NEXT: // %bb.0: +; CHECK-F32X2-O3-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_fadd_imm_0_v4_param_0]; +; CHECK-F32X2-O3-NEXT: mov.b32 %r1, 0f40800000; +; CHECK-F32X2-O3-NEXT: mov.b32 %r2, 0f40400000; +; CHECK-F32X2-O3-NEXT: mov.b64 %rd3, {%r2, %r1}; +; CHECK-F32X2-O3-NEXT: add.rn.f32x2 %rd4, %rd2, %rd3; +; CHECK-F32X2-O3-NEXT: mov.b32 %r3, 0f40000000; +; CHECK-F32X2-O3-NEXT: mov.b32 %r4, 0f3F800000; +; CHECK-F32X2-O3-NEXT: mov.b64 %rd5, {%r4, %r3}; +; CHECK-F32X2-O3-NEXT: add.rn.f32x2 %rd6, %rd1, %rd5; +; CHECK-F32X2-O3-NEXT: st.param.v2.b64 [func_retval0], {%rd6, %rd4}; +; CHECK-F32X2-O3-NEXT: ret; %r = fadd <4 x float> , %a ret <4 x float> %r } @@ -297,6 +405,24 @@ define <4 x float> @test_fadd_imm_1_v4(<4 x float> %a) #0 { ; CHECK-F32X2-NEXT: add.rn.f32x2 %rd6, %rd1, %rd5; ; CHECK-F32X2-NEXT: st.param.v2.b64 [func_retval0], {%rd6, %rd4}; ; CHECK-F32X2-NEXT: ret; +; +; CHECK-F32X2-O3-LABEL: test_fadd_imm_1_v4( +; CHECK-F32X2-O3: { +; CHECK-F32X2-O3-NEXT: .reg .b32 %r<5>; +; CHECK-F32X2-O3-NEXT: .reg .b64 %rd<7>; +; CHECK-F32X2-O3-EMPTY: +; CHECK-F32X2-O3-NEXT: // %bb.0: +; CHECK-F32X2-O3-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_fadd_imm_1_v4_param_0]; +; CHECK-F32X2-O3-NEXT: mov.b32 %r1, 0f40800000; +; CHECK-F32X2-O3-NEXT: mov.b32 %r2, 0f40400000; +; CHECK-F32X2-O3-NEXT: mov.b64 %rd3, {%r2, %r1}; +; CHECK-F32X2-O3-NEXT: add.rn.f32x2 %rd4, %rd2, %rd3; +; CHECK-F32X2-O3-NEXT: mov.b32 %r3, 0f40000000; +; CHECK-F32X2-O3-NEXT: mov.b32 %r4, 0f3F800000; +; CHECK-F32X2-O3-NEXT: mov.b64 %rd5, {%r4, %r3}; +; CHECK-F32X2-O3-NEXT: add.rn.f32x2 %rd6, %rd1, %rd5; +; CHECK-F32X2-O3-NEXT: st.param.v2.b64 [func_retval0], {%rd6, %rd4}; +; CHECK-F32X2-O3-NEXT: ret; %r = fadd <4 x float> %a, ret <4 x float> %r } @@ -324,6 +450,17 @@ define <2 x float> @test_fsub(<2 x float> %a, <2 x float> %b) #0 { ; CHECK-F32X2-NEXT: sub.rn.f32x2 %rd3, %rd1, %rd2; ; CHECK-F32X2-NEXT: st.param.b64 [func_retval0], %rd3; ; CHECK-F32X2-NEXT: ret; +; +; CHECK-F32X2-O3-LABEL: test_fsub( +; CHECK-F32X2-O3: { +; CHECK-F32X2-O3-NEXT: .reg .b64 %rd<4>; +; CHECK-F32X2-O3-EMPTY: +; CHECK-F32X2-O3-NEXT: // %bb.0: +; CHECK-F32X2-O3-NEXT: ld.param.b64 %rd1, [test_fsub_param_0]; +; CHECK-F32X2-O3-NEXT: ld.param.b64 %rd2, [test_fsub_param_1]; +; CHECK-F32X2-O3-NEXT: sub.rn.f32x2 %rd3, %rd1, %rd2; +; CHECK-F32X2-O3-NEXT: st.param.b64 [func_retval0], %rd3; +; CHECK-F32X2-O3-NEXT: ret; %r = fsub <2 x float> %a, %b ret <2 x float> %r } @@ -352,6 +489,17 @@ define <2 x float> @test_fneg(<2 x float> %a) #0 { ; CHECK-F32X2-NEXT: neg.f32 %r4, %r1; ; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; ; CHECK-F32X2-NEXT: ret; +; +; CHECK-F32X2-O3-LABEL: test_fneg( +; CHECK-F32X2-O3: { +; CHECK-F32X2-O3-NEXT: .reg .b32 %r<5>; +; CHECK-F32X2-O3-EMPTY: +; CHECK-F32X2-O3-NEXT: // %bb.0: +; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fneg_param_0]; +; CHECK-F32X2-O3-NEXT: neg.f32 %r3, %r2; +; CHECK-F32X2-O3-NEXT: neg.f32 %r4, %r1; +; CHECK-F32X2-O3-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-F32X2-O3-NEXT: ret; %r = fneg <2 x float> %a ret <2 x float> %r } @@ -379,6 +527,17 @@ define <2 x float> @test_fmul(<2 x float> %a, <2 x float> %b) #0 { ; CHECK-F32X2-NEXT: mul.rn.f32x2 %rd3, %rd1, %rd2; ; CHECK-F32X2-NEXT: st.param.b64 [func_retval0], %rd3; ; CHECK-F32X2-NEXT: ret; +; +; CHECK-F32X2-O3-LABEL: test_fmul( +; CHECK-F32X2-O3: { +; CHECK-F32X2-O3-NEXT: .reg .b64 %rd<4>; +; CHECK-F32X2-O3-EMPTY: +; CHECK-F32X2-O3-NEXT: // %bb.0: +; CHECK-F32X2-O3-NEXT: ld.param.b64 %rd1, [test_fmul_param_0]; +; CHECK-F32X2-O3-NEXT: ld.param.b64 %rd2, [test_fmul_param_1]; +; CHECK-F32X2-O3-NEXT: mul.rn.f32x2 %rd3, %rd1, %rd2; +; CHECK-F32X2-O3-NEXT: st.param.b64 [func_retval0], %rd3; +; CHECK-F32X2-O3-NEXT: ret; %r = fmul <2 x float> %a, %b ret <2 x float> %r } @@ -410,6 +569,18 @@ define <2 x float> @test_fdiv(<2 x float> %a, <2 x float> %b) #0 { ; CHECK-F32X2-NEXT: div.rn.f32 %r6, %r3, %r1; ; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5}; ; CHECK-F32X2-NEXT: ret; +; +; CHECK-F32X2-O3-LABEL: test_fdiv( +; CHECK-F32X2-O3: { +; CHECK-F32X2-O3-NEXT: .reg .b32 %r<7>; +; CHECK-F32X2-O3-EMPTY: +; CHECK-F32X2-O3-NEXT: // %bb.0: +; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fdiv_param_0]; +; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fdiv_param_1]; +; CHECK-F32X2-O3-NEXT: div.rn.f32 %r5, %r2, %r4; +; CHECK-F32X2-O3-NEXT: div.rn.f32 %r6, %r1, %r3; +; CHECK-F32X2-O3-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5}; +; CHECK-F32X2-O3-NEXT: ret; %r = fdiv <2 x float> %a, %b ret <2 x float> %r } @@ -463,6 +634,29 @@ define <2 x float> @test_frem(<2 x float> %a, <2 x float> %b) #0 { ; CHECK-F32X2-NEXT: selp.f32 %r14, %r3, %r13, %p2; ; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r14, %r9}; ; CHECK-F32X2-NEXT: ret; +; +; CHECK-F32X2-O3-LABEL: test_frem( +; CHECK-F32X2-O3: { +; CHECK-F32X2-O3-NEXT: .reg .pred %p<3>; +; CHECK-F32X2-O3-NEXT: .reg .b32 %r<15>; +; CHECK-F32X2-O3-EMPTY: +; CHECK-F32X2-O3-NEXT: // %bb.0: +; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_frem_param_0]; +; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_frem_param_1]; +; CHECK-F32X2-O3-NEXT: div.rn.f32 %r5, %r2, %r4; +; CHECK-F32X2-O3-NEXT: cvt.rzi.f32.f32 %r6, %r5; +; CHECK-F32X2-O3-NEXT: neg.f32 %r7, %r6; +; CHECK-F32X2-O3-NEXT: fma.rn.f32 %r8, %r7, %r4, %r2; +; CHECK-F32X2-O3-NEXT: testp.infinite.f32 %p1, %r4; +; CHECK-F32X2-O3-NEXT: selp.f32 %r9, %r2, %r8, %p1; +; CHECK-F32X2-O3-NEXT: div.rn.f32 %r10, %r1, %r3; +; CHECK-F32X2-O3-NEXT: cvt.rzi.f32.f32 %r11, %r10; +; CHECK-F32X2-O3-NEXT: neg.f32 %r12, %r11; +; CHECK-F32X2-O3-NEXT: fma.rn.f32 %r13, %r12, %r3, %r1; +; CHECK-F32X2-O3-NEXT: testp.infinite.f32 %p2, %r3; +; CHECK-F32X2-O3-NEXT: selp.f32 %r14, %r1, %r13, %p2; +; CHECK-F32X2-O3-NEXT: st.param.v2.b32 [func_retval0], {%r14, %r9}; +; CHECK-F32X2-O3-NEXT: ret; %r = frem <2 x float> %a, %b ret <2 x float> %r } @@ -490,6 +684,17 @@ define <2 x float> @test_fadd_ftz(<2 x float> %a, <2 x float> %b) #2 { ; CHECK-F32X2-NEXT: add.rn.ftz.f32x2 %rd3, %rd1, %rd2; ; CHECK-F32X2-NEXT: st.param.b64 [func_retval0], %rd3; ; CHECK-F32X2-NEXT: ret; +; +; CHECK-F32X2-O3-LABEL: test_fadd_ftz( +; CHECK-F32X2-O3: { +; CHECK-F32X2-O3-NEXT: .reg .b64 %rd<4>; +; CHECK-F32X2-O3-EMPTY: +; CHECK-F32X2-O3-NEXT: // %bb.0: +; CHECK-F32X2-O3-NEXT: ld.param.b64 %rd1, [test_fadd_ftz_param_0]; +; CHECK-F32X2-O3-NEXT: ld.param.b64 %rd2, [test_fadd_ftz_param_1]; +; CHECK-F32X2-O3-NEXT: add.rn.ftz.f32x2 %rd3, %rd1, %rd2; +; CHECK-F32X2-O3-NEXT: st.param.b64 [func_retval0], %rd3; +; CHECK-F32X2-O3-NEXT: ret; %r = fadd <2 x float> %a, %b ret <2 x float> %r } @@ -519,6 +724,20 @@ define <2 x float> @test_fadd_imm_0_ftz(<2 x float> %a) #2 { ; CHECK-F32X2-NEXT: add.rn.ftz.f32x2 %rd3, %rd1, %rd2; ; CHECK-F32X2-NEXT: st.param.b64 [func_retval0], %rd3; ; CHECK-F32X2-NEXT: ret; +; +; CHECK-F32X2-O3-LABEL: test_fadd_imm_0_ftz( +; CHECK-F32X2-O3: { +; CHECK-F32X2-O3-NEXT: .reg .b32 %r<3>; +; CHECK-F32X2-O3-NEXT: .reg .b64 %rd<4>; +; CHECK-F32X2-O3-EMPTY: +; CHECK-F32X2-O3-NEXT: // %bb.0: +; CHECK-F32X2-O3-NEXT: ld.param.b64 %rd1, [test_fadd_imm_0_ftz_param_0]; +; CHECK-F32X2-O3-NEXT: mov.b32 %r1, 0f40000000; +; CHECK-F32X2-O3-NEXT: mov.b32 %r2, 0f3F800000; +; CHECK-F32X2-O3-NEXT: mov.b64 %rd2, {%r2, %r1}; +; CHECK-F32X2-O3-NEXT: add.rn.ftz.f32x2 %rd3, %rd1, %rd2; +; CHECK-F32X2-O3-NEXT: st.param.b64 [func_retval0], %rd3; +; CHECK-F32X2-O3-NEXT: ret; %r = fadd <2 x float> , %a ret <2 x float> %r } @@ -548,6 +767,20 @@ define <2 x float> @test_fadd_imm_1_ftz(<2 x float> %a) #2 { ; CHECK-F32X2-NEXT: add.rn.ftz.f32x2 %rd3, %rd1, %rd2; ; CHECK-F32X2-NEXT: st.param.b64 [func_retval0], %rd3; ; CHECK-F32X2-NEXT: ret; +; +; CHECK-F32X2-O3-LABEL: test_fadd_imm_1_ftz( +; CHECK-F32X2-O3: { +; CHECK-F32X2-O3-NEXT: .reg .b32 %r<3>; +; CHECK-F32X2-O3-NEXT: .reg .b64 %rd<4>; +; CHECK-F32X2-O3-EMPTY: +; CHECK-F32X2-O3-NEXT: // %bb.0: +; CHECK-F32X2-O3-NEXT: ld.param.b64 %rd1, [test_fadd_imm_1_ftz_param_0]; +; CHECK-F32X2-O3-NEXT: mov.b32 %r1, 0f40000000; +; CHECK-F32X2-O3-NEXT: mov.b32 %r2, 0f3F800000; +; CHECK-F32X2-O3-NEXT: mov.b64 %rd2, {%r2, %r1}; +; CHECK-F32X2-O3-NEXT: add.rn.ftz.f32x2 %rd3, %rd1, %rd2; +; CHECK-F32X2-O3-NEXT: st.param.b64 [func_retval0], %rd3; +; CHECK-F32X2-O3-NEXT: ret; %r = fadd <2 x float> %a, ret <2 x float> %r } @@ -578,6 +811,18 @@ define <4 x float> @test_fadd_v4_ftz(<4 x float> %a, <4 x float> %b) #2 { ; CHECK-F32X2-NEXT: add.rn.ftz.f32x2 %rd6, %rd1, %rd3; ; CHECK-F32X2-NEXT: st.param.v2.b64 [func_retval0], {%rd6, %rd5}; ; CHECK-F32X2-NEXT: ret; +; +; CHECK-F32X2-O3-LABEL: test_fadd_v4_ftz( +; CHECK-F32X2-O3: { +; CHECK-F32X2-O3-NEXT: .reg .b64 %rd<7>; +; CHECK-F32X2-O3-EMPTY: +; CHECK-F32X2-O3-NEXT: // %bb.0: +; CHECK-F32X2-O3-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_fadd_v4_ftz_param_0]; +; CHECK-F32X2-O3-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [test_fadd_v4_ftz_param_1]; +; CHECK-F32X2-O3-NEXT: add.rn.ftz.f32x2 %rd5, %rd2, %rd4; +; CHECK-F32X2-O3-NEXT: add.rn.ftz.f32x2 %rd6, %rd1, %rd3; +; CHECK-F32X2-O3-NEXT: st.param.v2.b64 [func_retval0], {%rd6, %rd5}; +; CHECK-F32X2-O3-NEXT: ret; %r = fadd <4 x float> %a, %b ret <4 x float> %r } @@ -613,6 +858,24 @@ define <4 x float> @test_fadd_imm_0_v4_ftz(<4 x float> %a) #2 { ; CHECK-F32X2-NEXT: add.rn.ftz.f32x2 %rd6, %rd1, %rd5; ; CHECK-F32X2-NEXT: st.param.v2.b64 [func_retval0], {%rd6, %rd4}; ; CHECK-F32X2-NEXT: ret; +; +; CHECK-F32X2-O3-LABEL: test_fadd_imm_0_v4_ftz( +; CHECK-F32X2-O3: { +; CHECK-F32X2-O3-NEXT: .reg .b32 %r<5>; +; CHECK-F32X2-O3-NEXT: .reg .b64 %rd<7>; +; CHECK-F32X2-O3-EMPTY: +; CHECK-F32X2-O3-NEXT: // %bb.0: +; CHECK-F32X2-O3-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_fadd_imm_0_v4_ftz_param_0]; +; CHECK-F32X2-O3-NEXT: mov.b32 %r1, 0f40800000; +; CHECK-F32X2-O3-NEXT: mov.b32 %r2, 0f40400000; +; CHECK-F32X2-O3-NEXT: mov.b64 %rd3, {%r2, %r1}; +; CHECK-F32X2-O3-NEXT: add.rn.ftz.f32x2 %rd4, %rd2, %rd3; +; CHECK-F32X2-O3-NEXT: mov.b32 %r3, 0f40000000; +; CHECK-F32X2-O3-NEXT: mov.b32 %r4, 0f3F800000; +; CHECK-F32X2-O3-NEXT: mov.b64 %rd5, {%r4, %r3}; +; CHECK-F32X2-O3-NEXT: add.rn.ftz.f32x2 %rd6, %rd1, %rd5; +; CHECK-F32X2-O3-NEXT: st.param.v2.b64 [func_retval0], {%rd6, %rd4}; +; CHECK-F32X2-O3-NEXT: ret; %r = fadd <4 x float> , %a ret <4 x float> %r } @@ -648,6 +911,24 @@ define <4 x float> @test_fadd_imm_1_v4_ftz(<4 x float> %a) #2 { ; CHECK-F32X2-NEXT: add.rn.ftz.f32x2 %rd6, %rd1, %rd5; ; CHECK-F32X2-NEXT: st.param.v2.b64 [func_retval0], {%rd6, %rd4}; ; CHECK-F32X2-NEXT: ret; +; +; CHECK-F32X2-O3-LABEL: test_fadd_imm_1_v4_ftz( +; CHECK-F32X2-O3: { +; CHECK-F32X2-O3-NEXT: .reg .b32 %r<5>; +; CHECK-F32X2-O3-NEXT: .reg .b64 %rd<7>; +; CHECK-F32X2-O3-EMPTY: +; CHECK-F32X2-O3-NEXT: // %bb.0: +; CHECK-F32X2-O3-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_fadd_imm_1_v4_ftz_param_0]; +; CHECK-F32X2-O3-NEXT: mov.b32 %r1, 0f40800000; +; CHECK-F32X2-O3-NEXT: mov.b32 %r2, 0f40400000; +; CHECK-F32X2-O3-NEXT: mov.b64 %rd3, {%r2, %r1}; +; CHECK-F32X2-O3-NEXT: add.rn.ftz.f32x2 %rd4, %rd2, %rd3; +; CHECK-F32X2-O3-NEXT: mov.b32 %r3, 0f40000000; +; CHECK-F32X2-O3-NEXT: mov.b32 %r4, 0f3F800000; +; CHECK-F32X2-O3-NEXT: mov.b64 %rd5, {%r4, %r3}; +; CHECK-F32X2-O3-NEXT: add.rn.ftz.f32x2 %rd6, %rd1, %rd5; +; CHECK-F32X2-O3-NEXT: st.param.v2.b64 [func_retval0], {%rd6, %rd4}; +; CHECK-F32X2-O3-NEXT: ret; %r = fadd <4 x float> %a, ret <4 x float> %r } @@ -675,6 +956,17 @@ define <2 x float> @test_fsub_ftz(<2 x float> %a, <2 x float> %b) #2 { ; CHECK-F32X2-NEXT: sub.rn.ftz.f32x2 %rd3, %rd1, %rd2; ; CHECK-F32X2-NEXT: st.param.b64 [func_retval0], %rd3; ; CHECK-F32X2-NEXT: ret; +; +; CHECK-F32X2-O3-LABEL: test_fsub_ftz( +; CHECK-F32X2-O3: { +; CHECK-F32X2-O3-NEXT: .reg .b64 %rd<4>; +; CHECK-F32X2-O3-EMPTY: +; CHECK-F32X2-O3-NEXT: // %bb.0: +; CHECK-F32X2-O3-NEXT: ld.param.b64 %rd1, [test_fsub_ftz_param_0]; +; CHECK-F32X2-O3-NEXT: ld.param.b64 %rd2, [test_fsub_ftz_param_1]; +; CHECK-F32X2-O3-NEXT: sub.rn.ftz.f32x2 %rd3, %rd1, %rd2; +; CHECK-F32X2-O3-NEXT: st.param.b64 [func_retval0], %rd3; +; CHECK-F32X2-O3-NEXT: ret; %r = fsub <2 x float> %a, %b ret <2 x float> %r } @@ -703,6 +995,17 @@ define <2 x float> @test_fneg_ftz(<2 x float> %a) #2 { ; CHECK-F32X2-NEXT: neg.ftz.f32 %r4, %r1; ; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; ; CHECK-F32X2-NEXT: ret; +; +; CHECK-F32X2-O3-LABEL: test_fneg_ftz( +; CHECK-F32X2-O3: { +; CHECK-F32X2-O3-NEXT: .reg .b32 %r<5>; +; CHECK-F32X2-O3-EMPTY: +; CHECK-F32X2-O3-NEXT: // %bb.0: +; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fneg_ftz_param_0]; +; CHECK-F32X2-O3-NEXT: neg.ftz.f32 %r3, %r2; +; CHECK-F32X2-O3-NEXT: neg.ftz.f32 %r4, %r1; +; CHECK-F32X2-O3-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-F32X2-O3-NEXT: ret; %r = fneg <2 x float> %a ret <2 x float> %r } @@ -730,6 +1033,17 @@ define <2 x float> @test_fmul_ftz(<2 x float> %a, <2 x float> %b) #2 { ; CHECK-F32X2-NEXT: mul.rn.ftz.f32x2 %rd3, %rd1, %rd2; ; CHECK-F32X2-NEXT: st.param.b64 [func_retval0], %rd3; ; CHECK-F32X2-NEXT: ret; +; +; CHECK-F32X2-O3-LABEL: test_fmul_ftz( +; CHECK-F32X2-O3: { +; CHECK-F32X2-O3-NEXT: .reg .b64 %rd<4>; +; CHECK-F32X2-O3-EMPTY: +; CHECK-F32X2-O3-NEXT: // %bb.0: +; CHECK-F32X2-O3-NEXT: ld.param.b64 %rd1, [test_fmul_ftz_param_0]; +; CHECK-F32X2-O3-NEXT: ld.param.b64 %rd2, [test_fmul_ftz_param_1]; +; CHECK-F32X2-O3-NEXT: mul.rn.ftz.f32x2 %rd3, %rd1, %rd2; +; CHECK-F32X2-O3-NEXT: st.param.b64 [func_retval0], %rd3; +; CHECK-F32X2-O3-NEXT: ret; %r = fmul <2 x float> %a, %b ret <2 x float> %r } @@ -759,6 +1073,18 @@ define <2 x float> @test_fma_ftz(<2 x float> %a, <2 x float> %b, <2 x float> %c) ; CHECK-F32X2-NEXT: fma.rn.ftz.f32x2 %rd4, %rd1, %rd2, %rd3; ; CHECK-F32X2-NEXT: st.param.b64 [func_retval0], %rd4; ; CHECK-F32X2-NEXT: ret; +; +; CHECK-F32X2-O3-LABEL: test_fma_ftz( +; CHECK-F32X2-O3: { +; CHECK-F32X2-O3-NEXT: .reg .b64 %rd<5>; +; CHECK-F32X2-O3-EMPTY: +; CHECK-F32X2-O3-NEXT: // %bb.0: +; CHECK-F32X2-O3-NEXT: ld.param.b64 %rd1, [test_fma_ftz_param_0]; +; CHECK-F32X2-O3-NEXT: ld.param.b64 %rd2, [test_fma_ftz_param_1]; +; CHECK-F32X2-O3-NEXT: ld.param.b64 %rd3, [test_fma_ftz_param_2]; +; CHECK-F32X2-O3-NEXT: fma.rn.ftz.f32x2 %rd4, %rd1, %rd2, %rd3; +; CHECK-F32X2-O3-NEXT: st.param.b64 [func_retval0], %rd4; +; CHECK-F32X2-O3-NEXT: ret; %r = call <2 x float> @llvm.fma(<2 x float> %a, <2 x float> %b, <2 x float> %c) ret <2 x float> %r } @@ -790,6 +1116,18 @@ define <2 x float> @test_fdiv_ftz(<2 x float> %a, <2 x float> %b) #2 { ; CHECK-F32X2-NEXT: div.rn.ftz.f32 %r6, %r3, %r1; ; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5}; ; CHECK-F32X2-NEXT: ret; +; +; CHECK-F32X2-O3-LABEL: test_fdiv_ftz( +; CHECK-F32X2-O3: { +; CHECK-F32X2-O3-NEXT: .reg .b32 %r<7>; +; CHECK-F32X2-O3-EMPTY: +; CHECK-F32X2-O3-NEXT: // %bb.0: +; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fdiv_ftz_param_0]; +; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fdiv_ftz_param_1]; +; CHECK-F32X2-O3-NEXT: div.rn.ftz.f32 %r5, %r2, %r4; +; CHECK-F32X2-O3-NEXT: div.rn.ftz.f32 %r6, %r1, %r3; +; CHECK-F32X2-O3-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5}; +; CHECK-F32X2-O3-NEXT: ret; %r = fdiv <2 x float> %a, %b ret <2 x float> %r } @@ -843,6 +1181,29 @@ define <2 x float> @test_frem_ftz(<2 x float> %a, <2 x float> %b) #2 { ; CHECK-F32X2-NEXT: selp.f32 %r14, %r3, %r13, %p2; ; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r14, %r9}; ; CHECK-F32X2-NEXT: ret; +; +; CHECK-F32X2-O3-LABEL: test_frem_ftz( +; CHECK-F32X2-O3: { +; CHECK-F32X2-O3-NEXT: .reg .pred %p<3>; +; CHECK-F32X2-O3-NEXT: .reg .b32 %r<15>; +; CHECK-F32X2-O3-EMPTY: +; CHECK-F32X2-O3-NEXT: // %bb.0: +; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_frem_ftz_param_0]; +; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_frem_ftz_param_1]; +; CHECK-F32X2-O3-NEXT: div.rn.ftz.f32 %r5, %r2, %r4; +; CHECK-F32X2-O3-NEXT: cvt.rzi.ftz.f32.f32 %r6, %r5; +; CHECK-F32X2-O3-NEXT: neg.ftz.f32 %r7, %r6; +; CHECK-F32X2-O3-NEXT: fma.rn.ftz.f32 %r8, %r7, %r4, %r2; +; CHECK-F32X2-O3-NEXT: testp.infinite.f32 %p1, %r4; +; CHECK-F32X2-O3-NEXT: selp.f32 %r9, %r2, %r8, %p1; +; CHECK-F32X2-O3-NEXT: div.rn.ftz.f32 %r10, %r1, %r3; +; CHECK-F32X2-O3-NEXT: cvt.rzi.ftz.f32.f32 %r11, %r10; +; CHECK-F32X2-O3-NEXT: neg.ftz.f32 %r12, %r11; +; CHECK-F32X2-O3-NEXT: fma.rn.ftz.f32 %r13, %r12, %r3, %r1; +; CHECK-F32X2-O3-NEXT: testp.infinite.f32 %p2, %r3; +; CHECK-F32X2-O3-NEXT: selp.f32 %r14, %r1, %r13, %p2; +; CHECK-F32X2-O3-NEXT: st.param.v2.b32 [func_retval0], {%r14, %r9}; +; CHECK-F32X2-O3-NEXT: ret; %r = frem <2 x float> %a, %b ret <2 x float> %r } @@ -870,25 +1231,64 @@ define void @test_ldst_v2f32(ptr %a, ptr %b) #0 { ; CHECK-F32X2-NEXT: ld.b64 %rd3, [%rd1]; ; CHECK-F32X2-NEXT: st.b64 [%rd2], %rd3; ; CHECK-F32X2-NEXT: ret; +; +; CHECK-F32X2-O3-LABEL: test_ldst_v2f32( +; CHECK-F32X2-O3: { +; CHECK-F32X2-O3-NEXT: .reg .b64 %rd<4>; +; CHECK-F32X2-O3-EMPTY: +; CHECK-F32X2-O3-NEXT: // %bb.0: +; CHECK-F32X2-O3-NEXT: ld.param.b64 %rd1, [test_ldst_v2f32_param_0]; +; CHECK-F32X2-O3-NEXT: ld.b64 %rd2, [%rd1]; +; CHECK-F32X2-O3-NEXT: ld.param.b64 %rd3, [test_ldst_v2f32_param_1]; +; CHECK-F32X2-O3-NEXT: st.b64 [%rd3], %rd2; +; CHECK-F32X2-O3-NEXT: ret; %t1 = load <2 x float>, ptr %a store <2 x float> %t1, ptr %b, align 32 ret void } define void @test_ldst_v3f32(ptr %a, ptr %b) #0 { -; CHECK-LABEL: test_ldst_v3f32( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<2>; -; CHECK-NEXT: .reg .b64 %rd<4>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd2, [test_ldst_v3f32_param_1]; -; CHECK-NEXT: ld.param.b64 %rd1, [test_ldst_v3f32_param_0]; -; CHECK-NEXT: ld.b64 %rd3, [%rd1]; -; CHECK-NEXT: ld.b32 %r1, [%rd1+8]; -; CHECK-NEXT: st.b32 [%rd2+8], %r1; -; CHECK-NEXT: st.b64 [%rd2], %rd3; -; CHECK-NEXT: ret; +; CHECK-NOF32X2-LABEL: test_ldst_v3f32( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .b32 %r<2>; +; CHECK-NOF32X2-NEXT: .reg .b64 %rd<4>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.b64 %rd2, [test_ldst_v3f32_param_1]; +; CHECK-NOF32X2-NEXT: ld.param.b64 %rd1, [test_ldst_v3f32_param_0]; +; CHECK-NOF32X2-NEXT: ld.b64 %rd3, [%rd1]; +; CHECK-NOF32X2-NEXT: ld.b32 %r1, [%rd1+8]; +; CHECK-NOF32X2-NEXT: st.b32 [%rd2+8], %r1; +; CHECK-NOF32X2-NEXT: st.b64 [%rd2], %rd3; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_ldst_v3f32( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .b32 %r<2>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<4>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd2, [test_ldst_v3f32_param_1]; +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_ldst_v3f32_param_0]; +; CHECK-F32X2-NEXT: ld.b64 %rd3, [%rd1]; +; CHECK-F32X2-NEXT: ld.b32 %r1, [%rd1+8]; +; CHECK-F32X2-NEXT: st.b32 [%rd2+8], %r1; +; CHECK-F32X2-NEXT: st.b64 [%rd2], %rd3; +; CHECK-F32X2-NEXT: ret; +; +; CHECK-F32X2-O3-LABEL: test_ldst_v3f32( +; CHECK-F32X2-O3: { +; CHECK-F32X2-O3-NEXT: .reg .b32 %r<2>; +; CHECK-F32X2-O3-NEXT: .reg .b64 %rd<4>; +; CHECK-F32X2-O3-EMPTY: +; CHECK-F32X2-O3-NEXT: // %bb.0: +; CHECK-F32X2-O3-NEXT: ld.param.b64 %rd1, [test_ldst_v3f32_param_0]; +; CHECK-F32X2-O3-NEXT: ld.b64 %rd2, [%rd1]; +; CHECK-F32X2-O3-NEXT: ld.b32 %r1, [%rd1+8]; +; CHECK-F32X2-O3-NEXT: ld.param.b64 %rd3, [test_ldst_v3f32_param_1]; +; CHECK-F32X2-O3-NEXT: st.b32 [%rd3+8], %r1; +; CHECK-F32X2-O3-NEXT: st.b64 [%rd3], %rd2; +; CHECK-F32X2-O3-NEXT: ret; %t1 = load <3 x float>, ptr %a store <3 x float> %t1, ptr %b, align 32 ret void @@ -917,6 +1317,17 @@ define void @test_ldst_v4f32(ptr %a, ptr %b) #0 { ; CHECK-F32X2-NEXT: ld.v2.b64 {%rd3, %rd4}, [%rd1]; ; CHECK-F32X2-NEXT: st.v2.b64 [%rd2], {%rd3, %rd4}; ; CHECK-F32X2-NEXT: ret; +; +; CHECK-F32X2-O3-LABEL: test_ldst_v4f32( +; CHECK-F32X2-O3: { +; CHECK-F32X2-O3-NEXT: .reg .b64 %rd<5>; +; CHECK-F32X2-O3-EMPTY: +; CHECK-F32X2-O3-NEXT: // %bb.0: +; CHECK-F32X2-O3-NEXT: ld.param.b64 %rd1, [test_ldst_v4f32_param_0]; +; CHECK-F32X2-O3-NEXT: ld.v2.b64 {%rd2, %rd3}, [%rd1]; +; CHECK-F32X2-O3-NEXT: ld.param.b64 %rd4, [test_ldst_v4f32_param_1]; +; CHECK-F32X2-O3-NEXT: st.v2.b64 [%rd4], {%rd2, %rd3}; +; CHECK-F32X2-O3-NEXT: ret; %t1 = load <4 x float>, ptr %a store <4 x float> %t1, ptr %b, align 32 ret void @@ -949,6 +1360,19 @@ define void @test_ldst_v8f32(ptr %a, ptr %b) #0 { ; CHECK-F32X2-NEXT: st.v2.b64 [%rd2+16], {%rd5, %rd6}; ; CHECK-F32X2-NEXT: st.v2.b64 [%rd2], {%rd3, %rd4}; ; CHECK-F32X2-NEXT: ret; +; +; CHECK-F32X2-O3-LABEL: test_ldst_v8f32( +; CHECK-F32X2-O3: { +; CHECK-F32X2-O3-NEXT: .reg .b64 %rd<7>; +; CHECK-F32X2-O3-EMPTY: +; CHECK-F32X2-O3-NEXT: // %bb.0: +; CHECK-F32X2-O3-NEXT: ld.param.b64 %rd1, [test_ldst_v8f32_param_0]; +; CHECK-F32X2-O3-NEXT: ld.v2.b64 {%rd2, %rd3}, [%rd1]; +; CHECK-F32X2-O3-NEXT: ld.v2.b64 {%rd4, %rd5}, [%rd1+16]; +; CHECK-F32X2-O3-NEXT: ld.param.b64 %rd6, [test_ldst_v8f32_param_1]; +; CHECK-F32X2-O3-NEXT: st.v2.b64 [%rd6+16], {%rd4, %rd5}; +; CHECK-F32X2-O3-NEXT: st.v2.b64 [%rd6], {%rd2, %rd3}; +; CHECK-F32X2-O3-NEXT: ret; %t1 = load <8 x float>, ptr %a store <8 x float> %t1, ptr %b, align 32 ret void @@ -994,6 +1418,25 @@ define <2 x float> @test_call(<2 x float> %a, <2 x float> %b) #0 { ; CHECK-F32X2-NEXT: } // callseq 0 ; CHECK-F32X2-NEXT: st.param.b64 [func_retval0], %rd3; ; CHECK-F32X2-NEXT: ret; +; +; CHECK-F32X2-O3-LABEL: test_call( +; CHECK-F32X2-O3: { +; CHECK-F32X2-O3-NEXT: .reg .b64 %rd<4>; +; CHECK-F32X2-O3-EMPTY: +; CHECK-F32X2-O3-NEXT: // %bb.0: +; CHECK-F32X2-O3-NEXT: ld.param.b64 %rd1, [test_call_param_0]; +; CHECK-F32X2-O3-NEXT: { // callseq 0, 0 +; CHECK-F32X2-O3-NEXT: .param .align 8 .b8 param0[8]; +; CHECK-F32X2-O3-NEXT: .param .align 8 .b8 param1[8]; +; CHECK-F32X2-O3-NEXT: .param .align 8 .b8 retval0[8]; +; CHECK-F32X2-O3-NEXT: ld.param.b64 %rd2, [test_call_param_1]; +; CHECK-F32X2-O3-NEXT: st.param.b64 [param1], %rd2; +; CHECK-F32X2-O3-NEXT: st.param.b64 [param0], %rd1; +; CHECK-F32X2-O3-NEXT: call.uni (retval0), test_callee, (param0, param1); +; CHECK-F32X2-O3-NEXT: ld.param.b64 %rd3, [retval0]; +; CHECK-F32X2-O3-NEXT: } // callseq 0 +; CHECK-F32X2-O3-NEXT: st.param.b64 [func_retval0], %rd3; +; CHECK-F32X2-O3-NEXT: ret; %r = call <2 x float> @test_callee(<2 x float> %a, <2 x float> %b) ret <2 x float> %r } @@ -1036,6 +1479,25 @@ define <2 x float> @test_call_flipped(<2 x float> %a, <2 x float> %b) #0 { ; CHECK-F32X2-NEXT: } // callseq 1 ; CHECK-F32X2-NEXT: st.param.b64 [func_retval0], %rd3; ; CHECK-F32X2-NEXT: ret; +; +; CHECK-F32X2-O3-LABEL: test_call_flipped( +; CHECK-F32X2-O3: { +; CHECK-F32X2-O3-NEXT: .reg .b64 %rd<4>; +; CHECK-F32X2-O3-EMPTY: +; CHECK-F32X2-O3-NEXT: // %bb.0: +; CHECK-F32X2-O3-NEXT: ld.param.b64 %rd1, [test_call_flipped_param_0]; +; CHECK-F32X2-O3-NEXT: { // callseq 1, 0 +; CHECK-F32X2-O3-NEXT: .param .align 8 .b8 param0[8]; +; CHECK-F32X2-O3-NEXT: .param .align 8 .b8 param1[8]; +; CHECK-F32X2-O3-NEXT: .param .align 8 .b8 retval0[8]; +; CHECK-F32X2-O3-NEXT: st.param.b64 [param1], %rd1; +; CHECK-F32X2-O3-NEXT: ld.param.b64 %rd2, [test_call_flipped_param_1]; +; CHECK-F32X2-O3-NEXT: st.param.b64 [param0], %rd2; +; CHECK-F32X2-O3-NEXT: call.uni (retval0), test_callee, (param0, param1); +; CHECK-F32X2-O3-NEXT: ld.param.b64 %rd3, [retval0]; +; CHECK-F32X2-O3-NEXT: } // callseq 1 +; CHECK-F32X2-O3-NEXT: st.param.b64 [func_retval0], %rd3; +; CHECK-F32X2-O3-NEXT: ret; %r = call <2 x float> @test_callee(<2 x float> %b, <2 x float> %a) ret <2 x float> %r } @@ -1078,6 +1540,25 @@ define <2 x float> @test_tailcall_flipped(<2 x float> %a, <2 x float> %b) #0 { ; CHECK-F32X2-NEXT: } // callseq 2 ; CHECK-F32X2-NEXT: st.param.b64 [func_retval0], %rd3; ; CHECK-F32X2-NEXT: ret; +; +; CHECK-F32X2-O3-LABEL: test_tailcall_flipped( +; CHECK-F32X2-O3: { +; CHECK-F32X2-O3-NEXT: .reg .b64 %rd<4>; +; CHECK-F32X2-O3-EMPTY: +; CHECK-F32X2-O3-NEXT: // %bb.0: +; CHECK-F32X2-O3-NEXT: ld.param.b64 %rd1, [test_tailcall_flipped_param_0]; +; CHECK-F32X2-O3-NEXT: { // callseq 2, 0 +; CHECK-F32X2-O3-NEXT: .param .align 8 .b8 param0[8]; +; CHECK-F32X2-O3-NEXT: .param .align 8 .b8 param1[8]; +; CHECK-F32X2-O3-NEXT: .param .align 8 .b8 retval0[8]; +; CHECK-F32X2-O3-NEXT: st.param.b64 [param1], %rd1; +; CHECK-F32X2-O3-NEXT: ld.param.b64 %rd2, [test_tailcall_flipped_param_1]; +; CHECK-F32X2-O3-NEXT: st.param.b64 [param0], %rd2; +; CHECK-F32X2-O3-NEXT: call.uni (retval0), test_callee, (param0, param1); +; CHECK-F32X2-O3-NEXT: ld.param.b64 %rd3, [retval0]; +; CHECK-F32X2-O3-NEXT: } // callseq 2 +; CHECK-F32X2-O3-NEXT: st.param.b64 [func_retval0], %rd3; +; CHECK-F32X2-O3-NEXT: ret; %r = tail call <2 x float> @test_callee(<2 x float> %b, <2 x float> %a) ret <2 x float> %r } @@ -1115,6 +1596,22 @@ define <2 x float> @test_select(<2 x float> %a, <2 x float> %b, i1 zeroext %c) # ; CHECK-F32X2-NEXT: selp.b64 %rd3, %rd1, %rd2, %p1; ; CHECK-F32X2-NEXT: st.param.b64 [func_retval0], %rd3; ; CHECK-F32X2-NEXT: ret; +; +; CHECK-F32X2-O3-LABEL: test_select( +; CHECK-F32X2-O3: { +; CHECK-F32X2-O3-NEXT: .reg .pred %p<2>; +; CHECK-F32X2-O3-NEXT: .reg .b16 %rs<3>; +; CHECK-F32X2-O3-NEXT: .reg .b64 %rd<4>; +; CHECK-F32X2-O3-EMPTY: +; CHECK-F32X2-O3-NEXT: // %bb.0: +; CHECK-F32X2-O3-NEXT: ld.param.b8 %rs1, [test_select_param_2]; +; CHECK-F32X2-O3-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-F32X2-O3-NEXT: setp.ne.b16 %p1, %rs2, 0; +; CHECK-F32X2-O3-NEXT: ld.param.b64 %rd1, [test_select_param_0]; +; CHECK-F32X2-O3-NEXT: ld.param.b64 %rd2, [test_select_param_1]; +; CHECK-F32X2-O3-NEXT: selp.b64 %rd3, %rd1, %rd2, %p1; +; CHECK-F32X2-O3-NEXT: st.param.b64 [func_retval0], %rd3; +; CHECK-F32X2-O3-NEXT: ret; %r = select i1 %c, <2 x float> %a, <2 x float> %b ret <2 x float> %r } @@ -1158,6 +1655,23 @@ define <2 x float> @test_select_cc(<2 x float> %a, <2 x float> %b, <2 x float> % ; CHECK-F32X2-NEXT: selp.f32 %r10, %r7, %r5, %p1; ; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r10, %r9}; ; CHECK-F32X2-NEXT: ret; +; +; CHECK-F32X2-O3-LABEL: test_select_cc( +; CHECK-F32X2-O3: { +; CHECK-F32X2-O3-NEXT: .reg .pred %p<3>; +; CHECK-F32X2-O3-NEXT: .reg .b32 %r<11>; +; CHECK-F32X2-O3-EMPTY: +; CHECK-F32X2-O3-NEXT: // %bb.0: +; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_select_cc_param_0]; +; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_select_cc_param_2]; +; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r5, %r6}, [test_select_cc_param_3]; +; CHECK-F32X2-O3-NEXT: setp.neu.f32 %p1, %r3, %r5; +; CHECK-F32X2-O3-NEXT: setp.neu.f32 %p2, %r4, %r6; +; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r7, %r8}, [test_select_cc_param_1]; +; CHECK-F32X2-O3-NEXT: selp.f32 %r9, %r2, %r8, %p2; +; CHECK-F32X2-O3-NEXT: selp.f32 %r10, %r1, %r7, %p1; +; CHECK-F32X2-O3-NEXT: st.param.v2.b32 [func_retval0], {%r10, %r9}; +; CHECK-F32X2-O3-NEXT: ret; %cc = fcmp une <2 x float> %c, %d %r = select <2 x i1> %cc, <2 x float> %a, <2 x float> %b ret <2 x float> %r @@ -1201,6 +1715,24 @@ define <2 x double> @test_select_cc_f64_f32(<2 x double> %a, <2 x double> %b, <2 ; CHECK-F32X2-NEXT: selp.f64 %rd8, %rd1, %rd3, %p1; ; CHECK-F32X2-NEXT: st.param.v2.b64 [func_retval0], {%rd8, %rd7}; ; CHECK-F32X2-NEXT: ret; +; +; CHECK-F32X2-O3-LABEL: test_select_cc_f64_f32( +; CHECK-F32X2-O3: { +; CHECK-F32X2-O3-NEXT: .reg .pred %p<3>; +; CHECK-F32X2-O3-NEXT: .reg .b32 %r<5>; +; CHECK-F32X2-O3-NEXT: .reg .b64 %rd<7>; +; CHECK-F32X2-O3-EMPTY: +; CHECK-F32X2-O3-NEXT: // %bb.0: +; CHECK-F32X2-O3-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_select_cc_f64_f32_param_0]; +; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_select_cc_f64_f32_param_2]; +; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_select_cc_f64_f32_param_3]; +; CHECK-F32X2-O3-NEXT: setp.neu.f32 %p1, %r1, %r3; +; CHECK-F32X2-O3-NEXT: setp.neu.f32 %p2, %r2, %r4; +; CHECK-F32X2-O3-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [test_select_cc_f64_f32_param_1]; +; CHECK-F32X2-O3-NEXT: selp.f64 %rd5, %rd2, %rd4, %p2; +; CHECK-F32X2-O3-NEXT: selp.f64 %rd6, %rd1, %rd3, %p1; +; CHECK-F32X2-O3-NEXT: st.param.v2.b64 [func_retval0], {%rd6, %rd5}; +; CHECK-F32X2-O3-NEXT: ret; %cc = fcmp une <2 x float> %c, %d %r = select <2 x i1> %cc, <2 x double> %a, <2 x double> %b ret <2 x double> %r @@ -1244,6 +1776,24 @@ define <2 x float> @test_select_cc_f32_f64(<2 x float> %a, <2 x float> %b, <2 x ; CHECK-F32X2-NEXT: selp.f32 %r6, %r3, %r1, %p1; ; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5}; ; CHECK-F32X2-NEXT: ret; +; +; CHECK-F32X2-O3-LABEL: test_select_cc_f32_f64( +; CHECK-F32X2-O3: { +; CHECK-F32X2-O3-NEXT: .reg .pred %p<3>; +; CHECK-F32X2-O3-NEXT: .reg .b32 %r<7>; +; CHECK-F32X2-O3-NEXT: .reg .b64 %rd<5>; +; CHECK-F32X2-O3-EMPTY: +; CHECK-F32X2-O3-NEXT: // %bb.0: +; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_select_cc_f32_f64_param_0]; +; CHECK-F32X2-O3-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_select_cc_f32_f64_param_2]; +; CHECK-F32X2-O3-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [test_select_cc_f32_f64_param_3]; +; CHECK-F32X2-O3-NEXT: setp.neu.f64 %p1, %rd1, %rd3; +; CHECK-F32X2-O3-NEXT: setp.neu.f64 %p2, %rd2, %rd4; +; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_select_cc_f32_f64_param_1]; +; CHECK-F32X2-O3-NEXT: selp.f32 %r5, %r2, %r4, %p2; +; CHECK-F32X2-O3-NEXT: selp.f32 %r6, %r1, %r3, %p1; +; CHECK-F32X2-O3-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5}; +; CHECK-F32X2-O3-NEXT: ret; %cc = fcmp une <2 x double> %c, %d %r = select <2 x i1> %cc, <2 x float> %a, <2 x float> %b ret <2 x float> %r @@ -1286,6 +1836,23 @@ define <2 x i1> @test_fcmp_une(<2 x float> %a, <2 x float> %b) #0 { ; CHECK-F32X2-NEXT: selp.b16 %rs2, -1, 0, %p1; ; CHECK-F32X2-NEXT: st.param.b8 [func_retval0+1], %rs2; ; CHECK-F32X2-NEXT: ret; +; +; CHECK-F32X2-O3-LABEL: test_fcmp_une( +; CHECK-F32X2-O3: { +; CHECK-F32X2-O3-NEXT: .reg .pred %p<3>; +; CHECK-F32X2-O3-NEXT: .reg .b16 %rs<3>; +; CHECK-F32X2-O3-NEXT: .reg .b32 %r<5>; +; CHECK-F32X2-O3-EMPTY: +; CHECK-F32X2-O3-NEXT: // %bb.0: +; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_une_param_0]; +; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_une_param_1]; +; CHECK-F32X2-O3-NEXT: setp.neu.f32 %p1, %r1, %r3; +; CHECK-F32X2-O3-NEXT: setp.neu.f32 %p2, %r2, %r4; +; CHECK-F32X2-O3-NEXT: selp.b16 %rs1, -1, 0, %p2; +; CHECK-F32X2-O3-NEXT: st.param.b8 [func_retval0+1], %rs1; +; CHECK-F32X2-O3-NEXT: selp.b16 %rs2, -1, 0, %p1; +; CHECK-F32X2-O3-NEXT: st.param.b8 [func_retval0], %rs2; +; CHECK-F32X2-O3-NEXT: ret; %r = fcmp une <2 x float> %a, %b ret <2 x i1> %r } @@ -1327,6 +1894,23 @@ define <2 x i1> @test_fcmp_ueq(<2 x float> %a, <2 x float> %b) #0 { ; CHECK-F32X2-NEXT: selp.b16 %rs2, -1, 0, %p1; ; CHECK-F32X2-NEXT: st.param.b8 [func_retval0+1], %rs2; ; CHECK-F32X2-NEXT: ret; +; +; CHECK-F32X2-O3-LABEL: test_fcmp_ueq( +; CHECK-F32X2-O3: { +; CHECK-F32X2-O3-NEXT: .reg .pred %p<3>; +; CHECK-F32X2-O3-NEXT: .reg .b16 %rs<3>; +; CHECK-F32X2-O3-NEXT: .reg .b32 %r<5>; +; CHECK-F32X2-O3-EMPTY: +; CHECK-F32X2-O3-NEXT: // %bb.0: +; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_ueq_param_0]; +; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_ueq_param_1]; +; CHECK-F32X2-O3-NEXT: setp.equ.f32 %p1, %r1, %r3; +; CHECK-F32X2-O3-NEXT: setp.equ.f32 %p2, %r2, %r4; +; CHECK-F32X2-O3-NEXT: selp.b16 %rs1, -1, 0, %p2; +; CHECK-F32X2-O3-NEXT: st.param.b8 [func_retval0+1], %rs1; +; CHECK-F32X2-O3-NEXT: selp.b16 %rs2, -1, 0, %p1; +; CHECK-F32X2-O3-NEXT: st.param.b8 [func_retval0], %rs2; +; CHECK-F32X2-O3-NEXT: ret; %r = fcmp ueq <2 x float> %a, %b ret <2 x i1> %r } @@ -1368,6 +1952,23 @@ define <2 x i1> @test_fcmp_ugt(<2 x float> %a, <2 x float> %b) #0 { ; CHECK-F32X2-NEXT: selp.b16 %rs2, -1, 0, %p1; ; CHECK-F32X2-NEXT: st.param.b8 [func_retval0+1], %rs2; ; CHECK-F32X2-NEXT: ret; +; +; CHECK-F32X2-O3-LABEL: test_fcmp_ugt( +; CHECK-F32X2-O3: { +; CHECK-F32X2-O3-NEXT: .reg .pred %p<3>; +; CHECK-F32X2-O3-NEXT: .reg .b16 %rs<3>; +; CHECK-F32X2-O3-NEXT: .reg .b32 %r<5>; +; CHECK-F32X2-O3-EMPTY: +; CHECK-F32X2-O3-NEXT: // %bb.0: +; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_ugt_param_0]; +; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_ugt_param_1]; +; CHECK-F32X2-O3-NEXT: setp.gtu.f32 %p1, %r1, %r3; +; CHECK-F32X2-O3-NEXT: setp.gtu.f32 %p2, %r2, %r4; +; CHECK-F32X2-O3-NEXT: selp.b16 %rs1, -1, 0, %p2; +; CHECK-F32X2-O3-NEXT: st.param.b8 [func_retval0+1], %rs1; +; CHECK-F32X2-O3-NEXT: selp.b16 %rs2, -1, 0, %p1; +; CHECK-F32X2-O3-NEXT: st.param.b8 [func_retval0], %rs2; +; CHECK-F32X2-O3-NEXT: ret; %r = fcmp ugt <2 x float> %a, %b ret <2 x i1> %r } @@ -1409,6 +2010,23 @@ define <2 x i1> @test_fcmp_uge(<2 x float> %a, <2 x float> %b) #0 { ; CHECK-F32X2-NEXT: selp.b16 %rs2, -1, 0, %p1; ; CHECK-F32X2-NEXT: st.param.b8 [func_retval0+1], %rs2; ; CHECK-F32X2-NEXT: ret; +; +; CHECK-F32X2-O3-LABEL: test_fcmp_uge( +; CHECK-F32X2-O3: { +; CHECK-F32X2-O3-NEXT: .reg .pred %p<3>; +; CHECK-F32X2-O3-NEXT: .reg .b16 %rs<3>; +; CHECK-F32X2-O3-NEXT: .reg .b32 %r<5>; +; CHECK-F32X2-O3-EMPTY: +; CHECK-F32X2-O3-NEXT: // %bb.0: +; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_uge_param_0]; +; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_uge_param_1]; +; CHECK-F32X2-O3-NEXT: setp.geu.f32 %p1, %r1, %r3; +; CHECK-F32X2-O3-NEXT: setp.geu.f32 %p2, %r2, %r4; +; CHECK-F32X2-O3-NEXT: selp.b16 %rs1, -1, 0, %p2; +; CHECK-F32X2-O3-NEXT: st.param.b8 [func_retval0+1], %rs1; +; CHECK-F32X2-O3-NEXT: selp.b16 %rs2, -1, 0, %p1; +; CHECK-F32X2-O3-NEXT: st.param.b8 [func_retval0], %rs2; +; CHECK-F32X2-O3-NEXT: ret; %r = fcmp uge <2 x float> %a, %b ret <2 x i1> %r } @@ -1450,6 +2068,23 @@ define <2 x i1> @test_fcmp_ult(<2 x float> %a, <2 x float> %b) #0 { ; CHECK-F32X2-NEXT: selp.b16 %rs2, -1, 0, %p1; ; CHECK-F32X2-NEXT: st.param.b8 [func_retval0+1], %rs2; ; CHECK-F32X2-NEXT: ret; +; +; CHECK-F32X2-O3-LABEL: test_fcmp_ult( +; CHECK-F32X2-O3: { +; CHECK-F32X2-O3-NEXT: .reg .pred %p<3>; +; CHECK-F32X2-O3-NEXT: .reg .b16 %rs<3>; +; CHECK-F32X2-O3-NEXT: .reg .b32 %r<5>; +; CHECK-F32X2-O3-EMPTY: +; CHECK-F32X2-O3-NEXT: // %bb.0: +; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_ult_param_0]; +; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_ult_param_1]; +; CHECK-F32X2-O3-NEXT: setp.ltu.f32 %p1, %r1, %r3; +; CHECK-F32X2-O3-NEXT: setp.ltu.f32 %p2, %r2, %r4; +; CHECK-F32X2-O3-NEXT: selp.b16 %rs1, -1, 0, %p2; +; CHECK-F32X2-O3-NEXT: st.param.b8 [func_retval0+1], %rs1; +; CHECK-F32X2-O3-NEXT: selp.b16 %rs2, -1, 0, %p1; +; CHECK-F32X2-O3-NEXT: st.param.b8 [func_retval0], %rs2; +; CHECK-F32X2-O3-NEXT: ret; %r = fcmp ult <2 x float> %a, %b ret <2 x i1> %r } @@ -1491,6 +2126,23 @@ define <2 x i1> @test_fcmp_ule(<2 x float> %a, <2 x float> %b) #0 { ; CHECK-F32X2-NEXT: selp.b16 %rs2, -1, 0, %p1; ; CHECK-F32X2-NEXT: st.param.b8 [func_retval0+1], %rs2; ; CHECK-F32X2-NEXT: ret; +; +; CHECK-F32X2-O3-LABEL: test_fcmp_ule( +; CHECK-F32X2-O3: { +; CHECK-F32X2-O3-NEXT: .reg .pred %p<3>; +; CHECK-F32X2-O3-NEXT: .reg .b16 %rs<3>; +; CHECK-F32X2-O3-NEXT: .reg .b32 %r<5>; +; CHECK-F32X2-O3-EMPTY: +; CHECK-F32X2-O3-NEXT: // %bb.0: +; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_ule_param_0]; +; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_ule_param_1]; +; CHECK-F32X2-O3-NEXT: setp.leu.f32 %p1, %r1, %r3; +; CHECK-F32X2-O3-NEXT: setp.leu.f32 %p2, %r2, %r4; +; CHECK-F32X2-O3-NEXT: selp.b16 %rs1, -1, 0, %p2; +; CHECK-F32X2-O3-NEXT: st.param.b8 [func_retval0+1], %rs1; +; CHECK-F32X2-O3-NEXT: selp.b16 %rs2, -1, 0, %p1; +; CHECK-F32X2-O3-NEXT: st.param.b8 [func_retval0], %rs2; +; CHECK-F32X2-O3-NEXT: ret; %r = fcmp ule <2 x float> %a, %b ret <2 x i1> %r } @@ -1532,6 +2184,23 @@ define <2 x i1> @test_fcmp_uno(<2 x float> %a, <2 x float> %b) #0 { ; CHECK-F32X2-NEXT: selp.b16 %rs2, -1, 0, %p1; ; CHECK-F32X2-NEXT: st.param.b8 [func_retval0+1], %rs2; ; CHECK-F32X2-NEXT: ret; +; +; CHECK-F32X2-O3-LABEL: test_fcmp_uno( +; CHECK-F32X2-O3: { +; CHECK-F32X2-O3-NEXT: .reg .pred %p<3>; +; CHECK-F32X2-O3-NEXT: .reg .b16 %rs<3>; +; CHECK-F32X2-O3-NEXT: .reg .b32 %r<5>; +; CHECK-F32X2-O3-EMPTY: +; CHECK-F32X2-O3-NEXT: // %bb.0: +; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_uno_param_0]; +; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_uno_param_1]; +; CHECK-F32X2-O3-NEXT: setp.nan.f32 %p1, %r1, %r3; +; CHECK-F32X2-O3-NEXT: setp.nan.f32 %p2, %r2, %r4; +; CHECK-F32X2-O3-NEXT: selp.b16 %rs1, -1, 0, %p2; +; CHECK-F32X2-O3-NEXT: st.param.b8 [func_retval0+1], %rs1; +; CHECK-F32X2-O3-NEXT: selp.b16 %rs2, -1, 0, %p1; +; CHECK-F32X2-O3-NEXT: st.param.b8 [func_retval0], %rs2; +; CHECK-F32X2-O3-NEXT: ret; %r = fcmp uno <2 x float> %a, %b ret <2 x i1> %r } @@ -1573,6 +2242,23 @@ define <2 x i1> @test_fcmp_one(<2 x float> %a, <2 x float> %b) #0 { ; CHECK-F32X2-NEXT: selp.b16 %rs2, -1, 0, %p1; ; CHECK-F32X2-NEXT: st.param.b8 [func_retval0+1], %rs2; ; CHECK-F32X2-NEXT: ret; +; +; CHECK-F32X2-O3-LABEL: test_fcmp_one( +; CHECK-F32X2-O3: { +; CHECK-F32X2-O3-NEXT: .reg .pred %p<3>; +; CHECK-F32X2-O3-NEXT: .reg .b16 %rs<3>; +; CHECK-F32X2-O3-NEXT: .reg .b32 %r<5>; +; CHECK-F32X2-O3-EMPTY: +; CHECK-F32X2-O3-NEXT: // %bb.0: +; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_one_param_0]; +; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_one_param_1]; +; CHECK-F32X2-O3-NEXT: setp.ne.f32 %p1, %r1, %r3; +; CHECK-F32X2-O3-NEXT: setp.ne.f32 %p2, %r2, %r4; +; CHECK-F32X2-O3-NEXT: selp.b16 %rs1, -1, 0, %p2; +; CHECK-F32X2-O3-NEXT: st.param.b8 [func_retval0+1], %rs1; +; CHECK-F32X2-O3-NEXT: selp.b16 %rs2, -1, 0, %p1; +; CHECK-F32X2-O3-NEXT: st.param.b8 [func_retval0], %rs2; +; CHECK-F32X2-O3-NEXT: ret; %r = fcmp one <2 x float> %a, %b ret <2 x i1> %r } @@ -1614,6 +2300,23 @@ define <2 x i1> @test_fcmp_oeq(<2 x float> %a, <2 x float> %b) #0 { ; CHECK-F32X2-NEXT: selp.b16 %rs2, -1, 0, %p1; ; CHECK-F32X2-NEXT: st.param.b8 [func_retval0+1], %rs2; ; CHECK-F32X2-NEXT: ret; +; +; CHECK-F32X2-O3-LABEL: test_fcmp_oeq( +; CHECK-F32X2-O3: { +; CHECK-F32X2-O3-NEXT: .reg .pred %p<3>; +; CHECK-F32X2-O3-NEXT: .reg .b16 %rs<3>; +; CHECK-F32X2-O3-NEXT: .reg .b32 %r<5>; +; CHECK-F32X2-O3-EMPTY: +; CHECK-F32X2-O3-NEXT: // %bb.0: +; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_oeq_param_0]; +; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_oeq_param_1]; +; CHECK-F32X2-O3-NEXT: setp.eq.f32 %p1, %r1, %r3; +; CHECK-F32X2-O3-NEXT: setp.eq.f32 %p2, %r2, %r4; +; CHECK-F32X2-O3-NEXT: selp.b16 %rs1, -1, 0, %p2; +; CHECK-F32X2-O3-NEXT: st.param.b8 [func_retval0+1], %rs1; +; CHECK-F32X2-O3-NEXT: selp.b16 %rs2, -1, 0, %p1; +; CHECK-F32X2-O3-NEXT: st.param.b8 [func_retval0], %rs2; +; CHECK-F32X2-O3-NEXT: ret; %r = fcmp oeq <2 x float> %a, %b ret <2 x i1> %r } @@ -1655,6 +2358,23 @@ define <2 x i1> @test_fcmp_ogt(<2 x float> %a, <2 x float> %b) #0 { ; CHECK-F32X2-NEXT: selp.b16 %rs2, -1, 0, %p1; ; CHECK-F32X2-NEXT: st.param.b8 [func_retval0+1], %rs2; ; CHECK-F32X2-NEXT: ret; +; +; CHECK-F32X2-O3-LABEL: test_fcmp_ogt( +; CHECK-F32X2-O3: { +; CHECK-F32X2-O3-NEXT: .reg .pred %p<3>; +; CHECK-F32X2-O3-NEXT: .reg .b16 %rs<3>; +; CHECK-F32X2-O3-NEXT: .reg .b32 %r<5>; +; CHECK-F32X2-O3-EMPTY: +; CHECK-F32X2-O3-NEXT: // %bb.0: +; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_ogt_param_0]; +; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_ogt_param_1]; +; CHECK-F32X2-O3-NEXT: setp.gt.f32 %p1, %r1, %r3; +; CHECK-F32X2-O3-NEXT: setp.gt.f32 %p2, %r2, %r4; +; CHECK-F32X2-O3-NEXT: selp.b16 %rs1, -1, 0, %p2; +; CHECK-F32X2-O3-NEXT: st.param.b8 [func_retval0+1], %rs1; +; CHECK-F32X2-O3-NEXT: selp.b16 %rs2, -1, 0, %p1; +; CHECK-F32X2-O3-NEXT: st.param.b8 [func_retval0], %rs2; +; CHECK-F32X2-O3-NEXT: ret; %r = fcmp ogt <2 x float> %a, %b ret <2 x i1> %r } @@ -1696,6 +2416,23 @@ define <2 x i1> @test_fcmp_oge(<2 x float> %a, <2 x float> %b) #0 { ; CHECK-F32X2-NEXT: selp.b16 %rs2, -1, 0, %p1; ; CHECK-F32X2-NEXT: st.param.b8 [func_retval0+1], %rs2; ; CHECK-F32X2-NEXT: ret; +; +; CHECK-F32X2-O3-LABEL: test_fcmp_oge( +; CHECK-F32X2-O3: { +; CHECK-F32X2-O3-NEXT: .reg .pred %p<3>; +; CHECK-F32X2-O3-NEXT: .reg .b16 %rs<3>; +; CHECK-F32X2-O3-NEXT: .reg .b32 %r<5>; +; CHECK-F32X2-O3-EMPTY: +; CHECK-F32X2-O3-NEXT: // %bb.0: +; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_oge_param_0]; +; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_oge_param_1]; +; CHECK-F32X2-O3-NEXT: setp.ge.f32 %p1, %r1, %r3; +; CHECK-F32X2-O3-NEXT: setp.ge.f32 %p2, %r2, %r4; +; CHECK-F32X2-O3-NEXT: selp.b16 %rs1, -1, 0, %p2; +; CHECK-F32X2-O3-NEXT: st.param.b8 [func_retval0+1], %rs1; +; CHECK-F32X2-O3-NEXT: selp.b16 %rs2, -1, 0, %p1; +; CHECK-F32X2-O3-NEXT: st.param.b8 [func_retval0], %rs2; +; CHECK-F32X2-O3-NEXT: ret; %r = fcmp oge <2 x float> %a, %b ret <2 x i1> %r } @@ -1737,6 +2474,23 @@ define <2 x i1> @test_fcmp_olt(<2 x float> %a, <2 x float> %b) #0 { ; CHECK-F32X2-NEXT: selp.b16 %rs2, -1, 0, %p1; ; CHECK-F32X2-NEXT: st.param.b8 [func_retval0+1], %rs2; ; CHECK-F32X2-NEXT: ret; +; +; CHECK-F32X2-O3-LABEL: test_fcmp_olt( +; CHECK-F32X2-O3: { +; CHECK-F32X2-O3-NEXT: .reg .pred %p<3>; +; CHECK-F32X2-O3-NEXT: .reg .b16 %rs<3>; +; CHECK-F32X2-O3-NEXT: .reg .b32 %r<5>; +; CHECK-F32X2-O3-EMPTY: +; CHECK-F32X2-O3-NEXT: // %bb.0: +; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_olt_param_0]; +; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_olt_param_1]; +; CHECK-F32X2-O3-NEXT: setp.lt.f32 %p1, %r1, %r3; +; CHECK-F32X2-O3-NEXT: setp.lt.f32 %p2, %r2, %r4; +; CHECK-F32X2-O3-NEXT: selp.b16 %rs1, -1, 0, %p2; +; CHECK-F32X2-O3-NEXT: st.param.b8 [func_retval0+1], %rs1; +; CHECK-F32X2-O3-NEXT: selp.b16 %rs2, -1, 0, %p1; +; CHECK-F32X2-O3-NEXT: st.param.b8 [func_retval0], %rs2; +; CHECK-F32X2-O3-NEXT: ret; %r = fcmp olt <2 x float> %a, %b ret <2 x i1> %r } @@ -1778,6 +2532,23 @@ define <2 x i1> @test_fcmp_ole(<2 x float> %a, <2 x float> %b) #0 { ; CHECK-F32X2-NEXT: selp.b16 %rs2, -1, 0, %p1; ; CHECK-F32X2-NEXT: st.param.b8 [func_retval0+1], %rs2; ; CHECK-F32X2-NEXT: ret; +; +; CHECK-F32X2-O3-LABEL: test_fcmp_ole( +; CHECK-F32X2-O3: { +; CHECK-F32X2-O3-NEXT: .reg .pred %p<3>; +; CHECK-F32X2-O3-NEXT: .reg .b16 %rs<3>; +; CHECK-F32X2-O3-NEXT: .reg .b32 %r<5>; +; CHECK-F32X2-O3-EMPTY: +; CHECK-F32X2-O3-NEXT: // %bb.0: +; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_ole_param_0]; +; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_ole_param_1]; +; CHECK-F32X2-O3-NEXT: setp.le.f32 %p1, %r1, %r3; +; CHECK-F32X2-O3-NEXT: setp.le.f32 %p2, %r2, %r4; +; CHECK-F32X2-O3-NEXT: selp.b16 %rs1, -1, 0, %p2; +; CHECK-F32X2-O3-NEXT: st.param.b8 [func_retval0+1], %rs1; +; CHECK-F32X2-O3-NEXT: selp.b16 %rs2, -1, 0, %p1; +; CHECK-F32X2-O3-NEXT: st.param.b8 [func_retval0], %rs2; +; CHECK-F32X2-O3-NEXT: ret; %r = fcmp ole <2 x float> %a, %b ret <2 x i1> %r } @@ -1819,6 +2590,23 @@ define <2 x i1> @test_fcmp_ord(<2 x float> %a, <2 x float> %b) #0 { ; CHECK-F32X2-NEXT: selp.b16 %rs2, -1, 0, %p1; ; CHECK-F32X2-NEXT: st.param.b8 [func_retval0+1], %rs2; ; CHECK-F32X2-NEXT: ret; +; +; CHECK-F32X2-O3-LABEL: test_fcmp_ord( +; CHECK-F32X2-O3: { +; CHECK-F32X2-O3-NEXT: .reg .pred %p<3>; +; CHECK-F32X2-O3-NEXT: .reg .b16 %rs<3>; +; CHECK-F32X2-O3-NEXT: .reg .b32 %r<5>; +; CHECK-F32X2-O3-EMPTY: +; CHECK-F32X2-O3-NEXT: // %bb.0: +; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_ord_param_0]; +; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_ord_param_1]; +; CHECK-F32X2-O3-NEXT: setp.num.f32 %p1, %r1, %r3; +; CHECK-F32X2-O3-NEXT: setp.num.f32 %p2, %r2, %r4; +; CHECK-F32X2-O3-NEXT: selp.b16 %rs1, -1, 0, %p2; +; CHECK-F32X2-O3-NEXT: st.param.b8 [func_retval0+1], %rs1; +; CHECK-F32X2-O3-NEXT: selp.b16 %rs2, -1, 0, %p1; +; CHECK-F32X2-O3-NEXT: st.param.b8 [func_retval0], %rs2; +; CHECK-F32X2-O3-NEXT: ret; %r = fcmp ord <2 x float> %a, %b ret <2 x i1> %r } @@ -1847,6 +2635,17 @@ define <2 x i32> @test_fptosi_i32(<2 x float> %a) #0 { ; CHECK-F32X2-NEXT: cvt.rzi.s32.f32 %r4, %r1; ; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; ; CHECK-F32X2-NEXT: ret; +; +; CHECK-F32X2-O3-LABEL: test_fptosi_i32( +; CHECK-F32X2-O3: { +; CHECK-F32X2-O3-NEXT: .reg .b32 %r<5>; +; CHECK-F32X2-O3-EMPTY: +; CHECK-F32X2-O3-NEXT: // %bb.0: +; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fptosi_i32_param_0]; +; CHECK-F32X2-O3-NEXT: cvt.rzi.s32.f32 %r3, %r2; +; CHECK-F32X2-O3-NEXT: cvt.rzi.s32.f32 %r4, %r1; +; CHECK-F32X2-O3-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-F32X2-O3-NEXT: ret; %r = fptosi <2 x float> %a to <2 x i32> ret <2 x i32> %r } @@ -1876,6 +2675,18 @@ define <2 x i64> @test_fptosi_i64(<2 x float> %a) #0 { ; CHECK-F32X2-NEXT: cvt.rzi.s64.f32 %rd3, %r1; ; CHECK-F32X2-NEXT: st.param.v2.b64 [func_retval0], {%rd3, %rd2}; ; CHECK-F32X2-NEXT: ret; +; +; CHECK-F32X2-O3-LABEL: test_fptosi_i64( +; CHECK-F32X2-O3: { +; CHECK-F32X2-O3-NEXT: .reg .b32 %r<3>; +; CHECK-F32X2-O3-NEXT: .reg .b64 %rd<3>; +; CHECK-F32X2-O3-EMPTY: +; CHECK-F32X2-O3-NEXT: // %bb.0: +; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fptosi_i64_param_0]; +; CHECK-F32X2-O3-NEXT: cvt.rzi.s64.f32 %rd1, %r2; +; CHECK-F32X2-O3-NEXT: cvt.rzi.s64.f32 %rd2, %r1; +; CHECK-F32X2-O3-NEXT: st.param.v2.b64 [func_retval0], {%rd2, %rd1}; +; CHECK-F32X2-O3-NEXT: ret; %r = fptosi <2 x float> %a to <2 x i64> ret <2 x i64> %r } @@ -1904,6 +2715,17 @@ define <2 x i32> @test_fptoui_2xi32(<2 x float> %a) #0 { ; CHECK-F32X2-NEXT: cvt.rzi.u32.f32 %r4, %r1; ; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; ; CHECK-F32X2-NEXT: ret; +; +; CHECK-F32X2-O3-LABEL: test_fptoui_2xi32( +; CHECK-F32X2-O3: { +; CHECK-F32X2-O3-NEXT: .reg .b32 %r<5>; +; CHECK-F32X2-O3-EMPTY: +; CHECK-F32X2-O3-NEXT: // %bb.0: +; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fptoui_2xi32_param_0]; +; CHECK-F32X2-O3-NEXT: cvt.rzi.u32.f32 %r3, %r2; +; CHECK-F32X2-O3-NEXT: cvt.rzi.u32.f32 %r4, %r1; +; CHECK-F32X2-O3-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-F32X2-O3-NEXT: ret; %r = fptoui <2 x float> %a to <2 x i32> ret <2 x i32> %r } @@ -1933,6 +2755,18 @@ define <2 x i64> @test_fptoui_2xi64(<2 x float> %a) #0 { ; CHECK-F32X2-NEXT: cvt.rzi.u64.f32 %rd3, %r1; ; CHECK-F32X2-NEXT: st.param.v2.b64 [func_retval0], {%rd3, %rd2}; ; CHECK-F32X2-NEXT: ret; +; +; CHECK-F32X2-O3-LABEL: test_fptoui_2xi64( +; CHECK-F32X2-O3: { +; CHECK-F32X2-O3-NEXT: .reg .b32 %r<3>; +; CHECK-F32X2-O3-NEXT: .reg .b64 %rd<3>; +; CHECK-F32X2-O3-EMPTY: +; CHECK-F32X2-O3-NEXT: // %bb.0: +; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fptoui_2xi64_param_0]; +; CHECK-F32X2-O3-NEXT: cvt.rzi.u64.f32 %rd1, %r2; +; CHECK-F32X2-O3-NEXT: cvt.rzi.u64.f32 %rd2, %r1; +; CHECK-F32X2-O3-NEXT: st.param.v2.b64 [func_retval0], {%rd2, %rd1}; +; CHECK-F32X2-O3-NEXT: ret; %r = fptoui <2 x float> %a to <2 x i64> ret <2 x i64> %r } @@ -1961,6 +2795,17 @@ define <2 x float> @test_uitofp_2xi32(<2 x i32> %a) #0 { ; CHECK-F32X2-NEXT: cvt.rn.f32.u32 %r4, %r1; ; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; ; CHECK-F32X2-NEXT: ret; +; +; CHECK-F32X2-O3-LABEL: test_uitofp_2xi32( +; CHECK-F32X2-O3: { +; CHECK-F32X2-O3-NEXT: .reg .b32 %r<5>; +; CHECK-F32X2-O3-EMPTY: +; CHECK-F32X2-O3-NEXT: // %bb.0: +; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_uitofp_2xi32_param_0]; +; CHECK-F32X2-O3-NEXT: cvt.rn.f32.u32 %r3, %r2; +; CHECK-F32X2-O3-NEXT: cvt.rn.f32.u32 %r4, %r1; +; CHECK-F32X2-O3-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-F32X2-O3-NEXT: ret; %r = uitofp <2 x i32> %a to <2 x float> ret <2 x float> %r } @@ -2005,6 +2850,17 @@ define <2 x float> @test_sitofp_2xi32(<2 x i32> %a) #0 { ; CHECK-F32X2-NEXT: cvt.rn.f32.s32 %r4, %r1; ; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; ; CHECK-F32X2-NEXT: ret; +; +; CHECK-F32X2-O3-LABEL: test_sitofp_2xi32( +; CHECK-F32X2-O3: { +; CHECK-F32X2-O3-NEXT: .reg .b32 %r<5>; +; CHECK-F32X2-O3-EMPTY: +; CHECK-F32X2-O3-NEXT: // %bb.0: +; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_sitofp_2xi32_param_0]; +; CHECK-F32X2-O3-NEXT: cvt.rn.f32.s32 %r3, %r2; +; CHECK-F32X2-O3-NEXT: cvt.rn.f32.s32 %r4, %r1; +; CHECK-F32X2-O3-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-F32X2-O3-NEXT: ret; %r = sitofp <2 x i32> %a to <2 x float> ret <2 x float> %r } @@ -2055,6 +2911,21 @@ define <2 x float> @test_uitofp_2xi32_fadd(<2 x i32> %a, <2 x float> %b) #0 { ; CHECK-F32X2-NEXT: add.rn.f32x2 %rd4, %rd2, %rd3; ; CHECK-F32X2-NEXT: st.param.b64 [func_retval0], %rd4; ; CHECK-F32X2-NEXT: ret; +; +; CHECK-F32X2-O3-LABEL: test_uitofp_2xi32_fadd( +; CHECK-F32X2-O3: { +; CHECK-F32X2-O3-NEXT: .reg .b32 %r<5>; +; CHECK-F32X2-O3-NEXT: .reg .b64 %rd<4>; +; CHECK-F32X2-O3-EMPTY: +; CHECK-F32X2-O3-NEXT: // %bb.0: +; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_uitofp_2xi32_fadd_param_0]; +; CHECK-F32X2-O3-NEXT: cvt.rn.f32.u32 %r3, %r2; +; CHECK-F32X2-O3-NEXT: cvt.rn.f32.u32 %r4, %r1; +; CHECK-F32X2-O3-NEXT: mov.b64 %rd1, {%r4, %r3}; +; CHECK-F32X2-O3-NEXT: ld.param.b64 %rd2, [test_uitofp_2xi32_fadd_param_1]; +; CHECK-F32X2-O3-NEXT: add.rn.f32x2 %rd3, %rd2, %rd1; +; CHECK-F32X2-O3-NEXT: st.param.b64 [func_retval0], %rd3; +; CHECK-F32X2-O3-NEXT: ret; %c = uitofp <2 x i32> %a to <2 x float> %r = fadd <2 x float> %b, %c ret <2 x float> %r @@ -2085,6 +2956,19 @@ define <2 x float> @test_fptrunc_2xdouble(<2 x double> %a) #0 { ; CHECK-F32X2-NEXT: mov.b64 %rd3, {%r2, %r1}; ; CHECK-F32X2-NEXT: st.param.b64 [func_retval0], %rd3; ; CHECK-F32X2-NEXT: ret; +; +; CHECK-F32X2-O3-LABEL: test_fptrunc_2xdouble( +; CHECK-F32X2-O3: { +; CHECK-F32X2-O3-NEXT: .reg .b32 %r<3>; +; CHECK-F32X2-O3-NEXT: .reg .b64 %rd<4>; +; CHECK-F32X2-O3-EMPTY: +; CHECK-F32X2-O3-NEXT: // %bb.0: +; CHECK-F32X2-O3-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_fptrunc_2xdouble_param_0]; +; CHECK-F32X2-O3-NEXT: cvt.rn.f32.f64 %r1, %rd2; +; CHECK-F32X2-O3-NEXT: cvt.rn.f32.f64 %r2, %rd1; +; CHECK-F32X2-O3-NEXT: mov.b64 %rd3, {%r2, %r1}; +; CHECK-F32X2-O3-NEXT: st.param.b64 [func_retval0], %rd3; +; CHECK-F32X2-O3-NEXT: ret; %r = fptrunc <2 x double> %a to <2 x float> ret <2 x float> %r } @@ -2114,6 +2998,18 @@ define <2 x double> @test_fpext_2xdouble(<2 x float> %a) #0 { ; CHECK-F32X2-NEXT: cvt.f64.f32 %rd3, %r1; ; CHECK-F32X2-NEXT: st.param.v2.b64 [func_retval0], {%rd3, %rd2}; ; CHECK-F32X2-NEXT: ret; +; +; CHECK-F32X2-O3-LABEL: test_fpext_2xdouble( +; CHECK-F32X2-O3: { +; CHECK-F32X2-O3-NEXT: .reg .b32 %r<3>; +; CHECK-F32X2-O3-NEXT: .reg .b64 %rd<3>; +; CHECK-F32X2-O3-EMPTY: +; CHECK-F32X2-O3-NEXT: // %bb.0: +; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fpext_2xdouble_param_0]; +; CHECK-F32X2-O3-NEXT: cvt.f64.f32 %rd1, %r2; +; CHECK-F32X2-O3-NEXT: cvt.f64.f32 %rd2, %r1; +; CHECK-F32X2-O3-NEXT: st.param.v2.b64 [func_retval0], {%rd2, %rd1}; +; CHECK-F32X2-O3-NEXT: ret; %r = fpext <2 x float> %a to <2 x double> ret <2 x double> %r } @@ -2136,6 +3032,15 @@ define <2 x i32> @test_bitcast_2xfloat_to_2xi32(<2 x float> %a) #0 { ; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_bitcast_2xfloat_to_2xi32_param_0]; ; CHECK-F32X2-NEXT: st.param.b64 [func_retval0], %rd1; ; CHECK-F32X2-NEXT: ret; +; +; CHECK-F32X2-O3-LABEL: test_bitcast_2xfloat_to_2xi32( +; CHECK-F32X2-O3: { +; CHECK-F32X2-O3-NEXT: .reg .b64 %rd<2>; +; CHECK-F32X2-O3-EMPTY: +; CHECK-F32X2-O3-NEXT: // %bb.0: +; CHECK-F32X2-O3-NEXT: ld.param.b64 %rd1, [test_bitcast_2xfloat_to_2xi32_param_0]; +; CHECK-F32X2-O3-NEXT: st.param.b64 [func_retval0], %rd1; +; CHECK-F32X2-O3-NEXT: ret; %r = bitcast <2 x float> %a to <2 x i32> ret <2 x i32> %r } @@ -2158,6 +3063,15 @@ define <2 x float> @test_bitcast_2xi32_to_2xfloat(<2 x i32> %a) #0 { ; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_bitcast_2xi32_to_2xfloat_param_0]; ; CHECK-F32X2-NEXT: st.param.b64 [func_retval0], %rd1; ; CHECK-F32X2-NEXT: ret; +; +; CHECK-F32X2-O3-LABEL: test_bitcast_2xi32_to_2xfloat( +; CHECK-F32X2-O3: { +; CHECK-F32X2-O3-NEXT: .reg .b64 %rd<2>; +; CHECK-F32X2-O3-EMPTY: +; CHECK-F32X2-O3-NEXT: // %bb.0: +; CHECK-F32X2-O3-NEXT: ld.param.b64 %rd1, [test_bitcast_2xi32_to_2xfloat_param_0]; +; CHECK-F32X2-O3-NEXT: st.param.b64 [func_retval0], %rd1; +; CHECK-F32X2-O3-NEXT: ret; %r = bitcast <2 x i32> %a to <2 x float> ret <2 x float> %r } @@ -2193,6 +3107,15 @@ define double @test_bitcast_2xfloat_to_double(<2 x float> %a) #0 { ; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_bitcast_2xfloat_to_double_param_0]; ; CHECK-F32X2-NEXT: st.param.b64 [func_retval0], %rd1; ; CHECK-F32X2-NEXT: ret; +; +; CHECK-F32X2-O3-LABEL: test_bitcast_2xfloat_to_double( +; CHECK-F32X2-O3: { +; CHECK-F32X2-O3-NEXT: .reg .b64 %rd<2>; +; CHECK-F32X2-O3-EMPTY: +; CHECK-F32X2-O3-NEXT: // %bb.0: +; CHECK-F32X2-O3-NEXT: ld.param.b64 %rd1, [test_bitcast_2xfloat_to_double_param_0]; +; CHECK-F32X2-O3-NEXT: st.param.b64 [func_retval0], %rd1; +; CHECK-F32X2-O3-NEXT: ret; %r = bitcast <2 x float> %a to double ret double %r } @@ -2221,6 +3144,17 @@ define <2 x float> @test_sqrt(<2 x float> %a) #0 { ; CHECK-F32X2-NEXT: sqrt.rn.f32 %r4, %r1; ; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; ; CHECK-F32X2-NEXT: ret; +; +; CHECK-F32X2-O3-LABEL: test_sqrt( +; CHECK-F32X2-O3: { +; CHECK-F32X2-O3-NEXT: .reg .b32 %r<5>; +; CHECK-F32X2-O3-EMPTY: +; CHECK-F32X2-O3-NEXT: // %bb.0: +; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_sqrt_param_0]; +; CHECK-F32X2-O3-NEXT: sqrt.rn.f32 %r3, %r2; +; CHECK-F32X2-O3-NEXT: sqrt.rn.f32 %r4, %r1; +; CHECK-F32X2-O3-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-F32X2-O3-NEXT: ret; %r = call <2 x float> @llvm.sqrt(<2 x float> %a) ret <2 x float> %r } @@ -2256,6 +3190,17 @@ define <2 x float> @test_sin(<2 x float> %a) #0 { ; CHECK-F32X2-NEXT: sin.approx.f32 %r4, %r1; ; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; ; CHECK-F32X2-NEXT: ret; +; +; CHECK-F32X2-O3-LABEL: test_sin( +; CHECK-F32X2-O3: { +; CHECK-F32X2-O3-NEXT: .reg .b32 %r<5>; +; CHECK-F32X2-O3-EMPTY: +; CHECK-F32X2-O3-NEXT: // %bb.0: +; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_sin_param_0]; +; CHECK-F32X2-O3-NEXT: sin.approx.f32 %r3, %r2; +; CHECK-F32X2-O3-NEXT: sin.approx.f32 %r4, %r1; +; CHECK-F32X2-O3-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-F32X2-O3-NEXT: ret; %r = call afn <2 x float> @llvm.sin(<2 x float> %a) ret <2 x float> %r } @@ -2284,6 +3229,17 @@ define <2 x float> @test_cos(<2 x float> %a) #0 { ; CHECK-F32X2-NEXT: cos.approx.f32 %r4, %r1; ; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; ; CHECK-F32X2-NEXT: ret; +; +; CHECK-F32X2-O3-LABEL: test_cos( +; CHECK-F32X2-O3: { +; CHECK-F32X2-O3-NEXT: .reg .b32 %r<5>; +; CHECK-F32X2-O3-EMPTY: +; CHECK-F32X2-O3-NEXT: // %bb.0: +; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_cos_param_0]; +; CHECK-F32X2-O3-NEXT: cos.approx.f32 %r3, %r2; +; CHECK-F32X2-O3-NEXT: cos.approx.f32 %r4, %r1; +; CHECK-F32X2-O3-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-F32X2-O3-NEXT: ret; %r = call afn <2 x float> @llvm.cos(<2 x float> %a) ret <2 x float> %r } @@ -2356,6 +3312,18 @@ define <2 x float> @test_fma(<2 x float> %a, <2 x float> %b, <2 x float> %c) #0 ; CHECK-F32X2-NEXT: fma.rn.f32x2 %rd4, %rd1, %rd2, %rd3; ; CHECK-F32X2-NEXT: st.param.b64 [func_retval0], %rd4; ; CHECK-F32X2-NEXT: ret; +; +; CHECK-F32X2-O3-LABEL: test_fma( +; CHECK-F32X2-O3: { +; CHECK-F32X2-O3-NEXT: .reg .b64 %rd<5>; +; CHECK-F32X2-O3-EMPTY: +; CHECK-F32X2-O3-NEXT: // %bb.0: +; CHECK-F32X2-O3-NEXT: ld.param.b64 %rd1, [test_fma_param_0]; +; CHECK-F32X2-O3-NEXT: ld.param.b64 %rd2, [test_fma_param_1]; +; CHECK-F32X2-O3-NEXT: ld.param.b64 %rd3, [test_fma_param_2]; +; CHECK-F32X2-O3-NEXT: fma.rn.f32x2 %rd4, %rd1, %rd2, %rd3; +; CHECK-F32X2-O3-NEXT: st.param.b64 [func_retval0], %rd4; +; CHECK-F32X2-O3-NEXT: ret; %r = call <2 x float> @llvm.fma(<2 x float> %a, <2 x float> %b, <2 x float> %c) ret <2 x float> %r } @@ -2384,6 +3352,17 @@ define <2 x float> @test_fabs(<2 x float> %a) #0 { ; CHECK-F32X2-NEXT: abs.f32 %r4, %r1; ; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; ; CHECK-F32X2-NEXT: ret; +; +; CHECK-F32X2-O3-LABEL: test_fabs( +; CHECK-F32X2-O3: { +; CHECK-F32X2-O3-NEXT: .reg .b32 %r<5>; +; CHECK-F32X2-O3-EMPTY: +; CHECK-F32X2-O3-NEXT: // %bb.0: +; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fabs_param_0]; +; CHECK-F32X2-O3-NEXT: abs.f32 %r3, %r2; +; CHECK-F32X2-O3-NEXT: abs.f32 %r4, %r1; +; CHECK-F32X2-O3-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-F32X2-O3-NEXT: ret; %r = call <2 x float> @llvm.fabs(<2 x float> %a) ret <2 x float> %r } @@ -2415,6 +3394,18 @@ define <2 x float> @test_minnum(<2 x float> %a, <2 x float> %b) #0 { ; CHECK-F32X2-NEXT: min.f32 %r6, %r3, %r1; ; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5}; ; CHECK-F32X2-NEXT: ret; +; +; CHECK-F32X2-O3-LABEL: test_minnum( +; CHECK-F32X2-O3: { +; CHECK-F32X2-O3-NEXT: .reg .b32 %r<7>; +; CHECK-F32X2-O3-EMPTY: +; CHECK-F32X2-O3-NEXT: // %bb.0: +; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_minnum_param_0]; +; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_minnum_param_1]; +; CHECK-F32X2-O3-NEXT: min.f32 %r5, %r2, %r4; +; CHECK-F32X2-O3-NEXT: min.f32 %r6, %r1, %r3; +; CHECK-F32X2-O3-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5}; +; CHECK-F32X2-O3-NEXT: ret; %r = call <2 x float> @llvm.minnum(<2 x float> %a, <2 x float> %b) ret <2 x float> %r } @@ -2446,6 +3437,18 @@ define <2 x float> @test_maxnum(<2 x float> %a, <2 x float> %b) #0 { ; CHECK-F32X2-NEXT: max.f32 %r6, %r3, %r1; ; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5}; ; CHECK-F32X2-NEXT: ret; +; +; CHECK-F32X2-O3-LABEL: test_maxnum( +; CHECK-F32X2-O3: { +; CHECK-F32X2-O3-NEXT: .reg .b32 %r<7>; +; CHECK-F32X2-O3-EMPTY: +; CHECK-F32X2-O3-NEXT: // %bb.0: +; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_maxnum_param_0]; +; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_maxnum_param_1]; +; CHECK-F32X2-O3-NEXT: max.f32 %r5, %r2, %r4; +; CHECK-F32X2-O3-NEXT: max.f32 %r6, %r1, %r3; +; CHECK-F32X2-O3-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5}; +; CHECK-F32X2-O3-NEXT: ret; %r = call <2 x float> @llvm.maxnum(<2 x float> %a, <2 x float> %b) ret <2 x float> %r } @@ -2477,6 +3480,18 @@ define <2 x float> @test_copysign(<2 x float> %a, <2 x float> %b) #0 { ; CHECK-F32X2-NEXT: copysign.f32 %r6, %r3, %r1; ; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5}; ; CHECK-F32X2-NEXT: ret; +; +; CHECK-F32X2-O3-LABEL: test_copysign( +; CHECK-F32X2-O3: { +; CHECK-F32X2-O3-NEXT: .reg .b32 %r<7>; +; CHECK-F32X2-O3-EMPTY: +; CHECK-F32X2-O3-NEXT: // %bb.0: +; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_copysign_param_0]; +; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_copysign_param_1]; +; CHECK-F32X2-O3-NEXT: copysign.f32 %r5, %r4, %r2; +; CHECK-F32X2-O3-NEXT: copysign.f32 %r6, %r3, %r1; +; CHECK-F32X2-O3-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5}; +; CHECK-F32X2-O3-NEXT: ret; %r = call <2 x float> @llvm.copysign(<2 x float> %a, <2 x float> %b) ret <2 x float> %r } @@ -2530,6 +3545,30 @@ define <2 x float> @test_copysign_f64(<2 x float> %a, <2 x double> %b) #0 { ; CHECK-F32X2-NEXT: selp.f32 %r8, %r7, %r6, %p2; ; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r8, %r5}; ; CHECK-F32X2-NEXT: ret; +; +; CHECK-F32X2-O3-LABEL: test_copysign_f64( +; CHECK-F32X2-O3: { +; CHECK-F32X2-O3-NEXT: .reg .pred %p<3>; +; CHECK-F32X2-O3-NEXT: .reg .b32 %r<9>; +; CHECK-F32X2-O3-NEXT: .reg .b64 %rd<7>; +; CHECK-F32X2-O3-EMPTY: +; CHECK-F32X2-O3-NEXT: // %bb.0: +; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_copysign_f64_param_0]; +; CHECK-F32X2-O3-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_copysign_f64_param_1]; +; CHECK-F32X2-O3-NEXT: abs.f32 %r3, %r2; +; CHECK-F32X2-O3-NEXT: neg.f32 %r4, %r3; +; CHECK-F32X2-O3-NEXT: shr.u64 %rd3, %rd2, 63; +; CHECK-F32X2-O3-NEXT: and.b64 %rd4, %rd3, 1; +; CHECK-F32X2-O3-NEXT: setp.ne.b64 %p1, %rd4, 0; +; CHECK-F32X2-O3-NEXT: selp.f32 %r5, %r4, %r3, %p1; +; CHECK-F32X2-O3-NEXT: abs.f32 %r6, %r1; +; CHECK-F32X2-O3-NEXT: neg.f32 %r7, %r6; +; CHECK-F32X2-O3-NEXT: shr.u64 %rd5, %rd1, 63; +; CHECK-F32X2-O3-NEXT: and.b64 %rd6, %rd5, 1; +; CHECK-F32X2-O3-NEXT: setp.ne.b64 %p2, %rd6, 0; +; CHECK-F32X2-O3-NEXT: selp.f32 %r8, %r7, %r6, %p2; +; CHECK-F32X2-O3-NEXT: st.param.v2.b32 [func_retval0], {%r8, %r5}; +; CHECK-F32X2-O3-NEXT: ret; %tb = fptrunc <2 x double> %b to <2 x float> %r = call <2 x float> @llvm.copysign(<2 x float> %a, <2 x float> %tb) ret <2 x float> %r @@ -2567,6 +3606,21 @@ define <2 x double> @test_copysign_extended(<2 x float> %a, <2 x float> %b) #0 { ; CHECK-F32X2-NEXT: cvt.f64.f32 %rd4, %r5; ; CHECK-F32X2-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd3}; ; CHECK-F32X2-NEXT: ret; +; +; CHECK-F32X2-O3-LABEL: test_copysign_extended( +; CHECK-F32X2-O3: { +; CHECK-F32X2-O3-NEXT: .reg .b32 %r<7>; +; CHECK-F32X2-O3-NEXT: .reg .b64 %rd<3>; +; CHECK-F32X2-O3-EMPTY: +; CHECK-F32X2-O3-NEXT: // %bb.0: +; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_copysign_extended_param_0]; +; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_copysign_extended_param_1]; +; CHECK-F32X2-O3-NEXT: copysign.f32 %r5, %r3, %r1; +; CHECK-F32X2-O3-NEXT: copysign.f32 %r6, %r4, %r2; +; CHECK-F32X2-O3-NEXT: cvt.f64.f32 %rd1, %r6; +; CHECK-F32X2-O3-NEXT: cvt.f64.f32 %rd2, %r5; +; CHECK-F32X2-O3-NEXT: st.param.v2.b64 [func_retval0], {%rd2, %rd1}; +; CHECK-F32X2-O3-NEXT: ret; %r = call <2 x float> @llvm.copysign(<2 x float> %a, <2 x float> %b) %xr = fpext <2 x float> %r to <2 x double> ret <2 x double> %xr @@ -2596,6 +3650,17 @@ define <2 x float> @test_floor(<2 x float> %a) #0 { ; CHECK-F32X2-NEXT: cvt.rmi.f32.f32 %r4, %r1; ; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; ; CHECK-F32X2-NEXT: ret; +; +; CHECK-F32X2-O3-LABEL: test_floor( +; CHECK-F32X2-O3: { +; CHECK-F32X2-O3-NEXT: .reg .b32 %r<5>; +; CHECK-F32X2-O3-EMPTY: +; CHECK-F32X2-O3-NEXT: // %bb.0: +; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_floor_param_0]; +; CHECK-F32X2-O3-NEXT: cvt.rmi.f32.f32 %r3, %r2; +; CHECK-F32X2-O3-NEXT: cvt.rmi.f32.f32 %r4, %r1; +; CHECK-F32X2-O3-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-F32X2-O3-NEXT: ret; %r = call <2 x float> @llvm.floor(<2 x float> %a) ret <2 x float> %r } @@ -2624,6 +3689,17 @@ define <2 x float> @test_ceil(<2 x float> %a) #0 { ; CHECK-F32X2-NEXT: cvt.rpi.f32.f32 %r4, %r1; ; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; ; CHECK-F32X2-NEXT: ret; +; +; CHECK-F32X2-O3-LABEL: test_ceil( +; CHECK-F32X2-O3: { +; CHECK-F32X2-O3-NEXT: .reg .b32 %r<5>; +; CHECK-F32X2-O3-EMPTY: +; CHECK-F32X2-O3-NEXT: // %bb.0: +; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_ceil_param_0]; +; CHECK-F32X2-O3-NEXT: cvt.rpi.f32.f32 %r3, %r2; +; CHECK-F32X2-O3-NEXT: cvt.rpi.f32.f32 %r4, %r1; +; CHECK-F32X2-O3-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-F32X2-O3-NEXT: ret; %r = call <2 x float> @llvm.ceil(<2 x float> %a) ret <2 x float> %r } @@ -2652,6 +3728,17 @@ define <2 x float> @test_trunc(<2 x float> %a) #0 { ; CHECK-F32X2-NEXT: cvt.rzi.f32.f32 %r4, %r1; ; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; ; CHECK-F32X2-NEXT: ret; +; +; CHECK-F32X2-O3-LABEL: test_trunc( +; CHECK-F32X2-O3: { +; CHECK-F32X2-O3-NEXT: .reg .b32 %r<5>; +; CHECK-F32X2-O3-EMPTY: +; CHECK-F32X2-O3-NEXT: // %bb.0: +; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_trunc_param_0]; +; CHECK-F32X2-O3-NEXT: cvt.rzi.f32.f32 %r3, %r2; +; CHECK-F32X2-O3-NEXT: cvt.rzi.f32.f32 %r4, %r1; +; CHECK-F32X2-O3-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-F32X2-O3-NEXT: ret; %r = call <2 x float> @llvm.trunc(<2 x float> %a) ret <2 x float> %r } @@ -2680,6 +3767,17 @@ define <2 x float> @test_rint(<2 x float> %a) #0 { ; CHECK-F32X2-NEXT: cvt.rni.f32.f32 %r4, %r1; ; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; ; CHECK-F32X2-NEXT: ret; +; +; CHECK-F32X2-O3-LABEL: test_rint( +; CHECK-F32X2-O3: { +; CHECK-F32X2-O3-NEXT: .reg .b32 %r<5>; +; CHECK-F32X2-O3-EMPTY: +; CHECK-F32X2-O3-NEXT: // %bb.0: +; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_rint_param_0]; +; CHECK-F32X2-O3-NEXT: cvt.rni.f32.f32 %r3, %r2; +; CHECK-F32X2-O3-NEXT: cvt.rni.f32.f32 %r4, %r1; +; CHECK-F32X2-O3-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-F32X2-O3-NEXT: ret; %r = call <2 x float> @llvm.rint(<2 x float> %a) ret <2 x float> %r } @@ -2708,6 +3806,17 @@ define <2 x float> @test_nearbyint(<2 x float> %a) #0 { ; CHECK-F32X2-NEXT: cvt.rni.f32.f32 %r4, %r1; ; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; ; CHECK-F32X2-NEXT: ret; +; +; CHECK-F32X2-O3-LABEL: test_nearbyint( +; CHECK-F32X2-O3: { +; CHECK-F32X2-O3-NEXT: .reg .b32 %r<5>; +; CHECK-F32X2-O3-EMPTY: +; CHECK-F32X2-O3-NEXT: // %bb.0: +; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_nearbyint_param_0]; +; CHECK-F32X2-O3-NEXT: cvt.rni.f32.f32 %r3, %r2; +; CHECK-F32X2-O3-NEXT: cvt.rni.f32.f32 %r4, %r1; +; CHECK-F32X2-O3-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-F32X2-O3-NEXT: ret; %r = call <2 x float> @llvm.nearbyint(<2 x float> %a) ret <2 x float> %r } @@ -2736,6 +3845,17 @@ define <2 x float> @test_roundeven(<2 x float> %a) #0 { ; CHECK-F32X2-NEXT: cvt.rni.f32.f32 %r4, %r1; ; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; ; CHECK-F32X2-NEXT: ret; +; +; CHECK-F32X2-O3-LABEL: test_roundeven( +; CHECK-F32X2-O3: { +; CHECK-F32X2-O3-NEXT: .reg .b32 %r<5>; +; CHECK-F32X2-O3-EMPTY: +; CHECK-F32X2-O3-NEXT: // %bb.0: +; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_roundeven_param_0]; +; CHECK-F32X2-O3-NEXT: cvt.rni.f32.f32 %r3, %r2; +; CHECK-F32X2-O3-NEXT: cvt.rni.f32.f32 %r4, %r1; +; CHECK-F32X2-O3-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-F32X2-O3-NEXT: ret; %r = call <2 x float> @llvm.roundeven(<2 x float> %a) ret <2 x float> %r } @@ -2803,6 +3923,36 @@ define <2 x float> @test_round(<2 x float> %a) #0 { ; CHECK-F32X2-NEXT: selp.f32 %r18, %r17, %r16, %p4; ; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r18, %r10}; ; CHECK-F32X2-NEXT: ret; +; +; CHECK-F32X2-O3-LABEL: test_round( +; CHECK-F32X2-O3: { +; CHECK-F32X2-O3-NEXT: .reg .pred %p<5>; +; CHECK-F32X2-O3-NEXT: .reg .b32 %r<19>; +; CHECK-F32X2-O3-EMPTY: +; CHECK-F32X2-O3-NEXT: // %bb.0: +; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_round_param_0]; +; CHECK-F32X2-O3-NEXT: and.b32 %r3, %r2, -2147483648; +; CHECK-F32X2-O3-NEXT: or.b32 %r4, %r3, 1056964608; +; CHECK-F32X2-O3-NEXT: add.rn.f32 %r5, %r2, %r4; +; CHECK-F32X2-O3-NEXT: cvt.rzi.f32.f32 %r6, %r5; +; CHECK-F32X2-O3-NEXT: abs.f32 %r7, %r2; +; CHECK-F32X2-O3-NEXT: setp.gt.f32 %p1, %r7, 0f4B000000; +; CHECK-F32X2-O3-NEXT: selp.f32 %r8, %r2, %r6, %p1; +; CHECK-F32X2-O3-NEXT: cvt.rzi.f32.f32 %r9, %r2; +; CHECK-F32X2-O3-NEXT: setp.lt.f32 %p2, %r7, 0f3F000000; +; CHECK-F32X2-O3-NEXT: selp.f32 %r10, %r9, %r8, %p2; +; CHECK-F32X2-O3-NEXT: and.b32 %r11, %r1, -2147483648; +; CHECK-F32X2-O3-NEXT: or.b32 %r12, %r11, 1056964608; +; CHECK-F32X2-O3-NEXT: add.rn.f32 %r13, %r1, %r12; +; CHECK-F32X2-O3-NEXT: cvt.rzi.f32.f32 %r14, %r13; +; CHECK-F32X2-O3-NEXT: abs.f32 %r15, %r1; +; CHECK-F32X2-O3-NEXT: setp.gt.f32 %p3, %r15, 0f4B000000; +; CHECK-F32X2-O3-NEXT: selp.f32 %r16, %r1, %r14, %p3; +; CHECK-F32X2-O3-NEXT: cvt.rzi.f32.f32 %r17, %r1; +; CHECK-F32X2-O3-NEXT: setp.lt.f32 %p4, %r15, 0f3F000000; +; CHECK-F32X2-O3-NEXT: selp.f32 %r18, %r17, %r16, %p4; +; CHECK-F32X2-O3-NEXT: st.param.v2.b32 [func_retval0], {%r18, %r10}; +; CHECK-F32X2-O3-NEXT: ret; %r = call <2 x float> @llvm.round(<2 x float> %a) ret <2 x float> %r } @@ -2832,6 +3982,18 @@ define <2 x float> @test_fmuladd(<2 x float> %a, <2 x float> %b, <2 x float> %c) ; CHECK-F32X2-NEXT: fma.rn.f32x2 %rd4, %rd1, %rd2, %rd3; ; CHECK-F32X2-NEXT: st.param.b64 [func_retval0], %rd4; ; CHECK-F32X2-NEXT: ret; +; +; CHECK-F32X2-O3-LABEL: test_fmuladd( +; CHECK-F32X2-O3: { +; CHECK-F32X2-O3-NEXT: .reg .b64 %rd<5>; +; CHECK-F32X2-O3-EMPTY: +; CHECK-F32X2-O3-NEXT: // %bb.0: +; CHECK-F32X2-O3-NEXT: ld.param.b64 %rd1, [test_fmuladd_param_0]; +; CHECK-F32X2-O3-NEXT: ld.param.b64 %rd2, [test_fmuladd_param_1]; +; CHECK-F32X2-O3-NEXT: ld.param.b64 %rd3, [test_fmuladd_param_2]; +; CHECK-F32X2-O3-NEXT: fma.rn.f32x2 %rd4, %rd1, %rd2, %rd3; +; CHECK-F32X2-O3-NEXT: st.param.b64 [func_retval0], %rd4; +; CHECK-F32X2-O3-NEXT: ret; %r = call <2 x float> @llvm.fmuladd(<2 x float> %a, <2 x float> %b, <2 x float> %c) ret <2 x float> %r } @@ -2856,6 +4018,15 @@ define <2 x float> @test_shufflevector(<2 x float> %a) #0 { ; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd1; ; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r2, %r1}; ; CHECK-F32X2-NEXT: ret; +; +; CHECK-F32X2-O3-LABEL: test_shufflevector( +; CHECK-F32X2-O3: { +; CHECK-F32X2-O3-NEXT: .reg .b32 %r<3>; +; CHECK-F32X2-O3-EMPTY: +; CHECK-F32X2-O3-NEXT: // %bb.0: +; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_shufflevector_param_0]; +; CHECK-F32X2-O3-NEXT: st.param.v2.b32 [func_retval0], {%r2, %r1}; +; CHECK-F32X2-O3-NEXT: ret; %s = shufflevector <2 x float> %a, <2 x float> poison, <2 x i32> ret <2 x float> %s } @@ -2882,6 +4053,16 @@ define <2 x float> @test_insertelement(<2 x float> %a, float %x) #0 { ; CHECK-F32X2-NEXT: mov.b64 {%r2, _}, %rd1; ; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r2, %r1}; ; CHECK-F32X2-NEXT: ret; +; +; CHECK-F32X2-O3-LABEL: test_insertelement( +; CHECK-F32X2-O3: { +; CHECK-F32X2-O3-NEXT: .reg .b32 %r<3>; +; CHECK-F32X2-O3-EMPTY: +; CHECK-F32X2-O3-NEXT: // %bb.0: +; CHECK-F32X2-O3-NEXT: ld.param.b32 %r1, [test_insertelement_param_1]; +; CHECK-F32X2-O3-NEXT: ld.param.b32 %r2, [test_insertelement_param_0]; +; CHECK-F32X2-O3-NEXT: st.param.v2.b32 [func_retval0], {%r2, %r1}; +; CHECK-F32X2-O3-NEXT: ret; %i = insertelement <2 x float> %a, float %x, i64 1 ret <2 x float> %i } @@ -2910,6 +4091,17 @@ define <2 x float> @test_sitofp_2xi32_to_2xfloat(<2 x i32> %a) #0 { ; CHECK-F32X2-NEXT: cvt.rn.f32.s32 %r4, %r1; ; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; ; CHECK-F32X2-NEXT: ret; +; +; CHECK-F32X2-O3-LABEL: test_sitofp_2xi32_to_2xfloat( +; CHECK-F32X2-O3: { +; CHECK-F32X2-O3-NEXT: .reg .b32 %r<5>; +; CHECK-F32X2-O3-EMPTY: +; CHECK-F32X2-O3-NEXT: // %bb.0: +; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_sitofp_2xi32_to_2xfloat_param_0]; +; CHECK-F32X2-O3-NEXT: cvt.rn.f32.s32 %r3, %r2; +; CHECK-F32X2-O3-NEXT: cvt.rn.f32.s32 %r4, %r1; +; CHECK-F32X2-O3-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-F32X2-O3-NEXT: ret; %r = sitofp <2 x i32> %a to <2 x float> ret <2 x float> %r } @@ -2938,6 +4130,17 @@ define <2 x float> @test_uitofp_2xi32_to_2xfloat(<2 x i32> %a) #0 { ; CHECK-F32X2-NEXT: cvt.rn.f32.u32 %r4, %r1; ; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; ; CHECK-F32X2-NEXT: ret; +; +; CHECK-F32X2-O3-LABEL: test_uitofp_2xi32_to_2xfloat( +; CHECK-F32X2-O3: { +; CHECK-F32X2-O3-NEXT: .reg .b32 %r<5>; +; CHECK-F32X2-O3-EMPTY: +; CHECK-F32X2-O3-NEXT: // %bb.0: +; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_uitofp_2xi32_to_2xfloat_param_0]; +; CHECK-F32X2-O3-NEXT: cvt.rn.f32.u32 %r3, %r2; +; CHECK-F32X2-O3-NEXT: cvt.rn.f32.u32 %r4, %r1; +; CHECK-F32X2-O3-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-F32X2-O3-NEXT: ret; %r = uitofp <2 x i32> %a to <2 x float> ret <2 x float> %r } @@ -2967,6 +4170,18 @@ define void @test_trunc_to_v2bf16(<2 x float> %a, ptr %p) { ; CHECK-F32X2-NEXT: cvt.rn.bf16x2.f32 %r3, %r2, %r1; ; CHECK-F32X2-NEXT: st.b32 [%rd2], %r3; ; CHECK-F32X2-NEXT: ret; +; +; CHECK-F32X2-O3-LABEL: test_trunc_to_v2bf16( +; CHECK-F32X2-O3: { +; CHECK-F32X2-O3-NEXT: .reg .b32 %r<4>; +; CHECK-F32X2-O3-NEXT: .reg .b64 %rd<2>; +; CHECK-F32X2-O3-EMPTY: +; CHECK-F32X2-O3-NEXT: // %bb.0: +; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_trunc_to_v2bf16_param_0]; +; CHECK-F32X2-O3-NEXT: cvt.rn.bf16x2.f32 %r3, %r2, %r1; +; CHECK-F32X2-O3-NEXT: ld.param.b64 %rd1, [test_trunc_to_v2bf16_param_1]; +; CHECK-F32X2-O3-NEXT: st.b32 [%rd1], %r3; +; CHECK-F32X2-O3-NEXT: ret; %trunc = fptrunc <2 x float> %a to <2 x bfloat> store <2 x bfloat> %trunc, ptr %p ret void @@ -2997,6 +4212,18 @@ define void @test_trunc_to_v2f16(<2 x float> %a, ptr %p) { ; CHECK-F32X2-NEXT: cvt.rn.f16x2.f32 %r3, %r2, %r1; ; CHECK-F32X2-NEXT: st.b32 [%rd2], %r3; ; CHECK-F32X2-NEXT: ret; +; +; CHECK-F32X2-O3-LABEL: test_trunc_to_v2f16( +; CHECK-F32X2-O3: { +; CHECK-F32X2-O3-NEXT: .reg .b32 %r<4>; +; CHECK-F32X2-O3-NEXT: .reg .b64 %rd<2>; +; CHECK-F32X2-O3-EMPTY: +; CHECK-F32X2-O3-NEXT: // %bb.0: +; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_trunc_to_v2f16_param_0]; +; CHECK-F32X2-O3-NEXT: cvt.rn.f16x2.f32 %r3, %r2, %r1; +; CHECK-F32X2-O3-NEXT: ld.param.b64 %rd1, [test_trunc_to_v2f16_param_1]; +; CHECK-F32X2-O3-NEXT: st.b32 [%rd1], %r3; +; CHECK-F32X2-O3-NEXT: ret; %trunc = fptrunc <2 x float> %a to <2 x half> store <2 x half> %trunc, ptr %p ret void From e14789197cc221a616bac5681c0cb9ab338f4e42 Mon Sep 17 00:00:00 2001 From: Princeton Ferro Date: Thu, 25 Sep 2025 16:05:57 -0700 Subject: [PATCH 4/5] address review comments x2 --- llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index 3e4abb72d377e..3ac7c2874408b 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -2937,13 +2937,13 @@ static SDValue lowerIntrinsicWChain(SDValue Op, SelectionDAG &DAG) { case Intrinsic::nvvm_tcgen05_ld_16x64b_x2: case Intrinsic::nvvm_tcgen05_ld_16x128b_x1: case Intrinsic::nvvm_tcgen05_ld_32x32b_x2: - if (auto Pair = lowerTcgen05Ld(Op.getNode(), DAG)) - return DAG.getMergeValues({Pair->first, Pair->second}, SDLoc(Op)); + if (auto Res = lowerTcgen05Ld(Op.getNode(), DAG)) + return DAG.getMergeValues({Res->first, Res->second}, SDLoc(Op)); return SDValue(); case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x2: - if (auto Pair = lowerTcgen05Ld(Op.getNode(), DAG, /*HasOffset=*/true)) - return DAG.getMergeValues({Pair->first, Pair->second}, SDLoc(Op)); + if (auto Res = lowerTcgen05Ld(Op.getNode(), DAG, /*HasOffset=*/true)) + return DAG.getMergeValues({Res->first, Res->second}, SDLoc(Op)); return SDValue(); } } @@ -6513,9 +6513,9 @@ static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG, case Intrinsic::nvvm_tcgen05_ld_16x256b_x8: case Intrinsic::nvvm_tcgen05_ld_16x256b_x16: case Intrinsic::nvvm_tcgen05_ld_16x256b_x32: - if (auto Pair = lowerTcgen05Ld(N, DAG)) { - Results.push_back(Pair->first); - Results.push_back(Pair->second); + if (auto Res = lowerTcgen05Ld(N, DAG)) { + Results.push_back(Res->first); + Results.push_back(Res->second); } return; @@ -6525,9 +6525,9 @@ static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG, case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x32: case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x64: case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x128: - if (auto Pair = lowerTcgen05Ld(N, DAG, /*HasOffset=*/true)) { - Results.push_back(Pair->first); - Results.push_back(Pair->second); + if (auto Res = lowerTcgen05Ld(N, DAG, /*HasOffset=*/true)) { + Results.push_back(Res->first); + Results.push_back(Res->second); } return; } From 9213919f97dc4809503bc279cdc928bf649f545c Mon Sep 17 00:00:00 2001 From: Princeton Ferro Date: Thu, 25 Sep 2025 16:47:58 -0700 Subject: [PATCH 5/5] Revert "[NVPTX] add -O3 test for f32x2 instructions" This reverts commit 34859a13d0e4b99d70c250d1dad95392291b61a0. --- llvm/test/CodeGen/NVPTX/f32x2-instructions.ll | 1257 +---------------- 1 file changed, 15 insertions(+), 1242 deletions(-) diff --git a/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll b/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll index 3f457de7798a6..a90cfff51e2c6 100644 --- a/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll @@ -2,22 +2,16 @@ ; ## Full FP32x2 support enabled by default. ; RUN: llc < %s -mcpu=sm_80 -O0 -disable-post-ra -frame-pointer=all \ ; RUN: -verify-machineinstrs | FileCheck --check-prefixes=CHECK,CHECK-NOF32X2 %s -; RUN: %if ptxas-sm_80 %{ \ +; RUN: %if ptxas-sm_80 %{ \ ; RUN: llc < %s -mcpu=sm_80 -O0 -disable-post-ra -frame-pointer=all \ ; RUN: -verify-machineinstrs | %ptxas-verify -arch=sm_80 \ ; RUN: %} ; RUN: llc < %s -mcpu=sm_100 -O0 -disable-post-ra -frame-pointer=all \ ; RUN: -verify-machineinstrs | FileCheck --check-prefixes=CHECK,CHECK-F32X2 %s -; RUN: %if ptxas-sm_100 %{ \ +; RUN: %if ptxas-sm_100 %{ \ ; RUN: llc < %s -mcpu=sm_100 -O0 -disable-post-ra -frame-pointer=all \ ; RUN: -verify-machineinstrs | %ptxas-verify -arch=sm_100 \ ; RUN: %} -; RUN: llc < %s -mcpu=sm_100 -O3 -frame-pointer=all -verify-machineinstrs \ -; RUN: | FileCheck --check-prefixes=CHECK,CHECK-F32X2-O3 %s -; RUN: %if ptxas-sm_100 %{ \ -; RUN: llc < %s -mcpu=sm_100 -O3 -disable-post-ra -frame-pointer=all \ -; RUN: -verify-machineinstrs | %ptxas-verify -arch=sm_100 \ -; RUN: %} target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" target triple = "nvptx64-nvidia-cuda" @@ -53,15 +47,6 @@ define float @test_extract_0(<2 x float> %a) #0 { ; CHECK-F32X2-NEXT: mov.b64 {%r1, _}, %rd1; ; CHECK-F32X2-NEXT: st.param.b32 [func_retval0], %r1; ; CHECK-F32X2-NEXT: ret; -; -; CHECK-F32X2-O3-LABEL: test_extract_0( -; CHECK-F32X2-O3: { -; CHECK-F32X2-O3-NEXT: .reg .b32 %r<2>; -; CHECK-F32X2-O3-EMPTY: -; CHECK-F32X2-O3-NEXT: // %bb.0: -; CHECK-F32X2-O3-NEXT: ld.param.b32 %r1, [test_extract_0_param_0]; -; CHECK-F32X2-O3-NEXT: st.param.b32 [func_retval0], %r1; -; CHECK-F32X2-O3-NEXT: ret; %e = extractelement <2 x float> %a, i32 0 ret float %e } @@ -86,15 +71,6 @@ define float @test_extract_1(<2 x float> %a) #0 { ; CHECK-F32X2-NEXT: mov.b64 {_, %r1}, %rd1; ; CHECK-F32X2-NEXT: st.param.b32 [func_retval0], %r1; ; CHECK-F32X2-NEXT: ret; -; -; CHECK-F32X2-O3-LABEL: test_extract_1( -; CHECK-F32X2-O3: { -; CHECK-F32X2-O3-NEXT: .reg .b32 %r<2>; -; CHECK-F32X2-O3-EMPTY: -; CHECK-F32X2-O3-NEXT: // %bb.0: -; CHECK-F32X2-O3-NEXT: ld.param.b32 %r1, [test_extract_1_param_0+4]; -; CHECK-F32X2-O3-NEXT: st.param.b32 [func_retval0], %r1; -; CHECK-F32X2-O3-NEXT: ret; %e = extractelement <2 x float> %a, i32 1 ret float %e } @@ -136,21 +112,6 @@ define float @test_extract_i(<2 x float> %a, i64 %idx) #0 { ; CHECK-F32X2-NEXT: selp.f32 %r3, %r1, %r2, %p1; ; CHECK-F32X2-NEXT: st.param.b32 [func_retval0], %r3; ; CHECK-F32X2-NEXT: ret; -; -; CHECK-F32X2-O3-LABEL: test_extract_i( -; CHECK-F32X2-O3: { -; CHECK-F32X2-O3-NEXT: .reg .b32 %r<2>; -; CHECK-F32X2-O3-NEXT: .reg .b64 %rd<6>; -; CHECK-F32X2-O3-EMPTY: -; CHECK-F32X2-O3-NEXT: // %bb.0: -; CHECK-F32X2-O3-NEXT: ld.param.b64 %rd1, [test_extract_i_param_1]; -; CHECK-F32X2-O3-NEXT: and.b64 %rd2, %rd1, 1; -; CHECK-F32X2-O3-NEXT: shl.b64 %rd3, %rd2, 2; -; CHECK-F32X2-O3-NEXT: mov.b64 %rd4, test_extract_i_param_0; -; CHECK-F32X2-O3-NEXT: add.s64 %rd5, %rd4, %rd3; -; CHECK-F32X2-O3-NEXT: ld.param.b32 %r1, [%rd5]; -; CHECK-F32X2-O3-NEXT: st.param.b32 [func_retval0], %r1; -; CHECK-F32X2-O3-NEXT: ret; %e = extractelement <2 x float> %a, i64 %idx ret float %e } @@ -178,17 +139,6 @@ define <2 x float> @test_fadd(<2 x float> %a, <2 x float> %b) #0 { ; CHECK-F32X2-NEXT: add.rn.f32x2 %rd3, %rd1, %rd2; ; CHECK-F32X2-NEXT: st.param.b64 [func_retval0], %rd3; ; CHECK-F32X2-NEXT: ret; -; -; CHECK-F32X2-O3-LABEL: test_fadd( -; CHECK-F32X2-O3: { -; CHECK-F32X2-O3-NEXT: .reg .b64 %rd<4>; -; CHECK-F32X2-O3-EMPTY: -; CHECK-F32X2-O3-NEXT: // %bb.0: -; CHECK-F32X2-O3-NEXT: ld.param.b64 %rd1, [test_fadd_param_0]; -; CHECK-F32X2-O3-NEXT: ld.param.b64 %rd2, [test_fadd_param_1]; -; CHECK-F32X2-O3-NEXT: add.rn.f32x2 %rd3, %rd1, %rd2; -; CHECK-F32X2-O3-NEXT: st.param.b64 [func_retval0], %rd3; -; CHECK-F32X2-O3-NEXT: ret; %r = fadd <2 x float> %a, %b ret <2 x float> %r } @@ -218,20 +168,6 @@ define <2 x float> @test_fadd_imm_0(<2 x float> %a) #0 { ; CHECK-F32X2-NEXT: add.rn.f32x2 %rd3, %rd1, %rd2; ; CHECK-F32X2-NEXT: st.param.b64 [func_retval0], %rd3; ; CHECK-F32X2-NEXT: ret; -; -; CHECK-F32X2-O3-LABEL: test_fadd_imm_0( -; CHECK-F32X2-O3: { -; CHECK-F32X2-O3-NEXT: .reg .b32 %r<3>; -; CHECK-F32X2-O3-NEXT: .reg .b64 %rd<4>; -; CHECK-F32X2-O3-EMPTY: -; CHECK-F32X2-O3-NEXT: // %bb.0: -; CHECK-F32X2-O3-NEXT: ld.param.b64 %rd1, [test_fadd_imm_0_param_0]; -; CHECK-F32X2-O3-NEXT: mov.b32 %r1, 0f40000000; -; CHECK-F32X2-O3-NEXT: mov.b32 %r2, 0f3F800000; -; CHECK-F32X2-O3-NEXT: mov.b64 %rd2, {%r2, %r1}; -; CHECK-F32X2-O3-NEXT: add.rn.f32x2 %rd3, %rd1, %rd2; -; CHECK-F32X2-O3-NEXT: st.param.b64 [func_retval0], %rd3; -; CHECK-F32X2-O3-NEXT: ret; %r = fadd <2 x float> , %a ret <2 x float> %r } @@ -261,20 +197,6 @@ define <2 x float> @test_fadd_imm_1(<2 x float> %a) #0 { ; CHECK-F32X2-NEXT: add.rn.f32x2 %rd3, %rd1, %rd2; ; CHECK-F32X2-NEXT: st.param.b64 [func_retval0], %rd3; ; CHECK-F32X2-NEXT: ret; -; -; CHECK-F32X2-O3-LABEL: test_fadd_imm_1( -; CHECK-F32X2-O3: { -; CHECK-F32X2-O3-NEXT: .reg .b32 %r<3>; -; CHECK-F32X2-O3-NEXT: .reg .b64 %rd<4>; -; CHECK-F32X2-O3-EMPTY: -; CHECK-F32X2-O3-NEXT: // %bb.0: -; CHECK-F32X2-O3-NEXT: ld.param.b64 %rd1, [test_fadd_imm_1_param_0]; -; CHECK-F32X2-O3-NEXT: mov.b32 %r1, 0f40000000; -; CHECK-F32X2-O3-NEXT: mov.b32 %r2, 0f3F800000; -; CHECK-F32X2-O3-NEXT: mov.b64 %rd2, {%r2, %r1}; -; CHECK-F32X2-O3-NEXT: add.rn.f32x2 %rd3, %rd1, %rd2; -; CHECK-F32X2-O3-NEXT: st.param.b64 [func_retval0], %rd3; -; CHECK-F32X2-O3-NEXT: ret; %r = fadd <2 x float> %a, ret <2 x float> %r } @@ -305,18 +227,6 @@ define <4 x float> @test_fadd_v4(<4 x float> %a, <4 x float> %b) #0 { ; CHECK-F32X2-NEXT: add.rn.f32x2 %rd6, %rd1, %rd3; ; CHECK-F32X2-NEXT: st.param.v2.b64 [func_retval0], {%rd6, %rd5}; ; CHECK-F32X2-NEXT: ret; -; -; CHECK-F32X2-O3-LABEL: test_fadd_v4( -; CHECK-F32X2-O3: { -; CHECK-F32X2-O3-NEXT: .reg .b64 %rd<7>; -; CHECK-F32X2-O3-EMPTY: -; CHECK-F32X2-O3-NEXT: // %bb.0: -; CHECK-F32X2-O3-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_fadd_v4_param_0]; -; CHECK-F32X2-O3-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [test_fadd_v4_param_1]; -; CHECK-F32X2-O3-NEXT: add.rn.f32x2 %rd5, %rd2, %rd4; -; CHECK-F32X2-O3-NEXT: add.rn.f32x2 %rd6, %rd1, %rd3; -; CHECK-F32X2-O3-NEXT: st.param.v2.b64 [func_retval0], {%rd6, %rd5}; -; CHECK-F32X2-O3-NEXT: ret; %r = fadd <4 x float> %a, %b ret <4 x float> %r } @@ -352,24 +262,6 @@ define <4 x float> @test_fadd_imm_0_v4(<4 x float> %a) #0 { ; CHECK-F32X2-NEXT: add.rn.f32x2 %rd6, %rd1, %rd5; ; CHECK-F32X2-NEXT: st.param.v2.b64 [func_retval0], {%rd6, %rd4}; ; CHECK-F32X2-NEXT: ret; -; -; CHECK-F32X2-O3-LABEL: test_fadd_imm_0_v4( -; CHECK-F32X2-O3: { -; CHECK-F32X2-O3-NEXT: .reg .b32 %r<5>; -; CHECK-F32X2-O3-NEXT: .reg .b64 %rd<7>; -; CHECK-F32X2-O3-EMPTY: -; CHECK-F32X2-O3-NEXT: // %bb.0: -; CHECK-F32X2-O3-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_fadd_imm_0_v4_param_0]; -; CHECK-F32X2-O3-NEXT: mov.b32 %r1, 0f40800000; -; CHECK-F32X2-O3-NEXT: mov.b32 %r2, 0f40400000; -; CHECK-F32X2-O3-NEXT: mov.b64 %rd3, {%r2, %r1}; -; CHECK-F32X2-O3-NEXT: add.rn.f32x2 %rd4, %rd2, %rd3; -; CHECK-F32X2-O3-NEXT: mov.b32 %r3, 0f40000000; -; CHECK-F32X2-O3-NEXT: mov.b32 %r4, 0f3F800000; -; CHECK-F32X2-O3-NEXT: mov.b64 %rd5, {%r4, %r3}; -; CHECK-F32X2-O3-NEXT: add.rn.f32x2 %rd6, %rd1, %rd5; -; CHECK-F32X2-O3-NEXT: st.param.v2.b64 [func_retval0], {%rd6, %rd4}; -; CHECK-F32X2-O3-NEXT: ret; %r = fadd <4 x float> , %a ret <4 x float> %r } @@ -405,24 +297,6 @@ define <4 x float> @test_fadd_imm_1_v4(<4 x float> %a) #0 { ; CHECK-F32X2-NEXT: add.rn.f32x2 %rd6, %rd1, %rd5; ; CHECK-F32X2-NEXT: st.param.v2.b64 [func_retval0], {%rd6, %rd4}; ; CHECK-F32X2-NEXT: ret; -; -; CHECK-F32X2-O3-LABEL: test_fadd_imm_1_v4( -; CHECK-F32X2-O3: { -; CHECK-F32X2-O3-NEXT: .reg .b32 %r<5>; -; CHECK-F32X2-O3-NEXT: .reg .b64 %rd<7>; -; CHECK-F32X2-O3-EMPTY: -; CHECK-F32X2-O3-NEXT: // %bb.0: -; CHECK-F32X2-O3-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_fadd_imm_1_v4_param_0]; -; CHECK-F32X2-O3-NEXT: mov.b32 %r1, 0f40800000; -; CHECK-F32X2-O3-NEXT: mov.b32 %r2, 0f40400000; -; CHECK-F32X2-O3-NEXT: mov.b64 %rd3, {%r2, %r1}; -; CHECK-F32X2-O3-NEXT: add.rn.f32x2 %rd4, %rd2, %rd3; -; CHECK-F32X2-O3-NEXT: mov.b32 %r3, 0f40000000; -; CHECK-F32X2-O3-NEXT: mov.b32 %r4, 0f3F800000; -; CHECK-F32X2-O3-NEXT: mov.b64 %rd5, {%r4, %r3}; -; CHECK-F32X2-O3-NEXT: add.rn.f32x2 %rd6, %rd1, %rd5; -; CHECK-F32X2-O3-NEXT: st.param.v2.b64 [func_retval0], {%rd6, %rd4}; -; CHECK-F32X2-O3-NEXT: ret; %r = fadd <4 x float> %a, ret <4 x float> %r } @@ -450,17 +324,6 @@ define <2 x float> @test_fsub(<2 x float> %a, <2 x float> %b) #0 { ; CHECK-F32X2-NEXT: sub.rn.f32x2 %rd3, %rd1, %rd2; ; CHECK-F32X2-NEXT: st.param.b64 [func_retval0], %rd3; ; CHECK-F32X2-NEXT: ret; -; -; CHECK-F32X2-O3-LABEL: test_fsub( -; CHECK-F32X2-O3: { -; CHECK-F32X2-O3-NEXT: .reg .b64 %rd<4>; -; CHECK-F32X2-O3-EMPTY: -; CHECK-F32X2-O3-NEXT: // %bb.0: -; CHECK-F32X2-O3-NEXT: ld.param.b64 %rd1, [test_fsub_param_0]; -; CHECK-F32X2-O3-NEXT: ld.param.b64 %rd2, [test_fsub_param_1]; -; CHECK-F32X2-O3-NEXT: sub.rn.f32x2 %rd3, %rd1, %rd2; -; CHECK-F32X2-O3-NEXT: st.param.b64 [func_retval0], %rd3; -; CHECK-F32X2-O3-NEXT: ret; %r = fsub <2 x float> %a, %b ret <2 x float> %r } @@ -489,17 +352,6 @@ define <2 x float> @test_fneg(<2 x float> %a) #0 { ; CHECK-F32X2-NEXT: neg.f32 %r4, %r1; ; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; ; CHECK-F32X2-NEXT: ret; -; -; CHECK-F32X2-O3-LABEL: test_fneg( -; CHECK-F32X2-O3: { -; CHECK-F32X2-O3-NEXT: .reg .b32 %r<5>; -; CHECK-F32X2-O3-EMPTY: -; CHECK-F32X2-O3-NEXT: // %bb.0: -; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fneg_param_0]; -; CHECK-F32X2-O3-NEXT: neg.f32 %r3, %r2; -; CHECK-F32X2-O3-NEXT: neg.f32 %r4, %r1; -; CHECK-F32X2-O3-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; -; CHECK-F32X2-O3-NEXT: ret; %r = fneg <2 x float> %a ret <2 x float> %r } @@ -527,17 +379,6 @@ define <2 x float> @test_fmul(<2 x float> %a, <2 x float> %b) #0 { ; CHECK-F32X2-NEXT: mul.rn.f32x2 %rd3, %rd1, %rd2; ; CHECK-F32X2-NEXT: st.param.b64 [func_retval0], %rd3; ; CHECK-F32X2-NEXT: ret; -; -; CHECK-F32X2-O3-LABEL: test_fmul( -; CHECK-F32X2-O3: { -; CHECK-F32X2-O3-NEXT: .reg .b64 %rd<4>; -; CHECK-F32X2-O3-EMPTY: -; CHECK-F32X2-O3-NEXT: // %bb.0: -; CHECK-F32X2-O3-NEXT: ld.param.b64 %rd1, [test_fmul_param_0]; -; CHECK-F32X2-O3-NEXT: ld.param.b64 %rd2, [test_fmul_param_1]; -; CHECK-F32X2-O3-NEXT: mul.rn.f32x2 %rd3, %rd1, %rd2; -; CHECK-F32X2-O3-NEXT: st.param.b64 [func_retval0], %rd3; -; CHECK-F32X2-O3-NEXT: ret; %r = fmul <2 x float> %a, %b ret <2 x float> %r } @@ -569,18 +410,6 @@ define <2 x float> @test_fdiv(<2 x float> %a, <2 x float> %b) #0 { ; CHECK-F32X2-NEXT: div.rn.f32 %r6, %r3, %r1; ; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5}; ; CHECK-F32X2-NEXT: ret; -; -; CHECK-F32X2-O3-LABEL: test_fdiv( -; CHECK-F32X2-O3: { -; CHECK-F32X2-O3-NEXT: .reg .b32 %r<7>; -; CHECK-F32X2-O3-EMPTY: -; CHECK-F32X2-O3-NEXT: // %bb.0: -; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fdiv_param_0]; -; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fdiv_param_1]; -; CHECK-F32X2-O3-NEXT: div.rn.f32 %r5, %r2, %r4; -; CHECK-F32X2-O3-NEXT: div.rn.f32 %r6, %r1, %r3; -; CHECK-F32X2-O3-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5}; -; CHECK-F32X2-O3-NEXT: ret; %r = fdiv <2 x float> %a, %b ret <2 x float> %r } @@ -634,29 +463,6 @@ define <2 x float> @test_frem(<2 x float> %a, <2 x float> %b) #0 { ; CHECK-F32X2-NEXT: selp.f32 %r14, %r3, %r13, %p2; ; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r14, %r9}; ; CHECK-F32X2-NEXT: ret; -; -; CHECK-F32X2-O3-LABEL: test_frem( -; CHECK-F32X2-O3: { -; CHECK-F32X2-O3-NEXT: .reg .pred %p<3>; -; CHECK-F32X2-O3-NEXT: .reg .b32 %r<15>; -; CHECK-F32X2-O3-EMPTY: -; CHECK-F32X2-O3-NEXT: // %bb.0: -; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_frem_param_0]; -; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_frem_param_1]; -; CHECK-F32X2-O3-NEXT: div.rn.f32 %r5, %r2, %r4; -; CHECK-F32X2-O3-NEXT: cvt.rzi.f32.f32 %r6, %r5; -; CHECK-F32X2-O3-NEXT: neg.f32 %r7, %r6; -; CHECK-F32X2-O3-NEXT: fma.rn.f32 %r8, %r7, %r4, %r2; -; CHECK-F32X2-O3-NEXT: testp.infinite.f32 %p1, %r4; -; CHECK-F32X2-O3-NEXT: selp.f32 %r9, %r2, %r8, %p1; -; CHECK-F32X2-O3-NEXT: div.rn.f32 %r10, %r1, %r3; -; CHECK-F32X2-O3-NEXT: cvt.rzi.f32.f32 %r11, %r10; -; CHECK-F32X2-O3-NEXT: neg.f32 %r12, %r11; -; CHECK-F32X2-O3-NEXT: fma.rn.f32 %r13, %r12, %r3, %r1; -; CHECK-F32X2-O3-NEXT: testp.infinite.f32 %p2, %r3; -; CHECK-F32X2-O3-NEXT: selp.f32 %r14, %r1, %r13, %p2; -; CHECK-F32X2-O3-NEXT: st.param.v2.b32 [func_retval0], {%r14, %r9}; -; CHECK-F32X2-O3-NEXT: ret; %r = frem <2 x float> %a, %b ret <2 x float> %r } @@ -684,17 +490,6 @@ define <2 x float> @test_fadd_ftz(<2 x float> %a, <2 x float> %b) #2 { ; CHECK-F32X2-NEXT: add.rn.ftz.f32x2 %rd3, %rd1, %rd2; ; CHECK-F32X2-NEXT: st.param.b64 [func_retval0], %rd3; ; CHECK-F32X2-NEXT: ret; -; -; CHECK-F32X2-O3-LABEL: test_fadd_ftz( -; CHECK-F32X2-O3: { -; CHECK-F32X2-O3-NEXT: .reg .b64 %rd<4>; -; CHECK-F32X2-O3-EMPTY: -; CHECK-F32X2-O3-NEXT: // %bb.0: -; CHECK-F32X2-O3-NEXT: ld.param.b64 %rd1, [test_fadd_ftz_param_0]; -; CHECK-F32X2-O3-NEXT: ld.param.b64 %rd2, [test_fadd_ftz_param_1]; -; CHECK-F32X2-O3-NEXT: add.rn.ftz.f32x2 %rd3, %rd1, %rd2; -; CHECK-F32X2-O3-NEXT: st.param.b64 [func_retval0], %rd3; -; CHECK-F32X2-O3-NEXT: ret; %r = fadd <2 x float> %a, %b ret <2 x float> %r } @@ -724,20 +519,6 @@ define <2 x float> @test_fadd_imm_0_ftz(<2 x float> %a) #2 { ; CHECK-F32X2-NEXT: add.rn.ftz.f32x2 %rd3, %rd1, %rd2; ; CHECK-F32X2-NEXT: st.param.b64 [func_retval0], %rd3; ; CHECK-F32X2-NEXT: ret; -; -; CHECK-F32X2-O3-LABEL: test_fadd_imm_0_ftz( -; CHECK-F32X2-O3: { -; CHECK-F32X2-O3-NEXT: .reg .b32 %r<3>; -; CHECK-F32X2-O3-NEXT: .reg .b64 %rd<4>; -; CHECK-F32X2-O3-EMPTY: -; CHECK-F32X2-O3-NEXT: // %bb.0: -; CHECK-F32X2-O3-NEXT: ld.param.b64 %rd1, [test_fadd_imm_0_ftz_param_0]; -; CHECK-F32X2-O3-NEXT: mov.b32 %r1, 0f40000000; -; CHECK-F32X2-O3-NEXT: mov.b32 %r2, 0f3F800000; -; CHECK-F32X2-O3-NEXT: mov.b64 %rd2, {%r2, %r1}; -; CHECK-F32X2-O3-NEXT: add.rn.ftz.f32x2 %rd3, %rd1, %rd2; -; CHECK-F32X2-O3-NEXT: st.param.b64 [func_retval0], %rd3; -; CHECK-F32X2-O3-NEXT: ret; %r = fadd <2 x float> , %a ret <2 x float> %r } @@ -767,20 +548,6 @@ define <2 x float> @test_fadd_imm_1_ftz(<2 x float> %a) #2 { ; CHECK-F32X2-NEXT: add.rn.ftz.f32x2 %rd3, %rd1, %rd2; ; CHECK-F32X2-NEXT: st.param.b64 [func_retval0], %rd3; ; CHECK-F32X2-NEXT: ret; -; -; CHECK-F32X2-O3-LABEL: test_fadd_imm_1_ftz( -; CHECK-F32X2-O3: { -; CHECK-F32X2-O3-NEXT: .reg .b32 %r<3>; -; CHECK-F32X2-O3-NEXT: .reg .b64 %rd<4>; -; CHECK-F32X2-O3-EMPTY: -; CHECK-F32X2-O3-NEXT: // %bb.0: -; CHECK-F32X2-O3-NEXT: ld.param.b64 %rd1, [test_fadd_imm_1_ftz_param_0]; -; CHECK-F32X2-O3-NEXT: mov.b32 %r1, 0f40000000; -; CHECK-F32X2-O3-NEXT: mov.b32 %r2, 0f3F800000; -; CHECK-F32X2-O3-NEXT: mov.b64 %rd2, {%r2, %r1}; -; CHECK-F32X2-O3-NEXT: add.rn.ftz.f32x2 %rd3, %rd1, %rd2; -; CHECK-F32X2-O3-NEXT: st.param.b64 [func_retval0], %rd3; -; CHECK-F32X2-O3-NEXT: ret; %r = fadd <2 x float> %a, ret <2 x float> %r } @@ -811,18 +578,6 @@ define <4 x float> @test_fadd_v4_ftz(<4 x float> %a, <4 x float> %b) #2 { ; CHECK-F32X2-NEXT: add.rn.ftz.f32x2 %rd6, %rd1, %rd3; ; CHECK-F32X2-NEXT: st.param.v2.b64 [func_retval0], {%rd6, %rd5}; ; CHECK-F32X2-NEXT: ret; -; -; CHECK-F32X2-O3-LABEL: test_fadd_v4_ftz( -; CHECK-F32X2-O3: { -; CHECK-F32X2-O3-NEXT: .reg .b64 %rd<7>; -; CHECK-F32X2-O3-EMPTY: -; CHECK-F32X2-O3-NEXT: // %bb.0: -; CHECK-F32X2-O3-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_fadd_v4_ftz_param_0]; -; CHECK-F32X2-O3-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [test_fadd_v4_ftz_param_1]; -; CHECK-F32X2-O3-NEXT: add.rn.ftz.f32x2 %rd5, %rd2, %rd4; -; CHECK-F32X2-O3-NEXT: add.rn.ftz.f32x2 %rd6, %rd1, %rd3; -; CHECK-F32X2-O3-NEXT: st.param.v2.b64 [func_retval0], {%rd6, %rd5}; -; CHECK-F32X2-O3-NEXT: ret; %r = fadd <4 x float> %a, %b ret <4 x float> %r } @@ -858,24 +613,6 @@ define <4 x float> @test_fadd_imm_0_v4_ftz(<4 x float> %a) #2 { ; CHECK-F32X2-NEXT: add.rn.ftz.f32x2 %rd6, %rd1, %rd5; ; CHECK-F32X2-NEXT: st.param.v2.b64 [func_retval0], {%rd6, %rd4}; ; CHECK-F32X2-NEXT: ret; -; -; CHECK-F32X2-O3-LABEL: test_fadd_imm_0_v4_ftz( -; CHECK-F32X2-O3: { -; CHECK-F32X2-O3-NEXT: .reg .b32 %r<5>; -; CHECK-F32X2-O3-NEXT: .reg .b64 %rd<7>; -; CHECK-F32X2-O3-EMPTY: -; CHECK-F32X2-O3-NEXT: // %bb.0: -; CHECK-F32X2-O3-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_fadd_imm_0_v4_ftz_param_0]; -; CHECK-F32X2-O3-NEXT: mov.b32 %r1, 0f40800000; -; CHECK-F32X2-O3-NEXT: mov.b32 %r2, 0f40400000; -; CHECK-F32X2-O3-NEXT: mov.b64 %rd3, {%r2, %r1}; -; CHECK-F32X2-O3-NEXT: add.rn.ftz.f32x2 %rd4, %rd2, %rd3; -; CHECK-F32X2-O3-NEXT: mov.b32 %r3, 0f40000000; -; CHECK-F32X2-O3-NEXT: mov.b32 %r4, 0f3F800000; -; CHECK-F32X2-O3-NEXT: mov.b64 %rd5, {%r4, %r3}; -; CHECK-F32X2-O3-NEXT: add.rn.ftz.f32x2 %rd6, %rd1, %rd5; -; CHECK-F32X2-O3-NEXT: st.param.v2.b64 [func_retval0], {%rd6, %rd4}; -; CHECK-F32X2-O3-NEXT: ret; %r = fadd <4 x float> , %a ret <4 x float> %r } @@ -911,24 +648,6 @@ define <4 x float> @test_fadd_imm_1_v4_ftz(<4 x float> %a) #2 { ; CHECK-F32X2-NEXT: add.rn.ftz.f32x2 %rd6, %rd1, %rd5; ; CHECK-F32X2-NEXT: st.param.v2.b64 [func_retval0], {%rd6, %rd4}; ; CHECK-F32X2-NEXT: ret; -; -; CHECK-F32X2-O3-LABEL: test_fadd_imm_1_v4_ftz( -; CHECK-F32X2-O3: { -; CHECK-F32X2-O3-NEXT: .reg .b32 %r<5>; -; CHECK-F32X2-O3-NEXT: .reg .b64 %rd<7>; -; CHECK-F32X2-O3-EMPTY: -; CHECK-F32X2-O3-NEXT: // %bb.0: -; CHECK-F32X2-O3-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_fadd_imm_1_v4_ftz_param_0]; -; CHECK-F32X2-O3-NEXT: mov.b32 %r1, 0f40800000; -; CHECK-F32X2-O3-NEXT: mov.b32 %r2, 0f40400000; -; CHECK-F32X2-O3-NEXT: mov.b64 %rd3, {%r2, %r1}; -; CHECK-F32X2-O3-NEXT: add.rn.ftz.f32x2 %rd4, %rd2, %rd3; -; CHECK-F32X2-O3-NEXT: mov.b32 %r3, 0f40000000; -; CHECK-F32X2-O3-NEXT: mov.b32 %r4, 0f3F800000; -; CHECK-F32X2-O3-NEXT: mov.b64 %rd5, {%r4, %r3}; -; CHECK-F32X2-O3-NEXT: add.rn.ftz.f32x2 %rd6, %rd1, %rd5; -; CHECK-F32X2-O3-NEXT: st.param.v2.b64 [func_retval0], {%rd6, %rd4}; -; CHECK-F32X2-O3-NEXT: ret; %r = fadd <4 x float> %a, ret <4 x float> %r } @@ -956,17 +675,6 @@ define <2 x float> @test_fsub_ftz(<2 x float> %a, <2 x float> %b) #2 { ; CHECK-F32X2-NEXT: sub.rn.ftz.f32x2 %rd3, %rd1, %rd2; ; CHECK-F32X2-NEXT: st.param.b64 [func_retval0], %rd3; ; CHECK-F32X2-NEXT: ret; -; -; CHECK-F32X2-O3-LABEL: test_fsub_ftz( -; CHECK-F32X2-O3: { -; CHECK-F32X2-O3-NEXT: .reg .b64 %rd<4>; -; CHECK-F32X2-O3-EMPTY: -; CHECK-F32X2-O3-NEXT: // %bb.0: -; CHECK-F32X2-O3-NEXT: ld.param.b64 %rd1, [test_fsub_ftz_param_0]; -; CHECK-F32X2-O3-NEXT: ld.param.b64 %rd2, [test_fsub_ftz_param_1]; -; CHECK-F32X2-O3-NEXT: sub.rn.ftz.f32x2 %rd3, %rd1, %rd2; -; CHECK-F32X2-O3-NEXT: st.param.b64 [func_retval0], %rd3; -; CHECK-F32X2-O3-NEXT: ret; %r = fsub <2 x float> %a, %b ret <2 x float> %r } @@ -995,17 +703,6 @@ define <2 x float> @test_fneg_ftz(<2 x float> %a) #2 { ; CHECK-F32X2-NEXT: neg.ftz.f32 %r4, %r1; ; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; ; CHECK-F32X2-NEXT: ret; -; -; CHECK-F32X2-O3-LABEL: test_fneg_ftz( -; CHECK-F32X2-O3: { -; CHECK-F32X2-O3-NEXT: .reg .b32 %r<5>; -; CHECK-F32X2-O3-EMPTY: -; CHECK-F32X2-O3-NEXT: // %bb.0: -; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fneg_ftz_param_0]; -; CHECK-F32X2-O3-NEXT: neg.ftz.f32 %r3, %r2; -; CHECK-F32X2-O3-NEXT: neg.ftz.f32 %r4, %r1; -; CHECK-F32X2-O3-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; -; CHECK-F32X2-O3-NEXT: ret; %r = fneg <2 x float> %a ret <2 x float> %r } @@ -1033,17 +730,6 @@ define <2 x float> @test_fmul_ftz(<2 x float> %a, <2 x float> %b) #2 { ; CHECK-F32X2-NEXT: mul.rn.ftz.f32x2 %rd3, %rd1, %rd2; ; CHECK-F32X2-NEXT: st.param.b64 [func_retval0], %rd3; ; CHECK-F32X2-NEXT: ret; -; -; CHECK-F32X2-O3-LABEL: test_fmul_ftz( -; CHECK-F32X2-O3: { -; CHECK-F32X2-O3-NEXT: .reg .b64 %rd<4>; -; CHECK-F32X2-O3-EMPTY: -; CHECK-F32X2-O3-NEXT: // %bb.0: -; CHECK-F32X2-O3-NEXT: ld.param.b64 %rd1, [test_fmul_ftz_param_0]; -; CHECK-F32X2-O3-NEXT: ld.param.b64 %rd2, [test_fmul_ftz_param_1]; -; CHECK-F32X2-O3-NEXT: mul.rn.ftz.f32x2 %rd3, %rd1, %rd2; -; CHECK-F32X2-O3-NEXT: st.param.b64 [func_retval0], %rd3; -; CHECK-F32X2-O3-NEXT: ret; %r = fmul <2 x float> %a, %b ret <2 x float> %r } @@ -1073,18 +759,6 @@ define <2 x float> @test_fma_ftz(<2 x float> %a, <2 x float> %b, <2 x float> %c) ; CHECK-F32X2-NEXT: fma.rn.ftz.f32x2 %rd4, %rd1, %rd2, %rd3; ; CHECK-F32X2-NEXT: st.param.b64 [func_retval0], %rd4; ; CHECK-F32X2-NEXT: ret; -; -; CHECK-F32X2-O3-LABEL: test_fma_ftz( -; CHECK-F32X2-O3: { -; CHECK-F32X2-O3-NEXT: .reg .b64 %rd<5>; -; CHECK-F32X2-O3-EMPTY: -; CHECK-F32X2-O3-NEXT: // %bb.0: -; CHECK-F32X2-O3-NEXT: ld.param.b64 %rd1, [test_fma_ftz_param_0]; -; CHECK-F32X2-O3-NEXT: ld.param.b64 %rd2, [test_fma_ftz_param_1]; -; CHECK-F32X2-O3-NEXT: ld.param.b64 %rd3, [test_fma_ftz_param_2]; -; CHECK-F32X2-O3-NEXT: fma.rn.ftz.f32x2 %rd4, %rd1, %rd2, %rd3; -; CHECK-F32X2-O3-NEXT: st.param.b64 [func_retval0], %rd4; -; CHECK-F32X2-O3-NEXT: ret; %r = call <2 x float> @llvm.fma(<2 x float> %a, <2 x float> %b, <2 x float> %c) ret <2 x float> %r } @@ -1116,18 +790,6 @@ define <2 x float> @test_fdiv_ftz(<2 x float> %a, <2 x float> %b) #2 { ; CHECK-F32X2-NEXT: div.rn.ftz.f32 %r6, %r3, %r1; ; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5}; ; CHECK-F32X2-NEXT: ret; -; -; CHECK-F32X2-O3-LABEL: test_fdiv_ftz( -; CHECK-F32X2-O3: { -; CHECK-F32X2-O3-NEXT: .reg .b32 %r<7>; -; CHECK-F32X2-O3-EMPTY: -; CHECK-F32X2-O3-NEXT: // %bb.0: -; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fdiv_ftz_param_0]; -; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fdiv_ftz_param_1]; -; CHECK-F32X2-O3-NEXT: div.rn.ftz.f32 %r5, %r2, %r4; -; CHECK-F32X2-O3-NEXT: div.rn.ftz.f32 %r6, %r1, %r3; -; CHECK-F32X2-O3-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5}; -; CHECK-F32X2-O3-NEXT: ret; %r = fdiv <2 x float> %a, %b ret <2 x float> %r } @@ -1181,29 +843,6 @@ define <2 x float> @test_frem_ftz(<2 x float> %a, <2 x float> %b) #2 { ; CHECK-F32X2-NEXT: selp.f32 %r14, %r3, %r13, %p2; ; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r14, %r9}; ; CHECK-F32X2-NEXT: ret; -; -; CHECK-F32X2-O3-LABEL: test_frem_ftz( -; CHECK-F32X2-O3: { -; CHECK-F32X2-O3-NEXT: .reg .pred %p<3>; -; CHECK-F32X2-O3-NEXT: .reg .b32 %r<15>; -; CHECK-F32X2-O3-EMPTY: -; CHECK-F32X2-O3-NEXT: // %bb.0: -; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_frem_ftz_param_0]; -; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_frem_ftz_param_1]; -; CHECK-F32X2-O3-NEXT: div.rn.ftz.f32 %r5, %r2, %r4; -; CHECK-F32X2-O3-NEXT: cvt.rzi.ftz.f32.f32 %r6, %r5; -; CHECK-F32X2-O3-NEXT: neg.ftz.f32 %r7, %r6; -; CHECK-F32X2-O3-NEXT: fma.rn.ftz.f32 %r8, %r7, %r4, %r2; -; CHECK-F32X2-O3-NEXT: testp.infinite.f32 %p1, %r4; -; CHECK-F32X2-O3-NEXT: selp.f32 %r9, %r2, %r8, %p1; -; CHECK-F32X2-O3-NEXT: div.rn.ftz.f32 %r10, %r1, %r3; -; CHECK-F32X2-O3-NEXT: cvt.rzi.ftz.f32.f32 %r11, %r10; -; CHECK-F32X2-O3-NEXT: neg.ftz.f32 %r12, %r11; -; CHECK-F32X2-O3-NEXT: fma.rn.ftz.f32 %r13, %r12, %r3, %r1; -; CHECK-F32X2-O3-NEXT: testp.infinite.f32 %p2, %r3; -; CHECK-F32X2-O3-NEXT: selp.f32 %r14, %r1, %r13, %p2; -; CHECK-F32X2-O3-NEXT: st.param.v2.b32 [func_retval0], {%r14, %r9}; -; CHECK-F32X2-O3-NEXT: ret; %r = frem <2 x float> %a, %b ret <2 x float> %r } @@ -1231,64 +870,25 @@ define void @test_ldst_v2f32(ptr %a, ptr %b) #0 { ; CHECK-F32X2-NEXT: ld.b64 %rd3, [%rd1]; ; CHECK-F32X2-NEXT: st.b64 [%rd2], %rd3; ; CHECK-F32X2-NEXT: ret; -; -; CHECK-F32X2-O3-LABEL: test_ldst_v2f32( -; CHECK-F32X2-O3: { -; CHECK-F32X2-O3-NEXT: .reg .b64 %rd<4>; -; CHECK-F32X2-O3-EMPTY: -; CHECK-F32X2-O3-NEXT: // %bb.0: -; CHECK-F32X2-O3-NEXT: ld.param.b64 %rd1, [test_ldst_v2f32_param_0]; -; CHECK-F32X2-O3-NEXT: ld.b64 %rd2, [%rd1]; -; CHECK-F32X2-O3-NEXT: ld.param.b64 %rd3, [test_ldst_v2f32_param_1]; -; CHECK-F32X2-O3-NEXT: st.b64 [%rd3], %rd2; -; CHECK-F32X2-O3-NEXT: ret; %t1 = load <2 x float>, ptr %a store <2 x float> %t1, ptr %b, align 32 ret void } define void @test_ldst_v3f32(ptr %a, ptr %b) #0 { -; CHECK-NOF32X2-LABEL: test_ldst_v3f32( -; CHECK-NOF32X2: { -; CHECK-NOF32X2-NEXT: .reg .b32 %r<2>; -; CHECK-NOF32X2-NEXT: .reg .b64 %rd<4>; -; CHECK-NOF32X2-EMPTY: -; CHECK-NOF32X2-NEXT: // %bb.0: -; CHECK-NOF32X2-NEXT: ld.param.b64 %rd2, [test_ldst_v3f32_param_1]; -; CHECK-NOF32X2-NEXT: ld.param.b64 %rd1, [test_ldst_v3f32_param_0]; -; CHECK-NOF32X2-NEXT: ld.b64 %rd3, [%rd1]; -; CHECK-NOF32X2-NEXT: ld.b32 %r1, [%rd1+8]; -; CHECK-NOF32X2-NEXT: st.b32 [%rd2+8], %r1; -; CHECK-NOF32X2-NEXT: st.b64 [%rd2], %rd3; -; CHECK-NOF32X2-NEXT: ret; -; -; CHECK-F32X2-LABEL: test_ldst_v3f32( -; CHECK-F32X2: { -; CHECK-F32X2-NEXT: .reg .b32 %r<2>; -; CHECK-F32X2-NEXT: .reg .b64 %rd<4>; -; CHECK-F32X2-EMPTY: -; CHECK-F32X2-NEXT: // %bb.0: -; CHECK-F32X2-NEXT: ld.param.b64 %rd2, [test_ldst_v3f32_param_1]; -; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_ldst_v3f32_param_0]; -; CHECK-F32X2-NEXT: ld.b64 %rd3, [%rd1]; -; CHECK-F32X2-NEXT: ld.b32 %r1, [%rd1+8]; -; CHECK-F32X2-NEXT: st.b32 [%rd2+8], %r1; -; CHECK-F32X2-NEXT: st.b64 [%rd2], %rd3; -; CHECK-F32X2-NEXT: ret; -; -; CHECK-F32X2-O3-LABEL: test_ldst_v3f32( -; CHECK-F32X2-O3: { -; CHECK-F32X2-O3-NEXT: .reg .b32 %r<2>; -; CHECK-F32X2-O3-NEXT: .reg .b64 %rd<4>; -; CHECK-F32X2-O3-EMPTY: -; CHECK-F32X2-O3-NEXT: // %bb.0: -; CHECK-F32X2-O3-NEXT: ld.param.b64 %rd1, [test_ldst_v3f32_param_0]; -; CHECK-F32X2-O3-NEXT: ld.b64 %rd2, [%rd1]; -; CHECK-F32X2-O3-NEXT: ld.b32 %r1, [%rd1+8]; -; CHECK-F32X2-O3-NEXT: ld.param.b64 %rd3, [test_ldst_v3f32_param_1]; -; CHECK-F32X2-O3-NEXT: st.b32 [%rd3+8], %r1; -; CHECK-F32X2-O3-NEXT: st.b64 [%rd3], %rd2; -; CHECK-F32X2-O3-NEXT: ret; +; CHECK-LABEL: test_ldst_v3f32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd2, [test_ldst_v3f32_param_1]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_ldst_v3f32_param_0]; +; CHECK-NEXT: ld.b64 %rd3, [%rd1]; +; CHECK-NEXT: ld.b32 %r1, [%rd1+8]; +; CHECK-NEXT: st.b32 [%rd2+8], %r1; +; CHECK-NEXT: st.b64 [%rd2], %rd3; +; CHECK-NEXT: ret; %t1 = load <3 x float>, ptr %a store <3 x float> %t1, ptr %b, align 32 ret void @@ -1317,17 +917,6 @@ define void @test_ldst_v4f32(ptr %a, ptr %b) #0 { ; CHECK-F32X2-NEXT: ld.v2.b64 {%rd3, %rd4}, [%rd1]; ; CHECK-F32X2-NEXT: st.v2.b64 [%rd2], {%rd3, %rd4}; ; CHECK-F32X2-NEXT: ret; -; -; CHECK-F32X2-O3-LABEL: test_ldst_v4f32( -; CHECK-F32X2-O3: { -; CHECK-F32X2-O3-NEXT: .reg .b64 %rd<5>; -; CHECK-F32X2-O3-EMPTY: -; CHECK-F32X2-O3-NEXT: // %bb.0: -; CHECK-F32X2-O3-NEXT: ld.param.b64 %rd1, [test_ldst_v4f32_param_0]; -; CHECK-F32X2-O3-NEXT: ld.v2.b64 {%rd2, %rd3}, [%rd1]; -; CHECK-F32X2-O3-NEXT: ld.param.b64 %rd4, [test_ldst_v4f32_param_1]; -; CHECK-F32X2-O3-NEXT: st.v2.b64 [%rd4], {%rd2, %rd3}; -; CHECK-F32X2-O3-NEXT: ret; %t1 = load <4 x float>, ptr %a store <4 x float> %t1, ptr %b, align 32 ret void @@ -1360,19 +949,6 @@ define void @test_ldst_v8f32(ptr %a, ptr %b) #0 { ; CHECK-F32X2-NEXT: st.v2.b64 [%rd2+16], {%rd5, %rd6}; ; CHECK-F32X2-NEXT: st.v2.b64 [%rd2], {%rd3, %rd4}; ; CHECK-F32X2-NEXT: ret; -; -; CHECK-F32X2-O3-LABEL: test_ldst_v8f32( -; CHECK-F32X2-O3: { -; CHECK-F32X2-O3-NEXT: .reg .b64 %rd<7>; -; CHECK-F32X2-O3-EMPTY: -; CHECK-F32X2-O3-NEXT: // %bb.0: -; CHECK-F32X2-O3-NEXT: ld.param.b64 %rd1, [test_ldst_v8f32_param_0]; -; CHECK-F32X2-O3-NEXT: ld.v2.b64 {%rd2, %rd3}, [%rd1]; -; CHECK-F32X2-O3-NEXT: ld.v2.b64 {%rd4, %rd5}, [%rd1+16]; -; CHECK-F32X2-O3-NEXT: ld.param.b64 %rd6, [test_ldst_v8f32_param_1]; -; CHECK-F32X2-O3-NEXT: st.v2.b64 [%rd6+16], {%rd4, %rd5}; -; CHECK-F32X2-O3-NEXT: st.v2.b64 [%rd6], {%rd2, %rd3}; -; CHECK-F32X2-O3-NEXT: ret; %t1 = load <8 x float>, ptr %a store <8 x float> %t1, ptr %b, align 32 ret void @@ -1418,25 +994,6 @@ define <2 x float> @test_call(<2 x float> %a, <2 x float> %b) #0 { ; CHECK-F32X2-NEXT: } // callseq 0 ; CHECK-F32X2-NEXT: st.param.b64 [func_retval0], %rd3; ; CHECK-F32X2-NEXT: ret; -; -; CHECK-F32X2-O3-LABEL: test_call( -; CHECK-F32X2-O3: { -; CHECK-F32X2-O3-NEXT: .reg .b64 %rd<4>; -; CHECK-F32X2-O3-EMPTY: -; CHECK-F32X2-O3-NEXT: // %bb.0: -; CHECK-F32X2-O3-NEXT: ld.param.b64 %rd1, [test_call_param_0]; -; CHECK-F32X2-O3-NEXT: { // callseq 0, 0 -; CHECK-F32X2-O3-NEXT: .param .align 8 .b8 param0[8]; -; CHECK-F32X2-O3-NEXT: .param .align 8 .b8 param1[8]; -; CHECK-F32X2-O3-NEXT: .param .align 8 .b8 retval0[8]; -; CHECK-F32X2-O3-NEXT: ld.param.b64 %rd2, [test_call_param_1]; -; CHECK-F32X2-O3-NEXT: st.param.b64 [param1], %rd2; -; CHECK-F32X2-O3-NEXT: st.param.b64 [param0], %rd1; -; CHECK-F32X2-O3-NEXT: call.uni (retval0), test_callee, (param0, param1); -; CHECK-F32X2-O3-NEXT: ld.param.b64 %rd3, [retval0]; -; CHECK-F32X2-O3-NEXT: } // callseq 0 -; CHECK-F32X2-O3-NEXT: st.param.b64 [func_retval0], %rd3; -; CHECK-F32X2-O3-NEXT: ret; %r = call <2 x float> @test_callee(<2 x float> %a, <2 x float> %b) ret <2 x float> %r } @@ -1479,25 +1036,6 @@ define <2 x float> @test_call_flipped(<2 x float> %a, <2 x float> %b) #0 { ; CHECK-F32X2-NEXT: } // callseq 1 ; CHECK-F32X2-NEXT: st.param.b64 [func_retval0], %rd3; ; CHECK-F32X2-NEXT: ret; -; -; CHECK-F32X2-O3-LABEL: test_call_flipped( -; CHECK-F32X2-O3: { -; CHECK-F32X2-O3-NEXT: .reg .b64 %rd<4>; -; CHECK-F32X2-O3-EMPTY: -; CHECK-F32X2-O3-NEXT: // %bb.0: -; CHECK-F32X2-O3-NEXT: ld.param.b64 %rd1, [test_call_flipped_param_0]; -; CHECK-F32X2-O3-NEXT: { // callseq 1, 0 -; CHECK-F32X2-O3-NEXT: .param .align 8 .b8 param0[8]; -; CHECK-F32X2-O3-NEXT: .param .align 8 .b8 param1[8]; -; CHECK-F32X2-O3-NEXT: .param .align 8 .b8 retval0[8]; -; CHECK-F32X2-O3-NEXT: st.param.b64 [param1], %rd1; -; CHECK-F32X2-O3-NEXT: ld.param.b64 %rd2, [test_call_flipped_param_1]; -; CHECK-F32X2-O3-NEXT: st.param.b64 [param0], %rd2; -; CHECK-F32X2-O3-NEXT: call.uni (retval0), test_callee, (param0, param1); -; CHECK-F32X2-O3-NEXT: ld.param.b64 %rd3, [retval0]; -; CHECK-F32X2-O3-NEXT: } // callseq 1 -; CHECK-F32X2-O3-NEXT: st.param.b64 [func_retval0], %rd3; -; CHECK-F32X2-O3-NEXT: ret; %r = call <2 x float> @test_callee(<2 x float> %b, <2 x float> %a) ret <2 x float> %r } @@ -1540,25 +1078,6 @@ define <2 x float> @test_tailcall_flipped(<2 x float> %a, <2 x float> %b) #0 { ; CHECK-F32X2-NEXT: } // callseq 2 ; CHECK-F32X2-NEXT: st.param.b64 [func_retval0], %rd3; ; CHECK-F32X2-NEXT: ret; -; -; CHECK-F32X2-O3-LABEL: test_tailcall_flipped( -; CHECK-F32X2-O3: { -; CHECK-F32X2-O3-NEXT: .reg .b64 %rd<4>; -; CHECK-F32X2-O3-EMPTY: -; CHECK-F32X2-O3-NEXT: // %bb.0: -; CHECK-F32X2-O3-NEXT: ld.param.b64 %rd1, [test_tailcall_flipped_param_0]; -; CHECK-F32X2-O3-NEXT: { // callseq 2, 0 -; CHECK-F32X2-O3-NEXT: .param .align 8 .b8 param0[8]; -; CHECK-F32X2-O3-NEXT: .param .align 8 .b8 param1[8]; -; CHECK-F32X2-O3-NEXT: .param .align 8 .b8 retval0[8]; -; CHECK-F32X2-O3-NEXT: st.param.b64 [param1], %rd1; -; CHECK-F32X2-O3-NEXT: ld.param.b64 %rd2, [test_tailcall_flipped_param_1]; -; CHECK-F32X2-O3-NEXT: st.param.b64 [param0], %rd2; -; CHECK-F32X2-O3-NEXT: call.uni (retval0), test_callee, (param0, param1); -; CHECK-F32X2-O3-NEXT: ld.param.b64 %rd3, [retval0]; -; CHECK-F32X2-O3-NEXT: } // callseq 2 -; CHECK-F32X2-O3-NEXT: st.param.b64 [func_retval0], %rd3; -; CHECK-F32X2-O3-NEXT: ret; %r = tail call <2 x float> @test_callee(<2 x float> %b, <2 x float> %a) ret <2 x float> %r } @@ -1596,22 +1115,6 @@ define <2 x float> @test_select(<2 x float> %a, <2 x float> %b, i1 zeroext %c) # ; CHECK-F32X2-NEXT: selp.b64 %rd3, %rd1, %rd2, %p1; ; CHECK-F32X2-NEXT: st.param.b64 [func_retval0], %rd3; ; CHECK-F32X2-NEXT: ret; -; -; CHECK-F32X2-O3-LABEL: test_select( -; CHECK-F32X2-O3: { -; CHECK-F32X2-O3-NEXT: .reg .pred %p<2>; -; CHECK-F32X2-O3-NEXT: .reg .b16 %rs<3>; -; CHECK-F32X2-O3-NEXT: .reg .b64 %rd<4>; -; CHECK-F32X2-O3-EMPTY: -; CHECK-F32X2-O3-NEXT: // %bb.0: -; CHECK-F32X2-O3-NEXT: ld.param.b8 %rs1, [test_select_param_2]; -; CHECK-F32X2-O3-NEXT: and.b16 %rs2, %rs1, 1; -; CHECK-F32X2-O3-NEXT: setp.ne.b16 %p1, %rs2, 0; -; CHECK-F32X2-O3-NEXT: ld.param.b64 %rd1, [test_select_param_0]; -; CHECK-F32X2-O3-NEXT: ld.param.b64 %rd2, [test_select_param_1]; -; CHECK-F32X2-O3-NEXT: selp.b64 %rd3, %rd1, %rd2, %p1; -; CHECK-F32X2-O3-NEXT: st.param.b64 [func_retval0], %rd3; -; CHECK-F32X2-O3-NEXT: ret; %r = select i1 %c, <2 x float> %a, <2 x float> %b ret <2 x float> %r } @@ -1655,23 +1158,6 @@ define <2 x float> @test_select_cc(<2 x float> %a, <2 x float> %b, <2 x float> % ; CHECK-F32X2-NEXT: selp.f32 %r10, %r7, %r5, %p1; ; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r10, %r9}; ; CHECK-F32X2-NEXT: ret; -; -; CHECK-F32X2-O3-LABEL: test_select_cc( -; CHECK-F32X2-O3: { -; CHECK-F32X2-O3-NEXT: .reg .pred %p<3>; -; CHECK-F32X2-O3-NEXT: .reg .b32 %r<11>; -; CHECK-F32X2-O3-EMPTY: -; CHECK-F32X2-O3-NEXT: // %bb.0: -; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_select_cc_param_0]; -; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_select_cc_param_2]; -; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r5, %r6}, [test_select_cc_param_3]; -; CHECK-F32X2-O3-NEXT: setp.neu.f32 %p1, %r3, %r5; -; CHECK-F32X2-O3-NEXT: setp.neu.f32 %p2, %r4, %r6; -; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r7, %r8}, [test_select_cc_param_1]; -; CHECK-F32X2-O3-NEXT: selp.f32 %r9, %r2, %r8, %p2; -; CHECK-F32X2-O3-NEXT: selp.f32 %r10, %r1, %r7, %p1; -; CHECK-F32X2-O3-NEXT: st.param.v2.b32 [func_retval0], {%r10, %r9}; -; CHECK-F32X2-O3-NEXT: ret; %cc = fcmp une <2 x float> %c, %d %r = select <2 x i1> %cc, <2 x float> %a, <2 x float> %b ret <2 x float> %r @@ -1715,24 +1201,6 @@ define <2 x double> @test_select_cc_f64_f32(<2 x double> %a, <2 x double> %b, <2 ; CHECK-F32X2-NEXT: selp.f64 %rd8, %rd1, %rd3, %p1; ; CHECK-F32X2-NEXT: st.param.v2.b64 [func_retval0], {%rd8, %rd7}; ; CHECK-F32X2-NEXT: ret; -; -; CHECK-F32X2-O3-LABEL: test_select_cc_f64_f32( -; CHECK-F32X2-O3: { -; CHECK-F32X2-O3-NEXT: .reg .pred %p<3>; -; CHECK-F32X2-O3-NEXT: .reg .b32 %r<5>; -; CHECK-F32X2-O3-NEXT: .reg .b64 %rd<7>; -; CHECK-F32X2-O3-EMPTY: -; CHECK-F32X2-O3-NEXT: // %bb.0: -; CHECK-F32X2-O3-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_select_cc_f64_f32_param_0]; -; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_select_cc_f64_f32_param_2]; -; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_select_cc_f64_f32_param_3]; -; CHECK-F32X2-O3-NEXT: setp.neu.f32 %p1, %r1, %r3; -; CHECK-F32X2-O3-NEXT: setp.neu.f32 %p2, %r2, %r4; -; CHECK-F32X2-O3-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [test_select_cc_f64_f32_param_1]; -; CHECK-F32X2-O3-NEXT: selp.f64 %rd5, %rd2, %rd4, %p2; -; CHECK-F32X2-O3-NEXT: selp.f64 %rd6, %rd1, %rd3, %p1; -; CHECK-F32X2-O3-NEXT: st.param.v2.b64 [func_retval0], {%rd6, %rd5}; -; CHECK-F32X2-O3-NEXT: ret; %cc = fcmp une <2 x float> %c, %d %r = select <2 x i1> %cc, <2 x double> %a, <2 x double> %b ret <2 x double> %r @@ -1776,24 +1244,6 @@ define <2 x float> @test_select_cc_f32_f64(<2 x float> %a, <2 x float> %b, <2 x ; CHECK-F32X2-NEXT: selp.f32 %r6, %r3, %r1, %p1; ; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5}; ; CHECK-F32X2-NEXT: ret; -; -; CHECK-F32X2-O3-LABEL: test_select_cc_f32_f64( -; CHECK-F32X2-O3: { -; CHECK-F32X2-O3-NEXT: .reg .pred %p<3>; -; CHECK-F32X2-O3-NEXT: .reg .b32 %r<7>; -; CHECK-F32X2-O3-NEXT: .reg .b64 %rd<5>; -; CHECK-F32X2-O3-EMPTY: -; CHECK-F32X2-O3-NEXT: // %bb.0: -; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_select_cc_f32_f64_param_0]; -; CHECK-F32X2-O3-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_select_cc_f32_f64_param_2]; -; CHECK-F32X2-O3-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [test_select_cc_f32_f64_param_3]; -; CHECK-F32X2-O3-NEXT: setp.neu.f64 %p1, %rd1, %rd3; -; CHECK-F32X2-O3-NEXT: setp.neu.f64 %p2, %rd2, %rd4; -; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_select_cc_f32_f64_param_1]; -; CHECK-F32X2-O3-NEXT: selp.f32 %r5, %r2, %r4, %p2; -; CHECK-F32X2-O3-NEXT: selp.f32 %r6, %r1, %r3, %p1; -; CHECK-F32X2-O3-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5}; -; CHECK-F32X2-O3-NEXT: ret; %cc = fcmp une <2 x double> %c, %d %r = select <2 x i1> %cc, <2 x float> %a, <2 x float> %b ret <2 x float> %r @@ -1836,23 +1286,6 @@ define <2 x i1> @test_fcmp_une(<2 x float> %a, <2 x float> %b) #0 { ; CHECK-F32X2-NEXT: selp.b16 %rs2, -1, 0, %p1; ; CHECK-F32X2-NEXT: st.param.b8 [func_retval0+1], %rs2; ; CHECK-F32X2-NEXT: ret; -; -; CHECK-F32X2-O3-LABEL: test_fcmp_une( -; CHECK-F32X2-O3: { -; CHECK-F32X2-O3-NEXT: .reg .pred %p<3>; -; CHECK-F32X2-O3-NEXT: .reg .b16 %rs<3>; -; CHECK-F32X2-O3-NEXT: .reg .b32 %r<5>; -; CHECK-F32X2-O3-EMPTY: -; CHECK-F32X2-O3-NEXT: // %bb.0: -; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_une_param_0]; -; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_une_param_1]; -; CHECK-F32X2-O3-NEXT: setp.neu.f32 %p1, %r1, %r3; -; CHECK-F32X2-O3-NEXT: setp.neu.f32 %p2, %r2, %r4; -; CHECK-F32X2-O3-NEXT: selp.b16 %rs1, -1, 0, %p2; -; CHECK-F32X2-O3-NEXT: st.param.b8 [func_retval0+1], %rs1; -; CHECK-F32X2-O3-NEXT: selp.b16 %rs2, -1, 0, %p1; -; CHECK-F32X2-O3-NEXT: st.param.b8 [func_retval0], %rs2; -; CHECK-F32X2-O3-NEXT: ret; %r = fcmp une <2 x float> %a, %b ret <2 x i1> %r } @@ -1894,23 +1327,6 @@ define <2 x i1> @test_fcmp_ueq(<2 x float> %a, <2 x float> %b) #0 { ; CHECK-F32X2-NEXT: selp.b16 %rs2, -1, 0, %p1; ; CHECK-F32X2-NEXT: st.param.b8 [func_retval0+1], %rs2; ; CHECK-F32X2-NEXT: ret; -; -; CHECK-F32X2-O3-LABEL: test_fcmp_ueq( -; CHECK-F32X2-O3: { -; CHECK-F32X2-O3-NEXT: .reg .pred %p<3>; -; CHECK-F32X2-O3-NEXT: .reg .b16 %rs<3>; -; CHECK-F32X2-O3-NEXT: .reg .b32 %r<5>; -; CHECK-F32X2-O3-EMPTY: -; CHECK-F32X2-O3-NEXT: // %bb.0: -; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_ueq_param_0]; -; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_ueq_param_1]; -; CHECK-F32X2-O3-NEXT: setp.equ.f32 %p1, %r1, %r3; -; CHECK-F32X2-O3-NEXT: setp.equ.f32 %p2, %r2, %r4; -; CHECK-F32X2-O3-NEXT: selp.b16 %rs1, -1, 0, %p2; -; CHECK-F32X2-O3-NEXT: st.param.b8 [func_retval0+1], %rs1; -; CHECK-F32X2-O3-NEXT: selp.b16 %rs2, -1, 0, %p1; -; CHECK-F32X2-O3-NEXT: st.param.b8 [func_retval0], %rs2; -; CHECK-F32X2-O3-NEXT: ret; %r = fcmp ueq <2 x float> %a, %b ret <2 x i1> %r } @@ -1952,23 +1368,6 @@ define <2 x i1> @test_fcmp_ugt(<2 x float> %a, <2 x float> %b) #0 { ; CHECK-F32X2-NEXT: selp.b16 %rs2, -1, 0, %p1; ; CHECK-F32X2-NEXT: st.param.b8 [func_retval0+1], %rs2; ; CHECK-F32X2-NEXT: ret; -; -; CHECK-F32X2-O3-LABEL: test_fcmp_ugt( -; CHECK-F32X2-O3: { -; CHECK-F32X2-O3-NEXT: .reg .pred %p<3>; -; CHECK-F32X2-O3-NEXT: .reg .b16 %rs<3>; -; CHECK-F32X2-O3-NEXT: .reg .b32 %r<5>; -; CHECK-F32X2-O3-EMPTY: -; CHECK-F32X2-O3-NEXT: // %bb.0: -; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_ugt_param_0]; -; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_ugt_param_1]; -; CHECK-F32X2-O3-NEXT: setp.gtu.f32 %p1, %r1, %r3; -; CHECK-F32X2-O3-NEXT: setp.gtu.f32 %p2, %r2, %r4; -; CHECK-F32X2-O3-NEXT: selp.b16 %rs1, -1, 0, %p2; -; CHECK-F32X2-O3-NEXT: st.param.b8 [func_retval0+1], %rs1; -; CHECK-F32X2-O3-NEXT: selp.b16 %rs2, -1, 0, %p1; -; CHECK-F32X2-O3-NEXT: st.param.b8 [func_retval0], %rs2; -; CHECK-F32X2-O3-NEXT: ret; %r = fcmp ugt <2 x float> %a, %b ret <2 x i1> %r } @@ -2010,23 +1409,6 @@ define <2 x i1> @test_fcmp_uge(<2 x float> %a, <2 x float> %b) #0 { ; CHECK-F32X2-NEXT: selp.b16 %rs2, -1, 0, %p1; ; CHECK-F32X2-NEXT: st.param.b8 [func_retval0+1], %rs2; ; CHECK-F32X2-NEXT: ret; -; -; CHECK-F32X2-O3-LABEL: test_fcmp_uge( -; CHECK-F32X2-O3: { -; CHECK-F32X2-O3-NEXT: .reg .pred %p<3>; -; CHECK-F32X2-O3-NEXT: .reg .b16 %rs<3>; -; CHECK-F32X2-O3-NEXT: .reg .b32 %r<5>; -; CHECK-F32X2-O3-EMPTY: -; CHECK-F32X2-O3-NEXT: // %bb.0: -; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_uge_param_0]; -; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_uge_param_1]; -; CHECK-F32X2-O3-NEXT: setp.geu.f32 %p1, %r1, %r3; -; CHECK-F32X2-O3-NEXT: setp.geu.f32 %p2, %r2, %r4; -; CHECK-F32X2-O3-NEXT: selp.b16 %rs1, -1, 0, %p2; -; CHECK-F32X2-O3-NEXT: st.param.b8 [func_retval0+1], %rs1; -; CHECK-F32X2-O3-NEXT: selp.b16 %rs2, -1, 0, %p1; -; CHECK-F32X2-O3-NEXT: st.param.b8 [func_retval0], %rs2; -; CHECK-F32X2-O3-NEXT: ret; %r = fcmp uge <2 x float> %a, %b ret <2 x i1> %r } @@ -2068,23 +1450,6 @@ define <2 x i1> @test_fcmp_ult(<2 x float> %a, <2 x float> %b) #0 { ; CHECK-F32X2-NEXT: selp.b16 %rs2, -1, 0, %p1; ; CHECK-F32X2-NEXT: st.param.b8 [func_retval0+1], %rs2; ; CHECK-F32X2-NEXT: ret; -; -; CHECK-F32X2-O3-LABEL: test_fcmp_ult( -; CHECK-F32X2-O3: { -; CHECK-F32X2-O3-NEXT: .reg .pred %p<3>; -; CHECK-F32X2-O3-NEXT: .reg .b16 %rs<3>; -; CHECK-F32X2-O3-NEXT: .reg .b32 %r<5>; -; CHECK-F32X2-O3-EMPTY: -; CHECK-F32X2-O3-NEXT: // %bb.0: -; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_ult_param_0]; -; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_ult_param_1]; -; CHECK-F32X2-O3-NEXT: setp.ltu.f32 %p1, %r1, %r3; -; CHECK-F32X2-O3-NEXT: setp.ltu.f32 %p2, %r2, %r4; -; CHECK-F32X2-O3-NEXT: selp.b16 %rs1, -1, 0, %p2; -; CHECK-F32X2-O3-NEXT: st.param.b8 [func_retval0+1], %rs1; -; CHECK-F32X2-O3-NEXT: selp.b16 %rs2, -1, 0, %p1; -; CHECK-F32X2-O3-NEXT: st.param.b8 [func_retval0], %rs2; -; CHECK-F32X2-O3-NEXT: ret; %r = fcmp ult <2 x float> %a, %b ret <2 x i1> %r } @@ -2126,23 +1491,6 @@ define <2 x i1> @test_fcmp_ule(<2 x float> %a, <2 x float> %b) #0 { ; CHECK-F32X2-NEXT: selp.b16 %rs2, -1, 0, %p1; ; CHECK-F32X2-NEXT: st.param.b8 [func_retval0+1], %rs2; ; CHECK-F32X2-NEXT: ret; -; -; CHECK-F32X2-O3-LABEL: test_fcmp_ule( -; CHECK-F32X2-O3: { -; CHECK-F32X2-O3-NEXT: .reg .pred %p<3>; -; CHECK-F32X2-O3-NEXT: .reg .b16 %rs<3>; -; CHECK-F32X2-O3-NEXT: .reg .b32 %r<5>; -; CHECK-F32X2-O3-EMPTY: -; CHECK-F32X2-O3-NEXT: // %bb.0: -; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_ule_param_0]; -; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_ule_param_1]; -; CHECK-F32X2-O3-NEXT: setp.leu.f32 %p1, %r1, %r3; -; CHECK-F32X2-O3-NEXT: setp.leu.f32 %p2, %r2, %r4; -; CHECK-F32X2-O3-NEXT: selp.b16 %rs1, -1, 0, %p2; -; CHECK-F32X2-O3-NEXT: st.param.b8 [func_retval0+1], %rs1; -; CHECK-F32X2-O3-NEXT: selp.b16 %rs2, -1, 0, %p1; -; CHECK-F32X2-O3-NEXT: st.param.b8 [func_retval0], %rs2; -; CHECK-F32X2-O3-NEXT: ret; %r = fcmp ule <2 x float> %a, %b ret <2 x i1> %r } @@ -2184,23 +1532,6 @@ define <2 x i1> @test_fcmp_uno(<2 x float> %a, <2 x float> %b) #0 { ; CHECK-F32X2-NEXT: selp.b16 %rs2, -1, 0, %p1; ; CHECK-F32X2-NEXT: st.param.b8 [func_retval0+1], %rs2; ; CHECK-F32X2-NEXT: ret; -; -; CHECK-F32X2-O3-LABEL: test_fcmp_uno( -; CHECK-F32X2-O3: { -; CHECK-F32X2-O3-NEXT: .reg .pred %p<3>; -; CHECK-F32X2-O3-NEXT: .reg .b16 %rs<3>; -; CHECK-F32X2-O3-NEXT: .reg .b32 %r<5>; -; CHECK-F32X2-O3-EMPTY: -; CHECK-F32X2-O3-NEXT: // %bb.0: -; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_uno_param_0]; -; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_uno_param_1]; -; CHECK-F32X2-O3-NEXT: setp.nan.f32 %p1, %r1, %r3; -; CHECK-F32X2-O3-NEXT: setp.nan.f32 %p2, %r2, %r4; -; CHECK-F32X2-O3-NEXT: selp.b16 %rs1, -1, 0, %p2; -; CHECK-F32X2-O3-NEXT: st.param.b8 [func_retval0+1], %rs1; -; CHECK-F32X2-O3-NEXT: selp.b16 %rs2, -1, 0, %p1; -; CHECK-F32X2-O3-NEXT: st.param.b8 [func_retval0], %rs2; -; CHECK-F32X2-O3-NEXT: ret; %r = fcmp uno <2 x float> %a, %b ret <2 x i1> %r } @@ -2242,23 +1573,6 @@ define <2 x i1> @test_fcmp_one(<2 x float> %a, <2 x float> %b) #0 { ; CHECK-F32X2-NEXT: selp.b16 %rs2, -1, 0, %p1; ; CHECK-F32X2-NEXT: st.param.b8 [func_retval0+1], %rs2; ; CHECK-F32X2-NEXT: ret; -; -; CHECK-F32X2-O3-LABEL: test_fcmp_one( -; CHECK-F32X2-O3: { -; CHECK-F32X2-O3-NEXT: .reg .pred %p<3>; -; CHECK-F32X2-O3-NEXT: .reg .b16 %rs<3>; -; CHECK-F32X2-O3-NEXT: .reg .b32 %r<5>; -; CHECK-F32X2-O3-EMPTY: -; CHECK-F32X2-O3-NEXT: // %bb.0: -; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_one_param_0]; -; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_one_param_1]; -; CHECK-F32X2-O3-NEXT: setp.ne.f32 %p1, %r1, %r3; -; CHECK-F32X2-O3-NEXT: setp.ne.f32 %p2, %r2, %r4; -; CHECK-F32X2-O3-NEXT: selp.b16 %rs1, -1, 0, %p2; -; CHECK-F32X2-O3-NEXT: st.param.b8 [func_retval0+1], %rs1; -; CHECK-F32X2-O3-NEXT: selp.b16 %rs2, -1, 0, %p1; -; CHECK-F32X2-O3-NEXT: st.param.b8 [func_retval0], %rs2; -; CHECK-F32X2-O3-NEXT: ret; %r = fcmp one <2 x float> %a, %b ret <2 x i1> %r } @@ -2300,23 +1614,6 @@ define <2 x i1> @test_fcmp_oeq(<2 x float> %a, <2 x float> %b) #0 { ; CHECK-F32X2-NEXT: selp.b16 %rs2, -1, 0, %p1; ; CHECK-F32X2-NEXT: st.param.b8 [func_retval0+1], %rs2; ; CHECK-F32X2-NEXT: ret; -; -; CHECK-F32X2-O3-LABEL: test_fcmp_oeq( -; CHECK-F32X2-O3: { -; CHECK-F32X2-O3-NEXT: .reg .pred %p<3>; -; CHECK-F32X2-O3-NEXT: .reg .b16 %rs<3>; -; CHECK-F32X2-O3-NEXT: .reg .b32 %r<5>; -; CHECK-F32X2-O3-EMPTY: -; CHECK-F32X2-O3-NEXT: // %bb.0: -; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_oeq_param_0]; -; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_oeq_param_1]; -; CHECK-F32X2-O3-NEXT: setp.eq.f32 %p1, %r1, %r3; -; CHECK-F32X2-O3-NEXT: setp.eq.f32 %p2, %r2, %r4; -; CHECK-F32X2-O3-NEXT: selp.b16 %rs1, -1, 0, %p2; -; CHECK-F32X2-O3-NEXT: st.param.b8 [func_retval0+1], %rs1; -; CHECK-F32X2-O3-NEXT: selp.b16 %rs2, -1, 0, %p1; -; CHECK-F32X2-O3-NEXT: st.param.b8 [func_retval0], %rs2; -; CHECK-F32X2-O3-NEXT: ret; %r = fcmp oeq <2 x float> %a, %b ret <2 x i1> %r } @@ -2358,23 +1655,6 @@ define <2 x i1> @test_fcmp_ogt(<2 x float> %a, <2 x float> %b) #0 { ; CHECK-F32X2-NEXT: selp.b16 %rs2, -1, 0, %p1; ; CHECK-F32X2-NEXT: st.param.b8 [func_retval0+1], %rs2; ; CHECK-F32X2-NEXT: ret; -; -; CHECK-F32X2-O3-LABEL: test_fcmp_ogt( -; CHECK-F32X2-O3: { -; CHECK-F32X2-O3-NEXT: .reg .pred %p<3>; -; CHECK-F32X2-O3-NEXT: .reg .b16 %rs<3>; -; CHECK-F32X2-O3-NEXT: .reg .b32 %r<5>; -; CHECK-F32X2-O3-EMPTY: -; CHECK-F32X2-O3-NEXT: // %bb.0: -; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_ogt_param_0]; -; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_ogt_param_1]; -; CHECK-F32X2-O3-NEXT: setp.gt.f32 %p1, %r1, %r3; -; CHECK-F32X2-O3-NEXT: setp.gt.f32 %p2, %r2, %r4; -; CHECK-F32X2-O3-NEXT: selp.b16 %rs1, -1, 0, %p2; -; CHECK-F32X2-O3-NEXT: st.param.b8 [func_retval0+1], %rs1; -; CHECK-F32X2-O3-NEXT: selp.b16 %rs2, -1, 0, %p1; -; CHECK-F32X2-O3-NEXT: st.param.b8 [func_retval0], %rs2; -; CHECK-F32X2-O3-NEXT: ret; %r = fcmp ogt <2 x float> %a, %b ret <2 x i1> %r } @@ -2416,23 +1696,6 @@ define <2 x i1> @test_fcmp_oge(<2 x float> %a, <2 x float> %b) #0 { ; CHECK-F32X2-NEXT: selp.b16 %rs2, -1, 0, %p1; ; CHECK-F32X2-NEXT: st.param.b8 [func_retval0+1], %rs2; ; CHECK-F32X2-NEXT: ret; -; -; CHECK-F32X2-O3-LABEL: test_fcmp_oge( -; CHECK-F32X2-O3: { -; CHECK-F32X2-O3-NEXT: .reg .pred %p<3>; -; CHECK-F32X2-O3-NEXT: .reg .b16 %rs<3>; -; CHECK-F32X2-O3-NEXT: .reg .b32 %r<5>; -; CHECK-F32X2-O3-EMPTY: -; CHECK-F32X2-O3-NEXT: // %bb.0: -; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_oge_param_0]; -; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_oge_param_1]; -; CHECK-F32X2-O3-NEXT: setp.ge.f32 %p1, %r1, %r3; -; CHECK-F32X2-O3-NEXT: setp.ge.f32 %p2, %r2, %r4; -; CHECK-F32X2-O3-NEXT: selp.b16 %rs1, -1, 0, %p2; -; CHECK-F32X2-O3-NEXT: st.param.b8 [func_retval0+1], %rs1; -; CHECK-F32X2-O3-NEXT: selp.b16 %rs2, -1, 0, %p1; -; CHECK-F32X2-O3-NEXT: st.param.b8 [func_retval0], %rs2; -; CHECK-F32X2-O3-NEXT: ret; %r = fcmp oge <2 x float> %a, %b ret <2 x i1> %r } @@ -2474,23 +1737,6 @@ define <2 x i1> @test_fcmp_olt(<2 x float> %a, <2 x float> %b) #0 { ; CHECK-F32X2-NEXT: selp.b16 %rs2, -1, 0, %p1; ; CHECK-F32X2-NEXT: st.param.b8 [func_retval0+1], %rs2; ; CHECK-F32X2-NEXT: ret; -; -; CHECK-F32X2-O3-LABEL: test_fcmp_olt( -; CHECK-F32X2-O3: { -; CHECK-F32X2-O3-NEXT: .reg .pred %p<3>; -; CHECK-F32X2-O3-NEXT: .reg .b16 %rs<3>; -; CHECK-F32X2-O3-NEXT: .reg .b32 %r<5>; -; CHECK-F32X2-O3-EMPTY: -; CHECK-F32X2-O3-NEXT: // %bb.0: -; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_olt_param_0]; -; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_olt_param_1]; -; CHECK-F32X2-O3-NEXT: setp.lt.f32 %p1, %r1, %r3; -; CHECK-F32X2-O3-NEXT: setp.lt.f32 %p2, %r2, %r4; -; CHECK-F32X2-O3-NEXT: selp.b16 %rs1, -1, 0, %p2; -; CHECK-F32X2-O3-NEXT: st.param.b8 [func_retval0+1], %rs1; -; CHECK-F32X2-O3-NEXT: selp.b16 %rs2, -1, 0, %p1; -; CHECK-F32X2-O3-NEXT: st.param.b8 [func_retval0], %rs2; -; CHECK-F32X2-O3-NEXT: ret; %r = fcmp olt <2 x float> %a, %b ret <2 x i1> %r } @@ -2532,23 +1778,6 @@ define <2 x i1> @test_fcmp_ole(<2 x float> %a, <2 x float> %b) #0 { ; CHECK-F32X2-NEXT: selp.b16 %rs2, -1, 0, %p1; ; CHECK-F32X2-NEXT: st.param.b8 [func_retval0+1], %rs2; ; CHECK-F32X2-NEXT: ret; -; -; CHECK-F32X2-O3-LABEL: test_fcmp_ole( -; CHECK-F32X2-O3: { -; CHECK-F32X2-O3-NEXT: .reg .pred %p<3>; -; CHECK-F32X2-O3-NEXT: .reg .b16 %rs<3>; -; CHECK-F32X2-O3-NEXT: .reg .b32 %r<5>; -; CHECK-F32X2-O3-EMPTY: -; CHECK-F32X2-O3-NEXT: // %bb.0: -; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_ole_param_0]; -; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_ole_param_1]; -; CHECK-F32X2-O3-NEXT: setp.le.f32 %p1, %r1, %r3; -; CHECK-F32X2-O3-NEXT: setp.le.f32 %p2, %r2, %r4; -; CHECK-F32X2-O3-NEXT: selp.b16 %rs1, -1, 0, %p2; -; CHECK-F32X2-O3-NEXT: st.param.b8 [func_retval0+1], %rs1; -; CHECK-F32X2-O3-NEXT: selp.b16 %rs2, -1, 0, %p1; -; CHECK-F32X2-O3-NEXT: st.param.b8 [func_retval0], %rs2; -; CHECK-F32X2-O3-NEXT: ret; %r = fcmp ole <2 x float> %a, %b ret <2 x i1> %r } @@ -2590,23 +1819,6 @@ define <2 x i1> @test_fcmp_ord(<2 x float> %a, <2 x float> %b) #0 { ; CHECK-F32X2-NEXT: selp.b16 %rs2, -1, 0, %p1; ; CHECK-F32X2-NEXT: st.param.b8 [func_retval0+1], %rs2; ; CHECK-F32X2-NEXT: ret; -; -; CHECK-F32X2-O3-LABEL: test_fcmp_ord( -; CHECK-F32X2-O3: { -; CHECK-F32X2-O3-NEXT: .reg .pred %p<3>; -; CHECK-F32X2-O3-NEXT: .reg .b16 %rs<3>; -; CHECK-F32X2-O3-NEXT: .reg .b32 %r<5>; -; CHECK-F32X2-O3-EMPTY: -; CHECK-F32X2-O3-NEXT: // %bb.0: -; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_ord_param_0]; -; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_ord_param_1]; -; CHECK-F32X2-O3-NEXT: setp.num.f32 %p1, %r1, %r3; -; CHECK-F32X2-O3-NEXT: setp.num.f32 %p2, %r2, %r4; -; CHECK-F32X2-O3-NEXT: selp.b16 %rs1, -1, 0, %p2; -; CHECK-F32X2-O3-NEXT: st.param.b8 [func_retval0+1], %rs1; -; CHECK-F32X2-O3-NEXT: selp.b16 %rs2, -1, 0, %p1; -; CHECK-F32X2-O3-NEXT: st.param.b8 [func_retval0], %rs2; -; CHECK-F32X2-O3-NEXT: ret; %r = fcmp ord <2 x float> %a, %b ret <2 x i1> %r } @@ -2635,17 +1847,6 @@ define <2 x i32> @test_fptosi_i32(<2 x float> %a) #0 { ; CHECK-F32X2-NEXT: cvt.rzi.s32.f32 %r4, %r1; ; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; ; CHECK-F32X2-NEXT: ret; -; -; CHECK-F32X2-O3-LABEL: test_fptosi_i32( -; CHECK-F32X2-O3: { -; CHECK-F32X2-O3-NEXT: .reg .b32 %r<5>; -; CHECK-F32X2-O3-EMPTY: -; CHECK-F32X2-O3-NEXT: // %bb.0: -; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fptosi_i32_param_0]; -; CHECK-F32X2-O3-NEXT: cvt.rzi.s32.f32 %r3, %r2; -; CHECK-F32X2-O3-NEXT: cvt.rzi.s32.f32 %r4, %r1; -; CHECK-F32X2-O3-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; -; CHECK-F32X2-O3-NEXT: ret; %r = fptosi <2 x float> %a to <2 x i32> ret <2 x i32> %r } @@ -2675,18 +1876,6 @@ define <2 x i64> @test_fptosi_i64(<2 x float> %a) #0 { ; CHECK-F32X2-NEXT: cvt.rzi.s64.f32 %rd3, %r1; ; CHECK-F32X2-NEXT: st.param.v2.b64 [func_retval0], {%rd3, %rd2}; ; CHECK-F32X2-NEXT: ret; -; -; CHECK-F32X2-O3-LABEL: test_fptosi_i64( -; CHECK-F32X2-O3: { -; CHECK-F32X2-O3-NEXT: .reg .b32 %r<3>; -; CHECK-F32X2-O3-NEXT: .reg .b64 %rd<3>; -; CHECK-F32X2-O3-EMPTY: -; CHECK-F32X2-O3-NEXT: // %bb.0: -; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fptosi_i64_param_0]; -; CHECK-F32X2-O3-NEXT: cvt.rzi.s64.f32 %rd1, %r2; -; CHECK-F32X2-O3-NEXT: cvt.rzi.s64.f32 %rd2, %r1; -; CHECK-F32X2-O3-NEXT: st.param.v2.b64 [func_retval0], {%rd2, %rd1}; -; CHECK-F32X2-O3-NEXT: ret; %r = fptosi <2 x float> %a to <2 x i64> ret <2 x i64> %r } @@ -2715,17 +1904,6 @@ define <2 x i32> @test_fptoui_2xi32(<2 x float> %a) #0 { ; CHECK-F32X2-NEXT: cvt.rzi.u32.f32 %r4, %r1; ; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; ; CHECK-F32X2-NEXT: ret; -; -; CHECK-F32X2-O3-LABEL: test_fptoui_2xi32( -; CHECK-F32X2-O3: { -; CHECK-F32X2-O3-NEXT: .reg .b32 %r<5>; -; CHECK-F32X2-O3-EMPTY: -; CHECK-F32X2-O3-NEXT: // %bb.0: -; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fptoui_2xi32_param_0]; -; CHECK-F32X2-O3-NEXT: cvt.rzi.u32.f32 %r3, %r2; -; CHECK-F32X2-O3-NEXT: cvt.rzi.u32.f32 %r4, %r1; -; CHECK-F32X2-O3-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; -; CHECK-F32X2-O3-NEXT: ret; %r = fptoui <2 x float> %a to <2 x i32> ret <2 x i32> %r } @@ -2755,18 +1933,6 @@ define <2 x i64> @test_fptoui_2xi64(<2 x float> %a) #0 { ; CHECK-F32X2-NEXT: cvt.rzi.u64.f32 %rd3, %r1; ; CHECK-F32X2-NEXT: st.param.v2.b64 [func_retval0], {%rd3, %rd2}; ; CHECK-F32X2-NEXT: ret; -; -; CHECK-F32X2-O3-LABEL: test_fptoui_2xi64( -; CHECK-F32X2-O3: { -; CHECK-F32X2-O3-NEXT: .reg .b32 %r<3>; -; CHECK-F32X2-O3-NEXT: .reg .b64 %rd<3>; -; CHECK-F32X2-O3-EMPTY: -; CHECK-F32X2-O3-NEXT: // %bb.0: -; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fptoui_2xi64_param_0]; -; CHECK-F32X2-O3-NEXT: cvt.rzi.u64.f32 %rd1, %r2; -; CHECK-F32X2-O3-NEXT: cvt.rzi.u64.f32 %rd2, %r1; -; CHECK-F32X2-O3-NEXT: st.param.v2.b64 [func_retval0], {%rd2, %rd1}; -; CHECK-F32X2-O3-NEXT: ret; %r = fptoui <2 x float> %a to <2 x i64> ret <2 x i64> %r } @@ -2795,17 +1961,6 @@ define <2 x float> @test_uitofp_2xi32(<2 x i32> %a) #0 { ; CHECK-F32X2-NEXT: cvt.rn.f32.u32 %r4, %r1; ; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; ; CHECK-F32X2-NEXT: ret; -; -; CHECK-F32X2-O3-LABEL: test_uitofp_2xi32( -; CHECK-F32X2-O3: { -; CHECK-F32X2-O3-NEXT: .reg .b32 %r<5>; -; CHECK-F32X2-O3-EMPTY: -; CHECK-F32X2-O3-NEXT: // %bb.0: -; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_uitofp_2xi32_param_0]; -; CHECK-F32X2-O3-NEXT: cvt.rn.f32.u32 %r3, %r2; -; CHECK-F32X2-O3-NEXT: cvt.rn.f32.u32 %r4, %r1; -; CHECK-F32X2-O3-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; -; CHECK-F32X2-O3-NEXT: ret; %r = uitofp <2 x i32> %a to <2 x float> ret <2 x float> %r } @@ -2850,17 +2005,6 @@ define <2 x float> @test_sitofp_2xi32(<2 x i32> %a) #0 { ; CHECK-F32X2-NEXT: cvt.rn.f32.s32 %r4, %r1; ; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; ; CHECK-F32X2-NEXT: ret; -; -; CHECK-F32X2-O3-LABEL: test_sitofp_2xi32( -; CHECK-F32X2-O3: { -; CHECK-F32X2-O3-NEXT: .reg .b32 %r<5>; -; CHECK-F32X2-O3-EMPTY: -; CHECK-F32X2-O3-NEXT: // %bb.0: -; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_sitofp_2xi32_param_0]; -; CHECK-F32X2-O3-NEXT: cvt.rn.f32.s32 %r3, %r2; -; CHECK-F32X2-O3-NEXT: cvt.rn.f32.s32 %r4, %r1; -; CHECK-F32X2-O3-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; -; CHECK-F32X2-O3-NEXT: ret; %r = sitofp <2 x i32> %a to <2 x float> ret <2 x float> %r } @@ -2911,21 +2055,6 @@ define <2 x float> @test_uitofp_2xi32_fadd(<2 x i32> %a, <2 x float> %b) #0 { ; CHECK-F32X2-NEXT: add.rn.f32x2 %rd4, %rd2, %rd3; ; CHECK-F32X2-NEXT: st.param.b64 [func_retval0], %rd4; ; CHECK-F32X2-NEXT: ret; -; -; CHECK-F32X2-O3-LABEL: test_uitofp_2xi32_fadd( -; CHECK-F32X2-O3: { -; CHECK-F32X2-O3-NEXT: .reg .b32 %r<5>; -; CHECK-F32X2-O3-NEXT: .reg .b64 %rd<4>; -; CHECK-F32X2-O3-EMPTY: -; CHECK-F32X2-O3-NEXT: // %bb.0: -; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_uitofp_2xi32_fadd_param_0]; -; CHECK-F32X2-O3-NEXT: cvt.rn.f32.u32 %r3, %r2; -; CHECK-F32X2-O3-NEXT: cvt.rn.f32.u32 %r4, %r1; -; CHECK-F32X2-O3-NEXT: mov.b64 %rd1, {%r4, %r3}; -; CHECK-F32X2-O3-NEXT: ld.param.b64 %rd2, [test_uitofp_2xi32_fadd_param_1]; -; CHECK-F32X2-O3-NEXT: add.rn.f32x2 %rd3, %rd2, %rd1; -; CHECK-F32X2-O3-NEXT: st.param.b64 [func_retval0], %rd3; -; CHECK-F32X2-O3-NEXT: ret; %c = uitofp <2 x i32> %a to <2 x float> %r = fadd <2 x float> %b, %c ret <2 x float> %r @@ -2956,19 +2085,6 @@ define <2 x float> @test_fptrunc_2xdouble(<2 x double> %a) #0 { ; CHECK-F32X2-NEXT: mov.b64 %rd3, {%r2, %r1}; ; CHECK-F32X2-NEXT: st.param.b64 [func_retval0], %rd3; ; CHECK-F32X2-NEXT: ret; -; -; CHECK-F32X2-O3-LABEL: test_fptrunc_2xdouble( -; CHECK-F32X2-O3: { -; CHECK-F32X2-O3-NEXT: .reg .b32 %r<3>; -; CHECK-F32X2-O3-NEXT: .reg .b64 %rd<4>; -; CHECK-F32X2-O3-EMPTY: -; CHECK-F32X2-O3-NEXT: // %bb.0: -; CHECK-F32X2-O3-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_fptrunc_2xdouble_param_0]; -; CHECK-F32X2-O3-NEXT: cvt.rn.f32.f64 %r1, %rd2; -; CHECK-F32X2-O3-NEXT: cvt.rn.f32.f64 %r2, %rd1; -; CHECK-F32X2-O3-NEXT: mov.b64 %rd3, {%r2, %r1}; -; CHECK-F32X2-O3-NEXT: st.param.b64 [func_retval0], %rd3; -; CHECK-F32X2-O3-NEXT: ret; %r = fptrunc <2 x double> %a to <2 x float> ret <2 x float> %r } @@ -2998,18 +2114,6 @@ define <2 x double> @test_fpext_2xdouble(<2 x float> %a) #0 { ; CHECK-F32X2-NEXT: cvt.f64.f32 %rd3, %r1; ; CHECK-F32X2-NEXT: st.param.v2.b64 [func_retval0], {%rd3, %rd2}; ; CHECK-F32X2-NEXT: ret; -; -; CHECK-F32X2-O3-LABEL: test_fpext_2xdouble( -; CHECK-F32X2-O3: { -; CHECK-F32X2-O3-NEXT: .reg .b32 %r<3>; -; CHECK-F32X2-O3-NEXT: .reg .b64 %rd<3>; -; CHECK-F32X2-O3-EMPTY: -; CHECK-F32X2-O3-NEXT: // %bb.0: -; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fpext_2xdouble_param_0]; -; CHECK-F32X2-O3-NEXT: cvt.f64.f32 %rd1, %r2; -; CHECK-F32X2-O3-NEXT: cvt.f64.f32 %rd2, %r1; -; CHECK-F32X2-O3-NEXT: st.param.v2.b64 [func_retval0], {%rd2, %rd1}; -; CHECK-F32X2-O3-NEXT: ret; %r = fpext <2 x float> %a to <2 x double> ret <2 x double> %r } @@ -3032,15 +2136,6 @@ define <2 x i32> @test_bitcast_2xfloat_to_2xi32(<2 x float> %a) #0 { ; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_bitcast_2xfloat_to_2xi32_param_0]; ; CHECK-F32X2-NEXT: st.param.b64 [func_retval0], %rd1; ; CHECK-F32X2-NEXT: ret; -; -; CHECK-F32X2-O3-LABEL: test_bitcast_2xfloat_to_2xi32( -; CHECK-F32X2-O3: { -; CHECK-F32X2-O3-NEXT: .reg .b64 %rd<2>; -; CHECK-F32X2-O3-EMPTY: -; CHECK-F32X2-O3-NEXT: // %bb.0: -; CHECK-F32X2-O3-NEXT: ld.param.b64 %rd1, [test_bitcast_2xfloat_to_2xi32_param_0]; -; CHECK-F32X2-O3-NEXT: st.param.b64 [func_retval0], %rd1; -; CHECK-F32X2-O3-NEXT: ret; %r = bitcast <2 x float> %a to <2 x i32> ret <2 x i32> %r } @@ -3063,15 +2158,6 @@ define <2 x float> @test_bitcast_2xi32_to_2xfloat(<2 x i32> %a) #0 { ; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_bitcast_2xi32_to_2xfloat_param_0]; ; CHECK-F32X2-NEXT: st.param.b64 [func_retval0], %rd1; ; CHECK-F32X2-NEXT: ret; -; -; CHECK-F32X2-O3-LABEL: test_bitcast_2xi32_to_2xfloat( -; CHECK-F32X2-O3: { -; CHECK-F32X2-O3-NEXT: .reg .b64 %rd<2>; -; CHECK-F32X2-O3-EMPTY: -; CHECK-F32X2-O3-NEXT: // %bb.0: -; CHECK-F32X2-O3-NEXT: ld.param.b64 %rd1, [test_bitcast_2xi32_to_2xfloat_param_0]; -; CHECK-F32X2-O3-NEXT: st.param.b64 [func_retval0], %rd1; -; CHECK-F32X2-O3-NEXT: ret; %r = bitcast <2 x i32> %a to <2 x float> ret <2 x float> %r } @@ -3107,15 +2193,6 @@ define double @test_bitcast_2xfloat_to_double(<2 x float> %a) #0 { ; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_bitcast_2xfloat_to_double_param_0]; ; CHECK-F32X2-NEXT: st.param.b64 [func_retval0], %rd1; ; CHECK-F32X2-NEXT: ret; -; -; CHECK-F32X2-O3-LABEL: test_bitcast_2xfloat_to_double( -; CHECK-F32X2-O3: { -; CHECK-F32X2-O3-NEXT: .reg .b64 %rd<2>; -; CHECK-F32X2-O3-EMPTY: -; CHECK-F32X2-O3-NEXT: // %bb.0: -; CHECK-F32X2-O3-NEXT: ld.param.b64 %rd1, [test_bitcast_2xfloat_to_double_param_0]; -; CHECK-F32X2-O3-NEXT: st.param.b64 [func_retval0], %rd1; -; CHECK-F32X2-O3-NEXT: ret; %r = bitcast <2 x float> %a to double ret double %r } @@ -3144,17 +2221,6 @@ define <2 x float> @test_sqrt(<2 x float> %a) #0 { ; CHECK-F32X2-NEXT: sqrt.rn.f32 %r4, %r1; ; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; ; CHECK-F32X2-NEXT: ret; -; -; CHECK-F32X2-O3-LABEL: test_sqrt( -; CHECK-F32X2-O3: { -; CHECK-F32X2-O3-NEXT: .reg .b32 %r<5>; -; CHECK-F32X2-O3-EMPTY: -; CHECK-F32X2-O3-NEXT: // %bb.0: -; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_sqrt_param_0]; -; CHECK-F32X2-O3-NEXT: sqrt.rn.f32 %r3, %r2; -; CHECK-F32X2-O3-NEXT: sqrt.rn.f32 %r4, %r1; -; CHECK-F32X2-O3-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; -; CHECK-F32X2-O3-NEXT: ret; %r = call <2 x float> @llvm.sqrt(<2 x float> %a) ret <2 x float> %r } @@ -3190,17 +2256,6 @@ define <2 x float> @test_sin(<2 x float> %a) #0 { ; CHECK-F32X2-NEXT: sin.approx.f32 %r4, %r1; ; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; ; CHECK-F32X2-NEXT: ret; -; -; CHECK-F32X2-O3-LABEL: test_sin( -; CHECK-F32X2-O3: { -; CHECK-F32X2-O3-NEXT: .reg .b32 %r<5>; -; CHECK-F32X2-O3-EMPTY: -; CHECK-F32X2-O3-NEXT: // %bb.0: -; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_sin_param_0]; -; CHECK-F32X2-O3-NEXT: sin.approx.f32 %r3, %r2; -; CHECK-F32X2-O3-NEXT: sin.approx.f32 %r4, %r1; -; CHECK-F32X2-O3-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; -; CHECK-F32X2-O3-NEXT: ret; %r = call afn <2 x float> @llvm.sin(<2 x float> %a) ret <2 x float> %r } @@ -3229,17 +2284,6 @@ define <2 x float> @test_cos(<2 x float> %a) #0 { ; CHECK-F32X2-NEXT: cos.approx.f32 %r4, %r1; ; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; ; CHECK-F32X2-NEXT: ret; -; -; CHECK-F32X2-O3-LABEL: test_cos( -; CHECK-F32X2-O3: { -; CHECK-F32X2-O3-NEXT: .reg .b32 %r<5>; -; CHECK-F32X2-O3-EMPTY: -; CHECK-F32X2-O3-NEXT: // %bb.0: -; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_cos_param_0]; -; CHECK-F32X2-O3-NEXT: cos.approx.f32 %r3, %r2; -; CHECK-F32X2-O3-NEXT: cos.approx.f32 %r4, %r1; -; CHECK-F32X2-O3-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; -; CHECK-F32X2-O3-NEXT: ret; %r = call afn <2 x float> @llvm.cos(<2 x float> %a) ret <2 x float> %r } @@ -3312,18 +2356,6 @@ define <2 x float> @test_fma(<2 x float> %a, <2 x float> %b, <2 x float> %c) #0 ; CHECK-F32X2-NEXT: fma.rn.f32x2 %rd4, %rd1, %rd2, %rd3; ; CHECK-F32X2-NEXT: st.param.b64 [func_retval0], %rd4; ; CHECK-F32X2-NEXT: ret; -; -; CHECK-F32X2-O3-LABEL: test_fma( -; CHECK-F32X2-O3: { -; CHECK-F32X2-O3-NEXT: .reg .b64 %rd<5>; -; CHECK-F32X2-O3-EMPTY: -; CHECK-F32X2-O3-NEXT: // %bb.0: -; CHECK-F32X2-O3-NEXT: ld.param.b64 %rd1, [test_fma_param_0]; -; CHECK-F32X2-O3-NEXT: ld.param.b64 %rd2, [test_fma_param_1]; -; CHECK-F32X2-O3-NEXT: ld.param.b64 %rd3, [test_fma_param_2]; -; CHECK-F32X2-O3-NEXT: fma.rn.f32x2 %rd4, %rd1, %rd2, %rd3; -; CHECK-F32X2-O3-NEXT: st.param.b64 [func_retval0], %rd4; -; CHECK-F32X2-O3-NEXT: ret; %r = call <2 x float> @llvm.fma(<2 x float> %a, <2 x float> %b, <2 x float> %c) ret <2 x float> %r } @@ -3352,17 +2384,6 @@ define <2 x float> @test_fabs(<2 x float> %a) #0 { ; CHECK-F32X2-NEXT: abs.f32 %r4, %r1; ; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; ; CHECK-F32X2-NEXT: ret; -; -; CHECK-F32X2-O3-LABEL: test_fabs( -; CHECK-F32X2-O3: { -; CHECK-F32X2-O3-NEXT: .reg .b32 %r<5>; -; CHECK-F32X2-O3-EMPTY: -; CHECK-F32X2-O3-NEXT: // %bb.0: -; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fabs_param_0]; -; CHECK-F32X2-O3-NEXT: abs.f32 %r3, %r2; -; CHECK-F32X2-O3-NEXT: abs.f32 %r4, %r1; -; CHECK-F32X2-O3-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; -; CHECK-F32X2-O3-NEXT: ret; %r = call <2 x float> @llvm.fabs(<2 x float> %a) ret <2 x float> %r } @@ -3394,18 +2415,6 @@ define <2 x float> @test_minnum(<2 x float> %a, <2 x float> %b) #0 { ; CHECK-F32X2-NEXT: min.f32 %r6, %r3, %r1; ; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5}; ; CHECK-F32X2-NEXT: ret; -; -; CHECK-F32X2-O3-LABEL: test_minnum( -; CHECK-F32X2-O3: { -; CHECK-F32X2-O3-NEXT: .reg .b32 %r<7>; -; CHECK-F32X2-O3-EMPTY: -; CHECK-F32X2-O3-NEXT: // %bb.0: -; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_minnum_param_0]; -; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_minnum_param_1]; -; CHECK-F32X2-O3-NEXT: min.f32 %r5, %r2, %r4; -; CHECK-F32X2-O3-NEXT: min.f32 %r6, %r1, %r3; -; CHECK-F32X2-O3-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5}; -; CHECK-F32X2-O3-NEXT: ret; %r = call <2 x float> @llvm.minnum(<2 x float> %a, <2 x float> %b) ret <2 x float> %r } @@ -3437,18 +2446,6 @@ define <2 x float> @test_maxnum(<2 x float> %a, <2 x float> %b) #0 { ; CHECK-F32X2-NEXT: max.f32 %r6, %r3, %r1; ; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5}; ; CHECK-F32X2-NEXT: ret; -; -; CHECK-F32X2-O3-LABEL: test_maxnum( -; CHECK-F32X2-O3: { -; CHECK-F32X2-O3-NEXT: .reg .b32 %r<7>; -; CHECK-F32X2-O3-EMPTY: -; CHECK-F32X2-O3-NEXT: // %bb.0: -; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_maxnum_param_0]; -; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_maxnum_param_1]; -; CHECK-F32X2-O3-NEXT: max.f32 %r5, %r2, %r4; -; CHECK-F32X2-O3-NEXT: max.f32 %r6, %r1, %r3; -; CHECK-F32X2-O3-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5}; -; CHECK-F32X2-O3-NEXT: ret; %r = call <2 x float> @llvm.maxnum(<2 x float> %a, <2 x float> %b) ret <2 x float> %r } @@ -3480,18 +2477,6 @@ define <2 x float> @test_copysign(<2 x float> %a, <2 x float> %b) #0 { ; CHECK-F32X2-NEXT: copysign.f32 %r6, %r3, %r1; ; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5}; ; CHECK-F32X2-NEXT: ret; -; -; CHECK-F32X2-O3-LABEL: test_copysign( -; CHECK-F32X2-O3: { -; CHECK-F32X2-O3-NEXT: .reg .b32 %r<7>; -; CHECK-F32X2-O3-EMPTY: -; CHECK-F32X2-O3-NEXT: // %bb.0: -; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_copysign_param_0]; -; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_copysign_param_1]; -; CHECK-F32X2-O3-NEXT: copysign.f32 %r5, %r4, %r2; -; CHECK-F32X2-O3-NEXT: copysign.f32 %r6, %r3, %r1; -; CHECK-F32X2-O3-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5}; -; CHECK-F32X2-O3-NEXT: ret; %r = call <2 x float> @llvm.copysign(<2 x float> %a, <2 x float> %b) ret <2 x float> %r } @@ -3545,30 +2530,6 @@ define <2 x float> @test_copysign_f64(<2 x float> %a, <2 x double> %b) #0 { ; CHECK-F32X2-NEXT: selp.f32 %r8, %r7, %r6, %p2; ; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r8, %r5}; ; CHECK-F32X2-NEXT: ret; -; -; CHECK-F32X2-O3-LABEL: test_copysign_f64( -; CHECK-F32X2-O3: { -; CHECK-F32X2-O3-NEXT: .reg .pred %p<3>; -; CHECK-F32X2-O3-NEXT: .reg .b32 %r<9>; -; CHECK-F32X2-O3-NEXT: .reg .b64 %rd<7>; -; CHECK-F32X2-O3-EMPTY: -; CHECK-F32X2-O3-NEXT: // %bb.0: -; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_copysign_f64_param_0]; -; CHECK-F32X2-O3-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_copysign_f64_param_1]; -; CHECK-F32X2-O3-NEXT: abs.f32 %r3, %r2; -; CHECK-F32X2-O3-NEXT: neg.f32 %r4, %r3; -; CHECK-F32X2-O3-NEXT: shr.u64 %rd3, %rd2, 63; -; CHECK-F32X2-O3-NEXT: and.b64 %rd4, %rd3, 1; -; CHECK-F32X2-O3-NEXT: setp.ne.b64 %p1, %rd4, 0; -; CHECK-F32X2-O3-NEXT: selp.f32 %r5, %r4, %r3, %p1; -; CHECK-F32X2-O3-NEXT: abs.f32 %r6, %r1; -; CHECK-F32X2-O3-NEXT: neg.f32 %r7, %r6; -; CHECK-F32X2-O3-NEXT: shr.u64 %rd5, %rd1, 63; -; CHECK-F32X2-O3-NEXT: and.b64 %rd6, %rd5, 1; -; CHECK-F32X2-O3-NEXT: setp.ne.b64 %p2, %rd6, 0; -; CHECK-F32X2-O3-NEXT: selp.f32 %r8, %r7, %r6, %p2; -; CHECK-F32X2-O3-NEXT: st.param.v2.b32 [func_retval0], {%r8, %r5}; -; CHECK-F32X2-O3-NEXT: ret; %tb = fptrunc <2 x double> %b to <2 x float> %r = call <2 x float> @llvm.copysign(<2 x float> %a, <2 x float> %tb) ret <2 x float> %r @@ -3606,21 +2567,6 @@ define <2 x double> @test_copysign_extended(<2 x float> %a, <2 x float> %b) #0 { ; CHECK-F32X2-NEXT: cvt.f64.f32 %rd4, %r5; ; CHECK-F32X2-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd3}; ; CHECK-F32X2-NEXT: ret; -; -; CHECK-F32X2-O3-LABEL: test_copysign_extended( -; CHECK-F32X2-O3: { -; CHECK-F32X2-O3-NEXT: .reg .b32 %r<7>; -; CHECK-F32X2-O3-NEXT: .reg .b64 %rd<3>; -; CHECK-F32X2-O3-EMPTY: -; CHECK-F32X2-O3-NEXT: // %bb.0: -; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_copysign_extended_param_0]; -; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_copysign_extended_param_1]; -; CHECK-F32X2-O3-NEXT: copysign.f32 %r5, %r3, %r1; -; CHECK-F32X2-O3-NEXT: copysign.f32 %r6, %r4, %r2; -; CHECK-F32X2-O3-NEXT: cvt.f64.f32 %rd1, %r6; -; CHECK-F32X2-O3-NEXT: cvt.f64.f32 %rd2, %r5; -; CHECK-F32X2-O3-NEXT: st.param.v2.b64 [func_retval0], {%rd2, %rd1}; -; CHECK-F32X2-O3-NEXT: ret; %r = call <2 x float> @llvm.copysign(<2 x float> %a, <2 x float> %b) %xr = fpext <2 x float> %r to <2 x double> ret <2 x double> %xr @@ -3650,17 +2596,6 @@ define <2 x float> @test_floor(<2 x float> %a) #0 { ; CHECK-F32X2-NEXT: cvt.rmi.f32.f32 %r4, %r1; ; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; ; CHECK-F32X2-NEXT: ret; -; -; CHECK-F32X2-O3-LABEL: test_floor( -; CHECK-F32X2-O3: { -; CHECK-F32X2-O3-NEXT: .reg .b32 %r<5>; -; CHECK-F32X2-O3-EMPTY: -; CHECK-F32X2-O3-NEXT: // %bb.0: -; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_floor_param_0]; -; CHECK-F32X2-O3-NEXT: cvt.rmi.f32.f32 %r3, %r2; -; CHECK-F32X2-O3-NEXT: cvt.rmi.f32.f32 %r4, %r1; -; CHECK-F32X2-O3-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; -; CHECK-F32X2-O3-NEXT: ret; %r = call <2 x float> @llvm.floor(<2 x float> %a) ret <2 x float> %r } @@ -3689,17 +2624,6 @@ define <2 x float> @test_ceil(<2 x float> %a) #0 { ; CHECK-F32X2-NEXT: cvt.rpi.f32.f32 %r4, %r1; ; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; ; CHECK-F32X2-NEXT: ret; -; -; CHECK-F32X2-O3-LABEL: test_ceil( -; CHECK-F32X2-O3: { -; CHECK-F32X2-O3-NEXT: .reg .b32 %r<5>; -; CHECK-F32X2-O3-EMPTY: -; CHECK-F32X2-O3-NEXT: // %bb.0: -; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_ceil_param_0]; -; CHECK-F32X2-O3-NEXT: cvt.rpi.f32.f32 %r3, %r2; -; CHECK-F32X2-O3-NEXT: cvt.rpi.f32.f32 %r4, %r1; -; CHECK-F32X2-O3-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; -; CHECK-F32X2-O3-NEXT: ret; %r = call <2 x float> @llvm.ceil(<2 x float> %a) ret <2 x float> %r } @@ -3728,17 +2652,6 @@ define <2 x float> @test_trunc(<2 x float> %a) #0 { ; CHECK-F32X2-NEXT: cvt.rzi.f32.f32 %r4, %r1; ; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; ; CHECK-F32X2-NEXT: ret; -; -; CHECK-F32X2-O3-LABEL: test_trunc( -; CHECK-F32X2-O3: { -; CHECK-F32X2-O3-NEXT: .reg .b32 %r<5>; -; CHECK-F32X2-O3-EMPTY: -; CHECK-F32X2-O3-NEXT: // %bb.0: -; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_trunc_param_0]; -; CHECK-F32X2-O3-NEXT: cvt.rzi.f32.f32 %r3, %r2; -; CHECK-F32X2-O3-NEXT: cvt.rzi.f32.f32 %r4, %r1; -; CHECK-F32X2-O3-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; -; CHECK-F32X2-O3-NEXT: ret; %r = call <2 x float> @llvm.trunc(<2 x float> %a) ret <2 x float> %r } @@ -3767,17 +2680,6 @@ define <2 x float> @test_rint(<2 x float> %a) #0 { ; CHECK-F32X2-NEXT: cvt.rni.f32.f32 %r4, %r1; ; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; ; CHECK-F32X2-NEXT: ret; -; -; CHECK-F32X2-O3-LABEL: test_rint( -; CHECK-F32X2-O3: { -; CHECK-F32X2-O3-NEXT: .reg .b32 %r<5>; -; CHECK-F32X2-O3-EMPTY: -; CHECK-F32X2-O3-NEXT: // %bb.0: -; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_rint_param_0]; -; CHECK-F32X2-O3-NEXT: cvt.rni.f32.f32 %r3, %r2; -; CHECK-F32X2-O3-NEXT: cvt.rni.f32.f32 %r4, %r1; -; CHECK-F32X2-O3-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; -; CHECK-F32X2-O3-NEXT: ret; %r = call <2 x float> @llvm.rint(<2 x float> %a) ret <2 x float> %r } @@ -3806,17 +2708,6 @@ define <2 x float> @test_nearbyint(<2 x float> %a) #0 { ; CHECK-F32X2-NEXT: cvt.rni.f32.f32 %r4, %r1; ; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; ; CHECK-F32X2-NEXT: ret; -; -; CHECK-F32X2-O3-LABEL: test_nearbyint( -; CHECK-F32X2-O3: { -; CHECK-F32X2-O3-NEXT: .reg .b32 %r<5>; -; CHECK-F32X2-O3-EMPTY: -; CHECK-F32X2-O3-NEXT: // %bb.0: -; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_nearbyint_param_0]; -; CHECK-F32X2-O3-NEXT: cvt.rni.f32.f32 %r3, %r2; -; CHECK-F32X2-O3-NEXT: cvt.rni.f32.f32 %r4, %r1; -; CHECK-F32X2-O3-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; -; CHECK-F32X2-O3-NEXT: ret; %r = call <2 x float> @llvm.nearbyint(<2 x float> %a) ret <2 x float> %r } @@ -3845,17 +2736,6 @@ define <2 x float> @test_roundeven(<2 x float> %a) #0 { ; CHECK-F32X2-NEXT: cvt.rni.f32.f32 %r4, %r1; ; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; ; CHECK-F32X2-NEXT: ret; -; -; CHECK-F32X2-O3-LABEL: test_roundeven( -; CHECK-F32X2-O3: { -; CHECK-F32X2-O3-NEXT: .reg .b32 %r<5>; -; CHECK-F32X2-O3-EMPTY: -; CHECK-F32X2-O3-NEXT: // %bb.0: -; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_roundeven_param_0]; -; CHECK-F32X2-O3-NEXT: cvt.rni.f32.f32 %r3, %r2; -; CHECK-F32X2-O3-NEXT: cvt.rni.f32.f32 %r4, %r1; -; CHECK-F32X2-O3-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; -; CHECK-F32X2-O3-NEXT: ret; %r = call <2 x float> @llvm.roundeven(<2 x float> %a) ret <2 x float> %r } @@ -3923,36 +2803,6 @@ define <2 x float> @test_round(<2 x float> %a) #0 { ; CHECK-F32X2-NEXT: selp.f32 %r18, %r17, %r16, %p4; ; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r18, %r10}; ; CHECK-F32X2-NEXT: ret; -; -; CHECK-F32X2-O3-LABEL: test_round( -; CHECK-F32X2-O3: { -; CHECK-F32X2-O3-NEXT: .reg .pred %p<5>; -; CHECK-F32X2-O3-NEXT: .reg .b32 %r<19>; -; CHECK-F32X2-O3-EMPTY: -; CHECK-F32X2-O3-NEXT: // %bb.0: -; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_round_param_0]; -; CHECK-F32X2-O3-NEXT: and.b32 %r3, %r2, -2147483648; -; CHECK-F32X2-O3-NEXT: or.b32 %r4, %r3, 1056964608; -; CHECK-F32X2-O3-NEXT: add.rn.f32 %r5, %r2, %r4; -; CHECK-F32X2-O3-NEXT: cvt.rzi.f32.f32 %r6, %r5; -; CHECK-F32X2-O3-NEXT: abs.f32 %r7, %r2; -; CHECK-F32X2-O3-NEXT: setp.gt.f32 %p1, %r7, 0f4B000000; -; CHECK-F32X2-O3-NEXT: selp.f32 %r8, %r2, %r6, %p1; -; CHECK-F32X2-O3-NEXT: cvt.rzi.f32.f32 %r9, %r2; -; CHECK-F32X2-O3-NEXT: setp.lt.f32 %p2, %r7, 0f3F000000; -; CHECK-F32X2-O3-NEXT: selp.f32 %r10, %r9, %r8, %p2; -; CHECK-F32X2-O3-NEXT: and.b32 %r11, %r1, -2147483648; -; CHECK-F32X2-O3-NEXT: or.b32 %r12, %r11, 1056964608; -; CHECK-F32X2-O3-NEXT: add.rn.f32 %r13, %r1, %r12; -; CHECK-F32X2-O3-NEXT: cvt.rzi.f32.f32 %r14, %r13; -; CHECK-F32X2-O3-NEXT: abs.f32 %r15, %r1; -; CHECK-F32X2-O3-NEXT: setp.gt.f32 %p3, %r15, 0f4B000000; -; CHECK-F32X2-O3-NEXT: selp.f32 %r16, %r1, %r14, %p3; -; CHECK-F32X2-O3-NEXT: cvt.rzi.f32.f32 %r17, %r1; -; CHECK-F32X2-O3-NEXT: setp.lt.f32 %p4, %r15, 0f3F000000; -; CHECK-F32X2-O3-NEXT: selp.f32 %r18, %r17, %r16, %p4; -; CHECK-F32X2-O3-NEXT: st.param.v2.b32 [func_retval0], {%r18, %r10}; -; CHECK-F32X2-O3-NEXT: ret; %r = call <2 x float> @llvm.round(<2 x float> %a) ret <2 x float> %r } @@ -3982,18 +2832,6 @@ define <2 x float> @test_fmuladd(<2 x float> %a, <2 x float> %b, <2 x float> %c) ; CHECK-F32X2-NEXT: fma.rn.f32x2 %rd4, %rd1, %rd2, %rd3; ; CHECK-F32X2-NEXT: st.param.b64 [func_retval0], %rd4; ; CHECK-F32X2-NEXT: ret; -; -; CHECK-F32X2-O3-LABEL: test_fmuladd( -; CHECK-F32X2-O3: { -; CHECK-F32X2-O3-NEXT: .reg .b64 %rd<5>; -; CHECK-F32X2-O3-EMPTY: -; CHECK-F32X2-O3-NEXT: // %bb.0: -; CHECK-F32X2-O3-NEXT: ld.param.b64 %rd1, [test_fmuladd_param_0]; -; CHECK-F32X2-O3-NEXT: ld.param.b64 %rd2, [test_fmuladd_param_1]; -; CHECK-F32X2-O3-NEXT: ld.param.b64 %rd3, [test_fmuladd_param_2]; -; CHECK-F32X2-O3-NEXT: fma.rn.f32x2 %rd4, %rd1, %rd2, %rd3; -; CHECK-F32X2-O3-NEXT: st.param.b64 [func_retval0], %rd4; -; CHECK-F32X2-O3-NEXT: ret; %r = call <2 x float> @llvm.fmuladd(<2 x float> %a, <2 x float> %b, <2 x float> %c) ret <2 x float> %r } @@ -4018,15 +2856,6 @@ define <2 x float> @test_shufflevector(<2 x float> %a) #0 { ; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd1; ; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r2, %r1}; ; CHECK-F32X2-NEXT: ret; -; -; CHECK-F32X2-O3-LABEL: test_shufflevector( -; CHECK-F32X2-O3: { -; CHECK-F32X2-O3-NEXT: .reg .b32 %r<3>; -; CHECK-F32X2-O3-EMPTY: -; CHECK-F32X2-O3-NEXT: // %bb.0: -; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_shufflevector_param_0]; -; CHECK-F32X2-O3-NEXT: st.param.v2.b32 [func_retval0], {%r2, %r1}; -; CHECK-F32X2-O3-NEXT: ret; %s = shufflevector <2 x float> %a, <2 x float> poison, <2 x i32> ret <2 x float> %s } @@ -4053,16 +2882,6 @@ define <2 x float> @test_insertelement(<2 x float> %a, float %x) #0 { ; CHECK-F32X2-NEXT: mov.b64 {%r2, _}, %rd1; ; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r2, %r1}; ; CHECK-F32X2-NEXT: ret; -; -; CHECK-F32X2-O3-LABEL: test_insertelement( -; CHECK-F32X2-O3: { -; CHECK-F32X2-O3-NEXT: .reg .b32 %r<3>; -; CHECK-F32X2-O3-EMPTY: -; CHECK-F32X2-O3-NEXT: // %bb.0: -; CHECK-F32X2-O3-NEXT: ld.param.b32 %r1, [test_insertelement_param_1]; -; CHECK-F32X2-O3-NEXT: ld.param.b32 %r2, [test_insertelement_param_0]; -; CHECK-F32X2-O3-NEXT: st.param.v2.b32 [func_retval0], {%r2, %r1}; -; CHECK-F32X2-O3-NEXT: ret; %i = insertelement <2 x float> %a, float %x, i64 1 ret <2 x float> %i } @@ -4091,17 +2910,6 @@ define <2 x float> @test_sitofp_2xi32_to_2xfloat(<2 x i32> %a) #0 { ; CHECK-F32X2-NEXT: cvt.rn.f32.s32 %r4, %r1; ; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; ; CHECK-F32X2-NEXT: ret; -; -; CHECK-F32X2-O3-LABEL: test_sitofp_2xi32_to_2xfloat( -; CHECK-F32X2-O3: { -; CHECK-F32X2-O3-NEXT: .reg .b32 %r<5>; -; CHECK-F32X2-O3-EMPTY: -; CHECK-F32X2-O3-NEXT: // %bb.0: -; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_sitofp_2xi32_to_2xfloat_param_0]; -; CHECK-F32X2-O3-NEXT: cvt.rn.f32.s32 %r3, %r2; -; CHECK-F32X2-O3-NEXT: cvt.rn.f32.s32 %r4, %r1; -; CHECK-F32X2-O3-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; -; CHECK-F32X2-O3-NEXT: ret; %r = sitofp <2 x i32> %a to <2 x float> ret <2 x float> %r } @@ -4130,17 +2938,6 @@ define <2 x float> @test_uitofp_2xi32_to_2xfloat(<2 x i32> %a) #0 { ; CHECK-F32X2-NEXT: cvt.rn.f32.u32 %r4, %r1; ; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; ; CHECK-F32X2-NEXT: ret; -; -; CHECK-F32X2-O3-LABEL: test_uitofp_2xi32_to_2xfloat( -; CHECK-F32X2-O3: { -; CHECK-F32X2-O3-NEXT: .reg .b32 %r<5>; -; CHECK-F32X2-O3-EMPTY: -; CHECK-F32X2-O3-NEXT: // %bb.0: -; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_uitofp_2xi32_to_2xfloat_param_0]; -; CHECK-F32X2-O3-NEXT: cvt.rn.f32.u32 %r3, %r2; -; CHECK-F32X2-O3-NEXT: cvt.rn.f32.u32 %r4, %r1; -; CHECK-F32X2-O3-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; -; CHECK-F32X2-O3-NEXT: ret; %r = uitofp <2 x i32> %a to <2 x float> ret <2 x float> %r } @@ -4170,18 +2967,6 @@ define void @test_trunc_to_v2bf16(<2 x float> %a, ptr %p) { ; CHECK-F32X2-NEXT: cvt.rn.bf16x2.f32 %r3, %r2, %r1; ; CHECK-F32X2-NEXT: st.b32 [%rd2], %r3; ; CHECK-F32X2-NEXT: ret; -; -; CHECK-F32X2-O3-LABEL: test_trunc_to_v2bf16( -; CHECK-F32X2-O3: { -; CHECK-F32X2-O3-NEXT: .reg .b32 %r<4>; -; CHECK-F32X2-O3-NEXT: .reg .b64 %rd<2>; -; CHECK-F32X2-O3-EMPTY: -; CHECK-F32X2-O3-NEXT: // %bb.0: -; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_trunc_to_v2bf16_param_0]; -; CHECK-F32X2-O3-NEXT: cvt.rn.bf16x2.f32 %r3, %r2, %r1; -; CHECK-F32X2-O3-NEXT: ld.param.b64 %rd1, [test_trunc_to_v2bf16_param_1]; -; CHECK-F32X2-O3-NEXT: st.b32 [%rd1], %r3; -; CHECK-F32X2-O3-NEXT: ret; %trunc = fptrunc <2 x float> %a to <2 x bfloat> store <2 x bfloat> %trunc, ptr %p ret void @@ -4212,18 +2997,6 @@ define void @test_trunc_to_v2f16(<2 x float> %a, ptr %p) { ; CHECK-F32X2-NEXT: cvt.rn.f16x2.f32 %r3, %r2, %r1; ; CHECK-F32X2-NEXT: st.b32 [%rd2], %r3; ; CHECK-F32X2-NEXT: ret; -; -; CHECK-F32X2-O3-LABEL: test_trunc_to_v2f16( -; CHECK-F32X2-O3: { -; CHECK-F32X2-O3-NEXT: .reg .b32 %r<4>; -; CHECK-F32X2-O3-NEXT: .reg .b64 %rd<2>; -; CHECK-F32X2-O3-EMPTY: -; CHECK-F32X2-O3-NEXT: // %bb.0: -; CHECK-F32X2-O3-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_trunc_to_v2f16_param_0]; -; CHECK-F32X2-O3-NEXT: cvt.rn.f16x2.f32 %r3, %r2, %r1; -; CHECK-F32X2-O3-NEXT: ld.param.b64 %rd1, [test_trunc_to_v2f16_param_1]; -; CHECK-F32X2-O3-NEXT: st.b32 [%rd1], %r3; -; CHECK-F32X2-O3-NEXT: ret; %trunc = fptrunc <2 x float> %a to <2 x half> store <2 x half> %trunc, ptr %p ret void