From 532e3d246c0a838a0dea306401e6968a3e141bd4 Mon Sep 17 00:00:00 2001 From: sgokhale Date: Tue, 8 Oct 2024 10:02:40 +0530 Subject: [PATCH] [CostModel][AArch64] Make extractelement, with fmul user, free whenever possible In case of Neon, if there exists an extractelement from lane != 0 such that: 1. The extractelement does not necessitate a move from vector_reg -> GPR. 2. The extractelement result feeds into an fmul. 3. The other operand of the fmul is a scalar or an extractelement from lane 0 or a lane equivalent to 0. then the extractelement can be merged with the fmul in the backend and it incurs no cost. e.g. define double @foo(<2 x double> %a) { %1 = extractelement <2 x double> %a, i32 0 %2 = extractelement <2 x double> %a, i32 1 %res = fmul double %1, %2 ret double %res } %2 and %res can be merged in the backend to generate: fmul d0, d0, v0.d[1] The change was tested with SPEC FP (C/C++) on Neoverse-v2. Compile-time impact: None. Performance impact: 1.3-1.7% uplift observed on the lbm benchmark with -flto, depending on the configuration. 
--- .../llvm/Analysis/TargetTransformInfo.h | 32 +++++ .../llvm/Analysis/TargetTransformInfoImpl.h | 11 ++ llvm/include/llvm/CodeGen/BasicTTIImpl.h | 13 +- llvm/lib/Analysis/TargetTransformInfo.cpp | 19 ++- .../AArch64/AArch64TargetTransformInfo.cpp | 135 +++++++++++++++++- .../AArch64/AArch64TargetTransformInfo.h | 21 ++- .../Transforms/Vectorize/SLPVectorizer.cpp | 12 +- .../CostModel/AArch64/extract_float.ll | 29 ++-- 8 files changed, 244 insertions(+), 28 deletions(-) diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h index 0dc513d8e65b7..1ccace59d6d36 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -22,6 +22,7 @@ #define LLVM_ANALYSIS_TARGETTRANSFORMINFO_H #include "llvm/ADT/APInt.h" +#include "llvm/ADT/ArrayRef.h" #include "llvm/IR/FMF.h" #include "llvm/IR/InstrTypes.h" #include "llvm/IR/PassManager.h" @@ -1404,6 +1405,20 @@ class TargetTransformInfo { unsigned Index = -1, Value *Op0 = nullptr, Value *Op1 = nullptr) const; + /// \return The expected cost of vector Insert and Extract. + /// Use -1 to indicate that there is no information on the index value. + /// This is used when the instruction is not available; a typical use + /// case is to provision the cost of vectorization/scalarization in + /// vectorizer passes. + /// \param ScalarUserAndIdx encodes the information about extracts from a + /// vector with 'Scalar' being the value being extracted,'User' being the user + /// of the extract(nullptr if user is not known before vectorization) and + /// 'Idx' being the extract lane. + InstructionCost getVectorInstrCost( + unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, + Value *Scalar, + ArrayRef> ScalarUserAndIdx) const; + /// \return The expected cost of vector Insert and Extract. /// This is used when instruction is available, and implementation /// asserts 'I' is not nullptr. 
@@ -2100,6 +2115,16 @@ class TargetTransformInfo::Concept { TTI::TargetCostKind CostKind, unsigned Index, Value *Op0, Value *Op1) = 0; + + /// \param ScalarUserAndIdx encodes the information about extracts from a + /// vector with 'Scalar' being the value being extracted,'User' being the user + /// of the extract(nullptr if user is not known before vectorization) and + /// 'Idx' being the extract lane. + virtual InstructionCost getVectorInstrCost( + unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, + Value *Scalar, + ArrayRef> ScalarUserAndIdx) = 0; + virtual InstructionCost getVectorInstrCost(const Instruction &I, Type *Val, TTI::TargetCostKind CostKind, unsigned Index) = 0; @@ -2785,6 +2810,13 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept { Value *Op1) override { return Impl.getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1); } + InstructionCost getVectorInstrCost( + unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, + Value *Scalar, + ArrayRef> ScalarUserAndIdx) override { + return Impl.getVectorInstrCost(Opcode, Val, CostKind, Index, Scalar, + ScalarUserAndIdx); + } InstructionCost getVectorInstrCost(const Instruction &I, Type *Val, TTI::TargetCostKind CostKind, unsigned Index) override { diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h index 224dfbb9f54b6..c3c5629d61c91 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -700,6 +700,17 @@ class TargetTransformInfoImplBase { return 1; } + /// \param ScalarUserAndIdx encodes the information about extracts from a + /// vector with 'Scalar' being the value being extracted,'User' being the user + /// of the extract(nullptr if user is not known before vectorization) and + /// 'Idx' being the extract lane. 
+ InstructionCost getVectorInstrCost( + unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, + Value *Scalar, + ArrayRef> ScalarUserAndIdx) const { + return 1; + } + InstructionCost getVectorInstrCost(const Instruction &I, Type *Val, TTI::TargetCostKind CostKind, unsigned Index) const { diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h index b0316e67654db..36df9ee2e7d94 100644 --- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -17,7 +17,6 @@ #define LLVM_CODEGEN_BASICTTIIMPL_H #include "llvm/ADT/APInt.h" -#include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/BitVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" @@ -1288,6 +1287,18 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { return getRegUsageForType(Val->getScalarType()); } + /// \param ScalarUserAndIdx encodes the information about extracts from a + /// vector with 'Scalar' being the value being extracted,'User' being the user + /// of the extract(nullptr if user is not known before vectorization) and + /// 'Idx' being the extract lane. 
+ InstructionCost getVectorInstrCost( + unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, + Value *Scalar, + ArrayRef> ScalarUserAndIdx) { + return thisT()->getVectorInstrCost(Opcode, Val, CostKind, Index, nullptr, + nullptr); + } + InstructionCost getVectorInstrCost(const Instruction &I, Type *Val, TTI::TargetCostKind CostKind, unsigned Index) { diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index c5c7b7c7c0a57..bc6a528c9dab3 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -1047,15 +1047,28 @@ InstructionCost TargetTransformInfo::getCmpSelInstrCost( InstructionCost TargetTransformInfo::getVectorInstrCost( unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, Value *Op0, Value *Op1) const { - // FIXME: Assert that Opcode is either InsertElement or ExtractElement. - // This is mentioned in the interface description and respected by all - // callers, but never asserted upon. 
+ assert((Opcode == Instruction::InsertElement || + Opcode == Instruction::ExtractElement) && + "Expecting Opcode to be insertelement/extractelement."); InstructionCost Cost = TTIImpl->getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1); assert(Cost >= 0 && "TTI should not produce negative costs!"); return Cost; } +InstructionCost TargetTransformInfo::getVectorInstrCost( + unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, + Value *Scalar, + ArrayRef> ScalarUserAndIdx) const { + assert((Opcode == Instruction::InsertElement || + Opcode == Instruction::ExtractElement) && + "Expecting Opcode to be insertelement/extractelement."); + InstructionCost Cost = TTIImpl->getVectorInstrCost( + Opcode, Val, CostKind, Index, Scalar, ScalarUserAndIdx); + assert(Cost >= 0 && "TTI should not produce negative costs!"); + return Cost; +} + InstructionCost TargetTransformInfo::getVectorInstrCost(const Instruction &I, Type *Val, TTI::TargetCostKind CostKind, diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 71f9bbbbc3504..a97b0d3b1db92 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -11,6 +11,7 @@ #include "AArch64PerfectShuffle.h" #include "MCTargetDesc/AArch64AddressingModes.h" #include "Utils/AArch64SMEAttributes.h" +#include "llvm/ADT/DenseMap.h" #include "llvm/Analysis/IVDescriptors.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" @@ -3177,10 +3178,10 @@ InstructionCost AArch64TTIImpl::getCFInstrCost(unsigned Opcode, return 0; } -InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(const Instruction *I, - Type *Val, - unsigned Index, - bool HasRealUse) { +InstructionCost AArch64TTIImpl::getVectorInstrCostHelper( + unsigned Opcode, Type *Val, unsigned Index, bool HasRealUse, + const Instruction *I, Value *Scalar, + ArrayRef> ScalarUserAndIdx) { 
assert(Val->isVectorTy() && "This must be a vector type"); if (Index != -1U) { @@ -3226,6 +3227,119 @@ InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(const Instruction *I, // compile-time considerations. } + // In case of Neon, if there exists extractelement from lane != 0 such that + // 1. extractelement does not necessitate a move from vector_reg -> GPR. + // 2. extractelement result feeds into fmul. + // 3. Other operand of fmul is an extractelement from lane 0 or lane + // equivalent to 0. + // then the extractelement can be merged with fmul in the backend and it + // incurs no cost. + // e.g. + // define double @foo(<2 x double> %a) { + // %1 = extractelement <2 x double> %a, i32 0 + // %2 = extractelement <2 x double> %a, i32 1 + // %res = fmul double %1, %2 + // ret double %res + // } + // %2 and %res can be merged in the backend to generate fmul d0, d0, v1.d[1] + auto ExtractCanFuseWithFmul = [&]() { + // We bail out if the extract is from lane 0. + if (Index == 0) + return false; + + // Check if the scalar element type of the vector operand of ExtractElement + // instruction is one of the allowed types. + auto IsAllowedScalarTy = [&](const Type *T) { + return T->isFloatTy() || T->isDoubleTy() || + (T->isHalfTy() && ST->hasFullFP16()); + }; + + // Check if the extractelement user is scalar fmul. + auto IsUserFMulScalarTy = [](const Value *EEUser) { + // Check if the user is scalar fmul. + const auto *BO = dyn_cast_if_present(EEUser); + return BO && BO->getOpcode() == BinaryOperator::FMul && + !BO->getType()->isVectorTy(); + }; + + // Check if the extract index is from lane 0 or lane equivalent to 0 for a + // certain scalar type and a certain vector register width. 
+ auto IsExtractLaneEquivalentToZero = [&](const unsigned &Idx, + const unsigned &EltSz) { + auto RegWidth = + getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector) + .getFixedValue(); + return (Idx == 0 || (Idx * EltSz) % RegWidth == 0); + }; + + // Check if the type constraints on input vector type and result scalar type + // of extractelement instruction are satisfied. + if (!isa(Val) || !IsAllowedScalarTy(Val->getScalarType())) + return false; + + if (Scalar) { + DenseMap UserToExtractIdx; + for (auto *U : Scalar->users()) { + if (!IsUserFMulScalarTy(U)) + return false; + // Recording entry for the user is important. Index value is not + // important. + UserToExtractIdx[U]; + } + for (auto &[S, U, L] : ScalarUserAndIdx) { + for (auto *U : S->users()) { + if (UserToExtractIdx.find(U) != UserToExtractIdx.end()) { + auto *FMul = cast(U); + auto *Op0 = FMul->getOperand(0); + auto *Op1 = FMul->getOperand(1); + if ((Op0 == S && Op1 == S) || (Op0 != S) || (Op1 != S)) { + UserToExtractIdx[U] = L; + break; + } + } + } + } + for (auto &[U, L] : UserToExtractIdx) { + if (!IsExtractLaneEquivalentToZero(Index, Val->getScalarSizeInBits()) && + !IsExtractLaneEquivalentToZero(L, Val->getScalarSizeInBits())) + return false; + } + } else { + const auto *EE = cast(I); + + const auto *IdxOp = dyn_cast(EE->getIndexOperand()); + if (!IdxOp) + return false; + + return !EE->users().empty() && all_of(EE->users(), [&](const User *U) { + if (!IsUserFMulScalarTy(U)) + return false; + + // Check if the other operand of extractelement is also extractelement + // from lane equivalent to 0. + const auto *BO = cast(U); + const auto *OtherEE = dyn_cast( + BO->getOperand(0) == EE ? 
BO->getOperand(1) : BO->getOperand(0)); + if (OtherEE) { + const auto *IdxOp = dyn_cast(OtherEE->getIndexOperand()); + if (!IdxOp) + return false; + return IsExtractLaneEquivalentToZero( + cast(OtherEE->getIndexOperand()) + ->getValue() + .getZExtValue(), + OtherEE->getType()->getScalarSizeInBits()); + } + return true; + }); + } + return true; + }; + + if (Opcode == Instruction::ExtractElement && (I || Scalar) && + ExtractCanFuseWithFmul()) + return 0; + // All other insert/extracts cost this much. return ST->getVectorInsertExtractBaseCost(); } @@ -3236,14 +3350,23 @@ InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, Value *Op1) { bool HasRealUse = Opcode == Instruction::InsertElement && Op0 && !isa(Op0); - return getVectorInstrCostHelper(nullptr, Val, Index, HasRealUse); + return getVectorInstrCostHelper(Opcode, Val, Index, HasRealUse); +} + +InstructionCost AArch64TTIImpl::getVectorInstrCost( + unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, + Value *Scalar, + ArrayRef> ScalarUserAndIdx) { + return getVectorInstrCostHelper(Opcode, Val, Index, false, nullptr, Scalar, + ScalarUserAndIdx); } InstructionCost AArch64TTIImpl::getVectorInstrCost(const Instruction &I, Type *Val, TTI::TargetCostKind CostKind, unsigned Index) { - return getVectorInstrCostHelper(&I, Val, Index, true /* HasRealUse */); + return getVectorInstrCostHelper(I.getOpcode(), Val, Index, + true /* HasRealUse */, &I); } InstructionCost AArch64TTIImpl::getScalarizationOverhead( diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h index 1d09d67f6ec9e..a01d061c4c407 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -19,7 +19,6 @@ #include "AArch64.h" #include "AArch64Subtarget.h" #include "AArch64TargetMachine.h" -#include "llvm/ADT/ArrayRef.h" #include "llvm/Analysis/TargetTransformInfo.h" #include 
"llvm/CodeGen/BasicTTIImpl.h" #include "llvm/IR/Function.h" @@ -66,8 +65,14 @@ class AArch64TTIImpl : public BasicTTIImplBase { // 'Val' and 'Index' are forwarded from 'getVectorInstrCost'; 'HasRealUse' // indicates whether the vector instruction is available in the input IR or // just imaginary in vectorizer passes. - InstructionCost getVectorInstrCostHelper(const Instruction *I, Type *Val, - unsigned Index, bool HasRealUse); + /// \param ScalarUserAndIdx encodes the information about extracts from a + /// vector with 'Scalar' being the value being extracted,'User' being the user + /// of the extract(nullptr if user is not known before vectorization) and + /// 'Idx' being the extract lane. + InstructionCost getVectorInstrCostHelper( + unsigned Opcode, Type *Val, unsigned Index, bool HasRealUse, + const Instruction *I = nullptr, Value *Scalar = nullptr, + ArrayRef> ScalarUserAndIdx = {}); public: explicit AArch64TTIImpl(const AArch64TargetMachine *TM, const Function &F) @@ -185,6 +190,16 @@ class AArch64TTIImpl : public BasicTTIImplBase { InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, Value *Op0, Value *Op1); + + /// \param ScalarUserAndIdx encodes the information about extracts from a + /// vector with 'Scalar' being the value being extracted,'User' being the user + /// of the extract(nullptr if user is not known before vectorization) and + /// 'Idx' being the extract lane. 
+ InstructionCost getVectorInstrCost( + unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, + Value *Scalar, + ArrayRef> ScalarUserAndIdx); + InstructionCost getVectorInstrCost(const Instruction &I, Type *Val, TTI::TargetCostKind CostKind, unsigned Index); diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index a3ea39cea6b93..506c243688d9c 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -12192,6 +12192,13 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef VectorizedVals) { std::optional> ValueToExtUses; DenseMap> ExtractsCount; SmallPtrSet ScalarOpsFromCasts; + // Keep track {Scalar, Index, User} tuple. + // On AArch64, this helps in fusing a mov instruction, associated with + // extractelement, with fmul in the backend so that extractelement is free. + SmallVector, 4> ScalarUserAndIdx; + for (ExternalUser &EU : ExternalUses) { + ScalarUserAndIdx.emplace_back(EU.Scalar, EU.User, EU.Lane); + } for (ExternalUser &EU : ExternalUses) { // Uses by ephemeral values are free (because the ephemeral value will be // removed prior to code generation, and so the extraction will be @@ -12304,8 +12311,9 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef VectorizedVals) { ExtraCost = TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(), VecTy, EU.Lane); } else { - ExtraCost = TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, - CostKind, EU.Lane); + ExtraCost = + TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, CostKind, + EU.Lane, EU.Scalar, ScalarUserAndIdx); } // Leave the scalar instructions as is if they are cheaper than extracts. 
if (Entry->Idx != 0 || Entry->getOpcode() == Instruction::GetElementPtr || diff --git a/llvm/test/Analysis/CostModel/AArch64/extract_float.ll b/llvm/test/Analysis/CostModel/AArch64/extract_float.ll index dd3d0289bbb1c..d2b75faa014d6 100644 --- a/llvm/test/Analysis/CostModel/AArch64/extract_float.ll +++ b/llvm/test/Analysis/CostModel/AArch64/extract_float.ll @@ -8,7 +8,7 @@ define double @extract_case1(<2 x double> %a) { ; CHECK-LABEL: 'extract_case1' ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %0 = extractelement <2 x double> %a, i32 0 -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %1 = extractelement <2 x double> %a, i32 1 +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %1 = extractelement <2 x double> %a, i32 1 ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = fmul double %0, %1 ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %res entry: @@ -57,7 +57,7 @@ entry: ; res = lane 1 * scalar define double @extract_case5(<2 x double> %a, double %b) { ; CHECK-LABEL: 'extract_case5' -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %0 = extractelement <2 x double> %a, i32 1 +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %0 = extractelement <2 x double> %a, i32 1 ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = fmul double %0, %b ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %res entry: @@ -71,7 +71,7 @@ entry: define double @extract_case6(<3 x double> %a) { ; CHECK-LABEL: 'extract_case6' ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %0 = extractelement <3 x double> %a, i32 0 -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %1 = extractelement <3 x double> %a, i32 1 +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %1 = extractelement <3 x double> 
%a, i32 1 ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = fmul double %0, %1 ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %res entry: @@ -86,7 +86,7 @@ entry: ; register. But for other register sizes, this is not the case. define double @extract_case7(<4 x double> %a) { ; CHECK-LABEL: 'extract_case7' -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %0 = extractelement <4 x double> %a, i32 1 +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %0 = extractelement <4 x double> %a, i32 1 ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %1 = extractelement <4 x double> %a, i32 2 ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = fmul double %0, %1 ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %res @@ -158,11 +158,17 @@ entry: ; res = lane 0 * lane 1 define half @extract_case11(<2 x half> %a) { -; CHECK-LABEL: 'extract_case11' -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %0 = extractelement <2 x half> %a, i32 0 -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %1 = extractelement <2 x half> %a, i32 1 -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = fmul half %0, %1 -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret half %res +; NOFP16-LABEL: 'extract_case11' +; NOFP16-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %0 = extractelement <2 x half> %a, i32 0 +; NOFP16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %1 = extractelement <2 x half> %a, i32 1 +; NOFP16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = fmul half %0, %1 +; NOFP16-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret half %res +; +; FULLFP16-LABEL: 'extract_case11' +; FULLFP16-NEXT: Cost Model: Found an estimated cost of 0 for 
instruction: %0 = extractelement <2 x half> %a, i32 0 +; FULLFP16-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %1 = extractelement <2 x half> %a, i32 1 +; FULLFP16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = fmul half %0, %1 +; FULLFP16-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret half %res entry: %1 = extractelement <2 x half> %a, i32 0 %2 = extractelement <2 x half> %a, i32 1 @@ -174,7 +180,7 @@ entry: define float @extract_case12(<2 x float> %a) { ; CHECK-LABEL: 'extract_case12' ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %0 = extractelement <2 x float> %a, i32 0 -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %1 = extractelement <2 x float> %a, i32 1 +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %1 = extractelement <2 x float> %a, i32 1 ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = fmul float %0, %1 ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %res entry: @@ -200,6 +206,3 @@ entry: } declare void @foo(double) -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; FULLFP16: {{.*}} -; NOFP16: {{.*}}