@@ -469,6 +469,33 @@ bool SystemZTTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) {
469469 return (VT.isScalarInteger () && TLI->isTypeLegal (VT));
470470}
471471
472+ InstructionCost SystemZTTIImpl::
473+ getScalarizationOverhead (VectorType *Ty,
474+ const APInt &DemandedElts,
475+ bool Insert, bool Extract,
476+ TTI::TargetCostKind CostKind) {
477+ unsigned NumElts = cast<FixedVectorType>(Ty)->getNumElements ();
478+ InstructionCost Cost = 0 ;
479+
480+ if (Insert && Ty->isIntOrIntVectorTy (64 )) {
481+ // VLVGP will insert two GPRs with one instruction.
482+ InstructionCost CurrVectorCost = 0 ;
483+ for (unsigned Idx = 0 ; Idx < NumElts; ++Idx) {
484+ if (DemandedElts[Idx])
485+ ++CurrVectorCost;
486+ if (Idx % 2 == 1 ) {
487+ Cost += std::min (InstructionCost (1 ), CurrVectorCost);
488+ CurrVectorCost = 0 ;
489+ }
490+ }
491+ Insert = false ;
492+ }
493+
494+ Cost += BaseT::getScalarizationOverhead (Ty, DemandedElts, Insert,
495+ Extract, CostKind);
496+ return Cost;
497+ }
498+
472499// Return the bit size for the scalar type or vector element
473500// type. getScalarSizeInBits() returns 0 for a pointer type.
474501static unsigned getScalarSizeInBits (Type *Ty) {
@@ -610,7 +637,7 @@ InstructionCost SystemZTTIImpl::getArithmeticInstrCost(
610637 if (DivRemConst) {
611638 SmallVector<Type *> Tys (Args.size (), Ty);
612639 return VF * DivMulSeqCost +
613- getScalarizationOverhead (VTy, Args, Tys, CostKind);
640+ BaseT:: getScalarizationOverhead (VTy, Args, Tys, CostKind);
614641 }
615642 if ((SignedDivRem || UnsignedDivRem) && VF > 4 )
616643 // Temporary hack: disable high vectorization factors with integer
@@ -637,7 +664,7 @@ InstructionCost SystemZTTIImpl::getArithmeticInstrCost(
637664 SmallVector<Type *> Tys (Args.size (), Ty);
638665 InstructionCost Cost =
639666 (VF * ScalarCost) +
640- getScalarizationOverhead (VTy, Args, Tys, CostKind);
667+ BaseT:: getScalarizationOverhead (VTy, Args, Tys, CostKind);
641668 // FIXME: VF 2 for these FP operations are currently just as
642669 // expensive as for VF 4.
643670 if (VF == 2 )
@@ -655,8 +682,9 @@ InstructionCost SystemZTTIImpl::getArithmeticInstrCost(
655682 // There is no native support for FRem.
656683 if (Opcode == Instruction::FRem) {
657684 SmallVector<Type *> Tys (Args.size (), Ty);
658- InstructionCost Cost = (VF * LIBCALL_COST) +
659- getScalarizationOverhead (VTy, Args, Tys, CostKind);
685+ InstructionCost Cost =
686+ (VF * LIBCALL_COST) +
687+ BaseT::getScalarizationOverhead (VTy, Args, Tys, CostKind);
660688 // FIXME: VF 2 for float is currently just as expensive as for VF 4.
661689 if (VF == 2 && ScalarBits == 32 )
662690 Cost *= 2 ;
@@ -976,10 +1004,10 @@ InstructionCost SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
9761004 (Opcode == Instruction::FPToSI || Opcode == Instruction::FPToUI))
9771005 NeedsExtracts = false ;
9781006
979- TotCost += getScalarizationOverhead (SrcVecTy, /* Insert*/ false ,
980- NeedsExtracts, CostKind);
981- TotCost += getScalarizationOverhead (DstVecTy, NeedsInserts,
982- /* Extract*/ false , CostKind);
1007+ TotCost += BaseT:: getScalarizationOverhead (SrcVecTy, /* Insert*/ false ,
1008+ NeedsExtracts, CostKind);
1009+ TotCost += BaseT:: getScalarizationOverhead (DstVecTy, NeedsInserts,
1010+ /* Extract*/ false , CostKind);
9831011
9841012 // FIXME: VF 2 for float<->i32 is currently just as expensive as for VF 4.
9851013 if (VF == 2 && SrcScalarBits == 32 && DstScalarBits == 32 )
@@ -991,8 +1019,8 @@ InstructionCost SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
9911019 if (Opcode == Instruction::FPTrunc) {
9921020 if (SrcScalarBits == 128 ) // fp128 -> double/float + inserts of elements.
9931021 return VF /* ldxbr/lexbr*/ +
994- getScalarizationOverhead (DstVecTy, /* Insert*/ true ,
995- /* Extract*/ false , CostKind);
1022+ BaseT:: getScalarizationOverhead (DstVecTy, /* Insert*/ true ,
1023+ /* Extract*/ false , CostKind);
9961024 else // double -> float
9971025 return VF / 2 /* vledb*/ + std::max (1U , VF / 4 /* vperm*/ );
9981026 }
@@ -1005,8 +1033,8 @@ InstructionCost SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
10051033 return VF * 2 ;
10061034 }
10071035 // -> fp128. VF * lxdb/lxeb + extraction of elements.
1008- return VF + getScalarizationOverhead (SrcVecTy, /* Insert*/ false ,
1009- /* Extract*/ true , CostKind);
1036+ return VF + BaseT:: getScalarizationOverhead (SrcVecTy, /* Insert*/ false ,
1037+ /* Extract*/ true , CostKind);
10101038 }
10111039 }
10121040
@@ -1115,10 +1143,17 @@ InstructionCost SystemZTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
11151143 TTI::TargetCostKind CostKind,
11161144 unsigned Index, Value *Op0,
11171145 Value *Op1) {
1118- // vlvgp will insert two grs into a vector register, so only count half the
1119- // number of instructions.
1120- if (Opcode == Instruction::InsertElement && Val->isIntOrIntVectorTy (64 ))
1121- return ((Index % 2 == 0 ) ? 1 : 0 );
1146+ if (Opcode == Instruction::InsertElement) {
1147+ // Vector Element Load.
1148+ if (Op1 != nullptr && Op1->hasOneUse () && isa<LoadInst>(Op1))
1149+ return 0 ;
1150+
1151+ // vlvgp will insert two grs into a vector register, so count half the
1152+ // number of instructions as an estimate when we don't have the full
1153+ // picture (as in getScalarizationOverhead()).
1154+ if (Val->isIntOrIntVectorTy (64 ))
1155+ return ((Index % 2 == 0 ) ? 1 : 0 );
1156+ }
11221157
11231158 if (Opcode == Instruction::ExtractElement) {
11241159 int Cost = ((getScalarSizeInBits (Val) == 1 ) ? 2 /* +test-under-mask*/ : 1 );
0 commit comments