@@ -468,6 +468,42 @@ bool SystemZTTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) {
468468 return (VT.isScalarInteger () && TLI->isTypeLegal (VT));
469469}
470470
471+ static bool isFreeEltLoad (Value *Op) {
472+ if (isa<LoadInst>(Op) && Op->hasOneUse ()) {
473+ const Instruction *UserI = cast<Instruction>(*Op->user_begin ());
474+ return !isa<StoreInst>(UserI); // Prefer MVC
475+ }
476+ return false ;
477+ }
478+
479+ InstructionCost SystemZTTIImpl::getScalarizationOverhead (
480+ VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
481+ TTI::TargetCostKind CostKind, ArrayRef<Value *> VL) {
482+ unsigned NumElts = cast<FixedVectorType>(Ty)->getNumElements ();
483+ InstructionCost Cost = 0 ;
484+
485+ if (Insert && Ty->isIntOrIntVectorTy (64 )) {
486+ // VLVGP will insert two GPRs with one instruction, while VLE will load
487+ // an element directly with no extra cost
488+ assert ((VL.empty () || VL.size () == NumElts) &&
489+ " Type does not match the number of values." );
490+ InstructionCost CurrVectorCost = 0 ;
491+ for (unsigned Idx = 0 ; Idx < NumElts; ++Idx) {
492+ if (DemandedElts[Idx] && !(VL.size () && isFreeEltLoad (VL[Idx])))
493+ ++CurrVectorCost;
494+ if (Idx % 2 == 1 ) {
495+ Cost += std::min (InstructionCost (1 ), CurrVectorCost);
496+ CurrVectorCost = 0 ;
497+ }
498+ }
499+ Insert = false ;
500+ }
501+
502+ Cost += BaseT::getScalarizationOverhead (Ty, DemandedElts, Insert, Extract,
503+ CostKind, VL);
504+ return Cost;
505+ }
506+
471507// Return the bit size for the scalar type or vector element
472508// type. getScalarSizeInBits() returns 0 for a pointer type.
473509static unsigned getScalarSizeInBits (Type *Ty) {
@@ -609,7 +645,7 @@ InstructionCost SystemZTTIImpl::getArithmeticInstrCost(
609645 if (DivRemConst) {
610646 SmallVector<Type *> Tys (Args.size (), Ty);
611647 return VF * DivMulSeqCost +
612- getScalarizationOverhead (VTy, Args, Tys, CostKind);
648+ BaseT:: getScalarizationOverhead (VTy, Args, Tys, CostKind);
613649 }
614650 if ((SignedDivRem || UnsignedDivRem) && VF > 4 )
615651 // Temporary hack: disable high vectorization factors with integer
@@ -636,7 +672,7 @@ InstructionCost SystemZTTIImpl::getArithmeticInstrCost(
636672 SmallVector<Type *> Tys (Args.size (), Ty);
637673 InstructionCost Cost =
638674 (VF * ScalarCost) +
639- getScalarizationOverhead (VTy, Args, Tys, CostKind);
675+ BaseT:: getScalarizationOverhead (VTy, Args, Tys, CostKind);
640676 // FIXME: VF 2 for these FP operations are currently just as
641677 // expensive as for VF 4.
642678 if (VF == 2 )
@@ -654,8 +690,9 @@ InstructionCost SystemZTTIImpl::getArithmeticInstrCost(
654690 // There is no native support for FRem.
655691 if (Opcode == Instruction::FRem) {
656692 SmallVector<Type *> Tys (Args.size (), Ty);
657- InstructionCost Cost = (VF * LIBCALL_COST) +
658- getScalarizationOverhead (VTy, Args, Tys, CostKind);
693+ InstructionCost Cost =
694+ (VF * LIBCALL_COST) +
695+ BaseT::getScalarizationOverhead (VTy, Args, Tys, CostKind);
659696 // FIXME: VF 2 for float is currently just as expensive as for VF 4.
660697 if (VF == 2 && ScalarBits == 32 )
661698 Cost *= 2 ;
@@ -975,10 +1012,10 @@ InstructionCost SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
9751012 (Opcode == Instruction::FPToSI || Opcode == Instruction::FPToUI))
9761013 NeedsExtracts = false ;
9771014
978- TotCost += getScalarizationOverhead (SrcVecTy, /* Insert*/ false ,
979- NeedsExtracts, CostKind);
980- TotCost += getScalarizationOverhead (DstVecTy, NeedsInserts,
981- /* Extract*/ false , CostKind);
1015+ TotCost += BaseT:: getScalarizationOverhead (SrcVecTy, /* Insert*/ false ,
1016+ NeedsExtracts, CostKind);
1017+ TotCost += BaseT:: getScalarizationOverhead (DstVecTy, NeedsInserts,
1018+ /* Extract*/ false , CostKind);
9821019
9831020 // FIXME: VF 2 for float<->i32 is currently just as expensive as for VF 4.
9841021 if (VF == 2 && SrcScalarBits == 32 && DstScalarBits == 32 )
@@ -990,8 +1027,8 @@ InstructionCost SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
9901027 if (Opcode == Instruction::FPTrunc) {
9911028 if (SrcScalarBits == 128 ) // fp128 -> double/float + inserts of elements.
9921029 return VF /* ldxbr/lexbr*/ +
993- getScalarizationOverhead (DstVecTy, /* Insert*/ true ,
994- /* Extract*/ false , CostKind);
1030+ BaseT:: getScalarizationOverhead (DstVecTy, /* Insert*/ true ,
1031+ /* Extract*/ false , CostKind);
9951032 else // double -> float
9961033 return VF / 2 /* vledb*/ + std::max (1U , VF / 4 /* vperm*/ );
9971034 }
@@ -1004,8 +1041,8 @@ InstructionCost SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
10041041 return VF * 2 ;
10051042 }
10061043 // -> fp128. VF * lxdb/lxeb + extraction of elements.
1007- return VF + getScalarizationOverhead (SrcVecTy, /* Insert*/ false ,
1008- /* Extract*/ true , CostKind);
1044+ return VF + BaseT:: getScalarizationOverhead (SrcVecTy, /* Insert*/ false ,
1045+ /* Extract*/ true , CostKind);
10091046 }
10101047 }
10111048
@@ -1114,10 +1151,17 @@ InstructionCost SystemZTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
11141151 TTI::TargetCostKind CostKind,
11151152 unsigned Index, Value *Op0,
11161153 Value *Op1) {
1117- // vlvgp will insert two grs into a vector register, so only count half the
1118- // number of instructions.
1119- if (Opcode == Instruction::InsertElement && Val->isIntOrIntVectorTy (64 ))
1120- return ((Index % 2 == 0 ) ? 1 : 0 );
1154+ if (Opcode == Instruction::InsertElement) {
1155+ // Vector Element Load.
1156+ if (Op1 != nullptr && isFreeEltLoad (Op1))
1157+ return 0 ;
1158+
1159+ // vlvgp will insert two grs into a vector register, so count half the
1160+ // number of instructions as an estimate when we don't have the full
1161+ // picture (as in getScalarizationOverhead()).
1162+ if (Val->isIntOrIntVectorTy (64 ))
1163+ return ((Index % 2 == 0 ) ? 1 : 0 );
1164+ }
11211165
11221166 if (Opcode == Instruction::ExtractElement) {
11231167 int Cost = ((getScalarSizeInBits (Val) == 1 ) ? 2 /* +test-under-mask*/ : 1 );
0 commit comments