@@ -344,9 +344,12 @@ unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
// Returns the maximum number of elements of ElemWidth bits to group together
// for the given Opcode. This hunk replaces the old non-memory return
// expression ('-' lines) with one that also handles 8-bit elements ('+' lines).
344344unsigned  GCNTTIImpl::getMaximumVF (unsigned  ElemWidth, unsigned  Opcode) const  {
345345  if  (Opcode == Instruction::Load || Opcode == Instruction::Store)
346346    return  32  * 4  / ElemWidth;  // loads/stores: as many elements as fit in 128 bits
347-   return  (ElemWidth == 16  && ST->has16BitInsts ()) ? 2 
348-        : (ElemWidth == 32  && ST->hasPackedFP32Ops ()) ? 2 
349-        : 1 ;
347+   //  For a given element width, return the maximum number of elements that
348+   //  can be combined into a wider bit value (1 means no combining):
349+   return  (ElemWidth == 8  && ST->has16BitInsts ())       ? 4 
350+          : (ElemWidth == 16  && ST->has16BitInsts ())    ? 2 
351+          : (ElemWidth == 32  && ST->hasPackedFP32Ops ()) ? 2 
352+                                                        : 1 ;
350353}
351354
352355unsigned  GCNTTIImpl::getLoadVectorFactor (unsigned  VF, unsigned  LoadSize,
@@ -1195,14 +1198,15 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
11951198
11961199  Kind = improveShuffleKindFromMask (Kind, Mask, SrcTy, Index, SubTp);
11971200
1198-   //  Larger vector widths may require additional instructions, but are
1199-   //  typically cheaper than scalarized versions.
1200-   unsigned  NumVectorElts = cast<FixedVectorType>(SrcTy)->getNumElements ();
1201+   unsigned  ScalarSize = DL.getTypeSizeInBits (SrcTy->getElementType ());
12011202  if  (ST->getGeneration () >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
1202-       DL.getTypeSizeInBits (SrcTy->getElementType ()) == 16 ) {
1203-     bool  HasVOP3P = ST->hasVOP3PInsts ();
1203+       (ScalarSize == 16  || ScalarSize == 8 )) {
1204+     //  Larger vector widths may require additional instructions, but are
1205+     //  typically cheaper than scalarized versions.
1206+     unsigned  NumVectorElts = cast<FixedVectorType>(SrcTy)->getNumElements ();
12041207    unsigned  RequestedElts =
12051208        count_if (Mask, [](int  MaskElt) { return  MaskElt != -1 ; });
1209+     unsigned  EltsPerReg = 32  / ScalarSize;
12061210    if  (RequestedElts == 0 )
12071211      return  0 ;
12081212    switch  (Kind) {
@@ -1211,9 +1215,9 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
12111215    case  TTI::SK_PermuteSingleSrc: {
12121216      //  With op_sel VOP3P instructions freely can access the low half or high
12131217      //  half of a register, so any swizzle of two elements is free.
1214-       if  (HasVOP3P  && NumVectorElts == 2 )
1218+       if  (ST-> hasVOP3PInsts () && ScalarSize ==  16  && NumVectorElts == 2 )
12151219        return  0 ;
1216-       unsigned  NumPerms = alignTo (RequestedElts, 2 ) / 2 ;
1220+       unsigned  NumPerms = alignTo (RequestedElts, EltsPerReg ) / EltsPerReg ;
12171221      //  SK_Broadcast just reuses the same mask
12181222      unsigned  NumPermMasks = Kind == TTI::SK_Broadcast ? 1  : NumPerms;
12191223      return  NumPerms + NumPermMasks;
@@ -1225,12 +1229,12 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
12251229        return  0 ;
12261230      //  Insert/extract subvectors only require shifts / extract code to get the
12271231      //  relevant bits
1228-       return  alignTo (RequestedElts, 2 ) / 2 ;
1232+       return  alignTo (RequestedElts, EltsPerReg ) / EltsPerReg ;
12291233    }
12301234    case  TTI::SK_PermuteTwoSrc:
12311235    case  TTI::SK_Splice:
12321236    case  TTI::SK_Select: {
1233-       unsigned  NumPerms = alignTo (RequestedElts, 2 ) / 2 ;
1237+       unsigned  NumPerms = alignTo (RequestedElts, EltsPerReg ) / EltsPerReg ;
12341238      //  SK_Select just reuses the same mask
12351239      unsigned  NumPermMasks = Kind == TTI::SK_Select ? 1  : NumPerms;
12361240      return  NumPerms + NumPermMasks;
@@ -1505,3 +1509,30 @@ GCNTTIImpl::fpenvIEEEMode(const Instruction &I) const {
15051509  return  AMDGPU::isShader (F->getCallingConv ()) ? KnownIEEEMode::Off
15061510                                               : KnownIEEEMode::On;
15071511}
1512+ 
// Cost of a memory operation on type Src.
// Loads and stores of vectors whose element type is i8 are costed here as the
// vector's total bit size divided by getLoadStoreVecRegBitWidth(AddressSpace);
// Alignment, CostKind, OpInfo and I are not consulted on that path.
// Every other opcode/type combination is forwarded unchanged to
// BaseT::getMemoryOpCost.
1513+ InstructionCost GCNTTIImpl::getMemoryOpCost (unsigned  Opcode, Type *Src,
1514+                                             Align Alignment,
1515+                                             unsigned  AddressSpace,
1516+                                             TTI::TargetCostKind CostKind,
1517+                                             TTI::OperandValueInfo OpInfo,
1518+                                             const  Instruction *I) const  {
1519+   if  (VectorType *VecTy = dyn_cast<VectorType>(Src)) {
1520+     if  ((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
1521+         VecTy->getElementType ()->isIntegerTy (8 )) {
// NOTE(review): the `- 1` applied before divideCeil looks unusual for a
// round-up division; presumably it makes no difference when the bit size is
// a multiple of 8 -- confirm it is intended.
1522+       return  divideCeil (DL.getTypeSizeInBits (VecTy) - 1 ,
1523+                         getLoadStoreVecRegBitWidth (AddressSpace));
1524+     }
1525+   }
1526+   return  BaseT::getMemoryOpCost (Opcode, Src, Alignment, AddressSpace, CostKind,
1527+                                 OpInfo, I);
1528+ }
1529+ 
// Number of parts reported for type Tp.
// Vectors whose element type is i8 are counted in groups of four elements
// (4 x 8 bits); every other type is forwarded to BaseT::getNumberOfParts.
1530+ unsigned  GCNTTIImpl::getNumberOfParts (Type *Tp) const  {
1531+   if  (VectorType *VecTy = dyn_cast<VectorType>(Tp)) {
1532+     if  (VecTy->getElementType ()->isIntegerTy (8 )) {
// getFixedValue() is used, so this path presumably expects a fixed-width
// (non-scalable) vector -- TODO confirm.
1533+       unsigned  ElementCount = VecTy->getElementCount ().getFixedValue ();
// NOTE(review): assuming divideCeil rounds up, `ElementCount - 1` makes a
// 1-element vector yield 0 parts and a 5-element vector yield 1 part (not 2).
// Confirm this is the intended heuristic rather than an off-by-one.
1534+       return  divideCeil (ElementCount - 1 , 4 );
1535+     }
1536+   }
1537+   return  BaseT::getNumberOfParts (Tp);
1538+ }
0 commit comments