
Commit 1fa3c83

[AMDGPU] Enable i8 vectorization and sinking of free vector ops (llvm#3971)
2 parents: ee329a0 + 907b1e4

File tree: 10 files changed, +1609 -1668 lines changed

llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp

Lines changed: 118 additions & 24 deletions
@@ -313,24 +313,6 @@ bool GCNTTIImpl::hasBranchDivergence(const Function *F) const {
   return !F || !ST->isSingleLaneExecution(*F);
 }
 
-unsigned GCNTTIImpl::getNumberOfParts(Type *Tp) {
-  // For certain 8 bit ops, we can pack a v4i8 into a single part
-  // (e.g. v4i8 shufflevectors -> v_perm v4i8, v4i8). Thus, we
-  // do not limit the numberOfParts for 8 bit vectors to the
-  // legalization costs of such. It is left up to other target
-  // queries (e.g. get*InstrCost) to decide the proper handling
-  // of 8 bit vectors.
-  if (FixedVectorType *VTy = dyn_cast<FixedVectorType>(Tp)) {
-    if (ST->shouldCoerceIllegalTypes() &&
-        DL.getTypeSizeInBits(VTy->getElementType()) == 8) {
-      unsigned ElCount = VTy->getElementCount().getFixedValue();
-      return std::max(UINT64_C(1), PowerOf2Ceil(ElCount / 4));
-    }
-  }
-
-  return BaseT::getNumberOfParts(Tp);
-}
-
 unsigned GCNTTIImpl::getNumberOfRegisters(unsigned RCID) const {
   // NB: RCID is not an RCID. In fact it is 0 or 1 for scalar or vector
   // registers. See getRegisterClassForType for the implementation.
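
For context on the heuristic removed here: it packed up to four i8 elements per 32-bit part and rounded the part count up to a power of two. A minimal stand-alone sketch of that arithmetic (illustrative only; powerOf2Ceil is a local stand-in for llvm::PowerOf2Ceil, and the element counts are made-up examples):

#include <algorithm>
#include <cassert>
#include <cstdint>

// Local stand-in for llvm::PowerOf2Ceil: smallest power of two >= V (0 -> 0).
static uint64_t powerOf2Ceil(uint64_t V) {
  if (V <= 1)
    return V;
  uint64_t P = 1;
  while (P < V)
    P <<= 1;
  return P;
}

// Mirrors the deleted getNumberOfParts logic for 8-bit element vectors:
// pack up to four i8s per 32-bit part, never reporting fewer than one part.
static uint64_t oldI8NumberOfParts(uint64_t ElCount) {
  return std::max<uint64_t>(1, powerOf2Ceil(ElCount / 4));
}

int main() {
  assert(oldI8NumberOfParts(4) == 1); // v4i8 fits in one 32-bit part
  assert(oldI8NumberOfParts(8) == 2); // v8i8 -> two parts
  assert(oldI8NumberOfParts(2) == 1); // sub-dword vectors still cost one part
  return 0;
}

A reworked version of this hook reappears at the end of this file (see the last hunk below).
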
@@ -363,10 +345,12 @@ unsigned GCNTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
   if (Opcode == Instruction::Load || Opcode == Instruction::Store)
     return 32 * 4 / ElemWidth;
 
-  return (ST->shouldCoerceIllegalTypes() && ElemWidth == 8) ? 4
-         : (ElemWidth == 16)                                ? 2
-         : (ElemWidth == 32 && ST->hasPackedFP32Ops())      ? 2
-                                                            : 1;
+  // For a given width return the max number of elements that can be combined
+  // into a wider bit value:
+  return (ElemWidth == 8 && ST->has16BitInsts())       ? 4
+         : (ElemWidth == 16 && ST->has16BitInsts())    ? 2
+         : (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
+                                                       : 1;
 }
 
 unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
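
The behavioral change in this hunk: 8-bit vectorization is now gated on has16BitInsts() rather than the shouldCoerceIllegalTypes() flag, and 16-bit vectorization picks up the same gate. A minimal model of the updated mapping (a sketch, not the committed code; the two bool parameters stand in for the subtarget queries):

#include <cassert>

// Models the updated GCNTTIImpl::getMaximumVF for non-memory opcodes: the
// widest group of elements that can be packed into one wider bit value.
static unsigned maxVFModel(unsigned ElemWidth, bool Has16BitInsts,
                           bool HasPackedFP32Ops) {
  return (ElemWidth == 8 && Has16BitInsts)       ? 4 // four i8s per dword
         : (ElemWidth == 16 && Has16BitInsts)    ? 2 // two halves per dword
         : (ElemWidth == 32 && HasPackedFP32Ops) ? 2 // packed f32 pairs
                                                  : 1;
}

int main() {
  assert(maxVFModel(8, /*Has16BitInsts=*/true, false) == 4);
  assert(maxVFModel(8, /*Has16BitInsts=*/false, false) == 1);
  assert(maxVFModel(16, /*Has16BitInsts=*/true, false) == 2);
  assert(maxVFModel(32, true, /*HasPackedFP32Ops=*/true) == 2);
  return 0;
}
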
@@ -1176,8 +1160,7 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
 
   unsigned ScalarSize = DL.getTypeSizeInBits(VT->getElementType());
   if (ST->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
-      (ScalarSize == 16 ||
-       (ScalarSize == 8 && ST->shouldCoerceIllegalTypes()))) {
+      (ScalarSize == 16 || ScalarSize == 8)) {
     // Larger vector widths may require additional instructions, but are
     // typically cheaper than scalarized versions.
     unsigned NumVectorElts = cast<FixedVectorType>(VT)->getNumElements();
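
This hunk drops the shouldCoerceIllegalTypes() gate so 8-bit shuffles are costed like 16-bit ones on VOLCANIC_ISLANDS and later, where a shuffle of packed bytes can often be done with byte-permute operations instead of scalarization. A rough software model of the byte-permute idea (illustrative only; the real V_PERM_B32 selector encoding and its special selector values differ from this simplification):

#include <cassert>
#include <cstdint>

// Rough model of a byte permute over two 32-bit sources: each selector entry
// picks one of the eight input bytes. A v4i8 shuffle of two packed sources
// becomes a single operation of this shape rather than four scalar moves.
static uint32_t bytePerm(uint32_t Src0, uint32_t Src1, const uint8_t Sel[4]) {
  uint8_t Bytes[8];
  for (int i = 0; i < 4; ++i) {
    Bytes[i] = (Src0 >> (8 * i)) & 0xff;     // bytes 0..3 from Src0
    Bytes[4 + i] = (Src1 >> (8 * i)) & 0xff; // bytes 4..7 from Src1
  }
  uint32_t Out = 0;
  for (int i = 0; i < 4; ++i)
    Out |= uint32_t(Bytes[Sel[i] & 7]) << (8 * i);
  return Out;
}

int main() {
  // Interleave the low two bytes of each source in one permute.
  const uint8_t Sel[4] = {0, 4, 1, 5};
  assert(bytePerm(0x04030201, 0x08070605, Sel) == 0x06020501);
  return 0;
}
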
@@ -1239,6 +1222,90 @@ bool GCNTTIImpl::isProfitableToSinkOperands(Instruction *I,
 
     if (match(&Op, m_FAbs(m_Value())) || match(&Op, m_FNeg(m_Value())))
       Ops.push_back(&Op);
+
+    // Zero cost vector instructions (e.g. extractelement 0 of i32 vectors)
+    // will be optimized away, and sinking them can help SDAG combines.
+    const DataLayout &DL = I->getModule()->getDataLayout();
+    auto IsFreeExtractInsert = [&DL, this](VectorType *VecType,
+                                           unsigned VecIndex) {
+      unsigned EltSize = DL.getTypeSizeInBits(VecType->getElementType());
+      return EltSize >= 32 ||
+             (EltSize == 16 && VecIndex == 0 && ST->has16BitInsts());
+    };
+
+    uint64_t VecIndex;
+    Value *Vec;
+    if (match(Op.get(), m_ExtractElt(m_Value(Vec), m_ConstantInt(VecIndex)))) {
+      Instruction *VecOpInst =
+          dyn_cast<Instruction>(cast<Instruction>(Op.get())->getOperand(0));
+      // If a zero cost extractelement instruction is the only use of the
+      // vector, then it may be combined with the def.
+      if (VecOpInst && VecOpInst->hasOneUse())
+        continue;
+
+      if (IsFreeExtractInsert(cast<VectorType>(Vec->getType()), VecIndex))
+        Ops.push_back(&Op);
+
+      continue;
+    }
+
+    if (match(Op.get(),
+              m_InsertElt(m_Value(Vec), m_Value(), m_ConstantInt(VecIndex)))) {
+      if (IsFreeExtractInsert(cast<VectorType>(Vec->getType()), VecIndex))
+        Ops.push_back(&Op);
+
+      continue;
+    }
+
+    if (auto *Shuffle = dyn_cast<ShuffleVectorInst>(Op.get())) {
+      if (Shuffle->isIdentity()) {
+        Ops.push_back(&Op);
+        continue;
+      }
+
+      unsigned EltSize = DL.getTypeSizeInBits(
+          cast<VectorType>(Shuffle->getType())->getElementType());
+
+      // For i32 (or greater) shufflevectors, these will be lowered into a
+      // series of insert / extract elements, which will be coalesced away.
+      if (EltSize >= 32) {
+        Ops.push_back(&Op);
+        continue;
+      }
+
+      if (EltSize < 16 || !ST->has16BitInsts())
+        continue;
+
+      int NumSubElts, SubIndex;
+      if (Shuffle->changesLength()) {
+        if (Shuffle->increasesLength() && Shuffle->isIdentityWithPadding()) {
+          Ops.push_back(&Op);
+          continue;
+        }
+
+        if (Shuffle->isExtractSubvectorMask(SubIndex) ||
+            Shuffle->isInsertSubvectorMask(NumSubElts, SubIndex)) {
+          if (!(SubIndex % 2)) {
+            Ops.push_back(&Op);
+            continue;
+          }
+        }
+      }
+
+      if (Shuffle->isReverse() || Shuffle->isZeroEltSplat() ||
+          Shuffle->isSingleSource()) {
+        Ops.push_back(&Op);
+        continue;
+      }
+
+      if (Shuffle->isInsertSubvectorMask(NumSubElts, SubIndex)) {
+        if (!(SubIndex % 2)) {
+          Ops.push_back(&Op);
+          continue;
+        }
+      }
+    }
   }
 
   return !Ops.empty();
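
The heart of the new sinking rules is the IsFreeExtractInsert predicate: an extract or insert is free when the element fills a full 32-bit lane, or when it is the low 16-bit half of a dword on subtargets with 16-bit instructions. Likewise, the SubIndex % 2 checks above accept only even subvector offsets, which keep 16-bit elements dword-aligned. A stand-alone restatement of the predicate (a sketch under those assumptions, not the committed code; Has16BitInsts stands in for the subtarget query):

#include <cassert>

// An extract/insert is "free" when the element occupies a full 32-bit lane,
// or when it is a 16-bit element at index 0 (the low half of a dword) on
// subtargets with 16-bit instructions.
static bool isFreeExtractInsert(unsigned EltSizeInBits, unsigned VecIndex,
                                bool Has16BitInsts) {
  return EltSizeInBits >= 32 ||
         (EltSizeInBits == 16 && VecIndex == 0 && Has16BitInsts);
}

int main() {
  assert(isFreeExtractInsert(32, 3, false)); // i32 lanes are always free
  assert(isFreeExtractInsert(16, 0, true));  // low half of a dword
  assert(!isFreeExtractInsert(16, 1, true)); // high half needs extra work
  assert(!isFreeExtractInsert(8, 0, true));  // i8 extracts are never free here
  return 0;
}
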
@@ -1452,3 +1519,30 @@ unsigned GCNTTIImpl::getPrefetchDistance() const {
 bool GCNTTIImpl::shouldPrefetchAddressSpace(unsigned AS) const {
   return AMDGPU::isFlatGlobalAddrSpace(AS);
 }
+
+InstructionCost GCNTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
+                                            Align Alignment,
+                                            unsigned AddressSpace,
+                                            TTI::TargetCostKind CostKind,
+                                            TTI::OperandValueInfo OpInfo,
+                                            const Instruction *I) {
+  if (VectorType *VecTy = dyn_cast<VectorType>(Src)) {
+    if ((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
+        VecTy->getElementType()->isIntegerTy(8)) {
+      return divideCeil(DL.getTypeSizeInBits(VecTy) - 1,
+                        getLoadStoreVecRegBitWidth(AddressSpace));
+    }
+  }
+  return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind,
+                                OpInfo, I);
+}
+
+unsigned GCNTTIImpl::getNumberOfParts(Type *Tp) {
+  if (VectorType *VecTy = dyn_cast<VectorType>(Tp)) {
+    if (VecTy->getElementType()->isIntegerTy(8)) {
+      unsigned ElementCount = VecTy->getElementCount().getFixedValue();
+      return divideCeil(ElementCount - 1, 4);
+    }
+  }
+  return BaseT::getNumberOfParts(Tp);
+}
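
A worked example of the two new hooks, assuming a 128-bit result from getLoadStoreVecRegBitWidth (the actual width varies by address space); divideCeilModel is a local stand-in for llvm::divideCeil:

#include <cassert>
#include <cstdint>

// Local stand-in for llvm::divideCeil: rounds the quotient up.
static uint64_t divideCeilModel(uint64_t Num, uint64_t Den) {
  return (Num + Den - 1) / Den;
}

int main() {
  // getMemoryOpCost for i8 vectors: divideCeil(bits - 1, regBitWidth).
  assert(divideCeilModel(4 * 8 - 1, 128) == 1);  // v4i8 costs like one dword
  assert(divideCeilModel(32 * 8 - 1, 128) == 2); // v32i8 needs two 128-bit ops

  // getNumberOfParts for i8 vectors: divideCeil(NumElts - 1, 4).
  assert(divideCeilModel(4 - 1, 4) == 1); // v4i8 -> one packed dword part
  assert(divideCeilModel(8 - 1, 4) == 2); // v8i8 -> two parts
  return 0;
}
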

llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h

Lines changed: 14 additions & 1 deletion
@@ -118,7 +118,6 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
     return TTI::PSK_FastHardware;
   }
 
-  unsigned getNumberOfParts(Type *Tp);
   unsigned getNumberOfRegisters(unsigned RCID) const;
   TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind Vector) const;
   unsigned getMinVectorRegisterBitWidth() const;
@@ -278,6 +277,20 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
 
   /// \return if target wants to issue a prefetch in address space \p AS.
   bool shouldPrefetchAddressSpace(unsigned AS) const override;
+
+  /// Account for loads of i8 vector types to have reduced cost. For
+  /// example, the cost of loading 4 i8 values is the cost of loading a
+  /// single i32 value.
+  InstructionCost getMemoryOpCost(
+      unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace,
+      TTI::TargetCostKind CostKind,
+      TTI::OperandValueInfo OpInfo = {TTI::OK_AnyValue, TTI::OP_None},
+      const Instruction *I = nullptr);
+
+  /// When counting parts on AMD GPUs, account for i8s being grouped
+  /// together under a single i32 value. Otherwise fall back to the base
+  /// implementation.
+  unsigned getNumberOfParts(Type *Tp);
 };
 
 } // end namespace llvm
