ROCm
diff --git a/‎llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp‎
Lines changed: 34 additions & 12 deletions b/‎llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp‎
Lines changed: 34 additions & 12 deletions
diff --git a/‎llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h‎
Lines changed: 1 addition & 0 deletions b/‎llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎llvm/lib/Target/AMDGPU/GCNSubtarget.cpp‎
Lines changed: 7 additions & 0 deletions b/‎llvm/lib/Target/AMDGPU/GCNSubtarget.cpp‎
Lines changed: 7 additions & 0 deletions
diff --git a/‎llvm/lib/Target/AMDGPU/GCNSubtarget.h‎
Lines changed: 6 additions & 0 deletions b/‎llvm/lib/Target/AMDGPU/GCNSubtarget.h‎
Lines changed: 6 additions & 0 deletions
@@ -313,6 +313,24 @@ bool GCNTTIImpl::hasBranchDivergence(const Function *F) const {
   return !F || !ST->isSingleLaneExecution(*F);
 }
 
+unsigned GCNTTIImpl::getNumberOfParts(Type *Tp) {
+  // For certain 8 bit ops, we can pack a v4i8 into a single part
+  // (e.g. v4i8 shufflevectors -> v_perm v4i8, v4i8). Thus, we
+  // do not limit the numberOfParts for 8 bit vectors to the
+  // legalization costs of such. It is left up to other target
+  // queries (e.g. get*InstrCost) to decide the proper handling
+  // of 8 bit vectors.
+  if (FixedVectorType *VTy = dyn_cast<FixedVectorType>(Tp)) {
+    if (ST->shouldCoerceIllegalTypes() &&
+        DL.getTypeSizeInBits(VTy->getElementType()) == 8) {
+      unsigned ElCount = VTy->getElementCount().getFixedValue();
+      return PowerOf2Ceil(ElCount / 4);
+    }
+  }
+
+  return BaseT::getNumberOfParts(Tp);
+}
+
 unsigned GCNTTIImpl::getNumberOfRegisters(unsigned RCID) const {
   // NB: RCID is not an RCID. In fact it is 0 or 1 for scalar or vector
   // registers. See getRegisterClassForType for the implementation.
@@ -344,9 +362,11 @@ unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
 unsigned GCNTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
   if (Opcode == Instruction::Load || Opcode == Instruction::Store)
     return 32 * 4 / ElemWidth;
-  return (ElemWidth == 16 && ST->has16BitInsts()) ? 2
-       : (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
-       : 1;
+
+  return (ST->shouldCoerceIllegalTypes() && ElemWidth == 8) ? 4
+         : (ElemWidth == 16)                                ? 2
+         : (ElemWidth == 32 && ST->hasPackedFP32Ops())      ? 2
+                                                            : 1;
 }
 
 unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
@@ -1131,14 +1151,16 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
 
   Kind = improveShuffleKindFromMask(Kind, Mask, VT, Index, SubTp);
 
-  // Larger vector widths may require additional instructions, but are
-  // typically cheaper than scalarized versions.
-  unsigned NumVectorElts = cast<FixedVectorType>(VT)->getNumElements();
+  unsigned ScalarSize = DL.getTypeSizeInBits(VT->getElementType());
   if (ST->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
-      DL.getTypeSizeInBits(VT->getElementType()) == 16) {
-    bool HasVOP3P = ST->hasVOP3PInsts();
+      (ScalarSize == 16 ||
+       (ScalarSize == 8 && ST->shouldCoerceIllegalTypes()))) {
+    // Larger vector widths may require additional instructions, but are
+    // typically cheaper than scalarized versions.
+    unsigned NumVectorElts = cast<FixedVectorType>(VT)->getNumElements();
     unsigned RequestedElts =
         count_if(Mask, [](int MaskElt) { return MaskElt != -1; });
+    unsigned EltsPerReg = 32 / ScalarSize;
     if (RequestedElts == 0)
       return 0;
     switch (Kind) {
@@ -1147,9 +1169,9 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
     case TTI::SK_PermuteSingleSrc: {
       // With op_sel VOP3P instructions freely can access the low half or high
       // half of a register, so any swizzle of two elements is free.
-      if (HasVOP3P && NumVectorElts == 2)
+      if (ST->hasVOP3PInsts() && ScalarSize == 16 && NumVectorElts == 2)
         return 0;
-      unsigned NumPerms = alignTo(RequestedElts, 2) / 2;
+      unsigned NumPerms = alignTo(RequestedElts, EltsPerReg) / EltsPerReg;
       // SK_Broadcast just reuses the same mask
       unsigned NumPermMasks = Kind == TTI::SK_Broadcast ? 1 : NumPerms;
       return NumPerms + NumPermMasks;
@@ -1161,12 +1183,12 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
         return 0;
       // Insert/extract subvectors only require shifts / extract code to get the
       // relevant bits
-      return alignTo(RequestedElts, 2) / 2;
+      return alignTo(RequestedElts, EltsPerReg) / EltsPerReg;
     }
     case TTI::SK_PermuteTwoSrc:
     case TTI::SK_Splice:
     case TTI::SK_Select: {
-      unsigned NumPerms = alignTo(RequestedElts, 2) / 2;
+      unsigned NumPerms = alignTo(RequestedElts, EltsPerReg) / EltsPerReg;
       // SK_Select just reuses the same mask
       unsigned NumPermMasks = Kind == TTI::SK_Select ? 1 : NumPerms;
       return NumPerms + NumPermMasks;
 
@@ -118,6 +118,7 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
     return TTI::PSK_FastHardware;
   }
 
+  unsigned getNumberOfParts(Type *Tp);
   unsigned getNumberOfRegisters(unsigned RCID) const;
   TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind Vector) const;
   unsigned getMinVectorRegisterBitWidth() const;
 
@@ -52,6 +52,11 @@ static cl::opt<unsigned>
                  cl::desc("Number of addresses from which to enable MIMG NSA."),
                  cl::init(2), cl::Hidden);
 
+static cl::opt<bool>
+    CoerceIllegal("amdgpu-coerce-illegal-types",
+                  cl::desc("Whether or not to coerce illegal types"),
+                  cl::ReallyHidden, cl::init(false));
+
 GCNSubtarget::~GCNSubtarget() = default;
 
 GCNSubtarget &GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
@@ -191,6 +196,8 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
   RegBankInfo = std::make_unique<AMDGPURegisterBankInfo>(*this);
   InstSelector =
       std::make_unique<AMDGPUInstructionSelector>(*this, *RegBankInfo, TM);
+
+  ShouldCoerceIllegalTypes = CoerceIllegal;
 }
 
 const SelectionDAGTargetInfo *GCNSubtarget::getSelectionDAGInfo() const {
 
@@ -259,6 +259,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
   // Dummy feature to use for assembler in tablegen.
   bool FeatureDisable = false;
 
+  bool ShouldCoerceIllegalTypes = false;
+
 private:
   SIInstrInfo InstrInfo;
   SITargetLowering TLInfo;
@@ -1445,6 +1447,10 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
   // of sign-extending.
   bool hasGetPCZeroExtension() const { return GFX12Insts; }
 
+  /// \returns whether or not we should coerce illegal types into vectors of
+  // legal types for values that span basic blocks.
+  bool shouldCoerceIllegalTypes() const { return ShouldCoerceIllegalTypes; }
+
   /// \returns SGPR allocation granularity supported by the subtarget.
   unsigned getSGPRAllocGranule() const {
     return AMDGPU::IsaInfo::getSGPRAllocGranule(this);
Original file line number	Diff line number	Diff line change
`@@ -118,6 +118,7 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {`
`118`	`118`	`return TTI::PSK_FastHardware;`
`119`	`119`	`}`
`120`	`120`
	`121`	`+ unsigned getNumberOfParts(Type *Tp);`
`121`	`122`	`unsigned getNumberOfRegisters(unsigned RCID) const;`
`122`	`123`	`TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind Vector) const;`
`123`	`124`	`unsigned getMinVectorRegisterBitWidth() const;`