Enable vectorization of i8 values.

doru1004 · doru1004 · commit d19fb9ae382e · 2025-04-18T17:24:12.000-04:00
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -344,9 +344,10 @@ unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
 unsigned GCNTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
   if (Opcode == Instruction::Load || Opcode == Instruction::Store)
     return 32 * 4 / ElemWidth;
-  return (ElemWidth == 16 && ST->has16BitInsts()) ? 2
-       : (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
-       : 1;
+  return ElemWidth == 8                                ? 4
+         : (ElemWidth == 16 && ST->has16BitInsts())    ? 2
+         : (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
+                                                       : 1;
 }
 
 unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
@@ -562,6 +563,7 @@ InstructionCost GCNTTIImpl::getArithmeticInstrCost(
     if (ST->has16BitInsts() && SLT == MVT::i16)
       NElts = (NElts + 1) / 2;
 
+    // i32
     return LT.first * NElts * getFullRateInstrCost();
   case ISD::MUL: {
     const int QuarterRateCost = getQuarterRateInstrCost(CostKind);
@@ -1423,3 +1425,30 @@ void GCNTTIImpl::collectKernelLaunchBounds(
   LB.push_back({"amdgpu-waves-per-eu[0]", WavesPerEU.first});
   LB.push_back({"amdgpu-waves-per-eu[1]", WavesPerEU.second});
 }
+
+InstructionCost GCNTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
+                                            Align Alignment,
+                                            unsigned AddressSpace,
+                                            TTI::TargetCostKind CostKind,
+                                            TTI::OperandValueInfo OpInfo,
+                                            const Instruction *I) {
+  if (VectorType *VecTy = dyn_cast<VectorType>(Src))
+    if (Opcode == Instruction::Load &&
+        VecTy->getElementType() ==
+            IntegerType::getInt8Ty(VecTy->getContext())) {
+      unsigned ElementCount = VecTy->getElementCount().getFixedValue();
+      return ((ElementCount - 1) / 4) + 1;
+    }
+  return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind,
+                                OpInfo, I);
+}
+
+unsigned GCNTTIImpl::getNumberOfParts(Type *Tp) {
+  if (VectorType *VecTy = dyn_cast<VectorType>(Tp))
+    if (VecTy->getElementType() ==
+        IntegerType::getInt8Ty(VecTy->getContext())) {
+      unsigned ElementCount = VecTy->getElementCount().getFixedValue();
+      return ((ElementCount - 1) / 4) + 1;
+    }
+  return BaseT::getNumberOfParts(Tp);
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -282,6 +282,20 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
   void collectKernelLaunchBounds(
       const Function &F,
       SmallVectorImpl<std::pair<StringRef, int64_t>> &LB) const;
+
+  /// Account for loads of i8 vector types to have reduced cost. For
+  /// example the cost of load 4 i8s values is one is the cost of loading
+  /// a single i32 value.
+  InstructionCost getMemoryOpCost(
+      unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace,
+      TTI::TargetCostKind CostKind,
+      TTI::OperandValueInfo OpInfo = {TTI::OK_AnyValue, TTI::OP_None},
+      const Instruction *I = nullptr);
+
+  /// When counting parts on AMD GPUs, account for i8s being grouped
+  /// together under a single i32 value. Otherwise fall back to base
+  /// implementation.
+  unsigned getNumberOfParts(Type *Tp);
 };
 
 } // end namespace llvm
diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/vectorize-i8-as-i32.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/vectorize-i8-as-i32.ll