[AArch64] Increase scatter overhead on Neoverse-V2 (#101296)

madhur13490 · web-flow · commit b73771cf0fc4 · 2024-08-14T10:12:40.000+05:30
This patch increases scatter overhead on Neoverse-V2 to 13. This
benefits s128 kernel from TSVC_2 test suite.
SPEC 17, RAJAPerf, and Sptter are unaffected by this patch.

This patch boosts s128 kernel's performance from TSVC test suite by about
40% as this enables vectorization. Also, handle minor code refactoring
for gather related part.
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -233,9 +233,12 @@ void AArch64Subtarget::initializeProperties(bool HasMinSize) {
     PrefLoopAlignment = Align(32);
     MaxBytesForLoopAlignment = 16;
     break;
+  case NeoverseV2:
+    // Specialize cost for Neoverse-V2.
+    ScatterOverhead = 13;
+    LLVM_FALLTHROUGH;
   case NeoverseN2:
   case NeoverseN3:
-  case NeoverseV2:
   case NeoverseV3:
     PrefFunctionAlignment = Align(16);
     PrefLoopAlignment = Align(32);
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h
@@ -59,6 +59,9 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo {
   uint8_t MaxInterleaveFactor = 2;
   uint8_t VectorInsertExtractBaseCost = 2;
   uint16_t CacheLineSize = 0;
+  // Default scatter/gather overhead.
+  unsigned ScatterOverhead = 10;
+  unsigned GatherOverhead = 10;
   uint16_t PrefetchDistance = 0;
   uint16_t MinPrefetchStride = 1;
   unsigned MaxPrefetchIterationsAhead = UINT_MAX;
@@ -225,6 +228,8 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo {
   unsigned getMaxInterleaveFactor() const { return MaxInterleaveFactor; }
   unsigned getVectorInsertExtractBaseCost() const;
   unsigned getCacheLineSize() const override { return CacheLineSize; }
+  unsigned getScatterOverhead() const { return ScatterOverhead; }
+  unsigned getGatherOverhead() const { return GatherOverhead; }
   unsigned getPrefetchDistance() const override { return PrefetchDistance; }
   unsigned getMinPrefetchStride(unsigned NumMemAccesses,
                                 unsigned NumStridedMemAccesses,
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -3411,8 +3411,26 @@ AArch64TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
   return LT.first;
 }
 
-static unsigned getSVEGatherScatterOverhead(unsigned Opcode) {
-  return Opcode == Instruction::Load ? SVEGatherOverhead : SVEScatterOverhead;
+// This function returns gather/scatter overhead either from
+// user-provided value or specialized values per-target from \p ST.
+static unsigned getSVEGatherScatterOverhead(unsigned Opcode,
+                                            const AArch64Subtarget *ST) {
+  assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
+         "Should be called on only load or stores.");
+  switch (Opcode) {
+  case Instruction::Load:
+    if (SVEGatherOverhead.getNumOccurrences() > 0)
+      return SVEGatherOverhead;
+    return ST->getGatherOverhead();
+    break;
+  case Instruction::Store:
+    if (SVEScatterOverhead.getNumOccurrences() > 0)
+      return SVEScatterOverhead;
+    return ST->getScatterOverhead();
+    break;
+  default:
+    llvm_unreachable("Shouldn't have reached here");
+  }
 }
 
 InstructionCost AArch64TTIImpl::getGatherScatterOpCost(
@@ -3444,9 +3462,7 @@ InstructionCost AArch64TTIImpl::getGatherScatterOpCost(
       getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind,
                       {TTI::OK_AnyValue, TTI::OP_None}, I);
   // Add on an overhead cost for using gathers/scatters.
-  // TODO: At the moment this is applied unilaterally for all CPUs, but at some
-  // point we may want a per-CPU overhead.
-  MemOpCost *= getSVEGatherScatterOverhead(Opcode);
+  MemOpCost *= getSVEGatherScatterOverhead(Opcode, ST);
   return LT.first * MemOpCost * getMaxNumElements(LegalVF);
 }
 
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scatter-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scatter-cost.ll
@@ -0,0 +1,10 @@
+; RUN: opt -mtriple aarch64 -mcpu=neoverse-v2 -passes="print<cost-model>" -disable-output < %s 2>&1 | FileCheck %s -check-prefix=CHECK-V2
+; RUN: opt -mtriple aarch64 -mattr=+sve2  -passes="print<cost-model>" -disable-output < %s 2>&1 | FileCheck %s -check-prefix=CHECK-GENERIC
+; CHECK-V2: Cost Model: Found an estimated cost of 52 for instruction: call void @llvm.masked.scatter.nxv4f32
+; CHECK-GENERIC: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.scatter.nxv4f32
+
+define void @masked_scatter_nxv8f32_i64(<vscale x 4 x float> %data, <vscale x 4 x ptr> %b, <vscale x 4 x i64> %V) #0 {
+  call void @llvm.masked.scatter.nxv4f32.nxv4p0(<vscale x 4 x float> %data, <vscale x 4 x ptr> %b, i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
+  ret void
+}
+