Skip to content

Commit 0eb5fb6

Browse files
committed
[AArch64] Enable maximize vector bandwidth for Neoverse-V1, V2 and N1
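This PR enables maximizing vector bandwidth for the Neoverse-V1, V2 and N1 CPUs: on these cores `shouldMaximizeVectorBandwidth` now returns true for every vector register kind, while all other CPUs keep the previous behaviour (fixed-width vectors only, and only when NEON is available).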
1 parent 2286118 commit 0eb5fb6

File tree

3 files changed

+61
-38
lines changed

3 files changed

+61
-38
lines changed

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

Lines changed: 9 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -371,8 +371,15 @@ AArch64TTIImpl::getInlineCallPenalty(const Function *F, const CallBase &Call,
371371
bool AArch64TTIImpl::shouldMaximizeVectorBandwidth(
372372
TargetTransformInfo::RegisterKind K) const {
373373
assert(K != TargetTransformInfo::RGK_Scalar);
374-
return (K == TargetTransformInfo::RGK_FixedWidthVector &&
375-
ST->isNeonAvailable());
374+
switch (ST->getProcFamily()) {
375+
case AArch64Subtarget::NeoverseV2:
376+
case AArch64Subtarget::NeoverseV1:
377+
case AArch64Subtarget::NeoverseN1:
378+
return true;
379+
default:
380+
return (K == TargetTransformInfo::RGK_FixedWidthVector &&
381+
ST->isNeonAvailable());
382+
}
376383
}
377384

378385
/// Calculate the cost of materializing a 64-bit value. This helper

llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -50,7 +50,7 @@ define i64 @test_external_iv_user(ptr %a, ptr %b) #0 {
5050
; CHECK-NEXT: Cost of 0 for VF 16: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ]
5151
; CHECK-NEXT: Cost of 0 for VF 16: EMIT vp<{{.+}}> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
5252
; CHECK: Cost for VF 16: 57
53-
; CHECK: LV: Selecting VF: vscale x 2
53+
; CHECK: LV: Selecting VF: 16
5454
entry:
5555
br label %for.body
5656

llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll

Lines changed: 51 additions & 35 deletions
Original file line number | Diff line number | Diff line change
@@ -674,59 +674,75 @@ define i32 @zext_sub_reduc_i8_i32_has_neon_dotprod(ptr %a) #1 {
674674
; CHECK-INTERLEAVE1-NEXT: entry:
675675
; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_PH:%.*]]
676676
; CHECK-INTERLEAVE1: vector.ph:
677+
; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
678+
; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = mul nuw i64 [[TMP0]], 8
679+
; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 1025, [[TMP2]]
680+
; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 1025, [[N_MOD_VF]]
677681
; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]]
678682
; CHECK-INTERLEAVE1: vector.body:
679683
; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
680-
; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
684+
; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
681685
; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
682-
; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
683-
; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
684-
; CHECK-INTERLEAVE1-NEXT: [[TMP4]] = sub <16 x i32> [[VEC_PHI]], [[TMP3]]
685-
; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
686-
; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
686+
; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP1]], align 1
687+
; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
688+
; CHECK-INTERLEAVE1-NEXT: [[TMP4]] = sub <vscale x 8 x i32> [[VEC_PHI]], [[TMP3]]
689+
; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
690+
; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
687691
; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
688692
; CHECK-INTERLEAVE1: middle.block:
689-
; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP4]])
690-
; CHECK-INTERLEAVE1-NEXT: br label [[SCALAR_PH:%.*]]
693+
; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> [[TMP4]])
694+
; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 1025, [[N_VEC]]
695+
; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]]
691696
; CHECK-INTERLEAVE1: scalar.ph:
692697
;
693698
; CHECK-INTERLEAVED-LABEL: define i32 @zext_sub_reduc_i8_i32_has_neon_dotprod(
694699
; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]]) #[[ATTR2]] {
695700
; CHECK-INTERLEAVED-NEXT: entry:
696701
; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_PH:%.*]]
697702
; CHECK-INTERLEAVED: vector.ph:
703+
; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
704+
; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = mul nuw i64 [[TMP0]], 32
705+
; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 1025, [[TMP2]]
706+
; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 1025, [[N_MOD_VF]]
698707
; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
699708
; CHECK-INTERLEAVED: vector.body:
700709
; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
701-
; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
702-
; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
703-
; CHECK-INTERLEAVED-NEXT: [[VEC_PHI2:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
704-
; CHECK-INTERLEAVED-NEXT: [[VEC_PHI3:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ]
710+
; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ]
711+
; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ]
712+
; CHECK-INTERLEAVED-NEXT: [[VEC_PHI2:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ]
713+
; CHECK-INTERLEAVED-NEXT: [[VEC_PHI3:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
705714
; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
706-
; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP1]], i32 16
707-
; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 32
708-
; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP1]], i32 48
709-
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
710-
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
711-
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
712-
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD6:%.*]] = load <16 x i8>, ptr [[TMP9]], align 1
713-
; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
714-
; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
715-
; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = zext <16 x i8> [[WIDE_LOAD5]] to <16 x i32>
716-
; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = zext <16 x i8> [[WIDE_LOAD6]] to <16 x i32>
717-
; CHECK-INTERLEAVED-NEXT: [[TMP6]] = sub <16 x i32> [[VEC_PHI]], [[TMP4]]
718-
; CHECK-INTERLEAVED-NEXT: [[TMP7]] = sub <16 x i32> [[VEC_PHI1]], [[TMP5]]
719-
; CHECK-INTERLEAVED-NEXT: [[TMP10]] = sub <16 x i32> [[VEC_PHI2]], [[TMP12]]
720-
; CHECK-INTERLEAVED-NEXT: [[TMP11]] = sub <16 x i32> [[VEC_PHI3]], [[TMP14]]
721-
; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 64
722-
; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
723-
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
715+
; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
716+
; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = shl nuw i64 [[TMP3]], 3
717+
; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP1]], i64 [[TMP4]]
718+
; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
719+
; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = shl nuw i64 [[TMP6]], 4
720+
; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP1]], i64 [[TMP7]]
721+
; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
722+
; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 24
723+
; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP1]], i64 [[TMP10]]
724+
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP1]], align 1
725+
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 8 x i8>, ptr [[TMP5]], align 1
726+
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD5:%.*]] = load <vscale x 8 x i8>, ptr [[TMP8]], align 1
727+
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD6:%.*]] = load <vscale x 8 x i8>, ptr [[TMP11]], align 1
728+
; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
729+
; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD4]] to <vscale x 8 x i32>
730+
; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD5]] to <vscale x 8 x i32>
731+
; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD6]] to <vscale x 8 x i32>
732+
; CHECK-INTERLEAVED-NEXT: [[TMP16]] = sub <vscale x 8 x i32> [[VEC_PHI]], [[TMP12]]
733+
; CHECK-INTERLEAVED-NEXT: [[TMP17]] = sub <vscale x 8 x i32> [[VEC_PHI1]], [[TMP13]]
734+
; CHECK-INTERLEAVED-NEXT: [[TMP18]] = sub <vscale x 8 x i32> [[VEC_PHI2]], [[TMP14]]
735+
; CHECK-INTERLEAVED-NEXT: [[TMP19]] = sub <vscale x 8 x i32> [[VEC_PHI3]], [[TMP15]]
736+
; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
737+
; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
738+
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
724739
; CHECK-INTERLEAVED: middle.block:
725-
; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <16 x i32> [[TMP7]], [[TMP6]]
726-
; CHECK-INTERLEAVED-NEXT: [[BIN_RDX7:%.*]] = add <16 x i32> [[TMP10]], [[BIN_RDX]]
727-
; CHECK-INTERLEAVED-NEXT: [[BIN_RDX8:%.*]] = add <16 x i32> [[TMP11]], [[BIN_RDX7]]
728-
; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX8]])
729-
; CHECK-INTERLEAVED-NEXT: br label [[SCALAR_PH:%.*]]
740+
; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <vscale x 8 x i32> [[TMP17]], [[TMP16]]
741+
; CHECK-INTERLEAVED-NEXT: [[BIN_RDX7:%.*]] = add <vscale x 8 x i32> [[TMP18]], [[BIN_RDX]]
742+
; CHECK-INTERLEAVED-NEXT: [[BIN_RDX8:%.*]] = add <vscale x 8 x i32> [[TMP19]], [[BIN_RDX7]]
743+
; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> [[BIN_RDX8]])
744+
; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 1025, [[N_VEC]]
745+
; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]]
730746
; CHECK-INTERLEAVED: scalar.ph:
731747
;
732748
; CHECK-MAXBW-LABEL: define i32 @zext_sub_reduc_i8_i32_has_neon_dotprod(

0 commit comments

Comments
 (0)