11 changes: 5 additions & 6 deletions llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -4894,15 +4894,14 @@ void AArch64TTIImpl::getUnrollingPreferences(
   // Disable partial & runtime unrolling on -Os.
   UP.PartialOptSizeThreshold = 0;
 
+  // No need to unroll auto-vectorized loops
+  if (findStringMetadataForLoop(L, "llvm.loop.isvectorized"))
+    return;
+
   // Scan the loop: don't unroll loops with calls as this could prevent
-  // inlining. Don't unroll vector loops either, as they don't benefit much from
-  // unrolling.
+  // inlining.
   for (auto *BB : L->getBlocks()) {
     for (auto &I : *BB) {
-      // Don't unroll vectorised loop.
-      if (I.getType()->isVectorTy())
-        return;
-
       if (isa<CallBase>(I)) {
         if (isa<CallInst>(I) || isa<InvokeInst>(I))
           if (const Function *F = cast<CallBase>(I).getCalledFunction())
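For context before the test diff: the string the new early exit looks for is the marker LoopVectorize attaches to loops it has already vectorized. A minimal sketch of a tagged latch (names are placeholders; the metadata form mirrors the new test below):

  br i1 %done, label %exit, label %vector.body, !llvm.loop !0
  ...
!0 = !{!"llvm.loop.isvectorized", i32 1}

When findStringMetadataForLoop finds this entry, getUnrollingPreferences now returns immediately; hand-written vector loops without the tag fall through to the normal unrolling heuristics instead of being rejected for merely containing vector-typed instructions.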
203 changes: 203 additions & 0 deletions llvm/test/Transforms/LoopUnroll/AArch64/vector.ll
@@ -0,0 +1,203 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt -p loop-unroll -mtriple=arm64-apple-macosx -mcpu=apple-m1 -S %s | FileCheck --check-prefixes=COMMON,APPLE %s
; RUN: opt -p loop-unroll -S %s -mtriple aarch64 | FileCheck %s -check-prefixes=COMMON,GENERIC
define void @reverse(ptr %dst, ptr %src, i64 %len) {
; APPLE-LABEL: define void @reverse(
; APPLE-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], i64 [[LEN:%.*]]) #[[ATTR0:[0-9]+]] {
; APPLE-NEXT: [[ENTRY:.*]]:
; APPLE-NEXT: [[TMP0:%.*]] = add i64 [[LEN]], -1
; APPLE-NEXT: [[XTRAITER:%.*]] = and i64 [[LEN]], 7
; APPLE-NEXT: [[TMP1:%.*]] = icmp ult i64 [[TMP0]], 7
; APPLE-NEXT: br i1 [[TMP1]], label %[[EXIT_UNR_LCSSA:.*]], label %[[ENTRY_NEW:.*]]
; APPLE: [[ENTRY_NEW]]:
; APPLE-NEXT: [[UNROLL_ITER:%.*]] = sub i64 [[LEN]], [[XTRAITER]]
; APPLE-NEXT: br label %[[FOR_BODY:.*]]
; APPLE: [[FOR_BODY]]:
; APPLE-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[IV_NEXT_7:%.*]], %[[FOR_BODY]] ]
; APPLE-NEXT: [[NITER:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[NITER_NEXT_7:%.*]], %[[FOR_BODY]] ]
; APPLE-NEXT: [[TMP2:%.*]] = sub nsw i64 [[LEN]], [[IV]]
; APPLE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP2]]
; APPLE-NEXT: [[TMP3:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
; APPLE-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV]]
; APPLE-NEXT: store <4 x float> [[TMP3]], ptr [[ARRAYIDX2]], align 16
; APPLE-NEXT: [[IV_NEXT:%.*]] = add nuw nsw i64 [[IV]], 1
; APPLE-NEXT: [[TMP4:%.*]] = sub nsw i64 [[LEN]], [[IV_NEXT]]
; APPLE-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP4]]
; APPLE-NEXT: [[TMP5:%.*]] = load <4 x float>, ptr [[ARRAYIDX_1]], align 16
; APPLE-NEXT: [[ARRAYIDX2_1:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_NEXT]]
; APPLE-NEXT: store <4 x float> [[TMP5]], ptr [[ARRAYIDX2_1]], align 16
; APPLE-NEXT: [[IV_NEXT_1:%.*]] = add nuw nsw i64 [[IV]], 2
; APPLE-NEXT: [[TMP6:%.*]] = sub nsw i64 [[LEN]], [[IV_NEXT_1]]
; APPLE-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP6]]
; APPLE-NEXT: [[TMP7:%.*]] = load <4 x float>, ptr [[ARRAYIDX_2]], align 16
; APPLE-NEXT: [[ARRAYIDX2_2:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_NEXT_1]]
; APPLE-NEXT: store <4 x float> [[TMP7]], ptr [[ARRAYIDX2_2]], align 16
; APPLE-NEXT: [[IV_NEXT_2:%.*]] = add nuw nsw i64 [[IV]], 3
; APPLE-NEXT: [[TMP8:%.*]] = sub nsw i64 [[LEN]], [[IV_NEXT_2]]
; APPLE-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP8]]
; APPLE-NEXT: [[TMP9:%.*]] = load <4 x float>, ptr [[ARRAYIDX_3]], align 16
; APPLE-NEXT: [[ARRAYIDX2_3:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_NEXT_2]]
; APPLE-NEXT: store <4 x float> [[TMP9]], ptr [[ARRAYIDX2_3]], align 16
; APPLE-NEXT: [[IV_NEXT_3:%.*]] = add nuw nsw i64 [[IV]], 4
; APPLE-NEXT: [[TMP10:%.*]] = sub nsw i64 [[LEN]], [[IV_NEXT_3]]
; APPLE-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP10]]
; APPLE-NEXT: [[TMP11:%.*]] = load <4 x float>, ptr [[ARRAYIDX_4]], align 16
; APPLE-NEXT: [[ARRAYIDX2_4:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_NEXT_3]]
; APPLE-NEXT: store <4 x float> [[TMP11]], ptr [[ARRAYIDX2_4]], align 16
; APPLE-NEXT: [[IV_NEXT_4:%.*]] = add nuw nsw i64 [[IV]], 5
; APPLE-NEXT: [[TMP12:%.*]] = sub nsw i64 [[LEN]], [[IV_NEXT_4]]
; APPLE-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP12]]
; APPLE-NEXT: [[TMP13:%.*]] = load <4 x float>, ptr [[ARRAYIDX_5]], align 16
; APPLE-NEXT: [[ARRAYIDX2_5:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_NEXT_4]]
; APPLE-NEXT: store <4 x float> [[TMP13]], ptr [[ARRAYIDX2_5]], align 16
; APPLE-NEXT: [[IV_NEXT_5:%.*]] = add nuw nsw i64 [[IV]], 6
; APPLE-NEXT: [[TMP14:%.*]] = sub nsw i64 [[LEN]], [[IV_NEXT_5]]
; APPLE-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP14]]
; APPLE-NEXT: [[TMP15:%.*]] = load <4 x float>, ptr [[ARRAYIDX_6]], align 16
; APPLE-NEXT: [[ARRAYIDX2_6:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_NEXT_5]]
; APPLE-NEXT: store <4 x float> [[TMP15]], ptr [[ARRAYIDX2_6]], align 16
; APPLE-NEXT: [[IV_NEXT_6:%.*]] = add nuw nsw i64 [[IV]], 7
; APPLE-NEXT: [[TMP16:%.*]] = sub nsw i64 [[LEN]], [[IV_NEXT_6]]
; APPLE-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP16]]
; APPLE-NEXT: [[TMP17:%.*]] = load <4 x float>, ptr [[ARRAYIDX_7]], align 16
; APPLE-NEXT: [[ARRAYIDX2_7:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_NEXT_6]]
; APPLE-NEXT: store <4 x float> [[TMP17]], ptr [[ARRAYIDX2_7]], align 16
; APPLE-NEXT: [[IV_NEXT_7]] = add nuw nsw i64 [[IV]], 8
; APPLE-NEXT: [[NITER_NEXT_7]] = add i64 [[NITER]], 8
; APPLE-NEXT: [[NITER_NCMP_7:%.*]] = icmp eq i64 [[NITER_NEXT_7]], [[UNROLL_ITER]]
; APPLE-NEXT: br i1 [[NITER_NCMP_7]], label %[[EXIT_UNR_LCSSA_LOOPEXIT:.*]], label %[[FOR_BODY]]
; APPLE: [[EXIT_UNR_LCSSA_LOOPEXIT]]:
; APPLE-NEXT: [[IV_UNR_PH:%.*]] = phi i64 [ [[IV_NEXT_7]], %[[FOR_BODY]] ]
; APPLE-NEXT: br label %[[EXIT_UNR_LCSSA]]
; APPLE: [[EXIT_UNR_LCSSA]]:
; APPLE-NEXT: [[IV_UNR:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_UNR_PH]], %[[EXIT_UNR_LCSSA_LOOPEXIT]] ]
; APPLE-NEXT: [[LCMP_MOD:%.*]] = icmp ne i64 [[XTRAITER]], 0
; APPLE-NEXT: br i1 [[LCMP_MOD]], label %[[FOR_BODY_EPIL_PREHEADER:.*]], label %[[EXIT:.*]]
; APPLE: [[FOR_BODY_EPIL_PREHEADER]]:
; APPLE-NEXT: br label %[[FOR_BODY_EPIL:.*]]
; APPLE: [[FOR_BODY_EPIL]]:
; APPLE-NEXT: [[IV_EPIL:%.*]] = phi i64 [ [[IV_UNR]], %[[FOR_BODY_EPIL_PREHEADER]] ], [ [[IV_NEXT_EPIL:%.*]], %[[FOR_BODY_EPIL]] ]
; APPLE-NEXT: [[EPIL_ITER:%.*]] = phi i64 [ 0, %[[FOR_BODY_EPIL_PREHEADER]] ], [ [[EPIL_ITER_NEXT:%.*]], %[[FOR_BODY_EPIL]] ]
; APPLE-NEXT: [[TMP18:%.*]] = sub nsw i64 [[LEN]], [[IV_EPIL]]
; APPLE-NEXT: [[ARRAYIDX_EPIL:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP18]]
; APPLE-NEXT: [[TMP19:%.*]] = load <4 x float>, ptr [[ARRAYIDX_EPIL]], align 16
; APPLE-NEXT: [[ARRAYIDX2_EPIL:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_EPIL]]
; APPLE-NEXT: store <4 x float> [[TMP19]], ptr [[ARRAYIDX2_EPIL]], align 16
; APPLE-NEXT: [[IV_NEXT_EPIL]] = add nuw nsw i64 [[IV_EPIL]], 1
; APPLE-NEXT: [[EXITCOND_NOT_EPIL:%.*]] = icmp eq i64 [[IV_NEXT_EPIL]], [[LEN]]
; APPLE-NEXT: [[EPIL_ITER_NEXT]] = add i64 [[EPIL_ITER]], 1
; APPLE-NEXT: [[EPIL_ITER_CMP:%.*]] = icmp ne i64 [[EPIL_ITER_NEXT]], [[XTRAITER]]
; APPLE-NEXT: br i1 [[EPIL_ITER_CMP]], label %[[FOR_BODY_EPIL]], label %[[EXIT_EPILOG_LCSSA:.*]], !llvm.loop [[LOOP0:![0-9]+]]
; APPLE: [[EXIT_EPILOG_LCSSA]]:
; APPLE-NEXT: br label %[[EXIT]]
; APPLE: [[EXIT]]:
; APPLE-NEXT: ret void
;
; GENERIC-LABEL: define void @reverse(
; GENERIC-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], i64 [[LEN:%.*]]) {
; GENERIC-NEXT: [[ENTRY:.*]]:
; GENERIC-NEXT: br label %[[FOR_BODY:.*]]
; GENERIC: [[FOR_BODY]]:
; GENERIC-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
; GENERIC-NEXT: [[TMP0:%.*]] = sub nsw i64 [[LEN]], [[IV]]
; GENERIC-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP0]]
; GENERIC-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
; GENERIC-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV]]
; GENERIC-NEXT: store <4 x float> [[TMP1]], ptr [[ARRAYIDX2]], align 16
; GENERIC-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; GENERIC-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[LEN]]
; GENERIC-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]]
; GENERIC: [[EXIT]]:
; GENERIC-NEXT: ret void
;
entry:
br label %for.body

for.body: ; preds = %entry, %for.body
%iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
%0 = sub nsw i64 %len, %iv
%arrayidx = getelementptr inbounds <4 x float>, ptr %src, i64 %0
%1 = load <4 x float>, ptr %arrayidx, align 16
%arrayidx2 = getelementptr inbounds nuw <4 x float>, ptr %dst, i64 %iv
store <4 x float> %1, ptr %arrayidx2, align 16
%iv.next = add nuw nsw i64 %iv, 1
%exitcond.not = icmp eq i64 %iv.next, %len
br i1 %exitcond.not, label %exit, label %for.body

exit: ; preds = %for.body
ret void
}
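To make the bookkeeping in the APPLE checks above concrete, here is the arithmetic for a hypothetical trip count of %len = 10 (the value is an assumption for illustration, not part of the test):

; %xtraiter    = 10 & 7 = 2      ; epilogue trip count
; icmp ult 9, 7 is false         ; at least 8 iterations, so enter entry.new
; %unroll.iter = 10 - 2 = 8      ; the 8x-unrolled for.body runs exactly once
; for.body.epil then runs twice to cover the last two iterations.

The GENERIC run leaves the loop completely untouched, so the 8x runtime unroll of this hand-written vector loop is specific to the apple-m1 preferences; before this patch, the isVectorTy bail-out would have kept the APPLE output identical to GENERIC.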


define void @saxpy_tripcount1K_av1(ptr %dst, ptr %src, float %a) {
; APPLE-LABEL: define void @saxpy_tripcount1K_av1(
; APPLE-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], float [[A:%.*]]) #[[ATTR0]] {
; APPLE-NEXT: [[ENTRY:.*]]:
; APPLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[A]], i64 0
; APPLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
; APPLE-NEXT: br label %[[VECTOR_BODY:.*]]
; APPLE: [[VECTOR_BODY]]:
; APPLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; APPLE-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX]]
; APPLE-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP0]], align 4
; APPLE-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX]]
; APPLE-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
; APPLE-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD12]])
; APPLE-NEXT: store <4 x float> [[TMP2]], ptr [[TMP1]], align 4
; APPLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; APPLE-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; APPLE-NEXT: br i1 [[TMP3]], label %[[EXIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
; APPLE: [[EXIT]]:
; APPLE-NEXT: ret void
;
; GENERIC-LABEL: define void @saxpy_tripcount1K_av1(
; GENERIC-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], float [[A:%.*]]) {
; GENERIC-NEXT: [[ENTRY:.*]]:
; GENERIC-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[A]], i64 0
; GENERIC-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
; GENERIC-NEXT: br label %[[VECTOR_BODY:.*]]
; GENERIC: [[VECTOR_BODY]]:
; GENERIC-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; GENERIC-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX]]
; GENERIC-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP0]], align 4
; GENERIC-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX]]
; GENERIC-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
; GENERIC-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD12]])
; GENERIC-NEXT: store <4 x float> [[TMP2]], ptr [[TMP1]], align 4
; GENERIC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; GENERIC-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; GENERIC-NEXT: br i1 [[TMP3]], label %[[EXIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; GENERIC: [[EXIT]]:
; GENERIC-NEXT: ret void
;
entry:
%broadcast.splatinsert = insertelement <4 x float> poison, float %a, i64 0
%broadcast.splat = shufflevector <4 x float> %broadcast.splatinsert, <4 x float> poison, <4 x i32> zeroinitializer
br label %vector.body

vector.body: ; preds = %vector.body, %entry
%index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
%0 = getelementptr inbounds nuw float, ptr %src, i64 %index
%wide.load = load <4 x float>, ptr %0, align 4
%1 = getelementptr inbounds nuw float, ptr %dst, i64 %index
%wide.load12 = load <4 x float>, ptr %1, align 4
%2 = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %broadcast.splat, <4 x float> %wide.load, <4 x float> %wide.load12)
store <4 x float> %2, ptr %1, align 4
%index.next = add nuw i64 %index, 4
%3 = icmp eq i64 %index.next, 1024
br i1 %3, label %exit, label %vector.body, !llvm.loop !0

exit: ; preds = %vector.body
ret void
}
!0 = !{!"llvm.loop.isvectorized", i32 1}
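This bare node is the llvm.loop.isvectorized marker consumed by the early exit added in the TTI change above: with it attached, even the apple-m1 run leaves the already-vectorized loop alone. Summarized as comments:

; @reverse:               vector body, no isvectorized tag -> 8x runtime unroll under apple-m1
; @saxpy_tripcount1K_av1: tagged llvm.loop.isvectorized    -> not unrolled on either target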

;.
; APPLE: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]]}
; APPLE: [[META1]] = !{!"llvm.loop.unroll.disable"}
; APPLE: [[LOOP2]] = !{!"llvm.loop.isvectorized", i32 1}
;.
; GENERIC: [[LOOP0]] = !{!"llvm.loop.isvectorized", i32 1}
;.
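The autogenerated metadata block above captures two things: on the APPLE path the epilogue loop for.body.epil is tagged with a distinct node carrying llvm.loop.unroll.disable ([[LOOP0]]/[[META1]]), preventing the remainder loop from being unrolled again, while [[LOOP2]] and the GENERIC [[LOOP0]] are simply the input's llvm.loop.isvectorized marker printed back.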
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; COMMON: {{.*}}