diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 5b333d33cffd5..7ee22166cb0a7 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -3989,6 +3989,92 @@ getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE,
   }
 }
 
+/// For Apple CPUs, we want to runtime-unroll loops to make better use of the
+/// OOO engine's wide instruction window and various predictors.
+static void
+getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE,
+                                 TargetTransformInfo::UnrollingPreferences &UP,
+                                 AArch64TTIImpl &TTI) {
+  // Limit to loops with structure that is highly likely to benefit from
+  // runtime unrolling; that is, we exclude outer loops, loops with multiple
+  // exits and many blocks (i.e. likely with complex control flow). Note that
+  // the heuristics here may be overly conservative and we err on the side of
+  // avoiding runtime unrolling rather than unrolling excessively. They are
+  // all subject to further refinement.
+  if (!L->isInnermost() || !L->getExitBlock() || L->getNumBlocks() > 8)
+    return;
+
+  const SCEV *BTC = SE.getBackedgeTakenCount(L);
+  if (isa<SCEVConstant>(BTC) || isa<SCEVCouldNotCompute>(BTC) ||
+      (SE.getSmallConstantMaxTripCount(L) > 0 &&
+       SE.getSmallConstantMaxTripCount(L) <= 32))
+    return;
+  if (findStringMetadataForLoop(L, "llvm.loop.isvectorized"))
+    return;
+
+  int64_t Size = 0;
+  for (auto *BB : L->getBlocks()) {
+    for (auto &I : *BB) {
+      if (!isa<IntrinsicInst>(&I) && isa<CallBase>(&I))
+        return;
+      SmallVector<const Value *, 4> Operands(I.operand_values());
+      Size +=
+          *TTI.getInstructionCost(&I, Operands, TTI::TCK_CodeSize).getValue();
+    }
+  }
+
+  // Limit to loops with trip counts that are cheap to expand.
+  UP.SCEVExpansionBudget = 1;
+
+  // Try to unroll small, single-block loops, if they have load/store
+  // dependencies, to expose more parallel memory access streams.
+  if (L->getHeader() != L->getLoopLatch() || Size > 8)
+    return;
+
+  SmallPtrSet<Value *, 8> LoadedValues;
+  SmallVector<StoreInst *> Stores;
+  for (auto *BB : L->blocks()) {
+    for (auto &I : *BB) {
+      Value *Ptr = getLoadStorePointerOperand(&I);
+      if (!Ptr)
+        continue;
+      const SCEV *PtrSCEV = SE.getSCEV(Ptr);
+      if (SE.isLoopInvariant(PtrSCEV, L))
+        continue;
+      if (isa<LoadInst>(&I))
+        LoadedValues.insert(&I);
+      else
+        Stores.push_back(cast<StoreInst>(&I));
+    }
+  }
+
+  // Try to find an unroll count that maximizes the use of the instruction
+  // window, i.e. trying to fetch as many instructions per cycle as possible.
+  unsigned MaxInstsPerLine = 16;
+  unsigned UC = 1;
+  unsigned BestUC = 1;
+  unsigned SizeWithBestUC = BestUC * Size;
+  while (UC <= 8) {
+    unsigned SizeWithUC = UC * Size;
+    if (SizeWithUC > 48)
+      break;
+    if ((SizeWithUC % MaxInstsPerLine) == 0 ||
+        (SizeWithBestUC % MaxInstsPerLine) < (SizeWithUC % MaxInstsPerLine)) {
+      BestUC = UC;
+      SizeWithBestUC = BestUC * Size;
+    }
+    UC++;
+  }
+
+  if (BestUC == 1 || none_of(Stores, [&LoadedValues](StoreInst *SI) {
+        return LoadedValues.contains(SI->getOperand(0));
+      }))
+    return;
+
+  UP.Runtime = true;
+  UP.DefaultUnrollRuntimeCount = BestUC;
+}
+
 void AArch64TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                              TTI::UnrollingPreferences &UP,
                                              OptimizationRemarkEmitter *ORE) {
@@ -4006,9 +4092,21 @@ void AArch64TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
   // Disable partial & runtime unrolling on -Os.
   UP.PartialOptSizeThreshold = 0;
 
-  if (ST->getProcFamily() == AArch64Subtarget::Falkor &&
-      EnableFalkorHWPFUnrollFix)
-    getFalkorUnrollingPreferences(L, SE, UP);
+  // Apply subtarget-specific unrolling preferences.
+  switch (ST->getProcFamily()) {
+  case AArch64Subtarget::AppleA14:
+  case AArch64Subtarget::AppleA15:
+  case AArch64Subtarget::AppleA16:
+  case AArch64Subtarget::AppleM4:
+    getAppleRuntimeUnrollPreferences(L, SE, UP, *this);
+    break;
+  case AArch64Subtarget::Falkor:
+    if (EnableFalkorHWPFUnrollFix)
+      getFalkorUnrollingPreferences(L, SE, UP);
+    break;
+  default:
+    break;
+  }
 
   // Scan the loop: don't unroll loops with calls as this could prevent
   // inlining. Don't unroll vector loops either, as they don't benefit much from
diff --git a/llvm/test/Transforms/LoopUnroll/AArch64/apple-unrolling.ll b/llvm/test/Transforms/LoopUnroll/AArch64/apple-unrolling.ll
index deacec795fb03..d27d5e74e28f2 100644
--- a/llvm/test/Transforms/LoopUnroll/AArch64/apple-unrolling.ll
+++ b/llvm/test/Transforms/LoopUnroll/AArch64/apple-unrolling.ll
@@ -12,17 +12,91 @@ define void @small_load_store_loop(ptr %src, ptr %dst, i64 %N, i64 %scale) {
 ; APPLE-LABEL: define void @small_load_store_loop(
 ; APPLE-SAME: ptr [[SRC:%.*]], ptr [[DST:%.*]], i64 [[N:%.*]], i64 [[SCALE:%.*]]) #[[ATTR0:[0-9]+]] {
 ; APPLE-NEXT:  [[ENTRY:.*]]:
+; APPLE-NEXT:    [[TMP0:%.*]] = add i64 [[N]], -1
+; APPLE-NEXT:    [[XTRAITER:%.*]] = and i64 [[N]], 7
+; APPLE-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[TMP0]], 7
+; APPLE-NEXT:    br i1 [[TMP1]], label %[[EXIT_UNR_LCSSA:.*]], label %[[ENTRY_NEW:.*]]
+; APPLE:       [[ENTRY_NEW]]:
+; APPLE-NEXT:    [[UNROLL_ITER:%.*]] = sub i64 [[N]], [[XTRAITER]]
 ; APPLE-NEXT:    br label %[[LOOP:.*]]
 ; APPLE:       [[LOOP]]:
-; APPLE-NEXT:    [[IV_EPIL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT_EPIL:%.*]], %[[LOOP]] ]
+; APPLE-NEXT:    [[IV_EPIL:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[IV_NEXT_7:%.*]], %[[LOOP]] ]
+; APPLE-NEXT:    [[NITER:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[NITER_NEXT_7:%.*]], %[[LOOP]] ]
 ; APPLE-NEXT:    [[SCALED_IV_EPIL:%.*]] = mul nuw nsw i64 [[IV_EPIL]], [[SCALE]]
 ; APPLE-NEXT:    [[GEP_SRC_EPIL:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 [[SCALED_IV_EPIL]]
 ; APPLE-NEXT:    [[L_EPIL:%.*]] = load float, ptr [[GEP_SRC_EPIL]], align 4
 ; APPLE-NEXT:    [[GEP_DST_EPIL:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 [[IV_EPIL]]
 ; APPLE-NEXT:    store float [[L_EPIL]], ptr [[GEP_DST_EPIL]], align 4
-; APPLE-NEXT:    [[IV_NEXT_EPIL]] = add nuw nsw i64 [[IV_EPIL]], 1
-; APPLE-NEXT:    [[EC_EPIL:%.*]] = icmp eq i64 [[IV_NEXT_EPIL]], [[N]]
-; APPLE-NEXT:    br i1 [[EC_EPIL]], label %[[EXIT:.*]], label %[[LOOP]]
+; APPLE-NEXT:    [[IV_NEXT_EPIL:%.*]] = add nuw nsw i64 [[IV_EPIL]], 1
+; APPLE-NEXT:    [[SCALED_IV_1:%.*]] = mul nuw nsw i64 [[IV_NEXT_EPIL]], [[SCALE]]
+; APPLE-NEXT:    [[GEP_SRC_1:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 [[SCALED_IV_1]]
+; APPLE-NEXT:    [[L_1:%.*]] = load float, ptr [[GEP_SRC_1]], align 4
+; APPLE-NEXT:    [[GEP_DST_1:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 [[IV_NEXT_EPIL]]
+; APPLE-NEXT:    store float [[L_1]], ptr [[GEP_DST_1]], align 4
+; APPLE-NEXT:    [[IV_NEXT_1:%.*]] = add nuw nsw i64 [[IV_EPIL]], 2
+; APPLE-NEXT:    [[SCALED_IV_2:%.*]] = mul nuw nsw i64 [[IV_NEXT_1]], [[SCALE]]
+; APPLE-NEXT:    [[GEP_SRC_2:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 [[SCALED_IV_2]]
+; APPLE-NEXT:    [[L_2:%.*]] = load float, ptr [[GEP_SRC_2]], align 4
+; APPLE-NEXT:    [[GEP_DST_2:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 [[IV_NEXT_1]]
+; APPLE-NEXT:    store float [[L_2]], ptr [[GEP_DST_2]], align 4
+; APPLE-NEXT:    [[IV_NEXT_2:%.*]] = add nuw nsw i64 [[IV_EPIL]], 3
+; APPLE-NEXT:    [[SCALED_IV_3:%.*]] = mul nuw nsw i64 [[IV_NEXT_2]], [[SCALE]]
+; APPLE-NEXT:    [[GEP_SRC_3:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 [[SCALED_IV_3]]
+; APPLE-NEXT:    [[L_3:%.*]] = load float, ptr [[GEP_SRC_3]], align 4
+; APPLE-NEXT:    [[GEP_DST_3:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 [[IV_NEXT_2]]
+; APPLE-NEXT:    store float [[L_3]], ptr [[GEP_DST_3]], align 4
+; APPLE-NEXT:    [[IV_NEXT_3:%.*]] = add nuw nsw i64 [[IV_EPIL]], 4
+; APPLE-NEXT:    [[SCALED_IV_4:%.*]] = mul nuw nsw i64 [[IV_NEXT_3]], [[SCALE]]
+; APPLE-NEXT:    [[GEP_SRC_4:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 [[SCALED_IV_4]]
+; APPLE-NEXT:    [[L_4:%.*]] = load float, ptr [[GEP_SRC_4]], align 4
+; APPLE-NEXT:    [[GEP_DST_4:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 [[IV_NEXT_3]]
+; APPLE-NEXT:    store float [[L_4]], ptr [[GEP_DST_4]], align 4
+; APPLE-NEXT:    [[IV_NEXT_4:%.*]] = add nuw nsw i64 [[IV_EPIL]], 5
+; APPLE-NEXT:    [[SCALED_IV_5:%.*]] = mul nuw nsw i64 [[IV_NEXT_4]], [[SCALE]]
+; APPLE-NEXT:    [[GEP_SRC_5:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 [[SCALED_IV_5]]
+; APPLE-NEXT:    [[L_5:%.*]] = load float, ptr [[GEP_SRC_5]], align 4
+; APPLE-NEXT:    [[GEP_DST_5:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 [[IV_NEXT_4]]
+; APPLE-NEXT:    store float [[L_5]], ptr [[GEP_DST_5]], align 4
+; APPLE-NEXT:    [[IV_NEXT_5:%.*]] = add nuw nsw i64 [[IV_EPIL]], 6
+; APPLE-NEXT:    [[SCALED_IV_6:%.*]] = mul nuw nsw i64 [[IV_NEXT_5]], [[SCALE]]
+; APPLE-NEXT:    [[GEP_SRC_6:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 [[SCALED_IV_6]]
+; APPLE-NEXT:    [[L_6:%.*]] = load float, ptr [[GEP_SRC_6]], align 4
+; APPLE-NEXT:    [[GEP_DST_6:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 [[IV_NEXT_5]]
+; APPLE-NEXT:    store float [[L_6]], ptr [[GEP_DST_6]], align 4
+; APPLE-NEXT:    [[IV_NEXT_6:%.*]] = add nuw nsw i64 [[IV_EPIL]], 7
+; APPLE-NEXT:    [[SCALED_IV_7:%.*]] = mul nuw nsw i64 [[IV_NEXT_6]], [[SCALE]]
+; APPLE-NEXT:    [[GEP_SRC_7:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 [[SCALED_IV_7]]
+; APPLE-NEXT:    [[L_7:%.*]] = load float, ptr [[GEP_SRC_7]], align 4
+; APPLE-NEXT:    [[GEP_DST_7:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 [[IV_NEXT_6]]
+; APPLE-NEXT:    store float [[L_7]], ptr [[GEP_DST_7]], align 4
+; APPLE-NEXT:    [[IV_NEXT_7]] = add nuw nsw i64 [[IV_EPIL]], 8
+; APPLE-NEXT:    [[NITER_NEXT_7]] = add i64 [[NITER]], 8
+; APPLE-NEXT:    [[NITER_NCMP_7:%.*]] = icmp eq i64 [[NITER_NEXT_7]], [[UNROLL_ITER]]
+; APPLE-NEXT:    br i1 [[NITER_NCMP_7]], label %[[EXIT_UNR_LCSSA_LOOPEXIT:.*]], label %[[LOOP]]
+; APPLE:       [[EXIT_UNR_LCSSA_LOOPEXIT]]:
+; APPLE-NEXT:    [[IV_UNR_PH:%.*]] = phi i64 [ [[IV_NEXT_7]], %[[LOOP]] ]
+; APPLE-NEXT:    br label %[[EXIT_UNR_LCSSA]]
+; APPLE:       [[EXIT_UNR_LCSSA]]:
+; APPLE-NEXT:    [[IV_UNR:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_UNR_PH]], %[[EXIT_UNR_LCSSA_LOOPEXIT]] ]
+; APPLE-NEXT:    [[LCMP_MOD:%.*]] = icmp ne i64 [[XTRAITER]], 0
+; APPLE-NEXT:    br i1 [[LCMP_MOD]], label %[[LOOP_EPIL_PREHEADER:.*]], label %[[EXIT:.*]]
+; APPLE:       [[LOOP_EPIL_PREHEADER]]:
+; APPLE-NEXT:    br label %[[LOOP_EPIL:.*]]
+; APPLE:       [[LOOP_EPIL]]:
+; APPLE-NEXT:    [[IV_EPIL1:%.*]] = phi i64 [ [[IV_UNR]], %[[LOOP_EPIL_PREHEADER]] ], [ [[IV_NEXT_EPIL1:%.*]], %[[LOOP_EPIL]] ]
+; APPLE-NEXT:    [[EPIL_ITER:%.*]] = phi i64 [ 0, %[[LOOP_EPIL_PREHEADER]] ], [ [[EPIL_ITER_NEXT:%.*]], %[[LOOP_EPIL]] ]
+; APPLE-NEXT:    [[SCALED_IV_EPIL1:%.*]] = mul nuw nsw i64 [[IV_EPIL1]], [[SCALE]]
+; APPLE-NEXT:    [[GEP_SRC_EPIL1:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 [[SCALED_IV_EPIL1]]
+; APPLE-NEXT:    [[L_EPIL1:%.*]] = load float, ptr [[GEP_SRC_EPIL1]], align 4
+; APPLE-NEXT:    [[GEP_DST_EPIL1:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 [[IV_EPIL1]]
+; APPLE-NEXT:    store float [[L_EPIL1]], ptr [[GEP_DST_EPIL1]], align 4
+; APPLE-NEXT:    [[IV_NEXT_EPIL1]] = add nuw nsw i64 [[IV_EPIL1]], 1
+; APPLE-NEXT:    [[EC_EPIL:%.*]] = icmp eq i64 [[IV_NEXT_EPIL1]], [[N]]
+; APPLE-NEXT:    [[EPIL_ITER_NEXT]] = add i64 [[EPIL_ITER]], 1
+; APPLE-NEXT:    [[EPIL_ITER_CMP:%.*]] = icmp ne i64 [[EPIL_ITER_NEXT]], [[XTRAITER]]
+; APPLE-NEXT:    br i1 [[EPIL_ITER_CMP]], label %[[LOOP_EPIL]], label %[[EXIT_EPILOG_LCSSA:.*]], !llvm.loop [[LOOP0:![0-9]+]]
+; APPLE:       [[EXIT_EPILOG_LCSSA]]:
+; APPLE-NEXT:    br label %[[EXIT]]
 ; APPLE:       [[EXIT]]:
 ; APPLE-NEXT:    ret void
 ;
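For reference, the unroll-count search added in this patch can be exercised in isolation. The sketch below is not part of the patch: it is a standalone re-implementation of the BestUC selection loop from getAppleRuntimeUnrollPreferences, where pickUnrollCount is a hypothetical helper name and the constants (16-wide fetch group, 48-instruction cap, maximum count of 8) are taken from the code above.

#include <cstdint>
#include <cstdio>

// Hypothetical standalone mirror of the BestUC selection loop above;
// not part of the patch, only meant to make the heuristic easy to inspect.
static unsigned pickUnrollCount(int64_t Size) {
  const unsigned MaxInstsPerLine = 16; // assumed fetch-group width, as in the patch
  unsigned UC = 1;
  unsigned BestUC = 1;
  unsigned SizeWithBestUC = BestUC * Size;
  while (UC <= 8) {
    unsigned SizeWithUC = UC * Size;
    if (SizeWithUC > 48)
      break;
    // Prefer counts whose unrolled size fills whole 16-instruction groups,
    // or at least leaves a fuller partial group than the current best.
    if ((SizeWithUC % MaxInstsPerLine) == 0 ||
        (SizeWithBestUC % MaxInstsPerLine) < (SizeWithUC % MaxInstsPerLine)) {
      BestUC = UC;
      SizeWithBestUC = BestUC * Size;
    }
    UC++;
  }
  return BestUC;
}

int main() {
  // A loop body costing 6 code-size units selects an unroll count of 8:
  // 6 * 8 = 48 instructions, i.e. three full 16-instruction fetch groups.
  printf("Size=6 -> unroll count %u\n", pickUnrollCount(6));
  return 0;
}

With these constants, a body costing 6 code-size units yields a count of 8, which is consistent with the 8x unrolling (plus epilogue) checked in the updated test above; larger bodies saturate the 48-instruction cap at smaller counts.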