From 4bac313f935dab1ef9731d22f6e7cfbf5b6a7bb0 Mon Sep 17 00:00:00 2001 From: Alexandros Lamprineas Date: Fri, 19 Sep 2025 18:26:55 +0100 Subject: [PATCH 1/6] [GlobalOpt][FMV] Fix static resolution of calls. Addresses the issues found in the review of https://github.com/llvm/llvm-project/pull/150267/files#r2356936355 Currently, when collecting the users of an IFunc symbol to determine the callers, we incorrectly mix versions of different functions together, alongside non-FMV callers all in the same bag. That is problematic because we incorrectly deduce which features are unavailable as we iterate the callers. I have updated the unit tests to require a resolver function for the callers and regenerated the resolvers since some FMV features have been removed, making the detection bitmasks different. I've replaced the deleted FMV feature ls64 with cssc. I've added a new test to cover unrelated callers. --- llvm/lib/Transforms/IPO/GlobalOpt.cpp | 190 ++++++---- .../Transforms/GlobalOpt/resolve-fmv-ifunc.ll | 355 +++++++++++++++--- 2 files changed, 416 insertions(+), 129 deletions(-) diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp index f88d51f443bcf..0707eb5eacf5d 100644 --- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp +++ b/llvm/lib/Transforms/IPO/GlobalOpt.cpp @@ -2482,20 +2482,21 @@ DeleteDeadIFuncs(Module &M, // Follows the use-def chain of \p V backwards until it finds a Function, // in which case it collects in \p Versions. Return true on successful // use-def chain traversal, false otherwise. 
-static bool collectVersions(TargetTransformInfo &TTI, Value *V, - SmallVectorImpl &Versions) { +static bool +collectVersions(Value *V, SmallVectorImpl &Versions, + function_ref GetTTI) { if (auto *F = dyn_cast(V)) { - if (!TTI.isMultiversionedFunction(*F)) + if (!GetTTI(*F).isMultiversionedFunction(*F)) return false; Versions.push_back(F); } else if (auto *Sel = dyn_cast(V)) { - if (!collectVersions(TTI, Sel->getTrueValue(), Versions)) + if (!collectVersions(Sel->getTrueValue(), Versions, GetTTI)) return false; - if (!collectVersions(TTI, Sel->getFalseValue(), Versions)) + if (!collectVersions(Sel->getFalseValue(), Versions, GetTTI)) return false; } else if (auto *Phi = dyn_cast(V)) { for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I) - if (!collectVersions(TTI, Phi->getIncomingValue(I), Versions)) + if (!collectVersions(Phi->getIncomingValue(I), Versions, GetTTI)) return false; } else { // Unknown instruction type. Bail. @@ -2525,8 +2526,14 @@ static bool OptimizeNonTrivialIFuncs( Module &M, function_ref GetTTI) { bool Changed = false; - // Cache containing the mask constructed from a function's target features. + // Map containing the feature bits for a given function. DenseMap FeatureMask; + // Map containing all the versions corresponding to an IFunc symbol. + DenseMap> VersionedFuncs; + // Map containing the IFunc symbol a function is version of. + DenseMap VersionOf; + // List of all the interesting IFuncs found in the module. + SmallVector IFuncs; for (GlobalIFunc &IF : M.ifuncs()) { if (IF.isInterposable()) @@ -2539,107 +2546,140 @@ static bool OptimizeNonTrivialIFuncs( if (Resolver->isInterposable()) continue; - TargetTransformInfo &TTI = GetTTI(*Resolver); - - // Discover the callee versions. - SmallVector Callees; - if (any_of(*Resolver, [&TTI, &Callees](BasicBlock &BB) { + SmallVector Versions; + // Discover the versioned functions. 
+ if (any_of(*Resolver, [&](BasicBlock &BB) { if (auto *Ret = dyn_cast_or_null(BB.getTerminator())) - if (!collectVersions(TTI, Ret->getReturnValue(), Callees)) + if (!collectVersions(Ret->getReturnValue(), Versions, GetTTI)) return true; return false; })) continue; - if (Callees.empty()) + if (Versions.empty()) continue; - LLVM_DEBUG(dbgs() << "Statically resolving calls to function " - << Resolver->getName() << "\n"); - - // Cache the feature mask for each callee. - for (Function *Callee : Callees) { - auto [It, Inserted] = FeatureMask.try_emplace(Callee); + for (Function *V : Versions) { + VersionOf.insert({V, &IF}); + auto [It, Inserted] = FeatureMask.try_emplace(V); if (Inserted) - It->second = TTI.getFeatureMask(*Callee); + It->second = GetTTI(*V).getFeatureMask(*V); } - // Sort the callee versions in decreasing priority order. - sort(Callees, [&](auto *LHS, auto *RHS) { + // Sort function versions in decreasing priority order. + sort(Versions, [&](auto *LHS, auto *RHS) { return FeatureMask[LHS].ugt(FeatureMask[RHS]); }); - // Find the callsites and cache the feature mask for each caller. - SmallVector Callers; + IFuncs.push_back(&IF); + VersionedFuncs.try_emplace(&IF, std::move(Versions)); + } + + for (GlobalIFunc *CalleeIF : IFuncs) { + SmallVector NonFMVCallers; + SmallVector CallerIFuncs; DenseMap> CallSites; - for (User *U : IF.users()) { + + // Find the callsites. 
+ for (User *U : CalleeIF->users()) { if (auto *CB = dyn_cast(U)) { - if (CB->getCalledOperand() == &IF) { + if (CB->getCalledOperand() == CalleeIF) { Function *Caller = CB->getFunction(); - auto [FeatIt, FeatInserted] = FeatureMask.try_emplace(Caller); - if (FeatInserted) - FeatIt->second = TTI.getFeatureMask(*Caller); - auto [CallIt, CallInserted] = CallSites.try_emplace(Caller); - if (CallInserted) - Callers.push_back(Caller); - CallIt->second.push_back(CB); + GlobalIFunc *CallerIFunc = nullptr; + TargetTransformInfo &TTI = GetTTI(*Caller); + bool CallerIsFMV = TTI.isMultiversionedFunction(*Caller); + // The caller is a version of a known IFunc. + if (auto It = VersionOf.find(Caller); It != VersionOf.end()) + CallerIFunc = It->second; + else if (!CallerIsFMV && OptimizeNonFMVCallers) { + // The caller is non-FMV. + auto [It, Inserted] = FeatureMask.try_emplace(Caller); + if (Inserted) + It->second = TTI.getFeatureMask(*Caller); + } else + // The caller is none of the above, skip. + continue; + auto [It, Inserted] = CallSites.try_emplace(Caller); + if (Inserted) { + if (CallerIsFMV) + CallerIFuncs.push_back(CallerIFunc); + else + NonFMVCallers.push_back(Caller); + } + It->second.push_back(CB); } } } - // Sort the caller versions in decreasing priority order. - sort(Callers, [&](auto *LHS, auto *RHS) { - return FeatureMask[LHS].ugt(FeatureMask[RHS]); - }); - - auto implies = [](APInt A, APInt B) { return B.isSubsetOf(A); }; + LLVM_DEBUG(dbgs() << "Statically resolving calls to function " + << CalleeIF->getResolverFunction()->getName() << "\n"); - // Index to the highest priority candidate. - unsigned I = 0; - // Now try to redirect calls starting from higher priority callers. - for (Function *Caller : Callers) { - assert(I < Callees.size() && "Found callers of equal priority"); + auto redirectCalls = [&](SmallVectorImpl &Callers, + SmallVectorImpl &Callees) { + // Index to the current callee candidate. 
+ unsigned I = 0; - Function *Callee = Callees[I]; - APInt CallerBits = FeatureMask[Caller]; - APInt CalleeBits = FeatureMask[Callee]; + // Try to redirect calls starting from higher priority callers. + for (Function *Caller : Callers) { + if (I == Callees.size()) + break; - // In the case of FMV callers, we know that all higher priority callers - // than the current one did not get selected at runtime, which helps - // reason about the callees (if they have versions that mandate presence - // of the features which we already know are unavailable on this target). - if (TTI.isMultiversionedFunction(*Caller)) { + bool CallerIsFMV = GetTTI(*Caller).isMultiversionedFunction(*Caller); + // In the case of FMV callers, we know that all higher priority callers + // than the current one did not get selected at runtime, which helps + // reason about the callees (if they have versions that mandate presence + // of the features which we already know are unavailable on this + // target). + if (!CallerIsFMV) + // We can't reason much about non-FMV callers. Just pick the highest + // priority callee if it matches, otherwise bail. + assert(I == 0 && "Should only select the highest priority candidate"); + + Function *Callee = Callees[I]; + APInt CallerBits = FeatureMask[Caller]; + APInt CalleeBits = FeatureMask[Callee]; // If the feature set of the caller implies the feature set of the - // highest priority candidate then it shall be picked. In case of - // identical sets advance the candidate index one position. - if (CallerBits == CalleeBits) - ++I; - else if (!implies(CallerBits, CalleeBits)) { - // Keep advancing the candidate index as long as the caller's - // features are a subset of the current candidate's. - while (implies(CalleeBits, CallerBits)) { + // highest priority candidate then it shall be picked. 
+ if (CalleeBits.isSubsetOf(CallerBits)) { + // If there are no records of call sites for this particular function + // version, then it is not actually a caller, in which case skip. + if (auto It = CallSites.find(Caller); It != CallSites.end()) { + for (CallBase *CS : It->second) { + LLVM_DEBUG(dbgs() << "Redirecting call " << Caller->getName() + << " -> " << Callee->getName() << "\n"); + CS->setCalledOperand(Callee); + } + Changed = true; + } + } + // Keep advancing the candidate index as long as the caller's + // features are a subset of the current candidate's. + if (CallerIsFMV) { + while (CallerBits.isSubsetOf(CalleeBits)) { if (++I == Callees.size()) break; CalleeBits = FeatureMask[Callees[I]]; } - continue; } - } else { - // We can't reason much about non-FMV callers. Just pick the highest - // priority callee if it matches, otherwise bail. - if (!OptimizeNonFMVCallers || I > 0 || !implies(CallerBits, CalleeBits)) - continue; } - auto &Calls = CallSites[Caller]; - for (CallBase *CS : Calls) { - LLVM_DEBUG(dbgs() << "Redirecting call " << Caller->getName() << " -> " - << Callee->getName() << "\n"); - CS->setCalledOperand(Callee); + }; + + auto &Callees = VersionedFuncs[CalleeIF]; + + // Optimize non-FMV calls. + if (!NonFMVCallers.empty() && OptimizeNonFMVCallers) + redirectCalls(NonFMVCallers, Callees); + + // Optimize FMV calls. 
+ if (!CallerIFuncs.empty()) { + for (GlobalIFunc *CallerIF : CallerIFuncs) { + auto &Callers = VersionedFuncs[CallerIF]; + redirectCalls(Callers, Callees); } - Changed = true; } - if (IF.use_empty() || - all_of(IF.users(), [](User *U) { return isa(U); })) + + if (CalleeIF->use_empty() || + all_of(CalleeIF->users(), [](User *U) { return isa(U); })) NumIFuncsResolved++; } return Changed; diff --git a/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll b/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll index 4b6a19d3f05cf..7ace67e3857ff 100644 --- a/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll +++ b/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter "call i32 @(test_single_bb_resolver|test_multi_bb_resolver|test_caller_feats_not_implied|test_non_fmv_caller|test_priority|test_alternative_names)" --version 4 +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter "call i32 @(test_single_bb_resolver|test_multi_bb_resolver|test_caller_feats_not_implied|test_non_fmv_caller|test_priority|test_alternative_names|test_unrelated_callers)" --version 4 ; REQUIRES: aarch64-registered-target @@ -13,6 +13,14 @@ $test_caller_feats_not_implied.resolver = comdat any $test_non_fmv_caller.resolver = comdat any $test_priority.resolver = comdat any $test_alternative_names.resolver = comdat any +$test_unrelated_callers.resolver = comdat any +$caller1.resolver = comdat any +$caller2.resolver = comdat any +$caller3.resolver = comdat any +$caller6.resolver = comdat any +$caller7.resolver = comdat any +$caller8.resolver = comdat any +$caller9.resolver = comdat any @__aarch64_cpu_features = external local_unnamed_addr global { i64 } @@ -22,6 +30,14 @@ $test_alternative_names.resolver = comdat any @test_non_fmv_caller = weak_odr ifunc i32 (), ptr @test_non_fmv_caller.resolver @test_priority = weak_odr ifunc i32 (), ptr 
@test_priority.resolver @test_alternative_names = weak_odr ifunc i32 (), ptr @test_alternative_names.resolver +@test_unrelated_callers = weak_odr ifunc i32 (), ptr @test_unrelated_callers.resolver +@caller1 = weak_odr ifunc i32 (), ptr @caller1.resolver +@caller2 = weak_odr ifunc i32 (), ptr @caller2.resolver +@caller3 = weak_odr ifunc i32 (), ptr @caller3.resolver +@caller6 = weak_odr ifunc i32 (), ptr @caller6.resolver +@caller7 = weak_odr ifunc i32 (), ptr @caller7.resolver +@caller8 = weak_odr ifunc i32 (), ptr @caller8.resolver +@caller9 = weak_odr ifunc i32 (), ptr @caller9.resolver declare void @__init_cpu_features_resolver() local_unnamed_addr @@ -34,18 +50,18 @@ define weak_odr ptr @test_single_bb_resolver.resolver() comdat { resolver_entry: tail call void @__init_cpu_features_resolver() %0 = load i64, ptr @__aarch64_cpu_features, align 8 - %1 = and i64 %0, 68719476736 - %.not = icmp eq i64 %1, 0 - %2 = and i64 %0, 1073741824 - %.not3 = icmp eq i64 %2, 0 - %test_single_bb_resolver._Msve.test_single_bb_resolver.default = select i1 %.not3, ptr @test_single_bb_resolver.default, ptr @test_single_bb_resolver._Msve - %common.ret.op = select i1 %.not, ptr %test_single_bb_resolver._Msve.test_single_bb_resolver.default, ptr @test_single_bb_resolver._Msve2 + %1 = and i64 %0, 69793284352 + %2 = icmp eq i64 %1, 69793284352 + %3 = and i64 %0, 1073807616 + %4 = icmp eq i64 %3, 1073807616 + %test_single_bb_resolver._Msve.test_single_bb_resolver.default = select i1 %4, ptr @test_single_bb_resolver._Msve, ptr @test_single_bb_resolver.default + %common.ret.op = select i1 %2, ptr @test_single_bb_resolver._Msve2, ptr %test_single_bb_resolver._Msve.test_single_bb_resolver.default ret ptr %common.ret.op } define i32 @caller1._Msve() #1 { ; CHECK-LABEL: define i32 @caller1._Msve( -; CHECK-SAME: ) local_unnamed_addr #[[ATTR1:[0-9]+]] { +; CHECK-SAME: ) #[[ATTR1:[0-9]+]] { ; CHECK: [[CALL:%.*]] = tail call i32 @test_single_bb_resolver._Msve() ; entry: @@ -55,7 +71,7 @@ entry: 
define i32 @caller1._Msve2() #2 { ; CHECK-LABEL: define i32 @caller1._Msve2( -; CHECK-SAME: ) local_unnamed_addr #[[ATTR2:[0-9]+]] { +; CHECK-SAME: ) #[[ATTR2:[0-9]+]] { ; CHECK: [[CALL:%.*]] = tail call i32 @test_single_bb_resolver._Msve2() ; entry: @@ -65,7 +81,7 @@ entry: define i32 @caller1.default() #0 { ; CHECK-LABEL: define i32 @caller1.default( -; CHECK-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] { +; CHECK-SAME: ) #[[ATTR0:[0-9]+]] { ; CHECK: [[CALL:%.*]] = tail call i32 @test_single_bb_resolver.default() ; entry: @@ -73,6 +89,20 @@ entry: ret i32 %call } +define weak_odr ptr @caller1.resolver() comdat { +; CHECK-LABEL: define weak_odr ptr @caller1.resolver() comdat { +resolver_entry: + tail call void @__init_cpu_features_resolver() + %0 = load i64, ptr @__aarch64_cpu_features, align 8 + %1 = and i64 %0, 69793284352 + %2 = icmp eq i64 %1, 69793284352 + %3 = and i64 %0, 1073807616 + %4 = icmp eq i64 %3, 1073807616 + %caller1._Msve.caller1.default = select i1 %4, ptr @caller1._Msve, ptr @caller1.default + %common.ret.op = select i1 %2, ptr @caller1._Msve2, ptr %caller1._Msve.caller1.default + ret ptr %common.ret.op +} + declare i32 @test_multi_bb_resolver._Mmops() #3 declare i32 @test_multi_bb_resolver._Msve2() #2 declare i32 @test_multi_bb_resolver._Msve() #1 @@ -92,20 +122,20 @@ common.ret: ; preds = %resolver_else2, %re ret ptr %common.ret.op resolver_else: ; preds = %resolver_entry - %2 = and i64 %0, 68719476736 - %.not5 = icmp eq i64 %2, 0 - br i1 %.not5, label %resolver_else2, label %common.ret + %2 = and i64 %0, 69793284352 + %3 = icmp eq i64 %2, 69793284352 + br i1 %3, label %common.ret, label %resolver_else2 resolver_else2: ; preds = %resolver_else - %3 = and i64 %0, 1073741824 - %.not6 = icmp eq i64 %3, 0 - %test_multi_bb_resolver._Msve.test_multi_bb_resolver.default = select i1 %.not6, ptr @test_multi_bb_resolver.default, ptr @test_multi_bb_resolver._Msve + %4 = and i64 %0, 1073807616 + %5 = icmp eq i64 %4, 1073807616 + 
%test_multi_bb_resolver._Msve.test_multi_bb_resolver.default = select i1 %5, ptr @test_multi_bb_resolver._Msve, ptr @test_multi_bb_resolver.default br label %common.ret } define i32 @caller2._MmopsMsve2() #4 { ; CHECK-LABEL: define i32 @caller2._MmopsMsve2( -; CHECK-SAME: ) local_unnamed_addr #[[ATTR4:[0-9]+]] { +; CHECK-SAME: ) #[[ATTR4:[0-9]+]] { ; CHECK: [[CALL:%.*]] = tail call i32 @test_multi_bb_resolver._Mmops() ; entry: @@ -115,7 +145,7 @@ entry: define i32 @caller2._Mmops() #3 { ; CHECK-LABEL: define i32 @caller2._Mmops( -; CHECK-SAME: ) local_unnamed_addr #[[ATTR3:[0-9]+]] { +; CHECK-SAME: ) #[[ATTR3:[0-9]+]] { ; CHECK: [[CALL:%.*]] = tail call i32 @test_multi_bb_resolver._Mmops() ; entry: @@ -125,7 +155,7 @@ entry: define i32 @caller2._Msve() #1 { ; CHECK-LABEL: define i32 @caller2._Msve( -; CHECK-SAME: ) local_unnamed_addr #[[ATTR1]] { +; CHECK-SAME: ) #[[ATTR1]] { ; CHECK: [[CALL:%.*]] = tail call i32 @test_multi_bb_resolver() ; entry: @@ -135,7 +165,7 @@ entry: define i32 @caller2.default() #0 { ; CHECK-LABEL: define i32 @caller2.default( -; CHECK-SAME: ) local_unnamed_addr #[[ATTR0]] { +; CHECK-SAME: ) #[[ATTR0]] { ; CHECK: [[CALL:%.*]] = tail call i32 @test_multi_bb_resolver.default() ; entry: @@ -143,6 +173,31 @@ entry: ret i32 %call } +define weak_odr ptr @caller2.resolver() comdat { +; CHECK-LABEL: define weak_odr ptr @caller2.resolver() comdat { +resolver_entry: + tail call void @__init_cpu_features_resolver() + %0 = load i64, ptr @__aarch64_cpu_features, align 8 + %1 = and i64 %0, 576460822096707840 + %2 = icmp eq i64 %1, 576460822096707840 + br i1 %2, label %common.ret, label %resolver_else + +common.ret: ; preds = %resolver_else2, %resolver_else, %resolver_entry + %common.ret.op = phi ptr [ @caller2._MmopsMsve2, %resolver_entry ], [ @caller2._Mmops, %resolver_else ], [ %caller2._Msve.caller2.default, %resolver_else2 ] + ret ptr %common.ret.op + +resolver_else: ; preds = %resolver_entry + %3 = and i64 %0, 576460752303423488 + %.not = icmp eq 
i64 %3, 0 + br i1 %.not, label %resolver_else2, label %common.ret + +resolver_else2: ; preds = %resolver_else + %4 = and i64 %0, 1073807616 + %5 = icmp eq i64 %4, 1073807616 + %caller2._Msve.caller2.default = select i1 %5, ptr @caller2._Msve, ptr @caller2.default + br label %common.ret +} + declare i32 @test_caller_feats_not_implied._Mmops() #3 declare i32 @test_caller_feats_not_implied._Msme() #5 declare i32 @test_caller_feats_not_implied._Msve() #1 @@ -162,20 +217,20 @@ common.ret: ; preds = %resolver_else2, %re ret ptr %common.ret.op resolver_else: ; preds = %resolver_entry - %2 = and i64 %0, 4398046511104 - %.not5 = icmp eq i64 %2, 0 - br i1 %.not5, label %resolver_else2, label %common.ret + %2 = and i64 %0, 4398180795136 + %3 = icmp eq i64 %2, 4398180795136 + br i1 %3, label %common.ret, label %resolver_else2 resolver_else2: ; preds = %resolver_else - %3 = and i64 %0, 1073741824 - %.not6 = icmp eq i64 %3, 0 - %test_caller_feats_not_implied._Msve.test_caller_feats_not_implied.default = select i1 %.not6, ptr @test_caller_feats_not_implied.default, ptr @test_caller_feats_not_implied._Msve + %4 = and i64 %0, 1073807616 + %5 = icmp eq i64 %4, 1073807616 + %test_caller_feats_not_implied._Msve.test_caller_feats_not_implied.default = select i1 %5, ptr @test_caller_feats_not_implied._Msve, ptr @test_caller_feats_not_implied.default br label %common.ret } define i32 @caller3._Mmops() #3 { ; CHECK-LABEL: define i32 @caller3._Mmops( -; CHECK-SAME: ) local_unnamed_addr #[[ATTR3]] { +; CHECK-SAME: ) #[[ATTR3]] { ; CHECK: [[CALL:%.*]] = tail call i32 @test_caller_feats_not_implied._Mmops() ; entry: @@ -185,7 +240,7 @@ entry: define i32 @caller3._Msve() #1 { ; CHECK-LABEL: define i32 @caller3._Msve( -; CHECK-SAME: ) local_unnamed_addr #[[ATTR1]] { +; CHECK-SAME: ) #[[ATTR1]] { ; CHECK: [[CALL:%.*]] = tail call i32 @test_caller_feats_not_implied() ; entry: @@ -195,7 +250,7 @@ entry: define i32 @caller3.default() #0 { ; CHECK-LABEL: define i32 @caller3.default( -; CHECK-SAME: ) 
local_unnamed_addr #[[ATTR0]] { +; CHECK-SAME: ) #[[ATTR0]] { ; CHECK: [[CALL:%.*]] = tail call i32 @test_caller_feats_not_implied() ; entry: @@ -203,6 +258,20 @@ entry: ret i32 %call } +define weak_odr ptr @caller3.resolver() comdat { +; CHECK-LABEL: define weak_odr ptr @caller3.resolver() comdat { +resolver_entry: + tail call void @__init_cpu_features_resolver() + %0 = load i64, ptr @__aarch64_cpu_features, align 8 + %1 = and i64 %0, 576460752303423488 + %.not = icmp eq i64 %1, 0 + %2 = and i64 %0, 1073807616 + %3 = icmp eq i64 %2, 1073807616 + %caller3._Msve.caller3.default = select i1 %3, ptr @caller3._Msve, ptr @caller3.default + %common.ret.op = select i1 %.not, ptr %caller3._Msve.caller3.default, ptr @caller3._Mmops + ret ptr %common.ret.op +} + declare i32 @test_non_fmv_caller._Maes() #6 declare i32 @test_non_fmv_caller._Msm4() #7 declare i32 @test_non_fmv_caller.default() #0 @@ -212,15 +281,18 @@ define weak_odr ptr @test_non_fmv_caller.resolver() comdat { resolver_entry: tail call void @__init_cpu_features_resolver() %0 = load i64, ptr @__aarch64_cpu_features, align 8 - %1 = and i64 %0, 32768 - %.not = icmp eq i64 %1, 0 - %test_non_fmv_caller._Maes.test_non_fmv_caller.default = select i1 %.not, ptr @test_non_fmv_caller.default, ptr @test_non_fmv_caller._Maes - ret ptr %test_non_fmv_caller._Maes.test_non_fmv_caller.default + %1 = and i64 %0, 33536 + %2 = icmp eq i64 %1, 33536 + %3 = and i64 %0, 800 + %4 = icmp eq i64 %3, 800 + %test_non_fmv_caller._Msm4.test_non_fmv_caller.default = select i1 %4, ptr @test_non_fmv_caller._Msm4, ptr @test_non_fmv_caller.default + %common.ret.op = select i1 %2, ptr @test_non_fmv_caller._Maes, ptr %test_non_fmv_caller._Msm4.test_non_fmv_caller.default + ret ptr %common.ret.op } define i32 @caller4() #8 { ; CHECK-LABEL: define i32 @caller4( -; CHECK-SAME: ) local_unnamed_addr #[[ATTR7:[0-9]+]] { +; CHECK-SAME: ) local_unnamed_addr #[[ATTR8:[0-9]+]] { ; CHECK: [[CALL:%.*]] = tail call i32 @test_non_fmv_caller._Maes() ; entry: 
@@ -230,7 +302,7 @@ entry: define i32 @caller5() #9 { ; CHECK-LABEL: define i32 @caller5( -; CHECK-SAME: ) local_unnamed_addr #[[ATTR8:[0-9]+]] { +; CHECK-SAME: ) local_unnamed_addr #[[ATTR9:[0-9]+]] { ; CHECK: [[CALL:%.*]] = tail call i32 @test_non_fmv_caller() ; entry: @@ -239,7 +311,7 @@ entry: } declare i32 @test_priority._Msve2-sha3() #10 -declare i32 @test_priority._Mls64Mssbs() #11 +declare i32 @test_priority._McsscMssbs() #11 declare i32 @test_priority._MflagmMlseMrng() #12 declare i32 @test_priority.default() #0 @@ -248,36 +320,57 @@ define weak_odr ptr @test_priority.resolver() comdat { resolver_entry: tail call void @__init_cpu_features_resolver() %0 = load i64, ptr @__aarch64_cpu_features, align 8 - %1 = and i64 %0, 131 - %2 = icmp eq i64 %1, 131 + %1 = and i64 %0, 562949953423360 + %2 = icmp eq i64 %1, 562949953423360 br i1 %2, label %common.ret, label %resolver_else common.ret: ; preds = %resolver_else2, %resolver_else, %resolver_entry - %common.ret.op = phi ptr [ @test_priority._MflagmMlseMrng, %resolver_entry ], [ @test_priority._Mls64Mssbs, %resolver_else ], [ %test_priority._Msve2-sha3.test_priority.default, %resolver_else2 ] + %common.ret.op = phi ptr [ @test_priority._McsscMssbs, %resolver_entry ], [ @test_priority._Msve2-sha3, %resolver_else ], [ %test_priority._MflagmMlseMrng.test_priority.default, %resolver_else2 ] ret ptr %common.ret.op resolver_else: ; preds = %resolver_entry - %3 = and i64 %0, 9570149208162304 - %4 = icmp eq i64 %3, 9570149208162304 + %3 = and i64 %0, 1169304924928 + %4 = icmp eq i64 %3, 1169304924928 br i1 %4, label %common.ret, label %resolver_else2 resolver_else2: ; preds = %resolver_else - %5 = and i64 %0, 1099511627776 - %.not = icmp eq i64 %5, 0 - %test_priority._Msve2-sha3.test_priority.default = select i1 %.not, ptr @test_priority.default, ptr @test_priority._Msve2-sha3 + %5 = and i64 %0, 131 + %6 = icmp eq i64 %5, 131 + %test_priority._MflagmMlseMrng.test_priority.default = select i1 %6, ptr 
@test_priority._MflagmMlseMrng, ptr @test_priority.default br label %common.ret } -define i32 @caller6._MflagmMls64MlseMrngMssbsMsve2-sha3() #13 { -; CHECK-LABEL: define i32 @caller6._MflagmMls64MlseMrngMssbsMsve2-sha3( -; CHECK-SAME: ) local_unnamed_addr #[[ATTR12:[0-9]+]] { -; CHECK: [[CALL:%.*]] = tail call i32 @test_priority._Mls64Mssbs() +define i32 @caller6._McsscMflagmMlseMrngMssbsMsve2-sha3() #13 { +; CHECK-LABEL: define i32 @caller6._McsscMflagmMlseMrngMssbsMsve2-sha3( +; CHECK-SAME: ) #[[ATTR13:[0-9]+]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_priority._McsscMssbs() ; entry: %call = tail call i32 @test_priority() ret i32 %call } +define i32 @caller6.default() #0 { +; CHECK-LABEL: define i32 @caller6.default( +; CHECK-SAME: ) #[[ATTR0]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_priority() +; +entry: + %call = tail call i32 @test_priority() + ret i32 %call +} + +define weak_odr ptr @caller6.resolver() comdat { +; CHECK-LABEL: define weak_odr ptr @caller6.resolver() comdat { +resolver_entry: + tail call void @__init_cpu_features_resolver() + %0 = load i64, ptr @__aarch64_cpu_features, align 8 + %1 = and i64 %0, 564119258348419 + %2 = icmp eq i64 %1, 564119258348419 + %caller6._McsscMflagmMlseMrngMssbsMsve2-sha3.caller6.default = select i1 %2, ptr @caller6._McsscMflagmMlseMrngMssbsMsve2-sha3, ptr @caller6.default + ret ptr %caller6._McsscMflagmMlseMrngMssbsMsve2-sha3.caller6.default +} + declare i32 @test_alternative_names._Mdpb2Mfrintts() #14 declare i32 @test_alternative_names._Mflagm2Mfrintts() #15 declare i32 @test_alternative_names._Mrcpc2() #16 @@ -310,7 +403,7 @@ resolver_else2: ; preds = %resolver_else define i32 @caller7._Mdpb2Mfrintts() #14 { ; CHECK-LABEL: define i32 @caller7._Mdpb2Mfrintts( -; CHECK-SAME: ) local_unnamed_addr #[[ATTR13:[0-9]+]] { +; CHECK-SAME: ) #[[ATTR14:[0-9]+]] { ; CHECK: [[CALL:%.*]] = tail call i32 @test_alternative_names._Mdpb2Mfrintts() ; entry: @@ -320,7 +413,7 @@ entry: define i32 @caller7._Mfrintts() #17 { ; 
CHECK-LABEL: define i32 @caller7._Mfrintts( -; CHECK-SAME: ) local_unnamed_addr #[[ATTR16:[0-9]+]] { +; CHECK-SAME: ) #[[ATTR17:[0-9]+]] { ; CHECK: [[CALL:%.*]] = tail call i32 @test_alternative_names() ; entry: @@ -330,7 +423,7 @@ entry: define i32 @caller7._Mrcpc2() #16 { ; CHECK-LABEL: define i32 @caller7._Mrcpc2( -; CHECK-SAME: ) local_unnamed_addr #[[ATTR15:[0-9]+]] { +; CHECK-SAME: ) #[[ATTR16:[0-9]+]] { ; CHECK: [[CALL:%.*]] = tail call i32 @test_alternative_names._Mrcpc2() ; entry: @@ -340,7 +433,7 @@ entry: define i32 @caller7.default() #0 { ; CHECK-LABEL: define i32 @caller7.default( -; CHECK-SAME: ) local_unnamed_addr #[[ATTR0]] { +; CHECK-SAME: ) #[[ATTR0]] { ; CHECK: [[CALL:%.*]] = tail call i32 @test_alternative_names.default() ; entry: @@ -348,6 +441,159 @@ entry: ret i32 %call } +define weak_odr ptr @caller7.resolver() comdat { +; CHECK-LABEL: define weak_odr ptr @caller7.resolver() comdat { +resolver_entry: + tail call void @__init_cpu_features_resolver() + %0 = load i64, ptr @__aarch64_cpu_features, align 8 + %1 = and i64 %0, 17563904 + %2 = icmp eq i64 %1, 17563904 + br i1 %2, label %common.ret, label %resolver_else + +common.ret: ; preds = %resolver_else2, %resolver_else, %resolver_entry + %common.ret.op = phi ptr [ @caller7._Mdpb2Mfrintts, %resolver_entry ], [ @caller7._Mfrintts, %resolver_else ], [ %caller7._Mrcpc2.caller7.default, %resolver_else2 ] + ret ptr %common.ret.op + +resolver_else: ; preds = %resolver_entry + %3 = and i64 %0, 16777472 + %4 = icmp eq i64 %3, 16777472 + br i1 %4, label %common.ret, label %resolver_else2 + +resolver_else2: ; preds = %resolver_else + %5 = and i64 %0, 12582912 + %6 = icmp eq i64 %5, 12582912 + %caller7._Mrcpc2.caller7.default = select i1 %6, ptr @caller7._Mrcpc2, ptr @caller7.default + br label %common.ret +} + +declare i32 @test_unrelated_callers._Mmops() #3 +declare i32 @test_unrelated_callers._Msve2() #2 +declare i32 @test_unrelated_callers._Msve() #1 +declare i32 @test_unrelated_callers.default() #0 + 
+define weak_odr ptr @test_unrelated_callers.resolver() comdat { +; CHECK-LABEL: define weak_odr ptr @test_unrelated_callers.resolver() comdat { +resolver_entry: + tail call void @__init_cpu_features_resolver() + %0 = load i64, ptr @__aarch64_cpu_features, align 8 + %1 = and i64 %0, 576460752303423488 + %.not = icmp eq i64 %1, 0 + br i1 %.not, label %resolver_else, label %common.ret + +common.ret: ; preds = %resolver_else2, %resolver_else, %resolver_entry + %common.ret.op = phi ptr [ @test_unrelated_callers._Mmops, %resolver_entry ], [ @test_unrelated_callers._Msve2, %resolver_else ], [ %test_unrelated_callers._Msve.test_unrelated_callers.default, %resolver_else2 ] + ret ptr %common.ret.op + +resolver_else: ; preds = %resolver_entry + %2 = and i64 %0, 69793284352 + %3 = icmp eq i64 %2, 69793284352 + br i1 %3, label %common.ret, label %resolver_else2 + +resolver_else2: ; preds = %resolver_else + %4 = and i64 %0, 1073807616 + %5 = icmp eq i64 %4, 1073807616 + %test_unrelated_callers._Msve.test_unrelated_callers.default = select i1 %5, ptr @test_unrelated_callers._Msve, ptr @test_unrelated_callers.default + br label %common.ret +} + +define i32 @caller8._MmopsMsve2() #4 { +; CHECK-LABEL: define i32 @caller8._MmopsMsve2( +; CHECK-SAME: ) #[[ATTR4]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_unrelated_callers._Mmops() +; +entry: + %call = tail call i32 @test_unrelated_callers() + ret i32 %call +} + +define i32 @caller8._Msve() #1 { +; CHECK-LABEL: define i32 @caller8._Msve( +; CHECK-SAME: ) #[[ATTR1]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_unrelated_callers() +; +entry: + %call = tail call i32 @test_unrelated_callers() + ret i32 %call +} + +define i32 @caller8.default() #0 { +; CHECK-LABEL: define i32 @caller8.default( +; CHECK-SAME: ) #[[ATTR0]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_unrelated_callers() +; +entry: + %call = tail call i32 @test_unrelated_callers() + ret i32 %call +} + +define weak_odr ptr @caller8.resolver() comdat { +; CHECK-LABEL: 
define weak_odr ptr @caller8.resolver() comdat { +resolver_entry: + tail call void @__init_cpu_features_resolver() + %0 = load i64, ptr @__aarch64_cpu_features, align 8 + %1 = and i64 %0, 576460822096707840 + %2 = icmp eq i64 %1, 576460822096707840 + %3 = and i64 %0, 1073807616 + %4 = icmp eq i64 %3, 1073807616 + %caller8._Msve.caller8.default = select i1 %4, ptr @caller8._Msve, ptr @caller8.default + %common.ret.op = select i1 %2, ptr @caller8._MmopsMsve2, ptr %caller8._Msve.caller8.default + ret ptr %common.ret.op +} + +define i32 @caller9._Mmops() #3 { +; CHECK-LABEL: define i32 @caller9._Mmops( +; CHECK-SAME: ) #[[ATTR3]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_unrelated_callers._Mmops() +; +entry: + %call = tail call i32 @test_unrelated_callers() + ret i32 %call +} + +define i32 @caller9._Msve2() #2 { +; CHECK-LABEL: define i32 @caller9._Msve2( +; CHECK-SAME: ) #[[ATTR2]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_unrelated_callers._Msve2() +; +entry: + %call = tail call i32 @test_unrelated_callers() + ret i32 %call +} + +define i32 @caller9.default() #0 { +; CHECK-LABEL: define i32 @caller9.default( +; CHECK-SAME: ) #[[ATTR0]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_unrelated_callers() +; +entry: + %call = tail call i32 @test_unrelated_callers() + ret i32 %call +} + +define weak_odr ptr @caller9.resolver() comdat { +; CHECK-LABEL: define weak_odr ptr @caller9.resolver() comdat { +resolver_entry: + tail call void @__init_cpu_features_resolver() + %0 = load i64, ptr @__aarch64_cpu_features, align 8 + %1 = and i64 %0, 576460752303423488 + %.not = icmp eq i64 %1, 0 + %2 = and i64 %0, 69793284352 + %3 = icmp eq i64 %2, 69793284352 + %caller9._Msve2.caller9.default = select i1 %3, ptr @caller9._Msve2, ptr @caller9.default + %common.ret.op = select i1 %.not, ptr %caller9._Msve2.caller9.default, ptr @caller9._Mmops + ret ptr %common.ret.op +} + +define i32 @caller10() #18 { +; CHECK-LABEL: define i32 @caller10( +; CHECK-SAME: ) local_unnamed_addr 
#[[ATTR18:[0-9]+]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_unrelated_callers._Mmops() +; +entry: + %call = tail call i32 @test_unrelated_callers() + ret i32 %call +} + attributes #0 = { "fmv-features" } attributes #1 = { "fmv-features"="sve" } attributes #2 = { "fmv-features"="sve2" } @@ -359,10 +605,11 @@ attributes #7 = { "fmv-features"="sm4" } attributes #8 = { "target-features"="+aes,+fp-armv8,+neon,+outline-atomics,+v8a" } attributes #9 = { "target-features"="+fp-armv8,+neon,+outline-atomics,+v8a,+sm4" } attributes #10 = { "fmv-features"="sve2-sha3" } -attributes #11 = { "fmv-features"="ls64,ssbs" } +attributes #11 = { "fmv-features"="cssc,ssbs" } attributes #12 = { "fmv-features"="flagm,lse,rng" } -attributes #13 = { "fmv-features"="flagm,ls64,lse,rng,ssbs,sve2-sha3" } +attributes #13 = { "fmv-features"="cssc,flagm,lse,rng,ssbs,sve2-sha3" } attributes #14 = { "fmv-features"="dpb2,frintts" } attributes #15 = { "fmv-features"="flagm2,frintts" } attributes #16 = { "fmv-features"="rcpc2" } attributes #17 = { "fmv-features"="frintts" } +attributes #18 = { "target-features"="+fp-armv8,+mops,+neon,+outline-atomics,+sve,+v8a" } From 28991c037b268788e3ca292d84fce4196ab3bd48 Mon Sep 17 00:00:00 2001 From: Alexandros Lamprineas Date: Mon, 22 Sep 2025 09:50:18 +0100 Subject: [PATCH 2/6] rename var --- llvm/lib/Transforms/IPO/GlobalOpt.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp index 0707eb5eacf5d..cf01936ff5611 100644 --- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp +++ b/llvm/lib/Transforms/IPO/GlobalOpt.cpp @@ -2585,12 +2585,12 @@ static bool OptimizeNonTrivialIFuncs( if (auto *CB = dyn_cast(U)) { if (CB->getCalledOperand() == CalleeIF) { Function *Caller = CB->getFunction(); - GlobalIFunc *CallerIFunc = nullptr; + GlobalIFunc *CallerIF = nullptr; TargetTransformInfo &TTI = GetTTI(*Caller); bool CallerIsFMV = TTI.isMultiversionedFunction(*Caller); // The 
caller is a version of a known IFunc. if (auto It = VersionOf.find(Caller); It != VersionOf.end()) - CallerIFunc = It->second; + CallerIF = It->second; else if (!CallerIsFMV && OptimizeNonFMVCallers) { // The caller is non-FMV. auto [It, Inserted] = FeatureMask.try_emplace(Caller); @@ -2602,7 +2602,7 @@ static bool OptimizeNonTrivialIFuncs( auto [It, Inserted] = CallSites.try_emplace(Caller); if (Inserted) { if (CallerIsFMV) - CallerIFuncs.push_back(CallerIFunc); + CallerIFuncs.push_back(CallerIF); else NonFMVCallers.push_back(Caller); } From 704fbe455a0f0fc5441d8ff03588284d9fa9c03f Mon Sep 17 00:00:00 2001 From: Alexandros Lamprineas Date: Mon, 22 Sep 2025 13:26:08 +0100 Subject: [PATCH 3/6] Change caller8._Msve -> caller8._Msve2 and caller9._Msve2 -> caller9._Msve, and remove callsite from caller9._Msve. --- .../Transforms/GlobalOpt/resolve-fmv-ifunc.ll | 35 +++++++++---------- 1 file changed, 16 insertions(+), 19 deletions(-) diff --git a/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll b/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll index 7ace67e3857ff..156c49c8b6677 100644 --- a/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll +++ b/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll @@ -506,9 +506,9 @@ entry: ret i32 %call } -define i32 @caller8._Msve() #1 { -; CHECK-LABEL: define i32 @caller8._Msve( -; CHECK-SAME: ) #[[ATTR1]] { +define dso_local i32 @caller8._Msve2() #2 { +; CHECK-LABEL: define dso_local i32 @caller8._Msve2( +; CHECK-SAME: ) #[[ATTR2]] { ; CHECK: [[CALL:%.*]] = tail call i32 @test_unrelated_callers() ; entry: @@ -533,10 +533,10 @@ resolver_entry: %0 = load i64, ptr @__aarch64_cpu_features, align 8 %1 = and i64 %0, 576460822096707840 %2 = icmp eq i64 %1, 576460822096707840 - %3 = and i64 %0, 1073807616 - %4 = icmp eq i64 %3, 1073807616 - %caller8._Msve.caller8.default = select i1 %4, ptr @caller8._Msve, ptr @caller8.default - %common.ret.op = select i1 %2, ptr @caller8._MmopsMsve2, ptr %caller8._Msve.caller8.default + %3 = and 
i64 %0, 69793284352 + %4 = icmp eq i64 %3, 69793284352 + %caller8._Msve2.caller8.default = select i1 %4, ptr @caller8._Msve2, ptr @caller8.default + %common.ret.op = select i1 %2, ptr @caller8._MmopsMsve2, ptr %caller8._Msve2.caller8.default ret ptr %common.ret.op } @@ -550,20 +550,17 @@ entry: ret i32 %call } -define i32 @caller9._Msve2() #2 { -; CHECK-LABEL: define i32 @caller9._Msve2( -; CHECK-SAME: ) #[[ATTR2]] { -; CHECK: [[CALL:%.*]] = tail call i32 @test_unrelated_callers._Msve2() -; +define i32 @caller9._Msve() #1 { +; CHECK-LABEL: define i32 @caller9._Msve( +; CHECK-SAME: ) #[[ATTR1]] { entry: - %call = tail call i32 @test_unrelated_callers() - ret i32 %call + ret i32 1 } define i32 @caller9.default() #0 { ; CHECK-LABEL: define i32 @caller9.default( ; CHECK-SAME: ) #[[ATTR0]] { -; CHECK: [[CALL:%.*]] = tail call i32 @test_unrelated_callers() +; CHECK: [[CALL:%.*]] = tail call i32 @test_unrelated_callers.default() ; entry: %call = tail call i32 @test_unrelated_callers() @@ -577,10 +574,10 @@ resolver_entry: %0 = load i64, ptr @__aarch64_cpu_features, align 8 %1 = and i64 %0, 576460752303423488 %.not = icmp eq i64 %1, 0 - %2 = and i64 %0, 69793284352 - %3 = icmp eq i64 %2, 69793284352 - %caller9._Msve2.caller9.default = select i1 %3, ptr @caller9._Msve2, ptr @caller9.default - %common.ret.op = select i1 %.not, ptr %caller9._Msve2.caller9.default, ptr @caller9._Mmops + %2 = and i64 %0, 1073807616 + %3 = icmp eq i64 %2, 1073807616 + %caller9._Msve.caller9.default = select i1 %3, ptr @caller9._Msve, ptr @caller9.default + %common.ret.op = select i1 %.not, ptr %caller9._Msve.caller9.default, ptr @caller9._Mmops ret ptr %common.ret.op } From 94ec4ab8755c4d75c00dcff9523ac838c3cb2fda Mon Sep 17 00:00:00 2001 From: Alexandros Lamprineas Date: Tue, 23 Sep 2025 16:29:50 +0100 Subject: [PATCH 4/6] Keep track of unavailable features from previous callers. This allows smarter elimination of candidates at the expense of more comparisons. 
--- llvm/lib/Transforms/IPO/GlobalOpt.cpp | 59 +++++++++++-------- .../Transforms/GlobalOpt/resolve-fmv-ifunc.ll | 4 +- 2 files changed, 38 insertions(+), 25 deletions(-) diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp index cf01936ff5611..a071ebc2fd628 100644 --- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp +++ b/llvm/lib/Transforms/IPO/GlobalOpt.cpp @@ -2618,6 +2618,8 @@ static bool OptimizeNonTrivialIFuncs( SmallVectorImpl &Callees) { // Index to the current callee candidate. unsigned I = 0; + // Feature bits from callers of previous iterations. + SmallVector KnownBits; // Try to redirect calls starting from higher priority callers. for (Function *Caller : Callers) { @@ -2625,19 +2627,41 @@ static bool OptimizeNonTrivialIFuncs( break; bool CallerIsFMV = GetTTI(*Caller).isMultiversionedFunction(*Caller); + // We can't reason much about non-FMV callers. Just pick the highest + // priority callee if it matches, otherwise bail. + if (!CallerIsFMV) + assert(I == 0 && "Should only select the highest priority candidate"); + + APInt CallerBits = FeatureMask[Caller]; + APInt CalleeBits = FeatureMask[Callees[I]]; // In the case of FMV callers, we know that all higher priority callers // than the current one did not get selected at runtime, which helps // reason about the callees (if they have versions that mandate presence // of the features which we already know are unavailable on this - // target). - if (!CallerIsFMV) - // We can't reason much about non-FMV callers. Just pick the highest - // priority callee if it matches, otherwise bail. - assert(I == 0 && "Should only select the highest priority candidate"); - + // target, then we can skip over those versions/candidates). + if (CallerIsFMV) { + // Discard feature bits that are known to be available + // in the current iteration. 
+ for (APInt &Version: KnownBits) + if (CallerBits.isSubsetOf(Version)) + Version &= ~CallerBits; + // Keep advancing the candidate index as long as the unavailable + // features are a subset of the current candidate's. + unsigned J = 0; + while (J < KnownBits.size()) { + APInt Version = KnownBits[J]; + if (Version.isSubsetOf(CalleeBits)) { + if (++I == Callees.size()) + break; + CalleeBits = FeatureMask[Callees[I]]; + // Start over. + J = 0; + } else + ++J; + } + KnownBits.push_back(CallerBits); + } Function *Callee = Callees[I]; - APInt CallerBits = FeatureMask[Caller]; - APInt CalleeBits = FeatureMask[Callee]; // If the feature set of the caller implies the feature set of the // highest priority candidate then it shall be picked. if (CalleeBits.isSubsetOf(CallerBits)) { @@ -2652,30 +2676,19 @@ static bool OptimizeNonTrivialIFuncs( Changed = true; } } - // Keep advancing the candidate index as long as the caller's - // features are a subset of the current candidate's. - if (CallerIsFMV) { - while (CallerBits.isSubsetOf(CalleeBits)) { - if (++I == Callees.size()) - break; - CalleeBits = FeatureMask[Callees[I]]; - } - } } }; auto &Callees = VersionedFuncs[CalleeIF]; // Optimize non-FMV calls. - if (!NonFMVCallers.empty() && OptimizeNonFMVCallers) + if (OptimizeNonFMVCallers) redirectCalls(NonFMVCallers, Callees); // Optimize FMV calls. 
- if (!CallerIFuncs.empty()) { - for (GlobalIFunc *CallerIF : CallerIFuncs) { - auto &Callers = VersionedFuncs[CallerIF]; - redirectCalls(Callers, Callees); - } + for (GlobalIFunc *CallerIF : CallerIFuncs) { + auto &Callers = VersionedFuncs[CallerIF]; + redirectCalls(Callers, Callees); } if (CalleeIF->use_empty() || diff --git a/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll b/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll index 156c49c8b6677..e6706b1ced217 100644 --- a/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll +++ b/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll @@ -156,7 +156,7 @@ entry: define i32 @caller2._Msve() #1 { ; CHECK-LABEL: define i32 @caller2._Msve( ; CHECK-SAME: ) #[[ATTR1]] { -; CHECK: [[CALL:%.*]] = tail call i32 @test_multi_bb_resolver() +; CHECK: [[CALL:%.*]] = tail call i32 @test_multi_bb_resolver._Msve() ; entry: %call = tail call i32 @test_multi_bb_resolver() @@ -509,7 +509,7 @@ entry: define dso_local i32 @caller8._Msve2() #2 { ; CHECK-LABEL: define dso_local i32 @caller8._Msve2( ; CHECK-SAME: ) #[[ATTR2]] { -; CHECK: [[CALL:%.*]] = tail call i32 @test_unrelated_callers() +; CHECK: [[CALL:%.*]] = tail call i32 @test_unrelated_callers._Msve2() ; entry: %call = tail call i32 @test_unrelated_callers() From bd9b454e6a3a8a1746a9752ca49ced1f7a1734b2 Mon Sep 17 00:00:00 2001 From: Alexandros Lamprineas Date: Tue, 23 Sep 2025 19:18:18 +0100 Subject: [PATCH 5/6] clang format --- llvm/lib/Transforms/IPO/GlobalOpt.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp index a071ebc2fd628..ab9ed9efad317 100644 --- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp +++ b/llvm/lib/Transforms/IPO/GlobalOpt.cpp @@ -2642,7 +2642,7 @@ static bool OptimizeNonTrivialIFuncs( if (CallerIsFMV) { // Discard feature bits that are known to be available // in the current iteration. 
- for (APInt &Version: KnownBits) + for (APInt &Version : KnownBits) if (CallerBits.isSubsetOf(Version)) Version &= ~CallerBits; // Keep advancing the candidate index as long as the unavailable From 98f9197a4da5144a690524d0f098ca2008a7ea3b Mon Sep 17 00:00:00 2001 From: Alexandros Lamprineas Date: Wed, 24 Sep 2025 16:04:59 +0100 Subject: [PATCH 6/6] When disregarding feature bits that are known to be available in the current iteration, the lifespan of this knowledge should expire in the next iteration. Therefore we should not clear those bits from KnownBits. --- llvm/lib/Transforms/IPO/GlobalOpt.cpp | 9 +- .../Transforms/GlobalOpt/resolve-fmv-ifunc.ll | 92 ++++++++++++++++++- 2 files changed, 93 insertions(+), 8 deletions(-) diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp index ab9ed9efad317..4449c1e74a612 100644 --- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp +++ b/llvm/lib/Transforms/IPO/GlobalOpt.cpp @@ -2640,16 +2640,13 @@ static bool OptimizeNonTrivialIFuncs( // of the features which we already know are unavailable on this // target, then we can skip over those versions/candidates). if (CallerIsFMV) { - // Discard feature bits that are known to be available - // in the current iteration. - for (APInt &Version : KnownBits) - if (CallerBits.isSubsetOf(Version)) - Version &= ~CallerBits; // Keep advancing the candidate index as long as the unavailable // features are a subset of the current candidate's. unsigned J = 0; while (J < KnownBits.size()) { - APInt Version = KnownBits[J]; + // Discard feature bits that are known to be available + // in the current iteration. 
+ APInt Version = KnownBits[J] & ~CallerBits; if (Version.isSubsetOf(CalleeBits)) { if (++I == Callees.size()) break; diff --git a/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll b/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll index e6706b1ced217..3a6866c4e16a4 100644 --- a/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll +++ b/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter "call i32 @(test_single_bb_resolver|test_multi_bb_resolver|test_caller_feats_not_implied|test_non_fmv_caller|test_priority|test_alternative_names|test_unrelated_callers)" --version 4 +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter "call i32 @(test_single_bb_resolver|test_multi_bb_resolver|test_caller_feats_not_implied|test_non_fmv_caller|test_priority|test_alternative_names|test_unrelated_callers|test_clear_known_bits)" --version 4 ; REQUIRES: aarch64-registered-target @@ -14,6 +14,7 @@ $test_non_fmv_caller.resolver = comdat any $test_priority.resolver = comdat any $test_alternative_names.resolver = comdat any $test_unrelated_callers.resolver = comdat any +$test_clear_known_bits.resolver = comdat any $caller1.resolver = comdat any $caller2.resolver = comdat any $caller3.resolver = comdat any @@ -21,6 +22,7 @@ $caller6.resolver = comdat any $caller7.resolver = comdat any $caller8.resolver = comdat any $caller9.resolver = comdat any +$caller11.resolver = comdat any @__aarch64_cpu_features = external local_unnamed_addr global { i64 } @@ -31,6 +33,7 @@ $caller9.resolver = comdat any @test_priority = weak_odr ifunc i32 (), ptr @test_priority.resolver @test_alternative_names = weak_odr ifunc i32 (), ptr @test_alternative_names.resolver @test_unrelated_callers = weak_odr ifunc i32 (), ptr @test_unrelated_callers.resolver +@test_clear_known_bits = weak_odr ifunc i32 (), ptr @test_clear_known_bits.resolver @caller1 = weak_odr ifunc 
i32 (), ptr @caller1.resolver @caller2 = weak_odr ifunc i32 (), ptr @caller2.resolver @caller3 = weak_odr ifunc i32 (), ptr @caller3.resolver @@ -38,6 +41,7 @@ $caller9.resolver = comdat any @caller7 = weak_odr ifunc i32 (), ptr @caller7.resolver @caller8 = weak_odr ifunc i32 (), ptr @caller8.resolver @caller9 = weak_odr ifunc i32 (), ptr @caller9.resolver +@caller11 = weak_odr ifunc i32 (), ptr @caller11.resolver declare void @__init_cpu_features_resolver() local_unnamed_addr @@ -156,7 +160,7 @@ entry: define i32 @caller2._Msve() #1 { ; CHECK-LABEL: define i32 @caller2._Msve( ; CHECK-SAME: ) #[[ATTR1]] { -; CHECK: [[CALL:%.*]] = tail call i32 @test_multi_bb_resolver._Msve() +; CHECK: [[CALL:%.*]] = tail call i32 @test_multi_bb_resolver() ; entry: %call = tail call i32 @test_multi_bb_resolver() @@ -591,6 +595,89 @@ entry: ret i32 %call } +declare i32 @test_clear_known_bits._Mmops() #3 +declare i32 @test_clear_known_bits._Maes() #6 +declare i32 @test_clear_known_bits.default() #0 + +define weak_odr ptr @test_clear_known_bits.resolver() comdat { +; CHECK-LABEL: define weak_odr ptr @test_clear_known_bits.resolver() comdat { +resolver_entry: + tail call void @__init_cpu_features_resolver() + %0 = load i64, ptr @__aarch64_cpu_features, align 8 + %1 = and i64 %0, 576460752303423488 + %.not = icmp eq i64 %1, 0 + %2 = and i64 %0, 33536 + %3 = icmp eq i64 %2, 33536 + %test_clear_known_bits._Maes.test_clear_known_bits.default = select i1 %3, ptr @test_clear_known_bits._Maes, ptr @test_clear_known_bits.default + %common.ret.op = select i1 %.not, ptr %test_clear_known_bits._Maes.test_clear_known_bits.default, ptr @test_clear_known_bits._Mmops + ret ptr %common.ret.op +} + +define i32 @caller11._MmopsMsve2() #4 { +; CHECK-LABEL: define i32 @caller11._MmopsMsve2( +; CHECK-SAME: ) #[[ATTR4]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_clear_known_bits._Mmops() +; +entry: + %call = tail call i32 @test_clear_known_bits() + ret i32 %call +} + +define i32 @caller11._Msme() #5 { +; 
CHECK-LABEL: define i32 @caller11._Msme( +; CHECK-SAME: ) #[[ATTR5:[0-9]+]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_clear_known_bits() +; +entry: + %call = tail call i32 @test_clear_known_bits() + ret i32 %call +} + +define noundef i32 @caller11._MaesMsve2() #19 { +; CHECK-LABEL: define noundef i32 @caller11._MaesMsve2( +; CHECK-SAME: ) #[[ATTR19:[0-9]+]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_clear_known_bits._Maes() +; +entry: + %call = tail call i32 @test_clear_known_bits() + ret i32 %call +} + +define i32 @caller11.default() #0 { +; CHECK-LABEL: define i32 @caller11.default( +; CHECK-SAME: ) #[[ATTR0]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_clear_known_bits() +; +entry: + %call = tail call i32 @test_clear_known_bits() + ret i32 %call +} + +define weak_odr ptr @caller11.resolver() comdat { +; CHECK-LABEL: define weak_odr ptr @caller11.resolver() comdat { +resolver_entry: + tail call void @__init_cpu_features_resolver() + %0 = load i64, ptr @__aarch64_cpu_features, align 8 + %1 = and i64 %0, 576460822096707840 + %2 = icmp eq i64 %1, 576460822096707840 + br i1 %2, label %common.ret, label %resolver_else + +common.ret: ; preds = %resolver_else2, %resolver_else, %resolver_entry + %common.ret.op = phi ptr [ @caller11._MmopsMsve2, %resolver_entry ], [ @caller11._Msme, %resolver_else ], [ %caller11._MaesMsve2.caller11.default, %resolver_else2 ] + ret ptr %common.ret.op + +resolver_else: ; preds = %resolver_entry + %3 = and i64 %0, 4398180795136 + %4 = icmp eq i64 %3, 4398180795136 + br i1 %4, label %common.ret, label %resolver_else2 + +resolver_else2: ; preds = %resolver_else + %5 = and i64 %0, 69793317632 + %6 = icmp eq i64 %5, 69793317632 + %caller11._MaesMsve2.caller11.default = select i1 %6, ptr @caller11._MaesMsve2, ptr @caller11.default + br label %common.ret +} + attributes #0 = { "fmv-features" } attributes #1 = { "fmv-features"="sve" } attributes #2 = { "fmv-features"="sve2" } @@ -610,3 +697,4 @@ attributes #15 = { 
"fmv-features"="flagm2,frintts" } attributes #16 = { "fmv-features"="rcpc2" } attributes #17 = { "fmv-features"="frintts" } attributes #18 = { "target-features"="+fp-armv8,+mops,+neon,+outline-atomics,+sve,+v8a" } +attributes #19 = { "fmv-features"="aes,sve2" }