@@ -140,6 +140,7 @@ cl::opt<bool> MemProfRequireDefinitionForPromotion(
 } // namespace llvm
 
 extern cl::opt<bool> MemProfReportHintedSizes;
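+// When set below 100, an allocation with an ambiguous type is hinted cold if
+// at least that percent of its profiled bytes come from cold contexts (see
+// the heuristic in assignFunctions()).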
+extern cl::opt<unsigned> MinClonedColdBytePercent;
 
 namespace {
 /// CRTP base for graphs built from either IR or ThinLTO summary index.
@@ -617,6 +618,11 @@ class CallsiteContextGraph {
     static_cast<DerivedCCG *>(this)->updateAllocationCall(Call, AllocType);
   }
 
+  /// Get the AllocationType assigned to the given allocation instruction clone.
+  AllocationType getAllocationCallType(const CallInfo &Call) const {
+    return static_cast<const DerivedCCG *>(this)->getAllocationCallType(Call);
+  }
+
   /// Update non-allocation call to invoke (possibly cloned) function
   /// CalleeFunc.
   void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc) {
@@ -711,7 +717,8 @@ class CallsiteContextGraph {
 
   /// Map from each contextID to the profiled full contexts and their total
   /// sizes (there may be more than one due to context trimming),
-  /// optionally populated when requested (via MemProfReportHintedSizes).
+  /// optionally populated when requested (via MemProfReportHintedSizes or
+  /// MinClonedColdBytePercent).
   DenseMap<uint32_t, std::vector<ContextTotalSize>> ContextIdToContextSizeInfos;
 
   /// Identifies the context node created for a stack id when adding the MIB
@@ -773,6 +780,7 @@ class ModuleCallsiteContextGraph
   uint64_t getLastStackId(Instruction *Call);
   std::vector<uint64_t> getStackIdsWithContextNodesForCall(Instruction *Call);
   void updateAllocationCall(CallInfo &Call, AllocationType AllocType);
+  AllocationType getAllocationCallType(const CallInfo &Call) const;
   void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);
   CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
                        Instruction *>::FuncInfo
@@ -852,6 +860,7 @@ class IndexCallsiteContextGraph
   uint64_t getLastStackId(IndexCall &Call);
   std::vector<uint64_t> getStackIdsWithContextNodesForCall(IndexCall &Call);
   void updateAllocationCall(CallInfo &Call, AllocationType AllocType);
+  AllocationType getAllocationCallType(const CallInfo &Call) const;
   void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);
   CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
                        IndexCall>::FuncInfo
@@ -1201,8 +1210,7 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addStackNodesForMIB(
 
   ContextIdToAllocationType[++LastContextId] = AllocType;
 
-  if (MemProfReportHintedSizes) {
-    assert(!ContextSizeInfo.empty());
+  if (!ContextSizeInfo.empty()) {
     auto &Entry = ContextIdToContextSizeInfos[LastContextId];
     Entry.insert(Entry.begin(), ContextSizeInfo.begin(), ContextSizeInfo.end());
   }
@@ -2043,14 +2051,15 @@ IndexCallsiteContextGraph::IndexCallsiteContextGraph(
           CallStack<MIBInfo, SmallVector<unsigned>::const_iterator>
               EmptyContext;
           unsigned I = 0;
-          assert(!MemProfReportHintedSizes ||
-                 AN.ContextSizeInfos.size() == AN.MIBs.size());
+          assert(
+              (!MemProfReportHintedSizes && MinClonedColdBytePercent >= 100) ||
+              AN.ContextSizeInfos.size() == AN.MIBs.size());
           // Now add all of the MIBs and their stack nodes.
           for (auto &MIB : AN.MIBs) {
             CallStack<MIBInfo, SmallVector<unsigned>::const_iterator>
                 StackContext(&MIB);
             std::vector<ContextTotalSize> ContextSizeInfo;
-            if (MemProfReportHintedSizes) {
+            if (!AN.ContextSizeInfos.empty()) {
               for (auto [FullStackId, TotalSize] : AN.ContextSizeInfos[I])
                 ContextSizeInfo.push_back({FullStackId, TotalSize});
             }
@@ -2825,6 +2834,7 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::printTotalSizes(
     if (!Node->IsAllocation)
      continue;
     DenseSet<uint32_t> ContextIds = Node->getContextIds();
+    auto AllocTypeFromCall = getAllocationCallType(Node->Call);
     std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
     std::sort(SortedIds.begin(), SortedIds.end());
     for (auto Id : SortedIds) {
@@ -2837,7 +2847,11 @@
              << getAllocTypeString((uint8_t)TypeI->second)
              << " full allocation context " << Info.FullStackId
              << " with total size " << Info.TotalSize << " is "
-             << getAllocTypeString(Node->AllocTypes) << " after cloning\n";
+             << getAllocTypeString(Node->AllocTypes) << " after cloning";
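+          // Note when the cold byte percent heuristic in assignFunctions()
+          // overrode the type implied by this node's context types alone.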
+          if (allocTypeToUse(Node->AllocTypes) != AllocTypeFromCall)
+            OS << " marked " << getAllocTypeString((uint8_t)AllocTypeFromCall)
+               << " due to cold byte percent";
+          OS << "\n";
         }
       }
     }
@@ -3487,6 +3501,23 @@ void IndexCallsiteContextGraph::updateAllocationCall(CallInfo &Call,
   AI->Versions[Call.cloneNo()] = (uint8_t)AllocType;
 }
 
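+// For the IR case, read the hint back from the "memprof" string attribute
+// that updateAllocationCall() sets on the call ("cold" or "notcold").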
+AllocationType
+ModuleCallsiteContextGraph::getAllocationCallType(const CallInfo &Call) const {
+  const auto *CB = cast<CallBase>(Call.call());
+  if (!CB->getAttributes().hasFnAttr("memprof"))
+    return AllocationType::None;
+  return CB->getAttributes().getFnAttr("memprof").getValueAsString() == "cold"
+             ? AllocationType::Cold
+             : AllocationType::NotCold;
+}
+
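+// For the ThinLTO summary case, read the type that updateAllocationCall()
+// recorded for this clone in the allocation summary's Versions vector.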
+AllocationType
+IndexCallsiteContextGraph::getAllocationCallType(const CallInfo &Call) const {
+  const auto *AI = Call.call().dyn_cast<AllocInfo *>();
+  assert(AI->Versions.size() > Call.cloneNo());
+  return (AllocationType)AI->Versions[Call.cloneNo()];
+}
+
 void ModuleCallsiteContextGraph::updateCall(CallInfo &CallerCall,
                                             FuncInfo CalleeFunc) {
   if (CalleeFunc.cloneNo() > 0)
@@ -4017,6 +4048,9 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
     }
   }
 
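+  // Bitmask matching allocation nodes reached by both cold and not-cold
+  // contexts, i.e. those whose type remains ambiguous after cloning.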
+  uint8_t BothTypes =
+      (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
+
   auto UpdateCalls = [&](ContextNode *Node,
                          DenseSet<const ContextNode *> &Visited,
                          auto &&UpdateCalls) {
@@ -4036,7 +4070,31 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
       return;
 
     if (Node->IsAllocation) {
-      updateAllocationCall(Node->Call, allocTypeToUse(Node->AllocTypes));
+      auto AT = allocTypeToUse(Node->AllocTypes);
+      // If the allocation type is ambiguous, and more aggressive hinting
+      // has been enabled via the MinClonedColdBytePercent flag, see if this
+      // allocation should be hinted cold anyway because its fraction of cold
+      // bytes allocated is at least the given threshold.
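+      // E.g. with a threshold of 80, an ambiguous allocation where 900 of its
+      // 1000 profiled bytes (90%) come from cold contexts is hinted cold.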
+      if (Node->AllocTypes == BothTypes && MinClonedColdBytePercent < 100 &&
+          !ContextIdToContextSizeInfos.empty()) {
+        uint64_t TotalCold = 0;
+        uint64_t Total = 0;
+        for (auto Id : Node->getContextIds()) {
+          auto TypeI = ContextIdToAllocationType.find(Id);
+          assert(TypeI != ContextIdToAllocationType.end());
+          auto CSI = ContextIdToContextSizeInfos.find(Id);
+          if (CSI != ContextIdToContextSizeInfos.end()) {
+            for (auto &Info : CSI->second) {
+              Total += Info.TotalSize;
+              if (TypeI->second == AllocationType::Cold)
+                TotalCold += Info.TotalSize;
+            }
+          }
+        }
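+        // Integer form of: TotalCold / Total >= MinClonedColdBytePercent / 100.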
+        if (TotalCold * 100 >= Total * MinClonedColdBytePercent)
+          AT = AllocationType::Cold;
+      }
+      updateAllocationCall(Node->Call, AT);
       assert(Node->MatchingCalls.empty());
       return;
     }
@@ -4419,7 +4477,11 @@ bool MemProfContextDisambiguation::applyImport(Module &M) {
           // will still be none type or should have gotten the default NotCold.
           // Skip that after calling clone helper since that does some sanity
           // checks that confirm we haven't decided yet that we need cloning.
-          if (AllocNode.Versions.size() == 1) {
+          // We might have a single version that is cold due to the
+          // MinClonedColdBytePercent heuristic; make sure we don't skip in
+          // that case.
+          if (AllocNode.Versions.size() == 1 &&
+              (AllocationType)AllocNode.Versions[0] != AllocationType::Cold) {
             assert((AllocationType)AllocNode.Versions[0] ==
                        AllocationType::NotCold ||
                    (AllocationType)AllocNode.Versions[0] ==