Skip to content

Commit 17642c7

Browse files
authored
[SamplePGO] Support -salvage-stale-profile without probes too (#86116)
Currently -salvage-stale-profile is a no-op if the profile is not probe-based. We observed that it can help for regular, non-probe-based profiles too: some of our internal benchmarks show 0.2-0.3% QPS improvement. There seems to be no good reason to limit this flag to only work for probe-based profiles.
1 parent 2ff3850 commit 17642c7

File tree

3 files changed

+256
-4
lines changed

3 files changed

+256
-4
lines changed

llvm/lib/Transforms/IPO/SampleProfileMatcher.cpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -247,10 +247,10 @@ void SampleProfileMatcher::runOnFunction(Function &F) {
247247
if (ReportProfileStaleness || PersistProfileStaleness)
248248
recordCallsiteMatchStates(F, IRAnchors, ProfileAnchors, nullptr);
249249

250-
// Run profile matching for checksum mismatched profile, currently only
251-
// support for pseudo-probe.
252-
if (SalvageStaleProfile && FunctionSamples::ProfileIsProbeBased &&
253-
!ProbeManager->profileIsValid(F, *FSFlattened)) {
250+
// For probe-based profiles, run matching only when the current profile is not
251+
// valid.
252+
if (SalvageStaleProfile && (!FunctionSamples::ProfileIsProbeBased ||
253+
!ProbeManager->profileIsValid(F, *FSFlattened))) {
254254
// For imported functions, the checksum metadata(pseudo_probe_desc) are
255255
// dropped, so we leverage function attribute(profile-checksum-mismatch) to
256256
// transfer the info: add the attribute during pre-link phase and check it
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
main:9229397:0
2+
0: 0
3+
1: 0
4+
1.1: 47663
5+
1.2: 51871
6+
2: 48723
7+
3: 48723 bar:49018
8+
4: 49087
9+
5: 51871 bar:49588
10+
7: 0
11+
2: foo:1479916
12+
1: 47663
13+
1.1: 46683 bar:43238
14+
2: 4519 bar:4932
15+
3: 48723
16+
4: foo:1505537
17+
1: 48604
18+
1.1: 46965 bar:44479
19+
2: 4613 bar:4967
20+
3: 49087
21+
bar:2333388:196222
22+
0: 194449
23+
1: 194449
Lines changed: 229 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,229 @@
1+
; REQUIRES: x86_64-linux
2+
; REQUIRES: asserts
3+
; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/non-probe-stale-profile-matching.prof --salvage-stale-profile -S --debug-only=sample-profile,sample-profile-matcher,sample-profile-impl 2>&1 | FileCheck %s
4+
5+
; The profiled source code:
6+
7+
; volatile int x = 1;
8+
; __attribute__((noinline)) int bar(int p) {
9+
; return p;
10+
; }
11+
12+
; __attribute__((always_inline)) int foo(int i, int p) {
13+
; if (i % 10) return bar(p);
14+
; else return bar(p + 1);
15+
; }
16+
17+
; int main() {
18+
; for (int i = 0; i < 1000 * 1000; i++) {
19+
; x += foo(i, x);
20+
; x += bar(x);
21+
; x += foo(i, x);
22+
; x += bar(x);
23+
; }
24+
; }
25+
26+
; The source code for the current build:
27+
28+
; volatile int x = 1;
29+
; __attribute__((noinline)) int bar(int p) {
30+
; return p;
31+
; }
32+
33+
; __attribute__((always_inline)) int foo(int i, int p) {
34+
; if (i % 10) return bar(p);
35+
; else return bar(p + 1);
36+
; }
37+
38+
; int main() {
39+
; if (x == 0) // code change
40+
; return 0; // code change
41+
; for (int i = 0; i < 1000 * 1000; i++) {
42+
; x += foo(i, x);
43+
; x += bar(x);
44+
; if (i < 0) // code change
45+
; return 0; // code change
46+
; x += foo(i, x);
47+
; x += bar(x);
48+
; }
49+
; }
50+
51+
; CHECK: Run stale profile matching for bar
52+
53+
; CHECK: Run stale profile matching for foo
54+
; CHECK: Callsite with callee:bar is matched from 1.1 to 1.1
55+
; CHECK: Callsite with callee:bar is matched from 2 to 2
56+
57+
; CHECK: Run stale profile matching for main
58+
; CHECK: Callsite with callee:foo is matched from 4 to 2
59+
; CHECK: Callsite with callee:bar is matched from 5 to 3
60+
; CHECK: Callsite with callee:foo is matched from 8 to 4
61+
; CHECK: Callsite with callee:bar is matched from 9 to 5
62+
63+
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
64+
target triple = "x86_64-unknown-linux-gnu"
65+
66+
@x = dso_local global i32 1, align 4
67+
68+
; Function Attrs: noinline nounwind uwtable
69+
define dso_local i32 @bar(i32 noundef %p) #0 !dbg !9 {
70+
entry:
71+
ret i32 %p, !dbg !13
72+
}
73+
74+
; Function Attrs: alwaysinline nounwind uwtable
75+
define dso_local i32 @foo(i32 noundef %i, i32 noundef %p) #1 !dbg !14 {
76+
entry:
77+
%rem = srem i32 %i, 10, !dbg !15
78+
%tobool = icmp ne i32 %rem, 0, !dbg !15
79+
br i1 %tobool, label %if.then, label %if.else, !dbg !16
80+
81+
if.then: ; preds = %entry
82+
%call = call i32 @bar(i32 noundef %p), !dbg !17
83+
br label %return, !dbg !19
84+
85+
if.else: ; preds = %entry
86+
%add = add nsw i32 %p, 1, !dbg !20
87+
%call1 = call i32 @bar(i32 noundef %add), !dbg !21
88+
br label %return, !dbg !22
89+
90+
return: ; preds = %if.else, %if.then
91+
%retval.0 = phi i32 [ %call, %if.then ], [ %call1, %if.else ], !dbg !23
92+
ret i32 %retval.0, !dbg !24
93+
}
94+
95+
; Function Attrs: nounwind uwtable
96+
define dso_local i32 @main() #2 !dbg !25 {
97+
entry:
98+
%0 = load volatile i32, ptr @x, align 4, !dbg !26, !tbaa !27
99+
%cmp = icmp eq i32 %0, 0, !dbg !31
100+
br i1 %cmp, label %if.then, label %if.end, !dbg !26
101+
102+
if.then: ; preds = %entry
103+
br label %for.end, !dbg !32
104+
105+
if.end: ; preds = %entry
106+
br label %for.cond, !dbg !33
107+
108+
for.cond: ; preds = %if.end6, %if.end
109+
%i.0 = phi i32 [ 0, %if.end ], [ %inc, %if.end6 ], !dbg !34
110+
%cmp1 = icmp slt i32 %i.0, 1000000, !dbg !35
111+
br i1 %cmp1, label %for.body, label %for.cond.cleanup, !dbg !37
112+
113+
for.cond.cleanup: ; preds = %for.cond
114+
br label %cleanup, !dbg !38
115+
116+
for.body: ; preds = %for.cond
117+
%1 = load volatile i32, ptr @x, align 4, !dbg !40, !tbaa !27
118+
%call = call i32 @foo(i32 noundef %i.0, i32 noundef %1), !dbg !41
119+
%2 = load volatile i32, ptr @x, align 4, !dbg !42, !tbaa !27
120+
%add = add nsw i32 %2, %call, !dbg !42
121+
store volatile i32 %add, ptr @x, align 4, !dbg !42, !tbaa !27
122+
%3 = load volatile i32, ptr @x, align 4, !dbg !43, !tbaa !27
123+
%call2 = call i32 @bar(i32 noundef %3), !dbg !44
124+
%4 = load volatile i32, ptr @x, align 4, !dbg !45, !tbaa !27
125+
%add3 = add nsw i32 %4, %call2, !dbg !45
126+
store volatile i32 %add3, ptr @x, align 4, !dbg !45, !tbaa !27
127+
br i1 false, label %if.then5, label %if.end6, !dbg !46
128+
129+
if.then5: ; preds = %for.body
130+
br label %cleanup, !dbg !47
131+
132+
if.end6: ; preds = %for.body
133+
%5 = load volatile i32, ptr @x, align 4, !dbg !48, !tbaa !27
134+
%call7 = call i32 @foo(i32 noundef %i.0, i32 noundef %5), !dbg !49
135+
%6 = load volatile i32, ptr @x, align 4, !dbg !50, !tbaa !27
136+
%add8 = add nsw i32 %6, %call7, !dbg !50
137+
store volatile i32 %add8, ptr @x, align 4, !dbg !50, !tbaa !27
138+
%7 = load volatile i32, ptr @x, align 4, !dbg !51, !tbaa !27
139+
%call9 = call i32 @bar(i32 noundef %7), !dbg !52
140+
%8 = load volatile i32, ptr @x, align 4, !dbg !53, !tbaa !27
141+
%add10 = add nsw i32 %8, %call9, !dbg !53
142+
store volatile i32 %add10, ptr @x, align 4, !dbg !53, !tbaa !27
143+
%inc = add nsw i32 %i.0, 1, !dbg !54
144+
br label %for.cond, !dbg !56, !llvm.loop !57
145+
146+
cleanup: ; preds = %if.then5, %for.cond.cleanup
147+
br label %for.end
148+
149+
for.end: ; preds = %cleanup, %if.then
150+
ret i32 0, !dbg !61
151+
}
152+
153+
; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: readwrite)
154+
declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #3
155+
156+
; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: readwrite)
157+
declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #3
158+
159+
attributes #0 = { noinline nounwind uwtable "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cmov,+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "use-sample-profile" }
160+
attributes #1 = { alwaysinline nounwind uwtable "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cmov,+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "use-sample-profile" }
161+
attributes #2 = { nounwind uwtable "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cmov,+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "use-sample-profile" }
162+
attributes #3 = { mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) }
163+
164+
!llvm.dbg.cu = !{!0}
165+
!llvm.module.flags = !{!2, !3, !4, !5, !6, !7}
166+
!llvm.ident = !{!8}
167+
168+
!0 = distinct !DICompileUnit(language: DW_LANG_C11, file: !1, producer: "clang version 19.0.0git", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly, splitDebugInlining: false, debugInfoForProfiling: true, nameTableKind: None)
169+
!1 = !DIFile(filename: "test.c", directory: "path")
170+
!2 = !{i32 7, !"Dwarf Version", i32 5}
171+
!3 = !{i32 2, !"Debug Info Version", i32 3}
172+
!4 = !{i32 1, !"wchar_size", i32 4}
173+
!5 = !{i32 8, !"PIC Level", i32 2}
174+
!6 = !{i32 7, !"PIE Level", i32 2}
175+
!7 = !{i32 7, !"uwtable", i32 2}
176+
!8 = !{!"clang version 19.0.0git"}
177+
!9 = distinct !DISubprogram(name: "bar", scope: !10, file: !10, line: 2, type: !11, scopeLine: 2, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
178+
!10 = !DIFile(filename: "test.c", directory: "path")
179+
!11 = !DISubroutineType(types: !12)
180+
!12 = !{}
181+
!13 = !DILocation(line: 3, column: 3, scope: !9)
182+
!14 = distinct !DISubprogram(name: "foo", scope: !10, file: !10, line: 6, type: !11, scopeLine: 6, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
183+
!15 = !DILocation(line: 7, column: 9, scope: !14)
184+
!16 = !DILocation(line: 7, column: 7, scope: !14)
185+
!17 = !DILocation(line: 7, column: 23, scope: !18)
186+
!18 = !DILexicalBlockFile(scope: !14, file: !10, discriminator: 2)
187+
!19 = !DILocation(line: 7, column: 15, scope: !18)
188+
!20 = !DILocation(line: 8, column: 21, scope: !14)
189+
!21 = !DILocation(line: 8, column: 15, scope: !14)
190+
!22 = !DILocation(line: 8, column: 8, scope: !14)
191+
!23 = !DILocation(line: 0, scope: !14)
192+
!24 = !DILocation(line: 9, column: 1, scope: !14)
193+
!25 = distinct !DISubprogram(name: "main", scope: !10, file: !10, line: 11, type: !11, scopeLine: 11, flags: DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
194+
!26 = !DILocation(line: 12, column: 7, scope: !25)
195+
!27 = !{!28, !28, i64 0}
196+
!28 = !{!"int", !29, i64 0}
197+
!29 = !{!"omnipotent char", !30, i64 0}
198+
!30 = !{!"Simple C/C++ TBAA"}
199+
!31 = !DILocation(line: 12, column: 9, scope: !25)
200+
!32 = !DILocation(line: 13, column: 5, scope: !25)
201+
!33 = !DILocation(line: 14, column: 8, scope: !25)
202+
!34 = !DILocation(line: 14, scope: !25)
203+
!35 = !DILocation(line: 14, column: 21, scope: !36)
204+
!36 = !DILexicalBlockFile(scope: !25, file: !10, discriminator: 2)
205+
!37 = !DILocation(line: 14, column: 3, scope: !36)
206+
!38 = !DILocation(line: 14, column: 3, scope: !39)
207+
!39 = !DILexicalBlockFile(scope: !25, file: !10, discriminator: 4)
208+
!40 = !DILocation(line: 15, column: 18, scope: !25)
209+
!41 = !DILocation(line: 15, column: 11, scope: !25)
210+
!42 = !DILocation(line: 15, column: 8, scope: !25)
211+
!43 = !DILocation(line: 16, column: 15, scope: !25)
212+
!44 = !DILocation(line: 16, column: 11, scope: !25)
213+
!45 = !DILocation(line: 16, column: 8, scope: !25)
214+
!46 = !DILocation(line: 17, column: 10, scope: !25)
215+
!47 = !DILocation(line: 18, column: 8, scope: !25)
216+
!48 = !DILocation(line: 19, column: 18, scope: !25)
217+
!49 = !DILocation(line: 19, column: 11, scope: !25)
218+
!50 = !DILocation(line: 19, column: 8, scope: !25)
219+
!51 = !DILocation(line: 20, column: 15, scope: !25)
220+
!52 = !DILocation(line: 20, column: 11, scope: !25)
221+
!53 = !DILocation(line: 20, column: 8, scope: !25)
222+
!54 = !DILocation(line: 14, column: 37, scope: !55)
223+
!55 = !DILexicalBlockFile(scope: !25, file: !10, discriminator: 6)
224+
!56 = !DILocation(line: 14, column: 3, scope: !55)
225+
!57 = distinct !{!57, !58, !59, !60}
226+
!58 = !DILocation(line: 14, column: 3, scope: !25)
227+
!59 = !DILocation(line: 21, column: 3, scope: !25)
228+
!60 = !{!"llvm.loop.mustprogress"}
229+
!61 = !DILocation(line: 22, column: 1, scope: !25)

0 commit comments

Comments
 (0)