Commit c537239

runtime/pprof: add debug=3 goroutine profile with goid and labels
This adds a new goroutine profile debug mode, debug=3. This mode emits
output in the same binary proto format as debug=0, with the only
difference being that it does not aggregate matching stack/label
combinations into a single count, and instead emits one sample per
goroutine with additional synthesized labels that communicate some of
the details of each goroutine, specifically:

- go::goroutine: the goroutine's ID
- go::goroutine_created_by: ID of the goroutine's creator (if any)
- go::goroutine_state: current state of the goroutine (e.g. runnable)
- go::goroutine_wait_minutes: approximate minutes the goroutine has spent waiting (if applicable)

Previously the debug=2 mode was the only way to get this kind of
per-goroutine information, which is sometimes vital to understanding
the state of a process. However, debug=2 has two major drawbacks:
1) its collection incurs a lengthy and disruptive stop-the-world pause,
and 2) it does not include user-set labels alongside the per-goroutine
details in the same profile.

This new debug=3 mode uses the same concurrent collection mechanism
used to produce debug=0 and debug=1 profiles, meaning it has the same
minimal stop-the-world penalty. At the same time, it includes the
per-goroutine details like status and wait time that make debug=2 so
useful, providing a "best of both worlds" option.

A new mode is introduced, rather than changing the implementation of
the debug=2 format in place, because it is not clear that debug=2 can
use a concurrent collection mechanism while maintaining the
correctness of its existing output, which includes argument values in
its printed stacks.

The difference in STW latency observed by running goroutines during
profile collection is demonstrated by an included benchmark, which
spawns a number of goroutines to be profiled and then measures the
latency of a short timer while collecting goroutine profiles.

BenchmarkGoroutineProfileLatencyImpact
                         │    debug=2     │                debug=3                │
                         │ max_latency_ns │ max_latency_ns   vs base              │
goroutines=100x3-14           422.2k ± 13%     190.3k ±   38%  -54.93% (p=0.002 n=6)
goroutines=100x10-14          619.7k ± 10%     171.1k ±   43%  -72.38% (p=0.002 n=6)
goroutines=100x50-14         1423.6k ±  7%     174.3k ±   44%  -87.76% (p=0.002 n=6)
goroutines=1000x3-14         2424.8k ±  8%     298.6k ±  106%  -87.68% (p=0.002 n=6)
goroutines=1000x10-14        7378.4k ±  2%     268.2k ±  146%  -96.36% (p=0.002 n=6)
goroutines=1000x50-14       23372.5k ± 10%     330.1k ±  173%  -98.59% (p=0.002 n=6)
goroutines=10000x3-14        42.802M ± 47%     1.991M ±  105%  -95.35% (p=0.002 n=6)
goroutines=10000x10-14      36668.2k ± 95%     743.1k ±   72%  -97.97% (p=0.002 n=6)
goroutines=10000x50-14     120639.1k ±  2%     188.2k ± 2582%  -99.84% (p=0.002 n=6)
geomean                       6.760M           326.2k          -95.18%

The per-goroutine details are included in the profile as labels,
alongside any user-set labels. While the pprof format allows
multi-valued labels, so a collision with a user-set label would
preserve both values, it also discourages them; thus the 'go::'
namespace prefix is used to minimize collisions with user-set labels.
The form 'go::' follows the convention established in the pprof
format, which reserves 'pprof::'.

Fixes golang#74954.

Change-Id: If90eb01887ae3f35be8acc3d239b88dc29d338a8
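As a usage sketch (not part of this commit): a client can fetch the new profile from a process serving net/http/pprof and read the synthesized labels with github.com/google/pprof/profile. The address is illustrative, and the sketch assumes the details arrive as string labels; numeric values could instead appear under Sample.NumLabel.

package main

import (
	"fmt"
	"log"
	"net/http"

	"github.com/google/pprof/profile"
)

func main() {
	// Assumes a process serving net/http/pprof on localhost:6060 (illustrative).
	resp, err := http.Get("http://localhost:6060/debug/pprof/goroutine?debug=3")
	if err != nil {
		log.Fatal(err)
	}
	defer resp.Body.Close()

	// debug=3 uses the same binary proto format as debug=0, so it can be
	// parsed by the standard pprof profile parser.
	p, err := profile.Parse(resp.Body)
	if err != nil {
		log.Fatal(err)
	}
	for _, s := range p.Sample {
		// One sample per goroutine; the synthesized details ride along as
		// labels in the go:: namespace, next to any user-set labels.
		fmt.Println(s.Label["go::goroutine"], s.Label["go::goroutine_state"],
			s.Label["go::goroutine_created_by"], s.Label["go::goroutine_wait_minutes"])
	}
}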

File tree

8 files changed: +375 −97 lines changed

src/internal/profilerecord/profilerecord.go

Lines changed: 21 additions & 0 deletions
@@ -8,10 +8,16 @@
 // TODO: Consider moving this to internal/runtime, see golang.org/issue/65355.
 package profilerecord

+import "unsafe"
+
 type StackRecord struct {
 	Stack []uintptr
 }

+func (r StackRecord) GetStack() []uintptr           { return r.Stack }
+func (r StackRecord) GetLabels() unsafe.Pointer     { return nil }
+func (r StackRecord) GetGoroutine() GoroutineRecord { return GoroutineRecord{} }
+
 type MemProfileRecord struct {
 	AllocBytes, FreeBytes     int64
 	AllocObjects, FreeObjects int64
@@ -26,3 +32,18 @@ type BlockProfileRecord struct {
 	Cycles int64
 	Stack  []uintptr
 }
+
+type GoroutineRecord struct {
+	ID         uint64
+	State      uint32
+	WaitReason uint8
+	CreatorID  uint64
+	CreationPC uintptr
+	WaitSince  int64 // approx time when the g became blocked, in nanoseconds
+	Labels     unsafe.Pointer
+	Stack      []uintptr
+}
+
+func (r GoroutineRecord) GetStack() []uintptr           { return r.Stack }
+func (r GoroutineRecord) GetLabels() unsafe.Pointer     { return r.Labels }
+func (r GoroutineRecord) GetGoroutine() GoroutineRecord { return r }
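The matching method sets added to StackRecord and GoroutineRecord suggest a generic consumer in runtime/pprof. Below is a minimal sketch of what such a constraint could look like, assuming the types above; the interface and helper names are hypothetical and not part of this change.

package profilerecord

import "unsafe"

// record is a hypothetical constraint letting profile-writing code
// handle both record types through one generic path.
type record interface {
	GetStack() []uintptr
	GetLabels() unsafe.Pointer
	GetGoroutine() GoroutineRecord
}

// stacks collects the stack of every record, regardless of concrete type.
func stacks[R record](recs []R) [][]uintptr {
	out := make([][]uintptr, 0, len(recs))
	for _, r := range recs {
		out = append(out, r.GetStack())
	}
	return out
}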

src/net/http/pprof/pprof.go

Lines changed: 1 addition & 1 deletion
@@ -264,7 +264,7 @@ func (name handler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
 		runtime.GC()
 	}
 	debug, _ := strconv.Atoi(r.FormValue("debug"))
-	if debug != 0 {
+	if debug != 0 && !(name == "goroutine" && debug == 3) {
 		w.Header().Set("Content-Type", "text/plain; charset=utf-8")
 	} else {
 		w.Header().Set("Content-Type", "application/octet-stream")

src/runtime/mprof.go

Lines changed: 32 additions & 47 deletions
@@ -1245,40 +1245,21 @@ func pprof_threadCreateInternal(p []profilerecord.StackRecord) (n int, ok bool)
 	})
 }

-//go:linkname pprof_goroutineProfileWithLabels
-func pprof_goroutineProfileWithLabels(p []profilerecord.StackRecord, labels []unsafe.Pointer) (n int, ok bool) {
-	return goroutineProfileWithLabels(p, labels)
+//go:linkname pprof_goroutineProfile
+func pprof_goroutineProfile(p []profilerecord.GoroutineRecord) (n int, ok bool) {
+	return goroutineProfileInternal(p)
 }

-// labels may be nil. If labels is non-nil, it must have the same length as p.
-func goroutineProfileWithLabels(p []profilerecord.StackRecord, labels []unsafe.Pointer) (n int, ok bool) {
-	if labels != nil && len(labels) != len(p) {
-		labels = nil
-	}
-
-	return goroutineProfileWithLabelsConcurrent(p, labels)
-}
-
-//go:linkname pprof_goroutineLeakProfileWithLabels
-func pprof_goroutineLeakProfileWithLabels(p []profilerecord.StackRecord, labels []unsafe.Pointer) (n int, ok bool) {
-	return goroutineLeakProfileWithLabelsConcurrent(p, labels)
-}
-
-// labels may be nil. If labels is non-nil, it must have the same length as p.
-func goroutineLeakProfileWithLabels(p []profilerecord.StackRecord, labels []unsafe.Pointer) (n int, ok bool) {
-	if labels != nil && len(labels) != len(p) {
-		labels = nil
-	}
-
-	return goroutineLeakProfileWithLabelsConcurrent(p, labels)
+//go:linkname pprof_goroutineLeakProfile
+func pprof_goroutineLeakProfile(p []profilerecord.GoroutineRecord) (n int, ok bool) {
+	return goroutineLeakProfileInternal(p)
 }

 var goroutineProfile = struct {
 	sema    uint32
 	active  bool
 	offset  atomic.Int64
-	records []profilerecord.StackRecord
-	labels  []unsafe.Pointer
+	records []profilerecord.GoroutineRecord
 }{
 	sema: 1,
 }
@@ -1316,18 +1297,18 @@ func (p *goroutineProfileStateHolder) CompareAndSwap(old, new goroutineProfileSt
 	return (*atomic.Uint32)(p).CompareAndSwap(uint32(old), uint32(new))
 }

-func goroutineLeakProfileWithLabelsConcurrent(p []profilerecord.StackRecord, labels []unsafe.Pointer) (n int, ok bool) {
+func goroutineLeakProfileInternal(p []profilerecord.GoroutineRecord) (n int, ok bool) {
 	if len(p) == 0 {
 		// An empty slice is obviously too small. Return a rough
 		// allocation estimate.
 		return work.goroutineLeak.count, false
 	}

-	// Use the same semaphore as goroutineProfileWithLabelsConcurrent,
+	// Use the same semaphore as goroutineProfileInternal,
 	// because ultimately we still use goroutine profiles.
 	semacquire(&goroutineProfile.sema)

-	// Unlike in goroutineProfileWithLabelsConcurrent, we don't need to
+	// Unlike in goroutineProfileInternal, we don't need to
 	// save the current goroutine stack, because it is obviously not leaked.

 	pcbuf := makeProfStack() // see saveg() for explanation
@@ -1358,7 +1339,7 @@
 	return n, true
 }

-func goroutineProfileWithLabelsConcurrent(p []profilerecord.StackRecord, labels []unsafe.Pointer) (n int, ok bool) {
+func goroutineProfileInternal(p []profilerecord.GoroutineRecord) (n int, ok bool) {
 	if len(p) == 0 {
 		// An empty slice is obviously too small. Return a rough
 		// allocation estimate without bothering to STW. As long as
@@ -1401,9 +1382,15 @@
 	systemstack(func() {
 		saveg(pc, sp, ourg, &p[0], pcbuf)
 	})
-	if labels != nil {
-		labels[0] = ourg.labels
-	}
+
+	p[0].ID = ourg.goid
+	p[0].CreatorID = ourg.parentGoid
+	p[0].CreationPC = ourg.gopc
+	p[0].Labels = ourg.labels
+	p[0].State = readgstatus(ourg) &^ _Gscan
+	p[0].WaitReason = uint8(ourg.waitreason)
+	p[0].WaitSince = ourg.waitsince
+
 	ourg.goroutineProfiled.Store(goroutineProfileSatisfied)
 	goroutineProfile.offset.Store(1)

@@ -1414,7 +1401,6 @@
 	// field set to goroutineProfileSatisfied.
 	goroutineProfile.active = true
 	goroutineProfile.records = p
-	goroutineProfile.labels = labels
 	startTheWorld(stw)

 	// Visit each goroutine that existed as of the startTheWorld call above.
@@ -1436,7 +1422,6 @@
 	endOffset := goroutineProfile.offset.Swap(0)
 	goroutineProfile.active = false
 	goroutineProfile.records = nil
-	goroutineProfile.labels = nil
 	startTheWorld(stw)

 	// Restore the invariant that every goroutine struct in allgs has its
@@ -1528,7 +1513,7 @@ func doRecordGoroutineProfile(gp1 *g, pcbuf []uintptr) {
 	// System goroutines should not appear in the profile.
 	// Check this here and not in tryRecordGoroutineProfile because isSystemGoroutine
 	// may change on a goroutine while it is executing, so while the scheduler might
-	// see a system goroutine, goroutineProfileWithLabelsConcurrent might not, and
+	// see a system goroutine, goroutineProfileInternal might not, and
 	// this inconsistency could cause invariants to be violated, such as trying to
 	// record the stack of a running goroutine below. In short, we still want system
 	// goroutines to participate in the same state machine on gp1.goroutineProfiled as
@@ -1545,7 +1530,7 @@
 	if offset >= len(goroutineProfile.records) {
 		// Should be impossible, but better to return a truncated profile than
 		// to crash the entire process at this point. Instead, deal with it in
-		// goroutineProfileWithLabelsConcurrent where we have more context.
+		// goroutineProfileInternal where we have more context.
 		return
 	}

@@ -1559,12 +1544,16 @@
 	// to avoid schedule delays.
 	systemstack(func() { saveg(^uintptr(0), ^uintptr(0), gp1, &goroutineProfile.records[offset], pcbuf) })

-	if goroutineProfile.labels != nil {
-		goroutineProfile.labels[offset] = gp1.labels
-	}
+	goroutineProfile.records[offset].Labels = gp1.labels
+	goroutineProfile.records[offset].ID = gp1.goid
+	goroutineProfile.records[offset].CreatorID = gp1.parentGoid
+	goroutineProfile.records[offset].CreationPC = gp1.gopc
+	goroutineProfile.records[offset].State = readgstatus(gp1) &^ _Gscan
+	goroutineProfile.records[offset].WaitReason = uint8(gp1.waitreason)
+	goroutineProfile.records[offset].WaitSince = gp1.waitsince
 }

-func goroutineProfileWithLabelsSync(p []profilerecord.StackRecord, labels []unsafe.Pointer) (n int, ok bool) {
+func goroutineProfileWithLabelsSync(p []profilerecord.GoroutineRecord, labels []unsafe.Pointer) (n int, ok bool) {
 	gp := getg()

 	isOK := func(gp1 *g) bool {
@@ -1650,7 +1639,7 @@
 // Most clients should use the [runtime/pprof] package instead
 // of calling GoroutineProfile directly.
 func GoroutineProfile(p []StackRecord) (n int, ok bool) {
-	records := make([]profilerecord.StackRecord, len(p))
+	records := make([]profilerecord.GoroutineRecord, len(p))
 	n, ok = goroutineProfileInternal(records)
 	if !ok {
 		return
@@ -1662,11 +1651,7 @@
 	return
 }

-func goroutineProfileInternal(p []profilerecord.StackRecord) (n int, ok bool) {
-	return goroutineProfileWithLabels(p, nil)
-}
-
-func saveg(pc, sp uintptr, gp *g, r *profilerecord.StackRecord, pcbuf []uintptr) {
+func saveg(pc, sp uintptr, gp *g, r *profilerecord.GoroutineRecord, pcbuf []uintptr) {
 	// To reduce memory usage, we want to allocate a r.Stack that is just big
 	// enough to hold gp's stack trace. Naively we might achieve this by
 	// recording our stack trace into mp.profStack, and then allocating a
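The runtime records raw fields here (ID, State, WaitSince, and so on); the go::goroutine_* labels described in the commit message are synthesized at the runtime/pprof layer, which is not shown in this excerpt. Below is a hypothetical sketch of that translation, with a local stand-in record type; the state-name mapping is omitted, and wait minutes are derived from WaitSince (nanoseconds since the g blocked).

package main

import (
	"fmt"
	"time"
)

// goroutineRecord is a stand-in for profilerecord.GoroutineRecord,
// reduced to the fields this sketch uses.
type goroutineRecord struct {
	ID        uint64
	CreatorID uint64
	WaitSince int64 // approx time when the g became blocked, in nanoseconds
}

// synthLabels sketches how per-goroutine labels might be built; the
// actual formatting in runtime/pprof may differ.
func synthLabels(r goroutineRecord, nowNanos int64) map[string]string {
	labels := map[string]string{
		"go::goroutine": fmt.Sprint(r.ID),
	}
	if r.CreatorID != 0 {
		labels["go::goroutine_created_by"] = fmt.Sprint(r.CreatorID)
	}
	// go::goroutine_state would come from mapping the State field to a
	// name like "runnable"; that mapping is omitted here.
	if r.WaitSince > 0 {
		mins := time.Duration(nowNanos - r.WaitSince).Minutes()
		labels["go::goroutine_wait_minutes"] = fmt.Sprintf("%.0f", mins)
	}
	return labels
}

func main() {
	now := time.Now().UnixNano()
	r := goroutineRecord{ID: 42, CreatorID: 1, WaitSince: now - int64(3*time.Minute)}
	fmt.Println(synthLabels(r, now))
	// map[go::goroutine:42 go::goroutine_created_by:1 go::goroutine_wait_minutes:3]
}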
