From c53723972d83930241475c0eb0231751158872fc Mon Sep 17 00:00:00 2001
From: David Taylor
Date: Tue, 7 Oct 2025 15:07:01 +0000
Subject: [PATCH] runtime/pprof: add debug=3 goroutine profile with goid and labels
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This adds a new goroutine profile debug mode, debug=3. This mode emits
in the same binary proto output format as debug=0, with the only
difference being that it does not aggregate matching stack/label
combinations into a single count, and instead emits a sample per
goroutine with additional synthesized labels to communicate some of the
details of each goroutine, specifically:

- go::goroutine: the goroutine's ID
- go::goroutine_created_by: ID of the goroutine's creator (if any)
- go::goroutine_state: current state of the goroutine (e.g. runnable)
- go::goroutine_wait_minutes: approximate minutes the goroutine has
  spent waiting (if applicable)

Previously the debug=2 mode was the only way to get this kind of
per-goroutine information, which is sometimes vital to understanding
the state of a process. However, debug=2 has two major drawbacks:
1) its collection incurs a lengthy and disruptive stop-the-world pause,
and 2) it does not include user-set labels alongside per-goroutine
details in the same profile.

This new debug=3 mode uses the same concurrent collection mechanism
used to produce debug=0 and debug=1 profiles, meaning it has the same
minimal stop-the-world penalty. At the same time, it includes the
per-goroutine details like status and wait time that make debug=2 so
useful, providing a "best-of-both-worlds" option.

A new mode is introduced, rather than changing the implementation of
the debug=2 format in-place, as it is not clear that debug=2 can
utilize a concurrent collection mechanism while maintaining the
correctness of its existing output, which includes argument values in
its printed stacks.

The difference in STW latency observed by running goroutines during
profile collection is demonstrated by an included benchmark which
spawns a number of goroutines to be profiled and then measures the
latency of a short timer while collecting goroutine profiles.

BenchmarkGoroutineProfileLatencyImpact
                         │    debug=2      │              debug=3                 │
                         │ max_latency_ns  │ max_latency_ns        vs base        │
goroutines=100x3-14         422.2k ± 13%      190.3k ±   38%  -54.93% (p=0.002 n=6)
goroutines=100x10-14        619.7k ± 10%      171.1k ±   43%  -72.38% (p=0.002 n=6)
goroutines=100x50-14       1423.6k ±  7%      174.3k ±   44%  -87.76% (p=0.002 n=6)
goroutines=1000x3-14       2424.8k ±  8%      298.6k ±  106%  -87.68% (p=0.002 n=6)
goroutines=1000x10-14      7378.4k ±  2%      268.2k ±  146%  -96.36% (p=0.002 n=6)
goroutines=1000x50-14     23372.5k ± 10%      330.1k ±  173%  -98.59% (p=0.002 n=6)
goroutines=10000x3-14       42.802M ± 47%      1.991M ±  105%  -95.35% (p=0.002 n=6)
goroutines=10000x10-14    36668.2k ± 95%      743.1k ±   72%  -97.97% (p=0.002 n=6)
goroutines=10000x50-14   120639.1k ±  2%      188.2k ± 2582%  -99.84% (p=0.002 n=6)
geomean                      6.760M            326.2k          -95.18%

The per-goroutine details are included in the profile as labels,
alongside any user-set labels. While the pprof format allows
multi-valued labels, so a collision with a user-set label would
preserve both values, it also discourages them; thus the 'go::'
namespace prefix is used to minimize collisions with user-set labels.
The form 'go::' follows the convention established in the pprof format,
which reserves 'pprof::'.

Fixes #74954.
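To illustrate consuming the new mode, a minimal sketch; the use of the
github.com/google/pprof/profile package as the parser and the main
scaffolding are illustrative assumptions, not part of this change:

    package main

    import (
        "bytes"
        "fmt"
        "log"
        "runtime/pprof"

        "github.com/google/pprof/profile"
    )

    func main() {
        var buf bytes.Buffer
        // debug=3 emits binary proto with one sample per goroutine.
        if err := pprof.Lookup("goroutine").WriteTo(&buf, 3); err != nil {
            log.Fatal(err)
        }
        p, err := profile.Parse(&buf)
        if err != nil {
            log.Fatal(err)
        }
        for _, s := range p.Sample {
            // The synthesized labels sit alongside any user-set labels.
            fmt.Println(
                s.NumLabel["go::goroutine"],            // goroutine ID
                s.Label["go::goroutine_state"],         // e.g. runnable, or the wait reason
                s.NumLabel["go::goroutine_wait_minutes"])
        }
    }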
Change-Id: If90eb01887ae3f35be8acc3d239b88dc29d338a8 --- src/internal/profilerecord/profilerecord.go | 21 +++ src/net/http/pprof/pprof.go | 2 +- src/runtime/mprof.go | 79 ++++---- src/runtime/pprof/pprof.go | 112 +++++++++--- src/runtime/pprof/pprof_test.go | 191 +++++++++++++++++++- src/runtime/pprof/proto.go | 8 + src/runtime/proc.go | 6 +- src/runtime/traceback.go | 53 ++++-- 8 files changed, 375 insertions(+), 97 deletions(-) diff --git a/src/internal/profilerecord/profilerecord.go b/src/internal/profilerecord/profilerecord.go index a5efdced8f77c9..f6f7e21c3d8455 100644 --- a/src/internal/profilerecord/profilerecord.go +++ b/src/internal/profilerecord/profilerecord.go @@ -8,10 +8,16 @@ // TODO: Consider moving this to internal/runtime, see golang.org/issue/65355. package profilerecord +import "unsafe" + type StackRecord struct { Stack []uintptr } +func (r StackRecord) GetStack() []uintptr { return r.Stack } +func (r StackRecord) GetLabels() unsafe.Pointer { return nil } +func (r StackRecord) GetGoroutine() GoroutineRecord { return GoroutineRecord{} } + type MemProfileRecord struct { AllocBytes, FreeBytes int64 AllocObjects, FreeObjects int64 @@ -26,3 +32,18 @@ type BlockProfileRecord struct { Cycles int64 Stack []uintptr } + +type GoroutineRecord struct { + ID uint64 + State uint32 + WaitReason uint8 + CreatorID uint64 + CreationPC uintptr + WaitSince int64 // approx time when the g became blocked, in nanoseconds + Labels unsafe.Pointer + Stack []uintptr +} + +func (r GoroutineRecord) GetStack() []uintptr { return r.Stack } +func (r GoroutineRecord) GetLabels() unsafe.Pointer { return r.Labels } +func (r GoroutineRecord) GetGoroutine() GoroutineRecord { return r } diff --git a/src/net/http/pprof/pprof.go b/src/net/http/pprof/pprof.go index 71aade67d32046..91a59a65c7352e 100644 --- a/src/net/http/pprof/pprof.go +++ b/src/net/http/pprof/pprof.go @@ -264,7 +264,7 @@ func (name handler) ServeHTTP(w http.ResponseWriter, r *http.Request) { runtime.GC() } debug, _ := strconv.Atoi(r.FormValue("debug")) - if debug != 0 { + if debug != 0 && !(name == "goroutine" && debug == 3) { w.Header().Set("Content-Type", "text/plain; charset=utf-8") } else { w.Header().Set("Content-Type", "application/octet-stream") diff --git a/src/runtime/mprof.go b/src/runtime/mprof.go index 19e90fbb337e55..f001c3e916e637 100644 --- a/src/runtime/mprof.go +++ b/src/runtime/mprof.go @@ -1245,40 +1245,21 @@ func pprof_threadCreateInternal(p []profilerecord.StackRecord) (n int, ok bool) }) } -//go:linkname pprof_goroutineProfileWithLabels -func pprof_goroutineProfileWithLabels(p []profilerecord.StackRecord, labels []unsafe.Pointer) (n int, ok bool) { - return goroutineProfileWithLabels(p, labels) +//go:linkname pprof_goroutineProfile +func pprof_goroutineProfile(p []profilerecord.GoroutineRecord) (n int, ok bool) { + return goroutineProfileInternal(p) } -// labels may be nil. If labels is non-nil, it must have the same length as p. -func goroutineProfileWithLabels(p []profilerecord.StackRecord, labels []unsafe.Pointer) (n int, ok bool) { - if labels != nil && len(labels) != len(p) { - labels = nil - } - - return goroutineProfileWithLabelsConcurrent(p, labels) -} - -//go:linkname pprof_goroutineLeakProfileWithLabels -func pprof_goroutineLeakProfileWithLabels(p []profilerecord.StackRecord, labels []unsafe.Pointer) (n int, ok bool) { - return goroutineLeakProfileWithLabelsConcurrent(p, labels) -} - -// labels may be nil. If labels is non-nil, it must have the same length as p. 
-func goroutineLeakProfileWithLabels(p []profilerecord.StackRecord, labels []unsafe.Pointer) (n int, ok bool) { - if labels != nil && len(labels) != len(p) { - labels = nil - } - - return goroutineLeakProfileWithLabelsConcurrent(p, labels) +//go:linkname pprof_goroutineLeakProfile +func pprof_goroutineLeakProfile(p []profilerecord.GoroutineRecord) (n int, ok bool) { + return goroutineLeakProfileInternal(p) } var goroutineProfile = struct { sema uint32 active bool offset atomic.Int64 - records []profilerecord.StackRecord - labels []unsafe.Pointer + records []profilerecord.GoroutineRecord }{ sema: 1, } @@ -1316,18 +1297,18 @@ func (p *goroutineProfileStateHolder) CompareAndSwap(old, new goroutineProfileSt return (*atomic.Uint32)(p).CompareAndSwap(uint32(old), uint32(new)) } -func goroutineLeakProfileWithLabelsConcurrent(p []profilerecord.StackRecord, labels []unsafe.Pointer) (n int, ok bool) { +func goroutineLeakProfileInternal(p []profilerecord.GoroutineRecord) (n int, ok bool) { if len(p) == 0 { // An empty slice is obviously too small. Return a rough // allocation estimate. return work.goroutineLeak.count, false } - // Use the same semaphore as goroutineProfileWithLabelsConcurrent, + // Use the same semaphore as goroutineProfileInternal, // because ultimately we still use goroutine profiles. semacquire(&goroutineProfile.sema) - // Unlike in goroutineProfileWithLabelsConcurrent, we don't need to + // Unlike in goroutineProfileInternal, we don't need to // save the current goroutine stack, because it is obviously not leaked. pcbuf := makeProfStack() // see saveg() for explanation @@ -1358,7 +1339,7 @@ func goroutineLeakProfileWithLabelsConcurrent(p []profilerecord.StackRecord, lab return n, true } -func goroutineProfileWithLabelsConcurrent(p []profilerecord.StackRecord, labels []unsafe.Pointer) (n int, ok bool) { +func goroutineProfileInternal(p []profilerecord.GoroutineRecord) (n int, ok bool) { if len(p) == 0 { // An empty slice is obviously too small. Return a rough // allocation estimate without bothering to STW. As long as @@ -1401,9 +1382,15 @@ func goroutineProfileWithLabelsConcurrent(p []profilerecord.StackRecord, labels systemstack(func() { saveg(pc, sp, ourg, &p[0], pcbuf) }) - if labels != nil { - labels[0] = ourg.labels - } + + p[0].ID = ourg.goid + p[0].CreatorID = ourg.parentGoid + p[0].CreationPC = ourg.gopc + p[0].Labels = ourg.labels + p[0].State = readgstatus(ourg) &^ _Gscan + p[0].WaitReason = uint8(ourg.waitreason) + p[0].WaitSince = ourg.waitsince + ourg.goroutineProfiled.Store(goroutineProfileSatisfied) goroutineProfile.offset.Store(1) @@ -1414,7 +1401,6 @@ func goroutineProfileWithLabelsConcurrent(p []profilerecord.StackRecord, labels // field set to goroutineProfileSatisfied. goroutineProfile.active = true goroutineProfile.records = p - goroutineProfile.labels = labels startTheWorld(stw) // Visit each goroutine that existed as of the startTheWorld call above. @@ -1436,7 +1422,6 @@ func goroutineProfileWithLabelsConcurrent(p []profilerecord.StackRecord, labels endOffset := goroutineProfile.offset.Swap(0) goroutineProfile.active = false goroutineProfile.records = nil - goroutineProfile.labels = nil startTheWorld(stw) // Restore the invariant that every goroutine struct in allgs has its @@ -1528,7 +1513,7 @@ func doRecordGoroutineProfile(gp1 *g, pcbuf []uintptr) { // System goroutines should not appear in the profile. 
// Check this here and not in tryRecordGoroutineProfile because isSystemGoroutine // may change on a goroutine while it is executing, so while the scheduler might - // see a system goroutine, goroutineProfileWithLabelsConcurrent might not, and + // see a system goroutine, goroutineProfileInternal might not, and // this inconsistency could cause invariants to be violated, such as trying to // record the stack of a running goroutine below. In short, we still want system // goroutines to participate in the same state machine on gp1.goroutineProfiled as @@ -1545,7 +1530,7 @@ func doRecordGoroutineProfile(gp1 *g, pcbuf []uintptr) { if offset >= len(goroutineProfile.records) { // Should be impossible, but better to return a truncated profile than // to crash the entire process at this point. Instead, deal with it in - // goroutineProfileWithLabelsConcurrent where we have more context. + // goroutineProfileInternal where we have more context. return } @@ -1559,12 +1544,16 @@ func doRecordGoroutineProfile(gp1 *g, pcbuf []uintptr) { // to avoid schedule delays. systemstack(func() { saveg(^uintptr(0), ^uintptr(0), gp1, &goroutineProfile.records[offset], pcbuf) }) - if goroutineProfile.labels != nil { - goroutineProfile.labels[offset] = gp1.labels - } + goroutineProfile.records[offset].Labels = gp1.labels + goroutineProfile.records[offset].ID = gp1.goid + goroutineProfile.records[offset].CreatorID = gp1.parentGoid + goroutineProfile.records[offset].CreationPC = gp1.gopc + goroutineProfile.records[offset].State = readgstatus(gp1) &^ _Gscan + goroutineProfile.records[offset].WaitReason = uint8(gp1.waitreason) + goroutineProfile.records[offset].WaitSince = gp1.waitsince } -func goroutineProfileWithLabelsSync(p []profilerecord.StackRecord, labels []unsafe.Pointer) (n int, ok bool) { +func goroutineProfileWithLabelsSync(p []profilerecord.GoroutineRecord, labels []unsafe.Pointer) (n int, ok bool) { gp := getg() isOK := func(gp1 *g) bool { @@ -1650,7 +1639,7 @@ func goroutineProfileWithLabelsSync(p []profilerecord.StackRecord, labels []unsa // Most clients should use the [runtime/pprof] package instead // of calling GoroutineProfile directly. func GoroutineProfile(p []StackRecord) (n int, ok bool) { - records := make([]profilerecord.StackRecord, len(p)) + records := make([]profilerecord.GoroutineRecord, len(p)) n, ok = goroutineProfileInternal(records) if !ok { return @@ -1662,11 +1651,7 @@ func GoroutineProfile(p []StackRecord) (n int, ok bool) { return } -func goroutineProfileInternal(p []profilerecord.StackRecord) (n int, ok bool) { - return goroutineProfileWithLabels(p, nil) -} - -func saveg(pc, sp uintptr, gp *g, r *profilerecord.StackRecord, pcbuf []uintptr) { +func saveg(pc, sp uintptr, gp *g, r *profilerecord.GoroutineRecord, pcbuf []uintptr) { // To reduce memory usage, we want to allocate a r.Stack that is just big // enough to hold gp's stack trace. Naively we might achieve this by // recording our stack trace into mp.profStack, and then allocating a diff --git a/src/runtime/pprof/pprof.go b/src/runtime/pprof/pprof.go index b524e992b8b209..775620c96157f6 100644 --- a/src/runtime/pprof/pprof.go +++ b/src/runtime/pprof/pprof.go @@ -375,7 +375,9 @@ func (p *Profile) Remove(value any) { // The predefined profiles may assign meaning to other debug values; // for example, when printing the "goroutine" profile, debug=2 means to // print the goroutine stacks in the same form that a Go program uses -// when dying due to an unrecovered panic. 
+// when dying due to an unrecovered panic, and debug=3 means to output a +// binary protobuf profile identical to debug=0, but with per-goroutine +// details such as ID and status included as sample labels. func (p *Profile) WriteTo(w io.Writer, debug int) error { if p.name == "" { panic("pprof: use of zero Profile") @@ -402,7 +404,9 @@ type stackProfile [][]uintptr func (x stackProfile) Len() int { return len(x) } func (x stackProfile) Stack(i int) []uintptr { return x[i] } -func (x stackProfile) Label(i int) *labelMap { return nil } +func (x stackProfile) Goroutine(i int) profilerecord.GoroutineRecord { + return profilerecord.GoroutineRecord{} +} // A countProfile is a set of stack traces to be printed as counts // grouped by stack trace. There are multiple implementations: @@ -411,7 +415,7 @@ func (x stackProfile) Label(i int) *labelMap { return nil } type countProfile interface { Len() int Stack(i int) []uintptr - Label(i int) *labelMap + Goroutine(i int) profilerecord.GoroutineRecord } // expandInlinedFrames copies the call stack from pcs into dst, expanding any @@ -464,7 +468,50 @@ func printCountCycleProfile(w io.Writer, countName, cycleName string, records [] // printCountProfile prints a countProfile at the specified debug level. // The profile will be in compressed proto format unless debug is nonzero. +// Non-zero debug values have the following behaviors: +// - debug=1: text format, aggregated by unique stack, state and labels. +// - debug=2: handled elsewhere; the output of runtime.Stack(). +// - debug=3: binary proto format-identical to debug=0 with per-goroutine labels. func printCountProfile(w io.Writer, debug int, name string, p countProfile) error { + // Debug >1 is non-aggregated, so branch immediately to avoid computing the + // aggregated map if we do not need it. This does duplicate a little of the + // logic for actually constructing and outputting the proto profile but only + // the helper calls so it is minimal and avoids a more complex structuring of + // one output path that handles both the aggregated and non-aggregated cases. + if debug == 3 { + // Output profile in protobuf form. + b := newProfileBuilder(w) + b.pbValueType(tagProfile_PeriodType, name, "count") + b.pb.int64Opt(tagProfile_Period, 1) + b.pbValueType(tagProfile_SampleType, name, "count") + + values := []int64{1} + var locs []uint64 + for i := 0; i < p.Len(); i++ { + locs = b.appendLocsForStack(locs[:0], p.Stack(i)) + g := p.Goroutine(i) + extra := func() { + if g.ID != 0 { + b.pbLabelNum(tagSample_Label, "go::goroutine", int64(g.ID)) + if g.ID != 1 { + b.pbLabelNum(tagSample_Label, "go::goroutine_created_by", int64(g.CreatorID)) + } + b.pbLabel(tagSample_Label, "go::goroutine_state", pprof_gStatusString(g.State, g.WaitReason), 0) + if mins := pprof_gWaitFor(g.State, g.WaitSince); mins > 0 { + b.pbLabelNum(tagSample_Label, "go::goroutine_wait_minutes", int64(mins)) + } + } + if g.Labels != nil { + for _, lbl := range (*labelMap)(g.Labels).list { + b.pbLabel(tagSample_Label, lbl.key, lbl.value, 0) + } + } + } + b.pbSample(values, locs, extra) + } + return b.build() + } + // Build count of each stack. 
var buf strings.Builder key := func(stk []uintptr, lbls *labelMap) string { @@ -484,7 +531,7 @@ func printCountProfile(w io.Writer, debug int, name string, p countProfile) erro var keys []string n := p.Len() for i := 0; i < n; i++ { - k := key(p.Stack(i), p.Label(i)) + k := key(p.Stack(i), (*labelMap)(p.Goroutine(i).Labels)) if count[k] == 0 { index[k] = i keys = append(keys, k) @@ -520,9 +567,9 @@ func printCountProfile(w io.Writer, debug int, name string, p countProfile) erro locs = b.appendLocsForStack(locs[:0], p.Stack(index[k])) idx := index[k] var labels func() - if p.Label(idx) != nil { + if l := p.Goroutine(idx).Labels; l != nil { labels = func() { - for _, lbl := range p.Label(idx).list { + for _, lbl := range (*labelMap)(l).list { b.pbLabel(tagSample_Label, lbl.key, lbl.value, 0) } } @@ -742,7 +789,7 @@ func writeThreadCreate(w io.Writer, debug int) error { // Until https://golang.org/issues/6104 is addressed, wrap // ThreadCreateProfile because there's no point in tracking labels when we // don't get any stack-traces. - return writeRuntimeProfile(w, debug, "threadcreate", func(p []profilerecord.StackRecord, _ []unsafe.Pointer) (n int, ok bool) { + return writeRuntimeProfile(w, debug, "threadcreate", func(p []profilerecord.StackRecord) (n int, ok bool) { return pprof_threadCreateInternal(p) }) } @@ -754,10 +801,10 @@ func countGoroutine() int { // writeGoroutine writes the current runtime GoroutineProfile to w. func writeGoroutine(w io.Writer, debug int) error { - if debug >= 2 { + if debug == 2 { return writeGoroutineStacks(w) } - return writeRuntimeProfile(w, debug, "goroutine", pprof_goroutineProfileWithLabels) + return writeRuntimeProfile(w, debug, "goroutine", pprof_goroutineProfile) } // writeGoroutineLeak first invokes a GC cycle that performs goroutine leak detection. @@ -774,7 +821,7 @@ func writeGoroutineLeak(w io.Writer, debug int) error { } // Otherwise, write the goroutine leak profile. - return writeRuntimeProfile(w, debug, "goroutineleak", pprof_goroutineLeakProfileWithLabels) + return writeRuntimeProfile(w, debug, "goroutineleak", pprof_goroutineLeakProfile) } func writeGoroutineStacks(w io.Writer) error { @@ -798,24 +845,28 @@ func writeGoroutineStacks(w io.Writer) error { return err } -func writeRuntimeProfile(w io.Writer, debug int, name string, fetch func([]profilerecord.StackRecord, []unsafe.Pointer) (int, bool)) error { +type capturedStack interface { + GetStack() []uintptr + GetLabels() unsafe.Pointer + GetGoroutine() profilerecord.GoroutineRecord +} + +func writeRuntimeProfile[T capturedStack](w io.Writer, debug int, name string, fetch func([]T) (int, bool)) error { // Find out how many records there are (fetch(nil)), // allocate that many records, and get the data. // There's a race—more records might be added between // the two calls—so allocate a few extra records for safety // and also try again if we're very unlucky. // The loop should only execute one iteration in the common case. - var p []profilerecord.StackRecord - var labels []unsafe.Pointer - n, ok := fetch(nil, nil) + var p []T + n, ok := fetch(nil) for { // Allocate room for a slightly bigger profile, // in case a few more entries have been added // since the call to ThreadProfile. - p = make([]profilerecord.StackRecord, n+10) - labels = make([]unsafe.Pointer, n+10) - n, ok = fetch(p, labels) + p = make([]T, n+10) + n, ok = fetch(p) if ok { p = p[0:n] break @@ -823,17 +874,14 @@ func writeRuntimeProfile(w io.Writer, debug int, name string, fetch func([]profi // Profile grew; try again. 
} - return printCountProfile(w, debug, name, &runtimeProfile{p, labels}) + return printCountProfile(w, debug, name, runtimeProfile[T](p)) } -type runtimeProfile struct { - stk []profilerecord.StackRecord - labels []unsafe.Pointer -} +type runtimeProfile[T capturedStack] []T -func (p *runtimeProfile) Len() int { return len(p.stk) } -func (p *runtimeProfile) Stack(i int) []uintptr { return p.stk[i].Stack } -func (p *runtimeProfile) Label(i int) *labelMap { return (*labelMap)(p.labels[i]) } +func (p runtimeProfile[T]) Len() int { return len(p) } +func (p runtimeProfile[T]) Stack(i int) []uintptr { return p[i].GetStack() } +func (p runtimeProfile[T]) Goroutine(i int) profilerecord.GoroutineRecord { return p[i].GetGoroutine() } var cpu struct { sync.Mutex @@ -996,11 +1044,11 @@ func writeProfileInternal(w io.Writer, debug int, name string, runtimeProfile fu return b.Flush() } -//go:linkname pprof_goroutineProfileWithLabels runtime.pprof_goroutineProfileWithLabels -func pprof_goroutineProfileWithLabels(p []profilerecord.StackRecord, labels []unsafe.Pointer) (n int, ok bool) +//go:linkname pprof_goroutineProfile runtime.pprof_goroutineProfile +func pprof_goroutineProfile(p []profilerecord.GoroutineRecord) (n int, ok bool) -//go:linkname pprof_goroutineLeakProfileWithLabels runtime.pprof_goroutineLeakProfileWithLabels -func pprof_goroutineLeakProfileWithLabels(p []profilerecord.StackRecord, labels []unsafe.Pointer) (n int, ok bool) +//go:linkname pprof_goroutineLeakProfile runtime.pprof_goroutineLeakProfile +func pprof_goroutineLeakProfile(p []profilerecord.GoroutineRecord) (n int, ok bool) //go:linkname pprof_cyclesPerSecond runtime/pprof.runtime_cyclesPerSecond func pprof_cyclesPerSecond() int64 @@ -1022,3 +1070,9 @@ func pprof_fpunwindExpand(dst, src []uintptr) int //go:linkname pprof_makeProfStack runtime.pprof_makeProfStack func pprof_makeProfStack() []uintptr + +//go:linkname pprof_gStatusString runtime.pprof_gStatusString +func pprof_gStatusString(state uint32, reason uint8) string + +//go:linkname pprof_gWaitFor runtime.pprof_gWaitFor +func pprof_gWaitFor(gpstatus uint32, waitsince int64) int64 diff --git a/src/runtime/pprof/pprof_test.go b/src/runtime/pprof/pprof_test.go index b816833e52285f..69daad207b5985 100644 --- a/src/runtime/pprof/pprof_test.go +++ b/src/runtime/pprof/pprof_test.go @@ -1571,7 +1571,7 @@ func TestGoroutineProfileConcurrency(t *testing.T) { goroutineProf := Lookup("goroutine") profilerCalls := func(s string) int { - return strings.Count(s, "\truntime/pprof.runtime_goroutineProfileWithLabels+") + return strings.Count(s, "\truntime/pprof.runtime_goroutineProfile+") } includesFinalizerOrCleanup := func(s string) bool { @@ -1931,6 +1931,195 @@ func BenchmarkGoroutine(b *testing.B) { } } +func TestGoroutineProfileDebug3Creators(t *testing.T) { + // Create synthetic goroutines using the helper + cleanup := createSyntheticGoroutines(10, 2) + defer cleanup() + + p := Lookup("goroutine") + + // Establish a created-by mapping by parsing a debug=2 profile. + createdBy := make(map[string]string) + var buf bytes.Buffer + err := p.WriteTo(&buf, 2) + if err != nil { + t.Fatal(err) + } + for _, goroutine := range strings.Split(buf.String(), "\n\n") { + parts := strings.SplitN(goroutine, " ", 3) + for _, l := range strings.Split(parts[2], "\n") { + if c := strings.Split(l, "in goroutine "); len(c) > 1 { + createdBy[parts[1]] = c[1] + } + } + } + + // Verify the debug=3 format of said profile has matching ID and creator labels. 
+	buf.Reset()
+	err = p.WriteTo(&buf, 3)
+	if err != nil {
+		t.Fatal(err)
+	}
+	parsed, err := profile.Parse(&buf)
+	if err != nil {
+		t.Fatalf("failed to parse profile: %v", err)
+	}
+	if e, g := len(createdBy), len(parsed.Sample); e > g {
+		t.Fatalf("expected %d goroutines, got %d", e, g)
+	}
+	for _, s := range parsed.Sample {
+		id := strconv.Itoa(int(s.NumLabel["go::goroutine"][0]))
+		got := s.NumLabel["go::goroutine_created_by"]
+		if createdBy[id] == "" {
+			if len(got) != 0 {
+				t.Fatalf("goroutine %s: got created_by %q, want none", id, got)
+			}
+		} else {
+			if e := createdBy[id]; len(got) != 1 || strconv.Itoa(int(got[0])) != e {
+				t.Fatalf("goroutine %s: got created_by %q, want %q", id, got, e)
+			}
+		}
+	}
+}
+
+// BenchmarkGoroutineProfileLatencyImpact measures the latency impact on *other*
+// running goroutines during goroutine profile collection, when variable numbers
+// of other goroutines are profiled. This focus on the observed latency in
+// other goroutines is what differentiates this benchmark from other benchmarks
+// of goroutine profiling, such as BenchmarkGoroutine, that measure the
+// performance of profiling itself rather than its impact on the performance
+// of concurrently executing goroutines.
+func BenchmarkGoroutineProfileLatencyImpact(b *testing.B) {
+	for _, numGoroutines := range []int{100, 1000, 10000} {
+		for _, withDepth := range []int{3, 10, 50} {
+			b.Run(fmt.Sprintf("goroutines=%dx%d", numGoroutines, withDepth), func(b *testing.B) {
+				// Set up synthetic blocked goroutines using the helper.
+				cleanup := createSyntheticGoroutines(numGoroutines, withDepth)
+				defer cleanup()
+				for _, debug := range []int{-1, 1, 2, 3} {
+					name := fmt.Sprintf("debug=%d", debug)
+					if debug < 0 {
+						name = "debug=none"
+					}
+					b.Run(name, func(b *testing.B) {
+						// Launch latency probe goroutines.
+						const numProbes = 3
+						probeStop := make(chan struct{})
+						var probeWg, probesReady sync.WaitGroup
+						probes := make([]time.Duration, numProbes)
+						for prober := 0; prober < numProbes; prober++ {
+							probeWg.Add(1)
+							probesReady.Add(1)
+							go func(idx int) {
+								defer probeWg.Done()
+								probesReady.Done()
+								for {
+									select {
+									case <-probeStop:
+										return
+									default:
+										last := time.Now()
+										for i := 0; i < 20; i++ {
+											latency := time.Since(last)
+											last = time.Now()
+											if probes[idx] < latency {
+												probes[idx] = latency
+											}
+										}
+									}
+								}
+							}(prober)
+						}
+						probesReady.Wait()
+						b.ResetTimer()
+						for i := 0; i < b.N; i++ {
+							if debug >= 0 {
+								time.Sleep(3 * time.Millisecond)
+								var buf bytes.Buffer
+								err := Lookup("goroutine").WriteTo(&buf, debug)
+								if err != nil {
+									b.Fatalf("Profile failed: %v", err)
+								}
+								time.Sleep(3 * time.Millisecond)
+							} else {
+								// Just observe the probes without a profile interrupting them.
+								time.Sleep(6 * time.Millisecond)
+							}
+						}
+						b.StopTimer()
+						close(probeStop)
+						probeWg.Wait()
+
+						var probeMax time.Duration
+						for _, probe := range probes {
+							if probe > probeMax {
+								probeMax = probe
+							}
+						}
+						b.ReportMetric(float64(probeMax), "max_latency_ns")
+					})
+				}
+			})
+		}
+	}
+}
+
+// createSyntheticGoroutines creates a requested number of goroutines to provide
+// testdata against which various goroutine profiling mechanisms such as
+// pprof.Lookup("goroutine").WriteTo(debug=1/2) can be tested or benchmarked.
+// After the requested goroutines are started, it returns a cleanup function to
+// stop them. The created goroutines vary their stacks across several synthetic
+// functions, with a minimum stack depth controlled by minDepth.
+func createSyntheticGoroutines(numGoroutines, minDepth int) func() {
+	var syntheticFn0, syntheticFn1, syntheticFn2 func(<-chan struct{}, int)
+
+	syntheticFn0 = func(ch <-chan struct{}, d int) {
+		if d > 0 {
+			syntheticFn1(ch, d-1)
+		}
+		<-ch
+	}
+	syntheticFn1 = func(ch <-chan struct{}, d int) {
+		if d > 0 {
+			syntheticFn2(ch, d-1)
+		}
+		<-ch
+	}
+	syntheticFn2 = func(ch <-chan struct{}, d int) {
+		if d > 0 {
+			syntheticFn0(ch, d-1)
+		}
+		<-ch
+	}
+
+	var wg, setup sync.WaitGroup
+	setup.Add(numGoroutines)
+	blockCh := make(chan struct{})
+
+	for i := 0; i < numGoroutines; i++ {
+		wg.Add(1)
+		go func(id int) {
+			defer wg.Done()
+			setup.Done()
+			depth := minDepth + (id % 7)
+			switch id % 4 {
+			case 0:
+				syntheticFn0(blockCh, depth)
+			case 1:
+				syntheticFn1(blockCh, depth)
+			case 2:
+				syntheticFn2(blockCh, depth)
+			}
+		}(i)
+	}
+	setup.Wait()
+
+	return func() {
+		close(blockCh)
+		wg.Wait()
+	}
+}
+
 var emptyCallStackTestRun int64
 
 // Issue 18836.
diff --git a/src/runtime/pprof/proto.go b/src/runtime/pprof/proto.go
index 28ceb81542110a..1a38de97388d25 100644
--- a/src/runtime/pprof/proto.go
+++ b/src/runtime/pprof/proto.go
@@ -177,6 +177,14 @@ func (b *profileBuilder) pbLabel(tag int, key, str string, num int64) {
 	b.pb.endMessage(tag, start)
 }
 
+// pbLabelNum encodes a Label message to b.pb, with only a num component.
+func (b *profileBuilder) pbLabelNum(tag int, key string, num int64) {
+	start := b.pb.startMessage()
+	b.pb.int64Opt(tagLabel_Key, b.stringIndex(key))
+	b.pb.int64Opt(tagLabel_Num, num)
+	b.pb.endMessage(tag, start)
+}
+
 // pbLine encodes a Line message to b.pb.
 func (b *profileBuilder) pbLine(tag int, funcID uint64, line int64) {
 	start := b.pb.startMessage()
diff --git a/src/runtime/proc.go b/src/runtime/proc.go
index 36949fb2cf0681..0788fcb7389d4a 100644
--- a/src/runtime/proc.go
+++ b/src/runtime/proc.go
@@ -4803,7 +4803,6 @@ func exitsyscall() {
 		throw("exitsyscall: syscall frame is no longer valid")
 	}
 
-	gp.waitsince = 0
 	oldp := gp.m.oldp.ptr()
 	gp.m.oldp = 0
 	if exitsyscallfast(oldp) {
@@ -4817,6 +4816,7 @@ func exitsyscall() {
 			tryRecordGoroutineProfileWB(gp)
 		})
 	}
+	gp.waitsince = 0
 	trace := traceAcquire()
 	if trace.ok() {
 		lostP := oldp != gp.m.p.ptr() || gp.m.syscalltick != gp.m.p.ptr().syscalltick
@@ -4878,6 +4878,10 @@ func exitsyscall() {
 	gp.syscallsp = 0
 	gp.m.p.ptr().syscalltick++
 	gp.throwsplit = false
+
+	// NB: we do not reset gp.waitsince in this branch as we have not yet checked
+	// if we need to collect a profile before resuming; we leave resetting it to
+	// execute().
 }
 
 //go:nosplit
diff --git a/src/runtime/traceback.go b/src/runtime/traceback.go
index de62762ba76906..3aad8bf31f6bbd 100644
--- a/src/runtime/traceback.go
+++ b/src/runtime/traceback.go
@@ -1219,24 +1219,8 @@ func goroutineheader(gp *g) {
 	isScan := gpstatus&_Gscan != 0
 	gpstatus &^= _Gscan // drop the scan bit
 
-	// Basic string status
-	var status string
-	if 0 <= gpstatus && gpstatus < uint32(len(gStatusStrings)) {
-		status = gStatusStrings[gpstatus]
-	} else {
-		status = "???"
-	}
-
-	// Override.
- if (gpstatus == _Gwaiting || gpstatus == _Gleaked) && gp.waitreason != waitReasonZero { - status = gp.waitreason.String() - } - // approx time the G is blocked, in minutes - var waitfor int64 - if (gpstatus == _Gwaiting || gpstatus == _Gsyscall) && gp.waitsince != 0 { - waitfor = (nanotime() - gp.waitsince) / 60e9 - } + print("goroutine ", gp.goid) if gp.m != nil && gp.m.throwing >= throwTypeRuntime && gp == gp.m.curg || level >= 2 { print(" gp=", gp) @@ -1246,6 +1230,7 @@ func goroutineheader(gp *g) { print(" m=nil") } } + status := gStatusString(gpstatus, gp.waitreason) print(" [", status) if gpstatus == _Gleaked { print(" (leaked)") @@ -1261,7 +1246,7 @@ func goroutineheader(gp *g) { // suffix to distinguish it from the non-durable form, add it here. print(" (durable)") } - if waitfor >= 1 { + if waitfor := gWaitFor(gpstatus, gp.waitsince); waitfor >= 1 { print(", ", waitfor, " minutes") } if gp.lockedm != 0 { @@ -1273,6 +1258,38 @@ func goroutineheader(gp *g) { print("]:\n") } +//go:linkname pprof_gWaitFor +func pprof_gWaitFor(gpstatus uint32, waitsince int64) int64 { + return gWaitFor(gpstatus, waitsince) +} + +//go:linkname pprof_gStatusString +func pprof_gStatusString(gpstatus uint32, reason waitReason) string { + return gStatusString(gpstatus, reason) +} + +// gStatusString returns a string representing the status of a goroutine. +func gStatusString(gpstatus uint32, reason waitReason) string { + var status string + if gpstatus < uint32(len(gStatusStrings)) { + status = gStatusStrings[gpstatus] + } else { + status = "???" + } + if (gpstatus == _Gwaiting || gpstatus == _Gleaked) && reason != waitReasonZero { + status = reason.String() + } + return status +} + +// gWaitFor returns the number of minutes that the goroutine has been waiting. +func gWaitFor(gpstatus uint32, waitsince int64) int64 { + if (gpstatus == _Gwaiting || gpstatus == _Gsyscall) && waitsince != 0 { + return (nanotime() - waitsince) / 60e9 + } + return 0 +} + func tracebackothers(me *g) { tracebacksomeothers(me, func(*g) bool { return true }) }
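A client-side sketch of the net/http/pprof behavior changed above, where
goroutine?debug=3 is now served as binary proto rather than text. The listen
address, the fixed sleep, and the github.com/google/pprof/profile parser are
illustrative assumptions:

    package main

    import (
        "log"
        "net/http"
        _ "net/http/pprof" // registers /debug/pprof handlers on the default mux
        "time"

        "github.com/google/pprof/profile"
    )

    func main() {
        go func() { log.Println(http.ListenAndServe("localhost:6060", nil)) }()
        time.Sleep(100 * time.Millisecond) // crude wait for the listener; fine for a sketch

        resp, err := http.Get("http://localhost:6060/debug/pprof/goroutine?debug=3")
        if err != nil {
            log.Fatal(err)
        }
        defer resp.Body.Close()

        // Unlike debug=1/2, debug=3 arrives as application/octet-stream.
        p, err := profile.Parse(resp.Body)
        if err != nil {
            log.Fatal(err)
        }
        log.Printf("content-type=%q goroutine samples=%d", resp.Header.Get("Content-Type"), len(p.Sample))
    }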