99 "bytes"
1010 "context"
1111 "fmt"
12+ "sort"
1213
1314 "github.com/cockroachdb/cockroach/pkg/base"
1415 "github.com/cockroachdb/cockroach/pkg/jobs"
@@ -19,6 +20,7 @@ import (
1920 "github.com/cockroachdb/cockroach/pkg/util/log"
2021 "github.com/cockroachdb/cockroach/pkg/util/timeutil"
2122 "github.com/cockroachdb/cockroach/pkg/util/tracing"
23+ "github.com/cockroachdb/cockroach/pkg/util/tracing/tracingpb"
2224)
2325
2426// ConstructTracingAggregatorProducerMeta constructs a ProducerMetadata that
@@ -45,25 +47,37 @@ func ConstructTracingAggregatorProducerMeta(
4547 }
4648 })
4749
50+ sp := tracing .SpanFromContext (ctx )
51+ if sp != nil {
52+ recType := sp .RecordingType ()
53+ if recType != tracingpb .RecordingOff {
54+ aggEvents .SpanTotals = sp .GetFullTraceRecording (recType ).Root .ChildrenMetadata
55+ }
56+ }
4857 return & execinfrapb.ProducerMetadata {AggregatorEvents : aggEvents }
4958}
5059
// ComponentAggregatorStats is a mapping from a component to all the Aggregator
// Stats (events plus span totals) collected for that component.
type ComponentAggregatorStats map[execinfrapb.ComponentID]execinfrapb.TracingAggregatorEvents
5463
5564// DeepCopy takes a deep copy of the component aggregator stats map.
5665func (c ComponentAggregatorStats ) DeepCopy () ComponentAggregatorStats {
5766 mapCopy := make (ComponentAggregatorStats , len (c ))
5867 for k , v := range c {
59- innerMap := make (map [string ][]byte , len (v ))
60- for k2 , v2 := range v {
68+ copied := v
69+ copied .Events = make (map [string ][]byte , len (v .Events ))
70+ copied .SpanTotals = make (map [string ]tracingpb.OperationMetadata , len (v .SpanTotals ))
71+ for k2 , v2 := range v .Events {
6172 // Create a copy of the byte slice to avoid modifying the original data.
6273 dataCopy := make ([]byte , len (v2 ))
6374 copy (dataCopy , v2 )
64- innerMap [k2 ] = dataCopy
75+ copied .Events [k2 ] = dataCopy
76+ }
77+ for k2 , v2 := range v .SpanTotals {
78+ copied .SpanTotals [k2 ] = v2
6579 }
66- mapCopy [k ] = innerMap
80+ mapCopy [k ] = copied
6781 }
6882 return mapCopy
6983}
@@ -82,58 +96,76 @@ func FlushTracingAggregatorStats(
8296 db isql.DB ,
8397 perNodeAggregatorStats ComponentAggregatorStats ,
8498) error {
85- return db . Txn ( ctx , func ( ctx context. Context , txn isql. Txn ) error {
86- clusterWideAggregatorStats := make (map [string ]tracing.AggregatorEvent )
87- asOf := timeutil . Now (). Format ( "20060102_150405.00" )
99+ clusterWideSpanStats := make ( map [ string ]tracingpb. OperationMetadata )
100+ clusterWideAggregatorStats := make (map [string ]tracing.AggregatorEvent )
101+ ids := make ([]execinfrapb. ComponentID , 0 , len ( perNodeAggregatorStats ) )
88102
89- var clusterWideSummary bytes.Buffer
90- for component , nameToEvent := range perNodeAggregatorStats {
91- clusterWideSummary .WriteString (fmt .Sprintf ("## SQL Instance ID: %s; Flow ID: %s\n \n " ,
92- component .SQLInstanceID .String (), component .FlowID .String ()))
93- for name , event := range nameToEvent {
94- // Write a proto file per tag. This machine-readable file can be consumed
95- // by other places we want to display this information egs: annotated
96- // DistSQL diagrams, DBConsole etc.
97- filename := fmt .Sprintf ("%s/%s" ,
98- component .SQLInstanceID .String (), asOf )
99- msg , err := protoreflect .DecodeMessage (name , event )
100- if err != nil {
101- clusterWideSummary .WriteString (fmt .Sprintf ("invalid protocol message: %v" , err ))
102- // If we failed to decode the event write the error to the file and
103- // carry on.
104- continue
105- }
106-
107- if err := jobs .WriteProtobinExecutionDetailFile (ctx , filename , msg , txn , jobID ); err != nil {
108- return err
109- }
110-
111- // Construct a single text file that contains information on a per-node
112- // basis as well as a cluster-wide aggregate.
113- clusterWideSummary .WriteString (fmt .Sprintf ("# %s\n " , name ))
114-
115- aggEvent := msg .(tracing.AggregatorEvent )
116- clusterWideSummary .WriteString (aggEvent .String ())
117- clusterWideSummary .WriteString ("\n " )
118-
119- if _ , ok := clusterWideAggregatorStats [name ]; ok {
120- clusterWideAggregatorStats [name ].Combine (aggEvent )
121- } else {
122- clusterWideAggregatorStats [name ] = aggEvent
123- }
124- }
103+ for component := range perNodeAggregatorStats {
104+ ids = append (ids , component )
105+ }
106+ sort .Slice (ids , func (i , j int ) bool { return ids [i ].SQLInstanceID < ids [j ].SQLInstanceID })
107+
108+ // Write a summary for each per-node to a buffer. While doing so, accumulate a
109+ // cluster-wide summary as well to be written to a second buffer below.
110+ var perNode bytes.Buffer
111+ fmt .Fprintf (& perNode , "# Per-componant Details (%d)\n " , len (perNodeAggregatorStats ))
112+ for _ , component := range ids {
113+ nodeStats := perNodeAggregatorStats [component ]
114+ fmt .Fprintf (& perNode , "# SQL Instance ID: %s (%s); Flow/proc ID: %s/%d\n \n " ,
115+ component .SQLInstanceID , component .Region , component .FlowID , component .ID )
116+
117+ // Print span stats.
118+ perNode .WriteString ("## Span Totals\n \n " )
119+ for name , stats := range nodeStats .SpanTotals {
120+ fmt .Fprintf (& perNode , "- %-40s (%d):\t %s\n " , name , stats .Count , stats .Duration )
121+ }
122+ perNode .WriteString ("\n " )
123+
124+ // Add span stats to the cluster-wide span stats.
125+ for spanName , totals := range nodeStats .SpanTotals {
126+ clusterWideSpanStats [spanName ] = clusterWideSpanStats [spanName ].Combine (totals )
125127 }
126128
127- for tag , event := range clusterWideAggregatorStats {
128- clusterWideSummary .WriteString ("## Cluster-wide\n \n " )
129- clusterWideSummary .WriteString (fmt .Sprintf ("# %s\n " , tag ))
130- clusterWideSummary .WriteString (event .String ())
129+ perNode .WriteString ("## Aggregate Stats\n \n " )
130+ for name , event := range nodeStats .Events {
131+ msg , err := protoreflect .DecodeMessage (name , event )
132+ if err != nil {
133+ continue
134+ }
135+ aggEvent := msg .(tracing.AggregatorEvent )
136+ fmt .Fprintf (& perNode , "- %s:\n %s\n \n " , name , aggEvent )
137+
138+ // Populate the cluster-wide aggregator stats.
139+ if _ , ok := clusterWideAggregatorStats [name ]; ok {
140+ clusterWideAggregatorStats [name ].Combine (aggEvent )
141+ } else {
142+ clusterWideAggregatorStats [name ] = aggEvent
143+ }
131144 }
145+ perNode .WriteString ("\n " )
146+ }
132147
133- // Ensure the file always has a trailing newline, regardless of whether or
134- // not the loops above wrote anything.
135- clusterWideSummary .WriteString ("\n " )
136- filename := fmt .Sprintf ("aggregatorstats.%s.txt" , asOf )
137- return jobs .WriteExecutionDetailFile (ctx , filename , clusterWideSummary .Bytes (), txn , jobID )
148+ // Write the cluster-wide summary.
149+ var combined bytes.Buffer
150+ combined .WriteString ("# Cluster-wide\n \n " )
151+ combined .WriteString ("## Span Totals\n \n " )
152+ for name , stats := range clusterWideSpanStats {
153+ fmt .Fprintf (& combined , " - %-40s (%d):\t %s\n " , name , stats .Count , stats .Duration )
154+ }
155+ combined .WriteString ("\n " )
156+ combined .WriteString ("## Aggregate Stats\n \n " )
157+ for name , ev := range clusterWideAggregatorStats {
158+ fmt .Fprintf (& combined , " - %s:\n %s\n " , name , ev )
159+ }
160+ combined .WriteString ("\n " )
161+
162+ return db .Txn (ctx , func (ctx context.Context , txn isql.Txn ) error {
163+ asOf := timeutil .Now ().Format ("20060102_150405.00" )
164+ combinedFilename := fmt .Sprintf ("%s/trace-stats-cluster-wide.txt" , asOf )
165+ perNodeFilename := fmt .Sprintf ("%s/trace-stats-by-node.txt" , asOf )
166+ if err := jobs .WriteExecutionDetailFile (ctx , combinedFilename , combined .Bytes (), txn , jobID ); err != nil {
167+ return err
168+ }
169+ return jobs .WriteExecutionDetailFile (ctx , perNodeFilename , perNode .Bytes (), txn , jobID )
138170 })
139171}
0 commit comments