Skip to content

Commit 6ddfc2c

Browse files
committed
kvnemesis: log a metrics report
This commit adds logging of some key metrics at the end of each kvnemesis run. Fixes: #153793 Release note: None
1 parent a7bdedf commit 6ddfc2c

File tree

3 files changed

+98
-8
lines changed

3 files changed

+98
-8
lines changed

pkg/kv/kvnemesis/BUILD.bazel

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,7 @@ go_test(
113113
"//pkg/util/hlc",
114114
"//pkg/util/leaktest",
115115
"//pkg/util/log",
116+
"//pkg/util/metric",
116117
"//pkg/util/randutil",
117118
"//pkg/util/stop",
118119
"//pkg/util/syncutil",

pkg/kv/kvnemesis/kvnemesis_test.go

Lines changed: 82 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ import (
1313
"os"
1414
"path"
1515
"strconv"
16+
"strings"
1617
"testing"
1718
"time"
1819

@@ -46,6 +47,7 @@ import (
4647
"github.com/cockroachdb/cockroach/pkg/util/hlc"
4748
"github.com/cockroachdb/cockroach/pkg/util/leaktest"
4849
"github.com/cockroachdb/cockroach/pkg/util/log"
50+
"github.com/cockroachdb/cockroach/pkg/util/metric"
4951
"github.com/cockroachdb/cockroach/pkg/util/randutil"
5052
"github.com/cockroachdb/cockroach/pkg/util/stop"
5153
"github.com/cockroachdb/cockroach/pkg/util/syncutil"
@@ -698,19 +700,91 @@ func testKVNemesisImpl(t testing.TB, cfg kvnemesisTestCfg) {
698700
env := &Env{SQLDBs: sqlDBs, Tracker: tr, L: logger, Partitioner: &partitioner, Restarter: tc}
699701
failures, err := RunNemesis(ctx, rng, env, config, cfg.concurrency, cfg.numSteps, cfg.mode, dbs...)
700702

701-
for i := 0; i < cfg.numNodes; i++ {
702-
t.Logf("[%d] proposed: %d", i,
703-
tc.GetFirstStoreFromServer(t, i).Metrics().RaftCommandsProposed.Count())
704-
t.Logf("[%d] reproposed unchanged: %d", i,
705-
tc.GetFirstStoreFromServer(t, i).Metrics().RaftCommandsReproposed.Count())
706-
t.Logf("[%d] reproposed with new LAI: %d", i,
707-
tc.GetFirstStoreFromServer(t, i).Metrics().RaftCommandsReproposedLAI.Count())
708-
}
703+
logMetricsReport(t, tc)
709704

710705
require.NoError(t, err, `%+v`, err)
711706
require.Zero(t, len(failures), "kvnemesis detected failures") // they've been logged already
712707
}
713708

709+
func logMetricsReport(t testing.TB, tc *testcluster.TestCluster) {
710+
metricsOfInterest := []string{
711+
// Raft command metrics
712+
"raft.commands.proposed",
713+
"raft.commands.reproposed.new-lai",
714+
"raft.commands.reproposed.unchanged",
715+
// Transaction metrics
716+
"txn.aborts",
717+
"txn.durations",
718+
"txn.restarts.writetooold",
719+
"txn.restarts.serializable",
720+
"txn.restarts.readwithinuncertainty",
721+
"txn.server_side.1PC.success",
722+
"txnrecovery.failures",
723+
"txnrecovery.successes.aborted",
724+
"txnrecovery.successes.committed",
725+
"txnwaitqueue.deadlocks_total",
726+
}
727+
728+
numNodes := tc.NumServers()
729+
nodeMetrics := make([]map[string]string, numNodes)
730+
metricsMap := make(map[string]struct{}, len(metricsOfInterest))
731+
for _, m := range metricsOfInterest {
732+
metricsMap[m] = struct{}{}
733+
}
734+
for i := 0; i < numNodes; i++ {
735+
nodeMetrics[i] = make(map[string]string)
736+
processMetric := func(name string, v interface{}) {
737+
switch val := v.(type) {
738+
case *metric.Counter:
739+
nodeMetrics[i][name] = fmt.Sprintf("%d", val.Count())
740+
case *metric.Gauge:
741+
nodeMetrics[i][name] = fmt.Sprintf("%d", val.Value())
742+
case metric.IHistogram:
743+
// TODO(mira): Currently, histograms are assumed to store values of type
744+
// time duration.
745+
snapshot := val.CumulativeSnapshot()
746+
makePretty := func(nanos float64) time.Duration {
747+
return time.Duration(nanos).Round(time.Millisecond)
748+
}
749+
meanDur := makePretty(snapshot.Mean())
750+
p50Dur := makePretty(snapshot.ValueAtQuantile(50))
751+
p99Dur := makePretty(snapshot.ValueAtQuantile(99))
752+
nodeMetrics[i][name] = fmt.Sprintf("μ=%v p50=%v p99=%v", meanDur, p50Dur, p99Dur)
753+
default:
754+
nodeMetrics[i][name] = fmt.Sprintf("unknown:%T", v)
755+
}
756+
}
757+
758+
clientRegistry := tc.Server(i).StorageLayer().MetricsRecorder().AppRegistry()
759+
nodeRegistry := tc.Server(i).StorageLayer().MetricsRecorder().NodeRegistry()
760+
storeID := tc.GetFirstStoreFromServer(t, i).StoreID()
761+
storeRegistry := tc.Server(i).StorageLayer().MetricsRecorder().StoreRegistry(storeID)
762+
clientRegistry.Select(metricsMap, func(name string, v interface{}) { processMetric(name, v) })
763+
nodeRegistry.Select(metricsMap, func(name string, v interface{}) { processMetric(name, v) })
764+
storeRegistry.Select(metricsMap, func(name string, v interface{}) { processMetric(name, v) })
765+
}
766+
767+
header := fmt.Sprintf("%-35s", "Metric")
768+
for i := 0; i < numNodes; i++ {
769+
header += fmt.Sprintf(" | %-30s", fmt.Sprintf("Node %d", i+1))
770+
}
771+
t.Log(header)
772+
773+
separator := strings.Repeat("-", 35)
774+
for i := 0; i < numNodes; i++ {
775+
separator += "-+-" + strings.Repeat("-", 30)
776+
}
777+
t.Log(separator)
778+
779+
for _, metricName := range metricsOfInterest {
780+
row := fmt.Sprintf("%-35s", metricName)
781+
for i := 0; i < numNodes; i++ {
782+
row += fmt.Sprintf(" | %-30s", nodeMetrics[i][metricName])
783+
}
784+
t.Log(row)
785+
}
786+
}
787+
714788
// TestRunReproductionSteps is a helper that allows quickly running a kvnemesis
715789
// history.
716790
func TestRunReproductionSteps(t *testing.T) {

pkg/server/status/recorder.go

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -244,6 +244,21 @@ func (mr *MetricsRecorder) AppRegistry() *metric.Registry {
244244
return mr.mu.appRegistry
245245
}
246246

247+
// NodeRegistry returns the metric registry for node-level metrics.
248+
func (mr *MetricsRecorder) NodeRegistry() *metric.Registry {
249+
mr.mu.Lock()
250+
defer mr.mu.Unlock()
251+
return mr.mu.logRegistry
252+
}
253+
254+
// StoreRegistry returns the metric registry for store-level metrics
255+
// corresponding to the provided store ID.
256+
func (mr *MetricsRecorder) StoreRegistry(id roachpb.StoreID) *metric.Registry {
257+
mr.mu.Lock()
258+
defer mr.mu.Unlock()
259+
return mr.mu.storeRegistries[id]
260+
}
261+
247262
// AddNode adds various metric registries an initialized server, along
248263
// with its descriptor and start time.
249264
// The registries are:

0 commit comments

Comments
 (0)