Merge #147827

craig[bot] · stevendanna · craig[bot] · commit b8c93adecc7f · 2025-06-12T23:53:16.000Z
147827: rpc: optimize VerifyClockOffset a bit r=RaduBerinde a=stevendanna This replaces some of our stats calculations with new versions that assume sorted input, avoiding a good deal of the copying and sorting contained in the library we were using. Before: BenchmarkVerifyClockOffset 5313 203408 ns/op 82118 B/op 5 allocs/op After: BenchmarkVerifyClockOffset 10000 102450 ns/op 16489 B/op 1 allocs/op Fixes #147825 Release note: None Co-authored-by: Steven Danna <danna@cockroachlabs.com>
diff --git a/pkg/rpc/BUILD.bazel b/pkg/rpc/BUILD.bazel
@@ -157,6 +157,7 @@ go_test(
         "//pkg/util/leaktest",
         "//pkg/util/log",
         "//pkg/util/netutil",
+        "//pkg/util/randutil",
         "//pkg/util/stop",
         "//pkg/util/syncutil",
         "//pkg/util/timeutil",
@@ -168,6 +169,9 @@ go_test(
         "@com_github_gogo_protobuf//types",
         "@com_github_gogo_status//:status",
         "@com_github_golang_mock//gomock",
+        "@com_github_google_go_cmp//cmp",
+        "@com_github_google_go_cmp//cmp/cmpopts",
+        "@com_github_montanaflynn_stats//:stats",
         "@com_github_prometheus_client_model//go",
         "@com_github_stretchr_testify//assert",
         "@com_github_stretchr_testify//require",
diff --git a/pkg/rpc/clock_offset.go b/pkg/rpc/clock_offset.go
@@ -8,10 +8,12 @@ package rpc
 import (
 	"context"
 	"math"
+	"sort"
 	"time"
 
 	"github.com/VividCortex/ewma"
 	"github.com/cockroachdb/cockroach/pkg/roachpb"
+	"github.com/cockroachdb/cockroach/pkg/util/buildutil"
 	"github.com/cockroachdb/cockroach/pkg/util/hlc"
 	"github.com/cockroachdb/cockroach/pkg/util/log"
 	"github.com/cockroachdb/cockroach/pkg/util/metric"
@@ -340,7 +342,7 @@ func (r *RemoteClockMonitor) VerifyClockOffset(ctx context.Context) error {
 
 	now := r.clock.Now()
 	healthyOffsetCount := 0
-
+	sum := float64(0)
 	offsets, numClocks := func() (stats.Float64Data, int) {
 		r.mu.Lock()
 		defer r.mu.Unlock()
@@ -351,31 +353,24 @@ func (r *RemoteClockMonitor) VerifyClockOffset(ctx context.Context) error {
 				delete(r.mu.offsets, id)
 				continue
 			}
-			offs = append(offs, float64(offset.Offset+offset.Uncertainty))
-			offs = append(offs, float64(offset.Offset-offset.Uncertainty))
+			off1 := float64(offset.Offset + offset.Uncertainty)
+			off2 := float64(offset.Offset - offset.Uncertainty)
+			sum += off1 + off2
+			offs = append(offs, off1, off2)
 			if offset.isHealthy(ctx, r.toleratedOffset) {
 				healthyOffsetCount++
 			}
 		}
 		return offs, len(r.mu.offsets)
 	}()
 
-	mean, err := offsets.Mean()
-	if err != nil && !errors.Is(err, stats.EmptyInput) {
-		return err
-	}
-	stdDev, err := offsets.StandardDeviation()
-	if err != nil && !errors.Is(err, stats.EmptyInput) {
-		return err
-	}
-	median, err := offsets.Median()
-	if err != nil && !errors.Is(err, stats.EmptyInput) {
-		return err
-	}
-	medianAbsoluteDeviation, err := offsets.MedianAbsoluteDeviation()
-	if err != nil && !errors.Is(err, stats.EmptyInput) {
-		return err
-	}
+	sort.Float64s(offsets)
+
+	mean := sum / float64(len(offsets))
+	stdDev := StandardDeviationPopulationKnownMean(offsets, mean)
+	median := MedianSortedInput(offsets)
+	medianAbsoluteDeviation := MedianAbsoluteDeviationPopulationSortedInput(offsets)
+
 	r.metrics.ClockOffsetMeanNanos.Update(int64(mean))
 	r.metrics.ClockOffsetStdDevNanos.Update(int64(stdDev))
 	r.metrics.ClockOffsetMedianNanos.Update(int64(median))
@@ -458,3 +453,99 @@ func updateClockOffsetTracking(
 	remoteClocks.UpdateOffset(ctx, nodeID, offset, pingDuration)
 	return pingDuration, offset, remoteClocks.VerifyClockOffset(ctx)
 }
+
+// The following statistics functions are re-implementations of similar
+// functions provided by github.com/montanaflynn/stats. The original functions
+// were offered under:
+//
+//	The MIT License (MIT)
+//
+//	Copyright (c) 2014-2023 Montana Flynn (https://montanaflynn.com)
+//
+//	Permission is hereby granted, free of charge, to any person obtaining a copy
+//	of this software and associated documentation files (the "Software"), to deal
+//	in the Software without restriction, including without limitation the rights
+//	to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+//	copies of the Software, and to permit persons to whom the Software is
+//	furnished to do so, subject to the following conditions:
+//
+//	The above copyright notice and this permission notice shall be included in all
+//	copies or substantial portions of the Software.
+//
+
+// StandardDeviationPopulationKnownMean returns the population standard
+// deviation of input, i.e. the square root of PopulationVarianceKnownMean. The
+// mean is trusted to be the mean of input; empty input yields NaN.
+func StandardDeviationPopulationKnownMean(input stats.Float64Data, mean float64) float64 {
+	if input.Len() == 0 {
+		return math.NaN()
+	}
+	return math.Sqrt(PopulationVarianceKnownMean(input, mean))
+}
+
+// PopulationVarianceKnownMean returns the population variance of input (the
+// divisor is input.Len()), trusting mean to be input's mean. Empty input is NaN.
+func PopulationVarianceKnownMean(input stats.Float64Data, mean float64) float64 {
+	if input.Len() == 0 {
+		return math.NaN()
+	}
+	variance := float64(0) // holds the sum of squared deviations until the final division
+	for _, n := range input {
+		diff := n - mean
+		variance += diff * diff
+	}
+	return variance / float64(input.Len())
+}
+
+// MedianSortedInput returns the median of sortedInput without copying or
+// sorting it; empty input yields NaN. The caller must pass sorted input.
+func MedianSortedInput(sortedInput stats.Float64Data) float64 {
+	if buildutil.CrdbTestBuild { // sortedness is only verified when this flag is set
+		if !sort.IsSorted(sortedInput) {
+			panic("MedianSortedInput expects sorted input")
+		}
+	}
+
+	l := len(sortedInput)
+	if l == 0 {
+		return math.NaN()
+	} else if l%2 == 0 { // even length: average the two middle elements
+		return (sortedInput[(l/2)-1] + sortedInput[(l/2)]) / 2.0
+	} else {
+		return sortedInput[l/2]
+	}
+}
+
+// MedianAbsoluteDeviationPopulationSortedInput returns the median of the
+// absolute deviations from the median of pre-sorted input; empty input is NaN.
+func MedianAbsoluteDeviationPopulationSortedInput(sortedInput stats.Float64Data) float64 {
+	switch sortedInput.Len() {
+	case 0:
+		return math.NaN()
+	case 1:
+		return 0
+	}
+
+	m := MedianSortedInput(sortedInput)
+	a := sortedInput // shrinking window; sorted, so its largest deviation sits at one end
+
+	// Peel off the larger deviation from either end until half the input remains.
+	last := 0.0 // most recently peeled deviation
+	for len(a) > (len(sortedInput) / 2) {
+		leftDiff := m - a[0]
+		rightDiff := a[len(a)-1] - m
+		if leftDiff >= rightDiff {
+			last = leftDiff
+			a = a[1:]
+		} else {
+			last = rightDiff
+			a = a[:len(a)-1]
+		}
+	}
+
+	if len(sortedInput)%2 == 1 { // odd length: last is the middle-ranked deviation
+		return last
+	} else { // even length: average last with the largest deviation left in the window
+		return (max(m-a[0], a[len(a)-1]-m) + last) * 0.5
+	}
+}
diff --git a/pkg/rpc/clock_offset_test.go b/pkg/rpc/clock_offset_test.go
@@ -8,14 +8,20 @@ package rpc
 import (
 	"context"
 	"math"
+	"sort"
 	"testing"
 	"time"
 
 	"github.com/cockroachdb/cockroach/pkg/roachpb"
 	"github.com/cockroachdb/cockroach/pkg/testutils"
 	"github.com/cockroachdb/cockroach/pkg/util/leaktest"
+	"github.com/cockroachdb/cockroach/pkg/util/randutil"
 	"github.com/cockroachdb/cockroach/pkg/util/stop"
 	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
+	"github.com/google/go-cmp/cmp"
+	"github.com/google/go-cmp/cmp/cmpopts"
+	"github.com/montanaflynn/stats"
+	"github.com/stretchr/testify/require"
 )
 
 const errOffsetGreaterThanMaxOffset = "clock synchronization error: this node is more than .+ away from at least half of the known nodes"
@@ -256,3 +262,94 @@ func TestResettingMaxTrigger(t *testing.T) {
 		}
 	}
 }
+
+// TestStatsFuncs checks our descriptive stats functions against the
+// montanaflynn/stats reference implementations on one random data set.
+func TestStatsFuncs(t *testing.T) {
+	defer leaktest.AfterTest(t)()
+	rng, _ := randutil.NewTestRand()
+	size := rng.Intn(1000) + 1
+	data := make(stats.Float64Data, size)
+	for i := range size {
+		neg := 1
+		if rng.Float64() > 0.5 {
+			neg = -1
+		}
+		data[i] = float64(neg) * float64(rng.Int63())
+	}
+
+	// TODO(ssd): You'll note that the subtests differ in whether they compare
+	// operations on the unsorted data or the sorted data. This is to avoid
+	// failures caused by floating point error. I had hoped to always compare the
+	// unsorted data passed to the reference implementation with the sorted data
+	// passed to our implementation. But even with the
+	// floatWithinReasonableTolerance function below, the non-associativity of
+	// floating point arithmetic accumulates too much error over enough operations.
+	sortedData := make(stats.Float64Data, size)
+	copy(sortedData, data)
+	sort.Float64s(sortedData)
+
+	mean, err := sortedData.Mean()
+	require.NoError(t, err)
+
+	floatWithinReasonableTolerance := func(t *testing.T, expected, actual float64) {
+		const tolerance = 0.0001 // relative fraction; the absolute margin passed below is 0
+		withinTolerance := cmp.Equal(expected, actual, cmpopts.EquateApprox(tolerance, 0))
+		if !withinTolerance {
+			t.Errorf("values outside tolerance\n  %f (expected)\n  %f (actual)\n  %f (tolerance)", expected, actual, tolerance)
+		}
+	}
+
+	t.Run("StandardDeviationPopulationKnownMean", func(t *testing.T) {
+		ourStdDev := StandardDeviationPopulationKnownMean(data, mean)
+		theirStdDev, err := stats.StandardDeviation(data)
+		require.NoError(t, err)
+		floatWithinReasonableTolerance(t, theirStdDev, ourStdDev)
+	})
+
+	t.Run("MedianSortedInput", func(t *testing.T) {
+		ourMedian := MedianSortedInput(sortedData)
+		theirMedian, err := stats.Median(data)
+		require.NoError(t, err)
+		floatWithinReasonableTolerance(t, theirMedian, ourMedian)
+	})
+
+	t.Run("PopulationVarianceKnownMean", func(t *testing.T) {
+		ourVar := PopulationVarianceKnownMean(sortedData, mean)
+		theirVar, err := stats.PopulationVariance(sortedData)
+		require.NoError(t, err)
+		floatWithinReasonableTolerance(t, theirVar, ourVar)
+	})
+
+	t.Run("MedianAbsoluteDeviationPopulationSortedInput", func(t *testing.T) {
+		ourMedAbsDev := MedianAbsoluteDeviationPopulationSortedInput(sortedData)
+		theirMedianAbsDev, err := stats.MedianAbsoluteDeviationPopulation(data)
+		require.NoError(t, err)
+		floatWithinReasonableTolerance(t, theirMedianAbsDev, ourMedAbsDev)
+	})
+}
+
+func BenchmarkVerifyClockOffset(b *testing.B) {
+	defer leaktest.AfterTest(b)()
+
+	clock := timeutil.NewManualTime(timeutil.Unix(0, 123))
+	maxOffset := 50 * time.Nanosecond
+	monitor := newRemoteClockMonitor(clock, maxOffset, time.Hour, 0)
+	rng, _ := randutil.NewTestRand()
+
+	offsetCount := 1000 // number of remote offsets installed before timing starts
+	monitor.mu.offsets = make(map[roachpb.NodeID]RemoteOffset)
+	for i := range offsetCount {
+		neg := int64(1)
+		if rng.Float64() > 0.5 {
+			neg = -1
+		}
+		offset := neg * int64(rng.Float64()*float64(maxOffset)) // random offset with magnitude below maxOffset
+		monitor.mu.offsets[roachpb.NodeID(i)] = RemoteOffset{Offset: offset}
+	}
+
+	b.ResetTimer() // keep the setup above out of the measurement
+	for i := 0; i < b.N; i++ {
+		require.NoError(b, monitor.VerifyClockOffset(context.Background()))
+	}
+}