asim: improve skewedDistribution and its testing

wenyihu6 · wenyihu6 · commit cb851688acd8 · 2025-08-04T11:43:20.000-04:00
Previously, we fixed the skewedDistribution function by generating weights that
decrease by a factor of 1/2 for each subsequent store, and then normalizing them
so that they sum up to 1. The result represented a skewed replica weight
distribution across stores. However, this required two passes: one to generate
and sum the weights, and another to normalize them. This commit improves the
logic by using the finite sum of a geometric series to pre-compute the total,
allowing normalization in a single pass. It also improves test coverage by
adding an echotest that asserts the expected output of the helper functions.
diff --git a/pkg/kv/kvserver/asim/state/BUILD.bazel b/pkg/kv/kvserver/asim/state/BUILD.bazel
@@ -72,7 +72,9 @@ go_test(
         "//pkg/kv/kvserver/load",
         "//pkg/roachpb",
         "//pkg/testutils/datapathutils",
+        "//pkg/testutils/echotest",
         "//pkg/util/hlc",
+        "//pkg/util/leaktest",
         "@com_github_cockroachdb_datadriven//:datadriven",
         "@com_github_stretchr_testify//require",
     ],
diff --git a/pkg/kv/kvserver/asim/state/new_state.go b/pkg/kv/kvserver/asim/state/new_state.go
@@ -39,21 +39,21 @@ func evenDistribution(numOfStores int) []float64 {
 	return distribution
 }
 
-func skewedDistribution(numOfStores, k int) []float64 {
+func skewedDistribution(numOfStores int) []float64 {
 	weights := make([]float64, numOfStores)
-	var total float64
-	// Compute weights.
+	// Sum of weights. Since the raw weights won't add up to 1, we normalize
+	// them by dividing by the sum. The sum is pre-computed here using the partial
+	// sum formula of a geometric series: sum of 2^(-i) from i = 0 to k gives
+	// 2-2^(-k), where k = numOfStores-1.
+	// Example: given 3 stores, cur(weights before normalization) is 1, 0.5, 0.25,
+	// sum is 2.0-2^(-2) = 1.75. After normalization, weights are 0.57, 0.29,
+	// 0.14.
+	sum := 2.0 - math.Pow(2, float64(-(numOfStores-1)))
+	cur := float64(1)
 	for i := 0; i < numOfStores; i++ {
-		// weight[0] = 2^(n-1)
-		// weight[1] = 2^(n-2)
-		// ...
-		// weight[n-1] = 2^0
-		weights[i] = math.Pow(2, float64(numOfStores-i-1))
-		total += weights[i]
-	}
-	// Normalize to get ratios.
-	for i := 0; i < numOfStores; i++ {
-		weights[i] /= total
+		// cur is 1, 0.5, 0.25, ...
+		weights[i] = cur / sum
+		cur /= 2
 	}
 	return weights
 }
@@ -285,7 +285,7 @@ func makeStoreList(stores int) []StoreID {
 func RangesInfoSkewedDistribution(
 	stores int, ranges int, minKey int64, maxKey int64, replicationFactor int, rangeSize int64,
 ) RangesInfo {
-	distribution := skewedDistribution(stores, ranges)
+	distribution := skewedDistribution(stores)
 	storeList := makeStoreList(stores)
 
 	return RangesInfoWithDistribution(
diff --git a/pkg/kv/kvserver/asim/state/state_test.go b/pkg/kv/kvserver/asim/state/state_test.go
@@ -6,7 +6,9 @@
 package state
 
 import (
+	"fmt"
 	"math/rand"
+	"strings"
 	"testing"
 	"time"
 
@@ -15,6 +17,9 @@ import (
 	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/liveness/livenesspb"
 	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/load"
 	"github.com/cockroachdb/cockroach/pkg/roachpb"
+	"github.com/cockroachdb/cockroach/pkg/testutils/datapathutils"
+	"github.com/cockroachdb/cockroach/pkg/testutils/echotest"
+	"github.com/cockroachdb/cockroach/pkg/util/leaktest"
 	"github.com/stretchr/testify/require"
 )
 
@@ -414,53 +419,6 @@ func TestOrderedStateLists(t *testing.T) {
 	s = NewStateWeightedRandDistribution(defaultSeed, []float64{0.0, 0.1, 0.3, 0.6}, 1400, 10000, 3, settings)
 	assertListsOrdered(s)
 }
-func TestSkewedDistribution(t *testing.T) {
-	rangeInfo := RangesInfoSkewedDistribution(
-		6 /*stores*/, 100 /*ranges*/, 1 /*minKey*/, 10000 /*maxKey*/, 3 /*replicationFactor*/, 10000 /*rangeSize*/)
-	expectedStoreReplicas := map[roachpb.StoreID]int{
-		1: 100,
-		2: 87,
-		3: 49,
-		4: 30,
-		5: 20,
-		6: 14,
-	}
-
-	totalReplicas := 0
-	stores := map[roachpb.StoreID]int{}
-	for _, rng := range rangeInfo {
-		for _, repl := range rng.Descriptor.InternalReplicas {
-			stores[repl.StoreID]++
-			totalReplicas++
-		}
-	}
-	require.Equal(t, 300, totalReplicas)
-	require.Equal(t, expectedStoreReplicas, stores)
-	require.Equal(t, 6, len(stores))
-}
-func TestEvenDistribution(t *testing.T) {
-	rangeInfo := RangesInfoEvenDistribution(
-		6 /*stores*/, 100 /*ranges*/, 1 /*minKey*/, 10000 /*maxKey*/, 3 /*replicationFactor*/, 10000 /*rangeSize*/)
-	expectedStoreReplicas := map[roachpb.StoreID]int{
-		1: 50,
-		2: 50,
-		3: 50,
-		4: 50,
-		5: 50,
-		6: 50,
-	}
-	totalReplicas := 0
-	stores := map[roachpb.StoreID]int{}
-	for _, rng := range rangeInfo {
-		for _, repl := range rng.Descriptor.InternalReplicas {
-			stores[repl.StoreID]++
-			totalReplicas++
-		}
-	}
-	require.Equal(t, 300, totalReplicas)
-	require.Equal(t, expectedStoreReplicas, stores)
-	require.Equal(t, 6, len(stores))
-}
 
 // TestNewStateDeterministic asserts that the state returned from the new state
 // utility functions is deterministic.
@@ -834,3 +792,73 @@ func TestCapacityOverride(t *testing.T) {
 	// reason.
 	require.Equal(t, 500.0, capacity.WritesPerSecond)
 }
+
+// TestDistribution tests the distribution helper functions. The invariants
+// are that each distribution sums to 1.0 and that it matches the expected
+// output recorded in the echotest files.
+func TestDistribution(t *testing.T) {
+	defer leaktest.AfterTest(t)()
+
+	sum := func(values []float64) float64 {
+		total := 0.0
+		for _, v := range values {
+			total += v
+		}
+		return total
+	}
+
+	const seed = 42
+	randSource := rand.New(rand.NewSource(seed))
+
+	testCases := []struct {
+		numStores int
+		fns       []struct {
+			name string
+			fn   func() []float64
+		}
+	}{
+		{
+			numStores: 3,
+			fns: []struct {
+				name string
+				fn   func() []float64
+			}{
+				{name: "even", fn: func() []float64 { return evenDistribution(3) }},
+				{name: "skewed", fn: func() []float64 { return skewedDistribution(3) }},
+				{name: "exact", fn: func() []float64 { return exactDistribution([]int{1, 1, 1}) }},
+				{name: "weighted_rand", fn: func() []float64 {
+					return weightedRandDistribution(randSource, []float64{0.6, 0.2, 0.2})
+				}},
+				{name: "rand", fn: func() []float64 { return randDistribution(randSource, 3) }},
+			},
+		},
+		{
+			numStores: 10,
+			fns: []struct {
+				name string
+				fn   func() []float64
+			}{
+				{name: "even", fn: func() []float64 { return evenDistribution(10) }},
+				{name: "skewed", fn: func() []float64 { return skewedDistribution(10) }},
+				{name: "exact", fn: func() []float64 { return exactDistribution([]int{2, 2, 2, 2, 2, 1, 1, 1, 1, 1}) }},
+				{name: "weighted_rand", fn: func() []float64 {
+					return weightedRandDistribution(randSource, []float64{0.5, 0.1, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05})
+				}},
+				{name: "rand", fn: func() []float64 { return randDistribution(randSource, 10) }},
+			},
+		},
+	}
+	w := echotest.NewWalker(t, datapathutils.TestDataPath(t, "echotest"))
+	for _, testCase := range testCases {
+		t.Run(fmt.Sprintf("%d_stores", testCase.numStores), func(t *testing.T) {
+			t.Run("distribution", w.Run(t, fmt.Sprintf("%d_stores", testCase.numStores), func(t *testing.T) string {
+				var str strings.Builder
+				for _, fn := range testCase.fns {
+					dist := fn.fn()
+					str.WriteString(fmt.Sprintf("[%s: %.2f, sum: %.2f]\n", fn.name, dist, sum(dist)))
+				}
+				return str.String()
+			}))
+		})
+	}
+}
diff --git a/pkg/kv/kvserver/asim/state/testdata/echotest/10_stores b/pkg/kv/kvserver/asim/state/testdata/echotest/10_stores
@@ -0,0 +1,14 @@
+# This test tests the distribution helper functions. The invariants are that the
+# distributions sum to 1.0 and that the distribution is expected. The input is
+# even, skewed, rand, exact: store replica count (2,2,2,2,2,1,1,1,1,1), and
+# weighted_rand: store replica ratio
+# (0.5,0.1,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05). The output is the
+# distribution for each helper function, which represents the replica weight
+# distribution across 10 stores.
+echo
+----
+[even: [0.10 0.10 0.10 0.10 0.10 0.10 0.10 0.10 0.10 0.10], sum: 1.00]
+[skewed: [0.50 0.25 0.13 0.06 0.03 0.02 0.01 0.00 0.00 0.00], sum: 1.00]
+[exact: [0.13 0.13 0.13 0.13 0.13 0.07 0.07 0.07 0.07 0.07], sum: 1.00]
+[weighted_rand: [0.50 0.00 0.00 0.10 0.20 0.00 0.00 0.00 0.10 0.10], sum: 1.00]
+[rand: [0.11 0.09 0.02 0.04 0.20 0.15 0.20 0.04 0.04 0.11], sum: 1.00]
diff --git a/pkg/kv/kvserver/asim/state/testdata/echotest/3_stores b/pkg/kv/kvserver/asim/state/testdata/echotest/3_stores
@@ -0,0 +1,13 @@
+# This test tests the distribution helper functions. The invariants are that the
+# distributions sum to 1.0 and that the distribution is expected. The input is
+# even, skewed, rand, exact: store replica count (1,1,1), and weighted_rand:
+# store replica ratio (0.6,0.2,0.2). The output is the distribution for each
+# helper function, which represents the replica weight distribution across 3
+# stores.
+echo
+----
+[even: [0.33 0.33 0.33], sum: 1.00]
+[skewed: [0.57 0.29 0.14], sum: 1.00]
+[exact: [0.33 0.33 0.33], sum: 1.00]
+[weighted_rand: [0.70 0.20 0.10], sum: 1.00]
+[rand: [0.43 0.33 0.24], sum: 1.00]
diff --git a/pkg/testutils/echotest/echotest.go b/pkg/testutils/echotest/echotest.go
@@ -65,17 +65,6 @@ type Walker struct {
 // all files (i.e. the expected outputs, one per test case) are kept.
 //
 // Model usage:
-//
-//	   w := NewWalker(t, datapathutils.TestDataPath(t))
-//		 for _, test := range []struct{ name string }{
-//		    {name: "foo"},
-//		    {name: "bar"},
-//		 } {
-//		    t.Run(test.name, w.Run(t, test.name, func(t *testing.T, path string) {
-//		       Require(t, fmt.Sprintf("hello, %s", test.name), path)
-//		    }))
-//		 }
-//
 // w := NewWalker(t, datapathutils.TestDataPath(t))
 //
 //	for _, test := range []struct{ name string }{