Commit 547553b
sql: add support for generating split points from table statistics
Previously, we only injected split points before a backfill began, either at the start of the index span or by copying the splits from an index with matching key columns. This meant that split points were primarily added only when creating a duplicate index or adding a new column. The bulk adder also has logic to create splits during `CREATE INDEX`, but its sampling method can lead to an insufficient number or poor placement of split points if the data samples from different nodes overlap.

To address this, this patch introduces the ability to create split points from table statistics, when available. This improves the split and scatter behavior for new indexes, especially those on columns that have existing statistics or a limited number of unique values.

Fixes: #148288

Release note (bug fix): Improved split and scatter behavior for CREATE INDEX when statistics are available for key columns. This can be enabled via the cluster setting `schemachanger.backfiller.split_with_stats.enabled`.
1 parent 69e0168 commit 547553b
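At its core, the new code path walks a per-column histogram and emits the bucket's upper bound as a split key whenever the cumulative row count crosses a per-range target of `RowCount / nSplits`. A minimal, self-contained sketch of that loop, using simplified bucket and key types rather than the actual `stats.TableStatistic` machinery:

```go
package main

import "fmt"

// bucket is a simplified histogram bucket: a row count plus an upper bound,
// standing in for the NumRange/NumEq/UpperBound fields used by the patch.
type bucket struct {
	numRows    uint64
	upperBound string
}

// splitPointsFromHistogram emits one split point roughly every rowsPerRange
// rows, and always emits one for the final bucket.
func splitPointsFromHistogram(hist []bucket, rowCount, nSplits uint64) []string {
	rowsPerRange := rowCount / nSplits
	var splits []string
	var numInBucket uint64
	for i, b := range hist {
		numInBucket += b.numRows
		if numInBucket >= rowsPerRange || i == len(hist)-1 {
			splits = append(splits, b.upperBound)
			numInBucket = 0
		}
	}
	return splits
}

func main() {
	hist := []bucket{{2500, "d"}, {2500, "j"}, {2500, "q"}, {2500, "z"}}
	// 10,000 rows split four ways yields a split at every bucket boundary.
	fmt.Println(splitPointsFromHistogram(hist, 10000, 4)) // [d j q z]
}
```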

File tree

5 files changed: +265 −11 lines

pkg/cmd/roachtest/tests/schemachange.go

Lines changed: 4 additions & 0 deletions
@@ -391,6 +391,10 @@ func makeSchemaChangeBulkIngestTest(
 	db := c.Conn(ctx, t.L(), 1)
 	defer db.Close()
 
+	t.L().Printf("Computing table statistics manually")
+	if _, err := db.Exec("CREATE STATISTICS stats from bulkingest.bulkingest"); err != nil {
+		t.Fatal(err)
+	}
 	if !c.IsLocal() {
 		// Wait for the load generator to run for a few minutes before creating the index.
 		sleepInterval := time.Minute * 5

pkg/sql/BUILD.bazel

Lines changed: 2 additions & 0 deletions
@@ -484,6 +484,7 @@ go_library(
         "//pkg/sql/row",
         "//pkg/sql/rowcontainer",
         "//pkg/sql/rowenc",
+        "//pkg/sql/rowenc/keyside",
         "//pkg/sql/rowexec",
         "//pkg/sql/rowinfra",
         "//pkg/sql/scheduledlogging",
@@ -695,6 +696,7 @@ go_test(
         "grant_revoke_test.go",
         "grant_role_test.go",
         "index_mutation_test.go",
+        "index_split_scatter_test.go",
         "indexbackfiller_test.go",
         "instrumentation_test.go",
         "internal_test.go",

pkg/sql/exec_util.go

Lines changed: 4 additions & 0 deletions
@@ -1896,6 +1896,10 @@ type ExecutorTestingKnobs struct {
 	// AfterArbiterRead, if set, will be called after each row read from an arbiter index
 	// for an UPSERT or INSERT.
 	AfterArbiterRead func()
+
+	// BeforeIndexSplitAndScatter, if set, is invoked before the split and
+	// scatter of an index occurs.
+	BeforeIndexSplitAndScatter func(splitPoints [][]byte)
 }
 
 // PGWireTestingKnobs contains knobs for the pgwire module.

pkg/sql/index_split_scatter.go

Lines changed: 174 additions & 11 deletions
@@ -6,8 +6,10 @@
 package sql
 
 import (
+	"bytes"
 	"context"
 	"math/rand"
+	"sort"
 	"time"
 
 	"github.com/cockroachdb/cockroach/pkg/keys"
@@ -19,31 +21,177 @@ import (
 	"github.com/cockroachdb/cockroach/pkg/sql/catalog"
 	"github.com/cockroachdb/cockroach/pkg/sql/catalog/descpb"
 	"github.com/cockroachdb/cockroach/pkg/sql/rowenc"
+	"github.com/cockroachdb/cockroach/pkg/sql/rowenc/keyside"
 	"github.com/cockroachdb/cockroach/pkg/sql/schemachanger/scexec"
 	"github.com/cockroachdb/cockroach/pkg/sql/sem/tree"
+	"github.com/cockroachdb/cockroach/pkg/sql/stats"
 	"github.com/cockroachdb/cockroach/pkg/util/encoding"
 	"github.com/cockroachdb/cockroach/pkg/util/log"
 	"github.com/cockroachdb/cockroach/pkg/util/rangedesc"
+	"github.com/cockroachdb/errors"
 )
 
 type indexSplitAndScatter struct {
-	db        *kv.DB
-	codec     keys.SQLCodec
-	sv        *settings.Values
-	rangeIter rangedesc.IteratorFactory
-	nodeDescs kvclient.NodeDescStore
+	db           *kv.DB
+	codec        keys.SQLCodec
+	sv           *settings.Values
+	rangeIter    rangedesc.IteratorFactory
+	nodeDescs    kvclient.NodeDescStore
+	statsCache   *stats.TableStatisticsCache
+	testingKnobs *ExecutorTestingKnobs
 }
 
+var SplitAndScatterWithStats = settings.RegisterBoolSetting(
+	settings.ApplicationLevel,
+	"schemachanger.backfiller.split_with_stats.enabled",
+	"when enabled, the index backfiller will generate split and "+
+		"scatter points based on table statistics",
+	false,
+)
+
 // NewIndexSplitAndScatter creates a new scexec.IndexSpanSplitter implementation.
 func NewIndexSplitAndScatter(execCfg *ExecutorConfig) scexec.IndexSpanSplitter {
-
 	return &indexSplitAndScatter{
-		db:        execCfg.DB,
-		codec:     execCfg.Codec,
-		sv:        &execCfg.Settings.SV,
-		rangeIter: execCfg.RangeDescIteratorFactory,
-		nodeDescs: execCfg.NodeDescs,
+		db:           execCfg.DB,
+		codec:        execCfg.Codec,
+		sv:           &execCfg.Settings.SV,
+		rangeIter:    execCfg.RangeDescIteratorFactory,
+		nodeDescs:    execCfg.NodeDescs,
+		statsCache:   execCfg.TableStatsCache,
+		testingKnobs: &execCfg.TestingKnobs,
+	}
+}
+
+func (is *indexSplitAndScatter) getSplitPointsWithStats(
+	ctx context.Context, table catalog.TableDescriptor, indexToBackfill catalog.Index, nSplits int,
+) ([][]byte, error) {
+	// Split and scatter with statistics is disabled.
+	if !SplitAndScatterWithStats.Get(is.sv) {
+		return nil, nil
+	}
+	// Fetch the current statistics for this table.
+	tableStats, err := is.statsCache.GetTableStats(ctx, table, nil)
+	if err != nil {
+		return nil, err
+	}
+	// Nothing can be done, since no stats exist.
+	if len(tableStats) == 0 {
+		return nil, errors.New("no stats exist for this table")
+	}
+	// Gather the latest stats for each column.
+	keyCols := indexToBackfill.CollectKeyColumnIDs()
+	statsForColumns := make(map[descpb.ColumnID]*stats.TableStatistic)
+	keyCols.ForEach(func(col descpb.ColumnID) {
+		for _, stat := range tableStats {
+			// Skip stats that:
+			// 1) Do not contain this column.
+			// 2) Consist of multiple columns.
+			// 3) Have no histogram information.
+			if stat.Histogram == nil || len(stat.ColumnIDs) != 1 || stat.ColumnIDs[0] != col {
+				continue
+			}
+			statsForColumns[col] = stat
+			break
+		}
+	})
+	rowsPerRange := tableStats[0].RowCount / uint64(nSplits)
+	// Helper function that appends split points and, if necessary,
+	// downsamples them when they become too numerous.
+	var splitPoints [][]byte
+	appendAndShrinkSplitPoint := func(existing [][]byte, add []byte) [][]byte {
+		maxSplitPoints := nSplits * 2
+		if len(existing) < maxSplitPoints {
+			return append(existing, add)
+		}
+		// Otherwise, we can sample these split points.
+		sort.Slice(existing, func(i, j int) bool {
+			return bytes.Compare(existing[i], existing[j]) < 0
+		})
+		// Next, get this down to capacity again by taking a uniform sample
+		// of the existing split points.
+		newSplitPoints := make([][]byte, 0, nSplits+1)
+		step := float64(len(existing)) / float64(nSplits)
+		for i := 0; i < nSplits; i++ {
+			newSplitPoints = append(newSplitPoints, existing[int(float64(i)*step)])
+		}
+		newSplitPoints = append(newSplitPoints, add)
+		return newSplitPoints
 	}
+	// The following code generates split points for an index by iterating
+	// through each column of the index. For each column, it uses histogram
+	// statistics to identify points where the data can be divided into
+	// chunks of a target size (`rowsPerRange`).
+	//
+	// For the first column, it creates initial split points. For each
+	// subsequent column, it expands on the previously generated split points
+	// by appending the new column's split values to each of the existing
+	// split points from prior columns. This iterates combinatorially over
+	// all possible split points, so the `appendAndShrinkSplitPoint` function
+	// is used to downsample and keep the total number of points bounded.
+
+	// Note: Sadly, only the primary key or columns in indexes will have
+	// detailed information that we can use. All other columns will have
+	// limited splits.
+	for colIdx := 0; colIdx < indexToBackfill.NumKeyColumns(); colIdx++ {
+		lastSplitPoints := append([][]byte{}, splitPoints...)
+		splitPoints = splitPoints[:0]
+		keyColID := indexToBackfill.GetKeyColumnID(colIdx)
+		// Look up the stats, and stop if they are missing.
+		stat, ok := statsForColumns[keyColID]
+		if !ok {
+			break
+		}
+		numInBucket := uint64(0)
+		for bucketIdx, bucket := range stat.Histogram {
+			numInBucket += uint64(bucket.NumRange) + uint64(bucket.NumEq)
+			// If we have hit the target row count, emit a split point. If we
+			// are on the last bucket, always emit one.
+			if numInBucket >= rowsPerRange || bucketIdx == len(stat.Histogram)-1 {
+				var prevKeys [][]byte
+				// For the first column, start fresh with the base index prefix.
+				if colIdx == 0 {
+					prevKeys = [][]byte{is.codec.IndexPrefix(uint32(table.GetID()), uint32(indexToBackfill.GetID()))}
+				} else {
+					// For later columns, start with the previous set of splits.
+					prevKeys = lastSplitPoints
+				}
+				// We don't know where later columns fall, so we encode these
+				// against all the previous split points (sadly, this has an
+				// exponential cost). Our limit on the number of split points
+				// will resample them if they become excessive.
+				for _, prevKey := range prevKeys {
+					// Copy the base value before appending the next part of the key.
+					if colIdx > 0 {
+						tempKey := make([]byte, len(prevKey), cap(prevKey))
+						copy(tempKey, prevKey)
+						prevKey = tempKey
+					}
+					newSplit, err := keyside.Encode(prevKey, bucket.UpperBound, encoding.Direction(indexToBackfill.GetKeyColumnDirection(colIdx)+1))
+					if err != nil {
+						return nil, err
+					}
+					splitPoints = appendAndShrinkSplitPoint(splitPoints, newSplit)
+				}
+				numInBucket = 0
+				continue
+			}
+		}
+		// Stop once enough partitions have been created. If no partitions
+		// exist, there is insufficient data for an educated guess. As we
+		// process later columns, we end up creating all possible permutations
+		// of the previously selected split points, which means the
+		// statistical likelihood of a valid split point getting selected
+		// only gets lower.
+		if len(splitPoints) >= nSplits || len(splitPoints) == 0 {
+			break
+		}
+	}
+	// Always emit a split point at the start of the index span if we
+	// generated any split points above.
+	if len(splitPoints) > 0 {
+		splitPoints = append(splitPoints, is.codec.IndexPrefix(uint32(table.GetID()), uint32(indexToBackfill.GetID())))
+		log.Infof(ctx, "generated %d split points from statistics for tableId=%d index=%d", len(splitPoints), table.GetID(), indexToBackfill.GetID())
+	}
+	return splitPoints, nil
 }
 
 // MaybeSplitIndexSpans implements the scexec.IndexSpanSplitter interface.
@@ -121,6 +269,13 @@ func (is *indexSplitAndScatter) MaybeSplitIndexSpans(
 		splitPoints = append(splitPoints, newStartKey)
 	}
 
+	if len(splitPoints) == 0 {
+		splitPoints, err = is.getSplitPointsWithStats(ctx, table, indexToBackfill, nSplits)
+		if err != nil {
+			log.Warningf(ctx, "unable to get split points from stats for tableID=%d index=%d due to %v", tableID, indexToBackfill.GetID(), err)
+		}
+	}
+
 	if len(splitPoints) == 0 {
 		// If we can't sample splits from another index, just add one split.
 		log.Infof(ctx, "making a single split point in tableId=%d index=%d", tableID, indexToBackfill.GetID())
@@ -130,6 +285,10 @@ func (is *indexSplitAndScatter) MaybeSplitIndexSpans(
 	if err != nil {
 		return err
 	}
+	// Execute the testing knob before adding a split.
+	if is.testingKnobs.BeforeIndexSplitAndScatter != nil {
+		is.testingKnobs.BeforeIndexSplitAndScatter([][]byte{splitKey})
+	}
 	// We split without scattering here because there is only one split point,
 	// so scattering wouldn't spread that much load.
 	return is.db.AdminSplit(ctx, splitKey, expirationTime)
@@ -143,6 +302,10 @@ func (is *indexSplitAndScatter) MaybeSplitIndexSpans(
 	if step < 1 {
 		step = 1
 	}
+	// Execute the testing knob before the split and scatter.
+	if is.testingKnobs.BeforeIndexSplitAndScatter != nil {
+		is.testingKnobs.BeforeIndexSplitAndScatter(splitPoints)
+	}
 	for i := 0; i < nSplits; i++ {
 		// Evenly space out the ranges that we select from the ranges that are
 		// returned.
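The `appendAndShrinkSplitPoint` helper above caps the candidate set at `nSplits * 2`; once that cap is reached, it sorts the candidates and keeps a uniform sample of `nSplits` of them. A standalone sketch of just that resampling step, keeping the byte-slice key representation (the function name here is hypothetical):

```go
package main

import (
	"bytes"
	"fmt"
	"sort"
)

// resample sorts the accumulated split points and keeps a uniform sample of
// nSplits of them, mirroring the shrink path of appendAndShrinkSplitPoint.
func resample(points [][]byte, nSplits int) [][]byte {
	sort.Slice(points, func(i, j int) bool {
		return bytes.Compare(points[i], points[j]) < 0
	})
	out := make([][]byte, 0, nSplits)
	step := float64(len(points)) / float64(nSplits)
	for i := 0; i < nSplits; i++ {
		out = append(out, points[int(float64(i)*step)])
	}
	return out
}

func main() {
	// Build eight single-byte keys in reverse order to show the sort matters.
	var points [][]byte
	for c := byte('h'); c >= 'a'; c-- {
		points = append(points, []byte{c})
	}
	// Eight candidates downsampled to four: after sorting, every other
	// point survives.
	for _, p := range resample(points, 4) {
		fmt.Printf("%s ", p) // a c e g
	}
	fmt.Println()
}
```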
pkg/sql/index_split_scatter_test.go

Lines changed: 81 additions & 0 deletions
@@ -0,0 +1,81 @@
+// Copyright 2025 The Cockroach Authors.
+//
+// Use of this software is governed by the CockroachDB Software License
+// included in the /LICENSE file.
+
+package sql_test
+
+import (
+	"context"
+	"sync/atomic"
+	"testing"
+
+	"github.com/cockroachdb/cockroach/pkg/base"
+	"github.com/cockroachdb/cockroach/pkg/sql"
+	"github.com/cockroachdb/cockroach/pkg/testutils"
+	"github.com/cockroachdb/cockroach/pkg/testutils/serverutils"
+	"github.com/cockroachdb/cockroach/pkg/testutils/skip"
+	"github.com/cockroachdb/cockroach/pkg/testutils/sqlutils"
+	"github.com/cockroachdb/cockroach/pkg/util/leaktest"
+	"github.com/cockroachdb/cockroach/pkg/util/log"
+	"github.com/stretchr/testify/require"
+)
+
+// TestIndexSplitAndScatterWithStats tests the creation of indexes on tables
+// with statistics, where the splits will be generated using statistics on
+// the table.
+func TestIndexSplitAndScatterWithStats(t *testing.T) {
+	defer leaktest.AfterTest(t)()
+	defer log.Scope(t).Close(t)
+	// This test can be fairly slow and time out under race / duress.
+	skip.UnderDuress(t)
+
+	testutils.RunTrueAndFalse(t, "StatsCreated", func(t *testing.T, statsExist bool) {
+		ctx := context.Background()
+		var splitHookEnabled atomic.Bool
+		var observedSplitPoints atomic.Int64
+		const numNodes = 3
+		cluster := serverutils.StartCluster(t, numNodes, base.TestClusterArgs{
+			ServerArgs: base.TestServerArgs{
+				Knobs: base.TestingKnobs{
+					SQLExecutor: &sql.ExecutorTestingKnobs{
+						BeforeIndexSplitAndScatter: func(splitPoints [][]byte) {
+							if !splitHookEnabled.Load() {
+								return
+							}
+							observedSplitPoints.Swap(int64(len(splitPoints)))
+						},
+					},
+				},
+			},
+		})
+		defer cluster.Stopper().Stop(ctx)
+		runner := sqlutils.MakeSQLRunner(cluster.ServerConn(0))
+		// Enable split and scatter with stats.
+		runner.Exec(t, "SET CLUSTER SETTING schemachanger.backfiller.split_with_stats.enabled = true")
+		// Disable automatic statistics.
+		runner.Exec(t, "SET CLUSTER SETTING sql.stats.automatic_collection.enabled = false")
+		// Create and populate the table.
+		runner.Exec(t, "CREATE TABLE multi_column_split (b bool, n uuid PRIMARY KEY)")
+		runner.Exec(t, "INSERT INTO multi_column_split (SELECT true, uuid_generate_v1() FROM generate_series(1, 5000))")
+		runner.Exec(t, "INSERT INTO multi_column_split (SELECT false, uuid_generate_v1() FROM generate_series(1, 5000))")
+		// Generate statistics for the table.
+		if statsExist {
+			runner.Exec(t, "CREATE STATISTICS st FROM multi_column_split")
+		}
+		// Next, create an index on the table.
+		splitHookEnabled.Store(true)
+		observedSplitPoints.Store(0)
+		runner.Exec(t, "CREATE INDEX ON multi_column_split (b, n)")
+		// Assert that we generated the target number of split points
+		// automatically.
+		if !statsExist {
+			require.Equal(t, int64(1), observedSplitPoints.Load())
+		} else {
+			expectedCount := sql.PreservedSplitCountMultiple.Get(&cluster.Server(0).ClusterSettings().SV) * numNodes
+			require.Greaterf(t, observedSplitPoints.Load(), expectedCount,
+				"expected more than %d split points, got %d", expectedCount, observedSplitPoints.Load())
+		}
+		splitHookEnabled.Swap(false)
+	})
+}
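With stats present, the test expects well over `PreservedSplitCountMultiple * numNodes` split points because of the combinatorial expansion across key columns: each split value for `b` becomes a prefix, and every split value derived from `n`'s histogram is appended to every such prefix. A toy sketch of that cross-product growth, using plain strings in place of encoded keys (all names here are illustrative only):

```go
package main

import "fmt"

// expand appends each new column value to every existing prefix, mirroring
// how the backfiller combines per-column split values across key columns.
func expand(prefixes, colValues []string) []string {
	out := make([]string, 0, len(prefixes)*len(colValues))
	for _, p := range prefixes {
		for _, v := range colValues {
			out = append(out, p+"/"+v)
		}
	}
	return out
}

func main() {
	// Column b contributes two split values; column n contributes four.
	splits := expand([]string{"idx"}, []string{"false", "true"})
	splits = expand(splits, []string{"u1", "u2", "u3", "u4"})
	fmt.Println(len(splits), splits) // 8 combinations: 2 * 4
}
```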
