6
6
package sql
7
7
8
8
import (
9
+ "bytes"
9
10
"context"
10
11
"math/rand"
12
+ "sort"
11
13
"time"
12
14
13
15
"github.com/cockroachdb/cockroach/pkg/keys"
@@ -19,31 +21,177 @@ import (
19
21
"github.com/cockroachdb/cockroach/pkg/sql/catalog"
20
22
"github.com/cockroachdb/cockroach/pkg/sql/catalog/descpb"
21
23
"github.com/cockroachdb/cockroach/pkg/sql/rowenc"
24
+ "github.com/cockroachdb/cockroach/pkg/sql/rowenc/keyside"
22
25
"github.com/cockroachdb/cockroach/pkg/sql/schemachanger/scexec"
23
26
"github.com/cockroachdb/cockroach/pkg/sql/sem/tree"
27
+ "github.com/cockroachdb/cockroach/pkg/sql/stats"
24
28
"github.com/cockroachdb/cockroach/pkg/util/encoding"
25
29
"github.com/cockroachdb/cockroach/pkg/util/log"
26
30
"github.com/cockroachdb/cockroach/pkg/util/rangedesc"
31
+ "github.com/cockroachdb/errors"
27
32
)
28
33
29
34
// indexSplitAndScatter pre-splits and scatters the key spans of an index
// about to be backfilled, implementing scexec.IndexSpanSplitter.
type indexSplitAndScatter struct {
	// db issues the AdminSplit/AdminScatter requests.
	db *kv.DB
	// codec encodes table/index prefixes for split keys.
	codec keys.SQLCodec
	// sv provides access to cluster settings (e.g. SplitAndScatterWithStats).
	sv *settings.Values
	// rangeIter iterates existing range descriptors to sample split points.
	rangeIter rangedesc.IteratorFactory
	// nodeDescs describes cluster nodes for scatter decisions.
	nodeDescs kvclient.NodeDescStore
	// statsCache supplies table statistics (histograms) used to derive
	// split points when no existing index can be sampled.
	statsCache *stats.TableStatisticsCache
	// testingKnobs holds test hooks invoked before splits are issued.
	testingKnobs *ExecutorTestingKnobs
}
36
43
44
+ var SplitAndScatterWithStats = settings .RegisterBoolSetting (
45
+ settings .ApplicationLevel ,
46
+ "schemachanger.backfiller.split_with_stats.enabled" ,
47
+ "when enabled the index backfiller will generate split and " +
48
+ "scatter points based table statistics" ,
49
+ false ,
50
+ )
51
+
37
52
// NewIndexSplitAndScatter creates a new scexec.IndexSpanSplitter implementation.
38
53
func NewIndexSplitAndScatter (execCfg * ExecutorConfig ) scexec.IndexSpanSplitter {
39
-
40
54
return & indexSplitAndScatter {
41
- db : execCfg .DB ,
42
- codec : execCfg .Codec ,
43
- sv : & execCfg .Settings .SV ,
44
- rangeIter : execCfg .RangeDescIteratorFactory ,
45
- nodeDescs : execCfg .NodeDescs ,
55
+ db : execCfg .DB ,
56
+ codec : execCfg .Codec ,
57
+ sv : & execCfg .Settings .SV ,
58
+ rangeIter : execCfg .RangeDescIteratorFactory ,
59
+ nodeDescs : execCfg .NodeDescs ,
60
+ statsCache : execCfg .TableStatsCache ,
61
+ testingKnobs : & execCfg .TestingKnobs ,
62
+ }
63
+ }
64
+
65
// getSplitPointsWithStats derives candidate split keys for the index being
// backfilled from the table's histogram statistics. It returns (nil, nil)
// when the SplitAndScatterWithStats setting is disabled, an error when no
// statistics exist, and otherwise a set of encoded keys (roughly nSplits of
// them, plus the index prefix itself) suitable for AdminSplit.
func (is *indexSplitAndScatter) getSplitPointsWithStats(
	ctx context.Context, table catalog.TableDescriptor, indexToBackfill catalog.Index, nSplits int,
) ([][]byte, error) {
	// Split and scatter with statistics is disabled.
	if !SplitAndScatterWithStats.Get(is.sv) {
		return nil, nil
	}
	// Fetch the current statistics for this table.
	tableStats, err := is.statsCache.GetTableStats(ctx, table, nil)
	if err != nil {
		return nil, err
	}
	// Nothing can be done since no stats exist.
	if len(tableStats) == 0 {
		return nil, errors.New("no stats exist for this table")
	}
	// Gather the latest stats for each column. Only single-column stats with
	// histograms are usable; the first matching stat per column wins
	// (presumably tableStats is ordered newest-first — TODO confirm).
	keyCols := indexToBackfill.CollectKeyColumnIDs()
	statsForColumns := make(map[descpb.ColumnID]*stats.TableStatistic)
	keyCols.ForEach(func(col descpb.ColumnID) {
		for _, stat := range tableStats {
			// Skip stats that:
			// 1) Do not contain this column.
			// 2) Consist of multiple columns.
			// 3) Have no histogram information.
			if stat.Histogram == nil || len(stat.ColumnIDs) != 1 || stat.ColumnIDs[0] != col {
				continue
			}
			statsForColumns[col] = stat
			break
		}
	})
	// Target number of rows per resulting range.
	// NOTE(review): nSplits == 0 would divide by zero here — callers
	// presumably guarantee nSplits > 0; confirm at the call site.
	rowsPerRange := tableStats[0].RowCount / uint64(nSplits)
	// Helper function that will append split points, and if necessary, downsample
	// them if they get too big.
	var splitPoints [][]byte
	appendAndShrinkSplitPoint := func(existing [][]byte, add []byte) [][]byte {
		maxSplitPoints := nSplits * 2
		if len(existing) < maxSplitPoints {
			return append(existing, add)
		}
		// Otherwise, we can sample these split points. Sort first so the
		// uniform sample below is taken over key order.
		sort.Slice(existing, func(i, j int) bool {
			return bytes.Compare(existing[i], existing[j]) < 0
		})
		// Next get this down to capacity again by taking a uniform sample of the
		// existing split points.
		newSplitPoints := make([][]byte, 0, nSplits+1)
		step := float64(len(existing)) / float64(nSplits)
		for i := 0; i < nSplits; i++ {
			newSplitPoints = append(newSplitPoints, existing[int(float64(i)*step)])
		}
		newSplitPoints = append(newSplitPoints, add)
		return newSplitPoints
	}
	// The following code generates split points for an index by iterating through
	// each column of the index. For each column, it uses histogram statistics to
	// identify points where the data can be divided into chunks of a target size
	// (`rowsPerRange`).
	//
	// For the first column, it creates initial split points. For each subsequent
	// column, it expands on the previously generated split points. It does this by
	// appending the new column's split values to each of the existing split points from
	// prior columns. This causes us to iterate combinatorially over all possible split points,
	// so the `appendAndShrinkSplitPoint` function is used to downsample and keep the total number
	// of points controlled.

	// Note: Sadly, only the primary key or columns in indexes will have
	// detailed information that we can use. All other columns will have
	// limited splits.
	for colIdx := 0; colIdx < indexToBackfill.NumKeyColumns(); colIdx++ {
		// Snapshot the points from the previous column before rebuilding.
		lastSplitPoints := append([][]byte{}, splitPoints...)
		splitPoints = splitPoints[:0]
		keyColID := indexToBackfill.GetKeyColumnID(colIdx)
		// Look up the stats and skip if they are missing.
		stat, ok := statsForColumns[keyColID]
		if !ok {
			break
		}
		numInBucket := uint64(0)
		for bucketIdx, bucket := range stat.Histogram {
			numInBucket += uint64(bucket.NumRange) + uint64(bucket.NumEq)
			// If we have hit the target rows, then emit a split point. Or
			// if we are on the last bucket, we should always emit one.
			if numInBucket >= rowsPerRange || bucketIdx == len(stat.Histogram)-1 {
				var prevKeys [][]byte
				// For the first column, we are going to start fresh with the base index prefix.
				if colIdx == 0 {
					prevKeys = [][]byte{is.codec.IndexPrefix(uint32(table.GetID()), uint32(indexToBackfill.GetID()))}
				} else {
					// For later columns we are going to start with the previous sets of splits.
					prevKeys = lastSplitPoints
				}
				// We don't know where later columns fall, so we will encode these
				// against all the previous split points (sadly, this will have an exponential
				// cost). Our limit on the number of split points will resample these if they
				// become excessive.
				for _, prevKey := range prevKeys {
					// Copy the base value before appending the next part of the key,
					// so we never mutate a shared backing array from lastSplitPoints.
					if colIdx > 0 {
						tempKey := make([]byte, len(prevKey), cap(prevKey))
						copy(tempKey, prevKey)
						prevKey = tempKey
					}
					// Encode the bucket's upper bound datum using the index
					// column's sort direction (+1 maps the catalog direction
					// enum onto encoding.Direction).
					newSplit, err := keyside.Encode(prevKey, bucket.UpperBound, encoding.Direction(indexToBackfill.GetKeyColumnDirection(colIdx)+1))
					if err != nil {
						return nil, err
					}
					splitPoints = appendAndShrinkSplitPoint(splitPoints, newSplit)
				}
				numInBucket = 0
				continue
			}
		}
		// Stop once enough partitions have been created. Or if no partitions exist,
		// then there is insufficient data for an educated guess. As we process later
		// columns, we end up creating all possible permutations of the previous split
		// points we selected, which means the statistical likelihood of a valid split
		// point getting selected only gets lower.
		if len(splitPoints) >= nSplits || len(splitPoints) == 0 {
			break
		}
	}
	// Always emit a split point at the start of the index span if
	// we generated any split points above
	if len(splitPoints) > 0 {
		splitPoints = append(splitPoints, is.codec.IndexPrefix(uint32(table.GetID()), uint32(indexToBackfill.GetID())))
		log.Infof(ctx, "generated %d split points from statistics for tableId=%d index=%d", len(splitPoints), table.GetID(), indexToBackfill.GetID())
	}
	return splitPoints, nil
}
48
196
49
197
// MaybeSplitIndexSpans implements the scexec.IndexSpanSplitter interface.
@@ -121,6 +269,13 @@ func (is *indexSplitAndScatter) MaybeSplitIndexSpans(
121
269
splitPoints = append (splitPoints , newStartKey )
122
270
}
123
271
272
+ if len (splitPoints ) == 0 {
273
+ splitPoints , err = is .getSplitPointsWithStats (ctx , table , indexToBackfill , nSplits )
274
+ if err != nil {
275
+ log .Warningf (ctx , "unable to get split points for stats for tableID=%d index=%d due to %v" , tableID , indexToBackfill .GetID (), err )
276
+ }
277
+ }
278
+
124
279
if len (splitPoints ) == 0 {
125
280
// If we can't sample splits from another index, just add one split.
126
281
log .Infof (ctx , "making a single split point in tableId=%d index=%d" , tableID , indexToBackfill .GetID ())
@@ -130,6 +285,10 @@ func (is *indexSplitAndScatter) MaybeSplitIndexSpans(
130
285
if err != nil {
131
286
return err
132
287
}
288
+ // Execute the testing knob before adding a split.
289
+ if is .testingKnobs .BeforeIndexSplitAndScatter != nil {
290
+ is .testingKnobs .BeforeIndexSplitAndScatter ([][]byte {splitKey })
291
+ }
133
292
// We split without scattering here because there is only one split point,
134
293
// so scattering wouldn't spread that much load.
135
294
return is .db .AdminSplit (ctx , splitKey , expirationTime )
@@ -143,6 +302,10 @@ func (is *indexSplitAndScatter) MaybeSplitIndexSpans(
143
302
if step < 1 {
144
303
step = 1
145
304
}
305
+ // Execute the testing knob before the split and scatter.
306
+ if is .testingKnobs .BeforeIndexSplitAndScatter != nil {
307
+ is .testingKnobs .BeforeIndexSplitAndScatter (splitPoints )
308
+ }
146
309
for i := 0 ; i < nSplits ; i ++ {
147
310
// Evenly space out the ranges that we select from the ranges that are
148
311
// returned.
0 commit comments