Skip to content

Commit da95cf7

Browse files
craig[bot] and yuzefovich
committed
Merge #145481
145481: stats: fix "non-nullable col with no value" for multiple column families r=yuzefovich a=yuzefovich This commit fixes an edge case in the timestamp-advancing mechanism of the inconsistent scans that are used by the table statistics collection. In particular, previously on a table with multiple column families it was possible for the scan to stop in the middle of a SQL row, and if that row happens to exist at the old timestamp but to no longer exist at the new timestamp, we'd hit an internal error. The bug is now fixed by setting WholeRowsOfSize option of the BatchRequest if we're scanning a table with multiple column families which guarantees that the scan at each timestamp will stop only at the ends of SQL rows. To reproduce this behavior I extended an existing test that stresses the timestamp-advancing mechanism. Fixes: #145480. Release note (bug fix): Previously, on a table with multiple column families CockroachDB could encounter "Non-nullable column "‹×›:‹×›" with no value" error during table statistics collection in rare cases. The bug has been present since v19.2 and is now fixed. Co-authored-by: Yahor Yuzefovich <[email protected]>
2 parents 7eba613 + f84e3f7 commit da95cf7

File tree

2 files changed

+72
-26
lines changed

2 files changed

+72
-26
lines changed

pkg/sql/create_stats_test.go

Lines changed: 64 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -38,14 +38,20 @@ func TestStatsWithLowTTL(t *testing.T) {
3838
// The test depends on reasonable timings, so don't run under race.
3939
skip.UnderRace(t)
4040

41+
rng, _ := randutil.NewTestRand()
4142
var blockTableReader atomic.Bool
4243
blockCh := make(chan struct{})
4344

4445
s, db, _ := serverutils.StartServer(t, base.TestServerArgs{
4546
Knobs: base.TestingKnobs{
4647
DistSQL: &execinfra.TestingKnobs{
47-
// Set the batch size small to avoid having to use a large number of rows.
48-
TableReaderBatchBytesLimit: 100,
48+
// Set the batch size small to avoid having to use a large
49+
// number of rows.
50+
//
51+
// We use a random bytes limit so that the scans have a chance
52+
// to stop at different points within the SQL row (in case of
53+
// multiple column families).
54+
TableReaderBatchBytesLimit: 50 + int64(rng.Intn(100)),
4955
TableReaderStartScanCb: func() {
5056
if blockTableReader.Load() {
5157
<-blockCh
@@ -57,51 +63,82 @@ func TestStatsWithLowTTL(t *testing.T) {
5763
defer s.Stopper().Stop(context.Background())
5864

5965
r := sqlutils.MakeSQLRunner(db)
60-
r.Exec(t, `
61-
SET CLUSTER SETTING sql.stats.automatic_collection.enabled = false;
62-
`)
63-
r.Exec(t, `
64-
CREATE DATABASE test;
65-
USE test;
66-
CREATE TABLE t (k INT PRIMARY KEY, a INT, b INT);
67-
`)
68-
const numRows = 20
69-
r.Exec(t, `INSERT INTO t SELECT k, 2*k, 3*k FROM generate_series(0, $1) AS g(k)`, numRows-1)
66+
r.Exec(t, `SET CLUSTER SETTING sql.stats.automatic_collection.enabled = false;`)
67+
// Sometimes use single column family, sometimes use multiple.
68+
if rng.Intn(2) == 0 {
69+
r.Exec(t, `CREATE TABLE t (k INT PRIMARY KEY, a INT NOT NULL, b INT NOT NULL, FAMILY (k, a, b));`)
70+
} else {
71+
r.Exec(t, `CREATE TABLE t (k INT PRIMARY KEY, a INT NOT NULL, b INT NOT NULL, FAMILY (k), FAMILY (a), FAMILY (b));`)
72+
}
73+
const initialNumRows, maxNumRows = 20, 100
74+
r.Exec(t, `INSERT INTO t SELECT k, 2*k, 3*k FROM generate_series(0, $1) AS g(k)`, initialNumRows-1)
7075

71-
// Start a goroutine that keeps updating rows in the table and issues
72-
// GCRequests simulating a 2 second TTL. While this is running, reading at a
73-
// timestamp older than 2 seconds will likely error out.
76+
// Start a goroutine that keeps modifying rows (updating, deleting,
77+
// inserting new ones) in the table and issues GCRequests simulating a 2
78+
// second TTL. While this is running, reading at a timestamp older than 2
79+
// seconds will likely error out.
7480
var goroutineErr error
7581
var wg sync.WaitGroup
7682
wg.Add(1)
7783
stopCh := make(chan struct{})
84+
// onlyUpdates determines whether only UPDATE stmts are issued in the
85+
// goroutine - this behavior is used in the first part of the test where we
86+
// expect the stats collection to fail (if we allow INSERT and DELETE stmts,
87+
// we might never hit the GC threshold).
88+
var onlyUpdates atomic.Bool
89+
onlyUpdates.Store(true)
7890

7991
go func() {
8092
defer wg.Done()
8193

8294
// Open a separate connection to the database.
8395
db2 := s.SQLConn(t)
8496

85-
_, err := db2.Exec("USE test")
86-
if err != nil {
87-
goroutineErr = err
88-
return
89-
}
90-
rng, _ := randutil.NewTestRand()
97+
nextPK := initialNumRows
9198
for {
9299
select {
93100
case <-stopCh:
94101
return
95102
default:
96103
}
97-
k := rng.Intn(numRows)
98-
if _, err := db2.Exec(`UPDATE t SET a=a+1, b=b+2 WHERE k=$1`, k); err != nil {
99-
goroutineErr = err
100-
return
104+
switch rng.Intn(4) {
105+
case 0, 1:
106+
// In 50% cases try to update an existing row (it's ok if the
107+
// row doesn't exist).
108+
k := rng.Intn(nextPK)
109+
if _, err := db2.Exec(`UPDATE t SET a=a+1, b=b+2 WHERE k=$1`, k); err != nil {
110+
goroutineErr = err
111+
return
112+
}
113+
case 2:
114+
if onlyUpdates.Load() {
115+
continue
116+
}
117+
// In 25% cases try to delete a row (it's ok if the row doesn't
118+
// exist).
119+
k := rng.Intn(nextPK)
120+
if _, err := db2.Exec(`DELETE FROM t WHERE k=$1`, k); err != nil {
121+
goroutineErr = err
122+
return
123+
}
124+
case 3:
125+
if onlyUpdates.Load() {
126+
continue
127+
}
128+
// In 25% cases insert a new row, but don't insert too many rows
129+
// to allow for the stats collection to complete.
130+
if nextPK == maxNumRows {
131+
continue
132+
}
133+
if _, err := db2.Exec(`INSERT INTO t SELECT $1, 2*$1, 3*$1`, nextPK); err != nil {
134+
goroutineErr = err
135+
return
136+
}
137+
nextPK++
101138
}
102139
// Force a table GC of values older than 2 seconds.
103140
if err := s.ForceTableGC(
104-
context.Background(), "test", "t", s.Clock().Now().Add(-int64(2*time.Second), 0),
141+
context.Background(), "defaultdb", "t", s.Clock().Now().Add(-int64(2*time.Second), 0),
105142
); err != nil {
106143
goroutineErr = err
107144
return
@@ -139,6 +176,7 @@ SET CLUSTER SETTING sql.stats.automatic_collection.enabled = false;
139176

140177
// Set up timestamp advance to keep timestamps no older than 1s.
141178
r.Exec(t, `SET CLUSTER SETTING sql.stats.max_timestamp_age = '1s'`)
179+
onlyUpdates.Store(false)
142180

143181
// Block start of the inconsistent scan for 2s so that the initial timestamp
144182
// becomes way too old.

pkg/sql/row/fetcher.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -702,6 +702,14 @@ func (rf *Fetcher) StartInconsistentScan(
702702
}
703703
}
704704

705+
if rf.args.Spec.MaxKeysPerRow > 1 {
706+
// If the table has multiple column families, we need to ensure that
707+
// the scan stops at the end of the full SQL row - otherwise, the
708+
// row might be deleted between two timestamps leading to incorrect
709+
// results (or internal errors).
710+
ba.Header.WholeRowsOfSize = int32(rf.args.Spec.MaxKeysPerRow)
711+
}
712+
705713
log.VEventf(ctx, 2, "inconsistent scan: sending a batch with %d requests", len(ba.Requests))
706714
res, err := txn.Send(ctx, ba)
707715
if err != nil {

0 commit comments

Comments (0)