Skip to content

Commit 3e9e83d

Browse files
committed
row: harden inconsistent scan machinery
Table statistics collection uses inconsistent scan machinery which means that we paginate over the table via separate transactions with constantly advancing read timestamps. In particular, once we determine that the current read timestamp is at least 5 minutes old (controlled via `sql.stats.max_timestamp_age` cluster setting), we commit the current txn and open a new one in which we advance the timestamp by the amount of time that has passed since the start of the previous one (i.e. by the duration of the previous txn). This process requires an "initial timestamp" for the very first txn that we use. That value comes from evaluating AS OF SYSTEM TIME clause of CREATE STATISTICS stmt which happens when the job record is created. Previously, if there was a long delay between the job record creation and the job being actually executed, we would return "cannot specify timestamp older than ..." error before creating the first txn. It's unclear to me why this check was put in place since it's not really necessary - we could simply remove it, which would make the very first txn of the inconsistent scan to be committed right away, and things would proceed easily. This commit fixes this problem (by removing the check) but also improves things a bit further by explicitly advancing the "initial timestamp" before the first txn is open. If the initial timestamp is too old, it is advanced to make its age to be 1/10 of the max timestamp age. An example of the txn cycle: ``` current time: 100 initial timestamp: 50 max timestamp age: 10 time 100: advance initial timestamp to 99 100: start scan, timestamp=99 100-109: continue scanning at timestamp=99 109: bump timestamp to 108 109-118: continue scanning at timestamp=108 118: bump timestamp to 117 118-127: continue scanning at timestamp=117 ``` Release note (bug fix): CockroachDB could previously encounter "cannot specify timestamp older than ..." error during the table statistics collection in some cases (like when the cluster is overloaded), and this is now fixed. The bug has been present since 19.1 version.
1 parent 2b29b9c commit 3e9e83d

File tree

6 files changed

+49
-10
lines changed

6 files changed

+49
-10
lines changed

pkg/sql/create_stats_test.go

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ import (
1010
"strconv"
1111
"strings"
1212
"sync"
13+
"sync/atomic"
1314
"testing"
1415
"time"
1516

@@ -37,11 +38,19 @@ func TestStatsWithLowTTL(t *testing.T) {
3738
// The test depends on reasonable timings, so don't run under race.
3839
skip.UnderRace(t)
3940

41+
var blockTableReader atomic.Bool
42+
blockCh := make(chan struct{})
43+
4044
s, db, _ := serverutils.StartServer(t, base.TestServerArgs{
4145
Knobs: base.TestingKnobs{
4246
DistSQL: &execinfra.TestingKnobs{
4347
// Set the batch size small to avoid having to use a large number of rows.
4448
TableReaderBatchBytesLimit: 100,
49+
TableReaderStartScanCb: func() {
50+
if blockTableReader.Load() {
51+
<-blockCh
52+
}
53+
},
4554
},
4655
},
4756
})
@@ -131,6 +140,15 @@ SET CLUSTER SETTING sql.stats.automatic_collection.enabled = false;
131140
// Set up timestamp advance to keep timestamps no older than 1s.
132141
r.Exec(t, `SET CLUSTER SETTING sql.stats.max_timestamp_age = '1s'`)
133142

143+
// Block start of the inconsistent scan for 2s so that the initial timestamp
144+
// becomes way too old.
145+
blockTableReader.Store(true)
146+
go func() {
147+
defer blockTableReader.Store(false)
148+
time.Sleep(2 * time.Second)
149+
close(blockCh)
150+
}()
151+
134152
_, err := db.Exec(`CREATE STATISTICS foo FROM t AS OF SYSTEM TIME '-0.1s'`)
135153
if err != nil {
136154
t.Error(err)

pkg/sql/distsql_plan_stats.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,7 @@ var maxTimestampAge = settings.RegisterDurationSetting(
7171
"sql.stats.max_timestamp_age",
7272
"maximum age of timestamp during table statistics collection",
7373
5*time.Minute,
74+
// TODO(yuzefovich): we should add non-negative duration validation.
7475
)
7576

7677
// minAutoHistogramSamples and maxAutoHistogramSamples are the bounds used by

pkg/sql/execinfra/server_config.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -337,6 +337,10 @@ type TestingKnobs struct {
337337
// will be assigned to the gateway before we start assigning partitions to
338338
// other nodes.
339339
MinimumNumberOfGatewayPartitions int
340+
341+
// TableReaderStartScanCb, when non-nil, will be called whenever the
342+
// TableReader processor starts its scan.
343+
TableReaderStartScanCb func()
340344
}
341345

342346
// ModuleTestingKnobs is part of the base.ModuleTestingKnobs interface.

pkg/sql/execinfrapb/processors_sql.proto

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -102,9 +102,8 @@ message TableReaderSpec {
102102
// 50: bump timestamp to 40
103103
// ...
104104
//
105-
// Note: it is an error to perform a historical read at an initial timestamp
106-
// older than this value.
107-
//
105+
// Note: if initial timestamp is older than this value when the scan begins,
106+
// the timestamp will be advanced forward.
108107
optional uint64 max_timestamp_age_nanos = 9 [(gogoproto.nullable) = false];
109108

110109
// Indicates the row-level locking strength to be used by the scan. If set to

pkg/sql/row/fetcher.go

Lines changed: 21 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -637,14 +637,28 @@ func (rf *Fetcher) StartInconsistentScan(
637637
return errors.AssertionFailedf("no spans")
638638
}
639639

640-
txnTimestamp := initialTimestamp
641640
txnStartTime := timeutil.Now()
642-
if txnStartTime.Sub(txnTimestamp.GoTime()) >= maxTimestampAge {
643-
return errors.Errorf(
644-
"AS OF SYSTEM TIME: cannot specify timestamp older than %s for this operation",
645-
maxTimestampAge,
646-
)
641+
if txnStartTime.Sub(initialTimestamp.GoTime()) >= maxTimestampAge {
642+
// The initial timestamp is too far into the past already (which can
643+
// happen when the cluster is overloaded, or there was a delay in the
644+
// stats job being picked up for execution, or some other reason). In
645+
// such case we'll advance the timestamp so that its age is about 1/10
646+
// of the maximum age.
647+
targetTimestampAge := maxTimestampAge.Nanoseconds() / 10
648+
if targetTimestampAge < int64(100*time.Millisecond) {
649+
// Ensure at least 100ms timestamp age. We shouldn't reach this line
650+
// unless the max timestamp age setting is set way too low (or
651+
// negative, by mistake), so we're just being conservative.
652+
targetTimestampAge = int64(100 * time.Millisecond)
653+
}
654+
advanceBy := txnStartTime.Sub(initialTimestamp.GoTime()).Nanoseconds() - targetTimestampAge
655+
if log.V(1) {
656+
log.Infof(ctx, "initial timestamp %v too far into the past, advancing it by %v", initialTimestamp, advanceBy)
657+
}
658+
initialTimestamp = initialTimestamp.Add(advanceBy, 0 /* logical */)
647659
}
660+
661+
txnTimestamp := initialTimestamp
648662
txn := kv.NewTxnWithSteppingEnabled(ctx, db, 0 /* gatewayNodeID */, qualityOfService)
649663
if err := txn.SetFixedTimestamp(ctx, txnTimestamp); err != nil {
650664
return err
@@ -691,7 +705,7 @@ func (rf *Fetcher) StartInconsistentScan(
691705
}
692706

693707
if err := rf.kvFetcher.SetupNextFetch(
694-
ctx, spans, nil, batchBytesLimit,
708+
ctx, spans, nil /* spanIDs */, batchBytesLimit,
695709
rf.rowLimitToKeyLimit(rowLimitHint), false, /* spansCanOverlap */
696710
); err != nil {
697711
return err

pkg/sql/rowexec/tablereader.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -205,6 +205,9 @@ func (tr *tableReader) Start(ctx context.Context) {
205205
}
206206

207207
func (tr *tableReader) startScan(ctx context.Context) error {
208+
if cb := tr.FlowCtx.Cfg.TestingKnobs.TableReaderStartScanCb; cb != nil {
209+
cb()
210+
}
208211
limitBatches := !tr.parallelize
209212
var bytesLimit rowinfra.BytesLimit
210213
if !limitBatches {

0 commit comments

Comments
 (0)