Skip to content

Commit d1dfc80

Browse files
committed
sql: don't throw errors for skipped auto stats jobs
Previously, auto stats jobs would throw errors and increase failed jobs counters if they attempted to start while a stats collection was already in progress on the table. For large clusters with 'sql.stats.automatic_job_check_before_creating_job.enabled' set to true, this could create quite a few failed jobs. These failed jobs don't seem to cause any performance issues, but they clutter logs, potentially obscuring real problems and alarming customers, who then file tickets with support to figure out why their jobs are failing. This patch: * refactors the autostats checks to reduce code duplication. * swallows the error for concurrent auto stats creation, logging at INFO level instead. * changes the create stats jobs test so that it no longer expects these jobs creations to fail and instead expects the stats to not be collected. * fixes a bug in the create stats jobs test that would cause it to hang instead of exiting on error. * adds a cluster setting, sql.stats.error_on_concurrent_create_stats.enabled, which controls this new behavior. By default the old behavior is maintained. Fixes: #148413 Release note (ops change): CockroachDB now has a cluster setting, sql.stats.error_on_concurrent_create_stats.enabled, which modifies how it reacts to concurrent auto stats jobs. The default, true, maintains the previous behavior. Setting this to false will cause the concurrent auto stats job to be skipped with just a log entry and no increased error counters.
1 parent 5128b47 commit d1dfc80

File tree

4 files changed

+140
-57
lines changed

4 files changed

+140
-57
lines changed

docs/generated/settings/settings-for-tenants.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -355,6 +355,7 @@ sql.stats.automatic_partial_collection.fraction_stale_rows float 0.05 target fra
355355
sql.stats.automatic_partial_collection.min_stale_rows integer 100 target minimum number of stale rows per table that will trigger a partial statistics refresh application
356356
sql.stats.cleanup.recurrence string @hourly cron-tab recurrence for SQL Stats cleanup job application
357357
sql.stats.detailed_latency_metrics.enabled boolean false label latency metrics with the statement fingerprint. Workloads with tens of thousands of distinct query fingerprints should leave this setting false. (experimental, affects performance for workloads with high fingerprint cardinality) application
358+
sql.stats.error_on_concurrent_create_stats.enabled boolean true set to true to error on concurrent CREATE STATISTICS jobs, instead of skipping them application
358359
sql.stats.flush.enabled boolean true if set, SQL execution statistics are periodically flushed to disk application
359360
sql.stats.flush.interval duration 10m0s the interval at which SQL execution statistics are flushed to disk, this value must be less than or equal to 1 hour application
360361
sql.stats.forecasts.enabled boolean true when true, enables generation of statistics forecasts by default for all tables application

docs/generated/settings/settings.html

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -310,6 +310,7 @@
310310
<tr><td><div id="setting-sql-stats-automatic-partial-collection-min-stale-rows" class="anchored"><code>sql.stats.automatic_partial_collection.min_stale_rows</code></div></td><td>integer</td><td><code>100</code></td><td>target minimum number of stale rows per table that will trigger a partial statistics refresh</td><td>Serverless/Dedicated/Self-Hosted</td></tr>
311311
<tr><td><div id="setting-sql-stats-cleanup-recurrence" class="anchored"><code>sql.stats.cleanup.recurrence</code></div></td><td>string</td><td><code>@hourly</code></td><td>cron-tab recurrence for SQL Stats cleanup job</td><td>Serverless/Dedicated/Self-Hosted</td></tr>
312312
<tr><td><div id="setting-sql-stats-detailed-latency-metrics-enabled" class="anchored"><code>sql.stats.detailed_latency_metrics.enabled</code></div></td><td>boolean</td><td><code>false</code></td><td>label latency metrics with the statement fingerprint. Workloads with tens of thousands of distinct query fingerprints should leave this setting false. (experimental, affects performance for workloads with high fingerprint cardinality)</td><td>Serverless/Dedicated/Self-Hosted</td></tr>
313+
<tr><td><div id="setting-sql-stats-error-on-concurrent-create-stats-enabled" class="anchored"><code>sql.stats.error_on_concurrent_create_stats.enabled</code></div></td><td>boolean</td><td><code>true</code></td><td>set to true to error on concurrent CREATE STATISTICS jobs, instead of skipping them</td><td>Serverless/Dedicated/Self-Hosted</td></tr>
313314
<tr><td><div id="setting-sql-stats-flush-enabled" class="anchored"><code>sql.stats.flush.enabled</code></div></td><td>boolean</td><td><code>true</code></td><td>if set, SQL execution statistics are periodically flushed to disk</td><td>Serverless/Dedicated/Self-Hosted</td></tr>
314315
<tr><td><div id="setting-sql-stats-flush-interval" class="anchored"><code>sql.stats.flush.interval</code></div></td><td>duration</td><td><code>10m0s</code></td><td>the interval at which SQL execution statistics are flushed to disk, this value must be less than or equal to 1 hour</td><td>Serverless/Dedicated/Self-Hosted</td></tr>
315316
<tr><td><div id="setting-sql-stats-forecasts-enabled" class="anchored"><code>sql.stats.forecasts.enabled</code></div></td><td>boolean</td><td><code>true</code></td><td>when true, enables generation of statistics forecasts by default for all tables</td><td>Serverless/Dedicated/Self-Hosted</td></tr>

pkg/sql/create_stats.go

Lines changed: 52 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,13 @@ var automaticJobCheckBeforeCreatingJob = settings.RegisterBoolSetting(
8383
"transaction as creating the job",
8484
true)
8585

86+
var errorOnConcurrentCreateStats = settings.RegisterBoolSetting(
87+
settings.ApplicationLevel,
88+
"sql.stats.error_on_concurrent_create_stats.enabled",
89+
"set to true to error on concurrent CREATE STATISTICS jobs, instead of skipping them",
90+
true,
91+
settings.WithPublic)
92+
8693
const nonIndexColHistogramBuckets = 2
8794

8895
// StubTableStats generates "stub" statistics for a table which are missing
@@ -166,6 +173,10 @@ func (n *createStatsNode) runJob(ctx context.Context) error {
166173
ctx, nil /* job */, n.p, n.Name == jobspb.AutoPartialStatsName, n.p.ExecCfg().JobRegistry,
167174
details.Table.ID,
168175
); err != nil {
176+
if !errorOnConcurrentCreateStats.Get(n.p.ExecCfg().SV()) && errors.Is(err, stats.ConcurrentCreateStatsError) {
177+
log.Infof(ctx, "concurrent create stats job found, skipping")
178+
return nil
179+
}
169180
return err
170181
}
171182
}
@@ -179,23 +190,18 @@ func (n *createStatsNode) runJob(ctx context.Context) error {
179190
if (n.Name == jobspb.AutoStatsName || n.Name == jobspb.AutoPartialStatsName) && !jobCheckBefore {
180191
// Don't start the job if there is already a CREATE STATISTICS job running.
181192
if err := checkRunningJobsInTxn(
182-
ctx, n.p.EvalContext().Settings, jobspb.InvalidJobID, txn,
193+
ctx, nil /* job */, n.p, n.Name == jobspb.AutoPartialStatsName, txn, n.p.ExecCfg().JobRegistry,
194+
details.Table.ID,
183195
); err != nil {
184196
return err
185197
}
186-
// Don't start auto partial stats jobs if there is another auto partial
187-
// stats job running on the same table.
188-
if n.Name == jobspb.AutoPartialStatsName {
189-
if err := checkRunningAutoPartialJobsInTxn(
190-
ctx, n.p.EvalContext().Settings, jobspb.InvalidJobID, txn,
191-
n.p.ExecCfg().JobRegistry, details.Table.ID,
192-
); err != nil {
193-
return err
194-
}
195-
}
196198
}
197199
return n.p.ExecCfg().JobRegistry.CreateStartableJobWithTxn(ctx, &job, jobID, txn, *record)
198200
}); err != nil {
201+
if !errorOnConcurrentCreateStats.Get(n.p.ExecCfg().SV()) && errors.Is(err, stats.ConcurrentCreateStatsError) {
202+
log.Infof(ctx, "concurrent create stats job found, skipping")
203+
return nil
204+
}
199205
if job != nil {
200206
if cleanupErr := job.CleanupOnRollback(ctx); cleanupErr != nil {
201207
log.Warningf(ctx, "failed to cleanup StartableJob: %v", cleanupErr)
@@ -212,6 +218,9 @@ func (n *createStatsNode) runJob(ctx context.Context) error {
212218
if delErr := n.p.ExecCfg().JobRegistry.DeleteTerminalJobByID(ctx, job.ID()); delErr != nil {
213219
log.Warningf(ctx, "failed to delete job: %v", delErr)
214220
}
221+
if !errorOnConcurrentCreateStats.Get(n.p.ExecCfg().SV()) {
222+
return nil
223+
}
215224
}
216225
}
217226
return err
@@ -901,31 +910,49 @@ func checkRunningJobs(
901910
jobRegistry *jobs.Registry,
902911
tableID descpb.ID,
903912
) error {
904-
jobID := jobspb.InvalidJobID
905-
if job != nil {
906-
jobID = job.ID()
907-
}
908913
return p.ExecCfg().InternalDB.Txn(ctx, func(ctx context.Context, txn isql.Txn) (err error) {
909914
if err = checkRunningJobsInTxn(
910-
ctx, p.ExtendedEvalContext().Settings, jobID, txn,
915+
ctx, job, p, autoPartial, txn, jobRegistry, tableID,
911916
); err != nil {
912917
return err
913918
}
914-
if autoPartial {
915-
return checkRunningAutoPartialJobsInTxn(
916-
ctx, p.ExtendedEvalContext().Settings, jobID, txn, jobRegistry, tableID,
917-
)
918-
}
919919
return nil
920920
})
921921
}
922922

923-
// checkRunningJobsInTxn checks whether there are any other CreateStats jobs
923+
func checkRunningJobsInTxn(
924+
ctx context.Context,
925+
job *jobs.Job,
926+
p JobExecContext,
927+
autoPartial bool,
928+
txn isql.Txn,
929+
jobRegistry *jobs.Registry,
930+
tableID descpb.ID,
931+
) error {
932+
jobID := jobspb.InvalidJobID
933+
if job != nil {
934+
jobID = job.ID()
935+
}
936+
937+
if err := checkRunningJobsInTxnImpl(
938+
ctx, p.ExtendedEvalContext().Settings, jobID, txn,
939+
); err != nil {
940+
return err
941+
}
942+
if autoPartial {
943+
return checkRunningAutoPartialJobsInTxn(
944+
ctx, p.ExtendedEvalContext().Settings, jobID, txn, jobRegistry, tableID,
945+
)
946+
}
947+
return nil
948+
}
949+
950+
// checkRunningJobsInTxnImpl checks whether there are any other CreateStats jobs
924951
// (excluding auto partial stats jobs) in the pending, running, or paused status
925-
// that started earlier than this one. If there are, checkRunningJobsInTxn
926-
// returns an error. If jobID is jobspb.InvalidJobID, checkRunningJobsInTxn just
952+
// that started earlier than this one. If there are, checkRunningJobsInTxnImpl
953+
// returns an error. If jobID is jobspb.InvalidJobID, checkRunningJobsInTxnImpl just
927954
// checks if there are any pending, running, or paused CreateStats jobs.
928-
func checkRunningJobsInTxn(
955+
func checkRunningJobsInTxnImpl(
929956
ctx context.Context, cs *cluster.Settings, jobID jobspb.JobID, txn isql.Txn,
930957
) error {
931958
exists, err := jobs.RunningJobExists(

pkg/sql/stats/create_stats_job_test.go

Lines changed: 86 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -221,10 +221,24 @@ func TestCreateStatisticsCanBeCancelled(t *testing.T) {
221221
// don't run when a full stats job is running. It also tests that manual stat
222222
// jobs (full or partial) are always allowed to run.
223223
func TestAtMostOneRunningCreateStats(t *testing.T) {
224+
testAtMostOneRunningCreateStatsImpl(t, true /* errorOnConcurrentCreateStats */)
225+
}
226+
227+
func TestAtMostOneRunningCreateStatsWithErrorOnConcurrentCreateStats(t *testing.T) {
228+
testAtMostOneRunningCreateStatsImpl(t, false /* errorOnConcurrentCreateStats */)
229+
}
230+
231+
func testAtMostOneRunningCreateStatsImpl(t *testing.T, errorOnConcurrentCreateStats bool) {
224232
defer leaktest.AfterTest(t)()
225233
defer log.Scope(t).Close(t)
226234

227235
var allowRequest chan struct{}
236+
var allowRequestOpen bool
237+
defer func() {
238+
if allowRequestOpen {
239+
close(allowRequest)
240+
}
241+
}()
228242

229243
filter, setTableID := createStatsRequestFilter(&allowRequest)
230244
var params base.TestClusterArgs
@@ -241,37 +255,25 @@ func TestAtMostOneRunningCreateStats(t *testing.T) {
241255
conn := tc.ApplicationLayer(0).SQLConn(t)
242256
sqlDB := sqlutils.MakeSQLRunner(conn)
243257

258+
sqlDB.Exec(t, fmt.Sprintf("SET CLUSTER SETTING sql.stats.error_on_concurrent_create_stats.enabled = %t", errorOnConcurrentCreateStats))
244259
sqlDB.Exec(t, `CREATE DATABASE d`)
245260
sqlDB.Exec(t, `CREATE TABLE d.t (x INT PRIMARY KEY)`)
246261
sqlDB.Exec(t, `INSERT INTO d.t SELECT generate_series(1,1000)`)
247262
var tID descpb.ID
248263
sqlDB.QueryRow(t, `SELECT 'd.t'::regclass::int`).Scan(&tID)
249264
setTableID(tID)
250265

251-
autoFullStatsRunShouldFail := func() {
252-
_, err := conn.Exec(`CREATE STATISTICS __auto__ FROM d.t`)
253-
expected := "another CREATE STATISTICS job is already running"
254-
if !testutils.IsError(err, expected) {
255-
t.Fatalf("expected '%s' error, but got %v", expected, err)
256-
}
257-
}
258-
autoPartialStatsRunShouldFail := func() {
259-
_, err := conn.Exec(`CREATE STATISTICS __auto_partial__ FROM d.t USING EXTREMES`)
260-
expected := "another CREATE STATISTICS job is already running"
261-
if !testutils.IsError(err, expected) {
262-
t.Fatalf("expected '%s' error, but got %v", expected, err)
263-
}
264-
}
265-
266266
// Start a full stat run and let it complete so that future partial stats can
267267
// be collected
268268
allowRequest = make(chan struct{})
269+
allowRequestOpen = true
269270
initialFullStatErrCh := make(chan error)
270271
go func() {
271272
_, err := conn.Exec(`CREATE STATISTICS full_statistic FROM d.t`)
272273
initialFullStatErrCh <- err
273274
}()
274275
close(allowRequest)
276+
allowRequestOpen = false
275277
if err := <-initialFullStatErrCh; err != nil {
276278
t.Fatalf("create stats job should have completed: %s", err)
277279
}
@@ -280,6 +282,7 @@ func TestAtMostOneRunningCreateStats(t *testing.T) {
280282
// be the stat job that runs in the background as we test the behavior of new
281283
// stat jobs.
282284
allowRequest = make(chan struct{})
285+
allowRequestOpen = true
283286
runningManualFullStatErrCh := make(chan error)
284287
go func() {
285288
_, err := conn.Exec(`CREATE STATISTICS s1 FROM d.t`)
@@ -291,9 +294,12 @@ func TestAtMostOneRunningCreateStats(t *testing.T) {
291294
t.Fatal(err)
292295
}
293296

294-
// Attempt to start automatic full and partial stats runs. Both should fail.
295-
autoFullStatsRunShouldFail()
296-
autoPartialStatsRunShouldFail()
297+
// Don't block on autostats jobs.
298+
setTableID(descpb.InvalidID)
299+
300+
// Attempt to start automatic full and partial stats runs.
301+
runAutoStatsJob(t, sqlDB, "d.t", false /* partial */, errorOnConcurrentCreateStats)
302+
runAutoStatsJob(t, sqlDB, "d.t", true /* partial */, errorOnConcurrentCreateStats)
297303

298304
// PAUSE JOB does not block until the job is paused but only requests it.
299305
// Wait until the job is set to paused.
@@ -320,8 +326,8 @@ func TestAtMostOneRunningCreateStats(t *testing.T) {
320326
}
321327

322328
// Starting automatic full and partial stats run should still fail.
323-
autoFullStatsRunShouldFail()
324-
autoPartialStatsRunShouldFail()
329+
runAutoStatsJob(t, sqlDB, "d.t", false /* partial */, errorOnConcurrentCreateStats)
330+
runAutoStatsJob(t, sqlDB, "d.t", true /* partial */, errorOnConcurrentCreateStats)
325331

326332
// Attempt to start manual full and partial stat runs. Both should succeed.
327333
manualFullStatErrCh := make(chan error)
@@ -345,8 +351,8 @@ func TestAtMostOneRunningCreateStats(t *testing.T) {
345351
t.Fatal(err)
346352
}
347353

348-
// Allow the running full stat job and the new full and partial stat jobs to complete.
349354
close(allowRequest)
355+
allowRequestOpen = false
350356

351357
// Verify that the manual full and partial stat jobs completed successfully.
352358
if err := <-manualFullStatErrCh; err != nil {
@@ -366,10 +372,24 @@ func TestAtMostOneRunningCreateStats(t *testing.T) {
366372
// doesn't prevent any new full or partial stat jobs from running, except for
367373
// auto partial stat jobs on the same table.
368374
func TestBackgroundAutoPartialStats(t *testing.T) {
375+
testBackgroundAutoPartialStatsImpl(t, true /* errorOnConcurrentCreateStats */)
376+
}
377+
378+
func TestBackgroundAutoPartialStatsWithErrorOnConcurrentCreateStats(t *testing.T) {
379+
testBackgroundAutoPartialStatsImpl(t, false /* errorOnConcurrentCreateStats */)
380+
}
381+
382+
func testBackgroundAutoPartialStatsImpl(t *testing.T, errorOnConcurrentCreateStats bool) {
369383
defer leaktest.AfterTest(t)()
370384
defer log.Scope(t).Close(t)
371385

372386
var allowRequest chan struct{}
387+
var allowRequestOpen bool
388+
defer func() {
389+
if allowRequestOpen {
390+
close(allowRequest)
391+
}
392+
}()
373393

374394
filter, setTableID := createStatsRequestFilter(&allowRequest)
375395
var params base.TestClusterArgs
@@ -386,6 +406,7 @@ func TestBackgroundAutoPartialStats(t *testing.T) {
386406
conn := tc.ApplicationLayer(0).SQLConn(t)
387407
sqlDB := sqlutils.MakeSQLRunner(conn)
388408

409+
sqlDB.Exec(t, fmt.Sprintf("SET CLUSTER SETTING sql.stats.error_on_concurrent_create_stats.enabled = %t", errorOnConcurrentCreateStats))
389410
sqlDB.Exec(t, `CREATE DATABASE d`)
390411
sqlDB.Exec(t, `CREATE TABLE d.t1 (x INT PRIMARY KEY)`)
391412
sqlDB.Exec(t, `CREATE TABLE d.t2 (x INT PRIMARY KEY)`)
@@ -410,6 +431,7 @@ func TestBackgroundAutoPartialStats(t *testing.T) {
410431
// This will be the stat job that runs in the background as we test the
411432
// behavior of new stat jobs.
412433
allowRequest = make(chan struct{})
434+
allowRequestOpen = true
413435
runningAutoPartialStatErrCh := make(chan error)
414436
go func() {
415437
_, err := conn.Exec(`CREATE STATISTICS __auto_partial__ FROM d.t1 USING EXTREMES`)
@@ -438,6 +460,7 @@ func TestBackgroundAutoPartialStats(t *testing.T) {
438460

439461
// Allow both auto stat jobs to complete.
440462
close(allowRequest)
463+
allowRequestOpen = false
441464

442465
// Verify that both jobs completed successfully.
443466
if err := <-autoFullStatErrCh; err != nil {
@@ -449,6 +472,7 @@ func TestBackgroundAutoPartialStats(t *testing.T) {
449472

450473
// Start another auto partial stat run and wait until it's done one scan.
451474
allowRequest = make(chan struct{})
475+
allowRequestOpen = true
452476
runningAutoPartialStatErrCh = make(chan error)
453477
go func() {
454478
_, err := conn.Exec(`CREATE STATISTICS __auto_partial__ FROM d.t1 USING EXTREMES`)
@@ -461,12 +485,7 @@ func TestBackgroundAutoPartialStats(t *testing.T) {
461485
}
462486

463487
// Attempt to start a simultaneous auto partial stat run on the same table.
464-
// It should fail.
465-
_, err := conn.Exec(`CREATE STATISTICS __auto_partial__ FROM d.t1 USING EXTREMES`)
466-
expected := "another CREATE STATISTICS job is already running"
467-
if !testutils.IsError(err, expected) {
468-
t.Fatalf("expected '%s' error, but got %v", expected, err)
469-
}
488+
runAutoStatsJob(t, sqlDB, "d.t1", true /* partial */, errorOnConcurrentCreateStats)
470489

471490
// Attempt to start a simultaneous auto partial stat run on a different table.
472491
// It should succeed.
@@ -478,20 +497,20 @@ func TestBackgroundAutoPartialStats(t *testing.T) {
478497

479498
select {
480499
case allowRequest <- struct{}{}:
481-
case err = <-runningAutoPartialStatErrCh:
500+
case err := <-runningAutoPartialStatErrCh:
482501
t.Fatal(err)
483-
case err = <-autoPartialStatErrCh:
502+
case err := <-autoPartialStatErrCh:
484503
t.Fatal(err)
485504
}
486505

487-
// Allow both auto partial stat jobs to complete.
488506
close(allowRequest)
507+
allowRequestOpen = false
489508

490509
// Verify that both jobs completed successfully.
491-
if err = <-autoPartialStatErrCh; err != nil {
510+
if err := <-autoPartialStatErrCh; err != nil {
492511
t.Fatalf("create auto partial stats job should have completed: %s", err)
493512
}
494-
if err = <-runningAutoPartialStatErrCh; err != nil {
513+
if err := <-runningAutoPartialStatErrCh; err != nil {
495514
t.Fatalf("create auto partial stats job should have completed: %s", err)
496515
}
497516
}
@@ -929,3 +948,38 @@ func getFractionCompleted(t testing.TB, sqlDB *sqlutils.SQLRunner, jobID jobspb.
929948
})
930949
return progress.Progress.(*jobspb.Progress_FractionCompleted).FractionCompleted
931950
}
951+
952+
func runAutoStatsJob(
953+
t *testing.T, sqlDB *sqlutils.SQLRunner, tableName string, partial bool, shouldFail bool,
954+
) {
955+
var tableID descpb.ID
956+
sqlDB.QueryRow(t, `SELECT $1::regclass::int`, tableName).Scan(&tableID)
957+
958+
var statsName string
959+
if partial {
960+
statsName = "__auto_partial__"
961+
} else {
962+
statsName = "__auto__"
963+
}
964+
var queryPostfix string
965+
if partial {
966+
queryPostfix = " USING EXTREMES"
967+
}
968+
969+
var beforeCount int
970+
sqlDB.QueryRow(t, `SELECT count(*) FROM system.table_statistics WHERE "tableID" = $1 AND name = $2`, tableID, statsName).Scan(&beforeCount)
971+
972+
if shouldFail {
973+
sqlDB.ExpectErr(t, "another CREATE STATISTICS job is already running", fmt.Sprintf("CREATE STATISTICS %s FROM %s%s", statsName, tableName, queryPostfix))
974+
return
975+
}
976+
977+
sqlDB.Exec(t, fmt.Sprintf("CREATE STATISTICS %s FROM %s%s", statsName, tableName, queryPostfix))
978+
979+
var afterCount int
980+
sqlDB.QueryRow(t, `SELECT count(*) FROM system.table_statistics WHERE "tableID" = $1 AND name = $2`, tableID, statsName).Scan(&afterCount)
981+
982+
if beforeCount != afterCount {
983+
t.Fatalf("auto stats job should have failed, but it didn't (beforeCount: %d, afterCount: %d)", beforeCount, afterCount)
984+
}
985+
}

0 commit comments

Comments
 (0)