Skip to content

Commit 8de0ae6

Browse files
craig[bot] and mgartner committed
Merge #144309
144309: jobs: add jobs.avoid_full_scans.enabled r=mgartner a=mgartner #### jobs: add jobs.avoid_full_scans.enabled The `jobs.avoid_full_scans.enabled` cluster setting has been added which adds `AVOID_FULL_SCAN` hints to two internal, jobs-related queries: `find-running-jobs-of-type` and `find-all-running-jobs-of-type`. This hint prevents the optimizer from choosing bad query plans with full table scans. The setting is disabled by default. It will be enabled in a future commit. Release note: None #### jobs: enable jobs.avoid_full_scans.enabled by default Epic: None Release note (performance improvement): Some internal queries executed by the jobs system are now less likely to perform full table scans of the `system.jobs` table, making them more efficient. This change can be reverted by disabling the `jobs.avoid_full_scans.enabled` cluster setting. Co-authored-by: Marcus Gartner <[email protected]>
2 parents 2b77e01 + f5c864b commit 8de0ae6

File tree

6 files changed

+85
-43
lines changed

6 files changed

+85
-43
lines changed

pkg/jobs/BUILD.bazel

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@ go_library(
6666
"//pkg/util/log",
6767
"//pkg/util/log/eventpb",
6868
"//pkg/util/log/severity",
69+
"//pkg/util/metamorphic",
6970
"//pkg/util/metric",
7071
"//pkg/util/pprofutil",
7172
"//pkg/util/protoutil",

pkg/jobs/utils.go

Lines changed: 41 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -12,39 +12,56 @@ import (
1212

1313
"github.com/cockroachdb/cockroach/pkg/jobs/jobspb"
1414
"github.com/cockroachdb/cockroach/pkg/kv"
15+
"github.com/cockroachdb/cockroach/pkg/settings"
16+
"github.com/cockroachdb/cockroach/pkg/settings/cluster"
1517
"github.com/cockroachdb/cockroach/pkg/sql/isql"
1618
"github.com/cockroachdb/cockroach/pkg/sql/sem/tree"
19+
"github.com/cockroachdb/cockroach/pkg/util/metamorphic"
1720
"github.com/cockroachdb/errors"
1821
)
1922

23+
var testingAvoidFullScans = metamorphic.ConstantWithTestBool(
24+
"jobs.avoid_full_scans_in_find_running_jobs",
25+
true, /* defaultValue */
26+
)
27+
28+
var avoidFullScans = settings.RegisterBoolSetting(
29+
settings.ApplicationLevel,
30+
"jobs.avoid_full_scans_in_find_running_jobs.enabled",
31+
"when true, enables hints to avoid full scans for internal, jobs-related queries",
32+
testingAvoidFullScans)
33+
2034
// RunningJobExists checks whether there are any jobs of the given types
2135
// in the pending, running, or paused state, optionally ignoring the job with
2236
// the ID specified by ignoreJobID as well as any jobs created after it, if
2337
// the passed ID is not InvalidJobID.
2438
func RunningJobExists(
25-
ctx context.Context, ignoreJobID jobspb.JobID, txn isql.Txn, jobTypes ...jobspb.Type,
39+
ctx context.Context,
40+
cs *cluster.Settings,
41+
ignoreJobID jobspb.JobID,
42+
txn isql.Txn,
43+
jobTypes ...jobspb.Type,
2644
) (exists bool, retErr error) {
2745
typeStrs, err := getJobTypeStrs(jobTypes)
2846
if err != nil {
2947
return false, err
3048
}
3149

32-
orderBy := " ORDER BY created"
50+
orderBy := "ORDER BY created"
3351
if ignoreJobID == jobspb.InvalidJobID {
3452
// There is no need to order by the created column if there is no job to
3553
// ignore.
3654
orderBy = ""
3755
}
3856

39-
stmt := `
40-
SELECT
41-
id
42-
FROM
43-
system.jobs@jobs_status_created_idx
44-
WHERE
45-
job_type IN ` + typeStrs + ` AND
46-
status IN ` + NonTerminalStateTupleString + orderBy + `
47-
LIMIT 1`
57+
hint := "jobs_status_created_idx"
58+
if avoidFullScans.Get(&cs.SV) {
59+
hint = "{FORCE_INDEX=jobs_status_created_idx,AVOID_FULL_SCAN}"
60+
}
61+
62+
q := `SELECT id FROM system.jobs@%s WHERE job_type IN %s AND status IN %s %s LIMIT 1`
63+
stmt := fmt.Sprintf(q, hint, typeStrs, NonTerminalStateTupleString, orderBy)
64+
4865
it, err := txn.QueryIterator(
4966
ctx,
5067
"find-running-jobs-of-type",
@@ -74,28 +91,31 @@ LIMIT 1`
7491
// by ignoreJobID as well as any jobs created after it, if the passed ID is not
7592
// InvalidJobID.
7693
func RunningJobs(
77-
ctx context.Context, ignoreJobID jobspb.JobID, txn isql.Txn, jobTypes ...jobspb.Type,
94+
ctx context.Context,
95+
cs *cluster.Settings,
96+
ignoreJobID jobspb.JobID,
97+
txn isql.Txn,
98+
jobTypes ...jobspb.Type,
7899
) (jobIDs []jobspb.JobID, retErr error) {
79100
typeStrs, err := getJobTypeStrs(jobTypes)
80101
if err != nil {
81102
return jobIDs, err
82103
}
83104

84-
orderBy := " ORDER BY created"
105+
orderBy := "ORDER BY created"
85106
if ignoreJobID == jobspb.InvalidJobID {
86107
// There is no need to order by the created column if there is no job to
87108
// ignore.
88109
orderBy = ""
89110
}
90111

91-
stmt := `
92-
SELECT
93-
id
94-
FROM
95-
system.jobs@jobs_status_created_idx
96-
WHERE
97-
job_type IN ` + typeStrs + ` AND
98-
status IN ` + NonTerminalStateTupleString + orderBy
112+
hint := "jobs_status_created_idx"
113+
if avoidFullScans.Get(&cs.SV) {
114+
hint = "{FORCE_INDEX=jobs_status_created_idx,AVOID_FULL_SCAN}"
115+
}
116+
117+
q := `SELECT id FROM system.jobs@%s WHERE job_type IN %s AND status IN %s %s`
118+
stmt := fmt.Sprintf(q, hint, typeStrs, NonTerminalStateTupleString, orderBy)
99119
it, err := txn.QueryIterator(
100120
ctx,
101121
"find-all-running-jobs-of-type",

pkg/spanconfig/spanconfigmanager/manager.go

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -128,7 +128,7 @@ func (m *Manager) run(ctx context.Context) {
128128
return
129129
}
130130

131-
started, err := m.createAndStartJobIfNoneExists(ctx)
131+
started, err := m.createAndStartJobIfNoneExists(ctx, m.settings)
132132
if err != nil {
133133
log.Errorf(ctx, "error starting auto span config reconciliation job: %v", err)
134134
}
@@ -162,7 +162,9 @@ func (m *Manager) run(ctx context.Context) {
162162
// createAndStartJobIfNoneExists creates the span config reconciliation job iff it
163163
// hasn't been created already and notifies the jobs registry to adopt it.
164164
// Returns a boolean indicating if the job was created.
165-
func (m *Manager) createAndStartJobIfNoneExists(ctx context.Context) (bool, error) {
165+
func (m *Manager) createAndStartJobIfNoneExists(
166+
ctx context.Context, cs *cluster.Settings,
167+
) (bool, error) {
166168
if m.knobs.ManagerDisableJobCreation {
167169
return false, nil
168170
}
@@ -177,8 +179,9 @@ func (m *Manager) createAndStartJobIfNoneExists(ctx context.Context) (bool, erro
177179

178180
var job *jobs.Job
179181
if err := m.db.Txn(ctx, func(ctx context.Context, txn isql.Txn) error {
180-
exists, err := jobs.RunningJobExists(ctx, jobspb.InvalidJobID, txn,
181-
jobspb.TypeAutoSpanConfigReconciliation)
182+
exists, err := jobs.RunningJobExists(
183+
ctx, cs, jobspb.InvalidJobID, txn, jobspb.TypeAutoSpanConfigReconciliation,
184+
)
182185
if err != nil {
183186
return err
184187
}

pkg/spanconfig/spanconfigmanager/manager_test.go

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -104,7 +104,7 @@ func TestManagerConcurrentJobCreation(t *testing.T) {
104104

105105
var g errgroup.Group
106106
g.Go(func() error {
107-
started, err := manager.TestingCreateAndStartJobIfNoneExists(ctx)
107+
started, err := manager.TestingCreateAndStartJobIfNoneExists(ctx, ts.ClusterSettings())
108108
if err != nil {
109109
return err
110110
}
@@ -117,7 +117,7 @@ func TestManagerConcurrentJobCreation(t *testing.T) {
117117
// Only try to start the job if the first goroutine has reached the testing
118118
// knob and is blocked.
119119
<-isBlocked
120-
started, err := manager.TestingCreateAndStartJobIfNoneExists(ctx)
120+
started, err := manager.TestingCreateAndStartJobIfNoneExists(ctx, ts.ClusterSettings())
121121
if err != nil {
122122
return err
123123
}
@@ -183,7 +183,7 @@ func TestManagerStartsJobIfFailed(t *testing.T) {
183183
)
184184
require.NoError(t, err)
185185

186-
started, err := manager.TestingCreateAndStartJobIfNoneExists(ctx)
186+
started, err := manager.TestingCreateAndStartJobIfNoneExists(ctx, ts.ClusterSettings())
187187
require.NoError(t, err)
188188
require.True(t, started)
189189
}
@@ -331,7 +331,7 @@ func TestReconciliationJobErrorAndRecovery(t *testing.T) {
331331
},
332332
)
333333

334-
started, err := manager.TestingCreateAndStartJobIfNoneExists(ctx)
334+
started, err := manager.TestingCreateAndStartJobIfNoneExists(ctx, ts.ClusterSettings())
335335
require.NoError(t, err)
336336
require.True(t, started)
337337

@@ -354,7 +354,7 @@ func TestReconciliationJobErrorAndRecovery(t *testing.T) {
354354
mu.err = nil
355355
mu.Unlock()
356356

357-
started, err = manager.TestingCreateAndStartJobIfNoneExists(ctx)
357+
started, err = manager.TestingCreateAndStartJobIfNoneExists(ctx, ts.ClusterSettings())
358358
require.NoError(t, err)
359359
require.True(t, started)
360360

@@ -421,7 +421,7 @@ func TestReconciliationUsesRightCheckpoint(t *testing.T) {
421421
nil,
422422
)
423423

424-
started, err := manager.TestingCreateAndStartJobIfNoneExists(ctx)
424+
started, err := manager.TestingCreateAndStartJobIfNoneExists(ctx, ts.ClusterSettings())
425425
require.NoError(t, err)
426426
require.True(t, started)
427427

pkg/spanconfig/spanconfigmanager/test_helpers.go

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,16 @@
55

66
package spanconfigmanager
77

8-
import "context"
8+
import (
9+
"context"
10+
11+
"github.com/cockroachdb/cockroach/pkg/settings/cluster"
12+
)
913

1014
// TestingCreateAndStartJobIfNoneExists is a wrapper around
1115
// createAndStartJobIfNoneExists for testing it.
12-
func (m *Manager) TestingCreateAndStartJobIfNoneExists(ctx context.Context) (bool, error) {
13-
return m.createAndStartJobIfNoneExists(ctx)
16+
func (m *Manager) TestingCreateAndStartJobIfNoneExists(
17+
ctx context.Context, cs *cluster.Settings,
18+
) (bool, error) {
19+
return m.createAndStartJobIfNoneExists(ctx, cs)
1420
}

pkg/sql/create_stats.go

Lines changed: 21 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -160,13 +160,18 @@ func (n *createStatsNode) runJob(ctx context.Context) error {
160160
// (To handle race conditions we check this again after the job starts,
161161
// but this check is used to prevent creating a large number of jobs that
162162
// immediately fail).
163-
if err := checkRunningJobsInTxn(ctx, jobspb.InvalidJobID, txn); err != nil {
163+
if err := checkRunningJobsInTxn(
164+
ctx, n.p.EvalContext().Settings, jobspb.InvalidJobID, txn,
165+
); err != nil {
164166
return err
165167
}
166168
// Don't start auto partial stats jobs if there is another auto partial
167169
// stats job running on the same table.
168170
if n.Name == jobspb.AutoPartialStatsName {
169-
if err := checkRunningAutoPartialJobsInTxn(ctx, jobspb.InvalidJobID, txn, n.p.ExecCfg().JobRegistry, details.Table.ID); err != nil {
171+
if err := checkRunningAutoPartialJobsInTxn(
172+
ctx, n.p.EvalContext().Settings, jobspb.InvalidJobID, txn,
173+
n.p.ExecCfg().JobRegistry, details.Table.ID,
174+
); err != nil {
170175
return err
171176
}
172177
}
@@ -881,11 +886,15 @@ func checkRunningJobs(
881886
jobID = job.ID()
882887
}
883888
return p.ExecCfg().InternalDB.Txn(ctx, func(ctx context.Context, txn isql.Txn) (err error) {
884-
if err = checkRunningJobsInTxn(ctx, jobID, txn); err != nil {
889+
if err = checkRunningJobsInTxn(
890+
ctx, p.ExtendedEvalContext().Settings, jobID, txn,
891+
); err != nil {
885892
return err
886893
}
887894
if autoPartial {
888-
return checkRunningAutoPartialJobsInTxn(ctx, jobID, txn, jobRegistry, tableID)
895+
return checkRunningAutoPartialJobsInTxn(
896+
ctx, p.ExtendedEvalContext().Settings, jobID, txn, jobRegistry, tableID,
897+
)
889898
}
890899
return nil
891900
})
@@ -896,9 +905,11 @@ func checkRunningJobs(
896905
// that started earlier than this one. If there are, checkRunningJobsInTxn
897906
// returns an error. If jobID is jobspb.InvalidJobID, checkRunningJobsInTxn just
898907
// checks if there are any pending, running, or paused CreateStats jobs.
899-
func checkRunningJobsInTxn(ctx context.Context, jobID jobspb.JobID, txn isql.Txn) error {
900-
exists, err := jobs.RunningJobExists(ctx, jobID, txn,
901-
jobspb.TypeCreateStats, jobspb.TypeAutoCreateStats,
908+
func checkRunningJobsInTxn(
909+
ctx context.Context, cs *cluster.Settings, jobID jobspb.JobID, txn isql.Txn,
910+
) error {
911+
exists, err := jobs.RunningJobExists(
912+
ctx, cs, jobID, txn, jobspb.TypeCreateStats, jobspb.TypeAutoCreateStats,
902913
)
903914
if err != nil {
904915
return err
@@ -919,13 +930,14 @@ func checkRunningJobsInTxn(ctx context.Context, jobID jobspb.JobID, txn isql.Txn
919930
// AutoCreatePartialStats jobs for the same table.
920931
func checkRunningAutoPartialJobsInTxn(
921932
ctx context.Context,
933+
cs *cluster.Settings,
922934
jobID jobspb.JobID,
923935
txn isql.Txn,
924936
jobRegistry *jobs.Registry,
925937
tableID descpb.ID,
926938
) error {
927-
autoPartialStatJobIDs, err := jobs.RunningJobs(ctx, jobID, txn,
928-
jobspb.TypeAutoCreatePartialStats,
939+
autoPartialStatJobIDs, err := jobs.RunningJobs(
940+
ctx, cs, jobID, txn, jobspb.TypeAutoCreatePartialStats,
929941
)
930942
if err != nil {
931943
return err

0 commit comments

Comments
 (0)