Skip to content

Commit e147a55

Browse files
committed
upgrades: avoid crdb_internal.system_jobs in upgrade manager
The crdb_internal.system_jobs is a virtual table that joins information from the jobs table and the job_info table. When given a job status predicate, it does this by running a query such as: WITH latestpayload AS ( SELECT job_id, value FROM system.job_info AS payload WHERE info_key = 'legacy_payload' ORDER BY written DESC ), latestprogress AS ( SELECT job_id, value FROM system.job_info AS progress WHERE info_key = 'legacy_progress' ORDER BY written DESC ) SELECT distinct(id), status, created, payload.value AS payload, progress.value AS progress, created_by_type, created_by_id, claim_session_id, claim_instance_id, num_runs, last_run,job_type FROM system.jobs AS j INNER JOIN latestpayload AS payload ON j.id = payload.job_id LEFT JOIN latestprogress AS progress ON j.id = progress.job_id WHERE j.status = 'cancel-requested'; This uses 2 full scans of the job_info table: ``` • distinct │ distinct on: id, value, value │ └── • merge join │ equality: (job_id) = (id) │ ├── • render │ │ │ └── • filter │ │ estimated row count: 2,787 │ │ filter: info_key = 'legacy_payload' │ │ │ └── • scan │ estimated row count: 5,597 (100% of the table; stats collected 27 minutes ago; using stats forecast for 17 minutes ago) │ table: job_info@primary │ spans: FULL SCAN │ └── • merge join (right outer) │ equality: (job_id) = (id) │ right cols are key │ ├── • render │ │ │ └── • filter │ │ estimated row count: 2,787 │ │ filter: info_key = 'legacy_progress' │ │ │ └── • scan │ estimated row count: 5,597 (100% of the table; stats collected 27 minutes ago; using stats forecast for 17 minutes ago) │ table: job_info@primary │ spans: FULL SCAN │ └── • index join │ table: jobs@primary │ └── • sort │ order: +id │ └── • scan missing stats table: jobs@jobs_status_created_idx spans: [/'cancel-requested' - /'cancel-requested'] ``` Previously, the upgrade manager was using this virtual table as part of a larger query: SELECT id, status FROM ( SELECT id, status, crdb_internal.pb_to_json( 
'cockroach.sql.jobs.jobspb.Payload', payload, false ) AS pl FROM crdb_internal.system_jobs WHERE status IN ('running', 'pending', 'cancel-requested', 'pause-requested', 'reverting', 'paused') ) WHERE pl->'migration'->'clusterVersion' = $1::JSONB; I believe the use of the IN operator causes the virtual index's populate function to be called for each value. Perhaps the optimizer accounts for this in some way to avoid this resulting in 2 * 6 full scans of the job_info table, but it is hard to confirm with the explain output. In at least one recent escalation, we observed this query taking a substantial amount of time as it continually conflicted with other job system queries. Here, we avoid using the virtual table. This allows us to avoid one full scan of the job_info table since we don't need the progress (only the payload). It also allows us to use the full `IN` predicate directly, avoiding any uncertainty. In a local example, this is substantially faster: ``` root@localhost:26257/defaultdb> SELECT id, status -> FROM ( -> SELECT id, -> status, -> crdb_internal.pb_to_json( -> 'cockroach.sql.jobs.jobspb.Payload', -> payload, -> false -- emit_defaults -> ) AS pl -> FROM crdb_internal.system_jobs -> WHERE status IN ('running', 'pending', 'cancel-requested', 'pause-requested', 'reverting', 'paused') -> ) -> WHERE pl->'migration'->'clusterVersion' = '{"activeVersion": {"internal": 84, "majorVal": 22, "minorVal": 2}}'::JSONB; id | status -----+--------- (0 rows) Time: 384ms total (execution 384ms / network 0ms) root@localhost:26257/defaultdb> WITH latestpayload AS (SELECT job_id, value FROM system.job_info AS payload WHERE info_key = 'legacy_payload' ORDER BY written DESC) -> SELECT id, status FROM ( -> SELECT distinct(id), status, crdb_internal.pb_to_json('cockroach.sql.jobs.jobspb.Payload', payload.value, false) AS pl -> FROM system.jobs AS j -> INNER JOIN latestpayload AS payload ON j.id = payload.job_id -> WHERE status IN ('running', 'pending', 'cancel-requested', 
'pause-requested', 'reverting', 'paused') -> AND job_type = 'MIGRATION' -> ) WHERE ((pl->'migration')->'clusterVersion') = '{"activeVersion": {"internal": 84, "majorVal": 22, "minorVal": 2}}'::JSONB; id | status -----+--------- (0 rows) Time: 26ms total (execution 26ms / network 0ms) ``` We should do more work to understand contention within the job system, but perhaps speeding up this query will help a bit. Epic: None Release note: None
1 parent cdf6d15 commit e147a55

File tree

2 files changed

+48
-15
lines changed

2 files changed

+48
-15
lines changed

pkg/upgrade/upgrademanager/manager.go

Lines changed: 44 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -778,26 +778,56 @@ func (m *Manager) getOrCreateMigrationJob(
778778
return alreadyCompleted, alreadyExisting, jobID, nil
779779
}
780780

781+
// SQL text used by getRunningMigrationJob to find jobs whose status is in
// jobs.NonTerminalStatusTupleString and whose decoded payload has a
// migration.clusterVersion equal to the JSONB value bound to $1. Each query
// returns (id, status). getRunningMigrationJob picks one of the two variants
// depending on whether clusterversion.V23_1JobInfoTableIsBackfilled is active.
const (
782+
// preJobInfoTableQuery decodes the payload column read directly from
// system.jobs. It is the variant chosen while V23_1JobInfoTableIsBackfilled
// is not active.
preJobInfoTableQuery = `
783+
SELECT id, status
784+
FROM (
785+
SELECT id, status,
786+
crdb_internal.pb_to_json(
787+
'cockroach.sql.jobs.jobspb.Payload',
788+
payload,
789+
false -- emit_defaults
790+
) AS pl
791+
FROM system.jobs
792+
WHERE status IN ` + jobs.NonTerminalStatusTupleString + `
793+
)
794+
WHERE ((pl->'migration')->'clusterVersion') = $1::JSONB`
795+
// postJobInfoTableQuery takes the payload from system.job_info rows with
// info_key = 'legacy_payload', joined to system.jobs on the job id, and
// additionally restricts the scan to job_type = 'MIGRATION'. It is the
// variant chosen once V23_1JobInfoTableIsBackfilled is active.
//
// NOTE(review): `distinct(id)` presumably parses as SELECT DISTINCT over the
// whole select list rather than DISTINCT ON (id), and nothing visible here
// shows the outer query relying on the CTE's ORDER BY written DESC. Confirm
// this yields the intended row when a job has several legacy_payload rows.
postJobInfoTableQuery = `
796+
WITH latestpayload AS (
797+
SELECT job_id, value FROM system.job_info AS payload
798+
WHERE info_key = 'legacy_payload'
799+
ORDER BY written DESC
800+
)
801+
SELECT id, status
802+
FROM (
803+
SELECT
804+
distinct(id),
805+
status,
806+
crdb_internal.pb_to_json(
807+
'cockroach.sql.jobs.jobspb.Payload',
808+
payload.value,
809+
false -- emit_defaults
810+
) AS pl
811+
FROM system.jobs AS j
812+
INNER JOIN latestpayload AS payload ON j.id = payload.job_id
813+
WHERE j.status IN ` + jobs.NonTerminalStatusTupleString + `
814+
AND j.job_type = 'MIGRATION'
815+
)
816+
WHERE ((pl->'migration')->'clusterVersion') = $1::JSONB`
817+
)
818+
781819
func (m *Manager) getRunningMigrationJob(
782820
ctx context.Context, txn isql.Txn, version roachpb.Version,
783821
) (found bool, jobID jobspb.JobID, _ error) {
784822
// Wrap the version into a ClusterVersion so that the JSON looks like what the
785823
// Payload proto has inside.
786824
cv := clusterversion.ClusterVersion{Version: version}
787-
const query = `
788-
SELECT id, status
789-
FROM (
790-
SELECT id,
791-
status,
792-
crdb_internal.pb_to_json(
793-
'cockroach.sql.jobs.jobspb.Payload',
794-
payload,
795-
false -- emit_defaults
796-
) AS pl
797-
FROM crdb_internal.system_jobs
798-
WHERE status IN ` + jobs.NonTerminalStatusTupleString + `
799-
)
800-
WHERE pl->'migration'->'clusterVersion' = $1::JSON;`
825+
var query string
826+
if m.settings.Version.IsActive(ctx, clusterversion.V23_1JobInfoTableIsBackfilled) {
827+
query = postJobInfoTableQuery
828+
} else {
829+
query = preJobInfoTableQuery
830+
}
801831
jsonMsg, err := protoreflect.MessageToJSON(&cv, protoreflect.FmtFlags{EmitDefaults: false})
802832
if err != nil {
803833
return false, 0, errors.Wrap(err, "failed to marshal version to JSON")

pkg/upgrade/upgrademanager/manager_external_test.go

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -155,7 +155,10 @@ func TestAlreadyRunningJobsAreHandledProperly(t *testing.T) {
155155
created_by_type,
156156
created_by_id,
157157
claim_session_id,
158-
claim_instance_id
158+
claim_instance_id,
159+
0,
160+
NULL,
161+
job_type
159162
FROM crdb_internal.system_jobs
160163
WHERE id = $1
161164
)

0 commit comments

Comments
 (0)