Skip to content

Commit 8b1e1a9

Browse files
craig[bot] and yuzefovich committed
Merge #152361
152361: jobs: remove PTS-related metrics for some jobs that don't interact with PTS r=yuzefovich a=yuzefovich This commit splits out the PTS-related metrics from the shared `JobTypeMetrics` struct in order to avoid generating PTS metrics for jobs that don't interact with the PTS system. The three stats-related job types and the IMPORT ROLLBACK job type no longer get PTS metrics, but I didn't audit job types not owned by Queries. Epic: CRDB-52656 Release note: None Co-authored-by: Yahor Yuzefovich <[email protected]>
2 parents ac1e576 + 33ebd60 commit 8b1e1a9

File tree

5 files changed

+45
-127
lines changed

5 files changed

+45
-127
lines changed

docs/generated/metrics/metrics.yaml

Lines changed: 0 additions & 108 deletions
Original file line numberDiff line numberDiff line change
@@ -4057,15 +4057,6 @@ layers:
40574057
unit: COUNT
40584058
aggregation: AVG
40594059
derivative: NONE
4060-
- name: jobs.auto_create_partial_stats.expired_pts_records
4061-
exported_name: jobs_auto_create_partial_stats_expired_pts_records
4062-
labeled_name: 'jobs.expired_pts_records{type: auto_create_partial_stats}'
4063-
description: Number of expired protected timestamp records owned by auto_create_partial_stats jobs
4064-
y_axis_label: records
4065-
type: COUNTER
4066-
unit: COUNT
4067-
aggregation: AVG
4068-
derivative: NON_NEGATIVE_DERIVATIVE
40694060
- name: jobs.auto_create_partial_stats.fail_or_cancel_completed
40704061
exported_name: jobs_auto_create_partial_stats_fail_or_cancel_completed
40714062
labeled_name: 'jobs.fail_or_cancel{name: auto_create_partial_stats, status: completed}'
@@ -4084,24 +4075,6 @@ layers:
40844075
unit: COUNT
40854076
aggregation: AVG
40864077
derivative: NON_NEGATIVE_DERIVATIVE
4087-
- name: jobs.auto_create_partial_stats.protected_age_sec
4088-
exported_name: jobs_auto_create_partial_stats_protected_age_sec
4089-
labeled_name: 'jobs.protected_age_sec{type: auto_create_partial_stats}'
4090-
description: The age of the oldest PTS record protected by auto_create_partial_stats jobs
4091-
y_axis_label: seconds
4092-
type: GAUGE
4093-
unit: SECONDS
4094-
aggregation: AVG
4095-
derivative: NONE
4096-
- name: jobs.auto_create_partial_stats.protected_record_count
4097-
exported_name: jobs_auto_create_partial_stats_protected_record_count
4098-
labeled_name: 'jobs.protected_record_count{type: auto_create_partial_stats}'
4099-
description: Number of protected timestamp records held by auto_create_partial_stats jobs
4100-
y_axis_label: records
4101-
type: GAUGE
4102-
unit: COUNT
4103-
aggregation: AVG
4104-
derivative: NONE
41054078
- name: jobs.auto_create_partial_stats.resume_completed
41064079
exported_name: jobs_auto_create_partial_stats_resume_completed
41074080
labeled_name: 'jobs.resume{name: auto_create_partial_stats, status: completed}'
@@ -4129,15 +4102,6 @@ layers:
41294102
unit: COUNT
41304103
aggregation: AVG
41314104
derivative: NONE
4132-
- name: jobs.auto_create_stats.expired_pts_records
4133-
exported_name: jobs_auto_create_stats_expired_pts_records
4134-
labeled_name: 'jobs.expired_pts_records{type: auto_create_stats}'
4135-
description: Number of expired protected timestamp records owned by auto_create_stats jobs
4136-
y_axis_label: records
4137-
type: COUNTER
4138-
unit: COUNT
4139-
aggregation: AVG
4140-
derivative: NON_NEGATIVE_DERIVATIVE
41414105
- name: jobs.auto_create_stats.fail_or_cancel_completed
41424106
exported_name: jobs_auto_create_stats_fail_or_cancel_completed
41434107
labeled_name: 'jobs.fail_or_cancel{name: auto_create_stats, status: completed}'
@@ -4156,24 +4120,6 @@ layers:
41564120
unit: COUNT
41574121
aggregation: AVG
41584122
derivative: NON_NEGATIVE_DERIVATIVE
4159-
- name: jobs.auto_create_stats.protected_age_sec
4160-
exported_name: jobs_auto_create_stats_protected_age_sec
4161-
labeled_name: 'jobs.protected_age_sec{type: auto_create_stats}'
4162-
description: The age of the oldest PTS record protected by auto_create_stats jobs
4163-
y_axis_label: seconds
4164-
type: GAUGE
4165-
unit: SECONDS
4166-
aggregation: AVG
4167-
derivative: NONE
4168-
- name: jobs.auto_create_stats.protected_record_count
4169-
exported_name: jobs_auto_create_stats_protected_record_count
4170-
labeled_name: 'jobs.protected_record_count{type: auto_create_stats}'
4171-
description: Number of protected timestamp records held by auto_create_stats jobs
4172-
y_axis_label: records
4173-
type: GAUGE
4174-
unit: COUNT
4175-
aggregation: AVG
4176-
derivative: NONE
41774123
- name: jobs.auto_create_stats.resume_completed
41784124
exported_name: jobs_auto_create_stats_resume_completed
41794125
labeled_name: 'jobs.resume{name: auto_create_stats, status: completed}'
@@ -4776,15 +4722,6 @@ layers:
47764722
unit: COUNT
47774723
aggregation: AVG
47784724
derivative: NONE
4779-
- name: jobs.create_stats.expired_pts_records
4780-
exported_name: jobs_create_stats_expired_pts_records
4781-
labeled_name: 'jobs.expired_pts_records{type: create_stats}'
4782-
description: Number of expired protected timestamp records owned by create_stats jobs
4783-
y_axis_label: records
4784-
type: COUNTER
4785-
unit: COUNT
4786-
aggregation: AVG
4787-
derivative: NON_NEGATIVE_DERIVATIVE
47884725
- name: jobs.create_stats.fail_or_cancel_completed
47894726
exported_name: jobs_create_stats_fail_or_cancel_completed
47904727
labeled_name: 'jobs.fail_or_cancel{name: create_stats, status: completed}'
@@ -4803,24 +4740,6 @@ layers:
48034740
unit: COUNT
48044741
aggregation: AVG
48054742
derivative: NON_NEGATIVE_DERIVATIVE
4806-
- name: jobs.create_stats.protected_age_sec
4807-
exported_name: jobs_create_stats_protected_age_sec
4808-
labeled_name: 'jobs.protected_age_sec{type: create_stats}'
4809-
description: The age of the oldest PTS record protected by create_stats jobs
4810-
y_axis_label: seconds
4811-
type: GAUGE
4812-
unit: SECONDS
4813-
aggregation: AVG
4814-
derivative: NONE
4815-
- name: jobs.create_stats.protected_record_count
4816-
exported_name: jobs_create_stats_protected_record_count
4817-
labeled_name: 'jobs.protected_record_count{type: create_stats}'
4818-
description: Number of protected timestamp records held by create_stats jobs
4819-
y_axis_label: records
4820-
type: GAUGE
4821-
unit: COUNT
4822-
aggregation: AVG
4823-
derivative: NONE
48244743
- name: jobs.create_stats.resume_completed
48254744
exported_name: jobs_create_stats_resume_completed
48264745
labeled_name: 'jobs.resume{name: create_stats, status: completed}'
@@ -5172,15 +5091,6 @@ layers:
51725091
unit: COUNT
51735092
aggregation: AVG
51745093
derivative: NONE
5175-
- name: jobs.import_rollback.expired_pts_records
5176-
exported_name: jobs_import_rollback_expired_pts_records
5177-
labeled_name: 'jobs.expired_pts_records{type: import_rollback}'
5178-
description: Number of expired protected timestamp records owned by import_rollback jobs
5179-
y_axis_label: records
5180-
type: COUNTER
5181-
unit: COUNT
5182-
aggregation: AVG
5183-
derivative: NON_NEGATIVE_DERIVATIVE
51845094
- name: jobs.import_rollback.fail_or_cancel_completed
51855095
exported_name: jobs_import_rollback_fail_or_cancel_completed
51865096
labeled_name: 'jobs.fail_or_cancel{name: import_rollback, status: completed}'
@@ -5199,24 +5109,6 @@ layers:
51995109
unit: COUNT
52005110
aggregation: AVG
52015111
derivative: NON_NEGATIVE_DERIVATIVE
5202-
- name: jobs.import_rollback.protected_age_sec
5203-
exported_name: jobs_import_rollback_protected_age_sec
5204-
labeled_name: 'jobs.protected_age_sec{type: import_rollback}'
5205-
description: The age of the oldest PTS record protected by import_rollback jobs
5206-
y_axis_label: seconds
5207-
type: GAUGE
5208-
unit: SECONDS
5209-
aggregation: AVG
5210-
derivative: NONE
5211-
- name: jobs.import_rollback.protected_record_count
5212-
exported_name: jobs_import_rollback_protected_record_count
5213-
labeled_name: 'jobs.protected_record_count{type: import_rollback}'
5214-
description: Number of protected timestamp records held by import_rollback jobs
5215-
y_axis_label: records
5216-
type: GAUGE
5217-
unit: COUNT
5218-
aggregation: AVG
5219-
derivative: NONE
52205112
- name: jobs.import_rollback.resume_completed
52215113
exported_name: jobs_import_rollback_resume_completed
52225114
labeled_name: 'jobs.resume{name: import_rollback, status: completed}'

pkg/jobs/jobs_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3216,7 +3216,7 @@ func TestJobTypeMetrics(t *testing.T) {
32163216

32173217
checkPTSCounts := func(typ jobspb.Type, count int64) {
32183218
testutils.SucceedsSoon(t, func() error {
3219-
m := reg.MetricsStruct().JobMetrics[typ]
3219+
m := reg.MetricsStruct().JobPTSMetrics[typ]
32203220
if m.NumJobsWithPTS.Value() == count && (count == 0 || m.ProtectedAge.Value() > 0) {
32213221
return nil
32223222
}

pkg/jobs/metrics.go

Lines changed: 40 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,8 @@ import (
1919

2020
// Metrics are for production monitoring of each job type.
2121
type Metrics struct {
22-
JobMetrics [jobspb.NumJobTypes]*JobTypeMetrics
22+
JobMetrics [jobspb.NumJobTypes]*JobTypeMetrics
23+
JobPTSMetrics [jobspb.NumJobTypes]*JobTypePTSMetrics
2324

2425
// JobSpecificMetrics contains a list of job specific metrics, registered when
2526
// the job was registered with the system. Prior to this array, job
@@ -64,14 +65,21 @@ type JobTypeMetrics struct {
6465
ResumeFailed *metric.Counter
6566
FailOrCancelCompleted *metric.Counter
6667
FailOrCancelRetryError *metric.Counter
68+
}
69+
70+
// MetricStruct implements the metric.Struct interface.
71+
func (JobTypeMetrics) MetricStruct() {}
6772

73+
// JobTypePTSMetrics is a metric.Struct containing PTS-specific metrics for a
74+
// job type.
75+
type JobTypePTSMetrics struct {
6876
NumJobsWithPTS *metric.Gauge
6977
ExpiredPTS *metric.Counter
7078
ProtectedAge *metric.Gauge
7179
}
7280

7381
// MetricStruct implements the metric.Struct interface.
74-
func (JobTypeMetrics) MetricStruct() {}
82+
func (JobTypePTSMetrics) MetricStruct() {}
7583

7684
func typeToString(jobType jobspb.Type) string {
7785
return strings.ToLower(strings.Replace(jobType.String(), " ", "_", -1))
@@ -413,9 +421,13 @@ func (m *Metrics) init(histogramWindowInterval time.Duration, lookup *cidr.Looku
413421
ResumeFailed: metric.NewCounter(makeMetaResumeFailed(jt)),
414422
FailOrCancelCompleted: metric.NewCounter(makeMetaFailOrCancelCompeted(jt)),
415423
FailOrCancelRetryError: metric.NewCounter(makeMetaFailOrCancelRetryError(jt)),
416-
NumJobsWithPTS: metric.NewGauge(makeMetaProtectedCount(jt)),
417-
ExpiredPTS: metric.NewCounter(makeMetaExpiredPTS(jt)),
418-
ProtectedAge: metric.NewGauge(makeMetaProtectedAge(jt)),
424+
}
425+
if interactsWithPTS(jt) {
426+
m.JobPTSMetrics[jt] = &JobTypePTSMetrics{
427+
NumJobsWithPTS: metric.NewGauge(makeMetaProtectedCount(jt)),
428+
ExpiredPTS: metric.NewCounter(makeMetaExpiredPTS(jt)),
429+
ProtectedAge: metric.NewGauge(makeMetaProtectedAge(jt)),
430+
}
419431
}
420432

421433
if opts, ok := getRegisterOptions(jt); ok {
@@ -429,6 +441,29 @@ func (m *Metrics) init(histogramWindowInterval time.Duration, lookup *cidr.Looku
429441
}
430442
}
431443

444+
// interactsWithPTS returns false when the given job is guaranteed to not
445+
// interact with the PTS system.
446+
func interactsWithPTS(jt jobspb.Type) bool {
447+
switch jt {
448+
case jobspb.TypeImport:
449+
// Note that even though the IMPORT jobs as of 25.4 do not lay protected
450+
// timestamps, we have plans to do so (see #91151), so we'll report that
451+
// IMPORTs do interact with the PTS system.
452+
return true
453+
case jobspb.TypeCreateStats, jobspb.TypeAutoCreateStats, jobspb.TypeAutoCreatePartialStats:
454+
// None of the stats jobs interact with the PTS system.
455+
return false
456+
case jobspb.TypeImportRollback:
457+
// IMPORT ROLLBACK job is used to roll back the table for the online
458+
// restore and to bring it back online. It doesn't interact with the PTS
459+
// system.
460+
return false
461+
default:
462+
// TODO(yuzefovich): other job types should be audited.
463+
return true
464+
}
465+
}
466+
432467
// MakeChangefeedMetricsHook allows for registration of changefeed metrics from
433468
// ccl code.
434469
var MakeChangefeedMetricsHook func(time.Duration, *cidr.Lookup) metric.Struct

pkg/jobs/metricspoller/job_statistics.go

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -253,7 +253,10 @@ func updateJobPTSMetrics(
253253
if jobspb.Type(typ) == jobspb.TypeUnspecified { // do not track TypeUnspecified
254254
continue
255255
}
256-
m := jobMetrics.JobMetrics[typ]
256+
m := jobMetrics.JobPTSMetrics[typ]
257+
if m == nil { // this job doesn't interact with PTS system
258+
continue
259+
}
257260
stats, found := ptsStats[jobspb.Type(typ)]
258261
if found {
259262
m.NumJobsWithPTS.Update(stats.numRecords)

pkg/roachprod/opentelemetry/cockroachdb_metrics.go

Lines changed: 0 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -660,22 +660,16 @@ var cockroachdbMetrics = map[string]string{
660660
"jobs_auto_create_stats_currently_idle": "jobs.auto_create_stats.currently_idle",
661661
"jobs_auto_create_stats_currently_paused": "jobs.auto.create.stats.currently_paused",
662662
"jobs_auto_create_stats_currently_running": "jobs.auto.create.stats.currently_running",
663-
"jobs_auto_create_stats_expired_pts_records": "jobs.auto_create_stats.expired_pts_records",
664663
"jobs_auto_create_stats_fail_or_cancel_completed": "jobs.auto_create_stats.fail_or_cancel_completed",
665664
"jobs_auto_create_stats_fail_or_cancel_retry_error": "jobs.auto_create_stats.fail_or_cancel_retry_error",
666-
"jobs_auto_create_stats_protected_age_sec": "jobs.auto_create_stats.protected_age_sec",
667-
"jobs_auto_create_stats_protected_record_count": "jobs.auto_create_stats.protected_record_count",
668665
"jobs_auto_create_stats_resume_completed": "jobs.auto_create_stats.resume_completed",
669666
"jobs_auto_create_stats_resume_failed": "jobs.auto.create.stats.resume_failed",
670667
"jobs_auto_create_stats_resume_retry_error": "jobs.auto_create_stats.resume_retry_error",
671668
"jobs_auto_create_partial_stats_currently_idle": "jobs.auto_create_partial_stats.currently_idle",
672669
"jobs_auto_create_partial_stats_currently_paused": "jobs.auto_create_partial_stats.currently_paused",
673670
"jobs_auto_create_partial_stats_currently_running": "jobs.auto_create_partial_stats.currently_running",
674-
"jobs_auto_create_partial_stats_expired_pts_records": "jobs.auto_create_partial_stats.expired_pts_records",
675671
"jobs_auto_create_partial_stats_fail_or_cancel_completed": "jobs.auto_create_partial_stats.fail_or_cancel_completed",
676672
"jobs_auto_create_partial_stats_fail_or_cancel_retry_error": "jobs.auto_create_partial_stats.fail_or_cancel_retry_error",
677-
"jobs_auto_create_partial_stats_protected_age_sec": "jobs.auto_create_partial_stats.protected_age_sec",
678-
"jobs_auto_create_partial_stats_protected_record_count": "jobs.auto_create_partial_stats.protected_record_count",
679673
"jobs_auto_create_partial_stats_resume_completed": "jobs.auto_create_partial_stats.resume_completed",
680674
"jobs_auto_create_partial_stats_resume_failed": "jobs.auto_create_partial_stats.resume_failed",
681675
"jobs_auto_create_partial_stats_resume_retry_error": "jobs.auto_create_partial_stats.resume_retry_error",
@@ -749,11 +743,8 @@ var cockroachdbMetrics = map[string]string{
749743
"jobs_create_stats_currently_idle": "jobs.create_stats.currently_idle",
750744
"jobs_create_stats_currently_paused": "jobs.create_stats.currently_paused",
751745
"jobs_create_stats_currently_running": "jobs.create.stats.currently_running",
752-
"jobs_create_stats_expired_pts_records": "jobs.create_stats.expired_pts_records",
753746
"jobs_create_stats_fail_or_cancel_completed": "jobs.create_stats.fail_or_cancel_completed",
754747
"jobs_create_stats_fail_or_cancel_retry_error": "jobs.create_stats.fail_or_cancel_retry_error",
755-
"jobs_create_stats_protected_age_sec": "jobs.create_stats.protected_age_sec",
756-
"jobs_create_stats_protected_record_count": "jobs.create_stats.protected_record_count",
757748
"jobs_create_stats_resume_completed": "jobs.create_stats.resume_completed",
758749
"jobs_create_stats_resume_failed": "jobs.create_stats.resume_failed",
759750
"jobs_create_stats_resume_retry_error": "jobs.create_stats.resume_retry_error",
@@ -782,11 +773,8 @@ var cockroachdbMetrics = map[string]string{
782773
"jobs_import_rollback_currently_idle": "jobs.import_rollback.currently_idle",
783774
"jobs_import_rollback_currently_paused": "jobs.import_rollback.currently_paused",
784775
"jobs_import_rollback_currently_running": "jobs.import_rollback.currently_running",
785-
"jobs_import_rollback_expired_pts_records": "jobs.import_rollback.expired_pts_records",
786776
"jobs_import_rollback_fail_or_cancel_completed": "jobs.import_rollback.fail_or_cancel_completed",
787777
"jobs_import_rollback_fail_or_cancel_retry_error": "jobs.import_rollback.fail_or_cancel_retry_error",
788-
"jobs_import_rollback_protected_age_sec": "jobs.import_rollback.protected_age_sec",
789-
"jobs_import_rollback_protected_record_count": "jobs.import_rollback.protected_record_count",
790778
"jobs_import_rollback_resume_completed": "jobs.import_rollback.resume_completed",
791779
"jobs_import_rollback_resume_failed": "jobs.import_rollback.resume_failed",
792780
"jobs_import_rollback_resume_retry_error": "jobs.import_rollback.resume_retry_error",

0 commit comments

Comments
 (0)