Skip to content

Commit a081d93

Browse files
committed
jobs: mark a few auto partial stats metrics as essential
To match the metrics that are already marked as "essential" for AUTO CREATE STATS jobs, the following AUTO CREATE PARTIAL STATS job metrics are now also marked as essential:
- `jobs.auto_create_partial_stats.currently_paused`
- `jobs.auto_create_partial_stats.currently_running`
- `jobs.auto_create_partial_stats.resume_failed`

Release note: None
1 parent a10bae0 commit a081d93

File tree

3 files changed

+57
-39
lines changed

3 files changed

+57
-39
lines changed

docs/generated/metrics/metrics.yaml

Lines changed: 34 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -330,6 +330,39 @@ layers:
330330
essential: true
331331
- name: SQL
332332
metrics:
333+
- name: jobs.auto_create_partial_stats.currently_paused
334+
exported_name: jobs_auto_create_partial_stats_currently_paused
335+
labeled_name: 'jobs{name: auto_create_partial_stats, status: currently_paused}'
336+
description: Number of auto_create_partial_stats jobs currently considered Paused
337+
y_axis_label: jobs
338+
type: GAUGE
339+
unit: COUNT
340+
aggregation: AVG
341+
derivative: NONE
342+
how_to_use: This metric is a high-level indicator that automatically generated partial statistics jobs are paused which can lead to the query optimizer running with stale statistics. Stale statistics can cause suboptimal query plans to be selected leading to poor query performance.
343+
essential: true
344+
- name: jobs.auto_create_partial_stats.currently_running
345+
exported_name: jobs_auto_create_partial_stats_currently_running
346+
labeled_name: 'jobs{type: auto_create_partial_stats, status: currently_running}'
347+
description: Number of auto_create_partial_stats jobs currently running in Resume or OnFailOrCancel state
348+
y_axis_label: jobs
349+
type: GAUGE
350+
unit: COUNT
351+
aggregation: AVG
352+
derivative: NONE
353+
how_to_use: This metric tracks the number of active automatically generated partial statistics jobs that could also be consuming resources. Ensure that foreground SQL traffic is not impacted by correlating this metric with SQL latency and query volume metrics.
354+
essential: true
355+
- name: jobs.auto_create_partial_stats.resume_failed
356+
exported_name: jobs_auto_create_partial_stats_resume_failed
357+
labeled_name: 'jobs.resume{name: auto_create_partial_stats, status: failed}'
358+
description: Number of auto_create_partial_stats jobs which failed with a non-retriable error
359+
y_axis_label: jobs
360+
type: COUNTER
361+
unit: COUNT
362+
aggregation: AVG
363+
derivative: NON_NEGATIVE_DERIVATIVE
364+
how_to_use: This metric is a high-level indicator that automatically generated partial table statistics is failing. Failed statistic creation can lead to the query optimizer running with stale statistics. Stale statistics can cause suboptimal query plans to be selected leading to poor query performance.
365+
essential: true
333366
- name: jobs.auto_create_stats.currently_paused
334367
exported_name: jobs_auto_create_stats_currently_paused
335368
labeled_name: 'jobs{name: auto_create_stats, status: currently_paused}'
@@ -394,7 +427,7 @@ layers:
394427
unit: COUNT
395428
aggregation: AVG
396429
derivative: NONE
397-
how_to_use: This metric tracks the number of active create statistics jobs that may be consuming resources. Ensure that foreground SQL traffic is not impacted by correlating this metric with SQL latency and query volume metrics.
430+
how_to_use: This metric tracks the number of active create statistics jobs that could also be consuming resources. Ensure that foreground SQL traffic is not impacted by correlating this metric with SQL latency and query volume metrics.
398431
essential: true
399432
- name: schedules.BACKUP.failed
400433
exported_name: schedules_BACKUP_failed
@@ -4055,24 +4088,6 @@ layers:
40554088
unit: COUNT
40564089
aggregation: AVG
40574090
derivative: NONE
4058-
- name: jobs.auto_create_partial_stats.currently_paused
4059-
exported_name: jobs_auto_create_partial_stats_currently_paused
4060-
labeled_name: 'jobs{name: auto_create_partial_stats, status: currently_paused}'
4061-
description: Number of auto_create_partial_stats jobs currently considered Paused
4062-
y_axis_label: jobs
4063-
type: GAUGE
4064-
unit: COUNT
4065-
aggregation: AVG
4066-
derivative: NONE
4067-
- name: jobs.auto_create_partial_stats.currently_running
4068-
exported_name: jobs_auto_create_partial_stats_currently_running
4069-
labeled_name: 'jobs{type: auto_create_partial_stats, status: currently_running}'
4070-
description: Number of auto_create_partial_stats jobs currently running in Resume or OnFailOrCancel state
4071-
y_axis_label: jobs
4072-
type: GAUGE
4073-
unit: COUNT
4074-
aggregation: AVG
4075-
derivative: NONE
40764091
- name: jobs.auto_create_partial_stats.expired_pts_records
40774092
exported_name: jobs_auto_create_partial_stats_expired_pts_records
40784093
labeled_name: 'jobs.expired_pts_records{type: auto_create_partial_stats}'
@@ -4136,15 +4151,6 @@ layers:
41364151
unit: COUNT
41374152
aggregation: AVG
41384153
derivative: NON_NEGATIVE_DERIVATIVE
4139-
- name: jobs.auto_create_partial_stats.resume_failed
4140-
exported_name: jobs_auto_create_partial_stats_resume_failed
4141-
labeled_name: 'jobs.resume{name: auto_create_partial_stats, status: failed}'
4142-
description: Number of auto_create_partial_stats jobs which failed with a non-retriable error
4143-
y_axis_label: jobs
4144-
type: COUNTER
4145-
unit: COUNT
4146-
aggregation: AVG
4147-
derivative: NON_NEGATIVE_DERIVATIVE
41484154
- name: jobs.auto_create_partial_stats.resume_retry_error
41494155
exported_name: jobs_auto_create_partial_stats_resume_retry_error
41504156
labeled_name: 'jobs.resume{name: auto_create_partial_stats, status: retry_error}'

pkg/jobs/metrics.go

Lines changed: 22 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -97,14 +97,18 @@ func makeMetaCurrentlyRunning(jt jobspb.Type) metric.Metadata {
9797
}
9898

9999
switch jt {
100-
case jobspb.TypeAutoCreateStats:
100+
case jobspb.TypeCreateStats, jobspb.TypeAutoCreateStats, jobspb.TypeAutoCreatePartialStats:
101101
m.Essential = true
102102
m.Category = metric.Metadata_SQL
103-
m.HowToUse = `This metric tracks the number of active automatically generated statistics jobs that could also be consuming resources. Ensure that foreground SQL traffic is not impacted by correlating this metric with SQL latency and query volume metrics.`
104-
case jobspb.TypeCreateStats:
105-
m.Essential = true
106-
m.Category = metric.Metadata_SQL
107-
m.HowToUse = `This metric tracks the number of active create statistics jobs that may be consuming resources. Ensure that foreground SQL traffic is not impacted by correlating this metric with SQL latency and query volume metrics.`
103+
var detail string
104+
if jt == jobspb.TypeCreateStats {
105+
detail = "create"
106+
} else if jt == jobspb.TypeAutoCreateStats {
107+
detail = "automatically generated"
108+
} else {
109+
detail = "automatically generated partial"
110+
}
111+
m.HowToUse = fmt.Sprintf(`This metric tracks the number of active %s statistics jobs that could also be consuming resources. Ensure that foreground SQL traffic is not impacted by correlating this metric with SQL latency and query volume metrics.`, detail)
108112
case jobspb.TypeBackup:
109113
m.Essential = true
110114
m.Category = metric.Metadata_SQL
@@ -151,10 +155,14 @@ func makeMetaCurrentlyPaused(jt jobspb.Type) metric.Metadata {
151155
),
152156
}
153157
switch jt {
154-
case jobspb.TypeAutoCreateStats:
158+
case jobspb.TypeAutoCreateStats, jobspb.TypeAutoCreatePartialStats:
155159
m.Essential = true
156160
m.Category = metric.Metadata_SQL
157-
m.HowToUse = `This metric is a high-level indicator that automatically generated statistics jobs are paused which can lead to the query optimizer running with stale statistics. Stale statistics can cause suboptimal query plans to be selected leading to poor query performance.`
161+
var partialDetail string
162+
if jt == jobspb.TypeAutoCreatePartialStats {
163+
partialDetail = "partial "
164+
}
165+
m.HowToUse = fmt.Sprintf(`This metric is a high-level indicator that automatically generated %sstatistics jobs are paused which can lead to the query optimizer running with stale statistics. Stale statistics can cause suboptimal query plans to be selected leading to poor query performance.`, partialDetail)
158166
case jobspb.TypeBackup:
159167
m.Essential = true
160168
m.Category = metric.Metadata_SQL
@@ -230,10 +238,14 @@ func makeMetaResumeFailed(jt jobspb.Type) metric.Metadata {
230238
}
231239

232240
switch jt {
233-
case jobspb.TypeAutoCreateStats:
241+
case jobspb.TypeAutoCreateStats, jobspb.TypeAutoCreatePartialStats:
234242
m.Essential = true
235243
m.Category = metric.Metadata_SQL
236-
m.HowToUse = `This metric is a high-level indicator that automatically generated table statistics is failing. Failed statistic creation can lead to the query optimizer running with stale statistics. Stale statistics can cause suboptimal query plans to be selected leading to poor query performance.`
244+
var partialDetail string
245+
if jt == jobspb.TypeAutoCreatePartialStats {
246+
partialDetail = "partial "
247+
}
248+
m.HowToUse = fmt.Sprintf(`This metric is a high-level indicator that automatically generated %stable statistics is failing. Failed statistic creation can lead to the query optimizer running with stale statistics. Stale statistics can cause suboptimal query plans to be selected leading to poor query performance.`, partialDetail)
237249
case jobspb.TypeRowLevelTTL:
238250
m.Essential = true
239251
m.Category = metric.Metadata_TTL

pkg/sql/mem_metrics.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,7 @@ func MakeBaseMemMetrics(endpoint string, histogramWindow time.Duration) BaseMemo
8787
MetaMemMaxBytes := makeMemMetricMetadata(prefix+".max", "Memory usage per sql statement for "+endpoint)
8888
MetaMemCurBytes := makeMemMetricMetadata(prefix+".current", "Current sql statement memory usage for "+endpoint)
8989

90-
// Add Essential flag and category if this is the 'root' endpoint
90+
// Add Essential flag and category if this is the 'root' endpoint.
9191
if endpoint == "root" {
9292
MetaMemCurBytes.Essential = true
9393
MetaMemCurBytes.Category = metric.Metadata_SQL

0 commit comments

Comments
 (0)