Skip to content

Commit 4369159

Browse files
authored
Merge pull request kubernetes#126227 from sanposhiho/queueing_hint_execution_duration_seconds
feature: support queueing_hint_execution_duration_seconds metric
2 parents bb350f7 + 2a51bd8 commit 4369159

File tree

3 files changed

+56
-2
lines changed

3 files changed

+56
-2
lines changed

pkg/scheduler/internal/queue/scheduling_queue.go

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -497,6 +497,7 @@ func (p *PriorityQueue) isPodWorthRequeuing(logger klog.Logger, pInfo *framework
497497
continue
498498
}
499499

500+
start := time.Now()
500501
hint, err := hintfn.QueueingHintFn(logger, pod, oldObj, newObj)
501502
if err != nil {
502503
// If the QueueingHintFn returned an error, we should treat the event as Queue so that we can prevent
@@ -509,6 +510,8 @@ func (p *PriorityQueue) isPodWorthRequeuing(logger klog.Logger, pInfo *framework
509510
}
510511
hint = framework.Queue
511512
}
513+
p.metricsRecorder.ObserveQueueingHintDurationAsync(hintfn.PluginName, event.Label, queueingHintToLabel(hint, err), metrics.SinceInSeconds(start))
514+
512515
if hint == framework.QueueSkip {
513516
continue
514517
}
@@ -536,6 +539,23 @@ func (p *PriorityQueue) isPodWorthRequeuing(logger klog.Logger, pInfo *framework
536539
return queueStrategy
537540
}
538541

542+
// queueingHintToLabel converts a hint and an error from QHint to a label string.
543+
func queueingHintToLabel(hint framework.QueueingHint, err error) string {
544+
if err != nil {
545+
return metrics.QueueingHintResultError
546+
}
547+
548+
switch hint {
549+
case framework.Queue:
550+
return metrics.QueueingHintResultQueue
551+
case framework.QueueSkip:
552+
return metrics.QueueingHintResultQueueSkip
553+
}
554+
555+
// Shouldn't reach here.
556+
return ""
557+
}
558+
539559
// runPreEnqueuePlugins iterates PreEnqueue function in each registered PreEnqueuePlugin.
540560
// It returns true if all PreEnqueue function run successfully; otherwise returns false
541561
// upon the first failure.

pkg/scheduler/metrics/metric_recorder.go

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -119,9 +119,19 @@ func NewMetricsAsyncRecorder(bufferSize int, interval time.Duration, stopCh <-ch
119119
// ObservePluginDurationAsync observes the plugin_execution_duration_seconds metric.
120120
// The metric will be flushed to Prometheus asynchronously.
121121
func (r *MetricAsyncRecorder) ObservePluginDurationAsync(extensionPoint, pluginName, status string, value float64) {
122+
r.observeMetricAsync(PluginExecutionDuration, value, pluginName, extensionPoint, status)
123+
}
124+
125+
// ObserveQueueingHintDurationAsync observes the queueing_hint_execution_duration_seconds metric.
126+
// The metric will be flushed to Prometheus asynchronously.
127+
func (r *MetricAsyncRecorder) ObserveQueueingHintDurationAsync(pluginName, event, hint string, value float64) {
128+
r.observeMetricAsync(queueingHintExecutionDuration, value, pluginName, event, hint)
129+
}
130+
131+
func (r *MetricAsyncRecorder) observeMetricAsync(m *metrics.HistogramVec, value float64, labelsValues ...string) {
122132
newMetric := &metric{
123-
metric: PluginExecutionDuration,
124-
labelValues: []string{pluginName, extensionPoint, status},
133+
metric: m,
134+
labelValues: labelsValues,
125135
value: value,
126136
}
127137
select {

pkg/scheduler/metrics/metrics.go

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,10 @@ import (
2020
"sync"
2121
"time"
2222

23+
utilfeature "k8s.io/apiserver/pkg/util/feature"
2324
"k8s.io/component-base/metrics"
2425
"k8s.io/component-base/metrics/legacyregistry"
26+
"k8s.io/kubernetes/pkg/features"
2527
volumebindingmetrics "k8s.io/kubernetes/pkg/scheduler/framework/plugins/volumebinding/metrics"
2628
)
2729

@@ -73,6 +75,12 @@ const (
7375
Permit = "Permit"
7476
)
7577

78+
const (
79+
QueueingHintResultQueue = "Queue"
80+
QueueingHintResultQueueSkip = "QueueSkip"
81+
QueueingHintResultError = "Error"
82+
)
83+
7684
// All the histogram based metrics have 1ms as size for the smallest bucket.
7785
var (
7886
scheduleAttempts = metrics.NewCounterVec(
@@ -198,6 +206,19 @@ var (
198206
},
199207
[]string{"plugin", "extension_point", "status"})
200208

209+
// This is only available when the QHint feature gate is enabled.
210+
queueingHintExecutionDuration = metrics.NewHistogramVec(
211+
&metrics.HistogramOpts{
212+
Subsystem: SchedulerSubsystem,
213+
Name: "queueing_hint_execution_duration_seconds",
214+
Help: "Duration for running a queueing hint function of a plugin.",
215+
// Start with 0.01ms with the last bucket being [~22ms, Inf). We use a small factor (1.5)
216+
// so that we have better granularity since plugin latency is very sensitive.
217+
Buckets: metrics.ExponentialBuckets(0.00001, 1.5, 20),
218+
StabilityLevel: metrics.ALPHA,
219+
},
220+
[]string{"plugin", "event", "hint"})
221+
201222
SchedulerQueueIncomingPods = metrics.NewCounterVec(
202223
&metrics.CounterOpts{
203224
Subsystem: SchedulerSubsystem,
@@ -269,6 +290,9 @@ func Register() {
269290
// Register the metrics.
270291
registerMetrics.Do(func() {
271292
RegisterMetrics(metricsList...)
293+
if utilfeature.DefaultFeatureGate.Enabled(features.SchedulerQueueingHints) {
294+
RegisterMetrics(queueingHintExecutionDuration)
295+
}
272296
volumebindingmetrics.RegisterVolumeSchedulingMetrics()
273297
})
274298
}

0 commit comments

Comments
 (0)