Skip to content

Commit 5308098

Browse files
SLO Aware Routing Plugins Only (#1849)
* Add all slo aware routing plugins, no integration changes * Add metrics required for plugins to compile * Small scorer changes * Unexport fields not used outside package, consolidate gauge and counter metrics for prediction * Fix lints * Break out larger predictor functions into helpers, switch to using bulk prediction, add bulk prediction tests
1 parent 2aaf2a6 commit 5308098

20 files changed

+4792
-0
lines changed

pkg/epp/metrics/metrics.go

Lines changed: 257 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,104 @@ var (
6363
[]string{"model_name", "target_model_name", "error_code"},
6464
)
6565

66+
// Gauge for various inference request metrics
67+
inferenceGauges = prometheus.NewGaugeVec(
68+
prometheus.GaugeOpts{
69+
Subsystem: InferenceObjectiveComponent,
70+
Name: "inference_request_metric",
71+
Help: metricsutil.HelpMsgWithStability("Consolidated gauge for various inference request metrics including TTFT, TPOT, SLOs, and prediction durations.", compbasemetrics.ALPHA),
72+
},
73+
[]string{"model_name", "target_model_name", "type"},
74+
)
75+
76+
requestTTFT = prometheus.NewHistogramVec(
77+
prometheus.HistogramOpts{
78+
Subsystem: InferenceObjectiveComponent,
79+
Name: "request_ttft_seconds",
80+
Help: metricsutil.HelpMsgWithStability("Inference model TTFT distribution in seconds for each model and target model.", compbasemetrics.ALPHA),
81+
Buckets: []float64{
82+
0.005, 0.025, 0.05, 0.1, 0.2, 0.4, 0.6, 0.8, 1.0, 1.25, 1.5, 2, 3,
83+
4, 5, 6, 8, 10, 15, 20, 30, 45, 60, 120, 180, 240, 300, 360, 480, 600, 900, 1200, 1800, 2700, 3600,
84+
},
85+
},
86+
[]string{"model_name", "target_model_name"},
87+
)
88+
89+
requestPredictedTTFT = prometheus.NewHistogramVec(
90+
prometheus.HistogramOpts{
91+
Subsystem: InferenceObjectiveComponent,
92+
Name: "request_predicted_ttft_seconds",
93+
Help: metricsutil.HelpMsgWithStability("Inference model Predicted TTFT distribution in seconds for each model and target model.", compbasemetrics.ALPHA),
94+
Buckets: []float64{
95+
0.005, 0.025, 0.05, 0.1, 0.2, 0.4, 0.6, 0.8, 1.0, 1.25, 1.5, 2, 3,
96+
4, 5, 6, 8, 10, 15, 20, 30, 45, 60, 120, 180, 240, 300, 360, 480, 600, 900, 1200, 1800, 2700, 3600,
97+
},
98+
},
99+
[]string{"model_name", "target_model_name"},
100+
)
101+
102+
// New metrics for TTFT prediction duration
103+
requestTTFTPredictionDuration = prometheus.NewHistogramVec(
104+
prometheus.HistogramOpts{
105+
Subsystem: InferenceObjectiveComponent,
106+
Name: "request_ttft_prediction_duration_seconds",
107+
Help: metricsutil.HelpMsgWithStability("Duration taken to generate TTFT predictions in seconds for each model and target model.", compbasemetrics.ALPHA),
108+
Buckets: []float64{
109+
0.0001, 0.0005, 0.001, 0.002, 0.005, 0.01, 0.02, 0.05, 0.1, 0.2, 0.5, 1.0, 2.0, 5.0,
110+
},
111+
},
112+
[]string{"model_name", "target_model_name"},
113+
)
114+
115+
requestTPOT = prometheus.NewHistogramVec(
116+
prometheus.HistogramOpts{
117+
Subsystem: InferenceObjectiveComponent,
118+
Name: "request_tpot_seconds",
119+
Help: metricsutil.HelpMsgWithStability("Inference model TPOT distribution in seconds for each model and target model.", compbasemetrics.ALPHA),
120+
Buckets: []float64{
121+
0.0005, 0.00205, 0.005, 0.01, 0.02, 0.04, 0.06, 0.08, 0.1, 0.125, 0.15, 0.2, 0.3,
122+
0.4, 0.5, 0.6, 0.8, 1, 1.5, 2, 3, 4.5, 6, 12, 18, 24, 30, 36, 48, 60, 90, 120, 180, 270, 360,
123+
},
124+
},
125+
[]string{"model_name", "target_model_name"},
126+
)
127+
128+
requestPredictedTPOT = prometheus.NewHistogramVec(
129+
prometheus.HistogramOpts{
130+
Subsystem: InferenceObjectiveComponent,
131+
Name: "request_predicted_tpot_seconds",
132+
Help: metricsutil.HelpMsgWithStability("Inference model Predicted TPOT distribution in seconds for each model and target model.", compbasemetrics.ALPHA),
133+
Buckets: []float64{
134+
0.0005, 0.00205, 0.005, 0.01, 0.02, 0.04, 0.06, 0.08, 0.1, 0.125, 0.15, 0.2, 0.3,
135+
0.4, 0.5, 0.6, 0.8, 1, 1.5, 2, 3, 4.5, 6, 12, 18, 24, 30, 36, 48, 60, 90, 120, 180, 270, 360,
136+
},
137+
},
138+
[]string{"model_name", "target_model_name"},
139+
)
140+
141+
// New metrics for TPOT prediction duration
142+
requestTPOTPredictionDuration = prometheus.NewHistogramVec(
143+
prometheus.HistogramOpts{
144+
Subsystem: InferenceObjectiveComponent,
145+
Name: "request_tpot_prediction_duration_seconds",
146+
Help: metricsutil.HelpMsgWithStability("Duration taken to generate TPOT predictions in seconds for each model and target model.", compbasemetrics.ALPHA),
147+
Buckets: []float64{
148+
0.0001, 0.0005, 0.001, 0.002, 0.005, 0.01, 0.02, 0.05, 0.1, 0.2, 0.5, 1.0, 2.0, 5.0,
149+
},
150+
},
151+
[]string{"model_name", "target_model_name"},
152+
)
153+
154+
// Counter for SLO Violations
155+
sloViolationCounter = prometheus.NewCounterVec(
156+
prometheus.CounterOpts{
157+
Subsystem: InferenceObjectiveComponent,
158+
Name: "request_slo_violation_total",
159+
Help: metricsutil.HelpMsgWithStability("Counter of SLO violations for each model, target model, and violation type.", compbasemetrics.ALPHA),
160+
},
161+
[]string{"model_name", "target_model_name", "type"},
162+
)
163+
66164
requestLatencies = prometheus.NewHistogramVec(
67165
prometheus.HistogramOpts{
68166
Subsystem: InferenceObjectiveComponent,
@@ -282,6 +380,21 @@ var registerMetrics sync.Once
282380
// Register all metrics.
283381
func Register(customCollectors ...prometheus.Collector) {
284382
registerMetrics.Do(func() {
383+
// Register inference gauges
384+
metrics.Registry.MustRegister(inferenceGauges)
385+
386+
// Register Histograms
387+
metrics.Registry.MustRegister(requestTPOT)
388+
metrics.Registry.MustRegister(requestTTFT)
389+
metrics.Registry.MustRegister(requestPredictedTPOT)
390+
metrics.Registry.MustRegister(requestPredictedTTFT)
391+
metrics.Registry.MustRegister(requestTPOTPredictionDuration)
392+
metrics.Registry.MustRegister(requestTTFTPredictionDuration)
393+
394+
// Register SLO violation counters
395+
metrics.Registry.MustRegister(sloViolationCounter)
396+
397+
// Register other metrics
285398
metrics.Registry.MustRegister(requestCounter)
286399
metrics.Registry.MustRegister(requestErrCounter)
287400
metrics.Registry.MustRegister(requestLatencies)
@@ -311,6 +424,21 @@ func Register(customCollectors ...prometheus.Collector) {
311424

312425
// Just for integration test
313426
func Reset() {
427+
// Reset inference gauges
428+
inferenceGauges.Reset()
429+
430+
// Reset Histograms
431+
requestTPOT.Reset()
432+
requestTTFT.Reset()
433+
requestPredictedTPOT.Reset()
434+
requestPredictedTTFT.Reset()
435+
requestTPOTPredictionDuration.Reset()
436+
requestTTFTPredictionDuration.Reset()
437+
438+
// Reset SLO violation counter
439+
sloViolationCounter.Reset()
440+
441+
// Reset other metrics
314442
requestCounter.Reset()
315443
requestErrCounter.Reset()
316444
requestLatencies.Reset()
@@ -363,6 +491,123 @@ func RecordRequestLatencies(ctx context.Context, modelName, targetModelName stri
363491
return true
364492
}
365493

494+
func RecordRequestTPOT(ctx context.Context, modelName, targetModelName string, tpot float64) bool {
495+
if tpot < 0 {
496+
log.FromContext(ctx).V(logutil.DEFAULT).Error(nil, "TPOT value must be non-negative",
497+
"modelName", modelName, "targetModelName", targetModelName, "tpot", tpot)
498+
return false
499+
}
500+
requestTPOT.WithLabelValues(modelName, targetModelName).Observe(tpot)
501+
inferenceGauges.With(prometheus.Labels{"model_name": modelName, "target_model_name": targetModelName, "type": "tpot"}).Set(tpot)
502+
return true
503+
}
504+
505+
// RecordRequestTPOTWithSLO records TPOT and checks for SLO violation.
506+
// If tpot exceeds the threshold, it records a violation (sets gauge to 1 and increments counter).
507+
// If tpot is within limits, it sets gauge to 0.
508+
func RecordRequestTPOTWithSLO(ctx context.Context, modelName, targetModelName string, tpot float64, sloThreshold float64) bool {
509+
if tpot < 0 {
510+
log.FromContext(ctx).V(logutil.DEFAULT).Error(nil, "TPOT value must be non-negative",
511+
"modelName", modelName, "targetModelName", targetModelName, "tpot", tpot)
512+
return false
513+
}
514+
515+
// Check for SLO violation (tpot exceeds threshold)
516+
if tpot > sloThreshold {
517+
inferenceGauges.With(prometheus.Labels{"model_name": modelName, "target_model_name": targetModelName, "type": "tpot_slo_violation"}).Set(1)
518+
sloViolationCounter.With(prometheus.Labels{"model_name": modelName, "target_model_name": targetModelName, "type": "tpot"}).Inc()
519+
log.FromContext(ctx).V(logutil.DEFAULT).Info("TPOT SLO violation detected",
520+
"modelName", modelName, "targetModelName", targetModelName, "tpot", tpot, "threshold", sloThreshold)
521+
} else {
522+
inferenceGauges.With(prometheus.Labels{"model_name": modelName, "target_model_name": targetModelName, "type": "tpot_slo_violation"}).Set(0)
523+
}
524+
525+
return true
526+
}
527+
528+
// TPOT records duration of request.
529+
func RecordRequestPredictedTPOT(ctx context.Context, modelName, targetModelName string, predicted_tpot float64) bool {
530+
if predicted_tpot < 0 {
531+
log.FromContext(ctx).V(logutil.DEFAULT).Error(nil, "Predicted TPOT value must be non-negative",
532+
"modelName", modelName, "targetModelName", targetModelName, "tpot", predicted_tpot)
533+
return false
534+
}
535+
requestPredictedTPOT.WithLabelValues(modelName, targetModelName).Observe(predicted_tpot)
536+
inferenceGauges.With(prometheus.Labels{"model_name": modelName, "target_model_name": targetModelName, "type": "predicted_tpot"}).Set(predicted_tpot)
537+
return true
538+
}
539+
540+
// RecordRequestTPOTPredictionDuration records the duration taken to generate TPOT predictions.
541+
func RecordRequestTPOTPredictionDuration(ctx context.Context, modelName, targetModelName string, duration float64) bool {
542+
if duration < 0 {
543+
log.FromContext(ctx).V(logutil.DEFAULT).Error(nil, "TPOT prediction duration must be non-negative",
544+
"modelName", modelName, "targetModelName", targetModelName, "duration", duration)
545+
return false
546+
}
547+
requestTPOTPredictionDuration.WithLabelValues(modelName, targetModelName).Observe(duration)
548+
inferenceGauges.With(prometheus.Labels{"model_name": modelName, "target_model_name": targetModelName, "type": "tpot_prediction_duration"}).Set(duration)
549+
return true
550+
}
551+
552+
// TTFT records duration of request.
553+
func RecordRequestTTFT(ctx context.Context, modelName, targetModelName string, ttft float64) bool {
554+
if ttft < 0 {
555+
log.FromContext(ctx).V(logutil.DEFAULT).Error(nil, "TTFT value must be non-negative",
556+
"modelName", modelName, "targetModelName", targetModelName, "ttft", ttft)
557+
return false
558+
}
559+
requestTTFT.WithLabelValues(modelName, targetModelName).Observe(ttft)
560+
inferenceGauges.With(prometheus.Labels{"model_name": modelName, "target_model_name": targetModelName, "type": "ttft"}).Set(ttft)
561+
return true
562+
}
563+
564+
// RecordRequestTTFTWithSLO records TTFT and checks for SLO violation.
565+
// If ttft exceeds the threshold, it records a violation (sets gauge to 1 and increments counter).
566+
// If ttft is within limits, it sets gauge to 0.
567+
func RecordRequestTTFTWithSLO(ctx context.Context, modelName, targetModelName string, ttft float64, sloThreshold float64) bool {
568+
if ttft < 0 {
569+
log.FromContext(ctx).V(logutil.DEFAULT).Error(nil, "TTFT value must be non-negative",
570+
"modelName", modelName, "targetModelName", targetModelName, "ttft", ttft)
571+
return false
572+
}
573+
574+
// Check for SLO violation (ttft exceeds threshold)
575+
if ttft > sloThreshold {
576+
inferenceGauges.With(prometheus.Labels{"model_name": modelName, "target_model_name": targetModelName, "type": "ttft_slo_violation"}).Set(1)
577+
sloViolationCounter.With(prometheus.Labels{"model_name": modelName, "target_model_name": targetModelName, "type": "ttft"}).Inc()
578+
log.FromContext(ctx).V(logutil.DEFAULT).Info("TTFT SLO violation detected",
579+
"modelName", modelName, "targetModelName", targetModelName, "ttft", ttft, "threshold", sloThreshold)
580+
} else {
581+
inferenceGauges.With(prometheus.Labels{"model_name": modelName, "target_model_name": targetModelName, "type": "ttft_slo_violation"}).Set(0)
582+
}
583+
584+
return true
585+
}
586+
587+
// TPOT records duration of request.
588+
func RecordRequestPredictedTTFT(ctx context.Context, modelName, targetModelName string, predicted_ttft float64) bool {
589+
if predicted_ttft < 0 {
590+
log.FromContext(ctx).V(logutil.DEFAULT).Error(nil, "Predicted TTFT value must be non-negative",
591+
"modelName", modelName, "targetModelName", targetModelName, "ttft", predicted_ttft)
592+
return false
593+
}
594+
requestPredictedTTFT.WithLabelValues(modelName, targetModelName).Observe(predicted_ttft)
595+
inferenceGauges.With(prometheus.Labels{"model_name": modelName, "target_model_name": targetModelName, "type": "predicted_ttft"}).Set(predicted_ttft)
596+
return true
597+
}
598+
599+
// RecordRequestTTFTPredictionDuration records the duration taken to generate TTFT predictions.
600+
func RecordRequestTTFTPredictionDuration(ctx context.Context, modelName, targetModelName string, duration float64) bool {
601+
if duration < 0 {
602+
log.FromContext(ctx).V(logutil.DEFAULT).Error(nil, "TTFT prediction duration must be non-negative",
603+
"modelName", modelName, "targetModelName", targetModelName, "duration", duration)
604+
return false
605+
}
606+
requestTTFTPredictionDuration.WithLabelValues(modelName, targetModelName).Observe(duration)
607+
inferenceGauges.With(prometheus.Labels{"model_name": modelName, "target_model_name": targetModelName, "type": "ttft_prediction_duration"}).Set(duration)
608+
return true
609+
}
610+
366611
// RecordResponseSizes records the response sizes.
367612
func RecordResponseSizes(modelName, targetModelName string, size int) {
368613
responseSizes.WithLabelValues(modelName, targetModelName).Observe(float64(size))
@@ -480,3 +725,15 @@ func IncFlowControlQueueSize(fairnessID, priority string) {
480725
func DecFlowControlQueueSize(fairnessID, priority string) {
481726
flowControlQueueSize.WithLabelValues(fairnessID, priority).Dec()
482727
}
728+
729+
// SetTTFTSLOThreshold sets the TTFT SLO threshold for a model.
730+
// This allows dynamic threshold management and makes the threshold visible in metrics.
731+
func SetTTFTSLOThreshold(modelName, targetModelName string, threshold float64) {
732+
inferenceGauges.With(prometheus.Labels{"model_name": modelName, "target_model_name": targetModelName, "type": "ttft_slo_threshold"}).Set(threshold)
733+
}
734+
735+
// SetTPOTSLOThreshold sets the TPOT SLO threshold for a model.
736+
// This allows dynamic threshold management and makes the threshold visible in metrics.
737+
func SetTPOTSLOThreshold(modelName, targetModelName string, threshold float64) {
738+
inferenceGauges.With(prometheus.Labels{"model_name": modelName, "target_model_name": targetModelName, "type": "tpot_slo_threshold"}).Set(threshold)
739+
}

pkg/epp/metrics/metrics_test.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,8 @@ const (
4646
KVCacheAvgUsageMetric = InferencePoolComponent + "_average_kv_cache_utilization"
4747
QueueAvgSizeMetric = InferencePoolComponent + "_average_queue_size"
4848
PerPodQueueSizeMetrics = InferencePoolComponent + "_per_pod_queue_size"
49+
RequestTTFTSecondsMetric = InferenceObjectiveComponent + "_request_ttft_seconds"
50+
RequestTPOTSecondsMetric = InferenceObjectiveComponent + "_request_tpot_seconds"
4951
)
5052

5153
func TestMain(m *testing.M) {

0 commit comments

Comments
 (0)