@@ -63,6 +63,104 @@ var (
6363 []string {"model_name" , "target_model_name" , "error_code" },
6464 )
6565
66+ // Gauge for various inference request metrics
67+ inferenceGauges = prometheus .NewGaugeVec (
68+ prometheus.GaugeOpts {
69+ Subsystem : InferenceObjectiveComponent ,
70+ Name : "inference_request_metric" ,
71+ Help : metricsutil .HelpMsgWithStability ("Consolidated gauge for various inference request metrics including TTFT, TPOT, SLOs, and prediction durations." , compbasemetrics .ALPHA ),
72+ },
73+ []string {"model_name" , "target_model_name" , "type" },
74+ )
75+
76+ requestTTFT = prometheus .NewHistogramVec (
77+ prometheus.HistogramOpts {
78+ Subsystem : InferenceObjectiveComponent ,
79+ Name : "request_ttft_seconds" ,
80+ Help : metricsutil .HelpMsgWithStability ("Inference model TTFT distribution in seconds for each model and target model." , compbasemetrics .ALPHA ),
81+ Buckets : []float64 {
82+ 0.005 , 0.025 , 0.05 , 0.1 , 0.2 , 0.4 , 0.6 , 0.8 , 1.0 , 1.25 , 1.5 , 2 , 3 ,
83+ 4 , 5 , 6 , 8 , 10 , 15 , 20 , 30 , 45 , 60 , 120 , 180 , 240 , 300 , 360 , 480 , 600 , 900 , 1200 , 1800 , 2700 , 3600 ,
84+ },
85+ },
86+ []string {"model_name" , "target_model_name" },
87+ )
88+
89+ requestPredictedTTFT = prometheus .NewHistogramVec (
90+ prometheus.HistogramOpts {
91+ Subsystem : InferenceObjectiveComponent ,
92+ Name : "request_predicted_ttft_seconds" ,
93+ Help : metricsutil .HelpMsgWithStability ("Inference model Predicted TTFT distribution in seconds for each model and target model." , compbasemetrics .ALPHA ),
94+ Buckets : []float64 {
95+ 0.005 , 0.025 , 0.05 , 0.1 , 0.2 , 0.4 , 0.6 , 0.8 , 1.0 , 1.25 , 1.5 , 2 , 3 ,
96+ 4 , 5 , 6 , 8 , 10 , 15 , 20 , 30 , 45 , 60 , 120 , 180 , 240 , 300 , 360 , 480 , 600 , 900 , 1200 , 1800 , 2700 , 3600 ,
97+ },
98+ },
99+ []string {"model_name" , "target_model_name" },
100+ )
101+
102+ // New metrics for TTFT prediction duration
103+ requestTTFTPredictionDuration = prometheus .NewHistogramVec (
104+ prometheus.HistogramOpts {
105+ Subsystem : InferenceObjectiveComponent ,
106+ Name : "request_ttft_prediction_duration_seconds" ,
107+ Help : metricsutil .HelpMsgWithStability ("Duration taken to generate TTFT predictions in seconds for each model and target model." , compbasemetrics .ALPHA ),
108+ Buckets : []float64 {
109+ 0.0001 , 0.0005 , 0.001 , 0.002 , 0.005 , 0.01 , 0.02 , 0.05 , 0.1 , 0.2 , 0.5 , 1.0 , 2.0 , 5.0 ,
110+ },
111+ },
112+ []string {"model_name" , "target_model_name" },
113+ )
114+
115+ requestTPOT = prometheus .NewHistogramVec (
116+ prometheus.HistogramOpts {
117+ Subsystem : InferenceObjectiveComponent ,
118+ Name : "request_tpot_seconds" ,
119+ Help : metricsutil .HelpMsgWithStability ("Inference model TPOT distribution in seconds for each model and target model." , compbasemetrics .ALPHA ),
120+ Buckets : []float64 {
121+ 0.0005 , 0.00205 , 0.005 , 0.01 , 0.02 , 0.04 , 0.06 , 0.08 , 0.1 , 0.125 , 0.15 , 0.2 , 0.3 ,
122+ 0.4 , 0.5 , 0.6 , 0.8 , 1 , 1.5 , 2 , 3 , 4.5 , 6 , 12 , 18 , 24 , 30 , 36 , 48 , 60 , 90 , 120 , 180 , 270 , 360 ,
123+ },
124+ },
125+ []string {"model_name" , "target_model_name" },
126+ )
127+
128+ requestPredictedTPOT = prometheus .NewHistogramVec (
129+ prometheus.HistogramOpts {
130+ Subsystem : InferenceObjectiveComponent ,
131+ Name : "request_predicted_tpot_seconds" ,
132+ Help : metricsutil .HelpMsgWithStability ("Inference model Predicted TPOT distribution in seconds for each model and target model." , compbasemetrics .ALPHA ),
133+ Buckets : []float64 {
134+ 0.0005 , 0.00205 , 0.005 , 0.01 , 0.02 , 0.04 , 0.06 , 0.08 , 0.1 , 0.125 , 0.15 , 0.2 , 0.3 ,
135+ 0.4 , 0.5 , 0.6 , 0.8 , 1 , 1.5 , 2 , 3 , 4.5 , 6 , 12 , 18 , 24 , 30 , 36 , 48 , 60 , 90 , 120 , 180 , 270 , 360 ,
136+ },
137+ },
138+ []string {"model_name" , "target_model_name" },
139+ )
140+
141+ // New metrics for TPOT prediction duration
142+ requestTPOTPredictionDuration = prometheus .NewHistogramVec (
143+ prometheus.HistogramOpts {
144+ Subsystem : InferenceObjectiveComponent ,
145+ Name : "request_tpot_prediction_duration_seconds" ,
146+ Help : metricsutil .HelpMsgWithStability ("Duration taken to generate TPOT predictions in seconds for each model and target model." , compbasemetrics .ALPHA ),
147+ Buckets : []float64 {
148+ 0.0001 , 0.0005 , 0.001 , 0.002 , 0.005 , 0.01 , 0.02 , 0.05 , 0.1 , 0.2 , 0.5 , 1.0 , 2.0 , 5.0 ,
149+ },
150+ },
151+ []string {"model_name" , "target_model_name" },
152+ )
153+
154+ // Counter for SLO Violations
155+ sloViolationCounter = prometheus .NewCounterVec (
156+ prometheus.CounterOpts {
157+ Subsystem : InferenceObjectiveComponent ,
158+ Name : "request_slo_violation_total" ,
159+ Help : metricsutil .HelpMsgWithStability ("Counter of SLO violations for each model, target model, and violation type." , compbasemetrics .ALPHA ),
160+ },
161+ []string {"model_name" , "target_model_name" , "type" },
162+ )
163+
66164 requestLatencies = prometheus .NewHistogramVec (
67165 prometheus.HistogramOpts {
68166 Subsystem : InferenceObjectiveComponent ,
@@ -282,6 +380,21 @@ var registerMetrics sync.Once
282380// Register all metrics.
283381func Register (customCollectors ... prometheus.Collector ) {
284382 registerMetrics .Do (func () {
383+ // Register inference gauges
384+ metrics .Registry .MustRegister (inferenceGauges )
385+
386+ // Register Histograms
387+ metrics .Registry .MustRegister (requestTPOT )
388+ metrics .Registry .MustRegister (requestTTFT )
389+ metrics .Registry .MustRegister (requestPredictedTPOT )
390+ metrics .Registry .MustRegister (requestPredictedTTFT )
391+ metrics .Registry .MustRegister (requestTPOTPredictionDuration )
392+ metrics .Registry .MustRegister (requestTTFTPredictionDuration )
393+
394+ // Register SLO violation counters
395+ metrics .Registry .MustRegister (sloViolationCounter )
396+
397+ // Register other metrics
285398 metrics .Registry .MustRegister (requestCounter )
286399 metrics .Registry .MustRegister (requestErrCounter )
287400 metrics .Registry .MustRegister (requestLatencies )
@@ -311,6 +424,21 @@ func Register(customCollectors ...prometheus.Collector) {
311424
312425// Just for integration test
313426func Reset () {
427+ // Reset inference gauges
428+ inferenceGauges .Reset ()
429+
430+ // Reset Histograms
431+ requestTPOT .Reset ()
432+ requestTTFT .Reset ()
433+ requestPredictedTPOT .Reset ()
434+ requestPredictedTTFT .Reset ()
435+ requestTPOTPredictionDuration .Reset ()
436+ requestTTFTPredictionDuration .Reset ()
437+
438+ // Reset SLO violation counter
439+ sloViolationCounter .Reset ()
440+
441+ // Reset other metrics
314442 requestCounter .Reset ()
315443 requestErrCounter .Reset ()
316444 requestLatencies .Reset ()
@@ -363,6 +491,123 @@ func RecordRequestLatencies(ctx context.Context, modelName, targetModelName stri
363491 return true
364492}
365493
494+ func RecordRequestTPOT (ctx context.Context , modelName , targetModelName string , tpot float64 ) bool {
495+ if tpot < 0 {
496+ log .FromContext (ctx ).V (logutil .DEFAULT ).Error (nil , "TPOT value must be non-negative" ,
497+ "modelName" , modelName , "targetModelName" , targetModelName , "tpot" , tpot )
498+ return false
499+ }
500+ requestTPOT .WithLabelValues (modelName , targetModelName ).Observe (tpot )
501+ inferenceGauges .With (prometheus.Labels {"model_name" : modelName , "target_model_name" : targetModelName , "type" : "tpot" }).Set (tpot )
502+ return true
503+ }
504+
505+ // RecordRequestTPOTWithSLO records TPOT and checks for SLO violation.
506+ // If tpot exceeds the threshold, it records a violation (sets gauge to 1 and increments counter).
507+ // If tpot is within limits, it sets gauge to 0.
508+ func RecordRequestTPOTWithSLO (ctx context.Context , modelName , targetModelName string , tpot float64 , sloThreshold float64 ) bool {
509+ if tpot < 0 {
510+ log .FromContext (ctx ).V (logutil .DEFAULT ).Error (nil , "TPOT value must be non-negative" ,
511+ "modelName" , modelName , "targetModelName" , targetModelName , "tpot" , tpot )
512+ return false
513+ }
514+
515+ // Check for SLO violation (tpot exceeds threshold)
516+ if tpot > sloThreshold {
517+ inferenceGauges .With (prometheus.Labels {"model_name" : modelName , "target_model_name" : targetModelName , "type" : "tpot_slo_violation" }).Set (1 )
518+ sloViolationCounter .With (prometheus.Labels {"model_name" : modelName , "target_model_name" : targetModelName , "type" : "tpot" }).Inc ()
519+ log .FromContext (ctx ).V (logutil .DEFAULT ).Info ("TPOT SLO violation detected" ,
520+ "modelName" , modelName , "targetModelName" , targetModelName , "tpot" , tpot , "threshold" , sloThreshold )
521+ } else {
522+ inferenceGauges .With (prometheus.Labels {"model_name" : modelName , "target_model_name" : targetModelName , "type" : "tpot_slo_violation" }).Set (0 )
523+ }
524+
525+ return true
526+ }
527+
528+ // TPOT records duration of request.
529+ func RecordRequestPredictedTPOT (ctx context.Context , modelName , targetModelName string , predicted_tpot float64 ) bool {
530+ if predicted_tpot < 0 {
531+ log .FromContext (ctx ).V (logutil .DEFAULT ).Error (nil , "Predicted TPOT value must be non-negative" ,
532+ "modelName" , modelName , "targetModelName" , targetModelName , "tpot" , predicted_tpot )
533+ return false
534+ }
535+ requestPredictedTPOT .WithLabelValues (modelName , targetModelName ).Observe (predicted_tpot )
536+ inferenceGauges .With (prometheus.Labels {"model_name" : modelName , "target_model_name" : targetModelName , "type" : "predicted_tpot" }).Set (predicted_tpot )
537+ return true
538+ }
539+
540+ // RecordRequestTPOTPredictionDuration records the duration taken to generate TPOT predictions.
541+ func RecordRequestTPOTPredictionDuration (ctx context.Context , modelName , targetModelName string , duration float64 ) bool {
542+ if duration < 0 {
543+ log .FromContext (ctx ).V (logutil .DEFAULT ).Error (nil , "TPOT prediction duration must be non-negative" ,
544+ "modelName" , modelName , "targetModelName" , targetModelName , "duration" , duration )
545+ return false
546+ }
547+ requestTPOTPredictionDuration .WithLabelValues (modelName , targetModelName ).Observe (duration )
548+ inferenceGauges .With (prometheus.Labels {"model_name" : modelName , "target_model_name" : targetModelName , "type" : "tpot_prediction_duration" }).Set (duration )
549+ return true
550+ }
551+
552+ // TTFT records duration of request.
553+ func RecordRequestTTFT (ctx context.Context , modelName , targetModelName string , ttft float64 ) bool {
554+ if ttft < 0 {
555+ log .FromContext (ctx ).V (logutil .DEFAULT ).Error (nil , "TTFT value must be non-negative" ,
556+ "modelName" , modelName , "targetModelName" , targetModelName , "ttft" , ttft )
557+ return false
558+ }
559+ requestTTFT .WithLabelValues (modelName , targetModelName ).Observe (ttft )
560+ inferenceGauges .With (prometheus.Labels {"model_name" : modelName , "target_model_name" : targetModelName , "type" : "ttft" }).Set (ttft )
561+ return true
562+ }
563+
564+ // RecordRequestTTFTWithSLO records TTFT and checks for SLO violation.
565+ // If ttft exceeds the threshold, it records a violation (sets gauge to 1 and increments counter).
566+ // If ttft is within limits, it sets gauge to 0.
567+ func RecordRequestTTFTWithSLO (ctx context.Context , modelName , targetModelName string , ttft float64 , sloThreshold float64 ) bool {
568+ if ttft < 0 {
569+ log .FromContext (ctx ).V (logutil .DEFAULT ).Error (nil , "TTFT value must be non-negative" ,
570+ "modelName" , modelName , "targetModelName" , targetModelName , "ttft" , ttft )
571+ return false
572+ }
573+
574+ // Check for SLO violation (ttft exceeds threshold)
575+ if ttft > sloThreshold {
576+ inferenceGauges .With (prometheus.Labels {"model_name" : modelName , "target_model_name" : targetModelName , "type" : "ttft_slo_violation" }).Set (1 )
577+ sloViolationCounter .With (prometheus.Labels {"model_name" : modelName , "target_model_name" : targetModelName , "type" : "ttft" }).Inc ()
578+ log .FromContext (ctx ).V (logutil .DEFAULT ).Info ("TTFT SLO violation detected" ,
579+ "modelName" , modelName , "targetModelName" , targetModelName , "ttft" , ttft , "threshold" , sloThreshold )
580+ } else {
581+ inferenceGauges .With (prometheus.Labels {"model_name" : modelName , "target_model_name" : targetModelName , "type" : "ttft_slo_violation" }).Set (0 )
582+ }
583+
584+ return true
585+ }
586+
587+ // TPOT records duration of request.
588+ func RecordRequestPredictedTTFT (ctx context.Context , modelName , targetModelName string , predicted_ttft float64 ) bool {
589+ if predicted_ttft < 0 {
590+ log .FromContext (ctx ).V (logutil .DEFAULT ).Error (nil , "Predicted TTFT value must be non-negative" ,
591+ "modelName" , modelName , "targetModelName" , targetModelName , "ttft" , predicted_ttft )
592+ return false
593+ }
594+ requestPredictedTTFT .WithLabelValues (modelName , targetModelName ).Observe (predicted_ttft )
595+ inferenceGauges .With (prometheus.Labels {"model_name" : modelName , "target_model_name" : targetModelName , "type" : "predicted_ttft" }).Set (predicted_ttft )
596+ return true
597+ }
598+
599+ // RecordRequestTTFTPredictionDuration records the duration taken to generate TTFT predictions.
600+ func RecordRequestTTFTPredictionDuration (ctx context.Context , modelName , targetModelName string , duration float64 ) bool {
601+ if duration < 0 {
602+ log .FromContext (ctx ).V (logutil .DEFAULT ).Error (nil , "TTFT prediction duration must be non-negative" ,
603+ "modelName" , modelName , "targetModelName" , targetModelName , "duration" , duration )
604+ return false
605+ }
606+ requestTTFTPredictionDuration .WithLabelValues (modelName , targetModelName ).Observe (duration )
607+ inferenceGauges .With (prometheus.Labels {"model_name" : modelName , "target_model_name" : targetModelName , "type" : "ttft_prediction_duration" }).Set (duration )
608+ return true
609+ }
610+
366611// RecordResponseSizes records the response sizes.
367612func RecordResponseSizes (modelName , targetModelName string , size int ) {
368613 responseSizes .WithLabelValues (modelName , targetModelName ).Observe (float64 (size ))
@@ -480,3 +725,15 @@ func IncFlowControlQueueSize(fairnessID, priority string) {
480725func DecFlowControlQueueSize (fairnessID , priority string ) {
481726 flowControlQueueSize .WithLabelValues (fairnessID , priority ).Dec ()
482727}
728+
729+ // SetTTFTSLOThreshold sets the TTFT SLO threshold for a model.
730+ // This allows dynamic threshold management and makes the threshold visible in metrics.
731+ func SetTTFTSLOThreshold (modelName , targetModelName string , threshold float64 ) {
732+ inferenceGauges .With (prometheus.Labels {"model_name" : modelName , "target_model_name" : targetModelName , "type" : "ttft_slo_threshold" }).Set (threshold )
733+ }
734+
735+ // SetTPOTSLOThreshold sets the TPOT SLO threshold for a model.
736+ // This allows dynamic threshold management and makes the threshold visible in metrics.
737+ func SetTPOTSLOThreshold (modelName , targetModelName string , threshold float64 ) {
738+ inferenceGauges .With (prometheus.Labels {"model_name" : modelName , "target_model_name" : targetModelName , "type" : "tpot_slo_threshold" }).Set (threshold )
739+ }
0 commit comments