From 6c7ce3e653be38301ec67c314053c95d49ed72fc Mon Sep 17 00:00:00 2001
From: Dharaneeshwaran Ravichandran
Date: Mon, 13 Oct 2025 13:21:13 +0000
Subject: [PATCH] Record EPP NormalizedTimePerOutputToken metric in streaming
 mode

Update e2e/epp/e2e_test & integration/epp/hermetic_test to validate the
inference_objective_normalized_time_per_output_token_seconds metric.

Signed-off-by: Dharaneeshwaran Ravichandran
---
 pkg/epp/handlers/server.go                    |  1 +
 site-src/guides/metrics-and-observability.md  |  2 +-
 test/e2e/epp/e2e_test.go                      |  3 +--
 test/integration/epp/hermetic_test.go         | 22 +++++++++++++++++++++-
 4 files changed, 24 insertions(+), 4 deletions(-)

diff --git a/pkg/epp/handlers/server.go b/pkg/epp/handlers/server.go
index ddfb3316c..3d8d50219 100644
--- a/pkg/epp/handlers/server.go
+++ b/pkg/epp/handlers/server.go
@@ -276,6 +276,7 @@ func (s *StreamingServer) Process(srv extProcPb.ExternalProcessor_ProcessServer)
 					reqCtx.ResponseCompleteTimestamp = time.Now()
 					metrics.RecordRequestLatencies(ctx, reqCtx.IncomingModelName, reqCtx.TargetModelName, reqCtx.RequestReceivedTimestamp, reqCtx.ResponseCompleteTimestamp)
 					metrics.RecordResponseSizes(reqCtx.IncomingModelName, reqCtx.TargetModelName, reqCtx.ResponseSize)
+					metrics.RecordNormalizedTimePerOutputToken(ctx, reqCtx.IncomingModelName, reqCtx.TargetModelName, reqCtx.RequestReceivedTimestamp, reqCtx.ResponseCompleteTimestamp, reqCtx.Usage.CompletionTokens)
 				}
 
 				reqCtx.respBodyResp = generateResponseBodyResponses(v.ResponseBody.Body, v.ResponseBody.EndOfStream)
diff --git a/site-src/guides/metrics-and-observability.md b/site-src/guides/metrics-and-observability.md
index 5e1f02e49..e85c49ee9 100644
--- a/site-src/guides/metrics-and-observability.md
+++ b/site-src/guides/metrics-and-observability.md
@@ -35,7 +35,7 @@ This guide describes the current state of exposed metrics and how to scrape them
 | inference_objective_request_total | Counter | The counter of requests broken out for each model. | `model_name`=<model-name> <br> `target_model_name`=<target-model-name> | ALPHA |
 | inference_objective_request_error_total | Counter | The counter of requests errors broken out for each model. | `model_name`=<model-name> <br> `target_model_name`=<target-model-name> | ALPHA |
 | inference_objective_request_duration_seconds | Distribution | Distribution of response latency. | `model_name`=<model-name> <br> `target_model_name`=<target-model-name> | ALPHA |
-| normalized_time_per_output_token_seconds | Distribution | Distribution of ntpot (response latency per output token) | `model_name`=<model-name> <br> `target_model_name`=<target-model-name> | ALPHA |
+| inference_objective_normalized_time_per_output_token_seconds | Distribution | Distribution of ntpot (response latency per output token) | `model_name`=<model-name> <br> `target_model_name`=<target-model-name> | ALPHA |
 | inference_objective_request_sizes | Distribution | Distribution of request size in bytes. | `model_name`=<model-name> <br> `target_model_name`=<target-model-name> | ALPHA |
 | inference_objective_response_sizes | Distribution | Distribution of response size in bytes. | `model_name`=<model-name> <br> `target_model_name`=<target-model-name> | ALPHA |
 | inference_objective_input_tokens | Distribution | Distribution of input token count. | `model_name`=<model-name> <br> `target_model_name`=<target-model-name> | ALPHA |
diff --git a/test/e2e/epp/e2e_test.go b/test/e2e/epp/e2e_test.go
index c3ff49e58..4d34dfd9b 100644
--- a/test/e2e/epp/e2e_test.go
+++ b/test/e2e/epp/e2e_test.go
@@ -244,8 +244,7 @@ func verifyMetrics() {
 		"inference_objective_request_total",
 		"inference_objective_request_error_total",
 		"inference_objective_request_duration_seconds",
-		// TODO: normalized_time_per_output_token_seconds is not actually recorded yet
-		// "normalized_time_per_output_token_seconds",
+		"inference_objective_normalized_time_per_output_token_seconds",
 		"inference_objective_request_sizes",
 		"inference_objective_response_sizes",
 		"inference_objective_input_tokens",
diff --git a/test/integration/epp/hermetic_test.go b/test/integration/epp/hermetic_test.go
index c2e100f79..f195d4712 100644
--- a/test/integration/epp/hermetic_test.go
+++ b/test/integration/epp/hermetic_test.go
@@ -727,7 +727,27 @@ func TestFullDuplexStreamed_KubeInferenceObjectiveRequest(t *testing.T) {
 				inference_objective_input_tokens_bucket{model_name="",target_model_name="",le="+Inf"} 1
 				inference_objective_input_tokens_sum{model_name="",target_model_name=""} 7
 				inference_objective_input_tokens_count{model_name="",target_model_name=""} 1
-				`},
+				`,
+				`inference_objective_normalized_time_per_output_token_seconds`: `
+				# HELP inference_objective_normalized_time_per_output_token_seconds [ALPHA] Inference objective latency divided by number of output tokens in seconds for each model and target model.
+				# TYPE inference_objective_normalized_time_per_output_token_seconds histogram
+				inference_objective_normalized_time_per_output_token_seconds_bucket{model_name="",target_model_name="",le="0.001"} 0
+				inference_objective_normalized_time_per_output_token_seconds_bucket{model_name="",target_model_name="",le="0.002"} 0
+				inference_objective_normalized_time_per_output_token_seconds_bucket{model_name="",target_model_name="",le="0.005"} 0
+				inference_objective_normalized_time_per_output_token_seconds_bucket{model_name="",target_model_name="",le="0.01"} 0
+				inference_objective_normalized_time_per_output_token_seconds_bucket{model_name="",target_model_name="",le="0.02"} 0
+				inference_objective_normalized_time_per_output_token_seconds_bucket{model_name="",target_model_name="",le="0.05"} 0
+				inference_objective_normalized_time_per_output_token_seconds_bucket{model_name="",target_model_name="",le="0.1"} 0
+				inference_objective_normalized_time_per_output_token_seconds_bucket{model_name="",target_model_name="",le="0.2"} 0
+				inference_objective_normalized_time_per_output_token_seconds_bucket{model_name="",target_model_name="",le="0.5"} 0
+				inference_objective_normalized_time_per_output_token_seconds_bucket{model_name="",target_model_name="",le="1"} 0
+				inference_objective_normalized_time_per_output_token_seconds_bucket{model_name="",target_model_name="",le="2"} 0
+				inference_objective_normalized_time_per_output_token_seconds_bucket{model_name="",target_model_name="",le="5"} 0
+				inference_objective_normalized_time_per_output_token_seconds_bucket{model_name="",target_model_name="",le="10"} 0
+				inference_objective_normalized_time_per_output_token_seconds_bucket{model_name="",target_model_name="",le="+Inf"} 1
+				inference_objective_normalized_time_per_output_token_seconds_sum{model_name="",target_model_name=""} 9.223372036854776e+08
+				inference_objective_normalized_time_per_output_token_seconds_count{model_name="",target_model_name=""} 1
+				`},
 			wantResponses: []*extProcPb.ProcessingResponse{
 				integrationutils.NewResponseHeaders(
 					&configPb.HeaderValueOption{
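
Reviewer note (not part of the patch): the new RecordNormalizedTimePerOutputToken call records, per model and target model, the end-to-end request latency divided by the number of completion tokens, which is what the histogram asserted in hermetic_test.go exposes. The Go sketch below only illustrates that computation under stated assumptions: the package name, histogram variable, zero-token guard, and direct use of prometheus/client_golang are all placeholders for this example, not the actual helper in pkg/epp/metrics.

// Package metricsexample is a hypothetical, self-contained sketch of what a
// normalized-time-per-output-token (NTPOT) recording helper could look like.
package metricsexample

import (
	"time"

	"github.com/prometheus/client_golang/prometheus"
)

// normalizedTimePerOutputToken is an illustrative histogram; the bucket bounds
// mirror the ones asserted in the hermetic test above. It would still need to
// be registered with a prometheus.Registry before scraping.
var normalizedTimePerOutputToken = prometheus.NewHistogramVec(
	prometheus.HistogramOpts{
		Name:    "inference_objective_normalized_time_per_output_token_seconds",
		Help:    "Inference objective latency divided by number of output tokens in seconds for each model and target model.",
		Buckets: []float64{0.001, 0.002, 0.005, 0.01, 0.02, 0.05, 0.1, 0.2, 0.5, 1, 2, 5, 10},
	},
	[]string{"model_name", "target_model_name"},
)

// recordNTPOT observes (complete - received) / outputTokens in seconds.
// The zero-token guard is an assumption for this sketch, added to avoid a
// division by zero when a response reports no completion tokens.
func recordNTPOT(modelName, targetModelName string, received, complete time.Time, outputTokens int) {
	if outputTokens <= 0 {
		return
	}
	ntpot := complete.Sub(received).Seconds() / float64(outputTokens)
	normalizedTimePerOutputToken.WithLabelValues(modelName, targetModelName).Observe(ntpot)
}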