diff --git a/pkg/epp/handlers/server.go b/pkg/epp/handlers/server.go
index ddfb3316c..3d8d50219 100644
--- a/pkg/epp/handlers/server.go
+++ b/pkg/epp/handlers/server.go
@@ -276,6 +276,7 @@ func (s *StreamingServer) Process(srv extProcPb.ExternalProcessor_ProcessServer)
reqCtx.ResponseCompleteTimestamp = time.Now()
metrics.RecordRequestLatencies(ctx, reqCtx.IncomingModelName, reqCtx.TargetModelName, reqCtx.RequestReceivedTimestamp, reqCtx.ResponseCompleteTimestamp)
metrics.RecordResponseSizes(reqCtx.IncomingModelName, reqCtx.TargetModelName, reqCtx.ResponseSize)
+ metrics.RecordNormalizedTimePerOutputToken(ctx, reqCtx.IncomingModelName, reqCtx.TargetModelName, reqCtx.RequestReceivedTimestamp, reqCtx.ResponseCompleteTimestamp, reqCtx.Usage.CompletionTokens)
}
reqCtx.respBodyResp = generateResponseBodyResponses(v.ResponseBody.Body, v.ResponseBody.EndOfStream)
diff --git a/site-src/guides/metrics-and-observability.md b/site-src/guides/metrics-and-observability.md
index 5e1f02e49..e85c49ee9 100644
--- a/site-src/guides/metrics-and-observability.md
+++ b/site-src/guides/metrics-and-observability.md
@@ -35,7 +35,7 @@ This guide describes the current state of exposed metrics and how to scrape them
| inference_objective_request_total | Counter | The counter of requests broken out for each model. | `model_name`=<model-name>
`target_model_name`=<target-model-name> | ALPHA |
| inference_objective_request_error_total | Counter | The counter of requests errors broken out for each model. | `model_name`=<model-name>
`target_model_name`=<target-model-name> | ALPHA |
| inference_objective_request_duration_seconds | Distribution | Distribution of response latency. | `model_name`=<model-name>
`target_model_name`=<target-model-name> | ALPHA |
-| normalized_time_per_output_token_seconds | Distribution | Distribution of ntpot (response latency per output token) | `model_name`=<model-name>
`target_model_name`=<target-model-name> | ALPHA |
+| inference_objective_normalized_time_per_output_token_seconds | Distribution | Distribution of NTPOT (normalized time per output token, i.e. response latency per output token). | `model_name`=<model-name>
`target_model_name`=<target-model-name> | ALPHA |
| inference_objective_request_sizes | Distribution | Distribution of request size in bytes. | `model_name`=<model-name>
`target_model_name`=<target-model-name> | ALPHA |
| inference_objective_response_sizes | Distribution | Distribution of response size in bytes. | `model_name`=<model-name>
`target_model_name`=<target-model-name> | ALPHA |
| inference_objective_input_tokens | Distribution | Distribution of input token count. | `model_name`=<model-name>
`target_model_name`=<target-model-name> | ALPHA |
diff --git a/test/e2e/epp/e2e_test.go b/test/e2e/epp/e2e_test.go
index c3ff49e58..4d34dfd9b 100644
--- a/test/e2e/epp/e2e_test.go
+++ b/test/e2e/epp/e2e_test.go
@@ -244,8 +244,7 @@ func verifyMetrics() {
"inference_objective_request_total",
"inference_objective_request_error_total",
"inference_objective_request_duration_seconds",
- // TODO: normalized_time_per_output_token_seconds is not actually recorded yet
- // "normalized_time_per_output_token_seconds",
+ "inference_objective_normalized_time_per_output_token_seconds",
"inference_objective_request_sizes",
"inference_objective_response_sizes",
"inference_objective_input_tokens",
diff --git a/test/integration/epp/hermetic_test.go b/test/integration/epp/hermetic_test.go
index c2e100f79..f195d4712 100644
--- a/test/integration/epp/hermetic_test.go
+++ b/test/integration/epp/hermetic_test.go
@@ -727,7 +727,27 @@ func TestFullDuplexStreamed_KubeInferenceObjectiveRequest(t *testing.T) {
inference_objective_input_tokens_bucket{model_name="",target_model_name="",le="+Inf"} 1
inference_objective_input_tokens_sum{model_name="",target_model_name=""} 7
inference_objective_input_tokens_count{model_name="",target_model_name=""} 1
- `},
+ `,
+ `inference_objective_normalized_time_per_output_token_seconds`: `
+ # HELP inference_objective_normalized_time_per_output_token_seconds [ALPHA] Inference objective latency divided by number of output tokens in seconds for each model and target model.
+ # TYPE inference_objective_normalized_time_per_output_token_seconds histogram
+ inference_objective_normalized_time_per_output_token_seconds_bucket{model_name="",target_model_name="",le="0.001"} 0
+ inference_objective_normalized_time_per_output_token_seconds_bucket{model_name="",target_model_name="",le="0.002"} 0
+ inference_objective_normalized_time_per_output_token_seconds_bucket{model_name="",target_model_name="",le="0.005"} 0
+ inference_objective_normalized_time_per_output_token_seconds_bucket{model_name="",target_model_name="",le="0.01"} 0
+ inference_objective_normalized_time_per_output_token_seconds_bucket{model_name="",target_model_name="",le="0.02"} 0
+ inference_objective_normalized_time_per_output_token_seconds_bucket{model_name="",target_model_name="",le="0.05"} 0
+ inference_objective_normalized_time_per_output_token_seconds_bucket{model_name="",target_model_name="",le="0.1"} 0
+ inference_objective_normalized_time_per_output_token_seconds_bucket{model_name="",target_model_name="",le="0.2"} 0
+ inference_objective_normalized_time_per_output_token_seconds_bucket{model_name="",target_model_name="",le="0.5"} 0
+ inference_objective_normalized_time_per_output_token_seconds_bucket{model_name="",target_model_name="",le="1"} 0
+ inference_objective_normalized_time_per_output_token_seconds_bucket{model_name="",target_model_name="",le="2"} 0
+ inference_objective_normalized_time_per_output_token_seconds_bucket{model_name="",target_model_name="",le="5"} 0
+ inference_objective_normalized_time_per_output_token_seconds_bucket{model_name="",target_model_name="",le="10"} 0
+ inference_objective_normalized_time_per_output_token_seconds_bucket{model_name="",target_model_name="",le="+Inf"} 1
+ inference_objective_normalized_time_per_output_token_seconds_sum{model_name="",target_model_name=""} 9.223372036854776e+08
+ inference_objective_normalized_time_per_output_token_seconds_count{model_name="",target_model_name=""} 1
+ `},
wantResponses: []*extProcPb.ProcessingResponse{
integrationutils.NewResponseHeaders(
&configPb.HeaderValueOption{