
Commit a91af3b

Initial tests for new metrics + constants for some of the metric names
Signed-off-by: Maya Barnea <[email protected]>
1 parent: 5325b41

2 files changed: +129 −6 lines

pkg/llm-d-inference-sim/metrics.go

Lines changed: 13 additions & 5 deletions
```diff
@@ -32,6 +32,14 @@ import (
 	vllmapi "github.com/llm-d/llm-d-inference-sim/pkg/vllm-api"
 )
 
+const (
+	e2eReqLatencyMetricName    = "vllm:e2e_request_latency_seconds"
+	reqQueueTimeMetricName     = "vllm:request_queue_time_seconds"
+	reqInferenceTimeMetricName = "vllm:request_inference_time_seconds"
+	prefillTimeMetricName      = "vllm:request_prefill_time_seconds"
+	decodeTimeMetricName       = "vllm:request_decode_time_seconds"
+)
+
 // createAndRegisterPrometheus creates and registers prometheus metrics used by vLLM simulator
 // Metrics reported:
 // - lora_requests_info
@@ -114,7 +122,7 @@ func (s *VllmSimulator) createAndRegisterPrometheus() error {
 	s.metrics.e2eReqLatency = prometheus.NewHistogramVec(
 		prometheus.HistogramOpts{
 			Subsystem: "",
-			Name:      "vllm:e2e_request_latency_seconds",
+			Name:      e2eReqLatencyMetricName,
 			Help:      "Histogram of end to end request latency in seconds.",
 			Buckets:   common.RequestLatencyBucketsBoundaries,
 		},
@@ -129,7 +137,7 @@ func (s *VllmSimulator) createAndRegisterPrometheus() error {
 	s.metrics.reqQueueTime = prometheus.NewHistogramVec(
 		prometheus.HistogramOpts{
 			Subsystem: "",
-			Name:      "vllm:request_queue_time_seconds",
+			Name:      reqQueueTimeMetricName,
 			Help:      "Histogram of time spent in WAITING phase for request.",
 			Buckets:   common.RequestLatencyBucketsBoundaries,
 		},
@@ -144,7 +152,7 @@ func (s *VllmSimulator) createAndRegisterPrometheus() error {
 	s.metrics.reqInferenceTime = prometheus.NewHistogramVec(
 		prometheus.HistogramOpts{
 			Subsystem: "",
-			Name:      "vllm:request_inference_time_seconds",
+			Name:      reqInferenceTimeMetricName,
 			Help:      "Histogram of time spent in RUNNING phase for request.",
 			Buckets:   common.RequestLatencyBucketsBoundaries,
 		},
@@ -159,7 +167,7 @@ func (s *VllmSimulator) createAndRegisterPrometheus() error {
 	s.metrics.reqPrefillTime = prometheus.NewHistogramVec(
 		prometheus.HistogramOpts{
 			Subsystem: "",
-			Name:      "vllm:request_prefill_time_seconds",
+			Name:      prefillTimeMetricName,
 			Help:      "Histogram of time spent in PREFILL phase for request.",
 			Buckets:   common.RequestLatencyBucketsBoundaries,
 		},
@@ -174,7 +182,7 @@ func (s *VllmSimulator) createAndRegisterPrometheus() error {
 	s.metrics.reqDecodeTime = prometheus.NewHistogramVec(
 		prometheus.HistogramOpts{
 			Subsystem: "",
-			Name:      "vllm:request_decode_time_seconds",
+			Name:      decodeTimeMetricName,
 			Help:      "Histogram of time spent in DECODE phase for request.",
 			Buckets:   common.RequestLatencyBucketsBoundaries,
 		},
```
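For context, here is a minimal sketch (not part of the commit) of what one of these histograms looks like when scraped: registering a HistogramVec under `e2eReqLatencyMetricName` with a `model_name` label and observing a single 1.2 s request produces cumulative `_bucket` lines in the exposition output. The bucket boundaries below are placeholders, not the repo's `common.RequestLatencyBucketsBoundaries`.

```go
package main

import (
	"fmt"
	"io"
	"net/http"
	"net/http/httptest"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promhttp"
)

func main() {
	// Stand-in for the simulator's e2eReqLatencyMetricName constant.
	const e2eReqLatencyMetricName = "vllm:e2e_request_latency_seconds"
	// Placeholder boundaries; the simulator uses common.RequestLatencyBucketsBoundaries.
	buckets := []float64{0.5, 1, 2, 5}

	reg := prometheus.NewRegistry()
	hist := prometheus.NewHistogramVec(prometheus.HistogramOpts{
		Name:    e2eReqLatencyMetricName,
		Help:    "Histogram of end to end request latency in seconds.",
		Buckets: buckets,
	}, []string{"model_name"})
	reg.MustRegister(hist)

	// One simulated request that took 1.2s end to end.
	hist.WithLabelValues("my_model").Observe(1.2)

	// Scrape the registry the same way the tests scrape /metrics.
	srv := httptest.NewServer(promhttp.HandlerFor(reg, promhttp.HandlerOpts{}))
	defer srv.Close()
	resp, err := http.Get(srv.URL)
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()
	body, _ := io.ReadAll(resp.Body)
	fmt.Print(string(body))
	// Buckets are cumulative, so the output contains lines such as:
	//   vllm:e2e_request_latency_seconds_bucket{model_name="my_model",le="1"} 0
	//   vllm:e2e_request_latency_seconds_bucket{model_name="my_model",le="2"} 1
	//   vllm:e2e_request_latency_seconds_bucket{model_name="my_model",le="+Inf"} 1
}
```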

pkg/llm-d-inference-sim/metrics_test.go

Lines changed: 116 additions & 1 deletion
```diff
@@ -19,7 +19,9 @@ package llmdinferencesim
 import (
 	"context"
 	"errors"
+	"fmt"
 	"io"
+	"math"
 	"net/http"
 	"os"
 	"reflect"
@@ -164,7 +166,7 @@ var _ = Describe("Simulator metrics", Ordered, func() {
 		Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="testmodel",le="100"} 1`))
 		Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="testmodel",le="200"} 1`))
 		Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="testmodel",le="500"} 1`))
-		Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="testmodel",le="100"} 1`))
+		Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="testmodel",le="1000"} 1`))
 		Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="testmodel",le="+Inf"} 1`))
 		// request_params_max_tokens_bucket
 		Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="testmodel",le="1"} 0`))
@@ -815,6 +817,93 @@ var _ = Describe("Simulator metrics", Ordered, func() {
 			Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"+Inf\"} 1"))
 		})
 	})
+
+	Context("latency metrics", func() {
+		DescribeTable("should calculate all latency related metrics correctly for a single request",
+			func(testName string, doRemotePrefill bool, doRemoteDecode bool, kvcacheTransferLatency int, kvCacheTransferTimePerToken int,
+				ttft int, prefillTimePerToken int, interTokenLatency int) {
+				// Send a single request with a 4-token prompt in echo mode, so the number of output tokens is 4 too.
+				modelName := "my_model"
+				// Check that ttft and tpot are as defined in the simulator command line params.
+				ctx := context.TODO()
+				args := []string{"cmd", "--model", modelName, "--mode", common.ModeEcho,
+					"--kv-cache-transfer-latency", strconv.Itoa(kvcacheTransferLatency),
+					"--kv-cache-transfer-time-per-token", strconv.Itoa(kvCacheTransferTimePerToken),
+					"--time-to-first-token", strconv.Itoa(ttft),
+					"--prefill-time-per-token", strconv.Itoa(prefillTimePerToken),
+					"--inter-token-latency", strconv.Itoa(interTokenLatency),
+				}
+
+				client, err := startServerWithArgs(ctx, common.ModeRandom, args, nil)
+				Expect(err).NotTo(HaveOccurred())
+
+				// TODO - pass isStreaming
+				openaiclient, params := getOpenAIClientAndChatParams(client, modelName, "1 2 3 4", false)
+				// TODO - how to test remote prefill/decode
+
+				var reqWg, metricsWg sync.WaitGroup
+				metricsWg.Add(1)
+				reqWg.Add(1)
+
+				// send a single request
+				go func() {
+					defer reqWg.Done()
+					defer GinkgoRecover()
+
+					_, err := openaiclient.Chat.Completions.New(ctx, params)
+					Expect(err).NotTo(HaveOccurred())
+				}()
+
+				// wait until request processing has finished, then send a /metrics request
+				reqWg.Wait()
+				time.Sleep(300 * time.Millisecond)
+				metricsResp, err := client.Get(metricsUrl)
+				Expect(err).NotTo(HaveOccurred())
+				Expect(metricsResp.StatusCode).To(Equal(http.StatusOK))
+
+				data, err := io.ReadAll(metricsResp.Body)
+				Expect(err).NotTo(HaveOccurred())
+				metrics := string(data)
+
+				numOfTokens := 4
+				var expectedPrefillTime float64
+				// TODO take into consideration remote prefill
+				if ttft > 0 {
+					// time-to-first-token overrides the prefill time calculated from the number of input tokens
+					expectedPrefillTime = float64(ttft) / 1000
+				} else {
+					expectedPrefillTime = float64(numOfTokens*prefillTimePerToken) / 1000
+				}
+				expectedDecodeTime := float64(interTokenLatency*(numOfTokens-1)) / 1000
+				expectedE2ELatency := expectedPrefillTime + expectedDecodeTime
+
+				prevBoundary := math.Inf(-1)
+
+				for _, bucketBoundary := range common.RequestLatencyBucketsBoundaries {
+					checkBucketBoundary(metrics, modelName, prefillTimeMetricName, bucketBoundary, prevBoundary, expectedPrefillTime)
+					checkBucketBoundary(metrics, modelName, decodeTimeMetricName, bucketBoundary, prevBoundary, expectedDecodeTime)
+					checkBucketBoundary(metrics, modelName, e2eReqLatencyMetricName, bucketBoundary, prevBoundary, expectedE2ELatency)
+
+					prevBoundary = bucketBoundary
+				}
+				// check the last (+Inf) bucket
+				lastBoundary := common.RequestLatencyBucketsBoundaries[len(common.RequestLatencyBucketsBoundaries)-1]
+				checkBucketBoundary(metrics, modelName, prefillTimeMetricName, math.Inf(1), lastBoundary, expectedPrefillTime)
+				checkBucketBoundary(metrics, modelName, decodeTimeMetricName, math.Inf(1), lastBoundary, expectedDecodeTime)
+				checkBucketBoundary(metrics, modelName, e2eReqLatencyMetricName, math.Inf(1), lastBoundary, expectedE2ELatency)
+			},
+			func(testName string, doRemotePrefill bool, doRemoteDecode bool, kvcacheTransferLatency int, kvCacheTransferTimePerToken int,
+				ttft int, prefillTimePerToken int, interTokenLatency int) string {
+				return fmt.Sprintf("%s\ndoRemotePrefill: %v, doRemoteDecode: %v, kvcacheTransferLatency: %d, kvCacheTransferTimePerToken: %d, ttft: %d, prefillTimePerToken: %d, interTokenLatency: %d",
+					testName, doRemotePrefill, doRemoteDecode, kvcacheTransferLatency, kvCacheTransferTimePerToken, ttft, prefillTimePerToken, interTokenLatency)
+			},
+			// pay attention: do not define times close to bucket boundaries, this can lead to test failures
+			Entry(nil, "constant prefill + inter token time", false, false, 0, 0, 900, 0, 100),
+			Entry(nil, "prefill per token + inter token time", false, false, 0, 0, 0, 100, 100),
+		)
+	})
 })
 
 // isLoraMetricPresent checks if a matching metric exists
```
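To make the expected-time arithmetic concrete, here is a worked example (not part of the commit) for the first table entry, "constant prefill + inter token time": with ttft=900 ms the per-token prefill path is bypassed, the 4 echo tokens leave 3 inter-token gaps, and the e2e latency is the sum of the two phases.

```go
package main

import "fmt"

func main() {
	// Worked example for the first Entry ("constant prefill + inter token time"):
	// ttft=900ms, prefillTimePerToken=0, interTokenLatency=100ms, 4 echo tokens.
	numOfTokens := 4
	ttft := 900              // ms; when > 0 it overrides the per-token prefill time
	interTokenLatency := 100 // ms

	expectedPrefillTime := float64(ttft) / 1000                             // 0.9s
	expectedDecodeTime := float64(interTokenLatency*(numOfTokens-1)) / 1000 // 3 gaps x 0.1s = 0.3s
	expectedE2ELatency := expectedPrefillTime + expectedDecodeTime          // 1.2s

	fmt.Println(expectedPrefillTime, expectedDecodeTime, expectedE2ELatency) // 0.9 0.3 1.2
}
```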
```diff
@@ -1022,3 +1111,29 @@ func TestBuild125Buckets(t *testing.T) {
 		})
 	}
 }
+
+func getFloatBucketMetricLine(model string, metric string, bucketBoundary float64, count int) string {
+	bucketBoundStr := "+Inf"
+	if bucketBoundary != math.Inf(1) {
+		bucketBoundStr = fmt.Sprintf("%g", bucketBoundary)
+	}
+	return fmt.Sprintf("%s_bucket{model_name=\"%s\",le=\"%s\"} %d", metric, model, bucketBoundStr, count)
+}
+
+func checkBucketBoundary(metrics string, modelName string, metricName string, bucketBoundary float64,
+	prevBoundary float64, expectedValue float64) {
+	if expectedValue > prevBoundary && bucketBoundary > expectedValue && (bucketBoundary-expectedValue) < 0.005 {
+		// The expected time is too close to the bucket boundary: in theory we expect 1 in this bucket,
+		// but we may get 0 because of additional processing time, and that situation is ok.
+		fmt.Printf("Expected value is too close to the boundary - skipping the check for bucket (%.4f - %.4f] and expected value %.4f\n",
+			prevBoundary, bucketBoundary, expectedValue)
+		return
+	}
+	expectedCount := 0
+	if bucketBoundary > expectedValue {
+		expectedCount = 1
+	}
+	Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(modelName, metricName, bucketBoundary, expectedCount)))
+}
```
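The helper relies on Prometheus histogram buckets being cumulative. A standalone sketch (not part of the commit) of the expected-count rule it asserts, using made-up boundaries in place of `common.RequestLatencyBucketsBoundaries`:

```go
package main

import (
	"fmt"
	"math"
)

func main() {
	// Placeholder boundaries; the real values are common.RequestLatencyBucketsBoundaries.
	boundaries := []float64{0.5, 1, 2, 5}
	expected := 1.2 // seconds observed for the single request

	// Histogram buckets are cumulative: every bucket whose upper boundary
	// exceeds the observation counts it, so for one request the expected
	// count is 0 below the observation and 1 at or above it.
	prev := math.Inf(-1)
	for _, le := range append(boundaries, math.Inf(1)) {
		count := 0
		if le > expected {
			count = 1
		}
		fmt.Printf("(%g, %g]: expected count %d\n", prev, le, count)
		prev = le
	}
}
```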
