Commit cda00c7 (parent c881401)

- Add full list of supported metrics to readme
- Create constants for all metrics
- Define all latency-related fake metrics in config
- Add validation for new fake metrics in config

Signed-off-by: Maya Barnea <[email protected]>

File tree: 7 files changed, +100 −38 lines changed

README.md

Lines changed: 12 additions & 1 deletion
```diff
@@ -26,7 +26,18 @@ In addition, it supports a subset of vLLM's Prometheus metrics. These metrics are
 | vllm:lora_requests_info | Running stats on LoRA requests |
 | vllm:num_requests_running | Number of requests currently running on GPU |
 | vllm:num_requests_waiting | Prometheus metric for the number of queued requests |
-
+| vllm:e2e_request_latency_seconds | Histogram of end to end request latency in seconds |
+| vllm:request_inference_time_seconds | Histogram of time spent in RUNNING phase for request |
+| vllm:request_queue_time_seconds | Histogram of time spent in WAITING phase for request |
+| vllm:request_prefill_time_seconds | Histogram of time spent in PREFILL phase for request |
+| vllm:request_decode_time_seconds | Histogram of time spent in DECODE phase for request |
+| vllm:time_to_first_token_seconds | Histogram of time to first token in seconds |
+| vllm:time_per_output_token_seconds | Histogram of time per output token in seconds |
+| vllm:request_generation_tokens | Number of generation tokens processed |
+| vllm:request_params_max_tokens | Histogram of the max_tokens request parameter |
+| vllm:request_prompt_tokens | Number of prefill tokens processed |
+| vllm:request_success_total | Count of successfully processed requests |
+
 The simulated inference has no connection with the model and LoRA adapters specified in the command line parameters or via the /v1/load_lora_adapter HTTP REST endpoint. The /v1/models endpoint returns simulated results based on those same command line parameters and those loaded via the /v1/load_lora_adapter HTTP REST endpoint.

 The simulator supports two modes of operation:
```
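To make the new table entries concrete, here is a minimal sketch (not part of this commit) that scrapes the simulator's /metrics endpoint and prints only the end-to-end latency histogram series. The base URL and port are assumptions; adjust them to your deployment.

```go
package main

import (
    "bufio"
    "fmt"
    "net/http"
    "strings"
)

func main() {
    // Assumption: the simulator is listening on localhost:8000.
    resp, err := http.Get("http://localhost:8000/metrics")
    if err != nil {
        panic(err)
    }
    defer resp.Body.Close()

    scanner := bufio.NewScanner(resp.Body)
    for scanner.Scan() {
        line := scanner.Text()
        // Histogram series appear as _bucket, _sum, and _count lines.
        if strings.HasPrefix(line, "vllm:e2e_request_latency_seconds") {
            fmt.Println(line)
        }
    }
}
```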

pkg/common/config.go

Lines changed: 50 additions & 9 deletions
```diff
@@ -232,16 +232,17 @@ type Metrics struct {
 	WaitingRequests int64 `yaml:"waiting-requests" json:"waiting-requests"`
 	// KVCacheUsagePercentage is the fraction of KV-cache blocks currently in use (from 0 to 1)
 	KVCacheUsagePercentage float32 `yaml:"kv-cache-usage" json:"kv-cache-usage"`
-	// TTFTBuckets is an array of values for time-to-first-token buckets,
-	// each value in this array is a value for the corresponding bucket.
+
+	// Histogram metrics are defined by an array of values.
+	// Each value in this array is the value for the corresponding bucket.
 	// Array may contain less values than number of buckets, all trailing missing values assumed as 0.
+
+	// TTFTBucketValues is an array of values for time-to-first-token buckets.
 	// Buckets upper boundaries in seconds are:
 	// 0.001, 0.005, 0.01, 0.02, 0.04, 0.06, 0.08, 0.1, 0.25, 0.5,
 	// 0.75, 1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0, 160.0, 640.0, 2560.0, +Inf
 	TTFTBucketValues []int `yaml:"ttft-buckets-values" json:"ttft-buckets-values"`
-	// TPOTBuckets is an array of values for time-per-output-token buckets,
-	// each value in this array is a value for the corresponding bucket.
-	// Array may contain less values than number of buckets, all trailing missing values assumed as 0.
+	// TPOTBucketValues is an array of values for time-per-output-token buckets.
 	// Buckets upper boundaries in seconds are:
 	// 0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75,
 	// 1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0, +Inf

@@ -253,13 +254,21 @@ type Metrics struct {
 	RequestParamsMaxTokens []int `yaml:"request-params-max-tokens" json:"request-params-max-tokens"` // max_tokens parameter samples
 	// RequestSuccessTotal is the number of successful requests, key: finish-reason (stop, length, etc.).
 	RequestSuccessTotal map[string]int64 `yaml:"request-success-total" json:"request-success-total"`
-	// E2ERequestLatencyBucketValues is an array of values for e2e request latency buckets,
-	// each value in this array is a value for the corresponding bucket.
-	// Array may contain less values than number of buckets, all trailing missing values assumed as 0.
-	// Buckets upper boundaries in seconds are:
+
+	// Latency histograms share the same bucket upper boundaries in seconds:
 	// 0.3, 0.5, 0.8, 1.0, 1.5, 2.0, 2.5, 5.0, 10.0, 15.0,
 	// 20.0, 30.0, 40.0, 50.0, 60.0, 120.0, 240.0, 480.0, 960.0, 1920.0, 7680.0, +Inf
+
+	// E2ERequestLatencyBucketValues is an array of values for e2e request latency buckets.
 	E2ERequestLatencyBucketValues []int `yaml:"e2erl-buckets-values" json:"e2erl-buckets-values"`
+	// ReqQueueTimeBucketValues is an array of values for request queue time buckets.
+	ReqQueueTimeBucketValues []int `yaml:"queue-time-buckets-values" json:"queue-time-buckets-values"`
+	// ReqInfTimeBucketValues is an array of values for request inference time buckets.
+	ReqInfTimeBucketValues []int `yaml:"inf-time-buckets-values" json:"inf-time-buckets-values"`
+	// ReqPrefillTimeBucketValues is an array of values for request prefill time buckets.
+	ReqPrefillTimeBucketValues []int `yaml:"prefill-time-buckets-values" json:"prefill-time-buckets-values"`
+	// ReqDecodeTimeBucketValues is an array of values for request decode time buckets.
+	ReqDecodeTimeBucketValues []int `yaml:"decode-time-buckets-values" json:"decode-time-buckets-values"`
 }

 type LorasMetrics struct {
```
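A minimal sketch of how the new fields might be populated, assuming `FakeMetrics` is backed by this `Metrics` struct (as the validation hunk that follows suggests); all values here are invented:

```go
// A sketch, not part of the commit: per-bucket counts for the new latency
// histograms, declared in the common package's scope. Omitted trailing
// buckets default to 0.
var fake = Metrics{
    WaitingRequests:        1,
    KVCacheUsagePercentage: 0.125,
    // First three TTFT buckets (<=0.001s, <=0.005s, <=0.01s); the remaining 20 stay 0.
    TTFTBucketValues: []int{5, 2, 1},
    // Four requests fell into the <=0.8s bucket; the first two buckets are explicit zeros.
    E2ERequestLatencyBucketValues: []int{0, 0, 4},
    // Only the first queue-time bucket (<=0.3s) is set.
    ReqQueueTimeBucketValues: []int{10},
}
```

validate() below rejects any negative value in these arrays.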
```diff
@@ -595,6 +604,38 @@ func (c *Configuration) validate() error {
 				return errors.New("fake metrics request-params-max-tokens cannot contain negative values")
 			}
 		}
+
+		for _, v := range c.FakeMetrics.RequestParamsMaxTokens {
+			if v < 0 {
+				return errors.New("fake metrics request-params-max-tokens cannot contain negative values")
+			}
+		}
+
+		for _, v := range c.FakeMetrics.E2ERequestLatencyBucketValues {
+			if v < 0 {
+				return errors.New("fake metrics e2erl-buckets-values cannot contain negative values")
+			}
+		}
+		for _, v := range c.FakeMetrics.ReqQueueTimeBucketValues {
+			if v < 0 {
+				return errors.New("fake metrics queue-time-buckets-values cannot contain negative values")
+			}
+		}
+		for _, v := range c.FakeMetrics.ReqInfTimeBucketValues {
+			if v < 0 {
+				return errors.New("fake metrics inf-time-buckets-values cannot contain negative values")
+			}
+		}
+		for _, v := range c.FakeMetrics.ReqPrefillTimeBucketValues {
+			if v < 0 {
+				return errors.New("fake metrics prefill-time-buckets-values cannot contain negative values")
+			}
+		}
+		for _, v := range c.FakeMetrics.ReqDecodeTimeBucketValues {
+			if v < 0 {
+				return errors.New("fake metrics decode-time-buckets-values cannot contain negative values")
+			}
+		}
 	}

 	if c.DPSize < 1 || c.DPSize > 8 {
```
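The six loops added above are structurally identical. A possible follow-up refactor, not part of this commit (the helper name is hypothetical), would collapse them into one function:

```go
// validateNonNegative reports an error if any bucket value is negative.
// key is the YAML key to name in the error message.
func validateNonNegative(values []int, key string) error {
    for _, v := range values {
        if v < 0 {
            return fmt.Errorf("fake metrics %s cannot contain negative values", key)
        }
    }
    return nil
}
```

Each loop in validate() would then become a single call, e.g. `validateNonNegative(c.FakeMetrics.ReqDecodeTimeBucketValues, "decode-time-buckets-values")`.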

pkg/llm-d-inference-sim/metrics.go

Lines changed: 10 additions & 5 deletions
```diff
@@ -43,6 +43,11 @@ const (
 	generationTokensMetricName = "vllm:request_generation_tokens"
 	paramMaxTokensMetricName   = "vllm:request_params_max_tokens"
 	promptTokensMetricName     = "vllm:request_prompt_tokens"
+	successTotalMetricName     = "vllm:request_success_total"
+	loraRequestsMetricName     = "vllm:lora_requests_info"
+	reqRunningMetricName       = "vllm:num_requests_running"
+	reqWaitingMetricName       = "vllm:num_requests_waiting"
+	gpuCacheUsageMetricName    = "vllm:gpu_cache_usage_perc"
 )

@@ -54,7 +59,7 @@ func (s *VllmSimulator) createAndRegisterPrometheus() error {
 	s.metrics.loraInfo = prometheus.NewGaugeVec(
 		prometheus.GaugeOpts{
 			Subsystem: "",
-			Name:      "vllm:lora_requests_info",
+			Name:      loraRequestsMetricName,
 			Help:      "Running stats on lora requests.",
 		},
 		[]string{vllmapi.PromLabelMaxLora, vllmapi.PromLabelRunningLoraAdapters, vllmapi.PromLabelWaitingLoraAdapters},

@@ -68,7 +73,7 @@ func (s *VllmSimulator) createAndRegisterPrometheus() error {
 	s.metrics.runningRequests = prometheus.NewGaugeVec(
 		prometheus.GaugeOpts{
 			Subsystem: "",
-			Name:      "vllm:num_requests_running",
+			Name:      reqRunningMetricName,
 			Help:      "Number of requests currently running on GPU.",
 		},
 		[]string{vllmapi.PromLabelModelName},

@@ -83,7 +88,7 @@ func (s *VllmSimulator) createAndRegisterPrometheus() error {
 	s.metrics.waitingRequests = prometheus.NewGaugeVec(
 		prometheus.GaugeOpts{
 			Subsystem: "",
-			Name:      "vllm:num_requests_waiting",
+			Name:      reqWaitingMetricName,
 			Help:      "Prometheus metric for the number of queued requests.",
 		},
 		[]string{vllmapi.PromLabelModelName},

@@ -202,7 +207,7 @@ func (s *VllmSimulator) createAndRegisterPrometheus() error {
 	s.metrics.kvCacheUsagePercentage = prometheus.NewGaugeVec(
 		prometheus.GaugeOpts{
 			Subsystem: "",
-			Name:      "vllm:gpu_cache_usage_perc",
+			Name:      gpuCacheUsageMetricName,
 			Help:      "Prometheus metric for the fraction of KV-cache blocks currently in use (from 0 to 1).",
 		},
 		[]string{vllmapi.PromLabelModelName},

@@ -258,7 +263,7 @@ func (s *VllmSimulator) createAndRegisterPrometheus() error {
 	s.metrics.requestSuccessTotal = prometheus.NewCounterVec(
 		prometheus.CounterOpts{
 			Subsystem: "",
-			Name:      "vllm:request_success_total",
+			Name:      successTotalMetricName,
 			Help:      "Count of successfully processed requests.",
 		},
 		[]string{vllmapi.PromLabelModelName, vllmapi.PromLabelFinishReason},
```
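The hunks above only swap string literals for the new constants in existing gauges and counters. For the latency histograms documented in config.go, registration would follow the same pattern; below is a hedged sketch (not this commit's code — the function, variable, and registry names are hypothetical; only the metric name and bucket boundaries come from this diff):

```go
// Bucket upper boundaries documented in pkg/common/config.go for the
// latency histograms; Prometheus adds the +Inf bucket implicitly.
var latencyBucketBoundaries = []float64{
    0.3, 0.5, 0.8, 1.0, 1.5, 2.0, 2.5, 5.0, 10.0, 15.0,
    20.0, 30.0, 40.0, 50.0, 60.0, 120.0, 240.0, 480.0, 960.0, 1920.0, 7680.0,
}

// registerE2ELatencyHistogram is a hypothetical helper showing how the e2e
// latency histogram could be created and registered.
func registerE2ELatencyHistogram(reg *prometheus.Registry) (*prometheus.HistogramVec, error) {
    h := prometheus.NewHistogramVec(
        prometheus.HistogramOpts{
            Name:    "vllm:e2e_request_latency_seconds",
            Help:    "Histogram of end to end request latency in seconds.",
            Buckets: latencyBucketBoundaries,
        },
        []string{vllmapi.PromLabelModelName},
    )
    if err := reg.Register(h); err != nil {
        return nil, err
    }
    return h, nil
}
```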

pkg/llm-d-inference-sim/metrics_test.go

Lines changed: 15 additions & 15 deletions
```diff
@@ -107,8 +107,8 @@ var _ = Describe("Simulator metrics", Ordered, func() {
 			data, err := io.ReadAll(metricsResp.Body)
 			Expect(err).NotTo(HaveOccurred())
 			metrics := string(data)
-			Expect(metrics).To(ContainSubstring("vllm:num_requests_running{model_name=\"testmodel\"} 2"))
-			Expect(metrics).To(ContainSubstring("vllm:num_requests_waiting{model_name=\"testmodel\"} 1"))
+			Expect(metrics).To(ContainSubstring(getCountMetricLine(testModel, reqRunningMetricName, 2)))
+			Expect(metrics).To(ContainSubstring(getCountMetricLine(testModel, reqWaitingMetricName, 1)))
 		})

 		It("Should record correct prompt and generation token counts", func() {

@@ -168,7 +168,7 @@ var _ = Describe("Simulator metrics", Ordered, func() {
 			// as the number of generated tokens is unpredictable in this test.
 			// Therefore, we only verify the number of requests and the total number of generated tokens,
 			// and skip the bucket distribution.
-			Expect(metrics).To(ContainSubstring(`vllm:request_generation_tokens_count{model_name="testmodel"} 1`))
+			Expect(metrics).To(ContainSubstring(getCountMetricLine(testModel, generationTokensMetricName+"_count", 1)))
 			// request_success_total
 			Expect(metrics).To(MatchRegexp(`vllm:request_success_total{finish_reason="(stop|length)",model_name="testmodel"} 1`))
 		})

@@ -512,9 +512,9 @@ var _ = Describe("Simulator metrics", Ordered, func() {
 				Expect(err).NotTo(HaveOccurred())
 				metrics := string(data)
 				// Expect three running requests and two blocks in the kv cache - usage 2/16=0.125
-				Expect(metrics).To(ContainSubstring("vllm:num_requests_running{model_name=\"Qwen/Qwen2-0.5B\"} 3"))
-				Expect(metrics).To(ContainSubstring("vllm:num_requests_waiting{model_name=\"Qwen/Qwen2-0.5B\"} 0"))
-				Expect(metrics).To(ContainSubstring("vllm:gpu_cache_usage_perc{model_name=\"Qwen/Qwen2-0.5B\"} 0.125"))
+				Expect(metrics).To(ContainSubstring(getCountMetricLine(qwenModelName, reqRunningMetricName, 3)))
+				Expect(metrics).To(ContainSubstring(getCountMetricLine(qwenModelName, reqWaitingMetricName, 0)))
+				Expect(metrics).To(ContainSubstring(getCountMetricLine(qwenModelName, gpuCacheUsageMetricName, 0.125)))

 				time.Sleep(4 * time.Second)
 				metricsResp, err = client.Get(metricsUrl)

@@ -525,9 +525,9 @@ var _ = Describe("Simulator metrics", Ordered, func() {
 				Expect(err).NotTo(HaveOccurred())
 				metrics = string(data)
 				// The requests finished running, expect 0 usage
-				Expect(metrics).To(ContainSubstring("vllm:num_requests_running{model_name=\"Qwen/Qwen2-0.5B\"} 0"))
-				Expect(metrics).To(ContainSubstring("vllm:num_requests_waiting{model_name=\"Qwen/Qwen2-0.5B\"} 0"))
-				Expect(metrics).To(ContainSubstring("vllm:gpu_cache_usage_perc{model_name=\"Qwen/Qwen2-0.5B\"} 0"))
+				Expect(metrics).To(ContainSubstring(getCountMetricLine(qwenModelName, reqRunningMetricName, 0)))
+				Expect(metrics).To(ContainSubstring(getCountMetricLine(qwenModelName, reqWaitingMetricName, 0)))
+				Expect(metrics).To(ContainSubstring(getCountMetricLine(qwenModelName, gpuCacheUsageMetricName, 0)))
 			}()
 			wg.Wait()
 		})

@@ -592,9 +592,9 @@ var _ = Describe("Simulator metrics", Ordered, func() {
 				// The requests were sent with 500 millisecond intervals, and the first two should be still running.
 				// The third is waiting, and is still not in the kv-cache.
 				// We expect one block in the kv-cache, usage 1/16=0.0625.
-				Expect(metrics).To(ContainSubstring("vllm:num_requests_running{model_name=\"Qwen/Qwen2-0.5B\"} 2"))
-				Expect(metrics).To(ContainSubstring("vllm:num_requests_waiting{model_name=\"Qwen/Qwen2-0.5B\"} 1"))
-				Expect(metrics).To(ContainSubstring("vllm:gpu_cache_usage_perc{model_name=\"Qwen/Qwen2-0.5B\"} 0.0625"))
+				Expect(metrics).To(ContainSubstring(getCountMetricLine(qwenModelName, reqRunningMetricName, 2)))
+				Expect(metrics).To(ContainSubstring(getCountMetricLine(qwenModelName, reqWaitingMetricName, 1)))
+				Expect(metrics).To(ContainSubstring(getCountMetricLine(qwenModelName, gpuCacheUsageMetricName, 0.0625)))
 			}()
 			wg.Wait()
 		})

@@ -645,9 +645,9 @@ var _ = Describe("Simulator metrics", Ordered, func() {
 			data, err := io.ReadAll(resp.Body)
 			Expect(err).NotTo(HaveOccurred())
 			metrics := string(data)
-			Expect(metrics).To(ContainSubstring("vllm:num_requests_running{model_name=\"testmodel\"} 10"))
-			Expect(metrics).To(ContainSubstring("vllm:num_requests_waiting{model_name=\"testmodel\"} 30"))
-			Expect(metrics).To(ContainSubstring("vllm:gpu_cache_usage_perc{model_name=\"testmodel\"} 0.4"))
+			Expect(metrics).To(ContainSubstring(getCountMetricLine(testModel, reqRunningMetricName, 10)))
+			Expect(metrics).To(ContainSubstring(getCountMetricLine(testModel, reqWaitingMetricName, 30)))
+			Expect(metrics).To(ContainSubstring(getCountMetricLine(testModel, gpuCacheUsageMetricName, 0.4)))
 			Expect(metrics).To(ContainSubstring("vllm:lora_requests_info{max_lora=\"1\",running_lora_adapters=\"lora4,lora2\",waiting_lora_adapters=\"lora3\"} 1.257894567e+09"))
 			Expect(metrics).To(ContainSubstring("vllm:lora_requests_info{max_lora=\"1\",running_lora_adapters=\"lora4,lora3\",waiting_lora_adapters=\"\"} 1.257894569e+09"))
```

pkg/llm-d-inference-sim/test_utils.go

Lines changed: 9 additions & 1 deletion
```diff
@@ -245,7 +245,7 @@ func getLastLoraMetrics(metrics []string) ([]string, error) {
 	lastTimestamp := float64(0)
 	var lastMetrics []string
 	for _, metric := range metrics {
-		if strings.HasPrefix(metric, "vllm:lora_requests_info") {
+		if strings.HasPrefix(metric, loraRequestsMetricName) {
 			timestamp, err := extractTimestamp(metric)
 			if err != nil {
 				return nil, err

@@ -347,6 +347,14 @@ func getFloatBucketMetricLine(model string, metric string, bucketBoundary float64, count int) string {
 	return fmt.Sprintf("%s %d", getFloatBucketMetricPrefix(model, metric, bucketBoundary), count)
 }

+func getCountMetricPrefix(model string, metric string) string {
+	return fmt.Sprintf("%s{model_name=\"%s\"}", metric, model)
+}
+
+func getCountMetricLine(model string, metric string, count float64) string {
+	return fmt.Sprintf("%s %g", getCountMetricPrefix(model, metric), count)
+}
+
 // same as getFloatBucketMetricLine but without the value part
 func getFloatBucketMetricPrefix(model string, metric string, bucketBoundary float64) string {
 	buckerBoundStr := "+Inf"
```
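For reference, a hypothetical use of the two helpers added above, assuming testModel is "testmodel" as in the tests:

```go
line := getCountMetricLine(testModel, reqRunningMetricName, 2)
// line == `vllm:num_requests_running{model_name="testmodel"} 2`

prefix := getCountMetricPrefix(testModel, reqWaitingMetricName)
// prefix == `vllm:num_requests_waiting{model_name="testmodel"}`
```

The %g verb renders 2 as 2 and 0.0625 as 0.0625, matching the exposition format the simulator emits for gauge values, which is why the tests can match whole lines.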

pkg/llm-d-inference-sim/worker_test.go

Lines changed: 4 additions & 4 deletions
```diff
@@ -300,8 +300,8 @@ var _ = Describe("Simulator requests scheduling", Ordered, func() {

 			// max-num-seqs is 12, so number of running requests should be 12
 			// and the number of waiting requests 1000-12=988
-			Expect(metrics).To(ContainSubstring("vllm:num_requests_running{model_name=\"testmodel\"} 12"))
-			Expect(metrics).To(ContainSubstring("vllm:num_requests_waiting{model_name=\"testmodel\"} 988"))
+			Expect(metrics).To(ContainSubstring(getCountMetricLine(testModel, reqRunningMetricName, 12)))
+			Expect(metrics).To(ContainSubstring(getCountMetricLine(testModel, reqWaitingMetricName, 988)))

 			// max-loras is 2, so the last lora metric should be:
 			// running: two loras (doesn't matter which two)

@@ -326,8 +326,8 @@ var _ = Describe("Simulator requests scheduling", Ordered, func() {
 	})

 	It("Should work correctly with many simultaneous requests with many workers", func() {
-		runningMetric := "vllm:num_requests_running{model_name=\"testmodel\"}"
-		waitingMetric := "vllm:num_requests_waiting{model_name=\"testmodel\"}"
+		runningMetric := getCountMetricPrefix(testModel, reqRunningMetricName)
+		waitingMetric := getCountMetricPrefix(testModel, reqWaitingMetricName)
 		ctx := context.TODO()
 		args := []string{"cmd", "--model", testModel, "--mode", common.ModeRandom,
 			"--time-to-first-token", "2000", "--time-to-first-token-std-dev", "600",
```

pkg/vllm-api/vllm-models.go

Lines changed: 0 additions & 3 deletions
```diff
@@ -26,9 +26,6 @@ const (
 	PromLabelMaxLora      = "max_lora"
 	PromLabelModelName    = "model_name"
 	PromLabelFinishReason = "finish_reason"
-
-	VllmLoraRequestInfo    = "vllm:lora_requests_info"
-	VllmNumRequestsRunning = "vllm:num_requests_running"
 )

 // modelInfo defines data about model returned by /models API
```
