Commit bc7fbe1

feat(metrics): add request prompt, generation, max_tokens and success metrics

Signed-off-by: googs1025 <[email protected]>
1 parent 699452c

7 files changed, +319 -13 lines changed

manifests/config_with_fake.yaml

Lines changed: 9 additions & 1 deletion
@@ -7,10 +7,18 @@ time-to-first-token: 2000
 inter-token-latency: 1000
 kv-cache-transfer-latency: 100
 seed: 100100100
-fake-metrics:
+fake-metrics:
   running-requests: 16
   waiting-requests: 3
   kv-cache-usage: 0.3
+  request-success-total:
+    stop: 20
+    length: 0
+    tool_calls: 0
+    remote_decode: 0
+  request-prompt-tokens: [ 10, 20, 30, 15 ]
+  request-generation-tokens: [ 50, 60, 40 ]
+  request-params-max-tokens: [ 128, 256, 512 ]
 loras:
   - '{"running":"lora1,lora2","waiting":"lora3","timestamp":1257894567}'
   - '{"running":"lora1,lora3","waiting":"","timestamp":1257894569}'

pkg/common/config.go

Lines changed: 7 additions & 0 deletions
@@ -186,6 +186,13 @@ type Metrics struct {
 	WaitingRequests int64 `yaml:"waiting-requests" json:"waiting-requests"`
 	// KVCacheUsagePercentage is the fraction of KV-cache blocks currently in use (from 0 to 1)
 	KVCacheUsagePercentage float32 `yaml:"kv-cache-usage" json:"kv-cache-usage"`
+	// RequestPromptTokens, RequestGenerationTokens and RequestParamsMaxTokens are histogram
+	// fake-observation arrays for initialization; each value is passed to Observe() once at start-up.
+	RequestPromptTokens     []float64 `yaml:"request-prompt-tokens" json:"request-prompt-tokens"`         // prompt-length samples
+	RequestGenerationTokens []float64 `yaml:"request-generation-tokens" json:"request-generation-tokens"` // generation-length samples
+	RequestParamsMaxTokens  []float64 `yaml:"request-params-max-tokens" json:"request-params-max-tokens"` // max_tokens parameter samples
+	// RequestSuccessTotal is the number of successful requests, keyed by finish reason (stop, length, etc.).
+	RequestSuccessTotal map[string]int64 `yaml:"request-success-total" json:"request-success-total"`
 }
 
 type LorasMetrics struct {
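To make the new keys concrete, here is a small standalone sketch of how they unmarshal into the fields added above. It is not the simulator's own config loader (which is not part of this diff) and it assumes gopkg.in/yaml.v3; the trimmed-down struct mirrors only the fields introduced in this commit.

package main

import (
	"fmt"

	"gopkg.in/yaml.v3"
)

// fakeMetrics mirrors only the Metrics fields introduced in this commit.
type fakeMetrics struct {
	RequestPromptTokens     []float64        `yaml:"request-prompt-tokens"`
	RequestGenerationTokens []float64        `yaml:"request-generation-tokens"`
	RequestParamsMaxTokens  []float64        `yaml:"request-params-max-tokens"`
	RequestSuccessTotal     map[string]int64 `yaml:"request-success-total"`
}

func main() {
	doc := []byte(`
request-success-total:
  stop: 20
  length: 0
request-prompt-tokens: [10, 20, 30, 15]
request-generation-tokens: [50, 60, 40]
request-params-max-tokens: [128, 256, 512]
`)
	var m fakeMetrics
	if err := yaml.Unmarshal(doc, &m); err != nil {
		panic(err)
	}
	// Each slice element is later passed to Observe() once; each map entry to Add().
	fmt.Printf("%+v\n", m)
}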

pkg/common/config_test.go

Lines changed: 9 additions & 0 deletions
@@ -203,6 +203,15 @@ var _ = Describe("Simulator configuration", func() {
 				"{\"running\":\"lora1,lora2\",\"waiting\":\"lora3\",\"timestamp\":1257894567}",
 				"{\"running\":\"lora1,lora3\",\"waiting\":\"\",\"timestamp\":1257894569}",
 			},
+			RequestPromptTokens:     []float64{10, 20, 30, 15},
+			RequestGenerationTokens: []float64{50, 60, 40},
+			RequestParamsMaxTokens:  []float64{128, 256, 512},
+			RequestSuccessTotal: map[string]int64{
+				StopFinishReason:         20,
+				LengthFinishReason:       0,
+				ToolsFinishReason:        0,
+				RemoteDecodeFinishReason: 0,
+			},
 		}
 		test = testCase{
 			name: "config with fake metrics file",

pkg/llm-d-inference-sim/metrics.go

Lines changed: 140 additions & 1 deletion
@@ -20,6 +20,7 @@ package llmdinferencesim
 
 import (
 	"context"
+	"math"
 	"strconv"
 	"strings"
 	"sync"
@@ -94,6 +95,61 @@ func (s *VllmSimulator) createAndRegisterPrometheus() error {
 		return err
 	}
 
+	s.requestPromptTokens = prometheus.NewHistogramVec(
+		prometheus.HistogramOpts{
+			Subsystem: "",
+			Name:      "vllm:request_prompt_tokens",
+			Help:      "Number of prefill tokens processed.",
+			Buckets:   build125Buckets(s.config.MaxModelLen),
+		},
+		[]string{vllmapi.PromLabelModelName},
+	)
+	if err := s.registry.Register(s.requestPromptTokens); err != nil {
+		s.logger.Error(err, "Prometheus request_prompt_tokens histogram register failed")
+		return err
+	}
+
+	s.requestGenerationTokens = prometheus.NewHistogramVec(
+		prometheus.HistogramOpts{
+			Subsystem: "",
+			Name:      "vllm:request_generation_tokens",
+			Help:      "Number of generation tokens processed.",
+			Buckets:   build125Buckets(s.config.MaxModelLen),
+		},
+		[]string{vllmapi.PromLabelModelName},
+	)
+	if err := s.registry.Register(s.requestGenerationTokens); err != nil {
+		s.logger.Error(err, "Prometheus request_generation_tokens histogram register failed")
+		return err
+	}
+
+	s.requestParamsMaxTokens = prometheus.NewHistogramVec(
+		prometheus.HistogramOpts{
+			Subsystem: "",
+			Name:      "vllm:request_params_max_tokens",
+			Help:      "Histogram of the max_tokens request parameter.",
+			Buckets:   build125Buckets(s.config.MaxModelLen),
+		},
+		[]string{vllmapi.PromLabelModelName},
+	)
+	if err := s.registry.Register(s.requestParamsMaxTokens); err != nil {
+		s.logger.Error(err, "Prometheus request_params_max_tokens histogram register failed")
+		return err
+	}
+
+	s.requestSuccessTotal = prometheus.NewCounterVec(
+		prometheus.CounterOpts{
+			Subsystem: "",
+			Name:      "vllm:request_success_total",
+			Help:      "Count of successfully processed requests.",
+		},
+		[]string{vllmapi.PromLabelModelName, vllmapi.PromLabelFinishReason},
+	)
+	if err := s.registry.Register(s.requestSuccessTotal); err != nil {
+		s.logger.Error(err, "Prometheus request_success_total counter register failed")
+		return err
+	}
+
 	s.setInitialPrometheusMetrics()
 
 	return nil
@@ -103,12 +159,25 @@ func (s *VllmSimulator) createAndRegisterPrometheus() error {
 // the fake metrics if set
 func (s *VllmSimulator) setInitialPrometheusMetrics() {
 	var nRunningReqs, nWaitingReqs, kvCacheUsage float64
+	modelName := s.getDisplayedModelName(s.config.Model)
 	if s.config.FakeMetrics != nil {
 		nRunningReqs = float64(s.config.FakeMetrics.RunningRequests)
 		nWaitingReqs = float64(s.config.FakeMetrics.WaitingRequests)
 		kvCacheUsage = float64(s.config.FakeMetrics.KVCacheUsagePercentage)
+		for _, requestPromptToken := range s.config.FakeMetrics.RequestPromptTokens {
+			s.requestPromptTokens.WithLabelValues(modelName).Observe(requestPromptToken)
+		}
+		for _, requestGenerationToken := range s.config.FakeMetrics.RequestGenerationTokens {
+			s.requestGenerationTokens.WithLabelValues(modelName).Observe(requestGenerationToken)
+		}
+		for _, requestParamsMaxToken := range s.config.FakeMetrics.RequestParamsMaxTokens {
+			s.requestParamsMaxTokens.WithLabelValues(modelName).Observe(requestParamsMaxToken)
+		}
+		for reason, requestSuccessTotal := range s.config.FakeMetrics.RequestSuccessTotal {
+			s.requestSuccessTotal.WithLabelValues(modelName, reason).Add(float64(requestSuccessTotal))
+		}
 	}
-	modelName := s.getDisplayedModelName(s.config.Model)
 	s.runningRequests.WithLabelValues(modelName).Set(nRunningReqs)
 	s.waitingRequests.WithLabelValues(modelName).Set(nWaitingReqs)
 	s.kvCacheUsagePercentage.WithLabelValues(modelName).Set(kvCacheUsage)
@@ -198,6 +267,7 @@ func (s *VllmSimulator) startMetricsUpdaters(ctx context.Context) {
 	go s.runningRequestsUpdater(ctx)
 	go s.lorasUpdater(ctx)
 	go s.kvCacheUsageUpdater(ctx)
+	go s.recordRequestUpdater(ctx)
 }
 
 // waitingRequestsUpdater updates the waiting requests metric by listening on the relevant channel
@@ -282,3 +352,72 @@ func (s *VllmSimulator) decrementLoraRefCount(lora string, theMap *sync.Map) {
 		s.logger.Error(nil, "Zero model reference", "model", lora)
 	}
 }
+
+// recordRequestUpdater listens on requestSuccessChan and drives the Prometheus metric
+// for successfully completed requests.
+func (s *VllmSimulator) recordRequestUpdater(ctx context.Context) {
+	for {
+		select {
+		case <-ctx.Done():
+			return
+		case event := <-s.requestSuccessChan:
+			s.recordRequestMetricsOnSuccess(
+				event.promptTokens,
+				event.generationTokens,
+				event.maxTokens,
+				event.finishReason,
+			)
+		}
+	}
+}
+
+// requestSuccessEvent represents the data associated with a successfully completed request,
+// which is sent through the requestSuccessChan for asynchronous metrics recording.
+type requestSuccessEvent struct {
+	// promptTokens is the number of input (prompt) tokens in the request
+	promptTokens int
+	// generationTokens is the number of generated (output) tokens in the response
+	generationTokens int
+	// maxTokens is the maximum number of tokens allowed for generation (if specified in the request)
+	maxTokens *int64
+	// finishReason indicates why the generation stopped (e.g., "stop", "length", "tool_calls")
+	finishReason string
+}
+
+// recordRequestMetricsOnSuccess records metrics for a successfully completed request
+func (s *VllmSimulator) recordRequestMetricsOnSuccess(promptTokens,
+	generationTokens int, maxTokens *int64, finishReason string) {
+	modelName := s.getDisplayedModelName(s.config.Model)
+	s.requestPromptTokens.WithLabelValues(modelName).Observe(float64(promptTokens))
+	s.requestGenerationTokens.WithLabelValues(modelName).Observe(float64(generationTokens))
+	if maxTokens != nil {
+		s.requestParamsMaxTokens.WithLabelValues(modelName).Observe(float64(*maxTokens))
+	}
+	s.requestSuccessTotal.WithLabelValues(modelName, finishReason).Inc()
+}
+
+// build125Buckets generates histogram buckets in powers of 10 scaled by [1, 2, 5].
+// This matches vLLM's build_1_2_5_buckets() in metrics.py.
+//
+// Reference: https://github.com/vllm-project/vllm/blob/main/vllm/engine/metrics.py#L175
+func build125Buckets(maxValue int) []float64 {
+	var buckets []float64
+	exponent := 0
+	mantissa := []int{1, 2, 5}
+
+	for {
+		complete := true
+		for _, m := range mantissa {
+			value := m * int(math.Pow10(exponent))
+			if value <= maxValue {
+				buckets = append(buckets, float64(value))
+				complete = false
+			}
+		}
+		if complete {
+			break
+		}
+		exponent++
+	}
	return buckets
+}
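This hunk only adds the consumer side: recordRequestUpdater drains requestSuccessChan, and recordRequestMetricsOnSuccess performs the actual Observe/Inc calls. The channel field on VllmSimulator and the send on request completion are not part of this diff, so the following is only a hedged sketch of what the producer side could look like; the helper name and the non-blocking send are illustrative, not the project's actual code.

// Assumed field on VllmSimulator (not shown in this diff):
//	requestSuccessChan chan requestSuccessEvent

// Illustrative helper a request handler might call once a response has finished.
func (s *VllmSimulator) reportRequestSuccess(promptTokens, generationTokens int,
	maxTokens *int64, finishReason string) {
	event := requestSuccessEvent{
		promptTokens:     promptTokens,
		generationTokens: generationTokens,
		maxTokens:        maxTokens,
		finishReason:     finishReason,
	}
	select {
	case s.requestSuccessChan <- event:
		// recordRequestUpdater picks this up asynchronously
	default:
		// if the channel is full, drop the sample rather than block the request path
	}
}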

pkg/llm-d-inference-sim/metrics_test.go

Lines changed: 122 additions & 0 deletions
@@ -22,11 +22,13 @@ import (
 	"io"
 	"net/http"
 	"os"
+	"reflect"
 	"regexp"
 	"sort"
 	"strconv"
 	"strings"
 	"sync"
+	"testing"
 	"time"
 
 	"github.com/llm-d/llm-d-inference-sim/pkg/common"
@@ -114,6 +116,52 @@ var _ = Describe("Simulator metrics", Ordered, func() {
 		wg.Wait()
 	})
 
+	It("Should record correct prompt and generation token counts", func() {
+		modelName := "testmodel"
+		prompt := strings.Repeat("hello ", 10) // ~10 tokens
+		maxTokens := 25
+
+		ctx := context.TODO()
+		args := []string{"cmd", "--model", modelName, "--mode", common.ModeRandom,
+			"--time-to-first-token", "100", "--token-generation-delay", "10",
+			"--max-num-seqs", "4"}
+
+		client, err := startServerWithArgs(ctx, common.ModeRandom, args, nil)
+		Expect(err).NotTo(HaveOccurred())
+
+		openaiclient := openai.NewClient(
+			option.WithBaseURL(baseURL),
+			option.WithHTTPClient(client))
+
+		params := openai.ChatCompletionNewParams{
+			Messages: []openai.ChatCompletionMessageParamUnion{
+				openai.UserMessage(prompt),
+			},
+			Model:       modelName,
+			MaxTokens:   openai.Int(int64(maxTokens)),
+			Temperature: openai.Float(0.0),
+		}
+
+		_, err = openaiclient.Chat.Completions.New(ctx, params)
+		Expect(err).NotTo(HaveOccurred())
+
+		time.Sleep(500 * time.Millisecond)
+
+		metricsResp, err := client.Get(metricsUrl)
+		Expect(err).NotTo(HaveOccurred())
+		Expect(metricsResp.StatusCode).To(Equal(http.StatusOK))
+
+		data, err := io.ReadAll(metricsResp.Body)
+		Expect(err).NotTo(HaveOccurred())
+		metrics := string(data)
+
+		Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="testmodel",le="16"}`))
+		Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_count{model_name="testmodel"} 1`))
+		Expect(metrics).To(ContainSubstring(`vllm:request_generation_tokens_bucket{model_name="testmodel",le="32"}`))
+		Expect(metrics).To(ContainSubstring(`vllm:request_generation_tokens_count{model_name="testmodel"} 1`))
+		Expect(metrics).To(ContainSubstring(`vllm:request_success_total{model_name="testmodel",finish_reason="stop"} 1`))
+	})
+
 	It("Should send correct lora metrics", func() {
 		ctx := context.TODO()
 		args := []string{"cmd", "--model", model, "--mode", common.ModeRandom,
@@ -572,3 +620,77 @@ func splitString(str string) []string {
 	}
 	return strings.Split(str, ",")
 }
+
+// TestBuild125Buckets tests the build125Buckets function with various inputs.
+func TestBuild125Buckets(t *testing.T) {
+	tests := []struct {
+		name     string
+		maxValue int
+		want     []float64
+	}{
+		{
+			name:     "max_value zero",
+			maxValue: 0,
+			want:     nil, // no bucket <= 0
+		},
+		{
+			name:     "max_value one",
+			maxValue: 1,
+			want:     []float64{1},
+		},
+		{
+			name:     "max_value five",
+			maxValue: 5,
+			want:     []float64{1, 2, 5},
+		},
+		{
+			name:     "max_value ten",
+			maxValue: 10,
+			want:     []float64{1, 2, 5, 10},
+		},
+		{
+			name:     "max_value 100",
+			maxValue: 100,
+			want:     []float64{1, 2, 5, 10, 20, 50, 100},
+		},
+		{
+			name:     "max_value 999",
+			maxValue: 999,
+			want:     []float64{1, 2, 5, 10, 20, 50, 100, 200, 500},
+		},
+		{
+			name:     "max_value 1024",
+			maxValue: 1024,
+			want:     []float64{1, 2, 5, 10, 20, 50, 100, 200, 500, 1000},
+		},
+		{
+			name:     "max_value 4096",
+			maxValue: 4096,
+			want:     []float64{1, 2, 5, 10, 20, 50, 100, 200, 500, 1000, 2000},
+		},
+		{
+			name:     "max_value 32768",
+			maxValue: 32768,
+			want:     []float64{1, 2, 5, 10, 20, 50, 100, 200, 500, 1000, 2000, 5000, 10000, 20000},
+		},
+		{
+			name:     "max_value just below power of 10",
+			maxValue: 99,
+			want:     []float64{1, 2, 5, 10, 20, 50},
+		},
+		{
+			name:     "max_value negative",
+			maxValue: -1,
+			want:     nil, // no positive bucket <= -1
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			got := build125Buckets(tt.maxValue)
+			if !reflect.DeepEqual(got, tt.want) {
+				t.Errorf("build125Buckets(%d) = %v, want %v", tt.maxValue, got, tt.want)
+			}
+		})
+	}
+}
