Commit a6edf2d

feat(metrics): add request prompt, generation, max_tokens and success metrics
Signed-off-by: googs1025 <[email protected]>
1 parent 699452c commit a6edf2d

7 files changed: +316, -13 lines changed

manifests/config_with_fake.yaml

Lines changed: 9 additions & 1 deletion

@@ -7,10 +7,18 @@ time-to-first-token: 2000
 inter-token-latency: 1000
 kv-cache-transfer-latency: 100
 seed: 100100100
-fake-metrics:
+fake-metrics:
   running-requests: 16
   waiting-requests: 3
   kv-cache-usage: 0.3
+  request-success-total:
+    stop: 20
+    length: 0
+    tool_calls: 0
+    remote_decode: 0
+  request-prompt-tokens: [ 10, 20, 30, 15 ]
+  request-generation-tokens: [ 50, 60, 40 ]
+  request-params-max-tokens: [ 128, 256, 512 ]
 loras:
   - '{"running":"lora1,lora2","waiting":"lora3","timestamp":1257894567}'
   - '{"running":"lora1,lora3","waiting":"","timestamp":1257894569}'

pkg/common/config.go

Lines changed: 7 additions & 0 deletions

@@ -186,6 +186,13 @@ type Metrics struct {
     WaitingRequests int64 `yaml:"waiting-requests" json:"waiting-requests"`
     // KVCacheUsagePercentage is the fraction of KV-cache blocks currently in use (from 0 to 1)
     KVCacheUsagePercentage float32 `yaml:"kv-cache-usage" json:"kv-cache-usage"`
+    // RequestPromptTokens, RequestGenerationTokens and RequestParamsMaxTokens are histogram
+    // fake-observation arrays for init. Each value will be passed to Observe() once at start-up.
+    RequestPromptTokens []float64 `yaml:"request-prompt-tokens" json:"request-prompt-tokens"` // prompt-length samples
+    RequestGenerationTokens []float64 `yaml:"request-generation-tokens" json:"request-generation-tokens"` // generation-length samples
+    RequestParamsMaxTokens []float64 `yaml:"request-params-max-tokens" json:"request-params-max-tokens"` // max_tokens parameter samples
+    // RequestSuccessTotal is the number of successful requests, keyed by finish reason (stop, length, etc.).
+    RequestSuccessTotal map[string]int64 `yaml:"request-success-total" json:"request-success-total"`
 }

 type LorasMetrics struct {

pkg/common/config_test.go

Lines changed: 9 additions & 0 deletions

@@ -203,6 +203,15 @@ var _ = Describe("Simulator configuration", func() {
         "{\"running\":\"lora1,lora2\",\"waiting\":\"lora3\",\"timestamp\":1257894567}",
         "{\"running\":\"lora1,lora3\",\"waiting\":\"\",\"timestamp\":1257894569}",
     },
+    RequestPromptTokens:     []float64{10, 20, 30, 15},
+    RequestGenerationTokens: []float64{50, 60, 40},
+    RequestParamsMaxTokens:  []float64{128, 256, 512},
+    RequestSuccessTotal: map[string]int64{
+        StopFinishReason:         20,
+        LengthFinishReason:       0,
+        ToolsFinishReason:        0,
+        RemoteDecodeFinishReason: 0,
+    },
 }
 test = testCase{
     name: "config with fake metrics file",

pkg/llm-d-inference-sim/metrics.go

Lines changed: 140 additions & 1 deletion

@@ -20,6 +20,7 @@ package llmdinferencesim

 import (
     "context"
+    "math"
     "strconv"
     "strings"
     "sync"

@@ -94,6 +95,61 @@ func (s *VllmSimulator) createAndRegisterPrometheus() error {
         return err
     }

+    s.requestPromptTokens = prometheus.NewHistogramVec(
+        prometheus.HistogramOpts{
+            Subsystem: "",
+            Name:      "vllm:request_prompt_tokens",
+            Help:      "Number of prefill tokens processed.",
+            Buckets:   build125Buckets(s.config.MaxModelLen),
+        },
+        []string{vllmapi.PromLabelModelName},
+    )
+    if err := s.registry.Register(s.requestPromptTokens); err != nil {
+        s.logger.Error(err, "Prometheus request_prompt_tokens histogram register failed")
+        return err
+    }
+
+    s.requestGenerationTokens = prometheus.NewHistogramVec(
+        prometheus.HistogramOpts{
+            Subsystem: "",
+            Name:      "vllm:request_generation_tokens",
+            Help:      "Number of generation tokens processed.",
+            Buckets:   build125Buckets(s.config.MaxModelLen),
+        },
+        []string{vllmapi.PromLabelModelName},
+    )
+    if err := s.registry.Register(s.requestGenerationTokens); err != nil {
+        s.logger.Error(err, "Prometheus request_generation_tokens histogram register failed")
+        return err
+    }
+
+    s.requestParamsMaxTokens = prometheus.NewHistogramVec(
+        prometheus.HistogramOpts{
+            Subsystem: "",
+            Name:      "vllm:request_params_max_tokens",
+            Help:      "Histogram of the max_tokens request parameter.",
+            Buckets:   build125Buckets(s.config.MaxModelLen),
+        },
+        []string{vllmapi.PromLabelModelName},
+    )
+    if err := s.registry.Register(s.requestParamsMaxTokens); err != nil {
+        s.logger.Error(err, "Prometheus request_params_max_tokens histogram register failed")
+        return err
+    }
+
+    s.requestSuccessTotal = prometheus.NewCounterVec(
+        prometheus.CounterOpts{
+            Subsystem: "",
+            Name:      "vllm:request_success_total",
+            Help:      "Count of successfully processed requests.",
+        },
+        []string{vllmapi.PromLabelModelName, vllmapi.PromLabelFinishReason},
+    )
+    if err := s.registry.Register(s.requestSuccessTotal); err != nil {
+        s.logger.Error(err, "Prometheus request_success_total counter register failed")
+        return err
+    }
+
     s.setInitialPrometheusMetrics()

     return nil

@@ -103,12 +159,25 @@ func (s *VllmSimulator) createAndRegisterPrometheus() error {
 // the fake metrics if set
 func (s *VllmSimulator) setInitialPrometheusMetrics() {
     var nRunningReqs, nWaitingReqs, kvCacheUsage float64
+    modelName := s.getDisplayedModelName(s.config.Model)
     if s.config.FakeMetrics != nil {
         nRunningReqs = float64(s.config.FakeMetrics.RunningRequests)
         nWaitingReqs = float64(s.config.FakeMetrics.WaitingRequests)
         kvCacheUsage = float64(s.config.FakeMetrics.KVCacheUsagePercentage)
+        for _, requestPromptToken := range s.config.FakeMetrics.RequestPromptTokens {
+            s.requestPromptTokens.WithLabelValues(modelName).Observe(requestPromptToken)
+        }
+        for _, requestGenerationToken := range s.config.FakeMetrics.RequestGenerationTokens {
+            s.requestGenerationTokens.WithLabelValues(modelName).Observe(requestGenerationToken)
+        }
+        for _, requestParamsMaxToken := range s.config.FakeMetrics.RequestParamsMaxTokens {
+            s.requestParamsMaxTokens.WithLabelValues(modelName).Observe(requestParamsMaxToken)
+        }
+        for reason, requestSuccessTotal := range s.config.FakeMetrics.RequestSuccessTotal {
+            s.requestSuccessTotal.WithLabelValues(modelName, reason).Add(float64(requestSuccessTotal))
+        }
+
     }
-    modelName := s.getDisplayedModelName(s.config.Model)
     s.runningRequests.WithLabelValues(modelName).Set(nRunningReqs)
     s.waitingRequests.WithLabelValues(modelName).Set(nWaitingReqs)
     s.kvCacheUsagePercentage.WithLabelValues(modelName).Set(kvCacheUsage)

@@ -198,6 +267,7 @@ func (s *VllmSimulator) startMetricsUpdaters(ctx context.Context) {
     go s.runningRequestsUpdater(ctx)
     go s.lorasUpdater(ctx)
     go s.kvCacheUsageUpdater(ctx)
+    go s.recordRequestUpdater(ctx)
 }

 // waitingRequestsUpdater updates the waiting requests metric by listening on the relevant channel

@@ -282,3 +352,72 @@ func (s *VllmSimulator) decrementLoraRefCount(lora string, theMap *sync.Map) {
         s.logger.Error(nil, "Zero model reference", "model", lora)
     }
 }
+
+// recordRequestUpdater listens on requestSuccessChan and drives the Prometheus metric
+// for successfully completed requests.
+func (s *VllmSimulator) recordRequestUpdater(ctx context.Context) {
+    for {
+        select {
+        case <-ctx.Done():
+            return
+        case event := <-s.requestSuccessChan:
+            s.recordRequestMetricsOnSuccess(
+                event.promptTokens,
+                event.generationTokens,
+                event.maxTokens,
+                event.finishReason,
+            )
+        }
+    }
+}
+
+// requestSuccessEvent represents the data associated with a successfully completed request,
+// which is sent through the requestSuccessChan for asynchronous metrics recording.
+type requestSuccessEvent struct {
+    // promptTokens is the number of input (prompt) tokens in the request
+    promptTokens int
+    // generationTokens is the number of generated (output) tokens in the response
+    generationTokens int
+    // maxTokens is the maximum number of tokens allowed for generation (if specified in the request)
+    maxTokens *int64
+    // finishReason indicates why the generation stopped (e.g., "stop", "length", "tool_calls")
+    finishReason string
+}
+
+// recordRequestMetricsOnSuccess records metrics for a successfully completed request
+func (s *VllmSimulator) recordRequestMetricsOnSuccess(promptTokens,
+    generationTokens int, maxTokens *int64, finishReason string) {
+    modelName := s.getDisplayedModelName(s.config.Model)
+    s.requestPromptTokens.WithLabelValues(modelName).Observe(float64(promptTokens))
+    s.requestGenerationTokens.WithLabelValues(modelName).Observe(float64(generationTokens))
+    if maxTokens != nil {
+        s.requestParamsMaxTokens.WithLabelValues(modelName).Observe(float64(*maxTokens))
+    }
+    s.requestSuccessTotal.WithLabelValues(modelName, finishReason).Inc()
+}
+
+// build125Buckets generates histogram buckets in powers of 10 scaled by [1, 2, 5].
+// This matches vLLM's build_1_2_5_buckets() in metrics.py.
+//
+// Reference: https://github.com/vllm-project/vllm/blob/main/vllm/engine/metrics.py#L175
+func build125Buckets(maxValue int) []float64 {
+    var buckets []float64
+    exponent := 0
+    mantissa := []int{1, 2, 5}
+
+    for {
+        complete := true
+        for _, m := range mantissa {
+            value := m * int(math.Pow10(exponent))
+            if value <= maxValue {
+                buckets = append(buckets, float64(value))
+                complete = false
+            }
+        }
+        if complete {
+            break
+        }
+        exponent++
+    }
+    return buckets
+}
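
For illustration only (not part of this commit), the following standalone sketch shows how the start-up Observe() calls over the fake prompt-token samples become cumulative histogram bucket counts. The metric and label names mirror the ones registered above; the bucket list is hard-coded to the 1-2-5 series build125Buckets would produce for an assumed max-model-len of 100, and the model name "my-model" is a placeholder.

package main

import (
    "fmt"

    "github.com/prometheus/client_golang/prometheus"
)

func main() {
    reg := prometheus.NewRegistry()

    // Same shape as s.requestPromptTokens above; buckets assume max-model-len = 100.
    promptTokens := prometheus.NewHistogramVec(
        prometheus.HistogramOpts{
            Name:    "vllm:request_prompt_tokens",
            Help:    "Number of prefill tokens processed.",
            Buckets: []float64{1, 2, 5, 10, 20, 50, 100},
        },
        []string{"model_name"},
    )
    reg.MustRegister(promptTokens)

    // The fake samples from manifests/config_with_fake.yaml, observed once each.
    for _, v := range []float64{10, 20, 30, 15} {
        promptTokens.WithLabelValues("my-model").Observe(v)
    }

    // Dump the cumulative bucket counts: le=10 -> 1, le=20 -> 3, le=50 -> 4, le=100 -> 4.
    mfs, err := reg.Gather()
    if err != nil {
        panic(err)
    }
    for _, b := range mfs[0].GetMetric()[0].GetHistogram().GetBucket() {
        fmt.Printf("le=%g count=%d\n", b.GetUpperBound(), b.GetCumulativeCount())
    }
}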

pkg/llm-d-inference-sim/metrics_test.go

Lines changed: 119 additions & 0 deletions

@@ -22,11 +22,13 @@ import (
     "io"
     "net/http"
     "os"
+    "reflect"
     "regexp"
     "sort"
     "strconv"
     "strings"
     "sync"
+    "testing"
     "time"

     "github.com/llm-d/llm-d-inference-sim/pkg/common"

@@ -114,6 +116,49 @@ var _ = Describe("Simulator metrics", Ordered, func() {
         wg.Wait()
     })

+    It("Should record correct prompt and generation token counts", func() {
+        modelName := "testmodel"
+        prompt := strings.Repeat("hello ", 10)
+        maxTokens := 25
+
+        ctx := context.TODO()
+        args := []string{"cmd", "--model", modelName, "--mode", common.ModeRandom,
+            "--time-to-first-token", "100", "--max-num-seqs", "4"}
+
+        client, err := startServerWithArgs(ctx, common.ModeRandom, args, nil)
+        Expect(err).NotTo(HaveOccurred())
+
+        openaiclient := openai.NewClient(
+            option.WithBaseURL(baseURL),
+            option.WithHTTPClient(client))
+
+        params := openai.ChatCompletionNewParams{
+            Messages: []openai.ChatCompletionMessageParamUnion{
+                openai.UserMessage(prompt),
+            },
+            Model:       modelName,
+            MaxTokens:   openai.Int(int64(maxTokens)),
+            Temperature: openai.Float(0.0),
+        }
+
+        _, err = openaiclient.Chat.Completions.New(ctx, params)
+        Expect(err).NotTo(HaveOccurred())
+
+        time.Sleep(500 * time.Millisecond)
+
+        metricsResp, err := client.Get(metricsUrl)
+        Expect(err).NotTo(HaveOccurred())
+        Expect(metricsResp.StatusCode).To(Equal(http.StatusOK))
+
+        data, err := io.ReadAll(metricsResp.Body)
+        Expect(err).NotTo(HaveOccurred())
+        metrics := string(data)
+        Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="testmodel",le="50"} 1`))
+        Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="testmodel",le="50"} 1`))
+        Expect(metrics).To(ContainSubstring(`vllm:request_generation_tokens_count{model_name="testmodel"} 1`))
+        Expect(metrics).To(ContainSubstring(`vllm:request_success_total{finish_reason="stop",model_name="testmodel"} 1`))
+    })
+
     It("Should send correct lora metrics", func() {
         ctx := context.TODO()
         args := []string{"cmd", "--model", model, "--mode", common.ModeRandom,

@@ -572,3 +617,77 @@ func splitString(str string) []string {
     }
     return strings.Split(str, ",")
 }
+
+// TestBuild125Buckets tests the build125Buckets function with various inputs.
+func TestBuild125Buckets(t *testing.T) {
+    tests := []struct {
+        name     string
+        maxValue int
+        want     []float64
+    }{
+        {
+            name:     "max_value zero",
+            maxValue: 0,
+            want:     nil, // no bucket <= 0
+        },
+        {
+            name:     "max_value one",
+            maxValue: 1,
+            want:     []float64{1},
+        },
+        {
+            name:     "max_value five",
+            maxValue: 5,
+            want:     []float64{1, 2, 5},
+        },
+        {
+            name:     "max_value ten",
+            maxValue: 10,
+            want:     []float64{1, 2, 5, 10},
+        },
+        {
+            name:     "max_value 100",
+            maxValue: 100,
+            want:     []float64{1, 2, 5, 10, 20, 50, 100},
+        },
+        {
+            name:     "max_value 999",
+            maxValue: 999,
+            want:     []float64{1, 2, 5, 10, 20, 50, 100, 200, 500},
+        },
+        {
+            name:     "max_value 1024",
+            maxValue: 1024,
+            want:     []float64{1, 2, 5, 10, 20, 50, 100, 200, 500, 1000},
+        },
+        {
+            name:     "max_value 4096",
+            maxValue: 4096,
+            want:     []float64{1, 2, 5, 10, 20, 50, 100, 200, 500, 1000, 2000},
+        },
+        {
+            name:     "max_value 32768",
+            maxValue: 32768,
+            want:     []float64{1, 2, 5, 10, 20, 50, 100, 200, 500, 1000, 2000, 5000, 10000, 20000},
+        },
+        {
+            name:     "max_value just below power of 10",
+            maxValue: 99,
+            want:     []float64{1, 2, 5, 10, 20, 50},
+        },
+        {
+            name:     "max_value negative",
+            maxValue: -1,
+            want:     nil, // no positive bucket <= -1
+        },
+    }
+
+    for _, tt := range tests {
+        t.Run(tt.name, func(t *testing.T) {
+            got := build125Buckets(tt.maxValue)
+            if !reflect.DeepEqual(got, tt.want) {
+                t.Errorf("build125Buckets(%d) = %v, want %v", tt.maxValue, got, tt.want)
+            }
+        })
+    }
+}
