Skip to content

Commit 76971e1

Browse files
committed
Auto-configure prefix-cache-scorer parameters from engine metrics
1 parent 32970c0 commit 76971e1

File tree

8 files changed

+84
-6
lines changed

8 files changed

+84
-6
lines changed

cmd/epp/runner/runner.go

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,8 @@ var (
9292
kvCacheUsagePercentageMetric = flag.String("kv-cache-usage-percentage-metric", runserver.DefaultKvCacheUsagePercentageMetric, "Prometheus metric for the fraction of KV-cache blocks currently in use (from 0 to 1).")
9393
// LoRA metrics
9494
loraInfoMetric = flag.String("lora-info-metric", runserver.DefaultLoraInfoMetric, "Prometheus metric for the LoRA info metrics (must be in vLLM label format).")
95+
// Cache info metrics
96+
cacheInfoMetric = flag.String("cache-info-metric", runserver.DefaultCacheInfoMetric, "Prometheus metric for the cache info metrics.")
9597
// metrics related flags
9698
refreshMetricsInterval = flag.Duration("refresh-metrics-interval", runserver.DefaultRefreshMetricsInterval, "interval to refresh metrics")
9799
refreshPrometheusMetricsInterval = flag.Duration("refresh-prometheus-metrics-interval", runserver.DefaultRefreshPrometheusMetricsInterval, "interval to flush prometheus metrics")
@@ -371,6 +373,7 @@ func setupMetricsV1(setupLog logr.Logger) (datalayer.EndpointFactory, error) {
371373
*totalQueuedRequestsMetric,
372374
*kvCacheUsagePercentageMetric,
373375
*loraInfoMetric,
376+
*cacheInfoMetric,
374377
)
375378
if err != nil {
376379
setupLog.Error(err, "Failed to create metric mapping from flags.")
@@ -414,7 +417,7 @@ func setupDatalayer() (datalayer.EndpointFactory, error) {
414417
nil)
415418
extractor, err := dlmetrics.NewExtractor(*totalQueuedRequestsMetric,
416419
*kvCacheUsagePercentageMetric,
417-
*loraInfoMetric)
420+
*loraInfoMetric, *cacheInfoMetric)
418421

419422
if err != nil {
420423
return nil, err
@@ -499,6 +502,9 @@ func verifyMetricMapping(mapping backendmetrics.MetricMapping, logger logr.Logge
499502
if mapping.LoraRequestInfo == nil {
500503
logger.Info("Not scraping metric: LoraRequestInfo")
501504
}
505+
if mapping.CacheConfigInfo == nil {
506+
logger.Info("Not scraping metric: CacheConfigInfo")
507+
}
502508
}
503509

504510
// setupPprofHandlers only implements the pre-defined profiles:

pkg/epp/backend/metrics/metrics.go

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,8 @@ const (
3636
LoraInfoRunningAdaptersMetricName = "running_lora_adapters"
3737
LoraInfoWaitingAdaptersMetricName = "waiting_lora_adapters"
3838
LoraInfoMaxAdaptersMetricName = "max_lora"
39+
40+
CacheConfigBlockSizeInfoMetricName = "block_size"
3941
)
4042

4143
type PodMetricsClientImpl struct {
@@ -144,6 +146,24 @@ func (p *PodMetricsClientImpl) promToPodMetrics(
144146
}
145147
}
146148

149+
if p.MetricMapping.CacheConfigInfo != nil {
150+
cacheMetrics, err := p.getMetric(metricFamilies, *p.MetricMapping.CacheConfigInfo)
151+
if err != nil {
152+
errs = multierr.Append(errs, err)
153+
} else {
154+
for _, v := range cacheMetrics.GetLabel() {
155+
if v.GetName() == CacheConfigBlockSizeInfoMetricName {
156+
updated.CacheBlockSize, err = strconv.Atoi(v.GetValue())
157+
if err != nil {
158+
errs = multierr.Append(errs, err)
159+
} else {
160+
break
161+
}
162+
}
163+
}
164+
}
165+
}
166+
147167
return updated, errs
148168
}
149169

pkg/epp/backend/metrics/metrics_spec.go

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ type MetricMapping struct {
3232
TotalQueuedRequests *MetricSpec
3333
KVCacheUtilization *MetricSpec
3434
LoraRequestInfo *MetricSpec
35+
CacheConfigInfo *MetricSpec
3536
}
3637

3738
// stringToMetricSpec converts a string to a MetricSpec.
@@ -93,7 +94,7 @@ func stringToMetricSpec(specStr string) (*MetricSpec, error) {
9394
}
9495

9596
// NewMetricMapping creates a MetricMapping from string values.
96-
func NewMetricMapping(queuedStr, kvUsageStr, loraReqInfoStr string) (*MetricMapping, error) {
97+
func NewMetricMapping(queuedStr, kvUsageStr, loraReqInfoStr, cacheInfoMetric string) (*MetricMapping, error) {
9798
queuedSpec, err := stringToMetricSpec(queuedStr)
9899
if err != nil {
99100
return nil, fmt.Errorf("error parsing WaitingRequests: %w", err)
@@ -106,10 +107,17 @@ func NewMetricMapping(queuedStr, kvUsageStr, loraReqInfoStr string) (*MetricMapp
106107
if err != nil {
107108
return nil, fmt.Errorf("error parsing loraReqInfoStr: %w", err)
108109
}
110+
111+
cacheInfoSpec, err := stringToMetricSpec(cacheInfoMetric)
112+
if err != nil {
113+
return nil, fmt.Errorf("error parsing cacheInfoMetric: %w", err)
114+
}
115+
109116
mapping := &MetricMapping{
110117
TotalQueuedRequests: queuedSpec,
111118
KVCacheUtilization: kvUsageSpec,
112119
LoraRequestInfo: loraReqInfoSpec,
120+
CacheConfigInfo: cacheInfoSpec,
113121
}
114122

115123
return mapping, nil

pkg/epp/datalayer/metrics.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ type Metrics struct {
3232
WaitingQueueSize int
3333
KVCacheUsagePercent float64
3434
KvCacheMaxTokenCapacity int
35+
CacheBlockSize int
3536

3637
// UpdateTime records the last time when the metrics were updated.
3738
UpdateTime time.Time
@@ -75,6 +76,7 @@ func (m *Metrics) Clone() *Metrics {
7576
WaitingQueueSize: m.WaitingQueueSize,
7677
KVCacheUsagePercent: m.KVCacheUsagePercent,
7778
KvCacheMaxTokenCapacity: m.KvCacheMaxTokenCapacity,
79+
CacheBlockSize: m.CacheBlockSize,
7880
UpdateTime: m.UpdateTime,
7981
}
8082
}

pkg/epp/datalayer/metrics/extractor.go

Lines changed: 31 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,8 @@ const (
3737
LoraInfoRunningAdaptersMetricName = "running_lora_adapters"
3838
LoraInfoWaitingAdaptersMetricName = "waiting_lora_adapters"
3939
LoraInfoMaxAdaptersMetricName = "max_lora"
40+
41+
CacheConfigBlockSizeInfoMetricName = "block_size"
4042
)
4143

4244
// Extractor implements the metrics extraction based on the model
@@ -49,8 +51,8 @@ type Extractor struct {
4951
// configured with the given metrics' specifications.
5052
// These are mandatory metrics per the MSP specification, and are used
5153
// as the basis for the built-in scheduling plugins.
52-
func NewExtractor(queueSpec, kvusageSpec, loraSpec string) (*Extractor, error) {
53-
mapping, err := NewMapping(queueSpec, kvusageSpec, loraSpec)
54+
func NewExtractor(queueSpec, kvusageSpec, loraSpec, cacheInfoSpec string) (*Extractor, error) {
55+
mapping, err := NewMapping(queueSpec, kvusageSpec, loraSpec, cacheInfoSpec)
5456
if err != nil {
5557
return nil, fmt.Errorf("failed to create extractor metrics Mapping - %w", err)
5658
}
@@ -111,6 +113,16 @@ func (ext *Extractor) Extract(ctx context.Context, data any, ep datalayer.Endpoi
111113
}
112114
}
113115

116+
if spec := ext.mapping.CacheInfo; spec != nil { // extract CacheInfo-specific metrics
117+
metric, err := spec.getLatestMetric(families)
118+
if err != nil {
119+
errs = append(errs, err)
120+
} else if metric != nil {
121+
populateCacheInfoMetrics(clone, metric, &errs)
122+
updated = true
123+
}
124+
}
125+
114126
if updated {
115127
clone.UpdateTime = time.Now()
116128
ep.UpdateMetrics(clone)
@@ -145,6 +157,23 @@ func populateLoRAMetrics(clone *datalayer.Metrics, metric *dto.Metric, errs *[]e
145157
}
146158
}
147159

160+
// populateCacheInfoMetrics updates the metrics with cache info from the metric labels.
161+
func populateCacheInfoMetrics(clone *datalayer.Metrics, metric *dto.Metric, errs *[]error) {
162+
clone.CacheBlockSize = 0
163+
for _, label := range metric.GetLabel() {
164+
if label.GetName() == CacheConfigBlockSizeInfoMetricName {
165+
if label.GetValue() != "" {
166+
if val, err := strconv.Atoi(label.GetValue()); err == nil {
167+
clone.CacheBlockSize = val
168+
break
169+
} else {
170+
*errs = append(*errs, err)
171+
}
172+
}
173+
}
174+
}
175+
}
176+
148177
// addAdapters splits a comma-separated adapter list and stores keys with default value 0.
149178
func addAdapters(m map[string]int, csv string) {
150179
for _, name := range strings.Split(csv, ",") {

pkg/epp/datalayer/metrics/mapping.go

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,10 +26,11 @@ type Mapping struct {
2626
TotalQueuedRequests *Spec
2727
KVCacheUtilization *Spec
2828
LoraRequestInfo *LoRASpec
29+
CacheInfo *Spec
2930
}
3031

3132
// NewMapping creates a metrics.Mapping from the input specification strings.
32-
func NewMapping(queue, kvusage, lora string) (*Mapping, error) {
33+
func NewMapping(queue, kvusage, lora, cacheInfo string) (*Mapping, error) {
3334
var errs []error
3435

3536
queueSpec, err := parseStringToSpec(queue)
@@ -44,12 +45,19 @@ func NewMapping(queue, kvusage, lora string) (*Mapping, error) {
4445
if err != nil {
4546
errs = append(errs, err)
4647
}
48+
49+
cacheInfoSpec, err := parseStringToSpec(cacheInfo)
50+
if err != nil {
51+
errs = append(errs, err)
52+
}
53+
4754
if len(errs) != 0 {
4855
return nil, errors.Join(errs...)
4956
}
5057
return &Mapping{
5158
TotalQueuedRequests: queueSpec,
5259
KVCacheUtilization: kvusageSpec,
5360
LoraRequestInfo: loraSpec,
61+
CacheInfo: cacheInfoSpec,
5462
}, nil
5563
}

pkg/epp/scheduling/framework/plugins/multi/prefix/plugin.go

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -180,7 +180,11 @@ func (p *Plugin) WithName(name string) *Plugin {
180180
// Score returns the scoring result for the given list of pods based on context.
181181
func (p *Plugin) Score(ctx context.Context, cycleState *types.CycleState, request *types.LLMRequest, pods []types.Pod) map[types.Pod]float64 {
182182
// pre score step, hashing prompt and find longest prefix match.
183-
hashes := hashPrompt(ctx, request, p.config.BlockSize, p.config.MaxPrefixBlocksToMatch)
183+
blockSize := pods[0].GetMetrics().CacheBlockSize * 4
184+
if blockSize <= 0 {
185+
blockSize = p.config.BlockSize
186+
}
187+
hashes := hashPrompt(ctx, request, blockSize, p.config.MaxPrefixBlocksToMatch)
184188
state := &SchedulingContextState{
185189
PrefixHashes: hashes,
186190
PrefixCacheServers: p.matchLongestPrefix(ctx, hashes),

pkg/epp/server/runserver.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,7 @@ const (
7979
DefaultTotalQueuedRequestsMetric = "vllm:num_requests_waiting" // default for --total-queued-requests-metric
8080
DefaultKvCacheUsagePercentageMetric = "vllm:gpu_cache_usage_perc" // default for --kv-cache-usage-percentage-metric
8181
DefaultLoraInfoMetric = "vllm:lora_requests_info" // default for --lora-info-metric
82+
DefaultCacheInfoMetric = "vllm:cache_config_info" // default for --cache-info-metric
8283
DefaultCertPath = "" // default for --cert-path
8384
DefaultConfigFile = "" // default for --config-file
8485
DefaultConfigText = "" // default for --config-text

0 commit comments

Comments
 (0)