Skip to content

Commit f8d4bc9

Browse files
authored
build: sync with latest GIE v0.0.0-20250715021823 (#239)
Update the inference scheduler to the latest GIE version (v0.0.0-20250715021823). Signed-off-by: Kfir Toledo <[email protected]>
1 parent 4f39530 commit f8d4bc9

File tree

11 files changed

+66
-67
lines changed

11 files changed

+66
-67
lines changed

deploy/config/epp-config.yaml

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3,18 +3,18 @@
33
apiVersion: inference.networking.x-k8s.io/v1alpha1
44
kind: EndpointPickerConfig
55
plugins:
6-
- type: prefix-cache
6+
- type: prefix-cache-scorer
77
parameters:
88
hashBlockSize: 5
99
maxPrefixBlocksToMatch: 256
1010
lruCapacityPerServer: 31250
1111
- type: decode-filter
12-
- type: max-score
13-
- type: single-profile
12+
- type: max-score-picker
13+
- type: single-profile-handler
1414
schedulingProfiles:
1515
- name: default
1616
plugins:
1717
- pluginRef: decode-filter
18-
- pluginRef: max-score
19-
- pluginRef: prefix-cache
20-
weight: 50
18+
- pluginRef: max-score-picker
19+
- pluginRef: prefix-cache-scorer
20+
weight: 50

deploy/config/epp-kvcache-load-config.yaml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,11 @@
33
apiVersion: inference.networking.x-k8s.io/v1alpha1
44
kind: EndpointPickerConfig
55
plugins:
6-
- type: single-profile
6+
- type: single-profile-handler
77
- type: decode-filter
88
- type: kvcache-aware-scorer
99
- type: load-aware-scorer
10-
- type: max-score
10+
- type: max-score-picker
1111
schedulingProfiles:
1212
- name: default
1313
plugins:
@@ -16,4 +16,4 @@ schedulingProfiles:
1616
weight: 2.0
1717
- pluginRef: load-aware-scorer
1818
weight: 1.0
19-
- pluginRef: max-score
19+
- pluginRef: max-score-picker

deploy/config/pd-epp-config.yaml

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -2,15 +2,15 @@
22
apiVersion: inference.networking.x-k8s.io/v1alpha1
33
kind: EndpointPickerConfig
44
plugins:
5-
- type: prefill-header
6-
- type: prefix-cache
5+
- type: prefill-header-handler
6+
- type: prefix-cache-scorer
77
parameters:
88
hashBlockSize: 5
99
maxPrefixBlocksToMatch: 256
1010
lruCapacityPerServer: 31250
1111
- type: prefill-filter
1212
- type: decode-filter
13-
- type: max-score
13+
- type: max-score-picker
1414
- type: pd-profile-handler
1515
parameters:
1616
threshold: 10
@@ -19,12 +19,12 @@ schedulingProfiles:
1919
- name: prefill
2020
plugins:
2121
- pluginRef: prefill-filter
22-
- pluginRef: max-score
23-
- pluginRef: prefix-cache
22+
- pluginRef: max-score-picker
23+
- pluginRef: prefix-cache-scorer
2424
weight: 50
2525
- name: decode
2626
plugins:
2727
- pluginRef: decode-filter
28-
- pluginRef: max-score
29-
- pluginRef: prefix-cache
30-
weight: 50
28+
- pluginRef: max-score-picker
29+
- pluginRef: prefix-cache-scorer
30+
weight: 50

docs/architecture.md

Lines changed: 15 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -83,7 +83,7 @@ These components are maintained in the `llm-d-inference-scheduler` repository an
8383

8484
## Configuration
8585

86-
The set of lifecycle hooks (plugins) that are used by the inference scheduler is determined by how
86+
The set of lifecycle hooks (plugins) that are used by the inference scheduler is determined by how
8787
it is configured. The configuration is in the form of YAML text, which can either be in a file or
8888
specified in-line as a parameter. The configuration defines the set of plugins to be instantiated along with their parameters. Each plugin is also given a name, enabling the same plugin type to be instantiated
8989
multiple times, if needed. Also defined is a set of SchedulingProfiles, which determine the set of
@@ -144,20 +144,20 @@ A complete configuration might look like this:
144144
apiVersion: inference.networking.x-k8s.io/v1alpha1
145145
kind: EndpointPickerConfig
146146
plugins:
147-
- type: prefix-cache
147+
- type: prefix-cache-scorer
148148
parameters:
149149
hashBlockSize: 5
150150
maxPrefixBlocksToMatch: 256
151151
lruCapacityPerServer: 31250
152152
- type: decode-filter
153-
- type: max-score
154-
- type: single-profile
153+
- type: max-score-picker
154+
- type: single-profile-handler
155155
schedulingProfiles:
156156
- name: default
157157
plugins:
158158
- pluginRef: decode-filter
159-
- pluginRef: max-score
160-
- pluginRef: prefix-cache
159+
- pluginRef: max-score-picker
160+
- pluginRef: prefix-cache-scorer
161161
weight: 50
162162
```
163163
@@ -170,9 +170,9 @@ This section describes how to setup the various plugins available with the llm-d
170170

171171
**PrefillHeader**<br>
172172
Sets a header for use in disaggregated prefill/decode<br>
173-
*Type*: prefill-header<br>
173+
*Type*: prefill-header-handler<br>
174174
*Parameters*:<br>
175-
\- `prefillProfile` specifies the name of the profile used for the prefill scheduling. Only needed if the
175+
\- `prefillProfile` specifies the name of the profile used for the prefill scheduling. Only needed if the
176176
prefill profile is not named `prefill`.<br>
177177

178178
**PdProfileHandler**<br>
@@ -244,15 +244,15 @@ The following is an example of what a configuration for disaggregated Prefill/De
244244
apiVersion: inference.networking.x-k8s.io/v1alpha1
245245
kind: EndpointPickerConfig
246246
plugins:
247-
- type: prefill-header
248-
- type: prefix-cache
247+
- type: prefill-header-handler
248+
- type: prefix-cache-scorer
249249
parameters:
250250
hashBlockSize: 5
251251
maxPrefixBlocksToMatch: 256
252252
lruCapacityPerServer: 31250
253253
- type: prefill-filter
254254
- type: decode-filter
255-
- type: max-score
255+
- type: max-score-picker
256256
- type: pd-profile-handler
257257
parameters:
258258
threshold: 10
@@ -261,14 +261,14 @@ schedulingProfiles:
261261
- name: prefill
262262
plugins:
263263
- pluginRef: prefill-filter
264-
- pluginRef: max-score
265-
- pluginRef: prefix-cache
264+
- pluginRef: max-score-picker
265+
- pluginRef: prefix-cache-scorer
266266
weight: 50
267267
- name: decode
268268
plugins:
269269
- pluginRef: decode-filter
270-
- pluginRef: max-score
271-
- pluginRef: prefix-cache
270+
- pluginRef: max-score-picker
271+
- pluginRef: prefix-cache-scorer
272272
weight: 50
273273
```
274274

docs/dp.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ This evolved version removes the requirement for sidecars on the **prefill node*
5050
- If `prefill_worker_id == nil`, runs both stages locally by passing request to local vllm
5151
- If split:
5252
- Sends prefill job to Prefill Worker with a special header `do_remote_decode=true`
53-
- Upon receiving response from Prefill Worker runs decode stage
53+
- Upon receiving response from Prefill Worker runs decode stage
5454

5555
4. **Response Flow**
5656
- Response flows from decode sidecar → Envoy → EPP → User
@@ -119,7 +119,7 @@ This evolved version removes the requirement for sidecars on the **prefill node*
119119

120120
---
121121

122-
## Diagram
122+
## Diagram
123123

124124
![Disaggregated Prefill/Decode Architecture](./images/dp_architecture.png)
125125

go.mod

Lines changed: 1 addition & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ require (
1414
k8s.io/client-go v0.33.2
1515
sigs.k8s.io/controller-runtime v0.21.0
1616
sigs.k8s.io/gateway-api v1.3.0
17-
sigs.k8s.io/gateway-api-inference-extension v0.0.0-20250704222130-0e1e964b9bec
17+
sigs.k8s.io/gateway-api-inference-extension v0.0.0-20250715021823-e696e686a175
1818
)
1919

2020
require (
@@ -40,27 +40,20 @@ require (
4040
github.com/go-openapi/jsonpointer v0.21.0 // indirect
4141
github.com/go-openapi/jsonreference v0.21.0 // indirect
4242
github.com/go-openapi/swag v0.23.0 // indirect
43-
github.com/go-task/slim-sprig/v3 v3.0.0 // indirect
4443
github.com/gogo/protobuf v1.3.2 // indirect
4544
github.com/google/btree v1.1.3 // indirect
4645
github.com/google/cel-go v0.23.2 // indirect
4746
github.com/google/gnostic-models v0.6.9 // indirect
48-
github.com/google/pprof v0.0.0-20250403155104-27863c87afa6 // indirect
4947
github.com/google/uuid v1.6.0 // indirect
50-
github.com/gorilla/websocket v1.5.4-0.20250319132907-e064f32e3674 // indirect
5148
github.com/grpc-ecosystem/grpc-gateway/v2 v2.24.0 // indirect
5249
github.com/hashicorp/golang-lru/v2 v2.0.7 // indirect
5350
github.com/inconshreveable/mousetrap v1.1.0 // indirect
5451
github.com/josharian/intern v1.0.0 // indirect
5552
github.com/json-iterator/go v1.1.12 // indirect
5653
github.com/mailru/easyjson v0.7.7 // indirect
57-
github.com/moby/spdystream v0.5.0 // indirect
5854
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
5955
github.com/modern-go/reflect2 v1.0.2 // indirect
6056
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
61-
github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f // indirect
62-
github.com/onsi/ginkgo/v2 v2.23.4 // indirect
63-
github.com/onsi/gomega v1.37.0 // indirect
6457
github.com/pkg/errors v0.9.1 // indirect
6558
github.com/planetscale/vtprotobuf v0.6.1-0.20240319094008-0393e58bdf10 // indirect
6659
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
@@ -81,7 +74,6 @@ require (
8174
go.opentelemetry.io/otel/sdk v1.35.0 // indirect
8275
go.opentelemetry.io/otel/trace v1.35.0 // indirect
8376
go.opentelemetry.io/proto/otlp v1.4.0 // indirect
84-
go.uber.org/automaxprocs v1.6.0 // indirect
8577
go.uber.org/multierr v1.11.0 // indirect
8678
go.uber.org/zap v1.27.0 // indirect
8779
go.yaml.in/yaml/v2 v2.4.2 // indirect
@@ -93,7 +85,6 @@ require (
9385
golang.org/x/term v0.32.0 // indirect
9486
golang.org/x/text v0.25.0 // indirect
9587
golang.org/x/time v0.9.0 // indirect
96-
golang.org/x/tools v0.31.0 // indirect
9788
gomodules.xyz/jsonpatch/v2 v2.4.0 // indirect
9889
google.golang.org/genproto/googleapis/api v0.0.0-20250324211829-b45e905df463 // indirect
9990
google.golang.org/genproto/googleapis/rpc v0.0.0-20250428153025-10db94c68c34 // indirect

go.sum

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,6 @@ cel.dev/expr v0.23.0 h1:wUb94w6OYQS4uXraxo9U+wUAs9jT47Xvl4iPgAwM2ss=
22
cel.dev/expr v0.23.0/go.mod h1:hLPLo1W4QUmuYdA72RBX06QTs6MXw941piREPl3Yfiw=
33
github.com/antlr4-go/antlr/v4 v4.13.0 h1:lxCg3LAv+EUK6t1i0y1V6/SLeUi0eKEKdhQAlS8TVTI=
44
github.com/antlr4-go/antlr/v4 v4.13.0/go.mod h1:pfChB/xh/Unjila75QW7+VU4TSnWnnk9UTnmpPaOR2g=
5-
github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5 h1:0CwZNZbxp69SHPdPJAN/hZIm0C4OItdklCFmMRWYpio=
6-
github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5/go.mod h1:wHh0iHkYZB8zMSxRWpUBQtwG5a7fFgvEO+odwuTv2gs=
75
github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM=
86
github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw=
97
github.com/blang/semver/v4 v4.0.0 h1:1PFHFE6yCCTv8C1TeyNNarDzntLi7wMI5i/pzqYIsAM=
@@ -126,8 +124,6 @@ github.com/planetscale/vtprotobuf v0.6.1-0.20240319094008-0393e58bdf10/go.mod h1
126124
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
127125
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U=
128126
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
129-
github.com/prashantv/gostub v1.1.0 h1:BTyx3RfQjRHnUWaGF9oQos79AlQ5k8WNktv7VGvVH4g=
130-
github.com/prashantv/gostub v1.1.0/go.mod h1:A5zLQHz7ieHGG7is6LLXLz7I8+3LZzsrV0P1IAHhP5U=
131127
github.com/prometheus/client_golang v1.22.0 h1:rb93p9lokFEsctTys46VnV1kLCDpVZ0a/Y92Vm0Zc6Q=
132128
github.com/prometheus/client_golang v1.22.0/go.mod h1:R7ljNsLXhuQXYZYtw6GAE9AZg8Y7vEW5scdCXrWRXC0=
133129
github.com/prometheus/client_model v0.6.2 h1:oBsgwpGs7iVziMvrGhE53c/GrLUsZdHnqNwqPLxwZyk=
@@ -281,8 +277,8 @@ sigs.k8s.io/controller-runtime v0.21.0 h1:CYfjpEuicjUecRk+KAeyYh+ouUBn4llGyDYytI
281277
sigs.k8s.io/controller-runtime v0.21.0/go.mod h1:OSg14+F65eWqIu4DceX7k/+QRAbTTvxeQSNSOQpukWM=
282278
sigs.k8s.io/gateway-api v1.3.0 h1:q6okN+/UKDATola4JY7zXzx40WO4VISk7i9DIfOvr9M=
283279
sigs.k8s.io/gateway-api v1.3.0/go.mod h1:d8NV8nJbaRbEKem+5IuxkL8gJGOZ+FJ+NvOIltV8gDk=
284-
sigs.k8s.io/gateway-api-inference-extension v0.0.0-20250704222130-0e1e964b9bec h1:46xYN9Y8YZ/QHHrh+bEG0kXh9KFBxl6GMRjp2tgKcqk=
285-
sigs.k8s.io/gateway-api-inference-extension v0.0.0-20250704222130-0e1e964b9bec/go.mod h1:lki0jx1qysZSZT4Ai2BxuAcpx6G8g5oBgOGuuJzjy/k=
280+
sigs.k8s.io/gateway-api-inference-extension v0.0.0-20250715021823-e696e686a175 h1:jKDXyBPAiESgnmirAuX15MeES8yaVj+KykZDhwyAyrk=
281+
sigs.k8s.io/gateway-api-inference-extension v0.0.0-20250715021823-e696e686a175/go.mod h1:lki0jx1qysZSZT4Ai2BxuAcpx6G8g5oBgOGuuJzjy/k=
286282
sigs.k8s.io/json v0.0.0-20241010143419-9aa6b5e7a4b3 h1:/Rv+M11QRah1itp8VhT6HoVx1Ray9eB4DBr+K+/sCJ8=
287283
sigs.k8s.io/json v0.0.0-20241010143419-9aa6b5e7a4b3/go.mod h1:18nIHnGi6636UCz6m8i4DhaJ65T6EruyzmoQqI2BVDo=
288284
sigs.k8s.io/randfill v0.0.0-20250304075658-069ef1bbf016/go.mod h1:XeLlZ/jmk4i1HRopwe7/aU3H5n1zNUcX6TM94b3QxOY=

pkg/plugins/pre-request/pd_prerequest.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ import (
1515

1616
const (
1717
// PrefillHeaderHandlerType is the type of the PrefillHeaderHandler
18-
PrefillHeaderHandlerType = "prefill-header"
18+
PrefillHeaderHandlerType = "prefill-header-handler"
1919
// prefillPodHeader is the header name used to indicate Prefill worker <ip:port>
2020
prefillPodHeader = "x-prefiller-host-port"
2121

@@ -74,6 +74,6 @@ func (p *PrefillHeaderHandler) PreRequest(_ context.Context, request *types.LLMR
7474
return // prefill profile failed to run or we chose not to run it, no-op in this case
7575
}
7676

77-
prefillHostPort := net.JoinHostPort(prefillProfileRunResult.TargetPod.GetPod().Address, strconv.Itoa(targetPort))
77+
prefillHostPort := net.JoinHostPort(prefillProfileRunResult.TargetPods[0].GetPod().Address, strconv.Itoa(targetPort))
7878
request.Headers[prefillPodHeader] = prefillHostPort // in the form of <ip:port>
7979
}

pkg/plugins/profile/pd_profile_handler.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -108,7 +108,7 @@ func (h *PdProfileHandler) Pick(ctx context.Context, cycleState *types.CycleStat
108108
if err != nil {
109109
log.FromContext(ctx).Error(err, "unable to read prefix state")
110110
} else {
111-
decodePod := profileResults[h.decodeProfile].TargetPod.GetPod().NamespacedName
111+
decodePod := profileResults[h.decodeProfile].TargetPods[0].GetPod().NamespacedName
112112
hitPrefix := max(prefixState.PrefixCacheServers[prefix.ServerID(decodePod)]-1, 0) // The first hit is always the model name
113113
hitPercentagePrefix = float64(hitPrefix*h.hashBlockSize) / float64(len(request.Prompt))
114114
log.FromContext(ctx).V(logutil.DEBUG).Info("Computed hit percentage for prefix cache", "hitPercentage", hitPercentagePrefix,

pkg/plugins/scorer/load_aware_scorer_test.go

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ func TestLoadBasedScorer(t *testing.T) {
2828
{
2929
name: "load based scorer",
3030
scorer: scorer.NewLoadAwareScorer(context.Background(), 10),
31+
3132
req: &types.LLMRequest{
3233
TargetModel: "critical",
3334
},
@@ -72,7 +73,7 @@ func TestLoadBasedScorer(t *testing.T) {
7273
},
7374
},
7475
wantRes: &types.ProfileRunResult{
75-
TargetPod: &types.ScoredPod{
76+
TargetPods: []types.Pod{&types.ScoredPod{
7677
Pod: &types.PodMetrics{
7778
Pod: &backend.Pod{
7879
NamespacedName: k8stypes.NamespacedName{Name: "pod2"},
@@ -89,6 +90,7 @@ func TestLoadBasedScorer(t *testing.T) {
8990
},
9091
Score: 0.5,
9192
},
93+
},
9294
},
9395
},
9496
}
@@ -97,7 +99,7 @@ func TestLoadBasedScorer(t *testing.T) {
9799
t.Run(test.name, func(t *testing.T) {
98100
schedulerProfile := framework.NewSchedulerProfile().
99101
WithScorers(framework.NewWeightedScorer(test.scorer, 1)).
100-
WithPicker(picker.NewMaxScorePicker())
102+
WithPicker(picker.NewMaxScorePicker(picker.DefaultMaxNumOfEndpoints))
101103

102104
got, err := schedulerProfile.Run(context.Background(), test.req, nil, test.input)
103105

0 commit comments

Comments
 (0)