Skip to content

Commit 10c8df0

Browse files
fix(webhook): relax HTTPRoute/Gateway validation (#911)
Users should be able to attach custom Gateway while still relying on managed HTTPRoute. While controller already has the logic to support it, the webhook was previously rejecting as invalid, expecting that "bring your own" config is all-or-nothing scenario. This obviously is not desired when users want simply to attach their LLMInferenceService to another gateway. - Webhook validation now permits such configuration, and only rejects when HTTPRoute Spec of LLMInferenceService is also set, treating it as invalid configuration and suggesting removal of parentRef in favor of the global Gateway defined as `.spec.router.gateway.refs` instead - Adjusted EnvTests - Introduced e2e test to cover this case that leverages llmd-simulator for faster feedback loop Signed-off-by: Bartosz Majsak <[email protected]>
1 parent f834155 commit 10c8df0

File tree

8 files changed

+103
-60
lines changed

8 files changed

+103
-60
lines changed

pkg/controller/llmisvc/fixture/gwapi_builders.go

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -283,6 +283,12 @@ func WithBackendRefs(refs ...gatewayapi.HTTPBackendRef) HTTPRouteRuleOption {
283283
}
284284
}
285285

286+
func WithHTTPRouteGatewayRef(references ...gatewayapi.ParentReference) HTTPRouteOption {
287+
return func(route *gatewayapi.HTTPRoute) {
288+
route.Spec.ParentRefs = append(route.Spec.ParentRefs, references...)
289+
}
290+
}
291+
286292
func BackendRefInferencePool(name string) gatewayapi.HTTPBackendRef {
287293
return gatewayapi.HTTPBackendRef{
288294
BackendRef: gatewayapi.BackendRef{
@@ -334,10 +340,6 @@ func Matches(matches ...gatewayapi.HTTPRouteMatch) HTTPRouteRuleOption {
334340
return WithMatches(matches...)
335341
}
336342

337-
func BackendRefs(refs ...gatewayapi.HTTPBackendRef) HTTPRouteRuleOption {
338-
return WithBackendRefs(refs...)
339-
}
340-
341343
func Timeouts(backendTimeout, requestTimeout string) HTTPRouteRuleOption {
342344
return WithTimeouts(backendTimeout, requestTimeout)
343345
}

pkg/controller/llmisvc/router_gateway_conditions_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -321,7 +321,7 @@ func TestHTTPRouteConditionsEvaluation(t *testing.T) {
321321
WithParentRefs(GatewayParentRef("openshift-ai-inference", "openshift-ingress")),
322322
WithHTTPRule(
323323
Matches(PathPrefixMatch("/llm/facebook-opt-125m-single-simulated")),
324-
BackendRefs(ServiceRef("facebook-opt-125m-single-simulated-kserve-workload-svc", 8000, 1)),
324+
WithBackendRefs(ServiceRef("facebook-opt-125m-single-simulated-kserve-workload-svc", 8000, 1)),
325325
Timeouts("0s", "0s"),
326326
Filters(gatewayapi.HTTPRouteFilter{
327327
Type: gatewayapi.HTTPRouteFilterURLRewrite,

pkg/controller/llmisvc/validation/llminferenceservice_validator.go

Lines changed: 4 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -113,23 +113,6 @@ func (l *LLMInferenceServiceValidator) validateRouterCrossFieldConstraints(llmSv
113113
httpRouteRefs := httpRoutePath.Child("refs")
114114
httpRouteSpec := httpRoutePath.Child("spec")
115115

116-
zero := v1alpha1.GatewayRoutesSpec{}
117-
if ptr.Deref(router.Route, zero) == zero && router.Gateway != nil && router.Gateway.Refs != nil {
118-
return field.ErrorList{
119-
field.Invalid(
120-
gwRefsPath,
121-
router.Gateway.Refs,
122-
fmt.Sprintf("unsupported configuration: custom gateway ('%s') cannot be used with managed route ('%s: {}'); "+
123-
"either provide your own HTTP routes ('%s') or remove '%s' to use the managed gateway",
124-
gwRefsPath,
125-
routePath,
126-
httpRouteRefs,
127-
gwRefsPath,
128-
),
129-
),
130-
}
131-
}
132-
133116
httpRoute := router.Route.HTTP
134117
if httpRoute == nil {
135118
return field.ErrorList{}
@@ -162,13 +145,14 @@ func (l *LLMInferenceServiceValidator) validateRouterCrossFieldConstraints(llmSv
162145
))
163146
}
164147

165-
// Managed route spec cannot be used with user-defined gateway refs
166-
if httpRoute.Spec != nil && router.Gateway != nil && len(router.Gateway.Refs) > 0 {
148+
// Managed route spec referring to the gateway cannot be used with user-defined gateway refs
149+
if httpRoute.Spec != nil && len(httpRoute.Spec.ParentRefs) > 0 &&
150+
router.Gateway != nil && len(router.Gateway.Refs) > 0 {
167151
allErrs = append(allErrs, field.Invalid(
168152
httpRoutePath.Child("spec"),
169153
httpRoute.Spec,
170154
fmt.Sprintf("unsupported configuration: managed HTTP route spec ('%s') cannot be used with custom gateway refs ('%s'); "+
171-
"either remove '%s' or '%s'",
155+
"either remove '%s' or '%s.parentRefs'",
172156
httpRouteSpec, gwRefsPath, gwRefsPath, httpRouteSpec,
173157
),
174158
))

pkg/controller/llmisvc/validation/llminferenceservice_validator_int_test.go

Lines changed: 4 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -83,7 +83,7 @@ var _ = Describe("LLMInferenceService webhook validation", func() {
8383
fixture.WithHTTPRouteSpec(&fixture.HTTPRoute("test-route",
8484
fixture.WithHTTPRule(
8585
fixture.Matches(fixture.PathPrefixMatch("/test")),
86-
fixture.BackendRefs(fixture.ServiceRef("test-service", 80, 1)),
86+
fixture.WithBackendRefs(fixture.ServiceRef("test-service", 80, 1)),
8787
),
8888
).Spec),
8989
)
@@ -113,33 +113,17 @@ var _ = Describe("LLMInferenceService webhook validation", func() {
113113
Expect(errValidation.Error()).To(ContainSubstring("cannot be used with a managed gateway"))
114114
})
115115

116-
It("should reject LLMInferenceService with managed route and user-defined gateway refs", func(ctx SpecContext) {
117-
// given
118-
llmSvc := fixture.LLMInferenceService("test-spec-with-gateway-refs",
119-
fixture.InNamespace[*v1alpha1.LLMInferenceService](nsName),
120-
fixture.WithModelURI("hf://facebook/opt-125m"),
121-
fixture.WithGatewayRefs(fixture.LLMGatewayRef("test-gateway", nsName)),
122-
fixture.WithManagedRoute(),
123-
)
124-
125-
// when
126-
errValidation := envTest.Client.Create(ctx, llmSvc)
127-
128-
// then
129-
Expect(errValidation).To(HaveOccurred())
130-
Expect(errValidation.Error()).To(ContainSubstring("cannot be used with managed route"))
131-
})
132-
133-
It("should reject LLMInferenceService with managed route spec and user-defined gateway refs", func(ctx SpecContext) {
116+
It("should reject LLMInferenceService with managed route spec with gateway ref and user-defined gateway refs", func(ctx SpecContext) {
134117
// given
135118
llmSvc := fixture.LLMInferenceService("test-spec-with-gateway-refs",
136119
fixture.InNamespace[*v1alpha1.LLMInferenceService](nsName),
137120
fixture.WithModelURI("hf://facebook/opt-125m"),
138121
fixture.WithGatewayRefs(fixture.LLMGatewayRef("test-gateway", nsName)),
139122
fixture.WithHTTPRouteSpec(&fixture.HTTPRoute("test-route",
123+
fixture.WithHTTPRouteGatewayRef(fixture.GatewayParentRef("test-gateway", nsName)),
140124
fixture.WithHTTPRule(
141125
fixture.Matches(fixture.PathPrefixMatch("/test")),
142-
fixture.BackendRefs(fixture.ServiceRef("custom-backend", 8080, 1)),
126+
fixture.WithBackendRefs(fixture.ServiceRef("custom-backend", 8080, 1)),
143127
fixture.Timeouts("30s", "60s"),
144128
),
145129
).Spec),

test/e2e/llmisvc/README.md

Lines changed: 18 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -22,22 +22,32 @@ Tests are marked with both general and cluster-specific capability markers:
2222
- `@pytest.mark.cluster_nvidia` - NVIDIA GPU tests
2323
- `@pytest.mark.cluster_nvidia_roce` - NVIDIA ROCe tests
2424
- `@pytest.mark.cluster_intel` - Intel GPU tests
25+
- `@pytest.mark.llmd_simulator` - Tests using llm-d simulator instead of real model
2526

26-
Examples:
27-
```bash
28-
# Run all LLM inference service tests
27+
## Examples
28+
29+
### Run all LLM inference service tests
30+
31+
```shell
2932
pytest -m "llminferenceservice" test/e2e/llmisvc/
33+
```
3034

31-
# Run only CPU tests
35+
### Run only CPU tests
36+
```shell
3237
pytest -m "llminferenceservice and cluster_cpu" test/e2e/llmisvc/
38+
```
3339

34-
# Run only NVIDIA GPU tests
40+
### Run only NVIDIA GPU tests
41+
```shell
3542
pytest -m "llminferenceservice and cluster_nvidia" test/e2e/llmisvc/
43+
```
3644

37-
# Run all GPU tests (any vendor)
45+
### Run all GPU tests (any vendor)
46+
```shell
3847
pytest -m "llminferenceservice and (cluster_amd or cluster_nvidia or cluster_intel)" test/e2e/llmisvc/
39-
40-
# Run CPU and AMD GPU tests only
48+
```
49+
### Run CPU and AMD GPU tests only
50+
```shell
4151
pytest -m "llminferenceservice and (cluster_cpu or cluster_amd)" test/e2e/llmisvc/
4252
```
4353

test/e2e/llmisvc/fixtures.py

Lines changed: 49 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -459,7 +459,50 @@
459459
"router": {
460460
"scheduler": {},
461461
},
462-
}
462+
},
463+
"router-with-gateway-ref": {
464+
"router": {
465+
"gateway": {
466+
"refs": [
467+
{"name": "router-gateway-1", "namespace": KSERVE_TEST_NAMESPACE},
468+
],
469+
},
470+
},
471+
},
472+
"router-with-managed-route": {
473+
"router": {
474+
"route": {}
475+
},
476+
},
477+
"workload-llmd-simulator": {
478+
"replicas": 1,
479+
"model": {"uri": "hf://facebook/opt-125m", "name": "facebook/opt-125m"},
480+
"template": {
481+
"containers": [
482+
{
483+
"name": "main",
484+
"image": "ghcr.io/llm-d/llm-d-inference-sim:v0.5.1",
485+
"command": ["/app/llm-d-inference-sim"],
486+
"args": [
487+
"--port",
488+
"8000",
489+
"--model",
490+
"{{ .Spec.Model.Name }}",
491+
"--mode",
492+
"random",
493+
"--ssl-certfile",
494+
"/etc/ssl/certs/tls.crt",
495+
"--ssl-keyfile",
496+
"/etc/ssl/certs/tls.key"
497+
],
498+
"resources": {
499+
"limits": {"cpu": "1", "memory": "2Gi"},
500+
"requests": {"cpu": "1", "memory": "2Gi"},
501+
},
502+
}
503+
]
504+
},
505+
},
463506
}
464507

465508

@@ -574,7 +617,7 @@ def _get_model_name_from_configs(config_names):
574617
config = LLMINFERENCESERVICE_CONFIGS[config_name]
575618
if "model" in config and "name" in config["model"]:
576619
return config["model"]["name"]
577-
return "default-model"
620+
return "default/model"
578621

579622

580623
def generate_k8s_safe_suffix(base_name: str, extra_parts: List[str] = None) -> str:
@@ -608,7 +651,7 @@ def generate_test_id(test_case) -> str:
608651
return "-".join(test_case.base_refs)
609652

610653

611-
def create_router_resources(gateways, routes, kserve_client=None):
654+
def create_router_resources(gateways, routes=None, kserve_client=None):
612655
if not kserve_client:
613656
kserve_client = KServeClient(config_file=os.environ.get("KUBECONFIG", "~/.kube/config"))
614657

@@ -619,7 +662,7 @@ def create_router_resources(gateways, routes, kserve_client=None):
619662
for gateway in gateways:
620663
create_or_update_gateway(kserve_client, gateway)
621664
gateways_created.append(gateway)
622-
for route in routes:
665+
for route in routes or []:
623666
create_or_update_route(kserve_client, route)
624667
routes_created.append(route)
625668
except Exception as e:
@@ -628,11 +671,11 @@ def create_router_resources(gateways, routes, kserve_client=None):
628671
raise
629672

630673

631-
def delete_router_resources(gateways, routes, kserve_client=None):
674+
def delete_router_resources(gateways, routes=None, kserve_client=None):
632675
if not kserve_client:
633676
kserve_client = KServeClient(config_file=os.environ.get("KUBECONFIG", "~/.kube/config"))
634677

635-
for route in routes:
678+
for route in routes or []:
636679
try:
637680
logger.info(f"Cleaning up HttpRoute {route.get('metadata', {}).get('name')}")
638681
delete_route(kserve_client, route.get("metadata", {}).get("name"), route.get("metadata", {}).get("namespace", "default"))

test/e2e/llmisvc/test_llm_inference_service.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,24 @@ class TestCase:
8383
@pytest.mark.parametrize(
8484
"test_case",
8585
[
86+
pytest.param(
87+
TestCase(
88+
base_refs=[
89+
"router-with-gateway-ref",
90+
"router-with-managed-route",
91+
"model-fb-opt-125m",
92+
"workload-llmd-simulator",
93+
],
94+
prompt="KServe is a",
95+
before_test=[lambda: create_router_resources(
96+
gateways=[ROUTER_GATEWAYS[0]],
97+
)],
98+
after_test=[lambda: delete_router_resources(
99+
gateways=[ROUTER_GATEWAYS[0]],
100+
)],
101+
),
102+
marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node, pytest.mark.llmd_simulator],
103+
),
86104
pytest.param(
87105
TestCase(
88106
base_refs=[

test/e2e/pytest.ini

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,4 +23,6 @@ markers =
2323
cluster_amd: test targeting cluster with AMD
2424
cluster_intel: test targeting cluster with Intel
2525
cluster_nvidia: test targeting cluster with NVIDIA
26-
cluster_single_node: test targeting single node cluster
26+
cluster_nvidia_roce: test targeting cluster with NVIDIA ROCe
27+
cluster_single_node: test targeting single node cluster
28+
llmd_simulator: test using llm-d simulator

0 commit comments

Comments
 (0)