@@ -282,42 +282,26 @@ func TestFullDuplexStreamed_KubeInferenceObjectiveRequest(t *testing.T) {
 			),
 		},
 		{
-			name:     "noncritical and all models past threshold, shed request",
-			requests: integrationutils.GenerateStreamedRequestSet(logger, "test4", modelSheddable, modelSQLLoraTarget, nil),
+			name:     "don't shed requests by default",
+			requests: integrationutils.GenerateStreamedRequestSet(logger, "test4", modelSQLLora, modelSQLLoraTarget, nil),
 			// pod 0: excluded; above queue size threshold
 			// pod 1: excluded; above KV cache threshold
 			// pod 2: excluded; above queue size threshold
 			pods: newPodStates(
-				podState{index: 0, queueSize: 6, kvCacheUsage: 0.2, activeModels: []string{"foo", "bar", modelSheddableTarget}},
-				podState{index: 1, queueSize: 0, kvCacheUsage: 0.85, activeModels: []string{"foo", modelSheddableTarget}},
-				podState{index: 2, queueSize: 10, kvCacheUsage: 0.9, activeModels: []string{"foo", modelSheddableTarget}},
-			),
-			wantErr:     false,
-			wantMetrics: map[string]string{},
-			wantResponses: integrationutils.NewImmediateErrorResponse(
-				envoyTypePb.StatusCode_TooManyRequests,
-				"inference gateway: InferencePoolResourceExhausted - system saturated, non-critical request dropped",
-			),
-		},
-		{
-			name:     "noncritical, but one server has capacity, do not shed",
-			requests: integrationutils.GenerateStreamedRequestSet(logger, "test5", modelSheddable, modelSheddableTarget, nil),
-			// Pod 1 will be picked because it has relatively low queue size and low KV cache.
-			pods: newPodStates(
-				podState{index: 0, queueSize: 4, kvCacheUsage: 0.2, activeModels: []string{"foo", "bar", modelSheddableTarget}},
-				podState{index: 1, queueSize: 4, kvCacheUsage: 0.85, activeModels: []string{"foo", modelSheddableTarget}},
-				podState{index: 2, queueSize: 10, kvCacheUsage: 0.9, activeModels: []string{"foo", modelSheddableTarget}},
+				podState{index: 0, queueSize: 6, kvCacheUsage: 0.2, activeModels: []string{"foo", "bar", modelSQLLoraTarget}},
+				podState{index: 1, queueSize: 0, kvCacheUsage: 0.85, activeModels: []string{"foo"}},
+				podState{index: 2, queueSize: 10, kvCacheUsage: 0.9, activeModels: []string{"foo"}},
 			),
 			wantMetrics: map[string]string{
 				"inference_model_request_total": inferenceObjectiveRequestTotal([]label{
-					{"model_name", modelSheddable},
-					{"target_model_name", modelSheddableTarget},
+					{"model_name", modelSQLLora},
+					{"target_model_name", modelSQLLoraTarget},
 				}),
 			},
 			wantErr: false,
 			wantResponses: integrationutils.NewRequestBufferedResponse(
 				"192.168.1.1:8000",
-				fmt.Sprintf(`{"max_tokens":100,"model":%q,"prompt":"test5","temperature":0}`, modelSheddableTarget),
+				fmt.Sprintf(`{"max_tokens":100,"model":%q,"prompt":"test4","temperature":0}`, modelSQLLoraTarget),
 				&configPb.HeaderValueOption{
 					Header: &configPb.HeaderValue{
 						Key: "hi",