@@ -282,42 +282,26 @@ func TestFullDuplexStreamed_KubeInferenceObjectiveRequest(t *testing.T) {
 			),
 		},
 		{
-			name:     "noncritical and all models past threshold, shed request",
-			requests: integrationutils.GenerateStreamedRequestSet(logger, "test4", modelSheddable, modelSQLLoraTarget, nil),
+			name:     "don't shed requests by default",
+			requests: integrationutils.GenerateStreamedRequestSet(logger, "test4", modelSQLLora, modelSQLLoraTarget, nil),
 			// pod 0: excluded; above queue size threshold
 			// pod 1: excluded; above KV cache threshold
 			// pod 2: excluded; above queue size threshold
 			pods: newPodStates(
-				podState{index: 0, queueSize: 6, kvCacheUsage: 0.2, activeModels: []string{"foo", "bar", modelSheddableTarget}},
-				podState{index: 1, queueSize: 0, kvCacheUsage: 0.85, activeModels: []string{"foo", modelSheddableTarget}},
-				podState{index: 2, queueSize: 10, kvCacheUsage: 0.9, activeModels: []string{"foo", modelSheddableTarget}},
-			),
-			wantErr:       false,
-			wantMetrics:   map[string]string{},
-			wantResponses: integrationutils.NewImmediateErrorResponse(
-				envoyTypePb.StatusCode_TooManyRequests,
-				"inference gateway: InferencePoolResourceExhausted - system saturated, non-critical request dropped",
-			),
-		},
-		{
-			name:     "noncritical, but one server has capacity, do not shed",
-			requests: integrationutils.GenerateStreamedRequestSet(logger, "test5", modelSheddable, modelSheddableTarget, nil),
-			// Pod 1 will be picked because it has relatively low queue size and low KV cache.
-			pods: newPodStates(
-				podState{index: 0, queueSize: 4, kvCacheUsage: 0.2, activeModels: []string{"foo", "bar", modelSheddableTarget}},
-				podState{index: 1, queueSize: 4, kvCacheUsage: 0.85, activeModels: []string{"foo", modelSheddableTarget}},
-				podState{index: 2, queueSize: 10, kvCacheUsage: 0.9, activeModels: []string{"foo", modelSheddableTarget}},
+				podState{index: 0, queueSize: 6, kvCacheUsage: 0.2, activeModels: []string{"foo", "bar", modelSQLLoraTarget}},
+				podState{index: 1, queueSize: 0, kvCacheUsage: 0.85, activeModels: []string{"foo"}},
+				podState{index: 2, queueSize: 10, kvCacheUsage: 0.9, activeModels: []string{"foo"}},
 			),
 			wantMetrics: map[string]string{
 				"inference_model_request_total": inferenceObjectiveRequestTotal([]label{
-					{"model_name", modelSheddable},
-					{"target_model_name", modelSheddableTarget},
+					{"model_name", modelSQLLora},
+					{"target_model_name", modelSQLLoraTarget},
 				}),
 			},
 			wantErr: false,
 			wantResponses: integrationutils.NewRequestBufferedResponse(
 				"192.168.1.1:8000",
-				fmt.Sprintf(`{"max_tokens":100,"model":%q,"prompt":"test5 ","temperature":0}`, modelSheddableTarget),
+				fmt.Sprintf(`{"max_tokens":100,"model":%q,"prompt":"test4 ","temperature":0}`, modelSQLLoraTarget),
 				&configPb.HeaderValueOption{
 					Header: &configPb.HeaderValue{
 						Key: "hi",
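For readers outside the repo, here is a minimal sketch of the two test helpers the table-driven cases above lean on, reconstructed purely from their call sites in this hunk; the field types and the variadic return shape are assumptions, not the repository's actual definitions.

// Sketch only: inferred from the call sites above, not the actual source.

// podState describes the simulated metrics reported by one fake backend pod.
type podState struct {
	index        int      // pod index (assumed; used to identify the fake pod)
	queueSize    int      // reported request queue depth
	kvCacheUsage float64  // reported KV cache utilization in [0, 1]
	activeModels []string // model/LoRA names currently active on the pod
}

// newPodStates gathers the per-pod states for a test case
// (variadic signature and return type assumed).
func newPodStates(states ...podState) []podState {
	return states
}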