@@ -232,6 +232,19 @@ impl<B: BatchType> Queue<B> {
232
232
let now = Instant :: now ( ) ;
233
233
let mut batch_stats = <B >:: compute_stats ( entries) ;
234
234
let mut prefill_stats = <B >:: compute_stats ( & self . empty_map ) ;
235
+
236
+ // Compute the effective prefill weight limit, taking into account space already consumed
237
+ // by the in-progress batch
238
+ let effective_prefill_weight_limit = match self . config . prefill_weight_limit {
239
+ prefill_limit if prefill_limit == 0 || total_count == 0 => prefill_limit,
240
+ prefill_limit => {
241
+ let current_batch_weight = <B >:: batch_initial_weight ( & batch_stats, total_count) ;
242
+ let pct_space_free = 1.0 - (
243
+ current_batch_weight as f64 / self . config . weight_limit as f64
244
+ ) ;
245
+ ( pct_space_free * prefill_limit as f64 ) as usize
246
+ } ,
247
+ } ;
235
248
// We first do a read-only pass over the queue to allow skipping over large entries
236
249
// that don't fit in the current batch to reach smaller entries that do
237
250
for ( index, entry) in self . buffer . iter ( ) . enumerate ( ) {
@@ -247,7 +260,7 @@ impl<B: BatchType> Queue<B> {
247
260
) ;
248
261
249
262
// Avoid more granular analysis if possible
250
- if <B >:: batch_weight ( & batch_stats, total_count + 1 ) > config. weight_limit {
263
+ if <B >:: batch_max_weight ( & batch_stats, total_count + 1 ) > config. weight_limit {
251
264
// We aren't sure whether this next request will fit, so populate
252
265
// a btree with the current batch of requests, the set of
253
266
// requests already evaluated, and this one, and perform more
@@ -274,9 +287,7 @@ impl<B: BatchType> Queue<B> {
274
287
tree. insert ( ( output_len, input_len, tree. len ( ) ) ) ;
275
288
276
289
// Perform analysis
277
- if <B >:: exceeds_weight (
278
- tree, config. weight_limit , output_len,
279
- ) {
290
+ if <B >:: exceeds_weight ( tree, config. weight_limit , output_len) {
280
291
if chosen_indices. len ( ) + buffer_size < min_size + index + 1 {
281
292
// We don't have enough remaining to meet min_size
282
293
return None
@@ -296,28 +307,22 @@ impl<B: BatchType> Queue<B> {
296
307
metrics:: increment_counter!( "tgi_queue_jump" ) ;
297
308
}
298
309
299
- // Also check whether adding this request will make the batch of new requests
300
- // too expensive latency-wise to perform in a single forward-pass.
301
- let mut prefill_weight_exceeded = false ;
302
- if config. prefill_weight_limit > 0 {
310
+ // Also check whether adding this request will breach the prefill weight limit
311
+ if effective_prefill_weight_limit > 0 {
303
312
let next_prefill_stats = <B >:: update_stats (
304
313
& prefill_stats, input_len, 0
305
314
) ;
306
315
let prefill_weight = <B >:: prefill_weight (
307
316
& next_prefill_stats, chosen_indices. len ( ) + 1
308
317
) ;
309
- if prefill_weight > config. prefill_weight_limit {
310
- if chosen_indices. is_empty ( ) {
311
- prefill_weight_exceeded = true ;
312
- } else {
313
- if let Some ( tree) = btree. as_mut ( ) {
314
- // Remove our tuple from the set
315
- tree. remove ( & ( output_len, input_len, tree. len ( ) - 1 ) ) ;
316
- }
317
- time_cutoff. get_or_insert_with ( || entry. queue_time . add ( CUTOFF_DURATION ) ) ;
318
- metrics:: increment_counter!( "tgi_prefill_weight_limit_exceeded" ) ;
319
- continue
318
+ if prefill_weight > effective_prefill_weight_limit {
319
+ if let Some ( tree) = btree. as_mut ( ) {
320
+ // Remove our tuple from the set
321
+ tree. remove ( & ( output_len, input_len, tree. len ( ) - 1 ) ) ;
320
322
}
323
+ time_cutoff. get_or_insert_with ( || entry. queue_time . add ( CUTOFF_DURATION ) ) ;
324
+ metrics:: increment_counter!( "tgi_prefill_weight_limit_exceeded" ) ;
325
+ continue
321
326
}
322
327
prefill_stats = next_prefill_stats;
323
328
}
@@ -326,7 +331,7 @@ impl<B: BatchType> Queue<B> {
326
331
327
332
chosen_indices. push ( index) ;
328
333
total_count += 1 ;
329
- if total_count >= config. size_limit || prefill_weight_exceeded {
334
+ if total_count >= config. size_limit {
330
335
break
331
336
}
332
337
}
0 commit comments