Refine batch packing logic related to prefill weight limit

njhill · njhill · commit cbeb36f134aa · 2023-09-28T15:06:57.000-07:00
The current batch packing logic does not take the space used by an in-progress batch into account when evaluating the configured prefill weight limit for a candidate add-on batch. This matters most when the prefill memory cost is much higher than that of incremental new-token generation for a given batch size.

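This fix scales the effective prefill limit for add-on batches by the fraction of the overall batch weight limit not already consumed by the existing in-progress batch, where that batch's consumption is measured by its initial (input-length-only) weight.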
diff --git a/router/src/batch_types.rs b/router/src/batch_types.rs
@@ -9,8 +9,10 @@ pub(crate) trait BatchType: Send + Sync + Clone + 'static {
 
     /// Update batch statistics with an additional request
     fn update_stats(stats: &Self::Stats, input_length: usize, output_length: usize) -> Self::Stats;
-    /// Calculate batch weight given batch statistics
-    fn batch_weight(stats: &Self::Stats, batch_size: usize) -> usize;
+    /// Calculate worst-case max batch weight given batch statistics
+    fn batch_max_weight(stats: &Self::Stats, batch_size: usize) -> usize;
+    /// Calculate initial batch weight given batch statistics (based on input lengths only)
+    fn batch_initial_weight(stats: &Self::Stats, batch_size: usize) -> usize;
     /// Calculate prefill batch weight given prefill batch statistics
     fn prefill_weight(prefill_stats: &Self::Stats, batch_size: usize) -> usize;
     /// Indicate whether a hypothetical batch will exceed the combined weight limit
@@ -44,21 +46,29 @@ pub(crate) trait BatchType: Send + Sync + Clone + 'static {
 pub(crate) struct FlashBatch {}
 
 impl BatchType for FlashBatch {
-    /// Keep track of total number of tokens in the batch
-    type Stats = usize;
+    /// Keep track of total number of input and output tokens in the batch
+    type Stats = (usize, usize);
 
     fn update_stats(
         total_tokens: &Self::Stats, input_length: usize, output_length: usize
     ) -> Self::Stats {
-        total_tokens + input_length + output_length
+        let (total_in_tokens, total_out_tokens) = total_tokens;
+        (total_in_tokens + input_length, total_out_tokens + output_length)
+    }
+
+    fn batch_max_weight(total_tokens: &Self::Stats, _batch_size: usize) -> usize {
+        let (total_in_tokens, total_out_tokens) = total_tokens;
+        total_in_tokens + total_out_tokens
     }
 
-    fn batch_weight(total_tokens: &Self::Stats, _batch_size: usize) -> usize {
-        *total_tokens
+    fn batch_initial_weight(total_tokens: &Self::Stats, _batch_size: usize) -> usize {
+        let (total_in_tokens, _) = total_tokens;
+        *total_in_tokens
     }
 
     fn prefill_weight(total_tokens: &Self::Stats, _batch_size: usize) -> usize {
-        *total_tokens
+        let (total_in_tokens, _) = total_tokens;
+        *total_in_tokens
     }
 
     fn exceeds_weight(
@@ -106,13 +116,18 @@ impl BatchType for PaddedBatch {
         (max(*max_input_length, input_length), max(*max_output_length, output_length))
     }
 
-    fn batch_weight(max_in_out_lengths: &Self::Stats, batch_size: usize) -> usize {
+    fn batch_max_weight(max_in_out_lengths: &Self::Stats, batch_size: usize) -> usize {
         let (max_input_length, max_output_length) = max_in_out_lengths;
         let max_seq_len = max_input_length + max_output_length;
         // Memory requirement roughly proportional to batch_size * seq_len^2
         batch_size * max_seq_len.pow(2)
     }
 
+    fn batch_initial_weight(max_in_out_lengths: &Self::Stats, batch_size: usize) -> usize {
+        let (max_input_length, _) = max_in_out_lengths;
+        batch_size * max_input_length
+    }
+
     fn prefill_weight(max_in_out_lengths: &Self::Stats, batch_size: usize) -> usize {
         // Empirically, prefill latency is proportional to batch_size * seq_len^(3/2)
         let (max_input_length, _) = max_in_out_lengths;
diff --git a/router/src/queue.rs b/router/src/queue.rs
@@ -232,6 +232,19 @@ impl<B: BatchType> Queue<B> {
         let now = Instant::now();
         let mut batch_stats = <B>::compute_stats(entries);
         let mut prefill_stats = <B>::compute_stats(&self.empty_map);
+
+        // Compute the effective prefill weight limit, taking into account space already consumed
+        // by the in-progress batch
+        let effective_prefill_weight_limit = match self.config.prefill_weight_limit {
+            prefill_limit if prefill_limit == 0 || total_count == 0 => prefill_limit,
+            prefill_limit => {
+                let current_batch_weight = <B>::batch_initial_weight(&batch_stats, total_count);
+                let pct_space_free = 1.0 - (
+                    current_batch_weight as f64 / self.config.weight_limit as f64
+                );
+                (pct_space_free * prefill_limit as f64) as usize
+            },
+        };
         // We first do a read-only pass over the queue to allow skipping over large entries
         // that don't fit in the current batch to reach smaller entries that do
         for (index, entry) in self.buffer.iter().enumerate() {
@@ -247,7 +260,7 @@ impl<B: BatchType> Queue<B> {
             );
 
             // Avoid more granular analysis if possible
-            if <B>::batch_weight(&batch_stats, total_count + 1) > config.weight_limit {
+            if <B>::batch_max_weight(&batch_stats, total_count + 1) > config.weight_limit {
                 // We aren't sure whether this next request will fit, so populate
                 // a btree with the current batch of requests, the set of
                 // requests already evaluated, and this one, and perform more
@@ -274,9 +287,7 @@ impl<B: BatchType> Queue<B> {
                 tree.insert((output_len, input_len, tree.len()));
 
                 // Perform analysis
-                if <B>::exceeds_weight(
-                    tree, config.weight_limit, output_len,
-                ) {
+                if <B>::exceeds_weight(tree, config.weight_limit, output_len) {
                     if chosen_indices.len() + buffer_size < min_size + index + 1 {
                         // We don't have enough remaining to meet min_size
                         return None
@@ -296,28 +307,22 @@ impl<B: BatchType> Queue<B> {
                 metrics::increment_counter!("tgi_queue_jump");
             }
 
-            // Also check whether adding this request will make the batch of new requests
-            // too expensive latency-wise to perform in a single forward-pass.
-            let mut prefill_weight_exceeded = false;
-            if config.prefill_weight_limit > 0 {
+            // Also check whether adding this request will breach the prefill weight limit
+            if effective_prefill_weight_limit > 0 {
                 let next_prefill_stats = <B>::update_stats(
                     &prefill_stats, input_len, 0
                 );
                 let prefill_weight = <B>::prefill_weight(
                     &next_prefill_stats, chosen_indices.len() + 1
                 );
-                if prefill_weight > config.prefill_weight_limit {
-                    if chosen_indices.is_empty() {
-                        prefill_weight_exceeded = true;
-                    } else {
-                        if let Some(tree) = btree.as_mut() {
-                            // Remove our tuple from the set
-                            tree.remove(&(output_len, input_len, tree.len() - 1));
-                        }
-                        time_cutoff.get_or_insert_with(|| entry.queue_time.add(CUTOFF_DURATION));
-                        metrics::increment_counter!("tgi_prefill_weight_limit_exceeded");
-                        continue
+                if prefill_weight > effective_prefill_weight_limit {
+                    if let Some(tree) = btree.as_mut() {
+                        // Remove our tuple from the set
+                        tree.remove(&(output_len, input_len, tree.len() - 1));
                     }
+                    time_cutoff.get_or_insert_with(|| entry.queue_time.add(CUTOFF_DURATION));
+                    metrics::increment_counter!("tgi_prefill_weight_limit_exceeded");
+                    continue
                 }
                 prefill_stats = next_prefill_stats;
             }
@@ -326,7 +331,7 @@ impl<B: BatchType> Queue<B> {
 
             chosen_indices.push(index);
             total_count += 1;
-            if total_count >= config.size_limit || prefill_weight_exceeded {
+            if total_count >= config.size_limit {
                 break
             }
         }
diff --git a/router/src/server.rs b/router/src/server.rs
@@ -190,7 +190,7 @@ impl<B: BatchType> BatchConfigValidator<B> {
         let single_request_stats = <B>::update_stats(
             &B::Stats::default(), max_sequence_length, 0
         );
-        let single_request_weight = <B>::batch_weight(
+        let single_request_weight = <B>::batch_initial_weight(
             &single_request_stats, 1
         );
         let weight_upper_bound = single_request_weight * max_batch_size;