Commit b0d32ef

Enforce limit on prefill padding tokens, delay between prefills
To limit computation wasted on padding and mitigate the impact of workloads with frequent large-input, tiny-output requests.
1 parent 83b66c5 commit b0d32ef
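
For context on what the new default limit of 0.2 means in practice, here is a small standalone Rust sketch (an illustration only, not code from this repository): with padded batching every request is padded up to the longest input in the batch, and the commit refuses to build prefill batches where more than 20% of those padded token slots would be padding.

fn main() {
    let input_lengths = [200usize, 120, 60];               // example request input lengths
    let max_len = *input_lengths.iter().max().unwrap();
    let padded_slots = max_len * input_lengths.len();      // 600 slots after padding to 200
    let real_tokens: usize = input_lengths.iter().sum();   // 380 real tokens
    let padding_share = (padded_slots - real_tokens) as f32 / padded_slots as f32;

    let max_prefill_padding = 0.2_f32; // default introduced by this commit
    println!(
        "padding share {:.2}, over the {} limit: {}",
        padding_share,
        max_prefill_padding,
        padding_share > max_prefill_padding // ~0.37 > 0.2, so this batch would not be built
    );
}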

File tree

7 files changed: +141 -78 lines

README.md

Lines changed: 39 additions & 38 deletions
Large diffs are not rendered by default.

launcher/src/main.rs

Lines changed: 4 additions & 0 deletions
@@ -49,6 +49,8 @@ struct Args {
     max_batch_weight: Option<usize>,
     #[clap(default_value = None, long, env)]
     max_prefill_weight: Option<usize>,
+    #[clap(default_value = "0.2", long, env)]
+    max_prefill_padding: f32,
     #[clap(default_value = "24", long, env)]
     max_waiting_tokens: usize,
     #[clap(default_value = "3000", long, short, env)]
@@ -221,6 +223,8 @@ fn main() -> ExitCode {
             args.max_new_tokens.to_string(),
             "--max-batch-size".to_string(),
             args.max_batch_size.to_string(),
+            "--max-prefill-padding".to_string(),
+            args.max_prefill_padding.to_string(),
             "--max-waiting-tokens".to_string(),
             args.max_waiting_tokens.to_string(),
             "--port".to_string(),

router/src/batch_types.rs

Lines changed: 27 additions & 13 deletions
@@ -15,6 +15,8 @@ pub(crate) trait BatchType: Send + Sync + Clone + 'static {
     fn batch_initial_weight(stats: &Self::Stats, batch_size: usize) -> usize;
     /// Calculate prefill batch weight given prefill batch statistics
     fn prefill_weight(prefill_stats: &Self::Stats, batch_size: usize) -> usize;
+    /// Percentage of batch tokens that are padding
+    fn percent_padding(prefill_stats: &Self::Stats, batch_size: usize) -> f32;
     /// Indicate whether a hypothetical batch will exceed the combined weight limit
     fn exceeds_weight(
         tree: &BTreeSet<(usize, usize, usize)>, max_total_weight: usize, current_output_len: usize
@@ -61,16 +63,18 @@ impl BatchType for FlashBatch {
         total_in_tokens + total_out_tokens
     }

-    fn batch_initial_weight(total_tokens: &Self::Stats, _batch_size: usize) -> usize {
-        let (total_in_tokens, _) = total_tokens;
+    fn batch_initial_weight((total_in_tokens, _): &Self::Stats, _batch_size: usize) -> usize {
         *total_in_tokens
     }

-    fn prefill_weight(total_tokens: &Self::Stats, _batch_size: usize) -> usize {
-        let (total_in_tokens, _) = total_tokens;
+    fn prefill_weight((total_in_tokens, _): &Self::Stats, _batch_size: usize) -> usize {
         *total_in_tokens
     }

+    fn percent_padding(_: &Self::Stats, _batch_size: usize) -> f32 {
+        0.0
+    }
+
     fn exceeds_weight(
         tree: &BTreeSet<(usize, usize, usize)>, max_total_weight: usize, current_output_len: usize
     ) -> bool {
@@ -106,34 +110,44 @@ impl BatchType for FlashBatch {
 pub(crate) struct PaddedBatch {}

 impl BatchType for PaddedBatch {
-    /// Keep track of maximum input length, maximum output length
-    type Stats = (usize, usize);
+    /// Keep track of maximum input length, maximum output length, input token count
+    type Stats = (usize, usize, usize);

     fn update_stats(
         max_in_out_lengths: &Self::Stats, input_length: usize, output_length: usize
     ) -> Self::Stats {
-        let (max_input_length, max_output_length) = max_in_out_lengths;
-        (max(*max_input_length, input_length), max(*max_output_length, output_length))
+        let (max_input_length, max_output_length, total_in_tokens) = max_in_out_lengths;
+        (
+            max(*max_input_length, input_length),
+            max(*max_output_length, output_length),
+            total_in_tokens + input_length
+        )
     }

     fn batch_max_weight(max_in_out_lengths: &Self::Stats, batch_size: usize) -> usize {
-        let (max_input_length, max_output_length) = max_in_out_lengths;
+        let (max_input_length, max_output_length, _) = max_in_out_lengths;
         let max_seq_len = max_input_length + max_output_length;
         // Memory requirement roughly proportional to batch_size * seq_len^2
         batch_size * max_seq_len.pow(2)
     }

-    fn batch_initial_weight(max_in_out_lengths: &Self::Stats, batch_size: usize) -> usize {
-        let (max_input_length, _) = max_in_out_lengths;
+    fn batch_initial_weight((max_input_length, _, _): &Self::Stats, batch_size: usize) -> usize {
         batch_size * max_input_length.pow(2)
     }

-    fn prefill_weight(max_in_out_lengths: &Self::Stats, batch_size: usize) -> usize {
+    fn prefill_weight((max_input_length, _, _): &Self::Stats, batch_size: usize) -> usize {
         // Empirically, prefill latency is proportional to batch_size * seq_len^(3/2)
-        let (max_input_length, _) = max_in_out_lengths;
         batch_size * max_input_length.pow(3).sqrt()
     }

+    fn percent_padding((max_input_length, _, total_in_tokens): &Self::Stats, batch_size: usize) -> f32 {
+        let total_toks = max_input_length * batch_size;
+        match total_toks {
+            0 => 0.0,
+            total_toks => (total_toks - total_in_tokens) as f32 / total_toks as f32,
+        }
+    }
+
     fn exceeds_weight(
         tree: &BTreeSet<(usize, usize, usize)>, max_total_weight: usize, current_output_len: usize
     ) -> bool {
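
The queue changes further down use this new percent_padding method when deciding whether one more queued request still fits into the next prefill batch. Below is a minimal standalone sketch of that decision, with a plain tuple standing in for PaddedBatch::Stats; update_stats and percent_padding mirror the trait methods above, but the surrounding selection loop is illustrative only, not the router's Queue code.

// Stats: (max_input_len, max_output_len, total_input_tokens)
type Stats = (usize, usize, usize);

fn update_stats((max_in, max_out, total_in): Stats, input_len: usize, output_len: usize) -> Stats {
    (max_in.max(input_len), max_out.max(output_len), total_in + input_len)
}

fn percent_padding((max_in, _, total_in): Stats, batch_size: usize) -> f32 {
    let total_slots = max_in * batch_size;
    match total_slots {
        0 => 0.0,
        slots => (slots - total_in) as f32 / slots as f32,
    }
}

fn main() {
    let max_prefill_padding = 0.2; // default from this commit
    let mut stats: Stats = (0, 0, 0);
    let mut batch_size = 0;

    // Similar-length inputs batch fine; the short 60-token request would push the
    // padding share above 20% and is skipped for this prefill.
    for input_len in [200, 180, 60] {
        let next_stats = update_stats(stats, input_len, 0);
        let padding = percent_padding(next_stats, batch_size + 1);
        if padding > max_prefill_padding {
            println!("skip request of length {input_len}: padding {padding:.2}");
        } else {
            stats = next_stats;
            batch_size += 1;
        }
    }
}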

router/src/batcher.rs

Lines changed: 31 additions & 21 deletions
@@ -6,14 +6,19 @@ use axum::http::StatusCode;
 use axum::Json;
 use std::future::Future;
 use std::mem::take;
+use std::ops::Add;
 use std::pin::Pin;
 use std::sync::Arc;
 use std::sync::atomic::{AtomicBool, Ordering};
 use std::task::{Context, Poll};
+use std::time::Duration;
 use futures::{FutureExt, pin_mut, TryFutureExt};
 use futures::future::Map;
 use nohash_hasher::IntMap;
-use text_generation_client::{ClientError, Token, ShardedClient, CachedBatch, RequestsStatus, InputTokens, GenerateError, Batch, GenerateTokenResponse};
+use text_generation_client::{
+    ClientError, Token, ShardedClient, CachedBatch, RequestsStatus,
+    InputTokens, GenerateError, Batch, GenerateTokenResponse
+};
 use thiserror::Error;
 use tokio::select;

@@ -355,11 +360,12 @@ async fn batching_task<B: BatchType>(
         }
         log_new_batch(batch.id, processor.entries());

-        let mut cached_batch = processor.prefill(
+        let (mut cached_batch, _) = processor.prefill(
             &mut client, batch, vec![], None, &mut queue,
         ).await;
         let mut waiting_tokens = 1;
         let mut batch_max_remaining_tokens = None;
+        let mut next_prefill_after = None;

         // We loop until we do not receive any cached batch from the inference server (== until
         // all requests have met their stopping criteria)
@@ -385,7 +391,8 @@ async fn batching_task<B: BatchType>(
             metrics::gauge!("tgi_batch_max_remaining_tokens", batch_max_remaining_tokens.unwrap() as f64);

             // Don't interfere with current batch if it's about to complete
-            if batch_max_remaining_tokens.unwrap() >= 2 {
+            if batch_max_remaining_tokens.unwrap() >= 2 &&
+                next_prefill_after.map_or(true, |t| Instant::now() > t) {
                 // Determine min num of requests for add-on batch based on current batch size and
                 // tokens since last prefill
                 let min_size = if batch_size <= 1 || waiting_tokens >= max_waiting_tokens {
@@ -411,7 +418,7 @@ async fn batching_task<B: BatchType>(
                     // Generate one token for this new batch to have the attention past in cache
                     let first_new_id = new_batch.requests.first()
                         .expect("Batch can't be empty here").id;
-                    let new_cached_batch = processor.prefill(
+                    let (new_cached_batch, prefill_time) = processor.prefill(
                         &mut client, new_batch, to_prune, Some(first_new_id), &mut queue
                     ).await;

@@ -424,6 +431,9 @@ async fn batching_task<B: BatchType>(
                     // Reset waiting counter and batch_remaining_tokens
                     waiting_tokens = 1;
                     batch_max_remaining_tokens = None;
+                    // Ensure we wait at least half as long as the last prefill took
+                    // before we do another prefill (unless the entire batch completes by then)
+                    next_prefill_after = Some(Instant::now().add(prefill_time / 2));
                     // Extend current batch with the new batch
                     if let Some(new_batch) = new_cached_batch {
                         let new_batch_id = new_batch.batch_id;
@@ -452,10 +462,12 @@ async fn batching_task<B: BatchType>(
                     // All batches completed or failed, fetch a new one
                     break
                 }
+            } else {
+                next_prefill_after = None;
             }
         }

-        cached_batch = processor.next_token(&mut client, batches, &mut queue).await;
+        (cached_batch, _) = processor.next_token(&mut client, batches, &mut queue).await;
         waiting_tokens += 1;
         // Reset batch_remaining_tokens if any requests in the batch completed
         if batch_max_remaining_tokens.is_some() && some_completed(&cached_batch) {
@@ -520,29 +532,24 @@ impl<'a> TokenProcessor<'a> {
         // First request id in this batch if it doesn't comprise all current entries
         start_id: Option<u64>,
         queue: &mut Queue<B>,
-    ) -> Option<CachedBatch> {
+    ) -> (Option<CachedBatch>, Duration) {
         let batch_size = batch.requests.len();
         let batch_tokens = batch.total_tokens;
         let start_time = Instant::now();
         metrics::histogram!("tgi_batch_next_tokens", batch_tokens as f64);
         metrics::histogram!(
             "tgi_batch_inference_batch_size", batch_size as f64, "method" => "prefill"
         );
-        self._wrap_future(
-            client.prefill(batch, to_prune).map(|r| {
-                info!(
-                    "Prefill took {:?} for {batch_size} inputs, {batch_tokens} total tokens",
-                    start_time.elapsed(),
-                );
-                r
-            }),
-            "prefill", start_time, start_id, queue
-        ).await
+        let (result, prefill_time) = self._wrap_future(
+            client.prefill(batch, to_prune), "prefill", start_time, start_id, queue
+        ).await;
+        info!("Prefill took {prefill_time:?} for {batch_size} inputs, {batch_tokens} total tokens");
+        (result, prefill_time)
     }

     async fn next_token<B: BatchType>(
         &mut self, client: &mut ShardedClient, batches: Vec<CachedBatch>, queue: &mut Queue<B>,
-    ) -> Option<CachedBatch> {
+    ) -> (Option<CachedBatch>, Duration) {
         metrics::histogram!(
             "tgi_batch_inference_batch_size", self.entries.len() as f64, "method" => "next_token"
         );
@@ -561,7 +568,7 @@ impl<'a> TokenProcessor<'a> {
         // First request id in this batch if it doesn't comprise all current entries
         start_id: Option<u64>,
         queue: &mut Queue<B>,
-    ) -> Option<CachedBatch> {
+    ) -> (Option<CachedBatch>, Duration) {
         metrics::increment_counter!("tgi_batch_inference_count", "method" => method);

         // We process the shared queue while waiting for the response from the python shard(s)
@@ -574,7 +581,8 @@ impl<'a> TokenProcessor<'a> {
             }
         };

-        match result {
+        let elapsed = start_time.elapsed();
+        let result = match result {
             Ok(
                 Some((generated_tokens, input_tokens, errors, next_batch_id, forward_duration))
             ) => {
@@ -587,7 +595,7 @@ impl<'a> TokenProcessor<'a> {
                 self.generation_health.store(true, Ordering::SeqCst);
                 metrics::histogram!(
                     "tgi_batch_inference_duration",
-                    start_time.elapsed().as_secs_f64(),
+                    elapsed.as_secs_f64(),
                     "method" => method,
                     "makeup" => "single_only", // later will possibly be beam_only or mixed
                 );
@@ -626,7 +634,9 @@ impl<'a> TokenProcessor<'a> {
                 self.send_errors(err, start_id);
                 None
             },
-        }
+        };
+
+        (result, elapsed)
     }

     /// Send errors to the Batcher for all `request_ids`
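
The batcher now times each prefill and declines to start another one until at least half of that duration has elapsed (unless the running batch completes first). Here is a minimal standalone sketch of that bookkeeping; do_prefill and timed_prefill are synchronous stand-ins for the real client call, not repository APIs.

use std::time::{Duration, Instant};

fn do_prefill(batch_tokens: usize) -> usize {
    // Pretend work: just echo a token count.
    batch_tokens
}

// Return both the result and how long the call took, like the modified prefill() above.
fn timed_prefill(batch_tokens: usize) -> (usize, Duration) {
    let start = Instant::now();
    let result = do_prefill(batch_tokens);
    (result, start.elapsed())
}

fn main() {
    let mut next_prefill_after: Option<Instant> = None;

    // Only prefill once any back-off from the previous prefill has expired.
    if next_prefill_after.map_or(true, |t| Instant::now() > t) {
        let (_result, prefill_time) = timed_prefill(4096);
        // Wait at least half as long as the last prefill took before the next one.
        next_prefill_after = Some(Instant::now() + prefill_time / 2);
    }

    println!("next prefill allowed after: {:?}", next_prefill_after);
}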

router/src/main.rs

Lines changed: 3 additions & 0 deletions
@@ -23,6 +23,8 @@ struct Args {
     max_batch_weight: Option<usize>,
     #[clap(default_value = None, long, env)]
     max_prefill_weight: Option<usize>,
+    #[clap(default_value = "0.2", long, env)]
+    max_prefill_padding: f32,
     #[clap(default_value = "24", long, env)]
     max_waiting_tokens: usize,
     #[clap(default_value = "3000", long, short, env)]
@@ -129,6 +131,7 @@ fn main() -> Result<(), std::io::Error> {
         max_batch_size: args.max_batch_size,
         max_batch_weight: args.max_batch_weight,
         max_prefill_weight: args.max_prefill_weight,
+        max_prefill_padding: args.max_prefill_padding,
         max_waiting_tokens: args.max_waiting_tokens,
         client: sharded_client,
         tokenizer,

router/src/queue.rs

Lines changed: 29 additions & 6 deletions
@@ -105,6 +105,8 @@ pub(crate) struct BatchingConfig {
     pub(crate) weight_limit: usize,
     /// Maximum weight of individual prefill batches
     pub(crate) prefill_weight_limit: usize,
+    /// Maximum percentage of pad tokens in prefill batches. In range [0, 1]
+    pub(crate) prefill_padding_limit: f32,
 }

 /// Request Queue
@@ -249,9 +251,15 @@ impl<B: BatchType> Queue<B> {
                 let pct_space_free = 1.0 - (
                     current_batch_weight as f64 / self.config.weight_limit as f64
                 );
-                (pct_space_free * prefill_limit as f64) as usize
+                let limit = (pct_space_free * prefill_limit as f64) as usize;
+                if limit == 0 {
+                    return None
+                }
+                limit
             },
         };
+        let max_prefill_padding = self.config.prefill_padding_limit;
+
         // We first do a read-only pass over the queue to allow skipping over large entries
         // that don't fit in the current batch to reach smaller entries that do
         for (index, entry) in self.buffer.iter().enumerate() {
@@ -316,14 +324,29 @@ impl<B: BatchType> Queue<B> {
             }

             // Also check whether adding this request will breach the prefill weight limit
-            if effective_prefill_weight_limit > 0 {
+            if effective_prefill_weight_limit > 0 || max_prefill_padding < 1.0 {
                 let next_prefill_stats = <B>::update_stats(
                     &prefill_stats, input_len, 0
                 );
-                let prefill_weight = <B>::prefill_weight(
-                    &next_prefill_stats, chosen_indices.len() + 1
-                );
-                if prefill_weight > effective_prefill_weight_limit {
+                let batch_size = chosen_indices.len() + 1;
+                let mut skip = false;
+                if effective_prefill_weight_limit > 0 {
+                    let prefill_weight = <B>::prefill_weight(&next_prefill_stats, batch_size);
+                    if prefill_weight > effective_prefill_weight_limit {
+                        skip = true;
+                        metrics::increment_counter!("tgi_prefill_weight_limit_exceeded");
+                    }
+                }
+                if !skip && max_prefill_padding < 1.0 {
+                    let percentage_padding = <B>::percent_padding(&next_prefill_stats, batch_size);
+                    if percentage_padding > max_prefill_padding {
+                        skip = true;
+                        // TODO if we skip due to padding and added other requests from queue,
+                        // we could consider doing another pass since the padding proportion may have decreased
+                        metrics::increment_counter!("tgi_prefill_padding_limit_exceeded");
+                    }
+                }
+                if skip {
                     if let Some(tree) = btree.as_mut() {
                         // Remove our tuple from the set
                         tree.remove(&(output_len, input_len, tree.len() - 1));

router/src/server.rs

Lines changed: 8 additions & 0 deletions
@@ -245,6 +245,7 @@ pub struct ServerRunArgs {
     pub max_batch_size: usize,
     pub max_batch_weight: Option<usize>,
     pub max_prefill_weight: Option<usize>,
+    pub max_prefill_padding: f32,
     pub max_waiting_tokens: usize,
     pub client: ShardedClient,
     pub tokenizer: Tokenizer,
@@ -273,6 +274,7 @@ pub async fn run(mut args: ServerRunArgs) {
     if use_padding {
         do_run(args, seq2seq, eos_token_id, PaddedBatch{}).await
     } else {
+        args.max_prefill_padding = 1.0; // There's no padding so disable checking for this
         do_run(args, seq2seq, eos_token_id, FlashBatch{}).await
     }
 }
@@ -294,6 +296,11 @@ async fn do_run<B: BatchType>(
         args.max_prefill_weight,
     );

+    let max_prefill_padding = args.max_prefill_padding;
+    if max_prefill_padding < 0.0 || max_prefill_padding > 1.0 {
+        panic!("max_prefill_padding ({}) must be a percentage in the range [0.0, 1.0]", max_prefill_padding)
+    }
+
     let tokenizers = AsyncTokenizer::new(
         &args.tokenizer, args.tokenization_workers
     );
@@ -312,6 +319,7 @@ async fn do_run<B: BatchType>(
             size_limit: args.max_batch_size,
             weight_limit: max_batch_weight,
             prefill_weight_limit: max_prefill_weight,
+            prefill_padding_limit: max_prefill_padding,
         },
         args.max_waiting_tokens,
         args.max_concurrent_requests,
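
The server wiring above validates the new limit and disables it on the flash-attention path by forcing it to 1.0 (with no padding, the check can never trigger). A small standalone sketch of that convention follows; validate_prefill_padding_limit is a hypothetical helper used only for illustration, not a function in this repository.

fn validate_prefill_padding_limit(use_padding: bool, configured: f32) -> f32 {
    if !use_padding {
        // No padding tokens exist, so never skip requests on padding grounds.
        return 1.0;
    }
    if !(0.0..=1.0).contains(&configured) {
        panic!("max_prefill_padding ({configured}) must be a percentage in the range [0.0, 1.0]");
    }
    configured
}

fn main() {
    assert_eq!(validate_prefill_padding_limit(false, 0.2), 1.0); // flash path: limit disabled
    assert_eq!(validate_prefill_padding_limit(true, 0.2), 0.2);  // padded path: limit kept
    println!("ok");
}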

0 commit comments
