
Commit f17f870

tdoublep, maxdebayser, and njhill committed
Automated modelling of memory scaling behaviour
These changes introduce a memory scaling model, parameterized for the loaded model via measurement at startup. This allows batch sizes to be maximized for the available GPU memory while avoiding OOMs.

As a result, the manually configured MAX_BATCH_WEIGHT and MAX_PREFILL_WEIGHT env vars are no longer used. In their place is a BATCH_SAFETY_MARGIN percentage env var with a default of 20; it should hopefully rarely be necessary to override this.

Co-authored-by: Maximilien Philippe Marie de Bayser <[email protected]>
Co-authored-by: Nick Hill <[email protected]>
1 parent 1ffc616 commit f17f870
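
Note: the measurement and fitting happen in the Python server at startup and are not part of the files excerpted below. The following Rust sketch is only an illustration of the idea, with hypothetical numbers and helper names, assuming a simple least-squares fit of a per-token memory gradient and a budget reduced by the BATCH_SAFETY_MARGIN percentage.

// Illustrative sketch only (not from this commit): fit a per-token memory
// gradient from startup measurements and derive a token budget from it.
fn fit_gradient(token_counts: &[f64], memory_bytes: &[f64]) -> f64 {
    // Least squares through the origin: gradient = sum(x*y) / sum(x*x).
    let num: f64 = token_counts.iter().zip(memory_bytes).map(|(x, y)| x * y).sum();
    let den: f64 = token_counts.iter().map(|x| x * x).sum();
    num / den
}

fn main() {
    // Hypothetical measurements taken at startup: (batch tokens, bytes used).
    let tokens = [256.0, 512.0, 1024.0, 2048.0];
    let bytes = [0.9e8, 1.8e8, 3.7e8, 7.3e8];
    let nexttoken_gradient = fit_gradient(&tokens, &bytes);

    // Hypothetical 24 GB of free GPU memory, reduced by the default
    // BATCH_SAFETY_MARGIN of 20 percent.
    let weight_limit = 24.0e9 * (1.0 - 0.20);

    // Largest number of in-flight tokens the fitted model predicts will fit.
    let max_tokens = weight_limit / nexttoken_gradient;
    println!("gradient = {nexttoken_gradient:.0} bytes/token, budget = {max_tokens:.0} tokens");
}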

File tree: 18 files changed (+830, -219 lines)


integration_tests/text_generation_tests/test_server.py

Lines changed: 1 addition & 1 deletion

@@ -61,14 +61,14 @@ def start_server(
         # Reduce this so we can more easily test limit behaviour
         "--max-sequence-length", "200",
         "--max-new-tokens", "169",
-        "--max-batch-weight", "80000",
     ]

     if output_special_tokens:
         args.append("--output-special-tokens")

     env = os.environ.copy()
     env["RUST_BACKTRACE"] = "full"
+    env["ESTIMATE_MEMORY"] = "manual"
     env["PREFIX_STORE_PATH"] = os.path.join(TESTS_DIR, "prompt_prefixes")
     if not include_cache_env_vars:
         env.pop("TRANSFORMERS_CACHE", None)

launcher/src/main.rs

Lines changed: 21 additions & 22 deletions

@@ -16,7 +16,7 @@ use std::{fs, io};
 use std::env::VarError;
 use std::ffi::OsString;
 use std::os::unix::process::CommandExt;
-use tracing::info;
+use tracing::{info, warn};

 // In most cases this gives the best performance for inferencing
 const DEFAULT_PYTORCH_CUDA_ALLOC_CONF: &'static str = "expandable_segments:True";
@@ -47,12 +47,10 @@ struct Args {
     max_new_tokens: usize,
     #[clap(default_value = "12", long, env)]
     max_batch_size: usize,
-    #[clap(default_value = None, long, env)]
-    max_batch_weight: Option<usize>,
-    #[clap(default_value = None, long, env)]
-    max_prefill_weight: Option<usize>,
     #[clap(default_value = "0.2", long, env)]
     max_prefill_padding: f32,
+    #[clap(default_value = "20", long, env)]
+    batch_safety_margin: usize,
     #[clap(default_value = "24", long, env)]
     max_waiting_tokens: usize,
     #[clap(default_value = "3000", long, short, env)]
@@ -112,6 +110,20 @@ fn main() -> ExitCode {
         &args.model_name, args.revision.as_deref()
     ).expect("Could not find tokenizer for model");

+    match env::var("MAX_BATCH_WEIGHT") {
+        Ok(max_batch_weight) if !max_batch_weight.trim().is_empty() => {
+            warn!("MAX_BATCH_WEIGHT is set to {max_batch_weight} but this parameter will be ignored.");
+        }
+        _ => {}
+    }
+
+    match env::var("MAX_PREFILL_WEIGHT") {
+        Ok(max_prefill_weight) if !max_prefill_weight.trim().is_empty() => {
+            warn!("MAX_PREFILL_WEIGHT is set to {max_prefill_weight} but this parameter will be ignored.");
+        }
+        _ => {}
+    }
+
     // Set PYTORCH_CUDA_ALLOC_CONF to default value if it's not set in the environment
     let cuda_alloc_conf = match env::var("PYTORCH_CUDA_ALLOC_CONF") {
         Err(VarError::NotPresent) if DEFAULT_PYTORCH_CUDA_ALLOC_CONF == "" => None,
@@ -164,7 +176,7 @@ fn main() -> ExitCode {
         args.max_sequence_length,
         args.max_new_tokens,
         args.max_batch_size,
-        args.max_batch_weight,
+        args.batch_safety_margin,
         args.shard_uds_path,
         args.cuda_process_memory_fraction,
         cuda_alloc_conf,
@@ -237,15 +249,6 @@ fn main() -> ExitCode {
         tokenizer_path,
     ];

-    if let Some(max_batch_weight) = args.max_batch_weight {
-        argv.push("--max-batch-weight".to_string());
-        argv.push(max_batch_weight.to_string());
-    }
-    if let Some(max_prefill_weight) = args.max_prefill_weight {
-        argv.push("--max-prefill-weight".to_string());
-        argv.push(max_prefill_weight.to_string());
-    }
-
     if let Some(path) = args.tls_key_path {
         argv.push("--tls-key-path".to_string());
         argv.push(path);
@@ -395,7 +398,7 @@ fn shard_manager(
     max_sequence_length: usize,
     max_new_tokens: usize,
     max_batch_size: usize,
-    max_batch_weight: Option<usize>,
+    batch_safety_margin: usize,
     uds_path: String,
     cuda_process_memory_fraction: f32,
     cuda_alloc_conf: Option<&str>,
@@ -428,6 +431,8 @@ fn shard_manager(
         max_new_tokens.to_string(),
         "--max-batch-size".to_string(),
         max_batch_size.to_string(),
+        "--batch-safety-margin".to_string(),
+        batch_safety_margin.to_string(),
         "--uds-path".to_string(),
         uds_path,
         "--cuda-process-memory-fraction".to_string(),
@@ -455,12 +460,6 @@ fn shard_manager(
         shard_argv.push(revision);
     }

-    // Maximum batch weight - used only for PT2 compile
-    if let Some(max_batch_weight) = max_batch_weight {
-        shard_argv.push("--max-batch-weight".to_string());
-        shard_argv.push(max_batch_weight.to_string());
-    }
-
     // Copy current process env
     let mut env: Vec<(OsString, OsString)> = env::vars_os().collect();

proto/generate.proto

Lines changed: 12 additions & 2 deletions

@@ -41,16 +41,26 @@ message ClearCacheResponse {}
 /// Empty request
 message ModelInfoRequest {}

+message MemoryScalingModel {
+    float prefill_linear_coef0 = 1;
+    float prefill_quadratic_coef0 = 2;
+    float prefill_quadratic_coef1 = 3;
+    float nexttoken_linear_coef0 = 4;
+    float nexttoken_linear_coef1 = 5;
+    uint64 weight_limit = 6;
+}
+
 message ModelInfoResponse {
     enum ModelType {
         CAUSAL_LM = 0;
         SEQ2SEQ_LM = 1;
     }
-
     ModelType model_type = 1;
     uint32 eos_token = 2;
     /// Whether batches are rectangular/padded (false for flash attention)
     bool batch_padding = 3;
+    /// Memory scaling model
+    MemoryScalingModel memory_scaling_model = 4;
 }

 message NextTokenChooserParameters {
@@ -211,4 +221,4 @@ message PrefixLookupRequest {
 /// Empty response
 message PrefixLookupResponse {
     uint32 prefix_length = 1;
-}
+}
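
For orientation, these fields describe two fitted curves: prefill memory as a function of input tokens (with both a linear and a quadratic fit), and next-token (decode) memory as a linear function of in-flight tokens, with weight_limit as the overall budget. How the router consumes these exact fields is in parts of the commit not shown here, so the following Rust sketch is only an assumed reading of the field names, mirroring the max-of-linear-and-quadratic logic that appears in batch_types.rs further down.

// Assumed interpretation of the MemoryScalingModel fields; the authoritative
// use of the coefficients lives in router code outside this excerpt.
pub struct MemoryScalingModel {
    pub prefill_linear_coef0: f32,
    pub prefill_quadratic_coef0: f32,
    pub prefill_quadratic_coef1: f32,
    pub nexttoken_linear_coef0: f32,
    pub nexttoken_linear_coef1: f32,
    pub weight_limit: u64,
}

impl MemoryScalingModel {
    /// Estimated prefill cost for `tokens` input tokens: the larger of the
    /// linear and quadratic fits.
    pub fn prefill_weight(&self, tokens: f64) -> f64 {
        let linear = self.prefill_linear_coef0 as f64 * tokens;
        let quadratic = self.prefill_quadratic_coef0 as f64 * tokens
            + self.prefill_quadratic_coef1 as f64 * tokens * tokens;
        linear.max(quadratic)
    }

    /// Estimated decode-time cost, assumed affine in the number of in-flight tokens.
    pub fn nexttoken_weight(&self, tokens: f64) -> f64 {
        self.nexttoken_linear_coef0 as f64 + self.nexttoken_linear_coef1 as f64 * tokens
    }

    /// Whether a hypothetical batch stays within the measured budget.
    pub fn fits(&self, prefill_tokens: f64, inflight_tokens: f64) -> bool {
        self.prefill_weight(prefill_tokens) <= self.weight_limit as f64
            && self.nexttoken_weight(inflight_tokens) <= self.weight_limit as f64
    }
}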

router/client/src/client.rs

Lines changed: 7 additions & 2 deletions

@@ -76,15 +76,20 @@ impl Client {

     /// Get shard model info
     #[instrument(skip(self))]
-    pub async fn model_info(&mut self) -> Result<(ModelType, u32, bool)> {
+    pub async fn model_info(&mut self) -> Result<(ModelType, u32, bool, MemoryScalingModel)> {
         let request = tonic::Request::new(ModelInfoRequest {});
         let response = self.stub
             .model_info(request)
             .instrument(info_span!("model_info"))
             .await?
             .into_inner();
         ModelType::try_from(response.model_type)
-            .map(|mt| (mt, response.eos_token, response.batch_padding))
+            .map(|mt| (
+                mt,
+                response.eos_token,
+                response.batch_padding,
+                response.memory_scaling_model.unwrap(),
+            ))
             .map_err(|_| ClientError::Generation("Unrecognized model type".to_string()))
     }

router/client/src/sharded_client.rs

Lines changed: 3 additions & 2 deletions

@@ -7,6 +7,7 @@ use tonic::transport::Uri;
 use crate::client::GenerateTokenResponse;
 use crate::pb::generate::v1::CachedBatch;
 use crate::pb::generate::v1::model_info_response::ModelType;
+use crate::pb::generate::v1::MemoryScalingModel;
 use crate::sharded_client::Request::{NextToken, Prefill};

 #[derive(Clone, Debug)]
@@ -138,8 +139,8 @@ impl ShardedClient {
     }

     /// Get shard model info
-    pub async fn model_info(&mut self) -> Result<(bool, u32, bool)> {
+    pub async fn model_info(&mut self) -> Result<(bool, u32, bool, MemoryScalingModel)> {
         self.clients[0].model_info().await
-            .map(|(mt, eos, bpad)| (mt == ModelType::Seq2seqLm, eos, bpad))
+            .map(|(mt, eos, bpad, mem_model)| (mt == ModelType::Seq2seqLm, eos, bpad, mem_model))
     }
 }

router/src/batch_types.rs

Lines changed: 43 additions & 37 deletions

@@ -1,32 +1,29 @@
 use std::cmp::max;
 use std::collections::BTreeSet;
 use nohash_hasher::IntMap;
-use num::integer::Roots;
 use crate::queue::Entry;

+
 pub(crate) trait BatchType: Send + Sync + Clone + 'static {
     type Stats: Default;

     /// Update batch statistics with an additional request
     fn update_stats(stats: &Self::Stats, input_length: usize, output_length: usize) -> Self::Stats;
     /// Calculate worst-case max batch weight given batch statistics
-    fn batch_max_weight(stats: &Self::Stats, batch_size: usize) -> usize;
+    fn batch_max_weight(&self, stats: &Self::Stats, batch_size: usize) -> usize;
     /// Calculate initial max batch weight given batch statistics (based on input lengths only)
-    fn batch_initial_weight(stats: &Self::Stats, batch_size: usize) -> usize;
+    fn batch_initial_weight(&self, stats: &Self::Stats, batch_size: usize) -> usize;
     /// Calculate prefill batch weight given prefill batch statistics
-    fn prefill_weight(prefill_stats: &Self::Stats, batch_size: usize) -> usize;
+    fn prefill_weight(&self, prefill_stats: &Self::Stats, batch_size: usize) -> usize;
     /// Percentage of batch tokens that are padding
     fn percent_padding(prefill_stats: &Self::Stats, batch_size: usize) -> f32;
     /// Indicate whether a hypothetical batch will exceed the combined weight limit
     fn exceeds_weight(
-        tree: &BTreeSet<(usize, usize, usize)>, max_total_weight: usize, current_output_len: usize
+        &self, tree: &BTreeSet<(usize, usize, usize)>, max_total_weight: usize, current_output_len: usize
     ) -> bool;
     /// Provide a count of tokens for a given batch, including padding tokens if applicable
     fn count_tokens(input_lengths: impl Iterator<Item=usize>, batch_size: usize) -> usize;

-    /// max_prefill_weight to use when none is specified
-    fn default_max_prefill_weight() -> usize;
-
     /// Compute batch statistics given map of entries
     fn compute_stats(entries: &IntMap<u64, Entry>) -> Self::Stats {
         entries.iter().fold(
@@ -45,7 +42,10 @@ pub(crate) trait BatchType: Send + Sync + Clone + 'static {

 /// Non-padded batch used in flash attention
 #[derive(Clone)]
-pub(crate) struct FlashBatch {}
+pub(crate) struct FlashBatch {
+    pub(crate) prefill_gradient: f64,
+    pub(crate) nexttoken_gradient: f64,
+}

 impl BatchType for FlashBatch {
     /// Keep track of total number of input and output tokens in the batch
@@ -58,37 +58,38 @@ impl BatchType for FlashBatch {
         (total_in_tokens + input_length, total_out_tokens + output_length)
     }

-    fn batch_max_weight(total_tokens: &Self::Stats, _batch_size: usize) -> usize {
+    fn batch_max_weight(&self, total_tokens: &Self::Stats, _batch_size: usize) -> usize {
         let (total_in_tokens, total_out_tokens) = total_tokens;
-        total_in_tokens + total_out_tokens
+        ((*total_in_tokens + *total_out_tokens) as f64 * self.nexttoken_gradient) as usize
     }

-    fn batch_initial_weight((total_in_tokens, _): &Self::Stats, _batch_size: usize) -> usize {
-        *total_in_tokens
+    fn batch_initial_weight(&self, (total_in_tokens, _): &Self::Stats, _batch_size: usize) -> usize {
+        (*total_in_tokens as f64 * self.nexttoken_gradient) as usize
     }

-    fn prefill_weight((total_in_tokens, _): &Self::Stats, _batch_size: usize) -> usize {
-        *total_in_tokens
+    fn prefill_weight(&self, (total_in_tokens, _): &Self::Stats, _batch_size: usize) -> usize {
+        (*total_in_tokens as f64 * self.prefill_gradient) as usize
     }

     fn percent_padding(_: &Self::Stats, _batch_size: usize) -> f32 {
         0.0
     }

     fn exceeds_weight(
-        tree: &BTreeSet<(usize, usize, usize)>, max_total_weight: usize, current_output_len: usize
+        &self, tree: &BTreeSet<(usize, usize, usize)>, max_total_weight: usize, current_output_len: usize
     ) -> bool {
         let mut in_sum = 0;
         // Work backwards from longest projected entry
         for (batch_size, (out_len, in_len, _)) in tree.iter().rev().enumerate() {
+            let total_weight_limit = max_total_weight as f64;
             let this_out_len = *out_len;
             in_sum += *in_len;
             // Only need to check segments with output_len > current_output_len
             // will have been checked in a prior iteration
             if this_out_len <= current_output_len {
                 // Check if we breach max space for this segment
-                let token_count = in_sum + (batch_size + 1) * this_out_len;
-                if token_count > max_total_weight {
+                let seg_max_tokens = in_sum + (batch_size + 1) * this_out_len;
+                if seg_max_tokens as f64 * self.nexttoken_gradient > total_weight_limit {
                     return true
                 }
             }
@@ -100,14 +101,16 @@ impl BatchType for FlashBatch {
         input_lengths.sum()
     }

-    fn default_max_prefill_weight() -> usize {
-        8192
-    }
 }

 /// Regular rectangular padded
 #[derive(Clone)]
-pub(crate) struct PaddedBatch {}
+pub(crate) struct PaddedBatch {
+    pub(crate) prefill_linear_coef1: f64,
+    pub(crate) prefill_quadratic_coef1: f64,
+    pub(crate) prefill_quadratic_coef2: f64,
+    pub(crate) nexttoken_gradient: f64,
+}

 impl BatchType for PaddedBatch {
     /// Keep track of maximum input length, maximum output length, input token count
@@ -124,20 +127,26 @@ impl BatchType for PaddedBatch {
         )
     }

-    fn batch_max_weight(max_in_out_lengths: &Self::Stats, batch_size: usize) -> usize {
+    fn batch_max_weight(&self, max_in_out_lengths: &Self::Stats, batch_size: usize) -> usize {
         let (max_input_length, max_output_length, _) = max_in_out_lengths;
-        let max_seq_len = max_input_length + max_output_length;
-        // Memory requirement roughly proportional to batch_size * seq_len^2
-        batch_size * max_seq_len.pow(2)
+        let seq_len_upper_bound = max_input_length + max_output_length;
+        ((seq_len_upper_bound * batch_size) as f64 * self.nexttoken_gradient) as usize
     }

-    fn batch_initial_weight((max_input_length, _, _): &Self::Stats, batch_size: usize) -> usize {
-        batch_size * max_input_length.pow(2)
+    fn batch_initial_weight(&self, (max_input_length, _, _): &Self::Stats, batch_size: usize) -> usize {
+        ((*max_input_length * batch_size) as f64 * self.nexttoken_gradient) as usize
     }

-    fn prefill_weight((max_input_length, _, _): &Self::Stats, batch_size: usize) -> usize {
+    fn prefill_weight(&self, (max_input_length, _, _): &Self::Stats, batch_size: usize) -> usize {
         // Empirically, prefill latency is proportional to batch_size * seq_len^(3/2)
-        batch_size * max_input_length.pow(3).sqrt()
+        let input_tokens = batch_size * max_input_length;
+        let quad_input_tokens = (input_tokens * max_input_length) as f64;
+        let input_tokens = input_tokens as f64;
+        let linear = input_tokens * self.prefill_linear_coef1;
+        let quadratic = input_tokens * self.prefill_quadratic_coef1 +
+            quad_input_tokens * self.prefill_quadratic_coef2;
+
+        f64::max(linear, quadratic) as usize
     }

     fn percent_padding((max_input_length, _, total_in_tokens): &Self::Stats, batch_size: usize) -> f32 {
@@ -149,17 +158,18 @@ impl BatchType for PaddedBatch {
     }

     fn exceeds_weight(
-        tree: &BTreeSet<(usize, usize, usize)>, max_total_weight: usize, current_output_len: usize
+        &self, tree: &BTreeSet<(usize, usize, usize)>, max_total_weight: usize, current_output_len: usize
     ) -> bool {
+        let total_weight_limit = max_total_weight as f64;
         let mut max_in_len = 0;
         // Work backwards from longest projected entry
         for (batch_size, (out_len, in_len, _)) in tree.iter().rev().enumerate() {
             let this_out_len = *out_len;
             max_in_len = max(max_in_len, *in_len);
             if this_out_len <= current_output_len {
                 // Check if we breach max space for this segment
-                let seq_len = max_in_len + this_out_len;
-                if seq_len.pow(2) * (batch_size + 1) > max_total_weight {
+                let seg_max_tokens = (max_in_len + this_out_len) * (batch_size + 1);
+                if seg_max_tokens as f64 * self.nexttoken_gradient > total_weight_limit {
                     return true
                 }
             }
@@ -170,8 +180,4 @@ impl BatchType for PaddedBatch {
     fn count_tokens(input_lengths: impl Iterator<Item=usize>, batch_size: usize) -> usize {
         input_lengths.max().unwrap_or(0) * batch_size
     }
-
-    fn default_max_prefill_weight() -> usize {
-        300000
-    }
 }
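
As a hypothetical worked example of the new weighting (not part of the commit), here is a test-style sketch of how the flash-attention batch type above turns measured gradients into weights once they have been received from the shard; the gradient values and token counts below are made up.

#[cfg(test)]
mod scaling_example {
    use super::*;

    #[test]
    fn flash_batch_weights_scale_with_measured_gradients() {
        // Hypothetical gradients standing in for values measured at startup.
        let flash = FlashBatch {
            prefill_gradient: 180_000.0,   // assumed bytes per input token during prefill
            nexttoken_gradient: 90_000.0,  // assumed bytes per in-flight token during decode
        };

        // Batch stats: 1_500 input tokens, up to 600 output tokens still to generate.
        let stats = (1_500, 600);

        // Worst-case decode footprint: (1_500 + 600) * 90_000 = 189_000_000.
        assert_eq!(flash.batch_max_weight(&stats, 4), 189_000_000);

        // Prefill footprint: 1_500 * 180_000 = 270_000_000, compared against the
        // weight_limit reported by the shard (already scaled by BATCH_SAFETY_MARGIN).
        assert_eq!(flash.prefill_weight(&(1_500, 0), 4), 270_000_000);
    }
}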

router/src/main.rs

Lines changed: 0 additions & 6 deletions

@@ -19,10 +19,6 @@ struct Args {
     max_new_tokens: usize,
     #[clap(default_value = "12", long, env)]
     max_batch_size: usize,
-    #[clap(default_value = None, long, env)]
-    max_batch_weight: Option<usize>,
-    #[clap(default_value = None, long, env)]
-    max_prefill_weight: Option<usize>,
     #[clap(default_value = "0.2", long, env)]
     max_prefill_padding: f32,
     #[clap(default_value = "24", long, env)]
@@ -129,8 +125,6 @@ fn main() -> Result<(), std::io::Error> {
         max_sequence_length: args.max_sequence_length,
         max_new_tokens: args.max_new_tokens,
         max_batch_size: args.max_batch_size,
-        max_batch_weight: args.max_batch_weight,
-        max_prefill_weight: args.max_prefill_weight,
         max_prefill_padding: args.max_prefill_padding,
         max_waiting_tokens: args.max_waiting_tokens,
         client: sharded_client,
