fix: Enforce max_concurrent_requests > 0

njhill · njhill · commit 34c44ff7c413 · 2024-02-08T15:03:27.000-08:00
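Previously, a value of 0 was allowed and meant "unlimited", but max_concurrent_requests is used to size a bounded queue, so 0 is not a usable value and is now rejected at startup. A bounded queue is preferable to an unbounded one for performance reasons.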
diff --git a/launcher/src/main.rs b/launcher/src/main.rs
@@ -41,7 +41,7 @@ struct Args {
     quantize: Option<String>,
     #[clap(long, env)]
     num_shard: Option<usize>,
-    #[clap(default_value = "96", long, env)]
+    #[clap(default_value = "512", long, env)]
     max_concurrent_requests: usize,
     #[clap(default_value = None, long, env)]
     max_sequence_length: Option<usize>,
diff --git a/router/src/main.rs b/router/src/main.rs
@@ -14,7 +14,7 @@ use text_generation_router::server::ServerRunArgs;
 #[derive(Parser, Debug)]
 #[clap(author, version, about, long_about = None)]
 struct Args {
-    #[clap(default_value = "96", long, env)]
+    #[clap(default_value = "512", long, env)]
     max_concurrent_requests: usize,
     #[clap(default_value = "2048", long, env)]
     max_sequence_length: usize,
@@ -73,17 +73,8 @@ fn main() -> Result<(), std::io::Error> {
         tracing_subscriber::fmt().compact().init();
     }
 
-    if args.tokenization_workers == Some(0) {
-        panic!("tokenization_workers must be > 0");
-    }
-
-    if args.tls_key_path.is_some() != args.tls_cert_path.is_some() {
-        panic!("tls: must provide both cert and key")
-    }
-
-    if args.tls_client_ca_cert_path.is_some() && args.tls_cert_path.is_none() {
-        panic!("tls: cannot provide client ca cert without keypair")
-    }
+    // Validate args
+    validate_args(&args);
 
     // Instantiate tokenizer
     let mut tokenizer = Tokenizer::from_file(args.tokenizer_path)
@@ -158,6 +149,42 @@ fn main() -> Result<(), std::io::Error> {
         })
 }
 
+fn validate_args(args: &Args) {
+    if args.tokenization_workers == Some(0) {
+        panic!("tokenization_workers must be > 0");
+    }
+
+    if args.max_concurrent_requests == 0 {
+        panic!("max_concurrent_requests must be > 0");
+    }
+
+    if args.tls_key_path.is_some() != args.tls_cert_path.is_some() {
+        panic!("tls: must provide both cert and key")
+    }
+
+    if args.tls_client_ca_cert_path.is_some() && args.tls_cert_path.is_none() {
+        panic!("tls: cannot provide client ca cert without keypair")
+    }
+
+    if args.max_prefill_padding < 0.0 || args.max_prefill_padding > 1.0 {
+        panic!(
+            "max_prefill_padding ({}) must be a percentage in the range [0.0, 1.0]",
+            args.max_prefill_padding,
+        )
+    }
+
+    if args.max_new_tokens < 1 {
+        panic!("max_new_tokens ({}) at least 1", args.max_new_tokens)
+    }
+
+    if args.max_sequence_length < 2 {
+        panic!(
+            "max_sequence_length ({}) must be at least 2 (1 input + 1 output)",
+            args.max_sequence_length,
+        )
+    }
+}
+
 fn write_termination_log(msg: &str) -> Result<(), io::Error> {
     // Writes a message to the termination log.
     // Creates the logfile if it doesn't exist.
diff --git a/router/src/server.rs b/router/src/server.rs
@@ -184,7 +184,7 @@ impl<'a, B: BatchType> BatchConfigValidator<'a, B> {
     fn validate_batch_config(
         &self,
         max_sequence_length: usize,
-        max_batch_size: usize,
+        _max_batch_size: usize,
         max_batch_weight: usize,
     ) {
         let single_request_stats = <B>::update_stats(
@@ -284,30 +284,17 @@ async fn do_run<B: BatchType>(
         batch_weight_limit,
     );
 
-    let max_prefill_padding = args.max_prefill_padding;
-    if max_prefill_padding < 0.0 || max_prefill_padding > 1.0 {
-        panic!("max_prefill_padding ({}) must be a percentage in the range [0.0, 1.0]", max_prefill_padding)
-    }
-
-    if args.max_new_tokens < 1 {
-        panic!("max_new_tokens ({}) at least 1", args.max_new_tokens)
-    }
-
-    if args.max_sequence_length < 2 {
-        panic!("max_sequence_length ({}) must be at least 2 (1 input + 1 output)", args.max_sequence_length)
-    }
-
     let max_new_tokens = if args.max_new_tokens < args.max_sequence_length {
         args.max_new_tokens
     } else {
-        tracing::warn!(
+        warn!(
             "adjusting max_new_tokens ({}) down to max_sequence_length - 1 ({})",
             args.max_new_tokens,
             args.max_sequence_length-1
         );
         args.max_sequence_length - 1
     };
-    
+
 
     let tokenizers = AsyncTokenizer::new(
         &args.tokenizer, args.tokenization_workers
@@ -326,7 +313,7 @@ async fn do_run<B: BatchType>(
         BatchingConfig {
             size_limit: args.max_batch_size,
             weight_limit: batch_weight_limit,
-            prefill_padding_limit: max_prefill_padding,
+            prefill_padding_limit: args.max_prefill_padding,
         },
         args.max_waiting_tokens,
         args.max_concurrent_requests,