@@ -13,15 +13,15 @@ use std::thread;
 use std::thread::sleep;
 use std::time::{Duration, Instant};
 use std::{fs, io};
-use std::fs::File;
 use std::env::VarError;
 use std::ffi::OsString;
+use std::fs::File;
 use std::os::unix::process::CommandExt;
-use String;
 use tracing::{info, warn};
 
 // In most cases this gives the best performance for inferencing
 const DEFAULT_PYTORCH_CUDA_ALLOC_CONF: &'static str = "expandable_segments:True";
+const DEFAULT_MAX_SEQUENCE_LENGTH: usize = 2048;
 
 /// App Configuration
 #[derive(Parser, Debug, Clone)]
@@ -43,8 +43,8 @@ struct Args {
     num_shard: Option<usize>,
     #[clap(default_value = "96", long, env)]
     max_concurrent_requests: usize,
-    #[clap(default_value = "2048", long, env)]
-    max_sequence_length: usize,
+    #[clap(default_value = None, long, env)]
+    max_sequence_length: Option<usize>,
     #[clap(default_value = "1024", long, env)]
     max_new_tokens: usize,
     #[clap(default_value = "12", long, env)]
@@ -117,14 +117,18 @@ fn main() -> ExitCode {
         ),
         _ => (),
     }
-
+
     // Determine number of shards based on command line arg and env vars
     let num_shard = find_num_shards(args.num_shard);
-
+
     // Resolve fast tokenizer path
     let tokenizer_path = resolve_tokenizer_path(
         &args.model_name, args.revision.as_deref()
     ).expect("Could not find tokenizer for model");
+
+    // Determine max sequence length based on command line arg and env vars
+    let config_json_path = tokenizer_path.replace("tokenizer.json", "config.json");
+    let max_sequence_length = get_max_sequence_length(args.max_sequence_length, &config_json_path);
 
     match env::var("MAX_BATCH_WEIGHT") {
         Ok(max_batch_weight) if !max_batch_weight.trim().is_empty() => {
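
Note that the config.json path is derived from the resolved tokenizer path by plain string replacement. A minimal illustration (the model path is hypothetical):

    let tokenizer_path = String::from("/models/my-model/tokenizer.json");
    let config_json_path = tokenizer_path.replace("tokenizer.json", "config.json");
    assert_eq!(config_json_path, "/models/my-model/config.json");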
@@ -189,7 +193,7 @@ fn main() -> ExitCode {
         args.deployment_framework,
         args.dtype.or(args.dtype_str),
         args.quantize,
-        args.max_sequence_length,
+        max_sequence_length,
         args.max_new_tokens,
         args.max_batch_size,
         args.batch_safety_margin,
@@ -246,7 +250,7 @@ fn main() -> ExitCode {
         "--max-concurrent-requests".to_string(),
         args.max_concurrent_requests.to_string(),
         "--max-sequence-length".to_string(),
-        args.max_sequence_length.to_string(),
+        max_sequence_length.to_string(),
         "--max-new-tokens".to_string(),
         args.max_new_tokens.to_string(),
         "--max-batch-size".to_string(),
@@ -364,6 +368,69 @@ fn num_cuda_devices() -> Option<usize> {
     let n_devices = devices.split(',').count();
     Some(n_devices)
 }
+/// Finds a max sequence length for the model. In priority order:
+/// 1. MAX_SEQUENCE_LENGTH set by the user
+/// 2. The sequence length specified in the model's config.json
+/// 3. A default of 2048
+fn get_max_sequence_length(max_sequence_length: Option<usize>, config_json_path: &String) -> usize {
+    if let Some(max_sequence_length) = max_sequence_length {
+        info!(
+            "Using configured max_sequence_length: {}",
+            max_sequence_length
+        );
+        return max_sequence_length;
+    }
+    if let Ok(model_config) = get_config_json(config_json_path) {
+        if let Some(length) = get_max_sequence_length_from_config(&model_config) {
+            info!(
+                "Loaded max_sequence_length from model config.json: {}",
+                length
+            );
+            return length;
+        }
+    }
+    info!(
+        "Using default max_sequence_length: {}",
+        DEFAULT_MAX_SEQUENCE_LENGTH
+    );
+    DEFAULT_MAX_SEQUENCE_LENGTH
+}
+
+/// Opens the model's config.json file and reads it into a serde_json Value
+fn get_config_json(config_path: &String) -> Result<serde_json::Value, std::io::Error> {
+    let reader = BufReader::new(File::open(config_path)?);
+    Ok(serde_json::from_reader(reader)?)
+}
+
+/// Attempts to find a max sequence length in the model's config.
+/// There is no standard field for this; different model architectures name it differently.
+/// This checks some well-known field names, and returns nothing if none are found.
+/// Referenced from: https://github.com/vllm-project/vllm/blob/923797fea4d80a4dac4409ece3c450b84d5ba001/vllm/config.py#L557-L592
+fn get_max_sequence_length_from_config(model_config: &serde_json::Value) -> Option<usize> {
+    let possible_keys = [
+        // OPT
+        "max_position_embeddings",
+        // GPT-2
+        "n_positions",
+        // MPT
+        "max_seq_len",
+        // ChatGLM2
+        "seq_length",
+        // Others
+        "max_sequence_length",
+        "max_seq_length",
+        "seq_len",
+    ];
+
+    for key in possible_keys {
+        if let Some(value) = model_config.get(key) {
+            if let Some(value) = value.as_u64() {
+                return Some(value as usize);
+            }
+        }
+    }
+    None
+}
 
 fn find_num_shards(num_shard: Option<usize>) -> usize {
     // get the number of shards given `num_gpu` and `num_shard`
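
To make the fallback chain concrete, a minimal sketch (not part of the commit) exercising the helpers added above; the paths and config values are illustrative:

    // 1. A user-supplied value always wins, regardless of config.json:
    assert_eq!(get_max_sequence_length(Some(4096), &"ignored.json".to_string()), 4096);

    // 2. Otherwise a well-known key in config.json is used, e.g. GPT-2's n_positions:
    let config = serde_json::json!({"n_positions": 1024});
    assert_eq!(get_max_sequence_length_from_config(&config), Some(1024));

    // 3. With no override and no readable config.json, the default applies:
    assert_eq!(get_max_sequence_length(None, &"/nonexistent/config.json".to_string()), 2048);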