Commit 5b5938e

Improve log messages around the max sequence length (#103)
#### Motivation

The existing messages were confusing to users.

#### Modifications

In the router, the error message was rephrased to make it more understandable for users who aren't familiar with the internals. In the server, we now print the maximum possible sequence length limited by the model sequence length. The existing print was showing how many output tokens can fit into memory if you pass max_sequence_length input tokens, and vice versa. I don't know what I was thinking when I wrote that.

#### Related Issues

https://github.ibm.com/ai-foundation/watson-fm-stack-tracker/issues/958

---------

Signed-off-by: Max de Bayser <[email protected]>
Signed-off-by: Maximilien de Bayser <[email protected]>
1 parent 009a2ba commit 5b5938e

2 files changed: +10 −8 lines changed


router/src/server.rs

Lines changed: 4 additions & 4 deletions
@@ -222,8 +222,8 @@ impl<'a, B: BatchType> BatchConfigValidator<'a, B> {
             self.batch_type.prefill_weight(&single_request_stats, 1);
         if max_batch_weight < single_request_prefill_weight {
             panic!(
-                "max_batch_weight ({}) not large enough for (prefill) max_sequence_length ({})",
-                max_batch_weight, max_sequence_length
+                "The provided max_sequence length ({}) results in a prefill batch weight that exceeds the estimated capacity ({})",
+                max_sequence_length, max_batch_weight
             )
         }
 
@@ -232,8 +232,8 @@ impl<'a, B: BatchType> BatchConfigValidator<'a, B> {
             .batch_initial_weight(&single_request_stats, 1);
         if max_batch_weight < single_request_nexttoken_weight {
             panic!(
-                "max_batch_weight ({}) not large enough for (next-token) max_sequence_length ({})",
-                max_batch_weight, max_sequence_length
+                "The provided max_sequence length ({}) results in a next-token batch weight that exceeds the estimated capacity ({})",
+                max_sequence_length, max_batch_weight
             )
         }
     }

server/text_generation_server/server.py

Lines changed: 6 additions & 4 deletions
@@ -409,13 +409,15 @@ def estimate_memory():
         memory_scaling_model = estimate_memory()
         compile()
 
-        max_input = memory_scaling_model.max_input_len_for_nt(1, max_sequence_length-1, sys.maxsize)
-        max_output = memory_scaling_model.max_output_len_for_nt(1, max_sequence_length-1, sys.maxsize)
-
         if local_rank == 0:
+            # For a batch of size 1 and an output of 1, get max input limited by max_sequence_length
+            max_input = memory_scaling_model.max_input_len_for_nt(1, 1, max_sequence_length)
+            # For a batch of size 1 and an input of 1, get max output limited by max_sequence_length
+            max_output = memory_scaling_model.max_output_len_for_nt(1, 1, max_sequence_length)
+            max_theoretical_len = min(max_input, max_output) + 1
             print(
                 "Maximum possible sequence length given available memory (for batch size 1): "
-                f"{min(max_input, max_output)}"
+                f"{max_theoretical_len}"
             )
 
     elif ESTIMATE_MEMORY == "manual":
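For context, the sketch below mimics the corrected computation outside the server. The `weight()` cost function, the `MEMORY_BUDGET` constant, and the standalone `max_input_len_for_nt` / `max_output_len_for_nt` helpers are made-up stand-ins for the fitted memory scaling model; only the call pattern (batch size 1, the other side fixed at 1 token, search capped at `max_sequence_length`) and the final `min(...) + 1` mirror the diff above.

```python
# Toy stand-in for the server's memory scaling model, NOT the real implementation.
MEMORY_BUDGET = 1_000_000  # hypothetical weight budget for a batch of size 1


def weight(batch_size: int, input_len: int, output_len: int) -> int:
    """Made-up cost function standing in for the fitted memory model."""
    return batch_size * (100 * input_len + 300 * output_len)


def max_input_len_for_nt(batch_size: int, output_len: int, max_len: int) -> int:
    """Largest input length that still fits in the budget, capped at max_len (assumed signature)."""
    best = 0
    for candidate in range(1, max_len + 1):
        if weight(batch_size, candidate, output_len) > MEMORY_BUDGET:
            break
        best = candidate
    return best


def max_output_len_for_nt(batch_size: int, input_len: int, max_len: int) -> int:
    """Largest output length that still fits in the budget, capped at max_len (assumed signature)."""
    best = 0
    for candidate in range(1, max_len + 1):
        if weight(batch_size, input_len, candidate) > MEMORY_BUDGET:
            break
        best = candidate
    return best


max_sequence_length = 4096

# New behaviour: hold the other side at 1 token and cap the search at
# max_sequence_length, so the printed number is itself a sequence length.
max_input = max_input_len_for_nt(1, 1, max_sequence_length)
max_output = max_output_len_for_nt(1, 1, max_sequence_length)
max_theoretical_len = min(max_input, max_output) + 1
print(
    "Maximum possible sequence length given available memory (for batch size 1): "
    f"{max_theoretical_len}"
)
```

With the old arguments (the other side fixed at max_sequence_length-1 and an unbounded cap), the same sketch would instead report how many extra tokens fit on top of an already maximal request, which is the confusing number this commit removes.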
