Commit 1f4cfbe

feat: allow configuration of the max soft prompt length (#33)
Instead of defaulting to a hard-coded 256, the default soft prompt length is now 50% of the max sequence length. The env var MAX_PROMPT_PREFIX_LENGTH can be used to override this default if desired.

Signed-off-by: Travis Johnson <[email protected]>
Co-authored-by: TRAVIS JOHNSON <[email protected]>
1 parent ac1f655 commit 1f4cfbe
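
For reference, the resulting behaviour can be summarised in a small standalone sketch. This is an illustration only, not code from the commit: the helper name resolve_max_prompt_prefix_length is made up for the example, and the real logic lives in Model.__init__ in model.py below.

import math
import os


def resolve_max_prompt_prefix_length(max_seq_length: int) -> int:
    # New default: 50% of the max sequence length (rounded up), replacing the old hard-coded 256
    max_prompt_prefix_length = math.ceil(max_seq_length * 0.5)

    # Optional override via the MAX_PROMPT_PREFIX_LENGTH environment variable
    env_value = os.getenv("MAX_PROMPT_PREFIX_LENGTH")
    if env_value:
        try:
            override = int(env_value)
        except ValueError as exc:
            raise ValueError("Invalid value for MAX_PROMPT_PREFIX_LENGTH") from exc
        # Mirrors the validation in model.py: the override may not exceed max sequence length - 1
        if override > max_seq_length - 1:
            raise ValueError(
                f"MAX_PROMPT_PREFIX_LENGTH ({override}) cannot be larger than "
                f"the max sequence length - 1 ({max_seq_length - 1})"
            )
        max_prompt_prefix_length = override

    return max_prompt_prefix_length


# With a 2048-token max sequence length and no override set, the prefix budget is 1024 tokens
print(resolve_max_prompt_prefix_length(2048))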

File tree: 4 files changed, +24 −8 lines

server/text_generation_server/models/causal_lm.py

Lines changed: 1 addition & 1 deletion
@@ -559,7 +559,7 @@ def __init__(
             model_path, AutoModelForCausalLM, dtype, quantize, model_config, max_sequence_length
         )
 
-        super(CausalLM, self).__init__(inference_engine, dtype)
+        super(CausalLM, self).__init__(inference_engine, dtype, max_sequence_length)
 
         if self.model.config.pad_token_id is not None:
             self.tokenizer.pad_token_id = self.model.config.pad_token_id

server/text_generation_server/models/flash_causal_lm.py

Lines changed: 1 addition & 1 deletion
@@ -385,7 +385,7 @@ def __init__(
             model_path, auto_model_class, dtype, quantize, model_config, max_sequence_length
         )
 
-        super(FlashCausalLM, self).__init__(inference_engine, dtype)
+        super(FlashCausalLM, self).__init__(inference_engine, dtype, max_sequence_length)
         self.use_position_ids = True
 
         if self.model.config.pad_token_id is not None:

server/text_generation_server/models/model.py

Lines changed: 21 additions & 5 deletions
@@ -1,4 +1,5 @@
 import inspect
+import math
 import os
 import types
 
@@ -19,9 +20,6 @@
 
 B = TypeVar("B", bound=Batch)
 
-# TODO make configurable, possibly based on configured max seq length
-MAX_PROMPT_PREFIX_LENGTH = 256
-
 CUDA_PAD_TO_MULT_OF_8 = os.getenv("CUDA_PAD_TO_MULT_OF_8", "true").lower() != "false"
 PT2_COMPILE = os.getenv("PT2_COMPILE", "false").lower() != "false"
 
@@ -33,7 +31,7 @@
 
 
 class Model(ABC):
-    def __init__(self, engine: BaseInferenceEngine, dtype: torch.dtype):
+    def __init__(self, engine: BaseInferenceEngine, dtype: torch.dtype, max_seq_length: Optional[int] = None):
         self.engine = engine
         self.config, self.tokenizer, self.model = engine.get_components()
         self.device = engine.get_device()
@@ -50,6 +48,24 @@ def __init__(self, engine: BaseInferenceEngine, dtype: torch.dtype):
 
         if prompt_prefix_supported:
             # Set up prefix cache
+
+            if max_seq_length is None:
+                # shouldn't be None, but just in case since the parameter is passed through as Optional
+                max_seq_length = 2048
+
+            # default value to 50% of the max sequence length
+            max_prompt_prefix_length = math.ceil(max_seq_length * 0.5)
+            if (max_prompt_prefix_env_var := os.getenv("MAX_PROMPT_PREFIX_LENGTH")):
+                try:
+                    max_prompt_prefix_env_var = int(max_prompt_prefix_env_var)
+                except ValueError as exc:
+                    raise ValueError("Invalid value for MAX_PROMPT_PREFIX_LENGTH") from exc
+
+                if max_prompt_prefix_env_var > max_seq_length - 1:
+                    raise ValueError(f"Value for the MAX_PROMPT_PREFIX_LENGTH ({max_prompt_prefix_env_var}) cannot be larger than the max sequence length - 1 ({max_seq_length - 1})")
+
+                max_prompt_prefix_length = max_prompt_prefix_env_var
+
             decoder_start_token_id = self.model.config.decoder_start_token_id
             if decoder_start_token_id is None:
                 decoder_start_token_id = self.tokenizer.bos_token_id
@@ -65,7 +81,7 @@ def __init__(self, engine: BaseInferenceEngine, dtype: torch.dtype):
             self.prefix_cache = PrefixCache(
                 device=self.device,
                 dtype=dtype,
-                max_length=MAX_PROMPT_PREFIX_LENGTH,
+                max_length=max_prompt_prefix_length,
                 encoder_decoder=self.model.config.is_encoder_decoder,
                 return_zero=return_zero,
                 decoder_start_tok_embedding=self.word_embeddings(
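
The override is read from the process environment when the model is constructed, so it only needs to be set for the serving process. A quick illustration of the resulting values, assuming a 2048-token max sequence length (the numbers are an example, not taken from the repository):

import math

max_seq_length = 2048

# MAX_PROMPT_PREFIX_LENGTH unset  -> default of ceil(2048 * 0.5) == 1024 prefix tokens
assert math.ceil(max_seq_length * 0.5) == 1024

# MAX_PROMPT_PREFIX_LENGTH=512   -> prefix cache capped at 512 tokens
# MAX_PROMPT_PREFIX_LENGTH=4096  -> rejected: 4096 > max_seq_length - 1 (2047)
# MAX_PROMPT_PREFIX_LENGTH=abc   -> rejected: "Invalid value for MAX_PROMPT_PREFIX_LENGTH"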

server/text_generation_server/models/seq2seq_lm.py

Lines changed: 1 addition & 1 deletion
@@ -557,7 +557,7 @@ def __init__(
         inference_engine = get_inference_engine_class(deployment_framework)(
             model_path, AutoModelForSeq2SeqLM, dtype, quantize, model_config, max_sequence_length
         )
-        super(Seq2SeqLM, self).__init__(inference_engine, dtype)
+        super(Seq2SeqLM, self).__init__(inference_engine, dtype, max_sequence_length)
 
         bos_token_id = self.model.config.decoder_start_token_id
         if bos_token_id is None:
