
Commit ef7612b

fix: Always set word embeddings (IBM#41)
A previous OOM fix mistakenly left model.word_embeddings unset whenever the prompt cache was disabled. This caused inference without a prompt cache to fail. Our tests always set up a prompt cache, so they did not catch this case.

Fix: Always invoke self._setup_prompt_encoder() in model.py again.

Result: Inference without a prompt cache works.

-----

Signed-off-by: Joe Runde <[email protected]>
1 parent 7a97fd6 commit ef7612b
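The bug is a consequence of Python's short-circuiting `and`: when the left operand is falsy, the right operand is never evaluated, so a call relied on for its side effects can be silently skipped. A minimal, self-contained sketch of the pattern (the `Model` class below is illustrative only, not the repo's code; the method names mirror the diff):

```python
class Model:
    """Illustrative stand-in, not the real text_generation_server Model."""

    def __init__(self, prompt_cache=None, fixed=True):
        self.prompt_cache = prompt_cache
        self.word_embeddings = None

        if not fixed:
            # Buggy version: with no prompt cache, prompt_cache_set() is False,
            # so `and` short-circuits and _setup_prompt_encoder() never runs --
            # word_embeddings stays None and inference later fails.
            prompt_prefix_supported = (
                self.prompt_cache_set() and self._setup_prompt_encoder()
            )
        else:
            # Fixed version: always call _setup_prompt_encoder() for its side
            # effect, then gate the prefix-cache setup on both conditions.
            prompt_prefix_supported = self._setup_prompt_encoder()
            if prompt_prefix_supported and self.prompt_cache_set():
                pass  # prefix-cache setup would go here

    def prompt_cache_set(self):
        return self.prompt_cache is not None

    def _setup_prompt_encoder(self):
        # Required side effect: populate the embedding table.
        self.word_embeddings = object()  # stand-in for the real embeddings
        return True


assert Model(prompt_cache=None, fixed=False).word_embeddings is None    # the bug
assert Model(prompt_cache=None, fixed=True).word_embeddings is not None  # the fix
```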


server/text_generation_server/models/model.py

Lines changed: 4 additions & 3 deletions
```diff
@@ -45,10 +45,11 @@ def __init__(self, engine: BaseInferenceEngine, dtype: torch.dtype, max_seq_length
         # Check whether model supports position_ids
         self.use_position_ids = "position_ids" in inspect.signature(self.model.forward).parameters
 
-        # Short-circuit: Don't set up the prompt encoder if the prompt cache is not set
-        prompt_prefix_supported = self.prompt_cache_set() and self._setup_prompt_encoder()
+        # 🌶️🌶️🌶️ self._setup_prompt_encoder must be called even if the prompt cache is not used.
+        # A required side-effect is that it sets self.word_embeddings.
+        prompt_prefix_supported = self._setup_prompt_encoder()
 
-        if prompt_prefix_supported:
+        if prompt_prefix_supported and self.prompt_cache_set():
             # Set up prefix cache
 
         if max_seq_length is None:
```
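Since the gap was that every existing test constructed a prompt cache, a regression test for this path only needs to build the model without one and assert the side effect happened. A hypothetical sketch (the fixture and its parameters are assumptions, not the repo's actual test API):

```python
def test_word_embeddings_set_without_prompt_cache(model_factory):
    # `model_factory` is a hypothetical fixture that builds a Model with
    # the prompt cache disabled.
    model = model_factory(prompt_cache=None)
    # The fix guarantees _setup_prompt_encoder() ran, so the embedding
    # table must be populated even though no prompt cache exists.
    assert model.word_embeddings is not None
```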
