
Commit 6b772b0

fix: Add input_embeds arg to all flash model impls

This means that recent changes to support input_embeds for santacoder (bigcode_gpt) don't break usage of the other flash model impls.
1 parent 1e8ed0c commit 6b772b0
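
The same three-line guard lands in each of the flash llama, neox, and rw modeling files below: forward gains an inputs_embeds parameter so every flash impl shares the santacoder calling convention, and any non-None value is rejected for now. A minimal sketch of the pattern follows; the class and signature are heavily simplified, and only the inputs_embeds parameter and the guard mirror this commit — everything else is illustrative:

from typing import Optional

import torch


class FlashLlamaForCausalLM:  # simplified stand-in for the real module
    def forward(
        self,
        input_ids: torch.Tensor,
        position_ids: torch.Tensor,
        inputs_embeds: Optional[torch.Tensor] = None,  # new: accepted but unused
        past_key_values: Optional[torch.Tensor] = None,
    ):
        # Guard from this commit: fail with a clear ValueError rather than
        # breaking on an unexpected argument from the shared caller.
        if inputs_embeds is not None:
            raise ValueError("input_embeds not yet supported for flash llama")
        # ... the real model would run the transformer and LM head here ...
        return input_ids


# The token-ID path is unaffected; passing embeddings now fails loudly:
model = FlashLlamaForCausalLM()
model.forward(torch.tensor([1, 2]), torch.tensor([0, 1]))
try:
    model.forward(torch.tensor([1]), torch.tensor([0]),
                  inputs_embeds=torch.zeros(1, 4096))
except ValueError as err:
    print(err)  # input_embeds not yet supported for flash llama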

File tree

4 files changed: +13 -1 lines changed


server/text_generation_server/models/custom_modeling/flash_llama_modeling.py

Lines changed: 4 additions & 0 deletions
@@ -489,10 +489,14 @@ def forward(
         cu_seqlens,
         cu_seqlens_q,
         max_s,
+        inputs_embeds: Optional[torch.Tensor] = None,
         past_key_values: Optional[torch.Tensor] = None,
         pre_allocate_past_size: Optional[int] = None,
         lm_head_indices: Optional[torch.Tensor] = None,
     ):
+        if inputs_embeds is not None:
+            raise ValueError("input_embeds not yet supported for flash llama")
+
         hidden_states, present = self.model(
             input_ids,
             position_ids,

server/text_generation_server/models/custom_modeling/flash_neox_modeling.py

Lines changed: 4 additions & 0 deletions
@@ -398,10 +398,14 @@ def forward(
         cu_seqlens,
         cu_seqlens_q,
         max_s,
+        inputs_embeds: Optional[torch.Tensor] = None,
         past_key_values: Optional[torch.Tensor] = None,
         pre_allocate_past_size: Optional[int] = None,
         lm_head_indices: Optional[torch.Tensor] = None,
     ):
+        if inputs_embeds is not None:
+            raise ValueError("input_embeds not yet supported for flash neox")
+
         hidden_states, present = self.gpt_neox(
             input_ids,
             position_ids,

server/text_generation_server/models/custom_modeling/flash_rw_modeling.py

Lines changed: 4 additions & 0 deletions
@@ -634,10 +634,14 @@ def forward(
         cu_seqlens,
         cu_seqlens_q,
         max_s,
+        inputs_embeds: Optional[torch.Tensor] = None,
         past_key_values: Optional[torch.Tensor] = None,
         pre_allocate_past_size: Optional[int] = None,
         lm_head_indices: Optional[torch.Tensor] = None,
     ):
+        if inputs_embeds is not None:
+            raise ValueError("input_embeds not yet supported for flash rw (falcon)")
+
         hidden_states, present = self.transformer(
             input_ids,
             position_ids,

server/text_generation_server/models/flash_causal_lm.py

Lines changed: 1 addition & 1 deletion
@@ -36,7 +36,7 @@ class FlashCausalLMBatch(Batch):
     input_ids: torch.Tensor
     position_ids: torch.Tensor
     # shape is [sum(seq_lengths), embedding_size]
-    inputs_embeds: torch.Tensor
+    inputs_embeds: Optional[torch.Tensor]
     # cumulative sequence lengths
     cu_seqlens: torch.Tensor
     # cumulative query sequence lengths, only used in decode
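
The batch-side change above is the counterpart: inputs_embeds on FlashCausalLMBatch becomes Optional, presumably because only santacoder (bigcode_gpt) batches actually populate it while batches for the other flash impls leave it unset. A runnable sketch with the field set trimmed to the relevant ones; the real class carries many more fields:

from dataclasses import dataclass
from typing import Optional

import torch


@dataclass
class FlashCausalLMBatch:  # trimmed stand-in for the real batch class
    input_ids: torch.Tensor
    position_ids: torch.Tensor
    # shape is [sum(seq_lengths), embedding_size]; presumably None for
    # every flash impl except santacoder (bigcode_gpt)
    inputs_embeds: Optional[torch.Tensor]
    # cumulative sequence lengths
    cu_seqlens: torch.Tensor


# A non-santacoder batch can now simply carry None instead of being
# forced to materialize an embeddings tensor:
batch = FlashCausalLMBatch(
    input_ids=torch.tensor([5, 6, 7]),
    position_ids=torch.tensor([0, 1, 2]),
    inputs_embeds=None,
    cu_seqlens=torch.tensor([0, 3]),
)
assert batch.inputs_embeds is None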
